From 6d2010ae8f7a6078e10b361c6962983bab233e0f Mon Sep 17 00:00:00 2001
From: Apple <opensource@apple.com>
Date: Wed, 13 Jul 2011 23:06:20 +0000
Subject: [PATCH] xnu-1699.22.73.tar.gz

---
 EXTERNAL_HEADERS/Availability.h | 156 + EXTERNAL_HEADERS/AvailabilityInternal.h | 393 + EXTERNAL_HEADERS/AvailabilityMacros.h | 820 ++ EXTERNAL_HEADERS/Makefile | 7 +- EXTERNAL_HEADERS/architecture/Makefile | 4 +- EXTERNAL_HEADERS/architecture/ppc/Makefile | 33 - EXTERNAL_HEADERS/architecture/ppc/asm_help.h | 456 - .../architecture/ppc/basic_regs.h | 306 - EXTERNAL_HEADERS/architecture/ppc/fp_regs.h | 153 - .../architecture/ppc/macro_help.h | 64 - .../architecture/ppc/pseudo_inst.h | 420 - EXTERNAL_HEADERS/architecture/ppc/reg_help.h | 230 - EXTERNAL_HEADERS/mach-o/arm/reloc.h | 42 - EXTERNAL_HEADERS/mach-o/loader.h | 6 + EXTERNAL_HEADERS/mach-o/ppc/reloc.h | 65 - EXTERNAL_HEADERS/stdarg.h | 164 +- Makefile | 36 +- README | 71 +- {osfmk/profiling/ppc => SETUP}/Makefile | 17 +- SETUP/config/Makefile | 42 + SETUP/config/config.h | 293 + .../doconf/doconf.csh => SETUP/config/doconf | 19 +- SETUP/config/externs.c | 82 + SETUP/config/lexer.l | 214 + SETUP/config/main.c | 296 + SETUP/config/mkglue.c | 331 + SETUP/config/mkheaders.c | 276 + SETUP/config/mkioconf.c | 2086 ++++ SETUP/config/mkmakefile.c | 1182 +++ SETUP/config/mkswapconf.c | 247 + SETUP/config/openp.c | 93 + SETUP/config/parser.y | 1278 +++ SETUP/config/searchp.c | 90 + SETUP/kextsymboltool/Makefile | 31 + SETUP/kextsymboltool/kextsymboltool.c | 912 ++ .../newvers/newvers.csh => SETUP/newvers | 0 SETUP/seed_objroot | 133 - SETUP/setsegname/Makefile | 31 + SETUP/setsegname/setsegname.c | 237 + bsd/Makefile | 17 +- bsd/bsm/Makefile | 4 - bsd/bsm/audit.h | 3 + bsd/bsm/audit_kevents.h | 12 +- bsd/conf/MASTER | 32 +- bsd/conf/MASTER.i386 | 13 +- bsd/conf/MASTER.ppc | 99 - bsd/conf/MASTER.x86_64 | 13 +- bsd/conf/Makefile | 100 +- bsd/conf/Makefile.i386 | 40 +- bsd/conf/Makefile.ppc | 53 - bsd/conf/Makefile.template | 79 +- bsd/conf/Makefile.x86_64 | 40 +- bsd/conf/files | 36 +- bsd/conf/files.i386 | 18 +- bsd/conf/files.ppc | 34 - bsd/conf/files.x86_64 | 16 +- bsd/conf/param.c | 3 +- bsd/conf/tools/Makefile | 32 - bsd/conf/tools/doconf/Makefile | 49 - bsd/crypto/Makefile | 6 +- bsd/crypto/aes/Assert.c | 34 + bsd/crypto/aes/Makefile | 6 +- bsd/crypto/aes/aes.h | 16 +- bsd/crypto/aes/gen/Makefile | 4 - bsd/crypto/aes/gen/aesopt.h | 4 +- bsd/crypto/aes/i386/AES.s | 143 + bsd/crypto/aes/i386/Context.h | 9 + bsd/crypto/aes/i386/Data.mk | 30 + bsd/crypto/aes/i386/Data.s | 5196 ++++++++++ bsd/crypto/aes/i386/EncryptDecrypt.s | 607 ++ bsd/crypto/aes/i386/ExpandKeyForDecryption.s | 1214 +++ bsd/crypto/aes/i386/ExpandKeyForEncryption.s | 801 ++ bsd/crypto/aes/i386/MakeData.c | 516 + bsd/crypto/aes/i386/Makefile | 16 +- bsd/crypto/aes/i386/ReadMe.txt | 22 + bsd/crypto/aes/i386/aes_crypt_hw.s | 472 + bsd/crypto/aes/i386/aes_key_hw.s | 405 + bsd/crypto/aes/i386/aes_modes.c | 471 - bsd/crypto/aes/i386/aes_modes_asm.s | 420 + bsd/crypto/aes/i386/aes_modes_hw.s | 1669 ++++ bsd/crypto/aes/i386/aes_x86_v2.s | 1298 --- bsd/crypto/aes/i386/aesopt.h | 719 -- bsd/crypto/aes/i386/aesxts.c | 392 + bsd/crypto/aes/i386/aesxts.h | 103 + bsd/crypto/aes/i386/aesxts_asm.s | 1305 +++ bsd/crypto/aes/i386/edefs.h | 130 - bsd/crypto/aes/ppc/Makefile | 36 - bsd/crypto/aes/ppc/aescrypt.c | 411 - bsd/crypto/aes/ppc/aeskey.c | 455 - bsd/crypto/aes/ppc/aesopt.h | 753 -- bsd/crypto/aes/ppc/aestab.c | 384 - bsd/crypto/aes/ppc/aestab.h | 175 - bsd/crypto/aes/test/ReadMe.txt | 97 +
bsd/crypto/aes/test/makegenx86.sh | 8 + bsd/crypto/aes/test/makeoptx86.sh | 10 + bsd/crypto/aes/test/tstaes.c | 131 + bsd/crypto/blowfish/Makefile | 4 - bsd/crypto/cast128/Makefile | 4 - bsd/crypto/des/Makefile | 4 - bsd/crypto/doc/KernelCrypto.plist | 76 + bsd/crypto/doc/KernelCrypto.txt | 149 + bsd/crypto/rc4/Makefile | 4 - bsd/crypto/sha2/Makefile | 4 - bsd/crypto/sha2/intel/sha256.s | 617 ++ bsd/crypto/sha2/intel/sha256nossse3.s | 649 ++ bsd/crypto/sha2/sha2.c | 38 +- bsd/dev/Makefile | 4 - bsd/dev/chud/chud_bsd_callback.c | 48 +- bsd/dev/dtrace/dtrace.c | 1098 +- bsd/dev/dtrace/dtrace_glue.c | 55 +- bsd/dev/dtrace/dtrace_subr.c | 7 +- bsd/dev/dtrace/fasttrap.c | 24 +- bsd/dev/dtrace/fbt.c | 186 +- bsd/dev/dtrace/lockstat.c | 14 +- bsd/dev/dtrace/profile_prvd.c | 55 +- bsd/dev/dtrace/sdt.c | 240 +- bsd/dev/dtrace/sdt_subr.c | 72 +- bsd/dev/dtrace/systrace.c | 62 +- bsd/dev/i386/conf.c | 1 + bsd/dev/i386/dtrace_isa.c | 37 +- bsd/dev/i386/fasttrap_isa.c | 5 +- bsd/dev/i386/fbt_x86.c | 1729 ++-- bsd/dev/i386/mem.c | 17 +- bsd/dev/i386/munge.s | 139 +- bsd/dev/i386/sdt_x86.c | 112 + bsd/dev/i386/sysctl.c | 167 +- bsd/dev/i386/systemcalls.c | 98 +- bsd/dev/i386/unix_signal.c | 6 +- bsd/dev/memdev.c | 12 +- bsd/dev/ppc/conf.c | 354 - bsd/dev/ppc/cons.c | 139 - bsd/dev/ppc/dtrace_isa.c | 589 -- bsd/dev/ppc/dtrace_subr_ppc.c | 193 - bsd/dev/ppc/fasttrap_isa.c | 734 -- bsd/dev/ppc/fbt_ppc.c | 694 -- bsd/dev/ppc/ffs.c | 59 - bsd/dev/ppc/ffs.s | 70 - bsd/dev/ppc/kern_machdep.c | 263 - bsd/dev/ppc/km.c | 392 - bsd/dev/ppc/mem.c | 241 - bsd/dev/ppc/munge.s | 477 - bsd/dev/ppc/ppc_init.c | 276 - bsd/dev/ppc/sdt_ppc.c | 71 - bsd/dev/ppc/stubs.c | 102 - bsd/dev/ppc/systemcalls.c | 435 - bsd/dev/ppc/unix_signal.c | 953 -- bsd/dev/ppc/xsumas.s | 401 - bsd/dev/random/Makefile | 4 - bsd/dev/unix_startup.c | 72 +- bsd/dev/vn/Makefile | 5 - bsd/dev/vn/vn.c | 14 +- bsd/dev/x86_64/munge.s | 130 + bsd/hfs/Makefile | 4 - bsd/hfs/hfs.h | 131 +- bsd/hfs/hfs_attrlist.c | 46 +- bsd/hfs/hfs_btreeio.c | 17 +- bsd/hfs/hfs_catalog.c | 338 +- bsd/hfs/hfs_catalog.h | 23 +- bsd/hfs/hfs_chash.c | 57 +- bsd/hfs/hfs_cnode.c | 1154 ++- bsd/hfs/hfs_cnode.h | 59 +- bsd/hfs/hfs_cprotect.c | 908 ++ bsd/hfs/hfs_dbg.h | 2 +- bsd/hfs/hfs_encodings.c | 6 +- bsd/hfs/hfs_endian.c | 15 +- bsd/hfs/hfs_format.h | 25 +- bsd/hfs/hfs_fsctl.h | 9 +- bsd/hfs/hfs_hotfiles.c | 31 +- bsd/hfs/hfs_kdebug.h | 54 + bsd/hfs/hfs_link.c | 88 +- bsd/hfs/hfs_lookup.c | 155 +- bsd/hfs/hfs_mount.h | 1 + bsd/hfs/hfs_notification.c | 16 + bsd/hfs/hfs_readwrite.c | 811 +- bsd/hfs/hfs_search.c | 48 +- bsd/hfs/hfs_vfsops.c | 3015 ++++-- bsd/hfs/hfs_vfsutils.c | 583 +- bsd/hfs/hfs_vnops.c | 1418 ++- bsd/hfs/hfs_xattr.c | 1038 +- bsd/hfs/hfscommon/BTree/BTree.c | 8 +- bsd/hfs/hfscommon/BTree/BTreeAllocate.c | 29 +- bsd/hfs/hfscommon/BTree/BTreeScanner.c | 2 +- bsd/hfs/hfscommon/Catalog/FileIDsServices.c | 286 +- bsd/hfs/hfscommon/Misc/FileExtentMapping.c | 118 +- bsd/hfs/hfscommon/Misc/HybridAllocator.c | 533 + bsd/hfs/hfscommon/Misc/VolumeAllocation.c | 3141 +++++- bsd/hfs/hfscommon/headers/FileMgrInternal.h | 36 +- bsd/hfs/hfscommon/headers/HybridAllocator.h | 101 + bsd/hfs/hfscommon/headers/RedBlackTree.h | 969 ++ bsd/i386/param.h | 14 +- bsd/kern/Makefile | 26 + bsd/kern/bsd_init.c | 246 +- bsd/kern/bsd_stubs.c | 96 + bsd/kern/decmpfs.c | 16 +- bsd/kern/imageboot.c | 268 +- bsd/kern/kdebug.c | 1234 ++- bsd/kern/kern_acct.c | 8 +- bsd/kern/kern_aio.c | 2 +- bsd/kern/kern_authorization.c | 63 +- bsd/kern/kern_clock.c | 2 +- bsd/kern/kern_control.c | 157 +- 
bsd/kern/kern_core.c | 42 +- bsd/kern/kern_credential.c | 1171 ++- bsd/kern/kern_descrip.c | 572 +- bsd/kern/kern_event.c | 237 +- bsd/kern/kern_exec.c | 1183 ++- bsd/kern/kern_exit.c | 108 +- bsd/kern/kern_fork.c | 45 +- bsd/kern/kern_lockf.c | 153 +- bsd/kern/kern_malloc.c | 226 +- bsd/kern/kern_memorystatus.c | 839 +- bsd/kern/kern_mib.c | 273 +- bsd/kern/kern_mman.c | 108 +- bsd/kern/kern_newsysctl.c | 580 +- bsd/kern/kern_panicinfo.c | 49 +- bsd/kern/kern_priv.c | 119 + bsd/kern/kern_proc.c | 137 +- bsd/kern/kern_prot.c | 153 +- bsd/kern/kern_resource.c | 281 +- bsd/kern/kern_shutdown.c | 49 +- bsd/kern/kern_sig.c | 156 +- bsd/kern/kern_symfile.c | 227 +- bsd/kern/kern_synch.c | 35 +- bsd/kern/kern_sysctl.c | 1457 ++- bsd/kern/kern_time.c | 3 +- bsd/kern/kern_xxx.c | 2 +- bsd/kern/kpi_mbuf.c | 100 +- bsd/kern/kpi_socket.c | 250 +- bsd/kern/kpi_socketfilter.c | 1210 ++- bsd/kern/mach_loader.c | 552 +- bsd/kern/mach_loader.h | 3 + bsd/kern/mach_process.c | 14 +- bsd/kern/makesyscalls.sh | 42 +- bsd/kern/mcache.c | 48 +- bsd/kern/netboot.c | 171 +- bsd/kern/policy_check.c | 511 + bsd/kern/posix_sem.c | 44 +- bsd/kern/posix_shm.c | 194 +- bsd/kern/proc_info.c | 481 +- bsd/kern/process_policy.c | 460 + bsd/kern/pthread_support.c | 3311 ++++--- bsd/kern/pthread_synch.c | 300 +- bsd/kern/subr_log.c | 234 +- bsd/kern/subr_prof.c | 58 +- bsd/kern/sys_generic.c | 349 +- bsd/kern/sys_pipe.c | 53 +- bsd/kern/sys_socket.c | 28 +- bsd/kern/syscalls.master | 128 +- bsd/kern/sysv_ipc.c | 91 +- bsd/kern/sysv_msg.c | 6 +- bsd/kern/sysv_sem.c | 34 +- bsd/kern/sysv_shm.c | 18 +- bsd/kern/trace.codes | 2149 ++++ bsd/kern/tty.c | 32 +- bsd/kern/tty_ptmx.c | 340 +- bsd/kern/tty_subr.c | 4 +- bsd/kern/tty_tty.c | 14 +- bsd/kern/ubc_subr.c | 276 +- bsd/kern/uipc_domain.c | 99 +- bsd/kern/uipc_mbuf.c | 1958 ++-- bsd/kern/uipc_mbuf2.c | 244 +- bsd/kern/uipc_socket.c | 701 +- bsd/kern/uipc_socket2.c | 237 +- bsd/kern/uipc_syscalls.c | 261 +- bsd/kern/uipc_usrreq.c | 189 +- bsd/kern/vm_pressure.c | 236 + bsd/{ppc/reg.h => kern/vm_pressure.h} | 22 +- bsd/libkern/libkern.h | 11 +- bsd/machine/_limits.h | 4 +- bsd/machine/_param.h | 4 +- bsd/machine/_structs.h | 4 +- bsd/machine/_types.h | 4 +- bsd/machine/dis_tables.h | 4 +- bsd/machine/disklabel.h | 4 +- bsd/machine/endian.h | 4 +- bsd/machine/exec.h | 6 +- bsd/machine/fasttrap_isa.h | 4 +- bsd/machine/limits.h | 4 +- bsd/machine/param.h | 4 +- bsd/machine/profile.h | 4 +- bsd/machine/psl.h | 4 +- bsd/machine/ptrace.h | 4 +- bsd/machine/reboot.h | 4 +- bsd/machine/reg.h | 4 +- bsd/machine/setjmp.h | 4 +- bsd/machine/signal.h | 4 +- bsd/machine/types.h | 4 +- bsd/machine/ucontext.h | 4 +- bsd/machine/vmparam.h | 4 +- bsd/man/man2/Makefile | 14 +- bsd/man/man2/auditon.2 | 16 + bsd/man/man2/dup.2 | 16 +- bsd/man/man2/exchangedata.2 | 3 +- bsd/man/man2/fcntl.2 | 111 +- bsd/man/man2/getattrlist.2 | 85 +- bsd/man/man2/getaudit.2 | 15 +- bsd/man/man2/getauid.2 | 4 +- bsd/man/man2/getdirentries.2 | 20 +- bsd/man/man2/getdirentriesattr.2 | 23 +- bsd/man/man2/getdtablesize.2 | 63 + bsd/man/man2/getfsstat.2 | 113 +- bsd/man/man2/getgroups.2 | 32 +- bsd/man/man2/gettimeofday.2 | 1 - bsd/man/man2/kqueue.2 | 42 +- bsd/man/man2/madvise.2 | 2 +- bsd/man/man2/mmap.2 | 9 + bsd/man/man2/open.2 | 18 +- bsd/man/man2/pathconf.2 | 10 + bsd/man/man2/pipe.2 | 9 +- bsd/man/man2/posix_spawn.2 | 13 +- bsd/man/man2/quotactl.2 | 1 - bsd/man/man2/sem_close.2 | 60 + bsd/man/man2/sem_open.2 | 169 + bsd/man/man2/sem_post.2 | 65 + bsd/man/man2/sem_unlink.2 | 74 + bsd/man/man2/sem_wait.2 | 88 + 
bsd/man/man2/sendfile.2 | 14 +- bsd/man/man2/setaudit.2 | 38 +- bsd/man/man2/setgroups.2 | 6 +- bsd/man/man2/setregid.2 | 92 + bsd/man/man2/setreuid.2 | 90 + bsd/man/man2/setxattr.2 | 7 + bsd/man/man2/shm_open.2 | 179 + bsd/man/man2/shm_unlink.2 | 87 + bsd/man/man2/stat.2 | 152 +- bsd/man/man2/statfs.2 | 27 +- bsd/man/man2/undelete.2 | 108 + .../man3/posix_spawn_file_actions_addclose.3 | 50 +- bsd/man/man3/posix_spawnattr_setflags.3 | 12 +- bsd/man/man4/auditpipe.4 | 16 +- bsd/man/man4/gif.4 | 4 +- bsd/man/man4/icmp6.4 | 2 +- bsd/man/man4/netintro.4 | 3 +- bsd/man/man4/random.4 | 2 +- bsd/man/man5/Makefile | 2 - bsd/man/man5/dir.5 | 16 +- bsd/man/man5/fs.5 | 343 - bsd/man/man5/inode.5 | 1 - bsd/miscfs/Makefile | 4 - bsd/miscfs/devfs/Makefile | 4 - bsd/miscfs/devfs/devfs_tree.c | 2 + bsd/miscfs/devfs/devfs_vfsops.c | 11 +- bsd/miscfs/devfs/devfs_vnops.c | 178 +- bsd/miscfs/devfs/devfsdefs.h | 44 +- bsd/miscfs/fifofs/Makefile | 4 - bsd/miscfs/nullfs/null.h | 118 - bsd/miscfs/nullfs/null_subr.c | 304 - bsd/miscfs/nullfs/null_vfsops.c | 382 - bsd/miscfs/nullfs/null_vnops.c | 570 -- bsd/miscfs/specfs/Makefile | 4 - bsd/miscfs/specfs/spec_vnops.c | 628 +- bsd/miscfs/specfs/specdev.h | 1 + bsd/miscfs/union/Makefile | 4 - bsd/miscfs/union/union.h | 151 - bsd/miscfs/union/union_subr.c | 1604 --- bsd/miscfs/union/union_vfsops.c | 563 -- bsd/miscfs/union/union_vnops.c | 1726 ---- bsd/net/Makefile | 13 +- bsd/net/bpf.c | 447 +- bsd/net/bpf.h | 12 +- bsd/net/bpf_filter.c | 137 +- bsd/net/bpfdesc.h | 16 +- bsd/net/bridgestp.c | 2425 +++++ bsd/net/bridgestp.h | 441 + bsd/net/dlil.c | 4276 ++++---- bsd/net/dlil.h | 121 +- bsd/net/ether_if_module.c | 45 +- bsd/net/ether_inet6_pr_module.c | 174 +- bsd/net/ether_inet_pr_module.c | 512 +- bsd/net/ethernet.h | 28 +- bsd/net/if.c | 1851 ++-- bsd/net/if.h | 51 +- bsd/net/if_atm.h | 136 - bsd/net/if_bond.c | 101 +- bsd/net/if_bridge.c | 5138 ++++++++++ bsd/net/if_bridgevar.h | 499 + bsd/net/if_disc.c | 240 - bsd/net/if_dummy.c | 290 - bsd/net/if_ethersubr.c | 229 - bsd/net/if_fddisubr.c | 637 -- bsd/net/if_gif.c | 134 +- bsd/net/if_gif.h | 1 + bsd/net/if_llreach.c | 565 ++ bsd/net/if_llreach.h | 150 + bsd/net/if_loop.c | 22 +- bsd/net/if_media.h | 2 +- bsd/net/if_mib.c | 64 +- bsd/net/if_mib.h | 2 +- bsd/net/if_pflog.c | 5 +- bsd/net/if_stf.c | 119 +- bsd/net/if_types.h | 5 +- bsd/net/if_utun.c | 20 +- bsd/net/if_var.h | 747 +- bsd/net/if_vlan.c | 641 +- bsd/net/kext_net.h | 63 +- bsd/net/kpi_interface.c | 1243 +-- bsd/net/kpi_interface.h | 112 +- bsd/net/kpi_protocol.c | 20 +- bsd/net/multicast_list.c | 3 +- bsd/net/ndrv.c | 4 + bsd/net/ndrv.h | 2 +- bsd/net/net_osdep.h | 14 +- bsd/net/net_str_id.c | 12 +- bsd/net/netsrc.c | 253 + .../ppc/cframe.h => bsd/net/netsrc.h | 62 +- bsd/net/ntstat.c | 1954 ++++ bsd/net/ntstat.h | 348 + bsd/net/pf.c | 126 +- bsd/net/pf_if.c | 36 +- bsd/net/pf_ioctl.c | 445 +- bsd/net/pf_osfp.c | 11 +- bsd/net/pf_table.c | 13 +- bsd/net/pfkeyv2.h | 1 + bsd/net/pfvar.h | 75 +- bsd/net/ppp_deflate.c | 4 +- bsd/net/route.c | 974 +- bsd/net/route.h | 76 +- bsd/net/rtsock.c | 965 +- bsd/net/rtsock_mip.c | 76 - bsd/netat/Makefile | 4 - bsd/netat/asp_proto.c | 2 +- bsd/netat/at.c | 78 +- bsd/netat/at_var.h | 4 +- bsd/netat/ddp.c | 13 +- bsd/netat/ddp_lap.c | 10 +- bsd/netat/sys_glue.c | 4 +- bsd/netinet/Makefile | 9 +- bsd/netinet/icmp6.h | 195 +- bsd/netinet/if_atm.c | 303 - bsd/netinet/if_atm.h | 77 - bsd/netinet/if_fddi.h | 114 - bsd/netinet/igmp.c | 3949 +++++++- bsd/netinet/igmp.h | 104 +- bsd/netinet/igmp_var.h | 239 +- bsd/netinet/in.c | 
767 +- bsd/netinet/in.h | 204 +- bsd/netinet/in_arp.c | 621 +- bsd/netinet/in_arp.h | 5 +- bsd/netinet/in_cksum.c | 78 +- bsd/netinet/in_dhcp.c | 22 +- bsd/netinet/in_gif.c | 14 +- bsd/netinet/in_mcast.c | 3641 +++++++ bsd/netinet/in_pcb.c | 436 +- bsd/netinet/in_pcb.h | 147 +- bsd/netinet/in_pcblist.c | 383 + bsd/netinet/in_proto.c | 2 +- bsd/netinet/in_rmx.c | 86 +- bsd/netinet/in_tclass.c | 850 ++ bsd/netinet/in_var.h | 285 +- bsd/netinet/ip6.h | 103 +- bsd/netinet/ip_divert.c | 48 +- bsd/netinet/ip_dummynet.c | 129 +- bsd/netinet/ip_dummynet.h | 2 +- bsd/netinet/ip_encap.c | 14 +- bsd/netinet/ip_encap.h | 2 +- bsd/netinet/ip_flow.c | 380 - bsd/netinet/ip_flow.h | 87 - bsd/netinet/ip_fw.h | 2 + bsd/netinet/ip_fw2.c | 49 +- bsd/netinet/ip_fw2.h | 2 + bsd/netinet/ip_fw2_compat.c | 4 +- bsd/netinet/ip_icmp.c | 56 +- bsd/netinet/ip_id.c | 1 + bsd/netinet/ip_input.c | 376 +- bsd/netinet/ip_mroute.c | 40 +- bsd/netinet/ip_mroute.h | 4 +- bsd/netinet/ip_output.c | 1092 +- bsd/netinet/ip_var.h | 60 +- bsd/netinet/kpi_ipfilter.c | 169 +- bsd/netinet/kpi_ipfilter.h | 9 +- bsd/netinet/raw_ip.c | 179 +- bsd/netinet/tcp.h | 88 +- bsd/netinet/tcp_cc.h | 124 + bsd/netinet/tcp_debug.c | 2 +- bsd/netinet/tcp_input.c | 1305 ++- bsd/netinet/tcp_ledbat.c | 434 + bsd/netinet/tcp_newreno.c | 344 + bsd/netinet/tcp_output.c | 318 +- bsd/netinet/tcp_sack.c | 20 +- bsd/netinet/tcp_seq.h | 2 + bsd/netinet/tcp_subr.c | 440 +- bsd/netinet/tcp_timer.c | 953 +- bsd/netinet/tcp_timer.h | 117 +- bsd/netinet/tcp_usrreq.c | 278 +- bsd/netinet/tcp_var.h | 269 +- bsd/netinet/udp_usrreq.c | 395 +- bsd/netinet6/Makefile | 8 +- bsd/netinet6/ah.h | 2 +- bsd/netinet6/ah6.h | 2 +- bsd/netinet6/ah_core.c | 82 +- bsd/netinet6/ah_input.c | 57 +- bsd/netinet6/dest6.c | 10 +- bsd/netinet6/esp6.h | 2 +- bsd/netinet6/esp_core.c | 2 +- bsd/netinet6/esp_input.c | 146 +- bsd/netinet6/frag6.c | 75 +- bsd/netinet6/icmp6.c | 758 +- bsd/netinet6/in6.c | 1609 +-- bsd/netinet6/in6.h | 279 +- bsd/netinet6/in6_cksum.c | 2 +- bsd/netinet6/in6_gif.c | 23 +- bsd/netinet6/in6_gif.h | 2 +- bsd/netinet6/in6_ifattach.c | 448 +- bsd/netinet6/in6_ifattach.h | 40 +- bsd/netinet6/in6_mcast.c | 3490 +++++++ bsd/netinet6/in6_pcb.c | 362 +- bsd/netinet6/in6_pcb.h | 12 +- bsd/netinet6/in6_prefix.c | 241 +- bsd/netinet6/in6_prefix.h | 1 - bsd/netinet6/in6_proto.c | 117 +- bsd/netinet6/in6_rmx.c | 124 +- bsd/netinet6/in6_src.c | 1471 ++- bsd/netinet6/in6_var.h | 391 +- bsd/netinet6/ip6_forward.c | 132 +- bsd/netinet6/ip6_fw.c | 46 +- bsd/netinet6/ip6_fw.h | 2 + bsd/netinet6/ip6_id.c | 304 + bsd/netinet6/ip6_input.c | 863 +- bsd/netinet6/ip6_mroute.c | 141 +- bsd/netinet6/ip6_mroute.h | 16 +- bsd/netinet6/ip6_output.c | 2978 +++--- bsd/netinet6/ip6_var.h | 192 +- bsd/netinet6/ip6protosw.h | 10 +- bsd/netinet6/ipcomp6.h | 2 +- bsd/netinet6/ipcomp_input.c | 11 +- bsd/netinet6/ipsec.c | 156 +- bsd/netinet6/mld6.c | 3577 ++++++- bsd/netinet6/mld6.h | 139 + bsd/netinet6/mld6_var.h | 215 +- bsd/netinet6/nd6.c | 878 +- bsd/netinet6/nd6.h | 243 +- bsd/netinet6/nd6_nbr.c | 1082 +- bsd/netinet6/nd6_rtr.c | 3033 ++++-- bsd/netinet6/raw_ip6.c | 230 +- bsd/netinet6/route6.c | 62 +- bsd/netinet6/scope6.c | 257 +- bsd/netinet6/scope6_var.h | 46 + bsd/netinet6/tcp6_var.h | 32 +- bsd/netinet6/udp6_output.c | 110 +- bsd/netinet6/udp6_usrreq.c | 177 +- bsd/netinet6/udp6_var.h | 2 +- bsd/netkey/Makefile | 4 - bsd/netkey/key.c | 726 +- bsd/nfs/Makefile | 4 - bsd/nfs/krpc.h | 22 +- bsd/nfs/nfs.h | 485 +- bsd/nfs/nfs4_subs.c | 1990 +++- bsd/nfs/nfs4_vnops.c | 6126 ++++++++---- 
bsd/nfs/nfs_bio.c | 260 +- bsd/nfs/nfs_boot.c | 8 +- bsd/nfs/nfs_gss.c | 338 +- bsd/nfs/nfs_gss.h | 17 +- bsd/nfs/nfs_lock.c | 791 +- bsd/nfs/nfs_lock.h | 23 +- bsd/nfs/nfs_node.c | 275 +- bsd/nfs/nfs_serv.c | 211 +- bsd/nfs/nfs_socket.c | 2481 ++++- bsd/nfs/nfs_srvcache.c | 31 +- bsd/nfs/nfs_subs.c | 687 +- bsd/nfs/nfs_syscalls.c | 163 +- bsd/nfs/nfs_vfsops.c | 3856 ++++++-- bsd/nfs/nfs_vnops.c | 1388 ++- bsd/nfs/nfsm_subs.h | 68 +- bsd/nfs/nfsmount.h | 229 +- bsd/nfs/nfsnode.h | 136 +- bsd/nfs/nfsproto.h | 28 +- bsd/nfs/nfsrvcache.h | 11 +- bsd/nfs/rpcv2.h | 5 +- bsd/nfs/xdr_subs.h | 417 +- bsd/ppc/Makefile | 33 - bsd/ppc/_limits.h | 27 - bsd/ppc/_param.h | 46 - bsd/ppc/_structs.h | 217 - bsd/ppc/_types.h | 120 - bsd/ppc/decodePPC.h | 919 -- bsd/ppc/endian.h | 124 - bsd/ppc/exec.h | 108 - bsd/ppc/fasttrap_isa.h | 106 - bsd/ppc/limits.h | 107 - bsd/ppc/param.h | 141 - bsd/ppc/profile.h | 58 - bsd/ppc/reboot.h | 55 - bsd/ppc/setjmp.h | 121 - bsd/ppc/signal.h | 83 - bsd/ppc/types.h | 172 - bsd/ppc/ucontext.h | 73 - bsd/ppc/vmparam.h | 66 - bsd/security/Makefile | 4 - bsd/security/audit/Makefile | 4 - bsd/security/audit/audit.c | 83 +- bsd/security/audit/audit.h | 19 +- bsd/security/audit/audit_arg.c | 8 +- bsd/security/audit/audit_bsd.c | 169 +- bsd/security/audit/audit_bsd.h | 74 +- bsd/security/audit/audit_bsm.c | 22 +- bsd/security/audit/audit_ioctl.h | 25 + bsd/security/audit/audit_private.h | 5 +- bsd/security/audit/audit_session.c | 1949 ++-- bsd/security/audit/audit_syscalls.c | 214 +- bsd/security/audit/audit_worker.c | 33 +- bsd/sys/Makefile | 49 +- bsd/sys/attr.h | 31 +- bsd/sys/buf.h | 62 +- bsd/sys/buf_internal.h | 36 +- bsd/sys/cdefs.h | 122 +- bsd/sys/codesign.h | 3 + bsd/sys/conf.h | 32 +- .../sys/content_protection.h | 32 +- bsd/sys/cprotect.h | 117 +- bsd/sys/decmpfs.h | 2 +- bsd/sys/disk.h | 17 +- bsd/sys/dtrace.h | 70 +- bsd/sys/dtrace_glue.h | 96 +- bsd/sys/dtrace_impl.h | 39 + bsd/sys/errno.h | 56 +- bsd/sys/event.h | 47 +- bsd/sys/fasttrap_impl.h | 5 +- bsd/sys/fbt.h | 15 +- bsd/sys/fcntl.h | 63 +- bsd/sys/file.h | 2 + bsd/sys/file_internal.h | 10 +- bsd/sys/filedesc.h | 6 +- osfmk/ppc/machine_cpu.h => bsd/sys/fileport.h | 39 +- bsd/sys/fsctl.h | 176 +- bsd/sys/fsevents.h | 1 + bsd/sys/fslog.h | 8 + bsd/sys/imageboot.h | 11 +- bsd/sys/imgact.h | 30 +- bsd/{ppc/ptrace.h => sys/imgsrc.h} | 46 +- bsd/sys/kauth.h | 31 +- bsd/sys/kdebug.h | 168 +- bsd/sys/kern_control.h | 6 + bsd/sys/kern_memorystatus.h | 42 +- bsd/sys/kpi_mbuf.h | 61 +- bsd/sys/kpi_socket.h | 32 +- bsd/sys/make_posix_availability.sh | 71 + bsd/sys/make_symbol_aliasing.sh | 86 + bsd/sys/malloc.h | 20 +- bsd/sys/mbuf.h | 623 +- bsd/sys/mcache.h | 51 +- bsd/sys/mman.h | 1 + bsd/sys/mount.h | 89 +- bsd/sys/mount_internal.h | 53 +- bsd/sys/msgbuf.h | 22 +- bsd/sys/namei.h | 56 +- bsd/{dev/ppc/memmove.c => sys/netboot.h} | 38 +- bsd/sys/priv.h | 95 + bsd/sys/proc.h | 36 +- bsd/sys/proc_info.h | 90 +- bsd/sys/proc_internal.h | 42 +- bsd/sys/process_policy.h | 177 + bsd/sys/protosw.h | 16 +- bsd/sys/pthread_internal.h | 41 +- bsd/sys/queue.h | 68 +- bsd/sys/reboot.h | 2 +- bsd/sys/resource.h | 13 +- bsd/sys/sdt_impl.h | 5 +- bsd/sys/signal.h | 6 - bsd/sys/socket.h | 91 +- bsd/sys/socketvar.h | 144 +- bsd/sys/sockio.h | 17 +- bsd/sys/spawn.h | 9 + bsd/sys/spawn_internal.h | 5 +- bsd/sys/stat.h | 13 +- bsd/sys/sys_domain.h | 3 +- bsd/sys/sysctl.h | 123 +- bsd/sys/sysent.h | 5 +- bsd/sys/syslog.h | 164 +- bsd/sys/systm.h | 11 +- bsd/sys/time.h | 4 +- bsd/sys/tree.h | 693 +- bsd/sys/tty.h | 2 + bsd/sys/ubc.h | 1 + 
bsd/sys/ubc_internal.h | 12 + bsd/sys/ucontext.h | 6 - bsd/sys/ucred.h | 12 +- bsd/sys/un.h | 5 + bsd/sys/unistd.h | 3 +- bsd/sys/unpcb.h | 5 +- bsd/sys/user.h | 30 +- bsd/sys/vfs_context.h | 2 + bsd/sys/vnode.h | 312 +- bsd/sys/vnode_if.h | 211 +- bsd/sys/vnode_internal.h | 99 +- bsd/sys/xattr.h | 16 +- bsd/uuid/Makefile | 4 - bsd/vfs/Makefile | 4 - bsd/vfs/kpi_vfs.c | 942 +- bsd/vfs/vfs_attrlist.c | 211 +- bsd/vfs/vfs_bio.c | 787 +- bsd/vfs/vfs_cache.c | 103 +- bsd/vfs/vfs_cluster.c | 538 +- bsd/vfs/vfs_conf.c | 26 +- bsd/vfs/vfs_fsevents.c | 28 +- bsd/vfs/vfs_fslog.c | 82 +- bsd/vfs/vfs_init.c | 18 +- bsd/vfs/vfs_journal.c | 4873 +++++---- bsd/vfs/vfs_journal.h | 51 +- bsd/vfs/vfs_lookup.c | 1533 +-- bsd/vfs/vfs_subr.c | 2005 +++- bsd/vfs/vfs_syscalls.c | 2813 ++++-- bsd/vfs/vfs_utfconv.c | 17 +- bsd/vfs/vfs_vnops.c | 518 +- bsd/vfs/vfs_xattr.c | 87 +- bsd/vfs/vnode_if.c | 95 + bsd/vm/Makefile | 4 - bsd/vm/dp_backing_file.c | 62 +- bsd/vm/vm_unix.c | 481 +- bsd/vm/vnode_pager.c | 120 +- config/BSDKernel.exports | 16 + config/BSDKernel.ppc.exports | 37 - config/Dummy.exports | 1 + config/IOKit.exports | 10 +- config/IOKit.i386.exports | 3 - config/IOKit.ppc.exports | 383 - config/IOKit.x86_64.exports | 3 - config/Libkern.exports | 3 +- config/Libkern.i386.exports | 3 + config/Libkern.ppc.exports | 29 - config/Libkern.x86_64.exports | 4 +- config/MACFramework.exports | 2 + config/MACFramework.ppc.exports | 9 - config/Mach.ppc.exports | 1 - config/Makefile | 153 +- config/MasterVersion | 2 +- config/Private.exports | 42 +- config/Private.i386.exports | 23 +- config/Private.ppc.exports | 2 - config/Private.x86_64.exports | 22 + config/System6.0.exports | 8 +- config/System6.0.i386.exports | 4 +- config/System6.0.ppc.exports | 256 - config/Unsupported.exports | 10 +- config/Unsupported.i386.exports | 4 +- config/Unsupported.ppc.exports | 118 - config/Unsupported.x86_64.exports | 1 + config/version.c | 2 + .../IOKit/AppleKeyStoreInterface.h | 50 +- iokit/IOKit/IOBufferMemoryDescriptor.h | 13 +- iokit/IOKit/IOCatalogue.h | 22 +- iokit/IOKit/IOCommandGate.h | 6 +- iokit/IOKit/IODMACommand.h | 2 +- iokit/IOKit/IODataQueueShared.h | 2 +- iokit/IOKit/IOEventSource.h | 33 +- iokit/IOKit/IOHibernatePrivate.h | 58 +- iokit/IOKit/IOInterruptEventSource.h | 11 + iokit/IOKit/IOKitDebug.h | 29 +- iokit/IOKit/IOKitKeys.h | 1 + iokit/IOKit/IOKitKeysPrivate.h | 22 +- iokit/IOKit/IOKitServer.h | 9 +- iokit/IOKit/IOLib.h | 32 +- iokit/IOKit/IOMemoryCursor.h | 80 - iokit/IOKit/IOMemoryDescriptor.h | 26 +- iokit/IOKit/IOMessage.h | 176 +- iokit/IOKit/IONVRAM.h | 12 +- iokit/IOKit/IOPlatformExpert.h | 9 +- iokit/IOKit/IOService.h | 96 +- iokit/IOKit/IOServicePM.h | 9 + iokit/IOKit/IOSharedLock.h | 79 +- iokit/IOKit/IOStatistics.h | 220 + iokit/IOKit/IOStatisticsPrivate.h | 359 + iokit/IOKit/IOTimeStamp.h | 6 +- iokit/IOKit/IOTimerEventSource.h | 6 +- iokit/IOKit/IOTypes.h | 5 + iokit/IOKit/IOUserClient.h | 16 +- iokit/IOKit/IOWorkLoop.h | 39 +- iokit/IOKit/Makefile | 29 +- iokit/IOKit/i386/IOSharedLockImp.h | 113 - iokit/IOKit/machine/Makefile | 2 - iokit/IOKit/nvram/Makefile | 2 - iokit/IOKit/platform/Makefile | 2 - iokit/IOKit/power/Makefile | 2 - iokit/IOKit/ppc/IODBDMA.h | 367 - iokit/IOKit/ppc/IOSharedLockImp.h | 199 - iokit/IOKit/ppc/Makefile | 32 - iokit/IOKit/pwr_mgt/IOPM.h | 105 +- iokit/IOKit/pwr_mgt/IOPMDeprecated.h | 177 - iokit/IOKit/pwr_mgt/IOPMPrivate.h | 436 +- iokit/IOKit/pwr_mgt/IOPowerConnection.h | 22 +- iokit/IOKit/pwr_mgt/Makefile | 4 +- iokit/IOKit/pwr_mgt/RootDomain.h | 348 +- 
iokit/IOKit/rtc/Makefile | 2 - iokit/IOKit/system_management/Makefile | 2 - iokit/Kernel/IOBufferMemoryDescriptor.cpp | 10 - iokit/Kernel/IOCPU.cpp | 13 +- iokit/Kernel/IOCatalogue.cpp | 248 +- iokit/Kernel/IOCommandGate.cpp | 40 +- iokit/Kernel/IOCommandQueue.cpp | 27 +- iokit/Kernel/IODMACommand.cpp | 26 +- iokit/Kernel/IODMAController.cpp | 4 +- iokit/Kernel/IODeviceTreeSupport.cpp | 54 +- iokit/Kernel/IOEventSource.cpp | 109 +- iokit/Kernel/IOFilterInterruptEventSource.cpp | 45 +- iokit/Kernel/IOHibernateIO.cpp | 663 +- iokit/Kernel/IOHibernateInternal.h | 13 +- iokit/Kernel/IOHibernateRestoreKernel.c | 751 +- iokit/Kernel/IOInterruptController.cpp | 66 +- iokit/Kernel/IOInterruptEventSource.cpp | 90 +- iokit/Kernel/IOKitDebug.cpp | 14 +- iokit/Kernel/IOKitKernelInternal.h | 27 + iokit/Kernel/IOLib.cpp | 80 +- iokit/Kernel/IOMemoryCursor.cpp | 63 - iokit/Kernel/IOMemoryDescriptor.cpp | 117 +- iokit/Kernel/IONVRAM.cpp | 93 +- iokit/Kernel/IOPMPowerSource.cpp | 20 +- iokit/Kernel/IOPMrootDomain.cpp | 6646 ++++++++----- iokit/Kernel/IOPlatformExpert.cpp | 127 +- iokit/Kernel/IORegistryEntry.cpp | 7 +- iokit/Kernel/IOService.cpp | 296 +- iokit/Kernel/IOServicePM.cpp | 4078 +++++--- iokit/Kernel/IOServicePMPrivate.h | 493 +- iokit/Kernel/IOServicePrivate.h | 4 + iokit/Kernel/IOStartIOKit.cpp | 23 +- iokit/Kernel/IOStatistics.cpp | 1279 +++ iokit/Kernel/IOTimerEventSource.cpp | 49 +- iokit/Kernel/IOUserClient.cpp | 287 +- iokit/Kernel/IOWorkLoop.cpp | 356 +- iokit/Kernel/RootDomainUserClient.cpp | 303 +- iokit/Kernel/RootDomainUserClient.h | 27 +- iokit/Kernel/i386/IOKeyStoreHelper.cpp | 104 + iokit/Kernel/i386/IOSharedLock.s | 59 +- iokit/Kernel/ppc/IOAsmSupport.s | 120 - iokit/Kernel/ppc/IODBDMA.cpp | 161 - iokit/Kernel/x86_64/IOSharedLock.s | 55 +- iokit/KernelConfigTables.cpp | 26 +- iokit/Makefile | 5 +- iokit/bsddev/DINetBootHook.cpp | 132 +- iokit/bsddev/IOKitBSDInit.cpp | 8 +- iokit/conf/MASTER | 13 +- iokit/conf/MASTER.i386 | 5 +- iokit/conf/MASTER.ppc | 18 - iokit/conf/MASTER.x86_64 | 5 +- iokit/conf/Makefile | 19 +- iokit/conf/Makefile.i386 | 14 +- iokit/conf/Makefile.ppc | 27 - iokit/conf/Makefile.template | 22 +- iokit/conf/Makefile.x86_64 | 14 +- iokit/conf/files | 4 +- iokit/conf/files.i386 | 5 +- iokit/conf/files.ppc | 20 - iokit/conf/files.x86_64 | 5 +- iokit/conf/tools/Makefile | 32 - iokit/conf/tools/doconf/Makefile | 47 - iokit/conf/tools/doconf/doconf.csh | 321 - kgmacros | 4263 ++++++-- libkern/Makefile | 17 +- libkern/OSKextLib.cpp | 67 +- libkern/OSKextVersion.c | 1 + libkern/c++/OSKext.cpp | 2820 ++++-- libkern/c++/OSMetaClass.cpp | 1 + libkern/c++/OSObject.cpp | 26 - libkern/c++/OSObjectAsm.s | 75 - libkern/c++/OSOrderedSet.cpp | 4 +- libkern/c++/OSRuntime.cpp | 43 +- libkern/c++/OSSet.cpp | 31 +- libkern/c++/OSSymbol.cpp | 11 +- .../TestSerialization/test1/test1_main.cpp | 0 libkern/conf/MASTER | 7 + libkern/conf/MASTER.i386 | 5 +- libkern/conf/MASTER.ppc | 19 - libkern/conf/MASTER.x86_64 | 5 +- libkern/conf/Makefile | 19 +- libkern/conf/Makefile.i386 | 6 + libkern/conf/Makefile.ppc | 7 - libkern/conf/Makefile.template | 17 +- libkern/conf/Makefile.x86_64 | 6 + libkern/conf/files | 3 +- libkern/conf/files.i386 | 7 + libkern/conf/files.ppc | 6 - libkern/conf/files.x86_64 | 7 + libkern/conf/tools/Makefile | 32 - libkern/conf/tools/doconf/Makefile | 47 - libkern/conf/tools/doconf/doconf.csh | 321 - libkern/crypto/intel/sha1edp.h | 51 + libkern/crypto/intel/sha1edp.s | 1481 +++ libkern/crypto/sha1.c | 55 +- libkern/gen/OSAtomicOperations.c | 11 +- 
libkern/gen/OSDebug.cpp | 43 +- libkern/kernel_mach_header.c | 27 + libkern/kmod/Makefile.kmod | 18 +- libkern/kmod/cplus_start.c | 7 +- libkern/kmod/cplus_stop.c | 7 +- libkern/kxld/Makefile | 94 +- {iokit/Kernel => libkern/kxld}/WKdmCompress.c | 6 +- .../Kernel => libkern/kxld}/WKdmDecompress.c | 0 libkern/kxld/i386/WKdmCompress.s | 597 ++ libkern/kxld/i386/WKdmDecompress.s | 675 ++ libkern/kxld/kxld.c | 456 +- libkern/kxld/kxld_array.c | 3 + libkern/kxld/kxld_copyright.c | 34 +- libkern/kxld/kxld_demangle.c | 28 + libkern/kxld/kxld_demangle.h | 28 + libkern/kxld/kxld_kext.c | 3260 ++---- libkern/kxld/kxld_kext.h | 85 +- libkern/kxld/kxld_object.c | 2185 ++++ libkern/kxld/kxld_object.h | 159 + libkern/kxld/kxld_reloc.c | 298 +- libkern/kxld/kxld_reloc.h | 54 +- libkern/kxld/kxld_sect.c | 25 +- libkern/kxld/kxld_sect.h | 5 +- libkern/kxld/kxld_seg.c | 44 +- libkern/kxld/kxld_seg.h | 10 +- libkern/kxld/kxld_state.c | 1072 -- libkern/kxld/kxld_state.h | 155 - libkern/kxld/kxld_stubs.c | 25 +- libkern/kxld/kxld_sym.c | 222 +- libkern/kxld/kxld_sym.h | 19 +- libkern/kxld/kxld_symtab.c | 212 +- libkern/kxld/kxld_symtab.h | 33 +- libkern/kxld/kxld_util.c | 28 +- libkern/kxld/kxld_util.h | 12 +- libkern/kxld/kxld_vtable.c | 531 +- libkern/kxld/kxld_vtable.h | 41 +- libkern/kxld/tests/kextcopyright.c | 29 + libkern/kxld/tests/kxld_array_test.c | 160 + libkern/kxld/tests/kxld_dict_test.c | 44 +- .../psl.h => libkern/kxld/tests/kxld_test.c | 26 +- .../kxld/tests/kxld_test.h | 13 +- libkern/kxld/tests/loadtest.py | 28 + libkern/libkern/Makefile | 13 +- libkern/libkern/OSAtomic.h | 113 +- libkern/libkern/OSAtomic.h.save | 305 - libkern/libkern/OSByteOrder.h | 4 +- libkern/libkern/OSCrossEndian.h | 21 - libkern/libkern/OSDebug.h | 3 + libkern/libkern/OSKextLib.h | 40 +- libkern/libkern/OSKextLibPrivate.h | 122 +- {iokit/Kernel => libkern/libkern}/WKdm.h | 6 +- libkern/libkern/_OSByteOrder.h | 2 +- libkern/libkern/c++/Makefile | 4 - libkern/libkern/c++/OSKext.h | 95 +- libkern/libkern/c++/OSMetaClass.h | 9 +- libkern/libkern/c++/OSObject.h | 21 +- libkern/libkern/c++/OSOrderedSet.h | 4 +- libkern/libkern/c++/OSSet.h | 36 +- libkern/libkern/crypto/Makefile | 4 - libkern/libkern/crypto/sha1.h | 2 + libkern/libkern/kernel_mach_header.h | 6 +- libkern/libkern/kext_request_keys.h | 42 +- libkern/libkern/kxld.h | 45 +- libkern/libkern/kxld_types.h | 26 +- libkern/libkern/machine/Makefile | 4 - libkern/libkern/mkext.h | 1 + libkern/libkern/ppc/Makefile | 31 - libkern/libkern/ppc/OSByteOrder.h | 206 - libkern/libkern/prelink.h | 2 +- libkern/libkern/tree.h | 802 ++ libkern/libkern/version.h.template | 42 +- libkern/ppc/OSAtomic.s | 104 - libkern/ppc/bcmp.s | 92 - libkern/ppc/memcmp.s | 106 - libkern/ppc/strlen.s | 118 - libkern/uuid/Makefile | 4 - libkern/uuid/uuid.c | 21 +- libkern/x86_64/OSAtomic.s | 16 +- libkern/zlib/adler32.c | 22 +- libkern/zlib/arm/adler32vec.s | 428 - libkern/zlib/arm/inffastS.s | 565 -- libkern/zlib/inffast.c | 9 +- libkern/zlib/intel/adler32vec.s | 1050 ++ libkern/zlib/intel/inffastS.s | 1179 +++ libsa/Makefile | 4 +- libsa/bootstrap.cpp | 113 +- libsa/conf/MASTER | 1 - libsa/conf/MASTER.i386 | 1 - libsa/conf/MASTER.ppc | 18 - libsa/conf/MASTER.x86_64 | 1 - libsa/conf/Makefile | 19 +- libsa/conf/Makefile.i386 | 1 + libsa/conf/Makefile.ppc | 7 - libsa/conf/Makefile.template | 17 +- libsa/conf/Makefile.x86_64 | 1 + libsa/conf/files.ppc | 1 - libsa/conf/tools/Makefile | 32 - libsa/conf/tools/doconf/Makefile | 47 - libsa/conf/tools/doconf/doconf.csh | 321 - libsa/lastkernelconstructor.c 
| 4 +- libsa/libsa/Makefile | 2 - libsyscall/BSDmakefile | 141 - libsyscall/GNUmakefile | 8 - libsyscall/Libsyscall.xcconfig | 31 + .../Libsyscall.xcodeproj/project.pbxproj | 1029 ++ libsyscall/Makefile | 65 - libsyscall/Makefile.inc | 52 - libsyscall/Makefile.xbs | 130 - libsyscall/Platforms/MacOSX/i386/syscall.map | 93 + .../Platforms/MacOSX/x86_64/syscall.map | 54 + libsyscall/Platforms/syscall.map | 16 + libsyscall/create-syscalls.pl | 266 - libsyscall/custom/SYS.h | 47 +- libsyscall/custom/__fork.s | 176 +- libsyscall/custom/__getpid.s | 40 +- libsyscall/custom/__gettimeofday.s | 16 +- libsyscall/custom/__lseek.s | 6 +- libsyscall/custom/__pipe.s | 16 +- libsyscall/custom/__psynch_cvbroad.s | 4 +- libsyscall/custom/__psynch_cvwait.s | 4 +- libsyscall/custom/__ptrace.s | 14 +- libsyscall/custom/__sigaltstack.s | 6 +- libsyscall/custom/__sigreturn.s | 6 +- libsyscall/custom/__syscall.s | 8 +- libsyscall/custom/__thread_selfid.s | 4 +- libsyscall/custom/__vfork.s | 51 +- libsyscall/custom/custom.s | 39 +- .../custom/errno.c | 3 +- libsyscall/include/Makefile.inc | 1 - libsyscall/include/processor_facilities.h | 36 - libsyscall/mach/Makefile.inc | 74 - .../mach/abort.h | 15 +- libsyscall/mach/bootstrap_ports.c | 72 - libsyscall/mach/brk.2 | 150 - libsyscall/mach/clock_sleep.c | 13 +- .../mach/dylib_link.c | 4 +- libsyscall/mach/err_iokit.sub | 16 +- libsyscall/mach/err_ipc.sub | 6 +- libsyscall/mach/err_kern.sub | 4 +- libsyscall/mach/err_libkern.sub | 6 +- libsyscall/mach/err_mach_ipc.sub | 6 +- libsyscall/mach/err_server.sub | 32 +- libsyscall/mach/error_codes.c | 4 +- libsyscall/mach/errorlib.h | 15 +- libsyscall/mach/exc_catcher.c | 36 +- libsyscall/mach/exc_catcher.h | 64 + libsyscall/mach/exc_catcher_state.c | 35 +- libsyscall/mach/exc_catcher_state_identity.c | 35 +- libsyscall/mach/fprintf_stderr.c | 22 +- libsyscall/mach/headers/Makefile.inc | 10 - libsyscall/mach/i386/Makefile.inc | 3 - libsyscall/mach/{headers => mach}/errorlib.h | 12 +- libsyscall/mach/{headers => mach}/mach.h | 0 .../mach/{headers => mach}/mach_error.h | 0 libsyscall/mach/{headers => mach}/mach_init.h | 7 +- .../mach/{headers => mach}/mach_interface.h | 0 libsyscall/mach/{headers => mach}/port_obj.h | 0 libsyscall/mach/{headers => mach}/sync.h | 0 libsyscall/mach/{headers => mach}/task.h | 4 - .../mach/{headers => mach}/thread_act.h | 4 - libsyscall/mach/{headers => mach}/vm_task.h | 0 libsyscall/mach/mach_error.c | 16 +- libsyscall/mach/mach_error_string.c | 1 - libsyscall/mach/mach_init.c | 197 +- libsyscall/mach/mach_init_libSystem.c | 58 - libsyscall/mach/mach_init_ports.c | 140 - .../mach/mach_legacy.c | 29 +- libsyscall/mach/mach_msg.c | 68 +- libsyscall/mach/mig_allocate.c | 2 +- libsyscall/mach/mig_deallocate.c | 2 +- libsyscall/mach/mig_reply_port.c | 95 + .../rpc.h => libsyscall/mach/mig_reply_port.h | 13 +- libsyscall/mach/mig_strncpy.c | 23 +- libsyscall/mach/ms_thread_switch.c | 10 +- libsyscall/mach/panic.c | 19 +- libsyscall/mach/port_obj.c | 2 +- libsyscall/mach/ppc/Makefile.inc | 3 - libsyscall/mach/ppc64/Makefile.inc | 4 - libsyscall/mach/sbrk.c | 78 - libsyscall/mach/servers/Makefile.inc | 16 - libsyscall/mach/slot_name.c | 20 - libsyscall/mach/string.c | 120 + .../mach/string.h | 41 +- libsyscall/mach/x86_64/Makefile.inc | 3 - libsyscall/wrappers/__get_cpu_capabilities.s | 49 + .../wrappers/_errno.h | 8 +- .../wrappers/_libc_funcptr.c | 78 +- .../wrappers/_libkernel_init.c | 36 +- .../wrappers/_libkernel_init.h | 38 +- libsyscall/wrappers/cancelable/fcntl-base.c | 65 + 
.../cancelable/fcntl-cancel.c} | 15 +- libsyscall/wrappers/cancelable/fcntl.c | 34 + .../wrappers/cancelable/select-cancel.c | 26 + libsyscall/wrappers/cancelable/select.c | 27 + .../wrappers/cancelable/sigsuspend-cancel.c | 26 + .../cancelable/sigsuspend.c} | 13 +- .../wrappers/init_cpu_capabilities.c | 33 +- libsyscall/wrappers/ioctl.c | 47 + libsyscall/wrappers/kill.c | 43 + libsyscall/wrappers/legacy/accept.c | 56 + libsyscall/wrappers/legacy/bind.c | 56 + libsyscall/wrappers/legacy/connect.c | 56 + libsyscall/wrappers/legacy/getattrlist.c | 65 + libsyscall/wrappers/legacy/getpeername.c | 56 + libsyscall/wrappers/legacy/getsockname.c | 56 + libsyscall/wrappers/legacy/kill.c | 30 + libsyscall/wrappers/legacy/lchown.c | 55 + .../wrappers/legacy/listen.c | 47 +- libsyscall/wrappers/legacy/mprotect.c | 69 + libsyscall/wrappers/legacy/msync.c | 53 + libsyscall/wrappers/legacy/munmap.c | 65 + libsyscall/wrappers/legacy/open.c | 54 + libsyscall/wrappers/legacy/recvfrom.c | 55 + libsyscall/wrappers/legacy/recvmsg.c | 55 + libsyscall/wrappers/legacy/select-pre1050.c | 32 + libsyscall/wrappers/legacy/select.c | 31 + libsyscall/wrappers/legacy/sendmsg.c | 56 + libsyscall/wrappers/legacy/sendto.c | 56 + libsyscall/wrappers/legacy/setattrlist.c | 65 + libsyscall/wrappers/legacy/sigsuspend.c | 31 + libsyscall/wrappers/legacy/socketpair.c | 57 + libsyscall/wrappers/memcpy.c | 143 + libsyscall/wrappers/remove-counter.c | 49 + libsyscall/wrappers/rename.c | 33 + libsyscall/wrappers/rmdir.c | 33 + libsyscall/wrappers/select-base.c | 82 + libsyscall/wrappers/sigsuspend-base.c | 41 + libsyscall/wrappers/unix03/chmod.c | 62 + libsyscall/wrappers/unix03/fchmod.c | 62 + libsyscall/wrappers/unix03/getrlimit.c | 46 + libsyscall/wrappers/unix03/mmap.c | 62 + libsyscall/wrappers/unix03/setrlimit.c | 46 + libsyscall/wrappers/unlink.c | 33 + libsyscall/xcodescripts/compat-symlinks.sh | 32 + libsyscall/xcodescripts/compile-syscalls.pl | 130 + libsyscall/xcodescripts/create-syscalls.pl | 403 + libsyscall/xcodescripts/mach_install_mig.sh | 97 + makedefs/MakeInc.cmd | 127 +- makedefs/MakeInc.def | 202 +- makedefs/MakeInc.dir | 147 +- makedefs/MakeInc.rule | 131 +- osfmk/Makefile | 16 +- osfmk/UserNotification/Makefile | 4 - osfmk/UserNotification/UNDRequest.defs | 5 +- osfmk/chud/chud_cpu.c | 12 - osfmk/chud/chud_thread.c | 47 +- osfmk/chud/chud_xnu.h | 23 +- osfmk/chud/chud_xnu_glue.h | 4 +- osfmk/chud/chud_xnu_private.h | 4 +- osfmk/chud/i386/chud_osfmk_callback_i386.c | 90 +- osfmk/chud/i386/chud_thread_i386.c | 7 - osfmk/chud/ppc/chud_cpu_asm.h | 38 - osfmk/chud/ppc/chud_cpu_asm.s | 593 -- osfmk/chud/ppc/chud_cpu_ppc.c | 1182 --- osfmk/chud/ppc/chud_osfmk_callback_ppc.c | 549 - osfmk/chud/ppc/chud_spr.h | 273 - osfmk/chud/ppc/chud_thread_ppc.c | 586 -- osfmk/chud/ppc/chud_xnu_private.h | 59 - osfmk/conf/MASTER | 29 +- osfmk/conf/MASTER.i386 | 6 +- osfmk/conf/MASTER.ppc | 67 - osfmk/conf/MASTER.x86_64 | 12 +- osfmk/conf/Makefile | 17 +- osfmk/conf/Makefile.i386 | 15 +- osfmk/conf/Makefile.ppc | 76 - osfmk/conf/Makefile.template | 24 +- osfmk/conf/Makefile.x86_64 | 27 +- osfmk/conf/files | 15 +- osfmk/conf/files.i386 | 44 +- osfmk/conf/files.ppc | 120 - osfmk/conf/files.x86_64 | 41 +- osfmk/conf/tools/Makefile | 32 - osfmk/conf/tools/doconf/Makefile | 47 - osfmk/conf/tools/doconf/doconf.csh | 321 - osfmk/console/i386/serial_console.c | 10 +- osfmk/console/ppc/serial_console.c | 329 - osfmk/console/ppc/video_scroll.s | 141 - osfmk/console/serial_general.c | 1 - osfmk/console/serial_protos.h | 11 +- 
osfmk/console/video_console.c | 10 +- osfmk/ddb/db_command.c | 70 - osfmk/ddb/db_print.c | 2 +- osfmk/ddb/db_sym.c | 4 +- osfmk/ddb/db_trap.c | 4 - osfmk/ddb/db_variables.c | 4 +- osfmk/ddb/db_variables.h | 2 +- osfmk/ddb/makedis.c | 5 +- osfmk/default_pager/default_pager.c | 1 + osfmk/default_pager/default_pager_internal.h | 13 +- osfmk/default_pager/dp_backing_store.c | 448 +- osfmk/default_pager/dp_memory_object.c | 28 + osfmk/device/device.defs | 8 +- osfmk/device/iokit_rpc.c | 49 +- osfmk/device/subrs.c | 8 +- osfmk/gssd/Makefile | 4 - osfmk/gssd/gssd_mach.defs | 106 +- osfmk/gssd/gssd_mach_types.h | 45 +- osfmk/i386/AT386/model_dep.c | 242 +- osfmk/i386/Diagnostics.h | 4 +- osfmk/i386/Makefile | 8 +- osfmk/i386/acpi.c | 23 +- osfmk/i386/asm.h | 100 + osfmk/i386/bsd_i386.c | 283 +- osfmk/i386/bsd_i386_native.c | 283 + osfmk/i386/bzero.s | 2 +- osfmk/i386/commpage/atomic.s | 396 - osfmk/i386/commpage/bcopy_scalar.s | 136 - osfmk/i386/commpage/bcopy_sse2.s | 473 - osfmk/i386/commpage/bcopy_sse3x.s | 823 -- osfmk/i386/commpage/bcopy_sse3x_64.s | 820 -- osfmk/i386/commpage/bcopy_sse42.s | 311 - osfmk/i386/commpage/bcopy_sse42_64.s | 301 - osfmk/i386/commpage/bzero_scalar.s | 115 - osfmk/i386/commpage/bzero_sse2.s | 162 - osfmk/i386/commpage/bzero_sse2_64.s | 161 - osfmk/i386/commpage/bzero_sse42.s | 151 - osfmk/i386/commpage/bzero_sse42_64.s | 148 - osfmk/i386/commpage/cacheflush.s | 79 - osfmk/i386/commpage/commpage.c | 149 +- osfmk/i386/commpage/commpage.h | 1 + osfmk/i386/commpage/commpage_asm.s | 78 - osfmk/i386/commpage/commpage_gettimeofday.s | 122 - .../commpage/commpage_mach_absolute_time.s | 173 - osfmk/i386/commpage/commpage_sigs.c | 189 - osfmk/i386/commpage/cpu_number.s | 77 - osfmk/i386/commpage/fifo_queues.s | 74 - osfmk/i386/commpage/longcopy_sse3x.s | 221 - osfmk/i386/commpage/longcopy_sse3x_64.s | 210 - osfmk/i386/commpage/memset_pattern_sse2.s | 183 - osfmk/i386/commpage/memset_pattern_sse2_64.s | 184 - osfmk/i386/commpage/pthreads.s | 111 - osfmk/i386/commpage/spinlocks.s | 189 - osfmk/i386/copyio.c | 621 ++ osfmk/i386/cpu.c | 16 +- osfmk/i386/cpu_capabilities.h | 131 +- osfmk/i386/cpu_data.h | 90 +- osfmk/i386/cpuid.c | 57 +- osfmk/i386/cpuid.h | 2 +- osfmk/i386/cswitch.s | 12 +- osfmk/i386/db_interface.c | 2 + osfmk/i386/db_machdep.h | 4 +- osfmk/i386/db_trace.c | 18 +- osfmk/i386/endian.h | 8 +- osfmk/i386/etimer.c | 126 +- osfmk/i386/fpu.c | 37 +- osfmk/i386/fpu.h | 10 +- osfmk/i386/gdt.c | 14 +- osfmk/i386/genassym.c | 107 +- osfmk/i386/hibernate_i386.c | 18 +- osfmk/i386/hibernate_restore.c | 96 +- osfmk/i386/hw_lock_types.h | 2 +- osfmk/i386/i386_init.c | 114 +- osfmk/i386/i386_lock.s | 705 +- osfmk/i386/i386_vm_init.c | 271 +- osfmk/i386/idle_pt.c | 16 +- osfmk/i386/idt.s | 545 +- osfmk/i386/idt64.s | 717 +- osfmk/i386/ipl.h | 112 - osfmk/i386/lapic.c | 866 +- osfmk/i386/lapic.h | 2 + osfmk/i386/lapic_native.c | 919 ++ osfmk/i386/ldt.c | 12 +- osfmk/i386/locks.h | 82 +- osfmk/i386/locks_i386.c | 219 +- osfmk/i386/locore.s | 1241 +-- osfmk/i386/loose_ends.c | 644 +- osfmk/i386/machine_check.c | 53 +- osfmk/i386/machine_check.h | 9 +- osfmk/i386/machine_cpu.h | 1 + osfmk/i386/machine_routines.c | 79 +- osfmk/i386/machine_routines.h | 31 +- osfmk/i386/machine_routines_asm.s | 93 +- osfmk/i386/misc_protos.h | 5 + osfmk/i386/mp.c | 628 +- osfmk/i386/mp.h | 51 +- osfmk/i386/mp_desc.c | 103 +- osfmk/i386/mp_desc.h | 65 +- osfmk/i386/mp_events.h | 2 +- osfmk/i386/mp_native.c | 126 + osfmk/i386/mtrr.c | 5 +- osfmk/i386/pal_hibernate.h | 45 + .../{ppc/cpu_number.h => 
i386/pal_lock_asm.h} | 19 +- osfmk/i386/pal_native.h | 102 + osfmk/i386/pal_routines.c | 349 + osfmk/i386/pal_routines.h | 184 + osfmk/i386/pal_routines_asm.s | 192 + osfmk/{ppc/mp.h => i386/pal_rtclock_asm.h} | 15 +- osfmk/i386/pcb.c | 665 +- osfmk/i386/pcb_native.c | 652 ++ osfmk/i386/pmCPU.c | 131 +- osfmk/i386/pmCPU.h | 19 +- osfmk/i386/pmap.c | 811 +- osfmk/i386/pmap.h | 119 +- osfmk/i386/pmap_common.c | 505 + osfmk/i386/pmap_internal.h | 515 +- osfmk/i386/pmap_pcid.h | 99 + osfmk/i386/pmap_x86_common.c | 438 +- osfmk/i386/proc_reg.h | 122 +- osfmk/i386/rtclock.c | 231 +- osfmk/i386/rtclock_asm.h | 290 + .../i386/{rtclock.h => rtclock_asm_native.h} | 64 +- osfmk/i386/rtclock_native.c | 202 + .../{ppc/rtclock.h => i386/rtclock_protos.h} | 47 +- osfmk/i386/seg.h | 4 +- osfmk/i386/serial_io.h | 4 +- osfmk/i386/simple_lock.h | 9 +- osfmk/i386/start.s | 27 +- osfmk/i386/startup64.c | 4 - osfmk/i386/thread.h | 166 +- osfmk/i386/trap.c | 448 +- osfmk/i386/trap.h | 11 +- osfmk/i386/trap_native.c | 295 + osfmk/i386/tsc.c | 1 - osfmk/i386/tsc.h | 4 +- osfmk/i386/ucode.c | 201 + osfmk/i386/ucode.h | 30 + osfmk/i386/user_ldt.c | 4 +- osfmk/i386/vmx/vmx_asm.h | 4 +- osfmk/i386/vmx/vmx_cpu.c | 13 +- osfmk/ipc/ipc_entry.c | 11 +- osfmk/ipc/ipc_entry.h | 2 + osfmk/ipc/ipc_init.c | 21 +- osfmk/ipc/ipc_kmsg.c | 378 +- osfmk/ipc/ipc_kmsg.h | 21 +- osfmk/ipc/ipc_labelh.c | 4 + osfmk/ipc/ipc_mqueue.c | 37 +- osfmk/ipc/ipc_mqueue.h | 2 +- osfmk/ipc/ipc_notify.c | 18 + osfmk/ipc/ipc_notify.h | 5 + osfmk/ipc/ipc_object.c | 46 +- osfmk/ipc/ipc_object.h | 5 + osfmk/ipc/ipc_port.c | 316 +- osfmk/ipc/ipc_port.h | 63 +- osfmk/ipc/ipc_pset.c | 4 +- osfmk/ipc/ipc_right.c | 357 +- osfmk/ipc/ipc_right.h | 15 +- osfmk/ipc/ipc_space.c | 5 + osfmk/ipc/ipc_table.c | 14 +- osfmk/ipc/ipc_table.h | 6 +- osfmk/ipc/ipc_types.h | 1 + osfmk/ipc/mach_debug.c | 32 +- osfmk/ipc/mach_msg.c | 32 +- osfmk/ipc/mach_port.c | 87 +- osfmk/kdp/kdp.c | 156 +- osfmk/kdp/kdp_core.h | 15 +- osfmk/kdp/kdp_dyld.h | 2 +- osfmk/kdp/kdp_en_debugger.h | 1 + osfmk/kdp/kdp_private.h | 1 + osfmk/kdp/kdp_udp.c | 498 +- osfmk/kdp/ml/i386/kdp_vm.c | 102 +- osfmk/kdp/ml/i386/kdp_x86_common.c | 10 +- osfmk/kdp/ml/ppc/kdp_asm.s | 95 - osfmk/kdp/ml/ppc/kdp_machdep.c | 827 -- osfmk/kdp/ml/ppc/kdp_misc.s | 71 - osfmk/kdp/ml/ppc/kdp_vm.c | 570 -- osfmk/kdp/ml/x86_64/kdp_machdep.c | 5 + osfmk/kdp/ml/x86_64/kdp_vm.c | 37 +- osfmk/kern/Makefile | 1 + osfmk/kern/ast.c | 11 +- osfmk/kern/audit_sessionport.c | 139 +- osfmk/kern/audit_sessionport.h | 8 +- osfmk/kern/bsd_kern.c | 54 +- osfmk/kern/call_entry.h | 121 +- osfmk/kern/clock.c | 104 +- osfmk/kern/clock_oldops.c | 2 +- osfmk/kern/debug.c | 103 +- osfmk/kern/debug.h | 43 +- osfmk/kern/etimer.h | 13 +- osfmk/kern/exception.c | 1 - osfmk/kern/extmod_statistics.c | 136 + .../PPCcalls.c => kern/extmod_statistics.h} | 33 +- osfmk/kern/hibernate.c | 11 + osfmk/kern/host.c | 121 +- osfmk/kern/host.h | 4 +- osfmk/kern/host_notify.c | 12 +- osfmk/kern/host_statistics.h | 7 - osfmk/kern/ipc_kobject.c | 2 - osfmk/kern/ipc_mig.c | 46 +- osfmk/kern/ipc_misc.c | 99 +- osfmk/kern/ipc_misc.h | 6 +- osfmk/kern/kalloc.c | 142 +- osfmk/kern/kalloc.h | 6 +- osfmk/kern/kern_types.h | 9 + osfmk/kern/kext_alloc.c | 7 +- osfmk/kern/kmod.c | 8 +- osfmk/kern/locks.c | 34 +- osfmk/kern/locks.h | 11 +- osfmk/kern/mach_param.h | 2 +- osfmk/kern/machine.c | 23 +- osfmk/kern/misc_protos.h | 5 +- osfmk/kern/mk_sp.c | 26 +- osfmk/kern/pms.h | 11 - osfmk/kern/printf.c | 16 +- osfmk/kern/priority.c | 181 +- osfmk/kern/processor.c | 41 +- 
osfmk/kern/processor.h | 32 +- osfmk/kern/processor_data.h | 42 + osfmk/kern/queue.c | 65 +- osfmk/kern/queue.h | 52 +- osfmk/kern/sched.h | 126 +- osfmk/kern/sched_average.c | 38 +- osfmk/kern/sched_fixedpriority.c | 727 ++ osfmk/kern/sched_grrr.c | 956 ++ osfmk/kern/sched_prim.c | 1623 ++- osfmk/kern/sched_prim.h | 368 +- osfmk/kern/sched_proto.c | 597 ++ osfmk/kern/stack.c | 111 +- osfmk/kern/startup.c | 113 +- osfmk/kern/startup.h | 1 + osfmk/kern/sync_lock.c | 10 +- osfmk/kern/sync_sema.c | 4 +- osfmk/kern/syscall_subr.c | 44 +- osfmk/kern/syscall_sw.c | 4 + osfmk/kern/syscall_sw.h | 29 +- osfmk/kern/task.c | 281 +- osfmk/kern/task.h | 211 +- osfmk/kern/task_policy.c | 1154 ++- osfmk/kern/thread.c | 195 +- osfmk/kern/thread.h | 93 +- osfmk/kern/thread_act.c | 78 +- osfmk/kern/thread_call.c | 187 +- osfmk/kern/thread_policy.c | 178 +- osfmk/kern/timer_call.c | 552 +- osfmk/kern/timer_call.h | 31 +- osfmk/kern/timer_queue.h | 26 +- osfmk/kern/wait_queue.c | 105 +- osfmk/kern/wait_queue.h | 19 +- osfmk/kern/zalloc.c | 1332 ++- osfmk/kern/zalloc.h | 76 +- osfmk/kextd/Makefile | 4 - osfmk/libsa/machine/types.h | 4 +- osfmk/libsa/ppc/types.h | 71 - osfmk/libsa/types.h | 1 - osfmk/lockd/Makefile | 4 - osfmk/mach/Makefile | 38 +- osfmk/mach/branch_predicates.h | 35 + osfmk/mach/clock_types.h | 1 + osfmk/mach/host_info.h | 45 + osfmk/mach/i386/_structs.h | 28 +- osfmk/mach/i386/_types.h | 221 - osfmk/mach/i386/sdt_isa.h | 8 +- osfmk/mach/i386/thread_status.h | 123 +- osfmk/mach/i386/vm_param.h | 25 +- osfmk/mach/mach_host.defs | 14 + osfmk/mach/mach_port.defs | 13 +- osfmk/mach/mach_traps.h | 51 +- osfmk/mach/mach_types.defs | 5 +- osfmk/mach/mach_types.h | 1 + osfmk/mach/mach_vm.defs | 4 + osfmk/mach/machine.h | 2 + osfmk/mach/machine/asm.h | 4 +- osfmk/mach/machine/boolean.h | 4 +- osfmk/mach/machine/exception.h | 4 +- osfmk/mach/machine/kern_return.h | 4 +- osfmk/mach/machine/machine_types.defs | 4 +- osfmk/mach/machine/ndr_def.h | 4 +- osfmk/mach/machine/processor_info.h | 4 +- osfmk/mach/machine/rpc.h | 4 +- osfmk/mach/machine/sdt.h | 64 + osfmk/mach/machine/sdt_isa.h | 4 +- osfmk/mach/machine/syscall_sw.h | 4 +- osfmk/mach/machine/thread_state.h | 4 +- osfmk/mach/machine/thread_status.h | 4 +- osfmk/mach/machine/vm_param.h | 4 +- osfmk/mach/machine/vm_types.h | 4 +- osfmk/mach/memory_object.defs | 4 + osfmk/mach/memory_object_types.h | 17 +- osfmk/mach/message.h | 4 +- osfmk/mach/notify.defs | 12 +- osfmk/mach/notify.h | 11 +- osfmk/mach/port.h | 4 +- osfmk/mach/ppc/Makefile | 35 - osfmk/mach/ppc/_structs.h | 392 - osfmk/mach/ppc/_types.h | 234 - osfmk/mach/ppc/boolean.h | 74 - osfmk/mach/ppc/exception.h | 119 - osfmk/mach/ppc/kern_return.h | 74 - osfmk/mach/ppc/machine_types.defs | 126 - osfmk/mach/ppc/ndr_def.h | 43 - osfmk/mach/ppc/processor_info.h | 176 - osfmk/mach/ppc/sdt_isa.h | 427 - osfmk/mach/ppc/syscall_sw.h | 79 - osfmk/mach/ppc/thread_status.h | 150 - osfmk/mach/ppc/vm_param.h | 110 - osfmk/mach/ppc/vm_types.h | 157 - osfmk/mach/processor.defs | 2 +- osfmk/mach/security.defs | 2 +- osfmk/mach/shared_region.h | 18 +- osfmk/mach/syscall_sw.h | 4 +- osfmk/mach/task.defs | 28 +- osfmk/mach/task_info.h | 30 +- osfmk/mach/task_policy.h | 1 + osfmk/mach/thread_act.defs | 14 +- osfmk/mach/thread_policy.h | 16 + osfmk/mach/vm_prot.h | 9 +- osfmk/mach/vm_region.h | 2 + osfmk/mach/vm_statistics.h | 64 +- osfmk/mach_debug/mach_debug_types.defs | 17 +- osfmk/mach_debug/zone_info.h | 47 +- osfmk/machine/Makefile | 2 + osfmk/machine/asm.h | 4 +- osfmk/machine/ast.h | 4 +- 
osfmk/machine/ast_types.h | 4 +- osfmk/machine/commpage.h | 4 +- osfmk/machine/cpu_affinity.h | 4 +- osfmk/machine/cpu_capabilities.h | 8 +- osfmk/machine/cpu_data.h | 4 +- osfmk/machine/cpu_number.h | 4 +- osfmk/machine/db_machdep.h | 4 +- osfmk/machine/endian.h | 4 +- osfmk/machine/io_map_entries.h | 4 +- osfmk/machine/lock.h | 4 +- osfmk/machine/locks.h | 4 +- osfmk/machine/machine_cpu.h | 4 +- osfmk/machine/machine_routines.h | 4 +- osfmk/machine/machine_rpc.h | 4 +- osfmk/machine/machlimits.h | 4 +- osfmk/machine/machparam.h | 4 +- .../machine/pal_hibernate.h | 18 +- .../machine/pal_routines.h | 11 +- osfmk/machine/pmap.h | 4 +- osfmk/machine/sched_param.h | 4 +- osfmk/machine/setjmp.h | 4 +- osfmk/machine/simple_lock.h | 4 +- osfmk/machine/task.h | 4 +- osfmk/machine/thread.h | 4 +- osfmk/machine/timer.h | 4 +- osfmk/machine/trap.h | 4 +- osfmk/machine/vm_tuning.h | 4 +- osfmk/machine/xpr.h | 4 +- osfmk/pmc/pmc.c | 33 +- osfmk/pmc/pmc.h | 10 + osfmk/ppc/AltiAssist.s | 91 - osfmk/ppc/Diagnostics.c | 571 -- osfmk/ppc/Diagnostics.h | 124 - osfmk/ppc/Emulate.s | 1445 --- osfmk/ppc/Emulate64.s | 957 -- osfmk/ppc/Firmware.h | 166 - osfmk/ppc/Firmware.s | 2517 ----- osfmk/ppc/FirmwareC.c | 338 - osfmk/ppc/FirmwareCalls.h | 81 - osfmk/ppc/Makefile | 36 - osfmk/ppc/PPCcalls.h | 84 - osfmk/ppc/Performance.s | 124 - osfmk/ppc/PseudoKernel.c | 450 - osfmk/ppc/PseudoKernel.h | 99 - osfmk/ppc/_setjmp.s | 194 - osfmk/ppc/aligned_data.s | 209 - osfmk/ppc/asm.h | 781 -- osfmk/ppc/ast.h | 43 - osfmk/ppc/ast_types.h | 41 - osfmk/ppc/atomic_switch.h | 130 - osfmk/ppc/atomic_switch.s | 238 - osfmk/ppc/bat_init.c | 301 - osfmk/ppc/bcopy.s | 981 -- osfmk/ppc/bcopytest.c | 621 -- osfmk/ppc/bits.s | 111 - osfmk/ppc/boot.h | 28 - osfmk/ppc/bzero.s | 331 - osfmk/ppc/cache.s | 389 - osfmk/ppc/commpage/atomic.s | 280 - osfmk/ppc/commpage/bcopy_64.s | 306 - osfmk/ppc/commpage/bcopy_970.s | 626 -- osfmk/ppc/commpage/bcopy_g3.s | 275 - osfmk/ppc/commpage/bcopy_g4.s | 622 -- osfmk/ppc/commpage/bigcopy_970.s | 331 - osfmk/ppc/commpage/bzero_128.s | 173 - osfmk/ppc/commpage/bzero_32.s | 129 - osfmk/ppc/commpage/cacheflush.s | 110 - osfmk/ppc/commpage/commpage.c | 679 -- osfmk/ppc/commpage/commpage.h | 92 - osfmk/ppc/commpage/commpage_asm.s | 272 - osfmk/ppc/commpage/gettimeofday.s | 255 - osfmk/ppc/commpage/mach_absolute_time.s | 80 - osfmk/ppc/commpage/memset_64.s | 96 - osfmk/ppc/commpage/memset_g3.s | 132 - osfmk/ppc/commpage/memset_g4.s | 131 - osfmk/ppc/commpage/memset_g5.s | 168 - osfmk/ppc/commpage/pthread.s | 121 - osfmk/ppc/commpage/spinlocks.s | 247 - osfmk/ppc/conf.c | 87 - osfmk/ppc/console_feed.c | 266 - osfmk/ppc/console_feed_entries.h | 48 - osfmk/ppc/cpu.c | 1184 --- osfmk/ppc/cpu_capabilities.h | 254 - osfmk/ppc/cpu_data.h | 63 - osfmk/ppc/cpu_internal.h | 89 - osfmk/ppc/cswtch.s | 2486 ----- osfmk/ppc/db_asm.s | 107 - osfmk/ppc/db_disasm.c | 232 - osfmk/ppc/db_interface.c | 592 -- osfmk/ppc/db_low_trace.c | 1106 --- osfmk/ppc/db_low_trace.h | 62 - osfmk/ppc/db_machdep.h | 186 - osfmk/ppc/db_trace.c | 1122 --- osfmk/ppc/endian.h | 93 - osfmk/ppc/etimer.c | 195 - osfmk/ppc/exception.h | 693 -- osfmk/ppc/fpu_protos.h | 41 - osfmk/ppc/genassym.c | 1438 --- osfmk/ppc/hexfont.h | 301 - osfmk/ppc/hibernate_ppc.c | 213 - osfmk/ppc/hibernate_restore.s | 192 - osfmk/ppc/hw_exception.s | 1832 ---- osfmk/ppc/hw_lock.s | 2187 ---- osfmk/ppc/hw_lock_types.h | 74 - osfmk/ppc/hw_perfmon.c | 959 -- osfmk/ppc/hw_perfmon.h | 122 - osfmk/ppc/hw_perfmon_mmcr.h | 186 - osfmk/ppc/hw_vm.s | 8794 ----------------- 
osfmk/ppc/instrumentation.h | 61 - osfmk/ppc/interrupt.c | 187 - osfmk/ppc/io_map.c | 131 - osfmk/ppc/io_map_entries.h | 45 - osfmk/ppc/lock.h | 86 - osfmk/ppc/locks.h | 220 - osfmk/ppc/locks_ppc.c | 2360 ----- osfmk/ppc/low_trace.h | 92 - osfmk/ppc/lowglobals.h | 102 - osfmk/ppc/lowmem_vectors.s | 4010 -------- osfmk/ppc/machine_routines.c | 847 -- osfmk/ppc/machine_routines.h | 338 - osfmk/ppc/machine_routines_asm.s | 2345 ----- osfmk/ppc/machine_task.c | 85 - osfmk/ppc/machlimits.h | 92 - osfmk/ppc/machparam.h | 86 - osfmk/ppc/mappings.c | 1805 ---- osfmk/ppc/mappings.h | 499 - osfmk/ppc/mcount.s | 81 - osfmk/ppc/mem.h | 68 - osfmk/ppc/misc.c | 120 - osfmk/ppc/misc_asm.s | 287 - osfmk/ppc/misc_protos.h | 138 - osfmk/ppc/model_dep.c | 1045 -- osfmk/ppc/movc.s | 1303 --- osfmk/ppc/new_screen.h | 48 - osfmk/ppc/pcb.c | 672 -- osfmk/ppc/pmap.c | 2121 ---- osfmk/ppc/pmap.h | 338 - osfmk/ppc/pms.c | 743 -- osfmk/ppc/pmsCPU.c | 313 - osfmk/ppc/ppc_disasm.i | 234 - osfmk/ppc/ppc_init.c | 302 - osfmk/ppc/ppc_vm_init.c | 427 - osfmk/ppc/proc_reg.h | 403 - osfmk/ppc/rtclock.c | 306 - osfmk/ppc/savearea.c | 327 - osfmk/ppc/savearea.h | 393 - osfmk/ppc/savearea_asm.s | 1621 --- osfmk/ppc/scc_8530.h | 428 - osfmk/ppc/sched_param.h | 70 - osfmk/ppc/screen_switch.h | 141 - osfmk/ppc/serial_defs.h | 83 - osfmk/ppc/serial_io.c | 659 -- osfmk/ppc/serial_io.h | 150 - osfmk/ppc/setjmp.h | 57 - osfmk/ppc/simple_lock.h | 178 - osfmk/ppc/skiplists.s | 1297 --- osfmk/ppc/spec_reg.h | 47 - osfmk/ppc/start.s | 1283 --- osfmk/ppc/status.c | 1820 ---- osfmk/ppc/task.h | 63 - osfmk/ppc/thread.h | 212 - osfmk/ppc/trap.c | 1012 -- osfmk/ppc/trap.h | 105 - osfmk/ppc/vm_tuning.h | 35 - osfmk/ppc/vmachmon.c | 2024 ---- osfmk/ppc/vmachmon.h | 498 - osfmk/ppc/vmachmon_asm.s | 2368 ----- osfmk/profiling/Makefile | 6 - osfmk/profiling/machine/profile-md.h | 4 +- osfmk/profiling/ppc/profile-md.h | 144 - osfmk/vm/bsd_vm.c | 58 +- osfmk/vm/default_freezer.c | 616 ++ osfmk/vm/default_freezer.h | 160 + osfmk/vm/device_vm.c | 3 +- osfmk/vm/memory_object.c | 613 +- osfmk/vm/memory_object.h | 11 + osfmk/vm/pmap.h | 24 +- osfmk/vm/vm_apple_protect.c | 12 +- osfmk/vm/vm_debug.c | 8 +- osfmk/vm/vm_fault.c | 404 +- osfmk/vm/vm_fault.h | 3 +- osfmk/vm/vm_init.c | 3 + osfmk/vm/vm_kern.c | 52 +- osfmk/vm/vm_map.c | 1433 ++- osfmk/vm/vm_map.h | 68 +- osfmk/vm/vm_map_store.c | 176 + osfmk/vm/vm_map_store.h | 135 + osfmk/vm/vm_map_store_ll.c | 246 + .../machdep.c => osfmk/vm/vm_map_store_ll.h | 52 +- osfmk/vm/vm_map_store_rb.c | 166 + osfmk/vm/vm_map_store_rb.h | 46 + osfmk/vm/vm_object.c | 1134 ++- osfmk/vm/vm_object.h | 90 +- osfmk/vm/vm_page.h | 142 +- osfmk/vm/vm_pageout.c | 1700 ++-- osfmk/vm/vm_pageout.h | 61 +- osfmk/vm/vm_protos.h | 48 +- osfmk/vm/vm_purgeable_internal.h | 6 - osfmk/vm/vm_resident.c | 635 +- osfmk/vm/vm_shared_region.c | 452 +- osfmk/vm/vm_shared_region.h | 54 +- osfmk/vm/vm_swapfile_pager.c | 3 +- osfmk/vm/vm_user.c | 148 +- osfmk/x86_64/bzero.s | 2 +- osfmk/x86_64/copyio.c | 351 + osfmk/x86_64/cswitch.s | 13 +- osfmk/x86_64/idt64.s | 487 +- osfmk/x86_64/idt_table.h | 93 +- osfmk/x86_64/locore.s | 26 +- osfmk/x86_64/loose_ends.c | 369 +- osfmk/x86_64/machine_routines_asm.s | 41 +- osfmk/x86_64/pal_routines_asm.s | 194 + osfmk/x86_64/pmap.c | 948 +- osfmk/x86_64/pmap_pcid.c | 310 + osfmk/x86_64/start.s | 74 +- pexpert/Makefile | 20 +- pexpert/conf/MASTER | 1 - pexpert/conf/MASTER.i386 | 1 - pexpert/conf/MASTER.ppc | 18 - pexpert/conf/MASTER.x86_64 | 1 - pexpert/conf/Makefile | 19 +- pexpert/conf/Makefile.ppc | 8 - 
pexpert/conf/Makefile.template | 10 +- pexpert/conf/files.ppc | 7 - pexpert/conf/tools/Makefile | 32 - pexpert/conf/tools/doconf/Makefile | 47 - pexpert/conf/tools/doconf/doconf.csh | 321 - pexpert/gen/bootargs.c | 107 +- pexpert/i386/pe_init.c | 4 +- pexpert/i386/pe_kprintf.c | 15 +- pexpert/i386/pe_serial.c | 1 - pexpert/pexpert/Makefile | 11 - pexpert/pexpert/i386/boot.h | 31 +- pexpert/pexpert/i386/efi.h | 24 +- pexpert/pexpert/machine/boot.h | 4 +- pexpert/pexpert/machine/protos.h | 4 +- pexpert/pexpert/pexpert.h | 11 + pexpert/pexpert/ppc/Makefile | 27 - pexpert/pexpert/ppc/boot.h | 92 - pexpert/pexpert/ppc/interrupts.h | 36 - pexpert/pexpert/ppc/powermac.h | 60 - pexpert/pexpert/ppc/protos.h | 160 - pexpert/pexpert/protos.h | 4 - pexpert/ppc/pe_clock_speed.c | 183 - pexpert/ppc/pe_clock_speed_asm.s | 116 - pexpert/ppc/pe_identify_machine.c | 194 - pexpert/ppc/pe_init.c | 269 - pexpert/ppc/pe_kprintf.c | 154 - security/Makefile | 7 +- security/conf/MASTER | 2 +- security/conf/MASTER.i386 | 2 +- security/conf/MASTER.ppc | 31 - security/conf/MASTER.x86_64 | 2 +- security/conf/Makefile | 19 +- security/conf/Makefile.i386 | 11 - security/conf/Makefile.ppc | 18 - security/conf/Makefile.template | 11 +- security/conf/Makefile.x86_64 | 11 - security/conf/files | 1 + security/conf/files.i386 | 1 - security/conf/files.ppc | 1 - security/conf/tools/Makefile | 32 - security/conf/tools/doconf/Makefile | 49 - security/conf/tools/doconf/doconf.csh | 321 - security/conf/tools/newvers/Makefile | 47 - security/mac.h | 18 + security/mac_alloc.h | 1 + security/mac_audit.c | 11 - security/mac_base.c | 147 +- security/mac_framework.h | 27 +- security/mac_internal.h | 38 + security/mac_iokit.c | 27 + security/mac_label.c | 1 + security/mac_net.c | 4 +- security/mac_policy.h | 253 +- security/mac_posix_shm.c | 2 +- security/mac_priv.c | 106 + security/mac_process.c | 48 +- security/mac_stub.c | 20 + security/mac_system.c | 13 + security/mac_vfs.c | 48 +- tools/lockstat/Makefile | 2 +- tools/lockstat/lockstat.c | 6 +- tools/symbolify.py | 82 + tools/tests/MPMMTest/KQMPMMtest.c | 23 + tools/tests/MPMMTest/Makefile | 2 +- tools/tests/affinity/Makefile | 4 +- tools/tests/execperf/Makefile | 79 + tools/tests/execperf/exit-asm.S | 42 + tools/tests/execperf/exit.c | 12 + tools/tests/execperf/printexecinfo.c | 68 + tools/tests/execperf/run.c | 89 + tools/tests/execperf/test.sh | 30 + tools/tests/jitter/Makefile | 16 + tools/tests/jitter/cpu_number.s | 33 + tools/tests/jitter/timer_jitter.c | 480 + tools/tests/kqueue_tests/Makefile | 8 +- ..._readwrite_tests.c => kqueue_file_tests.c} | 380 +- tools/tests/libMicro/AppleReadMe | 107 +- tools/tests/libMicro/Makefile | 31 +- tools/tests/libMicro/Makefile.Darwin | 20 +- tools/tests/libMicro/Makefile.com.Darwin | 0 tools/tests/libMicro/README | 11 + tools/tests/libMicro/apple/Makefile.Darwin | 27 +- .../tests/libMicro/apple/Makefile.benchmarks | 17 +- .../tests/libMicro/apple/Makefile.com.Darwin | 3 + tools/tests/libMicro/apple/getaddrinfo_host.c | 244 + tools/tests/libMicro/apple/getaddrinfo_port.c | 157 + tools/tests/libMicro/apple/getgrent.c | 163 + tools/tests/libMicro/apple/getgrgid.c | 228 + tools/tests/libMicro/apple/getgrnam.c | 231 + tools/tests/libMicro/apple/getpwent.c | 163 + tools/tests/libMicro/apple/getpwnam.c | 262 + tools/tests/libMicro/apple/getpwuid.c | 256 + tools/tests/libMicro/apple/lmbench_bw_mem.c | 29 +- .../libMicro/apple/mbr_check_membership.c | 254 + .../apple/mbr_check_service_membership.c | 281 + .../apple/od_query_create_with_node.c | 381 + 
tools/tests/libMicro/bench.sh | 56 +- tools/tests/libMicro/benchDS.sh | 324 + tools/tests/libMicro/coreos_bench.sh | 837 ++ tools/tests/libMicro/exp.c | 19 + tools/tests/libMicro/libmicro.h | 2 + tools/tests/libMicro/log.c | 19 + tools/tests/libMicro/longjmp.c | 7 +- tools/tests/libMicro/od_account_create.sh | 129 + tools/tests/libMicro/od_account_delete.sh | 98 + tools/tests/libMicro/siglongjmp.c | 4 + tools/tests/superpages/testsp.c | 210 +- .../testkext.xcodeproj/project.pbxproj | 105 + .../tests/testkext/testthreadcall-Info.plist | 47 + tools/tests/testkext/testthreadcall.cpp | 65 + tools/tests/testkext/testthreadcall.h | 18 + tools/tests/testkext/testvmx.cpp | 3 - tools/tests/testkext/testvmx.h | 3 - .../tests/xnu_quick_test/32bit_inode_tests.c | 1 - tools/tests/xnu_quick_test/README | 13 +- .../xnu_quick_test/atomic_fifo_queue_test.c | 33 + tools/tests/xnu_quick_test/commpage_tests.c | 361 + tools/tests/xnu_quick_test/helpers/arch.c | 6 - .../tests/xnu_quick_test/helpers/data_exec.c | 19 +- tools/tests/xnu_quick_test/helpers/launch.c | 37 - tools/tests/xnu_quick_test/kqueue_tests.c | 76 +- tools/tests/xnu_quick_test/machvm_tests.c | 146 +- tools/tests/xnu_quick_test/main.c | 64 +- tools/tests/xnu_quick_test/makefile | 32 +- tools/tests/xnu_quick_test/memory_tests.c | 157 +- tools/tests/xnu_quick_test/misc.c | 11 +- tools/tests/xnu_quick_test/sched_tests.c | 231 + tools/tests/xnu_quick_test/socket_tests.c | 123 +- tools/tests/xnu_quick_test/tests.c | 254 +- tools/tests/xnu_quick_test/tests.h | 18 +- tools/tests/xnu_quick_test/xattr_tests.c | 70 +- tools/tests/zero-to-n/Makefile | 5 + tools/tests/zero-to-n/zero-to-n.c | 579 ++ 1834 files changed, 222690 insertions(+), 195265 deletions(-) create mode 100644 EXTERNAL_HEADERS/Availability.h create mode 100644 EXTERNAL_HEADERS/AvailabilityInternal.h create mode 100644 EXTERNAL_HEADERS/AvailabilityMacros.h delete mode 100644 EXTERNAL_HEADERS/architecture/ppc/Makefile delete mode 100644 EXTERNAL_HEADERS/architecture/ppc/asm_help.h delete mode 100644 EXTERNAL_HEADERS/architecture/ppc/basic_regs.h delete mode 100644 EXTERNAL_HEADERS/architecture/ppc/fp_regs.h delete mode 100644 EXTERNAL_HEADERS/architecture/ppc/macro_help.h delete mode 100644 EXTERNAL_HEADERS/architecture/ppc/pseudo_inst.h delete mode 100644 EXTERNAL_HEADERS/architecture/ppc/reg_help.h delete mode 100644 EXTERNAL_HEADERS/mach-o/arm/reloc.h delete mode 100644 EXTERNAL_HEADERS/mach-o/ppc/reloc.h rename {osfmk/profiling/ppc => SETUP}/Makefile (66%) create mode 100644 SETUP/config/Makefile create mode 100644 SETUP/config/config.h rename bsd/conf/tools/doconf/doconf.csh => SETUP/config/doconf (94%) create mode 100644 SETUP/config/externs.c create mode 100644 SETUP/config/lexer.l create mode 100644 SETUP/config/main.c create mode 100644 SETUP/config/mkglue.c create mode 100644 SETUP/config/mkheaders.c create mode 100644 SETUP/config/mkioconf.c create mode 100644 SETUP/config/mkmakefile.c create mode 100644 SETUP/config/mkswapconf.c create mode 100644 SETUP/config/openp.c create mode 100644 SETUP/config/parser.y create mode 100644 SETUP/config/searchp.c create mode 100644 SETUP/kextsymboltool/Makefile create mode 100644 SETUP/kextsymboltool/kextsymboltool.c rename security/conf/tools/newvers/newvers.csh => SETUP/newvers (100%) mode change 100644 => 100755 delete mode 100755 SETUP/seed_objroot create mode 100644 SETUP/setsegname/Makefile create mode 100644 SETUP/setsegname/setsegname.c delete mode 100644 bsd/conf/MASTER.ppc delete mode 100644 bsd/conf/Makefile.ppc delete mode 100644 
bsd/conf/files.ppc delete mode 100644 bsd/conf/tools/Makefile delete mode 100644 bsd/conf/tools/doconf/Makefile create mode 100644 bsd/crypto/aes/Assert.c mode change 100644 => 100755 bsd/crypto/aes/aes.h create mode 100644 bsd/crypto/aes/i386/AES.s create mode 100644 bsd/crypto/aes/i386/Context.h create mode 100644 bsd/crypto/aes/i386/Data.mk create mode 100644 bsd/crypto/aes/i386/Data.s create mode 100644 bsd/crypto/aes/i386/EncryptDecrypt.s create mode 100644 bsd/crypto/aes/i386/ExpandKeyForDecryption.s create mode 100644 bsd/crypto/aes/i386/ExpandKeyForEncryption.s create mode 100644 bsd/crypto/aes/i386/MakeData.c create mode 100644 bsd/crypto/aes/i386/ReadMe.txt create mode 100644 bsd/crypto/aes/i386/aes_crypt_hw.s create mode 100644 bsd/crypto/aes/i386/aes_key_hw.s delete mode 100644 bsd/crypto/aes/i386/aes_modes.c create mode 100644 bsd/crypto/aes/i386/aes_modes_asm.s create mode 100644 bsd/crypto/aes/i386/aes_modes_hw.s delete mode 100644 bsd/crypto/aes/i386/aes_x86_v2.s delete mode 100644 bsd/crypto/aes/i386/aesopt.h create mode 100644 bsd/crypto/aes/i386/aesxts.c create mode 100644 bsd/crypto/aes/i386/aesxts.h create mode 100644 bsd/crypto/aes/i386/aesxts_asm.s delete mode 100644 bsd/crypto/aes/i386/edefs.h delete mode 100644 bsd/crypto/aes/ppc/Makefile delete mode 100644 bsd/crypto/aes/ppc/aescrypt.c delete mode 100644 bsd/crypto/aes/ppc/aeskey.c delete mode 100644 bsd/crypto/aes/ppc/aesopt.h delete mode 100644 bsd/crypto/aes/ppc/aestab.c delete mode 100644 bsd/crypto/aes/ppc/aestab.h create mode 100644 bsd/crypto/aes/test/ReadMe.txt create mode 100755 bsd/crypto/aes/test/makegenx86.sh create mode 100755 bsd/crypto/aes/test/makeoptx86.sh create mode 100644 bsd/crypto/aes/test/tstaes.c create mode 100644 bsd/crypto/doc/KernelCrypto.plist create mode 100644 bsd/crypto/doc/KernelCrypto.txt create mode 100644 bsd/crypto/sha2/intel/sha256.s create mode 100644 bsd/crypto/sha2/intel/sha256nossse3.s delete mode 100644 bsd/dev/ppc/conf.c delete mode 100644 bsd/dev/ppc/cons.c delete mode 100644 bsd/dev/ppc/dtrace_isa.c delete mode 100644 bsd/dev/ppc/dtrace_subr_ppc.c delete mode 100644 bsd/dev/ppc/fasttrap_isa.c delete mode 100644 bsd/dev/ppc/fbt_ppc.c delete mode 100644 bsd/dev/ppc/ffs.c delete mode 100644 bsd/dev/ppc/ffs.s delete mode 100644 bsd/dev/ppc/kern_machdep.c delete mode 100644 bsd/dev/ppc/km.c delete mode 100644 bsd/dev/ppc/mem.c delete mode 100644 bsd/dev/ppc/munge.s delete mode 100644 bsd/dev/ppc/ppc_init.c delete mode 100644 bsd/dev/ppc/sdt_ppc.c delete mode 100644 bsd/dev/ppc/stubs.c delete mode 100644 bsd/dev/ppc/systemcalls.c delete mode 100644 bsd/dev/ppc/unix_signal.c delete mode 100644 bsd/dev/ppc/xsumas.s create mode 100644 bsd/hfs/hfs_cprotect.c create mode 100644 bsd/hfs/hfs_kdebug.h create mode 100644 bsd/hfs/hfscommon/Misc/HybridAllocator.c create mode 100644 bsd/hfs/hfscommon/headers/HybridAllocator.h create mode 100644 bsd/hfs/hfscommon/headers/RedBlackTree.h create mode 100644 bsd/kern/Makefile create mode 100644 bsd/kern/kern_priv.c create mode 100644 bsd/kern/policy_check.c create mode 100644 bsd/kern/process_policy.c create mode 100644 bsd/kern/trace.codes create mode 100644 bsd/kern/vm_pressure.c rename bsd/{ppc/reg.h => kern/vm_pressure.h} (80%) create mode 100644 bsd/man/man2/getdtablesize.2 create mode 100644 bsd/man/man2/sem_close.2 create mode 100644 bsd/man/man2/sem_open.2 create mode 100644 bsd/man/man2/sem_post.2 create mode 100644 bsd/man/man2/sem_unlink.2 create mode 100644 bsd/man/man2/sem_wait.2 create mode 100644 bsd/man/man2/setregid.2 create 
mode 100644 bsd/man/man2/setreuid.2 create mode 100644 bsd/man/man2/shm_open.2 create mode 100644 bsd/man/man2/shm_unlink.2 create mode 100644 bsd/man/man2/undelete.2 delete mode 100644 bsd/man/man5/fs.5 delete mode 100644 bsd/man/man5/inode.5 delete mode 100644 bsd/miscfs/nullfs/null.h delete mode 100644 bsd/miscfs/nullfs/null_subr.c delete mode 100644 bsd/miscfs/nullfs/null_vfsops.c delete mode 100644 bsd/miscfs/nullfs/null_vnops.c delete mode 100644 bsd/miscfs/union/union_subr.c delete mode 100644 bsd/miscfs/union/union_vfsops.c delete mode 100644 bsd/miscfs/union/union_vnops.c create mode 100644 bsd/net/bridgestp.c create mode 100644 bsd/net/bridgestp.h delete mode 100644 bsd/net/if_atm.h create mode 100644 bsd/net/if_bridge.c create mode 100644 bsd/net/if_bridgevar.h delete mode 100644 bsd/net/if_disc.c delete mode 100644 bsd/net/if_dummy.c delete mode 100644 bsd/net/if_ethersubr.c delete mode 100644 bsd/net/if_fddisubr.c create mode 100644 bsd/net/if_llreach.c create mode 100644 bsd/net/if_llreach.h create mode 100644 bsd/net/netsrc.c rename EXTERNAL_HEADERS/architecture/ppc/cframe.h => bsd/net/netsrc.h (58%) create mode 100644 bsd/net/ntstat.c create mode 100644 bsd/net/ntstat.h delete mode 100644 bsd/net/rtsock_mip.c delete mode 100644 bsd/netinet/if_atm.c delete mode 100644 bsd/netinet/if_atm.h delete mode 100644 bsd/netinet/if_fddi.h create mode 100644 bsd/netinet/in_mcast.c create mode 100644 bsd/netinet/in_pcblist.c create mode 100644 bsd/netinet/in_tclass.c delete mode 100644 bsd/netinet/ip_flow.c delete mode 100644 bsd/netinet/ip_flow.h create mode 100644 bsd/netinet/tcp_cc.h create mode 100644 bsd/netinet/tcp_ledbat.c create mode 100644 bsd/netinet/tcp_newreno.c create mode 100644 bsd/netinet6/in6_mcast.c create mode 100644 bsd/netinet6/ip6_id.c create mode 100644 bsd/netinet6/mld6.h delete mode 100644 bsd/ppc/Makefile delete mode 100644 bsd/ppc/_limits.h delete mode 100644 bsd/ppc/_param.h delete mode 100644 bsd/ppc/_structs.h delete mode 100644 bsd/ppc/_types.h delete mode 100644 bsd/ppc/decodePPC.h delete mode 100644 bsd/ppc/endian.h delete mode 100644 bsd/ppc/exec.h delete mode 100644 bsd/ppc/fasttrap_isa.h delete mode 100644 bsd/ppc/limits.h delete mode 100644 bsd/ppc/param.h delete mode 100644 bsd/ppc/profile.h delete mode 100644 bsd/ppc/reboot.h delete mode 100644 bsd/ppc/setjmp.h delete mode 100644 bsd/ppc/signal.h delete mode 100644 bsd/ppc/types.h delete mode 100644 bsd/ppc/ucontext.h delete mode 100644 bsd/ppc/vmparam.h rename osfmk/ppc/Performance.h => bsd/sys/content_protection.h (71%) rename osfmk/ppc/machine_cpu.h => bsd/sys/fileport.h (68%) rename bsd/{ppc/ptrace.h => sys/imgsrc.h} (74%) create mode 100755 bsd/sys/make_posix_availability.sh create mode 100755 bsd/sys/make_symbol_aliasing.sh rename bsd/{dev/ppc/memmove.c => sys/netboot.h} (72%) create mode 100644 bsd/sys/priv.h create mode 100644 bsd/sys/process_policy.h delete mode 100644 config/BSDKernel.ppc.exports create mode 100644 config/Dummy.exports delete mode 100644 config/IOKit.ppc.exports delete mode 100644 config/Libkern.ppc.exports delete mode 100644 config/MACFramework.ppc.exports delete mode 100644 config/Mach.ppc.exports delete mode 100644 config/Private.ppc.exports delete mode 100644 config/System6.0.ppc.exports delete mode 100644 config/Unsupported.ppc.exports rename osfmk/ppc/cpu_affinity.h => iokit/IOKit/AppleKeyStoreInterface.h (62%) create mode 100644 iokit/IOKit/IOStatistics.h create mode 100644 iokit/IOKit/IOStatisticsPrivate.h delete mode 100644 iokit/IOKit/i386/IOSharedLockImp.h 
delete mode 100644 iokit/IOKit/ppc/IODBDMA.h delete mode 100644 iokit/IOKit/ppc/IOSharedLockImp.h delete mode 100644 iokit/IOKit/ppc/Makefile delete mode 100644 iokit/IOKit/pwr_mgt/IOPMDeprecated.h create mode 100644 iokit/Kernel/IOStatistics.cpp create mode 100644 iokit/Kernel/i386/IOKeyStoreHelper.cpp delete mode 100644 iokit/Kernel/ppc/IOAsmSupport.s delete mode 100644 iokit/Kernel/ppc/IODBDMA.cpp delete mode 100644 iokit/conf/MASTER.ppc delete mode 100644 iokit/conf/Makefile.ppc delete mode 100644 iokit/conf/files.ppc delete mode 100644 iokit/conf/tools/Makefile delete mode 100644 iokit/conf/tools/doconf/Makefile delete mode 100755 iokit/conf/tools/doconf/doconf.csh delete mode 100644 libkern/c++/OSObjectAsm.s mode change 100644 => 100755 libkern/c++/Tests/TestSerialization/test1/test1_main.cpp delete mode 100644 libkern/conf/MASTER.ppc delete mode 100644 libkern/conf/Makefile.ppc delete mode 100644 libkern/conf/files.ppc delete mode 100644 libkern/conf/tools/Makefile delete mode 100644 libkern/conf/tools/doconf/Makefile delete mode 100755 libkern/conf/tools/doconf/doconf.csh create mode 100644 libkern/crypto/intel/sha1edp.h create mode 100644 libkern/crypto/intel/sha1edp.s rename {iokit/Kernel => libkern/kxld}/WKdmCompress.c (97%) rename {iokit/Kernel => libkern/kxld}/WKdmDecompress.c (100%) create mode 100644 libkern/kxld/i386/WKdmCompress.s create mode 100644 libkern/kxld/i386/WKdmDecompress.s create mode 100644 libkern/kxld/kxld_object.c create mode 100644 libkern/kxld/kxld_object.h delete mode 100644 libkern/kxld/kxld_state.c delete mode 100644 libkern/kxld/kxld_state.h create mode 100644 libkern/kxld/tests/kxld_array_test.c rename bsd/ppc/psl.h => libkern/kxld/tests/kxld_test.c (76%) rename osfmk/ppc/machine_rpc.h => libkern/kxld/tests/kxld_test.h (86%) delete mode 100644 libkern/libkern/OSAtomic.h.save rename {iokit/Kernel => libkern/libkern}/WKdm.h (97%) delete mode 100644 libkern/libkern/ppc/Makefile delete mode 100644 libkern/libkern/ppc/OSByteOrder.h create mode 100644 libkern/libkern/tree.h delete mode 100644 libkern/ppc/OSAtomic.s delete mode 100644 libkern/ppc/bcmp.s delete mode 100644 libkern/ppc/memcmp.s delete mode 100644 libkern/ppc/strlen.s delete mode 100644 libkern/zlib/arm/adler32vec.s delete mode 100644 libkern/zlib/arm/inffastS.s create mode 100644 libkern/zlib/intel/adler32vec.s create mode 100644 libkern/zlib/intel/inffastS.s delete mode 100644 libsa/conf/MASTER.ppc delete mode 100644 libsa/conf/Makefile.ppc delete mode 100644 libsa/conf/files.ppc delete mode 100644 libsa/conf/tools/Makefile delete mode 100644 libsa/conf/tools/doconf/Makefile delete mode 100755 libsa/conf/tools/doconf/doconf.csh delete mode 100644 libsyscall/BSDmakefile delete mode 100644 libsyscall/GNUmakefile create mode 100644 libsyscall/Libsyscall.xcconfig create mode 100644 libsyscall/Libsyscall.xcodeproj/project.pbxproj delete mode 100644 libsyscall/Makefile delete mode 100644 libsyscall/Makefile.inc delete mode 100644 libsyscall/Makefile.xbs create mode 100644 libsyscall/Platforms/MacOSX/i386/syscall.map create mode 100644 libsyscall/Platforms/MacOSX/x86_64/syscall.map create mode 100644 libsyscall/Platforms/syscall.map delete mode 100755 libsyscall/create-syscalls.pl rename osfmk/chud/ppc/chud_xnu_glue.h => libsyscall/custom/errno.c (95%) delete mode 100644 libsyscall/include/Makefile.inc delete mode 100644 libsyscall/include/processor_facilities.h delete mode 100644 libsyscall/mach/Makefile.inc rename osfmk/ppc/hardclock_entries.h => libsyscall/mach/abort.h (83%) delete mode 100644 
libsyscall/mach/bootstrap_ports.c delete mode 100644 libsyscall/mach/brk.2 rename iokit/Kernel/ppc/IOSharedLock.s => libsyscall/mach/dylib_link.c (90%) create mode 100644 libsyscall/mach/exc_catcher.h delete mode 100644 libsyscall/mach/headers/Makefile.inc delete mode 100644 libsyscall/mach/i386/Makefile.inc rename libsyscall/mach/{headers => mach}/errorlib.h (94%) rename libsyscall/mach/{headers => mach}/mach.h (100%) rename libsyscall/mach/{headers => mach}/mach_error.h (100%) rename libsyscall/mach/{headers => mach}/mach_init.h (95%) rename libsyscall/mach/{headers => mach}/mach_interface.h (100%) rename libsyscall/mach/{headers => mach}/port_obj.h (100%) rename libsyscall/mach/{headers => mach}/sync.h (100%) rename libsyscall/mach/{headers => mach}/task.h (93%) rename libsyscall/mach/{headers => mach}/thread_act.h (92%) rename libsyscall/mach/{headers => mach}/vm_task.h (100%) delete mode 100644 libsyscall/mach/mach_init_libSystem.c delete mode 100644 libsyscall/mach/mach_init_ports.c rename pexpert/ppc/pe_bootargs.c => libsyscall/mach/mach_legacy.c (80%) create mode 100644 libsyscall/mach/mig_reply_port.c rename osfmk/mach/ppc/rpc.h => libsyscall/mach/mig_reply_port.h (85%) delete mode 100644 libsyscall/mach/ppc/Makefile.inc delete mode 100644 libsyscall/mach/ppc64/Makefile.inc delete mode 100644 libsyscall/mach/sbrk.c delete mode 100644 libsyscall/mach/servers/Makefile.inc create mode 100644 libsyscall/mach/string.c rename osfmk/mach/ppc/thread_state.h => libsyscall/mach/string.h (59%) delete mode 100644 libsyscall/mach/x86_64/Makefile.inc create mode 100644 libsyscall/wrappers/__get_cpu_capabilities.s rename osfmk/x86_64/genassym.c => libsyscall/wrappers/_errno.h (88%) rename osfmk/ppc/testjump.c => libsyscall/wrappers/_libc_funcptr.c (55%) rename bsd/hfs/cprotect.c => libsyscall/wrappers/_libkernel_init.c (67%) rename bsd/ppc/disklabel.h => libsyscall/wrappers/_libkernel_init.h (65%) create mode 100644 libsyscall/wrappers/cancelable/fcntl-base.c rename libsyscall/{mach/x86_64/mach_absolute_time.S => wrappers/cancelable/fcntl-cancel.c} (81%) create mode 100644 libsyscall/wrappers/cancelable/fcntl.c create mode 100644 libsyscall/wrappers/cancelable/select-cancel.c create mode 100644 libsyscall/wrappers/cancelable/select.c create mode 100644 libsyscall/wrappers/cancelable/sigsuspend-cancel.c rename libsyscall/{mach/i386/mach_absolute_time.S => wrappers/cancelable/sigsuspend.c} (81%) rename bsd/dev/ppc/sysctl.c => libsyscall/wrappers/init_cpu_capabilities.c (61%) create mode 100644 libsyscall/wrappers/ioctl.c create mode 100644 libsyscall/wrappers/kill.c create mode 100644 libsyscall/wrappers/legacy/accept.c create mode 100644 libsyscall/wrappers/legacy/bind.c create mode 100644 libsyscall/wrappers/legacy/connect.c create mode 100644 libsyscall/wrappers/legacy/getattrlist.c create mode 100644 libsyscall/wrappers/legacy/getpeername.c create mode 100644 libsyscall/wrappers/legacy/getsockname.c create mode 100644 libsyscall/wrappers/legacy/kill.c create mode 100644 libsyscall/wrappers/legacy/lchown.c rename osfmk/ppc/xpr.h => libsyscall/wrappers/legacy/listen.c (50%) create mode 100644 libsyscall/wrappers/legacy/mprotect.c create mode 100644 libsyscall/wrappers/legacy/msync.c create mode 100644 libsyscall/wrappers/legacy/munmap.c create mode 100644 libsyscall/wrappers/legacy/open.c create mode 100644 libsyscall/wrappers/legacy/recvfrom.c create mode 100644 libsyscall/wrappers/legacy/recvmsg.c create mode 100644 libsyscall/wrappers/legacy/select-pre1050.c create mode 100644 
libsyscall/wrappers/legacy/select.c create mode 100644 libsyscall/wrappers/legacy/sendmsg.c create mode 100644 libsyscall/wrappers/legacy/sendto.c create mode 100644 libsyscall/wrappers/legacy/setattrlist.c create mode 100644 libsyscall/wrappers/legacy/sigsuspend.c create mode 100644 libsyscall/wrappers/legacy/socketpair.c create mode 100644 libsyscall/wrappers/memcpy.c create mode 100644 libsyscall/wrappers/remove-counter.c create mode 100644 libsyscall/wrappers/rename.c create mode 100644 libsyscall/wrappers/rmdir.c create mode 100644 libsyscall/wrappers/select-base.c create mode 100644 libsyscall/wrappers/sigsuspend-base.c create mode 100644 libsyscall/wrappers/unix03/chmod.c create mode 100644 libsyscall/wrappers/unix03/fchmod.c create mode 100644 libsyscall/wrappers/unix03/getrlimit.c create mode 100644 libsyscall/wrappers/unix03/mmap.c create mode 100644 libsyscall/wrappers/unix03/setrlimit.c create mode 100644 libsyscall/wrappers/unlink.c create mode 100755 libsyscall/xcodescripts/compat-symlinks.sh create mode 100755 libsyscall/xcodescripts/compile-syscalls.pl create mode 100755 libsyscall/xcodescripts/create-syscalls.pl create mode 100755 libsyscall/xcodescripts/mach_install_mig.sh delete mode 100644 osfmk/chud/ppc/chud_cpu_asm.h delete mode 100644 osfmk/chud/ppc/chud_cpu_asm.s delete mode 100644 osfmk/chud/ppc/chud_cpu_ppc.c delete mode 100644 osfmk/chud/ppc/chud_osfmk_callback_ppc.c delete mode 100644 osfmk/chud/ppc/chud_spr.h delete mode 100644 osfmk/chud/ppc/chud_thread_ppc.c delete mode 100644 osfmk/chud/ppc/chud_xnu_private.h delete mode 100644 osfmk/conf/MASTER.ppc delete mode 100644 osfmk/conf/Makefile.ppc delete mode 100644 osfmk/conf/files.ppc delete mode 100644 osfmk/conf/tools/Makefile delete mode 100644 osfmk/conf/tools/doconf/Makefile delete mode 100755 osfmk/conf/tools/doconf/doconf.csh delete mode 100644 osfmk/console/ppc/serial_console.c delete mode 100644 osfmk/console/ppc/video_scroll.s create mode 100644 osfmk/i386/bsd_i386_native.c delete mode 100644 osfmk/i386/commpage/atomic.s delete mode 100644 osfmk/i386/commpage/bcopy_scalar.s delete mode 100644 osfmk/i386/commpage/bcopy_sse2.s delete mode 100644 osfmk/i386/commpage/bcopy_sse3x.s delete mode 100644 osfmk/i386/commpage/bcopy_sse3x_64.s delete mode 100644 osfmk/i386/commpage/bcopy_sse42.s delete mode 100644 osfmk/i386/commpage/bcopy_sse42_64.s delete mode 100644 osfmk/i386/commpage/bzero_scalar.s delete mode 100644 osfmk/i386/commpage/bzero_sse2.s delete mode 100644 osfmk/i386/commpage/bzero_sse2_64.s delete mode 100644 osfmk/i386/commpage/bzero_sse42.s delete mode 100644 osfmk/i386/commpage/bzero_sse42_64.s delete mode 100644 osfmk/i386/commpage/cacheflush.s delete mode 100644 osfmk/i386/commpage/commpage_gettimeofday.s delete mode 100644 osfmk/i386/commpage/commpage_mach_absolute_time.s delete mode 100644 osfmk/i386/commpage/commpage_sigs.c delete mode 100644 osfmk/i386/commpage/cpu_number.s delete mode 100644 osfmk/i386/commpage/longcopy_sse3x.s delete mode 100644 osfmk/i386/commpage/longcopy_sse3x_64.s delete mode 100644 osfmk/i386/commpage/memset_pattern_sse2.s delete mode 100644 osfmk/i386/commpage/memset_pattern_sse2_64.s delete mode 100644 osfmk/i386/commpage/spinlocks.s create mode 100644 osfmk/i386/copyio.c delete mode 100644 osfmk/i386/ipl.h create mode 100644 osfmk/i386/lapic_native.c create mode 100644 osfmk/i386/mp_native.c create mode 100644 osfmk/i386/pal_hibernate.h rename osfmk/{ppc/cpu_number.h => i386/pal_lock_asm.h} (82%) create mode 100644 osfmk/i386/pal_native.h create mode 100644 
osfmk/i386/pal_routines.c create mode 100644 osfmk/i386/pal_routines.h create mode 100644 osfmk/i386/pal_routines_asm.s rename osfmk/{ppc/mp.h => i386/pal_rtclock_asm.h} (86%) create mode 100644 osfmk/i386/pcb_native.c create mode 100644 osfmk/i386/pmap_common.c create mode 100644 osfmk/i386/pmap_pcid.h create mode 100644 osfmk/i386/rtclock_asm.h rename osfmk/i386/{rtclock.h => rtclock_asm_native.h} (67%) create mode 100644 osfmk/i386/rtclock_native.c rename osfmk/{ppc/rtclock.h => i386/rtclock_protos.h} (64%) create mode 100644 osfmk/i386/trap_native.c create mode 100644 osfmk/i386/ucode.c create mode 100644 osfmk/i386/ucode.h delete mode 100644 osfmk/kdp/ml/ppc/kdp_asm.s delete mode 100644 osfmk/kdp/ml/ppc/kdp_machdep.c delete mode 100644 osfmk/kdp/ml/ppc/kdp_misc.s delete mode 100644 osfmk/kdp/ml/ppc/kdp_vm.c create mode 100644 osfmk/kern/extmod_statistics.c rename osfmk/{ppc/PPCcalls.c => kern/extmod_statistics.h} (71%) create mode 100644 osfmk/kern/sched_fixedpriority.c create mode 100644 osfmk/kern/sched_grrr.c create mode 100644 osfmk/kern/sched_proto.c delete mode 100644 osfmk/libsa/ppc/types.h create mode 100644 osfmk/mach/branch_predicates.h delete mode 100644 osfmk/mach/i386/_types.h delete mode 100644 osfmk/mach/ppc/Makefile delete mode 100644 osfmk/mach/ppc/_structs.h delete mode 100644 osfmk/mach/ppc/_types.h delete mode 100644 osfmk/mach/ppc/boolean.h delete mode 100644 osfmk/mach/ppc/exception.h delete mode 100644 osfmk/mach/ppc/kern_return.h delete mode 100644 osfmk/mach/ppc/machine_types.defs delete mode 100644 osfmk/mach/ppc/ndr_def.h delete mode 100644 osfmk/mach/ppc/processor_info.h delete mode 100644 osfmk/mach/ppc/sdt_isa.h delete mode 100644 osfmk/mach/ppc/syscall_sw.h delete mode 100644 osfmk/mach/ppc/thread_status.h delete mode 100644 osfmk/mach/ppc/vm_param.h delete mode 100644 osfmk/mach/ppc/vm_types.h rename libsyscall/mach/ppc/mach_absolute_time.s => osfmk/machine/pal_hibernate.h (81%) rename iokit/IOKit/machine/IOSharedLockImp.h => osfmk/machine/pal_routines.h (85%) delete mode 100644 osfmk/ppc/AltiAssist.s delete mode 100644 osfmk/ppc/Diagnostics.c delete mode 100644 osfmk/ppc/Diagnostics.h delete mode 100644 osfmk/ppc/Emulate.s delete mode 100644 osfmk/ppc/Emulate64.s delete mode 100644 osfmk/ppc/Firmware.h delete mode 100644 osfmk/ppc/Firmware.s delete mode 100644 osfmk/ppc/FirmwareC.c delete mode 100644 osfmk/ppc/FirmwareCalls.h delete mode 100644 osfmk/ppc/Makefile delete mode 100644 osfmk/ppc/PPCcalls.h delete mode 100644 osfmk/ppc/Performance.s delete mode 100644 osfmk/ppc/PseudoKernel.c delete mode 100644 osfmk/ppc/PseudoKernel.h delete mode 100644 osfmk/ppc/_setjmp.s delete mode 100644 osfmk/ppc/aligned_data.s delete mode 100644 osfmk/ppc/asm.h delete mode 100644 osfmk/ppc/ast.h delete mode 100644 osfmk/ppc/ast_types.h delete mode 100644 osfmk/ppc/atomic_switch.h delete mode 100644 osfmk/ppc/atomic_switch.s delete mode 100644 osfmk/ppc/bat_init.c delete mode 100644 osfmk/ppc/bcopy.s delete mode 100644 osfmk/ppc/bcopytest.c delete mode 100644 osfmk/ppc/bits.s delete mode 100644 osfmk/ppc/boot.h delete mode 100644 osfmk/ppc/bzero.s delete mode 100644 osfmk/ppc/cache.s delete mode 100644 osfmk/ppc/commpage/atomic.s delete mode 100644 osfmk/ppc/commpage/bcopy_64.s delete mode 100644 osfmk/ppc/commpage/bcopy_970.s delete mode 100644 osfmk/ppc/commpage/bcopy_g3.s delete mode 100644 osfmk/ppc/commpage/bcopy_g4.s delete mode 100644 osfmk/ppc/commpage/bigcopy_970.s delete mode 100644 osfmk/ppc/commpage/bzero_128.s delete mode 100644 
osfmk/ppc/commpage/bzero_32.s delete mode 100644 osfmk/ppc/commpage/cacheflush.s delete mode 100644 osfmk/ppc/commpage/commpage.c delete mode 100644 osfmk/ppc/commpage/commpage.h delete mode 100644 osfmk/ppc/commpage/commpage_asm.s delete mode 100644 osfmk/ppc/commpage/gettimeofday.s delete mode 100644 osfmk/ppc/commpage/mach_absolute_time.s delete mode 100644 osfmk/ppc/commpage/memset_64.s delete mode 100644 osfmk/ppc/commpage/memset_g3.s delete mode 100644 osfmk/ppc/commpage/memset_g4.s delete mode 100644 osfmk/ppc/commpage/memset_g5.s delete mode 100644 osfmk/ppc/commpage/pthread.s delete mode 100644 osfmk/ppc/commpage/spinlocks.s delete mode 100644 osfmk/ppc/conf.c delete mode 100644 osfmk/ppc/console_feed.c delete mode 100644 osfmk/ppc/console_feed_entries.h delete mode 100644 osfmk/ppc/cpu.c delete mode 100644 osfmk/ppc/cpu_capabilities.h delete mode 100644 osfmk/ppc/cpu_data.h delete mode 100644 osfmk/ppc/cpu_internal.h delete mode 100644 osfmk/ppc/cswtch.s delete mode 100644 osfmk/ppc/db_asm.s delete mode 100644 osfmk/ppc/db_disasm.c delete mode 100644 osfmk/ppc/db_interface.c delete mode 100644 osfmk/ppc/db_low_trace.c delete mode 100644 osfmk/ppc/db_low_trace.h delete mode 100644 osfmk/ppc/db_machdep.h delete mode 100644 osfmk/ppc/db_trace.c delete mode 100644 osfmk/ppc/endian.h delete mode 100644 osfmk/ppc/etimer.c delete mode 100644 osfmk/ppc/exception.h delete mode 100644 osfmk/ppc/fpu_protos.h delete mode 100644 osfmk/ppc/genassym.c delete mode 100644 osfmk/ppc/hexfont.h delete mode 100644 osfmk/ppc/hibernate_ppc.c delete mode 100644 osfmk/ppc/hibernate_restore.s delete mode 100644 osfmk/ppc/hw_exception.s delete mode 100644 osfmk/ppc/hw_lock.s delete mode 100644 osfmk/ppc/hw_lock_types.h delete mode 100644 osfmk/ppc/hw_perfmon.c delete mode 100644 osfmk/ppc/hw_perfmon.h delete mode 100644 osfmk/ppc/hw_perfmon_mmcr.h delete mode 100644 osfmk/ppc/hw_vm.s delete mode 100644 osfmk/ppc/instrumentation.h delete mode 100644 osfmk/ppc/interrupt.c delete mode 100644 osfmk/ppc/io_map.c delete mode 100644 osfmk/ppc/io_map_entries.h delete mode 100644 osfmk/ppc/lock.h delete mode 100644 osfmk/ppc/locks.h delete mode 100644 osfmk/ppc/locks_ppc.c delete mode 100644 osfmk/ppc/low_trace.h delete mode 100644 osfmk/ppc/lowglobals.h delete mode 100644 osfmk/ppc/lowmem_vectors.s delete mode 100644 osfmk/ppc/machine_routines.c delete mode 100644 osfmk/ppc/machine_routines.h delete mode 100644 osfmk/ppc/machine_routines_asm.s delete mode 100644 osfmk/ppc/machine_task.c delete mode 100644 osfmk/ppc/machlimits.h delete mode 100644 osfmk/ppc/machparam.h delete mode 100644 osfmk/ppc/mappings.c delete mode 100644 osfmk/ppc/mappings.h delete mode 100644 osfmk/ppc/mcount.s delete mode 100644 osfmk/ppc/mem.h delete mode 100644 osfmk/ppc/misc.c delete mode 100644 osfmk/ppc/misc_asm.s delete mode 100644 osfmk/ppc/misc_protos.h delete mode 100644 osfmk/ppc/model_dep.c delete mode 100644 osfmk/ppc/movc.s delete mode 100644 osfmk/ppc/new_screen.h delete mode 100644 osfmk/ppc/pcb.c delete mode 100644 osfmk/ppc/pmap.c delete mode 100644 osfmk/ppc/pmap.h delete mode 100644 osfmk/ppc/pms.c delete mode 100644 osfmk/ppc/pmsCPU.c delete mode 100644 osfmk/ppc/ppc_disasm.i delete mode 100644 osfmk/ppc/ppc_init.c delete mode 100644 osfmk/ppc/ppc_vm_init.c delete mode 100644 osfmk/ppc/proc_reg.h delete mode 100644 osfmk/ppc/rtclock.c delete mode 100644 osfmk/ppc/savearea.c delete mode 100644 osfmk/ppc/savearea.h delete mode 100644 osfmk/ppc/savearea_asm.s delete mode 100644 osfmk/ppc/scc_8530.h delete mode 100644 
osfmk/ppc/sched_param.h delete mode 100644 osfmk/ppc/screen_switch.h delete mode 100644 osfmk/ppc/serial_defs.h delete mode 100644 osfmk/ppc/serial_io.c delete mode 100644 osfmk/ppc/serial_io.h delete mode 100644 osfmk/ppc/setjmp.h delete mode 100644 osfmk/ppc/simple_lock.h delete mode 100644 osfmk/ppc/skiplists.s delete mode 100644 osfmk/ppc/spec_reg.h delete mode 100644 osfmk/ppc/start.s delete mode 100644 osfmk/ppc/status.c delete mode 100644 osfmk/ppc/task.h delete mode 100644 osfmk/ppc/thread.h delete mode 100644 osfmk/ppc/trap.c delete mode 100644 osfmk/ppc/trap.h delete mode 100644 osfmk/ppc/vm_tuning.h delete mode 100644 osfmk/ppc/vmachmon.c delete mode 100644 osfmk/ppc/vmachmon.h delete mode 100644 osfmk/ppc/vmachmon_asm.s delete mode 100644 osfmk/profiling/ppc/profile-md.h create mode 100644 osfmk/vm/default_freezer.c create mode 100644 osfmk/vm/default_freezer.h create mode 100644 osfmk/vm/vm_map_store.c create mode 100644 osfmk/vm/vm_map_store.h create mode 100644 osfmk/vm/vm_map_store_ll.c rename bsd/dev/ppc/machdep.c => osfmk/vm/vm_map_store_ll.h (61%) create mode 100644 osfmk/vm/vm_map_store_rb.c create mode 100644 osfmk/vm/vm_map_store_rb.h create mode 100644 osfmk/x86_64/copyio.c create mode 100644 osfmk/x86_64/pal_routines_asm.s create mode 100644 osfmk/x86_64/pmap_pcid.c delete mode 100644 pexpert/conf/MASTER.ppc delete mode 100644 pexpert/conf/Makefile.ppc delete mode 100644 pexpert/conf/files.ppc delete mode 100644 pexpert/conf/tools/Makefile delete mode 100644 pexpert/conf/tools/doconf/Makefile delete mode 100755 pexpert/conf/tools/doconf/doconf.csh delete mode 100644 pexpert/pexpert/ppc/Makefile delete mode 100644 pexpert/pexpert/ppc/boot.h delete mode 100644 pexpert/pexpert/ppc/interrupts.h delete mode 100644 pexpert/pexpert/ppc/powermac.h delete mode 100644 pexpert/pexpert/ppc/protos.h delete mode 100644 pexpert/ppc/pe_clock_speed.c delete mode 100644 pexpert/ppc/pe_clock_speed_asm.s delete mode 100644 pexpert/ppc/pe_identify_machine.c delete mode 100644 pexpert/ppc/pe_init.c delete mode 100644 pexpert/ppc/pe_kprintf.c delete mode 100644 security/conf/MASTER.ppc delete mode 100644 security/conf/Makefile.ppc delete mode 100644 security/conf/files.ppc delete mode 100644 security/conf/tools/Makefile delete mode 100644 security/conf/tools/doconf/Makefile delete mode 100644 security/conf/tools/doconf/doconf.csh delete mode 100644 security/conf/tools/newvers/Makefile create mode 100644 security/mac_priv.c create mode 100755 tools/symbolify.py create mode 100644 tools/tests/execperf/Makefile create mode 100644 tools/tests/execperf/exit-asm.S create mode 100644 tools/tests/execperf/exit.c create mode 100644 tools/tests/execperf/printexecinfo.c create mode 100644 tools/tests/execperf/run.c create mode 100755 tools/tests/execperf/test.sh create mode 100644 tools/tests/jitter/Makefile create mode 100644 tools/tests/jitter/cpu_number.s create mode 100644 tools/tests/jitter/timer_jitter.c mode change 100644 => 100755 tools/tests/kqueue_tests/Makefile rename tools/tests/kqueue_tests/{kqueue_readwrite_tests.c => kqueue_file_tests.c} (98%) mode change 100644 => 100755 tools/tests/libMicro/Makefile.com.Darwin create mode 100644 tools/tests/libMicro/apple/getaddrinfo_host.c create mode 100644 tools/tests/libMicro/apple/getaddrinfo_port.c create mode 100644 tools/tests/libMicro/apple/getgrent.c create mode 100644 tools/tests/libMicro/apple/getgrgid.c create mode 100644 tools/tests/libMicro/apple/getgrnam.c create mode 100644 tools/tests/libMicro/apple/getpwent.c create mode 100644 
tools/tests/libMicro/apple/getpwnam.c create mode 100644 tools/tests/libMicro/apple/getpwuid.c create mode 100644 tools/tests/libMicro/apple/mbr_check_membership.c create mode 100644 tools/tests/libMicro/apple/mbr_check_service_membership.c create mode 100644 tools/tests/libMicro/apple/od_query_create_with_node.c create mode 100644 tools/tests/libMicro/benchDS.sh create mode 100644 tools/tests/libMicro/coreos_bench.sh create mode 100644 tools/tests/libMicro/od_account_create.sh create mode 100644 tools/tests/libMicro/od_account_delete.sh create mode 100644 tools/tests/testkext/testthreadcall-Info.plist create mode 100644 tools/tests/testkext/testthreadcall.cpp create mode 100644 tools/tests/testkext/testthreadcall.h create mode 100644 tools/tests/xnu_quick_test/atomic_fifo_queue_test.c create mode 100644 tools/tests/xnu_quick_test/commpage_tests.c create mode 100644 tools/tests/xnu_quick_test/sched_tests.c create mode 100644 tools/tests/zero-to-n/Makefile create mode 100644 tools/tests/zero-to-n/zero-to-n.c diff --git a/EXTERNAL_HEADERS/Availability.h b/EXTERNAL_HEADERS/Availability.h new file mode 100644 index 000000000..e811335c1 --- /dev/null +++ b/EXTERNAL_HEADERS/Availability.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2007-2010 by Apple Inc.. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ + +#ifndef __AVAILABILITY__ +#define __AVAILABILITY__ + /* + These macros are for use in OS header files. They enable function prototypes + and Objective-C methods to be tagged with the OS version in which they + were first available and, if applicable, the OS version in which they + became deprecated. + + The desktop Mac OS X and the iPhone OS each have different version numbers. + The __OSX_AVAILABLE_STARTING() macro allows you to specify both the desktop + and phone OS version numbers. For instance: + __OSX_AVAILABLE_STARTING(__MAC_10_2,__IPHONE_2_0) + means the function/method was first available on Mac OS X 10.2 on the desktop + and first available in iPhone OS 2.0 on the iPhone. + + If a function is available on one platform but not the other, a _NA (not + applicable) parameter is used. For instance: + __OSX_AVAILABLE_STARTING(__MAC_10_3,__IPHONE_NA) + means that the function/method was first available on Mac OS X 10.3, and is + currently not implemented on the iPhone. + + At some point, a function/method may be deprecated. That means Apple + recommends applications stop using the function, either because there is a + better replacement or the functionality is being phased out.
Deprecated + functions/methods can be tagged with a __OSX_AVAILABLE_BUT_DEPRECATED() + macro which specifies the OS version where the function became available + as well as the OS version in which it became deprecated. For instance: + __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_0,__MAC_10_5,__IPHONE_NA,__IPHONE_NA) + means that the function/method was introduced in Mac OS X 10.0, then + became deprecated beginning in Mac OS X 10.5. On the iPhone the function + has never been available. + + For these macros to function properly, a program must specify the OS version range + it is targeting. The min OS version is specified as an option to the compiler: + -mmacosx-version-min=10.x when building for Mac OS X, and -miphoneos-version-min=1.x.x + when building for the iPhone. The upper bound for the OS version is rarely needed, + but it can be set on the command line via: -D__MAC_OS_X_VERSION_MAX_ALLOWED=10xx for + Mac OS X and -D__IPHONE_OS_VERSION_MAX_ALLOWED=1xxx for iPhone. + + Examples: + + A function available in Mac OS X 10.5 and later, but not on the phone: + + extern void mymacfunc() __OSX_AVAILABLE_STARTING(__MAC_10_5,__IPHONE_NA); + + + An Objective-C method in Mac OS X 10.5 and later, but not on the phone: + + @interface MyClass : NSObject + -(void) mymacmethod __OSX_AVAILABLE_STARTING(__MAC_10_5,__IPHONE_NA); + @end + + + An enum available on the phone, but not available on Mac OS X: + + #if __IPHONE_OS_VERSION_MIN_REQUIRED + enum { myEnum = 1 }; + #endif + Note: this works when targeting the Mac OS X platform because + __IPHONE_OS_VERSION_MIN_REQUIRED is undefined, which evaluates to zero. + + + An enum with values added in different iPhoneOS versions: + + enum { + myX = 1, // Usable on iPhoneOS 2.1 and later + myY = 2, // Usable on iPhoneOS 3.0 and later + myZ = 3, // Usable on iPhoneOS 3.0 and later + ... + Note: you do not want to use #if with enumeration values + when a client needs to see all values at compile time + and use runtime logic to only use the viable values. + + + It is also possible to use the *_VERSION_MIN_REQUIRED in source code to make one + source base that can be compiled to target a range of OS versions. It is best + not to use the __MAC_* and __IPHONE_* macros for comparisons, but rather their values. + That is because you might get compiled on an old OS that does not define a later + OS version macro, and in the C preprocessor undefined values evaluate to zero + in expressions, which could cause the #if expression to evaluate in an unexpected + way.
+ + #ifdef __MAC_OS_X_VERSION_MIN_REQUIRED + // code only compiled when targeting Mac OS X and not iPhone + // note use of 1050 instead of __MAC_10_5 + #if __MAC_OS_X_VERSION_MIN_REQUIRED < 1050 + // code in here might run on pre-Leopard OS + #else + // code here can assume Leopard or later + #endif + #endif + + +*/ + +#define __MAC_10_0 1000 +#define __MAC_10_1 1010 +#define __MAC_10_2 1020 +#define __MAC_10_3 1030 +#define __MAC_10_4 1040 +#define __MAC_10_5 1050 +#define __MAC_10_6 1060 +#define __MAC_10_7 1070 +#define __MAC_NA 9999 /* not available */ + +#define __IPHONE_2_0 20000 +#define __IPHONE_2_1 20100 +#define __IPHONE_2_2 20200 +#define __IPHONE_3_0 30000 +#define __IPHONE_3_1 30100 +#define __IPHONE_3_2 30200 +#define __IPHONE_NA 99999 /* not available */ + +#include <AvailabilityInternal.h> + + +#ifdef __IPHONE_OS_VERSION_MIN_REQUIRED + #define __OSX_AVAILABLE_STARTING(_mac, _iphone) __AVAILABILITY_INTERNAL##_iphone + #define __OSX_AVAILABLE_BUT_DEPRECATED(_macIntro, _macDep, _iphoneIntro, _iphoneDep) \ + __AVAILABILITY_INTERNAL##_iphoneIntro##_DEP##_iphoneDep + +#elif defined(__MAC_OS_X_VERSION_MIN_REQUIRED) + #define __OSX_AVAILABLE_STARTING(_mac, _iphone) __AVAILABILITY_INTERNAL##_mac + #define __OSX_AVAILABLE_BUT_DEPRECATED(_macIntro, _macDep, _iphoneIntro, _iphoneDep) \ + __AVAILABILITY_INTERNAL##_macIntro##_DEP##_macDep + +#else + #define __OSX_AVAILABLE_STARTING(_mac, _iphone) + #define __OSX_AVAILABLE_BUT_DEPRECATED(_macIntro, _macDep, _iphoneIntro, _iphoneDep) +#endif + + +#endif /* __AVAILABILITY__ */ diff --git a/EXTERNAL_HEADERS/AvailabilityInternal.h b/EXTERNAL_HEADERS/AvailabilityInternal.h new file mode 100644 index 000000000..a4524708e --- /dev/null +++ b/EXTERNAL_HEADERS/AvailabilityInternal.h @@ -0,0 +1,393 @@ +/* + * Copyright (c) 2007-2010 by Apple Inc.. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +/* + File: AvailabilityInternal.h + + Contains: implementation details of __OSX_AVAILABLE_* macros from <Availability.h> + +*/ +#ifndef __AVAILABILITY_INTERNAL__ +#define __AVAILABILITY_INTERNAL__ + + + +#ifndef __IPHONE_OS_VERSION_MIN_REQUIRED + #ifdef __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ + /* compiler sets __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ when -miphoneos-version-min is used */ + #define __IPHONE_OS_VERSION_MIN_REQUIRED __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ + #endif +#endif + +#ifdef __IPHONE_OS_VERSION_MIN_REQUIRED + /* don't use visibility attribute for iPhoneOS */ + #define __AVAILABILITY_INTERNAL_DEPRECATED __attribute__((deprecated)) + #define __AVAILABILITY_INTERNAL_UNAVAILABLE __attribute__((unavailable)) + #define __AVAILABILITY_INTERNAL_WEAK_IMPORT __attribute__((weak_import)) + #define __AVAILABILITY_INTERNAL_REGULAR +#else + #define __AVAILABILITY_INTERNAL_DEPRECATED __attribute__((deprecated,visibility("default"))) + #define __AVAILABILITY_INTERNAL_UNAVAILABLE __attribute__((unavailable,visibility("default"))) + #define __AVAILABILITY_INTERNAL_WEAK_IMPORT __attribute__((weak_import,visibility("default"))) + #define __AVAILABILITY_INTERNAL_REGULAR __attribute__((visibility("default"))) +#endif + +#ifdef __IPHONE_OS_VERSION_MIN_REQUIRED + /* make sure a default max version is set */ + #ifndef __IPHONE_OS_VERSION_MAX_ALLOWED + #define __IPHONE_OS_VERSION_MAX_ALLOWED __IPHONE_3_2 + #endif + /* make sure a valid min is set */ + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_0 + #undef __IPHONE_OS_VERSION_MIN_REQUIRED + #define __IPHONE_OS_VERSION_MIN_REQUIRED __IPHONE_2_0 + #endif + + /* set up internal macros (up to 2.0) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_2_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_2_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_0 __AVAILABILITY_INTERNAL_DEPRECATED + /* set up internal macros (up to 2.1) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_1 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_1 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_1 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_2_1 + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_2_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_2_1 __AVAILABILITY_INTERNAL_DEPRECATED + #endif + /* set up internal macros (up to 2.2) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2 
__AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_2 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_2_2 + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_2_2 __AVAILABILITY_INTERNAL_DEPRECATED + #endif + /* set up internal macros (up to 3.0) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_3_0 + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_0 __AVAILABILITY_INTERNAL_DEPRECATED + #endif + /* set up internal macros (up to 3.1) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_3_1 + 
#define __AVAILABILITY_INTERNAL__IPHONE_3_1 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_NA __AVAILABILITY_INTERNAL__IPHONE_3_1 + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_1 __AVAILABILITY_INTERNAL_DEPRECATED + #endif + /* set up internal macros (up to 3.2) */ + #if __IPHONE_OS_VERSION_MAX_ALLOWED < __IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_NA 
__AVAILABILITY_INTERNAL__IPHONE_3_2 + #if __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_2_2 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_3_0 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_3_1 + #elif __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_3_2 + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_REGULAR + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL__IPHONE_3_2 + #else + #define __AVAILABILITY_INTERNAL__IPHONE_2_0_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_2_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_0_DEP__IPHONE_3_2 
__AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_1_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__IPHONE_3_2_DEP__IPHONE_3_2 __AVAILABILITY_INTERNAL_DEPRECATED + #endif + /* set up internal macros (n/a) */ + #define __AVAILABILITY_INTERNAL__IPHONE_NA __AVAILABILITY_INTERNAL_UNAVAILABLE + #define __AVAILABILITY_INTERNAL__IPHONE_NA_DEP__IPHONE_NA __AVAILABILITY_INTERNAL_UNAVAILABLE + +#elif defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) + /* compiler for Mac OS X sets __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ */ + #define __MAC_OS_X_VERSION_MIN_REQUIRED __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ + /* make sure a default max version is set */ + #ifndef __MAC_OS_X_VERSION_MAX_ALLOWED + #define __MAC_OS_X_VERSION_MAX_ALLOWED __MAC_10_7 + #endif + /* set up internal macros */ + #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_7 + #define __AVAILABILITY_INTERNAL__MAC_10_7 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_7 + #define __AVAILABILITY_INTERNAL__MAC_10_7 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__MAC_10_7 __AVAILABILITY_INTERNAL_REGULAR + #endif + #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_6 + #define __AVAILABILITY_INTERNAL__MAC_10_6 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_6 + #define __AVAILABILITY_INTERNAL__MAC_10_6 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__MAC_10_6 __AVAILABILITY_INTERNAL_REGULAR + #endif + #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_5 + #define __AVAILABILITY_INTERNAL__MAC_10_5 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_5 + #define __AVAILABILITY_INTERNAL__MAC_10_5 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__MAC_10_5 __AVAILABILITY_INTERNAL_REGULAR + #endif + #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_4 + #define __AVAILABILITY_INTERNAL__MAC_10_4 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_4 + #define __AVAILABILITY_INTERNAL__MAC_10_4 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__MAC_10_4 __AVAILABILITY_INTERNAL_REGULAR + #endif + #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_3 + #define __AVAILABILITY_INTERNAL__MAC_10_3 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_3 + #define __AVAILABILITY_INTERNAL__MAC_10_3 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__MAC_10_3 __AVAILABILITY_INTERNAL_REGULAR + #endif + #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_2 + #define __AVAILABILITY_INTERNAL__MAC_10_2 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_2 + #define __AVAILABILITY_INTERNAL__MAC_10_2 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__MAC_10_2 __AVAILABILITY_INTERNAL_REGULAR + #endif + #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_1 + #define __AVAILABILITY_INTERNAL__MAC_10_1 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_1 + #define __AVAILABILITY_INTERNAL__MAC_10_1 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__MAC_10_1 __AVAILABILITY_INTERNAL_REGULAR + #endif + #if __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_10_0 + #define __AVAILABILITY_INTERNAL__MAC_10_0 __AVAILABILITY_INTERNAL_UNAVAILABLE + #elif __MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_0 + #define 
__AVAILABILITY_INTERNAL__MAC_10_0 __AVAILABILITY_INTERNAL_WEAK_IMPORT + #else + #define __AVAILABILITY_INTERNAL__MAC_10_0 __AVAILABILITY_INTERNAL_REGULAR + #endif + #define __AVAILABILITY_INTERNAL__MAC_NA __AVAILABILITY_INTERNAL_UNAVAILABLE + #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_1 + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_1 __AVAILABILITY_INTERNAL_DEPRECATED + #else + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_1 __AVAILABILITY_INTERNAL__MAC_10_0 + #endif + #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_2 + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_2 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_2 __AVAILABILITY_INTERNAL_DEPRECATED + #else + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_2 __AVAILABILITY_INTERNAL__MAC_10_0 + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_2 __AVAILABILITY_INTERNAL__MAC_10_1 + #endif + #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_3 + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_3 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_3 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_3 __AVAILABILITY_INTERNAL_DEPRECATED + #else + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_3 __AVAILABILITY_INTERNAL__MAC_10_0 + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_3 __AVAILABILITY_INTERNAL__MAC_10_1 + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_3 __AVAILABILITY_INTERNAL__MAC_10_2 + #endif + #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_4 + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_4 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_4 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_4 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_4 __AVAILABILITY_INTERNAL_DEPRECATED + #else + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_4 __AVAILABILITY_INTERNAL__MAC_10_0 + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_4 __AVAILABILITY_INTERNAL__MAC_10_1 + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_4 __AVAILABILITY_INTERNAL__MAC_10_2 + #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_4 __AVAILABILITY_INTERNAL__MAC_10_3 + #endif + #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_5 + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_5 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_5 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_5 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_5 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_5 __AVAILABILITY_INTERNAL_DEPRECATED + #else + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_5 __AVAILABILITY_INTERNAL__MAC_10_0 + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_5 __AVAILABILITY_INTERNAL__MAC_10_1 + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_5 __AVAILABILITY_INTERNAL__MAC_10_2 + #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_5 __AVAILABILITY_INTERNAL__MAC_10_3 + #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_5 __AVAILABILITY_INTERNAL__MAC_10_4 + #endif + #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_6 + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED + #define 
__AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_6 __AVAILABILITY_INTERNAL_DEPRECATED + #else + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_6 __AVAILABILITY_INTERNAL__MAC_10_0 + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_6 __AVAILABILITY_INTERNAL__MAC_10_1 + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_6 __AVAILABILITY_INTERNAL__MAC_10_2 + #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_6 __AVAILABILITY_INTERNAL__MAC_10_3 + #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_6 __AVAILABILITY_INTERNAL__MAC_10_4 + #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_6 __AVAILABILITY_INTERNAL__MAC_10_5 + #endif + #if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_7 + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED + #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_10_7 __AVAILABILITY_INTERNAL_DEPRECATED + #else + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_0 + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_1 + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_2 + #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_3 + #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_4 + #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_5 + #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_10_7 __AVAILABILITY_INTERNAL__MAC_10_6 + #endif + #define __AVAILABILITY_INTERNAL__MAC_10_0_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_0 + #define __AVAILABILITY_INTERNAL__MAC_10_1_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_1 + #define __AVAILABILITY_INTERNAL__MAC_10_2_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_2 + #define __AVAILABILITY_INTERNAL__MAC_10_3_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_3 + #define __AVAILABILITY_INTERNAL__MAC_10_4_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_4 + #define __AVAILABILITY_INTERNAL__MAC_10_5_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_5 + #define __AVAILABILITY_INTERNAL__MAC_10_6_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_6 + #define __AVAILABILITY_INTERNAL__MAC_10_7_DEP__MAC_NA __AVAILABILITY_INTERNAL__MAC_10_7 + #define __AVAILABILITY_INTERNAL__MAC_NA_DEP__MAC_NA __AVAILABILITY_INTERNAL_UNAVAILABLE +#endif + +#endif /* __AVAILABILITY_INTERNAL__ */ diff --git a/EXTERNAL_HEADERS/AvailabilityMacros.h b/EXTERNAL_HEADERS/AvailabilityMacros.h new file mode 100644 index 000000000..02981bd13 --- /dev/null +++ b/EXTERNAL_HEADERS/AvailabilityMacros.h @@ -0,0 +1,820 @@ +/* + * Copyright (c) 2001-2010 by Apple Inc.. All rights reserved. 
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+/*
+    File:       AvailabilityMacros.h
+
+    More Info:  See TechNote 2064
+
+    Contains:   Autoconfiguration of AVAILABLE_ macros for Mac OS X
+
+                This header enables a developer to specify build-time
+                constraints on which Mac OS X versions the resulting
+                application will run.  There are two bounds a developer
+                can specify:
+
+                    MAC_OS_X_VERSION_MIN_REQUIRED
+                    MAC_OS_X_VERSION_MAX_ALLOWED
+
+                The lower bound controls which calls to OS functions will
+                be weak-imported (allowed to be unresolved at launch time).
+                The upper bound controls which OS functionality, if used,
+                will result in a compiler error because that functionality is
+                not available on any OS in the specified range.
+
+                For example, suppose an application is compiled with:
+
+                    MAC_OS_X_VERSION_MIN_REQUIRED = MAC_OS_X_VERSION_10_2
+                    MAC_OS_X_VERSION_MAX_ALLOWED  = MAC_OS_X_VERSION_10_3
+
+                and an OS header contains:
+
+                    extern void funcA(void) AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER;
+                    extern void funcB(void) AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_2;
+                    extern void funcC(void) AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3;
+                    extern void funcD(void) AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER;
+                    extern void funcE(void) AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER;
+                    extern void funcF(void) AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER;
+                    extern void funcG(void) AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER;
+
+                    typedef long TypeA DEPRECATED_IN_MAC_OS_X_VERSION_10_0_AND_LATER;
+                    typedef long TypeB DEPRECATED_IN_MAC_OS_X_VERSION_10_1_AND_LATER;
+                    typedef long TypeC DEPRECATED_IN_MAC_OS_X_VERSION_10_2_AND_LATER;
+                    typedef long TypeD DEPRECATED_IN_MAC_OS_X_VERSION_10_3_AND_LATER;
+                    typedef long TypeE DEPRECATED_IN_MAC_OS_X_VERSION_10_4_AND_LATER;
+
+                Any application code which uses these declarations will get the following:
+
+                            compile     link        run
+                            -------     ------      -------
+                    funcA:  normal      normal      normal
+                    funcB:  warning     normal      normal
+                    funcC:  normal      normal      normal
+                    funcD:  normal      normal      normal
+                    funcE:  normal      normal      normal
+                    funcF:  normal      weak        on 10.3 normal, on 10.2 (&funcF == NULL)
+                    funcG:  error       error       n/a
+                    TypeA:  warning
+                    TypeB:  warning
+                    TypeC:  warning
+                    TypeD:  normal
+                    TypeE:  normal
+
+*/
+#ifndef __AVAILABILITYMACROS__
+#define __AVAILABILITYMACROS__
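As a concrete illustration of the two bounds, a minimal sketch of a hypothetical client translation unit (not from this patch; the numeric values correspond to the version constants defined just below, and in practice the lower bound more commonly comes from the MACOSX_DEPLOYMENT_TARGET environment variable or -mmacosx-version-min):

    /* hypothetical client source file */
    #define MAC_OS_X_VERSION_MIN_REQUIRED 1020   /* 10.2 */
    #define MAC_OS_X_VERSION_MAX_ALLOWED  1030   /* 10.3 */
    #include <AvailabilityMacros.h>

With these bounds, funcF from the table above compiles normally but links weak, while funcG is rejected at compile time.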
+
+
+/*
+ * Set up standard Mac OS X versions
+ */
+#define MAC_OS_X_VERSION_10_0 1000
+#define MAC_OS_X_VERSION_10_1 1010
+#define MAC_OS_X_VERSION_10_2 1020
+#define MAC_OS_X_VERSION_10_3 1030
+#define MAC_OS_X_VERSION_10_4 1040
+#define MAC_OS_X_VERSION_10_5 1050
+#define MAC_OS_X_VERSION_10_6 1060
+#define MAC_OS_X_VERSION_10_7 1070
+
+
+/*
+ * If min OS not specified, assume 10.1 for ppc and 10.4 for all others
+ * Note: the gcc driver may set __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ based on the MACOSX_DEPLOYMENT_TARGET environment variable
+ */
+#ifndef MAC_OS_X_VERSION_MIN_REQUIRED
+  #ifdef __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__
+    #if (__i386__ || __x86_64__) && (__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < MAC_OS_X_VERSION_10_4)
+      #warning Building for Intel with Mac OS X Deployment Target < 10.4 is invalid.
+    #elif __ppc64__ && (__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < MAC_OS_X_VERSION_10_4)
+      #warning Building for ppc64 with Mac OS X Deployment Target < 10.4 is invalid.
+    #endif
+    #define MAC_OS_X_VERSION_MIN_REQUIRED __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__
+  #else
+    #if __ppc64__ || __i386__ || __x86_64__
+      #define MAC_OS_X_VERSION_MIN_REQUIRED MAC_OS_X_VERSION_10_4
+    #else
+      #define MAC_OS_X_VERSION_MIN_REQUIRED MAC_OS_X_VERSION_10_1
+    #endif
+  #endif
+#endif
+
+/*
+ * If max OS not specified, assume larger of (10.7, min)
+ */
+#ifndef MAC_OS_X_VERSION_MAX_ALLOWED
+  #if MAC_OS_X_VERSION_MIN_REQUIRED > MAC_OS_X_VERSION_10_7
+    #define MAC_OS_X_VERSION_MAX_ALLOWED MAC_OS_X_VERSION_MIN_REQUIRED
+  #else
+    #define MAC_OS_X_VERSION_MAX_ALLOWED MAC_OS_X_VERSION_10_7
+  #endif
+#endif
+
+/*
+ * Error on bad values
+ */
+#if MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_MIN_REQUIRED
+  #error MAC_OS_X_VERSION_MAX_ALLOWED must be >= MAC_OS_X_VERSION_MIN_REQUIRED
+#endif
+#if MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_0
+  #error MAC_OS_X_VERSION_MIN_REQUIRED must be >= MAC_OS_X_VERSION_10_0
+#endif
+
+/*
+ * only certain compilers support __attribute__((weak_import))
+ */
+#if defined(__GNUC__) && ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))) && (MAC_OS_X_VERSION_MIN_REQUIRED >= 1020)
+  #define WEAK_IMPORT_ATTRIBUTE __attribute__((weak_import))
+#elif defined(__MWERKS__) && (__MWERKS__ >= 0x3205) && (MAC_OS_X_VERSION_MIN_REQUIRED >= 1020) && !defined(__INTEL__)
+  #define WEAK_IMPORT_ATTRIBUTE __attribute__((weak_import))
+#else
+  #define WEAK_IMPORT_ATTRIBUTE
+#endif
+
+/*
+ * only certain compilers support __attribute__((deprecated))
+ */
+#if defined(__GNUC__) && ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)))
+  #define DEPRECATED_ATTRIBUTE __attribute__((deprecated))
+#else
+  #define DEPRECATED_ATTRIBUTE
+#endif
+
+/*
+ * only certain compilers support __attribute__((unavailable))
+ */
+#if defined(__GNUC__) && ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)))
+  #define UNAVAILABLE_ATTRIBUTE __attribute__((unavailable))
+#else
+  #define UNAVAILABLE_ATTRIBUTE
+#endif
+
+
+
+/*
+ * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER
+ *
+ * Used on functions introduced in Mac OS X 10.0
+ */
+#define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER
+
+/*
+ * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED
+ *
+ * Used on functions introduced in Mac OS X 10.0,
+ * and deprecated in Mac OS X 10.0
+ */
+#define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE
+
+/*
+ * DEPRECATED_IN_MAC_OS_X_VERSION_10_0_AND_LATER
+ *
+ * Used on types deprecated in Mac OS X 10.0
+ */
+#define DEPRECATED_IN_MAC_OS_X_VERSION_10_0_AND_LATER DEPRECATED_ATTRIBUTE
+
+
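Since WEAK_IMPORT_ATTRIBUTE allows a symbol to remain unresolved at launch, code built with a deployment target older than the symbol's introduction is expected to test the address before calling through it. A short sketch, reusing the hypothetical funcF and the 10.2/10.3 bounds from the header comment above:

    extern void funcF(void) AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER;

    static void call_funcF_if_present(void)
    {
        if (funcF != NULL)   /* NULL when running on 10.2, resolved on 10.3 and later */
            funcF();
    }

+/*
+ * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER
+ *
+ * Used on declarations introduced in Mac OS X 10.1
+ */
+#if MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_1
+  #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER UNAVAILABLE_ATTRIBUTE
+#elif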
MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_1 + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER WEAK_IMPORT_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED + * + * Used on declarations introduced in Mac OS X 10.1, + * and deprecated in Mac OS X 10.1 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_1 + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_1 + * + * Used on declarations introduced in Mac OS X 10.0, + * but later deprecated in Mac OS X 10.1 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_1 + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_1 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_1 AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER +#endif + +/* + * DEPRECATED_IN_MAC_OS_X_VERSION_10_1_AND_LATER + * + * Used on types deprecated in Mac OS X 10.1 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_1 + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_1_AND_LATER DEPRECATED_ATTRIBUTE +#else + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_1_AND_LATER +#endif + + + + + + + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER + * + * Used on declarations introduced in Mac OS X 10.2 + */ +#if MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_2 + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER UNAVAILABLE_ATTRIBUTE +#elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_2 + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER WEAK_IMPORT_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED + * + * Used on declarations introduced in Mac OS X 10.2, + * and deprecated in Mac OS X 10.2 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_2 + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_2 + * + * Used on declarations introduced in Mac OS X 10.0, + * but later deprecated in Mac OS X 10.2 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_2 + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_2 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_2 AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_2 + * + * Used on declarations introduced in Mac OS X 10.1, + * but later deprecated in Mac OS X 10.2 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_2 + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_2 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_2 AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER +#endif + +/* + * DEPRECATED_IN_MAC_OS_X_VERSION_10_2_AND_LATER + * + * Used on types deprecated in Mac OS X 10.2 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED 
>= MAC_OS_X_VERSION_10_2 + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_2_AND_LATER DEPRECATED_ATTRIBUTE +#else + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_2_AND_LATER +#endif + + + + + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER + * + * Used on declarations introduced in Mac OS X 10.3 + */ +#if MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_3 + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER UNAVAILABLE_ATTRIBUTE +#elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_3 + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER WEAK_IMPORT_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED + * + * Used on declarations introduced in Mac OS X 10.3, + * and deprecated in Mac OS X 10.3 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_3 + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 + * + * Used on declarations introduced in Mac OS X 10.0, + * but later deprecated in Mac OS X 10.3 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_3 + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 + * + * Used on declarations introduced in Mac OS X 10.1, + * but later deprecated in Mac OS X 10.3 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_3 + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 + * + * Used on declarations introduced in Mac OS X 10.2, + * but later deprecated in Mac OS X 10.3 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_3 + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_3 AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER +#endif + +/* + * DEPRECATED_IN_MAC_OS_X_VERSION_10_3_AND_LATER + * + * Used on types deprecated in Mac OS X 10.3 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_3 + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_3_AND_LATER DEPRECATED_ATTRIBUTE +#else + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_3_AND_LATER +#endif + + + + + + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER + * + * Used on declarations introduced in Mac OS X 10.4 + */ +#if MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_4 + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER UNAVAILABLE_ATTRIBUTE +#elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_4 + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER WEAK_IMPORT_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED + * + * Used on declarations introduced in Mac OS X 10.4, + * and deprecated in Mac OS 
X 10.4 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_4 + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 + * + * Used on declarations introduced in Mac OS X 10.0, + * but later deprecated in Mac OS X 10.4 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_4 + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 + * + * Used on declarations introduced in Mac OS X 10.1, + * but later deprecated in Mac OS X 10.4 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_4 + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 + * + * Used on declarations introduced in Mac OS X 10.2, + * but later deprecated in Mac OS X 10.4 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_4 + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 + * + * Used on declarations introduced in Mac OS X 10.3, + * but later deprecated in Mac OS X 10.4 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_4 + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_4 AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER +#endif + +/* + * DEPRECATED_IN_MAC_OS_X_VERSION_10_4_AND_LATER + * + * Used on types deprecated in Mac OS X 10.4 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_4 + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_4_AND_LATER DEPRECATED_ATTRIBUTE +#else + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_4_AND_LATER +#endif + + + + + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER + * + * Used on declarations introduced in Mac OS X 10.5 + */ +#if MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_5 + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER UNAVAILABLE_ATTRIBUTE +#elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_5 + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER WEAK_IMPORT_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED + * + * Used on declarations introduced in Mac OS X 10.5, + * and deprecated in Mac OS X 10.5 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED 
AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 + * + * Used on declarations introduced in Mac OS X 10.0, + * but later deprecated in Mac OS X 10.5 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 + * + * Used on declarations introduced in Mac OS X 10.1, + * but later deprecated in Mac OS X 10.5 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 + * + * Used on declarations introduced in Mac OS X 10.2, + * but later deprecated in Mac OS X 10.5 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 + * + * Used on declarations introduced in Mac OS X 10.3, + * but later deprecated in Mac OS X 10.5 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 + * + * Used on declarations introduced in Mac OS X 10.4, + * but later deprecated in Mac OS X 10.5 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_5 AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER +#endif + +/* + * DEPRECATED_IN_MAC_OS_X_VERSION_10_5_AND_LATER + * + * Used on types deprecated in Mac OS X 10.5 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_5 + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_5_AND_LATER DEPRECATED_ATTRIBUTE +#else + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_5_AND_LATER +#endif + + + + + + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + * + * Used on declarations introduced in Mac OS X 10.6 + */ +#if MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_6 + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER UNAVAILABLE_ATTRIBUTE +#elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_6 + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER WEAK_IMPORT_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED + * + * Used on declarations introduced in Mac OS X 10.6, + * and 
deprecated in Mac OS X 10.6 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 + * + * Used on declarations introduced in Mac OS X 10.0, + * but later deprecated in Mac OS X 10.6 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 + * + * Used on declarations introduced in Mac OS X 10.1, + * but later deprecated in Mac OS X 10.6 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 + * + * Used on declarations introduced in Mac OS X 10.2, + * but later deprecated in Mac OS X 10.6 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 + * + * Used on declarations introduced in Mac OS X 10.3, + * but later deprecated in Mac OS X 10.6 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 + * + * Used on declarations introduced in Mac OS X 10.4, + * but later deprecated in Mac OS X 10.6 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 + * + * Used on declarations introduced in Mac OS X 10.5, + * but later deprecated in Mac OS X 10.6 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_6 AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER +#endif + +/* + * DEPRECATED_IN_MAC_OS_X_VERSION_10_6_AND_LATER + * + * Used on types deprecated in Mac OS X 10.6 + */ +#if 
MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_6 + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_6_AND_LATER DEPRECATED_ATTRIBUTE +#else + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_6_AND_LATER +#endif + + + + + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + * + * Used on declarations introduced in Mac OS X 10.7 + */ +#if MAC_OS_X_VERSION_MAX_ALLOWED < MAC_OS_X_VERSION_10_7 + #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER UNAVAILABLE_ATTRIBUTE +#elif MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_10_7 + #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER WEAK_IMPORT_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED + * + * Used on declarations introduced in Mac OS X 10.7, + * and deprecated in Mac OS X 10.7 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 + #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + * + * Used on declarations introduced in Mac OS X 10.0, + * but later deprecated in Mac OS X 10.7 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 AVAILABLE_MAC_OS_X_VERSION_10_0_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + * + * Used on declarations introduced in Mac OS X 10.1, + * but later deprecated in Mac OS X 10.7 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 AVAILABLE_MAC_OS_X_VERSION_10_1_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + * + * Used on declarations introduced in Mac OS X 10.2, + * but later deprecated in Mac OS X 10.7 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 AVAILABLE_MAC_OS_X_VERSION_10_2_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + * + * Used on declarations introduced in Mac OS X 10.3, + * but later deprecated in Mac OS X 10.7 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 AVAILABLE_MAC_OS_X_VERSION_10_3_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + * + * Used on declarations introduced in Mac OS X 10.4, + * but later deprecated in Mac OS X 10.7 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 + #define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 DEPRECATED_ATTRIBUTE +#else + 
#define AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 AVAILABLE_MAC_OS_X_VERSION_10_4_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + * + * Used on declarations introduced in Mac OS X 10.5, + * but later deprecated in Mac OS X 10.7 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 AVAILABLE_MAC_OS_X_VERSION_10_5_AND_LATER +#endif + +/* + * AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + * + * Used on declarations introduced in Mac OS X 10.6, + * but later deprecated in Mac OS X 10.7 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 DEPRECATED_ATTRIBUTE +#else + #define AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER +#endif + +/* + * DEPRECATED_IN_MAC_OS_X_VERSION_10_7_AND_LATER + * + * Used on types deprecated in Mac OS X 10.7 + */ +#if MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7 + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_7_AND_LATER DEPRECATED_ATTRIBUTE +#else + #define DEPRECATED_IN_MAC_OS_X_VERSION_10_7_AND_LATER +#endif + +#endif /* __AVAILABILITYMACROS__ */ + + diff --git a/EXTERNAL_HEADERS/Makefile b/EXTERNAL_HEADERS/Makefile index 9f8e3535b..46ee40f90 100644 --- a/EXTERNAL_HEADERS/Makefile +++ b/EXTERNAL_HEADERS/Makefile @@ -11,9 +11,6 @@ INSTINC_SUBDIRS = \ architecture \ mach-o -INSTINC_SUBDIRS_PPC = \ - architecture - INSTINC_SUBDIRS_I386 = \ architecture @@ -23,8 +20,12 @@ INSTINC_SUBDIRS_X86_64 = \ INSTINC_SUBDIRS_ARM = \ architecture + EXPORT_FILES = \ AppleSecureBootEpoch.h \ + Availability.h \ + AvailabilityInternal.h \ + AvailabilityMacros.h \ ar.h \ stdarg.h \ stdbool.h \ diff --git a/EXTERNAL_HEADERS/architecture/Makefile b/EXTERNAL_HEADERS/architecture/Makefile index 8c929ba14..a322a080f 100644 --- a/EXTERNAL_HEADERS/architecture/Makefile +++ b/EXTERNAL_HEADERS/architecture/Makefile @@ -9,9 +9,6 @@ include $(MakeInc_def) INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = \ - ppc - INSTINC_SUBDIRS_I386 = \ i386 @@ -21,6 +18,7 @@ INSTINC_SUBDIRS_X86_64 = \ INSTINC_SUBDIRS_ARM = \ arm + EXPORT_FILES = INSTALL_MI_LIST = diff --git a/EXTERNAL_HEADERS/architecture/ppc/Makefile b/EXTERNAL_HEADERS/architecture/ppc/Makefile deleted file mode 100644 index 374f3bd9a..000000000 --- a/EXTERNAL_HEADERS/architecture/ppc/Makefile +++ /dev/null @@ -1,33 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -INSTINC_SUBDIRS_PPC = - -EXPORT_FILES = \ - asm_help.h \ - basic_regs.h \ - cframe.h \ - fp_regs.h \ - macro_help.h \ - pseudo_inst.h \ - reg_help.h - - -INSTALL_MD_LIST = - -INSTALL_MD_DIR = - -EXPORT_MD_LIST = ${EXPORT_FILES} - -EXPORT_MD_DIR = architecture/ppc - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/EXTERNAL_HEADERS/architecture/ppc/asm_help.h b/EXTERNAL_HEADERS/architecture/ppc/asm_help.h deleted file mode 100644 index 0ff2171c4..000000000 --- a/EXTERNAL_HEADERS/architecture/ppc/asm_help.h +++ 
/dev/null @@ -1,456 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1996 NeXT Software, Inc. All rights reserved. - * - * File: architecture/ppc/asm_help.h - * Author: Mike DeMoney, NeXT Software, Inc. - * - * This header file defines macros useful when writing assembly code - * for the PowerPC processors. - * r12 is used as the tmp register / PICIFY base. - * - * HISTORY - * 20-May-97 Umesh Vaishampayan (umeshv@apple.com) - * Implemented Dynamic / PIC macros. - * - * 28-Dec-96 Umesh Vaishampayan (umeshv@NeXT.com) - * added ".align" directive to various macros to avoid alignment - * faults. Moved Register Usage #defines to reg_help.h as that's - * where they should have been in the first place. - * Added Dynamic / PIC macroes for routines which refernce external - * symbols. Not implemented fully as yet. - * - * 05-Nov-92 Mike DeMoney (mike@next.com) - * Created. - */ - -#ifndef _ARCH_PPC_ASM_HELP_H_ -#define _ARCH_PPC_ASM_HELP_H_ - -#include <architecture/ppc/reg_help.h> - -#ifdef __ASSEMBLER__ -/* - * ppc stack frames look like this after procedure prolog has - * been executed: - * - * Higher address: - * ......... - * +-------------------------------+ - * | caller's LR | - * +-------------------------------+ - * | caller's CR | - * +-------------------------------+ - * Caller's SP->| caller's caller's sp | ^^ Caller's Frame ^^ - * +===============================+ vv Called Rtn Frame vv - * | Save Area for | FPF 31 - * .......... - * | Caller's FPF's | FPF n - * +-------------------------------+ - * | Save Area for | GRF 31 - * .......... - * | Caller's GRF's | GRF n - * +-------------------------------+ - * | alignment pad | - * ............ - * | (if necessary) | - * +-------------------------------+ - * | Local | - * ........ - * | Variables | - * +-------------------------------+ - * SP + X -> | aN for FUTURE call | - * +-------------------------------+ - * .......... 
- * +-------------------------------+ - * SP + 28 -> | a1 for FUTURE call | - * +-------------------------------+ - * SP + 24 -> | a0 for FUTURE call | - * +-------------------------------+ - * SP + 20 -> | caller's TOC | - * +-------------------------------+ - * SP + 16 -> | reserved | - * +-------------------------------+ - * SP + 12 -> | reserved | - * +-------------------------------+ - * SP + 8 -> | LR callee-save for FUTURE call| - * +-------------------------------+ - * SP + 4 -> | CR callee-save for FUTURE call| - * +-------------------------------+ - * SP -> | caller's sp | - * +===============================+ - * Lower address: - * - * NOTE: All state with the exception of LR and CR are saved in the - * called routines frame. LR and CR are saved in the CALLER'S FRAME. - * - * ALSO NOTE: Args to the called routine are found in the caller's frame. - */ - -/* - * ARG(n) -- stack offset to n'th argument - * - * NOTE CAREFULLY! These macros start numbering arguments at 1 (NOT 0) - * The first argument is ARG(1). - * - * ALSO NOTE: This stack offset is only valid if using routine - * DOES NOT alter SP. - * - */ -#define ARG(n) ((((n) - 1) * 4) + 24) - -/* - * Macros for building stack frame according to C calling conventions. - * lr, cr, and sp are saved. - * - * NOTE WELL: localvarsize is in bytes, maxargsout is a count of words, - * grfsaved and fpfsaved is a count of registers. BE SURE TO COUNT - * BOTH FP (r31) AND sN REGISTERS IN THE COUNT OF GRF REGISTERS SAVED! - * This will be TWO more than the N of the highest sN register you - * save: s2 implies you are saving s2, s1, s0, and fp => grfsaved - * should be 4! - * - * FURTHER NOTE: These macros do NOT SAVE GRF or FPF registers. User - * must do that. GRF sN regs should be saved via - * stmw sN,SAVED_GRF_S(N)(sp) - * where N is the highest numbered s* register to be saved. E.g. if - * s0, s1, and s2 are to be saved use: - * stmw s2,SAVED_GRF_S(2)(sp) - * Note that this also saves fp. - * An individual saved grf can be loaded via: - * lwz s2,SAVED_GRF_S(2)(sp) - * Analogous stuff works for fpf's. - * - * NOTE: these simple routines will be replaced with more complicated - * ones once we know what the linker and gdb will require as for as - * register use masks and frame declarations. - * - * Warning: ROUND_TO_STACK is only to be used in assembly language; - * for C usage, use ROUND_FRAME() in reg_help.h. - */ -#define ROUND_TO_STACK(len) \ - (((len) + STACK_INCR - 1) / STACK_INCR * STACK_INCR) - -#define BUILD_FRAME(localvarsize, maxargsout, grfsaved, fpfsaved) \ - .set __argoutsize, ROUND_TO_STACK((maxargsout) * 4) @\ - .if __argoutsize < 32 @\ - .set __argoutsize,32 @\ - .endif @\ - .set __framesize, ROUND_TO_STACK( \ - 24 + __argoutsize + (localvarsize) \ - + 4*(grfsaved) + 8*(fpfsaved)) @\ - .set __grfbase,(__framesize - 4*(grfsaved) - 8*(fpfsaved)) @\ - .set __fpfbase,(__framesize - 8*(fpfsaved)) @\ - mflr r0 @\ - mfcr r12 @\ - stw r0,8(sp) @\ - stw r12,4(sp) @\ - stwu r1,-__framesize(r1) - -/* - * Macros for referencing data in stack frame. - * - * NOTE WELL: ARG's and VAR's start at 1, NOT 0. Why ??? 
(FIXME) - */ -#define LOCAL_VAR(n) (((n)-1)*4 + __argoutsize + 24) -#define SAVED_GRF_S(n) (__grfbase + ((grfsaved) - (n) - 2) * 4) -#define SAVED_FRF_FS(n) (__fpfbase + ((fpfsaved) - (n) - 1) * 4) -#define ARG_IN(n) (ARG(n) + __framesize) -#define ARG_OUT(n) (ARG(n) + 0) -#define SAVED_FP (__grfbase + ((grfsaved) - 1) * 4) -#define SAVED_LR (__framesize + 8) -#define SAVED_CR (__framesize + 4) - -/* - * Macros for unwinding stack frame. - * NOTE: GRF's and FPF's are NOT RESTORED. User must do this before - * using this macro. - */ -#define RETURN \ - .if __framesize @\ - lwz32 r0,r1,SAVED_LR @\ - lwz32 r12,r1,SAVED_CR @\ - addic sp,r1,__framesize @\ - mtlr r0 @\ - mtcrf 0xff,r12 @\ - blr @\ - .else @\ - blr @\ - .endif - - -/* - * Macros for declaring procedures - * - * Use of these macros allows ctags to have a predictable way - * to find various types of declarations. They also simplify - * inserting appropriate symbol table information. - * - * NOTE: these simple stubs will be replaced with more - * complicated versions once we know what the linker and gdb - * will require as far as register use masks and frame declarations. - * These macros may also be ifdef'ed in the future to contain profiling - * code. - * - * FIXME: Document what makes a leaf a LEAF and a handler a HANDLER. - * (E.g. leaf's have return pc in lr, NESTED's have rpc in offset off - * sp, handlers have rpc in exception frame which is found via exception - * link, etc etc.) - */ - -/* - * TEXT -- declare start of text segment - */ -#define TEXT \ - .text @\ - .align 2 - -/* - * LEAF -- declare global leaf procedure - * NOTE: Control SHOULD NOT FLOW into a LEAF! A LEAF should only - * be jumped to. (A leaf may do an align.) Use a LABEL() if you - * need control to flow into the label. - */ -#define LEAF(name) \ - .align 2 @\ - .globl name @\ -name: @\ - .set __framesize,0 - -/* - * X_LEAF -- declare alternate global label for leaf - */ -#define X_LEAF(name, value) \ - .globl name @\ - .set name,value - -/* - * P_LEAF -- declare private leaf procedure - */ -#define P_LEAF(name) \ - .align 2 @\ -name: @\ - .set __framesize,0 - -/* - * LABEL -- declare a global code label - * MUST be used (rather than LEAF, NESTED, etc) if control - * "flows into" the label. - */ -#define LABEL(name) \ - .align 2 @\ - .globl name @\ -name: - -/* - * NESTED -- declare procedure that invokes other procedures - */ -#define NESTED(name, localvarsize, maxargsout, grfsaved, fpfsaved)\ - .align 2 @\ - .globl name @\ -name: @\ - BUILD_FRAME(localvarsize, maxargsout, grfsaved, fpfsaved) - -/* - * X_NESTED -- declare alternate global label for nested proc - */ -#define X_NESTED(name, value) \ - .globl name @\ - .set name,value - -/* - * P_NESTED -- declare private nested procedure - */ -#define P_NESTED(name, localvarsize, maxargsout, grfsaved, fpfsaved)\ - .align 2 @\ -name: @\ - BUILD_FRAME(locavarsize, maxargsout, grfsaved, fpfsaved) - -/* - * HANDLER -- declare procedure with exception frame rather than - * standard C frame - */ -#define HANDLER(name) \ - .align 2 @\ - .globl name @\ -name: - -/* - * X_HANDLER -- declare alternate name for exception handler - * (Should appear immediately before a HANDLER declaration or - * another X_HANDLER declaration) - */ -#define X_HANDLER(name) \ - .align 2 @\ - .globl name @\ -name: - -/* - * P_HANDLER -- declare private handler - */ -#define P_HANDLER(name) \ - .align 2 @\ -name: - -/* - * END -- mark end of procedure - * FIXME: Unimplemented for now. 
- */ -#define END(name) - -/* - * BL -- call procedure (relative) - */ -#define BL(name) \ - bl name - -/* - * Storage definition macros - * The main purpose of these is to allow an easy handle for ctags - */ - -/* - * IMPORT -- import symbol - */ -#define IMPORT(name) \ - .reference name - -/* - * ABS -- declare global absolute symbol - */ -#define ABS(name, value) \ - .globl name @\ - .set name,value - -/* - * P_ABS -- declare private absolute symbol - */ -#define P_ABS(name, value) \ - .set name,value - -/* - * EXPORT -- declare global label for data - */ -#define EXPORT(name) \ - .align 2 @\ - .globl name @\ -name: - -/* - * BSS -- declare global zero'ed storage - */ -#define BSS(name,size) \ - .comm name,size - - -/* - * P_BSS -- declare private zero'ed storage - */ -#define P_BSS(name,size) \ - .lcomm name,size - -/* - * dynamic/PIC macros for routines which reference external symbols - */ -#if defined(__DYNAMIC__) -#define PICIFY_REG r12 - -/* Assume that the lr is saved before calling any of these macros */ -/* using PICIFY() */ - -#define PICIFY(var) \ - mflr r0 @\ - bl 1f @\ -1: mflr PICIFY_REG @\ - mtlr r0 @\ - addis PICIFY_REG, PICIFY_REG, ha16(L ## var ## $non_lazy_ptr - 1b) @\ - lwz PICIFY_REG, lo16(L ## var ## $non_lazy_ptr - 1b)(PICIFY_REG) - -#define CALL_EXTERN_AGAIN(var) \ - PICIFY(var) @\ - mtctr PICIFY_REG @\ - mflr r0 @\ - stw r0,8(r1) @\ - stwu r1,-56(r1) @\ - bctrl @\ - addic r1,r1,56 @\ - lwz r0,8(r1) @\ - mtlr r0 - -#define NON_LAZY_STUB(var) \ - .non_lazy_symbol_pointer @\ - .align 2 @\ -L ## var ## $non_lazy_ptr: @\ - .indirect_symbol var @\ - .long 0 @\ - .text @\ - .align 2 - -#define BRANCH_EXTERN(var) \ - PICIFY(var) @\ - mtctr PICIFY_REG @\ - bctr @\ - NON_LAZY_STUB(var) - -#define CALL_EXTERN(var) \ - CALL_EXTERN_AGAIN(var) @\ - NON_LAZY_STUB(var) - -#define REG_TO_EXTERN(reg, var) \ - PICIFY(var) @\ - stw reg, 0(PICIFY_REG) @\ - NON_LAZY_STUB(var) - -#define EXTERN_TO_REG(reg, var) \ - PICIFY(var) @\ - lwz reg, 0(PICIFY_REG) @\ - NON_LAZY_STUB(var) - -#else /* ! __DYNAMIC__ */ -#define TMP_REG r12 -#define BRANCH_EXTERN(var) \ - b var - -#define CALL_EXTERN(var) \ - bl var - -#define CALL_EXTERN_AGAIN(var) \ - CALL_EXTERN(var) - -#define REG_TO_EXTERN(reg, var) \ - lis TMP_REG, ha16(var) @\ - stw reg, lo16(var)(TMP_REG) - -#define EXTERN_TO_REG(reg, var) \ - lis reg, ha16(var) @\ - lwz reg, lo16(var)(reg) - -#endif /* __DYNAMIC__ */ - -#endif /* __ASSEMBLER__ */ -#endif /* _ARCH_PPC_ASM_HELP_H_ */ diff --git a/EXTERNAL_HEADERS/architecture/ppc/basic_regs.h b/EXTERNAL_HEADERS/architecture/ppc/basic_regs.h deleted file mode 100644 index b9dbdf699..000000000 --- a/EXTERNAL_HEADERS/architecture/ppc/basic_regs.h +++ /dev/null @@ -1,306 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1996 NeXT Software, Inc. All rights reserved. - * - * File: architecture/ppc/basic_regs.h - * Author: Doug Mitchell, NeXT Software, Inc. - * - * Basic ppc registers. - * - * HISTORY - * 22-May-97 Umesh Vaishampayan (umeshv@apple.com) - Updated to match MPCFPE32B/AD 1/97 REV. 1 - * 29-Dec-96 Umesh Vaishampayan (umeshv@NeXT.com) - * Ported from m98k. - * 05-Nov-92 Doug Mitchell at NeXT - * Created. - */ - -#ifndef _ARCH_PPC_BASIC_REGS_H_ -#define _ARCH_PPC_BASIC_REGS_H_ - -#include <architecture/ppc/reg_help.h> -#include <architecture/ppc/macro_help.h> - -#if !defined(__ASSEMBLER__) - -/* - * Number of General Purpose registers. - */ -#define PPC_NGP_REGS 32 - -/* - * Common half-word used in Machine State Register and in - * various exception frames. Defined as a macro because the compiler - * will align a struct to a word boundary when used inside another struct. - */ -#define MSR_BITS \ - unsigned ee:BIT_WIDTH(15), /* external intr enable */ \ - pr:BIT_WIDTH(14), /* problem state */ \ - fp:BIT_WIDTH(13), /* floating point avail */ \ - me:BIT_WIDTH(12), /* machine check enable */ \ - fe0:BIT_WIDTH(11), /* fp exception mode 0 */ \ - se:BIT_WIDTH(10), /* single step enable */ \ - be:BIT_WIDTH(9), /* branch trace enable */ \ - fe1:BIT_WIDTH(8), /* fp exception mode 0 */ \ - rsvd1:BIT_WIDTH(7), /* reserved */ \ - ip:BIT_WIDTH(6), /* interrupt prefix */ \ - ir:BIT_WIDTH(5), /* instruction relocate */ \ - dr:BIT_WIDTH(4), /* data relocate */ \ - rsvd2:BITS_WIDTH(3,2), /* reserved */ \ - ri:BIT_WIDTH(1), /* recoverable exception */ \ - le:BIT_WIDTH(0) /* Little-endian mode */ - -/* - * Machine state register. - * Read and written via get_msr() and set_msr() inlines, below. 
- */ -typedef struct { - unsigned rsvd3:BITS_WIDTH(31,19), // reserved - pow:BIT_WIDTH(18), // Power management enable - rsvd0: BIT_WIDTH(17), // reserved - ile: BIT_WIDTH(16); // exception little endian - - MSR_BITS; // see above -} msr_t; - -/* - * Data Storage Interrupt Status Register (DSISR) - */ -typedef struct { - unsigned dse:BIT_WIDTH(31); // direct-store error - unsigned tnf:BIT_WIDTH(30); // translation not found - unsigned :BITS_WIDTH(29,28); - unsigned pe:BIT_WIDTH(27); // protection error - unsigned dsr:BIT_WIDTH(26); // lwarx/stwcx to direct-store - unsigned rw:BIT_WIDTH(25); // 1 => store, 0 => load - unsigned :BITS_WIDTH(24,23); - unsigned dab:BIT_WIDTH(22); // data address bkpt (601) - unsigned ssf:BIT_WIDTH(21); // seg table search failed - unsigned :BITS_WIDTH(20,0); -} dsisr_t; - -/* - * Instruction Storage Interrupt Status Register (really SRR1) - */ -typedef struct { - unsigned :BIT_WIDTH(31); - unsigned tnf:BIT_WIDTH(30); // translation not found - unsigned :BIT_WIDTH(29); - unsigned dse:BIT_WIDTH(28); // direct-store fetch error - unsigned pe:BIT_WIDTH(27); // protection error - unsigned :BITS_WIDTH(26,22); - unsigned ssf:BIT_WIDTH(21); // seg table search failed - unsigned :BITS_WIDTH(20,16); - MSR_BITS; -} isisr_t; - -/* - * Alignment Interrupt Status Register (really DSISR) - * NOTE: bit numbers in field *names* are in IBM'ese (0 is MSB). - * FIXME: Yuck!!! Double Yuck!!! - */ -typedef struct { - unsigned :BITS_WIDTH(31,20); - unsigned ds3031:BITS_WIDTH(19,18);// bits 30:31 if DS form - unsigned :BIT_WIDTH(17); - unsigned x2930:BITS_WIDTH(16,15); // bits 29:30 if X form - unsigned x25:BIT_WIDTH(14); // bit 25 if X form or - // bit 5 if D or DS form - unsigned x2124:BITS_WIDTH(13,10); // bits 21:24 if X form or - // bits 1:4 if D or DS form - unsigned all615:BITS_WIDTH(9,0); // bits 6:15 of instr - MSR_BITS; -} aisr_t; - -/* - * Program Interrupt Status Register (really SRR1) - */ -typedef struct { - unsigned :BITS_WIDTH(31,21); - unsigned fpee:BIT_WIDTH(20); // floating pt enable exception - unsigned ill:BIT_WIDTH(19); // illegal instruction - unsigned priv:BIT_WIDTH(18); // privileged instruction - unsigned trap:BIT_WIDTH(17); // trap program interrupt - unsigned subseq:BIT_WIDTH(16); // 1 => SRR0 points to - // subsequent instruction - MSR_BITS; -} pisr_t; - -/* - * Condition register. May not be useful in C, let's see... - */ -typedef struct { - unsigned lt:BIT_WIDTH(31), // negative - gt:BIT_WIDTH(30), // positive - eq:BIT_WIDTH(29), // equal to zero - so:BIT_WIDTH(28), // summary overflow - fx:BIT_WIDTH(27), // floating point exception - fex:BIT_WIDTH(26), // fp enabled exception - vx:BIT_WIDTH(25), // fp invalid operation - // exception - ox:BIT_WIDTH(24), // fp overflow exception - rsvd:BITS_WIDTH(23,0); // reserved -} cr_t; - -/* - * Abstract values representing fe0:fe1. - * See get_fp_exc_mode(), below. - */ -typedef enum { - FEM_IGNORE_EXCEP, // ignore exceptions - FEM_IMPR_NONREC, // imprecise nonrecoverable - FEM_IMPR_RECOV, // imprecise recoverable - FEM_PRECISE -} fp_exc_mode_t; - - -/* - * Special purpose registers. - */ - -/* - * Processor version register (special purpose register pvr). 
- */ -typedef struct { - unsigned version:BITS_WIDTH(31,16), - revision:BITS_WIDTH(15,0); -} pvr_t; - -/* - * Fixed point exception register (special purpose register xer) - */ -typedef struct { - unsigned so:BIT_WIDTH(31), // summary overflow - ov:BIT_WIDTH(30), // overflow - ca:BIT_WIDTH(29), // carry - rsvd1:BITS_WIDTH(28,7), // reserved - byte_count:BITS_WIDTH(6,0); -} xer_t; - -/* - * Inlines and macros to manipulate the above registers. - */ - -/* - * Get/set machine state register. - */ -static __inline__ msr_t -get_msr() -{ - msr_t __msr_tmp; - __asm__ volatile ("mfmsr %0 /* mfmsr */" : "=r" (__msr_tmp)); - return __msr_tmp; -} - -static __inline__ void -set_msr(msr_t msr) -{ - __asm__ volatile ("mtmsr %0 /* mtmsr */ " : : "r" (msr)); -} - -/* - * Determine current fp_exc_mode_t given prog_mode. - */ -static __inline__ fp_exc_mode_t -get_fp_exc_mode(pmr_t pmr) -{ - if(pmr.fe0) - return pmr.fe1 ? FEM_PRECISE : FEM_IMPR_RECOV; - else - return pmr.fe1 ? FEM_IMPR_NONREC : FEM_IGNORE_EXCEP; -} - -/* - * Software definitions for special purpose registers. - * The same register is used as per_cpu data pointer and - * vector base register. This requires that the vector - * table be the first item in the per_cpu table. - */ -#define SR_EXCEPTION_TMP_LR sprg0 -#define SR_EXCEPTION_TMP_CR sprg1 -#define SR_EXCEPTION_TMP_AT sprg2 -#define SR_PER_CPU_DATA sprg3 -#define SR_VBR sprg3 - -/* - * Get/set special purpose registers. - * - * GET_SPR - get SPR by name. - * - * Example usage: - * - * { - * xer_t some_xer; - * - * some_xer = GET_SPR(xer_t, xer); - * ... - * } - * - * This is a strange one. We're creating a list of C expressions within - * a set of curlies; the last expression ("__spr_tmp;") is the return value - * of the statement created by the curlies. - * - */ - -#define GET_SPR(type, spr) \ -({ \ - unsigned __spr_tmp; \ - __asm__ volatile ("mfspr %0, " STRINGIFY(spr) : "=r" (__spr_tmp)); \ - *(type *)&__spr_tmp; \ -}) - -/* - * Example usage of SET_SPR: - * - * { - * xer_t some_xer; - * - * ...set up some_xer... - * SET_SPR(xer, some_xer); - * } - */ -#define SET_SPR(spr, val) \ -MACRO_BEGIN \ - __typeof__ (val) __spr_tmp = (val); \ - __asm__ volatile ("mtspr "STRINGIFY(spr) ", %0" : : "r" (__spr_tmp)); \ -MACRO_END - -/* - * Fully synchronize instruction stream. - */ -static __inline__ void -ppc_sync() -{ - __asm__ volatile ("sync /* sync */" : : ); -} - -#endif /* ! __ASSEMBLER__ */ - -#endif /* _ARCH_PPC_BASIC_REGS_H_ */ - diff --git a/EXTERNAL_HEADERS/architecture/ppc/fp_regs.h b/EXTERNAL_HEADERS/architecture/ppc/fp_regs.h deleted file mode 100644 index ab48b8821..000000000 --- a/EXTERNAL_HEADERS/architecture/ppc/fp_regs.h +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
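The GET_SPR macro above leans on the GNU C statement-expression extension: a parenthesized brace block is an expression whose value is that of its last statement, which is how "__spr_tmp;" becomes the macro's return value. A minimal host-side sketch of the same idiom, assuming only GCC or Clang (CLAMP_BYTE is a hypothetical example, not part of the header):

#include <stdio.h>

/* A ({ ... }) block is an expression whose value is its last
 * statement -- the same trick GET_SPR uses to return __spr_tmp. */
#define CLAMP_BYTE(x) ({ int __t = (x); __t > 255 ? 255 : __t; })

int main(void)
{
	printf("%d %d\n", CLAMP_BYTE(300), CLAMP_BYTE(7));	/* prints: 255 7 */
	return 0;
}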
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1996 NeXT Software, Inc. All rights reserved. - * - * File: architecture/ppc/fp_regs.h - * Author: Doug Mitchell, NeXT Software, Inc. - * - * ppc floating point registers. - * - * HISTORY - * 29-Dec-96 Umesh Vaishampayan (umeshv@NeXT.com) - * Ported from m98k. - * 05-Nov-92 Doug Mitchell at NeXT - * Created. - */ - -#ifndef _ARCH_PPC_FP_REGS_H_ -#define _ARCH_PPC_FP_REGS_H_ - -#include <architecture/ppc/reg_help.h> - -#if !defined(__ASSEMBLER__) -/* - * Floating point status and control register. - * - * This struct is aligned to an 8-byte boundary because 64-bit - * load/store instructions (lfd/stfd) are used to access it. The - * FPSCR can only be read/written through other FP registers. - */ -typedef struct { - unsigned unused[1] __attribute__(( aligned(8) )); - unsigned fx:BIT_WIDTH(31), // exception summary - fex:BIT_WIDTH(30), // enabled exception summary - vx:BIT_WIDTH(29), // invalid op exception - // summary - ox:BIT_WIDTH(28), // overflow exception - ux:BIT_WIDTH(27), // underflow exception - zx:BIT_WIDTH(26), // divide by zero exception - xx:BIT_WIDTH(25), // inexact exception - vx_snan:BIT_WIDTH(24), // not a number exception - vx_isi:BIT_WIDTH(23), // exception - vx_idi:BIT_WIDTH(22), // exception - vx_zdz:BIT_WIDTH(21), // exception - vx_imz:BIT_WIDTH(20), // exception - vx_xvc:BIT_WIDTH(19), // exception - fr:BIT_WIDTH(18), // fraction rounded - fi:BIT_WIDTH(17), // fraction inexact - class:BIT_WIDTH(16), // class descriptor - fl:BIT_WIDTH(15), // negative - fg:BIT_WIDTH(14), // positive - fe:BIT_WIDTH(13), // equal or zero - fu:BIT_WIDTH(12), // not a number - rsvd1:BIT_WIDTH(11), // reserved - vx_soft:BIT_WIDTH(10), // software request exception - rsvd2:BIT_WIDTH(9), // reserved - vx_cvi:BIT_WIDTH(8), // invalid integer convert - // exception - ve:BIT_WIDTH(7), // invalid op exception enable - oe:BIT_WIDTH(6), // overflow exception enable - ue:BIT_WIDTH(5), // underflow exception enable - ze:BIT_WIDTH(4), // divide by zero exception - // enable - xe:BIT_WIDTH(3), // inexact exception enable - ni:BIT_WIDTH(2), // non-IEEE exception enable - rn:BITS_WIDTH(1,0); // rounding control -} ppc_fp_scr_t; - -/* - * Values for fp_scr_t.rn (rounding control). - */ -typedef enum { - RN_NEAREST = 0, - RN_TOWARD_ZERO = 1, - RN_TOWARD_PLUS = 2, - RN_TOWARD_MINUS = 3 -} ppc_fp_rn_t; - -/* - * ppc_fpf_t -- data types that MAY be in floating point register file - * Actual data types supported is implementation dependent - */ -typedef union { - float f; // 32 bit IEEE single - double d; // 64 bit IEEE double - - /* - * Insure compiler aligns struct appropriately - */ - unsigned x[2] __attribute__(( aligned(8) )); -} ppc_fpf_t; - -/* - * Number of FP registers. - */ -#define PPC_NFP_REGS 32 - -/* - * Read/write FPSCR. - * FIXME - these don't work, you need to go thru a fp register. - */ -typedef union { - double __dbl; - ppc_fp_scr_t __scr; -} __fp_un_t; - -static __inline__ ppc_fp_scr_t -get_fp_scr() -{ - __fp_un_t __fp_un; - - __asm__ volatile ("mffs. 
%0 /* mffs */" \ - : "=f" (__fp_un.__dbl)); - return (__fp_un.__scr); -} - -static __inline__ void -set_fp_scr(ppc_fp_scr_t fp_scr) -{ - __fp_un_t __fp_un; - - __fp_un.__scr = fp_scr; - __asm__ volatile ("mtfsf 0xff, %0; /* mtfsf */ " \ - : : "f" (__fp_un.__dbl)); -} - -#endif /* ! __ASSEMBLER__ */ - -#endif /* _ARCH_PPC_FP_REGS_H_ */ diff --git a/EXTERNAL_HEADERS/architecture/ppc/macro_help.h b/EXTERNAL_HEADERS/architecture/ppc/macro_help.h deleted file mode 100644 index a149f8eb0..000000000 --- a/EXTERNAL_HEADERS/architecture/ppc/macro_help.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1996 NeXT Software, Inc. - */ -/* - * Mach Operating System - * Copyright (c) 1989 Carnegie-Mellon University - * Copyright (c) 1988 Carnegie-Mellon University - * All rights reserved. The CMU software License Agreement specifies - * the terms and conditions for use and redistribution. - * - * File: architecture/ppc/macro_help.h - * - * Provide help in making lint-free macro routines - * - * HISTORY - * - * 29-Dec-96 Umesh Vaishampayan (umeshv@NeXT.com) - * Created from m98k version. - */ - -#ifndef _ARCH_PPC_MACRO_HELP_H_ -#define _ARCH_PPC_MACRO_HELP_H_ - -#ifndef MACRO_BEGIN -# define MACRO_BEGIN do { -#endif /* MACRO_BEGIN */ - -#ifndef MACRO_END -# define MACRO_END } while (0) -#endif /* MACRO_END */ - -#ifndef MACRO_RETURN -# define MACRO_RETURN if (1) return -#endif /* MACRO_RETURN */ - -#endif /* _ARCH_PPC_MACRO_HELP_H_ */ - diff --git a/EXTERNAL_HEADERS/architecture/ppc/pseudo_inst.h b/EXTERNAL_HEADERS/architecture/ppc/pseudo_inst.h deleted file mode 100644 index da4071e6b..000000000 --- a/EXTERNAL_HEADERS/architecture/ppc/pseudo_inst.h +++ /dev/null @@ -1,420 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-/* Copyright (c) 1996 NeXT Software, Inc.  All rights reserved.
- *
- * File:	architecture/ppc/pseudo_inst.h
- * Author:	Mike DeMoney
- *
- * This header file defines assembler pseudo-instruction macros
- * for the ppc.
- *
- * NOTE: This is obviously only useful to include in assembly
- * code source.
- *
- * ALSO NOTE: These macros don't attempt to be 64-bit compatible
- *
- * HISTORY
- * 29-Dec-96  Umesh Vaishampayan (umeshv@NeXT.com)
- *	Ported from m98k.
- * 05-Nov-92  Mike DeMoney (mike@next.com)
- *	Created.
- */
-
-#ifndef _ARCH_PPC_PSEUDO_INST_H_
-#define _ARCH_PPC_PSEUDO_INST_H_
-
-#include <architecture/ppc/reg_help.h>
-#include <architecture/ppc/asm_help.h>
-
-#ifdef __ASSEMBLER__
-
-/*
- * Pseudo instruction definitions
- */
-
-/*
- * Macro package initialization
- */
-	.set	__no_at,0	/* allow at by default */
-
-/*
- * .at_off -- disable use of at by macros
- * .at_on -- enable use of at by macros
- */
-.macro .at_off
-	.set	__no_at,1
-.endmacro
-
-.macro .at_on
-	.set	__no_at,0
-.endmacro
-
-/*
- * li32	rD,IMMED
- *
- * Load 32-bit immediate into rD
- * FIXME: Need a way to undefine built-in macro for this.
- */
-.macro li32	// li32 rD,immed
-.if $n != 2
-	.abort "invalid operands of li32"
-.endif
-.abs	__is_abs,$1
-.if !__is_abs
-	addis	$0,0,hi16($1)
-	ori	$0,$0,lo16($1)
-.elseif $1 == 0
-	addi	$0,0,0
-.elseif ($1 & 0xffff) == 0
-	addis	$0,0,hi16($1)
-.elseif ($1 & 0xffff8000) == 0
-	addi	$0,0,$1
-.elseif ($1 & 0xffff8000) == 0xffff8000
-	addi	$0,0,$1
-.else
-	addis	$0,0,hi16($1)
-	ori	$0,$0,lo16($1)
-.endif
-.endmacro
-
-
-/*
- * andi32. rD,rS1,IMMED
- *
- * Perform "andi." with (possibly) 32-bit immediate
- */
-.macro andi32.	// andi32. rD,rS1,IMMED
-.if $n != 3
-	.abort "invalid operands of andi."
-.endif
-	.set	__used_at,0
-.abs	__is_abs,$2
-.if !__is_abs
-	.set	__used_at,1
-	li32	at,$2
-	and.	$0,$1,at
-.elseif ($2 & 0xffff0000) == 0
-	andi.	$0,$1,$2
-.elseif ($2 & 0xffff) == 0
-	andis.	$0,$1,hi16($2)
-.else
-	.set	__used_at,1
-	li32	at,$2
-	and.
$0,$1,at -.endif -.if __no_at & __used_at - .abort "Macro uses at while .no_at in effect" -.endif -.endmacro - -/* - * ori32 rD,rS1,IMMED - * - * Perform "ori" with (possibly) 32-bit immediate - */ -.macro ori32 // ori32 rD,rS1,IMMED -.if $n != 3 - .abort "invalid operands of ori" -.endif -.abs __is_abs,$2 -.if !__is_abs - oris $0,$1,hi16($2) - ori $0,$1,lo16($2) -.elseif ($2 & 0xffff0000) == 0 - ori $0,$1,$2 -.elseif ($2 & 0xffff) == 0 - oris $0,$1,hi16($2) -.else - oris $0,$1,hi16($2) - ori $0,$1,lo16($2) -.endif -.endmacro - -/* - * xori32 rD,rS1,IMMED - * - * Perform "xor" with (possibly) 32-bit immediate - */ -.macro xori32 // xori32 rD,rS1,IMMED -.if $n != 3 - .abort "invalid operands of xori" -.endif -.abs __is_abs,$2 -.if !__is_abs - xoris $0,$1,hi16($2) - xori $0,$1,lo16($2) -.elseif ($2 & 0xffff0000) == 0 - xori $0,$1,$2 -.elseif ($2 & 0xffff) == 0 - xoris $0,$1,hi16($2) -.else - xoris $0,$1,hi16($2) - xori $0,$1,lo16($2) -.endif -.endmacro - - -/* - * MEMREF_INST -- macros to memory referencing instructions - * "capable" of dealing with 32 bit offsets. - * - * NOTE: Because the assembler doesn't have any mechanism for easily - * parsing the d(rS) syntax of register-displacement form instructions, - * these instructions do NOT mirror the normal memory reference - * instructions. The following "transformation" is used: - * lbz rD,d(rS) - * becomes: - * lbz32 rD,rS,d - * I.e.: "32" is appended to the instruction name and the base register - * and displacement become the 2'nd and 3'rd comma-separated operands. - * - * The forms: - * lbz32 rD,d - * and: - * lbz32 rD,rS - * are also recognized and the missing operand is assumed 0. - * - * ALSO NOTE: r0 or zt should never be used as rS in these instructions. - * Use "0" as rS in this case. - */ -#define MEMREF_INST(op) \ -.macro op ## 32 @\ -.set __used_at,0 @\ -.if $n == 3 @\ - .greg __is_greg,$1 @\ - .abs __is_abs,$2 @\ - .if __is_abs @\ - .if ($2 & 0xffff8000) == 0 @\ - op $0,$2($1) @\ - .elseif ($2 & 0xffff8000) == 0xffff8000 @\ - op $0,$2($1) @\ - .else @\ - .if !__is_greg @\ - .set __used_at,1 @\ - lis at,ha16($2) @\ - op $0,lo16($2)(at) @\ - .else @\ - .set __used_at,1 @\ - lis at,ha16($2) @\ - add at,at,$1 @\ - op $0,lo16($2)(at) @\ - .endif @\ - .endif @\ - .else @\ - .if !__is_greg @\ - .set __used_at,1 @\ - lis at,ha16($2) @\ - op $0,lo16($2)(at) @\ - .else @\ - .set __used_at,1 @\ - lis at,ha16($2) @\ - add at,at,$1 @\ - op $0,lo16($2)(at) @\ - .endif @\ - .endif @\ -.elseif $n == 2 @\ - .greg __is_greg,$1 @\ - .if !__is_greg @\ - .abs __is_abs,$1 @\ - .if __is_abs @\ - .if ($1 & 0xffff8000) == 0 @\ - op $0,$1(0) @\ - .elseif ($1 & 0xffff8000) == 0xffff8000 @\ - op $0,$1(0) @\ - .else @\ - .set __used_at,1 @\ - lis at,ha16($1) @\ - op $0,lo16($1)(at) @\ - .endif @\ - .else @\ - .set __used_at,1 @\ - lis at,ha16($1) @\ - op $0,lo16($1)(at) @\ - .endif @\ - .else @\ - op $0,0($1) @\ - .endif @\ -.else @\ - .abort "Invalid operands of " #op "32" @\ -.endif @\ -.if __no_at & __used_at @\ - .abort "Macro uses at while .no_at in effect" @\ -.endif @\ -.endmacro - -MEMREF_INST(lbz) -MEMREF_INST(lhz) -MEMREF_INST(lha) -MEMREF_INST(lwz) -MEMREF_INST(lwa) -MEMREF_INST(ld) - -MEMREF_INST(stb) -MEMREF_INST(sth) -MEMREF_INST(stw) -MEMREF_INST(std) - -MEMREF_INST(lmw) -MEMREF_INST(lmd) -MEMREF_INST(stmw) -MEMREF_INST(stmd) - -/* - * ARITH_INST -- define 32-bit immediate forms of arithmetic - * instructions - * - * E.g. 
addi32 rD,rS,IMMED - */ -#define ARITH_INST(op, op3, sf) \ -.macro op ## 32 ## sf @\ -.if $n != 3 @\ - .abort "invalid operands to " #op "32" @\ -.endif @\ -.abs __is_abs,$2 @\ -.if __is_abs @\ - .if ($2 & 0xffff8000) == 0 @\ - op##sf $0,$1,$2 @\ - .elseif ($2 & 0xffff8000) == 0xffff8000 @\ - op##sf $0,$1,$2 @\ - .elseif __no_at @\ - .abort "Macro uses at while .no_at in effect" @\ - .else @\ - li32 at,$2 @\ - op3##sf $0,$1,at @\ - .endif @\ -.elseif __no_at @\ - .abort "Macro uses at while .no_at in effect" @\ -.else @\ - li32 at,$2 @\ - op3##sf $0,$1,at @\ -.endif @\ -.endmacro - -ARITH_INST(addi, add, ) -ARITH_INST(subi, sub, ) -ARITH_INST(addic, addc, ) -ARITH_INST(subic, subc, ) -ARITH_INST(addic, addc, .) -ARITH_INST(subic, subc, .) -ARITH_INST(mulli, mull, ) - -/* - * CMPEX_INST -- define 32-bit immediate forms of extended compare - * instructions - * - * E.g. cmpwi32 cr3,rS,IMMED - * cmpwi32 rS,IMMED - */ -#define CMPEX_INST(op, op3) \ -.macro op ## 32 @\ -.if $n == 3 @\ - .abs __is_abs,$2 @\ - .if __is_abs @\ - .if ($2 & 0xffff8000) == 0 @\ - op $0,$1,$2 @\ - .elseif ($2 & 0xffff8000) == 0xffff8000 @\ - op $0,$1,$2 @\ - .elseif __no_at @\ - .abort "Macro uses at while .no_at in effect" @\ - .else @\ - li32 at,$2 @\ - op3 $0,$1,at @\ - .endif @\ - .elseif __no_at @\ - .abort "Macro uses at while .no_at in effect" @\ - .else @\ - li32 at,$2 @\ - op3 $0,$1,at @\ - .endif @\ -.elseif $n == 2 @\ - .abs __is_abs,$1 @\ - .if __is_abs @\ - .if ($1 & 0xffff8000) == 0 @\ - op $0,$1 @\ - .elseif ($1 & 0xffff8000) == 0xffff8000 @\ - op $0,$1 @\ - .elseif __no_at @\ - .abort "Macro uses at while .no_at in effect" @\ - .else @\ - li32 at,$1 @\ - op3 $0,at @\ - .endif @\ - .elseif __no_at @\ - .abort "Macro uses at while .no_at in effect" @\ - .else @\ - li32 at,$1 @\ - op3 $0,at @\ - .endif @\ -.else @\ - .abort "invalid operands to " #op "32" @\ -.endif @\ -.endmacro - -CMPEX_INST(cmpdi, cmpd) -CMPEX_INST(cmpwi, cmpw) -CMPEX_INST(cmpldi, cmpld) -CMPEX_INST(cmplwi, cmplw) - -/* - * CMP_INST -- define 32-bit immediate forms of standard compare - * instructions - * - * E.g. cmpi32 cr3,0,rS,IMMED - */ -#define CMP_INST(op, op3) \ -.macro op ## 32 @\ -.if $n == 4 @\ - .abs __is_abs,$3 @\ - .if __is_abs @\ - .if ($3 & 0xffff8000) == 0 @\ - op $0,$1,$2,$3 @\ - .elseif ($3 & 0xffff8000) == 0xffff8000 @\ - op $0,$1,$2,$3 @\ - .elseif __no_at @\ - .abort "Macro uses at while .no_at in effect" @\ - .else @\ - li32 at,$3 @\ - op3 $0,$1,$2,at @\ - .endif @\ - .elseif __no_at @\ - .abort "Macro uses at while .no_at in effect" @\ - .else @\ - li32 at,$3 @\ - op3 $0,$1,$2,at @\ - .endif @\ -.else @\ - .abort "invalid operands to " #op "32" @\ -.endif @\ -.endmacro - -CMP_INST(cmpi, cmp) -CMP_INST(cmpli, cmpl) - -#endif /* __ASSEMBLER__ */ - -#endif /* _ARCH_PPC_PSEUDO_INST_H_ */ diff --git a/EXTERNAL_HEADERS/architecture/ppc/reg_help.h b/EXTERNAL_HEADERS/architecture/ppc/reg_help.h deleted file mode 100644 index 6a0e2842e..000000000 --- a/EXTERNAL_HEADERS/architecture/ppc/reg_help.h +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1996 NeXT Software, Inc. All rights reserved. - * - * File: architecture/ppc/reg_help.h - * Author: Doug Mitchell, NeXT Computer, Inc. - * - * m98k-specific macros and inlines for defining machine registers. - * - * HISTORY - * 05-Nov-92 Doug Mitchell at NeXT - * Created. - * - * 29-Dec-96 Umesh Vaishampayan (umeshv@NeXT.com) - * Ported from m98k. Removed dependency on nrw directory. - * Merged code from architecture/nrw/reg_help.h. - * Moved Register Usage #defines from asm_help.h in here. - */ - -#ifndef _ARCH_PPC_REG_HELP_H_ -#define _ARCH_PPC_REG_HELP_H_ - -#if defined(__ASSEMBLER__) -/* - * GRF Register Usage Aliases - */ -#define zt r0 // architecturally 0 for mem refs only! - // real reg other inst, caller-saved -#define sp r1 // stack pointer, callee-saved -#define toc r2 // tbl of contents, callee-saved -#define a0 r3 // arg 0, return value 0, caller saved -#define a1 r4 // arg 1, return value 1, caller saved -#define a2 r5 // .... -#define a3 r6 -#define a4 r7 -#define a5 r8 -#define a6 r9 -#define a7 r10 // arg 7, return value 7, caller saved -#define ep r11 // environment ptr, caller saved -#define at r12 // assembler temp, caller saved -#define s17 r13 // callee-saved 17 -#define s16 r14 -#define s15 r15 -#define s14 r16 -#define s13 r17 -#define s12 r18 -#define s11 r19 -#define s10 r20 -#define s9 r21 -#define s8 r22 -#define s7 r23 -#define s6 r24 -#define s5 r25 -#define s4 r26 -#define s3 r27 -#define s2 r28 -#define s1 r29 // .... -#define s0 r30 // callee-saved 0 -#define fp r31 // frame-pointer, callee-saved - -/* - * Conversion of GRF aliases to register numbers - */ -#define GRF_ZT 0 // architecturally 0 for mem refs only! - // real reg other inst, caller-saved -#define GRF_SP 1 // stack pointer, callee-saved -#define GRF_TOC 2 // tbl of contents, callee-saved -#define GRF_A0 3 // arg 0, return value 0, caller saved -#define GRF_A1 4 // arg 1, return value 1, caller saved -#define GRF_A2 5 // .... -#define GRF_A3 6 -#define GRF_A4 7 -#define GRF_A5 8 -#define GRF_A6 9 -#define GRF_A7 10 // arg 7, return value 7, caller saved -#define GRF_EP 11 // environment ptr, caller saved -#define GRF_AT 12 // assembler temp, caller saved -#define GRF_S17 13 // callee-saved 17 -#define GRF_S16 14 -#define GRF_S15 15 -#define GRF_S14 16 -#define GRF_S13 17 -#define GRF_S12 18 -#define GRF_S11 19 -#define GRF_S10 20 -#define GRF_S9 21 -#define GRF_S8 22 -#define GRF_S7 23 -#define GRF_S6 24 -#define GRF_S5 25 -#define GRF_S4 26 -#define GRF_S3 27 -#define GRF_S2 28 -#define GRF_S1 29 // .... 
-#define GRF_S0 30 // callee-saved 0 -#define GRF_FP 31 // frame pointer, callee-saved - -/* - * FPF Register names - */ -#define ft0 f0 // scratch reg, caller-saved -#define fa0 f1 // fp arg 0, return 0, caller-saved -#define fa1 f2 // fp arg 1, caller-saved -#define fa2 f3 // fp arg 2, caller-saved -#define fa3 f4 -#define fa4 f5 -#define fa5 f6 -#define fa6 f7 -#define fa7 f8 -#define fa8 f9 -#define fa9 f10 -#define fa10 f11 -#define fa11 f12 -#define fa12 f13 // fp arg 12, caller-saved -#define fs17 f14 // callee-saved 17 -#define fs16 f15 -#define fs15 f16 -#define fs14 f17 -#define fs13 f18 -#define fs12 f19 -#define fs11 f20 -#define fs10 f21 -#define fs9 f22 -#define fs8 f23 -#define fs7 f24 -#define fs6 f25 -#define fs5 f26 -#define fs4 f27 -#define fs3 f28 -#define fs2 f29 -#define fs1 f30 -#define fs0 f31 // callee-saved 0 - -/* - * Conversion of FPF aliases to register numbers - */ -#define FPF_FT0 0 // scratch reg, caller-saved -#define FPF_FA0 1 // fp arg 0, return 0, caller-saved -#define FPF_FA1 2 // fp arg 1, caller-saved -#define FPF_FA2 3 // fp arg 2, caller-saved -#define FPF_FA3 4 -#define FPF_FA4 5 -#define FPF_FA5 6 -#define FPF_FA6 7 -#define FPF_FA7 8 -#define FPF_FA8 9 -#define FPF_FA9 10 -#define FPF_FA10 11 -#define FPF_FA11 12 -#define FPF_FA12 13 // fp arg 12, caller-saved -#define FPF_FS17 14 // callee-saved 17 -#define FPF_FS16 15 -#define FPF_FS15 16 -#define FPF_FS14 17 -#define FPF_FS13 18 -#define FPF_FS12 19 -#define FPF_FS11 20 -#define FPF_FS10 21 -#define FPF_FS9 22 -#define FPF_FS8 23 -#define FPF_FS7 24 -#define FPF_FS6 25 -#define FPF_FS5 26 -#define FPF_FS4 27 -#define FPF_FS3 28 -#define FPF_FS2 29 -#define FPF_FS1 30 -#define FPF_FS0 31 // callee-saved 0 - -#endif /* __ASSEMBLER__ */ - - -/* Bitfield definition aid */ -#define BITS_WIDTH(msb, lsb) ((msb)-(lsb)+1) -#define BIT_WIDTH(pos) (1) /* mostly to record the position */ - -/* Mask creation */ -#define MKMASK(width, offset) (((unsigned)-1)>>(32-(width))<<(offset)) -#define BITSMASK(msb, lsb) MKMASK(BITS_WIDTH(msb, lsb), lsb & 0x1f) -#define BITMASK(pos) MKMASK(BIT_WIDTH(pos), pos & 0x1f) - -/* Register addresses */ -#if __ASSEMBLER__ -# define REG_ADDR(type, addr) (addr) -#else /* ! __ASSEMBLER__ */ -# define REG_ADDR(type, addr) (*(volatile type *)(addr)) -#endif /* __ASSEMBLER__ */ - -/* Cast a register to be an unsigned */ -/* CAUTION : non naturally aligned foo can result into alignment traps - * use at own risk. - */ -#define CONTENTS(foo) (*(unsigned *) &(foo)) - -/* STRINGIFY -- perform all possible substitutions, then stringify */ -#define __STR(x) #x /* just a helper macro */ -#define STRINGIFY(x) __STR(x) - -/* - * Stack pointer must always be a multiple of 16 - */ -#define STACK_INCR 16 -#define ROUND_FRAME(x) ((((unsigned)(x)) + STACK_INCR - 1) & ~(STACK_INCR-1)) - -#endif /* _ARCH_PPC_REG_HELP_H_ */ diff --git a/EXTERNAL_HEADERS/mach-o/arm/reloc.h b/EXTERNAL_HEADERS/mach-o/arm/reloc.h deleted file mode 100644 index e2da8b80c..000000000 --- a/EXTERNAL_HEADERS/mach-o/arm/reloc.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
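The mask helpers at the end of reg_help.h above are straightforward to sanity-check on a host. A self-contained sketch, with the macro definitions copied from the header so the fragment compiles on its own:

#include <assert.h>

/* Copied from reg_help.h above so this example is self-contained. */
#define BITS_WIDTH(msb, lsb)	((msb)-(lsb)+1)
#define BIT_WIDTH(pos)		(1)
#define MKMASK(width, offset)	(((unsigned)-1)>>(32-(width))<<(offset))
#define BITSMASK(msb, lsb)	MKMASK(BITS_WIDTH(msb, lsb), lsb & 0x1f)
#define BITMASK(pos)		MKMASK(BIT_WIDTH(pos), pos & 0x1f)

int main(void)
{
	assert(MKMASK(8, 8) == 0x0000ff00u);	/* 8 bits wide, shifted up 8 */
	assert(BITSMASK(15, 8) == 0x0000ff00u);	/* same mask, named by bit range */
	assert(BITMASK(31) == 0x80000000u);	/* single-bit mask for the MSB */
	return 0;
}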
Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this
- * file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_LICENSE_HEADER_END@
- */
-/*
- * Relocation types used in the arm implementation.  Relocation entries for
- * things other than instructions use the same generic relocation as described
- * in <mach-o/reloc.h> and their r_type is ARM_RELOC_VANILLA, one of the
- * *_SECTDIFF or the *_PB_LA_PTR types.  The rest of the relocation types are
- * for instructions.  Since they are for instructions the r_address field
- * indicates the 32 bit instruction that the relocation is to be performed on.
- */
-enum reloc_type_arm
-{
-    ARM_RELOC_VANILLA,	/* generic relocation as described above */
-    ARM_RELOC_PAIR,	/* the second relocation entry of a pair */
-    ARM_RELOC_SECTDIFF,	/* a PAIR follows with subtract symbol value */
-    ARM_RELOC_LOCAL_SECTDIFF, /* like ARM_RELOC_SECTDIFF, but the symbol
-				 referenced was local.  */
-    ARM_RELOC_PB_LA_PTR,/* prebound lazy pointer */
-    ARM_RELOC_BR24,	/* 24 bit branch displacement (to a word address) */
-    ARM_THUMB_RELOC_BR22, /* 22 bit branch displacement (to a half-word
-			     address) */
-};
diff --git a/EXTERNAL_HEADERS/mach-o/loader.h b/EXTERNAL_HEADERS/mach-o/loader.h
index b00ac7a67..9fecf2b4a 100644
--- a/EXTERNAL_HEADERS/mach-o/loader.h
+++ b/EXTERNAL_HEADERS/mach-o/loader.h
@@ -197,6 +197,12 @@ struct mach_header_64 {
 					   load the main executable at a
 					   random address.  Only used in
 					   MH_EXECUTE filetypes. */
+#define	MH_NO_HEAP_EXECUTION 0x1000000	/* When this bit is set, the OS will
+					   run the main executable with
+					   a non-executable heap even on
+					   platforms (e.g. i386) that don't
+					   require it. Only used in MH_EXECUTE
+					   filetypes. */
 
 /*
  * The load commands directly follow the mach_header.  The total size of all
diff --git a/EXTERNAL_HEADERS/mach-o/ppc/reloc.h b/EXTERNAL_HEADERS/mach-o/ppc/reloc.h
deleted file mode 100644
index 7b564cc0a..000000000
--- a/EXTERNAL_HEADERS/mach-o/ppc/reloc.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 1999 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this
- * file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_LICENSE_HEADER_END@
- */
-/*
- * Relocation types used in the ppc implementation.
Relocation entries for
- * things other than instructions use the same generic relocation as described
- * above and their r_type is RELOC_VANILLA.  The rest of the relocation types
- * are for instructions.  Since they are for instructions the r_address field
- * indicates the 32 bit instruction that the relocation is to be performed on.
- * The fields r_pcrel and r_length are ignored for non-RELOC_VANILLA r_types
- * except for PPC_RELOC_BR14.
- *
- * For PPC_RELOC_BR14 if the r_length is the unused value 3, then the branch was
- * statically predicted setting or clearing the Y-bit based on the sign of the
- * displacement or the opcode.  If this is the case the static linker must flip
- * the value of the Y-bit if the sign of the displacement changes for non-branch
- * always conditions.
- */
-enum reloc_type_ppc
-{
-    PPC_RELOC_VANILLA,	/* generic relocation as described above */
-    PPC_RELOC_PAIR,	/* the second relocation entry of a pair */
-    PPC_RELOC_BR14,	/* 14 bit branch displacement (to a word address) */
-    PPC_RELOC_BR24,	/* 24 bit branch displacement (to a word address) */
-    PPC_RELOC_HI16,	/* a PAIR follows with the low half */
-    PPC_RELOC_LO16,	/* a PAIR follows with the high half */
-    PPC_RELOC_HA16,	/* Same as the RELOC_HI16 except the low 16 bits and the
-			 * high 16 bits are added together with the low 16 bits
-			 * sign extended first.  This means if bit 15 of the low
-			 * 16 bits is set the high 16 bits stored in the
-			 * instruction will be adjusted.
-			 */
-    PPC_RELOC_LO14,	/* Same as the LO16 except that the low 2 bits are not
-			 * stored in the instruction and are always zero.  This
-			 * is used in double word load/store instructions.
-			 */
-    PPC_RELOC_SECTDIFF,	/* a PAIR follows with subtract symbol value */
-    PPC_RELOC_PB_LA_PTR,/* prebound lazy pointer */
-    PPC_RELOC_HI16_SECTDIFF, /* section difference forms of above.  a PAIR */
-    PPC_RELOC_LO16_SECTDIFF, /* follows these with subtract symbol value */
-    PPC_RELOC_HA16_SECTDIFF,
-    PPC_RELOC_JBSR,
-    PPC_RELOC_LO14_SECTDIFF,
-    PPC_RELOC_LOCAL_SECTDIFF  /* like PPC_RELOC_SECTDIFF, but the symbol
-				 referenced was local.  */
-};
diff --git a/EXTERNAL_HEADERS/stdarg.h b/EXTERNAL_HEADERS/stdarg.h
index f178505e8..bbbaff93e 100644
--- a/EXTERNAL_HEADERS/stdarg.h
+++ b/EXTERNAL_HEADERS/stdarg.h
@@ -1,133 +1,47 @@
-/* Copyright (C) 1989, 1997, 1998, 1999, 2000 Free Software Foundation, Inc.
-
-This file is part of GCC.
-
-GCC is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2, or (at your option)
-any later version.
-
-GCC is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with GCC; see the file COPYING.  If not, write to
-the Free Software Foundation, 59 Temple Place - Suite 330,
-Boston, MA 02111-1307, USA.  */
-
-/* As a special exception, if you include this header file into source
-   files compiled by GCC, this header file does not by itself cause
-   the resulting executable to be covered by the GNU General Public
-   License.  This exception does not however invalidate any other
-   reasons why the executable file might be covered by the GNU General
-   Public License.
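The PPC_RELOC_HA16 comment above is easier to follow with the arithmetic written out. A self-contained sketch in C, assuming the conventional definitions of the hi16/lo16/ha16 assembler operators (the helper functions are hypothetical illustrations, not code from this patch):

#include <assert.h>
#include <stdint.h>

/* Hypothetical helpers mirroring the assembler's hi16/lo16/ha16 operators. */
static uint16_t hi16(uint32_t x) { return (uint16_t)(x >> 16); }
static uint16_t lo16(uint32_t x) { return (uint16_t)(x & 0xffff); }
static uint16_t ha16(uint32_t x) { return (uint16_t)((x + 0x8000) >> 16); }

int main(void)
{
	uint32_t addr = 0x1234ABCDu;		/* bit 15 of the low half is set */
	assert(hi16(addr) == 0x1234 && ha16(addr) == 0x1235);	/* high half bumped */
	/* addis/addi sequence: (ha16 << 16) plus sign-extended lo16 rebuilds addr */
	assert(((uint32_t)ha16(addr) << 16) + (int32_t)(int16_t)lo16(addr) == addr);
	return 0;
}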
*/ - -/* - * ISO C Standard: 7.15 Variable arguments <stdarg.h> +/*===---- stdarg.h - Variable argument handling ----------------------------=== + * + * Copyright (c) 2008 Eli Friedman + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== */ -#ifndef _STDARG_H -#ifndef _ANSI_STDARG_H_ -#ifndef __need___va_list -#define _STDARG_H -#define _ANSI_STDARG_H_ -#endif /* not __need___va_list */ -#undef __need___va_list - -/* Define __gnuc_va_list. */ +#ifndef __STDARG_H +#define __STDARG_H -#ifndef __GNUC_VA_LIST -#define __GNUC_VA_LIST -typedef __builtin_va_list __gnuc_va_list; -#endif +typedef __builtin_va_list va_list; +#define va_start(ap, param) __builtin_va_start(ap, param) +#define va_end(ap) __builtin_va_end(ap) +#define va_arg(ap, type) __builtin_va_arg(ap, type) -/* Define the standard macros for the user, - if this invocation was from the user program. */ -#ifdef _STDARG_H - -#define va_start(v,l) __builtin_va_start(v,l) -#define va_end(v) __builtin_va_end(v) -#define va_arg(v,l) __builtin_va_arg(v,l) -#if !defined(__STRICT_ANSI__) || __STDC_VERSION__ + 0 >= 199900L -#define va_copy(d,s) __builtin_va_copy(d,s) -#endif -#define __va_copy(d,s) __builtin_va_copy(d,s) - -/* Define va_list, if desired, from __gnuc_va_list. */ -/* We deliberately do not define va_list when called from - stdio.h, because ANSI C says that stdio.h is not supposed to define - va_list. stdio.h needs to have access to that data type, - but must not use that name. It should use the name __gnuc_va_list, - which is safe because it is reserved for the implementation. */ - -#ifdef _HIDDEN_VA_LIST /* On OSF1, this means varargs.h is "half-loaded". */ -#undef _VA_LIST -#endif - -#ifdef _BSD_VA_LIST -#undef _BSD_VA_LIST -#endif +/* GCC always defines __va_copy, but does not define va_copy unless in c99 mode + * or -ansi is not specified, since it was not part of C90. + */ +#define __va_copy(d,s) __builtin_va_copy(d,s) -#if defined(__svr4__) || (defined(_SCO_DS) && !defined(__VA_LIST)) -/* SVR4.2 uses _VA_LIST for an internal alias for va_list, - so we must avoid testing it and setting it here. - SVR4 uses _VA_LIST as a flag in stdarg.h, but we should - have no conflict with that. 
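The replacement stdarg.h above maps each standard macro directly onto a compiler builtin. A minimal usage sketch, relying only on what the header itself declares:

#include <stdarg.h>
#include <stdio.h>

/* Sum `count` ints, demonstrating va_start/va_copy/va_arg/va_end. */
static int sum(int count, ...)
{
	va_list ap, ap2;
	int total = 0;

	va_start(ap, count);
	va_copy(ap2, ap);	/* C99; strict C90 builds get only __va_copy */
	while (count-- > 0)
		total += va_arg(ap, int);
	va_end(ap2);
	va_end(ap);
	return total;
}

int main(void)
{
	printf("%d\n", sum(3, 1, 2, 3));	/* prints: 6 */
	return 0;
}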
*/ -#ifndef _VA_LIST_ -#define _VA_LIST_ -#ifdef __i860__ -#ifndef _VA_LIST -#define _VA_LIST va_list -#endif -#endif /* __i860__ */ -typedef __gnuc_va_list va_list; -#ifdef _SCO_DS -#define __VA_LIST +#if __STDC_VERSION__ >= 199900L || !defined(__STRICT_ANSI__) +#define va_copy(dest, src) __builtin_va_copy(dest, src) #endif -#endif /* _VA_LIST_ */ -#else /* not __svr4__ || _SCO_DS */ -/* The macro _VA_LIST_ is the same thing used by this file in Ultrix. - But on BSD NET2 we must not test or define or undef it. - (Note that the comments in NET 2's ansi.h - are incorrect for _VA_LIST_--see stdio.h!) */ -#if !defined (_VA_LIST_) || defined (__BSD_NET2__) || defined (____386BSD____) || defined (__bsdi__) || defined (__sequent__) || defined (__FreeBSD__) || defined(WINNT) -/* The macro _VA_LIST_DEFINED is used in Windows NT 3.5 */ -#ifndef _VA_LIST_DEFINED -/* The macro _VA_LIST is used in SCO Unix 3.2. */ -#ifndef _VA_LIST -/* The macro _VA_LIST_T_H is used in the Bull dpx2 */ -#ifndef _VA_LIST_T_H -/* The macro __va_list__ is used by BeOS. */ -#ifndef __va_list__ -typedef __gnuc_va_list va_list; -#endif /* not __va_list__ */ -#endif /* not _VA_LIST_T_H */ -#endif /* not _VA_LIST */ -#endif /* not _VA_LIST_DEFINED */ -#if !(defined (__BSD_NET2__) || defined (____386BSD____) || defined (__bsdi__) || defined (__sequent__) || defined (__FreeBSD__)) -#define _VA_LIST_ -#endif -#ifndef _VA_LIST -#define _VA_LIST -#endif -#ifndef _VA_LIST_DEFINED -#define _VA_LIST_DEFINED -#endif -#ifndef _VA_LIST_T_H -#define _VA_LIST_T_H -#endif -#ifndef __va_list__ -#define __va_list__ -#endif - -#endif /* not _VA_LIST_, except on certain systems */ - -#endif /* not __svr4__ */ - -#endif /* _STDARG_H */ +/* Hack required to make standard headers work, at least on Ubuntu */ +#define __GNUC_VA_LIST 1 +typedef __builtin_va_list __gnuc_va_list; -#endif /* not _ANSI_STDARG_H_ */ -#endif /* not _STDARG_H */ +#endif /* __STDARG_H */ diff --git a/Makefile b/Makefile index 57c8a4c88..acd493419 100644 --- a/Makefile +++ b/Makefile @@ -32,40 +32,24 @@ ALL_SUBDIRS = \ libsa \ security -CONFIG_SUBDIRS_PPC = config - CONFIG_SUBDIRS_I386 = config - CONFIG_SUBDIRS_X86_64 = config - CONFIG_SUBDIRS_ARM = config INSTINC_SUBDIRS = $(ALL_SUBDIRS) EXTERNAL_HEADERS - -INSTINC_SUBDIRS_PPC = $(INSTINC_SUBDIRS) EXTERNAL_HEADERS - -INSTINC_SUBDIRS_I386 = $(INSTINC_SUBDIRS) EXTERNAL_HEADERS - -INSTINC_SUBDIRS_X86_64 = $(INSTINC_SUBDIRS) EXTERNAL_HEADERS - -INSTINC_SUBDIRS_ARM = $(INSTINC_SUBDIRS) EXTERNAL_HEADERS +INSTINC_SUBDIRS_I386 = $(INSTINC_SUBDIRS) +INSTINC_SUBDIRS_X86_64 = $(INSTINC_SUBDIRS) +INSTINC_SUBDIRS_ARM = $(INSTINC_SUBDIRS) EXPINC_SUBDIRS = $(ALL_SUBDIRS) - -EXPINC_SUBDIRS_PPC = $(EXPINC_SUBDIRS) - EXPINC_SUBDIRS_I386 = $(EXPINC_SUBDIRS) - EXPINC_SUBDIRS_X86_64 = $(EXPINC_SUBDIRS) - EXPINC_SUBDIRS_ARM = $(EXPINC_SUBDIRS) -COMP_SUBDIRS_PPC = $(ALL_SUBDIRS) +SETUP_SUBDIRS = SETUP COMP_SUBDIRS_I386 = $(ALL_SUBDIRS) - COMP_SUBDIRS_X86_64 = $(ALL_SUBDIRS) - COMP_SUBDIRS_ARM = $(ALL_SUBDIRS) INST_SUBDIRS = \ @@ -77,14 +61,18 @@ INST_SUBDIRS = \ config \ security -INSTALL_FILE_LIST= \ - mach_kernel +INSTALL_KERNEL_FILE = mach_kernel + +INSTALL_KERNEL_DIR = / -INSTALL_FILE_DIR= \ - / INSTMAN_SUBDIRS = \ bsd include $(MakeInc_rule) include $(MakeInc_dir) + +# This target is defined to compile and run xnu_quick_test under testbots +testbots: + /usr/bin/make MORECFLAGS="-D RUN_UNDER_TESTBOTS=1" testbots -C ./tools/tests/xnu_quick_test/ + diff --git a/README b/README index 2040c2cee..b9e102527 100644 --- a/README +++ b/README @@ 
-15,32 +15,17 @@ A. How to build XNU:
 
   By default, architecture defaults to the build machine architecture, and the kernel
   configuration is set to build for DEVELOPMENT.
-  The machine configuration defaults to S5L8900X for arm and default for i386 and ppc.
 
   This will also create a bootable image, mach_kernel, and a kernel binary
   with symbols, mach_kernel.sys.
-
-  Examples:
-    /* make a debug kernel for H1 arm board */
-    make TARGET_CONFIGS="debug arm s5l8900x" SDKROOT=/path/to/SDK
-
-    $(OBJROOT)/DEBUG_ARM_S5L8900X/osfmk/DEBUG/osfmk.o: pre-linked object for osfmk component
-    $(OBJROOT)/DEBUG_ARM_S5L8900X/mach_kernel: bootable image
-
-    /* make debug and development kernels for H1 arm board */
-    make TARGET_CONFIGS="debug arm s5l8900x development arm s5l8900x" SDKROOT=/path/to/SDK
-
-    $(OBJROOT)/DEBUG_ARM_S5L8900X/osfmk/DEBUG/osfmk.o: pre-linked object for osfmk component
-    $(OBJROOT)/DEBUG_ARM_S5L8900X/mach_kernel: bootable image
-    $(OBJROOT)/DEVELOPMENT_ARM_S5L8900X/osfmk/DEVELOPMENT/osfmk.o: pre-linked object for osfmk component
-    $(OBJROOT)/DEVELOPMENT_ARM_S5L8900X/mach_kernel: bootable image
-
-    /* this is all you need to do to build H1 arm with DEVELOPMENT kernel configuration */
-    make TARGET_CONFIGS="default arm default" SDKROOT=/path/to/SDK
+    /* this is all you need to do to build with RELEASE kernel configuration */
+    make TARGET_CONFIGS="release x86_64 default" SDKROOT=/path/to/SDK
 
   or the following is equivalent (omitted SDKROOT will use /)
 
-  make ARCH_CONFIGS=ARM
+  make ARCH_CONFIGS=X86_64
 
 2) Building a Component
 
@@ -64,7 +49,7 @@ A. How to build XNU:
      and KERNEL_CONFIGS).
 
   Example:
-    $(OBJROOT)/RELEASE_PPC/osfmk/RELEASE/osfmk.o: pre-linked object for osfmk component
+    $(OBJROOT)/RELEASE_X86_64/osfmk/RELEASE/osfmk.filelist: list of objects in osfmk component
 
   From the component top directory:
 
@@ -81,36 +66,36 @@ A. How to build XNU:
 
   Define kernel configuration to DEBUG in your environment or when running a
   make command.  Then, apply procedures 4, 5
 
-  $ make TARGET_CONFIGS="DEBUG PPC DEFAULT" all
+  $ make TARGET_CONFIGS="DEBUG X86_64 DEFAULT" all
 
   or
 
-  $ make KERNEL_CONFIGS=DEBUG all
+  $ make KERNEL_CONFIGS=DEBUG ARCH_CONFIGS=X86_64 all
 
   or
 
-  $ export TARGET_CONFIGS="DEBUG ARM MX31ADS"
+  $ export TARGET_CONFIGS="DEBUG X86_64 DEFAULT"
   $ export SDKROOT=/path/to/SDK
   $ make all
 
   Example:
-    $(OBJROOT)/DEBUG_PPC/osfmk/DEBUG/osfmk.o: pre-linked object for osfmk component
-    $(OBJROOT)/DEBUG_PPC/mach_kernel: bootable image
+    $(OBJROOT)/DEBUG_X86_64/osfmk/DEBUG/osfmk.filelist: list of objects in osfmk component
+    $(OBJROOT)/DEBUG_X86_64/mach_kernel: bootable image
 
 4) Building fat
 
   Define architectures in your environment or when running a make command.
   Apply procedures 3, 4, 5
 
-  $ make TARGET_CONFIGS="RELEASE PPC default RELEASE I386 default" exporthdrs all
+  $ make TARGET_CONFIGS="RELEASE I386 DEFAULT RELEASE X86_64 DEFAULT" exporthdrs all
 
   or
 
-  $ make ARCH_CONFIGS="PPC I386" exporthdrs all
+  $ make ARCH_CONFIGS="I386 X86_64" exporthdrs all
 
   or
 
-  $ export ARCH_CONFIGS="PPC I386"
+  $ export ARCH_CONFIGS="I386 X86_64"
   $ make exporthdrs all
 
 5) Verbose make
@@ -127,16 +112,28 @@ A. How to build XNU:
 
   From the top directory, run:
 
-  $ ~rc/bin/buildit . -arch ppc -arch i386 -noinstallsrc -nosum
-
-  or for multiple arm builds
-
-  $ ~rc/bin/buildit . -noinstallsrc -nosum -- TARGET_CONFIGS="release arm MX31ADS release arm LN2410SBC"
-
-  or for default arm build (kernel config DEVELOPMENT and machine config MX31ADS)
-
-  $ ~rc/bin/buildit .
-arch arm -noinstallsrc -nosum -- TARGET_CONFIGS="release arm MX31ADS release arm LN2410SBC" + $ ~rc/bin/buildit . -arch i386 -arch x86_64 -arch armv7 -arch ppc -noinstallsrc -nosum + + xnu supports a number of XBS build aliases, which allow B&I to build + the same source submission multiple times in different ways, to + produce different results. Each build alias supports the standard + "clean", "install", "installsrc", "installhdrs" targets, but + conditionalize their behavior on the RC_ProjectName make variable + which is passed as the -project argument to ~rc/bin/buildit, which + can be one of: + + -project xnu # the default, builds /mach_kernel, kernel-space + # headers, user-space headers, man pages, + # symbol-set kexts + + -project xnu_debug # a DEBUG kernel in /AppleInternal with dSYM + + -project libkxld # user-space version of kernel linker + + -project Libsyscall # automatically generate BSD syscall stubs + + 8) Creating tags and cscope @@ -157,6 +154,8 @@ A. How to build XNU: $ make -w # trace recursive make invocations. Useful in combination with VERBOSE=YES + $ make BUILD_LTO=1 # built with LLVM Link Time Optimization (experimental) + ============================================= B. How to install a new header file from XNU diff --git a/osfmk/profiling/ppc/Makefile b/SETUP/Makefile similarity index 66% rename from osfmk/profiling/ppc/Makefile rename to SETUP/Makefile index ebea6420f..7a0e5c5b4 100644 --- a/osfmk/profiling/ppc/Makefile +++ b/SETUP/Makefile @@ -7,19 +7,10 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) include $(MakeInc_def) - -DATAFILES = \ - profile-md.h - -INSTALL_MD_LIST = ${DATAFILES} - -INSTALL_MD_DIR = profile/ppc - -EXPORT_MD_LIST = ${DATAFILES} - -EXPORT_MD_DIR = profile/ppc +SETUP_SUBDIRS = \ + config \ + kextsymboltool \ + setsegname include $(MakeInc_rule) include $(MakeInc_dir) - - diff --git a/SETUP/config/Makefile b/SETUP/config/Makefile new file mode 100644 index 000000000..8889afef3 --- /dev/null +++ b/SETUP/config/Makefile @@ -0,0 +1,42 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + + +include $(MakeInc_cmd) +include $(MakeInc_def) + +OBJS = externs.o main.o mkglue.o mkheaders.o mkioconf.o mkmakefile.o \ + mkswapconf.o openp.o searchp.o lexer.yy.o parser.o + +CFLAGS = -isysroot $(HOST_SDKROOT) -g -O0 -I$(SOURCE) -I. + +WARNFLAGS = -Wall + +LDFLAGS = -isysroot $(HOST_SDKROOT) + +config: $(OBJS) + $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^ + @echo HOST_LD $@ + $(_v)$(HOST_CODESIGN) -s - $@ + @echo HOST_CODESIGN $@ + +.c.o: + $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $< + @echo HOST_CC $@ + +parser.c: parser.y + $(_v)$(HOST_BISON) -y -d -d -o $@ $< + @echo HOST_BISON $@ + +lexer.yy.c: lexer.l + $(_v)$(HOST_FLEX) --header-file=lexer.yy.h -o $@ $< + @echo HOST_FLEX $@ + +main.o mkglue.o mkheaders.o mkioconf.o mkmakefile.o lexer.yy.c: parser.c + +do_build_setup: config + +include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/SETUP/config/config.h b/SETUP/config/config.h new file mode 100644 index 000000000..54219e1db --- /dev/null +++ b/SETUP/config/config.h @@ -0,0 +1,293 @@ +/* + * Copyright (c) 1999-2009 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights + * Reserved. 
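The config Makefile above generates parser.c with bison and lexer.yy.c with flex, then links them into the host config tool. For orientation, a minimal sketch of how such a generated pair is conventionally driven; yyin and yyparse are the standard flex/bison entry points, and this fragment would link only against the generated sources:

#include <stdio.h>

/* Declarations normally provided by the generated lexer.yy.c/parser.c. */
extern FILE *yyin;	/* flex input stream; defaults to stdin */
int yyparse(void);	/* bison parser entry point; returns 0 on success */

int main(int argc, char *argv[])
{
	if (argc > 1 && (yyin = fopen(argv[1], "r")) == NULL) {
		perror(argv[1]);
		return 1;
	}
	return yyparse() ? 1 : 0;	/* parse stdin or the named file */
}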
This file contains Original Code and/or Modifications of + * Original Code as defined in and that are subject to the Apple Public + * Source License Version 1.0 (the 'License'). You may not use this file + * except in compliance with the License. Please obtain a copy of the + * License at http://www.apple.com/publicsource and read it before using + * this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License." + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* + * Mach Operating System + * Copyright (c) 1990 Carnegie-Mellon University + * Copyright (c) 1989 Carnegie-Mellon University + * Copyright (c) 1988 Carnegie-Mellon University + * Copyright (c) 1987 Carnegie-Mellon University + * All rights reserved. The CMU software License Agreement specifies + * the terms and conditions for use and redistribution. + */ + +/* + * Copyright (c) 1980 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms are permitted + * provided that the above copyright notice and this paragraph are + * duplicated in all such forms and that any documentation, + * advertising materials, and other materials related to such + * distribution and use acknowledge that the software was developed + * by the University of California, Berkeley. The name of the + * University may not be used to endorse or promote products derived + * from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * @(#)config.h 5.8 (Berkeley) 6/18/88 + */ + +/* + * Config. + */ + +#include <stdio.h> +#include <sys/types.h> +#include <sys/param.h> +#include <stdlib.h> +#include <string.h> + +struct file_list { + struct file_list *f_next; + char *f_fn; /* the name */ + u_char f_type; /* see below */ + u_char f_flags; /* see below */ + short f_special; /* requires special make rule */ + char *f_needs; + char *f_extra; /* stuff to add to make line */ + /* + * Random values: + * swap space parameters for swap areas + * root device, etc. for system specifications + */ + union { + struct { /* when swap specification */ + dev_t fuw_swapdev; + int fuw_swapsize; + } fuw; + struct { /* when system specification */ + dev_t fus_rootdev; + dev_t fus_argdev; + dev_t fus_dumpdev; + } fus; + } fun; +#define f_swapdev fun.fuw.fuw_swapdev +#define f_swapsize fun.fuw.fuw_swapsize +#define f_rootdev fun.fus.fus_rootdev +#define f_argdev fun.fus.fus_argdev +#define f_dumpdev fun.fus.fus_dumpdev +}; + +/* + * Types. + */ +#define DRIVER 1 +#define NORMAL 2 +#define INVISIBLE 3 +#define PROFILING 4 +#define SYSTEMSPEC 5 +#define SWAPSPEC 6 + +/* + * Attributes (flags). + */ +#define CONFIGDEP 0x01 /* obsolete? */ +#define OPTIONSDEF 0x02 /* options definition entry */ +#define ORDERED 0x04 /* don't list in OBJ's, keep "files" order */ +#define SEDIT 0x08 /* run sed filter (SQT) */ + +/* + * Maximum number of fields for variable device fields (SQT). 
+ */
+#define	NFIELDS	10
+
+struct idlst {
+	char	*id;
+	struct	idlst *id_next;
+	int	id_vec;		/* Sun interrupt vector number */
+};
+
+struct device {
+	int	d_type;			/* CONTROLLER, DEVICE, bus adaptor */
+	struct	device *d_conn;		/* what it is connected to */
+	const char	*d_name;	/* name of device (e.g. rk11) */
+	struct	idlst *d_vec;		/* interrupt vectors */
+	int	d_pri;			/* interrupt priority */
+	int	d_addr;			/* address of csr */
+	int	d_unit;			/* unit number */
+	int	d_drive;		/* drive number */
+	int	d_slave;		/* slave number */
+#define QUES	-1	/* -1 means '?' */
+#define	UNKNOWN -2	/* -2 means not set yet */
+	int	d_dk;			/* if init 1 set to number for iostat */
+	int	d_flags;		/* flags for device init */
+	struct	device *d_next;		/* Next one in list */
+	u_short	d_mach;			/* Sun - machine type (0 = all) */
+	u_short	d_bus;			/* Sun - bus type (0 = unknown) */
+	u_long	d_fields[NFIELDS];	/* fields values (SQT) */
+	int	d_bin;			/* interrupt bin (SQT) */
+	int	d_addrmod;		/* address modifier (MIPS) */
+	char	*d_init;		/* pseudo device init routine name */
+};
+#define TO_NEXUS	(struct device *)-1
+#define TO_SLOT		(struct device *)-1
+
+struct config {
+	char	*c_dev;
+	char	*s_sysname;
+};
+
+/*
+ * Config has a global notion of which machine type is
+ * being used.  It uses the name of the machine in choosing
+ * files and directories.  Thus if the name of the machine is ``vax'',
+ * it will build from ``Makefile.vax'' and use ``../vax/inline''
+ * in the makerules, etc.
+ */
+extern int	machine;
+extern const char	*machinename;
+#define	MACHINE_VAX	1
+#define	MACHINE_SUN	2
+#define	MACHINE_ROMP	3
+#define	MACHINE_SUN2	4
+#define	MACHINE_SUN3	5
+#define	MACHINE_MMAX	6
+#define	MACHINE_SQT	7
+#define MACHINE_SUN4	8
+#define	MACHINE_I386	9
+#define	MACHINE_IX	10
+#define MACHINE_MIPSY	11
+#define MACHINE_MIPS	12
+#define	MACHINE_I860	13
+#define	MACHINE_M68K	14
+#define	MACHINE_M88K	15
+#define	MACHINE_M98K	16
+#define MACHINE_HPPA	17
+#define MACHINE_SPARC	18
+#define MACHINE_PPC	19
+#define MACHINE_ARM	20
+#define MACHINE_X86_64	21
+
+/*
+ * For each machine, a set of CPU's may be specified as supported.
+ * These and the options (below) are put in the C flags in the makefile.
+ */
+struct cputype {
+	char	*cpu_name;
+	struct	cputype *cpu_next;
+};
+
+extern struct cputype *cputype;
+
+/*
+ * In order to configure and build outside the kernel source tree,
+ * we may wish to specify where the source tree lives.
+ */
+extern const char *source_directory;
+extern const char *object_directory;
+extern char *config_directory;
+
+FILE *fopenp(const char *fpath, char *file, char *complete, const char *ftype);
+const char *get_VPATH(void);
+#define VPATH	get_VPATH()
+
+/*
+ * A set of options may also be specified which are like CPU types,
+ * but which may also specify values for the options.
+ * A separate set of options may be defined for make-style options.
+ */ +struct opt { + char *op_name; + char *op_value; + struct opt *op_next; +}; + +extern struct opt *opt, *mkopt, *opt_tail, *mkopt_tail; + +extern char *ident; +const char *get_word(FILE *fp); +char *ns(const char *str); +char *qu(int num); +char *path(const char *file); + +extern int do_trace; + +#if MACHINE_VAX +extern int seen_mba, seen_uba; +#endif + +extern int seen_vme, seen_mbii; + +extern struct device *dtab; +dev_t nametodev(char *name, int defunit, char defpartition); +char *devtoname(dev_t dev); + +extern char errbuf[80]; +extern int yyline; + +extern struct file_list *ftab, *conf_list, **confp; +extern char *build_directory; + +extern int profiling; + +extern int maxusers; + +#define eq(a,b) (!strcmp(a,b)) + +#ifdef mips +#define DEV_MASK 0xf +#define DEV_SHIFT 4 +#else mips +#define DEV_MASK 0x7 +#define DEV_SHIFT 3 +#endif mips + +/* External function references */ +char *get_rest(FILE *fp); + +int yyparse(void); +void yyerror(const char *s); + +void vax_ioconf(void); +void sun_ioconf(void); +void romp_ioconf(void); +void mmax_ioconf(void); +void sqt_ioconf(void); +void i386_ioconf(void); +void mips_ioconf(void); +void m68k_ioconf(void); +void m88k_ioconf(void); +void m98k_ioconf(void); +void hppa_ioconf(void); +void sparc_ioconf(void); +void ppc_ioconf(void); +void arm_ioconf(void); +void x86_64_ioconf(void); + +void swapconf(void); + +void ubglue(void); +void mbglue(void); + +void makefile(void); +void headers(void); +int opteq(const char *cp, const char *dp); + +void init_dev(struct device *dp); +void newdev(struct device *dp); +void dev_param(struct device *dp, const char *str, long num); + +int searchp(const char *spath, char *file, char *fullname, int (*func)(char *)); diff --git a/bsd/conf/tools/doconf/doconf.csh b/SETUP/config/doconf similarity index 94% rename from bsd/conf/tools/doconf/doconf.csh rename to SETUP/config/doconf index 6fedb4786..2d4e952e9 100755 --- a/bsd/conf/tools/doconf/doconf.csh +++ b/SETUP/config/doconf @@ -69,17 +69,14 @@ set prog=$0 set prog=$prog:t set nonomatch set OBJDIR=../BUILD -if ("`/usr/bin/uname`" == "Rhapsody" ) then -set CONFIG_DIR=/usr/local/bin -else -set CONFIG_DIR=/usr/bin -endif +set CONFIG_DIR=$OBJROOT/SETUP/config unset domake unset doconfig unset beverbose unset MACHINE unset profile +unset SOC_CONFIG while ($#argv >= 1) if ("$argv[1]" =~ -*) then @@ -100,6 +97,14 @@ while ($#argv >= 1) set MACHINE="$argv[2]" shift breaksw + case "-soc": + if ($#argv < 2) then + echo "${prog}: missing argument to ${argv[1]}" + exit 1 + endif + set SOC_CONFIG="$argv[2]" + shift + breaksw case "-d": if ($#argv < 2) then echo "${prog}: missing argument to ${argv[1]}" @@ -168,11 +173,15 @@ set FEATURES_H=(cs_*.h mach_*.h net_*.h\ set MASTER_DIR=../conf set MASTER = ${MASTER_DIR}/MASTER set MASTER_CPU=${MASTER}.${cpu} +set MASTER_CPU_PER_SOC=${MASTER}.${cpu}.${SOC_CONFIG} +if (-f $MASTER_CPU_PER_SOC) set MASTER_CPU = ${MASTER_CPU_PER_SOC} set MASTER_LOCAL = ${MASTER}.local set MASTER_CPU_LOCAL = ${MASTER_CPU}.local +set MASTER_CPU_PER_SOC_LOCAL = ${MASTER_CPU_PER_SOC}.local if (! -f $MASTER_LOCAL) set MASTER_LOCAL = "" if (! -f $MASTER_CPU_LOCAL) set MASTER_CPU_LOCAL = "" +if (-f $MASTER_CPU_PER_SOC_LOCAL) set MASTER_CPU_LOCAL = ${MASTER_CPU_PER_SOC_LOCAL} if (! -d $OBJDIR) then if ($?beverbose) then diff --git a/SETUP/config/externs.c b/SETUP/config/externs.c new file mode 100644 index 000000000..d1bdd8942 --- /dev/null +++ b/SETUP/config/externs.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. 
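config.h above keeps option entries as singly linked struct opt lists and notes that they are put into the C flags of the generated makefile. A hypothetical sketch of such a walk (emit_option_cflags is not part of config; the struct is re-declared locally so the fragment stands alone):

#include <stdio.h>

/* Re-declared locally so the sketch is self-contained. */
struct opt {
	char		*op_name;
	char		*op_value;
	struct opt	*op_next;
};

/* Emit one -D flag per option, the way `options FOO=1` entries from a
 * MASTER file end up in the generated Makefile's C flags. */
static void emit_option_cflags(const struct opt *op, FILE *out)
{
	for (; op != NULL; op = op->op_next) {
		if (op->op_value != NULL)
			fprintf(out, " -D%s=%s", op->op_name, op->op_value);
		else
			fprintf(out, " -D%s", op->op_name);
	}
}

int main(void)
{
	struct opt hz = { "HZ", "100", NULL };
	struct opt dbg = { "MACH_DEBUG", NULL, &hz };
	emit_option_cflags(&dbg, stdout);	/* prints: " -DMACH_DEBUG -DHZ=100" */
	putchar('\n');
	return 0;
}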
+ * + * @APPLE_LICENSE_HEADER_START@ + * + * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights + * Reserved. This file contains Original Code and/or Modifications of + * Original Code as defined in and that are subject to the Apple Public + * Source License Version 1.0 (the 'License'). You may not use this file + * except in compliance with the License. Please obtain a copy of the + * License at http://www.apple.com/publicsource and read it before using + * this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License." + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* Copyright (c) Apple Computer, Inc. All rights reserved. */ + +#include <config.h> + + +/* + * Config has a global notion of which machine type is + * being used. It uses the name of the machine in choosing + * files and directories. Thus if the name of the machine is ``vax'', + * it will build from ``Makefile.vax'' and use ``../vax/inline'' + * in the makerules, etc. + */ +int machine; +const char *machinename; + +/* + * For each machine, a set of CPU's may be specified as supported. + * These and the options (below) are put in the C flags in the makefile. + */ + +struct cputype *cputype; + +/* + * In order to configure and build outside the kernel source tree, + * we may wish to specify where the source tree lives. + */ +const char *source_directory; +const char *object_directory; +char *config_directory; + +/* + * A set of options may also be specified which are like CPU types, + * but which may also specify values for the options. + * A separate set of options may be defined for make-style options. + */ +struct opt *opt, *mkopt, *opt_tail, *mkopt_tail; + +char *ident; + +int do_trace; + +#if MACHINE_VAX +int seen_mba, seen_uba; +#endif + +int seen_vme, seen_mbii; + +struct device *dtab; + +char errbuf[80]; +int yyline; + +struct file_list *ftab, *conf_list, **confp; +char *build_directory; + +int profiling = 0; + +int maxusers; + diff --git a/SETUP/config/lexer.l b/SETUP/config/lexer.l new file mode 100644 index 000000000..c5502b4ba --- /dev/null +++ b/SETUP/config/lexer.l @@ -0,0 +1,214 @@ +%{ +/* + * Mach Operating System + * Copyright (c) 1990 Carnegie-Mellon University + * Copyright (c) 1989 Carnegie-Mellon University + * Copyright (c) 1988 Carnegie-Mellon University + * Copyright (c) 1987 Carnegie-Mellon University + * All rights reserved. The CMU software License Agreement specifies + * the terms and conditions for use and redistribution. + */ + +/* + * Copyright (c) 1980 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms are permitted + * provided that the above copyright notice and this paragraph are + * duplicated in all such forms and that any documentation, + * advertising materials, and other materials related to such + * distribution and use acknowledge that the software was developed + * by the University of California, Berkeley. The name of the + * University may not be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + * @(#)config.l 5.5 (Berkeley) 6/18/88 + */ + +#include <ctype.h> +#include "parser.h" +#include "config.h" + +int kw_lookup(char *word); +int octal(char *str); +int hex(char *str); +int yylex(void); + +#define tprintf if (do_trace) printf + +/* + * Key word table + */ + +struct kt { + const char *kt_name; + int kt_val; +} key_words[] = { + { "and", AND }, + { "args", ARGS }, + { "at", AT }, + { "builddir", BUILDDIR }, + { "config", CONFIG }, + { "configdir", CONFIGDIR }, + { "controller", CONTROLLER }, + { "cpu", CPU }, + { "csr", CSR }, + { "device", DEVICE }, + { "disk", DISK }, + { "drive", DRIVE }, + { "dumps", DUMPS }, + { "flags", FLAGS }, + { "hz", HZ }, + { "ident", IDENT }, + { "init", INIT }, + { "machine", MACHINE }, + { "major", MAJOR }, + { "makeoptions", MAKEOPTIONS }, + { "makevariables", MAKEOPTIONS }, + { "master", MASTER }, + { "maxusers", MAXUSERS }, + { "mba", MBA }, + { "minor", MINOR }, + { "nexus", NEXUS }, + { "objectdir", OBJECTDIR }, + { "on", ON }, + { "options", OPTIONS }, + { "priority", PRIORITY }, + { "profile", PROFILE }, + { "pseudo-device",PSEUDO_DEVICE }, + { "root", ROOT }, + { "size", SIZE }, + { "slave", SLAVE }, + { "sourcedir", SOURCEDIR }, + { "swap", SWAP }, + { "tape", DEVICE }, + { "trace", TRACE }, + { "uba", UBA }, + { "vector", VECTOR }, + { "lun", LUN }, /* MMAX only */ + { "slot", SLOT }, /* MMAX only */ + { "tape", TAPE }, /* MMAX only */ + { "bin", BIN }, /* SQT ONLY */ + { "am", ADDRMOD }, /* MIPS */ + { "mbii", MBII }, /* MIPS */ + { "vme", VME }, /* MIPS */ + { 0, 0 }, +}; +%} + +%option nounput + +WORD ([A-Za-z_][-A-Za-z_]*|[A-Z][-A-Za-z_0-9]*) +WORD1 ([A-Za-z_][-A-Za-z_0-9]*) +%% +{WORD} | +{WORD1} { + int i; + + if ((i = kw_lookup(yytext)) == -1) + { + yylval.str = yytext; + tprintf("id(%s) ", yytext); + return ID; + } + tprintf("(%s) ", yytext); + return i; + } +\"[^"]+\" { + yytext[strlen(yytext)-1] = '\0'; + yylval.str = yytext + 1; + return ID; + } +0[0-7]* { + yylval.val = octal(yytext); + tprintf("#O:%o ", yylval.val); + return NUMBER; + } +0x[0-9a-fA-F]+ { + yylval.val = hex(yytext); + tprintf("#X:%x ", yylval.val); + return NUMBER; + } +[1-9][0-9]* { + yylval.val = atoi(yytext); + tprintf("#D:%d ", yylval.val); + return NUMBER; + } +[0-9]"."[0-9]* { + yylval.val = (int) (60 * atof(yytext) + 0.5); + return FPNUMBER; + } +"-" { + return MINUS; + } +"?" { + yylval.val = -1; + tprintf("? "); + return NUMBER; + } +\n/[ \t] { + yyline++; + tprintf("\n... "); + } +\n { + yyline++; + tprintf("\n"); + return SEMICOLON; + } +#.* { /* Ignored (comment) */; } +[ \t]* { /* Ignored (white space) */; } +";" { return SEMICOLON; } +"," { return COMMA; } +"=" { return EQUALS; } +"@" { return AT; } +. { return yytext[0]; } + + +%% +/* + * kw_lookup + * Look up a string in the keyword table. 
Returns a -1 if the + * string is not a keyword otherwise it returns the keyword number + */ + +int +kw_lookup(char *word) +{ + register struct kt *kp; + + for (kp = key_words; kp->kt_name != 0; kp++) + if (eq(word, kp->kt_name)) + return kp->kt_val; + return -1; +} + +/* + * Number conversion routines + */ + +int +octal(char *str) +{ + int num; + + (void) sscanf(str, "%o", &num); + return num; +} + +int +hex(char *str) +{ + int num; + + (void) sscanf(str+2, "%x", &num); + return num; +} + +int +yywrap() +{ + return 1; +} diff --git a/SETUP/config/main.c b/SETUP/config/main.c new file mode 100644 index 000000000..024b17be8 --- /dev/null +++ b/SETUP/config/main.c @@ -0,0 +1,296 @@ +/* + * Copyright (c) 1999-2009 Apple Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights + * Reserved. This file contains Original Code and/or Modifications of + * Original Code as defined in and that are subject to the Apple Public + * Source License Version 1.0 (the 'License'). You may not use this file + * except in compliance with the License. Please obtain a copy of the + * License at http://www.apple.com/publicsource and read it before using + * this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License." + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* + * Mach Operating System + * Copyright (c) 1990 Carnegie-Mellon University + * Copyright (c) 1989 Carnegie-Mellon University + * Copyright (c) 1988 Carnegie-Mellon University + * Copyright (c) 1987 Carnegie-Mellon University + * All rights reserved. The CMU software License Agreement specifies + * the terms and conditions for use and redistribution. + */ + +/* + * Copyright (c) 1980 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms are permitted + * provided that the above copyright notice and this paragraph are + * duplicated in all such forms and that any documentation, + * advertising materials, and other materials related to such + * distribution and use acknowledge that the software was developed + * by the University of California, Berkeley. The name of the + * University may not be used to endorse or promote products derived + * from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + */ + +#ifndef lint +char copyright[] = +"@(#) Copyright (c) 1980 Regents of the University of California.\n\ + All rights reserved.\n"; +#endif /* not lint */ + +#ifndef lint +static char sccsid[] __attribute__((used)) = "@(#)main.c 5.9 (Berkeley) 6/18/88"; +#endif /* not lint */ + +#include <stdio.h> +#include <ctype.h> +#include "parser.h" +#include "config.h" + +/* + * Config builds a set of files for building a UNIX + * system given a description of the desired system. 
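+ *
+ * (Editor's note -- an illustrative invocation only, with made-up
+ * paths:
+ *
+ *	config -c ./conf -d .. -o .. MASTER.local
+ *
+ * Here -b, -c, -d and -o name the build, config, source and object
+ * directories, -p enables profiling, and the trailing argument is the
+ * system description file, which is reopened as stdin for the parser.)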
+ */ +int +main(int argc, char *argv[]) +{ + + source_directory = ".."; /* default */ + object_directory = ".."; + config_directory = (char *) 0; + while ((argc > 1) && (argv[1][0] == '-')) { + char *c; + + argv++; argc--; + for (c = &argv[0][1]; *c ; c++) { + switch (*c) { + case 'b': + build_directory = argv[1]; + goto check_arg; + + case 'd': + source_directory = argv[1]; + goto check_arg; + + case 'o': + object_directory = argv[1]; + goto check_arg; + + case 'c': + config_directory = argv[1]; + + check_arg: + if (argv[1] == (char *) 0) + goto usage_error; + argv++; argc--; + break; + + case 'p': + profiling++; + break; + default: + goto usage_error; + } + } + } + if (config_directory == (char *) 0) { + config_directory = + malloc((unsigned) strlen(source_directory) + 6); + (void) sprintf(config_directory, "%s/conf", source_directory); + } + if (argc != 2) { + usage_error: ; + fprintf(stderr, "usage: config [ -bcdo dir ] [ -p ] sysname\n"); + exit(1); + } + if (!build_directory) + build_directory = argv[1]; + if (freopen(argv[1], "r", stdin) == NULL) { + perror(argv[1]); + exit(2); + } + dtab = NULL; + confp = &conf_list; + opt = 0; + if (yyparse()) + exit(3); + switch (machine) { + + case MACHINE_VAX: + vax_ioconf(); /* Print ioconf.c */ + ubglue(); /* Create ubglue.s */ + break; + + case MACHINE_SUN: + sun_ioconf(); + break; + + case MACHINE_SUN2: + case MACHINE_SUN3: + case MACHINE_SUN4: + sun_ioconf(); /* Print ioconf.c */ + mbglue(); /* Create mbglue.s */ + break; + + case MACHINE_ROMP: + romp_ioconf(); + break; + + case MACHINE_MMAX: + mmax_ioconf(); + break; + + case MACHINE_SQT: + sqt_ioconf(); + break; + + case MACHINE_I386: + case MACHINE_IX: + i386_ioconf(); + break; + + case MACHINE_MIPSY: + case MACHINE_MIPS: + mips_ioconf(); + break; + + case MACHINE_I860: + /* i860_ioconf(); */ + break; + + case MACHINE_M68K: + m68k_ioconf(); + break; + + case MACHINE_M88K: + m88k_ioconf(); + break; + + case MACHINE_M98K: + m98k_ioconf(); + break; + + case MACHINE_HPPA: + hppa_ioconf(); + break; + + case MACHINE_SPARC: + sparc_ioconf(); + break; + + case MACHINE_PPC: + ppc_ioconf(); + break; + + case MACHINE_ARM: + arm_ioconf(); + break; + + case MACHINE_X86_64: + x86_64_ioconf(); + break; + + default: + printf("Specify machine type, e.g. 
``machine vax''\n"); + exit(1); + } + + makefile(); /* build Makefile */ + headers(); /* make a lot of .h files */ + swapconf(); /* swap config files */ + + return 0; +} + +/* + * get_word + * returns EOF on end of file + * NULL on end of line + * pointer to the word otherwise + */ +const char * +get_word(FILE *fp) +{ + static char line[80]; + register int ch; + register char *cp; + + while ((ch = getc(fp)) != EOF) + if (ch != ' ' && ch != '\t') + break; + if (ch == EOF) + return ((char *)EOF); + if (ch == '\n') + return (NULL); + if (ch == '|') + return( "|"); + cp = line; + *cp++ = ch; + while ((ch = getc(fp)) != EOF) { + if (isspace(ch)) + break; + *cp++ = ch; + } + *cp = 0; + if (ch == EOF) + return ((char *)EOF); + (void) ungetc(ch, fp); + return (line); +} + +/* + * get_rest + * returns EOF on end of file + * NULL on end of line + * pointer to the word otherwise + */ +char * +get_rest(FILE *fp) +{ + static char line[80]; + register int ch; + register char *cp; + + cp = line; + while ((ch = getc(fp)) != EOF) { + if (ch == '\n') + break; + *cp++ = ch; + } + *cp = 0; + if (ch == EOF) + return ((char *)EOF); + return (line); +} + +/* + * prepend the path to a filename + */ +char * +path(const char *file) +{ + register char *cp; + + cp = malloc((unsigned)(strlen(build_directory)+ + strlen(file)+ + strlen(object_directory)+ + 3)); + (void) sprintf(cp, "%s/%s/%s", object_directory, build_directory, file); + return (cp); +} diff --git a/SETUP/config/mkglue.c b/SETUP/config/mkglue.c new file mode 100644 index 000000000..9d4b5ac6f --- /dev/null +++ b/SETUP/config/mkglue.c @@ -0,0 +1,331 @@ +/* + * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights + * Reserved. This file contains Original Code and/or Modifications of + * Original Code as defined in and that are subject to the Apple Public + * Source License Version 1.0 (the 'License'). You may not use this file + * except in compliance with the License. Please obtain a copy of the + * License at http://www.apple.com/publicsource and read it before using + * this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License." + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* + * Mach Operating System + * Copyright (c) 1990 Carnegie-Mellon University + * Copyright (c) 1989 Carnegie-Mellon University + * Copyright (c) 1988 Carnegie-Mellon University + * Copyright (c) 1987 Carnegie-Mellon University + * All rights reserved. The CMU software License Agreement specifies + * the terms and conditions for use and redistribution. + */ + +/* + * Copyright (c) 1980 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms are permitted + * provided that the above copyright notice and this paragraph are + * duplicated in all such forms and that any documentation, + * advertising materials, and other materials related to such + * distribution and use acknowledge that the software was developed + * by the University of California, Berkeley. 
The name of the + * University may not be used to endorse or promote products derived + * from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef lint +static char sccsid[] __attribute__((used)) = "@(#)mkglue.c 5.6 (Berkeley) 6/18/88"; +#endif /* not lint */ + +/* + * Make the bus adaptor interrupt glue files. + */ +#include <stdio.h> +#include <string.h> +#include "config.h" +#include "parser.h" +#include <ctype.h> + +void dump_mb_handler(FILE *fp, struct idlst *vec, int number); +void dump_ubavec(FILE *fp, char *vector, int number); +void dump_std(FILE *fp, FILE *gp); +void dump_intname(FILE *fp, char *vector, int number); +void dump_ctrs(FILE *fp); +void glue(FILE *fp, void (*dump_handler)(FILE *, struct idlst *, int)); + +/* + * Create the UNIBUS interrupt vector glue file. + */ +void +ubglue(void) +{ + register FILE *fp, *gp; + register struct device *dp, *mp; + + fp = fopen(path("ubglue.s"), "w"); + if (fp == 0) { + perror(path("ubglue.s")); + exit(1); + } + gp = fopen(path("ubvec.s"), "w"); + if (gp == 0) { + perror(path("ubvec.s")); + exit(1); + } + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (mp != 0 && mp != (struct device *)-1 && + !eq(mp->d_name, "mba")) { + struct idlst *id, *id2; + + for (id = dp->d_vec; id; id = id->id_next) { + for (id2 = dp->d_vec; id2; id2 = id2->id_next) { + if (id2 == id) { + dump_ubavec(fp, id->id, + dp->d_unit); + break; + } + if (!strcmp(id->id, id2->id)) + break; + } + } + } + } + dump_std(fp, gp); + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (mp != 0 && mp != (struct device *)-1 && + !eq(mp->d_name, "mba")) { + struct idlst *id, *id2; + + for (id = dp->d_vec; id; id = id->id_next) { + for (id2 = dp->d_vec; id2; id2 = id2->id_next) { + if (id2 == id) { + dump_intname(fp, id->id, + dp->d_unit); + break; + } + if (!strcmp(id->id, id2->id)) + break; + } + } + } + } + dump_ctrs(fp); + (void) fclose(fp); + (void) fclose(gp); +} + +static int cntcnt = 0; /* number of interrupt counters allocated */ + +/* + * Print a UNIBUS interrupt vector. + */ +void +dump_ubavec(FILE *fp, char *vector, int number) +{ + char nbuf[80]; + register char *v = nbuf; + + switch (machine) { + + case MACHINE_VAX: + (void) sprintf(v, "%s%d", vector, number); + fprintf(fp, "\t.globl\t_X%s\n\t.align\t2\n_X%s:\n", + v, v); + fprintf(fp,"\tTIM_PUSHR(0)\n"); + fprintf(fp, "\tincl\t_fltintrcnt+(4*%d)\n", cntcnt++); + if (strncmp(vector, "dzx", 3) == 0) + fprintf(fp, "\tmovl\t$%d,r0\n\tjmp\tdzdma\n\n", number); + else { + if (strncmp(vector, "uur", 3) == 0) { + fprintf(fp, "#ifdef UUDMA\n"); + fprintf(fp, "\tmovl\t$%d,r0\n\tjsb\tuudma\n", + number); + fprintf(fp, "#endif\n"); + } + fprintf(fp, "\tpushl\t$%d\n", number); + fprintf(fp, "\tcalls\t$1,_%s\n",vector); + fprintf(fp, "\tCOUNT(V_INTR)\n"); + fprintf(fp, "\tTSREI_POPR\n"); + } + break; + + case MACHINE_MIPSY: + case MACHINE_MIPS: + /* + * Actually, we should never get here! + * Main does not even call ubglue. 
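+ *
+ * (Editor's illustration of the VAX branch above, device name made
+ * up: a vector `xxintr' on unit 0 begins its glue roughly as
+ *
+ *	.globl	_Xxxintr0
+ *	.align	2
+ *	_Xxxintr0:
+ *		TIM_PUSHR(0)
+ *		incl	_fltintrcnt+(4*n)
+ *
+ * where n is the running interrupt-counter index cntcnt.)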
+ */ + if (strncmp(vector, "dzx", 3) == 0) + fprintf(fp, "\tDZINTR(%s,%d)\n", vector, number); + else + fprintf(fp, "\tDEVINTR(%s,%d)\n", vector, number); + break; + } + +} + +static const char *vaxinames[] = { + "clock", "cnr", "cnx", "tur", "tux", + "mba0", "mba1", "mba2", "mba3", + "uba0", "uba1", "uba2", "uba3" +}; +static struct stdintrs { + const char **si_names; /* list of standard interrupt names */ + int si_n; /* number of such names */ +} stdintrs[] = { + { vaxinames, sizeof (vaxinames) / sizeof (vaxinames[0]) }, +}; +/* + * Start the interrupt name table with the names + * of the standard vectors not directly associated + * with a bus. Also, dump the defines needed to + * reference the associated counters into a separate + * file which is prepended to locore.s. + */ +void +dump_std(FILE *fp, FILE *gp) +{ + register struct stdintrs *si = &stdintrs[machine-1]; + register const char **cpp; + register int i; + + fprintf(fp, "\n\t.globl\t_intrnames\n"); + fprintf(fp, "\n\t.globl\t_eintrnames\n"); + fprintf(fp, "\t.data\n"); + fprintf(fp, "_intrnames:\n"); + cpp = si->si_names; + for (i = 0; i < si->si_n; i++) { + const char *cp; + char *tp; + char buf[80]; + + cp = *cpp; + if (cp[0] == 'i' && cp[1] == 'n' && cp[2] == 't') { + cp += 3; + if (*cp == 'r') + cp++; + } + for (tp = buf; *cp; cp++) + if (islower(*cp)) + *tp++ = toupper(*cp); + else + *tp++ = *cp; + *tp = '\0'; + fprintf(gp, "#define\tI_%s\t%lu\n", buf, i*sizeof (long)); + fprintf(fp, "\t.asciz\t\"%s\"\n", *cpp); + cpp++; + } +} + +void +dump_intname(FILE *fp, char *vector, int number) +{ + register char *cp = vector; + + fprintf(fp, "\t.asciz\t\""); + /* + * Skip any "int" or "intr" in the name. + */ + while (*cp) + if (cp[0] == 'i' && cp[1] == 'n' && cp[2] == 't') { + cp += 3; + if (*cp == 'r') + cp++; + } else { + putc(*cp, fp); + cp++; + } + fprintf(fp, "%d\"\n", number); +} + +/* + * Reserve space for the interrupt counters. 
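+ *
+ * (Editor's sketch of the emitted layout, counts symbolic:
+ *
+ *	_intrcnt:	.space	4 * <standard vector names>
+ *	_fltintrcnt:	.space	4 * <allocated device vectors>
+ *	_eintrcnt:
+ *
+ * matching the _intrnames string table written by dump_std() above.)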
+ */ +void +dump_ctrs(FILE *fp) +{ + struct stdintrs *si = &stdintrs[machine-1]; + + fprintf(fp, "_eintrnames:\n"); + fprintf(fp, "\n\t.globl\t_intrcnt\n"); + fprintf(fp, "\n\t.globl\t_eintrcnt\n"); + fprintf(fp, "\t.align 2\n"); + fprintf(fp, "_intrcnt:\n"); + fprintf(fp, "\t.space\t4 * %d\n", si->si_n); + fprintf(fp, "_fltintrcnt:\n"); + fprintf(fp, "\t.space\t4 * %d\n", cntcnt); + fprintf(fp, "_eintrcnt:\n\n"); + fprintf(fp, "\t.text\n"); +} + +/* + * Routines for making Sun mb interrupt file mbglue.s + */ + +/* + * print an interrupt handler for mainbus + */ +void +dump_mb_handler(FILE *fp, struct idlst *vec, int number) +{ + fprintf(fp, "\tVECINTR(_X%s%d, _%s, _V%s%d)\n", + vec->id, number, vec->id, vec->id, number); +} + +void +mbglue(void) +{ + register FILE *fp; + const char *name = "mbglue.s"; + + fp = fopen(path(name), "w"); + if (fp == 0) { + perror(path(name)); + exit(1); + } + fprintf(fp, "#include <machine/asm_linkage.h>\n\n"); + glue(fp, dump_mb_handler); + (void) fclose(fp); +} + +void +glue(FILE *fp, void (*dump_handler)(FILE *, struct idlst *, int)) +{ + register struct device *dp, *mp; + + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (mp != 0 && mp != (struct device *)-1 && + !eq(mp->d_name, "mba")) { + struct idlst *vd, *vd2; + + for (vd = dp->d_vec; vd; vd = vd->id_next) { + for (vd2 = dp->d_vec; vd2; vd2 = vd2->id_next) { + if (vd2 == vd) { + (void)(*dump_handler) + (fp, vd, dp->d_unit); + break; + } + if (!strcmp(vd->id, vd2->id)) + break; + } + } + } + } +} diff --git a/SETUP/config/mkheaders.c b/SETUP/config/mkheaders.c new file mode 100644 index 000000000..a0e3fdc38 --- /dev/null +++ b/SETUP/config/mkheaders.c @@ -0,0 +1,276 @@ +/* + * Copyright (c) 1999-2006 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights + * Reserved. This file contains Original Code and/or Modifications of + * Original Code as defined in and that are subject to the Apple Public + * Source License Version 1.0 (the 'License'). You may not use this file + * except in compliance with the License. Please obtain a copy of the + * License at http://www.apple.com/publicsource and read it before using + * this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License." + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* + * Mach Operating System + * Copyright (c) 1990 Carnegie-Mellon University + * Copyright (c) 1989 Carnegie-Mellon University + * Copyright (c) 1988 Carnegie-Mellon University + * Copyright (c) 1987 Carnegie-Mellon University + * All rights reserved. The CMU software License Agreement specifies + * the terms and conditions for use and redistribution. + */ + +/* + * Copyright (c) 1980 Regents of the University of California. + * All rights reserved. 
+ *
+ * Redistribution and use in source and binary forms are permitted
+ * provided that the above copyright notice and this paragraph are
+ * duplicated in all such forms and that any documentation,
+ * advertising materials, and other materials related to such
+ * distribution and use acknowledge that the software was developed
+ * by the University of California, Berkeley. The name of the
+ * University may not be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
+ * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#ifndef lint
+static char sccsid[] __attribute__((used)) = "@(#)mkheaders.c 5.5 (Berkeley) 6/18/88";
+#endif /* not lint */
+
+/*
+ * Make all the .h files for the optional entries
+ */
+
+#include <stdio.h>
+#include <unistd.h>	/* unlink */
+#include <ctype.h>
+#include "config.h"
+#include "parser.h"
+
+static void do_count(const char *dev, const char *hname, int search);
+static void do_header(const char *dev, const char *hname, int count);
+static int file_needed(const char *name);
+static char *toheader(const char *dev);
+static char *tomacro(const char *dev);
+
+void
+headers(void)
+{
+	struct file_list *fl;
+
+	for (fl = ftab; fl != 0; fl = fl->f_next)
+		if (fl->f_needs != 0)
+			do_count(fl->f_needs, fl->f_needs, 1);
+}
+
+/*
+ * count all the devices of a certain type and recurse to count
+ * whatever the device is connected to
+ */
+void
+do_count(const char *dev, const char *hname, int search)
+{
+	struct device *dp, *mp;
+	int count;
+
+	for (count = 0, dp = dtab; dp != 0; dp = dp->d_next)
+		if (dp->d_unit != -1 && eq(dp->d_name, dev)) {
+			/*
+			 * Avoid making .h files for bus types on sun machines
+			 */
+			if ((machine == MACHINE_SUN2 ||
+			     machine == MACHINE_SUN3 ||
+			     machine == MACHINE_SUN4)
+			    && dp->d_conn == TO_NEXUS) {
+				return;
+			}
+			if (dp->d_type == PSEUDO_DEVICE) {
+				count =
+				    dp->d_slave != UNKNOWN ? dp->d_slave : 1;
+				if (dp->d_flags)
+					dev = NULL;
+				break;
+			}
+			if (machine != MACHINE_SUN2 && machine != MACHINE_SUN3
+			    && machine != MACHINE_SUN4)
+				/* avoid ie0,ie0,ie1 setting NIE to 3 */
+				count++;
+			/*
+			 * Allow holes in unit numbering,
+			 * assumption is unit numbering starts
+			 * at zero.
+			 */
+			if (dp->d_unit + 1 > count)
+				count = dp->d_unit + 1;
+			if (search) {
+				mp = dp->d_conn;
+				if (mp != 0 && mp != TO_NEXUS &&
+				    mp->d_conn != TO_NEXUS) {
+					/*
+					 * Check for the case where the
+					 * controller that the device
+					 * is attached to is in a separate
+					 * file (e.g. "sd" and "sc").
+					 * In this case, do NOT define
+					 * the number of controllers
+					 * in the hname .h file.
+					 */
+					if (!file_needed(mp->d_name))
+						do_count(mp->d_name, hname, 0);
+					search = 0;
+				}
+			}
+		}
+	do_header(dev, hname, count);
+}
+
+/*
+ * Scan the file list to see if name is needed to bring in a file.
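+ *
+ * (Editor's illustration of the machinery above, with vn as the
+ * example device: for a config line `pseudo-device vn 4', do_count()
+ * settles on a count of 4 -- the pseudo-device count is carried in
+ * d_slave -- and do_header() emits
+ *
+ *	#define NVN 4
+ *
+ * into vn.h, appending `#include <vn.h>' to meta_features.h the first
+ * time the header is created.)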
+ */ +static int +file_needed(const char *name) +{ + struct file_list *fl; + + for (fl = ftab; fl != 0; fl = fl->f_next) { + if (fl->f_needs && strcmp(fl->f_needs, name) == 0) + return (1); + } + return (0); +} + +static void +do_header(const char *dev, const char *hname, int count) +{ + char *file, *name; + const char *inw; + char *inwcopy; + struct file_list *fl = NULL; /* may exit for(;;) uninitted */ + struct file_list *fl_head, *fl_prev; + FILE *inf, *outf; + int inc, oldcount; + + file = toheader(hname); + name = tomacro(dev?dev:hname) + (dev == NULL); + inf = fopen(file, "r"); + oldcount = -1; + if (inf == 0) { + (void) unlink(file); + outf = fopen(file, "w"); + if (outf == 0) { + perror(file); + exit(1); + } + fprintf(outf, "#define %s %d\n", name, count); + (void) fclose(outf); + file = path("meta_features.h"); + outf = fopen(file, "a"); + if (outf == 0) { + perror(file); + exit(1); + } + fprintf(outf, "#include <%s.h>\n", hname); + (void) fclose(outf); + return; + } + fl_head = 0; + for (;;) { + const char *cp; + if ((inw = get_word(inf)) == 0 || inw == (char *)EOF) + break; + if ((inw = get_word(inf)) == 0 || inw == (char *)EOF) + break; + inwcopy = ns(inw); + cp = get_word(inf); + if (cp == 0 || cp == (char *)EOF) + break; + inc = atoi(cp); + if (eq(inwcopy, name)) { + oldcount = inc; + inc = count; + } + cp = get_word(inf); + if (cp == (char *)EOF) + break; + fl = (struct file_list *) malloc(sizeof *fl); + fl->f_fn = inwcopy; + fl->f_type = inc; + fl->f_next = fl_head; + fl_head = fl; + } + (void) fclose(inf); + if (count == oldcount) { + while (fl !=0) { + fl_prev = fl; + fl = fl->f_next; + free((char *)fl_prev); + } + return; + } + if (oldcount == -1) { + fl = (struct file_list *) malloc(sizeof *fl); + fl->f_fn = name; + fl->f_type = count; + fl->f_next = fl_head; + fl_head = fl; + } + unlink(file); + outf = fopen(file, "w"); + if (outf == 0) { + perror(file); + exit(1); + } + for (fl = fl_head; fl != 0; fl = fl->f_next) { + fprintf(outf, "#define %s %d\n", + fl->f_fn, count ? fl->f_type : 0); + free((char *)fl); + } + (void) fclose(outf); +} + +/* + * convert a dev name to a .h file name + */ +static char * +toheader(const char *dev) +{ + static char hbuf[MAXPATHLEN]; + (void) snprintf(hbuf, sizeof hbuf, "%s.h", path(dev)); + hbuf[MAXPATHLEN-1] = '\0'; + return (hbuf); +} + +/* + * convert a dev name to a macro name + */ +static char * +tomacro(const char *dev) +{ + static char mbuf[FILENAME_MAX]; + char *cp; + + cp = mbuf; + *cp++ = 'N'; + while (*dev) + if (!islower(*dev)) + *cp++ = *dev++; + else + *cp++ = toupper(*dev++); + *cp++ = 0; + return (mbuf); +} diff --git a/SETUP/config/mkioconf.c b/SETUP/config/mkioconf.c new file mode 100644 index 000000000..90b6c2f97 --- /dev/null +++ b/SETUP/config/mkioconf.c @@ -0,0 +1,2086 @@ +/* + * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights + * Reserved. This file contains Original Code and/or Modifications of + * Original Code as defined in and that are subject to the Apple Public + * Source License Version 1.0 (the 'License'). You may not use this file + * except in compliance with the License. Please obtain a copy of the + * License at http://www.apple.com/publicsource and read it before using + * this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License." + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* + * Mach Operating System + * Copyright (c) 1990 Carnegie-Mellon University + * Copyright (c) 1989 Carnegie-Mellon University + * Copyright (c) 1988 Carnegie-Mellon University + * Copyright (c) 1987 Carnegie-Mellon University + * All rights reserved. The CMU software License Agreement specifies + * the terms and conditions for use and redistribution. + */ + +/* + * Copyright (c) 1980 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms are permitted + * provided that the above copyright notice and this paragraph are + * duplicated in all such forms and that any documentation, + * advertising materials, and other materials related to such + * distribution and use acknowledge that the software was developed + * by the University of California, Berkeley. The name of the + * University may not be used to endorse or promote products derived + * from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include <stdio.h> +#include <unistd.h> /* for unlink */ +#include "parser.h" +#include "config.h" + +/* + * build the ioconf.c file + */ +char *intv(struct device *dev); +char *intv2(struct device *dev); +void i386_pseudo_inits(FILE *fp); /* XXX function in wrong block */ +void check_vector(struct idlst *vec); +void nrw_ioconf(void); +void m88k_pseudo_inits(FILE *fp); +void m98k_pseudo_inits(FILE *fp); +char *m88k_dn(char *name); +char *m98k_dn(char *name); +char *concat3(char *buf, const char *p1, const char *p2, const char *p3); + +#if MACHINE_VAX + +void +vax_ioconf(void) +{ + register struct device *dp, *mp, *np; + register int uba_n, slave; + FILE *fp; + + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } +/*MACH_KERNEL*/ + fprintf(fp, "#ifndef MACH_KERNEL\n"); +/*MACH_KERNEL*/ + fprintf(fp, "#include <machine/pte.h>\n"); + fprintf(fp, "#include <sys/param.h>\n"); + fprintf(fp, "#include <sys/buf.h>\n"); + fprintf(fp, "#include <sys/map.h>\n"); + fprintf(fp, "#include <sys/vm.h>\n"); +/*MACH_KERNEL*/ + fprintf(fp, "#endif MACH_KERNEL\n"); +/*MACH_KERNEL*/ + fprintf(fp, "\n"); + fprintf(fp, "#include <vaxmba/mbavar.h>\n"); + fprintf(fp, "#include <vaxuba/ubavar.h>\n\n"); + fprintf(fp, "\n"); + fprintf(fp, "#define C (caddr_t)\n\n"); + /* + * First print the mba initialization structures + */ + if (seen_mba) { + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (mp == 0 || mp == TO_NEXUS || + !eq(mp->d_name, "mba")) + continue; + fprintf(fp, "extern struct mba_driver %sdriver;\n", + dp->d_name); + } + fprintf(fp, "\nstruct mba_device mbdinit[] = {\n"); + fprintf(fp, "\t/* Device, Unit, Mba, Drive, Dk */\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (dp->d_unit == QUES || mp == 0 || + mp == TO_NEXUS || !eq(mp->d_name, "mba")) 
+ continue; + if (dp->d_addr) { + printf("can't specify csr address on mba for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_vec != 0) { + printf("can't specify vector for %s%d on mba\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive == UNKNOWN) { + printf("drive not specified for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_slave != UNKNOWN) { + printf("can't specify slave number for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + fprintf(fp, "\t{ &%sdriver, %d, %s,", + dp->d_name, dp->d_unit, qu(mp->d_unit)); + fprintf(fp, " %s, %d },\n", + qu(dp->d_drive), dp->d_dk); + } + fprintf(fp, "\t0\n};\n\n"); + /* + * Print the mbsinit structure + * Driver Controller Unit Slave + */ + fprintf(fp, "struct mba_slave mbsinit [] = {\n"); + fprintf(fp, "\t/* Driver, Ctlr, Unit, Slave */\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + /* + * All slaves are connected to something which + * is connected to the massbus. + */ + if ((mp = dp->d_conn) == 0 || mp == TO_NEXUS) + continue; + np = mp->d_conn; + if (np == 0 || np == TO_NEXUS || + !eq(np->d_name, "mba")) + continue; + fprintf(fp, "\t{ &%sdriver, %s", + mp->d_name, qu(mp->d_unit)); + fprintf(fp, ", %2d, %s },\n", + dp->d_unit, qu(dp->d_slave)); + } + fprintf(fp, "\t0\n};\n\n"); + } + /* + * Now generate interrupt vectors for the unibus + */ + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_vec != 0) { + struct idlst *ip; + mp = dp->d_conn; + if (mp == 0 || mp == TO_NEXUS || + !eq(mp->d_name, "uba")) + continue; + fprintf(fp, + "extern struct uba_driver %sdriver;\n", + dp->d_name); + fprintf(fp, "extern "); + ip = dp->d_vec; + for (;;) { + fprintf(fp, "X%s%d()", ip->id, dp->d_unit); + ip = ip->id_next; + if (ip == 0) + break; + fprintf(fp, ", "); + } + fprintf(fp, ";\n"); + fprintf(fp, "int\t (*%sint%d[])() = { ", dp->d_name, + dp->d_unit); + ip = dp->d_vec; + for (;;) { + fprintf(fp, "X%s%d", ip->id, dp->d_unit); + ip = ip->id_next; + if (ip == 0) + break; + fprintf(fp, ", "); + } + fprintf(fp, ", 0 } ;\n"); + } + } + fprintf(fp, "\nstruct uba_ctlr ubminit[] = {\n"); + fprintf(fp, "/*\t driver,\tctlr,\tubanum,\talive,\tintr,\taddr */\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (dp->d_type != CONTROLLER || mp == TO_NEXUS || mp == 0 || + !eq(mp->d_name, "uba")) + continue; + if (dp->d_vec == 0) { + printf("must specify vector for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addr == 0) { + printf("must specify csr address for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive != UNKNOWN || dp->d_slave != UNKNOWN) { + printf("drives need their own entries; dont "); + printf("specify drive or slave for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_flags) { + printf("controllers (e.g. 
%s%d) ", + dp->d_name, dp->d_unit); + printf("don't have flags, only devices do\n"); + continue; + } + fprintf(fp, + "\t{ &%sdriver,\t%d,\t%s,\t0,\t%sint%d, C 0%o },\n", + dp->d_name, dp->d_unit, qu(mp->d_unit), + dp->d_name, dp->d_unit, dp->d_addr); + } + fprintf(fp, "\t0\n};\n"); +/* unibus devices */ + fprintf(fp, "\nstruct uba_device ubdinit[] = {\n"); + fprintf(fp, +"\t/* driver, unit, ctlr, ubanum, slave, intr, addr, dk, flags*/\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (dp->d_unit == QUES || dp->d_type != DEVICE || mp == 0 || + mp == TO_NEXUS || mp->d_type == MASTER || + eq(mp->d_name, "mba")) + continue; + np = mp->d_conn; + if (np != 0 && np != TO_NEXUS && eq(np->d_name, "mba")) + continue; + np = 0; + if (eq(mp->d_name, "uba")) { + if (dp->d_vec == 0) { + printf("must specify vector for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addr == 0) { + printf("must specify csr for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive != UNKNOWN || dp->d_slave != UNKNOWN) { + printf("drives/slaves can be specified "); + printf("only for controllers, "); + printf("not for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + uba_n = mp->d_unit; + slave = QUES; + } else { + if ((np = mp->d_conn) == 0) { + printf("%s%d isn't connected to anything ", + mp->d_name, mp->d_unit); + printf(", so %s%d is unattached\n", + dp->d_name, dp->d_unit); + continue; + } + uba_n = np->d_unit; + if (dp->d_drive == UNKNOWN) { + printf("must specify ``drive number'' "); + printf("for %s%d\n", dp->d_name, dp->d_unit); + continue; + } + /* NOTE THAT ON THE UNIBUS ``drive'' IS STORED IN */ + /* ``SLAVE'' AND WE DON'T WANT A SLAVE SPECIFIED */ + if (dp->d_slave != UNKNOWN) { + printf("slave numbers should be given only "); + printf("for massbus tapes, not for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_vec != 0) { + printf("interrupt vectors should not be "); + printf("given for drive %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addr != 0) { + printf("csr addresses should be given only "); + printf("on controllers, not on %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + slave = dp->d_drive; + } + fprintf(fp, "\t{ &%sdriver, %2d, %s,", + eq(mp->d_name, "uba") ? dp->d_name : mp->d_name, dp->d_unit, + eq(mp->d_name, "uba") ? 
" -1" : qu(mp->d_unit)); + fprintf(fp, " %s, %2d, %s, C 0%-6o, %d, 0x%x },\n", + qu(uba_n), slave, intv(dp), dp->d_addr, dp->d_dk, + dp->d_flags); + } + fprintf(fp, "\t0\n};\n"); + (void) fclose(fp); +} +#endif + +#if MACHINE_SUN +#define SP_OBIO 0x0004 /* on board i/o (for sun/autoconf.h) */ + +#define VEC_LO 64 +#define VEC_HI 255 + +void pseudo_inits(FILE *fp); + +void +check_vector(struct idlst *vec) +{ + + if (vec->id_vec == 0) + fprintf(stderr, "vector number for %s not given\n", vec->id); + else if (vec->id_vec < VEC_LO || vec->id_vec > VEC_HI) + fprintf(stderr, + "vector number %d for %s is not between %d and %d\n", + vec->id_vec, vec->id, VEC_LO, VEC_HI); +} + +void +sun_ioconf(void) +{ + register struct device *dp, *mp; + register int slave; + register struct idlst *vp; + FILE *fp; + + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } +/*MACH_KERNEL*/ + fprintf(fp, "#ifndef MACH_KERNEL\n"); +/*MACH_KERNEL*/ + fprintf(fp, "#include <sys/param.h>\n"); + fprintf(fp, "#include <sys/buf.h>\n"); + fprintf(fp, "#include <sys/map.h>\n"); + fprintf(fp, "#include <sys/vm.h>\n"); +/*MACH_KERNEL*/ + fprintf(fp, "#endif MACH_KERNEL\n"); +/*MACH_KERNEL*/ + fprintf(fp, "\n"); + fprintf(fp, "#include <sundev/mbvar.h>\n"); + fprintf(fp, "\n"); + fprintf(fp, "#define C (caddr_t)\n\n"); + fprintf(fp, "\n"); + + /* + * Now generate interrupt vectors for the Mainbus + */ + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (mp == TO_NEXUS || mp == 0 || mp->d_conn != TO_NEXUS) + continue; + fprintf(fp, "extern struct mb_driver %sdriver;\n", + dp->d_name); + if (dp->d_vec != 0) { + if (dp->d_pri == 0) + fprintf(stderr, + "no priority specified for %s%d\n", + dp->d_name, dp->d_unit); + fprintf(fp, "extern "); + for (vp = dp->d_vec;;) { + if (machine == MACHINE_SUN4) + fprintf(fp, "%s()", vp->id); + else + fprintf(fp, "X%s%d()", + vp->id, dp->d_unit); + vp = vp->id_next; + if (vp == 0) + break; + fprintf(fp, ", "); + } + fprintf(fp, ";\n"); + + for (vp = dp->d_vec; vp; vp = vp->id_next) { + fprintf(fp, "int V%s%d = %d;\n", + vp->id, dp->d_unit, dp->d_unit); + } + + fprintf(fp, "struct vec %s[] = { ", intv(dp)); + for (vp = dp->d_vec; vp != 0; vp = vp->id_next) { + if (machine == MACHINE_SUN4) + fprintf(fp, "{ %s, %d, &V%s%d }, ", + vp->id, vp->id_vec, + vp->id, dp->d_unit); + else + fprintf(fp, "{ X%s%d, %d, &V%s%d }, ", + vp->id, dp->d_unit, vp->id_vec, + vp->id, dp->d_unit); + check_vector(vp); + } + fprintf(fp, "0 };\n"); + } + } + + /* + * Now spew forth the mb_ctlr structures + */ + fprintf(fp, "\nstruct mb_ctlr mbcinit[] = {\n"); + fprintf(fp, +"/* driver,\tctlr,\talive,\taddress,\tintpri,\t intr,\tspace */\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (dp->d_type != CONTROLLER || mp == TO_NEXUS || mp == 0 || + mp->d_conn != TO_NEXUS) + continue; + if (dp->d_addr == UNKNOWN) { + printf("must specify csr address for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive != UNKNOWN || dp->d_slave != UNKNOWN) { + printf("drives need their own entries; "); + printf("don't specify drive or slave for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_flags) { + printf("controllers (e.g. %s%d) don't have flags, ", + dp->d_name, dp->d_unit); + printf("only devices do\n"); + continue; + } + if (machine == MACHINE_SUN4) + fprintf(fp, + "{ &%sdriver,\t%d,\t0,\tC 0x%08x,\t%d,\t%s, 0x%x },\n", + dp->d_name, dp->d_unit, dp->d_addr, + (dp->d_bus==SP_OBIO) ? 
(dp->d_pri << 1) : (dp->d_pri<<1)-1, + intv(dp), ((dp->d_mach << 16) | dp->d_bus)); + else + fprintf(fp, + "{ &%sdriver,\t%d,\t0,\tC 0x%08x,\t%d,\t%s, 0x%x },\n", + dp->d_name, dp->d_unit, dp->d_addr, + dp->d_pri, intv(dp), ((dp->d_mach << 16) | dp->d_bus)); + } + fprintf(fp, "\t0\n};\n"); + + /* + * Now we go for the mb_device stuff + */ + fprintf(fp, "\nstruct mb_device mbdinit[] = {\n"); + fprintf(fp, +"/* driver,\tunit, ctlr, slave, address, pri, dk, flags, intr, space */\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (dp->d_unit == QUES || dp->d_type != DEVICE || mp == 0 || + mp == TO_NEXUS || mp->d_type == MASTER) + continue; + if (mp->d_conn == TO_NEXUS) { + if (dp->d_addr == UNKNOWN) { + printf("must specify csr for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive != UNKNOWN || dp->d_slave != UNKNOWN) { + printf("drives/slaves can be specified only "); + printf("for controllers, not for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + slave = QUES; + } else { + if (mp->d_conn == 0) { + printf("%s%d isn't connected to anything, ", + mp->d_name, mp->d_unit); + printf("so %s%d is unattached\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive == UNKNOWN) { + printf("must specify ``drive number'' for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + /* NOTE THAT ON THE UNIBUS ``drive'' IS STORED IN */ + /* ``SLAVE'' AND WE DON'T WANT A SLAVE SPECIFIED */ + if (dp->d_slave != UNKNOWN) { + printf("slave numbers should be given only "); + printf("for massbus tapes, not for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_pri != 0) { + printf("interrupt priority should not be "); + printf("given for drive %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addr != UNKNOWN) { + printf("csr addresses should be given only"); + printf(" on controllers, not on %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + slave = dp->d_drive; + } + if (machine == MACHINE_SUN4) + fprintf(fp, +"{ &%sdriver,\t%d, %s, %2d, C 0x%08x, %d, %d, 0x%x, %s, 0x%x },\n", + mp->d_conn == TO_NEXUS? dp->d_name : mp->d_name, dp->d_unit, + mp->d_conn == TO_NEXUS? " -1" : qu(mp->d_unit), + slave, + dp->d_addr == UNKNOWN? 0 : dp->d_addr, + dp->d_pri * 2, dp->d_dk, dp->d_flags, intv(dp), + ((dp->d_mach << 16) | dp->d_bus)); + else + fprintf(fp, +"{ &%sdriver,\t%d, %s, %2d, C 0x%08x, %d, %d, 0x%x, %s, 0x%x },\n", + mp->d_conn == TO_NEXUS? dp->d_name : mp->d_name, dp->d_unit, + mp->d_conn == TO_NEXUS? " -1" : qu(mp->d_unit), + slave, + dp->d_addr == UNKNOWN? 
0 : dp->d_addr, + dp->d_pri, dp->d_dk, dp->d_flags, intv(dp), + ((dp->d_mach << 16) | dp->d_bus)); + } + fprintf(fp, "\t0\n};\n"); + pseudo_inits(fp); + (void) fclose(fp); +} + +void +pseudo_inits(FILE *fp) +{ +#ifdef notdef + register struct device *dp; + int count; + + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + fprintf(fp, "extern int %s(int);\n", dp->d_init); + } +#endif notdef + fprintf(fp, "struct pseudo_init {\n"); + fprintf(fp, "\tint\tps_count;\n\tint\t(*ps_func)();\n"); + fprintf(fp, "} pseudo_inits[] = {\n"); +#ifdef notdef + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + count = dp->d_slave; + if (count <= 0) + count = 1; + fprintf(fp, "\t{%d,\t%s},\n", count, dp->d_init); + } +#endif notdef + fprintf(fp, "\t{0,\t0},\n};\n"); +} +#endif + +#if MACHINE_ROMP +void +romp_ioconf(void) +{ + register struct device *dp, *mp; + register int slave; + FILE *fp; + + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } +/*MACH_KERNEL*/ + fprintf(fp, "#ifndef MACH_KERNEL\n"); +/*MACH_KERNEL*/ + fprintf(fp, "#include <sys/param.h>\n"); + fprintf(fp, "#include <sys/buf.h>\n"); + fprintf(fp, "#include <sys/map.h>\n"); + fprintf(fp, "#include <sys/vm.h>\n"); +/*MACH_KERNEL*/ + fprintf(fp, "#endif MACH_KERNEL\n"); +/*MACH_KERNEL*/ + fprintf(fp, "\n"); + fprintf(fp, "#include <caio/ioccvar.h>\n"); + fprintf(fp, "\n"); + fprintf(fp, "#define C (caddr_t)\n\n"); + fprintf(fp, "\n"); + + fprintf (fp, "struct iocc_hd iocc_hd[] = {{C 0xF0000000,}};\n"); + /* + * Now generate interrupt vectors for the Winnerbus + */ + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_pri != 0) { + mp = dp->d_conn; + if (mp == 0 || mp == TO_NEXUS || + !eq(mp->d_name, "iocc")) + continue; + fprintf(fp, "extern struct iocc_driver %sdriver;\n", + dp->d_name); + } + } + /* + * Now spew forth the iocc_cinfo structure + */ + fprintf(fp, "\nstruct iocc_ctlr iocccinit[] = {\n"); + fprintf(fp, "/*\t driver,\tctlr,\talive,\taddr,\tintpri */\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (dp->d_type != CONTROLLER) + continue; + if (mp == TO_NEXUS || mp == 0 || !eq(mp->d_name, "iocc")) + continue; + if (dp->d_unit == QUES && eq(dp->d_name,"hdc")) + continue; + if (dp->d_unit == QUES && eq(dp->d_name,"fdc")) + continue; + if (dp->d_pri == 0) { + printf("must specify priority for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addr == 0) { + printf("must specify csr address for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive != UNKNOWN || dp->d_slave != UNKNOWN) { + printf("drives need their own entries; "); + printf("dont specify drive or slave for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_flags) { + printf("controllers (e.g. 
%s%d) don't have flags, ", + dp->d_name, dp->d_unit); + printf("only devices do\n"); + continue; + } + fprintf(fp, "\t{ &%sdriver,\t%d,\t0,\tC 0x%x,\t%d },\n", + dp->d_name, dp->d_unit, dp->d_addr, dp->d_pri); + } + fprintf(fp, "\t0\n};\n"); + /* + * Now we go for the iocc_device stuff + */ + fprintf(fp, "\nstruct iocc_device ioccdinit[] = {\n"); + fprintf(fp, +"\t/* driver, unit, ctlr, slave, addr, pri, dk, flags*/\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (dp->d_unit == QUES || dp->d_type != DEVICE || mp == 0 || + mp == TO_NEXUS || mp->d_type == MASTER || + eq(mp->d_name, "iocca")) + continue; + if (eq(mp->d_name, "iocc")) { + if (dp->d_pri == 0) { + printf("must specify vector for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addr == 0) { + printf("must specify csr for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive != UNKNOWN || dp->d_slave != UNKNOWN) { + printf("drives/slaves can be specified only "); + printf("for controllers, not for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + slave = QUES; + } else { + if (mp->d_conn == 0) { + printf("%s%d isn't connected to anything, ", + mp->d_name, mp->d_unit); + printf("so %s%d is unattached\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive == UNKNOWN) { + printf("must specify ``drive number'' for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + /* NOTE THAT ON THE UNIBUS ``drive'' IS STORED IN */ + /* ``SLAVE'' AND WE DON'T WANT A SLAVE SPECIFIED */ + if (dp->d_slave != UNKNOWN) { + printf("slave numbers should be given only "); + printf("for massbus tapes, not for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_pri != 0) { + printf("interrupt priority should not be "); + printf("given for drive %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addr != 0) { + printf("csr addresses should be given only"); + printf("on controllers, not on %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + slave = dp->d_drive; + } + fprintf(fp, +"\t{ &%sdriver, %2d, %s, %2d, C 0x%x, %d, %d, 0x%x },\n", + eq(mp->d_name, "iocc") ? dp->d_name : mp->d_name, dp->d_unit, + eq(mp->d_name, "iocc") ? " -1" : qu(mp->d_unit), + slave, dp->d_addr, dp->d_pri, dp->d_dk, dp->d_flags); + } + fprintf(fp, "\t0\n};\n"); + (void) fclose(fp); +} + +#endif MACHINE_ROMP + +#if MACHINE_MMAX +void +mmax_ioconf(void) +{ + register struct device *dp, *dp1, *mp; + FILE *fp; + int unit; + + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } + fprintf(fp, "#include <mmaxio/io.h>\n\n"); + + /* + * Multimax code is a little messy because we have to + * scan the entire list for each device to generate the + * structures correctly. We cheat and use the d->d_pri + * field to avoid doing anything twice. -1000 is an obvious + * bogus value for this field. + */ + + for (dp1 = dtab; dp1 != 0; dp1 = dp1->d_next) { + /* + * If pri is not -1000, then haven't seen device yet. + */ + if (dp1->d_pri != -1000) switch (dp1->d_type) { + + case CONTROLLER: + fprintf(fp,"struct devaddr %s_devaddr[] = {\n", + dp1->d_name); + /* + * Now scan entire list and get all of them. Use + * unit to make sure unit numbers are right. 
+ */ + unit = 0; + for (dp = dp1; dp != 0; dp = dp->d_next) { + if (!strcmp(dp->d_name, dp1->d_name)) { + mp = dp->d_conn; + if (mp != TO_SLOT) { + printf("%s%d: controller must be connected to slot.\n", + dp->d_name, dp->d_unit); + exit(1); + } + if (dp->d_vec != 0) { + printf("%s%d: cannot configure multimax interrupt vectors.\n", + dp->d_name, dp->d_unit); + } + if (dp->d_pri != 0) { + printf("%s%d: interrupt priority is nonsense on multimax.\n", + dp->d_name, dp->d_unit); + } + if ((dp->d_drive != UNKNOWN) || + (dp->d_slave !=UNKNOWN)) { + printf("%s%d: don't specify drive or slave for controller.\n", + dp->d_name, dp->d_unit); + } + /* + * Fix unit number if bogus + */ + if(dp->d_unit != unit) { + printf("Warning: %s%d configured as %s%d -- fix config file.\n", + dp->d_name,dp->d_unit,dp->d_name,unit); + dp->d_unit = unit; + } + unit++; + fprintf(fp,"\t{ %d, 0, 0},\n",dp->d_addr); + dp->d_pri = -1000; /* done this one */ + } + } + fprintf(fp,"} ;\n\n"); + break; + + case DEVICE: + fprintf(fp,"struct subdevaddr %s_subdevaddr[] = {\n", + dp1->d_name); + /* + * Now scan entire list and get all of them. Use + * unit to make sure unit numbers are right. + */ + unit = 0; + for (dp = dp1; dp != 0; dp = dp->d_next) { + if (!strcmp(dp->d_name, dp1->d_name)) { + mp = dp->d_conn; + if ( (mp == 0) || (mp == TO_SLOT) || + (mp->d_type != CONTROLLER)) { + printf("%s%d: device has no controller.\n", + dp->d_name, dp->d_unit); + exit(1); + } + if (dp->d_vec != 0) { + printf("%s%d: cannot configure multimax interrupt vectors.\n", + dp->d_name, dp->d_unit); + } + if (dp->d_pri != 0) { + printf("%s%d: interrupt priority is nonsense on multimax.\n", + dp->d_name, dp->d_unit); + } + if ((dp->d_drive != UNKNOWN) || + (dp->d_slave !=UNKNOWN)) { + printf("%s%d: use 'unit' instead of 'drive' or 'slave'.\n", + dp->d_name, dp->d_unit); + } + /* + * Fix unit number if bogus + */ + if(dp->d_unit != unit) { + printf("Warning: %s%d configured as %s%d -- fix config file.\n", + dp->d_name,dp->d_unit,dp->d_name,unit); + dp->d_unit = unit; + } + unit++; + if((dp->d_addr == 0) || (dp->d_addr == QUES)){ + printf("%s%d: must specify logical unit number.\n", + dp->d_name,dp->d_unit); + exit(1); + } + fprintf(fp,"\t{ %d, %d, 0},\n",mp->d_unit, + dp->d_addr); + dp->d_pri = -1000; /* don't do this again */ + } + } + fprintf(fp,"} ;\n\n"); + break; + + case PSEUDO_DEVICE: + /* + * Doesn't exist as far as ioconf.c is concerned. + */ + break; + + default: + printf("Bogus device type for %s\n", dp1->d_name); + exit(1); + break; + } + } + + (void) fclose(fp); +} + +#endif MACHINE_MMAX + +#if MACHINE_SQT + +/* + * Define prototype device spec lines. + * + * For now, have static set of controller prototypes. This should be + * upgraded to using (eg) controllers.balance (ala Sequent /etc/config) + * to support custom boards without need to edit this file. + */ + +/* + * flags for indicating presence of upper and lower bound values + */ + +#define P_LB 1 +#define P_UB 2 + +struct p_entry { + const char *p_name; /* name of field */ + long p_def; /* default value */ + long p_lb; /* lower bound for field */ + long p_ub; /* upper bound of field */ + char p_flags; /* bound valid flags */ +}; + +struct proto { + const char *p_name; /* name of controller type */ + struct p_entry p_fields[NFIELDS]; /* ordered list of fields */ + int p_seen; /* any seen? 
*/
+};
+
+/*
+ * MULTIBUS Adapter:
+ *	type mbad index csr flags maps[0,256] bin[0,7] intr[0,7]
+ */
+
+static struct proto mbad_proto = {
+	"mbad",
+	{{ "index",	0,	0,	0,	0 },
+	 { "csr",	0,	0,	0,	0 },
+	 { "flags",	0,	0,	0,	0 },
+	 { "maps",	0,	0,	256,	P_LB|P_UB },
+	 { "bin",	0,	0,	7,	P_LB|P_UB },
+	 { "intr",	0,	0,	7,	P_LB|P_UB },},
+	0
+};
+
+/*
+ * SCSI/Ether Controller:
+ *	type sec flags bin[0,7] req doneq index target[0,7]=-1 unit
+ */
+
+static struct proto sec_proto = {
+	"sec",
+	{{ "flags",	0,	0,	0,	0 },
+	 { "bin",	0,	0,	7,	P_LB|P_UB },
+	 { "req",	0,	0,	0,	0 },
+	 { "doneq",	0,	0,	0,	0 },
+	 { "index",	0,	0,	0,	0 },
+	 { "target",	-1,	0,	7,	P_LB|P_UB },
+	 { "unit",	0,	0,	0,	0 },},
+	0
+};
+
+/*
+ * "Zeke" (FAST) Disk Controller (Dual-Channel Disk Controller):
+ *	type zdc index[0,31] drive[-1,7] drive_type[-1,1]
+ *
+ * Legal values for drive_type:
+ *	M2333K	= 0	(swallow)
+ *	M2351A	= 1	(eagle)
+ *	wildcard = -1	(run-time determined)
+ */
+
+static struct proto zdc_proto = {
+	"zdc",
+	{{ "index",	0,	0,	31,	P_LB|P_UB },
+	 { "drive",	0,	-1,	7,	P_LB|P_UB },
+	 { "drive_type", 0,	-1,	1,	P_LB|P_UB },},
+	0
+};
+
+static struct proto *ptab[] = {
+	&mbad_proto,
+	&sec_proto,
+	&zdc_proto,
+	(struct proto *) 0
+};
+
+/*
+ * locate a prototype structure in the queue of such structures.
+ * return NULL if not found.
+ */
+
+static struct proto *
+find_proto(const char *str)
+{
+	register struct proto *ptp;
+	register int ptbx;
+
+	for (ptbx = 0; (ptp = ptab[ptbx]) != NULL; ptbx++) {
+		if (eq(str, ptp->p_name))
+			return(ptp);
+	}
+	return(NULL);
+}
+
+void
+dev_param(struct device *dp, const char *str, long num)
+{
+	register struct p_entry *entry;
+	register struct proto *ptp;
+
+	ptp = find_proto(dp->d_conn->d_name);
+	if (ptp == NULL) {
+		fprintf(stderr,"dev %s cont %s", dp->d_name, dp->d_conn->d_name);
+		yyerror("invalid controller");
+		return;
+	}
+
+	for (entry = ptp->p_fields; entry->p_name != NULL; entry++) {
+		if (eq(entry->p_name, str)) {
+			if ((entry->p_flags & P_LB) && (num < entry->p_lb)) {
+				yyerror("parameter below range");
+				return;
+			}
+			if ((entry->p_flags & P_UB) && (num > entry->p_ub)) {
+				yyerror("parameter above range");
+				return;
+			}
+			dp->d_fields[entry-ptp->p_fields] = num;
+			return;
+		}
+	}
+
+	yyerror("invalid parameter");
+}
+
+void
+sqt_ioconf(void)
+{
+	register struct device *dp, *mp;
+	register int count;
+	const char *namep;
+	register struct proto *ptp;
+	register struct p_entry *entry;
+	FILE *fp;
+	int bin_table[8];
+	int ptbx;
+	int found;
+
+	for (count = 0; count < 8; count++)
+		bin_table[count] = 0;
+	fp = fopen(path("ioconf.c"), "w");
+	if (fp == NULL) {
+		perror(path("ioconf.c"));
+		exit(1);
+	}
+/*MACH_KERNEL*/
+	fprintf(fp, "#ifndef MACH_KERNEL\n");
+/*MACH_KERNEL*/
+	fprintf(fp, "#include <sys/param.h>\n");
+	fprintf(fp, "#include <sys/systm.h>\n");
+/*MACH_KERNEL*/
+	fprintf(fp, "#endif MACH_KERNEL\n");
+/*MACH_KERNEL*/
+	fprintf(fp, "\n");
+	fprintf(fp, "#include <machine/ioconf.h>\n");
+
+	fprintf(fp, "\nu_long\tMBAd_IOwindow =\t\t3*256*1024;\t/* top 1/4 Meg */\n\n");
+
+	for (ptbx = 0; (ptp = ptab[ptbx]) != NULL; ptbx++) {
+
+		fprintf(fp, "/*\n");
+		fprintf(fp, " * %s device configuration.\n", ptp->p_name);
+		fprintf(fp, " */\n\n");
+		fprintf(fp, "\n");
+		fprintf(fp, "#include <sqt%s/ioconf.h>\n", ptp->p_name);
+		fprintf(fp, "\n");
+
+		/*
+		 * Generate dev structures for this controller
+		 */
+		for (dp = dtab, namep = NULL; dp != 0; dp = dp->d_next) {
+			mp = dp->d_conn;
+			if (mp == 0 || mp == TO_NEXUS ||
+			    !eq(mp->d_name, ptp->p_name) ||
+			    (namep
!= NULL && eq(dp->d_name, namep)) ) + continue; + fprintf(fp, "extern\tstruct\t%s_driver\t%s_driver;\n", + ptp->p_name, namep = dp->d_name); + ptp->p_seen = 1; + } + + found = 0; + for (dp = dtab, namep = NULL; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (mp == 0 || mp == TO_NEXUS || + !eq(mp->d_name, ptp->p_name)) + continue; + if (namep == NULL || !eq(namep, dp->d_name)) { + count = 0; + if (namep != NULL) + fprintf(fp, "};\n"); + found = 1; + fprintf(fp, "\nstruct\t%s_dev %s_%s[] = {\n", + ptp->p_name, + ptp->p_name, + namep = dp->d_name); + fprintf(fp, "/*"); + entry = ptp->p_fields; + for (; entry->p_name != NULL; entry++) + fprintf(fp, "\t%s",entry->p_name); + fprintf(fp, " */\n"); + } + if (dp->d_bin != UNKNOWN) + bin_table[dp->d_bin]++; + fprintf(fp, "{"); + for (entry = ptp->p_fields; entry->p_name != NULL; entry++) { + if (eq(entry->p_name,"index")) + fprintf(fp, "\t%d,", mp->d_unit); + else + fprintf(fp, "\t%lu,", + dp->d_fields[entry-ptp->p_fields]); + } + fprintf(fp, "\t},\t/* %s%d */\n", dp->d_name, count++); + } + if (found) + fprintf(fp, "};\n\n"); + + /* + * Generate conf array + */ + fprintf(fp, "/*\n"); + fprintf(fp, " * %s_conf array collects all %s devices\n", + ptp->p_name, ptp->p_name); + fprintf(fp, " */\n\n"); + fprintf(fp, "struct\t%s_conf %s_conf[] = {\n", + ptp->p_name, ptp->p_name); + fprintf(fp, "/*\tDriver\t\t#Entries\tDevices\t\t*/\n"); + for (dp = dtab, namep = NULL; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (mp == 0 || mp == TO_NEXUS || + !eq(mp->d_name, ptp->p_name)) + continue; + if (namep == NULL || !eq(namep, dp->d_name)) { + if (namep != NULL) + fprintf(fp, + "{\t&%s_driver,\t%d,\t\t%s_%s,\t},\t/* %s */\n", + namep, count, ptp->p_name, namep, namep); + count = 0; + namep = dp->d_name; + } + ++count; + } + if (namep != NULL) { + fprintf(fp, + "{\t&%s_driver,\t%d,\t\t%s_%s,\t},\t/* %s */\n", + namep, count, ptp->p_name, namep, namep); + } + fprintf(fp, "\t{ 0 },\n"); + fprintf(fp, "};\n\n"); + + } + + /* + * Pseudo's + */ + + fprintf(fp, "/*\n"); + fprintf(fp, " * Pseudo-device configuration\n"); + fprintf(fp, " */\n\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type == PSEUDO_DEVICE) { + fprintf(fp, "extern\tint\t%sboot();\n", dp->d_name); + } + } + fprintf(fp, "\nstruct\tpseudo_dev pseudo_dev[] = {\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type == PSEUDO_DEVICE) { + fprintf(fp, "\t{ \"%s\",\t%d,\t%sboot,\t},\n", + dp->d_name, + dp->d_slave == UNKNOWN ? 
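+/* Editor's note: d_slave carries the optional instance count from a
+   "pseudo-device <name> <count>" line; the ternary continuing below
+   defaults it to 32 when no count was given (UNKNOWN). */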
32 : dp->d_slave, + dp->d_name); + } + } + fprintf(fp, "\t{ 0 },\n"); + fprintf(fp, "};\n\n"); + + /* + * Bin interrupt table and misc + */ + + fprintf(fp, "/*\n"); + fprintf(fp, " * Interrupt table\n"); + fprintf(fp, " */\n\n"); + fprintf(fp, "int\tbin_intr[8] = {\n"); + fprintf(fp, "\t\t0,\t\t\t\t/* bin 0, always zero */\n"); + for (count=1; count < 8; count++) { + fprintf(fp, "\t\t%d,\t\t\t\t/* bin %d */\n", + bin_table[count], count); + } + fprintf(fp, "};\n"); + + /* + * b8k_cntlrs[] + */ + + fprintf(fp, "/*\n"); + fprintf(fp, " * b8k_cntlrs array collects all controller entries\n"); + fprintf(fp, " */\n\n"); + for (ptbx = 0; (ptp = ptab[ptbx]) != NULL; ptbx++) { + if (ptp->p_seen) + fprintf(fp, "extern int conf_%s(),\tprobe_%s_devices(),\t%s_map();\n", + ptp->p_name, ptp->p_name, ptp->p_name); + } + fprintf(fp, "\n\nstruct\tcntlrs b8k_cntlrs[] = {\n"); + fprintf(fp, "/*\tconf\t\tprobe_devs\t\tmap\t*/\n"); + + for (ptbx = 0; (ptp = ptab[ptbx]) != NULL; ptbx++) { + if (ptp->p_seen) + fprintf(fp, "{\tconf_%s,\tprobe_%s_devices,\t%s_map\t}, \n", + ptp->p_name, ptp->p_name, ptp->p_name); + } + fprintf(fp, "{\t0,\t},\n"); + fprintf(fp, "};\n"); + + (void) fclose(fp); +} + +#endif MACHINE_SQT +#if MACHINE_I386 +void +i386_ioconf(void) +{ + FILE *fp; + + unlink(path("ioconf.c")); + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } + fprintf(fp, "#include <dev/busvar.h>\n"); + fprintf(fp, "\n"); + fprintf(fp, "#define C (void *)\n"); + fprintf(fp, "\n"); + + i386_pseudo_inits (fp); + (void) fclose(fp); +} +#endif MACHINE_I386 + +#if MACHINE_MIPSY || MACHINE_MIPS + +void declare(const char *cp); +int is_declared(const char *cp); + +void +mips_ioconf(void) +{ + register struct device *dp, *mp, *np; + register int slave; + FILE *fp; + char buf1[64], buf2[64]; + + unlink(path("ioconf.c")); + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } +/*MACH_KERNEL*/ + fprintf(fp, "#ifndef MACH_KERNEL\n"); +/*MACH_KERNEL*/ + fprintf(fp, "#include <sys/param.h>\n"); + fprintf(fp, "#include <sys/buf.h>\n"); + fprintf(fp, "#include <sys/map.h>\n"); + fprintf(fp, "#include <sys/vm.h>\n"); +/*MACH_KERNEL*/ + fprintf(fp, "#endif MACH_KERNEL\n"); +/*MACH_KERNEL*/ + fprintf(fp, "\n"); + if (seen_mbii && seen_vme) { + printf("can't have both vme and mbii devices\n"); + exit(1); + } + if (seen_mbii) + fprintf(fp, "#include <mipsmbii/mbiivar.h>\n"); + if (seen_vme) + fprintf(fp, "#include <mipsvme/vmevar.h>\n"); + fprintf(fp, "\n"); + fprintf(fp, "#define C (caddr_t)\n"); + fprintf(fp, "#define NULL 0\n\n"); + if (!seen_mbii) + goto checkvme; + /* + * MBII stuff should go here + */ + +checkvme: + if (!seen_vme) + goto closefile; + /* + * Now generate interrupt vectors for the vme bus + */ + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_vec != 0) { + struct idlst *ip; + mp = dp->d_conn; + if (mp == 0 || mp == TO_NEXUS || !eq(mp->d_name, "vme")) + continue; + if (is_declared(dp->d_name)) + continue; + declare(dp->d_name); + fprintf(fp, "extern struct vme_driver %sdriver;\n", + dp->d_name); + fprintf(fp, "extern "); + ip = dp->d_vec; + for (;;) { + fprintf(fp, "%s()", ip->id); + ip = ip->id_next; + if (ip == 0) + break; + fprintf(fp, ", "); + } + fprintf(fp, ";\n"); + fprintf(fp, "int (*_%sint%d[])() = { ", dp->d_name, + dp->d_unit); + ip = dp->d_vec; + for (;;) { + fprintf(fp, "%s", ip->id); + ip = ip->id_next; + if (ip == 0) + break; + fprintf(fp, ", "); + } + fprintf(fp, ", 0 } ;\n\n"); + } + } + fprintf(fp, "\nstruct 
vme_ctlr vmminit[] = {\n"); + fprintf(fp, +" /* driver ctlr alive intr addr am */\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (dp->d_type != CONTROLLER || mp == TO_NEXUS || mp == 0 || + !eq(mp->d_name, "vme")) + continue; + if (dp->d_vec == 0) { + printf("must specify vector for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addr == 0) { + printf("must specify csr address for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addrmod == 0) { + printf("must specify address modifier for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive != UNKNOWN || dp->d_slave != UNKNOWN) { + printf("drives need their own entries; dont "); + printf("specify drive or slave for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_flags) { + printf("controllers (e.g. %s%d) ", + dp->d_name, dp->d_unit); + printf("don't have flags, only devices do\n"); + continue; + } + fprintf(fp, +" { %14s, %3d, 0, %11s, C 0x%08x, 0x%02x },\n", + concat3(buf1, "&", dp->d_name, "driver"), + dp->d_unit, + concat3(buf2, "_", dp->d_name, "int"), + dp->d_addr, + dp->d_addrmod); + } + fprintf(fp, " { NULL }\n};\n"); + /* + * vme devices + */ + fprintf(fp, "\nstruct vme_device vmdinit[] = {\n"); + fprintf(fp, +"/* driver unit ctlr slave intr addr am dk flags */\n" + ); + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (dp->d_unit == QUES || dp->d_type != DEVICE || mp == 0 || + mp == TO_NEXUS || mp->d_type == MASTER) + continue; + for (np = mp; np && np != TO_NEXUS; np = np->d_conn) + if (eq(np->d_name, "vme")) + break; + if (np != 0 && np != TO_NEXUS && !eq(np->d_name, "vme")) + continue; + np = 0; + if (eq(mp->d_name, "vme")) { + if (dp->d_vec == 0) { + printf("must specify vector for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addr == 0) { + printf("must specify csr for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addrmod == 0) { + printf( + "must specify address modifier for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive != UNKNOWN || dp->d_slave != UNKNOWN) { + printf("drives/slaves can be specified "); + printf("only for controllers, "); + printf("not for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + slave = QUES; + } else { + if ((np = mp->d_conn) == 0) { + printf("%s%d isn't connected to anything ", + mp->d_name, mp->d_unit); + printf(", so %s%d is unattached\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive == UNKNOWN) { + printf("must specify ``drive number'' "); + printf("for %s%d\n", dp->d_name, dp->d_unit); + continue; + } + if (dp->d_slave != UNKNOWN) { + printf("slave numbers should be given only "); + printf("for massbus tapes, not for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_vec != 0) { + printf("interrupt vectors should not be "); + printf("given for drive %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addr != 0) { + printf("csr addresses should be given only "); + printf("on controllers, not on %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addrmod != 0) { + printf("address modifiers should be given only "); + printf("on controllers, not on %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + slave = dp->d_drive; + } + fprintf(fp, +"{%14s, %3d, %3s, %4d,%10s, C 0x%08x, 0x%02x, %1d, 0x%08x },\n", + concat3(buf1, "&", + eq(mp->d_name, "vme") ? dp->d_name : mp->d_name, + "driver"), + dp->d_unit, + eq(mp->d_name, "vme") ? 
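+/* Editor's note: the ternary continuing below records a ctlr index of
+   -1 for a device wired directly to the vme bus, and the attaching
+   controller's unit number otherwise. */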
"-1" : qu(mp->d_unit), + slave, + intv2(dp), + dp->d_addr, + dp->d_addrmod, + dp->d_dk, + dp->d_flags); + } + fprintf(fp, "{ NULL }\n};\n"); +closefile: + (void) fclose(fp); +} + +char * +intv2(struct device *dev) +{ + static char buf[20]; + + if (dev->d_vec == 0) { + strcpy(buf, "NULL"); + } else { + (void) sprintf(buf, "_%sint", dev->d_name); + } + return (buf); +} + +char * +concat3(char *buf, const char *p1, const char *p2, const char *p3) +{ + (void) sprintf(buf, "%s%s%s", p1, p2, p3); + return (buf); +} + +#define MAXDEVS 100 +#define DEVLEN 10 +char decl_devices[MAXDEVS][DEVLEN]; + +void +declare(const char *cp) +{ + register int i; + + for (i = 0; i < MAXDEVS; i++) + if (decl_devices[i][0] == 0) { + strncpy(decl_devices[i], cp, DEVLEN); + return; + } + printf("device table full, fix mkioconf.c\n"); + exit(1); +} + +int +is_declared(const char *cp) +{ + register int i; + + for (i = 0; i < MAXDEVS; i++) { + if (decl_devices[i][0] == 0) + return(0); + if (strncmp(decl_devices[i], cp, DEVLEN) == 0) + return(1); + } + return(0); +} +#endif MACHINE_MIPSY || MACHINE_MIPS + +#if MACHINE_M68K +char *m68k_dn(const char *name); +void m68k_pseudo_inits(FILE *fp); + +void +m68k_ioconf(void) +{ + register struct device *dp, *mp; + register int slave; + FILE *fp; + + unlink(path("ioconf.c")); + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } + fprintf(fp, "#include <dev/m68k/busvar.h>\n"); + fprintf(fp, "\n"); + fprintf(fp, "#define C (void *)\n"); + fprintf(fp, "\n"); + + /* + * Now generate interrupt vectors for the bus + */ + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (mp == TO_NEXUS || mp == 0 || mp->d_conn != TO_NEXUS) + continue; + fprintf(fp, "extern struct bus_driver %sdriver;\n", + dp->d_name); + } + + /* + * Now spew forth the bus_ctrl structures + */ + fprintf(fp, "\nstruct bus_ctrl bus_cinit[] = {\n"); + fprintf(fp, +" /* driver ctrl ipl address */\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (dp->d_type != CONTROLLER || mp == TO_NEXUS || mp == 0 || + mp->d_conn != TO_NEXUS || dp->d_unit == QUES) + continue; + if (dp->d_addr == UNKNOWN) { + printf("must specify csr address for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive != UNKNOWN || dp->d_slave != UNKNOWN) { + printf("drives need their own entries; "); + printf("don't specify drive or slave for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_flags) { + printf("controllers (e.g. 
%s%d) don't have flags, ", + dp->d_name, dp->d_unit); + printf("only devices do\n"); + continue; + } + fprintf(fp, +" { %-12s, %5d, %4d, C 0x%08x },\n", + m68k_dn(dp->d_name), dp->d_unit, dp->d_pri, dp->d_addr); + } + fprintf(fp, " 0\n};\n"); + + /* + * Now we go for the bus_device stuff + */ + fprintf(fp, "\nstruct bus_device bus_dinit[] = {\n"); + fprintf(fp, +" /* driver unit ctrl slave ipl dk flags address name */\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + mp = dp->d_conn; + if (dp->d_unit == QUES || dp->d_type != DEVICE || mp == 0 || + mp == TO_NEXUS || mp->d_type == MASTER) + continue; + if (mp->d_conn == TO_NEXUS) { + if (dp->d_addr == UNKNOWN) { + printf("must specify csr for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive != UNKNOWN || dp->d_slave != UNKNOWN) { + printf("drives/slaves can be specified only "); + printf("for controllers, not for device %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + slave = UNKNOWN; + } else { + if (mp->d_conn == 0) { + printf("%s%d isn't connected to anything, ", + mp->d_name, mp->d_unit); + printf("so %s%d is unattached\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_drive == UNKNOWN) { + printf("must specify ``drive number'' for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + /* NOTE THAT ON THE UNIBUS ``drive'' IS STORED IN */ + /* ``SLAVE'' AND WE DON'T WANT A SLAVE SPECIFIED */ + if (dp->d_slave != UNKNOWN) { + printf("slave numbers should be given only "); + printf("for massbus tapes, not for %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_pri != 0) { + printf("interrupt priority should not be "); + printf("given for drive %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + if (dp->d_addr != 0) { + printf("csr addresses should be given only"); + printf(" on controllers, not on %s%d\n", + dp->d_name, dp->d_unit); + continue; + } + slave = dp->d_drive; + } + fprintf(fp, +" { %-12s, %3d, %s, %s,%3d,%3d, %#10x, C 0x%08x, \"%s\" },\n", + m68k_dn(mp->d_conn == TO_NEXUS? dp->d_name : mp->d_name), + dp->d_unit, + mp->d_conn == TO_NEXUS? " -1" : qu(mp->d_unit), + qu(slave), + dp->d_pri, -dp->d_dk, dp->d_flags, + dp->d_addr == UNKNOWN? 
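+/* Editor's note: the ternary continuing below normalizes an
+   unspecified address to 0 so the UNKNOWN sentinel never leaks into
+   the generated bus_dinit[] table. */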
0 : dp->d_addr, + dp->d_name); + } + fprintf(fp, " 0\n};\n"); + m68k_pseudo_inits (fp); + (void) fclose(fp); +} + +void +m68k_pseudo_inits(FILE *fp) +{ + register struct device *dp; + int count; + + fprintf(fp, "\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + fprintf(fp, "extern int %s(int);\n", dp->d_init); + } + fprintf(fp, "\nstruct pseudo_init pseudo_inits[] = {\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + count = dp->d_slave; + if (count <= 0) + count = 1; + fprintf(fp, "\t{%d,\t%s},\n", count, dp->d_init); + } + fprintf(fp, "\t{0,\t0},\n};\n"); +} + +void +i386_pseudo_inits(FILE *fp) +{ + register struct device *dp; + int count; + + fprintf(fp, "\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + fprintf(fp, "extern int %s(int);\n", dp->d_init); + } + fprintf(fp, "\nstruct pseudo_init pseudo_inits[] = {\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + count = dp->d_slave; + if (count <= 0) + count = 1; + fprintf(fp, "\t{%d,\t%s},\n", count, dp->d_init); + } + fprintf(fp, "\t{0,\t0},\n};\n"); +} + +char * +m68k_dn(const char *name) +{ + sprintf(errbuf, "&%sdriver", name); return ns(errbuf); +} +#endif MACHINE_M68K + +#if MACHINE_M88K || MACHINE_M98K +char *nrw_dn(char *name); +void nrw_pseudo_inits(FILE *fp); + +void +nrw_ioconf(void) +{ + FILE *fp; + + unlink(path("ioconf.c")); + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } + fprintf(fp, "#include <dev/nrw/busvar.h>\n"); + fprintf(fp, "\n"); + nrw_pseudo_inits (fp); + (void) fclose(fp); +} + +void +nrw_pseudo_inits(FILE *fp) +{ + register struct device *dp; + int count; + + fprintf(fp, "\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + fprintf(fp, "extern int %s(int);\n", dp->d_init); + } + fprintf(fp, "\nstruct pseudo_init pseudo_inits[] = {\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + count = dp->d_slave; + if (count <= 0) + count = 1; + fprintf(fp, "\t{%d,\t%s},\n", count, dp->d_init); + } + fprintf(fp, "\t{0,\t0},\n};\n"); +} + +char * +nrw_dn(char *name) +{ + sprintf(errbuf, "&%sdriver,", name); + return(errbuf); +} + +void +m88k_ioconf(void) +{ + nrw_ioconf(); +} + +void +m98k_ioconf(void) +{ + nrw_ioconf(); +} + +void +m88k_pseudo_inits(FILE *fp) +{ + nrw_pseudo_inits(fp); +} + +void +m98k_pseudo_inits(FILE *fp) +{ + nrw_pseudo_inits(fp); +} + +char * +m88k_dn(char *name) +{ + return(nrw_dn(name)); +} + +char * +m98k_dn(char *name) +{ + return(nrw_dn(name)); +} + + +#endif MACHINE_M88K || MACHINE_M98K + +#ifdef MACHINE_HPPA +char *hppa_dn(char *name); +void hppa_pseudo_inits(FILE *fp); + +void +hppa_ioconf(void) +{ + FILE *fp; + + unlink(path("ioconf.c")); + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } + fprintf(fp, "#include <dev/hppa/busvar.h>\n"); + fprintf(fp, "\n"); + hppa_pseudo_inits (fp); + (void) fclose(fp); +} + +void +hppa_pseudo_inits(FILE *fp) +{ + register struct device *dp; + int count; + + fprintf(fp, "\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + fprintf(fp, "extern int %s(int);\n", dp->d_init); + } + fprintf(fp, 
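+/* Editor's note: this emitter is identical to the m68k and i386
+   versions above and the sparc/ppc/arm/x86_64 versions below.  As a
+   sketch, with a purely illustrative initializer name, the generated
+   table looks like:
+
+	struct pseudo_init pseudo_inits[] = {
+		{1,	vndevice_init},
+		{0,	0},
+	};
+*/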
"\nstruct pseudo_init pseudo_inits[] = {\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + count = dp->d_slave; + if (count <= 0) + count = 1; + fprintf(fp, "\t{%d,\t%s},\n", count, dp->d_init); + } + fprintf(fp, "\t{0,\t0},\n};\n"); +} + +char * +hppa_dn(char *name) +{ + sprintf(errbuf, "&%sdriver,", name); + + return (errbuf); +} + +#endif MACHINE_HPPA + +#ifdef MACHINE_SPARC +char *sparc_dn(char *name); +void sparc_pseudo_inits(FILE *fp); + +void +sparc_ioconf(void) +{ + FILE *fp; + + unlink(path("ioconf.c")); + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } + fprintf(fp, "#include <dev/busvar.h>\n"); + fprintf(fp, "\n"); + sparc_pseudo_inits (fp); + (void) fclose(fp); +} + +void +sparc_pseudo_inits(FILE *fp) +{ + register struct device *dp; + int count; + + fprintf(fp, "\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + fprintf(fp, "extern int %s(int);\n", dp->d_init); + } + fprintf(fp, "\nstruct pseudo_init pseudo_inits[] = {\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + count = dp->d_slave; + if (count <= 0) + count = 1; + fprintf(fp, "\t{%d,\t%s},\n", count, dp->d_init); + } + fprintf(fp, "\t{0,\t0},\n};\n"); +} + +char * +sparc_dn(char *name) +{ + sprintf(errbuf, "&%sdriver,", name); + return (errbuf); +} + +#endif MACHINE_SPARC + +#ifdef MACHINE_PPC +char *ppc_dn(char *name); +void ppc_pseudo_inits(FILE *fp); + +void +ppc_ioconf(void) +{ + FILE *fp; + + unlink(path("ioconf.c")); + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } + fprintf(fp, "#include <dev/busvar.h>\n"); + fprintf(fp, "\n"); + ppc_pseudo_inits (fp); + (void) fclose(fp); +} + +void +ppc_pseudo_inits(FILE *fp) +{ + register struct device *dp; + int count; + + fprintf(fp, "\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + fprintf(fp, "extern int %s(int);\n", dp->d_init); + } + fprintf(fp, "\nstruct pseudo_init pseudo_inits[] = {\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + count = dp->d_slave; + if (count <= 0) + count = 1; + fprintf(fp, "\t{%d,\t%s},\n", count, dp->d_init); + } + fprintf(fp, "\t{0,\t0},\n};\n"); +} + +char * +ppc_dn(name) + char *name; +{ + sprintf(errbuf, "&%sdriver,", name); + return (errbuf); +} + +#endif MACHINE_PPC + +#ifdef MACHINE_ARM +void arm_pseudo_inits(FILE *fp); + +void +arm_ioconf(void) +{ + FILE *fp; + + unlink(path("ioconf.c")); + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } + fprintf(fp, "#include <dev/busvar.h>\n"); + fprintf(fp, "\n"); + arm_pseudo_inits (fp); + (void) fclose(fp); +} + +void +arm_pseudo_inits(FILE *fp) +{ + register struct device *dp; + int count; + + fprintf(fp, "\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + fprintf(fp, "extern int %s(int);\n", dp->d_init); + } + fprintf(fp, "\nstruct pseudo_init pseudo_inits[] = {\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + count = dp->d_slave; + if (count <= 0) + count = 1; + fprintf(fp, "\t{%d,\t%s},\n", count, dp->d_init); + } + fprintf(fp, "\t{0,\t0},\n};\n"); +} + +#endif /* MACHINE_ARM 
*/ + +#ifdef MACHINE_X86_64 +void x86_64_pseudo_inits(FILE *fp); + +void +x86_64_ioconf(void) +{ + FILE *fp; + + unlink(path("ioconf.c")); + fp = fopen(path("ioconf.c"), "w"); + if (fp == 0) { + perror(path("ioconf.c")); + exit(1); + } + fprintf(fp, "#include <dev/busvar.h>\n"); + fprintf(fp, "\n"); + x86_64_pseudo_inits (fp); + (void) fclose(fp); +} + +void +x86_64_pseudo_inits(FILE *fp) +{ + register struct device *dp; + int count; + + fprintf(fp, "\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + fprintf(fp, "extern int %s(int);\n", dp->d_init); + } + fprintf(fp, "\nstruct pseudo_init pseudo_inits[] = {\n"); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (dp->d_type != PSEUDO_DEVICE || dp->d_init == 0) + continue; + count = dp->d_slave; + if (count <= 0) + count = 1; + fprintf(fp, "\t{%d,\t%s},\n", count, dp->d_init); + } + fprintf(fp, "\t{0,\t0},\n};\n"); +} + +#endif /* MACHINE_X86_64 */ + +char * +intv(struct device *dev) +{ + static char buf[20]; + + if (dev->d_vec == 0) { + strcpy(buf, " 0"); + } else { + (void) sprintf(buf, "%sint%d", dev->d_name, dev->d_unit); + } + return ns(buf); +} + +char * +qu(int num) +{ + + if (num == QUES) { + strcpy(errbuf, "'?'"); + } else if (num == UNKNOWN) { + strcpy(errbuf, " -1"); + } else { + (void) sprintf(errbuf, "%3d", num); + } + return ns(errbuf); +} diff --git a/SETUP/config/mkmakefile.c b/SETUP/config/mkmakefile.c new file mode 100644 index 000000000..6ac9aa099 --- /dev/null +++ b/SETUP/config/mkmakefile.c @@ -0,0 +1,1182 @@ +/* + * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights + * Reserved. This file contains Original Code and/or Modifications of + * Original Code as defined in and that are subject to the Apple Public + * Source License Version 1.0 (the 'License'). You may not use this file + * except in compliance with the License. Please obtain a copy of the + * License at http://www.apple.com/publicsource and read it before using + * this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License." + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* + * Mach Operating System + * Copyright (c) 1990 Carnegie-Mellon University + * Copyright (c) 1989 Carnegie-Mellon University + * Copyright (c) 1988 Carnegie-Mellon University + * Copyright (c) 1987 Carnegie-Mellon University + * All rights reserved. The CMU software License Agreement specifies + * the terms and conditions for use and redistribution. + */ + +/* + * Copyright (c) 1980 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms are permitted + * provided that the above copyright notice and this paragraph are + * duplicated in all such forms and that any documentation, + * advertising materials, and other materials related to such + * distribution and use acknowledge that the software was developed + * by the University of California, Berkeley. 
The name of the + * University may not be used to endorse or promote products derived + * from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + */ + +#ifndef lint +static char sccsid[] __attribute__((used)) = "@(#)mkmakefile.c 5.21 (Berkeley) 6/18/88"; +#endif /* not lint */ + +/* + * Build the makefile for the system, from + * the information in the files files and the + * additional files for the machine being compiled to. + */ + +#include <stdio.h> +#include <unistd.h> /* for unlink */ +#include <ctype.h> +#include "parser.h" +#include "config.h" + +void read_files(void); +void do_objs(FILE *fp, const char *msg, int ext); +void do_ordered(FILE *fp); +void do_files(FILE *fp, const char *msg, char ext); +void do_machdep(FILE *ofp); +void do_build(const char *name, void (*format)(FILE *)); +void do_rules(FILE *f); +void do_load(FILE *f); +struct file_list *do_systemspec(FILE *f, struct file_list *fl, int first); +void do_swapspec(FILE *f, const char *name, char *sysname); +void copy_dependencies(FILE *makin, FILE *makout); + +void build_cputypes(FILE *fp); +void build_confdep(FILE *fp); + +struct file_list *fl_lookup(char *file); +struct file_list *fltail_lookup(char *file); +struct file_list *new_fent(void); + +void put_source_file_name(FILE *fp, struct file_list *tp); + + +#define DO_SWAPFILE 0 + +#define next_word(fp, wd) \ + { register const char *word = get_word(fp); \ + if (word == (char *)EOF) \ + return; \ + else \ + wd = word; \ + } + +static struct file_list *fcur; +const char *tail(const char *fn); +char *allCaps(char *str); + +/* + * Lookup a file, by name. + */ +struct file_list * +fl_lookup(char *file) +{ + register struct file_list *fp; + + for (fp = ftab ; fp != 0; fp = fp->f_next) { + if (eq(fp->f_fn, file)) + return (fp); + } + return (0); +} + +/* + * Lookup a file, by final component name. 
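+ * (Editor's note: "final component" means the basename; tail(),
+ * defined later in this file, maps an illustrative path such as
+ * "bsd/kern/kern_fork.c" to "kern_fork.c".  This is how an entry in a
+ * local files.<IDENT> list overrides a like-named file listed
+ * elsewhere.)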
+ */ +struct file_list * +fltail_lookup(char *file) +{ + register struct file_list *fp; + + for (fp = ftab ; fp != 0; fp = fp->f_next) { + if (eq(tail(fp->f_fn), tail(file))) + return (fp); + } + return (0); +} + +/* + * Make a new file list entry + */ +struct file_list * +new_fent(void) +{ + register struct file_list *fp; + + fp = (struct file_list *) malloc(sizeof *fp); + fp->f_needs = 0; + fp->f_next = 0; + fp->f_flags = 0; + fp->f_type = 0; + fp->f_extra = (char *) 0; + if (fcur == 0) + fcur = ftab = fp; + else + fcur->f_next = fp; + fcur = fp; + return (fp); +} + +char *COPTS; +static struct users { + int u_default; + int u_min; + int u_max; +} users[] = { + { 24, 2, 1024 }, /* MACHINE_VAX */ + { 8, 2, 32 }, /* MACHINE_SUN */ + { 16, 4, 32 }, /* MACHINE_ROMP */ + { 8, 2, 32 }, /* MACHINE_SUN2 */ + { 8, 2, 32 }, /* MACHINE_SUN3 */ + { 24, 8, 1024}, /* MACHINE_MMAX */ + { 32, 8, 1024}, /* MACHINE_SQT */ + { 8, 2, 32 }, /* MACHINE_SUN4 */ + { 2, 2, 1024 }, /* MACHINE_I386 */ + { 32, 8, 1024 }, /* MACHINE_IX */ + { 32, 8, 1024 }, /* MACHINE_MIPSY */ + { 32, 8, 1024 }, /* MACHINE_MIPS*/ + { 32, 8, 1024 }, /* MACHINE_I860*/ + { 8, 2, 32 }, /* MACHINE_M68K */ + { 8, 2, 32 }, /* MACHINE_M88K */ + { 8, 2, 32 }, /* MACHINE_M98K */ + { 8, 2, 32 }, /* MACHINE_HPPA */ + { 8, 2, 32 }, /* MACHINE_SPARC */ + { 8, 2, 32 }, /* MACHINE_PPC */ + { 8, 2, 32 }, /* MACHINE_ARM */ + { 8, 2, 32 }, /* MACHINE_X86_64 */ +}; +#define NUSERS (sizeof (users) / sizeof (users[0])) + +const char * +get_VPATH(void) +{ + static char *vpath = NULL; + + if ((vpath == NULL) && + ((vpath = getenv("VPATH")) != NULL) && + (*vpath != ':')) { + register char *buf = malloc((unsigned)(strlen(vpath) + 2)); + + vpath = strcat(strcpy(buf, ":"), vpath); + } + + return vpath ? vpath : ""; +} + + +/* + * Build the makefile from the skeleton + */ +void +makefile(void) +{ + FILE *ifp, *ofp; + FILE *dfp; + char pname[BUFSIZ]; + char line[BUFSIZ]; + struct opt *op; + struct users *up; + + read_files(); + (void) sprintf(line, "%s/Makefile.template", config_directory); + ifp = fopenp(VPATH, line, pname, "r"); + if (ifp == 0) { + perror(line); + exit(1); + } + dfp = fopen(path("Makefile"), "r"); + rename(path("Makefile"), path("Makefile.old")); + unlink(path("Makefile.old")); + unlink(path("M.d")); + if ((ofp = fopen(path("M.d"), "w")) == NULL) { + perror(path("M.d")); + /* We'll let this error go */ + } + else + fclose(ofp); + ofp = fopen(path("Makefile"), "w"); + if (ofp == 0) { + perror(path("Makefile")); + exit(1); + } + fprintf(ofp, "SOURCE_DIR=%s\n", source_directory); + + if (machine == MACHINE_SUN || machine == MACHINE_SUN2 + || machine == MACHINE_SUN3 || machine == MACHINE_SUN4) + fprintf(ofp, "IDENT=-D%s -D%s", machinename, allCaps(ident)); + else + fprintf(ofp, "IDENT=-D%s", allCaps(ident)); + if (profiling) + fprintf(ofp, " -DGPROF"); + if (cputype == 0) { + printf("cpu type must be specified\n"); + exit(1); + } + do_build("cputypes.h", build_cputypes); + + for (op = opt; op; op = op->op_next) + if (op->op_value) + fprintf(ofp, " -D%s=\"%s\"", op->op_name, op->op_value); + else + fprintf(ofp, " -D%s", op->op_name); + fprintf(ofp, "\n"); + if ((unsigned)machine > NUSERS) { + printf("maxusers config info isn't present, using vax\n"); + up = &users[MACHINE_VAX-1]; + } else + up = &users[machine-1]; + if (maxusers < up->u_min) { + maxusers = up->u_min; + } else if (maxusers > up->u_max) + printf("warning: maxusers > %d (%d)\n", up->u_max, maxusers); + if (maxusers) { + do_build("confdep.h", build_confdep); + } + for (op = mkopt; op; 
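+/* Editor's note: mkopt entries come from "makeoptions" lines in the
+   configuration; the loop continuing below copies each one into the
+   generated Makefile as a NAME=value (or bare NAME) assignment. */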
op = op->op_next) + if (op->op_value) + fprintf(ofp, "%s=%s\n", op->op_name, op->op_value); + else + fprintf(ofp, "%s\n", op->op_name); + + while (fgets(line, BUFSIZ, ifp) != 0) { + if (*line == '%') + goto percent; + if (profiling && strncmp(line, "COPTS=", 6) == 0) { + register char *cp; + if (machine != MACHINE_MMAX) + fprintf(ofp, + "GPROF.EX=$(SOURCE_DIR)/machdep/%s/gmon.ex\n", machinename); + cp = index(line, '\n'); + if (cp) + *cp = 0; + cp = line + 6; + while (*cp && (*cp == ' ' || *cp == '\t')) + cp++; + COPTS = malloc((unsigned)(strlen(cp) + 1)); + if (COPTS == 0) { + printf("config: out of memory\n"); + exit(1); + } + strcpy(COPTS, cp); + if (machine == MACHINE_MIPSY || machine == MACHINE_MIPS) { + fprintf(ofp, "%s ${CCPROFOPT}\n", line); + fprintf(ofp, "PCOPTS=%s\n", cp); + } else if (machine == MACHINE_MMAX) + fprintf(ofp, "%s -p\n",line); + else + fprintf(ofp, "%s -pg\n", line); + continue; + } + fprintf(ofp, "%s", line); + continue; + percent: + if (eq(line, "%OBJS\n")) { + do_objs(ofp, "OBJS=", -1); + } else if (eq(line, "%CFILES\n")) { + do_files(ofp, "CFILES=", 'c'); + do_objs(ofp, "COBJS=", 'c'); + } else if (eq(line, "%MFILES\n")) { + do_files(ofp, "MFILES=", 'm'); + do_objs(ofp, "MOBJS=", 'm'); + } else if (eq(line, "%SFILES\n")) { + do_files(ofp, "SFILES=", 's'); + do_objs(ofp, "SOBJS=", 's'); + } else if (eq(line, "%BFILES\n")) + do_files(ofp, "BFILES=", 'b'); + else if (eq(line, "%MACHDEP\n")) { + /* + * Move do_machdep() after the mkopt stuff. + */ + for (op = mkopt; op; op = op->op_next) + fprintf(ofp, "%s=%s\n", op->op_name, op->op_value); + do_machdep(ofp); + } else if (eq(line, "%ORDERED\n")) + do_ordered(ofp); + else if (eq(line, "%RULES\n")) + do_rules(ofp); + else if (eq(line, "%LOAD\n")) + do_load(ofp); + else + fprintf(stderr, + "Unknown %% construct in generic makefile: %s", + line); + } + if (dfp != NULL) + { + copy_dependencies(dfp, ofp); + (void) fclose(dfp); + } + (void) fclose(ifp); + (void) fclose(ofp); +} + +/* + * Read in the information about files used in making the system. + * Store it in the ftab linked list. + */ +void +read_files(void) +{ + FILE *fp; + register struct file_list *tp, *pf; + register struct device *dp; + register struct opt *op; + const char *wd; + char *this, *needs; + const char *devorprof; + int options; + int not_option; + int ordered; + int sedit; /* SQT */ + char pname[BUFSIZ]; + char fname[1024]; + char *rest = (char *) 0; + struct cputype *cp; + int nreqs, first = 1, isdup; + + ftab = 0; + (void) sprintf(fname, "%s/files", config_directory); +openit: + fp = fopenp(VPATH, fname, pname, "r"); + if (fp == 0) { + perror(fname); + exit(1); + } +next: + options = 0; + rest = (char *) 0; + /* + * filename [ standard | optional ] + * [ dev* | profiling-routine ] [ device-driver] + */ + /* + * MACHINE_SQT ONLY: + * + * filename [ standard | optional ] + * [ ordered | sedit ] + * [ dev* | profiling-routine ] [ device-driver] + */ + wd = get_word(fp); + if (wd == (char *)EOF) { + (void) fclose(fp); + if (first == 1) { + (void) sprintf(fname, "%s/files.%s", config_directory, machinename); + first++; + goto openit; + } + if (first == 2) { + (void) sprintf(fname, "files.%s", allCaps(ident)); + first++; + fp = fopenp(VPATH, fname, pname, "r"); + if (fp != 0) + goto next; + } + return; + } + if (wd == 0) + goto next; + /* + * Allow comment lines beginning with a '#' character. 
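+ * (Editor's note: purely illustrative entries in this format:
+ *
+ *	bsd/kern/kern_mman.c	standard
+ *	bsd/net/if_loop.c	optional loop device-driver
+ *
+ * The '#' handling below simply consumes the rest of the line.)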
+ */ + if (*wd == '#') + { + while ((wd=get_word(fp)) && wd != (char *)EOF) + ; + goto next; + } + + this = ns(wd); + next_word(fp, wd); + if (wd == 0) { + printf("%s: No type for %s.\n", + fname, this); + exit(1); + } + if ((pf = fl_lookup(this)) && (pf->f_type != INVISIBLE || pf->f_flags)) + isdup = 1; + else + isdup = 0; + tp = 0; + if (first == 3 && (tp = fltail_lookup(this)) != 0) + printf("%s: Local file %s overrides %s.\n", + fname, this, tp->f_fn); + nreqs = 0; + devorprof = ""; + ordered = 0; + sedit = 1; /* SQT: assume sedit for now */ + needs = 0; + if (eq(wd, "standard")) + goto checkdev; + if (!eq(wd, "optional")) { + printf("%s: %s must be optional or standard\n", fname, this); + exit(1); + } + if (strncmp(this, "OPTIONS/", 8) == 0) + options++; + not_option = 0; +nextopt: + next_word(fp, wd); + if (wd == 0) + goto doneopt; + if (eq(wd, "ordered")) { + ordered++; + goto nextopt; + } + if (machine == MACHINE_SQT && eq(wd, "sedit")) { + sedit++; + goto nextopt; + } + if (eq(wd, "not")) { + not_option = !not_option; + goto nextopt; + } + devorprof = wd; + if (eq(wd, "device-driver") || eq(wd, "profiling-routine")) { + next_word(fp, wd); + goto save; + } + nreqs++; + if (needs == 0 && nreqs == 1) + needs = ns(wd); + if (isdup) + goto invis; + if (options) + { + struct opt *lop = 0; + struct device tdev; + + /* + * Allocate a pseudo-device entry which we will insert into + * the device list below. The flags field is set non-zero to + * indicate an internal entry rather than one generated from + * the configuration file. The slave field is set to define + * the corresponding symbol as 0 should we fail to find the + * option in the option list. + */ + init_dev(&tdev); + tdev.d_name = ns(wd); + tdev.d_type = PSEUDO_DEVICE; + tdev.d_flags++; + tdev.d_slave = 0; + + for (op=opt; op; lop=op, op=op->op_next) + { + char *od = allCaps(ns(wd)); + + /* + * Found an option which matches the current device + * dependency identifier. Set the slave field to + * define the option in the header file. 
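+ * (Editor's note: in other words, an OPTIONS/<name> files entry whose
+ * dependency matches an "options <NAME>" line ends up with d_slave set
+ * to 1, and the header generator defines the symbol as 1; without a
+ * matching option the symbol is defined as 0.)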
+ */ + if (strcmp(op->op_name, od) == 0) + { + tdev.d_slave = 1; + if (lop == 0) + opt = op->op_next; + else + lop->op_next = op->op_next; + free(op); + op = 0; + } + free(od); + if (op == 0) + break; + } + newdev(&tdev); + } + for (dp = dtab; dp != 0; dp = dp->d_next) { + if (eq(dp->d_name, wd) && (dp->d_type != PSEUDO_DEVICE || dp->d_slave)) { + if (not_option) + goto invis; /* dont want file if option present */ + else + goto nextopt; + } + } + if (not_option) + goto nextopt; /* want file if option missing */ + + for (op = opt; op != 0; op = op->op_next) + if (op->op_value == 0 && opteq(op->op_name, wd)) { + if (nreqs == 1) { + free(needs); + needs = 0; + } + goto nextopt; + } + + for (cp = cputype; cp; cp = cp->cpu_next) + if (opteq(cp->cpu_name, wd)) { + if (nreqs == 1) { + free(needs); + needs = 0; + } + goto nextopt; + } + +invis: + while ((wd = get_word(fp)) != 0) + ; + if (tp == 0) + tp = new_fent(); + tp->f_fn = this; + tp->f_type = INVISIBLE; + tp->f_needs = needs; + tp->f_flags = isdup; + goto next; + +doneopt: + if (nreqs == 0) { + printf("%s: what is %s optional on?\n", + fname, this); + exit(1); + } + +checkdev: + if (wd) { + if (*wd == '|') + goto getrest; + next_word(fp, wd); + if (wd) { + if (eq(wd, "ordered")) { + ordered++; + goto checkdev; + } + if (machine == MACHINE_SQT && eq(wd, "sedit")) { + sedit++; + goto checkdev; + } + devorprof = wd; + next_word(fp, wd); + } + } + +save: +getrest: + if (wd) { + if (*wd == '|') { + rest = ns(get_rest(fp)); + } else { + printf("%s: syntax error describing %s\n", + fname, this); + exit(1); + } + } + if (eq(devorprof, "profiling-routine") && profiling == 0) + goto next; + if (tp == 0) + tp = new_fent(); + tp->f_fn = this; + tp->f_extra = rest; + if (options) + tp->f_type = INVISIBLE; + else + if (eq(devorprof, "device-driver")) + tp->f_type = DRIVER; + else if (eq(devorprof, "profiling-routine")) + tp->f_type = PROFILING; + else + tp->f_type = NORMAL; + tp->f_flags = 0; + if (ordered) + tp->f_flags |= ORDERED; + if (sedit) /* SQT */ + tp->f_flags |= SEDIT; + tp->f_needs = needs; + if (pf && pf->f_type == INVISIBLE) + pf->f_flags = 1; /* mark as duplicate */ + goto next; +} + +int +opteq(const char *cp, const char *dp) +{ + char c, d; + + for (; ; cp++, dp++) { + if (*cp != *dp) { + c = isupper(*cp) ? tolower(*cp) : *cp; + d = isupper(*dp) ? 
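+/* Editor's note: the comparison continues below; opteq() is simply a
+   case-insensitive string equality test, which lets a lower-case
+   dependency in a files entry match an upper-case option name. */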
tolower(*dp) : *dp; + if (c != d) + return (0); + } + if (*cp == 0) + return (1); + } +} + +void +put_source_file_name(FILE *fp, struct file_list *tp) +{ + if ((tp->f_fn[0] == '.') && (tp->f_fn[1] == '/')) + fprintf(fp, "%s ", tp->f_fn); + else + fprintf(fp, "$(SOURCE_DIR)/%s ", tp->f_fn); +} + +void +do_objs(FILE *fp, const char *msg, int ext) +{ + register struct file_list *tp; + register int lpos, len; + char *cp; + char och; + const char *sp; +#if DO_SWAPFILE + register struct file_list *fl; + char swapname[32]; +#endif DO_SWAPFILE + + fprintf(fp, "%s", msg); + lpos = strlen(msg); + for (tp = ftab; tp != 0; tp = tp->f_next) { + if (tp->f_type == INVISIBLE) + continue; + + /* + * Check for '.o' file in list + */ + cp = tp->f_fn + (len = strlen(tp->f_fn)) - 1; + if ((ext == -1 && tp->f_flags & ORDERED) || /* not in objs */ + (ext != -1 && *cp != ext)) + continue; + else if (*cp == 'o') { + if (len + lpos > 72) { + lpos = 8; + fprintf(fp, "\\\n\t"); + } + put_source_file_name(fp, tp); + fprintf(fp, " "); + lpos += len + 1; + continue; + } + sp = tail(tp->f_fn); +#if DO_SWAPFILE + for (fl = conf_list; fl; fl = fl->f_next) { + if (fl->f_type != SWAPSPEC) + continue; + (void) sprintf(swapname, "swap%s.c", fl->f_fn); + if (eq(sp, swapname)) + goto cont; + } +#endif DO_SWAPFILE + cp = (char *)sp + (len = strlen(sp)) - 1; + och = *cp; + *cp = 'o'; + if (len + lpos > 72) { + lpos = 8; + fprintf(fp, "\\\n\t"); + } + fprintf(fp, "%s ", sp); + lpos += len + 1; + *cp = och; +#if DO_SWAPFILE +cont: + ; +#endif DO_SWAPFILE + } + if (lpos != 8) + putc('\n', fp); +} + +/* not presently used and probably broken, use ORDERED instead */ +void +do_ordered(FILE *fp) +{ + register struct file_list *tp; + register int lpos, len; + char *cp; + char och; + const char *sp; + + fprintf(fp, "ORDERED="); + lpos = 10; + for (tp = ftab; tp != 0; tp = tp->f_next) { + if ((tp->f_flags & ORDERED) != ORDERED) + continue; + sp = tail(tp->f_fn); + cp = (char *)sp + (len = strlen(sp)) - 1; + och = *cp; + *cp = 'o'; + if (len + lpos > 72) { + lpos = 8; + fprintf(fp, "\\\n\t"); + } + fprintf(fp, "%s ", sp); + lpos += len + 1; + *cp = och; + } + if (lpos != 8) + putc('\n', fp); +} + +void +do_files(FILE *fp, const char *msg, char ext) +{ + register struct file_list *tp; + register int lpos, len=0; /* dvw: init to 0 */ + + fprintf(fp, "%s", msg); + lpos = 8; + for (tp = ftab; tp != 0; tp = tp->f_next) { + if (tp->f_type == INVISIBLE) + continue; + if (tp->f_fn[strlen(tp->f_fn)-1] != ext) + continue; + /* + * Always generate a newline. + * Our Makefile's aren't readable anyway. + */ + + lpos = 8; + fprintf(fp, "\\\n\t"); + put_source_file_name(fp, tp); + lpos += len + 1; + } + if (lpos != 8) + putc('\n', fp); +} + +/* + * Include machine dependent makefile in output + */ + +void +do_machdep(FILE *ofp) +{ + FILE *ifp; + char pname[BUFSIZ]; + char line[BUFSIZ]; + + (void) sprintf(line, "%s/Makefile.%s", config_directory, machinename); + ifp = fopenp(VPATH, line, pname, "r"); + if (ifp == 0) { + perror(line); + exit(1); + } + while (fgets(line, BUFSIZ, ifp) != 0) { + if (profiling && (strncmp(line, "LIBS=", 5) == 0)) + fprintf(ofp,"LIBS=${LIBS_P}\n"); + else + fputs(line, ofp); + } + fclose(ifp); +} + + +/* + * Format configuration dependent parameter file. + */ + +void +build_confdep(FILE *fp) +{ + fprintf(fp, "#define MAXUSERS %d\n", maxusers); +} + +/* + * Format cpu types file. 
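+ * (Editor's note: emits one "#define <cpu_name> 1" line per declared
+ * cpu; do_build() below then installs the file only if its contents
+ * differ from the existing cputypes.h, avoiding needless rebuilds.)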
+ */ + +void +build_cputypes(FILE *fp) +{ + struct cputype *cp; + + for (cp = cputype; cp; cp = cp->cpu_next) + fprintf(fp, "#define\t%s\t1\n", cp->cpu_name); +} + + + +/* + * Build a define parameter file. Create it first in a temporary location and + * determine if this new contents differs from the old before actually + * replacing the original (so as not to introduce avoidable extraneous + * compilations). + */ + +void +do_build(const char *name, void (*format)(FILE *)) +{ + static char temp[]="#config.tmp"; + FILE *tfp, *ofp; + int c; + + unlink(path(temp)); + tfp = fopen(path(temp), "w+"); + if (tfp == 0) { + perror(path(temp)); + exit(1); + } + unlink(path(temp)); + (*format)(tfp); + ofp = fopen(path(name), "r"); + if (ofp != 0) + { + fseek(tfp, 0, 0); + while ((c = fgetc(tfp)) != EOF) + if (fgetc(ofp) != c) + goto copy; + if (fgetc(ofp) == EOF) + goto same; + + } +copy: + if (ofp) + fclose(ofp); + unlink(path(name)); + ofp = fopen(path(name), "w"); + if (ofp == 0) { + perror(path(name)); + exit(1); + } + fseek(tfp, 0, 0); + while ((c = fgetc(tfp)) != EOF) + fputc(c, ofp); +same: + fclose(ofp); + fclose(tfp); +} + +const char * +tail(const char *fn) +{ + register const char *cp; + + cp = rindex(fn, '/'); + if (cp == 0) + return (fn); + return (cp+1); +} + +/* + * Create the makerules for each file + * which is part of the system. + * Devices are processed with the special c2 option -i + * which avoids any problem areas with i/o addressing + * (e.g. for the VAX); assembler files are processed by as. + */ +void +do_rules(FILE *f) +{ + char *cp; + char *np, och; + const char *tp; + register struct file_list *ftp; + const char *extras = ""; /* dvw: init to "" */ + char *source_dir; + char och_upper; + const char *nl = ""; + + for (ftp = ftab; ftp != 0; ftp = ftp->f_next) { + if (ftp->f_type == INVISIBLE) + continue; + cp = (np = ftp->f_fn) + strlen(ftp->f_fn) - 1; + och = *cp; + /* + * Don't compile '.o' files + */ + if (och == 'o') + continue; + /* + * Determine where sources should come from + */ + if ((np[0] == '.') && (np[1] == '/')) { + source_dir = ""; + np += 2; + } else + source_dir = "$(SOURCE_DIR)/"; + *cp = '\0'; + tp = tail(np); /* dvw: init tp before 'if' */ + if (och == 'o') { + fprintf(f, "%so: %so\n\t${O_RULE_1A}%s%.*s${O_RULE_1B}\n\n", + tp, np, source_dir, (int)(tp-np), np); + continue; + } + fprintf(f, "%so: %s%s%c\n", tp, source_dir, np, och); + if (och == 's') { + switch (machine) { + case MACHINE_MIPSY: + case MACHINE_MIPS: + switch (ftp->f_type) { + case NORMAL: + case DRIVER: + fprintf(f, "\t@${RM} %so\n", tp); + fprintf(f, "\t${CC} ${CCASFLAGS}%s %s%s%ss\n\n", + (ftp->f_extra?ftp->f_extra:""), extras, source_dir, np); + break; + + case PROFILING: + if (!profiling) + continue; + fprintf(f, "\t@${RM} %so\n", tp); + fprintf(f, "\t${CC} ${CCPASFLAGS}%s %s%s%ss\n\n", + (ftp->f_extra?ftp->f_extra:""), extras, source_dir, np); + break; + + default: + printf("Don't know rules for %s.s\n", np); + break; + } + break; + default: + fprintf(f, "\t${S_RULE_1A}%s%.*s${S_RULE_1B}%s\n", + source_dir, (int)(tp-np), np, nl); + fprintf(f, "\t${S_RULE_2}%s\n", nl); + fprintf(f, "\t${S_RULE_3}\n\n"); + } + continue; + } + if (och == 'b') { + fprintf(f, "\t${B_RULE_1A}%s%.*s${B_RULE_1B}\n\n", + source_dir, (int)(tp-np), np); + continue; + } + extras = ""; + switch (ftp->f_type) { + + case NORMAL: + switch (machine) { + + case MACHINE_MIPSY: + case MACHINE_MIPS: + fprintf(f, "\t@${RM} %so\n", tp); + fprintf(f, "\t${CC} ${CCNFLAGS}%s %s%s%sc\n\n", + (ftp->f_extra?ftp->f_extra:""), 
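+/* Editor's note: the argument list continues below; f_extra holds any
+   text that followed a '|' in the files entry and is spliced into the
+   compile command for just that file. */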
extras, source_dir, np); + continue; + #if 0 + case MACHINE_SQT: + if (ftp->f_flags & SEDIT) { + fprintf(f, "\t${CC} -SO ${COPTS} %s%s%sc | \\\n", extras, source_dir, np); + fprintf(f, "\t${SEDCMD} | ${C2} | ${AS} ${CAFLAGS} -o %so\n\n", tp); + } else { + fprintf(f, "\t${CC} -c -O ${COPTS} %s%s%sc\n\n", + source_dir, extras, np); + } + break; + #endif 0 + default: + goto common; + } + break; + + case DRIVER: + switch (machine) { + + case MACHINE_MIPSY: + case MACHINE_MIPS: + fprintf(f, "\t@${RM} %so\n", tp); + fprintf(f, "\t${CC} ${CCDFLAGS}%s %s%s%sc\n\n", + (ftp->f_extra?ftp->f_extra:""), extras, source_dir, np); + continue; + default: + extras = "_D"; + goto common; + } + break; + + case PROFILING: + if (!profiling) + continue; + if (COPTS == 0) { + fprintf(stderr, + "config: COPTS undefined in generic makefile"); + COPTS = ""; + } + switch (machine) { + case MACHINE_MIPSY: + case MACHINE_MIPS: + fprintf(f, "\t@${RM} %so\n", tp); + fprintf(f, "\t${CC} ${CCPFLAGS}%s %s../%sc\n\n", + (ftp->f_extra?ftp->f_extra:""), extras, np); + continue; + case MACHINE_VAX: + case MACHINE_ROMP: + case MACHINE_SQT: + case MACHINE_MMAX: + case MACHINE_SUN3: + case MACHINE_SUN4: + case MACHINE_I386: + case MACHINE_I860: + case MACHINE_HPPA: + case MACHINE_SPARC: + case MACHINE_PPC: + case MACHINE_ARM: + case MACHINE_X86_64: + extras = "_P"; + goto common; + default: + fprintf(stderr, + "config: don't know how to profile kernel on this cpu\n"); + break; + } + + common: + och_upper = och + 'A' - 'a'; + fprintf(f, "\t${%c_RULE_1A%s}", och_upper, extras); + if (ftp->f_extra) + fprintf(f, "%s", ftp->f_extra); + fprintf(f, "%s%.*s${%c_RULE_1B%s}%s\n", + source_dir, (int)(tp-np), np, och_upper, extras, nl); + fprintf(f, "\t${%c_RULE_2%s}%s\n", och_upper, extras, nl); + fprintf(f, "\t${%c_RULE_3%s}%s\n", och_upper, extras, nl); + fprintf(f, "\t${%c_RULE_4%s}\n\n", och_upper, extras); + break; + + default: + printf("Don't know rules for %s\n", np); + break; + } + *cp = och; + } +} + +/* + * Create the load strings + */ +void +do_load(FILE *f) +{ + register struct file_list *fl; + int first = 1; + + fl = conf_list; + while (fl) { + if (fl->f_type != SYSTEMSPEC) { + fl = fl->f_next; + continue; + } + fl = do_systemspec(f, fl, first); + if (first) + first = 0; + } + fprintf(f, "LOAD ="); + for (fl = conf_list; fl != 0; fl = fl->f_next) + if (fl->f_type == SYSTEMSPEC) + fprintf(f, " %s", fl->f_needs); +#ifdef multimax + fprintf(f, "\n\nall .ORDER: includelinks ${LOAD}\n"); +#else multimax + fprintf(f, "\n\nall: includelinks ${LOAD}\n"); +#endif multimax + fprintf(f, "\n"); +} + +struct file_list * +do_systemspec(FILE *f, struct file_list *fl, __unused int first) +{ + /* + * Variable for kernel name. 
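+ * (Editor's note: f_needs holds the kernel name from the
+ * "config <name> ..." line, so a hypothetical "config mach_kernel
+ * swap generic" yields KERNEL_NAME=mach_kernel plus the SYS_RULE
+ * targets emitted below.)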
+ */ + fprintf(f, "KERNEL_NAME=%s\n", fl->f_needs); + + fprintf(f, "%s .ORDER: %s.sys ${SYSDEPS}\n", + fl->f_needs, fl->f_needs); + fprintf(f, "\t${SYS_RULE_1}\n"); + fprintf(f, "\t${SYS_RULE_2}\n"); + fprintf(f, "\t${SYS_RULE_3}\n"); + fprintf(f, "\t${SYS_RULE_4}\n\n"); + do_swapspec(f, fl->f_fn, fl->f_needs); + for (fl = fl->f_next; fl != NULL && fl->f_type == SWAPSPEC; fl = fl->f_next) + continue; + return (fl); +} + +void +do_swapspec(__unused FILE *f, __unused const char *name, __unused char *sysname) +{ + +#if DO_SWAPFILE + char *gdir = eq(name, "generic")?"$(MACHINEDIR)/":""; + + fprintf(f, "%s.sys:${P} ${PRELDDEPS} ${LDOBJS} ${LDDEPS}\n\n", sysname); + fprintf(f, "%s.swap: swap%s.o\n", sysname, name); + fprintf(f, "\t@rm -f $@\n"); + fprintf(f, "\t@cp swap%s.o $@\n\n", name); + fprintf(f, "swap%s.o: %sswap%s.c ${SWAPDEPS}\n", name, gdir, name); + if (machine == MACHINE_MIPSY || machine == MACHINE_MIPS) { + fprintf(f, "\t@${RM} swap%s.o\n", name); + fprintf(f, "\t${CC} ${CCNFLAGS} %sswap%s.c\n\n", gdir, name); + } else { + fprintf(f, "\t${C_RULE_1A}%s${C_RULE_1B}\n", gdir); + fprintf(f, "\t${C_RULE_2}\n"); + fprintf(f, "\t${C_RULE_3}\n"); + fprintf(f, "\t${C_RULE_4}\n\n"); + } +#endif DO_SWAPFILE +} + +char * +allCaps(str) + register char *str; +{ + register char *cp = str; + + while (*str) { + if (islower(*str)) + *str = toupper(*str); + str++; + } + return (cp); +} + +#define OLDSALUTATION "# DO NOT DELETE THIS LINE" + +#define LINESIZE 1024 +static char makbuf[LINESIZE]; /* one line buffer for makefile */ + +void +copy_dependencies(FILE *makin, FILE *makout) +{ + register int oldlen = (sizeof OLDSALUTATION - 1); + + while (fgets(makbuf, LINESIZE, makin) != NULL) { + if (! strncmp(makbuf, OLDSALUTATION, oldlen)) + break; + } + while (fgets(makbuf, LINESIZE, makin) != NULL) { + if (oldlen != 0) + { + if (makbuf[0] == '\n') + continue; + else + oldlen = 0; + } + fputs(makbuf, makout); + } +} diff --git a/SETUP/config/mkswapconf.c b/SETUP/config/mkswapconf.c new file mode 100644 index 000000000..fdd14d722 --- /dev/null +++ b/SETUP/config/mkswapconf.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights + * Reserved. This file contains Original Code and/or Modifications of + * Original Code as defined in and that are subject to the Apple Public + * Source License Version 1.0 (the 'License'). You may not use this file + * except in compliance with the License. Please obtain a copy of the + * License at http://www.apple.com/publicsource and read it before using + * this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License." + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* + * Mach Operating System + * Copyright (c) 1990 Carnegie-Mellon University + * Copyright (c) 1989 Carnegie-Mellon University + * All rights reserved. The CMU software License Agreement specifies + * the terms and conditions for use and redistribution. + */ + +/* + * Copyright (c) 1980 Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms are permitted + * provided that the above copyright notice and this paragraph are + * duplicated in all such forms and that any documentation, + * advertising materials, and other materials related to such + * distribution and use acknowledge that the software was developed + * by the University of California, Berkeley. The name of the + * University may not be used to endorse or promote products derived + * from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + */ + +#ifndef lint +static char sccsid[] __attribute__((used)) = "@(#)mkswapconf.c 5.6 (Berkeley) 6/18/88"; +#endif /* not lint */ + +/* + * Build a swap configuration file. + */ +#include "config.h" + +#include <stdio.h> +#include <unistd.h> /* for unlink */ +#include <ctype.h> + +struct file_list *do_swap(struct file_list *fl); +void initdevtable(void); + +void +swapconf(void) +{ + register struct file_list *fl; + + fl = conf_list; + while (fl) { + if (fl->f_type != SYSTEMSPEC) { + fl = fl->f_next; + continue; + } + fl = do_swap(fl); + } +} + +struct file_list * +do_swap(struct file_list *fl) +{ + FILE *fp; + char swapname[80]; + register struct file_list *swap; + dev_t dev; + + if (eq(fl->f_fn, "generic")) { + fl = fl->f_next; + return (fl->f_next); + } + if (machine == MACHINE_MMAX) { + printf("Error: Multimax must specify swap generic only.\n"); + exit(1); + } + (void) sprintf(swapname, "swap%s.c", fl->f_fn); + fp = fopen(path(swapname), "w"); + if (fp == 0) { + perror(path(swapname)); + exit(1); + } + fprintf(fp, "#include <sys/param.h>\n"); + fprintf(fp, "#include <sys/conf.h>\n"); + fprintf(fp, "\n"); + /* + * If there aren't any swap devices + * specified, just return, the error + * has already been noted. + */ + swap = fl->f_next; + if (swap == 0 || swap->f_type != SWAPSPEC) { + (void) unlink(path(swapname)); + fclose(fp); + return (swap); + } + fprintf(fp, "dev_t\trootdev = makedev(%d, %d);\n", + major(fl->f_rootdev), minor(fl->f_rootdev)); + fprintf(fp, "dev_t\targdev = makedev(%d, %d);\n", + major(fl->f_argdev), minor(fl->f_argdev)); + fprintf(fp, "dev_t\tdumpdev = makedev(%d, %d);\n", + major(fl->f_dumpdev), minor(fl->f_dumpdev)); + fprintf(fp, "\n"); + fprintf(fp, "struct\tswdevt swdevt[] = {\n"); + do { + dev = swap->f_swapdev; + fprintf(fp, "\t{ makedev(%d, %d),\t0,\t%d },\t/* %s */\n", + major(dev), minor(dev), swap->f_swapsize, swap->f_fn); + swap = swap->f_next; + } while (swap && swap->f_type == SWAPSPEC); + fprintf(fp, "\t{ 0, 0, 0 }\n"); + fprintf(fp, "};\n"); + if (machine == MACHINE_MIPSY || machine == MACHINE_MIPS) { + fprintf(fp, "\nsetconf()\n"); + fprintf(fp, "{\n"); + fprintf(fp, "\t/* resolve reference for non-generic kernels */\n"); + fprintf(fp, "}\n"); + } + fclose(fp); + return (swap); +} + +static int devtablenotread = 1; +static struct devdescription { + char *dev_name; + int dev_major; + struct devdescription *dev_next; +} *devtable; + +/* + * Given a device name specification figure out: + * major device number + * partition + * device name + * unit number + * This is a hack, but the system still thinks in + * terms of major/minor instead of string names. 
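+ * (Editor's note: a hypothetical "sd0a" splits into device "sd",
+ * unit 0 and partition 'a'; the major number comes from the
+ * devices.<machinename> table that initdevtable() reads below.)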
+ */ +dev_t +nametodev(char *name, int defunit, char defpartition) +{ + char *cp, partition; + int unit; + register struct devdescription *dp; + + cp = name; + if (cp == 0) { + fprintf(stderr, "config: internal error, nametodev\n"); + exit(1); + } + while (*cp && !isdigit(*cp)) + cp++; + unit = *cp ? atoi(cp) : defunit; + if (unit < 0 || unit > 31) { + fprintf(stderr, +"config: %s: invalid device specification, unit out of range\n", name); + unit = defunit; /* carry on more checking */ + } + if (*cp) { + *cp++ = '\0'; + while (*cp && isdigit(*cp)) + cp++; + } + partition = *cp ? *cp : defpartition; + if (partition < 'a' || partition > 'h') { + fprintf(stderr, +"config: %c: invalid device specification, bad partition\n", *cp); + partition = defpartition; /* carry on */ + } + if (devtablenotread) + initdevtable(); + for (dp = devtable; dp->dev_next; dp = dp->dev_next) + if (eq(name, dp->dev_name)) + break; + if (dp == 0) { + fprintf(stderr, "config: %s: unknown device\n", name); + return (NODEV); + } + return (makedev(dp->dev_major, (unit << DEV_SHIFT) + (partition - 'a'))); +} + +char * +devtoname(dev_t dev) +{ + char buf[80]; + register struct devdescription *dp; + + if (devtablenotread) + initdevtable(); + for (dp = devtable; dp->dev_next; dp = dp->dev_next) + if (major(dev) == dp->dev_major) + break; + if (dp == 0) + dp = devtable; + (void) sprintf(buf, "%s%d%c", dp->dev_name, + minor(dev) >> DEV_SHIFT, (minor(dev) & DEV_MASK) + 'a'); + return (ns(buf)); +} + +void +initdevtable(void) +{ + char buf[BUFSIZ]; + char line[BUFSIZ]; + int maj; + register struct devdescription **dp = &devtable; + FILE *fp; + + (void) sprintf(buf, "%s/devices.%s", config_directory, machinename); + fp = fopenp(VPATH, buf, line, "r"); + if (fp == NULL) { + fprintf(stderr, "config: can't open %s\n", buf); + exit(1); + } + while (fgets(line, BUFSIZ, fp) != 0) { + if (*line == '#' || *line == '\n') + continue; + if (sscanf(line, "%s\t%d\n", buf, &maj) != 2) + break; + *dp = (struct devdescription *)malloc(sizeof (**dp)); + (*dp)->dev_name = ns(buf); + (*dp)->dev_major = maj; + dp = &(*dp)->dev_next; + } + *dp = 0; + fclose(fp); + devtablenotread = 0; +} diff --git a/SETUP/config/openp.c b/SETUP/config/openp.c new file mode 100644 index 000000000..c05cd9daf --- /dev/null +++ b/SETUP/config/openp.c @@ -0,0 +1,93 @@ +/* + * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights + * Reserved. This file contains Original Code and/or Modifications of + * Original Code as defined in and that are subject to the Apple Public + * Source License Version 1.0 (the 'License'). You may not use this file + * except in compliance with the License. Please obtain a copy of the + * License at http://www.apple.com/publicsource and read it before using + * this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License." 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ +/* openp, fopenp -- search pathlist and open file + * + * Usage: + * i = openp (path,file,complete,flags,mode) + * f = fopenp (path,file,complete,type) + * int i,flags,mode; + * FILE *f; + * char *path,*file,*complete,*type; + * + * Openp searches for "file" in the pathlist "path"; + * when the file is found and can be opened by open() + * with the specified "flags" and "mode", then the full filename + * is copied into "complete" and openp returns the file + * descriptor. If no such file is found, openp returns -1. + * Fopenp performs the same function, using fopen() instead + * of open() and type instead of flags/mode; it returns 0 if no + * file is found. + * + * HISTORY + * 30-Apr-85 Steven Shafer (sas) at Carnegie-Mellon University + * Adapted for 4.2 BSD UNIX. Added new parameter to openp.c; + * changed names of flags, mode, and type parameters to reflect + * current manual entries for open and fopen. + * + * 20-Nov-79 Steven Shafer (sas) at Carnegie-Mellon University + * Created for VAX. + * + */ + +#include <stdio.h> +#include <fcntl.h> /* open */ +#include "config.h" + + +int openp(const char *fpath, char *file, char *complete, int flags, int mode); + +static int flgs,mod,value; +static const char *ftyp; +static FILE *fvalue; + +static int +func(char *fnam) +{ + value = open (fnam,flgs,mod); + return (value < 0); +} + +static int +ffunc(char *fnam) +{ + fvalue = fopen (fnam,ftyp); + return (fvalue == 0); +} + +int +openp(const char *fpath, char *file, char *complete, int flags, int mode) +{ + flgs = flags; + mod = mode; + if (searchp(fpath,file,complete,func) < 0) return (-1); + return (value); +} + +FILE * +fopenp(const char *fpath, char *file, char *complete, const char *ftype) +{ + ftyp = ftype; + if (searchp(fpath,file,complete,ffunc) < 0) return (0); + return (fvalue); +} diff --git a/SETUP/config/parser.y b/SETUP/config/parser.y new file mode 100644 index 000000000..4f77b93e4 --- /dev/null +++ b/SETUP/config/parser.y @@ -0,0 +1,1278 @@ +/* + * Mach Operating System + * Copyright (c) 1990 Carnegie-Mellon University + * Copyright (c) 1989 Carnegie-Mellon University + * Copyright (c) 1988 Carnegie-Mellon University + * Copyright (c) 1987 Carnegie-Mellon University + * All rights reserved. The CMU software License Agreement specifies + * the terms and conditions for use and redistribution. + */ + +/* + * Copyright (c) 1988 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms are permitted + * provided that the above copyright notice and this paragraph are + * duplicated in all such forms and that any documentation, + * advertising materials, and other materials related to such + * distribution and use acknowledge that the software was developed + * by the University of California, Berkeley. The name of the + * University may not be used to endorse or promote products derived + * from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * + * @(#)config.y 5.8 (Berkeley) 6/18/88 + */ + +%union { + char *str; + int val; + struct file_list *file; + struct idlst *lst; +} + +%token ADDRMOD +%token AND +%token ANY +%token ARGS +%token AT +%token BIN +%token BUILDDIR +%token COMMA +%token CONFIG +%token CONFIGDIR +%token CONTROLLER +%token CPU +%token CSR +%token DEVICE +%token DISK +%token DRIVE +%token DST +%token DUMPS +%token EQUALS +%token FLAGS +%token HZ +%token IDENT +%token INIT +%token MACHINE +%token MAJOR +%token MASTER +%token MAXUSERS +%token MAXDSIZ +%token MBA +%token MBII +%token MINOR +%token MINUS +%token NEXUS +%token OBJECTDIR +%token ON +%token OPTIONS +%token MAKEOPTIONS +%token PRIORITY +%token PROFILE +%token PSEUDO_DEVICE +%token ROOT +%token SEMICOLON +%token SIZE +%token SLAVE +%token SOURCEDIR +%token SWAP +%token TIMEZONE +%token TRACE +%token UBA +%token VECTOR +%token VME +%token VME16D16 +%token VME24D16 +%token VME32D16 +%token VME16D32 +%token VME24D32 +%token VME32D32 + +/* following 3 are unique to CMU */ +%token LUN +%token SLOT +%token TAPE + +%token <str> ID +%token <val> NUMBER +%token <val> FPNUMBER + +%type <str> Save_id +%type <str> Opt_value +%type <str> Dev +%type <lst> Id_list +%type <val> optional_size +%type <str> device_name +%type <val> major_minor +%type <val> arg_device_spec +%type <val> root_device_spec +%type <val> dump_device_spec +%type <file> swap_device_spec +%type <val> Value + +%{ + +#include "config.h" +#include <ctype.h> +#include <stdio.h> + +struct device cur; +struct device *curp = 0; +char *temp_id; +char *val_id; +/* char *malloc(); */ + +int yylex(void); + +int finddev(dev_t dev); +int alreadychecked(dev_t dev, dev_t list[], dev_t *last); +void deverror(const char *systemname, const char *devtype); +void mkconf(char *sysname); +struct file_list *newswap(void); +void mkswap(struct file_list *syslist, struct file_list *fl, int size); +struct device *huhcon(const char *dev); +void check_nexus(struct device *dev, int num); +void check_slot(struct device *dev, int num); +void checksystemspec(struct file_list *fl); +void verifysystemspecs(void); +dev_t *verifyswap(struct file_list *fl, dev_t checked[], dev_t *pchecked); +struct device *dconnect(const char *dev, int num); + +%} +%% +Configuration: + Many_specs + { verifysystemspecs(); } + ; + +Many_specs: + Many_specs Spec + | + /* lambda */ + ; + +Spec: + Device_spec SEMICOLON + { newdev(&cur); } | + Config_spec SEMICOLON + | + TRACE SEMICOLON + { do_trace = !do_trace; } | + SEMICOLON + | + error SEMICOLON + ; + +Config_spec: + MACHINE Save_id + { + if (!strcmp($2, "vax")) { + machine = MACHINE_VAX; + machinename = "vax"; + } else if (!strcmp($2, "sun")) { + /* default to Sun 3 */ + machine = MACHINE_SUN3; + machinename = "sun3"; + } else if (!strcmp($2, "sun2")) { + machine = MACHINE_SUN2; + machinename = "sun2"; + } else if (!strcmp($2, "sun3")) { + machine = MACHINE_SUN3; + machinename = "sun3"; + } else if (!strcmp($2, "sun4")) { + machine = MACHINE_SUN4; + machinename = "sun4"; + } else if (!strcmp($2, "romp")) { + machine = MACHINE_ROMP; + machinename = "romp"; + } else if (!strcmp($2, "ca")) { + machine = MACHINE_ROMP; + machinename = "ca"; + } else if (!strcmp($2, "mmax")) { + machine = MACHINE_MMAX; + machinename = "mmax"; + } else if (!strcmp($2, "sqt")) { + machine = MACHINE_SQT; + machinename = "sqt"; + } else if (!strcmp($2, "i")) { + machine = MACHINE_I386; + machinename = "i386"; + } else if (!strcmp($2, "i386")) { + machine = MACHINE_I386; + machinename = "i386"; + } else if (!strcmp($2, "ix")) { + 
machine = MACHINE_IX; + machinename = "ix"; + } else if (!strcmp($2, "mipsy")) { + machine = MACHINE_MIPSY; + machinename = "mipsy"; + } else if (!strcmp($2, "mips")) { + machine = MACHINE_MIPS; + machinename = "mips"; + } else if (!strcmp($2, "i860")) { + machine = MACHINE_I860; + machinename = "i860"; + } else if (!strcmp($2, "m68k")) { + machine = MACHINE_M68K; + machinename = "m68k"; + } else if (!strcmp($2, "m88k")) { + machine = MACHINE_M88K; + machinename = "m88k"; + } else if (!strcmp($2, "m98k")) { + machine = MACHINE_M98K; + machinename = "m98k"; + } else if (!strcmp($2, "hppa")) { + machine = MACHINE_HPPA; + machinename = "hppa"; + } else if (!strcmp($2, "sparc")) { + machine = MACHINE_SPARC; + machinename = "sparc"; + } else if (!strcmp($2, "ppc")) { + machine = MACHINE_PPC; + machinename = "ppc"; + } else if (!strcmp($2, "arm")) { + machine = MACHINE_ARM; + machinename = "arm"; + } else if (!strcmp($2, "x86_64")) { + machine = MACHINE_X86_64; + machinename = "x86_64"; + } else + yyerror("Unknown machine type"); + } | + CPU Save_id + { + struct cputype *cp = + (struct cputype *)malloc(sizeof (struct cputype)); + cp->cpu_name = ns($2); + cp->cpu_next = cputype; + cputype = cp; + free(temp_id); + } | + OPTIONS Opt_list + | + MAKEOPTIONS Mkopt_list + | + IDENT ID + { ident = ns($2); } + | + System_spec + | + MAXUSERS NUMBER + { maxusers = $2; } + | + BUILDDIR Save_id + { build_directory = ns($2); } + | + CONFIGDIR Save_id + { config_directory = ns($2); } + | + OBJECTDIR Save_id + { object_directory = ns($2); } + | + SOURCEDIR Save_id + { source_directory = ns($2); } + | + PROFILE + { profiling++; } + ; + +System_spec: + System_id + { checksystemspec(*confp); } + | System_id System_parameter_list + { checksystemspec(*confp); } + ; + +System_id: + CONFIG Save_id + { mkconf($2); } + ; + +System_parameter_list: + System_parameter_list System_parameter + | System_parameter + ; + +System_parameter: + swap_spec + | root_spec + | dump_spec + | arg_spec + ; + +swap_spec: + SWAP optional_on swap_device_list + ; + +swap_device_list: + swap_device_list AND swap_device + | swap_device + ; + +swap_device: + swap_device_spec optional_size + { mkswap(*confp, $1, $2); } + ; + +swap_device_spec: + device_name + { + struct file_list *fl = newswap(); + + if (eq($1, "generic")) + fl->f_fn = $1; + else { + fl->f_swapdev = nametodev($1, 0, 'b'); + fl->f_fn = devtoname(fl->f_swapdev); + } + $$ = fl; + } + | major_minor + { + struct file_list *fl = newswap(); + + fl->f_swapdev = $1; + fl->f_fn = devtoname($1); + $$ = fl; + } + ; + +root_spec: + ROOT optional_on root_device_spec + { + struct file_list *fl = *confp; + + if (fl && fl->f_rootdev != NODEV) + yyerror("extraneous root device specification"); + else + fl->f_rootdev = $3; + } + ; + +root_device_spec: + device_name + { $$ = nametodev($1, 0, 'a'); } + | major_minor + ; + +dump_spec: + DUMPS optional_on dump_device_spec + { + struct file_list *fl = *confp; + + if (fl && fl->f_dumpdev != NODEV) + yyerror("extraneous dump device specification"); + else + fl->f_dumpdev = $3; + } + + ; + +dump_device_spec: + device_name + { $$ = nametodev($1, 0, 'b'); } + | major_minor + ; + +arg_spec: + ARGS optional_on arg_device_spec + { + struct file_list *fl = *confp; + + if (fl && fl->f_argdev != NODEV) + yyerror("extraneous arg device specification"); + else + fl->f_argdev = $3; + } + ; + +arg_device_spec: + device_name + { $$ = nametodev($1, 0, 'b'); } + | major_minor + ; + +major_minor: + MAJOR NUMBER MINOR NUMBER + { $$ = makedev($2, $4); } + ; + +optional_on: 
+ ON + | /* empty */ + ; + +optional_size: + SIZE NUMBER + { $$ = $2; } + | /* empty */ + { $$ = 0; } + ; + +device_name: + Save_id + { $$ = $1; } + | Save_id NUMBER + { + char buf[80]; + + (void) sprintf(buf, "%s%d", $1, $2); + $$ = ns(buf); free($1); + } + | Save_id NUMBER ID + { + char buf[80]; + + (void) sprintf(buf, "%s%d%s", $1, $2, $3); + $$ = ns(buf); free($1); + } + ; + +Opt_list: + Opt_list COMMA Option + | + Option + ; + +Option: + Save_id + { + struct opt *op = (struct opt *)malloc(sizeof (struct opt)); + op->op_name = ns($1); + op->op_next = (struct opt *) 0; + op->op_value = 0; + if (opt == (struct opt *) 0) + opt = op; + else + opt_tail->op_next = op; + opt_tail = op; + free(temp_id); + } | + Save_id EQUALS Opt_value + { + struct opt *op = (struct opt *)malloc(sizeof (struct opt)); + op->op_name = ns($1); + op->op_next = (struct opt *) 0; + op->op_value = ns($3); + if (opt == (struct opt *) 0) + opt = op; + else + opt_tail->op_next = op; + opt_tail = op; + free(temp_id); + if (val_id) + free(val_id); + } ; + +Opt_value: + ID + { $$ = val_id = ns($1); } | + NUMBER + { char nb[16]; + (void) sprintf(nb, "%u", $1); + $$ = val_id = ns(nb); + } | + /* lambda from MIPS -- WHY */ + { $$ = val_id = ns(""); } + ; + +Save_id: + ID + { $$ = temp_id = ns($1); } + ; + +Mkopt_list: + Mkopt_list COMMA Mkoption + | + Mkoption + ; + +Mkoption: + Save_id + { + struct opt *op = (struct opt *)malloc(sizeof (struct opt)); + op->op_name = ns($1); + op->op_next = (struct opt *) 0; + op->op_value = 0; + mkopt = op; + free(temp_id); + } | + Save_id EQUALS Opt_value + { + struct opt *op = (struct opt *)malloc(sizeof (struct opt)); + op->op_name = ns($1); + op->op_next = (struct opt *) 0; + op->op_value = ns($3); + if (mkopt == (struct opt *) 0) + mkopt = op; + else + mkopt_tail->op_next = op; + mkopt_tail = op; + free(temp_id); + if (val_id) + free(val_id); + } ; + +Dev: + UBA + { $$ = ns("uba"); } | + MBA + { $$ = ns("mba"); } | + VME16D16 + { + if (machine != MACHINE_SUN2 && machine != MACHINE_SUN3 + && machine != MACHINE_SUN4) + yyerror("wrong machine type for vme16d16"); + $$ = ns("vme16d16"); + } | + VME24D16 + { + if (machine != MACHINE_SUN2 && machine != MACHINE_SUN3 + && machine != MACHINE_SUN4) + yyerror("wrong machine type for vme24d16"); + $$ = ns("vme24d16"); + } | + VME32D16 + { + if (machine != MACHINE_SUN3 && machine != MACHINE_SUN4) + + yyerror("wrong machine type for vme32d16"); + $$ = ns("vme32d16"); + } | + VME16D32 + { + if (machine != MACHINE_SUN3 && machine != MACHINE_SUN4) + yyerror("wrong machine type for vme16d32"); + $$ = ns("vme16d32"); + } | + VME24D32 + { + if (machine != MACHINE_SUN3 && machine != MACHINE_SUN4) + yyerror("wrong machine type for vme24d32"); + $$ = ns("vme24d32"); + } | + VME32D32 + { + if (machine != MACHINE_SUN3 && machine != MACHINE_SUN4) + yyerror("wrong machine type for vme32d32"); + $$ = ns("vme32d32"); + } | + VME + { + if (machine != MACHINE_MIPSY && machine != MACHINE_MIPS) + yyerror("wrong machine type for vme"); + $$ = ns("vme"); + } | + MBII + { + if (machine != MACHINE_MIPSY && machine != MACHINE_MIPS) + yyerror("wrong machine type for mbii"); + $$ = ns("mbii"); + } | + ID + { $$ = ns($1); } + ; + +Device_spec: + DEVICE Dev_name Dev_info Int_spec + { cur.d_type = DEVICE; } | + MASTER Dev_name Dev_info Int_spec + { cur.d_type = MASTER; } | + DISK Dev_name Dev_info Int_spec + { cur.d_dk = 1; cur.d_type = DEVICE; } | +/* TAPE rule is unique to CMU */ + TAPE Dev_name Dev_info Int_spec + { cur.d_type = DEVICE; } | + CONTROLLER Dev_name Dev_info 
Int_spec + { cur.d_type = CONTROLLER; } | + PSEUDO_DEVICE Init_dev Dev + { + cur.d_name = $3; + cur.d_type = PSEUDO_DEVICE; + } | + PSEUDO_DEVICE Init_dev Dev NUMBER + { + cur.d_name = $3; + cur.d_type = PSEUDO_DEVICE; + cur.d_slave = $4; + } | + PSEUDO_DEVICE Init_dev Dev INIT ID + { + cur.d_name = $3; + cur.d_type = PSEUDO_DEVICE; + cur.d_init = ns($5); + } | + PSEUDO_DEVICE Init_dev Dev NUMBER INIT ID + { + cur.d_name = $3; + cur.d_type = PSEUDO_DEVICE; + cur.d_slave = $4; + cur.d_init = ns($6); + }; + +Dev_name: + Init_dev Dev NUMBER + { + cur.d_name = $2; + if (eq($2, "mba")) + seen_mba = 1; + else if (eq($2, "uba")) + seen_uba = 1; + else if (eq($2, "mbii")) + seen_mbii = 1; + else if (eq($2, "vme")) + seen_vme = 1; + cur.d_unit = $3; + }; + +Init_dev: + /* lambda */ + { init_dev(&cur); }; + +Dev_info: + Con_info Info_list + | + /* lambda */ + ; + +Con_info: + AT Dev NUMBER + { + if (eq(cur.d_name, "mba") || eq(cur.d_name, "uba") + || eq(cur.d_name, "mbii") || eq(cur.d_name, "vme")) { + (void) sprintf(errbuf, + "%s must be connected to a nexus", cur.d_name); + yyerror(errbuf); + } + cur.d_conn = dconnect($2, $3); + if (machine == MACHINE_SQT) + dev_param(&cur, "index", cur.d_unit); + } | +/* AT SLOT NUMBER rule is unique to CMU */ + AT SLOT NUMBER + { + check_slot(&cur, $3); + cur.d_addr = $3; + cur.d_conn = TO_SLOT; + } | + AT NEXUS NUMBER + { check_nexus(&cur, $3); cur.d_conn = TO_NEXUS; }; + +Info_list: + Info_list Info + | + /* lambda */ + ; + +Info: + CSR NUMBER + { + cur.d_addr = $2; + if (machine == MACHINE_SQT) { + dev_param(&cur, "csr", $2); + } + } | + DRIVE NUMBER + { + cur.d_drive = $2; + if (machine == MACHINE_SQT) { + dev_param(&cur, "drive", $2); + } + } | + SLAVE NUMBER + { + if (cur.d_conn != 0 && cur.d_conn != TO_NEXUS && + cur.d_conn->d_type == MASTER) + cur.d_slave = $2; + else + yyerror("can't specify slave--not to master"); + } | +/* MIPS */ + ADDRMOD NUMBER + { cur.d_addrmod = $2; } | +/* LUN NUMBER rule is unique to CMU */ + LUN NUMBER + { + if ((cur.d_conn != 0) && (cur.d_conn != TO_SLOT) && + (cur.d_conn->d_type == CONTROLLER)) { + cur.d_addr = $2; + } + else { + yyerror("device requires controller card"); + } + } | + FLAGS NUMBER + { + cur.d_flags = $2; + if (machine == MACHINE_SQT) { + dev_param(&cur, "flags", $2); + } + } | + BIN NUMBER + { + if (machine != MACHINE_SQT) + yyerror("bin specification only valid on Sequent Balance"); + if ($2 < 1 || $2 > 7) + yyerror("bogus bin number"); + else { + cur.d_bin = $2; + dev_param(&cur, "bin", $2); + } + } | + Dev Value + { + if (machine != MACHINE_SQT) + yyerror("bad device spec"); + dev_param(&cur, $1, $2); + }; + +Value: + NUMBER + | + MINUS NUMBER + { $$ = -($2); } + ; + +Int_spec: + Vec_spec + { cur.d_pri = 0; } | + PRIORITY NUMBER + { cur.d_pri = $2; } | + PRIORITY NUMBER Vec_spec + { cur.d_pri = $2; } | + Vec_spec PRIORITY NUMBER + { cur.d_pri = $3; } | + /* lambda */ + ; + +Vec_spec: + VECTOR Id_list + { cur.d_vec = $2; }; + + +Id_list: + Save_id + { + struct idlst *a = (struct idlst *)malloc(sizeof(struct idlst)); + a->id = $1; a->id_next = 0; $$ = a; + a->id_vec = 0; + } | + Save_id Id_list + { + struct idlst *a = (struct idlst *)malloc(sizeof(struct idlst)); + a->id = $1; a->id_next = $2; $$ = a; + a->id_vec = 0; + } | + Save_id NUMBER + { + struct idlst *a = (struct idlst *)malloc(sizeof(struct idlst)); + a->id_next = 0; a->id = $1; $$ = a; + a->id_vec = $2; + } | + Save_id NUMBER Id_list + { + struct idlst *a = (struct idlst *)malloc(sizeof(struct idlst)); + a->id_next = $3; a->id = $1; $$ = a; + 
a->id_vec = $2; + }; + +%% + +void +yyerror(const char *s) +{ + fprintf(stderr, "config: line %d: %s\n", yyline, s); +} + +/* + * return the passed string in a new space + */ +char * +ns(const char *str) +{ + register char *cp; + + cp = malloc((unsigned)(strlen(str)+1)); + (void) strcpy(cp, str); + return (cp); +} + +/* + * add a device to the list of devices + */ +void +newdev(struct device *dp) +{ + register struct device *np; + + np = (struct device *) malloc(sizeof *np); + *np = *dp; + if (curp == 0) + dtab = np; + else + curp->d_next = np; + curp = np; + curp->d_next = 0; +} + +/* + * note that a configuration should be made + */ +void +mkconf(char *sysname) +{ + register struct file_list *fl, **flp; + + fl = (struct file_list *) malloc(sizeof *fl); + fl->f_type = SYSTEMSPEC; + fl->f_needs = sysname; + fl->f_rootdev = NODEV; + fl->f_argdev = NODEV; + fl->f_dumpdev = NODEV; + fl->f_fn = 0; + fl->f_next = 0; + for (flp = confp; *flp; flp = &(*flp)->f_next) + ; + *flp = fl; + confp = flp; +} + +struct file_list * +newswap(void) +{ + struct file_list *fl = (struct file_list *)malloc(sizeof (*fl)); + + fl->f_type = SWAPSPEC; + fl->f_next = 0; + fl->f_swapdev = NODEV; + fl->f_swapsize = 0; + fl->f_needs = 0; + fl->f_fn = 0; + return (fl); +} + +/* + * Add a swap device to the system's configuration + */ +void +mkswap(struct file_list *syslist, struct file_list *fl, int size) +{ + register struct file_list **flp; + + if (syslist == 0 || syslist->f_type != SYSTEMSPEC) { + yyerror("\"swap\" spec precedes \"config\" specification"); + return; + } + if (size < 0) { + yyerror("illegal swap partition size"); + return; + } + /* + * Append swap description to the end of the list. + */ + flp = &syslist->f_next; + for (; *flp && (*flp)->f_type == SWAPSPEC; flp = &(*flp)->f_next) + ; + fl->f_next = *flp; + *flp = fl; + fl->f_swapsize = size; + /* + * If first swap device for this system, + * set up f_fn field to insure swap + * files are created with unique names. + */ + if (syslist->f_fn) + return; + if (eq(fl->f_fn, "generic")) + syslist->f_fn = ns(fl->f_fn); + else + syslist->f_fn = ns(syslist->f_needs); +} + +/* + * find the pointer to connect to the given device and number. + * returns 0 if no such device and prints an error message + */ +struct device * +dconnect(const char *dev, int num) +{ + register struct device *dp; + + if (num == QUES) + return (huhcon(dev)); + for (dp = dtab; dp != 0; dp = dp->d_next) { + if ((num != dp->d_unit) || !eq(dev, dp->d_name)) + continue; + if (dp->d_type != CONTROLLER && dp->d_type != MASTER) { + (void) sprintf(errbuf, + "%s connected to non-controller", dev); + yyerror(errbuf); + return (0); + } + return (dp); + } + (void) sprintf(errbuf, "%s %d not defined", dev, num); + yyerror(errbuf); + return (0); +} + +/* + * connect to an unspecific thing + */ +struct device * +huhcon(const char *dev) +{ + register struct device *dp, *dcp; + struct device rdev; /* only used if dp is NULL */ + int oldtype; + + memset(&rdev, 0, sizeof rdev); + + /* + * First make certain that there are some of these to wildcard on + */ + for (dp = dtab; dp != 0; dp = dp->d_next) + if (eq(dp->d_name, dev)) + break; + if (dp == 0) { + (void) sprintf(errbuf, "no %s's to wildcard", dev); + yyerror(errbuf); + return (0); + } + oldtype = dp->d_type; + dcp = dp->d_conn; + /* + * Now see if there is already a wildcard entry for this device + * (e.g. 
Search for a "uba ?") + */ + for (; dp != 0; dp = dp->d_next) + if (eq(dev, dp->d_name) && dp->d_unit == -1) + break; + /* + * If there isn't, make one because everything needs to be connected + * to something. + */ + if (dp == 0) { + dp = &rdev; + init_dev(dp); + dp->d_unit = QUES; + dp->d_name = ns(dev); + dp->d_type = oldtype; + newdev(dp); + dp = curp; + /* + * Connect it to the same thing that other similar things are + * connected to, but make sure it is a wildcard unit + * (e.g. up connected to sc ?, here we make connect sc? to a + * uba?). If other things like this are on the NEXUS or + * if they aren't connected to anything, then make the same + * connection, else call ourself to connect to another + * unspecific device. + */ + if (dcp == TO_NEXUS || dcp == 0) + dp->d_conn = dcp; + else + dp->d_conn = dconnect(dcp->d_name, QUES); + } + return (dp); +} + +void +init_dev(struct device *dp) +{ + + dp->d_name = "OHNO!!!"; + dp->d_type = DEVICE; + dp->d_conn = 0; + dp->d_vec = 0; + dp->d_addr = dp->d_pri = dp->d_flags = dp->d_dk = 0; + dp->d_slave = dp->d_drive = dp->d_unit = UNKNOWN; + if (machine == MACHINE_SUN2 || machine == MACHINE_SUN3 + || machine == MACHINE_SUN4){ + dp->d_addr = UNKNOWN; + dp->d_mach = dp->d_bus = 0; + } + if (machine == MACHINE_MIPSY || machine == MACHINE_MIPS){ + dp->d_addrmod = 0; + } + dp->d_init = 0; +} + +/* + * make certain that this is a reasonable type of thing to connect to a nexus + */ +void +check_nexus(struct device *dev, int num) +{ + + switch (machine) { + + case MACHINE_VAX: + if (!eq(dev->d_name, "uba") && !eq(dev->d_name, "mba")) + yyerror("only uba's and mba's should be connected to the nexus"); + if (num != QUES) + yyerror("can't give specific nexus numbers"); + break; + + case MACHINE_SUN: + if (!eq(dev->d_name, "mb")) + yyerror("only mb's should be connected to the nexus"); + break; + + case MACHINE_ROMP: + if (!eq(dev->d_name, "iocc")) + yyerror("only iocc's should be connected to the nexus"); + break; + case MACHINE_SUN2: + if (!eq(dev->d_name, "virtual") && + !eq(dev->d_name, "obmem") && + !eq(dev->d_name, "obio") && + !eq(dev->d_name, "mbmem") && + !eq(dev->d_name, "mbio") && + !eq(dev->d_name, "vme16d16") && + !eq(dev->d_name, "vme24d16")) { + (void)sprintf(errbuf, + "unknown bus type `%s' for nexus connection on %s", + dev->d_name, machinename); + yyerror(errbuf); + } + + case MACHINE_MMAX: + yyerror("don't grok 'nexus' on mmax -- try 'slot'."); + break; + case MACHINE_SUN3: + case MACHINE_SUN4: + if (!eq(dev->d_name, "virtual") && + !eq(dev->d_name, "obmem") && + !eq(dev->d_name, "obio") && + !eq(dev->d_name, "mbmem") && + !eq(dev->d_name, "mbio") && + !eq(dev->d_name, "vme16d16") && + !eq(dev->d_name, "vme24d16") && + !eq(dev->d_name, "vme32d16") && + !eq(dev->d_name, "vme16d32") && + !eq(dev->d_name, "vme24d32") && + !eq(dev->d_name, "vme32d32")) { + (void)sprintf(errbuf, + "unknown bus type `%s' for nexus connection on %s", + dev->d_name, machinename); + yyerror(errbuf); + } + break; + case MACHINE_MIPSY: + case MACHINE_MIPS: + if (!eq(dev->d_name, "vme") && !eq(dev->d_name, "mbii")) + yyerror("only vme's and mbii's should be connected to the nexus"); + if (num != QUES) + yyerror("can't give specific nexus numbers"); + break; + } +} + +/* + * make certain that this is a reasonable type of thing to connect to a slot + */ + +void +check_slot(struct device *dev, int num) +{ + + switch (machine) { + + case MACHINE_MMAX: + if (!eq(dev->d_name, "emc")) + yyerror("only emc's plug into backplane slots."); + if (num == QUES) + 
yyerror("specific slot numbers must be given"); + break; + + case MACHINE_SQT: + if (!eq(dev->d_name, "mbad") && + !eq(dev->d_name, "zdc") && + !eq(dev->d_name, "sec")) { + (void)sprintf(errbuf, + "unknown bus type `%s' for slot on %s", + dev->d_name, machinename); + yyerror(errbuf); + } + break; + + default: + yyerror("don't grok 'slot' for this machine -- try 'nexus'."); + break; + } +} + +/* + * Check system specification and apply defaulting + * rules on root, argument, dump, and swap devices. + */ +void +checksystemspec(struct file_list *fl) +{ + char buf[BUFSIZ]; + register struct file_list *swap; + int generic; + + if (fl == 0 || fl->f_type != SYSTEMSPEC) { + yyerror("internal error, bad system specification"); + exit(1); + } + swap = fl->f_next; + generic = swap && swap->f_type == SWAPSPEC && eq(swap->f_fn, "generic"); + if (fl->f_rootdev == NODEV && !generic) { + yyerror("no root device specified"); + exit(1); + } + /* + * Default swap area to be in 'b' partition of root's + * device. If root specified to be other than on 'a' + * partition, give warning, something probably amiss. + */ + if (swap == 0 || swap->f_type != SWAPSPEC) { + dev_t dev; + + swap = newswap(); + dev = fl->f_rootdev; + if (minor(dev) & DEV_MASK) { + (void) sprintf(buf, +"Warning, swap defaulted to 'b' partition with root on '%c' partition", + (minor(dev) & DEV_MASK) + 'a'); + yyerror(buf); + } + swap->f_swapdev = + makedev(major(dev), (minor(dev) &~ DEV_MASK) | ('b' - 'a')); + swap->f_fn = devtoname(swap->f_swapdev); + mkswap(fl, swap, 0); + } + /* + * Make sure a generic swap isn't specified, along with + * other stuff (user must really be confused). + */ + if (generic) { + if (fl->f_rootdev != NODEV) + yyerror("root device specified with generic swap"); + if (fl->f_argdev != NODEV) + yyerror("arg device specified with generic swap"); + if (fl->f_dumpdev != NODEV) + yyerror("dump device specified with generic swap"); + return; + } + /* + * Default argument device and check for oddball arrangements. + */ + if (fl->f_argdev == NODEV) + fl->f_argdev = swap->f_swapdev; + if (fl->f_argdev != swap->f_swapdev) + yyerror("Warning, arg device different than primary swap"); + /* + * Default dump device and warn if place is not a + * swap area or the argument device partition. + */ + if (fl->f_dumpdev == NODEV) + fl->f_dumpdev = swap->f_swapdev; + if (fl->f_dumpdev != swap->f_swapdev && fl->f_dumpdev != fl->f_argdev) { + struct file_list *p = swap->f_next; + + for (; p && p->f_type == SWAPSPEC; p = p->f_next) + if (fl->f_dumpdev == p->f_swapdev) + return; + (void) sprintf(buf, "Warning, orphaned dump device, %s", + "do you know what you're doing"); + yyerror(buf); + } +} + +/* + * Verify all devices specified in the system specification + * are present in the device specifications. 
+ */ +void +verifysystemspecs(void) +{ + register struct file_list *fl; + dev_t checked[50]; + register dev_t *pchecked = checked; + + for (fl = conf_list; fl; fl = fl->f_next) { + if (fl->f_type != SYSTEMSPEC) + continue; + if (!finddev(fl->f_rootdev)) + deverror(fl->f_needs, "root"); + *pchecked++ = fl->f_rootdev; + pchecked = verifyswap(fl->f_next, checked, pchecked); +#define samedev(dev1, dev2) \ + ((minor(dev1) &~ DEV_MASK) != (minor(dev2) &~ DEV_MASK)) + if (!alreadychecked(fl->f_dumpdev, checked, pchecked)) { + if (!finddev(fl->f_dumpdev)) + deverror(fl->f_needs, "dump"); + *pchecked++ = fl->f_dumpdev; + } + if (!alreadychecked(fl->f_argdev, checked, pchecked)) { + if (!finddev(fl->f_argdev)) + deverror(fl->f_needs, "arg"); + *pchecked++ = fl->f_argdev; + } + } +} + +/* + * Do as above, but for swap devices. + */ +dev_t * +verifyswap(struct file_list *fl, dev_t checked[], dev_t *pchecked) +{ + + for (;fl && fl->f_type == SWAPSPEC; fl = fl->f_next) { + if (eq(fl->f_fn, "generic")) + continue; + if (alreadychecked(fl->f_swapdev, checked, pchecked)) + continue; + if (!finddev(fl->f_swapdev)) + fprintf(stderr, + "config: swap device %s not configured", fl->f_fn); + *pchecked++ = fl->f_swapdev; + } + return (pchecked); +} + +/* + * Has a device already been checked + * for it's existence in the configuration? + */ +int +alreadychecked(dev_t dev, dev_t list[], dev_t *last) +{ + register dev_t *p; + + for (p = list; p < last; p++) + if (samedev(*p, dev)) + return (1); + return (0); +} + +void +deverror(const char *systemname, const char *devtype) +{ + + fprintf(stderr, "config: %s: %s device not configured\n", + systemname, devtype); +} + +/* + * Look for the device in the list of + * configured hardware devices. Must + * take into account stuff wildcarded. + */ +/*ARGSUSED*/ +int +finddev(__unused dev_t dev) +{ + + /* punt on this right now */ + return (1); +} diff --git a/SETUP/config/searchp.c b/SETUP/config/searchp.c new file mode 100644 index 000000000..b79ca6a44 --- /dev/null +++ b/SETUP/config/searchp.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * "Portions Copyright (c) 1999 Apple Computer, Inc. All Rights + * Reserved. This file contains Original Code and/or Modifications of + * Original Code as defined in and that are subject to the Apple Public + * Source License Version 1.0 (the 'License'). You may not use this file + * except in compliance with the License. Please obtain a copy of the + * License at http://www.apple.com/publicsource and read it before using + * this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the + * License for the specific language governing rights and limitations + * under the License." + * + * @APPLE_LICENSE_HEADER_END@ + */ +/* searchp -- search through pathlist for file + * + * Usage: p = searchp (path,file,fullname,func); + * char *p, *path, *file, *fullname; + * int (*func)(); + * + * Searchp will parse "path", a list of pathnames separated + * by colons, prepending each pathname to "file". The resulting + * filename will be passed to "func", a function provided by the + * user. This function must return zero if the search is + * successful (i.e. 
ended), and non-zero if the search must + * continue. If the function returns zero (success), then + * searching stops, the full filename is placed into "fullname", + * and searchp returns 0. If the pathnames are all unsuccessfully + * examined, then searchp returns -1. + * If "file" begins with a slash, it is assumed to be an + * absolute pathname and the "path" list is not used. Note + * that this rule is used by Bell's cc also; whereas Bell's + * sh uses the rule that any filename which CONTAINS a slash + * is assumed to be absolute. The execlp and execvp procedures + * also use this latter rule. In my opinion, this is bogosity. + * + * HISTORY + * 01-Apr-86 Rudy Nedved (ern) at Carnegie-Mellon University + * 4.1BSD system ignores trailing slashes. 4.2BSD does not. + * Therefore don't add a seperating slash if there is a null + * filename. + * + * 23-Oct-82 Steven Shafer (sas) at Carnegie-Mellon University + * Fixed two bugs: (1) calling function as "func" instead of + * "(*func)", (2) omitting trailing null name implied by trailing + * colon in path. Latter bug fixed by introducing "lastchar" and + * changing final loop test to look for "*lastchar" instead of + * "*nextpath". + * + * 20-Nov-79 Steven Shafer (sas) at Carnegie-Mellon University + * Created for VAX. If you're thinking of using this, you probably + * should look at openp() and fopenp() (or the "want..." routines) + * instead. + * + */ +#include "config.h" + +int +searchp(const char *spath, char *file, char *fullname, int (*func)(char *)) +{ + const char *nextpath, *nextchar, *lastchar; + char *fname; + int failure; + + nextpath = ((*file == '/') ? "" : spath); + do { + fname = fullname; + nextchar = nextpath; + while (*nextchar && (*nextchar != ':')) + *fname++ = *nextchar++; + if (nextchar != nextpath && *file) *fname++ = '/'; + lastchar = nextchar; + nextpath = ((*nextchar) ? nextchar + 1 : nextchar); + nextchar = file; /* append file */ + while (*nextchar) *fname++ = *nextchar++; + *fname = '\0'; + failure = (*func) (fullname); + } + while (failure && (*lastchar)); + return (failure ? -1 : 0); +} diff --git a/SETUP/kextsymboltool/Makefile b/SETUP/kextsymboltool/Makefile new file mode 100644 index 000000000..137f253d2 --- /dev/null +++ b/SETUP/kextsymboltool/Makefile @@ -0,0 +1,31 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + + +include $(MakeInc_cmd) +include $(MakeInc_def) + +OBJS = kextsymboltool.o + +CFLAGS = -isysroot $(HOST_SDKROOT) -g -O0 -I$(SOURCE) -I. + +WARNFLAGS = -Wall + +LDFLAGS = -isysroot $(HOST_SDKROOT) -lstdc++ + +kextsymboltool: $(OBJS) + $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^ + @echo HOST_LD $@ + $(_v)$(HOST_CODESIGN) -s - $@ + @echo HOST_CODESIGN $@ + +.c.o: + $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $< + @echo HOST_CC $@ + +do_build_setup: kextsymboltool + +include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/SETUP/kextsymboltool/kextsymboltool.c b/SETUP/kextsymboltool/kextsymboltool.c new file mode 100644 index 000000000..ee46713e4 --- /dev/null +++ b/SETUP/kextsymboltool/kextsymboltool.c @@ -0,0 +1,912 @@ +/* + * Copyright (c) 2006 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). 
You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +#include <libc.h> +#include <errno.h> +#include <ctype.h> + +#include <sys/stat.h> +#include <sys/file.h> +#include <sys/mman.h> + +#include <mach-o/arch.h> +#include <mach-o/fat.h> +#include <mach-o/loader.h> +#include <mach-o/nlist.h> +#include <mach-o/swap.h> + +#include <uuid/uuid.h> + +#include <IOKit/IOTypes.h> + +#pragma mark Typedefs, Enums, Constants +/********************************************************************* +* Typedefs, Enums, Constants +*********************************************************************/ +typedef enum { + kErrorNone = 0, + kError, + kErrorFileAccess, + kErrorDiskFull, + kErrorDuplicate +} ToolError; + +#pragma mark Function Protos +/********************************************************************* +* Function Protos +*********************************************************************/ +__private_extern__ ToolError +readFile(const char *path, vm_offset_t * objAddr, vm_size_t * objSize); + +__private_extern__ ToolError +writeFile(int fd, const void * data, size_t length); + +extern char* __cxa_demangle (const char* mangled_name, + char* buf, + size_t* n, + int* status); + +#pragma mark Functions +/********************************************************************* +*********************************************************************/ +__private_extern__ ToolError +writeFile(int fd, const void * data, size_t length) +{ + ToolError err; + + if (length != (size_t)write(fd, data, length)) + err = kErrorDiskFull; + else + err = kErrorNone; + + if (kErrorNone != err) + perror("couldn't write output"); + + return( err ); +} + +/********************************************************************* +*********************************************************************/ +__private_extern__ ToolError +readFile(const char *path, vm_offset_t * objAddr, vm_size_t * objSize) +{ + ToolError err = kErrorFileAccess; + int fd; + struct stat stat_buf; + + *objAddr = 0; + *objSize = 0; + + do + { + if((fd = open(path, O_RDONLY)) == -1) + continue; + + if(fstat(fd, &stat_buf) == -1) + continue; + + if (0 == (stat_buf.st_mode & S_IFREG)) + continue; + + /* Don't try to map an empty file, it fails now due to conformance + * stuff (PR 4611502). 
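+ * An empty file is instead reported as a successful read with
+ * *objSize left at zero; main() then skips such files entirely.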
+ */ + if (0 == stat_buf.st_size) { + err = kErrorNone; + continue; + } + + *objSize = stat_buf.st_size; + + *objAddr = (vm_offset_t)mmap(NULL /* address */, *objSize, + PROT_READ|PROT_WRITE, MAP_FILE|MAP_PRIVATE /* flags */, + fd, 0 /* offset */); + + if ((void *)*objAddr == MAP_FAILED) { + *objAddr = 0; + *objSize = 0; + continue; + } + + err = kErrorNone; + + } while( false ); + + if (-1 != fd) + { + close(fd); + } + if (kErrorNone != err) + { + fprintf(stderr, "couldn't read %s: %s\n", path, strerror(errno)); + } + + return( err ); +} + + +enum { kExported = 0x00000001, kObsolete = 0x00000002 }; + +struct symbol { + char * name; + unsigned int name_len; + char * indirect; + unsigned int indirect_len; + unsigned int flags; + struct symbol * list; + unsigned int list_count; +}; + +static bool issymchar( char c ) +{ + return ((c > ' ') && (c <= '~') && (c != ':') && (c != '#')); +} + +static bool iswhitespace( char c ) +{ + return ((c == ' ') || (c == '\t')); +} + +/* + * Function for qsort for comparing symbol list names. + */ +static int +qsort_cmp(const void * _left, const void * _right) +{ + struct symbol * left = (struct symbol *) _left; + struct symbol * right = (struct symbol *) _right; + + return (strcmp(left->name, right->name)); +} + +/* + * Function for bsearch for finding a symbol name. + */ + +static int +bsearch_cmp( const void * _key, const void * _cmp) +{ + char * key = (char *)_key; + struct symbol * cmp = (struct symbol *) _cmp; + + return(strcmp(key, cmp->name)); +} + +struct bsearch_key +{ + char * name; + unsigned int name_len; +}; + +static int +bsearch_cmp_prefix( const void * _key, const void * _cmp) +{ + struct bsearch_key * key = (struct bsearch_key *)_key; + struct symbol * cmp = (struct symbol *) _cmp; + + return(strncmp(key->name, cmp->name, key->name_len)); +} + +static uint32_t +count_symbols(char * file, vm_size_t file_size) +{ + uint32_t nsyms = 0; + char * scan; + char * eol; + char * next; + + for (scan = file; true; scan = next) { + + eol = memchr(scan, '\n', file_size - (scan - file)); + if (eol == NULL) { + break; + } + next = eol + 1; + + /* Skip empty lines. + */ + if (eol == scan) { + continue; + } + + /* Skip comment lines. + */ + if (scan[0] == '#') { + continue; + } + + /* Scan past any non-symbol characters at the beginning of the line. */ + while ((scan < eol) && !issymchar(*scan)) { + scan++; + } + + /* No symbol on line? Move along. + */ + if (scan == eol) { + continue; + } + + /* Skip symbols starting with '.'. + */ + if (scan[0] == '.') { + continue; + } + nsyms++; + } + + return nsyms; +} + +static uint32_t +store_symbols(char * file, vm_size_t file_size, struct symbol * symbols, uint32_t idx, uint32_t max_symbols) +{ + char * scan; + char * line; + char * eol; + char * next; + + uint32_t strtabsize; + + strtabsize = 0; + + for (scan = file, line = file; true; scan = next, line = next) { + + char * name = NULL; + char * name_term = NULL; + unsigned int name_len = 0; + char * indirect = NULL; + char * indirect_term = NULL; + unsigned int indirect_len = 0; + char * option = NULL; + char * option_term = NULL; + unsigned int option_len = 0; + char optionstr[256]; + boolean_t obsolete = 0; + + eol = memchr(scan, '\n', file_size - (scan - file)); + if (eol == NULL) { + break; + } + next = eol + 1; + + /* Skip empty lines. + */ + if (eol == scan) { + continue; + } + + *eol = '\0'; + + /* Skip comment lines. + */ + if (scan[0] == '#') { + continue; + } + + /* Scan past any non-symbol characters at the beginning of the line. 
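(issymchar() accepts graphic ASCII other than ':' and '#'; each kept line has the form "name [: indirect] [-obsolete]".)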
*/ + while ((scan < eol) && !issymchar(*scan)) { + scan++; + } + + /* No symbol on line? Move along. + */ + if (scan == eol) { + continue; + } + + /* Skip symbols starting with '.'. + */ + if (scan[0] == '.') { + continue; + } + + name = scan; + + /* Find the end of the symbol. + */ + while ((*scan != '\0') && issymchar(*scan)) { + scan++; + } + + /* Note char past end of symbol. + */ + name_term = scan; + + /* Stored length must include the terminating nul char. + */ + name_len = name_term - name + 1; + + /* Now look for an indirect. + */ + if (*scan != '\0') { + while ((*scan != '\0') && iswhitespace(*scan)) { + scan++; + } + if (*scan == ':') { + scan++; + while ((*scan != '\0') && iswhitespace(*scan)) { + scan++; + } + if (issymchar(*scan)) { + indirect = scan; + + /* Find the end of the symbol. + */ + while ((*scan != '\0') && issymchar(*scan)) { + scan++; + } + + /* Note char past end of symbol. + */ + indirect_term = scan; + + /* Stored length must include the terminating nul char. + */ + indirect_len = indirect_term - indirect + 1; + + } else if (*scan == '\0') { + fprintf(stderr, "bad format in symbol line: %s\n", line); + exit(1); + } + } else if (*scan != '\0' && *scan != '-') { + fprintf(stderr, "bad format in symbol line: %s\n", line); + exit(1); + } + } + + /* Look for options. + */ + if (*scan != '\0') { + while ((*scan != '\0') && iswhitespace(*scan)) { + scan++; + } + + if (*scan == '-') { + scan++; + + if (isalpha(*scan)) { + option = scan; + + /* Find the end of the option. + */ + while ((*scan != '\0') && isalpha(*scan)) { + scan++; + } + + /* Note char past end of option. + */ + option_term = scan; + option_len = option_term - option; + + if (option_len >= sizeof(optionstr)) { + fprintf(stderr, "option too long in symbol line: %s\n", line); + exit(1); + } + memcpy(optionstr, option, option_len); + optionstr[option_len] = '\0'; + + /* Find the option. + */ + if (!strncmp(optionstr, "obsolete", option_len)) { + obsolete = TRUE; + } + + } else if (*scan == '\0') { + fprintf(stderr, "bad format in symbol line: %s\n", line); + exit(1); + } + + } + + } + + if(idx >= max_symbols) { + fprintf(stderr, "symbol[%d/%d] overflow: %s\n", idx, max_symbols, line); + exit(1); + } + + *name_term = '\0'; + if (indirect_term) { + *indirect_term = '\0'; + } + + symbols[idx].name = name; + symbols[idx].name_len = name_len; + symbols[idx].indirect = indirect; + symbols[idx].indirect_len = indirect_len; + symbols[idx].flags = (obsolete) ? 
kObsolete : 0; + + strtabsize += symbols[idx].name_len + symbols[idx].indirect_len; + idx++; + } + + return strtabsize; +} + +/********************************************************************* +*********************************************************************/ +int main(int argc, char * argv[]) +{ + ToolError err; + int i, fd; + const char * output_name = NULL; + uint32_t zero = 0, num_files = 0; + uint32_t filenum; + uint32_t strx, strtabsize, strtabpad; + struct symbol * import_symbols; + struct symbol * export_symbols; + uint32_t num_import_syms, num_export_syms; + uint32_t result_count, num_removed_syms; + uint32_t import_idx, export_idx; + const NXArchInfo * host_arch; + const NXArchInfo * target_arch; + boolean_t require_imports = true; + boolean_t diff = false; + + + struct file { + vm_offset_t mapped; + vm_size_t mapped_size; + uint32_t nsyms; + boolean_t import; + const char * path; + }; + struct file files[64]; + + host_arch = NXGetLocalArchInfo(); + target_arch = host_arch; + + for( i = 1; i < argc; i += 2) + { + boolean_t import; + + if (!strcmp("-sect", argv[i])) + { + require_imports = false; + i--; + continue; + } + if (!strcmp("-diff", argv[i])) + { + require_imports = false; + diff = true; + i--; + continue; + } + + if (i == (argc - 1)) + { + fprintf(stderr, "bad arguments: %s\n", argv[i]); + exit(1); + } + + if (!strcmp("-arch", argv[i])) + { + target_arch = NXGetArchInfoFromName(argv[i + 1]); + if (!target_arch) + { + fprintf(stderr, "unknown architecture name: %s\n", argv[i+1]); + exit(1); + } + continue; + } + if (!strcmp("-output", argv[i])) + { + output_name = argv[i+1]; + continue; + } + + if (!strcmp("-import", argv[i])) + import = true; + else if (!strcmp("-export", argv[i])) + import = false; + else + { + fprintf(stderr, "unknown option: %s\n", argv[i]); + exit(1); + } + + err = readFile(argv[i+1], &files[num_files].mapped, &files[num_files].mapped_size); + if (kErrorNone != err) + exit(1); + + if (files[num_files].mapped && files[num_files].mapped_size) + { + files[num_files].import = import; + files[num_files].path = argv[i+1]; + num_files++; + } + } + + if (!output_name) + { + fprintf(stderr, "no output file\n"); + exit(1); + } + + num_import_syms = 0; + num_export_syms = 0; + for (filenum = 0; filenum < num_files; filenum++) + { + files[filenum].nsyms = count_symbols((char *) files[filenum].mapped, files[filenum].mapped_size); + if (files[filenum].import) + num_import_syms += files[filenum].nsyms; + else + num_export_syms += files[filenum].nsyms; + } + if (!num_export_syms) + { + fprintf(stderr, "no export names\n"); + exit(1); + } + + import_symbols = calloc(num_import_syms, sizeof(struct symbol)); + export_symbols = calloc(num_export_syms, sizeof(struct symbol)); + + import_idx = 0; + export_idx = 0; + + for (filenum = 0; filenum < num_files; filenum++) + { + if (files[filenum].import) + { + store_symbols((char *) files[filenum].mapped, files[filenum].mapped_size, + import_symbols, import_idx, num_import_syms); + import_idx += files[filenum].nsyms; + } + else + { + store_symbols((char *) files[filenum].mapped, files[filenum].mapped_size, + export_symbols, export_idx, num_export_syms); + export_idx += files[filenum].nsyms; + } + if (false && !files[filenum].nsyms) + { + fprintf(stderr, "warning: file %s contains no names\n", files[filenum].path); + } + } + + + qsort(import_symbols, num_import_syms, sizeof(struct symbol), &qsort_cmp); + qsort(export_symbols, num_export_syms, sizeof(struct symbol), &qsort_cmp); + + result_count = 0; + num_removed_syms 
= 0; + strtabsize = 4; + if (num_import_syms) + { + for (export_idx = 0; export_idx < num_export_syms; export_idx++) + { + struct symbol * result; + char * name; + size_t len; + boolean_t wild; + + name = export_symbols[export_idx].indirect; + len = export_symbols[export_idx].indirect_len; + if (!name) + { + name = export_symbols[export_idx].name; + len = export_symbols[export_idx].name_len; + } + wild = ((len > 2) && ('*' == name[len-=2])); + if (wild) + { + struct bsearch_key key; + key.name = name; + key.name_len = len; + result = bsearch(&key, import_symbols, + num_import_syms, sizeof(struct symbol), &bsearch_cmp_prefix); + + if (result) + { + struct symbol * first; + struct symbol * last; + + strtabsize += (result->name_len + result->indirect_len); + + first = result; + while (--first >= &import_symbols[0]) + { + if (bsearch_cmp_prefix(&key, first)) + break; + strtabsize += (first->name_len + first->indirect_len); + } + first++; + + last = result; + while (++last < (&import_symbols[0] + num_import_syms)) + { + if (bsearch_cmp_prefix(&key, last)) + break; + strtabsize += (last->name_len + last->indirect_len); + } + result_count += last - first; + result = first; + export_symbols[export_idx].list = first; + export_symbols[export_idx].list_count = last - first; + export_symbols[export_idx].flags |= kExported; + } + } + else + result = bsearch(name, import_symbols, + num_import_syms, sizeof(struct symbol), &bsearch_cmp); + + if (!result && require_imports) + { + int status; + char * demangled_result = + __cxa_demangle(export_symbols[export_idx].name + 1, NULL, NULL, &status); + fprintf(stderr, "exported name not in import list: %s\n", + demangled_result ? demangled_result : export_symbols[export_idx].name); +// fprintf(stderr, " : %s\n", export_symbols[export_idx].name); + if (demangled_result) { + free(demangled_result); + } + num_removed_syms++; + } + if (diff) + { + if (!result) + result = &export_symbols[export_idx]; + else + result = NULL; + } + if (result && !wild) + { + export_symbols[export_idx].flags |= kExported; + strtabsize += (export_symbols[export_idx].name_len + export_symbols[export_idx].indirect_len); + result_count++; + export_symbols[export_idx].list = &export_symbols[export_idx]; + export_symbols[export_idx].list_count = 1; + } + } + } + strtabpad = (strtabsize + 3) & ~3; + + if (require_imports && num_removed_syms) + { + err = kError; + goto finish; + } + + fd = open(output_name, O_WRONLY|O_CREAT|O_TRUNC, 0755); + if (-1 == fd) + { + perror("couldn't write output"); + err = kErrorFileAccess; + goto finish; + } + + struct symtab_command symcmd; + struct uuid_command uuidcmd; + + symcmd.cmd = LC_SYMTAB; + symcmd.cmdsize = sizeof(symcmd); + symcmd.symoff = sizeof(symcmd) + sizeof(uuidcmd); + symcmd.nsyms = result_count; + symcmd.strsize = strtabpad; + + uuidcmd.cmd = LC_UUID; + uuidcmd.cmdsize = sizeof(uuidcmd); + uuid_generate(uuidcmd.uuid); + + if (CPU_ARCH_ABI64 & target_arch->cputype) + { + struct mach_header_64 hdr; + hdr.magic = MH_MAGIC_64; + hdr.cputype = target_arch->cputype; + hdr.cpusubtype = target_arch->cpusubtype; + hdr.filetype = MH_KEXT_BUNDLE; + hdr.ncmds = 2; + hdr.sizeofcmds = sizeof(symcmd) + sizeof(uuidcmd); + hdr.flags = MH_INCRLINK; + + symcmd.symoff += sizeof(hdr); + symcmd.stroff = result_count * sizeof(struct nlist_64) + + symcmd.symoff; + + if (target_arch->byteorder != host_arch->byteorder) + swap_mach_header_64(&hdr, target_arch->byteorder); + err = writeFile(fd, &hdr, sizeof(hdr)); + } + else + { + struct mach_header hdr; + hdr.magic = 
MH_MAGIC; + hdr.cputype = target_arch->cputype; + hdr.cpusubtype = target_arch->cpusubtype; + hdr.filetype = (target_arch->cputype == CPU_TYPE_I386) ? MH_OBJECT : MH_KEXT_BUNDLE; + hdr.ncmds = 2; + hdr.sizeofcmds = sizeof(symcmd) + sizeof(uuidcmd); + hdr.flags = MH_INCRLINK; + + symcmd.symoff += sizeof(hdr); + symcmd.stroff = result_count * sizeof(struct nlist) + + symcmd.symoff; + + if (target_arch->byteorder != host_arch->byteorder) + swap_mach_header(&hdr, target_arch->byteorder); + err = writeFile(fd, &hdr, sizeof(hdr)); + } + + if (kErrorNone != err) + goto finish; + + if (target_arch->byteorder != host_arch->byteorder) { + swap_symtab_command(&symcmd, target_arch->byteorder); + swap_uuid_command(&uuidcmd, target_arch->byteorder); + } + err = writeFile(fd, &symcmd, sizeof(symcmd)); + if (kErrorNone != err) + goto finish; + err = writeFile(fd, &uuidcmd, sizeof(uuidcmd)); + if (kErrorNone != err) + goto finish; + + strx = 4; + for (export_idx = 0; export_idx < num_export_syms; export_idx++) + { + if (!export_symbols[export_idx].name) + continue; + if (!(kExported & export_symbols[export_idx].flags)) + continue; + + if (export_idx + && export_symbols[export_idx - 1].name + && !strcmp(export_symbols[export_idx - 1].name, export_symbols[export_idx].name)) + { + fprintf(stderr, "duplicate export: %s\n", export_symbols[export_idx - 1].name); + err = kErrorDuplicate; + goto finish; + } + + for (import_idx = 0; import_idx < export_symbols[export_idx].list_count; import_idx++) + { + + if (export_symbols[export_idx].list != &export_symbols[export_idx]) + { + printf("wild: %s, %s\n", export_symbols[export_idx].name, + export_symbols[export_idx].list[import_idx].name); + } + if (CPU_ARCH_ABI64 & target_arch->cputype) + { + struct nlist_64 nl; + + nl.n_sect = 0; + nl.n_desc = 0; + nl.n_un.n_strx = strx; + strx += export_symbols[export_idx].list[import_idx].name_len; + + if (export_symbols[export_idx].flags & kObsolete) { + nl.n_desc |= N_DESC_DISCARDED; + } + + if (export_symbols[export_idx].list[import_idx].indirect) + { + nl.n_type = N_INDR | N_EXT; + nl.n_value = strx; + strx += export_symbols[export_idx].list[import_idx].indirect_len; + } + else + { + nl.n_type = N_UNDF | N_EXT; + nl.n_value = 0; + } + + if (target_arch->byteorder != host_arch->byteorder) + swap_nlist_64(&nl, 1, target_arch->byteorder); + + err = writeFile(fd, &nl, sizeof(nl)); + } + else + { + struct nlist nl; + + nl.n_sect = 0; + nl.n_desc = 0; + nl.n_un.n_strx = strx; + strx += export_symbols[export_idx].list[import_idx].name_len; + + if (export_symbols[export_idx].flags & kObsolete) { + nl.n_desc |= N_DESC_DISCARDED; + } + + if (export_symbols[export_idx].list[import_idx].indirect) + { + nl.n_type = N_INDR | N_EXT; + nl.n_value = strx; + strx += export_symbols[export_idx].list[import_idx].indirect_len; + } + else + { + nl.n_type = N_UNDF | N_EXT; + nl.n_value = 0; + } + + if (target_arch->byteorder != host_arch->byteorder) + swap_nlist(&nl, 1, target_arch->byteorder); + + err = writeFile(fd, &nl, sizeof(nl)); + } + } + + if (kErrorNone != err) + goto finish; + } + + strx = sizeof(uint32_t); + err = writeFile(fd, &zero, strx); + if (kErrorNone != err) + goto finish; + + for (export_idx = 0; export_idx < num_export_syms; export_idx++) + { + if (!export_symbols[export_idx].name) + continue; + + for (import_idx = 0; import_idx < export_symbols[export_idx].list_count; import_idx++) + { + err = writeFile(fd, export_symbols[export_idx].list[import_idx].name, + export_symbols[export_idx].list[import_idx].name_len); + if (kErrorNone 
!= err) + goto finish; + if (export_symbols[export_idx].list[import_idx].indirect) + { + err = writeFile(fd, export_symbols[export_idx].list[import_idx].indirect, + export_symbols[export_idx].list[import_idx].indirect_len); + if (kErrorNone != err) + goto finish; + } + } + } + + err = writeFile(fd, &zero, strtabpad - strtabsize); + if (kErrorNone != err) + goto finish; + + close(fd); + + +finish: + for (filenum = 0; filenum < num_files; filenum++) { + // unmap file + if (files[filenum].mapped_size) + { + munmap((caddr_t)files[filenum].mapped, files[filenum].mapped_size); + files[filenum].mapped = 0; + files[filenum].mapped_size = 0; + } + + } + + if (kErrorNone != err) + { + if (output_name) + unlink(output_name); + exit(1); + } + else + exit(0); + return(0); +} + diff --git a/security/conf/tools/newvers/newvers.csh b/SETUP/newvers old mode 100644 new mode 100755 similarity index 100% rename from security/conf/tools/newvers/newvers.csh rename to SETUP/newvers diff --git a/SETUP/seed_objroot b/SETUP/seed_objroot deleted file mode 100755 index 6773e70e4..000000000 --- a/SETUP/seed_objroot +++ /dev/null @@ -1,133 +0,0 @@ -#!/bin/sh - -if [ ! $OBJROOT ] -then - echo "OBJROOT not defined" - exit 1 -fi - -if [ ! $PREBUILT_OBJROOT ] -then - PREBUILT_OBJROOT=/Prebuilt/$1/xnu/BUILD/obj -fi - -if [ ! -e $PREBUILT_OBJROOT ] -then - echo "$PREBUILT_OBJROOT doesn't exist" - exit 1 -else -if [ $# = 2 -a ! -e $PREBUILT_OBJROOT/$2 ] -then - echo "$PREBUILT_OBJROOT/$2 doesn't exist" - exit 1 -fi -if [ -e $PREBUILT_OBJROOT/BUILDING_SEED_OBJROOT ] -then - echo "Building $PREBUILT_OBJROOT, try later" - exit 1 -fi -fi - -cd $PREBUILT_OBJROOT - -if [ $# = 1 ] -then - -if [ ! -e $OBJROOT ] -then -mkdir -p $OBJROOT -echo "Copying $PREBUILT_OBJROOT in $OBJROOT" -pax -rw . $OBJROOT -else -echo "Remove $OBJROOT before calling seed_objroot" -exit 1 -fi - -else - -if [ ! -e $OBJROOT/$2 ] -then -mkdir -p $OBJROOT/$2 -echo "Copying $PREBUILT_OBJROOT/$2 in $OBJROOT/$2" -pax -rw $2 $OBJROOT -RELEASE_OBJ=`echo $2 | sed 's/DEBUG/RELEASE/'` -if [ $1 != $RELEASE_OBJ -a ! -e $OBJROOT/$RELEASE_OBJ ] -then -mkdir -p $OBJROOT/$RELEASE_OBJ -echo "Copying $PREBUILT_OBJROOT/$RELEASE_OBJ in $OBJROOT/$RELEASE_OBJ" -pax -rw $RELEASE_OBJ $OBJROOT -fi - -else -echo "remove $OBJROOT/$2 before calling seed_objroot" -exit 1 -fi - -fi - -if [ ! 
-e $OBJROOT/EXPORT_HDRS ] -then -echo "Copying $PREBUILT_OBJROOT/EXPORT_HDRS in $OBJROOT/EXPORT_HDRS" -mkdir -p $OBJROOT/EXPORT_HDRS -pax -rw EXPORT_HDRS $OBJROOT -fi - -cd $OBJROOT -if [ -e RELEASE_PPC/osfmk/RELEASE/config.RELEASE_PPC ] -then -PREV_OBJROOT=`grep objectdir RELEASE_PPC/osfmk/RELEASE/config.RELEASE_PPC | cut -f 2 -d\" | - sed 's|/RELEASE_PPC/osfmk/RELEASE||'` -fi -if [ -z $PREV_OBJROOT -a -e DEBUG_PPC/osfmk/DEBUG/config.DEBUG_PPC ] -then - PREV_OBJROOT=`grep objectdir DEBUG_PPC/osfmk/DEBUG/config.DEBUG_PPC | cut -f 2 -d\" | - sed 's|/DEBUG_PPC/osfmk/DEBUG||'` -fi -if [ -z $PREV_OBJROOT -a -e RELEASE_I386/osfmk/RELEASE/config.RELEASE_I386 ] -then - PREV_OBJROOT=`grep objectdir RELEASE_I386/osfmk/RELEASE/config.RELEASE_I386 | cut -f 2 -d\" | - sed 's|/RELEASE_I386/osfmk/RELEASE||'` -fi -if [ -z $PREV_OBJROOT -a -e DEBUG_I386/osfmk/DEBUG/config.DEBUG_I386 ] -then - PREV_OBJROOT=`grep objectdir DEBUG_I386/osfmk/DEBUG/config.DEBUG_I386 | cut -f 2 -d\" | - sed 's|/DEBUG_I386/osfmk/DEBUG||'` -fi -if [ -z $PREV_OBJROOT ] -then - echo "PREV_OBJROOT not found" - exit 1 -fi - -if [ -e RELEASE_PPC/osfmk/RELEASE/config.RELEASE_PPC ] -then -PREV_SRCROOT=`grep sourcedir RELEASE_PPC/osfmk/RELEASE/config.RELEASE_PPC | cut -f 2 -d\"` -fi -if [ -z $PREV_SRCROOT -a -e DEBUG_PPC/osfmk/DEBUG/config.DEBUG_PPC ] -then - PREV_SRCROOT=`grep sourcedir DEBUG_PPC/osfmk/DEBUG/config.DEBUG_PPC | cut -f 2 -d\"` -fi -if [ -z $PREV_SRCROOT -a -e RELEASE_I386/osfmk/RELEASE/config.RELEASE_I386 ] -then -PREV_SRCROOT=`grep sourcedir RELEASE_I386/osfmk/RELEASE/config.RELEASE_I386 | cut -f 2 -d\"` -fi -if [ -z $PREV_SRCROOT -a -e DEBUG_I386/osfmk/DEBUG/config.DEBUG_I386 ] -then - PREV_SRCROOT=`grep sourcedir DEBUG_I386/osfmk/DEBUG/config.DEBUG_I386 | cut -f 2 -d\"` -fi -if [ -z $PREV_SRCROOT ] -then - echo "PREV_SRCROOT not found" - exit 1 -fi - -echo "s|$PREV_OBJROOT|$OBJROOT|" > prebuild.sed -echo "s|$PREV_SRCROOT|$SRCROOT|" >>prebuild.sed - -for i in `find . -name Makedep -print` -do -sed -f prebuild.sed $i > $i.tmp -rm $i -mv $i.tmp $i -done -rm -f `find $OBJROOT -name Makefile -print` prebuild.sed diff --git a/SETUP/setsegname/Makefile b/SETUP/setsegname/Makefile new file mode 100644 index 000000000..70e5e2641 --- /dev/null +++ b/SETUP/setsegname/Makefile @@ -0,0 +1,31 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + + +include $(MakeInc_cmd) +include $(MakeInc_def) + +OBJS = setsegname.o + +CFLAGS = -isysroot $(HOST_SDKROOT) -g -O0 -I$(SOURCE) -I. + +WARNFLAGS = -Wall + +LDFLAGS = -isysroot $(HOST_SDKROOT) + +setsegname: $(OBJS) + $(_v)$(HOST_CC) $(LDFLAGS) -o $@ $^ + @echo HOST_LD $@ + $(_v)$(HOST_CODESIGN) -s - $@ + @echo HOST_CODESIGN $@ + +.c.o: + $(_v)$(HOST_CC) $(WARNFLAGS) $(CFLAGS) -c -o $@ $< + @echo HOST_CC $@ + +do_build_setup: setsegname + +include $(MakeInc_rule) +include $(MakeInc_dir) diff --git a/SETUP/setsegname/setsegname.c b/SETUP/setsegname/setsegname.c new file mode 100644 index 000000000..9afd6bc5d --- /dev/null +++ b/SETUP/setsegname/setsegname.c @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2007 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +#include <libc.h> +#include <errno.h> + +#include <sys/stat.h> +#include <sys/file.h> +#include <sys/mman.h> + +#include <mach-o/swap.h> + +#include <IOKit/IOTypes.h> + +/********************************************************************* +*********************************************************************/ +static int +writeFile(int fd, const void * data, size_t length) +{ + int error = 0; + + if (length != (size_t)write(fd, data, length)) { + error = -1; + } + + if (error != 0) { + perror("couldn't write output"); + } + + return error; +} + +/********************************************************************* +*********************************************************************/ +static int +readFile(const char *path, vm_offset_t * objAddr, vm_size_t * objSize) +{ + int error = -1; + int fd; + struct stat stat_buf; + + *objAddr = 0; + *objSize = 0; + + do { + if ((fd = open(path, O_RDONLY)) == -1) { + continue; + } + + if (fstat(fd, &stat_buf) == -1) { + continue; + } + + if (0 == (stat_buf.st_mode & S_IFREG)) { + continue; + } + + if (0 == stat_buf.st_size) { + error = 0; + continue; + } + + *objSize = stat_buf.st_size; + + *objAddr = (vm_offset_t)mmap(NULL /* address */, *objSize, + PROT_READ|PROT_WRITE, MAP_FILE|MAP_PRIVATE /* flags */, + fd, 0 /* offset */); + + if ((void *)*objAddr == MAP_FAILED) { + *objAddr = 0; + *objSize = 0; + continue; + } + + error = 0; + + } while (false); + + if (-1 != fd) { + close(fd); + } + if (error) { + fprintf(stderr, "couldn't read %s: %s\n", path, strerror(errno)); + } + + return error; +} + +/********************************************************************* +*********************************************************************/ +int main(int argc, char * argv[]) +{ + int error; + const char * output_name = NULL; + const char * newseg_name = NULL; + struct mach_header * hdr; + struct mach_header_64 * hdr64; + struct load_command * cmds; + boolean_t swap = false; + uint32_t ncmds, cmdtype; + uint32_t len; + vm_offset_t input; + vm_size_t input_size; + uint32_t nsects = 0; + uint32_t * flags = NULL; + uint32_t attr; + typedef char segname_t[16]; + segname_t * names = NULL; + + if ((argc != 5) || strcmp("-o", argv[3])) { + fprintf(stderr, "Usage: %s NEWSEGNAME input -o output\n", argv[0]); + exit(1); + } + + output_name = argv[4]; + newseg_name = argv[1]; + + error = readFile(argv[2], &input, &input_size); + if (error) { + exit(1); + } + + hdr = (typeof(hdr)) input; + switch (hdr->magic) { + case MH_CIGAM: + swap = true; + // fall thru + case MH_MAGIC: + ncmds = hdr->ncmds; + cmds = (typeof(cmds)) (hdr+1); + break; + + case MH_CIGAM_64: + swap = true; + // fall thru + case MH_MAGIC_64: + hdr64 = (typeof(hdr64)) hdr; + ncmds = hdr64->ncmds; + cmds = (typeof(cmds)) (hdr64+1); + break; + + default: + fprintf(stderr, "not macho input file\n"); + exit(1); + break; + } + + if (swap) { + ncmds = OSSwapInt32(ncmds); + } + 
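/*
 * A minimal sketch (not part of the patch) of the load-command walk that the
 * loop below performs: each Mach-O load_command records its own cmdsize, so
 * the next command begins cmdsize bytes later, ncmds times. This version
 * assumes a native-endian 32-bit header; the byte swapping and the
 * mach_header_64 case handled above are omitted. Struct names come from
 * <mach-o/loader.h>.
 */
#include <mach-o/loader.h>
#include <stdint.h>

static void
walk_load_commands(const struct mach_header *mh)
{
    const struct load_command *lc = (const struct load_command *)(mh + 1);
    uint32_t i;

    for (i = 0; i < mh->ncmds; i++) {
        /* dispatch on lc->cmd here: LC_SEGMENT, LC_SEGMENT_64, ... */
        lc = (const struct load_command *)((uintptr_t)lc + lc->cmdsize);
    }
}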
while (ncmds--) { + cmdtype = cmds->cmd; + if (swap) { + cmdtype = OSSwapInt32(cmdtype); + } + nsects = 0; + len = 0; + if (LC_SEGMENT == cmdtype) { + struct segment_command * segcmd; + struct section * sects; + + segcmd = (typeof(segcmd)) cmds; + nsects = segcmd->nsects; + sects = (typeof(sects))(segcmd + 1); + names = §s->segname; + flags = §s->flags; + len = sizeof(*sects); + } else if (LC_SEGMENT_64 == cmdtype) { + struct segment_command_64 * segcmd; + struct section_64 * sects; + + segcmd = (typeof(segcmd)) cmds; + nsects = segcmd->nsects; + sects = (typeof(sects))(segcmd + 1); + names = §s->segname; + flags = §s->flags; + len = sizeof(*sects); + } + + if (swap) + nsects = OSSwapInt32(nsects); + while (nsects--) { + attr = *flags; + if (swap) { + attr = OSSwapInt32(attr); + } + + if (!(S_ATTR_DEBUG & attr)) { + strncpy((char *)names, newseg_name, sizeof(*names)); + } + + names = (typeof(names))(((uintptr_t) names) + len); + flags = (typeof(flags))(((uintptr_t) flags) + len); + } + + len = cmds->cmdsize; + if (swap) { + len = OSSwapInt32(len); + } + cmds = (typeof(cmds))(((uintptr_t) cmds) + len); + } + + int fd = open(output_name, O_WRONLY|O_CREAT|O_TRUNC, 0755); + if (-1 == fd) { + error = -1; + } else { + error = writeFile(fd, (const void *) input, input_size); + close(fd); + } + + if (error) { + fprintf(stderr, "couldn't write output: %s\n", strerror(errno)); + exit(1); + } + + exit(0); + return 0; +} diff --git a/bsd/Makefile b/bsd/Makefile index d4df2fc62..8beb22975 100644 --- a/bsd/Makefile +++ b/bsd/Makefile @@ -26,14 +26,13 @@ INSTINC_SUBDIRS = \ uuid \ vfs -INSTINC_SUBDIRS_PPC = \ - ppc - INSTINC_SUBDIRS_I386 = \ - i386 + i386 \ + crypto INSTINC_SUBDIRS_X86_64 = \ - i386 + i386 \ + crypto INSTINC_SUBDIRS_ARM = \ arm @@ -58,9 +57,6 @@ EXPINC_SUBDIRS = \ vfs \ vm -EXPINC_SUBDIRS_PPC = \ - ppc - EXPINC_SUBDIRS_I386 = \ i386 @@ -70,16 +66,17 @@ EXPINC_SUBDIRS_X86_64 = \ EXPINC_SUBDIRS_ARM = \ arm -SETUP_SUBDIRS = \ - conf +SETUP_SUBDIRS = COMP_SUBDIRS = \ conf INST_SUBDIRS = \ + kern INSTMAN_SUBDIRS = \ man + include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/bsd/bsm/Makefile b/bsd/bsm/Makefile index 0bb6f4dcf..f660aafb5 100644 --- a/bsd/bsm/Makefile +++ b/bsd/bsm/Makefile @@ -9,16 +9,12 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/bsm/audit.h b/bsd/bsm/audit.h index bb4a9497b..a24cc88d7 100644 --- a/bsd/bsm/audit.h +++ b/bsd/bsm/audit.h @@ -125,6 +125,8 @@ #define A_SETQCTRL 36 #define A_GETCOND 37 #define A_SETCOND 38 +#define A_GETSFLAGS 39 +#define A_SETSFLAGS 40 /* * Audit policy controls. @@ -318,6 +320,7 @@ int setaudit_addr(const struct auditinfo_addr *, int); #include <mach/port.h> mach_port_name_t audit_session_self(void); au_asid_t audit_session_join(mach_port_name_t port); +int audit_session_port(au_asid_t asid, mach_port_name_t *portname); #endif /* __APPLE_API_PRIVATE */ #endif /* defined(_KERNEL) || defined(KERNEL) */ diff --git a/bsd/bsm/audit_kevents.h b/bsd/bsm/audit_kevents.h index 268c456c3..25e3eb829 100644 --- a/bsd/bsm/audit_kevents.h +++ b/bsd/bsm/audit_kevents.h @@ -36,7 +36,7 @@ * The reserved event numbers for kernel events are 1...2047 and 43001..44900. */ #define AUE_IS_A_KEVENT(e) (((e) > 0 && (e) < 2048) || \ - ((e) > 43000 && (e) < 45000)) + ((e) > 43000 && (e) < 44901)) /* * Values marked as AUE_NULL are not required to be audited as per CAPP. 
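The audit_kevents.h hunk above tightens AUE_IS_A_KEVENT so that it matches the documented reservations exactly: kernel event numbers are 1..2047 and 43001..44900, and the AUE_SESSION_* identifiers added below (44901..44904) sit just past that range. A standalone restatement of the corrected predicate, for illustration only:

#include <stdbool.h>

/* Mirrors the corrected AUE_IS_A_KEVENT bounds; not part of the patch. */
static bool
is_kernel_audit_event(unsigned int e)
{
    return (e > 0 && e < 2048) || (e > 43000 && e < 44901);
}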
@@ -596,6 +596,16 @@ #define AUE_PWRITE 43193 /* Darwin/FreeBSD. */ #define AUE_FSCTL 43194 /* Darwin. */ #define AUE_FFSCTL 43195 /* Darwin. */ +#define AUE_LPATHCONF 43196 /* FreeBSD. */ +#define AUE_PDFORK 43197 /* FreeBSD. */ +#define AUE_PDKILL 43198 /* FreeBSD. */ +#define AUE_PDGETPID 43199 /* FreeBSD. */ +#define AUE_PDWAIT 43200 /* FreeBSD. */ + +#define AUE_SESSION_START 44901 /* Darwin. */ +#define AUE_SESSION_UPDATE 44902 /* Darwin. */ +#define AUE_SESSION_END 44903 /* Darwin. */ +#define AUE_SESSION_CLOSE 44904 /* Darwin. */ /* * Darwin BSM uses a number of AUE_O_* definitions, which are aliased to the diff --git a/bsd/conf/MASTER b/bsd/conf/MASTER index 93872ad07..bb57c6dae 100644 --- a/bsd/conf/MASTER +++ b/bsd/conf/MASTER @@ -101,6 +101,7 @@ options MACH_IPC_COMPAT # Enable old IPC interface # <ipc_compat> options MACH_IPC_DEBUG # Enable IPC debugging calls # <ipc_debug> options MACH_IPC_TEST # Testing code/printfs # <ipc_test> options MACH_LDEBUG # Sanity-check simple locking # <test> +options CONFIG_ZLEAKS # Live zone leak debug sysctls # <zleaks> options MACH_NP # Mach IPC support # <np> options MACH_NBC # No buffer cache # <nbc> options MACH_NET # Fast network access # <mach_net> @@ -123,15 +124,12 @@ options LLC # 802.2 support # <llc> options LOOP # loopback support # <loop> options MROUTING # multicast routing # <mrouting> options ROUTING # routing # <routing> -options NETMIBS # # <netmibs> options VLAN # # <vlan> options BOND # # <bond> options PF # Packet Filter # <pf> options PF_PKTHDR # PF tag inside mbuf pkthdr # <pf_pkthdr> -options PKT_PRIORITY # Packet priority support # <pkt_priority> options PFLOG # PF log interface # <pflog> options IPDIVERT # Divert sockets (for NAT) # <ipdivert> -options IPFLOW # IP fast forwarding # <ipflow> options IPFIREWALL # IP Firewalling (used by NAT) # <ipfirewall> options IPFIREWALL_FORWARD #Transparent proxy # <ipfirewall> options IPFIREWALL_DEFAULT_TO_ACCEPT # allow everything by default # <ipfirewall> @@ -144,7 +142,6 @@ options RANDOM_IP_ID # random (not sequential) ip ids # <randomipid> options TCP_DROP_SYNFIN # Drop TCP packets with SYN+FIN set # <tcpdrop_synfin> options ICMP_BANDLIM # ICMP bandwidth limiting sysctl options IFNET_INPUT_SANITY_CHK # allow dlil/ifnet input sanity check # <ifnet_input_chk> -options IFNET_ROUTE_REFCNT # count route references to ifnet # <ifnet_route_refcnt> options SYSV_SEM # SVID semaphores # <sysv_sem> options SYSV_MSG # SVID messages # <sysv_msg> options SYSV_SHM # SVID shared mem # <sysv_shm> @@ -169,22 +166,18 @@ options NETWORKING # networking layer # <inet, inet6, netat> options CONFIG_FSE # file system events # <config_fse> options CONFIG_IMAGEBOOT # local image boot # <config_imageboot> options CONFIG_SOWUPCALL # SB_UPCALL on sowwakeup # <config_sowupcall> -options CONFIG_MBUF_NOEXPAND # limit mbuf expansion # <config_mbuf_noexpand> options CONFIG_MBUF_JUMBO # jumbo cluster pool # <config_mbuf_jumbo> -options CONFIG_MBUF_TAGS_MALLOC # use malloc for tags # <config_mbuf_tags_malloc> options CONFIG_FORCE_OUT_IFP # Enable IP_FORCE_OUT_IFP # <config_force_out_ifp> options CONFIG_IFEF_NOWINDOWSCALE # Scale TCP window per driver # <config_ifef_nowindowscale> options CONFIG_WORKQUEUE # <config_workqueue> - # # 4.4 filesystems # options FFS # Fast Filesystem Support # <ffs> options HFS # HFS/HFS+ support # <hfs> options FIFO # fifo support # <fifo> -options UNION # union_fs support # <union> options FDESC # fdesc_fs support # <fdesc> options DEVFS # devfs support # <devfs> options JOURNALING 
# journaling support # <journaling> @@ -199,6 +192,7 @@ options REV_ENDIAN_FS # Reverse Endian FS # <revfs> options NAMEDSTREAMS # named stream vnop support # <namedstreams> options CONFIG_VOLFS # volfs path support (legacy) # <config_volfs> options CONFIG_IMGSRC_ACCESS # source of imageboot dmg # <config_imgsrc_access> +options CONFIG_TRIGGERS # trigger vnodes # <config_triggers> # # NFS support @@ -249,6 +243,7 @@ options randomipid # <inet,randomipid> options ZLIB # inflate/deflate support # <zlib> +options IF_BRIDGE # <if_bridge> makeoptions LIBDRIVER = "libDriver_kern.o" # <libdriver> makeoptions LIBOBJC = "libkobjc.o" # <kernobjc> @@ -292,7 +287,6 @@ options CONFIG_KN_HASHSIZE=20 # <bsmall> options CONFIG_VNODES=263168 # <large,xlarge> options CONFIG_VNODES=263168 # <medium> options CONFIG_VNODES=10240 # <small> -options CONFIG_VNODES=1024 # <xsmall> options CONFIG_VNODES=750 # <bsmall> options CONFIG_VNODE_FREE_MIN=500 # <large,xlarge> @@ -396,6 +390,12 @@ options CONFIG_MFCTBLSIZ=256 # <medium,large,xlarge> options CONFIG_MFCTBLSIZ=128 # <small,xsmall> options CONFIG_MFCTBLSIZ=16 # <bsmall> +# +# configurable kernel message buffer size +# +options CONFIG_MSG_BSIZE=4096 # <bsmall,small,xsmall> +options CONFIG_MSG_BSIZE=16384 # <medium,large,xlarge> + # # configurable kernel - use these options to strip strings from panic # and printf calls. @@ -406,6 +406,11 @@ options CONFIG_NO_PANIC_STRINGS # <no_panic_str> options CONFIG_NO_PRINTF_STRINGS # <no_printf_str> options CONFIG_NO_KPRINTF_STRINGS # <no_kprintf_str> +# +# use finer-grained lock groups for the proc subsystem +# +options CONFIG_FINE_LOCK_GROUPS # <medium,large,xlarge> + # # configurable kernel - general switch to say we are building for an # embedded device @@ -433,6 +438,14 @@ options CONFIG_CODE_DECRYPTION # <config_embedded> options CONFIG_PROTECT # <config_protect> +# +# freeze - support app hibernation, used on embedded +# CONFIG_FREEZE_SUSPENDED_MIN is the minimum number of suspended +# processes to be left unhibernated +# +options CONFIG_FREEZE # <freeze> + +options CHECK_CS_VALIDATION_BITMAP # <config_cs_validation_bitmap> # # Ethernet (ARP) @@ -463,6 +476,7 @@ pseudo-device vndevice 16 init vndevice_init # <xlarge> pseudo-device vndevice 8 init vndevice_init # <large> pseudo-device vndevice 4 init vndevice_init # <medium> pseudo-device vndevice 3 init vndevice_init # <small> +pseudo-device vndevice 2 init vndevice_init # <xsmall> pseudo-device vndevice 2 init vndevice_init # <bsmall> # diff --git a/bsd/conf/MASTER.i386 b/bsd/conf/MASTER.i386 index 1e6641911..594f0fb51 100644 --- a/bsd/conf/MASTER.i386 +++ b/bsd/conf/MASTER.i386 @@ -44,21 +44,20 @@ # # Standard Apple Research Configurations: # -------- ----- -------- --------------- -# BASE = [ intel mach medium config_dtrace vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue psynch ] -# FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo union config_volfs config_hfs_trim hfs_compression config_imgsrc_access ] -# NETWORKING = [ inet inet6 compat_oldsock tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile netmibs bond vlan gif stf zlib randomipid ifnet_input_chk config_mbuf_jumbo ipflow pkt_priority if_bridge ] +# BASE = [ intel mach medium config_dtrace vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug sysv_sem 
sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue psynch zleaks ] +# FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo config_volfs config_hfs_trim hfs_compression config_hfs_alloc_rbtree config_imgsrc_access config_triggers ] +# NETWORKING = [ inet inet6 compat_oldsock tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile bond vlan gif stf zlib randomipid ifnet_input_chk config_mbuf_jumbo if_bridge pf pflog pf_pkthdr ] # NFS = [ nfsclient nfsserver ] # VPN = [ ipsec ] # RELEASE = [ BASE NETWORKING NFS VPN FILESYS libdriver ] # PROFILE = [ RELEASE profile ] -# DEBUG = [ BASE NETWORKING NFS VPN FILESYS libdriver_g debug xpr_debug mach_assert pf pflog ] -# +# DEBUG = [ BASE NETWORKING NFS VPN FILESYS libdriver_g debug xpr_debug mach_assert ] # # EMBEDDED_BASE = [ intel mach bsmall vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue psynch ] # EMBEDDED_FILESYS = [ devfs hfs journaling fdesc fifo ] -# EMBEDDED_NET = [ inet compat_oldsock tcpdrop_synfin bpfilter config_mbuf_noexpand ] +# EMBEDDED_NET = [ inet compat_oldsock tcpdrop_synfin bpfilter ] # EMBEDDED = [ EMBEDDED_BASE EMBEDDED_NET VPN EMBEDDED_FILESYS libdriver no_printf_str no_kprintf_str no_kdebug ] -# DEVELOPMENT = [ EMBEDDED_BASE EMBEDDED_NET NFS VPN EMBEDDED_FILESYS libdriver netmibs development mach_assert config_dtrace ] +# DEVELOPMENT = [ EMBEDDED_BASE EMBEDDED_NET NFS VPN EMBEDDED_FILESYS libdriver development mach_assert config_dtrace ] # ###################################################################### # diff --git a/bsd/conf/MASTER.ppc b/bsd/conf/MASTER.ppc deleted file mode 100644 index d99b6e4f5..000000000 --- a/bsd/conf/MASTER.ppc +++ /dev/null @@ -1,99 +0,0 @@ -# -# Mach Operating System -# Copyright (c) 1986 Carnegie-Mellon University -# All rights reserved. The CMU software License Agreement -# specifies the terms and conditions for use and redistribution. -# -###################################################################### -# -# Master Apple configuration file (see the master machine independent -# configuration file for a description of the file format). 
-# -###################################################################### -# -# Apple (PSEUDO-)DEVICES (select any combination) -# ex = Excelan EXOS 202 Ethernet interface -# ip = Interphase V/SMD 3200 disk controller -# od = Canon OMD-1 Optical Disk -# rd = RAM disk -# sd = SCSI disk -# sg = Generic SCSI Device -# st = SCSI tape -# fd = Floppy Disk -# en = Integrated Ethernet controller -# dsp = DSP560001 digital signal processor -# iplmeas = ipl time measurement -# nextp = NeXT Laser Printer -# sound = sound I/O -# vol = removable volume support device -# venip = virtual Ethernet/IP network interface -# zs = Serial device -# -# MULTIPROCESSOR SUPPORT (select exactly one) -# multi = support 4 processors -# uni = supports single processor -# -# SPECIAL CHARACTERISTICS (select any combination) -# gdb = GNU kernel debugger -# posix_kern = POSIX support -# -# CPU TYPE (select exactly one) -# NeXT = FIXME -# -###################################################################### -# -# Standard Apple Research Configurations: -# -------- ----- -------- --------------- -# -# BASE = [ ppc mach medium config_dtrace vol pst gdb noprofiling simple_clock kernstack sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue ] -# FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo union config_volfs config_hfs_trim hfs_compression ] -# NETWORKING = [ inet inet6 compat_oldsock tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile netmibs bond vlan gif stf zlib randomipid ifnet_input_chk ipflow pkt_priority ] -# NFS = [ nfsclient nfsserver ] -# VPN = [ ipsec ] -# RELEASE = [ BASE NETWORKING NFS VPN FILESYS libdriver ] -# DEVELOPMENT = [ RELEASE ] -# PROFILE = [ RELEASE profile ] -# DEBUG = [ BASE NETWORKING NFS VPN FILESYS libdriver_g debug xpr_debug mach_assert pf pflog ] -# -###################################################################### -# -machine "ppc" # <ppc> -cpu "ppc" # <ppc> - -options GDB # GNU kernel debugger # <gdb> -options DEBUG # general debugging code # <debug> -options SHOW_SPACE # print size of structures # <debug> -options EVENTMETER # event meter support # <debug> -options FP_EMUL # floating point emulation # <fp> -options UXPR # user-level XPR package # <uxpr> -config mach_kernel swap generic # <mach> - -# -# Note: MAC/AUDIT options must be set in all the bsd/conf, osfmk/conf, and -# security/conf MASTER files. 
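These MASTER files feed the config tool, which turns each enabled `options NAME` line into a preprocessor macro that the C sources test; the note above about keeping the bsd/conf, osfmk/conf, and security/conf MASTER files in sync exists because each component is configured separately. A hedged sketch of the consuming side — the function is illustrative, not taken from the patch:

/* Illustrative consumer of config-generated macros such as CONFIG_MACF;
   xnu sources guard MAC and audit hooks with tests of this form. */
int example_uses_config(void);

int
example_uses_config(void)
{
#if CONFIG_MACF
	return 1;	/* MAC framework compiled in */
#else
	return 0;
#endif
}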
-# -options CONFIG_MACF # Mandatory Access Control Framework -options CONFIG_MACF_SOCKET_SUBSET # MAC socket subest (no labels) -#options CONFIG_MACF_SOCKET # MAC socket labels -#options CONFIG_MACF_NET # mbuf -#options CONFIG_MACF_DEBUG -#options CONFIG_MACF_MACH -options CONFIG_AUDIT # Kernel auditing - -options EVENT # <event> - -# -# Ipl measurement system -# -pseudo-device iplmeas # <iplmeas> - -# -# NFS measurement system -# -pseudo-device nfsmeas # <nfsmeas> - -# -# Removable Volume support -# -pseudo-device vol # <vol> - diff --git a/bsd/conf/MASTER.x86_64 b/bsd/conf/MASTER.x86_64 index 1050897d2..4bf42910b 100644 --- a/bsd/conf/MASTER.x86_64 +++ b/bsd/conf/MASTER.x86_64 @@ -44,21 +44,20 @@ # # Standard Apple Research Configurations: # -------- ----- -------- --------------- -# BASE = [ intel mach medium config_dtrace vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue psynch ] -# FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo union config_volfs config_hfs_trim hfs_compression config_imgsrc_access ] -# NETWORKING = [ inet inet6 compat_oldsock tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile netmibs bond vlan gif stf zlib randomipid ifnet_input_chk config_mbuf_jumbo ipflow pkt_priority if_bridge ] +# BASE = [ intel mach medium config_dtrace vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue psynch zleaks ] +# FILESYS = [ devfs revfs hfs journaling fdesc config_fse quota namedstreams fifo config_volfs config_hfs_trim hfs_compression config_hfs_alloc_rbtree config_imgsrc_access config_triggers ] +# NETWORKING = [ inet inet6 compat_oldsock tcpdrop_synfin bpfilter ipdivert ipfirewall ipv6firewall ipfw2 dummynet traffic_mgt sendfile bond vlan gif stf zlib randomipid ifnet_input_chk config_mbuf_jumbo if_bridge pf pflog pf_pkthdr ] # NFS = [ nfsclient nfsserver ] # VPN = [ ipsec ] # RELEASE = [ BASE NETWORKING NFS VPN FILESYS libdriver ] # PROFILE = [ RELEASE profile ] -# DEBUG = [ BASE NETWORKING NFS VPN FILESYS libdriver_g debug xpr_debug mach_assert pf pflog ] -# +# DEBUG = [ BASE NETWORKING NFS VPN FILESYS libdriver_g debug xpr_debug mach_assert ] # # EMBEDDED_BASE = [ intel mach bsmall vol pst gdb kernobjc fixpri simple_clock mdebug kernserv driverkit uxpr kernstack ipc_compat ipc_debug sysv_sem sysv_msg sysv_shm audit panic_info config_imageboot config_workqueue psynch ] # EMBEDDED_FILESYS = [ devfs hfs journaling fdesc fifo ] -# EMBEDDED_NET = [ inet compat_oldsock tcpdrop_synfin bpfilter config_mbuf_noexpand ] +# EMBEDDED_NET = [ inet compat_oldsock tcpdrop_synfin bpfilter ] # EMBEDDED = [ EMBEDDED_BASE EMBEDDED_NET VPN EMBEDDED_FILESYS libdriver no_printf_str no_kprintf_str no_kdebug ] -# DEVELOPMENT = [ EMBEDDED_BASE EMBEDDED_NET NFS VPN EMBEDDED_FILESYS libdriver netmibs development mach_assert ] +# DEVELOPMENT = [ EMBEDDED_BASE EMBEDDED_NET NFS VPN EMBEDDED_FILESYS libdriver development mach_assert ] # ###################################################################### # diff --git a/bsd/conf/Makefile b/bsd/conf/Makefile index a79644e77..afaf3eb89 100644 --- a/bsd/conf/Makefile +++ b/bsd/conf/Makefile @@ -3,92 +3,10 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export 
MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir -export dp_backing_file.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 -export ubc_subr.o_CFLAGS_ADD=-Wno-discard-qual -Wshorten-64-to-32 -export vnode_pager.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 -export vm_unix.o_CFLAGS_ADD=-Werror -Wshorten-64-to-32 - -export if_mib.o_CFLAGS_ADD=-Wno-unused-parameter -export adsp_Write.o_CFLAGS_ADD=-Wno-sign-compare -export adsp_Packet.o_CFLAGS_ADD=-Wno-sign-compare -export adsp_Control.o_CFLAGS_ADD=-Wno-sign-compare -export adsp_RxAttn.o_CFLAGS_ADD=-Wno-sign-compare -export adsp_attention.o_CFLAGS_ADD=-Wno-sign-compare -export asp_proto.o_CFLAGS_ADD=-Wno-sign-compare -export drv_dep.o_CFLAGS_ADD=-Wno-sign-compare -export ddp_rtmp.o_CFLAGS_ADD=-Wno-sign-compare -export ddp_lap.o_CFLAGS_ADD=-Wno-sign-compare -export radix.o_CFLAGS_ADD=-Wno-sign-compare -export route.o_CFLAGS_ADD=-Wno-sign-compare -export rtsock.o_CFLAGS_ADD=-Wno-sign-compare -export dhcp_options.o_CFLAGS_ADD=-Wno-sign-compare -export igmp.o_CFLAGS_ADD=-Wno-sign-compare -export in_cksum.o_CFLAGS_ADD=-Wno-sign-compare -export ip_divert.o_CFLAGS_ADD=-Wno-sign-compare -export ip_dummynet.o_CFLAGS_ADD=-Wno-sign-compare -export ip_flow.o_CFLAGS_ADD=-Wno-sign-compare -export ip_fw2.o_CFLAGS_ADD=-Wno-sign-compare -export ip_fw2_compat.o_CFLAGS_ADD=-Wno-sign-compare -export ip_icmp.o_CFLAGS_ADD=-Wno-sign-compare -export ip_input.o_CFLAGS_ADD=-Wno-sign-compare -export ip_mroute.o_CFLAGS_ADD=-Wno-sign-compare -export ip_output.o_CFLAGS_ADD=-Wno-sign-compare -export raw_ip.o_CFLAGS_ADD=-Wno-sign-compare -export tcp_input.o_CFLAGS_ADD=-Wno-sign-compare -export tcp_output.o_CFLAGS_ADD=-Wno-sign-compare -export tcp_subr.o_CFLAGS_ADD=-Wno-sign-compare -export tcp_usrreq.o_CFLAGS_ADD=-Wno-sign-compare -export tcp_timer.o_CFLAGS_ADD=-Wno-sign-compare -export udp_usrreq.o_CFLAGS_ADD=-Wno-sign-compare -export ah_input.o_CFLAGS_ADD=-Wno-sign-compare -export ah_core.o_CFLAGS_ADD=-Wno-sign-compare -export ah_output.o_CFLAGS_ADD=-Wno-sign-compare -export esp_core.o_CFLAGS_ADD=-Wno-sign-compare -export esp_input.o_CFLAGS_ADD=-Wno-sign-compare -export esp_output.o_CFLAGS_ADD=-Wno-sign-compare -export esp_rijndael.o_CFLAGS_ADD=-Wno-sign-compare -export ipsec.o_CFLAGS_ADD=-Wno-sign-compare -export dest6.o_CFLAGS_ADD=-Wno-sign-compare -export frag6.o_CFLAGS_ADD=-Wno-sign-compare -export icmp6.o_CFLAGS_ADD=-Wno-sign-compare -export in6.o_CFLAGS_ADD=-Wno-sign-compare -export in6_src.o_CFLAGS_ADD=-Wno-sign-compare -export in6_cksum.o_CFLAGS_ADD=-Wno-sign-compare -export ip6_fw.o_CFLAGS_ADD=-Wno-sign-compare -export ip6_forward.o_CFLAGS_ADD=-Wno-sign-compare -export in6_ifattach.o_CFLAGS_ADD=-Wno-sign-compare -export ip6_input.o_CFLAGS_ADD=-Wno-sign-compare -export ip6_mroute.o_CFLAGS_ADD=-Wno-sign-compare -export ip6_output.o_CFLAGS_ADD=-Wno-sign-compare -export ipcomp_input.o_CFLAGS_ADD=-Wno-sign-compare -export ipcomp_output.o_CFLAGS_ADD=-Wno-sign-compare -export in6_proto.o_CFLAGS_ADD=-Wno-sign-compare -export mld6.o_CFLAGS_ADD=-Wno-sign-compare -export nd6.o_CFLAGS_ADD=-Wno-sign-compare -export nd6_nbr.o_CFLAGS_ADD=-Wno-sign-compare -export nd6_rtr.o_CFLAGS_ADD=-Wno-sign-compare -export raw_ip6.o_CFLAGS_ADD=-Wno-sign-compare -export route6.o_CFLAGS_ADD=-Wno-sign-compare -export scope6.o_CFLAGS_ADD=-Wno-sign-compare -export udp6_usrreq.o_CFLAGS_ADD=-Wno-sign-compare -export key.o_CFLAGS_ADD=-Wno-sign-compare -export keysock.o_CFLAGS_ADD=-Wno-sign-compare -export atp_write.o_CFLAGS_ADD=-Wno-sign-compare -export keydb.o_CFLAGS_ADD=-Wno-sign-compare -export 
des_setkey.o_CFLAGS_ADD=-Wno-sign-compare -export sys_socket.o_CFLAGS_ADD=-Wno-sign-compare -export sys_glue.o_CFLAGS_ADD=-Wno-sign-compare -export uipc_domain.o_CFLAGS_ADD=-Wno-sign-compare -export uipc_mbuf.o_CFLAGS_ADD=-Wno-sign-compare -export uipc_mbuf2.o_CFLAGS_ADD=-Wno-sign-compare -export uipc_socket.o_CFLAGS_ADD=-Wno-sign-compare -export uipc_socket2.o_CFLAGS_ADD=-Wno-sign-compare - include $(MakeInc_cmd) include $(MakeInc_def) -SETUP_SUBDIRS = \ - tools +SETUP_SUBDIRS = COMP_SUBDIRS = @@ -104,30 +22,24 @@ else export COMPOBJROOT=$(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT) endif -$(COMPOBJROOT)/doconf: - @make build_setup +MASTER_CPU_PER_SOC = $(SOURCE)/MASTER.$(ARCH_CONFIG_LC).$(MACHINE_CONFIG_LC) $(COMPOBJROOT)/$(BSD_KERNEL_CONFIG)/Makefile : $(SOURCE)/MASTER \ $(SOURCE)/MASTER.$(ARCH_CONFIG_LC) \ $(SOURCE)/Makefile.template \ $(SOURCE)/Makefile.$(ARCH_CONFIG_LC) \ $(SOURCE)/files \ - $(SOURCE)/files.$(ARCH_CONFIG_LC) \ - $(COMPOBJROOT)/doconf + $(SOURCE)/files.$(ARCH_CONFIG_LC) $(_v)(doconf_target=$(addsuffix /conf, $(TARGET)); \ $(MKDIR) $${doconf_target}; \ cd $${doconf_target}; \ rm -f $(notdir $?); \ cp $? $${doconf_target}; \ - $(COMPOBJROOT)/doconf -c -cpu $(ARCH_CONFIG_LC) -d $(TARGET)/$(BSD_KERNEL_CONFIG) $(BSD_KERNEL_CONFIG); \ + if [ -f $(MASTER_CPU_PER_SOC) ]; then cp $(MASTER_CPU_PER_SOC) $${doconf_target}; fi; \ + $(SRCROOT)/SETUP/config/doconf -c -cpu $(ARCH_CONFIG_LC) -soc $(MACHINE_CONFIG_LC) -d $(TARGET)/$(BSD_KERNEL_CONFIG) $(BSD_KERNEL_CONFIG); \ ); -.ORDER: $(COMPOBJROOT)/$(BSD_KERNEL_CONFIG)/Makefile - -do_setup_conf: $(COMPOBJROOT)/doconf \ - $(COMPOBJROOT)/$(BSD_KERNEL_CONFIG)/Makefile - -do_all: do_setup_conf +do_all: $(COMPOBJROOT)/$(BSD_KERNEL_CONFIG)/Makefile $(_v)next_source=$(subst conf/,,$(SOURCE)); \ ${MAKE} -C $(COMPOBJROOT)/$(BSD_KERNEL_CONFIG) \ MAKEFILES=$(TARGET)/$(BSD_KERNEL_CONFIG)/Makefile \ diff --git a/bsd/conf/Makefile.i386 b/bsd/conf/Makefile.i386 index 0b5f62979..a46354589 100644 --- a/bsd/conf/Makefile.i386 +++ b/bsd/conf/Makefile.i386 @@ -2,46 +2,12 @@ #BEGIN Machine dependent Makefile fragment for i386 ###################################################################### -# files to build with certain warnings turned off +# Files to build with certain warnings turned off dis_tables.o_CFLAGS_ADD += -Wno-cast-qual fbt_x86.o_CFLAGS_ADD += -Wno-cast-qual - -# Enable -Werror for i386 builds -CFLAGS+=$(WERROR) -CWARNFLAGS= $(filter-out -Wbad-function-cast, $(CWARNFLAGS_STD)) - -# Objects that don't compile cleanly: -OBJS_NO_WERROR = \ - fifo_vnops.o \ - aescrypt.o \ - aeskey.o \ - des_setkey.o \ - sha2.o \ - if_ethersubr.o \ - if_media.o \ - kext_net.o \ - dhcp_options.o \ - in_bootp.o \ - krpc_subr.o \ - ux_exception.o \ - unix_startup.o \ - randomdev.o \ - vnode_pager.o \ - dp_backing_file.o \ - vm_unix.o \ - mem.o \ - km.o \ - init_sysent.o \ - drv_dep.o \ - sdt_x86.o \ - dtrace_isa.o \ - aes_modes.o - - -OBJS_WERROR=$(filter-out $(OBJS_NO_WERROR),$(OBJS)) - -$(OBJS_WERROR): WERROR=-Werror +# sha256 Files to build with -DSHA256_USE_ASSEMBLY=1 +sha2.o_CFLAGS_ADD += -DSHA256_USE_ASSEMBLY=1 ###################################################################### #END Machine dependent Makefile fragment for i386 diff --git a/bsd/conf/Makefile.ppc b/bsd/conf/Makefile.ppc deleted file mode 100644 index 2dd4e88b3..000000000 --- a/bsd/conf/Makefile.ppc +++ /dev/null @@ -1,53 +0,0 @@ -###################################################################### -#BEGIN Machine dependent Makefile fragment for ppc 
-###################################################################### - -# files to build with certain warnings turned off -dis_tables.o_CFLAGS_ADD += -Wno-cast-qual -fbt_ppc.o_CFLAGS_ADD += -Wno-cast-qual -Wno-pointer-to-int-cast - - -# Enable -Werror for ppc builds -CFLAGS+=$(WERROR) -CWARNFLAGS= $(filter-out -Wbad-function-cast, $(CWARNFLAGS_STD)) - -# Objects that don't compile cleanly: -OBJS_NO_WERROR = \ - fifo_vnops.o \ - aescrypt.o \ - aeskey.o \ - des_setkey.o \ - sha2.o \ - shadow.o \ - if_ethersubr.o \ - if_media.o \ - kext_net.o \ - dhcp_options.o \ - in_bootp.o \ - krpc_subr.o \ - ux_exception.o \ - sysctl.o \ - unix_startup.o \ - randomdev.o \ - devtimer.o \ - vnode_pager.o \ - dp_backing_file.o \ - vm_unix.o \ - mem.o \ - km.o \ - at.o \ - drv_dep.o \ - fbt_ppc.o \ - sdt_ppc.o \ - dtrace_isa.o \ - dtrace_subr_ppc.o - - -OBJS_WERROR=$(filter-out $(OBJS_NO_WERROR),$(OBJS)) - -$(OBJS_WERROR): WERROR=-Werror - -###################################################################### -#END Machine dependent Makefile fragment for ppc -###################################################################### - diff --git a/bsd/conf/Makefile.template b/bsd/conf/Makefile.template index fdee45a3e..8691ce705 100644 --- a/bsd/conf/Makefile.template +++ b/bsd/conf/Makefile.template @@ -43,10 +43,81 @@ include $(MakeInc_def) # # XXX: CFLAGS # -CFLAGS+= -imacros meta_features.h -DARCH_PRIVATE -DKERNEL -DDRIVER_PRIVATE \ +CFLAGS+= -include meta_features.h -DARCH_PRIVATE -DDRIVER_PRIVATE \ -D_KERNEL_BUILD -DKERNEL_BUILD -DMACH_KERNEL -DBSD_BUILD \ -DBSD_KERNEL_PRIVATE -DLP64KERN=1 -DLP64_DEBUG=0 -I. $(CFLAGS_INLINE_CONFIG) +dp_backing_file.o_CFLAGS_ADD += -Wshorten-64-to-32 +ubc_subr.o_CFLAGS_ADD += -Wshorten-64-to-32 +vnode_pager.o_CFLAGS_ADD += -Wshorten-64-to-32 +vm_unix.o_CFLAGS_ADD += -Wshorten-64-to-32 + +# Objects that don't want -Wsign-compare +OBJS_NO_SIGN_COMPARE = \ + radix.o \ + route.o \ + rtsock.o \ + dhcp_options.o \ + igmp.o \ + in_cksum.o \ + ip_divert.o \ + ip_dummynet.o \ + ip_flow.o \ + ip_fw2.o \ + ip_fw2_compat.o \ + ip_icmp.o \ + ip_input.o \ + ip_mroute.o \ + ip_output.o \ + raw_ip.o \ + tcp_input.o \ + tcp_output.o \ + tcp_subr.o \ + tcp_usrreq.o \ + tcp_timer.o \ + udp_usrreq.o \ + ah_input.o \ + ah_core.o \ + ah_output.o \ + esp_core.o \ + esp_input.o \ + esp_output.o \ + esp_rijndael.o \ + ipsec.o \ + dest6.o \ + frag6.o \ + icmp6.o \ + in6.o \ + in6_src.o \ + in6_cksum.o \ + ip6_fw.o \ + ip6_forward.o \ + in6_ifattach.o \ + ip6_input.o \ + ip6_mroute.o \ + ip6_output.o \ + ipcomp_input.o \ + ipcomp_output.o \ + in6_proto.o \ + mld6.o \ + nd6.o \ + nd6_nbr.o \ + nd6_rtr.o \ + raw_ip6.o \ + route6.o \ + scope6.o \ + udp6_usrreq.o \ + key.o \ + keysock.o \ + keydb.o \ + des_setkey.o \ + uipc_mbuf.o \ + uipc_mbuf2.o \ + uipc_socket.o \ + uipc_socket2.o + +$(foreach file,$(OBJS_NO_SIGN_COMPARE),$(eval $(call add_perfile_cflags,$(file),-Wno-sign-compare))) + # # Directories for mig generated files # @@ -98,11 +169,11 @@ ${OBJS}: ${OBJSDEPS} LDOBJS = $(OBJS) -$(COMPONENT).o: $(LDOBJS) +$(COMPONENT).filelist: $(LDOBJS) @echo LDFILELIST $(COMPONENT) $(_v)( for obj in ${LDOBJS}; do \ echo $(TARGET)$(COMP_OBJ_DIR)/$(KERNEL_CONFIG)/$${obj}; \ - done; ) > $(COMPONENT).o + done; ) > $(COMPONENT).filelist MAKESYSCALLS = $(SRCROOT)/bsd/kern/makesyscalls.sh @@ -121,7 +192,7 @@ audit_kevents.c: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS) do_depend: do_all $(_v)${MD} -u Makedep -f -d `ls *.d`; -do_all: $(COMPONENT).o +do_all: $(COMPONENT).filelist do_build_all: do_depend diff 
--git a/bsd/conf/Makefile.x86_64 b/bsd/conf/Makefile.x86_64 index 83b41e2dd..29811299a 100644 --- a/bsd/conf/Makefile.x86_64 +++ b/bsd/conf/Makefile.x86_64 @@ -2,46 +2,12 @@ #BEGIN Machine dependent Makefile fragment for x86_64 ###################################################################### -# files to build with certain warnings turned off +# Files to build with certain warnings turned off dis_tables.o_CFLAGS_ADD += -Wno-cast-qual fbt_x86.o_CFLAGS_ADD += -Wno-cast-qual - -# Enable -Werror for x86_64 builds -CFLAGS+=$(WERROR) -CWARNFLAGS= $(filter-out -Wbad-function-cast, $(CWARNFLAGS_STD)) - -# Objects that don't compile cleanly: -OBJS_NO_WERROR = \ - fifo_vnops.o \ - aescrypt.o \ - aeskey.o \ - des_setkey.o \ - sha2.o \ - if_ethersubr.o \ - if_media.o \ - kext_net.o \ - dhcp_options.o \ - in_bootp.o \ - krpc_subr.o \ - ux_exception.o \ - unix_startup.o \ - randomdev.o \ - vnode_pager.o \ - dp_backing_file.o \ - vm_unix.o \ - mem.o \ - km.o \ - init_sysent.o \ - drv_dep.o \ - sdt_x86.o \ - dtrace_isa.o \ - aes_modes.o - - -OBJS_WERROR=$(filter-out $(OBJS_NO_WERROR),$(OBJS)) - -$(OBJS_WERROR): WERROR=-Werror +# sha256 Files to build with -DSHA256_USE_ASSEMBLY=1 +sha2.o_CFLAGS_ADD += -DSHA256_USE_ASSEMBLY=1 ###################################################################### #END Machine dependent Makefile fragment for x86_64 diff --git a/bsd/conf/files b/bsd/conf/files index 92ea7269a..b3a7b10c4 100644 --- a/bsd/conf/files +++ b/bsd/conf/files @@ -107,6 +107,7 @@ OPTIONS/ipfw2 optional ipfw2 OPTIONS/ipfirewall optional ipfirewall OPTIONS/ipv6firewall optional ipv6firewall OPTIONS/tcpdebug optional tcpdebug +OPTIONS/if_bridge optional if_bridge OPTIONS/faith optional faith OPTIONS/gif optional gif OPTIONS/netat optional netat @@ -124,8 +125,6 @@ OPTIONS/hfs optional hfs OPTIONS/mfs optional mfs OPTIONS/fdesc optional fdesc OPTIONS/fifo optional fifo -OPTIONS/nullfs optional nullfs -OPTIONS/union optional union OPTIONS/devfs optional devfs OPTIONS/crypto optional crypto OPTIONS/allcrypto optional allcrypto @@ -183,13 +182,7 @@ bsd/vfs/vfs_fsevents.c standard bsd/miscfs/deadfs/dead_vnops.c standard bsd/miscfs/devfs/devfs_fdesc_support.c optional fdesc bsd/miscfs/fifofs/fifo_vnops.c optional fifo sockets -bsd/miscfs/nullfs/null_subr.c optional nullfs -bsd/miscfs/nullfs/null_vfsops.c optional nullfs -bsd/miscfs/nullfs/null_vnops.c optional nullfs bsd/miscfs/specfs/spec_vnops.c standard -bsd/miscfs/union/union_subr.c optional union -bsd/miscfs/union/union_vfsops.c optional union -bsd/miscfs/union/union_vnops.c optional union bsd/miscfs/devfs/devfs_tree.c optional devfs bsd/miscfs/devfs/devfs_vnops.c optional devfs @@ -199,10 +192,10 @@ bsd/kern/decmpfs.c standard bsd/net/bpf.c optional bpfilter bsd/net/bpf_filter.c optional bpfilter +bsd/net/if_bridge.c optional if_bridge +bsd/net/bridgestp.c optional if_bridge bsd/net/bsd_comp.c optional ppp_bsdcomp bsd/net/if.c optional networking -bsd/net/if_atmsubr.c optional atm -bsd/net/if_disc.c optional disc bsd/net/init.c optional sockets bsd/net/dlil.c optional networking bsd/net/ether_if_module.c optional ether @@ -210,7 +203,7 @@ bsd/net/ether_at_pr_module.c optional ether netat bsd/net/ether_inet_pr_module.c optional ether inet bsd/net/ether_inet6_pr_module.c optional ether inet6 bsd/net/if_loop.c optional loop -bsd/net/if_mib.c optional netmibs +bsd/net/if_mib.c optional networking bsd/net/if_sl.c optional sl bsd/net/if_tun.c optional tun bsd/net/if_vlan.c optional vlan @@ -224,9 +217,10 @@ bsd/net/raw_cb.c optional networking 
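The Makefile.i386 and Makefile.x86_64 fragments above now build sha2.o with -DSHA256_USE_ASSEMBLY=1, selecting the sha256.s/sha256nossse3.s implementations added to the file lists. A sketch of how such a flag is typically consumed; the symbol names are placeholders, since the sha2.c side of the wiring is not shown in this patch:

#include <stdint.h>

/* Hypothetical dispatch on the build flag; the real hook names in
   sha2.c may differ. */
#if defined(SHA256_USE_ASSEMBLY) && SHA256_USE_ASSEMBLY
extern void sha256_compress_asm(uint32_t state[8], const uint8_t *block);
#define SHA256_COMPRESS sha256_compress_asm
#else
extern void sha256_compress_c(uint32_t state[8], const uint8_t *block);
#define SHA256_COMPRESS sha256_compress_c
#endif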
bsd/net/raw_usrreq.c optional networking bsd/net/route.c optional networking bsd/net/rtsock.c optional networking +bsd/net/netsrc.c optional networking +bsd/net/ntstat.c optional networking bsd/net/slcompress.c optional ppp bsd/net/slcompress.c optional sl -bsd/net/if_dummy.c optional dummy bsd/net/if_gif.c optional gif bsd/net/if_stf.c optional stf bsd/net/net_osdep.c optional sockets @@ -243,19 +237,21 @@ bsd/net/pf_norm.c optional pf bsd/net/pf_osfp.c optional pf bsd/net/pf_ruleset.c optional pf bsd/net/pf_table.c optional pf +bsd/net/if_llreach.c optional networking -bsd/netinet/if_atm.c optional atm bsd/netinet/igmp.c optional inet bsd/netinet/in.c optional inet bsd/netinet/in_dhcp.c optional inet bsd/netinet/dhcp_options.c optional inet bsd/netinet/in_arp.c optional inet +bsd/netinet/in_mcast.c optional inet bsd/netinet/in_pcb.c optional inet +bsd/netinet/in_pcblist.c optional inet bsd/netinet/in_proto.c optional inet bsd/netinet/in_rmx.c optional inet +bsd/netinet/in_tclass.c optional inet bsd/netinet/ip_divert.c optional ipdivert bsd/netinet/ip_dummynet.c optional dummynet -bsd/netinet/ip_flow.c optional inet bsd/netinet/ip_fw2.c optional ipfw2 bsd/netinet/ip_fw2_compat.c optional ipfw2 bsd/netinet/ip_icmp.c optional inet @@ -271,6 +267,8 @@ bsd/netinet/tcp_sack.c optional inet bsd/netinet/tcp_subr.c optional inet bsd/netinet/tcp_timer.c optional inet bsd/netinet/tcp_usrreq.c optional inet +bsd/netinet/tcp_newreno.c optional inet +bsd/netinet/tcp_ledbat.c optional inet bsd/netinet/udp_usrreq.c optional inet bsd/netinet/in_gif.c optional gif inet bsd/netinet/ip_ecn.c optional inet @@ -300,8 +298,8 @@ bsd/netinet6/in6_src.c optional inet6 bsd/netinet6/ipcomp_core.c optional ipsec bsd/netinet6/ipcomp_input.c optional ipsec bsd/netinet6/ipcomp_output.c optional ipsec +bsd/netinet6/in6_mcast.c optional inet6 bsd/netinet6/in6_pcb.c optional inet6 -bsd/netinet6/in6_prefix.c optional inet6 bsd/netinet6/in6_proto.c optional inet6 bsd/netinet6/in6_rmx.c optional inet6 bsd/netinet6/mld6.c optional inet6 @@ -313,6 +311,7 @@ bsd/netinet6/route6.c optional inet6 bsd/netinet6/scope6.c optional inet6 bsd/netinet6/udp6_output.c optional inet6 bsd/netinet6/udp6_usrreq.c optional inet6 +bsd/netinet6/ip6_id.c optional inet6 bsd/netkey/key.c optional ipsec bsd/netkey/key_debug.c optional ipsec @@ -443,7 +442,7 @@ bsd/hfs/hfs_vfsutils.c optional hfs bsd/hfs/hfs_vnops.c optional hfs bsd/hfs/hfs_xattr.c optional hfs bsd/hfs/MacOSStubs.c optional hfs -bsd/hfs/cprotect.c optional hfs +bsd/hfs/hfs_cprotect.c optional hfs bsd/hfs/rangelist.c optional hfs bsd/hfs/hfscommon/BTree/BTree.c optional hfs bsd/hfs/hfscommon/BTree/BTreeAllocate.c optional hfs @@ -457,6 +456,7 @@ bsd/hfs/hfscommon/Catalog/FileIDsServices.c optional hfs bsd/hfs/hfscommon/Misc/BTreeWrapper.c optional hfs bsd/hfs/hfscommon/Misc/FileExtentMapping.c optional hfs bsd/hfs/hfscommon/Misc/VolumeAllocation.c optional hfs +bsd/hfs/hfscommon/Misc/HybridAllocator.c optional hfs bsd/hfs/hfscommon/Unicode/UnicodeWrappers.c optional hfs bsd/security/audit/audit.c optional config_audit @@ -498,6 +498,7 @@ bsd/kern/kern_malloc.c standard bsd/kern/kern_mman.c standard bsd/kern/kern_panicinfo.c optional panic_info bsd/kern/kern_physio.c standard +bsd/kern/kern_priv.c standard bsd/kern/kern_proc.c standard bsd/kern/kern_prot.c standard bsd/kern/kern_resource.c standard @@ -556,6 +557,8 @@ bsd/kern/kpi_socketfilter.c optional sockets bsd/kern/pthread_support.c optional psynch bsd/kern/pthread_synch.c standard bsd/kern/proc_info.c standard 
+bsd/kern/process_policy.c standard +bsd/kern/vm_pressure.c standard bsd/kern/socket_info.c optional sockets bsd/vm/vnode_pager.c standard @@ -585,3 +588,4 @@ bsd/dev/dtrace/profile_prvd.c optional config_dtrace bsd/dev/dtrace/fasttrap.c optional config_dtrace bsd/kern/imageboot.c optional config_imageboot + diff --git a/bsd/conf/files.i386 b/bsd/conf/files.i386 index 424cc3e3e..331f7202d 100644 --- a/bsd/conf/files.i386 +++ b/bsd/conf/files.i386 @@ -14,10 +14,19 @@ bsd/dev/i386/systemcalls.c standard bsd/dev/i386/sysctl.c standard bsd/dev/i386/unix_signal.c standard bsd/dev/i386/munge.s standard -bsd/crypto/aes/i386/aes_x86_v2.s optional crypto -bsd/crypto/aes/i386/aes_modes.c optional crypto +bsd/crypto/aes/i386/AES.s optional crypto +bsd/crypto/aes/i386/aes_modes_asm.s optional crypto +bsd/crypto/aes/i386/aes_modes_hw.s optional crypto +bsd/crypto/aes/i386/aes_key_hw.s optional crypto +bsd/crypto/aes/i386/aes_crypt_hw.s optional crypto +bsd/crypto/aes/i386/aesxts_asm.s optional crypto +bsd/crypto/aes/i386/aesxts.c optional crypto +bsd/crypto/sha2/intel/sha256.s optional crypto +bsd/crypto/sha2/intel/sha256nossse3.s optional crypto + +# Lightly ifdef'd to support K64 DTrace bsd/dev/i386/dtrace_isa.c optional config_dtrace bsd/dev/i386/dtrace_subr_x86.c optional config_dtrace bsd/dev/i386/fbt_x86.c optional config_dtrace @@ -26,6 +35,11 @@ bsd/dev/i386/fasttrap_isa.c optional config_dtrace bsd/dev/i386/instr_size.c optional config_dtrace bsd/dev/i386/dis_tables.c optional config_dtrace +# Support for identifying MACF calouts with locks held +bsd/kern/policy_check.c optional config_macf + bsd/kern/bsd_stubs.c standard bsd/netinet/in_cksum.c optional inet + + diff --git a/bsd/conf/files.ppc b/bsd/conf/files.ppc deleted file mode 100644 index 57e8870a7..000000000 --- a/bsd/conf/files.ppc +++ /dev/null @@ -1,34 +0,0 @@ -OPTIONS/show_space optional show_space -OPTIONS/gdb optional gdb -OPTIONS/iplmeas optional iplmeas - -bsd/netinet/in_cksum.c optional inet - -bsd/dev/ppc/conf.c standard -bsd/dev/ppc/cons.c standard -bsd/dev/ppc/mem.c standard -bsd/dev/ppc/unix_signal.c standard -bsd/dev/ppc/ffs.s standard -bsd/dev/ppc/memmove.c standard -bsd/dev/ppc/machdep.c standard -bsd/dev/ppc/kern_machdep.c standard -bsd/dev/ppc/stubs.c standard -bsd/dev/ppc/systemcalls.c standard -bsd/dev/ppc/km.c standard -bsd/dev/ppc/xsumas.s standard -bsd/dev/ppc/sysctl.c standard -bsd/dev/ppc/munge.s standard -bsd/crypto/aes/ppc/aescrypt.c optional crypto -bsd/crypto/aes/ppc/aeskey.c optional crypto -bsd/crypto/aes/ppc/aestab.c optional crypto - - -bsd/dev/ppc/dtrace_isa.c optional config_dtrace -bsd/dev/ppc/dtrace_subr_ppc.c optional config_dtrace -bsd/dev/ppc/fbt_ppc.c optional config_dtrace -bsd/dev/ppc/sdt_ppc.c optional config_dtrace -bsd/dev/ppc/fasttrap_isa.c optional config_dtrace - -bsd/kern/bsd_stubs.c standard - - diff --git a/bsd/conf/files.x86_64 b/bsd/conf/files.x86_64 index 322174554..fcb3be604 100644 --- a/bsd/conf/files.x86_64 +++ b/bsd/conf/files.x86_64 @@ -15,9 +15,16 @@ bsd/dev/i386/sysctl.c standard bsd/dev/i386/unix_signal.c standard bsd/dev/x86_64/munge.s standard -bsd/crypto/aes/gen/aescrypt.c optional crypto -bsd/crypto/aes/gen/aeskey.c optional crypto -bsd/crypto/aes/gen/aestab.c optional crypto +bsd/crypto/aes/i386/AES.s optional crypto +bsd/crypto/aes/i386/aes_modes_asm.s optional crypto +bsd/crypto/aes/i386/aes_modes_hw.s optional crypto +bsd/crypto/aes/i386/aes_key_hw.s optional crypto +bsd/crypto/aes/i386/aes_crypt_hw.s optional crypto +bsd/crypto/aes/i386/aesxts_asm.s optional crypto 
+bsd/crypto/aes/i386/aesxts.c optional crypto + +bsd/crypto/sha2/intel/sha256.s optional crypto +bsd/crypto/sha2/intel/sha256nossse3.s optional crypto # Lightly ifdef'd to support K64 DTrace bsd/dev/i386/dtrace_isa.c optional config_dtrace @@ -28,6 +35,9 @@ bsd/dev/i386/fasttrap_isa.c optional config_dtrace bsd/dev/i386/instr_size.c optional config_dtrace bsd/dev/i386/dis_tables.c optional config_dtrace +# Support for identifying MACF calouts with locks held +bsd/kern/policy_check.c optional config_macf + bsd/kern/bsd_stubs.c standard bsd/netinet/in_cksum.c optional inet diff --git a/bsd/conf/param.c b/bsd/conf/param.c index 9aafb343c..95c01ffb5 100644 --- a/bsd/conf/param.c +++ b/bsd/conf/param.c @@ -91,7 +91,8 @@ int maxprocperuid = NPROC/2; int nprocs = 0; /* XXX */ //#define NTEXT (80 + NPROC / 8) /* actually the object cache */ -int desiredvnodes = CONFIG_VNODES; +int desiredvnodes = 0; /* desiredvnodes is set explicitly in unix_startup.c */ +uint32_t kern_maxvnodes = 0; /* global, to be read from the device tree */ #define MAXFILES (OPEN_MAX + 2048) int maxfiles = MAXFILES; diff --git a/bsd/conf/tools/Makefile b/bsd/conf/tools/Makefile deleted file mode 100644 index 4f9ccd553..000000000 --- a/bsd/conf/tools/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -SETUP_SUBDIRS = doconf - -COMP_SUBDIRS = doconf - -INST_SUBDIRS = \ - - -setup_build_all: - @echo "[ $(SOURCE) ] make setup_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -do_build_all: - @echo "[ $(SOURCE) ] make do_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -setup_build_install: - @echo "[ $(SOURCE) ] make setup_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -do_build_install: - @echo "[ $(SOURCE) ] make do_build_all $(KERNEL_CONFIG) $(ARCH_CONFIG) $(TARGET)" - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/bsd/conf/tools/doconf/Makefile b/bsd/conf/tools/doconf/Makefile deleted file mode 100644 index 7794a4ceb..000000000 --- a/bsd/conf/tools/doconf/Makefile +++ /dev/null @@ -1,49 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -COMP_SUBDIRS = \ - -INST_SUBDIRS = \ - - -# -# Who and where -# -BINDIR= -ifneq ($(MACHINE_CONFIG), DEFAULT) -DSTDIR= $(strip $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)_$(MACHINE_CONFIG)/$(COMPONENT)/) -else -DSTDIR= $(strip $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT)/) -endif -PROGRAM= $(DSTDIR)doconf - -# -# How to install it -# -IFLAGS= -c -m 555 - -$(PROGRAM): $(DSTDIR)% : $(SOURCE)%.csh - @-$(RM) $(RMFLAGS) $(notdir $(PROGRAM)).VERS - @sed -e "s/#PROGRAM.*/#`vers_string $(notdir $(PROGRAM))`/" \ - < $< >$(notdir $(PROGRAM)).VERS; - @install $(IFLAGS) $(notdir $(PROGRAM)).VERS $(PROGRAM); - @-$(RM) $(RMFLAGS) $(notdir $(PROGRAM)).VERS; - -do_build_setup: $(PROGRAM) - -do_build_all: - -setup_build_install: - -do_build_install: - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/bsd/crypto/Makefile b/bsd/crypto/Makefile index 0af469f52..ab0c4b986 100644 --- a/bsd/crypto/Makefile +++ b/bsd/crypto/Makefile @@ -16,18 +16,16 @@ INSTINC_SUBDIRS = \ sha2 
-INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ + aes INSTINC_SUBDIRS_X86_64 = \ + aes INSTINC_SUBDIRS_ARM = \ EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/crypto/aes/Assert.c b/bsd/crypto/aes/Assert.c new file mode 100644 index 000000000..5ba9c4472 --- /dev/null +++ b/bsd/crypto/aes/Assert.c @@ -0,0 +1,34 @@ +/* This module exists solely to check compile-time assertions. It should be + compiled when building the project, and building should be terminated if + errors are encountered. However, any object it produces need not be + included in the build. +*/ + + +#include <stddef.h> + +#include "crypto/aes.h" +#include "Context.h" + +/* Declare CheckAssertion so that if any of the declarations below differ + from it, the compiler will report an error. +*/ +extern char CheckAssertion[1]; + +/* Ensure that ContextKey is the offset of the ks member of the AES context + structures. +*/ +extern char CheckAssertion[ContextKey == offsetof(aes_encrypt_ctx, ks)]; +extern char CheckAssertion[ContextKey == offsetof(aes_decrypt_ctx, ks)]; + /* If these assertions fail, change the definition of ContextKey in + Context.h to match the offset of the ks field. + */ + +/* Ensure that ContextKeyLength is the offset of the inf member of the AES + context structures. +*/ +extern char CheckAssertion[ContextKeyLength == offsetof(aes_encrypt_ctx, inf)]; +extern char CheckAssertion[ContextKeyLength == offsetof(aes_decrypt_ctx, inf)]; + /* If these assertions fail, change the definition of ContextKeyLength in + Context.h to match the offset of the inf field. + */ diff --git a/bsd/crypto/aes/Makefile b/bsd/crypto/aes/Makefile index 026261c65..6b96dbd34 100644 --- a/bsd/crypto/aes/Makefile +++ b/bsd/crypto/aes/Makefile @@ -9,18 +9,16 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ + i386 INSTINC_SUBDIRS_X86_64 = \ + i386 INSTINC_SUBDIRS_ARM = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/crypto/aes/aes.h b/bsd/crypto/aes/aes.h old mode 100644 new mode 100755 index eaba0a692..49c845da6 --- a/bsd/crypto/aes/aes.h +++ b/bsd/crypto/aes/aes.h @@ -80,14 +80,22 @@ typedef unsigned int aes_32t; #endif +#if 0 // defined (__i386__) || defined (__x86_64__) + +/* + looks like no other code for (i386/x86_64) is using the following definitions any more. + I comment this out, so the C code in the directory gen/ can be used to compile for test/development purpose. + Note : this is not going to change anything in the i386/x86_64 kernel. + (source code in i386/, mostly in assembly, does not reference to this header file.) + + cclee 10-20-2010 +*/ /* the character array 'inf' in the following structures is used */ /* to hold AES context information. This AES code uses cx->inf.b[0] */ /* to hold the number of rounds multiplied by 16. 
The other three */ /* elements can be used by code that implements additional modes */ -#if defined (__i386__) - #if defined( AES_ERR_CHK ) #define aes_rval int_ret #else @@ -166,7 +174,7 @@ aes_rval aes_encrypt_key256(const unsigned char *key, aes_encrypt_ctx cx[1]); aes_rval aes_encrypt_key(const unsigned char *key, int key_len, aes_encrypt_ctx cx[1]); #endif -#if defined (__i386__) +#if defined (__i386__) || defined (__x86_64__) aes_rval aes_encrypt(const unsigned char *in, unsigned char *out, const aes_encrypt_ctx cx[1]); #endif @@ -193,7 +201,7 @@ aes_rval aes_decrypt_key256(const unsigned char *key, aes_decrypt_ctx cx[1]); aes_rval aes_decrypt_key(const unsigned char *key, int key_len, aes_decrypt_ctx cx[1]); #endif -#if defined (__i386__) +#if defined (__i386__) || defined (__x86_64__) aes_rval aes_decrypt(const unsigned char *in, unsigned char *out, const aes_decrypt_ctx cx[1]); #endif diff --git a/bsd/crypto/aes/gen/Makefile b/bsd/crypto/aes/gen/Makefile index 7ea225c10..d32c71c39 100644 --- a/bsd/crypto/aes/gen/Makefile +++ b/bsd/crypto/aes/gen/Makefile @@ -9,14 +9,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ PRIVATE_DATAFILES = \ diff --git a/bsd/crypto/aes/gen/aesopt.h b/bsd/crypto/aes/gen/aesopt.h index 2b78eb920..fc28e4a48 100644 --- a/bsd/crypto/aes/gen/aesopt.h +++ b/bsd/crypto/aes/gen/aesopt.h @@ -585,12 +585,12 @@ #elif (ALGORITHM_BYTE_ORDER == PLATFORM_BYTE_ORDER) -#define word_in(x,c) (*((aes_32t*)(x)+(c))) +#define word_in(x,c) (*((const aes_32t*)(x)+(c))) #define word_out(x,c,v) (*((aes_32t*)(x)+(c)) = (v)) #else -#define word_in(x,c) aes_sw32(*((aes_32t*)(x)+(c))) +#define word_in(x,c) aes_sw32(*((const aes_32t*)(x)+(c))) #define word_out(x,c,v) (*((aes_32t*)(x)+(c)) = aes_sw32(v)) #endif diff --git a/bsd/crypto/aes/i386/AES.s b/bsd/crypto/aes/i386/AES.s new file mode 100644 index 000000000..9bf440a68 --- /dev/null +++ b/bsd/crypto/aes/i386/AES.s @@ -0,0 +1,143 @@ +/* AES.s -- Core AES routines for Intel processors. + + Written by Eric Postpischil, January 30, 2008. +*/ + + +/* We build these AES routines as a single module because the routines refer + to labels in Data.s and it is easier and faster to refer to them as local + labels. In my implementations of AES for CommonCrypto, both i386 and + x86_64 use position-independent code. For this in-kernel implementation, + i386 has been converted to absolute addressing, but x86_64 still uses PIC. + + A local label can be referred to with position-independent assembler + expressions such as "label-base(register)", where <base> is a local label + whose address has been loaded into <register>. (On i386, this is typically + done with the idiom of a call to the next instruction and a pop of that + return address into a register.) Without local labels, the references must + be done using spaces for addresses of "lazy symbols" that are filled in by + the dynamic loader and loaded by the code that wants the address. + + So the various routines in other files are assembled here via #include + directives. +*/ +#include "Data.s" + + +#define TableSize (256*4) + /* Each of the arrays defined in Data.s except for the round constants + in _AESRcon is composed of four tables of 256 entries of four bytes + each. TableSize is the number of bytes in one of those four tables. + */ + + +// Include constants describing the AES context structures. 
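Assert.c, shown above, guards the context offsets that this assembly depends on with a compile-time assertion: a second declaration of CheckAssertion whose size is a boolean expression conflicts with the earlier size-1 declaration whenever the expression is false. Restated in isolation with a stand-in structure (the real ones are aes_encrypt_ctx and aes_decrypt_ctx):

#include <stddef.h>

/* Stand-in only; the layout is chosen so ks sits at offset 0, as
   Assert.c requires of the real AES contexts via ContextKey == 0. */
struct example_ctx { unsigned long ks[60]; unsigned int inf; };

extern char CheckAssertion[1];
extern char CheckAssertion[offsetof(struct example_ctx, ks) == 0];
/* A false condition would declare char CheckAssertion[0], which fails
   to compile against the size-1 declaration above. */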
+#include "Context.h" + + +/* Define a macro to select a value based on architecture. This reduces + some of the architecture conditionalization later in the source. +*/ +#if defined __i386__ + #define Arch(i386, x86_64) i386 +#elif defined __x86_64__ + #define Arch(i386, x86_64) x86_64 +#endif + + +// Define an instruction for moving pointers. +#define movp Arch(movd, movd) + // Latter argument should be "movq", but the assembler uses "movd". + + +/* Rename the general registers. This makes it easier to keep track of them + and provides names for the "whole register" that are uniform between i386 + and x86_64. +*/ +#if defined __i386__ + #define r0 %eax // Available for any use. + #define r1 %ecx // Available for any use, some special purposes (loop). + #define r2 %edx // Available for any use. + #define r3 %ebx // Must be preserved by called routine. + #define r4 %esp // Stack pointer. + #define r5 %ebp // Frame pointer, must preserve, no bare indirect. + #define r6 %esi // Must be preserved by called routine. + #define r7 %edi // Must be preserved by called routine. +#elif defined __x86_64__ + #define r0 %rax // Available for any use. + #define r1 %rcx // Available for any use. + #define r2 %rdx // Available for any use. + #define r3 %rbx // Must be preserved by called routine. + #define r4 %rsp // Stack pointer. + #define r5 %rbp // Frame pointer. Must be preserved by called routine. + #define r6 %rsi // Available for any use. + #define r7 %rdi // Available for any use. + #define r8 %r8 // Available for any use. + #define r9 %r9 // Available for any use. + #define r10 %r10 // Available for any use. + #define r11 %r11 // Available for any use. + #define r12 %r12 // Must be preserved by called routine. + #define r13 %r13 // Must be preserved by called routine. + #define r14 %r14 // Must be preserved by called routine. + #define r15 %r15 // Must be preserved by called routine. +#else + #error "Unknown architecture." +#endif + +// Define names for parts of registers. + +#define r0d %eax // Low 32 bits of r0. +#define r1d %ecx // Low 32 bits of r1. +#define r2d %edx // Low 32 bits of r2. +#define r3d %ebx // Low 32 bits of r3. +#define r5d %ebp // Low 32 bits of r5. +#define r6d %esi // Low 32 bits of r6. +#define r7d %edi // Low 32 bits of r7. +#define r8d %r8d // Low 32 bits of r8. +#define r9d %r9d // Low 32 bits of r9. +#define r11d %r11d // Low 32 bits of r11. + +#define r0l %al // Low byte of r0. +#define r1l %cl // Low byte of r1. +#define r2l %dl // Low byte of r2. +#define r3l %bl // Low byte of r3. +#define r5l %bpl // Low byte of r5. + +#define r0h %ah // Second lowest byte of r0. +#define r1h %ch // Second lowest byte of r1. +#define r2h %dh // Second lowest byte of r2. +#define r3h %bh // Second lowest byte of r3. + + + .text + + +// Define encryption routine, _AESEncryptWithExpandedKey +#define Select 0 +#include "EncryptDecrypt.s" +#undef Select + + +// Define decryption routine, _AESDecryptWithExpandedKey +#define Select 1 +#include "EncryptDecrypt.s" +#undef Select + +// Define encryption routine, _AESEncryptWithExpandedKey +#define Select 2 +#include "EncryptDecrypt.s" +#undef Select + + +// Define decryption routine, _AESDecryptWithExpandedKey +#define Select 3 +#include "EncryptDecrypt.s" +#undef Select + + +// Define key expansion routine for encryption, _AESExpandKeyForEncryption. +#include "ExpandKeyForEncryption.s" + + +// Define key expansion for decryption routine, _AESExpandKeyForDecryption. 
+#include "ExpandKeyForDecryption.s" diff --git a/bsd/crypto/aes/i386/Context.h b/bsd/crypto/aes/i386/Context.h new file mode 100644 index 000000000..f53cb9514 --- /dev/null +++ b/bsd/crypto/aes/i386/Context.h @@ -0,0 +1,9 @@ +// Define byte offset of key within context structure. +#define ContextKey 0 + +/* Define byte offset of key length within context structure. The number + stored there is the number of bytes from the start of the first round key + to the start of the last round key. That is 16 less than the number of + bytes in the entire key. +*/ +#define ContextKeyLength 240 diff --git a/bsd/crypto/aes/i386/Data.mk b/bsd/crypto/aes/i386/Data.mk new file mode 100644 index 000000000..4b55d630f --- /dev/null +++ b/bsd/crypto/aes/i386/Data.mk @@ -0,0 +1,30 @@ +default: + @echo "This makefile builds Data.s, which contains constant data for the" + @echo "AES implementation. This file does not normally need to be rebuilt," + @echo "so it is checked into the source code repository. It should be" + @echo "changed only when the implementation changes and needs data in a" + @echo "different format. (This file can also build a C version, Data.c," + @echo "but that is not currently in use.)" + @echo "" + @echo "To rebuild the file(s), execute \"make -f Data.mk all\"." + +.PHONY: all clean +Targets = Data.s +all: $(Targets) + +CFLAGS += -O3 -std=c99 -Wmost -Werror + +.INTERMEDIATE: MakeData +MakeData: MakeData.c + +# Do not leave bad output files if the build fails. +.DELETE_ON_ERROR: $(Targets) + +Data.c: MakeData + ./$< >$@ C + +Data.s: MakeData + ./$< >$@ Intel + +clean: + -rm $(Targets) diff --git a/bsd/crypto/aes/i386/Data.s b/bsd/crypto/aes/i386/Data.s new file mode 100644 index 000000000..d330462d0 --- /dev/null +++ b/bsd/crypto/aes/i386/Data.s @@ -0,0 +1,5196 @@ +// This file was generated by MakeData.c. + + + .const + + +// Round constants. + .globl _AESRcon + .private_extern _AESRcon +_AESRcon: + .byte 0 // Not used, included for indexing simplicity. + .byte 0x01 + .byte 0x02 + .byte 0x04 + .byte 0x08 + .byte 0x10 + .byte 0x20 + .byte 0x40 + .byte 0x80 + .byte 0x1b + .byte 0x36 + + +// Tables for InvMixColumn. + .globl _AESInvMixColumnTable + .private_extern _AESInvMixColumnTable + .align 2 +_AESInvMixColumnTable: + // Table 0. 
+ .long 0x00000000 + .long 0x0b0d090e + .long 0x161a121c + .long 0x1d171b12 + .long 0x2c342438 + .long 0x27392d36 + .long 0x3a2e3624 + .long 0x31233f2a + .long 0x58684870 + .long 0x5365417e + .long 0x4e725a6c + .long 0x457f5362 + .long 0x745c6c48 + .long 0x7f516546 + .long 0x62467e54 + .long 0x694b775a + .long 0xb0d090e0 + .long 0xbbdd99ee + .long 0xa6ca82fc + .long 0xadc78bf2 + .long 0x9ce4b4d8 + .long 0x97e9bdd6 + .long 0x8afea6c4 + .long 0x81f3afca + .long 0xe8b8d890 + .long 0xe3b5d19e + .long 0xfea2ca8c + .long 0xf5afc382 + .long 0xc48cfca8 + .long 0xcf81f5a6 + .long 0xd296eeb4 + .long 0xd99be7ba + .long 0x7bbb3bdb + .long 0x70b632d5 + .long 0x6da129c7 + .long 0x66ac20c9 + .long 0x578f1fe3 + .long 0x5c8216ed + .long 0x41950dff + .long 0x4a9804f1 + .long 0x23d373ab + .long 0x28de7aa5 + .long 0x35c961b7 + .long 0x3ec468b9 + .long 0x0fe75793 + .long 0x04ea5e9d + .long 0x19fd458f + .long 0x12f04c81 + .long 0xcb6bab3b + .long 0xc066a235 + .long 0xdd71b927 + .long 0xd67cb029 + .long 0xe75f8f03 + .long 0xec52860d + .long 0xf1459d1f + .long 0xfa489411 + .long 0x9303e34b + .long 0x980eea45 + .long 0x8519f157 + .long 0x8e14f859 + .long 0xbf37c773 + .long 0xb43ace7d + .long 0xa92dd56f + .long 0xa220dc61 + .long 0xf66d76ad + .long 0xfd607fa3 + .long 0xe07764b1 + .long 0xeb7a6dbf + .long 0xda595295 + .long 0xd1545b9b + .long 0xcc434089 + .long 0xc74e4987 + .long 0xae053edd + .long 0xa50837d3 + .long 0xb81f2cc1 + .long 0xb31225cf + .long 0x82311ae5 + .long 0x893c13eb + .long 0x942b08f9 + .long 0x9f2601f7 + .long 0x46bde64d + .long 0x4db0ef43 + .long 0x50a7f451 + .long 0x5baafd5f + .long 0x6a89c275 + .long 0x6184cb7b + .long 0x7c93d069 + .long 0x779ed967 + .long 0x1ed5ae3d + .long 0x15d8a733 + .long 0x08cfbc21 + .long 0x03c2b52f + .long 0x32e18a05 + .long 0x39ec830b + .long 0x24fb9819 + .long 0x2ff69117 + .long 0x8dd64d76 + .long 0x86db4478 + .long 0x9bcc5f6a + .long 0x90c15664 + .long 0xa1e2694e + .long 0xaaef6040 + .long 0xb7f87b52 + .long 0xbcf5725c + .long 0xd5be0506 + .long 0xdeb30c08 + .long 0xc3a4171a + .long 0xc8a91e14 + .long 0xf98a213e + .long 0xf2872830 + .long 0xef903322 + .long 0xe49d3a2c + .long 0x3d06dd96 + .long 0x360bd498 + .long 0x2b1ccf8a + .long 0x2011c684 + .long 0x1132f9ae + .long 0x1a3ff0a0 + .long 0x0728ebb2 + .long 0x0c25e2bc + .long 0x656e95e6 + .long 0x6e639ce8 + .long 0x737487fa + .long 0x78798ef4 + .long 0x495ab1de + .long 0x4257b8d0 + .long 0x5f40a3c2 + .long 0x544daacc + .long 0xf7daec41 + .long 0xfcd7e54f + .long 0xe1c0fe5d + .long 0xeacdf753 + .long 0xdbeec879 + .long 0xd0e3c177 + .long 0xcdf4da65 + .long 0xc6f9d36b + .long 0xafb2a431 + .long 0xa4bfad3f + .long 0xb9a8b62d + .long 0xb2a5bf23 + .long 0x83868009 + .long 0x888b8907 + .long 0x959c9215 + .long 0x9e919b1b + .long 0x470a7ca1 + .long 0x4c0775af + .long 0x51106ebd + .long 0x5a1d67b3 + .long 0x6b3e5899 + .long 0x60335197 + .long 0x7d244a85 + .long 0x7629438b + .long 0x1f6234d1 + .long 0x146f3ddf + .long 0x097826cd + .long 0x02752fc3 + .long 0x335610e9 + .long 0x385b19e7 + .long 0x254c02f5 + .long 0x2e410bfb + .long 0x8c61d79a + .long 0x876cde94 + .long 0x9a7bc586 + .long 0x9176cc88 + .long 0xa055f3a2 + .long 0xab58faac + .long 0xb64fe1be + .long 0xbd42e8b0 + .long 0xd4099fea + .long 0xdf0496e4 + .long 0xc2138df6 + .long 0xc91e84f8 + .long 0xf83dbbd2 + .long 0xf330b2dc + .long 0xee27a9ce + .long 0xe52aa0c0 + .long 0x3cb1477a + .long 0x37bc4e74 + .long 0x2aab5566 + .long 0x21a65c68 + .long 0x10856342 + .long 0x1b886a4c + .long 0x069f715e + .long 0x0d927850 + .long 0x64d90f0a + .long 0x6fd40604 + .long 0x72c31d16 + 
.long 0x79ce1418 + .long 0x48ed2b32 + .long 0x43e0223c + .long 0x5ef7392e + .long 0x55fa3020 + .long 0x01b79aec + .long 0x0aba93e2 + .long 0x17ad88f0 + .long 0x1ca081fe + .long 0x2d83bed4 + .long 0x268eb7da + .long 0x3b99acc8 + .long 0x3094a5c6 + .long 0x59dfd29c + .long 0x52d2db92 + .long 0x4fc5c080 + .long 0x44c8c98e + .long 0x75ebf6a4 + .long 0x7ee6ffaa + .long 0x63f1e4b8 + .long 0x68fcedb6 + .long 0xb1670a0c + .long 0xba6a0302 + .long 0xa77d1810 + .long 0xac70111e + .long 0x9d532e34 + .long 0x965e273a + .long 0x8b493c28 + .long 0x80443526 + .long 0xe90f427c + .long 0xe2024b72 + .long 0xff155060 + .long 0xf418596e + .long 0xc53b6644 + .long 0xce366f4a + .long 0xd3217458 + .long 0xd82c7d56 + .long 0x7a0ca137 + .long 0x7101a839 + .long 0x6c16b32b + .long 0x671bba25 + .long 0x5638850f + .long 0x5d358c01 + .long 0x40229713 + .long 0x4b2f9e1d + .long 0x2264e947 + .long 0x2969e049 + .long 0x347efb5b + .long 0x3f73f255 + .long 0x0e50cd7f + .long 0x055dc471 + .long 0x184adf63 + .long 0x1347d66d + .long 0xcadc31d7 + .long 0xc1d138d9 + .long 0xdcc623cb + .long 0xd7cb2ac5 + .long 0xe6e815ef + .long 0xede51ce1 + .long 0xf0f207f3 + .long 0xfbff0efd + .long 0x92b479a7 + .long 0x99b970a9 + .long 0x84ae6bbb + .long 0x8fa362b5 + .long 0xbe805d9f + .long 0xb58d5491 + .long 0xa89a4f83 + .long 0xa397468d + // Table 1. + .long 0x00000000 + .long 0x0d090e0b + .long 0x1a121c16 + .long 0x171b121d + .long 0x3424382c + .long 0x392d3627 + .long 0x2e36243a + .long 0x233f2a31 + .long 0x68487058 + .long 0x65417e53 + .long 0x725a6c4e + .long 0x7f536245 + .long 0x5c6c4874 + .long 0x5165467f + .long 0x467e5462 + .long 0x4b775a69 + .long 0xd090e0b0 + .long 0xdd99eebb + .long 0xca82fca6 + .long 0xc78bf2ad + .long 0xe4b4d89c + .long 0xe9bdd697 + .long 0xfea6c48a + .long 0xf3afca81 + .long 0xb8d890e8 + .long 0xb5d19ee3 + .long 0xa2ca8cfe + .long 0xafc382f5 + .long 0x8cfca8c4 + .long 0x81f5a6cf + .long 0x96eeb4d2 + .long 0x9be7bad9 + .long 0xbb3bdb7b + .long 0xb632d570 + .long 0xa129c76d + .long 0xac20c966 + .long 0x8f1fe357 + .long 0x8216ed5c + .long 0x950dff41 + .long 0x9804f14a + .long 0xd373ab23 + .long 0xde7aa528 + .long 0xc961b735 + .long 0xc468b93e + .long 0xe757930f + .long 0xea5e9d04 + .long 0xfd458f19 + .long 0xf04c8112 + .long 0x6bab3bcb + .long 0x66a235c0 + .long 0x71b927dd + .long 0x7cb029d6 + .long 0x5f8f03e7 + .long 0x52860dec + .long 0x459d1ff1 + .long 0x489411fa + .long 0x03e34b93 + .long 0x0eea4598 + .long 0x19f15785 + .long 0x14f8598e + .long 0x37c773bf + .long 0x3ace7db4 + .long 0x2dd56fa9 + .long 0x20dc61a2 + .long 0x6d76adf6 + .long 0x607fa3fd + .long 0x7764b1e0 + .long 0x7a6dbfeb + .long 0x595295da + .long 0x545b9bd1 + .long 0x434089cc + .long 0x4e4987c7 + .long 0x053eddae + .long 0x0837d3a5 + .long 0x1f2cc1b8 + .long 0x1225cfb3 + .long 0x311ae582 + .long 0x3c13eb89 + .long 0x2b08f994 + .long 0x2601f79f + .long 0xbde64d46 + .long 0xb0ef434d + .long 0xa7f45150 + .long 0xaafd5f5b + .long 0x89c2756a + .long 0x84cb7b61 + .long 0x93d0697c + .long 0x9ed96777 + .long 0xd5ae3d1e + .long 0xd8a73315 + .long 0xcfbc2108 + .long 0xc2b52f03 + .long 0xe18a0532 + .long 0xec830b39 + .long 0xfb981924 + .long 0xf691172f + .long 0xd64d768d + .long 0xdb447886 + .long 0xcc5f6a9b + .long 0xc1566490 + .long 0xe2694ea1 + .long 0xef6040aa + .long 0xf87b52b7 + .long 0xf5725cbc + .long 0xbe0506d5 + .long 0xb30c08de + .long 0xa4171ac3 + .long 0xa91e14c8 + .long 0x8a213ef9 + .long 0x872830f2 + .long 0x903322ef + .long 0x9d3a2ce4 + .long 0x06dd963d + .long 0x0bd49836 + .long 0x1ccf8a2b + .long 0x11c68420 + .long 0x32f9ae11 + .long 
0x3ff0a01a + .long 0x28ebb207 + .long 0x25e2bc0c + .long 0x6e95e665 + .long 0x639ce86e + .long 0x7487fa73 + .long 0x798ef478 + .long 0x5ab1de49 + .long 0x57b8d042 + .long 0x40a3c25f + .long 0x4daacc54 + .long 0xdaec41f7 + .long 0xd7e54ffc + .long 0xc0fe5de1 + .long 0xcdf753ea + .long 0xeec879db + .long 0xe3c177d0 + .long 0xf4da65cd + .long 0xf9d36bc6 + .long 0xb2a431af + .long 0xbfad3fa4 + .long 0xa8b62db9 + .long 0xa5bf23b2 + .long 0x86800983 + .long 0x8b890788 + .long 0x9c921595 + .long 0x919b1b9e + .long 0x0a7ca147 + .long 0x0775af4c + .long 0x106ebd51 + .long 0x1d67b35a + .long 0x3e58996b + .long 0x33519760 + .long 0x244a857d + .long 0x29438b76 + .long 0x6234d11f + .long 0x6f3ddf14 + .long 0x7826cd09 + .long 0x752fc302 + .long 0x5610e933 + .long 0x5b19e738 + .long 0x4c02f525 + .long 0x410bfb2e + .long 0x61d79a8c + .long 0x6cde9487 + .long 0x7bc5869a + .long 0x76cc8891 + .long 0x55f3a2a0 + .long 0x58faacab + .long 0x4fe1beb6 + .long 0x42e8b0bd + .long 0x099fead4 + .long 0x0496e4df + .long 0x138df6c2 + .long 0x1e84f8c9 + .long 0x3dbbd2f8 + .long 0x30b2dcf3 + .long 0x27a9ceee + .long 0x2aa0c0e5 + .long 0xb1477a3c + .long 0xbc4e7437 + .long 0xab55662a + .long 0xa65c6821 + .long 0x85634210 + .long 0x886a4c1b + .long 0x9f715e06 + .long 0x9278500d + .long 0xd90f0a64 + .long 0xd406046f + .long 0xc31d1672 + .long 0xce141879 + .long 0xed2b3248 + .long 0xe0223c43 + .long 0xf7392e5e + .long 0xfa302055 + .long 0xb79aec01 + .long 0xba93e20a + .long 0xad88f017 + .long 0xa081fe1c + .long 0x83bed42d + .long 0x8eb7da26 + .long 0x99acc83b + .long 0x94a5c630 + .long 0xdfd29c59 + .long 0xd2db9252 + .long 0xc5c0804f + .long 0xc8c98e44 + .long 0xebf6a475 + .long 0xe6ffaa7e + .long 0xf1e4b863 + .long 0xfcedb668 + .long 0x670a0cb1 + .long 0x6a0302ba + .long 0x7d1810a7 + .long 0x70111eac + .long 0x532e349d + .long 0x5e273a96 + .long 0x493c288b + .long 0x44352680 + .long 0x0f427ce9 + .long 0x024b72e2 + .long 0x155060ff + .long 0x18596ef4 + .long 0x3b6644c5 + .long 0x366f4ace + .long 0x217458d3 + .long 0x2c7d56d8 + .long 0x0ca1377a + .long 0x01a83971 + .long 0x16b32b6c + .long 0x1bba2567 + .long 0x38850f56 + .long 0x358c015d + .long 0x22971340 + .long 0x2f9e1d4b + .long 0x64e94722 + .long 0x69e04929 + .long 0x7efb5b34 + .long 0x73f2553f + .long 0x50cd7f0e + .long 0x5dc47105 + .long 0x4adf6318 + .long 0x47d66d13 + .long 0xdc31d7ca + .long 0xd138d9c1 + .long 0xc623cbdc + .long 0xcb2ac5d7 + .long 0xe815efe6 + .long 0xe51ce1ed + .long 0xf207f3f0 + .long 0xff0efdfb + .long 0xb479a792 + .long 0xb970a999 + .long 0xae6bbb84 + .long 0xa362b58f + .long 0x805d9fbe + .long 0x8d5491b5 + .long 0x9a4f83a8 + .long 0x97468da3 + // Table 2. 
+ .long 0x00000000 + .long 0x090e0b0d + .long 0x121c161a + .long 0x1b121d17 + .long 0x24382c34 + .long 0x2d362739 + .long 0x36243a2e + .long 0x3f2a3123 + .long 0x48705868 + .long 0x417e5365 + .long 0x5a6c4e72 + .long 0x5362457f + .long 0x6c48745c + .long 0x65467f51 + .long 0x7e546246 + .long 0x775a694b + .long 0x90e0b0d0 + .long 0x99eebbdd + .long 0x82fca6ca + .long 0x8bf2adc7 + .long 0xb4d89ce4 + .long 0xbdd697e9 + .long 0xa6c48afe + .long 0xafca81f3 + .long 0xd890e8b8 + .long 0xd19ee3b5 + .long 0xca8cfea2 + .long 0xc382f5af + .long 0xfca8c48c + .long 0xf5a6cf81 + .long 0xeeb4d296 + .long 0xe7bad99b + .long 0x3bdb7bbb + .long 0x32d570b6 + .long 0x29c76da1 + .long 0x20c966ac + .long 0x1fe3578f + .long 0x16ed5c82 + .long 0x0dff4195 + .long 0x04f14a98 + .long 0x73ab23d3 + .long 0x7aa528de + .long 0x61b735c9 + .long 0x68b93ec4 + .long 0x57930fe7 + .long 0x5e9d04ea + .long 0x458f19fd + .long 0x4c8112f0 + .long 0xab3bcb6b + .long 0xa235c066 + .long 0xb927dd71 + .long 0xb029d67c + .long 0x8f03e75f + .long 0x860dec52 + .long 0x9d1ff145 + .long 0x9411fa48 + .long 0xe34b9303 + .long 0xea45980e + .long 0xf1578519 + .long 0xf8598e14 + .long 0xc773bf37 + .long 0xce7db43a + .long 0xd56fa92d + .long 0xdc61a220 + .long 0x76adf66d + .long 0x7fa3fd60 + .long 0x64b1e077 + .long 0x6dbfeb7a + .long 0x5295da59 + .long 0x5b9bd154 + .long 0x4089cc43 + .long 0x4987c74e + .long 0x3eddae05 + .long 0x37d3a508 + .long 0x2cc1b81f + .long 0x25cfb312 + .long 0x1ae58231 + .long 0x13eb893c + .long 0x08f9942b + .long 0x01f79f26 + .long 0xe64d46bd + .long 0xef434db0 + .long 0xf45150a7 + .long 0xfd5f5baa + .long 0xc2756a89 + .long 0xcb7b6184 + .long 0xd0697c93 + .long 0xd967779e + .long 0xae3d1ed5 + .long 0xa73315d8 + .long 0xbc2108cf + .long 0xb52f03c2 + .long 0x8a0532e1 + .long 0x830b39ec + .long 0x981924fb + .long 0x91172ff6 + .long 0x4d768dd6 + .long 0x447886db + .long 0x5f6a9bcc + .long 0x566490c1 + .long 0x694ea1e2 + .long 0x6040aaef + .long 0x7b52b7f8 + .long 0x725cbcf5 + .long 0x0506d5be + .long 0x0c08deb3 + .long 0x171ac3a4 + .long 0x1e14c8a9 + .long 0x213ef98a + .long 0x2830f287 + .long 0x3322ef90 + .long 0x3a2ce49d + .long 0xdd963d06 + .long 0xd498360b + .long 0xcf8a2b1c + .long 0xc6842011 + .long 0xf9ae1132 + .long 0xf0a01a3f + .long 0xebb20728 + .long 0xe2bc0c25 + .long 0x95e6656e + .long 0x9ce86e63 + .long 0x87fa7374 + .long 0x8ef47879 + .long 0xb1de495a + .long 0xb8d04257 + .long 0xa3c25f40 + .long 0xaacc544d + .long 0xec41f7da + .long 0xe54ffcd7 + .long 0xfe5de1c0 + .long 0xf753eacd + .long 0xc879dbee + .long 0xc177d0e3 + .long 0xda65cdf4 + .long 0xd36bc6f9 + .long 0xa431afb2 + .long 0xad3fa4bf + .long 0xb62db9a8 + .long 0xbf23b2a5 + .long 0x80098386 + .long 0x8907888b + .long 0x9215959c + .long 0x9b1b9e91 + .long 0x7ca1470a + .long 0x75af4c07 + .long 0x6ebd5110 + .long 0x67b35a1d + .long 0x58996b3e + .long 0x51976033 + .long 0x4a857d24 + .long 0x438b7629 + .long 0x34d11f62 + .long 0x3ddf146f + .long 0x26cd0978 + .long 0x2fc30275 + .long 0x10e93356 + .long 0x19e7385b + .long 0x02f5254c + .long 0x0bfb2e41 + .long 0xd79a8c61 + .long 0xde94876c + .long 0xc5869a7b + .long 0xcc889176 + .long 0xf3a2a055 + .long 0xfaacab58 + .long 0xe1beb64f + .long 0xe8b0bd42 + .long 0x9fead409 + .long 0x96e4df04 + .long 0x8df6c213 + .long 0x84f8c91e + .long 0xbbd2f83d + .long 0xb2dcf330 + .long 0xa9ceee27 + .long 0xa0c0e52a + .long 0x477a3cb1 + .long 0x4e7437bc + .long 0x55662aab + .long 0x5c6821a6 + .long 0x63421085 + .long 0x6a4c1b88 + .long 0x715e069f + .long 0x78500d92 + .long 0x0f0a64d9 + .long 0x06046fd4 + .long 0x1d1672c3 + 
.long 0x141879ce + .long 0x2b3248ed + .long 0x223c43e0 + .long 0x392e5ef7 + .long 0x302055fa + .long 0x9aec01b7 + .long 0x93e20aba + .long 0x88f017ad + .long 0x81fe1ca0 + .long 0xbed42d83 + .long 0xb7da268e + .long 0xacc83b99 + .long 0xa5c63094 + .long 0xd29c59df + .long 0xdb9252d2 + .long 0xc0804fc5 + .long 0xc98e44c8 + .long 0xf6a475eb + .long 0xffaa7ee6 + .long 0xe4b863f1 + .long 0xedb668fc + .long 0x0a0cb167 + .long 0x0302ba6a + .long 0x1810a77d + .long 0x111eac70 + .long 0x2e349d53 + .long 0x273a965e + .long 0x3c288b49 + .long 0x35268044 + .long 0x427ce90f + .long 0x4b72e202 + .long 0x5060ff15 + .long 0x596ef418 + .long 0x6644c53b + .long 0x6f4ace36 + .long 0x7458d321 + .long 0x7d56d82c + .long 0xa1377a0c + .long 0xa8397101 + .long 0xb32b6c16 + .long 0xba25671b + .long 0x850f5638 + .long 0x8c015d35 + .long 0x97134022 + .long 0x9e1d4b2f + .long 0xe9472264 + .long 0xe0492969 + .long 0xfb5b347e + .long 0xf2553f73 + .long 0xcd7f0e50 + .long 0xc471055d + .long 0xdf63184a + .long 0xd66d1347 + .long 0x31d7cadc + .long 0x38d9c1d1 + .long 0x23cbdcc6 + .long 0x2ac5d7cb + .long 0x15efe6e8 + .long 0x1ce1ede5 + .long 0x07f3f0f2 + .long 0x0efdfbff + .long 0x79a792b4 + .long 0x70a999b9 + .long 0x6bbb84ae + .long 0x62b58fa3 + .long 0x5d9fbe80 + .long 0x5491b58d + .long 0x4f83a89a + .long 0x468da397 + // Table 3. + .long 0x00000000 + .long 0x0e0b0d09 + .long 0x1c161a12 + .long 0x121d171b + .long 0x382c3424 + .long 0x3627392d + .long 0x243a2e36 + .long 0x2a31233f + .long 0x70586848 + .long 0x7e536541 + .long 0x6c4e725a + .long 0x62457f53 + .long 0x48745c6c + .long 0x467f5165 + .long 0x5462467e + .long 0x5a694b77 + .long 0xe0b0d090 + .long 0xeebbdd99 + .long 0xfca6ca82 + .long 0xf2adc78b + .long 0xd89ce4b4 + .long 0xd697e9bd + .long 0xc48afea6 + .long 0xca81f3af + .long 0x90e8b8d8 + .long 0x9ee3b5d1 + .long 0x8cfea2ca + .long 0x82f5afc3 + .long 0xa8c48cfc + .long 0xa6cf81f5 + .long 0xb4d296ee + .long 0xbad99be7 + .long 0xdb7bbb3b + .long 0xd570b632 + .long 0xc76da129 + .long 0xc966ac20 + .long 0xe3578f1f + .long 0xed5c8216 + .long 0xff41950d + .long 0xf14a9804 + .long 0xab23d373 + .long 0xa528de7a + .long 0xb735c961 + .long 0xb93ec468 + .long 0x930fe757 + .long 0x9d04ea5e + .long 0x8f19fd45 + .long 0x8112f04c + .long 0x3bcb6bab + .long 0x35c066a2 + .long 0x27dd71b9 + .long 0x29d67cb0 + .long 0x03e75f8f + .long 0x0dec5286 + .long 0x1ff1459d + .long 0x11fa4894 + .long 0x4b9303e3 + .long 0x45980eea + .long 0x578519f1 + .long 0x598e14f8 + .long 0x73bf37c7 + .long 0x7db43ace + .long 0x6fa92dd5 + .long 0x61a220dc + .long 0xadf66d76 + .long 0xa3fd607f + .long 0xb1e07764 + .long 0xbfeb7a6d + .long 0x95da5952 + .long 0x9bd1545b + .long 0x89cc4340 + .long 0x87c74e49 + .long 0xddae053e + .long 0xd3a50837 + .long 0xc1b81f2c + .long 0xcfb31225 + .long 0xe582311a + .long 0xeb893c13 + .long 0xf9942b08 + .long 0xf79f2601 + .long 0x4d46bde6 + .long 0x434db0ef + .long 0x5150a7f4 + .long 0x5f5baafd + .long 0x756a89c2 + .long 0x7b6184cb + .long 0x697c93d0 + .long 0x67779ed9 + .long 0x3d1ed5ae + .long 0x3315d8a7 + .long 0x2108cfbc + .long 0x2f03c2b5 + .long 0x0532e18a + .long 0x0b39ec83 + .long 0x1924fb98 + .long 0x172ff691 + .long 0x768dd64d + .long 0x7886db44 + .long 0x6a9bcc5f + .long 0x6490c156 + .long 0x4ea1e269 + .long 0x40aaef60 + .long 0x52b7f87b + .long 0x5cbcf572 + .long 0x06d5be05 + .long 0x08deb30c + .long 0x1ac3a417 + .long 0x14c8a91e + .long 0x3ef98a21 + .long 0x30f28728 + .long 0x22ef9033 + .long 0x2ce49d3a + .long 0x963d06dd + .long 0x98360bd4 + .long 0x8a2b1ccf + .long 0x842011c6 + .long 0xae1132f9 + .long 
0xa01a3ff0 + .long 0xb20728eb + .long 0xbc0c25e2 + .long 0xe6656e95 + .long 0xe86e639c + .long 0xfa737487 + .long 0xf478798e + .long 0xde495ab1 + .long 0xd04257b8 + .long 0xc25f40a3 + .long 0xcc544daa + .long 0x41f7daec + .long 0x4ffcd7e5 + .long 0x5de1c0fe + .long 0x53eacdf7 + .long 0x79dbeec8 + .long 0x77d0e3c1 + .long 0x65cdf4da + .long 0x6bc6f9d3 + .long 0x31afb2a4 + .long 0x3fa4bfad + .long 0x2db9a8b6 + .long 0x23b2a5bf + .long 0x09838680 + .long 0x07888b89 + .long 0x15959c92 + .long 0x1b9e919b + .long 0xa1470a7c + .long 0xaf4c0775 + .long 0xbd51106e + .long 0xb35a1d67 + .long 0x996b3e58 + .long 0x97603351 + .long 0x857d244a + .long 0x8b762943 + .long 0xd11f6234 + .long 0xdf146f3d + .long 0xcd097826 + .long 0xc302752f + .long 0xe9335610 + .long 0xe7385b19 + .long 0xf5254c02 + .long 0xfb2e410b + .long 0x9a8c61d7 + .long 0x94876cde + .long 0x869a7bc5 + .long 0x889176cc + .long 0xa2a055f3 + .long 0xacab58fa + .long 0xbeb64fe1 + .long 0xb0bd42e8 + .long 0xead4099f + .long 0xe4df0496 + .long 0xf6c2138d + .long 0xf8c91e84 + .long 0xd2f83dbb + .long 0xdcf330b2 + .long 0xceee27a9 + .long 0xc0e52aa0 + .long 0x7a3cb147 + .long 0x7437bc4e + .long 0x662aab55 + .long 0x6821a65c + .long 0x42108563 + .long 0x4c1b886a + .long 0x5e069f71 + .long 0x500d9278 + .long 0x0a64d90f + .long 0x046fd406 + .long 0x1672c31d + .long 0x1879ce14 + .long 0x3248ed2b + .long 0x3c43e022 + .long 0x2e5ef739 + .long 0x2055fa30 + .long 0xec01b79a + .long 0xe20aba93 + .long 0xf017ad88 + .long 0xfe1ca081 + .long 0xd42d83be + .long 0xda268eb7 + .long 0xc83b99ac + .long 0xc63094a5 + .long 0x9c59dfd2 + .long 0x9252d2db + .long 0x804fc5c0 + .long 0x8e44c8c9 + .long 0xa475ebf6 + .long 0xaa7ee6ff + .long 0xb863f1e4 + .long 0xb668fced + .long 0x0cb1670a + .long 0x02ba6a03 + .long 0x10a77d18 + .long 0x1eac7011 + .long 0x349d532e + .long 0x3a965e27 + .long 0x288b493c + .long 0x26804435 + .long 0x7ce90f42 + .long 0x72e2024b + .long 0x60ff1550 + .long 0x6ef41859 + .long 0x44c53b66 + .long 0x4ace366f + .long 0x58d32174 + .long 0x56d82c7d + .long 0x377a0ca1 + .long 0x397101a8 + .long 0x2b6c16b3 + .long 0x25671bba + .long 0x0f563885 + .long 0x015d358c + .long 0x13402297 + .long 0x1d4b2f9e + .long 0x472264e9 + .long 0x492969e0 + .long 0x5b347efb + .long 0x553f73f2 + .long 0x7f0e50cd + .long 0x71055dc4 + .long 0x63184adf + .long 0x6d1347d6 + .long 0xd7cadc31 + .long 0xd9c1d138 + .long 0xcbdcc623 + .long 0xc5d7cb2a + .long 0xefe6e815 + .long 0xe1ede51c + .long 0xf3f0f207 + .long 0xfdfbff0e + .long 0xa792b479 + .long 0xa999b970 + .long 0xbb84ae6b + .long 0xb58fa362 + .long 0x9fbe805d + .long 0x91b58d54 + .long 0x83a89a4f + .long 0x8da39746 + + +// Tables for main encryption iterations. + .globl _AESEncryptTable + .private_extern _AESEncryptTable + .align 2 +_AESEncryptTable: + // Table 0. 
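+    // Entry i combines SubBytes and MixColumn:  its little-endian bytes are
+    // (2*S[i], S[i], S[i], 3*S[i]) in GF(2^8); e.g., S[0] = 0x63, so entry 0
+    // is 0xa56363c6.  Tables 1-3 are byte rotations of Table 0.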
+ .long 0xa56363c6 + .long 0x847c7cf8 + .long 0x997777ee + .long 0x8d7b7bf6 + .long 0x0df2f2ff + .long 0xbd6b6bd6 + .long 0xb16f6fde + .long 0x54c5c591 + .long 0x50303060 + .long 0x03010102 + .long 0xa96767ce + .long 0x7d2b2b56 + .long 0x19fefee7 + .long 0x62d7d7b5 + .long 0xe6abab4d + .long 0x9a7676ec + .long 0x45caca8f + .long 0x9d82821f + .long 0x40c9c989 + .long 0x877d7dfa + .long 0x15fafaef + .long 0xeb5959b2 + .long 0xc947478e + .long 0x0bf0f0fb + .long 0xecadad41 + .long 0x67d4d4b3 + .long 0xfda2a25f + .long 0xeaafaf45 + .long 0xbf9c9c23 + .long 0xf7a4a453 + .long 0x967272e4 + .long 0x5bc0c09b + .long 0xc2b7b775 + .long 0x1cfdfde1 + .long 0xae93933d + .long 0x6a26264c + .long 0x5a36366c + .long 0x413f3f7e + .long 0x02f7f7f5 + .long 0x4fcccc83 + .long 0x5c343468 + .long 0xf4a5a551 + .long 0x34e5e5d1 + .long 0x08f1f1f9 + .long 0x937171e2 + .long 0x73d8d8ab + .long 0x53313162 + .long 0x3f15152a + .long 0x0c040408 + .long 0x52c7c795 + .long 0x65232346 + .long 0x5ec3c39d + .long 0x28181830 + .long 0xa1969637 + .long 0x0f05050a + .long 0xb59a9a2f + .long 0x0907070e + .long 0x36121224 + .long 0x9b80801b + .long 0x3de2e2df + .long 0x26ebebcd + .long 0x6927274e + .long 0xcdb2b27f + .long 0x9f7575ea + .long 0x1b090912 + .long 0x9e83831d + .long 0x742c2c58 + .long 0x2e1a1a34 + .long 0x2d1b1b36 + .long 0xb26e6edc + .long 0xee5a5ab4 + .long 0xfba0a05b + .long 0xf65252a4 + .long 0x4d3b3b76 + .long 0x61d6d6b7 + .long 0xceb3b37d + .long 0x7b292952 + .long 0x3ee3e3dd + .long 0x712f2f5e + .long 0x97848413 + .long 0xf55353a6 + .long 0x68d1d1b9 + .long 0x00000000 + .long 0x2cededc1 + .long 0x60202040 + .long 0x1ffcfce3 + .long 0xc8b1b179 + .long 0xed5b5bb6 + .long 0xbe6a6ad4 + .long 0x46cbcb8d + .long 0xd9bebe67 + .long 0x4b393972 + .long 0xde4a4a94 + .long 0xd44c4c98 + .long 0xe85858b0 + .long 0x4acfcf85 + .long 0x6bd0d0bb + .long 0x2aefefc5 + .long 0xe5aaaa4f + .long 0x16fbfbed + .long 0xc5434386 + .long 0xd74d4d9a + .long 0x55333366 + .long 0x94858511 + .long 0xcf45458a + .long 0x10f9f9e9 + .long 0x06020204 + .long 0x817f7ffe + .long 0xf05050a0 + .long 0x443c3c78 + .long 0xba9f9f25 + .long 0xe3a8a84b + .long 0xf35151a2 + .long 0xfea3a35d + .long 0xc0404080 + .long 0x8a8f8f05 + .long 0xad92923f + .long 0xbc9d9d21 + .long 0x48383870 + .long 0x04f5f5f1 + .long 0xdfbcbc63 + .long 0xc1b6b677 + .long 0x75dadaaf + .long 0x63212142 + .long 0x30101020 + .long 0x1affffe5 + .long 0x0ef3f3fd + .long 0x6dd2d2bf + .long 0x4ccdcd81 + .long 0x140c0c18 + .long 0x35131326 + .long 0x2fececc3 + .long 0xe15f5fbe + .long 0xa2979735 + .long 0xcc444488 + .long 0x3917172e + .long 0x57c4c493 + .long 0xf2a7a755 + .long 0x827e7efc + .long 0x473d3d7a + .long 0xac6464c8 + .long 0xe75d5dba + .long 0x2b191932 + .long 0x957373e6 + .long 0xa06060c0 + .long 0x98818119 + .long 0xd14f4f9e + .long 0x7fdcdca3 + .long 0x66222244 + .long 0x7e2a2a54 + .long 0xab90903b + .long 0x8388880b + .long 0xca46468c + .long 0x29eeeec7 + .long 0xd3b8b86b + .long 0x3c141428 + .long 0x79dedea7 + .long 0xe25e5ebc + .long 0x1d0b0b16 + .long 0x76dbdbad + .long 0x3be0e0db + .long 0x56323264 + .long 0x4e3a3a74 + .long 0x1e0a0a14 + .long 0xdb494992 + .long 0x0a06060c + .long 0x6c242448 + .long 0xe45c5cb8 + .long 0x5dc2c29f + .long 0x6ed3d3bd + .long 0xefacac43 + .long 0xa66262c4 + .long 0xa8919139 + .long 0xa4959531 + .long 0x37e4e4d3 + .long 0x8b7979f2 + .long 0x32e7e7d5 + .long 0x43c8c88b + .long 0x5937376e + .long 0xb76d6dda + .long 0x8c8d8d01 + .long 0x64d5d5b1 + .long 0xd24e4e9c + .long 0xe0a9a949 + .long 0xb46c6cd8 + .long 0xfa5656ac + .long 0x07f4f4f3 + 
.long 0x25eaeacf + .long 0xaf6565ca + .long 0x8e7a7af4 + .long 0xe9aeae47 + .long 0x18080810 + .long 0xd5baba6f + .long 0x887878f0 + .long 0x6f25254a + .long 0x722e2e5c + .long 0x241c1c38 + .long 0xf1a6a657 + .long 0xc7b4b473 + .long 0x51c6c697 + .long 0x23e8e8cb + .long 0x7cdddda1 + .long 0x9c7474e8 + .long 0x211f1f3e + .long 0xdd4b4b96 + .long 0xdcbdbd61 + .long 0x868b8b0d + .long 0x858a8a0f + .long 0x907070e0 + .long 0x423e3e7c + .long 0xc4b5b571 + .long 0xaa6666cc + .long 0xd8484890 + .long 0x05030306 + .long 0x01f6f6f7 + .long 0x120e0e1c + .long 0xa36161c2 + .long 0x5f35356a + .long 0xf95757ae + .long 0xd0b9b969 + .long 0x91868617 + .long 0x58c1c199 + .long 0x271d1d3a + .long 0xb99e9e27 + .long 0x38e1e1d9 + .long 0x13f8f8eb + .long 0xb398982b + .long 0x33111122 + .long 0xbb6969d2 + .long 0x70d9d9a9 + .long 0x898e8e07 + .long 0xa7949433 + .long 0xb69b9b2d + .long 0x221e1e3c + .long 0x92878715 + .long 0x20e9e9c9 + .long 0x49cece87 + .long 0xff5555aa + .long 0x78282850 + .long 0x7adfdfa5 + .long 0x8f8c8c03 + .long 0xf8a1a159 + .long 0x80898909 + .long 0x170d0d1a + .long 0xdabfbf65 + .long 0x31e6e6d7 + .long 0xc6424284 + .long 0xb86868d0 + .long 0xc3414182 + .long 0xb0999929 + .long 0x772d2d5a + .long 0x110f0f1e + .long 0xcbb0b07b + .long 0xfc5454a8 + .long 0xd6bbbb6d + .long 0x3a16162c + // Table 1. + .long 0x6363c6a5 + .long 0x7c7cf884 + .long 0x7777ee99 + .long 0x7b7bf68d + .long 0xf2f2ff0d + .long 0x6b6bd6bd + .long 0x6f6fdeb1 + .long 0xc5c59154 + .long 0x30306050 + .long 0x01010203 + .long 0x6767cea9 + .long 0x2b2b567d + .long 0xfefee719 + .long 0xd7d7b562 + .long 0xabab4de6 + .long 0x7676ec9a + .long 0xcaca8f45 + .long 0x82821f9d + .long 0xc9c98940 + .long 0x7d7dfa87 + .long 0xfafaef15 + .long 0x5959b2eb + .long 0x47478ec9 + .long 0xf0f0fb0b + .long 0xadad41ec + .long 0xd4d4b367 + .long 0xa2a25ffd + .long 0xafaf45ea + .long 0x9c9c23bf + .long 0xa4a453f7 + .long 0x7272e496 + .long 0xc0c09b5b + .long 0xb7b775c2 + .long 0xfdfde11c + .long 0x93933dae + .long 0x26264c6a + .long 0x36366c5a + .long 0x3f3f7e41 + .long 0xf7f7f502 + .long 0xcccc834f + .long 0x3434685c + .long 0xa5a551f4 + .long 0xe5e5d134 + .long 0xf1f1f908 + .long 0x7171e293 + .long 0xd8d8ab73 + .long 0x31316253 + .long 0x15152a3f + .long 0x0404080c + .long 0xc7c79552 + .long 0x23234665 + .long 0xc3c39d5e + .long 0x18183028 + .long 0x969637a1 + .long 0x05050a0f + .long 0x9a9a2fb5 + .long 0x07070e09 + .long 0x12122436 + .long 0x80801b9b + .long 0xe2e2df3d + .long 0xebebcd26 + .long 0x27274e69 + .long 0xb2b27fcd + .long 0x7575ea9f + .long 0x0909121b + .long 0x83831d9e + .long 0x2c2c5874 + .long 0x1a1a342e + .long 0x1b1b362d + .long 0x6e6edcb2 + .long 0x5a5ab4ee + .long 0xa0a05bfb + .long 0x5252a4f6 + .long 0x3b3b764d + .long 0xd6d6b761 + .long 0xb3b37dce + .long 0x2929527b + .long 0xe3e3dd3e + .long 0x2f2f5e71 + .long 0x84841397 + .long 0x5353a6f5 + .long 0xd1d1b968 + .long 0x00000000 + .long 0xededc12c + .long 0x20204060 + .long 0xfcfce31f + .long 0xb1b179c8 + .long 0x5b5bb6ed + .long 0x6a6ad4be + .long 0xcbcb8d46 + .long 0xbebe67d9 + .long 0x3939724b + .long 0x4a4a94de + .long 0x4c4c98d4 + .long 0x5858b0e8 + .long 0xcfcf854a + .long 0xd0d0bb6b + .long 0xefefc52a + .long 0xaaaa4fe5 + .long 0xfbfbed16 + .long 0x434386c5 + .long 0x4d4d9ad7 + .long 0x33336655 + .long 0x85851194 + .long 0x45458acf + .long 0xf9f9e910 + .long 0x02020406 + .long 0x7f7ffe81 + .long 0x5050a0f0 + .long 0x3c3c7844 + .long 0x9f9f25ba + .long 0xa8a84be3 + .long 0x5151a2f3 + .long 0xa3a35dfe + .long 0x404080c0 + .long 0x8f8f058a + .long 0x92923fad + .long 
0x9d9d21bc + .long 0x38387048 + .long 0xf5f5f104 + .long 0xbcbc63df + .long 0xb6b677c1 + .long 0xdadaaf75 + .long 0x21214263 + .long 0x10102030 + .long 0xffffe51a + .long 0xf3f3fd0e + .long 0xd2d2bf6d + .long 0xcdcd814c + .long 0x0c0c1814 + .long 0x13132635 + .long 0xececc32f + .long 0x5f5fbee1 + .long 0x979735a2 + .long 0x444488cc + .long 0x17172e39 + .long 0xc4c49357 + .long 0xa7a755f2 + .long 0x7e7efc82 + .long 0x3d3d7a47 + .long 0x6464c8ac + .long 0x5d5dbae7 + .long 0x1919322b + .long 0x7373e695 + .long 0x6060c0a0 + .long 0x81811998 + .long 0x4f4f9ed1 + .long 0xdcdca37f + .long 0x22224466 + .long 0x2a2a547e + .long 0x90903bab + .long 0x88880b83 + .long 0x46468cca + .long 0xeeeec729 + .long 0xb8b86bd3 + .long 0x1414283c + .long 0xdedea779 + .long 0x5e5ebce2 + .long 0x0b0b161d + .long 0xdbdbad76 + .long 0xe0e0db3b + .long 0x32326456 + .long 0x3a3a744e + .long 0x0a0a141e + .long 0x494992db + .long 0x06060c0a + .long 0x2424486c + .long 0x5c5cb8e4 + .long 0xc2c29f5d + .long 0xd3d3bd6e + .long 0xacac43ef + .long 0x6262c4a6 + .long 0x919139a8 + .long 0x959531a4 + .long 0xe4e4d337 + .long 0x7979f28b + .long 0xe7e7d532 + .long 0xc8c88b43 + .long 0x37376e59 + .long 0x6d6ddab7 + .long 0x8d8d018c + .long 0xd5d5b164 + .long 0x4e4e9cd2 + .long 0xa9a949e0 + .long 0x6c6cd8b4 + .long 0x5656acfa + .long 0xf4f4f307 + .long 0xeaeacf25 + .long 0x6565caaf + .long 0x7a7af48e + .long 0xaeae47e9 + .long 0x08081018 + .long 0xbaba6fd5 + .long 0x7878f088 + .long 0x25254a6f + .long 0x2e2e5c72 + .long 0x1c1c3824 + .long 0xa6a657f1 + .long 0xb4b473c7 + .long 0xc6c69751 + .long 0xe8e8cb23 + .long 0xdddda17c + .long 0x7474e89c + .long 0x1f1f3e21 + .long 0x4b4b96dd + .long 0xbdbd61dc + .long 0x8b8b0d86 + .long 0x8a8a0f85 + .long 0x7070e090 + .long 0x3e3e7c42 + .long 0xb5b571c4 + .long 0x6666ccaa + .long 0x484890d8 + .long 0x03030605 + .long 0xf6f6f701 + .long 0x0e0e1c12 + .long 0x6161c2a3 + .long 0x35356a5f + .long 0x5757aef9 + .long 0xb9b969d0 + .long 0x86861791 + .long 0xc1c19958 + .long 0x1d1d3a27 + .long 0x9e9e27b9 + .long 0xe1e1d938 + .long 0xf8f8eb13 + .long 0x98982bb3 + .long 0x11112233 + .long 0x6969d2bb + .long 0xd9d9a970 + .long 0x8e8e0789 + .long 0x949433a7 + .long 0x9b9b2db6 + .long 0x1e1e3c22 + .long 0x87871592 + .long 0xe9e9c920 + .long 0xcece8749 + .long 0x5555aaff + .long 0x28285078 + .long 0xdfdfa57a + .long 0x8c8c038f + .long 0xa1a159f8 + .long 0x89890980 + .long 0x0d0d1a17 + .long 0xbfbf65da + .long 0xe6e6d731 + .long 0x424284c6 + .long 0x6868d0b8 + .long 0x414182c3 + .long 0x999929b0 + .long 0x2d2d5a77 + .long 0x0f0f1e11 + .long 0xb0b07bcb + .long 0x5454a8fc + .long 0xbbbb6dd6 + .long 0x16162c3a + // Table 2. 
+ .long 0x63c6a563 + .long 0x7cf8847c + .long 0x77ee9977 + .long 0x7bf68d7b + .long 0xf2ff0df2 + .long 0x6bd6bd6b + .long 0x6fdeb16f + .long 0xc59154c5 + .long 0x30605030 + .long 0x01020301 + .long 0x67cea967 + .long 0x2b567d2b + .long 0xfee719fe + .long 0xd7b562d7 + .long 0xab4de6ab + .long 0x76ec9a76 + .long 0xca8f45ca + .long 0x821f9d82 + .long 0xc98940c9 + .long 0x7dfa877d + .long 0xfaef15fa + .long 0x59b2eb59 + .long 0x478ec947 + .long 0xf0fb0bf0 + .long 0xad41ecad + .long 0xd4b367d4 + .long 0xa25ffda2 + .long 0xaf45eaaf + .long 0x9c23bf9c + .long 0xa453f7a4 + .long 0x72e49672 + .long 0xc09b5bc0 + .long 0xb775c2b7 + .long 0xfde11cfd + .long 0x933dae93 + .long 0x264c6a26 + .long 0x366c5a36 + .long 0x3f7e413f + .long 0xf7f502f7 + .long 0xcc834fcc + .long 0x34685c34 + .long 0xa551f4a5 + .long 0xe5d134e5 + .long 0xf1f908f1 + .long 0x71e29371 + .long 0xd8ab73d8 + .long 0x31625331 + .long 0x152a3f15 + .long 0x04080c04 + .long 0xc79552c7 + .long 0x23466523 + .long 0xc39d5ec3 + .long 0x18302818 + .long 0x9637a196 + .long 0x050a0f05 + .long 0x9a2fb59a + .long 0x070e0907 + .long 0x12243612 + .long 0x801b9b80 + .long 0xe2df3de2 + .long 0xebcd26eb + .long 0x274e6927 + .long 0xb27fcdb2 + .long 0x75ea9f75 + .long 0x09121b09 + .long 0x831d9e83 + .long 0x2c58742c + .long 0x1a342e1a + .long 0x1b362d1b + .long 0x6edcb26e + .long 0x5ab4ee5a + .long 0xa05bfba0 + .long 0x52a4f652 + .long 0x3b764d3b + .long 0xd6b761d6 + .long 0xb37dceb3 + .long 0x29527b29 + .long 0xe3dd3ee3 + .long 0x2f5e712f + .long 0x84139784 + .long 0x53a6f553 + .long 0xd1b968d1 + .long 0x00000000 + .long 0xedc12ced + .long 0x20406020 + .long 0xfce31ffc + .long 0xb179c8b1 + .long 0x5bb6ed5b + .long 0x6ad4be6a + .long 0xcb8d46cb + .long 0xbe67d9be + .long 0x39724b39 + .long 0x4a94de4a + .long 0x4c98d44c + .long 0x58b0e858 + .long 0xcf854acf + .long 0xd0bb6bd0 + .long 0xefc52aef + .long 0xaa4fe5aa + .long 0xfbed16fb + .long 0x4386c543 + .long 0x4d9ad74d + .long 0x33665533 + .long 0x85119485 + .long 0x458acf45 + .long 0xf9e910f9 + .long 0x02040602 + .long 0x7ffe817f + .long 0x50a0f050 + .long 0x3c78443c + .long 0x9f25ba9f + .long 0xa84be3a8 + .long 0x51a2f351 + .long 0xa35dfea3 + .long 0x4080c040 + .long 0x8f058a8f + .long 0x923fad92 + .long 0x9d21bc9d + .long 0x38704838 + .long 0xf5f104f5 + .long 0xbc63dfbc + .long 0xb677c1b6 + .long 0xdaaf75da + .long 0x21426321 + .long 0x10203010 + .long 0xffe51aff + .long 0xf3fd0ef3 + .long 0xd2bf6dd2 + .long 0xcd814ccd + .long 0x0c18140c + .long 0x13263513 + .long 0xecc32fec + .long 0x5fbee15f + .long 0x9735a297 + .long 0x4488cc44 + .long 0x172e3917 + .long 0xc49357c4 + .long 0xa755f2a7 + .long 0x7efc827e + .long 0x3d7a473d + .long 0x64c8ac64 + .long 0x5dbae75d + .long 0x19322b19 + .long 0x73e69573 + .long 0x60c0a060 + .long 0x81199881 + .long 0x4f9ed14f + .long 0xdca37fdc + .long 0x22446622 + .long 0x2a547e2a + .long 0x903bab90 + .long 0x880b8388 + .long 0x468cca46 + .long 0xeec729ee + .long 0xb86bd3b8 + .long 0x14283c14 + .long 0xdea779de + .long 0x5ebce25e + .long 0x0b161d0b + .long 0xdbad76db + .long 0xe0db3be0 + .long 0x32645632 + .long 0x3a744e3a + .long 0x0a141e0a + .long 0x4992db49 + .long 0x060c0a06 + .long 0x24486c24 + .long 0x5cb8e45c + .long 0xc29f5dc2 + .long 0xd3bd6ed3 + .long 0xac43efac + .long 0x62c4a662 + .long 0x9139a891 + .long 0x9531a495 + .long 0xe4d337e4 + .long 0x79f28b79 + .long 0xe7d532e7 + .long 0xc88b43c8 + .long 0x376e5937 + .long 0x6ddab76d + .long 0x8d018c8d + .long 0xd5b164d5 + .long 0x4e9cd24e + .long 0xa949e0a9 + .long 0x6cd8b46c + .long 0x56acfa56 + .long 0xf4f307f4 + 
.long 0xeacf25ea + .long 0x65caaf65 + .long 0x7af48e7a + .long 0xae47e9ae + .long 0x08101808 + .long 0xba6fd5ba + .long 0x78f08878 + .long 0x254a6f25 + .long 0x2e5c722e + .long 0x1c38241c + .long 0xa657f1a6 + .long 0xb473c7b4 + .long 0xc69751c6 + .long 0xe8cb23e8 + .long 0xdda17cdd + .long 0x74e89c74 + .long 0x1f3e211f + .long 0x4b96dd4b + .long 0xbd61dcbd + .long 0x8b0d868b + .long 0x8a0f858a + .long 0x70e09070 + .long 0x3e7c423e + .long 0xb571c4b5 + .long 0x66ccaa66 + .long 0x4890d848 + .long 0x03060503 + .long 0xf6f701f6 + .long 0x0e1c120e + .long 0x61c2a361 + .long 0x356a5f35 + .long 0x57aef957 + .long 0xb969d0b9 + .long 0x86179186 + .long 0xc19958c1 + .long 0x1d3a271d + .long 0x9e27b99e + .long 0xe1d938e1 + .long 0xf8eb13f8 + .long 0x982bb398 + .long 0x11223311 + .long 0x69d2bb69 + .long 0xd9a970d9 + .long 0x8e07898e + .long 0x9433a794 + .long 0x9b2db69b + .long 0x1e3c221e + .long 0x87159287 + .long 0xe9c920e9 + .long 0xce8749ce + .long 0x55aaff55 + .long 0x28507828 + .long 0xdfa57adf + .long 0x8c038f8c + .long 0xa159f8a1 + .long 0x89098089 + .long 0x0d1a170d + .long 0xbf65dabf + .long 0xe6d731e6 + .long 0x4284c642 + .long 0x68d0b868 + .long 0x4182c341 + .long 0x9929b099 + .long 0x2d5a772d + .long 0x0f1e110f + .long 0xb07bcbb0 + .long 0x54a8fc54 + .long 0xbb6dd6bb + .long 0x162c3a16 + // Table 3. + .long 0xc6a56363 + .long 0xf8847c7c + .long 0xee997777 + .long 0xf68d7b7b + .long 0xff0df2f2 + .long 0xd6bd6b6b + .long 0xdeb16f6f + .long 0x9154c5c5 + .long 0x60503030 + .long 0x02030101 + .long 0xcea96767 + .long 0x567d2b2b + .long 0xe719fefe + .long 0xb562d7d7 + .long 0x4de6abab + .long 0xec9a7676 + .long 0x8f45caca + .long 0x1f9d8282 + .long 0x8940c9c9 + .long 0xfa877d7d + .long 0xef15fafa + .long 0xb2eb5959 + .long 0x8ec94747 + .long 0xfb0bf0f0 + .long 0x41ecadad + .long 0xb367d4d4 + .long 0x5ffda2a2 + .long 0x45eaafaf + .long 0x23bf9c9c + .long 0x53f7a4a4 + .long 0xe4967272 + .long 0x9b5bc0c0 + .long 0x75c2b7b7 + .long 0xe11cfdfd + .long 0x3dae9393 + .long 0x4c6a2626 + .long 0x6c5a3636 + .long 0x7e413f3f + .long 0xf502f7f7 + .long 0x834fcccc + .long 0x685c3434 + .long 0x51f4a5a5 + .long 0xd134e5e5 + .long 0xf908f1f1 + .long 0xe2937171 + .long 0xab73d8d8 + .long 0x62533131 + .long 0x2a3f1515 + .long 0x080c0404 + .long 0x9552c7c7 + .long 0x46652323 + .long 0x9d5ec3c3 + .long 0x30281818 + .long 0x37a19696 + .long 0x0a0f0505 + .long 0x2fb59a9a + .long 0x0e090707 + .long 0x24361212 + .long 0x1b9b8080 + .long 0xdf3de2e2 + .long 0xcd26ebeb + .long 0x4e692727 + .long 0x7fcdb2b2 + .long 0xea9f7575 + .long 0x121b0909 + .long 0x1d9e8383 + .long 0x58742c2c + .long 0x342e1a1a + .long 0x362d1b1b + .long 0xdcb26e6e + .long 0xb4ee5a5a + .long 0x5bfba0a0 + .long 0xa4f65252 + .long 0x764d3b3b + .long 0xb761d6d6 + .long 0x7dceb3b3 + .long 0x527b2929 + .long 0xdd3ee3e3 + .long 0x5e712f2f + .long 0x13978484 + .long 0xa6f55353 + .long 0xb968d1d1 + .long 0x00000000 + .long 0xc12ceded + .long 0x40602020 + .long 0xe31ffcfc + .long 0x79c8b1b1 + .long 0xb6ed5b5b + .long 0xd4be6a6a + .long 0x8d46cbcb + .long 0x67d9bebe + .long 0x724b3939 + .long 0x94de4a4a + .long 0x98d44c4c + .long 0xb0e85858 + .long 0x854acfcf + .long 0xbb6bd0d0 + .long 0xc52aefef + .long 0x4fe5aaaa + .long 0xed16fbfb + .long 0x86c54343 + .long 0x9ad74d4d + .long 0x66553333 + .long 0x11948585 + .long 0x8acf4545 + .long 0xe910f9f9 + .long 0x04060202 + .long 0xfe817f7f + .long 0xa0f05050 + .long 0x78443c3c + .long 0x25ba9f9f + .long 0x4be3a8a8 + .long 0xa2f35151 + .long 0x5dfea3a3 + .long 0x80c04040 + .long 0x058a8f8f + .long 0x3fad9292 + .long 
0x21bc9d9d + .long 0x70483838 + .long 0xf104f5f5 + .long 0x63dfbcbc + .long 0x77c1b6b6 + .long 0xaf75dada + .long 0x42632121 + .long 0x20301010 + .long 0xe51affff + .long 0xfd0ef3f3 + .long 0xbf6dd2d2 + .long 0x814ccdcd + .long 0x18140c0c + .long 0x26351313 + .long 0xc32fecec + .long 0xbee15f5f + .long 0x35a29797 + .long 0x88cc4444 + .long 0x2e391717 + .long 0x9357c4c4 + .long 0x55f2a7a7 + .long 0xfc827e7e + .long 0x7a473d3d + .long 0xc8ac6464 + .long 0xbae75d5d + .long 0x322b1919 + .long 0xe6957373 + .long 0xc0a06060 + .long 0x19988181 + .long 0x9ed14f4f + .long 0xa37fdcdc + .long 0x44662222 + .long 0x547e2a2a + .long 0x3bab9090 + .long 0x0b838888 + .long 0x8cca4646 + .long 0xc729eeee + .long 0x6bd3b8b8 + .long 0x283c1414 + .long 0xa779dede + .long 0xbce25e5e + .long 0x161d0b0b + .long 0xad76dbdb + .long 0xdb3be0e0 + .long 0x64563232 + .long 0x744e3a3a + .long 0x141e0a0a + .long 0x92db4949 + .long 0x0c0a0606 + .long 0x486c2424 + .long 0xb8e45c5c + .long 0x9f5dc2c2 + .long 0xbd6ed3d3 + .long 0x43efacac + .long 0xc4a66262 + .long 0x39a89191 + .long 0x31a49595 + .long 0xd337e4e4 + .long 0xf28b7979 + .long 0xd532e7e7 + .long 0x8b43c8c8 + .long 0x6e593737 + .long 0xdab76d6d + .long 0x018c8d8d + .long 0xb164d5d5 + .long 0x9cd24e4e + .long 0x49e0a9a9 + .long 0xd8b46c6c + .long 0xacfa5656 + .long 0xf307f4f4 + .long 0xcf25eaea + .long 0xcaaf6565 + .long 0xf48e7a7a + .long 0x47e9aeae + .long 0x10180808 + .long 0x6fd5baba + .long 0xf0887878 + .long 0x4a6f2525 + .long 0x5c722e2e + .long 0x38241c1c + .long 0x57f1a6a6 + .long 0x73c7b4b4 + .long 0x9751c6c6 + .long 0xcb23e8e8 + .long 0xa17cdddd + .long 0xe89c7474 + .long 0x3e211f1f + .long 0x96dd4b4b + .long 0x61dcbdbd + .long 0x0d868b8b + .long 0x0f858a8a + .long 0xe0907070 + .long 0x7c423e3e + .long 0x71c4b5b5 + .long 0xccaa6666 + .long 0x90d84848 + .long 0x06050303 + .long 0xf701f6f6 + .long 0x1c120e0e + .long 0xc2a36161 + .long 0x6a5f3535 + .long 0xaef95757 + .long 0x69d0b9b9 + .long 0x17918686 + .long 0x9958c1c1 + .long 0x3a271d1d + .long 0x27b99e9e + .long 0xd938e1e1 + .long 0xeb13f8f8 + .long 0x2bb39898 + .long 0x22331111 + .long 0xd2bb6969 + .long 0xa970d9d9 + .long 0x07898e8e + .long 0x33a79494 + .long 0x2db69b9b + .long 0x3c221e1e + .long 0x15928787 + .long 0xc920e9e9 + .long 0x8749cece + .long 0xaaff5555 + .long 0x50782828 + .long 0xa57adfdf + .long 0x038f8c8c + .long 0x59f8a1a1 + .long 0x09808989 + .long 0x1a170d0d + .long 0x65dabfbf + .long 0xd731e6e6 + .long 0x84c64242 + .long 0xd0b86868 + .long 0x82c34141 + .long 0x29b09999 + .long 0x5a772d2d + .long 0x1e110f0f + .long 0x7bcbb0b0 + .long 0xa8fc5454 + .long 0x6dd6bbbb + .long 0x2c3a1616 + + +// Tables for main decryption iterations. + .globl _AESDecryptTable + .private_extern _AESDecryptTable + .align 2 +_AESDecryptTable: + // Table 0. 
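+    // Entry i combines InvSubBytes and InvMixColumn:  it is the InvMixColumn
+    // of the column (InvS[i], 0, 0, 0); e.g., InvS[0] = 0x52, so entry 0 is
+    // 0x50a7f451.  Tables 1-3 are byte rotations of Table 0.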
+ .long 0x50a7f451 + .long 0x5365417e + .long 0xc3a4171a + .long 0x965e273a + .long 0xcb6bab3b + .long 0xf1459d1f + .long 0xab58faac + .long 0x9303e34b + .long 0x55fa3020 + .long 0xf66d76ad + .long 0x9176cc88 + .long 0x254c02f5 + .long 0xfcd7e54f + .long 0xd7cb2ac5 + .long 0x80443526 + .long 0x8fa362b5 + .long 0x495ab1de + .long 0x671bba25 + .long 0x980eea45 + .long 0xe1c0fe5d + .long 0x02752fc3 + .long 0x12f04c81 + .long 0xa397468d + .long 0xc6f9d36b + .long 0xe75f8f03 + .long 0x959c9215 + .long 0xeb7a6dbf + .long 0xda595295 + .long 0x2d83bed4 + .long 0xd3217458 + .long 0x2969e049 + .long 0x44c8c98e + .long 0x6a89c275 + .long 0x78798ef4 + .long 0x6b3e5899 + .long 0xdd71b927 + .long 0xb64fe1be + .long 0x17ad88f0 + .long 0x66ac20c9 + .long 0xb43ace7d + .long 0x184adf63 + .long 0x82311ae5 + .long 0x60335197 + .long 0x457f5362 + .long 0xe07764b1 + .long 0x84ae6bbb + .long 0x1ca081fe + .long 0x942b08f9 + .long 0x58684870 + .long 0x19fd458f + .long 0x876cde94 + .long 0xb7f87b52 + .long 0x23d373ab + .long 0xe2024b72 + .long 0x578f1fe3 + .long 0x2aab5566 + .long 0x0728ebb2 + .long 0x03c2b52f + .long 0x9a7bc586 + .long 0xa50837d3 + .long 0xf2872830 + .long 0xb2a5bf23 + .long 0xba6a0302 + .long 0x5c8216ed + .long 0x2b1ccf8a + .long 0x92b479a7 + .long 0xf0f207f3 + .long 0xa1e2694e + .long 0xcdf4da65 + .long 0xd5be0506 + .long 0x1f6234d1 + .long 0x8afea6c4 + .long 0x9d532e34 + .long 0xa055f3a2 + .long 0x32e18a05 + .long 0x75ebf6a4 + .long 0x39ec830b + .long 0xaaef6040 + .long 0x069f715e + .long 0x51106ebd + .long 0xf98a213e + .long 0x3d06dd96 + .long 0xae053edd + .long 0x46bde64d + .long 0xb58d5491 + .long 0x055dc471 + .long 0x6fd40604 + .long 0xff155060 + .long 0x24fb9819 + .long 0x97e9bdd6 + .long 0xcc434089 + .long 0x779ed967 + .long 0xbd42e8b0 + .long 0x888b8907 + .long 0x385b19e7 + .long 0xdbeec879 + .long 0x470a7ca1 + .long 0xe90f427c + .long 0xc91e84f8 + .long 0x00000000 + .long 0x83868009 + .long 0x48ed2b32 + .long 0xac70111e + .long 0x4e725a6c + .long 0xfbff0efd + .long 0x5638850f + .long 0x1ed5ae3d + .long 0x27392d36 + .long 0x64d90f0a + .long 0x21a65c68 + .long 0xd1545b9b + .long 0x3a2e3624 + .long 0xb1670a0c + .long 0x0fe75793 + .long 0xd296eeb4 + .long 0x9e919b1b + .long 0x4fc5c080 + .long 0xa220dc61 + .long 0x694b775a + .long 0x161a121c + .long 0x0aba93e2 + .long 0xe52aa0c0 + .long 0x43e0223c + .long 0x1d171b12 + .long 0x0b0d090e + .long 0xadc78bf2 + .long 0xb9a8b62d + .long 0xc8a91e14 + .long 0x8519f157 + .long 0x4c0775af + .long 0xbbdd99ee + .long 0xfd607fa3 + .long 0x9f2601f7 + .long 0xbcf5725c + .long 0xc53b6644 + .long 0x347efb5b + .long 0x7629438b + .long 0xdcc623cb + .long 0x68fcedb6 + .long 0x63f1e4b8 + .long 0xcadc31d7 + .long 0x10856342 + .long 0x40229713 + .long 0x2011c684 + .long 0x7d244a85 + .long 0xf83dbbd2 + .long 0x1132f9ae + .long 0x6da129c7 + .long 0x4b2f9e1d + .long 0xf330b2dc + .long 0xec52860d + .long 0xd0e3c177 + .long 0x6c16b32b + .long 0x99b970a9 + .long 0xfa489411 + .long 0x2264e947 + .long 0xc48cfca8 + .long 0x1a3ff0a0 + .long 0xd82c7d56 + .long 0xef903322 + .long 0xc74e4987 + .long 0xc1d138d9 + .long 0xfea2ca8c + .long 0x360bd498 + .long 0xcf81f5a6 + .long 0x28de7aa5 + .long 0x268eb7da + .long 0xa4bfad3f + .long 0xe49d3a2c + .long 0x0d927850 + .long 0x9bcc5f6a + .long 0x62467e54 + .long 0xc2138df6 + .long 0xe8b8d890 + .long 0x5ef7392e + .long 0xf5afc382 + .long 0xbe805d9f + .long 0x7c93d069 + .long 0xa92dd56f + .long 0xb31225cf + .long 0x3b99acc8 + .long 0xa77d1810 + .long 0x6e639ce8 + .long 0x7bbb3bdb + .long 0x097826cd + .long 0xf418596e + .long 0x01b79aec + 
.long 0xa89a4f83 + .long 0x656e95e6 + .long 0x7ee6ffaa + .long 0x08cfbc21 + .long 0xe6e815ef + .long 0xd99be7ba + .long 0xce366f4a + .long 0xd4099fea + .long 0xd67cb029 + .long 0xafb2a431 + .long 0x31233f2a + .long 0x3094a5c6 + .long 0xc066a235 + .long 0x37bc4e74 + .long 0xa6ca82fc + .long 0xb0d090e0 + .long 0x15d8a733 + .long 0x4a9804f1 + .long 0xf7daec41 + .long 0x0e50cd7f + .long 0x2ff69117 + .long 0x8dd64d76 + .long 0x4db0ef43 + .long 0x544daacc + .long 0xdf0496e4 + .long 0xe3b5d19e + .long 0x1b886a4c + .long 0xb81f2cc1 + .long 0x7f516546 + .long 0x04ea5e9d + .long 0x5d358c01 + .long 0x737487fa + .long 0x2e410bfb + .long 0x5a1d67b3 + .long 0x52d2db92 + .long 0x335610e9 + .long 0x1347d66d + .long 0x8c61d79a + .long 0x7a0ca137 + .long 0x8e14f859 + .long 0x893c13eb + .long 0xee27a9ce + .long 0x35c961b7 + .long 0xede51ce1 + .long 0x3cb1477a + .long 0x59dfd29c + .long 0x3f73f255 + .long 0x79ce1418 + .long 0xbf37c773 + .long 0xeacdf753 + .long 0x5baafd5f + .long 0x146f3ddf + .long 0x86db4478 + .long 0x81f3afca + .long 0x3ec468b9 + .long 0x2c342438 + .long 0x5f40a3c2 + .long 0x72c31d16 + .long 0x0c25e2bc + .long 0x8b493c28 + .long 0x41950dff + .long 0x7101a839 + .long 0xdeb30c08 + .long 0x9ce4b4d8 + .long 0x90c15664 + .long 0x6184cb7b + .long 0x70b632d5 + .long 0x745c6c48 + .long 0x4257b8d0 + // Table 1. + .long 0xa7f45150 + .long 0x65417e53 + .long 0xa4171ac3 + .long 0x5e273a96 + .long 0x6bab3bcb + .long 0x459d1ff1 + .long 0x58faacab + .long 0x03e34b93 + .long 0xfa302055 + .long 0x6d76adf6 + .long 0x76cc8891 + .long 0x4c02f525 + .long 0xd7e54ffc + .long 0xcb2ac5d7 + .long 0x44352680 + .long 0xa362b58f + .long 0x5ab1de49 + .long 0x1bba2567 + .long 0x0eea4598 + .long 0xc0fe5de1 + .long 0x752fc302 + .long 0xf04c8112 + .long 0x97468da3 + .long 0xf9d36bc6 + .long 0x5f8f03e7 + .long 0x9c921595 + .long 0x7a6dbfeb + .long 0x595295da + .long 0x83bed42d + .long 0x217458d3 + .long 0x69e04929 + .long 0xc8c98e44 + .long 0x89c2756a + .long 0x798ef478 + .long 0x3e58996b + .long 0x71b927dd + .long 0x4fe1beb6 + .long 0xad88f017 + .long 0xac20c966 + .long 0x3ace7db4 + .long 0x4adf6318 + .long 0x311ae582 + .long 0x33519760 + .long 0x7f536245 + .long 0x7764b1e0 + .long 0xae6bbb84 + .long 0xa081fe1c + .long 0x2b08f994 + .long 0x68487058 + .long 0xfd458f19 + .long 0x6cde9487 + .long 0xf87b52b7 + .long 0xd373ab23 + .long 0x024b72e2 + .long 0x8f1fe357 + .long 0xab55662a + .long 0x28ebb207 + .long 0xc2b52f03 + .long 0x7bc5869a + .long 0x0837d3a5 + .long 0x872830f2 + .long 0xa5bf23b2 + .long 0x6a0302ba + .long 0x8216ed5c + .long 0x1ccf8a2b + .long 0xb479a792 + .long 0xf207f3f0 + .long 0xe2694ea1 + .long 0xf4da65cd + .long 0xbe0506d5 + .long 0x6234d11f + .long 0xfea6c48a + .long 0x532e349d + .long 0x55f3a2a0 + .long 0xe18a0532 + .long 0xebf6a475 + .long 0xec830b39 + .long 0xef6040aa + .long 0x9f715e06 + .long 0x106ebd51 + .long 0x8a213ef9 + .long 0x06dd963d + .long 0x053eddae + .long 0xbde64d46 + .long 0x8d5491b5 + .long 0x5dc47105 + .long 0xd406046f + .long 0x155060ff + .long 0xfb981924 + .long 0xe9bdd697 + .long 0x434089cc + .long 0x9ed96777 + .long 0x42e8b0bd + .long 0x8b890788 + .long 0x5b19e738 + .long 0xeec879db + .long 0x0a7ca147 + .long 0x0f427ce9 + .long 0x1e84f8c9 + .long 0x00000000 + .long 0x86800983 + .long 0xed2b3248 + .long 0x70111eac + .long 0x725a6c4e + .long 0xff0efdfb + .long 0x38850f56 + .long 0xd5ae3d1e + .long 0x392d3627 + .long 0xd90f0a64 + .long 0xa65c6821 + .long 0x545b9bd1 + .long 0x2e36243a + .long 0x670a0cb1 + .long 0xe757930f + .long 0x96eeb4d2 + .long 0x919b1b9e + .long 0xc5c0804f + .long 
0x20dc61a2 + .long 0x4b775a69 + .long 0x1a121c16 + .long 0xba93e20a + .long 0x2aa0c0e5 + .long 0xe0223c43 + .long 0x171b121d + .long 0x0d090e0b + .long 0xc78bf2ad + .long 0xa8b62db9 + .long 0xa91e14c8 + .long 0x19f15785 + .long 0x0775af4c + .long 0xdd99eebb + .long 0x607fa3fd + .long 0x2601f79f + .long 0xf5725cbc + .long 0x3b6644c5 + .long 0x7efb5b34 + .long 0x29438b76 + .long 0xc623cbdc + .long 0xfcedb668 + .long 0xf1e4b863 + .long 0xdc31d7ca + .long 0x85634210 + .long 0x22971340 + .long 0x11c68420 + .long 0x244a857d + .long 0x3dbbd2f8 + .long 0x32f9ae11 + .long 0xa129c76d + .long 0x2f9e1d4b + .long 0x30b2dcf3 + .long 0x52860dec + .long 0xe3c177d0 + .long 0x16b32b6c + .long 0xb970a999 + .long 0x489411fa + .long 0x64e94722 + .long 0x8cfca8c4 + .long 0x3ff0a01a + .long 0x2c7d56d8 + .long 0x903322ef + .long 0x4e4987c7 + .long 0xd138d9c1 + .long 0xa2ca8cfe + .long 0x0bd49836 + .long 0x81f5a6cf + .long 0xde7aa528 + .long 0x8eb7da26 + .long 0xbfad3fa4 + .long 0x9d3a2ce4 + .long 0x9278500d + .long 0xcc5f6a9b + .long 0x467e5462 + .long 0x138df6c2 + .long 0xb8d890e8 + .long 0xf7392e5e + .long 0xafc382f5 + .long 0x805d9fbe + .long 0x93d0697c + .long 0x2dd56fa9 + .long 0x1225cfb3 + .long 0x99acc83b + .long 0x7d1810a7 + .long 0x639ce86e + .long 0xbb3bdb7b + .long 0x7826cd09 + .long 0x18596ef4 + .long 0xb79aec01 + .long 0x9a4f83a8 + .long 0x6e95e665 + .long 0xe6ffaa7e + .long 0xcfbc2108 + .long 0xe815efe6 + .long 0x9be7bad9 + .long 0x366f4ace + .long 0x099fead4 + .long 0x7cb029d6 + .long 0xb2a431af + .long 0x233f2a31 + .long 0x94a5c630 + .long 0x66a235c0 + .long 0xbc4e7437 + .long 0xca82fca6 + .long 0xd090e0b0 + .long 0xd8a73315 + .long 0x9804f14a + .long 0xdaec41f7 + .long 0x50cd7f0e + .long 0xf691172f + .long 0xd64d768d + .long 0xb0ef434d + .long 0x4daacc54 + .long 0x0496e4df + .long 0xb5d19ee3 + .long 0x886a4c1b + .long 0x1f2cc1b8 + .long 0x5165467f + .long 0xea5e9d04 + .long 0x358c015d + .long 0x7487fa73 + .long 0x410bfb2e + .long 0x1d67b35a + .long 0xd2db9252 + .long 0x5610e933 + .long 0x47d66d13 + .long 0x61d79a8c + .long 0x0ca1377a + .long 0x14f8598e + .long 0x3c13eb89 + .long 0x27a9ceee + .long 0xc961b735 + .long 0xe51ce1ed + .long 0xb1477a3c + .long 0xdfd29c59 + .long 0x73f2553f + .long 0xce141879 + .long 0x37c773bf + .long 0xcdf753ea + .long 0xaafd5f5b + .long 0x6f3ddf14 + .long 0xdb447886 + .long 0xf3afca81 + .long 0xc468b93e + .long 0x3424382c + .long 0x40a3c25f + .long 0xc31d1672 + .long 0x25e2bc0c + .long 0x493c288b + .long 0x950dff41 + .long 0x01a83971 + .long 0xb30c08de + .long 0xe4b4d89c + .long 0xc1566490 + .long 0x84cb7b61 + .long 0xb632d570 + .long 0x5c6c4874 + .long 0x57b8d042 + // Table 2. 
+ .long 0xf45150a7 + .long 0x417e5365 + .long 0x171ac3a4 + .long 0x273a965e + .long 0xab3bcb6b + .long 0x9d1ff145 + .long 0xfaacab58 + .long 0xe34b9303 + .long 0x302055fa + .long 0x76adf66d + .long 0xcc889176 + .long 0x02f5254c + .long 0xe54ffcd7 + .long 0x2ac5d7cb + .long 0x35268044 + .long 0x62b58fa3 + .long 0xb1de495a + .long 0xba25671b + .long 0xea45980e + .long 0xfe5de1c0 + .long 0x2fc30275 + .long 0x4c8112f0 + .long 0x468da397 + .long 0xd36bc6f9 + .long 0x8f03e75f + .long 0x9215959c + .long 0x6dbfeb7a + .long 0x5295da59 + .long 0xbed42d83 + .long 0x7458d321 + .long 0xe0492969 + .long 0xc98e44c8 + .long 0xc2756a89 + .long 0x8ef47879 + .long 0x58996b3e + .long 0xb927dd71 + .long 0xe1beb64f + .long 0x88f017ad + .long 0x20c966ac + .long 0xce7db43a + .long 0xdf63184a + .long 0x1ae58231 + .long 0x51976033 + .long 0x5362457f + .long 0x64b1e077 + .long 0x6bbb84ae + .long 0x81fe1ca0 + .long 0x08f9942b + .long 0x48705868 + .long 0x458f19fd + .long 0xde94876c + .long 0x7b52b7f8 + .long 0x73ab23d3 + .long 0x4b72e202 + .long 0x1fe3578f + .long 0x55662aab + .long 0xebb20728 + .long 0xb52f03c2 + .long 0xc5869a7b + .long 0x37d3a508 + .long 0x2830f287 + .long 0xbf23b2a5 + .long 0x0302ba6a + .long 0x16ed5c82 + .long 0xcf8a2b1c + .long 0x79a792b4 + .long 0x07f3f0f2 + .long 0x694ea1e2 + .long 0xda65cdf4 + .long 0x0506d5be + .long 0x34d11f62 + .long 0xa6c48afe + .long 0x2e349d53 + .long 0xf3a2a055 + .long 0x8a0532e1 + .long 0xf6a475eb + .long 0x830b39ec + .long 0x6040aaef + .long 0x715e069f + .long 0x6ebd5110 + .long 0x213ef98a + .long 0xdd963d06 + .long 0x3eddae05 + .long 0xe64d46bd + .long 0x5491b58d + .long 0xc471055d + .long 0x06046fd4 + .long 0x5060ff15 + .long 0x981924fb + .long 0xbdd697e9 + .long 0x4089cc43 + .long 0xd967779e + .long 0xe8b0bd42 + .long 0x8907888b + .long 0x19e7385b + .long 0xc879dbee + .long 0x7ca1470a + .long 0x427ce90f + .long 0x84f8c91e + .long 0x00000000 + .long 0x80098386 + .long 0x2b3248ed + .long 0x111eac70 + .long 0x5a6c4e72 + .long 0x0efdfbff + .long 0x850f5638 + .long 0xae3d1ed5 + .long 0x2d362739 + .long 0x0f0a64d9 + .long 0x5c6821a6 + .long 0x5b9bd154 + .long 0x36243a2e + .long 0x0a0cb167 + .long 0x57930fe7 + .long 0xeeb4d296 + .long 0x9b1b9e91 + .long 0xc0804fc5 + .long 0xdc61a220 + .long 0x775a694b + .long 0x121c161a + .long 0x93e20aba + .long 0xa0c0e52a + .long 0x223c43e0 + .long 0x1b121d17 + .long 0x090e0b0d + .long 0x8bf2adc7 + .long 0xb62db9a8 + .long 0x1e14c8a9 + .long 0xf1578519 + .long 0x75af4c07 + .long 0x99eebbdd + .long 0x7fa3fd60 + .long 0x01f79f26 + .long 0x725cbcf5 + .long 0x6644c53b + .long 0xfb5b347e + .long 0x438b7629 + .long 0x23cbdcc6 + .long 0xedb668fc + .long 0xe4b863f1 + .long 0x31d7cadc + .long 0x63421085 + .long 0x97134022 + .long 0xc6842011 + .long 0x4a857d24 + .long 0xbbd2f83d + .long 0xf9ae1132 + .long 0x29c76da1 + .long 0x9e1d4b2f + .long 0xb2dcf330 + .long 0x860dec52 + .long 0xc177d0e3 + .long 0xb32b6c16 + .long 0x70a999b9 + .long 0x9411fa48 + .long 0xe9472264 + .long 0xfca8c48c + .long 0xf0a01a3f + .long 0x7d56d82c + .long 0x3322ef90 + .long 0x4987c74e + .long 0x38d9c1d1 + .long 0xca8cfea2 + .long 0xd498360b + .long 0xf5a6cf81 + .long 0x7aa528de + .long 0xb7da268e + .long 0xad3fa4bf + .long 0x3a2ce49d + .long 0x78500d92 + .long 0x5f6a9bcc + .long 0x7e546246 + .long 0x8df6c213 + .long 0xd890e8b8 + .long 0x392e5ef7 + .long 0xc382f5af + .long 0x5d9fbe80 + .long 0xd0697c93 + .long 0xd56fa92d + .long 0x25cfb312 + .long 0xacc83b99 + .long 0x1810a77d + .long 0x9ce86e63 + .long 0x3bdb7bbb + .long 0x26cd0978 + .long 0x596ef418 + .long 0x9aec01b7 + 
.long 0x4f83a89a + .long 0x95e6656e + .long 0xffaa7ee6 + .long 0xbc2108cf + .long 0x15efe6e8 + .long 0xe7bad99b + .long 0x6f4ace36 + .long 0x9fead409 + .long 0xb029d67c + .long 0xa431afb2 + .long 0x3f2a3123 + .long 0xa5c63094 + .long 0xa235c066 + .long 0x4e7437bc + .long 0x82fca6ca + .long 0x90e0b0d0 + .long 0xa73315d8 + .long 0x04f14a98 + .long 0xec41f7da + .long 0xcd7f0e50 + .long 0x91172ff6 + .long 0x4d768dd6 + .long 0xef434db0 + .long 0xaacc544d + .long 0x96e4df04 + .long 0xd19ee3b5 + .long 0x6a4c1b88 + .long 0x2cc1b81f + .long 0x65467f51 + .long 0x5e9d04ea + .long 0x8c015d35 + .long 0x87fa7374 + .long 0x0bfb2e41 + .long 0x67b35a1d + .long 0xdb9252d2 + .long 0x10e93356 + .long 0xd66d1347 + .long 0xd79a8c61 + .long 0xa1377a0c + .long 0xf8598e14 + .long 0x13eb893c + .long 0xa9ceee27 + .long 0x61b735c9 + .long 0x1ce1ede5 + .long 0x477a3cb1 + .long 0xd29c59df + .long 0xf2553f73 + .long 0x141879ce + .long 0xc773bf37 + .long 0xf753eacd + .long 0xfd5f5baa + .long 0x3ddf146f + .long 0x447886db + .long 0xafca81f3 + .long 0x68b93ec4 + .long 0x24382c34 + .long 0xa3c25f40 + .long 0x1d1672c3 + .long 0xe2bc0c25 + .long 0x3c288b49 + .long 0x0dff4195 + .long 0xa8397101 + .long 0x0c08deb3 + .long 0xb4d89ce4 + .long 0x566490c1 + .long 0xcb7b6184 + .long 0x32d570b6 + .long 0x6c48745c + .long 0xb8d04257 + // Table 3. + .long 0x5150a7f4 + .long 0x7e536541 + .long 0x1ac3a417 + .long 0x3a965e27 + .long 0x3bcb6bab + .long 0x1ff1459d + .long 0xacab58fa + .long 0x4b9303e3 + .long 0x2055fa30 + .long 0xadf66d76 + .long 0x889176cc + .long 0xf5254c02 + .long 0x4ffcd7e5 + .long 0xc5d7cb2a + .long 0x26804435 + .long 0xb58fa362 + .long 0xde495ab1 + .long 0x25671bba + .long 0x45980eea + .long 0x5de1c0fe + .long 0xc302752f + .long 0x8112f04c + .long 0x8da39746 + .long 0x6bc6f9d3 + .long 0x03e75f8f + .long 0x15959c92 + .long 0xbfeb7a6d + .long 0x95da5952 + .long 0xd42d83be + .long 0x58d32174 + .long 0x492969e0 + .long 0x8e44c8c9 + .long 0x756a89c2 + .long 0xf478798e + .long 0x996b3e58 + .long 0x27dd71b9 + .long 0xbeb64fe1 + .long 0xf017ad88 + .long 0xc966ac20 + .long 0x7db43ace + .long 0x63184adf + .long 0xe582311a + .long 0x97603351 + .long 0x62457f53 + .long 0xb1e07764 + .long 0xbb84ae6b + .long 0xfe1ca081 + .long 0xf9942b08 + .long 0x70586848 + .long 0x8f19fd45 + .long 0x94876cde + .long 0x52b7f87b + .long 0xab23d373 + .long 0x72e2024b + .long 0xe3578f1f + .long 0x662aab55 + .long 0xb20728eb + .long 0x2f03c2b5 + .long 0x869a7bc5 + .long 0xd3a50837 + .long 0x30f28728 + .long 0x23b2a5bf + .long 0x02ba6a03 + .long 0xed5c8216 + .long 0x8a2b1ccf + .long 0xa792b479 + .long 0xf3f0f207 + .long 0x4ea1e269 + .long 0x65cdf4da + .long 0x06d5be05 + .long 0xd11f6234 + .long 0xc48afea6 + .long 0x349d532e + .long 0xa2a055f3 + .long 0x0532e18a + .long 0xa475ebf6 + .long 0x0b39ec83 + .long 0x40aaef60 + .long 0x5e069f71 + .long 0xbd51106e + .long 0x3ef98a21 + .long 0x963d06dd + .long 0xddae053e + .long 0x4d46bde6 + .long 0x91b58d54 + .long 0x71055dc4 + .long 0x046fd406 + .long 0x60ff1550 + .long 0x1924fb98 + .long 0xd697e9bd + .long 0x89cc4340 + .long 0x67779ed9 + .long 0xb0bd42e8 + .long 0x07888b89 + .long 0xe7385b19 + .long 0x79dbeec8 + .long 0xa1470a7c + .long 0x7ce90f42 + .long 0xf8c91e84 + .long 0x00000000 + .long 0x09838680 + .long 0x3248ed2b + .long 0x1eac7011 + .long 0x6c4e725a + .long 0xfdfbff0e + .long 0x0f563885 + .long 0x3d1ed5ae + .long 0x3627392d + .long 0x0a64d90f + .long 0x6821a65c + .long 0x9bd1545b + .long 0x243a2e36 + .long 0x0cb1670a + .long 0x930fe757 + .long 0xb4d296ee + .long 0x1b9e919b + .long 0x804fc5c0 + .long 
0x61a220dc + .long 0x5a694b77 + .long 0x1c161a12 + .long 0xe20aba93 + .long 0xc0e52aa0 + .long 0x3c43e022 + .long 0x121d171b + .long 0x0e0b0d09 + .long 0xf2adc78b + .long 0x2db9a8b6 + .long 0x14c8a91e + .long 0x578519f1 + .long 0xaf4c0775 + .long 0xeebbdd99 + .long 0xa3fd607f + .long 0xf79f2601 + .long 0x5cbcf572 + .long 0x44c53b66 + .long 0x5b347efb + .long 0x8b762943 + .long 0xcbdcc623 + .long 0xb668fced + .long 0xb863f1e4 + .long 0xd7cadc31 + .long 0x42108563 + .long 0x13402297 + .long 0x842011c6 + .long 0x857d244a + .long 0xd2f83dbb + .long 0xae1132f9 + .long 0xc76da129 + .long 0x1d4b2f9e + .long 0xdcf330b2 + .long 0x0dec5286 + .long 0x77d0e3c1 + .long 0x2b6c16b3 + .long 0xa999b970 + .long 0x11fa4894 + .long 0x472264e9 + .long 0xa8c48cfc + .long 0xa01a3ff0 + .long 0x56d82c7d + .long 0x22ef9033 + .long 0x87c74e49 + .long 0xd9c1d138 + .long 0x8cfea2ca + .long 0x98360bd4 + .long 0xa6cf81f5 + .long 0xa528de7a + .long 0xda268eb7 + .long 0x3fa4bfad + .long 0x2ce49d3a + .long 0x500d9278 + .long 0x6a9bcc5f + .long 0x5462467e + .long 0xf6c2138d + .long 0x90e8b8d8 + .long 0x2e5ef739 + .long 0x82f5afc3 + .long 0x9fbe805d + .long 0x697c93d0 + .long 0x6fa92dd5 + .long 0xcfb31225 + .long 0xc83b99ac + .long 0x10a77d18 + .long 0xe86e639c + .long 0xdb7bbb3b + .long 0xcd097826 + .long 0x6ef41859 + .long 0xec01b79a + .long 0x83a89a4f + .long 0xe6656e95 + .long 0xaa7ee6ff + .long 0x2108cfbc + .long 0xefe6e815 + .long 0xbad99be7 + .long 0x4ace366f + .long 0xead4099f + .long 0x29d67cb0 + .long 0x31afb2a4 + .long 0x2a31233f + .long 0xc63094a5 + .long 0x35c066a2 + .long 0x7437bc4e + .long 0xfca6ca82 + .long 0xe0b0d090 + .long 0x3315d8a7 + .long 0xf14a9804 + .long 0x41f7daec + .long 0x7f0e50cd + .long 0x172ff691 + .long 0x768dd64d + .long 0x434db0ef + .long 0xcc544daa + .long 0xe4df0496 + .long 0x9ee3b5d1 + .long 0x4c1b886a + .long 0xc1b81f2c + .long 0x467f5165 + .long 0x9d04ea5e + .long 0x015d358c + .long 0xfa737487 + .long 0xfb2e410b + .long 0xb35a1d67 + .long 0x9252d2db + .long 0xe9335610 + .long 0x6d1347d6 + .long 0x9a8c61d7 + .long 0x377a0ca1 + .long 0x598e14f8 + .long 0xeb893c13 + .long 0xceee27a9 + .long 0xb735c961 + .long 0xe1ede51c + .long 0x7a3cb147 + .long 0x9c59dfd2 + .long 0x553f73f2 + .long 0x1879ce14 + .long 0x73bf37c7 + .long 0x53eacdf7 + .long 0x5f5baafd + .long 0xdf146f3d + .long 0x7886db44 + .long 0xca81f3af + .long 0xb93ec468 + .long 0x382c3424 + .long 0xc25f40a3 + .long 0x1672c31d + .long 0xbc0c25e2 + .long 0x288b493c + .long 0xff41950d + .long 0x397101a8 + .long 0x08deb30c + .long 0xd89ce4b4 + .long 0x6490c156 + .long 0x7b6184cb + .long 0xd570b632 + .long 0x48745c6c + .long 0xd04257b8 + + +// SubBytes embedded in words tables. + .globl _AESSubBytesWordTable + .private_extern _AESSubBytesWordTable + .align 2 +_AESSubBytesWordTable: + // Table 0. 
+ .long 0x00000063 + .long 0x0000007c + .long 0x00000077 + .long 0x0000007b + .long 0x000000f2 + .long 0x0000006b + .long 0x0000006f + .long 0x000000c5 + .long 0x00000030 + .long 0x00000001 + .long 0x00000067 + .long 0x0000002b + .long 0x000000fe + .long 0x000000d7 + .long 0x000000ab + .long 0x00000076 + .long 0x000000ca + .long 0x00000082 + .long 0x000000c9 + .long 0x0000007d + .long 0x000000fa + .long 0x00000059 + .long 0x00000047 + .long 0x000000f0 + .long 0x000000ad + .long 0x000000d4 + .long 0x000000a2 + .long 0x000000af + .long 0x0000009c + .long 0x000000a4 + .long 0x00000072 + .long 0x000000c0 + .long 0x000000b7 + .long 0x000000fd + .long 0x00000093 + .long 0x00000026 + .long 0x00000036 + .long 0x0000003f + .long 0x000000f7 + .long 0x000000cc + .long 0x00000034 + .long 0x000000a5 + .long 0x000000e5 + .long 0x000000f1 + .long 0x00000071 + .long 0x000000d8 + .long 0x00000031 + .long 0x00000015 + .long 0x00000004 + .long 0x000000c7 + .long 0x00000023 + .long 0x000000c3 + .long 0x00000018 + .long 0x00000096 + .long 0x00000005 + .long 0x0000009a + .long 0x00000007 + .long 0x00000012 + .long 0x00000080 + .long 0x000000e2 + .long 0x000000eb + .long 0x00000027 + .long 0x000000b2 + .long 0x00000075 + .long 0x00000009 + .long 0x00000083 + .long 0x0000002c + .long 0x0000001a + .long 0x0000001b + .long 0x0000006e + .long 0x0000005a + .long 0x000000a0 + .long 0x00000052 + .long 0x0000003b + .long 0x000000d6 + .long 0x000000b3 + .long 0x00000029 + .long 0x000000e3 + .long 0x0000002f + .long 0x00000084 + .long 0x00000053 + .long 0x000000d1 + .long 0x00000000 + .long 0x000000ed + .long 0x00000020 + .long 0x000000fc + .long 0x000000b1 + .long 0x0000005b + .long 0x0000006a + .long 0x000000cb + .long 0x000000be + .long 0x00000039 + .long 0x0000004a + .long 0x0000004c + .long 0x00000058 + .long 0x000000cf + .long 0x000000d0 + .long 0x000000ef + .long 0x000000aa + .long 0x000000fb + .long 0x00000043 + .long 0x0000004d + .long 0x00000033 + .long 0x00000085 + .long 0x00000045 + .long 0x000000f9 + .long 0x00000002 + .long 0x0000007f + .long 0x00000050 + .long 0x0000003c + .long 0x0000009f + .long 0x000000a8 + .long 0x00000051 + .long 0x000000a3 + .long 0x00000040 + .long 0x0000008f + .long 0x00000092 + .long 0x0000009d + .long 0x00000038 + .long 0x000000f5 + .long 0x000000bc + .long 0x000000b6 + .long 0x000000da + .long 0x00000021 + .long 0x00000010 + .long 0x000000ff + .long 0x000000f3 + .long 0x000000d2 + .long 0x000000cd + .long 0x0000000c + .long 0x00000013 + .long 0x000000ec + .long 0x0000005f + .long 0x00000097 + .long 0x00000044 + .long 0x00000017 + .long 0x000000c4 + .long 0x000000a7 + .long 0x0000007e + .long 0x0000003d + .long 0x00000064 + .long 0x0000005d + .long 0x00000019 + .long 0x00000073 + .long 0x00000060 + .long 0x00000081 + .long 0x0000004f + .long 0x000000dc + .long 0x00000022 + .long 0x0000002a + .long 0x00000090 + .long 0x00000088 + .long 0x00000046 + .long 0x000000ee + .long 0x000000b8 + .long 0x00000014 + .long 0x000000de + .long 0x0000005e + .long 0x0000000b + .long 0x000000db + .long 0x000000e0 + .long 0x00000032 + .long 0x0000003a + .long 0x0000000a + .long 0x00000049 + .long 0x00000006 + .long 0x00000024 + .long 0x0000005c + .long 0x000000c2 + .long 0x000000d3 + .long 0x000000ac + .long 0x00000062 + .long 0x00000091 + .long 0x00000095 + .long 0x000000e4 + .long 0x00000079 + .long 0x000000e7 + .long 0x000000c8 + .long 0x00000037 + .long 0x0000006d + .long 0x0000008d + .long 0x000000d5 + .long 0x0000004e + .long 0x000000a9 + .long 0x0000006c + .long 0x00000056 + .long 0x000000f4 + 
.long 0x000000ea + .long 0x00000065 + .long 0x0000007a + .long 0x000000ae + .long 0x00000008 + .long 0x000000ba + .long 0x00000078 + .long 0x00000025 + .long 0x0000002e + .long 0x0000001c + .long 0x000000a6 + .long 0x000000b4 + .long 0x000000c6 + .long 0x000000e8 + .long 0x000000dd + .long 0x00000074 + .long 0x0000001f + .long 0x0000004b + .long 0x000000bd + .long 0x0000008b + .long 0x0000008a + .long 0x00000070 + .long 0x0000003e + .long 0x000000b5 + .long 0x00000066 + .long 0x00000048 + .long 0x00000003 + .long 0x000000f6 + .long 0x0000000e + .long 0x00000061 + .long 0x00000035 + .long 0x00000057 + .long 0x000000b9 + .long 0x00000086 + .long 0x000000c1 + .long 0x0000001d + .long 0x0000009e + .long 0x000000e1 + .long 0x000000f8 + .long 0x00000098 + .long 0x00000011 + .long 0x00000069 + .long 0x000000d9 + .long 0x0000008e + .long 0x00000094 + .long 0x0000009b + .long 0x0000001e + .long 0x00000087 + .long 0x000000e9 + .long 0x000000ce + .long 0x00000055 + .long 0x00000028 + .long 0x000000df + .long 0x0000008c + .long 0x000000a1 + .long 0x00000089 + .long 0x0000000d + .long 0x000000bf + .long 0x000000e6 + .long 0x00000042 + .long 0x00000068 + .long 0x00000041 + .long 0x00000099 + .long 0x0000002d + .long 0x0000000f + .long 0x000000b0 + .long 0x00000054 + .long 0x000000bb + .long 0x00000016 + // Table 1. + .long 0x00006300 + .long 0x00007c00 + .long 0x00007700 + .long 0x00007b00 + .long 0x0000f200 + .long 0x00006b00 + .long 0x00006f00 + .long 0x0000c500 + .long 0x00003000 + .long 0x00000100 + .long 0x00006700 + .long 0x00002b00 + .long 0x0000fe00 + .long 0x0000d700 + .long 0x0000ab00 + .long 0x00007600 + .long 0x0000ca00 + .long 0x00008200 + .long 0x0000c900 + .long 0x00007d00 + .long 0x0000fa00 + .long 0x00005900 + .long 0x00004700 + .long 0x0000f000 + .long 0x0000ad00 + .long 0x0000d400 + .long 0x0000a200 + .long 0x0000af00 + .long 0x00009c00 + .long 0x0000a400 + .long 0x00007200 + .long 0x0000c000 + .long 0x0000b700 + .long 0x0000fd00 + .long 0x00009300 + .long 0x00002600 + .long 0x00003600 + .long 0x00003f00 + .long 0x0000f700 + .long 0x0000cc00 + .long 0x00003400 + .long 0x0000a500 + .long 0x0000e500 + .long 0x0000f100 + .long 0x00007100 + .long 0x0000d800 + .long 0x00003100 + .long 0x00001500 + .long 0x00000400 + .long 0x0000c700 + .long 0x00002300 + .long 0x0000c300 + .long 0x00001800 + .long 0x00009600 + .long 0x00000500 + .long 0x00009a00 + .long 0x00000700 + .long 0x00001200 + .long 0x00008000 + .long 0x0000e200 + .long 0x0000eb00 + .long 0x00002700 + .long 0x0000b200 + .long 0x00007500 + .long 0x00000900 + .long 0x00008300 + .long 0x00002c00 + .long 0x00001a00 + .long 0x00001b00 + .long 0x00006e00 + .long 0x00005a00 + .long 0x0000a000 + .long 0x00005200 + .long 0x00003b00 + .long 0x0000d600 + .long 0x0000b300 + .long 0x00002900 + .long 0x0000e300 + .long 0x00002f00 + .long 0x00008400 + .long 0x00005300 + .long 0x0000d100 + .long 0x00000000 + .long 0x0000ed00 + .long 0x00002000 + .long 0x0000fc00 + .long 0x0000b100 + .long 0x00005b00 + .long 0x00006a00 + .long 0x0000cb00 + .long 0x0000be00 + .long 0x00003900 + .long 0x00004a00 + .long 0x00004c00 + .long 0x00005800 + .long 0x0000cf00 + .long 0x0000d000 + .long 0x0000ef00 + .long 0x0000aa00 + .long 0x0000fb00 + .long 0x00004300 + .long 0x00004d00 + .long 0x00003300 + .long 0x00008500 + .long 0x00004500 + .long 0x0000f900 + .long 0x00000200 + .long 0x00007f00 + .long 0x00005000 + .long 0x00003c00 + .long 0x00009f00 + .long 0x0000a800 + .long 0x00005100 + .long 0x0000a300 + .long 0x00004000 + .long 0x00008f00 + .long 0x00009200 + .long 
0x00009d00 + .long 0x00003800 + .long 0x0000f500 + .long 0x0000bc00 + .long 0x0000b600 + .long 0x0000da00 + .long 0x00002100 + .long 0x00001000 + .long 0x0000ff00 + .long 0x0000f300 + .long 0x0000d200 + .long 0x0000cd00 + .long 0x00000c00 + .long 0x00001300 + .long 0x0000ec00 + .long 0x00005f00 + .long 0x00009700 + .long 0x00004400 + .long 0x00001700 + .long 0x0000c400 + .long 0x0000a700 + .long 0x00007e00 + .long 0x00003d00 + .long 0x00006400 + .long 0x00005d00 + .long 0x00001900 + .long 0x00007300 + .long 0x00006000 + .long 0x00008100 + .long 0x00004f00 + .long 0x0000dc00 + .long 0x00002200 + .long 0x00002a00 + .long 0x00009000 + .long 0x00008800 + .long 0x00004600 + .long 0x0000ee00 + .long 0x0000b800 + .long 0x00001400 + .long 0x0000de00 + .long 0x00005e00 + .long 0x00000b00 + .long 0x0000db00 + .long 0x0000e000 + .long 0x00003200 + .long 0x00003a00 + .long 0x00000a00 + .long 0x00004900 + .long 0x00000600 + .long 0x00002400 + .long 0x00005c00 + .long 0x0000c200 + .long 0x0000d300 + .long 0x0000ac00 + .long 0x00006200 + .long 0x00009100 + .long 0x00009500 + .long 0x0000e400 + .long 0x00007900 + .long 0x0000e700 + .long 0x0000c800 + .long 0x00003700 + .long 0x00006d00 + .long 0x00008d00 + .long 0x0000d500 + .long 0x00004e00 + .long 0x0000a900 + .long 0x00006c00 + .long 0x00005600 + .long 0x0000f400 + .long 0x0000ea00 + .long 0x00006500 + .long 0x00007a00 + .long 0x0000ae00 + .long 0x00000800 + .long 0x0000ba00 + .long 0x00007800 + .long 0x00002500 + .long 0x00002e00 + .long 0x00001c00 + .long 0x0000a600 + .long 0x0000b400 + .long 0x0000c600 + .long 0x0000e800 + .long 0x0000dd00 + .long 0x00007400 + .long 0x00001f00 + .long 0x00004b00 + .long 0x0000bd00 + .long 0x00008b00 + .long 0x00008a00 + .long 0x00007000 + .long 0x00003e00 + .long 0x0000b500 + .long 0x00006600 + .long 0x00004800 + .long 0x00000300 + .long 0x0000f600 + .long 0x00000e00 + .long 0x00006100 + .long 0x00003500 + .long 0x00005700 + .long 0x0000b900 + .long 0x00008600 + .long 0x0000c100 + .long 0x00001d00 + .long 0x00009e00 + .long 0x0000e100 + .long 0x0000f800 + .long 0x00009800 + .long 0x00001100 + .long 0x00006900 + .long 0x0000d900 + .long 0x00008e00 + .long 0x00009400 + .long 0x00009b00 + .long 0x00001e00 + .long 0x00008700 + .long 0x0000e900 + .long 0x0000ce00 + .long 0x00005500 + .long 0x00002800 + .long 0x0000df00 + .long 0x00008c00 + .long 0x0000a100 + .long 0x00008900 + .long 0x00000d00 + .long 0x0000bf00 + .long 0x0000e600 + .long 0x00004200 + .long 0x00006800 + .long 0x00004100 + .long 0x00009900 + .long 0x00002d00 + .long 0x00000f00 + .long 0x0000b000 + .long 0x00005400 + .long 0x0000bb00 + .long 0x00001600 + // Table 2. 
+ .long 0x00630000 + .long 0x007c0000 + .long 0x00770000 + .long 0x007b0000 + .long 0x00f20000 + .long 0x006b0000 + .long 0x006f0000 + .long 0x00c50000 + .long 0x00300000 + .long 0x00010000 + .long 0x00670000 + .long 0x002b0000 + .long 0x00fe0000 + .long 0x00d70000 + .long 0x00ab0000 + .long 0x00760000 + .long 0x00ca0000 + .long 0x00820000 + .long 0x00c90000 + .long 0x007d0000 + .long 0x00fa0000 + .long 0x00590000 + .long 0x00470000 + .long 0x00f00000 + .long 0x00ad0000 + .long 0x00d40000 + .long 0x00a20000 + .long 0x00af0000 + .long 0x009c0000 + .long 0x00a40000 + .long 0x00720000 + .long 0x00c00000 + .long 0x00b70000 + .long 0x00fd0000 + .long 0x00930000 + .long 0x00260000 + .long 0x00360000 + .long 0x003f0000 + .long 0x00f70000 + .long 0x00cc0000 + .long 0x00340000 + .long 0x00a50000 + .long 0x00e50000 + .long 0x00f10000 + .long 0x00710000 + .long 0x00d80000 + .long 0x00310000 + .long 0x00150000 + .long 0x00040000 + .long 0x00c70000 + .long 0x00230000 + .long 0x00c30000 + .long 0x00180000 + .long 0x00960000 + .long 0x00050000 + .long 0x009a0000 + .long 0x00070000 + .long 0x00120000 + .long 0x00800000 + .long 0x00e20000 + .long 0x00eb0000 + .long 0x00270000 + .long 0x00b20000 + .long 0x00750000 + .long 0x00090000 + .long 0x00830000 + .long 0x002c0000 + .long 0x001a0000 + .long 0x001b0000 + .long 0x006e0000 + .long 0x005a0000 + .long 0x00a00000 + .long 0x00520000 + .long 0x003b0000 + .long 0x00d60000 + .long 0x00b30000 + .long 0x00290000 + .long 0x00e30000 + .long 0x002f0000 + .long 0x00840000 + .long 0x00530000 + .long 0x00d10000 + .long 0x00000000 + .long 0x00ed0000 + .long 0x00200000 + .long 0x00fc0000 + .long 0x00b10000 + .long 0x005b0000 + .long 0x006a0000 + .long 0x00cb0000 + .long 0x00be0000 + .long 0x00390000 + .long 0x004a0000 + .long 0x004c0000 + .long 0x00580000 + .long 0x00cf0000 + .long 0x00d00000 + .long 0x00ef0000 + .long 0x00aa0000 + .long 0x00fb0000 + .long 0x00430000 + .long 0x004d0000 + .long 0x00330000 + .long 0x00850000 + .long 0x00450000 + .long 0x00f90000 + .long 0x00020000 + .long 0x007f0000 + .long 0x00500000 + .long 0x003c0000 + .long 0x009f0000 + .long 0x00a80000 + .long 0x00510000 + .long 0x00a30000 + .long 0x00400000 + .long 0x008f0000 + .long 0x00920000 + .long 0x009d0000 + .long 0x00380000 + .long 0x00f50000 + .long 0x00bc0000 + .long 0x00b60000 + .long 0x00da0000 + .long 0x00210000 + .long 0x00100000 + .long 0x00ff0000 + .long 0x00f30000 + .long 0x00d20000 + .long 0x00cd0000 + .long 0x000c0000 + .long 0x00130000 + .long 0x00ec0000 + .long 0x005f0000 + .long 0x00970000 + .long 0x00440000 + .long 0x00170000 + .long 0x00c40000 + .long 0x00a70000 + .long 0x007e0000 + .long 0x003d0000 + .long 0x00640000 + .long 0x005d0000 + .long 0x00190000 + .long 0x00730000 + .long 0x00600000 + .long 0x00810000 + .long 0x004f0000 + .long 0x00dc0000 + .long 0x00220000 + .long 0x002a0000 + .long 0x00900000 + .long 0x00880000 + .long 0x00460000 + .long 0x00ee0000 + .long 0x00b80000 + .long 0x00140000 + .long 0x00de0000 + .long 0x005e0000 + .long 0x000b0000 + .long 0x00db0000 + .long 0x00e00000 + .long 0x00320000 + .long 0x003a0000 + .long 0x000a0000 + .long 0x00490000 + .long 0x00060000 + .long 0x00240000 + .long 0x005c0000 + .long 0x00c20000 + .long 0x00d30000 + .long 0x00ac0000 + .long 0x00620000 + .long 0x00910000 + .long 0x00950000 + .long 0x00e40000 + .long 0x00790000 + .long 0x00e70000 + .long 0x00c80000 + .long 0x00370000 + .long 0x006d0000 + .long 0x008d0000 + .long 0x00d50000 + .long 0x004e0000 + .long 0x00a90000 + .long 0x006c0000 + .long 0x00560000 + .long 0x00f40000 + 
.long 0x00ea0000 + .long 0x00650000 + .long 0x007a0000 + .long 0x00ae0000 + .long 0x00080000 + .long 0x00ba0000 + .long 0x00780000 + .long 0x00250000 + .long 0x002e0000 + .long 0x001c0000 + .long 0x00a60000 + .long 0x00b40000 + .long 0x00c60000 + .long 0x00e80000 + .long 0x00dd0000 + .long 0x00740000 + .long 0x001f0000 + .long 0x004b0000 + .long 0x00bd0000 + .long 0x008b0000 + .long 0x008a0000 + .long 0x00700000 + .long 0x003e0000 + .long 0x00b50000 + .long 0x00660000 + .long 0x00480000 + .long 0x00030000 + .long 0x00f60000 + .long 0x000e0000 + .long 0x00610000 + .long 0x00350000 + .long 0x00570000 + .long 0x00b90000 + .long 0x00860000 + .long 0x00c10000 + .long 0x001d0000 + .long 0x009e0000 + .long 0x00e10000 + .long 0x00f80000 + .long 0x00980000 + .long 0x00110000 + .long 0x00690000 + .long 0x00d90000 + .long 0x008e0000 + .long 0x00940000 + .long 0x009b0000 + .long 0x001e0000 + .long 0x00870000 + .long 0x00e90000 + .long 0x00ce0000 + .long 0x00550000 + .long 0x00280000 + .long 0x00df0000 + .long 0x008c0000 + .long 0x00a10000 + .long 0x00890000 + .long 0x000d0000 + .long 0x00bf0000 + .long 0x00e60000 + .long 0x00420000 + .long 0x00680000 + .long 0x00410000 + .long 0x00990000 + .long 0x002d0000 + .long 0x000f0000 + .long 0x00b00000 + .long 0x00540000 + .long 0x00bb0000 + .long 0x00160000 + // Table 3. + .long 0x63000000 + .long 0x7c000000 + .long 0x77000000 + .long 0x7b000000 + .long 0xf2000000 + .long 0x6b000000 + .long 0x6f000000 + .long 0xc5000000 + .long 0x30000000 + .long 0x01000000 + .long 0x67000000 + .long 0x2b000000 + .long 0xfe000000 + .long 0xd7000000 + .long 0xab000000 + .long 0x76000000 + .long 0xca000000 + .long 0x82000000 + .long 0xc9000000 + .long 0x7d000000 + .long 0xfa000000 + .long 0x59000000 + .long 0x47000000 + .long 0xf0000000 + .long 0xad000000 + .long 0xd4000000 + .long 0xa2000000 + .long 0xaf000000 + .long 0x9c000000 + .long 0xa4000000 + .long 0x72000000 + .long 0xc0000000 + .long 0xb7000000 + .long 0xfd000000 + .long 0x93000000 + .long 0x26000000 + .long 0x36000000 + .long 0x3f000000 + .long 0xf7000000 + .long 0xcc000000 + .long 0x34000000 + .long 0xa5000000 + .long 0xe5000000 + .long 0xf1000000 + .long 0x71000000 + .long 0xd8000000 + .long 0x31000000 + .long 0x15000000 + .long 0x04000000 + .long 0xc7000000 + .long 0x23000000 + .long 0xc3000000 + .long 0x18000000 + .long 0x96000000 + .long 0x05000000 + .long 0x9a000000 + .long 0x07000000 + .long 0x12000000 + .long 0x80000000 + .long 0xe2000000 + .long 0xeb000000 + .long 0x27000000 + .long 0xb2000000 + .long 0x75000000 + .long 0x09000000 + .long 0x83000000 + .long 0x2c000000 + .long 0x1a000000 + .long 0x1b000000 + .long 0x6e000000 + .long 0x5a000000 + .long 0xa0000000 + .long 0x52000000 + .long 0x3b000000 + .long 0xd6000000 + .long 0xb3000000 + .long 0x29000000 + .long 0xe3000000 + .long 0x2f000000 + .long 0x84000000 + .long 0x53000000 + .long 0xd1000000 + .long 0x00000000 + .long 0xed000000 + .long 0x20000000 + .long 0xfc000000 + .long 0xb1000000 + .long 0x5b000000 + .long 0x6a000000 + .long 0xcb000000 + .long 0xbe000000 + .long 0x39000000 + .long 0x4a000000 + .long 0x4c000000 + .long 0x58000000 + .long 0xcf000000 + .long 0xd0000000 + .long 0xef000000 + .long 0xaa000000 + .long 0xfb000000 + .long 0x43000000 + .long 0x4d000000 + .long 0x33000000 + .long 0x85000000 + .long 0x45000000 + .long 0xf9000000 + .long 0x02000000 + .long 0x7f000000 + .long 0x50000000 + .long 0x3c000000 + .long 0x9f000000 + .long 0xa8000000 + .long 0x51000000 + .long 0xa3000000 + .long 0x40000000 + .long 0x8f000000 + .long 0x92000000 + .long 
0x9d000000 + .long 0x38000000 + .long 0xf5000000 + .long 0xbc000000 + .long 0xb6000000 + .long 0xda000000 + .long 0x21000000 + .long 0x10000000 + .long 0xff000000 + .long 0xf3000000 + .long 0xd2000000 + .long 0xcd000000 + .long 0x0c000000 + .long 0x13000000 + .long 0xec000000 + .long 0x5f000000 + .long 0x97000000 + .long 0x44000000 + .long 0x17000000 + .long 0xc4000000 + .long 0xa7000000 + .long 0x7e000000 + .long 0x3d000000 + .long 0x64000000 + .long 0x5d000000 + .long 0x19000000 + .long 0x73000000 + .long 0x60000000 + .long 0x81000000 + .long 0x4f000000 + .long 0xdc000000 + .long 0x22000000 + .long 0x2a000000 + .long 0x90000000 + .long 0x88000000 + .long 0x46000000 + .long 0xee000000 + .long 0xb8000000 + .long 0x14000000 + .long 0xde000000 + .long 0x5e000000 + .long 0x0b000000 + .long 0xdb000000 + .long 0xe0000000 + .long 0x32000000 + .long 0x3a000000 + .long 0x0a000000 + .long 0x49000000 + .long 0x06000000 + .long 0x24000000 + .long 0x5c000000 + .long 0xc2000000 + .long 0xd3000000 + .long 0xac000000 + .long 0x62000000 + .long 0x91000000 + .long 0x95000000 + .long 0xe4000000 + .long 0x79000000 + .long 0xe7000000 + .long 0xc8000000 + .long 0x37000000 + .long 0x6d000000 + .long 0x8d000000 + .long 0xd5000000 + .long 0x4e000000 + .long 0xa9000000 + .long 0x6c000000 + .long 0x56000000 + .long 0xf4000000 + .long 0xea000000 + .long 0x65000000 + .long 0x7a000000 + .long 0xae000000 + .long 0x08000000 + .long 0xba000000 + .long 0x78000000 + .long 0x25000000 + .long 0x2e000000 + .long 0x1c000000 + .long 0xa6000000 + .long 0xb4000000 + .long 0xc6000000 + .long 0xe8000000 + .long 0xdd000000 + .long 0x74000000 + .long 0x1f000000 + .long 0x4b000000 + .long 0xbd000000 + .long 0x8b000000 + .long 0x8a000000 + .long 0x70000000 + .long 0x3e000000 + .long 0xb5000000 + .long 0x66000000 + .long 0x48000000 + .long 0x03000000 + .long 0xf6000000 + .long 0x0e000000 + .long 0x61000000 + .long 0x35000000 + .long 0x57000000 + .long 0xb9000000 + .long 0x86000000 + .long 0xc1000000 + .long 0x1d000000 + .long 0x9e000000 + .long 0xe1000000 + .long 0xf8000000 + .long 0x98000000 + .long 0x11000000 + .long 0x69000000 + .long 0xd9000000 + .long 0x8e000000 + .long 0x94000000 + .long 0x9b000000 + .long 0x1e000000 + .long 0x87000000 + .long 0xe9000000 + .long 0xce000000 + .long 0x55000000 + .long 0x28000000 + .long 0xdf000000 + .long 0x8c000000 + .long 0xa1000000 + .long 0x89000000 + .long 0x0d000000 + .long 0xbf000000 + .long 0xe6000000 + .long 0x42000000 + .long 0x68000000 + .long 0x41000000 + .long 0x99000000 + .long 0x2d000000 + .long 0x0f000000 + .long 0xb0000000 + .long 0x54000000 + .long 0xbb000000 + .long 0x16000000 + + +// InvSubBytes embedded in words tables. + .globl _AESInvSubBytesWordTable + .private_extern _AESInvSubBytesWordTable + .align 2 +_AESInvSubBytesWordTable: + // Table 0. 
+ .long 0x00000052 + .long 0x00000009 + .long 0x0000006a + .long 0x000000d5 + .long 0x00000030 + .long 0x00000036 + .long 0x000000a5 + .long 0x00000038 + .long 0x000000bf + .long 0x00000040 + .long 0x000000a3 + .long 0x0000009e + .long 0x00000081 + .long 0x000000f3 + .long 0x000000d7 + .long 0x000000fb + .long 0x0000007c + .long 0x000000e3 + .long 0x00000039 + .long 0x00000082 + .long 0x0000009b + .long 0x0000002f + .long 0x000000ff + .long 0x00000087 + .long 0x00000034 + .long 0x0000008e + .long 0x00000043 + .long 0x00000044 + .long 0x000000c4 + .long 0x000000de + .long 0x000000e9 + .long 0x000000cb + .long 0x00000054 + .long 0x0000007b + .long 0x00000094 + .long 0x00000032 + .long 0x000000a6 + .long 0x000000c2 + .long 0x00000023 + .long 0x0000003d + .long 0x000000ee + .long 0x0000004c + .long 0x00000095 + .long 0x0000000b + .long 0x00000042 + .long 0x000000fa + .long 0x000000c3 + .long 0x0000004e + .long 0x00000008 + .long 0x0000002e + .long 0x000000a1 + .long 0x00000066 + .long 0x00000028 + .long 0x000000d9 + .long 0x00000024 + .long 0x000000b2 + .long 0x00000076 + .long 0x0000005b + .long 0x000000a2 + .long 0x00000049 + .long 0x0000006d + .long 0x0000008b + .long 0x000000d1 + .long 0x00000025 + .long 0x00000072 + .long 0x000000f8 + .long 0x000000f6 + .long 0x00000064 + .long 0x00000086 + .long 0x00000068 + .long 0x00000098 + .long 0x00000016 + .long 0x000000d4 + .long 0x000000a4 + .long 0x0000005c + .long 0x000000cc + .long 0x0000005d + .long 0x00000065 + .long 0x000000b6 + .long 0x00000092 + .long 0x0000006c + .long 0x00000070 + .long 0x00000048 + .long 0x00000050 + .long 0x000000fd + .long 0x000000ed + .long 0x000000b9 + .long 0x000000da + .long 0x0000005e + .long 0x00000015 + .long 0x00000046 + .long 0x00000057 + .long 0x000000a7 + .long 0x0000008d + .long 0x0000009d + .long 0x00000084 + .long 0x00000090 + .long 0x000000d8 + .long 0x000000ab + .long 0x00000000 + .long 0x0000008c + .long 0x000000bc + .long 0x000000d3 + .long 0x0000000a + .long 0x000000f7 + .long 0x000000e4 + .long 0x00000058 + .long 0x00000005 + .long 0x000000b8 + .long 0x000000b3 + .long 0x00000045 + .long 0x00000006 + .long 0x000000d0 + .long 0x0000002c + .long 0x0000001e + .long 0x0000008f + .long 0x000000ca + .long 0x0000003f + .long 0x0000000f + .long 0x00000002 + .long 0x000000c1 + .long 0x000000af + .long 0x000000bd + .long 0x00000003 + .long 0x00000001 + .long 0x00000013 + .long 0x0000008a + .long 0x0000006b + .long 0x0000003a + .long 0x00000091 + .long 0x00000011 + .long 0x00000041 + .long 0x0000004f + .long 0x00000067 + .long 0x000000dc + .long 0x000000ea + .long 0x00000097 + .long 0x000000f2 + .long 0x000000cf + .long 0x000000ce + .long 0x000000f0 + .long 0x000000b4 + .long 0x000000e6 + .long 0x00000073 + .long 0x00000096 + .long 0x000000ac + .long 0x00000074 + .long 0x00000022 + .long 0x000000e7 + .long 0x000000ad + .long 0x00000035 + .long 0x00000085 + .long 0x000000e2 + .long 0x000000f9 + .long 0x00000037 + .long 0x000000e8 + .long 0x0000001c + .long 0x00000075 + .long 0x000000df + .long 0x0000006e + .long 0x00000047 + .long 0x000000f1 + .long 0x0000001a + .long 0x00000071 + .long 0x0000001d + .long 0x00000029 + .long 0x000000c5 + .long 0x00000089 + .long 0x0000006f + .long 0x000000b7 + .long 0x00000062 + .long 0x0000000e + .long 0x000000aa + .long 0x00000018 + .long 0x000000be + .long 0x0000001b + .long 0x000000fc + .long 0x00000056 + .long 0x0000003e + .long 0x0000004b + .long 0x000000c6 + .long 0x000000d2 + .long 0x00000079 + .long 0x00000020 + .long 0x0000009a + .long 0x000000db + .long 0x000000c0 + 
.long 0x000000fe + .long 0x00000078 + .long 0x000000cd + .long 0x0000005a + .long 0x000000f4 + .long 0x0000001f + .long 0x000000dd + .long 0x000000a8 + .long 0x00000033 + .long 0x00000088 + .long 0x00000007 + .long 0x000000c7 + .long 0x00000031 + .long 0x000000b1 + .long 0x00000012 + .long 0x00000010 + .long 0x00000059 + .long 0x00000027 + .long 0x00000080 + .long 0x000000ec + .long 0x0000005f + .long 0x00000060 + .long 0x00000051 + .long 0x0000007f + .long 0x000000a9 + .long 0x00000019 + .long 0x000000b5 + .long 0x0000004a + .long 0x0000000d + .long 0x0000002d + .long 0x000000e5 + .long 0x0000007a + .long 0x0000009f + .long 0x00000093 + .long 0x000000c9 + .long 0x0000009c + .long 0x000000ef + .long 0x000000a0 + .long 0x000000e0 + .long 0x0000003b + .long 0x0000004d + .long 0x000000ae + .long 0x0000002a + .long 0x000000f5 + .long 0x000000b0 + .long 0x000000c8 + .long 0x000000eb + .long 0x000000bb + .long 0x0000003c + .long 0x00000083 + .long 0x00000053 + .long 0x00000099 + .long 0x00000061 + .long 0x00000017 + .long 0x0000002b + .long 0x00000004 + .long 0x0000007e + .long 0x000000ba + .long 0x00000077 + .long 0x000000d6 + .long 0x00000026 + .long 0x000000e1 + .long 0x00000069 + .long 0x00000014 + .long 0x00000063 + .long 0x00000055 + .long 0x00000021 + .long 0x0000000c + .long 0x0000007d + // Table 1. + .long 0x00005200 + .long 0x00000900 + .long 0x00006a00 + .long 0x0000d500 + .long 0x00003000 + .long 0x00003600 + .long 0x0000a500 + .long 0x00003800 + .long 0x0000bf00 + .long 0x00004000 + .long 0x0000a300 + .long 0x00009e00 + .long 0x00008100 + .long 0x0000f300 + .long 0x0000d700 + .long 0x0000fb00 + .long 0x00007c00 + .long 0x0000e300 + .long 0x00003900 + .long 0x00008200 + .long 0x00009b00 + .long 0x00002f00 + .long 0x0000ff00 + .long 0x00008700 + .long 0x00003400 + .long 0x00008e00 + .long 0x00004300 + .long 0x00004400 + .long 0x0000c400 + .long 0x0000de00 + .long 0x0000e900 + .long 0x0000cb00 + .long 0x00005400 + .long 0x00007b00 + .long 0x00009400 + .long 0x00003200 + .long 0x0000a600 + .long 0x0000c200 + .long 0x00002300 + .long 0x00003d00 + .long 0x0000ee00 + .long 0x00004c00 + .long 0x00009500 + .long 0x00000b00 + .long 0x00004200 + .long 0x0000fa00 + .long 0x0000c300 + .long 0x00004e00 + .long 0x00000800 + .long 0x00002e00 + .long 0x0000a100 + .long 0x00006600 + .long 0x00002800 + .long 0x0000d900 + .long 0x00002400 + .long 0x0000b200 + .long 0x00007600 + .long 0x00005b00 + .long 0x0000a200 + .long 0x00004900 + .long 0x00006d00 + .long 0x00008b00 + .long 0x0000d100 + .long 0x00002500 + .long 0x00007200 + .long 0x0000f800 + .long 0x0000f600 + .long 0x00006400 + .long 0x00008600 + .long 0x00006800 + .long 0x00009800 + .long 0x00001600 + .long 0x0000d400 + .long 0x0000a400 + .long 0x00005c00 + .long 0x0000cc00 + .long 0x00005d00 + .long 0x00006500 + .long 0x0000b600 + .long 0x00009200 + .long 0x00006c00 + .long 0x00007000 + .long 0x00004800 + .long 0x00005000 + .long 0x0000fd00 + .long 0x0000ed00 + .long 0x0000b900 + .long 0x0000da00 + .long 0x00005e00 + .long 0x00001500 + .long 0x00004600 + .long 0x00005700 + .long 0x0000a700 + .long 0x00008d00 + .long 0x00009d00 + .long 0x00008400 + .long 0x00009000 + .long 0x0000d800 + .long 0x0000ab00 + .long 0x00000000 + .long 0x00008c00 + .long 0x0000bc00 + .long 0x0000d300 + .long 0x00000a00 + .long 0x0000f700 + .long 0x0000e400 + .long 0x00005800 + .long 0x00000500 + .long 0x0000b800 + .long 0x0000b300 + .long 0x00004500 + .long 0x00000600 + .long 0x0000d000 + .long 0x00002c00 + .long 0x00001e00 + .long 0x00008f00 + .long 0x0000ca00 + .long 
0x00003f00 + .long 0x00000f00 + .long 0x00000200 + .long 0x0000c100 + .long 0x0000af00 + .long 0x0000bd00 + .long 0x00000300 + .long 0x00000100 + .long 0x00001300 + .long 0x00008a00 + .long 0x00006b00 + .long 0x00003a00 + .long 0x00009100 + .long 0x00001100 + .long 0x00004100 + .long 0x00004f00 + .long 0x00006700 + .long 0x0000dc00 + .long 0x0000ea00 + .long 0x00009700 + .long 0x0000f200 + .long 0x0000cf00 + .long 0x0000ce00 + .long 0x0000f000 + .long 0x0000b400 + .long 0x0000e600 + .long 0x00007300 + .long 0x00009600 + .long 0x0000ac00 + .long 0x00007400 + .long 0x00002200 + .long 0x0000e700 + .long 0x0000ad00 + .long 0x00003500 + .long 0x00008500 + .long 0x0000e200 + .long 0x0000f900 + .long 0x00003700 + .long 0x0000e800 + .long 0x00001c00 + .long 0x00007500 + .long 0x0000df00 + .long 0x00006e00 + .long 0x00004700 + .long 0x0000f100 + .long 0x00001a00 + .long 0x00007100 + .long 0x00001d00 + .long 0x00002900 + .long 0x0000c500 + .long 0x00008900 + .long 0x00006f00 + .long 0x0000b700 + .long 0x00006200 + .long 0x00000e00 + .long 0x0000aa00 + .long 0x00001800 + .long 0x0000be00 + .long 0x00001b00 + .long 0x0000fc00 + .long 0x00005600 + .long 0x00003e00 + .long 0x00004b00 + .long 0x0000c600 + .long 0x0000d200 + .long 0x00007900 + .long 0x00002000 + .long 0x00009a00 + .long 0x0000db00 + .long 0x0000c000 + .long 0x0000fe00 + .long 0x00007800 + .long 0x0000cd00 + .long 0x00005a00 + .long 0x0000f400 + .long 0x00001f00 + .long 0x0000dd00 + .long 0x0000a800 + .long 0x00003300 + .long 0x00008800 + .long 0x00000700 + .long 0x0000c700 + .long 0x00003100 + .long 0x0000b100 + .long 0x00001200 + .long 0x00001000 + .long 0x00005900 + .long 0x00002700 + .long 0x00008000 + .long 0x0000ec00 + .long 0x00005f00 + .long 0x00006000 + .long 0x00005100 + .long 0x00007f00 + .long 0x0000a900 + .long 0x00001900 + .long 0x0000b500 + .long 0x00004a00 + .long 0x00000d00 + .long 0x00002d00 + .long 0x0000e500 + .long 0x00007a00 + .long 0x00009f00 + .long 0x00009300 + .long 0x0000c900 + .long 0x00009c00 + .long 0x0000ef00 + .long 0x0000a000 + .long 0x0000e000 + .long 0x00003b00 + .long 0x00004d00 + .long 0x0000ae00 + .long 0x00002a00 + .long 0x0000f500 + .long 0x0000b000 + .long 0x0000c800 + .long 0x0000eb00 + .long 0x0000bb00 + .long 0x00003c00 + .long 0x00008300 + .long 0x00005300 + .long 0x00009900 + .long 0x00006100 + .long 0x00001700 + .long 0x00002b00 + .long 0x00000400 + .long 0x00007e00 + .long 0x0000ba00 + .long 0x00007700 + .long 0x0000d600 + .long 0x00002600 + .long 0x0000e100 + .long 0x00006900 + .long 0x00001400 + .long 0x00006300 + .long 0x00005500 + .long 0x00002100 + .long 0x00000c00 + .long 0x00007d00 + // Table 2. 
+ .long 0x00520000 + .long 0x00090000 + .long 0x006a0000 + .long 0x00d50000 + .long 0x00300000 + .long 0x00360000 + .long 0x00a50000 + .long 0x00380000 + .long 0x00bf0000 + .long 0x00400000 + .long 0x00a30000 + .long 0x009e0000 + .long 0x00810000 + .long 0x00f30000 + .long 0x00d70000 + .long 0x00fb0000 + .long 0x007c0000 + .long 0x00e30000 + .long 0x00390000 + .long 0x00820000 + .long 0x009b0000 + .long 0x002f0000 + .long 0x00ff0000 + .long 0x00870000 + .long 0x00340000 + .long 0x008e0000 + .long 0x00430000 + .long 0x00440000 + .long 0x00c40000 + .long 0x00de0000 + .long 0x00e90000 + .long 0x00cb0000 + .long 0x00540000 + .long 0x007b0000 + .long 0x00940000 + .long 0x00320000 + .long 0x00a60000 + .long 0x00c20000 + .long 0x00230000 + .long 0x003d0000 + .long 0x00ee0000 + .long 0x004c0000 + .long 0x00950000 + .long 0x000b0000 + .long 0x00420000 + .long 0x00fa0000 + .long 0x00c30000 + .long 0x004e0000 + .long 0x00080000 + .long 0x002e0000 + .long 0x00a10000 + .long 0x00660000 + .long 0x00280000 + .long 0x00d90000 + .long 0x00240000 + .long 0x00b20000 + .long 0x00760000 + .long 0x005b0000 + .long 0x00a20000 + .long 0x00490000 + .long 0x006d0000 + .long 0x008b0000 + .long 0x00d10000 + .long 0x00250000 + .long 0x00720000 + .long 0x00f80000 + .long 0x00f60000 + .long 0x00640000 + .long 0x00860000 + .long 0x00680000 + .long 0x00980000 + .long 0x00160000 + .long 0x00d40000 + .long 0x00a40000 + .long 0x005c0000 + .long 0x00cc0000 + .long 0x005d0000 + .long 0x00650000 + .long 0x00b60000 + .long 0x00920000 + .long 0x006c0000 + .long 0x00700000 + .long 0x00480000 + .long 0x00500000 + .long 0x00fd0000 + .long 0x00ed0000 + .long 0x00b90000 + .long 0x00da0000 + .long 0x005e0000 + .long 0x00150000 + .long 0x00460000 + .long 0x00570000 + .long 0x00a70000 + .long 0x008d0000 + .long 0x009d0000 + .long 0x00840000 + .long 0x00900000 + .long 0x00d80000 + .long 0x00ab0000 + .long 0x00000000 + .long 0x008c0000 + .long 0x00bc0000 + .long 0x00d30000 + .long 0x000a0000 + .long 0x00f70000 + .long 0x00e40000 + .long 0x00580000 + .long 0x00050000 + .long 0x00b80000 + .long 0x00b30000 + .long 0x00450000 + .long 0x00060000 + .long 0x00d00000 + .long 0x002c0000 + .long 0x001e0000 + .long 0x008f0000 + .long 0x00ca0000 + .long 0x003f0000 + .long 0x000f0000 + .long 0x00020000 + .long 0x00c10000 + .long 0x00af0000 + .long 0x00bd0000 + .long 0x00030000 + .long 0x00010000 + .long 0x00130000 + .long 0x008a0000 + .long 0x006b0000 + .long 0x003a0000 + .long 0x00910000 + .long 0x00110000 + .long 0x00410000 + .long 0x004f0000 + .long 0x00670000 + .long 0x00dc0000 + .long 0x00ea0000 + .long 0x00970000 + .long 0x00f20000 + .long 0x00cf0000 + .long 0x00ce0000 + .long 0x00f00000 + .long 0x00b40000 + .long 0x00e60000 + .long 0x00730000 + .long 0x00960000 + .long 0x00ac0000 + .long 0x00740000 + .long 0x00220000 + .long 0x00e70000 + .long 0x00ad0000 + .long 0x00350000 + .long 0x00850000 + .long 0x00e20000 + .long 0x00f90000 + .long 0x00370000 + .long 0x00e80000 + .long 0x001c0000 + .long 0x00750000 + .long 0x00df0000 + .long 0x006e0000 + .long 0x00470000 + .long 0x00f10000 + .long 0x001a0000 + .long 0x00710000 + .long 0x001d0000 + .long 0x00290000 + .long 0x00c50000 + .long 0x00890000 + .long 0x006f0000 + .long 0x00b70000 + .long 0x00620000 + .long 0x000e0000 + .long 0x00aa0000 + .long 0x00180000 + .long 0x00be0000 + .long 0x001b0000 + .long 0x00fc0000 + .long 0x00560000 + .long 0x003e0000 + .long 0x004b0000 + .long 0x00c60000 + .long 0x00d20000 + .long 0x00790000 + .long 0x00200000 + .long 0x009a0000 + .long 0x00db0000 + .long 0x00c00000 + 
.long 0x00fe0000 + .long 0x00780000 + .long 0x00cd0000 + .long 0x005a0000 + .long 0x00f40000 + .long 0x001f0000 + .long 0x00dd0000 + .long 0x00a80000 + .long 0x00330000 + .long 0x00880000 + .long 0x00070000 + .long 0x00c70000 + .long 0x00310000 + .long 0x00b10000 + .long 0x00120000 + .long 0x00100000 + .long 0x00590000 + .long 0x00270000 + .long 0x00800000 + .long 0x00ec0000 + .long 0x005f0000 + .long 0x00600000 + .long 0x00510000 + .long 0x007f0000 + .long 0x00a90000 + .long 0x00190000 + .long 0x00b50000 + .long 0x004a0000 + .long 0x000d0000 + .long 0x002d0000 + .long 0x00e50000 + .long 0x007a0000 + .long 0x009f0000 + .long 0x00930000 + .long 0x00c90000 + .long 0x009c0000 + .long 0x00ef0000 + .long 0x00a00000 + .long 0x00e00000 + .long 0x003b0000 + .long 0x004d0000 + .long 0x00ae0000 + .long 0x002a0000 + .long 0x00f50000 + .long 0x00b00000 + .long 0x00c80000 + .long 0x00eb0000 + .long 0x00bb0000 + .long 0x003c0000 + .long 0x00830000 + .long 0x00530000 + .long 0x00990000 + .long 0x00610000 + .long 0x00170000 + .long 0x002b0000 + .long 0x00040000 + .long 0x007e0000 + .long 0x00ba0000 + .long 0x00770000 + .long 0x00d60000 + .long 0x00260000 + .long 0x00e10000 + .long 0x00690000 + .long 0x00140000 + .long 0x00630000 + .long 0x00550000 + .long 0x00210000 + .long 0x000c0000 + .long 0x007d0000 + // Table 3. + .long 0x52000000 + .long 0x09000000 + .long 0x6a000000 + .long 0xd5000000 + .long 0x30000000 + .long 0x36000000 + .long 0xa5000000 + .long 0x38000000 + .long 0xbf000000 + .long 0x40000000 + .long 0xa3000000 + .long 0x9e000000 + .long 0x81000000 + .long 0xf3000000 + .long 0xd7000000 + .long 0xfb000000 + .long 0x7c000000 + .long 0xe3000000 + .long 0x39000000 + .long 0x82000000 + .long 0x9b000000 + .long 0x2f000000 + .long 0xff000000 + .long 0x87000000 + .long 0x34000000 + .long 0x8e000000 + .long 0x43000000 + .long 0x44000000 + .long 0xc4000000 + .long 0xde000000 + .long 0xe9000000 + .long 0xcb000000 + .long 0x54000000 + .long 0x7b000000 + .long 0x94000000 + .long 0x32000000 + .long 0xa6000000 + .long 0xc2000000 + .long 0x23000000 + .long 0x3d000000 + .long 0xee000000 + .long 0x4c000000 + .long 0x95000000 + .long 0x0b000000 + .long 0x42000000 + .long 0xfa000000 + .long 0xc3000000 + .long 0x4e000000 + .long 0x08000000 + .long 0x2e000000 + .long 0xa1000000 + .long 0x66000000 + .long 0x28000000 + .long 0xd9000000 + .long 0x24000000 + .long 0xb2000000 + .long 0x76000000 + .long 0x5b000000 + .long 0xa2000000 + .long 0x49000000 + .long 0x6d000000 + .long 0x8b000000 + .long 0xd1000000 + .long 0x25000000 + .long 0x72000000 + .long 0xf8000000 + .long 0xf6000000 + .long 0x64000000 + .long 0x86000000 + .long 0x68000000 + .long 0x98000000 + .long 0x16000000 + .long 0xd4000000 + .long 0xa4000000 + .long 0x5c000000 + .long 0xcc000000 + .long 0x5d000000 + .long 0x65000000 + .long 0xb6000000 + .long 0x92000000 + .long 0x6c000000 + .long 0x70000000 + .long 0x48000000 + .long 0x50000000 + .long 0xfd000000 + .long 0xed000000 + .long 0xb9000000 + .long 0xda000000 + .long 0x5e000000 + .long 0x15000000 + .long 0x46000000 + .long 0x57000000 + .long 0xa7000000 + .long 0x8d000000 + .long 0x9d000000 + .long 0x84000000 + .long 0x90000000 + .long 0xd8000000 + .long 0xab000000 + .long 0x00000000 + .long 0x8c000000 + .long 0xbc000000 + .long 0xd3000000 + .long 0x0a000000 + .long 0xf7000000 + .long 0xe4000000 + .long 0x58000000 + .long 0x05000000 + .long 0xb8000000 + .long 0xb3000000 + .long 0x45000000 + .long 0x06000000 + .long 0xd0000000 + .long 0x2c000000 + .long 0x1e000000 + .long 0x8f000000 + .long 0xca000000 + .long 
0x3f000000 + .long 0x0f000000 + .long 0x02000000 + .long 0xc1000000 + .long 0xaf000000 + .long 0xbd000000 + .long 0x03000000 + .long 0x01000000 + .long 0x13000000 + .long 0x8a000000 + .long 0x6b000000 + .long 0x3a000000 + .long 0x91000000 + .long 0x11000000 + .long 0x41000000 + .long 0x4f000000 + .long 0x67000000 + .long 0xdc000000 + .long 0xea000000 + .long 0x97000000 + .long 0xf2000000 + .long 0xcf000000 + .long 0xce000000 + .long 0xf0000000 + .long 0xb4000000 + .long 0xe6000000 + .long 0x73000000 + .long 0x96000000 + .long 0xac000000 + .long 0x74000000 + .long 0x22000000 + .long 0xe7000000 + .long 0xad000000 + .long 0x35000000 + .long 0x85000000 + .long 0xe2000000 + .long 0xf9000000 + .long 0x37000000 + .long 0xe8000000 + .long 0x1c000000 + .long 0x75000000 + .long 0xdf000000 + .long 0x6e000000 + .long 0x47000000 + .long 0xf1000000 + .long 0x1a000000 + .long 0x71000000 + .long 0x1d000000 + .long 0x29000000 + .long 0xc5000000 + .long 0x89000000 + .long 0x6f000000 + .long 0xb7000000 + .long 0x62000000 + .long 0x0e000000 + .long 0xaa000000 + .long 0x18000000 + .long 0xbe000000 + .long 0x1b000000 + .long 0xfc000000 + .long 0x56000000 + .long 0x3e000000 + .long 0x4b000000 + .long 0xc6000000 + .long 0xd2000000 + .long 0x79000000 + .long 0x20000000 + .long 0x9a000000 + .long 0xdb000000 + .long 0xc0000000 + .long 0xfe000000 + .long 0x78000000 + .long 0xcd000000 + .long 0x5a000000 + .long 0xf4000000 + .long 0x1f000000 + .long 0xdd000000 + .long 0xa8000000 + .long 0x33000000 + .long 0x88000000 + .long 0x07000000 + .long 0xc7000000 + .long 0x31000000 + .long 0xb1000000 + .long 0x12000000 + .long 0x10000000 + .long 0x59000000 + .long 0x27000000 + .long 0x80000000 + .long 0xec000000 + .long 0x5f000000 + .long 0x60000000 + .long 0x51000000 + .long 0x7f000000 + .long 0xa9000000 + .long 0x19000000 + .long 0xb5000000 + .long 0x4a000000 + .long 0x0d000000 + .long 0x2d000000 + .long 0xe5000000 + .long 0x7a000000 + .long 0x9f000000 + .long 0x93000000 + .long 0xc9000000 + .long 0x9c000000 + .long 0xef000000 + .long 0xa0000000 + .long 0xe0000000 + .long 0x3b000000 + .long 0x4d000000 + .long 0xae000000 + .long 0x2a000000 + .long 0xf5000000 + .long 0xb0000000 + .long 0xc8000000 + .long 0xeb000000 + .long 0xbb000000 + .long 0x3c000000 + .long 0x83000000 + .long 0x53000000 + .long 0x99000000 + .long 0x61000000 + .long 0x17000000 + .long 0x2b000000 + .long 0x04000000 + .long 0x7e000000 + .long 0xba000000 + .long 0x77000000 + .long 0xd6000000 + .long 0x26000000 + .long 0xe1000000 + .long 0x69000000 + .long 0x14000000 + .long 0x63000000 + .long 0x55000000 + .long 0x21000000 + .long 0x0c000000 + .long 0x7d000000 diff --git a/bsd/crypto/aes/i386/EncryptDecrypt.s b/bsd/crypto/aes/i386/EncryptDecrypt.s new file mode 100644 index 000000000..6a6147a11 --- /dev/null +++ b/bsd/crypto/aes/i386/EncryptDecrypt.s @@ -0,0 +1,607 @@ +/* This file defines _aes_encrypt or _aes_decrypt, according to the value of + the Select preprocessor symbol. This file is designed to be included in + another assembly file using the preprocessor #include directive, to benefit + from some assembly-time calculations. + + These two routines are nearly identical. They differ only in the tables + they use, the direction they iterate through the key, and the permutation + performed on part of the state. + + Written by Eric Postpischil, January 2008. 
+*/
+
+/* add AES HW detection and HW-specific program branch cclee 3-12-10 */
+#ifdef KERNEL
+#include <i386/cpu_capabilities.h>
+#else
+#include <System/i386/cpu_capabilities.h>
+#endif
+
+#if Select == 0
+ #define Name _aes_encrypt // Routine name.
+ #define MTable _AESEncryptTable // Main table.
+ #define FTable _AESSubBytesWordTable // Final table.
+ #define P0 S0 // State permutation.
+ #define P1 S1
+ #define P2 S2
+ #define P3 S3
+ #define Increment +16 // ExpandedKey increment.
+#elif Select == 1
+ #define Name _aes_decrypt // Routine name.
+ #define MTable _AESDecryptTable // Main table.
+ #define FTable _AESInvSubBytesWordTable // Final table.
+ #define P0 S2 // State permutation.
+ #define P1 S3
+ #define P2 S0
+ #define P3 S1
+ #define Increment -16 // ExpandedKey increment.
+#elif Select == 2
+ #define Name _aes_encrypt_xmm_no_save // Routine name.
+ #define MTable _AESEncryptTable // Main table.
+ #define FTable _AESSubBytesWordTable // Final table.
+ #define P0 S0 // State permutation.
+ #define P1 S1
+ #define P2 S2
+ #define P3 S3
+ #define Increment +16 // ExpandedKey increment.
+#elif Select == 3
+ #define Name _aes_decrypt_xmm_no_save // Routine name.
+ #define MTable _AESDecryptTable // Main table.
+ #define FTable _AESInvSubBytesWordTable // Final table.
+ #define P0 S2 // State permutation.
+ #define P1 S3
+ #define P2 S0
+ #define P3 S1
+ #define Increment -16 // ExpandedKey increment.
+#endif // Select
+
+
+/* Routine:
+
+ _aes_encrypt (if Select is 0) or _aes_decrypt (if Select is 1);
+ Select values 2 and 3 name the _aes_encrypt_xmm_no_save and
+ _aes_decrypt_xmm_no_save variants.
+
+ Function:
+
+ Perform the AES cipher or its inverse as defined in Federal Information
+ Processing Standards Publication 197 (FIPS-197), November 26, 2001.
+
+ The inverse cipher here is the "Equivalent Inverse Cipher" in FIPS-197.
+
+ Input:
+
+ Constant data:
+
+ The following names must be locally defined so the assembler
+ can calculate certain offsets.
+
+ For encryption:
+
+ static const Word _AESEncryptTable[4][256].
+
+ _AESEncryptTable[i] contains the tables T[i] defined in AES
+ Proposal: Rijndael, version 2, 03/09/99, by Joan Daemen and
+ Vincent Rijmen, section 5.2.1, page 18. These tables
+ combine the SubBytes and MixColumns operations.
+
+ static const Word _AESSubBytesWordTable[4][256].
+
+ _AESSubBytesWordTable[i][j] = SubBytes(j) << 8*i, where
+ SubBytes is defined in FIPS-197. _AESSubBytesWordTable
+ differs from _AESEncryptTable in that it does not include
+ the MixColumn operation. It is used in performing the last
+ round, which differs from the previous rounds in that it
+ does not include the MixColumn operation.
+
+ For decryption:
+
+ static const Word _AESDecryptTable[4][256].
+
+ The analog of _AESEncryptTable for decryption.
+
+ static const Word _AESInvSubBytesWordTable[4][256].
+
+ _AESInvSubBytesWordTable[i][j] = InvSubBytes(j) << 8*i,
+ where InvSubBytes is defined in FIPS-197.
+ _AESInvSubBytesWordTable differs from _AESDecryptTable in
+ that it does not include the InvMixColumn operation. It is
+ used in performing the last round, which differs from the
+ previous rounds in that it does not include the
+ InvMixColumn operation.
+
+ Arguments:
+
+ const Byte *InputText.
+
+ Address of input, 16 bytes. Best if four-byte aligned.
+
+ Byte *OutputText.
+
+ Address of output, 16 bytes. Best if four-byte aligned.
+
+ aes_encrypt_ctx *Context or aes_decrypt_ctx *Context
+
+ aes_encrypt_ctx and aes_decrypt_ctx are identical except the
+ former is used for encryption and the latter for decryption.
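+
+ (A minimal C sketch, for illustration, of the shared layout these
+ offsets describe; the authoritative declaration is in aes.h, and the
+ type and field names here are assumptions:
+
+     typedef struct
+     {   uint32_t ks[60];        // round keys, 16 bytes per round key
+         uint32_t key_length;    // the four-byte "key length" field
+     } aes_ctx_sketch;
+
+ For AES-128/192/256 the schedule holds 11/13/15 round keys, so the
+ "key length" is 16*(Nr+1) - 16, that is 160, 192, or 224; those are
+ exactly the three values the validation code below accepts.)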
+ + Each is a structure containing the expanded key beginning at + offset ContextKey and a four-byte "key length" beginning at + offset ContextKeyLength. The "key length" is the number of + bytes from the start of the first round key to the start of the + last round key. That is 16 less than the number of bytes in + the entire key. + + Output: + + Encrypted or decrypted data is written to *OutputText. + + Return: + + aes_rval // -1 if "key length" is invalid. 0 otherwise. +*/ + + .text + .globl Name +Name: + + // detect AES HW, cclee 3-13-10 +#if Select < 2 // only for aes_encrypt/aes_decrypt +#if defined __x86_64__ + movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities + mov (%rax), %eax // %eax = __cpu_capabilities +#else +#if defined KERNEL + leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities + mov (%eax), %eax // %eax = __cpu_capabilities +#else + mov _COMM_PAGE_CPU_CAPABILITIES, %eax +#endif +#endif + test $(kHasAES), %eax // __cpu_capabilities & kHasAES +#if Select == 0 + jne _aes_encrypt_hw // if AES HW detected, branch to HW specific code +#else + jne _aes_decrypt_hw // if AES HW detected, branch to HW specific code +#endif +#endif // Select + + // Push new stack frame. + push r5 + + /* Save registers and set SaveSize to the number of bytes pushed onto the + stack so far, including the caller's return address. + */ + push r3 + #if defined __i386__ + push r6 + push r7 + #define SaveSize (5*4) + #else + #define SaveSize (3*8) + #endif + + /* Number of bytes used for local variables: + + 4 (i386) or 0 (x86_64) bytes for ExpandedKeyEnd. + + 5 (i386) or 3 (x86_64) 16-byte spaces to save XMM registers. + */ + #define LocalsSize (Arch(4, 0) + Arch(5, 3)*16) + + #if 0 < LocalsSize + // Padding to position stack pointer at a multiple of 16 bytes. + #define Padding (15 & -(SaveSize + LocalsSize)) + sub $Padding + LocalsSize, r4 // Allocate space on stack. + #else + #define Padding 0 + #endif + +#ifdef KERNEL +#if Select < 2 + // Save XMM registers. + movaps %xmm0, 0*16(r4) + movaps %xmm1, 1*16(r4) + movaps %xmm2, 2*16(r4) +#if defined __i386__ + movaps %xmm3, 3*16(r4) + movaps %xmm4, 4*16(r4) +#endif +#endif // Select +#endif // KERNEL + +#if defined __i386__ + + // Number of bytes from caller's stack pointer to ours. + #define StackFrame (SaveSize + Padding + LocalsSize) + + // Define location of argument i (presuming 4-byte arguments). + #define Argument(i) StackFrame+4*(i)(%esp) + + #define ArgInputText Argument(0) + #define ArgOutputText Argument(1) + #define ArgContext Argument(2) + +#elif defined __x86_64__ + + // Arguments. + #define InputText r7 // Used early then overwritten for other use. + #define OutputText r6 // Needed near end of routine. + #define ArgContext r2 + /* The argument passed in r2 overlaps registers we need for other + work, so it must be moved early in the routine. + */ + +#endif + +#define BaseP Arch(r6, r9) // Base pointer for addressing global data. +#define ExpandedKey Arch(t0, r10) // Address of expanded key. + +/* The Work registers defined below are used to hold parts of the AES state + while we dissect or assemble it. They must be assigned to the A, B, C, and + D registers so that we can access the bytes in %al, %ah, and so on. +*/ +#define Work0d r0d +#define Work0l r0l +#define Work0h r0h +#define Work1d r3d +#define Work1l r3l +#define Work1h r3h +#define Work2d r1d +#define Work2l r1l +#define Work2h r1h +#define Work3d r2d +#define Work3l r2l +#define Work3h r2h + +#define t0 r5 +#define t0d r5d // Low 32 bits of t0. 
+#define t0l r5l // Low byte of t0. + +#define t1 r7 + +/* S0, S1, S2, and S3 are where we assemble the new AES state when computing + a regular round. S1, S2, and S3 are assigned to the Work registers, but + S0 needs to go somewhere else because Work0 holds part of the old state. +*/ +#define S0 Arch(t1, r8d) +#define S1 Work1d +#define S2 Work2d +#define S3 Work3d + +/* These XMM registers are used as holding space, because it is faster to + spill to these registers than to the stack. (On x86_64, we do not need + to spill, because there are additional general registers available. + However, using more general registers requires saving them to the stack + and restoring them. I timed it, and no time was saved.) +*/ +#define vS1 %xmm0 +#define vS2 %xmm1 +#define vS3 %xmm2 +#if defined __i386__ + #define vExpandedKey %xmm3 + #define vIncrement %xmm4 +#endif + + // Get address of expanded key. + mov ArgContext, ExpandedKey + #if 0 != ContextKey + add $ContextKey, ExpandedKey + #endif + +/* Store sentinel value of ExpandedKey on the stack on i386, a register on + x86_64. +*/ +#define ExpandedKeyEnd Arch(5*16(r4), r11) + + // Get and check "key length". + movzx ContextKeyLength(ExpandedKey), r0 + cmp $160, r0 + je 2f + cmp $192, r0 + je 2f + cmp $224, r0 + je 2f + mov $-1, r0 // Return error. + jmp 9f +2: + + #if (Select == 0 || Select == 2) + // For encryption, prepare to iterate forward through expanded key. + add ExpandedKey, r0 + mov r0, ExpandedKeyEnd + #else + // For decryption, prepare to iterate backward through expanded key. + mov ExpandedKey, ExpandedKeyEnd + add r0, ExpandedKey + #endif + + // Initialize State from input text. + #if defined __i386__ + mov ArgInputText, BaseP + #define InputText BaseP + #endif + mov 0*4(InputText), Work0d + mov 1*4(InputText), S1 + mov 2*4(InputText), S2 + mov 3*4(InputText), S3 +#undef InputText // Register is reused after this for other purposes. + + // Add round key and save results. + xor 0*4(ExpandedKey), Work0d // S0 is in dissection register. + xor 1*4(ExpandedKey), S1 + movd S1, vS1 // Save S1 to S3 in vector registers. + xor 2*4(ExpandedKey), S2 + movd S2, vS2 + xor 3*4(ExpandedKey), S3 + movd S3, vS3 + + add $Increment, ExpandedKey // Advance to next round key. + + #if defined __i386__ + // Save expanded key address and increment in vector registers. + mov $Increment, t1 + movp ExpandedKey, vExpandedKey + movp t1, vIncrement + #endif + + // Set up relative addressing. + #if defined __i386__ + + // Get address of 0 in BaseP. + call 0f // Push program counter onto stack. + 0: + pop BaseP // Get program counter. + + // Define macros to help address data. +#define LookupM(table, index) MTable-0b+(table)*TableSize(BaseP, index, 4) +#define LookupF(table, index) FTable-0b+(table)*TableSize(BaseP, index, 4) + + #elif defined __x86_64__ + + lea MTable(%rip), BaseP + + // Define macros to help address data. + #define LookupM(table, index) (table)*TableSize(BaseP, index, 4) + #define LookupF(table, index) (table)*TableSize(BaseP, index, 4) + +/* With these definitions of LookupM and LookupF, BaseP must be loaded with + the address of the table at the point where it is used. So we need an + instruction to change BaseP after we are done with MTable and before we + start using FTable. I would prefer to use something like: + + .set FMinusM, FTable - MTable + #define LookupF(table, index) \ + FMinusM+(table)*TableSize(BaseP, index, 4) + + Then BaseP would not need to change. However, this fails due to an + assembler/linker bug, <rdar://problem/5683882>. 
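+
+   For reference, the effective address either macro computes is a plain
+   two-dimensional table lookup; a hedged C picture (TableSize is assumed
+   to be 256 entries of 4 bytes, i.e. 1024):
+
+       // LookupM(t, i) reads the word MTable[t][i]:
+       const uint32_t (*table)[256] = (const uint32_t (*)[256])base;
+       uint32_t value = table[t][i];   // base + t*1024 + i*4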
+*/ + + #endif + + // Get round key. + mov 0*4(ExpandedKey), S0 + mov 1*4(ExpandedKey), S1 + mov 2*4(ExpandedKey), S2 + mov 3*4(ExpandedKey), S3 + +1: + /* Word 0 of the current state must be in Work0 now, and the next round + key must be in S0 to S3. + */ + + // Process previous S0. + movzx Work0l, t0 + xor LookupM(0, t0), S0 + movzx Work0h, t0d + xor LookupM(1, t0), P3 + shr $16, Work0d + movzx Work0l, t0d + xor LookupM(2, t0), S2 + movzx Work0h, t0d + xor LookupM(3, t0), P1 + + // Process previous S1. + movd vS1, Work0d + movzx Work0l, t0d + xor LookupM(0, t0), S1 + movzx Work0h, t0d + xor LookupM(1, t0), P0 + shr $16, Work0d + movzx Work0l, t0d + xor LookupM(2, t0), S3 + movzx Work0h, t0d + xor LookupM(3, t0), P2 + + // Process previous S2. + movd vS2, Work0d + movzx Work0l, t0d + xor LookupM(0, t0), S2 + movzx Work0h, t0d + xor LookupM(1, t0), P1 + shr $16, Work0d + movzx Work0l, t0d + xor LookupM(2, t0), S0 + movzx Work0h, t0d + xor LookupM(3, t0), P3 + + // Process previous S3. + movd vS3, Work0d + movzx Work0l, t0d + xor LookupM(0, t0), S3 + movzx Work0h, t0d + xor LookupM(1, t0), P2 + shr $16, Work0d + movzx Work0l, t0d + xor LookupM(2, t0), S1 + movzx Work0h, t0d + xor LookupM(3, t0), P0 + + #if defined __i386__ + paddd vIncrement, vExpandedKey + movp vExpandedKey, ExpandedKey + #else + add $Increment, ExpandedKey + #endif + + // Save state for next iteration and load next round key. + mov S0, Work0d + mov 0*4(ExpandedKey), S0 + movd S1, vS1 + mov 1*4(ExpandedKey), S1 + movd S2, vS2 + mov 2*4(ExpandedKey), S2 + movd S3, vS3 + mov 3*4(ExpandedKey), S3 + + cmp ExpandedKeyEnd, ExpandedKey + jne 1b + + /* Word 0 of the current state must be in Work0 now, and the next round + key must be in S0 to S3. + */ + + // Work around assembler bug. See comments above about Radar 5683882. + #if defined __x86_64__ + lea FTable(%rip), BaseP + #endif + + // Process previous S0. + movzx Work0l, t0 + xor LookupF(0, t0), S0 + movzx Work0h, t0d + xor LookupF(1, t0), P3 + shr $16, Work0d + movzx Work0l, t0d + xor LookupF(2, t0), S2 + movzx Work0h, t0d + xor LookupF(3, t0), P1 + + // Process previous S1. + movd vS1, Work0d + movzx Work0l, t0d + xor LookupF(0, t0), S1 + movzx Work0h, t0d + xor LookupF(1, t0), P0 + shr $16, Work0d + movzx Work0l, t0d + xor LookupF(2, t0), S3 + movzx Work0h, t0d + xor LookupF(3, t0), P2 + + // Process previous S2. + movd vS2, Work0d + movzx Work0l, t0d + xor LookupF(0, t0), S2 + movzx Work0h, t0d + xor LookupF(1, t0), P1 + shr $16, Work0d + movzx Work0l, t0d + xor LookupF(2, t0), S0 + movzx Work0h, t0d + xor LookupF(3, t0), P3 + + // Process previous S3. + movd vS3, Work0d + movzx Work0l, t0d + xor LookupF(0, t0), S3 + movzx Work0h, t0d + xor LookupF(1, t0), P2 + shr $16, Work0d + movzx Work0l, t0d + xor LookupF(2, t0), S1 + movzx Work0h, t0d + xor LookupF(3, t0), P0 + + #if defined __i386__ // Architecture. + // Get OutputText address. + #define OutputText BaseP + mov ArgOutputText, OutputText + #endif // Architecture. + + // Write output. + mov S0, 0*4(OutputText) + mov S1, 1*4(OutputText) + mov S2, 2*4(OutputText) + mov S3, 3*4(OutputText) + + xor r0, r0 // Return success. + +9: + // Pop stack and restore registers. 
+#ifdef KERNEL
+#if Select < 2
+#if defined __i386__
+ movaps 4*16(r4), %xmm4
+ movaps 3*16(r4), %xmm3
+#endif
+ movaps 2*16(r4), %xmm2
+ movaps 1*16(r4), %xmm1
+ movaps 0*16(r4), %xmm0
+#endif // Select
+#endif // KERNEL
+ #if 0 < LocalsSize
+ add $Padding + LocalsSize, r4
+ #endif
+ #if defined __i386__
+ pop r7
+ pop r6
+ #elif defined __x86_64__
+ #endif
+ pop r3
+ pop r5
+
+ ret
+
+
+#undef ArgExpandedKey
+#undef ArgInputText
+#undef ArgNr
+#undef ArgOutputText
+#undef Argument
+#undef BaseP
+#undef ExpandedKey
+#undef ExpandedKeyEnd
+#undef FTable
+#undef InputText
+#undef LocalsSize
+#undef LookupM
+#undef LookupF
+#undef MTable
+#undef OutputText
+#undef Padding
+#undef SaveSize
+#undef S0
+#undef S1
+#undef S2
+#undef S3
+#undef StackFrame
+#undef Work0d
+#undef Work0h
+#undef Work0l
+#undef Work1d
+#undef Work1h
+#undef Work1l
+#undef Work2d
+#undef Work2h
+#undef Work2l
+#undef Work3d
+#undef Work3h
+#undef Work3l
+#undef t0
+#undef t0d
+#undef t0l
+#undef t1
+#undef vExpandedKey
+#undef vS1
+#undef vS2
+#undef vS3
+
+#undef Name
+#undef MTable
+#undef FTable
+#undef P0
+#undef P1
+#undef P2
+#undef P3
+#undef Increment
diff --git a/bsd/crypto/aes/i386/ExpandKeyForDecryption.s b/bsd/crypto/aes/i386/ExpandKeyForDecryption.s
new file mode 100644
index 000000000..457508a9a
--- /dev/null
+++ b/bsd/crypto/aes/i386/ExpandKeyForDecryption.s
@@ -0,0 +1,1214 @@
+/* This file defines _aes_decrypt_key, _aes_decrypt_key128,
+ _aes_decrypt_key192, and _aes_decrypt_key256. It is designed to be
+ included in another assembly file with the preprocessor #include directive,
+ to benefit from some assembly-time calculations.
+
+ Written by Eric Postpischil, January 2008.
+
+ The comments here do not say much about the algorithm; the code just
+ follows the FIPS-197 specification. I recommend reading the specification
+ before working with this code or examining the C code in the parent
+ directory that illustrates key expansion.
+
+ One complication is that this routine both expands the key and applies
+ InvMixColumn to most of the words in the expanded key. This modifies the
+ key for use with the Equivalent Inverse Cipher.
+
+ During key expansion, there are sequences of four or six words that are
+ produced like this:
+
+ E[i+0] = E[i+0-Nk] ^ f(E[i-1]), where f is some function.
+ E[i+1] = E[i+1-Nk] ^ E[i+0].
+ E[i+2] = E[i+2-Nk] ^ E[i+1].
+ E[i+3] = E[i+3-Nk] ^ E[i+2].
+
+ When Nk is four or eight, the sequence stops there. When it is six, it
+ goes on for two more words. Let I be the InvMixColumn function. For the
+ Equivalent Inverse Cipher, we want to store I(E[i+0]), I(E[i+1]),
+ I(E[i+2]), I(E[i+3]) (and two more when Nk is six). However, we do not
+ need to calculate I four times. In AES' finite field, I is a linear
+ combination of the four bytes of its input. The ^ operation on the bits
+ that represent field elements is an addition in the Galois field. So
+ I(a ^ b) = I(a) ^ I(b). Then we have:
+
+ I(E[i+0]) = I(E[i+0-Nk] ^ f(E[i-1])) = I(E[i+0-Nk]) ^ I(f(E[i-1])).
+ I(E[i+1]) = I(E[i+1-Nk]) ^ I(E[i+0]).
+ I(E[i+2]) = I(E[i+2-Nk]) ^ I(E[i+1]).
+ I(E[i+3]) = I(E[i+3-Nk]) ^ I(E[i+2]).
+
+ To compute this, we compute I(f(E[i-1])) and XOR it with the previously
+ stored I(E[i+0-Nk]) to get I(E[i+0]). Then we XOR that with the previously
+ stored I(E[i+1-Nk]) to get I(E[i+1]), and so on.
+
+ Note that to compute I(f(E[i-1])), we need to have E[i-1]. So we have to
+ compute the pre-InvMixColumn words of the expanded key; it is not
+ sufficient to have the post-InvMixColumn words.
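+
+   Rendered as C, the update described above is (a fragment mirroring the
+   equations, not the shipped code; E[] is the raw schedule, D[] holds the
+   stored post-InvMixColumn words, and Imc() is the table-driven
+   InvMixColumn; the D and Imc names are assumptions):
+
+       E[i+0] = E[i+0-Nk] ^ f(E[i-1]);         // raw word, kept for f.
+       D[i+0] = D[i+0-Nk] ^ Imc(f(E[i-1]));    // equals Imc(E[i+0]).
+       E[i+1] = E[i+1-Nk] ^ E[i+0];
+       D[i+1] = D[i+1-Nk] ^ D[i+0];            // no further Imc needed.
+
+   and likewise for words i+2 and i+3 (and i+4 and i+5 when Nk is six).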
+*/
+
+
+/* Routine:
+
+ _aes_decrypt_key.
+
+ _aes_decrypt_key128, _aes_decrypt_key192, and _aes_decrypt_key256.
+
+ Function:
+
+ Expand the user's cipher key into the key schedule, as defined in
+ Federal Information Processing Standards Publication 197 (FIPS-197),
+ November 26, 2001.
+
+ For decryption, the key is modified as shown in Figure 15 in FIPS-197,
+ to support the Equivalent Inverse Cipher.
+
+ Input:
+
+ Constant data:
+
+ The following names must be locally defined so the assembler
+ can calculate certain offsets.
+
+ static const Word _AESSubBytesWordTable[4][256].
+
+ _AESSubBytesWordTable[i][j] = SubBytes(j) << 8*i, where
+ SubBytes is defined in FIPS-197. _AESSubBytesWordTable
+ differs from _AESEncryptTable in that it does not include
+ the MixColumn operation. It is used in performing the last
+ round, which differs from the previous rounds in that it
+ does not include the MixColumn operation.
+
+ static const Word _AESInvMixColumnTable[4][256].
+
+ _AESInvMixColumnTable[i][j] contains the contribution of byte
+ j to element i of the InvMixColumn operation.
+
+ The four bytes of the word _AESInvMixColumnTable[0][j] are:
+
+ {0xe}*{j}, {0x9}*{j}, {0xd}*{j}, {0xb}*{j},
+
+ listed in increasing address order, where multiplication is
+ performed in the Galois field. {j} designates the element of
+ the Galois field represented by j. _AESInvMixColumnTable[i][j]
+ has the same bytes, rotated right in the order shown above.
+
+ static const Byte _AESRcon[].
+
+ Round constants, beginning with AESRcon[1] for the first round
+ (AESRcon[0] is padding.)
+
+ Arguments:
+
+ const uint8_t *Key
+
+ Address of user's cipher key.
+
+ int Length
+
+ Number of bytes (16, 24, or 32) or bits (128, 192, or 256) in
+ user's cipher key.
+
+ This argument is used with _aes_decrypt_key. It is not
+ present for the other routines. In those routines, Context
+ is the second argument.
+
+ aes_decrypt_ctx *Context
+
+ Structure to contain the expanded key beginning at offset
+ ContextKey and a four-byte "key length" beginning at offset
+ ContextKeyLength. The "key length" is the number of bytes from
+ the start of the first round key to the start of the last round
+ key. That is 16 less than the number of bytes in the entire
+ key.
+
+ Output:
+
+ The expanded key and the "key length" are written to *Context.
+
+ Return:
+
+ aes_rval // -1 if "key length" is invalid. 0 otherwise.
+*/
+/* add AES HW detection and program branch if AES HW is detected cclee 3-12-10 */
+
+#ifdef KERNEL
+#include <i386/cpu_capabilities.h>
+#else
+#include <System/i386/cpu_capabilities.h>
+#endif
+
+#define dr r0d // Dissection register.
+#define drl r0l // Low 8 bits of dissection register.
+#define drh r0h // Second-lowest 8 bits of dissection register.
+
+#define t0 r1
+#define t0d r1d // Low 32 bits of t0.
+
+#define STable r2 // Address of SubBytes table. Overlaps Nk.
+#define ITable r3 // Address of InvMixColumn table.
+#define offset Arch(r5, r11) // Address offset and loop sentinel.
+
+#define R r7 // Address of round constant.
+#define K r7 // User key pointer.
+ // R and K overlap.
+
+#define E r6 // Expanded key pointer.
+
+#define ve0 %xmm0
+#define ve1 %xmm1
+#define ve2 %xmm2
+#define ve3 %xmm3
+#define ve4 %xmm4
+#define ve5 %xmm5
+#define vt1 %xmm6
+#define vt0 %xmm7
+
+#define LookupS(table, index) (table)*TableSize(STable, index, 4)
+#define LookupI(table, index) (table)*TableSize(ITable, index, 4)
+
+
+/* InvMixColumn puts InvMixColumn(dr) into vt0. This is a non-standard
+ subroutine.
It does not conform to the ABI. It is an integral part of + _ExpandKeyForDecryption and shares register use with it. +*/ +InvMixColumn: + movzx drl, t0 + movd LookupI(0, t0), vt0 // Look up byte 0 in table 0. + movzx drh, t0d + movd LookupI(1, t0), vt1 // Look up byte 1 in table 1. + pxor vt1, vt0 + shr $16, dr + movzx drl, t0d + movd LookupI(2, t0), vt1 // Look up byte 2 in table 2. + pxor vt1, vt0 + movzx drh, t0d + movd LookupI(3, t0), vt1 // Look up byte 3 in table 3. + pxor vt1, vt0 + ret + + + // SubWordRotWord adds (XORs) SubWord(RotWord(dr)) to vt0. + .macro SubWordRotWord + movzx drl, t0 + movd LookupS(3, t0), vt1 // Look up byte 0 in table 3. + pxor vt1, vt0 + movzx drh, t0d + movd LookupS(0, t0), vt1 // Look up byte 1 in table 0. + pxor vt1, vt0 + shr $$16, dr + movzx drl, t0d + movd LookupS(1, t0), vt1 // Look up byte 2 in table 1. + pxor vt1, vt0 + movzx drh, t0d + movd LookupS(2, t0), vt1 // Look up byte 3 in table 2. + pxor vt1, vt0 + .endmacro + + + // SubWord puts SubWord(dr) into vt0. + .macro SubWord + movzx drl, t0 + movd LookupS(0, t0), vt0 // Look up byte 0 in table 0. + movzx drh, t0d + movd LookupS(1, t0), vt1 // Look up byte 1 in table 1. + pxor vt1,vt0 + shr $$16, dr + movzx drl, t0d + movd LookupS(2, t0), vt1 // Look up byte 2 in table 2. + pxor vt1,vt0 + movzx drh, t0d + movd LookupS(3, t0), vt1 // Look up byte 3 in table 3. + pxor vt1,vt0 + .endmacro + + .text + .globl _aes_decrypt_key +// .private_extern _aes_decrypt_key +_aes_decrypt_key: + + // detect AES HW, cclee 3-13-10 +#if defined __x86_64__ + movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities + mov (%rax), %eax // %eax = __cpu_capabilities +#else +#if defined KERNEL + leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities + mov (%eax), %eax // %eax = __cpu_capabilities +#else + mov _COMM_PAGE_CPU_CAPABILITIES, %eax +#endif + +#endif + test $(kHasAES), %eax // __cpu_capabilities & kHasAES + jne _aes_decrypt_key_hw // if AES HW detected, branch to _aes_decrypt_key_hw + /* Save registers and set SaveSize to the number of bytes pushed onto the + stack so far, including the caller's return address. + */ + push r3 + #if defined __i386__ + push r5 + push r6 + push r7 + #define SaveSize (5*4) + #else + #define SaveSize (2*8) + #endif + + /* Number of bytes used for local variables: + + 8 16-byte spaces to save XMM registers. + + 8 four-byte spaces for work. + */ + #define LocalsSize (8*16 + 8*4) + + // Define stack offset to storage space for local data. + #define Local (8*16) + + #if 0 < LocalsSize + // Padding to position stack pointer at a multiple of 16 bytes. + #define Padding (15 & -(SaveSize + LocalsSize)) + sub $Padding + LocalsSize, r4 // Allocate space on stack. + #else + #define Padding 0 + #endif + + /* StackFrame is the number of bytes in our stack frame, from caller's + stack pointer to ours (so it includes the return address). + */ + #define StackFrame (SaveSize + Padding + LocalsSize) + + // Save xmm registers. + movaps %xmm0, 0*16(r4) + movaps %xmm1, 1*16(r4) + movaps %xmm2, 2*16(r4) + movaps %xmm3, 3*16(r4) + movaps %xmm4, 4*16(r4) + movaps %xmm5, 5*16(r4) + movaps %xmm6, 6*16(r4) + movaps %xmm7, 7*16(r4) + +#if defined __i386__ + + // Define location of argument i. + #define Argument(i) StackFrame+4*(i)(r4) + + #define Nk t0d + + // Load arguments. + mov Argument(2), E + mov Argument(1), Nk + mov Argument(0), K + +#elif defined __x86_64__ + + #define Nk r9d // Number of words in key. + mov r6d, Nk // Move Nk argument out of way. 
+ mov r2, E // Move E argument to common register. + +#endif + + // Dispatch on key length. + cmp $128, Nk + jge 2f + shl $3, Nk // Convert from bytes to bits. + cmp $128, Nk +2: + je DKeyHas4Words + cmp $192, Nk + je DKeyHas6Words + cmp $256, Nk + je DKeyHas8Words + mov $-1, r0 // Return error. + jmp 9f + + + .globl _aes_decrypt_key128 +// .private_extern _aes_decrypt_key128 +_aes_decrypt_key128: + + /* Save registers and set SaveSize to the number of bytes pushed onto the + stack so far, including the caller's return address. + */ + push r3 + #if defined __i386__ + push r5 + push r6 + push r7 + #define SaveSize (5*4) + #else + #define SaveSize (2*8) + #endif + + /* Number of bytes used for local variables: + + 8 16-byte spaces to save XMM registers. + + 8 four-byte spaces for work. + */ + #define LocalsSize (8*16 + 8*4) + + // Define stack offset to storage space for local data. + #define Local (8*16) + + #if 0 < LocalsSize + // Padding to position stack pointer at a multiple of 16 bytes. + #define Padding (15 & -(SaveSize + LocalsSize)) + sub $Padding + LocalsSize, r4 // Allocate space on stack. + #else + #define Padding 0 + #endif + + /* StackFrame is the number of bytes in our stack frame, from caller's + stack pointer to ours (so it includes the return address). + */ + #define StackFrame (SaveSize + Padding + LocalsSize) + + // Save xmm registers. + movaps %xmm0, 0*16(r4) + movaps %xmm1, 1*16(r4) + movaps %xmm2, 2*16(r4) + movaps %xmm3, 3*16(r4) + movaps %xmm4, 4*16(r4) + movaps %xmm5, 5*16(r4) + movaps %xmm6, 6*16(r4) + movaps %xmm7, 7*16(r4) + +#if defined __i386__ + + // Load arguments. + #define Argument(i) StackFrame+4*(i)(r4) + mov Argument(1), E + mov Argument(0), K + +#endif + +// Merge point for _aes_decrypt_key and _aes_decrypt_key128. +DKeyHas4Words: + + // First words of expanded key are copied from user key. + movd 0*4(K), ve0 + movd 1*4(K), ve1 + movd 2*4(K), ve2 + movd 3*4(K), ve3 + + movl $10*16, ContextKeyLength(E) // Set "key length." + + #if 0 != ContextKey + add $ContextKey, E + #endif + + // K cannot be used after we write to R, since they use the same register. + + #if defined __i386__ + + lea _AESRcon, R + lea _AESInvMixColumnTable, ITable + lea _AESSubBytesWordTable, STable + + #elif defined __x86_64__ + + lea _AESRcon(%rip), R + lea _AESInvMixColumnTable(%rip), ITable + lea _AESSubBytesWordTable(%rip), STable + + #endif + + /* With a four-word key, there are ten rounds (eleven 16-byte key blocks), + nine of which have InvMixColumn applied. + */ + mov $-9*4*4, offset + sub offset, E + + // Store initial words of expanded key, which are copies of user's key. + movd ve0, 0*4(E, offset) + movd ve1, 1*4(E, offset) + movd ve2, 2*4(E, offset) + movd ve3, 3*4(E, offset) + +/* Here is the first iteration of the key expansion. It is separate from the + main loop below because we need to apply InvMixColumn to each of the + outputs, in ve0 through ve3. In the main loop, the technique described at + the top of this file is used to compute the proper outputs while using + InvMixColumn only once. +*/ + add $1, R // Advance pointer. + movd ve3, dr // Put previous word into work register. + movzx (R), t0d // Get round constant. + movd t0d, vt0 + + SubWordRotWord + pxor vt0, ve0 + + // Chain to successive words. + pxor ve0, ve1 + pxor ve1, ve2 + pxor ve2, ve3 + + add $4*4, offset + + /* Apply InvMixColumn to each word. The transformed values are stored in + the expanded key. The original values are retained in registers for + further computation. 
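+
+	   In C-like pseudocode (a hedged sketch, not build source; D[] and
+	   E[] are hypothetical names for the stored InvMixColumn'd words and
+	   the untransformed words kept in ve0 through ve3), this first
+	   iteration amounts to:
+
+		for (int j = 0; j < 4; ++j)
+			D[4+j] = InvMixColumn(E[4+j]);
+
+	   with E[4..7] staying in registers so the next iteration can derive
+	   E[8..11] from them.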
+ */ + movd ve0, dr + call InvMixColumn + movd vt0, 0*4(E, offset) + + movd ve1, dr + call InvMixColumn + movd vt0, 1*4(E, offset) + + movd ve2, dr + call InvMixColumn + movd vt0, 2*4(E, offset) + + movd ve3, dr + call InvMixColumn + movd vt0, 3*4(E, offset) + +// Here is the main loop. +1: + add $1, R // Advance pointer. + movd ve3, dr // Put previous word into work register. + movzx (R), t0d // Get round constant. + movd t0d, vt0 + + SubWordRotWord + pxor vt0, ve0 + + // Chain to successive words. + pxor ve0, ve1 + pxor ve1, ve2 + pxor ve2, ve3 + /* Dr. Brian Gladman uses a technique with a single XOR here instead + of the previous four. There is some periodic behavior in the key + expansion, and Gladman maintains E[4*i+3] for the latest four + values of i. XORing the value in vt0 with one of these yields its + replacement. However, using this technique requires additional + instructions before the loop (to initialize the values) and after + it (to extract the final values to be stored) and either some way + to rotate or index four values in the loop or a four-fold unrolling + of the loop to provide the indexing. Experiment suggests the + former is not worthwhile. Unrolling the loop might give a small + gain, at the cost of increased use of instruction cache, increased + instructions loads the first time the routine is executed, and + increased code complexity, so I decided against it. + */ + + // Apply InvMixColumn to the difference. + movd vt0, dr + call InvMixColumn + + add $4*4, offset + + // Chain the transformed difference to previously transformed outputs. + movd (0-4)*4(E, offset), vt1 + pxor vt1, vt0 + movd vt0, 0*4(E, offset) + + movd (1-4)*4(E, offset), vt1 + pxor vt1, vt0 + movd vt0, 1*4(E, offset) + + movd (2-4)*4(E, offset), vt1 + pxor vt1, vt0 + movd vt0, 2*4(E, offset) + + movd (3-4)*4(E, offset), vt1 + pxor vt1, vt0 + movd vt0, 3*4(E, offset) + + jl 1b + +// Here is the final iteration, which does not perform InvMixColumn. + + movd ve3, dr // Put previous word into work register. + movzx 1(R), t0d // Get round constant. + movd t0d, vt0 + + SubWordRotWord + pxor vt0, ve0 + + // Chain to successive words. + movd ve0, 4*4(E, offset) + pxor ve0, ve1 + movd ve1, 5*4(E, offset) + pxor ve1, ve2 + movd ve2, 6*4(E, offset) + pxor ve2, ve3 + movd ve3, 7*4(E, offset) + + xor r0, r0 // Return success. + +9: + // Pop stack and restore registers. + movaps 7*16(r4), %xmm7 + movaps 6*16(r4), %xmm6 + movaps 5*16(r4), %xmm5 + movaps 4*16(r4), %xmm4 + movaps 3*16(r4), %xmm3 + movaps 2*16(r4), %xmm2 + movaps 1*16(r4), %xmm1 + movaps 0*16(r4), %xmm0 + #if 0 < LocalsSize + add $Padding + LocalsSize, r4 + #endif + #if defined __i386__ + pop r7 + pop r6 + pop r5 + #endif + pop r3 + + ret + + + .globl _aes_decrypt_key192 +// .private_extern _aes_decrypt_key192 +_aes_decrypt_key192: + + /* Save registers and set SaveSize to the number of bytes pushed onto the + stack so far, including the caller's return address. + */ + push r3 + #if defined __i386__ + push r5 + push r6 + push r7 + #define SaveSize (5*4) + #else + #define SaveSize (2*8) + #endif + + /* Number of bytes used for local variables: + + 8 16-byte spaces to save XMM registers. + + 8 four-byte spaces for work. + */ + #define LocalsSize (8*16 + 8*4) + + // Define stack offset to storage space for local data. + #define Local (8*16) + + #if 0 < LocalsSize + // Padding to position stack pointer at a multiple of 16 bytes. + #define Padding (15 & -(SaveSize + LocalsSize)) + sub $Padding + LocalsSize, r4 // Allocate space on stack. 
+ #else + #define Padding 0 + #endif + + /* StackFrame is the number of bytes in our stack frame, from caller's + stack pointer to ours (so it includes the return address). + */ + #define StackFrame (SaveSize + Padding + LocalsSize) + + // Save xmm registers. + movaps %xmm0, 0*16(r4) + movaps %xmm1, 1*16(r4) + movaps %xmm2, 2*16(r4) + movaps %xmm3, 3*16(r4) + movaps %xmm4, 4*16(r4) + movaps %xmm5, 5*16(r4) + movaps %xmm6, 6*16(r4) + movaps %xmm7, 7*16(r4) + +#if defined __i386__ + + // Load arguments. + #define Argument(i) StackFrame+4*(i)(r4) + mov Argument(1), E + mov Argument(0), K + +#endif + +// Merge point for _aes_decrypt_key and _aes_decrypt_key192. +DKeyHas6Words: + + // First words of expanded key are copied from user key. + movd 0*4(K), ve0 + movd 1*4(K), ve1 + movd 2*4(K), ve2 + movd 3*4(K), ve3 + + movl $12*16, ContextKeyLength(E) // Set "key length." + + #if 0 != ContextKey + add $ContextKey, E + #endif + + movd 4*4(K), ve4 + movd 5*4(K), ve5 + + // K cannot be used after we write to R, since they use the same register. + + #if defined __i386__ + + lea _AESRcon, R + lea _AESInvMixColumnTable, ITable + lea _AESSubBytesWordTable, STable + + #elif defined __x86_64__ + + lea _AESRcon(%rip), R + lea _AESInvMixColumnTable(%rip), ITable + lea _AESSubBytesWordTable(%rip), STable + + #endif + + /* With a six-word key, there are twelve rounds (thirteen 16-byte key + blocks), eleven of which have InvMixColumn applied. The key expansion + proceeds in iterations of six four-byte words, so the termination + condition is a bit complicated. We set offset to the negative of 10 + four four-byte words, and the loop branch does another iteration if + offset is less than or equal to zero, meaning the number of iterations + performed so far is less than or equal to 10. Thus, after ten + iterations, it branches again. After the eleventh iteration, it + stops. Code after the end of the loop computes the twelfth key block, + which does not have InvMixColumn applied. + */ + mov $-10*4*4, offset + sub offset, E + + // Store initial words of expanded key, which are copies of user's key. + movd ve0, 0*4(E, offset) + movd ve1, 1*4(E, offset) + movd ve2, 2*4(E, offset) + movd ve3, 3*4(E, offset) + + /* The first four words are stored untransformed. After that, words in + the expanded key are transformed by InvMixColumn. + */ + movd ve4, dr + call InvMixColumn + movd vt0, 4*4(E, offset) + + movd ve5, dr + call InvMixColumn + movd vt0, 5*4(E, offset) + +/* Here is the first iteration of the key expansion. It is separate from the + main loop below because we need to apply InvMixColumn to each of the + outputs, in ve0 through ve5. In the main loop, the technique described at + the top of this file is used to compute the proper outputs while using + InvMixColumn only once. +*/ + add $1, R // Advance pointer. + movd ve5, dr // Put previous word into work register. + movzx (R), t0d // Get round constant. + movd t0d, vt0 + + SubWordRotWord + pxor vt0, ve0 + + // Chain to successive words. + pxor ve0, ve1 + pxor ve1, ve2 + pxor ve2, ve3 + pxor ve3, ve4 + pxor ve4, ve5 + + add $6*4, offset + + /* Apply InvMixColumn to each word. The transformed values are stored in + the expanded key. The original values are retained in registers for + further computation. 
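+
+	   The offset register also carries the loop sentinel described in the
+	   termination-condition comment above. A hedged C model of that
+	   arithmetic (hypothetical names; offsets are in bytes):
+
+		offset = -10*4*4;	// Negative byte offset; also the sentinel.
+		E -= offset;		// Bias E so E + offset addresses the output.
+		do
+		{
+			// ...expand and store six words at E + offset...
+			offset += 6*4;
+		} while (offset <= 0);	// The "jle 1b" in the loop below.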
+ */ + movd ve0, dr + call InvMixColumn + movd vt0, 0*4(E, offset) + + movd ve1, dr + call InvMixColumn + movd vt0, 1*4(E, offset) + + movd ve2, dr + call InvMixColumn + movd vt0, 2*4(E, offset) + + movd ve3, dr + call InvMixColumn + movd vt0, 3*4(E, offset) + + movd (4-6)*4(E, offset), vt1 + pxor vt1, vt0 + movd vt0, 4*4(E, offset) + + movd (5-6)*4(E, offset), vt1 + pxor vt1, vt0 + movd vt0, 5*4(E, offset) + +// Here is the main loop. +1: + add $1, R // Advance pointer. + movd ve5, dr // Put previous word into work register. + movzx (R), t0d // Get round constant. + movd t0d, vt0 + + SubWordRotWord + pxor vt0, ve0 + + // Chain to successive words. + pxor ve0, ve1 + pxor ve1, ve2 + pxor ve2, ve3 + pxor ve3, ve4 + pxor ve4, ve5 + + // Apply InvMixColumn to the difference. + movd vt0, dr + call InvMixColumn + + add $6*4, offset + + // Chain the transformed difference to previously transformed outputs. + movd (0-6)*4(E, offset), vt1 + pxor vt1, vt0 + movd vt0, 0*4(E, offset) + + movd (1-6)*4(E, offset), vt1 + pxor vt1, vt0 + movd vt0, 1*4(E, offset) + + movd (2-6)*4(E, offset), vt1 + pxor vt1, vt0 + movd vt0, 2*4(E, offset) + + movd (3-6)*4(E, offset), vt1 + pxor vt1, vt0 + movd vt0, 3*4(E, offset) + + movd (4-6)*4(E, offset), vt1 + pxor vt1, vt0 + movd vt0, 4*4(E, offset) + + movd (5-6)*4(E, offset), vt1 + pxor vt1, vt0 + movd vt0, 5*4(E, offset) + + jle 1b + +// Here is the final iteration, which does not perform InvMixColumn. + + movd ve5, dr // Put previous word into work register. + movzx 1(R), t0d // Get round constant. + movd t0d, vt0 + + SubWordRotWord + pxor vt0, ve0 + + // Chain to successive words. + movd ve0, 6*4(E, offset) + pxor ve0, ve1 + movd ve1, 7*4(E, offset) + pxor ve1, ve2 + movd ve2, 8*4(E, offset) + pxor ve2, ve3 + movd ve3, 9*4(E, offset) + + xor r0, r0 // Return success. + + // Pop stack and restore registers. + movaps 7*16(r4), %xmm7 + movaps 6*16(r4), %xmm6 + movaps 5*16(r4), %xmm5 + movaps 4*16(r4), %xmm4 + movaps 3*16(r4), %xmm3 + movaps 2*16(r4), %xmm2 + movaps 1*16(r4), %xmm1 + movaps 0*16(r4), %xmm0 + #if 0 < LocalsSize + add $Padding + LocalsSize, r4 + #endif + #if defined __i386__ + pop r7 + pop r6 + pop r5 + #endif + pop r3 + + ret + + + .globl _aes_decrypt_key256 +// .private_extern _aes_decrypt_key256 +_aes_decrypt_key256: + + /* Save registers and set SaveSize to the number of bytes pushed onto the + stack so far, including the caller's return address. + */ + push r3 + #if defined __i386__ + push r5 + push r6 + push r7 + #define SaveSize (5*4) + #else + #define SaveSize (2*8) + #endif + + /* Number of bytes used for local variables: + + 8 16-byte spaces to save XMM registers. + + 8 four-byte spaces for work. + */ + #define LocalsSize (8*16 + 8*4) + + // Define stack offset to storage space for local data. + #define Local (8*16) + + #if 0 < LocalsSize + // Padding to position stack pointer at a multiple of 16 bytes. + #define Padding (15 & -(SaveSize + LocalsSize)) + sub $Padding + LocalsSize, r4 // Allocate space on stack. + #else + #define Padding 0 + #endif + + /* StackFrame is the number of bytes in our stack frame, from caller's + stack pointer to ours (so it includes the return address). + */ + #define StackFrame (SaveSize + Padding + LocalsSize) + + // Save xmm registers. + movaps %xmm0, 0*16(r4) + movaps %xmm1, 1*16(r4) + movaps %xmm2, 2*16(r4) + movaps %xmm3, 3*16(r4) + movaps %xmm4, 4*16(r4) + movaps %xmm5, 5*16(r4) + movaps %xmm6, 6*16(r4) + movaps %xmm7, 7*16(r4) + +#if defined __i386__ + + // Load arguments. 
+ #define Argument(i) StackFrame+4*(i)(r4) + mov Argument(1), E + mov Argument(0), K + +#endif + +// Merge point for _aes_decrypt_key and _aes_decrypt_key256. +DKeyHas8Words: + + // First words of expanded key are copied from user key. + movd 0*4(K), ve0 + movd 1*4(K), ve1 + movd 2*4(K), ve2 + movd 3*4(K), ve3 + + movl $14*16, ContextKeyLength(E) // Set "key length." + + #if 0 != ContextKey + add $ContextKey, E + #endif + + // Store initial words of expanded key, which are copies of user's key. + movd ve0, 0*4(E) + movd ve1, 1*4(E) + movd ve2, 2*4(E) + movd ve3, 3*4(E) + movd 4*4(K), ve0 + movd 5*4(K), ve1 + movd 6*4(K), ve2 + movd 7*4(K), ve3 + + // K cannot be used after we write to R, since they use the same register. + + #if defined __i386__ + + lea _AESRcon, R + lea _AESInvMixColumnTable, ITable + lea _AESSubBytesWordTable, STable + + #elif defined __x86_64__ + + lea _AESRcon(%rip), R + lea _AESInvMixColumnTable(%rip), ITable + lea _AESSubBytesWordTable(%rip), STable + + #endif + + /* With an eight-word key, there are fourteen rounds (fifteen 16-byte key + blocks), thirteen of which have InvMixColumn applied. + */ + mov $-12*4*4, offset + sub offset, E + + // Save untransformed values in stack area. + movd ve0, 4*4+Local(r4) + movd ve1, 5*4+Local(r4) + movd ve2, 6*4+Local(r4) + movd ve3, 7*4+Local(r4) + + /* Apply InvMixColumn to words 4 through 7. The transformed values are + stored in the expanded key. The original values are saved in the stack + area for further computation. + */ + movd ve0, dr + call InvMixColumn + movd vt0, 4*4(E, offset) + + movd ve1, dr + call InvMixColumn + movd vt0, 5*4(E, offset) + + movd ve2, dr + call InvMixColumn + movd vt0, 6*4(E, offset) + + movd ve3, dr + call InvMixColumn + movd vt0, 7*4(E, offset) + +/* Here is the first iteration of the key expansion. It is separate from the + main loop below because we need to apply InvMixColumn to each of the + outputs, in ve0 through ve3. In the main loop, the technique described at + the top of this file is used to compute the proper outputs while using + InvMixColumn only once. +*/ + add $1, R // Advance pointer. + movd ve3, dr // Put previous word into work register. + movzx (R), t0d // Get round constant. + movd t0d, vt0 + + SubWordRotWord + + add $8*4, offset + + movd (0-8)*4(E, offset), ve0 // Get old word. + pxor vt0, ve0 + movd ve0, 0*4+Local(r4) // Save on stack. + movd ve0, dr + call InvMixColumn + movd vt0, 0*4(E, offset) // Write to expanded key. + + /* Chain to successive words and apply InvMixColumn to each word. The + transformed values are stored in the expanded key. The original + values are retained in local data for further computation. + */ + movd (1-8)*4(E, offset), ve1 // Get old word. + pxor ve0, ve1 // Chain. + movd ve1, 1*4+Local(r4) // Save on stack. + movd ve1, dr + call InvMixColumn + movd vt0, 1*4(E, offset) // Write to expanded key. + + movd (2-8)*4(E, offset), ve2 // Get old word. + pxor ve1, ve2 // Chain. + movd ve2, 2*4+Local(r4) // Save on stack. + movd ve2, dr + call InvMixColumn + movd vt0, 2*4(E, offset) // Write to expanded key. + + movd (3-8)*4(E, offset), ve3 // Get old word. + pxor ve2, ve3 // Chain. + movd ve3, 3*4+Local(r4) // Save on stack. + movd ve3, dr + call InvMixColumn + movd vt0, 3*4(E, offset) // Write to expanded key. + + movd ve3, dr // Put previous word into work register. + SubWord + + movd 4*4+Local(r4), ve0 // Get old word. + pxor vt0, ve0 // Chain. + movd ve0, 4*4+Local(r4) // Save on stack. + + movd 5*4+Local(r4), ve1 // Get old word. 
+ pxor ve0, ve1 // Chain. + movd ve1, 5*4+Local(r4) // Save on stack. + + movd 6*4+Local(r4), ve2 // Get old word. + pxor ve1, ve2 // Chain. + movd ve2, 6*4+Local(r4) // Save on stack. + + movd 7*4+Local(r4), ve3 // Get old word. + pxor ve2, ve3 // Chain. + movd ve3, 7*4+Local(r4) // Save on stack. + + movd vt0, dr // Move change to work register. + call InvMixColumn + + movd (4-8)*4(E, offset), vt1 // Get old word. + pxor vt1, vt0 // Chain. + movd vt0, 4*4(E, offset) // Write new word to expanded key. + + movd (5-8)*4(E, offset), vt1 // Get old word. + pxor vt1, vt0 // Chain. + movd vt0, 5*4(E, offset) // Write new word to expanded key. + + movd (6-8)*4(E, offset), vt1 // Get old word. + pxor vt1, vt0 // Chain. + movd vt0, 6*4(E, offset) // Write new word to expanded key. + + movd (7-8)*4(E, offset), vt1 // Get old word. + pxor vt1, vt0 // Chain. + movd vt0, 7*4(E, offset) // Write new word to expanded key. + +// Here is the main loop. +1: + add $1, R // Advance pointer. + movd ve3, dr // Put previous word into work register. + movzx (R), t0d // Get round constant. + movd t0d, vt0 + + SubWordRotWord + + movd 0*4+Local(r4), ve0 // Get old word. + pxor vt0, ve0 + movd ve0, 0*4+Local(r4) // Save on stack. + + // Chain to successive words. + movd 1*4+Local(r4), ve1 // Get old word. + pxor ve0, ve1 // Chain. + movd ve1, 1*4+Local(r4) // Save on stack. + + movd 2*4+Local(r4), ve2 // Get old word. + pxor ve1, ve2 // Chain. + movd ve2, 2*4+Local(r4) // Save on stack. + + movd 3*4+Local(r4), ve3 // Get old word. + pxor ve2, ve3 // Chain. + movd ve3, 3*4+Local(r4) // Save on stack. + + movd vt0, dr // Move change to work register. + call InvMixColumn + + movd 0*4(E, offset), vt1 // Get old word. + pxor vt1, vt0 // Chain. + movd vt0, (0+8)*4(E, offset) // Write new word to expanded key. + + movd 1*4(E, offset), vt1 // Get old word. + pxor vt1, vt0 // Chain. + movd vt0, (1+8)*4(E, offset) // Write new word to expanded key. + + movd 2*4(E, offset), vt1 // Get old word. + pxor vt1, vt0 // Chain. + movd vt0, (2+8)*4(E, offset) // Write new word to expanded key. + + movd 3*4(E, offset), vt1 // Get old word. + pxor vt1, vt0 // Chain. + movd vt0, (3+8)*4(E, offset) // Write new word to expanded key. + + movd ve3, dr // Put previous word into work register. + SubWord + + movd 4*4+Local(r4), ve0 // Get old word. + pxor vt0, ve0 // Chain. + movd ve0, 4*4+Local(r4) // Save on stack. + + movd 5*4+Local(r4), ve1 // Get old word. + pxor ve0, ve1 // Chain. + movd ve1, 5*4+Local(r4) // Save on stack. + + movd 6*4+Local(r4), ve2 // Get old word. + pxor ve1, ve2 // Chain. + movd ve2, 6*4+Local(r4) // Save on stack. + + movd 7*4+Local(r4), ve3 // Get old word. + pxor ve2, ve3 // Chain. + movd ve3, 7*4+Local(r4) // Save on stack. + + movd vt0, dr // Move change to work register. + call InvMixColumn + + movd 4*4(E, offset), vt1 // Get old word. + pxor vt1, vt0 // Chain. + movd vt0, (4+8)*4(E, offset) // Write new word to expanded key. + + movd 5*4(E, offset), vt1 // Get old word. + pxor vt1, vt0 // Chain. + movd vt0, (5+8)*4(E, offset) // Write new word to expanded key. + + movd 6*4(E, offset), vt1 // Get old word. + pxor vt1, vt0 // Chain. + movd vt0, (6+8)*4(E, offset) // Write new word to expanded key. + + movd 7*4(E, offset), vt1 // Get old word. + pxor vt1, vt0 // Chain. + movd vt0, (7+8)*4(E, offset) // Write new word to expanded key. + + add $8*4, offset + + jl 1b + + movd ve3, dr // Put previous word into work register. + movzx 1(R), t0d // Get round constant. 
+ movd t0d, vt0
+
+ SubWordRotWord
+
+ movd 0*4+Local(r4), ve0 // Get old word.
+ pxor vt0, ve0 // Chain.
+ movd ve0, (0+8)*4(E, offset)
+
+ // Chain to successive words.
+ movd 1*4+Local(r4), ve1 // Get old word.
+ pxor ve0, ve1 // Chain.
+ movd ve1, (1+8)*4(E, offset)
+
+ movd 2*4+Local(r4), ve2 // Get old word.
+ pxor ve1, ve2 // Chain.
+ movd ve2, (2+8)*4(E, offset)
+
+ movd 3*4+Local(r4), ve3 // Get old word.
+ pxor ve2, ve3 // Chain.
+ movd ve3, (3+8)*4(E, offset)
+
+ xor r0, r0 // Return success.
+
+ // Pop stack and restore registers.
+ movaps 7*16(r4), %xmm7
+ movaps 6*16(r4), %xmm6
+ movaps 5*16(r4), %xmm5
+ movaps 4*16(r4), %xmm4
+ movaps 3*16(r4), %xmm3
+ movaps 2*16(r4), %xmm2
+ movaps 1*16(r4), %xmm1
+ movaps 0*16(r4), %xmm0
+ #if 0 < LocalsSize
+ add $Padding + LocalsSize, r4
+ #endif
+ #if defined __i386__
+ pop r7
+ pop r6
+ pop r5
+ #endif
+ pop r3
+
+ ret
+
+
+#undef Address
+#undef Argument
+#undef E
+#undef ITable
+#undef K
+#undef Local
+#undef LocalsSize
+#undef LookupI
+#undef LookupS
+#undef Nk
+#undef Padding
+#undef R
+#undef SaveSize
+#undef STable
+#undef StackFrame
+#undef dr
+#undef drh
+#undef drl
+#undef offset
+#undef t0
+#undef t0d
+#undef ve0
+#undef ve1
+#undef ve2
+#undef ve3
+#undef ve4
+#undef ve5
+#undef vt0
+#undef vt1
diff --git a/bsd/crypto/aes/i386/ExpandKeyForEncryption.s b/bsd/crypto/aes/i386/ExpandKeyForEncryption.s
new file mode 100644
index 000000000..1ce3c9553
--- /dev/null
+++ b/bsd/crypto/aes/i386/ExpandKeyForEncryption.s
@@ -0,0 +1,801 @@
+/* This file defines _aes_encrypt_key, _aes_encrypt_key128,
+ _aes_encrypt_key192, and _aes_encrypt_key256. It is designed to be
+ included in another assembly file with the preprocessor #include directive,
+ to benefit from some assembly-time calculations.
+
+ Written by Eric Postpischil, January 2008.
+
+ The comments here do not say much about the algorithm; the code just
+ follows the FIPS-197 specification. I recommend reading the specification
+ before working with this code or examining the C code in the parent
+ directory that illustrates key expansion.
+*/
+
+
+/* Routines:
+
+ _aes_encrypt_key.
+
+ _aes_encrypt_key128, _aes_encrypt_key192, and _aes_encrypt_key256.
+
+ Function:
+
+ Expand the user's cipher key into the key schedule, as defined in
+ Federal Information Processing Standards Publication 197 (FIPS-197),
+ November 26, 2001.
+
+ Input:
+
+ Constant data:
+
+ The following names must be locally defined so the assembler
+ can calculate certain offsets.
+
+ static const Word _AESSubBytesWordTable[4][256].
+
+ _AESSubBytesWordTable[i][j] = SubBytes(j) << 8*i, where
+ SubBytes is defined in FIPS-197. _AESSubBytesWordTable
+ differs from _AESEncryptTable in that it does not include
+ the MixColumn operation. It is used in performing the last
+ round, which differs from the previous rounds in that it
+ does not include the MixColumn operation.
+
+ static const Byte _AESRcon[].
+
+ Round constants, beginning with AESRcon[1] for the first round
+ (AESRcon[0] is padding.)
+
+ Arguments:
+
+ const uint8_t *Key
+
+ Address of user's cipher key.
+
+ int Length
+
+ Number of bytes (16, 24, or 32) or bits (128, 192, or 256) in
+ user's cipher key.
+
+ This argument is used with _aes_encrypt_key. It is not
+ present for the other routines. In those routines, Context
+ is the second argument.
+
+ aes_encrypt_ctx *Context
+
+ Structure to contain the expanded key beginning at offset
+ ContextKey and a four-byte "key length" beginning at offset
+ ContextKeyLength.
The "key length" is the number of bytes from + the start of the first round key to the start of the last round + key. That is 16 less than the number of bytes in the entire + key. + + Output: + + The expanded key and the "key length" are written to *Context. + + Return: + + aes_rval // -1 if "key length" is invalid. 0 otherwise. +*/ + +/* add AES HW detection and program branch if AES HW is detected cclee 3-12-10 */ +#ifdef KERNEL +#include <i386/cpu_capabilities.h> +#else +#include <System/i386/cpu_capabilities.h> +#endif + + .text + .globl _aes_encrypt_key +// .private_extern _aes_encrypt_key +_aes_encrypt_key: + + // detect AES HW, cclee-3-13-10 +#if defined __x86_64__ + movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities + mov (%rax), %eax // %eax = __cpu_capabilities +#else +#if defined KERNEL + leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities + mov (%eax), %eax // %eax = __cpu_capabilities +#else + mov _COMM_PAGE_CPU_CAPABILITIES, %eax +#endif +#endif + test $(kHasAES), %eax // __cpu_capabilities & kHasAES + jne _aes_encrypt_key_hw // if AES HW detected, branch to _aes_encrypt_key_hw + +#define dr r0d // Dissection register. +#define drl r0l // Low 8 bits of dissection register. +#define drh r0h // Second-lowest 8 bits of dissection register. + +#define t0 r1 +#define t0d r1d // Low 32 bits of t0. + +#define offset Arch(r5, r11) // Address offset and loop sentinel. + +#define R r7 // Address of round constant. +#define K r7 // User key pointer. + // R and K overlap. + +#define E r6 // Expanded key pointer. + +#define ve0 %xmm0 +#define ve1 %xmm1 +#define ve2 %xmm2 +#define ve3 %xmm3 +#define vt3 %xmm4 +#define vt2 %xmm5 +#define vt1 %xmm6 +#define vt0 %xmm7 + +#if defined __i386__ + #define LookupS(table, index) \ + _AESSubBytesWordTable+(table)*TableSize(, index, 4) +#elif defined __x86_64__ + #define LookupS(table, index) (table)*TableSize(STable, index, 4) +#endif + + /* Save registers and set SaveSize to the number of bytes pushed onto the + stack so far, including the caller's return address. + */ + push r3 + #if defined __i386__ + push r5 + push r6 + push r7 + #define SaveSize (5*4) + #else + #define SaveSize (2*8) + #endif + + /* Number of bytes used for local variables: + + 8 16-byte spaces to save XMM registers. + */ + #define LocalsSize (8*16) + + #if 0 < LocalsSize + // Padding to position stack pointer at a multiple of 16 bytes. + #define Padding (15 & -(SaveSize + LocalsSize)) + sub $Padding + LocalsSize, r4 // Allocate space on stack. + #else + #define Padding 0 + #endif + + /* StackFrame is the number of bytes in our stack frame, from caller's + stack pointer to ours (so it includes the return address). + */ + #define StackFrame (SaveSize + Padding + LocalsSize) + + // Save xmm registers. + movaps %xmm0, 0*16(r4) + movaps %xmm1, 1*16(r4) + movaps %xmm2, 2*16(r4) + movaps %xmm3, 3*16(r4) + movaps %xmm4, 4*16(r4) + movaps %xmm5, 5*16(r4) + movaps %xmm6, 6*16(r4) + movaps %xmm7, 7*16(r4) + +#if defined __i386__ + + // Define location of argument i. + #define Argument(i) StackFrame+4*(i)(r4) + + #define Nk t0d + + // Load arguments. + mov Argument(2), E + mov Argument(1), Nk + mov Argument(0), K + +#elif defined __x86_64__ + + #define Nk r9d // Number of words in key. + mov r6d, Nk // Move Nk argument out of way. + mov r2, E // Move E argument to common register. + +#endif + + // Dispatch on key length. + cmp $128, Nk + jge 2f + shl $3, Nk // Convert from bytes to bits. 
+ cmp $128, Nk +2: + je EKeyHas4Words + cmp $192, Nk + je EKeyHas6Words + cmp $256, Nk + je EKeyHas8Words + mov $-1, r0 // Return error. + jmp 9f + +// Stop using Nk. +#undef Nk + + .globl _aes_encrypt_key128 +// .private_extern _aes_encrypt_key128 +_aes_encrypt_key128: + + /* Save registers and set SaveSize to the number of bytes pushed onto the + stack so far, including the caller's return address. + */ + push r3 + #if defined __i386__ + push r5 + push r6 + push r7 + #define SaveSize (5*4) + #else + #define SaveSize (2*8) + #endif + + /* Number of bytes used for local variables: + + 8 16-byte spaces to save XMM registers. + */ + #define LocalsSize (8*16) + + #if 0 < LocalsSize + // Padding to position stack pointer at a multiple of 16 bytes. + #define Padding (15 & -(SaveSize + LocalsSize)) + sub $Padding + LocalsSize, r4 // Allocate space on stack. + #else + #define Padding 0 + #endif + + /* StackFrame is the number of bytes in our stack frame, from caller's + stack pointer to ours (so it includes the return address). + */ + #define StackFrame (SaveSize + Padding + LocalsSize) + + // Save xmm registers. + movaps %xmm0, 0*16(r4) + movaps %xmm1, 1*16(r4) + movaps %xmm2, 2*16(r4) + movaps %xmm3, 3*16(r4) + movaps %xmm4, 4*16(r4) + movaps %xmm5, 5*16(r4) + movaps %xmm6, 6*16(r4) + movaps %xmm7, 7*16(r4) + + #if defined __i386__ + + // Load arguments. + #define Argument(i) StackFrame+4*(i)(r4) + mov Argument(1), E + mov Argument(0), K + + #endif + +// Merge point for _aes_encrypt_key and _aes_encrypt_key128. +EKeyHas4Words: + +#define e0 r2d +#define e1 r3d +#define e2 Arch(r5d, r11d) +#define e3 r7d + + // First words of expanded key are copied from user key. + mov 0*4(K), e0 + mov 1*4(K), e1 + mov 2*4(K), e2 + mov 3*4(K), e3 + + movl $10*16, ContextKeyLength(E) // Set "key length." + + #if 0 != ContextKey + add $ContextKey, E + #endif + + // K cannot be used after we write to R, since they use the same register. + + // Cache round constants in output buffer. The last is a sentinel. + movb $0x01, 1*16(E) + movb $0x02, 2*16(E) + movb $0x04, 3*16(E) + movb $0x08, 4*16(E) + movb $0x10, 5*16(E) + movb $0x20, 6*16(E) + movb $0x40, 7*16(E) + movb $0x80, 8*16(E) + movb $0x1b, 9*16(E) + movb $0x36, 10*16(E) + + #if defined __x86_64__ + + #define STable r8 + lea _AESSubBytesWordTable(%rip), STable + + #endif + + // Store initial words of expanded key, which are copies of user's key. + mov e0, 0*4(E) + mov e1, 1*4(E) + mov e2, 2*4(E) + mov e3, 3*4(E) + +1: + mov e3, dr // Put previous word into dissection register. + + // Perform SubWord(RotWord(dr)). + movzx drl, t0 + xor LookupS(3, t0), e0 // Look up byte 0 in table 3. + movzx drh, t0d + xor LookupS(0, t0), e0 // Look up byte 1 in table 0. + shr $16, dr + movzx drl, t0d + xor LookupS(1, t0), e0 // Look up byte 2 in table 1. + movzx drh, t0d + xor LookupS(2, t0), e0 // Look up byte 3 in table 2. + + add $4*4, E + + movzx (E), t0d // Get cached round constant. + xor t0d, e0 // XOR with word from four words back. + + // Chain to successive words. + mov e0, 0*4(E) + xor e0, e1 + mov e1, 1*4(E) + xor e1, e2 + mov e2, 2*4(E) + xor e2, e3 + mov e3, 3*4(E) + + cmp $0x36, t0d // Was this the last round constant? + + jne 1b + + xor r0, r0 // Return success. + +9: + // Pop stack and restore registers. 
+ movaps 7*16(r4), %xmm7 + movaps 6*16(r4), %xmm6 + movaps 5*16(r4), %xmm5 + movaps 4*16(r4), %xmm4 + movaps 3*16(r4), %xmm3 + movaps 2*16(r4), %xmm2 + movaps 1*16(r4), %xmm1 + movaps 0*16(r4), %xmm0 + #if 0 < LocalsSize + add $Padding + LocalsSize, r4 + #endif + #if defined __i386__ + pop r7 + pop r6 + pop r5 + #endif + pop r3 + + ret + + +// Reset definitions for next case. +#undef e0 +#undef e1 +#undef e2 +#undef e3 + +#undef vt3 +#undef vt2 +#define ve4 %xmm4 +#define ve5 %xmm5 + + + .globl _aes_encrypt_key192 +// .private_extern _aes_encrypt_key192 +_aes_encrypt_key192: + + /* Save registers and set SaveSize to the number of bytes pushed onto the + stack so far, including the caller's return address. + */ + push r3 + #if defined __i386__ + push r5 + push r6 + push r7 + #define SaveSize (5*4) + #else + #define SaveSize (2*8) + #endif + + /* Number of bytes used for local variables: + + 8 16-byte spaces to save XMM registers. + */ + #define LocalsSize (8*16) + + #if 0 < LocalsSize + // Padding to position stack pointer at a multiple of 16 bytes. + #define Padding (15 & -(SaveSize + LocalsSize)) + sub $Padding + LocalsSize, r4 // Allocate space on stack. + #else + #define Padding 0 + #endif + + /* StackFrame is the number of bytes in our stack frame, from caller's + stack pointer to ours (so it includes the return address). + */ + #define StackFrame (SaveSize + Padding + LocalsSize) + + // Save xmm registers. + movaps %xmm0, 0*16(r4) + movaps %xmm1, 1*16(r4) + movaps %xmm2, 2*16(r4) + movaps %xmm3, 3*16(r4) + movaps %xmm4, 4*16(r4) + movaps %xmm5, 5*16(r4) + movaps %xmm6, 6*16(r4) + movaps %xmm7, 7*16(r4) + + #if defined __i386__ + + // Load arguments. + #define Argument(i) StackFrame+4*(i)(r4) + mov Argument(1), E + mov Argument(0), K + + #endif + +// Merge point for _aes_encrypt_key and _aes_encrypt_key192. +EKeyHas6Words: + + // First words of expanded key are copied from user key. + movd 0*4(K), ve0 + movd 1*4(K), ve1 + movd 2*4(K), ve2 + movd 3*4(K), ve3 + + movl $12*16, ContextKeyLength(E) // Set "key length." + + #if 0 != ContextKey + add $ContextKey, E + #endif + + movd 4*4(K), ve4 + movd 5*4(K), ve5 + + // K cannot be used after we write to R, since they use the same register. + + #if defined __i386__ + + lea _AESRcon, R + + #elif defined __x86_64__ + + lea _AESRcon(%rip), R + lea _AESSubBytesWordTable(%rip), STable + + #endif + + /* With a six-word key, there are twelve rounds (thirteen 16-byte key + blocks). + */ + mov $-12*4*4, offset + sub offset, E + + // Store initial words of expanded key, which are copies of user's key. + movd ve0, 0*4(E, offset) + movd ve1, 1*4(E, offset) + movd ve2, 2*4(E, offset) + movd ve3, 3*4(E, offset) + movd ve4, 4*4(E, offset) + movd ve5, 5*4(E, offset) + +/* Jump into loop body. The key expansion processes six four-byte words per + iteration. 52 are needed in the key. So only four are needed in the last + iteration. +*/ + jmp 2f +1: + // Continue chaining to successive words. + pxor ve3, ve4 + movd ve4, 4*4(E, offset) + pxor ve4, ve5 + movd ve5, 5*4(E, offset) +2: + add $1, R // Advance pointer. + movd ve5, dr // Put previous word into dissection register. + movzx (R), t0 // Get round constant. + movd t0d, vt1 + pxor vt1, ve0 // XOR with word from six words back. + + // Perform SubWord(RotWord(dr)). + movzx drl, t0d + movd LookupS(3, t0), vt0 // Look up byte 0 in table 3. + movzx drh, t0d + movd LookupS(0, t0), vt1 // Look up byte 1 in table 0. 
+ shr $16, dr + movzx drl, t0d + pxor vt1, vt0 + pxor vt0, ve0 + movd LookupS(1, t0), vt0 // Look up byte 2 in table 1. + movzx drh, t0d + movd LookupS(2, t0), vt1 // Look up byte 3 in table 2. + pxor vt1, vt0 + pxor vt0, ve0 + + add $6*4, offset + + // Chain to successive words. + movd ve0, 0*4(E, offset) + pxor ve0, ve1 + movd ve1, 1*4(E, offset) + pxor ve1, ve2 + movd ve2, 2*4(E, offset) + pxor ve2, ve3 + movd ve3, 3*4(E, offset) + + jne 1b + + xor r0, r0 // Return success. + + // Pop stack and restore registers. + movaps 7*16(r4), %xmm7 + movaps 6*16(r4), %xmm6 + movaps 5*16(r4), %xmm5 + movaps 4*16(r4), %xmm4 + movaps 3*16(r4), %xmm3 + movaps 2*16(r4), %xmm2 + movaps 1*16(r4), %xmm1 + movaps 0*16(r4), %xmm0 + #if 0 < LocalsSize + add $Padding + LocalsSize, r4 + #endif + #if defined __i386__ + pop r7 + pop r6 + pop r5 + #endif + pop r3 + + ret + + +// Reset definitions for next case. +#undef ve4 +#undef ve5 +#define vt3 %xmm4 +#define vt2 %xmm5 + + + .globl _aes_encrypt_key256 +// .private_extern _aes_encrypt_key256 +_aes_encrypt_key256: + + /* Save registers and set SaveSize to the number of bytes pushed onto the + stack so far, including the caller's return address. + */ + push r3 + #if defined __i386__ + push r5 + push r6 + push r7 + #define SaveSize (5*4) + #else + #define SaveSize (2*8) + #endif + + /* Number of bytes used for local variables: + + 8 16-byte spaces to save XMM registers. + */ + #define LocalsSize (8*16) + + #if 0 < LocalsSize + // Padding to position stack pointer at a multiple of 16 bytes. + #define Padding (15 & -(SaveSize + LocalsSize)) + sub $Padding + LocalsSize, r4 // Allocate space on stack. + #else + #define Padding 0 + #endif + + /* StackFrame is the number of bytes in our stack frame, from caller's + stack pointer to ours (so it includes the return address). + */ + #define StackFrame (SaveSize + Padding + LocalsSize) + + // Save xmm registers. + movaps %xmm0, 0*16(r4) + movaps %xmm1, 1*16(r4) + movaps %xmm2, 2*16(r4) + movaps %xmm3, 3*16(r4) + movaps %xmm4, 4*16(r4) + movaps %xmm5, 5*16(r4) + movaps %xmm6, 6*16(r4) + movaps %xmm7, 7*16(r4) + + #if defined __i386__ + + // Load arguments. + #define Argument(i) StackFrame+4*(i)(r4) + mov Argument(1), E + mov Argument(0), K + + #endif + +// Merge point for _aes_encrypt_key and _aes_encrypt_key256. +EKeyHas8Words: + + // First words of expanded key are copied from user key. + movd 0*4(K), ve0 + movd 1*4(K), ve1 + movd 2*4(K), ve2 + movd 3*4(K), ve3 + + movl $14*16, ContextKeyLength(E) // Set "key length." + + #if 0 != ContextKey + add $ContextKey, E + #endif + + // Store initial words of expanded key, which are copies of user's key. + movd ve0, 0*4(E) + movd ve1, 1*4(E) + movd ve2, 2*4(E) + movd ve3, 3*4(E) + movd 4*4(K), ve0 + movd 5*4(K), ve1 + movd 6*4(K), ve2 + movd 7*4(K), ve3 + + // K cannot be used after we write to R, since they use the same register. + + #if defined __i386__ + + lea _AESRcon, R + + #elif defined __x86_64__ + + lea _AESRcon(%rip), R + lea _AESSubBytesWordTable(%rip), STable + + #endif + + /* With an eight-word key, there are fourteen rounds (fifteen 16-byte key + blocks). + */ + mov $-14*4*4, offset + sub offset, E + + // Store initial words of expanded key, which are copies of user's key. + movd ve0, 4*4(E, offset) + movd ve1, 5*4(E, offset) + movd ve2, 6*4(E, offset) + movd ve3, 7*4(E, offset) + +/* Jump into loop body. The key expansion processes eight four-byte words per + iteration. 60 are needed in the key. So only four are needed in the last + iteration. 
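+
+   A hedged C-like rendering of one full iteration (hypothetical names,
+   following the E[] notation used in ExpandKeyForDecryption.s; i is a
+   multiple of eight):
+
+	E[i+0] = E[i-8] ^ SubWord(RotWord(E[i-1])) ^ Rcon[i/8];
+	E[i+1] = E[i-7] ^ E[i+0];
+	E[i+2] = E[i-6] ^ E[i+1];
+	E[i+3] = E[i-5] ^ E[i+2];
+	E[i+4] = E[i-4] ^ SubWord(E[i+3]);	// Extra SubWord for Nk == 8.
+	E[i+5] = E[i-3] ^ E[i+4];
+	E[i+6] = E[i-2] ^ E[i+5];
+	E[i+7] = E[i-1] ^ E[i+6];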
+*/ + jmp 2f +1: + movd ve3, dr // Put previous word into dissection register. + + /* Get word from eight words back (it is four words back from where E + currently points, and we use it to prepare the value to be stored + four words beyond where E currently points). + */ + movd -4*4(E, offset), ve0 + + // Perform SubWord(dr). + movzx drl, t0 + movd LookupS(0, t0), vt0 // Look up byte 0 in table 0. + movzx drh, t0d + movd LookupS(1, t0), vt1 // Look up byte 1 in table 1. + shr $16, dr + movzx drl, t0d + movd LookupS(2, t0), vt2 // Look up byte 2 in table 2. + movzx drh, t0d + movd LookupS(3, t0), vt3 // Look up byte 3 in table 3. + pxor vt1, vt0 + pxor vt3, vt2 + pxor vt0, ve0 + pxor vt2, ve0 + + movd -3*4(E, offset), ve1 // Get words from eight words back. + movd -2*4(E, offset), ve2 + movd -1*4(E, offset), ve3 + + // Chain to successive words. + movd ve0, 4*4(E, offset) + pxor ve0, ve1 + movd ve1, 5*4(E, offset) + pxor ve1, ve2 + movd ve2, 6*4(E, offset) + pxor ve2, ve3 + movd ve3, 7*4(E, offset) + +2: + add $1, R // Advance pointer. + movd ve3, dr // Put previous word into dissection register. + movzx (R), t0d // Get round constant. + movd t0d, vt1 + movd 0*4(E, offset), ve0 // Get word from eight words back. + pxor vt1, ve0 + + // Perform SubWord(RotWord(dr)). + movzx drl, t0 + movd LookupS(3, t0), vt0 // Look up byte 0 in table 3. + movzx drh, t0d + movd LookupS(0, t0), vt1 // Look up byte 1 in table 0. + shr $16, dr + movzx drl, t0d + movd LookupS(1, t0), vt2 // Look up byte 2 in table 1. + movzx drh, t0d + movd LookupS(2, t0), vt3 // Look up byte 3 in table 2. + pxor vt1, vt0 + pxor vt3, vt2 + pxor vt0, ve0 + pxor vt2, ve0 + + movd 1*4(E, offset), ve1 + movd 2*4(E, offset), ve2 + movd 3*4(E, offset), ve3 + + add $8*4, offset + + // Chain to successive words. + movd ve0, 0*4(E, offset) + pxor ve0, ve1 + movd ve1, 1*4(E, offset) + pxor ve1, ve2 + movd ve2, 2*4(E, offset) + pxor ve2, ve3 + movd ve3, 3*4(E, offset) + + jne 1b + + xor r0, r0 // Return success. + + // Pop stack and restore registers. + movaps 7*16(r4), %xmm7 + movaps 6*16(r4), %xmm6 + movaps 5*16(r4), %xmm5 + movaps 4*16(r4), %xmm4 + movaps 3*16(r4), %xmm3 + movaps 2*16(r4), %xmm2 + movaps 1*16(r4), %xmm1 + movaps 0*16(r4), %xmm0 + #if 0 < LocalsSize + add $Padding + LocalsSize, r4 + #endif + #if defined __i386__ + pop r7 + pop r6 + pop r5 + #endif + pop r3 + + ret + + +#undef Address +#undef Argument +#undef E +#undef K +#undef LocalsSize +#undef LookupS +#undef Padding +#undef R +#undef SaveSize +#undef STable +#undef StackFrame +#undef dr +#undef drh +#undef drl +#undef offset +#undef t0 +#undef t0d +#undef ve0 +#undef ve1 +#undef ve2 +#undef ve3 +#undef vt0 +#undef vt1 +#undef vt2 +#undef vt3 diff --git a/bsd/crypto/aes/i386/MakeData.c b/bsd/crypto/aes/i386/MakeData.c new file mode 100644 index 000000000..262dc5996 --- /dev/null +++ b/bsd/crypto/aes/i386/MakeData.c @@ -0,0 +1,516 @@ +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#define MaxRcon 11 + +typedef uint8_t Byte; +typedef uint32_t Word; + + +/* In comments below, {n} designates the Galois field element represented by + the byte n. See notes about Galois field multiplication in ReadMe.txt. + + So 3+5 is addition of ordinary integers, and 3+5 == 8, while {3}+{5} is + addition in the field, and {3} + {5} = {3 XOR 5} = {6}.) +*/ + + +// Define constants for languages. +typedef enum { C, IntelAssembly } Language; + + +/* LogBase3[i] will contain the base-three logarithm of i in the 256-element + Galois field defined by AES. 
That is, {3}**LogBase3[i] == {i}.
+*/
+static Byte LogBase3[256];
+
+/* AntilogBase3[i] will contain {3}**i in the 256-element Galois field defined
+ by AES. It contains extra elements so that the antilog of a+b can be found
+ by looking up a+b directly, without having to reduce modulo the period, for
+ 0 <= a, b < 255.
+
+ (254 is the greatest value we encounter. Each a or b we use is the
+ base-three logarithm of some element. As a primitive root, the powers of
+ three cycle through all non-zero elements of the field, of which there are
+ 255, so the exponents cover 0 to 254 before the powers repeat.)
+*/
+static Byte AntilogBase3[254+254+1];
+
+
+static void InitializeLogTables(void)
+{
+    // log({1}) is zero, so start {p} (power) at {1} and l (logarithm) at 0.
+    Byte p = 1;
+    int l = 0;
+    do
+    {
+        // Record table entries.
+        LogBase3[p] = l;
+        AntilogBase3[l] = p;
+
+        /* Observe that {2}*{p} is {p << 1 ^ (p & 0x80 ? 0x1b : 0)}, per notes
+           in ReadMe.txt. We produce {3}*{p}:
+
+            {3}*{p}
+            = {1}*{p} + {2}*{p}
+            = {1}*{p} + {p << 1 ^ (p & 0x80 ? 0x1b : 0)}
+            = {p ^ p << 1 ^ (p & 0x80 ? 0x1b : 0)}.
+        */
+        p ^= p << 1 ^ (p & 0x80 ? 0x1b : 0);
+        ++l;
+
+    } while (p != 1); // Stop when we have gone around completely.
+
+    /* The antilogarithms are periodic with a period of 255, and we want to
+       look up elements as high as 254+254 (the largest that a sum of two
+       logarithms could be), so we replicate the table beyond the first
+       period.
+    */
+    for (l = 255; l <= 254+254; ++l)
+        AntilogBase3[l] = AntilogBase3[l-255];
+}
+
+
+/* MultiplyByte(Byte b, Byte c) returns {b}*{c}. It requires tables that must
+ be initialized before this routine is used.
+*/
+static Byte MultiplyByte(Byte b, Byte c)
+{
+    // Calculate product by adding logarithms, but avoid logarithms of zero.
+    return b == 0 || c == 0 ? 0 : AntilogBase3[LogBase3[b] + LogBase3[c]];
+}
+
+
+// Return {0} if {b} is {0} and the multiplicative inverse of {b} otherwise.
+static Byte InverseByte(Byte b)
+{
+    return b == 0 ? 0 : AntilogBase3[255 - LogBase3[b]];
+}
+
+
+// Perform AES' SubBytes operation on a single byte.
+static Byte SubByte(Byte b)
+{
+    unsigned int r = InverseByte(b);
+
+    // Duplicate r as a proxy for a rotate operation.
+    r = r | r<<8;
+
+    // Apply the standard's affine transformation.
+    return r ^ r>>4 ^ r>>5 ^ r>>6 ^ r>>7 ^ 0x63;
+}
+
+
+// Define and populate tables for the SubBytes and InvSubBytes operations.
+static Byte SubBytesTable[256];
+static Byte InvSubBytesTable[256];
+
+
+static void InitializeSubBytesTable(void)
+{
+    for (int i = 0; i < 256; ++i)
+        SubBytesTable[i] = SubByte((Byte) i);
+}
+
+
+static void InitializeInvSubBytesTable(void)
+{
+    for (int i = 0; i < 256; ++i)
+        InvSubBytesTable[SubByte((Byte) i)] = i;
+}
+
+
+/* Print tables for SubBytes function providing the output byte embedded in
+ various places in a word, so that the table entries can be used with
+ fewer byte manipulations.
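+
+   As a hedged illustration (C, with a hypothetical function name), a caller
+   can then apply SubBytes to a whole word with four lookups and XORs, with
+   no shifting of the looked-up results:
+
+	Word SubWordViaTables(Word w)
+	{
+		return AESSubBytesWordTable[0][w       & 0xff]
+		     ^ AESSubBytesWordTable[1][w >>  8 & 0xff]
+		     ^ AESSubBytesWordTable[2][w >> 16 & 0xff]
+		     ^ AESSubBytesWordTable[3][w >> 24       ];
+	}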
+*/ +static void PrintSubBytesWordTable(Language language) +{ + switch (language) + { + case C: + printf("\n\n" + "// SubBytes embedded in words tables.\n" + "const Word AESSubBytesWordTable[4][256] =\n" + "{\n"); + for (int j = 0; j < 4; ++j) + { + printf("\t{\n"); + for (int i = 0; i < 256; ++i) + printf("\t\t0x%08x,\n", SubBytesTable[i] << j*8); + printf("\t},\n"); + } + printf("};\n"); + break; + + case IntelAssembly: + printf("\n\n" + "// SubBytes embedded in words tables.\n" + "\t.globl\t_AESSubBytesWordTable\n" + "\t.private_extern\t_AESSubBytesWordTable\n" + "\t.align\t2\n" + "_AESSubBytesWordTable:\n"); + for (int j = 0; j < 4; ++j) + { + printf("\t// Table %d.\n", j); + for (int i = 0; i < 256; ++i) + printf("\t.long\t0x%08x\n", SubBytesTable[i] << j*8); + } + break; + } +} + + +/* Print tables for InvSubBytes function providing the output byte embedded in + various places in a word, so that the table entries can be used with + fewer byte manipulations. +*/ +static void PrintInvSubBytesWordTable(Language language) +{ + switch (language) + { + case C: + printf("\n\n" + "// InvSubBytes embedded in words tables.\n" + "const Word AESInvSubBytesWordTable[4][256] =\n" + "{\n"); + for (int j = 0; j < 4; ++j) + { + printf("\t{\n"); + for (int i = 0; i < 256; ++i) + printf("\t\t0x%08x,\n", InvSubBytesTable[i] << j*8); + printf("\t},\n"); + } + printf("};\n"); + break; + + case IntelAssembly: + printf("\n\n" + "// InvSubBytes embedded in words tables.\n" + "\t.globl\t_AESInvSubBytesWordTable\n" + "\t.private_extern\t_AESInvSubBytesWordTable\n" + "\t.align\t2\n" + "_AESInvSubBytesWordTable:\n"); + for (int j = 0; j < 4; ++j) + { + printf("\t// Table %d.\n", j); + for (int i = 0; i < 256; ++i) + printf("\t.long\t0x%08x\n", InvSubBytesTable[i] << j*8); + } + break; + } +} + + +// Print the round constants. +static void PrintRcon(Language language) +{ + union { Byte c[4]; Word w; } t = { { 1, 0, 0, 0 } }; + + switch (language) + { + case C: + printf("\n\n" + "// Round constants.\n" + "const Byte AESRcon[] =\n" + "{\n" + "\t0,\t// Not used, included for indexing simplicity.\n"); + for (int i = 1; i < MaxRcon; ++i) + { + printf("\t0x%02x,\n", t.w); + t.c[0] = MultiplyByte(0x2, t.c[0]); + } + printf("};\n"); + break; + + case IntelAssembly: + printf("\n\n" + "// Round constants.\n" + "\t.globl\t_AESRcon\n" + "\t.private_extern\t_AESRcon\n" + "_AESRcon:\n" + "\t.byte\t0\t// Not used, included for indexing simplicity.\n"); + for (int i = 1; i < MaxRcon; ++i) + { + printf("\t.byte\t0x%02x\n", t.w); + t.c[0] = MultiplyByte(0x2, t.c[0]); + } + break; + } +} + + +// Print tables for the InvMixColumn operation. 
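+/* A hedged companion sketch (C, hypothetical function name): because table k
+   holds byte k's contribution to the whole output word, InvMixColumn of a
+   word is four lookups XORed together, which is the pattern the InvMixColumn
+   subroutine in ExpandKeyForDecryption.s implements with movd and pxor:
+
+	Word InvMixColumnWord(Word w)
+	{
+		return AESInvMixColumnTable[0][w       & 0xff]
+		     ^ AESInvMixColumnTable[1][w >>  8 & 0xff]
+		     ^ AESInvMixColumnTable[2][w >> 16 & 0xff]
+		     ^ AESInvMixColumnTable[3][w >> 24       ];
+	}
+*/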
+static void PrintInvMixColumnTable(Language language)
+{
+    Word T[4][256];
+
+    for (int i = 0; i < 256; ++i)
+    {
+        union { Byte b[4]; Word w; } c;
+
+        Byte s9 = MultiplyByte(0x9, i);
+        Byte sb = MultiplyByte(0xb, i);
+        Byte sd = MultiplyByte(0xd, i);
+        Byte se = MultiplyByte(0xe, i);
+
+        c.b[0] = se;
+        c.b[1] = s9;
+        c.b[2] = sd;
+        c.b[3] = sb;
+        T[0][i] = c.w;
+
+        c.b[0] = sb;
+        c.b[1] = se;
+        c.b[2] = s9;
+        c.b[3] = sd;
+        T[1][i] = c.w;
+
+        c.b[0] = sd;
+        c.b[1] = sb;
+        c.b[2] = se;
+        c.b[3] = s9;
+        T[2][i] = c.w;
+
+        c.b[0] = s9;
+        c.b[1] = sd;
+        c.b[2] = sb;
+        c.b[3] = se;
+        T[3][i] = c.w;
+    }
+
+    switch (language)
+    {
+        case C:
+            printf("\n\n"
+                "// Tables for InvMixColumn.\n"
+                "const Word AESInvMixColumnTable[4][256] =\n"
+                "{\n");
+            for (int i = 0; i < 4; ++i)
+            {
+                printf("\t{\n");
+                for (int j = 0; j < 256; ++j)
+                    printf("\t\t0x%08x,\n", T[i][j]);
+                printf("\t},\n");
+            }
+            printf("};\n");
+            break;
+
+        case IntelAssembly:
+            printf("\n\n"
+                "// Tables for InvMixColumn.\n"
+                "\t.globl\t_AESInvMixColumnTable\n"
+                "\t.private_extern\t_AESInvMixColumnTable\n"
+                "\t.align\t2\n"
+                "_AESInvMixColumnTable:\n");
+            for (int i = 0; i < 4; ++i)
+            {
+                printf("\t// Table %d.\n", i);
+                for (int j = 0; j < 256; ++j)
+                    printf("\t.long\t0x%08x\n", T[i][j]);
+            }
+            break;
+    }
+}
+
+
+/* Print the tables defined in AES Proposal: Rijndael, amended, 9/04/2003,
+ section 5.2.1. These combine the MixColumn and SubBytes operations.
+*/
+static void PrintEncryptTable(Language language)
+{
+    Word T[4][256];
+
+    for (int i = 0; i < 256; ++i)
+    {
+        union { Byte b[4]; Word w; } c;
+
+        Byte s1 = SubBytesTable[i];
+        Byte s2 = MultiplyByte(0x2, s1);
+        Byte s3 = s1 ^ s2;
+
+        c.b[0] = s2;
+        c.b[1] = s1;
+        c.b[2] = s1;
+        c.b[3] = s3;
+        T[0][i] = c.w;
+
+        c.b[0] = s3;
+        c.b[1] = s2;
+        //c.b[2] = s1;
+        c.b[3] = s1;
+        T[1][i] = c.w;
+
+        c.b[0] = s1;
+        c.b[1] = s3;
+        c.b[2] = s2;
+        //c.b[3] = s1;
+        T[2][i] = c.w;
+
+        //c.b[0] = s1;
+        c.b[1] = s1;
+        c.b[2] = s3;
+        c.b[3] = s2;
+        T[3][i] = c.w;
+    }
+
+    switch (language)
+    {
+        case C:
+            printf("\n\n"
+                "// Tables for main encryption iterations.\n"
+                "const Word AESEncryptTable[4][256] =\n"
+                "{\n");
+            for (int i = 0; i < 4; ++i)
+            {
+                printf("\t{\n");
+                for (int j = 0; j < 256; ++j)
+                    printf("\t\t0x%08x,\n", T[i][j]);
+                printf("\t},\n");
+            }
+            printf("};\n");
+            break;
+
+        case IntelAssembly:
+            printf("\n\n"
+                "// Tables for main encryption iterations.\n"
+                "\t.globl\t_AESEncryptTable\n"
+                "\t.private_extern\t_AESEncryptTable\n"
+                "\t.align\t2\n"
+                "_AESEncryptTable:\n");
+            for (int i = 0; i < 4; ++i)
+            {
+                printf("\t// Table %d.\n", i);
+                for (int j = 0; j < 256; ++j)
+                    printf("\t.long\t0x%08x\n", T[i][j]);
+            }
+            break;
+    }
+}
+
+
+/* Print the inverse tables. These correspond to the tables above, but for
+ decryption. These combine the InvSubBytes and InvMixColumn operations.
+*/ +static void PrintDecryptTable(Language language) +{ + Word T[4][256]; + + for (int i = 0; i < 256; ++i) + { + union { Byte b[4]; Word w; } c; + + Byte si = InvSubBytesTable[i]; + + Byte s9 = MultiplyByte(0x9, si); + Byte sb = MultiplyByte(0xb, si); + Byte sd = MultiplyByte(0xd, si); + Byte se = MultiplyByte(0xe, si); + + c.b[0] = se; + c.b[1] = s9; + c.b[2] = sd; + c.b[3] = sb; + T[0][i] = c.w; + + c.b[0] = sb; + c.b[1] = se; + c.b[2] = s9; + c.b[3] = sd; + T[1][i] = c.w; + + c.b[0] = sd; + c.b[1] = sb; + c.b[2] = se; + c.b[3] = s9; + T[2][i] = c.w; + + c.b[0] = s9; + c.b[1] = sd; + c.b[2] = sb; + c.b[3] = se; + T[3][i] = c.w; + } + + switch (language) + { + case C: + printf("\n\n" + "// Tables for main decryption iterations.\n" + "const Word AESDecryptTable[4][256] =\n" + "{\n"); + for (int i = 0; i < 4; ++i) + { + printf("\t{\n"); + for (int j = 0; j < 256; ++j) + printf("\t\t0x%08x,\n", T[i][j]); + printf("\t},\n"); + } + printf("};\n"); + break; + + case IntelAssembly: + printf("\n\n" + "// Tables for main decryption iterations.\n" + "\t.globl\t_AESDecryptTable\n" + "\t.private_extern\t_AESDecryptTable\n" + "\t.align\t2\n" + "_AESDecryptTable:\n"); + for (int i = 0; i < 4; ++i) + { + printf("\t// Table %d.\n", i); + for (int j = 0; j < 256; ++j) + printf("\t.long\t0x%08x\n", T[i][j]); + } + break; + } +} + + +static void Usage(const char *ProgramName) +{ + fprintf(stderr, + "%s: This program must have exactly one argument, \"C\" to generate\n" + "C or \"Intel\" to generate GCC i386/x86_64 assembly.\n", ProgramName); + exit(EXIT_FAILURE); +} + + +int main(int argc, char *argv[]) +{ + if (argc != 2) + Usage(argv[0]); + + Language language; + + // Figure out which language to generate, C or Intel assembly. + if (0 == strcmp(argv[1], "C")) + language = C; + else if (0 == strcmp(argv[1], "Intel")) + language = IntelAssembly; + else + Usage(argv[0]); + + printf("// This file was generated by " __FILE__ ".\n"); + + if (language == C) + printf("\n\n#include \"AES.h\"\n"); + + if (language == IntelAssembly) + printf("\n\n\t.const\n"); + + InitializeLogTables(); + InitializeSubBytesTable(); + InitializeInvSubBytesTable(); + + PrintRcon(language); + PrintInvMixColumnTable(language); + PrintEncryptTable(language); + PrintDecryptTable(language); + PrintSubBytesWordTable(language); + PrintInvSubBytesWordTable(language); + + return 0; +} diff --git a/bsd/crypto/aes/i386/Makefile b/bsd/crypto/aes/i386/Makefile index f116db347..851f7b2ac 100644 --- a/bsd/crypto/aes/i386/Makefile +++ b/bsd/crypto/aes/i386/Makefile @@ -7,28 +7,26 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) include $(MakeInc_def) -INSTINC_SUBDIRS = \ +include $(MakeInc_cmd) +include $(MakeInc_def) -INSTINC_SUBDIRS_PPC = \ +INSTINC_SUBDIRS = \ INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ -PRIVATE_DATAFILES = \ - aesopt.h edefs.h - INSTALL_MI_DIR = crypto EXPORT_MI_DIR = ${INSTALL_MI_DIR} -INSTALL_KF_MI_LIST = +PRIVATE_DATAFILES = \ + aesxts.h -INSTALL_KF_MI_LCL_LIST = ${PRIVATE_DATAFILES} +# /System/Library/Frameworks/Kernel.framework/PrivateHeaders +INSTALL_KF_MD_LCL_LIST = ${PRIVATE_DATAFILES} include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/bsd/crypto/aes/i386/ReadMe.txt b/bsd/crypto/aes/i386/ReadMe.txt new file mode 100644 index 000000000..7ac833117 --- /dev/null +++ b/bsd/crypto/aes/i386/ReadMe.txt @@ -0,0 +1,22 @@ +This directory contains a hybrid AES implementation. 
The core AES routines
+(the actual encryption, decryption, and key expansion) are in:
+
+	AES.s
+	Data.mk
+	Data.s
+	EncryptDecrypt.s
+	ExpandKeyForDecryption.s
+	ExpandKeyForEncryption.s
+	MakeData.c
+
+Although the above files do not explicitly include aes.h, they conform to
+certain things defined in it, notably the aes_rval type and the layout of the
+aes_encrypt_ctx and aes_decrypt_ctx structures.  These must be kept
+compatible; the definitions of ContextKey and ContextKeyLength in AES.s must
+match the offsets of the key ("ks") and key_length ("inf") members of
+aes_encrypt_ctx and aes_decrypt_ctx.  (For some reason, aes_inf is a union that
+is written as a 32-bit integer and read as an 8-bit integer.  I do not know
+why, but I have reproduced that behavior in the new implementation.)
+
+aes_modes.c extends the API, most notably by implementing CBC mode using the
+basic AES block encryption.  It uses aesopt.h and edefs.h.
diff --git a/bsd/crypto/aes/i386/aes_crypt_hw.s b/bsd/crypto/aes/i386/aes_crypt_hw.s
new file mode 100644
index 000000000..2edc3e2fd
--- /dev/null
+++ b/bsd/crypto/aes/i386/aes_crypt_hw.s
@@ -0,0 +1,472 @@
+/*	This file defines _aes_encrypt_hw and _aes_decrypt_hw --- the Intel Westmere HW AES-based implementation
+	of _aes_encrypt and _aes_decrypt.
+
+	These 2 functions SHOULD BE entered ONLY after the AES HW is verified to be available.
+	They SHOULD NOT be called without AES HW detection; doing so might cause xnu to crash.
+
+	The AES HW is detected 1st thing in
+	_aes_encrypt (EncryptDecrypt.s)
+	_aes_decrypt (EncryptDecrypt.s)
+	and, if AES HW is detected, they branch without link (ie, jump) to the functions here.
+
+	The implementation here follows the examples in an Intel White Paper
+	"Intel Advanced Encryption Standard (AES) Instruction Set" Rev.2 01
+
+	Note: Rev. 03 Final 2010 01 26 is available. 
Looks like some code change from Rev.2 01 + + cclee 3-13-10 +*/ + + .text + .align 4,0x90 +.globl _aes_encrypt_hw +_aes_encrypt_hw: + +#if defined __i386__ + movl 4(%esp), %eax // in + movl 12(%esp), %edx // ctx + movl 8(%esp), %ecx // out + + #define LOCAL_SIZE (12+16+16) // 16-byte align (-4 for return address) + 16 (xmm0) + 16 (xmm1) + #define in %eax + #define ctx %edx + #define out %ecx + #define r13 %esp + +#else // x86_64 + + #define LOCAL_SIZE (8+16+16) // 16-byte align (-8 for return address) + 16 (xmm0) + 16 (xmm1) + #define in %rdi + #define ctx %rdx + #define out %rsi + #define r13 %rsp + +#endif // i386 or x86_64 + +#ifdef KERNEL + sub $LOCAL_SIZE, r13 + movaps %xmm0, (r13) +#endif + movups (in), %xmm0 + + // key length identification + movl 240(ctx), %eax // key length + cmp $160, %eax + je L_AES_128 + cmp $192, %eax + je L_AES_192 + cmp $224, %eax + je L_AES_256 + mov $-1, %eax // return ERROR +#ifdef KERNEL + movaps (r13), %xmm0 + add $LOCAL_SIZE, r13 +#endif + ret + +L_AES_128: + testb $15, %dl // check whether expanded key is 16-byte aligned + jne 0f // if not 16-byte aligned, aesenc xmm, m128 won't work + pxor (ctx), %xmm0 + aesenc 16(ctx), %xmm0 + aesenc 32(ctx), %xmm0 + aesenc 48(ctx), %xmm0 + aesenc 64(ctx), %xmm0 + aesenc 80(ctx), %xmm0 + aesenc 96(ctx), %xmm0 + aesenc 112(ctx), %xmm0 + aesenc 128(ctx), %xmm0 + aesenc 144(ctx), %xmm0 + aesenclast 160(ctx), %xmm0 + xorl %eax, %eax + movups %xmm0, (out) +#ifdef KERNEL + movaps (r13), %xmm0 + add $LOCAL_SIZE, r13 +#endif + ret +0: // special case expanded key is not 16-byte aligned +#ifdef KERNEL + movaps %xmm1, 16(r13) // save xmm1 into stack +#endif + movups (ctx), %xmm1 + pxor %xmm1, %xmm0 + movups 16(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 32(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 48(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 64(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 80(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 96(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 112(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 128(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 144(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 160(ctx), %xmm1 + aesenclast %xmm1, %xmm0 + xorl %eax, %eax + movups %xmm0, (out) +#ifdef KERNEL + movaps (r13), %xmm0 + movaps 16(r13), %xmm1 + add $LOCAL_SIZE, r13 +#endif + ret + +L_AES_192: + testb $15, %dl // check whether expanded key is 16-byte aligned + jne 0f // if not 16-byte aligned, aesenc xmm, m128 won't work + pxor (ctx), %xmm0 + aesenc 16(ctx), %xmm0 + aesenc 32(ctx), %xmm0 + aesenc 48(ctx), %xmm0 + aesenc 64(ctx), %xmm0 + aesenc 80(ctx), %xmm0 + aesenc 96(ctx), %xmm0 + aesenc 112(ctx), %xmm0 + aesenc 128(ctx), %xmm0 + aesenc 144(ctx), %xmm0 + aesenc 160(ctx), %xmm0 + aesenc 176(ctx), %xmm0 + aesenclast 192(ctx), %xmm0 + xorl %eax, %eax + movups %xmm0, (out) +#ifdef KERNEL + movaps (r13), %xmm0 + add $LOCAL_SIZE, r13 +#endif + ret +0: // special case expanded key is not 16-byte aligned +#ifdef KERNEL + movaps %xmm1, 16(r13) // save xmm1 into stack +#endif + movups (ctx), %xmm1 + pxor %xmm1, %xmm0 + movups 16(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 32(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 48(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 64(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 80(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 96(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 112(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 128(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 144(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 160(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 176(ctx), %xmm1 + aesenc %xmm1, %xmm0 + 
movups 192(ctx), %xmm1 + aesenclast %xmm1, %xmm0 + xorl %eax, %eax + movups %xmm0, (out) +#ifdef KERNEL + movaps (r13), %xmm0 + movaps 16(r13), %xmm1 + add $LOCAL_SIZE, r13 +#endif + ret + +L_AES_256: + testb $15, %dl // check whether expanded key is 16-byte aligned + jne 0f // if not 16-byte aligned, aesenc xmm, m128 won't work + pxor (ctx), %xmm0 + aesenc 16(ctx), %xmm0 + aesenc 32(ctx), %xmm0 + aesenc 48(ctx), %xmm0 + aesenc 64(ctx), %xmm0 + aesenc 80(ctx), %xmm0 + aesenc 96(ctx), %xmm0 + aesenc 112(ctx), %xmm0 + aesenc 128(ctx), %xmm0 + aesenc 144(ctx), %xmm0 + aesenc 160(ctx), %xmm0 + aesenc 176(ctx), %xmm0 + aesenc 192(ctx), %xmm0 + aesenc 208(ctx), %xmm0 + aesenclast 224(ctx), %xmm0 + xorl %eax, %eax + movups %xmm0, (out) +#ifdef KERNEL + movaps (r13), %xmm0 + add $LOCAL_SIZE, r13 +#endif + ret +0: // special case expanded key is not 16-byte aligned +#ifdef KERNEL + movaps %xmm1, 16(r13) // save xmm1 into stack +#endif + movups (ctx), %xmm1 + pxor %xmm1, %xmm0 + movups 16(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 32(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 48(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 64(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 80(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 96(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 112(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 128(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 144(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 160(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 176(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 192(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 208(ctx), %xmm1 + aesenc %xmm1, %xmm0 + movups 224(ctx), %xmm1 + aesenclast %xmm1, %xmm0 + xorl %eax, %eax + movups %xmm0, (out) +#ifdef KERNEL + movaps (r13), %xmm0 + movaps 16(r13), %xmm1 + add $LOCAL_SIZE, r13 +#endif + ret + + + .text + .align 4,0x90 +.globl _aes_decrypt_hw +_aes_decrypt_hw: + +#if defined __i386__ + movl 4(%esp), %eax // in + movl 12(%esp), %edx // ctx + movl 8(%esp), %ecx // out + +#endif + +#ifdef KERNEL + sub $LOCAL_SIZE, r13 + movaps %xmm0, (r13) +#endif + movups (in), %xmm0 + + // key length identification + movl 240(ctx), %eax // key length + cmp $160, %eax + je 0f // AES-128 + cmp $192, %eax + je 1f // AES-192 + cmp $224, %eax + je 2f // AES-256 + mov $-1, %eax // return ERROR +#ifdef KERNEL + movaps (r13), %xmm0 + add $LOCAL_SIZE, r13 +#endif + ret + +0: // AES-128 + testb $15, %dl // check whether expanded key is 16-byte aligned + jne 9f // if not 16-byte aligned, aesenc xmm, m128 won't work + pxor 160(ctx), %xmm0 + aesdec 144(ctx), %xmm0 + aesdec 128(ctx), %xmm0 + aesdec 112(ctx), %xmm0 + aesdec 96(ctx), %xmm0 + aesdec 80(ctx), %xmm0 + aesdec 64(ctx), %xmm0 + aesdec 48(ctx), %xmm0 + aesdec 32(ctx), %xmm0 + aesdec 16(ctx), %xmm0 + aesdeclast (ctx), %xmm0 + xorl %eax, %eax + movups %xmm0, (out) +#ifdef KERNEL + movaps (r13), %xmm0 + add $LOCAL_SIZE, r13 +#endif + ret +9: // AES-128 Decrypt : special case expanded key is not 16-byte aligned +#ifdef KERNEL + movaps %xmm1, 16(r13) // save xmm1 into stack +#endif + movups 160(ctx), %xmm1 + pxor %xmm1, %xmm0 + movups 144(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 128(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 112(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 96(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 80(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 64(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 48(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 32(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 16(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups (ctx), %xmm1 + aesdeclast %xmm1, %xmm0 + xorl %eax, %eax + movups 
%xmm0, (out) +#ifdef KERNEL + movaps (r13), %xmm0 + movaps 16(r13), %xmm1 + add $LOCAL_SIZE, r13 +#endif + ret + +1: // AES-192 + testb $15, %dl // check whether expanded key is 16-byte aligned + jne 9f // if not 16-byte aligned, aesenc xmm, m128 won't work + pxor 192(ctx), %xmm0 + aesdec 176(ctx), %xmm0 + aesdec 160(ctx), %xmm0 + aesdec 144(ctx), %xmm0 + aesdec 128(ctx), %xmm0 + aesdec 112(ctx), %xmm0 + aesdec 96(ctx), %xmm0 + aesdec 80(ctx), %xmm0 + aesdec 64(ctx), %xmm0 + aesdec 48(ctx), %xmm0 + aesdec 32(ctx), %xmm0 + aesdec 16(ctx), %xmm0 + aesdeclast (ctx), %xmm0 + xorl %eax, %eax + movups %xmm0, (out) +#ifdef KERNEL + movaps (r13), %xmm0 + add $LOCAL_SIZE, r13 +#endif + ret +9: // AES-192 Decrypt : special case expanded key is not 16-byte aligned +#ifdef KERNEL + movaps %xmm1, 16(r13) // save xmm1 into stack +#endif + movups 192(ctx), %xmm1 + pxor %xmm1, %xmm0 + movups 176(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 160(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 144(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 128(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 112(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 96(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 80(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 64(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 48(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 32(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 16(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups (ctx), %xmm1 + aesdeclast %xmm1, %xmm0 + xorl %eax, %eax + movups %xmm0, (out) +#ifdef KERNEL + movaps (r13), %xmm0 + movaps 16(r13), %xmm1 + add $LOCAL_SIZE, r13 +#endif + ret + +2: // AES-256 + testb $15, %dl // check whether expanded key is 16-byte aligned + jne 9f // if not 16-byte aligned, aesenc xmm, m128 won't work + pxor 224(ctx), %xmm0 + aesdec 208(ctx), %xmm0 + aesdec 192(ctx), %xmm0 + aesdec 176(ctx), %xmm0 + aesdec 160(ctx), %xmm0 + aesdec 144(ctx), %xmm0 + aesdec 128(ctx), %xmm0 + aesdec 112(ctx), %xmm0 + aesdec 96(ctx), %xmm0 + aesdec 80(ctx), %xmm0 + aesdec 64(ctx), %xmm0 + aesdec 48(ctx), %xmm0 + aesdec 32(ctx), %xmm0 + aesdec 16(ctx), %xmm0 + aesdeclast (ctx), %xmm0 + xorl %eax, %eax + movups %xmm0, (out) +#ifdef KERNEL + movaps (r13), %xmm0 + add $LOCAL_SIZE, r13 +#endif + ret +9: // AES-256 Decrypt : special case expanded key is not 16-byte aligned +#ifdef KERNEL + movaps %xmm1, 16(r13) // save xmm1 into stack +#endif + movups 224(ctx), %xmm1 + pxor %xmm1, %xmm0 + movups 208(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 192(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 176(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 160(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 144(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 128(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 112(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 96(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 80(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 64(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 48(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 32(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups 16(ctx), %xmm1 + aesdec %xmm1, %xmm0 + movups (ctx), %xmm1 + aesdeclast %xmm1, %xmm0 + xorl %eax, %eax + movups %xmm0, (out) +#ifdef KERNEL + movaps (r13), %xmm0 + movaps 16(r13), %xmm1 + add $LOCAL_SIZE, r13 +#endif + ret + diff --git a/bsd/crypto/aes/i386/aes_key_hw.s b/bsd/crypto/aes/i386/aes_key_hw.s new file mode 100644 index 000000000..434fa553c --- /dev/null +++ b/bsd/crypto/aes/i386/aes_key_hw.s @@ -0,0 +1,405 @@ +/* This files defines _aes_encrypt_key_hw and _aes_decrypt_key_hw --- Intel Westmere HW AES-based implementation + of _aes_encrypt_key and _aes_decrypt_key. 
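+	(Both write the expanded key schedule at 16-byte offsets from ctx and
+	store its tail offset, 16*Nr (160, 192, or 224), at byte offset 240 of
+	ctx; this is the same layout the encrypt/decrypt routines read back.)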
+ + These 2 functions SHOULD BE entried ONLY after the AES HW is verified to be available. + They SHOULD NOT be called without AES HW detection. It might cause xnu to crash. + + The AES HW is detected 1st thing in + _aes_encrypt_key (ExpandKeyForEncryption.s) + _aes_decrypt_key (ExpandKeyForDecryption.s) + and, if AES HW is detected, branch without link (ie, jump) to the functions here. + + The implementation here follows the examples in an Intel White Paper + "Intel Advanced Encryption Standard (AES) Instruction Set" Rev.2 01 + + Note: Rev. 03 Final 2010 01 26 is available. Looks like some code change from Rev.2 01 + + cclee 3-13-10 +*/ + + .text + .align 4,0x90 + + // hw_aes_encrypt_key(key, klen, hwectx); + // klen = 16, 24, or 32, or (128/192/256) + + .globl _aes_encrypt_key_hw +_aes_encrypt_key_hw: + +#ifdef __i386__ + push %ebp + mov %esp, %ebp + push %ebx + push %edi + mov 8(%ebp), %eax // pointer to key + mov 12(%ebp), %ebx // klen + mov 16(%ebp), %edi // ctx + #define pkey %eax + #define klen %ebx + #define ctx %edi + #define sp %esp + #define cx %ecx +#else + #define pkey %rdi + #define klen %rsi + #define ctx %rdx + #define sp %rsp + #define cx %rcx + push %rbp + mov %rsp, %rbp +#endif + +#ifdef KERNEL + // for xmm registers save and restore + sub $(16*4), sp +#endif + + cmp $32, klen + jg 0f // klen>32 + shl $3, klen // convert 16/24/32 to 128/192/256 +0: + + cmp $128, klen // AES-128 ? + je L_AES_128_Encrypt_Key + cmp $192, klen // AES-192 ? + je L_AES_192_Encrypt_Key + cmp $256, klen // AES-256 ? + je L_AES_256_Encrypt_Key + mov $1, %eax // return error for wrong klen +L_Encrypt_Key_2_return: +#ifdef KERNEL + add $(16*4), sp +#endif +#ifdef __i386__ + pop %edi + pop %ebx +#endif + leave + ret + +L_AES_128_Encrypt_Key: +#ifdef KERNEL + // save xmm registers + movaps %xmm1, (sp) + movaps %xmm2, 16(sp) + movaps %xmm3, 32(sp) +#endif // KERNEL + + movl $160, 240(ctx) // write expanded key length to ctx + xor cx, cx + + movups (pkey), %xmm1 + movups %xmm1, (ctx) + aeskeygenassist $1, %xmm1, %xmm2 + call L_key_expansion_128 + aeskeygenassist $2, %xmm1, %xmm2 + call L_key_expansion_128 + aeskeygenassist $4, %xmm1, %xmm2 + call L_key_expansion_128 + aeskeygenassist $8, %xmm1, %xmm2 + call L_key_expansion_128 + aeskeygenassist $0x10, %xmm1, %xmm2 + call L_key_expansion_128 + aeskeygenassist $0x20, %xmm1, %xmm2 + call L_key_expansion_128 + aeskeygenassist $0x40, %xmm1, %xmm2 + call L_key_expansion_128 + aeskeygenassist $0x80, %xmm1, %xmm2 + call L_key_expansion_128 + aeskeygenassist $0x1b, %xmm1, %xmm2 + call L_key_expansion_128 + aeskeygenassist $0x36, %xmm1, %xmm2 + call L_key_expansion_128 + +#ifdef KERNEL + // restore xmm registers + movaps (sp), %xmm1 + movaps 16(sp), %xmm2 + movaps 32(sp), %xmm3 +#endif // KERNEL + xor %eax, %eax // return 0 for success + jmp L_Encrypt_Key_2_return + + .align 4, 0x90 +L_key_expansion_128: + pshufd $0xff, %xmm2, %xmm2 + movaps %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + movaps %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + movaps %xmm1, %xmm3 + pslldq $4, %xmm3 + pxor %xmm3, %xmm1 + pxor %xmm2, %xmm1 + add $16, cx + movups %xmm1, (ctx, cx) + ret + +L_AES_192_Encrypt_Key: +#ifdef KERNEL + // save xmm registers + movaps %xmm1, (sp) + movaps %xmm2, 16(sp) + movaps %xmm3, 32(sp) + movaps %xmm4, 48(sp) +#endif // KERNEL + movl $192, 240(ctx) // write expanded key length to ctx + + movups (pkey), %xmm1 + movq 16(pkey), %xmm3 + + movups %xmm1, (ctx) + movq %xmm3, 16(ctx) + + lea 24(ctx), cx + + aeskeygenassist $1, %xmm3, %xmm2 + call 
L_key_expansion_192 + aeskeygenassist $2, %xmm3, %xmm2 + call L_key_expansion_192 + aeskeygenassist $4, %xmm3, %xmm2 + call L_key_expansion_192 + aeskeygenassist $8, %xmm3, %xmm2 + call L_key_expansion_192 + aeskeygenassist $0x10, %xmm3, %xmm2 + call L_key_expansion_192 + aeskeygenassist $0x20, %xmm3, %xmm2 + call L_key_expansion_192 + aeskeygenassist $0x40, %xmm3, %xmm2 + call L_key_expansion_192 + aeskeygenassist $0x80, %xmm3, %xmm2 + call L_key_expansion_192 + +#ifdef KERNEL + // restore xmm registers + movaps (sp), %xmm1 + movaps 16(sp), %xmm2 + movaps 32(sp), %xmm3 + movaps 48(sp), %xmm4 +#endif // KERNEL + xor %eax, %eax // return 0 for success + jmp L_Encrypt_Key_2_return + + .align 4, 0x90 +L_key_expansion_192: + pshufd $0x55, %xmm2, %xmm2 + + movaps %xmm1, %xmm4 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm1 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm1 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm1 + pxor %xmm2, %xmm1 + + pshufd $0xff, %xmm1, %xmm2 + + movaps %xmm3, %xmm4 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm3 + pxor %xmm2, %xmm3 + + movups %xmm1, (cx) + movq %xmm3, 16(cx) + + add $24, cx + ret + +L_AES_256_Encrypt_Key: +#ifdef KERNEL + // save xmm registers + movaps %xmm1, (sp) + movaps %xmm2, 16(sp) + movaps %xmm3, 32(sp) + movaps %xmm4, 48(sp) +#endif // KERNEL + movl $224, 240(ctx) // write expanded key length to ctx + + movups (pkey), %xmm1 + movups 16(pkey), %xmm3 + movups %xmm1, (ctx) + movups %xmm3, 16(ctx) + + lea 32(ctx), cx + + aeskeygenassist $1, %xmm3, %xmm2 + call L_key_expansion_256 + aeskeygenassist $2, %xmm3, %xmm2 + call L_key_expansion_256 + aeskeygenassist $4, %xmm3, %xmm2 + call L_key_expansion_256 + aeskeygenassist $8, %xmm3, %xmm2 + call L_key_expansion_256 + aeskeygenassist $0x10, %xmm3, %xmm2 + call L_key_expansion_256 + aeskeygenassist $0x20, %xmm3, %xmm2 + call L_key_expansion_256 + aeskeygenassist $0x40, %xmm3, %xmm2 + call L_key_expansion_256_final + +#ifdef KERNEL + // restore xmm registers + movaps (sp), %xmm1 + movaps 16(sp), %xmm2 + movaps 32(sp), %xmm3 + movaps 48(sp), %xmm4 +#endif // KERNEL + xor %eax, %eax // return 0 for success + jmp L_Encrypt_Key_2_return + + .align 4, 0x90 +L_key_expansion_256: + + pshufd $0xff, %xmm2, %xmm2 + + movaps %xmm1, %xmm4 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm1 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm1 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm1 + pxor %xmm2, %xmm1 + + movups %xmm1, (cx) + + aeskeygenassist $0, %xmm1, %xmm4 + + pshufd $0xaa, %xmm4, %xmm2 + + movaps %xmm3, %xmm4 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm3 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm3 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm3 + pxor %xmm2, %xmm3 + + movups %xmm3, 16(cx) + + add $32, cx + ret + + .align 4, 0x90 +L_key_expansion_256_final: + + pshufd $0xff, %xmm2, %xmm2 + + movaps %xmm1, %xmm4 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm1 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm1 + pslldq $4, %xmm4 + + pxor %xmm4, %xmm1 + pxor %xmm2, %xmm1 + + movups %xmm1, (cx) + ret + +// _aes_decrypt_key_hw is implemented as +// 1. call _aes_encrypt_key_hw +// 2. 
use aesimc to convert the expanded round keys (except the 1st and last round keys) + + .text + .align 4, 0x90 + .globl _aes_decrypt_key_hw +_aes_decrypt_key_hw: + +#ifdef __i386__ + + push %ebp + mov %esp, %ebp + sub $(8+16), %esp + + // copy input arguments for calling aes_decrypt_key_hw + + mov 8(%ebp), %eax + mov %eax, (%esp) + mov 12(%ebp), %eax + mov %eax, 4(%esp) + mov 16(%ebp), %eax + mov %eax, 8(%esp) + +#else + + push %rbp + mov %rsp, %rbp + sub $16, %rsp + + // calling arguments %rdi/%rsi/%rdx will be used for encrypt_key + // %rdx (ctx) will return unchanged + // %rsi (klen) will (<<3) if <= 32 + +#endif + call _aes_encrypt_key_hw + cmp $0, %eax + je L_decrypt_inv +L_decrypt_almost_done: +#ifdef __i386__ + add $(8+16), %esp +#else + add $16, %rsp +#endif + leave + ret + +L_decrypt_inv: +#ifdef KERNEL + movaps %xmm0, (sp) +#endif + +#ifdef __i386__ + #undef klen + #undef ctx + mov 12(%ebp), %eax // klen + mov 16(%ebp), %edx // ctx + #define klen %eax + #define ctx %edx + cmp $32, klen + jg 0f // klen>32 + shl $3, klen // convert 16/24/32 to 128/192/256 +0: +#endif + + mov $9, cx // default is AES-128 + cmp $128, klen + je L_Decrypt_Key + add $2, cx + cmp $192, klen + je L_Decrypt_Key + add $2, cx + +L_Decrypt_Key: + add $16, ctx + movups (ctx), %xmm0 + aesimc %xmm0, %xmm0 + movups %xmm0, (ctx) + sub $1, cx + jg L_Decrypt_Key + +#ifdef KERNEL + movaps (sp), %xmm0 +#endif +#ifdef __i386__ + xor %eax, %eax +#endif + jmp L_decrypt_almost_done + diff --git a/bsd/crypto/aes/i386/aes_modes.c b/bsd/crypto/aes/i386/aes_modes.c deleted file mode 100644 index fd8b1401b..000000000 --- a/bsd/crypto/aes/i386/aes_modes.c +++ /dev/null @@ -1,471 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue 31/01/2006 - - These subroutines implement multiple block AES modes for ECB, CBC, CFB, - OFB and CTR encryption, The code provides support for the VIA Advanced - Cryptography Engine (ACE). 
- - NOTE: In the following subroutines, the AES contexts (ctx) must be - 16 byte aligned if VIA ACE is being used -*/ - -//#include <memory.h> -#include <kern/assert.h> - -#include "aesopt.h" - -#if defined( AES_MODES ) -#if defined(__cplusplus) -extern "C" -{ -#endif - -#if defined( _MSC_VER ) && ( _MSC_VER > 800 ) -#pragma intrinsic(memcpy) -#define in_line __inline -#else -#define in_line -#endif - -#define BFR_BLOCKS 8 - -/* These values are used to detect long word alignment in order to */ -/* speed up some buffer operations. This facility may not work on */ -/* some machines so this define can be commented out if necessary */ - -#define FAST_BUFFER_OPERATIONS -#pragma warning( disable : 4311 4312 ) - -#define lp08(x) ((uint_8t*)(x)) -#define lp32(x) ((uint_32t*)(x)) -#define addr_mod_04(x) ((unsigned long)(x) & 3) -#define addr_mod_16(x) ((unsigned long)(x) & 15) - -#if defined( USE_VIA_ACE_IF_PRESENT ) - -#include "via_ace.h" - -#pragma pack(16) - -aligned_array(unsigned long, enc_gen_table, 12, 16) = NEH_ENC_GEN_DATA; -aligned_array(unsigned long, enc_load_table, 12, 16) = NEH_ENC_LOAD_DATA; -aligned_array(unsigned long, enc_hybrid_table, 12, 16) = NEH_ENC_HYBRID_DATA; -aligned_array(unsigned long, dec_gen_table, 12, 16) = NEH_DEC_GEN_DATA; -aligned_array(unsigned long, dec_load_table, 12, 16) = NEH_DEC_LOAD_DATA; -aligned_array(unsigned long, dec_hybrid_table, 12, 16) = NEH_DEC_HYBRID_DATA; - -/* NOTE: These control word macros must only be used after */ -/* a key has been set up because they depend on key size */ - -#if NEH_KEY_TYPE == NEH_LOAD -#define kd_adr(c) ((uint_8t*)(c)->ks) -#elif NEH_KEY_TYPE == NEH_GENERATE -#define kd_adr(c) ((uint_8t*)(c)->ks + (c)->inf.b[0]) -#else -#define kd_adr(c) ((uint_8t*)(c)->ks + ((c)->inf.b[0] == 160 ? 160 : 0)) -#endif - -#else - -#define aligned_array(type, name, no, stride) type name[no] -#define aligned_auto(type, name, no, stride) type name[no] - -#endif - -#if defined( _MSC_VER ) && _MSC_VER > 1200 - -#define via_cwd(cwd, ty, dir, len) unsigned long* cwd = (dir##_##ty##_table + ((len - 128) >> 4)) - -#else - -#define via_cwd(cwd, ty, dir, len) \ - aligned_auto(unsigned long, cwd, 4, 16); \ - cwd[1] = cwd[2] = cwd[3] = 0; \ - cwd[0] = neh_##dir##_##ty##_key(len) - -#endif - -/* implemented in case of wrong call for fixed tables */ -void gen_tabs(void) -{ -} - -aes_rval aes_mode_reset(aes_encrypt_ctx ctx[1]) -{ - ctx->inf.b[2] = 0; - return 0; -} - -aes_rval aes_ecb_encrypt(const unsigned char *ibuf, unsigned char *obuf, - int len, const aes_encrypt_ctx ctx[1]) -{ int nb = len >> 4; - - if(len & (AES_BLOCK_SIZE - 1)) - return 1; - -#if defined( USE_VIA_ACE_IF_PRESENT ) - - if(ctx->inf.b[1] == 0xff) - { uint_8t *ksp = (uint_8t*)(ctx->ks); - via_cwd(cwd, hybrid, enc, 2* ctx->inf.b[0] - 192); - - if(addr_mod_16(ctx)) - return 1; - - if(!addr_mod_16(ibuf) && !addr_mod_16(obuf)) - { - via_ecb_op5(ksp,cwd,ibuf,obuf,nb); - } - else - { aligned_auto(uint_8t, buf, BFR_BLOCKS * AES_BLOCK_SIZE, 16); - uint_8t *ip, *op; - - while(nb) - { - int m = (nb > BFR_BLOCKS ? BFR_BLOCKS : nb); - - ip = (addr_mod_16(ibuf) ? buf : (uint_8t*)ibuf); - op = (addr_mod_16(obuf) ? 
buf : obuf); - - if(ip != ibuf) - memcpy(buf, ibuf, m * AES_BLOCK_SIZE); - - via_ecb_op5(ksp,cwd,ip,op,m); - - if(op != obuf) - memcpy(obuf, buf, m * AES_BLOCK_SIZE); - - ibuf += m * AES_BLOCK_SIZE; - obuf += m * AES_BLOCK_SIZE; - nb -= m; - } - } - - return 0; - } - -#endif - -#if !defined( ASSUME_VIA_ACE_PRESENT ) - while(nb--) - { - aes_encrypt(ibuf, obuf, ctx); - ibuf += AES_BLOCK_SIZE; - obuf += AES_BLOCK_SIZE; - } -#endif - return 0; -} - -aes_rval aes_ecb_decrypt(const unsigned char *ibuf, unsigned char *obuf, - int len, const aes_decrypt_ctx ctx[1]) -{ int nb = len >> 4; - - if(len & (AES_BLOCK_SIZE - 1)) - return 1; - -#if defined( USE_VIA_ACE_IF_PRESENT ) - - if(ctx->inf.b[1] == 0xff) - { uint_8t *ksp = kd_adr(ctx); - via_cwd(cwd, hybrid, dec, 2* ctx->inf.b[0] - 192); - - if(addr_mod_16(ctx)) - return 1; - - if(!addr_mod_16(ibuf) && !addr_mod_16(obuf)) - { - via_ecb_op5(ksp,cwd,ibuf,obuf,nb); - } - else - { aligned_auto(uint_8t, buf, BFR_BLOCKS * AES_BLOCK_SIZE, 16); - uint_8t *ip, *op; - - while(nb) - { - int m = (nb > BFR_BLOCKS ? BFR_BLOCKS : nb); - - ip = (addr_mod_16(ibuf) ? buf : (uint_8t*)ibuf); - op = (addr_mod_16(obuf) ? buf : obuf); - - if(ip != ibuf) - memcpy(buf, ibuf, m * AES_BLOCK_SIZE); - - via_ecb_op5(ksp,cwd,ip,op,m); - - if(op != obuf) - memcpy(obuf, buf, m * AES_BLOCK_SIZE); - - ibuf += m * AES_BLOCK_SIZE; - obuf += m * AES_BLOCK_SIZE; - nb -= m; - } - } - - return 0; - } - -#endif - -#if !defined( ASSUME_VIA_ACE_PRESENT ) - while(nb--) - { - aes_decrypt(ibuf, obuf, ctx); - ibuf += AES_BLOCK_SIZE; - obuf += AES_BLOCK_SIZE; - } -#endif - return 0; -} - -aes_rval aes_cbc_encrypt(const unsigned char *ibuf, unsigned char *obuf, - int len, unsigned char *iv, const aes_encrypt_ctx ctx[1]) -{ int nb = len >> 4; - - if(len & (AES_BLOCK_SIZE - 1)) - return 1; - -#if defined( USE_VIA_ACE_IF_PRESENT ) - - if(ctx->inf.b[1] == 0xff) - { uint_8t *ksp = (uint_8t*)(ctx->ks), *ivp = iv; - aligned_auto(uint_8t, liv, AES_BLOCK_SIZE, 16); - via_cwd(cwd, hybrid, enc, 2* ctx->inf.b[0] - 192); - - if(addr_mod_16(ctx)) - return 1; - - if(addr_mod_16(iv)) /* ensure an aligned iv */ - { - ivp = liv; - memcpy(liv, iv, AES_BLOCK_SIZE); - } - - if(!addr_mod_16(ibuf) && !addr_mod_16(obuf) && !addr_mod_16(iv)) - { - via_cbc_op7(ksp,cwd,ibuf,obuf,nb,ivp,ivp); - } - else - { aligned_auto(uint_8t, buf, BFR_BLOCKS * AES_BLOCK_SIZE, 16); - uint_8t *ip, *op; - - while(nb) - { - int m = (nb > BFR_BLOCKS ? BFR_BLOCKS : nb); - - ip = (addr_mod_16(ibuf) ? buf : (uint_8t*)ibuf); - op = (addr_mod_16(obuf) ? 
buf : obuf); - - if(ip != ibuf) - memcpy(buf, ibuf, m * AES_BLOCK_SIZE); - - via_cbc_op7(ksp,cwd,ip,op,m,ivp,ivp); - - if(op != obuf) - memcpy(obuf, buf, m * AES_BLOCK_SIZE); - - ibuf += m * AES_BLOCK_SIZE; - obuf += m * AES_BLOCK_SIZE; - nb -= m; - } - } - - if(iv != ivp) - memcpy(iv, ivp, AES_BLOCK_SIZE); - - return 0; - } - -#endif - -#if !defined( ASSUME_VIA_ACE_PRESENT ) -# ifdef FAST_BUFFER_OPERATIONS - if(!addr_mod_04(ibuf) && !addr_mod_04(iv)) - while(nb--) - { - lp32(iv)[0] ^= lp32(ibuf)[0]; - lp32(iv)[1] ^= lp32(ibuf)[1]; - lp32(iv)[2] ^= lp32(ibuf)[2]; - lp32(iv)[3] ^= lp32(ibuf)[3]; - aes_encrypt(iv, iv, ctx); - memcpy(obuf, iv, AES_BLOCK_SIZE); - ibuf += AES_BLOCK_SIZE; - obuf += AES_BLOCK_SIZE; - } - else -# endif - while(nb--) - { - iv[ 0] ^= ibuf[ 0]; iv[ 1] ^= ibuf[ 1]; - iv[ 2] ^= ibuf[ 2]; iv[ 3] ^= ibuf[ 3]; - iv[ 4] ^= ibuf[ 4]; iv[ 5] ^= ibuf[ 5]; - iv[ 6] ^= ibuf[ 6]; iv[ 7] ^= ibuf[ 7]; - iv[ 8] ^= ibuf[ 8]; iv[ 9] ^= ibuf[ 9]; - iv[10] ^= ibuf[10]; iv[11] ^= ibuf[11]; - iv[12] ^= ibuf[12]; iv[13] ^= ibuf[13]; - iv[14] ^= ibuf[14]; iv[15] ^= ibuf[15]; - aes_encrypt(iv, iv, ctx); - memcpy(obuf, iv, AES_BLOCK_SIZE); - ibuf += AES_BLOCK_SIZE; - obuf += AES_BLOCK_SIZE; - } -#endif - return 0; -} - -aes_rval aes_encrypt_cbc(const unsigned char *in_blk, const unsigned char *in_iv, unsigned int num_blk, - unsigned char *out_blk, const aes_encrypt_ctx cx[1]) -{ - unsigned char tmp_iv[16]; - int i; - - for (i = 0; i < 16; i++) - tmp_iv[i] = *(in_iv + i); - - return aes_cbc_encrypt(in_blk, out_blk, num_blk<<4, tmp_iv, cx); - -} - -aes_rval aes_cbc_decrypt(const unsigned char *ibuf, unsigned char *obuf, - int len, unsigned char *iv, const aes_decrypt_ctx ctx[1]) -{ unsigned char tmp[AES_BLOCK_SIZE]; - int nb = len >> 4; - - if(len & (AES_BLOCK_SIZE - 1)) - return 1; - -#if defined( USE_VIA_ACE_IF_PRESENT ) - - if(ctx->inf.b[1] == 0xff) - { uint_8t *ksp = kd_adr(ctx), *ivp = iv; - aligned_auto(uint_8t, liv, AES_BLOCK_SIZE, 16); - via_cwd(cwd, hybrid, dec, 2* ctx->inf.b[0] - 192); - - if(addr_mod_16(ctx)) - return 1; - - if(addr_mod_16(iv)) /* ensure an aligned iv */ - { - ivp = liv; - memcpy(liv, iv, AES_BLOCK_SIZE); - } - - if(!addr_mod_16(ibuf) && !addr_mod_16(obuf) && !addr_mod_16(iv)) - { - via_cbc_op6(ksp,cwd,ibuf,obuf,nb,ivp); - } - else - { aligned_auto(uint_8t, buf, BFR_BLOCKS * AES_BLOCK_SIZE, 16); - uint_8t *ip, *op; - - while(nb) - { - int m = (nb > BFR_BLOCKS ? BFR_BLOCKS : nb); - - ip = (addr_mod_16(ibuf) ? buf : (uint_8t*)ibuf); - op = (addr_mod_16(obuf) ? 
buf : obuf); - - if(ip != ibuf) - memcpy(buf, ibuf, m * AES_BLOCK_SIZE); - - via_cbc_op6(ksp,cwd,ip,op,m,ivp); - - if(op != obuf) - memcpy(obuf, buf, m * AES_BLOCK_SIZE); - - ibuf += m * AES_BLOCK_SIZE; - obuf += m * AES_BLOCK_SIZE; - nb -= m; - } - } - - if(iv != ivp) - memcpy(iv, ivp, AES_BLOCK_SIZE); - - return 0; - } -#endif - -#if !defined( ASSUME_VIA_ACE_PRESENT ) -# ifdef FAST_BUFFER_OPERATIONS - if(!addr_mod_04(obuf) && !addr_mod_04(iv)) - while(nb--) - { - memcpy(tmp, ibuf, AES_BLOCK_SIZE); - aes_decrypt(ibuf, obuf, ctx); - lp32(obuf)[0] ^= lp32(iv)[0]; - lp32(obuf)[1] ^= lp32(iv)[1]; - lp32(obuf)[2] ^= lp32(iv)[2]; - lp32(obuf)[3] ^= lp32(iv)[3]; - memcpy(iv, tmp, AES_BLOCK_SIZE); - ibuf += AES_BLOCK_SIZE; - obuf += AES_BLOCK_SIZE; - } - else -# endif - while(nb--) - { - memcpy(tmp, ibuf, AES_BLOCK_SIZE); - aes_decrypt(ibuf, obuf, ctx); - obuf[ 0] ^= iv[ 0]; obuf[ 1] ^= iv[ 1]; - obuf[ 2] ^= iv[ 2]; obuf[ 3] ^= iv[ 3]; - obuf[ 4] ^= iv[ 4]; obuf[ 5] ^= iv[ 5]; - obuf[ 6] ^= iv[ 6]; obuf[ 7] ^= iv[ 7]; - obuf[ 8] ^= iv[ 8]; obuf[ 9] ^= iv[ 9]; - obuf[10] ^= iv[10]; obuf[11] ^= iv[11]; - obuf[12] ^= iv[12]; obuf[13] ^= iv[13]; - obuf[14] ^= iv[14]; obuf[15] ^= iv[15]; - memcpy(iv, tmp, AES_BLOCK_SIZE); - ibuf += AES_BLOCK_SIZE; - obuf += AES_BLOCK_SIZE; - } -#endif - return 0; -} - -aes_rval aes_decrypt_cbc(const unsigned char *in_blk, const unsigned char *in_iv, unsigned int num_blk, - unsigned char *out_blk, const aes_decrypt_ctx cx[1]) -{ - unsigned char tmp_iv[16]; - int i; - - for (i = 0; i < 16; i++) - tmp_iv[i] = *(in_iv + i); - - return aes_cbc_decrypt(in_blk, out_blk, num_blk<<4, tmp_iv, cx); - -} - - -#if defined(__cplusplus) -} -#endif -#endif diff --git a/bsd/crypto/aes/i386/aes_modes_asm.s b/bsd/crypto/aes/i386/aes_modes_asm.s new file mode 100644 index 000000000..3b0f29aa1 --- /dev/null +++ b/bsd/crypto/aes/i386/aes_modes_asm.s @@ -0,0 +1,420 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue 31/01/2006 + + These subroutines implement multiple block AES modes for ECB, CBC, CFB, + OFB and CTR encryption, The code provides support for the VIA Advanced + Cryptography Engine (ACE). 
+ + NOTE: In the following subroutines, the AES contexts (ctx) must be + 16 byte aligned if VIA ACE is being used +*/ + +/* modified 3/5/10 cclee */ +/* Clean up those related to VIA ACE and hand optimize aes_cbc_encrypt and aes_cbc_decrypt */ +/* move the xmm registers save/restore originally inside the callee functions into these 2 caller functions */ + +/* add code comments/description and HW AES detection and execution branch cclee 3-13-10 */ + +#ifdef KERNEL +#include <i386/cpu_capabilities.h> // to use __cpu_capabilities&kHasAES to detect Intel Westmere AES HW +#else +#include <System/i386/cpu_capabilities.h> // to use __cpu_capabilities&kHasAES to detect Intel Westmere AES HW +#endif + +#if 0 + +// TODO: +// aes_ecb_encrypt and aes_ecb_decrypt are not present in gen/aescrypt.c +// would add the implementation if needed +// they are now compiled from aes_modes.c + +aes_rval aes_ecb_encrypt(const unsigned char *ibuf, unsigned char *obuf, + int len, const aes_encrypt_ctx ctx[1]) +{ int nb = len >> 4; + + if(len & (AES_BLOCK_SIZE - 1)) return 1; + while(nb--) { + aes_encrypt(ibuf, obuf, ctx); + ibuf += AES_BLOCK_SIZE; + obuf += AES_BLOCK_SIZE; + } + return 0; +} + +aes_rval aes_ecb_decrypt(const unsigned char *ibuf, unsigned char *obuf, + int len, const aes_decrypt_ctx ctx[1]) +{ int nb = len >> 4; + + if(len & (AES_BLOCK_SIZE - 1)) return 1; + while(nb--) { + aes_decrypt(ibuf, obuf, ctx); + ibuf += AES_BLOCK_SIZE; + obuf += AES_BLOCK_SIZE; + } + return 0; +} +#endif + +#if 0 +aes_rval aes_encrypt_cbc(const unsigned char *ibuf, const unsigned char *in_iv, unsigned int num_blk, + unsigned char *obuf, const aes_encrypt_ctx ctx[1]) +{ + unsigned char iv[16]; + int i; + + for (i = 0; i < 16; i++) iv[i] = *(in_iv + i); + + while (num_blk--) { + iv ^= ibuf; // 128-bit + aes_encrypt(iv, iv, ctx); + memcpy(obuf, iv, AES_BLOCK_SIZE); + ibuf += AES_BLOCK_SIZE; + obuf += AES_BLOCK_SIZE; + + } + + return 0; +} +#endif + + .text + .align 4,0x90 + .globl _aes_encrypt_cbc +_aes_encrypt_cbc: + + // detect AES HW + // if AES HW detected, branch to AES-HW-specific function _aes_encrypt_cbc_hw (aes_modes_hw.s) + // o.w., fall through to the original AES-SW function + +#if defined __x86_64__ + movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capability + mov (%rax), %eax // %eax = __cpu_capabilities +#else +#ifdef KERNEL + leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities + mov (%eax), %eax // %eax = __cpu_capabilities +#else + mov _COMM_PAGE_CPU_CAPABILITIES, %eax +#endif +#endif + test $(kHasAES), %eax // kHasAES & __cpu_capabilities + jne _aes_encrypt_cbc_hw // if AES HW detected, branch to HW-specific code + + // save registers and allocate stack memory for xmm registers and calling arguments (i386 only) +#if defined __i386__ + push %ebp + mov %esp, %ebp + push %ebx // to be used as ibuf + push %edi // to be used as obuf + sub $(16+16+7*16), %esp // 12 (calling arguments) + 4 (%esi) + 16 (iv) + 7*16 (xmm) + mov %esi, 12(%esp) // save %esp in the unused 4-bytes, to be used as num_blk + + #define sp %esp +#else // __x86_64__ + push %rbp + mov %rsp, %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + sub $(8+16+5*16+16), %rsp // 8 (align) + 16 (dummy iv) + 5*16 (xmm) + 16 (for i386-x86_64 consistency) + + #define sp %rsp +#endif + + // save xmm registers for kernel use + // xmm6-xmm7 will be used locally + // xmm0-xmm2 (x86_64) or xmm0-/xmm4 (i386) will be used inside _aes_encrypt_xmm_no_save (non-restored) + // there is a hole not used for xmm, which is 48(sp). 
+ // it has been used to store iv (16-bytes) in i386 code + // for consistency between i386 and x86_64, this hole is dummied in x86_64 code + // also the 1st 16 bytes (sp) is dummied in x86_64 code + +#ifdef KERNEL + movaps %xmm7, 16(sp) + movaps %xmm6, 32(sp) + movaps %xmm0, 64(sp) + movaps %xmm1, 80(sp) + movaps %xmm2, 96(sp) +#if defined __i386__ + movaps %xmm3, 112(sp) + movaps %xmm4, 128(sp) +#endif +#endif + + // set up registers from calling arguments + +#if defined __i386__ + + mov 12(%ebp), %eax // in_iv + mov 24(%ebp), %edx // ctx + movups (%eax), %xmm7 // in_iv + lea 48(%esp), %eax // &iv[0] + mov %eax, (%esp) // 1st iv for aes_encrypt + mov %eax, 4(%esp) // 2nd iv for aes_encrypt + mov %edx, 8(%esp) // ctx for aes_encrypt + mov 8(%ebp), %ebx // ibuf + mov 16(%ebp), %esi // num_blk + mov 20(%ebp), %edi // obuf + + #define ibuf %ebx + #define obuf %edi + #define num_blk %esi + +#else // __x86_64__, calling arguments order : rdi/rsi/rdx/rcx/r8 + + mov %rdi, %rbx // ibuf + lea 48(sp), %r12 // &iv + movups (%rsi), %xmm7 // in_iv + mov %rdx, %r13 // num_blk + mov %rcx, %r14 // obuf + mov %r8, %r15 // ctx + + #define ibuf %rbx + #define iv %r12 + #define num_blk %r13d + #define obuf %r14 + #define ctx %r15 + +#endif + + cmp $1, num_blk // num_blk vs 1 + jl 9f // if num_blk < 1, branch to bypass the main loop +0: + movups (ibuf), %xmm6 // ibuf +#if defined __i386__ + lea 48(sp), %eax // &iv[0] + pxor %xmm6, %xmm7 // iv ^= ibuf + movups %xmm7, (%eax) // save iv +#else + pxor %xmm6, %xmm7 // iv ^= ibuf + movups %xmm7, (iv) // save iv + mov iv, %rdi // 1st calling argument for aes_encrypt + mov iv, %rsi // 2nd calling argument for aes_encrypt + mov ctx, %rdx // 3rd calling argument for aes_encrypt +#endif + call _aes_encrypt_xmm_no_save // aes_encrypt(iv, iv, ctx) +#if defined __i386__ + leal 48(%esp), %eax // &iv[0] + movups (%eax), %xmm7 // read iv +#else + movups (iv), %xmm7 // read iv +#endif + movups %xmm7, (obuf) // memcpy(obuf, iv, AES_BLOCK_SIZE); + add $16, ibuf // ibuf += AES_BLOCK_SIZE; + add $16, obuf // obuf += AES_BLOCK_SIZE; + sub $1, num_blk // num_blk -- + jg 0b // if num_blk > 0, repeat the loop +9: + +L_crypt_cbc_done: + + // restore xmm registers due to kernel use +#ifdef KERNEL + movaps 16(sp), %xmm7 + movaps 32(sp), %xmm6 + movaps 64(sp), %xmm0 + movaps 80(sp), %xmm1 + movaps 96(sp), %xmm2 +#if defined __i386__ + movaps 112(sp), %xmm3 + movaps 128(sp), %xmm4 +#endif +#endif + + xor %eax, %eax // to return 0 for SUCCESS + +#if defined __i386__ + mov 12(%esp), %esi // restore %esi + add $(16+16+7*16), %esp // 12 (calling arguments) + 4 (%esi) + 16 (iv) + 7*16 (xmm) + pop %edi + pop %ebx +#else + add $(8+16+5*16+16), %rsp // 8 (align) + 16 (dummy iv) + 5*16 (xmm) + 16 (for i386-x86_64 consistency) + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx +#endif + leave + ret + +#if 0 +aes_rval aes_decrypt_cbc(const unsigned char *ibuf, const unsigned char *in_iv, unsigned int num_blk, + unsigned char *obuf, const aes_decrypt_ctx cx[1]) +{ + unsigned char iv[16], tmp[16]; + int i; + + for (i = 0; i < 16; i++) iv[i] = *(in_iv + i); + + while (num_blk--) { + + memcpy(tmp, ibuf, AES_BLOCK_SIZE); + aes_decrypt(ibuf, obuf, ctx); + obuf ^= iv; + memcpy(iv, tmp, AES_BLOCK_SIZE); + ibuf += AES_BLOCK_SIZE; + obuf += AES_BLOCK_SIZE; + } + + return 0; +} +#endif + + .text + .align 4,0x90 + .globl _aes_decrypt_cbc +_aes_decrypt_cbc: + + // detect AES HW + // if AES HW detected, branch to AES-HW-specific function _aes_decrypt_cbc_hw (aes_modes_hw.s) + // o.w., fall through to the 
original AES-SW function + +#if defined __x86_64__ + movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capability + mov (%rax), %eax // %eax = __cpu_capabilities +#else +#ifdef KERNEL + leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities + mov (%eax), %eax // %eax = __cpu_capabilities +#else + mov _COMM_PAGE_CPU_CAPABILITIES, %eax +#endif +#endif + test $(kHasAES), %eax // kHasAES & __cpu_capabilities + jne _aes_decrypt_cbc_hw + + // save registers and allocate stack memory for xmm registers and calling arguments (i386 only) +#if defined __i386__ + push %ebp + mov %esp, %ebp + push %ebx // to be used as ibuf + push %edi // to be used as obuf + sub $(16+16+7*16), %esp // 12 (calling arguments) + 4 (%esi) + 16 (iv) + 7*16 (xmm) + mov %esi, 12(%esp) // save %esp in the unused 4-bytes, to be used as num_blk + + #define sp %esp +#else // __x86_64__ + push %rbp + mov %rsp, %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + sub $(8+16+5*16+16), %rsp // 8 (align) + 16 (dummy iv) + 5*16 (xmm) + 16 (for i386-x86_64 consistency) + + #define sp %rsp +#endif + + // save xmm registers for kernel use + // xmm6-xmm7 will be used locally + // xmm0-xmm2 (x86_64) or xmm0-/xmm4 (i386) will be used inside _aes_encrypt_xmm_no_save (non-restored) + // there is a hole not used for xmm, which is 48(sp). + // it has been used to store iv (16-bytes) in i386 code + // for consistency between i386 and x86_64, this hole is dummied in x86_64 code + // also the 1st 16 bytes (sp) is dummied in x86_64 code + +#ifdef KERNEL + movaps %xmm7, 16(sp) + movaps %xmm6, 32(sp) + movaps %xmm0, 64(sp) + movaps %xmm1, 80(sp) + movaps %xmm2, 96(sp) +#if defined __i386__ + movaps %xmm3, 112(sp) + movaps %xmm4, 128(sp) +#endif +#endif + + // set up registers from calling arguments + +#if defined __i386__ + mov 12(%ebp), %eax // in_iv + mov 24(%ebp), %edx // ctx + movups (%eax), %xmm7 // in_iv + mov %edx, 8(%esp) // ctx for aes_encrypt + mov 8(%ebp), %ebx // ibuf + mov 16(%ebp), %esi // num_blk + mov 20(%ebp), %edi // obuf + + #define ibuf %ebx + #define obuf %edi + #define num_blk %esi +#else // __x86_64__, rdi/rsi/rdx/rcx/r8 + mov %rdi, %rbx // ibuf + movups (%rsi), %xmm7 // in_iv + mov %rdx, %r13 // num_blk + mov %rcx, %r14 // obuf + mov %r8, %r15 // ctx + + #define ibuf %rbx + #define num_blk %r13d + #define obuf %r14 + #define ctx %r15 + +#endif + // memcpy(tmp, ibuf, AES_BLOCK_SIZE); + // aes_decrypt(ibuf, obuf, ctx); + // obuf ^= iv; + // memcpy(iv, tmp, AES_BLOCK_SIZE); + // ibuf += AES_BLOCK_SIZE; + // obuf += AES_BLOCK_SIZE; + + cmp $1, num_blk // num_blk vs 1 + jl L_crypt_cbc_done // if num_blk < 1, bypass the main loop, jump to finishing code +0: + movups (ibuf), %xmm6 // tmp +#if defined __i386__ + mov ibuf, (sp) // ibuf + mov obuf, 4(sp) // obuf +#else + mov ibuf, %rdi // ibuf + mov obuf, %rsi // obuf + mov ctx, %rdx // ctx +#endif + call _aes_decrypt_xmm_no_save // aes_decrypt(ibuf, obuf, ctx) + movups (obuf), %xmm0 // obuf + pxor %xmm7, %xmm0 // obuf ^= iv; + movaps %xmm6, %xmm7 // memcpy(iv, tmp, AES_BLOCK_SIZE); + movups %xmm0, (obuf) // update obuf + add $16, ibuf // ibuf += AES_BLOCK_SIZE; + add $16, obuf // obuf += AES_BLOCK_SIZE; + sub $1, num_blk // num_blk -- + jg 0b // if num_blk > 0, repeat the loop +9: + + // we are done here, the finishing code is identical to that in aes_encrypt_cbc, so just jump to there + jmp L_crypt_cbc_done + diff --git a/bsd/crypto/aes/i386/aes_modes_hw.s b/bsd/crypto/aes/i386/aes_modes_hw.s new file mode 100644 index 000000000..401fd3dd9 --- /dev/null 
+++ b/bsd/crypto/aes/i386/aes_modes_hw.s @@ -0,0 +1,1669 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue 31/01/2006 + + These subroutines implement multiple block AES modes for ECB, CBC, CFB, + OFB and CTR encryption, The code provides support for the VIA Advanced + Cryptography Engine (ACE). + + NOTE: In the following subroutines, the AES contexts (ctx) must be + 16 byte aligned if VIA ACE is being used +*/ + +/* modified 3/5/10 cclee */ +/* Clean up those related to VIA ACE and hand optimize aes_cbc_encrypt and aes_cbc_decrypt */ +/* move the xmm registers save/restore originally inside the callee functions into these 2 caller functions */ + +/* HW-AES specific implementation cclee 3-12-10 */ +/* In aes_encrypt_cbc and aes_decrypt_cbc, __cpu_capabilities is polled, + and if kHasAES is detected, branch to the hw-specific functions here */ + + +/* + This files defines _aes_encrypt_cbc_hw and _aes_decrypt_cbc_hw --- Intel Westmere HW AES-based implementation + of _aes_encrypt_cbc and _aes_decrypt_cbc. + + These 2 functions SHOULD BE entried ONLY after the AES HW is verified to be available. + They SHOULD NOT be called without AES HW detection. It might cause xnu to crash. + + The AES HW is detected 1st thing in + _aes_encrypt_cbc (aes_modes_asm.s) + _aes_decrypt_cbc (aes_modes_asm.s) + and, if AES HW is detected, branch without link (ie, jump) to the functions here. + + The implementation here follows the examples in an Intel White Paper + "Intel Advanced Encryption Standard (AES) Instruction Set" Rev.2 01 + + Note: Rev. 03 Final 2010 01 26 is available. Looks like some code change from Rev.2 01 + + cclee 3-13-10 +*/ + +/* + The function _aes_decrypt_cbc_hw previously simply serially decrypts block by block + in our group meeting, Eric/Ali suggested that I perhaps should take a look of combining multiple blocks + in a loop and interleaving multiple aesdec instructions to absorb/hide stalls to improve the decrypt thoughput. + + The idea was actually described in the Intel AES Instruction Set White Paper (Rev. 2.0 page 53-55) + + This modification interleaves the aesdec/aesdeclast instructions for 4 blocks in cbc mode. + On a K18 (2.4GHz core-i5/2.66GHz core-i7), the x86_64 decrypt throughput (in xnu-iokit) has been improved + from 1180/1332 to 1667/1858 MBytes/sec. 
This is approximately a 1.40 times speedup in the decryption.
+	The encrypt throughput is not changed.
+
+	I also enhanced the assembly code comments.
+
+	cclee-4-30-10 (Do you know 4-30 is National Honesty Day in the US? No need to know. I've been honest all the time.)
+
+*/
+
+/* ----------------------------------------------------------------------------------------------------------------
+
+	aes_encrypt_cbc function (see aes_modes.c or aes_modes_asm.s) :
+
+	For simplicity, I am assuming all variables are of a 128-bit data type.
+
+	aes_rval aes_encrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_encrypt_ctx *ctx)
+	{
+		while(num_blk--) {
+			*iv ^= *ibuf++;
+			aes_encrypt(iv, iv, ctx);
+			*obuf++ = *iv;
+		}
+		return 0;
+	}
+
+	The following is an implementation of this function using Intel AESNI.
+	This function _aes_encrypt_cbc_hw SHOULD NOT be called directly.
+	Developers should still call _aes_encrypt_cbc (in aes_modes_asm.s), which will poll cpu_capabilities and branch
+	to this aesni-based function should it detect that aesni is available.
+	Blindly calling this function will SURELY cause a CRASH on systems with no aesni support.
+
+	Note that each block starts with *iv, which is the output of the previous block. Therefore, the cbc blocks
+	are serially chained. This prevents us from arranging several blocks for encryption in parallel.
+
+   ----------------------------------------------------------------------------------------------------------------*/
+
+	.text
+	.align	4,0x90
+	.globl	_aes_encrypt_cbc_hw
+_aes_encrypt_cbc_hw:
+
+	// push/save registers for local use
+#if	defined	__i386__
+
+	push	%ebp
+	movl	%esp, %ebp
+	push	%ebx
+	push	%edi
+
+	#define	sp	%esp
+
+#else	// __x86_64__
+
+	push	%rbp
+	mov	%rsp, %rbp
+	push	%rbx
+	push	%r13
+	push	%r14
+	push	%r15
+
+	#define	sp	%rsp
+
+#endif
+
+	// if this is kernel code, need to save used xmm registers
+#ifdef	KERNEL
+
+#if	defined __i386__
+	sub	$(8*16), %esp	// for possible xmm0-xmm7 save/restore
+#else
+	sub	$(16*16), %rsp	// xmm0-xmm15 save/restore
+#endif
+
+	movaps	%xmm0, (sp)
+	movaps	%xmm1, 16(sp)
+	movaps	%xmm2, 32(sp)
+	movaps	%xmm3, 48(sp)
+	movaps	%xmm4, 64(sp)
+	movaps	%xmm5, 80(sp)
+	movaps	%xmm6, 96(sp)
+	movaps	%xmm7, 112(sp)
+#if	defined	__x86_64__
+	movaps	%xmm8, 16*8(sp)
+	movaps	%xmm9, 16*9(sp)
+	movaps	%xmm10, 16*10(sp)
+	movaps	%xmm11, 16*11(sp)
+	movaps	%xmm12, 16*12(sp)
+	movaps	%xmm13, 16*13(sp)
+	movaps	%xmm14, 16*14(sp)
+	movaps	%xmm15, 16*15(sp)
+#endif	// __x86_64__
+
+#endif	// KERNEL
+
+	#define	iv	%xmm0
+
+#ifdef	__i386__
+
+	mov	12(%ebp), %eax	// in_iv
+	mov	24(%ebp), %edx	// ctx
+	movups	(%eax), iv	// iv = in_iv
+	mov	8(%ebp), %ebx	// ibuf
+	mov	16(%ebp), %ecx	// num_blk
+	mov	20(%ebp), %edi	// obuf
+
+	#define	ibuf	%ebx
+	#define	obuf	%edi
+	#define	num_blk	%ecx
+	#define	ctx	%edx
+
+#else
+
+	mov	%rdi, %rbx	// ibuf
+	movups	(%rsi), iv	// iv = in_iv
+	mov	%rdx, %r13	// num_blk
+	mov	%rcx, %r14	// obuf
+	mov	%r8, %r15	// ctx
+
+	#define	ibuf	%rbx
+	#define	num_blk	%r13d
+	#define	obuf	%r14
+	#define	ctx	%r15
+
+#endif
+
+	mov	240(ctx), %eax	// aes length
+	cmp	$160, %eax	// aes-128 encrypt ?
+	je	L_encrypt_128
+	cmp	$192, %eax	// aes-192 encrypt ?
+	je	L_encrypt_192
+	cmp	$224, %eax	// aes-256 encrypt ?
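+	// note: 240(ctx) holds 16*Nr, the byte offset of the last round key
+	// (160/192/224 for AES-128/192/256), written there by the key-expansion
+	// code (see aes_key_hw.s).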
+ je L_encrypt_256 + mov $-1, %eax // return error + jmp L_error + + // + // aes-128 encrypt_cbc operation, up to L_HW_cbc_done + // + +L_encrypt_128: + + cmp $1, num_blk // check number of block + jl L_HW_cbc_done // should it be less than 1, nothing to do + + movups (ctx), %xmm2 // key0 + movups 16(ctx), %xmm3 // key1 + movups 32(ctx), %xmm4 // key2 + movups 48(ctx), %xmm5 // key3 + movups 64(ctx), %xmm6 // key4 + movups 80(ctx), %xmm7 // key5 +#if defined __x86_64__ + movups 96(ctx), %xmm8 // key6 + movups 112(ctx), %xmm9 // key7 + movups 128(ctx), %xmm10 // key8 + movups 144(ctx), %xmm11 // key9 + movups 160(ctx), %xmm12 // keyA +#endif + + // while (num_blk--) { + // *iv ^= *ibuf++; + // aes_encrypt(iv, iv, ctx); + // *obuf++ = *iv; + // } +0: + movups (ibuf), %xmm1 // *ibuf + pxor %xmm2, iv // 1st instruction inside aes_encrypt + pxor %xmm1, iv // *iv ^= *ibuf + + // finishing up the rest of aes_encrypt + aesenc %xmm3, iv + aesenc %xmm4, iv + aesenc %xmm5, iv + aesenc %xmm6, iv + aesenc %xmm7, iv +#if defined __x86_64__ + aesenc %xmm8, iv + aesenc %xmm9, iv + aesenc %xmm10, iv + aesenc %xmm11, iv + aesenclast %xmm12, iv +#else + movups 96(ctx), %xmm1 // key6 + aesenc %xmm1, iv + movups 112(ctx), %xmm1 // key7 + aesenc %xmm1, iv + movups 128(ctx), %xmm1 // key8 + aesenc %xmm1, iv + movups 144(ctx), %xmm1 // key9 + aesenc %xmm1, iv + movups 160(ctx), %xmm1 // keyA + aesenclast %xmm1, iv +#endif + + movups iv, (obuf) // *obuf = *iv; + add $16, obuf // obuf++; + add $16, ibuf // ibuf++; + sub $1, num_blk // num_blk -- + jg 0b // if num_blk > 0, repeat the loop + + // the following will be branched to from all other cases (encrypt/decrypt 128/192/256) + +L_HW_cbc_done: + + xor %eax, %eax // to return CRYPT_OK + +L_error: + + // if kernel, restore xmm registers +#ifdef KERNEL + movaps 0(sp), %xmm0 + movaps 16(sp), %xmm1 + movaps 32(sp), %xmm2 + movaps 48(sp), %xmm3 + movaps 64(sp), %xmm4 + movaps 80(sp), %xmm5 + movaps 96(sp), %xmm6 + movaps 112(sp), %xmm7 +#if defined __x86_64__ + movaps 16*8(sp), %xmm8 + movaps 16*9(sp), %xmm9 + movaps 16*10(sp), %xmm10 + movaps 16*11(sp), %xmm11 + movaps 16*12(sp), %xmm12 + movaps 16*13(sp), %xmm13 + movaps 16*14(sp), %xmm14 + movaps 16*15(sp), %xmm15 +#endif // __x86_64__ +#endif // KERNEL + + // release used stack memory, restore used callee-saved registers, and return +#if defined __i386__ +#ifdef KERNEL + add $(8*16), %esp +#endif + pop %edi + pop %ebx +#else +#ifdef KERNEL + add $(16*16), %rsp +#endif + pop %r15 + pop %r14 + pop %r13 + pop %rbx +#endif + leave + ret + + // + // aes-192 encrypt_cbc operation, after completion, branch to L_HW_cbc_done + // + +L_encrypt_192: + + cmp $1, num_blk // check number of block + jl L_HW_cbc_done // should it be less than 1, nothing to do + + movups (ctx), %xmm2 // key0 + movups 16(ctx), %xmm3 // key1 + movups 32(ctx), %xmm4 // key2 + movups 48(ctx), %xmm5 // key3 + movups 64(ctx), %xmm6 // key4 + movups 80(ctx), %xmm7 // key5 +#if defined __x86_64__ + movups 96(ctx), %xmm8 // key6 + movups 112(ctx), %xmm9 // key7 + movups 128(ctx), %xmm10 // key8 + movups 144(ctx), %xmm11 // key9 + movups 160(ctx), %xmm12 // keyA + movups 176(ctx), %xmm13 // keyB + movups 192(ctx), %xmm14 // keyC +#endif + + // while (num_blk--) { + // *iv ^= *ibuf++; + // aes_encrypt(iv, iv, ctx); + // *obuf++ = *iv; + // } +0: + movups (ibuf), %xmm1 // *ibuf + pxor %xmm1, iv // *iv ^= ibuf + + // aes_encrypt(iv, iv, ctx); + + pxor %xmm2, iv + aesenc %xmm3, iv + aesenc %xmm4, iv + aesenc %xmm5, iv + aesenc %xmm6, iv + aesenc %xmm7, iv +#if 
+
+    //
+    // aes-192 encrypt_cbc operation, after completion, branch to L_HW_cbc_done
+    //
+
+L_encrypt_192:
+
+    cmp $1, num_blk         // check the number of blocks
+    jl L_HW_cbc_done        // if fewer than 1, nothing to do
+
+    movups (ctx), %xmm2     // key0
+    movups 16(ctx), %xmm3   // key1
+    movups 32(ctx), %xmm4   // key2
+    movups 48(ctx), %xmm5   // key3
+    movups 64(ctx), %xmm6   // key4
+    movups 80(ctx), %xmm7   // key5
+#if defined __x86_64__
+    movups 96(ctx), %xmm8   // key6
+    movups 112(ctx), %xmm9  // key7
+    movups 128(ctx), %xmm10 // key8
+    movups 144(ctx), %xmm11 // key9
+    movups 160(ctx), %xmm12 // keyA
+    movups 176(ctx), %xmm13 // keyB
+    movups 192(ctx), %xmm14 // keyC
+#endif
+
+    // while (num_blk--) {
+    //     *iv ^= *ibuf++;
+    //     aes_encrypt(iv, iv, ctx);
+    //     *obuf++ = *iv;
+    // }
+0:
+    movups (ibuf), %xmm1    // *ibuf
+    pxor %xmm1, iv          // *iv ^= *ibuf
+
+    // aes_encrypt(iv, iv, ctx);
+
+    pxor %xmm2, iv
+    aesenc %xmm3, iv
+    aesenc %xmm4, iv
+    aesenc %xmm5, iv
+    aesenc %xmm6, iv
+    aesenc %xmm7, iv
+#if defined __x86_64__
+    aesenc %xmm8, iv
+    aesenc %xmm9, iv
+    aesenc %xmm10, iv
+    aesenc %xmm11, iv
+    aesenc %xmm12, iv
+    aesenc %xmm13, iv
+    aesenclast %xmm14, iv
+#else
+    movups 96(ctx), %xmm1
+    aesenc %xmm1, iv
+    movups 112(ctx), %xmm1
+    aesenc %xmm1, iv
+    movups 128(ctx), %xmm1
+    aesenc %xmm1, iv
+    movups 144(ctx), %xmm1
+    aesenc %xmm1, iv
+    movups 160(ctx), %xmm1
+    aesenc %xmm1, iv
+    movups 176(ctx), %xmm1
+    aesenc %xmm1, iv
+    movups 192(ctx), %xmm1
+    aesenclast %xmm1, iv
+#endif
+
+    movups iv, (obuf)       // *obuf = *iv;
+    add $16, ibuf           // ibuf++
+    add $16, obuf           // obuf++
+
+    sub $1, num_blk         // num_blk--
+    jg 0b                   // if num_blk > 0, repeat the loop
+
+    jmp L_HW_cbc_done       // share with the common exit code
+
+    //
+    // aes-256 encrypt_cbc operation, after completion, branch to L_HW_cbc_done
+    //
+
+L_encrypt_256:
+
+    cmp $1, num_blk         // check the number of blocks
+    jl L_HW_cbc_done        // if fewer than 1, nothing to do
+
+    movups (ctx), %xmm2     // key0
+    movups 16(ctx), %xmm3   // key1
+    movups 32(ctx), %xmm4   // key2
+    movups 48(ctx), %xmm5   // key3
+    movups 64(ctx), %xmm6   // key4
+    movups 80(ctx), %xmm7   // key5
+#if defined __x86_64__
+    movups 96(ctx), %xmm8   // key6
+    movups 112(ctx), %xmm9  // key7
+    movups 128(ctx), %xmm10 // key8
+    movups 144(ctx), %xmm11 // key9
+    movups 160(ctx), %xmm12 // keyA
+    movups 176(ctx), %xmm13 // keyB
+    movups 192(ctx), %xmm14 // keyC
+    movups 208(ctx), %xmm15 // keyD
+    // movups 224(ctx), %xmm1 // keyE
+#endif
+
+    // while (num_blk--) {
+    //     *iv ^= *ibuf++;
+    //     aes_encrypt(iv, iv, ctx);
+    //     *obuf++ = *iv;
+    // }
+0:
+    movups (ibuf), %xmm1    // *ibuf
+    pxor %xmm1, iv          // *iv ^= *ibuf
+
+    // aes_encrypt(iv, iv, ctx);
+    pxor %xmm2, iv
+    aesenc %xmm3, iv
+    aesenc %xmm4, iv
+    aesenc %xmm5, iv
+    aesenc %xmm6, iv
+    aesenc %xmm7, iv
+#if defined __x86_64__
+    movups 224(ctx), %xmm1  // keyE
+    aesenc %xmm8, iv
+    aesenc %xmm9, iv
+    aesenc %xmm10, iv
+    aesenc %xmm11, iv
+    aesenc %xmm12, iv
+    aesenc %xmm13, iv
+    aesenc %xmm14, iv
+    aesenc %xmm15, iv
+    aesenclast %xmm1, iv
+#else
+    movups 96(ctx), %xmm1   // key6
+    aesenc %xmm1, iv
+    movups 112(ctx), %xmm1  // key7
+    aesenc %xmm1, iv
+    movups 128(ctx), %xmm1  // key8
+    aesenc %xmm1, iv
+    movups 144(ctx), %xmm1  // key9
+    aesenc %xmm1, iv
+    movups 160(ctx), %xmm1  // keyA
+    aesenc %xmm1, iv
+    movups 176(ctx), %xmm1  // keyB
+    aesenc %xmm1, iv
+    movups 192(ctx), %xmm1  // keyC
+    aesenc %xmm1, iv
+    movups 208(ctx), %xmm1  // keyD
+    aesenc %xmm1, iv
+    movups 224(ctx), %xmm1  // keyE
+    aesenclast %xmm1, iv
+#endif
+
+    movups iv, (obuf)       // *obuf = *iv;
+    add $16, ibuf           // ibuf++
+    add $16, obuf           // obuf++
+
+    sub $1, num_blk         // num_blk--
+    jg 0b                   // if num_blk > 0, repeat the loop
+
+    jmp L_HW_cbc_done       // share with the common exit code
+
+
+
+    //
+    // --------- END of aes_encrypt_cbc_hw -------------------
+    //
+
+
+/* ----------------------------------------------------------------------------------------------------------------
+
+ aes_decrypt_cbc function (see aes_modes.c or aes_modes_asm.s) :
+
+ For simplicity, I am assuming all variables are of a 128-bit data type.
+
+ aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf, const aes_decrypt_ctx *ctx)
+ {
+     while(num_blk--) {
+         aes_decrypt(ibuf, obuf, ctx);
+         *obuf++ ^= *iv;
+         *iv = *ibuf++;
+     }
+     return 0;
+ }
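+
+ As a usage sketch (illustrative only, under the simplified prototypes above; it assumes the
+ key-setup routines aes_encrypt_key128/aes_decrypt_key128 declared in aes.h, a 16-byte key,
+ and n 16-byte blocks; error handling omitted):
+
+     aes_encrypt_ctx ectx; aes_decrypt_ctx dctx;
+     __m128 iv0 = iv_in, iv1 = iv_in;            // CBC updates the iv in place, so keep copies
+     aes_encrypt_key128(key, &ectx);
+     aes_encrypt_cbc(pt, &iv0, n, ct, &ectx);    // pt -> ct
+     aes_decrypt_key128(key, &dctx);
+     aes_decrypt_cbc(ct, &iv1, n, pt2, &dctx);   // pt2 now equals pt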
+
+ The following is an implementation of this function using Intel AESNI.
+ This function _aes_decrypt_cbc_hw SHOULD NOT be called directly.
+ Developers should still call _aes_decrypt_cbc (in aes_modes_asm.s), which polls cpu_capabilities and branches
+ to this AESNI-based function should it detect that AESNI is available.
+ Calling this function blindly will surely cause a crash on systems with no AESNI support.
+
+ Note that the decryption operations are independent across blocks: each plaintext block depends only on
+ ciphertext, not on the previous decryption. This provides an opportunity to arrange several aes_decrypt
+ operations in parallel to speed up the code.
+ This is equivalent to what has been described in the Intel AES Instruction Set White Paper (Rev. 2.0, pages 53-55).
+ The following assembly code exploits this idea to achieve a ~1.4x speedup in aes_decrypt_cbc.
+
+ Example C code for packing 4 blocks in an iteration is shown as follows:
+
+ while ((num_blk-=4)>=0) {
+
+     // the following 4 functions can be interleaved to exploit parallelism
+     aes_decrypt(ibuf, obuf, ctx);
+     aes_decrypt(ibuf+1, obuf+1, ctx);
+     aes_decrypt(ibuf+2, obuf+2, ctx);
+     aes_decrypt(ibuf+3, obuf+3, ctx);
+
+     obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
+     *iv = ibuf[3]; ibuf += 4; obuf += 4;
+ }
+ num_blk+=4;
+
+ ----------------------------------------------------------------------------------------------------------------*/
+
+    .text
+    .align 4,0x90
+    .globl _aes_decrypt_cbc_hw
+_aes_decrypt_cbc_hw:
+
+    // push/save registers for local use
+#if defined __i386__
+
+    push %ebp
+    movl %esp, %ebp
+    push %ebx               // ibuf
+    push %edi               // obuf
+
+    #define sp %esp
+
+#else // __x86_64__
+
+    push %rbp
+    mov %rsp, %rbp
+    push %rbx
+    push %r13
+    push %r14
+    push %r15
+
+    #define sp %rsp
+
+#endif
+
+
+    // if kernel, allocate stack space to save xmm registers
+#ifdef KERNEL
+#if defined __i386__
+    sub $(8*16), %esp
+#else
+    sub $(16*16), %rsp
+#endif
+    movaps %xmm0, (sp)
+    movaps %xmm1, 16(sp)
+    movaps %xmm2, 32(sp)
+    movaps %xmm3, 48(sp)
+    movaps %xmm4, 64(sp)
+    movaps %xmm5, 80(sp)
+    movaps %xmm6, 96(sp)
+    movaps %xmm7, 112(sp)
+#if defined __x86_64__
+    movaps %xmm8, 16*8(sp)
+    movaps %xmm9, 16*9(sp)
+    movaps %xmm10, 16*10(sp)
+    movaps %xmm11, 16*11(sp)
+    movaps %xmm12, 16*12(sp)
+    movaps %xmm13, 16*13(sp)
+    movaps %xmm14, 16*14(sp)
+    movaps %xmm15, 16*15(sp)
+#endif // __x86_64__
+#endif
+
+    #undef iv
+    #define iv %xmm0
+
+#if defined __i386__
+    mov 12(%ebp), %eax      // in_iv
+    mov 24(%ebp), %edx      // ctx
+    movups (%eax), iv       // iv = in_iv
+    mov 8(%ebp), %ebx       // ibuf
+    mov 16(%ebp), %ecx      // num_blk
+    mov 20(%ebp), %edi      // obuf
+
+    #define ibuf %ebx
+    #define obuf %edi
+    #define num_blk %ecx
+    #define ctx %edx
+
+#else // __x86_64__, rdi/rsi/rdx/rcx/r8
+
+    mov %rdi, %rbx          // ibuf
+    movups (%rsi), iv       // iv = in_iv
+    mov %rdx, %r13          // num_blk
+    mov %rcx, %r14          // obuf
+    mov %r8, %r15           // ctx
+
+    #define ibuf %rbx
+    #define num_blk %r13d
+    #define obuf %r14
+    #define ctx %r15
+
+#endif
+
+    mov 240(ctx), %eax      // aes length
+    cmp $160, %eax          // aes-128 decrypt
+    je L_decrypt_128
+    cmp $192, %eax          // aes-192 decrypt
+    je L_decrypt_192
+    cmp $224, %eax          // aes-256 decrypt
+    je L_decrypt_256
+
+    mov $-1, %eax           // wrong aes length, to return -1
+    jmp L_error             // early exit due to wrong aes length
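+
+    // Note that 240(ctx) holds 16*(number of rounds), so 160/192/224 select AES-128/192/256
+    // (10/12/14 rounds). In C terms, the dispatching wrapper _aes_decrypt_cbc is expected to
+    // behave like the sketch below (the capability-check and fallback names are illustrative,
+    // not the verbatim symbols used in aes_modes_asm.s):
+    //
+    //  aes_rval aes_decrypt_cbc(const __m128 *ibuf, __m128 *iv, int num_blk, __m128 *obuf,
+    //                           const aes_decrypt_ctx *ctx)
+    //  {
+    //      if (cpu_capabilities_has_aesni())                             // assumed predicate
+    //          return aes_decrypt_cbc_hw(ibuf, iv, num_blk, obuf, ctx);  // this file
+    //      return aes_decrypt_cbc_sw(ibuf, iv, num_blk, obuf, ctx);      // generic path, assumed name
+    //  }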
+
+    //
+    // aes-128 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
+    //
+
+L_decrypt_128:
+
+    cmp $1, num_blk
+    jl L_HW_cbc_done        // if num_blk < 1, early return
+
+    // aes-128 decrypt expanded keys
+    movups 160(ctx), %xmm3
+    movups 144(ctx), %xmm4
+    movups 128(ctx), %xmm5
+    movups 112(ctx), %xmm6
+    movups 96(ctx), %xmm7
+#if defined __x86_64__
+    movups 80(ctx), %xmm8
+    movups 64(ctx), %xmm9
+    movups 48(ctx), %xmm10
+    movups 32(ctx), %xmm11
+    movups 16(ctx), %xmm12
+    movups 0(ctx), %xmm13
+#endif
+
+    // performs 4-block decryption per iteration to exploit decryption in parallel
+
+    // while ((num_blk-=4)>=0) {
+    //     aes_decrypt(ibuf, obuf, ctx);
+    //     aes_decrypt(ibuf+1, obuf+1, ctx);
+    //     aes_decrypt(ibuf+2, obuf+2, ctx);
+    //     aes_decrypt(ibuf+3, obuf+3, ctx);
+    //     obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
+    //     *iv = ibuf[3]; ibuf += 4; obuf += 4;
+    // }
+
+    sub $4, num_blk         // pre-decrement num_blk by 4
+    jl 9f                   // if num_blk < 4, skip the per-4-blocks processing code
+
+0:
+
+
+#if defined __x86_64__
+
+    movups (ibuf), %xmm1    // tmp = 1st ibuf
+    movups 16(ibuf), %xmm2  // tmp = 2nd ibuf
+    movups 32(ibuf), %xmm14 // tmp = 3rd ibuf
+    movups 48(ibuf), %xmm15 // tmp = 4th ibuf
+
+    // for x86_64, the expanded keys are already stored in xmm3-xmm13
+
+    // aes-128 decrypt round 0 per 4 blocks
+    pxor %xmm3, %xmm1
+    pxor %xmm3, %xmm2
+    pxor %xmm3, %xmm14
+    pxor %xmm3, %xmm15
+
+    // aes-128 decrypt round 1 per 4 blocks
+    aesdec %xmm4, %xmm1
+    aesdec %xmm4, %xmm2
+    aesdec %xmm4, %xmm14
+    aesdec %xmm4, %xmm15
+
+    // aes-128 decrypt round 2 per 4 blocks
+    aesdec %xmm5, %xmm1
+    aesdec %xmm5, %xmm2
+    aesdec %xmm5, %xmm14
+    aesdec %xmm5, %xmm15
+
+    // aes-128 decrypt round 3 per 4 blocks
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+    aesdec %xmm6, %xmm14
+    aesdec %xmm6, %xmm15
+
+    // aes-128 decrypt round 4 per 4 blocks
+    aesdec %xmm7, %xmm1
+    aesdec %xmm7, %xmm2
+    aesdec %xmm7, %xmm14
+    aesdec %xmm7, %xmm15
+
+    // aes-128 decrypt round 5 per 4 blocks
+    aesdec %xmm8, %xmm1
+    aesdec %xmm8, %xmm2
+    aesdec %xmm8, %xmm14
+    aesdec %xmm8, %xmm15
+
+    // aes-128 decrypt round 6 per 4 blocks
+    aesdec %xmm9, %xmm1
+    aesdec %xmm9, %xmm2
+    aesdec %xmm9, %xmm14
+    aesdec %xmm9, %xmm15
+
+    // aes-128 decrypt round 7 per 4 blocks
+    aesdec %xmm10, %xmm1
+    aesdec %xmm10, %xmm2
+    aesdec %xmm10, %xmm14
+    aesdec %xmm10, %xmm15
+
+    // aes-128 decrypt round 8 per 4 blocks
+    aesdec %xmm11, %xmm1
+    aesdec %xmm11, %xmm2
+    aesdec %xmm11, %xmm14
+    aesdec %xmm11, %xmm15
+
+    // aes-128 decrypt round 9 per 4 blocks
+    aesdec %xmm12, %xmm1
+    aesdec %xmm12, %xmm2
+    aesdec %xmm12, %xmm14
+    aesdec %xmm12, %xmm15
+
+    // aes-128 decrypt round 10 (last) per 4 blocks
+    aesdeclast %xmm13, %xmm1
+    aesdeclast %xmm13, %xmm2
+    aesdeclast %xmm13, %xmm14
+    aesdeclast %xmm13, %xmm15
+
+    pxor iv, %xmm1          // obuf[0] ^= *iv;
+    movups (ibuf), iv       // ibuf[0]
+    pxor iv, %xmm2          // obuf[1] ^= ibuf[0];
+    movups 16(ibuf), iv     // ibuf[1]
+    pxor iv, %xmm14         // obuf[2] ^= ibuf[1];
+    movups 32(ibuf), iv     // ibuf[2]
+    pxor iv, %xmm15         // obuf[3] ^= ibuf[2];
+    movups 48(ibuf), iv     // *iv = ibuf[3]
+
+    movups %xmm1, (obuf)    // write 1st obuf
+    movups %xmm2, 16(obuf)  // write 2nd obuf
+    movups %xmm14, 32(obuf) // write 3rd obuf
+    movups %xmm15, 48(obuf) // write 4th obuf
+
+
+#else
+
+    // aes_decrypt_cbc per 4 blocks using aes-128 for i386
+    // xmm1/xmm2/xmm4/xmm5 used for obuf per block
+    // xmm3 = key0
+    // xmm0 = iv
+    // xmm6/xmm7 dynamically load with other expanded keys
+
+    movups (ibuf), %xmm1    // tmp = 1st ibuf
+    movups 16(ibuf), %xmm2  // tmp = 2nd ibuf
+    movups 32(ibuf), %xmm4  // tmp = 3rd ibuf
+    movups 48(ibuf), %xmm5  // tmp = 4th ibuf
+
+    // aes_decrypt
+    // for i386, sequentially load expanded keys into xmm6/xmm7
+
+    movups 144(ctx), %xmm6  // key1
+
+    // aes-128 decrypt round 0 per 4 blocks
+    pxor %xmm3, %xmm1
+    pxor %xmm3, %xmm2
+    pxor %xmm3, %xmm4
+    pxor %xmm3, %xmm5
+
+    movups 128(ctx), %xmm7  // key2
+
+    // aes-128 decrypt round 1 per 4 blocks
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+    aesdec %xmm6, %xmm4
+    aesdec %xmm6, %xmm5
+
+    movups 112(ctx), %xmm6  // key3
+
+    // aes-128 decrypt round 2 per 4 blocks
+    aesdec %xmm7, %xmm1
+    aesdec %xmm7, %xmm2
+    aesdec %xmm7, %xmm4
+    aesdec %xmm7, %xmm5
+
+    movups 96(ctx), %xmm7   // key4
+
+    // aes-128 decrypt round 3 per 4 blocks
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+    aesdec %xmm6, %xmm4
+    aesdec %xmm6, %xmm5
+
+    movups 80(ctx), %xmm6   // key5
+
+    // aes-128 decrypt round 4 per 4 blocks
+    aesdec %xmm7, %xmm1
+    aesdec %xmm7, %xmm2
+    aesdec %xmm7, %xmm4
+    aesdec %xmm7, %xmm5
+
+    movups 64(ctx), %xmm7   // key6
+
+    // aes-128 decrypt round 5 per 4 blocks
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+    aesdec %xmm6, %xmm4
+    aesdec %xmm6, %xmm5
+
+    movups 48(ctx), %xmm6   // key7
+
+    // aes-128 decrypt round 6 per 4 blocks
+    aesdec %xmm7, %xmm1
+    aesdec %xmm7, %xmm2
+    aesdec %xmm7, %xmm4
+    aesdec %xmm7, %xmm5
+
+    movups 32(ctx), %xmm7   // key8
+
+    // aes-128 decrypt round 7 per 4 blocks
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+    aesdec %xmm6, %xmm4
+    aesdec %xmm6, %xmm5
+
+    movups 16(ctx), %xmm6   // key9
+
+    // aes-128 decrypt round 8 per 4 blocks
+    aesdec %xmm7, %xmm1
+    aesdec %xmm7, %xmm2
+    aesdec %xmm7, %xmm4
+    aesdec %xmm7, %xmm5
+
+    movups 0(ctx), %xmm7    // keyA
+
+    // aes-128 decrypt round 9 per 4 blocks
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+    aesdec %xmm6, %xmm4
+    aesdec %xmm6, %xmm5
+
+    // aes-128 decrypt round 10 (last) per 4 blocks
+    aesdeclast %xmm7, %xmm1
+    aesdeclast %xmm7, %xmm2
+    aesdeclast %xmm7, %xmm4
+    aesdeclast %xmm7, %xmm5
+
+    pxor iv, %xmm1          // 1st obuf ^= iv;
+    movups (ibuf), iv       // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
+    pxor iv, %xmm2          // 2nd obuf ^= iv;
+    movups 16(ibuf), iv     // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
+    pxor iv, %xmm4          // 3rd obuf ^= iv;
+    movups 32(ibuf), iv     // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
+    pxor iv, %xmm5          // 4th obuf ^= iv;
+    movups 48(ibuf), iv     // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
+
+    movups %xmm1, (obuf)    // write 1st obuf
+    movups %xmm2, 16(obuf)  // write 2nd obuf
+    movups %xmm4, 32(obuf)  // write 3rd obuf
+    movups %xmm5, 48(obuf)  // write 4th obuf
+#endif
+
+    add $64, ibuf           // ibuf += 4;
+    add $64, obuf           // obuf += 4;
+
+    sub $4, num_blk         // num_blk -= 4
+    jge 0b                  // if num_blk > 0, repeat the loop
+
+9:  add $4, num_blk         // post-increment num_blk by 4
+    je L_HW_cbc_done        // if num_blk == 0, no further processing is needed
+
+#if defined __i386__
+    // reload xmm4-xmm7, as they may be needed as expanded keys for the remaining blocks
+    movups 144(ctx), %xmm4
+    movups 128(ctx), %xmm5
+    movups 112(ctx), %xmm6
+    movups 96(ctx), %xmm7
+#endif
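+
+    // After the 4-way loop, 0-3 blocks remain. The code below peels them off by inspecting the
+    // bits of num_blk; in C terms (a sketch of the control flow only, the helper names are
+    // placeholders for the inline sequences that follow):
+    //
+    //  if (num_blk & 2) {              // decrypt a pair of blocks together
+    //      decrypt_2_blocks(ibuf, obuf, ctx);
+    //      ibuf += 2; obuf += 2;
+    //  }
+    //  if (num_blk & 1) {              // decrypt the last single block
+    //      decrypt_1_block(ibuf, obuf, ctx);
+    //  }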
+
+    test $2, num_blk        // check whether at least 2 blocks remain
+    je 9f                   // if num_blk & 2 == 0, skip the per-pair processing code
+
+    // do the remaining 2 blocks together
+
+    movups (ibuf), %xmm1    // tmp = 1st ibuf
+    movups 16(ibuf), %xmm2  // tmp = 2nd ibuf
+
+    // aes_decrypt
+    pxor %xmm3, %xmm1
+    pxor %xmm3, %xmm2
+    aesdec %xmm4, %xmm1
+    aesdec %xmm4, %xmm2
+    aesdec %xmm5, %xmm1
+    aesdec %xmm5, %xmm2
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+#if defined __x86_64__
+    aesdec %xmm7, %xmm1
+    aesdec %xmm7, %xmm2
+    aesdec %xmm8, %xmm1
+    aesdec %xmm8, %xmm2
+    aesdec %xmm9, %xmm1
+    aesdec %xmm9, %xmm2
+    aesdec %xmm10, %xmm1
+    aesdec %xmm10, %xmm2
+    aesdec %xmm11, %xmm1
+    aesdec %xmm11, %xmm2
+    aesdec %xmm12, %xmm1
+    aesdec %xmm12, %xmm2
+    aesdeclast %xmm13, %xmm1
+    aesdeclast %xmm13, %xmm2
+#else
+    movups 80(ctx), %xmm6
+    aesdec %xmm7, %xmm1
+    aesdec %xmm7, %xmm2
+    movups 64(ctx), %xmm7
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+    movups 48(ctx), %xmm6
+    aesdec %xmm7, %xmm1
+    aesdec %xmm7, %xmm2
+    movups 32(ctx), %xmm7
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+    movups 16(ctx), %xmm6
+    aesdec %xmm7, %xmm1
+    aesdec %xmm7, %xmm2
+    movups 0(ctx), %xmm7
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+    aesdeclast %xmm7, %xmm1
+    aesdeclast %xmm7, %xmm2
+    movups 112(ctx), %xmm6
+    movups 96(ctx), %xmm7
+#endif
+
+    pxor iv, %xmm1          // obuf[0] ^= *iv;
+    movups (ibuf), iv       // ibuf[0]
+    pxor iv, %xmm2          // obuf[1] ^= ibuf[0]
+    movups 16(ibuf), iv     // *iv = ibuf[1]
+
+    movups %xmm1, (obuf)    // write obuf[0]
+    movups %xmm2, 16(obuf)  // write obuf[1]
+
+    add $32, ibuf           // ibuf += 2
+    add $32, obuf           // obuf += 2
+
+9:
+    test $1, num_blk        // check whether a residual block remains
+    je L_HW_cbc_done        // if not, no residual processing is needed
+
+    movups (ibuf), %xmm2    // tmp = ibuf
+    // aes_decrypt
+    pxor %xmm3, %xmm2
+    aesdec %xmm4, %xmm2
+    aesdec %xmm5, %xmm2
+    aesdec %xmm6, %xmm2
+    aesdec %xmm7, %xmm2
+#if defined __x86_64__
+    aesdec %xmm8, %xmm2
+    aesdec %xmm9, %xmm2
+    aesdec %xmm10, %xmm2
+    aesdec %xmm11, %xmm2
+    aesdec %xmm12, %xmm2
+    aesdeclast %xmm13, %xmm2
+#else
+    movups 80(ctx), %xmm1
+    aesdec %xmm1, %xmm2
+    movups 64(ctx), %xmm1
+    aesdec %xmm1, %xmm2
+    movups 48(ctx), %xmm1
+    aesdec %xmm1, %xmm2
+    movups 32(ctx), %xmm1
+    aesdec %xmm1, %xmm2
+    movups 16(ctx), %xmm1
+    aesdec %xmm1, %xmm2
+    movups (ctx), %xmm1
+    aesdeclast %xmm1, %xmm2
+#endif
+
+    pxor iv, %xmm2          // *obuf ^= *iv;
+    movups (ibuf), iv       // *iv = *ibuf;
+    movups %xmm2, (obuf)    // write *obuf
+
+    jmp L_HW_cbc_done
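+
+    // The aes-192 path below uses 12 rounds, hence 13 expanded round keys (208 bytes). In the
+    // 4-way loop, xmm14/xmm15 carry data, so on x86_64 only xmm3-xmm13 can keep keys resident;
+    // the last two keys are cycled through xmm12/xmm13 mid-loop and the original keys restored
+    // before the next iteration. Conceptually (a sketch, not literal code):
+    //
+    //  state = aesdec(state, k[10]); reload k[11] into the register that held k[9];
+    //  state = aesdec(state, k[11]); reload k[12] into the register that held k[10];
+    //  state = aesdeclast(state, k[12]); then restore k[9]/k[10] for the next 4 blocks.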
+
+    //
+    // aes-192 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
+    //
+
+L_decrypt_192:
+
+    cmp $1, num_blk
+    jl L_HW_cbc_done        // if num_blk < 1, early return
+
+    // aes-192 decrypt expanded keys
+    movups 192(ctx), %xmm3
+    movups 176(ctx), %xmm4
+    movups 160(ctx), %xmm5
+    movups 144(ctx), %xmm6
+    movups 128(ctx), %xmm7
+#if defined __x86_64__
+    movups 112(ctx), %xmm8
+    movups 96(ctx), %xmm9
+    movups 80(ctx), %xmm10
+    movups 64(ctx), %xmm11
+    movups 48(ctx), %xmm12
+    movups 32(ctx), %xmm13
+    movups 16(ctx), %xmm14
+    movups (ctx), %xmm15
+#endif
+
+    // performs 4-block decryption per iteration to exploit decryption in parallel
+
+    // while ((num_blk-=4)>=0) {
+    //     aes_decrypt(ibuf, obuf, ctx);
+    //     aes_decrypt(ibuf+1, obuf+1, ctx);
+    //     aes_decrypt(ibuf+2, obuf+2, ctx);
+    //     aes_decrypt(ibuf+3, obuf+3, ctx);
+    //     obuf[0] ^= *iv; obuf[1] ^= ibuf[0]; obuf[2] ^= ibuf[1]; obuf[3] ^= ibuf[2];
+    //     *iv = ibuf[3]; ibuf += 4; obuf += 4;
+    // }
+
+    sub $4, num_blk         // pre-decrement num_blk by 4
+    jl 9f                   // if num_blk < 4, skip the per-4-blocks processing code
+0:
+
+#if defined __x86_64__
+
+    movups (ibuf), %xmm1    // tmp = 1st ibuf
+    movups 16(ibuf), %xmm2  // tmp = 2nd ibuf
+    movups 32(ibuf), %xmm14 // tmp = 3rd ibuf
+    movups 48(ibuf), %xmm15 // tmp = 4th ibuf
+
+    // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13
+    // use %xmm12/%xmm13 as dynamic keys in the middle, restored afterwards
+
+    // round 0 for 4 blocks
+    pxor %xmm3, %xmm1
+    pxor %xmm3, %xmm2
+    pxor %xmm3, %xmm14
+    pxor %xmm3, %xmm15
+
+    // round 1 for 4 blocks
+    aesdec %xmm4, %xmm1
+    aesdec %xmm4, %xmm2
+    aesdec %xmm4, %xmm14
+    aesdec %xmm4, %xmm15
+
+    // round 2 for 4 blocks
+    aesdec %xmm5, %xmm1
+    aesdec %xmm5, %xmm2
+    aesdec %xmm5, %xmm14
+    aesdec %xmm5, %xmm15
+
+    // round 3 for 4 blocks
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+    aesdec %xmm6, %xmm14
+    aesdec %xmm6, %xmm15
+
+    // round 4 for 4 blocks
+    aesdec %xmm7, %xmm1
+    aesdec %xmm7, %xmm2
+    aesdec %xmm7, %xmm14
+    aesdec %xmm7, %xmm15
+
+    // round 5 for 4 blocks
+    aesdec %xmm8, %xmm1
+    aesdec %xmm8, %xmm2
+    aesdec %xmm8, %xmm14
+    aesdec %xmm8, %xmm15
+
+    // round 6 for 4 blocks
+    aesdec %xmm9, %xmm1
+    aesdec %xmm9, %xmm2
+    aesdec %xmm9, %xmm14
+    aesdec %xmm9, %xmm15
+
+    // round 7 for 4 blocks
+    aesdec %xmm10, %xmm1
+    aesdec %xmm10, %xmm2
+    aesdec %xmm10, %xmm14
+    aesdec %xmm10, %xmm15
+
+    // round 8 for 4 blocks
+    aesdec %xmm11, %xmm1
+    aesdec %xmm11, %xmm2
+    aesdec %xmm11, %xmm14
+    aesdec %xmm11, %xmm15
+
+    // round 9 for 4 blocks
+    aesdec %xmm12, %xmm1
+    aesdec %xmm12, %xmm2
+    aesdec %xmm12, %xmm14
+    aesdec %xmm12, %xmm15
+
+    movups 16(ctx), %xmm12
+
+    // round A for 4 blocks
+    aesdec %xmm13, %xmm1
+    aesdec %xmm13, %xmm2
+    aesdec %xmm13, %xmm14
+    aesdec %xmm13, %xmm15
+
+    movups (ctx), %xmm13
+
+    // round B for 4 blocks
+    aesdec %xmm12, %xmm1
+    aesdec %xmm12, %xmm2
+    aesdec %xmm12, %xmm14
+    aesdec %xmm12, %xmm15
+
+    movups 48(ctx), %xmm12  // restore %xmm12 to its original key
+
+    // round C (last) for 4 blocks
+    aesdeclast %xmm13, %xmm1
+    aesdeclast %xmm13, %xmm2
+    aesdeclast %xmm13, %xmm14
+    aesdeclast %xmm13, %xmm15
+
+    movups 32(ctx), %xmm13  // restore %xmm13 to its original key
+
+    pxor iv, %xmm1          // obuf[0] ^= *iv;
+    movups (ibuf), iv       // ibuf[0]
+    pxor iv, %xmm2          // obuf[1] ^= ibuf[0]
+    movups 16(ibuf), iv     // ibuf[1]
+    pxor iv, %xmm14         // obuf[2] ^= ibuf[1]
+    movups 32(ibuf), iv     // ibuf[2]
+    pxor iv, %xmm15         // obuf[3] ^= ibuf[2]
+    movups 48(ibuf), iv     // *iv = ibuf[3]
+
+    movups %xmm1, (obuf)    // write 1st obuf
+    movups %xmm2, 16(obuf)  // write 2nd obuf
+    movups %xmm14, 32(obuf) // write 3rd obuf
+    movups %xmm15, 48(obuf) // write 4th obuf
+
+    add $64, ibuf           // ibuf += 4;
+    add $64, obuf           // obuf += 4;
+
+    sub $4, num_blk         // num_blk -= 4
+    jge 0b                  // if num_blk > 0, repeat the loop
+
+9:  add $4, num_blk         // post-increment num_blk by 4
+    je L_HW_cbc_done        // if num_blk == 0, prepare to return
+
+    movups 16(ctx), %xmm14  // restore %xmm14 to its key
+    movups (ctx), %xmm15    // restore %xmm15 to its key
+
+#else
+
+    movups (ibuf), %xmm1    // tmp = 1st ibuf
+    movups 16(ibuf), %xmm2  // tmp = 2nd ibuf
+    movups 32(ibuf), %xmm4  // tmp = 3rd ibuf
+    movups 48(ibuf), %xmm5  // tmp = 4th ibuf
+
+    // aes_decrypt
+    // for i386, sequentially load expanded keys into xmm6/xmm7
+    movups 176(ctx), %xmm6
+    pxor %xmm3, %xmm1
+    pxor %xmm3, %xmm2
+    pxor %xmm3, %xmm4
+    pxor %xmm3, %xmm5
+
+    movups 160(ctx), %xmm7
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+    aesdec %xmm6, %xmm4
+    aesdec %xmm6, %xmm5
+
+    movups 144(ctx), %xmm6
+    aesdec %xmm7, %xmm1
+    aesdec %xmm7, %xmm2
+    aesdec %xmm7, %xmm4
+    aesdec %xmm7, %xmm5
+
+    movups 128(ctx), %xmm7
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+    aesdec %xmm6, %xmm4
+    aesdec %xmm6, %xmm5
+
+    movups 112(ctx), %xmm6
+    aesdec %xmm7, %xmm1
+    aesdec %xmm7, %xmm2
+    aesdec %xmm7, %xmm4
+    aesdec %xmm7, %xmm5
+
+    movups 96(ctx), %xmm7
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+    aesdec %xmm6, %xmm4
+    aesdec %xmm6, %xmm5
+
+    movups 80(ctx), %xmm6
+    aesdec %xmm7, %xmm1
+    aesdec %xmm7, %xmm2
+    aesdec %xmm7, %xmm4
+    aesdec %xmm7, %xmm5
+
+    movups 64(ctx), %xmm7
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+    aesdec %xmm6, %xmm4
+    aesdec %xmm6, %xmm5
+
+    movups 48(ctx), %xmm6
+    aesdec %xmm7, %xmm1
+    aesdec %xmm7, %xmm2
+    aesdec %xmm7, %xmm4
+    aesdec %xmm7, %xmm5
+
+    movups 32(ctx), %xmm7
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+    aesdec %xmm6, %xmm4
+    aesdec %xmm6, %xmm5
+
+    movups 16(ctx), %xmm6
+    aesdec %xmm7, %xmm1
+    aesdec %xmm7, %xmm2
+    aesdec %xmm7, %xmm4
+    aesdec %xmm7, %xmm5
+
+    movups 0(ctx), %xmm7
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+    aesdec %xmm6, %xmm4
+    aesdec %xmm6, %xmm5
+
+    aesdeclast %xmm7, %xmm1
+    aesdeclast %xmm7, %xmm2
+    aesdeclast %xmm7, %xmm4
+    aesdeclast %xmm7, %xmm5
+
+    pxor iv, %xmm1          // 1st obuf ^= iv;
+    movups (ibuf), iv       // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
+    pxor iv, %xmm2          // 2nd obuf ^= iv;
+    movups 16(ibuf), iv     // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
+    pxor iv, %xmm4          // 3rd obuf ^= iv;
+    movups 32(ibuf), iv     // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
+    pxor iv, %xmm5          // 4th obuf ^= iv;
+    movups 48(ibuf), iv     // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
+    movups %xmm1, (obuf)    // write 1st obuf
+    movups %xmm2, 16(obuf)  // write 2nd obuf
+    movups %xmm4, 32(obuf)  // write 3rd obuf
+    movups %xmm5, 48(obuf)  // write 4th obuf
+
+    add $64, ibuf           // ibuf += AES_BLOCK_SIZE * 4;
+    add $64, obuf           // obuf += AES_BLOCK_SIZE * 4;
+
+    sub $4, num_blk         // num_blk -= 4
+    jge 0b                  // if num_blk > 0, repeat the loop
+
+
+9:  add $4, num_blk         // post-increment num_blk by 4
+    je L_HW_cbc_done        // if num_blk == 0, no further processing is needed
+
+    movups 176(ctx), %xmm4
+    movups 160(ctx), %xmm5
+    movups 144(ctx), %xmm6
+    movups 128(ctx), %xmm7
+
+#endif
+
+    // per-block aes_decrypt_cbc loop
+
+0:
+    movups (ibuf), %xmm2    // tmp = ibuf
+
+    // aes_decrypt
+    pxor %xmm3, %xmm2
+    aesdec %xmm4, %xmm2
+    aesdec %xmm5, %xmm2
+    aesdec %xmm6, %xmm2
+    aesdec %xmm7, %xmm2
+#if defined __x86_64__
+    aesdec %xmm8, %xmm2
+    aesdec %xmm9, %xmm2
+    aesdec %xmm10, %xmm2
+    aesdec %xmm11, %xmm2
+    aesdec %xmm12, %xmm2
+    aesdec %xmm13, %xmm2
+    aesdec %xmm14, %xmm2
+    aesdeclast %xmm15, %xmm2
+#else
+    movups 112(ctx), %xmm1
+    aesdec %xmm1, %xmm2
+    movups 96(ctx), %xmm1
+    aesdec %xmm1, %xmm2
+    movups 80(ctx), %xmm1
+    aesdec %xmm1, %xmm2
+    movups 64(ctx), %xmm1
+    aesdec %xmm1, %xmm2
+    movups 48(ctx), %xmm1
+    aesdec %xmm1, %xmm2
+    movups 32(ctx), %xmm1
+    aesdec %xmm1, %xmm2
+    movups 16(ctx), %xmm1
+    aesdec %xmm1, %xmm2
+    movups (ctx), %xmm1
+    aesdeclast %xmm1, %xmm2
+#endif
+
+    pxor iv, %xmm2          // obuf ^= iv;
+    movups (ibuf), iv       // memcpy(iv, tmp, AES_BLOCK_SIZE);
+
+    movups %xmm2, (obuf)    // write obuf
+
+    add $16, ibuf           // ibuf += AES_BLOCK_SIZE;
+    add $16, obuf           // obuf += AES_BLOCK_SIZE;
+    sub $1, num_blk         // num_blk--
+    jg 0b                   // if num_blk > 0, repeat the loop
+
+    jmp L_HW_cbc_done
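+
+    // The aes-192 path above and the aes-256 path below share the same control skeleton; in C
+    // terms (a sketch of the loop structure only):
+    //
+    //  while ((num_blk -= 4) >= 0) {
+    //      /* decrypt 4 blocks with interleaved aesdec rounds */
+    //  }
+    //  num_blk += 4;               // undo the final over-decrement
+    //  while (num_blk-- > 0) {
+    //      /* decrypt one block at a time */
+    //  }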
+
+    //
+    // aes-256 decrypt_cbc operation, after completion, branch to L_HW_cbc_done
+    //
+
+L_decrypt_256:
+
+    cmp $1, num_blk
+    jl L_HW_cbc_done
+
+    movups 224(ctx), %xmm3
+    movups 208(ctx), %xmm4
+    movups 192(ctx), %xmm5
+    movups 176(ctx), %xmm6
+    movups 160(ctx), %xmm7
+#if defined __x86_64__
+    movups 144(ctx), %xmm8
+    movups 128(ctx), %xmm9
+    movups 112(ctx), %xmm10
+    movups 96(ctx), %xmm11
+    movups 80(ctx), %xmm12
+    movups 64(ctx), %xmm13
+    movups 48(ctx), %xmm14
+    movups 32(ctx), %xmm15
+//  movups 16(ctx), %xmm14
+//  movups (ctx), %xmm15
+#endif
+
+#if defined __x86_64__
+
+    sub $4, num_blk         // pre-decrement num_blk by 4
+    jl 9f                   // if num_blk < 4, skip the per-4-blocks processing code
+0:
+    movups (ibuf), %xmm1    // tmp = 1st ibuf
+    movups 16(ibuf), %xmm2  // tmp = 2nd ibuf
+    movups 32(ibuf), %xmm14 // tmp = 3rd ibuf
+    movups 48(ibuf), %xmm15 // tmp = 4th ibuf
+
+    // aes_decrypt, for x86_64, the expanded keys are already stored in xmm3-xmm13
+    pxor %xmm3, %xmm1
+    pxor %xmm3, %xmm2
+    pxor %xmm3, %xmm14
+    pxor %xmm3, %xmm15
+
+    aesdec %xmm4, %xmm1
+    aesdec %xmm4, %xmm2
+    aesdec %xmm4, %xmm14
+    aesdec %xmm4, %xmm15
+
+    aesdec %xmm5, %xmm1
+    aesdec %xmm5, %xmm2
+    aesdec %xmm5, %xmm14
+    aesdec %xmm5, %xmm15
+
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+    aesdec %xmm6, %xmm14
+    aesdec %xmm6, %xmm15
+
+    aesdec %xmm7, %xmm1
+    aesdec %xmm7, %xmm2
+    aesdec %xmm7, %xmm14
+    aesdec %xmm7, %xmm15
+
+    aesdec %xmm8, %xmm1
+    aesdec %xmm8, %xmm2
+    aesdec %xmm8, %xmm14
+    aesdec %xmm8, %xmm15
+
+    aesdec %xmm9, %xmm1
+    aesdec %xmm9, %xmm2
+    aesdec %xmm9, %xmm14
+    aesdec %xmm9, %xmm15
+
+    aesdec %xmm10, %xmm1
+    aesdec %xmm10, %xmm2
+    aesdec %xmm10, %xmm14
+    aesdec %xmm10, %xmm15
+
+    aesdec %xmm11, %xmm1
+    aesdec %xmm11, %xmm2
+    aesdec %xmm11, %xmm14
+    aesdec %xmm11, %xmm15
+
+    aesdec %xmm12, %xmm1
+    aesdec %xmm12, %xmm2
+    aesdec %xmm12, %xmm14
+    aesdec %xmm12, %xmm15
+    movups 48(ctx), %xmm12
+
+    aesdec %xmm13, %xmm1
+    aesdec %xmm13, %xmm2
+    aesdec %xmm13, %xmm14
+    aesdec %xmm13, %xmm15
+    movups 32(ctx), %xmm13
+
+    aesdec %xmm12, %xmm1
+    aesdec %xmm12, %xmm2
+    aesdec %xmm12, %xmm14
+    aesdec %xmm12, %xmm15
+    movups 16(ctx), %xmm12
+
+    aesdec %xmm13, %xmm1
+    aesdec %xmm13, %xmm2
+    aesdec %xmm13, %xmm14
+    aesdec %xmm13, %xmm15
+    movups (ctx), %xmm13
+
+    aesdec %xmm12, %xmm1
+    aesdec %xmm12, %xmm2
+    aesdec %xmm12, %xmm14
+    aesdec %xmm12, %xmm15
+    movups 80(ctx), %xmm12
+
+    aesdeclast %xmm13, %xmm1
+    aesdeclast %xmm13, %xmm2
+    aesdeclast %xmm13, %xmm14
+    aesdeclast %xmm13, %xmm15
+    movups 64(ctx), %xmm13
+
+    pxor iv, %xmm1          // obuf ^= iv;
+    movups (ibuf), iv       // memcpy(iv, tmp, AES_BLOCK_SIZE);
+    pxor iv, %xmm2          // obuf ^= iv;
+    movups 16(ibuf), iv     // memcpy(iv, tmp, AES_BLOCK_SIZE);
+    pxor iv, %xmm14         // obuf ^= iv;
+    movups 32(ibuf), iv     // memcpy(iv, tmp, AES_BLOCK_SIZE);
+    pxor iv, %xmm15         // obuf ^= iv;
+    movups 48(ibuf), iv     // memcpy(iv, tmp, AES_BLOCK_SIZE);
+
+    movups %xmm1, (obuf)    // write 1st obuf
+    movups %xmm2, 16(obuf)  // write 2nd obuf
+    movups %xmm14, 32(obuf) // write 3rd obuf
+    movups %xmm15, 48(obuf) // write 4th obuf
+
+    add $64, ibuf           // ibuf += AES_BLOCK_SIZE*4;
+    add $64, obuf           // obuf += AES_BLOCK_SIZE*4;
+
+    sub $4, num_blk         // num_blk -= 4
+    jge 0b                  // if num_blk > 0, repeat the loop
+
+9:  add $4, num_blk         // post-increment num_blk by 4
+    je L_HW_cbc_done        // if num_blk == 0, no further processing is needed
+
+    movups 48(ctx), %xmm14
+    movups 32(ctx), %xmm15
+
+#else
+
+    sub $4, num_blk         // pre-decrement num_blk by 4
+    jl 9f                   // if num_blk < 4, skip the per-4-blocks processing code
+0:
+    movups (ibuf), %xmm1    // tmp = 1st ibuf
+    movups 16(ibuf), %xmm2  // tmp = 2nd ibuf
+    movups 32(ibuf), %xmm4  // tmp = 3rd ibuf
+    movups 48(ibuf), %xmm5  // tmp = 4th ibuf
+
+    // aes_decrypt
+    // for i386, sequentially load expanded keys into xmm6/xmm7
+    movups 208(ctx), %xmm6
+    pxor %xmm3, %xmm1
+    pxor %xmm3, %xmm2
+    pxor %xmm3, %xmm4
+    pxor %xmm3, %xmm5
+
+    movups 192(ctx), %xmm7
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+    aesdec %xmm6, %xmm4
+    aesdec %xmm6, %xmm5
+
+    movups 176(ctx), %xmm6
+    aesdec %xmm7, %xmm1
+    aesdec %xmm7, %xmm2
+    aesdec %xmm7, %xmm4
+    aesdec %xmm7, %xmm5
+
+    movups 160(ctx), %xmm7
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+    aesdec %xmm6, %xmm4
+    aesdec %xmm6, %xmm5
+
+    movups 144(ctx), %xmm6
+    aesdec %xmm7, %xmm1
+    aesdec %xmm7, %xmm2
+    aesdec %xmm7, %xmm4
+    aesdec %xmm7, %xmm5
+
+    movups 128(ctx), %xmm7
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+    aesdec %xmm6, %xmm4
+    aesdec %xmm6, %xmm5
+
+    movups 112(ctx), %xmm6
+    aesdec %xmm7, %xmm1
+    aesdec %xmm7, %xmm2
+    aesdec %xmm7, %xmm4
+    aesdec %xmm7, %xmm5
+
+    movups 96(ctx), %xmm7
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+    aesdec %xmm6, %xmm4
+    aesdec %xmm6, %xmm5
+
+    movups 80(ctx), %xmm6
+    aesdec %xmm7, %xmm1
+    aesdec %xmm7, %xmm2
+    aesdec %xmm7, %xmm4
+    aesdec %xmm7, %xmm5
+
+    movups 64(ctx), %xmm7
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+    aesdec %xmm6, %xmm4
+    aesdec %xmm6, %xmm5
+
+    movups 48(ctx), %xmm6
+    aesdec %xmm7, %xmm1
+    aesdec %xmm7, %xmm2
+    aesdec %xmm7, %xmm4
+    aesdec %xmm7, %xmm5
+
+    movups 32(ctx), %xmm7
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+    aesdec %xmm6, %xmm4
+    aesdec %xmm6, %xmm5
+
+    movups 16(ctx), %xmm6
+    aesdec %xmm7, %xmm1
+    aesdec %xmm7, %xmm2
+    aesdec %xmm7, %xmm4
+    aesdec %xmm7, %xmm5
+
+    movups 0(ctx), %xmm7
+    aesdec %xmm6, %xmm1
+    aesdec %xmm6, %xmm2
+    aesdec %xmm6, %xmm4
+    aesdec %xmm6, %xmm5
+
+    aesdeclast %xmm7, %xmm1
+    aesdeclast %xmm7, %xmm2
+    aesdeclast %xmm7, %xmm4
+    aesdeclast %xmm7, %xmm5
+
+    pxor iv, %xmm1          // 1st obuf ^= iv;
+    movups (ibuf), iv       // 1st memcpy(iv, tmp, AES_BLOCK_SIZE);
+    pxor iv, %xmm2          // 2nd obuf ^= iv;
+    movups 16(ibuf), iv     // 2nd memcpy(iv, tmp, AES_BLOCK_SIZE);
+    pxor iv, %xmm4          // 3rd obuf ^= iv;
+    movups 32(ibuf), iv     // 3rd memcpy(iv, tmp, AES_BLOCK_SIZE);
+    pxor iv, %xmm5          // 4th obuf ^= iv;
+    movups 48(ibuf), iv     // 4th memcpy(iv, tmp, AES_BLOCK_SIZE);
+    movups %xmm1, (obuf)    // write 1st obuf
+    movups %xmm2, 16(obuf)  // write 2nd obuf
+    movups %xmm4, 32(obuf)  // write 3rd obuf
+    movups %xmm5, 48(obuf)  // write 4th obuf
+
+    add $64, ibuf           // ibuf += AES_BLOCK_SIZE * 4;
+    add $64, obuf           // obuf += AES_BLOCK_SIZE * 4;
+
+    sub $4, num_blk         // num_blk -= 4
+    jge 0b                  // if num_blk > 0, repeat the loop
+
+
+9:  add $4, num_blk         // post-increment num_blk by 4
+    je L_HW_cbc_done        // if num_blk == 0, no further processing is needed
+
+    movups 208(ctx), %xmm4
+    movups 192(ctx), %xmm5
+    movups 176(ctx), %xmm6
+    movups 160(ctx), %xmm7
+
+#endif
+
+0:
+    movups (ibuf), %xmm2    // tmp = ibuf
+
+    // aes_decrypt
+    pxor %xmm3, %xmm2
+    aesdec %xmm4, %xmm2
+    aesdec %xmm5, %xmm2
+    aesdec %xmm6, %xmm2
+    aesdec %xmm7, %xmm2
+#if defined __x86_64__
+    aesdec %xmm8, %xmm2
+    aesdec %xmm9, %xmm2
+    aesdec %xmm10, %xmm2
+    aesdec %xmm11, %xmm2
+    aesdec %xmm12, %xmm2
+    aesdec %xmm13, %xmm2
+    aesdec %xmm14, %xmm2
+    aesdec %xmm15, %xmm2
+#else
+    movups 144(ctx), %xmm1
+    aesdec %xmm1, %xmm2
+    movups 128(ctx), %xmm1
+    aesdec %xmm1, %xmm2
+    movups 112(ctx), %xmm1
+    aesdec %xmm1, %xmm2
+    movups 96(ctx), %xmm1
+    aesdec %xmm1, %xmm2
+    movups 80(ctx), %xmm1
+    aesdec %xmm1, %xmm2
+    movups 64(ctx), %xmm1
+    aesdec %xmm1, %xmm2
+    movups 48(ctx), %xmm1
+    aesdec %xmm1, %xmm2
+    movups 32(ctx), %xmm1
+    aesdec %xmm1, %xmm2
+#endif
+    movups 16(ctx), %xmm1
+    aesdec %xmm1, %xmm2
+    movups (ctx), %xmm1
+    aesdeclast %xmm1, %xmm2
+
+    pxor iv, %xmm2          // obuf ^= iv;
+    movups (ibuf), iv       // memcpy(iv, tmp, AES_BLOCK_SIZE);
+
+    movups %xmm2, (obuf)    // write obuf
+
+    add $16, ibuf           // ibuf += AES_BLOCK_SIZE;
+    add $16, obuf           // obuf += AES_BLOCK_SIZE;
+    sub $1, num_blk         // num_blk--
+    jg 0b                   // if num_blk > 0, repeat the loop
+
+    jmp L_HW_cbc_done
+
+    //
+    // --------- END of aes_decrypt_cbc_hw -------------------
+    //
diff --git a/bsd/crypto/aes/i386/aes_x86_v2.s b/bsd/crypto/aes/i386/aes_x86_v2.s
deleted file mode 100644
index 7ed98adb8..000000000
--- a/bsd/crypto/aes/i386/aes_x86_v2.s
+++ /dev/null
@@ -1,1298 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License.
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * --------------------------------------------------------------------------- - * Copyright (c) 2002, Dr Brian Gladman, Worcester, UK. All rights reserved. - * - * LICENSE TERMS - * - * The free distribution and use of this software in both source and binary - * form is allowed (with or without changes) provided that: - * - * 1. distributions of this source code include the above copyright - * notice, this list of conditions and the following disclaimer; - * - * 2. distributions in binary form include the above copyright - * notice, this list of conditions and the following disclaimer - * in the documentation and/or other associated materials; - * - * 3. the copyright holder's name is not used to endorse products - * built using this software without specific written permission. - * - * ALTERNATIVELY, provided that this notice is retained in full, this product - * may be distributed under the terms of the GNU General Public License (GPL), - * in which case the provisions of the GPL apply INSTEAD OF those given above. - * - * DISCLAIMER - * - * This software is provided 'as is' with no explicit or implied warranties - * in respect of its properties, including, but not limited to, correctness - * and/or fitness for purpose. - * --------------------------------------------------------------------------- - * Issue 31/01/2006 - * - * This code requires either ASM_X86_V2 or ASM_X86_V2C to be set in aesopt.h - * and the same define to be set here as well. If AES_V2C is set this file - * requires the C files aeskey.c and aestab.c for support. - * - * This is a full assembler implementation covering encryption, decryption and - * key scheduling. It uses 2k bytes of tables but its encryption and decryption - * performance is very close to that obtained using large tables. Key schedule - * expansion is slower for both encryption and decryption but this is likely to - * be offset by the much smaller load that this version places on the processor - * cache. I acknowledge the contribution made by Daniel Bernstein to aspects of - * the design of the AES round function used here. - * - * This code provides the standard AES block size (128 bits, 16 bytes) and the - * three standard AES key sizes (128, 192 and 256 bits). It has the same call - * interface as my C implementation. The ebx, esi, edi and ebp registers are - * preserved across calls but eax, ecx and edx and the artihmetic status flags - * are not. 
- */ - -#include <mach/i386/asm.h> - -#define AES_128 /* define if AES with 128 bit keys is needed */ -#define AES_192 /* define if AES with 192 bit keys is needed */ -#define AES_256 /* define if AES with 256 bit keys is needed */ -#define AES_VAR /* define if a variable key size is needed */ -#define ENCRYPTION /* define if encryption is needed */ -#define DECRYPTION /* define if decryption is needed */ -#define AES_REV_DKS /* define if key decryption schedule is reversed */ - -#ifndef ASM_X86_V2C -#define ENCRYPTION_KEY_SCHEDULE /* define if enc. key expansion is needed */ -#define DECRYPTION_KEY_SCHEDULE /* define if dec. key expansion is needed */ -#endif - -/* - * The encryption key schedule has the following in memory layout where N is the - * number of rounds (10, 12 or 14): - * - * lo: | input key (round 0) | ; each round is four 32-bit words - * | encryption round 1 | - * | encryption round 2 | - * .... - * | encryption round N-1 | - * hi: | encryption round N | - * - * The decryption key schedule is normally set up so that it has the same - * layout as above by actually reversing the order of the encryption key - * schedule in memory (this happens when AES_REV_DKS is set): - * - * lo: | decryption round 0 | = | encryption round N | - * | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ] - * | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ] - * .... .... - * | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ] - * hi: | decryption round N | = | input key (round 0) | - * - * with rounds except the first and last modified using inv_mix_column() - * But if AES_REV_DKS is NOT set the order of keys is left as it is for - * encryption so that it has to be accessed in reverse when used for - * decryption (although the inverse mix column modifications are done) - * - * lo: | decryption round 0 | = | input key (round 0) | - * | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ] - * | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ] - * .... .... - * | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ] - * hi: | decryption round N | = | encryption round N | - * - * This layout is faster when the assembler key scheduling provided here - * is used. 
- */ - -/* End of user defines */ - -#ifdef AES_VAR -#ifndef AES_128 -#define AES_128 -#endif -#ifndef AES_192 -#define AES_192 -#endif -#ifndef AES_256 -#define AES_256 -#endif -#endif - -#ifdef AES_VAR -#define KS_LENGTH 60 -#else -#ifdef AES_256 -#define KS_LENGTH 60 -#else -#ifdef AES_192 -#define KS_LENGTH 52 -#else -#define KS_LENGTH 44 -#endif -#endif -#endif - -/* - * These macros implement stack based local variables - */ -#define save(r1) \ - movl %r1, (%esp); - -#define restore(r1) \ - movl (%esp), %r1; - -#define do_call(f, n) \ - call EXT(f); \ - addl $(n), %esp; - -/* - * finite field multiplies by {02}, {04} and {08} - */ -#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b)) -#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b)) -#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b)) - -/* - * finite field multiplies required in table generation - */ -#define f3(x) (f2(x) ^ x) -#define f9(x) (f8(x) ^ x) -#define fb(x) (f8(x) ^ f2(x) ^ x) -#define fd(x) (f8(x) ^ f4(x) ^ x) -#define fe(x) (f8(x) ^ f4(x) ^ f2(x)) - -#define etab_0(x) enc_tab+4(,x,8) -#define etab_1(x) enc_tab+3(,x,8) -#define etab_2(x) enc_tab+2(,x,8) -#define etab_3(x) enc_tab+1(,x,8) - -#define etab_b(x) etab_3(x) - -#define btab_0(x) enc_tab+6(,x,8) -#define btab_1(x) enc_tab+5(,x,8) -#define btab_2(x) enc_tab+4(,x,8) -#define btab_3(x) enc_tab+3(,x,8) - -/* - * ROUND FUNCTION. Build column[2] on ESI and column[3] on EDI that have the - * round keys pre-loaded. Build column[0] in EBP and column[1] in EBX. - * - * Input: - * - * EAX column[0] - * EBX column[1] - * ECX column[2] - * EDX column[3] - * ESI column key[round][2] - * EDI column key[round][3] - * EBP scratch - * - * Output: - * - * EBP column[0] unkeyed - * EBX column[1] unkeyed - * ESI column[2] keyed - * EDI column[3] keyed - * EAX scratch - * ECX scratch - * EDX scratch - */ -#define rnd_fun(m1, m2) \ - roll $16, %ebx; \ - \ - ## m1 ## _zo(esi, cl, 0, ebp); \ - m1(esi, dh, 1, ebp); \ - m1(esi, bh, 3, ebp); \ - ## m1 ## _zo(edi, dl, 0, ebp); \ - m1(edi, ah, 1, ebp); \ - m1(edi, bl, 2, ebp); \ - ## m2 ## _zo(ebp, al, 0, ebp); \ - \ - shrl $16, %ebx; \ - andl $0xffff0000, %eax; \ - orl %ebx, %eax; \ - shrl $16, %edx; \ - \ - m1(ebp, ah, 1, ebx); \ - m1(ebp, dh, 3, ebx); \ - m2(ebx, dl, 2, ebx); \ - m1(ebx, ch, 1, edx); \ - ## m1 ## _zo(ebx, al, 0, edx); \ - \ - shrl $16, %eax; \ - shrl $16, %ecx; \ - \ - m1(ebp, cl, 2, edx); \ - m1(edi, ch, 3, edx); \ - m1(esi, al, 2, edx); \ - m1(ebx, ah, 3, edx) - -/* - * Basic MOV and XOR Operations for normal rounds - */ -#define nr_xor_zo nr_xor -#define nr_xor(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - xorl etab_ ## r3(%r4), %r1; - -#define nr_mov_zo nr_mov -#define nr_mov(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - movl etab_ ## r3(%r4), %r1; - -/* - * Basic MOV and XOR Operations for last round - */ - -#if 1 - -#define lr_xor_zo(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - movzbl etab_b(%r4), %r4; \ - xor %r4, %r1; - -#define lr_xor(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - movzbl etab_b(%r4), %r4; \ - shll $(8*r3), %r4; \ - xor %r4, %r1; - -#define lr_mov_zo(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - movzbl etab_b(%r4), %r1; - -#define lr_mov(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - movzbl etab_b(%r4), %r1; \ - shll $(8*r3), %r1; - -#else /* less effective but worth leaving as an option */ - -#define lr_xor_zo lr_xor -#define lr_xor(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - mov btab_ ## r3(%r4), %r4; \ - andl $(0x000000ff << 8 * r3), %r4; \ - xor %r4, %r1; - -#define lr_mov_zo lr_mov -#define 
lr_mov(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - mov btab_ ## r3(%r4), %r1; \ - andl $(0x000000ff << 8 * r3), %r1; - -#endif - -/* - * Apply S-Box to the 4 bytes in a 32-bit word and rotate left 3 byte positions - * - * r1 : output is xored into this register - * r2 : input: a => eax, b => ebx, c => ecx, d => edx - * r3 : scratch register - */ - -#define l3s_col(r1, r2, r3) \ - lr_xor_zo(r1, ## r2 ## h, 0, r3); \ - lr_xor(r1, ## r2 ## l, 3, r3); \ - shrl $16, %e ## r2 ## x; \ - lr_xor(r1, ## r2 ## h, 2, r3); \ - lr_xor(r1, ## r2 ## l, 1, r3); - -/* - * offsets to parameters - */ -#define in_blk 4 /* input byte array address parameter */ -#define out_blk 8 /* output byte array address parameter */ -#define ctx 12 /* AES context structure */ -#define stk_spc 20 /* stack space */ - -#ifdef ENCRYPTION - -#define ENCRYPTION_TABLE - -#define enc_round \ - addl $16, %ebp; \ - save(ebp); \ - movl 8(%ebp), %esi; \ - movl 12(%ebp), %edi; \ - \ - rnd_fun(nr_xor, nr_mov); \ - \ - movl %ebp, %eax; \ - movl %esi, %ecx; \ - movl %edi, %edx; \ - restore(ebp); \ - xorl (%ebp), %eax; \ - xorl 4(%ebp), %ebx; - -#define enc_last_round \ - addl $16, %ebp; \ - save(ebp); \ - movl 8(%ebp), %esi; \ - movl 12(%ebp), %edi; \ - \ - rnd_fun(lr_xor, lr_mov); \ - \ - movl %ebp, %eax; \ - restore(ebp); \ - xorl (%ebp), %eax; \ - xorl 4(%ebp), %ebx; - - .section __TEXT, __text - -/* - * AES Encryption Subroutine - */ -Entry(aes_encrypt) - - subl $stk_spc, %esp - movl %ebp, 16(%esp) - movl %ebx, 12(%esp) - movl %esi, 8(%esp) - movl %edi, 4(%esp) - - movl in_blk+stk_spc(%esp), %esi /* input pointer */ - movl (%esi), %eax - movl 4(%esi), %ebx - movl 8(%esi), %ecx - movl 12(%esi), %edx - - movl ctx+stk_spc(%esp), %ebp /* key pointer */ - movzbl 4*KS_LENGTH(%ebp), %edi - xorl (%ebp), %eax - xorl 4(%ebp), %ebx - xorl 8(%ebp), %ecx - xorl 12(%ebp), %edx - - /* - * determine the number of rounds - */ - cmpl $10*16, %edi - je aes_encrypt.3 - cmpl $12*16, %edi - je aes_encrypt.2 - cmpl $14*16, %edi - je aes_encrypt.1 - movl $-1, %eax - jmp aes_encrypt.5 - -aes_encrypt.1: - enc_round - enc_round -aes_encrypt.2: - enc_round - enc_round -aes_encrypt.3: - enc_round - enc_round - enc_round - enc_round - enc_round - enc_round - enc_round - enc_round - enc_round - enc_last_round - - movl out_blk+stk_spc(%esp), %edx - movl %eax, (%edx) - movl %ebx, 4(%edx) - movl %esi, 8(%edx) - movl %edi, 12(%edx) - xorl %eax, %eax - -aes_encrypt.5: - movl 16(%esp), %ebp - movl 12(%esp), %ebx - movl 8(%esp), %esi - movl 4(%esp), %edi - addl $stk_spc, %esp - ret - -#endif - -/* - * For r2 == 16, or r2 == 24 && r1 == 7, or r2 ==32 && r1 == 6 - */ -#define f_key(r1, r2, rc_val) \ - l3s_col(esi, a, ebx); \ - xorl $rc_val, %esi; \ - \ - movl %esi, r1*r2(%ebp); \ - xorl %esi, %edi; \ - movl %edi, r1*r2+4(%ebp); \ - xorl %edi, %ecx; \ - movl %ecx, r1*r2+8(%ebp); \ - xorl %ecx, %edx; \ - movl %edx, r1*r2+12(%ebp); \ - movl %edx, %eax; - -/* - * For r2 == 24 && r1 == 0 to 6 - */ -#define f_key_24(r1, r2, rc_val) \ - f_key(r1, r2, rc_val); \ - \ - xorl r1*r2+16-r2(%ebp), %eax; \ - movl %eax, r1*r2+16(%ebp); \ - xorl r1*r2+20-r2(%ebp), %eax; \ - movl %eax, r1*r2+20(%ebp); - -/* - * For r2 ==32 && r1 == 0 to 5 - */ -#define f_key_32(r1, r2, rc_val) \ - f_key(r1, r2, rc_val); \ - \ - roll $8, %eax; \ - pushl %edx; \ - movl r1*r2+16-r2(%ebp), %edx; \ - l3s_col(edx, a, ebx); \ - movl %edx, %eax; \ - popl %edx; \ - movl %eax, r1*r2+16(%ebp); \ - xorl r1*r2+20-r2(%ebp), %eax; \ - movl %eax, r1*r2+20(%ebp); \ - xorl r1*r2+24-r2(%ebp), %eax; \ - movl %eax, r1*r2+24(%ebp); \ 
- xorl r1*r2+28-r2(%ebp), %eax; \ - movl %eax, r1*r2+28(%ebp); - -#ifdef ENCRYPTION_KEY_SCHEDULE - -#ifdef AES_128 - -#ifndef ENCRYPTION_TABLE -#define ENCRYPTION_TABLE -#endif - -Entry(aes_encrypt_key128) - - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - - movl 24(%esp), %ebp - movl $10*16, 4*KS_LENGTH(%ebp) - movl 20(%esp), %ebx - - movl (%ebx), %esi - movl %esi, (%ebp) - movl 4(%ebx), %edi - movl %edi, 4(%ebp) - movl 8(%ebx), %ecx - movl %ecx, 8(%ebp) - movl 12(%ebx), %edx - movl %edx, 12(%ebp) - addl $16, %ebp - movl %edx, %eax - - f_key(0, 16, 1) - f_key(1, 16, 2) - f_key(2, 16, 4) - f_key(3, 16, 8) - f_key(4, 16, 16) - f_key(5, 16, 32) - f_key(6, 16, 64) - f_key(7, 16, 128) - f_key(8, 16, 27) - f_key(9, 16, 54) - - popl %edi - popl %esi - popl %ebx - popl %ebp - xorl %eax, %eax - ret - -#endif - -#ifdef AES_192 - -#ifndef ENCRYPTION_TABLE -#define ENCRYPTION_TABLE -#endif - -Entry(aes_encrypt_key192) - - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - - movl 24(%esp), %ebp - movl $12*16, 4*KS_LENGTH(%ebp) - movl 20(%esp), %ebx - - movl (%ebx), %esi - movl %esi, (%ebp) - movl 4(%ebx), %edi - movl %edi, 4(%ebp) - movl 8(%ebx), %ecx - movl %ecx, 8(%ebp) - movl 12(%ebx), %edx - movl %edx, 12(%ebp) - movl 16(%ebx), %eax - movl %eax, 16(%ebp) - movl 20(%ebx), %eax - movl %eax, 20(%ebp) - addl $24, %ebp - - f_key_24(0, 24, 1) - f_key_24(1, 24, 2) - f_key_24(2, 24, 4) - f_key_24(3, 24, 8) - f_key_24(4, 24, 16) - f_key_24(5, 24, 32) - f_key_24(6, 24, 64) - f_key(7, 24, 128) - - popl %edi - popl %esi - popl %ebx - popl %ebp - xorl %eax, %eax - ret - -#endif - -#ifdef AES_256 - -#ifndef ENCRYPTION_TABLE -#define ENCRYPTION_TABLE -#endif - -Entry(aes_encrypt_key256) - - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - - movl 24(%esp), %ebp - movl $14*16, 4*KS_LENGTH(%ebp) - movl 20(%esp), %ebx - - movl (%ebx), %esi - movl %esi, (%ebp) - movl 4(%ebx), %edi - movl %edi, 4(%ebp) - movl 8(%ebx), %ecx - movl %ecx, 8(%ebp) - movl 12(%ebx), %edx - movl %edx, 12(%ebp) - movl 16(%ebx), %eax - movl %eax, 16(%ebp) - movl 20(%ebx), %eax - movl %eax, 20(%ebp) - movl 24(%ebx), %eax - movl %eax, 24(%ebp) - movl 28(%ebx), %eax - movl %eax, 28(%ebp) - addl $32, %ebp - - f_key_32(0, 32, 1) - f_key_32(1, 32, 2) - f_key_32(2, 32, 4) - f_key_32(3, 32, 8) - f_key_32(4, 32, 16) - f_key_32(5, 32, 32) - f_key(6, 32, 64) - - popl %edi - popl %esi - popl %ebx - popl %ebp - xorl %eax, %eax - ret - -#endif - -#ifdef AES_VAR - -#ifndef ENCRYPTION_TABLE -#define ENCRYPTION_TABLE -#endif - -Entry(aes_encrypt_key) - - movl 4(%esp), %ecx - movl 8(%esp), %eax - movl 12(%esp), %edx - pushl %edx - pushl %ecx - - cmpl $16, %eax - je aes_encrypt_key.1 - cmpl $128, %eax - je aes_encrypt_key.1 - - cmpl $24, %eax - je aes_encrypt_key.2 - cmpl $192, %eax - je aes_encrypt_key.2 - - cmpl $32, %eax - je aes_encrypt_key.3 - cmpl $256, %eax - je aes_encrypt_key.3 - movl $-1, %eax - addl $8, %esp - ret - -aes_encrypt_key.1: - do_call(aes_encrypt_key128, 8) - ret -aes_encrypt_key.2: - do_call(aes_encrypt_key192, 8) - ret -aes_encrypt_key.3: - do_call(aes_encrypt_key256, 8) - ret - -#endif - -#endif - -#ifdef ENCRYPTION_TABLE - -# S-box data - 256 entries - - .section __DATA, __data - .align ALIGN - -#define u8(x) 0, x, x, f3(x), f2(x), x, x, f3(x) - -enc_tab: - .byte u8(0x63),u8(0x7c),u8(0x77),u8(0x7b),u8(0xf2),u8(0x6b),u8(0x6f),u8(0xc5) - .byte u8(0x30),u8(0x01),u8(0x67),u8(0x2b),u8(0xfe),u8(0xd7),u8(0xab),u8(0x76) - .byte u8(0xca),u8(0x82),u8(0xc9),u8(0x7d),u8(0xfa),u8(0x59),u8(0x47),u8(0xf0) - .byte 
u8(0xad),u8(0xd4),u8(0xa2),u8(0xaf),u8(0x9c),u8(0xa4),u8(0x72),u8(0xc0) - .byte u8(0xb7),u8(0xfd),u8(0x93),u8(0x26),u8(0x36),u8(0x3f),u8(0xf7),u8(0xcc) - .byte u8(0x34),u8(0xa5),u8(0xe5),u8(0xf1),u8(0x71),u8(0xd8),u8(0x31),u8(0x15) - .byte u8(0x04),u8(0xc7),u8(0x23),u8(0xc3),u8(0x18),u8(0x96),u8(0x05),u8(0x9a) - .byte u8(0x07),u8(0x12),u8(0x80),u8(0xe2),u8(0xeb),u8(0x27),u8(0xb2),u8(0x75) - .byte u8(0x09),u8(0x83),u8(0x2c),u8(0x1a),u8(0x1b),u8(0x6e),u8(0x5a),u8(0xa0) - .byte u8(0x52),u8(0x3b),u8(0xd6),u8(0xb3),u8(0x29),u8(0xe3),u8(0x2f),u8(0x84) - .byte u8(0x53),u8(0xd1),u8(0x00),u8(0xed),u8(0x20),u8(0xfc),u8(0xb1),u8(0x5b) - .byte u8(0x6a),u8(0xcb),u8(0xbe),u8(0x39),u8(0x4a),u8(0x4c),u8(0x58),u8(0xcf) - .byte u8(0xd0),u8(0xef),u8(0xaa),u8(0xfb),u8(0x43),u8(0x4d),u8(0x33),u8(0x85) - .byte u8(0x45),u8(0xf9),u8(0x02),u8(0x7f),u8(0x50),u8(0x3c),u8(0x9f),u8(0xa8) - .byte u8(0x51),u8(0xa3),u8(0x40),u8(0x8f),u8(0x92),u8(0x9d),u8(0x38),u8(0xf5) - .byte u8(0xbc),u8(0xb6),u8(0xda),u8(0x21),u8(0x10),u8(0xff),u8(0xf3),u8(0xd2) - .byte u8(0xcd),u8(0x0c),u8(0x13),u8(0xec),u8(0x5f),u8(0x97),u8(0x44),u8(0x17) - .byte u8(0xc4),u8(0xa7),u8(0x7e),u8(0x3d),u8(0x64),u8(0x5d),u8(0x19),u8(0x73) - .byte u8(0x60),u8(0x81),u8(0x4f),u8(0xdc),u8(0x22),u8(0x2a),u8(0x90),u8(0x88) - .byte u8(0x46),u8(0xee),u8(0xb8),u8(0x14),u8(0xde),u8(0x5e),u8(0x0b),u8(0xdb) - .byte u8(0xe0),u8(0x32),u8(0x3a),u8(0x0a),u8(0x49),u8(0x06),u8(0x24),u8(0x5c) - .byte u8(0xc2),u8(0xd3),u8(0xac),u8(0x62),u8(0x91),u8(0x95),u8(0xe4),u8(0x79) - .byte u8(0xe7),u8(0xc8),u8(0x37),u8(0x6d),u8(0x8d),u8(0xd5),u8(0x4e),u8(0xa9) - .byte u8(0x6c),u8(0x56),u8(0xf4),u8(0xea),u8(0x65),u8(0x7a),u8(0xae),u8(0x08) - .byte u8(0xba),u8(0x78),u8(0x25),u8(0x2e),u8(0x1c),u8(0xa6),u8(0xb4),u8(0xc6) - .byte u8(0xe8),u8(0xdd),u8(0x74),u8(0x1f),u8(0x4b),u8(0xbd),u8(0x8b),u8(0x8a) - .byte u8(0x70),u8(0x3e),u8(0xb5),u8(0x66),u8(0x48),u8(0x03),u8(0xf6),u8(0x0e) - .byte u8(0x61),u8(0x35),u8(0x57),u8(0xb9),u8(0x86),u8(0xc1),u8(0x1d),u8(0x9e) - .byte u8(0xe1),u8(0xf8),u8(0x98),u8(0x11),u8(0x69),u8(0xd9),u8(0x8e),u8(0x94) - .byte u8(0x9b),u8(0x1e),u8(0x87),u8(0xe9),u8(0xce),u8(0x55),u8(0x28),u8(0xdf) - .byte u8(0x8c),u8(0xa1),u8(0x89),u8(0x0d),u8(0xbf),u8(0xe6),u8(0x42),u8(0x68) - .byte u8(0x41),u8(0x99),u8(0x2d),u8(0x0f),u8(0xb0),u8(0x54),u8(0xbb),u8(0x16) - -#endif - -#ifdef DECRYPTION - -#define DECRYPTION_TABLE - -#define dtab_0(x) dec_tab(,x,8) -#define dtab_1(x) dec_tab+3(,x,8) -#define dtab_2(x) dec_tab+2(,x,8) -#define dtab_3(x) dec_tab+1(,x,8) -#define dtab_x(x) dec_tab+7(,x,8) - -#define irn_fun(m1, m2) \ - roll $16, %eax; \ - \ - ## m1 ## _zo(esi, cl, 0, ebp); \ - m1(esi, bh, 1, ebp); \ - m1(esi, al, 2, ebp); \ - ## m1 ## _zo(edi, dl, 0, ebp); \ - m1(edi, ch, 1, ebp); \ - m1(edi, ah, 3, ebp); \ - ## m2 ## _zo(ebp, bl, 0, ebp); \ - \ - shrl $16, %eax; \ - andl $0xffff0000, %ebx; \ - orl %eax, %ebx; \ - shrl $16, %ecx; \ - \ - m1(ebp, bh, 1, eax); \ - m1(ebp, ch, 3, eax); \ - m2(eax, cl, 2, ecx); \ - ## m1 ## _zo(eax, bl, 0, ecx); \ - m1(eax, dh, 1, ecx); \ - \ - shrl $16, %ebx; \ - shrl $16, %edx; \ - \ - m1(esi, dh, 3, ecx); \ - m1(ebp, dl, 2, ecx); \ - m1(eax, bh, 3, ecx); \ - m1(edi, bl, 2, ecx); - -/* - * Basic MOV and XOR Operations for normal rounds - */ -#define ni_xor_zo ni_xor -#define ni_xor(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - xorl dtab_ ## r3 ## (%r4), %r1; - -#define ni_mov_zo ni_mov -#define ni_mov(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - movl dtab_ ## r3 ## (%r4), %r1; - -/* - * Basic MOV and XOR Operations for last round - */ - -#define 
li_xor_zo(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - movzbl dtab_x(%r4), %r4; \ - xor %r4, %r1; - -#define li_xor(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - movzbl dtab_x(%r4), %r4; \ - shll $(8*r3), %r4; \ - xor %r4, %r1; - -#define li_mov_zo(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - movzbl dtab_x(%r4), %r1; - -#define li_mov(r1, r2, r3, r4) \ - movzbl %r2, %r4; \ - movzbl dtab_x(%r4), %r1; \ - shl $(8*r3), %r1; - -#ifdef AES_REV_DKS - -#define dec_round \ - addl $16, %ebp; \ - save(ebp); \ - movl 8(%ebp), %esi; \ - movl 12(%ebp), %edi; \ - \ - irn_fun(ni_xor, ni_mov); \ - \ - movl %ebp, %ebx; \ - movl %esi, %ecx; \ - movl %edi, %edx; \ - restore(ebp); \ - xorl (%ebp), %eax; \ - xorl 4(%ebp), %ebx; - -#define dec_last_round \ - addl $16, %ebp; \ - save(ebp); \ - movl 8(%ebp), %esi; \ - movl 12(%ebp), %edi; \ - \ - irn_fun(li_xor, li_mov); \ - \ - movl %ebp, %ebx; \ - restore(ebp); \ - xorl (%ebp), %eax; \ - xorl 4(%ebp), %ebx; - -#else - -#define dec_round \ - subl $16, %ebp; \ - save(ebp); \ - movl 8(%ebp), %esi; \ - movl 12(%ebp), %edi; \ - \ - irn_fun(ni_xor, ni_mov); \ - \ - movl %ebp, %ebx; \ - movl %esi, %ecx; \ - movl %edi, %edx; \ - restore(ebp); \ - xorl (%ebp), %eax; \ - xorl 4(%ebp), %ebx; - -#define dec_last_round \ - subl $16, %ebp; \ - save(ebp); \ - movl 8(%ebp), %esi; \ - movl 12(%ebp), %edi; \ - \ - irn_fun(li_xor, li_mov); \ - \ - movl %ebp, %ebx; \ - restore(ebp); \ - xorl (%ebp), %eax; \ - xorl 4(%ebp), %ebx; - -#endif /* AES_REV_DKS */ - - .section __TEXT, __text - -/* - * AES Decryption Subroutine - */ -Entry(aes_decrypt) - - subl $stk_spc, %esp - movl %ebp, 16(%esp) - movl %ebx, 12(%esp) - movl %esi, 8(%esp) - movl %edi, 4(%esp) - - /* - * input four columns and xor in first round key - */ - movl in_blk+stk_spc(%esp), %esi /* input pointer */ - movl (%esi), %eax - movl 4(%esi), %ebx - movl 8(%esi), %ecx - movl 12(%esi), %edx - leal 16(%esi), %esi - - movl ctx+stk_spc(%esp), %ebp /* key pointer */ - movzbl 4*KS_LENGTH(%ebp), %edi -#ifndef AES_REV_DKS /* if decryption key schedule is not reversed */ - leal (%ebp,%edi), %ebp /* we have to access it from the top down */ -#endif - xorl (%ebp), %eax /* key schedule */ - xorl 4(%ebp), %ebx - xorl 8(%ebp), %ecx - xorl 12(%ebp), %edx - - /* - * determine the number of rounds - */ - cmpl $10*16, %edi - je aes_decrypt.3 - cmpl $12*16, %edi - je aes_decrypt.2 - cmpl $14*16, %edi - je aes_decrypt.1 - movl $-1, %eax - jmp aes_decrypt.5 - -aes_decrypt.1: - dec_round - dec_round -aes_decrypt.2: - dec_round - dec_round -aes_decrypt.3: - dec_round - dec_round - dec_round - dec_round - dec_round - dec_round - dec_round - dec_round - dec_round - dec_last_round - - /* - * move final values to the output array. 
- */ - movl out_blk+stk_spc(%esp), %ebp - movl %eax, (%ebp) - movl %ebx, 4(%ebp) - movl %esi, 8(%ebp) - movl %edi, 12(%ebp) - xorl %eax, %eax - -aes_decrypt.5: - movl 16(%esp), %ebp - movl 12(%esp), %ebx - movl 8(%esp), %esi - movl 4(%esp), %edi - addl $stk_spc, %esp - ret - -#endif - -#define inv_mix_col \ - movzbl %dl, %ebx; \ - movzbl etab_b(%ebx), %ebx; \ - movl dtab_0(%ebx), %eax; \ - movzbl %dh, %ebx; \ - shrl $16, %edx; \ - movzbl etab_b(%ebx), %ebx; \ - xorl dtab_1(%ebx), %eax; \ - movzbl %dl, %ebx; \ - movzbl etab_b(%ebx), %ebx; \ - xorl dtab_2(%ebx), %eax; \ - movzbl %dh, %ebx; \ - movzbl etab_b(%ebx), %ebx; \ - xorl dtab_3(%ebx), %eax; - -#ifdef DECRYPTION_KEY_SCHEDULE - -#ifdef AES_128 - -#ifndef DECRYPTION_TABLE -#define DECRYPTION_TABLE -#endif - -Entry(aes_decrypt_key128) - - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 24(%esp), %eax /* context */ - movl 20(%esp), %edx /* key */ - pushl %eax - pushl %edx - do_call(aes_encrypt_key128, 8) - movl $10*16, %eax - movl 24(%esp), %esi /* pointer to first round key */ - leal (%esi,%eax), %edi /* pointer to last round key */ - addl $32, %esi - /* the inverse mix column transformation */ - movl -16(%esi), %edx /* needs to be applied to all round keys */ - inv_mix_col - movl %eax, -16(%esi) /* transforming the four sub-keys in the */ - movl -12(%esi), %edx /* second round key */ - inv_mix_col - movl %eax, -12(%esi) /* transformations for subsequent rounds */ - movl -8(%esi), %edx /* can then be made more efficient by */ - inv_mix_col - movl %eax, -8(%esi) /* in the encryption round key ek[r]: */ - movl -4(%esi), %edx - inv_mix_col - movl %eax, -4(%esi) /* where n is 1..3. Hence the corresponding */ - -aes_decrypt_key128.0: - movl (%esi), %edx /* subkeys in the decryption round key dk[r] */ - inv_mix_col - movl %eax, (%esi) /* GF(256): */ - xorl -12(%esi), %eax - movl %eax, 4(%esi) /* dk[r][n] = dk[r][n-1] ^ dk[r-1][n] */ - xorl -8(%esi), %eax - movl %eax, 8(%esi) /* So we only need one inverse mix column */ - xorl -4(%esi), %eax /* operation (n = 0) for each four word cycle */ - movl %eax, 12(%esi) /* in the expanded key. */ - addl $16, %esi - cmpl %esi, %edi - jg aes_decrypt_key128.0 - jmp dec_end - -#endif - -#ifdef AES_192 - -#ifndef DECRYPTION_TABLE -#define DECRYPTION_TABLE -#endif - -Entry(aes_decrypt_key192) - - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 24(%esp), %eax /* context */ - movl 20(%esp), %edx /* key */ - pushl %eax - pushl %edx - do_call(aes_encrypt_key192, 8) - movl $12*16, %eax - movl 24(%esp), %esi /* first round key */ - leal (%esi,%eax), %edi /* last round key */ - addl $48, %esi /* the first 6 words are the key, of */ - /* which the top 2 words are part of */ - movl -32(%esi), %edx /* the second round key and hence */ - inv_mix_col - movl %eax, -32(%esi) /* need to do a further six values prior */ - movl -28(%esi), %edx /* to using a more efficient technique */ - inv_mix_col - movl %eax, -28(%esi) - /* dk[r][n] = dk[r][n-1] ^ dk[r-1][n] */ - movl -24(%esi), %edx - inv_mix_col - movl %eax, -24(%esi) /* cycle is now 6 words long */ - movl -20(%esi), %edx - inv_mix_col - movl %eax, -20(%esi) - movl -16(%esi), %edx - inv_mix_col - movl %eax, -16(%esi) - movl -12(%esi), %edx - inv_mix_col - movl %eax, -12(%esi) - movl -8(%esi), %edx - inv_mix_col - movl %eax, -8(%esi) - movl -4(%esi), %edx - inv_mix_col - movl %eax, -4(%esi) - -aes_decrypt_key192.0: - movl (%esi), %edx /* expanded key is 13 * 4 = 44 32-bit words */ - inv_mix_col - movl %eax, (%esi) /* using inv_mix_col. 
We have already done 8 */ - xorl -20(%esi), %eax /* of these so 36 are left - hence we need */ - movl %eax, 4(%esi) /* exactly 6 loops of six here */ - xorl -16(%esi), %eax - movl %eax, 8(%esi) - xorl -12(%esi), %eax - movl %eax, 12(%esi) - xorl -8(%esi), %eax - movl %eax, 16(%esi) - xorl -4(%esi), %eax - movl %eax, 20(%esi) - addl $24, %esi - cmpl %esi, %edi - jg aes_decrypt_key192.0 - jmp dec_end - -#endif - -#ifdef AES_256 - -#ifndef DECRYPTION_TABLE -#define DECRYPTION_TABLE -#endif - -Entry(aes_decrypt_key256) - - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 24(%esp), %eax - movl 20(%esp), %edx - pushl %eax - pushl %edx - do_call(aes_encrypt_key256, 8) - movl $14*16, %eax - movl 24(%esp), %esi - leal (%esi,%eax), %edi - addl $64, %esi - - movl -48(%esi), %edx /* the primary key is 8 words, of which */ - inv_mix_col - movl %eax, -48(%esi) - movl -44(%esi), %edx - inv_mix_col - movl %eax, -44(%esi) - movl -40(%esi), %edx - inv_mix_col - movl %eax, -40(%esi) - movl -36(%esi), %edx - inv_mix_col - movl %eax, -36(%esi) - - movl -32(%esi), %edx /* the encryption key expansion cycle is */ - inv_mix_col - movl %eax, -32(%esi) /* start by doing one complete block */ - movl -28(%esi), %edx - inv_mix_col - movl %eax, -28(%esi) - movl -24(%esi), %edx - inv_mix_col - movl %eax, -24(%esi) - movl -20(%esi), %edx - inv_mix_col - movl %eax, -20(%esi) - movl -16(%esi), %edx - inv_mix_col - movl %eax, -16(%esi) - movl -12(%esi), %edx - inv_mix_col - movl %eax, -12(%esi) - movl -8(%esi), %edx - inv_mix_col - movl %eax, -8(%esi) - movl -4(%esi), %edx - inv_mix_col - movl %eax, -4(%esi) - -aes_decrypt_key256.0: - movl (%esi), %edx /* we can now speed up the remaining */ - inv_mix_col - movl %eax, (%esi) /* outlined earlier. But note that */ - xorl -28(%esi), %eax /* there is one extra inverse mix */ - movl %eax, 4(%esi) /* column operation as the 256 bit */ - xorl -24(%esi), %eax /* key has an extra non-linear step */ - movl %eax, 8(%esi) /* for the midway element. 
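The interleaved comments above describe the key-schedule shortcut: the encryption expansion satisfies ek[r][n] = ek[r][n-1] ^ ek[r-1][n] for n = 1..3, and because InvMixColumns is linear over XOR, the transformed keys obey the same recurrence, so only one inverse mix column is needed per cycle. A portable C sketch of the AES-128 case, mirroring the gf_mulx and upr macros defined in the aesopt.h header removed later in this patch (the function names here are illustrative):

    #include <stdint.h>

    /* Per-byte multiply by {02} in GF(2^8), as gf_mulx in aesopt.h. */
    static uint32_t gf_mulx(uint32_t x)
    {
        return ((x & 0x7f7f7f7fu) << 1) ^ (((x & 0x80808080u) >> 7) * 0x1bu);
    }

    /* Little-endian byte rotation, the header's upr(x,n). */
    static uint32_t upr(uint32_t x, int n)
    {
        return (x << (8 * n)) | (x >> (32 - 8 * n));
    }

    /* InvMixColumns on one 32-bit column: the {0e,0b,0d,09} matrix. */
    static uint32_t inv_mix_col(uint32_t x)
    {
        uint32_t g2 = gf_mulx(x);                /* {02}.x */
        uint32_t g9 = x ^ gf_mulx(gf_mulx(g2));  /* {09}.x */
        uint32_t gd = gf_mulx(g2) ^ g9;          /* {0d}.x */
        return x ^ g2 ^ gd ^ upr(g2 ^ g9, 3) ^ upr(gd, 2) ^ upr(g9, 1);
    }

    /* AES-128: ks[44] holds the encryption schedule; convert in place so
       that only ks[4r] needs inv_mix_col, the rest follow the recurrence
       dk[r][n] = dk[r][n-1] ^ dk[r-1][n]. */
    static void make_dec_schedule_128(uint32_t ks[44])
    {
        int r, n;
        for (n = 0; n < 4; n++)             /* round key 1: all four words */
            ks[4 + n] = inv_mix_col(ks[4 + n]);
        for (r = 2; r <= 9; r++) {          /* round keys 2..9 */
            ks[4 * r] = inv_mix_col(ks[4 * r]);
            for (n = 1; n < 4; n++)
                ks[4 * r + n] = ks[4 * r + n - 1] ^ ks[4 * (r - 1) + n];
        }
        /* round keys 0 and 10 are used unchanged */
    }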
*/ - xorl -20(%esi), %eax - movl %eax, 12(%esi) /* the expanded key is 15 * 4 = 60 */ - movl 16(%esi), %edx /* 32-bit words of which 52 need to */ - inv_mix_col - movl %eax, 16(%esi) /* 12 so 40 are left - which means */ - xorl -12(%esi), %eax /* that we need exactly 5 loops of 8 */ - movl %eax, 20(%esi) - xorl -8(%esi), %eax - movl %eax, 24(%esi) - xorl -4(%esi), %eax - movl %eax, 28(%esi) - addl $32, %esi - cmpl %esi, %edi - jg aes_decrypt_key256.0 - -#endif - -dec_end: - -#ifdef AES_REV_DKS - - movl 24(%esp), %esi /* this reverses the order of the */ -dec_end.1: - movl (%esi), %eax /* round keys if required */ - movl 4(%esi), %ebx - movl (%edi), %ebp - movl 4(%edi), %edx - movl %ebp, (%esi) - movl %edx, 4(%esi) - movl %eax, (%edi) - movl %ebx, 4(%edi) - - movl 8(%esi), %eax - movl 12(%esi), %ebx - movl 8(%edi), %ebp - movl 12(%edi), %edx - movl %ebp, 8(%esi) - movl %edx, 12(%esi) - movl %eax, 8(%edi) - movl %ebx, 12(%edi) - - addl $16, %esi - subl $16, %edi - cmpl %esi, %edi - jg dec_end.1 - -#endif - - popl %edi - popl %esi - popl %ebx - popl %ebp - xorl %eax, %eax - ret - -#ifdef AES_VAR - -Entry(aes_decrypt_key) - - movl 4(%esp), %ecx - movl 8(%esp), %eax - movl 12(%esp), %edx - pushl %edx - pushl %ecx - - cmpl $16, %eax - je aes_decrypt_key.1 - cmpl $128, %eax - je aes_decrypt_key.1 - - cmpl $24, %eax - je aes_decrypt_key.2 - cmpl $192, %eax - je aes_decrypt_key.2 - - cmpl $32, %eax - je aes_decrypt_key.3 - cmpl $256, %eax - je aes_decrypt_key.3 - movl $-1, %eax - addl $8, %esp - ret - -aes_decrypt_key.1: - do_call(aes_decrypt_key128, 8) - ret -aes_decrypt_key.2: - do_call(aes_decrypt_key192, 8) - ret -aes_decrypt_key.3: - do_call(aes_decrypt_key256, 8) - ret - -#endif - -#endif - -#ifdef DECRYPTION_TABLE - -/* - * Inverse S-box data - 256 entries - */ - - .section __DATA, __data - .align ALIGN - -#define v8(x) fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x - -dec_tab: - .byte v8(0x52),v8(0x09),v8(0x6a),v8(0xd5),v8(0x30),v8(0x36),v8(0xa5),v8(0x38) - .byte v8(0xbf),v8(0x40),v8(0xa3),v8(0x9e),v8(0x81),v8(0xf3),v8(0xd7),v8(0xfb) - .byte v8(0x7c),v8(0xe3),v8(0x39),v8(0x82),v8(0x9b),v8(0x2f),v8(0xff),v8(0x87) - .byte v8(0x34),v8(0x8e),v8(0x43),v8(0x44),v8(0xc4),v8(0xde),v8(0xe9),v8(0xcb) - .byte v8(0x54),v8(0x7b),v8(0x94),v8(0x32),v8(0xa6),v8(0xc2),v8(0x23),v8(0x3d) - .byte v8(0xee),v8(0x4c),v8(0x95),v8(0x0b),v8(0x42),v8(0xfa),v8(0xc3),v8(0x4e) - .byte v8(0x08),v8(0x2e),v8(0xa1),v8(0x66),v8(0x28),v8(0xd9),v8(0x24),v8(0xb2) - .byte v8(0x76),v8(0x5b),v8(0xa2),v8(0x49),v8(0x6d),v8(0x8b),v8(0xd1),v8(0x25) - .byte v8(0x72),v8(0xf8),v8(0xf6),v8(0x64),v8(0x86),v8(0x68),v8(0x98),v8(0x16) - .byte v8(0xd4),v8(0xa4),v8(0x5c),v8(0xcc),v8(0x5d),v8(0x65),v8(0xb6),v8(0x92) - .byte v8(0x6c),v8(0x70),v8(0x48),v8(0x50),v8(0xfd),v8(0xed),v8(0xb9),v8(0xda) - .byte v8(0x5e),v8(0x15),v8(0x46),v8(0x57),v8(0xa7),v8(0x8d),v8(0x9d),v8(0x84) - .byte v8(0x90),v8(0xd8),v8(0xab),v8(0x00),v8(0x8c),v8(0xbc),v8(0xd3),v8(0x0a) - .byte v8(0xf7),v8(0xe4),v8(0x58),v8(0x05),v8(0xb8),v8(0xb3),v8(0x45),v8(0x06) - .byte v8(0xd0),v8(0x2c),v8(0x1e),v8(0x8f),v8(0xca),v8(0x3f),v8(0x0f),v8(0x02) - .byte v8(0xc1),v8(0xaf),v8(0xbd),v8(0x03),v8(0x01),v8(0x13),v8(0x8a),v8(0x6b) - .byte v8(0x3a),v8(0x91),v8(0x11),v8(0x41),v8(0x4f),v8(0x67),v8(0xdc),v8(0xea) - .byte v8(0x97),v8(0xf2),v8(0xcf),v8(0xce),v8(0xf0),v8(0xb4),v8(0xe6),v8(0x73) - .byte v8(0x96),v8(0xac),v8(0x74),v8(0x22),v8(0xe7),v8(0xad),v8(0x35),v8(0x85) - .byte v8(0xe2),v8(0xf9),v8(0x37),v8(0xe8),v8(0x1c),v8(0x75),v8(0xdf),v8(0x6e) - .byte 
v8(0x47),v8(0xf1),v8(0x1a),v8(0x71),v8(0x1d),v8(0x29),v8(0xc5),v8(0x89) - .byte v8(0x6f),v8(0xb7),v8(0x62),v8(0x0e),v8(0xaa),v8(0x18),v8(0xbe),v8(0x1b) - .byte v8(0xfc),v8(0x56),v8(0x3e),v8(0x4b),v8(0xc6),v8(0xd2),v8(0x79),v8(0x20) - .byte v8(0x9a),v8(0xdb),v8(0xc0),v8(0xfe),v8(0x78),v8(0xcd),v8(0x5a),v8(0xf4) - .byte v8(0x1f),v8(0xdd),v8(0xa8),v8(0x33),v8(0x88),v8(0x07),v8(0xc7),v8(0x31) - .byte v8(0xb1),v8(0x12),v8(0x10),v8(0x59),v8(0x27),v8(0x80),v8(0xec),v8(0x5f) - .byte v8(0x60),v8(0x51),v8(0x7f),v8(0xa9),v8(0x19),v8(0xb5),v8(0x4a),v8(0x0d) - .byte v8(0x2d),v8(0xe5),v8(0x7a),v8(0x9f),v8(0x93),v8(0xc9),v8(0x9c),v8(0xef) - .byte v8(0xa0),v8(0xe0),v8(0x3b),v8(0x4d),v8(0xae),v8(0x2a),v8(0xf5),v8(0xb0) - .byte v8(0xc8),v8(0xeb),v8(0xbb),v8(0x3c),v8(0x83),v8(0x53),v8(0x99),v8(0x61) - .byte v8(0x17),v8(0x2b),v8(0x04),v8(0x7e),v8(0xba),v8(0x77),v8(0xd6),v8(0x26) - .byte v8(0xe1),v8(0x69),v8(0x14),v8(0x63),v8(0x55),v8(0x21),v8(0x0c),v8(0x7d) - -#endif diff --git a/bsd/crypto/aes/i386/aesopt.h b/bsd/crypto/aes/i386/aesopt.h deleted file mode 100644 index 025eb5fcf..000000000 --- a/bsd/crypto/aes/i386/aesopt.h +++ /dev/null @@ -1,719 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue 31/01/2006 - - This file contains the compilation options for AES (Rijndael) and code - that is common across encryption, key scheduling and table generation. - - OPERATION - - These source code files implement the AES algorithm Rijndael designed by - Joan Daemen and Vincent Rijmen. This version is designed for the standard - block size of 16 bytes and for key sizes of 128, 192 and 256 bits (16, 24 - and 32 bytes). - - This version is designed for flexibility and speed using operations on - 32-bit words rather than operations on bytes. It can be compiled with - either big or little endian internal byte order but is faster when the - native byte order for the processor is used. - - THE CIPHER INTERFACE - - The cipher interface is implemented as an array of bytes in which lower - AES bit sequence indexes map to higher numeric significance within bytes. 
- - uint_8t (an unsigned 8-bit type) - uint_32t (an unsigned 32-bit type) - struct aes_encrypt_ctx (structure for the cipher encryption context) - struct aes_decrypt_ctx (structure for the cipher decryption context) - aes_rval the function return type - - C subroutine calls: - - aes_rval aes_encrypt_key128(const unsigned char *key, aes_encrypt_ctx cx[1]); - aes_rval aes_encrypt_key192(const unsigned char *key, aes_encrypt_ctx cx[1]); - aes_rval aes_encrypt_key256(const unsigned char *key, aes_encrypt_ctx cx[1]); - aes_rval aes_encrypt(const unsigned char *in, unsigned char *out, - const aes_encrypt_ctx cx[1]); - - aes_rval aes_decrypt_key128(const unsigned char *key, aes_decrypt_ctx cx[1]); - aes_rval aes_decrypt_key192(const unsigned char *key, aes_decrypt_ctx cx[1]); - aes_rval aes_decrypt_key256(const unsigned char *key, aes_decrypt_ctx cx[1]); - aes_rval aes_decrypt(const unsigned char *in, unsigned char *out, - const aes_decrypt_ctx cx[1]); - - IMPORTANT NOTE: If you are using this C interface with dynamic tables make sure that - you call gen_tabs() before AES is used so that the tables are initialised. - - C++ aes class subroutines: - - Class AESencrypt for encryption - - Constructors: - AESencrypt(void) - AESencrypt(const unsigned char *key) - 128 bit key - Members: - aes_rval key128(const unsigned char *key) - aes_rval key192(const unsigned char *key) - aes_rval key256(const unsigned char *key) - aes_rval encrypt(const unsigned char *in, unsigned char *out) const - - Class AESdecrypt for decryption - Constructors: - AESdecrypt(void) - AESdecrypt(const unsigned char *key) - 128 bit key - Members: - aes_rval key128(const unsigned char *key) - aes_rval key192(const unsigned char *key) - aes_rval key256(const unsigned char *key) - aes_rval decrypt(const unsigned char *in, unsigned char *out) const -*/ - -#if !defined( _AESOPT_H ) -#define _AESOPT_H - -#if defined( __cplusplus ) -#include "aescpp.h" -#else -#include "crypto/aes.h" -#endif - -/* PLATFORM SPECIFIC INCLUDES */ - -#include "edefs.h" - -/* CONFIGURATION - THE USE OF DEFINES - - Later in this section there are a number of defines that control the - operation of the code. In each section, the purpose of each define is - explained so that the relevant form can be included or excluded by - setting either 1's or 0's respectively on the branches of the related - #if clauses. The following local defines should not be changed. -*/ - -#define ENCRYPTION_IN_C 1 -#define DECRYPTION_IN_C 2 -#define ENC_KEYING_IN_C 4 -#define DEC_KEYING_IN_C 8 - -#define NO_TABLES 0 -#define ONE_TABLE 1 -#define FOUR_TABLES 4 -#define NONE 0 -#define PARTIAL 1 -#define FULL 2 - -/* --- START OF USER CONFIGURED OPTIONS --- */ - -/* 1. BYTE ORDER WITHIN 32 BIT WORDS - - The fundamental data processing units in Rijndael are 8-bit bytes. The - input, output and key input are all enumerated arrays of bytes in which - bytes are numbered starting at zero and increasing to one less than the - number of bytes in the array in question. This enumeration is only used - for naming bytes and does not imply any adjacency or order relationship - from one byte to another. When these inputs and outputs are considered - as bit sequences, bits 8*n to 8*n+7 of the bit sequence are mapped to - byte[n] with bit 8n+i in the sequence mapped to bit 7-i within the byte. - In this implementation bits are numbered from 0 to 7 starting at the - numerically least significant end of each byte (bit n represents 2^n).
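The word packing chosen in the paragraphs that follow is easiest to see in code. The following self-test mirrors the header's IS_LITTLE_ENDIAN bytes2word/bval forms defined further down, with uint32_t and uint8_t standing in for the header's uint_32t and uint_8t:

    #include <stdint.h>
    #include <assert.h>

    /* Little-endian packing: byte[4n+k] becomes the k-th least
       significant byte of word[n]. */
    static uint32_t bytes2word_le(uint8_t b0, uint8_t b1, uint8_t b2, uint8_t b3)
    {
        return ((uint32_t)b3 << 24) | ((uint32_t)b2 << 16) |
               ((uint32_t)b1 << 8)  | b0;
    }

    /* Extract byte n of a packed word, as the header's bval(x,n). */
    static uint8_t bval_le(uint32_t w, int n)
    {
        return (uint8_t)(w >> (8 * n));
    }

    int main(void)
    {
        uint32_t w = bytes2word_le(0x00, 0x11, 0x22, 0x33);
        assert(w == 0x33221100);        /* byte 0 lands in the low lane */
        assert(bval_le(w, 2) == 0x22);  /* round-trips */
        return 0;
    }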
- - However, Rijndael can be implemented more efficiently using 32-bit - words by packing bytes into words so that bytes 4*n to 4*n+3 are placed - into word[n]. While in principle these bytes can be assembled into words - in any positions, this implementation only supports the two formats in - which bytes in adjacent positions within words also have adjacent byte - numbers. This order is called big-endian if the lowest numbered bytes - in words have the highest numeric significance and little-endian if the - opposite applies. - - This code can work in either order irrespective of the order used by the - machine on which it runs. Normally the internal byte order will be set - to the order of the processor on which the code is to be run but this - define can be used to reverse this in special situations - - WARNING: Assembler code versions rely on PLATFORM_BYTE_ORDER being set. - This define will hence be redefined later (in section 4) if necessary -*/ - -#if 1 -#define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER -#elif 0 -#define ALGORITHM_BYTE_ORDER IS_LITTLE_ENDIAN -#elif 0 -#define ALGORITHM_BYTE_ORDER IS_BIG_ENDIAN -#else -#error The algorithm byte order is not defined -#endif - -/* 2. VIA ACE SUPPORT - - Define this option if support for the VIA ACE is required. This uses - inline assembler instructions and is only implemented for the Microsoft, - Intel and GCC compilers. If VIA ACE is known to be present, then defining - ASSUME_VIA_ACE_PRESENT will remove the ordinary encryption/decryption - code. If USE_VIA_ACE_IF_PRESENT is defined then VIA ACE will be used if - it is detected (both present and enabled) but the normal AES code will - also be present. - - When VIA ACE is to be used, all AES encryption contexts MUST be 16 byte - aligned; other input/output buffers do not need to be 16 byte aligned - but there are very large performance gains if this can be arranged. - VIA ACE also requires the decryption key schedule to be in reverse - order (which the following defines ensure). -*/ - -#if 0 && !defined( _WIN64 ) && !defined( USE_VIA_ACE_IF_PRESENT ) -#define USE_VIA_ACE_IF_PRESENT -#endif - -#if 0 && !defined( _WIN64 ) && !defined( ASSUME_VIA_ACE_PRESENT ) -#define ASSUME_VIA_ACE_PRESENT -#endif - -/* 3. ASSEMBLER SUPPORT - - This define (which can be on the command line) enables the use of the - assembler code routines for encryption, decryption and key scheduling - as follows: - - ASM_X86_V1C uses the assembler (aes_x86_v1.asm) with large tables for - encryption and decryption and but with key scheduling in C - ASM_X86_V2 uses assembler (aes_x86_v2.asm) with compressed tables for - encryption, decryption and key scheduling - ASM_X86_V2C uses assembler (aes_x86_v2.asm) with compressed tables for - encryption and decryption and but with key scheduling in C - ASM_AMD64_C uses assembler (aes_amd64.asm) with compressed tables for - encryption and decryption and but with key scheduling in C - - Change one 'if 0' below to 'if 1' to select the version or define - as a compilation option. 
-*/ - -#if defined ( ASM_X86_V1C ) || defined( ASM_X86_V2 ) || defined( ASM_X86_V2C ) -# if defined( _M_IX86 ) -# if 0 && !defined( ASM_X86_V1C ) -# define ASM_X86_V1C -# elif 0 && !defined( ASM_X86_V2 ) -# define ASM_X86_V2 -# elif 0 && !defined( ASM_X86_V2C ) -# define ASM_X86_V2C -# endif -# else -# error Assembler code is only available for x86 and AMD64 systems -# endif -#elif defined( ASM_AMD64_C ) -# if defined( _M_X64 ) -# if 0 && !defined( ASM_AMD64_C ) -# define ASM_AMD64_C -# endif -# else -# error Assembler code is only available for x86 and AMD64 systems -# endif -#endif - -/* 4. FAST INPUT/OUTPUT OPERATIONS. - - On some machines it is possible to improve speed by transferring the - bytes in the input and output arrays to and from the internal 32-bit - variables by addressing these arrays as if they are arrays of 32-bit - words. On some machines this will always be possible but there may - be a large performance penalty if the byte arrays are not aligned on - the normal word boundaries. On other machines this technique will - lead to memory access errors when such 32-bit word accesses are not - properly aligned. The option SAFE_IO avoids such problems but will - often be slower on those machines that support misaligned access - (especially so if care is taken to align the input and output byte - arrays on 32-bit word boundaries). If SAFE_IO is not defined it is - assumed that access to byte arrays as if they are arrays of 32-bit - words will not cause problems when such accesses are misaligned. -*/ -#if 1 && !defined( _MSC_VER ) -#define SAFE_IO -#endif - -/* 5. LOOP UNROLLING - - The code for encryption and decrytpion cycles through a number of rounds - that can be implemented either in a loop or by expanding the code into a - long sequence of instructions, the latter producing a larger program but - one that will often be much faster. The latter is called loop unrolling. - There are also potential speed advantages in expanding two iterations in - a loop with half the number of iterations, which is called partial loop - unrolling. The following options allow partial or full loop unrolling - to be set independently for encryption and decryption -*/ -#if 1 -#define ENC_UNROLL FULL -#elif 0 -#define ENC_UNROLL PARTIAL -#else -#define ENC_UNROLL NONE -#endif - -#if 1 -#define DEC_UNROLL FULL -#elif 0 -#define DEC_UNROLL PARTIAL -#else -#define DEC_UNROLL NONE -#endif - -/* 6. FAST FINITE FIELD OPERATIONS - - If this section is included, tables are used to provide faster finite - field arithmetic (this has no effect if FIXED_TABLES is defined). -*/ -#if 1 -#define FF_TABLES -#endif - -/* 7. INTERNAL STATE VARIABLE FORMAT - - The internal state of Rijndael is stored in a number of local 32-bit - word varaibles which can be defined either as an array or as individual - names variables. Include this section if you want to store these local - varaibles in arrays. Otherwise individual local variables will be used. -*/ -#if 1 -#define ARRAYS -#endif - -/* 8. FIXED OR DYNAMIC TABLES - - When this section is included the tables used by the code are compiled - statically into the binary file. Otherwise the subroutine gen_tabs() - must be called to compute them before the code is first used. -*/ -#if 0 && !(defined( _MSC_VER ) && ( _MSC_VER <= 800 )) -#define FIXED_TABLES -#endif - -/* 9. TABLE ALIGNMENT - - On some sytsems speed will be improved by aligning the AES large lookup - tables on particular boundaries. This define should be set to a power of - two giving the desired alignment. 
It can be left undefined if alignment - is not needed. This option is specific to the Microsft VC++ compiler - - it seems to sometimes cause trouble for the VC++ version 6 compiler. -*/ - -#if 1 && defined( _MSC_VER ) && ( _MSC_VER >= 1300 ) -#define TABLE_ALIGN 32 -#endif - -/* 10. TABLE OPTIONS - - This cipher proceeds by repeating in a number of cycles known as 'rounds' - which are implemented by a round function which can optionally be speeded - up using tables. The basic tables are each 256 32-bit words, with either - one or four tables being required for each round function depending on - how much speed is required. The encryption and decryption round functions - are different and the last encryption and decrytpion round functions are - different again making four different round functions in all. - - This means that: - 1. Normal encryption and decryption rounds can each use either 0, 1 - or 4 tables and table spaces of 0, 1024 or 4096 bytes each. - 2. The last encryption and decryption rounds can also use either 0, 1 - or 4 tables and table spaces of 0, 1024 or 4096 bytes each. - - Include or exclude the appropriate definitions below to set the number - of tables used by this implementation. -*/ - -#if 1 /* set tables for the normal encryption round */ -#define ENC_ROUND FOUR_TABLES -#elif 0 -#define ENC_ROUND ONE_TABLE -#else -#define ENC_ROUND NO_TABLES -#endif - -#if 1 /* set tables for the last encryption round */ -#define LAST_ENC_ROUND FOUR_TABLES -#elif 0 -#define LAST_ENC_ROUND ONE_TABLE -#else -#define LAST_ENC_ROUND NO_TABLES -#endif - -#if 1 /* set tables for the normal decryption round */ -#define DEC_ROUND FOUR_TABLES -#elif 0 -#define DEC_ROUND ONE_TABLE -#else -#define DEC_ROUND NO_TABLES -#endif - -#if 1 /* set tables for the last decryption round */ -#define LAST_DEC_ROUND FOUR_TABLES -#elif 0 -#define LAST_DEC_ROUND ONE_TABLE -#else -#define LAST_DEC_ROUND NO_TABLES -#endif - -/* The decryption key schedule can be speeded up with tables in the same - way that the round functions can. Include or exclude the following - defines to set this requirement. -*/ -#if 1 -#define KEY_SCHED FOUR_TABLES -#elif 0 -#define KEY_SCHED ONE_TABLE -#else -#define KEY_SCHED NO_TABLES -#endif - -/* ---- END OF USER CONFIGURED OPTIONS ---- */ - -/* VIA ACE support is only available for VC++ and GCC */ - -#if !defined( _MSC_VER ) && !defined( __GNUC__ ) -# if defined( ASSUME_VIA_ACE_PRESENT ) -# undef ASSUME_VIA_ACE_PRESENT -# endif -# if defined( USE_VIA_ACE_IF_PRESENT ) -# undef USE_VIA_ACE_IF_PRESENT -# endif -#endif - -#if defined( ASSUME_VIA_ACE_PRESENT ) && !defined( USE_VIA_ACE_IF_PRESENT ) -#define USE_VIA_ACE_IF_PRESENT -#endif - -#if defined( USE_VIA_ACE_IF_PRESENT ) && !defined ( AES_REV_DKS ) -#define AES_REV_DKS -#endif - -/* Assembler support requires the use of platform byte order */ - -#if ( defined( ASM_X86_V1C ) || defined( ASM_X86_V2C ) || defined( ASM_AMD64_C ) ) && (ALGORITHM_BYTE_ORDER != PLATFORM_BYTE_ORDER) -#undef ALGORITHM_BYTE_ORDER -#define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER -#endif - -/* In this implementation the columns of the state array are each held in - 32-bit words. The state array can be held in various ways: in an array - of words, in a number of individual word variables or in a number of - processor registers. The following define maps a variable name x and - a column number c to the way the state array variable is to be held. 
- The first define below maps the state into an array x[c] whereas the - second form maps the state into a number of individual variables x0, - x1, etc. Another form could map individual state colums to machine - register names. -*/ - -#if defined( ARRAYS ) -#define s(x,c) x[c] -#else -#define s(x,c) x##c -#endif - -/* This implementation provides subroutines for encryption, decryption - and for setting the three key lengths (separately) for encryption - and decryption. Since not all functions are needed, masks are set - up here to determine which will be implemented in C -*/ - -#if !defined( AES_ENCRYPT ) -# define EFUNCS_IN_C 0 -#elif defined( ASSUME_VIA_ACE_PRESENT ) || defined( ASM_X86_V1C ) - || defined( ASM_X86_V2C ) || defined( ASM_AMD64_C ) -# define EFUNCS_IN_C ENC_KEYING_IN_C -#elif !defined( ASM_X86_V2 ) -# define EFUNCS_IN_C ( ENCRYPTION_IN_C | ENC_KEYING_IN_C ) -#else -# define EFUNCS_IN_C 0 -#endif - -#if !defined( AES_DECRYPT ) -# define DFUNCS_IN_C 0 -#elif defined( ASSUME_VIA_ACE_PRESENT ) || defined( ASM_X86_V1C ) - || defined( ASM_X86_V2C ) || defined( ASM_AMD64_C ) -# define DFUNCS_IN_C DEC_KEYING_IN_C -#elif !defined( ASM_X86_V2 ) -# define DFUNCS_IN_C ( DECRYPTION_IN_C | DEC_KEYING_IN_C ) -#else -# define DFUNCS_IN_C 0 -#endif - -#define FUNCS_IN_C ( EFUNCS_IN_C | DFUNCS_IN_C ) - -/* END OF CONFIGURATION OPTIONS */ - -#define RC_LENGTH (5 * (AES_BLOCK_SIZE / 4 - 2)) - -/* Disable or report errors on some combinations of options */ - -#if ENC_ROUND == NO_TABLES && LAST_ENC_ROUND != NO_TABLES -#undef LAST_ENC_ROUND -#define LAST_ENC_ROUND NO_TABLES -#elif ENC_ROUND == ONE_TABLE && LAST_ENC_ROUND == FOUR_TABLES -#undef LAST_ENC_ROUND -#define LAST_ENC_ROUND ONE_TABLE -#endif - -#if ENC_ROUND == NO_TABLES && ENC_UNROLL != NONE -#undef ENC_UNROLL -#define ENC_UNROLL NONE -#endif - -#if DEC_ROUND == NO_TABLES && LAST_DEC_ROUND != NO_TABLES -#undef LAST_DEC_ROUND -#define LAST_DEC_ROUND NO_TABLES -#elif DEC_ROUND == ONE_TABLE && LAST_DEC_ROUND == FOUR_TABLES -#undef LAST_DEC_ROUND -#define LAST_DEC_ROUND ONE_TABLE -#endif - -#if DEC_ROUND == NO_TABLES && DEC_UNROLL != NONE -#undef DEC_UNROLL -#define DEC_UNROLL NONE -#endif - -#if defined( bswap32 ) -#define aes_sw32 bswap32 -#elif defined( bswap_32 ) -#define aes_sw32 bswap_32 -#else -#define brot(x,n) (((uint_32t)(x) << n) | ((uint_32t)(x) >> (32 - n))) -#define aes_sw32(x) ((brot((x),8) & 0x00ff00ff) | (brot((x),24) & 0xff00ff00)) -#endif - -/* upr(x,n): rotates bytes within words by n positions, moving bytes to - higher index positions with wrap around into low positions - ups(x,n): moves bytes by n positions to higher index positions in - words but without wrap around - bval(x,n): extracts a byte from a word - - WARNING: The definitions given here are intended only for use with - unsigned variables and with shift counts that are compile - time constants -*/ - -#if ( ALGORITHM_BYTE_ORDER == IS_LITTLE_ENDIAN ) -#define upr(x,n) (((uint_32t)(x) << (8 * (n))) | ((uint_32t)(x) >> (32 - 8 * (n)))) -#define ups(x,n) ((uint_32t) (x) << (8 * (n))) -#define bval(x,n) ((uint_8t)((x) >> (8 * (n)))) -#define bytes2word(b0, b1, b2, b3) \ - (((uint_32t)(b3) << 24) | ((uint_32t)(b2) << 16) | ((uint_32t)(b1) << 8) | (b0)) -#endif - -#if ( ALGORITHM_BYTE_ORDER == IS_BIG_ENDIAN ) -#define upr(x,n) (((uint_32t)(x) >> (8 * (n))) | ((uint_32t)(x) << (32 - 8 * (n)))) -#define ups(x,n) ((uint_32t) (x) >> (8 * (n))) -#define bval(x,n) ((uint_8t)((x) >> (24 - 8 * (n)))) -#define bytes2word(b0, b1, b2, b3) \ - (((uint_32t)(b0) << 24) | 
((uint_32t)(b1) << 16) | ((uint_32t)(b2) << 8) | (b3)) -#endif - -#if defined( SAFE_IO ) - -#define word_in(x,c) bytes2word(((const uint_8t*)(x)+4*c)[0], ((const uint_8t*)(x)+4*c)[1], \ - ((const uint_8t*)(x)+4*c)[2], ((const uint_8t*)(x)+4*c)[3]) -#define word_out(x,c,v) { ((uint_8t*)(x)+4*c)[0] = bval(v,0); ((uint_8t*)(x)+4*c)[1] = bval(v,1); \ - ((uint_8t*)(x)+4*c)[2] = bval(v,2); ((uint_8t*)(x)+4*c)[3] = bval(v,3); } - -#elif ( ALGORITHM_BYTE_ORDER == PLATFORM_BYTE_ORDER ) - -#define word_in(x,c) (*((uint_32t*)(x)+(c))) -#define word_out(x,c,v) (*((uint_32t*)(x)+(c)) = (v)) - -#else - -#define word_in(x,c) aes_sw32(*((uint_32t*)(x)+(c))) -#define word_out(x,c,v) (*((uint_32t*)(x)+(c)) = aes_sw32(v)) - -#endif - -/* the finite field modular polynomial and elements */ - -#define WPOLY 0x011b -#define BPOLY 0x1b - -/* multiply four bytes in GF(2^8) by 'x' {02} in parallel */ - -#define m1 0x80808080 -#define m2 0x7f7f7f7f -#define gf_mulx(x) ((((x) & m2) << 1) ^ ((((x) & m1) >> 7) * BPOLY)) - -/* The following defines provide alternative definitions of gf_mulx that might - give improved performance if a fast 32-bit multiply is not available. Note - that a temporary variable u needs to be defined where gf_mulx is used. - -#define gf_mulx(x) (u = (x) & m1, u |= (u >> 1), ((x) & m2) << 1) ^ ((u >> 3) | (u >> 6)) -#define m4 (0x01010101 * BPOLY) -#define gf_mulx(x) (u = (x) & m1, ((x) & m2) << 1) ^ ((u - (u >> 7)) & m4) -*/ - -/* Work out which tables are needed for the different options */ - -#if defined( ASM_X86_V1C ) -#if defined( ENC_ROUND ) -#undef ENC_ROUND -#endif -#define ENC_ROUND FOUR_TABLES -#if defined( LAST_ENC_ROUND ) -#undef LAST_ENC_ROUND -#endif -#define LAST_ENC_ROUND FOUR_TABLES -#if defined( DEC_ROUND ) -#undef DEC_ROUND -#endif -#define DEC_ROUND FOUR_TABLES -#if defined( LAST_DEC_ROUND ) -#undef LAST_DEC_ROUND -#endif -#define LAST_DEC_ROUND FOUR_TABLES -#if defined( KEY_SCHED ) -#undef KEY_SCHED -#define KEY_SCHED FOUR_TABLES -#endif -#endif - -#if ( FUNCS_IN_C & ENCRYPTION_IN_C ) || defined( ASM_X86_V1C ) -#if ENC_ROUND == ONE_TABLE -#define FT1_SET -#elif ENC_ROUND == FOUR_TABLES -#define FT4_SET -#else -#define SBX_SET -#endif -#if LAST_ENC_ROUND == ONE_TABLE -#define FL1_SET -#elif LAST_ENC_ROUND == FOUR_TABLES -#define FL4_SET -#elif !defined( SBX_SET ) -#define SBX_SET -#endif -#endif - -#if ( FUNCS_IN_C & DECRYPTION_IN_C ) || defined( ASM_X86_V1C ) -#if DEC_ROUND == ONE_TABLE -#define IT1_SET -#elif DEC_ROUND == FOUR_TABLES -#define IT4_SET -#else -#define ISB_SET -#endif -#if LAST_DEC_ROUND == ONE_TABLE -#define IL1_SET -#elif LAST_DEC_ROUND == FOUR_TABLES -#define IL4_SET -#elif !defined(ISB_SET) -#define ISB_SET -#endif -#endif - -#if (FUNCS_IN_C & ENC_KEYING_IN_C) || (FUNCS_IN_C & DEC_KEYING_IN_C) -#if KEY_SCHED == ONE_TABLE -#define LS1_SET -#elif KEY_SCHED == FOUR_TABLES -#define LS4_SET -#elif !defined( SBX_SET ) -#define SBX_SET -#endif -#endif - -#if (FUNCS_IN_C & DEC_KEYING_IN_C) -#if KEY_SCHED == ONE_TABLE -#define IM1_SET -#elif KEY_SCHED == FOUR_TABLES -#define IM4_SET -#elif !defined( SBX_SET ) -#define SBX_SET -#endif -#endif - -/* generic definitions of Rijndael macros that use tables */ - -#define no_table(x,box,vf,rf,c) bytes2word( \ - box[bval(vf(x,0,c),rf(0,c))], \ - box[bval(vf(x,1,c),rf(1,c))], \ - box[bval(vf(x,2,c),rf(2,c))], \ - box[bval(vf(x,3,c),rf(3,c))]) - -#define one_table(x,op,tab,vf,rf,c) \ - ( tab[bval(vf(x,0,c),rf(0,c))] \ - ^ op(tab[bval(vf(x,1,c),rf(1,c))],1) \ - ^ op(tab[bval(vf(x,2,c),rf(2,c))],2) \ - ^ 
op(tab[bval(vf(x,3,c),rf(3,c))],3)) - -#define four_tables(x,tab,vf,rf,c) \ - ( tab[0][bval(vf(x,0,c),rf(0,c))] \ - ^ tab[1][bval(vf(x,1,c),rf(1,c))] \ - ^ tab[2][bval(vf(x,2,c),rf(2,c))] \ - ^ tab[3][bval(vf(x,3,c),rf(3,c))]) - -#define vf1(x,r,c) (x) -#define rf1(r,c) (r) -#define rf2(r,c) ((8+r-c)&3) - -/* perform forward and inverse column mix operation on four bytes in long word x in */ -/* parallel. NOTE: x must be a simple variable, NOT an expression in these macros. */ - -#if defined( FM4_SET ) /* not currently used */ -#define fwd_mcol(x) four_tables(x,t_use(f,m),vf1,rf1,0) -#elif defined( FM1_SET ) /* not currently used */ -#define fwd_mcol(x) one_table(x,upr,t_use(f,m),vf1,rf1,0) -#else -#define dec_fmvars uint_32t g2 -#define fwd_mcol(x) (g2 = gf_mulx(x), g2 ^ upr((x) ^ g2, 3) ^ upr((x), 2) ^ upr((x), 1)) -#endif - -#if defined( IM4_SET ) -#define inv_mcol(x) four_tables(x,t_use(i,m),vf1,rf1,0) -#elif defined( IM1_SET ) -#define inv_mcol(x) one_table(x,upr,t_use(i,m),vf1,rf1,0) -#else -#define dec_imvars uint_32t g2, g4, g9 -#define inv_mcol(x) (g2 = gf_mulx(x), g4 = gf_mulx(g2), g9 = (x) ^ gf_mulx(g4), g4 ^= g9, \ - (x) ^ g2 ^ g4 ^ upr(g2 ^ g9, 3) ^ upr(g4, 2) ^ upr(g9, 1)) -#endif - -#if defined( FL4_SET ) -#define ls_box(x,c) four_tables(x,t_use(f,l),vf1,rf2,c) -#elif defined( LS4_SET ) -#define ls_box(x,c) four_tables(x,t_use(l,s),vf1,rf2,c) -#elif defined( FL1_SET ) -#define ls_box(x,c) one_table(x,upr,t_use(f,l),vf1,rf2,c) -#elif defined( LS1_SET ) -#define ls_box(x,c) one_table(x,upr,t_use(l,s),vf1,rf2,c) -#else -#define ls_box(x,c) no_table(x,t_use(s,box),vf1,rf2,c) -#endif - -#if defined( ASM_X86_V1C ) && defined( AES_DECRYPT ) && !defined( ISB_SET ) -#define ISB_SET -#endif - -#endif diff --git a/bsd/crypto/aes/i386/aesxts.c b/bsd/crypto/aes/i386/aesxts.c new file mode 100644 index 000000000..c0eaaa609 --- /dev/null +++ b/bsd/crypto/aes/i386/aesxts.c @@ -0,0 +1,392 @@ +/* + * Copyright (c) 2010 Apple Inc. All Rights Reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +#include "aesxts.h" +#include <sys/types.h> +#include <string.h> +#include <libkern/libkern.h> + +int +aes_encrypt_key(const uint8_t *key, int key_len, aesedp_encrypt_ctx cx[1]); + +int +aes_decrypt_key(const uint8_t *key, int key_len, aesedp_decrypt_ctx cx[1]); + +int +aes_encrypt(const uint8_t *Plaintext, uint8_t *Ciphertext, aesedp_encrypt_ctx *ctx); + +int +aes_decrypt(const uint8_t *Ciphertext, uint8_t *Plaintext, aesedp_decrypt_ctx *ctx); + + +/* error codes [will be expanded in future releases] */ +enum { + CRYPT_OK=0, /* Result OK */ + CRYPT_ERROR=1, /* Generic Error */ + CRYPT_INVALID_KEYSIZE=3, /* Invalid key size given */ + CRYPT_INVALID_ARG=16, /* Generic invalid argument */ +}; + +static int +aesedp_keysize(int *keysize) +{ + switch (*keysize) { + case 16: + case 24: + case 32: + return CRYPT_OK; + default: + return CRYPT_INVALID_KEYSIZE; + } +} + +static int +aesedp_setup(const uint8_t *key, int keylen, int num_rounds __unused, aesedp_ctx *skey) +{ + aesedp_ctx *ctx = (aesedp_ctx *) skey; + int retval; + + if((retval = aesedp_keysize(&keylen)) != CRYPT_OK) return retval; + if((retval = aes_encrypt_key(key, keylen, &ctx->encrypt)) != CRYPT_OK) return CRYPT_ERROR; + if((retval = aes_decrypt_key(key, keylen, &ctx->decrypt)) != CRYPT_OK) return CRYPT_ERROR; + return CRYPT_OK; +} + +#ifdef ZZZNEVER +static int +aesedp_ecb_encrypt(const uint8_t *pt, uint8_t *ct, aesedp_ctx *skey) +{ + aesedp_ctx *ctx = (aesedp_ctx *) skey; + return aes_encrypt(pt, ct, &ctx->encrypt); +} + + + +static int +aesedp_ecb_decrypt(const uint8_t *ct, uint8_t *pt, aesedp_ctx *skey) +{ + return aes_decrypt(ct, pt, &skey->decrypt); +} +#endif + + +static void +aesedp_done(aesedp_ctx *skey __unused) +{ +} + +/** Start XTS mode + @param cipher The index of the cipher to use + @param key1 The encrypt key + @param key2 The tweak encrypt key + @param keylen The length of the keys (each) in octets + @param num_rounds The number of rounds for the cipher (0 == default) + @param xts [out] XTS structure + Returns CRYPT_OK upon success. 
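A hypothetical caller-side sketch of the interface documented above; the keys, buffer contents, and sector tweak are made up, and a real client would derive its keys from protected key material rather than zero-filled arrays:

    #include <stdint.h>
    #include <string.h>
    #include "aesxts.h"

    /* Illustrative only: encrypt one 512-byte sector with XTS-AES-128. */
    static int example_sector_encrypt(void)
    {
        symmetric_xts xts;
        uint8_t key1[16] = { 0 };       /* data encryption key (made up) */
        uint8_t key2[16] = { 0 };       /* tweak encryption key (made up) */
        uint8_t tweak[16] = { 0 };      /* e.g. little-endian sector number */
        uint8_t sector[512], out[512];
        uint32_t serr;
        int err;

        memset(sector, 0xab, sizeof(sector));
        tweak[0] = 42;                  /* pretend this is sector 42 */

        serr = xts_start(0 /* cipher: ignored */, NULL /* IV: ignored */,
                         key1, sizeof(key1), key2, sizeof(key2),
                         0 /* rounds: ignored */, 0 /* options: ignored */,
                         &xts);
        if (serr != 0)
            return -1;

        err = xts_encrypt(sector, sizeof(sector), out, tweak, &xts);
        xts_done(&xts);
        return err;                     /* CRYPT_OK (0) on success */
    }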
+*/ + +uint32_t +xts_start(uint32_t cipher, // ignored - we're doing this for xts-aes only + const uint8_t *IV __unused, // ignored + const uint8_t *key1, int keylen, + const uint8_t *key2, int tweaklen __unused, // both keys are the same size for xts + uint32_t num_rounds, // ignored + uint32_t options __unused, // ignored + symmetric_xts *xts) +{ + uint32_t err; + + /* check inputs */ + if((key1 == NULL)|| (key2 == NULL) || (xts == NULL)) return CRYPT_INVALID_ARG; + + /* schedule the two ciphers */ + if ((err = aesedp_setup(key1, keylen, num_rounds, &xts->key1)) != 0) { + return err; + } + if ((err = aesedp_setup(key2, keylen, num_rounds, &xts->key2)) != 0) { + return err; + } + xts->cipher = cipher; + + return err; +} + + + + +/** multiply by x + @param I The value to multiply by x (LFSR shift) +*/ +#if defined __x86_64__ || defined __i386__ +extern void xts_mult_x(uint8_t *I); +#else +static void xts_mult_x(uint8_t *I) +{ + uint32_t x; + uint8_t t, tt; + + for (x = t = 0; x < 16; x++) { + tt = I[x] >> 7; + I[x] = ((I[x] << 1) | t) & 0xFF; + t = tt; + } + if (tt) { + I[0] ^= 0x87; + } +} +#endif + +#if defined __x86_64__ || defined __i386__ +extern int tweak_crypt(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx); +extern int tweak_crypt_group(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx, uint32_t lim); +#else +static int tweak_crypt(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx) +{ + uint32_t x; + uint32_t err; + + /* tweak encrypt block i */ + for (x = 0; x < 16; x += sizeof(uint64_t)) { + *((uint64_t*)&C[x]) = *((uint64_t*)&P[x]) ^ *((uint64_t*)&T[x]); + } + + if ((err = aes_encrypt(C, C, ctx)) != CRYPT_OK) { + return CRYPT_INVALID_KEYSIZE; + } + + for (x = 0; x < 16; x += sizeof(uint64_t)) { + *((uint64_t*)&C[x]) ^= *((uint64_t*)&T[x]); + } + + /* LFSR the tweak */ + xts_mult_x(T); + + return CRYPT_OK; +} +#endif + +/** XTS Encryption + @param pt [in] Plaintext + @param ptlen Length of plaintext (and ciphertext) + @param ct [out] Ciphertext + @param tweak [in] The 128--bit encryption tweak (e.g. 
sector number) + @param xts The XTS structure + Returns CRYPT_OK upon success +*/ +int xts_encrypt( + const uint8_t *pt, unsigned long ptlen, + uint8_t *ct, + const uint8_t *tweak, + symmetric_xts *xts) +{ + aesedp_encrypt_ctx *encrypt_ctx = &xts->key1.encrypt; + uint8_t PP[16], CC[16], T[16]; + uint32_t i, m, mo, lim; + uint32_t err; + + /* check inputs */ + if((pt == NULL) || (ct == NULL)|| (tweak == NULL) || (xts == NULL)) return 1; + + /* get number of blocks */ + m = ptlen >> 4; + mo = ptlen & 15; + + /* must have at least one full block */ + if (m == 0) { + return CRYPT_INVALID_ARG; + } + + /* encrypt the tweak */ + if ((err = aes_encrypt(tweak, T, &xts->key2.encrypt)) != 0) { + return CRYPT_INVALID_KEYSIZE; + } + + /* for i = 0 to m-2 do */ + if (mo == 0) { + lim = m; + } else { + lim = m - 1; + } + +#if defined __x86_64__ || defined __i386__ + if (lim>0) { + err = tweak_crypt_group(pt, ct, T, encrypt_ctx, lim); + ct += (lim<<4); + pt += (lim<<4); + } +#else + for (i = 0; i < lim; i++) { + err = tweak_crypt(pt, ct, T, encrypt_ctx); + ct += 16; + pt += 16; + } +#endif + + /* if ptlen not divide 16 then */ + if (mo > 0) { + /* CC = tweak encrypt block m-1 */ + if ((err = tweak_crypt(pt, CC, T, encrypt_ctx)) != 0) { + return err; + } + + /* Cm = first ptlen % 16 bytes of CC */ + for (i = 0; i < mo; i++) { + PP[i] = pt[16+i]; + ct[16+i] = CC[i]; + } + + for (; i < 16; i++) { + PP[i] = CC[i]; + } + + /* Cm-1 = Tweak encrypt PP */ + if ((err = tweak_crypt(PP, ct, T, encrypt_ctx)) != 0) { + return err; + } + } + + return err; +} + +#if defined __x86_64__ || defined __i386__ +extern int tweak_uncrypt(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx); +extern int tweak_uncrypt_group(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx, uint32_t lim); +#else +static int tweak_uncrypt(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx) +{ + uint32_t x; + uint32_t err; + + /* tweak encrypt block i */ + for (x = 0; x < 16; x += sizeof(uint64_t)) { + *((uint64_t*)&P[x]) = *((uint64_t*)&C[x]) ^ *((uint64_t*)&T[x]); + } + + err = aes_decrypt(P, P, ctx); + + for (x = 0; x < 16; x += sizeof(uint64_t)) { + *((uint64_t*)&P[x]) ^= *((uint64_t*)&T[x]); + } + + /* LFSR the tweak */ + xts_mult_x(T); + + return err; +} +#endif + +/** XTS Decryption + @param ct [in] Ciphertext + @param ptlen Length of plaintext (and ciphertext) + @param pt [out] Plaintext + @param tweak [in] The 128--bit encryption tweak (e.g. 
sector number) + @param xts The XTS structure + Returns CRYPT_OK upon success +*/ + +int xts_decrypt( + const uint8_t *ct, unsigned long ptlen, + uint8_t *pt, + const uint8_t *tweak, + symmetric_xts *xts) +{ + aesedp_decrypt_ctx *decrypt_ctx = &xts->key1.decrypt; + uint8_t PP[16], CC[16], T[16]; + uint32_t i, m, mo, lim; + uint32_t err; + + /* check inputs */ + if((pt == NULL) || (ct == NULL)|| (tweak == NULL) || (xts == NULL)) return 1; + + /* get number of blocks */ + m = ptlen >> 4; + mo = ptlen & 15; + + /* must have at least one full block */ + if (m == 0) { + return CRYPT_INVALID_ARG; + } + + /* encrypt the tweak , yes - encrypt */ + if ((err = aes_encrypt(tweak, T, &xts->key2.encrypt)) != 0) { + return CRYPT_INVALID_KEYSIZE; + } + + /* for i = 0 to m-2 do */ + if (mo == 0) { + lim = m; + } else { + lim = m - 1; + } + +#if defined __x86_64__ || defined __i386__ + if (lim>0) { + err = tweak_uncrypt_group(ct, pt, T, decrypt_ctx, lim); + ct += (lim<<4); + pt += (lim<<4); + } +#else + for (i = 0; i < lim; i++) { + err = tweak_uncrypt(ct, pt, T, decrypt_ctx); + ct += 16; + pt += 16; + } +#endif + + /* if ptlen not divide 16 then */ + if (mo > 0) { + memcpy(CC, T, 16); + xts_mult_x(CC); + + /* PP = tweak decrypt block m-1 */ + if ((err = tweak_uncrypt(ct, PP, CC, decrypt_ctx)) != CRYPT_OK) { + return err; + } + + /* Pm = first ptlen % 16 bytes of PP */ + for (i = 0; i < mo; i++) { + CC[i] = ct[16+i]; + pt[16+i] = PP[i]; + } + for (; i < 16; i++) { + CC[i] = PP[i]; + } + + /* Pm-1 = Tweak uncrypt CC */ + if ((err = tweak_uncrypt(CC, pt, T, decrypt_ctx)) != CRYPT_OK) { + return err; + } + } + + return CRYPT_OK; +} + + + +void xts_done(symmetric_xts *xts) +{ + if(xts == NULL) return; + aesedp_done(&xts->key1); + aesedp_done(&xts->key2); +} + diff --git a/bsd/crypto/aes/i386/aesxts.h b/bsd/crypto/aes/i386/aesxts.h new file mode 100644 index 000000000..fe7618066 --- /dev/null +++ b/bsd/crypto/aes/i386/aesxts.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2010 Apple Inc. All Rights Reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_LICENSE_HEADER_END@ + */ + +/* + * aesxts.h + * + * + */ + +#include "stdint.h" + + +#ifndef _AESXTS_H +#define _AESXTS_H + +#if defined(__cplusplus) +extern "C" +{ +#endif + +/* + * The context for XTS-AES + */ + + +#define KS_LENGTH 60 + +typedef struct { + uint32_t ks[KS_LENGTH]; + uint32_t rn; +} aesedp_encrypt_ctx; + +typedef struct { + uint32_t ks[KS_LENGTH]; + uint32_t rn; +} aesedp_decrypt_ctx; + +typedef struct { + aesedp_decrypt_ctx decrypt; + aesedp_encrypt_ctx encrypt; +} aesedp_ctx; + +// xts mode context + +typedef struct { + aesedp_ctx key1, key2; + uint32_t cipher; // ignore - this is to fit with the library, but in this case we're only using aes +} symmetric_xts; + + +/* + * These are the interfaces required for XTS-AES support + */ + +uint32_t +xts_start(uint32_t cipher, // ignored - we're doing this for xts-aes only + const uint8_t *IV, // ignored + const uint8_t *key1, int keylen, + const uint8_t *key2, int tweaklen, // both keys are the same size for xts + uint32_t num_rounds, // ignored + uint32_t options, // ignored + symmetric_xts *xts); + +int xts_encrypt( + const uint8_t *pt, unsigned long ptlen, + uint8_t *ct, + const uint8_t *tweak, // this can be considered the sector IV for this use + symmetric_xts *xts); + +int xts_decrypt( + const uint8_t *ct, unsigned long ptlen, + uint8_t *pt, + const uint8_t *tweak, // this can be considered the sector IV for this use + symmetric_xts *xts); + + +void xts_done(symmetric_xts *xts); + +#if defined(__cplusplus) +} +#endif + +#endif /* _AESXTS_H */ \ No newline at end of file diff --git a/bsd/crypto/aes/i386/aesxts_asm.s b/bsd/crypto/aes/i386/aesxts_asm.s new file mode 100644 index 000000000..ec6b924b7 --- /dev/null +++ b/bsd/crypto/aes/i386/aesxts_asm.s @@ -0,0 +1,1305 @@ +/* + This file "aesxts.s" provides x86_64 / i386 optimization of the following functions + + 0. xts_mult_x_on_xmm7 : a code macro that is used throughout all other functions + 1. void xts_mult_x(uint8_t *I); + 2. int tweak_crypt(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx); + 3. int tweak_crypt_group(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx, uint32_t lim); + 4. int tweak_uncrypt(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx); + 5. int tweak_uncrypt_group(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx, uint32_t lim); + + This file should be compiled together with xtsClearC.c + + functions 1,2,4 are supposed to replace the C functions in xtsClearC.c for x86_64/i386 architectures + functions 3,5 are only given here, no C code is available, they are called in xts_encrypt/xts_decrypt (xtsClearC.c) + - we can possibly add C code for functions 3 and 5 for future porting to other architectures + + cclee 4-29-10 + +*/ + +#ifdef KERNEL +#include <i386/cpu_capabilities.h> +#else +#include <System/i386/cpu_capabilities.h> +#endif +#define CRYPT_OK 0 // can not include "crypt.h" in which CRYPT_OK is from enum + +/* + The following macro is used throughout the functions in this file. + It is the core function within the function xts_mult_x defined in (xtsClearC.c) + + upon entry, %xmm7 = the input tweak (128-bit), + on return, %xmm7 = the updated tweak (128-bit) + the macro uses %xmm1/%xmm2/%ecx in the computation + the operation can be described as follows : + 0. let x = %xmm7; // 128-bit little-endian input + 1. x = rotate_left(x,1); // rotate left by 1 -bit + 2. if (x&1) x ^= 0x0000...0086; // if least significant bit = 1, least significant byte ^= 0x86; + 3. 
return x; + + It's a pity that SSE does not support shifting of the whole 128-bit xmm registers. + The workaround is + 1. using parallel dual quad (8-byte) shifting, 1 for the 2 bottom 63-bits, 1 for the 2 leading bits + 2. manipulating the shifted quad words to form the 128-bit shifted result. + + Input : %xmm7 + Output : %xmm7 + Used : %xmm1/%xmm2/%ecx + + The macro is good for both x86_64 and i386. + +*/ + + .macro xts_mult_x_on_xmm7 // input : x = %xmm7, MS = most significant, LS = least significant + movaps %xmm7, %xmm1 // %xmm1 = a copy of x + movaps %xmm7, %xmm2 // %xmm2 = a copy of x + psllq $$1, %xmm7 // 1-bit left shift of 2 quad words (x1<<1, x0<<1), zero-filled + psrlq $$63, %xmm1 // 2 leading bits, each in the least significant bit of a quad word + psrad $$31, %xmm2 // the MS 32-bit will be either 0 or -1, depending on the MS bit of x + pshufd $$0xc6, %xmm1, %xmm1 // switch the positions of the 2 leading bits + pshufd $$0x03, %xmm2, %xmm2 // the LS 32-bit will be either 0 or -1, depending on the MS bit of x + por %xmm1, %xmm7 // we finally have %xmm7 = rotate_left(x,1); + movl $$0x86, %ecx // a potential byte to xor the bottom byte + movd %ecx, %xmm1 // copy it to %xmm1, the other is 0 + pand %xmm2, %xmm1 // %xmm1 = 0 or 0x86, depending on the MS bit of x + pxor %xmm1, %xmm7 // rotate_left(x,1) ^= 0 or 0x86 depending on the MS bit of x + .endm + + +/* + function : void xts_mult_x(uint8_t *I); + + 1. load (__m128*) (I) into xmm7 + 2. macro xts_mult_x_on_xmm7 (i/o @ xmm7, used xmm1/xmm2/ecx) + 3. save output (%xmm7) to memory pointed by I + + input : 16-byte memory pointed by I + output : same 16-byte memory pointed by I + + if kernel code, xmm1/xmm2/xmm7 saved and restored + other used registers : eax/ecx + + */ + .text + .align 4,0x90 + .globl _xts_mult_x +_xts_mult_x: + +#if defined __x86_64__ + #define I %rdi // 1st argument at %rdi for x86_64 + #define sp %rsp +#else + mov 4(%esp), %eax // 1st argument at stack, offset 4 for ret_addr for i386 + #define I %eax + #define sp %esp +#endif + + // if KERNEL code, allocate memory and save xmm1/xmm2/xmm7 +#ifdef KERNEL +#if defined __x86_64__ + sub $0x38, sp // 8-bytes alignment + 3 * 16 bytes +#else + sub $0x3c, sp // 12-bytes alignment + 3 * 16 bytes +#endif + movaps %xmm1, (sp) + movaps %xmm2, 16(sp) + movaps %xmm7, 32(sp) +#endif + + // load, compute, and save + movups (I), %xmm7 // load input tweak 128-bit into %xmm7 + xts_mult_x_on_xmm7 // the macro (also used elsewhere) will update %xmm7 as the output + movups %xmm7, (I) // save the xts_mult_x output + + // if KERNEL code, restore xmm1/xmm2/xmm7 and deallocate stack memory +#ifdef KERNEL + movaps (sp), %xmm1 + movaps 16(sp), %xmm2 + movaps 32(sp), %xmm7 +#if defined __x86_64__ + add $0x38, sp // 8-bytes alignment + 3 * 16 bytes +#else + add $0x3c, sp // 12-bytes alignment + 3 * 16 bytes +#endif +#endif + + ret // return + + #undef I + #undef sp + +/* + The following is x86_64/i386 assembly implementation of + + int tweak_crypt(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx); + + Its C code implementation is given in xtsClearC.c + + all pointers P/C/T point to a block of 16 bytes. In the following description, P/C/T represent 128-bit data. + + The operation of tweak_crypt + + 1. C = P ^ T + 2. err = aes_encrypt(C, C, ctx); if (err != CRYPT_OK) return err; + 3. C = C ^ T + 4. xts_mult_x(T) + 5. return CRYPT_OK; + + The following is the assembly implementation flow + + 1. save used xmm registers (xmm1/xmm7) if kernel code + 2. load xmm1 = P, xmm7 = T + 3.
xmm1 = C = P ^ T + 4. write xmm1 to C + 5. call aes_encryp(C,C,ctx); note that it will use aesni if available, also xmm will return intact + 6. load xmm1 = C + 7. xmm1 = C = C^T = xmm1 ^ xmm7 + 8. write xmm1 to C + 9. update T (in xmm7) via xts_mult_x macro + a. restore xmm registers (xmm1/xmm7) if kernel code + b. return CRYPT_OK (in eax) + + Note: used xmm registers : xmm1/xmm2/xmm7, xmm2 in xts_mult_x macro + +*/ + + .text + .align 4,0x90 + .globl _tweak_crypt +_tweak_crypt: +#if defined __i386__ + + // push into stack for local use + push %ebp + mov %esp, %ebp + push %ebx + push %edi + push %esi + + // alllocate stack memory for local use + sub $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments) + + // load with called arguments + mov 8(%ebp), %eax // P, we need this only briefly, so eax is fine + mov 12(%ebp), %edi // C + mov 16(%ebp), %ebx // T + mov 20(%ebp), %esi // ctx + + #define P %eax + #define C %edi + #define T %ebx + #define ctx %esi + #define sp %esp + +#else + // x86_64 calling argument order : rdi/rsi/rdx/rcx/r8 + + // push into stack for local use + push %rbp + mov %rsp, %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + // alllocate stack memory for local use, if kernel code, need to save/restore xmm registers +#ifdef KERNEL + sub $4*16, %rsp // only need 3*16, add 16 extra so to make save/restore xmm common to i386 +#endif + + // load with called arguments, release rdi/rsi/rdx/rcx/r8, as need to call aes_encrypt + mov %rsi, %r13 + mov %rdx, %r14 + mov %rcx, %r15 + + #define P %rdi + #define C %r13 + #define T %r14 + #define ctx %r15 + #define sp %rsp + +#endif + + // if kernel, save used xmm registers +#ifdef KERNEL + movaps %xmm1, 16(sp) + movaps %xmm2, 32(sp) + movaps %xmm7, 48(sp) +#endif + + movups (P), %xmm1 // P + movups (T), %xmm7 // T + + // setup caliing arguments for aes_encrypt +#if defined __i386__ + mov C, (%esp) // C + mov C, 4(%esp) // C + mov ctx, 8(%esp) // ctx +#else + mov C, %rdi // C + mov C, %rsi // C + mov ctx, %rdx // ctx +#endif + + pxor %xmm7, %xmm1 // C = P ^ T + movups %xmm1, (C) // save C into memory + + call _aes_encrypt // err = aes_encrypt(C,C,ctx); + + cmp $CRYPT_OK, %eax // check err == CRYPT_OK + jne 9f // if err != CRYPT_OK, exit + + movups (C), %xmm1 // load xmm1 = C + pxor %xmm7, %xmm1 // C ^= T + movups %xmm1, (C) // write C with xmm1, xmm1 is freed now, will be changed in the following macro + + xts_mult_x_on_xmm7 // update T (on xmm7) + + movups %xmm7, (T) // write xmm7 to T +9: + + // restore used xmm registers if this is for kernel +#ifdef KERNEL + movaps 16(sp), %xmm1 + movaps 32(sp), %xmm2 + movaps 48(sp), %xmm7 +#endif + + // free stack memory and restore callee registers +#if defined __i386__ + add $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments) + pop %esi + pop %edi + pop %ebx +#else +#ifdef KERNEL + add $4*16, %rsp // only need 3*16, add 16 extra so make save/restore xmm common to i386 +#endif + pop %r15 + pop %r14 + pop %r13 + pop %r12 +#endif + + // return, eax/rax already has the return val + leave + ret + + #undef P + #undef C + #undef T + #undef ctx + #undef sp + +/* + The following is x86_64/i386 assembly implementation of + + int tweak_crypt_group(const uint8_t *P, uint8_t *C, uint8_t *T, aesedp_encrypt_ctx *ctx, uint32_t lim); + + TODO : Its C code implementation is YET to be provided in xtsClearC.c (for the benefit of porting to other ISAs) + This function is grouped version of the above function tweak_crypt(), so xmm 
registers save/restore only need + to happen once for all grouped blocks. + + The implementation here probes __cpu_capabilities to detect whether aesni (or hw-aes instruction) is available. + If aesni is available, the code branch to optimized code that uses aesni. + + The optimized aesni code operates as follows: + + while (more than 4 consecutive blocks available) { + + do xts_mult_x macro 4 times and write the 4 tweaks on stack (16-byte aligned) + + perform 4 C = P ^ T; // T is on 16-byte aligned stack + + perform 4 aes_encrypt (all aes_encrypt instruction interleaved to achieve better throughtput) + + perform 4 C = C ^ T // T is on 16-byte aligned stack + + } + + The code then falls through to the scalar code, that sequentially performs what tweak_crypt does + + 1. C = P ^ T + 2. err = aes_encryp(C, C, ctx); if (err != CRYPT_OK) return err; + 3. C = C ^ T + 4. xts_mult_x(T) + + Note: used xmm registers : + xmm0-xmm5, xmm7 if aesni is available + xmm0-xmm4, xmm7 if aesni is not available. + +*/ + + .text + .align 4,0x90 + .globl _tweak_crypt_group +_tweak_crypt_group: + +#if defined __i386__ + + // push callee-saved registers for local use + push %ebp + mov %esp, %ebp + push %ebx + push %edi + push %esi + + // allocate stack memory for local use and/or xmm register save for kernel code + sub $(12+8*16+16*4), %esp // 12 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) aesni + // 12 (alignment) + 8*16 (xmm) + 4*16 (only 12 used for aes_encrypt) no aesni + // transfer calling arguments + mov 20(%ebp), %eax // ctx + mov 12(%ebp), %edi // C + mov 16(%ebp), %ebx // T + mov 8(%ebp), %esi // P + mov %eax, 8(%esp) // ctx as the 3rd parameter to aes_decrypt + + #define P %esi + #define C %edi + #define T %ebx + #define lim 24(%ebp) + #define sp %esp + +#else + + // push callee-saved registers for local use + push %rbp + mov %rsp, %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + // allocate stack memory for local use and/or xmm register save for kernel code + sub $(8+8*16+16*5), %rsp // 8 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) + 16 (common to i386) + + // rdi/rsi/rdx/rcx/r8 + // transfer calling arguments + mov %rdi, %r12 + mov %rsi, %r13 + mov %rdx, %r14 + mov %rcx, %r15 + mov %r8, %rbx + + #define P %r12 + #define C %r13 + #define T %r14 + #define ctx %r15 + #define lim %ebx + #define sp %rsp +#endif + +#ifdef KERNEL + movaps %xmm0, 0x50(sp) + movaps %xmm1, 0x60(sp) + movaps %xmm2, 0x70(sp) + movaps %xmm3, 0x80(sp) + movaps %xmm4, 0x90(sp) + movaps %xmm7, 0xa0(sp) +#endif + + // probe __cpu_capabilities to detect aesni +#if defined __x86_64__ + movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities + mov (%rax), %eax // %eax = __cpu_capabilities +#else // i386 +#if defined KERNEL + leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities + mov (%eax), %eax // %eax = __cpu_capabilities +#else + movl _COMM_PAGE_CPU_CAPABILITIES, %eax +#endif +#endif + test $(kHasAES), %eax + je L_crypt_group_sw // if aesni not available, jump to sw-based implementation + + // aesni-based implementation + + sub $4, lim // pre-decrement lim by 4 + jl 9f // if lim < 4, skip the following code + + movups (T), %xmm7 // xmm7 is the tweak before encrypting every 4 blocks +#ifdef KERNEL + movaps %xmm5, 0xb0(sp) // hw-aes-based uses extra xmm5 +#endif + +0: + // derive 4 tweaks using xts_mult_x macro, and save on aligned stack space + // xmm7 will be the tweak for next 4-blocks iteration + + #define tweak1 16(sp) + #define tweak2 32(sp) + #define tweak3 48(sp) 
+ #define tweak4 64(sp) + + movaps %xmm7, tweak1 // save 1st tweak on stack + xts_mult_x_on_xmm7 // compute 2nd tweak + movaps %xmm7, tweak2 // save 2nd tweak on stack + xts_mult_x_on_xmm7 // compute 3rd tweak + movaps %xmm7, tweak3 // save 3rd tweak on stack + xts_mult_x_on_xmm7 // compute 4th tweak + movaps %xmm7, tweak4 // save 4th tweak on stack + xts_mult_x_on_xmm7 // compute 1st tweak for next iteration + + // read 4 Ps + movups (P), %xmm0 + movups 16(P), %xmm1 + movups 32(P), %xmm2 + movups 48(P), %xmm3 + + // 4 C = P ^ T + pxor tweak1, %xmm0 + pxor tweak2, %xmm1 + pxor tweak3, %xmm2 + pxor tweak4, %xmm3 + + // 4 interleaved aes_encrypt + +#if defined __i386__ + mov 8(sp), %ecx // ctx + #undef ctx + #define ctx %ecx +#endif + + mov 240(ctx), %eax // aes length + + cmp $160, %eax // AES-128 ? + je 160f + cmp $192, %eax // AES-192 ? + je 192f + cmp $224, %eax // AES-256 ? + je 224f + mov $-1, %eax // error : non-supported aes length +#ifdef KERNEL + movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5 +#endif + jmp L_error_crypt + + // definitions, macros, and constructs for 4 blocks hw-aes-encrypt + + // the following key definitions will also be used in tweak_uncrypt_group + #define key0 0(ctx) + #define key1 16(ctx) + #define key2 32(ctx) + #define key3 48(ctx) + #define key4 64(ctx) + #define key5 80(ctx) + #define key6 96(ctx) + #define key7 112(ctx) + #define key8 128(ctx) + #define key9 144(ctx) + #define keyA 160(ctx) + #define keyB 176(ctx) + #define keyC 192(ctx) + #define keyD 208(ctx) + #define keyE 224(ctx) + + #define aes aesenc + #define aeslast aesenclast + + // all aes encrypt operations start with the following sequence + .macro aes_common_part + movups key0, %xmm4 + movups key1, %xmm5 + pxor %xmm4, %xmm0 + pxor %xmm4, %xmm1 + pxor %xmm4, %xmm2 + pxor %xmm4, %xmm3 + movups key2, %xmm4 + aes %xmm5, %xmm0 + aes %xmm5, %xmm1 + aes %xmm5, %xmm2 + aes %xmm5, %xmm3 + movups key3, %xmm5 + aes %xmm4, %xmm0 + aes %xmm4, %xmm1 + aes %xmm4, %xmm2 + aes %xmm4, %xmm3 + movups key4, %xmm4 + aes %xmm5, %xmm0 + aes %xmm5, %xmm1 + aes %xmm5, %xmm2 + aes %xmm5, %xmm3 + movups key5, %xmm5 + aes %xmm4, %xmm0 + aes %xmm4, %xmm1 + aes %xmm4, %xmm2 + aes %xmm4, %xmm3 + movups key6, %xmm4 + aes %xmm5, %xmm0 + aes %xmm5, %xmm1 + aes %xmm5, %xmm2 + aes %xmm5, %xmm3 + movups key7, %xmm5 + aes %xmm4, %xmm0 + aes %xmm4, %xmm1 + aes %xmm4, %xmm2 + aes %xmm4, %xmm3 + movups key8, %xmm4 + aes %xmm5, %xmm0 + aes %xmm5, %xmm1 + aes %xmm5, %xmm2 + aes %xmm5, %xmm3 + movups key9, %xmm5 + aes %xmm4, %xmm0 + aes %xmm4, %xmm1 + aes %xmm4, %xmm2 + aes %xmm4, %xmm3 + movups keyA, %xmm4 + aes %xmm5, %xmm0 + aes %xmm5, %xmm1 + aes %xmm5, %xmm2 + aes %xmm5, %xmm3 + .endm + + // all aes encypt operations end with the following 4 instructions + .macro aes_last + aeslast %xmm4, %xmm0 + aeslast %xmm4, %xmm1 + aeslast %xmm4, %xmm2 + aeslast %xmm4, %xmm3 + .endm + + .macro aes_128 + aes_common_part // encrypt common part + aes_last // encrypt ending part + .endm + + .macro aes_192 + aes_common_part // encrypt common part + + // 10 extra instructions in between common and ending + movups keyB, %xmm5 + aes %xmm4, %xmm0 + aes %xmm4, %xmm1 + aes %xmm4, %xmm2 + aes %xmm4, %xmm3 + movups keyC, %xmm4 + aes %xmm5, %xmm0 + aes %xmm5, %xmm1 + aes %xmm5, %xmm2 + aes %xmm5, %xmm3 + + aes_last // encrypt ending part + .endm + + .macro aes_256 + aes_common_part // encrypt common part + + // 20 extra instructions in between common and ending + movups keyB, %xmm5 + aes %xmm4, %xmm0 + aes %xmm4, %xmm1 + aes %xmm4, %xmm2 + aes 
+ movups keyC, %xmm4
+ aes %xmm5, %xmm0
+ aes %xmm5, %xmm1
+ aes %xmm5, %xmm2
+ aes %xmm5, %xmm3
+ movups keyD, %xmm5
+ aes %xmm4, %xmm0
+ aes %xmm4, %xmm1
+ aes %xmm4, %xmm2
+ aes %xmm4, %xmm3
+ movups keyE, %xmm4
+ aes %xmm5, %xmm0
+ aes %xmm5, %xmm1
+ aes %xmm5, %xmm2
+ aes %xmm5, %xmm3
+
+ aes_last // encrypt ending part
+ .endm
+
+160: // AES-128 encrypt
+ aes_128
+ jmp 8f
+
+192: // AES-192 encrypt
+ aes_192
+ jmp 8f
+
+224: // AES-256 encrypt
+ aes_256
+
+8:
+
+ // 4 C = C ^ T
+ pxor tweak1, %xmm0
+ pxor tweak2, %xmm1
+ pxor tweak3, %xmm2
+ pxor tweak4, %xmm3
+
+ // write 4 Cs
+ movups %xmm0, (C)
+ movups %xmm1, 16(C)
+ movups %xmm2, 32(C)
+ movups %xmm3, 48(C)
+
+ add $64, P
+ add $64, C
+
+ sub $4, lim
+ jge 0b
+
+#ifdef KERNEL
+ movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5
+#endif
+ movups %xmm7, (T)
+
+9:
+ xor %eax, %eax // to return CRYPT_OK
+ add $4, lim // post-increment lim by 4
+ je 9f // if lim==0, branch to prepare to return
+
+L_crypt_group_sw:
+
+ movups (T), %xmm7 // T, xmm7 will be used as T (128-bit) throughout the loop
+
+ sub $1, lim // pre-decrement lim by 1
+ jl 1f // if lim < 1, branch to prepare to return
+0:
+ movups (P), %xmm0 // P
+
+ // prepare for calling aes_encrypt
+#if defined __i386__
+ mov C, (%esp) // C
+ mov C, 4(%esp) // C
+ // ctx was prepared previously in preamble
+#else
+ mov C, %rdi // C
+ mov C, %rsi // C
+ mov ctx, %rdx // ctx
+#endif
+
+ pxor %xmm7, %xmm0 // C = P ^ T
+ movups %xmm0, (C) // save C into memory
+
+ call _aes_encrypt_xmm_no_save // err = aes_encrypt(C,C,ctx);
+
+ cmp $CRYPT_OK, %eax // err == CRYPT_OK ?
+ jne 9f // if err != CRYPT_OK, branch to exit with error
+
+ movups (C), %xmm0 // load xmm0 with C
+ pxor %xmm7, %xmm0 // C ^= T
+ movups %xmm0, (C) // save output C
+
+ xts_mult_x_on_xmm7
+
+ add $16, C // next C
+ add $16, P // next P
+ sub $1, lim // lim--
+ jge 0b // if (lim >= 0) repeat the scalar loop
+
+1: movups %xmm7, (T) // save final tweak
+L_error_crypt:
+9:
+ // if kernel, restore used xmm registers
+#ifdef KERNEL
+ movaps 0x50(sp), %xmm0
+ movaps 0x60(sp), %xmm1
+ movaps 0x70(sp), %xmm2
+ movaps 0x80(sp), %xmm3
+ movaps 0x90(sp), %xmm4
+ movaps 0xa0(sp), %xmm7
+#endif
+
+#if defined __i386__
+ add $(12+16*8+16*4), %esp
+ pop %esi
+ pop %edi
+ pop %ebx
+#else
+ add $(8+16*8+16*5), %rsp
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+#endif
+ leave
+ ret
+
+ #undef P
+ #undef C
+ #undef T
+ #undef ctx
+ #undef sp
+
+/*
+ The following is the x86_64/i386 assembly implementation of
+
+ int tweak_uncrypt(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx);
+
+ Its C code implementation is given in xtsClearC.c
+
+ All pointers C/P/T point to a block of 16 bytes. In the following description, C/P/T represent 128-bit data.
+
+ The operation of tweak_uncrypt
+
+ 1. P = C ^ T
+ 2. err = aes_decrypt(P, P, ctx); if (err != CRYPT_OK) return err;
+ 3. P = P ^ T
+ 4. xts_mult_x(T)
+ 5. return CRYPT_OK;
+
+ The following is the assembly implementation flow
+
+ 1. save used xmm registers (xmm1/xmm7) if kernel code
+ 2. load xmm1 = C, xmm7 = T
+ 3. xmm1 = P = C ^ T
+ 4. write xmm1 to P
+ 5. call aes_decrypt(P,P,ctx); note that it will use aesni if available, and the xmm registers return intact
+ 6. load xmm1 = P
+ 7. xmm1 = P = P^T = xmm1 ^ xmm7
+ 8. write xmm1 to P
+ 9. update T (in xmm7) via xts_mult_x macro
+ a. restore xmm registers (xmm1/xmm7) if kernel code
+ b. return CRYPT_OK (in eax)
+
+ Note: used xmm registers : xmm1/xmm2/xmm7, xmm2 in xts_mult_x macro
+
+*/
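+
+/*
+ For porting reference, a plain-C sketch of the two operations just described.
+ This is an illustration written from the pseudocode in this file, not the
+ code shipped in xtsClearC.c; it assumes <stdint.h> plus this project's aes
+ headers, so that aes_decrypt, aesedp_decrypt_ctx and CRYPT_OK match their
+ declarations elsewhere.
+
+ // T = T * alpha in GF(2^128): shift the 128-bit tweak left by one bit
+ // (bytes taken least-significant first, per IEEE P1619) and fold the carry
+ // back in with the reduction polynomial x^128 + x^7 + x^2 + x + 1 (0x87).
+ // This is what the xts_mult_x_on_xmm7 macro computes in an xmm register.
+ static void xts_mult_x(uint8_t *T)
+ {
+     unsigned carry = 0, msb, i;
+     for (i = 0; i < 16; i++) {
+         msb = T[i] >> 7;                        // bit shifted out of this byte
+         T[i] = (uint8_t)((T[i] << 1) | carry);
+         carry = msb;
+     }
+     if (carry) T[0] ^= 0x87;                    // modular reduction
+ }
+
+ // steps 1-5 of tweak_uncrypt as listed above
+ int tweak_uncrypt(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx)
+ {
+     int i, err;
+     for (i = 0; i < 16; i++) P[i] = C[i] ^ T[i];    // 1. P = C ^ T
+     err = aes_decrypt(P, P, ctx);                   // 2. decrypt in place
+     if (err != CRYPT_OK) return err;                //    propagate error
+     for (i = 0; i < 16; i++) P[i] ^= T[i];          // 3. P = P ^ T
+     xts_mult_x(T);                                  // 4. advance the tweak
+     return CRYPT_OK;                                // 5.
+ }
+*/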
+
+ .text
+ .align 4,0x90
+ .globl _tweak_uncrypt
+_tweak_uncrypt:
+#if defined __i386__
+
+ // push into stack for local use
+ push %ebp
+ mov %esp, %ebp
+ push %ebx
+ push %edi
+ push %esi
+
+ // allocate stack memory for local use
+ sub $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments)
+
+ // load the calling arguments
+ mov 8(%ebp), %eax // C, we need this only briefly, so eax is fine
+ mov 12(%ebp), %edi // P
+ mov 16(%ebp), %ebx // T
+ mov 20(%ebp), %esi // ctx
+
+ #define C %eax
+ #define P %edi
+ #define T %ebx
+ #define ctx %esi
+ #define sp %esp
+
+#else
+ // x86_64 calling argument order : rdi/rsi/rdx/rcx/r8
+
+ // push into stack for local use
+ push %rbp
+ mov %rsp, %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ // allocate stack memory for local use, if kernel code, need to save/restore xmm registers
+#ifdef KERNEL
+ sub $4*16, %rsp // only need 3*16, add 16 extra so as to make the xmm save/restore common to i386
+#endif
+
+ // load the calling arguments, release rdi/rsi/rdx/rcx/r8, as we need to call aes_decrypt
+ mov %rsi, %r13
+ mov %rdx, %r14
+ mov %rcx, %r15
+
+ #define C %rdi
+ #define P %r13
+ #define T %r14
+ #define ctx %r15
+ #define sp %rsp
+
+#endif
+
+ // if kernel, save used xmm registers
+#ifdef KERNEL
+ movaps %xmm1, 16(sp)
+ movaps %xmm2, 32(sp)
+ movaps %xmm7, 48(sp)
+#endif
+
+ movups (C), %xmm1 // C
+ movups (T), %xmm7 // T
+
+ // set up calling arguments for aes_decrypt
+#if defined __i386__
+ mov P, (%esp) // P
+ mov P, 4(%esp) // P
+ mov ctx, 8(%esp) // ctx
+#else
+ mov P, %rdi // P
+ mov P, %rsi // P
+ mov ctx, %rdx // ctx
+#endif
+
+ pxor %xmm7, %xmm1 // P = C ^ T
+ movups %xmm1, (P) // save P into memory
+
+ call _aes_decrypt // err = aes_decrypt(P,P,ctx);
+
+ cmp $CRYPT_OK, %eax // check err == CRYPT_OK
+ jne 9f // if err != CRYPT_OK, exit
+
+ movups (P), %xmm1 // load xmm1 = P
+ pxor %xmm7, %xmm1 // P ^= T
+ movups %xmm1, (P) // write P with xmm1, xmm1 is freed now, will be changed in the following macro
+
+ xts_mult_x_on_xmm7 // update T (on xmm7)
+
+ movups %xmm7, (T) // write xmm7 to T
+9:
+
+ // restore used xmm registers if this is for kernel
+#ifdef KERNEL
+ movaps 16(sp), %xmm1
+ movaps 32(sp), %xmm2
+ movaps 48(sp), %xmm7
+#endif
+
+ // free stack memory and restore callee registers
+#if defined __i386__
+ add $12+16*4, %esp // 12 (alignment) + 3*16 (xmm save/restore) + 16 (aes_crypt calling arguments)
+ pop %esi
+ pop %edi
+ pop %ebx
+#else
+#ifdef KERNEL
+ add $4*16, %rsp // only need 3*16, add 16 extra so as to make the xmm save/restore common to i386
+#endif
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+#endif
+
+ // return, eax/rax already has the return val
+ leave
+ ret
+
+ #undef P
+ #undef C
+ #undef T
+ #undef ctx
+ #undef sp
+
+/*
+ The following is the x86_64/i386 assembly implementation of
+
+ int tweak_uncrypt_group(const uint8_t *C, uint8_t *P, uint8_t *T, aesedp_decrypt_ctx *ctx, uint32_t lim);
+
+ TODO : Its C code implementation is YET to be provided in xtsClearC.c (for the benefit of porting to other ISAs)
+ This function is the grouped version of the above function tweak_uncrypt(), so xmm registers save/restore only need
+ to happen once for all grouped blocks.
+
+ The implementation here probes __cpu_capabilities to detect whether aesni (or hw-aes instruction) is available.
+ If aesni is available, the code branches to optimized code that uses aesni.
+
+ The optimized aesni code operates as follows:
+
+ while (more than 4 consecutive blocks available) {
+
+ do xts_mult_x macro 4 times and write the 4 tweaks on stack (16-byte aligned)
+
+ perform 4 P = C ^ T; // T is on 16-byte aligned stack
+
+ perform 4 aes_decrypt (all aes_decrypt instructions interleaved to achieve better throughput)
+
+ perform 4 P = P ^ T // T is on 16-byte aligned stack
+
+ }
+
+ The code then falls through to the scalar code, which sequentially performs what tweak_uncrypt does
+
+ 1. P = C ^ T
+ 2. err = aes_decrypt(P, P, ctx); if (err != CRYPT_OK) return err;
+ 3. P = P ^ T
+ 4. xts_mult_x(T)
+
+ Note: used xmm registers :
+ xmm0-xmm5, xmm7 if aesni is available
+ xmm0-xmm4, xmm7 if aesni is not available.
+
+*/
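+
+/*
+ A possible plain-C equivalent for the TODO above, as a sketch for porting
+ rather than tested or shipped code. It reuses the xts_mult_x and
+ tweak_uncrypt sketches given before _tweak_uncrypt. A portable C version
+ has no need for the 4-block grouping, which only serves the interleaved
+ hw-aes scheduling below, so it can simply run the scalar loop:
+
+ int tweak_uncrypt_group(const uint8_t *C, uint8_t *P, uint8_t *T,
+                         aesedp_decrypt_ctx *ctx, uint32_t lim)
+ {
+     int err;
+     while (lim--) {                      // lim = number of 16-byte blocks
+         err = tweak_uncrypt(C, P, T, ctx);
+         if (err != CRYPT_OK) return err; // stop on the first failure
+         C += 16;                         // next ciphertext block
+         P += 16;                         // next plaintext block
+     }                                    // T is advanced inside tweak_uncrypt
+     return CRYPT_OK;
+ }
+*/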
+
+ .text
+ .align 4,0x90
+ .globl _tweak_uncrypt_group
+_tweak_uncrypt_group:
+
+#if defined __i386__
+
+ // push callee-saved registers for local use
+ push %ebp
+ mov %esp, %ebp
+ push %ebx
+ push %edi
+ push %esi
+
+ // allocate stack memory for local use and/or xmm register save for kernel code
+ sub $(12+8*16+16*4), %esp // 12 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) aesni
+ // 12 (alignment) + 8*16 (xmm) + 4*16 (only 12 used for aes_decrypt) no aesni
+ // transfer calling arguments
+ mov 20(%ebp), %eax // ctx
+ mov 12(%ebp), %edi // P
+ mov 16(%ebp), %ebx // T
+ mov 8(%ebp), %esi // C
+ mov %eax, 8(%esp) // ctx as the 3rd parameter to aes_decrypt
+
+ #define C %esi
+ #define P %edi
+ #define T %ebx
+ #define lim 24(%ebp)
+ #define sp %esp
+
+#else
+
+ // push callee-saved registers for local use
+ push %rbp
+ mov %rsp, %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ // allocate stack memory for local use and/or xmm register save for kernel code
+ sub $(8+8*16+16*5), %rsp // 8 (alignment) + 8*16 (xmm) + 4*16 (pre-computed tweaks) + 16 (common to i386)
+
+ // rdi/rsi/rdx/rcx/r8
+ // transfer calling arguments
+ mov %rdi, %r12
+ mov %rsi, %r13
+ mov %rdx, %r14
+ mov %rcx, %r15
+ mov %r8, %rbx
+
+ #define C %r12
+ #define P %r13
+ #define T %r14
+ #define ctx %r15
+ #define lim %ebx
+ #define sp %rsp
+#endif
+
+#ifdef KERNEL
+ movaps %xmm0, 0x50(sp)
+ movaps %xmm1, 0x60(sp)
+ movaps %xmm2, 0x70(sp)
+ movaps %xmm3, 0x80(sp)
+ movaps %xmm4, 0x90(sp)
+ movaps %xmm7, 0xa0(sp)
+#endif
+
+ // probe __cpu_capabilities to detect aesni
+#if defined __x86_64__
+ movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
+ mov (%rax), %eax // %eax = __cpu_capabilities
+#else // i386
+#if defined KERNEL
+ leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
+ mov (%eax), %eax // %eax = __cpu_capabilities
+#else
+ movl _COMM_PAGE_CPU_CAPABILITIES, %eax
+#endif
+#endif
+ test $(kHasAES), %eax
+ je L_uncrypt_group_sw // if aesni not available, jump to sw-based implementation
+
+ // aesni-based implementation
+
+ sub $4, lim // pre-decrement lim by 4
+ jl 9f // if lim < 4, skip the following code
+
+ movups (T), %xmm7 // xmm7 is the tweak before decrypting every 4 blocks
+#ifdef KERNEL
+ movaps %xmm5, 0xb0(sp) // hw-aes-based uses extra xmm5
+#endif
+
+0:
+ // derive 4 tweaks using xts_mult_x macro, and save on aligned stack space
+ // xmm7 will be the tweak for next 4-blocks iteration
+
+ #define tweak1 16(sp)
+ #define tweak2 32(sp)
+ #define tweak3 48(sp)
+ #define tweak4 64(sp)
+
+ movaps %xmm7, tweak1 // save 1st tweak on stack
+ xts_mult_x_on_xmm7 // compute 2nd tweak
+ movaps %xmm7, tweak2 // save 2nd tweak on stack
+ xts_mult_x_on_xmm7 // compute 3rd tweak
+ movaps %xmm7, tweak3 // save 3rd tweak on stack
+ xts_mult_x_on_xmm7 // compute 4th tweak
+ movaps %xmm7, tweak4 // save 4th tweak on stack
+ xts_mult_x_on_xmm7 // compute 1st tweak for next iteration
+
+ // read 4 Cs
+ movups (C), %xmm0
+ movups 16(C), %xmm1
+ movups 32(C), %xmm2
+ movups 48(C), %xmm3
+
+ // 4 P = C ^ T
+ pxor tweak1, %xmm0
+ pxor tweak2, %xmm1
+ pxor tweak3, %xmm2
+ pxor tweak4, %xmm3
+
+ // 4 interleaved aes_decrypt
+
+#if defined __i386__
+ mov 8(sp), %ecx // ctx
+ #undef ctx
+ #define ctx %ecx
+#endif
+
+ mov 240(ctx), %eax // aes length
+
+ cmp $160, %eax // AES-128 ?
+ je 160f
+ cmp $192, %eax // AES-192 ?
+ je 192f
+ cmp $224, %eax // AES-256 ?
+ je 224f
+ mov $-1, %eax // error : non-supported aes length
+#ifdef KERNEL
+ movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5
+#endif
+ jmp L_error_uncrypt
+
+ // definitions, macros to construct hw-aes-decrypt
+ // will reuse previously defined key0 = (ctx), key1 = 16(ctx), ....
+ #undef aes
+ #undef aeslast
+ #define aes aesdec
+ #define aeslast aesdeclast
+
+ .macro aes_decrypt_common
+ movups key8, %xmm4
+ aes %xmm5, %xmm0
+ aes %xmm5, %xmm1
+ aes %xmm5, %xmm2
+ aes %xmm5, %xmm3
+ movups key7, %xmm5
+ aes %xmm4, %xmm0
+ aes %xmm4, %xmm1
+ aes %xmm4, %xmm2
+ aes %xmm4, %xmm3
+ movups key6, %xmm4
+ aes %xmm5, %xmm0
+ aes %xmm5, %xmm1
+ aes %xmm5, %xmm2
+ aes %xmm5, %xmm3
+ movups key5, %xmm5
+ aes %xmm4, %xmm0
+ aes %xmm4, %xmm1
+ aes %xmm4, %xmm2
+ aes %xmm4, %xmm3
+ movups key4, %xmm4
+ aes %xmm5, %xmm0
+ aes %xmm5, %xmm1
+ aes %xmm5, %xmm2
+ aes %xmm5, %xmm3
+ movups key3, %xmm5
+ aes %xmm4, %xmm0
+ aes %xmm4, %xmm1
+ aes %xmm4, %xmm2
+ aes %xmm4, %xmm3
+ movups key2, %xmm4
+ aes %xmm5, %xmm0
+ aes %xmm5, %xmm1
+ aes %xmm5, %xmm2
+ aes %xmm5, %xmm3
+ movups key1, %xmm5
+ aes %xmm4, %xmm0
+ aes %xmm4, %xmm1
+ aes %xmm4, %xmm2
+ aes %xmm4, %xmm3
+ movups key0, %xmm4
+ aes %xmm5, %xmm0
+ aes %xmm5, %xmm1
+ aes %xmm5, %xmm2
+ aes %xmm5, %xmm3
+ aeslast %xmm4, %xmm0
+ aeslast %xmm4, %xmm1
+ aeslast %xmm4, %xmm2
+ aeslast %xmm4, %xmm3
+ .endm
+
+ .macro aes_dec_128
+ movups keyA, %xmm4
+ movups key9, %xmm5
+ pxor %xmm4, %xmm0
+ pxor %xmm4, %xmm1
+ pxor %xmm4, %xmm2
+ pxor %xmm4, %xmm3
+ aes_decrypt_common
+ .endm
+
+ .macro aes_dec_192
+ movups keyC, %xmm4
+ movups keyB, %xmm5
+ pxor %xmm4, %xmm0
+ pxor %xmm4, %xmm1
+ pxor %xmm4, %xmm2
+ pxor %xmm4, %xmm3
+ movups keyA, %xmm4
+ aes %xmm5, %xmm0
+ aes %xmm5, %xmm1
+ aes %xmm5, %xmm2
+ aes %xmm5, %xmm3
+ movups key9, %xmm5
+ aes %xmm4, %xmm0
+ aes %xmm4, %xmm1
+ aes %xmm4, %xmm2
+ aes %xmm4, %xmm3
+ aes_decrypt_common
+ .endm
+
+ .macro aes_dec_256
+ movups keyE, %xmm4
+ movups keyD, %xmm5
+ pxor %xmm4, %xmm0
+ pxor %xmm4, %xmm1
+ pxor %xmm4, %xmm2
+ pxor %xmm4, %xmm3
+ movups keyC, %xmm4
+ aes %xmm5, %xmm0
+ aes %xmm5, %xmm1
+ aes %xmm5, %xmm2
+ aes %xmm5, %xmm3
+ movups keyB, %xmm5
+ aes %xmm4, %xmm0
+ aes %xmm4, %xmm1
+ aes %xmm4, %xmm2
+ aes %xmm4, %xmm3
+ movups keyA, %xmm4
+ aes %xmm5, %xmm0
+ aes %xmm5, %xmm1
+ aes %xmm5, %xmm2
+ aes %xmm5, %xmm3
+ movups key9, %xmm5
+ aes %xmm4, %xmm0
+ aes %xmm4, %xmm1
+ aes %xmm4, %xmm2
+ aes %xmm4, %xmm3
+ aes_decrypt_common
+ .endm
+
+160: // AES-128 decrypt
+ aes_dec_128
+ jmp 8f
+
+192: // AES-192 decrypt
+ aes_dec_192
+ jmp 8f
+
+224: // AES-256 decrypt
+ aes_dec_256
+
+8:
+
+ // 4 P = P ^ T
+ pxor tweak1, %xmm0
+ pxor tweak2, %xmm1
+ pxor tweak3, %xmm2
+ pxor tweak4, %xmm3
+
+ // write 4 Ps
+ movups %xmm0, (P)
+ movups %xmm1, 16(P)
+ movups %xmm2, 32(P)
+ movups %xmm3, 48(P)
+
+ add $64, C
+ add $64, P
+
+ sub $4, lim
+ jge 0b
+
+#ifdef KERNEL
+ movaps 0xb0(sp), %xmm5 // hw-aes-based uses extra xmm5
+#endif
+ movups %xmm7, (T)
+
+9:
+ xor %eax, %eax // to return CRYPT_OK
+ add $4, lim // post-increment lim by 4
+ je 9f // if lim==0, branch to prepare to return
+
+L_uncrypt_group_sw:
+
+ movups (T), %xmm7 // T, xmm7 will be used as T (128-bit) throughout the loop
+
+ sub $1, lim // pre-decrement lim by 1
+ jl 1f // if lim < 1, branch to prepare to return
+0:
+ movups (C), %xmm0 // C
+
+ // prepare for calling aes_decrypt
+#if defined __i386__
+ mov P, (%esp) // P
+ mov P, 4(%esp) // P
+ // ctx was prepared previously in preamble
+#else
+ mov P, %rdi // P
+ mov P, %rsi // P
+ mov ctx, %rdx // ctx
+#endif
+
+ pxor %xmm7, %xmm0 // P = C ^ T
+ movups %xmm0, (P) // save P into memory
+
+ call _aes_decrypt_xmm_no_save // err = aes_decrypt(P,P,ctx);
+
+ cmp $CRYPT_OK, %eax // err == CRYPT_OK ?
+ jne 9f // if err != CRYPT_OK, branch to exit with error
+
+ movups (P), %xmm0 // load xmm0 with P
+ pxor %xmm7, %xmm0 // P ^= T
+ movups %xmm0, (P) // save output P
+
+ xts_mult_x_on_xmm7
+
+ add $16, C // next C
+ add $16, P // next P
+ sub $1, lim // lim--
+ jge 0b // if (lim >= 0) repeat the scalar loop
+
+1: movups %xmm7, (T) // save final tweak
+L_error_uncrypt:
+9:
+ // if kernel, restore used xmm registers
+#ifdef KERNEL
+ movaps 0x50(sp), %xmm0
+ movaps 0x60(sp), %xmm1
+ movaps 0x70(sp), %xmm2
+ movaps 0x80(sp), %xmm3
+ movaps 0x90(sp), %xmm4
+ movaps 0xa0(sp), %xmm7
+#endif
+
+#if defined __i386__
+ add $(12+16*8+16*4), %esp
+ pop %esi
+ pop %edi
+ pop %ebx
+#else
+ add $(8+16*8+16*5), %rsp
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+#endif
+ leave
+ ret
diff --git a/bsd/crypto/aes/i386/edefs.h b/bsd/crypto/aes/i386/edefs.h
deleted file mode 100644
index d25bef89c..000000000
--- a/bsd/crypto/aes/i386/edefs.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- ---------------------------------------------------------------------------
- Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.
-
- LICENSE TERMS
-
- The free distribution and use of this software in both source and binary
- form is allowed (with or without changes) provided that:
-
- 1. distributions of this source code include the above copyright
- notice, this list of conditions and the following disclaimer;
-
- 2. distributions in binary form include the above copyright
- notice, this list of conditions and the following disclaimer
- in the documentation and/or other associated materials;
-
- 3. the copyright holder's name is not used to endorse products
- built using this software without specific written permission.
-
- ALTERNATIVELY, provided that this notice is retained in full, this product
- may be distributed under the terms of the GNU General Public License (GPL),
- in which case the provisions of the GPL apply INSTEAD OF those given above.
-
- DISCLAIMER
-
- This software is provided 'as is' with no explicit or implied warranties
- in respect of its properties, including, but not limited to, correctness
- and/or fitness for purpose.
- --------------------------------------------------------------------------- - Issue 31/01/2006 -*/ - -#ifndef EDEFS_H -#define EDEFS_H -#if defined(__cplusplus) -extern "C" -{ -#endif - -#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ -#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ - -#if defined(__GNUC__) || defined(__GNU_LIBRARY__) -# if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) -# include <sys/endian.h> -# elif defined( BSD ) && ( BSD >= 199103 ) || defined( __DJGPP__ ) || defined( __CYGWIN32__ ) -# include <machine/endian.h> -# elif defined(__APPLE__) -# if defined(__BIG_ENDIAN__) && !defined( BIG_ENDIAN ) -# define BIG_ENDIAN -# elif defined(__LITTLE_ENDIAN__) && !defined( LITTLE_ENDIAN ) -# define LITTLE_ENDIAN -# endif -# elif !defined( __MINGW32__ ) -# include <endian.h> -# if !defined(__BEOS__) -# include <byteswap.h> -# endif -# endif -#endif - -#if !defined(PLATFORM_BYTE_ORDER) -# if defined(LITTLE_ENDIAN) || defined(BIG_ENDIAN) -# if defined(LITTLE_ENDIAN) && !defined(BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# elif !defined(LITTLE_ENDIAN) && defined(BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined(BYTE_ORDER) && (BYTE_ORDER == LITTLE_ENDIAN) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# elif defined(BYTE_ORDER) && (BYTE_ORDER == BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# endif -# elif defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN) -# if defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# elif !defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined(_BYTE_ORDER) && (_BYTE_ORDER == _LITTLE_ENDIAN) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# elif defined(_BYTE_ORDER) && (_BYTE_ORDER == _BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# endif -# elif defined(__LITTLE_ENDIAN__) || defined(__BIG_ENDIAN__) -# if defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# elif !defined(__LITTLE_ENDIAN__) && defined(__BIG_ENDIAN__) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __LITTLE_ENDIAN__) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __BIG_ENDIAN__) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# endif -# endif -#endif - -/* if the platform is still unknown, try to find its byte order */ -/* from commonly used machine defines */ - -#if !defined(PLATFORM_BYTE_ORDER) - -#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ - defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ - defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ - defined( vax ) || defined( vms ) || defined( VMS ) || \ - defined( __VMS ) || defined( _M_X64 ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN - -#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ - defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ - defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ - defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ - defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ - defined( __TANDEM ) || defined( THINK_C ) || defined( __VMCMS__ ) || \ - defined( __VOS__ ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN - -#elif 0 /* **** EDIT HERE IF NECESSARY **** */ -# define PLATFORM_BYTE_ORDER 
IS_LITTLE_ENDIAN -#elif 0 /* **** EDIT HERE IF NECESSARY **** */ -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#else -# error Please edit edefs.h (lines 117 or 119) to set the platform byte order -#endif - -#endif - -#if defined(__cplusplus) -} -#endif -#endif diff --git a/bsd/crypto/aes/ppc/Makefile b/bsd/crypto/aes/ppc/Makefile deleted file mode 100644 index 99755ad2e..000000000 --- a/bsd/crypto/aes/ppc/Makefile +++ /dev/null @@ -1,36 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -INSTINC_SUBDIRS = \ - -INSTINC_SUBDIRS_PPC = \ - -INSTINC_SUBDIRS_I386 = \ - -EXPINC_SUBDIRS = \ - -EXPINC_SUBDIRS_PPC = \ - -EXPINC_SUBDIRS_I386 = \ - -PRIVATE_DATAFILES = \ - aestab.h aesopt.h - -INSTALL_MI_DIR = crypto - -EXPORT_MI_DIR = ${INSTALL_MI_DIR} - -INSTALL_KF_MI_LIST = - -INSTALL_KF_MI_LCL_LIST = ${PRIVATE_DATAFILES} - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/bsd/crypto/aes/ppc/aescrypt.c b/bsd/crypto/aes/ppc/aescrypt.c deleted file mode 100644 index 31d4c81af..000000000 --- a/bsd/crypto/aes/ppc/aescrypt.c +++ /dev/null @@ -1,411 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue 28/01/2004 - - This file contains the code for implementing encryption and decryption - for AES (Rijndael) for block and key sizes of 16, 24 and 32 bytes. It - can optionally be replaced by code written in assembler using NASM. 
For - further details see the file aesopt.h -*/ - -#include "aesopt.h" -#include "aestab.h" - -#if defined(__cplusplus) -extern "C" -{ -#endif - -#define ki(y,x,k,c) (s(y,c) = s(x, c) ^ (k)[c]) -#define xo(y,x,c) (s(y,c) ^= s(x, c)) -#define si(y,x,c) (s(y,c) = word_in(x, c)) -#define so(y,x,c) word_out(y, c, s(x,c)) - -#if defined(ARRAYS) -#define locals(y,x) x[4],y[4] -#else -#define locals(y,x) x##0,x##1,x##2,x##3,y##0,y##1,y##2,y##3 -#endif - -#define dtables(tab) const aes_32t *tab##0, *tab##1, *tab##2, *tab##3 -#define itables(tab) tab##0 = tab[0]; tab##1 = tab[1]; tab##2 = tab[2]; tab##3 = tab[3] - -#define l_copy(y, x) s(y,0) = s(x,0); s(y,1) = s(x,1); \ - s(y,2) = s(x,2); s(y,3) = s(x,3); - -#define key_in(y,x,k) ki(y,x,k,0); ki(y,x,k,1); ki(y,x,k,2); ki(y,x,k,3) -#define cbc(y,x) xo(y,x,0); xo(y,x,1); xo(y,x,2); xo(y,x,3) -#define state_in(y,x) si(y,x,0); si(y,x,1); si(y,x,2); si(y,x,3) -#define state_out(y,x) so(y,x,0); so(y,x,1); so(y,x,2); so(y,x,3) -#define round(rm,y,x,k) rm(y,x,k,0); rm(y,x,k,1); rm(y,x,k,2); rm(y,x,k,3) - -#if defined(ENCRYPTION) && !defined(AES_ASM) - -/* Visual C++ .Net v7.1 provides the fastest encryption code when using - Pentium optimiation with small code but this is poor for decryption - so we need to control this with the following VC++ pragmas -*/ - -#if defined(_MSC_VER) -#pragma optimize( "s", on ) -#endif - -/* Given the column (c) of the output state variable, the following - macros give the input state variables which are needed in its - computation for each row (r) of the state. All the alternative - macros give the same end values but expand into different ways - of calculating these values. In particular the complex macro - used for dynamically variable block sizes is designed to expand - to a compile time constant whenever possible but will expand to - conditional clauses on some branches (I am grateful to Frank - Yellin for this construction) -*/ - -#define fwd_var(x,r,c)\ - ( r == 0 ? ( c == 0 ? s(x,0) : c == 1 ? s(x,1) : c == 2 ? s(x,2) : s(x,3))\ - : r == 1 ? ( c == 0 ? s(x,1) : c == 1 ? s(x,2) : c == 2 ? s(x,3) : s(x,0))\ - : r == 2 ? ( c == 0 ? s(x,2) : c == 1 ? s(x,3) : c == 2 ? s(x,0) : s(x,1))\ - : ( c == 0 ? s(x,3) : c == 1 ? s(x,0) : c == 2 ? 
s(x,1) : s(x,2))) - -#if defined(FT4_SET) -#undef dec_fmvars -# if defined(ENC_ROUND_CACHE_TABLES) -#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_cached_tables(x,t_fn,fwd_var,rf1,c)) -# else -#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_fn,fwd_var,rf1,c)) -# endif -#elif defined(FT1_SET) -#undef dec_fmvars -#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ one_table(x,upr,t_fn,fwd_var,rf1,c)) -#else -#define fwd_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ fwd_mcol(no_table(x,t_sbox,fwd_var,rf1,c))) -#endif - -#if defined(FL4_SET) -# if defined(LAST_ENC_ROUND_CACHE_TABLES) -#define fwd_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_cached_tables(x,t_fl,fwd_var,rf1,c)) -# else -#define fwd_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_fl,fwd_var,rf1,c)) -# endif -#elif defined(FL1_SET) -#define fwd_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ one_table(x,ups,t_fl,fwd_var,rf1,c)) -#else -#define fwd_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ no_table(x,t_sbox,fwd_var,rf1,c)) -#endif - -aes_rval aes_encrypt_cbc(const unsigned char *in, const unsigned char *in_iv, unsigned int num_blk, - unsigned char *out, const aes_encrypt_ctx cx[1]) -{ aes_32t locals(b0, b1); - const aes_32t *kp; - const aes_32t *kptr = cx->ks; -#if defined(ENC_ROUND_CACHE_TABLES) - dtables(t_fn); -#endif -#if defined(LAST_ENC_ROUND_CACHE_TABLES) - dtables(t_fl); -#endif - -#if defined( dec_fmvars ) - dec_fmvars; /* declare variables for fwd_mcol() if needed */ -#endif - -#if defined( AES_ERR_CHK ) - if( cx->rn != 10 && cx->rn != 12 && cx->rn != 14 ) - return aes_error; -#endif - - // Load IV into b0. - state_in(b0, in_iv); - - for (;num_blk; in += AES_BLOCK_SIZE, out += AES_BLOCK_SIZE, --num_blk) - { - kp = kptr; -#if 0 - // Read the plaintext into b1 - state_in(b1, in); - // Do the CBC with b0 which is either the iv or the ciphertext of the previous block. - cbc(b1, b0); - - // Xor b1 with the key schedule to get things started. 
- key_in(b0, b1, kp); -#else - // Since xor is associative we mess with the ordering here to get the loads started early - key_in(b1, b0, kp); // Xor b0(IV) with the key schedule and assign to b1 - state_in(b0, in); // Load block into b0 - cbc(b0, b1); // Xor b0 with b1 and store in b0 -#endif - -#if defined(ENC_ROUND_CACHE_TABLES) - itables(t_fn); -#endif - -#if (ENC_UNROLL == FULL) - - switch(cx->rn) - { - case 14: - round(fwd_rnd, b1, b0, kp + 1 * N_COLS); - round(fwd_rnd, b0, b1, kp + 2 * N_COLS); - kp += 2 * N_COLS; - case 12: - round(fwd_rnd, b1, b0, kp + 1 * N_COLS); - round(fwd_rnd, b0, b1, kp + 2 * N_COLS); - kp += 2 * N_COLS; - case 10: - default: - round(fwd_rnd, b1, b0, kp + 1 * N_COLS); - round(fwd_rnd, b0, b1, kp + 2 * N_COLS); - round(fwd_rnd, b1, b0, kp + 3 * N_COLS); - round(fwd_rnd, b0, b1, kp + 4 * N_COLS); - round(fwd_rnd, b1, b0, kp + 5 * N_COLS); - round(fwd_rnd, b0, b1, kp + 6 * N_COLS); - round(fwd_rnd, b1, b0, kp + 7 * N_COLS); - round(fwd_rnd, b0, b1, kp + 8 * N_COLS); - round(fwd_rnd, b1, b0, kp + 9 * N_COLS); -#if defined(LAST_ENC_ROUND_CACHE_TABLES) - itables(t_fl); -#endif - round(fwd_lrnd, b0, b1, kp +10 * N_COLS); - } - -#else - - { aes_32t rnd; -#if (ENC_UNROLL == PARTIAL) - for(rnd = 0; rnd < (cx->rn >> 1) - 1; ++rnd) - { - kp += N_COLS; - round(fwd_rnd, b1, b0, kp); - kp += N_COLS; - round(fwd_rnd, b0, b1, kp); - } - kp += N_COLS; - round(fwd_rnd, b1, b0, kp); -#else - for(rnd = 0; rnd < cx->rn - 1; ++rnd) - { - kp += N_COLS; - round(fwd_rnd, b1, b0, kp); - l_copy(b0, b1); - } -#endif -#if defined(LAST_ENC_ROUND_CACHE_TABLES) - itables(t_fl); -#endif - kp += N_COLS; - round(fwd_lrnd, b0, b1, kp); - } -#endif - - state_out(out, b0); - } - -#if defined( AES_ERR_CHK ) - return aes_good; -#endif -} - -#endif - -#if defined(DECRYPTION) && !defined(AES_ASM) - -/* Visual C++ .Net v7.1 provides the fastest encryption code when using - Pentium optimiation with small code but this is poor for decryption - so we need to control this with the following VC++ pragmas -*/ - -#if defined(_MSC_VER) -#pragma optimize( "t", on ) -#endif - -/* Given the column (c) of the output state variable, the following - macros give the input state variables which are needed in its - computation for each row (r) of the state. All the alternative - macros give the same end values but expand into different ways - of calculating these values. In particular the complex macro - used for dynamically variable block sizes is designed to expand - to a compile time constant whenever possible but will expand to - conditional clauses on some branches (I am grateful to Frank - Yellin for this construction) -*/ - -#define inv_var(x,r,c)\ - ( r == 0 ? ( c == 0 ? s(x,0) : c == 1 ? s(x,1) : c == 2 ? s(x,2) : s(x,3))\ - : r == 1 ? ( c == 0 ? s(x,3) : c == 1 ? s(x,0) : c == 2 ? s(x,1) : s(x,2))\ - : r == 2 ? ( c == 0 ? s(x,2) : c == 1 ? s(x,3) : c == 2 ? s(x,0) : s(x,1))\ - : ( c == 0 ? s(x,1) : c == 1 ? s(x,2) : c == 2 ? 
s(x,3) : s(x,0))) - -#if defined(IT4_SET) -#undef dec_imvars -# if defined(DEC_ROUND_CACHE_TABLES) -#define inv_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_cached_tables(x,t_in,inv_var,rf1,c)) -# else -#define inv_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_in,inv_var,rf1,c)) -# endif -#elif defined(IT1_SET) -#undef dec_imvars -#define inv_rnd(y,x,k,c) (s(y,c) = (k)[c] ^ one_table(x,upr,t_in,inv_var,rf1,c)) -#else -#define inv_rnd(y,x,k,c) (s(y,c) = inv_mcol((k)[c] ^ no_table(x,t_ibox,inv_var,rf1,c))) -#endif - -#if defined(IL4_SET) -# if defined(LAST_DEC_ROUND_CACHE_TABLES) -#define inv_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_cached_tables(x,t_il,inv_var,rf1,c)) -# else -#define inv_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ four_tables(x,t_il,inv_var,rf1,c)) -# endif -#elif defined(IL1_SET) -#define inv_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ one_table(x,ups,t_il,inv_var,rf1,c)) -#else -#define inv_lrnd(y,x,k,c) (s(y,c) = (k)[c] ^ no_table(x,t_ibox,inv_var,rf1,c)) -#endif - -aes_rval aes_decrypt_cbc(const unsigned char *in, const unsigned char *in_iv, unsigned int num_blk, - unsigned char *out, const aes_decrypt_ctx cx[1]) -{ aes_32t locals(b0, b1); - const aes_32t *kptr = cx->ks + cx->rn * N_COLS; - const aes_32t *kp; -#if defined(DEC_ROUND_CACHE_TABLES) - dtables(t_in); -#endif -#if defined(LAST_DEC_ROUND_CACHE_TABLES) - dtables(t_il); -#endif - -#if defined( dec_imvars ) - dec_imvars; /* declare variables for inv_mcol() if needed */ -#endif - -#if defined( AES_ERR_CHK ) - if( cx->rn != 10 && cx->rn != 12 && cx->rn != 14 ) - return aes_error; -#endif - -#if defined(DEC_ROUND_CACHE_TABLES) - itables(t_in); -#endif - - in += AES_BLOCK_SIZE * (num_blk - 1); - out += AES_BLOCK_SIZE * (num_blk - 1); - // Load the last block's ciphertext into b1 - state_in(b1, in); - - for (;num_blk; out -= AES_BLOCK_SIZE, --num_blk) - { - kp = kptr; - // Do the xor part of state_in, where b1 is the previous block's ciphertext. 
- key_in(b0, b1, kp); - -#if (DEC_UNROLL == FULL) - - switch(cx->rn) - { - case 14: - round(inv_rnd, b1, b0, kp - 1 * N_COLS); - round(inv_rnd, b0, b1, kp - 2 * N_COLS); - kp -= 2 * N_COLS; - case 12: - round(inv_rnd, b1, b0, kp - 1 * N_COLS); - round(inv_rnd, b0, b1, kp - 2 * N_COLS); - kp -= 2 * N_COLS; - case 10: - default: - round(inv_rnd, b1, b0, kp - 1 * N_COLS); - round(inv_rnd, b0, b1, kp - 2 * N_COLS); - round(inv_rnd, b1, b0, kp - 3 * N_COLS); - round(inv_rnd, b0, b1, kp - 4 * N_COLS); - round(inv_rnd, b1, b0, kp - 5 * N_COLS); - round(inv_rnd, b0, b1, kp - 6 * N_COLS); - round(inv_rnd, b1, b0, kp - 7 * N_COLS); - round(inv_rnd, b0, b1, kp - 8 * N_COLS); - round(inv_rnd, b1, b0, kp - 9 * N_COLS); -#if defined(LAST_DEC_ROUND_CACHE_TABLES) - itables(t_il); -#endif - round(inv_lrnd, b0, b1, kp - 10 * N_COLS); - } - -#else - - { aes_32t rnd; -#if (DEC_UNROLL == PARTIAL) - for(rnd = 0; rnd < (cx->rn >> 1) - 1; ++rnd) - { - kp -= N_COLS; - round(inv_rnd, b1, b0, kp); - kp -= N_COLS; - round(inv_rnd, b0, b1, kp); - } - kp -= N_COLS; - round(inv_rnd, b1, b0, kp); -#else - for(rnd = 0; rnd < cx->rn - 1; ++rnd) - { - kp -= N_COLS; - round(inv_rnd, b1, b0, kp); - l_copy(b0, b1); - } -#endif -#if defined(LAST_DEC_ROUND_CACHE_TABLES) - itables(t_il); -#endif - kp -= N_COLS; - round(inv_lrnd, b0, b1, kp); - } -#endif - - if (num_blk == 1) - { - // We are doing the first block so we need the IV rather than the previous - // block for CBC (there is no previous block) - state_in(b1, in_iv); - } - else - { - in -= AES_BLOCK_SIZE; - state_in(b1, in); - } - - // Do the CBC with b1 which is either the IV or the ciphertext of the previous block. - cbc(b0, b1); - - state_out(out, b0); - } -#if defined( AES_ERR_CHK ) - return aes_good; -#endif -} - -#endif - -#if defined(__cplusplus) -} -#endif diff --git a/bsd/crypto/aes/ppc/aeskey.c b/bsd/crypto/aes/ppc/aeskey.c deleted file mode 100644 index 5e0a6453c..000000000 --- a/bsd/crypto/aes/ppc/aeskey.c +++ /dev/null @@ -1,455 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue Date: 26/08/2003 - - This file contains the code for implementing the key schedule for AES - (Rijndael) for block and key sizes of 16, 24, and 32 bytes. See aesopt.h - for further details including optimisation. 
-*/ - -#include "aesopt.h" -#include "aestab.h" - -#if defined(__cplusplus) -extern "C" -{ -#endif - -/* Initialise the key schedule from the user supplied key. The key - length can be specified in bytes, with legal values of 16, 24 - and 32, or in bits, with legal values of 128, 192 and 256. These - values correspond with Nk values of 4, 6 and 8 respectively. - - The following macros implement a single cycle in the key - schedule generation process. The number of cycles needed - for each cx->n_col and nk value is: - - nk = 4 5 6 7 8 - ------------------------------ - cx->n_col = 4 10 9 8 7 7 - cx->n_col = 5 14 11 10 9 9 - cx->n_col = 6 19 15 12 11 11 - cx->n_col = 7 21 19 16 13 14 - cx->n_col = 8 29 23 19 17 14 -*/ - -#define ke4(k,i) \ -{ k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; k[4*(i)+5] = ss[1] ^= ss[0]; \ - k[4*(i)+6] = ss[2] ^= ss[1]; k[4*(i)+7] = ss[3] ^= ss[2]; \ -} -#define kel4(k,i) \ -{ k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; k[4*(i)+5] = ss[1] ^= ss[0]; \ - k[4*(i)+6] = ss[2] ^= ss[1]; k[4*(i)+7] = ss[3] ^= ss[2]; \ -} - -#define ke6(k,i) \ -{ k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; k[6*(i)+ 7] = ss[1] ^= ss[0]; \ - k[6*(i)+ 8] = ss[2] ^= ss[1]; k[6*(i)+ 9] = ss[3] ^= ss[2]; \ - k[6*(i)+10] = ss[4] ^= ss[3]; k[6*(i)+11] = ss[5] ^= ss[4]; \ -} -#define kel6(k,i) \ -{ k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; k[6*(i)+ 7] = ss[1] ^= ss[0]; \ - k[6*(i)+ 8] = ss[2] ^= ss[1]; k[6*(i)+ 9] = ss[3] ^= ss[2]; \ -} - -#define ke8(k,i) \ -{ k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; k[8*(i)+ 9] = ss[1] ^= ss[0]; \ - k[8*(i)+10] = ss[2] ^= ss[1]; k[8*(i)+11] = ss[3] ^= ss[2]; \ - k[8*(i)+12] = ss[4] ^= ls_box(ss[3],0); k[8*(i)+13] = ss[5] ^= ss[4]; \ - k[8*(i)+14] = ss[6] ^= ss[5]; k[8*(i)+15] = ss[7] ^= ss[6]; \ -} -#define kel8(k,i) \ -{ k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; k[8*(i)+ 9] = ss[1] ^= ss[0]; \ - k[8*(i)+10] = ss[2] ^= ss[1]; k[8*(i)+11] = ss[3] ^= ss[2]; \ -} - -#if defined(ENCRYPTION_KEY_SCHEDULE) - -#if defined(AES_128) || defined(AES_VAR) - -aes_rval aes_encrypt_key128(const unsigned char *key, aes_encrypt_ctx cx[1]) -{ aes_32t ss[4]; - - cx->ks[0] = ss[0] = word_in(key, 0); - cx->ks[1] = ss[1] = word_in(key, 1); - cx->ks[2] = ss[2] = word_in(key, 2); - cx->ks[3] = ss[3] = word_in(key, 3); - -#if ENC_UNROLL == NONE - { aes_32t i; - - for(i = 0; i < ((11 * N_COLS - 5) / 4); ++i) - ke4(cx->ks, i); - } -#else - ke4(cx->ks, 0); ke4(cx->ks, 1); - ke4(cx->ks, 2); ke4(cx->ks, 3); - ke4(cx->ks, 4); ke4(cx->ks, 5); - ke4(cx->ks, 6); ke4(cx->ks, 7); - ke4(cx->ks, 8); -#endif - kel4(cx->ks, 9); - cx->rn = 10; -#if defined( AES_ERR_CHK ) - return aes_good; -#endif -} - -#endif - -#if defined(AES_192) || defined(AES_VAR) - -aes_rval aes_encrypt_key192(const unsigned char *key, aes_encrypt_ctx cx[1]) -{ aes_32t ss[6]; - - cx->ks[0] = ss[0] = word_in(key, 0); - cx->ks[1] = ss[1] = word_in(key, 1); - cx->ks[2] = ss[2] = word_in(key, 2); - cx->ks[3] = ss[3] = word_in(key, 3); - cx->ks[4] = ss[4] = word_in(key, 4); - cx->ks[5] = ss[5] = word_in(key, 5); - -#if ENC_UNROLL == NONE - { aes_32t i; - - for(i = 0; i < (13 * N_COLS - 7) / 6; ++i) - ke6(cx->ks, i); - } -#else - ke6(cx->ks, 0); ke6(cx->ks, 1); - ke6(cx->ks, 2); ke6(cx->ks, 3); - ke6(cx->ks, 4); ke6(cx->ks, 5); - ke6(cx->ks, 6); -#endif - kel6(cx->ks, 7); - cx->rn = 12; -#if defined( AES_ERR_CHK ) - return aes_good; -#endif -} - -#endif - -#if defined(AES_256) || defined(AES_VAR) - -aes_rval aes_encrypt_key256(const unsigned char *key, 
aes_encrypt_ctx cx[1]) -{ aes_32t ss[8]; - - cx->ks[0] = ss[0] = word_in(key, 0); - cx->ks[1] = ss[1] = word_in(key, 1); - cx->ks[2] = ss[2] = word_in(key, 2); - cx->ks[3] = ss[3] = word_in(key, 3); - cx->ks[4] = ss[4] = word_in(key, 4); - cx->ks[5] = ss[5] = word_in(key, 5); - cx->ks[6] = ss[6] = word_in(key, 6); - cx->ks[7] = ss[7] = word_in(key, 7); - -#if ENC_UNROLL == NONE - { aes_32t i; - - for(i = 0; i < (15 * N_COLS - 9) / 8; ++i) - ke8(cx->ks, i); - } -#else - ke8(cx->ks, 0); ke8(cx->ks, 1); - ke8(cx->ks, 2); ke8(cx->ks, 3); - ke8(cx->ks, 4); ke8(cx->ks, 5); -#endif - kel8(cx->ks, 6); - cx->rn = 14; -#if defined( AES_ERR_CHK ) - return aes_good; -#endif -} - -#endif - -#if defined(AES_VAR) - -aes_rval aes_encrypt_key(const unsigned char *key, int key_len, aes_encrypt_ctx cx[1]) -{ - switch(key_len) - { -#if defined( AES_ERR_CHK ) - case 16: case 128: return aes_encrypt_key128(key, cx); - case 24: case 192: return aes_encrypt_key192(key, cx); - case 32: case 256: return aes_encrypt_key256(key, cx); - default: return aes_error; -#else - case 16: case 128: aes_encrypt_key128(key, cx); return; - case 24: case 192: aes_encrypt_key192(key, cx); return; - case 32: case 256: aes_encrypt_key256(key, cx); return; -#endif - } -} - -#endif - -#endif - -#if defined(DECRYPTION_KEY_SCHEDULE) - -#if DEC_ROUND == NO_TABLES -#define ff(x) (x) -#else -#define ff(x) inv_mcol(x) -#if defined( dec_imvars ) -#define d_vars dec_imvars -#endif -#endif - -#if 1 -#define kdf4(k,i) \ -{ ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3]; ss[1] = ss[1] ^ ss[3]; ss[2] = ss[2] ^ ss[3]; ss[3] = ss[3]; \ - ss[4] = ls_box(ss[(i+3) % 4], 3) ^ t_use(r,c)[i]; ss[i % 4] ^= ss[4]; \ - ss[4] ^= k[4*(i)]; k[4*(i)+4] = ff(ss[4]); ss[4] ^= k[4*(i)+1]; k[4*(i)+5] = ff(ss[4]); \ - ss[4] ^= k[4*(i)+2]; k[4*(i)+6] = ff(ss[4]); ss[4] ^= k[4*(i)+3]; k[4*(i)+7] = ff(ss[4]); \ -} -#define kd4(k,i) \ -{ ss[4] = ls_box(ss[(i+3) % 4], 3) ^ t_use(r,c)[i]; ss[i % 4] ^= ss[4]; ss[4] = ff(ss[4]); \ - k[4*(i)+4] = ss[4] ^= k[4*(i)]; k[4*(i)+5] = ss[4] ^= k[4*(i)+1]; \ - k[4*(i)+6] = ss[4] ^= k[4*(i)+2]; k[4*(i)+7] = ss[4] ^= k[4*(i)+3]; \ -} -#define kdl4(k,i) \ -{ ss[4] = ls_box(ss[(i+3) % 4], 3) ^ t_use(r,c)[i]; ss[i % 4] ^= ss[4]; \ - k[4*(i)+4] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3]; k[4*(i)+5] = ss[1] ^ ss[3]; \ - k[4*(i)+6] = ss[0]; k[4*(i)+7] = ss[1]; \ -} -#else -#define kdf4(k,i) \ -{ ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; k[4*(i)+ 4] = ff(ss[0]); ss[1] ^= ss[0]; k[4*(i)+ 5] = ff(ss[1]); \ - ss[2] ^= ss[1]; k[4*(i)+ 6] = ff(ss[2]); ss[3] ^= ss[2]; k[4*(i)+ 7] = ff(ss[3]); \ -} -#define kd4(k,i) \ -{ ss[4] = ls_box(ss[3],3) ^ t_use(r,c)[i]; \ - ss[0] ^= ss[4]; ss[4] = ff(ss[4]); k[4*(i)+ 4] = ss[4] ^= k[4*(i)]; \ - ss[1] ^= ss[0]; k[4*(i)+ 5] = ss[4] ^= k[4*(i)+ 1]; \ - ss[2] ^= ss[1]; k[4*(i)+ 6] = ss[4] ^= k[4*(i)+ 2]; \ - ss[3] ^= ss[2]; k[4*(i)+ 7] = ss[4] ^= k[4*(i)+ 3]; \ -} -#define kdl4(k,i) \ -{ ss[0] ^= ls_box(ss[3],3) ^ t_use(r,c)[i]; k[4*(i)+ 4] = ss[0]; ss[1] ^= ss[0]; k[4*(i)+ 5] = ss[1]; \ - ss[2] ^= ss[1]; k[4*(i)+ 6] = ss[2]; ss[3] ^= ss[2]; k[4*(i)+ 7] = ss[3]; \ -} -#endif - -#define kdf6(k,i) \ -{ ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; k[6*(i)+ 6] = ff(ss[0]); ss[1] ^= ss[0]; k[6*(i)+ 7] = ff(ss[1]); \ - ss[2] ^= ss[1]; k[6*(i)+ 8] = ff(ss[2]); ss[3] ^= ss[2]; k[6*(i)+ 9] = ff(ss[3]); \ - ss[4] ^= ss[3]; k[6*(i)+10] = ff(ss[4]); ss[5] ^= ss[4]; k[6*(i)+11] = ff(ss[5]); \ -} -#define kd6(k,i) \ -{ ss[6] = ls_box(ss[5],3) ^ t_use(r,c)[i]; \ - ss[0] ^= ss[6]; ss[6] = ff(ss[6]); k[6*(i)+ 6] = ss[6] ^= k[6*(i)]; \ - ss[1] ^= 
ss[0]; k[6*(i)+ 7] = ss[6] ^= k[6*(i)+ 1]; \ - ss[2] ^= ss[1]; k[6*(i)+ 8] = ss[6] ^= k[6*(i)+ 2]; \ - ss[3] ^= ss[2]; k[6*(i)+ 9] = ss[6] ^= k[6*(i)+ 3]; \ - ss[4] ^= ss[3]; k[6*(i)+10] = ss[6] ^= k[6*(i)+ 4]; \ - ss[5] ^= ss[4]; k[6*(i)+11] = ss[6] ^= k[6*(i)+ 5]; \ -} -#define kdl6(k,i) \ -{ ss[0] ^= ls_box(ss[5],3) ^ t_use(r,c)[i]; k[6*(i)+ 6] = ss[0]; ss[1] ^= ss[0]; k[6*(i)+ 7] = ss[1]; \ - ss[2] ^= ss[1]; k[6*(i)+ 8] = ss[2]; ss[3] ^= ss[2]; k[6*(i)+ 9] = ss[3]; \ -} - -#define kdf8(k,i) \ -{ ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; k[8*(i)+ 8] = ff(ss[0]); ss[1] ^= ss[0]; k[8*(i)+ 9] = ff(ss[1]); \ - ss[2] ^= ss[1]; k[8*(i)+10] = ff(ss[2]); ss[3] ^= ss[2]; k[8*(i)+11] = ff(ss[3]); \ - ss[4] ^= ls_box(ss[3],0); k[8*(i)+12] = ff(ss[4]); ss[5] ^= ss[4]; k[8*(i)+13] = ff(ss[5]); \ - ss[6] ^= ss[5]; k[8*(i)+14] = ff(ss[6]); ss[7] ^= ss[6]; k[8*(i)+15] = ff(ss[7]); \ -} -#define kd8(k,i) \ -{ aes_32t g = ls_box(ss[7],3) ^ t_use(r,c)[i]; \ - ss[0] ^= g; g = ff(g); k[8*(i)+ 8] = g ^= k[8*(i)]; \ - ss[1] ^= ss[0]; k[8*(i)+ 9] = g ^= k[8*(i)+ 1]; \ - ss[2] ^= ss[1]; k[8*(i)+10] = g ^= k[8*(i)+ 2]; \ - ss[3] ^= ss[2]; k[8*(i)+11] = g ^= k[8*(i)+ 3]; \ - g = ls_box(ss[3],0); \ - ss[4] ^= g; g = ff(g); k[8*(i)+12] = g ^= k[8*(i)+ 4]; \ - ss[5] ^= ss[4]; k[8*(i)+13] = g ^= k[8*(i)+ 5]; \ - ss[6] ^= ss[5]; k[8*(i)+14] = g ^= k[8*(i)+ 6]; \ - ss[7] ^= ss[6]; k[8*(i)+15] = g ^= k[8*(i)+ 7]; \ -} -#define kdl8(k,i) \ -{ ss[0] ^= ls_box(ss[7],3) ^ t_use(r,c)[i]; k[8*(i)+ 8] = ss[0]; ss[1] ^= ss[0]; k[8*(i)+ 9] = ss[1]; \ - ss[2] ^= ss[1]; k[8*(i)+10] = ss[2]; ss[3] ^= ss[2]; k[8*(i)+11] = ss[3]; \ -} - -#if defined(AES_128) || defined(AES_VAR) - -aes_rval aes_decrypt_key128(const unsigned char *key, aes_decrypt_ctx cx[1]) -{ aes_32t ss[5]; -#if defined( d_vars ) - d_vars; -#endif - cx->ks[0] = ss[0] = word_in(key, 0); - cx->ks[1] = ss[1] = word_in(key, 1); - cx->ks[2] = ss[2] = word_in(key, 2); - cx->ks[3] = ss[3] = word_in(key, 3); - -#if DEC_UNROLL == NONE - { aes_32t i; - - for(i = 0; i < (11 * N_COLS - 5) / 4; ++i) - ke4(cx->ks, i); - kel4(cx->ks, 9); -#if !(DEC_ROUND == NO_TABLES) - for(i = N_COLS; i < 10 * N_COLS; ++i) - cx->ks[i] = inv_mcol(cx->ks[i]); -#endif - } -#else - kdf4(cx->ks, 0); kd4(cx->ks, 1); - kd4(cx->ks, 2); kd4(cx->ks, 3); - kd4(cx->ks, 4); kd4(cx->ks, 5); - kd4(cx->ks, 6); kd4(cx->ks, 7); - kd4(cx->ks, 8); kdl4(cx->ks, 9); -#endif - cx->rn = 10; -#if defined( AES_ERR_CHK ) - return aes_good; -#endif -} - -#endif - -#if defined(AES_192) || defined(AES_VAR) - -aes_rval aes_decrypt_key192(const unsigned char *key, aes_decrypt_ctx cx[1]) -{ aes_32t ss[7]; -#if defined( d_vars ) - d_vars; -#endif - cx->ks[0] = ss[0] = word_in(key, 0); - cx->ks[1] = ss[1] = word_in(key, 1); - cx->ks[2] = ss[2] = word_in(key, 2); - cx->ks[3] = ss[3] = word_in(key, 3); - -#if DEC_UNROLL == NONE - cx->ks[4] = ss[4] = word_in(key, 4); - cx->ks[5] = ss[5] = word_in(key, 5); - { aes_32t i; - - for(i = 0; i < (13 * N_COLS - 7) / 6; ++i) - ke6(cx->ks, i); - kel6(cx->ks, 7); -#if !(DEC_ROUND == NO_TABLES) - for(i = N_COLS; i < 12 * N_COLS; ++i) - cx->ks[i] = inv_mcol(cx->ks[i]); -#endif - } -#else - cx->ks[4] = ff(ss[4] = word_in(key, 4)); - cx->ks[5] = ff(ss[5] = word_in(key, 5)); - kdf6(cx->ks, 0); kd6(cx->ks, 1); - kd6(cx->ks, 2); kd6(cx->ks, 3); - kd6(cx->ks, 4); kd6(cx->ks, 5); - kd6(cx->ks, 6); kdl6(cx->ks, 7); -#endif - cx->rn = 12; -#if defined( AES_ERR_CHK ) - return aes_good; -#endif -} - -#endif - -#if defined(AES_256) || defined(AES_VAR) - -aes_rval aes_decrypt_key256(const unsigned char *key, 
aes_decrypt_ctx cx[1]) -{ aes_32t ss[8]; -#if defined( d_vars ) - d_vars; -#endif - cx->ks[0] = ss[0] = word_in(key, 0); - cx->ks[1] = ss[1] = word_in(key, 1); - cx->ks[2] = ss[2] = word_in(key, 2); - cx->ks[3] = ss[3] = word_in(key, 3); - -#if DEC_UNROLL == NONE - cx->ks[4] = ss[4] = word_in(key, 4); - cx->ks[5] = ss[5] = word_in(key, 5); - cx->ks[6] = ss[6] = word_in(key, 6); - cx->ks[7] = ss[7] = word_in(key, 7); - { aes_32t i; - - for(i = 0; i < (15 * N_COLS - 9) / 8; ++i) - ke8(cx->ks, i); - kel8(cx->ks, i); -#if !(DEC_ROUND == NO_TABLES) - for(i = N_COLS; i < 14 * N_COLS; ++i) - cx->ks[i] = inv_mcol(cx->ks[i]); - -#endif - } -#else - cx->ks[4] = ff(ss[4] = word_in(key, 4)); - cx->ks[5] = ff(ss[5] = word_in(key, 5)); - cx->ks[6] = ff(ss[6] = word_in(key, 6)); - cx->ks[7] = ff(ss[7] = word_in(key, 7)); - kdf8(cx->ks, 0); kd8(cx->ks, 1); - kd8(cx->ks, 2); kd8(cx->ks, 3); - kd8(cx->ks, 4); kd8(cx->ks, 5); - kdl8(cx->ks, 6); -#endif - cx->rn = 14; -#if defined( AES_ERR_CHK ) - return aes_good; -#endif -} - -#endif - -#if defined(AES_VAR) - -aes_rval aes_decrypt_key(const unsigned char *key, int key_len, aes_decrypt_ctx cx[1]) -{ - switch(key_len) - { -#if defined( AES_ERR_CHK ) - case 16: case 128: return aes_decrypt_key128(key, cx); - case 24: case 192: return aes_decrypt_key192(key, cx); - case 32: case 256: return aes_decrypt_key256(key, cx); - default: return aes_error; -#else - case 16: case 128: aes_decrypt_key128(key, cx); return; - case 24: case 192: aes_decrypt_key192(key, cx); return; - case 32: case 256: aes_decrypt_key256(key, cx); return; -#endif - } -} - -#endif - -#endif - -#if defined(__cplusplus) -} -#endif diff --git a/bsd/crypto/aes/ppc/aesopt.h b/bsd/crypto/aes/ppc/aesopt.h deleted file mode 100644 index 2b78eb920..000000000 --- a/bsd/crypto/aes/ppc/aesopt.h +++ /dev/null @@ -1,753 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue 28/01/2004 - - My thanks go to Dag Arne Osvik for devising the schemes used here for key - length derivation from the form of the key schedule - - This file contains the compilation options for AES (Rijndael) and code - that is common across encryption, key scheduling and table generation. - - OPERATION - - These source code files implement the AES algorithm Rijndael designed by - Joan Daemen and Vincent Rijmen. 
This version is designed for the standard - block size of 16 bytes and for key sizes of 128, 192 and 256 bits (16, 24 - and 32 bytes). - - This version is designed for flexibility and speed using operations on - 32-bit words rather than operations on bytes. It can be compiled with - either big or little endian internal byte order but is faster when the - native byte order for the processor is used. - - THE CIPHER INTERFACE - - The cipher interface is implemented as an array of bytes in which lower - AES bit sequence indexes map to higher numeric significance within bytes. - - aes_08t (an unsigned 8-bit type) - aes_32t (an unsigned 32-bit type) - struct aes_encrypt_ctx (structure for the cipher encryption context) - struct aes_decrypt_ctx (structure for the cipher decryption context) - aes_rval the function return type - - C subroutine calls: - - aes_rval aes_encrypt_key128(const unsigned char *key, aes_encrypt_ctx cx[1]); - aes_rval aes_encrypt_key192(const unsigned char *key, aes_encrypt_ctx cx[1]); - aes_rval aes_encrypt_key256(const unsigned char *key, aes_encrypt_ctx cx[1]); - aes_rval aes_encrypt(const unsigned char *in, unsigned char *out, - const aes_encrypt_ctx cx[1]); - - aes_rval aes_decrypt_key128(const unsigned char *key, aes_decrypt_ctx cx[1]); - aes_rval aes_decrypt_key192(const unsigned char *key, aes_decrypt_ctx cx[1]); - aes_rval aes_decrypt_key256(const unsigned char *key, aes_decrypt_ctx cx[1]); - aes_rval aes_decrypt(const unsigned char *in, unsigned char *out, - const aes_decrypt_ctx cx[1]); - - IMPORTANT NOTE: If you are using this C interface with dynamic tables make sure that - you call genTabs() before AES is used so that the tables are initialised. - - C++ aes class subroutines: - - Class AESencrypt for encryption - - Construtors: - AESencrypt(void) - AESencrypt(const unsigned char *key) - 128 bit key - Members: - aes_rval key128(const unsigned char *key) - aes_rval key192(const unsigned char *key) - aes_rval key256(const unsigned char *key) - aes_rval encrypt(const unsigned char *in, unsigned char *out) const - - Class AESdecrypt for encryption - Construtors: - AESdecrypt(void) - AESdecrypt(const unsigned char *key) - 128 bit key - Members: - aes_rval key128(const unsigned char *key) - aes_rval key192(const unsigned char *key) - aes_rval key256(const unsigned char *key) - aes_rval decrypt(const unsigned char *in, unsigned char *out) const - - COMPILATION - - The files used to provide AES (Rijndael) are - - a. aes.h for the definitions needed for use in C. - b. aescpp.h for the definitions needed for use in C++. - c. aesopt.h for setting compilation options (also includes common code). - d. aescrypt.c for encryption and decrytpion, or - e. aeskey.c for key scheduling. - f. aestab.c for table loading or generation. - g. aescrypt.asm for encryption and decryption using assembler code. - h. aescrypt.mmx.asm for encryption and decryption using MMX assembler. - - To compile AES (Rijndael) for use in C code use aes.h and set the - defines here for the facilities you need (key lengths, encryption - and/or decryption). Do not define AES_DLL or AES_CPP. Set the options - for optimisations and table sizes here. - - To compile AES (Rijndael) for use in in C++ code use aescpp.h but do - not define AES_DLL - - To compile AES (Rijndael) in C as a Dynamic Link Library DLL) use - aes.h and include the AES_DLL define. - - CONFIGURATION OPTIONS (here and in aes.h) - - a. set AES_DLL in aes.h if AES (Rijndael) is to be compiled as a DLL - b. 
You may need to set PLATFORM_BYTE_ORDER to define the byte order. - c. If you want the code to run in a specific internal byte order, then - ALGORITHM_BYTE_ORDER must be set accordingly. - d. set other configuration options decribed below. -*/ - -#if !defined( _AESOPT_H ) -#define _AESOPT_H - -#include <crypto/aes/aes.h> - -/* CONFIGURATION - USE OF DEFINES - - Later in this section there are a number of defines that control the - operation of the code. In each section, the purpose of each define is - explained so that the relevant form can be included or excluded by - setting either 1's or 0's respectively on the branches of the related - #if clauses. - - PLATFORM SPECIFIC INCLUDES AND BYTE ORDER IN 32-BIT WORDS - - To obtain the highest speed on processors with 32-bit words, this code - needs to determine the byte order of the target machine. The following - block of code is an attempt to capture the most obvious ways in which - various environemnts define byte order. It may well fail, in which case - the definitions will need to be set by editing at the points marked - **** EDIT HERE IF NECESSARY **** below. My thanks go to Peter Gutmann - for his assistance with this endian detection nightmare. -*/ - -#define BRG_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ -#define BRG_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ - -#if defined(__GNUC__) || defined(__GNU_LIBRARY__) -# if defined(__FreeBSD__) || defined(__OpenBSD__) -# include <sys/endian.h> -# elif defined( BSD ) && BSD >= 199103 -# include <machine/endian.h> -# elif defined(__APPLE__) -# if defined(__BIG_ENDIAN__) && !defined( BIG_ENDIAN ) -# define BIG_ENDIAN -# elif defined(__LITTLE_ENDIAN__) && !defined( LITTLE_ENDIAN ) -# define LITTLE_ENDIAN -# endif -# else -# include <endian.h> -# if defined(__BEOS__) -# include <byteswap.h> -# endif -# endif -#endif - -#if !defined(PLATFORM_BYTE_ORDER) -# if defined(LITTLE_ENDIAN) || defined(BIG_ENDIAN) -# if defined(LITTLE_ENDIAN) && !defined(BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN -# elif !defined(LITTLE_ENDIAN) && defined(BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN -# elif defined(BYTE_ORDER) && (BYTE_ORDER == LITTLE_ENDIAN) -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN -# elif defined(BYTE_ORDER) && (BYTE_ORDER == BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN -# endif -# elif defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN) -# if defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN -# elif !defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN -# elif defined(_BYTE_ORDER) && (_BYTE_ORDER == _LITTLE_ENDIAN) -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN -# elif defined(_BYTE_ORDER) && (_BYTE_ORDER == _BIG_ENDIAN) -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN -# endif -# elif defined(__LITTLE_ENDIAN__) || defined(__BIG_ENDIAN__) -# if defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN -# elif !defined(__LITTLE_ENDIAN__) && defined(__BIG_ENDIAN__) -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN -# elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __LITTLE_ENDIAN__) -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN -# elif defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __BIG_ENDIAN__) -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN -# endif -# endif -#endif - -/* if the platform is still unknown, try to find its byte order */ -/* from commonly used machine defines */ - -#if 
!defined(PLATFORM_BYTE_ORDER) - -#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ - defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ - defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ - defined( vax ) || defined( vms ) || defined( VMS ) || \ - defined( __VMS ) -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN - -#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ - defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ - defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ - defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ - defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ - defined( __TANDEM ) || defined( THINK_C ) || defined( __VMCMS__ ) -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN - -#elif 0 /* **** EDIT HERE IF NECESSARY **** */ -# define PLATFORM_BYTE_ORDER BRG_LITTLE_ENDIAN -#elif 0 /* **** EDIT HERE IF NECESSARY **** */ -# define PLATFORM_BYTE_ORDER BRG_BIG_ENDIAN -#else -# error Please edit aesopt.h (line 234 or 236) to set the platform byte order -#endif - -#endif - -/* SOME LOCAL DEFINITIONS */ - -#define NO_TABLES 0 -#define ONE_TABLE 1 -#define FOUR_TABLES 4 -#define NONE 0 -#define PARTIAL 1 -#define FULL 2 - -#if defined(bswap32) -#define aes_sw32 bswap32 -#elif defined(bswap_32) -#define aes_sw32 bswap_32 -#else -#define brot(x,n) (((aes_32t)(x) << n) | ((aes_32t)(x) >> (32 - n))) -#define aes_sw32(x) ((brot((x),8) & 0x00ff00ff) | (brot((x),24) & 0xff00ff00)) -#endif - -/* 1. FUNCTIONS REQUIRED - - This implementation provides subroutines for encryption, decryption - and for setting the three key lengths (separately) for encryption - and decryption. When the assembler code is not being used the following - definition blocks allow the selection of the routines that are to be - included in the compilation. -*/ -#if defined( AES_ENCRYPT ) -#define ENCRYPTION -#define ENCRYPTION_KEY_SCHEDULE -#endif - -#if defined( AES_DECRYPT ) -#define DECRYPTION -#define DECRYPTION_KEY_SCHEDULE -#endif - -/* 2. ASSEMBLER SUPPORT - - This define (which can be on the command line) enables the use of the - assembler code routines for encryption and decryption with the C code - only providing key scheduling -*/ -#if 0 && !defined(AES_ASM) -#define AES_ASM -#endif - -/* 3. BYTE ORDER WITHIN 32 BIT WORDS - - The fundamental data processing units in Rijndael are 8-bit bytes. The - input, output and key input are all enumerated arrays of bytes in which - bytes are numbered starting at zero and increasing to one less than the - number of bytes in the array in question. This enumeration is only used - for naming bytes and does not imply any adjacency or order relationship - from one byte to another. When these inputs and outputs are considered - as bit sequences, bits 8*n to 8*n+7 of the bit sequence are mapped to - byte[n] with bit 8n+i in the sequence mapped to bit 7-i within the byte. - In this implementation bits are numbered from 0 to 7 starting at the - numerically least significant end of each byte (bit n represents 2^n). - - However, Rijndael can be implemented more efficiently using 32-bit - words by packing bytes into words so that bytes 4*n to 4*n+3 are placed - into word[n]. While in principle these bytes can be assembled into words - in any positions, this implementation only supports the two formats in - which bytes in adjacent positions within words also have adjacent byte - numbers. 
This order is called big-endian if the lowest numbered bytes - in words have the highest numeric significance and little-endian if the - opposite applies. - - This code can work in either order irrespective of the order used by the - machine on which it runs. Normally the internal byte order will be set - to the order of the processor on which the code is to be run but this - define can be used to reverse this in special situations - - NOTE: Assembler code versions rely on PLATFORM_BYTE_ORDER being set -*/ -#if 1 || defined(AES_ASM) -#define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER -#elif 0 -#define ALGORITHM_BYTE_ORDER BRG_LITTLE_ENDIAN -#elif 0 -#define ALGORITHM_BYTE_ORDER BRG_BIG_ENDIAN -#else -#error The algorithm byte order is not defined -#endif - -/* 4. FAST INPUT/OUTPUT OPERATIONS. - - On some machines it is possible to improve speed by transferring the - bytes in the input and output arrays to and from the internal 32-bit - variables by addressing these arrays as if they are arrays of 32-bit - words. On some machines this will always be possible but there may - be a large performance penalty if the byte arrays are not aligned on - the normal word boundaries. On other machines this technique will - lead to memory access errors when such 32-bit word accesses are not - properly aligned. The option SAFE_IO avoids such problems but will - often be slower on those machines that support misaligned access - (especially so if care is taken to align the input and output byte - arrays on 32-bit word boundaries). If SAFE_IO is not defined it is - assumed that access to byte arrays as if they are arrays of 32-bit - words will not cause problems when such accesses are misaligned. -*/ -#if 0 && !defined(_MSC_VER) -#define SAFE_IO -#endif - -/* 5. LOOP UNROLLING - - The code for encryption and decryption cycles through a number of rounds - that can be implemented either in a loop or by expanding the code into a - long sequence of instructions, the latter producing a larger program but - one that will often be much faster. The latter is called loop unrolling. - There are also potential speed advantages in expanding two iterations in - a loop with half the number of iterations, which is called partial loop - unrolling. The following options allow partial or full loop unrolling - to be set independently for encryption and decryption -*/ -#if 1 -#define ENC_UNROLL FULL -#elif 0 -#define ENC_UNROLL PARTIAL -#else -#define ENC_UNROLL NONE -#endif - -#if 1 -#define DEC_UNROLL FULL -#elif 0 -#define DEC_UNROLL PARTIAL -#else -#define DEC_UNROLL NONE -#endif - -/* 6. FAST FINITE FIELD OPERATIONS - - If this section is included, tables are used to provide faster finite - field arithmetic (this has no effect if FIXED_TABLES is defined). -*/ -#if 1 -#define FF_TABLES -#endif - -/* 7. INTERNAL STATE VARIABLE FORMAT - - The internal state of Rijndael is stored in a number of local 32-bit - word variables which can be defined either as an array or as individually - named variables. Include this section if you want to store these local - variables in arrays. Otherwise individual local variables will be used. -*/ -#if 0 -#define ARRAYS -#endif - -/* In this implementation the columns of the state array are each held in - 32-bit words. The state array can be held in various ways: in an array - of words, in a number of individual word variables or in a number of - processor registers. The following define maps a variable name x and - a column number c to the way the state array variable is to be held. - The first define below maps the state into an array x[c] whereas the - second form maps the state into a number of individual variables x0, - x1, etc. Another form could map individual state columns to machine - register names. -*/ - -#if defined(ARRAYS) -#define s(x,c) x[c] -#else -#define s(x,c) x##c -#endif
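- -/* For example (an illustrative expansion, not code from this file): with - ARRAYS defined, s(y,3) expands to y[3]; without it, the token-pasting form - expands s(y,3) to the single identifier y3, so the individual variables - y0..y3 must have been declared. */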
- -/* 8. FIXED OR DYNAMIC TABLES - - When this section is included the tables used by the code are compiled - statically into the binary file. Otherwise the subroutine gen_tabs() - must be called to compute them before the code is first used. -*/ -#if 1 -#define FIXED_TABLES -#endif - -/* 9. TABLE ALIGNMENT - - On some systems speed will be improved by aligning the AES large lookup - tables on particular boundaries. This define should be set to a power of - two giving the desired alignment. It can be left undefined if alignment - is not needed. This option is specific to the Microsoft VC++ compiler - - it seems to sometimes cause trouble for the VC++ version 6 compiler. -*/ - -#if 0 && defined(_MSC_VER) && (_MSC_VER >= 1300) -#define TABLE_ALIGN 64 -#endif - -/* 10. INTERNAL TABLE CONFIGURATION - - This cipher proceeds in a number of cycles known as 'rounds', each - implemented by a round function which can optionally be speeded up - using tables. The basic tables are each 256 32-bit words, with either - one or four tables being required for each round function depending on - how much speed is required. The encryption and decryption round functions - are different and the last encryption and decryption round functions are - different again, making four different round functions in all. - - This means that: - 1. Normal encryption and decryption rounds can each use either 0, 1 - or 4 tables and table spaces of 0, 1024 or 4096 bytes each. - 2. The last encryption and decryption rounds can also use either 0, 1 - or 4 tables and table spaces of 0, 1024 or 4096 bytes each. - - Include or exclude the appropriate definitions below to set the number - of tables used by this implementation. -*/ - -#if 1 /* set tables for the normal encryption round */ -#define ENC_ROUND FOUR_TABLES -#elif 0 -#define ENC_ROUND ONE_TABLE -#else -#define ENC_ROUND NO_TABLES -#endif - -#if 1 /* set tables for the last encryption round */ -#define LAST_ENC_ROUND FOUR_TABLES -#elif 0 -#define LAST_ENC_ROUND ONE_TABLE -#else -#define LAST_ENC_ROUND NO_TABLES -#endif - -#if 1 /* set tables for the normal decryption round */ -#define DEC_ROUND FOUR_TABLES -#elif 0 -#define DEC_ROUND ONE_TABLE -#else -#define DEC_ROUND NO_TABLES -#endif - -#if 1 /* set tables for the last decryption round */ -#define LAST_DEC_ROUND FOUR_TABLES -#elif 0 -#define LAST_DEC_ROUND ONE_TABLE -#else -#define LAST_DEC_ROUND NO_TABLES -#endif - -/* The decryption key schedule can be speeded up with tables in the same - way that the round functions can. Include or exclude the following - defines to set this requirement. -*/ -#if 1 -#define KEY_SCHED FOUR_TABLES -#elif 0 -#define KEY_SCHED ONE_TABLE -#else -#define KEY_SCHED NO_TABLES -#endif - -/* 11. TABLE POINTER CACHING - - Normally tables are referenced directly. Enable this option if you wish to - cache pointers to the tables in the encrypt/decrypt code. Note that this - only works if you are using FOUR_TABLES for the ROUND you enable this for. - -*/ -#if 1 -#define ENC_ROUND_CACHE_TABLES -#endif -#if 1 -#define LAST_ENC_ROUND_CACHE_TABLES -#endif -#if 1 -#define DEC_ROUND_CACHE_TABLES -#endif -#if 1 -#define LAST_DEC_ROUND_CACHE_TABLES -#endif - - -/* END OF CONFIGURATION OPTIONS */ - -#define RC_LENGTH (5 * (AES_BLOCK_SIZE / 4 - 2)) - -/* Disable or report errors on some combinations of options */ - -#if ENC_ROUND == NO_TABLES && LAST_ENC_ROUND != NO_TABLES -#undef LAST_ENC_ROUND -#define LAST_ENC_ROUND NO_TABLES -#elif ENC_ROUND == ONE_TABLE && LAST_ENC_ROUND == FOUR_TABLES -#undef LAST_ENC_ROUND -#define LAST_ENC_ROUND ONE_TABLE -#endif - -#if ENC_ROUND == NO_TABLES && ENC_UNROLL != NONE -#undef ENC_UNROLL -#define ENC_UNROLL NONE -#endif - -#if DEC_ROUND == NO_TABLES && LAST_DEC_ROUND != NO_TABLES -#undef LAST_DEC_ROUND -#define LAST_DEC_ROUND NO_TABLES -#elif DEC_ROUND == ONE_TABLE && LAST_DEC_ROUND == FOUR_TABLES -#undef LAST_DEC_ROUND -#define LAST_DEC_ROUND ONE_TABLE -#endif - -#if DEC_ROUND == NO_TABLES && DEC_UNROLL != NONE -#undef DEC_UNROLL -#define DEC_UNROLL NONE -#endif - -/* upr(x,n): rotates bytes within words by n positions, moving bytes to - higher index positions with wrap around into low positions - ups(x,n): moves bytes by n positions to higher index positions in - words but without wrap around - bval(x,n): extracts a byte from a word - - NOTE: The definitions given here are intended only for use with - unsigned variables and with shift counts that are compile - time constants -*/ - -#if (ALGORITHM_BYTE_ORDER == BRG_LITTLE_ENDIAN) -#define upr(x,n) (((aes_32t)(x) << (8 * (n))) | ((aes_32t)(x) >> (32 - 8 * (n)))) -#define ups(x,n) ((aes_32t) (x) << (8 * (n))) -#define bval(x,n) ((aes_08t)((x) >> (8 * (n)))) -#define bytes2word(b0, b1, b2, b3) \ - (((aes_32t)(b3) << 24) | ((aes_32t)(b2) << 16) | ((aes_32t)(b1) << 8) | (b0)) -#endif - -#if (ALGORITHM_BYTE_ORDER == BRG_BIG_ENDIAN) -#define upr(x,n) (((aes_32t)(x) >> (8 * (n))) | ((aes_32t)(x) << (32 - 8 * (n)))) -#define ups(x,n) ((aes_32t) (x) >> (8 * (n))) -#define bval(x,n) ((aes_08t)((x) >> (24 - 8 * (n)))) -#define bytes2word(b0, b1, b2, b3) \ - (((aes_32t)(b0) << 24) | ((aes_32t)(b1) << 16) | ((aes_32t)(b2) << 8) | (b3)) -#endif - -#if defined(SAFE_IO) - -#define word_in(x,c) bytes2word(((aes_08t*)(x)+4*c)[0], ((aes_08t*)(x)+4*c)[1], \ - ((aes_08t*)(x)+4*c)[2], ((aes_08t*)(x)+4*c)[3]) -#define word_out(x,c,v) { ((aes_08t*)(x)+4*c)[0] = bval(v,0); ((aes_08t*)(x)+4*c)[1] = bval(v,1); \ - ((aes_08t*)(x)+4*c)[2] = bval(v,2); ((aes_08t*)(x)+4*c)[3] = bval(v,3); } - -#elif (ALGORITHM_BYTE_ORDER == PLATFORM_BYTE_ORDER) - -#define word_in(x,c) (*((aes_32t*)(x)+(c))) -#define word_out(x,c,v) (*((aes_32t*)(x)+(c)) = (v)) - -#else - -#define word_in(x,c) aes_sw32(*((aes_32t*)(x)+(c))) -#define word_out(x,c,v) (*((aes_32t*)(x)+(c)) = aes_sw32(v)) - -#endif - -/* the finite field modular polynomial and elements */ - -#define WPOLY 0x011b -#define BPOLY 0x1b - -/* multiply four bytes in GF(2^8) by 'x' {02} in parallel */ - -#define m1 0x80808080 -#define m2 0x7f7f7f7f -#define gf_mulx(x) ((((x) & m2) << 1) ^ ((((x) & m1) >> 7) * BPOLY)) - -/* The following defines provide alternative definitions of gf_mulx that might - give improved performance if a fast 32-bit multiply is not available. Note - that a temporary variable u needs to be defined where gf_mulx is used.
- -#define gf_mulx(x) (u = (x) & m1, u |= (u >> 1), ((x) & m2) << 1) ^ ((u >> 3) | (u >> 6)) -#define m4 (0x01010101 * BPOLY) -#define gf_mulx(x) (u = (x) & m1, ((x) & m2) << 1) ^ ((u - (u >> 7)) & m4) -*/ - -/* Work out which tables are needed for the different options */ - -#if defined( AES_ASM ) -#if defined( ENC_ROUND ) -#undef ENC_ROUND -#endif -#define ENC_ROUND FOUR_TABLES -#if defined( LAST_ENC_ROUND ) -#undef LAST_ENC_ROUND -#endif -#define LAST_ENC_ROUND FOUR_TABLES -#if defined( DEC_ROUND ) -#undef DEC_ROUND -#endif -#define DEC_ROUND FOUR_TABLES -#if defined( LAST_DEC_ROUND ) -#undef LAST_DEC_ROUND -#endif -#define LAST_DEC_ROUND FOUR_TABLES -#if defined( KEY_SCHED ) -#undef KEY_SCHED -#define KEY_SCHED FOUR_TABLES -#endif -#endif - -#if defined(ENCRYPTION) || defined(AES_ASM) -#if ENC_ROUND == ONE_TABLE -#define FT1_SET -#elif ENC_ROUND == FOUR_TABLES -#define FT4_SET -#else -#define SBX_SET -#endif -#if LAST_ENC_ROUND == ONE_TABLE -#define FL1_SET -#elif LAST_ENC_ROUND == FOUR_TABLES -#define FL4_SET -#elif !defined(SBX_SET) -#define SBX_SET -#endif -#endif - -#if defined(DECRYPTION) || defined(AES_ASM) -#if DEC_ROUND == ONE_TABLE -#define IT1_SET -#elif DEC_ROUND == FOUR_TABLES -#define IT4_SET -#else -#define ISB_SET -#endif -#if LAST_DEC_ROUND == ONE_TABLE -#define IL1_SET -#elif LAST_DEC_ROUND == FOUR_TABLES -#define IL4_SET -#elif !defined(ISB_SET) -#define ISB_SET -#endif -#endif - -#if defined(ENCRYPTION_KEY_SCHEDULE) || defined(DECRYPTION_KEY_SCHEDULE) -#if KEY_SCHED == ONE_TABLE -#define LS1_SET -#define IM1_SET -#elif KEY_SCHED == FOUR_TABLES -#define LS4_SET -#define IM4_SET -#elif !defined(SBX_SET) -#define SBX_SET -#endif -#endif - -/* generic definitions of Rijndael macros that use tables */ - -#define no_table(x,box,vf,rf,c) bytes2word( \ - box[bval(vf(x,0,c),rf(0,c))], \ - box[bval(vf(x,1,c),rf(1,c))], \ - box[bval(vf(x,2,c),rf(2,c))], \ - box[bval(vf(x,3,c),rf(3,c))]) - -#define one_table(x,op,tab,vf,rf,c) \ - ( tab[bval(vf(x,0,c),rf(0,c))] \ - ^ op(tab[bval(vf(x,1,c),rf(1,c))],1) \ - ^ op(tab[bval(vf(x,2,c),rf(2,c))],2) \ - ^ op(tab[bval(vf(x,3,c),rf(3,c))],3)) - -#define four_tables(x,tab,vf,rf,c) \ - ( tab[0][bval(vf(x,0,c),rf(0,c))] \ - ^ tab[1][bval(vf(x,1,c),rf(1,c))] \ - ^ tab[2][bval(vf(x,2,c),rf(2,c))] \ - ^ tab[3][bval(vf(x,3,c),rf(3,c))]) - -#define four_cached_tables(x,tab,vf,rf,c) \ -( tab##0[bval(vf(x,0,c),rf(0,c))] \ - ^ tab##1[bval(vf(x,1,c),rf(1,c))] \ - ^ tab##2[bval(vf(x,2,c),rf(2,c))] \ - ^ tab##3[bval(vf(x,3,c),rf(3,c))]) - -#define vf1(x,r,c) (x) -#define rf1(r,c) (r) -#define rf2(r,c) ((8+r-c)&3) - -/* perform forward and inverse column mix operation on four bytes in long word x in */ -/* parallel. NOTE: x must be a simple variable, NOT an expression in these macros. 
*/ - -#if defined(FM4_SET) /* not currently used */ -#define fwd_mcol(x) four_tables(x,t_use(f,m),vf1,rf1,0) -#elif defined(FM1_SET) /* not currently used */ -#define fwd_mcol(x) one_table(x,upr,t_use(f,m),vf1,rf1,0) -#else -#define dec_fmvars aes_32t g2 -#define fwd_mcol(x) (g2 = gf_mulx(x), g2 ^ upr((x) ^ g2, 3) ^ upr((x), 2) ^ upr((x), 1)) -#endif - -#if defined(IM4_SET) -#define inv_mcol(x) four_tables(x,t_use(i,m),vf1,rf1,0) -#elif defined(IM1_SET) -#define inv_mcol(x) one_table(x,upr,t_use(i,m),vf1,rf1,0) -#else -#define dec_imvars aes_32t g2, g4, g9 -#define inv_mcol(x) (g2 = gf_mulx(x), g4 = gf_mulx(g2), g9 = (x) ^ gf_mulx(g4), g4 ^= g9, \ - (x) ^ g2 ^ g4 ^ upr(g2 ^ g9, 3) ^ upr(g4, 2) ^ upr(g9, 1)) -#endif - -#if defined(FL4_SET) -#define ls_box(x,c) four_tables(x,t_use(f,l),vf1,rf2,c) -#elif defined(LS4_SET) -#define ls_box(x,c) four_tables(x,t_use(l,s),vf1,rf2,c) -#elif defined(FL1_SET) -#define ls_box(x,c) one_table(x,upr,t_use(f,l),vf1,rf2,c) -#elif defined(LS1_SET) -#define ls_box(x,c) one_table(x,upr,t_use(l,s),vf1,rf2,c) -#else -#define ls_box(x,c) no_table(x,t_use(s,box),vf1,rf2,c) -#endif - -#endif diff --git a/bsd/crypto/aes/ppc/aestab.c b/bsd/crypto/aes/ppc/aestab.c deleted file mode 100644 index dfd2ee969..000000000 --- a/bsd/crypto/aes/ppc/aestab.c +++ /dev/null @@ -1,384 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. 
- --------------------------------------------------------------------------- - Issue 28/01/2004 - -*/ - -#if defined(__cplusplus) -extern "C" -{ -#endif - -#define DO_TABLES - -#include "aesopt.h" - -#if defined(FIXED_TABLES) - -#define sb_data(w) {\ - w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\ - w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\ - w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\ - w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\ - w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\ - w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\ - w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\ - w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\ - w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\ - w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\ - w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\ - w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\ - w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\ - w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\ - w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\ - w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\ - w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\ - w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\ - w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\ - w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\ - w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\ - w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\ - w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\ - w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\ - w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\ - w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\ - w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\ - w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\ - w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\ - w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\ - w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\ - w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) } - -#define isb_data(w) {\ - w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), w(0x38),\ - w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), w(0xd7), w(0xfb),\ - w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), w(0x2f), w(0xff), w(0x87),\ - w(0x34), w(0x8e), w(0x43), w(0x44), w(0xc4), w(0xde), w(0xe9), w(0xcb),\ - w(0x54), w(0x7b), w(0x94), w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d),\ - w(0xee), w(0x4c), w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e),\ - w(0x08), w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2),\ - w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), w(0x25),\ - w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), w(0x98), w(0x16),\ - w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), w(0x65), w(0xb6), w(0x92),\ - w(0x6c), w(0x70), w(0x48), w(0x50), w(0xfd), w(0xed), w(0xb9), w(0xda),\ - w(0x5e), w(0x15), w(0x46), w(0x57), w(0xa7), 
w(0x8d), w(0x9d), w(0x84),\ - w(0x90), w(0xd8), w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a),\ - w(0xf7), w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06),\ - w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), w(0x02),\ - w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), w(0x8a), w(0x6b),\ - w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), w(0x67), w(0xdc), w(0xea),\ - w(0x97), w(0xf2), w(0xcf), w(0xce), w(0xf0), w(0xb4), w(0xe6), w(0x73),\ - w(0x96), w(0xac), w(0x74), w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85),\ - w(0xe2), w(0xf9), w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e),\ - w(0x47), w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89),\ - w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), w(0x1b),\ - w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), w(0x79), w(0x20),\ - w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), w(0xcd), w(0x5a), w(0xf4),\ - w(0x1f), w(0xdd), w(0xa8), w(0x33), w(0x88), w(0x07), w(0xc7), w(0x31),\ - w(0xb1), w(0x12), w(0x10), w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f),\ - w(0x60), w(0x51), w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d),\ - w(0x2d), w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef),\ - w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), w(0xb0),\ - w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), w(0x99), w(0x61),\ - w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), w(0x77), w(0xd6), w(0x26),\ - w(0xe1), w(0x69), w(0x14), w(0x63), w(0x55), w(0x21), w(0x0c), w(0x7d) } - -#define mm_data(w) {\ - w(0x00), w(0x01), w(0x02), w(0x03), w(0x04), w(0x05), w(0x06), w(0x07),\ - w(0x08), w(0x09), w(0x0a), w(0x0b), w(0x0c), w(0x0d), w(0x0e), w(0x0f),\ - w(0x10), w(0x11), w(0x12), w(0x13), w(0x14), w(0x15), w(0x16), w(0x17),\ - w(0x18), w(0x19), w(0x1a), w(0x1b), w(0x1c), w(0x1d), w(0x1e), w(0x1f),\ - w(0x20), w(0x21), w(0x22), w(0x23), w(0x24), w(0x25), w(0x26), w(0x27),\ - w(0x28), w(0x29), w(0x2a), w(0x2b), w(0x2c), w(0x2d), w(0x2e), w(0x2f),\ - w(0x30), w(0x31), w(0x32), w(0x33), w(0x34), w(0x35), w(0x36), w(0x37),\ - w(0x38), w(0x39), w(0x3a), w(0x3b), w(0x3c), w(0x3d), w(0x3e), w(0x3f),\ - w(0x40), w(0x41), w(0x42), w(0x43), w(0x44), w(0x45), w(0x46), w(0x47),\ - w(0x48), w(0x49), w(0x4a), w(0x4b), w(0x4c), w(0x4d), w(0x4e), w(0x4f),\ - w(0x50), w(0x51), w(0x52), w(0x53), w(0x54), w(0x55), w(0x56), w(0x57),\ - w(0x58), w(0x59), w(0x5a), w(0x5b), w(0x5c), w(0x5d), w(0x5e), w(0x5f),\ - w(0x60), w(0x61), w(0x62), w(0x63), w(0x64), w(0x65), w(0x66), w(0x67),\ - w(0x68), w(0x69), w(0x6a), w(0x6b), w(0x6c), w(0x6d), w(0x6e), w(0x6f),\ - w(0x70), w(0x71), w(0x72), w(0x73), w(0x74), w(0x75), w(0x76), w(0x77),\ - w(0x78), w(0x79), w(0x7a), w(0x7b), w(0x7c), w(0x7d), w(0x7e), w(0x7f),\ - w(0x80), w(0x81), w(0x82), w(0x83), w(0x84), w(0x85), w(0x86), w(0x87),\ - w(0x88), w(0x89), w(0x8a), w(0x8b), w(0x8c), w(0x8d), w(0x8e), w(0x8f),\ - w(0x90), w(0x91), w(0x92), w(0x93), w(0x94), w(0x95), w(0x96), w(0x97),\ - w(0x98), w(0x99), w(0x9a), w(0x9b), w(0x9c), w(0x9d), w(0x9e), w(0x9f),\ - w(0xa0), w(0xa1), w(0xa2), w(0xa3), w(0xa4), w(0xa5), w(0xa6), w(0xa7),\ - w(0xa8), w(0xa9), w(0xaa), w(0xab), w(0xac), w(0xad), w(0xae), w(0xaf),\ - w(0xb0), w(0xb1), w(0xb2), w(0xb3), w(0xb4), w(0xb5), w(0xb6), w(0xb7),\ - w(0xb8), w(0xb9), w(0xba), w(0xbb), w(0xbc), w(0xbd), w(0xbe), w(0xbf),\ - w(0xc0), w(0xc1), w(0xc2), w(0xc3), w(0xc4), w(0xc5), w(0xc6), w(0xc7),\ - w(0xc8), w(0xc9), w(0xca), w(0xcb), w(0xcc), w(0xcd), w(0xce), w(0xcf),\ - w(0xd0), w(0xd1), w(0xd2), w(0xd3), w(0xd4), 
w(0xd5), w(0xd6), w(0xd7),\ - w(0xd8), w(0xd9), w(0xda), w(0xdb), w(0xdc), w(0xdd), w(0xde), w(0xdf),\ - w(0xe0), w(0xe1), w(0xe2), w(0xe3), w(0xe4), w(0xe5), w(0xe6), w(0xe7),\ - w(0xe8), w(0xe9), w(0xea), w(0xeb), w(0xec), w(0xed), w(0xee), w(0xef),\ - w(0xf0), w(0xf1), w(0xf2), w(0xf3), w(0xf4), w(0xf5), w(0xf6), w(0xf7),\ - w(0xf8), w(0xf9), w(0xfa), w(0xfb), w(0xfc), w(0xfd), w(0xfe), w(0xff) } - -#define rc_data(w) {\ - w(0x01), w(0x02), w(0x04), w(0x08), w(0x10),w(0x20), w(0x40), w(0x80),\ - w(0x1b), w(0x36) } - -#define h0(x) (x) - -#define w0(p) bytes2word(p, 0, 0, 0) -#define w1(p) bytes2word(0, p, 0, 0) -#define w2(p) bytes2word(0, 0, p, 0) -#define w3(p) bytes2word(0, 0, 0, p) - -#define u0(p) bytes2word(f2(p), p, p, f3(p)) -#define u1(p) bytes2word(f3(p), f2(p), p, p) -#define u2(p) bytes2word(p, f3(p), f2(p), p) -#define u3(p) bytes2word(p, p, f3(p), f2(p)) - -#define v0(p) bytes2word(fe(p), f9(p), fd(p), fb(p)) -#define v1(p) bytes2word(fb(p), fe(p), f9(p), fd(p)) -#define v2(p) bytes2word(fd(p), fb(p), fe(p), f9(p)) -#define v3(p) bytes2word(f9(p), fd(p), fb(p), fe(p)) - -#endif - -#if defined(FIXED_TABLES) || !defined(FF_TABLES) - -#define f2(x) ((x<<1) ^ (((x>>7) & 1) * WPOLY)) -#define f4(x) ((x<<2) ^ (((x>>6) & 1) * WPOLY) ^ (((x>>6) & 2) * WPOLY)) -#define f8(x) ((x<<3) ^ (((x>>5) & 1) * WPOLY) ^ (((x>>5) & 2) * WPOLY) \ - ^ (((x>>5) & 4) * WPOLY)) -#define f3(x) (f2(x) ^ x) -#define f9(x) (f8(x) ^ x) -#define fb(x) (f8(x) ^ f2(x) ^ x) -#define fd(x) (f8(x) ^ f4(x) ^ x) -#define fe(x) (f8(x) ^ f4(x) ^ f2(x)) - -#else - -#define f2(x) ((x) ? pow[log[x] + 0x19] : 0) -#define f3(x) ((x) ? pow[log[x] + 0x01] : 0) -#define f9(x) ((x) ? pow[log[x] + 0xc7] : 0) -#define fb(x) ((x) ? pow[log[x] + 0x68] : 0) -#define fd(x) ((x) ? pow[log[x] + 0xee] : 0) -#define fe(x) ((x) ? pow[log[x] + 0xdf] : 0) -#define fi(x) ((x) ? pow[ 255 - log[x]] : 0) - -#endif - -#include "aestab.h" - -#if defined(FIXED_TABLES) - -/* implemented in case of wrong call for fixed tables */ - -void gen_tabs(void) -{ -} - -#else /* dynamic table generation */ - -#if !defined(FF_TABLES) - -/* Generate the tables for the dynamic table option - - It will generally be sensible to use tables to compute finite - field multiplies and inverses but where memory is scarce this - code might sometimes be better. But it only has effect during - initialisation, so it's pretty unimportant in overall terms. -*/ - -/* return 2 ^ (n - 1) where n is the bit number of the highest bit - set in x with x in the range 1 < x < 0x00000200. This form is
This form is - used so that locals within fi can be bytes rather than words -*/ - -static aes_08t hibit(const aes_32t x) -{ aes_08t r = (aes_08t)((x >> 1) | (x >> 2)); - - r |= (r >> 2); - r |= (r >> 4); - return (r + 1) >> 1; -} - -/* return the inverse of the finite field element x */ - -static aes_08t fi(const aes_08t x) -{ aes_08t p1 = x, p2 = BPOLY, n1 = hibit(x), n2 = 0x80, v1 = 1, v2 = 0; - - if(x < 2) return x; - - for(;;) - { - if(!n1) return v1; - - while(n2 >= n1) - { - n2 /= n1; p2 ^= p1 * n2; v2 ^= v1 * n2; n2 = hibit(p2); - } - - if(!n2) return v2; - - while(n1 >= n2) - { - n1 /= n2; p1 ^= p2 * n1; v1 ^= v2 * n1; n1 = hibit(p1); - } - } -} - -#endif - -/* The forward and inverse affine transformations used in the S-box */ - -#define fwd_affine(x) \ - (w = (aes_32t)x, w ^= (w<<1)^(w<<2)^(w<<3)^(w<<4), 0x63^(aes_08t)(w^(w>>8))) - -#define inv_affine(x) \ - (w = (aes_32t)x, w = (w<<1)^(w<<3)^(w<<6), 0x05^(aes_08t)(w^(w>>8))) - -static int init = 0; - -void gen_tabs(void) -{ aes_32t i, w; - -#if defined(FF_TABLES) - - aes_08t pow[512], log[256]; - - if(init) return; - /* log and power tables for GF(2^8) finite field with - WPOLY as modular polynomial - the simplest primitive - root is 0x03, used here to generate the tables - */ - - i = 0; w = 1; - do - { - pow[i] = (aes_08t)w; - pow[i + 255] = (aes_08t)w; - log[w] = (aes_08t)i++; - w ^= (w << 1) ^ (w & 0x80 ? WPOLY : 0); - } - while (w != 1); - -#else - if(init) return; -#endif - - for(i = 0, w = 1; i < RC_LENGTH; ++i) - { - t_set(r,c)[i] = bytes2word(w, 0, 0, 0); - w = f2(w); - } - - for(i = 0; i < 256; ++i) - { aes_08t b; - - b = fwd_affine(fi((aes_08t)i)); - w = bytes2word(f2(b), b, b, f3(b)); - -#if defined( SBX_SET ) - t_set(s,box)[i] = b; -#endif - -#if defined( FT1_SET ) /* tables for a normal encryption round */ - t_set(f,n)[i] = w; -#endif -#if defined( FT4_SET ) - t_set(f,n)[0][i] = w; - t_set(f,n)[1][i] = upr(w,1); - t_set(f,n)[2][i] = upr(w,2); - t_set(f,n)[3][i] = upr(w,3); -#endif - w = bytes2word(b, 0, 0, 0); - -#if defined( FL1_SET ) /* tables for last encryption round (may also */ - t_set(f,l)[i] = w; /* be used in the key schedule) */ -#endif -#if defined( FL4_SET ) - t_set(f,l)[0][i] = w; - t_set(f,l)[1][i] = upr(w,1); - t_set(f,l)[2][i] = upr(w,2); - t_set(f,l)[3][i] = upr(w,3); -#endif - -#if defined( LS1_SET ) /* table for key schedule if t_set(f,l) above is */ - t_set(l,s)[i] = w; /* not of the required form */ -#endif -#if defined( LS4_SET ) - t_set(l,s)[0][i] = w; - t_set(l,s)[1][i] = upr(w,1); - t_set(l,s)[2][i] = upr(w,2); - t_set(l,s)[3][i] = upr(w,3); -#endif - - b = fi(inv_affine((aes_08t)i)); - w = bytes2word(fe(b), f9(b), fd(b), fb(b)); - -#if defined( IM1_SET ) /* tables for the inverse mix column operation */ - t_set(i,m)[b] = w; -#endif -#if defined( IM4_SET ) - t_set(i,m)[0][b] = w; - t_set(i,m)[1][b] = upr(w,1); - t_set(i,m)[2][b] = upr(w,2); - t_set(i,m)[3][b] = upr(w,3); -#endif - -#if defined( ISB_SET ) - t_set(i,box)[i] = b; -#endif -#if defined( IT1_SET ) /* tables for a normal decryption round */ - t_set(i,n)[i] = w; -#endif -#if defined( IT4_SET ) - t_set(i,n)[0][i] = w; - t_set(i,n)[1][i] = upr(w,1); - t_set(i,n)[2][i] = upr(w,2); - t_set(i,n)[3][i] = upr(w,3); -#endif - w = bytes2word(b, 0, 0, 0); -#if defined( IL1_SET ) /* tables for last decryption round */ - t_set(i,l)[i] = w; -#endif -#if defined( IL4_SET ) - t_set(i,l)[0][i] = w; - t_set(i,l)[1][i] = upr(w,1); - t_set(i,l)[2][i] = upr(w,2); - t_set(i,l)[3][i] = upr(w,3); -#endif - } - init = 1; -} - -#endif - -#if 
defined(__cplusplus) -} -#endif - diff --git a/bsd/crypto/aes/ppc/aestab.h b/bsd/crypto/aes/ppc/aestab.h deleted file mode 100644 index 004ef9e74..000000000 --- a/bsd/crypto/aes/ppc/aestab.h +++ /dev/null @@ -1,175 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The free distribution and use of this software in both source and binary - form is allowed (with or without changes) provided that: - - 1. distributions of this source code include the above copyright - notice, this list of conditions and the following disclaimer; - - 2. distributions in binary form include the above copyright - notice, this list of conditions and the following disclaimer - in the documentation and/or other associated materials; - - 3. the copyright holder's name is not used to endorse products - built using this software without specific written permission. - - ALTERNATIVELY, provided that this notice is retained in full, this product - may be distributed under the terms of the GNU General Public License (GPL), - in which case the provisions of the GPL apply INSTEAD OF those given above. - - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue 28/01/2004 - - This file contains the code for declaring the tables needed to implement - AES. The file aesopt.h is assumed to be included before this header file. - If there are no global variables, the definitions here can be used to put - the AES tables in a structure so that a pointer can then be added to the - AES context to pass them to the AES routines that need them. If this - facility is used, the calling program has to ensure that this pointer is - managed appropriately. In particular, the value of the t_dec(in,it) item - in the table structure must be set to zero in order to ensure that the - tables are initialised. In practice the three code sequences in aeskey.c - that control the calls to gen_tabs() and the gen_tabs() routine itself will - have to be changed for a specific implementation. If global variables are - available it will generally be preferable to use them with the precomputed - FIXED_TABLES option that uses static global tables. 
- - The following defines can be used to control the way the tables - are defined, initialised and used in embedded environments that - require special features for these purposes - - the 't_dec' construction is used to declare fixed table arrays - the 't_set' construction is used to set fixed table values - the 't_use' construction is used to access fixed table values - - 256 byte tables: - - t_xxx(s,box) => forward S box - t_xxx(i,box) => inverse S box - - 256 32-bit word OR 4 x 256 32-bit word tables: - - t_xxx(f,n) => forward normal round - t_xxx(f,l) => forward last round - t_xxx(i,n) => inverse normal round - t_xxx(i,l) => inverse last round - t_xxx(l,s) => key schedule table - t_xxx(i,m) => key schedule table - - Other variables and tables: - - t_xxx(r,c) => the rcon table -*/ - -#if !defined( _AESTAB_H ) -#define _AESTAB_H - -#define t_dec(m,n) t_##m##n -#define t_set(m,n) t_##m##n -#define t_use(m,n) t_##m##n - -#if defined(FIXED_TABLES) -#define Const const -#else -#define Const -#endif - -#if defined(DO_TABLES) -#define Extern -#else -#define Extern extern -#endif - -#if defined(_MSC_VER) && defined(TABLE_ALIGN) -#define Align __declspec(align(TABLE_ALIGN)) -#else -#define Align -#endif - -#if defined(__cplusplus) -extern "C" -{ -#endif - -#if defined(DO_TABLES) && defined(FIXED_TABLES) -#define d_1(t,n,b,e) Align Const t n[256] = b(e) -#define d_4(t,n,b,e,f,g,h) Align Const t n[4][256] = { b(e), b(f), b(g), b(h) } -Extern Align Const aes_32t t_dec(r,c)[RC_LENGTH] = rc_data(w0); -#else -#define d_1(t,n,b,e) Extern Align Const t n[256] -#define d_4(t,n,b,e,f,g,h) Extern Align Const t n[4][256] -Extern Align Const aes_32t t_dec(r,c)[RC_LENGTH]; -#endif - -#if defined( SBX_SET ) - d_1(aes_08t, t_dec(s,box), sb_data, h0); -#endif -#if defined( ISB_SET ) - d_1(aes_08t, t_dec(i,box), isb_data, h0); -#endif - -#if defined( FT1_SET ) - d_1(aes_32t, t_dec(f,n), sb_data, u0); -#endif -#if defined( FT4_SET ) - d_4(aes_32t, t_dec(f,n), sb_data, u0, u1, u2, u3); -#endif - -#if defined( FL1_SET ) - d_1(aes_32t, t_dec(f,l), sb_data, w0); -#endif -#if defined( FL4_SET ) - d_4(aes_32t, t_dec(f,l), sb_data, w0, w1, w2, w3); -#endif - -#if defined( IT1_SET ) - d_1(aes_32t, t_dec(i,n), isb_data, v0); -#endif -#if defined( IT4_SET ) - d_4(aes_32t, t_dec(i,n), isb_data, v0, v1, v2, v3); -#endif - -#if defined( IL1_SET ) - d_1(aes_32t, t_dec(i,l), isb_data, w0); -#endif -#if defined( IL4_SET ) - d_4(aes_32t, t_dec(i,l), isb_data, w0, w1, w2, w3); -#endif - -#if defined( LS1_SET ) -#if defined( FL1_SET ) -#undef LS1_SET -#else - d_1(aes_32t, t_dec(l,s), sb_data, w0); -#endif -#endif - -#if defined( LS4_SET ) -#if defined( FL4_SET ) -#undef LS4_SET -#else - d_4(aes_32t, t_dec(l,s), sb_data, w0, w1, w2, w3); -#endif -#endif - -#if defined( IM1_SET ) - d_1(aes_32t, t_dec(i,m), mm_data, v0); -#endif -#if defined( IM4_SET ) - d_4(aes_32t, t_dec(i,m), mm_data, v0, v1, v2, v3); -#endif - -#if defined(__cplusplus) -} -#endif - -#endif diff --git a/bsd/crypto/aes/test/ReadMe.txt b/bsd/crypto/aes/test/ReadMe.txt new file mode 100644 index 000000000..1329e84be --- /dev/null +++ b/bsd/crypto/aes/test/ReadMe.txt @@ -0,0 +1,97 @@ +This directory contains a source file and shell scripts + + tstaes.c + makegenarm.sh + makegenx86.sh + makeoptx86.sh + +that can be used to build executables. These executables are used to validate the implementation +and to benchmark the performance of the aes functions in the kernel. This directory also serves +as a development environment for porting the aes functions to new architectures.
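+ +At the heart of each test executable is the following round trip, shown here as a simplified sketch (tstaes.c below is the real code, with timing and random test data added; nblocks stands for the per-call block count): + + aes_encrypt_key128(key, &encrypt_ctx); + aes_decrypt_key128(key, &decrypt_ctx); + aes_encrypt_cbc(plain, iv, nblocks, cipher, &encrypt_ctx); + aes_decrypt_cbc(cipher, iv, nblocks, decrypt, &decrypt_ctx); + +where decrypt must afterwards match plain byte for byte, else the test aborts.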
+ +On xnu-1699.20.6 (the release on which this work is based), the generic aes source code sits at bsd/crypto/aes/gen. The x86_64 +and i386 architectural optimizations are given in bsd/crypto/aes/i386. + +After making some code corrections (to aes.h and most of the assembly code in i386), you can now build a test executable +that is functionally equivalent to the aes code in the kernel. + +To generate a test executable for the aes in the x86_64/i386 kernel, + + $ makeoptx86.sh + +This will build a test executable tstaesoptx86 (x86_64/i386). The executable automatically detects the +CPU clock rate. You specify the number of iterations and the number of 16-byte blocks for simulation. +The executable generates random test data, calls aes_encrypt_cbc to encrypt the plain data +into cipher data, and then calls aes_decrypt_cbc to decrypt the cipher data. Afterwards, it compares +the decrypted data against the plain data. Should there be a mismatch, the code reports the error and exits. +Otherwise, it measures the time the system spends in the two functions under test and prints out +the performance profiling data. + +On K5, + +$ tstaesoptx86 1000 2560 +device max CPU clock rate = 2659.00 MHz +40960 bytes per cbc call + aes_encrypt_cbc : time elapsed = 220.24 usecs, 177.37 MBytes/sec, 14.30 cycles/byte + best iteration : time elapsed = 218.30 usecs, 178.94 MBytes/sec, 14.17 cycles/byte + worst iteration : time elapsed = 286.14 usecs, 136.51 MBytes/sec, 18.58 cycles/byte + + aes_decrypt_cbc : time elapsed = 199.85 usecs, 195.46 MBytes/sec, 12.97 cycles/byte + best iteration : time elapsed = 198.17 usecs, 197.12 MBytes/sec, 12.86 cycles/byte + worst iteration : time elapsed = 228.12 usecs, 171.23 MBytes/sec, 14.81 cycles/byte + +On K5B (with aesni), + +$ tstaesoptx86 1000 256 +device max CPU clock rate = 2400.00 MHz +4096 bytes per cbc call + aes_encrypt_cbc : time elapsed = 6.69 usecs, 583.67 MBytes/sec, 3.92 cycles/byte + best iteration : time elapsed = 6.38 usecs, 612.46 MBytes/sec, 3.74 cycles/byte + worst iteration : time elapsed = 9.72 usecs, 401.96 MBytes/sec, 5.69 cycles/byte + + aes_decrypt_cbc : time elapsed = 2.05 usecs, 1902.65 MBytes/sec, 1.20 cycles/byte + best iteration : time elapsed = 1.96 usecs, 1997.06 MBytes/sec, 1.15 cycles/byte + worst iteration : time elapsed = 4.60 usecs, 849.00 MBytes/sec, 2.70 cycles/byte + +You can also build a test executable using the generic source code for the i386/x86_64 architecture. + + $ makegenx86.sh + +When run on K5, + +$ tstaesgenx86 1000 2560 +device max CPU clock rate = 2659.00 MHz +40960 bytes per cbc call + aes_encrypt_cbc : time elapsed = 278.05 usecs, 140.49 MBytes/sec, 18.05 cycles/byte + best iteration : time elapsed = 274.63 usecs, 142.24 MBytes/sec, 17.83 cycles/byte + worst iteration : time elapsed = 309.70 usecs, 126.13 MBytes/sec, 20.10 cycles/byte + + aes_decrypt_cbc : time elapsed = 265.43 usecs, 147.17 MBytes/sec, 17.23 cycles/byte + best iteration : time elapsed = 262.20 usecs, 148.98 MBytes/sec, 17.02 cycles/byte + worst iteration : time elapsed = 296.19 usecs, 131.88 MBytes/sec, 19.23 cycles/byte + +Comparing best iterations, we can see the optimized AES implementation in the x86_64 kernel improves from 17.83/17.02 +down to 14.17/12.86 cycles/byte for aes_encrypt_cbc and aes_decrypt_cbc, respectively. + + + --------- iOS --------- + +Similarly, you can build a test executable for the aes in the armv7 kernel (which uses the generic source code): + + $ makegenarm.sh + +Note that you need the iOS SDK installed.
We can then copy this executable to iOS devices for simulation. + +On N88, + +iPhone:~ root# ./tstaesgenarm 1000 2560 +device max CPU clock rate = 600.00 MHz +40960 bytes per cbc call + aes_encrypt_cbc : time elapsed = 2890.18 usecs, 13.52 MBytes/sec, 42.34 cycles/byte + best iteration : time elapsed = 2692.00 usecs, 14.51 MBytes/sec, 39.43 cycles/byte + worst iteration : time elapsed = 18248.33 usecs, 2.14 MBytes/sec, 267.31 cycles/byte + + aes_decrypt_cbc : time elapsed = 3078.20 usecs, 12.69 MBytes/sec, 45.09 cycles/byte + best iteration : time elapsed = 2873.33 usecs, 13.59 MBytes/sec, 42.09 cycles/byte + worst iteration : time elapsed = 9664.79 usecs, 4.04 MBytes/sec, 141.57 cycles/byte + diff --git a/bsd/crypto/aes/test/makegenx86.sh b/bsd/crypto/aes/test/makegenx86.sh new file mode 100755 index 000000000..ea4de6f63 --- /dev/null +++ b/bsd/crypto/aes/test/makegenx86.sh @@ -0,0 +1,8 @@ +#!/bin/ksh + +cc -Os -c -arch i386 -arch x86_64 -I ../../../ ../gen/aescrypt.c -o aescrypt.o +cc -Os -c -arch i386 -arch x86_64 -I ../../../ ../gen/aeskey.c -o aeskey.o +cc -Os -c -arch i386 -arch x86_64 -I ../../../ ../gen/aestab.c -o aestab.o + +cc -arch i386 -arch x86_64 -Os tstaes.c aescrypt.o aeskey.o aestab.o -o tstaesgenx86 +rm -fr aescrypt.o aeskey.o aestab.o diff --git a/bsd/crypto/aes/test/makeoptx86.sh b/bsd/crypto/aes/test/makeoptx86.sh new file mode 100755 index 000000000..3732e037f --- /dev/null +++ b/bsd/crypto/aes/test/makeoptx86.sh @@ -0,0 +1,10 @@ +#!/bin/ksh + +cc -c -Os -arch i386 -arch x86_64 ../i386/AES.s -o AES.o +cc -c -Os -arch i386 -arch x86_64 ../i386/aes_crypt_hw.s -o aes_crypt_hw.o +cc -c -Os -arch i386 -arch x86_64 ../i386/aes_key_hw.s -o aes_key_hw.o +cc -c -Os -arch i386 -arch x86_64 ../i386/aes_modes_asm.s -o aes_modes_asm.o +cc -c -Os -arch i386 -arch x86_64 ../i386/aes_modes_hw.s -o aes_modes_hw.o + +cc -Os -arch i386 -arch x86_64 tstaes.c AES.o aes_crypt_hw.o aes_key_hw.o aes_modes_asm.o aes_modes_hw.o -o tstaesoptx86 +rm -fr AES.o aes_crypt_hw.o aes_key_hw.o aes_modes_asm.o aes_modes_hw.o diff --git a/bsd/crypto/aes/test/tstaes.c b/bsd/crypto/aes/test/tstaes.c new file mode 100644 index 000000000..cbe364ed7 --- /dev/null +++ b/bsd/crypto/aes/test/tstaes.c @@ -0,0 +1,131 @@ + +#include <stdio.h> +#include <stdlib.h> +#include "../aes.h" +#include <mach/mach_time.h> +#include <sys/sysctl.h> + + +aes_encrypt_ctx encrypt_ctx; +aes_decrypt_ctx decrypt_ctx; + +size_t getFreq() +{ + int mib[2]; + size_t cpufreq, len; + mib[0] = CTL_HW; + mib[1] = HW_CPU_FREQ; + len = sizeof(cpufreq); + + sysctl(mib, 2, &cpufreq, &len, NULL, 0); + + return cpufreq; +} + + +uint32_t cpu_freq; + +main(int argc, char **argv) +{ + + char *plain; + char *cipher; + char *decrypt; + +uint32_t ITERATIONS; +uint32_t NUM_BLOCKS; +uint32_t data_size; + + char key[32]; + char iv[16]; + int checksum=0; + int i, j, iterations; + uint64_t t0, t1, t2, sum=0, max_time=0, min_time=-1, sum1=0, max_time1=0, min_time1=-1; + float time, time_max, time_min, time1, time_max1, time_min1; + + cpu_freq = getFreq(); + + if (cpu_freq == 0) { + fprintf(stderr, "this appears to be an N90 device, where cpu_freq can not be detected. 
set to 800MHz.\n"); + cpu_freq = 800000000; + } else { + fprintf(stderr, "device max CPU clock rate = %.2f MHz\n", cpu_freq/1.e6); + } + + mach_timebase_info_data_t info; + kern_return_t err = mach_timebase_info( &info ); + + if (argc!=3) { + fprintf(stderr, "usage : %s iterations num_16bytes_block\n", argv[0]); + exit(1); + } + ITERATIONS = atoi(argv[1]); + NUM_BLOCKS = atoi(argv[2]); + data_size = 16*NUM_BLOCKS; + + plain = malloc(data_size); + cipher = malloc(data_size); + decrypt = malloc(data_size); + + if ((plain==NULL) || (cipher==NULL) || (decrypt==NULL)) { + fprintf(stderr,"malloc error.\n"); + exit(1); + } + + for (i=0;i<data_size;i++) plain[i] = random(); + for (i=0;i<32;i++) key[i] = random(); + for (i=0;i<16;i++) iv[i] = random(); + + aes_encrypt_key128(key, &encrypt_ctx); + aes_decrypt_key128(key, &decrypt_ctx); + + for (iterations=0;iterations<ITERATIONS;iterations++) { + t0 = mach_absolute_time(); + + // encrypt + aes_encrypt_cbc(plain, iv, NUM_BLOCKS, cipher, &encrypt_ctx); + + t1 = mach_absolute_time(); + + // decrypt + aes_decrypt_cbc(cipher, iv, NUM_BLOCKS, decrypt, &decrypt_ctx); + + t2 = mach_absolute_time(); + + for (i=0;i<(16*NUM_BLOCKS);i++) if (plain[i]!=decrypt[i]) { + fprintf(stderr,"error : decrypt != plain. i = %d\n", i); + exit(1); + } + sum += (t1-t0); + sum1 += (t2-t1); + t2-=t1; + t1-=t0; + if (t1>max_time) max_time = t1; + if (t1<min_time) min_time = t1; + if (t2>max_time1) max_time1 = t2; + if (t2<min_time1) min_time1 = t2; + } + + time = sum * 1e-9* ((double) info.numer)/((double) info.denom); + time_max = max_time * 1e-9* ((double) info.numer)/((double) info.denom); + time_min = min_time * 1e-9* ((double) info.numer)/((double) info.denom); + + time1 = sum1 * 1e-9* ((double) info.numer)/((double) info.denom); + time_max1 = max_time1 * 1e-9* ((double) info.numer)/((double) info.denom); + time_min1 = min_time1 * 1e-9* ((double) info.numer)/((double) info.denom); + + printf("%d bytes per cbc call\n", data_size); + printf(" aes_encrypt_cbc : time elapsed = %8.2f usecs, %7.2f MBytes/sec, %8.2f cycles/byte\n", 1.e6*time/ITERATIONS,data_size*ITERATIONS/1024./1024./time, time*1.*cpu_freq/ITERATIONS/data_size); + printf(" best iteration : time elapsed = %8.2f usecs, %7.2f MBytes/sec, %8.2f cycles/byte\n", 1.e6*time_min,data_size/1024./1024./time_min, time_min*1.*cpu_freq/data_size); + printf(" worst iteration : time elapsed = %8.2f usecs, %7.2f MBytes/sec, %8.2f cycles/byte\n", 1.e6*time_max,data_size/1024./1024./time_max, time_max*1.*cpu_freq/data_size); + + printf("\n"); + + printf(" aes_decrypt_cbc : time elapsed = %8.2f usecs, %7.2f MBytes/sec, %8.2f cycles/byte\n", 1.e6*time1/ITERATIONS,data_size*ITERATIONS/1024./1024./time1, time1*1.*cpu_freq/ITERATIONS/data_size); + printf(" best iteration : time elapsed = %8.2f usecs, %7.2f MBytes/sec, %8.2f cycles/byte\n", 1.e6*time_min1,data_size/1024./1024./time_min1, time_min1*1.*cpu_freq/data_size); + printf(" worst iteration : time elapsed = %8.2f usecs, %7.2f MBytes/sec, %8.2f cycles/byte\n", 1.e6*time_max1,data_size/1024./1024./time_max1, time_max1*1.*cpu_freq/data_size); + + free(plain); + free(cipher); + free(decrypt); +} diff --git a/bsd/crypto/blowfish/Makefile b/bsd/crypto/blowfish/Makefile index 6b3066a93..26126163a 100644 --- a/bsd/crypto/blowfish/Makefile +++ b/bsd/crypto/blowfish/Makefile @@ -9,16 +9,12 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ 
EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/crypto/cast128/Makefile b/bsd/crypto/cast128/Makefile index 6eb76064a..100921729 100644 --- a/bsd/crypto/cast128/Makefile +++ b/bsd/crypto/cast128/Makefile @@ -9,16 +9,12 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/crypto/des/Makefile b/bsd/crypto/des/Makefile index df4545d55..2eee6301a 100644 --- a/bsd/crypto/des/Makefile +++ b/bsd/crypto/des/Makefile @@ -9,8 +9,6 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ @@ -19,8 +17,6 @@ INSTINC_SUBDIRS_ARM = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/crypto/doc/KernelCrypto.plist b/bsd/crypto/doc/KernelCrypto.plist new file mode 100644 index 000000000..9c9cfd1c9 --- /dev/null +++ b/bsd/crypto/doc/KernelCrypto.plist @@ -0,0 +1,76 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"> +<plist version="1.0"> +<array> + <dict> + <key>OpenSourceLicense</key> + <string>Other</string> + <key>OpenSourceLicenseFile</key> + <string>KernelCrypto.txt</string> + <key>OpenSourceModifications</key> + <string>Extensive customization for OS X</string> + <key>OpenSourceProject</key> + <string>openssl</string> + <key>OpenSourceURL</key> + <string>http://www.openssl.org/source/openssl-0.9.6.tar.gz</string> + <key>OpenSourceVersion</key> + <string>openssl-0.9.6</string> + <key>OpenSourceWebsiteURL</key> + <string>http://www.openssl.org/</string> + </dict> + <dict> + <key>OpenSourceImportDate</key> + <string>2004-04-07</string> + <key>OpenSourceLicense</key> + <string>Other</string> + <key>OpenSourceLicenseFile</key> + <string>KernelCrypto.txt</string> + <key>OpenSourceModifications</key> + <string>Customization for OS X</string> + <key>OpenSourceProject</key> + <string>Gladman AES</string> + <key>OpenSourceURL</key> + <string>http://fp.gladman.plus.com/AES/aesfull.zip</string> + <key>OpenSourceVersion</key> + <string>aes-src-26-08-05</string> + <key>OpenSourceWebsiteURL</key> + <string>http://fp.gladman.plus.com/AES/index.htm</string> + </dict> + <dict> + <key>OpenSourceImportDate</key> + <string>2005-09-02</string> + <key>OpenSourceLicense</key> + <string>Other</string> + <key>OpenSourceLicenseFile</key> + <string>KernelCrypto.txt</string> + <key>OpenSourceModifications</key> + <string>Customization for OS X</string> + <key>OpenSourceProject</key> + <string>Gladman SHA2</string> + <key>OpenSourceURL</key> + <string>http://fp.gladman.plus.com/cryptography_technology/sha/sha-26-08-05.zip</string> + <key>OpenSourceVersion</key> + <string>sha-26-08-05</string> + <key>OpenSourceWebsiteURL</key> + <string>http://fp.gladman.plus.com/cryptography_technology/sha/index.htm</string> + </dict> + <dict> + <key>OpenSourceImportDate</key> + <string>2010-04-14</string> + <key>OpenSourceLicense</key> + <string>Other</string> + <key>OpenSourceLicenseFile</key> + <string>KernelCrypto.txt</string> + <key>OpenSourceModifications</key> + <string>Customization for OS X</string> + <key>OpenSourceProject</key> + <string>Gladman XTS-AES</string> + <key>OpenSourceURL</key> + <string>http://gladman.plushost.co.uk/oldsite/AES/xts-vs2008-17-07-09.zip</string> + <key>OpenSourceVersion</key> + <string>xts-vs2008-17-07-09</string> 
+ <key>OpenSourceWebsiteURL</key> + <string>http://gladman.plushost.co.uk/oldsite/AES/index.php</string> + </dict> +</array> +</plist> diff --git a/bsd/crypto/doc/KernelCrypto.txt b/bsd/crypto/doc/KernelCrypto.txt new file mode 100644 index 000000000..611542795 --- /dev/null +++ b/bsd/crypto/doc/KernelCrypto.txt @@ -0,0 +1,149 @@ + Original SSLeay License + ----------------------- + +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. 
this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + + --------------------------------------------------------------------------- + License for Dr. Brian Gladman's SHA2 implementation + --------------------------------------------------------------------------- + + Copyright (c) 2002, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + + --------------------------------------------------------------------------- + License for Dr. Brian Gladman's AES implementation + --------------------------------------------------------------------------- + Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + + +--------------------------------------------------------------------------- + License for Dr. Brian Gladman's XTS implementation + --------------------------------------------------------------------------- + +Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. + +LICENSE TERMS + +The free distribution and use of this software is allowed (with or without +changes) provided that: + +1. source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + +2. binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation; + +3. 
the name of the copyright holder is not used to endorse products + built using this software without specific written permission. + +DISCLAIMER + +This software is provided 'as is' with no explicit or implied warranties +in respect of its properties, including, but not limited to, correctness +and/or fitness for purpose. + + diff --git a/bsd/crypto/rc4/Makefile b/bsd/crypto/rc4/Makefile index 4de505de8..9aad66e3a 100644 --- a/bsd/crypto/rc4/Makefile +++ b/bsd/crypto/rc4/Makefile @@ -9,8 +9,6 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ @@ -19,8 +17,6 @@ INSTINC_SUBDIRS_ARM = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/crypto/sha2/Makefile b/bsd/crypto/sha2/Makefile index 8e85f612c..4cc93fb76 100644 --- a/bsd/crypto/sha2/Makefile +++ b/bsd/crypto/sha2/Makefile @@ -9,8 +9,6 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ @@ -19,8 +17,6 @@ INSTINC_SUBDIRS_ARM = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/crypto/sha2/intel/sha256.s b/bsd/crypto/sha2/intel/sha256.s new file mode 100644 index 000000000..59353ff4b --- /dev/null +++ b/bsd/crypto/sha2/intel/sha256.s @@ -0,0 +1,617 @@ +/* + This file provides x86_64/i386 hand implementation of the following function + + void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks); + + which is a C function in sha2.c (from xnu). + + The code 1st probes cpu_capabilities to detect whether ssse3 is supported. If not, it branches to + SHA256_Transform_nossse3 (in a separate source file sha256nossse3.s) that was cloned from this file + with all ssse3 instructions replaced with sse3 or below instructions. + + sha256 algorithm per block description: + + 1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 byte) + 2. load 8 digests a-h from ctx->state + 3. for r = 0:15 + T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r]; + d += T1; + h = T1 + Sigma0(a) + Maj(a,b,c) + permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g + 4. 
for r = 16:63 + W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]); + T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r]; + d += T1; + h = T1 + Sigma0(a) + Maj(a,b,c) + permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g + + In the assembly implementation: + - a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm3 + - its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer + - the 8 digests (a-h) will be stored in GPR or m32 (all in GPR for x86_64, and some in m32 for i386) + + the implementation per block looks like + + ---------------------------------------------------------------------------- + + load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3 + pre_calculate and store W+K(0:15) in stack + + load digests a-h from ctx->state; + + for (r=0;r<48;r+=4) { + digests a-h update and permute round r:r+3 + update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration + } + + for (r=48;r<64;r+=4) { + digests a-h update and permute round r:r+3 + } + + ctx->states += digests a-h; + + ---------------------------------------------------------------------------- + + our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block + into the last 16 rounds of its previous block: + + ---------------------------------------------------------------------------- + + load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3 + pre_calculate and store W+K(0:15) in stack + +L_loop: + + load digests a-h from ctx->state; + + for (r=0;r<48;r+=4) { + digests a-h update and permute round r:r+3 + update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration + } + + num_block--; + if (num_block==0) jmp L_last_block; + + for (r=48;r<64;r+=4) { + digests a-h update and permute round r:r+3 + load W([r:r+3]%16) (big-endian per 4 bytes) into xmm0:xmm3 + pre_calculate and store W+K([r:r+3]%16) in stack + } + + ctx->states += digests a-h; + + jmp L_loop; + +L_last_block: + + for (r=48;r<64;r+=4) { + digests a-h update and permute round r:r+3 + } + + ctx->states += digests a-h; + + ------------------------------------------------------------------------ + + Apple CoreOS vector & numerics + cclee 8-3-10 +*/ + +#if defined KERNEL +#include <i386/cpu_capabilities.h> +#else +#include <System/i386/cpu_capabilities.h> +#endif + + // associate variables with registers or memory + +#if defined (__x86_64__) + #define sp %rsp + #define ctx %rdi + #define data %rsi + #define num_blocks %rdx + + #define a %r8d + #define b %r9d + #define c %r10d + #define d %r11d + #define e %r12d + #define f %r13d + #define g %r14d + #define h %r15d + + #define K %rbx + #define stack_size (8+16*8+16+64) // 8 (align) + xmm0:xmm7 + L_aligned_bswap + WK(0:15) + + #define L_aligned_bswap 64(sp) // bswap : big-endian loading of 4-byte words + #define xmm_save 80(sp) // starting address for xmm save/restore +#else + #define sp %esp + #define stack_size (12+16*8+16+16+64) // 12 (align) + xmm0:xmm7 + 16 (c,f,h,K) + L_aligned_bswap + WK(0:15) + #define ctx_addr 20+stack_size(sp) // ret_addr + 4 registers = 20, 1st caller argument + #define data_addr 24+stack_size(sp) // 2nd caller argument + #define num_blocks 28+stack_size(sp) // 3rd caller argument + + #define a %ebx + #define b %edx + #define c 64(sp) + #define d %ebp + #define e %esi + #define f 68(sp) + #define g %edi + #define h 72(sp) + + #define K 76(sp) // pointer to K256[] table + #define L_aligned_bswap 80(sp) // bswap : big-endian loading of 4-byte words + #define xmm_save 96(sp) // starting address for 
xmm save/restore +#endif + + // 2 local variables + #define t %eax + #define s %ecx + + // a window (16 words) of message scheule + #define W0 %xmm0 + #define W1 %xmm1 + #define W2 %xmm2 + #define W3 %xmm3 + + // circular buffer for WK[(r:r+15)%16] + #define WK(x) (x&15)*4(sp) + +// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z))) + + .macro Ch + mov $0, t // x + mov $0, s // x + not t // ~x + and $1, s // x & y + and $2, t // ~x & z + xor s, t // t = ((x) & (y)) ^ ((~(x)) & (z)); + .endm + +// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) + + .macro Maj + mov $0, t // x + mov $1, s // y + and s, t // x&y + and $2, s // y&z + xor s, t // (x&y) ^ (y&z) + mov $2, s // z + and $0, s // (x&z) + xor s, t // t = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) + .endm + +/* Shift-right (used in SHA-256, SHA-384, and SHA-512): */ +// #define R(b,x) ((x) >> (b)) +/* 32-bit Rotate-right (used in SHA-256): */ +// #define S32(b,x) (((x) >> (b)) | ((x) << (32 - (b)))) + +// #define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3 , (x))) + + // performs sigma0_256 on 4 words on an xmm registers + // use xmm6/xmm7 as intermediate registers + .macro sigma0 + movdqa $0, %xmm6 + movdqa $0, %xmm7 + psrld $$3, $0 // SHR3(x) + psrld $$7, %xmm6 // part of ROTR7 + pslld $$14, %xmm7 // part of ROTR18 + pxor %xmm6, $0 + pxor %xmm7, $0 + psrld $$11, %xmm6 // part of ROTR18 + pslld $$11, %xmm7 // part of ROTR7 + pxor %xmm6, $0 + pxor %xmm7, $0 + .endm + +// #define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x))) + + // performs sigma1_256 on 4 words on an xmm registers + // use xmm6/xmm7 as intermediate registers + .macro sigma1 + movdqa $0, %xmm6 + movdqa $0, %xmm7 + psrld $$10, $0 // SHR10(x) + psrld $$17, %xmm6 // part of ROTR17 + pxor %xmm6, $0 + pslld $$13, %xmm7 // part of ROTR19 + pxor %xmm7, $0 + psrld $$2, %xmm6 // part of ROTR19 + pxor %xmm6, $0 + pslld $$2, %xmm7 // part of ROTR17 + pxor %xmm7, $0 + .endm + +// #define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) + + .macro Sigma0 + mov $0, t // x + mov $0, s // x + ror $$2, t // S32(2, (x)) + ror $$13, s // S32(13, (x)) + xor s, t // S32(2, (x)) ^ S32(13, (x)) + ror $$9, s // S32(22, (x)) + xor s, t // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) + .endm + +// #define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x))) + + .macro Sigma1 + mov $0, s // x + ror $$6, s // S32(6, (x)) + mov s, t // S32(6, (x)) + ror $$5, s // S32(11, (x)) + xor s, t // S32(6, (x)) ^ S32(11, (x)) + ror $$14, s // S32(25, (x)) + xor s, t // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x))) + .endm + + // per round digests update + .macro round + Sigma1 $4 // t = T1 + add t, $7 // use h to store h+Sigma1(e) + Ch $4, $5, $6 // t = Ch (e, f, g); + add $7, t // t = h+Sigma1(e)+Ch(e,f,g); + add WK($8), t // h = T1 + add t, $3 // d += T1; + mov t, $7 // h = T1 + Sigma0 $0 // t = Sigma0(a); + add t, $7 // h = T1 + Sigma0(a); + Maj $0, $1, $2 // t = Maj(a,b,c) + add t, $7 // h = T1 + Sigma0(a) + Maj(a,b,c); + .endm + + // per 4 rounds digests update and permutation + // permutation is absorbed by rotating the roles of digests a-h + .macro rounds + round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8 + round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8 + round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8 + round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8 + .endm + + // update the message schedule W and W+K (4 rounds) 16 rounds ahead in the future + .macro message_schedule + + // 4 32-bit K256 words in xmm5 +#if defined (__x86_64__) + movdqu (K), %xmm5 +#else + mov K, t + movdqu 
(t), %xmm5 +#endif + add $$16, K // K points to next K256 word for next iteration + movdqa $1, %xmm4 // W7:W4 + palignr $$4, $0, %xmm4 // W4:W1 + sigma0 %xmm4 // sigma0(W4:W1) + movdqa $3, %xmm6 // W15:W12 + paddd %xmm4, $0 // $0 = W3:W0 + sigma0(W4:W1) + palignr $$4, $2, %xmm6 // W12:W9 + paddd %xmm6, $0 // $0 = W12:W9 + sigma0(W4:W1) + W3:W0 + movdqa $3, %xmm4 // W15:W12 + psrldq $$8, %xmm4 // 0,0,W15,W14 + sigma1 %xmm4 // sigma1(0,0,W15,W14) + paddd %xmm4, $0 // sigma1(0,0,W15,W14) + W12:W9 + sigma0(W4:W1) + W3:W0 + movdqa $0, %xmm4 // W19-sigma1(W17), W18-sigma1(W16), W17, W16 + pslldq $$8, %xmm4 // W17, W16, 0, 0 + sigma1 %xmm4 // sigma1(W17,W16,0,0) + paddd %xmm4, $0 // W19:W16 + paddd $0, %xmm5 // WK + movdqa %xmm5, WK($4) + .endm + + // this macro is used in the last 16 rounds of a current block + // it reads the next message (16 4-byte words), load it into 4 words W[r:r+3], computes WK[r:r+3] + // and save into stack to prepare for next block + + .macro update_W_WK +#if defined (__x86_64__) + movdqu $0*16(data), $1 // read 4 4-byte words + pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3] + movdqu $0*16(K), %xmm4 // K[r:r+3] +#else + mov data_addr, t + movdqu $0*16(t), $1 // read 4 4-byte words + pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3] + mov K, t + movdqu $0*16(t), %xmm4 // K[r:r+3] +#endif + paddd $1, %xmm4 // WK[r:r+3] + movdqa %xmm4, WK($0*4) // save WK[r:r+3] into stack circular buffer + .endm + + .text + +#if defined (__x86_64__) || defined (__i386__) + + .globl _SHA256_Transform + +_SHA256_Transform: + + + // detect SSSE3 and dispatch appropriate code branch + #if defined __x86_64__ + movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities + mov (%rax), %eax // %eax = __cpu_capabilities + #else // i386 + #if defined KERNEL + leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities + mov (%eax), %eax // %eax = __cpu_capabilities + #else + mov _COMM_PAGE_CPU_CAPABILITIES, %eax + #endif + #endif + test $(kHasSupplementalSSE3), %eax + je _SHA256_Transform_nossse3 // branch to no-ssse3 code + + // push callee-saved registers +#if defined (__x86_64__) + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 +#else + push %ebp + push %ebx + push %esi + push %edi +#endif + + // allocate stack space + sub $stack_size, sp + + // if kernel code, save used xmm registers +#if KERNEL + movdqa %xmm0, 0*16+xmm_save + movdqa %xmm1, 1*16+xmm_save + movdqa %xmm2, 2*16+xmm_save + movdqa %xmm3, 3*16+xmm_save + movdqa %xmm4, 4*16+xmm_save + movdqa %xmm5, 5*16+xmm_save + movdqa %xmm6, 6*16+xmm_save + movdqa %xmm7, 7*16+xmm_save +#endif + + // set up bswap parameters in the aligned stack space and pointer to table K256[] +#if defined (__x86_64__) + lea _K256(%rip), K + lea L_bswap(%rip), %rax + movdqa (%rax), %xmm0 +#else + lea _K256, t + mov t, K + lea L_bswap, %eax + movdqa (%eax), %xmm0 +#endif + movdqa %xmm0, L_aligned_bswap + + // load W[0:15] into xmm0-xmm3 +#if defined (__x86_64__) + movdqu 0*16(data), W0 + movdqu 1*16(data), W1 + movdqu 2*16(data), W2 + movdqu 3*16(data), W3 + add $64, data +#else + mov data_addr, t + movdqu 0*16(t), W0 + movdqu 1*16(t), W1 + movdqu 2*16(t), W2 + movdqu 3*16(t), W3 + add $64, data_addr +#endif + pshufb L_aligned_bswap, W0 + pshufb L_aligned_bswap, W1 + pshufb L_aligned_bswap, W2 + pshufb L_aligned_bswap, W3 + + // compute WK[0:15] and save in stack +#if defined (__x86_64__) + movdqu 0*16(K), %xmm4 + movdqu 1*16(K), %xmm5 + movdqu 2*16(K), %xmm6 + movdqu 3*16(K), %xmm7 
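+
+	// The four movdqu loads above fetch K256[0:15]; the paddd instructions
+	// past the matching #endif fold in W[0:15], so the four 16-byte slots
+	// WK(0), WK(4), WK(8), WK(12) end up holding W[r]+K256[r] (mod 2^32)
+	// for rounds r = 0..15, four words per slot.  A minimal C sketch of
+	// this precomputation (illustrative only, names as in the header
+	// comment above):
+	//
+	//	for (r = 0; r < 16; r++)
+	//		WK[r] = W[r] + K256[r];
+	//
+	// Keeping W+K in the stack circular buffer lets the round macro do a
+	// single "add WK(r), t" instead of adding W and K separately.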
+#else + mov K, t + movdqu 0*16(t), %xmm4 + movdqu 1*16(t), %xmm5 + movdqu 2*16(t), %xmm6 + movdqu 3*16(t), %xmm7 +#endif + add $64, K + paddd %xmm0, %xmm4 + paddd %xmm1, %xmm5 + paddd %xmm2, %xmm6 + paddd %xmm3, %xmm7 + movdqa %xmm4, WK(0) + movdqa %xmm5, WK(4) + movdqa %xmm6, WK(8) + movdqa %xmm7, WK(12) + +L_loop: + + // digests a-h = ctx->states; +#if defined (__x86_64__) + mov 0*4(ctx), a + mov 1*4(ctx), b + mov 2*4(ctx), c + mov 3*4(ctx), d + mov 4*4(ctx), e + mov 5*4(ctx), f + mov 6*4(ctx), g + mov 7*4(ctx), h +#else + mov ctx_addr, t + mov 0*4(t), a + mov 1*4(t), b + mov 2*4(t), s + mov s, c + mov 3*4(t), d + mov 4*4(t), e + mov 5*4(t), s + mov s, f + mov 6*4(t), g + mov 7*4(t), s + mov s, h +#endif + + // rounds 0:47 interleaved with W/WK update for rounds 16:63 + rounds a, b, c, d, e, f, g, h, 0 + message_schedule W0,W1,W2,W3,16 + rounds e, f, g, h, a, b, c, d, 4 + message_schedule W1,W2,W3,W0,20 + rounds a, b, c, d, e, f, g, h, 8 + message_schedule W2,W3,W0,W1,24 + rounds e, f, g, h, a, b, c, d, 12 + message_schedule W3,W0,W1,W2,28 + rounds a, b, c, d, e, f, g, h, 16 + message_schedule W0,W1,W2,W3,32 + rounds e, f, g, h, a, b, c, d, 20 + message_schedule W1,W2,W3,W0,36 + rounds a, b, c, d, e, f, g, h, 24 + message_schedule W2,W3,W0,W1,40 + rounds e, f, g, h, a, b, c, d, 28 + message_schedule W3,W0,W1,W2,44 + rounds a, b, c, d, e, f, g, h, 32 + message_schedule W0,W1,W2,W3,48 + rounds e, f, g, h, a, b, c, d, 36 + message_schedule W1,W2,W3,W0,52 + rounds a, b, c, d, e, f, g, h, 40 + message_schedule W2,W3,W0,W1,56 + rounds e, f, g, h, a, b, c, d, 44 + message_schedule W3,W0,W1,W2,60 + + // revert K to the beginning of K256[] +#if defined __x86_64__ + sub $256, K +#else + subl $256, K +#endif + + sub $1, num_blocks // num_blocks-- + je L_final_block // if final block, wrap up final rounds + + // rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15 + rounds a, b, c, d, e, f, g, h, 48 + update_W_WK 0, W0 + rounds e, f, g, h, a, b, c, d, 52 + update_W_WK 1, W1 + rounds a, b, c, d, e, f, g, h, 56 + update_W_WK 2, W2 + rounds e, f, g, h, a, b, c, d, 60 + update_W_WK 3, W3 + + add $64, K +#if defined (__x86_64__) + add $64, data +#else + add $64, data_addr +#endif + + // ctx->states += digests a-h +#if defined (__x86_64__) + add a, 0*4(ctx) + add b, 1*4(ctx) + add c, 2*4(ctx) + add d, 3*4(ctx) + add e, 4*4(ctx) + add f, 5*4(ctx) + add g, 6*4(ctx) + add h, 7*4(ctx) +#else + mov ctx_addr, t + add a, 0*4(t) + add b, 1*4(t) + mov c, s + add s, 2*4(t) + add d, 3*4(t) + add e, 4*4(t) + mov f, s + add s, 5*4(t) + add g, 6*4(t) + mov h, s + add s, 7*4(t) +#endif + + jmp L_loop // branch for next block + + // wrap up digest update round 48:63 for final block +L_final_block: + rounds a, b, c, d, e, f, g, h, 48 + rounds e, f, g, h, a, b, c, d, 52 + rounds a, b, c, d, e, f, g, h, 56 + rounds e, f, g, h, a, b, c, d, 60 + + // ctx->states += digests a-h +#if defined (__x86_64__) + add a, 0*4(ctx) + add b, 1*4(ctx) + add c, 2*4(ctx) + add d, 3*4(ctx) + add e, 4*4(ctx) + add f, 5*4(ctx) + add g, 6*4(ctx) + add h, 7*4(ctx) +#else + mov ctx_addr, t + add a, 0*4(t) + add b, 1*4(t) + mov c, s + add s, 2*4(t) + add d, 3*4(t) + add e, 4*4(t) + mov f, s + add s, 5*4(t) + add g, 6*4(t) + mov h, s + add s, 7*4(t) +#endif + + // if kernel, restore xmm0-xmm7 +#if KERNEL + movdqa 0*16+xmm_save, %xmm0 + movdqa 1*16+xmm_save, %xmm1 + movdqa 2*16+xmm_save, %xmm2 + movdqa 3*16+xmm_save, %xmm3 + movdqa 4*16+xmm_save, %xmm4 + movdqa 5*16+xmm_save, %xmm5 + movdqa 6*16+xmm_save, %xmm6 + movdqa 
7*16+xmm_save, %xmm7 +#endif + + // free allocated stack memory + add $stack_size, sp + + // restore callee-saved registers +#if defined (__x86_64__) + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + pop %rbp +#else + pop %edi + pop %esi + pop %ebx + pop %ebp +#endif + + // return + ret + + + .const + .align 4, 0x90 + +L_bswap: + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f + +#endif // x86_64/i386 + diff --git a/bsd/crypto/sha2/intel/sha256nossse3.s b/bsd/crypto/sha2/intel/sha256nossse3.s new file mode 100644 index 000000000..b4dd0a035 --- /dev/null +++ b/bsd/crypto/sha2/intel/sha256nossse3.s @@ -0,0 +1,649 @@ +/* + This file provides x86_64/i386 hand implementation of the following function + + void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks); + + which is a C function in sha2.c (from xnu). + + The code SHA256_Transform_nossse3 is a clone of SHA256_Transform + with all ssse3 instructions replaced with sse3 or below instructions. + + For performance reason, this function should not be called directly. This file should be working + together with the one that implements SHA256_Transform. There, cpu_capabilities is probed to detect + ssse3. If ssse3 is not supported, the execution will be branched to this no-ssse3-specific function. + + sha256 algorithm per block description: + + 1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 byte) + 2. load 8 digests a-h from ctx->state + 3. for r = 0:15 + T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r]; + d += T1; + h = T1 + Sigma0(a) + Maj(a,b,c) + permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g + 4. for r = 16:63 + W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]); + T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r]; + d += T1; + h = T1 + Sigma0(a) + Maj(a,b,c) + permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g + + In the assembly implementation: + - a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm3 + - its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer + - the 8 digests (a-h) will be stored in GPR or m32 (all in GPR for x86_64, and some in m32 for i386) + + the implementation per block looks like + + ---------------------------------------------------------------------------- + + load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3 + pre_calculate and store W+K(0:15) in stack + + load digests a-h from ctx->state; + + for (r=0;r<48;r+=4) { + digests a-h update and permute round r:r+3 + update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration + } + + for (r=48;r<64;r+=4) { + digests a-h update and permute round r:r+3 + } + + ctx->states += digests a-h; + + ---------------------------------------------------------------------------- + + our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block + into the last 16 rounds of its previous block: + + ---------------------------------------------------------------------------- + + load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3 + pre_calculate and store W+K(0:15) in stack + +L_loop: + + load digests a-h from ctx->state; + + for (r=0;r<48;r+=4) { + digests a-h update and permute round r:r+3 + update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration + } + + num_block--; + if (num_block==0) jmp L_last_block; + + for (r=48;r<64;r+=4) { + digests a-h update and permute round r:r+3 + load W([r:r+3]%16) (big-endian per 4 bytes) into xmm0:xmm3 + pre_calculate and store W+K([r:r+3]%16) in stack + } + + ctx->states 
+= digests a-h; + + jmp L_loop; + +L_last_block: + + for (r=48;r<64;r+=4) { + digests a-h update and permute round r:r+3 + } + + ctx->states += digests a-h; + + ------------------------------------------------------------------------ + + Apple CoreOS vector & numerics + cclee 8-3-10 +*/ + +#if defined KERNEL +#include <i386/cpu_capabilities.h> +#else +#include <System/i386/cpu_capabilities.h> +#endif + + // associate variables with registers or memory + +#if defined (__x86_64__) + #define sp %rsp + #define ctx %rdi + #define data %rsi + #define num_blocks %rdx + + #define a %r8d + #define b %r9d + #define c %r10d + #define d %r11d + #define e %r12d + #define f %r13d + #define g %r14d + #define h %r15d + + #define K %rbx + #define stack_size (8+16*8+16+64) // 8 (align) + xmm0:xmm7 + L_aligned_bswap + WK(0:15) + + #define xmm_save 80(sp) // starting address for xmm save/restore +#else + #define sp %esp + #define stack_size (12+16*8+16+16+64) // 12 (align) + xmm0:xmm7 + 16 (c,f,h,K) + L_aligned_bswap + WK(0:15) + #define ctx_addr 20+stack_size(sp) // ret_addr + 4 registers = 20, 1st caller argument + #define data_addr 24+stack_size(sp) // 2nd caller argument + #define num_blocks 28+stack_size(sp) // 3rd caller argument + + #define a %ebx + #define b %edx + #define c 64(sp) + #define d %ebp + #define e %esi + #define f 68(sp) + #define g %edi + #define h 72(sp) + + #define K 76(sp) // pointer to K256[] table + #define xmm_save 96(sp) // starting address for xmm save/restore +#endif + + // 2 local variables + #define t %eax + #define s %ecx + + // a window (16 words) of message scheule + #define W0 %xmm0 + #define W1 %xmm1 + #define W2 %xmm2 + #define W3 %xmm3 + + // circular buffer for WK[(r:r+15)%16] + #define WK(x) (x&15)*4(sp) + +// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z))) + + .macro Ch + mov $0, t // x + mov $0, s // x + not t // ~x + and $1, s // x & y + and $2, t // ~x & z + xor s, t // t = ((x) & (y)) ^ ((~(x)) & (z)); + .endm + +// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) + + .macro Maj + mov $0, t // x + mov $1, s // y + and s, t // x&y + and $2, s // y&z + xor s, t // (x&y) ^ (y&z) + mov $2, s // z + and $0, s // (x&z) + xor s, t // t = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) + .endm + +/* Shift-right (used in SHA-256, SHA-384, and SHA-512): */ +// #define R(b,x) ((x) >> (b)) +/* 32-bit Rotate-right (used in SHA-256): */ +// #define S32(b,x) (((x) >> (b)) | ((x) << (32 - (b)))) + +// #define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3 , (x))) + + // performs sigma0_256 on 4 words on an xmm registers + // use xmm6/xmm7 as intermediate registers + .macro sigma0 + movdqa $0, %xmm6 + movdqa $0, %xmm7 + psrld $$3, $0 // SHR3(x) + psrld $$7, %xmm6 // part of ROTR7 + pslld $$14, %xmm7 // part of ROTR18 + pxor %xmm6, $0 + pxor %xmm7, $0 + psrld $$11, %xmm6 // part of ROTR18 + pslld $$11, %xmm7 // part of ROTR7 + pxor %xmm6, $0 + pxor %xmm7, $0 + .endm + +// #define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x))) + + // performs sigma1_256 on 4 words on an xmm registers + // use xmm6/xmm7 as intermediate registers + .macro sigma1 + movdqa $0, %xmm6 + movdqa $0, %xmm7 + psrld $$10, $0 // SHR10(x) + psrld $$17, %xmm6 // part of ROTR17 + pxor %xmm6, $0 + pslld $$13, %xmm7 // part of ROTR19 + pxor %xmm7, $0 + psrld $$2, %xmm6 // part of ROTR19 + pxor %xmm6, $0 + pslld $$2, %xmm7 // part of ROTR17 + pxor %xmm7, $0 + .endm + +// #define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) + + .macro Sigma0 + mov $0, t // x + mov $0, s // x + ror $$2, 
t // S32(2, (x)) + ror $$13, s // S32(13, (x)) + xor s, t // S32(2, (x)) ^ S32(13, (x)) + ror $$9, s // S32(22, (x)) + xor s, t // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x))) + .endm + +// #define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x))) + + .macro Sigma1 + mov $0, s // x + ror $$6, s // S32(6, (x)) + mov s, t // S32(6, (x)) + ror $$5, s // S32(11, (x)) + xor s, t // S32(6, (x)) ^ S32(11, (x)) + ror $$14, s // S32(25, (x)) + xor s, t // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x))) + .endm + + // per round digests update + .macro round + Sigma1 $4 // t = T1 + add t, $7 // use h to store h+Sigma1(e) + Ch $4, $5, $6 // t = Ch (e, f, g); + add $7, t // t = h+Sigma1(e)+Ch(e,f,g); + add WK($8), t // h = T1 + add t, $3 // d += T1; + mov t, $7 // h = T1 + Sigma0 $0 // t = Sigma0(a); + add t, $7 // h = T1 + Sigma0(a); + Maj $0, $1, $2 // t = Maj(a,b,c) + add t, $7 // h = T1 + Sigma0(a) + Maj(a,b,c); + .endm + + // per 4 rounds digests update and permutation + // permutation is absorbed by rotating the roles of digests a-h + .macro rounds + round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8 + round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8 + round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8 + round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8 + .endm + + // update the message schedule W and W+K (4 rounds) 16 rounds ahead in the future + .macro message_schedule + + // 4 32-bit K256 words in xmm5 +#if defined (__x86_64__) + movdqu (K), %xmm5 +#else + mov K, t + movdqu (t), %xmm5 +#endif + add $$16, K // K points to next K256 word for next iteration + movdqa $1, %xmm4 // W7:W4 +#if 0 + palignr $$4, $0, %xmm4 // W4:W1 +#else // no-ssse3 implementation of palignr + movdqa $0, %xmm7 + pslldq $$12, %xmm4 + psrldq $$4, %xmm7 + por %xmm7, %xmm4 +#endif + sigma0 %xmm4 // sigma0(W4:W1) + movdqa $3, %xmm6 // W15:W12 + paddd %xmm4, $0 // $0 = W3:W0 + sigma0(W4:W1) +#if 0 + palignr $$4, $2, %xmm6 // W12:W9 +#else // no-ssse3 implementation of palignr + movdqa $2, %xmm7 + pslldq $$12, %xmm6 + psrldq $$4, %xmm7 + por %xmm7, %xmm6 +#endif + paddd %xmm6, $0 // $0 = W12:W9 + sigma0(W4:W1) + W3:W0 + movdqa $3, %xmm4 // W15:W12 + psrldq $$8, %xmm4 // 0,0,W15,W14 + sigma1 %xmm4 // sigma1(0,0,W15,W14) + paddd %xmm4, $0 // sigma1(0,0,W15,W14) + W12:W9 + sigma0(W4:W1) + W3:W0 + movdqa $0, %xmm4 // W19-sigma1(W17), W18-sigma1(W16), W17, W16 + pslldq $$8, %xmm4 // W17, W16, 0, 0 + sigma1 %xmm4 // sigma1(W17,W16,0,0) + paddd %xmm4, $0 // W19:W16 + paddd $0, %xmm5 // WK + movdqa %xmm5, WK($4) + .endm + + // this macro is used in the last 16 rounds of a current block + // it reads the next message (16 4-byte words), load it into 4 words W[r:r+3], computes WK[r:r+3] + // and save into stack to prepare for next block + + .macro update_W_WK +#if defined (__x86_64__) +#if 0 + movdqu $0*16(data), $1 // read 4 4-byte words + pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3] +#else // no-ssse3 implementation + mov 0+$0*16(data), s + bswap s + mov s, 0+WK($0*4) + mov 4+$0*16(data), s + bswap s + mov s, 4+WK($0*4) + mov 8+$0*16(data), s + bswap s + mov s, 8+WK($0*4) + mov 12+$0*16(data), s + bswap s + mov s, 12+WK($0*4) + movdqa WK($0*4), $1 +#endif + movdqu $0*16(K), %xmm4 // K[r:r+3] +#else + mov data_addr, t +#if 0 + movdqu $0*16(t), $1 // read 4 4-byte words + pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3] +#else // no-ssse3 implementation + mov 0+$0*16(t), s + bswap s + mov s, 0+WK($0*4) + mov 4+$0*16(t), s + bswap s + mov s, 4+WK($0*4) + mov 8+$0*16(t), s + bswap s + mov s, 8+WK($0*4) + mov 
12+$0*16(t), s + bswap s + mov s, 12+WK($0*4) + movdqa WK($0*4), $1 +#endif + mov K, t + movdqu $0*16(t), %xmm4 // K[r:r+3] +#endif + paddd $1, %xmm4 // WK[r:r+3] + movdqa %xmm4, WK($0*4) // save WK[r:r+3] into stack circular buffer + .endm + + .text + +#if defined (__x86_64__) || defined (__i386__) + + .globl _SHA256_Transform_nossse3 + +_SHA256_Transform_nossse3: + + // push callee-saved registers +#if defined (__x86_64__) + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 +#else + push %ebp + push %ebx + push %esi + push %edi +#endif + + // allocate stack space + sub $stack_size, sp + + // if kernel code, save used xmm registers +#if KERNEL + movdqa %xmm0, 0*16+xmm_save + movdqa %xmm1, 1*16+xmm_save + movdqa %xmm2, 2*16+xmm_save + movdqa %xmm3, 3*16+xmm_save + movdqa %xmm4, 4*16+xmm_save + movdqa %xmm5, 5*16+xmm_save + movdqa %xmm6, 6*16+xmm_save + movdqa %xmm7, 7*16+xmm_save +#endif + + // set up pointer to table K256[] +#if defined (__x86_64__) + lea _K256(%rip), K +#else + lea _K256, t + mov t, K +#endif + + // load W[0:15] into xmm0-xmm3 + .macro mybswap + movl 0+$0*16($1), a + movl 4+$0*16($1), b + movl 8+$0*16($1), e + movl 12+$0*16($1), d + bswap a + bswap b + bswap e + bswap d + movl a, $0*16(sp) + movl b, 4+$0*16(sp) + movl e, 8+$0*16(sp) + movl d, 12+$0*16(sp) + .endm + +#if defined (__x86_64__) + mybswap 0, data + mybswap 1, data + mybswap 2, data + mybswap 3, data + add $64, data +#else + mov data_addr, t + mybswap 0, t + mybswap 1, t + mybswap 2, t + mybswap 3, t + add $64, data_addr +#endif + movdqa 0*16(sp), W0 + movdqa 1*16(sp), W1 + movdqa 2*16(sp), W2 + movdqa 3*16(sp), W3 + + // compute WK[0:15] and save in stack +#if defined (__x86_64__) + movdqu 0*16(K), %xmm4 + movdqu 1*16(K), %xmm5 + movdqu 2*16(K), %xmm6 + movdqu 3*16(K), %xmm7 +#else + mov K, t + movdqu 0*16(t), %xmm4 + movdqu 1*16(t), %xmm5 + movdqu 2*16(t), %xmm6 + movdqu 3*16(t), %xmm7 +#endif + add $64, K + paddd %xmm0, %xmm4 + paddd %xmm1, %xmm5 + paddd %xmm2, %xmm6 + paddd %xmm3, %xmm7 + movdqa %xmm4, WK(0) + movdqa %xmm5, WK(4) + movdqa %xmm6, WK(8) + movdqa %xmm7, WK(12) + +L_loop: + + // digests a-h = ctx->states; +#if defined (__x86_64__) + mov 0*4(ctx), a + mov 1*4(ctx), b + mov 2*4(ctx), c + mov 3*4(ctx), d + mov 4*4(ctx), e + mov 5*4(ctx), f + mov 6*4(ctx), g + mov 7*4(ctx), h +#else + mov ctx_addr, t + mov 0*4(t), a + mov 1*4(t), b + mov 2*4(t), s + mov s, c + mov 3*4(t), d + mov 4*4(t), e + mov 5*4(t), s + mov s, f + mov 6*4(t), g + mov 7*4(t), s + mov s, h +#endif + + // rounds 0:47 interleaved with W/WK update for rounds 16:63 + rounds a, b, c, d, e, f, g, h, 0 + message_schedule W0,W1,W2,W3,16 + rounds e, f, g, h, a, b, c, d, 4 + message_schedule W1,W2,W3,W0,20 + rounds a, b, c, d, e, f, g, h, 8 + message_schedule W2,W3,W0,W1,24 + rounds e, f, g, h, a, b, c, d, 12 + message_schedule W3,W0,W1,W2,28 + rounds a, b, c, d, e, f, g, h, 16 + message_schedule W0,W1,W2,W3,32 + rounds e, f, g, h, a, b, c, d, 20 + message_schedule W1,W2,W3,W0,36 + rounds a, b, c, d, e, f, g, h, 24 + message_schedule W2,W3,W0,W1,40 + rounds e, f, g, h, a, b, c, d, 28 + message_schedule W3,W0,W1,W2,44 + rounds a, b, c, d, e, f, g, h, 32 + message_schedule W0,W1,W2,W3,48 + rounds e, f, g, h, a, b, c, d, 36 + message_schedule W1,W2,W3,W0,52 + rounds a, b, c, d, e, f, g, h, 40 + message_schedule W2,W3,W0,W1,56 + rounds e, f, g, h, a, b, c, d, 44 + message_schedule W3,W0,W1,W2,60 + + // revert K to the beginning of K256[] +#if defined __x86_64__ + sub $256, K +#else + subl $256, K +#endif + + sub $1, 
num_blocks // num_blocks-- + je L_final_block // if final block, wrap up final rounds + + // rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15 + rounds a, b, c, d, e, f, g, h, 48 + update_W_WK 0, W0 + rounds e, f, g, h, a, b, c, d, 52 + update_W_WK 1, W1 + rounds a, b, c, d, e, f, g, h, 56 + update_W_WK 2, W2 + rounds e, f, g, h, a, b, c, d, 60 + update_W_WK 3, W3 + + add $64, K +#if defined (__x86_64__) + add $64, data +#else + add $64, data_addr +#endif + + // ctx->states += digests a-h +#if defined (__x86_64__) + add a, 0*4(ctx) + add b, 1*4(ctx) + add c, 2*4(ctx) + add d, 3*4(ctx) + add e, 4*4(ctx) + add f, 5*4(ctx) + add g, 6*4(ctx) + add h, 7*4(ctx) +#else + mov ctx_addr, t + add a, 0*4(t) + add b, 1*4(t) + mov c, s + add s, 2*4(t) + add d, 3*4(t) + add e, 4*4(t) + mov f, s + add s, 5*4(t) + add g, 6*4(t) + mov h, s + add s, 7*4(t) +#endif + + jmp L_loop // branch for next block + + // wrap up digest update round 48:63 for final block +L_final_block: + rounds a, b, c, d, e, f, g, h, 48 + rounds e, f, g, h, a, b, c, d, 52 + rounds a, b, c, d, e, f, g, h, 56 + rounds e, f, g, h, a, b, c, d, 60 + + // ctx->states += digests a-h +#if defined (__x86_64__) + add a, 0*4(ctx) + add b, 1*4(ctx) + add c, 2*4(ctx) + add d, 3*4(ctx) + add e, 4*4(ctx) + add f, 5*4(ctx) + add g, 6*4(ctx) + add h, 7*4(ctx) +#else + mov ctx_addr, t + add a, 0*4(t) + add b, 1*4(t) + mov c, s + add s, 2*4(t) + add d, 3*4(t) + add e, 4*4(t) + mov f, s + add s, 5*4(t) + add g, 6*4(t) + mov h, s + add s, 7*4(t) +#endif + + // if kernel, restore xmm0-xmm7 +#if KERNEL + movdqa 0*16+xmm_save, %xmm0 + movdqa 1*16+xmm_save, %xmm1 + movdqa 2*16+xmm_save, %xmm2 + movdqa 3*16+xmm_save, %xmm3 + movdqa 4*16+xmm_save, %xmm4 + movdqa 5*16+xmm_save, %xmm5 + movdqa 6*16+xmm_save, %xmm6 + movdqa 7*16+xmm_save, %xmm7 +#endif + + // free allocated stack memory + add $stack_size, sp + + // restore callee-saved registers +#if defined (__x86_64__) + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + pop %rbp +#else + pop %edi + pop %esi + pop %ebx + pop %ebp +#endif + + // return + ret + + +#endif // x86_64/i386 + diff --git a/bsd/crypto/sha2/sha2.c b/bsd/crypto/sha2/sha2.c index c306068dc..603d32834 100644 --- a/bsd/crypto/sha2/sha2.c +++ b/bsd/crypto/sha2/sha2.c @@ -63,7 +63,7 @@ * */ -#ifndef assert(x) +#ifndef assert #define assert(x) do {} while(0) #endif @@ -202,13 +202,21 @@ typedef u_int64_t sha2_word64; /* Exactly 8 bytes */ * only. 
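 *
 * (With SHA256_USE_ASSEMBLY defined on i386/x86_64, the hunk below gives
 * SHA256_Transform a third num_blocks argument, so a caller can hash any
 * number of whole 64-byte blocks in one call -- a minimal usage sketch,
 * mirroring what SHA256_Update does further down in this file:
 *
 *	kk = len / SHA256_BLOCK_LENGTH;
 *	if (kk > 0)
 *		SHA256_Transform(context, (const sha2_word32 *)data, kk);
 *
 * On all other configurations the one-block prototype is unchanged.)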
*/ void SHA512_Last(SHA512_CTX*); +#if defined (SHA256_USE_ASSEMBLY) && (defined(__x86_64__)||defined(__i386__)) +void SHA256_Transform(SHA256_CTX*, const sha2_word32*, unsigned int num_blocks); +#else void SHA256_Transform(SHA256_CTX*, const sha2_word32*); +#endif void SHA512_Transform(SHA512_CTX*, const sha2_word64*); /*** SHA-XYZ INITIAL HASH VALUES AND CONSTANTS ************************/ /* Hash constant words K for SHA-256: */ +#if defined (SHA256_USE_ASSEMBLY) && (defined(__x86_64__)||defined(__i386__)) +const sha2_word32 K256[64] = { // assembly code will need to read this table +#else static const sha2_word32 K256[64] = { +#endif 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, @@ -324,6 +332,8 @@ void SHA256_Init(SHA256_CTX* context) { context->bitcount = 0; } +#if !(defined (SHA256_USE_ASSEMBLY) && (defined(__x86_64__)||defined(__i386__))) + #ifdef SHA2_UNROLL_TRANSFORM /* Unrolled SHA-256 round macros: */ @@ -499,6 +509,8 @@ void SHA256_Transform(SHA256_CTX* context, const sha2_word32* data) { #endif /* SHA2_UNROLL_TRANSFORM */ +#endif // defined (SHA256_USE_ASSEMBLY) && (defined(__x86_64__)||defined(__i386__)) + void SHA256_Update(SHA256_CTX* context, const sha2_byte *data, size_t len) { unsigned int freespace, usedspace; @@ -521,7 +533,11 @@ void SHA256_Update(SHA256_CTX* context, const sha2_byte *data, size_t len) { context->bitcount += freespace << 3; len -= freespace; data += freespace; +#if defined (SHA256_USE_ASSEMBLY) && (defined(__x86_64__)||defined(__i386__)) + SHA256_Transform(context, (sha2_word32*)context->buffer, 1); +#else SHA256_Transform(context, (sha2_word32*)context->buffer); +#endif } else { /* The buffer is not yet full */ bcopy(data, &context->buffer[usedspace], len); @@ -531,6 +547,17 @@ void SHA256_Update(SHA256_CTX* context, const sha2_byte *data, size_t len) { return; } } +#if defined (SHA256_USE_ASSEMBLY) && (defined(__x86_64__)||defined(__i386__)) + { + unsigned int kk = len/SHA256_BLOCK_LENGTH; + if (kk>0) { + SHA256_Transform(context, (const sha2_word32*)data, kk); + context->bitcount += (SHA256_BLOCK_LENGTH << 3)*kk; + len -= SHA256_BLOCK_LENGTH*kk; + data += SHA256_BLOCK_LENGTH*kk; + } + } +#else while (len >= SHA256_BLOCK_LENGTH) { /* Process as many complete blocks as we can */ SHA256_Transform(context, (const sha2_word32*)data); @@ -538,6 +565,7 @@ void SHA256_Update(SHA256_CTX* context, const sha2_byte *data, size_t len) { len -= SHA256_BLOCK_LENGTH; data += SHA256_BLOCK_LENGTH; } +#endif if (len > 0) { /* There's left-overs, so save 'em */ bcopy(data, context->buffer, len); @@ -573,7 +601,11 @@ void SHA256_Final(sha2_byte digest[], SHA256_CTX* context) { bzero(&context->buffer[usedspace], SHA256_BLOCK_LENGTH - usedspace); } /* Do second-to-last transform: */ +#if defined (SHA256_USE_ASSEMBLY) && (defined(__x86_64__)||defined(__i386__)) + SHA256_Transform(context, (sha2_word32*)context->buffer, 1); +#else SHA256_Transform(context, (sha2_word32*)context->buffer); +#endif /* And set-up for the last transform: */ bzero(context->buffer, SHA256_SHORT_BLOCK_LENGTH); @@ -589,7 +621,11 @@ void SHA256_Final(sha2_byte digest[], SHA256_CTX* context) { *(sha2_word64*)&context->buffer[SHA256_SHORT_BLOCK_LENGTH] = context->bitcount; /* Final transform: */ +#if defined (SHA256_USE_ASSEMBLY) && (defined(__x86_64__)||defined(__i386__)) + SHA256_Transform(context, (sha2_word32*)context->buffer, 1); +#else SHA256_Transform(context, 
(sha2_word32*)context->buffer); +#endif #if BYTE_ORDER == LITTLE_ENDIAN { diff --git a/bsd/dev/Makefile b/bsd/dev/Makefile index b2f00140a..01f00592f 100644 --- a/bsd/dev/Makefile +++ b/bsd/dev/Makefile @@ -9,16 +9,12 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/dev/chud/chud_bsd_callback.c b/bsd/dev/chud/chud_bsd_callback.c index 6fad80050..a28bebf46 100644 --- a/bsd/dev/chud/chud_bsd_callback.c +++ b/bsd/dev/chud/chud_bsd_callback.c @@ -36,15 +36,10 @@ #include <sys/systm.h> /* struct sysent */ #include <sys/sysproto.h> #include <sys/kdebug.h> /* KDEBUG_ENABLE_CHUD */ +#include <sys/kauth.h> /* kauth_cred_get */ #include <libkern/OSAtomic.h> - -#ifdef __ppc__ -#include <ppc/savearea.h> - -#define FM_ARG0 0x38ULL // offset from r1 to first argument -#define SPILLED_WORD_COUNT 7 // number of 32-bit words spilled to the stack - -extern struct savearea * find_user_regs( thread_t act); +#if CONFIG_MACF +#include <security/mac_framework.h> /* mac_system_check_chud */ #endif #pragma mark **** kern debug **** @@ -87,8 +82,6 @@ chudxnu_kdebug_callback_enter(chudxnu_kdebug_callback_func_t func) (void * volatile *)&kdebug_callback_fn)) { kdbg_control_chud(TRUE, (void *)chudxnu_private_kdebug_callback); - OSBitOrAtomic((UInt32)KDEBUG_ENABLE_CHUD, (volatile UInt32 *)&kdebug_enable); - return KERN_SUCCESS; } return KERN_FAILURE; @@ -97,7 +90,6 @@ chudxnu_kdebug_callback_enter(chudxnu_kdebug_callback_func_t func) __private_extern__ kern_return_t chudxnu_kdebug_callback_cancel(void) { - OSBitAndAtomic((UInt32)~(KDEBUG_ENABLE_CHUD), (volatile UInt32 *)&kdebug_enable); kdbg_control_chud(FALSE, NULL); chudxnu_kdebug_callback_func_t old = kdebug_callback_fn; @@ -175,40 +167,18 @@ static kern_return_t chud_null_syscall(uint64_t code __unused, int chud(__unused proc_t p, struct chud_args *uap, int32_t *retval) { +#if CONFIG_MACF + int error = mac_system_check_chud(kauth_cred_get()); + if (error) + return error; +#endif + chudxnu_syscall_callback_func_t fn = syscall_callback_fn; if(!fn) { return EINVAL; } -#ifdef __ppc__ - // ppc32 user land spills 2.5 64-bit args (5 x 32-bit) to the stack - // here we have to copy them out. r1 is the stack pointer in this world. - // the offset is calculated according to the PPC32 ABI - // Important: this only happens for 32-bit user threads - - if(!IS_64BIT_PROCESS(p)) { - struct savearea *regs = find_user_regs(current_thread()); - if(!regs) { - return EINVAL; - } - - // %r1 is the stack pointer on ppc32 - uint32_t stackPointer = regs->save_r1; - - // calculate number of bytes spilled to the stack - uint32_t spilledSize = sizeof(struct chud_args) - (sizeof(uint32_t) * SPILLED_WORD_COUNT); - - // obtain offset to arguments spilled onto user-thread stack - user_addr_t incomingAddr = (user_addr_t)stackPointer + FM_ARG0; - - // destination is halfway through arg3 - uint8_t *dstAddr = (uint8_t*)(&(uap->arg3)) + sizeof(uint32_t); - - copyin(incomingAddr, dstAddr, spilledSize); - } -#endif - *retval = fn(uap->code, uap->arg1, uap->arg2, uap->arg3, uap->arg4, uap->arg5); return 0; diff --git a/bsd/dev/dtrace/dtrace.c b/bsd/dev/dtrace/dtrace.c index 081f70dc3..745a0fa01 100644 --- a/bsd/dev/dtrace/dtrace.c +++ b/bsd/dev/dtrace/dtrace.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. */ @@ -100,6 +100,7 @@ #include <sys/systm.h> #include <sys/dtrace_impl.h> #include <sys/param.h> +#include <sys/proc_internal.h> #include <sys/ioctl.h> #include <sys/fcntl.h> #include <miscfs/devfs/devfs.h> @@ -112,13 +113,17 @@ #include <sys/user.h> #include <mach/exception_types.h> #include <sys/signalvar.h> +#include <mach/task.h> #include <kern/zalloc.h> #include <kern/ast.h> #include <netinet/in.h> #if defined(__APPLE__) +#include <kern/cpu_data.h> extern uint32_t pmap_find_phys(void *, uint64_t); extern boolean_t pmap_valid_page(uint32_t); +extern void OSKextRegisterKextsWithDTrace(void); +extern kmod_info_t g_kernel_kmod_info; #endif /* __APPLE__ */ @@ -140,6 +145,7 @@ extern void dtrace_postinit(void); extern kern_return_t chudxnu_dtrace_callback (uint64_t selector, uint64_t *args, uint32_t count); + #endif /* __APPLE__ */ /* @@ -170,7 +176,7 @@ size_t dtrace_global_maxsize = (16 * 1024); size_t dtrace_actions_max = (16 * 1024); size_t dtrace_retain_max = 1024; dtrace_optval_t dtrace_helper_actions_max = 32; -dtrace_optval_t dtrace_helper_providers_max = 32; +dtrace_optval_t dtrace_helper_providers_max = 64; dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024); size_t dtrace_strsize_default = 256; dtrace_optval_t dtrace_cleanrate_default = 9900990; /* 101 hz */ @@ -238,6 +244,12 @@ static dtrace_genid_t dtrace_retained_gen; /* current retained enab gen */ static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */ #if defined(__APPLE__) static int dtrace_dof_mode; /* See dtrace_impl.h for a description of Darwin's dof modes. */ + + /* + * This does't quite fit as an internal variable, as it must be accessed in + * fbt_provide and sdt_provide. Its clearly not a dtrace tunable variable either... + */ +int dtrace_kernel_symbol_mode; /* See dtrace_impl.h for a description of Darwin's kernel symbol modes. */ #endif #if defined(__APPLE__) @@ -249,6 +261,8 @@ static int dtrace_dof_mode; /* See dtrace_impl.h for a description of Darwin's */ struct zone *dtrace_probe_t_zone; + +static int dtrace_module_unloaded(struct kmod_info *kmod); #endif /* __APPLE__ */ /* @@ -328,10 +342,16 @@ static void dtrace_nullop(void) {} +static int +dtrace_enable_nullop(void) +{ + return (0); +} + static dtrace_pops_t dtrace_provider_ops = { (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop, (void (*)(void *, struct modctl *))dtrace_nullop, - (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, + (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop, (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, @@ -429,8 +449,8 @@ static lck_mtx_t dtrace_errlock; (where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \ (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \ } -#else -#if (defined(__x86_64__) || defined(__ppc64__)) +#else +#if defined(__x86_64__) /* FIXME: two function calls!! */ #define DTRACE_TLS_THRKEY(where) { \ uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \ @@ -542,12 +562,11 @@ dtrace_load##bits(uintptr_t addr) \ return (!(*flags & CPU_DTRACE_FAULT) ? 
rval : 0); \ } #else /* __APPLE__ */ -#define RECOVER_LABEL(bits) __asm__ volatile("_dtraceLoadRecover" #bits ":" ); +#define RECOVER_LABEL(bits) dtraceLoadRecover##bits: #if (defined(__i386__) || defined (__x86_64__)) #define DTRACE_LOADFUNC(bits) \ /*CSTYLED*/ \ -extern vm_offset_t dtraceLoadRecover##bits; \ uint##bits##_t dtrace_load##bits(uintptr_t addr); \ \ uint##bits##_t \ @@ -578,7 +597,7 @@ dtrace_load##bits(uintptr_t addr) \ } \ \ { \ - volatile vm_offset_t recover = (vm_offset_t)&dtraceLoadRecover##bits; \ + volatile vm_offset_t recover = (vm_offset_t)&&dtraceLoadRecover##bits; \ *flags |= CPU_DTRACE_NOFAULT; \ recover = dtrace_set_thread_recover(current_thread(), recover); \ /*CSTYLED*/ \ @@ -598,7 +617,6 @@ dtrace_load##bits(uintptr_t addr) \ #else /* all other architectures */ #define DTRACE_LOADFUNC(bits) \ /*CSTYLED*/ \ -extern vm_offset_t dtraceLoadRecover##bits; \ uint##bits##_t dtrace_load##bits(uintptr_t addr); \ \ uint##bits##_t \ @@ -629,7 +647,7 @@ dtrace_load##bits(uintptr_t addr) \ } \ \ { \ - volatile vm_offset_t recover = (vm_offset_t)&dtraceLoadRecover##bits; \ + volatile vm_offset_t recover = (vm_offset_t)&&dtraceLoadRecover##bits; \ *flags |= CPU_DTRACE_NOFAULT; \ recover = dtrace_set_thread_recover(current_thread(), recover); \ /*CSTYLED*/ \ @@ -654,6 +672,7 @@ dtrace_load##bits(uintptr_t addr) \ #define DTRACE_DYNHASH_SINK 1 #define DTRACE_DYNHASH_VALID 2 +#define DTRACE_MATCH_FAIL -1 #define DTRACE_MATCH_NEXT 0 #define DTRACE_MATCH_DONE 1 #define DTRACE_ANCHORED(probe) ((probe)->dtpr_func[0] != '\0') @@ -1291,12 +1310,12 @@ dtrace_priv_proc_common_user(dtrace_state_t *state) #else if ((cr = dtrace_CRED()) != NULL && #endif /* __APPLE__ */ - s_cr->cr_uid == cr->cr_uid && - s_cr->cr_uid == cr->cr_ruid && - s_cr->cr_uid == cr->cr_suid && - s_cr->cr_gid == cr->cr_gid && - s_cr->cr_gid == cr->cr_rgid && - s_cr->cr_gid == cr->cr_sgid) + posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_uid && + posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_ruid && + posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_suid && + posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_gid && + posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_rgid && + posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_sgid) return (1); return (0); @@ -4946,15 +4965,20 @@ next: #if !defined(__APPLE__) ipaddr_t ip4; #else - in_addr_t ip4; + uint32_t ip4; #endif /* __APPLE__ */ uint8_t *ptr8, val; /* * Safely load the IPv4 address. */ +#if !defined(__APPLE__) ip4 = dtrace_load32(tupregs[argi].dttk_value); - +#else + dtrace_bcopy( + (void *)(uintptr_t)tupregs[argi].dttk_value, + (void *)(uintptr_t)&ip4, sizeof (ip4)); +#endif /* __APPLE__ */ /* * Check an IPv4 string will fit in scratch. */ @@ -6180,7 +6204,7 @@ dtrace_action_raise(uint64_t sig) if (uthread && uthread->t_dtrace_sig == 0) { uthread->t_dtrace_sig = sig; - astbsd_on(); + act_set_astbsd(current_thread()); } #endif /* __APPLE__ */ } @@ -6198,21 +6222,55 @@ dtrace_action_stop(void) aston(curthread); } #else - uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread()); - - if (uthread && uthread->t_dtrace_stop == 0) { + uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread()); + if (uthread) { + /* + * The currently running process will be set to task_suspend + * when it next leaves the kernel. 
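+	 *
+	 * A sketch of the consumer side (simplified; the AST handler itself
+	 * is not part of this diff):
+	 *
+	 *	if (uthread->t_dtrace_stop) {
+	 *		uthread->t_dtrace_stop = 0;
+	 *		task_suspend(current_task());
+	 *	}
+	 *
+	 * dtrace_action_pidresume() below supplies the matching resume.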
+ */ uthread->t_dtrace_stop = 1; - astbsd_on(); + act_set_astbsd(current_thread()); } + #endif /* __APPLE__ */ } +#if defined(__APPLE__) +static void +dtrace_action_pidresume(uint64_t pid) +{ + if (dtrace_destructive_disallow) + return; + + if (kauth_cred_issuser(kauth_cred_get()) == 0) { + DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); + return; + } + + uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread()); + + /* + * When the currently running process leaves the kernel, it attempts to + * task_resume the process (denoted by pid), if that pid appears to have + * been stopped by dtrace_action_stop(). + * The currently running process has a pidresume() queue depth of 1 -- + * subsequent invocations of the pidresume() action are ignored. + */ + + if (pid != 0 && uthread && uthread->t_dtrace_resumepid == 0) { + uthread->t_dtrace_resumepid = pid; + act_set_astbsd(current_thread()); + } +} +#endif /* __APPLE__ */ + + static void dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val) { hrtime_t now; volatile uint16_t *flags; - cpu_t *cpu = CPU; + dtrace_cpu_t *cpu = CPU; if (dtrace_destructive_disallow) return; @@ -6601,17 +6659,21 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1, ASSERT(s_cr != NULL); + /* + * XXX this is hackish, but so is setting a variable + * XXX in a McCarthy OR... + */ #if !defined(__APPLE__) if ((cr = CRED()) == NULL || #else if ((cr = dtrace_CRED()) == NULL || #endif /* __APPLE__ */ - s_cr->cr_uid != cr->cr_uid || - s_cr->cr_uid != cr->cr_ruid || - s_cr->cr_uid != cr->cr_suid || - s_cr->cr_gid != cr->cr_gid || - s_cr->cr_gid != cr->cr_rgid || - s_cr->cr_gid != cr->cr_sgid || + posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_uid || + posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_ruid || + posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_suid || + posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_gid || + posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_rgid || + posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_sgid || #if !defined(__APPLE__) (proc = ttoproc(curthread)) == NULL || (proc->p_flag & SNOCD)) @@ -6868,6 +6930,13 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1, dtrace_action_raise(val); continue; +#if defined(__APPLE__) + case DTRACEACT_PIDRESUME: + if (dtrace_priv_proc_destructive(state)) + dtrace_action_pidresume(val); + continue; +#endif /* __APPLE__ */ + case DTRACEACT_COMMIT: ASSERT(!committed); @@ -7126,12 +7195,13 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1, on some function in the transitive closure of the call to dtrace_probe(). Solaris has some strong guarantees that this won't happen, the Darwin implementation is not so mature as to make those guarantees. 
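 *
 * The disable_preemption()/enable_preemption() pair added below brackets
 * __dtrace_probe() so that the firing thread stays on a single CPU for
 * the duration of the probe; the per-CPU buffers and flag words touched
 * while a probe fires assume the CPU does not change underneath them.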
*/ + void dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4) { thread_t thread = current_thread(); - + disable_preemption(); if (id == dtrace_probeid_error) { __dtrace_probe(id, arg0, arg1, arg2, arg3, arg4); dtrace_getipl(); /* Defeat tail-call optimization of __dtrace_probe() */ @@ -7143,6 +7213,7 @@ dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1, #if DEBUG else __dtrace_probe(dtrace_probeid_error, 0, id, 1, -1, DTRACEFLT_UNKNOWN); #endif + enable_preemption(); } #endif /* __APPLE__ */ @@ -7733,7 +7804,7 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid, { dtrace_probe_t template, *probe; dtrace_hash_t *hash = NULL; - int len, best = INT_MAX, nmatched = 0; + int len, rc, best = INT_MAX, nmatched = 0; dtrace_id_t i; lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED); @@ -7745,7 +7816,8 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid, if (pkp->dtpk_id != DTRACE_IDNONE) { if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL && dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) { - (void) (*matched)(probe, arg); + if ((*matched)(probe, arg) == DTRACE_MATCH_FAIL) + return (DTRACE_MATCH_FAIL); nmatched++; } return (nmatched); @@ -7802,8 +7874,11 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid, nmatched++; - if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT) - break; + if ((rc = (*matched)(probe, arg)) != DTRACE_MATCH_NEXT) { + if (rc == DTRACE_MATCH_FAIL) + return (DTRACE_MATCH_FAIL); + break; + } } return (nmatched); @@ -7822,8 +7897,11 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid, nmatched++; - if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT) - break; + if ((rc = (*matched)(probe, arg)) != DTRACE_MATCH_NEXT) { + if (rc == DTRACE_MATCH_FAIL) + return (DTRACE_MATCH_FAIL); + break; + } } return (nmatched); @@ -8051,7 +8129,7 @@ dtrace_unregister(dtrace_provider_id_t id) dtrace_probe_t *probe, *first = NULL; if (old->dtpv_pops.dtps_enable == - (void (*)(void *, dtrace_id_t, void *))dtrace_nullop) { + (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) { /* * If DTrace itself is the provider, we're called with locks * already held. @@ -8201,7 +8279,7 @@ dtrace_invalidate(dtrace_provider_id_t id) dtrace_provider_t *pvp = (dtrace_provider_t *)id; ASSERT(pvp->dtpv_pops.dtps_enable != - (void (*)(void *, dtrace_id_t, void *))dtrace_nullop); + (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop); lck_mtx_lock(&dtrace_provider_lock); lck_mtx_lock(&dtrace_lock); @@ -8242,7 +8320,7 @@ dtrace_condense(dtrace_provider_id_t id) * Make sure this isn't the dtrace provider itself. */ ASSERT(prov->dtpv_pops.dtps_enable != - (void (*)(void *, dtrace_id_t, void *))dtrace_nullop); + (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop); lck_mtx_lock(&dtrace_provider_lock); lck_mtx_lock(&dtrace_lock); @@ -8508,7 +8586,6 @@ dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv) { struct modctl *ctl; int all = 0; -#pragma unused(ctl) /* __APPLE__ */ lck_mtx_assert(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED); @@ -8516,22 +8593,22 @@ dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv) all = 1; prv = dtrace_provider; } - + do { /* * First, call the blanket provide operation. */ prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc); - -#if !defined(__APPLE__) + /* * Now call the per-module provide operation. We will grab * mod_lock to prevent the list from being modified. 
Note * that this also prevents the mod_busy bits from changing. * (mod_busy can only be changed with mod_lock held.) */ - mutex_enter(&mod_lock); - + lck_mtx_lock(&mod_lock); + +#if !defined(__APPLE__) ctl = &modules; do { if (ctl->mod_busy || ctl->mod_mp == NULL) @@ -8540,29 +8617,15 @@ dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv) prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl); } while ((ctl = ctl->mod_next) != &modules); - - mutex_exit(&mod_lock); #else -#if 0 /* FIXME: Workaround for PR_4643546 */ - /* NOTE: kmod_lock has been removed. */ - simple_lock(&kmod_lock); - - kmod_info_t *ktl = kmod; - while (ktl) { - prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ktl); - ktl = ktl->next; + ctl = dtrace_modctl_list; + while (ctl) { + prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl); + ctl = ctl->mod_next; } - - simple_unlock(&kmod_lock); -#else - /* - * Don't bother to iterate over the kmod list. At present only fbt - * offers a provide_module in its dtpv_pops, and then it ignores the - * module anyway. - */ - prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, NULL); #endif -#endif /* __APPLE__ */ + + lck_mtx_unlock(&mod_lock); } while (all && (prv = prv->dtpv_next) != NULL); } @@ -9295,7 +9358,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, break; default: - err += efunc(dp->dtdo_len - 1, "bad return size"); + err += efunc(dp->dtdo_len - 1, "bad return size\n"); } } @@ -10356,7 +10419,7 @@ dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe) return (ecb); } -static void +static int dtrace_ecb_enable(dtrace_ecb_t *ecb) { dtrace_probe_t *probe = ecb->dte_probe; @@ -10369,7 +10432,7 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb) /* * This is the NULL probe -- there's nothing to do. */ - return; + return(0); } if (probe->dtpr_ecb == NULL) { @@ -10383,8 +10446,8 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb) if (ecb->dte_predicate != NULL) probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid; - prov->dtpv_pops.dtps_enable(prov->dtpv_arg, - probe->dtpr_id, probe->dtpr_arg); + return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg, + probe->dtpr_id, probe->dtpr_arg)); } else { /* * This probe is already active. 
Swing the last pointer to @@ -10397,6 +10460,7 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb) probe->dtpr_predcache = 0; dtrace_sync(); + return(0); } } @@ -10860,6 +10924,9 @@ dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc) case DTRACEACT_CHILL: case DTRACEACT_DISCARD: case DTRACEACT_RAISE: +#if defined(__APPLE__) + case DTRACEACT_PIDRESUME: +#endif /* __APPLE__ */ if (dp == NULL) return (EINVAL); break; @@ -11196,7 +11263,9 @@ dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg) if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL) return (DTRACE_MATCH_DONE); - dtrace_ecb_enable(ecb); + if (dtrace_ecb_enable(ecb) < 0) + return (DTRACE_MATCH_FAIL); + return (DTRACE_MATCH_NEXT); } @@ -11313,7 +11382,7 @@ static int dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags, processorid_t cpu) { - cpu_t *cp; + dtrace_cpu_t *cp; dtrace_buffer_t *buf; lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED); @@ -12052,7 +12121,7 @@ static int dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched) { int i = 0; - int matched = 0; + int total_matched = 0, matched = 0; lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED); lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED); @@ -12063,7 +12132,14 @@ dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched) enab->dten_current = ep; enab->dten_error = 0; - matched += dtrace_probe_enable(&ep->dted_probe, enab); + /* + * If a provider failed to enable a probe then get out and + * let the consumer know we failed. + */ + if ((matched = dtrace_probe_enable(&ep->dted_probe, enab)) < 0) + return (EBUSY); + + total_matched += matched; if (enab->dten_error != 0) { /* @@ -12091,7 +12167,7 @@ dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched) enab->dten_probegen = dtrace_probegen; if (nmatched != NULL) - *nmatched = matched; + *nmatched = total_matched; return (0); } @@ -12351,16 +12427,22 @@ dtrace_dof_copyin(user_addr_t uarg, int *errp) #if !defined(__APPLE__) dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP); - if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0) { + if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 || + dof->dofh_loadsz != hdr.dofh_loadsz) { + kmem_free(dof, hdr.dofh_loadsz); + *errp = EFAULT; + return (NULL); + } #else dof = dt_kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP); - if (copyin(uarg, dof, hdr.dofh_loadsz) != 0) { + if (copyin(uarg, dof, hdr.dofh_loadsz) != 0 || + dof->dofh_loadsz != hdr.dofh_loadsz) { + dt_kmem_free_aligned(dof, hdr.dofh_loadsz); + *errp = EFAULT; + return (NULL); + } #endif - dt_kmem_free_aligned(dof, hdr.dofh_loadsz); - *errp = EFAULT; - return (NULL); - } return (dof); } @@ -16079,30 +16161,257 @@ dtrace_helpers_duplicate(proc_t *from, proc_t *to) /* * DTrace Hook Functions */ + +#if defined(__APPLE__) +/* + * Routines to manipulate the modctl list within dtrace + */ + +modctl_t *dtrace_modctl_list; + +static void +dtrace_modctl_add(struct modctl * newctl) +{ + struct modctl *nextp, *prevp; + + ASSERT(newctl != NULL); + lck_mtx_assert(&mod_lock, LCK_MTX_ASSERT_OWNED); + + // Insert new module at the front of the list, + + newctl->mod_next = dtrace_modctl_list; + dtrace_modctl_list = newctl; + + /* + * If a module exists with the same name, then that module + * must have been unloaded with enabled probes. We will move + * the unloaded module to the new module's stale chain and + * then stop traversing the list. + */ + + prevp = newctl; + nextp = newctl->mod_next; + + while (nextp != NULL) { + if (nextp->mod_loaded) { + /* This is a loaded module. Keep traversing. 
*/ + prevp = nextp; + nextp = nextp->mod_next; + continue; + } + else { + /* Found an unloaded module */ + if (strncmp (newctl->mod_modname, nextp->mod_modname, KMOD_MAX_NAME)) { + /* Names don't match. Keep traversing. */ + prevp = nextp; + nextp = nextp->mod_next; + continue; + } + else { + /* We found a stale entry, move it. We're done. */ + prevp->mod_next = nextp->mod_next; + newctl->mod_stale = nextp; + nextp->mod_next = NULL; + break; + } + } + } +} + +static modctl_t * +dtrace_modctl_lookup(struct kmod_info * kmod) +{ + lck_mtx_assert(&mod_lock, LCK_MTX_ASSERT_OWNED); + + struct modctl * ctl; + + for (ctl = dtrace_modctl_list; ctl; ctl=ctl->mod_next) { + if (ctl->mod_id == kmod->id) + return(ctl); + } + return (NULL); +} + +/* + * This routine is called from dtrace_module_unloaded(). + * It removes a modctl structure and its stale chain + * from the kext shadow list. + */ +static void +dtrace_modctl_remove(struct modctl * ctl) +{ + ASSERT(ctl != NULL); + lck_mtx_assert(&mod_lock, LCK_MTX_ASSERT_OWNED); + modctl_t *prevp, *nextp, *curp; + + // Remove stale chain first + for (curp=ctl->mod_stale; curp != NULL; curp=nextp) { + nextp = curp->mod_stale; + /* There should NEVER be user symbols allocated at this point */ + ASSERT(curp->mod_user_symbols == NULL); + kmem_free(curp, sizeof(modctl_t)); + } + + prevp = NULL; + curp = dtrace_modctl_list; + + while (curp != ctl) { + prevp = curp; + curp = curp->mod_next; + } + + if (prevp != NULL) { + prevp->mod_next = ctl->mod_next; + } + else { + dtrace_modctl_list = ctl->mod_next; + } + + /* There should NEVER be user symbols allocated at this point */ + ASSERT(ctl->mod_user_symbols == NULL); + + kmem_free (ctl, sizeof(modctl_t)); +} + +#endif /* __APPLE__ */ + +/* + * APPLE NOTE: The kext loader will call dtrace_module_loaded + * when the kext is loaded in memory, but before calling the + * kext's start routine. + * + * Return 0 on success + * Return -1 on failure + */ + +#if !defined (__APPLE__) static void dtrace_module_loaded(struct modctl *ctl) +#else +static int +dtrace_module_loaded(struct kmod_info *kmod) +#endif /* __APPLE__ */ { dtrace_provider_t *prv; - lck_mtx_lock(&dtrace_provider_lock); - lck_mtx_lock(&mod_lock); - #if !defined(__APPLE__) + mutex_enter(&dtrace_provider_lock); + mutex_enter(&mod_lock); + ASSERT(ctl->mod_busy); #else - /* FIXME: awaits kmod awareness PR_4648477. */ -#endif /* __APPLE__ */ + + /* + * If kernel symbols have been disabled, return immediately + * DTRACE_KERNEL_SYMBOLS_NEVER is a permanent mode, it is safe to test without holding locks + */ + if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER) + return 0; + + struct modctl *ctl = NULL; + if (!kmod || kmod->address == 0 || kmod->size == 0) + return(-1); + + lck_mtx_lock(&dtrace_provider_lock); + lck_mtx_lock(&mod_lock); + + /* + * Have we seen this kext before? + */ + ctl = dtrace_modctl_lookup(kmod); + + if (ctl != NULL) { + /* bail... 
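On the Darwin side the hook is keyed by kmod_info rather than modctl, and its int return is advisory. A sketch of the calling convention from the loader's side, using the hook pointer declared in dtrace_subr.c (the wrapper name is illustrative):

static void
example_notify_dtrace_of_load(struct kmod_info *ki)
{
        extern int (*dtrace_modload)(struct kmod_info *);

        /*
         * Called once the kext is resident but before its start routine
         * runs. A -1 return (bad kmod_info, duplicate modctl, allocation
         * failure) only means "no probes for this kext"; the load itself
         * still proceeds.
         */
        if (dtrace_modload != NULL && (*dtrace_modload)(ki) != 0) {
                /* nothing to unwind -- dtrace simply won't see it */
        }
}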
we already have this kext in the modctl list */ + lck_mtx_unlock(&mod_lock); + lck_mtx_unlock(&dtrace_provider_lock); + if (dtrace_err_verbose) + cmn_err(CE_WARN, "dtrace load module already exists '%s %u' is failing against '%s %u'", kmod->name, (uint_t)kmod->id, ctl->mod_modname, ctl->mod_id); + return(-1); + } + else { + ctl = kmem_alloc(sizeof(struct modctl), KM_SLEEP); + if (ctl == NULL) { + if (dtrace_err_verbose) + cmn_err(CE_WARN, "dtrace module load '%s %u' is failing ", kmod->name, (uint_t)kmod->id); + lck_mtx_unlock(&mod_lock); + lck_mtx_unlock(&dtrace_provider_lock); + return (-1); + } + ctl->mod_next = NULL; + ctl->mod_stale = NULL; + strlcpy (ctl->mod_modname, kmod->name, sizeof(ctl->mod_modname)); + ctl->mod_loadcnt = kmod->id; + ctl->mod_nenabled = 0; + ctl->mod_address = kmod->address; + ctl->mod_size = kmod->size; + ctl->mod_id = kmod->id; + ctl->mod_loaded = 1; + ctl->mod_flags = 0; + ctl->mod_user_symbols = NULL; + + /* + * Find the UUID for this module, if it has one + */ + kernel_mach_header_t* header = (kernel_mach_header_t *)ctl->mod_address; + struct load_command* load_cmd = (struct load_command *)&header[1]; + uint32_t i; + for (i = 0; i < header->ncmds; i++) { + if (load_cmd->cmd == LC_UUID) { + struct uuid_command* uuid_cmd = (struct uuid_command *)load_cmd; + memcpy(ctl->mod_uuid, uuid_cmd->uuid, sizeof(uuid_cmd->uuid)); + ctl->mod_flags |= MODCTL_HAS_UUID; + break; + } + load_cmd = (struct load_command *)((caddr_t)load_cmd + load_cmd->cmdsize); + } + + if (ctl->mod_address == g_kernel_kmod_info.address) { + ctl->mod_flags |= MODCTL_IS_MACH_KERNEL; + } + } + dtrace_modctl_add(ctl); + + /* + * We must hold the dtrace_lock to safely test non permanent dtrace_fbt_symbol_mode(s) + */ + lck_mtx_lock(&dtrace_lock); + + /* + * If the module does not have a valid UUID, we will not be able to find symbols for it from + * userspace. Go ahead and instrument it now. + */ + if (MOD_HAS_UUID(ctl) && (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE)) { + lck_mtx_unlock(&dtrace_lock); + lck_mtx_unlock(&mod_lock); + lck_mtx_unlock(&dtrace_provider_lock); + return 0; + } + + ctl->mod_flags |= MODCTL_HAS_KERNEL_SYMBOLS; + + lck_mtx_unlock(&dtrace_lock); +#endif /* __APPLE__ */ + /* * We're going to call each providers per-module provide operation * specifying only this module. */ for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next) - prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl); - + prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl); + +#if defined(__APPLE__) + /* + * The contract with the kext loader is that once this function has completed, + * it may delete kernel symbols at will. We must set this while still holding + * the mod_lock. + */ + ctl->mod_flags &= ~MODCTL_HAS_KERNEL_SYMBOLS; +#endif + lck_mtx_unlock(&mod_lock); lck_mtx_unlock(&dtrace_provider_lock); - + /* * If we have any retained enablings, we need to match against them. * Enabling probes requires that cpu_lock be held, and we cannot hold @@ -16112,17 +16421,22 @@ dtrace_module_loaded(struct modctl *ctl) * our task queue to do the match for us. 
*/ lck_mtx_lock(&dtrace_lock); - + if (dtrace_retained == NULL) { lck_mtx_unlock(&dtrace_lock); +#if !defined(__APPLE__) return; +#else + return 0; +#endif } - + +#if !defined(__APPLE__) (void) taskq_dispatch(dtrace_taskq, - (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP); - - lck_mtx_unlock(&dtrace_lock); - + (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP); + + mutex_exit(&dtrace_lock); + /* * And now, for a little heuristic sleaze: in general, we want to * match modules as soon as they load. However, we cannot guarantee @@ -16134,8 +16448,23 @@ dtrace_module_loaded(struct modctl *ctl) * just loaded may not be immediately instrumentable. */ delay(1); +#else + /* APPLE NOTE! + * + * The cpu_lock mentioned above is only held by dtrace code, Apple's xnu never actually + * holds it for any reason. Thus the comment above is invalid, we can directly invoke + * dtrace_enabling_matchall without jumping through all the hoops, and we can avoid + * the delay call as well. + */ + lck_mtx_unlock(&dtrace_lock); + + dtrace_enabling_matchall(); + + return 0; +#endif /* __APPLE__ */ } - + +#if !defined(__APPLE__) static void dtrace_module_unloaded(struct modctl *ctl) { @@ -16144,27 +16473,27 @@ dtrace_module_unloaded(struct modctl *ctl) template.dtpr_mod = ctl->mod_modname; - lck_mtx_lock(&dtrace_provider_lock); - lck_mtx_lock(&mod_lock); - lck_mtx_lock(&dtrace_lock); + mutex_enter(&dtrace_provider_lock); + mutex_enter(&mod_lock); + mutex_enter(&dtrace_lock); if (dtrace_bymod == NULL) { /* * The DTrace module is loaded (obviously) but not attached; * we don't have any work to do. */ - lck_mtx_unlock(&dtrace_provider_lock); - lck_mtx_unlock(&mod_lock); - lck_mtx_unlock(&dtrace_lock); + mutex_exit(&dtrace_provider_lock); + mutex_exit(&mod_lock); + mutex_exit(&dtrace_lock); return; } for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template); probe != NULL; probe = probe->dtpr_nextmod) { if (probe->dtpr_ecb != NULL) { - lck_mtx_unlock(&dtrace_provider_lock); - lck_mtx_unlock(&mod_lock); - lck_mtx_unlock(&dtrace_lock); + mutex_exit(&dtrace_provider_lock); + mutex_exit(&mod_lock); + mutex_exit(&dtrace_lock); /* * This shouldn't _actually_ be possible -- we're @@ -16222,25 +16551,185 @@ dtrace_module_unloaded(struct modctl *ctl) kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1); kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1); vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1); -#if !defined(__APPLE__) kmem_free(probe, sizeof (dtrace_probe_t)); -#else - zfree(dtrace_probe_t_zone, probe); -#endif /* __APPLE__ */ } - lck_mtx_unlock(&dtrace_lock); - lck_mtx_unlock(&mod_lock); - lck_mtx_unlock(&dtrace_provider_lock); + mutex_exit(&dtrace_lock); + mutex_exit(&mod_lock); + mutex_exit(&dtrace_provider_lock); } +#else /* __APPLE__ */ -void -dtrace_suspend(void) +/* + * Return 0 on success + * Return -1 on failure + */ +static int +dtrace_module_unloaded(struct kmod_info *kmod) { - dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend)); -} + dtrace_probe_t template, *probe, *first, *next; + dtrace_provider_t *prov; + struct modctl *ctl = NULL; + struct modctl *syncctl = NULL; + struct modctl *nextsyncctl = NULL; + int syncmode = 0; + + lck_mtx_lock(&dtrace_provider_lock); + lck_mtx_lock(&mod_lock); + lck_mtx_lock(&dtrace_lock); -void + if (kmod == NULL) { + syncmode = 1; + } + else { + ctl = dtrace_modctl_lookup(kmod); + if (ctl == NULL) + { + lck_mtx_unlock(&dtrace_lock); + lck_mtx_unlock(&mod_lock); + lck_mtx_unlock(&dtrace_provider_lock); + return (-1); + } + 
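A NULL kmod selects "sync mode": rather than targeting one kext, the routine sweeps the entire shadow list for stale entries (mod_address == 0) whose probes can finally be reclaimed. The only caller of that mode appears in dtrace_close() further below; its shape is simply:

        /* Last consumer is going away: reap probes orphaned by old kexts. */
        (void) dtrace_module_unloaded(NULL);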
ctl->mod_loaded = 0; + ctl->mod_address = 0; + ctl->mod_size = 0; + } + + if (dtrace_bymod == NULL) { + /* + * The DTrace module is loaded (obviously) but not attached; + * we don't have any work to do. + */ + if (ctl != NULL) + (void)dtrace_modctl_remove(ctl); + lck_mtx_unlock(&dtrace_provider_lock); + lck_mtx_unlock(&mod_lock); + lck_mtx_unlock(&dtrace_lock); + return(0); + } + + /* Syncmode set means we target and traverse entire modctl list. */ + if (syncmode) + nextsyncctl = dtrace_modctl_list; + +syncloop: + if (syncmode) + { + /* find a stale modctl struct */ + for (syncctl = nextsyncctl; syncctl != NULL; syncctl=syncctl->mod_next) { + if (syncctl->mod_address == 0) + break; + } + if (syncctl==NULL) + { + /* We have no more work to do */ + lck_mtx_unlock(&dtrace_provider_lock); + lck_mtx_unlock(&mod_lock); + lck_mtx_unlock(&dtrace_lock); + return(0); + } + else { + /* keep track of next syncctl in case this one is removed */ + nextsyncctl = syncctl->mod_next; + ctl = syncctl; + } + } + + template.dtpr_mod = ctl->mod_modname; + + for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template); + probe != NULL; probe = probe->dtpr_nextmod) { + if (probe->dtpr_ecb != NULL) { + /* + * This shouldn't _actually_ be possible -- we're + * unloading a module that has an enabled probe in it. + * (It's normally up to the provider to make sure that + * this can't happen.) However, because dtps_enable() + * doesn't have a failure mode, there can be an + * enable/unload race. Upshot: we don't want to + * assert, but we're not going to disable the + * probe, either. + */ + + + if (syncmode) { + /* We're syncing, let's look at next in list */ + goto syncloop; + } + + lck_mtx_unlock(&dtrace_provider_lock); + lck_mtx_unlock(&mod_lock); + lck_mtx_unlock(&dtrace_lock); + + if (dtrace_err_verbose) { + cmn_err(CE_WARN, "unloaded module '%s' had " + "enabled probes", ctl->mod_modname); + } + return(-1); + } + } + + probe = first; + + for (first = NULL; probe != NULL; probe = next) { + ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe); + + dtrace_probes[probe->dtpr_id - 1] = NULL; + + next = probe->dtpr_nextmod; + dtrace_hash_remove(dtrace_bymod, probe); + dtrace_hash_remove(dtrace_byfunc, probe); + dtrace_hash_remove(dtrace_byname, probe); + + if (first == NULL) { + first = probe; + probe->dtpr_nextmod = NULL; + } else { + probe->dtpr_nextmod = first; + first = probe; + } + } + + /* + * We've removed all of the module's probes from the hash chains and + * from the probe array. Now issue a dtrace_sync() to be sure that + * everyone has cleared out from any probe array processing. 
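The reclamation that follows leans on a publish/quiesce/free ordering. Collapsed to a single probe (the real code batches the unhooking of every probe before issuing one dtrace_sync()):

        dtrace_hash_remove(dtrace_bymod, probe);        /* 1. unpublish           */
        dtrace_sync();                                  /* 2. drain probe context */
        zfree(dtrace_probe_t_zone, probe);              /* 3. now safe to free    */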
+ */ + dtrace_sync(); + + for (probe = first; probe != NULL; probe = first) { + first = probe->dtpr_nextmod; + prov = probe->dtpr_provider; + prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id, + probe->dtpr_arg); + kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1); + kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1); + kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1); + vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1); + + zfree(dtrace_probe_t_zone, probe); + } + + dtrace_modctl_remove(ctl); + + if (syncmode) + goto syncloop; + + lck_mtx_unlock(&dtrace_lock); + lck_mtx_unlock(&mod_lock); + lck_mtx_unlock(&dtrace_provider_lock); + + return(0); +} +#endif /* __APPLE__ */ + +void +dtrace_suspend(void) +{ + dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend)); +} + +void dtrace_resume(void) { dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume)); @@ -16463,13 +16952,6 @@ dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) dtrace_provider, NULL, NULL, "END", 0, NULL); dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t) dtrace_provider, NULL, NULL, "ERROR", 1, NULL); -#elif defined(__ppc__) || defined(__ppc64__) - dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t) - dtrace_provider, NULL, NULL, "BEGIN", 2, NULL); - dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t) - dtrace_provider, NULL, NULL, "END", 1, NULL); - dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t) - dtrace_provider, NULL, NULL, "ERROR", 4, NULL); #elif (defined(__i386__) || defined (__x86_64__)) dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t) dtrace_provider, NULL, NULL, "BEGIN", 1, NULL); @@ -16505,6 +16987,15 @@ dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) if (dtrace_anon.dta_enabling != NULL) { ASSERT(dtrace_retained == dtrace_anon.dta_enabling); +#if defined(__APPLE__) + /* + * If there is anonymous dof, we should switch symbol modes. + */ + if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) { + dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL; + } +#endif + dtrace_enabling_provide(NULL); state = dtrace_anon.dta_state; @@ -16612,7 +17103,7 @@ dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) lck_mtx_unlock(&cpu_lock); if (state == NULL) { - if (--dtrace_opens == 0) + if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE); lck_mtx_unlock(&dtrace_lock); return (EAGAIN); @@ -16624,7 +17115,7 @@ dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) lck_mtx_unlock(&cpu_lock); if (rv != 0 || state == NULL) { - if (--dtrace_opens == 0) + if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE); lck_mtx_unlock(&dtrace_lock); /* propagate EAGAIN or ERESTART */ @@ -16656,6 +17147,27 @@ dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) } lck_rw_unlock_exclusive(&dtrace_dof_mode_lock); + + /* + * Update kernel symbol state. + * + * We must own the provider and dtrace locks. + * + * NOTE! It may appear there is a race by setting this value so late + * after dtrace_probe_provide. However, any kext loaded after the + * call to probe provide and before we set LAZY_OFF will be marked as + * eligible for symbols from userspace. The same dtrace that is currently + * calling dtrace_open() (this call!) will get a list of kexts needing + * symbols and fill them in, thus closing the race window. 
+ * + * We want to set this value only after it certain it will succeed, as + * this significantly reduces the complexity of error exits. + */ + lck_mtx_lock(&dtrace_lock); + if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) { + dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL; + } + lck_mtx_unlock(&dtrace_lock); #endif /* __APPLE__ */ return (0); @@ -16691,31 +17203,52 @@ dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p) dtrace_state_destroy(state); ASSERT(dtrace_opens > 0); - if (--dtrace_opens == 0) - (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE); + /* + * Only relinquish control of the kernel debugger interface when there + * are no consumers and no anonymous enablings. + */ + if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) + (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE); + lck_mtx_unlock(&dtrace_lock); lck_mtx_unlock(&cpu_lock); #if defined(__APPLE__) - /* * Lock ordering requires the dof mode lock be taken before * the dtrace_lock. */ lck_rw_lock_exclusive(&dtrace_dof_mode_lock); lck_mtx_lock(&dtrace_lock); + + if (dtrace_opens == 0) { + /* + * If we are currently lazy-off, and this is the last close, transition to + * lazy state. + */ + if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) { + dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON; + } - /* - * If we are currently lazy-off, and this is the last close, transition to - * lazy state. - */ - if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF && dtrace_opens == 0) { - dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON; + /* + * If we are the last dtrace client, switch back to lazy (from userspace) symbols + */ + if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_KERNEL) { + dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE; + } } - + lck_mtx_unlock(&dtrace_lock); lck_rw_unlock_exclusive(&dtrace_dof_mode_lock); + + /* + * Kext probes may be retained past the end of the kext's lifespan. The + * probes are kept until the last reference to them has been removed. + * Since closing an active dtrace context is likely to drop that last reference, + * lets take a shot at cleaning out the orphaned probes now. + */ + dtrace_module_unloaded(NULL); #endif /* __APPLE__ */ return (0); @@ -18437,8 +18970,254 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv return (0); } - default: - break; + case DTRACEIOC_MODUUIDSLIST: { + size_t module_uuids_list_size; + dtrace_module_uuids_list_t* uuids_list; + uint64_t dtmul_count; + + /* + * Fail if the kernel symbol mode makes this operation illegal. + * Both NEVER & ALWAYS_FROM_KERNEL are permanent states, it is legal to check + * for them without holding the dtrace_lock. + */ + if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER || + dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) { + cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_MODUUIDSLIST", dtrace_kernel_symbol_mode); + return (EPERM); + } + + /* + * Read the number of symbolsdesc structs being passed in. + */ + if (copyin(arg + offsetof(dtrace_module_uuids_list_t, dtmul_count), + &dtmul_count, + sizeof(dtmul_count))) { + cmn_err(CE_WARN, "failed to copyin dtmul_count"); + return (EFAULT); + } + + /* + * Range check the count. More than 2k kexts is probably an error. 
+ */ + if (dtmul_count > 2048) { + cmn_err(CE_WARN, "dtmul_count is not valid"); + return (EINVAL); + } + + /* + * For all queries, we return EINVAL when the user specified + * count does not match the actual number of modules we find + * available. + * + * If the user specified count is zero, then this serves as a + * simple query to count the available modules in need of symbols. + */ + + rval = 0; + + if (dtmul_count == 0) + { + lck_mtx_lock(&mod_lock); + struct modctl* ctl = dtrace_modctl_list; + while (ctl) { + ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl)); + if (!MOD_SYMBOLS_DONE(ctl)) { + dtmul_count++; + rval = EINVAL; + } + ctl = ctl->mod_next; + } + lck_mtx_unlock(&mod_lock); + + if (copyout(&dtmul_count, arg, sizeof (dtmul_count)) != 0) + return (EFAULT); + else + return (rval); + } + + /* + * If we reach this point, then we have a request for full list data. + * Allocate a correctly sized structure and copyin the data. + */ + module_uuids_list_size = DTRACE_MODULE_UUIDS_LIST_SIZE(dtmul_count); + if ((uuids_list = kmem_alloc(module_uuids_list_size, KM_SLEEP)) == NULL) + return (ENOMEM); + + /* NOTE! We can no longer exit this method via return */ + if (copyin(arg, uuids_list, module_uuids_list_size) != 0) { + cmn_err(CE_WARN, "failed copyin of dtrace_module_uuids_list_t"); + rval = EFAULT; + goto moduuidslist_cleanup; + } + + /* + * Check that the count didn't change between the first copyin and the second. + */ + if (uuids_list->dtmul_count != dtmul_count) { + rval = EINVAL; + goto moduuidslist_cleanup; + } + + /* + * Build the list of UUID's that need symbols + */ + lck_mtx_lock(&mod_lock); + + dtmul_count = 0; + + struct modctl* ctl = dtrace_modctl_list; + while (ctl) { + /* + * We assume that userspace symbols will be "better" than kernel level symbols, + * as userspace can search for dSYM(s) and symbol'd binaries. Even if kernel syms + * are available, add user syms if the module might use them. + */ + ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl)); + if (!MOD_SYMBOLS_DONE(ctl)) { + UUID* uuid = &uuids_list->dtmul_uuid[dtmul_count]; + if (dtmul_count++ < uuids_list->dtmul_count) { + memcpy(uuid, ctl->mod_uuid, sizeof(UUID)); + } + } + ctl = ctl->mod_next; + } + + lck_mtx_unlock(&mod_lock); + + if (uuids_list->dtmul_count < dtmul_count) + rval = EINVAL; + + uuids_list->dtmul_count = dtmul_count; + + /* + * Copyout the symbols list (or at least the count!) + */ + if (copyout(uuids_list, arg, module_uuids_list_size) != 0) { + cmn_err(CE_WARN, "failed copyout of dtrace_symbolsdesc_list_t"); + rval = EFAULT; + } + + moduuidslist_cleanup: + /* + * If we had to allocate struct memory, free it. + */ + if (uuids_list != NULL) { + kmem_free(uuids_list, module_uuids_list_size); + } + + return rval; + } + + case DTRACEIOC_PROVMODSYMS: { + size_t module_symbols_size; + dtrace_module_symbols_t* module_symbols; + uint64_t dtmodsyms_count; + + /* + * Fail if the kernel symbol mode makes this operation illegal. + * Both NEVER & ALWAYS_FROM_KERNEL are permanent states, it is legal to check + * for them without holding the dtrace_lock. + */ + if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER || + dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) { + cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_PROVMODSYMS", dtrace_kernel_symbol_mode); + return (EPERM); + } + + /* + * Read the number of module symbols structs being passed in. 
+ */ + if (copyin(arg + offsetof(dtrace_module_symbols_t, dtmodsyms_count), + &dtmodsyms_count, + sizeof(dtmodsyms_count))) { + cmn_err(CE_WARN, "failed to copyin dtmodsyms_count"); + return (EFAULT); + } + + /* + * Range check the count. How much data can we pass around? + * FIX ME! + */ + if (dtmodsyms_count == 0 || (dtmodsyms_count > 100 * 1024)) { + cmn_err(CE_WARN, "dtmodsyms_count is not valid"); + return (EINVAL); + } + + /* + * Allocate a correctly sized structure and copyin the data. + */ + module_symbols_size = DTRACE_MODULE_SYMBOLS_SIZE(dtmodsyms_count); + if ((module_symbols = kmem_alloc(module_symbols_size, KM_SLEEP)) == NULL) + return (ENOMEM); + + rval = 0; + + /* NOTE! We can no longer exit this method via return */ + if (copyin(arg, module_symbols, module_symbols_size) != 0) { + cmn_err(CE_WARN, "failed copyin of dtrace_module_symbols_t, symbol count %llu", module_symbols->dtmodsyms_count); + rval = EFAULT; + goto module_symbols_cleanup; + } + + /* + * Check that the count didn't change between the first copyin and the second. + */ + if (module_symbols->dtmodsyms_count != dtmodsyms_count) { + rval = EINVAL; + goto module_symbols_cleanup; + } + + /* + * Find the modctl to add symbols to. + */ + lck_mtx_lock(&dtrace_provider_lock); + lck_mtx_lock(&mod_lock); + + struct modctl* ctl = dtrace_modctl_list; + while (ctl) { + ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl)); + if (MOD_HAS_UUID(ctl) && !MOD_SYMBOLS_DONE(ctl)) { + if (memcmp(module_symbols->dtmodsyms_uuid, ctl->mod_uuid, sizeof(UUID)) == 0) { + /* BINGO! */ + ctl->mod_user_symbols = module_symbols; + break; + } + } + ctl = ctl->mod_next; + } + + if (ctl) { + dtrace_provider_t *prv; + + /* + * We're going to call each providers per-module provide operation + * specifying only this module. + */ + for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next) + prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl); + + /* + * We gave every provider a chance to provide with the user syms, go ahead and clear them + */ + ctl->mod_user_symbols = NULL; /* MUST reset this to clear HAS_USERSPACE_SYMBOLS */ + } + + lck_mtx_unlock(&mod_lock); + lck_mtx_unlock(&dtrace_provider_lock); + + module_symbols_cleanup: + /* + * If we had to allocate struct memory, free it. + */ + if (module_symbols != NULL) { + kmem_free(module_symbols, module_symbols_size); + } + + return rval; + } + + default: + break; } return (ENOTTY); @@ -18912,12 +19691,14 @@ dtrace_init( void ) lck_mtx_init(&cpu_lock, dtrace_lck_grp, dtrace_lck_attr); lck_mtx_init(&mod_lock, dtrace_lck_grp, dtrace_lck_attr); + dtrace_modctl_list = NULL; + cpu_core = (cpu_core_t *)kmem_zalloc( ncpu * sizeof(cpu_core_t), KM_SLEEP ); for (i = 0; i < ncpu; ++i) { lck_mtx_init(&cpu_core[i].cpuc_pid_lock, dtrace_lck_grp, dtrace_lck_attr); } - cpu_list = (cpu_t *)kmem_zalloc( ncpu * sizeof(cpu_t), KM_SLEEP ); + cpu_list = (dtrace_cpu_t *)kmem_zalloc( ncpu * sizeof(dtrace_cpu_t), KM_SLEEP ); for (i = 0; i < ncpu; ++i) { cpu_list[i].cpu_id = (processorid_t)i; cpu_list[i].cpu_next = &(cpu_list[(i+1) % ncpu]); @@ -18965,6 +19746,14 @@ dtrace_init( void ) break; } + /* + * See dtrace_impl.h for a description of kernel symbol modes. + * The default is to wait for symbols from userspace (lazy symbols). 
+ */ + if (!PE_parse_boot_argn("dtrace_kernel_symbol_mode", &dtrace_kernel_symbol_mode, sizeof (dtrace_kernel_symbol_mode))) { + dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE; + } + gDTraceInited = 1; } else @@ -18974,12 +19763,29 @@ dtrace_init( void ) void dtrace_postinit(void) { - /* - * Called from bsd_init after all provider's *_init() routines have been - * run. That way, anonymous DOF enabled under dtrace_attach() is safe - * to go. - */ - dtrace_attach( (dev_info_t *)(uintptr_t)makedev(gMajDevNo, 0), 0 ); /* Punning a dev_t to a dev_info_t* */ + /* + * Called from bsd_init after all provider's *_init() routines have been + * run. That way, anonymous DOF enabled under dtrace_attach() is safe + * to go. + */ + dtrace_attach( (dev_info_t *)(uintptr_t)makedev(gMajDevNo, 0), 0 ); /* Punning a dev_t to a dev_info_t* */ + + /* + * Add the mach_kernel to the module list for lazy processing + */ + struct kmod_info fake_kernel_kmod; + memset(&fake_kernel_kmod, 0, sizeof(fake_kernel_kmod)); + + strlcpy(fake_kernel_kmod.name, "mach_kernel", sizeof(fake_kernel_kmod.name)); + fake_kernel_kmod.id = 1; + fake_kernel_kmod.address = g_kernel_kmod_info.address; + fake_kernel_kmod.size = g_kernel_kmod_info.size; + + if (dtrace_module_loaded(&fake_kernel_kmod) != 0) { + printf("dtrace_postinit: Could not register mach_kernel modctl\n"); + } + + (void)OSKextRegisterKextsWithDTrace(); } #undef DTRACE_MAJOR diff --git a/bsd/dev/dtrace/dtrace_glue.c b/bsd/dev/dtrace/dtrace_glue.c index 6d4586e2c..a046e3eac 100644 --- a/bsd/dev/dtrace/dtrace_glue.c +++ b/bsd/dev/dtrace/dtrace_glue.c @@ -227,7 +227,7 @@ done: lck_mtx_t cpu_lock; lck_mtx_t mod_lock; -cpu_t *cpu_list; +dtrace_cpu_t *cpu_list; cpu_core_t *cpu_core; /* XXX TLB lockdown? */ /* @@ -267,41 +267,38 @@ PRIV_POLICY_ONLY(void *cr, int priv, int boolean) return kauth_cred_issuser(cr); /* XXX TODO: HAS_PRIVILEGE(cr, priv); */ } +/* XXX Get around const poisoning using structure assigns */ gid_t -crgetgid(const cred_t *cr) { return cr->cr_groups[0]; } +crgetgid(const cred_t *cr) { cred_t copy_cr = *cr; return kauth_cred_getgid(©_cr); } uid_t -crgetuid(const cred_t *cr) { return cr->cr_uid; } +crgetuid(const cred_t *cr) { cred_t copy_cr = *cr; return kauth_cred_getuid(©_cr); } /* * "cyclic" */ /* osfmk/kern/timer_call.h */ -typedef void *call_entry_param_t; -typedef void (*call_entry_func_t)( - call_entry_param_t param0, - call_entry_param_t param1); - -typedef struct call_entry { - queue_chain_t q_link; - call_entry_func_t func; - call_entry_param_t param0; - call_entry_param_t param1; - uint64_t deadline; - enum { - IDLE, - PENDING, - DELAYED } state; -} call_entry_data_t; - - -typedef struct call_entry *timer_call_t; typedef void *timer_call_param_t; typedef void (*timer_call_func_t)( timer_call_param_t param0, timer_call_param_t param1); +typedef struct timer_call { + queue_chain_t q_link; + queue_t queue; + timer_call_func_t func; + timer_call_param_t param0; + timer_call_param_t param1; + decl_simple_lock_data(,lock); + uint64_t deadline; + uint64_t soft_deadline; + uint32_t flags; + boolean_t async_dequeue; +} timer_call_data_t; + +typedef struct timer_call *timer_call_t; + extern void timer_call_setup( timer_call_t call, @@ -312,7 +309,13 @@ extern boolean_t timer_call_enter1( timer_call_t call, timer_call_param_t param1, - uint64_t deadline); + uint64_t deadline, + uint32_t flags); + +#ifndef TIMER_CALL_CRITICAL +#define TIMER_CALL_CRITICAL 0x1 +#define TIMER_CALL_LOCAL 0x2 +#endif /* TIMER_CALL_CRITICAL */ extern boolean_t 
timer_call_cancel( @@ -322,7 +325,7 @@ typedef struct wrap_timer_call { cyc_handler_t hdlr; cyc_time_t when; uint64_t deadline; - struct call_entry call; + struct timer_call call; } wrap_timer_call_t; #define WAKEUP_REAPER 0x7FFFFFFFFFFFFFFFLL @@ -337,7 +340,7 @@ _timer_call_apply_cyclic( void *ignore, void *vTChdl ) (*(wrapTC->hdlr.cyh_func))( wrapTC->hdlr.cyh_arg ); clock_deadline_for_periodic_event( wrapTC->when.cyt_interval, mach_absolute_time(), &(wrapTC->deadline) ); - timer_call_enter1( &(wrapTC->call), (void *)wrapTC, wrapTC->deadline ); + timer_call_enter1( &(wrapTC->call), (void *)wrapTC, wrapTC->deadline, TIMER_CALL_CRITICAL | TIMER_CALL_LOCAL ); /* Did timer_call_remove_cyclic request a wakeup call when this timer call was re-armed? */ if (wrapTC->when.cyt_interval == WAKEUP_REAPER) @@ -359,7 +362,7 @@ timer_call_add_cyclic(wrap_timer_call_t *wrapTC, cyc_handler_t *handler, cyc_tim wrapTC->deadline = now; clock_deadline_for_periodic_event( wrapTC->when.cyt_interval, now, &(wrapTC->deadline) ); - timer_call_enter1( &(wrapTC->call), (void *)wrapTC, wrapTC->deadline ); + timer_call_enter1( &(wrapTC->call), (void *)wrapTC, wrapTC->deadline, TIMER_CALL_CRITICAL | TIMER_CALL_LOCAL ); return (cyclic_id_t)wrapTC; } diff --git a/bsd/dev/dtrace/dtrace_subr.c b/bsd/dev/dtrace/dtrace_subr.c index 3d8e65309..c3a69c48f 100644 --- a/bsd/dev/dtrace/dtrace_subr.c +++ b/bsd/dev/dtrace/dtrace_subr.c @@ -49,11 +49,14 @@ int (*dtrace_fasttrap_probe_ptr)(struct regs *); * They're assigned in dtrace.c but Darwin never calls them. */ void (*dtrace_cpu_init)(processorid_t); +#if !defined(__APPLE__) void (*dtrace_modload)(struct modctl *); void (*dtrace_modunload)(struct modctl *); -#if defined(__APPLE__) +#else +int (*dtrace_modload)(struct kmod_info *); +int (*dtrace_modunload)(struct kmod_info *); void (*dtrace_helpers_cleanup)(proc_t *); -#endif +#endif /*__APPLE__*/ void (*dtrace_helpers_fork)(proc_t *, proc_t *); void (*dtrace_cpustart_init)(void); void (*dtrace_cpustart_fini)(void); diff --git a/bsd/dev/dtrace/fasttrap.c b/bsd/dev/dtrace/fasttrap.c index 814778290..f75e9df72 100644 --- a/bsd/dev/dtrace/fasttrap.c +++ b/bsd/dev/dtrace/fasttrap.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -961,7 +961,7 @@ fasttrap_disable_callbacks(void) ASSERT(fasttrap_pid_count > 0); fasttrap_pid_count--; if (fasttrap_pid_count == 0) { - cpu_t *cur, *cpu = CPU; + dtrace_cpu_t *cur, *cpu = CPU; /* * APPLE NOTE: This loop seems broken, it touches every CPU @@ -987,7 +987,7 @@ fasttrap_disable_callbacks(void) } /*ARGSUSED*/ -static void +static int fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg, id) @@ -1016,7 +1016,7 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) * provider can't go away while we're in this code path. */ if (probe->ftp_prov->ftp_retired) - return; + return(0); /* * If we can't find the process, it may be that we're in the context of @@ -1030,11 +1030,11 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) * does not return process's with SIDL set, but we always return * the child process. */ - return; + return(0); #else if ((curproc->p_flag & SFORKING) == 0) - return; + return(0); lck_mtx_lock(&pidlock); p = prfind(probe->ftp_pid); @@ -1109,7 +1109,7 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) * drop our reference on the trap table entry. 
*/ fasttrap_disable_callbacks(); - return; + return(0); } } @@ -1117,6 +1117,7 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) sprunlock(p); probe->ftp_enabled = 1; + return (0); } /*ARGSUSED*/ @@ -2155,9 +2156,6 @@ fasttrap_meta_create_probe(void *arg, void *parg, * Both 32 & 64 bit want to go back one byte, to point at the first NOP */ tp->ftt_pc = dhpb->dthpb_base + (int64_t)dhpb->dthpb_offs[i] - 1; -#elif defined(__ppc__) - /* All PPC probes are zero offset. */ - tp->ftt_pc = dhpb->dthpb_base + (int64_t)dhpb->dthpb_offs[i]; #else #error "Architecture not supported" #endif @@ -2199,9 +2197,6 @@ fasttrap_meta_create_probe(void *arg, void *parg, * Both 32 & 64 bit want to go forward two bytes, to point at a single byte nop. */ tp->ftt_pc = dhpb->dthpb_base + (int64_t)dhpb->dthpb_enoffs[j] + 2; -#elif defined(__ppc__) - /* All PPC is-enabled probes are zero offset. */ - tp->ftt_pc = dhpb->dthpb_base + (int64_t)dhpb->dthpb_enoffs[j]; #else #error "Architecture not supported" #endif @@ -2294,7 +2289,8 @@ fasttrap_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int * probe = kmem_alloc(size, KM_SLEEP); - if (copyin(arg, probe, size) != 0) { + if (copyin(arg, probe, size) != 0 || + probe->ftps_noffs != noffs) { kmem_free(probe, size); return (EFAULT); } diff --git a/bsd/dev/dtrace/fbt.c b/bsd/dev/dtrace/fbt.c index 94e15da00..5a6570ed1 100644 --- a/bsd/dev/dtrace/fbt.c +++ b/bsd/dev/dtrace/fbt.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -52,16 +52,10 @@ /* #include <machine/trap.h> */ struct savearea_t; /* Used anonymously */ -typedef kern_return_t (*perfCallback)(int, struct savearea_t *, int, int); +typedef kern_return_t (*perfCallback)(int, struct savearea_t *, uintptr_t *, int); -#if defined (__ppc__) || defined (__ppc64__) -extern perfCallback tempDTraceTrapHook, tempDTraceIntHook; -extern kern_return_t fbt_perfCallback(int, struct savearea_t *, int, int); -extern kern_return_t fbt_perfIntCallback(int, struct savearea_t *, int, int); -#else extern perfCallback tempDTraceTrapHook; -extern kern_return_t fbt_perfCallback(int, struct savearea_t *, int, int); -#endif +extern kern_return_t fbt_perfCallback(int, struct savearea_t *, uintptr_t *); #define FBT_ADDR2NDX(addr) ((((uintptr_t)(addr)) >> 4) & fbt_probetab_mask) #define FBT_PROBETAB_SIZE 0x8000 /* 32k entries -- 128K total */ @@ -111,25 +105,42 @@ fbt_destroy(void *arg, dtrace_id_t id, void *parg) } /*ARGSUSED*/ -static void +int fbt_enable(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg,id) fbt_probe_t *fbt = parg; - struct modctl *ctl = fbt->fbtp_ctl; + struct modctl *ctl = NULL; + + for (; fbt != NULL; fbt = fbt->fbtp_next) { -#if defined (__ppc__) || defined (__ppc64__) - dtrace_casptr(&tempDTraceIntHook, NULL, fbt_perfIntCallback); - if (tempDTraceIntHook != (perfCallback)fbt_perfIntCallback) { + ctl = fbt->fbtp_ctl; + + if (!ctl->mod_loaded) { if (fbt_verbose) { - cmn_err(CE_NOTE, "fbt_enable is failing for probe %s " - "in module %s: tempDTraceIntHook already occupied.", + cmn_err(CE_NOTE, "fbt is failing for probe %s " + "(module %s unloaded)", fbt->fbtp_name, ctl->mod_modname); } - return; + + continue; } -#endif - + + /* + * Now check that our modctl has the expected load count. If it + * doesn't, this module must have been unloaded and reloaded -- and + * we're not going to touch it. 
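Every fbt entry point now applies the same staleness guard before touching a patchpoint: the modctl must still be loaded and its load generation must match the one recorded when the probe was created. Distilled (this exact test recurs in fbt_disable, fbt_suspend and fbt_resume below):

        /* An unload/reload bumps mod_loadcnt; the old text may be gone. */
        if (!ctl->mod_loaded || ctl->mod_loadcnt != fbt->fbtp_loadcnt)
                continue;       /* never write to a stale patchpoint */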
+ */ + if (ctl->mod_loadcnt != fbt->fbtp_loadcnt) { + if (fbt_verbose) { + cmn_err(CE_NOTE, "fbt is failing for probe %s " + "(module %s reloaded)", + fbt->fbtp_name, ctl->mod_modname); + } + + continue; + } + dtrace_casptr(&tempDTraceTrapHook, NULL, fbt_perfCallback); if (tempDTraceTrapHook != (perfCallback)fbt_perfCallback) { if (fbt_verbose) { @@ -137,14 +148,21 @@ fbt_enable(void *arg, dtrace_id_t id, void *parg) "in module %s: tempDTraceTrapHook already occupied.", fbt->fbtp_name, ctl->mod_modname); } - return; + continue; } - for (; fbt != NULL; fbt = fbt->fbtp_next) + if (fbt->fbtp_currentval != fbt->fbtp_patchval) { (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_patchval, (vm_offset_t)fbt->fbtp_patchpoint, sizeof(fbt->fbtp_patchval)); - - dtrace_membar_consumer(); + fbt->fbtp_currentval = fbt->fbtp_patchval; + ctl->mod_nenabled++; + } + + } + + dtrace_membar_consumer(); + + return (0); } /*ARGSUSED*/ @@ -153,11 +171,22 @@ fbt_disable(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg,id) fbt_probe_t *fbt = parg; + struct modctl *ctl = NULL; + + for (; fbt != NULL; fbt = fbt->fbtp_next) { + ctl = fbt->fbtp_ctl; + + if (!ctl->mod_loaded || (ctl->mod_loadcnt != fbt->fbtp_loadcnt)) + continue; - for (; fbt != NULL; fbt = fbt->fbtp_next) + if (fbt->fbtp_currentval != fbt->fbtp_savedval) { (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_savedval, (vm_offset_t)fbt->fbtp_patchpoint, sizeof(fbt->fbtp_savedval)); - + fbt->fbtp_currentval = fbt->fbtp_savedval; + ASSERT(ctl->mod_nenabled > 0); + ctl->mod_nenabled--; + } + } dtrace_membar_consumer(); } @@ -167,11 +196,20 @@ fbt_suspend(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg,id) fbt_probe_t *fbt = parg; + struct modctl *ctl = NULL; - for (; fbt != NULL; fbt = fbt->fbtp_next) - (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_savedval, (vm_offset_t)fbt->fbtp_patchpoint, + for (; fbt != NULL; fbt = fbt->fbtp_next) { + ctl = fbt->fbtp_ctl; + + ASSERT(ctl->mod_nenabled > 0); + if (!ctl->mod_loaded || (ctl->mod_loadcnt != fbt->fbtp_loadcnt)) + continue; + + (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_savedval, (vm_offset_t)fbt->fbtp_patchpoint, sizeof(fbt->fbtp_savedval)); - + fbt->fbtp_currentval = fbt->fbtp_savedval; + } + dtrace_membar_consumer(); } @@ -181,34 +219,30 @@ fbt_resume(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg,id) fbt_probe_t *fbt = parg; - struct modctl *ctl = fbt->fbtp_ctl; + struct modctl *ctl = NULL; -#if defined (__ppc__) || defined (__ppc64__) - dtrace_casptr(&tempDTraceIntHook, NULL, fbt_perfIntCallback); - if (tempDTraceIntHook != (perfCallback)fbt_perfIntCallback) { - if (fbt_verbose) { - cmn_err(CE_NOTE, "fbt_enable is failing for probe %s " - "in module %s: tempDTraceIntHook already occupied.", - fbt->fbtp_name, ctl->mod_modname); - } - return; - } -#endif + for (; fbt != NULL; fbt = fbt->fbtp_next) { + ctl = fbt->fbtp_ctl; + + ASSERT(ctl->mod_nenabled > 0); + if (!ctl->mod_loaded || (ctl->mod_loadcnt != fbt->fbtp_loadcnt)) + continue; - dtrace_casptr(&tempDTraceTrapHook, NULL, fbt_perfCallback); - if (tempDTraceTrapHook != (perfCallback)fbt_perfCallback) { + dtrace_casptr(&tempDTraceTrapHook, NULL, fbt_perfCallback); + if (tempDTraceTrapHook != (perfCallback)fbt_perfCallback) { if (fbt_verbose) { cmn_err(CE_NOTE, "fbt_resume is failing for probe %s " "in module %s: tempDTraceTrapHook already occupied.", fbt->fbtp_name, ctl->mod_modname); } return; - } + } - for (; fbt != NULL; fbt = fbt->fbtp_next) - (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_patchval, 
(vm_offset_t)fbt->fbtp_patchpoint, + (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_patchval, (vm_offset_t)fbt->fbtp_patchpoint, sizeof(fbt->fbtp_patchval)); - + fbt->fbtp_currentval = fbt->fbtp_patchval; + } + dtrace_membar_consumer(); } @@ -422,8 +456,8 @@ static struct cdevsw fbt_cdevsw = 0 /* type */ }; -static int gDisableFBT = 0; -struct modctl g_fbt_kernctl; +int gIgnoreFBTBlacklist = 0; +static int gFBTInited = 0; #undef kmem_alloc /* from its binding to dt_kmem_alloc glue */ #undef kmem_free /* from its binding to dt_kmem_free glue */ #include <vm/vm_kern.h> @@ -431,66 +465,22 @@ struct modctl g_fbt_kernctl; void fbt_init( void ) { - - PE_parse_boot_argn("DisableFBT", &gDisableFBT, sizeof (gDisableFBT)); - - if (0 == gDisableFBT) + if (0 == gFBTInited) { int majdevno = cdevsw_add(FBT_MAJOR, &fbt_cdevsw); - unsigned long size = 0, header_size, round_size; - kern_return_t ret; - void *p, *q; if (majdevno < 0) { printf("fbt_init: failed to allocate a major number!\n"); return; } - - /* - * Capture the kernel's mach_header in its entirety and the contents of - * its LINKEDIT segment (and only that segment). This is sufficient to - * build all the fbt probes lazily the first time a client looks to - * the fbt provider. Remeber these on the global struct modctl g_fbt_kernctl. - */ - header_size = sizeof(kernel_mach_header_t) + _mh_execute_header.sizeofcmds; - p = getsegdatafromheader(&_mh_execute_header, SEG_LINKEDIT, &size); - - round_size = round_page(header_size + size); - /* "q" will accomodate copied kernel_mach_header_t, its load commands, and LINKEIT segment. */ - ret = kmem_alloc_pageable(kernel_map, (vm_offset_t *)&q, round_size); - - if (p && (ret == KERN_SUCCESS)) { - kernel_segment_command_t *sgp; - - bcopy( (void *)&_mh_execute_header, q, header_size); - bcopy( p, (char *)q + header_size, size); - - sgp = getsegbynamefromheader(q, SEG_LINKEDIT); - - if (sgp) { - sgp->vmaddr = (uintptr_t)((char *)q + header_size); - g_fbt_kernctl.address = (vm_address_t)q; - g_fbt_kernctl.size = header_size + size; - } else { - kmem_free(kernel_map, (vm_offset_t)q, round_size); - g_fbt_kernctl.address = (vm_address_t)NULL; - g_fbt_kernctl.size = 0; - } - } else { - if (ret == KERN_SUCCESS) - kmem_free(kernel_map, (vm_offset_t)q, round_size); - g_fbt_kernctl.address = (vm_address_t)NULL; - g_fbt_kernctl.size = 0; - } - - strncpy((char *)&(g_fbt_kernctl.mod_modname), "mach_kernel", KMOD_MAX_NAME); - ((char *)&(g_fbt_kernctl.mod_modname))[KMOD_MAX_NAME -1] = '\0'; + + PE_parse_boot_argn("IgnoreFBTBlacklist", &gIgnoreFBTBlacklist, sizeof (gIgnoreFBTBlacklist)); fbt_attach( (dev_info_t *)(uintptr_t)majdevno, DDI_ATTACH ); - - gDisableFBT = 1; /* Ensure this initialization occurs just one time. */ + + gFBTInited = 1; /* Ensure this initialization occurs just one time. */ } else - printf("fbt_init: DisableFBT non-zero, no FBT probes will be provided.\n"); + panic("fbt_init: called twice!\n"); } #undef FBT_MAJOR diff --git a/bsd/dev/dtrace/lockstat.c b/bsd/dev/dtrace/lockstat.c index 0f9d6d4ff..a9f003e65 100644 --- a/bsd/dev/dtrace/lockstat.c +++ b/bsd/dev/dtrace/lockstat.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -59,10 +59,6 @@ #define NOP 0x90 #define RET 0xc3 #define LOCKSTAT_AFRAMES 1 -#elif __ppc__ -#define NOP 0x60000000 -#define RET 0x4e800020 /* blr */ -#define LOCKSTAT_AFRAMES 2 #else #error "not ported to this architecture" #endif @@ -188,11 +184,6 @@ void lockstat_hot_patch(boolean_t active) instr = (active ? NOP : RET ); (void) ml_nofault_copy( (vm_offset_t)&instr, *(assembly_probes[i]), sizeof(instr)); -#endif -#ifdef __ppc__ - uint32_t instr; - instr = (active ? NOP : RET ); - (void) ml_nofault_copy( (vm_offset_t)&instr, *(assembly_probes[i]), sizeof(instr)); #endif } } @@ -206,7 +197,7 @@ static dev_info_t *lockstat_devi; /* saved in xxattach() for xxinfo() */ static dtrace_provider_id_t lockstat_id; /*ARGSUSED*/ -static void +static int lockstat_enable(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg) /* __APPLE__ */ @@ -220,6 +211,7 @@ lockstat_enable(void *arg, dtrace_id_t id, void *parg) lockstat_hot_patch(TRUE); membar_producer(); + return(0); } diff --git a/bsd/dev/dtrace/profile_prvd.c b/bsd/dev/dtrace/profile_prvd.c index a74254c5c..69f3aadd5 100644 --- a/bsd/dev/dtrace/profile_prvd.c +++ b/bsd/dev/dtrace/profile_prvd.c @@ -49,6 +49,7 @@ #define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */ #include <kern/cpu_data.h> #include <kern/thread.h> +#include <kern/assert.h> #include <mach/thread_status.h> #include <sys/param.h> @@ -65,9 +66,9 @@ #include <sys/dtrace_glue.h> -#if defined(__ppc__) || defined(__ppc64__) -extern struct savearea *find_kern_regs(thread_t); -#elif defined(__i386__) || defined(__x86_64__) +#include <machine/pal_routines.h> + +#if defined(__i386__) || defined(__x86_64__) extern x86_saved_state_t *find_kern_regs(thread_t); #else #error Unknown architecture @@ -127,9 +128,7 @@ static dtrace_provider_id_t profile_id; #else /* is Mac OS X */ -#if defined(__ppc__) || defined(__ppc64__) -#define PROF_ARTIFICIAL_FRAMES 8 -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) #define PROF_ARTIFICIAL_FRAMES 9 #else #error Unknown architecture @@ -185,7 +184,6 @@ static int profile_ticks[] = { static uint32_t profile_max; /* maximum number of profile probes */ static uint32_t profile_total; /* current number of profile probes */ - static void profile_fire(void *arg) { @@ -200,22 +198,7 @@ profile_fire(void *arg) dtrace_probe(prof->prof_id, CPU->cpu_profile_pc, CPU->cpu_profile_upc, late, 0, 0); #else -#if defined(__ppc__) || defined(__ppc64__) - { - struct savearea *sv = find_kern_regs(current_thread()); - - if (sv) { - if (USERMODE(sv->save_srr1)) { - dtrace_probe(prof->prof_id, 0x0, sv->save_srr0, late, 0, 0); - } else { - dtrace_probe(prof->prof_id, sv->save_srr0, 0x0, late, 0, 0); - } - } else { - dtrace_probe(prof->prof_id, 0xcafebabe, - 0x0, late, 0, 0); /* XXX_BOGUS also see profile_usermode() below. 
*/ - } - } -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) x86_saved_state_t *kern_regs = find_kern_regs(current_thread()); if (NULL != kern_regs) { @@ -228,6 +211,7 @@ profile_fire(void *arg) #error Unknown arch #endif } else { + pal_register_cache_state(current_thread(), VALID); /* Possibly a user interrupt */ x86_saved_state_t *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread()); @@ -260,22 +244,7 @@ profile_tick(void *arg) dtrace_probe(prof->prof_id, CPU->cpu_profile_pc, CPU->cpu_profile_upc, 0, 0, 0); #else -#if defined(__ppc__) || defined(__ppc64__) - { - struct savearea *sv = find_kern_regs(current_thread()); - - if (sv) { - if (USERMODE(sv->save_srr1)) { - dtrace_probe(prof->prof_id, 0x0, sv->save_srr0, 0, 0, 0); - } else { - dtrace_probe(prof->prof_id, sv->save_srr0, 0x0, 0, 0, 0); - } - } else { - dtrace_probe(prof->prof_id, 0xcafebabe, - 0x0, 0, 0, 0); /* XXX_BOGUS also see profile_usermode() below. */ - } - } -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) x86_saved_state_t *kern_regs = find_kern_regs(current_thread()); if (NULL != kern_regs) { @@ -288,6 +257,7 @@ profile_tick(void *arg) #error Unknown arch #endif } else { + pal_register_cache_state(current_thread(), VALID); /* Possibly a user interrupt */ x86_saved_state_t *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread()); @@ -550,7 +520,7 @@ profile_destroy(void *arg, dtrace_id_t id, void *parg) /*ARGSUSED*/ static void -profile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when) +profile_online(void *arg, dtrace_cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when) { #pragma unused(cpu) /* __APPLE__ */ profile_probe_t *prof = arg; @@ -580,7 +550,7 @@ profile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when) /*ARGSUSED*/ static void -profile_offline(void *arg, cpu_t *cpu, void *oarg) +profile_offline(void *arg, dtrace_cpu_t *cpu, void *oarg) { profile_probe_percpu_t *pcpu = oarg; @@ -593,7 +563,7 @@ profile_offline(void *arg, cpu_t *cpu, void *oarg) } /*ARGSUSED*/ -static void +static int profile_enable(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg,id) /* __APPLE__ */ @@ -636,6 +606,7 @@ profile_enable(void *arg, dtrace_id_t id, void *parg) prof->prof_cyclic = (cyclic_id_t)cyclic_add_omni(&omni); /* cast puns cyclic_id_list_t with cyclic_id_t */ } #endif /* __APPLE__ */ + return(0); } /*ARGSUSED*/ diff --git a/bsd/dev/dtrace/sdt.c b/bsd/dev/dtrace/sdt.c index 725ab5585..bca167f01 100644 --- a/bsd/dev/dtrace/sdt.c +++ b/bsd/dev/dtrace/sdt.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -46,18 +46,12 @@ #include <sys/dtrace_glue.h> #include <sys/sdt_impl.h> +extern int dtrace_kernel_symbol_mode; struct savearea_t; /* Used anonymously */ -typedef kern_return_t (*perfCallback)(int, struct savearea_t *, int, int); +typedef kern_return_t (*perfCallback)(int, struct savearea_t *, uintptr_t *, int); -#if defined (__ppc__) || defined (__ppc64__) -extern perfCallback tempDTraceTrapHook, tempDTraceIntHook; -extern kern_return_t fbt_perfCallback(int, struct savearea_t *, int, int); -extern kern_return_t fbt_perfIntCallback(int, struct savearea_t *, int, int); - -#define SDT_PATCHVAL 0x7c810808 -#define SDT_AFRAMES 6 -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) extern perfCallback tempDTraceTrapHook; extern kern_return_t fbt_perfCallback(int, struct savearea_t *, int, int); @@ -86,7 +80,7 @@ static void __sdt_provide_module(void *arg, struct modctl *ctl) { #pragma unused(arg) - struct module *mp = (struct module *)ctl->address; + struct module *mp = (struct module *)ctl->mod_address; char *modname = ctl->mod_modname; sdt_probedesc_t *sdpd; sdt_probe_t *sdp, *old; @@ -220,14 +214,13 @@ sdt_destroy(void *arg, dtrace_id_t id, void *parg) } /*ARGSUSED*/ -static void +static int sdt_enable(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg,id) sdt_probe_t *sdp = parg; struct modctl *ctl = sdp->sdp_ctl; -#if !defined(__APPLE__) ctl->mod_nenabled++; /* @@ -256,20 +249,7 @@ sdt_enable(void *arg, dtrace_id_t id, void *parg) } goto err; } -#endif /* __APPLE__ */ -#if defined (__ppc__) || defined (__ppc64__) - dtrace_casptr(&tempDTraceIntHook, NULL, fbt_perfIntCallback); - if (tempDTraceIntHook != (perfCallback)fbt_perfIntCallback) { - if (sdt_verbose) { - cmn_err(CE_NOTE, "sdt_enable is failing for probe %s " - "in module %s: tempDTraceIntHook already occupied.", - sdp->sdp_name, ctl->mod_modname); - } - return; - } -#endif - dtrace_casptr(&tempDTraceTrapHook, NULL, fbt_perfCallback); if (tempDTraceTrapHook != (perfCallback)fbt_perfCallback) { if (sdt_verbose) { @@ -277,7 +257,7 @@ sdt_enable(void *arg, dtrace_id_t id, void *parg) "in module %s: tempDTraceTrapHook already occupied.", sdp->sdp_name, ctl->mod_modname); } - return; + return (0); } while (sdp != NULL) { @@ -285,10 +265,9 @@ sdt_enable(void *arg, dtrace_id_t id, void *parg) (vm_size_t)sizeof(sdp->sdp_patchval)); sdp = sdp->sdp_next; } -#if !defined(__APPLE__) + err: -#endif /* __APPLE__ */ - ; + return (0); } /*ARGSUSED*/ @@ -297,14 +276,12 @@ sdt_disable(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg,id) sdt_probe_t *sdp = parg; -#if !defined(__APPLE__) struct modctl *ctl = sdp->sdp_ctl; ctl->mod_nenabled--; if (!ctl->mod_loaded || ctl->mod_loadcnt != sdp->sdp_loadcnt) goto err; -#endif /* __APPLE__ */ while (sdp != NULL) { (void)ml_nofault_copy( (vm_offset_t)&sdp->sdp_savedval, (vm_offset_t)sdp->sdp_patchpoint, @@ -312,19 +289,10 @@ sdt_disable(void *arg, dtrace_id_t id, void *parg) sdp = sdp->sdp_next; } -#if !defined(__APPLE__) err: -#endif /* __APPLE__ */ ; } -static uint64_t -sdt_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes) -{ -#pragma unused(arg,id,parg) /* __APPLE__ */ - return dtrace_getarg(argno, aframes); -} - static dtrace_pops_t sdt_pops = { NULL, sdt_provide_module, @@ -561,107 +529,116 @@ void sdt_init( void ) } if (KERNEL_MAGIC != _mh_execute_header.magic) { - g_sdt_kernctl.address = (vm_address_t)NULL; - g_sdt_kernctl.size = 0; + g_sdt_kernctl.mod_address = (vm_address_t)NULL; + g_sdt_kernctl.mod_size = 0; } else 
{ - kernel_mach_header_t *mh; - struct load_command *cmd; - kernel_segment_command_t *orig_ts = NULL, *orig_le = NULL; - struct symtab_command *orig_st = NULL; - kernel_nlist_t *sym = NULL; - char *strings; - unsigned int i; - - g_sdt_mach_module.sdt_nprobes = 0; - g_sdt_mach_module.sdt_probes = NULL; - - g_sdt_kernctl.address = (vm_address_t)&g_sdt_mach_module; - g_sdt_kernctl.size = 0; - strncpy((char *)&(g_sdt_kernctl.mod_modname), "mach_kernel", KMOD_MAX_NAME); - - mh = &_mh_execute_header; - cmd = (struct load_command*) &mh[1]; - for (i = 0; i < mh->ncmds; i++) { - if (cmd->cmd == LC_SEGMENT_KERNEL) { - kernel_segment_command_t *orig_sg = (kernel_segment_command_t *) cmd; - - if (LIT_STRNEQL(orig_sg->segname, SEG_TEXT)) - orig_ts = orig_sg; - else if (LIT_STRNEQL(orig_sg->segname, SEG_LINKEDIT)) - orig_le = orig_sg; - else if (LIT_STRNEQL(orig_sg->segname, "")) - orig_ts = orig_sg; /* kexts have a single unnamed segment */ - } - else if (cmd->cmd == LC_SYMTAB) - orig_st = (struct symtab_command *) cmd; - - cmd = (struct load_command *) ((uintptr_t) cmd + cmd->cmdsize); - } - - if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL)) - return; - - sym = (kernel_nlist_t *)(orig_le->vmaddr + orig_st->symoff - orig_le->fileoff); - strings = (char *)(orig_le->vmaddr + orig_st->stroff - orig_le->fileoff); - - for (i = 0; i < orig_st->nsyms; i++) { - uint8_t n_type = sym[i].n_type & (N_TYPE | N_EXT); - char *name = strings + sym[i].n_un.n_strx; + kernel_mach_header_t *mh; + struct load_command *cmd; + kernel_segment_command_t *orig_ts = NULL, *orig_le = NULL; + struct symtab_command *orig_st = NULL; + kernel_nlist_t *sym = NULL; + char *strings; + unsigned int i; + + g_sdt_mach_module.sdt_nprobes = 0; + g_sdt_mach_module.sdt_probes = NULL; + + g_sdt_kernctl.mod_address = (vm_address_t)&g_sdt_mach_module; + g_sdt_kernctl.mod_size = 0; + strncpy((char *)&(g_sdt_kernctl.mod_modname), "mach_kernel", KMOD_MAX_NAME); + + g_sdt_kernctl.mod_next = NULL; + g_sdt_kernctl.mod_stale = NULL; + g_sdt_kernctl.mod_id = 0; + g_sdt_kernctl.mod_loadcnt = 1; + g_sdt_kernctl.mod_loaded = 1; + g_sdt_kernctl.mod_flags = 0; + g_sdt_kernctl.mod_nenabled = 0; + + mh = &_mh_execute_header; + cmd = (struct load_command*) &mh[1]; + for (i = 0; i < mh->ncmds; i++) { + if (cmd->cmd == LC_SEGMENT_KERNEL) { + kernel_segment_command_t *orig_sg = (kernel_segment_command_t *) cmd; + + if (LIT_STRNEQL(orig_sg->segname, SEG_TEXT)) + orig_ts = orig_sg; + else if (LIT_STRNEQL(orig_sg->segname, SEG_LINKEDIT)) + orig_le = orig_sg; + else if (LIT_STRNEQL(orig_sg->segname, "")) + orig_ts = orig_sg; /* kexts have a single unnamed segment */ + } + else if (cmd->cmd == LC_SYMTAB) + orig_st = (struct symtab_command *) cmd; + + cmd = (struct load_command *) ((uintptr_t) cmd + cmd->cmdsize); + } + + if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL)) + return; + + sym = (kernel_nlist_t *)(orig_le->vmaddr + orig_st->symoff - orig_le->fileoff); + strings = (char *)(orig_le->vmaddr + orig_st->stroff - orig_le->fileoff); + + for (i = 0; i < orig_st->nsyms; i++) { + uint8_t n_type = sym[i].n_type & (N_TYPE | N_EXT); + char *name = strings + sym[i].n_un.n_strx; const char *prev_name; unsigned long best; unsigned int j; - - /* Check that the symbol is a global and that it has a name. */ - if (((N_SECT | N_EXT) != n_type && (N_ABS | N_EXT) != n_type)) - continue; - - if (0 == sym[i].n_un.n_strx) /* iff a null, "", name. */ - continue; - - /* Lop off omnipresent leading underscore. 
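Each probe site is published as a global symbol whose storage holds the site's address, so sdpd_offset is taken through a dereference of n_value. The function-name search below is a nearest-preceding-symbol scan; its skeleton, with site standing for *(unsigned long *)sym[i].n_value and the underscore/type filtering elided:

        unsigned long best = 0;
        const char *prev_name = "<unknown>";
        unsigned int j;

        for (j = 0; j < orig_st->nsyms; j++) {
                unsigned long v = (unsigned long)sym[j].n_value;

                /* Keep the greatest symbol strictly below the probe site. */
                if (v < site && v > best) {
                        best = v;
                        prev_name = strings + sym[j].n_un.n_strx;
                }
        }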
*/ - if (*name == '_') - name += 1; - - if (strstr(name, DTRACE_PROBE_PREFIX)) { + + /* Check that the symbol is a global and that it has a name. */ + if (((N_SECT | N_EXT) != n_type && (N_ABS | N_EXT) != n_type)) + continue; + + if (0 == sym[i].n_un.n_strx) /* iff a null, "", name. */ + continue; + + /* Lop off omnipresent leading underscore. */ + if (*name == '_') + name += 1; + + if (strncmp(name, DTRACE_PROBE_PREFIX, sizeof(DTRACE_PROBE_PREFIX) - 1) == 0) { sdt_probedesc_t *sdpd = kmem_alloc(sizeof(sdt_probedesc_t), KM_SLEEP); int len = strlen(name) + 1; - + sdpd->sdpd_name = kmem_alloc(len, KM_SLEEP); strncpy(sdpd->sdpd_name, name, len); /* NUL termination is ensured. */ - + prev_name = "<unknown>"; best = 0; - /* Avoid shadow build warnings */ + /* + * Find the symbol immediately preceding the sdt probe site just discovered, + * that symbol names the function containing the sdt probe. + */ for (j = 0; j < orig_st->nsyms; j++) { uint8_t jn_type = sym[j].n_type & (N_TYPE | N_EXT); char *jname = strings + sym[j].n_un.n_strx; - + if (((N_SECT | N_EXT) != jn_type && (N_ABS | N_EXT) != jn_type)) continue; - + if (0 == sym[j].n_un.n_strx) /* iff a null, "", name. */ continue; - + if (*jname == '_') jname += 1; - if (strstr(jname, DTRACE_PROBE_PREFIX)) - continue; - + if (*(unsigned long *)sym[i].n_value <= (unsigned long)sym[j].n_value) continue; - + if ((unsigned long)sym[j].n_value > best) { best = (unsigned long)sym[j].n_value; prev_name = jname; } } - + sdpd->sdpd_func = kmem_alloc((len = strlen(prev_name) + 1), KM_SLEEP); strncpy(sdpd->sdpd_func, prev_name, len); /* NUL termination is ensured. */ - + sdpd->sdpd_offset = *(unsigned long *)sym[i].n_value; - + sdpd->sdpd_next = g_sdt_mach_module.sdt_probes; g_sdt_mach_module.sdt_probes = sdpd; } else { @@ -669,9 +646,9 @@ void sdt_init( void ) } } } - + sdt_attach( (dev_info_t *)(uintptr_t)majdevno, DDI_ATTACH ); - + gSDTInited = 1; } else panic("sdt_init: called twice!\n"); @@ -683,19 +660,32 @@ void sdt_init( void ) void sdt_provide_module(void *arg, struct modctl *ctl) { -#pragma unused(ctl) #pragma unused(arg) - __sdt_provide_module(arg, &g_sdt_kernctl); - - sdt_probedesc_t *sdpd = g_sdt_mach_module.sdt_probes; - while (sdpd) { - sdt_probedesc_t *this_sdpd = sdpd; - kmem_free((void *)sdpd->sdpd_name, strlen(sdpd->sdpd_name) + 1); - kmem_free((void *)sdpd->sdpd_func, strlen(sdpd->sdpd_func) + 1); - sdpd = sdpd->sdpd_next; - kmem_free((void *)this_sdpd, sizeof(sdt_probedesc_t)); + ASSERT(ctl != NULL); + ASSERT(dtrace_kernel_symbol_mode != DTRACE_KERNEL_SYMBOLS_NEVER); + lck_mtx_assert(&mod_lock, LCK_MTX_ASSERT_OWNED); + + if (MOD_SDT_DONE(ctl)) + return; + + if (MOD_IS_MACH_KERNEL(ctl)) { + __sdt_provide_module(arg, &g_sdt_kernctl); + + sdt_probedesc_t *sdpd = g_sdt_mach_module.sdt_probes; + while (sdpd) { + sdt_probedesc_t *this_sdpd = sdpd; + kmem_free((void *)sdpd->sdpd_name, strlen(sdpd->sdpd_name) + 1); + kmem_free((void *)sdpd->sdpd_func, strlen(sdpd->sdpd_func) + 1); + sdpd = sdpd->sdpd_next; + kmem_free((void *)this_sdpd, sizeof(sdt_probedesc_t)); + } + g_sdt_mach_module.sdt_probes = NULL; + } else { + /* FIXME -- sdt in kext not yet supported */ } - g_sdt_mach_module.sdt_probes = NULL; + + /* Need to mark this module as completed */ + ctl->mod_flags |= MODCTL_SDT_PROBES_PROVIDED; } #endif /* __APPLE__ */ diff --git a/bsd/dev/dtrace/sdt_subr.c b/bsd/dev/dtrace/sdt_subr.c index 90ea1331a..891207713 100644 --- a/bsd/dev/dtrace/sdt_subr.c +++ b/bsd/dev/dtrace/sdt_subr.c @@ -92,6 +92,7 @@ sdt_provider_t sdt_providers[] = { { "proc", 
"__proc____", &stab_attr, 0 }, { "io", "__io____", &stab_attr, 0 }, { "ip", "__ip____", &stab_attr, 0 }, + { "tcp", "__tcp____", &stab_attr, 0 }, { "mib", "__mib____", &stab_attr, 0 }, { "fsinfo", "__fsinfo____", &fsinfo_attr, 0 }, { "nfsv3", "__nfsv3____", &stab_attr, 0 }, @@ -808,21 +809,66 @@ sdt_argdesc_t sdt_args[] = { "nfsv4cbinfo_t *" }, { "nfsv4", "cb-recall-done", 2, 2, "CB_RECALL4res *", NULL }, - { "ip", "send", 0, 0, "mblk_t *", "pktinfo_t *" }, - { "ip", "send", 1, 1, "conn_t *", "csinfo_t *" }, + { "ip", "send", 0, 0, "struct mbuf *", "pktinfo_t *" }, + { "ip", "send", 1, 1, "struct inpcb *", "csinfo_t *" }, { "ip", "send", 2, 2, "void_ip_t *", "ipinfo_t *" }, - { "ip", "send", 3, 3, "__dtrace_ipsr_ill_t *", "ifinfo_t *" }, - { "ip", "send", 4, 4, "ipha_t *", "ipv4info_t *" }, - { "ip", "send", 5, 5, "ip6_t *", "ipv6info_t *" }, - { "ip", "send", 6, 6, "int", NULL }, /* used by __dtrace_ipsr_ill_t */ - { "ip", "receive", 0, 0, "mblk_t *", "pktinfo_t *" }, - { "ip", "receive", 1, 1, "conn_t *", "csinfo_t *" }, + { "ip", "send", 3, 3, "struct ifnet *", "ifinfo_t *" }, + { "ip", "send", 4, 4, "struct ip *", "ipv4info_t *" }, + { "ip", "send", 5, 5, "struct ip6_hdr *", "ipv6info_t *" }, + { "ip", "receive", 0, 0, "struct mbuf *", "pktinfo_t *" }, + { "ip", "receive", 1, 1, "struct inpcb *", "csinfo_t *" }, { "ip", "receive", 2, 2, "void_ip_t *", "ipinfo_t *" }, - { "ip", "receive", 3, 3, "__dtrace_ipsr_ill_t *", "ifinfo_t *" }, - { "ip", "receive", 4, 4, "ipha_t *", "ipv4info_t *" }, - { "ip", "receive", 5, 5, "ip6_t *", "ipv6info_t *" }, - { "ip", "receive", 6, 6, "int", NULL }, /* used by __dtrace_ipsr_ill_t */ - + { "ip", "receive", 3, 3, "struct ifnet *", "ifinfo_t *" }, + { "ip", "receive", 4, 4, "struct ip *", "ipv4info_t *" }, + { "ip", "receive", 5, 5, "struct ip6_hdr *", "ipv6info_t *" }, + + { "tcp", "connect-established", 0, 0, "struct mbuf *", "pktinfo_t *" }, + { "tcp", "connect-established", 1, 1, "struct inpcb *", "csinfo_t *" }, + { "tcp", "connect-established", 2, 2, "void_ip_t *", "ipinfo_t *" }, + { "tcp", "connect-established", 3, 3, "struct tcpcb *", "tcpsinfo_t *" }, + { "tcp", "connect-established", 4, 4, "struct tcphdr *", "tcpinfo_t *" }, + { "tcp", "connect-refused", 0, 0, "struct mbuf *", "pktinfo_t *" }, + { "tcp", "connect-refused", 1, 1, "struct inpcb *", "csinfo_t *" }, + { "tcp", "connect-refused", 2, 2, "void_ip_t *", "ipinfo_t *" }, + { "tcp", "connect-refused", 3, 3, "struct tcpcb *", "tcpsinfo_t *" }, + { "tcp", "connect-refused", 4, 4, "struct tcphdr *", "tcpinfo_t *" }, + { "tcp", "connect-request", 0, 0, "struct mbuf *", "pktinfo_t *" }, + { "tcp", "connect-request", 1, 1, "struct inpcb *", "csinfo_t *" }, + { "tcp", "connect-request", 2, 2, "void_ip_t *", "ipinfo_t *" }, + { "tcp", "connect-request", 3, 3, "struct tcpcb *", "tcpsinfo_t *" }, + { "tcp", "connect-request", 4, 4, "struct tcphdr *", "tcpinfo_t *" }, + { "tcp", "accept-established", 0, 0, "struct mbuf *", "pktinfo_t *" }, + { "tcp", "accept-established", 1, 1, "struct inpcb *", "csinfo_t *" }, + { "tcp", "accept-established", 2, 2, "void_ip_t *", "ipinfo_t *" }, + { "tcp", "accept-established", 3, 3, "struct tcpcb *", "tcpsinfo_t *" }, + { "tcp", "accept-established", 4, 4, "struct tcphdr *", "tcpinfo_t *" }, + { "tcp", "accept-refused", 0, 0, "struct mbuf *", "pktinfo_t *" }, + { "tcp", "accept-refused", 1, 1, "struct inpcb *", "csinfo_t *" }, + { "tcp", "accept-refused", 2, 2, "void_ip_t *", "ipinfo_t *" }, + { "tcp", "accept-refused", 3, 3, "struct tcpcb *", "tcpsinfo_t *" }, 
+ { "tcp", "accept-refused", 4, 4, "struct tcphdr *", "tcpinfo_t *" }, + { "tcp", "state-change", 0, 0, "void", "void" }, + { "tcp", "state-change", 1, 1, "struct inpcb *", "csinfo_t *" }, + { "tcp", "state-change", 2, 2, "struct tcpcb *", "tcpsinfo_t *" }, + { "tcp", "state-change", 3, 3, "int32_t", "tcpnsinfo_t *" }, + { "tcp", "send", 0, 0, "struct mbuf *", "pktinfo_t *" }, + { "tcp", "send", 1, 1, "struct inpcb *", "csinfo_t *" }, + { "tcp", "send", 2, 2, "void_ip_t *", "ipinfo_t *" }, + { "tcp", "send", 3, 3, "struct tcpcb *", "tcpsinfo_t *" }, + { "tcp", "send", 4, 4, "struct tcphdr *", "tcpinfo_t *" }, + { "tcp", "receive", 0, 0, "struct mbuf *", "pktinfo_t *" }, + { "tcp", "receive", 1, 1, "struct inpcb *", "csinfo_t *" }, + { "tcp", "receive", 2, 2, "void_ip_t *", "ipinfo_t *" }, + { "tcp", "receive", 3, 3, "struct tcpcb *", "tcpsinfo_t *" }, + { "tcp", "receive", 4, 4, "struct tcphdr *", "tcpinfo_t *" }, + { "tcp", "cc", 0, 0, "struct mbuf *", "pktinfo_t *"}, + { "tcp", "cc", 1, 1, "struct inpcb *", "csinfo_t *"}, + { "tcp", "cc", 2, 2, "struct tcpcb *", "tcpsinfo_t *"}, + { "tcp", "cc", 3, 3, "struct tcphdr *", "tcpinfo_t *"}, + { "tcp", "cc", 4, 4, "int32_t", "tcpccevent_t *"}, + { "tcp", "iaj", 0, 0, "struct tcpcb *", "tcpsinfo_t *"}, + { "tcp", "iaj", 1, 1, "uint32_t", NULL}, + { "tcp", "iaj", 2, 2, "uint32_t", NULL}, { "sysevent", "post", 0, 0, "evch_bind_t *", "syseventchaninfo_t *" }, { "sysevent", "post", 1, 1, "sysevent_impl_t *", "syseventinfo_t *" }, diff --git a/bsd/dev/dtrace/systrace.c b/bsd/dev/dtrace/systrace.c index 74ab8a105..271b2a0e1 100644 --- a/bsd/dev/dtrace/systrace.c +++ b/bsd/dev/dtrace/systrace.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -73,10 +73,9 @@ typedef x86_saved_state_t savearea_t; #include <sys/conf.h> #include <sys/user.h> -#if defined (__ppc__) || defined (__ppc64__) -#define SYSTRACE_ARTIFICIAL_FRAMES 3 -#define MACHTRACE_ARTIFICIAL_FRAMES 4 -#elif defined(__i386__) || defined (__x86_64__) +#include <machine/pal_routines.h> + +#if defined(__i386__) || defined (__x86_64__) #define SYSTRACE_ARTIFICIAL_FRAMES 2 #define MACHTRACE_ARTIFICIAL_FRAMES 3 #else @@ -107,7 +106,6 @@ systrace_stub(dtrace_id_t id, uint64_t arg0, uint64_t arg1, #pragma unused(id,arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7) } - int32_t dtrace_systrace_syscall(struct proc *pp, void *uap, int *rv) { @@ -122,24 +120,10 @@ dtrace_systrace_syscall(struct proc *pp, void *uap, int *rv) #endif syscall_arg_t *ip = (syscall_arg_t *)uap; -#if defined (__ppc__) || defined (__ppc64__) - { - savearea_t *regs = (savearea_t *)find_user_regs(current_thread()); - - flavor = (((unsigned int)regs->save_r0) == 0)? 1: 0; - - if (flavor) - code = regs->save_r3; - else - code = regs->save_r0; - - /* - * FIXME: unix_syscall screens for "unsafe calls" and instead calls nosys(), *not* sysent[code] ! 
- */ - } -#elif defined(__i386__) || defined (__x86_64__) +#if defined(__i386__) || defined (__x86_64__) #pragma unused(flavor) { + pal_register_cache_state(current_thread(), VALID); x86_saved_state_t *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread()); if (is_saved_state64(tagged_regs)) { @@ -482,7 +466,7 @@ systrace_destroy(void *arg, dtrace_id_t id, void *parg) } /*ARGSUSED*/ -static void +static int systrace_enable(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg) /* __APPLE__ */ @@ -505,7 +489,7 @@ systrace_enable(void *arg, dtrace_id_t id, void *parg) if (enabled) { ASSERT(sysent[sysnum].sy_callc == dtrace_systrace_syscall); - return; + return(0); } (void) casptr(&sysent[sysnum].sy_callc, @@ -516,6 +500,7 @@ systrace_enable(void *arg, dtrace_id_t id, void *parg) (void *)systrace_sysent32[sysnum].stsy_underlying, (void *)dtrace_systrace_syscall32); #endif + return (0); } /*ARGSUSED*/ @@ -740,17 +725,13 @@ typedef void mach_munge_t(const void *, void *); typedef struct { int mach_trap_arg_count; int (*mach_trap_function)(void); -#if defined(__i386__) - boolean_t mach_trap_stack; -#else +#if 0 /* no active architectures use mungers for mach traps */ mach_munge_t *mach_trap_arg_munge32; /* system call arguments for 32-bit */ mach_munge_t *mach_trap_arg_munge64; /* system call arguments for 64-bit */ #endif -#if !MACH_ASSERT - int mach_trap_unused; -#else +#if MACH_ASSERT const char* mach_trap_name; -#endif /* !MACH_ASSERT */ +#endif /* MACH_ASSERT */ } mach_trap_t; extern mach_trap_t mach_trap_table[]; @@ -803,20 +784,10 @@ dtrace_machtrace_syscall(struct mach_call_args *args) syscall_arg_t *ip = (syscall_arg_t *)args; mach_call_t mach_call; -#if defined (__ppc__) || defined (__ppc64__) - { - savearea_t *regs = (savearea_t *)find_user_regs(current_thread()); - - flavor = (((unsigned int)regs->save_r0) == 0)? 
1: 0; - - if (flavor) - code = -regs->save_r3; - else - code = -regs->save_r0; - } -#elif defined(__i386__) || defined (__x86_64__) +#if defined(__i386__) || defined (__x86_64__) #pragma unused(flavor) { + pal_register_cache_state(current_thread(), VALID); x86_saved_state_t *tagged_regs = (x86_saved_state_t *)find_user_regs(current_thread()); if (is_saved_state64(tagged_regs)) { @@ -937,7 +908,7 @@ machtrace_destroy(void *arg, dtrace_id_t id, void *parg) } /*ARGSUSED*/ -static void +static int machtrace_enable(void *arg, dtrace_id_t id, void *parg) { #pragma unused(arg) /* __APPLE__ */ @@ -954,12 +925,13 @@ machtrace_enable(void *arg, dtrace_id_t id, void *parg) if (enabled) { ASSERT(sysent[sysnum].sy_callc == (void *)dtrace_machtrace_syscall); - return; + return(0); } (void) casptr(&mach_trap_table[sysnum].mach_trap_function, (void *)machtrace_sysent[sysnum].stsy_underlying, (void *)dtrace_machtrace_syscall); + return(0); } /*ARGSUSED*/ diff --git a/bsd/dev/i386/conf.c b/bsd/dev/i386/conf.c index 964f945bf..b7de69df7 100644 --- a/bsd/dev/i386/conf.c +++ b/bsd/dev/i386/conf.c @@ -288,6 +288,7 @@ struct cdevsw cdevsw[] = }; int nchrdev = sizeof (cdevsw) / sizeof (cdevsw[0]); +uint64_t cdevsw_flags[sizeof (cdevsw) / sizeof (cdevsw[0])]; #include <sys/vnode.h> /* for VCHR and VBLK */ /* diff --git a/bsd/dev/i386/dtrace_isa.c b/bsd/dev/i386/dtrace_isa.c index 65749f9df..88e789fce 100644 --- a/bsd/dev/i386/dtrace_isa.c +++ b/bsd/dev/i386/dtrace_isa.c @@ -48,6 +48,8 @@ typedef x86_saved_state_t savearea_t; #include <kern/sched_prim.h> #include <miscfs/devfs/devfs.h> #include <mach/vm_param.h> +#include <machine/pal_routines.h> +#include <i386/mp.h> /* * APPLE NOTE: The regmap is used to decode which 64bit uregs[] register @@ -126,11 +128,6 @@ dtrace_getipl(void) /* * MP coordination */ - -extern void mp_broadcast( - void (*action_func)(void *), - void *arg); - typedef struct xcArg { processorid_t cpu; dtrace_xcall_t f; @@ -147,6 +144,7 @@ xcRemote( void *foo ) } } + /* * dtrace_xcall() is not called from probe context. 
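 * It runs f(arg) synchronously on the requested CPU, or on every CPU
 * when DTRACE_CPUALL is passed; the xcArg_t above carries the callback
 * and its argument to xcRemote(), which invokes them on each target.
 */

/*
 * Editor's sketch (illustrative only, not part of the patch): a minimal
 * caller of dtrace_xcall(). The callback and counter are hypothetical;
 * OSIncrementAtomic() is assumed from libkern/OSAtomic.h.
 */
#if 0 /* illustration only */
static void
count_cpu(void *arg)
{
 (void) OSIncrementAtomic((SInt32 *)arg); /* runs once per target CPU */
}

static SInt32 cpus_seen;
/* e.g.: dtrace_xcall(DTRACE_CPUALL, count_cpu, &cpus_seen); */
#endif

/*
 * The DTrace core relies on this primitive for cross-CPU synchronization,
 * for example when snapshotting per-CPU buffers.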
*/ @@ -159,13 +157,17 @@ dtrace_xcall(processorid_t cpu, dtrace_xcall_t f, void *arg) xcArg.f = f; xcArg.arg = arg; - mp_broadcast( xcRemote, (void *)&xcArg); + if (cpu == DTRACE_CPUALL) { + mp_cpus_call (CPUMASK_ALL, SYNC, xcRemote, (void*)&xcArg); + } + else { + mp_cpus_call (cpu_to_cpumask((cpu_t)cpu), SYNC, xcRemote, (void*)&xcArg); + } } /* * Runtime and ABI */ - uint64_t dtrace_getreg(struct regs *savearea, uint_t reg) { @@ -420,6 +422,7 @@ dtrace_getupcstack(uint64_t *pcstack, int pcstack_limit) if (thread == NULL) goto zero; + pal_register_cache_state(thread, VALID); regs = (x86_saved_state_t *)find_user_regs(thread); if (regs == NULL) goto zero; @@ -483,6 +486,7 @@ dtrace_getustackdepth(void) if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT)) return (-1); + pal_register_cache_state(thread, VALID); regs = (x86_saved_state_t *)find_user_regs(thread); if (regs == NULL) return 0; @@ -746,7 +750,9 @@ dtrace_getarg(int arg, int aframes) fp = fp->backchain; pc = fp->retaddr; - if (pc == (uintptr_t)dtrace_invop_callsite) { + if (dtrace_invop_callsite_pre != NULL + && pc > (uintptr_t)dtrace_invop_callsite_pre + && pc <= (uintptr_t)dtrace_invop_callsite_post) { #if defined(__i386__) /* * If we pass through the invalid op handler, we will @@ -783,8 +789,10 @@ dtrace_getarg(int arg, int aframes) if (arg <= inreg) { stack = (uintptr_t *)&saved_state->rdi; } else { - stack = (uintptr_t *)(saved_state->isf.rsp); - arg -= inreg; + fp = (struct frame *)(saved_state->isf.rsp); + stack = (uintptr_t *)&fp[1]; /* Find marshalled + arguments */ + arg -= inreg + 1; } #else #error Unknown arch @@ -794,7 +802,11 @@ dtrace_getarg(int arg, int aframes) } /* - * Arrive here when provider has called dtrace_probe directly. + * We know that we did not come through a trap to get into + * dtrace_probe() -- We arrive here when the provider has + * called dtrace_probe() directly. + * The probe ID is the first argument to dtrace_probe(). + * We must advance beyond that to get the argX. */ arg++; /* Advance past probeID */ @@ -815,7 +827,8 @@ dtrace_getarg(int arg, int aframes) load: DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); - val = *(((uint64_t *)stack) + arg); /* dtrace_probe arguments arg0 .. arg4 are 64bits wide */ + /* dtrace_probe arguments arg0 ... arg4 are 64bits wide */ + val = (uint64_t)(*(((uintptr_t *)stack) + arg)); DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); return (val); diff --git a/bsd/dev/i386/fasttrap_isa.c b/bsd/dev/i386/fasttrap_isa.c index be620b517..e3bfb9402 100644 --- a/bsd/dev/i386/fasttrap_isa.c +++ b/bsd/dev/i386/fasttrap_isa.c @@ -45,6 +45,8 @@ extern dtrace_id_t dtrace_probeid_error; #include <sys/dtrace_ptss.h> #include <kern/debug.h> +#include <machine/pal_routines.h> + /* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. 
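 * Redefining the name below lets the Solaris-derived code in this file
 * keep its upstream spelling while compiling against Darwin's types.
 */

/*
 * Editor's sketch (illustrative only, not part of the patch): with the
 * redefinition below in effect, an upstream-style declaration such as
 *
 *    proc_t *p;    (reads as: struct proc *p)
 *
 * denotes a pointer to the structure, matching the Solaris reading.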
*/ #define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */ @@ -2207,11 +2209,11 @@ fasttrap_return_probe(x86_saved_state_t *regs) return (0); } - uint64_t fasttrap_pid_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes) { + pal_register_cache_state(current_thread(), VALID); #pragma unused(arg, id, parg, aframes) return (fasttrap_anarg((x86_saved_state_t *)find_user_regs(current_thread()), 1, argno)); } @@ -2220,6 +2222,7 @@ uint64_t fasttrap_usdt_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes) { + pal_register_cache_state(current_thread(), VALID); #pragma unused(arg, id, parg, aframes) return (fasttrap_anarg((x86_saved_state_t *)find_user_regs(current_thread()), 0, argno)); } diff --git a/bsd/dev/i386/fbt_x86.c b/bsd/dev/i386/fbt_x86.c index 19d461ac2..baec24f83 100644 --- a/bsd/dev/i386/fbt_x86.c +++ b/bsd/dev/i386/fbt_x86.c @@ -39,6 +39,7 @@ #include <mach-o/loader.h> #include <mach-o/nlist.h> #include <libkern/kernel_mach_header.h> +#include <libkern/OSAtomic.h> #include <sys/param.h> #include <sys/systm.h> @@ -101,7 +102,9 @@ extern dtrace_provider_id_t fbt_id; extern fbt_probe_t **fbt_probetab; extern int fbt_probetab_mask; -kern_return_t fbt_perfCallback(int, x86_saved_state_t *, __unused int, __unused int); +extern int gIgnoreFBTBlacklist; /* From fbt_init */ + +kern_return_t fbt_perfCallback(int, x86_saved_state_t *, uintptr_t *, __unused int); /* * Critical routines that must not be probed. PR_5221096, PR_5379018. @@ -144,6 +147,7 @@ static const char * critical_blacklist[] = "cpu_topology_start_cpu", "cpu_type", "cpuid_cpu_display", + "cpuid_extfeatures", "handle_pending_TLB_flushes", "hw_compare_and_store", "machine_idle_cstate", @@ -171,8 +175,8 @@ static const char * probe_ctx_closure[] = "IS_64BIT_PROCESS", "OSCompareAndSwap", "absolutetime_to_microtime", + "act_set_astbsd", "ast_pending", - "astbsd_on", "clock_get_calendar_nanotime_nowait", "copyin", "copyin_user", @@ -257,6 +261,238 @@ static const void * bsearch( return (NULL); } +/* + * Module validation + */ +static int +is_module_valid(struct modctl* ctl) +{ + ASSERT(!MOD_FBT_PROBES_PROVIDED(ctl)); + ASSERT(!MOD_FBT_INVALID(ctl)); + + if (0 == ctl->mod_address || 0 == ctl->mod_size) { + return FALSE; + } + + if (0 == ctl->mod_loaded) { + return FALSE; + } + + if (strstr(ctl->mod_modname, "CHUD") != NULL) + return FALSE; + + /* + * If the user sets this, trust they know what they are doing. + */ + if (gIgnoreFBTBlacklist) /* per boot-arg set in fbt_init() */ + return TRUE; + + /* + * These drivers control low level functions that when traced + * cause problems, especially in the sleep/wake paths. + * If somebody really wants to drill in on one of these kexts, then + * they can override blacklisting using the boot-arg above. + */ + + if (strstr(ctl->mod_modname, "AppleACPIEC") != NULL) + return FALSE; + + if (strstr(ctl->mod_modname, "AppleACPIPlatform") != NULL) + return FALSE; + + if (strstr(ctl->mod_modname, "AppleRTC") != NULL) + return FALSE; + + if (strstr(ctl->mod_modname, "IOACPIFamily") != NULL) + return FALSE; + + if (strstr(ctl->mod_modname, "AppleIntelCPUPowerManagement") != NULL) + return FALSE; + + if (strstr(ctl->mod_modname, "AppleProfile") != NULL) + return FALSE; + + if (strstr(ctl->mod_modname, "AppleIntelProfile") != NULL) + return FALSE; + + + + return TRUE; +} + +/* + * FBT probe name validation + */ +static int +is_symbol_valid(const char* name) +{ + /* + * If the user set this, trust they know what they are doing. 
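+ * The override comes in via a boot-arg; fbt_init() is expected to parse
+ * it roughly as follows (editor's sketch, not part of the patch;
+ * PE_parse_boot_argn() is the standard boot-arg accessor):
+ *
+ *    PE_parse_boot_argn("IgnoreFBTBlacklist", &gIgnoreFBTBlacklist,
+ *        sizeof (gIgnoreFBTBlacklist));
+ *
+ * so booting with IgnoreFBTBlacklist=1 disables all of the checks below.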
+ */
+ if (gIgnoreFBTBlacklist)
+ return TRUE;
+
+ if (LIT_STRNSTART(name, "dtrace_") && !LIT_STRNSTART(name, "dtrace_safe_")) {
+ /*
+ * Anything beginning with "dtrace_" may be called
+ * from probe context unless it explicitly indicates
+ * that it won't be called from probe context by
+ * using the prefix "dtrace_safe_".
+ */
+ return FALSE;
+ }
+
+ if (LIT_STRNSTART(name, "fasttrap_") ||
+ LIT_STRNSTART(name, "fuword") ||
+ LIT_STRNSTART(name, "suword") ||
+ LIT_STRNEQL(name, "sprlock") ||
+ LIT_STRNEQL(name, "sprunlock") ||
+ LIT_STRNEQL(name, "uread") ||
+ LIT_STRNEQL(name, "uwrite")) {
+ return FALSE; /* Fasttrap inner-workings. */
+ }
+
+ if (LIT_STRNSTART(name, "dsmos_"))
+ return FALSE; /* Don't Steal Mac OS X! */
+
+ if (LIT_STRNSTART(name, "_dtrace"))
+ return FALSE; /* Shims in dtrace.c */
+
+ if (LIT_STRNSTART(name, "chud"))
+ return FALSE; /* Professional courtesy. */
+
+ if (LIT_STRNSTART(name, "hibernate_"))
+ return FALSE; /* Let sleeping dogs lie. */
+
+ if (LIT_STRNEQL(name, "_ZNK6OSData14getBytesNoCopyEv"))
+ return FALSE; /* Data::getBytesNoCopy, IOHibernateSystemWake path */
+
+ if (LIT_STRNEQL(name, "_ZN9IOService14newTemperatureElPS_") || /* IOService::newTemperature */
+ LIT_STRNEQL(name, "_ZN9IOService26temperatureCriticalForZoneEPS_")) { /* IOService::temperatureCriticalForZone */
+ return FALSE; /* Per the fire code */
+ }
+
+ /*
+ * Place no probes (illegal instructions) in the exception handling path!
+ */
+ if (LIT_STRNEQL(name, "t_invop") ||
+ LIT_STRNEQL(name, "enter_lohandler") ||
+ LIT_STRNEQL(name, "lo_alltraps") ||
+ LIT_STRNEQL(name, "kernel_trap") ||
+ LIT_STRNEQL(name, "interrupt") ||
+ LIT_STRNEQL(name, "i386_astintr")) {
+ return FALSE;
+ }
+
+ if (LIT_STRNEQL(name, "current_thread") ||
+ LIT_STRNEQL(name, "ast_pending") ||
+ LIT_STRNEQL(name, "fbt_perfCallback") ||
+ LIT_STRNEQL(name, "machine_thread_get_kern_state") ||
+ LIT_STRNEQL(name, "get_threadtask") ||
+ LIT_STRNEQL(name, "ml_set_interrupts_enabled") ||
+ LIT_STRNEQL(name, "dtrace_invop") ||
+ LIT_STRNEQL(name, "fbt_invop") ||
+ LIT_STRNEQL(name, "sdt_invop") ||
+ LIT_STRNEQL(name, "max_valid_stack_address")) {
+ return FALSE;
+ }
+
+ /*
+ * Voodoo.
+ */
+ if (LIT_STRNSTART(name, "machine_stack_") ||
+ LIT_STRNSTART(name, "mapping_") ||
+ LIT_STRNEQL(name, "tmrCvt") ||
+
+ LIT_STRNSTART(name, "tsc_") ||
+
+ LIT_STRNSTART(name, "pmCPU") ||
+ LIT_STRNEQL(name, "pmKextRegister") ||
+ LIT_STRNEQL(name, "pmMarkAllCPUsOff") ||
+ LIT_STRNEQL(name, "pmSafeMode") ||
+ LIT_STRNEQL(name, "pmTimerSave") ||
+ LIT_STRNEQL(name, "pmTimerRestore") ||
+ LIT_STRNEQL(name, "pmUnRegister") ||
+ LIT_STRNSTART(name, "pms") ||
+ LIT_STRNEQL(name, "power_management_init") ||
+ LIT_STRNSTART(name, "usimple_") ||
+ LIT_STRNSTART(name, "lck_spin_lock") ||
+ LIT_STRNSTART(name, "lck_spin_unlock") ||
+
+ LIT_STRNSTART(name, "rtc_") ||
+ LIT_STRNSTART(name, "_rtc_") ||
+ LIT_STRNSTART(name, "rtclock_") ||
+ LIT_STRNSTART(name, "clock_") ||
+ LIT_STRNSTART(name, "absolutetime_to_") ||
+ LIT_STRNEQL(name, "setPop") ||
+ LIT_STRNEQL(name, "nanoseconds_to_absolutetime") ||
+ LIT_STRNEQL(name, "nanotime_to_absolutetime") ||
+
+ LIT_STRNSTART(name, "etimer_") ||
+
+ LIT_STRNSTART(name, "commpage_") ||
+ LIT_STRNSTART(name, "pmap_") ||
+ LIT_STRNSTART(name, "ml_") ||
+ LIT_STRNSTART(name, "PE_") ||
+ LIT_STRNEQL(name, "kprintf") ||
+ LIT_STRNSTART(name, "lapic_") ||
+ LIT_STRNSTART(name, "act_machine") ||
+ LIT_STRNSTART(name, "acpi_") ||
+ LIT_STRNSTART(name, "pal_")){
+ return FALSE;
+ }
+
+ /*
+ * Avoid machine_ routines.
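+ * (The LIT_STRNSTART()/LIT_STRNEQL() tests used throughout match a
+ * literal prefix or a whole literal; assuming the usual dtrace_glue.h
+ * shapes, they are roughly -- editor's sketch, not part of the patch:
+ *
+ *    #define LIT_STRNSTART(s, lit) (0 == strncmp((s), (lit), sizeof(lit) - 1))
+ *    #define LIT_STRNEQL(s, lit)   (0 == strncmp((s), (lit), sizeof(lit)))
+ *
+ * so every comparison is bounded by the literal's compile-time size.)
+ * See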
PR_5346750. + */ + if (LIT_STRNSTART(name, "machine_")) + return FALSE; + + if (LIT_STRNEQL(name, "handle_pending_TLB_flushes")) + return FALSE; + + /* + * Place no probes on critical routines. PR_5221096 + */ + if (bsearch( name, critical_blacklist, CRITICAL_BLACKLIST_COUNT, sizeof(name), _cmp ) != NULL) + return FALSE; + + /* + * Place no probes that could be hit in probe context. + */ + if (bsearch( name, probe_ctx_closure, PROBE_CTX_CLOSURE_COUNT, sizeof(name), _cmp ) != NULL) { + return FALSE; + } + + /* + * Place no probes that could be hit on the way to the debugger. + */ + if (LIT_STRNSTART(name, "kdp_") || + LIT_STRNSTART(name, "kdb_") || + LIT_STRNSTART(name, "kdbg_") || + LIT_STRNSTART(name, "kdebug_") || + LIT_STRNSTART(name, "kernel_debug") || + LIT_STRNEQL(name, "Debugger") || + LIT_STRNEQL(name, "Call_DebuggerC") || + LIT_STRNEQL(name, "lock_debugger") || + LIT_STRNEQL(name, "unlock_debugger") || + LIT_STRNEQL(name, "SysChoked")) { + return FALSE; + } + + + /* + * Place no probes that could be hit on the way to a panic. + */ + if (NULL != strstr(name, "panic_") || + LIT_STRNEQL(name, "panic") || + LIT_STRNEQL(name, "preemption_underflow_panic")) { + return FALSE; + } + + return TRUE; +} + #if defined(__i386__) int fbt_invop(uintptr_t addr, uintptr_t *stack, uintptr_t rval) @@ -313,8 +549,8 @@ kern_return_t fbt_perfCallback( int trapno, x86_saved_state_t *tagged_regs, - __unused int unused1, - __unused int unused2) + uintptr_t *lo_spp, + __unused int unused ) { kern_return_t retval = KERN_FAILURE; x86_saved_state32_t *saved_state = saved_state32(tagged_regs); @@ -322,7 +558,8 @@ fbt_perfCallback( if (FBT_EXCEPTION_CODE == trapno && !IS_USER_TRAP(saved_state)) { boolean_t oldlevel, cpu_64bit; - uint32_t esp_probe, *ebp, edi, fp, *pDst, delta = 0; + uint32_t esp_probe, fp, *pDst, delta = 0; + uintptr_t old_sp; int emul; cpu_64bit = ml_is64bit(); @@ -335,10 +572,26 @@ fbt_perfCallback( esp_probe = (uint32_t)&(regs[1]); /* Nasty, infer the location above the save area */ } + __asm__ volatile( + "Ldtrace_invop_callsite_pre_label:\n" + ".data\n" + ".private_extern _dtrace_invop_callsite_pre\n" + "_dtrace_invop_callsite_pre:\n" + " .long Ldtrace_invop_callsite_pre_label\n" + ".text\n" + ); + emul = dtrace_invop( saved_state->eip, (uintptr_t *)esp_probe, saved_state->eax ); - __asm__ volatile(".globl _dtrace_invop_callsite"); - __asm__ volatile("_dtrace_invop_callsite:"); + __asm__ volatile( + "Ldtrace_invop_callsite_post_label:\n" + ".data\n" + ".private_extern _dtrace_invop_callsite_post\n" + "_dtrace_invop_callsite_post:\n" + " .long Ldtrace_invop_callsite_post_label\n" + ".text\n" + ); + switch (emul) { case DTRACE_INVOP_NOP: saved_state->eip += DTRACE_INVOP_NOP_SKIP; /* Skip over the patched NOP (planted by sdt.) */ @@ -379,27 +632,18 @@ fbt_perfCallback( if (cpu_64bit) saved_state->uesp += (delta << 2); - -/* XXX Fragile in the extreme. Obtain the value of %edi that our caller pushed - * (on behalf of its caller -- trap_from_kernel()). Ultimately, - * trap_from_kernel's stack pointer is restored from this slot. - * This is sensitive to the manner in which the compiler preserves %edi, - * and trap_from_kernel()'s internals. 
- */ - ebp = (uint32_t *)__builtin_frame_address(0); - ebp = (uint32_t *)*ebp; - edi = *(ebp - 1); +/* Obtain the stack pointer recorded by the trampolines */ + old_sp = *lo_spp; /* Shift contents of stack */ for (pDst = (uint32_t *)fp; - pDst > (((uint32_t *)edi)); + pDst > (((uint32_t *)old_sp)); pDst--) *pDst = pDst[-delta]; /* Track the stack lift in "saved_state". */ saved_state = (x86_saved_state32_t *) (((uintptr_t)saved_state) + (delta << 2)); - -/* Now adjust the value of %edi in our caller (kernel_trap)'s frame */ - *(ebp - 1) = edi + (delta << 2); +/* Adjust the stack pointer utilized by the trampolines */ + *lo_spp = old_sp + (delta << 2); retval = KERN_SUCCESS; break; @@ -418,47 +662,299 @@ fbt_perfCallback( /*ARGSUSED*/ static void -__fbt_provide_module(void *arg, struct modctl *ctl) +__provide_probe_32(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, char *modname, char* symbolName, machine_inst_t* symbolStart) { -#pragma unused(arg) - kernel_mach_header_t *mh; - struct load_command *cmd; - kernel_segment_command_t *orig_ts = NULL, *orig_le = NULL; - struct symtab_command *orig_st = NULL; - struct nlist *sym = NULL; - char *strings; - uintptr_t instrLow, instrHigh; - char *modname; - unsigned int i, j; + unsigned int j; + unsigned int doenable = 0; + dtrace_id_t thisid; - int gIgnoreFBTBlacklist = 0; - PE_parse_boot_argn("IgnoreFBTBlacklist", &gIgnoreFBTBlacklist, sizeof (gIgnoreFBTBlacklist)); - - mh = (kernel_mach_header_t *)(ctl->address); - modname = ctl->mod_modname; + fbt_probe_t *newfbt, *retfbt, *entryfbt; + machine_inst_t *instr, *limit, theInstr, i1, i2; + int size; - if (0 == ctl->address || 0 == ctl->size) /* Has the linker been jettisoned? */ + for (j = 0, instr = symbolStart, theInstr = 0; + (j < 4) && ((uintptr_t)instr >= instrLow) && (instrHigh > (uintptr_t)(instr + 2)); + j++) { + theInstr = instr[0]; + if (theInstr == FBT_PUSHL_EBP || theInstr == FBT_RET || theInstr == FBT_RET_IMM16) + break; + + if ((size = dtrace_instr_size(instr)) <= 0) + break; + + instr += size; + } + + if (theInstr != FBT_PUSHL_EBP) return; - + + i1 = instr[1]; + i2 = instr[2]; + + limit = (machine_inst_t *)instrHigh; + + if ((i1 == FBT_MOVL_ESP_EBP0_V0 && i2 == FBT_MOVL_ESP_EBP1_V0) || + (i1 == FBT_MOVL_ESP_EBP0_V1 && i2 == FBT_MOVL_ESP_EBP1_V1)) { + instr += 1; /* Advance to the movl %esp,%ebp */ + theInstr = i1; + } else { + /* + * Sometimes, the compiler will schedule an intervening instruction + * in the function prologue. Example: + * + * _mach_vm_read: + * 000006d8 pushl %ebp + * 000006d9 movl $0x00000004,%edx + * 000006de movl %esp,%ebp + * + * Try the next instruction, to see if it is a movl %esp,%ebp + */ + + instr += 1; /* Advance past the pushl %ebp */ + if ((size = dtrace_instr_size(instr)) <= 0) + return; + + instr += size; + + if ((instr + 1) >= limit) + return; + + i1 = instr[0]; + i2 = instr[1]; + + if (!(i1 == FBT_MOVL_ESP_EBP0_V0 && i2 == FBT_MOVL_ESP_EBP1_V0) && + !(i1 == FBT_MOVL_ESP_EBP0_V1 && i2 == FBT_MOVL_ESP_EBP1_V1)) + return; + + /* instr already points at the movl %esp,%ebp */ + theInstr = i1; + } + + thisid = dtrace_probe_lookup(fbt_id, modname, symbolName, FBT_ENTRY); + newfbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); + strlcpy( (char *)&(newfbt->fbtp_name), symbolName, MAX_FBTP_NAME_CHARS ); + + if (thisid != 0) { + /* + * The dtrace_probe previously existed, so we have to hook + * the newfbt entry onto the end of the existing fbt's chain. 
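+ * (Chain shape, editor's illustration -- one dtrace probe id owns a
+ * list of fbt records reached via dtrace_probe_arg(fbt_id, thisid):
+ *
+ *    entryfbt -> fbtp_next -> ... -> NULL
+ *                                     ^ newfbt is appended at the tail.)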
+ * If we find an fbt entry that was previously patched to
+ * fire (as indicated by the current patched value), then
+ * we want to enable this newfbt on the spot.
+ */
+ entryfbt = dtrace_probe_arg (fbt_id, thisid);
+ ASSERT (entryfbt != NULL);
+ for(; entryfbt != NULL; entryfbt = entryfbt->fbtp_next) {
+ if (entryfbt->fbtp_currentval == entryfbt->fbtp_patchval)
+ doenable++;
+
+ if (entryfbt->fbtp_next == NULL) {
+ entryfbt->fbtp_next = newfbt;
+ newfbt->fbtp_id = entryfbt->fbtp_id;
+ break;
+ }
+ }
+ }
+ else {
+ /*
+ * The dtrace_probe did not previously exist, so we
+ * create it and hook in the newfbt. Since the probe is
+ * new, we obviously do not need to enable it on the spot.
+ */
+ newfbt->fbtp_id = dtrace_probe_create(fbt_id, modname, symbolName, FBT_ENTRY, FBT_AFRAMES_ENTRY, newfbt);
+ doenable = 0;
+ }
+
+
+ newfbt->fbtp_patchpoint = instr;
+ newfbt->fbtp_ctl = ctl;
+ newfbt->fbtp_loadcnt = ctl->mod_loadcnt;
+ newfbt->fbtp_rval = DTRACE_INVOP_MOVL_ESP_EBP;
+ newfbt->fbtp_savedval = theInstr;
+ newfbt->fbtp_patchval = FBT_PATCHVAL;
+ newfbt->fbtp_currentval = 0;
+ newfbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)];
+ fbt_probetab[FBT_ADDR2NDX(instr)] = newfbt;
+
+ if (doenable)
+ fbt_enable(NULL, newfbt->fbtp_id, newfbt);
+
 /*
- * Employees of dtrace and their families are ineligible. Void
- * where prohibited.
+ * The fbt entry chain is in place, one entry point per symbol.
+ * The fbt return chain can have multiple return points per symbol.
+ * Here we find the end of the fbt return chain.
 */
-
- if (LIT_STRNEQL(modname, "com.apple.driver.dtrace"))
+
+ doenable=0;
+
+ thisid = dtrace_probe_lookup(fbt_id, modname, symbolName, FBT_RETURN);
+ if (thisid != 0) {
+ /* The dtrace_probe previously existed, so we have to
+ * find the end of the existing fbt chain. If we find
+ * an fbt return that was previously patched to fire
+ * (as indicated by the current patched value), then
+ * we want to enable any new fbts on the spot.
+ */
+ retfbt = dtrace_probe_arg (fbt_id, thisid);
+ ASSERT(retfbt != NULL);
+ for (; retfbt != NULL; retfbt = retfbt->fbtp_next) {
+ if (retfbt->fbtp_currentval == retfbt->fbtp_patchval)
+ doenable++;
+ if(retfbt->fbtp_next == NULL)
+ break;
+ }
+ }
+ else {
+ doenable = 0;
+ retfbt = NULL;
+ }
+
+again:
+ if (instr >= limit)
 return;
-
- if (strstr(modname, "CHUD") != NULL)
+
+ /*
+ * If this disassembly fails, then we've likely walked off into
+ * a jump table or some other unsuitable area. Bail out of the
+ * disassembly now.
+ */
+ if ((size = dtrace_instr_size(instr)) <= 0)
+ return;
+
+ /*
+ * We (desperately) want to avoid erroneously instrumenting a
+ * jump table, especially given that our markers are pretty
+ * short: two bytes on x86, and just one byte on amd64. To
+ * determine if we're looking at a true instruction sequence
+ * or an inline jump table that happens to contain the same
+ * byte sequences, we resort to some heuristic sleaze: we
+ * treat this instruction as being contained within a pointer,
+ * and see if that pointer points to within the body of the
+ * function. If it does, we refuse to instrument it.
+ */
+ for (j = 0; j < sizeof (uintptr_t); j++) {
+ uintptr_t check = (uintptr_t)instr - j;
+ uint8_t *ptr;
+
+ if (check < (uintptr_t)symbolStart)
+ break;
+
+ if (check + sizeof (uintptr_t) > (uintptr_t)limit)
+ continue;
+
+ ptr = *(uint8_t **)check;
+
+ if (ptr >= (uint8_t *)symbolStart && ptr < limit) {
+ instr += size;
+ goto again;
+ }
+ }
+
+ /*
+ * OK, it's an instruction.
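+ * None of the overlapping byte windows above decoded to a pointer back
+ * into this function, so treat the bytes at instr as a genuine
+ * instruction and classify it below.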
+ */ + theInstr = instr[0]; + + /* Walked onto the start of the next routine? If so, bail out of this function. */ + if (theInstr == FBT_PUSHL_EBP) + return; + + if (!(size == 1 && (theInstr == FBT_POPL_EBP || theInstr == FBT_LEAVE))) { + instr += size; + goto again; + } + + /* + * Found the popl %ebp; or leave. + */ + machine_inst_t *patch_instr = instr; + + /* + * Scan forward for a "ret", or "jmp". + */ + instr += size; + if (instr >= limit) + return; + + size = dtrace_instr_size(instr); + if (size <= 0) /* Failed instruction decode? */ + return; + + theInstr = instr[0]; + + if (!(size == FBT_RET_LEN && (theInstr == FBT_RET)) && + !(size == FBT_RET_IMM16_LEN && (theInstr == FBT_RET_IMM16)) && + !(size == FBT_JMP_SHORT_REL_LEN && (theInstr == FBT_JMP_SHORT_REL)) && + !(size == FBT_JMP_NEAR_REL_LEN && (theInstr == FBT_JMP_NEAR_REL)) && + !(size == FBT_JMP_FAR_ABS_LEN && (theInstr == FBT_JMP_FAR_ABS))) return; + + /* + * popl %ebp; ret; or leave; ret; or leave; jmp tailCalledFun; -- We have a winner! + */ + newfbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); + strlcpy( (char *)&(newfbt->fbtp_name), symbolName, MAX_FBTP_NAME_CHARS ); + + if (retfbt == NULL) { + newfbt->fbtp_id = dtrace_probe_create(fbt_id, modname, + symbolName, FBT_RETURN, FBT_AFRAMES_RETURN, newfbt); + } else { + retfbt->fbtp_next = newfbt; + newfbt->fbtp_id = retfbt->fbtp_id; + } + + retfbt = newfbt; + newfbt->fbtp_patchpoint = patch_instr; + newfbt->fbtp_ctl = ctl; + newfbt->fbtp_loadcnt = ctl->mod_loadcnt; + + if (*patch_instr == FBT_POPL_EBP) { + newfbt->fbtp_rval = DTRACE_INVOP_POPL_EBP; + } else { + ASSERT(*patch_instr == FBT_LEAVE); + newfbt->fbtp_rval = DTRACE_INVOP_LEAVE; + } + newfbt->fbtp_roffset = + (uintptr_t)(patch_instr - (uint8_t *)symbolStart); + + newfbt->fbtp_savedval = *patch_instr; + newfbt->fbtp_patchval = FBT_PATCHVAL; + newfbt->fbtp_currentval = 0; + newfbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(patch_instr)]; + fbt_probetab[FBT_ADDR2NDX(patch_instr)] = newfbt; + + if (doenable) + fbt_enable(NULL, newfbt->fbtp_id, newfbt); + + instr += size; + goto again; +} +static void +__kernel_syms_provide_module(void *arg, struct modctl *ctl) +{ +#pragma unused(arg) + kernel_mach_header_t *mh; + struct load_command *cmd; + kernel_segment_command_t *orig_ts = NULL, *orig_le = NULL; + struct symtab_command *orig_st = NULL; + struct nlist *sym = NULL; + char *strings; + uintptr_t instrLow, instrHigh; + char *modname; + unsigned int i; + + mh = (kernel_mach_header_t *)(ctl->mod_address); + modname = ctl->mod_modname; + if (mh->magic != MH_MAGIC) return; - + cmd = (struct load_command *) &mh[1]; for (i = 0; i < mh->ncmds; i++) { if (cmd->cmd == LC_SEGMENT_KERNEL) { kernel_segment_command_t *orig_sg = (kernel_segment_command_t *) cmd; - + if (LIT_STRNEQL(orig_sg->segname, SEG_TEXT)) orig_ts = orig_sg; else if (LIT_STRNEQL(orig_sg->segname, SEG_LINKEDIT)) @@ -468,370 +964,75 @@ __fbt_provide_module(void *arg, struct modctl *ctl) } else if (cmd->cmd == LC_SYMTAB) orig_st = (struct symtab_command *) cmd; - + cmd = (struct load_command *) ((caddr_t) cmd + cmd->cmdsize); } - + if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL)) return; - + sym = (struct nlist *)(orig_le->vmaddr + orig_st->symoff - orig_le->fileoff); strings = (char *)(orig_le->vmaddr + orig_st->stroff - orig_le->fileoff); - + /* Find extent of the TEXT section */ instrLow = (uintptr_t)orig_ts->vmaddr; instrHigh = (uintptr_t)(orig_ts->vmaddr + orig_ts->vmsize); - + for (i = 0; i < orig_st->nsyms; i++) { - fbt_probe_t *fbt, *retfbt; - 
machine_inst_t *instr, *limit, theInstr, i1, i2; uint8_t n_type = sym[i].n_type & (N_TYPE | N_EXT); char *name = strings + sym[i].n_un.n_strx; - int size; - + /* Check that the symbol is a global and that it has a name. */ if (((N_SECT | N_EXT) != n_type && (N_ABS | N_EXT) != n_type)) continue; - + if (0 == sym[i].n_un.n_strx) /* iff a null, "", name. */ continue; /* Lop off omnipresent leading underscore. */ if (*name == '_') name += 1; - - if (LIT_STRNSTART(name, "dtrace_") && !LIT_STRNSTART(name, "dtrace_safe_")) { - /* - * Anything beginning with "dtrace_" may be called - * from probe context unless it explitly indicates - * that it won't be called from probe context by - * using the prefix "dtrace_safe_". - */ - continue; - } - - if (LIT_STRNSTART(name, "dsmos_")) - continue; /* Don't Steal Mac OS X! */ - - if (LIT_STRNSTART(name, "_dtrace")) - continue; /* Shims in dtrace.c */ - - if (LIT_STRNSTART(name, "chud")) - continue; /* Professional courtesy. */ - - if (LIT_STRNSTART(name, "hibernate_")) - continue; /* Let sleeping dogs lie. */ - if (LIT_STRNEQL(name, "_ZN9IOService14newTemperatureElPS_") || /* IOService::newTemperature */ - LIT_STRNEQL(name, "_ZN9IOService26temperatureCriticalForZoneEPS_")) /* IOService::temperatureCriticalForZone */ - continue; /* Per the fire code */ - /* - * Place no probes (illegal instructions) in the exception handling path! + * We're only blacklisting functions in the kernel for now. */ - if (LIT_STRNEQL(name, "t_invop") || - LIT_STRNEQL(name, "enter_lohandler") || - LIT_STRNEQL(name, "lo_alltraps") || - LIT_STRNEQL(name, "kernel_trap") || - LIT_STRNEQL(name, "interrupt") || - LIT_STRNEQL(name, "i386_astintr")) - continue; - - if (LIT_STRNEQL(name, "current_thread") || - LIT_STRNEQL(name, "ast_pending") || - LIT_STRNEQL(name, "fbt_perfCallback") || - LIT_STRNEQL(name, "machine_thread_get_kern_state") || - LIT_STRNEQL(name, "get_threadtask") || - LIT_STRNEQL(name, "ml_set_interrupts_enabled") || - LIT_STRNEQL(name, "dtrace_invop") || - LIT_STRNEQL(name, "fbt_invop") || - LIT_STRNEQL(name, "sdt_invop") || - LIT_STRNEQL(name, "max_valid_stack_address")) + if (MOD_IS_MACH_KERNEL(ctl) && !is_symbol_valid(name)) continue; + + __provide_probe_32(ctl, instrLow, instrHigh, modname, name, (machine_inst_t*)sym[i].n_value); + } +} - /* - * Voodoo. 
- */ - if (LIT_STRNSTART(name, "machine_stack_") || - LIT_STRNSTART(name, "mapping_") || - LIT_STRNEQL(name, "tmrCvt") || - - LIT_STRNSTART(name, "tsc_") || - - LIT_STRNSTART(name, "pmCPU") || - LIT_STRNEQL(name, "pmKextRegister") || - LIT_STRNEQL(name, "pmMarkAllCPUsOff") || - LIT_STRNEQL(name, "pmSafeMode") || - LIT_STRNEQL(name, "pmTimerSave") || - LIT_STRNEQL(name, "pmTimerRestore") || - LIT_STRNEQL(name, "pmUnRegister") || - LIT_STRNSTART(name, "pms") || - LIT_STRNEQL(name, "power_management_init") || - LIT_STRNSTART(name, "usimple_") || - LIT_STRNEQL(name, "lck_spin_lock") || - LIT_STRNEQL(name, "lck_spin_unlock") || - - LIT_STRNSTART(name, "rtc_") || - LIT_STRNSTART(name, "_rtc_") || - LIT_STRNSTART(name, "rtclock_") || - LIT_STRNSTART(name, "clock_") || - LIT_STRNSTART(name, "absolutetime_to_") || - LIT_STRNEQL(name, "setPop") || - LIT_STRNEQL(name, "nanoseconds_to_absolutetime") || - LIT_STRNEQL(name, "nanotime_to_absolutetime") || - - LIT_STRNSTART(name, "etimer_") || - - LIT_STRNSTART(name, "commpage_") || - LIT_STRNSTART(name, "pmap_") || - LIT_STRNSTART(name, "ml_") || - LIT_STRNSTART(name, "PE_") || - LIT_STRNEQL(name, "kprintf") || - LIT_STRNSTART(name, "lapic_") || - LIT_STRNSTART(name, "acpi_")) - continue; +static void +__user_syms_provide_module(void *arg, struct modctl *ctl) +{ +#pragma unused(arg) + char *modname; + unsigned int i; + + modname = ctl->mod_modname; + + dtrace_module_symbols_t* module_symbols = ctl->mod_user_symbols; + if (module_symbols) { + for (i=0; i<module_symbols->dtmodsyms_count; i++) { + dtrace_symbol_t* symbol = &module_symbols->dtmodsyms_symbols[i]; + char* name = symbol->dtsym_name; + + /* Lop off omnipresent leading underscore. */ + if (*name == '_') + name += 1; - /* - * Avoid machine_ routines. PR_5346750. - */ - if (LIT_STRNSTART(name, "machine_")) - continue; + /* + * We're only blacklisting functions in the kernel for now. + */ + if (MOD_IS_MACH_KERNEL(ctl) && !is_symbol_valid(name)) + continue; - if (LIT_STRNEQL(name, "handle_pending_TLB_flushes")) - continue; + __provide_probe_32(ctl, (uintptr_t)symbol->dtsym_addr, (uintptr_t)(symbol->dtsym_addr + symbol->dtsym_size), modname, name, (machine_inst_t*)(uintptr_t)symbol->dtsym_addr); + } + } +} - /* - * Place no probes on critical routines. PR_5221096 - */ - if (!gIgnoreFBTBlacklist && - bsearch( name, critical_blacklist, CRITICAL_BLACKLIST_COUNT, sizeof(name), _cmp ) != NULL) - continue; - - /* - * Place no probes that could be hit in probe context. - */ - if (!gIgnoreFBTBlacklist && - bsearch( name, probe_ctx_closure, PROBE_CTX_CLOSURE_COUNT, sizeof(name), _cmp ) != NULL) - continue; - - /* - * Place no probes that could be hit on the way to the debugger. - */ - if (LIT_STRNSTART(name, "kdp_") || - LIT_STRNSTART(name, "kdb_") || - LIT_STRNSTART(name, "kdbg_") || - LIT_STRNSTART(name, "kdebug_") || - LIT_STRNEQL(name, "kernel_debug") || - LIT_STRNEQL(name, "Debugger") || - LIT_STRNEQL(name, "Call_DebuggerC") || - LIT_STRNEQL(name, "lock_debugger") || - LIT_STRNEQL(name, "unlock_debugger") || - LIT_STRNEQL(name, "SysChoked")) - continue; - - /* - * Place no probes that could be hit on the way to a panic. 
- */ - if (NULL != strstr(name, "panic_") || - LIT_STRNEQL(name, "panic") || - LIT_STRNEQL(name, "handleMck") || - LIT_STRNEQL(name, "unresolved_kernel_trap")) - continue; - - if (dtrace_probe_lookup(fbt_id, modname, name, NULL) != 0) - continue; - - for (j = 0, instr = (machine_inst_t *)sym[i].n_value, theInstr = 0; - (j < 4) && ((uintptr_t)instr >= instrLow) && (instrHigh > (uintptr_t)(instr + 2)); - j++) { - theInstr = instr[0]; - if (theInstr == FBT_PUSHL_EBP || theInstr == FBT_RET || theInstr == FBT_RET_IMM16) - break; - - if ((size = dtrace_instr_size(instr)) <= 0) - break; - - instr += size; - } - - if (theInstr != FBT_PUSHL_EBP) - continue; - - i1 = instr[1]; - i2 = instr[2]; - - limit = (machine_inst_t *)instrHigh; - - if ((i1 == FBT_MOVL_ESP_EBP0_V0 && i2 == FBT_MOVL_ESP_EBP1_V0) || - (i1 == FBT_MOVL_ESP_EBP0_V1 && i2 == FBT_MOVL_ESP_EBP1_V1)) { - instr += 1; /* Advance to the movl %esp,%ebp */ - theInstr = i1; - } else { - /* - * Sometimes, the compiler will schedule an intervening instruction - * in the function prologue. Example: - * - * _mach_vm_read: - * 000006d8 pushl %ebp - * 000006d9 movl $0x00000004,%edx - * 000006de movl %esp,%ebp - * - * Try the next instruction, to see if it is a movl %esp,%ebp - */ - - instr += 1; /* Advance past the pushl %ebp */ - if ((size = dtrace_instr_size(instr)) <= 0) - continue; - - instr += size; - - if ((instr + 1) >= limit) - continue; - - i1 = instr[0]; - i2 = instr[1]; - - if (!(i1 == FBT_MOVL_ESP_EBP0_V0 && i2 == FBT_MOVL_ESP_EBP1_V0) && - !(i1 == FBT_MOVL_ESP_EBP0_V1 && i2 == FBT_MOVL_ESP_EBP1_V1)) - continue; - - /* instr already points at the movl %esp,%ebp */ - theInstr = i1; - } - - fbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); - strlcpy( (char *)&(fbt->fbtp_name), name, MAX_FBTP_NAME_CHARS ); - fbt->fbtp_id = dtrace_probe_create(fbt_id, modname, name, FBT_ENTRY, FBT_AFRAMES_ENTRY, fbt); - fbt->fbtp_patchpoint = instr; - fbt->fbtp_ctl = ctl; - fbt->fbtp_loadcnt = ctl->mod_loadcnt; - fbt->fbtp_rval = DTRACE_INVOP_MOVL_ESP_EBP; - fbt->fbtp_savedval = theInstr; - fbt->fbtp_patchval = FBT_PATCHVAL; - - fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)]; - fbt->fbtp_symndx = i; - fbt_probetab[FBT_ADDR2NDX(instr)] = fbt; - - retfbt = NULL; -again: - if (instr >= limit) - continue; - - /* - * If this disassembly fails, then we've likely walked off into - * a jump table or some other unsuitable area. Bail out of the - * disassembly now. - */ - if ((size = dtrace_instr_size(instr)) <= 0) - continue; - - /* - * We (desperately) want to avoid erroneously instrumenting a - * jump table, especially given that our markers are pretty - * short: two bytes on x86, and just one byte on amd64. To - * determine if we're looking at a true instruction sequence - * or an inline jump table that happens to contain the same - * byte sequences, we resort to some heuristic sleeze: we - * treat this instruction as being contained within a pointer, - * and see if that pointer points to within the body of the - * function. If it does, we refuse to instrument it. - */ - for (j = 0; j < sizeof (uintptr_t); j++) { - uintptr_t check = (uintptr_t)instr - j; - uint8_t *ptr; - - if (check < sym[i].n_value) - break; - - if (check + sizeof (uintptr_t) > (uintptr_t)limit) - continue; - - ptr = *(uint8_t **)check; - - if (ptr >= (uint8_t *)sym[i].n_value && ptr < limit) { - instr += size; - goto again; - } - } - - /* - * OK, it's an instruction. - */ - theInstr = instr[0]; - - /* Walked onto the start of the next routine? If so, bail out of this function. 
*/ - if (theInstr == FBT_PUSHL_EBP) - continue; - - if (!(size == 1 && (theInstr == FBT_POPL_EBP || theInstr == FBT_LEAVE))) { - instr += size; - goto again; - } - - /* - * Found the popl %ebp; or leave. - */ - machine_inst_t *patch_instr = instr; - - /* - * Scan forward for a "ret", or "jmp". - */ - instr += size; - if (instr >= limit) - continue; - - size = dtrace_instr_size(instr); - if (size <= 0) /* Failed instruction decode? */ - continue; - - theInstr = instr[0]; - - if (!(size == FBT_RET_LEN && (theInstr == FBT_RET)) && - !(size == FBT_RET_IMM16_LEN && (theInstr == FBT_RET_IMM16)) && - !(size == FBT_JMP_SHORT_REL_LEN && (theInstr == FBT_JMP_SHORT_REL)) && - !(size == FBT_JMP_NEAR_REL_LEN && (theInstr == FBT_JMP_NEAR_REL)) && - !(size == FBT_JMP_FAR_ABS_LEN && (theInstr == FBT_JMP_FAR_ABS))) - continue; - - /* - * popl %ebp; ret; or leave; ret; or leave; jmp tailCalledFun; -- We have a winner! - */ - fbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); - strlcpy( (char *)&(fbt->fbtp_name), name, MAX_FBTP_NAME_CHARS ); - - if (retfbt == NULL) { - fbt->fbtp_id = dtrace_probe_create(fbt_id, modname, - name, FBT_RETURN, FBT_AFRAMES_RETURN, fbt); - } else { - retfbt->fbtp_next = fbt; - fbt->fbtp_id = retfbt->fbtp_id; - } - - retfbt = fbt; - fbt->fbtp_patchpoint = patch_instr; - fbt->fbtp_ctl = ctl; - fbt->fbtp_loadcnt = ctl->mod_loadcnt; - - if (*patch_instr == FBT_POPL_EBP) { - fbt->fbtp_rval = DTRACE_INVOP_POPL_EBP; - } else { - ASSERT(*patch_instr == FBT_LEAVE); - fbt->fbtp_rval = DTRACE_INVOP_LEAVE; - } - fbt->fbtp_roffset = - (uintptr_t)(patch_instr - (uint8_t *)sym[i].n_value); - - fbt->fbtp_savedval = *patch_instr; - fbt->fbtp_patchval = FBT_PATCHVAL; - fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(patch_instr)]; - fbt->fbtp_symndx = i; - fbt_probetab[FBT_ADDR2NDX(patch_instr)] = fbt; - - instr += size; - goto again; - } -} #elif defined(__x86_64__) int fbt_invop(uintptr_t addr, uintptr_t *state, uintptr_t rval) @@ -870,7 +1071,7 @@ kern_return_t fbt_perfCallback( int trapno, x86_saved_state_t *tagged_regs, - __unused int unused1, + uintptr_t *lo_spp, __unused int unused2) { kern_return_t retval = KERN_FAILURE; @@ -878,18 +1079,36 @@ fbt_perfCallback( if (FBT_EXCEPTION_CODE == trapno && !IS_USER_TRAP(saved_state)) { boolean_t oldlevel; - uint64_t rsp_probe, *rbp, r12, fp, delta = 0; + uint64_t rsp_probe, fp, delta = 0; + uintptr_t old_sp; uint32_t *pDst; int emul; + oldlevel = ml_set_interrupts_enabled(FALSE); /* Calculate where the stack pointer was when the probe instruction "fired." */ rsp_probe = saved_state->isf.rsp; /* Easy, x86_64 establishes this value in idt64.s */ + __asm__ volatile( + "Ldtrace_invop_callsite_pre_label:\n" + ".data\n" + ".private_extern _dtrace_invop_callsite_pre\n" + "_dtrace_invop_callsite_pre:\n" + " .quad Ldtrace_invop_callsite_pre_label\n" + ".text\n" + ); + emul = dtrace_invop( saved_state->isf.rip, (uintptr_t *)saved_state, saved_state->rax ); - __asm__ volatile(".globl _dtrace_invop_callsite"); - __asm__ volatile("_dtrace_invop_callsite:"); + + __asm__ volatile( + "Ldtrace_invop_callsite_post_label:\n" + ".data\n" + ".private_extern _dtrace_invop_callsite_post\n" + "_dtrace_invop_callsite_post:\n" + " .quad Ldtrace_invop_callsite_post_label\n" + ".text\n" + ); switch (emul) { case DTRACE_INVOP_NOP: @@ -929,25 +1148,18 @@ fbt_perfCallback( */ delta += 2; saved_state->isf.rsp += (delta << 2); - -/* XXX Fragile in the extreme. - * This is sensitive to trap_from_kernel()'s internals. 
- */ - rbp = (uint64_t *)__builtin_frame_address(0); - rbp = (uint64_t *)*rbp; - r12 = *(rbp - 4); - +/* Obtain the stack pointer recorded by the trampolines */ + old_sp = *lo_spp; /* Shift contents of stack */ for (pDst = (uint32_t *)fp; - pDst > (((uint32_t *)r12)); + pDst > (((uint32_t *)old_sp)); pDst--) *pDst = pDst[-delta]; /* Track the stack lift in "saved_state". */ saved_state = (x86_saved_state64_t *) (((uintptr_t)saved_state) + (delta << 2)); - -/* Now adjust the value of %r12 in our caller (kernel_trap)'s frame */ - *(rbp - 4) = r12 + (delta << 2); +/* Adjust the stack pointer utilized by the trampolines */ + *lo_spp = old_sp + (delta << 2); retval = KERN_SUCCESS; break; @@ -966,47 +1178,301 @@ fbt_perfCallback( /*ARGSUSED*/ static void -__fbt_provide_module(void *arg, struct modctl *ctl) +__provide_probe_64(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, char *modname, char* symbolName, machine_inst_t* symbolStart) { -#pragma unused(arg) - kernel_mach_header_t *mh; - struct load_command *cmd; - kernel_segment_command_t *orig_ts = NULL, *orig_le = NULL; - struct symtab_command *orig_st = NULL; - struct nlist_64 *sym = NULL; - char *strings; - uintptr_t instrLow, instrHigh; - char *modname; - unsigned int i, j; - - int gIgnoreFBTBlacklist = 0; - PE_parse_boot_argn("IgnoreFBTBlacklist", &gIgnoreFBTBlacklist, sizeof (gIgnoreFBTBlacklist)); - - mh = (kernel_mach_header_t *)(ctl->address); - modname = ctl->mod_modname; + unsigned int j; + unsigned int doenable = 0; + dtrace_id_t thisid; - if (0 == ctl->address || 0 == ctl->size) /* Has the linker been jettisoned? */ + fbt_probe_t *newfbt, *retfbt, *entryfbt; + machine_inst_t *instr, *limit, theInstr, i1, i2, i3; + int size; + + for (j = 0, instr = symbolStart, theInstr = 0; + (j < 4) && ((uintptr_t)instr >= instrLow) && (instrHigh > (uintptr_t)(instr + 2)); + j++) { + theInstr = instr[0]; + if (theInstr == FBT_PUSH_RBP || theInstr == FBT_RET || theInstr == FBT_RET_IMM16) + break; + + if ((size = dtrace_instr_size(instr)) <= 0) + break; + + instr += size; + } + + if (theInstr != FBT_PUSH_RBP) return; - + + i1 = instr[1]; + i2 = instr[2]; + i3 = instr[3]; + + limit = (machine_inst_t *)instrHigh; + + if (i1 == FBT_REX_RSP_RBP && i2 == FBT_MOV_RSP_RBP0 && i3 == FBT_MOV_RSP_RBP1) { + instr += 1; /* Advance to the mov %rsp,%rbp */ + theInstr = i1; + } else { + return; + } +#if 0 + else { + /* + * Sometimes, the compiler will schedule an intervening instruction + * in the function prologue. Example: + * + * _mach_vm_read: + * 000006d8 pushl %ebp + * 000006d9 movl $0x00000004,%edx + * 000006de movl %esp,%ebp + * + * Try the next instruction, to see if it is a movl %esp,%ebp + */ + + instr += 1; /* Advance past the pushl %ebp */ + if ((size = dtrace_instr_size(instr)) <= 0) + return; + + instr += size; + + if ((instr + 1) >= limit) + return; + + i1 = instr[0]; + i2 = instr[1]; + + if (!(i1 == FBT_MOVL_ESP_EBP0_V0 && i2 == FBT_MOVL_ESP_EBP1_V0) && + !(i1 == FBT_MOVL_ESP_EBP0_V1 && i2 == FBT_MOVL_ESP_EBP1_V1)) + return; + + /* instr already points at the movl %esp,%ebp */ + theInstr = i1; + } +#endif + thisid = dtrace_probe_lookup(fbt_id, modname, symbolName, FBT_ENTRY); + newfbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); + strlcpy( (char *)&(newfbt->fbtp_name), symbolName, MAX_FBTP_NAME_CHARS ); + + if (thisid != 0) { + /* + * The dtrace_probe previously existed, so we have to hook + * the newfbt entry onto the end of the existing fbt's chain. 
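+ * (Editor's note: this 64-bit path mirrors __provide_probe_32 above,
+ * differing mainly in the prologue it matches -- push %rbp; mov
+ * %rsp,%rbp -- and in emulating via DTRACE_INVOP_MOV_RSP_RBP.)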
+ * If we find an fbt entry that was previously patched to
+ * fire (as indicated by the current patched value), then
+ * we want to enable this newfbt on the spot.
+ */
+ entryfbt = dtrace_probe_arg (fbt_id, thisid);
+ ASSERT (entryfbt != NULL);
+ for(; entryfbt != NULL; entryfbt = entryfbt->fbtp_next) {
+ if (entryfbt->fbtp_currentval == entryfbt->fbtp_patchval)
+ doenable++;
+
+ if (entryfbt->fbtp_next == NULL) {
+ entryfbt->fbtp_next = newfbt;
+ newfbt->fbtp_id = entryfbt->fbtp_id;
+ break;
+ }
+ }
+ }
+ else {
+ /*
+ * The dtrace_probe did not previously exist, so we
+ * create it and hook in the newfbt. Since the probe is
+ * new, we obviously do not need to enable it on the spot.
+ */
+ newfbt->fbtp_id = dtrace_probe_create(fbt_id, modname, symbolName, FBT_ENTRY, FBT_AFRAMES_ENTRY, newfbt);
+ doenable = 0;
+ }
+
+ newfbt->fbtp_patchpoint = instr;
+ newfbt->fbtp_ctl = ctl;
+ newfbt->fbtp_loadcnt = ctl->mod_loadcnt;
+ newfbt->fbtp_rval = DTRACE_INVOP_MOV_RSP_RBP;
+ newfbt->fbtp_savedval = theInstr;
+ newfbt->fbtp_patchval = FBT_PATCHVAL;
+ newfbt->fbtp_currentval = 0;
+ newfbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)];
+ fbt_probetab[FBT_ADDR2NDX(instr)] = newfbt;
+
+ if (doenable)
+ fbt_enable(NULL, newfbt->fbtp_id, newfbt);
+
 /*
- * Employees of dtrace and their families are ineligible. Void
- * where prohibited.
+ * The fbt entry chain is in place, one entry point per symbol.
+ * The fbt return chain can have multiple return points per symbol.
+ * Here we find the end of the fbt return chain.
 */
-
- if (LIT_STRNEQL(modname, "com.apple.driver.dtrace"))
+
+ doenable=0;
+
+ thisid = dtrace_probe_lookup(fbt_id, modname, symbolName, FBT_RETURN);
+ if (thisid != 0) {
+ /* The dtrace_probe previously existed, so we have to
+ * find the end of the existing fbt chain. If we find
+ * an fbt return that was previously patched to fire
+ * (as indicated by the current patched value), then
+ * we want to enable any new fbts on the spot.
+ */
+ retfbt = dtrace_probe_arg (fbt_id, thisid);
+ ASSERT(retfbt != NULL);
+ for (; retfbt != NULL; retfbt = retfbt->fbtp_next) {
+ if (retfbt->fbtp_currentval == retfbt->fbtp_patchval)
+ doenable++;
+ if(retfbt->fbtp_next == NULL)
+ break;
+ }
+ }
+ else {
+ doenable = 0;
+ retfbt = NULL;
+ }
+
+again:
+ if (instr >= limit)
 return;
-
- if (strstr(modname, "CHUD") != NULL)
+
+ /*
+ * If this disassembly fails, then we've likely walked off into
+ * a jump table or some other unsuitable area. Bail out of the
+ * disassembly now.
+ */
+ if ((size = dtrace_instr_size(instr)) <= 0)
+ return;
+
+ /*
+ * We (desperately) want to avoid erroneously instrumenting a
+ * jump table, especially given that our markers are pretty
+ * short: two bytes on x86, and just one byte on amd64. To
+ * determine if we're looking at a true instruction sequence
+ * or an inline jump table that happens to contain the same
+ * byte sequences, we resort to some heuristic sleaze: we
+ * treat this instruction as being contained within a pointer,
+ * and see if that pointer points to within the body of the
+ * function. If it does, we refuse to instrument it.
+ */
+ for (j = 0; j < sizeof (uintptr_t); j++) {
+ uintptr_t check = (uintptr_t)instr - j;
+ uint8_t *ptr;
+
+ if (check < (uintptr_t)symbolStart)
+ break;
+
+ if (check + sizeof (uintptr_t) > (uintptr_t)limit)
+ continue;
+
+ ptr = *(uint8_t **)check;
+
+ if (ptr >= (uint8_t *)symbolStart && ptr < limit) {
+ instr += size;
+ goto again;
+ }
+ }
+
+ /*
+ * OK, it's an instruction.
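+ * As in the 32-bit scanner above: no byte window decoded to an
+ * intra-function pointer, so classify the instruction at instr below.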
+ */ + theInstr = instr[0]; + + /* Walked onto the start of the next routine? If so, bail out of this function. */ + if (theInstr == FBT_PUSH_RBP) return; + + if (!(size == 1 && (theInstr == FBT_POP_RBP || theInstr == FBT_LEAVE))) { + instr += size; + goto again; + } + + /* + * Found the pop %rbp; or leave. + */ + machine_inst_t *patch_instr = instr; + + /* + * Scan forward for a "ret", or "jmp". + */ + instr += size; + if (instr >= limit) + return; + + size = dtrace_instr_size(instr); + if (size <= 0) /* Failed instruction decode? */ + return; + + theInstr = instr[0]; + + if (!(size == FBT_RET_LEN && (theInstr == FBT_RET)) && + !(size == FBT_RET_IMM16_LEN && (theInstr == FBT_RET_IMM16)) && + !(size == FBT_JMP_SHORT_REL_LEN && (theInstr == FBT_JMP_SHORT_REL)) && + !(size == FBT_JMP_NEAR_REL_LEN && (theInstr == FBT_JMP_NEAR_REL)) && + !(size == FBT_JMP_FAR_ABS_LEN && (theInstr == FBT_JMP_FAR_ABS))) + return; + + /* + * pop %rbp; ret; or leave; ret; or leave; jmp tailCalledFun; -- We have a winner! + */ + newfbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); + strlcpy( (char *)&(newfbt->fbtp_name), symbolName, MAX_FBTP_NAME_CHARS ); + + if (retfbt == NULL) { + newfbt->fbtp_id = dtrace_probe_create(fbt_id, modname, + symbolName, FBT_RETURN, FBT_AFRAMES_RETURN, newfbt); + } else { + retfbt->fbtp_next = newfbt; + newfbt->fbtp_id = retfbt->fbtp_id; + } + + retfbt = newfbt; + newfbt->fbtp_patchpoint = patch_instr; + newfbt->fbtp_ctl = ctl; + newfbt->fbtp_loadcnt = ctl->mod_loadcnt; + + if (*patch_instr == FBT_POP_RBP) { + newfbt->fbtp_rval = DTRACE_INVOP_POP_RBP; + } else { + ASSERT(*patch_instr == FBT_LEAVE); + newfbt->fbtp_rval = DTRACE_INVOP_LEAVE; + } + newfbt->fbtp_roffset = + (uintptr_t)(patch_instr - (uint8_t *)symbolStart); + + newfbt->fbtp_savedval = *patch_instr; + newfbt->fbtp_patchval = FBT_PATCHVAL; + newfbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(patch_instr)]; + fbt_probetab[FBT_ADDR2NDX(patch_instr)] = newfbt; + + if (doenable) + fbt_enable(NULL, newfbt->fbtp_id, newfbt); + + instr += size; + goto again; +} +static void +__kernel_syms_provide_module(void *arg, struct modctl *ctl) +{ +#pragma unused(arg) + kernel_mach_header_t *mh; + struct load_command *cmd; + kernel_segment_command_t *orig_ts = NULL, *orig_le = NULL; + struct symtab_command *orig_st = NULL; + struct nlist_64 *sym = NULL; + char *strings; + uintptr_t instrLow, instrHigh; + char *modname; + unsigned int i; + + mh = (kernel_mach_header_t *)(ctl->mod_address); + modname = ctl->mod_modname; + if (mh->magic != MH_MAGIC_64) return; - + cmd = (struct load_command *) &mh[1]; for (i = 0; i < mh->ncmds; i++) { if (cmd->cmd == LC_SEGMENT_KERNEL) { kernel_segment_command_t *orig_sg = (kernel_segment_command_t *) cmd; - + if (LIT_STRNEQL(orig_sg->segname, SEG_TEXT)) orig_ts = orig_sg; else if (LIT_STRNEQL(orig_sg->segname, SEG_LINKEDIT)) @@ -1016,402 +1482,105 @@ __fbt_provide_module(void *arg, struct modctl *ctl) } else if (cmd->cmd == LC_SYMTAB) orig_st = (struct symtab_command *) cmd; - + cmd = (struct load_command *) ((caddr_t) cmd + cmd->cmdsize); } - + if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL)) return; - + sym = (struct nlist_64 *)(orig_le->vmaddr + orig_st->symoff - orig_le->fileoff); strings = (char *)(orig_le->vmaddr + orig_st->stroff - orig_le->fileoff); - + /* Find extent of the TEXT section */ instrLow = (uintptr_t)orig_ts->vmaddr; instrHigh = (uintptr_t)(orig_ts->vmaddr + orig_ts->vmsize); - + for (i = 0; i < orig_st->nsyms; i++) { - fbt_probe_t *fbt, *retfbt; - machine_inst_t *instr, 
*limit, theInstr, i1, i2, i3; uint8_t n_type = sym[i].n_type & (N_TYPE | N_EXT); char *name = strings + sym[i].n_un.n_strx; - int size; - + /* Check that the symbol is a global and that it has a name. */ if (((N_SECT | N_EXT) != n_type && (N_ABS | N_EXT) != n_type)) continue; - + if (0 == sym[i].n_un.n_strx) /* iff a null, "", name. */ continue; /* Lop off omnipresent leading underscore. */ if (*name == '_') name += 1; - - if (LIT_STRNSTART(name, "dtrace_") && !LIT_STRNSTART(name, "dtrace_safe_")) { - /* - * Anything beginning with "dtrace_" may be called - * from probe context unless it explitly indicates - * that it won't be called from probe context by - * using the prefix "dtrace_safe_". - */ - continue; - } - - if (LIT_STRNSTART(name, "fasttrap_") || - LIT_STRNSTART(name, "fuword") || - LIT_STRNSTART(name, "suword") || - LIT_STRNEQL(name, "sprlock") || - LIT_STRNEQL(name, "sprunlock") || - LIT_STRNEQL(name, "uread") || - LIT_STRNEQL(name, "uwrite")) - continue; /* Fasttrap inner-workings. */ - - if (LIT_STRNSTART(name, "dsmos_")) - continue; /* Don't Steal Mac OS X! */ - - if (LIT_STRNSTART(name, "_dtrace")) - continue; /* Shims in dtrace.c */ - - if (LIT_STRNSTART(name, "chud")) - continue; /* Professional courtesy. */ - - if (LIT_STRNSTART(name, "hibernate_")) - continue; /* Let sleeping dogs lie. */ - if (LIT_STRNEQL(name, "ZN9IOService14newTemperatureElPS_") || /* IOService::newTemperature */ - LIT_STRNEQL(name, "ZN9IOService26temperatureCriticalForZoneEPS_")) /* IOService::temperatureCriticalForZone */ - continue; /* Per the fire code */ - - /* - * Place no probes (illegal instructions) in the exception handling path! - */ - if (LIT_STRNEQL(name, "t_invop") || - LIT_STRNEQL(name, "enter_lohandler") || - LIT_STRNEQL(name, "lo_alltraps") || - LIT_STRNEQL(name, "kernel_trap") || - LIT_STRNEQL(name, "interrupt") || - LIT_STRNEQL(name, "i386_astintr")) - continue; - - if (LIT_STRNEQL(name, "current_thread") || - LIT_STRNEQL(name, "ast_pending") || - LIT_STRNEQL(name, "fbt_perfCallback") || - LIT_STRNEQL(name, "machine_thread_get_kern_state") || - LIT_STRNEQL(name, "get_threadtask") || - LIT_STRNEQL(name, "ml_set_interrupts_enabled") || - LIT_STRNEQL(name, "dtrace_invop") || - LIT_STRNEQL(name, "fbt_invop") || - LIT_STRNEQL(name, "sdt_invop") || - LIT_STRNEQL(name, "max_valid_stack_address")) - continue; - - /* - * Voodoo. 
- */ - if (LIT_STRNSTART(name, "machine_stack_") || - LIT_STRNSTART(name, "mapping_") || - LIT_STRNEQL(name, "tmrCvt") || - - LIT_STRNSTART(name, "tsc_") || - - LIT_STRNSTART(name, "pmCPU") || - LIT_STRNEQL(name, "pmKextRegister") || - LIT_STRNEQL(name, "pmMarkAllCPUsOff") || - LIT_STRNEQL(name, "pmSafeMode") || - LIT_STRNEQL(name, "pmTimerSave") || - LIT_STRNEQL(name, "pmTimerRestore") || - LIT_STRNEQL(name, "pmUnRegister") || - LIT_STRNSTART(name, "pms") || - LIT_STRNEQL(name, "power_management_init") || - LIT_STRNSTART(name, "usimple_") || - LIT_STRNSTART(name, "lck_spin_lock") || - LIT_STRNSTART(name, "lck_spin_unlock") || - - LIT_STRNSTART(name, "rtc_") || - LIT_STRNSTART(name, "_rtc_") || - LIT_STRNSTART(name, "rtclock_") || - LIT_STRNSTART(name, "clock_") || - LIT_STRNSTART(name, "absolutetime_to_") || - LIT_STRNEQL(name, "setPop") || - LIT_STRNEQL(name, "nanoseconds_to_absolutetime") || - LIT_STRNEQL(name, "nanotime_to_absolutetime") || - - LIT_STRNSTART(name, "etimer_") || - - LIT_STRNSTART(name, "commpage_") || - LIT_STRNSTART(name, "pmap_") || - LIT_STRNSTART(name, "ml_") || - LIT_STRNSTART(name, "PE_") || - LIT_STRNEQL(name, "kprintf") || - LIT_STRNSTART(name, "lapic_") || - LIT_STRNSTART(name, "acpi_")) - continue; - - /* - * Avoid machine_ routines. PR_5346750. - */ - if (LIT_STRNSTART(name, "machine_")) - continue; - - if (LIT_STRNEQL(name, "handle_pending_TLB_flushes")) - continue; - - /* - * Place no probes on critical routines. PR_5221096 - */ - if (!gIgnoreFBTBlacklist && - bsearch( name, critical_blacklist, CRITICAL_BLACKLIST_COUNT, sizeof(name), _cmp ) != NULL) - continue; - - /* - * Place no probes that could be hit in probe context. - */ - if (!gIgnoreFBTBlacklist && - bsearch( name, probe_ctx_closure, PROBE_CTX_CLOSURE_COUNT, sizeof(name), _cmp ) != NULL) - continue; - - /* - * Place no probes that could be hit on the way to the debugger. - */ - if (LIT_STRNSTART(name, "kdp_") || - LIT_STRNSTART(name, "kdb_") || - LIT_STRNSTART(name, "kdbg_") || - LIT_STRNSTART(name, "kdebug_") || - LIT_STRNEQL(name, "kernel_debug") || - LIT_STRNEQL(name, "Debugger") || - LIT_STRNEQL(name, "Call_DebuggerC") || - LIT_STRNEQL(name, "lock_debugger") || - LIT_STRNEQL(name, "unlock_debugger") || - LIT_STRNEQL(name, "SysChoked")) - continue; - /* - * Place no probes that could be hit on the way to a panic. + * We're only blacklisting functions in the kernel for now. 
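+	 * The long run of inline prefix and name tests removed here has
+	 * been folded into is_symbol_valid().  (A sketch of the likely
+	 * shape of that check, assuming it consults the same sorted
+	 * tables, critical_blacklist and probe_ctx_closure, that the
+	 * deleted code searched with bsearch(); illustration only:
+	 *
+	 *	if (!gIgnoreFBTBlacklist &&
+	 *	    bsearch(name, critical_blacklist, CRITICAL_BLACKLIST_COUNT,
+	 *	        sizeof (name), _cmp) != NULL)
+	 *		return FALSE;
+	 * )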
*/ - if (NULL != strstr(name, "panic_") || - LIT_STRNEQL(name, "panic") || - LIT_STRNEQL(name, "handleMck") || - LIT_STRNEQL(name, "unresolved_kernel_trap")) + if (MOD_IS_MACH_KERNEL(ctl) && !is_symbol_valid(name)) continue; - if (dtrace_probe_lookup(fbt_id, modname, name, NULL) != 0) - continue; - - for (j = 0, instr = (machine_inst_t *)sym[i].n_value, theInstr = 0; - (j < 4) && ((uintptr_t)instr >= instrLow) && (instrHigh > (uintptr_t)(instr + 2)); - j++) { - theInstr = instr[0]; - if (theInstr == FBT_PUSH_RBP || theInstr == FBT_RET || theInstr == FBT_RET_IMM16) - break; - - if ((size = dtrace_instr_size(instr)) <= 0) - break; - - instr += size; - } - - if (theInstr != FBT_PUSH_RBP) - continue; - - i1 = instr[1]; - i2 = instr[2]; - i3 = instr[3]; - - limit = (machine_inst_t *)instrHigh; + __provide_probe_64(ctl, instrLow, instrHigh, modname, name, (machine_inst_t*)sym[i].n_value); + } +} - if (i1 == FBT_REX_RSP_RBP && i2 == FBT_MOV_RSP_RBP0 && i3 == FBT_MOV_RSP_RBP1) { - instr += 1; /* Advance to the mov %rsp,%rbp */ - theInstr = i1; - } else { - continue; - } -#if 0 - else { +static void +__user_syms_provide_module(void *arg, struct modctl *ctl) +{ +#pragma unused(arg) + char *modname; + unsigned int i; + + modname = ctl->mod_modname; + + dtrace_module_symbols_t* module_symbols = ctl->mod_user_symbols; + if (module_symbols) { + for (i=0; i<module_symbols->dtmodsyms_count; i++) { + dtrace_symbol_t* symbol = &module_symbols->dtmodsyms_symbols[i]; + char* name = symbol->dtsym_name; + + /* Lop off omnipresent leading underscore. */ + if (*name == '_') + name += 1; + /* - * Sometimes, the compiler will schedule an intervening instruction - * in the function prologue. Example: - * - * _mach_vm_read: - * 000006d8 pushl %ebp - * 000006d9 movl $0x00000004,%edx - * 000006de movl %esp,%ebp - * - * Try the next instruction, to see if it is a movl %esp,%ebp + * We're only blacklisting functions in the kernel for now. */ - - instr += 1; /* Advance past the pushl %ebp */ - if ((size = dtrace_instr_size(instr)) <= 0) - continue; - - instr += size; - - if ((instr + 1) >= limit) - continue; - - i1 = instr[0]; - i2 = instr[1]; - - if (!(i1 == FBT_MOVL_ESP_EBP0_V0 && i2 == FBT_MOVL_ESP_EBP1_V0) && - !(i1 == FBT_MOVL_ESP_EBP0_V1 && i2 == FBT_MOVL_ESP_EBP1_V1)) + if (MOD_IS_MACH_KERNEL(ctl) && !is_symbol_valid(name)) continue; - - /* instr already points at the movl %esp,%ebp */ - theInstr = i1; - } -#endif - - fbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); - strlcpy( (char *)&(fbt->fbtp_name), name, MAX_FBTP_NAME_CHARS ); - fbt->fbtp_id = dtrace_probe_create(fbt_id, modname, name, FBT_ENTRY, FBT_AFRAMES_ENTRY, fbt); - fbt->fbtp_patchpoint = instr; - fbt->fbtp_ctl = ctl; - fbt->fbtp_loadcnt = ctl->mod_loadcnt; - fbt->fbtp_rval = DTRACE_INVOP_MOV_RSP_RBP; - fbt->fbtp_savedval = theInstr; - fbt->fbtp_patchval = FBT_PATCHVAL; - - fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)]; - fbt->fbtp_symndx = i; - fbt_probetab[FBT_ADDR2NDX(instr)] = fbt; - - retfbt = NULL; -again: - if (instr >= limit) - continue; - - /* - * If this disassembly fails, then we've likely walked off into - * a jump table or some other unsuitable area. Bail out of the - * disassembly now. - */ - if ((size = dtrace_instr_size(instr)) <= 0) - continue; - - /* - * We (desperately) want to avoid erroneously instrumenting a - * jump table, especially given that our markers are pretty - * short: two bytes on x86, and just one byte on amd64. 
To - * determine if we're looking at a true instruction sequence - * or an inline jump table that happens to contain the same - * byte sequences, we resort to some heuristic sleeze: we - * treat this instruction as being contained within a pointer, - * and see if that pointer points to within the body of the - * function. If it does, we refuse to instrument it. - */ - for (j = 0; j < sizeof (uintptr_t); j++) { - uintptr_t check = (uintptr_t)instr - j; - uint8_t *ptr; - - if (check < sym[i].n_value) - break; - - if (check + sizeof (uintptr_t) > (uintptr_t)limit) - continue; - - ptr = *(uint8_t **)check; - - if (ptr >= (uint8_t *)sym[i].n_value && ptr < limit) { - instr += size; - goto again; - } - } - - /* - * OK, it's an instruction. - */ - theInstr = instr[0]; - - /* Walked onto the start of the next routine? If so, bail out of this function. */ - if (theInstr == FBT_PUSH_RBP) - continue; - - if (!(size == 1 && (theInstr == FBT_POP_RBP || theInstr == FBT_LEAVE))) { - instr += size; - goto again; - } - - /* - * Found the pop %rbp; or leave. - */ - machine_inst_t *patch_instr = instr; - - /* - * Scan forward for a "ret", or "jmp". - */ - instr += size; - if (instr >= limit) - continue; - - size = dtrace_instr_size(instr); - if (size <= 0) /* Failed instruction decode? */ - continue; - - theInstr = instr[0]; - - if (!(size == FBT_RET_LEN && (theInstr == FBT_RET)) && - !(size == FBT_RET_IMM16_LEN && (theInstr == FBT_RET_IMM16)) && - !(size == FBT_JMP_SHORT_REL_LEN && (theInstr == FBT_JMP_SHORT_REL)) && - !(size == FBT_JMP_NEAR_REL_LEN && (theInstr == FBT_JMP_NEAR_REL)) && - !(size == FBT_JMP_FAR_ABS_LEN && (theInstr == FBT_JMP_FAR_ABS))) - continue; - - /* - * pop %rbp; ret; or leave; ret; or leave; jmp tailCalledFun; -- We have a winner! - */ - fbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); - strlcpy( (char *)&(fbt->fbtp_name), name, MAX_FBTP_NAME_CHARS ); - - if (retfbt == NULL) { - fbt->fbtp_id = dtrace_probe_create(fbt_id, modname, - name, FBT_RETURN, FBT_AFRAMES_RETURN, fbt); - } else { - retfbt->fbtp_next = fbt; - fbt->fbtp_id = retfbt->fbtp_id; - } - - retfbt = fbt; - fbt->fbtp_patchpoint = patch_instr; - fbt->fbtp_ctl = ctl; - fbt->fbtp_loadcnt = ctl->mod_loadcnt; - - if (*patch_instr == FBT_POP_RBP) { - fbt->fbtp_rval = DTRACE_INVOP_POP_RBP; - } else { - ASSERT(*patch_instr == FBT_LEAVE); - fbt->fbtp_rval = DTRACE_INVOP_LEAVE; + + __provide_probe_64(ctl, (uintptr_t)symbol->dtsym_addr, (uintptr_t)(symbol->dtsym_addr + symbol->dtsym_size), modname, name, (machine_inst_t*)(uintptr_t)symbol->dtsym_addr); } - fbt->fbtp_roffset = - (uintptr_t)(patch_instr - (uint8_t *)sym[i].n_value); - - fbt->fbtp_savedval = *patch_instr; - fbt->fbtp_patchval = FBT_PATCHVAL; - fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(patch_instr)]; - fbt->fbtp_symndx = i; - fbt_probetab[FBT_ADDR2NDX(patch_instr)] = fbt; - - instr += size; - goto again; } } #else #error Unknown arch #endif -extern struct modctl g_fbt_kernctl; -#undef kmem_alloc /* from its binding to dt_kmem_alloc glue */ -#undef kmem_free /* from its binding to dt_kmem_free glue */ -#include <vm/vm_kern.h> +extern int dtrace_kernel_symbol_mode; /*ARGSUSED*/ void fbt_provide_module(void *arg, struct modctl *ctl) { -#pragma unused(ctl) - __fbt_provide_module(arg, &g_fbt_kernctl); + ASSERT(ctl != NULL); + ASSERT(dtrace_kernel_symbol_mode != DTRACE_KERNEL_SYMBOLS_NEVER); + lck_mtx_assert(&mod_lock, LCK_MTX_ASSERT_OWNED); - if ( (vm_offset_t)g_fbt_kernctl.address != (vm_offset_t )NULL ) - kmem_free(kernel_map, (vm_offset_t)g_fbt_kernctl.address, 
round_page(g_fbt_kernctl.size));
-	g_fbt_kernctl.address = 0;
-	g_fbt_kernctl.size = 0;
+	if (MOD_FBT_DONE(ctl))
+		return;
+
+	if (!is_module_valid(ctl)) {
+		ctl->mod_flags |= MODCTL_FBT_INVALID;
+		return;
+	}
+
+	if (MOD_HAS_KERNEL_SYMBOLS(ctl)) {
+		__kernel_syms_provide_module(arg, ctl);
+		ctl->mod_flags |= MODCTL_FBT_PROBES_PROVIDED;
+		return;
+	}
+
+	if (MOD_HAS_USERSPACE_SYMBOLS(ctl)) {
+		__user_syms_provide_module(arg, ctl);
+		ctl->mod_flags |= MODCTL_FBT_PROBES_PROVIDED;
+		return;
+	}
 }
diff --git a/bsd/dev/i386/mem.c b/bsd/dev/i386/mem.c
index e598cdf67..4b4589295 100644
--- a/bsd/dev/i386/mem.c
+++ b/bsd/dev/i386/mem.c
@@ -88,7 +88,9 @@
 extern addr64_t kvtophys(vm_offset_t va);
 extern boolean_t kernacc(off_t, size_t );
+#if !defined(SECURE_KERNEL)
 extern int setup_kmem;
+#endif
 static caddr_t devzerobuf;
@@ -117,8 +119,11 @@ mmioctl(dev_t dev, u_long cmd, __unused caddr_t data,
 {
 	int minnum = minor(dev);
-	if ((setup_kmem == 0) && ((minnum == 0) || (minnum == 1)))
-		return(EINVAL);
+	if ((minnum == 0) || (minnum == 1)) {
+#if !defined(SECURE_KERNEL)
+		if (setup_kmem == 0)
+			return(EINVAL);
+#endif
+	}
 
 	switch (cmd) {
 	case FIONBIO:
@@ -149,8 +154,12 @@ mmrw(dev_t dev, struct uio *uio, enum uio_rw rw)
 	/* minor device 0 is physical memory */
 	case 0:
+#if defined(SECURE_KERNEL)
+		return(ENODEV);
+#else
 		if (setup_kmem == 0)
 			return(ENODEV);
+#endif
 		v = trunc_page(uio->uio_offset);
 		if (uio->uio_offset >= (off_t)mem_size)
@@ -169,8 +178,12 @@
 	/* minor device 1 is kernel memory */
 	case 1:
+#if defined(SECURE_KERNEL)
+		return(ENODEV);
+#else
 		if (setup_kmem == 0)
 			return(ENODEV);
+#endif
 		/* Do some sanity checking */
 		if (((vm_address_t)uio->uio_offset >= VM_MAX_KERNEL_ADDRESS) ||
 			((vm_address_t)uio->uio_offset <= VM_MIN_KERNEL_AND_KEXT_ADDRESS))
diff --git a/bsd/dev/i386/munge.s b/bsd/dev/i386/munge.s
index d174c06e3..9df397097 100644
--- a/bsd/dev/i386/munge.s
+++ b/bsd/dev/i386/munge.s
@@ -140,16 +140,92 @@ Entry(munge_wl)
 /* Costs an extra w move to do this */
 ENTRY(munge_wlw)
 	movl	8(%esp),%ecx	// get &uu_args
 	xorl	%edx,%edx
-	movl	12(%ecx),%eax
+Lwlw:
+	movl	12(%ecx),%eax	//l
 	movl	%eax,16(%ecx)
 	movl	%edx,20(%ecx)
-	movl	8(%ecx),%eax
+Lwl:
+	movl	8(%ecx),%eax	//l
 	movl	%eax,12(%ecx)
 	movl	4(%ecx),%eax
 	movl	%eax,8(%ecx)
-	movl	%edx,4(%ecx)
+	movl	%edx,4(%ecx)	//w
 	ret
+ENTRY(munge_wlwwwll)
+	movl	8(%esp),%ecx	// get &uu_args
+	xorl	%edx,%edx
+Lwlwwwll:
+	movl	36(%ecx),%eax
+	movl	%eax,52(%ecx)
+	movl	32(%ecx),%eax
+	movl	%eax,48(%ecx)
+	movl	28(%ecx),%eax
+	movl	%eax,44(%ecx)
+	movl	24(%ecx),%eax
+	movl	%eax,40(%ecx)
+	movl	20(%ecx),%eax
+	movl	%eax,32(%ecx)
+	movl	%edx,36(%ecx)
+Lwlww:
+	movl	16(%ecx),%eax
+	movl	%eax,24(%ecx)
+	movl	%edx,28(%ecx)
+	jmp	Lwlw
+
+ENTRY(munge_wlwwwllw)
+	movl	8(%esp),%ecx	// get &uu_args
+	xorl	%edx,%edx
+	movl	40(%ecx),%eax
+	movl	%eax,56(%ecx)
+	movl	%edx,60(%ecx)
+	jmp	Lwlwwwll
+
+ENTRY(munge_wlwwlwlw)
+	movl	8(%esp),%ecx	// get &uu_args
+	xorl	%edx,%edx
+	movl	40(%ecx),%eax
+	movl	%eax,56(%ecx)
+	movl	%edx,60(%ecx)
+	movl	36(%ecx),%eax
+	movl	%eax,52(%ecx)
+	movl	32(%ecx),%eax
+	movl	%eax,48(%ecx)
+	movl	28(%ecx),%eax
+	movl	%eax,40(%ecx)
+	movl	%edx,44(%ecx)
+	movl	24(%ecx),%eax
+	movl	%eax,36(%ecx)
+	movl	20(%ecx),%eax
+	movl	%eax,32(%ecx)
+	jmp	Lwlww
+
+ENTRY(munge_wllwwll)
+	movl	8(%esp),%ecx	// get &uu_args
+	xorl	%edx,%edx
+
+	movl	40(%ecx),%eax	// l
+	movl	%eax,52(%ecx)
+	movl	36(%ecx),%eax
+	movl	%eax,48(%ecx)
+	movl	32(%ecx),%eax	// l
+	movl	%eax,44(%ecx)
+	movl	28(%ecx),%eax
+	movl	%eax,40(%ecx)
+
+	movl	24(%ecx),%eax	//w
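+	// (Note, illustration only: each munge_* routine widens a packed
+	// list of 32-bit user arguments in place into 64-bit slots,
+	// copying from the highest argument down so no slot is
+	// overwritten before it is read.  A 'w' (32-bit word) gets %edx,
+	// zeroed above, as its high word; an 'l' (64-bit long) moves as a
+	// low/high pair of 32-bit words.  The shared labels (Lwlw, Lwl,
+	// Lwlww, Lw5, Lw6, ...) let routines with a common suffix jump
+	// into one another's tails.)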
+	movl	%eax,32(%ecx)
+	movl	%edx,36(%ecx)
+	movl	20(%ecx),%eax	//w
+	movl	%eax,24(%ecx)
+	movl	%edx,28(%ecx)
+
+	movl	16(%ecx),%eax	//l
+	movl	%eax,20(%ecx)
+	movl	12(%ecx),%eax
+	movl	%eax,16(%ecx)
+	jmp	Lwl
+
 Entry(munge_wwwlw)
 	movl	8(%esp),%ecx	// get &uu_args
 	xorl	%edx,%edx
@@ -195,6 +271,63 @@ ENTRY(munge_wwwwwl)
 	movl	%eax,44(%ecx)
 	jmp	Lw5
+ENTRY(munge_wwwwwlww)
+	movl	8(%esp),%ecx	// get &uu_args
+	xorl	%edx,%edx
+	movl	32(%ecx),%eax
+	movl	%eax,56(%ecx)
+	movl	%edx,60(%ecx)
+	movl	28(%ecx),%eax
+	movl	%eax,48(%ecx)
+	movl	%edx,52(%ecx)
+	movl	20(%ecx),%eax
+	movl	%eax,40(%ecx)
+	movl	24(%ecx),%eax
+	movl	%eax,44(%ecx)
+	jmp	Lw5
+
+ENTRY(munge_wwwwwllw)
+	movl	8(%esp),%ecx	// get &uu_args
+	xorl	%edx,%edx
+	movl	36(%ecx),%eax
+	movl	%eax,56(%ecx)
+	movl	%edx,60(%ecx)
+	movl	28(%ecx),%eax
+	movl	%eax,48(%ecx)
+	movl	32(%ecx),%eax
+	movl	%eax,52(%ecx)
+	movl	20(%ecx),%eax
+	movl	%eax,40(%ecx)
+	movl	24(%ecx),%eax
+	movl	%eax,44(%ecx)
+	jmp	Lw5
+
+ENTRY(munge_wwwwwlll)
+	movl	8(%esp),%ecx	// get &uu_args
+	xorl	%edx,%edx
+	movl	36(%ecx),%eax
+	movl	%eax,56(%ecx)
+	movl	40(%ecx),%eax
+	movl	%eax,60(%ecx)
+	movl	28(%ecx),%eax
+	movl	%eax,48(%ecx)
+	movl	32(%ecx),%eax
+	movl	%eax,52(%ecx)
+	movl	20(%ecx),%eax
+	movl	%eax,40(%ecx)
+	movl	24(%ecx),%eax
+	movl	%eax,44(%ecx)
+	jmp	Lw5
+
+ENTRY(munge_wwwwwwl)
+	movl	8(%esp),%ecx	// get &uu_args
+	xorl	%edx,%edx
+	movl	24(%ecx),%eax
+	movl	%eax,48(%ecx)
+	movl	28(%ecx),%eax
+	movl	%eax,52(%ecx)
+	jmp	Lw6
+
 ENTRY(munge_wwwwwwlw)
 	movl	8(%esp),%ecx	// get &uu_args
 	xorl	%edx,%edx
diff --git a/bsd/dev/i386/sdt_x86.c b/bsd/dev/i386/sdt_x86.c
index c354b303e..680ed779b 100644
--- a/bsd/dev/i386/sdt_x86.c
+++ b/bsd/dev/i386/sdt_x86.c
@@ -107,3 +107,115 @@ sdt_invop(uintptr_t addr, uintptr_t *stack, uintptr_t eax)
 
 #endif
 
+struct frame {
+	struct frame *backchain;
+	uintptr_t retaddr;
+};
+
+/*ARGSUSED*/
+uint64_t
+sdt_getarg(void *arg, dtrace_id_t id, void *parg, int argno, int aframes)
+{
+#pragma unused(arg, id, parg)
+	uint64_t val;
+	struct frame *fp = (struct frame *)__builtin_frame_address(0);
+	uintptr_t *stack;
+	uintptr_t pc;
+	int i;
+
+#if defined(__x86_64__)
+	/*
+	 * A total of 6 arguments are passed via registers; any argument with
+	 * index of 5 or lower is therefore in a register.
+	 */
+	int inreg = 5;
+#endif
+
+	for (i = 1; i <= aframes; i++) {
+		fp = fp->backchain;
+		pc = fp->retaddr;
+
+		if (dtrace_invop_callsite_pre != NULL
+			&& pc > (uintptr_t)dtrace_invop_callsite_pre
+			&& pc <= (uintptr_t)dtrace_invop_callsite_post) {
+#if defined(__i386__)
+			/*
+			 * If we pass through the invalid op handler, we will
+			 * use the pointer that it passed to the stack as the
+			 * second argument to dtrace_invop() as the pointer to
+			 * the frame we're hunting for.
+			 */
+
+			stack = (uintptr_t *)&fp[1]; /* Find marshalled arguments */
+			fp = (struct frame *)stack[1]; /* Grab *second* argument */
+			stack = (uintptr_t *)&fp[0]; /* Find marshalled arguments */
+#elif defined(__x86_64__)
+			/*
+			 * In the case of x86_64, we will use the pointer to the
+			 * save area structure that was pushed when we took the
+			 * trap.  To get this structure, we must increment
+			 * beyond the frame structure.  If the
+			 * argument that we're seeking is passed on the stack,
+			 * we'll pull the true stack pointer out of the saved
+			 * registers and decrement our argument by the number
+			 * of arguments passed in registers; if the argument
+			 * we're seeking is passed in registers, we can just
+			 * load it directly.
+			 */
+
+			/* fp points to frame of dtrace_invop() activation.
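+			 *
+			 * (A sketch of the activation stack being walked,
+			 * hypothetical layout for illustration only:
+			 *
+			 *	sdt_getarg()        <- __builtin_frame_address(0)
+			 *	  ... aframes hops ...
+			 *	dtrace_invop()      <- fp at this point
+			 *	fbt_perfCallback()
+			 *	kernel_trap()
+			 *	trap_from_kernel()  <- x86_saved_state_t pushed
+			 *	                       just above this frame
+			 *
+			 * The three backchain hops below climb from the
+			 * dtrace_invop() activation to trap_from_kernel(),
+			 * whose adjacent save area holds the trapped
+			 * function's register arguments.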
*/ + fp = fp->backchain; /* to fbt_perfcallback() activation. */ + fp = fp->backchain; /* to kernel_trap() activation. */ + fp = fp->backchain; /* to trap_from_kernel() activation. */ + + x86_saved_state_t *tagged_regs = (x86_saved_state_t *)&fp[1]; + x86_saved_state64_t *saved_state = saved_state64(tagged_regs); + + if (argno <= inreg) { + stack = (uintptr_t *)&saved_state->rdi; + } else { + fp = (struct frame *)(saved_state->isf.rsp); + stack = (uintptr_t *)&fp[0]; /* Find marshalled + arguments */ + argno -= (inreg +1); + } +#else +#error Unknown arch +#endif + goto load; + } + } + + /* + * We know that we did not come through a trap to get into + * dtrace_probe() -- We arrive here when the provider has + * called dtrace_probe() directly. + * The probe ID is the first argument to dtrace_probe(). + * We must advance beyond that to get the argX. + */ + argno++; /* Advance past probeID */ + +#if defined(__x86_64__) + if (argno <= inreg) { + /* + * This shouldn't happen. If the argument is passed in a + * register then it should have been, well, passed in a + * register... + */ + DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); + return (0); + } + + argno -= (inreg + 1); +#endif + stack = (uintptr_t *)&fp[1]; /* Find marshalled arguments */ + +load: + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + /* dtrace_probe arguments arg0 ... arg4 are 64bits wide */ + val = (uint64_t)(*(((uintptr_t *)stack) + argno)); + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + + return (val); +} + diff --git a/bsd/dev/i386/sysctl.c b/bsd/dev/i386/sysctl.c index c255529a1..ba3bfc1ee 100644 --- a/bsd/dev/i386/sysctl.c +++ b/bsd/dev/i386/sysctl.c @@ -33,6 +33,9 @@ #include <i386/cpuid.h> #include <i386/tsc.h> #include <i386/machine_routines.h> +#include <i386/ucode.h> +#include <kern/clock.h> +#include <libkern/libkern.h> static int _i386_cpu_info SYSCTL_HANDLER_ARGS @@ -201,6 +204,42 @@ cpu_flex_ratio_max SYSCTL_HANDLER_ARGS return SYSCTL_OUT(req, &flex_ratio_max, sizeof(flex_ratio_max)); } +static int +cpu_ucode_update SYSCTL_HANDLER_ARGS +{ + __unused struct sysctl_oid *unused_oidp = oidp; + __unused void *unused_arg1 = arg1; + __unused int unused_arg2 = arg2; + uint64_t addr; + int error; + + error = SYSCTL_IN(req, &addr, sizeof(addr)); + if (error) + return error; + + int ret = ucode_interface(addr); + return ret; +} + +extern uint64_t panic_restart_timeout; +static int +panic_set_restart_timeout(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + int new_value = 0, old_value = 0, changed = 0, error; + uint64_t nstime; + + if (panic_restart_timeout) { + absolutetime_to_nanoseconds(panic_restart_timeout, &nstime); + old_value = nstime / NSEC_PER_SEC; + } + + error = sysctl_io_number(req, old_value, sizeof(int), &new_value, &changed); + if (error == 0 && changed) { + nanoseconds_to_absolutetime(((uint64_t)new_value) * NSEC_PER_SEC, &panic_restart_timeout); + } + return error; +} + /* * Populates the {CPU, vector, latency} triple for the maximum observed primary * interrupt latency @@ -226,107 +265,113 @@ misc_interrupt_latency_max(__unused struct sysctl_oid *oidp, __unused void *arg1 SYSCTL_NODE(_machdep, OID_AUTO, cpu, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "CPU info"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, max_basic, CTLTYPE_INT | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, max_basic, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_max_basic),sizeof(uint32_t), i386_cpu_info, "IU", "Max Basic Information value"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, 
max_ext, CTLTYPE_INT | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, max_ext, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_max_ext), sizeof(uint32_t), i386_cpu_info, "IU", "Max Extended Function Information value"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, vendor, CTLTYPE_STRING | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, vendor, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_vendor), 0, i386_cpu_info, "A", "CPU vendor"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, brand_string, CTLTYPE_STRING | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, brand_string, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_brand_string), 0, i386_cpu_info, "A", "CPU brand string"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, family, CTLTYPE_INT | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, family, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_family), sizeof(uint8_t), i386_cpu_info, "I", "CPU family"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, model, CTLTYPE_INT | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, model, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_model), sizeof(uint8_t), i386_cpu_info, "I", "CPU model"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, extmodel, CTLTYPE_INT | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, extmodel, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_extmodel), sizeof(uint8_t), i386_cpu_info, "I", "CPU extended model"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, extfamily, CTLTYPE_INT | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, extfamily, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_extfamily), sizeof(uint8_t), i386_cpu_info, "I", "CPU extended family"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, stepping, CTLTYPE_INT | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, stepping, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_stepping), sizeof(uint8_t), i386_cpu_info, "I", "CPU stepping"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, feature_bits, CTLTYPE_QUAD | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, feature_bits, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_features), sizeof(uint64_t), i386_cpu_info, "IU", "CPU features"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, extfeature_bits, CTLTYPE_QUAD | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, extfeature_bits, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_extfeatures), sizeof(uint64_t), i386_cpu_info, "IU", "CPU extended features"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, signature, CTLTYPE_INT | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, signature, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_signature), sizeof(uint32_t), i386_cpu_info, "I", "CPU signature"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, brand, CTLTYPE_INT | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, brand, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_brand), sizeof(uint8_t), i386_cpu_info, "I", "CPU brand"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, features, CTLTYPE_STRING | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, OID_AUTO, features, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, cpu_features, "A", "CPU feature names"); -SYSCTL_PROC(_machdep_cpu, OID_AUTO, extfeatures, CTLTYPE_STRING | CTLFLAG_RD, +SYSCTL_PROC(_machdep_cpu, 
OID_AUTO, extfeatures, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, cpu_extfeatures, "A", "CPU extended feature names"); SYSCTL_PROC(_machdep_cpu, OID_AUTO, logical_per_package, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, cpu_logical_per_package, "I", "CPU logical cpus per package"); SYSCTL_PROC(_machdep_cpu, OID_AUTO, cores_per_package, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_cores_per_package), sizeof(uint32_t), i386_cpu_info, "I", "CPU cores per package"); SYSCTL_PROC(_machdep_cpu, OID_AUTO, microcode_version, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_microcode_version), sizeof(uint32_t), i386_cpu_info, "I", "Microcode version number"); +SYSCTL_PROC(_machdep_cpu, OID_AUTO, processor_flag, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, + (void *)offsetof(i386_cpu_info_t, cpuid_processor_flag), + sizeof(uint32_t), + i386_cpu_info, "I", "CPU processor flag"); + SYSCTL_NODE(_machdep_cpu, OID_AUTO, mwait, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "mwait"); SYSCTL_PROC(_machdep_cpu_mwait, OID_AUTO, linesize_min, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_mwait_leaf_t, linesize_min), sizeof(uint32_t), cpu_mwait, "I", "Monitor/mwait minimum line size"); SYSCTL_PROC(_machdep_cpu_mwait, OID_AUTO, linesize_max, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_mwait_leaf_t, linesize_max), sizeof(uint32_t), cpu_mwait, "I", "Monitor/mwait maximum line size"); SYSCTL_PROC(_machdep_cpu_mwait, OID_AUTO, extensions, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_mwait_leaf_t, extensions), sizeof(uint32_t), cpu_mwait, "I", "Monitor/mwait extensions"); SYSCTL_PROC(_machdep_cpu_mwait, OID_AUTO, sub_Cstates, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_mwait_leaf_t, sub_Cstates), sizeof(uint32_t), cpu_mwait, "I", "Monitor/mwait sub C-states"); @@ -336,31 +381,31 @@ SYSCTL_NODE(_machdep_cpu, OID_AUTO, thermal, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "thermal"); SYSCTL_PROC(_machdep_cpu_thermal, OID_AUTO, sensor, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_thermal_leaf_t, sensor), sizeof(boolean_t), cpu_thermal, "I", "Thermal sensor present"); SYSCTL_PROC(_machdep_cpu_thermal, OID_AUTO, dynamic_acceleration, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_thermal_leaf_t, dynamic_acceleration), sizeof(boolean_t), cpu_thermal, "I", "Dynamic Acceleration Technology (Turbo Mode)"); SYSCTL_PROC(_machdep_cpu_thermal, OID_AUTO, invariant_APIC_timer, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_thermal_leaf_t, invariant_APIC_timer), sizeof(boolean_t), cpu_thermal, "I", "Invariant APIC Timer"); SYSCTL_PROC(_machdep_cpu_thermal, OID_AUTO, thresholds, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_thermal_leaf_t, thresholds), sizeof(uint32_t), cpu_thermal, "I", "Number of interrupt thresholds"); SYSCTL_PROC(_machdep_cpu_thermal, OID_AUTO, ACNT_MCNT, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_thermal_leaf_t, ACNT_MCNT), sizeof(boolean_t), cpu_thermal, "I", "ACNT_MCNT capability"); @@ -410,43 +455,43 @@ 
SYSCTL_NODE(_machdep_cpu, OID_AUTO, arch_perf, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "arch_perf"); SYSCTL_PROC(_machdep_cpu_arch_perf, OID_AUTO, version, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_arch_perf_leaf_t, version), sizeof(uint8_t), cpu_arch_perf, "I", "Architectural Performance Version Number"); SYSCTL_PROC(_machdep_cpu_arch_perf, OID_AUTO, number, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_arch_perf_leaf_t, number), sizeof(uint8_t), cpu_arch_perf, "I", "Number of counters per logical cpu"); SYSCTL_PROC(_machdep_cpu_arch_perf, OID_AUTO, width, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_arch_perf_leaf_t, width), sizeof(uint8_t), cpu_arch_perf, "I", "Bit width of counters"); SYSCTL_PROC(_machdep_cpu_arch_perf, OID_AUTO, events_number, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_arch_perf_leaf_t, events_number), sizeof(uint8_t), cpu_arch_perf, "I", "Number of monitoring events"); SYSCTL_PROC(_machdep_cpu_arch_perf, OID_AUTO, events, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_arch_perf_leaf_t, events), sizeof(uint32_t), cpu_arch_perf, "I", "Bit vector of events"); SYSCTL_PROC(_machdep_cpu_arch_perf, OID_AUTO, fixed_number, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_arch_perf_leaf_t, fixed_number), sizeof(uint8_t), cpu_arch_perf, "I", "Number of fixed-function counters"); SYSCTL_PROC(_machdep_cpu_arch_perf, OID_AUTO, fixed_width, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(cpuid_arch_perf_leaf_t, fixed_width), sizeof(uint8_t), cpu_arch_perf, "I", "Bit-width of fixed-function counters"); @@ -456,19 +501,19 @@ SYSCTL_NODE(_machdep_cpu, OID_AUTO, cache, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "cache"); SYSCTL_PROC(_machdep_cpu_cache, OID_AUTO, linesize, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_cache_linesize), sizeof(uint32_t), i386_cpu_info, "I", "Cacheline size"); SYSCTL_PROC(_machdep_cpu_cache, OID_AUTO, L2_associativity, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_cache_L2_associativity), sizeof(uint32_t), i386_cpu_info, "I", "L2 cache associativity"); SYSCTL_PROC(_machdep_cpu_cache, OID_AUTO, size, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_cache_size), sizeof(uint32_t), i386_cpu_info, "I", "Cache size (in Kbytes)"); @@ -482,7 +527,7 @@ SYSCTL_NODE(_machdep_cpu_tlb, OID_AUTO, data, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "data"); SYSCTL_PROC(_machdep_cpu_tlb_inst, OID_AUTO, small, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_tlb[TLB_INST][TLB_SMALL][0]), sizeof(uint32_t), @@ -490,7 +535,7 @@ SYSCTL_PROC(_machdep_cpu_tlb_inst, OID_AUTO, small, "Number of small page instruction TLBs"); SYSCTL_PROC(_machdep_cpu_tlb_data, OID_AUTO, small, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_tlb[TLB_DATA][TLB_SMALL][0]), sizeof(uint32_t), @@ -498,7 +543,7 @@ SYSCTL_PROC(_machdep_cpu_tlb_data, OID_AUTO, small, "Number of small page data TLBs (1st level)"); SYSCTL_PROC(_machdep_cpu_tlb_data, OID_AUTO, small_level1, - CTLTYPE_INT 
| CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_tlb[TLB_DATA][TLB_SMALL][1]), sizeof(uint32_t), @@ -506,7 +551,7 @@ SYSCTL_PROC(_machdep_cpu_tlb_data, OID_AUTO, small_level1, "Number of small page data TLBs (2nd level)"); SYSCTL_PROC(_machdep_cpu_tlb_inst, OID_AUTO, large, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_tlb[TLB_INST][TLB_LARGE][0]), sizeof(uint32_t), @@ -514,7 +559,7 @@ SYSCTL_PROC(_machdep_cpu_tlb_inst, OID_AUTO, large, "Number of large page instruction TLBs"); SYSCTL_PROC(_machdep_cpu_tlb_data, OID_AUTO, large, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_tlb[TLB_DATA][TLB_LARGE][0]), sizeof(uint32_t), @@ -522,7 +567,7 @@ SYSCTL_PROC(_machdep_cpu_tlb_data, OID_AUTO, large, "Number of large page data TLBs (1st level)"); SYSCTL_PROC(_machdep_cpu_tlb_data, OID_AUTO, large_level1, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_tlb[TLB_DATA][TLB_LARGE][1]), sizeof(uint32_t), @@ -530,7 +575,7 @@ SYSCTL_PROC(_machdep_cpu_tlb_data, OID_AUTO, large_level1, "Number of large page data TLBs (2nd level)"); SYSCTL_PROC(_machdep_cpu_tlb, OID_AUTO, shared, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_stlb), sizeof(uint32_t), i386_cpu_info_nonzero, "I", @@ -541,26 +586,26 @@ SYSCTL_NODE(_machdep_cpu, OID_AUTO, address_bits, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "address_bits"); SYSCTL_PROC(_machdep_cpu_address_bits, OID_AUTO, physical, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_address_bits_physical), sizeof(uint32_t), i386_cpu_info, "I", "Number of physical address bits"); SYSCTL_PROC(_machdep_cpu_address_bits, OID_AUTO, virtual, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, cpuid_address_bits_virtual), sizeof(uint32_t), i386_cpu_info, "I", "Number of virtual address bits"); SYSCTL_PROC(_machdep_cpu, OID_AUTO, core_count, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, core_count), sizeof(uint32_t), i386_cpu_info, "I", "Number of enabled cores per package"); SYSCTL_PROC(_machdep_cpu, OID_AUTO, thread_count, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, (void *)offsetof(i386_cpu_info_t, thread_count), sizeof(uint32_t), i386_cpu_info, "I", "Number of enabled threads per package"); @@ -569,34 +614,40 @@ SYSCTL_NODE(_machdep_cpu, OID_AUTO, flex_ratio, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Flex ratio"); SYSCTL_PROC(_machdep_cpu_flex_ratio, OID_AUTO, desired, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, cpu_flex_ratio_desired, "I", "Flex ratio desired (0 disabled)"); SYSCTL_PROC(_machdep_cpu_flex_ratio, OID_AUTO, min, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, cpu_flex_ratio_min, "I", "Flex ratio min (efficiency)"); SYSCTL_PROC(_machdep_cpu_flex_ratio, OID_AUTO, max, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, cpu_flex_ratio_max, "I", "Flex ratio max (non-turbo)"); +SYSCTL_PROC(_machdep_cpu, OID_AUTO, ucupdate, + CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED, 0, 0, + cpu_ucode_update, "S", "Microcode update interface"); + uint64_t pmap_pv_hashlist_walks; uint64_t 
pmap_pv_hashlist_cnts; uint32_t pmap_pv_hashlist_max; uint32_t pmap_kernel_text_ps = PAGE_SIZE; +extern uint32_t pv_hashed_kern_low_water_mark; /*extern struct sysctl_oid_list sysctl__machdep_pmap_children;*/ SYSCTL_NODE(_machdep, OID_AUTO, pmap, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "PMAP info"); -SYSCTL_QUAD (_machdep_pmap, OID_AUTO, hashwalks, CTLFLAG_RD | CTLFLAG_KERN, &pmap_pv_hashlist_walks, ""); -SYSCTL_QUAD (_machdep_pmap, OID_AUTO, hashcnts, CTLFLAG_RD | CTLFLAG_KERN, &pmap_pv_hashlist_cnts, ""); -SYSCTL_INT (_machdep_pmap, OID_AUTO, hashmax, CTLFLAG_RD | CTLFLAG_KERN, &pmap_pv_hashlist_max, 0, ""); -SYSCTL_INT (_machdep_pmap, OID_AUTO, kernel_text_ps, CTLFLAG_RD | CTLFLAG_KERN, &pmap_kernel_text_ps, 0, ""); +SYSCTL_QUAD (_machdep_pmap, OID_AUTO, hashwalks, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &pmap_pv_hashlist_walks, ""); +SYSCTL_QUAD (_machdep_pmap, OID_AUTO, hashcnts, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &pmap_pv_hashlist_cnts, ""); +SYSCTL_INT (_machdep_pmap, OID_AUTO, hashmax, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &pmap_pv_hashlist_max, 0, ""); +SYSCTL_INT (_machdep_pmap, OID_AUTO, kernel_text_ps, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &pmap_kernel_text_ps, 0, ""); +SYSCTL_INT (_machdep_pmap, OID_AUTO, kern_pv_reserve, CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, &pv_hashed_kern_low_water_mark, 0, ""); SYSCTL_NODE(_machdep, OID_AUTO, memmap, CTLFLAG_RD|CTLFLAG_LOCKED, NULL, "physical memory map"); @@ -621,9 +672,15 @@ SYSCTL_QUAD(_machdep_memmap, OID_AUTO, Other, CTLFLAG_RD|CTLFLAG_LOCKED, &firmwa SYSCTL_NODE(_machdep, OID_AUTO, tsc, CTLFLAG_RD|CTLFLAG_LOCKED, NULL, "Timestamp counter parameters"); SYSCTL_QUAD(_machdep_tsc, OID_AUTO, frequency, CTLFLAG_RD|CTLFLAG_LOCKED, &tscFreq, ""); + SYSCTL_NODE(_machdep, OID_AUTO, misc, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Miscellaneous x86 kernel parameters"); +SYSCTL_PROC(_machdep_misc, OID_AUTO, panic_restart_timeout, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, + panic_set_restart_timeout, "I", "Panic restart timeout in seconds"); + SYSCTL_PROC(_machdep_misc, OID_AUTO, interrupt_latency_max, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, misc_interrupt_latency_max, "A", "Maximum Interrupt latency"); diff --git a/bsd/dev/i386/systemcalls.c b/bsd/dev/i386/systemcalls.c index 660d0d1aa..7a849ca31 100644 --- a/bsd/dev/i386/systemcalls.c +++ b/bsd/dev/i386/systemcalls.c @@ -34,6 +34,7 @@ #include <kern/debug.h> #include <mach/machine/thread_status.h> #include <mach/thread_act.h> +#include <mach/branch_predicates.h> #include <sys/kernel.h> #include <sys/vm.h> @@ -54,6 +55,8 @@ #include <i386/machine_routines.h> #include <mach/i386/syscall_sw.h> +#include <machine/pal_routines.h> + #if CONFIG_DTRACE extern int32_t dtrace_systrace_syscall(struct proc *, void *, int *); extern void dtrace_systrace_syscall_return(unsigned short, int, int *); @@ -69,6 +72,15 @@ extern boolean_t x86_sysenter_arg_store_isvalid(thread_t thread); /* dynamically generated at build time based on syscalls.master */ extern const char *syscallnames[]; +/* + * This needs to be a single switch so that it's "all on" or "all off", + * rather than being turned on for some code paths and not others, as this + * has a tendency to introduce "blame the next guy" bugs. 
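+ *
+ * (With a single switch, every syscall exit path can use the pattern
+ * the code below adopts, namely:
+ *
+ *	#if FUNNEL_DEBUG
+ *		syscall_exit_funnelcheck();
+ *	#endif
+ *
+ * so the funnel-held check is compiled in everywhere or nowhere.)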
+ */ +#if DEBUG +#define FUNNEL_DEBUG 1 /* Check for funnel held on exit */ +#endif + /* * Function: unix_syscall * @@ -90,6 +102,7 @@ unix_syscall(x86_saved_state_t *state) struct uthread *uthread; x86_saved_state32_t *regs; boolean_t args_in_uthread; + boolean_t is_vfork; assert(is_saved_state32(state)); regs = saved_state32(state); @@ -100,15 +113,15 @@ unix_syscall(x86_saved_state_t *state) thread = current_thread(); uthread = get_bsdthread_info(thread); - /* Get the approriate proc; may be different from task's for vfork() */ - if (!(uthread->uu_flag & UT_VFORK)) - p = (struct proc *)get_bsdtask_info(current_task()); - else + is_vfork = uthread->uu_flag & UT_VFORK; + if (__improbable(is_vfork != 0)) p = current_proc(); + else + p = (struct proc *)get_bsdtask_info(current_task()); /* Verify that we are not being called from a task without a proc */ - if (p == NULL) { + if (__improbable(p == NULL)) { regs->eax = EPERM; regs->efl |= EFL_CF; task_terminate_internal(current_task()); @@ -126,7 +139,7 @@ unix_syscall(x86_saved_state_t *state) callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; - if (callp == sysent) { + if (__improbable(callp == sysent)) { code = fuword(params); params += sizeof(int); callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; @@ -151,7 +164,7 @@ unix_syscall(x86_saved_state_t *state) } } - if (code != 180) { + if (__probable(code != 180)) { int *ip = (int *)vt; KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, @@ -191,9 +204,6 @@ unix_syscall(x86_saved_state_t *state) AUDIT_SYSCALL_ENTER(code, p, uthread); error = (*(callp->sy_call))((void *) p, (void *) vt, &(uthread->uu_rval[0])); AUDIT_SYSCALL_EXIT(code, p, uthread, error); -#if CONFIG_MACF - mac_thread_userret(code, error, thread); -#endif #ifdef JOE_DEBUG if (uthread->uu_iocount) @@ -203,7 +213,7 @@ unix_syscall(x86_saved_state_t *state) uthread->t_dtrace_errno = error; #endif /* CONFIG_DTRACE */ - if (error == ERESTART) { + if (__improbable(error == ERESTART)) { /* * Move the user's pc back to repeat the syscall: * 5 bytes for a sysenter, or 2 for an int 8x. 
@@ -211,14 +221,10 @@ unix_syscall(x86_saved_state_t *state) * - see debug trap handler in idt.s/idt64.s */ - if (regs->cs == SYSENTER_CS || regs->cs == SYSENTER_TF_CS) { - regs->eip -= 5; - } - else - regs->eip -= 2; + pal_syscall_restart(thread, state); } else if (error != EJUSTRETURN) { - if (error) { + if (__improbable(error)) { regs->eax = error; regs->efl |= EFL_CF; /* carry bit */ } else { /* (not error) */ @@ -232,13 +238,14 @@ unix_syscall(x86_saved_state_t *state) error, regs->eax, regs->edx); uthread->uu_flag &= ~UT_NOTCANCELPT; -#if DEBUG +#if FUNNEL_DEBUG /* * if we're holding the funnel panic */ syscall_exit_funnelcheck(); -#endif /* DEBUG */ - if (uthread->uu_lowpri_window) { +#endif /* FUNNEL_DEBUG */ + + if (__improbable(uthread->uu_lowpri_window)) { /* * task is marked as a low priority I/O type * and the I/O we issued while in this system call @@ -248,10 +255,13 @@ unix_syscall(x86_saved_state_t *state) */ throttle_lowpri_io(TRUE); } - if (code != 180) + if (__probable(code != 180)) KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0); + if (__improbable(!is_vfork && callp->sy_call == (sy_call_t *)execve && !error)) { + pal_execve_return(thread); + } thread_exception_return(); /* NOTREACHED */ @@ -273,21 +283,21 @@ unix_syscall64(x86_saved_state_t *state) assert(is_saved_state64(state)); regs = saved_state64(state); - +#if DEBUG if (regs->rax == 0x2000800) thread_exception_return(); - +#endif thread = current_thread(); uthread = get_bsdthread_info(thread); /* Get the approriate proc; may be different from task's for vfork() */ - if (!(uthread->uu_flag & UT_VFORK)) + if (__probable(!(uthread->uu_flag & UT_VFORK))) p = (struct proc *)get_bsdtask_info(current_task()); else p = current_proc(); /* Verify that we are not being called from a task without a proc */ - if (p == NULL) { + if (__improbable(p == NULL)) { regs->rax = EPERM; regs->isf.rflags |= EFL_CF; task_terminate_internal(current_task()); @@ -303,7 +313,7 @@ unix_syscall64(x86_saved_state_t *state) callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; uargp = (void *)(®s->rdi); - if (callp == sysent) { + if (__improbable(callp == sysent)) { /* * indirect system call... system call number * passed as 'arg0' @@ -323,7 +333,7 @@ unix_syscall64(x86_saved_state_t *state) } assert(callp->sy_narg <= 8); - if (callp->sy_narg > args_in_regs) { + if (__improbable(callp->sy_narg > args_in_regs)) { int copyin_count; copyin_count = (callp->sy_narg - args_in_regs) * sizeof(uint64_t); @@ -339,7 +349,7 @@ unix_syscall64(x86_saved_state_t *state) /* * XXX Turn 64 bit unsafe calls into nosys() */ - if (callp->sy_flags & UNSAFE_64BIT) { + if (__improbable(callp->sy_flags & UNSAFE_64BIT)) { callp = &sysent[63]; goto unsafe; } @@ -360,25 +370,34 @@ unsafe: uthread->uu_flag |= UT_NOTCANCELPT; +#ifdef JOE_DEBUG + uthread->uu_iocount = 0; + uthread->uu_vpindex = 0; +#endif AUDIT_SYSCALL_ENTER(code, p, uthread); error = (*(callp->sy_call))((void *) p, uargp, &(uthread->uu_rval[0])); AUDIT_SYSCALL_EXIT(code, p, uthread, error); +#ifdef JOE_DEBUG + if (uthread->uu_iocount) + printf("system call returned with uu_iocount != 0\n"); +#endif + #if CONFIG_DTRACE uthread->t_dtrace_errno = error; #endif /* CONFIG_DTRACE */ - if (error == ERESTART) { + if (__improbable(error == ERESTART)) { /* * all system calls come through via the syscall instruction * in 64 bit mode... 
its 2 bytes in length * move the user's pc back to repeat the syscall: */ - regs->isf.rip -= 2; + pal_syscall_restart( thread, state ); } else if (error != EJUSTRETURN) { - if (error) { + if (__improbable(error)) { regs->rax = error; regs->isf.rflags |= EFL_CF; /* carry bit */ } else { /* (not error) */ @@ -416,12 +435,14 @@ unsafe: uthread->uu_flag &= ~UT_NOTCANCELPT; +#if FUNNEL_DEBUG /* * if we're holding the funnel panic */ syscall_exit_funnelcheck(); +#endif /* FUNNEL_DEBUG */ - if (uthread->uu_lowpri_window) { + if (__improbable(uthread->uu_lowpri_window)) { /* * task is marked as a low priority I/O type * and the I/O we issued while in this system call @@ -431,7 +452,7 @@ unsafe: */ throttle_lowpri_io(TRUE); } - if (code != 180) + if (__probable(code != 180)) KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0); @@ -453,6 +474,7 @@ unix_syscall_return(int error) thread = current_thread(); uthread = get_bsdthread_info(thread); + pal_register_cache_state(thread, DIRTY); p = current_proc(); @@ -480,11 +502,9 @@ unix_syscall_return(int error) if (error == ERESTART) { /* - * all system calls come through via the syscall instruction - * in 64 bit mode... its 2 bytes in length - * move the user's pc back to repeat the syscall: + * repeat the syscall */ - regs->isf.rip -= 2; + pal_syscall_restart( thread, find_user_regs(thread) ); } else if (error != EJUSTRETURN) { if (error) { @@ -542,7 +562,7 @@ unix_syscall_return(int error) code = fuword(params); } if (error == ERESTART) { - regs->eip -= ((regs->cs & 0xffff) == SYSENTER_CS) ? 5 : 2; + pal_syscall_restart( thread, find_user_regs(thread) ); } else if (error != EJUSTRETURN) { if (error) { @@ -561,10 +581,12 @@ unix_syscall_return(int error) uthread->uu_flag &= ~UT_NOTCANCELPT; +#if FUNNEL_DEBUG /* * if we're holding the funnel panic */ syscall_exit_funnelcheck(); +#endif /* FUNNEL_DEBUG */ if (uthread->uu_lowpri_window) { /* diff --git a/bsd/dev/i386/unix_signal.c b/bsd/dev/i386/unix_signal.c index 06ed4172c..4292d6515 100644 --- a/bsd/dev/i386/unix_signal.c +++ b/bsd/dev/i386/unix_signal.c @@ -54,12 +54,12 @@ #include <i386/machine_routines.h> #include <i386/seg.h> -#include <sys/kdebug.h> +#include <machine/pal_routines.h> +#include <sys/kdebug.h> #include <sys/sdt.h> - /* Forward: */ extern boolean_t machine_exception(int, mach_exception_code_t, mach_exception_subcode_t, int *, mach_exception_subcode_t *); @@ -610,6 +610,8 @@ sendsig(struct proc *p, user_addr_t ua_catcher, int sig, int mask, __unused uint goto bad; ml_fp_setvalid(FALSE); + /* Tell the PAL layer about the signal */ + pal_set_signal_delivery( thread ); proc_lock(p); diff --git a/bsd/dev/memdev.c b/bsd/dev/memdev.c index fe07f5e53..c425c7e08 100644 --- a/bsd/dev/memdev.c +++ b/bsd/dev/memdev.c @@ -175,7 +175,7 @@ int mdevBMajor = -1; int mdevCMajor = -1; static int mdevioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p, int is_char); -dev_t mdevadd(int devid, ppnum_t base, unsigned int size, int phys); +dev_t mdevadd(int devid, uint64_t base, unsigned int size, int phys); dev_t mdevlookup(int devid); void mdevremoveall(void); @@ -543,7 +543,7 @@ char *cvtnum(char *pos, char *end, unsigned int *num) { /* Convert to a number #endif /* CONFIG_MEMDEV_INSECURE */ -dev_t mdevadd(int devid, ppnum_t base, unsigned int size, int phys) { +dev_t mdevadd(int devid, uint64_t base, unsigned int size, int phys) { int i; @@ -556,7 +556,7 @@ dev_t mdevadd(int devid, ppnum_t base, unsigned int 
size, int phys) { continue; /* Skip check */ } if(!(((base + size -1 ) < mdev[i].mdBase) || ((mdev[i].mdBase + mdev[i].mdSize - 1) < base))) { /* Is there any overlap? */ - panic("mdevadd: attempt to add overlapping memory device at %08lX-%08lX\n", (long) mdev[i].mdBase, (long) mdev[i].mdBase + mdev[i].mdSize - 1); + panic("mdevadd: attempt to add overlapping memory device at %016llX-%016llX\n", mdev[i].mdBase, mdev[i].mdBase + mdev[i].mdSize - 1); } } if(devid < 0) { /* Do we have free slots? */ @@ -567,7 +567,7 @@ dev_t mdevadd(int devid, ppnum_t base, unsigned int size, int phys) { if(devid >= 16) { /* Giving us something bogus? */ panic("mdevadd: attempt to explicitly add a bogus memory device: %08X\n", devid); } - if(mdev[devid].mdFlags &mdInited) { /* Already there? */ + if(mdev[devid].mdFlags & mdInited) { /* Already there? */ panic("mdevadd: attempt to explicitly add a previously defined memory device: %08X\n", devid); } } @@ -611,8 +611,8 @@ dev_t mdevadd(int devid, ppnum_t base, unsigned int size, int phys) { mdev[devid].mdSecsize = DEV_BSIZE; /* Set starting block size */ if(phys) mdev[devid].mdFlags |= mdPhys; /* Show that we are in physical memory */ mdev[devid].mdFlags |= mdInited; /* Show we are all set up */ - printf("Added memory device md%x/rmd%x (%08X/%08X) at %08X for %08X\n", - devid, devid, mdev[devid].mdBDev, mdev[devid].mdCDev, base << 12, size << 12); + printf("Added memory device md%x/rmd%x (%08X/%08X) at %016llX for %016llX\n", + devid, devid, mdev[devid].mdBDev, mdev[devid].mdCDev, base << 12, (uint64_t)size << 12); return mdev[devid].mdBDev; } diff --git a/bsd/dev/ppc/conf.c b/bsd/dev/ppc/conf.c deleted file mode 100644 index acc9a8545..000000000 --- a/bsd/dev/ppc/conf.c +++ /dev/null @@ -1,354 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1997 by Apple Computer, Inc., all rights reserved - * Copyright (c) 1993 NeXT Computer, Inc. - * - * UNIX Device switch tables. - * - * HISTORY - * - * 30 July 1997 Umesh Vaishampayan (umeshv@apple.com) - * enabled file descriptor pseudo-device. - * 18 June 1993 ? at NeXT - * Cleaned up a lot of stuff in this file. 
- */ - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/ioctl.h> -#include <sys/tty.h> -#include <sys/conf.h> -#include <machine/cons.h> - - -struct bdevsw bdevsw[] = -{ - /* - * For block devices, every other block of 8 slots is - * reserved to NeXT. The other slots are available for - * the user. This way we can both add new entries without - * running into each other. Be sure to fill in NeXT's - * 8 reserved slots when you jump over us -- we'll do the - * same for you. - */ - - /* 0 - 7 are reserved to NeXT */ - - NO_BDEVICE, /* 0*/ - NO_BDEVICE, /* 1*/ - NO_BDEVICE, /* 2*/ - NO_BDEVICE, /* 3*/ - NO_BDEVICE, /* 4*/ - NO_BDEVICE, /* 5*/ - NO_BDEVICE, /* 6*/ - NO_BDEVICE, /* 7*/ - - /* 8 - 15 are reserved to the user */ - NO_BDEVICE, /* 8*/ - NO_BDEVICE, /* 9*/ - NO_BDEVICE, /*10*/ - NO_BDEVICE, /*11*/ - NO_BDEVICE, /*12*/ - NO_BDEVICE, /*13*/ - NO_BDEVICE, /*14*/ - NO_BDEVICE, /*15*/ - - /* 16 - 23 are reserved to NeXT */ - NO_BDEVICE, /*16*/ - NO_BDEVICE, /*17*/ - NO_BDEVICE, /*18*/ - NO_BDEVICE, /*18*/ - NO_BDEVICE, /*20*/ - NO_BDEVICE, /*21*/ - NO_BDEVICE, /*22*/ - NO_BDEVICE, /*23*/ -}; - -int nblkdev = sizeof (bdevsw) / sizeof (bdevsw[0]); - -extern struct tty *km_tty[]; - -dev_t chrtoblk(dev_t dev); -int chrtoblk_set(int cdev, int bdev); -int iskmemdev(dev_t dev); - - -/* XXX No support for linker sets, so must declare here */ -int cttyopen(dev_t dev, int flag, int mode, struct proc *p); -int cttyread(dev_t dev, struct uio *uio, int flag); -int cttywrite(dev_t dev, struct uio *uio, int flag); -int cttyioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, struct proc *p); -int cttyselect(dev_t dev, int flag, void* wql, struct proc *p); - -/* XXX bsd/dev/ppc/mem.c */ -int mmread(dev_t dev, struct uio *uio, int flag); -int mmioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p); -int mmwrite(dev_t dev, struct uio *uio, int flag); - -#define mmselect (select_fcn_t *)seltrue - -#if 1 -#ifdef NPTY -#undef NPTY -#endif /* NPTY */ -#define NPTY 32 -#else /* 1 */ -#include <pty.h> -#endif /* 1 */ -#if NPTY > 0 -extern struct tty *pt_tty[]; -extern d_open_t ptsopen; -extern d_close_t ptsclose; -extern d_read_t ptsread; -extern d_write_t ptswrite; -extern d_stop_t ptsstop; -extern d_open_t ptcopen; -extern d_close_t ptcclose; -extern d_read_t ptcread; -extern d_write_t ptcwrite; -extern d_select_t ptcselect; -extern d_ioctl_t ptyioctl; -#else -#define ptsopen eno_opcl -#define ptsclose eno_opcl -#define ptsread eno_rdwrt -#define ptswrite eno_rdwrt -#define ptsstop nulldev - -#define ptcopen eno_opcl -#define ptcclose eno_opcl -#define ptcread eno_rdwrt -#define ptcwrite eno_rdwrt -#define ptcselect eno_select -#define ptyioctl eno_ioctl -#endif - -extern d_open_t logopen; -extern d_close_t logclose; -extern d_read_t logread; -extern d_ioctl_t logioctl; -extern d_select_t logselect; - -struct cdevsw cdevsw[] = -{ - /* - * For character devices, every other block of 16 slots is - * reserved to NeXT. The other slots are available for - * the user. This way we can both add new entries without - * running into each other. Be sure to fill in NeXT's - * 16 reserved slots when you jump over us -- we'll do the - * same for you. 
- */ - - /* 0 - 15 are reserved to NeXT */ - - { - consopen, consclose, consread, conswrite, /* 0*/ - consioctl, ((stop_fcn_t *)&nulldev), - ((reset_fcn_t *)&nulldev), - 0, consselect, - eno_mmap, eno_strat, eno_getc, eno_putc, D_TTY - }, - NO_CDEVICE, /* 1*/ - { - cttyopen, ((open_close_fcn_t *)&nulldev), - cttyread, cttywrite, /* 2*/ - cttyioctl, ((stop_fcn_t *)&nulldev), - ((reset_fcn_t *)&nulldev), - 0, cttyselect, - eno_mmap, eno_strat, eno_getc, eno_putc, D_TTY - }, - { - ((open_close_fcn_t *)&nulldev), - ((open_close_fcn_t *)&nulldev), - mmread, mmwrite, /* 3*/ - mmioctl, ((stop_fcn_t *)&nulldev), - ((reset_fcn_t *)&nulldev), - 0, mmselect, - eno_mmap, eno_strat, eno_getc, eno_putc, D_DISK - }, - { - ptsopen, ptsclose, ptsread, ptswrite, /* 4*/ - ptyioctl, ptsstop, ((reset_fcn_t *)&nulldev), - pt_tty, ttselect, - eno_mmap, eno_strat, eno_getc, eno_putc, D_TTY - }, - { - ptcopen, ptcclose, ptcread, ptcwrite, /* 5*/ - ptyioctl, ((stop_fcn_t *)&nulldev), - ((reset_fcn_t *)&nulldev), - 0, ptcselect, - eno_mmap, eno_strat, eno_getc, eno_putc, D_TTY - }, - { - logopen, logclose, logread, eno_rdwrt, /* 6*/ - logioctl, eno_stop, ((reset_fcn_t *)&nulldev), - 0, logselect, - eno_mmap, eno_strat, eno_getc, eno_putc, 0 - }, - NO_CDEVICE, /* 7*/ - NO_CDEVICE, /* 8*/ - NO_CDEVICE, /* 9*/ - NO_CDEVICE, /*10*/ - NO_CDEVICE, /*11*/ - { - kmopen, kmclose, kmread, kmwrite, /*12*/ - kmioctl, ((stop_fcn_t *)&nulldev), - ((reset_fcn_t *)&nulldev), - km_tty, ttselect, - eno_mmap, eno_strat, eno_getc, eno_putc, 0 - }, - NO_CDEVICE, /*13*/ - NO_CDEVICE, /*14*/ - NO_CDEVICE, /*15*/ - - /* 16 - 31 are reserved to the user */ - NO_CDEVICE, /*16*/ - NO_CDEVICE, /*17*/ - NO_CDEVICE, /*18*/ - NO_CDEVICE, /*19*/ - NO_CDEVICE, /*20*/ - NO_CDEVICE, /*21*/ - NO_CDEVICE, /*22*/ - NO_CDEVICE, /*23*/ - NO_CDEVICE, /*24*/ - NO_CDEVICE, /*25*/ - NO_CDEVICE, /*26*/ - NO_CDEVICE, /*27*/ - NO_CDEVICE, /*28*/ - NO_CDEVICE, /*29*/ - NO_CDEVICE, /*30*/ - NO_CDEVICE, /*31*/ - - /* 32 - 47 are reserved to NeXT */ - NO_CDEVICE, /*32*/ - NO_CDEVICE, /*33*/ - NO_CDEVICE, /*34*/ - NO_CDEVICE, /*35*/ - NO_CDEVICE, /*36*/ - /* 37 used to be for nvram */ - NO_CDEVICE, /*37*/ - NO_CDEVICE, /*38*/ - NO_CDEVICE, /*39*/ - NO_CDEVICE, /*40*/ - /* 41 used to be for fd */ - NO_CDEVICE, /*41*/ - NO_CDEVICE, /*42*/ -}; -int nchrdev = sizeof (cdevsw) / sizeof (cdevsw[0]); - - -#include <sys/vnode.h> /* for VCHR and VBLK */ -/* - * return true if a disk - */ -int -isdisk(dev_t dev, int type) -{ - dev_t maj = major(dev); - - switch (type) { - case VCHR: - maj = chrtoblk(maj); - if (maj == NODEV) { - break; - } - /* FALL THROUGH */ - case VBLK: - if (bdevsw[maj].d_type == D_DISK) { - return (1); - } - break; - } - return(0); -} - -static int chrtoblktab[] = { - /* CHR*/ /* BLK*/ /* CHR*/ /* BLK*/ - /* 0 */ NODEV, /* 1 */ NODEV, - /* 2 */ NODEV, /* 3 */ NODEV, - /* 4 */ NODEV, /* 5 */ NODEV, - /* 6 */ NODEV, /* 7 */ NODEV, - /* 8 */ NODEV, /* 9 */ NODEV, - /* 10 */ NODEV, /* 11 */ NODEV, - /* 12 */ NODEV, /* 13 */ NODEV, - /* 14 */ 6, /* 15 */ NODEV, - /* 16 */ NODEV, /* 17 */ NODEV, - /* 18 */ NODEV, /* 19 */ NODEV, - /* 20 */ NODEV, /* 21 */ NODEV, - /* 22 */ NODEV, /* 23 */ NODEV, - /* 24 */ NODEV, /* 25 */ NODEV, - /* 26 */ NODEV, /* 27 */ NODEV, - /* 28 */ NODEV, /* 29 */ NODEV, - /* 30 */ NODEV, /* 31 */ NODEV, - /* 32 */ NODEV, /* 33 */ NODEV, - /* 34 */ NODEV, /* 35 */ NODEV, - /* 36 */ NODEV, /* 37 */ NODEV, - /* 38 */ NODEV, /* 39 */ NODEV, - /* 40 */ NODEV, /* 41 */ 1, - /* 42 */ NODEV, /* 43 */ NODEV, - /* 44 */ NODEV, -}; - -/* - * 
convert chr dev to blk dev - */ -dev_t -chrtoblk(dev_t dev) -{ - int blkmaj; - - if (major(dev) >= nchrdev) - return(NODEV); - blkmaj = chrtoblktab[major(dev)]; - if (blkmaj == NODEV) - return(NODEV); - return(makedev(blkmaj, minor(dev))); -} - -int -chrtoblk_set(int cdev, int bdev) -{ - if (cdev >= nchrdev) - return (NODEV); - if (bdev != NODEV && bdev >= nblkdev) - return (NODEV); - chrtoblktab[cdev] = bdev; - return 0; -} - -/* - * Returns true if dev is /dev/mem or /dev/kmem. - */ -int -iskmemdev(dev_t dev) -{ - - return (major(dev) == 3 && minor(dev) < 2); -} diff --git a/bsd/dev/ppc/cons.c b/bsd/dev/ppc/cons.c deleted file mode 100644 index 207ee03ae..000000000 --- a/bsd/dev/ppc/cons.c +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1987, 1988 NeXT, Inc. - * - * HISTORY - * 7-Jan-93 Mac Gillon (mgillon) at NeXT - * Integrated POSIX support - * - * 12-Aug-87 John Seamons (jks) at NeXT - * Ported to NeXT. - */ - -/* - * Indirect driver for console. - */ -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/conf.h> -#include <sys/ioctl.h> -#include <sys/tty.h> -#include <sys/proc.h> -#include <sys/uio.h> -#include <machine/cons.h> - -struct tty *constty; /* current console device */ - -/* - * The km driver supplied the default console device for the systems - * (usually a raw frame buffer driver, but potentially a serial driver). 
- */ -extern struct tty *km_tty[1]; - -static dev_t -cndev(void) -{ - if (constty) - return constty->t_dev; - else - return km_tty[0]->t_dev; -} - -/*ARGSUSED*/ -int -consopen(__unused dev_t dev, int flag, int devtype, struct proc *pp) -{ - dev = cndev(); - return ((*cdevsw[major(dev)].d_open)(dev, flag, devtype, pp)); -} - - -/*ARGSUSED*/ -int -consclose(__unused dev_t dev, int flag, int mode, struct proc *pp) -{ - dev = cndev(); - return ((*cdevsw[major(dev)].d_close)(dev, flag, mode, pp)); -} - - -/*ARGSUSED*/ -int -consread(__unused dev_t dev, struct uio *uio, int ioflag) -{ - dev = cndev(); - return ((*cdevsw[major(dev)].d_read)(dev, uio, ioflag)); -} - - -/*ARGSUSED*/ -int -conswrite(__unused dev_t dev, struct uio *uio, int ioflag) -{ - dev = cndev(); - return ((*cdevsw[major(dev)].d_write)(dev, uio, ioflag)); -} - - -/*ARGSUSED*/ -int -consioctl(__unused dev_t dev, u_long cmd, caddr_t addr, int flag, struct proc *p) -{ - dev = cndev(); -#if 0 - /* - * Superuser can always use this to wrest control of console - * output from the "virtual" console. - * - * XXX Unfortunately, this code doesn't do what the author thougt - * XXX it did; use of the console device, a TIOCCONS would always - * XXX disassociate the console from a virtual terminal and send - * XXX it back to the fake tty. - */ - if ((unsigned) cmd == TIOCCONS && constty) { - int error = proc_suser(p); - if (!error) { - constty = NULL; - } - return(error); - } -#endif /* 0 */ - - return ((*cdevsw[major(dev)].d_ioctl)(dev, cmd, addr, flag, p)); -} - - -/*ARGSUSED*/ -/* called with funnel held */ -int -consselect(__unused dev_t dev, int flag, void *wql, struct proc *p) -{ - dev = cndev(); - return ((*cdevsw[major(dev)].d_select)(dev, flag, wql, p)); -} diff --git a/bsd/dev/ppc/dtrace_isa.c b/bsd/dev/ppc/dtrace_isa.c deleted file mode 100644 index 21b49bdc4..000000000 --- a/bsd/dev/ppc/dtrace_isa.c +++ /dev/null @@ -1,589 +0,0 @@ -/* - * Copyright (c) 2005-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */ -#include <kern/thread.h> -#include <mach/thread_status.h> -#include <stdarg.h> -#include <string.h> -#include <sys/malloc.h> -#include <sys/time.h> -#include <sys/systm.h> -#include <sys/proc.h> -#include <sys/proc_internal.h> -#include <sys/kauth.h> -#include <sys/dtrace.h> -#include <sys/dtrace_impl.h> -#include <libkern/OSAtomic.h> -#include <kern/thread_call.h> -#include <kern/task.h> -#include <kern/sched_prim.h> -#include <miscfs/devfs/devfs.h> -#include <mach/vm_param.h> -#include <machine/cpu_capabilities.h> - -extern dtrace_id_t dtrace_probeid_error; /* special ERROR probe */ - -void -dtrace_probe_error(dtrace_state_t *state, dtrace_epid_t epid, int which, - int fltoffs, int fault, uint64_t illval) -{ - /* - * dtrace_getarg() is a lost cause on PPC. For the case of the error probe firing lets - * stash away "illval" here, and special-case retrieving it in DIF_VARIABLE_ARG. - */ - state->dts_arg_error_illval = illval; - dtrace_probe( dtrace_probeid_error, (uint64_t)(uintptr_t)state, epid, which, fltoffs, fault ); -} - -/* - * Atomicity and synchronization - */ -void -dtrace_membar_producer(void) -{ - __asm__ volatile("sync"); -} - -void -dtrace_membar_consumer(void) -{ - __asm__ volatile("isync"); -} - -/* - * Interrupt manipulation - * XXX dtrace_getipl() can be called from probe context. - */ -int -dtrace_getipl(void) -{ - return (ml_at_interrupt_context() ? 1: 0); -} - -/* - * MP coordination - */ -typedef void (*broadcastFunc) (uint32_t); - -int32_t cpu_broadcast(uint32_t *, broadcastFunc, uint32_t); /* osfmk/ppc/machine_cpu.h */ - -typedef struct xcArg { - processorid_t cpu; - dtrace_xcall_t f; - void *arg; - uint32_t waitVar; -} xcArg_t; - -static void -xcRemote( uint32_t foo ) -{ - xcArg_t *pArg = (xcArg_t *)foo; - - if ( pArg->cpu == CPU->cpu_id || pArg->cpu == DTRACE_CPUALL ) { - (pArg->f)(pArg->arg); - } - - if(!hw_atomic_sub(&(pArg->waitVar), 1)) { /* Drop the wait count */ - thread_wakeup((event_t)&(pArg->waitVar)); /* If we were the last, wake up the signaller */ - } -} - -/* - * dtrace_xcall() is not called from probe context. - */ -void -dtrace_xcall(processorid_t cpu, dtrace_xcall_t f, void *arg) -{ - xcArg_t xcArg; - - /* Talking to ourselves, are we? */ - if ( cpu == CPU->cpu_id ) { - (*f)(arg); - return; - } - - if ( cpu == DTRACE_CPUALL ) { - (*f)(arg); - } - - xcArg.cpu = cpu; - xcArg.f = f; - xcArg.arg = arg; - xcArg.waitVar = 0; - - (void)cpu_broadcast(&(xcArg.waitVar), xcRemote, (uint32_t)&xcArg); -} - -/* - * Runtime and ABI - */ -uint64_t -dtrace_getreg(struct regs *savearea, uint_t reg) -{ - ppc_saved_state_t *regs = (ppc_saved_state_t *)savearea; - uint64_t mask = (_cpu_capabilities & k64Bit) ? 0xffffffffffffffffULL : 0x00000000ffffffffULL; - - /* See osfmk/ppc/savearea.h */ - if (reg > 68) { /* beyond mmcr2 */ - DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); - return (0); - } - - switch (reg) { - /* First 38 registers are saved to 64 bits r0-r31, srr0, srr1, xer, lr, ctr, dar. 
*/ - default: - return (((uint64_t *)(&(regs->save_r0)))[reg]) & mask; - - /* Handle the 32-bit registers */ - case 38: case 39: case 40: case 41: /* cr, dsisr, exception, vrsave */ - case 42: case 43: case 44: case 45: /* vscr[4] */ - case 46: case 47: case 48: case 49: /* fpscrpad, fpscr, save_1d8[2] */ - case 50: case 51: case 52: case 53: /* save_1E0[8] */ - case 54: case 55: case 56: case 57: - case 58: case 59: case 60: case 61: /* save_pmc[8] */ - case 62: case 63: case 64: case 65: - return (uint64_t)(((unsigned int *)(&(regs->save_cr)))[reg - 38]); - - case 66: - return regs->save_mmcr0 & mask; - case 67: - return regs->save_mmcr1 & mask; - case 68: - return regs->save_mmcr2 & mask; - } -} - -#define RETURN_OFFSET 8 -#define RETURN_OFFSET64 16 -#define REGPC save_srr0 -#define REGSP save_r1 - -/* - * XXX dtrace_getustack_common() can be called from probe context. - */ -static int -dtrace_getustack_common(uint64_t *pcstack, int pcstack_limit, user_addr_t pc, - user_addr_t sp) -{ -#if 0 - volatile uint16_t *flags = - (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags; - - uintptr_t oldcontext = lwp->lwp_oldcontext; /* XXX signal stack crawl*/ - size_t s1, s2; -#endif - int ret = 0; - boolean_t is64Bit = proc_is64bit(current_proc()); - - ASSERT(pcstack == NULL || pcstack_limit > 0); - -#if 0 /* XXX signal stack crawl*/ - if (p->p_model == DATAMODEL_NATIVE) { - s1 = sizeof (struct frame) + 2 * sizeof (long); - s2 = s1 + sizeof (siginfo_t); - } else { - s1 = sizeof (struct frame32) + 3 * sizeof (int); - s2 = s1 + sizeof (siginfo32_t); - } -#endif - - while (pc != 0) { - ret++; - if (pcstack != NULL) { - *pcstack++ = (uint64_t)pc; - pcstack_limit--; - if (pcstack_limit <= 0) - break; - } - - if (sp == 0) - break; - -#if 0 /* XXX signal stack crawl*/ - if (oldcontext == sp + s1 || oldcontext == sp + s2) { - if (p->p_model == DATAMODEL_NATIVE) { - ucontext_t *ucp = (ucontext_t *)oldcontext; - greg_t *gregs = ucp->uc_mcontext.gregs; - - sp = dtrace_fulword(&gregs[REG_FP]); - pc = dtrace_fulword(&gregs[REG_PC]); - - oldcontext = dtrace_fulword(&ucp->uc_link); - } else { - ucontext32_t *ucp = (ucontext32_t *)oldcontext; - greg32_t *gregs = ucp->uc_mcontext.gregs; - - sp = dtrace_fuword32(&gregs[EBP]); - pc = dtrace_fuword32(&gregs[EIP]); - - oldcontext = dtrace_fuword32(&ucp->uc_link); - } - } - else -#endif - { - if (is64Bit) { - pc = dtrace_fuword64((sp + RETURN_OFFSET64)); - sp = dtrace_fuword64(sp); - } else { - pc = dtrace_fuword32((sp + RETURN_OFFSET)); - sp = dtrace_fuword32(sp); - } - } - } - - return (ret); -} - -void -dtrace_getupcstack(uint64_t *pcstack, int pcstack_limit) -{ - thread_t thread = current_thread(); - ppc_saved_state_t *regs; - user_addr_t pc, sp; - volatile uint16_t *flags = - (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags; - int n; - boolean_t is64Bit = proc_is64bit(current_proc()); - - if (*flags & CPU_DTRACE_FAULT) - return; - - if (pcstack_limit <= 0) - return; - - /* - * If there's no user context we still need to zero the stack. 
- */ - if (thread == NULL) - goto zero; - - regs = (ppc_saved_state_t *)find_user_regs(thread); - if (regs == NULL) - goto zero; - - *pcstack++ = (uint64_t)proc_selfpid(); - pcstack_limit--; - - if (pcstack_limit <= 0) - return; - - pc = regs->REGPC; - sp = regs->REGSP; - - if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_ENTRY)) { - *pcstack++ = (uint64_t)pc; - pcstack_limit--; - if (pcstack_limit <= 0) - return; - - pc = regs->save_lr; - } - - if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_USTACK_FP)) { - /* - * If the ustack fp flag is set, the stack frame from sp to - * fp contains no valid call information. Start with the fp. - */ - if (is64Bit) - sp = dtrace_fuword64(sp); - else - sp = (user_addr_t)dtrace_fuword32(sp); - } - - n = dtrace_getustack_common(pcstack, pcstack_limit, pc, sp); - ASSERT(n >= 0); - ASSERT(n <= pcstack_limit); - - pcstack += n; - pcstack_limit -= n; - -zero: - while (pcstack_limit-- > 0) - *pcstack++ = 0; -} - -int -dtrace_getustackdepth(void) -{ - thread_t thread = current_thread(); - ppc_saved_state_t *regs; - user_addr_t pc, sp; - int n = 0; - boolean_t is64Bit = proc_is64bit(current_proc()); - - if (thread == NULL) - return 0; - - if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT)) - return (-1); - - regs = (ppc_saved_state_t *)find_user_regs(thread); - if (regs == NULL) - return 0; - - pc = regs->REGPC; - sp = regs->REGSP; - - if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_ENTRY)) { - n++; - pc = regs->save_lr; - } - - if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_USTACK_FP)) { - /* - * If the ustack fp flag is set, the stack frame from sp to - * fp contains no valid call information. Start with the fp. - */ - if (is64Bit) - sp = dtrace_fuword64(sp); - else - sp = (user_addr_t)dtrace_fuword32(sp); - } - - n += dtrace_getustack_common(NULL, 0, pc, sp); - - return (n); -} - -void -dtrace_getufpstack(uint64_t *pcstack, uint64_t *fpstack, int pcstack_limit) -{ - thread_t thread = current_thread(); - ppc_saved_state_t *regs; - user_addr_t pc, sp; - volatile uint16_t *flags = - (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags; -#if 0 - uintptr_t oldcontext; - size_t s1, s2; -#endif - boolean_t is64Bit = proc_is64bit(current_proc()); - - if (*flags & CPU_DTRACE_FAULT) - return; - - if (pcstack_limit <= 0) - return; - - /* - * If there's no user context we still need to zero the stack. - */ - if (thread == NULL) - goto zero; - - regs = (ppc_saved_state_t *)find_user_regs(thread); - if (regs == NULL) - goto zero; - - *pcstack++ = (uint64_t)proc_selfpid(); - pcstack_limit--; - - if (pcstack_limit <= 0) - return; - - pc = regs->REGPC; - sp = regs->REGSP; - -#if 0 /* XXX signal stack crawl*/ - oldcontext = lwp->lwp_oldcontext; - - if (p->p_model == DATAMODEL_NATIVE) { - s1 = sizeof (struct frame) + 2 * sizeof (long); - s2 = s1 + sizeof (siginfo_t); - } else { - s1 = sizeof (struct frame32) + 3 * sizeof (int); - s2 = s1 + sizeof (siginfo32_t); - } -#endif - - if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_ENTRY)) { - *pcstack++ = (uint64_t)pc; - *fpstack++ = 0; - pcstack_limit--; - if (pcstack_limit <= 0) - return; - - /* - * XXX This is wrong, but we do not yet support stack helpers. 
- */ - if (is64Bit) - pc = dtrace_fuword64(sp); - else - pc = dtrace_fuword32(sp); - } - - while (pc != 0) { - *pcstack++ = (uint64_t)pc; - *fpstack++ = sp; - pcstack_limit--; - if (pcstack_limit <= 0) - break; - - if (sp == 0) - break; - -#if 0 /* XXX signal stack crawl*/ - if (oldcontext == sp + s1 || oldcontext == sp + s2) { - if (p->p_model == DATAMODEL_NATIVE) { - ucontext_t *ucp = (ucontext_t *)oldcontext; - greg_t *gregs = ucp->uc_mcontext.gregs; - - sp = dtrace_fulword(&gregs[REG_FP]); - pc = dtrace_fulword(&gregs[REG_PC]); - - oldcontext = dtrace_fulword(&ucp->uc_link); - } else { - ucontext_t *ucp = (ucontext_t *)oldcontext; - greg_t *gregs = ucp->uc_mcontext.gregs; - - sp = dtrace_fuword32(&gregs[EBP]); - pc = dtrace_fuword32(&gregs[EIP]); - - oldcontext = dtrace_fuword32(&ucp->uc_link); - } - } - else -#endif - { - if (is64Bit) { - pc = dtrace_fuword64((sp + RETURN_OFFSET64)); - sp = dtrace_fuword64(sp); - } else { - pc = dtrace_fuword32((sp + RETURN_OFFSET)); - sp = dtrace_fuword32(sp); - } - } - } - -zero: - while (pcstack_limit-- > 0) - *pcstack++ = 0; -} - -void -dtrace_getpcstack(pc_t *pcstack, int pcstack_limit, int aframes, - uint32_t *intrpc) -{ - struct frame *fp = (struct frame *)__builtin_frame_address(0); - struct frame *nextfp, *minfp, *stacktop; - int depth = 0; - int last = 0; - uintptr_t pc; - uintptr_t caller = CPU->cpu_dtrace_caller; - int on_intr; - - if ((on_intr = CPU_ON_INTR(CPU)) != 0) - stacktop = (struct frame *)dtrace_get_cpu_int_stack_top(); - else - stacktop = (struct frame *)(dtrace_get_kernel_stack(current_thread()) + kernel_stack_size); - - minfp = fp; - - aframes++; - - if (intrpc != NULL && depth < pcstack_limit) - pcstack[depth++] = (pc_t)intrpc; - - while (depth < pcstack_limit) { - nextfp = *(struct frame **)fp; - pc = *(uintptr_t *)(((uintptr_t)fp) + RETURN_OFFSET); - - if (nextfp <= minfp || nextfp >= stacktop) { - if (on_intr) { - /* - * Hop from interrupt stack to thread stack. - */ - vm_offset_t kstack_base = dtrace_get_kernel_stack(current_thread()); - - minfp = (struct frame *)kstack_base; - stacktop = (struct frame *)(kstack_base + kernel_stack_size); - - on_intr = 0; - continue; - } - /* - * This is the last frame we can process; indicate - * that we should return after processing this frame. - */ - last = 1; - } - - if (aframes > 0) { - if (--aframes == 0 && caller != 0) { - /* - * We've just run out of artificial frames, - * and we have a valid caller -- fill it in - * now. - */ - ASSERT(depth < pcstack_limit); - pcstack[depth++] = (pc_t)caller; - caller = 0; - } - } else { - if (depth < pcstack_limit) - pcstack[depth++] = (pc_t)pc; - } - - if (last) { - while (depth < pcstack_limit) - pcstack[depth++] = 0; - return; - } - - fp = nextfp; - minfp = fp; - } -} - -uint64_t -dtrace_getarg(int arg, int aframes) -{ -#pragma unused(arg,aframes) - return 0xfeedfacedeafbeadLL; /* XXX Only called for arg >= 5 */ -} - -/* - * Load/Store Safety - */ - -void -dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit)) -{ - /* - * "base" is the smallest toxic address in the range, "limit" is the first - * VALID address greater than "base". 
- */ - func(0x0, VM_MIN_KERNEL_ADDRESS); - if (VM_MAX_KERNEL_ADDRESS < ~(uintptr_t)0) - func(VM_MAX_KERNEL_ADDRESS + 1, ~(uintptr_t)0); -} - -extern void *mapping_phys_lookup(ppnum_t, unsigned int *); - diff --git a/bsd/dev/ppc/dtrace_subr_ppc.c b/bsd/dev/ppc/dtrace_subr_ppc.c deleted file mode 100644 index 5040a9183..000000000 --- a/bsd/dev/ppc/dtrace_subr_ppc.c +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (c) 2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * #pragma ident "@(#)dtrace_subr.c 1.12 05/06/08 SMI" - */ - -#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */ -#include <sys/dtrace.h> -#include <sys/dtrace_glue.h> -#include <sys/dtrace_impl.h> -#include <sys/fasttrap.h> -#include <sys/vm.h> -#include <sys/user.h> -#include <sys/kauth.h> -#include <kern/debug.h> - -int (*dtrace_pid_probe_ptr)(ppc_saved_state_t *); -int (*dtrace_return_probe_ptr)(ppc_saved_state_t *); -kern_return_t dtrace_user_probe(ppc_saved_state_t *sv); - -kern_return_t -dtrace_user_probe(ppc_saved_state_t *sv) -{ - - lck_rw_t *rwp; - struct proc *p = current_proc(); - - uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread()); - /* - * DTrace accesses t_cred in probe context. 
t_cred - * must always be either NULL, or point to a valid, - * allocated cred structure. - */ - kauth_cred_uthread_update(uthread, p); - - if (sv->save_exception == T_DTRACE_RET) { - -/* - * T_DTRACE_RET is generated by the kernel when an emulation sequence - * ends. Unlike the x86 implementation, this can not be caused by - * a user state trap instruction. It is a system error if it occurs - * when not stepping and is, therefore, a panickable offence. - */ - - if(uthread->t_dtrace_step == 0) { /* Are we supposed to be tracing? */ - panic("dtrace_user_probe: T_DTRACE_RET when not stepping\n"); - } - - if (uthread->t_dtrace_ast) { - printf("dtrace_user_probe() should be calling aston()\n"); - // aston(uthread); - // uthread->t_sig_check = 1; - } - - /* - * Clear all user tracing flags. - */ - uthread->t_dtrace_ft = 0; - - /* - * We need to wait until after we've called the - * dtrace_return_probe_ptr function pointer to step the pc. - */ - rwp = &CPU->cpu_ft_lock; - lck_rw_lock_shared(rwp); - - if (dtrace_return_probe_ptr != NULL) (void)(*dtrace_return_probe_ptr)(sv); - lck_rw_unlock_shared(rwp); - - sv->save_srr0 = sv->save_srr0 + 4; /* Step to next instruction */ - if(!(sv->save_srr1 & 0x8000000000000000ULL)) sv->save_srr0 &= 0x00000000FFFFFFFF; /* Trim if in 32-bit mode */ - - return KERN_SUCCESS; - - } else { - -/* - * We have taken our normal trap to get here. Make sure we expect it - */ - uint32_t instr; - rwp = &CPU->cpu_ft_lock; - - /* - * The DTrace fasttrap provider uses a trap, "twi 31,r31,0xDDDD". - * We will only be here if dtrace (or someone pretending to be us) - * sets the trap. - * We let DTrace take the first crack at handling - * this trap; if it's not a probe that DTrace knowns about, - * we call into the trap() routine to handle it like a - * breakpoint placed by a conventional debugger. - */ - - /* - * APPLE NOTE: I believe the purpose of the reader/writers lock - * is thus: There are times which dtrace needs to prevent calling - * dtrace_pid_probe_ptr(). Sun's original impl grabbed a plain - * mutex here. However, that serialized all probe calls, and - * destroyed MP behavior. So now they use a RW lock, with probes - * as readers, and the top level synchronization as a writer. - */ - lck_rw_lock_shared(rwp); - if (dtrace_pid_probe_ptr != NULL && - (*dtrace_pid_probe_ptr)(sv) == 0) { - lck_rw_unlock_shared(rwp); - return KERN_SUCCESS; - } - lck_rw_unlock_shared(rwp); - - /* - * If the instruction that caused the breakpoint trap doesn't - * look like our trap anymore, it may be that this tracepoint - * was removed just after the user thread executed it. In - * that case, return to user land to retry the instuction. - * - * Note that the PC is correct because we do not advance it until after emulation. - */ - if (fuword32(sv->save_srr0, &instr) == 0 && instr != FASTTRAP_INSTR) { - return KERN_SUCCESS; - } - - } - -/* - * If we get here, we go back to throw an exception - */ - - return KERN_FAILURE; -} - -void -dtrace_safe_synchronous_signal(void) -{ -// This is commented out of the x86 code and is never called. -} - -int -dtrace_safe_defer_signal(void) -{ -// This is commented out of the x86 code and is never called. - return 0; -} diff --git a/bsd/dev/ppc/fasttrap_isa.c b/bsd/dev/ppc/fasttrap_isa.c deleted file mode 100644 index 10e2edd08..000000000 --- a/bsd/dev/ppc/fasttrap_isa.c +++ /dev/null @@ -1,734 +0,0 @@ -/* - * Copyright (c) 2007 Apple Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* - * #pragma ident "@(#)fasttrap_isa.c 1.27 08/04/09 SMI" - */ - -#ifdef KERNEL -#ifndef _KERNEL -#define _KERNEL /* Solaris vs. Darwin */ -#endif -#endif - -#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */ -#include <sys/fasttrap_isa.h> -#include <sys/fasttrap_impl.h> -#include <sys/dtrace.h> -#include <sys/dtrace_impl.h> -#include <sys/dtrace_ptss.h> -#include <kern/debug.h> -#include <ppc/decodePPC.h> -#include <kern/task.h> -#include <mach/vm_param.h> -#include <mach/mach_vm.h> -#include <mach/task.h> -#include <vm/pmap.h> -#include <vm/vm_map.h> /* All the bits we care about are guarded by MACH_KERNEL_PRIVATE :-( */ -extern dtrace_id_t dtrace_probeid_error; - -/* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. 
*/ -#define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */ - -static int32_t branchtaken(int32_t bo, int32_t bi, ppc_saved_state_t *sv); -static int32_t dtrace_decode_ppc(uint32_t inst); -int patchInst(task_t task, addr64_t vaddr, uint32_t inst); -kern_return_t dtrace_user_probe(ppc_saved_state_t *sv); - -/* - * Lossless User-Land Tracing on PPC - * --------------------------------- - * - * PPC uses a different technique to emulate user-land instruction replaces by a probe - * trap than x86. - * - * Like x86, it will emulate all forms of branch instructions. We will not attempt - * to emulate any instruction that we know will cause an interruption or exception - * (system call, trap, privileged instruction, instruction that uses a privileged - * register). - * - * NOTE: I am thinking that we should punish tight loopers, e.g., branch-to-dot. - * Depending upon clock resolution and how fast we can process these guys, it is - * possible that its quantum will never decrease. Maybe we could just manually - * end the guy's quantum and let the next guy go... - * - * When fasttrap_tracepoint_init is called, we fetch the instruction and decode it. - * If we don't recognize it or find it is a "banned" instruction, we return -1, - * telling our caller to forget it. Otherwise we save the instruction image and - * enough of the decode to quickly handle it at probe time. We cram it into - * the fasttrap_machtp_t structure. - * - * When the probe hits, we verify that the PC is still a probe point and if not, - * we bail. Otherwise we have a bit more to do. - * - * If DTFTP_ENTRY is set, we have an entry probe and need to call dtrace_probe. - * - * If DTFTP_IS_ENABLED is set, all we need to do is to return a 1. - * - * If ftp_argmap is NULL, we call dtrace_probe - * - * Otherwise, we figure out what the arguments are and pass them to dtrace_probe - * - * Next, we need to set up to emulate the probed instruction and here is where we are - * the most different than the x86 code. - * - * Like x86, we first check to see if the instruction is any form of branch. If so, - * we emulate it completely within the kernel and are done. - * - * If it is anything else, we build a code stream within the kernel to execute the - * instruction. Note that this is very different from x86 which build the code in - * userland. - * - * The generated stream needs to be executed within the kernel's code space but with - * the user address space and registers. Because PPC allows different translation modes - * for instruction fetch and data fetch, this is not too difficult. - * - * There are two kinds streams needed: execute and continue, and execute and return, - * which are used for entry/offset and exit probes respectivily. - * - * The probe code will copy the instruction image into the current user savearea (which - * also contains the complete user state register context). A flag that requests either - * execute/continue or execute/return is also set in the savearea. - * - * We now exit the dtrace code and the marked context makes its way back to the point - * where it will be dispatched on the processor. - * - * The exception return code will start to restore the user context, including registers - * and address space. However, before dispatching the user, it will notice that the - * emulate flags are set. At this point the code will build a code stream - * in an area in the per_proc that consists of - * the original instruction followed by a trap instruction. 
It will set the new MSR (in - * SRR1) to have address translation enable for data, translation disabled for instruction - * fetches, interruptions disabled, and supervisor state. - * - * The new PC and MSR are loaded via a RFID and the generated stream is executed. If a - * synchronous fault occurs, it is either handled (PTE miss, FPU or vector unavailable), - * emulated (alignment or denorm), or passed on to the user. - * - * Assuming the emulated instruction completes, the trap will execute. When that happens, - * low-level trap handler will check its flags. If the trap corresponds to an - * execute/continue stream, the trap handler will adjust the PC and complete the - * transition into user space. - * - * If the trap corresponds to an execute/return stream, the handler will generate - * a T_DTRACE_RET exception and let the trap handler pass it along to dtrace_user_probe. - * - */ - - -static uint64_t -fasttrap_anarg(ppc_saved_state_t *sv, int function_entry, int argno) -{ -#pragma unused(function_entry) - uint32_t farg; - uint64_t value; - - /* The first 8 arguments (argno 0-7) are in registers */ - if (argno < 8) { - value = (&sv->save_r3)[argno]; - } else { - if (sv->save_srr1 & 0x8000000000000000ULL) { - /* 64-bit */ - /* Grab argument >= 8 from stack */ - fasttrap_fuword64_noerr(sv->save_r1 + 48 + ((argno)* sizeof(uint64_t)), &value); - } else { - /* 32-bit */ - /* Grab argument >= 8 from stack */ - fasttrap_fuword32_noerr(sv->save_r1 + 24 + ((argno) * sizeof(uint32_t)), &farg); - value = (uint64_t)farg; - } - } - - return (value); -} - -/*ARGSUSED*/ -int -fasttrap_tracepoint_init(proc_t *p, fasttrap_tracepoint_t *tp, user_addr_t pc, - fasttrap_probe_type_t type) -{ -#pragma unused(type) - - uint32_t instr, testr1, testr2, testr3; - user_addr_t targpc; - int32_t target, optype; - - /* - * Read the instruction at the given address out of the process's - * address space. We don't have to worry about a debugger - * changing this instruction before we overwrite it with our trap - * instruction since P_PR_LOCK is set. Since instructions can span - * pages, we potentially read the instruction in two parts. If the - * second part fails, we just zero out that part of the instruction. - */ - /* - * APPLE NOTE: Of course, we do not have a P_PR_LOCK, so this is racey... - */ - - if (uread(p, &instr, 4, pc) != 0) return (-1); /* Grab instruction, return suddenly if read fails... */ - - optype = dtrace_decode_ppc(instr); /* See if we have an instruction we can probe */ - - tp->ftt_instr = instr; /* Save the instruction image */ - testr1 = tp->ftt_bo = (uint8_t)((instr >> (31 - 10)) & 0x1F); /* Extract branch options */ - testr2 = tp->ftt_bi = (uint8_t)((instr >> (31 - 15)) & 0x1F); /* Extract condition register bit */ - testr3 = (instr >> (31 - 20)) & 0x1F; /* Get that last register */ - tp->ftt_flgs = (uint8_t)(instr & 3); /* Set the absolute address and link flags */ - - switch(optype) { /* Do instruction specific decode */ - - case diCMN: /* Common instruction */ - tp->ftt_type = ftmtCommon; /* Mark as common instruction */ - break; - - case diINV: /* Invalid */ - case diTRP: /* Trap */ - case diSC: /* System Call */ - case diRFI: /* Return from interrupt */ - case diPRV: /* Priviliged instruction */ - return (-1); /* We will not emulate these... 
*/ - break; - - case diB: /* Branch */ - tp->ftt_type = ftmtB; /* Mark as branch instruction */ - target = instr & 0x03FFFFFC; /* Extract address or offset */ - if(target & 0x02000000) target |= 0xFC000000; /* Sign extend */ - tp->ftt_trgt = target; /* Trim back down and save */ - - targpc = (user_addr_t)((int64_t)target); /* Generate a target address, hopefully we sign extend... */ - if(!(tp->ftt_flgs & ftmtAbs)) { /* Are we dealing with an offset here? */ - targpc = targpc + pc; /* Apply offset to get target address */ - } - - if(targpc == pc) return -1; /* Branching to self is a sin and is forbidden... */ - break; - - case diBC: /* Branch conditional */ - tp->ftt_type = ftmtBC; /* Mark as branch conditional */ - target = instr & 0x0000FFFC; /* Extract address or offset */ - if(target & 0x00008000) target |= 0xFFFF0000; /* Sign extend */ - tp->ftt_trgt = target; /* Trim back down and save */ - - targpc = (user_addr_t)((int64_t)target); /* Generate a target address, hopefully we sign extend... */ - if(!(tp->ftt_flgs & ftmtAbs)) { /* Are we dealing with an offset here? */ - targpc = targpc + pc; /* Apply offset to get target address */ - } - - if(targpc == pc) return -1; /* Branching to self is a sin and is forbidden... */ - break; - - case diBLR: /* Branch conditional to link register */ - tp->ftt_type = ftmtBLR; /* Mark as branch conditional to link register */ - break; - - case diBCTR: /* Branch conditional to count register */ - tp->ftt_type = ftmtBCTR; /* Mark as branch conditional to count register */ - break; - - case diOR: /* OR */ - if((instr >> 26) == 24) { /* Is this the ORI nop? */ - if((testr1 == testr2) && ((instr & 0x0000FFFF) == 0)) tp->ftt_type = ftmtNOP; /* Remember if this is a NOP instruction */ - else tp->ftt_type = ftmtCommon; /* Otherwise it is a common ORI instruction */ - } - else if((testr1 == testr2) && (testr1 == testr3)) tp->ftt_type = ftmtNOP; /* If all three registers are the same, this is a NOP */ - else tp->ftt_type = ftmtCommon; /* Otherwise it is a common OR instruction */ - - break; - - default: - panic("fasttrap_tracepoint_init: invalid branch decode, inst = %08X, optype = %d\n", instr, optype); - break; - - } - - return (0); -} - -int -fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp) -{ - return patchInst(p->task, tp->ftt_pc, FASTTRAP_INSTR); /* Patch the instruction and flush it */ -} - -extern void dbgTrace(uint32_t, uint32_t, uint32_t, uint32_t, uint32_t); - -int -fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp) -{ - uint32_t instr; - - /* - * Distinguish between read or write failures and a changed - * instruction. - */ - if (uread(p, &instr, 4, tp->ftt_pc) != 0) return (0); /* Get the instruction, but exit if not mapped */ - -// dbgTrace(0x99999999, (uint32_t)tp->ftt_pc, tp->ftt_instr, instr, 0); /* (TRACE/DEBUG) */ - - if (instr != FASTTRAP_INSTR) return (0); /* Did someone change it? 
If so, just leave */ - - return patchInst(p->task, tp->ftt_pc, tp->ftt_instr); /* Patch the old instruction back in and flush it */ -} - -static void -fasttrap_return_common(ppc_saved_state_t *sv, user_addr_t pc, pid_t pid, user_addr_t new_pc) -{ - - fasttrap_tracepoint_t *tp; - fasttrap_bucket_t *bucket; - fasttrap_id_t *id; - lck_mtx_t *pid_mtx; - - pid_mtx = &cpu_core[CPU->cpu_id].cpuc_pid_lock; - lck_mtx_lock(pid_mtx); - bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)]; - - for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) { - if (pid == tp->ftt_pid && pc == tp->ftt_pc && - tp->ftt_proc->ftpc_acount != 0) - break; - } - - /* - * Don't sweat it if we can't find the tracepoint again. Unlike - * when we're in fasttrap_pid_probe(), finding the tracepoint here - * is not essential to the correct execution of the process. - */ - if (tp == NULL) { - lck_mtx_unlock(pid_mtx); - return; - } - - for (id = tp->ftt_retids; id != NULL; id = id->fti_next) { - /* - * If there's a branch that could act as a return site, we - * need to trace it, and check here if the program counter is - * external to the function. - */ - if((new_pc - id->fti_probe->ftp_faddr) < id->fti_probe->ftp_fsize) /* Is target within the function? */ - continue; /* Yeah, skip this one... */ - - DTRACE_CPUFLAG_SET(CPU_DTRACE_USTACK_FP); - if (ISSET(current_proc()->p_lflag, P_LNOATTACH)) { - dtrace_probe(dtrace_probeid_error, 0 /* state */, - id->fti_probe->ftp_id, 1 /* ndx */, -1 /* offset */, - DTRACEFLT_UPRIV); - } else { - dtrace_probe(id->fti_probe->ftp_id, - pc - id->fti_probe->ftp_faddr, - sv->save_r3, sv->save_r4, 0, 0); - } - DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_USTACK_FP); - } - - lck_mtx_unlock(pid_mtx); -} - -static void -fasttrap_usdt_args(fasttrap_probe_t *probe, ppc_saved_state_t *sv, int argc, - uint64_t *argv) -{ - int i, x, cap = MIN(argc, probe->ftp_nargs); - uint32_t farg; - - for (i = 0; i < cap; i++) { - x = probe->ftp_argmap[i]; - - if (x <= 8) { /* Is this argument in a register? */ - argv[i] = (&sv->save_r0)[x]; - } else { - if(sv->save_srr1 & 0x8000000000000000ULL) { /* Are we running in 64-bit? */ - fasttrap_fuword64_noerr(sv->save_r1 + 48 + (x * sizeof(uint64_t)), &argv[i]); /* Grab argument > 8 from stack */ - } - else { - fasttrap_fuword32_noerr(sv->save_r1 + 24 + (x * sizeof(uint32_t)), &farg); /* Grab argument > 8 from stack */ - argv[i] = (uint64_t)farg; /* Convert to 64-bit */ - } - } - } - - for (; i < argc; i++) { - argv[i] = 0; - } -} - -int -fasttrap_pid_probe(ppc_saved_state_t *sv) -{ - proc_t *p = current_proc(); - fasttrap_bucket_t *bucket; - lck_mtx_t *pid_mtx; - fasttrap_tracepoint_t *tp, tp_local; - pid_t pid; - dtrace_icookie_t cookie; - uint_t is_enabled = 0; - user_addr_t new_pc = 0; - user_addr_t pc; - user_addr_t addrmask; - - pc = sv->save_srr0; /* Remember the PC for later */ - if(sv->save_srr1 & 0x8000000000000000ULL) addrmask = 0xFFFFFFFFFFFFFFFFULL; /* Set 64-bit addressing if enabled */ - else addrmask = 0x00000000FFFFFFFFULL; /* Otherwise set 32-bit */ - - uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread()); - - /* - * Clear all user tracing flags. - */ - uthread->t_dtrace_ft = 0; - - /* - * Treat a child created by a call to vfork(2) as if it were its - * parent. We know that there's only one thread of control in such a - * process: this one. - */ - /* - * APPLE NOTE: Terry says: "You need to hold the process locks (currently: kernel funnel) for this traversal" - * FIXME: How do we assert this? 
- */ - while (p->p_lflag & P_LINVFORK) p = p->p_pptr; /* Search the end */ - - pid = p->p_pid; - pid_mtx = &cpu_core[CPU->cpu_id].cpuc_pid_lock; - lck_mtx_lock(pid_mtx); - bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, sv->save_srr0)]; /* Get the bucket that corresponds to out PC */ - - /* - * Lookup the tracepoint that the process just hit. - */ - for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) { - if (pid == tp->ftt_pid && (sv->save_srr0 == tp->ftt_pc) && - tp->ftt_proc->ftpc_acount != 0) - break; - } - - /* - * If we couldn't find a matching tracepoint, either a tracepoint has - * been inserted without using the pid<pid> ioctl interface (see - * fasttrap_ioctl), or somehow we have mislaid this tracepoint. - */ - if (tp == NULL) { - lck_mtx_unlock(pid_mtx); - return (-1); - } - - if (tp->ftt_ids != NULL) { - fasttrap_id_t *id; - - for (id = tp->ftt_ids; id != NULL; id = id->fti_next) { - fasttrap_probe_t *probe = id->fti_probe; - - if (ISSET(current_proc()->p_lflag, P_LNOATTACH)) { - dtrace_probe(dtrace_probeid_error, 0 /* state */, - id->fti_probe->ftp_id, 1 /* ndx */, -1 /* offset */, - DTRACEFLT_UPRIV); - } else if (id->fti_ptype == DTFTP_ENTRY) { - /* - * We note that this was an entry - * probe to help ustack() find the - * first caller. - */ - cookie = dtrace_interrupt_disable(); - DTRACE_CPUFLAG_SET(CPU_DTRACE_USTACK_FP | CPU_DTRACE_ENTRY); - dtrace_probe(probe->ftp_id, sv->save_r3, sv->save_r4, /* Call the main probe routine with the first 5 args */ - sv->save_r5, sv->save_r6, sv->save_r7); - DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_USTACK_FP | CPU_DTRACE_ENTRY); - dtrace_interrupt_enable(cookie); - - } else if (id->fti_ptype == DTFTP_IS_ENABLED) { - /* - * Note that in this case, we don't - * call dtrace_probe() since it's only - * an artificial probe meant to change - * the flow of control so that it - * encounters the true probe. - */ - is_enabled = 1; - - } else if (probe->ftp_argmap == NULL) { - DTRACE_CPUFLAG_SET(CPU_DTRACE_USTACK_FP); - dtrace_probe(probe->ftp_id, sv->save_r3, sv->save_r4, /* Call the main probe routine with the first 5 args */ - sv->save_r5, sv->save_r6, sv->save_r7); - DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_USTACK_FP); - - } else { - uint64_t t[5]; - - fasttrap_usdt_args(probe, sv, 5, t); /* Grab 5 arguments */ - - DTRACE_CPUFLAG_SET(CPU_DTRACE_USTACK_FP); - dtrace_probe(probe->ftp_id, t[0], t[1], - t[2], t[3], t[4]); - DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_USTACK_FP); - } - - /* APPLE NOTE: Oneshot probes get one and only one chance... */ - if (probe->ftp_prov->ftp_provider_type == DTFTP_PROVIDER_ONESHOT) { - fasttrap_tracepoint_remove(p, tp); - } - } - } - - /* - * We're about to do a bunch of work so we cache a local copy of - * the tracepoint to emulate the instruction, and then find the - * tracepoint again later if we need to light up any return probes. - */ - tp_local = *tp; - lck_mtx_unlock(pid_mtx); - tp = &tp_local; - - /* - * If there's an is-enabled probe connected to this tracepoint it - * means that there was a 'xor r3,r3,r3' - * instruction that was placed there by DTrace when the binary was - * linked. As this probe is, in fact, enabled, we need to stuff 1 - * into R3. Accordingly, we can bypass all the instruction - * emulation logic since we know the inevitable result. It's possible - * that a user could construct a scenario where the 'is-enabled' - * probe was on some other instruction, but that would be a rather - * exotic way to shoot oneself in the foot. 
- */ - if (is_enabled) { - sv->save_r3 = 1; /* Set condition to true */ - new_pc = (sv->save_srr0 + 4) & addrmask; /* Just fall through to the next instruction */ - goto done; - } - - /* - * We emulate certain types of instructions to ensure correctness - * (in the case of position dependent instructions) or optimize - * common cases. The rest we execute in the kernel, but with - * most of the user's context active. - */ - switch (tp->ftt_type) { - - case ftmtNOP: /* NOP */ - new_pc = (sv->save_srr0 + 4) & addrmask; /* Just fall through to the next instruction */ - break; - - case ftmtB: /* Plain unconditional branch */ - new_pc = (user_addr_t)((int64_t)tp->ftt_trgt); /* Assume target is absolute address for the moment */ - if(!(tp->ftt_flgs & ftmtAbs)) new_pc = (new_pc + sv->save_srr0) & addrmask; /* We don't have absolute address, use as offset from instruction address */ - - if(tp->ftt_flgs & ftmtLink) sv->save_lr = (sv->save_srr0 + 4) & addrmask; /* Set the LR to the next instruction if needed */ - break; - - case ftmtBC: /* Conditional PC relative or absolute branch */ - new_pc = (user_addr_t)((int64_t)tp->ftt_trgt); /* Assume target is absolute address for the moment */ - if(!(tp->ftt_flgs & ftmtAbs)) new_pc = new_pc + sv->save_srr0; /* We don't have absolute address, use as offset from instruction address */ - - if(tp->ftt_flgs & ftmtLink) sv->save_lr = (sv->save_srr0 + 4) & addrmask; /* Set the LR to the next instruction if needed */ - if(!branchtaken(tp->ftt_bo, tp->ftt_bi, sv)) new_pc = (sv->save_srr0 + 4) & addrmask; /* If branch was not taken, set PC to next address */ - break; - - case ftmtBLR: /* Conditional branch to LR */ - new_pc = sv->save_lr; /* Branch target comes from the LR */ - - if(tp->ftt_flgs & ftmtLink) sv->save_lr = (sv->save_srr0 + 4) & addrmask; /* Set the LR to the next instruction if needed */ - if(!branchtaken(tp->ftt_bo, tp->ftt_bi, sv)) new_pc = (sv->save_srr0 + 4) & addrmask; /* If branch was not taken, set PC to next address */ - break; - - case ftmtBCTR: /* Conditional branch to CTR */ - new_pc = sv->save_ctr; /* Branch target comes from the CTR */ - - if(tp->ftt_flgs & ftmtLink) sv->save_lr = (sv->save_srr0 + 4) & addrmask; /* Set the LR to the next instruction if needed */ - if(!branchtaken(tp->ftt_bo, tp->ftt_bi, sv)) new_pc = (sv->save_srr0 + 4) & addrmask; /* If branch was not taken, set PC to next address */ - break; - - case ftmtCommon: /* Common, non-in-kernel emulated instruction */ - sv->save_instr[0] = 1; /* We only have one instruction to inject */ - sv->save_instr[1] = tp->ftt_instr; /* Set the instruction */ - sv->save_hdr.save_flags = sv->save_hdr.save_flags | SAVinject; /* Tell low-level exception return to inject the instruction */ - uthread->t_dtrace_step = 1; /* Let it be known that a trace return is imminent */ - return 0; /* Go and don't dome back until you are done... */ - - default: - panic("fasttrap_pid_probe: invalid ftt_type = %08X\n", tp->ftt_type); /* Huh, wha happened? */ - break; - } - - -done: - - /* - * If there were no return probes when we first found the tracepoint, - * we should feel no obligation to honor any return probes that were - * subsequently enabled -- they'll just have to wait until the next - * time around. 
- */ - sv->save_srr0 = new_pc; /* Set the new PC */ - if (tp->ftt_retids != NULL) fasttrap_return_common(sv, pc, pid, new_pc); - - return (0); -} - - -int -fasttrap_return_probe(ppc_saved_state_t *sv) -{ - - user_addr_t pc, npc; - - proc_t *p = current_proc(); - - - /* - * Treat a child created by a call to vfork(2) as if it were its - * parent. We know that there's only one thread of control in such a - * process: this one. - */ - /* - * APPLE NOTE: Terry says: "You need to hold the process locks (currently: kernel funnel) for this traversal" - * How do we assert this? - */ - while (p->p_lflag & P_LINVFORK) { - p = p->p_pptr; - } - - pc = sv->save_srr0; /* Get the PC of the probed instruction */ - npc = pc + 4; /* Get next PC */ - if(!(sv->save_srr1 & 0x8000000000000000ULL)) npc &= 0x00000000FFFFFFFF; /* Wrap new PC if running 32-bit */ - fasttrap_return_common(sv, pc, p->p_pid, npc); - - return (0); -} - -uint64_t -fasttrap_pid_getarg(void *arg, dtrace_id_t id, void *parg, int argno, - int aframes) -{ -#pragma unused(arg, id, parg, aframes) - return (fasttrap_anarg((ppc_saved_state_t *)find_user_regs(current_thread()), 1, argno)); -} - -uint64_t -fasttrap_usdt_getarg(void *arg, dtrace_id_t id, void *parg, int argno, - int aframes) -{ -#pragma unused(arg, id, parg, aframes) - return (fasttrap_anarg((ppc_saved_state_t *)find_user_regs(current_thread()), 0, argno)); -} - - -static int32_t branchtaken(int32_t bo, int32_t bi, ppc_saved_state_t *sv) { - int32_t bcond, czero, crmatch; - uint64_t ctr; - - if((bo & 0x14) == 0x14) return 1; /* If this is a branch always, exit with true... */ - - czero = 0; /* Assume that we have not just decremented the CTR to 0 */ - - if(!(bo & 4)) { /* Skip the next bit if we do NOT muck with the CTR */ - ctr = sv->save_ctr = sv->save_ctr - 1; /* Decrement the CTR */ - if(!(sv->save_srr1 & 0x8000000000000000ULL)) ctr &= 0x00000000FFFFFFFF; /* Only look at the bottom 32 bits if 32-bit mode */ - czero = (ctr == 0); /* Remember if we just hit zero */ - } - - bcond = (bo >> 3); /* If 1, branch if CR flag is 1. If 0, branch if 0 */ - crmatch = bo >> 4; /* If bo[0] is set, do not check CR flag */ - crmatch = crmatch | (((sv->save_cr >> (31 - bi)) ^ bcond) ^ 1); /* Low bit is now set if CR flag matches or CR is not checked. Other bits are trash. */ - -// dbgTrace(0x77777777, bo, bi, sv->save_cr, ((czero | crmatch) & 1)); /* (TRACE/DEBUG) */ - - return ((czero | crmatch) & 1); /* Return 1 if branch taken, 0 if not... */ -} - -static int32_t dtrace_decode_ppc(uint32_t inst) { - - int32_t curdcd, lastmask, newmask, spr, bit, bito, word; - uint16_t xop = 0; - dcdtab *dcd; - - curdcd = inst >> 26; /* Isolate major op code to start decode */ - lastmask = 99; /* Always force a new xop at the start */ - - while(1) { /* Loop until we find instruction or fail */ - dcd = &insts[curdcd]; /* Point to the current decode table entry */ - if(dcd->dcdFlgs & dcdJump) { /* Should we jump to a new spot in the decode table? */ - curdcd = dcd->dcdMatch; /* Jump */ - continue; - } - - newmask = dcd->dcdFlgs & dcdMask; /* Isolate the mask index */ - if(lastmask != newmask) { /* Are we changing masks? */ - if(!newmask) break; /* If the mask is 0, we match everything and succeed... (note: lastmask can never be 0) */ - xop = inst & masktab[newmask]; /* Clear all extra bits to make match */ - lastmask = newmask; /* Remember */ - } - - if(xop == dcd->dcdMatch) break; /* We found our guy! 
*/ - - if(!(dcd->dcdFlgs & dcdStep)) { /* No stepping, we failed */ - dcd = &dcdfail; /* Point to a failure entry */ - break; /* Leave... */ - } - - curdcd = curdcd + 1; /* Step to the next decode entry */ - } - - if(dcd->dcdType != diSPR) return (int32_t)(dcd->dcdType); /* Return what we found */ - - spr = (inst >> (31 - 20)) & 0x3FF; /* Get the source */ - spr = ((spr << 5) & 0x3E0) | ((spr >> 5) & 0x1F); /* Flip to right order */ - - word = spr >> 5; /* Get word index into table */ - bito = spr & 0x1F; /* Get bit offset into entry */ - bit = 0x80000000 >> bito; /* Position bit for a test */ - - if(!(sprtbl[word] & bit)) return (diINV); /* Bogus SPR so whole instruction is invalid... */ - - if(spr & 0x10) return (diPRV); /* This is a priviliged SPR so instruction is priviliged... */ - return (diCMN); /* Just a common SPR so instruction is the same... */ -} diff --git a/bsd/dev/ppc/fbt_ppc.c b/bsd/dev/ppc/fbt_ppc.c deleted file mode 100644 index 0a505d23e..000000000 --- a/bsd/dev/ppc/fbt_ppc.c +++ /dev/null @@ -1,694 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* #pragma ident "@(#)fbt.c 1.15 05/09/19 SMI" */ - -#ifdef KERNEL -#ifndef _KERNEL -#define _KERNEL /* Solaris vs. Darwin */ -#endif -#endif - -#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */ -#include <kern/cpu_data.h> -#include <kern/thread.h> -#include <mach/thread_status.h> - -#include <mach-o/loader.h> -#include <mach-o/nlist.h> - -extern struct mach_header _mh_execute_header; /* the kernel's mach header */ - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/errno.h> -#include <sys/stat.h> -#include <sys/ioctl.h> -#include <sys/conf.h> -#include <sys/fcntl.h> -#include <miscfs/devfs/devfs.h> - -#include <sys/dtrace.h> -#include <sys/dtrace_impl.h> -#include <sys/fbt.h> - -#include <sys/dtrace_glue.h> -#include <machine/cpu_capabilities.h> - -#define DTRACE_INVOP_NOP_SKIP 4 - -#define DTRACE_INVOP_MFLR_R0 11 -#define DTRACE_INVOP_MFLR_R0_SKIP 4 - -#define FBT_MFLR_R0 0x7c0802a6 - -#define FBT_MTLR_R0 0x7c0803a6 -#define FBT_BLR 0x4e800020 -#define FBT_BCTR 0x4e800420 - -#define FBT_LI_MASK 0x03fffffc -#define FBT_JUMP 0x48000000 -#define IS_JUMP(instr) (((instr) & ~FBT_LI_MASK) == FBT_JUMP) /* Relative, No LR update -- AA == 0b, LK == 0b */ -#define FBT_LI_EXTD64(instr) \ - (((instr) & 0x02000000) ? 
\ - (((uint64_t)((instr) & FBT_LI_MASK)) | 0xfffffffffc000000ULL) : \ - ((uint64_t)((instr) & FBT_LI_MASK))) - -#define FBT_PATCHVAL 0x7c810808 -#define FBT_AFRAMES_ENTRY 6 -#define FBT_AFRAMES_RETURN 6 - -#define FBT_ENTRY "entry" -#define FBT_RETURN "return" -#define FBT_ADDR2NDX(addr) ((((uintptr_t)(addr)) >> 4) & fbt_probetab_mask) - -extern dtrace_provider_id_t fbt_id; -extern fbt_probe_t **fbt_probetab; -extern int fbt_probetab_mask; - -kern_return_t fbt_perfCallback(int, ppc_saved_state_t *, int, int); -kern_return_t fbt_perfIntCallback(int, ppc_saved_state_t *, int, int); - -/* - * Critical routines that must not be probed. PR_5221096, PR_5379018. - */ - -static const char * critical_blacklist[] = -{ - "bcopy_phys", - "bcopy_physvir_32", - "cpu_control", - "cpu_exit_wait", - "cpu_info", - "cpu_info_count", - "cpu_init", - "cpu_machine_init", - "cpu_per_proc_alloc", - "cpu_per_proc_free", - "cpu_signal_handler", - "cpu_sleep", - "cpu_start", - "cpu_subtype", - "cpu_threadtype", - "cpu_to_processor", - "cpu_type", - "mapSkipListVerifyC", - "ml_nofault_copy", - "register_cpu_setup_func", - "unregister_cpu_setup_func" -}; -#define CRITICAL_BLACKLIST_COUNT (sizeof(critical_blacklist)/sizeof(critical_blacklist[0])) - -/* - * The transitive closure of entry points that can be reached from probe context. - * (Apart from routines whose names begin with dtrace_). - */ -static const char * probe_ctx_closure[] = -{ - "Debugger", - "MapUserMemoryWindow", - "OSCompareAndSwap", - "absolutetime_to_microtime", - "bcopy", - "clock_get_calendar_nanotime_nowait", - "copyin", - "copyinstr", - "copyout", - "copyoutstr", - "cpu_number", - "current_proc", - "current_processor", - "current_task", - "current_thread", - "debug_enter", - "find_user_regs", - "getPerProc", - "get_bsdtask_info", - "get_bsdthread_info", - "get_threadtask", - "hw_atomic_and", - "hw_compare_and_store", - "hw_find_map", - "kauth_cred_get", - "kauth_getgid", - "kauth_getuid", - "mach_absolute_time", - "mapping_drop_busy", - "mapping_find", - "mapping_phys_lookup", - "max_valid_stack_address", - "ml_at_interrupt_context", - "ml_phys_write_byte_64", - "ml_phys_write_half_64", - "ml_phys_write_word_64", - "ml_set_interrupts_enabled", - "panic", - "pmap_find_phys", - "prf", - "proc_is64bit", - "proc_selfname", - "proc_selfpid", - "proc_selfppid", - "psignal_lock", - "sdt_getargdesc", - "splhigh", - "splx", - "strlcpy", - "systrace_stub", - "timer_grab" -}; -#define PROBE_CTX_CLOSURE_COUNT (sizeof(probe_ctx_closure)/sizeof(probe_ctx_closure[0])) - -static int _cmp(const void *a, const void *b) -{ - return strncmp((const char *)a, *(const char **)b, strlen((const char *)a) + 1); -} - -static const void * bsearch( - register const void *key, - const void *base0, - size_t nmemb, - register size_t size, - register int (*compar)(const void *, const void *)) { - - register const char *base = base0; - register size_t lim; - register int cmp; - register const void *p; - - for (lim = nmemb; lim != 0; lim >>= 1) { - p = base + (lim >> 1) * size; - cmp = (*compar)(key, p); - if (cmp == 0) - return p; - if (cmp > 0) { /* key > p: move right */ - base = (const char *)p + size; - lim--; - } /* else move left */ - } - return (NULL); -} - -int -fbt_invop(uintptr_t addr, uintptr_t *stack, uintptr_t rval) -{ - fbt_probe_t *fbt = fbt_probetab[FBT_ADDR2NDX(addr)]; - uint64_t mask = (_cpu_capabilities & k64Bit) ? 
0xffffffffffffffffULL : 0x00000000ffffffffULL; - - for (; fbt != NULL; fbt = fbt->fbtp_hashnext) { - if ((uintptr_t)fbt->fbtp_patchpoint == addr) { - - if (fbt->fbtp_roffset == 0) { - ppc_saved_state_t *regs = (ppc_saved_state_t *)stack; - - CPU->cpu_dtrace_caller = regs->save_lr; - - dtrace_probe(fbt->fbtp_id, regs->save_r3 & mask, regs->save_r4 & mask, - regs->save_r5 & mask, regs->save_r6 & mask, regs->save_r7 & mask); - - CPU->cpu_dtrace_caller = (uintptr_t)NULL; - } else { - - dtrace_probe(fbt->fbtp_id, fbt->fbtp_roffset, rval, 0, 0, 0); - - if (fbt->fbtp_rval == DTRACE_INVOP_TAILJUMP) { - ppc_saved_state_t *regs = (ppc_saved_state_t *)stack; - - regs->save_srr0 = (uint64_t)fbt->fbtp_patchpoint + FBT_LI_EXTD64(fbt->fbtp_savedval); - regs->save_srr0 &= mask; - } - - CPU->cpu_dtrace_caller = (uintptr_t)NULL; - } - - return (fbt->fbtp_rval); - } - } - - return (0); -} - -#include <ppc/proc_reg.h> /* For USER_MODE */ -#define IS_USER_TRAP(regs) USER_MODE((regs)->save_srr1) -#define T_VECTOR_SIZE 4 /* function pointer size */ -#define T_PROGRAM (0x07 * T_VECTOR_SIZE) -#define FBT_EXCEPTION_CODE T_PROGRAM - -kern_return_t -fbt_perfCallback( - int trapno, - ppc_saved_state_t *regs, - int unused1, - int unused2) -{ -#pragma unused (unused1) -#pragma unused (unused2) - kern_return_t retval = KERN_FAILURE; - - if (!IS_USER_TRAP(regs) && FBT_EXCEPTION_CODE == trapno) { - boolean_t oldlevel; - - oldlevel = ml_set_interrupts_enabled(FALSE); - - switch (dtrace_invop( regs->save_srr0, (uintptr_t *)regs, regs->save_r3 )) { - case DTRACE_INVOP_NOP: - regs->save_srr0 += DTRACE_INVOP_NOP_SKIP; /* Skip over the bytes of the patched NOP */ - retval = KERN_SUCCESS; - break; - - case DTRACE_INVOP_MFLR_R0: - regs->save_r0 = regs->save_lr; /* Emulate patched mflr r0 */ - regs->save_srr0 += DTRACE_INVOP_MFLR_R0_SKIP; /* Skip over the bytes of the patched mflr r0 */ - retval = KERN_SUCCESS; - break; - - case DTRACE_INVOP_RET: - regs->save_srr0 = regs->save_lr; /* Emulate patched blr by resuming execution at the LR */ - retval = KERN_SUCCESS; - break; - - case DTRACE_INVOP_BCTR: - regs->save_srr0 = regs->save_ctr; /* Emulate patched bctr by resuming execution at the CTR */ - retval = KERN_SUCCESS; - break; - - case DTRACE_INVOP_TAILJUMP: - retval = KERN_SUCCESS; - break; - - default: - retval = KERN_FAILURE; - break; - } - ml_set_interrupts_enabled(oldlevel); - } - - return retval; -} - -kern_return_t -fbt_perfIntCallback( - int trapno, - ppc_saved_state_t *regs, - int unused1, - int unused2) -{ - kern_return_t retval = KERN_FAILURE; - - if (KERN_SUCCESS == (retval = fbt_perfCallback(trapno, regs, unused1, unused2))) - enable_preemption(); - - return retval; -} - -/*ARGSUSED*/ -static void -__fbt_provide_module(void *arg, struct modctl *ctl) -{ -#pragma unused(arg) - struct mach_header *mh; - struct load_command *cmd; - struct segment_command *orig_ts = NULL, *orig_le = NULL; - struct symtab_command *orig_st = NULL; - struct nlist *sym = NULL; - char *strings; - uintptr_t instrLow, instrHigh; - char *modname; - unsigned int i; - - int gIgnoreFBTBlacklist = 0; - PE_parse_boot_argn("IgnoreFBTBlacklist", &gIgnoreFBTBlacklist, sizeof (gIgnoreFBTBlacklist)); - - mh = (struct mach_header *)(ctl->address); - modname = ctl->mod_modname; - - if (0 == ctl->address || 0 == ctl->size) /* Has the linker been jettisoned? */ - return; - - /* - * Employees of dtrace and their families are ineligible. Void - * where prohibited. 
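- * (Concretely: the two checks just below exclude dtrace's own kext and
- * anything CHUD-related from fbt instrumentation.)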
- */ - - if (LIT_STRNEQL(modname, "com.apple.driver.dtrace")) - return; - - if (strstr(modname, "CHUD") != NULL) - return; - - if (mh->magic != MH_MAGIC) - return; - - cmd = (struct load_command *) &mh[1]; - for (i = 0; i < mh->ncmds; i++) { - if (cmd->cmd == LC_SEGMENT) { - struct segment_command *orig_sg = (struct segment_command *) cmd; - - if (LIT_STRNEQL(orig_sg->segname, SEG_TEXT)) - orig_ts = orig_sg; - else if (LIT_STRNEQL(orig_sg->segname, SEG_LINKEDIT)) - orig_le = orig_sg; - else if (LIT_STRNEQL(orig_sg->segname, "")) - orig_ts = orig_sg; /* kexts have a single unnamed segment */ - } - else if (cmd->cmd == LC_SYMTAB) - orig_st = (struct symtab_command *) cmd; - - cmd = (struct load_command *) ((caddr_t) cmd + cmd->cmdsize); - } - - if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL)) - return; - - sym = (struct nlist *)(orig_le->vmaddr + orig_st->symoff - orig_le->fileoff); - strings = (char *)(orig_le->vmaddr + orig_st->stroff - orig_le->fileoff); - - /* Find extent of the TEXT section */ - instrLow = (uintptr_t)orig_ts->vmaddr; - instrHigh = (uintptr_t)(orig_ts->vmaddr + orig_ts->vmsize); - - for (i = 0; i < orig_st->nsyms; i++) { - fbt_probe_t *fbt, *retfbt; - machine_inst_t *instr, *limit, theInstr; - uint8_t n_type = sym[i].n_type & (N_TYPE | N_EXT); - char *name = strings + sym[i].n_un.n_strx; - int j; - - /* Check that the symbol is a global and that it has a name. */ - if (((N_SECT | N_EXT) != n_type && (N_ABS | N_EXT) != n_type)) - continue; - - if (0 == sym[i].n_un.n_strx) /* iff a null, "", name. */ - continue; - - /* Lop off omnipresent leading underscore. */ - if (*name == '_') - name += 1; - - if (LIT_STRNSTART(name, "dtrace_") && !LIT_STRNSTART(name, "dtrace_safe_")) { - /* - * Anything beginning with "dtrace_" may be called - * from probe context unless it explicitly indicates - * that it won't be called from probe context by - * using the prefix "dtrace_safe_". - */ - continue; - } - - if (LIT_STRNSTART(name, "fasttrap_") || - LIT_STRNSTART(name, "fuword") || - LIT_STRNSTART(name, "suword") || - LIT_STRNEQL(name, "sprlock") || - LIT_STRNEQL(name, "sprunlock") || - LIT_STRNEQL(name, "uread") || - LIT_STRNEQL(name, "uwrite")) - continue; /* Fasttrap inner-workings. */ - - if (LIT_STRNSTART(name, "dsmos_")) - continue; /* Don't Steal Mac OS X! */ - - if (LIT_STRNSTART(name, "_dtrace")) - continue; /* Shims in dtrace.c */ - - if (LIT_STRNSTART(name, "chud")) - continue; /* Professional courtesy. */ - - if (LIT_STRNSTART(name, "hibernate_")) - continue; /* Let sleeping dogs lie. */ - - if (LIT_STRNEQL(name, "_ZN9IOService14newTemperatureElPS_") || /* IOService::newTemperature */ - LIT_STRNEQL(name, "_ZN9IOService26temperatureCriticalForZoneEPS_")) /* IOService::temperatureCriticalForZone */ - continue; /* Per the fire code */ - - /* - * Place no probes (illegal instructions) in the exception handling path!
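- * (An fbt probe is planted by overwriting the target instruction with the
- * illegal opcode FBT_PATCHVAL and recovering in fbt_perfCallback's
- * program-trap handling; taking that trap from within the exception
- * path itself would recurse fatally.)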
- */ - if (LIT_STRNEQL(name, "L_handler700") || - LIT_STRNEQL(name, "save_get_phys_64") || - LIT_STRNEQL(name, "save_get_phys_32") || - LIT_STRNEQL(name, "EmulExit") || - LIT_STRNEQL(name, "Emulate") || - LIT_STRNEQL(name, "Emulate64") || - LIT_STRNEQL(name, "switchSegs") || - LIT_STRNEQL(name, "save_ret_phys")) - continue; - - if (LIT_STRNEQL(name, "thandler") || - LIT_STRNEQL(name, "versave") || - LIT_STRNEQL(name, "timer_event") || - LIT_STRNEQL(name, "hw_atomic_or") || - LIT_STRNEQL(name, "trap")) - continue; - - if (LIT_STRNEQL(name, "fbt_perfCallback") || - LIT_STRNEQL(name, "fbt_perfIntCallback") || - LIT_STRNEQL(name, "ml_set_interrupts_enabled") || - LIT_STRNEQL(name, "dtrace_invop") || - LIT_STRNEQL(name, "fbt_invop") || - LIT_STRNEQL(name, "sdt_invop") || - LIT_STRNEQL(name, "max_valid_stack_address")) - continue; - - /* - * Probes encountered while we're on the interrupt stack are routed along - * the interrupt handling path. No probes allowed there either! - */ - if (LIT_STRNEQL(name, "ihandler") || - LIT_STRNEQL(name, "interrupt") || - LIT_STRNEQL(name, "disable_preemption")) - continue; - - /* - * Avoid weird stack voodoo in and under machine_stack_handoff et al - */ - if (LIT_STRNSTART(name, "machine_stack") || - LIT_STRNEQL(name, "getPerProc") || /* Called in machine_stack_handoff with weird stack state */ - LIT_STRNEQL(name, "fpu_save") || /* Called in machine_stack_handoff with weird stack state */ - LIT_STRNEQL(name, "vec_save") || /* Called in machine_stack_handoff with weird stack state */ - LIT_STRNEQL(name, "pmap_switch")) /* Called in machine_stack_handoff with weird stack state */ - continue; - - /* - * Avoid machine_ routines. PR_5346750. - */ - if (LIT_STRNSTART(name, "machine_")) - continue; - - /* - * Avoid low level pmap and virtual machine monitor PowerPC routines. See PR_5379018. - */ - - if (LIT_STRNSTART(name, "hw_") || - LIT_STRNSTART(name, "mapping_") || - LIT_STRNSTART(name, "commpage_") || - LIT_STRNSTART(name, "pmap_") || - LIT_STRNSTART(name, "vmm_")) - continue; - /* - * Place no probes on critical routines. PR_5221096 - */ - if (!gIgnoreFBTBlacklist && - bsearch( name, critical_blacklist, CRITICAL_BLACKLIST_COUNT, sizeof(name), _cmp ) != NULL) - continue; - - /* - * Place no probes that could be hit in probe context. - */ - if (!gIgnoreFBTBlacklist && - bsearch( name, probe_ctx_closure, PROBE_CTX_CLOSURE_COUNT, sizeof(name), _cmp ) != NULL) - continue; - - /* - * Place no probes that could be hit on the way to the debugger. - */ - if (LIT_STRNSTART(name, "kdp_") || - LIT_STRNSTART(name, "kdb_") || - LIT_STRNSTART(name, "kdbg_") || - LIT_STRNSTART(name, "kdebug_") || - LIT_STRNEQL(name, "kernel_debug") || - LIT_STRNEQL(name, "Debugger") || - LIT_STRNEQL(name, "Call_DebuggerC") || - LIT_STRNEQL(name, "lock_debugger") || - LIT_STRNEQL(name, "unlock_debugger") || - LIT_STRNEQL(name, "SysChoked")) - continue; - - /* - * Place no probes that could be hit on the way to a panic. - */ - if (NULL != strstr(name, "panic_") || - LIT_STRNEQL(name, "panic") || - LIT_STRNEQL(name, "handleMck") || - LIT_STRNEQL(name, "unresolved_kernel_trap")) - continue; - - if (dtrace_probe_lookup(fbt_id, modname, name, NULL) != 0) - continue; - - /* - * Scan forward for mflr r0. - */ - for (j = 0, instr = (machine_inst_t *)sym[i].n_value, theInstr = 0; - (j < 4) && ((uintptr_t)instr >= instrLow) && (instrHigh > (uintptr_t)instr); - j++, instr++) - { - theInstr = *instr; - if (theInstr == FBT_MFLR_R0) /* Place the entry probe here. 
*/ - break; - if (theInstr == FBT_MTLR_R0) /* We've gone too far, bail. */ - break; - if (theInstr == FBT_BLR) /* We've gone too far, bail. */ - break; - } - - if (theInstr != FBT_MFLR_R0) - continue; - - limit = (machine_inst_t *)instrHigh; - - fbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); - strlcpy( (char *)&(fbt->fbtp_name), name, MAX_FBTP_NAME_CHARS ); - fbt->fbtp_id = dtrace_probe_create(fbt_id, modname, name, FBT_ENTRY, FBT_AFRAMES_ENTRY, fbt); - fbt->fbtp_patchpoint = instr; - fbt->fbtp_ctl = ctl; - fbt->fbtp_loadcnt = ctl->mod_loadcnt; - fbt->fbtp_rval = DTRACE_INVOP_MFLR_R0; - fbt->fbtp_savedval = theInstr; - fbt->fbtp_patchval = FBT_PATCHVAL; - - fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)]; - fbt->fbtp_symndx = i; - fbt_probetab[FBT_ADDR2NDX(instr)] = fbt; - - instr++; /* Move on down the line */ - retfbt = NULL; -again: - if (instr >= limit) - continue; - - /* - * We (desperately) want to avoid erroneously instrumenting a - * jump table. To determine if we're looking at a true instruction - * or an inline jump table that happens to contain the same - * byte sequences, we resort to some heuristic sleaze: we - * treat this instruction as being contained within a pointer, - * and see if that pointer points to within the body of the - * function. If it does, we refuse to instrument it. - */ - { - machine_inst_t *ptr = *(machine_inst_t **)instr; - - if (ptr >= (machine_inst_t *)sym[i].n_value && ptr < limit) { - instr++; - goto again; - } - } - - /* - * OK, it's an instruction. - */ - theInstr = *instr; - - /* Walked onto the start of the next routine? If so, bail out from this function. */ - if (theInstr == FBT_MFLR_R0) - continue; - - if (theInstr != FBT_MTLR_R0) { - instr++; - goto again; - } - - /* - * Found mtlr r0; - * Scan forward for a blr, bctr, or a jump (relative, no LR change). - */ - instr++; - for (j = 0; (j < 12) && (instr < limit); j++, instr++) { - theInstr = *instr; - if (theInstr == FBT_BLR || theInstr == FBT_BCTR || IS_JUMP(theInstr) || - theInstr == FBT_MFLR_R0 || theInstr == FBT_MTLR_R0) - break; - } - - if (!(theInstr == FBT_BLR || theInstr == FBT_BCTR || IS_JUMP(theInstr))) - goto again; - - /* - * We have a winner: "mtlr r0; ... ; {blr, bctr, j}" !
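- *
- * Schematically, the function shape that has now been matched is:
- *
- *     mflr  r0                 <- entry probe patch point, found earlier
- *     ...
- *     mtlr  r0
- *     ...                      <- at most 12 intervening instructions
- *     blr / bctr / b <target>  <- return probe patch point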
- */ - fbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP); - strlcpy( (char *)&(fbt->fbtp_name), name, MAX_FBTP_NAME_CHARS ); - - if (retfbt == NULL) { - fbt->fbtp_id = dtrace_probe_create(fbt_id, modname, - name, FBT_RETURN, FBT_AFRAMES_RETURN, fbt); - } else { - retfbt->fbtp_next = fbt; - fbt->fbtp_id = retfbt->fbtp_id; - } - - retfbt = fbt; - fbt->fbtp_patchpoint = instr; - fbt->fbtp_ctl = ctl; - fbt->fbtp_loadcnt = ctl->mod_loadcnt; - - if (theInstr == FBT_BLR) - fbt->fbtp_rval = DTRACE_INVOP_RET; - else if (theInstr == FBT_BCTR) - fbt->fbtp_rval = DTRACE_INVOP_BCTR; - else - fbt->fbtp_rval = DTRACE_INVOP_TAILJUMP; - - fbt->fbtp_roffset = - (uintptr_t)((uint8_t *)instr - (uint8_t *)sym[i].n_value); - - fbt->fbtp_savedval = *instr; - fbt->fbtp_patchval = FBT_PATCHVAL; - fbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)]; - fbt->fbtp_symndx = i; - fbt_probetab[FBT_ADDR2NDX(instr)] = fbt; - instr++; - goto again; - } -} - -extern struct modctl g_fbt_kernctl; -#undef kmem_alloc /* from its binding to dt_kmem_alloc glue */ -#undef kmem_free /* from its binding to dt_kmem_free glue */ -#include <vm/vm_kern.h> - -/*ARGSUSED*/ -void -fbt_provide_module(void *arg, struct modctl *ctl) -{ -#pragma unused(ctl) - __fbt_provide_module(arg, &g_fbt_kernctl); - - if ( (vm_offset_t)g_fbt_kernctl.address != (vm_offset_t )NULL ) - kmem_free(kernel_map, (vm_offset_t)g_fbt_kernctl.address, round_page(g_fbt_kernctl.size)); - g_fbt_kernctl.address = 0; - g_fbt_kernctl.size = 0; -} diff --git a/bsd/dev/ppc/ffs.c b/bsd/dev/ppc/ffs.c deleted file mode 100644 index c3f06a74f..000000000 --- a/bsd/dev/ppc/ffs.c +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1991 NeXT Computer, Inc. All rights reserved. - * - * File: machdep/i386/libc/ffs.c - * Author: Bruce Martin, NeXT Computer, Inc. - * - * This file contains machine dependent code for the ffs function - * on NeXT i386-based products. Currently tuned for the i486. - * - * HISTORY - * 27-Sep-92 Bruce Martin (Bruce_Martin@NeXT.COM) - * Created: stolen from Mike's code. 
- */ - -unsigned -ffs(unsigned mask) -{ - unsigned bitpos; - - if (mask == 0) - return 0; - - bitpos = 1; - while ((mask & 0xff) == 0) { - bitpos += 8; - mask >>= 8; - } - while ((mask & 1) == 0) { - bitpos += 1; - mask >>= 1; - } - return bitpos; -} diff --git a/bsd/dev/ppc/ffs.s b/bsd/dev/ppc/ffs.s deleted file mode 100644 index 290053a82..000000000 --- a/bsd/dev/ppc/ffs.s +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1992, 1997-1998 Apple Computer, Inc. All rights reserved. - * - * File: machdep/ppc/libc/ffs.s - * - * int ffs(int value) - * - * DESCRIPTION - * The ffs() function finds the first bit set in value and returns the - * index of that bit. Bits are numbered starting from 1, starting at - * the right-most bit. A return value of 0 means that the argument was - * zero. - * - * HISTORY - * 14-Aug-1998 Umesh Vaishampayan (umeshv@apple.com) - * Optimized! - * - * 10-Mar-1998 Matt Watson (mwatson@apple.com) - * Correctified - * - * 19-Jan-1998 Matt Watson (mwatson@apple.com) - * Simplified - * - * 24-Jan-1997 Umesh Vaishampayan (umeshv@NeXT.com) - * Ported to PPC. - */ - -.text -.align 4 -.globl _ffs -_ffs: /* Cycles */ - neg r0,r3 /* 0 */ - and r3,r0,r3 /* 1 */ - li r4, 32 /* 1 */ - cntlzw r3,r3 /* 2 */ - subf r3,r3,r4 /* 3 */ - blr - - .globl _abs -_abs: - srawi r0,r3,31 - xor r3,r0,r3 - subf r3,r0,r3 - blr - diff --git a/bsd/dev/ppc/kern_machdep.c b/bsd/dev/ppc/kern_machdep.c deleted file mode 100644 index 1f45bd131..000000000 --- a/bsd/dev/ppc/kern_machdep.c +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License.
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (C) 1990, 1993 NeXT, Inc. - * Copyright (C) 1997 Apple Computer, Inc. - * - * File: next/kern_machdep.c - * Author: John Seamons - * - * Machine-specific kernel routines. - */ - -#include <sys/types.h> -#include <sys/param.h> -#include <mach/machine.h> -#include <mach/boolean.h> -#include <mach/vm_param.h> -#include <kern/cpu_number.h> -#include <machine/exec.h> - -boolean_t kernacc(off_t, size_t ); - - -/* - * Routine: grade_binary() - * - * Function: - * Return a relative preference for exectypes and execsubtypes in fat - * executable files. The higher the grade, the higher the preference. - * A grade of 0 means not acceptable. - * - * Note: We really don't care about the real cpu_type() here, - * because machines can only have one type. - */ -int -grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype) -{ - int cpusubtype = cpu_subtype(); - - /* - * This code should match cpusubtype_findbestarch() in best_arch.c - * in the cctools project. As of 2/16/98 this is what has been - * agreed upon for the PowerPC subtypes. If an exact match is not - * found the subtype will be picked from the following order: - * 970(but only on 970), 7450, 7400, 750, ALL - * Note the 601 is NOT in the list above. It is only picked via - * an exact match. For details see Radar 2213821. - */ - - switch (cpusubtype) { - case CPU_SUBTYPE_POWERPC_970: - switch(exectype) { - case CPU_TYPE_POWERPC64: /* CPU_IS64BIT | CPU_POWERPC */ - switch(execsubtype) { - /* - * Prefer 64 bit architecture specific binaries; note - * that this value does not mean the same thing here - * as it does below. - */ - case CPU_SUBTYPE_POWERPC_970: - return 8; - /* Prefer generic binaries */ - case CPU_SUBTYPE_POWERPC_ALL: - return 7; - default: - return 0; - } - /* NOTREACHED */ - - case CPU_TYPE_POWERPC: - switch(execsubtype) { - /* - * Prefer 32 bit binaries with 64 bit leaf functions; - * this is actually bogus use of the subtype to encode - * CPU feature bits. 
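- * (Note the resulting ladder: on a 970 the 64-bit slice always wins,
- * since its grades of 8 and 7 outrank every 32-bit grade below.)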
- */ - case CPU_SUBTYPE_POWERPC_970: - return 6; - case CPU_SUBTYPE_POWERPC_7450: - return 4; - case CPU_SUBTYPE_POWERPC_7400: - return 3; - case CPU_SUBTYPE_POWERPC_750: - return 2; - case CPU_SUBTYPE_POWERPC_ALL: - return 1; - default: - return 0; - } - /* NOTREACHED */ - - default: - return 0; - } - /* NOTREACHED */ - - case CPU_SUBTYPE_POWERPC_7450: - switch(exectype) { - case CPU_TYPE_POWERPC64: /* CPU_IS64BIT | CPU_POWERPC */ - return 0; - - case CPU_TYPE_POWERPC: - switch(execsubtype) { - case CPU_SUBTYPE_POWERPC_7450: - return 6; - case CPU_SUBTYPE_POWERPC_7400: - return 4; - case CPU_SUBTYPE_POWERPC_750: - return 3; - case CPU_SUBTYPE_POWERPC_ALL: - return 1; - default: - return 0; - } - /* NOTREACHED */ - - default: - return 0; - } - /* NOTREACHED */ - - case CPU_SUBTYPE_POWERPC_7400: - switch(exectype) { - case CPU_TYPE_POWERPC64: /* CPU_IS64BIT | CPU_POWERPC */ - return 0; - - case CPU_TYPE_POWERPC: - switch(execsubtype) { - case CPU_SUBTYPE_POWERPC_7400: - return 6; - case CPU_SUBTYPE_POWERPC_7450: - return 4; - case CPU_SUBTYPE_POWERPC_750: - return 3; - case CPU_SUBTYPE_POWERPC_ALL: - return 1; - default: - return 0; - } - /* NOTREACHED */ - - default: - return 0; - } - /* NOTREACHED */ - - case CPU_SUBTYPE_POWERPC_750: - switch(exectype) { - case CPU_TYPE_POWERPC64: /* CPU_IS64BIT | CPU_POWERPC */ - return 0; - - case CPU_TYPE_POWERPC: - switch(execsubtype) { - case CPU_SUBTYPE_POWERPC_750: - return 6; -#ifndef ADDRESS_RADAR_2678019 - /* - * Currently implemented because dropping this would - * turn the executable subtype into a "has Altivec" - * flag, which we do not want to permit. It could - * also break working third party applications - * already in use in the field. - */ - case CPU_SUBTYPE_POWERPC_7400: - return 4; - case CPU_SUBTYPE_POWERPC_7450: - return 3; -#endif /* ADDRESS_RADAR_2678019 */ - case CPU_SUBTYPE_POWERPC_ALL: - return 1; - default: - return 0; - } - /* NOTREACHED */ - - default: - return 0; - } - /* NOTREACHED */ - - default: - switch(exectype) { - case CPU_TYPE_POWERPC64: /* CPU_IS64BIT | CPU_POWERPC */ - return 0; - - case CPU_TYPE_POWERPC: - /* Special case for PPC601 */ - if (cpusubtype == execsubtype) - return 6; - /* - * If we get here it is because it is a cpusubtype we - * don't support or a new cpusubtype that was added - * since this code was written. Both will be - * considered unacceptable. - */ - return 0; - /* NOTREACHED */ - - default: - return 0; - } - /* NOTREACHED */ - } - /* NOTREACHED */ -} - -extern vm_map_offset_t kvtophys64(vm_map_offset_t); - -boolean_t -kernacc( - off_t start, - size_t len -) -{ - off_t base; - off_t end; - - base = trunc_page_64(start); - end = start + len; - - while (base < end) { - if(kvtophys64((vm_map_offset_t)base) == (vm_map_offset_t)0) - return(FALSE); - base += page_size; - } - - return (TRUE); -} - -void -md_prepare_for_shutdown(int paniced, int howto, char * command); - -void -md_prepare_for_shutdown(__unused int paniced, __unused int howto, - __unused char * command) -{ - return; -} diff --git a/bsd/dev/ppc/km.c b/bsd/dev/ppc/km.c deleted file mode 100644 index e82d6be27..000000000 --- a/bsd/dev/ppc/km.c +++ /dev/null @@ -1,392 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). 
You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1992 NeXT Computer, Inc. All rights reserved. - * - * km.m - kernel keyboard/monitor module, procedural interface. - * - * HISTORY - */ - -#include <sys/kernel.h> -#include <sys/tty.h> - -#include <machine/cons.h> -#include <sys/conf.h> -#include <sys/systm.h> -#include <sys/uio.h> -#include <sys/fcntl.h> /* for kmopen */ -#include <sys/errno.h> -#include <sys/proc.h> /* for kmopen */ -#include <sys/msgbuf.h> -#include <sys/time.h> -#include <dev/kmreg_com.h> -#include <pexpert/pexpert.h> - -/* - * 'Global' variables, shared only by this file and conf.c. - */ -struct tty *km_tty[1] = { 0 }; - -/* - * this works early on, after initialize_screen() but before autoconf (and thus - * before we have a kmDevice). - */ -int disableConsoleOutput; - -static int initialized = 0; - -extern void kminit(void); - -// used by or implemented in the osfmk project -extern void cnputcusr(char); // From osfmk -extern int cngetc(void); // From osfmk -extern void cons_cinput(char ch); // Used by osfmk - -static int kmoutput(struct tty *tp); -static void kmtimeout(void *tp); -static void kmstart(struct tty *tp); - -extern void KeyboardOpen(void); - -void -kminit(void) -{ - km_tty[0] = ttymalloc(); - km_tty[0]->t_dev = makedev(12, 0); - initialized = 1; -} - -/* - * cdevsw interface to km driver. - */ -int -kmopen(dev_t dev, int flag, __unused int devtype, proc_t pp) -{ - int unit; - struct tty *tp; - struct winsize *wp; - int ret; - - unit = minor(dev); - if(unit >= 1) - return (ENXIO); - - tp = km_tty[unit]; - - tty_lock(tp); - - tp->t_oproc = kmstart; - tp->t_param = NULL; - tp->t_dev = dev; - - if ( !(tp->t_state & TS_ISOPEN) ) { - tp->t_iflag = TTYDEF_IFLAG; - tp->t_oflag = TTYDEF_OFLAG; - tp->t_cflag = (CREAD | CS8 | CLOCAL); - tp->t_lflag = TTYDEF_LFLAG; - tp->t_ispeed = tp->t_ospeed = TTYDEF_SPEED; - termioschars(&tp->t_termios); - ttsetwater(tp); - } else if ((tp->t_state & TS_XCLUDE) && proc_suser(pp)) { - ret = EBUSY; - goto out; - } - - tp->t_state |= TS_CARR_ON; /* lie and say carrier exists and is on. */ - - ret = ((*linesw[tp->t_line].l_open)(dev, tp)); - { - PE_Video video; - wp = &tp->t_winsize; - /* - * Magic numbers. 
These are CHARWIDTH and CHARHEIGHT - * from osfmk/ppc/POWERMAC/video_console.c - */ - wp->ws_xpixel = 8; - wp->ws_ypixel = 16; - - tty_unlock(tp); /* XXX race window */ - - if (flag & O_POPUP) - PE_initialize_console(0, kPETextScreen); - - bzero(&video, sizeof(video)); - PE_current_console(&video); - - tty_lock(tp); - - if( video.v_width != 0 && video.v_height != 0 ) { - wp->ws_col = video.v_width / wp->ws_xpixel; - wp->ws_row = video.v_height / wp->ws_ypixel; - } else { - wp->ws_col = 100; - wp->ws_row = 36; - } - } - -out: - tty_unlock(tp); - - return ret; -} - -int -kmclose(dev_t dev, __unused int flag, __unused int mode, __unused proc_t p) -{ - int ret; - struct tty *tp = km_tty[minor(dev)]; - - tty_lock(tp); - ret = (*linesw[tp->t_line].l_close)(tp,flag); - ttyclose(tp); - tty_unlock(tp); - - return (ret); -} - -int -kmread(dev_t dev, struct uio *uio, int ioflag) -{ - int ret; - struct tty *tp = km_tty[minor(dev)]; - - tty_lock(tp); - ret = (*linesw[tp->t_line].l_read)(tp, uio, ioflag); - tty_unlock(tp); - - return (ret); -} - -int -kmwrite(dev_t dev, struct uio *uio, int ioflag) -{ - int ret; - struct tty *tp = km_tty[minor(dev)]; - - tty_lock(tp); - ret = (*linesw[tp->t_line].l_write)(tp, uio, ioflag); - tty_unlock(tp); - - return (ret); -} - -int -kmioctl(dev_t dev, u_long cmd, caddr_t data, int flag, proc_t p) -{ - int error = 0; - struct tty *tp = km_tty[minor(dev)]; - struct winsize *wp; - - tty_lock(tp); - - switch (cmd) { - case KMIOCSIZE: - wp = (struct winsize *)data; - *wp = tp->t_winsize; - break; - - case TIOCSWINSZ: - /* Prevent changing of console size -- - * this ensures that login doesn't revert to the - * termcap-defined size - */ - error = EINVAL; - break; - - /* Bodge in the CLOCAL flag as the km device is always local */ - case TIOCSETA_32: - case TIOCSETAW_32: - case TIOCSETAF_32: - { - struct termios32 *t = (struct termios32 *)data; - t->c_cflag |= CLOCAL; - /* No Break */ - } - goto fallthrough; - case TIOCSETA_64: - case TIOCSETAW_64: - case TIOCSETAF_64: - { - struct user_termios *t = (struct user_termios *)data; - t->c_cflag |= CLOCAL; - /* No Break */ - } -fallthrough: - default: - error = (*linesw[tp->t_line].l_ioctl)(tp, cmd, data, flag, p); - if (ENOTTY != error) - break; - error = ttioctl_locked(tp, cmd, data, flag, p); - break; - } - - tty_unlock(tp); - - return (error); -} - -/* - * kmputc - * - * Output a character to the serial console driver via cnputcusr(), - * which is exported by that driver. - * - * Locks: Assumes tp in the calling tty driver code is locked on - * entry, remains locked on exit - * - * Notes: Called from kmoutput(); giving the locking output - * assumptions here, this routine should be static (and - * inlined, given there is only one call site). - */ -int -kmputc(__unused dev_t dev, char c) -{ - if(!disableConsoleOutput && initialized) { - /* OCRNL */ - if(c == '\n') - cnputcusr('\r'); - cnputcusr(c); - } - - return (0); -} - - -/* - * Callouts from linesw. 
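- * (kmstart is installed as the tty's t_oproc in kmopen; kmtimeout simply
- * re-drives kmoutput until the output queue drains.)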
- */ - -#define KM_LOWAT_DELAY ((ns_time_t)1000) - -/* - * t_oproc for this driver; called from within the line discipline - * - * Locks: Assumes tp is locked on entry, remains locked on exit - */ -static void -kmstart(struct tty *tp) -{ - if (tp->t_state & (TS_TIMEOUT | TS_BUSY | TS_TTSTOP)) - goto out; - if (tp->t_outq.c_cc == 0) - goto out; - tp->t_state |= TS_BUSY; - kmoutput(tp); - return; - -out: - (*linesw[tp->t_line].l_start)(tp); - return; -} - -/* - * One-shot output retry timeout from kmoutput(); re-calls kmoutput() at - * intervals until the output queue for the tty is empty, at which point - * the timeout is not rescheduled by kmoutput() - * - * This function must take the tty_lock() around the kmoutput() call; it - * ignores the return value. - */ -static void -kmtimeout(void *arg) -{ - struct tty *tp = (struct tty *)arg; - - tty_lock(tp); - (void)kmoutput(tp); - tty_unlock(tp); -} - -/* - * kmoutput - * - * Locks: Assumes tp is locked on entry, remains locked on exit - * - * Notes: Called from kmstart() and kmtimeout(); kmtimeout() is a - * timer initiated by this routine to deal with pending - * output not yet flushed (output is flushed at a maximum - * of sizeof(buf) characters at a time before dropping into - * the timeout code). - */ -static int -kmoutput(struct tty *tp) -{ - char buf[80]; /* buffer; limits output per call */ - char *cp; - int cc = -1; - - - /* While there is data available to be output... */ - while (tp->t_outq.c_cc > 0) { - cc = ndqb(&tp->t_outq, 0); - if (cc == 0) - break; - /* - * attempt to output as many characters as are available, - * up to the available transfer buffer size. - */ - cc = min(cc, sizeof buf); - /* copy the output queue contents to the buffer */ - (void) q_to_b(&tp->t_outq, (unsigned char *)buf, cc); - for (cp = buf; cp < &buf[cc]; cp++) { - /* output the buffer one character at a time */ - kmputc(tp->t_dev, *cp & 0x7f); - } - } - if (tp->t_outq.c_cc > 0) { - timeout((timeout_fcn_t)kmtimeout, tp, hz); - } - tp->t_state &= ~TS_BUSY; - (*linesw[tp->t_line].l_start)(tp); - - return 0; -} - -/* - * cons_cinput - * - * Driver character input from the polled mode serial console driver calls - * this routine to input a character from the serial driver into the tty - * line discipline specific input processing receive interrupt routine, - * l_rint(). - * - * Locks: Assumes that the tty_lock() is NOT held on the tp, so a - * serial driver should NOT call this function as a result - * of being called from a function which already holds the - * lock; ECHOE will be handled at the line discipline, if - * output echo processing is going to occur. - */ -void -cons_cinput(char ch) -{ - struct tty *tp = km_tty[0]; /* XXX */ - - tty_lock(tp); - (*linesw[tp->t_line].l_rint) (ch, tp); - tty_unlock(tp); -} diff --git a/bsd/dev/ppc/mem.c b/bsd/dev/ppc/mem.c deleted file mode 100644 index fc2d39efb..000000000 --- a/bsd/dev/ppc/mem.c +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License.
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/*- - * Copyright (c) 1988 University of Utah. - * Copyright (c) 1982, 1986, 1990, 1993 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * the Systems Programming Group of the University of Utah Computer - * Science Department, and code derived from software contributed to - * Berkeley by William Jolitz. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * from: Utah $Hdr: mem.c 1.13 89/10/08$ - * @(#)mem.c 8.1 (Berkeley) 6/11/93 - */ - -#include <mach_load.h> - -/* - * Memory special file - */ - -#include <sys/param.h> -#include <sys/dir.h> -#include <sys/proc.h> -#include <sys/systm.h> -#include <sys/conf.h> -#include <sys/vm.h> -#include <sys/uio_internal.h> -#include <sys/malloc.h> - -#include <vm/pmap.h> -#include <vm/vm_map.h> -#include <vm/vm_kern.h> -#include <mach/vm_param.h> - -#include <ppc/Diagnostics.h> -#include <ppc/mappings.h> - -static caddr_t devzerobuf; - -extern boolean_t kernacc(off_t, size_t ); -extern int setup_kmem; - -int mmread(dev_t dev, struct uio *uio, int flag); -int mmrw(dev_t dev, struct uio *uio, enum uio_rw rw); -int mmioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p); -int mmwrite(dev_t dev, struct uio *uio, int flag); - -int -mmread(dev_t dev, struct uio *uio, __unused int flag) -{ - - return (mmrw(dev, uio, UIO_READ)); -} - -int -mmwrite(dev_t dev, struct uio *uio, __unused int flag) -{ - - return (mmrw(dev, uio, UIO_WRITE)); -} - -int -mmioctl(dev_t dev, u_long cmd, __unused caddr_t data, - __unused int flag, __unused struct proc *p) -{ - int minnum = minor(dev); - - if ((setup_kmem == 0) && ((minnum == 0) || (minnum == 1))) - return(EINVAL); - - switch (cmd) { - case FIONBIO: - case FIOASYNC: - /* OK to do nothing: we always return immediately */ - break; - default: - return ENODEV; - } - - return (0); -} - -int -mmrw(dev, uio, rw) - dev_t dev; - struct uio *uio; - enum uio_rw rw; -{ - register int o; -#if LP64KERN - register uint64_t c; -#else - register uint c; -#endif - addr64_t vll; - int error = 0; - vm_offset_t where; - - while (uio_resid(uio) > 0 && error == 0) { - uio_update(uio, 0); - - switch (minor(dev)) { - -/* minor device 0 is physical memory */ - case 0: - if (setup_kmem == 0) - return(ENODEV); - vll = trunc_page_64(uio->uio_offset); - if(((vll >> 31) == 1) || vll >= ((dgWork.dgFlags & enaDiagDM) ? mem_actual : max_mem)) - goto fault; - - if(dgWork.dgFlags & enaDiagDM) { /* Can we really get all memory? 
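- (With enaDiagDM set, the limit check above widens from max_mem to
- mem_actual, and the page is read through a throwaway read-only
- mapping made just below.)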
*/ - if (kmem_alloc_pageable(kernel_map, &where, PAGE_SIZE) != KERN_SUCCESS) { - goto fault; - } - else { - addr64_t collad; - - collad = mapping_make(kernel_pmap, (addr64_t)where, (ppnum_t)(vll >> 12), 0, 1, VM_PROT_READ); /* Map it in for the moment */ - if(collad) { /* See if it failed (shouldn't happen) */ - kmem_free(kernel_map, where, PAGE_SIZE); /* Toss the page */ - goto fault; /* Kill the transfer */ - } - } - } - else { - if (kmem_alloc(kernel_map, &where, 4096) - != KERN_SUCCESS) { - goto fault; - } - } - o = uio->uio_offset - vll; - c = min(PAGE_SIZE - o, uio_curriovlen(uio)); - error = uiomove((caddr_t)(where + o), c, uio); - - if(dgWork.dgFlags & enaDiagDM) (void)mapping_remove(kernel_pmap, (addr64_t)where); /* Unmap it */ - kmem_free(kernel_map, where, PAGE_SIZE); - continue; - - /* minor device 1 is kernel memory */ - case 1: - if (setup_kmem == 0) - return(ENODEV); - /* Do some sanity checking */ - if (((addr64_t)uio->uio_offset > vm_last_addr) || - ((addr64_t)uio->uio_offset < VM_MIN_KERNEL_ADDRESS)) - goto fault; - c = uio_curriovlen(uio); - if (!kernacc(uio->uio_offset, c)) - goto fault; - error = uiomove64(uio->uio_offset, c, uio); - continue; - - /* minor device 2 is EOF/RATHOLE */ - case 2: - if (rw == UIO_READ) - return (0); - c = uio_curriovlen(uio); - break; - /* minor device 3 is ZERO/RATHOLE */ - case 3: - if(devzerobuf == NULL) { - MALLOC(devzerobuf, caddr_t,PAGE_SIZE, M_TEMP, M_WAITOK); - bzero(devzerobuf, PAGE_SIZE); - } - if(uio->uio_rw == UIO_WRITE) { - c = uio_curriovlen(uio); - break; - } - c = min(uio_curriovlen(uio), PAGE_SIZE); - error = uiomove(devzerobuf, c, uio); - continue; - default: - goto fault; - break; - } - - if (error) - break; - uio_update(uio, c); - } - return (error); -fault: - return (EFAULT); -} - diff --git a/bsd/dev/ppc/munge.s b/bsd/dev/ppc/munge.s deleted file mode 100644 index 9e33bc326..000000000 --- a/bsd/dev/ppc/munge.s +++ /dev/null @@ -1,477 +0,0 @@ -/* - * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * Syscall argument mungers. - * - * Passed a pointer to the user's register array in the savearea, we copy args into - * the uu_arg[] array, padding etc. as appropriate.
The issue is that parameters - * passed in registers from a 32-bit address space do not map directly into the uu_args. - * For example, a 32-bit long-long comes in two registers, but we need to combine - * them into one 64-bit long-long in the uu_args. - * - * There are several functions in this file. Each takes two parameters: - * - * void munge_XXXX( const void *regs, void *uu_args); - * - * The name of the function encodes the number and type of the parameters, as follows: - * - * w = a 32-bit value such as an int or a 32-bit ptr, that does not require - * sign extension. These are handled by skipping a word in the input, - * zeroing a word of output, and copying a word from input to output. - * - * s = a 32-bit value such as a long, which must be sign-extended to a 64-bit - * long-long in the uu_args. These are handled by skipping a word of - * input, loading a word of input and sign extending it to a double, - * and storing two words of output. - * - * l = a 64-bit long-long, passed in two registers. These are handled by skipping - * a word of input, copying a word, skipping another word of input, and - * copying another word. - * - * d = a 32-bit int or a 64-bit ptr or long, passed in via a 64-bit GPR - * from a 64-bit process. We copy two words from input to output. - * - * For example, "munge_wls" takes a word, a long-long, and a word. This takes - * four registers: the first word is in one, the long-long takes two, and the - * final word is in the fourth. We store six words: a 0, the low words of the - * first three registers, and the two words resulting from sign-extending the - * low word of the fourth register. - * - * As you can see, we save a lot of code by collapsing mungers that are prefixes - * of each other into the more general routine. This ends up copying a few extra - * bytes of parameters, but big deal. The old kernel copied all eight words for - * every system call. - * - * These routines assume explicit pad words in the uu_arg structures that fill out - * int parameters to 64 bits. Having pad words makes munging args for 64-bit - * processes the equivalent of a simple bcopy(), though it does introduce an - * endian dependency.
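- *
- * As a rough C sketch of one case (illustrative only -- the helper name
- * and the big-endian layout of the 64-bit register slots are assumptions
- * of this sketch, not kernel interfaces), munge_wl behaves like:
- *
- *	// regs: 64-bit savearea slots, low word at index 1 (big-endian)
- *	void munge_wl_sketch(const uint32_t regs[][2], uint32_t out[][2])
- *	{
- *		out[0][0] = 0;           // w: zero the pad word
- *		out[0][1] = regs[0][1];  //    copy the 32-bit value
- *		out[1][0] = regs[1][1];  // l: high word from one register...
- *		out[1][1] = regs[2][1];  //    ...low word from the next
- *	}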
- */ - - .align 5 - .globl _munge_dddddddd // that is 8 'd's -_munge_dddddddd: - .globl _munge_ddddddd -_munge_ddddddd: - .globl _munge_dddddd -_munge_dddddd: - .globl _munge_ddddd -_munge_ddddd: - ld r5,0*8+0(r3) - ld r6,1*8+0(r3) - ld r7,2*8+0(r3) - ld r8,3*8+0(r3) - ld r9,4*8+0(r3) - ld r10,5*8+0(r3) - ld r11,6*8+0(r3) - ld r12,7*8+0(r3) - - std r5,0*8+0(r4) - std r6,1*8+0(r4) - std r7,2*8+0(r4) - std r8,3*8+0(r4) - std r9,4*8+0(r4) - std r10,5*8+0(r4) - std r11,6*8+0(r4) - std r12,7*8+0(r4) - - blr - - - .align 5 - .globl _munge_dddd -_munge_dddd: - .globl _munge_ddd -_munge_ddd: - .globl _munge_dd -_munge_dd: - .globl _munge_d -_munge_d: - ld r5,0*8+0(r3) - ld r6,1*8+0(r3) - ld r7,2*8+0(r3) - ld r8,3*8+0(r3) - - std r5,0*8+0(r4) - std r6,1*8+0(r4) - std r7,2*8+0(r4) - std r8,3*8+0(r4) - - blr - - - .align 5 - .globl _munge_wwwwwwww // that is 8 'w's -_munge_wwwwwwww: - .globl _munge_wwwwwww -_munge_wwwwwww: - .globl _munge_wwwwww -_munge_wwwwww: - .globl _munge_wwwww -_munge_wwwww: - li r0,0 - lwz r5,0*8+4(r3) - lwz r6,1*8+4(r3) - lwz r7,2*8+4(r3) - lwz r8,3*8+4(r3) - lwz r9,4*8+4(r3) - lwz r10,5*8+4(r3) - lwz r11,6*8+4(r3) - lwz r12,7*8+4(r3) - - stw r0,0*8+0(r4) - stw r5,0*8+4(r4) - stw r0,1*8+0(r4) - stw r6,1*8+4(r4) - stw r0,2*8+0(r4) - stw r7,2*8+4(r4) - stw r0,3*8+0(r4) - stw r8,3*8+4(r4) - stw r0,4*8+0(r4) - stw r9,4*8+4(r4) - stw r0,5*8+0(r4) - stw r10,5*8+4(r4) - stw r0,6*8+0(r4) - stw r11,6*8+4(r4) - stw r0,7*8+0(r4) - stw r12,7*8+4(r4) - - blr - - - .align 5 - .globl _munge_wwww -_munge_wwww: - .globl _munge_www -_munge_www: - .globl _munge_ww -_munge_ww: - .globl _munge_w -_munge_w: - li r0,0 - lwz r5,0*8+4(r3) - lwz r6,1*8+4(r3) - lwz r7,2*8+4(r3) - lwz r8,3*8+4(r3) - - stw r0,0*8+0(r4) - stw r5,0*8+4(r4) - stw r0,1*8+0(r4) - stw r6,1*8+4(r4) - stw r0,2*8+0(r4) - stw r7,2*8+4(r4) - stw r0,3*8+0(r4) - stw r8,3*8+4(r4) - - blr - - .align 5 - .globl _munge_l -_munge_l: - li r0,0 - lwz r5,0*8+4(r3) - lwz r6,1*8+4(r3) - - stw r5,0*8+0(r4) - stw r6,0*8+4(r4) - - blr - - .align 5 - .globl _munge_wlw -_munge_wlw: - .globl _munge_wl -_munge_wl: - li r0,0 - lwz r5,0*8+4(r3) - lwz r6,1*8+4(r3) - lwz r7,2*8+4(r3) - lwz r8,3*8+4(r3) - - stw r0,0*8+0(r4) - stw r5,0*8+4(r4) - stw r6,1*8+0(r4) - stw r7,1*8+4(r4) - stw r0,2*8+0(r4) - stw r8,2*8+4(r4) - - blr - - - .align 5 - .globl _munge_wwwl -_munge_wwwl: - li r0,0 - lwz r5,0*8+4(r3) - lwz r6,1*8+4(r3) - lwz r7,2*8+4(r3) - lwz r8,3*8+4(r3) - lwz r9,4*8+4(r3) - - stw r0,0*8+0(r4) - stw r5,0*8+4(r4) - stw r0,1*8+0(r4) - stw r6,1*8+4(r4) - stw r0,2*8+0(r4) - stw r7,2*8+4(r4) - stw r8,3*8+0(r4) - stw r9,3*8+4(r4) - - blr - - - .align 5 - .globl _munge_wwwlww -_munge_wwwlww: - li r0,0 - lwz r5,0*8+4(r3) - lwz r6,1*8+4(r3) - lwz r7,2*8+4(r3) - lwz r8,3*8+4(r3) - lwz r9,4*8+4(r3) - lwz r10,5*8+4(r3) - lwz r11,6*8+4(r3) - - stw r0,0*8+0(r4) - stw r5,0*8+4(r4) - stw r0,1*8+0(r4) - stw r6,1*8+4(r4) - stw r0,2*8+0(r4) - stw r7,2*8+4(r4) - stw r8,3*8+0(r4) - stw r9,3*8+4(r4) - stw r0,4*8+0(r4) - stw r10,4*8+4(r4) - stw r0,5*8+0(r4) - stw r11,5*8+4(r4) - - blr - - - .align 5 - .globl _munge_wwlwww -_munge_wwlwww: - li r0,0 - lwz r5,0*8+4(r3) // Wwlwww - lwz r6,1*8+4(r3) // wWlwww - lwz r7,2*8+4(r3) // wwLwww (hi) - lwz r8,3*8+4(r3) // wwLwww (lo) - lwz r9,4*8+4(r3) // wwlWww - lwz r10,5*8+4(r3) // wwlwWw - lwz r11,6*8+4(r3) // wwlwwW - - stw r0,0*8+0(r4) // 0wlwww - stw r5,0*8+4(r4) // Wwlwww - stw r0,1*8+0(r4) // w0lwww - stw r6,1*8+4(r4) // wWlwww - stw r7,2*8+0(r4) // wwLwww (hi) - stw r8,2*8+4(r4) // wwLwww (lo) - stw r0,3*8+0(r4) // wwl0ww - 
stw r9,3*8+4(r4) // wwlwww - stw r0, 4*8+0(r4) // wwlw0w - stw r10,4*8+4(r4) // wwlwWw - stw r0, 5*8+0(r4) // wwlww0 - stw r11,5*8+4(r4) // wwlwwW - - blr - - .align 5 - .globl _munge_wwwwlw // 4 'w's, an l, and a w -_munge_wwwwlw: - li r0,0 - lwz r5,0*8+4(r3) - lwz r6,1*8+4(r3) - lwz r7,2*8+4(r3) - lwz r8,3*8+4(r3) - lwz r9,4*8+4(r3) - lwz r10,5*8+4(r3) - lwz r11,6*8+4(r3) - - stw r0,0*8+0(r4) - stw r5,0*8+4(r4) - stw r0,1*8+0(r4) - stw r6,1*8+4(r4) - stw r0,2*8+0(r4) - stw r7,2*8+4(r4) - stw r0,3*8+0(r4) - stw r8,3*8+4(r4) - stw r9,4*8+0(r4) - stw r10,4*8+4(r4) - stw r0,5*8+0(r4) - stw r11,5*8+4(r4) - - blr - - - .align 5 - .globl _munge_wwwwl // 4 'w's and an l -_munge_wwwwl: - li r0,0 - lwz r5,0*8+4(r3) - lwz r6,1*8+4(r3) - lwz r7,2*8+4(r3) - lwz r8,3*8+4(r3) - lwz r9,4*8+4(r3) - lwz r10,5*8+4(r3) - - stw r0,0*8+0(r4) - stw r5,0*8+4(r4) - stw r0,1*8+0(r4) - stw r6,1*8+4(r4) - stw r0,2*8+0(r4) - stw r7,2*8+4(r4) - stw r0,3*8+0(r4) - stw r8,3*8+4(r4) - stw r9,4*8+0(r4) - stw r10,4*8+4(r4) - - blr - - - .align 5 - .globl _munge_wwwwwl // 5 'w's and an l -_munge_wwwwwl: - li r0,0 - lwz r5,0*8+4(r3) - lwz r6,1*8+4(r3) - lwz r7,2*8+4(r3) - lwz r8,3*8+4(r3) - lwz r9,4*8+4(r3) - lwz r10,5*8+4(r3) - lwz r11,6*8+4(r3) - - stw r0,0*8+0(r4) - stw r5,0*8+4(r4) - stw r0,1*8+0(r4) - stw r6,1*8+4(r4) - stw r0,2*8+0(r4) - stw r7,2*8+4(r4) - stw r0,3*8+0(r4) - stw r8,3*8+4(r4) - stw r0,4*8+0(r4) - stw r9,4*8+4(r4) - stw r10,5*8+0(r4) - stw r11,5*8+4(r4) - - blr - - - .align 5 - .globl _munge_wsw -_munge_wsw: - li r0,0 - lwz r5,0*8+4(r3) - lwz r6,1*8+4(r3) - lwz r7,2*8+4(r3) - - stw r0,0*8+0(r4) - srawi r2,r6,31 - stw r5,0*8+4(r4) - stw r2,1*8+0(r4) - stw r6,1*8+4(r4) - stw r0,2*8+0(r4) - stw r7,2*8+4(r4) - - blr - - - .align 5 - .globl _munge_wws -_munge_wws: - li r0,0 - lwz r5,0*8+4(r3) - lwz r6,1*8+4(r3) - lwz r7,2*8+4(r3) - - stw r0,0*8+0(r4) - stw r5,0*8+4(r4) - stw r0,1*8+0(r4) - srawi r2,r7,31 - stw r6,1*8+4(r4) - stw r2,2*8+0(r4) - stw r7,2*8+4(r4) - - blr - - - .align 5 - .globl _munge_wwwsw -_munge_wwwsw: - li r0,0 - lwz r5,0*8+4(r3) - lwz r6,1*8+4(r3) - lwz r7,2*8+4(r3) - lwz r8,3*8+4(r3) - lwz r9,4*8+4(r3) - - stw r0,0*8+0(r4) - stw r5,0*8+4(r4) - stw r0,1*8+0(r4) - stw r6,1*8+4(r4) - srawi r2,r8,31 - stw r0,2*8+0(r4) - stw r7,2*8+4(r4) - stw r2,3*8+0(r4) - stw r8,3*8+4(r4) - stw r0,4*8+0(r4) - stw r9,4*8+4(r4) - - blr - - .align 5 - .globl _munge_llllll -_munge_llllll: - li r0,0 - lwz r5,0*8+4(r3) // l1 - lwz r6,1*8+4(r3) - lwz r7,2*8+4(r3) // l2 - lwz r8,3*8+4(r3) - lwz r9,4*8+4(r3) // l3 - lwz r10,5*8+4(r3) - lwz r11,6*8+4(r3) // l4 - - stw r5,0*8+0(r4) - stw r6,0*8+4(r4) - stw r7,1*8+0(r4) - stw r8,1*8+4(r4) - stw r9,2*8+0(r4) - stw r10,2*8+4(r4) - stw r11,3*8+0(r4) - - // the rest spill to the stack (r1) - // we'll zero fill for now - // and make the syscall handler - // do the copyin from the user stack - stw r0,3*8+4(r4) - stw r0,4*8+0(r4) - stw r0,4*8+4(r4) - stw r0,5*8+0(r4) - stw r0,5*8+4(r4) - - blr diff --git a/bsd/dev/ppc/ppc_init.c b/bsd/dev/ppc/ppc_init.c deleted file mode 100644 index 545cfe5ae..000000000 --- a/bsd/dev/ppc/ppc_init.c +++ /dev/null @@ -1,276 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License.
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - - -#include <mach/vm_types.h> -#include <mach/vm_param.h> -#include <mach/thread_status.h> -#include <kern/misc_protos.h> -#include <kern/assert.h> -#include <kern/cpu_number.h> - -#include <ppc/proc_reg.h> -#include <ppc/boot.h> -#include <ppc/misc_protos.h> -#include <ppc/pmap.h> -#include <ppc/pmap_internals.h> -#include <ppc/mem.h> -#include <ppc/exception.h> -#include <ppc/gdb_defs.h> -#include <ppc/POWERMAC/video_board.h> -#include <ppc/POWERMAC/video_pdm.h> - -#ifdef __MACHO__ -#include <libkern/kernel_mach_header.h> -#endif - -/* External references */ - -extern unsigned int intstack[]; /* declared in start.s */ -extern unsigned int intstack_top_ss; /* declared in start.s */ -#if MACH_KGDB -extern unsigned int gdbstackptr; /* declared in start.s */ -extern unsigned int gdbstack_top_ss; /* declared in start.s */ -#endif /* MACH_KGDB */ - -/* Stuff declared in kern/bootstrap.c which we may need to initialise */ - -extern vm_offset_t boot_start; -extern vm_size_t boot_size; -extern vm_offset_t boot_region_desc; -extern vm_size_t boot_region_count; -extern int boot_thread_state_flavor; -extern thread_state_t boot_thread_state; -extern unsigned int boot_thread_state_count; - -/* Trap handling function prototypes */ - -extern void thandler(void); /* trap handler */ -extern void ihandler(void); /* interrupt handler */ -extern void shandler(void); /* syscall handler */ -extern void gdbhandler(void); /* debugger handler */ -extern void fpu_switch(void); /* fp handler */ -extern void atomic_switch_trap(void); /* fast path atomic thread switch */ - -/* definitions */ - -struct ppc_thread_state boot_task_thread_state; - - - - - -#if 1 /* TODO NMGS - vm_map_steal_memory shouldn't use these - remove */ -vm_offset_t avail_start; -vm_offset_t avail_end; -#endif -unsigned int avail_remaining = 0; -vm_offset_t first_avail; - -/* - * Mach-O Support - */ - - -#ifdef __MACHO__ -void *sectTEXTB; -unsigned long sectSizeTEXT; -void *sectDATAB; -unsigned long sectSizeDATA; -void *sectOBJCB; -unsigned long sectSizeOBJC; -void *sectLINKB; -unsigned long sectSizeLINK; - -vm_offset_t end, etext, edata; -#define ETEXT etext -#endif - - - -void ppc_vm_init(unsigned int memory_size, boot_args *args) -{ - unsigned int htabmask; - unsigned int i; - vm_offset_t addr; - int boot_task_end_offset; - - printf("mem_size = %d M\n",memory_size / (1024 * 1024)); - -#ifdef __MACHO__ - /* Now retrieve addresses for end, edata, and etext - * from MACH-O headers. 
- */ - - - etext = (vm_offset_t) sectTEXTB + sectSizeTEXT; - edata = (vm_offset_t) sectDATAB + sectSizeDATA; - end = getlastaddr(); -#endif - - /* Stitch valid memory regions together - they may be contiguous - * even though they're not already glued together - */ - - /* Go through the list of memory regions passed in via the args - * and copy valid entries into the pmap_mem_regions table, adding - * further calculated entries. - */ - - - /* Initialise the pmap system, using space above `first_avail'*/ - -#ifndef __MACHO__ - free_regions[free_regions_count].start = - round_page((unsigned int)&_ExceptionVectorsEnd - - (unsigned int)&_ExceptionVectorsStart); -#else - /* On MACH-O generated kernels, the Exception Vectors - * are already mapped and loaded at 0 -- no relocation - * or freeing of memory is needed - */ - - free_regions[free_regions_count].start = round_page((unsigned int)&_ExceptionVectorsEnd) + 4096; -#endif - - /* If we are on a PDM machine memory at 1M might be used - * for video. TODO NMGS call video driver to do this - * somehow - */ - - - /* For PowerMac, first_avail is set to above the bootstrap task. - * TODO NMGS - different screen modes - might free mem? - */ - - first_avail = round_page(args->first_avail); - - - /* map in the exception vectors */ - /* - * map the kernel text, data and bss. Don't forget other regions too - */ - for (i = 0; i < args->kern_info.region_count; i++) { -#if MACH_KDB - if (args->kern_info.regions[i].prot == VM_PROT_NONE && - i == args->kern_info.region_count - 1) { - /* assume that's the kernel symbol table */ - kern_sym_start = args->kern_info.regions[i].addr; - kern_sym_size = args->kern_info.regions[i].size; - printf("kernel symbol table at 0x%x size 0x%x\n", - kern_sym_start, kern_sym_size); - args->kern_info.regions[i].prot |= - (VM_PROT_WRITE|VM_PROT_READ); - } -#endif /* MACH_KDB */ - -#ifdef __MACHO__ - /* Skip the VECTORS segment */ - if (args->kern_info.regions[i].addr == 0) - continue; -#endif - - boot_region_count = args->task_info.region_count; - boot_size = 0; - boot_task_end_offset = 0; - /* Map bootstrap task pages 1-1 so that user_bootstrap can find it */ - for (i = 0; i < boot_region_count; i++) { - if (args->task_info.regions[i].mapped) { - /* kernel requires everything page aligned */ -#if DEBUG - printf("mapping virt 0x%08x to phys 0x%08x end 0x%x, prot=0x%b\n", - ppc_trunc_page(args->task_info.base_addr + - args->task_info.regions[i].offset), - ppc_trunc_page(args->task_info.base_addr + - args->task_info.regions[i].offset), - ppc_round_page(args->task_info.base_addr + - args->task_info.regions[i].offset + - args->task_info.regions[i].size), - args->task_info.regions[i].prot, - "\x10\1READ\2WRITE\3EXEC"); -#endif /* DEBUG */ - - (void)pmap_map( - ppc_trunc_page(args->task_info.base_addr + - args->task_info.regions[i].offset), - ppc_trunc_page(args->task_info.base_addr + - args->task_info.regions[i].offset), - ppc_round_page(args->task_info.base_addr + - args->task_info.regions[i].offset + - args->task_info.regions[i].size), - args->task_info.regions[i].prot); - - /* Count the size of mapped space */ - boot_size += args->task_info.regions[i].size; - - /* There may be an overlapping physical page - * mapped to two different virtual addresses - */ - if (boot_task_end_offset > - args->task_info.regions[i].offset) { - boot_size -= boot_task_end_offset - - args->task_info.regions[i].offset; -#if DEBUG - printf("WARNING - bootstrap overlaps regions\n"); -#endif /* DEBUG */ - } - - boot_task_end_offset = - 
args->task_info.regions[i].offset + - args->task_info.regions[i].size; - } - } - - if (boot_region_count) { - - /* Add a new region to the bootstrap task for its stack */ - args->task_info.regions[boot_region_count].addr = - BOOT_STACK_BASE; - args->task_info.regions[boot_region_count].size = - BOOT_STACK_SIZE; - args->task_info.regions[boot_region_count].mapped = FALSE; - boot_region_count++; - - boot_start = args->task_info.base_addr; - boot_region_desc = (vm_offset_t) args->task_info.regions; - /* TODO NMGS need to put param info onto top of boot stack */ - boot_task_thread_state.r1 = BOOT_STACK_PTR-0x100; - boot_task_thread_state.srr0 = args->task_info.entry; - boot_task_thread_state.srr1 = - MSR_MARK_SYSCALL(MSR_EXPORT_MASK_SET); - - boot_thread_state_flavor = PPC_THREAD_STATE; - boot_thread_state_count = PPC_THREAD_STATE_COUNT; - boot_thread_state = - (thread_state_t)&boot_task_thread_state; - } - - - -} - diff --git a/bsd/dev/ppc/sdt_ppc.c b/bsd/dev/ppc/sdt_ppc.c deleted file mode 100644 index bcd10c967..000000000 --- a/bsd/dev/ppc/sdt_ppc.c +++ /dev/null @@ -1,71 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -/* #pragma ident "@(#)sdt.c 1.6 06/03/24 SMI" */ - -#ifdef KERNEL -#ifndef _KERNEL -#define _KERNEL /* Solaris vs. Darwin */ -#endif -#endif - -#define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */ -#include <kern/cpu_data.h> -#include <kern/thread.h> -#include <mach/thread_status.h> - -#include <sys/dtrace.h> -#include <sys/dtrace_impl.h> - -#include <sys/dtrace_glue.h> - -#include <sys/sdt_impl.h> -#include <machine/cpu_capabilities.h> - -extern sdt_probe_t **sdt_probetab; - -/*ARGSUSED*/ -int -sdt_invop(uintptr_t addr, uintptr_t *stack, uintptr_t eax) -{ - uint64_t mask = (_cpu_capabilities & k64Bit) ? 0xffffffffffffffffULL : 0x00000000ffffffffULL; - -#pragma unused(eax) - sdt_probe_t *sdt = sdt_probetab[SDT_ADDR2NDX(addr)]; - - for (; sdt != NULL; sdt = sdt->sdp_hashnext) { - if ((uintptr_t)sdt->sdp_patchpoint == addr) { - ppc_saved_state_t *regs = (ppc_saved_state_t *)stack; - - dtrace_probe(sdt->sdp_id, regs->save_r3 & mask, regs->save_r4 & mask, - regs->save_r5 & mask, regs->save_r6 & mask, regs->save_r7 & mask); - - return (DTRACE_INVOP_NOP); - } - } - - return (0); -} - diff --git a/bsd/dev/ppc/stubs.c b/bsd/dev/ppc/stubs.c deleted file mode 100644 index 55e2f0170..000000000 --- a/bsd/dev/ppc/stubs.c +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1997 by Apple Computer, Inc., all rights reserved - * Copyright (c) 1993 NeXT Computer, Inc. - * - */ - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/ioctl.h> -#include <sys/tty.h> -#include <sys/conf.h> -#include <sys/proc.h> -#include <sys/user.h> -#include <kern/thread.h> -#include <kern/task.h> -#include <vm/vm_map.h> - -extern void *get_bsduthreadarg(thread_t); -extern int *get_bsduthreadrval(thread_t); - -/* - * copy a null terminated string from one point to another in - * the kernel address space. - * - no access checks are performed. - * - if the end of string isn't found before - * maxlen bytes are copied, return ENAMETOOLONG, - * indicating an incomplete copy. - * - otherwise, return 0, indicating success. - * the number of bytes copied is always returned in lencopied. - */ -/* from ppc/fault_copy.c -Titan1T4 VERSION */ -int -copystr(const void *vfrom, void *vto, size_t maxlen, size_t *lencopied) -{ - register unsigned l; - const char *from; - char *to; - - from = vfrom; - to = vto; - for (l = 0; l < maxlen; l++) - if ((*to++ = *from++) == '\0') { - if (lencopied) - *lencopied = l + 1; - return 0; - } - if (lencopied) - *lencopied = maxlen; - return ENAMETOOLONG; -} - -int copywithin(src, dst, count) -void * src, *dst; -size_t count; -{ - bcopy(src,dst,count); - return 0; -} - -void * -get_bsduthreadarg(thread_t th) -{ -struct uthread *ut; - ut = get_bsdthread_info(th); - return((void *)(ut->uu_arg)); -} - -int * -get_bsduthreadrval(thread_t th) -{ -struct uthread *ut; - ut = get_bsdthread_info(th); - return(&ut->uu_rval[0]); -} - diff --git a/bsd/dev/ppc/systemcalls.c b/bsd/dev/ppc/systemcalls.c deleted file mode 100644 index a8fd2dcfd..000000000 --- a/bsd/dev/ppc/systemcalls.c +++ /dev/null @@ -1,435 +0,0 @@ -/* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * NOTICE: This file was modified by McAfee Research in 2004 to introduce - * support for mandatory and extensible security protections. This notice - * is included in support of clause 2.2 (b) of the Apple Public License, - * Version 2.0. - */ - -#include <mach/mach_traps.h> - -#include <kern/task.h> -#include <kern/thread.h> -#include <kern/assert.h> -#include <kern/clock.h> -#include <kern/locks.h> -#include <kern/sched_prim.h> -#include <mach/machine/thread_status.h> -#include <mach/thread_act.h> -#include <ppc/savearea.h> - -#include <sys/kernel.h> -#include <sys/vm.h> -#include <sys/proc_internal.h> -#include <sys/syscall.h> -#include <sys/systm.h> -#include <sys/user.h> -#include <sys/errno.h> -#include <sys/kdebug.h> -#include <sys/sysent.h> -#include <sys/sysproto.h> -#include <sys/kauth.h> - -#include <security/audit/audit.h> - -#if CONFIG_DTRACE -extern int32_t dtrace_systrace_syscall(struct proc *, void *, int *); -extern void dtrace_systrace_syscall_return(unsigned short, int, int *); -#endif - -extern void -unix_syscall(struct savearea *regs); - -extern struct savearea * -find_user_regs( - thread_t act); - -extern lck_spin_t * tz_slock; - -/* - * Function: unix_syscall - * - * Inputs: regs - pointer to Process Control Block - * - * Outputs: none - */ -void -unix_syscall(struct savearea *regs) -{ - thread_t thread_act; - struct uthread *uthread; - struct proc *proc; - struct sysent *callp; - int error; - unsigned int code; - boolean_t flavor; - - flavor = (((unsigned int)regs->save_r0) == 0)? 
1: 0; - - if (flavor) - code = regs->save_r3; - else - code = regs->save_r0; - - if (kdebug_enable && (code != 180)) { - if (flavor) - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, - regs->save_r4, regs->save_r5, regs->save_r6, regs->save_r7, 0); - else - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START, - regs->save_r3, regs->save_r4, regs->save_r5, regs->save_r6, 0); - } - thread_act = current_thread(); - uthread = get_bsdthread_info(thread_act); - - if (!(uthread->uu_flag & UT_VFORK)) - proc = (struct proc *)get_bsdtask_info(current_task()); - else - proc = current_proc(); - - /* Make sure there is a process associated with this task */ - if (proc == NULL) { - regs->save_r3 = (long long)EPERM; - /* set the "pc" to execute cerror routine */ - regs->save_srr0 -= 4; - task_terminate_internal(current_task()); - thread_exception_return(); - /* NOTREACHED */ - } - - /* - * Delayed binding of thread credential to process credential, if we - * are not running with an explicitly set thread credential. - */ - kauth_cred_uthread_update(uthread, proc); - - callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code]; - - if (callp->sy_narg != 0) { - void *regsp; - sy_munge_t *mungerp; - - if (IS_64BIT_PROCESS(proc)) { - /* XXX Turn 64 bit unsafe calls into nosys() */ - if (callp->sy_flags & UNSAFE_64BIT) { - callp = &sysent[63]; - goto unsafe; - } - mungerp = callp->sy_arg_munge64; - } - else { - mungerp = callp->sy_arg_munge32; - } - if ( !flavor) { - regsp = (void *) &regs->save_r3; - } else { - /* indirect system call consumes an argument so only 7 are supported */ - if (callp->sy_narg > 7) { - callp = &sysent[63]; - goto unsafe; - } - regsp = (void *) &regs->save_r4; - } - /* call syscall argument munger to copy in arguments (see xnu/bsd/dev/ppc/munge.s) */ - (*mungerp)(regsp, (void *) &uthread->uu_arg[0]); - } - -unsafe: - - uthread->uu_flag |= UT_NOTCANCELPT; - - uthread->uu_rval[0] = 0; - - /* - * r4 is volatile; if we set it to regs->save_r4 here the child - * will have the parent's r4 after execve - */ - uthread->uu_rval[1] = 0; - - error = 0; - - /* - * PPC runtime calls cerror after every unix system call, so - * assume no error and adjust the "pc" to skip this call. - * It will be set back to the cerror call if an error is detected.
- */ - regs->save_srr0 += 4; - -#ifdef JOE_DEBUG - uthread->uu_iocount = 0; - uthread->uu_vpindex = 0; -#endif - AUDIT_SYSCALL_ENTER(code, proc, uthread); - error = (*(callp->sy_call))(proc, (void *)uthread->uu_arg, &(uthread->uu_rval[0])); - AUDIT_SYSCALL_EXIT(code, proc, uthread, error); -#if CONFIG_MACF - mac_thread_userret(code, error, thread_act); -#endif - - -#ifdef JOE_DEBUG - if (uthread->uu_iocount) - printf("system call returned with uu_iocount != 0\n"); -#endif -#if CONFIG_DTRACE - uthread->t_dtrace_errno = error; -#endif /* CONFIG_DTRACE */ - - regs = find_user_regs(thread_act); - - if (error == ERESTART) { - regs->save_srr0 -= 8; - } else if (error != EJUSTRETURN) { - if (error) { - regs->save_r3 = (long long)error; - /* set the "pc" to execute cerror routine */ - regs->save_srr0 -= 4; - } else { /* (not error) */ - switch (callp->sy_return_type) { - case _SYSCALL_RET_INT_T: - regs->save_r3 = uthread->uu_rval[0]; - regs->save_r4 = uthread->uu_rval[1]; - break; - case _SYSCALL_RET_UINT_T: - regs->save_r3 = ((u_int)uthread->uu_rval[0]); - regs->save_r4 = ((u_int)uthread->uu_rval[1]); - break; - case _SYSCALL_RET_OFF_T: - case _SYSCALL_RET_UINT64_T: - /* return 64 bits split across two registers for 32 bit */ - /* process and in one register for 64 bit process */ - if (IS_64BIT_PROCESS(proc)) { - u_int64_t *retp = (u_int64_t *)&uthread->uu_rval[0]; - regs->save_r3 = *retp; - regs->save_r4 = 0; - } - else { - regs->save_r3 = uthread->uu_rval[0]; - regs->save_r4 = uthread->uu_rval[1]; - } - break; - case _SYSCALL_RET_ADDR_T: - case _SYSCALL_RET_SIZE_T: - case _SYSCALL_RET_SSIZE_T: - /* the variable length return types (user_addr_t, user_ssize_t, - * and user_size_t) are always the largest possible size in the - * kernel (we use uu_rval[0] and [1] as one 64 bit value). - */ - { - user_addr_t *retp = (user_addr_t *)&uthread->uu_rval[0]; - regs->save_r3 = *retp; - regs->save_r4 = 0; - } - break; - case _SYSCALL_RET_NONE: - break; - default: - panic("unix_syscall: unknown return type"); - break; - } - } - } - /* else (error == EJUSTRETURN) { nothing } */ - - - uthread->uu_flag &= ~UT_NOTCANCELPT; - - /* panic if funnel is held */ - syscall_exit_funnelcheck(); - - if (uthread->uu_lowpri_window) { - /* - * task is marked as a low priority I/O type - * and the I/O we issued while in this system call - * collided with normal I/O operations... we'll - * delay in order to mitigate the impact of this - * task on the normal operation of the system - */ - throttle_lowpri_io(TRUE); - } - if (kdebug_enable && (code != 180)) { - - if (callp->sy_return_type == _SYSCALL_RET_SSIZE_T) - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, - error, uthread->uu_rval[1], 0, proc->p_pid, 0); - else - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, - error, uthread->uu_rval[0], uthread->uu_rval[1], proc->p_pid, 0); - } - - thread_exception_return(); - /* NOTREACHED */ -} - -void -unix_syscall_return(int error) -{ - thread_t thread_act; - struct uthread *uthread; - struct proc *proc; - struct savearea *regs; - unsigned int code; - struct sysent *callp; - - thread_act = current_thread(); - proc = current_proc(); - uthread = get_bsdthread_info(thread_act); - - regs = find_user_regs(thread_act); - - if (regs->save_r0 != 0) - code = regs->save_r0; - else - code = regs->save_r3; - - callp = (code >= NUM_SYSENT) ? 
&sysent[63] : &sysent[code]; - -#if CONFIG_DTRACE - if (callp->sy_call == dtrace_systrace_syscall) - dtrace_systrace_syscall_return( code, error, uthread->uu_rval ); -#endif /* CONFIG_DTRACE */ - AUDIT_SYSCALL_EXIT(code, proc, uthread, error); - - /* - * Get index into sysent table - */ - if (error == ERESTART) { - regs->save_srr0 -= 8; - } else if (error != EJUSTRETURN) { - if (error) { - regs->save_r3 = (long long)error; - /* set the "pc" to execute cerror routine */ - regs->save_srr0 -= 4; - } else { /* (not error) */ - switch (callp->sy_return_type) { - case _SYSCALL_RET_INT_T: - regs->save_r3 = uthread->uu_rval[0]; - regs->save_r4 = uthread->uu_rval[1]; - break; - case _SYSCALL_RET_UINT_T: - regs->save_r3 = ((u_int)uthread->uu_rval[0]); - regs->save_r4 = ((u_int)uthread->uu_rval[1]); - break; - case _SYSCALL_RET_OFF_T: - case _SYSCALL_RET_UINT64_T: - /* return 64 bits split across two registers for 32 bit */ - /* process and in one register for 64 bit process */ - if (IS_64BIT_PROCESS(proc)) { - u_int64_t *retp = (u_int64_t *)&uthread->uu_rval[0]; - regs->save_r3 = *retp; - } - else { - regs->save_r3 = uthread->uu_rval[0]; - regs->save_r4 = uthread->uu_rval[1]; - } - break; - case _SYSCALL_RET_ADDR_T: - case _SYSCALL_RET_SIZE_T: - case _SYSCALL_RET_SSIZE_T: - /* the variable length return types (user_addr_t, user_ssize_t, - * and user_size_t) are always the largest possible size in the - * kernel (we use uu_rval[0] and [1] as one 64 bit value). - */ - { - u_int64_t *retp = (u_int64_t *)&uthread->uu_rval[0]; - regs->save_r3 = *retp; - } - break; - case _SYSCALL_RET_NONE: - break; - default: - panic("unix_syscall: unknown return type"); - break; - } - } - } - /* else (error == EJUSTRETURN) { nothing } */ - - - uthread->uu_flag &= ~UT_NOTCANCELPT; - - /* panic if funnel is held */ - syscall_exit_funnelcheck(); - - if (uthread->uu_lowpri_window) { - /* - * task is marked as a low priority I/O type - * and the I/O we issued while in this system call - * collided with normal I/O operations... we'll - * delay in order to mitigate the impact of this - * task on the normal operation of the system - */ - throttle_lowpri_io(TRUE); - } - if (kdebug_enable && (code != 180)) { - if (callp->sy_return_type == _SYSCALL_RET_SSIZE_T) - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, - error, uthread->uu_rval[1], 0, proc->p_pid, 0); - else - KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END, - error, uthread->uu_rval[0], uthread->uu_rval[1], proc->p_pid, 0); - } - - thread_exception_return(); - /* NOTREACHED */ -} - -void -munge_lwww( - const void *in32, - void *out64) -{ - const uint32_t *arg32; - uint64_t *arg64; - - arg32 = (const uint32_t *) in32; - arg64 = (uint64_t *) out64; - - arg64[3] = arg32[9]; /* lwwW */ - arg64[2] = arg32[7]; /* lwWw */ - arg64[1] = arg32[5]; /* lWww */ - arg64[0] = ((uint64_t) arg32[1]) << 32; /* Lwww (hi) */ - arg64[0] |= (uint64_t) arg32[3]; /* Lwww (lo) */ -} - -void -munge_lw( - const void *in32, - void *out64) -{ - const uint32_t *arg32; - uint64_t *arg64; - - arg32 = (const uint32_t *) in32; - arg64 = (uint64_t *) out64; - - arg64[1] = arg32[5]; /* lW */ - arg64[0] = ((uint64_t) arg32[1]) << 32; /* Lw (hi) */ - arg64[0] |= (uint64_t) arg32[3]; /* Lw (lo) */ -} diff --git a/bsd/dev/ppc/unix_signal.c b/bsd/dev/ppc/unix_signal.c deleted file mode 100644 index 4ca48b0b7..000000000 --- a/bsd/dev/ppc/unix_signal.c +++ /dev/null @@ -1,953 +0,0 @@ -/* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1999 Apple Computer, Inc. All rights reserved. - */ - -#include <mach/mach_types.h> -#include <mach/exception_types.h> - -#include <sys/param.h> -#include <sys/proc_internal.h> -#include <sys/user.h> -#include <sys/ucontext.h> -#include <sys/sysproto.h> -#include <sys/systm.h> -#include <sys/ux_exception.h> - -#include <ppc/signal.h> -#include <sys/signalvar.h> -#include <sys/kdebug.h> -#include <sys/wait.h> -#include <kern/thread.h> -#include <mach/ppc/thread_status.h> -#include <ppc/proc_reg.h> - -#include <sys/sdt.h> - -// #include <machine/thread.h> XXX include path messed up for some reason... 
- -/* XXX functions not in Mach headers */ -extern kern_return_t thread_getstatus(register thread_t act, int flavor, - thread_state_t tstate, mach_msg_type_number_t *count); -extern unsigned int get_msr_exportmask(void); -extern kern_return_t thread_setstatus(thread_t thread, int flavor, - thread_state_t tstate, mach_msg_type_number_t count); -extern void ppc_checkthreadstate(void *, int); -extern struct savearea_vec *find_user_vec_curr(void); -extern int thread_enable_fpe(thread_t act, int onoff); - - - -#define C_32_REDZONE_LEN 224 -#define C_32_STK_ALIGN 16 -#define C_32_PARAMSAVE_LEN 64 -#define C_32_LINKAGE_LEN 48 - -#define C_64_REDZONE_LEN 320 -#define C_64_STK_ALIGN 32 -#define C_64_PARAMSAVE_LEN 64 -#define C_64_LINKAGE_LEN 48 - -#define TRUNC_DOWN32(a,b,c) ((((uint32_t)a)-(b)) & ((uint32_t)(-(c)))) -#define TRUNC_DOWN64(a,b,c) ((((uint64_t)a)-(b)) & ((uint64_t)(-(c)))) - -/* - * The stack layout possibilities (info style); this needs to match the signal trampoline code - * - * Traditional: 1 - * Traditional64: 20 - * Traditional64 with vec: 25 - * 32bit context 30 - * 32bit context with vector 35 - * 64bit context 40 - * 64bit context with vector 45 - * Dual context 50 - * Dual context with vector 55 - * - */ - -#define UC_TRAD 1 -#define UC_TRAD_VEC 6 -#define UC_TRAD64 20 -#define UC_TRAD64_VEC 25 -#define UC_FLAVOR 30 -#define UC_FLAVOR_VEC 35 -#define UC_FLAVOR64 40 -#define UC_FLAVOR64_VEC 45 -#define UC_DUAL 50 -#define UC_DUAL_VEC 55 -#define UC_SET_ALT_STACK 0x40000000 -#define UC_RESET_ALT_STACK 0x80000000 - - /* The following are valid mcontext sizes */ -#define UC_FLAVOR_SIZE ((PPC_THREAD_STATE_COUNT + PPC_EXCEPTION_STATE_COUNT + PPC_FLOAT_STATE_COUNT) * sizeof(int)) - -#define UC_FLAVOR_VEC_SIZE ((PPC_THREAD_STATE_COUNT + PPC_EXCEPTION_STATE_COUNT + PPC_FLOAT_STATE_COUNT + PPC_VECTOR_STATE_COUNT) * sizeof(int)) - -#define UC_FLAVOR64_SIZE ((PPC_THREAD_STATE64_COUNT + PPC_EXCEPTION_STATE64_COUNT + PPC_FLOAT_STATE_COUNT) * sizeof(int)) - -#define UC_FLAVOR64_VEC_SIZE ((PPC_THREAD_STATE64_COUNT + PPC_EXCEPTION_STATE64_COUNT + PPC_FLOAT_STATE_COUNT + PPC_VECTOR_STATE_COUNT) * sizeof(int)) - - -/* - * NOTE: Source and target may *NOT* overlap! - */ -static void -ucontext_32to64(struct ucontext64 *in, struct user_ucontext64 *out) -{ - out->uc_onstack = in->uc_onstack; - out->uc_sigmask = in->uc_sigmask; - - /* internal "structure assign" */ - out->uc_stack.ss_sp = CAST_USER_ADDR_T(in->uc_stack.ss_sp); - out->uc_stack.ss_size = in->uc_stack.ss_size; - out->uc_stack.ss_flags = in->uc_stack.ss_flags; - - out->uc_link = CAST_USER_ADDR_T(in->uc_link); - out->uc_mcsize = in->uc_mcsize; - out->uc_mcontext64 = CAST_USER_ADDR_T(in->uc_mcontext64); -} - -/* - * This conversion is safe, since if we are converting for a 32 bit process, - * then its values of uc_stack.ss_size and uc_mcsize will never exceed 4G. - * - * NOTE: Source and target may *NOT* overlap! - */ -static void -ucontext_64to32(struct user_ucontext64 *in, struct ucontext64 *out) -{ - out->uc_onstack = in->uc_onstack; - out->uc_sigmask = in->uc_sigmask; - - /* internal "structure assign" */ - out->uc_stack.ss_sp = CAST_DOWN(void *,in->uc_stack.ss_sp); - out->uc_stack.ss_size = in->uc_stack.ss_size; /* range reduction */ - out->uc_stack.ss_flags = in->uc_stack.ss_flags; - - out->uc_link = CAST_DOWN(void *,in->uc_link); - out->uc_mcsize = in->uc_mcsize; /* range reduction */ - out->uc_mcontext64 = CAST_DOWN(void *,in->uc_mcontext64); -}
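(The pair of converters above follows a common 32/64-bit boundary discipline: user pointers are widened to a 64-bit user_addr_t on the way into the kernel, and truncated on the way back out, which is safe only because a 32-bit process can never hold values above 4G. A minimal, self-contained sketch of the same pattern follows; the types and field names are hypothetical stand-ins, not the real xnu ucontext structures or CAST_USER_ADDR_T/CAST_DOWN macros.)

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t user_addr_t;          /* widest form of a user pointer */

    struct ctx32 {                         /* stand-in 32-bit user layout */
        uint32_t ss_sp;
        uint32_t ss_size;
    };

    struct ctx64 {                         /* stand-in kernel-internal form */
        user_addr_t ss_sp;
        uint64_t    ss_size;
    };

    /* Widen: zero-extending 32-bit user values is always safe. */
    static void ctx_32to64(const struct ctx32 *in, struct ctx64 *out)
    {
        out->ss_sp   = (user_addr_t)in->ss_sp;
        out->ss_size = in->ss_size;
    }

    /* Narrow ("range reduction"): truncation is safe only because a
     * 32-bit process cannot legitimately hold values above 4G. */
    static void ctx_64to32(const struct ctx64 *in, struct ctx32 *out)
    {
        out->ss_sp   = (uint32_t)in->ss_sp;
        out->ss_size = (uint32_t)in->ss_size;
    }

    int main(void)
    {
        struct ctx32 u = { 0xbfff0000u, 8192 };
        struct ctx64 k;

        ctx_32to64(&u, &k);
        printf("widened sp = 0x%llx\n", (unsigned long long)k.ss_sp);
        ctx_64to32(&k, &u);
        return 0;
    }

As in the deleted code, the two directions are deliberately asymmetric: only the narrowing direction needs (and documents) a range-safety argument.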
-/* - * NOTE: Source and target may *NOT* overlap! - */ -static void -siginfo_user_to_user32(user_siginfo_t *in, user32_siginfo_t *out) -{ - out->si_signo = in->si_signo; - out->si_errno = in->si_errno; - out->si_code = in->si_code; - out->si_pid = in->si_pid; - out->si_uid = in->si_uid; - out->si_status = in->si_status; - out->si_addr = CAST_DOWN_EXPLICIT(user32_addr_t,in->si_addr); - /* following cast works for sival_int because of padding */ - out->si_value.sival_ptr = CAST_DOWN_EXPLICIT(user32_addr_t,in->si_value.sival_ptr); - out->si_band = in->si_band; /* range reduction */ - out->__pad[0] = in->pad[0]; /* mcontext.ss.r1 */ -} - -static void -siginfo_user_to_user64(user_siginfo_t *in, user64_siginfo_t *out) -{ - out->si_signo = in->si_signo; - out->si_errno = in->si_errno; - out->si_code = in->si_code; - out->si_pid = in->si_pid; - out->si_uid = in->si_uid; - out->si_status = in->si_status; - out->si_addr = in->si_addr; - out->si_value.sival_ptr = in->si_value.sival_ptr; - out->si_band = in->si_band; /* range reduction */ - out->__pad[0] = in->pad[0]; /* mcontext.ss.r1 */ -} - - -/* - * Arrange for this process to run a signal handler - */ - -void -sendsig(struct proc *p, user_addr_t catcher, int sig, int mask, __unused uint32_t code) -{ - kern_return_t kretn; - struct mcontext mctx; - user_addr_t p_mctx = USER_ADDR_NULL; /* mcontext dest. */ - struct mcontext64 mctx64; - user_addr_t p_mctx64 = USER_ADDR_NULL; /* mcontext dest. */ - struct user_ucontext64 uctx; - user_addr_t p_uctx; /* user stack addr to copy ucontext */ - user_siginfo_t sinfo; - user_addr_t p_sinfo; /* user stack addr to copy siginfo */ - struct sigacts *ps = p->p_sigacts; - int oonstack; - user_addr_t sp; - mach_msg_type_number_t state_count; - thread_t th_act; - struct uthread *ut; - int infostyle = UC_TRAD; - int dualcontext =0; - user_addr_t trampact; - int vec_used = 0; - int stack_size = 0; - void * tstate; - int flavor; - int ctx32 = 1; - - th_act = current_thread(); - ut = get_bsdthread_info(th_act); - - /* - * XXX We conditionalize type passed here based on SA_SIGINFO, but - * XXX we always send up all the information, regardless; perhaps - * XXX this should not be conditionalized? Defer making this change - * XXX now, due to possible tools impact. - */ - if (p->p_sigacts->ps_siginfo & sigmask(sig)) { - /* - * If SA_SIGINFO is set, then we must provide the user - * process both a siginfo_t and a context argument. We call - * this "FLAVORED", as opposed to "TRADITIONAL", which doesn't - * expect a context. "DUAL" is a type of "FLAVORED". - */ - if (is_64signalregset()) { - /* - * If this is a 64 bit CPU, we must include a 64 bit - * context in the data we pass to user space; we may - * or may not also include a 32 bit context at the - * same time, for non-leaf functions. - * - * The user may also explicitly choose to not receive - * a 32 bit context, at their option; we only allow - * this to happen on 64 bit processors, for obvious - * reasons. - */ - if (IS_64BIT_PROCESS(p) || - (p->p_sigacts->ps_64regset & sigmask(sig))) { - /* - * For a 64 bit process, there is no 32 bit - * context. - */ - ctx32 = 0; - infostyle = UC_FLAVOR64; - } else { - /* - * For a 32 bit process on a 64 bit CPU, we - * may have 64 bit leaf functions, so we need - * both contexts. - */ - dualcontext = 1; - infostyle = UC_DUAL; - } - } else { - /* - * If this is a 32 bit CPU, then we only have a 32 bit - * context to contend with.
- */ - infostyle = UC_FLAVOR; - } - } else { - /* - * If SA_SIGINFO is not set, then we have a traditional style - * call which does not need additional context passed. The - * default is 32 bit traditional. - * - * XXX The second check is redundant on PPC32; keep it anyway. - */ - if (is_64signalregset() || IS_64BIT_PROCESS(p)) { - /* - * However, if this is a 64 bit CPU, we need to change - * this to 64 bit traditional, and drop the 32 bit - * context. - */ - ctx32 = 0; - infostyle = UC_TRAD64; - } - } - - proc_unlock(p); - - /* I need this for SIGINFO anyway */ - flavor = PPC_THREAD_STATE; - tstate = (void *)&mctx.ss; - state_count = PPC_THREAD_STATE_COUNT; - if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) - goto bad; - - if ((ctx32 == 0) || dualcontext) { - flavor = PPC_THREAD_STATE64; - tstate = (void *)&mctx64.ss; - state_count = PPC_THREAD_STATE64_COUNT; - if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) - goto bad; - } - - if ((ctx32 == 1) || dualcontext) { - flavor = PPC_EXCEPTION_STATE; - tstate = (void *)&mctx.es; - state_count = PPC_EXCEPTION_STATE_COUNT; - if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) - goto bad; - } - - if ((ctx32 == 0) || dualcontext) { - flavor = PPC_EXCEPTION_STATE64; - tstate = (void *)&mctx64.es; - state_count = PPC_EXCEPTION_STATE64_COUNT; - - if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) - goto bad; - - } - - - if ((ctx32 == 1) || dualcontext) { - flavor = PPC_FLOAT_STATE; - tstate = (void *)&mctx.fs; - state_count = PPC_FLOAT_STATE_COUNT; - if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) - goto bad; - } - - if ((ctx32 == 0) || dualcontext) { - flavor = PPC_FLOAT_STATE; - tstate = (void *)&mctx64.fs; - state_count = PPC_FLOAT_STATE_COUNT; - if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) - goto bad; - - } - - - if (find_user_vec_curr()) { - vec_used = 1; - - if ((ctx32 == 1) || dualcontext) { - flavor = PPC_VECTOR_STATE; - tstate = (void *)&mctx.vs; - state_count = PPC_VECTOR_STATE_COUNT; - if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) - goto bad; - infostyle += 5; - } - - if ((ctx32 == 0) || dualcontext) { - flavor = PPC_VECTOR_STATE; - tstate = (void *)&mctx64.vs; - state_count = PPC_VECTOR_STATE_COUNT; - if (thread_getstatus(th_act, flavor, (thread_state_t)tstate, &state_count) != KERN_SUCCESS) - goto bad; - infostyle += 5; - } - } - - trampact = ps->ps_trampact[sig]; - oonstack = ut->uu_sigstk.ss_flags & SA_ONSTACK; - - /* figure out where our new stack lives */ - if ((ut->uu_flag & UT_ALTSTACK) && !oonstack && - (ps->ps_sigonstack & sigmask(sig))) { - sp = ut->uu_sigstk.ss_sp; - sp += ut->uu_sigstk.ss_size; - stack_size = ut->uu_sigstk.ss_size; - ut->uu_sigstk.ss_flags |= SA_ONSTACK; - } - else { - if (ctx32 == 0) - sp = mctx64.ss.r1; - else - sp = CAST_USER_ADDR_T(mctx.ss.r1); - } - - - /* put siginfo on top */ - - /* preserve RED ZONE area */ - if (IS_64BIT_PROCESS(p)) - sp = TRUNC_DOWN64(sp, C_64_REDZONE_LEN, C_64_STK_ALIGN); - else - sp = TRUNC_DOWN32(sp, C_32_REDZONE_LEN, C_32_STK_ALIGN); - - /* next are the saved registers */ - if ((ctx32 == 0) || dualcontext) { - sp -= sizeof(struct mcontext64); - p_mctx64 = sp; - } - if ((ctx32 == 1) || dualcontext) { - sp -= sizeof(struct mcontext); - p_mctx = sp; - } - - if (IS_64BIT_PROCESS(p)) { 
- /* context goes first on stack */ - sp -= sizeof(struct user_ucontext64); - p_uctx = sp; - - /* this is where siginfo goes on stack */ - sp -= sizeof(user64_siginfo_t); - p_sinfo = sp; - - sp = TRUNC_DOWN64(sp, C_64_PARAMSAVE_LEN+C_64_LINKAGE_LEN, C_64_STK_ALIGN); - } else { - /* - * struct ucontext and struct ucontext64 are identical in - * size and content; the only difference is the internal - * pointer type for the last element, which makes no - * difference for the copyout(). - */ - - /* context goes first on stack */ - sp -= sizeof(struct ucontext64); - p_uctx = sp; - - /* this is where siginfo goes on stack */ - sp -= sizeof(user32_siginfo_t); - p_sinfo = sp; - - sp = TRUNC_DOWN32(sp, C_32_PARAMSAVE_LEN+C_32_LINKAGE_LEN, C_32_STK_ALIGN); - } - - uctx.uc_onstack = oonstack; - uctx.uc_sigmask = mask; - uctx.uc_stack.ss_sp = sp; - uctx.uc_stack.ss_size = stack_size; - if (oonstack) - uctx.uc_stack.ss_flags |= SS_ONSTACK; - - uctx.uc_link = 0; - if (ctx32 == 0) - uctx.uc_mcsize = (size_t)((PPC_EXCEPTION_STATE64_COUNT + PPC_THREAD_STATE64_COUNT + PPC_FLOAT_STATE_COUNT) * sizeof(int)); - else - uctx.uc_mcsize = (size_t)((PPC_EXCEPTION_STATE_COUNT + PPC_THREAD_STATE_COUNT + PPC_FLOAT_STATE_COUNT) * sizeof(int)); - - if (vec_used) - uctx.uc_mcsize += (size_t)(PPC_VECTOR_STATE_COUNT * sizeof(int)); - - if (ctx32 == 0) - uctx.uc_mcontext64 = p_mctx64; - else - uctx.uc_mcontext64 = p_mctx; - - /* setup siginfo */ - bzero((caddr_t)&sinfo, sizeof(sinfo)); - sinfo.si_signo = sig; - if (ctx32 == 0) { - sinfo.si_addr = mctx64.ss.srr0; - sinfo.pad[0] = mctx64.ss.r1; - } else { - sinfo.si_addr = CAST_USER_ADDR_T(mctx.ss.srr0); - sinfo.pad[0] = CAST_USER_ADDR_T(mctx.ss.r1); - } - - switch (sig) { - case SIGILL: - /* - * If it's 64 bit and not a dual context, mctx will - * contain uninitialized data, so we have to use - * mctx64 here. - */ - if(ctx32 == 0) { - if (mctx64.ss.srr1 & (1 << (31 - SRR1_PRG_ILL_INS_BIT))) - sinfo.si_code = ILL_ILLOPC; - else if (mctx64.ss.srr1 & (1 << (31 - SRR1_PRG_PRV_INS_BIT))) - sinfo.si_code = ILL_PRVOPC; - else if (mctx64.ss.srr1 & (1 << (31 - SRR1_PRG_TRAP_BIT))) - sinfo.si_code = ILL_ILLTRP; - else - sinfo.si_code = ILL_NOOP; - } else { - if (mctx.ss.srr1 & (1 << (31 - SRR1_PRG_ILL_INS_BIT))) - sinfo.si_code = ILL_ILLOPC; - else if (mctx.ss.srr1 & (1 << (31 - SRR1_PRG_PRV_INS_BIT))) - sinfo.si_code = ILL_PRVOPC; - else if (mctx.ss.srr1 & (1 << (31 - SRR1_PRG_TRAP_BIT))) - sinfo.si_code = ILL_ILLTRP; - else - sinfo.si_code = ILL_NOOP; - } - break; - case SIGFPE: -#define FPSCR_VX 2 -#define FPSCR_OX 3 -#define FPSCR_UX 4 -#define FPSCR_ZX 5 -#define FPSCR_XX 6 - /* - * If it's 64 bit and not a dual context, mctx will - * contain uninitialized data, so we have to use - * mctx64 here. 
- */ - if(ctx32 == 0) { - if (mctx64.fs.fpscr & (1 << (31 - FPSCR_VX))) - sinfo.si_code = FPE_FLTINV; - else if (mctx64.fs.fpscr & (1 << (31 - FPSCR_OX))) - sinfo.si_code = FPE_FLTOVF; - else if (mctx64.fs.fpscr & (1 << (31 - FPSCR_UX))) - sinfo.si_code = FPE_FLTUND; - else if (mctx64.fs.fpscr & (1 << (31 - FPSCR_ZX))) - sinfo.si_code = FPE_FLTDIV; - else if (mctx64.fs.fpscr & (1 << (31 - FPSCR_XX))) - sinfo.si_code = FPE_FLTRES; - else - sinfo.si_code = FPE_NOOP; - } else { - if (mctx.fs.fpscr & (1 << (31 - FPSCR_VX))) - sinfo.si_code = FPE_FLTINV; - else if (mctx.fs.fpscr & (1 << (31 - FPSCR_OX))) - sinfo.si_code = FPE_FLTOVF; - else if (mctx.fs.fpscr & (1 << (31 - FPSCR_UX))) - sinfo.si_code = FPE_FLTUND; - else if (mctx.fs.fpscr & (1 << (31 - FPSCR_ZX))) - sinfo.si_code = FPE_FLTDIV; - else if (mctx.fs.fpscr & (1 << (31 - FPSCR_XX))) - sinfo.si_code = FPE_FLTRES; - else - sinfo.si_code = FPE_NOOP; - } - break; - - case SIGBUS: - if (ctx32 == 0) { - sinfo.si_addr = mctx64.es.dar; - } else { - sinfo.si_addr = CAST_USER_ADDR_T(mctx.es.dar); - } - /* on ppc we generate only if EXC_PPC_UNALIGNED */ - sinfo.si_code = BUS_ADRALN; - break; - - case SIGSEGV: - /* - * If it's 64 bit and not a dual context, mctx will - * contain uninitialized data, so we have to use - * mctx64 here. - */ - if (ctx32 == 0) { - sinfo.si_addr = mctx64.es.dar; - /* First check in srr1 and then in dsisr */ - if (mctx64.ss.srr1 & (1 << (31 - DSISR_PROT_BIT))) - sinfo.si_code = SEGV_ACCERR; - else if (mctx64.es.dsisr & (1 << (31 - DSISR_PROT_BIT))) - sinfo.si_code = SEGV_ACCERR; - else - sinfo.si_code = SEGV_MAPERR; - } else { - sinfo.si_addr = CAST_USER_ADDR_T(mctx.es.dar); - /* First check in srr1 and then in dsisr */ - if (mctx.ss.srr1 & (1 << (31 - DSISR_PROT_BIT))) - sinfo.si_code = SEGV_ACCERR; - else if (mctx.es.dsisr & (1 << (31 - DSISR_PROT_BIT))) - sinfo.si_code = SEGV_ACCERR; - else - sinfo.si_code = SEGV_MAPERR; - } - break; - default: - { - int status_and_exitcode; - - /* - * All other signals need to fill out a minimum set of - * information for the siginfo structure passed into - * the signal handler, if SA_SIGINFO was specified. - * - * p->si_status actually contains both the status and - * the exit code; we save it off in its own variable - * for later breakdown. - */ - proc_lock(p); - sinfo.si_pid = p->si_pid; - p->si_pid = 0; - status_and_exitcode = p->si_status; - p->si_status = 0; - sinfo.si_uid = p->si_uid; - p->si_uid = 0; - sinfo.si_code = p->si_code; - p->si_code = 0; - proc_unlock(p); - if (sinfo.si_code == CLD_EXITED) { - if (WIFEXITED(status_and_exitcode)) - sinfo.si_code = CLD_EXITED; - else if (WIFSIGNALED(status_and_exitcode)) { - if (WCOREDUMP(status_and_exitcode)) { - sinfo.si_code = CLD_DUMPED; - status_and_exitcode = W_EXITCODE(status_and_exitcode,status_and_exitcode); - } else { - sinfo.si_code = CLD_KILLED; - status_and_exitcode = W_EXITCODE(status_and_exitcode,status_and_exitcode); - } - } - } - /* - * The recorded status contains the exit code and the - * signal information, but the information to be passed - * in the siginfo to the handler is supposed to only - * contain the status, so we have to shift it out. 
- */ - sinfo.si_status = WEXITSTATUS(status_and_exitcode); - break; - } - } - - - /* copy info out to user space */ - if (IS_64BIT_PROCESS(p)) { - user64_siginfo_t sinfo64; - - siginfo_user_to_user64(&sinfo,&sinfo64); - -#if CONFIG_DTRACE - bzero((caddr_t)&(ut->t_dtrace_siginfo), sizeof(ut->t_dtrace_siginfo)); - - ut->t_dtrace_siginfo.si_signo = sinfo.si_signo; - ut->t_dtrace_siginfo.si_code = sinfo.si_code; - ut->t_dtrace_siginfo.si_pid = sinfo.si_pid; - ut->t_dtrace_siginfo.si_uid = sinfo.si_uid; - ut->t_dtrace_siginfo.si_status = sinfo.si_status; - /* XXX truncates faulting address to void * on K32 */ - ut->t_dtrace_siginfo.si_addr = CAST_DOWN(void *, sinfo.si_addr); - - - /* Fire DTrace proc:::fault probe when signal is generated by hardware. */ - switch (sig) { - case SIGILL: case SIGBUS: case SIGSEGV: case SIGFPE: case SIGTRAP: - DTRACE_PROC2(fault, int, (int)(ut->uu_code), siginfo_t *, &(ut->t_dtrace_siginfo)); - break; - default: - break; - } - - /* XXX truncates catcher address to uintptr_t */ - DTRACE_PROC3(signal__handle, int, sig, siginfo_t *, &(ut->t_dtrace_siginfo), - void (*)(void), CAST_DOWN(sig_t, catcher)); -#endif /* CONFIG_DTRACE */ - - if (copyout(&uctx, p_uctx, sizeof(struct user_ucontext64))) - goto bad; - if (copyout(&sinfo64, p_sinfo, sizeof(sinfo64))) - goto bad; - } else { - struct ucontext64 uctx32; - user32_siginfo_t sinfo32; - - ucontext_64to32(&uctx, &uctx32); - siginfo_user_to_user32(&sinfo,&sinfo32); - -#if CONFIG_DTRACE - bzero((caddr_t)&(ut->t_dtrace_siginfo), sizeof(ut->t_dtrace_siginfo)); - - ut->t_dtrace_siginfo.si_signo = sinfo.si_signo; - ut->t_dtrace_siginfo.si_code = sinfo.si_code; - ut->t_dtrace_siginfo.si_pid = sinfo.si_pid; - ut->t_dtrace_siginfo.si_uid = sinfo.si_uid; - ut->t_dtrace_siginfo.si_status = sinfo.si_status; - ut->t_dtrace_siginfo.si_addr = CAST_DOWN(void *, sinfo.si_addr); - - - /* Fire DTrace proc:::fault probe when signal is generated by hardware. */ - switch (sig) { - case SIGILL: case SIGBUS: case SIGSEGV: case SIGFPE: case SIGTRAP: - DTRACE_PROC2(fault, int, (int)(ut->uu_code), siginfo_t *, &(ut->t_dtrace_siginfo)); - break; - default: - break; - } - - DTRACE_PROC3(signal__handle, int, sig, siginfo_t *, &(ut->t_dtrace_siginfo), - void (*)(void), CAST_DOWN(sig_t, catcher)); -#endif /* CONFIG_DTRACE */ - - if (copyout(&uctx32, p_uctx, sizeof(struct ucontext64))) - goto bad; - - if (copyout(&sinfo32, p_sinfo, sizeof(sinfo32))) - goto bad; - }
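(The p_uctx/p_sinfo/p_mctx destinations filled in by these copyout() calls were carved out of the user stack earlier in sendsig by walking sp downward: preserve the red zone, reserve each structure, then truncate to the ABI alignment. A standalone sketch of that carving discipline follows; the sizes are illustrative stand-ins for the real C_32_* constants and structure sizes, and trunc_down mirrors the shape of the TRUNC_DOWN32 macro for power-of-two alignments.)

    #include <stdint.h>
    #include <stdio.h>

    #define REDZONE_LEN 224u    /* stand-in for C_32_REDZONE_LEN */
    #define STK_ALIGN    16u    /* stand-in for C_32_STK_ALIGN */

    /* Drop len bytes, then round down to align (align a power of two),
     * the same shape as TRUNC_DOWN32(a,b,c). */
    static uint32_t trunc_down(uint32_t sp, uint32_t len, uint32_t align)
    {
        return (sp - len) & ~(align - 1u);
    }

    int main(void)
    {
        uint32_t sp = 0xbffff8a4u;   /* hypothetical user stack pointer */
        uint32_t p_mctx, p_uctx, p_sinfo;

        sp = trunc_down(sp, REDZONE_LEN, STK_ALIGN); /* preserve red zone */
        sp -= 1032; p_mctx  = sp;    /* saved machine context */
        sp -= 32;   p_uctx  = sp;    /* ucontext */
        sp -= 64;   p_sinfo = sp;    /* siginfo */
        sp = trunc_down(sp, 0, STK_ALIGN);           /* realign for handler */

        printf("mctx=0x%x uctx=0x%x sinfo=0x%x sp=0x%x\n",
            p_mctx, p_uctx, p_sinfo, sp);
        return 0;
    }

In the real code the final truncation also reserves the parameter-save and linkage areas (C_32_PARAMSAVE_LEN + C_32_LINKAGE_LEN) required by the PPC calling convention before the trampoline is entered.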
- if ((ctx32 == 0) || dualcontext) { - /* - * NOTE: Size of mcontext does not vary between 64bit and - * 32bit programs using 64bit registers. - */ - if (copyout(&mctx64, p_mctx64, (vec_used? UC_FLAVOR64_VEC_SIZE: UC_FLAVOR64_SIZE))) - goto bad; - } - if ((ctx32 == 1) || dualcontext) { - if (copyout(&mctx, p_mctx, uctx.uc_mcsize)) - goto bad; - } - - - /* Place our arguments in arg registers: rtm dependent */ - if(IS_64BIT_PROCESS(p)) { - mctx64.ss.r3 = catcher; - mctx64.ss.r4 = CAST_USER_ADDR_T(infostyle); - mctx64.ss.r5 = CAST_USER_ADDR_T(sig); - mctx64.ss.r6 = p_sinfo; - mctx64.ss.r7 = p_uctx; - - mctx64.ss.srr0 = trampact; - /* MSR_EXPORT_MASK_SET */ - mctx64.ss.srr1 = CAST_USER_ADDR_T(get_msr_exportmask()); - mctx64.ss.r1 = sp; - state_count = PPC_THREAD_STATE64_COUNT; - if ((kretn = thread_setstatus(th_act, PPC_THREAD_STATE64, (void *)&mctx64.ss, state_count)) != KERN_SUCCESS) { - panic("sendsig: thread_setstatus failed, ret = %08X\n", kretn); - } - } else { - mctx.ss.r3 = CAST_DOWN(uint32_t,catcher); - mctx.ss.r4 = (uint32_t)infostyle; - mctx.ss.r5 = (uint32_t)sig; - mctx.ss.r6 = CAST_DOWN(uint32_t,p_sinfo); - mctx.ss.r7 = CAST_DOWN(uint32_t,p_uctx); - - mctx.ss.srr0 = CAST_DOWN(uint32_t,trampact); - /* MSR_EXPORT_MASK_SET */ - mctx.ss.srr1 = get_msr_exportmask(); - mctx.ss.r1 = CAST_DOWN(uint32_t,sp); - state_count = PPC_THREAD_STATE_COUNT; - if ((kretn = thread_setstatus(th_act, PPC_THREAD_STATE, (void *)&mctx.ss, state_count)) != KERN_SUCCESS) { - panic("sendsig: thread_setstatus failed, ret = %08X\n", kretn); - } - } - - proc_lock(p); - return; - -bad: - proc_lock(p); - SIGACTION(p, SIGILL) = SIG_DFL; - sig = sigmask(SIGILL); - p->p_sigignore &= ~sig; - p->p_sigcatch &= ~sig; - ut->uu_sigmask &= ~sig; - /* sendsig is called with signal lock held */ - proc_unlock(p); - psignal_locked(p, SIGILL); - proc_lock(p); - return; -} - -/* - * System call to clean up state after a signal - * has been taken. Reset signal mask and - * stack state from context left by sendsig (above). - * Return to previous pc and psl as specified by - * context left by sendsig. Check carefully to - * make sure that the user has not modified the - * psl to gain improper privileges or to cause - * a machine fault. - */ - -/* ARGSUSED */ -int -sigreturn(struct proc *p, struct sigreturn_args *uap, __unused int *retval) -{ - struct user_ucontext64 uctx; - - char mactx[sizeof(struct mcontext64)]; - struct mcontext *p_mctx; - struct mcontext64 *p_64mctx; - int error; - thread_t th_act; - struct sigacts *ps = p->p_sigacts; - sigset_t mask; - user_addr_t action; - uint32_t state_count; - unsigned int state_flavor; - struct uthread * ut; - int vec_used = 0; - void *tsptr, *fptr, *vptr; - int infostyle = uap->infostyle; - - th_act = current_thread(); - - ut = (struct uthread *)get_bsdthread_info(th_act); - - /* - * If we are being asked to change the altstack flag on the thread, we - * just set or reset it and return (the uap->uctx is not used). - */ - if (infostyle == UC_SET_ALT_STACK) { - ut->uu_sigstk.ss_flags |= SA_ONSTACK; - return (0); - } else if ((unsigned int)infostyle == UC_RESET_ALT_STACK) { - ut->uu_sigstk.ss_flags &= ~SA_ONSTACK; - return (0); - } - - if (IS_64BIT_PROCESS(p)) { - error = copyin(uap->uctx, &uctx, sizeof(struct user_ucontext64)); - if (error) - return(error); - } else { - struct ucontext64 uctx32; - - /* - * struct ucontext and struct ucontext64 are identical in - * size and content; the only difference is the internal - * pointer type for the last element, which makes no - * difference for the copyin.
- */ - error = copyin(uap->uctx, &uctx32, sizeof(struct ucontext)); - if (error) - return(error); - ucontext_32to64(&uctx32, &uctx); - } - - - /* validate the machine context size */ - switch (uctx.uc_mcsize) { - case UC_FLAVOR64_VEC_SIZE: - case UC_FLAVOR64_SIZE: - case UC_FLAVOR_VEC_SIZE: - case UC_FLAVOR_SIZE: - break; - default: - return(EINVAL); - } - - /* - * The 64 bit process mcontext is identical to the mcontext64, so - * there is no conversion necessary. - */ - error = copyin(uctx.uc_mcontext64, mactx, uctx.uc_mcsize); - if (error) - return(error); - - if ((uctx.uc_onstack & 01)) - ut->uu_sigstk.ss_flags |= SA_ONSTACK; - else - ut->uu_sigstk.ss_flags &= ~SA_ONSTACK; - - ut->uu_sigmask = uctx.uc_sigmask & ~sigcantmask; - if (ut->uu_siglist & ~ut->uu_sigmask) - signal_setast(current_thread()); - - vec_used = 0; - switch (infostyle) { - case UC_FLAVOR64_VEC: - case UC_TRAD64_VEC: - vec_used = 1; - case UC_TRAD64: - case UC_FLAVOR64: { - p_64mctx = (struct mcontext64 *)mactx; - tsptr = (void *)&p_64mctx->ss; - fptr = (void *)&p_64mctx->fs; - vptr = (void *)&p_64mctx->vs; - state_flavor = PPC_THREAD_STATE64; - state_count = PPC_THREAD_STATE64_COUNT; - } - break; - case UC_FLAVOR_VEC : - case UC_TRAD_VEC : - vec_used = 1; - case UC_FLAVOR : - case UC_TRAD : - default: { - p_mctx = (struct mcontext *)mactx; - tsptr = (void *)&p_mctx->ss; - fptr = (void *)&p_mctx->fs; - vptr = (void *)&p_mctx->vs; - state_flavor = PPC_THREAD_STATE; - state_count = PPC_THREAD_STATE_COUNT; - } - break; - } /* switch () */ - - /* validate the thread state, set/reset appropriate mode bits in srr1 */ - (void)ppc_checkthreadstate(tsptr, state_flavor); - - if (thread_setstatus(th_act, state_flavor, tsptr, state_count) != KERN_SUCCESS) { - return(EINVAL); - } - - state_count = PPC_FLOAT_STATE_COUNT; - if (thread_setstatus(th_act, PPC_FLOAT_STATE, fptr, state_count) != KERN_SUCCESS) { - return(EINVAL); - } - - mask = sigmask(SIGFPE); - if (((ut->uu_sigmask & mask) == 0) && (p->p_sigcatch & mask) && ((p->p_sigignore & mask) == 0)) { - action = ps->ps_sigact[SIGFPE]; - if((action != SIG_DFL) && (action != SIG_IGN)) { - thread_enable_fpe(th_act, 1); - } - } - - if (vec_used) { - state_count = PPC_VECTOR_STATE_COUNT; - if (thread_setstatus(th_act, PPC_VECTOR_STATE, vptr, state_count) != KERN_SUCCESS) { - return(EINVAL); - } - } - return (EJUSTRETURN); -} - -/* - * machine_exception() performs MD translation - * of a mach exception to a unix signal and code. - */ - -boolean_t -machine_exception( - int exception, - mach_exception_code_t code, - __unused mach_exception_subcode_t subcode, - int *unix_signal, - mach_exception_code_t *unix_code) -{ - switch(exception) { - - case EXC_BAD_INSTRUCTION: - *unix_signal = SIGILL; - *unix_code = code; - break; - - case EXC_ARITHMETIC: - *unix_signal = SIGFPE; - *unix_code = code; - break; - - case EXC_SOFTWARE: - if (code == EXC_PPC_TRAP) { - *unix_signal = SIGTRAP; - *unix_code = code; - break; - } else - return(FALSE); - - default: - return(FALSE); - } - - return(TRUE); -} - diff --git a/bsd/dev/ppc/xsumas.s b/bsd/dev/ppc/xsumas.s deleted file mode 100644 index 6ac06e947..000000000 --- a/bsd/dev/ppc/xsumas.s +++ /dev/null @@ -1,401 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). 
You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#define kShort 11 -#define cr1_gt 5 // bit 1 of cr1 - -/* - * short xsum_assym( short *p, int len, short xsum, boolean odd); - * - * r3 - Pointer to data - * r4 - Length of data - * r5 - Accumulated sum value - * r6 - "Starting on odd address" flag (relative to byte 0 of the checksummed data) - * - * Note: If the "odd" flag is set, the address in r3 will be even. Nonetheless, we - * correctly handle the case where the flag is set and the address is odd. - * - * This is the internet (IP, TCP) checksum algorithm, which is the 1s-complement sum - * of the data, treated as an array of 16-bit integers. 1s-complement sums are done - * via "add with carry" operations on a 2s-complement machine like PPC. Note that - * the adds can be done in parallel on 32-bit (or 64-bit) registers, as long as the - * final sum is folded down to 16 bits. On 32-bit machines we use "adde", which is - * perfect except that it serializes the adds on the carry bit. On 64-bit machines - * we avoid this serialization by adding 32-bit words into 64-bit sums, then folding - * all 64-bits into a 16-bit sum at the end. We cannot use "adde" on 64-bit sums, - * because the kernel runs in 32-bit mode even on 64-bit machines (so the carry bit - * is set on the low 32-bits of the sum.) - * - * Using Altivec is tempting, but the performance impact of the greatly increased - * number of exceptions and register save/restore traffic probably make it impractical - * for now. - */ - .globl _xsum_assym - .globl _xsum_nop_if_32bit - .text - .align 5 -_xsum_assym: - cmplwi cr0,r4,kShort ; too short to word align? - rlwinm r2,r3,0,0x3 ; get byte offset in word - dcbt 0,r3 ; touch in 1st cache line - cmpwi cr6,r2,0 ; is address word aligned? - ble cr0,Lshort ; skip if too short to bother aligning - - subfic r0,r2,4 ; get #bytes in partial word - cmplwi cr1,r6,0 ; set cr1_gt if "starting on odd address" flag is set - addic r0,r0,0 ; turn off carry - beq cr6,Laligned ; skip if already word aligned (r2==0 if aligned) - -; Partial word at start: zero filled on left, it becomes initial checksum.
- - rlwinm r3,r3,0,0,29 ; word align address - mtcrf 0x01,r2 ; move byte offset to cr7 - lwz r6,0(r3) ; get partial word - li r7,-1 ; start of mask for partial fill - slwi r8,r2,3 ; multiply byte offset by 8 - sub r4,r4,r0 ; adjust length for bytes in partial word - crxor cr1_gt,31,cr1_gt; set flag if byte-lane swap will be necessary - srw r7,r7,r8 ; get mask for bytes to keep in partial word - addi r3,r3,4 ; point to next word of input - and r2,r6,r7 ; zero fill on left - -; Address is now word aligned. Prepare for inner loop over 32-byte chunks. -; r2 = initial checksum -; r3 = word aligned address -; r4 = length remaining -; r5 = accumulated sum parameter -; carry = off -; cr1_gt = "starting on odd address" flag - -Laligned: - srwi. r0,r4,5 ; get count of 32-byte chunks - mtcrf 0x02,r4 ; move residual length to cr6 and cr7 - mtcrf 0x01,r4 - beq cr0,Lleftovers ; no chunks - - mtctr r0 ; set up loop count - li r4,32 ; offset to next chunk -_xsum_nop_if_32bit: - b L64BitPath ; use the 64-bit path (patched to nop on 32-bit machine) - dcbt r4,r3 ; touch in 2nd cache line - li r0,96 ; get touch offset - b LInnerLoop32 ; enter 32-bit loop - -; Inner loop for 32-bit machines. - - .align 4 -LInnerLoop32: - lwz r4,0(r3) - lwz r6,4(r3) - lwz r7,8(r3) - lwz r8,12(r3) - adde r2,r2,r4 - lwz r9,16(r3) - adde r2,r2,r6 - lwz r10,20(r3) - adde r2,r2,r7 - lwz r11,24(r3) - adde r2,r2,r8 - lwz r12,28(r3) - adde r2,r2,r9 - dcbt r3,r0 - adde r2,r2,r10 - addi r3,r3,32 - adde r2,r2,r11 - adde r2,r2,r12 - bdnz+ LInnerLoop32 - -; Handle leftover bytes. -; r2 = checksum so far -; r3 = word aligned address -; r5 = accumulated sum parameter -; carry = live -; cr1_gt = "starting on odd address" flag -; cr6,cr7 = residual length - -Lleftovers: - bf 27,Lleftover8 ; test 0x10 bit of residual length - lwz r4,0(r3) - lwz r6,4(r3) - lwz r7,8(r3) - lwz r8,12(r3) - addi r3,r3,16 - adde r2,r2,r4 - adde r2,r2,r6 - adde r2,r2,r7 - adde r2,r2,r8 -Lleftover8: - bf 28,Lleftover4 - lwz r4,0(r3) - lwz r6,4(r3) - addi r3,r3,8 - adde r2,r2,r4 - adde r2,r2,r6 -Lleftover4: - bf 29,Lleftover2 - lwz r4,0(r3) - addi r3,r3,4 - adde r2,r2,r4 -Lleftover2: - bf 30,Lleftover1 - lhz r4,0(r3) - addi r3,r3,2 - adde r2,r2,r4 -Lleftover1: - bf 31,Lwrapup - lbz r4,0(r3) - slwi r4,r4,8 ; shift last byte into proper lane - adde r2,r2,r4 - -; All data bytes checksummed. Wrap up. -; r2 = checksum so far (word parallel) -; r5 = accumulated sum parameter -; carry = live -; cr1_gt = "starting on odd address" flag - -Lwrapup: - addze r2,r2 ; add in last carry - addze r2,r2 ; in case the "addze" carries -Lwrapupx: ; here from short-operand case, with xer(ca) undefined - srwi r6,r2,16 ; top half of 32-bit checksum - rlwinm r7,r2,0,0xFFFF ; lower half - add r2,r6,r7 ; add them together - srwi r6,r2,16 ; then do it again, in case first carried - rlwinm r7,r2,0,0xFFFF - add r2,r6,r7 - bf cr1_gt,Lswapped ; test "starting on odd address" flag - -; The checksum began on an odd address, so swap bytes. - - rlwinm r6,r2,24,0x00FF ; move top byte to bottom - rlwinm r7,r2,8,0xFF00 ; bottom to top - or r2,r6,r7 ; rejoin - -; Finally, add in checksum passed in as a parameter. - -Lswapped: - add r2,r2,r5 ; add passed-in checksum - srwi r6,r2,16 ; top half of 32-bit checksum - rlwinm r7,r2,0,0xFFFF ; lower half - add r2,r6,r7 ; add them together - srwi r6,r2,16 ; then do it again, in case first carried - rlwinm r7,r2,0,0xFFFF - add r3,r6,r7 ; steer result into r3 - blr - -; Handle short operands. Do a halfword at a time. 
-; r3 = address -; r4 = length (<= kShort) -; r5 = accumulated sum parameter -; r6 = "starting on odd byte" flag - -Lshort: - cmpwi cr6,r4,2 ; at least two bytes? - andi. r0,r4,1 ; odd length? - li r2,0 ; initialize checksum - cmplwi cr1,r6,0 ; set cr1_gt if "starting on odd address" flag is set - blt cr6,Lshort2 ; fewer than two bytes, so skip -Lshort1: - cmpwi cr6,r4,4 ; two more bytes (after we decrement)? - lhz r7,0(r3) - subi r4,r4,2 - addi r3,r3,2 - add r2,r2,r7 ; note no need for "adde" - bge cr6,Lshort1 ; loop for 2 more bytes -Lshort2: - beq Lwrapupx ; no byte at end, proceed to checkout with carry undefined - lbz r7,0(r3) - slwi r7,r7,8 ; shift last byte into proper lane - add r2,r2,r7 - b Lwrapupx - -; Handle 64-bit machine. The major improvement over the 32-bit path is that we use -; four parallel 32-bit accumulators, which carry into the upper half naturally so we -; do not have to use "adde", which serializes on the carry bit. Note that we cannot -; do 64-bit "adde"s, because we run in 32-bit mode so carry would not be set correctly. -; r2 = checksum so far (ie, the zero-filled partial first word) -; r3 = word aligned address -; r5 = accumulated sum parameter -; ctr = number of 32-byte chunks of input -; carry = unused in this code -; cr1_gt = "starting on odd address" flag -; cr6,cr7 = residual length - -L64BitPath: - stw r13,-4(r1) ; save a few nonvolatile regs in red zone so we can use them - stw r14,-8(r1) - stw r15,-12(r1) - stw r16,-16(r1) - li r0,128 ; to touch next line - li r13,0 ; r13-r15 are the accumulators, so initialize them - dcbt r3,r0 ; touch in next cache line, and keep loads away from the above stores - lwz r4,0(r3) ; start pipeline by loading first 32 bytes into r4, r6-r12 - lwz r6,4(r3) - lwz r7,8(r3) - mr r14,r2 ; just copy incoming partial word into one of the accumulators - li r15,0 - lwz r8,12(r3) - lwz r9,16(r3) - li r16,0 - li r0,256 ; get touch offset - lwz r10,20(r3) - lwz r11,24(r3) - lwz r12,28(r3) ; load last word of previous chunk - addi r3,r3,32 ; skip past the chunk - bdnz++ LInnerLoop64 ; enter loop if another chunk to go - - b LAddLastChunk ; only one chunk - -; Inner loop for 64-bit processors. This loop is scheduled for the 970. -; It is pipelined (loads are one iteration ahead of adds), and unrolled. -; It should take 9-10 cycles per iteration, which consumes 64 bytes of input. - - .align 5 -LInnerLoop64: ; 64 bytes/iteration - add r13,r13,r4 ; cycle 1 - add r14,r14,r6 - dcbt r3,r0 ; touch in 2 lines ahead - lwz r4,0(r3) - - add r15,r15,r7 ; cycle 2, etc - lwz r6,4(r3) - lwz r7,8(r3) - add r16,r16,r8 - - lwz r8,12(r3) - add r13,r13,r9 - add r14,r14,r10 - lwz r9,16(r3) - - add r15,r15,r11 - lwz r10,20(r3) - lwz r11,24(r3) - add r16,r16,r12 - bdz-- LEarlyExit ; early exit if no more chunks - - lwz r12,28(r3) - add r13,r13,r4 - add r14,r14,r6 - lwz r4,32(r3) - - add r15,r15,r7 - lwz r6,36(r3) - lwz r7,40(r3) - add r16,r16,r8 - - lwz r8,44(r3) - add r13,r13,r9 - add r14,r14,r10 - lwz r9,48(r3) - - add r15,r15,r11 - lwz r10,52(r3) - lwz r11,56(r3) - add r16,r16,r12 - - nop ; position last load in 2nd dispatch slot - lwz r12,60(r3) - addi r3,r3,64 - bdnz++ LInnerLoop64 - - b LAddLastChunk - -; Add in the last 32-byte chunk, and any leftover bytes. 
-; r3 = word aligned address of next byte of data -; r5 = accumulated sum parameter -; r13-r16 = the four accumulators -; cr1_gt = "starting on odd address" flag -; cr6,cr7 = residual length - -LEarlyExit: ; here from middle of inner loop - lwz r12,28(r3) ; load last word of last chunk - addi r3,r3,32 -LAddLastChunk: ; last 32-byte chunk of input is in r4,r6-r12 - add r13,r13,r4 ; add in last chunk - add r14,r14,r6 ; these are 64-bit adds - add r15,r15,r7 - add r16,r16,r8 - add r13,r13,r9 - add r14,r14,r10 - add r15,r15,r11 - add r16,r16,r12 - -; Handle leftover bytes, if any. - - bf 27,Lleft1 ; test 0x10 bit of residual length - lwz r4,0(r3) - lwz r6,4(r3) - lwz r7,8(r3) - lwz r8,12(r3) - addi r3,r3,16 - add r13,r13,r4 - add r14,r14,r6 - add r15,r15,r7 - add r16,r16,r8 -Lleft1: - bf 28,Lleft2 - lwz r4,0(r3) - lwz r6,4(r3) - addi r3,r3,8 - add r13,r13,r4 - add r14,r14,r6 -Lleft2: - bf 29,Lleft3 - lwz r4,0(r3) - addi r3,r3,4 - add r14,r14,r4 -Lleft3: - bf 30,Lleft4 - lhz r4,0(r3) - addi r3,r3,2 - add r15,r15,r4 -Lleft4: - bf 31,Lleft5 - lbz r4,0(r3) - slwi r4,r4,8 ; shift last byte into proper lane - add r16,r16,r4 - -; All data bytes have been checksummed. Now we must add together the four -; accumulators and restore the regs from the red zone. -; r3 = word aligned address of next byte of data -; r5 = accumulated sum parameter -; r13-r16 = the four accumulators -; carry = not used so far -; cr1_gt = "starting on odd address" flag - -Lleft5: - add r8,r13,r14 ; add the four accumulators together - add r9,r15,r16 - lwz r13,-4(r1) ; start to restore nonvolatiles from red zone - lwz r14,-8(r1) - add r8,r8,r9 ; now r8 is 64-bit sum of the four accumulators - lwz r15,-12(r1) - lwz r16,-16(r1) - srdi r7,r8,32 ; get upper half of 64-bit sum - addc r2,r7,r8 ; finally, do a 32-bit add of the two halves of r8 (setting carry) - b Lwrapup ; merge r2, r5, and carry into a 16-bit checksum diff --git a/bsd/dev/random/Makefile b/bsd/dev/random/Makefile index 1190bc1ff..7a07200d9 100644 --- a/bsd/dev/random/Makefile +++ b/bsd/dev/random/Makefile @@ -9,16 +9,12 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/dev/unix_startup.c b/bsd/dev/unix_startup.c index 4ec548794..f167a1752 100644 --- a/bsd/dev/unix_startup.c +++ b/bsd/dev/unix_startup.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -42,6 +42,7 @@ #include <sys/file_internal.h> #include <sys/proc_internal.h> #include <sys/clist.h> +#include <sys/mcache.h> #include <sys/mbuf.h> #include <sys/systm.h> #include <sys/tty.h> @@ -50,7 +51,9 @@ #include <machine/cons.h> #include <pexpert/pexpert.h> #include <sys/socketvar.h> +#include <pexpert/pexpert.h> +extern uint32_t kern_maxvnodes; extern vm_map_t mb_map; #if INET || INET6 @@ -62,7 +65,7 @@ void bsd_bufferinit(void) __attribute__((section("__TEXT, initcode")) extern void md_prepare_for_shutdown(int, int, char *); unsigned int bsd_mbuf_cluster_reserve(boolean_t *); -void bsd_srv_setup(int); +void bsd_scale_setup(int); void bsd_exec_setup(int); /* @@ -71,7 +74,7 @@ void bsd_exec_setup(int); #ifdef NBUF int max_nbuf_headers = NBUF; -int niobuf_headers = NBUF / 2; +int niobuf_headers = (NBUF / 2) + 2048; int nbuf_hashelements = NBUF; int nbuf_headers = NBUF; #else @@ -81,11 +84,11 @@ int nbuf_hashelements = 0; int nbuf_headers = 0; #endif -SYSCTL_INT (_kern, OID_AUTO, nbuf, CTLFLAG_RD, &nbuf_headers, 0, ""); -SYSCTL_INT (_kern, OID_AUTO, maxnbuf, CTLFLAG_RW, &max_nbuf_headers, 0, ""); +SYSCTL_INT (_kern, OID_AUTO, nbuf, CTLFLAG_RD | CTLFLAG_LOCKED, &nbuf_headers, 0, ""); +SYSCTL_INT (_kern, OID_AUTO, maxnbuf, CTLFLAG_RW | CTLFLAG_LOCKED, &max_nbuf_headers, 0, ""); __private_extern__ int customnbuf = 0; -int srv = 0; /* Flag indicates a server boot when set */ +int serverperfmode = 0; /* Flag indicates a server boot when set */ int ncl = 0; static unsigned int mbuf_poolsz; @@ -118,10 +121,12 @@ bsd_startupearly(void) } else nbuf_hashelements = max_nbuf_headers; - if (niobuf_headers == 0) - niobuf_headers = max_nbuf_headers; - if (niobuf_headers > 4096) - niobuf_headers = 4096; + if (niobuf_headers == 0) { + if (max_nbuf_headers < 4096) + niobuf_headers = max_nbuf_headers; + else + niobuf_headers = (max_nbuf_headers / 2) + 2048; + } if (niobuf_headers < CONFIG_MIN_NIOBUF) niobuf_headers = CONFIG_MIN_NIOBUF; @@ -176,18 +181,23 @@ bsd_startupearly(void) #endif /* SOCKETS */ if (vnodes_sized == 0) { - /* - * Size vnodes based on memory - * Number vnodes is (memsize/64k) + 1024 - * This is the calculation that is used by launchd in tiger - * we are clipping the max based on 16G - * ie ((16*1024*1024*1024)/(64 *1024)) + 1024 = 263168; - * CONFIG_VNODES is set to 263168 for "medium" configurations (the default) - * but can be smaller or larger. - */ - desiredvnodes = (sane_size/65536) + 1024; - if (desiredvnodes > CONFIG_VNODES) - desiredvnodes = CONFIG_VNODES; + if (!PE_get_default("kern.maxvnodes", &desiredvnodes, sizeof(desiredvnodes))) { + /* + * Size vnodes based on memory + * Number vnodes is (memsize/64k) + 1024 + * This is the calculation that is used by launchd in tiger + * we are clipping the max based on 16G + * ie ((16*1024*1024*1024)/(64 *1024)) + 1024 = 263168; + * CONFIG_VNODES is set to 263168 for "medium" configurations (the default) + * but can be smaller or larger. + */ + desiredvnodes = (sane_size/65536) + 1024; +#ifdef CONFIG_VNODES + if (desiredvnodes > CONFIG_VNODES) + desiredvnodes = CONFIG_VNODES; +#endif + } + vnodes_sized = 1; } } @@ -252,7 +262,6 @@ bsd_mbuf_cluster_reserve(boolean_t *overridden) * to correctly compute the size of the low-memory VM pool. It is * redundant but rather harmless. 
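For reference, the vnode-sizing rule bsd_startupearly() applies above (one vnode per 64KB of memory plus 1024, clipped at the 16GB point) can be exercised standalone. A small sketch, leaving out the PE_get_default("kern.maxvnodes") override and the #ifdef CONFIG_VNODES guard; the constant below is the "medium" default quoted in the comment.

#include <stdint.h>
#include <stdio.h>

#define CONFIG_VNODES 263168    /* ((16GB / 64KB) + 1024), per the comment */

static int
size_vnodes(uint64_t sane_size)
{
    int desiredvnodes = (int)(sane_size / 65536) + 1024;

    if (desiredvnodes > CONFIG_VNODES)  /* clip: >16GB gains no more vnodes */
        desiredvnodes = CONFIG_VNODES;
    return desiredvnodes;
}

int main(void)
{
    /* 16GB lands exactly on the ceiling: 262144 + 1024 == 263168. */
    printf("%d\n", size_vnodes(16ULL * 1024 * 1024 * 1024));
    return 0;
}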
*/ - //(void) PE_parse_boot_argn("srv", &srv, sizeof (srv)); (void) PE_parse_boot_argn("ncl", &ncl, sizeof (ncl)); (void) PE_parse_boot_argn("mbuf_pool", &mbuf_pool, sizeof (mbuf_pool)); @@ -265,12 +274,12 @@ bsd_mbuf_cluster_reserve(boolean_t *overridden) if (sane_size > (64 * 1024 * 1024) || ncl != 0) { - if (ncl || srv) + if (ncl || serverperfmode) was_overridden = TRUE; if ((nmbclusters = ncl) == 0) { /* Auto-configure the mbuf pool size */ - nmbclusters = mbuf_default_ncl(srv, sane_size); + nmbclusters = mbuf_default_ncl(serverperfmode, sane_size); } else { /* Make sure it's not odd in case ncl is manually set */ if (nmbclusters & 0x1) @@ -280,6 +289,9 @@ bsd_mbuf_cluster_reserve(boolean_t *overridden) if (nmbclusters > MAX_NCL) nmbclusters = MAX_NCL; } + + /* Round it down to nearest multiple of 4KB clusters */ + nmbclusters = P2ROUNDDOWN(nmbclusters, NCLPBG); } mbuf_poolsz = nmbclusters << MCLSHIFT; done: @@ -296,11 +308,15 @@ void IOSleep(int); void -bsd_srv_setup(int scale) +bsd_scale_setup(int scale) { #if defined(__LP64__) - /* if memory is more than 16G, then apply rules for processes */ - if (scale > 0) { + if ((scale > 0) && (serverperfmode == 0)) { + maxproc *= scale; + maxprocperuid = (maxproc * 2) / 3; + } + /* Apply server scaling rules */ + if ((scale > 0) && (serverperfmode !=0)) { maxproc = 2500 * scale; hard_maxproc = maxproc; /* no fp usage */ diff --git a/bsd/dev/vn/Makefile b/bsd/dev/vn/Makefile index 64ae209ac..b4e415a16 100644 --- a/bsd/dev/vn/Makefile +++ b/bsd/dev/vn/Makefile @@ -3,23 +3,18 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir -CFLAGS+=$(WERROR) include $(MakeInc_cmd) include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/dev/vn/vn.c b/bsd/dev/vn/vn.c index bac913331..2a0001d48 100644 --- a/bsd/dev/vn/vn.c +++ b/bsd/dev/vn/vn.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1141,10 +1141,10 @@ vniocattach_file(struct vn_softc *vn, flags = FREAD|FWRITE; if (in_kernel) { - NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, vniop->vn_file, ctx); + NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW, UIO_SYSSPACE, vniop->vn_file, ctx); } else { - NDINIT(&nd, LOOKUP, FOLLOW, + NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW, (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32), vniop->vn_file, ctx); } @@ -1156,11 +1156,11 @@ vniocattach_file(struct vn_softc *vn, } flags &= ~FWRITE; if (in_kernel) { - NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, + NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW, UIO_SYSSPACE, vniop->vn_file, ctx); } else { - NDINIT(&nd, LOOKUP, FOLLOW, + NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW, (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32), vniop->vn_file, ctx); } @@ -1221,10 +1221,10 @@ vniocattach_shadow(struct vn_softc *vn, struct vn_ioctl_64 *vniop, flags = FREAD|FWRITE; if (in_kernel) { - NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, vniop->vn_file, ctx); + NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW, UIO_SYSSPACE, vniop->vn_file, ctx); } else { - NDINIT(&nd, LOOKUP, FOLLOW, + NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW, (IS_64BIT_PROCESS(p) ? 
UIO_USERSPACE64 : UIO_USERSPACE32), vniop->vn_file, ctx); } diff --git a/bsd/dev/x86_64/munge.s b/bsd/dev/x86_64/munge.s index ec5b6123b..cb2f6dfc0 100644 --- a/bsd/dev/x86_64/munge.s +++ b/bsd/dev/x86_64/munge.s @@ -132,16 +132,91 @@ Lw2: Entry(munge_wl) /* Costs an extra w move to do this */ ENTRY(munge_wlw) xorl %edx,%edx +Lwlw: movl 12(%rsi),%eax movl %eax,16(%rsi) movl %edx,20(%rsi) +Lwl: movl 8(%rsi),%eax movl %eax,12(%rsi) movl 4(%rsi),%eax movl %eax,8(%rsi) + movl %edx,4(%rsi) ret +ENTRY(munge_wlwwwll) + xorl %edx,%edx +Lwlwwwll: + movl 36(%rsi),%eax + movl %eax,52(%rsi) + movl 32(%rsi),%eax + movl %eax,48(%rsi) + + movl 28(%rsi),%eax + movl %eax,44(%rsi) + movl 24(%rsi),%eax + movl %eax,40(%rsi) + + movl 20(%rsi),%eax + movl %eax,32(%rsi) + movl %edx,36(%rsi) +Lwlww: + movl 16(%rsi),%eax + movl %eax,24(%rsi) + movl %edx,28(%rsi) + jmp Lwlw + +ENTRY(munge_wlwwwllw) + xorl %edx,%edx + movl 40(%rsi),%eax + movl %eax,56(%rsi) + movl %edx,60(%rsi) + jmp Lwlwwwll + +ENTRY(munge_wlwwlwlw) + xorl %edx,%edx + movl 40(%rsi),%eax + movl %eax,56(%rsi) + movl %edx,60(%rsi) + movl 36(%rsi),%eax + movl %eax,52(%rsi) + movl 32(%rsi),%eax + movl %eax,48(%rsi) + movl 28(%rsi),%eax + movl %eax,40(%rsi) + movl %edx,44(%rsi) + movl 24(%rsi),%eax + movl %eax,36(%rsi) + movl 20(%rsi),%eax + movl %eax,32(%rsi) + jmp Lwlww + + +ENTRY(munge_wllwwll) + xorl %edx,%edx + + movl 40(%rsi),%eax //l + movl %eax,52(%rsi) + movl 36(%rsi),%eax + movl %eax,48(%rsi) + movl 32(%rsi),%eax //l + movl %eax,44(%rsi) + movl 28(%rsi),%eax + movl %eax,40(%rsi) + movl 24(%rsi),%eax //w + movl %eax,32(%rsi) + movl %edx,36(%rsi) + movl 20(%rsi),%eax //w + movl %eax,24(%rsi) + movl %edx,28(%rsi) + movl 16(%rsi),%eax //l + movl %eax,20(%rsi) + movl 12(%rsi),%eax + movl %eax,16(%rsi) + + jmp Lwl + Entry(munge_wwwlw) xorl %edx,%edx movl 20(%rsi),%eax @@ -183,6 +258,61 @@ ENTRY(munge_wwwwwl) movl %eax,44(%rsi) jmp Lw5 + +ENTRY(munge_wwwwwlww) + xorl %edx,%edx + movl 32(%rsi),%eax + movl %eax,56(%rsi) + movl %edx,60(%rsi) + movl 28(%rsi),%eax + movl %eax,48(%rsi) + movl %edx,52(%rsi) + movl 20(%rsi),%eax + movl %eax,40(%rsi) + movl 24(%rsi),%eax + movl %eax,44(%rsi) + + jmp Lw5 + +ENTRY(munge_wwwwwllw) + xorl %edx,%edx + movl 36(%rsi),%eax + movl %eax,56(%rsi) + movl %edx,60(%rsi) + movl 28(%rsi),%eax + movl %eax,48(%rsi) + movl 32(%rsi),%eax + movl %eax,52(%rsi) + movl 20(%rsi),%eax + movl %eax,40(%rsi) + movl 24(%rsi),%eax + movl %eax,44(%rsi) + jmp Lw5 + +ENTRY(munge_wwwwwlll) + xorl %edx,%edx + movl 36(%rsi),%eax + movl %eax,56(%rsi) + movl 40(%rsi),%eax + movl %eax,60(%rsi) + movl 28(%rsi),%eax + movl %eax,48(%rsi) + movl 32(%rsi),%eax + movl %eax,52(%rsi) + movl 20(%rsi),%eax + movl %eax,40(%rsi) + movl 24(%rsi),%eax + movl %eax,44(%rsi) + jmp Lw5 + +ENTRY(munge_wwwwwwl) + xorl %edx,%edx + movl 24(%rsi),%eax + movl %eax,48(%rsi) + movl 28(%rsi),%eax + movl %eax,52(%rsi) + jmp Lw6 + ENTRY(munge_wwwwwwlw) xorl %edx,%edx movl 32(%rsi),%eax diff --git a/bsd/hfs/Makefile b/bsd/hfs/Makefile index 814b9184d..27705308f 100644 --- a/bsd/hfs/Makefile +++ b/bsd/hfs/Makefile @@ -9,16 +9,12 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/hfs/hfs.h b/bsd/hfs/hfs.h index 24807f7f3..114fcecc7 100644 --- a/bsd/hfs/hfs.h +++ b/bsd/hfs/hfs.h @@ -29,6 +29,14 @@ #ifndef __HFS__ #define __HFS__ +/* If set to 1, enables the code to allocate blocks from the start + * 
of the disk instead of the nextAllocation for sparse devices like + * sparse disk images or sparsebundle images. The free extent cache + * for such volumes is also maintained based on the start block instead + * of number of contiguous allocation blocks. These devices prefer + * allocation of blocks near the start of the disk to avoid the + * increasing the image size, but it can also result in file fragmentation. + */ #define HFS_SPARSE_DEV 1 #if DEBUG @@ -60,6 +68,10 @@ #include <hfs/hfs_encodings.h> #include <hfs/hfs_hotfiles.h> +#if CONFIG_HFS_ALLOC_RBTREE +#include <hfs/hfscommon/headers/HybridAllocator.h> +#endif + /* * Just reported via MIG interface. */ @@ -110,11 +122,11 @@ extern struct timezone gTimeZone; */ #define HFS_ROOTVERYLOWDISKTRIGGERFRACTION 5 -#define HFS_ROOTVERYLOWDISKTRIGGERLEVEL ((u_int64_t)(125*1024*1024)) +#define HFS_ROOTVERYLOWDISKTRIGGERLEVEL ((u_int64_t)(512*1024*1024)) #define HFS_ROOTLOWDISKTRIGGERFRACTION 10 -#define HFS_ROOTLOWDISKTRIGGERLEVEL ((u_int64_t)(250*1024*1024)) +#define HFS_ROOTLOWDISKTRIGGERLEVEL ((u_int64_t)(1024*1024*1024)) #define HFS_ROOTLOWDISKSHUTOFFFRACTION 11 -#define HFS_ROOTLOWDISKSHUTOFFLEVEL ((u_int64_t)(375*1024*1024)) +#define HFS_ROOTLOWDISKSHUTOFFLEVEL ((u_int64_t)(1024*1024*1024 + 250*1024*1024)) #define HFS_VERYLOWDISKTRIGGERFRACTION 1 #define HFS_VERYLOWDISKTRIGGERLEVEL ((u_int64_t)(100*1024*1024)) @@ -178,8 +190,9 @@ typedef struct hfsmount { int16_t vcbFlags; /* Runtime flag to indicate if volume is dirty/clean */ u_int32_t vcbAtrb; u_int32_t vcbJinfoBlock; - time_t hfs_itime; /* file system creation time */ - time_t hfs_btime; /* file system last backup time */ + u_int32_t localCreateDate;/* volume create time from volume header (For HFS+, value is in local time) */ + time_t hfs_itime; /* file system creation time (creation date of the root folder) */ + time_t hfs_btime; /* file system last backup time */ u_int32_t blockSize; /* size of allocation blocks */ u_int32_t totalBlocks; /* total allocation blocks */ u_int32_t allocLimit; /* Do not allocate this block or beyond */ @@ -204,11 +217,33 @@ typedef struct hfsmount { /* cache of largest known free extents */ u_int32_t vcbFreeExtCnt; HFSPlusExtentDescriptor vcbFreeExt[kMaxFreeExtents]; + lck_spin_t vcbFreeExtLock; + +#if CONFIG_HFS_ALLOC_RBTREE + /* + * Access to these fields should only be done + * after acquiring the bitmap lock. Note that the + * "offset_block_end" field indicates the portion of + * the bitmap that is currently managed by the red-black tree. + */ + + /* Normal Allocation Tree */ + extent_tree_offset_t offset_tree; + u_int32_t offset_free_extents; /* number of free extents managed by tree */ + u_int32_t offset_block_end; +#endif + /* + * For setting persistent in-mount fields that relate + * to the use of the extent trees. See HFS Red-Black + * Tree Allocator Flags below. + */ + u_int32_t extent_tree_flags; + + u_int32_t reserveBlocks; /* free block reserve */ u_int32_t loanedBlocks; /* blocks on loan for delayed allocations */ - u_int32_t localCreateDate; /* creation times for HFS+ volumes are in local time */ /* * HFS+ Private system directories (two). 
Any access @@ -232,8 +267,9 @@ typedef struct hfsmount { u_int32_t jnl_size; u_int32_t hfs_jnlfileid; u_int32_t hfs_jnlinfoblkid; - lck_rw_t hfs_global_lock; + lck_rw_t hfs_global_lock; u_int32_t hfs_global_lock_nesting; + void* hfs_global_lockowner; /* Notification variables: */ u_int32_t hfs_notification_conditions; @@ -266,7 +302,7 @@ typedef struct hfsmount { int hfc_maxfiles; /* maximum files to track */ struct vnode * hfc_filevp; -#ifdef HFS_SPARSE_DEV +#if HFS_SPARSE_DEV /* Sparse device variables: */ struct vnode * hfs_backingfs_rootvp; u_int32_t hfs_last_backingstatfs; @@ -281,8 +317,9 @@ typedef struct hfsmount { lck_rw_t hfs_insync; /* protects sync/freeze interaction */ /* Resize variables: */ - u_int32_t hfs_resize_filesmoved; - u_int32_t hfs_resize_totalfiles; + u_int32_t hfs_resize_blocksmoved; + u_int32_t hfs_resize_totalblocks; + u_int32_t hfs_resize_progress; /* Per mount cnode hash variables: */ lck_mtx_t hfs_chash_mutex; /* protects access to cnode hash table */ @@ -313,6 +350,7 @@ typedef struct hfsmount { u_int64_t hfs_max_pending_io; thread_call_t hfs_syncer; // removeable devices get sync'ed by this guy + } hfsmount_t; #define HFS_META_DELAY (100) @@ -321,7 +359,6 @@ typedef struct hfsmount { typedef hfsmount_t ExtendedVCB; /* Aliases for legacy (Mac OS 9) field names */ -#define vcbCrDate hfs_itime #define vcbLsMod hfs_mtime #define vcbVolBkUp hfs_btime #define extentsRefNum hfs_extents_vp @@ -362,6 +399,15 @@ static __inline__ Boolean IsVCBDirty(ExtendedVCB *vcb) */ enum privdirtype {FILE_HARDLINKS, DIR_HARDLINKS}; +/* HFS Red-Black Tree Allocator Flags */ +#define HFS_ALLOC_RB_ENABLED 0x000001 /* trees in use */ +#define HFS_ALLOC_RB_ERRORED 0x000002 /* tree hit error; disabled for the mount */ +#define HFS_ALLOC_RB_MZACTIVE 0x000004 /* metazone tree has finished building */ +#define HFS_ALLOC_RB_ACTIVE 0x000008 /* normalzone tree finished building */ + +/* HFS Red-Black Unmount Synch. Flags */ +#define HFS_ALLOC_TREEBUILD_INFLIGHT 0x000010 +#define HFS_ALLOC_TEARDOWN_INFLIGHT 0x000020 /* HFS mount point flags */ #define HFS_READ_ONLY 0x00001 @@ -380,6 +426,7 @@ enum privdirtype {FILE_HARDLINKS, DIR_HARDLINKS}; #define HFS_CREATING_BTREE 0x02000 /* When set, do not update nextAllocation in the mount structure */ #define HFS_SKIP_UPDATE_NEXT_ALLOCATION 0x04000 +/* When set, the file system supports extent-based extended attributes */ #define HFS_XATTR_EXTENTS 0x08000 #define HFS_FOLDERCOUNT 0x10000 /* When set, the file system exists on a virtual device, like disk image */ @@ -391,7 +438,7 @@ enum privdirtype {FILE_HARDLINKS, DIR_HARDLINKS}; */ #define HFS_RDONLY_DOWNGRADE 0x80000 #define HFS_DID_CONTIG_SCAN 0x100000 -#define HFS_UNMAP 0x200000 +#define HFS_SSD 0x400000 /* Macro to update next allocation block in the HFS mount structure. If @@ -416,9 +463,6 @@ enum privdirtype {FILE_HARDLINKS, DIR_HARDLINKS}; lck_mtx_unlock(&(hfsmp)->hfs_mutex); \ } \ -#define hfs_global_exclusive_lock_acquire(hfsmp) lck_rw_lock_exclusive(&(hfsmp)->hfs_global_lock) -#define hfs_global_exclusive_lock_release(hfsmp) lck_rw_unlock_exclusive(&(hfsmp)->hfs_global_lock) - /* Macro for incrementing and decrementing the folder count in a cnode * attribute only if the HFS_FOLDERCOUNT bit is set in the mount flags * and kHFSHasFolderCount bit is set in the cnode flags. 
Currently these @@ -517,12 +561,10 @@ enum { kHFSPlusMaxFileNameBytes = kHFSPlusMaxFileNameChars * 3 }; /* * HFS specific fcntl()'s */ -#define HFS_BULKACCESS (FCNTL_FS_SPECIFIC_BASE + 0x00001) -#define HFS_GET_MOUNT_TIME (FCNTL_FS_SPECIFIC_BASE + 0x00002) -#define HFS_GET_LAST_MTIME (FCNTL_FS_SPECIFIC_BASE + 0x00003) #define HFS_GET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00004) #define HFS_SET_BOOT_INFO (FCNTL_FS_SPECIFIC_BASE + 0x00005) -#define HFS_EXT_BULKACCESS (FCNTL_FS_SPECIFIC_BASE + 0x00006) +/* See HFSIOC_EXT_BULKACCESS and friends for HFS specific fsctls*/ + /* @@ -537,7 +579,6 @@ enum { kHFSPlusMaxFileNameBytes = kHFSPlusMaxFileNameChars * 3 }; FUNCTION PROTOTYPES ******************************************************************************/ - /***************************************************************************** hfs_vnop_xxx functions from different files ******************************************************************************/ @@ -545,6 +586,9 @@ int hfs_vnop_readdirattr(struct vnop_readdirattr_args *); /* in hfs_attrlist.c int hfs_vnop_inactive(struct vnop_inactive_args *); /* in hfs_cnode.c */ int hfs_vnop_reclaim(struct vnop_reclaim_args *); /* in hfs_cnode.c */ +int hfs_set_backingstore (struct vnode *vp, int val); /* in hfs_cnode.c */ +int hfs_is_backingstore (struct vnode *vp, int *val); /* in hfs_cnode.c */ + int hfs_vnop_link(struct vnop_link_args *); /* in hfs_link.c */ @@ -633,6 +677,11 @@ extern int hfs_relocate(struct vnode *, u_int32_t, kauth_cred_t, struct proc extern int hfs_truncate(struct vnode *, off_t, int, int, int, vfs_context_t); +extern int hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork, + struct filefork *rsrcfork, u_int32_t fileid); + +extern int hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp); + extern int hfs_bmap(struct vnode *, daddr_t, struct vnode **, daddr64_t *, unsigned int *); extern int hfs_fsync(struct vnode *, int, int, struct proc *); @@ -643,9 +692,12 @@ extern int hfs_removeallattr(struct hfsmount *hfsmp, u_int32_t fileid); extern int hfs_set_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype, int state); -extern void hfs_check_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype); +extern int hfs_isallocated(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBlocks); -extern int hfs_isallocated(struct hfsmount *, u_int32_t, u_int32_t); +extern int hfs_count_allocated(struct hfsmount *hfsmp, u_int32_t startBlock, + u_int32_t numBlocks, u_int32_t *alloc_count); + +extern int hfs_isrbtree_active (struct hfsmount *hfsmp); /***************************************************************************** @@ -656,7 +708,7 @@ int hfs_mountroot(mount_t mp, vnode_t rvp, vfs_context_t context); /* used as a callback by the journaling code */ extern void hfs_sync_metadata(void *arg); -extern int hfs_vget(struct hfsmount *, cnid_t, struct vnode **, int); +extern int hfs_vget(struct hfsmount *, cnid_t, struct vnode **, int, int); extern void hfs_setencodingbits(struct hfsmount *hfsmp, u_int32_t encoding); @@ -694,6 +746,15 @@ extern int overflow_extents(struct filefork *fp); extern int hfs_owner_rights(struct hfsmount *hfsmp, uid_t cnode_uid, kauth_cred_t cred, struct proc *p, int invokesuperuserstatus); +extern int check_for_tracked_file(struct vnode *vp, time_t ctime, uint64_t op_type, void *arg); +extern int check_for_dataless_file(struct vnode *vp, uint64_t op_type); + +/* + * Journal lock function prototypes + */ +int hfs_lock_global (struct hfsmount *hfsmp, enum 
hfslocktype locktype); +void hfs_unlock_global (struct hfsmount *hfsmp); + /* HFS System file locking */ #define SFL_CATALOG 0x0001 @@ -717,7 +778,7 @@ extern u_int32_t hfs_freeblks(struct hfsmount * hfsmp, int wantreserve); short MacToVFSError(OSErr err); -void hfs_metadatazone_init(struct hfsmount *hfsmp); +void hfs_metadatazone_init(struct hfsmount *hfsmp, int disable); /* HFS directory hint functions. */ extern directoryhint_t * hfs_getdirhint(struct cnode *, int, int); @@ -735,9 +796,11 @@ extern int hfs_virtualmetafile(struct cnode *); extern int hfs_start_transaction(struct hfsmount *hfsmp); extern int hfs_end_transaction(struct hfsmount *hfsmp); -extern int hfs_journal_flush(struct hfsmount *hfsmp); +extern int hfs_journal_flush(struct hfsmount *hfsmp, boolean_t wait_for_IO); extern void hfs_sync_ejectable(struct hfsmount *hfsmp); +extern void hfs_trim_callback(void *arg, uint32_t extent_count, const dk_extent_t *extents); + /* Erase unused Catalog nodes due to <rdar://problem/6947811>. */ extern int hfs_erase_unused_nodes(struct hfsmount *hfsmp); @@ -758,7 +821,7 @@ extern int hfs_btsync(struct vnode *vp, int sync_transaction); extern void replace_desc(struct cnode *cp, struct cat_desc *cdp); extern int hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, - struct vnode **rvpp, int can_drop_lock, int error_on_unlinked); + struct vnode **rvpp, int can_drop_lock, int error_on_unlink); extern int hfs_update(struct vnode *, int); @@ -766,10 +829,24 @@ extern int hfs_update(struct vnode *, int); /***************************************************************************** Functions from hfs_xattr.c ******************************************************************************/ + +/* Maximum extended attribute size supported for all extended attributes except + * resource fork and finder info. + */ +#define HFS_XATTR_MAXSIZE (128 * 1024) + +/* Number of bits used to represent maximum extended attribute size */ +#define HFS_XATTR_SIZE_BITS 18 + int hfs_attrkeycompare(HFSPlusAttrKey *searchKey, HFSPlusAttrKey *trialKey); int hfs_buildattrkey(u_int32_t fileID, const char *attrname, HFSPlusAttrKey *key); void hfs_xattr_init(struct hfsmount * hfsmp); int file_attribute_exist(struct hfsmount *hfsmp, uint32_t fileID); +int init_attrdata_vnode(struct hfsmount *hfsmp); +int hfs_getxattr_internal(struct cnode *, struct vnop_getxattr_args *, + struct hfsmount *, u_int32_t); +int hfs_setxattr_internal(struct cnode *, caddr_t, size_t, + struct vnop_setxattr_args *, struct hfsmount *, u_int32_t); @@ -779,7 +856,7 @@ int file_attribute_exist(struct hfsmount *hfsmp, uint32_t fileID); extern int hfs_unlink(struct hfsmount *hfsmp, struct vnode *dvp, struct vnode *vp, struct componentname *cnp, int skip_reserve); -extern int hfs_lookuplink(struct hfsmount *hfsmp, cnid_t linkfileid, +extern int hfs_lookup_siblinglinks(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid, cnid_t *nextlinkid); extern void hfs_privatedir_init(struct hfsmount *, enum privdirtype); diff --git a/bsd/hfs/hfs_attrlist.c b/bsd/hfs/hfs_attrlist.c index dc24f99a7..3f1e6da64 100644 --- a/bsd/hfs/hfs_attrlist.c +++ b/bsd/hfs/hfs_attrlist.c @@ -41,6 +41,7 @@ #include <sys/unistd.h> #include <sys/mount_internal.h> #include <sys/kauth.h> +#include <sys/fsctl.h> #include <kern/locks.h> @@ -80,7 +81,6 @@ static u_int32_t hfs_real_user_access(vnode_t vp, vfs_context_t ctx); * apply for the file system you are doing the readdirattr on. To make life * simpler, this call will only return entries in its directory, hfs like. 
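A quick sanity check on the pair of constants introduced above for hfs_xattr.c: HFS_XATTR_MAXSIZE is 128KB, i.e. 2^17, so 18 bits is the smallest field that can represent every size up to and including the maximum. A compile-time sketch; the asserts are for illustration, xnu itself only defines the constants.

#include <stdio.h>

#define HFS_XATTR_MAXSIZE   (128 * 1024)
#define HFS_XATTR_SIZE_BITS 18

_Static_assert(HFS_XATTR_MAXSIZE <= (1 << HFS_XATTR_SIZE_BITS) - 1,
               "maximum xattr size must fit in HFS_XATTR_SIZE_BITS");
_Static_assert(HFS_XATTR_MAXSIZE > (1 << (HFS_XATTR_SIZE_BITS - 1)) - 1,
               "a 17-bit field would be one short of 131072");

int main(void)
{
    printf("%d fits in %d bits\n", HFS_XATTR_MAXSIZE, HFS_XATTR_SIZE_BITS);
    return 0;
}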
*/ -__private_extern__ int hfs_vnop_readdirattr(ap) struct vnop_readdirattr_args /* { @@ -138,6 +138,19 @@ hfs_vnop_readdirattr(ap) (alist->forkattr != 0)) { return (EINVAL); } + + if (VTOC(dvp)->c_flags & UF_COMPRESSED) { + int compressed = hfs_file_is_compressed(VTOC(dvp), 0); /* 0 == take the cnode lock */ + + if (!compressed) { + error = check_for_dataless_file(dvp, NAMESPACE_HANDLER_READ_OP); + if (error) { + return error; + } + } + } + + /* * Take an exclusive directory lock since we manipulate the directory hints */ @@ -256,12 +269,12 @@ hfs_vnop_readdirattr(ap) /* * Obtain vnode for our vnode_authorize() calls. */ - if (hfs_vget(hfsmp, cattrp->ca_fileid, &vp, 0) != 0) { + if (hfs_vget(hfsmp, cattrp->ca_fileid, &vp, 0, 0) != 0) { vp = NULL; } } else if (!(ap->a_options & FSOPT_NOINMEMUPDATE)) { /* Get in-memory cnode data (if any). */ - vp = hfs_chash_getvnode(hfsmp, cattrp->ca_fileid, 0, 0); + vp = hfs_chash_getvnode(hfsmp, cattrp->ca_fileid, 0, 0, 0); } if (vp != NULL) { cp = VTOC(vp); @@ -405,7 +418,7 @@ exit2: /* * Pack cnode attributes into an attribute block. */ - __private_extern__ +__private_extern__ void hfs_packattrblk(struct attrblock *abp, struct hfsmount *hfsmp, @@ -654,7 +667,10 @@ packcommonattr( } } if (ATTR_CMN_FNDRINFO & attr) { + u_int8_t *finfo = NULL; bcopy(&cap->ca_finderinfo, attrbufptr, sizeof(u_int8_t) * 32); + finfo = (u_int8_t*)attrbufptr; + /* Don't expose a symlink's private type/creator. */ if (S_ISLNK(cap->ca_mode)) { struct FndrFileInfo *fip; @@ -663,6 +679,18 @@ packcommonattr( fip->fdType = 0; fip->fdCreator = 0; } + + /* advance 16 bytes into the attrbuf */ + finfo = finfo + 16; + if (S_ISREG(cap->ca_mode)) { + struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; + extinfo->date_added = 0; + } + else if (S_ISDIR(cap->ca_mode)) { + struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; + extinfo->date_added = 0; + } + attrbufptr = (char *)attrbufptr + sizeof(u_int8_t) * 32; } if (ATTR_CMN_OWNERID & attr) { @@ -814,7 +842,10 @@ packfileattr( off_t datasize = datafork->cf_size; off_t totalsize = datasize + rsrcfork->cf_size; #if HFS_COMPRESSION - if ( cattrp->ca_flags & UF_COMPRESSED ) { + int handle_compressed; + handle_compressed = (cattrp->ca_flags & UF_COMPRESSED);// && hfs_file_is_compressed(VTOC(vp), 1); + + if (handle_compressed) { if (attr & (ATTR_FILE_DATALENGTH|ATTR_FILE_TOTALSIZE)) { if ( 0 == hfs_uncompressed_size_of_compressed_file(hfsmp, vp, cattrp->ca_fileid, &datasize, 1) ) { /* 1 == don't take the cnode lock */ /* total size of a compressed file is just the data size */ @@ -865,7 +896,7 @@ packfileattr( * passed by hfs_vnop_readdirattr() may be null. */ - if ( cattrp->ca_flags & UF_COMPRESSED ) { + if ( handle_compressed ) { if (attr & ATTR_FILE_DATAALLOCSIZE) { *((off_t *)attrbufptr) = (off_t)rsrcfork->cf_blocks * (off_t)allocblksize; attrbufptr = ((off_t *)attrbufptr) + 1; @@ -902,7 +933,7 @@ packfileattr( /* * Calculate the total size of an attribute block. */ - __private_extern__ +__private_extern__ int hfs_attrblksize(struct attrlist *attrlist) { @@ -1015,7 +1046,6 @@ hfs_real_user_access(vnode_t vp, vfs_context_t ctx) } -__private_extern__ u_int32_t DerivePermissionSummary(uid_t obj_uid, gid_t obj_gid, mode_t obj_mode, struct mount *mp, kauth_cred_t cred, __unused struct proc *p) diff --git a/bsd/hfs/hfs_btreeio.c b/bsd/hfs/hfs_btreeio.c index ddcb91277..7295cee54 100644 --- a/bsd/hfs/hfs_btreeio.c +++ b/bsd/hfs/hfs_btreeio.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. 
All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -53,8 +53,8 @@ extern int bdwrite_internal(struct buf *, int); static int ClearBTNodes(struct vnode *vp, long blksize, off_t offset, off_t amount); static int btree_journal_modify_block_end(struct hfsmount *hfsmp, struct buf *bp); +void btree_swap_node(struct buf *bp, __unused void *arg); -__private_extern__ OSStatus SetBTreeBlockSize(FileReference vp, ByteCount blockSize, __unused ItemCount minBlockCount) { BTreeControlBlockPtr bTreePtr; @@ -71,7 +71,6 @@ OSStatus SetBTreeBlockSize(FileReference vp, ByteCount blockSize, __unused ItemC } -__private_extern__ OSStatus GetBTreeBlock(FileReference vp, u_int32_t blockNum, GetBlockOptions options, BlockDescriptor *block) { OSStatus retval = E_NONE; @@ -165,7 +164,6 @@ OSStatus GetBTreeBlock(FileReference vp, u_int32_t blockNum, GetBlockOptions opt } -__private_extern__ void ModifyBlockStart(FileReference vp, BlockDescPtr blockPtr) { struct hfsmount *hfsmp = VTOHFS(vp); @@ -185,7 +183,7 @@ void ModifyBlockStart(FileReference vp, BlockDescPtr blockPtr) blockPtr->isModified = 1; } -static void +void btree_swap_node(struct buf *bp, __unused void *arg) { // struct hfsmount *hfsmp = (struct hfsmount *)arg; @@ -218,7 +216,6 @@ btree_journal_modify_block_end(struct hfsmount *hfsmp, struct buf *bp) } -__private_extern__ OSStatus ReleaseBTreeBlock(FileReference vp, BlockDescPtr blockPtr, ReleaseBlockOptions options) { struct hfsmount *hfsmp = VTOHFS(vp); @@ -331,7 +328,6 @@ exit: } -__private_extern__ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF) { #pragma unused (maxEOF) @@ -467,7 +463,7 @@ OSStatus ExtendBTreeFile(FileReference vp, FSSize minEOF, FSSize maxEOF) trim = ((filePtr->fcbEOF - origSize) % btInfo.nodeSize); } - ret = TruncateFileC(vcb, filePtr, filePtr->fcbEOF - trim, 0); + ret = TruncateFileC(vcb, filePtr, filePtr->fcbEOF - trim, 0, 0, FTOC(filePtr)->c_fileid, 0); filePtr->fcbEOF = (u_int64_t)filePtr->ff_blocks * (u_int64_t)vcb->blockSize; // XXXdbg - panic if the file didn't get trimmed back properly @@ -611,6 +607,8 @@ hfs_create_attr_btree(struct hfsmount *hfsmp, u_int32_t nodesize, u_int32_t node u_int16_t offset; int intrans = 0; int result; + int newvnode_flags = 0; + again: /* * Serialize creation using HFS_CREATING_BTREE flag. @@ -654,7 +652,8 @@ again: bzero(&cfork, sizeof(cfork)); cfork.cf_clump = nodesize * nodecnt; - result = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, &vp); + result = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, + &cfork, &vp, &newvnode_flags); if (result) { goto exit; } diff --git a/bsd/hfs/hfs_catalog.c b/bsd/hfs/hfs_catalog.c index 0ae0e2600..0ab6f4585 100644 --- a/bsd/hfs/hfs_catalog.c +++ b/bsd/hfs/hfs_catalog.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -90,10 +90,12 @@ u_char modetodirtype[16] = { #define MODE_TO_DT(mode) (modetodirtype[((mode) & S_IFMT) >> 12]) -static int cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int allow_system_files, u_int32_t hint, int wantrsrc, +#define HFS_LOOKUP_SYSFILE 0x1 /* If set, allow lookup of system files */ +#define HFS_LOOKUP_HARDLINK 0x2 /* If set, allow lookup of hard link records and not resolve the hard links */ +static int cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int flags, u_int32_t hint, int wantrsrc, struct cat_desc *descp, struct cat_attr *attrp, struct cat_fork *forkp, cnid_t *desc_cnid); -static int cat_lookupmangled(struct hfsmount *hfsmp, struct cat_desc *descp, int wantrsrc, +int cat_lookupmangled(struct hfsmount *hfsmp, struct cat_desc *descp, int wantrsrc, struct cat_desc *outdescp, struct cat_attr *attrp, struct cat_fork *forkp); /* Internal catalog support routines */ @@ -133,8 +135,9 @@ static int buildthread(void *keyp, void *recp, int std_hfs, int directory); static int cat_makealias(struct hfsmount *hfsmp, u_int32_t inode_num, struct HFSPlusCatalogFile *crp); +static int cat_update_internal(struct hfsmount *hfsmp, int update_hardlink, struct cat_desc *descp, struct cat_attr *attrp, + struct cat_fork *dataforkp, struct cat_fork *rsrcforkp); -__private_extern__ int cat_preflight(struct hfsmount *hfsmp, catops_t ops, cat_cookie_t *cookie, __unused proc_t p) { @@ -152,7 +155,6 @@ cat_preflight(struct hfsmount *hfsmp, catops_t ops, cat_cookie_t *cookie, __unus return MacToVFSError(result); } -__private_extern__ void cat_postflight(struct hfsmount *hfsmp, cat_cookie_t *cookie, __unused proc_t p) { @@ -167,8 +169,7 @@ cat_postflight(struct hfsmount *hfsmp, cat_cookie_t *cookie, __unused proc_t p) hfs_systemfile_unlock(hfsmp, lockflags); } - -__private_extern__ +__private_extern__ void cat_convertattr( struct hfsmount *hfsmp, @@ -297,7 +298,6 @@ cat_releasedesc(struct cat_desc *descp) * Note: The caller is responsible for releasing the output * catalog descriptor (when supplied outdescp is non-null). */ -__private_extern__ int cat_lookup(struct hfsmount *hfsmp, struct cat_desc *descp, int wantrsrc, struct cat_desc *outdescp, struct cat_attr *attrp, @@ -344,7 +344,6 @@ exit: return (result); } -__private_extern__ int cat_insertfilethread(struct hfsmount *hfsmp, struct cat_desc *descp) { @@ -409,7 +408,6 @@ exit: * catalog descriptor (when supplied outdescp is non-null). */ -__private_extern__ int cat_findname(struct hfsmount *hfsmp, cnid_t cnid, struct cat_desc *outdescp) { @@ -482,7 +480,6 @@ exit: * Note: The caller is responsible for releasing the output * catalog descriptor (when supplied outdescp is non-null). */ -__private_extern__ int cat_idlookup(struct hfsmount *hfsmp, cnid_t cnid, int allow_system_files, struct cat_desc *outdescp, struct cat_attr *attrp, struct cat_fork *forkp) @@ -543,7 +540,9 @@ cat_idlookup(struct hfsmount *hfsmp, cnid_t cnid, int allow_system_files, goto exit; } - result = cat_lookupbykey(hfsmp, keyp, allow_system_files, 0, 0, outdescp, attrp, forkp, NULL); + result = cat_lookupbykey(hfsmp, keyp, + ((allow_system_files != 0) ? HFS_LOOKUP_SYSFILE : 0), + 0, 0, outdescp, attrp, forkp, NULL); /* No corresponding file/folder record found for a thread record, * mark the volume inconsistent. 
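The consistency check added to cat_lookupbykey() above clamps the catalog's block count to what the resident extent records actually cover, and flags the volume for fsck, but only when slot 7 is empty: a non-zero final slot means more extents live in the overflow B-tree, so the resident sum alone proves nothing. A standalone sketch of that rule, with simplified stand-in types in place of xnu's:

#include <stdint.h>
#include <stdio.h>

struct extent { uint32_t startBlock; uint32_t blockCount; };  /* stand-in */

static uint32_t
clamp_fork_blocks(const struct extent ext[8], uint32_t cf_blocks, int *fsck)
{
    uint32_t validblks = 0;

    for (int i = 0; i < 8; i++)             /* sum the resident extents */
        validblks += ext[i].blockCount;

    /* Only conclusive when nothing spilled into the overflow extents tree. */
    if ((validblks < cf_blocks) && (ext[7].blockCount == 0)) {
        *fsck = 1;                          /* hfs_mark_volume_inconsistent() */
        return validblks;                   /* repair the runtime-only count */
    }
    return cf_blocks;
}

int main(void)
{
    struct extent ext[8] = { { 100, 8 } };  /* 8 blocks resident, rest empty */
    int fsck = 0;

    printf("blocks=%u fsck=%d\n", clamp_fork_blocks(ext, 10, &fsck), fsck);
    return 0;
}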
*/ @@ -569,7 +568,7 @@ exit: /* * cat_lookupmangled - lookup a catalog node using a mangled name */ -static int +int cat_lookupmangled(struct hfsmount *hfsmp, struct cat_desc *descp, int wantrsrc, struct cat_desc *outdescp, struct cat_attr *attrp, struct cat_fork *forkp) { @@ -625,7 +624,7 @@ falsematch: * cat_lookupbykey - lookup a catalog node using a cnode key */ static int -cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int allow_system_files, u_int32_t hint, int wantrsrc, +cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int flags, u_int32_t hint, int wantrsrc, struct cat_desc *descp, struct cat_attr *attrp, struct cat_fork *forkp, cnid_t *desc_cnid) { struct BTreeIterator * iterator; @@ -637,6 +636,7 @@ cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int allow_system_files u_int32_t ilink = 0; cnid_t cnid = 0; u_int32_t encoding = 0; + cnid_t parentid = 0; std_hfs = (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord); @@ -652,16 +652,18 @@ cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int allow_system_files if (result) goto exit; - /* Save the cnid and encoding now in case there's a hard link */ + /* Save the cnid, parentid, and encoding now in case there's a hard link or inode */ cnid = getcnid(recp); + if (!std_hfs) { + parentid = keyp->hfsPlus.parentID; + } encoding = getencoding(recp); hint = iterator->hint.nodeNum; /* Hide the journal files (if any) */ if ((hfsmp->jnl || ((HFSTOVCB(hfsmp)->vcbAtrb & kHFSVolumeJournaledMask) && (hfsmp->hfs_flags & HFS_READ_ONLY))) && ((cnid == hfsmp->hfs_jnlfileid) || (cnid == hfsmp->hfs_jnlinfoblkid)) && - !allow_system_files) { - + !(flags & HFS_LOOKUP_SYSFILE)) { result = ENOENT; goto exit; } @@ -674,7 +676,7 @@ cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int allow_system_files if (!std_hfs && (attrp || forkp) && (recp->recordType == kHFSPlusFileRecord) - && ((to_bsd_time(recp->hfsPlusFile.createDate) == (time_t)hfsmp->vcbCrDate) || + && ((to_bsd_time(recp->hfsPlusFile.createDate) == (time_t)hfsmp->hfs_itime) || (to_bsd_time(recp->hfsPlusFile.createDate) == (time_t)hfsmp->hfs_metadata_createdate))) { int isdirlink = 0; int isfilelink = 0; @@ -687,7 +689,7 @@ cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int allow_system_files (SWAP_BE32(recp->hfsPlusFile.userInfo.fdCreator) == kHFSAliasCreator)) { isdirlink = 1; } - if (isfilelink || isdirlink) { + if ((isfilelink || isdirlink) && !(flags & HFS_LOOKUP_HARDLINK)) { ilink = recp->hfsPlusFile.hl_linkReference; (void) cat_resolvelink(hfsmp, ilink, isdirlink, (struct HFSPlusCatalogFile *)recp); } @@ -701,8 +703,50 @@ cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int allow_system_files getbsdattr(hfsmp, &cnoderec, attrp); } else { getbsdattr(hfsmp, (struct HFSPlusCatalogFile *)recp, attrp); - if (ilink) + if (ilink) { + /* Update the inode number for this hard link */ attrp->ca_linkref = ilink; + } + + /* + * Set kHFSHasLinkChainBit for hard links, and reset it for all + * other items. Also set linkCount to 1 for regular files. + * + * Due to some bug (rdar://8505977), some regular files can have + * kHFSHasLinkChainBit set and linkCount more than 1 even if they + * are not really hard links. The runtime code should not consider + * these files has hard links. Therefore we reset the kHFSHasLinkChainBit + * and linkCount for regular file before we vend it out. This might + * also result in repairing the bad files on disk, if the corresponding + * file is modified and updated on disk. 
+ */ + if (ilink) { + /* This is a hard link and the link count bit was not set */ + if (!(attrp->ca_recflags & kHFSHasLinkChainMask)) { + printf ("hfs: set hardlink bit on vol=%s cnid=%u inoid=%u\n", hfsmp->vcbVN, cnid, ilink); + attrp->ca_recflags |= kHFSHasLinkChainMask; + } + } else { + /* Make sure that this non-hard link (regular) record is not + * an inode record or a valid hard link being that is not + * resolved for volume resize purposes. We do not want to + * reset the hard link bit or reset link count on these records. + */ + if (!(flags & HFS_LOOKUP_HARDLINK) && + (parentid != hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) && + (parentid != hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid)) { + /* This is not a hard link or inode and the link count bit was set */ + if (attrp->ca_recflags & kHFSHasLinkChainMask) { + printf ("hfs: clear hardlink bit on vol=%s cnid=%u\n", hfsmp->vcbVN, cnid); + attrp->ca_recflags &= ~kHFSHasLinkChainMask; + } + /* This is a regular file and the link count was more than 1 */ + if (S_ISREG(attrp->ca_mode) && (attrp->ca_linkcount > 1)) { + printf ("hfs: set linkcount=1 on vol=%s cnid=%u old=%u\n", hfsmp->vcbVN, cnid, attrp->ca_linkcount); + attrp->ca_linkcount = 1; + } + } + } } } if (forkp != NULL) { @@ -765,6 +809,22 @@ cat_lookupbykey(struct hfsmount *hfsmp, CatalogKey *keyp, int allow_system_files if ((validblks < forkp->cf_blocks) && (forkp->cf_extents[7].blockCount == 0)) { off_t psize; + /* + * This is technically a volume corruption. + * If the total number of blocks calculated by iterating + summing + * the extents in the resident extent records, is less than that + * which is reported in the catalog entry, we should force a fsck. + * Only modifying ca_blocks here is not guaranteed to make it out + * to disk; it is a runtime-only field. + * + * Note that we could have gotten into this state if we had invalid ranges + * that existed in borrowed blocks that somehow made it out to disk. + * The cnode's on disk block count should never be greater + * than that which is in its extent records. + */ + + (void) hfs_mark_volume_inconsistent (hfsmp); + forkp->cf_blocks = validblks; if (attrp != NULL) { attrp->ca_blocks = validblks + recp->hfsPlusFile.resourceFork.totalBlocks; @@ -813,7 +873,6 @@ exit: * The caller is responsible for releasing the output * catalog descriptor (when supplied outdescp is non-null). */ -__private_extern__ int cat_create(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attrp, struct cat_desc *out_descp) @@ -988,7 +1047,6 @@ exit: * Note: The caller is responsible for releasing the output * catalog descriptor (when supplied out_cdp is non-null). */ -__private_extern__ int cat_rename ( struct hfsmount * hfsmp, @@ -1287,7 +1345,6 @@ exit: * 2. BTDeleteRecord(thread); * 3. BTUpdateRecord(parent); */ -__private_extern__ int cat_delete(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attrp) { @@ -1378,12 +1435,13 @@ exit: /* - * cnode_update - update the catalog node described by descp - * using the data from attrp and forkp. + * cat_update_internal - update the catalog node described by descp + * using the data from attrp and forkp. + * If update_hardlink is true, the hard link catalog record is updated + * and not the inode catalog record. 
*/ -__private_extern__ -int -cat_update(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attrp, +static int +cat_update_internal(struct hfsmount *hfsmp, int update_hardlink, struct cat_desc *descp, struct cat_attr *attrp, struct cat_fork *dataforkp, struct cat_fork *rsrcforkp) { FCB * fcb; @@ -1408,13 +1466,14 @@ cat_update(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attr * For open-deleted files we need to do a lookup by cnid * (using thread rec). * - * For hard links, the target of the update is the inode - * itself (not the link record) so a lookup by fileid - * (i.e. thread rec) is needed. + * For hard links and if not requested by caller, the target + * of the update is the inode itself (not the link record) + * so a lookup by fileid (i.e. thread rec) is needed. */ - if ((descp->cd_cnid != attrp->ca_fileid) || - (descp->cd_namelen == 0) || - (attrp->ca_recflags & kHFSHasLinkChainMask)) { + if ((update_hardlink == false) && + ((descp->cd_cnid != attrp->ca_fileid) || + (descp->cd_namelen == 0) || + (attrp->ca_recflags & kHFSHasLinkChainMask))) { result = getkey(hfsmp, attrp->ca_fileid, (CatalogKey *)&iterator->key); } else { result = buildkey(hfsmp, descp, (HFSPlusCatalogKey *)&iterator->key, 0); @@ -1439,6 +1498,17 @@ exit: return MacToVFSError(result); } +/* + * cat_update - update the catalog node described by descp + * using the data from attrp and forkp. + */ +int +cat_update(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attrp, + struct cat_fork *dataforkp, struct cat_fork *rsrcforkp) +{ + return cat_update_internal(hfsmp, false, descp, attrp, dataforkp, rsrcforkp); +} + /* * catrec_update - Update the fields of a catalog record * This is called from within BTUpdateRecord. @@ -1585,6 +1655,7 @@ catrec_update(const CatalogKey *ckp, CatalogRecord *crp, struct update_state *st } case kHFSPlusFileRecord: { HFSPlusCatalogFile *file; + int is_dirlink; file = (struct HFSPlusCatalogFile *)crp; /* Do a quick sanity check */ @@ -1627,13 +1698,22 @@ catrec_update(const CatalogKey *ckp, CatalogRecord *crp, struct update_state *st * supplied values (which will be default), which has the * same effect as creating a new file while * MNT_UNKNOWNPERMISSIONS is set. + * + * Do not modify bsdInfo for directory hard link records. + * They are set during creation and are not modifiable, so just + * leave them alone. 
*/ - if ((file->bsdInfo.fileMode != 0) || - (attrp->ca_flags != 0) || - (attrp->ca_uid != hfsmp->hfs_uid) || - (attrp->ca_gid != hfsmp->hfs_gid) || - ((attrp->ca_mode & ALLPERMS) != - (hfsmp->hfs_file_mask & ACCESSPERMS))) { + is_dirlink = (file->flags & kHFSHasLinkChainMask) && + (SWAP_BE32(file->userInfo.fdType) == kHFSAliasType) && + (SWAP_BE32(file->userInfo.fdCreator) == kHFSAliasCreator); + + if (!is_dirlink && + ((file->bsdInfo.fileMode != 0) || + (attrp->ca_flags != 0) || + (attrp->ca_uid != hfsmp->hfs_uid) || + (attrp->ca_gid != hfsmp->hfs_gid) || + ((attrp->ca_mode & ALLPERMS) != + (hfsmp->hfs_file_mask & ACCESSPERMS)))) { if ((file->bsdInfo.fileMode == 0) || (((unsigned int)vfs_flags(HFSTOVFS(hfsmp))) & MNT_UNKNOWNPERMISSIONS) == 0) { file->bsdInfo.ownerID = attrp->ca_uid; @@ -1679,8 +1759,18 @@ catrec_update(const CatalogKey *ckp, CatalogRecord *crp, struct update_state *st /* Push out special field if necessary */ if (S_ISBLK(attrp->ca_mode) || S_ISCHR(attrp->ca_mode)) { file->bsdInfo.special.rawDevice = attrp->ca_rdev; - } else if (descp->cd_cnid != attrp->ca_fileid || attrp->ca_linkcount == 2) { - file->hl_linkCount = attrp->ca_linkcount; + } + else { + /* + * Protect against the degenerate case where the descriptor contains the + * raw inode ID in its CNID field. If the HFSPlusCatalogFile record indicates + * the linkcount was greater than 1 (the default value), then it must have become + * a hardlink. In this case, update the linkcount from the cat_attr passed in. + */ + if ((descp->cd_cnid != attrp->ca_fileid) || (attrp->ca_linkcount > 1 ) || + (file->hl_linkCount > 1)) { + file->hl_linkCount = attrp->ca_linkcount; + } } break; } @@ -1809,7 +1899,7 @@ cat_check_link_ancestry(struct hfsmount *hfsmp, cnid_t cnid, cnid_t pointed_at_c /* - * updatelink_callback - update a link's chain + * update_siblinglinks_callback - update a link's chain */ struct linkupdate_state { @@ -1819,12 +1909,12 @@ struct linkupdate_state { }; static int -updatelink_callback(__unused const CatalogKey *ckp, CatalogRecord *crp, struct linkupdate_state *state) +update_siblinglinks_callback(__unused const CatalogKey *ckp, CatalogRecord *crp, struct linkupdate_state *state) { HFSPlusCatalogFile *file; if (crp->recordType != kHFSPlusFileRecord) { - printf("hfs: updatelink_callback: unexpected rec type %d\n", crp->recordType); + printf("hfs: update_siblinglinks_callback: unexpected rec type %d\n", crp->recordType); return (btNotFound); } @@ -1837,17 +1927,16 @@ updatelink_callback(__unused const CatalogKey *ckp, CatalogRecord *crp, struct l file->hl_nextLinkID = state->nextlinkid; } } else { - printf("hfs: updatelink_callback: file %d isn't a chain\n", file->fileID); + printf("hfs: update_siblinglinks_callback: file %d isn't a chain\n", file->fileID); } return (0); } /* - * cat_updatelink - update a link's chain + * cat_update_siblinglinks - update a link's chain */ -__private_extern__ int -cat_updatelink(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t prevlinkid, cnid_t nextlinkid) +cat_update_siblinglinks(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t prevlinkid, cnid_t nextlinkid) { FCB * fcb; BTreeIterator * iterator; @@ -1859,24 +1948,25 @@ cat_updatelink(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t prevlinkid, cni state.prevlinkid = prevlinkid; state.nextlinkid = nextlinkid; - /* Borrow the btcb iterator since we have an exclusive catalog lock. 
*/ - iterator = &((BTreeControlBlockPtr)(fcb->ff_sysfileinfo))->iterator; - iterator->hint.nodeNum = 0; + /* Create an iterator for use by us temporarily */ + MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); + bzero(iterator, sizeof(*iterator)); result = getkey(hfsmp, linkfileid, (CatalogKey *)&iterator->key); if (result == 0) { - result = BTUpdateRecord(fcb, iterator, (IterateCallBackProcPtr)updatelink_callback, &state); + result = BTUpdateRecord(fcb, iterator, (IterateCallBackProcPtr)update_siblinglinks_callback, &state); (void) BTFlushPath(fcb); } else { - printf("hfs: cat_updatelink: couldn't resolve cnid %d\n", linkfileid); + printf("hfs: cat_update_siblinglinks: couldn't resolve cnid %d\n", linkfileid); } + + FREE (iterator, M_TEMP); return MacToVFSError(result); } /* * cat_lookuplink - lookup a link by it's name */ -__private_extern__ int cat_lookuplink(struct hfsmount *hfsmp, struct cat_desc *descp, cnid_t *linkfileid, cnid_t *prevlinkid, cnid_t *nextlinkid) { @@ -1888,9 +1978,9 @@ cat_lookuplink(struct hfsmount *hfsmp, struct cat_desc *descp, cnid_t *linkfilei fcb = hfsmp->hfs_catalog_cp->c_datafork; - /* Borrow the btcb iterator since we have an exclusive catalog lock. */ - iterator = &((BTreeControlBlockPtr)(fcb->ff_sysfileinfo))->iterator; - iterator->hint.nodeNum = 0; + /* Create an iterator for use by us temporarily */ + MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); + bzero(iterator, sizeof(*iterator)); if ((result = buildkey(hfsmp, descp, (HFSPlusCatalogKey *)&iterator->key, 0))) { goto exit; @@ -1914,16 +2004,16 @@ cat_lookuplink(struct hfsmount *hfsmp, struct cat_desc *descp, cnid_t *linkfilei *nextlinkid = 0; } exit: + FREE(iterator, M_TEMP); return MacToVFSError(result); } /* - * cat_lookuplink - lookup a link by its cnid + * cat_lookup_siblinglinks - lookup previous and next link ID for link using its cnid */ -__private_extern__ int -cat_lookuplinkbyid(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid, cnid_t *nextlinkid) +cat_lookup_siblinglinks(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid, cnid_t *nextlinkid) { FCB * fcb; BTreeIterator * iterator; @@ -1933,18 +2023,19 @@ cat_lookuplinkbyid(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid fcb = hfsmp->hfs_catalog_cp->c_datafork; - /* Borrow the btcb iterator since we have an exclusive catalog lock. */ - iterator = &((BTreeControlBlockPtr)(fcb->ff_sysfileinfo))->iterator; - iterator->hint.nodeNum = 0; + /* Create an iterator for use by us temporarily */ + MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); + bzero(iterator, sizeof(*iterator)); + if ((result = getkey(hfsmp, linkfileid, (CatalogKey *)&iterator->key))) { - printf("hfs: cat_lookuplinkbyid: getkey for %d failed %d\n", linkfileid, result); + printf("hfs: cat_lookup_siblinglinks: getkey for %d failed %d\n", linkfileid, result); goto exit; } BDINIT(btdata, &file); if ((result = BTSearchRecord(fcb, iterator, &btdata, NULL, NULL))) { - printf("hfs: cat_lookuplinkbyid: cannot find %d\n", linkfileid); + printf("hfs: cat_lookup_siblinglinks: cannot find %d\n", linkfileid); goto exit; } /* The prev/next chain is only valid when kHFSHasLinkChainMask is set. 
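The hunks above stop borrowing the catalog B-tree control block's shared iterator, which was only safe while the catalog lock was held exclusive, and instead allocate a zero-filled private iterator per call, freed on every exit path. A runnable userland analog of that discipline, with calloc/free standing in for MALLOC/bzero/FREE and a toy key fill in place of getkey()/BTUpdateRecord():

#include <stdio.h>
#include <stdlib.h>

typedef struct { unsigned key; unsigned hint; } BTreeIterator;  /* stand-in */

static int toy_getkey(unsigned id, unsigned *key) { *key = id; return 0; }

static int
update_siblinglinks(unsigned linkfileid)
{
    int result;

    /* MALLOC(iterator, ...) + bzero(iterator, ...) in the kernel version */
    BTreeIterator *iterator = calloc(1, sizeof(*iterator));
    if (iterator == NULL)
        return -1;

    result = toy_getkey(linkfileid, &iterator->key);
    if (result == 0) {
        /* ...BTUpdateRecord()/BTFlushPath() would run against the
         * private iterator here... */
        printf("updated link chain for cnid %u\n", linkfileid);
    }

    free(iterator);        /* FREE(iterator, M_TEMP): paired on every path */
    return result;
}

int main(void)
{
    return update_siblinglinks(57);
}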
*/ @@ -1953,7 +2044,7 @@ cat_lookuplinkbyid(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid parent = ((HFSPlusCatalogKey *)&iterator->key)->parentID; - /* ADL inodes don't have a chain (its in an EA) */ + /* directory inodes don't have a chain (its in an EA) */ if (parent == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { result = ENOLINK; /* signal to caller to get head of list */ } else if (parent == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) { @@ -1968,6 +2059,7 @@ cat_lookuplinkbyid(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid *nextlinkid = 0; } exit: + FREE(iterator, M_TEMP); return MacToVFSError(result); } @@ -1983,7 +2075,6 @@ exit: * ca_flags * ca_finderinfo (type and creator) */ -__private_extern__ int cat_createlink(struct hfsmount *hfsmp, struct cat_desc *descp, struct cat_attr *attrp, cnid_t nextlinkid, cnid_t *linkfileid) @@ -2278,7 +2369,6 @@ exit: /* * cat_deletelink - delete a link from the catalog */ -__private_extern__ int cat_deletelink(struct hfsmount *hfsmp, struct cat_desc *descp) { @@ -2455,7 +2545,6 @@ getentriesattr_callback(const CatalogKey *key, const CatalogRecord *rec, * * Note: index is zero relative */ -__private_extern__ int cat_getentriesattr(struct hfsmount *hfsmp, directoryhint_t *dirhint, struct cat_entrylist *ce_list) { @@ -2690,7 +2779,7 @@ getdirentries_callback(const CatalogKey *ckp, const CatalogRecord *crp, * regardless, so it's slightly safer to let that logic mark the boolean, * especially since it's closer to the return of this function. */ - + if (state->cbs_extended) { /* The last record has not been returned yet, so we * want to stop after packing the last item @@ -3043,7 +3132,6 @@ getdirentries_std_callback(const CatalogKey *ckp, const CatalogRecord *crp, /* * Pack a uio buffer with directory entries from the catalog */ -__private_extern__ int cat_getdirentries(struct hfsmount *hfsmp, int entrycnt, directoryhint_t *dirhint, uio_t uio, int extended, int * items, int * eofflag) @@ -3087,7 +3175,7 @@ cat_getdirentries(struct hfsmount *hfsmp, int entrycnt, directoryhint_t *dirhint * field to track whether or not we've returned EOF from the iterator function. */ state.cbs_eof = false; - + iterator = (BTreeIterator *) ((char *)state.cbs_linkinfo + (maxlinks * sizeof(linkinfo_t))); key = (CatalogKey *)&iterator->key; have_key = 0; @@ -3215,12 +3303,13 @@ cat_getdirentries(struct hfsmount *hfsmp, int entrycnt, directoryhint_t *dirhint /* Note that state.cbs_index is still valid on errors */ *items = state.cbs_index - index; index = state.cbs_index; - + /* * Also note that cbs_eof is set in all cases if we ever hit EOF * during the enumeration by the catalog callback. Mark the directory's hint * descriptor as having hit EOF. */ + if (state.cbs_eof) { dirhint->dh_desc.cd_flags |= CD_EOF; *eofflag = 1; @@ -3335,7 +3424,6 @@ cat_findposition(const CatalogKey *ckp, const CatalogRecord *crp, * The name portion of the key is compared using a 16-bit binary comparison. * This is called from the b-tree code. */ -__private_extern__ int cat_binarykeycompare(HFSPlusCatalogKey *searchKey, HFSPlusCatalogKey *trialKey) { @@ -3517,7 +3605,6 @@ buildkey(struct hfsmount *hfsmp, struct cat_desc *descp, /* * Resolve hard link reference to obtain the inode record. */ -__private_extern__ int cat_resolvelink(struct hfsmount *hfsmp, u_int32_t linkref, int isdirlink, struct HFSPlusCatalogFile *recp) { @@ -3657,7 +3744,6 @@ exit: * The key's parent id is the only part of the key expected to be used by the caller. 
* The name portion of the key may not always be valid (ie in the case of a hard link). */ -__private_extern__ int cat_getkeyplusattr(struct hfsmount *hfsmp, cnid_t cnid, CatalogKey * key, struct cat_attr *attrp) { @@ -3684,7 +3770,7 @@ cat_getkeyplusattr(struct hfsmount *hfsmp, cnid_t cnid, CatalogKey * key, struct * Pick up the first link in the chain and get a descriptor for it. * This allows blind bulk access checks to work for hardlinks. */ - if ((cat_lookuplinkbyid(hfsmp, cnid, &prevlinkid, &nextlinkid) == 0) && + if ((cat_lookup_siblinglinks(hfsmp, cnid, &prevlinkid, &nextlinkid) == 0) && (nextlinkid != 0)) { if (cat_findname(hfsmp, nextlinkid, &linkdesc) == 0) { key->hfsPlus.parentID = linkdesc.cd_parentcnid; @@ -4203,3 +4289,99 @@ isadir(const CatalogRecord *crp) crp->recordType == kHFSPlusFolderRecord); } +/* + * cat_lookup_dirlink - lookup a catalog record for directory hard link + * (not inode) using catalog record id. Note that this function does + * NOT resolve directory hard link to its directory inode and return + * the link record. + * + * Note: The caller is responsible for releasing the output catalog + * descriptor (when supplied outdescp is non-null). + */ +int +cat_lookup_dirlink(struct hfsmount *hfsmp, cnid_t dirlink_id, + u_int8_t forktype, struct cat_desc *outdescp, + struct cat_attr *attrp, struct cat_fork *forkp) +{ + struct BTreeIterator *iterator = NULL; + FSBufferDescriptor btdata; + u_int16_t datasize; + CatalogKey *keyp; + CatalogRecord *recp = NULL; + int error; + + /* No directory hard links on standard HFS */ + if (hfsmp->vcbSigWord == kHFSSigWord) { + return ENOTSUP; + } + + MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); + if (iterator == NULL) { + return ENOMEM; + } + bzero(iterator, sizeof(*iterator)); + buildthreadkey(dirlink_id, 1, (CatalogKey *)&iterator->key); + + MALLOC(recp, CatalogRecord *, sizeof(CatalogRecord), M_TEMP, M_WAITOK); + if (recp == NULL) { + error = ENOMEM; + goto out; + } + BDINIT(btdata, recp); + + error = BTSearchRecord(VTOF(HFSTOVCB(hfsmp)->catalogRefNum), iterator, + &btdata, &datasize, iterator); + if (error) { + goto out; + } + /* Directory hard links are catalog file record */ + if (recp->recordType != kHFSPlusFileThreadRecord) { + error = ENOENT; + goto out; + } + + keyp = (CatalogKey *)&recp->hfsPlusThread.reserved; + keyp->hfsPlus.keyLength = kHFSPlusCatalogKeyMinimumLength + + (keyp->hfsPlus.nodeName.length * 2); + if (forktype == kHFSResourceForkType) { + /* Lookup resource fork for directory hard link */ + error = cat_lookupbykey(hfsmp, keyp, HFS_LOOKUP_HARDLINK, 0, true, outdescp, attrp, forkp, NULL); + } else { + /* Lookup data fork, if any, for directory hard link */ + error = cat_lookupbykey(hfsmp, keyp, HFS_LOOKUP_HARDLINK, 0, false, outdescp, attrp, forkp, NULL); + } + if (error) { + printf ("hfs: cat_lookup_dirlink(): Error looking up file record for id=%u (error=%d)\n", dirlink_id, error); + hfs_mark_volume_inconsistent(hfsmp); + goto out; + } + /* Just for sanity, make sure that id in catalog record and thread record match */ + if ((outdescp != NULL) && (dirlink_id != outdescp->cd_cnid)) { + printf ("hfs: cat_lookup_dirlink(): Requested cnid=%u != found_cnid=%u\n", dirlink_id, outdescp->cd_cnid); + hfs_mark_volume_inconsistent(hfsmp); + error = ENOENT; + } + +out: + if (recp) { + FREE(recp, M_TEMP); + } + FREE(iterator, M_TEMP); + + return MacToVFSError(error); +} + +/* + * cnode_update_dirlink - update the catalog node for directory hard link + * described by descp using the data from 
attrp and forkp. + */ +int +cat_update_dirlink(struct hfsmount *hfsmp, u_int8_t forktype, + struct cat_desc *descp, struct cat_attr *attrp, struct cat_fork *forkp) +{ + if (forktype == kHFSResourceForkType) { + return cat_update_internal(hfsmp, true, descp, attrp, NULL, forkp); + } else { + return cat_update_internal(hfsmp, true, descp, attrp, forkp, NULL); + } +} diff --git a/bsd/hfs/hfs_catalog.h b/bsd/hfs/hfs_catalog.h index 6c1eaa130..e8574e17d 100644 --- a/bsd/hfs/hfs_catalog.h +++ b/bsd/hfs/hfs_catalog.h @@ -63,7 +63,7 @@ struct cat_desc { const u_int8_t * cd_nameptr; /* pointer to cnode name */ }; -/* cd_flags +/* cd_flags * * CD_EOF is used by hfs_vnop_readdir / cat_getdirentries to indicate EOF was * encountered during a directory enumeration. When this flag is observed @@ -258,6 +258,11 @@ union CatalogRecord { }; typedef union CatalogRecord CatalogRecord; +/* Constants for HFS fork types */ +enum { + kHFSDataForkType = 0x0, /* data fork */ + kHFSResourceForkType = 0xff /* resource fork */ +}; /* * Catalog Interface @@ -404,7 +409,7 @@ enum { extern int cat_deletelink( struct hfsmount *hfsmp, struct cat_desc *descp); -extern int cat_updatelink( struct hfsmount *hfsmp, +extern int cat_update_siblinglinks( struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t prevlinkid, cnid_t nextlinkid); @@ -415,11 +420,23 @@ extern int cat_lookuplink( struct hfsmount *hfsmp, cnid_t *prevlinkid, cnid_t *nextlinkid); -extern int cat_lookuplinkbyid( struct hfsmount *hfsmp, +extern int cat_lookup_siblinglinks( struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid, cnid_t *nextlinkid); +extern int cat_lookup_dirlink(struct hfsmount *hfsmp, + cnid_t dirlink_id, + u_int8_t forktype, + struct cat_desc *outdescp, + struct cat_attr *attrp, + struct cat_fork *forkp); + +extern int cat_update_dirlink(struct hfsmount *hfsmp, + u_int8_t forktype, + struct cat_desc *descp, + struct cat_attr *attrp, + struct cat_fork *rsrcforkp); #endif /* __APPLE_API_PRIVATE */ #endif /* KERNEL */ diff --git a/bsd/hfs/hfs_chash.c b/bsd/hfs/hfs_chash.c index b2db58e75..997d247ae 100644 --- a/bsd/hfs/hfs_chash.c +++ b/bsd/hfs/hfs_chash.c @@ -146,9 +146,8 @@ hfs_delete_chash(struct hfsmount *hfsmp) * * If it is in core, but locked, wait for it. */ -__private_extern__ struct vnode * -hfs_chash_getvnode(struct hfsmount *hfsmp, ino_t inum, int wantrsrc, int skiplock) +hfs_chash_getvnode(struct hfsmount *hfsmp, ino_t inum, int wantrsrc, int skiplock, int allow_deleted) { struct cnode *cp; struct vnode *vp; @@ -201,13 +200,15 @@ loop: * lock on the cnode which would allow the node to be * unlinked */ - if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { - if (!skiplock) - hfs_unlock(cp); - vnode_put(vp); - - return (NULL); - } + if (!allow_deleted) { + if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { + if (!skiplock) + hfs_unlock(cp); + vnode_put(vp); + + return (NULL); + } + } return (vp); } exit: @@ -218,8 +219,12 @@ exit: /* * Use the device, fileid pair to snoop an incore cnode. + * + * A cnode can exists in chash even after it has been + * deleted from the catalog, so this function returns + * ENOENT if C_NOEXIST is set in the cnode's flag. 
+ * */ -__private_extern__ int hfs_chash_snoop(struct hfsmount *hfsmp, ino_t inum, int (*callout)(const struct cat_desc *, const struct cat_attr *, void *), void * arg) @@ -237,6 +242,10 @@ hfs_chash_snoop(struct hfsmount *hfsmp, ino_t inum, int (*callout)(const struct for (cp = CNODEHASH(hfsmp, inum)->lh_first; cp; cp = cp->c_hash.le_next) { if (cp->c_fileid != inum) continue; + /* Skip cnodes that have been removed from the catalog */ + if (cp->c_flag & (C_NOEXISTS | C_DELETED)) { + break; + } /* Skip cnodes being created or reclaimed. */ if (!ISSET(cp->c_hflag, H_ALLOC | H_TRANSIT | H_ATTACH)) { result = callout(&cp->c_desc, &cp->c_attr, arg); @@ -257,10 +266,16 @@ hfs_chash_snoop(struct hfsmount *hfsmp, ino_t inum, int (*callout)(const struct * * If the cnode is C_DELETED, then return NULL since that * inum is no longer valid for lookups (open-unlinked file). + * + * If the cnode is C_DELETED but also marked C_RENAMED, then that means + * the cnode was renamed over and a new entry exists in its place. The caller + * should re-drive the lookup to get the newer entry. In that case, we'll still + * return NULL for the cnode, but also return GNV_CHASH_RENAMED in the output flags + * of this function to indicate to the caller that they should re-drive. */ -__private_extern__ struct cnode * -hfs_chash_getcnode(struct hfsmount *hfsmp, ino_t inum, struct vnode **vpp, int wantrsrc, int skiplock) +hfs_chash_getcnode(struct hfsmount *hfsmp, ino_t inum, struct vnode **vpp, + int wantrsrc, int skiplock, int *out_flags, int *hflags) { struct cnode *cp; struct cnode *ncp = NULL; @@ -295,6 +310,7 @@ loop_with_lock: * The desired vnode isn't there so tag the cnode. */ SET(cp->c_hflag, H_ATTACH); + *hflags |= H_ATTACH; hfs_chash_unlock(hfsmp); } else { @@ -311,7 +327,7 @@ loop_with_lock: * this cnode and add it to the hash * just dump our allocation */ - FREE_ZONE(ncp, sizeof(struct cnode), M_HFSNODE); + FREE_ZONE(ncp, sizeof(struct cnode), M_HFSNODE); ncp = NULL; } @@ -330,13 +346,19 @@ loop_with_lock: * is no longer valid for lookups.
*/ if ((cp->c_flag & (C_NOEXISTS | C_DELETED)) && !wantrsrc) { + int renamed = 0; + if (cp->c_flag & C_RENAMED) { + renamed = 1; + } if (!skiplock) hfs_unlock(cp); if (vp != NULLVP) { vnode_put(vp); } else { hfs_chash_lock_spin(hfsmp); - CLR(cp->c_hflag, H_ATTACH); + CLR(cp->c_hflag, H_ATTACH); + *hflags &= ~H_ATTACH; + if (ISSET(cp->c_hflag, H_WAITING)) { CLR(cp->c_hflag, H_WAITING); wakeup((caddr_t)cp); @@ -345,6 +367,9 @@ loop_with_lock: } vp = NULL; cp = NULL; + if (renamed) { + *out_flags = GNV_CHASH_RENAMED; + } } *vpp = vp; return (cp); @@ -358,8 +383,7 @@ loop_with_lock: if (ncp == NULL) { hfs_chash_unlock(hfsmp); - - MALLOC_ZONE(ncp, struct cnode *, sizeof(struct cnode), M_HFSNODE, M_WAITOK); + MALLOC_ZONE(ncp, struct cnode *, sizeof(struct cnode), M_HFSNODE, M_WAITOK); /* * since we dropped the chash lock, * we need to go back and re-verify @@ -372,6 +396,7 @@ loop_with_lock: bzero(ncp, sizeof(struct cnode)); SET(ncp->c_hflag, H_ALLOC); + *hflags |= H_ALLOC; ncp->c_fileid = inum; TAILQ_INIT(&ncp->c_hintlist); /* make the list empty */ TAILQ_INIT(&ncp->c_originlist); diff --git a/bsd/hfs/hfs_cnode.c b/bsd/hfs/hfs_cnode.c index c17c8d4dd..8703ecb9b 100644 --- a/bsd/hfs/hfs_cnode.c +++ b/bsd/hfs/hfs_cnode.c @@ -36,6 +36,7 @@ #include <sys/ubc.h> #include <sys/quota.h> #include <sys/kdebug.h> +#include <libkern/OSByteOrder.h> #include <kern/locks.h> @@ -46,6 +47,7 @@ #include <hfs/hfs_catalog.h> #include <hfs/hfs_cnode.h> #include <hfs/hfs_quota.h> +#include <hfs/hfs_format.h> extern int prtactive; @@ -53,88 +55,192 @@ extern lck_attr_t * hfs_lock_attr; extern lck_grp_t * hfs_mutex_group; extern lck_grp_t * hfs_rwlock_group; -static int hfs_filedone(struct vnode *vp, vfs_context_t context); - static void hfs_reclaim_cnode(struct cnode *); - +static int hfs_cnode_teardown (struct vnode *vp, vfs_context_t ctx, int reclaim); static int hfs_isordered(struct cnode *, struct cnode *); -inline int hfs_checkdeleted (struct cnode *cp) { - return ((cp->c_flag & (C_DELETED | C_NOEXISTS)) ? ENOENT : 0); +__inline__ int hfs_checkdeleted (struct cnode *cp) { + return ((cp->c_flag & (C_DELETED | C_NOEXISTS)) ? ENOENT : 0); } /* - * Last reference to an cnode. If necessary, write or delete it. + * Function used by a special fcntl() that decorates a cnode/vnode that + * indicates it is backing another filesystem, like a disk image. + * + * The argument 'val' indicates whether or not to set the bit in the cnode flags. + * + * Returns non-zero on failure, 0 on success. */ -__private_extern__ -int -hfs_vnop_inactive(struct vnop_inactive_args *ap) -{ - struct vnode *vp = ap->a_vp; - struct cnode *cp; - struct hfsmount *hfsmp = VTOHFS(vp); - struct proc *p = vfs_context_proc(ap->a_context); - int error = 0; - int recycle = 0; - int forkcount = 0; - int truncated = 0; - int started_tr = 0; - int took_trunc_lock = 0; - cat_cookie_t cookie; - int cat_reserve = 0; - int lockflags; - enum vtype v_type; - - v_type = vnode_vtype(vp); +int hfs_set_backingstore (struct vnode *vp, int val) { + struct cnode *cp = NULL; + int err = 0; + cp = VTOC(vp); - - if ((hfsmp->hfs_flags & HFS_READ_ONLY) || vnode_issystem(vp) || - (hfsmp->hfs_freezing_proc == p)) { - return (0); + if (vnode_isdir(vp)) { + return EINVAL; } - /* - * Ignore nodes related to stale file handles.
- * We are peeking at the cnode flag without the lock, but if C_NOEXISTS - * is set, that means the cnode doesn't have any backing store in the - * catalog anymore, and is otherwise safe to force a recycle - */ + /* lock the cnode */ + err = hfs_lock (cp, HFS_EXCLUSIVE_LOCK); + if (err) { + return err; + } - if (cp->c_flag & C_NOEXISTS) { - vnode_recycle(vp); - return (0); + if (val) { + cp->c_flag |= C_BACKINGSTORE; + } + else { + cp->c_flag &= ~C_BACKINGSTORE; } - if ((v_type == VREG || v_type == VLNK)) { - hfs_lock_truncate(cp, TRUE); - took_trunc_lock = 1; + /* unlock everything */ + hfs_unlock (cp); + + return err; +} + +/* + * Function used by a special fcntl() that checks to see whether a cnode/vnode + * indicates it is backing another filesystem, like a disk image. + * + * The argument 'val' is an output argument indicating whether or not the bit is set. + * + * Returns non-zero on failure, 0 on success. + */ + +int hfs_is_backingstore (struct vnode *vp, int *val) { + struct cnode *cp = NULL; + int err = 0; + + if (!vnode_isreg(vp)) { + *val = 0; + return 0; } - (void) hfs_lock(cp, HFS_FORCE_LOCK); + cp = VTOC(vp); + + /* lock the cnode */ + err = hfs_lock (cp, HFS_SHARED_LOCK); + if (err) { + return err; + } + + if (cp->c_flag & C_BACKINGSTORE) { + *val = 1; + } + else { + *val = 0; + } + + /* unlock everything */ + hfs_unlock (cp); - if (cp->c_datafork) + return err; +} + + +/* + * hfs_cnode_teardown + * + * This is an internal function that is invoked from both hfs_vnop_inactive + * and hfs_vnop_reclaim. As VNOP_INACTIVE is not necessarily called from vnodes + * being recycled and reclaimed, it is important that we do any post-processing + * necessary for the cnode in both places. Important tasks include things such as + * releasing the blocks from an open-unlinked file when all references to it have dropped, + * and handling resource forks separately from data forks. + * + * Note that we take only the vnode as an argument here (rather than the cnode). + * Recall that each cnode supports two forks (rsrc/data), and we can always get the right + * cnode from either of the vnodes, but the reverse is not true -- we can't determine which + * vnode we need to reclaim if only the cnode is supplied. + * + * This function is idempotent and safe to call from both hfs_vnop_inactive and hfs_vnop_reclaim + * if both are invoked one right after the other. In the second call, most of this function's if() + * conditions will fail, since they apply generally to cnodes still marked with C_DELETED. + * As a quick check to see if this function is necessary, determine if the cnode is already + * marked C_NOEXISTS. If it is, then it is safe to skip this function. The only tasks that + * remain for cnodes marked in such a fashion are to tear down their fork references and + * release all directory hints and hardlink origins. However, both of those are done + * in hfs_vnop_reclaim. hfs_update, by definition, is not necessary if the cnode's catalog + * entry is no longer there. + * + * The 'reclaim' argument specifies whether or not we were called from hfs_vnop_reclaim. If we are + * invoked from hfs_vnop_reclaim, we cannot call functions like cluster_push since the UBC info + * is totally gone by that point. + * + * Assumes that both truncate and cnode locks for 'cp' are held.
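+ *
+ * A condensed sketch of the two call sites (drawn from the VNOPs below;
+ * error handling and the VREG/VLNK checks are elided):
+ *
+ *	hfs_vnop_inactive:
+ *		hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK);
+ *		hfs_lock(cp, HFS_FORCE_LOCK);
+ *		error = hfs_cnode_teardown(vp, ctx, 0);	(may ubc_setsize/cluster_push)
+ *
+ *	hfs_vnop_reclaim:
+ *		hfs_lock(cp, HFS_FORCE_LOCK);
+ *		if (!ISSET(cp->c_flag, C_NOEXISTS))
+ *			error = hfs_cnode_teardown(vp, ctx, 1);	(UBC info already gone)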
+ */ +static +int hfs_cnode_teardown (struct vnode *vp, vfs_context_t ctx, int reclaim) { + + int forkcount = 0; + enum vtype v_type; + struct cnode *cp; + int error = 0; + int started_tr = 0; + struct hfsmount *hfsmp = VTOHFS(vp); + struct proc *p = vfs_context_proc(ctx); + int truncated = 0; + cat_cookie_t cookie; + int cat_reserve = 0; + int lockflags; + int ea_error = 0; + + v_type = vnode_vtype(vp); + cp = VTOC(vp); + + if (cp->c_datafork) { ++forkcount; - if (cp->c_rsrcfork) + } + if (cp->c_rsrcfork) { ++forkcount; - + } + + /* - * We should lock cnode before checking the flags in the - * condition below and should unlock the cnode before calling - * ubc_setsize() as cluster code can call other HFS vnops which - * will try to acquire the same cnode lock and cause deadlock. - * Only call ubc_setsize to 0 if we are the last fork. - */ - if ((v_type == VREG || v_type == VLNK) && + * Skip the call to ubc_setsize if we're being invoked on behalf of reclaim. + * The dirty regions would have already been synced to disk, so informing UBC + * that they can toss the pages doesn't help anyone at this point. + * + * Note that this is a performance problem if the vnode goes straight to reclaim + * (and skips inactive), since there would be no way for anyone to notify the UBC + * that all pages in this file are basically useless. + */ + if (reclaim == 0) { + /* + * Check whether we are tearing down a cnode with only one remaining fork. + * If there are blocks in its filefork, then we need to unlock the cnode + * before calling ubc_setsize. The cluster layer may re-enter the filesystem + * (i.e. VNOP_BLOCKMAP), and if we retain the cnode lock, we could hit a double-lock + * panic. + */ + + if ((v_type == VREG || v_type == VLNK) && (cp->c_flag & C_DELETED) && (VTOF(vp)->ff_blocks != 0) && (forkcount == 1)) { + hfs_unlock(cp); + /* ubc_setsize just fails if we were to call this from VNOP_RECLAIM */ + ubc_setsize(vp, 0); + (void) hfs_lock(cp, HFS_FORCE_LOCK); + } } - - if (v_type == VREG && !ISSET(cp->c_flag, C_DELETED) && VTOF(vp)->ff_blocks) { - hfs_filedone(vp, ap->a_context); + + /* + * Push file data out for normal files that haven't been evicted from + * the namespace. We only do this if this function was not called from reclaim, + * because by that point the UBC information has been totally torn down. + * + * There should also be no way for a normal file that has NOT been deleted from + * the namespace to skip INACTIVE and go straight to RECLAIM. That race only happens + * when the file becomes open-unlinked. + */ + if ((v_type == VREG) && + (!ISSET(cp->c_flag, C_DELETED)) && + (!ISSET(cp->c_flag, C_NOEXISTS)) && + (VTOF(vp)->ff_blocks) && + (reclaim == 0)) { + hfs_filedone(vp, ctx); } /* * Remove any directory hints or cached origins @@ -145,12 +251,7 @@ hfs_vnop_inactive(struct vnop_inactive_args *ap) if (cp->c_flag & C_HARDLINK) { hfs_relorigins(cp); } - - /* Hurry the recycling process along if we're an open-unlinked file */ - if((v_type == VREG || v_type == VLNK) && (cp->c_flag & C_DELETED)) { - recycle = 1; - } - + /* * This check is slightly complicated. We should only truncate data * in very specific cases for open-unlinked files. This is because @@ -162,7 +263,7 @@ hfs_vnop_inactive(struct vnop_inactive_args *ap) * If we're the last fork, then we have cleaning up to do. * * A) last fork, and vp == c_vp - * Truncate away own fork dat. If rsrc fork is not in core, truncate it too. + * Truncate away own fork data.
If rsrc fork is not in core, truncate it too. * * B) last fork, and vp == c_rsrc_vp * Truncate ourselves, assume data fork has been cleaned due to C). @@ -177,192 +278,320 @@ hfs_vnop_inactive(struct vnop_inactive_args *ap) * D) not the last fork, vp == c_rsrc_vp * Don't enter the block below, just clean up vnode and push it out of core. */ - - if ((v_type == VREG || v_type == VLNK) && (cp->c_flag & C_DELETED) && - ((forkcount == 1) || (!VNODE_IS_RSRC(vp)))) { + + if ((v_type == VREG || v_type == VLNK) && + (cp->c_flag & C_DELETED) && + ((forkcount == 1) || (!VNODE_IS_RSRC(vp)))) { + + /* Truncate away our own fork data. (Case A, B, C above) */ if (VTOF(vp)->ff_blocks != 0) { /* * Since we're already inside a transaction, * tell hfs_truncate to skip the ubc_setsize. + * + * This truncate call (and the one below) is fine from VNOP_RECLAIM's + * context because we're only removing blocks, not zero-filling new + * ones. The C_DELETED check above makes things much simpler. */ - error = hfs_truncate(vp, (off_t)0, IO_NDELAY, 1, 0, ap->a_context); - if (error) + error = hfs_truncate(vp, (off_t)0, IO_NDELAY, 1, 0, ctx); + if (error) { goto out; + } truncated = 1; } - + /* - * If c_blocks > 0 and we are the last fork (data fork), then - * we can go and and truncate away the rsrc fork blocks if - * they were not in core. + * Truncate away the resource fork, if we represent the data fork and + * it is the last fork. That means, by definition, the rsrc fork is not in + * core. So we bring it into core, and then truncate it away. + * + * This is invoked via case A above only. */ if ((cp->c_blocks > 0) && (forkcount == 1) && (vp != cp->c_rsrc_vp)) { struct vnode *rvp = NULLVP; - + + /* + * It is safe for us to pass FALSE to the argument can_drop_lock + * on this call to hfs_vgetrsrc. We know that the resource fork does not + * exist in core, so we'll have to go to the catalog to retrieve its + * information. That will attach the resource fork vnode to our cnode. + */ error = hfs_vgetrsrc(hfsmp, vp, &rvp, FALSE, FALSE); - if (error) + if (error) { goto out; + } /* * Defer the vnode_put and ubc_setsize on rvp until hfs_unlock(). + * + * By bringing the vnode into core above, we may force hfs_vnop_reclaim + * to only partially finish if that's what called us. Bringing the + * resource fork into core results in a new rsrc vnode that will get + * immediately marked for termination below. It will get recycled/reclaimed + * as soon as possible, but that could cause another round of inactive and reclaim. */ cp->c_flag |= C_NEED_RVNODE_PUT | C_NEED_RSRC_SETSIZE; - error = hfs_truncate(rvp, (off_t)0, IO_NDELAY, 1, 0, ap->a_context); - if (error) + error = hfs_truncate(rvp, (off_t)0, IO_NDELAY, 1, 0, ctx); + if (error) { goto out; + } + + /* + * Note that the following call to vnode_recycle is safe from within the + * context of hfs_vnop_inactive or hfs_vnop_reclaim. It is being invoked + * on the RSRC fork vp (which is not our current vnode) As such, we hold + * an iocount on it and vnode_recycle will just add the MARKTERM bit at this + * point. + */ vnode_recycle(rvp); /* all done with this vnode */ } } - - // If needed, get rid of any xattrs that this file (or directory) may have. - // Note that this must happen outside of any other transactions - // because it starts/ends its own transactions and grabs its - // own locks. This is to prevent a file with a lot of attributes - // from creating a transaction that is too large (which panics). 
- // - if ((cp->c_attr.ca_recflags & kHFSHasAttributesMask) != 0 && - (cp->c_flag & C_DELETED) && (forkcount <= 1)) { - hfs_removeallattr(hfsmp, cp->c_fileid); - } - + /* - * Check for a postponed deletion. - * (only delete cnode when the last fork goes inactive) + * If we represent the last fork (or none in the case of a dir), + * and the cnode has become open-unlinked, + * AND it has EA's, then we need to get rid of them. + * + * Note that this must happen outside of any other transactions + * because it starts/ends its own transactions and grabs its + * own locks. This is to prevent a file with a lot of attributes + * from creating a transaction that is too large (which panics). */ - if ((cp->c_flag & C_DELETED) && (forkcount <= 1)) { - /* - * Mark cnode in transit so that no one can get this - * cnode from cnode hash. - */ - // hfs_chash_mark_in_transit(hfsmp, cp); - // XXXdbg - remove the cnode from the hash table since it's deleted - // otherwise someone could go to sleep on the cnode and not - // be woken up until this vnode gets recycled which could be - // a very long time... - hfs_chashremove(hfsmp, cp); - - cp->c_flag |= C_NOEXISTS; // XXXdbg - cp->c_rdev = 0; - - if (started_tr == 0) { - if (hfs_start_transaction(hfsmp) != 0) { - error = EINVAL; - goto out; - } - started_tr = 1; - } + if ((cp->c_attr.ca_recflags & kHFSHasAttributesMask) != 0 && + (cp->c_flag & C_DELETED) && + (forkcount <= 1)) { - /* - * Reserve some space in the Catalog file. - */ - if ((error = cat_preflight(hfsmp, CAT_DELETE, &cookie, p))) { + ea_error = hfs_removeallattr(hfsmp, cp->c_fileid); + } + + + /* + * If the cnode represented an open-unlinked file, then now + * actually remove the cnode's catalog entry and release all blocks + * it may have been using. + */ + if ((cp->c_flag & C_DELETED) && (forkcount <= 1)) { + /* + * Mark cnode in transit so that no one can get this + * cnode from cnode hash. + */ + // hfs_chash_mark_in_transit(hfsmp, cp); + // XXXdbg - remove the cnode from the hash table since it's deleted + // otherwise someone could go to sleep on the cnode and not + // be woken up until this vnode gets recycled which could be + // a very long time... + hfs_chashremove(hfsmp, cp); + + cp->c_flag |= C_NOEXISTS; // XXXdbg + cp->c_rdev = 0; + + if (started_tr == 0) { + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + goto out; + } + started_tr = 1; + } + + /* + * Reserve some space in the Catalog file. + */ + if ((error = cat_preflight(hfsmp, CAT_DELETE, &cookie, p))) { + goto out; + } + cat_reserve = 1; + + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); + + if (cp->c_blocks > 0) { + printf("hfs_inactive: deleting non-empty%sfile %d, " + "blks %d\n", VNODE_IS_RSRC(vp) ? " rsrc " : " ", + (int)cp->c_fileid, (int)cp->c_blocks); + } + + // + // release the name pointer in the descriptor so that + // cat_delete() will use the file-id to do the deletion. + // in the case of hard links this is imperative (in the + // case of regular files the fileid and cnid are the + // same so it doesn't matter). + // + cat_releasedesc(&cp->c_desc); + + /* + * The descriptor name may be zero, + * in which case the fileid is used. 
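+ *
+ * Condensed ordering sketch of the surrounding block -- the catalog
+ * delete happens inside the transaction with the catalog and attribute
+ * B-trees held exclusive, and the name is released first so that
+ * cat_delete() keys off the fileid (imperative for hard links):
+ *
+ *	hfs_start_transaction(hfsmp);
+ *	cat_preflight(hfsmp, CAT_DELETE, &cookie, p);
+ *	lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE,
+ *	                                HFS_EXCLUSIVE_LOCK);
+ *	cat_releasedesc(&cp->c_desc);
+ *	error = cat_delete(hfsmp, &cp->c_desc, &cp->c_attr);
+ *	hfs_systemfile_unlock(hfsmp, lockflags);
+ *	hfs_end_transaction(hfsmp);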
+ */ + error = cat_delete(hfsmp, &cp->c_desc, &cp->c_attr); + + if (error && truncated && (error != ENXIO)) + printf("hfs_inactive: couldn't delete a truncated file!"); + + /* Update HFS Private Data dir */ + if (error == 0) { + hfsmp->hfs_private_attr[FILE_HARDLINKS].ca_entries--; + if (vnode_isdir(vp)) { + DEC_FOLDERCOUNT(hfsmp, hfsmp->hfs_private_attr[FILE_HARDLINKS]); + } + (void)cat_update(hfsmp, &hfsmp->hfs_private_desc[FILE_HARDLINKS], + &hfsmp->hfs_private_attr[FILE_HARDLINKS], NULL, NULL); + } + + hfs_systemfile_unlock(hfsmp, lockflags); + + if (error) { goto out; } - cat_reserve = 1; - - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); - - if (cp->c_blocks > 0) { - printf("hfs_inactive: deleting non-empty%sfile %d, " - "blks %d\n", VNODE_IS_RSRC(vp) ? " rsrc " : " ", - (int)cp->c_fileid, (int)cp->c_blocks); - } - - // - // release the name pointer in the descriptor so that - // cat_delete() will use the file-id to do the deletion. - // in the case of hard links this is imperative (in the - // case of regular files the fileid and cnid are the - // same so it doesn't matter). - // - cat_releasedesc(&cp->c_desc); - /* - * The descriptor name may be zero, - * in which case the fileid is used. - */ - error = cat_delete(hfsmp, &cp->c_desc, &cp->c_attr); +#if QUOTA + if (hfsmp->hfs_flags & HFS_QUOTAS) + (void)hfs_chkiq(cp, -1, NOCRED, 0); +#endif /* QUOTA */ - if (error && truncated && (error != ENXIO)) - printf("hfs_inactive: couldn't delete a truncated file!"); - - /* Update HFS Private Data dir */ - if (error == 0) { - hfsmp->hfs_private_attr[FILE_HARDLINKS].ca_entries--; - if (vnode_isdir(vp)) { - DEC_FOLDERCOUNT(hfsmp, hfsmp->hfs_private_attr[FILE_HARDLINKS]); + /* Already set C_NOEXISTS at the beginning of this block */ + cp->c_flag &= ~C_DELETED; + cp->c_touch_chgtime = TRUE; + cp->c_touch_modtime = TRUE; + + if (error == 0) + hfs_volupdate(hfsmp, (v_type == VDIR) ? VOL_RMDIR : VOL_RMFILE, 0); + } + + /* + * A file may have had delayed allocations, in which case hfs_update + * would not have updated the catalog record (cat_update). We need + * to do that now, before we lose our fork data. We also need to + * force the update, or hfs_update will again skip the cat_update. + * + * If the file has C_NOEXISTS set, then we can skip the hfs_update call + * because the catalog entry has already been removed. There would be no point + * to looking up the entry in the catalog to modify it when we already know it's gone + */ + if ((!ISSET(cp->c_flag, C_NOEXISTS)) && + ((cp->c_flag & C_MODIFIED) || cp->c_touch_acctime || + cp->c_touch_chgtime || cp->c_touch_modtime)) { + + if ((cp->c_flag & C_MODIFIED) || cp->c_touch_modtime){ + cp->c_flag |= C_FORCEUPDATE; } - (void)cat_update(hfsmp, &hfsmp->hfs_private_desc[FILE_HARDLINKS], - &hfsmp->hfs_private_attr[FILE_HARDLINKS], NULL, NULL); + hfs_update(vp, 0); } + +out: + if (cat_reserve) + cat_postflight(hfsmp, &cookie, p); + + // XXXdbg - have to do this because a goto could have come here + if (started_tr) { + hfs_end_transaction(hfsmp); + started_tr = 0; + } + + + return error; +} - hfs_systemfile_unlock(hfsmp, lockflags); - - if (error) - goto out; -#if QUOTA - if (hfsmp->hfs_flags & HFS_QUOTAS) - (void)hfs_chkiq(cp, -1, NOCRED, 0); -#endif /* QUOTA */ - - /* Already set C_NOEXISTS at the beginning of this block */ - cp->c_flag &= ~C_DELETED; - cp->c_touch_chgtime = TRUE; - cp->c_touch_modtime = TRUE; - if (error == 0) - hfs_volupdate(hfsmp, (v_type == VDIR) ? 
VOL_RMDIR : VOL_RMFILE, 0); - } +/* + * hfs_vnop_inactive + * + * The last usecount on the vnode has gone away, so we need to tear down + * any remaining data still residing in the cnode. If necessary, write out + * remaining blocks or delete the cnode's entry in the catalog. + */ +int +hfs_vnop_inactive(struct vnop_inactive_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct cnode *cp; + struct hfsmount *hfsmp = VTOHFS(vp); + struct proc *p = vfs_context_proc(ap->a_context); + int error = 0; + int took_trunc_lock = 0; + enum vtype v_type; + + v_type = vnode_vtype(vp); + cp = VTOC(vp); + if ((hfsmp->hfs_flags & HFS_READ_ONLY) || vnode_issystem(vp) || + (hfsmp->hfs_freezing_proc == p)) { + error = 0; + goto inactive_done; + } + /* - * A file may have had delayed allocations, in which case hfs_update - * would not have updated the catalog record (cat_update). We need - * to do that now, before we lose our fork data. We also need to - * force the update, or hfs_update will again skip the cat_update. + * For safety, do NOT call vnode_recycle from inside this function. This can cause + * problems in the following scenario: + * + * vnode_create -> vnode_reclaim_internal -> vclean -> VNOP_INACTIVE + * + * If we're being invoked as a result of a reclaim that was already in-flight, then we + * cannot call vnode_recycle again. Being in reclaim means that there are no usecounts or + * iocounts by definition. As a result, if we were to call vnode_recycle, it would immediately + * try to re-enter reclaim again and panic. + * + * Currently, there are three things that can cause us (VNOP_INACTIVE) to get called. + * 1) last usecount goes away on the vnode (vnode_rele) + * 2) last iocount goes away on a vnode that previously had usecounts but didn't have + * vnode_recycle called (vnode_put) + * 3) vclean by way of reclaim + * + * In this function we would generally want to call vnode_recycle to speed things + * along to ensure that we don't leak blocks due to open-unlinked files. However, by + * virtue of being in this function already, we can call hfs_cnode_teardown, which + * will release blocks held by open-unlinked files, and mark them C_NOEXISTS so that + * there's no entry in the catalog and no backing store anymore. If that's the case, + * then we really don't care all that much when the vnode actually goes through reclaim. + * Further, the HFS VNOPs that manipulated the namespace in order to create the open- + * unlinked file in the first place should have already called vnode_recycle on the vnode + * to guarantee that it would go through reclaim in a speedy way. */ - if ((cp->c_flag & C_MODIFIED) || - cp->c_touch_acctime || cp->c_touch_chgtime || cp->c_touch_modtime) { - if ((cp->c_flag & C_MODIFIED) || cp->c_touch_modtime){ - cp->c_flag |= C_FORCEUPDATE; - } - hfs_update(vp, 0); + + if (cp->c_flag & C_NOEXISTS) { + /* + * If the cnode has already had its cat entry removed, then + * just skip to the end. We don't need to do anything here. + */ + error = 0; + goto inactive_done; } -out: - if (cat_reserve) - cat_postflight(hfsmp, &cookie, p); - - // XXXdbg - have to do this because a goto could have come here - if (started_tr) { - hfs_end_transaction(hfsmp); - started_tr = 0; + + if ((v_type == VREG || v_type == VLNK)) { + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK); + took_trunc_lock = 1; } + + (void) hfs_lock(cp, HFS_FORCE_LOCK); + /* - * This has been removed from the namespace and has no backing store - * in the catalog, so we should force a reclaim as soon as possible. 
- * Also, we want to check the flag while we still have the cnode lock. + * Call cnode_teardown to push out dirty blocks to disk, release open-unlinked + * files' blocks from being in use, and move the cnode from C_DELETED to C_NOEXISTS. */ - if (cp->c_flag & C_NOEXISTS) - recycle = 1; + error = hfs_cnode_teardown (vp, ap->a_context, 0); + /* + * Drop the truncate lock before unlocking the cnode + * (which can potentially perform a vnode_put and + * recycle the vnode which in turn might require the + * truncate lock) + */ + if (took_trunc_lock) { + hfs_unlock_truncate(cp, 0); + } + hfs_unlock(cp); - - if (took_trunc_lock) - hfs_unlock_truncate(cp, TRUE); - - /* - * If we are done with the vnode, reclaim it - * so that it can be reused immediately. - */ - if (recycle) - vnode_recycle(vp); - - return (error); + +inactive_done: + + return error; } + /* * File clean-up (zero fill and shrink peof). */ -static int + +int hfs_filedone(struct vnode *vp, vfs_context_t context) { struct cnode *cp; @@ -371,6 +600,8 @@ hfs_filedone(struct vnode *vp, vfs_context_t context) struct rl_entry *invalid_range; off_t leof; u_int32_t blks, blocksize; + int cluster_flags = IO_CLOSE; + int cluster_zero_flags = IO_HEADZEROFILL | IO_NOZERODIRTY | IO_NOCACHE; cp = VTOC(vp); fp = VTOF(vp); @@ -380,8 +611,18 @@ hfs_filedone(struct vnode *vp, vfs_context_t context) if ((hfsmp->hfs_flags & HFS_READ_ONLY) || (fp->ff_blocks == 0)) return (0); + /* + * If we are being invoked from F_SWAPDATAEXTENTS, then we + * need to issue synchronous IO; Unless we are sure that all + * of the data has been written to the disk, we won't know + * that all of the blocks have been allocated properly. + */ + if (cp->c_flag & C_SWAPINPROGRESS) { + cluster_flags |= IO_SYNC; + } + hfs_unlock(cp); - (void) cluster_push(vp, IO_CLOSE); + (void) cluster_push(vp, cluster_flags); hfs_lock(cp, HFS_FORCE_LOCK); /* @@ -400,8 +641,7 @@ hfs_filedone(struct vnode *vp, vfs_context_t context) hfs_unlock(cp); (void) cluster_write(vp, (struct uio *) 0, - leof, end + 1, start, (off_t)0, - IO_HEADZEROFILL | IO_NOZERODIRTY | IO_NOCACHE); + leof, end + 1, start, (off_t)0, cluster_zero_flags); hfs_lock(cp, HFS_FORCE_LOCK); cp->c_flag |= C_MODIFIED; } @@ -417,7 +657,7 @@ hfs_filedone(struct vnode *vp, vfs_context_t context) if (blks < fp->ff_blocks) (void) hfs_truncate(vp, leof, IO_NDELAY, 0, 0, context); hfs_unlock(cp); - (void) cluster_push(vp, IO_CLOSE); + (void) cluster_push(vp, cluster_flags); hfs_lock(cp, HFS_FORCE_LOCK); /* @@ -435,7 +675,6 @@ hfs_filedone(struct vnode *vp, vfs_context_t context) /* * Reclaim a cnode so that it can be used for other purposes. */ -__private_extern__ int hfs_vnop_reclaim(struct vnop_reclaim_args *ap) { @@ -444,23 +683,30 @@ hfs_vnop_reclaim(struct vnop_reclaim_args *ap) struct filefork *fp = NULL; struct filefork *altfp = NULL; struct hfsmount *hfsmp = VTOHFS(vp); + vfs_context_t ctx = ap->a_context; int reclaim_cnode = 0; - - (void) hfs_lock(VTOC(vp), HFS_FORCE_LOCK); + int err = 0; + enum vtype v_type; + + v_type = vnode_vtype(vp); cp = VTOC(vp); - /* - * A file may have had delayed allocations, in which case hfs_update - * would not have updated the catalog record (cat_update). We need - * to do that now, before we lose our fork data. We also need to - * force the update, or hfs_update will again skip the cat_update. + /* + * We don't take the truncate lock since by the time reclaim comes along, + * all dirty pages have been synced and nobody should be competing + * with us for this thread. 
*/ - if ((cp->c_flag & C_MODIFIED) || - cp->c_touch_acctime || cp->c_touch_chgtime || cp->c_touch_modtime) { - if ((cp->c_flag & C_MODIFIED) || cp->c_touch_modtime){ - cp->c_flag |= C_FORCEUPDATE; - } - hfs_update(vp, 0); + (void) hfs_lock (cp, HFS_FORCE_LOCK); + + /* + * Sync to disk any remaining data in the cnode/vnode. This includes + * a call to hfs_update if the cnode has outbound data. + * + * If C_NOEXISTS is set on the cnode, then there's nothing teardown needs to do + * because the catalog entry for this cnode is already gone. + */ + if (!ISSET(cp->c_flag, C_NOEXISTS)) { + err = hfs_cnode_teardown(vp, ctx, 1); } /* @@ -525,7 +771,12 @@ hfs_vnop_reclaim(struct vnop_reclaim_args *ap) if (reclaim_cnode) { hfs_chashwakeup(hfsmp, cp, H_ALLOC | H_TRANSIT); hfs_reclaim_cnode(cp); - } else /* cnode in use */ { + } + else { + /* + * cnode in use. If it is a directory, it could have + * no live forks. Just release the lock. + */ hfs_unlock(cp); } @@ -546,7 +797,6 @@ extern int (**hfs_fifoop_p) (void *); * * The vnode is returned with an iocount and the cnode locked */ -__private_extern__ int hfs_getnewvnode( struct hfsmount *hfsmp, @@ -556,7 +806,8 @@ hfs_getnewvnode( int flags, struct cat_attr *attrp, struct cat_fork *forkp, - struct vnode **vpp) + struct vnode **vpp, + int *out_flags) { struct mount *mp = HFSTOVFS(hfsmp); struct vnode *vp = NULL; @@ -568,12 +819,13 @@ hfs_getnewvnode( int retval; int issystemfile; int wantrsrc; + int hflags = 0; struct vnode_fsparam vfsp; enum vtype vtype; #if QUOTA int i; #endif /* QUOTA */ - + hfs_standard = (hfsmp->hfs_flags & HFS_STANDARD); if (attrp->ca_fileid == 0) { @@ -591,6 +843,9 @@ hfs_getnewvnode( issystemfile = (descp->cd_flags & CD_ISMETA) && (vtype == VREG); wantrsrc = flags & GNV_WANTRSRC; + /* Zero out the out_flags */ + *out_flags = 0; + #ifdef HFS_CHECK_LOCK_ORDER /* * The only case were its permissible to hold the parent cnode @@ -607,7 +862,8 @@ hfs_getnewvnode( /* * Get a cnode (new or existing) */ - cp = hfs_chash_getcnode(hfsmp, attrp->ca_fileid, vpp, wantrsrc, (flags & GNV_SKIPLOCK)); + cp = hfs_chash_getcnode(hfsmp, attrp->ca_fileid, vpp, wantrsrc, + (flags & GNV_SKIPLOCK), out_flags, &hflags); /* * If the id is no longer valid for lookups we'll get back a NULL cp. @@ -615,20 +871,76 @@ hfs_getnewvnode( if (cp == NULL) { return (ENOENT); } - + /* - * Hardlinks may need an updated catalog descriptor. However, if - * the cnode has already been marked as open-unlinked (C_DELETED), then don't - * replace its descriptor. + * If we get a cnode/vnode pair out of hfs_chash_getcnode, then update the + * descriptor in the cnode as needed if the cnode represents a hardlink. + * We want the caller to get as up-to-date a copy of the descriptor + * as possible. However, we only do anything here if there was a valid vnode. + * If there isn't a vnode, then the cnode is brand new and needs to be initialized + * as it doesn't have a descriptor or cat_attr yet. + * + * If we are about to replace the descriptor with the user-supplied one, then validate + * that the descriptor correctly acknowledges this item is a hardlink. We could be + * subject to a race where the calling thread invoked cat_lookup, got a valid lookup + * result but the file was not yet a hardlink. With sufficient delay between there + * and here, we might accidentally copy in the raw inode ID into the descriptor in the + * call below. If the descriptor's CNID is the same as the fileID then it must + * not yet have been a hardlink when the lookup occurred.
*/ + if (!(hfs_checkdeleted(cp))) { if ((cp->c_flag & C_HARDLINK) && descp->cd_nameptr && descp->cd_namelen > 0) { - replace_desc(cp, descp); + /* If cnode is uninitialized, its c_attr will be zeroed out; cnids won't match. */ + if ((descp->cd_cnid == cp->c_attr.ca_fileid) && + (attrp->ca_linkcount != cp->c_attr.ca_linkcount)){ + if ((flags & GNV_SKIPLOCK) == 0) { + /* + * In that case we took the lock. Drop it before calling + * vnode_put, which may invoke hfs_vnop_inactive and need to take + * the cnode lock again. + */ + hfs_unlock(cp); + } + + /* + * Emit ERECYCLE and GNV_CAT_ATTRCHANGED to + * force a re-drive in the lookup routine. + * Drop the iocount on the vnode obtained from + * chash_getcnode if needed. + */ + if (*vpp != NULL) { + vnode_put (*vpp); + *vpp = NULL; + } + + /* + * If we raced with VNOP_RECLAIM for this vnode, the hash code could + * have observed it after the c_vp or c_rsrc_vp fields had been torn down; + * the hash code peeks at those fields without holding the cnode lock because + * it needs to be fast. As a result, we may have set H_ATTACH in the chash + * call above. Since we're bailing out, unset whatever flags we just set, and + * wake up all waiters for this cnode. + */ + if (hflags) { + hfs_chashwakeup(hfsmp, cp, hflags); + } + + *out_flags = GNV_CAT_ATTRCHANGED; + return ERECYCLE; + } + else { + /* Otherwise, CNID != fileid. Go ahead and copy in the new descriptor */ + replace_desc(cp, descp); + } } } + + /* Check if we found a matching vnode */ - if (*vpp != NULL) + if (*vpp != NULL) { return (0); + } /* * If this is a new cnode then initialize it. @@ -640,12 +952,38 @@ hfs_getnewvnode( #endif /* Make sure its still valid (ie exists on disk). */ - if (!(flags & GNV_CREATE) && - !hfs_valid_cnode(hfsmp, dvp, (wantrsrc ? NULL : cnp), cp->c_fileid)) { - hfs_chash_abort(hfsmp, cp); - hfs_reclaim_cnode(cp); - *vpp = NULL; - return (ENOENT); + if (!(flags & GNV_CREATE)) { + int error = 0; + if (!hfs_valid_cnode (hfsmp, dvp, (wantrsrc ? NULL : cnp), cp->c_fileid, attrp, &error)) { + hfs_chash_abort(hfsmp, cp); + hfs_reclaim_cnode(cp); + *vpp = NULL; + /* + * If we hit this case, that means that the entry was there in the catalog when + * we did a cat_lookup earlier. Think hfs_lookup. However, in between the time + * that we checked the catalog and the time we went to get a vnode/cnode for it, + * it had been removed from the namespace and the vnode totally reclaimed. As a result, + * it's not there in the catalog during the check in hfs_valid_cnode and we bubble out + * an ENOENT. To indicate to the caller that they should really double-check the + * entry (it could have been renamed over and gotten a new fileid), we mark a bit + * in the output flags. + */ + if (error == ENOENT) { + *out_flags = GNV_CAT_DELETED; + return ENOENT; + } + + /* + * Also, we need to protect the cat_attr acquired during hfs_lookup and passed into + * this function as an argument because the catalog may have changed w.r.t. hardlink + * link counts and the firstlink field. If that validation check fails, then let + * lookup re-drive itself to get valid/consistent data with the same failure condition below.
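+ *
+ * From the caller's side, the re-drive contract looks roughly like the
+ * sketch below (a hypothetical caller; the real re-drive logic lives in
+ * the lookup path):
+ *
+ *	int gnv_flags = 0;
+ *
+ *	error = hfs_getnewvnode(hfsmp, dvp, cnp, &desc, 0, &attr, &fork,
+ *	                        &vp, &gnv_flags);
+ *	if ((error == ERECYCLE) && (gnv_flags & GNV_CAT_ATTRCHANGED)) {
+ *		... re-run cat_lookup to refresh desc/attr/fork, then retry ...
+ *	}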
+ */ + if (error == ERECYCLE) { + *out_flags = GNV_CAT_ATTRCHANGED; + return (ERECYCLE); + } + } } bcopy(attrp, &cp->c_attr, sizeof(struct cat_attr)); bcopy(descp, &cp->c_desc, sizeof(struct cat_desc)); @@ -695,6 +1033,8 @@ hfs_getnewvnode( cp->c_dquot[i] = NODQUOT; } #endif /* QUOTA */ + /* Mark the output flag that we're vending a new cnode */ + *out_flags |= GNV_NEW_CNODE; } if (vtype == VDIR) { @@ -802,7 +1142,7 @@ hfs_getnewvnode( vfsp.vnfs_filesize = 0; vfsp.vnfs_flags = VNFS_ADDFSREF; - if (dvp == NULLVP || cnp == NULL || !(cnp->cn_flags & MAKEENTRY)) + if (dvp == NULLVP || cnp == NULL || !(cnp->cn_flags & MAKEENTRY) || (flags & GNV_NOCACHE)) vfsp.vnfs_flags |= VNFS_NOCACHE; /* Tag system files */ @@ -868,6 +1208,11 @@ hfs_getnewvnode( if (!(flags & GNV_CREATE) && (vtype != VDIR) && !issystemfile) { (void) hfs_removehotfile(vp); } + +#if CONFIG_PROTECT + if (!issystemfile && (*out_flags & GNV_NEW_CNODE)) + cp_entry_init(cp, mp); +#endif *vpp = vp; return (0); @@ -900,7 +1245,16 @@ hfs_reclaim_cnode(struct cnode *cp) cp->c_desc.cd_namelen = 0; vfs_removename(nameptr); } - + + /* + * We only call this function if we are in hfs_vnop_reclaim and + * attempting to reclaim a cnode with only one live fork. Because the vnode + * went through reclaim, any future attempts to use this item will have to + * go through lookup again, which will need to create a new vnode. Thus, + * destroying the locks below (while they were still held during our parent + * function hfs_vnop_reclaim) is safe. + */ + lck_rw_destroy(&cp->c_rwlock, hfs_rwlock_group); lck_rw_destroy(&cp->c_truncatelock, hfs_rwlock_group); #if HFS_COMPRESSION @@ -909,14 +1263,27 @@ hfs_reclaim_cnode(struct cnode *cp) FREE_ZONE(cp->c_decmp, sizeof(*(cp->c_decmp)), M_DECMPFS_CNODE); } #endif +#if CONFIG_PROTECT + cp_entry_destroy(cp); +#endif + + bzero(cp, sizeof(struct cnode)); FREE_ZONE(cp, sizeof(struct cnode), M_HFSNODE); } -__private_extern__ +/* + * hfs_valid_cnode + * + * This function is used to validate data that is stored in-core against what is contained + * in the catalog. Common uses include validating that the parent-child relationship still exists + * for a specific directory entry (guaranteeing it has not been renamed into a different spot) at + * the point of the check. + */ int -hfs_valid_cnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname *cnp, cnid_t cnid) +hfs_valid_cnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname *cnp, + cnid_t cnid, struct cat_attr *cattr, int *error) { struct cat_attr attr; struct cat_desc cndesc; @@ -924,34 +1291,181 @@ hfs_valid_cnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname int lockflags; /* System files are always valid */ - if (cnid < kHFSFirstUserCatalogNodeID) + if (cnid < kHFSFirstUserCatalogNodeID) { + *error = 0; return (1); + } /* XXX optimization: check write count in dvp */ lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); if (dvp && cnp) { + int lookup = 0; + struct cat_fork fork; + bzero(&cndesc, sizeof(cndesc)); cndesc.cd_nameptr = (const u_int8_t *)cnp->cn_nameptr; cndesc.cd_namelen = cnp->cn_namelen; cndesc.cd_parentcnid = VTOC(dvp)->c_fileid; cndesc.cd_hint = VTOC(dvp)->c_childhint; - if ((cat_lookup(hfsmp, &cndesc, 0, NULL, &attr, NULL, NULL) == 0) && - (cnid == attr.ca_fileid)) { + /* + * We have to be careful when calling cat_lookup. The result argument + * 'attr' may get different results based on whether or not you ask + * for the filefork to be supplied as output.
This is because cat_lookupbykey + * will attempt to do basic validation/smoke tests against the resident + * extents if there are no overflow extent records, but it needs someplace + * in memory to store the on-disk fork structures. + * + * Since hfs_lookup calls cat_lookup with a filefork argument, we should + * do the same here, to verify that block count differences are not + * due to calling the function with different styles. cat_lookupbykey + * will request the volume be fsck'd if there is true on-disk corruption + * where the number of blocks does not match the number generated by + * summing the number of blocks in the resident extents. + */ + + lookup = cat_lookup (hfsmp, &cndesc, 0, NULL, &attr, &fork, NULL); + if ((lookup == 0) && (cnid == attr.ca_fileid)) { stillvalid = 1; + *error = 0; + } + else { + *error = ENOENT; + } + + /* + * In hfs_getnewvnode, we may encounter a time-of-check vs. time-of-vnode creation + * race. Specifically, if there is no vnode/cnode pair for the directory entry + * being looked up, we have to go to the catalog. But since we don't hold any locks (aside + * from the dvp in 'shared' mode) there is nothing to protect us against the catalog record + * changing in between the time we do the cat_lookup there and the time we re-grab the + * catalog lock above to do another cat_lookup. + * + * However, we need to check more than just the CNID and parent-child name relationships above. + * Hardlinks can suffer the same race in the following scenario: Suppose we do a + * cat_lookup, and find a leaf record and a raw inode for a hardlink. Now, we have + * the cat_attr in hand (passed in above). But in between then and now, the vnode was + * created by a competing hfs_getnewvnode call, and is manipulated and reclaimed before we get + * a chance to do anything. This is possible if there are a lot of threads thrashing around + * with the cnode hash. In this case, if we don't check/validate the cat_attr in-hand, we will + * blindly stuff it into the cnode, which will make the in-core data inconsistent with what is + * on disk. So validate the cat_attr below, if required. This race cannot happen if the cnode/vnode + * already exists, as it does in the case of rename and delete. + */ + if (stillvalid && cattr != NULL) { + if (cattr->ca_linkcount != attr.ca_linkcount) { + stillvalid = 0; + *error = ERECYCLE; + goto notvalid; + } + + if (cattr->ca_union1.cau_linkref != attr.ca_union1.cau_linkref) { + stillvalid = 0; + *error = ERECYCLE; + goto notvalid; + } + + if (cattr->ca_union3.cau_firstlink != attr.ca_union3.cau_firstlink) { + stillvalid = 0; + *error = ERECYCLE; + goto notvalid; + } + + if (cattr->ca_union2.cau_blocks != attr.ca_union2.cau_blocks) { + stillvalid = 0; + *error = ERECYCLE; + goto notvalid; + } } } else { if (cat_idlookup(hfsmp, cnid, 0, NULL, NULL, NULL) == 0) { stillvalid = 1; + *error = 0; + } + else { + *error = ENOENT; } } +notvalid: hfs_systemfile_unlock(hfsmp, lockflags); return (stillvalid); } +/* + * Per HI and Finder requirements, HFS should add in the + * date/time that a particular directory entry was added + * to the containing directory. + * This is stored in the extended Finder Info for the + * item in question. + * + * Note that this field is also set explicitly in the hfs_vnop_setxattr code. + * We must ignore user attempts to set this part of the finderinfo, and + * so we need to save a local copy of the date added, write in the user + * finderinfo, then stuff the value back in. 
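+ *
+ * Layout sketch, condensed from the code below: the 32-byte FinderInfo
+ * is two 16-byte halves, and date_added lives in the extended (second)
+ * half, stored big endian:
+ *
+ *	u_int8_t *finfo = (u_int8_t *)attrp->ca_finderinfo + 16;
+ *	struct FndrExtendedFileInfo *extinfo =
+ *		(struct FndrExtendedFileInfo *)finfo;
+ *	u_int32_t added = OSSwapBigToHostInt32(extinfo->date_added);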
+ */ +void hfs_write_dateadded (struct cat_attr *attrp, u_int32_t dateadded) { + u_int8_t *finfo = NULL; + + /* overlay the FinderInfo to the correct pointer, and advance */ + finfo = (u_int8_t*)attrp->ca_finderinfo; + finfo = finfo + 16; + + /* + * Make sure to write it out as big endian, since that's how + * finder info is defined. + * + * NOTE: This is a Unix-epoch timestamp, not an HFS/Traditional Mac timestamp. + */ + if (S_ISREG(attrp->ca_mode)) { + struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; + extinfo->date_added = OSSwapHostToBigInt32(dateadded); + attrp->ca_recflags |= kHFSHasDateAddedMask; + } + else if (S_ISDIR(attrp->ca_mode)) { + struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; + extinfo->date_added = OSSwapHostToBigInt32(dateadded); + attrp->ca_recflags |= kHFSHasDateAddedMask; + } + + /* If it is neither a regular file nor a directory, write nothing */ + return; +} + +u_int32_t hfs_get_dateadded (struct cnode *cp) { + u_int8_t *finfo = NULL; + u_int32_t dateadded = 0; + + if ((cp->c_attr.ca_recflags & kHFSHasDateAddedMask) == 0) { + /* Date added was never set. Return 0. */ + return dateadded; + } + + + /* overlay the FinderInfo to the correct pointer, and advance */ + finfo = (u_int8_t*)cp->c_finderinfo; + finfo = finfo + 16; + + /* + * FinderInfo is written out in big endian... make sure to convert it to host + * native before we use it. + */ + if (S_ISREG(cp->c_attr.ca_mode)) { + struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; + dateadded = OSSwapBigToHostInt32 (extinfo->date_added); + } + else if (S_ISDIR(cp->c_attr.ca_mode)) { + struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; + dateadded = OSSwapBigToHostInt32 (extinfo->date_added); + } + + return dateadded; +} + + + /* * Touch cnode times based on c_touch_xxx flags * * cnode must be locked exclusive * * This will also update the volume modify time */ -__private_extern__ void hfs_touchtimes(struct hfsmount *hfsmp, struct cnode* cp) { + vfs_context_t ctx; /* don't modify times if volume is read-only */ if (hfsmp->hfs_flags & HFS_READ_ONLY) { cp->c_touch_acctime = FALSE; cp->c_touch_chgtime = FALSE; cp->c_touch_modtime = FALSE; + return; } else if (hfsmp->hfs_flags & HFS_STANDARD) { /* HFS Standard doesn't support access times */ cp->c_touch_acctime = FALSE; } + ctx = vfs_context_current(); /* * Skip access time updates if: * .
MNT_NOATIME is set @@ -985,10 +1501,13 @@ hfs_touchtimes(struct hfsmount *hfsmp, struct cnode* cp) if ((vfs_flags(hfsmp->hfs_mp) & MNT_NOATIME) || (hfsmp->hfs_freezing_proc != NULL) || (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) || - (cp->c_vp && vnode_israge(cp->c_vp))) + (cp->c_vp && ((vnode_israge(cp->c_vp) || (vfs_ctx_skipatime(ctx)))))) { + cp->c_touch_acctime = FALSE; + } } - if (cp->c_touch_acctime || cp->c_touch_chgtime || cp->c_touch_modtime) { + if (cp->c_touch_acctime || cp->c_touch_chgtime || + cp->c_touch_modtime || (cp->c_flag & C_NEEDS_DATEADDED)) { struct timeval tv; int touchvol = 0; @@ -1027,6 +1546,14 @@ hfs_touchtimes(struct hfsmount *hfsmp, struct cnode* cp) cp->c_flag |= C_MODIFIED; touchvol = 1; } + + if (cp->c_flag & C_NEEDS_DATEADDED) { + hfs_write_dateadded (&(cp->c_attr), tv.tv_sec); + cp->c_flag |= C_MODIFIED; + /* untwiddle the bit */ + cp->c_flag &= ~C_NEEDS_DATEADDED; + touchvol = 1; + } /* Touch the volume modtime if needed */ if (touchvol) { @@ -1039,7 +1566,6 @@ hfs_touchtimes(struct hfsmount *hfsmp, struct cnode* cp) /* * Lock a cnode. */ -__private_extern__ int hfs_lock(struct cnode *cp, enum hfslocktype locktype) { @@ -1122,7 +1648,6 @@ hfs_lock(struct cnode *cp, enum hfslocktype locktype) /* * Lock a pair of cnodes. */ -__private_extern__ int hfs_lockpair(struct cnode *cp1, struct cnode *cp2, enum hfslocktype locktype) { @@ -1182,7 +1707,6 @@ hfs_isordered(struct cnode *cp1, struct cnode *cp2) * - only one lock taken per cnode (dup cnodes are skipped) * - some of the cnode pointers may be null */ -__private_extern__ int hfs_lockfour(struct cnode *cp1, struct cnode *cp2, struct cnode *cp3, struct cnode *cp4, enum hfslocktype locktype, struct cnode **error_cnode) @@ -1245,7 +1769,6 @@ hfs_lockfour(struct cnode *cp1, struct cnode *cp2, struct cnode *cp3, /* * Unlock a cnode. */ -__private_extern__ void hfs_unlock(struct cnode *cp) { @@ -1299,7 +1822,6 @@ hfs_unlock(struct cnode *cp) /* * Unlock a pair of cnodes. */ -__private_extern__ void hfs_unlockpair(struct cnode *cp1, struct cnode *cp2) { @@ -1311,7 +1833,6 @@ hfs_unlockpair(struct cnode *cp1, struct cnode *cp2) /* * Unlock a group of cnodes. */ -__private_extern__ void hfs_unlockfour(struct cnode *cp1, struct cnode *cp2, struct cnode *cp3, struct cnode *cp4) { @@ -1358,34 +1879,119 @@ skip2: * * The process doing a truncation must take the lock * exclusive. The read/write processes can take it - * non-exclusive. + * shared. The locktype argument is the same as supplied to + * hfs_lock. */ -__private_extern__ void -hfs_lock_truncate(struct cnode *cp, int exclusive) +hfs_lock_truncate(struct cnode *cp, enum hfslocktype locktype) { -#ifdef HFS_CHECK_LOCK_ORDER - if (cp->c_lockowner == current_thread()) - panic("hfs_lock_truncate: cnode %p locked!", cp); -#endif /* HFS_CHECK_LOCK_ORDER */ + void * thread = current_thread(); - if (exclusive) - lck_rw_lock_exclusive(&cp->c_truncatelock); - else + if (cp->c_truncatelockowner == thread) { + /* + * Only HFS_RECURSE_TRUNCLOCK is allowed to recurse. + * + * This is needed on the hfs_vnop_pagein path where we need to ensure + * the file does not change sizes while we are paging in. However, + * we may already hold the lock exclusive due to another + * VNOP from earlier in the call stack. So if we already hold + * the truncate lock exclusive, allow it to proceed, but ONLY if + * it's in the recursive case. 
+ */ + if (locktype != HFS_RECURSE_TRUNCLOCK) { + panic("hfs_lock_truncate: cnode %p locked!", cp); + } + } + /* HFS_RECURSE_TRUNCLOCK takes a shared lock if it is not already locked */ + else if ((locktype == HFS_SHARED_LOCK) || (locktype == HFS_RECURSE_TRUNCLOCK)) { + lck_rw_lock_shared(&cp->c_truncatelock); + cp->c_truncatelockowner = HFS_SHARED_OWNER; + } + else { /* must be an HFS_EXCLUSIVE_LOCK */ + lck_rw_lock_exclusive(&cp->c_truncatelock); + cp->c_truncatelockowner = thread; + } } -__private_extern__ -void -hfs_unlock_truncate(struct cnode *cp, int exclusive) -{ - if (exclusive) { - lck_rw_unlock_exclusive(&cp->c_truncatelock); - } else { - lck_rw_unlock_shared(&cp->c_truncatelock); - } + +/* + * Attempt to get the truncate lock. If it cannot be acquired, error out. + * This function is needed in the degenerate hfs_vnop_pagein case during force + * unmount. To prevent deadlocks while a VM copy object is moving pages, HFS vnop pagein will + * temporarily need to disable V2 semantics. + */ +int hfs_try_trunclock (struct cnode *cp, enum hfslocktype locktype) { + void * thread = current_thread(); + boolean_t didlock = false; + + if (cp->c_truncatelockowner == thread) { + /* + * Only HFS_RECURSE_TRUNCLOCK is allowed to recurse. + * + * This is needed on the hfs_vnop_pagein path where we need to ensure + * the file does not change sizes while we are paging in. However, + * we may already hold the lock exclusive due to another + * VNOP from earlier in the call stack. So if we already hold + * the truncate lock exclusive, allow it to proceed, but ONLY if + * it's in the recursive case. + */ + if (locktype != HFS_RECURSE_TRUNCLOCK) { + panic("hfs_try_trunclock: cnode %p locked!", cp); + } + } + /* HFS_RECURSE_TRUNCLOCK takes a shared lock if it is not already locked */ + else if ((locktype == HFS_SHARED_LOCK) || (locktype == HFS_RECURSE_TRUNCLOCK)) { + didlock = lck_rw_try_lock(&cp->c_truncatelock, LCK_RW_TYPE_SHARED); + if (didlock) { + cp->c_truncatelockowner = HFS_SHARED_OWNER; + } + } + else { /* must be an HFS_EXCLUSIVE_LOCK */ + didlock = lck_rw_try_lock (&cp->c_truncatelock, LCK_RW_TYPE_EXCLUSIVE); + if (didlock) { + cp->c_truncatelockowner = thread; + } + } + + return didlock; +} + +/* + * Unlock the truncate lock, which protects against size changes. + * + * The been_recursed argument is used when we may need to return + * from this function without actually unlocking the truncate lock. + */ +void +hfs_unlock_truncate(struct cnode *cp, int been_recursed) +{ + void *thread = current_thread(); + /* + * If been_recursed is nonzero AND the current lock owner of the + * truncate lock is our current thread, then we must have recursively + * taken the lock earlier on. If the lock were unlocked, + * HFS_RECURSE_TRUNCLOCK took a shared lock and it would fall through + * to the SHARED case below. + * + * If been_recursed is zero (most of the time) then we check the + * lockowner field to infer whether the lock was taken exclusively or + * shared in order to know what underlying lock routine to call.
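+ *
+ * Typical recursive (pagein-style) pairing, per the rules above:
+ *
+ *	hfs_lock_truncate(cp, HFS_RECURSE_TRUNCLOCK);
+ *	... file size cannot change here ...
+ *	hfs_unlock_truncate(cp, 1);	(no-op if we already owned the
+ *					 lock exclusive on entry)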
+ */ + if (been_recursed) { + if (cp->c_truncatelockowner == thread) { + return; + } + } + /* HFS_LOCK_EXCLUSIVE */ + if (thread == cp->c_truncatelockowner) { + cp->c_truncatelockowner = NULL; + lck_rw_unlock_exclusive(&cp->c_truncatelock); + } + /* HFS_LOCK_SHARED */ + else { + lck_rw_unlock_shared(&cp->c_truncatelock); + } +} diff --git a/bsd/hfs/hfs_cnode.h b/bsd/hfs/hfs_cnode.h index 9ffb9a8ca..73c2f664a 100644 --- a/bsd/hfs/hfs_cnode.h +++ b/bsd/hfs/hfs_cnode.h @@ -45,6 +45,10 @@ #if HFS_COMPRESSION #include <sys/decmpfs.h> #endif +#if CONFIG_PROTECT +#include <sys/cprotect.h> +#endif + /* * The filefork is used to represent an HFS file fork (data or resource). @@ -106,6 +110,7 @@ struct cnode { lck_rw_t c_rwlock; /* cnode's lock */ void * c_lockowner; /* cnode's lock owner (exclusive case only) */ lck_rw_t c_truncatelock; /* protects file from truncation during read/write */ + void * c_truncatelockowner; /* truncate lock owner (exclusive case only) */ LIST_ENTRY(cnode) c_hash; /* cnode's hash chain */ u_int32_t c_flag; /* cnode's runtime flags */ u_int32_t c_hflag; /* cnode's flags for maintaining hash - protected by global hash lock */ @@ -132,6 +137,10 @@ struct cnode { #if HFS_COMPRESSION decmpfs_cnode *c_decmp; #endif /* HFS_COMPRESSION */ +#if CONFIG_PROTECT + cprotect_t c_cpentry; /* content protection data */ +#endif + }; typedef struct cnode cnode_t; @@ -183,13 +192,16 @@ typedef struct cnode cnode_t; #define C_FORCEUPDATE 0x00100 /* force the catalog entry update */ #define C_HASXATTRS 0x00200 /* cnode has extended attributes */ #define C_NEG_ENTRIES 0x00400 /* directory has negative name entries */ -#define C_WARNED_RSRC 0x00800 /* cnode lookup warning has been issued */ +#define C_SWAPINPROGRESS 0x00800 /* cnode's data is about to be swapped. Issue synchronous cluster io */ #define C_NEED_DATA_SETSIZE 0x01000 /* Do a ubc_setsize(0) on c_rsrc_vp after the unlock */ #define C_NEED_RSRC_SETSIZE 0x02000 /* Do a ubc_setsize(0) on c_vp after the unlock */ #define C_DIR_MODIFICATION 0x04000 /* Directory is being modified, wait for lookups */ #define C_ALWAYS_ZEROFILL 0x08000 /* Always zero-fill the file on an fsync */ +#define C_RENAMED 0x10000 /* cnode was deleted as part of rename; C_DELETED should also be set */ +#define C_NEEDS_DATEADDED 0x20000 /* cnode needs date-added written to the finderinfo bit */ +#define C_BACKINGSTORE 0x40000 /* cnode is a backing store for an existing or currently-mounting filesystem */ #define ZFTIMELIMIT (5 * 60) /* @@ -236,7 +248,7 @@ enum { kFinderInvisibleMask = 1 << 14 }; * upon the VNOP in question. Sometimes it is OK to use an open-unlinked file, for example, in, * reading. But other times, such as on the source of a VNOP_RENAME, it should be disallowed. */ -int hfs_checkdeleted (struct cnode *cp); +int hfs_checkdeleted(struct cnode *cp); /* * Test for a resource fork @@ -271,16 +283,28 @@ struct hfsfid { /* Get new default vnode */ extern int hfs_getnewvnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname *cnp, struct cat_desc *descp, int flags, struct cat_attr *attrp, - struct cat_fork *forkp, struct vnode **vpp); + struct cat_fork *forkp, struct vnode **vpp, int *out_flags); +/* Input flags for hfs_getnewvnode */ #define GNV_WANTRSRC 0x01 /* Request the resource fork vnode. */ #define GNV_SKIPLOCK 0x02 /* Skip taking the cnode lock (when getting resource fork). */ #define GNV_CREATE 0x04 /* The vnode is for a newly created item. 
*/ +#define GNV_NOCACHE 0x08 /* Delay entering this item in the name cache */ +/* Output flags for hfs_getnewvnode */ +#define GNV_CHASH_RENAMED 0x01 /* The cnode was renamed in-flight */ +#define GNV_CAT_DELETED 0x02 /* The cnode was deleted from the catalog */ +#define GNV_NEW_CNODE 0x04 /* We are vending out a newly initialized cnode */ +#define GNV_CAT_ATTRCHANGED 0x08 /* Something in struct cat_attr changed in between cat_lookups */ /* Touch cnode times based on c_touch_xxx flags */ extern void hfs_touchtimes(struct hfsmount *, struct cnode *); +extern void hfs_write_dateadded (struct cat_attr *cattrp, u_int32_t dateadded); +extern u_int32_t hfs_get_dateadded (struct cnode *cp); + +/* Zero-fill file and push regions out to disk */ +extern int hfs_filedone(struct vnode *vp, vfs_context_t context); /* * HFS cnode hash functions. @@ -294,11 +318,14 @@ extern void hfs_chash_rehash(struct hfsmount *hfsmp, struct cnode *cp1, struct extern void hfs_chashwakeup(struct hfsmount *hfsmp, struct cnode *cp, int flags); extern void hfs_chash_mark_in_transit(struct hfsmount *hfsmp, struct cnode *cp); -extern struct vnode * hfs_chash_getvnode(struct hfsmount *hfsmp, ino_t inum, int wantrsrc, int skiplock); -extern struct cnode * hfs_chash_getcnode(struct hfsmount *hfsmp, ino_t inum, struct vnode **vpp, int wantrsrc, int skiplock); +extern struct vnode * hfs_chash_getvnode(struct hfsmount *hfsmp, ino_t inum, int wantrsrc, + int skiplock, int allow_deleted); +extern struct cnode * hfs_chash_getcnode(struct hfsmount *hfsmp, ino_t inum, struct vnode **vpp, + int wantrsrc, int skiplock, int *out_flags, int *hflags); extern int hfs_chash_snoop(struct hfsmount *, ino_t, int (*)(const struct cat_desc *, const struct cat_attr *, void *), void *); -extern int hfs_valid_cnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname *cnp, cnid_t cnid); +extern int hfs_valid_cnode(struct hfsmount *hfsmp, struct vnode *dvp, struct componentname *cnp, + cnid_t cnid, struct cat_attr *cattr, int *error); extern int hfs_chash_set_childlinkbit(struct hfsmount *hfsmp, cnid_t cnid); @@ -319,20 +346,22 @@ extern int hfs_chash_set_childlinkbit(struct hfsmount *hfsmp, cnid_t cnid); * 5. 
hfs mount point (always last) * */ -enum hfslocktype {HFS_SHARED_LOCK = 1, HFS_EXCLUSIVE_LOCK = 2, HFS_FORCE_LOCK = 3}; +enum hfslocktype {HFS_SHARED_LOCK = 1, HFS_EXCLUSIVE_LOCK = 2, HFS_FORCE_LOCK = 3, HFS_RECURSE_TRUNCLOCK = 4}; #define HFS_SHARED_OWNER (void *)0xffffffff -extern int hfs_lock(struct cnode *, enum hfslocktype); -extern int hfs_lockpair(struct cnode *, struct cnode *, enum hfslocktype); -extern int hfs_lockfour(struct cnode *, struct cnode *, struct cnode *, struct cnode *, +int hfs_lock(struct cnode *, enum hfslocktype); +int hfs_lockpair(struct cnode *, struct cnode *, enum hfslocktype); +int hfs_lockfour(struct cnode *, struct cnode *, struct cnode *, struct cnode *, enum hfslocktype, struct cnode **); -extern void hfs_unlock(struct cnode *); -extern void hfs_unlockpair(struct cnode *, struct cnode *); -extern void hfs_unlockfour(struct cnode *, struct cnode *, struct cnode *, struct cnode *); +void hfs_unlock(struct cnode *); +void hfs_unlockpair(struct cnode *, struct cnode *); +void hfs_unlockfour(struct cnode *, struct cnode *, struct cnode *, struct cnode *); + +void hfs_lock_truncate(struct cnode *, enum hfslocktype); +void hfs_unlock_truncate(struct cnode *, int been_recursed); -extern void hfs_lock_truncate(struct cnode *, int); -extern void hfs_unlock_truncate(struct cnode *, int); +int hfs_try_trunclock(struct cnode *, enum hfslocktype); #endif /* __APPLE_API_PRIVATE */ #endif /* KERNEL */ diff --git a/bsd/hfs/hfs_cprotect.c b/bsd/hfs/hfs_cprotect.c new file mode 100644 index 000000000..0345e4d9e --- /dev/null +++ b/bsd/hfs/hfs_cprotect.c @@ -0,0 +1,908 @@ +/* + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
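
The truncate-lock API revised above (hfs_lock_truncate taking an enum hfslocktype, plus hfs_try_trunclock and the new HFS_RECURSE_TRUNCLOCK mode) hinges on the c_truncatelockowner field added to the cnode: recording the exclusive owner lets a re-entering thread recognize its own hold, and lets the unlock path choose between a shared and an exclusive drop. A minimal userspace model of the same idea, using pthreads rather than the kernel's lck_rw API (all names here are illustrative, not from the patch):

    #include <pthread.h>
    #include <stdbool.h>

    struct trunc_lock {
        pthread_rwlock_t rw;
        pthread_t        owner;   /* meaningful only while held exclusive */
        bool             owned;
    };

    static void trunc_lock_exclusive(struct trunc_lock *tl) {
        pthread_rwlock_wrlock(&tl->rw);
        tl->owner = pthread_self();
        tl->owned = true;
    }

    /* HFS_RECURSE_TRUNCLOCK analogue: if this thread already holds the
     * lock exclusive, take nothing and report the recursion so the
     * matching unlock can be a no-op. */
    static bool trunc_lock_recursive(struct trunc_lock *tl) {
        if (tl->owned && pthread_equal(tl->owner, pthread_self()))
            return true;
        pthread_rwlock_rdlock(&tl->rw);
        return false;
    }

    static void trunc_unlock(struct trunc_lock *tl, bool recursed) {
        if (recursed)
            return;               /* outermost holder drops the lock */
        if (tl->owned && pthread_equal(tl->owner, pthread_self()))
            tl->owned = false;    /* exclusive drop */
        pthread_rwlock_unlock(&tl->rw);
    }

Reading owner/owned without a lock is safe here for the same reason it is in hfs_unlock_truncate above: only the exclusive owner can ever match the comparison against its own thread id.
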
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#include <sys/cprotect.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/random.h>
+#include <sys/xattr.h>
+#include <sys/uio_internal.h>
+#include <sys/ubc_internal.h>
+#include <sys/vnode_if.h>
+#include <sys/vnode_internal.h>
+#include <libkern/OSByteOrder.h>
+
+#include "hfs.h"
+#include "hfs_cnode.h"
+
+#if CONFIG_PROTECT
+static struct cp_wrap_func g_cp_wrap_func = {NULL, NULL};
+static struct cp_global_state g_cp_state = {0, 0};
+
+extern int (**hfs_vnodeop_p) (void *);
+
+/*
+ * CP private functions
+ */
+static int cp_is_valid_class(int);
+static int cp_getxattr(cnode_t *, struct cp_xattr *);
+static int cp_setxattr(cnode_t *, struct cp_xattr *, int);
+static struct cprotect *cp_entry_alloc(void);
+static int cp_make_keys (struct cprotect *);
+static int cp_restore_keys(struct cprotect *);
+static int cp_lock_vfs_callback(mount_t, void *);
+static int cp_lock_vnode_callback(vnode_t, void *);
+static int cp_vnode_is_eligible (vnode_t);
+static int cp_check_access (cnode_t *, int);
+static int cp_wrap(int, void *, void *);
+static int cp_unwrap(int, void *, void *);
+
+
+
+#if DEVELOPMENT || DEBUG
+#define CP_ASSERT(x)	\
+	if ((x) == 0) {	\
+		panic("CP: failed assertion in %s", __FUNCTION__);	\
+	}
+#else
+#define CP_ASSERT(x)
+#endif
+
+int
+cp_key_store_action(int action)
+{
+	g_cp_state.lock_state = action;
+	if (action == CP_LOCKED_STATE)
+		return vfs_iterate(0, cp_lock_vfs_callback, (void *)action);
+	else
+		return 0;
+}
+
+
+int
+cp_register_wraps(cp_wrap_func_t key_store_func)
+{
+	g_cp_wrap_func.wrapper = key_store_func->wrapper;
+	g_cp_wrap_func.unwrapper = key_store_func->unwrapper;
+
+	g_cp_state.wrap_functions_set = 1;
+
+	return 0;
+}
+
+/*
+ * Allocate and initialize a cprotect blob for a new cnode.
+ * Called from hfs_getnewvnode: cnode is locked exclusive.
+ * Read xattr data off the cnode. Then, if conditions permit,
+ * unwrap the file key and cache it in the cprotect blob.
+ */
+int
+cp_entry_init(cnode_t *cnode, struct mount *mp)
+{
+	struct cprotect *entry;
+	struct cp_xattr xattr;
+	int error = 0;
+
+	if (!cp_fs_protected (mp)) {
+		cnode->c_cpentry = NULL;
+		return 0;
+	}
+
+	if (!S_ISREG(cnode->c_mode)) {
+		cnode->c_cpentry = NULL;
+		return 0;
+	}
+
+	if (!g_cp_state.wrap_functions_set) {
+		printf("hfs: cp_entry_init: wrap functions not yet set\n");
+		return ENXIO;
+	}
+
+	CP_ASSERT (cnode->c_cpentry == NULL);
+
+	entry = cp_entry_alloc();
+	if (!entry)
+		return ENOMEM;
+
+	entry->cp_flags |= CP_KEY_FLUSHED;
+	cnode->c_cpentry = entry;
+
+	error = cp_getxattr(cnode, &xattr);
+	if (error == ENOATTR) {
+		/*
+		 * Can't tell if the file is new, or was previously created but never
+		 * written to or set-classed. In either case, it'll need a fresh
+		 * per-file key.
+		 */
+		entry->cp_flags |= CP_NEEDS_KEYS;
+		error = 0;
+	} else {
+		if (xattr.xattr_major_version != CP_CURRENT_MAJOR_VERS) {
+			printf("hfs: cp_entry_init: bad xattr version\n");
+			error = EINVAL;
+			goto out;
+		}
+
+		/* set up entry with information from xattr */
+		entry->cp_pclass = xattr.persistent_class;
+		bcopy(&xattr.persistent_key, &entry->cp_persistent_key, CP_WRAPPEDKEYSIZE);
+	}
+
+out:
+	if (error) {
+		cp_entry_destroy (cnode);
+	}
+	return error;
+}
+
+/*
+ * Set up initial key/class pair on cnode. The cnode is locked exclusive.
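
cp_entry_init above leaves a fresh entry in one of two states, and the rest of this file is driven off three flags. A compact restatement of the lifecycle (the flag values here are placeholders; the real definitions live in sys/cprotect.h):

    #include <stdbool.h>
    #include <stdint.h>

    /* placeholder values; real ones come from sys/cprotect.h */
    #define CP_NEEDS_KEYS  0x1  /* no per-file key has ever been created      */
    #define CP_KEY_FLUSHED 0x2  /* only the wrapped key is resident in memory */
    #define CP_NO_XATTR    0x4  /* wrapped key not yet persisted as an xattr  */

    /* State a new entry lands in after cp_entry_init: a missing xattr
     * defers key creation to first use; an existing xattr loads the
     * wrapped key, which still needs unwrapping before I/O proceeds. */
    static uint32_t cp_initial_flags(bool xattr_missing) {
        return xattr_missing ? (CP_KEY_FLUSHED | CP_NEEDS_KEYS)
                             : CP_KEY_FLUSHED;
    }

From there, cp_make_keys clears CP_NEEDS_KEYS and sets CP_NO_XATTR, cp_restore_keys clears CP_KEY_FLUSHED, and the first successful cp_setxattr clears CP_NO_XATTR.
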
+ */ +int +cp_entry_create_keys(cnode_t *cnode) +{ + struct cprotect *entry = cnode->c_cpentry; + + if (!entry) { + //unprotected file: continue + return 0; + } + + CP_ASSERT((entry->cp_flags & CP_NEEDS_KEYS)); + + return cp_make_keys(entry); +} + +/* + * Tear down and clear a cprotect blob for a closing file. + * Called at hfs_reclaim_cnode: cnode is locked exclusive. + */ +void +cp_entry_destroy(cnode_t *cnode) +{ + struct cprotect *entry = cnode->c_cpentry; + if (!entry) { + /* nothing to clean up */ + return; + } + cnode->c_cpentry = NULL; + bzero(entry, sizeof(*entry)); + FREE(entry, M_TEMP); +} + +int +cp_fs_protected (mount_t mnt) { + return (vfs_flags(mnt) & MNT_CPROTECT); +} + + +/* + * Return a pointer to underlying cnode if there is one for this vnode. + * Done without taking cnode lock, inspecting only vnode state. + */ +cnode_t * +cp_get_protected_cnode(vnode_t vp) +{ + if (!cp_vnode_is_eligible(vp)) { + return NULL; + } + + if (!cp_fs_protected(VTOVFS(vp))) { + /* mount point doesn't support it */ + return NULL; + } + + return (cnode_t *) vp->v_data; +} + + +/* + * Sets *class to persistent class associated with vnode, + * or returns error. + */ +int +cp_vnode_getclass(vnode_t vp, int *class) +{ + struct cp_xattr xattr; + int error = 0; + struct cnode *cnode; + + if (!cp_vnode_is_eligible (vp)) { + return EBADF; + } + + cnode = VTOC(vp); + + hfs_lock(cnode, HFS_SHARED_LOCK); + + if (cp_fs_protected(VTOVFS(vp))) { + /* pull the class from the live entry */ + struct cprotect *entry = cnode->c_cpentry; + if (!entry) { + panic("Content Protection: uninitialized cnode %p", cnode); + } + + if ((entry->cp_flags & CP_NEEDS_KEYS)) { + error = cp_make_keys(entry); + } + *class = entry->cp_pclass; + + } else { + /* + * Mount point is not formatted for content protection. If a class + * has been specified anyway, report it. Otherwise, report D. + */ + error = cp_getxattr(cnode, &xattr); + if (error == ENOATTR) { + *class = PROTECTION_CLASS_D; + error = 0; + } else if (error == 0) { + *class = xattr.persistent_class; + } + } + + hfs_unlock(cnode); + return error; +} + + +/* + * Sets persistent class for this file. + * If vnode cannot be protected (system file, non-regular file, non-hfs), EBADF. + * If the new class can't be accessed now, EPERM. + * Otherwise, record class and re-wrap key if the mount point is content-protected. + */ +int +cp_vnode_setclass(vnode_t vp, uint32_t newclass) +{ + struct cnode *cnode; + struct cp_xattr xattr; + struct cprotect *entry = 0; + int error = 0; + + if (!cp_is_valid_class(newclass)) { + printf("hfs: CP: cp_setclass called with invalid class %d\n", newclass); + return EINVAL; + } + + /* is this an interesting file? */ + if (!cp_vnode_is_eligible(vp)) { + return EBADF; + } + + cnode = VTOC(vp); + + if (hfs_lock(cnode, HFS_EXCLUSIVE_LOCK)) { + return EINVAL; + } + + /* is the volume formatted for content protection? */ + if (cp_fs_protected(VTOVFS(vp))) { + entry = cnode->c_cpentry; + if (entry == NULL) { + error = EINVAL; + goto out; + } + + if ((entry->cp_flags & CP_NEEDS_KEYS)) { + if ((error = cp_make_keys(entry)) != 0) { + goto out; + } + } + + if (entry->cp_flags & CP_KEY_FLUSHED) { + error = cp_restore_keys(entry); + if (error) + goto out; + } + + /* re-wrap per-file key with new class */ + error = cp_wrap(newclass, + &entry->cp_cache_key[0], + &entry->cp_persistent_key[0]); + if (error) { + /* we didn't have perms to set this class. 
leave file as-is and error out */ + goto out; + } + + entry->cp_pclass = newclass; + + /* prepare to write the xattr out */ + bcopy(&entry->cp_persistent_key, &xattr.persistent_key, CP_WRAPPEDKEYSIZE); + } else { + /* no live keys for this file. just remember intended class */ + bzero(&xattr.persistent_key, CP_WRAPPEDKEYSIZE); + } + + xattr.xattr_major_version = CP_CURRENT_MAJOR_VERS; + xattr.xattr_minor_version = CP_CURRENT_MINOR_VERS; + xattr.key_size = CP_WRAPPEDKEYSIZE; + xattr.flags = 0; + xattr.persistent_class = newclass; + error = cp_setxattr(cnode, &xattr, XATTR_REPLACE); + + if (error == ENOATTR) { + error = cp_setxattr (cnode, &xattr, XATTR_CREATE); + } + +out: + hfs_unlock(cnode); + return error; +} + +/* + * Check permission for the given operation (read, write, page in) on this node. + * Additionally, if the node needs work, do it: + * - create a new key for the file if one hasn't been set before + * - write out the xattr if it hasn't already been saved + * - unwrap the key if needed + * + * Takes cnode lock, and upgrades to exclusive if modifying cprotect. + */ + int +cp_handle_vnop(cnode_t *cnode, int vnop) +{ + struct cprotect *entry; + int error = 0; + struct cp_xattr xattr; + + if ((error = hfs_lock(cnode, HFS_SHARED_LOCK)) != KERN_SUCCESS) { + return error; + } + + entry = cnode->c_cpentry; + if (!entry) + goto out; + + if ((error = cp_check_access(cnode, vnop)) != KERN_SUCCESS) { + goto out; + } + + if (entry->cp_flags == 0) { + /* no more work to do */ + goto out; + } + + /* upgrade to exclusive lock */ + if (lck_rw_lock_shared_to_exclusive(&cnode->c_rwlock) == FALSE) { + if ((error = hfs_lock(cnode, HFS_EXCLUSIVE_LOCK)) != KERN_SUCCESS) { + return error; + } + } else { + cnode->c_lockowner = current_thread(); + } + + /* generate new keys if none have ever been saved */ + if ((entry->cp_flags & CP_NEEDS_KEYS)) { + if ((error = cp_make_keys(entry)) != 0) { + goto out; + } + } + + /* unwrap keys if needed */ + if (entry->cp_flags & CP_KEY_FLUSHED) { + error = cp_restore_keys(entry); + if (error) + goto out; + } + + /* write out the xattr if it's new */ + if (entry->cp_flags & CP_NO_XATTR) { + bcopy(&entry->cp_persistent_key[0], &xattr.persistent_key, CP_WRAPPEDKEYSIZE); + xattr.xattr_major_version = CP_CURRENT_MAJOR_VERS; + xattr.xattr_minor_version = CP_CURRENT_MINOR_VERS; + xattr.key_size = CP_WRAPPEDKEYSIZE; + xattr.persistent_class = entry->cp_pclass; + error = cp_setxattr(cnode, &xattr, XATTR_CREATE); + } + +out: + hfs_unlock(cnode); + return error; +} + +/* + * During hfs resize operations, we have slightly different constraints than during + * normal VNOPS that read/write data to files. Specifically, we already have the cnode + * locked (so nobody else can modify it), and we are doing the IO with root privileges, since + * we are moving the data behind the user's back. So, we skip access checks here (for unlock + * vs. lock), and don't worry about non-existing keys. If the file exists on-disk with valid + * payload, then it must have keys set up already by definition. 
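
cp_handle_vnop above starts with the cnode lock shared and upgrades only when it actually has to mutate the cprotect entry. The subtlety is that lck_rw_lock_shared_to_exclusive returning FALSE means the shared hold was lost during the attempt, so the code must re-lock from scratch and cannot trust anything it observed earlier. The same shape in portable C (pthread rwlocks cannot upgrade at all, so the drop-and-reacquire fallback is the only path; names are illustrative):

    #include <pthread.h>
    #include <stdbool.h>

    /* Sketch of the upgrade-or-reacquire idiom. After taking the write
     * lock, the condition is re-evaluated because another writer may
     * have run in the window where no lock was held. */
    static void with_upgrade(pthread_rwlock_t *rw,
                             bool (*needs_mutation)(void *),
                             void (*mutate)(void *), void *obj) {
        pthread_rwlock_rdlock(rw);
        if (needs_mutation(obj)) {
            pthread_rwlock_unlock(rw);   /* models losing the shared hold */
            pthread_rwlock_wrlock(rw);
            if (needs_mutation(obj))     /* recheck under exclusive */
                mutate(obj);
        }
        pthread_rwlock_unlock(rw);
    }
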
+ */ +int cp_handle_relocate (cnode_t *cp) { + struct cprotect *entry; + int error = -1; + + /* cp is already locked */ + entry = cp->c_cpentry; + if (!entry) + goto out; + + /* + * Still need to validate whether to permit access to the file or not + * based on lock status + */ + if ((error = cp_check_access(cp, CP_READ_ACCESS | CP_WRITE_ACCESS)) != KERN_SUCCESS) { + goto out; + } + + if (entry->cp_flags == 0) { + /* no more work to do */ + error = 0; + goto out; + } + + /* it must have keys since it is an existing file with actual payload */ + + /* unwrap keys if needed */ + if (entry->cp_flags & CP_KEY_FLUSHED) { + error = cp_restore_keys(entry); + } + + /* don't need to write out the EA since the file is extant */ +out: + + /* return the cp still locked */ + return error; +} + + + +/* + * cp_getrootxattr: + * Gets the EA we set on the root folder (fileid 1) to get information about the + * version of Content Protection that was used to write to this filesystem. + * Note that all multi-byte fields are written to disk little endian so they must be + * converted to native endian-ness as needed. + */ + +int cp_getrootxattr(struct hfsmount* hfsmp, struct cp_root_xattr *outxattr) { + uio_t auio; + char uio_buf[UIO_SIZEOF(1)]; + size_t attrsize = sizeof(struct cp_root_xattr); + int error = 0; + struct vnop_getxattr_args args; + + if (!outxattr) { + panic("cp_xattr called with xattr == NULL"); + } + + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(outxattr), attrsize); + + args.a_desc = NULL; // unused + args.a_vp = NULL; //unused since we're writing EA to root folder. + args.a_name = CONTENT_PROTECTION_XATTR_NAME; + args.a_uio = auio; + args.a_size = &attrsize; + args.a_options = XATTR_REPLACE; + args.a_context = NULL; // unused + + error = hfs_getxattr_internal(NULL, &args, hfsmp, 1); + + /* Now convert the multi-byte fields to native endianness */ + outxattr->major_version = OSSwapLittleToHostInt16(outxattr->major_version); + outxattr->minor_version = OSSwapLittleToHostInt16(outxattr->minor_version); + outxattr->flags = OSSwapLittleToHostInt64(outxattr->flags); + + if (error != KERN_SUCCESS) { + goto out; + } + +out: + uio_free(auio); + return error; +} + +/* + * cp_setrootxattr: + * Sets the EA we set on the root folder (fileid 1) to get information about the + * version of Content Protection that was used to write to this filesystem. + * Note that all multi-byte fields are written to disk little endian so they must be + * converted to little endian as needed. + * + * This will be written to the disk when it detects the EA is not there, or when we need + * to make a modification to the on-disk version that can be done in-place. + */ + int +cp_setrootxattr(struct hfsmount *hfsmp, struct cp_root_xattr *newxattr) +{ + int error = 0; + struct vnop_setxattr_args args; + + args.a_desc = NULL; + args.a_vp = NULL; + args.a_name = CONTENT_PROTECTION_XATTR_NAME; + args.a_uio = NULL; //pass data ptr instead + args.a_options = 0; + args.a_context = NULL; //no context needed, only done from mount. + + /* Now convert the multi-byte fields to little endian before writing to disk. 
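
Both root-xattr helpers keep the on-disk representation little endian and swap only at the boundary, so the same struct can be handed to the xattr layer on any host. A self-contained round-trip of that convention (the struct layout is a stand-in for cp_root_xattr; the OSByteOrder macros are the ones used above):

    #include <libkern/OSByteOrder.h>
    #include <stdint.h>
    #include <assert.h>

    struct root_xattr_le {        /* stand-in for struct cp_root_xattr */
        uint16_t major_version;
        uint16_t minor_version;
        uint64_t flags;
    };

    int main(void) {
        struct root_xattr_le x = { 2, 0, 0x1 };

        /* host -> disk (cp_setrootxattr direction) */
        x.major_version = OSSwapHostToLittleInt16(x.major_version);
        x.minor_version = OSSwapHostToLittleInt16(x.minor_version);
        x.flags         = OSSwapHostToLittleInt64(x.flags);

        /* disk -> host (cp_getrootxattr direction) */
        x.major_version = OSSwapLittleToHostInt16(x.major_version);
        x.minor_version = OSSwapLittleToHostInt16(x.minor_version);
        x.flags         = OSSwapLittleToHostInt64(x.flags);

        assert(x.major_version == 2 && x.flags == 0x1);
        return 0;
    }

On a little-endian host every swap is a no-op, which is why bugs in this kind of code tend to surface only on big-endian machines.
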
*/ + newxattr->major_version = OSSwapHostToLittleInt16(newxattr->major_version); + newxattr->minor_version = OSSwapHostToLittleInt16(newxattr->minor_version); + newxattr->flags = OSSwapHostToLittleInt64(newxattr->flags); + + error = hfs_setxattr_internal(NULL, (caddr_t)newxattr, + sizeof(struct cp_root_xattr), &args, hfsmp, 1); + return error; +} + + + + +/******************** + * Private Functions + *******************/ + +static int +cp_vnode_is_eligible(vnode_t vp) +{ + return ((vp->v_op == hfs_vnodeop_p) && + (!vnode_issystem(vp)) && + (vnode_isreg(vp))); +} + + + +static int +cp_is_valid_class(int class) +{ + return ((class >= PROTECTION_CLASS_A) && + (class <= PROTECTION_CLASS_F)); +} + + +static struct cprotect * +cp_entry_alloc(void) +{ + struct cprotect *cp_entry; + + MALLOC(cp_entry, struct cprotect *, sizeof(struct cprotect), + M_TEMP, M_WAITOK); + if (cp_entry == NULL) + return (NULL); + + bzero(cp_entry, sizeof(*cp_entry)); + return (cp_entry); +} + + +/* + * Reads xattr data off the cnode and into provided xattr. + * cnode lock held shared + */ +static int +cp_getxattr(cnode_t *cnode, struct cp_xattr *outxattr) +{ + uio_t auio; + char uio_buf[UIO_SIZEOF(1)]; + size_t attrsize = sizeof(struct cp_xattr); + int error = 0; + struct vnop_getxattr_args args; + + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf)); + uio_addiov(auio, CAST_USER_ADDR_T(outxattr), attrsize); + + args.a_desc = NULL; // unused + args.a_vp = cnode->c_vp; + args.a_name = CONTENT_PROTECTION_XATTR_NAME; + args.a_uio = auio; + args.a_size = &attrsize; + args.a_options = XATTR_REPLACE; + args.a_context = vfs_context_current(); // unused + error = hfs_getxattr_internal(cnode, &args, VTOHFS(cnode->c_vp), 0); + if (error != KERN_SUCCESS) { + goto out; + } + + /* Endian swap the multi-byte fields into host endianness from L.E. */ + outxattr->xattr_major_version = OSSwapLittleToHostInt16(outxattr->xattr_major_version); + outxattr->xattr_minor_version = OSSwapLittleToHostInt16(outxattr->xattr_minor_version); + outxattr->key_size = OSSwapLittleToHostInt32(outxattr->key_size); + outxattr->flags = OSSwapLittleToHostInt32(outxattr->flags); + outxattr->persistent_class = OSSwapLittleToHostInt32(outxattr->persistent_class); + +out: + uio_free(auio); + return error; +} + +/* + * Stores new xattr data on the cnode. + * cnode lock held exclusive + */ +static int +cp_setxattr(cnode_t *cnode, struct cp_xattr *newxattr, int options) +{ + int error = 0; + struct vnop_setxattr_args args; + + args.a_desc = NULL; + args.a_vp = cnode->c_vp; + args.a_name = CONTENT_PROTECTION_XATTR_NAME; + args.a_uio = NULL; //pass data ptr instead + args.a_options = options; + args.a_context = vfs_context_current(); + + /* Endian swap the multi-byte fields into L.E from host. */ + newxattr->xattr_major_version = OSSwapHostToLittleInt16(newxattr->xattr_major_version); + newxattr->xattr_minor_version = OSSwapHostToLittleInt16(newxattr->xattr_minor_version); + newxattr->key_size = OSSwapHostToLittleInt32(newxattr->key_size); + newxattr->flags = OSSwapHostToLittleInt32(newxattr->flags); + newxattr->persistent_class = OSSwapHostToLittleInt32(newxattr->persistent_class); + + error = hfs_setxattr_internal(cnode, (caddr_t)newxattr, + sizeof(struct cp_xattr), &args, VTOHFS(cnode->c_vp), 0); + + if ((error == KERN_SUCCESS) && (cnode->c_cpentry)) { + cnode->c_cpentry->cp_flags &= ~CP_NO_XATTR; + } + + return error; +} + + +/* + * Make a new random per-file key and wrap it. 
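
Because the per-file state is carried in an ordinary extended attribute, it can in principle be examined with the userspace xattr API. A sketch, assuming CONTENT_PROTECTION_XATTR_NAME is "com.apple.system.cprotect" (per sys/cprotect.h) and that the kernel permits reading the system xattr at all, which it may not:

    #include <sys/types.h>
    #include <sys/xattr.h>
    #include <stdio.h>
    #include <stdint.h>

    int main(int argc, char **argv) {
        uint8_t buf[512];
        ssize_t len;

        if (argc != 2) { fprintf(stderr, "usage: %s path\n", argv[0]); return 2; }
        /* name assumed from sys/cprotect.h; may be refused for system xattrs */
        len = getxattr(argv[1], "com.apple.system.cprotect", buf, sizeof(buf), 0, 0);
        if (len < 0) { perror("getxattr"); return 1; }
        printf("cp xattr: %zd bytes (multi-byte fields little endian on disk)\n", len);
        return 0;
    }
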
+ */
+static int
+cp_make_keys(struct cprotect *entry)
+{
+	int error = 0;
+
+	if (g_cp_state.wrap_functions_set != 1) {
+		printf("hfs: CP: could not create keys: no wrappers set\n");
+		return ENXIO;
+	}
+
+	/* create new cp data: key and class */
+	read_random(&entry->cp_cache_key[0], CP_KEYSIZE);
+	entry->cp_pclass = PROTECTION_CLASS_D;
+
+	/* wrap the new key in the class key */
+	error = cp_wrap(PROTECTION_CLASS_D,
+					&entry->cp_cache_key[0],
+					&entry->cp_persistent_key[0]);
+
+	if (error) {
+		panic("could not wrap new key in class D\n");
+	}
+
+	/* ready for business */
+	entry->cp_flags &= ~CP_NEEDS_KEYS;
+	entry->cp_flags |= CP_NO_XATTR;
+
+	return error;
+}
+
+/*
+ * If permitted, restore entry's unwrapped key from the persistent key.
+ * If not, clear key and set CP_KEY_FLUSHED.
+ * cnode lock held exclusive
+ */
+static int
+cp_restore_keys(struct cprotect *entry)
+{
+	int error = 0;
+
+	error = cp_unwrap(entry->cp_pclass,
+					  &entry->cp_persistent_key[0],
+					  &entry->cp_cache_key[0]);
+
+	if (error) {
+		entry->cp_flags |= CP_KEY_FLUSHED;
+		bzero(entry->cp_cache_key, CP_KEYSIZE);
+		error = EPERM;
+	}
+	else {
+		entry->cp_flags &= ~CP_KEY_FLUSHED;
+	}
+	return error;
+}
+
+static int
+cp_lock_vfs_callback(mount_t mp, void *arg)
+{
+	if (!cp_fs_protected(mp)) {
+		/* not interested in this mount point */
+		return 0;
+	}
+
+	return vnode_iterate(mp, 0, cp_lock_vnode_callback, arg);
+}
+
+
+/*
+ * Deny access to protected files if keys have been locked.
+ *
+ * cnode lock is taken shared.
+ */
+ static int
+cp_check_access(cnode_t *cnode, int vnop)
+{
+	int error = 0;
+
+	if (g_cp_state.lock_state == CP_UNLOCKED_STATE) {
+		return KERN_SUCCESS;
+	}
+
+	if (!cnode->c_cpentry) {
+		/* unprotected node */
+		return KERN_SUCCESS;
+	}
+
+	/* Deny all access for class A files, and read access for class B */
+	switch (cnode->c_cpentry->cp_pclass) {
+		case PROTECTION_CLASS_A: {
+			error = EPERM;
+			break;
+		}
+		case PROTECTION_CLASS_B: {
+			if (vnop & CP_READ_ACCESS)
+				error = EPERM;
+			else
+				error = 0;
+			break;
+		}
+		default:
+			error = 0;
+			break;
+	}
+
+	return error;
+}
+
+
+
+/*
+ * Respond to a lock or unlock event.
+ * On lock: clear out keys from memory, then flush file contents.
+ * On unlock: nothing (function not called).
+ */
+static int
+cp_lock_vnode_callback(vnode_t vp, void *arg)
+{
+	cnode_t *cp = NULL;
+	struct cprotect *entry = NULL;
+	int error = 0;
+	int locked = 1;
+	int action = 0;
+
+	error = vnode_getwithref (vp);
+	if (error) {
+		return error;
+	}
+
+	cp = VTOC(vp);
+	hfs_lock(cp, HFS_FORCE_LOCK);
+
+	entry = cp->c_cpentry;
+	if (!entry) {
+		/* unprotected vnode: not a regular file */
+		goto out;
+	}
+
+	action = (int)((uintptr_t) arg);
+	switch (action) {
+		case CP_LOCKED_STATE: {
+			vfs_context_t ctx;
+			if (entry->cp_pclass != PROTECTION_CLASS_A) {
+				/* no change at lock for other classes */
+				goto out;
+			}
+
+			/* Before doing anything else, zero-fill sparse ranges as needed */
+			ctx = vfs_context_current();
+			(void) hfs_filedone (vp, ctx);
+
+			/* first, sync back dirty pages */
+			hfs_unlock (cp);
+			ubc_msync (vp, 0, ubc_getsize(vp), NULL, UBC_PUSHALL | UBC_INVALIDATE | UBC_SYNC);
+			hfs_lock (cp, HFS_FORCE_LOCK);
+
+			/* flush keys */
+			entry->cp_flags |= CP_KEY_FLUSHED;
+			bzero(&entry->cp_cache_key, CP_KEYSIZE);
+			/* some write may have arrived in the meantime.
dump those pages */ + hfs_unlock(cp); + locked = 0; + + ubc_msync (vp, 0, ubc_getsize(vp), NULL, UBC_INVALIDATE | UBC_SYNC); + break; + } + case CP_UNLOCKED_STATE: { + /* no-op */ + break; + } + default: + panic("unknown lock action %d\n", action); + } + +out: + if (locked) + hfs_unlock(cp); + vnode_put (vp); + return error; +} + +static int +cp_wrap(int class, void *inkey, void *outkey) +{ + int error = 0; + size_t keyln = CP_WRAPPEDKEYSIZE; + + if (class == PROTECTION_CLASS_F) { + bzero(outkey, CP_WRAPPEDKEYSIZE); + return 0; + } + + error = g_cp_wrap_func.wrapper(class, + inkey, + CP_KEYSIZE, + outkey, + &keyln); + + return error; +} + + +static int +cp_unwrap(int class, void *inkey, void *outkey) +{ + int error = 0; + size_t keyln = CP_KEYSIZE; + + if (class == PROTECTION_CLASS_F) { + /* we didn't save a wrapped key, so nothing to unwrap */ + return EPERM; + } + + error = g_cp_wrap_func.unwrapper(class, + inkey, + CP_WRAPPEDKEYSIZE, + outkey, + &keyln); + + return error; + +} + + +#else + +int cp_key_store_action(int action __unused) +{ + return ENOTSUP; +} + + +int cp_register_wraps(cp_wrap_func_t key_store_func __unused) +{ + return ENOTSUP; +} + +#endif /* CONFIG_PROTECT */ diff --git a/bsd/hfs/hfs_dbg.h b/bsd/hfs/hfs_dbg.h index ef0423083..f39271fe4 100644 --- a/bsd/hfs/hfs_dbg.h +++ b/bsd/hfs/hfs_dbg.h @@ -94,7 +94,7 @@ extern int hfs_dbg_err; #if (HFS_DEBUG_STAGE == 4) char gDebugAssertStr[255]; #define DBG_ASSERT(a) { if (!(a)) { \ - sprintf(gDebugAssertStr,"Oops - File "__FILE__", line %d: assertion '%s' failed.\n", __LINE__, #a); \ + snprintf(gDebugAssertStr, sizeof (gDebugAssertStr), "Oops - File "__FILE__", line %d: assertion '%s' failed.\n", __LINE__, #a); \ Debugger(gDebugAssertStr); } } #else #define DBG_ASSERT(a) { if (!(a)) { panic("File "__FILE__", line %d: assertion '%s' failed.\n", __LINE__, #a); } } diff --git a/bsd/hfs/hfs_encodings.c b/bsd/hfs/hfs_encodings.c index 4a67567b2..13c9781f8 100644 --- a/bsd/hfs/hfs_encodings.c +++ b/bsd/hfs/hfs_encodings.c @@ -239,8 +239,8 @@ hfs_to_utf8(ExtendedVCB *vcb, const Str31 hfs_str, ByteCount maxDstLen, ByteCoun UniChar uniStr[MAX_HFS_UNICODE_CHARS]; ItemCount uniCount; size_t utf8len; - u_int8_t pascal_length = 0; hfs_to_unicode_func_t hfs_get_unicode = VCBTOHFS(vcb)->hfs_get_unicode; + u_int8_t pascal_length = 0; /* * Validate the length of the Pascal-style string before passing it @@ -252,7 +252,7 @@ hfs_to_utf8(ExtendedVCB *vcb, const Str31 hfs_str, ByteCount maxDstLen, ByteCoun error = EINVAL; return error; } - + error = hfs_get_unicode(hfs_str, uniStr, MAX_HFS_UNICODE_CHARS, &uniCount); if (uniCount == 0) @@ -292,7 +292,7 @@ mac_roman_to_utf8(const Str31 hfs_str, ByteCount maxDstLen, ByteCount *actualDst /* invalid string; longer than 31 bytes */ error = EINVAL; return error; - } + } error = mac_roman_to_unicode(hfs_str, uniStr, MAX_HFS_UNICODE_CHARS, &uniCount); diff --git a/bsd/hfs/hfs_endian.c b/bsd/hfs/hfs_endian.c index 6f840045d..367785b29 100644 --- a/bsd/hfs/hfs_endian.c +++ b/bsd/hfs/hfs_endian.c @@ -49,13 +49,14 @@ * The direction parameter must be kSwapBTNodeBigToHost or kSwapBTNodeHostToBig. * The kSwapBTNodeHeaderRecordOnly "direction" is not valid for these routines. 
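
Un-staticizing the swap routines above exposes a pattern worth noting: one body serves both directions because byte swapping is its own inverse, and the HFSBTSwapDirection argument mostly governs when a field is in host order and can therefore be validated. A sketch in the style of hfs_swap_HFSPlusForkData (the enum values are stand-ins for the real direction constants):

    #include <libkern/OSByteOrder.h>
    #include <stdint.h>

    enum swap_dir { kSwapBigToHost, kSwapHostToBig };  /* stand-ins */

    /* Direction-parameterized swapper: reading from disk, swap first and
     * validate after; writing to disk, validate first and swap last. */
    static void swap_fork_sizes(uint64_t *logicalSize, uint32_t *totalBlocks,
                                enum swap_dir dir) {
        if (dir == kSwapBigToHost) {
            *logicalSize = OSSwapBigToHostInt64(*logicalSize);
            *totalBlocks = OSSwapBigToHostInt32(*totalBlocks);
            /* fields now in host order: safe to sanity-check here */
        } else {
            /* validate before leaving host order */
            *logicalSize = OSSwapHostToBigInt64(*logicalSize);
            *totalBlocks = OSSwapHostToBigInt32(*totalBlocks);
        }
    }

This also explains the printf-versus-panic asymmetry in hfs_swap_BTNode: a bad node read from disk is survivable, but writing out a node that fails validation would corrupt the volume.
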
*/ -static int hfs_swap_HFSPlusBTInternalNode (BlockDescriptor *src, HFSCatalogNodeID fileID, enum HFSBTSwapDirection direction); -static int hfs_swap_HFSBTInternalNode (BlockDescriptor *src, HFSCatalogNodeID fileID, enum HFSBTSwapDirection direction); +int hfs_swap_HFSPlusBTInternalNode (BlockDescriptor *src, HFSCatalogNodeID fileID, enum HFSBTSwapDirection direction); +int hfs_swap_HFSBTInternalNode (BlockDescriptor *src, HFSCatalogNodeID fileID, enum HFSBTSwapDirection direction); +void hfs_swap_HFSPlusForkData (HFSPlusForkData *src); /* * hfs_swap_HFSPlusForkData */ -static void +void hfs_swap_HFSPlusForkData ( HFSPlusForkData *src ) @@ -160,7 +161,7 @@ hfs_swap_BTNode ( /* * Check srcDesc->height. Don't swap it because it's only one byte. */ - if (srcDesc->height > btcb->treeDepth) { + if (srcDesc->height > kMaxTreeDepth) { printf("hfs_swap_BTNode: invalid node height (%d)\n", srcDesc->height); error = fsBTInvalidHeaderErr; goto fail; @@ -314,7 +315,7 @@ hfs_swap_BTNode ( /* * Check srcDesc->height. Don't swap it because it's only one byte. */ - if (srcDesc->height > btcb->treeDepth) { + if (srcDesc->height > kMaxTreeDepth) { panic("hfs_UNswap_BTNode: invalid node height (%d)\n", srcDesc->height); error = fsBTInvalidHeaderErr; goto fail; @@ -389,7 +390,7 @@ fail: return (error); } -static int +int hfs_swap_HFSPlusBTInternalNode ( BlockDescriptor *src, HFSCatalogNodeID fileID, @@ -925,7 +926,7 @@ hfs_swap_HFSPlusBTInternalNode ( return (0); } -static int +int hfs_swap_HFSBTInternalNode ( BlockDescriptor *src, HFSCatalogNodeID fileID, diff --git a/bsd/hfs/hfs_format.h b/bsd/hfs/hfs_format.h index 151cadde7..ae1039a3e 100644 --- a/bsd/hfs/hfs_format.h +++ b/bsd/hfs/hfs_format.h @@ -232,6 +232,21 @@ struct FndrOpaqueInfo { } __attribute__((aligned(2), packed)); typedef struct FndrOpaqueInfo FndrOpaqueInfo; +struct FndrExtendedDirInfo { + u_int32_t point; + u_int32_t date_added; + u_int16_t extended_flags; + u_int16_t reserved3; + u_int32_t reserved4; +} __attribute__((aligned(2), packed)); + +struct FndrExtendedFileInfo { + u_int32_t reserved1; + u_int32_t date_added; + u_int16_t extended_flags; + u_int16_t reserved2; + u_int32_t reserved3; +} __attribute__((aligned(2), packed)); /* HFS Plus Fork data info - 80 bytes */ struct HFSPlusForkData { @@ -354,7 +369,11 @@ enum { kHFSHasLinkChainMask = 0x0020, kHFSHasChildLinkBit = 0x0006, /* folder has a child that's a dir link */ - kHFSHasChildLinkMask = 0x0040 + kHFSHasChildLinkMask = 0x0040, + + kHFSHasDateAddedBit = 0x0007, /* File/Folder has the date-added stored in the finder info. */ + kHFSHasDateAddedMask = 0x0080 + }; @@ -577,7 +596,8 @@ enum { * Therefore, bits 16-31 can only be used on HFS Plus. */ kHFSUnusedNodeFixBit = 31, /* Unused nodes in the Catalog B-tree have been zero-filled. See Radar #6947811. 
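
The new FndrExtendedFileInfo overlays the second 16 bytes of the catalog record's 32-byte Finder info, and kHFSHasDateAddedMask in the record flags advertises that date_added is valid. A sketch of the extraction that hfs_get_dateadded is declared (in hfs_cnode.h above) to perform; the big-endian treatment of the raw Finder info bytes is an assumption noted in the comments:

    #include <libkern/OSByteOrder.h>
    #include <stdint.h>
    #include <string.h>

    struct FndrExtendedFileInfo_sketch {     /* mirrors the struct above */
        uint32_t reserved1;
        uint32_t date_added;
        uint16_t extended_flags;
        uint16_t reserved2;
        uint32_t reserved3;
    } __attribute__((aligned(2), packed));

    #define kHFSHasDateAddedMask 0x0080      /* from the enum above */

    /* Pull date-added out of a 32-byte Finder info blob: the extended
     * info occupies bytes 16..31. Assumes the blob is still in raw
     * (big-endian, on-disk) form, hence the swap. */
    static uint32_t date_added(const uint8_t finderinfo[32], uint16_t recflags) {
        struct FndrExtendedFileInfo_sketch ext;
        if (!(recflags & kHFSHasDateAddedMask))
            return 0;                        /* not stored for this item */
        memcpy(&ext, finderinfo + 16, sizeof(ext));
        return OSSwapBigToHostInt32(ext.date_added);
    }
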
 */
-	
+	kHFSContentProtectionBit = 30,		/* Volume has per-file content protection */
+
 	kHFSVolumeHardwareLockMask	= 1 << kHFSVolumeHardwareLockBit,
 	kHFSVolumeUnmountedMask		= 1 << kHFSVolumeUnmountedBit,
 	kHFSVolumeSparedBlocksMask	= 1 << kHFSVolumeSparedBlocksBit,
@@ -588,6 +608,7 @@ enum {
 	kHFSVolumeInconsistentMask	= 1 << kHFSVolumeInconsistentBit,
 	kHFSVolumeSoftwareLockMask	= 1 << kHFSVolumeSoftwareLockBit,
 	kHFSUnusedNodeFixMask		= 1 << kHFSUnusedNodeFixBit,
+	kHFSContentProtectionMask	= 1 << kHFSContentProtectionBit,
 
 	kHFSMDBAttributesMask		= 0x8380
 };
diff --git a/bsd/hfs/hfs_fsctl.h b/bsd/hfs/hfs_fsctl.h
index 7759e799a..7bebee3fb 100644
--- a/bsd/hfs/hfs_fsctl.h
+++ b/bsd/hfs/hfs_fsctl.h
@@ -81,8 +81,8 @@ struct hfs_journal_info {
 #define HFSIOC_BULKACCESS _IOW('h', 9, struct user32_access_t)
 #define HFS_BULKACCESS_FSCTL IOCBASECMD(HFSIOC_BULKACCESS)
 
-#define HFSIOC_SETACLSTATE  _IOW('h', 10, int32_t)
-#define HFS_SETACLSTATE  IOCBASECMD(HFSIOC_SETACLSTATE)
+/* Unsupported - Previously used to enable/disable ACLs */
+#define HFSIOC_UNSUPPORTED  _IOW('h', 10, int32_t)
 
 #define HFSIOC_PREV_LINK  _IOWR('h', 11, u_int32_t)
 #define HFS_PREV_LINK  IOCBASECMD(HFSIOC_PREV_LINK)
@@ -121,7 +121,10 @@ struct hfs_journal_info {
 #define HFSIOC_VOLUME_STATUS  _IOR('h', 24, u_int32_t)
 #define HFS_VOLUME_STATUS  IOCBASECMD(HFSIOC_VOLUME_STATUS)
 
-#endif /* __APPLE_API_UNSTABLE */
+/* Disable metadata zone for given volume */
+#define HFSIOC_DISABLE_METAZONE	_IO('h', 25)
+#define HFS_DISABLE_METAZONE	IOCBASECMD(HFSIOC_DISABLE_METAZONE)
+#endif /* __APPLE_API_UNSTABLE */
 
 #endif /* ! _HFS_FSCTL_H_ */
diff --git a/bsd/hfs/hfs_hotfiles.c b/bsd/hfs/hfs_hotfiles.c
index ce0fe4dcf..66e273b5d 100644
--- a/bsd/hfs/hfs_hotfiles.c
+++ b/bsd/hfs/hfs_hotfiles.c
@@ -428,7 +428,6 @@ out:
 /*
  * Suspend recording the hottest files on a file system.
  */
-__private_extern__
 int
 hfs_recording_suspend(struct hfsmount *hfsmp)
 {
@@ -511,7 +510,6 @@ out:
 /*
  *
  */
-__private_extern__
 int
 hfs_recording_init(struct hfsmount *hfsmp)
 {
@@ -559,12 +557,17 @@ hfs_recording_init(struct hfsmount *hfsmp)
 		hfsmp->hfc_stage = HFC_IDLE;
 		return (0);
 	}
+
+	if (hfs_start_transaction(hfsmp) != 0) {
+		return EINVAL;
+	}
+
 	error = hfc_btree_create(hfsmp, HFSTOVCB(hfsmp)->blockSize, HFC_DEFAULT_FILE_COUNT);
 	if (error) {
 #if HFC_VERBOSE
 		printf("hfs: Error %d creating hot file b-tree on %s \n", error, hfsmp->vcbVN);
 #endif
-		return (error);
+		goto out2;
 	}
 	/*
 	 * Open the Hot File B-tree file for writing.
@@ -576,7 +579,7 @@ hfs_recording_init(struct hfsmount *hfsmp)
 #if HFC_VERBOSE
 		printf("hfs: Error %d opening hot file b-tree on %s \n", error, hfsmp->vcbVN);
 #endif
-		return (error);
+		goto out2;
 	}
 	MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK);
 	if (iterator == NULL) {
@@ -697,6 +700,7 @@ out0:
 out1:
 	(void) BTScanTerminate(&scanstate, &data, &data, &data);
 out2:
+	hfs_end_transaction(hfsmp);
 	if (iterator)
 		FREE(iterator, M_TEMP);
 	if (hfsmp->hfc_filevp) {
@@ -712,7 +716,6 @@ out2:
 /*
  * Use sync to perform occasional background work.
  */
-__private_extern__
 int
 hfs_hotfilesync(struct hfsmount *hfsmp, vfs_context_t ctx)
 {
@@ -759,7 +762,6 @@ hfs_hotfilesync(struct hfsmount *hfsmp, vfs_context_t ctx)
  *
  * Note: the cnode is locked on entry.
  */
-__private_extern__
 int
 hfs_addhotfile(struct vnode *vp)
 {
@@ -847,7 +849,6 @@ hfs_addhotfile_internal(struct vnode *vp)
 *
 * Note: the cnode is locked on entry.
 */
-__private_extern__
 int
 hfs_removehotfile(struct vnode *vp)
 {
@@ -1128,7 +1129,7 @@ hotfiles_adopt(struct hfsmount *hfsmp)
 		/*
 		 * Acquire a vnode for this file.
 		 */
-		error = hfs_vget(hfsmp, listp->hfl_hotfile[i].hf_fileid, &vp, 0);
+		error = hfs_vget(hfsmp, listp->hfl_hotfile[i].hf_fileid, &vp, 0, 0);
 		if (error) {
 			if (error == ENOENT) {
 				error = 0;
@@ -1350,7 +1351,7 @@ hotfiles_evict(struct hfsmount *hfsmp, vfs_context_t ctx)
 		/*
 		 * Acquire the vnode for this file.
 		 */
-		error = hfs_vget(hfsmp, key->fileID, &vp, 0);
+		error = hfs_vget(hfsmp, key->fileID, &vp, 0, 0);
 		if (error) {
 			if (error == ENOENT) {
 				goto delete;	/* stale entry, go to next */
@@ -1684,6 +1685,7 @@ hfc_btree_open(struct hfsmount *hfsmp, struct vnode **vpp)
 	int  error;
 	int  retry = 0;
 	int lockflags;
+	int newvnode_flags = 0;
 
 	*vpp = NULL;
 	p = current_proc();
@@ -1705,7 +1707,8 @@ hfc_btree_open(struct hfsmount *hfsmp, struct vnode **vpp)
 	}
 again:
 	cdesc.cd_flags |= CD_ISMETA;
-	error = hfs_getnewvnode(hfsmp, NULL, NULL, &cdesc, 0, &cattr, &cfork, &vp);
+	error = hfs_getnewvnode(hfsmp, NULL, NULL, &cdesc, 0, &cattr,
+							&cfork, &vp, &newvnode_flags);
 	if (error) {
 		printf("hfs: hfc_btree_open: hfs_getnewvnode error %d\n", error);
 		cat_releasedesc(&cdesc);
@@ -1757,7 +1760,7 @@ hfc_btree_close(struct hfsmount *hfsmp, struct vnode *vp)
 
 
 	if (hfsmp->jnl) {
-		hfs_journal_flush(hfsmp);
+		hfs_journal_flush(hfsmp, FALSE);
 	}
 
 	if (vnode_get(vp) == 0) {
@@ -1814,6 +1817,11 @@ hfc_btree_create(struct hfsmount *hfsmp, unsigned int nodesize, unsigned int ent
 	VATTR_SET(&va, va_uid, 0);
 	VATTR_SET(&va, va_gid, 0);
 
+	if (hfs_start_transaction(hfsmp) != 0) {
+		error = EINVAL;
+		goto out;
+	}
+
 	/* call ourselves directly, ignore the higher-level VFS file creation code */
 	error = VNOP_CREATE(dvp, &vp, &cname, &va, ctx);
 	if (error) {
@@ -1941,6 +1949,7 @@ hfc_btree_create(struct hfsmount *hfsmp, unsigned int nodesize, unsigned int ent
 		kmem_free(kernel_map, (vm_offset_t)buffer, nodesize);
 	}
 out:
+	hfs_end_transaction(hfsmp);
 	if (dvp) {
 		vnode_put(dvp);
 	}
diff --git a/bsd/hfs/hfs_kdebug.h b/bsd/hfs/hfs_kdebug.h
new file mode 100644
index 000000000..5dd5d6a9c
--- /dev/null
+++ b/bsd/hfs/hfs_kdebug.h
@@ -0,0 +1,54 @@
+#include <sys/kdebug.h>
+
+/*
+ * KERNEL_DEBUG related definitions for HFS.
+ *
+ * NOTE: The Class DBG_FSYSTEM = 3, and Subclass DBG_HFS = 8, so these
+ * debug codes are of the form 0x0308nnnn.
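
The HFSDBG_CODE macro defined next packs kdebug's standard layout, (class << 24) | (subclass << 16) | (code << 2), with the low two bits reserved for DBG_FUNC_START/DBG_FUNC_END; that is why consecutive codes step by 4 in the table that follows. Reproducing the arithmetic as a self-contained check:

    #include <stdint.h>
    #include <stdio.h>

    #define DBG_FSYSTEM 3
    #define DBG_HFS     8

    /* same packing FSDBG_CODE/KDBG_CODE perform in sys/kdebug.h */
    static uint32_t hfsdbg_code(uint32_t code) {
        return ((uint32_t)DBG_FSYSTEM << 24) |
               ((uint32_t)DBG_HFS << 16) |
               (code << 2);
    }

    int main(void) {
        printf("HFSDBG_UNMAP_FREE     = 0x%08x\n", hfsdbg_code(0)); /* 0x03080000 */
        printf("HFSDBG_BLOCK_ALLOCATE = 0x%08x\n", hfsdbg_code(4)); /* 0x03080010 */
        return 0;
    }
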
+ */ +#define HFSDBG_CODE(code) FSDBG_CODE(DBG_HFS, code) + +enum { + HFSDBG_UNMAP_FREE = HFSDBG_CODE(0), /* 0x03080000 */ + HFSDBG_UNMAP_ALLOC = HFSDBG_CODE(1), /* 0x03080004 */ + HFSDBG_UNMAP_CALLBACK = HFSDBG_CODE(2), /* 0x03080008 */ + /* 0x0308000C is unused */ + HFSDBG_BLOCK_ALLOCATE = HFSDBG_CODE(4), /* 0x03080010 */ + HFSDBG_BLOCK_DEALLOCATE = HFSDBG_CODE(5), /* 0x03080014 */ + HFSDBG_READ_BITMAP_BLOCK = HFSDBG_CODE(6), /* 0x03080018 */ + HFSDBG_RELEASE_BITMAP_BLOCK = HFSDBG_CODE(7), /* 0x0308001C */ + HFSDBG_ALLOC_CONTIG_BITMAP = HFSDBG_CODE(8), /* 0x03080020 */ + HFSDBG_ALLOC_ANY_BITMAP = HFSDBG_CODE(9), /* 0x03080024 */ + HFSDBG_ALLOC_KNOWN_BITMAP = HFSDBG_CODE(10), /* 0x03080028 */ + HFSDBG_MARK_ALLOC_BITMAP = HFSDBG_CODE(11), /* 0x0308002C */ + HFSDBG_MARK_FREE_BITMAP = HFSDBG_CODE(12), /* 0x03080030 */ + HFSDBG_BLOCK_FIND_CONTIG = HFSDBG_CODE(13), /* 0x03080034 */ + HFSDBG_IS_ALLOCATED = HFSDBG_CODE(14), /* 0x03080038 */ + /* 0x0308003C is unused */ + HFSDBG_RESET_EXTENT_CACHE = HFSDBG_CODE(16), /* 0x03080040 */ + HFSDBG_REMOVE_EXTENT_CACHE = HFSDBG_CODE(17), /* 0x03080044 */ + HFSDBG_ADD_EXTENT_CACHE = HFSDBG_CODE(18), /* 0x03080048 */ +}; + +/* + Parameters logged by the above + EVENT CODE DBG_FUNC_START arg1, arg2, arg3, arg4 ... DBG_FUNC_END arg1, arg2, arg3, arg4 + --------------------------- + HFSDBG_UNMAP_CALLBACK 0, extentCount, 0, 0 ... 0, 0, 0, 0 + HFSDBG_UNMAP_FREE startBlock, blockCount, 0, 0 ... err, 0, 0, 0 + HFSDBG_UNMAP_ALLOC startBlock, blockCount, 0, 0 ... err, 0, 0, 0 + HFSDBG_REMOVE_EXTENT_CACHE startBlock, blockCount, 0, 0 ... 0, 0, 0, 0 + HFSDBG_ADD_EXTENT_CACHE startBlock, blockCount, 0, 0 ... err, 0, 0, 0 + HFSDBG_MARK_ALLOC_BITMAP startBlock, blockCount, 0, 0 ... err, 0, 0, 0 + HFSDBG_MARK_FREE_BITMAP startBlock, blockCount, valid, 0 ... err, 0, 0, 0 + HFSDBG_BLOCK_DEALLOCATE startBlock, blockCount, flags, 0 ... err, 0, 0, 0 + HFSDBG_IS_ALLOCATED startBlock, blockCount, stop, 0 ... err, 0, actualBlockCount, 0 + HFSDBG_BLOCK_ALLOCATE startBlock, minBlocks, maxBlocks, flags ... err, actualStartBlock, actualBlockCount, 0 + HFSDBG_ALLOC_CONTIG_BITMAP startBlock, minBlocks, maxBlocks, useMeta ... err, actualStartBlock, actualBlockCount, 0 + HFSDBG_ALLOC_ANY_BITMAP startBlock, endBlock, maxBlocks, useMeta ... err, actualStartBlock, actualBlockCount, 0 + HFSDBG_ALLOC_KNOWN_BITMAP 0, 0, maxBlocks, 0 ... err, actualStartBlock, actualBlockCount, 0 + HFSDBG_BLOCK_FIND_CONTIG startBlock, endBlock, minBlocks, maxBlocks ... err, actualStartBlock, actualBlockCount, 0 + HFSDBG_READ_BITMAP_BLOCK startBlock, 0, 0, 0 ... err, 0, 0, 0 + HFSDBG_RELEASE_BITMAP_BLOCK dirty, 0, 0, 0 ... 0, 0, 0, 0 + HFSDBG_RESET_EXTENT_CACHE 0, 0, 0, 0 ... 0, 0, 0, 0 +*/ diff --git a/bsd/hfs/hfs_link.c b/bsd/hfs/hfs_link.c index 878c9def0..d24a92011 100644 --- a/bsd/hfs/hfs_link.c +++ b/bsd/hfs/hfs_link.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2008 Apple Inc. All rights reserved. + * Copyright (c) 1999-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -35,6 +35,7 @@ #include <sys/vnode.h> #include <vfs/vfs_support.h> #include <libkern/libkern.h> +#include <sys/fsctl.h> #include "hfs.h" #include "hfs_catalog.h" @@ -61,6 +62,8 @@ const char *hfs_private_names[] = { static int setfirstlink(struct hfsmount * hfsmp, cnid_t fileid, cnid_t firstlink); static int getfirstlink(struct hfsmount * hfsmp, cnid_t fileid, cnid_t *firstlink); +int hfs_makelink(struct hfsmount *hfsmp, struct vnode *src_vp, struct cnode *cp, + struct cnode *dcp, struct componentname *cnp); /* * Create a new catalog link record * @@ -92,7 +95,7 @@ createindirectlink(struct hfsmount *hfsmp, u_int32_t linknum, struct cat_desc *d /* Links are matched to inodes by link ID and to volumes by create date */ attr.ca_linkref = linknum; - attr.ca_itime = hfsmp->hfs_itime; + attr.ca_itime = hfsmp->hfs_metadata_createdate; attr.ca_mode = S_IFREG | S_IRUSR | S_IRGRP | S_IROTH; attr.ca_recflags = kHFSHasLinkChainMask | kHFSThreadExistsMask; attr.ca_flags = UF_IMMUTABLE; @@ -121,13 +124,15 @@ createindirectlink(struct hfsmount *hfsmp, u_int32_t linknum, struct cat_desc *d /* * Make a link to the cnode cp in the directory dp - * using the name in cnp. + * using the name in cnp. src_vp is the vnode that + * corresponds to 'cp' which was part of the arguments to + * hfs_vnop_link. * * The cnodes cp and dcp must be locked. */ -static int -hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp, - struct componentname *cnp) +int +hfs_makelink(struct hfsmount *hfsmp, struct vnode *src_vp, struct cnode *cp, + struct cnode *dcp, struct componentname *cnp) { vfs_context_t ctx = cnp->cn_context; struct proc *p = vfs_context_proc(ctx); @@ -291,7 +296,7 @@ hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp, /* Update the original first link to point back to the new first link. */ if (cp->c_attr.ca_recflags & kHFSHasLinkChainMask) { - (void) cat_updatelink(hfsmp, orig_firstlink, linkcnid, HFS_IGNORABLE_LINK); + (void) cat_update_siblinglinks(hfsmp, orig_firstlink, linkcnid, HFS_IGNORABLE_LINK); /* Update the inode's first link value. */ if (type == DIR_HARDLINKS) { @@ -327,17 +332,46 @@ hfs_makelink(struct hfsmount *hfsmp, struct cnode *cp, struct cnode *dcp, panic("hfs_makelink: cat_update of privdir failed! (%d)\n", retval); } cp->c_flag |= C_HARDLINK; + + /* + * Now we need to mark the vnodes as being hardlinks via the vnode_setmultipath call. + * Note that we're calling vnode_get here, which should simply add an iocount if possible, without + * doing much checking. It's safe to call this because we are protected by the cnode lock, which + * ensures that anyone trying to reclaim it will block until we release it. vnode_get will usually + * give us an extra iocount, unless the vnode is about to be reclaimed (and has no iocounts). + * In that case, we'd error out, but we'd also not care if we added the VISHARDLINK bit to the vnode. + * + * As for the iocount we're about to add, we can't necessarily always call vnode_put here. + * If the one we add is the only iocount on the vnode, and there was + * sufficient vnode pressure, it could go through VNOP_INACTIVE immediately, which would + * require the cnode lock and cause us to double-lock panic. We can only call vnode_put if we know + * that the vnode we're operating on is the one with which we came into hfs_vnop_link, because + * that means VFS took an iocount on it for us. 
If it's *not* the one that we came into the call + * with, then mark it as NEED_VNODE_PUT to have hfs_unlock drop it for us. hfs_vnop_link will + * unlock the cnode when it is finished. + */ if ((vp = cp->c_vp) != NULLVP) { - if (vnode_get(vp) == 0) { - vnode_setmultipath(vp); - vnode_put(vp); - } + if (vnode_get(vp) == 0) { + vnode_setmultipath(vp); + if (vp == src_vp) { + /* we have an iocount on data fork vnode already. */ + vnode_put(vp); + } + else { + cp->c_flag |= C_NEED_DVNODE_PUT; + } + } } if ((vp = cp->c_rsrc_vp) != NULLVP) { - if (vnode_get(vp) == 0) { - vnode_setmultipath(vp); - vnode_put(vp); - } + if (vnode_get(vp) == 0) { + vnode_setmultipath(vp); + if (vp == src_vp) { + vnode_put(vp); + } + else { + cp->c_flag |= C_NEED_RVNODE_PUT; + } + } } cp->c_touch_chgtime = TRUE; cp->c_flag |= C_FORCEUPDATE; @@ -364,7 +398,6 @@ out: * IN struct componentname *a_cnp; * IN vfs_context_t a_context; */ -__private_extern__ int hfs_vnop_link(struct vnop_link_args *ap) { @@ -408,7 +441,7 @@ hfs_vnop_link(struct vnop_link_args *ap) return (EPERM); } /* Directory hardlinks also need the parent of the original directory. */ - if ((error = hfs_vget(hfsmp, hfs_currentparent(VTOC(vp)), &fdvp, 1))) { + if ((error = hfs_vget(hfsmp, hfs_currentparent(VTOC(vp)), &fdvp, 1, 0))) { return (error); } } else { @@ -423,6 +456,10 @@ hfs_vnop_link(struct vnop_link_args *ap) } return (ENOSPC); } + + check_for_tracked_file(vp, VTOC(vp)->c_ctime, NAMESPACE_HANDLER_LINK_CREATE, NULL); + + /* Lock the cnodes. */ if (fdvp) { if ((error = hfs_lockfour(VTOC(tdvp), VTOC(vp), VTOC(fdvp), NULL, HFS_EXCLUSIVE_LOCK, NULL))) { @@ -543,7 +580,7 @@ hfs_vnop_link(struct vnop_link_args *ap) cp->c_linkcount++; cp->c_touch_chgtime = TRUE; - error = hfs_makelink(hfsmp, cp, tdcp, cnp); + error = hfs_makelink(hfsmp, vp, cp, tdcp, cnp); if (error) { cp->c_linkcount--; hfs_volupdate(hfsmp, VOL_UPDATE, 0); @@ -634,7 +671,6 @@ out: * * Note: dvp and vp cnodes are already locked. */ -__private_extern__ int hfs_unlink(struct hfsmount *hfsmp, struct vnode *dvp, struct vnode *vp, struct componentname *cnp, int skip_reserve) { @@ -806,11 +842,11 @@ hfs_unlink(struct hfsmount *hfsmp, struct vnode *dvp, struct vnode *vp, struct c } /* Update previous link. */ if (prevlinkid) { - (void) cat_updatelink(hfsmp, prevlinkid, HFS_IGNORABLE_LINK, nextlinkid); + (void) cat_update_siblinglinks(hfsmp, prevlinkid, HFS_IGNORABLE_LINK, nextlinkid); } /* Update next link. */ if (nextlinkid) { - (void) cat_updatelink(hfsmp, nextlinkid, prevlinkid, HFS_IGNORABLE_LINK); + (void) cat_update_siblinglinks(hfsmp, nextlinkid, prevlinkid, HFS_IGNORABLE_LINK); } } @@ -860,7 +896,6 @@ out: * * This call is assumed to be made during mount. */ -__private_extern__ void hfs_privatedir_init(struct hfsmount * hfsmp, enum privdirtype type) { @@ -909,7 +944,7 @@ hfs_privatedir_init(struct hfsmount * hfsmp, enum privdirtype type) } /* Grab the root directory so we can update it later. 
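
The hardlink chain manipulated here is effectively a doubly-linked list keyed by cnid: hfs_unlink splices a link out by repointing its neighbors with two cat_update_siblinglinks calls, and when the head of the chain is removed, the inode's first-link attribute is advanced. The splice in miniature (types and the helper are illustrative, not the kernel's):

    #include <stdint.h>

    typedef uint32_t cnid_sketch_t;

    struct link_rec {                 /* stand-in for a catalog link record */
        cnid_sketch_t prev, next;     /* sibling-link cnids; 0 == none */
    };

    /* Two neighbor updates plus an optional head-pointer update, the
     * same shape as the cat_update_siblinglinks calls above. */
    static void unlink_sibling(struct link_rec *prev, struct link_rec *next,
                               const struct link_rec *victim,
                               cnid_sketch_t *firstlink) {
        if (prev) prev->next = victim->next;  /* update previous link */
        if (next) next->prev = victim->prev;  /* update next link */
        if (firstlink && !prev)
            *firstlink = victim->next;        /* victim was the chain head */
    }

Because each update is an independent catalog write, the kernel code tolerates partial failure (the casts to void), relying on fsck-style repair rather than atomicity across the two records.
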
*/ - if (hfs_vget(hfsmp, kRootDirID, &dvp, 0) != 0) { + if (hfs_vget(hfsmp, kRootDirID, &dvp, 0, 0) != 0) { goto exit; } dcp = VTOC(dvp); @@ -965,7 +1000,7 @@ hfs_privatedir_init(struct hfsmount * hfsmp, enum privdirtype type) goto exit; } if (type == FILE_HARDLINKS) { - hfsmp->hfs_metadata_createdate = hfsmp->hfs_itime; + hfsmp->hfs_metadata_createdate = priv_attrp->ca_itime; } hfs_volupdate(hfsmp, VOL_MKDIR, 1); exit: @@ -985,9 +1020,8 @@ exit: /* * Lookup a hardlink link (from chain) */ -__private_extern__ int -hfs_lookuplink(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid, cnid_t *nextlinkid) +hfs_lookup_siblinglinks(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid, cnid_t *nextlinkid) { int lockflags; int error; @@ -997,7 +1031,7 @@ hfs_lookuplink(struct hfsmount *hfsmp, cnid_t linkfileid, cnid_t *prevlinkid, c lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); - error = cat_lookuplinkbyid(hfsmp, linkfileid, prevlinkid, nextlinkid); + error = cat_lookup_siblinglinks(hfsmp, linkfileid, prevlinkid, nextlinkid); if (error == ENOLINK) { hfs_systemfile_unlock(hfsmp, lockflags); lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); diff --git a/bsd/hfs/hfs_lookup.c b/bsd/hfs/hfs_lookup.c index c82e68cb4..13cb1aa48 100644 --- a/bsd/hfs/hfs_lookup.c +++ b/bsd/hfs/hfs_lookup.c @@ -164,8 +164,10 @@ hfs_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, int struct cat_attr attr; struct cat_fork fork; int lockflags; + int newvnode_flags; retry: + newvnode_flags = 0; dcp = NULL; hfsmp = VTOHFS(dvp); *vpp = NULL; @@ -227,8 +229,16 @@ hfs_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, int * Note: We must drop the parent lock here before calling * hfs_getnewvnode (which takes the child lock). */ - hfs_unlock(dcp); - dcp = NULL; + hfs_unlock(dcp); + dcp = NULL; + + /* Verify that the item just looked up isn't one of the hidden directories. */ + if (desc.cd_cnid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid || + desc.cd_cnid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { + retval = ENOENT; + goto exit; + } + goto found; } notfound: @@ -301,37 +311,14 @@ found: * Directory hard links can have multiple parents so * find the appropriate parent for the current thread. */ - if ((retval = hfs_vget(hfsmp, hfs_currentparent(VTOC(dvp)), &tvp, 0))) { + if ((retval = hfs_vget(hfsmp, hfs_currentparent(VTOC(dvp)), &tvp, 0, 0))) { goto exit; } *cnode_locked = 1; *vpp = tvp; } else { int type = (attr.ca_mode & S_IFMT); -#if NAMEDRSRCFORK - int rsrc_warn = 0; - /* - * Check if caller wants the resource fork but utilized - * the legacy "file/rsrc" access path. - * - * This is deprecated behavior and support for it will not - * be allowed beyond case insensitive HFS+ and even that - * support will be removed in the next major OS release. 
- */ - if ((type == S_IFREG) && - ((flags & ISLASTCN) == 0) && - (cnp->cn_nameptr[cnp->cn_namelen] == '/') && - (bcmp(&cnp->cn_nameptr[cnp->cn_namelen+1], "rsrc", 5) == 0) && - ((hfsmp->hfs_flags & (HFS_STANDARD | HFS_CASE_SENSITIVE)) == 0)) { - - cnp->cn_consume = 5; - cnp->cn_flags |= CN_WANTSRSRCFORK | ISLASTCN | NOCACHE; - cnp->cn_flags &= ~MAKEENTRY; - flags |= ISLASTCN; - rsrc_warn = 1; - } -#endif if (!(flags & ISLASTCN) && (type != S_IFDIR) && (type != S_IFLNK)) { retval = ENOTDIR; goto exit; @@ -344,22 +331,65 @@ found: if (cnp->cn_namelen != desc.cd_namelen) cnp->cn_flags &= ~MAKEENTRY; - retval = hfs_getnewvnode(hfsmp, dvp, cnp, &desc, 0, &attr, &fork, &tvp); + retval = hfs_getnewvnode(hfsmp, dvp, cnp, &desc, 0, &attr, &fork, &tvp, &newvnode_flags); if (retval) { /* - * If this was a create operation lookup and another - * process removed the object before we had a chance - * to create the vnode, then just treat it as the not - * found case above and return EJUSTRETURN. - * We should do the same for the RENAME operation since we are - * going to write it in regardless. - */ + * If this was a create/rename operation lookup, then by this point + * we expected to see the item returned from hfs_getnewvnode above. + * In the create case, it would probably eventually bubble out an EEXIST + * because the item existed when we were trying to create it. In the + * rename case, it would let us know that we need to go ahead and + * delete it as part of the rename. However, if we hit the condition below + * then it means that we found the element during cat_lookup above, but + * it is now no longer there. We simply behave as though we never found + * the element at all and return EJUSTRETURN. + */ if ((retval == ENOENT) && - ((cnp->cn_nameiop == CREATE) || (cnp->cn_nameiop == RENAME)) && - (flags & ISLASTCN)) { + ((cnp->cn_nameiop == CREATE) || (cnp->cn_nameiop == RENAME)) && + (flags & ISLASTCN)) { retval = EJUSTRETURN; } + + /* + * If this was a straight lookup operation, we may need to redrive the entire + * lookup starting from cat_lookup if the element was deleted as the result of + * a rename operation. Since rename is supposed to guarantee atomicity, then + * lookups cannot fail because the underlying element is deleted as a result of + * the rename call -- either they returned the looked up element prior to rename + * or return the newer element. If we are in this region, then all we can do is add + * workarounds to guarantee the latter case. The element has already been deleted, so + * we just re-try the lookup to ensure the caller gets the most recent element. + */ + if ((retval == ENOENT) && (cnp->cn_nameiop == LOOKUP) && + (newvnode_flags & (GNV_CHASH_RENAMED | GNV_CAT_DELETED))) { + if (dcp) { + hfs_unlock (dcp); + } + /* get rid of any name buffers that may have lingered from the cat_lookup call */ + cat_releasedesc (&desc); + goto retry; + } + + /* Also, re-drive the lookup if the item we looked up was a hardlink, and the number + * or name of hardlinks has changed in the interim between the cat_lookup above, and + * our call to hfs_getnewvnode. hfs_getnewvnode will validate the cattr we passed it + * against what is actually in the catalog after the cnode is created. If there were + * any issues, it will bubble out ERECYCLE, which we need to swallow and use as the + * key to redrive as well. We need to special case this below because in this case, + * it needs to occur regardless of the type of lookup we're doing here. 
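
Both redrive cases described in this comment and implemented just below share one shape: treat a specific error as "the world changed between the catalog lookup and vnode creation", release everything acquired this iteration, and restart the lookup from the top. Generically (ERECYCLE's numeric value here is an assumption; it is a kernel-private errno in xnu, not part of the public errno set):

    #include <errno.h>

    #ifndef ERECYCLE
    #define ERECYCLE (-5)   /* assumption: xnu's kernel-private "redrive" errno */
    #endif

    /* Generic redrive loop: on a transient race, drop per-iteration
     * state (e.g. cat_releasedesc + unlocking the parent) and retry.
     * Like the kernel code, this retries without an iteration cap,
     * on the argument that renames cannot starve a lookup forever. */
    static int lookup_with_redrive(int (*try_lookup)(void *), void *state,
                                   void (*release)(void *)) {
        for (;;) {
            int err = try_lookup(state);
            if (err != ENOENT && err != ERECYCLE)
                return err;       /* success or a hard failure */
            release(state);
        }
    }
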
+ */ + if ((retval == ERECYCLE) && (newvnode_flags & GNV_CAT_ATTRCHANGED)) { + if (dcp) { + hfs_unlock (dcp); + } + /* get rid of any name buffers that may have lingered from the cat_lookup call */ + cat_releasedesc (&desc); + retval = 0; + goto retry; + } + + /* skip to the error-handling code if we can't retry */ goto exit; } @@ -375,15 +405,6 @@ found: } *cnode_locked = 1; *vpp = tvp; -#if NAMEDRSRCFORK - if (rsrc_warn) { - if ((VTOC(tvp)->c_flag & C_WARNED_RSRC) == 0) { - VTOC(tvp)->c_flag |= C_WARNED_RSRC; - printf("hfs: %.200s: file access by '/rsrc' was deprecated in 10.4\n", - cnp->cn_nameptr); - } - } -#endif } exit: if (dcp) { @@ -415,7 +436,6 @@ exit: #define S_IXALL 0000111 -__private_extern__ int hfs_vnop_lookup(struct vnop_lookup_args *ap) { @@ -423,6 +443,7 @@ hfs_vnop_lookup(struct vnop_lookup_args *ap) struct vnode *vp; struct cnode *cp; struct cnode *dcp; + struct hfsmount *hfsmp; int error; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; @@ -431,6 +452,8 @@ hfs_vnop_lookup(struct vnop_lookup_args *ap) *vpp = NULL; dcp = VTOC(dvp); + + hfsmp = VTOHFS(dvp); /* * Lookup an entry in the cache @@ -455,14 +478,24 @@ hfs_vnop_lookup(struct vnop_lookup_args *ap) */ error = 0; vp = *vpp; - + cp = VTOC(vp); + + /* We aren't allowed to vend out vp's via lookup to the hidden directory */ + if (cp->c_cnid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid || + cp->c_cnid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) { + /* Drop the iocount from cache_lookup */ + vnode_put (vp); + error = ENOENT; + goto exit; + } + + /* * If this is a hard-link vnode then we need to update * the name (of the link), the parent ID, the cnid, the * text encoding and the catalog hint. This enables * getattrlist calls to return the correct link info. */ - cp = VTOC(vp); if ((flags & ISLASTCN) && (cp->c_flag & C_HARDLINK)) { hfs_lock(cp, HFS_FORCE_LOCK); @@ -501,33 +534,7 @@ hfs_vnop_lookup(struct vnop_lookup_args *ap) } hfs_unlock(cp); } -#if NAMEDRSRCFORK - /* - * Check if caller wants the resource fork but utilized - * the legacy "file/rsrc" access path. - * - * This is deprecated behavior and support for it will not - * be allowed beyond case insensitive HFS+ and even that - * support will be removed in the next major OS release. 
- */ - if ((dvp != vp) && - ((flags & ISLASTCN) == 0) && - vnode_isreg(vp) && - (cnp->cn_nameptr[cnp->cn_namelen] == '/') && - (bcmp(&cnp->cn_nameptr[cnp->cn_namelen+1], "rsrc", 5) == 0) && - ((VTOHFS(vp)->hfs_flags & (HFS_STANDARD | HFS_CASE_SENSITIVE)) == 0)) { - cnp->cn_consume = 5; - cnp->cn_flags |= CN_WANTSRSRCFORK | ISLASTCN | NOCACHE; - cnp->cn_flags &= ~MAKEENTRY; - hfs_lock(cp, HFS_FORCE_LOCK); - if ((cp->c_flag & C_WARNED_RSRC) == 0) { - cp->c_flag |= C_WARNED_RSRC; - printf("hfs: %.200s: file access by '/rsrc' was deprecated in 10.4\n", cnp->cn_nameptr); - } - hfs_unlock(cp); - } -#endif return (error); lookup: diff --git a/bsd/hfs/hfs_mount.h b/bsd/hfs/hfs_mount.h index 5782bd6f6..ca4f8703f 100644 --- a/bsd/hfs/hfs_mount.h +++ b/bsd/hfs/hfs_mount.h @@ -79,6 +79,7 @@ struct hfs_mount_args { #define HFS_GET_JOURNAL_INFO 0x6a6e6c69 #define HFS_SET_PKG_EXTENSIONS 0x121031 #define HFS_REPLAY_JOURNAL 0x6a6e6c72 +#define HFS_ENABLE_RESIZE_DEBUG 4 /* enable debug code for volume resizing */ #endif /* __APPLE_API_UNSTABLE */ diff --git a/bsd/hfs/hfs_notification.c b/bsd/hfs/hfs_notification.c index 517c8ecdc..227e744b2 100644 --- a/bsd/hfs/hfs_notification.c +++ b/bsd/hfs/hfs_notification.c @@ -32,7 +32,9 @@ #include <sys/dirent.h> #include <sys/stat.h> #include <sys/mount.h> +#include <sys/mount_internal.h> #include <sys/vnode.h> +#include <sys/vnode_internal.h> #include <sys/malloc.h> #include <sys/ubc.h> #include <sys/quota.h> @@ -71,10 +73,24 @@ void hfs_generate_volume_notifications(struct hfsmount *hfsmp) } if (state == 2 && !(hfsmp->hfs_notification_conditions & VQ_VERYLOWDISK)) { + /* Dump some logging to track down intermittent issues */ + printf("HFS: Very Low Disk: freeblks: %d, dangerlimit: %d\n", freeblks, hfsmp->hfs_freespace_notify_dangerlimit); +#if HFS_SPARSE_DEV + if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { + if (hfsmp->hfs_backingfs_rootvp) { + struct mount *mp = vnode_mount (hfsmp->hfs_backingfs_rootvp); + /* If we're a sparse device, dump some info about the backing store... */ + if (mp) { + printf("HFS: Very Low Disk: backingstore b_avail %lld, tag %d\n", mp->mnt_vfsstat.f_bavail, hfsmp->hfs_backingfs_rootvp->v_tag); + } + } + } +#endif hfsmp->hfs_notification_conditions |= (VQ_VERYLOWDISK|VQ_LOWDISK); vfs_event_signal(&fsid, hfsmp->hfs_notification_conditions, (intptr_t)NULL); } else if (state == 1) { if (!(hfsmp->hfs_notification_conditions & VQ_LOWDISK)) { + printf("HFS: Low Disk: freeblks: %d, warninglimit: %d\n", freeblks, hfsmp->hfs_freespace_notify_warninglimit); hfsmp->hfs_notification_conditions |= VQ_LOWDISK; vfs_event_signal(&fsid, hfsmp->hfs_notification_conditions, (intptr_t)NULL); } else if (hfsmp->hfs_notification_conditions & VQ_VERYLOWDISK) { diff --git a/bsd/hfs/hfs_readwrite.c b/bsd/hfs/hfs_readwrite.c index 9fcd6a02d..27901f5de 100644 --- a/bsd/hfs/hfs_readwrite.c +++ b/bsd/hfs/hfs_readwrite.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -90,8 +90,7 @@ static int do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skip, int flush_cache_on_write = 0; -SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files"); - +SYSCTL_INT (_kern, OID_AUTO, flush_cache_on_write, CTLFLAG_RW | CTLFLAG_LOCKED, &flush_cache_on_write, 0, "always flush the drive cache on writes to uncached files"); /* * Read data from a file. @@ -109,6 +108,7 @@ hfs_vnop_read(struct vnop_read_args *ap) off_t start_resid = uio_resid(uio); off_t offset = uio_offset(uio); int retval = 0; + int took_truncate_lock = 0; /* Preflight checks */ if (!vnode_isreg(vp)) { @@ -147,6 +147,14 @@ hfs_vnop_read(struct vnop_read_args *ap) } /* otherwise the file was converted back to a regular file while we were reading it */ retval = 0; + } else if ((VTOC(vp)->c_flags & UF_COMPRESSED)) { + int error; + + error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP); + if (error) { + return error; + } + } } #endif /* HFS_COMPRESSION */ @@ -155,8 +163,15 @@ hfs_vnop_read(struct vnop_read_args *ap) fp = VTOF(vp); hfsmp = VTOHFS(vp); +#if CONFIG_PROTECT + if ((retval = cp_handle_vnop (cp, CP_READ_ACCESS)) != 0) { + goto exit; + } +#endif + /* Protect against a size change. */ - hfs_lock_truncate(cp, 0); + hfs_lock_truncate(cp, HFS_SHARED_LOCK); + took_truncate_lock = 1; filesize = fp->ff_size; filebytes = (off_t)fp->ff_blocks * (off_t)hfsmp->blockSize; @@ -209,7 +224,10 @@ hfs_vnop_read(struct vnop_read_args *ap) hfs_unlock(cp); } exit: - hfs_unlock_truncate(cp, 0); + if (took_truncate_lock) { + hfs_unlock_truncate(cp, 0); + } + return (retval); } @@ -238,7 +256,9 @@ hfs_vnop_write(struct vnop_write_args *ap) int lockflags; int cnode_locked = 0; int partialwrite = 0; - int exclusive_lock = 0; + int do_snapshot = 1; + time_t orig_ctime=VTOC(vp)->c_ctime; + int took_truncate_lock = 0; #if HFS_COMPRESSION if ( hfs_file_is_compressed(VTOC(vp), 1) ) { /* 1 == don't take the cnode lock */ @@ -247,23 +267,34 @@ hfs_vnop_write(struct vnop_write_args *ap) case FILE_IS_COMPRESSED: return EACCES; case FILE_IS_CONVERTING: - /* if FILE_IS_CONVERTING, we allow writes */ + /* if FILE_IS_CONVERTING, we allow writes but do not + bother with snapshots or else we will deadlock. + */ + do_snapshot = 0; break; default: printf("invalid state %d for compressed file\n", state); /* fall through */ } + } else if ((VTOC(vp)->c_flags & UF_COMPRESSED)) { + int error; + + error = check_for_dataless_file(vp, NAMESPACE_HANDLER_WRITE_OP); + if (error != 0) { + return error; + } } + + if (do_snapshot) { + check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_WRITE_OP, uio); + } + #endif // LP64todo - fix this! uio_resid may be 64-bit value resid = uio_resid(uio); offset = uio_offset(uio); - if (ioflag & IO_APPEND) { - exclusive_lock = 1; - } - if (offset < 0) return (EINVAL); if (resid == 0) @@ -275,8 +306,14 @@ hfs_vnop_write(struct vnop_write_args *ap) fp = VTOF(vp); hfsmp = VTOHFS(vp); +#if CONFIG_PROTECT + if ((retval = cp_handle_vnop (cp, CP_WRITE_ACCESS)) != 0) { + goto exit; + } +#endif + eflags = kEFDeferMask; /* defer file block allocations */ -#ifdef HFS_SPARSE_DEV +#if HFS_SPARSE_DEV /* * When the underlying device is sparse and space * is low (< 8MB), stop doing delayed allocations @@ -291,8 +328,15 @@ hfs_vnop_write(struct vnop_write_args *ap) again: /* Protect against a size change. 
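
The CONFIG_PROTECT hooks added to this file (cp_handle_vnop with CP_READ_ACCESS or CP_WRITE_ACCESS) bottom out in cp_check_access from hfs_cprotect.c above, which gates by protection class only while the keybag is locked. The decision table, restated as a tiny pure function (class constants and op bits are stand-ins for the sys/cprotect.h definitions):

    #include <errno.h>
    #include <stdbool.h>

    enum { CLASS_A = 1, CLASS_B, CLASS_C, CLASS_D };  /* stand-ins */
    #define OP_READ  0x1                              /* stand-in op bits */
    #define OP_WRITE 0x2

    /* While locked: class A denies everything; class B denies reads but
     * still admits writes (new data can be wrapped without unwrapping
     * anything); all other classes pass through. */
    static int cp_gate(bool keys_locked, int pclass, int op) {
        if (!keys_locked) return 0;
        if (pclass == CLASS_A) return EPERM;
        if (pclass == CLASS_B && (op & OP_READ)) return EPERM;
        return 0;
    }
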
*/ - hfs_lock_truncate(cp, exclusive_lock); + if (ioflag & IO_APPEND) { + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK); + } + else { + hfs_lock_truncate(cp, HFS_SHARED_LOCK); + } + took_truncate_lock = 1; + /* Update UIO */ if (ioflag & IO_APPEND) { uio_setoffset(uio, fp->ff_size); offset = fp->ff_size; @@ -313,13 +357,16 @@ again: * grab the truncate lock exclusive even if we're not allocating new blocks * because we could still be growing past the LEOF. */ - if ((exclusive_lock == 0) && + if ((cp->c_truncatelockowner == HFS_SHARED_OWNER) && ((fp->ff_unallocblocks != 0) || (writelimit > origFileSize))) { - exclusive_lock = 1; /* Lock upgrade failed and we lost our shared lock, try again */ if (lck_rw_lock_shared_to_exclusive(&cp->c_truncatelock) == FALSE) { goto again; } + else { + /* Store the owner in the c_truncatelockowner field if we successfully upgrade */ + cp->c_truncatelockowner = current_thread(); + } } if ( (retval = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) { @@ -327,7 +374,7 @@ again: } cnode_locked = 1; - if (!exclusive_lock) { + if (cp->c_truncatelockowner == HFS_SHARED_OWNER) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 0)) | DBG_FUNC_START, (int)offset, uio_resid(uio), (int)fp->ff_size, (int)filebytes, 0); @@ -632,7 +679,10 @@ ioerr_exit: exit: if (cnode_locked) hfs_unlock(cp); - hfs_unlock_truncate(cp, exclusive_lock); + + if (took_truncate_lock) { + hfs_unlock_truncate(cp, 0); + } return (retval); } @@ -1004,7 +1054,7 @@ do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HF struct vnode *vp; /* get the vnode for this cnid */ - myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0); + myErr = hfs_vget(hfsmp, thisNodeID, &vp, 0, 0); if ( myErr ) { myResult = 0; goto ExitThisRoutine; @@ -1027,21 +1077,19 @@ do_access_check(struct hfsmount *hfsmp, int *err, struct access_cache *cache, HF } } else { unsigned int flags; - - myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, - cnattr.ca_mode, hfsmp->hfs_mp, - myp_ucred, theProcPtr); + int mode = cnattr.ca_mode & S_IFMT; + myPerms = DerivePermissionSummary(cnattr.ca_uid, cnattr.ca_gid, cnattr.ca_mode, hfsmp->hfs_mp,myp_ucred, theProcPtr); - if (cnattr.ca_mode & S_IFDIR) { - flags = R_OK | X_OK; - } else { - flags = R_OK; - } - if ( (myPerms & flags) != flags) { - myResult = 0; - myErr = EACCES; - goto ExitThisRoutine; /* no access */ - } + if (mode == S_IFDIR) { + flags = R_OK | X_OK; + } else { + flags = R_OK; + } + if ( (myPerms & flags) != flags) { + myResult = 0; + myErr = EACCES; + goto ExitThisRoutine; /* no access */ + } /* up the hierarchy we go */ thisNodeID = catkey.hfsPlus.parentID; @@ -1284,7 +1332,7 @@ do_bulk_access_check(struct hfsmount *hfsmp, struct vnode *vp, struct vnode *cvp; int myErr = 0; /* get the vnode for this cnid */ - myErr = hfs_vget(hfsmp, cnid, &cvp, 0); + myErr = hfs_vget(hfsmp, cnid, &cvp, 0, 0); if ( myErr ) { access[i] = myErr; continue; @@ -1432,6 +1480,15 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { is64bit = proc_is64bit(p); +#if CONFIG_PROTECT + { + int error = 0; + if ((error = cp_handle_vnop(VTOC(vp), CP_WRITE_ACCESS)) != 0) { + return error; + } + } +#endif /* CONFIG_PROTECT */ + switch (ap->a_command) { case HFS_GETPATH: @@ -1491,7 +1548,7 @@ hfs_vnop_ioctl( struct vnop_ioctl_args /* { if (linkfileid < kHFSFirstUserCatalogNodeID) { return (EINVAL); } - if ((error = hfs_lookuplink(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) { + if ((error = hfs_lookup_siblinglinks(hfsmp, linkfileid, &prevlinkid, &nextlinkid))) { return (error); } if (ap->a_command == HFS_NEXT_LINK) 
{ @@ -1591,7 +1648,7 @@ fail_change_next_allocation: return (error); } -#ifdef HFS_SPARSE_DEV +#if HFS_SPARSE_DEV case HFS_SETBACKINGSTOREINFO: { struct vnode * bsfs_rootvp; struct vnode * di_vp; @@ -1641,7 +1698,17 @@ fail_change_next_allocation: vnode_put(bsfs_rootvp); hfsmp->hfs_backingfs_rootvp = bsfs_rootvp; + hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE; + /* The free extent cache is managed differently for sparse devices. * There is a window between when the volume is mounted and when the + * device is marked as sparse, so the free extent cache for this + * volume is currently initialized as a normal volume (sorted by block + * count). Reset the cache so that it will be rebuilt again + * for a sparse device (sorted by start block). + */ + ResetVCBFreeExtCache(hfsmp); + hfsmp->hfs_sparsebandblks = bsdata->bandsize / HFSTOVCB(hfsmp)->blockSize; hfsmp->hfs_sparsebandblks *= 4; @@ -1717,14 +1784,18 @@ fail_change_next_allocation: // note: can't do this after taking the lock as it will // deadlock against ourselves. vnode_iterate(mp, 0, hfs_freezewrite_callback, NULL); - hfs_global_exclusive_lock_acquire(hfsmp); + hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); // DO NOT call hfs_journal_flush() because that takes a // shared lock on the global exclusive lock! - journal_flush(hfsmp->jnl); + journal_flush(hfsmp->jnl, TRUE); // don't need to iterate on all vnodes, we just need to // wait for writes to the system files and the device vnode + // + // Now that journal flush waits for all metadata blocks to + // be written out, waiting for btree writes is probably no + // longer required. if (HFSTOVCB(hfsmp)->extentsRefNum) vnode_waitforwrites(HFSTOVCB(hfsmp)->extentsRefNum, 0, 0, 0, "hfs freeze"); if (HFSTOVCB(hfsmp)->catalogRefNum) @@ -1756,7 +1827,7 @@ fail_change_next_allocation: // code that "thaws" the fs in hfs_vnop_close() // hfsmp->hfs_freezing_proc = NULL; - hfs_global_exclusive_lock_release(hfsmp); + hfs_unlock_global (hfsmp); lck_rw_unlock_exclusive(&hfsmp->hfs_insync); return (0); @@ -1794,30 +1865,6 @@ fail_change_next_allocation: return do_bulk_access_check(hfsmp, vp, ap, size, context); } - case HFS_SETACLSTATE: { - int state; - - if (ap->a_data == NULL) { - return (EINVAL); - } - - vfsp = vfs_statfs(HFSTOVFS(hfsmp)); - state = *(int *)ap->a_data; - - if (hfsmp->hfs_flags & HFS_READ_ONLY) { - return (EROFS); - } - // super-user can enable or disable acl's on a volume. - // the volume owner can only enable acl's - if (!is_suser() && (state == 0 || kauth_cred_getuid(cred) != vfsp->f_owner)) { - return (EPERM); - } - if (state == 0 || state == 1) - return hfs_set_volxattr(hfsmp, HFS_SETACLSTATE, state); - else - return (EINVAL); - } - case HFS_SET_XATTREXTENTS_STATE: { int state; @@ -1833,6 +1880,9 @@ fail_change_next_allocation: /* Super-user can enable or disable extent-based extended * attribute support on a volume + * Note: Starting with Mac OS X 10.7, extent-based extended attributes + * are enabled by default, so any change will be transient only + * until the volume is remounted. */ if (!is_suser()) { return (EPERM); } @@ -1891,7 +1941,7 @@ fail_change_next_allocation: fp = VTOF(vp); /* Protect against a size change.
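/*
 * Illustrative sketch (not part of this patch): the ordering that the freeze
 * path above must preserve, collected into one hypothetical helper. Every
 * call used here appears in the hunk; only the helper name is invented.
 */
static void
hfs_freeze_order_sketch(struct mount *mp, struct hfsmount *hfsmp)
{
	/* 1. Drain dirty file data first; doing this after taking the global
	 *    lock would deadlock against ourselves. */
	vnode_iterate(mp, 0, hfs_freezewrite_callback, NULL);

	/* 2. Take the global lock exclusive to block new transactions. */
	hfs_lock_global(hfsmp, HFS_EXCLUSIVE_LOCK);

	/* 3. Flush the journal and wait (second argument TRUE). Per the hunk's
	 *    own comment, hfs_journal_flush() must not be used here: it takes
	 *    the global lock shared and would deadlock against step 2. */
	journal_flush(hfsmp->jnl, TRUE);
}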
*/ - hfs_lock_truncate(VTOC(vp), TRUE); + hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK); #if HFS_COMPRESSION if (compressed && (uncompressed_size == -1)) { @@ -1910,7 +1960,7 @@ fail_change_next_allocation: error = advisory_read(vp, fp->ff_size, ra->ra_offset, ra->ra_count); } - hfs_unlock_truncate(VTOC(vp), TRUE); + hfs_unlock_truncate(VTOC(vp), 0); return (error); } @@ -1934,21 +1984,22 @@ fail_change_next_allocation: * to a user_fbootstraptransfer_t else we get a pointer to a * fbootstraptransfer_t which we munge into a user_fbootstraptransfer_t */ - if (hfsmp->hfs_flags & HFS_READ_ONLY) { + if ((hfsmp->hfs_flags & HFS_READ_ONLY) + && (ap->a_command == F_WRITEBOOTSTRAP)) { return (EROFS); } if (is64bit) { user_bootstrapp = (user_fbootstraptransfer_t *)ap->a_data; } else { - user32_fbootstraptransfer_t *bootstrapp = (user32_fbootstraptransfer_t *)ap->a_data; + user32_fbootstraptransfer_t *bootstrapp = (user32_fbootstraptransfer_t *)ap->a_data; user_bootstrapp = &user_bootstrap; user_bootstrap.fbt_offset = bootstrapp->fbt_offset; user_bootstrap.fbt_length = bootstrapp->fbt_length; user_bootstrap.fbt_buffer = CAST_USER_ADDR_T(bootstrapp->fbt_buffer); } - if ((user_bootstrapp->fbt_offset < 0) || (user_bootstrapp->fbt_offset > 1024) || + if ((user_bootstrapp->fbt_offset < 0) || (user_bootstrapp->fbt_offset > 1024) || (user_bootstrapp->fbt_length > 1024)) { return EINVAL; } @@ -1956,7 +2007,7 @@ fail_change_next_allocation: if (user_bootstrapp->fbt_offset + user_bootstrapp->fbt_length > 1024) return EINVAL; - devvp = VTOHFS(vp)->hfs_devvp; + devvp = VTOHFS(vp)->hfs_devvp; auio = uio_create(1, user_bootstrapp->fbt_offset, is64bit ? UIO_USERSPACE64 : UIO_USERSPACE32, (ap->a_command == F_WRITEBOOTSTRAP) ? UIO_WRITE : UIO_READ); @@ -2116,6 +2167,21 @@ fail_change_next_allocation: break; } + case HFS_DISABLE_METAZONE: { + /* Only root can disable metadata zone */ + if (!is_suser()) { + return EACCES; + } + if (hfsmp->hfs_flags & HFS_READ_ONLY) { + return (EROFS); + } + + /* Disable metadata zone now */ + (void) hfs_metadatazone_init(hfsmp, true); + printf ("hfs: Disabling metadata zone on %s\n", hfsmp->vcbVN); + break; + } + default: return (ENOTTY); } @@ -2541,8 +2607,32 @@ hfs_vnop_strategy(struct vnop_strategy_args *ap) { buf_t bp = ap->a_bp; vnode_t vp = buf_vnode(bp); + int error = 0; + +#if CONFIG_PROTECT + cnode_t *cp = NULL; + + if ((cp = cp_get_protected_cnode(vp)) != NULL) { + /* + * Some paths to hfs_vnop_strategy will take the cnode lock, + * and some won't. But since content protection is only enabled + * for files that (a) aren't system files and (b) are regular + * files, any valid cnode here will be unlocked. 
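/*
 * Illustrative aside (not part of this patch): in the hfs_vnop_strategy code
 * that follows, the shared cnode lock is taken around buf_strategy(),
 * apparently so that the cp_cpentry attached to the buffer via
 * buf_setcpaddr() cannot change underneath the I/O; the lock is dropped in
 * the CONFIG_PROTECT block as soon as buf_strategy() returns.
 */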
+ */ + hfs_lock(cp, HFS_SHARED_LOCK); + buf_setcpaddr(bp, cp->c_cpentry); + } +#endif /* CONFIG_PROTECT */ + + error = buf_strategy(VTOHFS(vp)->hfs_devvp, ap); - return (buf_strategy(VTOHFS(vp)->hfs_devvp, ap)); +#if CONFIG_PROTECT + if (cp) { + hfs_unlock(cp); + } +#endif + + return error; } static int @@ -2556,7 +2646,7 @@ hfs_minorupdate(struct vnode *vp) { return 0; } -static int +int do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_context_t context) { register struct cnode *cp = VTOC(vp); @@ -2801,8 +2891,8 @@ do_hfs_truncate(struct vnode *vp, off_t length, int flags, int skipupdate, vfs_c lockflags |= SFL_EXTENTS; lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); - retval = MacToVFSError(TruncateFileC(VTOVCB(vp), - (FCB*)fp, length, false)); + retval = MacToVFSError(TruncateFileC(VTOVCB(vp), (FCB*)fp, length, 0, + FORK_IS_RSRC (fp), FTOC(fp)->c_fileid, false)); hfs_systemfile_unlock(hfsmp, lockflags); } @@ -2860,13 +2950,201 @@ Err_Exit: return (retval); } +/* + * Preparation which must be done prior to deleting the catalog record + * of a file or directory. In order to keep the on-disk state as safe as possible, + * we remove the catalog entry before releasing the bitmap blocks and the + * overflow extent records. However, some work must be done prior to deleting + * the catalog record. + * + * When calling this function, the cnode must exist both in memory and on-disk. + * If there are both resource fork and data fork vnodes, this function should + * be called on both. + */ + +int +hfs_prepare_release_storage (struct hfsmount *hfsmp, struct vnode *vp) { + + struct filefork *fp = VTOF(vp); + struct cnode *cp = VTOC(vp); + int retval = 0; + + /* Cannot truncate an HFS directory! */ + if (vnode_isdir(vp)) { + return (EISDIR); + } + + /* + * See the comment below in hfs_truncate for why we need to call + * setsize here. Essentially we want to avoid pending IO if we + * already know that the blocks are going to be released here. + * This function is only called when totally removing all storage for a file, so + * we can take a shortcut and immediately setsize (0); + */ + ubc_setsize(vp, 0); + + /* This should only happen with a corrupt filesystem */ + if ((off_t)fp->ff_size < 0) + return (EINVAL); + + /* + * We cannot just check if fp->ff_size == length (as an optimization) + * since there may be extra physical blocks that also need truncation. + */ +#if QUOTA + if ((retval = hfs_getinoquota(cp))) { + return(retval); + } +#endif /* QUOTA */ + + /* Wipe out any invalid ranges which have yet to be backed by disk */ + rl_remove(0, fp->ff_size - 1, &fp->ff_invalidranges); + + /* + * Account for any unmapped blocks. Since we're deleting the + * entire file, we don't have to worry about just shrinking + * to a smaller number of borrowed blocks. + */ + if (fp->ff_unallocblocks > 0) { + u_int32_t loanedBlocks; + + HFS_MOUNT_LOCK(hfsmp, TRUE); + + loanedBlocks = fp->ff_unallocblocks; + cp->c_blocks -= loanedBlocks; + fp->ff_blocks -= loanedBlocks; + fp->ff_unallocblocks = 0; + + hfsmp->loanedBlocks -= loanedBlocks; + + HFS_MOUNT_UNLOCK(hfsmp, TRUE); + } + + return 0; +} + + +/* + * Special wrapper around calling TruncateFileC. This function is usable + * even when the catalog record does not exist any longer, making it ideal + * for use when deleting a file. The simplification here is that we know + * that we are releasing all blocks.
+ * + * The caller is responsible for saving off a copy of the filefork(s) + * embedded within the cnode prior to calling this function. The pointers + * supplied as arguments must be valid even if the cnode is no longer valid. + */ + +int +hfs_release_storage (struct hfsmount *hfsmp, struct filefork *datafork, + struct filefork *rsrcfork, u_int32_t fileid) { + + off_t filebytes; + u_int32_t fileblocks; + int blksize = 0; + int error = 0; + int lockflags; + + blksize = hfsmp->blockSize; + + /* Data Fork */ + if (datafork->ff_blocks > 0) { + fileblocks = datafork->ff_blocks; + filebytes = (off_t)fileblocks * (off_t)blksize; + + /* We killed invalid ranges and loaned blocks before we removed the catalog entry */ + + while (filebytes > 0) { + if (filebytes > HFS_BIGFILE_SIZE && overflow_extents(datafork)) { + filebytes -= HFS_BIGFILE_SIZE; + } else { + filebytes = 0; + } + + /* Start a transaction, and wipe out as many blocks as we can in this iteration */ + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + break; + } + + if (datafork->ff_unallocblocks == 0) { + /* Protect extents b-tree and allocation bitmap */ + lockflags = SFL_BITMAP; + if (overflow_extents(datafork)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + + error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), datafork, filebytes, 1, 0, fileid, false)); + + hfs_systemfile_unlock(hfsmp, lockflags); + } + if (error == 0) { + datafork->ff_size = filebytes; + } + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + + /* Finish the transaction and start over if necessary */ + hfs_end_transaction(hfsmp); + + if (error) { + break; + } + } + } + + /* Resource fork */ + if (error == 0 && (rsrcfork != NULL) && rsrcfork->ff_blocks > 0) { + fileblocks = rsrcfork->ff_blocks; + filebytes = (off_t)fileblocks * (off_t)blksize; + + /* We killed invalid ranges and loaned blocks before we removed the catalog entry */ + + while (filebytes > 0) { + if (filebytes > HFS_BIGFILE_SIZE && overflow_extents(rsrcfork)) { + filebytes -= HFS_BIGFILE_SIZE; + } else { + filebytes = 0; + } + + /* Start a transaction, and wipe out as many blocks as we can in this iteration */ + if (hfs_start_transaction(hfsmp) != 0) { + error = EINVAL; + break; + } + + if (rsrcfork->ff_unallocblocks == 0) { + /* Protect extents b-tree and allocation bitmap */ + lockflags = SFL_BITMAP; + if (overflow_extents(rsrcfork)) + lockflags |= SFL_EXTENTS; + lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); + + error = MacToVFSError(TruncateFileC(HFSTOVCB(hfsmp), rsrcfork, filebytes, 1, 1, fileid, false)); + + hfs_systemfile_unlock(hfsmp, lockflags); + } + if (error == 0) { + rsrcfork->ff_size = filebytes; + } + (void) hfs_volupdate(hfsmp, VOL_UPDATE, 0); + + /* Finish the transaction and start over if necessary */ + hfs_end_transaction(hfsmp); + + if (error) { + break; + } + } + } + + return error; +} /* * Truncate a cnode to at most length size, freeing (or adding) the * disk blocks. 
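/*
 * Illustrative sketch (not part of this patch): the chunking strategy used by
 * hfs_release_storage above, modeled in user space. Each iteration frees at
 * most HFS_BIGFILE_SIZE bytes inside its own transaction (the real loop only
 * bothers chunking when the fork has overflow extents), so a single journal
 * transaction never has to absorb an unbounded truncate. The type and
 * function names here are assumptions for illustration.
 */
#include <stdint.h>

typedef int (*release_step_fn)(uint64_t new_size); /* one journaled truncate step */

static int
release_in_chunks(uint64_t filebytes, uint64_t bigfile_size, release_step_fn step)
{
	int error = 0;

	while (filebytes > 0) {
		/* Peel off one chunk, or finish the fork entirely. */
		if (filebytes > bigfile_size)
			filebytes -= bigfile_size;
		else
			filebytes = 0;

		/* Stands in for start_transaction / TruncateFileC /
		 * end_transaction in the code above. */
		if ((error = step(filebytes)) != 0)
			break;
	}
	return error;
}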
*/ -__private_extern__ int hfs_truncate(struct vnode *vp, off_t length, int flags, int skipsetsize, int skipupdate, vfs_context_t context) @@ -2980,6 +3258,7 @@ hfs_vnop_allocate(struct vnop_allocate_args /* { struct hfsmount *hfsmp; kauth_cred_t cred = vfs_context_ucred(ap->a_context); int lockflags; + time_t orig_ctime; *(ap->a_bytesallocated) = 0; @@ -2990,7 +3269,11 @@ hfs_vnop_allocate(struct vnop_allocate_args /* { cp = VTOC(vp); - hfs_lock_truncate(cp, TRUE); + orig_ctime = VTOC(vp)->c_ctime; + + check_for_tracked_file(vp, orig_ctime, ap->a_length == 0 ? NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL); + + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK); if ((retval = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) { goto Err_Exit; @@ -3181,7 +3464,7 @@ Std_Exit: if (retval == 0) retval = retval2; Err_Exit: - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); hfs_unlock(cp); return (retval); } @@ -3204,74 +3487,298 @@ hfs_vnop_pagein(struct vnop_pagein_args *ap) }; */ { - vnode_t vp = ap->a_vp; - int error; + vnode_t vp; + struct cnode *cp; + struct filefork *fp; + int error = 0; + upl_t upl; + upl_page_info_t *pl; + off_t f_offset; + int offset; + int isize; + int pg_index; + boolean_t truncate_lock_held = FALSE; + boolean_t file_converted = FALSE; + kern_return_t kret; + + vp = ap->a_vp; + cp = VTOC(vp); + fp = VTOF(vp); + +#if CONFIG_PROTECT + if ((error = cp_handle_vnop(cp, CP_READ_ACCESS | CP_WRITE_ACCESS)) != 0) { + return error; + } +#endif /* CONFIG_PROTECT */ + + if (ap->a_pl != NULL) { + /* + * this can only happen for swap files now that + * we're asking for V2 paging behavior... + * so don't need to worry about decompression, or + * keeping track of blocks read or taking the truncate lock + */ + error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, + ap->a_size, (off_t)fp->ff_size, ap->a_flags); + goto pagein_done; + } + +retry_pagein: + /* + * take truncate lock (shared/recursive) to guard against + * zero-fill thru fsync interfering, but only for v2 + * + * the HFS_RECURSE_TRUNCLOCK arg indicates that we want the + * lock shared and we are allowed to recurse 1 level if this thread already + * owns the lock exclusively... this can legally occur + * if we are doing a shrinking ftruncate against a file + * that is mapped private, and the pages being truncated + * do not currently exist in the cache... in that case + * we will have to page-in the missing pages in order + * to provide them to the private mapping... we must + * also call hfs_unlock_truncate with a positive been_recursed + * arg to indicate that if we have recursed, there is no need to drop + * the lock. Allowing this simple recursion is necessary + * in order to avoid a certain deadlock... since the ftruncate + * already holds the truncate lock exclusively, if we try + * to acquire it shared to protect the pagein path, we will + * hang this thread + * + * NOTE: The if () block below is a workaround in order to prevent a + * VM deadlock. See rdar://7853471. + * + * If we are in a forced unmount, then launchd will still have the + * dyld_shared_cache file mapped as it is trying to reboot. If we + * take the truncate lock here to service a page fault, then our + * thread could deadlock with the forced-unmount. The forced unmount + * thread will try to reclaim the dyld_shared_cache vnode, but since it's + * marked C_DELETED, it will call ubc_setsize(0).
As a result, the unmount + * thread will think it needs to copy all of the data out of the file + * and into a VM copy object. If we hold the cnode lock here, then that + * VM operation will not be able to proceed, because we'll set a busy page + * before attempting to grab the lock. Note that this isn't as simple as "don't + * call ubc_setsize" because doing that would just shift the problem to the + * ubc_msync done before the vnode is reclaimed. + * + * So, if a forced unmount on this volume is in flight AND the cnode is + * marked C_DELETED, then just go ahead and do the page in without taking + * the lock (thus suspending pagein_v2 semantics temporarily). Since it's on a file + * that is not going to be available on the next mount, this seems like an + * OK solution from a correctness point of view, even though it is hacky. + */ + if (vfs_isforce(vp->v_mount)) { + if (cp->c_flag & C_DELETED) { + /* If we don't get it, then just go ahead and operate without the lock */ + truncate_lock_held = hfs_try_trunclock(cp, HFS_RECURSE_TRUNCLOCK); + } + } + else { + hfs_lock_truncate(cp, HFS_RECURSE_TRUNCLOCK); + truncate_lock_held = TRUE; + } + + kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, UPL_UBC_PAGEIN | UPL_RET_ONLY_ABSENT); + + if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) { + error = EINVAL; + goto pagein_done; + } + isize = ap->a_size; + + /* + * Scan from the back to find the last page in the UPL, so that we + * aren't looking at a UPL that may have already been freed by the + * preceding aborts/completions. + */ + for (pg_index = ((isize) / PAGE_SIZE); pg_index > 0;) { + if (upl_page_present(pl, --pg_index)) + break; + if (pg_index == 0) { + /* + * no absent pages were found in the range specified, so + * just abort the UPL to get rid of it and then we're done + */ + ubc_upl_abort_range(upl, 0, isize, UPL_ABORT_FREE_ON_EMPTY); + goto pagein_done; + } + } + /* + * initialize the offset variables before we touch the UPL. + * f_offset is the position into the file, in bytes + * offset is the position into the UPL, in bytes + * pg_index is the pg# of the UPL we're operating on + * isize is the offset into the UPL of the last page that is present. + */ + isize = ((pg_index + 1) * PAGE_SIZE); + pg_index = 0; + offset = 0; + f_offset = ap->a_f_offset; + + while (isize) { + int xsize; + int num_of_pages; + + if ( !upl_page_present(pl, pg_index)) { + /* + * we asked for RET_ONLY_ABSENT, so it's possible + * to get back empty slots in the UPL. + * just skip over them + */ + f_offset += PAGE_SIZE; + offset += PAGE_SIZE; + isize -= PAGE_SIZE; + pg_index++; + + continue; + } + /* + * We know that we have at least one absent page. + * Now checking to see how many we have in a row. + */
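/*
 * Illustrative sketch (not part of this patch): the run-length scan the loop
 * below performs over the UPL, modeled over a plain array. pl[i] != 0 plays
 * the role of upl_page_present(); with UPL_RET_ONLY_ABSENT, a populated slot
 * means the page was absent from the cache and must be paged in. Names are
 * assumptions for illustration.
 */
static int
count_contiguous_pagein_run(const int *pl, int pg_index, int pages_left)
{
	/* The caller has already verified that pl[pg_index] is populated. */
	int num_of_pages = 1;

	while (num_of_pages < pages_left && pl[pg_index + num_of_pages])
		num_of_pages++;

	return num_of_pages;
}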
+ num_of_pages = 1; + xsize = isize - PAGE_SIZE; + + while (xsize) { + if ( !upl_page_present(pl, pg_index + num_of_pages)) + break; + num_of_pages++; + xsize -= PAGE_SIZE; + } + xsize = num_of_pages * PAGE_SIZE; #if HFS_COMPRESSION - if (VNODE_IS_RSRC(vp)) { - /* allow pageins of the resource fork */ - } else { - int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */ - if (compressed) { - error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp)); + if (VNODE_IS_RSRC(vp)) { + /* allow pageins of the resource fork */ + } else { + int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */ + if (compressed) { - if (error == 0) { - /* successful page-in, update the access time */ - VTOC(vp)->c_touch_acctime = TRUE; + if (truncate_lock_held) { + /* + * can't hold the truncate lock when calling into the decmpfs layer + * since it calls back into this layer... even though we're only + * holding the lock in shared mode, and the re-entrant path only + * takes the lock shared, we can deadlock if some other thread + * tries to grab the lock exclusively in between. + */ + hfs_unlock_truncate(cp, 1); + truncate_lock_held = FALSE; + } + ap->a_pl = upl; + ap->a_pl_offset = offset; + ap->a_f_offset = f_offset; + ap->a_size = xsize; + + error = decmpfs_pagein_compressed(ap, &compressed, VTOCMP(vp)); + /* + * note that decmpfs_pagein_compressed can change the state of + * 'compressed'... it will set it to 0 if the file is no longer + * compressed once the compression lock is successfully taken + * i.e. we would block on that lock while the file is being inflated + */ + if (compressed) { + if (error == 0) { + /* successful page-in, update the access time */ + VTOC(vp)->c_touch_acctime = TRUE; - /* compressed files are not hot file candidates */ - if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) { - VTOF(vp)->ff_bytesread = 0; + /* compressed files are not hot file candidates */ + if (VTOHFS(vp)->hfc_stage == HFC_RECORDING) { + fp->ff_bytesread = 0; + } + } else if (error == EAGAIN) { + /* + * EAGAIN indicates someone else already holds the compression lock... + * to avoid deadlocking, we'll abort this range of pages with an + * indication that the pagein needs to be redriven + */ + ubc_upl_abort_range(upl, (upl_offset_t) offset, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART); } + goto pagein_next_range; + } + else { + /* + * Set file_converted only if the file became decompressed while we were + * paging in. If it were still compressed, we would re-start the loop using the goto + * in the above block. This avoids overloading truncate_lock_held as our retry_pagein + * condition below, since we could have avoided taking the truncate lock to prevent + * a deadlock in the force unmount case. + */ + file_converted = TRUE; } - return error; } - /* otherwise the file was converted back to a regular file while we were reading it */ + if (file_converted == TRUE) { + /* + * the file was converted back to a regular file after we first saw it as compressed. + * We need to abort the upl, retake the truncate lock, recreate the UPL and start over. + * Reset a_size so that we consider what remains of the original request + * and null out a_upl and a_pl_offset. + * + * We should only be able to get into this block if decmpfs_pagein_compressed + * successfully decompressed the range in question for this file.
+ */ + ubc_upl_abort_range(upl, (upl_offset_t) offset, isize, UPL_ABORT_FREE_ON_EMPTY); + + ap->a_size = isize; + ap->a_pl = NULL; + ap->a_pl_offset = 0; + + /* Reset file_converted back to false so that we don't infinite-loop. */ + file_converted = FALSE; + goto retry_pagein; + } } - } #endif + error = cluster_pagein(vp, upl, offset, f_offset, xsize, (off_t)fp->ff_size, ap->a_flags); - error = cluster_pagein(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, - ap->a_size, (off_t)VTOF(vp)->ff_size, ap->a_flags); - /* - * Keep track of blocks read. - */ - if (!vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) { - struct cnode *cp; - struct filefork *fp; - int bytesread; - int took_cnode_lock = 0; + /* + * Keep track of blocks read. + */ + if ( !vnode_isswap(vp) && VTOHFS(vp)->hfc_stage == HFC_RECORDING && error == 0) { + int bytesread; + int took_cnode_lock = 0; - cp = VTOC(vp); - fp = VTOF(vp); + if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE) + bytesread = fp->ff_size; + else + bytesread = xsize; - if (ap->a_f_offset == 0 && fp->ff_size < PAGE_SIZE) - bytesread = fp->ff_size; - else - bytesread = ap->a_size; + /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */ + if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) { + hfs_lock(cp, HFS_FORCE_LOCK); + took_cnode_lock = 1; + } + /* + * If this file hasn't been seen since the start of + * the current sampling period then start over. + */ + if (cp->c_atime < VTOHFS(vp)->hfc_timebase) { + struct timeval tv; - /* When ff_bytesread exceeds 32-bits, update it behind the cnode lock. */ - if ((fp->ff_bytesread + bytesread) > 0x00000000ffffffff && cp->c_lockowner != current_thread()) { - hfs_lock(cp, HFS_FORCE_LOCK); - took_cnode_lock = 1; + fp->ff_bytesread = bytesread; + microtime(&tv); + cp->c_atime = tv.tv_sec; + } else { + fp->ff_bytesread += bytesread; + } + cp->c_touch_acctime = TRUE; + if (took_cnode_lock) + hfs_unlock(cp); } - /* - * If this file hasn't been seen since the start of - * the current sampling period then start over. - */ - if (cp->c_atime < VTOHFS(vp)->hfc_timebase) { - struct timeval tv; +pagein_next_range: + f_offset += xsize; + offset += xsize; + isize -= xsize; + pg_index += num_of_pages; - fp->ff_bytesread = bytesread; - microtime(&tv); - cp->c_atime = tv.tv_sec; - } else { - fp->ff_bytesread += bytesread; - } - cp->c_touch_acctime = TRUE; - if (took_cnode_lock) - hfs_unlock(cp); + error = 0; } + +pagein_done: + if (truncate_lock_held == TRUE) { + /* Note 1 is passed to hfs_unlock_truncate in been_recursed argument */ + hfs_unlock_truncate(cp, 1); + } + return (error); } @@ -3338,7 +3845,7 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap) * take truncate lock (shared) to guard against * zero-fill thru fsync interfering, but only for v2 */ - hfs_lock_truncate(cp, 0); + hfs_lock_truncate(cp, HFS_SHARED_LOCK); if (a_flags & UPL_MSYNC) { request_flags = UPL_UBC_MSYNC | UPL_RET_ONLY_DIRTY; @@ -3346,6 +3853,7 @@ hfs_vnop_pageout(struct vnop_pageout_args *ap) else { request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY; } + kret = ubc_create_upl(vp, ap->a_f_offset, ap->a_size, &upl, &pl, request_flags); if ((kret != KERN_SUCCESS) || (upl == (upl_t) NULL)) { @@ -3649,7 +4157,6 @@ hfs_vnop_bwrite(struct vnop_bwrite_args *ap) * * During step 3 page-ins to the file get suspended. 
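/*
 * Illustrative sketch (not part of this patch): the hot-file read accounting
 * performed in the pagein hunk above, modeled in user space. A read that is
 * the first one since the current sampling period began restarts the byte
 * counter; later reads accumulate. Names are assumptions for illustration.
 */
#include <stdint.h>
#include <time.h>

struct hotfile_stats {
	uint64_t bytesread;	/* models fp->ff_bytesread */
	time_t   atime;		/* models cp->c_atime */
};

static void
account_pagein(struct hotfile_stats *hs, uint64_t bytesread, time_t hfc_timebase)
{
	if (hs->atime < hfc_timebase) {
		/* file not seen since the sampling period started: start over */
		hs->bytesread = bytesread;
		hs->atime = time(NULL);
	} else {
		hs->bytesread += bytesread;
	}
}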
*/ -__private_extern__ int hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, struct proc *p) @@ -3685,6 +4192,22 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, fp = VTOF(vp); if (fp->ff_unallocblocks) return (EINVAL); + +#if CONFIG_PROTECT + /* + * <rdar://problem/9118426> + * Disable HFS file relocation on content-protected filesystems + */ + if (cp_fs_protected (hfsmp->hfs_mp)) { + return EINVAL; + } +#endif + + /* If it's an SSD, also disable HFS relocation */ + if (hfsmp->hfs_flags & HFS_SSD) { + return EINVAL; + } + blksize = hfsmp->blockSize; if (blockHint == 0) blockHint = hfsmp->nextAllocation; @@ -3707,15 +4230,15 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, if (!vnode_issystem(vp) && (vnodetype != VLNK)) { hfs_unlock(cp); - hfs_lock_truncate(cp, TRUE); + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK); /* Force lock since callers expects lock to be held. */ if ((retval = hfs_lock(cp, HFS_FORCE_LOCK))) { - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); return (retval); } /* No need to continue if file was removed. */ if (cp->c_flag & C_NOEXISTS) { - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); return (ENOENT); } took_trunc_lock = 1; @@ -3730,7 +4253,7 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, if (hfs_start_transaction(hfsmp) != 0) { if (took_trunc_lock) - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); return (EINVAL); } started_tr = 1; @@ -3850,7 +4373,7 @@ hfs_relocate(struct vnode *vp, u_int32_t blockHint, kauth_cred_t cred, goto restore; out: if (took_trunc_lock) - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); if (lockflags) { hfs_systemfile_unlock(hfsmp, lockflags); @@ -3876,7 +4399,7 @@ exit: restore: if (fp->ff_blocks == headblks) { if (took_trunc_lock) - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); goto exit; } /* @@ -3889,13 +4412,14 @@ restore: lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK); } - (void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, false); + (void) TruncateFileC(hfsmp, (FCB*)fp, fp->ff_size, 0, FORK_IS_RSRC(fp), + FTOC(fp)->c_fileid, false); hfs_systemfile_unlock(hfsmp, lockflags); lockflags = 0; if (took_trunc_lock) - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); goto exit; } @@ -3954,10 +4478,19 @@ hfs_clonefile(struct vnode *vp, int blkstart, int blkcnt, int blksize) iosize = bufsize = MIN(copysize, 128 * 1024); offset = 0; + hfs_unlock(VTOC(vp)); + +#if CONFIG_PROTECT + if ((error = cp_handle_vnop(VTOC(vp), CP_WRITE_ACCESS)) != 0) { + hfs_lock(VTOC(vp), HFS_FORCE_LOCK); + return (error); + } +#endif /* CONFIG_PROTECT */ + if (kmem_alloc(kernel_map, (vm_offset_t *)&bufp, bufsize)) { + hfs_lock(VTOC(vp), HFS_FORCE_LOCK); return (ENOMEM); - } - hfs_unlock(VTOC(vp)); + } auio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ); diff --git a/bsd/hfs/hfs_search.c b/bsd/hfs/hfs_search.c index 6a8a8b74f..878c70dc5 100644 --- a/bsd/hfs/hfs_search.c +++ b/bsd/hfs/hfs_search.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997-2007 Apple Inc. All rights reserved. + * Copyright (c) 1997-2010 Apple Inc. All rights reserved. 
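/*
 * Illustrative sketch (not part of this patch): the locking discipline in the
 * hfs_clonefile hunk above, modeled with pthreads. The cnode lock is dropped
 * before a blocking allocation, and the error path must re-take it because
 * callers expect the lock held when the function fails; on success the code
 * above likewise continues without the lock. All names here are assumptions.
 */
#include <errno.h>
#include <pthread.h>
#include <stdlib.h>

static int
alloc_with_lock_dropped(pthread_mutex_t *cnode_lock, void **bufp, size_t size)
{
	pthread_mutex_unlock(cnode_lock);	/* drop before the blocking call */

	*bufp = malloc(size);
	if (*bufp == NULL) {
		/* error return: restore the caller's locked invariant */
		pthread_mutex_lock(cnode_lock);
		return ENOMEM;
	}
	return 0;	/* success: lock stays dropped, as in the hunk above */
}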
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -48,6 +48,7 @@ #include <sys/utfconv.h> #include <sys/kauth.h> #include <sys/vnode_internal.h> +#include <sys/mount_internal.h> #if CONFIG_MACF #include <security/mac_framework.h> @@ -154,7 +155,6 @@ vnop_searchfs { }; */ -__private_extern__ int hfs_vnop_search(ap) struct vnop_searchfs_args *ap; /* @@ -186,7 +186,6 @@ hfs_vnop_search(ap) struct proc *p = current_proc(); int err = E_NONE; int isHFSPlus; - int timerExpired = false; CatalogKey * myCurrentKeyPtr; CatalogRecord * myCurrentDataPtr; CatPosition * myCatPositionPtr; @@ -195,6 +194,9 @@ hfs_vnop_search(ap) user_size_t user_len = 0; int32_t searchTime; int lockflags; + struct uthread *ut; + boolean_t timerExpired = FALSE; + boolean_t needThrottle = FALSE; /* XXX Parameter check a_searchattrs? */ @@ -307,7 +309,7 @@ hfs_vnop_search(ap) (void) hfs_fsync(vcb->catalogRefNum, MNT_WAIT, 0, p); if (hfsmp->jnl) { hfs_systemfile_unlock(hfsmp, lockflags); - hfs_journal_flush(hfsmp); + hfs_journal_flush(hfsmp, FALSE); lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); } @@ -336,6 +338,8 @@ hfs_vnop_search(ap) if (err) goto ExitThisRoutine; + if (throttle_get_io_policy(&ut) == IOPOL_THROTTLE) + needThrottle = TRUE; /* * Check all the catalog btree records... * return the attributes for matching items @@ -373,18 +377,24 @@ hfs_vnop_search(ap) if (*(ap->a_nummatches) >= ap->a_maxmatches) break; } - - /* - * Check our elapsed time and bail if we've hit the max. - * The idea here is to throttle the amount of time we - * spend in the kernel. - */ - microuptime(&myCurrentTime); - timersub(&myCurrentTime, &myBTScanState.startTime, &myElapsedTime); - /* Note: assumes kMaxMicroSecsInKernel is less than 1,000,000 */ - if (myElapsedTime.tv_sec > 0 - || myElapsedTime.tv_usec >= searchTime) { - timerExpired = true; + if (timerExpired == FALSE) { + /* + * Check our elapsed time and bail if we've hit the max. + * The idea here is to throttle the amount of time we + * spend in the kernel. 
+ */ + microuptime(&myCurrentTime); + timersub(&myCurrentTime, &myBTScanState.startTime, &myElapsedTime); + /* + * Note: assumes kMaxMicroSecsInKernel is less than 1,000,000 + */ + if (myElapsedTime.tv_sec > 0 + || myElapsedTime.tv_usec >= searchTime) { + timerExpired = TRUE; + } else if (needThrottle == TRUE) { + if (throttle_io_will_be_throttled(ut->uu_lowpri_window, HFSTOVFS(hfsmp))) + timerExpired = TRUE; + } } } @@ -436,12 +446,12 @@ ResolveHardlink(struct hfsmount *hfsmp, HFSPlusCatalogFile *recp) filecreatedate = to_bsd_time(recp->createDate); if ((type == kHardLinkFileType && creator == kHFSPlusCreator) && - (filecreatedate == (time_t)hfsmp->vcbCrDate || + (filecreatedate == (time_t)hfsmp->hfs_itime || filecreatedate == (time_t)hfsmp->hfs_metadata_createdate)) { isfilelink = 1; } else if ((type == kHFSAliasType && creator == kHFSAliasCreator) && (recp->flags & kHFSHasLinkChainMask) && - (filecreatedate == (time_t)hfsmp->vcbCrDate || + (filecreatedate == (time_t)hfsmp->hfs_itime || filecreatedate == (time_t)hfsmp->hfs_metadata_createdate)) { isdirlink = 1; } @@ -556,7 +566,7 @@ CheckAccess(ExtendedVCB *theVCBPtr, u_long searchBits, CatalogKey *theKeyPtr, st cnode_t * cp; /* now go get catalog data for this directory */ - myErr = hfs_vget(hfsmp, myNodeID, &vp, 0); + myErr = hfs_vget(hfsmp, myNodeID, &vp, 0, 0); if ( myErr ) { goto ExitThisRoutine; /* no access */ } diff --git a/bsd/hfs/hfs_vfsops.c b/bsd/hfs/hfs_vfsops.c index 7a049916f..1a1ca2aa4 100644 --- a/bsd/hfs/hfs_vfsops.c +++ b/bsd/hfs/hfs_vfsops.c @@ -87,6 +87,7 @@ #include <sys/utfconv.h> #include <sys/kdebug.h> #include <sys/fslog.h> +#include <sys/ubc.h> #include <kern/locks.h> @@ -109,6 +110,16 @@ #include "hfscommon/headers/FileMgrInternal.h" #include "hfscommon/headers/BTreesInternal.h" +#if CONFIG_PROTECT +#include <sys/cprotect.h> +#endif + +#if CONFIG_HFS_ALLOC_RBTREE +#include "hfscommon/headers/HybridAllocator.h" +#endif + +#define HFS_MOUNT_DEBUG 1 + #if HFS_DIAGNOSTIC int hfs_dbg_all = 0; int hfs_dbg_err = 0; @@ -121,6 +132,7 @@ lck_grp_attr_t * hfs_group_attr; lck_attr_t * hfs_lock_attr; lck_grp_t * hfs_mutex_group; lck_grp_t * hfs_rwlock_group; +lck_grp_t * hfs_spinlock_group; extern struct vnodeopv_desc hfs_vnodeop_opv_desc; extern struct vnodeopv_desc hfs_std_vnodeop_opv_desc; @@ -134,29 +146,30 @@ static int hfs_flushfiles(struct mount *, int, struct proc *); static int hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush); static int hfs_getmountpoint(struct vnode *vp, struct hfsmount **hfsmpp); static int hfs_init(struct vfsconf *vfsp); -static int hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t context); -static int hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, int journal_replay_only, vfs_context_t context); -static int hfs_reload(struct mount *mp); static int hfs_vfs_root(struct mount *mp, struct vnode **vpp, vfs_context_t context); static int hfs_quotactl(struct mount *, int, uid_t, caddr_t, vfs_context_t context); static int hfs_start(struct mount *mp, int flags, vfs_context_t context); -static int hfs_statfs(struct mount *mp, register struct vfsstatfs *sbp, vfs_context_t context); -static int hfs_sync(struct mount *mp, int waitfor, vfs_context_t context); -static int hfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, - user_addr_t newp, size_t newlen, vfs_context_t context); -static int hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context); static int hfs_vptofh(struct vnode *vp, int *fhlenp, 
unsigned char *fhp, vfs_context_t context); - -static int hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t reclaimblks, vfs_context_t context); -static int hfs_overlapped_overflow_extents(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t fileID); +static int hfs_file_extent_overlaps(struct hfsmount *hfsmp, u_int32_t allocLimit, struct HFSPlusCatalogFile *filerec); static int hfs_journal_replay(vnode_t devvp, vfs_context_t context); +static int hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t allocLimit, u_int32_t reclaimblks, vfs_context_t context); +void hfs_initialize_allocator (struct hfsmount *hfsmp); +int hfs_teardown_allocator (struct hfsmount *hfsmp); + +int hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t context); +int hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, int journal_replay_only, vfs_context_t context); +int hfs_reload(struct mount *mp); +int hfs_statfs(struct mount *mp, register struct vfsstatfs *sbp, vfs_context_t context); +int hfs_sync(struct mount *mp, int waitfor, vfs_context_t context); +int hfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, + user_addr_t newp, size_t newlen, vfs_context_t context); +int hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context); /* * Called by vfs_mountroot when mounting HFS Plus as root. */ -__private_extern__ int hfs_mountroot(mount_t mp, vnode_t rvp, vfs_context_t context) { @@ -165,8 +178,13 @@ hfs_mountroot(mount_t mp, vnode_t rvp, vfs_context_t context) struct vfsstatfs *vfsp; int error; - if ((error = hfs_mountfs(rvp, mp, NULL, 0, context))) + if ((error = hfs_mountfs(rvp, mp, NULL, 0, context))) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountroot: hfs_mountfs returned %d, rvp (%p) name (%s) \n", + error, rvp, (rvp->v_name ? rvp->v_name : "unknown device")); + } return (error); + } /* Init hfsmp */ hfsmp = VFSTOHFS(mp); @@ -194,7 +212,7 @@ hfs_mountroot(mount_t mp, vnode_t rvp, vfs_context_t context) * mount system call */ -static int +int hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t context) { struct proc *p = vfs_context_proc(context); @@ -204,6 +222,9 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte u_int32_t cmdflags; if ((retval = copyin(data, (caddr_t)&args, sizeof(args)))) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: copyin returned %d for fs\n", retval); + } return (retval); } cmdflags = (u_int32_t)vfs_flags(mp) & MNT_CMDFLAGS; @@ -212,10 +233,19 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte /* Reload incore data after an fsck. */ if (cmdflags & MNT_RELOAD) { - if (vfs_isrdonly(mp)) - return hfs_reload(mp); - else + if (vfs_isrdonly(mp)) { + int error = hfs_reload(mp); + if (error && HFS_MOUNT_DEBUG) { + printf("hfs_mount: hfs_reload returned %d on %s \n", error, hfsmp->vcbVN); + } + return error; + } + else { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: MNT_RELOAD not supported on rdwr filesystem %s\n", hfsmp->vcbVN); + } return (EINVAL); + } } /* Change to a read-only file system. */ @@ -227,16 +257,19 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte * is in progress and therefore block any further * modifications to the file system. 
*/ - hfs_global_exclusive_lock_acquire(hfsmp); + hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); hfsmp->hfs_flags |= HFS_RDONLY_DOWNGRADE; hfsmp->hfs_downgrading_proc = current_thread(); - hfs_global_exclusive_lock_release(hfsmp); + hfs_unlock_global (hfsmp); /* use VFS_SYNC to push out System (btree) files */ retval = VFS_SYNC(mp, MNT_WAIT, context); if (retval && ((cmdflags & MNT_FORCE) == 0)) { hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE; hfsmp->hfs_downgrading_proc = NULL; + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: VFS_SYNC returned %d during b-tree sync of %s \n", retval, hfsmp->vcbVN); + } goto out; } @@ -247,6 +280,9 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte if ((retval = hfs_flushfiles(mp, flags, p))) { hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE; hfsmp->hfs_downgrading_proc = NULL; + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: hfs_flushfiles returned %d on %s \n", retval, hfsmp->vcbVN); + } goto out; } @@ -266,13 +302,16 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte } } if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: FSYNC on devvp returned %d for fs %s\n", retval, hfsmp->vcbVN); + } hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE; hfsmp->hfs_downgrading_proc = NULL; hfsmp->hfs_flags &= ~HFS_READ_ONLY; goto out; } if (hfsmp->jnl) { - hfs_global_exclusive_lock_acquire(hfsmp); + hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); journal_close(hfsmp->jnl); hfsmp->jnl = NULL; @@ -281,14 +320,20 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte // access to the jvp because we may need // it later if we go back to being read-write. - hfs_global_exclusive_lock_release(hfsmp); + hfs_unlock_global (hfsmp); } +#if CONFIG_HFS_ALLOC_RBTREE + (void) hfs_teardown_allocator(hfsmp); +#endif hfsmp->hfs_downgrading_proc = NULL; } /* Change to a writable file system. 
*/ if (vfs_iswriteupgrade(mp)) { +#if CONFIG_HFS_ALLOC_RBTREE + thread_t allocator_thread; +#endif /* * On inconsistent disks, do not allow read-write mount @@ -296,6 +341,9 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte */ if (!(vfs_flags(mp) & MNT_ROOTFS) && (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: attempting to mount inconsistent non-root volume %s\n", (hfsmp->vcbVN)); + } retval = EINVAL; goto out; } @@ -310,39 +358,52 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte if (hfsmp->hfs_flags & HFS_NEED_JNL_RESET) { jflags = JOURNAL_RESET; - } else { + } else { jflags = 0; - } - - hfs_global_exclusive_lock_acquire(hfsmp); - - hfsmp->jnl = journal_open(hfsmp->jvp, - (hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset, - hfsmp->jnl_size, - hfsmp->hfs_devvp, - hfsmp->hfs_logical_block_size, - jflags, - 0, - hfs_sync_metadata, hfsmp->hfs_mp); - - hfs_global_exclusive_lock_release(hfsmp); - - if (hfsmp->jnl == NULL) { - retval = EINVAL; - goto out; - } else { - hfsmp->hfs_flags &= ~HFS_NEED_JNL_RESET; - } + } + + hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); + + hfsmp->jnl = journal_open(hfsmp->jvp, + (hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset, + hfsmp->jnl_size, + hfsmp->hfs_devvp, + hfsmp->hfs_logical_block_size, + jflags, + 0, + hfs_sync_metadata, hfsmp->hfs_mp); + + /* + * Set up the trim callback function so that we can add + * recently freed extents to the free extent cache once + * the transaction that freed them is written to the + * journal on disk. + */ + if (hfsmp->jnl) + journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp); + + hfs_unlock_global (hfsmp); + + if (hfsmp->jnl == NULL) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: journal_open == NULL; couldn't be opened on %s \n", (hfsmp->vcbVN)); + } + retval = EINVAL; + goto out; + } else { + hfsmp->hfs_flags &= ~HFS_NEED_JNL_RESET; + } } /* See if we need to erase unused Catalog nodes due to <rdar://problem/6947811>. */ retval = hfs_erase_unused_nodes(hfsmp); - if (retval != E_NONE) + if (retval != E_NONE) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: hfs_erase_unused_nodes returned %d for fs %s\n", retval, hfsmp->vcbVN); + } goto out; - - /* Only clear HFS_READ_ONLY after a successful write */ - hfsmp->hfs_flags &= ~HFS_READ_ONLY; + } /* If this mount point was downgraded from read-write * to read-only, clear that information as we are now @@ -355,8 +416,16 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte hfsmp->vcbAtrb &= ~kHFSVolumeUnmountedMask; retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0); - if (retval != E_NONE) + if (retval != E_NONE) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mount: hfs_flushvolumeheader returned %d for fs %s\n", retval, hfsmp->vcbVN); + } goto out; + } + + /* Only clear HFS_READ_ONLY after a successful write */ + hfsmp->hfs_flags &= ~HFS_READ_ONLY; + if (!(hfsmp->hfs_flags & (HFS_READ_ONLY | HFS_STANDARD))) { /* Setup private/hidden directories for hardlinks. */ @@ -368,8 +437,8 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte /* * Allow hot file clustering if conditions allow. 
*/ - if ((hfsmp->hfs_flags & HFS_METADATA_ZONE) && - ((hfsmp->hfs_mp->mnt_kern_flag & MNTK_SSD) == 0)) { + if ((hfsmp->hfs_flags & HFS_METADATA_ZONE) && + ((hfsmp->hfs_flags & HFS_SSD) == 0)) { (void) hfs_recording_init(hfsmp); } /* Force ACLs on HFS+ file systems. */ @@ -377,10 +446,45 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte vfs_setextendedsecurity(HFSTOVFS(hfsmp)); } } + +#if CONFIG_HFS_ALLOC_RBTREE + /* + * Like the normal mount case, we need to handle creation of the allocation red-black tree + * if we're upgrading from read-only to read-write. + * + * We spawn a thread to create the pair of red-black trees for this volume. + * However, in so doing, we must be careful to ensure that if this thread is still + * running after mount has finished, it doesn't interfere with an unmount. Specifically, + * we'll need to set a bit that indicates we're in progress building the trees here. + * Unmount will check for this bit, and then if it's set, mark a corresponding bit that + * notifies the tree generation code that an unmount is waiting. Also, mark the extent + * tree flags that the allocator is enabled for use before we spawn the thread that will start + * scanning the RB tree. + * + * Only do this if we're operating on a read-write mount (we wouldn't care for read-only), + * which has not previously encountered a bad error on the red-black tree code. Also, don't + * try to re-build a tree that already exists. + */ + + if (hfsmp->extent_tree_flags == 0) { + hfsmp->extent_tree_flags |= (HFS_ALLOC_TREEBUILD_INFLIGHT | HFS_ALLOC_RB_ENABLED); + /* Initialize EOF counter so that the thread can assume it started at initial values */ + hfsmp->offset_block_end = 0; + + InitTree(hfsmp); + + kernel_thread_start ((thread_continue_t) hfs_initialize_allocator , hfsmp, &allocator_thread); + thread_deallocate(allocator_thread); + } + +#endif } /* Update file system parameters. */ retval = hfs_changefs(mp, &args); + if (retval && HFS_MOUNT_DEBUG) { + printf("hfs_mount: hfs_changefs returned %d for %s\n", retval, hfsmp->vcbVN); + } } else /* not an update request */ { @@ -388,6 +492,44 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_DOVOLFS)); retval = hfs_mountfs(devvp, mp, &args, 0, context); + if (retval && HFS_MOUNT_DEBUG) { + printf("hfs_mount: hfs_mountfs returned %d\n", retval); + } +#if CONFIG_PROTECT + /* + * If above mount call was successful, and this mount is content protection + * enabled, then verify the on-disk EA on the root to ensure that the filesystem + * is of a suitable vintage to allow the mount to proceed. + */ + if ((retval == 0) && (cp_fs_protected (mp))) { + int err = 0; + struct cp_root_xattr xattr; + bzero (&xattr, sizeof(struct cp_root_xattr)); + hfsmp = vfs_fsprivate(mp); + + /* go get the EA to get the version information */ + err = cp_getrootxattr (hfsmp, &xattr); + /* If there was no EA there, then write one out. */ + if (err == ENOATTR) { + bzero(&xattr, sizeof(struct cp_root_xattr)); + xattr.major_version = CP_CURRENT_MAJOR_VERS; + xattr.minor_version = CP_CURRENT_MINOR_VERS; + xattr.flags = 0; + + err = cp_setrootxattr (hfsmp, &xattr); + } + /* + * For any other error, including having an out of date CP version in the + * EA, or for an error out of cp_setrootxattr, deny the mount + * and do not proceed further. + */ + if (err || xattr.major_version != CP_CURRENT_MAJOR_VERS) { + /* Deny the mount and tear down. 
*/ + retval = EPERM; + (void) hfs_unmount (mp, MNT_FORCE, context); + } + } +#endif } out: if (retval == 0) { @@ -629,7 +771,7 @@ hfs_reload_callback(struct vnode *vp, void *cargs) /* * Re-read cnode data for all active vnodes (non-metadata files). */ - if (!vnode_issystem(vp) && !VNODE_IS_RSRC(vp)) { + if (!vnode_issystem(vp) && !VNODE_IS_RSRC(vp) && (cp->c_fileid >= kHFSFirstUserCatalogNodeID)) { struct cat_fork *datafork; struct cat_desc desc; @@ -663,7 +805,7 @@ hfs_reload_callback(struct vnode *vp, void *cargs) * re-load B-tree header data. * re-read cnode data for all active vnodes. */ -static int +int hfs_reload(struct mount *mountp) { register struct vnode *devvp; @@ -877,7 +1019,7 @@ hfs_syncer(void *arg0, void *unused) } if (hfsmp->jnl) { - journal_flush(hfsmp->jnl); + journal_flush(hfsmp->jnl, FALSE); } else { hfs_sync(hfsmp->hfs_mp, MNT_WAIT, vfs_context_kernel()); } @@ -918,11 +1060,11 @@ hfs_syncer(void *arg0, void *unused) // now. Else we defer the sync and reschedule it. // if (hfsmp->jnl) { - lck_rw_lock_shared(&hfsmp->hfs_global_lock); + hfs_lock_global (hfsmp, HFS_SHARED_LOCK); - journal_flush(hfsmp->jnl); + journal_flush(hfsmp->jnl, FALSE); - lck_rw_unlock_shared(&hfsmp->hfs_global_lock); + hfs_unlock_global (hfsmp); } else { hfs_sync(hfsmp->hfs_mp, MNT_WAIT, vfs_context_kernel()); } @@ -957,10 +1099,119 @@ hfs_syncer(void *arg0, void *unused) extern int IOBSDIsMediaEjectable( const char *cdev_name ); +/* + * Initialization code for Red-Black Tree Allocator + * + * This function will build the two red-black trees necessary for allocating space + * from the metadata zone as well as normal allocations. Currently, we use + * an advisory read to get most of the data into the buffer cache. + * This function is intended to be run in a separate thread so as not to slow down mount. + * + */ + +void +hfs_initialize_allocator (struct hfsmount *hfsmp) { + +#if CONFIG_HFS_ALLOC_RBTREE + u_int32_t err; + + /* + * Take the allocation file lock. Journal transactions will block until + * we're done here. + */ + int flags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + + /* + * GenerateTree assumes that the bitmap lock is held when you call the function. + * It will drop and re-acquire the lock periodically as needed to let other allocations + * through. It returns with the bitmap lock held. Since we only maintain one tree, + * we don't need to specify a start block (always starts at 0). + */ + err = GenerateTree(hfsmp, hfsmp->totalBlocks, &flags, 1); + if (err) { + goto bailout; + } + /* Mark offset tree as built */ + hfsmp->extent_tree_flags |= HFS_ALLOC_RB_ACTIVE; + +bailout: + /* + * GenerateTree may drop the bitmap lock during operation in order to give other + * threads a chance to allocate blocks, but it will always return with the lock held, so + * we don't need to re-grab the lock in order to update the TREEBUILD_INFLIGHT bit. + */ + hfsmp->extent_tree_flags &= ~HFS_ALLOC_TREEBUILD_INFLIGHT; + if (err != 0) { + /* Wakeup any waiters on the allocation bitmap lock */ + wakeup((caddr_t)&hfsmp->extent_tree_flags); + } + + hfs_systemfile_unlock(hfsmp, flags); +#else +#pragma unused (hfsmp) +#endif +} + + +/* + * Teardown code for the Red-Black Tree allocator. + * This function consolidates the code which serializes with respect + * to a thread that may be potentially still building the tree when we need to begin + * tearing it down. Since the red-black tree may not be live when we enter this function + * we return: + * 1 -> Tree was live. 
+ * 0 -> Tree was not active at time of call. + */ + +int +hfs_teardown_allocator (struct hfsmount *hfsmp) { + int rb_used = 0; + +#if CONFIG_HFS_ALLOC_RBTREE + + int flags = 0; + + /* + * Check to see if the tree-generation is still on-going. + * If it is, then block until it's done. + */ + + flags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + + + while (hfsmp->extent_tree_flags & HFS_ALLOC_TREEBUILD_INFLIGHT) { + hfsmp->extent_tree_flags |= HFS_ALLOC_TEARDOWN_INFLIGHT; + + lck_rw_sleep(&(VTOC(hfsmp->hfs_allocation_vp))->c_rwlock, LCK_SLEEP_EXCLUSIVE, + &hfsmp->extent_tree_flags, THREAD_UNINT); + } + + if (hfs_isrbtree_active (hfsmp)) { + rb_used = 1; + + /* Tear down the RB Trees while we have the bitmap locked */ + DestroyTrees(hfsmp); + + } + + hfs_systemfile_unlock(hfsmp, flags); +#else + #pragma unused (hfsmp) +#endif + return rb_used; + +} + + +static int hfs_root_unmounted_cleanly = 0; + +SYSCTL_DECL(_vfs_generic); +SYSCTL_INT(_vfs_generic, OID_AUTO, root_unmounted_cleanly, CTLFLAG_RD, &hfs_root_unmounted_cleanly, 0, "Root filesystem was unmounted cleanly"); + /* * Common code for mount and mountroot */ -static int +int hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, int journal_replay_only, vfs_context_t context) { @@ -985,7 +1236,10 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, daddr64_t mdb_offset; int isvirtual = 0; int isroot = 0; - u_int32_t device_features = 0; + int isssd; +#if CONFIG_HFS_ALLOC_RBTREE + thread_t allocator_thread; +#endif if (args == NULL) { /* only hfs_mountroot passes us NULL as the 'args' argument */ @@ -1007,6 +1261,9 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, /* Get the logical block size (treated as physical block size everywhere) */ if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&log_blksize, 0, context)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCGETBLOCKSIZE failed\n"); + } retval = ENXIO; goto error_exit; } @@ -1020,6 +1277,9 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, retval = VNOP_IOCTL(devvp, DKIOCGETPHYSICALBLOCKSIZE, (caddr_t)&phys_blksize, 0, context); if (retval) { if ((retval != ENOTSUP) && (retval != ENOTTY)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCGETPHYSICALBLOCKSIZE failed\n"); + } retval = ENXIO; goto error_exit; } @@ -1039,6 +1299,9 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, u_int32_t size512 = 512; if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, context)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCSETBLOCKSIZE failed \n"); + } retval = ENXIO; goto error_exit; } @@ -1047,7 +1310,9 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) { /* resetting block size may fail if getting block count did */ (void)VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context); - + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCGETBLOCKCOUNT failed\n"); + } retval = ENXIO; goto error_exit; } @@ -1083,11 +1348,17 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, /* Now switch to our preferred physical block size. 
*/ if (log_blksize > 512) { if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCSETBLOCKSIZE (2) failed\n"); + } retval = ENXIO; goto error_exit; } /* Get the count of physical blocks. */ if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (2) failed\n"); + } retval = ENXIO; goto error_exit; } @@ -1103,11 +1374,17 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, if ((retval = (int)buf_meta_bread(devvp, HFS_PHYSBLK_ROUNDDOWN(mdb_offset, (phys_blksize/log_blksize)), phys_blksize, cred, &bp))) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: buf_meta_bread failed with %d\n", retval); + } goto error_exit; } MALLOC(mdbp, HFSMasterDirectoryBlock *, kMDBSize, M_TEMP, M_WAITOK); if (mdbp == NULL) { retval = ENOMEM; + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: MALLOC failed\n"); + } goto error_exit; } bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize), mdbp, kMDBSize); @@ -1116,25 +1393,27 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, MALLOC(hfsmp, struct hfsmount *, sizeof(struct hfsmount), M_HFSMNT, M_WAITOK); if (hfsmp == NULL) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: MALLOC (2) failed\n"); + } retval = ENOMEM; goto error_exit; } bzero(hfsmp, sizeof(struct hfsmount)); hfs_chashinit_finish(hfsmp); - + /* - * See if the disk supports unmap (trim). - * - * NOTE: vfs_init_io_attributes has not been called yet, so we can't use the io_flags field - * returned by vfs_ioattr. We need to call VNOP_IOCTL ourselves. + * See if the disk is a solid state device. We need this to decide what to do about + * hotfiles. 
*/ - if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&device_features, 0, context) == 0) { - if (device_features & DK_FEATURE_UNMAP) { - hfsmp->hfs_flags |= HFS_UNMAP; + if (VNOP_IOCTL(devvp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, context) == 0) { + if (isssd) { + hfsmp->hfs_flags |= HFS_SSD; } } - + + /* * Init the volume information structure */ @@ -1143,7 +1422,8 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, lck_mtx_init(&hfsmp->hfc_mutex, hfs_mutex_group, hfs_lock_attr); lck_rw_init(&hfsmp->hfs_global_lock, hfs_rwlock_group, hfs_lock_attr); lck_rw_init(&hfsmp->hfs_insync, hfs_rwlock_group, hfs_lock_attr); - + lck_spin_init(&hfsmp->vcbFreeExtLock, hfs_spinlock_group, hfs_lock_attr); + vfs_setfsprivate(mp, hfsmp); hfsmp->hfs_mp = mp; /* Make VFSTOHFS work */ hfsmp->hfs_raw_dev = vnode_specrdev(devvp); @@ -1216,6 +1496,9 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, retval = EROFS; goto error_exit; } + + printf("hfs_mountfs: Mounting HFS Standard volumes was deprecated in Mac OS 10.7 \n"); + /* Treat it as if it's read-only and not writeable */ hfsmp->hfs_flags |= HFS_READ_ONLY; hfsmp->hfs_flags &= ~HFS_WRITEABLE_MEDIA; @@ -1287,11 +1570,18 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, log_blksize = 512; if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) { + + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCSETBLOCKSIZE (3) failed\n"); + } retval = ENXIO; goto error_exit; } if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (3) failed\n"); + } retval = ENXIO; goto error_exit; } @@ -1314,8 +1604,12 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize)); retval = (int)buf_meta_bread(devvp, HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys), phys_blksize, cred, &bp); - if (retval) + if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: buf_meta_bread (2) failed with %d\n", retval); + } goto error_exit; + } bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize), mdbp, 512); buf_brelse(bp); bp = NULL; @@ -1326,6 +1620,10 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, vhp = (HFSPlusVolumeHeader*) mdbp; } + if (isroot) { + hfs_root_unmounted_cleanly = (SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) != 0; + } + /* * On inconsistent disks, do not allow read-write mount * unless it is the boot volume being mounted. We also @@ -1338,6 +1636,10 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, && (SWAP_BE32(vhp->attributes) & kHFSVolumeInconsistentMask) && !journal_replay_only && !(hfsmp->hfs_flags & HFS_READ_ONLY)) { + + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: failed to mount non-root inconsistent disk\n"); + } retval = EINVAL; goto error_exit; } @@ -1375,6 +1677,9 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, // EROFS is a special error code that means the volume has an external // journal which we couldn't find. in that case we do not want to // rewrite the volume header - we'll just refuse to mount the volume. 
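The DKIOCISSOLIDSTATE probe used above is also reachable from user space with ioctl(2) on the raw device node, which makes it easy to check what the mount-time HFS_SSD decision will see. A small sketch (the device path is only an example):

#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/disk.h>

int main(void)
{
    uint32_t isssd = 0;
    int fd = open("/dev/rdisk0", O_RDONLY);    /* example device */

    if (fd < 0) {
        perror("open");
        return 1;
    }
    /* Same query hfs_mountfs issues through VNOP_IOCTL at mount time. */
    if (ioctl(fd, DKIOCISSOLIDSTATE, &isssd) == 0)
        printf("solid state: %s\n", isssd ? "yes" : "no");
    else
        perror("DKIOCISSOLIDSTATE");
    close(fd);
    return 0;
}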
+ if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: hfs_early_journal_init indicated external jnl \n"); + } retval = EINVAL; goto error_exit; } @@ -1383,7 +1688,11 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, // to be "FSK!" which fsck_hfs will see and force the fsck instead // of just bailing out because the volume is journaled. if (!ronly) { - HFSPlusVolumeHeader *jvhp; + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: hfs_early_journal_init failed, setting to FSK \n"); + } + + HFSPlusVolumeHeader *jvhp; hfsmp->hfs_flags |= HFS_NEED_JNL_RESET; @@ -1418,6 +1727,9 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, // in the hopes that fsck_hfs will be able to // fix any damage that exists on the volume. if ( !(vfs_flags(mp) & MNT_ROOTFS)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: hfs_early_journal_init failed, erroring out \n"); + } retval = EINVAL; goto error_exit; } @@ -1446,10 +1758,16 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, "(%d) switching to 512\n", log_blksize); log_blksize = 512; if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCSETBLOCKSIZE (4) failed \n"); + } retval = ENXIO; goto error_exit; } if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (4) failed \n"); + } retval = ENXIO; goto error_exit; } @@ -1470,6 +1788,9 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, // to be "FSK!" which fsck_hfs will see and force the fsck instead // of just bailing out because the volume is journaled. if (!ronly) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: hfs_early_journal_init (2) resetting.. \n"); + } HFSPlusVolumeHeader *jvhp; hfsmp->hfs_flags |= HFS_NEED_JNL_RESET; @@ -1504,6 +1825,9 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, // in the hopes that fsck_hfs will be able to // fix any damage that exists on the volume. if ( !(vfs_flags(mp) & MNT_ROOTFS)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: hfs_early_journal_init (2) failed \n"); + } retval = EINVAL; goto error_exit; } @@ -1512,6 +1836,9 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, /* Try again with a smaller block size... */ retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args, cred); + if (retval && HFS_MOUNT_DEBUG) { + printf("hfs_MountHFSPlusVolume (late) returned %d\n",retval); + } } if (retval) (void) hfs_relconverter(0); @@ -1522,6 +1849,9 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, hfsmp->hfs_last_mounted_mtime = hfsmp->hfs_mtime; if ( retval ) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mountfs: encountered failure %d \n", retval); + } goto error_exit; } @@ -1538,7 +1868,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, mp->mnt_vtable->vfc_vfsflags |= VFC_VFSDIRLINKS; } else { /* HFS standard doesn't support extended readdir! */ - mp->mnt_vtable->vfc_vfsflags &= ~VFC_VFSREADDIR_EXTENDED; + mount_set_noreaddirext (mp); } if (args) { @@ -1563,10 +1893,10 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, /* * Set the free space warning levels for the root volume: * - * Set the "danger" limit to 5% of the volume size or 125MB, whichever - * is less. 
Set the "warning" limit to 10% of the volume size or 250MB, + * Set the "danger" limit to 5% of the volume size or 512MB, whichever + * is less. Set the "warning" limit to 10% of the volume size or 1GB, * whichever is less. And last, set the "desired" freespace level to - * to 11% of the volume size or 375MB, whichever is less. + * 11% of the volume size or 1.25GB, whichever is less. */ hfsmp->hfs_freespace_notify_dangerlimit = MIN(HFS_ROOTVERYLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize, @@ -1598,6 +1928,32 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, } } } + +#if CONFIG_HFS_ALLOC_RBTREE + /* + * We spawn a thread to create the pair of red-black trees for this volume. + * However, in so doing, we must be careful to ensure that if this thread is still + * running after mount has finished, it doesn't interfere with an unmount. Specifically, + * we'll need to set a bit that indicates we're in progress building the trees here. + * Unmount will check for this bit, and then if it's set, mark a corresponding bit that + * notifies the tree generation code that an unmount is waiting. Also mark the bit that + * indicates the tree is live and operating. + * + * Only do this if we're operating on a read-write mount (we wouldn't care for read-only). + */ + + if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) { + hfsmp->extent_tree_flags |= (HFS_ALLOC_TREEBUILD_INFLIGHT | HFS_ALLOC_RB_ENABLED); + + /* Initialize EOF counter so that the thread can assume it started at initial values */ + hfsmp->offset_block_end = 0; + InitTree(hfsmp); + + kernel_thread_start ((thread_continue_t) hfs_initialize_allocator , hfsmp, &allocator_thread); + thread_deallocate(allocator_thread); + } + +#endif /* * Start looking for free space to drop below this level and generate a @@ -1628,7 +1984,7 @@ error_exit: vnode_rele(hfsmp->hfs_devvp); } hfs_delete_chash(hfsmp); - + FREE(hfsmp, M_HFSMNT); vfs_setfsprivate(mp, NULL); } @@ -1651,7 +2007,7 @@ hfs_start(__unused struct mount *mp, __unused int flags, __unused vfs_context_t /* * unmount system call */ -static int +int hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context) { struct proc *p = vfs_context_proc(context); @@ -1660,6 +2016,7 @@ hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context) int flags; int force; int started_tr = 0; + int rb_used = 0; flags = 0; force = 0; @@ -1706,6 +2063,10 @@ hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context) panic("hfs_unmount: pm_sync_incomplete underflow!\n"); } +#if CONFIG_HFS_ALLOC_RBTREE + rb_used = hfs_teardown_allocator(hfsmp); +#endif + /* * Flush out the b-trees, volume bitmap and Volume Header */ @@ -1768,22 +2129,31 @@ hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context) HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeUnmountedMask; } - if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { - int i; - u_int32_t min_start = hfsmp->totalBlocks; - - // set the nextAllocation pointer to the smallest free block number - // we've seen so on the next mount we won't rescan unnecessarily - for(i=0; i < (int)hfsmp->vcbFreeExtCnt; i++) { - if (hfsmp->vcbFreeExt[i].startBlock < min_start) { - min_start = hfsmp->vcbFreeExt[i].startBlock; + + if (rb_used) { + /* If the rb-tree was live, just set min_start to 0 */ + hfsmp->nextAllocation = 0; + } + else { + if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { + int i; + u_int32_t min_start = hfsmp->totalBlocks; + + // set the nextAllocation pointer to the smallest free block number + // we've seen so on the next mount we
won't rescan unnecessarily + lck_spin_lock(&hfsmp->vcbFreeExtLock); + for(i=0; i < (int)hfsmp->vcbFreeExtCnt; i++) { + if (hfsmp->vcbFreeExt[i].startBlock < min_start) { + min_start = hfsmp->vcbFreeExt[i].startBlock; + } + } + lck_spin_unlock(&hfsmp->vcbFreeExtLock); + if (min_start < hfsmp->nextAllocation) { + hfsmp->nextAllocation = min_start; } - } - if (min_start < hfsmp->nextAllocation) { - hfsmp->nextAllocation = min_start; } } - + retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0); if (retval) { @@ -1799,7 +2169,7 @@ hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context) } if (hfsmp->jnl) { - hfs_journal_flush(hfsmp); + hfs_journal_flush(hfsmp, FALSE); } /* @@ -1807,11 +2177,6 @@ hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context) */ (void) hfsUnmount(hfsmp, p); - /* - * Last chance to dump unreferenced system files. - */ - (void) vflush(mp, NULLVP, FORCECLOSE); - if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) (void) hfs_relconverter(hfsmp->hfs_encoding); @@ -1833,7 +2198,12 @@ hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context) } // XXXdbg -#ifdef HFS_SPARSE_DEV + /* + * Last chance to dump unreferenced system files. + */ + (void) vflush(mp, NULLVP, FORCECLOSE); + +#if HFS_SPARSE_DEV /* Drop our reference on the backing fs (if any). */ if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) && hfsmp->hfs_backingfs_rootvp) { struct vnode * tmpvp; @@ -1845,6 +2215,7 @@ hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context) } #endif /* HFS_SPARSE_DEV */ lck_mtx_destroy(&hfsmp->hfc_mutex, hfs_mutex_group); + lck_spin_destroy(&hfsmp->vcbFreeExtLock, hfs_spinlock_group); vnode_rele(hfsmp->hfs_devvp); hfs_delete_chash(hfsmp); @@ -1866,7 +2237,7 @@ hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context) static int hfs_vfs_root(struct mount *mp, struct vnode **vpp, __unused vfs_context_t context) { - return hfs_vget(VFSTOHFS(mp), (cnid_t)kHFSRootFolderID, vpp, 1); + return hfs_vget(VFSTOHFS(mp), (cnid_t)kHFSRootFolderID, vpp, 1, 0); } @@ -1887,7 +2258,7 @@ hfs_quotactl(struct mount *mp, int cmds, uid_t uid, caddr_t datap, vfs_context_t int cmd, type, error; if (uid == ~0U) - uid = vfs_context_ucred(context)->cr_ruid; + uid = kauth_cred_getuid(vfs_context_ucred(context)); cmd = cmds >> SUBCMDSHIFT; switch (cmd) { @@ -1895,7 +2266,7 @@ hfs_quotactl(struct mount *mp, int cmds, uid_t uid, caddr_t datap, vfs_context_t case Q_QUOTASTAT: break; case Q_GETQUOTA: - if (uid == vfs_context_ucred(context)->cr_ruid) + if (uid == kauth_cred_getuid(vfs_context_ucred(context))) break; /* fall through */ default: @@ -1958,7 +2329,7 @@ hfs_quotactl(struct mount *mp, int cmds, uid_t uid, caddr_t datap, vfs_context_t /* * Get file system statistics. */ -static int +int hfs_statfs(struct mount *mp, register struct vfsstatfs *sbp, __unused vfs_context_t context) { ExtendedVCB *vcb = VFSTOVCB(mp); @@ -2099,7 +2470,7 @@ hfs_sync_callback(struct vnode *vp, void *cargs) * * Note: we are always called with the filesystem marked `MPBUSY'. 
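The nextAllocation rewind in the unmount path above is a plain minimum scan over the in-memory free extent cache, performed under the new vcbFreeExtLock spin lock so the cache cannot shift mid-scan. The same scan in isolation (field names follow the vcbFreeExt array; locking elided):

#include <stdint.h>

typedef struct {
    uint32_t startBlock;
    uint32_t blockCount;
} FreeExtent;

/* Smallest start block in the free extent cache, or total_blocks if the
 * cache is empty, mirroring the scan hfs_unmount does before flushing
 * the volume header. */
uint32_t min_free_start(const FreeExtent *cache, uint32_t count, uint32_t total_blocks)
{
    uint32_t min_start = total_blocks;
    uint32_t i;

    for (i = 0; i < count; i++) {
        if (cache[i].startBlock < min_start)
            min_start = cache[i].startBlock;
    }
    return min_start;
}

/* The caller only rewinds if it helps:
 *     if (min_start < hfsmp->nextAllocation)
 *         hfsmp->nextAllocation = min_start;
 */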
*/ -static int +int hfs_sync(struct mount *mp, int waitfor, vfs_context_t context) { struct proc *p = vfs_context_proc(context); @@ -2203,7 +2574,7 @@ hfs_sync(struct mount *mp, int waitfor, vfs_context_t context) } if (hfsmp->jnl) { - hfs_journal_flush(hfsmp); + hfs_journal_flush(hfsmp, FALSE); } { @@ -2244,7 +2615,7 @@ hfs_fhtovp(struct mount *mp, int fhlen, unsigned char *fhp, struct vnode **vpp, if (fhlen < (int)sizeof(struct hfsfid)) return (EINVAL); - result = hfs_vget(VFSTOHFS(mp), ntohl(hfsfhp->hfsfid_cnid), &nvp, 0); + result = hfs_vget(VFSTOHFS(mp), ntohl(hfsfhp->hfsfid_cnid), &nvp, 0, 0); if (result) { if (result == ENOENT) result = ESTALE; @@ -2319,6 +2690,7 @@ hfs_init(__unused struct vfsconf *vfsp) hfs_group_attr = lck_grp_attr_alloc_init(); hfs_mutex_group = lck_grp_alloc_init("hfs-mutex", hfs_group_attr); hfs_rwlock_group = lck_grp_alloc_init("hfs-rwlock", hfs_group_attr); + hfs_spinlock_group = lck_grp_alloc_init("hfs-spinlock", hfs_group_attr); #if HFS_COMPRESSION decmpfs_init(); @@ -2359,7 +2731,7 @@ hfs_getmountpoint(struct vnode *vp, struct hfsmount **hfsmpp) /* * HFS filesystem related variables. */ -static int +int hfs_sysctl(int *name, __unused u_int namelen, user_addr_t oldp, size_t *oldlenp, user_addr_t newp, size_t newlen, vfs_context_t context) { @@ -2505,6 +2877,15 @@ encodinghint_exit: 0, hfs_sync_metadata, hfsmp->hfs_mp); + /* + * Set up the trim callback function so that we can add + * recently freed extents to the free extent cache once + * the transaction that freed them is written to the + * journal on disk. + */ + if (jnl) + journal_trim_set_callback(jnl, hfs_trim_callback, hfsmp); + if (jnl == NULL) { printf("hfs: FAILED to create the journal!\n"); if (jvp && jvp != hfsmp->hfs_devvp) { @@ -2516,17 +2897,17 @@ encodinghint_exit: return EINVAL; } - hfs_global_exclusive_lock_acquire(hfsmp); - + hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); + /* * Flush all dirty metadata buffers. */ - buf_flushdirtyblks(hfsmp->hfs_devvp, MNT_WAIT, 0, "hfs_sysctl"); - buf_flushdirtyblks(hfsmp->hfs_extents_vp, MNT_WAIT, 0, "hfs_sysctl"); - buf_flushdirtyblks(hfsmp->hfs_catalog_vp, MNT_WAIT, 0, "hfs_sysctl"); - buf_flushdirtyblks(hfsmp->hfs_allocation_vp, MNT_WAIT, 0, "hfs_sysctl"); + buf_flushdirtyblks(hfsmp->hfs_devvp, TRUE, 0, "hfs_sysctl"); + buf_flushdirtyblks(hfsmp->hfs_extents_vp, TRUE, 0, "hfs_sysctl"); + buf_flushdirtyblks(hfsmp->hfs_catalog_vp, TRUE, 0, "hfs_sysctl"); + buf_flushdirtyblks(hfsmp->hfs_allocation_vp, TRUE, 0, "hfs_sysctl"); if (hfsmp->hfs_attribute_vp) - buf_flushdirtyblks(hfsmp->hfs_attribute_vp, MNT_WAIT, 0, "hfs_sysctl"); + buf_flushdirtyblks(hfsmp->hfs_attribute_vp, TRUE, 0, "hfs_sysctl"); HFSTOVCB(hfsmp)->vcbJinfoBlock = name[1]; HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeJournaledMask; @@ -2541,7 +2922,7 @@ encodinghint_exit: vfs_setflags(hfsmp->hfs_mp, (u_int64_t)((unsigned int)MNT_JOURNALED)); - hfs_global_exclusive_lock_release(hfsmp); + hfs_unlock_global (hfsmp); hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1); { @@ -2576,7 +2957,7 @@ encodinghint_exit: printf("hfs: disabling journaling for mount @ %p\n", vnode_mount(vp)); - hfs_global_exclusive_lock_acquire(hfsmp); + hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK); // Lights out for you buddy! 
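journal_trim_set_callback above is a register-now, call-at-commit hook: the journal remembers a function and a context pointer, and invokes them once the transaction that freed a set of extents is safely on disk, at which point hfs_trim_callback can return those extents to the free extent cache. A generic sketch of that pattern follows; the struct and signatures are invented for illustration (dk_extent_t is the real type from <sys/disk.h>), and this is not the actual vfs_journal interface:

#include <stdint.h>
#include <sys/disk.h>    /* dk_extent_t: { uint64_t offset, length } */

typedef void (*trim_callback_t)(void *arg, uint32_t extent_count,
                                const dk_extent_t *extents);

struct journal_like {
    trim_callback_t trim_cb;
    void           *trim_cb_arg;
};

/* Registration: just remember who to notify and with what context. */
void trim_set_callback(struct journal_like *jnl, trim_callback_t cb, void *arg)
{
    jnl->trim_cb = cb;
    jnl->trim_cb_arg = arg;
}

/* Commit side: once the transaction that freed these extents is on disk,
 * hand them to the registered callback (HFS uses this moment to grow the
 * free extent cache). */
void transaction_committed(struct journal_like *jnl, uint32_t count,
                           const dk_extent_t *freed)
{
    if (jnl->trim_cb)
        jnl->trim_cb(jnl->trim_cb_arg, count, freed);
}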
journal_close(hfsmp->jnl); @@ -2595,7 +2976,8 @@ encodinghint_exit: HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeJournaledMask; - hfs_global_exclusive_lock_release(hfsmp); + hfs_unlock_global (hfsmp); + hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1); { @@ -2676,6 +3058,10 @@ encodinghint_exit: file_drop(device_fd); vnode_put(devvp); return error; + } else if (name[0] == HFS_ENABLE_RESIZE_DEBUG) { + hfs_resize_debug = 1; + printf ("hfs_sysctl: Enabled volume resize debugging.\n"); + return 0; } return (ENOTSUP); @@ -2696,7 +3082,7 @@ hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, __unused vfs_con hfsmp = VFSTOHFS(mp); - error = hfs_vget(hfsmp, (cnid_t)ino, vpp, 1); + error = hfs_vget(hfsmp, (cnid_t)ino, vpp, 1, 0); if (error) return (error); @@ -2737,9 +3123,8 @@ hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, __unused vfs_con * * If the object is a file then it will represent the data fork. */ -__private_extern__ int -hfs_vget(struct hfsmount *hfsmp, cnid_t cnid, struct vnode **vpp, int skiplock) +hfs_vget(struct hfsmount *hfsmp, cnid_t cnid, struct vnode **vpp, int skiplock, int allow_deleted) { struct vnode *vp = NULLVP; struct cat_desc cndesc; @@ -2761,7 +3146,7 @@ hfs_vget(struct hfsmount *hfsmp, cnid_t cnid, struct vnode **vpp, int skiplock) /* * Check the hash first */ - vp = hfs_chash_getvnode(hfsmp, cnid, 0, skiplock); + vp = hfs_chash_getvnode(hfsmp, cnid, 0, skiplock, allow_deleted); if (vp) { *vpp = vp; return(0); @@ -2841,7 +3226,7 @@ hfs_vget(struct hfsmount *hfsmp, cnid_t cnid, struct vnode **vpp, int skiplock) * Pick up the first link in the chain and get a descriptor for it. * This allows blind volfs paths to work for hardlinks. */ - if ((hfs_lookuplink(hfsmp, linkref, &prevlinkid, &nextlinkid) == 0) && + if ((hfs_lookup_siblinglinks(hfsmp, linkref, &prevlinkid, &nextlinkid) == 0) && (nextlinkid != 0)) { lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK); error = cat_findname(hfsmp, nextlinkid, &linkdesc); @@ -2854,13 +3239,17 @@ hfs_vget(struct hfsmount *hfsmp, cnid_t cnid, struct vnode **vpp, int skiplock) } if (linkref) { - error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cnfork, &vp); + int newvnode_flags = 0; + + error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, + &cnfork, &vp, &newvnode_flags); if (error == 0) { VTOC(vp)->c_flag |= C_HARDLINK; vnode_setmultipath(vp); } } else { struct componentname cn; + int newvnode_flags = 0; /* Supply hfs_getnewvnode with a component name. */ MALLOC_ZONE(cn.cn_pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); @@ -2874,7 +3263,8 @@ hfs_vget(struct hfsmount *hfsmp, cnid_t cnid, struct vnode **vpp, int skiplock) cn.cn_consume = 0; bcopy(cndesc.cd_nameptr, cn.cn_nameptr, cndesc.cd_namelen + 1); - error = hfs_getnewvnode(hfsmp, NULLVP, &cn, &cndesc, 0, &cnattr, &cnfork, &vp); + error = hfs_getnewvnode(hfsmp, NULLVP, &cn, &cndesc, 0, &cnattr, + &cnfork, &vp, &newvnode_flags); if (error == 0 && (VTOC(vp)->c_flag & C_HARDLINK)) { hfs_savelinkorigin(VTOC(vp), cndesc.cd_parentcnid); @@ -2927,7 +3317,7 @@ hfs_flushfiles(struct mount *mp, int flags, __unused struct proc *p) } /* Obtain the root vnode so we can skip over it. 
*/ - skipvp = hfs_chash_getvnode(hfsmp, kHFSRootFolderID, 0, 0); + skipvp = hfs_chash_getvnode(hfsmp, kHFSRootFolderID, 0, 0, 0); } #endif /* QUOTA */ @@ -3004,7 +3394,6 @@ hfs_setencodingbits(struct hfsmount *hfsmp, u_int32_t encoding) * * On journal volumes this will cause a volume header flush */ -__private_extern__ int hfs_volupdate(struct hfsmount *hfsmp, enum volop op, int inroot) { @@ -3079,7 +3468,7 @@ hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush) mdb = (HFSMasterDirectoryBlock *)(buf_dataptr(bp) + HFS_PRI_OFFSET(sectorsize)); - mdb->drCrDate = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbCrDate))); + mdb->drCrDate = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->hfs_itime))); mdb->drLsMod = SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbLsMod))); mdb->drAtrb = SWAP_BE16 (vcb->vcbAtrb); mdb->drNmFls = SWAP_BE16 (vcb->vcbNmFls); @@ -3156,7 +3545,6 @@ hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush) * not flushed since the on-disk "H+" and "HX" signatures * are always stored in-memory as "H+". */ -__private_extern__ int hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush) { @@ -3464,7 +3852,6 @@ err_exit: /* * Extend a file system. */ -__private_extern__ int hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) { @@ -3509,7 +3896,7 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) * ownership and check permissions. */ if (suser(cred, NULL)) { - error = hfs_vget(hfsmp, kHFSRootFolderID, &vp, 0); + error = hfs_vget(hfsmp, kHFSRootFolderID, &vp, 0, 0); if (error) return (error); @@ -3562,7 +3949,11 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) addblks = newblkcnt - vcb->totalBlocks; - printf("hfs_extendfs: growing %s by %d blocks\n", vcb->vcbVN, addblks); + if (hfs_resize_debug) { + printf ("hfs_extendfs: old: size=%qu, blkcnt=%u\n", oldsize, hfsmp->totalBlocks); + printf ("hfs_extendfs: new: size=%qu, blkcnt=%u, addblks=%u\n", newsize, (u_int32_t)newblkcnt, addblks); + } + printf("hfs_extendfs: will extend \"%s\" by %d blocks\n", vcb->vcbVN, addblks); HFS_MOUNT_LOCK(hfsmp, TRUE); if (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) { @@ -3573,9 +3964,6 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) hfsmp->hfs_flags |= HFS_RESIZE_IN_PROGRESS; HFS_MOUNT_UNLOCK(hfsmp, TRUE); - /* Invalidate the current free extent cache */ - invalidate_free_extent_cache(hfsmp); - /* * Enclose changes inside a transaction. */ @@ -3604,6 +3992,17 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) else bitmapblks = 0; + /* + * The allocation bitmap can contain unused bits that are beyond end of + * current volume's allocation blocks. Usually they are supposed to be + * zero'ed out but there can be cases where they might be marked as used. + * After extending the file system, those bits can represent valid + * allocation blocks, so we mark all the bits from the end of current + * volume to end of allocation bitmap as "free". + */ + BlockMarkFreeUnused(vcb, vcb->totalBlocks, + (fp->ff_blocks * vcb->blockSize * 8) - vcb->totalBlocks); + if (bitmapblks > 0) { daddr64_t blkno; daddr_t blkcnt; @@ -3623,8 +4022,8 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) * zone. 
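The BlockMarkFreeUnused call above clears every bitmap bit between the current end of the volume and the end of the bitmap file itself; the count argument is exactly the bitmap's bit capacity minus the blocks currently mapped. The arithmetic, pulled out with a worked example (the example figures are assumed):

#include <stdint.h>

/* Trailing bitmap bits that lie beyond the volume's last allocation block.
 * Mirrors (fp->ff_blocks * vcb->blockSize * 8) - vcb->totalBlocks from the
 * patch; assumes the product fits in 32 bits, as the kernel expression does. */
uint32_t unused_bitmap_bits(uint32_t bitmap_file_blocks,
                            uint32_t block_size,
                            uint32_t total_blocks)
{
    uint32_t bits_in_bitmap = bitmap_file_blocks * block_size * 8;
    return bits_in_bitmap - total_blocks;
}

/* Example (assumed figures): with 4096-byte blocks, a one-block bitmap maps
 * 4096 * 8 = 32768 allocation blocks; on a 30000-block volume the final
 * 2768 bits are cleared before the volume grows into them. */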
*/ error = ExtendFileC(vcb, fp, bitmapblks * vcb->blockSize, 0, - kEFAllMask | kEFNoClumpMask | kEFReserveMask | kEFMetadataMask, - &bytesAdded); + kEFAllMask | kEFNoClumpMask | kEFReserveMask + | kEFMetadataMask | kEFContigMask, &bytesAdded); if (error == 0) { usedExtendFileC = true; @@ -3736,7 +4135,8 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) * Restore to old state. */ if (usedExtendFileC) { - (void) TruncateFileC(vcb, fp, oldBitmapSize, false); + (void) TruncateFileC(vcb, fp, oldBitmapSize, 0, FORK_IS_RSRC(fp), + FTOC(fp)->c_fileid, false); } else { fp->ff_blocks -= bitmapblks; fp->ff_size -= (u_int64_t)bitmapblks * (u_int64_t)vcb->blockSize; @@ -3752,10 +4152,15 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) hfsmp->hfs_logical_block_count = prev_phys_block_count; hfsmp->hfs_alt_id_sector = prev_alt_sector; MarkVCBDirty(vcb); - if (vcb->blockSize == 512) - (void) BlockMarkAllocated(vcb, vcb->totalBlocks - 2, 2); - else - (void) BlockMarkAllocated(vcb, vcb->totalBlocks - 1, 1); + if (vcb->blockSize == 512) { + if (BlockMarkAllocated(vcb, vcb->totalBlocks - 2, 2)) { + hfs_mark_volume_inconsistent(hfsmp); + } + } else { + if (BlockMarkAllocated(vcb, vcb->totalBlocks - 1, 1)) { + hfs_mark_volume_inconsistent(hfsmp); + } + } goto out; } /* @@ -3779,7 +4184,7 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) /* * Update the metadata zone size based on current volume size */ - hfs_metadatazone_init(hfsmp); + hfs_metadatazone_init(hfsmp, false); /* * Adjust the size of hfsmp->hfs_attrdata_vp @@ -3801,21 +4206,36 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) } } + /* + * Update the R/B Tree if necessary. Since we don't have to drop the systemfile + * locks in the middle of these operations like we do in the truncate case + * where we have to relocate files, we can only update the red-black tree + * if there were actual changes made to the bitmap. Also, we can't really scan the + * new portion of the bitmap before it has been allocated. The BlockMarkAllocated + * routines are smart enough to avoid the r/b tree if the portion they are manipulating is + * not currently controlled by the tree. + * + * We only update hfsmp->allocLimit if totalBlocks actually increased. + */ + + if (error == 0) { + UpdateAllocLimit(hfsmp, hfsmp->totalBlocks); + } + + /* Log successful extending */ + printf("hfs_extendfs: extended \"%s\" to %d blocks (was %d blocks)\n", + hfsmp->vcbVN, hfsmp->totalBlocks, (u_int32_t)(oldsize/hfsmp->blockSize)); + out: if (error && fp) { /* Restore allocation fork. */ bcopy(&forkdata, &fp->ff_data, sizeof(forkdata)); VTOC(vp)->c_blocks = fp->ff_blocks; - + } - /* - Regardless of whether or not the totalblocks actually increased, - we should reset the allocLimit field. If it changed, it will - get updated; if not, it will remain the same. - */ + HFS_MOUNT_LOCK(hfsmp, TRUE); hfsmp->hfs_flags &= ~HFS_RESIZE_IN_PROGRESS; - hfsmp->allocLimit = vcb->totalBlocks; HFS_MOUNT_UNLOCK(hfsmp, TRUE); if (lockflags) { hfs_systemfile_unlock(hfsmp, lockflags); @@ -3824,7 +4244,7 @@ out: hfs_end_transaction(hfsmp); } - return (error); + return MacToVFSError(error); } #define HFS_MIN_SIZE (32LL * 1024LL * 1024LL) @@ -3832,7 +4252,6 @@ out: /* * Truncate a file system (while still mounted). 
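Both the extend rollback above and the truncate path that follows reserve the 1 KB alternate volume header in the last kilobyte of the volume: two blocks when the allocation block size is 512 bytes, one block otherwise. A small helper capturing that rule (illustrative, not from the source):

#include <stdint.h>

/* The alternate volume header occupies the last 1 KB of the volume. */
void alt_vh_blocks(uint32_t block_size, uint32_t total_blocks,
                   uint32_t *start, uint32_t *count)
{
    *count = (block_size == 512) ? 2 : 1;
    *start = total_blocks - *count;
}

/* With 512-byte blocks on a 1000-block volume this reserves blocks 998 and
 * 999, matching BlockMarkAllocated(vcb, vcb->totalBlocks - 2, 2) above. */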
*/ -__private_extern__ int hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) { @@ -3843,17 +4262,19 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) int lockflags = 0; int transaction_begun = 0; Boolean updateFreeBlocks = false; - int error; + Boolean disable_sparse = false; + int error = 0; - HFS_MOUNT_LOCK(hfsmp, TRUE); + lck_mtx_lock(&hfsmp->hfs_mutex); if (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) { - HFS_MOUNT_UNLOCK(hfsmp, TRUE); + lck_mtx_unlock(&hfsmp->hfs_mutex); return (EALREADY); } hfsmp->hfs_flags |= HFS_RESIZE_IN_PROGRESS; - hfsmp->hfs_resize_filesmoved = 0; - hfsmp->hfs_resize_totalfiles = 0; - HFS_MOUNT_UNLOCK(hfsmp, TRUE); + hfsmp->hfs_resize_blocksmoved = 0; + hfsmp->hfs_resize_totalblocks = 0; + hfsmp->hfs_resize_progress = 0; + lck_mtx_unlock(&hfsmp->hfs_mutex); /* * - Journaled HFS Plus volumes only. @@ -3882,25 +4303,66 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) error = EINVAL; goto out; } - /* Make sure that the file system has enough free blocks reclaim */ - if (reclaimblks >= hfs_freeblks(hfsmp, 1)) { - printf("hfs_truncatefs: insufficient space (need %u blocks; have %u free blocks)\n", reclaimblks, hfs_freeblks(hfsmp, 1)); - error = ENOSPC; - goto out; - } - - /* Invalidate the current free extent cache */ - invalidate_free_extent_cache(hfsmp); - - /* Start with a clean journal. */ - hfs_journal_flush(hfsmp); + + /* + * Make sure that the file system has enough free blocks to perform the reclaim. + * + * Before resize, the disk is divided into four zones - + * A. Allocated_Stationary - These are allocated blocks that exist + * before the new end of disk. These blocks will not be + * relocated or modified during resize. + * B. Free_Stationary - These are free blocks that exist before the + * new end of disk. These blocks can be used for any new + * allocations during resize, including allocation for relocating + * data from the area of disk being reclaimed. + * C. Allocated_To-Reclaim - These are allocated blocks that exist + * beyond the new end of disk. These blocks need to be reclaimed + * during resize by allocating equal number of blocks in Free + * Stationary zone and copying the data. + * D. Free_To-Reclaim - These are free blocks that exist beyond the + * new end of disk. Nothing special needs to be done to reclaim + * them. + * + * Total number of blocks on the disk before resize: + * ------------------------------------------------ + * Total Blocks = Allocated_Stationary + Free_Stationary + + * Allocated_To-Reclaim + Free_To-Reclaim + * + * Total number of blocks that need to be reclaimed: + * ------------------------------------------------ + * Blocks to Reclaim = Allocated_To-Reclaim + Free_To-Reclaim + * + * Note that the check below also makes sure that we have enough space + * to relocate data from Allocated_To-Reclaim to Free_Stationary. + * Therefore we do not need to check total number of blocks to relocate + * later in the code. + * + * The condition below gets converted to: + * + * Allocated To-Reclaim + Free To-Reclaim >= Free Stationary + Free To-Reclaim + * + * which is equivalent to: + * + * Allocated To-Reclaim >= Free Stationary + */ + if (reclaimblks >= hfs_freeblks(hfsmp, 1)) { + printf("hfs_truncatefs: insufficient space (need %u blocks; have %u free blocks)\n", reclaimblks, hfs_freeblks(hfsmp, 1)); + error = ENOSPC; + goto out; + } + + /* Start with a clean journal.
*/ + hfs_journal_flush(hfsmp, TRUE); if (hfs_start_transaction(hfsmp) != 0) { error = EINVAL; goto out; } transaction_begun = 1; - + + /* Take the bitmap lock to update the alloc limit field */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK); + /* * Prevent new allocations from using the part we're trying to truncate. * @@ -3909,12 +4371,36 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) * interfere with allocating the new alternate volume header, and no files * in the allocation blocks beyond (i.e. the blocks we're trying to * truncate away. + * + * Also shrink the red-black tree if needed. */ + if (hfsmp->blockSize == 512) { + error = UpdateAllocLimit (hfsmp, newblkcnt - 2); + } + else { + error = UpdateAllocLimit (hfsmp, newblkcnt - 1); + } + + /* Sparse devices use first fit allocation which is not ideal + * for volume resize which requires best fit allocation. If a + * sparse device is being truncated, disable the sparse device + * property temporarily for the duration of resize. Also reset + * the free extent cache so that it is rebuilt as sorted by + * totalBlocks instead of startBlock. + * + * Note that this will affect all allocations on the volume and + * the ideal fix would be just to modify resize-related allocations, + * but it will result in complexity like handling of two free + * extent caches sorted differently, etc. So we stick to this + * solution for now. */ HFS_MOUNT_LOCK(hfsmp, TRUE); - if (hfsmp->blockSize == 512) - hfsmp->allocLimit = newblkcnt - 2; - else - hfsmp->allocLimit = newblkcnt - 1; + if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) { + hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE; + ResetVCBFreeExtCache(hfsmp); + disable_sparse = true; + } + /* * Update the volume free block count to reflect the total number * of free blocks that will exist after a successful resize. */ hfsmp->freeBlocks -= reclaimblks; updateFreeBlocks = true; - HFS_MOUNT_UNLOCK(hfsmp, TRUE); + HFS_MOUNT_UNLOCK(hfsmp, TRUE); + + if (lockflags) { + hfs_systemfile_unlock(hfsmp, lockflags); + lockflags = 0; + } + /* - * Update the metadata zone size, and, if required, disable it + * Update the metadata zone size to match the new volume size, + * and if it is too small, the metadata zone might be disabled. */ - hfs_metadatazone_init(hfsmp); + hfs_metadatazone_init(hfsmp, false); /* - * Look for files that have blocks at or beyond the location of the - * new alternate volume header + * If some files have blocks at or beyond the location of the + * new alternate volume header, recalculate free blocks and + * reclaim blocks. Otherwise just update free blocks count. + * + * The current allocLimit is set to the location of the new alternate + * volume header, and reclaimblks is the total number of blocks + * that need to be reclaimed. So the check below is really + * ignoring the blocks allocated for old alternate volume header. */ if (hfs_isallocated(hfsmp, hfsmp->allocLimit, reclaimblks)) { /* @@ -3967,23 +4465,14 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) error = EAGAIN; /* tell client to try again */ goto out; } - } - + } + /* * Note: we take the attributes lock in case we have an attribute data vnode * which needs to change size. */ lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); - /* - * Mark the old alternate volume header as free.
- * We don't bother shrinking allocation bitmap file. - */ - if (hfsmp->blockSize == 512) - (void) BlockMarkFree(hfsmp, hfsmp->totalBlocks - 2, 2); - else - (void) BlockMarkFree(hfsmp, hfsmp->totalBlocks - 1, 1); - /* * Allocate last 1KB for alternate volume header. */ @@ -3993,6 +4482,15 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) goto out; } + /* + * Mark the old alternate volume header as free. + * We don't bother shrinking allocation bitmap file. + */ + if (hfsmp->blockSize == 512) + (void) BlockMarkFree(hfsmp, hfsmp->totalBlocks - 2, 2); + else + (void) BlockMarkFree(hfsmp, hfsmp->totalBlocks - 1, 1); + /* * Invalidate the existing alternate volume header. * @@ -4028,7 +4526,7 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH); if (error) panic("hfs_truncatefs: unexpected error flushing volume header (%d)\n", error); - + /* * Adjust the size of hfsmp->hfs_attrdata_vp */ @@ -4050,17 +4548,36 @@ hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) } out: - lck_mtx_lock(&hfsmp->hfs_mutex); - if (error && (updateFreeBlocks == true)) + /* + * Update the allocLimit to acknowledge the last one or two blocks now. + * Add it to the tree as well if necessary. + */ + UpdateAllocLimit (hfsmp, hfsmp->totalBlocks); + + HFS_MOUNT_LOCK(hfsmp, TRUE); + if (disable_sparse == true) { + /* Now that resize is completed, set the volume to be sparse + * device again so that all further allocations will be first + * fit instead of best fit. Reset free extent cache so that + * it is rebuilt. + */ + hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE; + ResetVCBFreeExtCache(hfsmp); + } + + if (error && (updateFreeBlocks == true)) { hfsmp->freeBlocks += reclaimblks; - hfsmp->allocLimit = hfsmp->totalBlocks; - if (hfsmp->nextAllocation >= hfsmp->allocLimit) + } + + if (hfsmp->nextAllocation >= hfsmp->allocLimit) { hfsmp->nextAllocation = hfsmp->hfs_metazone_end + 1; + } hfsmp->hfs_flags &= ~HFS_RESIZE_IN_PROGRESS; - HFS_MOUNT_UNLOCK(hfsmp, TRUE); + HFS_MOUNT_UNLOCK(hfsmp, TRUE); + /* On error, reset the metadata zone for original volume size */ if (error && (updateFreeBlocks == true)) { - hfs_metadatazone_init(hfsmp); + hfs_metadatazone_init(hfsmp, false); } if (lockflags) { @@ -4068,12 +4585,12 @@ out: } if (transaction_begun) { hfs_end_transaction(hfsmp); - hfs_journal_flush(hfsmp); + hfs_journal_flush(hfsmp, FALSE); /* Just to be sure, sync all data to the disk */ (void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context); } - return (error); + return MacToVFSError(error); } @@ -4135,6 +4652,9 @@ hfs_copy_extent( u_int32_t ioSizeSectors; /* Device sectors in this I/O */ daddr64_t srcSector, destSector; u_int32_t sectorsPerBlock = hfsmp->blockSize / hfsmp->hfs_logical_block_size; +#if CONFIG_PROTECT + int cpenabled = 0; +#endif /* * Sanity check that we have locked the vnode of the file we're copying. @@ -4147,6 +4667,25 @@ hfs_copy_extent( if (cp != hfsmp->hfs_allocation_cp && cp->c_lockowner != current_thread()) panic("hfs_copy_extent: vp=%p (cp=%p) not owned?\n", vp, cp); +#if CONFIG_PROTECT + /* Prepare the CP blob and get it ready for use */ + if (!vnode_issystem (vp) && vnode_isreg(vp) && + cp_fs_protected (hfsmp->hfs_mp)) { + int cp_err = 0; + cp_err = cp_handle_relocate (cp); + if (cp_err) { + /* + * can't copy the file because we couldn't set up keys. 
+ * bail out + */ + return cp_err; + } + else { + cpenabled = 1; + } + } +#endif + /* * Determine the I/O size to use * @@ -4176,7 +4715,14 @@ hfs_copy_extent( buf_setcount(bp, ioSize); buf_setblkno(bp, srcSector); buf_setlblkno(bp, srcSector); - + + /* Attach the CP to the buffer */ +#if CONFIG_PROTECT + if (cpenabled) { + buf_setcpaddr (bp, cp->c_cpentry); + } +#endif + /* Do the read */ err = VNOP_STRATEGY(bp); if (!err) @@ -4194,6 +4740,13 @@ hfs_copy_extent( buf_setlblkno(bp, destSector); if (vnode_issystem(vp) && journal_uses_fua(hfsmp->jnl)) buf_markfua(bp); + +#if CONFIG_PROTECT + /* Attach the CP to the buffer */ + if (cpenabled) { + buf_setcpaddr (bp, cp->c_cpentry); + } +#endif /* Do the write */ vnode_startwrite(hfsmp->hfs_devvp); @@ -4230,342 +4783,941 @@ hfs_copy_extent( } -static int -hfs_relocate_callback(__unused HFSPlusExtentKey *key, HFSPlusExtentRecord *record, HFSPlusExtentRecord *state) -{ - bcopy(state, record, sizeof(HFSPlusExtentRecord)); - return 0; -} +/* Structure to store state of reclaiming extents from a + * given file. hfs_reclaim_file()/hfs_reclaim_xattr() + * initializes the values in this structure which are then + * used by code that reclaims and splits the extents. + */ +struct hfs_reclaim_extent_info { + struct vnode *vp; + u_int32_t fileID; + u_int8_t forkType; + u_int8_t is_dirlink; /* Extent belongs to directory hard link */ + u_int8_t is_sysfile; /* Extent belongs to system file */ + u_int8_t is_xattr; /* Extent belongs to extent-based xattr */ + u_int8_t extent_index; + int lockflags; /* Locks that reclaim and split code should grab before modifying the extent record */ + u_int32_t blocks_relocated; /* Total blocks relocated for this file till now */ + u_int32_t recStartBlock; /* File allocation block number (FABN) for current extent record */ + u_int32_t cur_blockCount; /* Number of allocation blocks that have been checked for reclaim */ + struct filefork *catalog_fp; /* If non-NULL, extent is from catalog record */ + union record { + HFSPlusExtentRecord overflow;/* Extent record from overflow extents btree */ + HFSPlusAttrRecord xattr; /* Attribute record for large EAs */ + } record; + HFSPlusExtentDescriptor *extents; /* Pointer to current extent record being processed. + * For catalog extent record, points to the correct + * extent information in filefork. For overflow extent + * record, or xattr record, points to extent record + * in the structure above + */ + struct cat_desc *dirlink_desc; + struct cat_attr *dirlink_attr; + struct filefork *dirlink_fork; /* For directory hard links, fp points actually to this */ + struct BTreeIterator *iterator; /* Shared read/write iterator, hfs_reclaim_file/xattr() + * use it for reading and hfs_reclaim_extent()/hfs_split_extent() + * use it for writing updated extent record + */ + struct FSBufferDescriptor btdata; /* Shared btdata for reading/writing extent record, same as iterator above */ + u_int16_t recordlen; + int overflow_count; /* For debugging, counter for overflow extent record */ + FCB *fcb; /* Pointer to the current btree being traversed */ +}; -/* - * Reclaim space at the end of a volume, used by a given file. +/* + * Split the current extent into two extents, with first extent + * to contain given number of allocation blocks. Splitting of + * extent creates one new extent entry which can result in + * shifting of many entries through all the extent records of a + * file, and/or creating a new extent record in the overflow + * extent btree. 
* - * This routine attempts to move any extent which contains allocation blocks - * at or after "startblk." A separate transaction is used to do the move. - * The contents of any moved extents are read and written via the volume's - * device vnode -- NOT via "vp." During the move, moved blocks which are part - * of a transaction have their physical block numbers invalidated so they will - * eventually be written to their new locations. + * Example: + * The diagram below represents two consecutive extent records, + * for simplicity, lets call them record X and X+1 respectively. + * Interesting extent entries have been denoted by letters. + * If the letter is unchanged before and after split, it means + * that the extent entry was not modified during the split. + * A '.' means that the entry remains unchanged after the split + * and is not relevant for our example. A '0' means that the + * extent entry is empty. * - * Inputs: - * hfsmp The volume being resized. - * startblk Blocks >= this allocation block need to be moved. - * locks Which locks need to be taken for the given system file. - * vp The vnode for the system file. + * If there isn't sufficient contiguous free space to relocate + * an extent (extent "C" below), we will have to break the one + * extent into multiple smaller extents, and relocate each of + * the smaller extents individually. The way we do this is by + * finding the largest contiguous free space that is currently + * available (N allocation blocks), and then convert extent "C" + * into two extents, C1 and C2, that occupy exactly the same + * allocation blocks as extent C. Extent C1 is the first + * N allocation blocks of extent C, and extent C2 is the remainder + * of extent C. Then we can relocate extent C1 since we know + * we have enough contiguous free space to relocate it in its + * entirety. We then repeat the process starting with extent C2. + * + * In record X, only the entries following entry C are shifted, and + * the original entry C is replaced with two entries C1 and C2 which + * are actually two extent entries for contiguous allocation blocks. + * + * Note that the entry E from record X is shifted into record X+1 as + * the new first entry. Since the first entry of record X+1 is updated, + * the FABN will also get updated with the blockCount of entry E. + * This also results in shifting of all extent entries in record X+1. + * Note that the number of empty entries after the split has been + * changed from 3 to 2. + * + * Before: + * record X record X+1 + * ---------------------===--------- --------------------------------- + * | A | . | . | . | B | C | D | E | | F | . | . | . | G | 0 | 0 | 0 | + * ---------------------===--------- --------------------------------- * - * The caller of this function, hfs_reclaimspace(), grabs cnode lock - * for non-system files before calling this function. + * After: + * ---------------------=======----- --------------------------------- + * | A | . | . | . | B | C1| C2| D | | E | F | . | . | . | G | 0 | 0 | + * ---------------------=======----- --------------------------------- * - * Outputs: - * blks_moved Total number of allocation blocks moved by this routine. + * C1.startBlock = C.startBlock + * C1.blockCount = N + * + * C2.startBlock = C.startBlock + N + * C2.blockCount = C.blockCount - N + * + * FABN = old FABN - E.blockCount + * + * Inputs: + * extent_info - This is the structure that contains state about + * the current file, extent, and extent record that + * is being relocated. 
This structure is shared + * among code that traverses through all the extents + * of the file, code that relocates extents, and + * code that splits the extent. + * Output: + * Zero on success, non-zero on failure. */ -static int -hfs_reclaim_file(struct hfsmount *hfsmp, struct vnode *vp, u_long startblk, int locks, u_int32_t *blks_moved, vfs_context_t context) +static int +hfs_split_extent(struct hfs_reclaim_extent_info *extent_info, uint32_t newBlockCount) { - int error; - int lockflags; + int error = 0; + int index = extent_info->extent_index; int i; - u_long datablks; - u_long end_block; - u_int32_t oldStartBlock; - u_int32_t newStartBlock; - u_int32_t oldBlockCount; - u_int32_t newBlockCount; - struct filefork *fp; - struct cnode *cp; - int is_sysfile; - int took_truncate_lock = 0; - struct BTreeIterator *iterator = NULL; - u_int8_t forktype; - u_int32_t fileID; - u_int32_t alloc_flags; - - /* If there is no vnode for this file, then there's nothing to do. */ - if (vp == NULL) - return 0; + HFSPlusExtentDescriptor shift_extent; + HFSPlusExtentDescriptor last_extent; + HFSPlusExtentDescriptor *extents; /* Pointer to current extent record being manipulated */ + HFSPlusExtentRecord *extents_rec = NULL; + HFSPlusExtentKey *extents_key = NULL; + HFSPlusAttrRecord *xattr_rec = NULL; + HFSPlusAttrKey *xattr_key = NULL; + struct BTreeIterator iterator; + struct FSBufferDescriptor btdata; + uint16_t reclen; + uint32_t read_recStartBlock; /* Starting allocation block number to read old extent record */ + uint32_t write_recStartBlock; /* Starting allocation block number to insert newly updated extent record */ + Boolean create_record = false; + Boolean is_xattr; + + is_xattr = extent_info->is_xattr; + extents = extent_info->extents; - cp = VTOC(vp); - fileID = cp->c_cnid; - is_sysfile = vnode_issystem(vp); - forktype = VNODE_IS_RSRC(vp) ? 0xFF : 0; + if (hfs_resize_debug) { + printf ("hfs_split_extent: Split record:%u recStartBlock=%u %u:(%u,%u) for %u blocks\n", extent_info->overflow_count, extent_info->recStartBlock, index, extents[index].startBlock, extents[index].blockCount, newBlockCount); + } - /* Flush all the buffer cache blocks and cluster pages associated with - * this vnode. - * - * If the current vnode is a system vnode, all the buffer cache blocks - * associated with it should already be sync'ed to the disk as part of - * journal flush in hfs_truncatefs(). Normally there should not be - * buffer cache blocks for regular files, but for objects like symlinks, - * we can have buffer cache blocks associated with the vnode. Therefore - * we call buf_flushdirtyblks() always. Resource fork data for directory - * hard links are directly written using buffer cache for device vnode, - * which should also be sync'ed as part of journal flush in hfs_truncatefs(). - * - * Flushing cluster pages should be the normal case for regular files, - * and really should not do anything for system files. But just to be - * sure that all blocks associated with this vnode is sync'ed to the - * disk, we call both buffer cache and cluster layer functions. + /* Determine the starting allocation block number for the following + * overflow extent record, if any, before the current record + * gets modified. */ - buf_flushdirtyblks(vp, MNT_NOWAIT, 0, "hfs_reclaim_file"); - - if (!is_sysfile) { - /* The caller grabs cnode lock for non-system files only, therefore - * we unlock only non-system files before calling cluster layer. 
- */ - hfs_unlock(cp); - hfs_lock_truncate(cp, TRUE); - took_truncate_lock = 1; + read_recStartBlock = extent_info->recStartBlock; + for (i = 0; i < kHFSPlusExtentDensity; i++) { + if (extents[i].blockCount == 0) { + break; + } + read_recStartBlock += extents[i].blockCount; } - (void) cluster_push(vp, 0); - if (!is_sysfile) { - error = hfs_lock(cp, HFS_FORCE_LOCK); - if (error) { - hfs_unlock_truncate(cp, TRUE); - return error; + + /* Shift and split */ + if (index == kHFSPlusExtentDensity-1) { + /* The new extent created after split will go into following overflow extent record */ + shift_extent.startBlock = extents[index].startBlock + newBlockCount; + shift_extent.blockCount = extents[index].blockCount - newBlockCount; + + /* Last extent in the record will be split, so nothing to shift */ + } else { + /* Splitting of extents can result in at most of one + * extent entry to be shifted into following overflow extent + * record. So, store the last extent entry for later. + */ + shift_extent = extents[kHFSPlusExtentDensity-1]; + + /* Start shifting extent information from the end of the extent + * record to the index where we want to insert the new extent. + * Note that kHFSPlusExtentDensity-1 is already saved above, and + * does not need to be shifted. The extent entry that is being + * split does not get shifted. + */ + for (i = kHFSPlusExtentDensity-2; i > index; i--) { + if (hfs_resize_debug) { + if (extents[i].blockCount) { + printf ("hfs_split_extent: Shift %u:(%u,%u) to %u:(%u,%u)\n", i, extents[i].startBlock, extents[i].blockCount, i+1, extents[i].startBlock, extents[i].blockCount); + } + } + extents[i+1] = extents[i]; } + } - /* If the file no longer exists, nothing left to do */ - if (cp->c_flag & C_NOEXISTS) { - hfs_unlock_truncate(cp, TRUE); - return 0; + if (index == kHFSPlusExtentDensity-1) { + /* The second half of the extent being split will be the overflow + * entry that will go into following overflow extent record. The + * value has been stored in 'shift_extent' above, so there is + * nothing to be done here. + */ + } else { + /* Update the values in the second half of the extent being split + * before updating the first half of the split. Note that the + * extent to split or first half of the split is at index 'index' + * and a new extent or second half of the split will be inserted at + * 'index+1' or into following overflow extent record. + */ + extents[index+1].startBlock = extents[index].startBlock + newBlockCount; + extents[index+1].blockCount = extents[index].blockCount - newBlockCount; + } + /* Update the extent being split, only the block count will change */ + extents[index].blockCount = newBlockCount; + + if (hfs_resize_debug) { + printf ("hfs_split_extent: Split %u:(%u,%u) and ", index, extents[index].startBlock, extents[index].blockCount); + if (index != kHFSPlusExtentDensity-1) { + printf ("%u:(%u,%u)\n", index+1, extents[index+1].startBlock, extents[index+1].blockCount); + } else { + printf ("overflow:(%u,%u)\n", shift_extent.startBlock, shift_extent.blockCount); } } - /* Wait for any in-progress writes to this vnode to complete, so that we'll - * be copying consistent bits. (Otherwise, it's possible that an async - * write will complete to the old extent after we read from it. That - * could lead to corruption.) 
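The in-place split performed above is pure arithmetic on a pair of extent descriptors: the first half keeps the original start and the requested length, and the second half begins where the first ends. Isolated, with the descriptor reduced to its two fields (the real type is HFSPlusExtentDescriptor from hfs_format.h):

#include <stdint.h>

typedef struct {
    uint32_t startBlock;
    uint32_t blockCount;
} ExtentDesc;

/* Split extent c into c1 (first n blocks) and c2 (the remainder); together
 * they cover exactly the allocation blocks of c, as in the C1/C2 equations
 * in the hfs_split_extent comment. Assumes 0 < n < c->blockCount. */
void split_extent(const ExtentDesc *c, uint32_t n,
                  ExtentDesc *c1, ExtentDesc *c2)
{
    c1->startBlock = c->startBlock;
    c1->blockCount = n;
    c2->startBlock = c->startBlock + n;
    c2->blockCount = c->blockCount - n;
}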
+ /* If the newly split extent is for large EAs or in overflow extent + * record, so update it directly in the btree using the iterator + * information from the shared extent_info structure */ - error = vnode_waitforwrites(vp, 0, 0, 0, "hfs_reclaim_file"); - if (error) { - printf("hfs_reclaim_file: Error %d from vnode_waitforwrites\n", error); - return error; + if (extent_info->catalog_fp == NULL) { + error = BTReplaceRecord(extent_info->fcb, extent_info->iterator, + &(extent_info->btdata), extent_info->recordlen); + if (error) { + printf ("hfs_split_extent: fileID=%u BTReplaceRecord returned error=%d\n", extent_info->fileID, error); + goto out; + } } - - if (hfs_resize_debug) { - printf("hfs_reclaim_file: Start relocating %sfork for fileid=%u name=%.*s\n", (forktype ? "rsrc" : "data"), fileID, cp->c_desc.cd_namelen, cp->c_desc.cd_nameptr); + + /* No extent entry to be shifted into another extent overflow record */ + if (shift_extent.blockCount == 0) { + if (hfs_resize_debug) { + printf ("hfs_split_extent: No extent entry to be shifted into overflow records\n"); + } + error = 0; + goto out; } - /* We always need the allocation bitmap and extents B-tree */ - locks |= SFL_BITMAP | SFL_EXTENTS; + /* The overflow extent entry has to be shifted into an extent + * overflow record. This would mean that we have to shift + * extent entries from all overflow records by one. We will + * start iteration from the first record to the last record, + * and shift the extent entry from one record to another. + * We might have to create a new record for the last extent + * entry for the file. + */ - error = hfs_start_transaction(hfsmp); - if (error) { - printf("hfs_reclaim_file: hfs_start_transaction returned %d\n", error); - if (took_truncate_lock) { - hfs_unlock_truncate(cp, TRUE); + /* Initialize iterator to search the next record */ + bzero(&iterator, sizeof(iterator)); + if (is_xattr) { + /* Copy the key from the iterator that was to update the modified attribute record. 
*/ + xattr_key = (HFSPlusAttrKey *)&(iterator.key); + bcopy((HFSPlusAttrKey *)&(extent_info->iterator->key), xattr_key, sizeof(HFSPlusAttrKey)); + /* Note: xattr_key->startBlock will be initialized later in the iteration loop */ + + MALLOC(xattr_rec, HFSPlusAttrRecord *, + sizeof(HFSPlusAttrRecord), M_TEMP, M_WAITOK); + if (xattr_rec == NULL) { + error = ENOMEM; + goto out; } - return error; + btdata.bufferAddress = xattr_rec; + btdata.itemSize = sizeof(HFSPlusAttrRecord); + btdata.itemCount = 1; + extents = xattr_rec->overflowExtents.extents; + } else { + extents_key = (HFSPlusExtentKey *) &(iterator.key); + extents_key->keyLength = kHFSPlusExtentKeyMaximumLength; + extents_key->forkType = extent_info->forkType; + extents_key->fileID = extent_info->fileID; + /* Note: extents_key->startBlock will be initialized later in the iteration loop */ + + MALLOC(extents_rec, HFSPlusExtentRecord *, + sizeof(HFSPlusExtentRecord), M_TEMP, M_WAITOK); + if (extents_rec == NULL) { + error = ENOMEM; + goto out; + } + btdata.bufferAddress = extents_rec; + btdata.itemSize = sizeof(HFSPlusExtentRecord); + btdata.itemCount = 1; + extents = extents_rec[0]; } - lockflags = hfs_systemfile_lock(hfsmp, locks, HFS_EXCLUSIVE_LOCK); - fp = VTOF(vp); - datablks = 0; - *blks_moved = 0; - /* Relocate non-overflow extents */ - for (i = 0; i < kHFSPlusExtentDensity; ++i) { - if (fp->ff_extents[i].blockCount == 0) - break; - oldStartBlock = fp->ff_extents[i].startBlock; - oldBlockCount = fp->ff_extents[i].blockCount; - datablks += oldBlockCount; - end_block = oldStartBlock + oldBlockCount; - /* Check if the file overlaps the target space */ - if (end_block > startblk) { - alloc_flags = HFS_ALLOC_FORCECONTIG | HFS_ALLOC_SKIPFREEBLKS; - if (is_sysfile) { - alloc_flags |= HFS_ALLOC_METAZONE; + /* An extent entry still needs to be shifted into following overflow + * extent record. This will result in the starting allocation block + * number of the extent record being changed which is part of the key + * for the extent record. Since the extent record key is changing, + * the record can not be updated, instead has to be deleted and + * inserted again. + */ + while (shift_extent.blockCount) { + if (hfs_resize_debug) { + printf ("hfs_split_extent: Will shift (%u,%u) into record with startBlock=%u\n", shift_extent.startBlock, shift_extent.blockCount, read_recStartBlock); + } + + /* Search if there is any existing overflow extent record. + * For this, the logical start block number in the key is + * the value calculated based on the logical start block + * number of the current extent record and the total number + * of blocks existing in the current extent record. 
+ */ + if (is_xattr) { + xattr_key->startBlock = read_recStartBlock; + } else { + extents_key->startBlock = read_recStartBlock; + } + error = BTSearchRecord(extent_info->fcb, &iterator, &btdata, &reclen, &iterator); + if (error) { + if (error != btNotFound) { + printf ("hfs_split_extent: fileID=%u startBlock=%u BTSearchRecord error=%d\n", extent_info->fileID, read_recStartBlock, error); + goto out; } - error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, alloc_flags, &newStartBlock, &newBlockCount); - if (error) { - if (!is_sysfile && ((error == dskFulErr) || (error == ENOSPC))) { - /* Try allocating again using the metadata zone */ - alloc_flags |= HFS_ALLOC_METAZONE; - error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, alloc_flags, &newStartBlock, &newBlockCount); - } - if (error) { - printf("hfs_reclaim_file: BlockAllocate(metazone) (error=%d) for fileID=%u %u:(%u,%u)\n", error, fileID, i, oldStartBlock, oldBlockCount); - goto fail; - } else { - if (hfs_resize_debug) { - printf("hfs_reclaim_file: BlockAllocate(metazone) success for fileID=%u %u:(%u,%u)\n", fileID, i, newStartBlock, newBlockCount); - } - } + create_record = true; + } + + /* The extra extent entry from the previous record is being inserted + * as the first entry in the current extent record. This will change + * the file allocation block number (FABN) of the current extent + * record, which is the startBlock value from the extent record key. + * Since one extra entry is being inserted in the record, the new + * FABN for the record will less than old FABN by the number of blocks + * in the new extent entry being inserted at the start. We have to + * do this before we update read_recStartBlock to point at the + * startBlock of the following record. + */ + write_recStartBlock = read_recStartBlock - shift_extent.blockCount; + if (hfs_resize_debug) { + if (create_record) { + printf ("hfs_split_extent: No records found for startBlock=%u, will create new with startBlock=%u\n", read_recStartBlock, write_recStartBlock); } + } - /* Copy data from old location to new location */ - error = hfs_copy_extent(hfsmp, vp, oldStartBlock, newStartBlock, newBlockCount, context); - if (error) { - printf("hfs_reclaim_file: hfs_copy_extent error=%d for fileID=%u %u:(%u,%u) to %u:(%u,%u)\n", error, fileID, i, oldStartBlock, oldBlockCount, i, newStartBlock, newBlockCount); - if (BlockDeallocate(hfsmp, newStartBlock, newBlockCount, HFS_ALLOC_SKIPFREEBLKS)) { - hfs_mark_volume_inconsistent(hfsmp); - } - goto fail; + /* Now update the read_recStartBlock to account for total number + * of blocks in this extent record. It will now point to the + * starting allocation block number for the next extent record. 
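A record's key startBlock is its file allocation block number (FABN), and the next record's FABN is simply this one's FABN plus every block the record maps, which is what the summation loop that follows computes. In isolation (descriptor reduced to its two fields; kHFSPlusExtentDensity is the real constant, 8):

#include <stdint.h>

#define kHFSPlusExtentDensity 8

typedef struct {
    uint32_t startBlock;
    uint32_t blockCount;
} ExtentDesc;

/* FABN of the record following this one: current FABN plus all blocks the
 * record maps. A zero blockCount ends the record early, as in the loop above. */
uint32_t next_record_fabn(uint32_t fabn,
                          const ExtentDesc rec[kHFSPlusExtentDensity])
{
    int i;

    for (i = 0; i < kHFSPlusExtentDensity; i++) {
        if (rec[i].blockCount == 0)
            break;
        fabn += rec[i].blockCount;
    }
    return fabn;
}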
+ */ + for (i = 0; i < kHFSPlusExtentDensity; i++) { + if (extents[i].blockCount == 0) { + break; } - fp->ff_extents[i].startBlock = newStartBlock; - cp->c_flag |= C_MODIFIED; - *blks_moved += newBlockCount; + read_recStartBlock += extents[i].blockCount; + } - /* Deallocate the old extent */ - error = BlockDeallocate(hfsmp, oldStartBlock, oldBlockCount, HFS_ALLOC_SKIPFREEBLKS); - if (error) { - printf("hfs_reclaim_file: BlockDeallocate returned %d\n", error); - hfs_mark_volume_inconsistent(hfsmp); - goto fail; + if (create_record == true) { + /* Initialize new record content with only one extent entry */ + bzero(extents, sizeof(HFSPlusExtentRecord)); + /* The new record will contain only one extent entry */ + extents[0] = shift_extent; + /* There are no more overflow extents to be shifted */ + shift_extent.startBlock = shift_extent.blockCount = 0; + + if (is_xattr) { + xattr_rec->recordType = kHFSPlusAttrExtents; + xattr_rec->overflowExtents.reserved = 0; + reclen = sizeof(HFSPlusAttrExtents); + } else { + reclen = sizeof(HFSPlusExtentRecord); } - - /* If this is a system file, sync the volume header on disk */ - if (is_sysfile) { - error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH); - if (error) { - printf("hfs_reclaim_file: hfs_flushvolumeheader returned %d\n", error); - hfs_mark_volume_inconsistent(hfsmp); - goto fail; - } + } else { + /* The overflow extent entry from previous record will be + * the first entry in this extent record. If the last + * extent entry in this record is valid, it will be shifted + * into the following extent record as its first entry. So + * save the last entry before shifting entries in current + * record. + */ + last_extent = extents[kHFSPlusExtentDensity-1]; + + /* Shift all entries by one index towards the end */ + for (i = kHFSPlusExtentDensity-2; i >= 0; i--) { + extents[i+1] = extents[i]; } + /* Overflow extent entry saved from previous record + * is now the first entry in the current record. + */ + extents[0] = shift_extent; + if (hfs_resize_debug) { - printf ("hfs_reclaim_file: Relocated %u:(%u,%u) to %u:(%u,%u)\n", i, oldStartBlock, oldBlockCount, i, newStartBlock, newBlockCount); + printf ("hfs_split_extent: Shift overflow=(%u,%u) to record with updated startBlock=%u\n", shift_extent.startBlock, shift_extent.blockCount, write_recStartBlock); } - } - } - - /* Relocate overflow extents (if any) */ - if (i == kHFSPlusExtentDensity && fp->ff_blocks > datablks) { - struct FSBufferDescriptor btdata; - HFSPlusExtentRecord record; - HFSPlusExtentKey *key; - FCB *fcb; - int overflow_count = 0; - - if (kmem_alloc(kernel_map, (vm_offset_t*) &iterator, sizeof(*iterator))) { - printf("hfs_reclaim_file: kmem_alloc failed!\n"); - error = ENOMEM; - goto fail; - } - bzero(iterator, sizeof(*iterator)); - key = (HFSPlusExtentKey *) &iterator->key; - key->keyLength = kHFSPlusExtentKeyMaximumLength; - key->forkType = forktype; - key->fileID = fileID; - key->startBlock = datablks; - - btdata.bufferAddress = &record; - btdata.itemSize = sizeof(record); - btdata.itemCount = 1; - - fcb = VTOF(hfsmp->hfs_extents_vp); + /* The last entry from current record will be the + * overflow entry which will be the first entry for + * the following extent record. + */ + shift_extent = last_extent; - error = BTSearchRecord(fcb, iterator, &btdata, NULL, iterator); - while (error == 0) { - /* Stop when we encounter a different file or fork. 
*/ - if ((key->fileID != fileID) || - (key->forkType != forktype)) { - break; + /* Since the key->startBlock is being changed for this record, + * it should be deleted and inserted with the new key. + */ + error = BTDeleteRecord(extent_info->fcb, &iterator); + if (error) { + printf ("hfs_split_extent: fileID=%u startBlock=%u BTDeleteRecord error=%d\n", extent_info->fileID, read_recStartBlock, error); + goto out; } - - /* Just track the overflow extent record number for debugging... */ if (hfs_resize_debug) { - overflow_count++; + printf ("hfs_split_extent: Deleted record with startBlock=%u\n", (is_xattr ? xattr_key->startBlock : extents_key->startBlock)); } + } - /* - * Check if the file overlaps target space. - */ - for (i = 0; i < kHFSPlusExtentDensity; ++i) { - if (record[i].blockCount == 0) { - goto fail; - } - oldStartBlock = record[i].startBlock; - oldBlockCount = record[i].blockCount; - end_block = oldStartBlock + oldBlockCount; - if (end_block > startblk) { - alloc_flags = HFS_ALLOC_FORCECONTIG | HFS_ALLOC_SKIPFREEBLKS; - if (is_sysfile) { - alloc_flags |= HFS_ALLOC_METAZONE; - } - error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, alloc_flags, &newStartBlock, &newBlockCount); - if (error) { - if (!is_sysfile && ((error == dskFulErr) || (error == ENOSPC))) { - /* Try allocating again using the metadata zone */ - alloc_flags |= HFS_ALLOC_METAZONE; - error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, alloc_flags, &newStartBlock, &newBlockCount); - } - if (error) { - printf("hfs_reclaim_file: BlockAllocate(metazone) (error=%d) for fileID=%u %u:(%u,%u)\n", error, fileID, i, oldStartBlock, oldBlockCount); - goto fail; - } else { - if (hfs_resize_debug) { - printf("hfs_reclaim_file: BlockAllocate(metazone) success for fileID=%u %u:(%u,%u)\n", fileID, i, newStartBlock, newBlockCount); - } - } - } - error = hfs_copy_extent(hfsmp, vp, oldStartBlock, newStartBlock, newBlockCount, context); - if (error) { - printf("hfs_reclaim_file: hfs_copy_extent error=%d for fileID=%u (%u,%u) to (%u,%u)\n", error, fileID, oldStartBlock, oldBlockCount, newStartBlock, newBlockCount); - if (BlockDeallocate(hfsmp, newStartBlock, newBlockCount, HFS_ALLOC_SKIPFREEBLKS)) { - hfs_mark_volume_inconsistent(hfsmp); - } - goto fail; - } - record[i].startBlock = newStartBlock; - cp->c_flag |= C_MODIFIED; - *blks_moved += newBlockCount; - - /* - * NOTE: To support relocating overflow extents of the - * allocation file, we must update the BTree record BEFORE - * deallocating the old extent so that BlockDeallocate will - * use the extent's new location to calculate physical block - * numbers. (This is for the case where the old extent's - * bitmap bits actually reside in the extent being moved.) - */ - error = BTUpdateRecord(fcb, iterator, (IterateCallBackProcPtr) hfs_relocate_callback, &record); - if (error) { - printf("hfs_reclaim_file: BTUpdateRecord returned %d\n", error); - hfs_mark_volume_inconsistent(hfsmp); - goto fail; - } - error = BlockDeallocate(hfsmp, oldStartBlock, oldBlockCount, HFS_ALLOC_SKIPFREEBLKS); - if (error) { - printf("hfs_reclaim_file: BlockDeallocate returned %d\n", error); - hfs_mark_volume_inconsistent(hfsmp); - goto fail; - } - if (hfs_resize_debug) { - printf ("hfs_reclaim_file: Relocated overflow#%d %u:(%u,%u) to %u:(%u,%u)\n", overflow_count, i, oldStartBlock, oldBlockCount, i, newStartBlock, newBlockCount); - } - } - } - /* Look for more records. 
*/
-		error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL);
-		if (error == btNotFound) {
-			error = 0;
-			break;
+		/* Insert the newly created or modified extent record */
+		bzero(&iterator.hint, sizeof(iterator.hint));
+		if (is_xattr) {
+			xattr_key->startBlock = write_recStartBlock;
+		} else {
+			extents_key->startBlock = write_recStartBlock;
+		}
+		error = BTInsertRecord(extent_info->fcb, &iterator, &btdata, reclen);
+		if (error) {
+			printf ("hfs_split_extent: fileID=%u, startBlock=%u BTInsertRecord error=%d\n", extent_info->fileID, write_recStartBlock, error);
+			goto out;
+		}
+		if (hfs_resize_debug) {
+			printf ("hfs_split_extent: Inserted extent record with startBlock=%u\n", write_recStartBlock);
+		}
+	}
+	BTFlushPath(extent_info->fcb);
+out:
+	if (extents_rec) {
+		FREE (extents_rec, M_TEMP);
+	}
+	if (xattr_rec) {
+		FREE (xattr_rec, M_TEMP);
+	}
+	return error;
+}
+
+
+/*
+ * Relocate an extent if it lies beyond the expected end of volume.
+ *
+ * This function is called for every extent of the file being relocated.
+ * It allocates space for relocation, copies the data, deallocates the
+ * old extent, and updates the corresponding on-disk extent.  If the
+ * function does not find contiguous space to relocate an extent, it
+ * splits the extent into smaller pieces so that it can be relocated out
+ * of the area of disk being reclaimed.  As an optimization, if an extent
+ * lies partially in the area of the disk being reclaimed, it is split so
+ * that only the overlapping portion has to be relocated.
+ *
+ * Note that every extent is relocated in its own transaction so that the
+ * transactions do not overwhelm the journal.  This function handles extent
+ * records that live in the catalog record, in the overflow extents btree,
+ * and in large EAs.
+ *
+ * Inputs:
+ *	extent_info - This is the structure that contains state about
+ *	              the current file, extent, and extent record that
+ *	              is being relocated.  This structure is shared
+ *	              among code that traverses through all the extents
+ *	              of the file, code that relocates extents, and
+ *	              code that splits the extent.
+ */
+static int
+hfs_reclaim_extent(struct hfsmount *hfsmp, const u_long allocLimit, struct hfs_reclaim_extent_info *extent_info, vfs_context_t context)
+{
+	int error = 0;
+	int index;
+	struct cnode *cp;
+	u_int32_t oldStartBlock;
+	u_int32_t oldBlockCount;
+	u_int32_t newStartBlock;
+	u_int32_t newBlockCount;
+	u_int32_t alloc_flags;
+	int blocks_allocated = false;
+
+	index = extent_info->extent_index;
+	cp = VTOC(extent_info->vp);
+
+	oldStartBlock = extent_info->extents[index].startBlock;
+	oldBlockCount = extent_info->extents[index].blockCount;
+
+	if (0 && hfs_resize_debug) {
+		printf ("hfs_reclaim_extent: Examine record:%u recStartBlock=%u, %u:(%u,%u)\n", extent_info->overflow_count, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount);
+	}
+
+	/* Check if the current extent lies completely within allocLimit */
+	if ((oldStartBlock + oldBlockCount) <= allocLimit) {
+		extent_info->cur_blockCount += oldBlockCount;
+		return error;
+	}
+
+	/* Every extent should be relocated in its own transaction
+	 * to make sure that we don't overflow the journal buffer.
+	 */
+	error = hfs_start_transaction(hfsmp);
+	if (error) {
+		return error;
+	}
+	extent_info->lockflags = hfs_systemfile_lock(hfsmp, extent_info->lockflags, HFS_EXCLUSIVE_LOCK);
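Before the overlap handling below, the journaling shape is worth seeing in isolation. The following is a condensed sketch of the transaction-per-extent discipline this function follows; relocate_extent_body() is a hypothetical stand-in for the allocate/copy/update/deallocate work and is not part of the patch:

	/*
	 * Sketch only: one journal transaction per relocated extent, so a
	 * single hfs_truncatefs() call never builds a transaction larger
	 * than the journal buffer.  Error handling is simplified.
	 */
	static int relocate_extent_body(struct hfsmount *hfsmp,
			struct hfs_reclaim_extent_info *extent_info);	/* hypothetical */

	static int
	relocate_one_extent(struct hfsmount *hfsmp,
			struct hfs_reclaim_extent_info *extent_info)
	{
		int error = hfs_start_transaction(hfsmp);
		if (error) {
			return error;
		}
		extent_info->lockflags = hfs_systemfile_lock(hfsmp,
				extent_info->lockflags, HFS_EXCLUSIVE_LOCK);

		error = relocate_extent_body(hfsmp, extent_info);

		hfs_systemfile_unlock(hfsmp, extent_info->lockflags);
		hfs_end_transaction(hfsmp);
		return error;
	}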
+
+	/* Check if the extent lies partially in the area to reclaim,
+	 * i.e. it starts before allocLimit and ends beyond allocLimit.
+	 * We have already skipped extents that lie completely within
+	 * allocLimit in the check above, so we only check for the
+	 * startBlock.  If it lies partially, split it so that we
+	 * only relocate part of the extent.
+	 */
+	if (oldStartBlock < allocLimit) {
+		newBlockCount = allocLimit - oldStartBlock;
+		error = hfs_split_extent(extent_info, newBlockCount);
+		if (error == 0) {
+			/* After a successful split, the current extent does not
+			 * need relocation, so just return.
+			 */
+			goto out;
+		}
+		/* Ignore error and try relocating the entire extent instead */
+	}
+
+	alloc_flags = HFS_ALLOC_FORCECONTIG | HFS_ALLOC_SKIPFREEBLKS;
+	if (extent_info->is_sysfile) {
+		alloc_flags |= HFS_ALLOC_METAZONE;
+	}
+
+	error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, alloc_flags,
+			&newStartBlock, &newBlockCount);
+	if ((extent_info->is_sysfile == false) &&
+	    ((error == dskFulErr) || (error == ENOSPC))) {
+		/* For non-system files, try reallocating space in metadata zone */
+		alloc_flags |= HFS_ALLOC_METAZONE;
+		error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount,
+				alloc_flags, &newStartBlock, &newBlockCount);
+	}
+	if ((error == dskFulErr) || (error == ENOSPC)) {
+		/* We did not find the desired contiguous space for this extent.
+		 * So try to allocate the maximum contiguous space available.
+		 */
+		alloc_flags &= ~HFS_ALLOC_FORCECONTIG;
+
+		error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount,
+				alloc_flags, &newStartBlock, &newBlockCount);
+		if (error) {
+			printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) BlockAllocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error);
+			goto out;
+		}
+		blocks_allocated = true;
+
+		error = hfs_split_extent(extent_info, newBlockCount);
+		if (error) {
+			printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) split error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error);
+			goto out;
+		}
+		oldBlockCount = newBlockCount;
+	}
+	if (error) {
+		printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) contig BlockAllocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error);
+		goto out;
+	}
+	blocks_allocated = true;
+
+	/* Copy data from old location to new location */
+	error = hfs_copy_extent(hfsmp, extent_info->vp, oldStartBlock,
+			newStartBlock, newBlockCount, context);
+	if (error) {
+		printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u)=>(%u,%u) hfs_copy_extent error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, newStartBlock, newBlockCount, error);
+		goto out;
+	}
+
+	/* Update the extent record with the new start block information */
+	extent_info->extents[index].startBlock = newStartBlock;
+
+	/* Sync the content back to the disk */
+	if (extent_info->catalog_fp) {
+		/* Update the extents in catalog record */
+		if (extent_info->is_dirlink) {
+			error = cat_update_dirlink(hfsmp, extent_info->forkType,
+					extent_info->dirlink_desc, extent_info->dirlink_attr,
+					&(extent_info->dirlink_fork->ff_data));
+		} else {
+			cp->c_flag |= C_MODIFIED;
+			/* If this is a system file, sync volume headers on disk */
+			if (extent_info->is_sysfile) {
+				error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
 			}
 		}
+	} else {
+		/* Replace record for extents overflow or extents-based xattrs */
+		error = BTReplaceRecord(extent_info->fcb, extent_info->iterator,
+				&(extent_info->btdata), extent_info->recordlen);
 	}
-
-fail:
-	if (iterator) {
-		kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));
+	if (error) {
+		printf ("hfs_reclaim_extent: fileID=%u, update record error=%d\n", extent_info->fileID, error);
+		goto out;
+	}
+
+	/* Deallocate the old extent */
+	error = BlockDeallocate(hfsmp, oldStartBlock, oldBlockCount, HFS_ALLOC_SKIPFREEBLKS);
+	if (error) {
+		printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) BlockDeallocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error);
+		goto out;
 	}
+	extent_info->blocks_relocated += newBlockCount;
 
-	(void) hfs_systemfile_unlock(hfsmp, lockflags);
+	if (hfs_resize_debug) {
+		printf ("hfs_reclaim_extent: Relocated record:%u %u:(%u,%u) to (%u,%u)\n", extent_info->overflow_count, index, oldStartBlock, oldBlockCount, newStartBlock, newBlockCount);
+	}
 
-	if ((*blks_moved != 0) && (is_sysfile == false)) {
-		(void) hfs_update(vp, MNT_WAIT);
+out:
+	if (error != 0) {
+		if (blocks_allocated == true) {
+			BlockDeallocate(hfsmp, newStartBlock, newBlockCount, HFS_ALLOC_SKIPFREEBLKS);
+		}
+	} else {
+		/* On success, increment the total allocation blocks processed */
+		extent_info->cur_blockCount += newBlockCount;
 	}
-	(void) hfs_end_transaction(hfsmp);
+	hfs_systemfile_unlock(hfsmp, extent_info->lockflags);
 
-	if (took_truncate_lock) {
-		hfs_unlock_truncate(cp, TRUE);
+	/* For a non-system file, if an extent entry from the catalog record
+	 * was modified, sync the in-memory changes to the catalog record
+	 * on disk before ending the transaction.
+	 */
+	if ((error == 0) &&
+	    (extent_info->overflow_count < kHFSPlusExtentDensity) &&
+	    (extent_info->is_sysfile == false)) {
+		(void) hfs_update(extent_info->vp, MNT_WAIT);
+	}
+
+	hfs_end_transaction(hfsmp);
+
+	return error;
+}
+
+/* Report intermediate progress during volume resize */
+static void
+hfs_truncatefs_progress(struct hfsmount *hfsmp)
+{
+	u_int32_t cur_progress;
+
+	hfs_resize_progress(hfsmp, &cur_progress);
+	if (cur_progress > (hfsmp->hfs_resize_progress + 9)) {
+		printf("hfs_truncatefs: %u%% done...\n", cur_progress);
+		hfsmp->hfs_resize_progress = cur_progress;
+	}
+	return;
+}
+
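The throttling above relies on the percentage computed by hfs_resize_progress(), which appears near the end of this patch. A standalone sketch of that computation, assuming only the two counters that this patch maintains on the hfsmount; the 64-bit multiply avoids overflow when the block count is large:

	/*
	 * Sketch only: derive a 0-100 progress value from the counters
	 * hfs_resize_blocksmoved and hfs_resize_totalblocks.  Mirrors the
	 * arithmetic in hfs_resize_progress() later in this patch.
	 */
	static u_int32_t
	resize_progress_percent(u_int32_t blocksmoved, u_int32_t totalblocks)
	{
		if (totalblocks == 0) {
			return 0;	/* nothing to do; report no progress */
		}
		return (u_int32_t)((blocksmoved * 100ULL) / totalblocks);
	}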
+/*
+ * Reclaim space at the end of a volume for given file and forktype.
+ *
+ * This routine attempts to move any extent which contains allocation blocks
+ * at or after "allocLimit."  A separate transaction is used for every extent
+ * that needs to be moved.  If there is not contiguous space available for
+ * moving an extent, it can be split into smaller extents.  The contents of
+ * any moved extents are read and written via the volume's device vnode --
+ * NOT via "vp."  During the move, moved blocks which are part of a transaction
+ * have their physical block numbers invalidated so they will eventually be
+ * written to their new locations.
+ *
+ * This function is also called for directory hard links.  Directory hard links
+ * are regular files with no data fork and a resource fork that contains alias
+ * information for backward compatibility with pre-Leopard systems.  However,
+ * non-Mac OS X implementations can add or modify data fork or resource fork
+ * information in directory hard links, so we check, and if required, relocate
+ * both the data fork and the resource fork.
+ *
+ * Inputs:
+ *	hfsmp       The volume being resized.
+ *	vp          The vnode for the system file.
+ *	fileID      ID of the catalog record that needs to be relocated
+ *	forktype    The type of fork that needs to be relocated,
+ *	            kHFSResourceForkType for resource fork,
+ *	            kHFSDataForkType for data fork
+ *	allocLimit  Allocation limit for the new volume size;
+ *	            do not use this block or beyond.  All extents
+ *	            that use this block or any blocks beyond this limit
+ *	            will be relocated.
+ *
+ * Side Effects:
+ * hfsmp->hfs_resize_blocksmoved is incremented by the number of allocation
+ * blocks that were relocated.
+ */
+static int
+hfs_reclaim_file(struct hfsmount *hfsmp, struct vnode *vp, u_int32_t fileID,
+		u_int8_t forktype, u_long allocLimit, vfs_context_t context)
+{
+	int error = 0;
+	struct hfs_reclaim_extent_info *extent_info;
+	int i;
+	int lockflags = 0;
+	struct cnode *cp;
+	struct filefork *fp;
+	int took_truncate_lock = false;
+	int release_desc = false;
+	HFSPlusExtentKey *key;
+
+	/* If there is no vnode for this file, then there's nothing to do. */
+	if (vp == NULL) {
+		return 0;
+	}
+
+	cp = VTOC(vp);
+
+	MALLOC(extent_info, struct hfs_reclaim_extent_info *,
+	       sizeof(struct hfs_reclaim_extent_info), M_TEMP, M_WAITOK);
+	if (extent_info == NULL) {
+		return ENOMEM;
+	}
+	bzero(extent_info, sizeof(struct hfs_reclaim_extent_info));
+	extent_info->vp = vp;
+	extent_info->fileID = fileID;
+	extent_info->forkType = forktype;
+	extent_info->is_sysfile = vnode_issystem(vp);
+	if (vnode_isdir(vp) && (cp->c_flag & C_HARDLINK)) {
+		extent_info->is_dirlink = true;
+	}
+	/* We always need the allocation bitmap and extents btree locks */
+	lockflags = SFL_BITMAP | SFL_EXTENTS;
+	if ((fileID == kHFSCatalogFileID) || (extent_info->is_dirlink == true)) {
+		lockflags |= SFL_CATALOG;
+	} else if (fileID == kHFSAttributesFileID) {
+		lockflags |= SFL_ATTRIBUTE;
+	} else if (fileID == kHFSStartupFileID) {
+		lockflags |= SFL_STARTUP;
+	}
+	extent_info->lockflags = lockflags;
+	extent_info->fcb = VTOF(hfsmp->hfs_extents_vp);
+
+	/* Flush data associated with the current file on disk.
+	 *
+	 * If the current vnode is a directory hard link, no flushing of
+	 * journal or vnode is required.  The current kernel does not
+	 * modify data/resource fork of directory hard links, so nothing
+	 * will be in the cache.  If a directory hard link is newly created,
+	 * the resource fork data is written directly using devvp and
+	 * the code that actually relocates data (hfs_copy_extent()) also
+	 * uses devvp for its I/O --- so they will see a consistent copy.
+	 */
+	if (extent_info->is_sysfile) {
+		/* If the current vnode is a system vnode, flush the journal
+		 * to make sure that all data is written to the disk.
+		 */
+		error = hfs_journal_flush(hfsmp, TRUE);
+		if (error) {
+			printf ("hfs_reclaim_file: journal_flush returned %d\n", error);
+			goto out;
+		}
+	} else if (extent_info->is_dirlink == false) {
+		/* Flush all blocks associated with this regular file vnode.
+		 * Normally there should not be buffer cache blocks for regular
+		 * files, but for objects like symlinks, we can have buffer cache
+		 * blocks associated with the vnode.  Therefore we call
+		 * buf_flushdirtyblks() also.
+	 */
+	buf_flushdirtyblks(vp, 0, BUF_SKIP_LOCKED, "hfs_reclaim_file");
+
+	hfs_unlock(cp);
+	hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK);
+	took_truncate_lock = true;
+	(void) cluster_push(vp, 0);
+	error = hfs_lock(cp, HFS_FORCE_LOCK);
+	if (error) {
+		goto out;
+	}
+
+	/* If the file no longer exists, nothing left to do */
+	if (cp->c_flag & C_NOEXISTS) {
+		error = 0;
+		goto out;
+	}
+
+	/* Wait for any in-progress writes to this vnode to complete, so that we'll
+	 * be copying consistent bits.  (Otherwise, it's possible that an async
+	 * write will complete to the old extent after we read from it.  That
+	 * could lead to corruption.)
+	 */
+	error = vnode_waitforwrites(vp, 0, 0, 0, "hfs_reclaim_file");
+	if (error) {
+		goto out;
+	}
+	}
+
+	if (hfs_resize_debug) {
+		printf("hfs_reclaim_file: === Start reclaiming %sfork for %sid=%u ===\n", (forktype ? "rsrc" : "data"), (extent_info->is_dirlink ? "dirlink" : "file"), fileID);
+	}
+
+	if (extent_info->is_dirlink) {
+		MALLOC(extent_info->dirlink_desc, struct cat_desc *,
+				sizeof(struct cat_desc), M_TEMP, M_WAITOK);
+		MALLOC(extent_info->dirlink_attr, struct cat_attr *,
+				sizeof(struct cat_attr), M_TEMP, M_WAITOK);
+		MALLOC(extent_info->dirlink_fork, struct filefork *,
+				sizeof(struct filefork), M_TEMP, M_WAITOK);
+		if ((extent_info->dirlink_desc == NULL) ||
+		    (extent_info->dirlink_attr == NULL) ||
+		    (extent_info->dirlink_fork == NULL)) {
+			error = ENOMEM;
+			goto out;
+		}
+
+		/* Look up the catalog record for the directory hard link and
+		 * create a fake filefork for the value looked up from
+		 * the disk.
+		 */
+		fp = extent_info->dirlink_fork;
+		bzero(extent_info->dirlink_fork, sizeof(struct filefork));
+		extent_info->dirlink_fork->ff_cp = cp;
+		lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
+		error = cat_lookup_dirlink(hfsmp, fileID, forktype,
+				extent_info->dirlink_desc, extent_info->dirlink_attr,
+				&(extent_info->dirlink_fork->ff_data));
+		hfs_systemfile_unlock(hfsmp, lockflags);
+		if (error) {
+			printf ("hfs_reclaim_file: cat_lookup_dirlink for fileID=%u returned error=%u\n", fileID, error);
+			goto out;
+		}
+		release_desc = true;
+	} else {
+		fp = VTOF(vp);
+	}
+
+	extent_info->catalog_fp = fp;
+	extent_info->recStartBlock = 0;
+	extent_info->extents = extent_info->catalog_fp->ff_extents;
+	/* Relocate extents from the catalog record */
+	for (i = 0; i < kHFSPlusExtentDensity; ++i) {
+		if (fp->ff_extents[i].blockCount == 0) {
+			break;
+		}
+		extent_info->extent_index = i;
+		error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context);
+		if (error) {
+			printf ("hfs_reclaim_file: fileID=%u #%d %u:(%u,%u) hfs_reclaim_extent error=%d\n", fileID, extent_info->overflow_count, i, fp->ff_extents[i].startBlock, fp->ff_extents[i].blockCount, error);
+			goto out;
+		}
+	}
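Extents beyond the eight catalog slots live in the extents overflow B-tree, which the code below walks next. A distilled sketch of that walk, with the per-record locking elided; process_record() is a hypothetical stand-in for the per-extent relocation and is not part of the patch:

	/*
	 * Sketch only: iterate every overflow extent record of one fork.
	 * The key is (forkType, fileID, startBlock), so iteration stops as
	 * soon as either of the first two fields changes.
	 */
	static void process_record(struct FSBufferDescriptor *btdata);	/* hypothetical */

	static int
	walk_overflow_extents(FCB *fcb, struct BTreeIterator *iterator,
			struct FSBufferDescriptor *btdata,
			u_int8_t forktype, u_int32_t fileID, u_int32_t first_fabn)
	{
		HFSPlusExtentKey *key = (HFSPlusExtentKey *)&(iterator->key);
		int error;

		key->keyLength  = kHFSPlusExtentKeyMaximumLength;
		key->forkType   = forktype;
		key->fileID     = fileID;
		key->startBlock = first_fabn;	/* FABN of the first overflow record */

		error = BTSearchRecord(fcb, iterator, btdata, NULL, iterator);
		while (error == 0) {
			if (key->fileID != fileID || key->forkType != forktype) {
				break;			/* walked past this fork's records */
			}
			process_record(btdata);
			error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, btdata, NULL);
		}
		return (error == btNotFound) ? 0 : error;
	}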
+
+	/* If the number of allocation blocks processed for reclaiming
+	 * is less than the total number of blocks for the file, continue
+	 * working on the overflow extent records.
+	 */
+	if (fp->ff_blocks <= extent_info->cur_blockCount) {
+		if (0 && hfs_resize_debug) {
+			printf ("hfs_reclaim_file: Nothing more to relocate, offset=%d, ff_blocks=%u, cur_blockCount=%u\n", i, fp->ff_blocks, extent_info->cur_blockCount);
+		}
+		goto out;
+	}
+
+	if (hfs_resize_debug) {
+		printf ("hfs_reclaim_file: Will check overflow records, offset=%d, ff_blocks=%u, cur_blockCount=%u\n", i, fp->ff_blocks, extent_info->cur_blockCount);
+	}
+
+	MALLOC(extent_info->iterator, struct BTreeIterator *, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK);
+	if (extent_info->iterator == NULL) {
+		error = ENOMEM;
+		goto out;
 	}
+	bzero(extent_info->iterator, sizeof(struct BTreeIterator));
+	key = (HFSPlusExtentKey *) &(extent_info->iterator->key);
+	key->keyLength = kHFSPlusExtentKeyMaximumLength;
+	key->forkType = forktype;
+	key->fileID = fileID;
+	key->startBlock = extent_info->cur_blockCount;
+
+	extent_info->btdata.bufferAddress = extent_info->record.overflow;
+	extent_info->btdata.itemSize = sizeof(HFSPlusExtentRecord);
+	extent_info->btdata.itemCount = 1;
+	extent_info->catalog_fp = NULL;
+
+	/* Search for the first overflow extent record, whose expected startBlock is 'cur_blockCount' */
+	lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
+	error = BTSearchRecord(extent_info->fcb, extent_info->iterator,
+			&(extent_info->btdata), &(extent_info->recordlen),
+			extent_info->iterator);
+	hfs_systemfile_unlock(hfsmp, lockflags);
+	while (error == 0) {
+		extent_info->overflow_count++;
+		extent_info->recStartBlock = key->startBlock;
+		extent_info->extents = extent_info->record.overflow;
+		for (i = 0; i < kHFSPlusExtentDensity; i++) {
+			if (extent_info->record.overflow[i].blockCount == 0) {
+				goto out;
+			}
+			extent_info->extent_index = i;
+			error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context);
+			if (error) {
+				printf ("hfs_reclaim_file: fileID=%u #%d %u:(%u,%u) hfs_reclaim_extent error=%d\n", fileID, extent_info->overflow_count, i, extent_info->record.overflow[i].startBlock, extent_info->record.overflow[i].blockCount, error);
+				goto out;
+			}
+		}
+
+		/* Look for more overflow records */
+		lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
+		error = BTIterateRecord(extent_info->fcb, kBTreeNextRecord,
+				extent_info->iterator, &(extent_info->btdata),
+				&(extent_info->recordlen));
+		hfs_systemfile_unlock(hfsmp, lockflags);
+		if (error) {
+			break;
+		}
+		/* Stop when we encounter a different file or fork.
*/ + if ((key->fileID != fileID) || (key->forkType != forktype)) { + break; + } + } + if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) { + error = 0; + } + +out: + /* If any blocks were relocated, account them and report progress */ + if (extent_info->blocks_relocated) { + hfsmp->hfs_resize_blocksmoved += extent_info->blocks_relocated; + hfs_truncatefs_progress(hfsmp); + if (fileID < kHFSFirstUserCatalogNodeID) { + printf ("hfs_reclaim_file: Relocated %u blocks from fileID=%u on \"%s\"\n", + extent_info->blocks_relocated, fileID, hfsmp->vcbVN); + } + } + if (extent_info->iterator) { + FREE(extent_info->iterator, M_TEMP); + } + if (release_desc == true) { + cat_releasedesc(extent_info->dirlink_desc); + } + if (extent_info->dirlink_desc) { + FREE(extent_info->dirlink_desc, M_TEMP); + } + if (extent_info->dirlink_attr) { + FREE(extent_info->dirlink_attr, M_TEMP); + } + if (extent_info->dirlink_fork) { + FREE(extent_info->dirlink_fork, M_TEMP); + } + if ((extent_info->blocks_relocated != 0) && (extent_info->is_sysfile == false)) { + (void) hfs_update(vp, MNT_WAIT); + } + if (took_truncate_lock) { + hfs_unlock_truncate(cp, 0); + } + if (extent_info) { + FREE(extent_info, M_TEMP); + } if (hfs_resize_debug) { - printf("hfs_reclaim_file: Finished relocating %sfork for fileid=%u (error=%d)\n", (forktype ? "rsrc" : "data"), fileID, error); + printf("hfs_reclaim_file: === Finished relocating %sfork for fileid=%u (error=%d) ===\n", (forktype ? "rsrc" : "data"), fileID, error); } return error; @@ -4604,6 +5756,9 @@ hfs_journal_relocate_callback(void *_args) hfsmp->blockSize, vfs_context_ucred(args->context), &bp); if (error) { printf("hfs_reclaim_journal_file: failed to read JIB (%d)\n", error); + if (bp) { + buf_brelse(bp); + } return error; } jibp = (JournalInfoBlock*) buf_dataptr(bp); @@ -4629,9 +5784,10 @@ hfs_journal_relocate_callback(void *_args) static int -hfs_reclaim_journal_file(struct hfsmount *hfsmp, vfs_context_t context) +hfs_reclaim_journal_file(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context) { int error; + int journal_err; int lockflags; u_int32_t oldStartBlock; u_int32_t newStartBlock; @@ -4642,6 +5798,11 @@ hfs_reclaim_journal_file(struct hfsmount *hfsmp, vfs_context_t context) struct cat_fork journal_fork; struct hfs_journal_relocate_args callback_args; + if (hfsmp->jnl_start + (hfsmp->jnl_size / hfsmp->blockSize) <= allocLimit) { + /* The journal does not require relocation */ + return 0; + } + error = hfs_start_transaction(hfsmp); if (error) { printf("hfs_reclaim_journal_file: hfs_start_transaction returned %d\n", error); @@ -4708,13 +5869,24 @@ hfs_reclaim_journal_file(struct hfsmount *hfsmp, vfs_context_t context) printf("hfs_reclaim_journal_file: hfs_end_transaction returned %d\n", error); } - if (!error && hfs_resize_debug) { - printf ("hfs_reclaim_journal_file: Successfully relocated journal from (%u,%u) to (%u,%u)\n", oldStartBlock, oldBlockCount, newStartBlock, newBlockCount); + /* Account for the blocks relocated and print progress */ + hfsmp->hfs_resize_blocksmoved += oldBlockCount; + hfs_truncatefs_progress(hfsmp); + if (!error) { + printf ("hfs_reclaim_journal_file: Relocated %u blocks from journal on \"%s\"\n", + oldBlockCount, hfsmp->vcbVN); + if (hfs_resize_debug) { + printf ("hfs_reclaim_journal_file: Successfully relocated journal from (%u,%u) to (%u,%u)\n", oldStartBlock, oldBlockCount, newStartBlock, newBlockCount); + } } return error; free_fail: - (void) BlockDeallocate(hfsmp, newStartBlock, newBlockCount, 
HFS_ALLOC_SKIPFREEBLKS);
+	journal_err = BlockDeallocate(hfsmp, newStartBlock, newBlockCount, HFS_ALLOC_SKIPFREEBLKS);
+	if (journal_err) {
+		printf("hfs_reclaim_journal_file: BlockDeallocate returned %d\n", journal_err);
+		hfs_mark_volume_inconsistent(hfsmp);
+	}
 fail:
 	hfs_systemfile_unlock(hfsmp, lockflags);
 	(void) hfs_end_transaction(hfsmp);
@@ -4731,9 +5903,10 @@ fail:
  * the field in the volume header and the catalog record.
  */
 static int
-hfs_reclaim_journal_info_block(struct hfsmount *hfsmp, vfs_context_t context)
+hfs_reclaim_journal_info_block(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
 {
 	int error;
+	int journal_err;
 	int lockflags;
 	u_int32_t oldBlock;
 	u_int32_t newBlock;
@@ -4742,6 +5915,11 @@ hfs_reclaim_journal_info_block(struct hfsmount *hfsmp, vfs_context_t context)
 	struct cat_attr jib_attr;
 	struct cat_fork jib_fork;
 	buf_t old_bp, new_bp;
+
+	if (hfsmp->vcbJinfoBlock <= allocLimit) {
+		/* The journal info block does not require relocation */
+		return 0;
+	}
 
 	error = hfs_start_transaction(hfsmp);
 	if (error) {
@@ -4773,6 +5951,9 @@ hfs_reclaim_journal_info_block(struct hfsmount *hfsmp, vfs_context_t context)
 		hfsmp->blockSize, vfs_context_ucred(context), &old_bp);
 	if (error) {
 		printf("hfs_reclaim_journal_info_block: failed to read JIB (%d)\n", error);
+		if (old_bp) {
+			buf_brelse(old_bp);
+		}
 		goto free_fail;
 	}
 	new_bp = buf_getblk(hfsmp->hfs_devvp,
@@ -4820,101 +6001,537 @@ hfs_reclaim_journal_info_block(struct hfsmount *hfsmp, vfs_context_t context)
 		printf("hfs_reclaim_journal_info_block: hfs_flushvolumeheader returned %d\n", error);
 		goto fail;
 	}
-	hfs_systemfile_unlock(hfsmp, lockflags);
-	error = hfs_end_transaction(hfsmp);
-	if (error) {
-		printf("hfs_reclaim_journal_info_block: hfs_end_transaction returned %d\n", error);
+	hfs_systemfile_unlock(hfsmp, lockflags);
+	error = hfs_end_transaction(hfsmp);
+	if (error) {
+		printf("hfs_reclaim_journal_info_block: hfs_end_transaction returned %d\n", error);
+	}
+	error = hfs_journal_flush(hfsmp, FALSE);
+	if (error) {
+		printf("hfs_reclaim_journal_info_block: journal_flush returned %d\n", error);
+	}
+
+	/* Account for the block relocated and print progress */
+	hfsmp->hfs_resize_blocksmoved += 1;
+	hfs_truncatefs_progress(hfsmp);
+	if (!error) {
+		printf ("hfs_reclaim_journal_info: Relocated 1 block from journal info on \"%s\"\n",
+				hfsmp->vcbVN);
+		if (hfs_resize_debug) {
+			printf ("hfs_reclaim_journal_info_block: Successfully relocated journal info block from (%u,%u) to (%u,%u)\n", oldBlock, blockCount, newBlock, blockCount);
+		}
+	}
+	return error;
+
+free_fail:
+	journal_err = BlockDeallocate(hfsmp, newBlock, blockCount, HFS_ALLOC_SKIPFREEBLKS);
+	if (journal_err) {
+		printf("hfs_reclaim_journal_info_block: BlockDeallocate returned %d\n", journal_err);
+		hfs_mark_volume_inconsistent(hfsmp);
+	}
+
+fail:
+	hfs_systemfile_unlock(hfsmp, lockflags);
+	(void) hfs_end_transaction(hfsmp);
+	if (hfs_resize_debug) {
+		printf ("hfs_reclaim_journal_info_block: Error relocating journal info block (error=%d)\n", error);
+	}
+	return error;
+}
+
+
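Both journal reclaim functions above reduce to the same allocate/copy/repoint/free shape, guarded by the early allocLimit check. A compressed sketch of that shape for a single-block structure follows; the helper name and copy_one_block() are illustrative assumptions, and the real functions additionally run inside a transaction with the proper system-file locks held:

	/*
	 * Sketch only: relocate one metadata block below allocLimit.
	 * block_field points at the on-disk pointer being moved (for
	 * example, vcbJinfoBlock).  copy_one_block() stands in for the
	 * buffer-cache read/write that the real functions perform.
	 */
	static int copy_one_block(struct hfsmount *hfsmp,
			u_int32_t oldBlock, u_int32_t newBlock);	/* hypothetical */

	static int
	relocate_metadata_block(struct hfsmount *hfsmp, u_int32_t allocLimit,
			u_int32_t *block_field)
	{
		u_int32_t oldBlock = *block_field;
		u_int32_t newBlock, blockCount;
		int error;

		if (oldBlock <= allocLimit) {
			return 0;	/* already below the new end of volume */
		}
		error = BlockAllocate(hfsmp, 1, 1, 1,
				HFS_ALLOC_METAZONE | HFS_ALLOC_FORCECONTIG | HFS_ALLOC_SKIPFREEBLKS,
				&newBlock, &blockCount);
		if (error) {
			return error;
		}
		error = copy_one_block(hfsmp, oldBlock, newBlock);
		if (error) {
			(void) BlockDeallocate(hfsmp, newBlock, blockCount,
					HFS_ALLOC_SKIPFREEBLKS);
			return error;
		}
		*block_field = newBlock;	/* repoint the on-disk structure */
		return BlockDeallocate(hfsmp, oldBlock, blockCount,
				HFS_ALLOC_SKIPFREEBLKS);
	}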
+/*
+ * This function traverses through all extended attribute records for a given
+ * fileID, and calls a function that relocates data blocks that exist in the
+ * area of the disk being reclaimed; that function in turn is responsible for
+ * allocating new space, copying extent data, deallocating the old space, and,
+ * if required, splitting the extent.
+ *
+ * Note: The caller has already acquired the cnode lock on the file.  Therefore
+ * we are assured that no other thread would be creating/deleting/modifying
+ * extended attributes for this file.
+ *
+ * Side Effects:
+ * hfsmp->hfs_resize_blocksmoved is incremented by the number of allocation
+ * blocks that were relocated.
+ *
+ * Returns:
+ * 	0 on success, non-zero on failure.
+ */
+static int
+hfs_reclaim_xattr(struct hfsmount *hfsmp, struct vnode *vp, u_int32_t fileID, u_int32_t allocLimit, vfs_context_t context)
+{
+	int error = 0;
+	struct hfs_reclaim_extent_info *extent_info;
+	int i;
+	HFSPlusAttrKey *key;
+	int *lockflags;
+
+	if (hfs_resize_debug) {
+		printf("hfs_reclaim_xattr: === Start reclaiming xattr for id=%u ===\n", fileID);
+	}
+
+	MALLOC(extent_info, struct hfs_reclaim_extent_info *,
+	       sizeof(struct hfs_reclaim_extent_info), M_TEMP, M_WAITOK);
+	if (extent_info == NULL) {
+		return ENOMEM;
+	}
+	bzero(extent_info, sizeof(struct hfs_reclaim_extent_info));
+	extent_info->vp = vp;
+	extent_info->fileID = fileID;
+	extent_info->is_xattr = true;
+	extent_info->is_sysfile = vnode_issystem(vp);
+	extent_info->fcb = VTOF(hfsmp->hfs_attribute_vp);
+	lockflags = &(extent_info->lockflags);
+	*lockflags = SFL_ATTRIBUTE | SFL_BITMAP;
+
+	/* Initialize iterator from the extent_info structure */
+	MALLOC(extent_info->iterator, struct BTreeIterator *,
+	       sizeof(struct BTreeIterator), M_TEMP, M_WAITOK);
+	if (extent_info->iterator == NULL) {
+		error = ENOMEM;
+		goto out;
+	}
+	bzero(extent_info->iterator, sizeof(struct BTreeIterator));
+
+	/* Build attribute key */
+	key = (HFSPlusAttrKey *)&(extent_info->iterator->key);
+	error = hfs_buildattrkey(fileID, NULL, key);
+	if (error) {
+		goto out;
+	}
+
+	/* Initialize btdata from the extent_info structure.  Note that the
+	 * buffer pointer actually points to the xattr record from the
+	 * extent_info structure itself.
+	 */
+	extent_info->btdata.bufferAddress = &(extent_info->record.xattr);
+	extent_info->btdata.itemSize = sizeof(HFSPlusAttrRecord);
+	extent_info->btdata.itemCount = 1;
+
+	/*
+	 * Sync all extent-based attribute data to the disk.
+	 *
+	 * All extent-based attribute data I/O is performed via cluster
+	 * I/O using a virtual file that spans the entire file system
+	 * space.
+	 */
+	hfs_lock_truncate(VTOC(hfsmp->hfs_attrdata_vp), HFS_EXCLUSIVE_LOCK);
+	(void)cluster_push(hfsmp->hfs_attrdata_vp, 0);
+	error = vnode_waitforwrites(hfsmp->hfs_attrdata_vp, 0, 0, 0, "hfs_reclaim_xattr");
+	hfs_unlock_truncate(VTOC(hfsmp->hfs_attrdata_vp), 0);
+	if (error) {
+		goto out;
+	}
+
+	/* Search for extended attributes of the current file.  This
+	 * will place the iterator before the first matching record.
+	 */
+	*lockflags = hfs_systemfile_lock(hfsmp, *lockflags, HFS_EXCLUSIVE_LOCK);
+	error = BTSearchRecord(extent_info->fcb, extent_info->iterator,
+			&(extent_info->btdata), &(extent_info->recordlen),
+			extent_info->iterator);
+	hfs_systemfile_unlock(hfsmp, *lockflags);
+	if (error) {
+		if (error != btNotFound) {
+			goto out;
+		}
+		/* btNotFound is expected here, so just mask it */
+		error = 0;
+	}
+
+	while (1) {
+		/* Iterate to the next record */
+		*lockflags = hfs_systemfile_lock(hfsmp, *lockflags, HFS_EXCLUSIVE_LOCK);
+		error = BTIterateRecord(extent_info->fcb, kBTreeNextRecord,
+				extent_info->iterator, &(extent_info->btdata),
+				&(extent_info->recordlen));
+		hfs_systemfile_unlock(hfsmp, *lockflags);
+
+		/* Stop the iteration if we encounter the end of the btree or an xattr with a different fileID */
+		if (error || key->fileID != fileID) {
+			if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
+				error = 0;
+			}
+			break;
+		}
+
+		/* We only care about extent-based EAs */
+		if ((extent_info->record.xattr.recordType != kHFSPlusAttrForkData) &&
+		    (extent_info->record.xattr.recordType != kHFSPlusAttrExtents)) {
+			continue;
+		}
+
+		if (extent_info->record.xattr.recordType == kHFSPlusAttrForkData) {
+			extent_info->overflow_count = 0;
+			extent_info->extents = extent_info->record.xattr.forkData.theFork.extents;
+		} else if (extent_info->record.xattr.recordType == kHFSPlusAttrExtents) {
+			extent_info->overflow_count++;
+			extent_info->extents = extent_info->record.xattr.overflowExtents.extents;
+		}
+
+		extent_info->recStartBlock = key->startBlock;
+		for (i = 0; i < kHFSPlusExtentDensity; i++) {
+			if (extent_info->extents[i].blockCount == 0) {
+				break;
+			}
+			extent_info->extent_index = i;
+			error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context);
+			if (error) {
+				printf ("hfs_reclaim_xattr: fileID=%u hfs_reclaim_extent error=%d\n", fileID, error);
+				goto out;
+			}
+		}
+	}
+
+out:
+	/* If any blocks were relocated, account them and report progress */
+	if (extent_info->blocks_relocated) {
+		hfsmp->hfs_resize_blocksmoved += extent_info->blocks_relocated;
+		hfs_truncatefs_progress(hfsmp);
+	}
+	if (extent_info->iterator) {
+		FREE(extent_info->iterator, M_TEMP);
+	}
+	if (extent_info) {
+		FREE(extent_info, M_TEMP);
+	}
+	if (hfs_resize_debug) {
+		printf("hfs_reclaim_xattr: === Finished relocating xattr for fileid=%u (error=%d) ===\n", fileID, error);
+	}
+	return error;
+}
+
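Only two attribute record types own allocation blocks, and both the reclaim loop above and the preflight scan below locate their extents the same way. A small sketch of that dispatch, assuming an already-fetched record; the helper name is illustrative:

	/*
	 * Sketch only: find the extents array inside an attribute B-tree
	 * record.  kHFSPlusAttrForkData carries the first eight extents of
	 * a large EA; kHFSPlusAttrExtents carries subsequent overflow
	 * extents.  All other record types are inline and own no blocks.
	 */
	static HFSPlusExtentDescriptor *
	attr_record_extents(HFSPlusAttrRecord *rec)
	{
		switch (rec->recordType) {
		case kHFSPlusAttrForkData:
			return rec->forkData.theFork.extents;
		case kHFSPlusAttrExtents:
			return rec->overflowExtents.extents;
		default:
			return NULL;	/* inline attribute: nothing to relocate */
		}
	}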
+/*
+ * Reclaim any extent-based extended attributes allocation blocks from
+ * the area of the disk that is being truncated.
+ *
+ * The function traverses the attribute btree to find the fileIDs
+ * of the extended attributes that need to be relocated.  For every
+ * file whose large EA requires relocation, it looks up the cnode and
+ * calls hfs_reclaim_xattr() to do all the work for allocating
+ * new space, copying data, deallocating old space, and if required,
+ * splitting the extents.
+ *
+ * Inputs:
+ * 	allocLimit    - starting block of the area being reclaimed
+ *
+ * Returns:
+ * 	0 on success, non-zero on failure.
+ */
+static int
+hfs_reclaim_xattrspace(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
+{
+	int error = 0;
+	FCB *fcb;
+	struct BTreeIterator *iterator = NULL;
+	struct FSBufferDescriptor btdata;
+	HFSPlusAttrKey *key;
+	HFSPlusAttrRecord rec;
+	int lockflags = 0;
+	cnid_t prev_fileid = 0;
+	struct vnode *vp;
+	int need_relocate;
+	int btree_operation;
+	u_int32_t files_moved = 0;
+	u_int32_t prev_blocksmoved;
+	int i;
+
+	fcb = VTOF(hfsmp->hfs_attribute_vp);
+	/* Store the value to print total blocks moved by this function at the end */
+	prev_blocksmoved = hfsmp->hfs_resize_blocksmoved;
+
+	if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) {
+		return ENOMEM;
+	}
+	bzero(iterator, sizeof(*iterator));
+	key = (HFSPlusAttrKey *)&iterator->key;
+	btdata.bufferAddress = &rec;
+	btdata.itemSize = sizeof(rec);
+	btdata.itemCount = 1;
+
+	need_relocate = false;
+	btree_operation = kBTreeFirstRecord;
+	/* Traverse the attribute btree to find extent-based EAs to reclaim */
+	while (1) {
+		lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK);
+		error = BTIterateRecord(fcb, btree_operation, iterator, &btdata, NULL);
+		hfs_systemfile_unlock(hfsmp, lockflags);
+		if (error) {
+			if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
+				error = 0;
+			}
+			break;
+		}
+		btree_operation = kBTreeNextRecord;
+
+		/* If the extents of the current fileID were already relocated, skip it */
+		if (prev_fileid == key->fileID) {
+			continue;
+		}
+
+		/* Check if any of the extents in the current record need to be relocated */
+		need_relocate = false;
+		switch(rec.recordType) {
+		case kHFSPlusAttrForkData:
+			for (i = 0; i < kHFSPlusExtentDensity; i++) {
+				if (rec.forkData.theFork.extents[i].blockCount == 0) {
+					break;
+				}
+				if ((rec.forkData.theFork.extents[i].startBlock +
+				     rec.forkData.theFork.extents[i].blockCount) > allocLimit) {
+					need_relocate = true;
+					break;
+				}
+			}
+			break;
+
+		case kHFSPlusAttrExtents:
+			for (i = 0; i < kHFSPlusExtentDensity; i++) {
+				if (rec.overflowExtents.extents[i].blockCount == 0) {
+					break;
+				}
+				if ((rec.overflowExtents.extents[i].startBlock +
+				     rec.overflowExtents.extents[i].blockCount) > allocLimit) {
+					need_relocate = true;
+					break;
+				}
+			}
+			break;
+		};
+
+		/* Continue iterating to the next attribute record */
+		if (need_relocate == false) {
+			continue;
+		}
+
+		/* Look up the vnode for the corresponding file.  The cnode
+		 * will be locked which will ensure that no one modifies
+		 * the xattrs when we are relocating them.
+		 *
+		 * We want to allow open-unlinked files to be moved,
+		 * so provide allow_deleted == 1 for hfs_vget().
+		 */
+		if (hfs_vget(hfsmp, key->fileID, &vp, 0, 1) != 0) {
+			continue;
+		}
+
+		error = hfs_reclaim_xattr(hfsmp, vp, key->fileID, allocLimit, context);
+		hfs_unlock(VTOC(vp));
+		vnode_put(vp);
+		if (error) {
+			printf ("hfs_reclaim_xattrspace: Error relocating xattrs for fileid=%u (error=%d)\n", key->fileID, error);
+			break;
+		}
+		prev_fileid = key->fileID;
+		files_moved++;
+	}
+
+	if (files_moved) {
+		printf("hfs_reclaim_xattrspace: Relocated %u xattr blocks from %u files on \"%s\"\n",
+				(hfsmp->hfs_resize_blocksmoved - prev_blocksmoved),
+				files_moved, hfsmp->vcbVN);
	}
-	error = hfs_journal_flush(hfsmp);
-	if (error) {
-		printf("hfs_reclaim_journal_info_block: journal_flush returned %d\n", error);
+
+	kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));
+	return error;
+}
+
+/*
+ * Reclaim blocks from regular files.
+ *
+ * This function iterates over all the records in the catalog btree looking
+ * for files with extents that overlap into the space we're trying to
+ * free up.  If a file extent requires relocation, it looks up the vnode
+ * and calls a function to relocate the data.
+ *
+ * Returns:
+ *	Zero on success, non-zero on failure.
+ */
+static int
+hfs_reclaim_filespace(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
+{
+	int error;
+	FCB *fcb;
+	struct BTreeIterator *iterator = NULL;
+	struct FSBufferDescriptor btdata;
+	int btree_operation;
+	int lockflags;
+	struct HFSPlusCatalogFile filerec;
+	struct vnode *vp;
+	struct vnode *rvp;
+	struct filefork *datafork;
+	u_int32_t files_moved = 0;
+	u_int32_t prev_blocksmoved;
+
+	fcb = VTOF(hfsmp->hfs_catalog_vp);
+	/* Store the value to print total blocks moved by this function at the end */
+	prev_blocksmoved = hfsmp->hfs_resize_blocksmoved;
+
+	if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) {
+		return ENOMEM;
	}
+	bzero(iterator, sizeof(*iterator));
+
+	btdata.bufferAddress = &filerec;
+	btdata.itemSize = sizeof(filerec);
+	btdata.itemCount = 1;
+
+	btree_operation = kBTreeFirstRecord;
+	while (1) {
+		lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
+		error = BTIterateRecord(fcb, btree_operation, iterator, &btdata, NULL);
+		hfs_systemfile_unlock(hfsmp, lockflags);
+		if (error) {
+			if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
+				error = 0;
+			}
+			break;
+		}
+		btree_operation = kBTreeNextRecord;
+
+		if (filerec.recordType != kHFSPlusFileRecord) {
+			continue;
+		}
+
+		/* Check if any of the extents require relocation */
+		if (hfs_file_extent_overlaps(hfsmp, allocLimit, &filerec) == false) {
+			continue;
+		}

-	if (!error && hfs_resize_debug) {
-		printf ("hfs_reclaim_journal_info_block: Successfully relocated journal info block from (%u,%u) to (%u,%u)\n", oldBlock, blockCount, newBlock, blockCount);
+		/* We want to allow open-unlinked files to be moved, so allow_deleted == 1 */
+		if (hfs_vget(hfsmp, filerec.fileID, &vp, 0, 1) != 0) {
+			continue;
+		}
+
+		/* If the data fork exists or the item is a directory hard link, relocate blocks */
+		datafork = VTOF(vp);
+		if ((datafork && datafork->ff_blocks > 0) || vnode_isdir(vp)) {
+			error = hfs_reclaim_file(hfsmp, vp, filerec.fileID,
+					kHFSDataForkType, allocLimit, context);
+			if (error) {
+				printf ("hfs_reclaim_filespace: Error reclaiming datafork blocks of fileid=%u (error=%d)\n", filerec.fileID, error);
+				hfs_unlock(VTOC(vp));
+				vnode_put(vp);
+				break;
+			}
+		}
+
+		/* If the resource fork exists or the item is a directory hard link, relocate blocks */
+		if (((VTOC(vp)->c_blocks - (datafork ? datafork->ff_blocks : 0)) > 0) || vnode_isdir(vp)) {
+			if (vnode_isdir(vp)) {
+				/* Resource fork vnode lookup is invalid for a directory
+				 * hard link, so we use the data fork vnode in place of
+				 * the resource fork vnode.
+ */ + rvp = vp; + } else { + error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, FALSE); + if (error) { + printf ("hfs_reclaimspace: Error looking up rvp for fileid=%u (error=%d)\n", filerec.fileID, error); + hfs_unlock(VTOC(vp)); + vnode_put(vp); + break; + } + VTOC(rvp)->c_flag |= C_NEED_RVNODE_PUT; + } + + error = hfs_reclaim_file(hfsmp, rvp, filerec.fileID, + kHFSResourceForkType, allocLimit, context); + if (error) { + printf ("hfs_reclaimspace: Error reclaiming rsrcfork blocks of fileid=%u (error=%d)\n", filerec.fileID, error); + hfs_unlock(VTOC(vp)); + vnode_put(vp); + break; + } + } + + /* The file forks were relocated successfully, now drop the + * cnode lock and vnode reference, and continue iterating to + * next catalog record. + */ + hfs_unlock(VTOC(vp)); + vnode_put(vp); + files_moved++; } - return error; -free_fail: - (void) BlockDeallocate(hfsmp, newBlock, blockCount, HFS_ALLOC_SKIPFREEBLKS); -fail: - hfs_systemfile_unlock(hfsmp, lockflags); - (void) hfs_end_transaction(hfsmp); - if (hfs_resize_debug) { - printf ("hfs_reclaim_journal_info_block: Error relocating journal info block (error=%d)\n", error); + if (files_moved) { + printf("hfs_reclaim_filespace: Relocated %u blocks from %u files on \"%s\"\n", + (hfsmp->hfs_resize_blocksmoved - prev_blocksmoved), + files_moved, hfsmp->vcbVN); } + + kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator)); return error; } - /* * Reclaim space at the end of a file system. * * Inputs - - * startblk - start block of the space being reclaimed + * allocLimit - start block of the space being reclaimed * reclaimblks - number of allocation blocks to reclaim */ static int -hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t reclaimblks, vfs_context_t context) +hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t allocLimit, u_int32_t reclaimblks, vfs_context_t context) { - struct vnode *vp = NULL; - FCB *fcb; - struct BTreeIterator * iterator = NULL; - struct FSBufferDescriptor btdata; - struct HFSPlusCatalogFile filerec; - u_int32_t saved_next_allocation; - cnid_t * cnidbufp; - size_t cnidbufsize; - int filecnt = 0; - int maxfilecnt; - u_int32_t block; - int lockflags; - int i, j; - int error; - int lastprogress = 0; - u_int32_t blks_moved = 0; - u_int32_t total_blks_moved = 0; - Boolean need_relocate; + int error = 0; + + /* + * Preflight the bitmap to find out total number of blocks that need + * relocation. + * + * Note: Since allocLimit is set to the location of new alternate volume + * header, the check below does not account for blocks allocated for old + * alternate volume header. + */ + error = hfs_count_allocated(hfsmp, allocLimit, reclaimblks, &(hfsmp->hfs_resize_totalblocks)); + if (error) { + printf ("hfs_reclaimspace: Unable to determine total blocks to reclaim error=%d\n", error); + return error; + } + if (hfs_resize_debug) { + printf ("hfs_reclaimspace: Total number of blocks to reclaim = %u\n", hfsmp->hfs_resize_totalblocks); + } /* Relocate extents of the Allocation file if they're in the way. */ - error = hfs_reclaim_file(hfsmp, hfsmp->hfs_allocation_vp, startblk, SFL_BITMAP, &blks_moved, context); + error = hfs_reclaim_file(hfsmp, hfsmp->hfs_allocation_vp, kHFSAllocationFileID, + kHFSDataForkType, allocLimit, context); if (error) { printf("hfs_reclaimspace: reclaim allocation file returned %d\n", error); return error; } - total_blks_moved += blks_moved; /* Relocate extents of the Extents B-tree if they're in the way. 
*/ - error = hfs_reclaim_file(hfsmp, hfsmp->hfs_extents_vp, startblk, SFL_EXTENTS, &blks_moved, context); + error = hfs_reclaim_file(hfsmp, hfsmp->hfs_extents_vp, kHFSExtentsFileID, + kHFSDataForkType, allocLimit, context); if (error) { printf("hfs_reclaimspace: reclaim extents b-tree returned %d\n", error); return error; } - total_blks_moved += blks_moved; /* Relocate extents of the Catalog B-tree if they're in the way. */ - error = hfs_reclaim_file(hfsmp, hfsmp->hfs_catalog_vp, startblk, SFL_CATALOG, &blks_moved, context); + error = hfs_reclaim_file(hfsmp, hfsmp->hfs_catalog_vp, kHFSCatalogFileID, + kHFSDataForkType, allocLimit, context); if (error) { printf("hfs_reclaimspace: reclaim catalog b-tree returned %d\n", error); return error; } - total_blks_moved += blks_moved; /* Relocate extents of the Attributes B-tree if they're in the way. */ - error = hfs_reclaim_file(hfsmp, hfsmp->hfs_attribute_vp, startblk, SFL_ATTRIBUTE, &blks_moved, context); + error = hfs_reclaim_file(hfsmp, hfsmp->hfs_attribute_vp, kHFSAttributesFileID, + kHFSDataForkType, allocLimit, context); if (error) { printf("hfs_reclaimspace: reclaim attribute b-tree returned %d\n", error); return error; } - total_blks_moved += blks_moved; /* Relocate extents of the Startup File if there is one and they're in the way. */ - error = hfs_reclaim_file(hfsmp, hfsmp->hfs_startup_vp, startblk, SFL_STARTUP, &blks_moved, context); + error = hfs_reclaim_file(hfsmp, hfsmp->hfs_startup_vp, kHFSStartupFileID, + kHFSDataForkType, allocLimit, context); if (error) { printf("hfs_reclaimspace: reclaim startup file returned %d\n", error); return error; } - total_blks_moved += blks_moved; /* * We need to make sure the alternate volume header gets flushed if we moved @@ -4922,249 +6539,98 @@ hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t reclaimbl * shrinking the size of the volume, or else the journal code will panic * with an invalid (too large) block number. * - * Note that total_blks_moved will be set if ANY extent was moved, even + * Note that blks_moved will be set if ANY extent was moved, even * if it was just an overflow extent. In this case, the journal_flush isn't * strictly required, but shouldn't hurt. */ - if (total_blks_moved) { - hfs_journal_flush(hfsmp); + if (hfsmp->hfs_resize_blocksmoved) { + hfs_journal_flush(hfsmp, FALSE); } - if (hfsmp->jnl_start + (hfsmp->jnl_size / hfsmp->blockSize) > startblk) { - error = hfs_reclaim_journal_file(hfsmp, context); - if (error) { - printf("hfs_reclaimspace: hfs_reclaim_journal_file failed (%d)\n", error); - return error; - } - } - - if (hfsmp->vcbJinfoBlock >= startblk) { - error = hfs_reclaim_journal_info_block(hfsmp, context); - if (error) { - printf("hfs_reclaimspace: hfs_reclaim_journal_info_block failed (%d)\n", error); - return error; - } + /* Relocate journal file blocks if they're in the way. */ + error = hfs_reclaim_journal_file(hfsmp, allocLimit, context); + if (error) { + printf("hfs_reclaimspace: hfs_reclaim_journal_file failed (%d)\n", error); + return error; } - /* For now move a maximum of 250,000 files. 
*/ - maxfilecnt = MIN(hfsmp->hfs_filecount, 250000); - maxfilecnt = MIN((u_int32_t)maxfilecnt, reclaimblks); - cnidbufsize = maxfilecnt * sizeof(cnid_t); - if (kmem_alloc(kernel_map, (vm_offset_t *)&cnidbufp, cnidbufsize)) { - return (ENOMEM); - } - if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) { - kmem_free(kernel_map, (vm_offset_t)cnidbufp, cnidbufsize); - return (ENOMEM); - } - - saved_next_allocation = hfsmp->nextAllocation; - /* Always try allocating new blocks after the metadata zone */ - HFS_UPDATE_NEXT_ALLOCATION(hfsmp, hfsmp->hfs_metazone_start); - - fcb = VTOF(hfsmp->hfs_catalog_vp); - bzero(iterator, sizeof(*iterator)); - - btdata.bufferAddress = &filerec; - btdata.itemSize = sizeof(filerec); - btdata.itemCount = 1; - - /* Keep the Catalog and extents files locked during iteration. */ - lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_EXTENTS, HFS_SHARED_LOCK); - - error = BTIterateRecord(fcb, kBTreeFirstRecord, iterator, NULL, NULL); + /* Relocate journal info block blocks if they're in the way. */ + error = hfs_reclaim_journal_info_block(hfsmp, allocLimit, context); if (error) { - goto end_iteration; - } - /* - * Iterate over all the catalog records looking for files - * that overlap into the space we're trying to free up and - * the total number of blocks that will require relocation. - */ - for (filecnt = 0; filecnt < maxfilecnt; ) { - error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL); - if (error) { - if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) { - error = 0; - } - break; - } - if (filerec.recordType != kHFSPlusFileRecord) { - continue; - } - - need_relocate = false; - /* Check if data fork overlaps the target space */ - for (i = 0; i < kHFSPlusExtentDensity; ++i) { - if (filerec.dataFork.extents[i].blockCount == 0) { - break; - } - block = filerec.dataFork.extents[i].startBlock + - filerec.dataFork.extents[i].blockCount; - if (block >= startblk) { - if ((filerec.fileID == hfsmp->hfs_jnlfileid) || - (filerec.fileID == hfsmp->hfs_jnlinfoblkid)) { - printf("hfs_reclaimspace: cannot move active journal\n"); - error = EPERM; - goto end_iteration; - } - need_relocate = true; - goto save_fileid; - } - } - - /* Check if resource fork overlaps the target space */ - for (j = 0; j < kHFSPlusExtentDensity; ++j) { - if (filerec.resourceFork.extents[j].blockCount == 0) { - break; - } - block = filerec.resourceFork.extents[j].startBlock + - filerec.resourceFork.extents[j].blockCount; - if (block >= startblk) { - need_relocate = true; - goto save_fileid; - } - } - - /* Check if any forks' overflow extents overlap the target space */ - if ((i == kHFSPlusExtentDensity) || (j == kHFSPlusExtentDensity)) { - if (hfs_overlapped_overflow_extents(hfsmp, startblk, filerec.fileID)) { - need_relocate = true; - goto save_fileid; - } - } - -save_fileid: - if (need_relocate == true) { - cnidbufp[filecnt++] = filerec.fileID; - if (hfs_resize_debug) { - printf ("hfs_reclaimspace: Will relocate extents for fileID=%u\n", filerec.fileID); - } - } + printf("hfs_reclaimspace: hfs_reclaim_journal_info_block failed (%d)\n", error); + return error; } -end_iteration: - /* If no regular file was found to be relocated and - * no system file was moved, we probably do not have - * enough space to relocate the system files, or - * something else went wrong. 
- */ - if ((filecnt == 0) && (total_blks_moved == 0)) { - printf("hfs_reclaimspace: no files moved\n"); - error = ENOSPC; + /* Reclaim extents from catalog file records */ + error = hfs_reclaim_filespace(hfsmp, allocLimit, context); + if (error) { + printf ("hfs_reclaimspace: hfs_reclaim_filespace returned error=%d\n", error); + return error; } - /* All done with catalog. */ - hfs_systemfile_unlock(hfsmp, lockflags); - if (error || filecnt == 0) - goto out; - - hfsmp->hfs_resize_filesmoved = 0; - hfsmp->hfs_resize_totalfiles = filecnt; - - /* Now move any files that are in the way. */ - for (i = 0; i < filecnt; ++i) { - struct vnode *rvp; - struct cnode *cp; - struct filefork *datafork; - - if (hfs_vget(hfsmp, cnidbufp[i], &vp, 0) != 0) - continue; - - cp = VTOC(vp); - datafork = VTOF(vp); - - /* Relocating directory hard links is not supported, so we punt (see radar 6217026). */ - if ((cp->c_flag & C_HARDLINK) && vnode_isdir(vp)) { - printf("hfs_reclaimspace: Unable to relocate directory hard link id=%d\n", cp->c_cnid); - error = EINVAL; - goto out; - } - - /* Relocate any overlapping data fork blocks. */ - if (datafork && datafork->ff_blocks > 0) { - error = hfs_reclaim_file(hfsmp, vp, startblk, 0, &blks_moved, context); - if (error) { - printf ("hfs_reclaimspace: Error reclaiming datafork blocks of fileid=%u (error=%d)\n", cnidbufp[i], error); - break; - } - total_blks_moved += blks_moved; - } - - /* Relocate any overlapping resource fork blocks. */ - if ((cp->c_blocks - (datafork ? datafork->ff_blocks : 0)) > 0) { - error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, TRUE); - if (error) { - printf ("hfs_reclaimspace: Error looking up rvp for fileid=%u (error=%d)\n", cnidbufp[i], error); - break; - } - error = hfs_reclaim_file(hfsmp, rvp, startblk, 0, &blks_moved, context); - VTOC(rvp)->c_flag |= C_NEED_RVNODE_PUT; - if (error) { - printf ("hfs_reclaimspace: Error reclaiming rsrcfork blocks of fileid=%u (error=%d)\n", cnidbufp[i], error); - break; - } - total_blks_moved += blks_moved; - } - hfs_unlock(cp); - vnode_put(vp); - vp = NULL; - - ++hfsmp->hfs_resize_filesmoved; - /* Report intermediate progress. */ - if (filecnt > 100) { - int progress; - - progress = (i * 100) / filecnt; - if (progress > (lastprogress + 9)) { - printf("hfs_reclaimspace: %d%% done...\n", progress); - lastprogress = progress; - } - } - } - if (vp) { - hfs_unlock(VTOC(vp)); - vnode_put(vp); - vp = NULL; - } - if (hfsmp->hfs_resize_filesmoved != 0) { - printf("hfs_reclaimspace: relocated %u blocks from %d files on \"%s\"\n", - total_blks_moved, (int)hfsmp->hfs_resize_filesmoved, hfsmp->vcbVN); + /* Reclaim extents from extent-based extended attributes, if any */ + error = hfs_reclaim_xattrspace(hfsmp, allocLimit, context); + if (error) { + printf ("hfs_reclaimspace: hfs_reclaim_xattrspace returned error=%d\n", error); + return error; } -out: - kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator)); - kmem_free(kernel_map, (vm_offset_t)cnidbufp, cnidbufsize); - /* - * Restore the roving allocation pointer on errors. - * (but only if we didn't move any files) - */ - if (error && hfsmp->hfs_resize_filesmoved == 0) { - HFS_UPDATE_NEXT_ALLOCATION(hfsmp, saved_next_allocation); - } - return (error); + return error; } /* - * Check if there are any overflow data or resource fork extents that overlap + * Check if there are any extents (including overflow extents) that overlap * into the disk space that is being reclaimed. 
 *
 * Output - 
- *	1 - One of the overflow extents need to be relocated
- *	0 - No overflow extents need to be relocated, or there was an error
+ *	true  - At least one extent needs to be relocated
+ *	false - No extents need to be relocated, or there was an error
 */
 static int
-hfs_overlapped_overflow_extents(struct hfsmount *hfsmp, u_int32_t startblk, u_int32_t fileID)
+hfs_file_extent_overlaps(struct hfsmount *hfsmp, u_int32_t allocLimit, struct HFSPlusCatalogFile *filerec)
 {
 	struct BTreeIterator * iterator = NULL;
 	struct FSBufferDescriptor btdata;
 	HFSPlusExtentRecord extrec;
 	HFSPlusExtentKey *extkeyptr;
 	FCB *fcb;
-	int overlapped = 0;
-	int i;
+	int overlapped = false;
+	int i, j;
 	int error;
+	int lockflags = 0;
+	u_int32_t endblock;
+
+	/* Check if data fork overlaps the target space */
+	for (i = 0; i < kHFSPlusExtentDensity; ++i) {
+		if (filerec->dataFork.extents[i].blockCount == 0) {
+			break;
+		}
+		endblock = filerec->dataFork.extents[i].startBlock +
+			filerec->dataFork.extents[i].blockCount;
+		if (endblock > allocLimit) {
+			overlapped = true;
+			goto out;
+		}
+	}
+
+	/* Check if resource fork overlaps the target space */
+	for (j = 0; j < kHFSPlusExtentDensity; ++j) {
+		if (filerec->resourceFork.extents[j].blockCount == 0) {
+			break;
+		}
+		endblock = filerec->resourceFork.extents[j].startBlock +
+			filerec->resourceFork.extents[j].blockCount;
+		if (endblock > allocLimit) {
+			overlapped = true;
+			goto out;
+		}
+	}
+
+	/* Nothing more to do if there are no overflow extents for this file */
+	if ((i < kHFSPlusExtentDensity) && (j < kHFSPlusExtentDensity)) {
+		goto out;
+	}
 
 	if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) {
 		return 0;
 	}
@@ -5173,7 +6639,7 @@ hfs_overlapped_overflow_extents(struct hfsmount *hfsmp, u_int32_t startblk, u_in
 	extkeyptr = (HFSPlusExtentKey *)&iterator->key;
 	extkeyptr->keyLength = kHFSPlusExtentKeyMaximumLength;
 	extkeyptr->forkType = 0;
-	extkeyptr->fileID = fileID;
+	extkeyptr->fileID = filerec->fileID;
 	extkeyptr->startBlock = 0;
 
 	btdata.bufferAddress = &extrec;
@@ -5182,6 +6648,8 @@ hfs_overlapped_overflow_extents(struct hfsmount *hfsmp, u_int32_t startblk, u_in
 
 	fcb = VTOF(hfsmp->hfs_extents_vp);
 
+	lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_SHARED_LOCK);
+
 	/* This will position the iterator just before the first overflow
 	 * extent record for given fileID.  It will always return btNotFound,
 	 * so we special case the error code.
@@ -5197,7 +6665,7 @@ hfs_overlapped_overflow_extents(struct hfsmount *hfsmp, u_int32_t startblk, u_in
 	error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL);
 	while (error == 0) {
 		/* Stop when we encounter a different file. */
-		if (extkeyptr->fileID != fileID) {
+		if (extkeyptr->fileID != filerec->fileID) {
 			break;
 		}
 		/* Check if any of the forks exist in the target space.
*/ @@ -5205,8 +6673,9 @@ hfs_overlapped_overflow_extents(struct hfsmount *hfsmp, u_int32_t startblk, u_in if (extrec[i].blockCount == 0) { break; } - if ((extrec[i].startBlock + extrec[i].blockCount) >= startblk) { - overlapped = 1; + endblock = extrec[i].startBlock + extrec[i].blockCount; + if (endblock > allocLimit) { + overlapped = true; goto out; } } @@ -5215,7 +6684,12 @@ hfs_overlapped_overflow_extents(struct hfsmount *hfsmp, u_int32_t startblk, u_in } out: - kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator)); + if (lockflags) { + hfs_systemfile_unlock(hfsmp, lockflags); + } + if (iterator) { + kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator)); + } return overlapped; } @@ -5231,10 +6705,11 @@ hfs_resize_progress(struct hfsmount *hfsmp, u_int32_t *progress) return (ENXIO); } - if (hfsmp->hfs_resize_totalfiles > 0) - *progress = (hfsmp->hfs_resize_filesmoved * 100) / hfsmp->hfs_resize_totalfiles; - else + if (hfsmp->hfs_resize_totalblocks > 0) { + *progress = (u_int32_t)((hfsmp->hfs_resize_blocksmoved * 100ULL) / hfsmp->hfs_resize_totalblocks); + } else { *progress = 0; + } return (0); } @@ -5270,6 +6745,7 @@ hfs_vfs_getattr(struct mount *mp, struct vfs_attr *fsap, __unused vfs_context_t { #define HFS_ATTR_CMN_VALIDMASK (ATTR_CMN_VALIDMASK & ~(ATTR_CMN_NAMEDATTRCOUNT | ATTR_CMN_NAMEDATTRLIST)) #define HFS_ATTR_FILE_VALIDMASK (ATTR_FILE_VALIDMASK & ~(ATTR_FILE_FILETYPE | ATTR_FILE_FORKCOUNT | ATTR_FILE_FORKLIST)) +#define HFS_ATTR_CMN_VOL_VALIDMASK (ATTR_CMN_VALIDMASK & ~(ATTR_CMN_NAMEDATTRCOUNT | ATTR_CMN_NAMEDATTRLIST | ATTR_CMN_ACCTIME)) ExtendedVCB *vcb = VFSTOVCB(mp); struct hfsmount *hfsmp = VFSTOHFS(mp); @@ -5396,20 +6872,20 @@ hfs_vfs_getattr(struct mount *mp, struct vfs_attr *fsap, __unused vfs_context_t if (VFSATTR_IS_ACTIVE(fsap, f_attributes)) { vol_attributes_attr_t *attrp = &fsap->f_attributes; - attrp->validattr.commonattr = HFS_ATTR_CMN_VALIDMASK; + attrp->validattr.commonattr = HFS_ATTR_CMN_VOL_VALIDMASK; attrp->validattr.volattr = ATTR_VOL_VALIDMASK & ~ATTR_VOL_INFO; attrp->validattr.dirattr = ATTR_DIR_VALIDMASK; attrp->validattr.fileattr = HFS_ATTR_FILE_VALIDMASK; attrp->validattr.forkattr = 0; - attrp->nativeattr.commonattr = HFS_ATTR_CMN_VALIDMASK; + attrp->nativeattr.commonattr = HFS_ATTR_CMN_VOL_VALIDMASK; attrp->nativeattr.volattr = ATTR_VOL_VALIDMASK & ~ATTR_VOL_INFO; attrp->nativeattr.dirattr = ATTR_DIR_VALIDMASK; attrp->nativeattr.fileattr = HFS_ATTR_FILE_VALIDMASK; attrp->nativeattr.forkattr = 0; VFSATTR_SET_SUPPORTED(fsap, f_attributes); } - fsap->f_create_time.tv_sec = hfsmp->vcbCrDate; + fsap->f_create_time.tv_sec = hfsmp->hfs_itime; fsap->f_create_time.tv_nsec = 0; VFSATTR_SET_SUPPORTED(fsap, f_create_time); fsap->f_modify_time.tv_sec = hfsmp->vcbLsMod; @@ -5470,6 +6946,10 @@ hfs_rename_volume(struct vnode *vp, const char *name, proc_t p) cat_cookie_t cookie; int lockflags; int error = 0; + char converted_volname[256]; + size_t volname_length = 0; + size_t conv_volname_length = 0; + /* * Ignore attempts to rename a volume to a zero-length name. 
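The hfs_resize_progress() change above widens the percentage arithmetic with 100ULL before dividing by hfs_resize_totalblocks. On a large volume, a 32-bit multiply of blocks-moved by 100 silently wraps. A minimal userland sketch of the difference (illustrative values only, not kernel code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t blocksmoved = 90000000;   /* ~90M allocation blocks relocated  */
	uint32_t totalblocks = 100000000;  /* ~100M allocation blocks in total  */

	/* 32-bit math: 90000000 * 100 wraps past UINT32_MAX, reporting 4% */
	uint32_t wrapped = (blocksmoved * 100U) / totalblocks;

	/* Widening first, as the kernel code now does, reports the true 90% */
	uint32_t correct = (uint32_t)((blocksmoved * 100ULL) / totalblocks);

	printf("32-bit: %u%%, 64-bit: %u%%\n", wrapped, correct);
	return 0;
}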
@@ -5504,8 +6984,16 @@ hfs_rename_volume(struct vnode *vp, const char *name, proc_t p) */ if (!error) { strlcpy((char *)vcb->vcbVN, name, sizeof(vcb->vcbVN)); + volname_length = strlen ((const char*)vcb->vcbVN); +#define DKIOCCSSETLVNAME _IOW('d', 198, char[1024]) + /* Send the volume name down to CoreStorage if necessary */ + error = utf8_normalizestr(vcb->vcbVN, volname_length, (u_int8_t*)converted_volname, &conv_volname_length, 256, UTF_PRECOMPOSED); + if (error == 0) { + (void) VNOP_IOCTL (hfsmp->hfs_devvp, DKIOCCSSETLVNAME, converted_volname, 0, vfs_context_current()); + } + error = 0; } - + hfs_systemfile_unlock(hfsmp, lockflags); cat_postflight(hfsmp, &cookie, p); @@ -5604,7 +7092,7 @@ static int hfs_journal_replay(vnode_t devvp, vfs_context_t context) struct hfs_mount_args *args = NULL; /* Replay allowed only on raw devices */ - if (!vnode_ischr(devvp)) { + if (!vnode_ischr(devvp) && !vnode_isblk(devvp)) { retval = EINVAL; goto out; } @@ -5626,7 +7114,10 @@ static int hfs_journal_replay(vnode_t devvp, vfs_context_t context) bzero(args, sizeof(struct hfs_mount_args)); retval = hfs_mountfs(devvp, mp, args, 1, context); - buf_flushdirtyblks(devvp, MNT_WAIT, 0, "hfs_journal_replay"); + buf_flushdirtyblks(devvp, TRUE, 0, "hfs_journal_replay"); + + /* FSYNC the devnode to be sure all data has been flushed */ + retval = VNOP_FSYNC(devvp, MNT_WAIT, context); out: if (mp) { diff --git a/bsd/hfs/hfs_vfsutils.c b/bsd/hfs/hfs_vfsutils.c index 97559487f..103232431 100644 --- a/bsd/hfs/hfs_vfsutils.c +++ b/bsd/hfs/hfs_vfsutils.c @@ -46,6 +46,7 @@ #include <sys/utfconv.h> #include <sys/kauth.h> #include <sys/fcntl.h> +#include <sys/fsctl.h> #include <sys/vnode_internal.h> #include <kern/clock.h> @@ -68,6 +69,8 @@ static int hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *v static u_int32_t hfs_hotfile_freeblocks(struct hfsmount *); +#define HFS_MOUNT_DEBUG 1 + //******************************************************************************* // Note: Finder information in the HFS/HFS+ metadata are considered opaque and @@ -87,7 +90,6 @@ unsigned char hfs_attrname[] = "Attribute B-tree"; unsigned char hfs_startupname[] = "Startup File"; -__private_extern__ OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, __unused struct proc *p) { @@ -97,6 +99,7 @@ OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, struct cat_desc cndesc; struct cat_attr cnattr; struct cat_fork fork; + int newvnode_flags = 0; /* Block size must be a multiple of 512 */ if (SWAP_BE32(mdb->drAlBlkSiz) == 0 || @@ -115,7 +118,7 @@ OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, * */ vcb->vcbSigWord = SWAP_BE16 (mdb->drSigWord); - vcb->vcbCrDate = to_bsd_time(LocalToUTC(SWAP_BE32(mdb->drCrDate))); + vcb->hfs_itime = to_bsd_time(LocalToUTC(SWAP_BE32(mdb->drCrDate))); vcb->localCreateDate = SWAP_BE32 (mdb->drCrDate); vcb->vcbLsMod = to_bsd_time(LocalToUTC(SWAP_BE32(mdb->drLsMod))); vcb->vcbAtrb = SWAP_BE16 (mdb->drAtrb); @@ -145,12 +148,13 @@ OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, * volume encoding we use MacRoman as a fallback. */ if (error || (utf8chars == 0)) { - (void) mac_roman_to_utf8(mdb->drVN, NAME_MAX, &utf8chars, vcb->vcbVN); - /* If we fail to encode to UTF8 from Mac Roman, the name is bad. Deny mount */ + error = mac_roman_to_utf8(mdb->drVN, NAME_MAX, &utf8chars, vcb->vcbVN); + /* If we fail to encode to UTF8 from Mac Roman, the name is bad. 
Deny the mount */ if (error) { goto MtVolErr; } } + hfsmp->hfs_logBlockSize = BestBlockSizeFit(vcb->blockSize, MAXBSIZE, hfsmp->hfs_logical_block_size); vcb->vcbVBMIOSize = kHFSBlockSize; @@ -184,11 +188,19 @@ OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, cnattr.ca_blocks = fork.cf_blocks; error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &fork, - &hfsmp->hfs_extents_vp); - if (error) goto MtVolErr; + &hfsmp->hfs_extents_vp, &newvnode_flags); + if (error) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfs (std): error creating Ext Vnode (%d) \n", error); + } + goto MtVolErr; + } error = MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_extents_vp), (KeyCompareProcPtr)CompareExtentKeys)); if (error) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfs (std): error opening Ext Vnode (%d) \n", error); + } hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); goto MtVolErr; } @@ -213,14 +225,20 @@ OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, cnattr.ca_blocks = fork.cf_blocks; error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &fork, - &hfsmp->hfs_catalog_vp); + &hfsmp->hfs_catalog_vp, &newvnode_flags); if (error) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfs (std): error creating catalog Vnode (%d) \n", error); + } hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); goto MtVolErr; } error = MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_catalog_vp), (KeyCompareProcPtr)CompareCatalogKeys)); if (error) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfs (std): error opening catalog Vnode (%d) \n", error); + } hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); goto MtVolErr; @@ -237,37 +255,41 @@ OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, cnattr.ca_blocks = 0; error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &fork, - &hfsmp->hfs_allocation_vp); + &hfsmp->hfs_allocation_vp, &newvnode_flags); if (error) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfs (std): error creating bitmap Vnode (%d) \n", error); + } hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); goto MtVolErr; } hfsmp->hfs_allocation_cp = VTOC(hfsmp->hfs_allocation_vp); - /* mark the volume dirty (clear clean unmount bit) */ + /* mark the volume dirty (clear clean unmount bit) */ vcb->vcbAtrb &= ~kHFSVolumeUnmountedMask; - if (error == noErr) - { + if (error == noErr) { error = cat_idlookup(hfsmp, kHFSRootFolderID, 0, NULL, NULL, NULL); - } - - if ( error == noErr ) - { - if ( !(vcb->vcbAtrb & kHFSVolumeHardwareLockMask) ) // if the disk is not write protected - { - MarkVCBDirty( vcb ); // mark VCB dirty so it will be written - } - } - + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfs (std): error looking up root folder (%d) \n", error); + } + } + + if (error == noErr) { + /* If the disk isn't write protected.. */ + if ( !(vcb->vcbAtrb & kHFSVolumeHardwareLockMask)) { + MarkVCBDirty (vcb); // mark VCB dirty so it will be written + } + } + /* * all done with system files so we can unlock now... 
*/ hfs_unlock(VTOC(hfsmp->hfs_allocation_vp)); hfs_unlock(VTOC(hfsmp->hfs_catalog_vp)); hfs_unlock(VTOC(hfsmp->hfs_extents_vp)); - + if (error == noErr) { /* If successful, then we can just return once we've unlocked the cnodes */ return error; @@ -275,9 +297,7 @@ OSErr hfs_MountHFSVolume(struct hfsmount *hfsmp, HFSMasterDirectoryBlock *mdb, //-- Release any resources allocated so far before exiting with an error: MtVolErr: - ReleaseMetaFileVNode(hfsmp->hfs_catalog_vp); - ReleaseMetaFileVNode(hfsmp->hfs_extents_vp); - ReleaseMetaFileVNode(hfsmp->hfs_allocation_vp); + hfsUnmount(hfsmp, NULL); return (error); } @@ -288,7 +308,6 @@ MtVolErr: // //******************************************************************************* -__private_extern__ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, off_t embeddedOffset, u_int64_t disksize, __unused struct proc *p, void *args, kauth_cred_t cred) { @@ -301,8 +320,12 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, struct BTreeInfoRec btinfo; u_int16_t signature; u_int16_t hfs_version; + int newvnode_flags = 0; int i; OSErr retval; + char converted_volname[256]; + size_t volname_length = 0; + size_t conv_volname_length = 0; signature = SWAP_BE16(vhp->signature); hfs_version = SWAP_BE16(vhp->version); @@ -324,23 +347,38 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, /* Removed printf for invalid HFS+ signature because it gives * false error for UFS root volume */ + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: unknown Volume Signature\n"); + } return (EINVAL); } /* Block size must be at least 512 and a power of 2 */ blockSize = SWAP_BE32(vhp->blockSize); - if (blockSize < 512 || !powerof2(blockSize)) + if (blockSize < 512 || !powerof2(blockSize)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: invalid blocksize (%d) \n", blockSize); + } return (EINVAL); + } /* don't mount a writable volume if its dirty, it must be cleaned by fsck_hfs */ if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0 && hfsmp->jnl == NULL && - (SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) == 0) + (SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) == 0) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: cannot mount dirty non-journaled volumes\n"); + } return (EINVAL); + } /* Make sure we can live with the physical block size. 
*/ if ((disksize & (hfsmp->hfs_logical_block_size - 1)) || (embeddedOffset & (hfsmp->hfs_logical_block_size - 1)) || (blockSize < hfsmp->hfs_logical_block_size)) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: invalid physical blocksize (%d), hfs_logical_blocksize (%d) \n", + blockSize, hfsmp->hfs_logical_block_size); + } return (ENXIO); } @@ -445,9 +483,12 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, SWAP_BE32 (vhp->extentsFile.extents[i].blockCount); } retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, - &hfsmp->hfs_extents_vp); + &hfsmp->hfs_extents_vp, &newvnode_flags); if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: hfs_getnewvnode returned (%d) getting extentoverflow BT\n", retval); + } goto ErrorExit; } hfsmp->hfs_extents_cp = VTOC(hfsmp->hfs_extents_vp); @@ -457,6 +498,9 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, (KeyCompareProcPtr) CompareExtentKeysPlus)); if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: BTOpenPath returned (%d) getting extentoverflow BT\n", retval); + } goto ErrorExit; } /* @@ -478,8 +522,11 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, SWAP_BE32 (vhp->catalogFile.extents[i].blockCount); } retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, - &hfsmp->hfs_catalog_vp); + &hfsmp->hfs_catalog_vp, &newvnode_flags); if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: hfs_getnewvnode returned (%d) getting catalog BT\n", retval); + } goto ErrorExit; } hfsmp->hfs_catalog_cp = VTOC(hfsmp->hfs_catalog_vp); @@ -488,6 +535,9 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, retval = MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_catalog_vp), (KeyCompareProcPtr) CompareExtendedCatalogKeys)); if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: BTOpenPath returned (%d) getting catalog BT\n", retval); + } goto ErrorExit; } if ((hfsmp->hfs_flags & HFS_X) && @@ -519,8 +569,11 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, SWAP_BE32 (vhp->allocationFile.extents[i].blockCount); } retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, - &hfsmp->hfs_allocation_vp); + &hfsmp->hfs_allocation_vp, &newvnode_flags); if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: hfs_getnewvnode returned (%d) getting bitmap\n", retval); + } goto ErrorExit; } hfsmp->hfs_allocation_cp = VTOC(hfsmp->hfs_allocation_vp); @@ -546,8 +599,11 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, SWAP_BE32 (vhp->attributesFile.extents[i].blockCount); } retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, - &hfsmp->hfs_attribute_vp); + &hfsmp->hfs_attribute_vp, &newvnode_flags); if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: hfs_getnewvnode returned (%d) getting EA BT\n", retval); + } goto ErrorExit; } hfsmp->hfs_attribute_cp = VTOC(hfsmp->hfs_attribute_vp); @@ -555,6 +611,22 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, retval = MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_attribute_vp), (KeyCompareProcPtr) hfs_attrkeycompare)); if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: BTOpenPath returned (%d) getting EA BT\n", retval); + } + goto ErrorExit; + } + + /* Initialize vnode for virtual attribute data file that spans the + * entire file system space for performing I/O to 
attribute btree + * We hold iocount on the attrdata vnode for the entire duration + * of mount (similar to btree vnodes) + */ + retval = init_attrdata_vnode(hfsmp); + if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: init_attrdata_vnode returned (%d) for virtual EA file\n", retval); + } goto ErrorExit; } } @@ -579,8 +651,11 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, SWAP_BE32 (vhp->startupFile.extents[i].blockCount); } retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, - &hfsmp->hfs_startup_vp); + &hfsmp->hfs_startup_vp, &newvnode_flags); if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: hfs_getnewvnode returned (%d) getting startup file\n", retval); + } goto ErrorExit; } hfsmp->hfs_startup_cp = VTOC(hfsmp->hfs_startup_vp); @@ -590,13 +665,29 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, /* Pick up volume name and create date */ retval = cat_idlookup(hfsmp, kHFSRootFolderID, 0, &cndesc, &cnattr, NULL); if (retval) { + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: cat_idlookup returned (%d) getting rootfolder \n", retval); + } goto ErrorExit; } - vcb->vcbCrDate = cnattr.ca_itime; + vcb->hfs_itime = cnattr.ca_itime; vcb->volumeNameEncodingHint = cndesc.cd_encoding; bcopy(cndesc.cd_nameptr, vcb->vcbVN, min(255, cndesc.cd_namelen)); + volname_length = strlen ((const char*)vcb->vcbVN); cat_releasedesc(&cndesc); + +#define DKIOCCSSETLVNAME _IOW('d', 198, char[1024]) + + /* Send the volume name down to CoreStorage if necessary */ + retval = utf8_normalizestr(vcb->vcbVN, volname_length, (u_int8_t*)converted_volname, &conv_volname_length, 256, UTF_PRECOMPOSED); + if (retval == 0) { + (void) VNOP_IOCTL (hfsmp->hfs_devvp, DKIOCCSSETLVNAME, converted_volname, 0, vfs_context_current()); + } + + /* reset retval == 0. we don't care about errors in volname conversion */ + retval = 0; + /* mark the volume dirty (clear clean unmount bit) */ vcb->vcbAtrb &= ~kHFSVolumeUnmountedMask; if (hfsmp->jnl && (hfsmp->hfs_flags & HFS_READ_ONLY) == 0) { @@ -624,6 +715,9 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, // EROFS is a special error code that means the volume has an external // journal which we couldn't find. in that case we do not want to // rewrite the volume header - we'll just refuse to mount the volume. + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: hfs_late_journal_init returned (%d), maybe an external jnl?\n", retval); + } retval = EINVAL; goto ErrorExit; } @@ -663,7 +757,10 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, bp = NULL; } } - + + if (HFS_MOUNT_DEBUG) { + printf("hfs_mounthfsplus: hfs_late_journal_init returned (%d)\n", retval); + } retval = EINVAL; goto ErrorExit; } else if (hfsmp->jnl) { @@ -697,7 +794,7 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, /* * Establish a metadata allocation zone. */ - hfs_metadatazone_init(hfsmp); + hfs_metadatazone_init(hfsmp, false); /* * Make any metadata zone adjustments. 
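The DKIOCCSSETLVNAME path above first runs the volume name through utf8_normalizestr(..., UTF_PRECOMPOSED) before handing it to CoreStorage. HFS+ stores names in a decomposed form on disk, so the same visible name can have two different byte encodings. A small standalone illustration of that difference (the sample string is made up):

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char precomposed[] = "caf\xC3\xA9";   /* "café", U+00E9          */
	const char decomposed[]  = "cafe\xCC\x81";  /* "café", 'e' + U+0301    */

	/* Visually identical names, different lengths and byte sequences */
	printf("precomposed: %zu bytes, decomposed: %zu bytes, strcmp: %d\n",
	       strlen(precomposed), strlen(decomposed),
	       strcmp(precomposed, decomposed));
	return 0;
}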
@@ -726,8 +823,13 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp,
 
 	if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) {
 		retval = hfs_erase_unused_nodes(hfsmp);
-		if (retval)
+		if (retval) {
+			if (HFS_MOUNT_DEBUG) {
+				printf("hfs_mounthfsplus: hfs_erase_unused_nodes returned (%d) for %s \n", retval, hfsmp->vcbVN);
+			}
+			goto ErrorExit;
+		}
 	}
 
 	if ( !(vcb->vcbAtrb & kHFSVolumeHardwareLockMask) )	// if the disk is not write protected
@@ -739,30 +841,33 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp,
 	 * Allow hot file clustering if conditions allow.
 	 */
 	if ((hfsmp->hfs_flags & HFS_METADATA_ZONE) &&
-	    ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) &&
-	    ((hfsmp->hfs_mp->mnt_kern_flag & MNTK_SSD) == 0)) {
+	    ((hfsmp->hfs_flags & (HFS_READ_ONLY | HFS_SSD)) == 0)) {
 		(void) hfs_recording_init(hfsmp);
 	}
 
 	/* Force ACLs on HFS+ file systems. */
 	vfs_setextendedsecurity(HFSTOVFS(hfsmp));
 
-	/* Check if volume supports writing of extent-based extended attributes */
-	hfs_check_volxattr(hfsmp, HFS_SET_XATTREXTENTS_STATE);
+	/* Enable extent-based extended attributes by default */
+	hfsmp->hfs_flags |= HFS_XATTR_EXTENTS;
+
+	/* See if this volume should have per-file content protection enabled */
+	if (vcb->vcbAtrb & kHFSContentProtectionMask) {
+		vfs_setflags (hfsmp->hfs_mp, MNT_CPROTECT);
+	}
 
 	return (0);
 
 ErrorExit:
 	/*
-	 * A fatal error occurred and the volume cannot be mounted
-	 * release any resources that we aquired...
+	 * A fatal error occurred and the volume cannot be mounted, so
+	 * release any resources that we acquired...
 	 */
-	if (hfsmp->hfs_attribute_vp)
-		ReleaseMetaFileVNode(hfsmp->hfs_attribute_vp);
-	ReleaseMetaFileVNode(hfsmp->hfs_allocation_vp);
-	ReleaseMetaFileVNode(hfsmp->hfs_catalog_vp);
-	ReleaseMetaFileVNode(hfsmp->hfs_extents_vp);
-
+	hfsUnmount(hfsmp, NULL);
+
+	if (HFS_MOUNT_DEBUG) {
+		printf("hfs_mounthfsplus: encountered error (%d)\n", retval);
+	}
 	return (retval);
 }
 
@@ -797,44 +902,47 @@ static void ReleaseMetaFileVNode(struct vnode *vp)
 *
 *************************************************************/
 
-__private_extern__
 int
 hfsUnmount( register struct hfsmount *hfsmp, __unused struct proc *p)
 {
-	/* Get rid of our attribute data vnode (if any). */
+	/* Get rid of our attribute data vnode (if any).  This is done
+	 * after the vflush() during mount, so we don't need to worry
+	 * about any locks.
+	 */
 	if (hfsmp->hfs_attrdata_vp) {
-		vnode_t advp = hfsmp->hfs_attrdata_vp;
-
-		if (vnode_get(advp) == 0) {
-			vnode_rele_ext(advp, O_EVTONLY, 0);
-			vnode_put(advp);
-		}
+		ReleaseMetaFileVNode(hfsmp->hfs_attrdata_vp);
 		hfsmp->hfs_attrdata_vp = NULLVP;
 	}
 
-	if (hfsmp->hfs_startup_vp)
+	if (hfsmp->hfs_startup_vp) {
 		ReleaseMetaFileVNode(hfsmp->hfs_startup_vp);
-
-	if (hfsmp->hfs_allocation_vp)
-		ReleaseMetaFileVNode(hfsmp->hfs_allocation_vp);
-
-	if (hfsmp->hfs_attribute_vp)
+		hfsmp->hfs_startup_cp = NULL;
+		hfsmp->hfs_startup_vp = NULL;
+	}
+	
+	if (hfsmp->hfs_attribute_vp) {
 		ReleaseMetaFileVNode(hfsmp->hfs_attribute_vp);
+		hfsmp->hfs_attribute_cp = NULL;
+		hfsmp->hfs_attribute_vp = NULL;
+	}
 
-	ReleaseMetaFileVNode(hfsmp->hfs_catalog_vp);
-	ReleaseMetaFileVNode(hfsmp->hfs_extents_vp);
+	if (hfsmp->hfs_catalog_vp) {
+		ReleaseMetaFileVNode(hfsmp->hfs_catalog_vp);
+		hfsmp->hfs_catalog_cp = NULL;
+		hfsmp->hfs_catalog_vp = NULL;
+	}
 
-	/*
-	 * Setting these pointers to NULL so that any references
-	 * past this point will fail, and tell us the point of failure.
- * Also, facilitates a check in hfs_update for a null catalog - * vp - */ - hfsmp->hfs_allocation_vp = NULL; - hfsmp->hfs_attribute_vp = NULL; - hfsmp->hfs_catalog_vp = NULL; - hfsmp->hfs_extents_vp = NULL; - hfsmp->hfs_startup_vp = NULL; + if (hfsmp->hfs_extents_vp) { + ReleaseMetaFileVNode(hfsmp->hfs_extents_vp); + hfsmp->hfs_extents_cp = NULL; + hfsmp->hfs_extents_vp = NULL; + } + + if (hfsmp->hfs_allocation_vp) { + ReleaseMetaFileVNode(hfsmp->hfs_allocation_vp); + hfsmp->hfs_allocation_cp = NULL; + hfsmp->hfs_allocation_vp = NULL; + } return (0); } @@ -880,11 +988,56 @@ overflow_extents(struct filefork *fp) return (fp->ff_blocks > blocks); } +/* + * Lock the HFS global journal lock + */ +int +hfs_lock_global (struct hfsmount *hfsmp, enum hfslocktype locktype) { + + void *thread = current_thread(); + + if (hfsmp->hfs_global_lockowner == thread) { + panic ("hfs_lock_global: locking against myself!"); + } + + /* HFS_SHARED_LOCK */ + if (locktype == HFS_SHARED_LOCK) { + lck_rw_lock_shared (&hfsmp->hfs_global_lock); + hfsmp->hfs_global_lockowner = HFS_SHARED_OWNER; + } + /* HFS_EXCLUSIVE_LOCK */ + else { + lck_rw_lock_exclusive (&hfsmp->hfs_global_lock); + hfsmp->hfs_global_lockowner = thread; + } + + return 0; +} + + +/* + * Unlock the HFS global journal lock + */ +void +hfs_unlock_global (struct hfsmount *hfsmp) { + + void *thread = current_thread(); + + /* HFS_LOCK_EXCLUSIVE */ + if (hfsmp->hfs_global_lockowner == thread) { + hfsmp->hfs_global_lockowner = NULL; + lck_rw_unlock_exclusive (&hfsmp->hfs_global_lock); + } + /* HFS_LOCK_SHARED */ + else { + lck_rw_unlock_shared (&hfsmp->hfs_global_lock); + } +} + /* * Lock HFS system file(s). */ -__private_extern__ int hfs_systemfile_lock(struct hfsmount *hfsmp, int flags, enum hfslocktype locktype) { @@ -905,7 +1058,12 @@ hfs_systemfile_lock(struct hfsmount *hfsmp, int flags, enum hfslocktype locktype } #endif /* HFS_CHECK_LOCK_ORDER */ - (void) hfs_lock(hfsmp->hfs_catalog_cp, locktype); + if (hfsmp->hfs_catalog_cp) { + (void) hfs_lock(hfsmp->hfs_catalog_cp, locktype); + } else { + flags &= ~SFL_CATALOG; + } + /* * When the catalog file has overflow extents then * also acquire the extents b-tree lock if its not @@ -949,7 +1107,12 @@ hfs_systemfile_lock(struct hfsmount *hfsmp, int flags, enum hfslocktype locktype } #endif /* HFS_CHECK_LOCK_ORDER */ - (void) hfs_lock(hfsmp->hfs_startup_cp, locktype); + if (hfsmp->hfs_startup_cp) { + (void) hfs_lock(hfsmp->hfs_startup_cp, locktype); + } else { + flags &= ~SFL_STARTUP; + } + /* * When the startup file has overflow extents then * also acquire the extents b-tree lock if its not @@ -966,17 +1129,14 @@ hfs_systemfile_lock(struct hfsmount *hfsmp, int flags, enum hfslocktype locktype */ if (flags & (SFL_BITMAP | SFL_EXTENTS)) { /* - * Since the only bitmap operations are clearing and - * setting bits we always need exclusive access. And - * when we have a journal, we can "hide" behind that - * lock since we can only change the bitmap from - * within a transaction. + * If there's no bitmap cnode, ignore the bitmap lock. */ - if (hfsmp->jnl || (hfsmp->hfs_allocation_cp == NULL)) { + if (hfsmp->hfs_allocation_cp == NULL) { flags &= ~SFL_BITMAP; } else { (void) hfs_lock(hfsmp->hfs_allocation_cp, HFS_EXCLUSIVE_LOCK); - /* The bitmap lock is also grabbed when only extent lock + /* + * The bitmap lock is also grabbed when only extent lock * was requested. Set the bitmap lock bit in the lock * flags which callers will use during unlock. 
*/ @@ -988,7 +1148,11 @@ hfs_systemfile_lock(struct hfsmount *hfsmp, int flags, enum hfslocktype locktype * Since the extents btree lock is recursive we always * need exclusive access. */ - (void) hfs_lock(hfsmp->hfs_extents_cp, HFS_EXCLUSIVE_LOCK); + if (hfsmp->hfs_extents_cp) { + (void) hfs_lock(hfsmp->hfs_extents_cp, HFS_EXCLUSIVE_LOCK); + } else { + flags &= ~SFL_EXTENTS; + } } return (flags); } @@ -996,7 +1160,6 @@ hfs_systemfile_lock(struct hfsmount *hfsmp, int flags, enum hfslocktype locktype /* * unlock HFS system file(s). */ -__private_extern__ void hfs_systemfile_unlock(struct hfsmount *hfsmp, int flags) { @@ -1023,7 +1186,7 @@ hfs_systemfile_unlock(struct hfsmount *hfsmp, int flags) } hfs_unlock(hfsmp->hfs_attribute_cp); } - if (flags & SFL_CATALOG) { + if (flags & SFL_CATALOG && hfsmp->hfs_catalog_cp) { if (hfsmp->jnl == NULL) { BTGetLastSync((FCB*)VTOF(hfsmp->hfs_catalog_vp), &lastfsync); numOfLockedBuffs = count_lock_queue(); @@ -1035,10 +1198,10 @@ hfs_systemfile_unlock(struct hfsmount *hfsmp, int flags) } hfs_unlock(hfsmp->hfs_catalog_cp); } - if (flags & SFL_BITMAP) { + if (flags & SFL_BITMAP && hfsmp->hfs_allocation_cp) { hfs_unlock(hfsmp->hfs_allocation_cp); } - if (flags & SFL_EXTENTS) { + if (flags & SFL_EXTENTS && hfsmp->hfs_extents_cp) { if (hfsmp->jnl == NULL) { BTGetLastSync((FCB*)VTOF(hfsmp->hfs_extents_vp), &lastfsync); numOfLockedBuffs = count_lock_queue(); @@ -1168,7 +1331,6 @@ u_int32_t BestBlockSizeFit(u_int32_t allocationBlockSize, } -__private_extern__ u_int32_t GetFileInfo(ExtendedVCB *vcb, __unused u_int32_t dirid, const char *name, struct cat_attr *fattr, struct cat_fork *forkinfo) @@ -1208,7 +1370,6 @@ GetFileInfo(ExtendedVCB *vcb, __unused u_int32_t dirid, const char *name, * If the volume was not cleanly unmounted then some of these may * have persisted and need to be removed. 
*/ -__private_extern__ void hfs_remove_orphans(struct hfsmount * hfsmp) { @@ -1286,8 +1447,9 @@ hfs_remove_orphans(struct hfsmount * hfsmp) */ if (bcmp(tempname, filename, namelen) == 0) { struct filefork dfork; - struct filefork rfork; + struct filefork rfork; struct cnode cnode; + int mode = 0; bzero(&dfork, sizeof(dfork)); bzero(&rfork, sizeof(rfork)); @@ -1344,8 +1506,10 @@ hfs_remove_orphans(struct hfsmount * hfsmp) fsize = 0; } - if (TruncateFileC(vcb, (FCB*)&dfork, fsize, false) != 0) { - printf("hfs: error truncting data fork!\n"); + if (TruncateFileC(vcb, (FCB*)&dfork, fsize, 1, 0, + cnode.c_attr.ca_fileid, false) != 0) { + printf("hfs: error truncating data fork!\n"); + break; } @@ -1376,8 +1540,8 @@ hfs_remove_orphans(struct hfsmount * hfsmp) rfork.ff_cp = &cnode; cnode.c_datafork = NULL; cnode.c_rsrcfork = &rfork; - if (TruncateFileC(vcb, (FCB*)&rfork, 0, false) != 0) { - printf("hfs: error truncting rsrc fork!\n"); + if (TruncateFileC(vcb, (FCB*)&rfork, 0, 1, 1, cnode.c_attr.ca_fileid, false) != 0) { + printf("hfs: error truncating rsrc fork!\n"); break; } } @@ -1391,7 +1555,9 @@ hfs_remove_orphans(struct hfsmount * hfsmp) break; } - if (cnode.c_attr.ca_mode & S_IFDIR) { + mode = cnode.c_attr.ca_mode & S_IFMT; + + if (mode == S_IFDIR) { orphaned_dirs++; } else { @@ -1400,7 +1566,7 @@ hfs_remove_orphans(struct hfsmount * hfsmp) /* Update parent and volume counts */ hfsmp->hfs_private_attr[FILE_HARDLINKS].ca_entries--; - if (cnode.c_attr.ca_mode & S_IFDIR) { + if (mode == S_IFDIR) { DEC_FOLDERCOUNT(hfsmp, hfsmp->hfs_private_attr[FILE_HARDLINKS]); } @@ -1416,7 +1582,7 @@ hfs_remove_orphans(struct hfsmount * hfsmp) Now that Catalog is unlocked, update the volume info, making sure to differentiate between files and directories */ - if (cnode.c_attr.ca_mode & S_IFDIR) { + if (mode == S_IFDIR) { hfs_volupdate(hfsmp, VOL_RMDIR, 0); } else{ @@ -1489,7 +1655,6 @@ u_int32_t logBlockSize; return logBlockSize; } -__private_extern__ u_int32_t hfs_freeblks(struct hfsmount * hfsmp, int wantreserve) { @@ -1517,7 +1682,7 @@ hfs_freeblks(struct hfsmount * hfsmp, int wantreserve) else freeblks = 0; -#ifdef HFS_SPARSE_DEV +#if HFS_SPARSE_DEV /* * When the underlying device is sparse, check the * available space on the backing store volume. @@ -1841,7 +2006,7 @@ journal_open_cb(const char *bsd_dev_name, const char *uuid_str, void *arg) // desired uuid so let's try to open the device for writing and // see if it works. if it does, we'll use it. 
- NDINIT(&nd, LOOKUP, LOCKLEAF, UIO_SYSSPACE32, CAST_USER_ADDR_T(bsd_name), vfs_context_kernel()); + NDINIT(&nd, LOOKUP, OP_LOOKUP, LOCKLEAF, UIO_SYSSPACE32, CAST_USER_ADDR_T(bsd_name), vfs_context_kernel()); if ((error = namei(&nd))) { printf("hfs: journal open cb: error %d looking up device %s (dev uuid %s)\n", error, bsd_name, uuid_str); return 1; // keep iterating @@ -1888,7 +2053,6 @@ journal_open_cb(const char *bsd_dev_name, const char *uuid_str, void *arg) extern dev_t IOBSDGetMediaWithUUID(const char *uuid_cstring, char *bsd_name, int bsd_name_len, int timeout); extern void IOBSDIterateMediaWithContent(const char *uuid_cstring, int (*func)(const char *bsd_dev_name, const char *uuid_str, void *arg), void *arg); -extern kern_return_t IOBSDGetPlatformUUID(__darwin_uuid_t uuid, mach_timespec_t timeoutp); kern_return_t IOBSDGetPlatformSerialNumber(char *serial_number_str, u_int32_t len); @@ -1940,7 +2104,6 @@ open_journal_dev(const char *vol_device, } -__private_extern__ int hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_args, off_t embeddedOffset, daddr64_t mdb_offset, @@ -2063,6 +2226,8 @@ hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, arg_flags, arg_tbufsz, hfs_sync_metadata, hfsmp->hfs_mp); + if (hfsmp->jnl) + journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp); // no need to start a transaction here... if this were to fail // we'd just re-init it on the next mount. @@ -2084,6 +2249,8 @@ hfs_early_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, arg_flags, arg_tbufsz, hfs_sync_metadata, hfsmp->hfs_mp); + if (hfsmp->jnl) + journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp); if (write_jibp) { buf_bwrite(jinfo_bp); @@ -2323,6 +2490,8 @@ hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_a arg_flags, arg_tbufsz, hfs_sync_metadata, hfsmp->hfs_mp); + if (hfsmp->jnl) + journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp); // no need to start a transaction here... if this were to fail // we'd just re-init it on the next mount. @@ -2352,6 +2521,8 @@ hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_a arg_flags, arg_tbufsz, hfs_sync_metadata, hfsmp->hfs_mp); + if (hfsmp->jnl) + journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp); } @@ -2408,8 +2579,15 @@ hfs_late_journal_init(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, void *_a #define HOTBAND_MINIMUM_SIZE (10*1024*1024) #define HOTBAND_MAXIMUM_SIZE (512*1024*1024) +/* Initialize the metadata zone. + * + * If the size of the volume is less than the minimum size for + * metadata zone, metadata zone is disabled. + * + * If disable is true, disable metadata zone unconditionally. + */ void -hfs_metadatazone_init(struct hfsmount *hfsmp) +hfs_metadatazone_init(struct hfsmount *hfsmp, int disable) { ExtendedVCB *vcb; u_int64_t fs_size; @@ -2436,6 +2614,11 @@ hfs_metadatazone_init(struct hfsmount *hfsmp) really_do_it = 0; } + /* If caller wants to disable metadata zone, do it */ + if (disable == true) { + really_do_it = 0; + } + /* * Start with space for the boot blocks and Volume Header. * 1536 = byte offset from start of volume to end of volume header: @@ -2626,7 +2809,6 @@ hfs_hotfile_freeblocks(struct hfsmount *hfsmp) * Determine if a file is a "virtual" metadata file. * This includes journal and quota files. 
*/ -__private_extern__ int hfs_virtualmetafile(struct cnode *cp) { @@ -2698,7 +2880,6 @@ hfs_sync_ejectable(struct hfsmount *hfsmp) } -__private_extern__ int hfs_start_transaction(struct hfsmount *hfsmp) { @@ -2723,11 +2904,11 @@ hfs_start_transaction(struct hfsmount *hfsmp) } #endif /* HFS_CHECK_LOCK_ORDER */ - if (hfsmp->jnl == NULL || journal_owner(hfsmp->jnl) != thread) { - lck_rw_lock_shared(&hfsmp->hfs_global_lock); - OSAddAtomic(1, (SInt32 *)&hfsmp->hfs_active_threads); - unlock_on_err = 1; - } + if (hfsmp->jnl == NULL || journal_owner(hfsmp->jnl) != thread) { + hfs_lock_global (hfsmp, HFS_SHARED_LOCK); + OSAddAtomic(1, (SInt32 *)&hfsmp->hfs_active_threads); + unlock_on_err = 1; + } /* If a downgrade to read-only mount is in progress, no other * process than the downgrade process is allowed to modify @@ -2739,67 +2920,89 @@ hfs_start_transaction(struct hfsmount *hfsmp) goto out; } - if (hfsmp->jnl) { - ret = journal_start_transaction(hfsmp->jnl); - if (ret == 0) { - OSAddAtomic(1, &hfsmp->hfs_global_lock_nesting); + if (hfsmp->jnl) { + ret = journal_start_transaction(hfsmp->jnl); + if (ret == 0) { + OSAddAtomic(1, &hfsmp->hfs_global_lock_nesting); + } + } else { + ret = 0; } - } else { - ret = 0; - } out: - if (ret != 0 && unlock_on_err) { - lck_rw_unlock_shared(&hfsmp->hfs_global_lock); - OSAddAtomic(-1, (SInt32 *)&hfsmp->hfs_active_threads); - } + if (ret != 0 && unlock_on_err) { + hfs_unlock_global (hfsmp); + OSAddAtomic(-1, (SInt32 *)&hfsmp->hfs_active_threads); + } return ret; } -__private_extern__ int hfs_end_transaction(struct hfsmount *hfsmp) { int need_unlock=0, ret; - if ( hfsmp->jnl == NULL - || ( journal_owner(hfsmp->jnl) == current_thread() + if ((hfsmp->jnl == NULL) || ( journal_owner(hfsmp->jnl) == current_thread() && (OSAddAtomic(-1, &hfsmp->hfs_global_lock_nesting) == 1)) ) { - need_unlock = 1; } - if (hfsmp->jnl) { - ret = journal_end_transaction(hfsmp->jnl); - } else { - ret = 0; - } + if (hfsmp->jnl) { + ret = journal_end_transaction(hfsmp->jnl); + } else { + ret = 0; + } - if (need_unlock) { - OSAddAtomic(-1, (SInt32 *)&hfsmp->hfs_active_threads); - lck_rw_unlock_shared(&hfsmp->hfs_global_lock); - hfs_sync_ejectable(hfsmp); - } + if (need_unlock) { + OSAddAtomic(-1, (SInt32 *)&hfsmp->hfs_active_threads); + hfs_unlock_global (hfsmp); + hfs_sync_ejectable(hfsmp); + } return ret; } -__private_extern__ +/* + * Flush the contents of the journal to the disk. + * + * Input: + * wait_for_IO - + * If TRUE, wait to write in-memory journal to the disk + * consistently, and also wait to write all asynchronous + * metadata blocks to its corresponding locations + * consistently on the disk. This means that the journal + * is empty at this point and does not contain any + * transactions. This is overkill in normal scenarios + * but is useful whenever the metadata blocks are required + * to be consistent on-disk instead of just the journal + * being consistent; like before live verification + * and live volume resizing. + * + * If FALSE, only wait to write in-memory journal to the + * disk consistently. This means that the journal still + * contains uncommitted transactions and the file system + * metadata blocks in the journal transactions might be + * written asynchronously to the disk. But there is no + * guarantee that they are written to the disk before + * returning to the caller. Note that this option is + * sufficient for file system data integrity as it + * guarantees consistent journal content on the disk. 
+ */ int -hfs_journal_flush(struct hfsmount *hfsmp) +hfs_journal_flush(struct hfsmount *hfsmp, boolean_t wait_for_IO) { int ret; - + /* Only peek at hfsmp->jnl while holding the global lock */ - lck_rw_lock_shared(&hfsmp->hfs_global_lock); + hfs_lock_global (hfsmp, HFS_SHARED_LOCK); if (hfsmp->jnl) { - ret = journal_flush(hfsmp->jnl); + ret = journal_flush(hfsmp->jnl, wait_for_IO); } else { ret = 0; } - lck_rw_unlock_shared(&hfsmp->hfs_global_lock); + hfs_unlock_global (hfsmp); return ret; } @@ -2824,7 +3027,6 @@ hfs_journal_flush(struct hfsmount *hfsmp) * unused nodes have been repaired. A newer newfs_hfs will set this bit. * As will fsck_hfs when it repairs the unused nodes. */ -__private_extern__ int hfs_erase_unused_nodes(struct hfsmount *hfsmp) { int result; @@ -2877,3 +3079,92 @@ int hfs_erase_unused_nodes(struct hfsmount *hfsmp) done: return result; } + + +extern time_t snapshot_timestamp; + +int +check_for_tracked_file(struct vnode *vp, time_t ctime, uint64_t op_type, void *arg) +{ + int tracked_error = 0, snapshot_error = 0; + + if (vp == NULL) { + return 0; + } + + if (VTOC(vp)->c_flags & UF_TRACKED) { + // the file has the tracked bit set, so send an event to the tracked-file handler + int error; + + // printf("hfs: tracked-file: encountered a file with the tracked bit set! (vp %p)\n", vp); + error = resolve_nspace_item(vp, op_type | NAMESPACE_HANDLER_TRACK_EVENT); + if (error) { + if (error == EAGAIN) { + printf("hfs: tracked-file: timed out waiting for namespace handler...\n"); + + } else if (error == EINTR) { + // printf("hfs: tracked-file: got a signal while waiting for namespace handler...\n"); + tracked_error = EINTR; + } + } + } + + if (ctime != 0 && snapshot_timestamp != 0 && (ctime <= snapshot_timestamp || vnode_needssnapshots(vp))) { + // the change time is within this epoch + int error; + + error = resolve_nspace_item_ext(vp, op_type | NAMESPACE_HANDLER_SNAPSHOT_EVENT, arg); + if (error == EDEADLK) { + snapshot_error = 0; + } else if (error) { + if (error == EAGAIN) { + printf("hfs: cow-snapshot: timed out waiting for namespace handler...\n"); + } else if (error == EINTR) { + // printf("hfs: cow-snapshot: got a signal while waiting for namespace handler...\n"); + snapshot_error = EINTR; + } + } + } + + if (tracked_error) return tracked_error; + if (snapshot_error) return snapshot_error; + + return 0; +} + +int +check_for_dataless_file(struct vnode *vp, uint64_t op_type) +{ + int error; + + if (vp == NULL || (VTOC(vp)->c_flags & UF_COMPRESSED) == 0 || VTOCMP(vp) == NULL || VTOCMP(vp)->cmp_type != DATALESS_CMPFS_TYPE) { + // there's nothing to do, it's not dataless + return 0; + } + + // printf("hfs: dataless: encountered a file with the dataless bit set! (vp %p)\n", vp); + error = resolve_nspace_item(vp, op_type | NAMESPACE_HANDLER_NSPACE_EVENT); + if (error == EDEADLK && op_type == NAMESPACE_HANDLER_WRITE_OP) { + error = 0; + } else if (error) { + if (error == EAGAIN) { + printf("hfs: dataless: timed out waiting for namespace handler...\n"); + // XXXdbg - return the fabled ENOTPRESENT (i.e. EJUKEBOX)? + return 0; + } else if (error == EINTR) { + // printf("hfs: dataless: got a signal while waiting for namespace handler...\n"); + return EINTR; + } + } else if (VTOC(vp)->c_flags & UF_COMPRESSED) { + // + // if we're here, the dataless bit is still set on the file + // which means it didn't get handled. we return an error + // but it's presently ignored by all callers of this function. + // + // XXXdbg - EDATANOTPRESENT is what we really need... 
+ // + return EBADF; + } + + return error; +} diff --git a/bsd/hfs/hfs_vnops.c b/bsd/hfs/hfs_vnops.c index 49973d29c..4c526f77b 100644 --- a/bsd/hfs/hfs_vnops.c +++ b/bsd/hfs/hfs_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -45,6 +45,10 @@ #include <sys/disk.h> #include <sys/kauth.h> #include <sys/uio_internal.h> +#include <sys/fsctl.h> +#include <sys/cprotect.h> + +#include <string.h> #include <miscfs/specfs/specdev.h> #include <miscfs/fifofs/fifo.h> @@ -72,20 +76,23 @@ /* Always F_FULLFSYNC? 1=yes,0=no (default due to "various" reasons is 'no') */ int always_do_fullfsync = 0; SYSCTL_DECL(_vfs_generic); -SYSCTL_INT (_vfs_generic, OID_AUTO, always_do_fullfsync, CTLFLAG_RW, &always_do_fullfsync, 0, "always F_FULLFSYNC when fsync is called"); +SYSCTL_INT (_vfs_generic, OID_AUTO, always_do_fullfsync, CTLFLAG_RW | CTLFLAG_LOCKED, &always_do_fullfsync, 0, "always F_FULLFSYNC when fsync is called"); -static int hfs_makenode(struct vnode *dvp, struct vnode **vpp, +int hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx); +int hfs_metasync(struct hfsmount *hfsmp, daddr64_t node, __unused struct proc *p); +int hfs_metasync_all(struct hfsmount *hfsmp); -static int hfs_metasync(struct hfsmount *hfsmp, daddr64_t node, __unused struct proc *p); -static int hfs_metasync_all(struct hfsmount *hfsmp); +int hfs_removedir(struct vnode *, struct vnode *, struct componentname *, + int, int); +int hfs_removefile(struct vnode *, struct vnode *, struct componentname *, + int, int, int, struct vnode *, int); -static int hfs_removedir(struct vnode *, struct vnode *, struct componentname *, - int); +int hfs_movedata (struct vnode *, struct vnode*); +static int hfs_move_fork (struct filefork *srcfork, struct cnode *src, + struct filefork *dstfork, struct cnode *dst); -static int hfs_removefile(struct vnode *, struct vnode *, struct componentname *, - int, int, int, struct vnode *); #if FIFO static int hfsfifo_read(struct vnop_read_args *); @@ -95,26 +102,27 @@ static int hfsfifo_close(struct vnop_close_args *); extern int (**fifo_vnodeop_p)(void *); #endif /* FIFO */ -static int hfs_vnop_close(struct vnop_close_args*); -static int hfs_vnop_create(struct vnop_create_args*); -static int hfs_vnop_exchange(struct vnop_exchange_args*); -static int hfs_vnop_fsync(struct vnop_fsync_args*); -static int hfs_vnop_mkdir(struct vnop_mkdir_args*); -static int hfs_vnop_mknod(struct vnop_mknod_args*); -static int hfs_vnop_getattr(struct vnop_getattr_args*); -static int hfs_vnop_open(struct vnop_open_args*); -static int hfs_vnop_readdir(struct vnop_readdir_args*); -static int hfs_vnop_remove(struct vnop_remove_args*); -static int hfs_vnop_rename(struct vnop_rename_args*); -static int hfs_vnop_rmdir(struct vnop_rmdir_args*); -static int hfs_vnop_symlink(struct vnop_symlink_args*); -static int hfs_vnop_setattr(struct vnop_setattr_args*); -static int hfs_vnop_readlink(struct vnop_readlink_args *); -static int hfs_vnop_pathconf(struct vnop_pathconf_args *); -static int hfs_vnop_whiteout(struct vnop_whiteout_args *); -static int hfsspec_read(struct vnop_read_args *); -static int hfsspec_write(struct vnop_write_args *); -static int hfsspec_close(struct vnop_close_args *); +int hfs_vnop_close(struct vnop_close_args*); +int hfs_vnop_create(struct vnop_create_args*); +int hfs_vnop_exchange(struct vnop_exchange_args*); +int 
hfs_vnop_fsync(struct vnop_fsync_args*); +int hfs_vnop_mkdir(struct vnop_mkdir_args*); +int hfs_vnop_mknod(struct vnop_mknod_args*); +int hfs_vnop_getattr(struct vnop_getattr_args*); +int hfs_vnop_open(struct vnop_open_args*); +int hfs_vnop_readdir(struct vnop_readdir_args*); +int hfs_vnop_remove(struct vnop_remove_args*); +int hfs_vnop_rename(struct vnop_rename_args*); +int hfs_vnop_rmdir(struct vnop_rmdir_args*); +int hfs_vnop_symlink(struct vnop_symlink_args*); +int hfs_vnop_setattr(struct vnop_setattr_args*); +int hfs_vnop_readlink(struct vnop_readlink_args *); +int hfs_vnop_pathconf(struct vnop_pathconf_args *); +int hfs_vnop_whiteout(struct vnop_whiteout_args *); +int hfs_vnop_mmap(struct vnop_mmap_args *ap); +int hfsspec_read(struct vnop_read_args *); +int hfsspec_write(struct vnop_write_args *); +int hfsspec_close(struct vnop_close_args *); /* Options for hfs_removedir and hfs_removefile */ #define HFSRM_SKIP_RESERVE 0x01 @@ -131,7 +139,7 @@ static int hfsspec_close(struct vnop_close_args *); /* * Create a regular file. */ -static int +int hfs_vnop_create(struct vnop_create_args *ap) { int error; @@ -164,6 +172,7 @@ again: /* Make sure it was file. */ if ((error == 0) && !vnode_isreg(*args.a_vpp)) { vnode_put(*args.a_vpp); + *args.a_vpp = NULLVP; error = EEXIST; } args.a_cnp->cn_nameiop = CREATE; @@ -174,7 +183,7 @@ again: /* * Make device special file. */ -static int +int hfs_vnop_mknod(struct vnop_mknod_args *ap) { struct vnode_attr *vap = ap->a_vap; @@ -245,7 +254,7 @@ hfs_ref_data_vp(struct cnode *cp, struct vnode **data_vp, int skiplock) return EINVAL; } - if (0 == hfs_vget(VTOHFS(cp->c_rsrc_vp), cp->c_cnid, data_vp, 1) && + if (0 == hfs_vget(VTOHFS(cp->c_rsrc_vp), cp->c_cnid, data_vp, 1, 0) && 0 != data_vp) { vref = vnode_ref(*data_vp); vnode_put(*data_vp); @@ -334,6 +343,8 @@ hfs_file_is_compressed(struct cnode *cp, int skiplock) * if the caller has passed a valid vnode (has a ref count > 0), then hfsmp and fid are not required. * if the caller doesn't have a vnode, pass NULL in vp, and pass valid hfsmp and fid. * files size is returned in size (required) + * if the indicated file is a directory (or something that doesn't have a data fork), then this call + * will return an error and the caller should fall back to treating the item as an uncompressed file */ int hfs_uncompressed_size_of_compressed_file(struct hfsmount *hfsmp, struct vnode *vp, cnid_t fid, off_t *size, int skiplock) @@ -349,7 +360,7 @@ hfs_uncompressed_size_of_compressed_file(struct hfsmount *hfsmp, struct vnode *v if (!hfsmp || !fid) { /* make sure we have the required parameters */ return EINVAL; } - if (0 != hfs_vget(hfsmp, fid, &vp, skiplock)) { /* vnode is null, use hfs_vget() to get it */ + if (0 != hfs_vget(hfsmp, fid, &vp, skiplock, 0)) { /* vnode is null, use hfs_vget() to get it */ vp = NULL; } else { putaway = 1; /* note that hfs_vget() was used to aquire the vnode */ @@ -359,10 +370,27 @@ hfs_uncompressed_size_of_compressed_file(struct hfsmount *hfsmp, struct vnode *v * ensures the cached size is present in case decmpfs hasn't * encountered this node yet. 
*/ - if ( ( NULL != vp ) && hfs_file_is_compressed(VTOC(vp), skiplock) ) { - *size = decmpfs_cnode_get_vnode_cached_size(VTOCMP(vp)); /* file info will be cached now, so get size */ - } else { - ret = EINVAL; + if (vp) { + if (hfs_file_is_compressed(VTOC(vp), skiplock) ) { + *size = decmpfs_cnode_get_vnode_cached_size(VTOCMP(vp)); /* file info will be cached now, so get size */ + } else { + if (VTOCMP(vp) && VTOCMP(vp)->cmp_type >= CMP_MAX) { + if (VTOCMP(vp)->cmp_type != DATALESS_CMPFS_TYPE) { + // if we don't recognize this type, just use the real data fork size + if (VTOC(vp)->c_datafork) { + *size = VTOC(vp)->c_datafork->ff_size; + ret = 0; + } else { + ret = EINVAL; + } + } else { + *size = decmpfs_cnode_get_vnode_cached_size(VTOCMP(vp)); /* file info will be cached now, so get size */ + ret = 0; + } + } else { + ret = EINVAL; + } + } } if (putaway) { /* did we use hfs_vget() to get this vnode? */ @@ -396,7 +424,7 @@ hfs_hides_xattr(vfs_context_t ctx, struct cnode *cp, const char *name, int skipl /* * Open a file/directory. */ -static int +int hfs_vnop_open(struct vnop_open_args *ap) { struct vnode *vp = ap->a_vp; @@ -516,7 +544,7 @@ hfs_vnop_open(struct vnop_open_args *ap) /* * Close a file/directory. */ -static int +int hfs_vnop_close(ap) struct vnop_close_args /* { struct vnode *a_vp; @@ -559,11 +587,11 @@ hfs_vnop_close(ap) // release cnode lock; must acquire truncate lock BEFORE cnode lock hfs_unlock(cp); - hfs_lock_truncate(cp, TRUE); + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK); tooktrunclock = 1; if (hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK) != 0) { - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); // bail out if we can't re-acquire cnode lock return 0; } @@ -585,8 +613,8 @@ hfs_vnop_close(ap) // if we froze the fs and we're exiting, then "thaw" the fs if (hfsmp->hfs_freezing_proc == p && proc_exiting(p)) { hfsmp->hfs_freezing_proc = NULL; - hfs_global_exclusive_lock_release(hfsmp); - lck_rw_unlock_exclusive(&hfsmp->hfs_insync); + hfs_unlock_global (hfsmp); + lck_rw_unlock_exclusive(&hfsmp->hfs_insync); } busy = vnode_isinuse(vp, 1); @@ -601,7 +629,7 @@ hfs_vnop_close(ap) } if (tooktrunclock){ - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); } hfs_unlock(cp); @@ -615,7 +643,7 @@ hfs_vnop_close(ap) /* * Get basic attributes. 
*/ -static int +int hfs_vnop_getattr(struct vnop_getattr_args *ap) { #define VNODE_ATTR_TIMES \ @@ -648,10 +676,16 @@ hfs_vnop_getattr(struct vnop_getattr_args *ap) /* if it's a data fork, we need to know if it was compressed so we can report the uncompressed size */ compressed = hfs_file_is_compressed(cp, 0); } - if (compressed && (VATTR_IS_ACTIVE(vap, va_data_size) || VATTR_IS_ACTIVE(vap, va_total_size))) { - if (0 != hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0)) { - /* failed to get the uncompressed size, we'll check for this later */ - uncompressed_size = -1; + if ((VATTR_IS_ACTIVE(vap, va_data_size) || VATTR_IS_ACTIVE(vap, va_total_size))) { + // if it's compressed + if (compressed || (!VNODE_IS_RSRC(vp) && cp->c_decmp && cp->c_decmp->cmp_type >= CMP_MAX)) { + if (0 != hfs_uncompressed_size_of_compressed_file(NULL, vp, 0, &uncompressed_size, 0)) { + /* failed to get the uncompressed size, we'll check for this later */ + uncompressed_size = -1; + } else { + // fake that it's compressed + compressed = 1; + } } } } @@ -812,13 +846,17 @@ hfs_vnop_getattr(struct vnop_getattr_args *ap) if (cp->c_blocks - VTOF(vp)->ff_blocks) { /* We deal with rsrc fork vnode iocount at the end of the function */ - error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, TRUE); + error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, FALSE); if (error) { - /* - * hfs_vgetrsrc may have returned a vnode in rvp even though - * we got an error, because we specified error_on_unlinked. - * We need to drop the iocount after we release the cnode lock, so - * it will be taken care of at the end of the function if it's needed. + /* + * Note that we call hfs_vgetrsrc with error_on_unlinked + * set to FALSE. This is because we may be invoked via + * fstat() on an open-unlinked file descriptor and we must + * continue to support access to the rsrc fork until it disappears. + * The code at the end of this function will be + * responsible for releasing the iocount generated by + * hfs_vgetrsrc. This is because we can't drop the iocount + * without unlocking the cnode first. */ goto out; } @@ -876,6 +914,17 @@ hfs_vnop_getattr(struct vnop_getattr_args *ap) vap->va_backup_time.tv_sec = cp->c_btime; vap->va_backup_time.tv_nsec = 0; + /* See if we need to emit the date added field to the user */ + if (VATTR_IS_ACTIVE(vap, va_addedtime)) { + u_int32_t dateadded = hfs_get_dateadded (cp); + if (dateadded) { + vap->va_addedtime.tv_sec = dateadded; + vap->va_addedtime.tv_nsec = 0; + VATTR_SET_SUPPORTED (vap, va_addedtime); + } + } + + /* XXX is this really a good 'optimal I/O size'? */ vap->va_iosize = hfsmp->hfs_logBlockSize; vap->va_uid = cp->c_uid; @@ -972,7 +1021,7 @@ hfs_vnop_getattr(struct vnop_getattr_args *ap) * have an open-unlinked file. Go to the next link in this case. 
*/ if ((cp->c_desc.cd_namelen == 0) && (vap->va_linkid == cp->c_fileid)) { - if ((error = hfs_lookuplink(hfsmp, vap->va_linkid, &prevlinkid, &nextlinkid))){ + if ((error = hfs_lookup_siblinglinks(hfsmp, vap->va_linkid, &prevlinkid, &nextlinkid))){ goto out; } } @@ -1029,7 +1078,7 @@ out: return (error); } -static int +int hfs_vnop_setattr(ap) struct vnop_setattr_args /* { struct vnode *a_vp; @@ -1046,7 +1095,10 @@ hfs_vnop_setattr(ap) int error = 0; uid_t nuid; gid_t ngid; + time_t orig_ctime; + orig_ctime = VTOC(vp)->c_ctime; + #if HFS_COMPRESSION int decmpfs_reset_state = 0; /* @@ -1056,8 +1108,23 @@ hfs_vnop_setattr(ap) error = decmpfs_update_attributes(vp, vap); if (error) return error; + + // + // if this is not a size-changing setattr and it is not just + // an atime update, then check for a snapshot. + // + if (!VATTR_IS_ACTIVE(vap, va_data_size) && !(vap->va_active == VNODE_ATTR_va_access_time)) { + check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_METADATA_MOD, NULL); + } #endif + +#if CONFIG_PROTECT + if ((error = cp_handle_vnop(VTOC(vp), CP_WRITE_ACCESS)) != 0) { + return (error); + } +#endif /* CONFIG_PROTECT */ + hfsmp = VTOHFS(vp); /* Don't allow modification of the journal file. */ @@ -1090,6 +1157,8 @@ hfs_vnop_setattr(ap) } } + check_for_tracked_file(vp, orig_ctime, vap->va_data_size == 0 ? NAMESPACE_HANDLER_TRUNCATE_OP|NAMESPACE_HANDLER_DELETE_OP : NAMESPACE_HANDLER_TRUNCATE_OP, NULL); + decmpfs_lock_compressed_data(dp, 1); if (hfs_file_is_compressed(VTOC(vp), 1)) { error = decmpfs_decompress_file(vp, dp, -1/*vap->va_data_size*/, 0, 1); @@ -1101,13 +1170,13 @@ hfs_vnop_setattr(ap) #endif /* Take truncate lock before taking cnode lock. */ - hfs_lock_truncate(VTOC(vp), TRUE); + hfs_lock_truncate(VTOC(vp), HFS_EXCLUSIVE_LOCK); /* Perform the ubc_setsize before taking the cnode lock. */ ubc_setsize(vp, vap->va_data_size); if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) { - hfs_unlock_truncate(VTOC(vp), TRUE); + hfs_unlock_truncate(VTOC(vp), 0); #if HFS_COMPRESSION decmpfs_unlock_compressed_data(dp, 1); #endif @@ -1117,7 +1186,7 @@ hfs_vnop_setattr(ap) error = hfs_truncate(vp, vap->va_data_size, vap->va_vaflags & 0xffff, 1, 0, ap->a_context); - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); #if HFS_COMPRESSION decmpfs_unlock_compressed_data(dp, 1); #endif @@ -1297,7 +1366,6 @@ out: * Change the mode on a file. * cnode must be locked before calling. */ -__private_extern__ int hfs_chmod(struct vnode *vp, int mode, __unused kauth_cred_t cred, __unused struct proc *p) { @@ -1328,7 +1396,6 @@ hfs_chmod(struct vnode *vp, int mode, __unused kauth_cred_t cred, __unused struc } -__private_extern__ int hfs_write_access(struct vnode *vp, kauth_cred_t cred, struct proc *p, Boolean considerFlags) { @@ -1378,7 +1445,6 @@ hfs_write_access(struct vnode *vp, kauth_cred_t cred, struct proc *p, Boolean co * Perform chown operation on cnode cp; * code must be locked prior to call. */ -__private_extern__ int #if !QUOTA hfs_chown(struct vnode *vp, uid_t uid, gid_t gid, __unused kauth_cred_t cred, @@ -1512,7 +1578,7 @@ good: * case the file is being tracked through its file ID. Typically * its used after creating a new file during a safe-save. */ -static int +int hfs_vnop_exchange(ap) struct vnop_exchange_args /* { struct vnode *a_fvp; @@ -1538,6 +1604,7 @@ hfs_vnop_exchange(ap) int lockflags; int error = 0, started_tr = 0, got_cookie = 0; cat_cookie_t cookie; + time_t orig_from_ctime, orig_to_ctime; /* The files must be on the same volume. 
*/ if (vnode_mount(from_vp) != vnode_mount(to_vp)) @@ -1546,6 +1613,9 @@ hfs_vnop_exchange(ap) if (from_vp == to_vp) return (EINVAL); + orig_from_ctime = VTOC(from_vp)->c_ctime; + orig_to_ctime = VTOC(to_vp)->c_ctime; + #if HFS_COMPRESSION if ( hfs_file_is_compressed(VTOC(from_vp), 0) ) { if ( 0 != ( error = decmpfs_decompress_file(from_vp, VTOCMP(from_vp), -1, 0, 1) ) ) { @@ -1560,6 +1630,50 @@ hfs_vnop_exchange(ap) } #endif // HFS_COMPRESSION + /* + * Normally, we want to notify the user handlers about the event, + * except if it's a handler driving the event. + */ + if ((ap->a_options & FSOPT_EXCHANGE_DATA_ONLY) == 0) { + check_for_tracked_file(from_vp, orig_from_ctime, NAMESPACE_HANDLER_WRITE_OP, NULL); + check_for_tracked_file(to_vp, orig_to_ctime, NAMESPACE_HANDLER_WRITE_OP, NULL); + } + else { + /* + * We're doing a data-swap. + * Take the truncate lock/cnode lock, then verify there are no mmap references. + * Issue a hfs_filedone to flush out all of the remaining state for this file. + * Allow the rest of the codeflow to re-acquire the cnode locks in order. + */ + + hfs_lock_truncate (VTOC(from_vp), HFS_SHARED_LOCK); + + if ((error = hfs_lock(VTOC(from_vp), HFS_EXCLUSIVE_LOCK))) { + hfs_unlock_truncate (VTOC(from_vp), 0); + return error; + } + + /* Verify the source file is not in use by anyone besides us (including mmap refs) */ + if (vnode_isinuse(from_vp, 1)) { + error = EBUSY; + hfs_unlock(VTOC(from_vp)); + hfs_unlock_truncate (VTOC(from_vp), 0); + return error; + } + + /* Flush out the data in the source file */ + VTOC(from_vp)->c_flag |= C_SWAPINPROGRESS; + error = hfs_filedone (from_vp, ap->a_context); + VTOC(from_vp)->c_flag &= ~C_SWAPINPROGRESS; + hfs_unlock(VTOC(from_vp)); + hfs_unlock_truncate(VTOC(from_vp), 0); + + if (error) { + return error; + } + } + + if ((error = hfs_lockpair(VTOC(from_vp), VTOC(to_vp), HFS_EXCLUSIVE_LOCK))) return (error); @@ -1595,6 +1709,16 @@ hfs_vnop_exchange(ap) } } + /* + * Ok, now that all of the pre-flighting is done, call the underlying + * function if needed. + */ + if (ap->a_options & FSOPT_EXCHANGE_DATA_ONLY) { + error = hfs_movedata(from_vp, to_vp); + goto exit; + } + + if ((error = hfs_start_transaction(hfsmp)) != 0) { goto exit; } @@ -1729,11 +1853,338 @@ exit: return (error); } +int +hfs_vnop_mmap(struct vnop_mmap_args *ap) +{ + struct vnode *vp = ap->a_vp; + int error; + + if (VNODE_IS_RSRC(vp)) { + /* allow pageins of the resource fork */ + } else { + int compressed = hfs_file_is_compressed(VTOC(vp), 1); /* 1 == don't take the cnode lock */ + time_t orig_ctime = VTOC(vp)->c_ctime; + + if (!compressed && (VTOC(vp)->c_flags & UF_COMPRESSED)) { + error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP); + if (error != 0) { + return error; + } + } + + if (ap->a_fflags & PROT_WRITE) { + check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_WRITE_OP, NULL); + } + } + + // + // NOTE: we return ENOTSUP because we want the cluster layer + // to actually do all the real work. + // + return (ENOTSUP); +} + +/* + * hfs_movedata + * + * This is a non-symmetric variant of exchangedata. In this function, + * the contents of the fork in from_vp are moved to the fork + * specified by to_vp. + * + * The cnodes pointed to by 'from_vp' and 'to_vp' must be locked. + * + * The vnode pointed to by 'to_vp' *must* be empty prior to invoking this function. + * We impose this restriction because we may not be able to fully delete the entire + * file's contents in a single transaction, particularly if it has a lot of extents. 
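Both the notification path and the data-only swap above are driven from userland through exchangedata(2). A minimal caller might look like the sketch below; the paths are placeholders, and FSOPT_EXCHANGE_DATA_ONLY is treated here as an in-kernel option rather than an assumed public flag.

#include <stdio.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	if (argc < 3) {
		fprintf(stderr, "usage: %s file1 file2\n", argv[0]);
		return 1;
	}
	/* atomically swap the contents of two files on the same HFS+ volume */
	if (exchangedata(argv[1], argv[2], 0) != 0) {
		perror("exchangedata");
		return 1;
	}
	return 0;
}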
+ * In the normal file deletion codepath, the file is screened for two conditions: + * 1) bigger than 400MB, and 2) more than 8 extents. If so, the file is relocated to + * the hidden directory and the deletion is broken up into multiple truncates. We can't + * do that here because both files need to exist in the namespace. The main reason this + * is imposed is that we may have to touch a whole lot of bitmap blocks if there are + * many extents. + * + * Any data written to 'from_vp' after this call completes is not guaranteed + * to be moved. + * + * Arguments: + * vnode from_vp: source file + * vnode to_vp: destination file; must be empty + * + * Returns: + * EFBIG - Destination file was not empty + * 0 - success + * + * + */ +int hfs_movedata (struct vnode *from_vp, struct vnode *to_vp) { + + struct cnode *from_cp; + struct cnode *to_cp; + struct hfsmount *hfsmp = NULL; + int error = 0; + int started_tr = 0; + int lockflags = 0; + int overflow_blocks; + int rsrc = 0; + + + /* Get the HFS pointers */ + from_cp = VTOC(from_vp); + to_cp = VTOC(to_vp); + hfsmp = VTOHFS(from_vp); + + /* Verify that neither source/dest file is open-unlinked */ + if (from_cp->c_flag & (C_DELETED | C_NOEXISTS)) { + error = EBUSY; + goto movedata_exit; + } + + if (to_cp->c_flag & (C_DELETED | C_NOEXISTS)) { + error = EBUSY; + goto movedata_exit; + } + + /* + * Verify the source file is not in use by anyone besides us. + * + * This function is typically invoked by a namespace handler + * process responding to a temporarily stalled system call. + * The FD that it is working off of is opened O_EVTONLY, so + * it really has no active usecounts (the kusecount from O_EVTONLY + * is subtracted from the total usecounts). + * + * As a result, we shouldn't have any active usecounts against + * this vnode when we go to check it below. + */ + if (vnode_isinuse(from_vp, 0)) { + error = EBUSY; + goto movedata_exit; + } + + if (from_cp->c_rsrc_vp == from_vp) { + rsrc = 1; + } + + /* + * We assume that the destination file is already empty. + * Verify that it is. + */ + if (rsrc) { + if (to_cp->c_rsrcfork->ff_size > 0) { + error = EFBIG; + goto movedata_exit; + } + } + else { + if (to_cp->c_datafork->ff_size > 0) { + error = EFBIG; + goto movedata_exit; + } + } + + /* If the source has the rsrc open, make sure the destination is also the rsrc */ + if (rsrc) { + if (to_vp != to_cp->c_rsrc_vp) { + error = EINVAL; + goto movedata_exit; + } + } + else { + /* Verify that both forks are data forks */ + if (to_vp != to_cp->c_vp) { + error = EINVAL; + goto movedata_exit; + } + } + + /* + * See if the source file has overflow extents. If it doesn't, we don't + * need to call into MoveData, and the catalog will be enough. + */ + if (rsrc) { + overflow_blocks = overflow_extents(from_cp->c_rsrcfork); + } + else { + overflow_blocks = overflow_extents(from_cp->c_datafork); + } + + if ((error = hfs_start_transaction (hfsmp)) != 0) { + goto movedata_exit; + } + started_tr = 1; + + /* Lock the system files: catalog, extents, attributes */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_EXTENTS | SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); + + /* Copy over any catalog allocation data into the new spot. 
*/ + if (rsrc) { + if ((error = hfs_move_fork (from_cp->c_rsrcfork, from_cp, to_cp->c_rsrcfork, to_cp))){ + hfs_systemfile_unlock(hfsmp, lockflags); + goto movedata_exit; + } + } + else { + if ((error = hfs_move_fork (from_cp->c_datafork, from_cp, to_cp->c_datafork, to_cp))) { + hfs_systemfile_unlock(hfsmp, lockflags); + goto movedata_exit; + } + } + + /* + * Note that because all we're doing is moving the extents around, we can + * probably do this in a single transaction: Each extent record (group of 8) + * is 64 bytes. An extent overflow B-Tree node is typically 4k. This means + * each node can hold roughly 60 extent records (480 extents). + * + * If a file was massively fragmented and had 20k extents, this means we'd + * roughly touch 20k/480 == 41 to 42 nodes, plus the index nodes, for half + * of the operation (inserting or deleting). So if we're manipulating 80-100 + * nodes, this is basically 320k of data to write to the journal in + * a bad case. + */ + if (overflow_blocks != 0) { + if (rsrc) { + error = MoveData(hfsmp, from_cp->c_cnid, to_cp->c_cnid, 1); + } + else { + error = MoveData (hfsmp, from_cp->c_cnid, to_cp->c_cnid, 0); + } + } + + if (error) { + /* Reverse the operation. Copy the fork data back into the source */ + if (rsrc) { + hfs_move_fork (to_cp->c_rsrcfork, to_cp, from_cp->c_rsrcfork, from_cp); + } + else { + hfs_move_fork (to_cp->c_datafork, to_cp, from_cp->c_datafork, from_cp); + } + } + else { + struct cat_fork *src_data = NULL; + struct cat_fork *src_rsrc = NULL; + struct cat_fork *dst_data = NULL; + struct cat_fork *dst_rsrc = NULL; + + /* Touch the times */ + to_cp->c_touch_acctime = TRUE; + to_cp->c_touch_chgtime = TRUE; + to_cp->c_touch_modtime = TRUE; + + from_cp->c_touch_acctime = TRUE; + from_cp->c_touch_chgtime = TRUE; + from_cp->c_touch_modtime = TRUE; + + hfs_touchtimes(hfsmp, to_cp); + hfs_touchtimes(hfsmp, from_cp); + + if (from_cp->c_datafork) { + src_data = &from_cp->c_datafork->ff_data; + } + if (from_cp->c_rsrcfork) { + src_rsrc = &from_cp->c_rsrcfork->ff_data; + } + + if (to_cp->c_datafork) { + dst_data = &to_cp->c_datafork->ff_data; + } + if (to_cp->c_rsrcfork) { + dst_rsrc = &to_cp->c_rsrcfork->ff_data; + } + + /* Update the catalog nodes */ + (void) cat_update(hfsmp, &from_cp->c_desc, &from_cp->c_attr, + src_data, src_rsrc); + + (void) cat_update(hfsmp, &to_cp->c_desc, &to_cp->c_attr, + dst_data, dst_rsrc); + + } + /* unlock the system files */ + hfs_systemfile_unlock(hfsmp, lockflags); + + +movedata_exit: + if (started_tr) { + hfs_end_transaction(hfsmp); + } + + return error; + +} + +/* + * Copy all of the catalog and runtime data in srcfork to dstfork. + * + * This allows us to maintain the invalid ranges across the movedata operation so + * we don't need to force all of the pending IO right now. In addition, we move all + * in-catalog (non-overflow) extents into the destination here.
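As a sanity check on the single-transaction estimate in the comment above, the arithmetic works through as follows. The four-records-of-node-overhead figure is an assumption chosen to land on the comment's "roughly 60"; everything else comes straight from the comment.

#include <stdio.h>

int main(void)
{
	int rec_bytes        = 64;     /* one extent record: 8 extents */
	int node_bytes       = 4096;   /* typical extent overflow B-tree node */
	int recs_per_node    = node_bytes / rec_bytes - 4;  /* ~60 after overhead */
	int extents_per_node = recs_per_node * 8;           /* 480 */
	int extents          = 20000;  /* a massively fragmented file */

	int leaf_nodes = (extents + extents_per_node - 1) / extents_per_node;
	printf("leaf nodes per side: %d\n", leaf_nodes);             /* 42 */
	printf("journal bytes for 80 nodes: %d\n", 80 * node_bytes); /* ~320k */
	return 0;
}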
+ */ +static int hfs_move_fork (struct filefork *srcfork, struct cnode *src_cp, + struct filefork *dstfork, struct cnode *dst_cp) { + struct rl_entry *invalid_range; + int size = sizeof(struct HFSPlusExtentDescriptor); + size = size * kHFSPlusExtentDensity; + + /* If the dstfork has any invalid ranges, bail out */ + invalid_range = TAILQ_FIRST(&dstfork->ff_invalidranges); + if (invalid_range != NULL) { + return EFBIG; + } + + if (dstfork->ff_data.cf_size != 0 || dstfork->ff_data.cf_new_size != 0) { + return EFBIG; + } + + /* First copy the invalid ranges */ + while ((invalid_range = TAILQ_FIRST(&srcfork->ff_invalidranges))) { + off_t start = invalid_range->rl_start; + off_t end = invalid_range->rl_end; + + /* Remove it from the srcfork and add it to dstfork */ + rl_remove(start, end, &srcfork->ff_invalidranges); + rl_add(start, end, &dstfork->ff_invalidranges); + } + + /* + * Ignore the ff_union. We don't move symlinks or system files. + * Now copy the in-catalog extent information + */ + dstfork->ff_data.cf_size = srcfork->ff_data.cf_size; + dstfork->ff_data.cf_new_size = srcfork->ff_data.cf_new_size; + dstfork->ff_data.cf_vblocks = srcfork->ff_data.cf_vblocks; + dstfork->ff_data.cf_blocks = srcfork->ff_data.cf_blocks; + + /* just memcpy the whole array of extents to the new location. */ + memcpy (dstfork->ff_data.cf_extents, srcfork->ff_data.cf_extents, size); + + /* + * Copy the cnode attribute data. + * + */ + src_cp->c_blocks -= srcfork->ff_data.cf_vblocks; + src_cp->c_blocks -= srcfork->ff_data.cf_blocks; + + dst_cp->c_blocks += srcfork->ff_data.cf_vblocks; + dst_cp->c_blocks += srcfork->ff_data.cf_blocks; + + /* Now delete the entries in the source fork */ + srcfork->ff_data.cf_size = 0; + srcfork->ff_data.cf_new_size = 0; + srcfork->ff_data.cf_union.cfu_bytesread = 0; + srcfork->ff_data.cf_vblocks = 0; + srcfork->ff_data.cf_blocks = 0; + + /* Zero out the old extents */ + bzero (srcfork->ff_data.cf_extents, size); + return 0; +} + + /* * cnode must be locked */ -__private_extern__ int hfs_fsync(struct vnode *vp, int waitfor, int fullsync, struct proc *p) { @@ -1747,7 +2198,6 @@ hfs_fsync(struct vnode *vp, int waitfor, int fullsync, struct proc *p) int wait; /* all other attributes (e.g. atime, etc.) */ int lockflag; int took_trunc_lock = 0; - boolean_t trunc_lock_exclusive = FALSE; /* * Applications which only care about data integrity rather than full @@ -1777,14 +2227,13 @@ hfs_fsync(struct vnode *vp, int waitfor, int fullsync, struct proc *p) } } else if (UBCINFOEXISTS(vp)) { hfs_unlock(cp); - hfs_lock_truncate(cp, trunc_lock_exclusive); + hfs_lock_truncate(cp, HFS_SHARED_LOCK); took_trunc_lock = 1; if (fp->ff_unallocblocks != 0) { - hfs_unlock_truncate(cp, trunc_lock_exclusive); + hfs_unlock_truncate(cp, 0); - trunc_lock_exclusive = TRUE; - hfs_lock_truncate(cp, trunc_lock_exclusive); + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK); } /* Don't hold cnode lock when calling into cluster layer. */ (void) cluster_push(vp, waitdata ? 
IO_SYNC : 0); @@ -1811,13 +2260,12 @@ hfs_fsync(struct vnode *vp, int waitfor, int fullsync, struct proc *p) goto datasync; } if (!TAILQ_EMPTY(&fp->ff_invalidranges)) { - if (!took_trunc_lock || trunc_lock_exclusive == FALSE) { + if (!took_trunc_lock || (cp->c_truncatelockowner == HFS_SHARED_OWNER)) { hfs_unlock(cp); - if (took_trunc_lock) - hfs_unlock_truncate(cp, trunc_lock_exclusive); - - trunc_lock_exclusive = TRUE; - hfs_lock_truncate(cp, trunc_lock_exclusive); + if (took_trunc_lock) { + hfs_unlock_truncate(cp, 0); + } + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK); hfs_lock(cp, HFS_FORCE_LOCK); took_trunc_lock = 1; } @@ -1848,7 +2296,7 @@ hfs_fsync(struct vnode *vp, int waitfor, int fullsync, struct proc *p) } datasync: if (took_trunc_lock) { - hfs_unlock_truncate(cp, trunc_lock_exclusive); + hfs_unlock_truncate(cp, 0); took_trunc_lock = 0; } /* @@ -1899,13 +2347,23 @@ metasync: * changes get to stable storage. */ if (fullsync) { - if (hfsmp->jnl) { - hfs_journal_flush(hfsmp); - } else { - retval = hfs_metasync_all(hfsmp); - /* XXX need to pass context! */ - VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL); - } + if (hfsmp->jnl) { + hfs_journal_flush(hfsmp, FALSE); + + if (journal_uses_fua(hfsmp->jnl)) { + /* + * the journal_flush did NOT issue a sync track cache command, + * and the fullsync indicates we are supposed to flush all cached + * data to the media, so issue the sync track cache command + * explicitly + */ + VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL); + } + } else { + retval = hfs_metasync_all(hfsmp); + /* XXX need to pass context! */ + VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, NULL); + } } } @@ -1914,7 +2372,7 @@ metasync: /* Sync an hfs catalog b-tree node */ -static int +int hfs_metasync(struct hfsmount *hfsmp, daddr64_t node, __unused struct proc *p) { vnode_t vp; @@ -1960,7 +2418,7 @@ hfs_metasync(struct hfsmount *hfsmp, daddr64_t node, __unused struct proc *p) * we rely on fsck_hfs to fix that up (which it can do without any loss * of data). */ -static int +int hfs_metasync_all(struct hfsmount *hfsmp) { int lockflags; @@ -2002,7 +2460,6 @@ hfs_btsync_callback(struct buf *bp, __unused void *dummy) } -__private_extern__ int hfs_btsync(struct vnode *vp, int sync_transaction) { @@ -2030,7 +2487,7 @@ hfs_btsync(struct vnode *vp, int sync_transaction) /* * Remove a directory. 
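From userland, the fullsync branch above (journal flush plus, when the journal uses FUA, an explicit DKIOCSYNCHRONIZECACHE) is reached with fcntl(F_FULLFSYNC) rather than plain fsync(2). A small sketch, with /tmp/scratch as a placeholder path:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	int fd = open(argc > 1 ? argv[1] : "/tmp/scratch",
	              O_CREAT | O_WRONLY, 0644);
	if (fd < 0) { perror("open"); return 1; }

	(void)write(fd, "x", 1);

	/* fsync(2) pushes data to the drive; F_FULLFSYNC additionally asks
	 * the driver to flush its track cache to the media */
	if (fcntl(fd, F_FULLFSYNC) == -1)
		perror("F_FULLFSYNC");

	close(fd);
	return 0;
}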
*/ -static int +int hfs_vnop_rmdir(ap) struct vnop_rmdir_args /* { struct vnode *a_dvp; @@ -2044,6 +2501,9 @@ hfs_vnop_rmdir(ap) struct cnode *dcp = VTOC(dvp); struct cnode *cp = VTOC(vp); int error; + time_t orig_ctime; + + orig_ctime = VTOC(vp)->c_ctime; if (!S_ISDIR(cp->c_mode)) { return (ENOTDIR); @@ -2051,6 +2511,10 @@ hfs_vnop_rmdir(ap) if (dvp == vp) { return (EINVAL); } + + check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_DELETE_OP, NULL); + cp = VTOC(vp); + if ((error = hfs_lockpair(dcp, cp, HFS_EXCLUSIVE_LOCK))) { return (error); } @@ -2060,7 +2524,7 @@ hfs_vnop_rmdir(ap) hfs_unlockpair (dcp, cp); return ENOENT; } - error = hfs_removedir(dvp, vp, ap->a_cnp, 0); + error = hfs_removedir(dvp, vp, ap->a_cnp, 0, 0); hfs_unlockpair(dcp, cp); @@ -2072,9 +2536,9 @@ hfs_vnop_rmdir(ap) * * Both dvp and vp cnodes are locked */ -static int +int hfs_removedir(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, - int skip_reserve) + int skip_reserve, int only_unlink) { struct cnode *cp; struct cnode *dcp; @@ -2096,24 +2560,77 @@ hfs_removedir(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, if (cp->c_entries != 0) { return (ENOTEMPTY); } + + /* + * If the directory is open or in use (e.g. opendir() or current working + * directory for some process); wait for inactive/reclaim to actually + * remove cnode from the catalog. Both inactive and reclaim codepaths are capable + * of removing open-unlinked directories from the catalog, as well as getting rid + * of EAs still on the element. So change only_unlink to true, so that it will get + * cleaned up below. + * + * Otherwise, we can get into a weird old mess where the directory has C_DELETED, + * but it really means C_NOEXISTS because the item was actually removed from the + * catalog. Then when we try to remove the entry from the catalog later on, it won't + * really be there anymore. + */ + if (vnode_isinuse(vp, 0)) { + only_unlink = 1; + } - /* Check if we're removing the last link to an empty directory. */ + /* Deal with directory hardlinks */ if (cp->c_flag & C_HARDLINK) { - /* We could also return EBUSY here */ + /* + * Note that if we have a directory which was a hardlink at any point, + * its actual directory data is stored in the directory inode in the hidden + * directory rather than the leaf element(s) present in the namespace. + * + * If there are still other hardlinks to this directory, + * then we'll just eliminate this particular link and the vnode will still exist. + * If this is the last link to an empty directory, then we'll open-unlink the + * directory and it will be only tagged with C_DELETED (as opposed to C_NOEXISTS). + * + * We could also return EBUSY here. + */ + return hfs_unlink(hfsmp, dvp, vp, cnp, skip_reserve); } /* - * We want to make sure that if the directory has a lot of attributes, we process them - * in separate transactions to ensure we don't panic in the journal with a gigantic - * transaction. This means we'll let hfs_removefile deal with the directory, which generally - * follows the same codepath as open-unlinked files. Note that the last argument to - * hfs_removefile specifies that it is supposed to handle directories for this case. - */ - if ((hfsmp->hfs_attribute_vp != NULL) && - (cp->c_attr.ca_recflags & kHFSHasAttributesMask) != 0) { - - return hfs_removefile(dvp, vp, cnp, 0, 0, 1, NULL); + * In a few cases, we may want to allow the directory to persist in an + * open-unlinked state. 
If the directory is being open-unlinked (still has usecount + * references), or if it has EAs, or if it was being deleted as part of a rename, + * then we go ahead and move it to the hidden directory. + * + * If the directory is being open-unlinked, then we want to keep the catalog entry + * alive so that future EA calls and fchmod/fstat etc. do not cause issues later. + * + * If the directory had EAs, then we want to use the open-unlink trick so that the + * EA removal is not done in one giant transaction. Otherwise, it could cause a panic + * due to overflowing the journal. + * + * Finally, if it was deleted as part of a rename, we move it to the hidden directory + * in order to maintain rename atomicity. + * + * Note that the allow_dirs argument to hfs_removefile specifies that it is + * supposed to handle directories for this case. + */ + + if (((hfsmp->hfs_attribute_vp != NULL) && + ((cp->c_attr.ca_recflags & kHFSHasAttributesMask) != 0)) || + (only_unlink != 0)) { + + int ret = hfs_removefile(dvp, vp, cnp, 0, 0, 1, NULL, only_unlink); + /* + * Even though hfs_vnop_rename calls vnode_recycle for us on tvp we call + * it here just in case we were invoked by rmdir() on a directory that had + * EAs. To ensure that we start reclaiming the space as soon as possible, + * we call vnode_recycle on the directory. + */ + vnode_recycle(vp); + + return ret; + } dcp->c_flag |= C_DIR_MODIFICATION; @@ -2155,7 +2672,7 @@ hfs_removedir(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, desc.cd_encoding = cp->c_encoding; desc.cd_hint = 0; - if (!hfs_valid_cnode(hfsmp, dvp, cnp, cp->c_fileid)) { + if (!hfs_valid_cnode(hfsmp, dvp, cnp, cp->c_fileid, NULL, &error)) { error = 0; goto out; } @@ -2199,16 +2716,8 @@ hfs_removedir(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, hfs_volupdate(hfsmp, VOL_RMDIR, (dcp->c_cnid == kHFSRootFolderID)); - /* - * directory open or in use (e.g. opendir() or current working - * directory for some process); wait for inactive to actually - * remove catalog entry - */ - if (vnode_isinuse(vp, 0)) { - cp->c_flag |= C_DELETED; - } else { - cp->c_flag |= C_NOEXISTS; - } + /* Mark C_NOEXISTS since the catalog entry is now gone */ + cp->c_flag |= C_NOEXISTS; out: dcp->c_flag &= ~C_DIR_MODIFICATION; wakeup((caddr_t)&dcp->c_flag); @@ -2224,7 +2733,7 @@ out: /* * Remove a file or link. */ -static int +int hfs_vnop_remove(ap) struct vnop_remove_args /* { struct vnode *a_dvp; @@ -2237,17 +2746,29 @@ hfs_vnop_remove(ap) struct vnode *dvp = ap->a_dvp; struct vnode *vp = ap->a_vp; struct cnode *dcp = VTOC(dvp); - struct cnode *cp = VTOC(vp); + struct cnode *cp; struct vnode *rvp = NULL; struct hfsmount *hfsmp = VTOHFS(vp); int error=0, recycle_rsrc=0; int drop_rsrc_vnode = 0; - int vref; + time_t orig_ctime; if (dvp == vp) { return (EINVAL); } + orig_ctime = VTOC(vp)->c_ctime; + if (!vnode_isnamedstream(vp)) { + error = check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_DELETE_OP, NULL); + if (error) { + // XXXdbg - decide on a policy for handling namespace handler failures! + // for now we just let them proceed. + } + } + error = 0; + + cp = VTOC(vp); + /* * We need to grab the cnode lock on 'cp' before the lockpair() * to get an iocount on the rsrc fork BEFORE we enter hfs_removefile. @@ -2269,23 +2790,25 @@ hfs_vnop_remove(ap) if ((error = hfs_lock (cp, HFS_EXCLUSIVE_LOCK))) { return (error); } + error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, TRUE); hfs_unlock(cp); if (error) { - /* We may have gotten a rsrc vp out even though we got an error back. 
*/ + /* we may have gotten an rsrc vp even though we got an error */ if (rvp) { vnode_put(rvp); rvp = NULL; } - return error; + return (error); } drop_rsrc_vnode = 1; } /* Now that we may have an iocount on rvp, do the lock pair */ - hfs_lock_truncate(cp, TRUE); + + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK); if ((error = hfs_lockpair(dcp, cp, HFS_EXCLUSIVE_LOCK))) { - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); /* drop the iocount on rvp if necessary */ if (drop_rsrc_vnode) { vnode_put (rvp); @@ -2302,20 +2825,27 @@ hfs_vnop_remove(ap) goto rm_done; } - error = hfs_removefile(dvp, vp, ap->a_cnp, ap->a_flags, 0, 0, rvp); - - // - // If the remove succeeded and it's an open-unlinked file that has - // a resource fork vnode that's not in use, we will want to recycle - // the rvp *after* we're done unlocking everything. Otherwise the - // resource vnode will keep a v_parent reference on this vnode which - // prevents it from going through inactive/reclaim which means that - // the disk space associated with this file won't get free'd until - // something forces the resource vnode to get recycled (and that can - // take a very long time). - // - if (error == 0 && (cp->c_flag & C_DELETED) && - (rvp) && !vnode_isinuse(rvp, 0)) { + error = hfs_removefile(dvp, vp, ap->a_cnp, ap->a_flags, 0, 0, rvp, 0); + + /* + * If the remove succeeded in deleting the file, then we may need to mark + * the resource fork for recycle so that it is reclaimed as quickly + * as possible. If it were not recycled quickly, then this resource fork + * vnode could keep a v_parent reference on the data fork, which prevents it + * from going through reclaim (by giving it extra usecounts), except in the force- + * unmount case. + * + * However, a caveat: we need to continue to supply resource fork + * access to open-unlinked files even if the resource fork is not open. This is + * a requirement for the compressed files work. Luckily, hfs_vgetrsrc will handle + * this already if the data fork has been re-parented to the hidden directory. + * + * As a result, all we really need to do here is mark the resource fork vnode + * for recycle. If it goes out of core, it can be brought in again if needed. + * If the cnode was instead marked C_NOEXISTS, then there wouldn't be any + * more work. + */ + if ((error == 0) && (rvp)) { recycle_rsrc = 1; } @@ -2326,15 +2856,11 @@ hfs_vnop_remove(ap) * truncate lock) */ rm_done: - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); hfs_unlockpair(dcp, cp); if (recycle_rsrc) { - vref = vnode_ref(rvp); - if (vref == 0) { - /* vnode_ref could return an error, only release if we got a ref */ - vnode_rele(rvp); - } + /* inactive or reclaim on rvp will clean up the blocks from the rsrc fork */ vnode_recycle(rvp); } @@ -2376,24 +2902,24 @@ hfs_removefile_callback(struct buf *bp, void *hfsmp) { * * Requires cnode and truncate locks to be held. 
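The resource fork vnode being pinned here via hfs_vgetrsrc is the same one userland reaches through the synthetic ..namedfork path. A minimal read of a file's resource fork might look like this (the target path is a placeholder):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	char path[1024];
	if (argc < 2) return 1;

	/* the resource fork of a file is exposed as a synthetic name */
	snprintf(path, sizeof(path), "%s/..namedfork/rsrc", argv[1]);

	int fd = open(path, O_RDONLY);
	if (fd < 0) { perror("open rsrc fork"); return 1; }

	char buf[64];
	ssize_t n = read(fd, buf, sizeof(buf));
	printf("read %zd bytes from the resource fork\n", n);
	close(fd);
	return 0;
}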
*/ -static int +int hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, - int flags, int skip_reserve, int allow_dirs, struct vnode *rvp) + int flags, int skip_reserve, int allow_dirs, + struct vnode *rvp, int only_unlink) { struct cnode *cp; struct cnode *dcp; struct hfsmount *hfsmp; struct cat_desc desc; struct timeval tv; - vfs_context_t ctx = cnp->cn_context; int dataforkbusy = 0; int rsrcforkbusy = 0; - int truncated = 0; int lockflags; int error = 0; int started_tr = 0; int isbigfile = 0, defer_remove=0, isdir=0; - + int update_vh = 0; + cp = VTOC(vp); dcp = VTOC(dvp); hfsmp = VTOHFS(vp); @@ -2403,7 +2929,7 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, return (0); } - if (!hfs_valid_cnode(hfsmp, dvp, cnp, cp->c_fileid)) { + if (!hfs_valid_cnode(hfsmp, dvp, cnp, cp->c_fileid, NULL, &error)) { return 0; } @@ -2485,6 +3011,11 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, (cp->c_attr.ca_recflags & kHFSHasAttributesMask) != 0) { defer_remove = 1; } + + /* If we are explicitly told to only unlink item and move to hidden dir, then do it */ + if (only_unlink) { + defer_remove = 1; + } /* * Carbon semantics prohibit deleting busy files. @@ -2502,9 +3033,16 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, if (hfsmp->hfs_flags & HFS_QUOTAS) (void)hfs_getinoquota(cp); #endif /* QUOTA */ - - /* Check if we need a ubc_setsize. */ - if (isdir == 0 && (!dataforkbusy || !rsrcforkbusy)) { + + /* + * Do a ubc_setsize to indicate we need to wipe contents if: + * 1) item is a regular file. + * 2) Neither fork is busy AND we are not told to unlink this. + * + * We need to check for the defer_remove since it can be set without + * having a busy data or rsrc fork + */ + if (isdir == 0 && (!dataforkbusy || !rsrcforkbusy) && (defer_remove == 0)) { /* * A ubc_setsize can cause a pagein so defer it * until after the cnode lock is dropped. The @@ -2525,40 +3063,46 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, started_tr = 1; // XXXdbg - if we're journaled, kill any dirty symlink buffers - if (hfsmp->jnl && vnode_islnk(vp)) + if (hfsmp->jnl && vnode_islnk(vp) && (defer_remove == 0)) { buf_iterate(vp, hfs_removefile_callback, BUF_SKIP_NONLOCKED, (void *)hfsmp); + } /* - * Truncate any non-busy forks. Busy forks will + * Prepare to truncate any non-busy forks. Busy forks will * get truncated when their vnode goes inactive. * Note that we will only enter this region if we * can avoid creating an open-unlinked file. If * either region is busy, we will have to create an open * unlinked file. - * Since we're already inside a transaction, - * tell hfs_truncate to skip the ubc_setsize. + * + * Since we are deleting the file, we need to stagger the runtime + * modifications to do things in such a way that a crash won't + * result in us getting overlapped extents or any other + * bad inconsistencies. As such, we call prepare_release_storage + * which updates the UBC, updates quota information, and releases + * any loaned blocks that belong to this file. No actual + * truncation or bitmap manipulation is done until *AFTER* + * the catalog record is removed. 
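The busy-fork, open-unlinked case being prepared for above is the classic scratch-file idiom; the staged storage release described in the comment is what eventually reclaims the blocks once the last descriptor closes. A quick demonstration, with /tmp/scratch as a placeholder:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/scratch", O_CREAT | O_RDWR | O_TRUNC, 0644);
	if (fd < 0) { perror("open"); return 1; }

	/* the data fork is busy, so the remove open-unlinks the file: the
	 * cnode keeps C_DELETED and its blocks stay allocated for now */
	(void)unlink("/tmp/scratch");

	if (write(fd, "still here\n", 11) != 11)
		perror("write");

	close(fd);   /* last reference: inactive/reclaim frees the storage */
	return 0;
}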
*/ - if (isdir == 0 && (!dataforkbusy && !rsrcforkbusy)) { - /* - * Note that 5th argument to hfs_truncate indicates whether or not - * hfs_update calls should be suppressed in call to do_hfs_truncate - */ + if (isdir == 0 && (!dataforkbusy && !rsrcforkbusy) && (only_unlink == 0)) { + if (!dataforkbusy && !isbigfile && cp->c_datafork->ff_blocks != 0) { - /* skip update in hfs_truncate */ - error = hfs_truncate(vp, (off_t)0, IO_NDELAY, 1, 1, ctx); - if (error) + + error = hfs_prepare_release_storage (hfsmp, vp); + if (error) { goto out; - truncated = 1; + } + update_vh = 1; } if (!rsrcforkbusy && rvp) { - /* skip update in hfs_truncate */ - error = hfs_truncate(rvp, (off_t)0, IO_NDELAY, 1, 1, ctx); - if (error) + error = hfs_prepare_release_storage (hfsmp, rvp); + if (error) { goto out; - truncated = 1; + } + update_vh = 1; } } - + /* * Protect against a race with rename by using the component * name passed in and parent id from dvp (instead of using @@ -2658,15 +3202,15 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, if (error) goto out; - } else /* Not busy */ { - - if (cp->c_blocks > 0) { - printf("hfs_remove: attempting to delete a non-empty file %s\n", - cp->c_desc.cd_nameptr); - error = EBUSY; - goto out; - } - + } + else /* Not busy */ { + +#if QUOTA + off_t savedbytes; + int blksize = hfsmp->blockSize; +#endif + u_int32_t fileid = cp->c_fileid; + lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_ATTRIBUTE | SFL_BITMAP, HFS_EXCLUSIVE_LOCK); if (!skip_reserve) { if ((error = cat_preflight(hfsmp, CAT_DELETE, NULL, 0))) { @@ -2674,30 +3218,14 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, goto out; } } - + error = cat_delete(hfsmp, &desc, &cp->c_attr); - - if (error && error != ENXIO && error != ENOENT && truncated) { - if ((cp->c_datafork && cp->c_datafork->ff_size != 0) || - (cp->c_rsrcfork && cp->c_rsrcfork->ff_size != 0)) { - off_t data_size = 0; - off_t rsrc_size = 0; - if (cp->c_datafork) { - data_size = cp->c_datafork->ff_size; - } - if (cp->c_rsrcfork) { - rsrc_size = cp->c_rsrcfork->ff_size; - } - printf("hfs: remove: couldn't delete a truncated file (%s)" - "(error %d, data sz %lld; rsrc sz %lld)", - cp->c_desc.cd_nameptr, error, data_size, rsrc_size); - hfs_mark_volume_inconsistent(hfsmp); - } else { - printf("hfs: remove: strangely enough, deleting truncated file %s (%d) got err %d\n", - cp->c_desc.cd_nameptr, cp->c_attr.ca_fileid, error); - } + + if (error && error != ENXIO && error != ENOENT) { + printf("hfs_removefile: deleting file %s (%d), err: %d\n", + cp->c_desc.cd_nameptr, cp->c_attr.ca_fileid, error); } - + if (error == 0) { /* Update the parent directory */ if (dcp->c_entries > 0) @@ -2708,26 +3236,65 @@ hfs_removefile(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, (void) cat_update(hfsmp, &dcp->c_desc, &dcp->c_attr, NULL, NULL); } hfs_systemfile_unlock(hfsmp, lockflags); - if (error) + if (error) { goto out; - + } + + /* + * Now that we've wiped out the catalog record, the file effectively doesn't + * exist anymore. So update the quota records to reflect the loss of the + * data fork and the resource fork. 
+ */ #if QUOTA - if (hfsmp->hfs_flags & HFS_QUOTAS) + if (cp->c_datafork->ff_blocks > 0) { + savedbytes = ((off_t)cp->c_datafork->ff_blocks * (off_t)blksize); + (void) hfs_chkdq(cp, (int64_t)-(savedbytes), NOCRED, 0); + } + + if (cp->c_rsrcfork && (cp->c_rsrcfork->ff_blocks > 0)) { + savedbytes = ((off_t)cp->c_rsrcfork->ff_blocks * (off_t)blksize); + (void) hfs_chkdq(cp, (int64_t)-(savedbytes), NOCRED, 0); + } + + if (hfsmp->hfs_flags & HFS_QUOTAS) { (void)hfs_chkiq(cp, -1, NOCRED, 0); -#endif /* QUOTA */ - + } +#endif + + + /* + * If we didn't get any errors deleting the catalog entry, then go ahead + * and release the backing store now. The filefork pointers are still valid. + */ + error = hfs_release_storage (hfsmp, cp->c_datafork, cp->c_rsrcfork, fileid); + + if (error) { + /* + * If we encountered an error updating the extents and bitmap, + * mark the volume inconsistent. At this point, the catalog record has + * already been deleted, so we can't recover it at this point. We need + * to proceed and update the volume header and mark the cnode C_NOEXISTS. + * The subsequent fsck should be able to recover the free space for us. + */ + hfs_mark_volume_inconsistent(hfsmp); + } + else { + /* reset update_vh to 0, since hfs_release_storage should have done it for us */ + update_vh = 0; + } + cp->c_flag |= C_NOEXISTS; cp->c_flag &= ~C_DELETED; - truncated = 0; // because the catalog entry is gone - + cp->c_touch_chgtime = TRUE; /* XXX needed ? */ --cp->c_linkcount; - + /* * We must never get a directory if we're in this else block. We could * accidentally drop the number of files in the volume header if we did. */ hfs_volupdate(hfsmp, VOL_RMFILE, (dcp->c_cnid == kHFSRootFolderID)); + } /* @@ -2744,14 +3311,14 @@ out: if (error) { cp->c_flag &= ~C_DELETED; } - - /* Commit the truncation to the catalog record */ - if (truncated) { - cp->c_flag |= C_FORCEUPDATE; - cp->c_touch_chgtime = TRUE; - cp->c_touch_modtime = TRUE; - (void) hfs_update(vp, 0); - } + + if (update_vh) { + /* + * If we bailed out earlier, we may need to update the volume header + * to deal with the borrowed blocks accounting. + */ + hfs_volupdate (hfsmp, VOL_UPDATE, 0); + } if (started_tr) { hfs_end_transaction(hfsmp); @@ -2789,7 +3356,6 @@ replace_desc(struct cnode *cp, struct cat_desc *cdp) cdp->cd_flags &= ~CD_HASBUF; } - /* * Rename a cnode. * @@ -2813,7 +3379,7 @@ replace_desc(struct cnode *cp, struct cat_desc *cdp) * been locked. By taking the rsrc fork vnodes up front we ensure that they * cannot be recycled, and that the situation mentioned above cannot happen. */ -static int +int hfs_vnop_rename(ap) struct vnop_rename_args /* { struct vnode *a_fdvp; @@ -2849,9 +3415,21 @@ hfs_vnop_rename(ap) int took_trunc_lock = 0; int lockflags; int error; - int recycle_rsrc = 0; + time_t orig_from_ctime, orig_to_ctime; + + orig_from_ctime = VTOC(fvp)->c_ctime; + if (tvp && VTOC(tvp)) { + orig_to_ctime = VTOC(tvp)->c_ctime; + } else { + orig_to_ctime = ~0; + } + check_for_tracked_file(fvp, orig_from_ctime, NAMESPACE_HANDLER_RENAME_OP, NULL); + if (tvp && VTOC(tvp)) { + check_for_tracked_file(tvp, orig_to_ctime, NAMESPACE_HANDLER_DELETE_OP, NULL); + } + /* * Before grabbing the four locks, we may need to get an iocount on the resource fork * vnodes in question, just like hfs_vnop_remove. 
If fvp and tvp are not @@ -2867,16 +3445,15 @@ hfs_vnop_rename(ap) if ((error = hfs_lock (VTOC(fvp), HFS_EXCLUSIVE_LOCK))) { return (error); } - /* - * We care if we race against rename/delete with this cnode, so we'll - * error out if this file becomes open-unlinked during this call. + * We care if we race against rename/delete with this cp, so we'll error out + * if the file becomes open-unlinked during this call. */ error = hfs_vgetrsrc(VTOHFS(fvp), fvp, &fvp_rsrc, TRUE, TRUE); hfs_unlock (VTOC(fvp)); if (error) { if (fvp_rsrc) { - vnode_put (fvp_rsrc); + vnode_put(fvp_rsrc); } return error; } @@ -2890,7 +3467,6 @@ hfs_vnop_rename(ap) */ if (hfs_lock (VTOC(tvp), HFS_EXCLUSIVE_LOCK) == 0) { tcp = VTOC(tvp); - /* * We only care if we get an open-unlinked file on the dst so we * know to null out tvp/tcp to make the rename operation act @@ -2898,18 +3474,19 @@ hfs_vnop_rename(ap) * namespace already it's fine to do this. If this is true, then * make sure to unlock the cnode and drop the iocount only after the unlock. */ + error = hfs_vgetrsrc(VTOHFS(tvp), tvp, &tvp_rsrc, TRUE, TRUE); hfs_unlock (tcp); if (error) { /* - * Since we specify TRUE for error-on-unlinked in hfs_vgetrsrc, - * we can get a rsrc fork vp even if it returns an error. + * Since we specify TRUE for error_on_unlinked in hfs_vgetrsrc, + * we can get a rsrc fork vnode even if it returns an error. */ tcp = NULL; tvp = NULL; if (tvp_rsrc) { vnode_put (tvp_rsrc); - tvp_rsrc = NULLVP; + tvp_rsrc = NULL; } /* just bypass truncate lock and act as if we never got tcp/tvp */ goto retry; @@ -2919,7 +3496,7 @@ hfs_vnop_rename(ap) /* When tvp exists, take the truncate lock for hfs_removefile(). */ if (tvp && (vnode_isreg(tvp) || vnode_islnk(tvp))) { - hfs_lock_truncate(VTOC(tvp), TRUE); + hfs_lock_truncate(VTOC(tvp), HFS_EXCLUSIVE_LOCK); took_trunc_lock = 1; } @@ -2928,7 +3505,7 @@ hfs_vnop_rename(ap) HFS_EXCLUSIVE_LOCK, &error_cnode); if (error) { if (took_trunc_lock) { - hfs_unlock_truncate(VTOC(tvp), TRUE); + hfs_unlock_truncate(VTOC(tvp), 0); took_trunc_lock = 0; } /* @@ -2974,21 +3551,22 @@ hfs_vnop_rename(ap) * the parent/child relationship with fdcp and tdcp, as well as the * component name of the target cnodes. */ - if ((fcp->c_flag & (C_NOEXISTS | C_DELETED)) || !hfs_valid_cnode(hfsmp, fdvp, fcnp, fcp->c_fileid)) { + if ((fcp->c_flag & (C_NOEXISTS | C_DELETED)) || !hfs_valid_cnode(hfsmp, fdvp, fcnp, fcp->c_fileid, NULL, &error)) { error = ENOENT; goto out; } - if (tcp && ((tcp->c_flag & (C_NOEXISTS | C_DELETED)) || !hfs_valid_cnode(hfsmp, tdvp, tcnp, tcp->c_fileid))) { + if (tcp && ((tcp->c_flag & (C_NOEXISTS | C_DELETED)) || !hfs_valid_cnode(hfsmp, tdvp, tcnp, tcp->c_fileid, NULL, &error))) { // // hmm, the destination vnode isn't valid any more. // in this case we can just drop him and pretend he // never existed in the first place. // if (took_trunc_lock) { - hfs_unlock_truncate(VTOC(tvp), TRUE); - took_trunc_lock = 0; + hfs_unlock_truncate(VTOC(tvp), 0); + took_trunc_lock = 0; } + error = 0; hfs_unlockfour(fdcp, fcp, tdcp, tcp); @@ -3186,7 +3764,33 @@ hfs_vnop_rename(ap) got_cookie = 1; /* - * If the destination exists then it may need to be removed. + * If the destination exists then it may need to be removed. + * + * Due to HFS's locking system, we should always move the + * existing 'tvp' element to the hidden directory in hfs_vnop_rename. 
+ * Because the VNOP_LOOKUP call enters and exits the filesystem independently + * of the actual vnop that it was trying to do (stat, link, readlink), + * we must release the cnode lock of that element during the interim to + * do MAC checking, vnode authorization, and other calls. In that time, + * the item can be deleted (or renamed over). However, only in the rename + * case is it inappropriate to return ENOENT from any of those calls. Either + * the call should return information about the old element (stale), or get + * information about the newer element that we are about to write in its place. + * + * HFS lookup has been modified to detect a rename and re-drive its + * lookup internally. For other calls that have already succeeded in + * their lookup call and are waiting to acquire the cnode lock in order + * to proceed, that cnode lock will not fail due to the cnode being marked + * C_NOEXISTS, because it won't have been marked as such. It will only + * have C_DELETED. Thus, they will simply act on the stale open-unlinked + * element. All future callers will get the new element. + * + * To implement this behavior, we pass the "only_unlink" argument to + * hfs_removefile and hfs_removedir. This will result in the vnode acting + * as though it is open-unlinked. Additionally, when we are done moving the + * element to the hidden directory, we vnode_recycle the target so that it is + * reclaimed as soon as possible. Reclaim and inactive are both + * capable of clearing out unused blocks for an open-unlinked file or dir. */ if (tvp) { /* @@ -3209,28 +3813,54 @@ hfs_vnop_rename(ap) } } - if (vnode_isdir(tvp)) - error = hfs_removedir(tdvp, tvp, tcnp, HFSRM_SKIP_RESERVE); - else { - error = hfs_removefile(tdvp, tvp, tcnp, 0, HFSRM_SKIP_RESERVE, 0, tvp_rsrc); - - /* - * If the destination file had a rsrc fork vnode, it may have been cleaned up - * in hfs_removefile if it was not busy (had no usecounts). This is possible - * because we grabbed the iocount on the rsrc fork safely at the beginning - * of the function before we did the lockfour. However, we may still need - * to take action to prevent block leaks, so aggressively recycle the vnode - * if possible. The vnode cannot be recycled because we hold an iocount on it. + + if (vnode_isdir(tvp)) { + /* + * hfs_removedir will eventually call hfs_removefile on the directory + * we're working on, because only hfs_removefile does the renaming of the + * item to the hidden directory. The directory will stay around in the + * hidden directory with C_DELETED until it gets an inactive or a reclaim. + * That way, we can destroy all of the EAs as needed and allow new ones to be + * written. */ - - if ((error == 0) && (tcp->c_flag & C_DELETED) && tvp_rsrc && !vnode_isinuse(tvp_rsrc, 0)) { - recycle_rsrc = 1; - } + error = hfs_removedir(tdvp, tvp, tcnp, HFSRM_SKIP_RESERVE, 1); + } + else { + error = hfs_removefile(tdvp, tvp, tcnp, 0, HFSRM_SKIP_RESERVE, 0, tvp_rsrc, 1); + + /* + * If the destination file had a resource fork vnode, then we need to get rid of + * its blocks when there are no more references to it. Because the call to + * hfs_removefile above always open-unlinks things, we need to force an inactive/reclaim + * on the resource fork vnode, in order to prevent block leaks. Otherwise, + * the resource fork vnode could prevent the data fork vnode from going out of scope + * because it holds a v_parent reference on it. So we mark it for termination + * with a call to vnode_recycle. 
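The stale-element semantics spelled out above can also be seen from userland: a descriptor opened on the old target keeps reading the old bytes after a rename lands on top of it. A quick demonstration (paths are placeholders; most error checks trimmed):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int dst = open("/tmp/dst", O_CREAT | O_RDWR | O_TRUNC, 0644);
	(void)write(dst, "old", 3);

	int src = open("/tmp/src", O_CREAT | O_WRONLY | O_TRUNC, 0644);
	(void)write(src, "new", 3);
	close(src);

	/* the old target becomes open-unlinked (C_DELETED): existing fds see
	 * the stale file, while fresh lookups resolve to the renamed one */
	if (rename("/tmp/src", "/tmp/dst") != 0)
		perror("rename");

	char buf[4] = { 0 };
	(void)pread(dst, buf, 3, 0);
	printf("old fd still reads: %s\n", buf);   /* prints "old" */

	close(dst);
	(void)unlink("/tmp/dst");
	return 0;
}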
hfs_vnop_reclaim has been modified so that it + * can clean up the blocks of open-unlinked files and resource forks. + * + * We can safely call vnode_recycle on the resource fork because we took an iocount + * reference on it at the beginning of the function. + */ + + if ((error == 0) && (tcp->c_flag & C_DELETED) && (tvp_rsrc)) { + vnode_recycle(tvp_rsrc); + } } - if (error) + if (error) { goto out; + } + tvp_deleted = 1; + + /* Mark 'tcp' as being deleted due to a rename */ + tcp->c_flag |= C_RENAMED; + + /* + * Aggressively mark tvp/tcp for termination to ensure that we recover all blocks + * as quickly as possible. + */ + vnode_recycle(tvp); } skip_rm: /* @@ -3268,6 +3898,11 @@ skip_rm: replace_desc(fcp, &out_desc); fcp->c_parentcnid = tdcp->c_fileid; fcp->c_hint = 0; + + /* Now indicate this cnode needs to have date-added written to the finderinfo */ + fcp->c_flag |= C_NEEDS_DATEADDED; + (void) hfs_update (fvp, 0); + hfs_volupdate(hfsmp, vnode_isdir(fvp) ? VOL_RMDIR : VOL_RMFILE, (fdcp->c_cnid == kHFSRootFolderID)); @@ -3327,28 +3962,12 @@ out: wakeup((caddr_t)&tdcp->c_flag); } - if (took_trunc_lock) - hfs_unlock_truncate(VTOC(tvp), TRUE); + if (took_trunc_lock) { + hfs_unlock_truncate(VTOC(tvp), 0); + } hfs_unlockfour(fdcp, fcp, tdcp, tcp); - /* - * Now that we've dropped all of the locks, we need to force an inactive and a recycle - * on the old destination's rsrc fork to prevent a leak of its blocks. Note that - * doing the ref/rele is to twiddle the VL_NEEDINACTIVE bit of the vnode's flags, so that - * on the last vnode_put for this vnode, we will force inactive to get triggered. - * We hold an iocount from the beginning of this function so we know it couldn't have been - * recycled already. - */ - if (recycle_rsrc) { - int vref; - vref = vnode_ref(tvp_rsrc); - if (vref == 0) { - vnode_rele(tvp_rsrc); - } - vnode_recycle(tvp_rsrc); - } - /* Now vnode_put the resource forks vnodes if necessary */ if (tvp_rsrc) { vnode_put(tvp_rsrc); @@ -3368,7 +3987,7 @@ out: /* * Make a directory. */ -static int +int hfs_vnop_mkdir(struct vnop_mkdir_args *ap) { /***** HACK ALERT ********/ @@ -3380,7 +3999,7 @@ hfs_vnop_mkdir(struct vnop_mkdir_args *ap) /* * Create a symbolic link. */ -static int +int hfs_vnop_symlink(struct vnop_symlink_args *ap) { struct vnode **vpp = ap->a_vpp; @@ -3456,7 +4075,7 @@ hfs_vnop_symlink(struct vnop_symlink_args *ap) /* hfs_removefile() requires holding the truncate lock */ hfs_unlock(cp); - hfs_lock_truncate(cp, TRUE); + hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK); hfs_lock(cp, HFS_FORCE_LOCK); if (hfs_start_transaction(hfsmp) != 0) { @@ -3465,8 +4084,8 @@ hfs_vnop_symlink(struct vnop_symlink_args *ap) goto out; } - (void) hfs_removefile(dvp, vp, ap->a_cnp, 0, 0, 0, NULL); - hfs_unlock_truncate(cp, TRUE); + (void) hfs_removefile(dvp, vp, ap->a_cnp, 0, 0, 0, NULL, 0); + hfs_unlock_truncate(cp, 0); goto out; } @@ -3562,7 +4181,7 @@ typedef union { * If the directory is marked as deleted-but-in-use (cp->c_flag & C_DELETED), * do NOT synthesize entries for "." and "..". */ -static int +int hfs_vnop_readdir(ap) struct vnop_readdir_args /* { vnode_t a_vp; @@ -3601,11 +4220,23 @@ hfs_vnop_readdir(ap) /* Sanity check the uio data. 
*/ if (uio_iovcnt(uio) > 1) return (EINVAL); + + if (VTOC(vp)->c_flags & UF_COMPRESSED) { + int compressed = hfs_file_is_compressed(VTOC(vp), 0); /* 0 == take the cnode lock */ + if (VTOCMP(vp) != NULL && !compressed) { + error = check_for_dataless_file(vp, NAMESPACE_HANDLER_READ_OP); + if (error) { + return error; + } + } + } + + cp = VTOC(vp); + hfsmp = VTOHFS(vp); + /* Note that the dirhint calls require an exclusive lock. */ if ((error = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) return (error); - cp = VTOC(vp); - hfsmp = VTOHFS(vp); /* Pick up cnid hint (if any). */ if (nfs_cookies) { @@ -3741,7 +4372,7 @@ hfs_vnop_readdir(ap) if (index == 0) { dirhint->dh_threadhint = cp->c_dirthreadhint; - } + } else { /* * If we have a non-zero index, there is a possibility that during the last @@ -3822,7 +4453,7 @@ out: /* * Read contents of a symbolic link. */ -static int +int hfs_vnop_readlink(ap) struct vnop_readlink_args /* { struct vnode *a_vp; @@ -3845,7 +4476,6 @@ hfs_vnop_readlink(ap) /* Zero length sym links are not allowed */ if (fp->ff_size == 0 || fp->ff_size > MAXPATHLEN) { - printf("hfs: zero length symlink on fileid %d\n", cp->c_fileid); error = EINVAL; goto exit; } @@ -3907,7 +4537,7 @@ exit: /* * Get configurable pathname variables. */ -static int +int hfs_vnop_pathconf(ap) struct vnop_pathconf_args /* { struct vnode *a_vp; @@ -3925,9 +4555,9 @@ hfs_vnop_pathconf(ap) break; case _PC_NAME_MAX: if (VTOHFS(ap->a_vp)->hfs_flags & HFS_STANDARD) - *ap->a_retval = kHFSMaxFileNameChars; /* 255 */ + *ap->a_retval = kHFSMaxFileNameChars; /* 31 */ else - *ap->a_retval = kHFSPlusMaxFileNameChars; /* 31 */ + *ap->a_retval = kHFSPlusMaxFileNameChars; /* 255 */ break; case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; /* 1024 */ @@ -3942,7 +4572,10 @@ hfs_vnop_pathconf(ap) *ap->a_retval = 200112; /* _POSIX_NO_TRUNC */ break; case _PC_NAME_CHARS_MAX: - *ap->a_retval = kHFSPlusMaxFileNameChars; + if (VTOHFS(ap->a_vp)->hfs_flags & HFS_STANDARD) + *ap->a_retval = kHFSMaxFileNameChars; /* 31 */ + else + *ap->a_retval = kHFSPlusMaxFileNameChars; /* 255 */ break; case _PC_CASE_SENSITIVE: if (VTOHFS(ap->a_vp)->hfs_flags & HFS_CASE_SENSITIVE) @@ -3959,6 +4592,10 @@ hfs_vnop_pathconf(ap) else *ap->a_retval = 64; /* number of bits to store max file size */ break; + case _PC_XATTR_SIZE_BITS: + /* Number of bits to store maximum extended attribute size */ + *ap->a_retval = HFS_XATTR_SIZE_BITS; + break; default: return (EINVAL); } @@ -3975,7 +4612,6 @@ hfs_vnop_pathconf(ap) * * The cnode must be locked exclusive */ -__private_extern__ int hfs_update(struct vnode *vp, __unused int waitfor) { @@ -4040,28 +4676,50 @@ hfs_update(struct vnode *vp, __unused int waitfor) return error; } - /* - * For files with invalid ranges (holes) the on-disk - * field representing the size of the file (cf_size) - * must be no larger than the start of the first hole. + /* + * Modify the values passed to cat_update based on whether or not + * the file has invalid ranges or borrowed blocks. 
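The corrected _PC_NAME_MAX comments and the new _PC_XATTR_SIZE_BITS case above are both queryable via pathconf(2), assuming headers new enough to define the latter:

#include <stdio.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	const char *path = argc > 1 ? argv[1] : "/";

	/* 255 on HFS+, 31 on plain HFS */
	printf("_PC_NAME_MAX        = %ld\n", pathconf(path, _PC_NAME_MAX));

	/* the new case added above: bits to store the max xattr size */
	printf("_PC_XATTR_SIZE_BITS = %ld\n",
	       pathconf(path, _PC_XATTR_SIZE_BITS));
	return 0;
}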
*/ - if (dataforkp && !TAILQ_EMPTY(&cp->c_datafork->ff_invalidranges)) { + if (dataforkp) { + off_t numbytes = 0; + + /* copy the datafork into a temporary copy so we don't pollute the cnode's */ bcopy(dataforkp, &datafork, sizeof(datafork)); - datafork.cf_size = TAILQ_FIRST(&cp->c_datafork->ff_invalidranges)->rl_start; dataforkp = &datafork; - } else if (dataforkp && (cp->c_datafork->ff_unallocblocks != 0)) { - // always make sure the block count and the size - // of the file match the number of blocks actually - // allocated to the file on disk - bcopy(dataforkp, &datafork, sizeof(datafork)); - // make sure that we don't assign a negative block count - if (cp->c_datafork->ff_blocks < cp->c_datafork->ff_unallocblocks) { - panic("hfs: ff_blocks %d is less than unalloc blocks %d\n", - cp->c_datafork->ff_blocks, cp->c_datafork->ff_unallocblocks); + + /* + * If there are borrowed blocks, ensure that they are subtracted + * from the total block count before writing the cnode entry to disk. + * Only extents that have actually been marked allocated in the bitmap + * should be reflected in the total block count for this fork. + */ + if (cp->c_datafork->ff_unallocblocks != 0) { + // make sure that we don't assign a negative block count + if (cp->c_datafork->ff_blocks < cp->c_datafork->ff_unallocblocks) { + panic("hfs: ff_blocks %d is less than unalloc blocks %d\n", + cp->c_datafork->ff_blocks, cp->c_datafork->ff_unallocblocks); + } + + /* Also cap the LEOF to the total number of bytes that are allocated. */ + datafork.cf_blocks = (cp->c_datafork->ff_blocks - cp->c_datafork->ff_unallocblocks); + datafork.cf_size = datafork.cf_blocks * HFSTOVCB(hfsmp)->blockSize; + } + + /* + * For files with invalid ranges (holes) the on-disk + * field representing the size of the file (cf_size) + * must be no larger than the start of the first hole. + * However, note that if the first invalid range exists + * solely within borrowed blocks, then our LEOF and block + * count should both be zero. As a result, set it to the + * min of the current cf_size and the start of the first + * invalid range, because it may have already been reduced + * to zero by the borrowed blocks check above. + */ + if (!TAILQ_EMPTY(&cp->c_datafork->ff_invalidranges)) { + numbytes = TAILQ_FIRST(&cp->c_datafork->ff_invalidranges)->rl_start; + datafork.cf_size = MIN((numbytes), (datafork.cf_size)); } - datafork.cf_blocks = (cp->c_datafork->ff_blocks - cp->c_datafork->ff_unallocblocks); - datafork.cf_size = datafork.cf_blocks * HFSTOVCB(hfsmp)->blockSize; - dataforkp = &datafork; } /* @@ -4098,7 +4756,7 @@ hfs_update(struct vnode *vp, __unused int waitfor) * Allocate a new node * Note - Function does not create and return a vnode for whiteout creation. 
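A worked example of the two capping rules above may help; all numbers here are made up. Take a nominally 1 MB file on a 4 KB-block volume with four loaned (unallocated) blocks and a hole starting at 512 KB:

#include <stdio.h>
#include <sys/types.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
	int64_t blksize          = 4096;
	int64_t ff_blocks        = 256;      /* 1 MB worth of blocks */
	int64_t ff_unallocblocks = 4;        /* 16 KB loaned, not in the bitmap */
	int64_t cf_size          = 1048576;
	int64_t first_hole       = 524288;   /* rl_start of first invalid range */

	/* borrowed blocks: only bitmap-backed blocks reach the catalog */
	int64_t cf_blocks = ff_blocks - ff_unallocblocks;   /* 252 */
	cf_size = cf_blocks * blksize;                      /* 1032192 */

	/* holes: on-disk size may not extend past the first invalid range */
	cf_size = MIN(first_hole, cf_size);                 /* 524288 */

	printf("on-disk cf_blocks=%lld cf_size=%lld\n",
	       (long long)cf_blocks, (long long)cf_size);
	return 0;
}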
*/ -static int +int hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx) { @@ -4113,16 +4771,19 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, int error, started_tr = 0; enum vtype vnodetype; int mode; + int newvnode_flags = 0; + int nocache = 0; + u_int32_t gnv_flags = 0; if ((error = hfs_lock(VTOC(dvp), HFS_EXCLUSIVE_LOCK))) return (error); /* set the cnode pointer only after successfully acquiring lock */ dcp = VTOC(dvp); - + /* Don't allow creation of new entries in open-unlinked directories */ - if ((error = hfs_checkdeleted (dcp))) { - hfs_unlock (dcp); + if ((error = hfs_checkdeleted(dcp))) { + hfs_unlock(dcp); return error; } @@ -4139,6 +4800,13 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, vnodetype = VREG; mode = MAKEIMODE(vnodetype, vap->va_mode); +#if CONFIG_PROTECT + /* If we're creating a regular file on a CP filesystem, then delay caching */ + if ((vnodetype == VREG ) && (cp_fs_protected (VTOVFS(dvp)))) { + nocache = 1; + } +#endif + /* Check if were out of usable disk space. */ if ((hfs_freeblks(hfsmp, 1) == 0) && (vfs_context_suser(ctx) != 0)) { error = ENOSPC; @@ -4169,7 +4837,7 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, VATTR_SET_SUPPORTED(vap, va_flags); attr.ca_flags = vap->va_flags; } - + /* * HFS+ only: all files get ThreadExists * HFSX only: dirs get HasFolderCount @@ -4183,6 +4851,9 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, } } + /* Add the date added to the item */ + hfs_write_dateadded (&attr, attr.ca_atime); + attr.ca_uid = vap->va_uid; attr.ca_gid = vap->va_gid; VATTR_SET_SUPPORTED(vap, va_mode); @@ -4282,6 +4953,11 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, /* Do not create vnode for whiteouts */ if (S_ISWHT(mode)) { goto exit; + } + + gnv_flags |= GNV_CREATE; + if (nocache) { + gnv_flags |= GNV_NOCACHE; } /* @@ -4297,15 +4973,72 @@ hfs_makenode(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, * try to create a new vnode, and then end up reclaiming another shadow vnode to * create the new one. However, if everything is working properly, this should * be a non-issue as we would never enter that reclaim codepath. - * + * * The cnode is locked on successful return. */ - error = hfs_getnewvnode(hfsmp, dvp, cnp, &out_desc, GNV_CREATE, &attr, NULL, &tvp); + error = hfs_getnewvnode(hfsmp, dvp, cnp, &out_desc, gnv_flags, &attr, + NULL, &tvp, &newvnode_flags); if (error) goto exit; cp = VTOC(tvp); *vpp = tvp; + +#if CONFIG_PROTECT + error = cp_entry_create_keys(cp); + /* + * If we fail to create keys, then do NOT allow this vnode to percolate out into the + * namespace. Delete it and return the errno that cp_entry_create_keys generated. + * Luckily, we can do this without issues because the entry was newly created + * and we're still holding the directory cnode lock. Because we prevented it from + * getting inserted into the namecache upon vnode creation, all accesss to this file + * would have to go through the directory, whose lock we are still holding. + */ + if (error) { + /* + * If we fail to remove/recycle the item here, we can't do much about it. Log + * a message to the console and then we can backtrack it. The ultimate error + * that will get emitted to userland will be from the failure to create the EA blob. 
+ */ + int err = hfs_removefile (dvp, tvp, cnp, 0, 0, 0, NULL, 0); + if (err) { + printf("hfs_makenode: removefile failed (%d) for CP file %p\n", err, tvp); + } + hfs_unlock (cp); + err = vnode_recycle (tvp); + if (err) { + printf("hfs_makenode: vnode_recycle failed (%d) for CP file %p\n", err, tvp); + } + /* Drop the iocount on the new vnode to force reclamation/recycling */ + vnode_put (tvp); + cp = NULL; + *vpp = NULL; + } + else { + /* insert item into name cache if it wasn't already inserted.*/ + if (nocache) { + cache_enter (dvp, tvp, cnp); + } + } + +#endif +/* + * If CONFIG_PROTECT is not enabled, then all items will get automatically added into + * the namecache, as nocache will be set to 0. + */ + +#if QUOTA + /* + * Once we create this vnode, we need to initialize its quota data + * structures, if necessary. We know that it is OK to just go ahead and + * initialize because we've already validated earlier (through the hfs_quotacheck + * function) to see if creating this cnode/vnode would cause us to go over quota. + */ + if (hfsmp->hfs_flags & HFS_QUOTAS) { + (void) hfs_getinoquota(cp); + } +#endif + exit: cat_releasedesc(&out_desc); @@ -4330,8 +5063,8 @@ exit: } - -/* hfs_vgetrsrc acquires a resource fork vnode corresponding to the cnode that is +/* + * hfs_vgetrsrc acquires a resource fork vnode corresponding to the cnode that is * found in 'vp'. The rsrc fork vnode is returned with the cnode locked and iocount * on the rsrc vnode. * @@ -4351,10 +5084,9 @@ exit: * there's really no reason to double-check for errors on the cnode. */ -__private_extern__ int -hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, - struct vnode **rvpp, int can_drop_lock, int error_on_unlinked) +hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, struct vnode **rvpp, + int can_drop_lock, int error_on_unlinked) { struct vnode *rvp; struct vnode *dvp = NULLVP; @@ -4363,18 +5095,21 @@ hfs_vgetrsrc(struct hfsmount *hfsmp, struct vnode *vp, int vid; int delete_status = 0; - + if (vnode_vtype(vp) == VDIR) { + return EINVAL; + } + /* - * Need to check the status of the cnode to validate it hasn't - * gone open-unlinked on us before we can actually do work with it. + * Need to check the status of the cnode to validate it hasn't gone + * open-unlinked on us before we can actually do work with it. */ - delete_status = hfs_checkdeleted (cp); + delete_status = hfs_checkdeleted(cp); if ((delete_status) && (error_on_unlinked)) { return delete_status; } restart: - /* Attempt to use exising vnode */ + /* Attempt to use existing vnode */ if ((rvp = cp->c_rsrc_vp)) { vid = vnode_vid(rvp); @@ -4410,11 +5145,10 @@ restart: if ((delete_status = hfs_checkdeleted(cp))) { /* * If error == 0, this means that we succeeded in acquiring an iocount on the - * rsrc fork vnode. However, if we're in this block of code, that - * means that we noticed that the cnode has gone open-unlinked. In - * this case, the caller requested that we not do any other work and - * return an errno. The caller will be responsible for dropping the - * iocount we just acquired because we can't do it until we've released + * rsrc fork vnode. However, if we're in this block of code, that means that we noticed + * that the cnode has gone open-unlinked. In this case, the caller requested that we + * not do any other work and return an errno. The caller will be responsible for + * dropping the iocount we just acquired because we can't do it until we've released * the cnode lock. 
*/ if (error == 0) { @@ -4447,7 +5181,8 @@ restart: struct cat_desc to_desc; char delname[32]; int lockflags; - + int newvnode_flags = 0; + /* * Make sure cnode lock is exclusive, if not upgrade it. * @@ -4478,7 +5213,7 @@ restart: */ if ((error_on_unlinked) && (can_drop_lock)) { - if ((error = hfs_checkdeleted (cp))) { + if ((error = hfs_checkdeleted(cp))) { return error; } } @@ -4530,7 +5265,7 @@ restart: dvp = vnode_getparent(vp); error = hfs_getnewvnode(hfsmp, dvp, cn.cn_pnbuf ? &cn : NULL, descptr, GNV_WANTRSRC | GNV_SKIPLOCK, &cp->c_attr, - &rsrcfork, &rvp); + &rsrcfork, &rvp, &newvnode_flags); if (dvp) vnode_put(dvp); if (cn.cn_pnbuf) @@ -4546,7 +5281,7 @@ restart: /* * Wrapper for special device reads */ -static int +int hfsspec_read(ap) struct vnop_read_args /* { struct vnode *a_vp; @@ -4565,7 +5300,7 @@ hfsspec_read(ap) /* * Wrapper for special device writes */ -static int +int hfsspec_write(ap) struct vnop_write_args /* { struct vnode *a_vp; @@ -4587,7 +5322,7 @@ hfsspec_write(ap) * * Update the times on the cnode then do device close. */ -static int +int hfsspec_close(ap) struct vnop_close_args /* { struct vnode *a_vp; @@ -4680,7 +5415,7 @@ hfsfifo_close(ap) /* * Synchronize a file's in-core state with that on disk. */ -static int +int hfs_vnop_fsync(ap) struct vnop_fsync_args /* { struct vnode *a_vp; @@ -4691,6 +5426,21 @@ hfs_vnop_fsync(ap) struct vnode* vp = ap->a_vp; int error; + /* Note: We check hfs flags instead of vfs mount flag because during + * read-write update, hfs marks itself read-write much earlier than + * the vfs, and hence won't result in skipping of certain writes like + * zero'ing out of unused nodes, creation of hotfiles btree, etc. + */ + if (VTOHFS(vp)->hfs_flags & HFS_READ_ONLY) { + return 0; + } + +#if CONFIG_PROTECT + if ((error = cp_handle_vnop(VTOC(vp), CP_WRITE_ACCESS)) != 0) { + return (error); + } +#endif /* CONFIG_PROTECT */ + /* * We need to allow ENOENT lock errors since unlink * systenm call can call VNOP_FSYNC during vclean. @@ -4706,7 +5456,7 @@ hfs_vnop_fsync(ap) } -static int +int hfs_vnop_whiteout(ap) struct vnop_whiteout_args /* { struct vnode *a_dvp; @@ -4858,7 +5608,7 @@ struct vnodeopv_entry_desc hfs_vnodeop_entries[] = { { &vnop_select_desc, (VOPFUNC)hfs_vnop_select }, /* select */ { &vnop_revoke_desc, (VOPFUNC)nop_revoke }, /* revoke */ { &vnop_exchange_desc, (VOPFUNC)hfs_vnop_exchange }, /* exchange */ - { &vnop_mmap_desc, (VOPFUNC)err_mmap }, /* mmap */ + { &vnop_mmap_desc, (VOPFUNC)hfs_vnop_mmap }, /* mmap */ { &vnop_fsync_desc, (VOPFUNC)hfs_vnop_fsync }, /* fsync */ { &vnop_remove_desc, (VOPFUNC)hfs_vnop_remove }, /* remove */ { &vnop_link_desc, (VOPFUNC)hfs_vnop_link }, /* link */ diff --git a/bsd/hfs/hfs_xattr.c b/bsd/hfs/hfs_xattr.c index 6eec7028b..8091dfaa2 100644 --- a/bsd/hfs/hfs_xattr.c +++ b/bsd/hfs/hfs_xattr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2008 Apple Inc. All rights reserved. + * Copyright (c) 2004-2009 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -35,6 +35,7 @@ #include <sys/vnode.h> #include <sys/xattr.h> #include <sys/fcntl.h> +#include <sys/fsctl.h> #include <sys/vnode_internal.h> #include <sys/kauth.h> @@ -66,7 +67,6 @@ struct listattr_callback_state { #endif /* HFS_COMPRESSION */ }; -#define HFS_MAXATTRIBUTESIZE (128 * 1024) #define HFS_MAXATTRBLKS (32 * 1024) @@ -80,6 +80,8 @@ struct listattr_callback_state { static u_int32_t emptyfinfo[8] = {0}; +static int hfs_zero_dateadded (struct cnode *cp, u_int8_t *finderinfo); + const char hfs_attrdatafilename[] = "Attribute Data"; static int listattr_callback(const HFSPlusAttrKey *key, const HFSPlusAttrData *data, @@ -216,7 +218,7 @@ hfs_vnop_removenamedstream(struct vnop_removenamedstream_args* ap) scp = VTOC(svp); /* Take truncate lock before taking cnode lock. */ - hfs_lock_truncate(scp, TRUE); + hfs_lock_truncate(scp, HFS_EXCLUSIVE_LOCK); if ((error = hfs_lock(scp, HFS_EXCLUSIVE_LOCK))) { goto out; } @@ -225,16 +227,38 @@ hfs_vnop_removenamedstream(struct vnop_removenamedstream_args* ap) } hfs_unlock(scp); out: - hfs_unlock_truncate(scp, TRUE); + hfs_unlock_truncate(scp, 0); return (error); } #endif +/* Zero out the date added field for the specified cnode */ +static int hfs_zero_dateadded (struct cnode *cp, u_int8_t *finderinfo) { + u_int8_t *finfo = finderinfo; + + /* Advance finfo by 16 bytes to the 2nd half of the finderinfo */ + finfo = finfo + 16; + + if (S_ISREG(cp->c_attr.ca_mode)) { + struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; + extinfo->date_added = 0; + } + else if (S_ISDIR(cp->c_attr.ca_mode)) { + struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; + extinfo->date_added = 0; + } + else { + /* Return an error */ + return -1; + } + return 0; + +} + /* * Retrieve the data of an extended attribute. */ -__private_extern__ int hfs_vnop_getxattr(struct vnop_getxattr_args *ap) /* @@ -253,13 +277,7 @@ hfs_vnop_getxattr(struct vnop_getxattr_args *ap) struct cnode *cp; struct hfsmount *hfsmp; uio_t uio = ap->a_uio; - struct BTreeIterator * iterator = NULL; - struct filefork *btfile; - FSBufferDescriptor btdata; - HFSPlusAttrRecord * recp = NULL; size_t bufsize; - u_int16_t datasize; - int lockflags; int result; cp = VTOC(vp); @@ -281,6 +299,9 @@ hfs_vnop_getxattr(struct vnop_getxattr_args *ap) /* Make a copy since we may not export all of it. */ bcopy(cp->c_finderinfo, finderinfo, sizeof(finderinfo)); hfs_unlock(cp); + + /* Zero out the date added field in the local copy */ + hfs_zero_dateadded (cp, finderinfo); /* Don't expose a symlink's private type/creator. */ if (vnode_islnk(vp)) { @@ -347,17 +368,17 @@ hfs_vnop_getxattr(struct vnop_getxattr_args *ap) (result == 0) && (uio_resid(uio) == uio_size)) { /* - we intentionally make the above call to VNOP_READ so that - it can return an authorization/permission/etc. error - based on ap->a_context and thus deny this operation; - in that case, result != 0 and we won't proceed - - however, if result == 0, it will have returned no data - because hfs_vnop_read hid the resource fork - (hence uio_resid(uio) == uio_size, i.e. the uio is untouched) - - in that case, we try again with the decmpfs_ctx context - to get the actual data + * We intentionally make the above call to VNOP_READ so that + * it can return an authorization/permission/etc. Error + * based on ap->a_context and thus deny this operation; + * in that case, result != 0 and we won't proceed. 
+ * + * However, if result == 0, it will have returned no data + * because hfs_vnop_read hid the resource fork + * (hence uio_resid(uio) == uio_size, i.e. the uio is untouched) + * + * In that case, we try again with the decmpfs_ctx context + * to get the actual data */ result = VNOP_READ(rvp, uio, 0, decmpfs_ctx); } @@ -387,24 +408,78 @@ hfs_vnop_getxattr(struct vnop_getxattr_args *ap) if ((result = hfs_lock(cp, HFS_SHARED_LOCK))) { return (result); } - /* Bail if we don't have any extended attributes. */ + + /* Check for non-rsrc, non-finderinfo EAs */ + result = hfs_getxattr_internal (cp, ap, VTOHFS(cp->c_vp), 0); + + hfs_unlock(cp); + + return MacToVFSError(result); +} + + + +/* + * getxattr_internal + * + * We break out this internal function which searches the attributes B-Tree and the + * overflow extents file to find non-resource, non-finderinfo EAs. There may be cases + * where we need to get EAs in contexts where we are already holding the cnode lock, + * and to re-enter hfs_vnop_getxattr would cause us to double-lock the cnode. Instead, + * we can just directly call this function. + * + * We pass the hfsmp argument directly here because we may not necessarily have a cnode to + * operate on. Under normal conditions, we have a file or directory to query, but if we + * are operating on the root directory (id 1), then we may not have a cnode. In this case, if hte + * 'cp' argument is NULL, then we need to use the 'fileid' argument as the entry to manipulate + * + * NOTE: This function assumes the cnode lock for 'cp' is held exclusive or shared. + */ + + +int hfs_getxattr_internal (struct cnode *cp, struct vnop_getxattr_args *ap, + struct hfsmount *hfsmp, u_int32_t fileid) { + + struct filefork *btfile; + struct BTreeIterator * iterator = NULL; + size_t bufsize = 0; + HFSPlusAttrRecord *recp = NULL; + FSBufferDescriptor btdata; + int lockflags = 0; + int result = 0; + u_int16_t datasize = 0; + uio_t uio = ap->a_uio; + u_int32_t target_id = 0; + + if (cp) { + target_id = cp->c_fileid; + } + else { + target_id = fileid; + } + + + /* Bail if we don't have an EA B-Tree. */ if ((hfsmp->hfs_attribute_vp == NULL) || - (cp->c_attr.ca_recflags & kHFSHasAttributesMask) == 0) { - result = ENOATTR; + ((cp) && (cp->c_attr.ca_recflags & kHFSHasAttributesMask) == 0)) { + result = ENOATTR; goto exit; } + + /* Initialize the B-Tree iterator for searching for the proper EA */ btfile = VTOF(hfsmp->hfs_attribute_vp); - + MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); if (iterator == NULL) { result = ENOMEM; goto exit; } bzero(iterator, sizeof(*iterator)); - + bufsize = sizeof(HFSPlusAttrData) - 2; - if (uio) + if (uio) { bufsize += uio_resid(uio); + } bufsize = MAX(bufsize, sizeof(HFSPlusAttrRecord)); MALLOC(recp, HFSPlusAttrRecord *, bufsize, M_TEMP, M_WAITOK); if (recp == NULL) { @@ -414,132 +489,146 @@ hfs_vnop_getxattr(struct vnop_getxattr_args *ap) btdata.bufferAddress = recp; btdata.itemSize = bufsize; btdata.itemCount = 1; + + result = hfs_buildattrkey(target_id, ap->a_name, (HFSPlusAttrKey *)&iterator->key); + if (result) { + goto exit; + } - result = hfs_buildattrkey(VTOC(vp)->c_fileid, ap->a_name, (HFSPlusAttrKey *)&iterator->key); - if (result) - goto exit; - - /* Lookup the attribute. 
*/ + /* Lookup the attribute in the Attribute B-Tree */ lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); result = BTSearchRecord(btfile, iterator, &btdata, &datasize, NULL); hfs_systemfile_unlock(hfsmp, lockflags); - + if (result) { - if (result == btNotFound) + if (result == btNotFound) { result = ENOATTR; + } goto exit; } - + + /* + * Operate differently if we have inline EAs that can fit in the attribute B-Tree or if + * we have extent based EAs. + */ switch (recp->recordType) { - case kHFSPlusAttrInlineData: - /* - * Sanity check record size. It's not required to have any - * user data, so the minimum size is 2 bytes less that the - * size of HFSPlusAttrData (since HFSPlusAttrData struct - * has 2 bytes set aside for attribute data). - */ - if (datasize < (sizeof(HFSPlusAttrData) - 2)) { - printf("hfs_getxattr: %d,%s invalid record size %d (expecting %lu)\n", - VTOC(vp)->c_fileid, ap->a_name, datasize, sizeof(HFSPlusAttrData)); - result = ENOATTR; - break; - } - *ap->a_size = recp->attrData.attrSize; - if (uio && recp->attrData.attrSize != 0) { - if (*ap->a_size > (user_size_t)uio_resid(uio)) - result = ERANGE; - else - result = uiomove((caddr_t) &recp->attrData.attrData , recp->attrData.attrSize, uio); - } - break; - - case kHFSPlusAttrForkData: - if (datasize < sizeof(HFSPlusAttrForkData)) { - printf("hfs_getxattr: %d,%s invalid record size %d (expecting %lu)\n", - VTOC(vp)->c_fileid, ap->a_name, datasize, sizeof(HFSPlusAttrForkData)); - result = ENOATTR; - break; - } - *ap->a_size = recp->forkData.theFork.logicalSize; - if (uio == NULL) { - break; - } - if (*ap->a_size > (user_size_t)uio_resid(uio)) { - result = ERANGE; + /* Attribute fits in the Attribute B-Tree */ + case kHFSPlusAttrInlineData: + /* + * Sanity check record size. It's not required to have any + * user data, so the minimum size is 2 bytes less that the + * size of HFSPlusAttrData (since HFSPlusAttrData struct + * has 2 bytes set aside for attribute data). + */ + if (datasize < (sizeof(HFSPlusAttrData) - 2)) { + printf("hfs_getxattr: %d,%s invalid record size %d (expecting %lu)\n", + target_id, ap->a_name, datasize, sizeof(HFSPlusAttrData)); + result = ENOATTR; + break; + } + *ap->a_size = recp->attrData.attrSize; + if (uio && recp->attrData.attrSize != 0) { + if (*ap->a_size > (user_size_t)uio_resid(uio)) { + result = ERANGE; + } + else { + result = uiomove((caddr_t) &recp->attrData.attrData , recp->attrData.attrSize, uio); + } + } break; - } - /* Process overflow extents if necessary. */ - if (has_overflow_extents(&recp->forkData.theFork)) { - HFSPlusExtentDescriptor *extentbuf; - HFSPlusExtentDescriptor *extentptr; - size_t extentbufsize; - u_int32_t totalblocks; - u_int32_t blkcnt; - u_int32_t attrlen; - - totalblocks = recp->forkData.theFork.totalBlocks; - /* Ignore bogus block counts. */ - if (totalblocks > HFS_MAXATTRBLKS) { - result = ERANGE; + /* Extent-Based EAs */ + case kHFSPlusAttrForkData: { + if (datasize < sizeof(HFSPlusAttrForkData)) { + printf("hfs_getxattr: %d,%s invalid record size %d (expecting %lu)\n", + target_id, ap->a_name, datasize, sizeof(HFSPlusAttrForkData)); + result = ENOATTR; break; } - attrlen = recp->forkData.theFork.logicalSize; - - /* Get a buffer to hold the worst case amount of extents. 
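 *  ("worst case" = every allocation block in its own discontiguous extent,
 *   i.e. one HFSPlusExtentDescriptor per block, rounded up to whole
 *   HFSPlusExtentRecords of 8 descriptors each)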
*/ - extentbufsize = totalblocks * sizeof(HFSPlusExtentDescriptor); - extentbufsize = roundup(extentbufsize, sizeof(HFSPlusExtentRecord)); - MALLOC(extentbuf, HFSPlusExtentDescriptor *, extentbufsize, M_TEMP, M_WAITOK); - if (extentbuf == NULL) { - result = ENOMEM; + *ap->a_size = recp->forkData.theFork.logicalSize; + if (uio == NULL) { break; } - bzero(extentbuf, extentbufsize); - extentptr = extentbuf; - - /* Grab the first 8 extents. */ - bcopy(&recp->forkData.theFork.extents[0], extentptr, sizeof(HFSPlusExtentRecord)); - extentptr += kHFSPlusExtentDensity; - blkcnt = count_extent_blocks(totalblocks, recp->forkData.theFork.extents); - - /* Now lookup the overflow extents. */ - lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); - while (blkcnt < totalblocks) { - ((HFSPlusAttrKey *)&iterator->key)->startBlock = blkcnt; - result = BTSearchRecord(btfile, iterator, &btdata, &datasize, NULL); - if (result || - (recp->recordType != kHFSPlusAttrExtents) || - (datasize < sizeof(HFSPlusAttrExtents))) { - printf("hfs_getxattr: %s missing extents, only %d blks of %d found\n", - ap->a_name, blkcnt, totalblocks); - result = ENOATTR; - break; /* break from while */ + if (*ap->a_size > (user_size_t)uio_resid(uio)) { + result = ERANGE; + break; + } + /* Process overflow extents if necessary. */ + if (has_overflow_extents(&recp->forkData.theFork)) { + HFSPlusExtentDescriptor *extentbuf; + HFSPlusExtentDescriptor *extentptr; + size_t extentbufsize; + u_int32_t totalblocks; + u_int32_t blkcnt; + u_int32_t attrlen; + + totalblocks = recp->forkData.theFork.totalBlocks; + /* Ignore bogus block counts. */ + if (totalblocks > HFS_MAXATTRBLKS) { + result = ERANGE; + break; + } + attrlen = recp->forkData.theFork.logicalSize; + + /* Get a buffer to hold the worst case amount of extents. */ + extentbufsize = totalblocks * sizeof(HFSPlusExtentDescriptor); + extentbufsize = roundup(extentbufsize, sizeof(HFSPlusExtentRecord)); + MALLOC(extentbuf, HFSPlusExtentDescriptor *, extentbufsize, M_TEMP, M_WAITOK); + if (extentbuf == NULL) { + result = ENOMEM; + break; } - /* Grab the next 8 extents. */ - bcopy(&recp->overflowExtents.extents[0], extentptr, sizeof(HFSPlusExtentRecord)); + bzero(extentbuf, extentbufsize); + extentptr = extentbuf; + + /* Grab the first 8 extents. */ + bcopy(&recp->forkData.theFork.extents[0], extentptr, sizeof(HFSPlusExtentRecord)); extentptr += kHFSPlusExtentDensity; - blkcnt += count_extent_blocks(totalblocks, recp->overflowExtents.extents); - } - hfs_systemfile_unlock(hfsmp, lockflags); - - if (blkcnt < totalblocks) { - result = ENOATTR; - } else { - result = read_attr_data(hfsmp, uio, attrlen, extentbuf); + blkcnt = count_extent_blocks(totalblocks, recp->forkData.theFork.extents); + + /* Now lookup the overflow extents. */ + lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK); + while (blkcnt < totalblocks) { + ((HFSPlusAttrKey *)&iterator->key)->startBlock = blkcnt; + result = BTSearchRecord(btfile, iterator, &btdata, &datasize, NULL); + if (result || + (recp->recordType != kHFSPlusAttrExtents) || + (datasize < sizeof(HFSPlusAttrExtents))) { + printf("hfs_getxattr: %s missing extents, only %d blks of %d found\n", + ap->a_name, blkcnt, totalblocks); + result = ENOATTR; + break; /* break from while */ + } + /* Grab the next 8 extents. 
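 *  Overflow records are keyed by the attribute name plus the startBlock
 *  where the run resumes, so the loop re-seeds the iterator key and pulls
 *  eight descriptors per lookup -- a restatement of the code below:
 *
 *      ((HFSPlusAttrKey *)&iterator->key)->startBlock = blkcnt;
 *      BTSearchRecord(btfile, iterator, &btdata, &datasize, NULL);
 *      blkcnt += count_extent_blocks(totalblocks, recp->overflowExtents.extents);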
*/ + bcopy(&recp->overflowExtents.extents[0], extentptr, sizeof(HFSPlusExtentRecord)); + extentptr += kHFSPlusExtentDensity; + blkcnt += count_extent_blocks(totalblocks, recp->overflowExtents.extents); + } + + /* Release Attr B-Tree lock */ + hfs_systemfile_unlock(hfsmp, lockflags); + + if (blkcnt < totalblocks) { + result = ENOATTR; + } + else { + result = read_attr_data(hfsmp, uio, attrlen, extentbuf); + } + FREE(extentbuf, M_TEMP); + + } + else /* No overflow extents. */ { + result = read_attr_data(hfsmp, uio, recp->forkData.theFork.logicalSize, recp->forkData.theFork.extents); } - FREE(extentbuf, M_TEMP); - - } else /* No overflow extents. */ { - result = read_attr_data(hfsmp, uio, recp->forkData.theFork.logicalSize, recp->forkData.theFork.extents); + break; } - break; - - default: - result = ENOATTR; - break; + + default: + /* We only support Extent or inline EAs. Default to ENOATTR for anything else */ + result = ENOATTR; + break; } -exit: - hfs_unlock(cp); - + +exit: if (iterator) { FREE(iterator, M_TEMP); } @@ -547,13 +636,14 @@ exit: FREE(recp, M_TEMP); } - return MacToVFSError(result); + return result; + } + /* * Set the data of an extended attribute. */ -__private_extern__ int hfs_vnop_setxattr(struct vnop_setxattr_args *ap) /* @@ -571,19 +661,10 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) struct cnode *cp = NULL; struct hfsmount *hfsmp; uio_t uio = ap->a_uio; - struct BTreeIterator * iterator = NULL; - struct filefork *btfile = NULL; size_t attrsize; - FSBufferDescriptor btdata; - HFSPlusAttrRecord *recp = NULL; - HFSPlusExtentDescriptor *extentptr = NULL; - HFSPlusAttrRecord attrdata; /* 90 bytes */ void * user_data_ptr = NULL; - int started_transaction = 0; - int lockflags = 0; - int exists; - int allocatedblks = 0; int result; + time_t orig_ctime=VTOC(vp)->c_ctime; if (ap->a_name == NULL || ap->a_name[0] == '\0') { return (EINVAL); /* invalid name */ @@ -599,6 +680,8 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) if (result != 0) return result; } + + check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_METADATA_WRITE_OP, NULL); #endif /* HFS_COMPRESSION */ /* Set the Finder Info. */ @@ -606,7 +689,9 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) u_int8_t finderinfo[32]; struct FndrFileInfo *fip; void * finderinfo_start; + u_int8_t *finfo = NULL; u_int16_t fdFlags; + u_int32_t dateadded = 0; attrsize = sizeof(VTOC(vp)->c_finderinfo); @@ -641,6 +726,12 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) } } + /* Grab the current date added from the cnode */ + dateadded = hfs_get_dateadded (cp); + + /* Zero out the date added field to ignore user's attempts to set it */ + hfs_zero_dateadded(cp, finderinfo); + if (bcmp(finderinfo_start, emptyfinfo, attrsize)) { /* attr exists and "create" was specified. */ if (ap->a_options & XATTR_CREATE) { @@ -654,12 +745,33 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) return (ENOATTR); } } + + /* + * Now restore the date added to the finderinfo to be written out. + * Advance to the 2nd half of the finderinfo to write out the date added + * into the buffer. + * + * Make sure to endian swap the date added back into big endian. When we used + * hfs_get_dateadded above to retrieve it, it swapped into local endianness + * for us. But now that we're writing it out, put it back into big endian. 
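 *
 * Layout being relied on: c_finderinfo is 32 bytes, with the classic
 * FndrFileInfo/FndrDirInfo in bytes 0-15 and the FndrExtendedFileInfo
 * (or FndrExtendedDirInfo) in bytes 16-31, which is where date_added
 * lives -- hence the &finderinfo[16] below.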
+ */ + finfo = &finderinfo[16]; + + if (S_ISREG(cp->c_attr.ca_mode)) { + struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; + extinfo->date_added = OSSwapHostToBigInt32(dateadded); + } + else if (S_ISDIR(cp->c_attr.ca_mode)) { + struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; + extinfo->date_added = OSSwapHostToBigInt32(dateadded); + } + /* Set the cnode's Finder Info. */ if (attrsize == sizeof(cp->c_finderinfo)) bcopy(&finderinfo[0], finderinfo_start, attrsize); else bcopy(&finderinfo[8], finderinfo_start, attrsize); - + /* Updating finderInfo updates change time and modified time */ cp->c_touch_chgtime = TRUE; cp->c_flag |= C_MODIFIED; @@ -724,32 +836,29 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) if (result) { return (result); } - /* - * VNOP_WRITE marks the vnode as needing a modtime update. - */ + /* VNOP_WRITE marks cnode as needing a modtime update */ result = VNOP_WRITE(rvp, uio, 0, ap->a_context); - /* if open unlinked, force it inactive and recycle */ + /* if open unlinked, force it inactive */ if (openunlinked) { int vref; vref = vnode_ref (rvp); if (vref == 0) { vnode_rele(rvp); } - vnode_recycle (rvp); + vnode_recycle (rvp); } else { - /* re-lock the cnode so we can update the modtimes */ - if ((result = hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK))) { - vnode_recycle(rvp); + /* cnode is not open-unlinked, so re-lock cnode to sync */ + if ((result = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) { + vnode_recycle (rvp); vnode_put(rvp); - return (result); + return result; } - - /* HFS fsync the resource fork to force it out to disk */ - result = hfs_fsync (rvp, MNT_NOWAIT, 0, vfs_context_proc(ap->a_context)); - - hfs_unlock(cp); + + /* hfs fsync rsrc fork to force to disk and update modtime */ + result = hfs_fsync (rvp, MNT_NOWAIT, 0, vfs_context_proc (ap->a_context)); + hfs_unlock (cp); } vnode_put(rvp); @@ -764,8 +873,9 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) attrsize = uio_resid(uio); /* Enforce an upper limit. */ - if (attrsize > HFS_MAXATTRIBUTESIZE) { - return (E2BIG); + if (attrsize > HFS_XATTR_MAXSIZE) { + result = E2BIG; + goto exit; } /* @@ -791,23 +901,82 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) goto exit; } cp = VTOC(vp); + + /* + * If we're trying to set a non-finderinfo, non-resourcefork EA, then + * call the breakout function. + */ + result = hfs_setxattr_internal (cp, user_data_ptr, attrsize, ap, VTOHFS(vp), 0); + exit: + if (cp) { + hfs_unlock(cp); + } + if (user_data_ptr) { + FREE(user_data_ptr, M_TEMP); + } + + return (result == btNotFound ? ENOATTR : MacToVFSError(result)); +} + + +/* + * hfs_setxattr_internal + * + * Internal function to set non-rsrc, non-finderinfo EAs to either the attribute B-Tree or + * extent-based EAs. + * + * See comments from hfs_getxattr_internal on why we need to pass 'hfsmp' and fileid here. + * The gist is that we could end up writing to the root folder which may not have a cnode. + * + * Assumptions: + * 1. cnode 'cp' is locked EXCLUSIVE before calling this function. + * 2. data_ptr contains data to be written. If gathering data from userland, this must be + * done before calling this function. + * 3. If data originates entirely in-kernel, use a null UIO, and ensure the size is less than + * hfsmp->hfs_max_inline_attrsize bytes long. 
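 *
 * Caller-side sketch under those assumptions (the args packing is
 * illustrative and shows only the fields this function consumes;
 * 'len' must stay under hfsmp->hfs_max_inline_attrsize per point 3):
 *
 *     struct vnop_setxattr_args args;
 *     args.a_name = name;
 *     args.a_uio  = NULL;                       -- data already in-kernel
 *     result = hfs_setxattr_internal(cp, (caddr_t)buf, len, &args,
 *                                    hfsmp, 0);  -- 0: use cp->c_fileid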
+ */ +int hfs_setxattr_internal (struct cnode *cp, caddr_t data_ptr, size_t attrsize, + struct vnop_setxattr_args *ap, struct hfsmount *hfsmp, + u_int32_t fileid) { + uio_t uio = ap->a_uio; + struct vnode *vp = ap->a_vp; + int started_transaction = 0; + struct BTreeIterator * iterator = NULL; + struct filefork *btfile = NULL; + FSBufferDescriptor btdata; + HFSPlusAttrRecord attrdata; /* 90 bytes */ + HFSPlusAttrRecord *recp = NULL; + HFSPlusExtentDescriptor *extentptr = NULL; + int result = 0; + int lockflags = 0; + int exists = 0; + int allocatedblks = 0; + u_int32_t target_id; + + if (cp) { + target_id = cp->c_fileid; + } + else { + target_id = fileid; + } + /* Start a transaction for our changes. */ if (hfs_start_transaction(hfsmp) != 0) { result = EINVAL; goto exit; } started_transaction = 1; - + /* * Once we started the transaction, nobody can compete * with us, so make sure this file is still there. */ - if (cp->c_flag & C_NOEXISTS) { + if ((cp) && (cp->c_flag & C_NOEXISTS)) { result = ENOENT; goto exit; } - + /* * If there isn't an attributes b-tree then create one. */ @@ -821,10 +990,10 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) if (hfsmp->hfs_max_inline_attrsize == 0) { hfsmp->hfs_max_inline_attrsize = getmaxinlineattrsize(hfsmp->hfs_attribute_vp); } - + /* Take exclusive access to the attributes b-tree. */ lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); - + /* Build the b-tree key. */ MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); if (iterator == NULL) { @@ -832,18 +1001,18 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) goto exit; } bzero(iterator, sizeof(*iterator)); - result = hfs_buildattrkey(VTOC(vp)->c_fileid, ap->a_name, (HFSPlusAttrKey *)&iterator->key); + result = hfs_buildattrkey(target_id, ap->a_name, (HFSPlusAttrKey *)&iterator->key); if (result) { goto exit; } - + /* Preflight for replace/create semantics. */ btfile = VTOF(hfsmp->hfs_attribute_vp); btdata.bufferAddress = &attrdata; btdata.itemSize = sizeof(attrdata); btdata.itemCount = 1; exists = BTSearchRecord(btfile, iterator, &btdata, NULL, NULL) == 0; - + /* Replace requires that the attribute already exists. */ if ((ap->a_options & XATTR_REPLACE) && !exists) { result = ENOATTR; @@ -854,6 +1023,7 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) result = EEXIST; goto exit; } + /* If it won't fit inline then use extent-based attributes. */ if (attrsize > hfsmp->hfs_max_inline_attrsize) { size_t extentbufsize; @@ -861,13 +1031,17 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) int extentblks; u_int32_t *keystartblk; int i; - - /* Check if volume supports extent-based attributes */ - if ((hfsmp->hfs_flags & HFS_XATTR_EXTENTS) == 0) { - result = E2BIG; + + if (uio == NULL) { + /* + * setxattrs originating from in-kernel are not supported if they are bigger + * than the inline max size. Just return ENOATTR and force them to do it with a + * smaller EA. + */ + result = EPERM; goto exit; - } - + } + /* Get some blocks. */ blkcnt = howmany(attrsize, hfsmp->blockSize); extentbufsize = blkcnt * sizeof(HFSPlusExtentDescriptor); @@ -886,11 +1060,13 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) /* Copy data into the blocks. */ result = write_attr_data(hfsmp, uio, attrsize, extentptr); if (result) { - const char *name = vnode_getname(vp); - printf("hfs_setxattr: write_attr_data err (%d) %s:%s\n", - result, name ? 
name : "", ap->a_name); - if (name) - vnode_putname(name); + if (vp) { + const char *name = vnode_getname(vp); + printf("hfs_setxattr: write_attr_data err (%d) %s:%s\n", + result, name ? name : "", ap->a_name); + if (name) + vnode_putname(name); + } goto exit; } @@ -898,15 +1074,16 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) if (exists) { result = remove_attribute_records(hfsmp, iterator); if (result) { - const char *name = vnode_getname(vp); - printf("hfs_setxattr: remove_attribute_records err (%d) %s:%s\n", - result, name ? name : "", ap->a_name); - if (name) - vnode_putname(name); - goto exit; + if (vp) { + const char *name = vnode_getname(vp); + printf("hfs_setxattr: remove_attribute_records err (%d) %s:%s\n", + result, name ? name : "", ap->a_name); + if (name) + vnode_putname(name); + } + goto exit; } } - /* Create attribute fork data record. */ MALLOC(recp, HFSPlusAttrRecord *, sizeof(HFSPlusAttrRecord), M_TEMP, M_WAITOK); if (recp == NULL) { @@ -916,32 +1093,27 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) btdata.bufferAddress = recp; btdata.itemCount = 1; btdata.itemSize = sizeof(HFSPlusAttrForkData); - + recp->recordType = kHFSPlusAttrForkData; recp->forkData.reserved = 0; recp->forkData.theFork.logicalSize = attrsize; recp->forkData.theFork.clumpSize = 0; recp->forkData.theFork.totalBlocks = blkcnt; bcopy(extentptr, recp->forkData.theFork.extents, sizeof(HFSPlusExtentRecord)); - - (void) hfs_buildattrkey(VTOC(vp)->c_fileid, ap->a_name, (HFSPlusAttrKey *)&iterator->key); - + + (void) hfs_buildattrkey(target_id, ap->a_name, (HFSPlusAttrKey *)&iterator->key); + result = BTInsertRecord(btfile, iterator, &btdata, btdata.itemSize); if (result) { -#if HFS_XATTR_VERBOSE - const char *name = vnode_getname(vp); - printf("hfs_setxattr: BTInsertRecord err (%d) %s:%s\n", - MacToVFSError(result), name ? name : "", ap->a_name); - if (name) - vnode_putname(name); -#endif + printf ("hfs_setxattr: BTInsertRecord() - %d,%s err=%d\n", + target_id, ap->a_name, result); goto exit; } extentblks = count_extent_blocks(blkcnt, recp->forkData.theFork.extents); blkcnt -= extentblks; keystartblk = &((HFSPlusAttrKey *)&iterator->key)->startBlock; i = 0; - + /* Create overflow extents as needed. */ while (blkcnt > 0) { /* Initialize the key and record. */ @@ -949,31 +1121,29 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) btdata.itemSize = sizeof(HFSPlusAttrExtents); recp->recordType = kHFSPlusAttrExtents; recp->overflowExtents.reserved = 0; - + /* Copy the next set of extents. */ i += kHFSPlusExtentDensity; bcopy(&extentptr[i], recp->overflowExtents.extents, sizeof(HFSPlusExtentRecord)); - + result = BTInsertRecord(btfile, iterator, &btdata, btdata.itemSize); if (result) { - const char *name = vnode_getname(vp); - printf("hfs_setxattr: BTInsertRecord err (%d) %s:%s\n", - MacToVFSError(result), name ? name : "", ap->a_name); - if (name) - vnode_putname(name); + printf ("hfs_setxattr: BTInsertRecord() overflow - %d,%s err=%d\n", + target_id, ap->a_name, result); goto exit; } extentblks = count_extent_blocks(blkcnt, recp->overflowExtents.extents); blkcnt -= extentblks; } - } else /* Inline data */ { + } + else { /* Inline data */ if (exists) { result = remove_attribute_records(hfsmp, iterator); if (result) { goto exit; } } - + /* Calculate size of record rounded up to multiple of 2 bytes. */ btdata.itemSize = sizeof(HFSPlusAttrData) - 2 + attrsize + ((attrsize & 1) ? 
1 : 0); MALLOC(recp, HFSPlusAttrRecord *, btdata.itemSize, M_TEMP, M_WAITOK); @@ -985,24 +1155,36 @@ hfs_vnop_setxattr(struct vnop_setxattr_args *ap) recp->attrData.reserved[0] = 0; recp->attrData.reserved[1] = 0; recp->attrData.attrSize = attrsize; - + /* Copy in the attribute data (if any). */ if (attrsize > 0) { - if (user_data_ptr) - bcopy(user_data_ptr, &recp->attrData.attrData, attrsize); - else + if (data_ptr) { + bcopy(data_ptr, &recp->attrData.attrData, attrsize); + } + else { + /* + * A null UIO meant it originated in-kernel. If they didn't supply data_ptr + * then deny the copy operation. + */ + if (uio == NULL) { + result = EPERM; + goto exit; + } result = uiomove((caddr_t)&recp->attrData.attrData, attrsize, uio); + } + if (result) { goto exit; } } - - (void) hfs_buildattrkey(VTOC(vp)->c_fileid, ap->a_name, (HFSPlusAttrKey *)&iterator->key); - + + (void) hfs_buildattrkey(target_id, ap->a_name, (HFSPlusAttrKey *)&iterator->key); + btdata.bufferAddress = recp; btdata.itemCount = 1; result = BTInsertRecord(btfile, iterator, &btdata, btdata.itemSize); } + exit: if (btfile && started_transaction) { (void) BTFlushPath(btfile); @@ -1011,16 +1193,18 @@ exit: hfs_systemfile_unlock(hfsmp, lockflags); } if (result == 0) { - cp = VTOC(vp); - /* Setting an attribute only updates change time and not - * modified time of the file. - */ - cp->c_touch_chgtime = TRUE; - cp->c_attr.ca_recflags |= kHFSHasAttributesMask; - if ((bcmp(ap->a_name, KAUTH_FILESEC_XATTR, sizeof(KAUTH_FILESEC_XATTR)) == 0)) { - cp->c_attr.ca_recflags |= kHFSHasSecurityMask; + if (vp) { + cp = VTOC(vp); + /* Setting an attribute only updates change time and not + * modified time of the file. + */ + cp->c_touch_chgtime = TRUE; + cp->c_attr.ca_recflags |= kHFSHasAttributesMask; + if ((bcmp(ap->a_name, KAUTH_FILESEC_XATTR, sizeof(KAUTH_FILESEC_XATTR)) == 0)) { + cp->c_attr.ca_recflags |= kHFSHasSecurityMask; + } + (void) hfs_update(vp, 0); } - (void) hfs_update(vp, 0); } if (started_transaction) { if (result && allocatedblks) { @@ -1028,12 +1212,7 @@ exit: } hfs_end_transaction(hfsmp); } - if (cp) { - hfs_unlock(cp); - } - if (user_data_ptr) { - FREE(user_data_ptr, M_TEMP); - } + if (recp) { FREE(recp, M_TEMP); } @@ -1043,13 +1222,16 @@ exit: if (iterator) { FREE(iterator, M_TEMP); } - return (result == btNotFound ? ENOATTR : MacToVFSError(result)); + + return result; } + + + /* * Remove an extended attribute. */ -__private_extern__ int hfs_vnop_removexattr(struct vnop_removexattr_args *ap) /* @@ -1068,6 +1250,7 @@ hfs_vnop_removexattr(struct vnop_removexattr_args *ap) struct BTreeIterator * iterator = NULL; int lockflags; int result; + time_t orig_ctime=VTOC(vp)->c_ctime; if (ap->a_name == NULL || ap->a_name[0] == '\0') { return (EINVAL); /* invalid name */ @@ -1078,8 +1261,11 @@ hfs_vnop_removexattr(struct vnop_removexattr_args *ap) } #if HFS_COMPRESSION - if (hfs_hides_xattr(ap->a_context, VTOC(vp), ap->a_name, 1) && !(ap->a_options & XATTR_SHOWCOMPRESSION)) + if (hfs_hides_xattr(ap->a_context, VTOC(vp), ap->a_name, 1) && !(ap->a_options & XATTR_SHOWCOMPRESSION)) { return ENOATTR; + } + + check_for_tracked_file(vp, orig_ctime, NAMESPACE_HANDLER_METADATA_DELETE_OP, NULL); #endif /* HFS_COMPRESSION */ /* If Resource Fork is non-empty then truncate it. 
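 *
 * (Lock/transaction order used below: truncate lock, then cnode lock,
 *  then journal transaction, released in reverse on the error paths --
 *
 *      hfs_lock_truncate(VTOC(rvp), HFS_EXCLUSIVE_LOCK);
 *      hfs_lock(VTOC(rvp), HFS_EXCLUSIVE_LOCK);
 *      hfs_start_transaction(hfsmp);
 *
 *  -- a restatement of the calls that follow, not new API.)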
*/ @@ -1102,9 +1288,9 @@ hfs_vnop_removexattr(struct vnop_removexattr_args *ap) return (result); } - hfs_lock_truncate(VTOC(rvp), TRUE); + hfs_lock_truncate(VTOC(rvp), HFS_EXCLUSIVE_LOCK); if ((result = hfs_lock(VTOC(rvp), HFS_EXCLUSIVE_LOCK))) { - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); vnode_put(rvp); return (result); } @@ -1113,7 +1299,7 @@ hfs_vnop_removexattr(struct vnop_removexattr_args *ap) * hfs_truncate() and hfs_update() */ if ((result = hfs_start_transaction(hfsmp))) { - hfs_unlock_truncate(cp, TRUE); + hfs_unlock_truncate(cp, 0); hfs_unlock(cp); vnode_put(rvp); return (result); @@ -1127,7 +1313,7 @@ hfs_vnop_removexattr(struct vnop_removexattr_args *ap) } hfs_end_transaction(hfsmp); - hfs_unlock_truncate(VTOC(rvp), TRUE); + hfs_unlock_truncate(VTOC(rvp), 0); hfs_unlock(VTOC(rvp)); vnode_put(rvp); @@ -1137,34 +1323,80 @@ hfs_vnop_removexattr(struct vnop_removexattr_args *ap) if (bcmp(ap->a_name, XATTR_FINDERINFO_NAME, sizeof(XATTR_FINDERINFO_NAME)) == 0) { void * finderinfo_start; int finderinfo_size; - + u_int8_t finderinfo[32]; + u_int32_t date_added; + u_int8_t *finfo = NULL; + if ((result = hfs_lock(cp, HFS_EXCLUSIVE_LOCK))) { return (result); } - - /* Symlink's don't have an external type/creator. */ + + /* Use the local copy to store our temporary changes. */ + bcopy(cp->c_finderinfo, finderinfo, sizeof(finderinfo)); + + + /* Zero out the date added field in the local copy */ + hfs_zero_dateadded (cp, finderinfo); + + /* Don't expose a symlink's private type/creator. */ if (vnode_islnk(vp)) { - /* Skip over type/creator fields. */ + struct FndrFileInfo *fip; + + fip = (struct FndrFileInfo *)&finderinfo; + fip->fdType = 0; + fip->fdCreator = 0; + } + + /* Do the byte compare against the local copy */ + if (bcmp(finderinfo, emptyfinfo, sizeof(emptyfinfo)) == 0) { + hfs_unlock (cp); + return (ENOATTR); + } + + /* + * If there was other content, zero out everything except + * type/creator and date added. First, save the date added. + */ + finfo = cp->c_finderinfo; + finfo = finfo + 16; + if (S_ISREG(cp->c_attr.ca_mode)) { + struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; + date_added = extinfo->date_added; + } + else if (S_ISDIR(cp->c_attr.ca_mode)) { + struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; + date_added = extinfo->date_added; + } + + if (vnode_islnk(vp)) { + /* Ignore type/creator */ finderinfo_start = &cp->c_finderinfo[8]; finderinfo_size = sizeof(cp->c_finderinfo) - 8; - } else { + } + else { finderinfo_start = &cp->c_finderinfo[0]; finderinfo_size = sizeof(cp->c_finderinfo); } - if (bcmp(finderinfo_start, emptyfinfo, finderinfo_size) == 0) { - hfs_unlock(cp); - return (ENOATTR); - } - bzero(finderinfo_start, finderinfo_size); - + + + /* Now restore the date added */ + if (S_ISREG(cp->c_attr.ca_mode)) { + struct FndrExtendedFileInfo *extinfo = (struct FndrExtendedFileInfo *)finfo; + extinfo->date_added = date_added; + } + else if (S_ISDIR(cp->c_attr.ca_mode)) { + struct FndrExtendedDirInfo *extinfo = (struct FndrExtendedDirInfo *)finfo; + extinfo->date_added = date_added; + } + /* Updating finderInfo updates change time and modified time */ cp->c_touch_chgtime = TRUE; cp->c_flag |= C_MODIFIED; hfs_update(vp, FALSE); - + hfs_unlock(cp); - + return (0); } /* @@ -1305,7 +1537,7 @@ out: * - The Allocation Bitmap file must be locked exclusive. * - The iterator key must be initialized. 
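 *
 * Caller sketch matching those preconditions (the combined lock call is
 * the pattern used by the callers in this file; treat it as illustrative):
 *
 *     lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE | SFL_BITMAP,
 *                                     HFS_EXCLUSIVE_LOCK);
 *     (void) hfs_buildattrkey(fileid, name, (HFSPlusAttrKey *)&iterator->key);
 *     result = remove_attribute_records(hfsmp, iterator);
 *     hfs_systemfile_unlock(hfsmp, lockflags);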
*/ -static int +int remove_attribute_records(struct hfsmount *hfsmp, BTreeIterator * iterator) { struct filefork *btfile; @@ -1334,11 +1566,9 @@ remove_attribute_records(struct hfsmount *hfsmp, BTreeIterator * iterator) int extentblks; u_int32_t *keystartblk; -#if HFS_XATTR_VERBOSE if (datasize < sizeof(HFSPlusAttrForkData)) { - printf("hfs: remove_attribute_records: bad record size %d (expecting %d)\n", datasize, sizeof(HFSPlusAttrForkData)); + printf("hfs: remove_attribute_records: bad record size %d (expecting %lu)\n", datasize, sizeof(HFSPlusAttrForkData)); } -#endif totalblks = attrdata.forkData.theFork.totalBlocks; /* Process the first 8 extents. */ @@ -1385,7 +1615,6 @@ exit: /* * Retrieve the list of extended attribute names. */ -__private_extern__ int hfs_vnop_listxattr(struct vnop_listxattr_args *ap) /* @@ -1405,12 +1634,11 @@ hfs_vnop_listxattr(struct vnop_listxattr_args *ap) struct BTreeIterator * iterator = NULL; struct filefork *btfile; struct listattr_callback_state state; - void * finderinfo_start; - int finderinfo_size; user_addr_t user_start = 0; user_size_t user_len = 0; int lockflags; int result; + u_int8_t finderinfo[32]; if (VNODE_IS_RSRC(vp)) { return (EPERM); @@ -1427,17 +1655,26 @@ hfs_vnop_listxattr(struct vnop_listxattr_args *ap) return (result); } + /* + * Make a copy of the cnode's finderinfo to a local so we can + * zero out the date added field. Also zero out the private type/creator + * for symlinks. + */ + bcopy(cp->c_finderinfo, finderinfo, sizeof(finderinfo)); + hfs_zero_dateadded (cp, finderinfo); + /* Don't expose a symlink's private type/creator. */ if (vnode_islnk(vp)) { - /* Skip over type/creator fields. */ - finderinfo_start = &cp->c_finderinfo[8]; - finderinfo_size = sizeof(cp->c_finderinfo) - 8; - } else { - finderinfo_start = &cp->c_finderinfo[0]; - finderinfo_size = sizeof(cp->c_finderinfo); - } + struct FndrFileInfo *fip; + + fip = (struct FndrFileInfo *)&finderinfo; + fip->fdType = 0; + fip->fdCreator = 0; + } + + /* If Finder Info is non-empty then export it's name. */ - if (bcmp(finderinfo_start, emptyfinfo, finderinfo_size) != 0) { + if (bcmp(finderinfo, emptyfinfo, sizeof(emptyfinfo)) != 0) { if (uio == NULL) { *ap->a_size += sizeof(XATTR_FINDERINFO_NAME); } else if ((user_size_t)uio_resid(uio) < sizeof(XATTR_FINDERINFO_NAME)) { @@ -1546,11 +1783,9 @@ exit: if (user_start) { vsunlock(user_start, user_len, TRUE); } - if (iterator) { FREE(iterator, M_TEMP); } - hfs_unlock(cp); return MacToVFSError(result); @@ -1558,7 +1793,7 @@ exit: /* - * Callback - called for each attribute + * Callback - called for each attribute record */ static int listattr_callback(const HFSPlusAttrKey *key, __unused const HFSPlusAttrData *data, struct listattr_callback_state *state) @@ -1621,7 +1856,6 @@ listattr_callback(const HFSPlusAttrKey *key, __unused const HFSPlusAttrData *dat * This function takes the necessary locks on the attribute * b-tree file and the allocation (bitmap) file. */ -__private_extern__ int hfs_removeallattr(struct hfsmount *hfsmp, u_int32_t fileid) { @@ -1699,10 +1933,8 @@ hfs_xattr_init(struct hfsmount * hfsmp) /* * Enable/Disable volume attributes stored as EA for root file system. * Supported attributes are - - * 1. ACLs - * 2. Extent-based Extended Attributes + * 1. 
Extent-based Extended Attributes */ -__private_extern__ int hfs_set_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype, int state) { @@ -1714,6 +1946,9 @@ hfs_set_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype, int state) if (hfsmp->hfs_flags & HFS_STANDARD) { return (ENOTSUP); } + if (xattrtype != HFS_SET_XATTREXTENTS_STATE) { + return EINVAL; + } /* * If there isn't an attributes b-tree then create one. @@ -1736,18 +1971,8 @@ hfs_set_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype, int state) * Build a b-tree key. * We use the root's parent id (1) to hold this volume attribute. */ - if (xattrtype == HFS_SETACLSTATE) { - /* ACL */ - (void) hfs_buildattrkey(kHFSRootParentID, XATTR_EXTENDEDSECURITY_NAME, - (HFSPlusAttrKey *)&iterator->key); - } else if (xattrtype == HFS_SET_XATTREXTENTS_STATE) { - /* Extent-based extended attributes */ - (void) hfs_buildattrkey(kHFSRootParentID, XATTR_XATTREXTENTS_NAME, - (HFSPlusAttrKey *)&iterator->key); - } else { - result = EINVAL; - goto exit; - } + (void) hfs_buildattrkey(kHFSRootParentID, XATTR_XATTREXTENTS_NAME, + (HFSPlusAttrKey *)&iterator->key); /* Start a transaction for our changes. */ if (hfs_start_transaction(hfsmp) != 0) { @@ -1790,91 +2015,21 @@ hfs_set_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype, int state) /* Finish the transaction of our changes. */ hfs_end_transaction(hfsmp); -exit: - if (iterator) { - FREE(iterator, M_TEMP); - } - if (result == 0) { - if (xattrtype == HFS_SETACLSTATE) { - if (state == 0) { - vfs_clearextendedsecurity(HFSTOVFS(hfsmp)); - } else { - vfs_setextendedsecurity(HFSTOVFS(hfsmp)); - } - } else { - /* HFS_SET_XATTREXTENTS_STATE */ - HFS_MOUNT_LOCK(hfsmp, TRUE); - if (state == 0) { - hfsmp->hfs_flags &= ~HFS_XATTR_EXTENTS; - } else { - hfsmp->hfs_flags |= HFS_XATTR_EXTENTS; - } - HFS_MOUNT_UNLOCK(hfsmp, TRUE); - } - } - - return MacToVFSError(result); -} - - /* - * Check for volume attributes stored as EA for root file system. - * Supported attributes are - - * 1. ACLs - * 2. Extent-based Extended Attributes - */ -__private_extern__ -void -hfs_check_volxattr(struct hfsmount *hfsmp, unsigned int xattrtype) -{ - struct BTreeIterator * iterator; - struct filefork *btfile; - int lockflags; - int result; - - if (hfsmp->hfs_flags & HFS_STANDARD || - hfsmp->hfs_attribute_vp == NULL) { - return; - } - - MALLOC(iterator, BTreeIterator *, sizeof(*iterator), M_TEMP, M_WAITOK); - if (iterator == NULL) { - return; - } - bzero(iterator, sizeof(*iterator)); - - /* - * Build a b-tree key. - * We use the root's parent id (1) to hold this volume attribute. - */ - if (xattrtype == HFS_SETACLSTATE) { - /* ACLs */ - (void) hfs_buildattrkey(kHFSRootParentID, XATTR_EXTENDEDSECURITY_NAME, - (HFSPlusAttrKey *)&iterator->key); + /* Update the state in the mount point */ + HFS_MOUNT_LOCK(hfsmp, TRUE); + if (state == 0) { + hfsmp->hfs_flags &= ~HFS_XATTR_EXTENTS; } else { - /* Extent-based extended attributes */ - (void) hfs_buildattrkey(kHFSRootParentID, XATTR_XATTREXTENTS_NAME, - (HFSPlusAttrKey *)&iterator->key); + hfsmp->hfs_flags |= HFS_XATTR_EXTENTS; } - btfile = VTOF(hfsmp->hfs_attribute_vp); - - lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_EXCLUSIVE_LOCK); - - /* Check for our attribute. 
*/ - result = BTSearchRecord(btfile, iterator, NULL, NULL, NULL); + HFS_MOUNT_UNLOCK(hfsmp, TRUE); - hfs_systemfile_unlock(hfsmp, lockflags); - FREE(iterator, M_TEMP); - - if (result == 0) { - if (xattrtype == HFS_SETACLSTATE) { - vfs_setextendedsecurity(HFSTOVFS(hfsmp)); - } else { - HFS_MOUNT_LOCK(hfsmp, TRUE); - hfsmp->hfs_flags |= HFS_XATTR_EXTENTS; - HFS_MOUNT_UNLOCK(hfsmp, TRUE); - } +exit: + if (iterator) { + FREE(iterator, M_TEMP); } + return MacToVFSError(result); } @@ -2036,76 +2191,70 @@ getmaxinlineattrsize(struct vnode * attrvp) } /* - * Get a referenced vnode for attribute data I/O. + * Initialize vnode for attribute data I/O. + * + * On success, + * - returns zero + * - the attrdata vnode is initialized as hfsmp->hfs_attrdata_vp + * - an iocount is taken on the attrdata vnode which exists + * for the entire duration of the mount. It is only dropped + * during unmount + * - the attrdata cnode is not locked + * + * On failure, + * - returns non-zero value + * - the caller does not have to worry about any locks or references */ -static int -get_attr_data_vnode(struct hfsmount *hfsmp, vnode_t *vpp) +int init_attrdata_vnode(struct hfsmount *hfsmp) { vnode_t vp; int result = 0; + struct cat_desc cat_desc; + struct cat_attr cat_attr; + struct cat_fork cat_fork; + int newvnode_flags = 0; + + bzero(&cat_desc, sizeof(cat_desc)); + cat_desc.cd_parentcnid = kHFSRootParentID; + cat_desc.cd_nameptr = (const u_int8_t *)hfs_attrdatafilename; + cat_desc.cd_namelen = strlen(hfs_attrdatafilename); + cat_desc.cd_cnid = kHFSAttributeDataFileID; + /* Tag vnode as system file, note that we can still use cluster I/O */ + cat_desc.cd_flags |= CD_ISMETA; + + bzero(&cat_attr, sizeof(cat_attr)); + cat_attr.ca_linkcount = 1; + cat_attr.ca_mode = S_IFREG; + cat_attr.ca_fileid = cat_desc.cd_cnid; + cat_attr.ca_blocks = hfsmp->totalBlocks; - vp = hfsmp->hfs_attrdata_vp; - if (vp == NULLVP) { - struct cat_desc cat_desc; - struct cat_attr cat_attr; - struct cat_fork cat_fork; - - /* We don't tag it as a system file since we intend to use cluster I/O. */ - bzero(&cat_desc, sizeof(cat_desc)); - cat_desc.cd_parentcnid = kHFSRootParentID; - cat_desc.cd_nameptr = (const u_int8_t *)hfs_attrdatafilename; - cat_desc.cd_namelen = strlen(hfs_attrdatafilename); - cat_desc.cd_cnid = kHFSAttributeDataFileID; - - bzero(&cat_attr, sizeof(cat_attr)); - cat_attr.ca_linkcount = 1; - cat_attr.ca_mode = S_IFREG; - cat_attr.ca_fileid = cat_desc.cd_cnid; - cat_attr.ca_blocks = hfsmp->totalBlocks; - - /* - * The attribute data file is a virtual file that spans the - * entire file system space. - * - * Each extent-based attribute occupies a unique portion of - * in this virtual file. The cluster I/O is done using actual - * allocation block offsets so no additional mapping is needed - * for the VNOP_BLOCKMAP call. - * - * This approach allows the attribute data to be cached without - * incurring the high cost of using a separate vnode per attribute. - * - * Since we need to acquire the attribute b-tree file lock anyways, - * the virtual file doesn't introduce any additional serialization. 
- */ - bzero(&cat_fork, sizeof(cat_fork)); - cat_fork.cf_size = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize; - cat_fork.cf_blocks = hfsmp->totalBlocks; - cat_fork.cf_extents[0].startBlock = 0; - cat_fork.cf_extents[0].blockCount = cat_fork.cf_blocks; - - result = hfs_getnewvnode(hfsmp, NULL, NULL, &cat_desc, 0, &cat_attr, &cat_fork, &vp); - if (result == 0) { - HFS_MOUNT_LOCK(hfsmp, 1); - /* Check if someone raced us for creating this vnode. */ - if (hfsmp->hfs_attrdata_vp != NULLVP) { - HFS_MOUNT_UNLOCK(hfsmp, 1); - vnode_put(vp); - vnode_recycle(vp); - vp = hfsmp->hfs_attrdata_vp; - } else { - hfsmp->hfs_attrdata_vp = vp; - HFS_MOUNT_UNLOCK(hfsmp, 1); - /* Keep a reference on this vnode until unmount */ - vnode_ref_ext(vp, O_EVTONLY); - hfs_unlock(VTOC(vp)); - } - } - } else { - if ((result = vnode_get(vp))) - vp = NULLVP; + /* + * The attribute data file is a virtual file that spans the + * entire file system space. + * + * Each extent-based attribute occupies a unique portion of + * in this virtual file. The cluster I/O is done using actual + * allocation block offsets so no additional mapping is needed + * for the VNOP_BLOCKMAP call. + * + * This approach allows the attribute data to be cached without + * incurring the high cost of using a separate vnode per attribute. + * + * Since we need to acquire the attribute b-tree file lock anyways, + * the virtual file doesn't introduce any additional serialization. + */ + bzero(&cat_fork, sizeof(cat_fork)); + cat_fork.cf_size = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize; + cat_fork.cf_blocks = hfsmp->totalBlocks; + cat_fork.cf_extents[0].startBlock = 0; + cat_fork.cf_extents[0].blockCount = cat_fork.cf_blocks; + + result = hfs_getnewvnode(hfsmp, NULL, NULL, &cat_desc, 0, &cat_attr, + &cat_fork, &vp, &newvnode_flags); + if (result == 0) { + hfsmp->hfs_attrdata_vp = vp; + hfs_unlock(VTOC(vp)); } - *vpp = vp; return (result); } @@ -2115,7 +2264,7 @@ get_attr_data_vnode(struct hfsmount *hfsmp, vnode_t *vpp) static int read_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExtentDescriptor *extents) { - vnode_t evp = NULLVP; + vnode_t evp = hfsmp->hfs_attrdata_vp; int bufsize; int iosize; int attrsize; @@ -2123,10 +2272,7 @@ read_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExtent int i; int result = 0; - if ((result = get_attr_data_vnode(hfsmp, &evp))) { - return (result); - } - hfs_lock_truncate(VTOC(evp), 0); + hfs_lock_truncate(VTOC(evp), HFS_SHARED_LOCK); bufsize = (int)uio_resid(uio); attrsize = (int)datasize; @@ -2158,7 +2304,6 @@ read_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExtent uio_setoffset(uio, datasize); hfs_unlock_truncate(VTOC(evp), 0); - vnode_put(evp); return (result); } @@ -2168,7 +2313,7 @@ read_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExtent static int write_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExtentDescriptor *extents) { - vnode_t evp = NULLVP; + vnode_t evp = hfsmp->hfs_attrdata_vp; off_t filesize; int bufsize; int attrsize; @@ -2177,11 +2322,7 @@ write_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExten int i; int result = 0; - /* Get exclusive use of attribute data vnode. 
 */
-	if ((result = get_attr_data_vnode(hfsmp, &evp))) {
-		return (result);
-	}
-	hfs_lock_truncate(VTOC(evp), 0);
+	hfs_lock_truncate(VTOC(evp), HFS_SHARED_LOCK);
 
 	bufsize = uio_resid(uio);
 	attrsize = (int) datasize;
@@ -2213,7 +2354,6 @@ write_attr_data(struct hfsmount *hfsmp, uio_t uio, size_t datasize, HFSPlusExten
 	uio_setoffset(uio, datasize);
 
 	hfs_unlock_truncate(VTOC(evp), 0);
-	vnode_put(evp);
 	return (result);
 }
 
@@ -2264,7 +2404,7 @@ alloc_attr_blks(struct hfsmount *hfsmp, size_t attrsize, size_t extentbufsize, H
 #if HFS_XATTR_VERBOSE
 		printf("hfs: alloc_attr_blks: unexpected failure, %d blocks unallocated\n", blkcnt);
 #endif
-		for (; i <= 0; i--) {
+		for (; i >= 0; i--) {
 			if ((blkcnt = extents[i].blockCount) != 0) {
 				(void) BlockDeallocate(hfsmp, extents[i].startBlock, blkcnt, 0);
 				extents[i].startBlock = 0;
@@ -2283,14 +2423,11 @@ alloc_attr_blks(struct hfsmount *hfsmp, size_t attrsize, size_t extentbufsize, H
 static void
 free_attr_blks(struct hfsmount *hfsmp, int blkcnt, HFSPlusExtentDescriptor *extents)
 {
-	vnode_t evp = NULLVP;
+	vnode_t evp = hfsmp->hfs_attrdata_vp;
 	int remblks = blkcnt;
 	int lockflags;
 	int i;
 
-	if (get_attr_data_vnode(hfsmp, &evp) != 0) {
-		evp = NULLVP;
-	}
 	lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
 
 	for (i = 0; (remblks > 0) && (extents[i].blockCount != 0); i++) {
@@ -2325,9 +2462,6 @@ free_attr_blks(struct hfsmount *hfsmp, int blkcnt, HFSPlusExtentDescriptor *exte
 	}
 	hfs_systemfile_unlock(hfsmp, lockflags);
 
-	if (evp) {
-		vnode_put(evp);
-	}
 }
 
 static int
diff --git a/bsd/hfs/hfscommon/BTree/BTree.c b/bsd/hfs/hfscommon/BTree/BTree.c
index 73a521c8f..be52f5900 100644
--- a/bsd/hfs/hfscommon/BTree/BTree.c
+++ b/bsd/hfs/hfscommon/BTree/BTree.c
@@ -1583,6 +1583,7 @@ BTUpdateRecord(FCB *filePtr, BTreeIterator *iterator,
 	u_int16_t		index;
 	Boolean			validHint;
 
+	////////////////////////// Preliminary Checks ///////////////////////////////
 
 	nodeRec.buffer = nil;				// so we can call ReleaseNode
 
@@ -1666,9 +1667,9 @@ Success:
 	////////////////////////////// Error Exit ///////////////////////////////////
 
 ErrorExit:
-	
+
 	(void) ReleaseNode (btreePtr, &nodeRec);
-	
+
 	iterator->hint.writeCount = 0;
 	iterator->hint.nodeNum = 0;
 	iterator->hint.index = 0;
@@ -1996,7 +1997,6 @@ OSStatus	BTSetLastSync	(FCB	*filePtr,
 	return noErr;
 }
 
-__private_extern__
 OSStatus	BTHasContiguousNodes	(FCB	*filePtr)
 {
@@ -2021,7 +2021,6 @@ Routine:	BTGetUserData
 Function:	Read the user data area of the b-tree header node.
-------------------------------------------------------------------------------*/
-__private_extern__
 OSStatus
 BTGetUserData(FCB *filePtr, void * dataPtr, int dataSize)
 {
@@ -2059,7 +2058,6 @@ Routine:	BTSetUserData
 Function:	Write the user data area of the b-tree header node.
-------------------------------------------------------------------------------*/
-__private_extern__
 OSStatus
 BTSetUserData(FCB *filePtr, void * dataPtr, int dataSize)
 {
diff --git a/bsd/hfs/hfscommon/BTree/BTreeAllocate.c b/bsd/hfs/hfscommon/BTree/BTreeAllocate.c
index 99d586408..fe2f91714 100644
--- a/bsd/hfs/hfscommon/BTree/BTreeAllocate.c
+++ b/bsd/hfs/hfscommon/BTree/BTreeAllocate.c
@@ -621,7 +621,6 @@ Routine:	BTZeroUnusedNodes
Function:	Write zeros to all nodes in the B-tree that are not currently in use.
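
		Writes are batched: every 32nd node goes out synchronously via
		VNOP_BWRITE, so the buffer cache is not flooded and a write error
		can be caught and returned to the caller; the other nodes use
		buf_bawrite. Nodes still pinned by an open transaction (B_LOCKED)
		are skipped.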
-------------------------------------------------------------------------------*/ -__private_extern__ int BTZeroUnusedNodes(FCB *filePtr) { @@ -695,31 +694,39 @@ BTZeroUnusedNodes(FCB *filePtr) err = EIO; goto ErrorExit; } - + if (buf_flags(bp) & B_LOCKED) { /* - * This node is already part of a transaction and will be - * written when the transaction is committed so don't write it here. - * If we did, then we'd hit a panic in hfs_vnop_bwrite since - * B_LOCKED is still set + * This node is already part of a transaction and will be written when + * the transaction is committed, so don't write it here. If we did, then + * we'd hit a panic in hfs_vnop_bwrite because the B_LOCKED bit is still set. */ buf_brelse(bp); continue; } - buf_clear(bp); buf_markaged(bp); /* * Try not to hog the buffer cache. Wait for the write - * every 32 nodes. + * every 32 nodes. If VNOP_BWRITE reports an error, bail out and bubble + * it up to the function calling us. If we tried to update a read-only + * mount on read-only media, for example, catching the error will let + * us alert the callers of this function that they should maintain + * the mount in read-only mode. + */ ++numWritten; - if (numWritten % 32 == 0) - VNOP_BWRITE(bp); - else + if (numWritten % 32 == 0) { + err = VNOP_BWRITE(bp); + if (err) { + goto ErrorExit; + } + } + else { buf_bawrite(bp); + } } } diff --git a/bsd/hfs/hfscommon/BTree/BTreeScanner.c b/bsd/hfs/hfscommon/BTree/BTreeScanner.c index 1ce08e385..ea549278d 100644 --- a/bsd/hfs/hfscommon/BTree/BTreeScanner.c +++ b/bsd/hfs/hfscommon/BTree/BTreeScanner.c @@ -272,7 +272,7 @@ static int ReadMultipleNodes( BTScanState *theScanStatePtr ) } // now read blocks from the device - myErr = (int)buf_bread(myDevPtr, + myErr = (int)buf_meta_bread(myDevPtr, myPhyBlockNum, myBufferSize, NOCRED, diff --git a/bsd/hfs/hfscommon/Catalog/FileIDsServices.c b/bsd/hfs/hfscommon/Catalog/FileIDsServices.c index dbbc33b58..fee50fe6d 100644 --- a/bsd/hfs/hfscommon/Catalog/FileIDsServices.c +++ b/bsd/hfs/hfscommon/Catalog/FileIDsServices.c @@ -42,12 +42,63 @@ typedef struct ExtentsRecBuffer ExtentsRecBuffer; static u_int32_t CheckExtents( void *extents, u_int32_t blocks, Boolean isHFSPlus ); -static OSErr DeleteExtents( ExtendedVCB *vcb, u_int32_t fileNumber, Boolean isHFSPlus ); -static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t destFileID, Boolean isHFSPlus ); +static OSErr DeleteExtents( ExtendedVCB *vcb, u_int32_t fileNumber, int quitEarly, u_int8_t forkType, Boolean isHFSPlus ); +static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t destFileID, int quitEarly, u_int8_t forkType, Boolean isHFSPlus ); static void CopyCatalogNodeInfo( CatalogRecord *src, CatalogRecord *dest ); static void CopyBigCatalogNodeInfo( CatalogRecord *src, CatalogRecord *dest ); static void CopyExtentInfo( ExtentKey *key, ExtentRecord *data, ExtentsRecBuffer *buffer, u_int16_t bufferCount ); +/* + * This function moves the overflow extents associated with srcID into the file associated with dstID. + * We should have already verified that 'srcID' has overflow extents. So now we move all of the overflow + * extent records. + */ +OSErr MoveData( ExtendedVCB *vcb, HFSCatalogNodeID srcID, HFSCatalogNodeID destID, int rsrc) { + + OSErr err; + + /* + * Only the source file should have extents, so we just track those. + * We operate on the fork represented by the open FD that was used to call into this + * function + */ + if (rsrc) { + /* Copy the extent overflow blocks. 
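 *  (forkType follows the extents b-tree key convention -- 0x00 for the
 *   data fork, 0xFF for the resource fork -- and the new quitEarly flag
 *   is what keeps the walk on one fork's records; the precise stop
 *   condition lives inside MoveExtents itself.)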
*/ + err = MoveExtents( vcb, srcID, destID, 1, (u_int8_t)0xff, 1); + if ( err != noErr ) { + if ( err != dskFulErr ) { + return( err ); + } + /* + * In case of error, we would have probably run into problems + * growing the extents b-tree. Since the move is actually a copy + delete + * just delete the new entries. Same for below. + */ + err = DeleteExtents( vcb, destID, 1, (u_int8_t)0xff, 1); + ReturnIfError( err ); // we are doomed. Just QUIT! + goto FlushAndReturn; + } + } + else { + /* Copy the extent overflow blocks. */ + err = MoveExtents( vcb, srcID, destID, 1, 0, 1); + if ( err != noErr ) { + if ( err != dskFulErr ) { + return( err ); + } + err = DeleteExtents( vcb, destID, 1, 0, 1); + ReturnIfError( err ); // we are doomed. Just QUIT! + goto FlushAndReturn; + } + } + +FlushAndReturn: + /* Write out the catalog and extent overflow B-Tree changes */ + err = FlushCatalog( vcb ); + err = FlushExtentFile( vcb ); + + return( err ); +} OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param destName, HFSCatalogNodeID srcID, HFSCatalogNodeID destID, u_int32_t srcHint, u_int32_t destHint ) @@ -61,13 +112,13 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param int16_t numDestExtentBlocks; OSErr err; Boolean isHFSPlus = ( vcb->vcbSigWord == kHFSPlusSigWord ); - + err = BuildCatalogKeyUTF8(vcb, srcID, srcName, kUndefinedStrLen, &srcKey, NULL); ReturnIfError(err); - + err = BuildCatalogKeyUTF8(vcb, destID, destName, kUndefinedStrLen, &destKey, NULL); ReturnIfError(err); - + if ( isHFSPlus ) { //-- Step 1: Check the catalog nodes for extents @@ -75,37 +126,37 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param //-- locate the source file, test for extents in extent file, and copy the cat record for later err = LocateCatalogNodeByKey( vcb, srcHint, &srcKey, &srcData, &srcHint ); ReturnIfError( err ); - + if ( srcData.recordType != kHFSPlusFileRecord ) return( cmFThdDirErr ); // Error "cmFThdDirErr = it is a directory" - + //-- Check if there are any extents in the source file //€€ I am only checling the extents in the low 32 bits, routine will fail if files extents after 2 gig are in overflow numSrcExtentBlocks = CheckExtents( srcData.hfsPlusFile.dataFork.extents, srcData.hfsPlusFile.dataFork.totalBlocks, isHFSPlus ); if ( numSrcExtentBlocks == 0 ) // then check the resource fork extents numSrcExtentBlocks = CheckExtents( srcData.hfsPlusFile.resourceFork.extents, srcData.hfsPlusFile.resourceFork.totalBlocks, isHFSPlus ); - + //-- Check if there are any extents in the destination file err = LocateCatalogNodeByKey( vcb, destHint, &destKey, &destData, &destHint ); ReturnIfError( err ); - + if ( destData.recordType != kHFSPlusFileRecord ) return( cmFThdDirErr ); // Error "cmFThdDirErr = it is a directory" - + numDestExtentBlocks = CheckExtents( destData.hfsPlusFile.dataFork.extents, destData.hfsPlusFile.dataFork.totalBlocks, isHFSPlus ); if ( numDestExtentBlocks == 0 ) // then check the resource fork extents numDestExtentBlocks = CheckExtents( destData.hfsPlusFile.resourceFork.extents, destData.hfsPlusFile.resourceFork.totalBlocks, isHFSPlus ); - + //-- Step 2: Exchange the Extent key in the extent file //-- Exchange the extents key in the extent file - err = DeleteExtents( vcb, kHFSBogusExtentFileID, isHFSPlus ); + err = DeleteExtents( vcb, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); ReturnIfError( err ); if ( numSrcExtentBlocks && numDestExtentBlocks ) // if both files have extents { //-- Change the source extents 
file ids to our known bogus value - err = MoveExtents( vcb, srcData.hfsPlusFile.fileID, kHFSBogusExtentFileID, isHFSPlus ); + err = MoveExtents( vcb, srcData.hfsPlusFile.fileID, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); if ( err != noErr ) { if ( err != dskFulErr ) @@ -115,67 +166,67 @@ OSErr ExchangeFileIDs( ExtendedVCB *vcb, ConstUTF8Param srcName, ConstUTF8Param } //-- Change the destination extents file id's to the source id's - err = MoveExtents( vcb, destData.hfsPlusFile.fileID, srcData.hfsPlusFile.fileID, isHFSPlus ); + err = MoveExtents( vcb, destData.hfsPlusFile.fileID, srcData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); if ( err != noErr ) { if ( err != dskFulErr ) return( err ); - -ExUndo2aPlus: err = DeleteExtents( vcb, srcData.hfsPlusFile.fileID, isHFSPlus ); + + ExUndo2aPlus: err = DeleteExtents( vcb, srcData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! - - err = MoveExtents( vcb, kHFSBogusExtentFileID, srcData.hfsPlusFile.fileID, isHFSPlus ); // Move the extents back + + err = MoveExtents( vcb, kHFSBogusExtentFileID, srcData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); // Move the extents back ReturnIfError( err ); // we are doomed. Just QUIT! - + goto ExUndo1a; } //-- Change the bogus extents file id's to the dest id's - err = MoveExtents( vcb, kHFSBogusExtentFileID, destData.hfsPlusFile.fileID, isHFSPlus ); + err = MoveExtents( vcb, kHFSBogusExtentFileID, destData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); if ( err != noErr ) { if ( err != dskFulErr ) return( err ); - - err = DeleteExtents( vcb, destData.hfsPlusFile.fileID, isHFSPlus ); + + err = DeleteExtents( vcb, destData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! - - err = MoveExtents( vcb, srcData.hfsPlusFile.fileID, destData.hfsPlusFile.fileID, isHFSPlus ); // Move the extents back + + err = MoveExtents( vcb, srcData.hfsPlusFile.fileID, destData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); // Move the extents back ReturnIfError( err ); // we are doomed. Just QUIT! - + goto ExUndo2aPlus; } } else if ( numSrcExtentBlocks ) // just the source file has extents { - err = MoveExtents( vcb, srcData.hfsPlusFile.fileID, destData.hfsPlusFile.fileID, isHFSPlus ); + err = MoveExtents( vcb, srcData.hfsPlusFile.fileID, destData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); if ( err != noErr ) { if ( err != dskFulErr ) return( err ); - - err = DeleteExtents( vcb, srcData.hfsPlusFile.fileID, isHFSPlus ); + + err = DeleteExtents( vcb, srcData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! - + goto FlushAndReturn; } } else if ( numDestExtentBlocks ) // just the destination file has extents { - err = MoveExtents( vcb, destData.hfsPlusFile.fileID, srcData.hfsPlusFile.fileID, isHFSPlus ); + err = MoveExtents( vcb, destData.hfsPlusFile.fileID, srcData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); if ( err != noErr ) { if ( err != dskFulErr ) return( err ); - - err = DeleteExtents( vcb, destData.hfsPlusFile.fileID, isHFSPlus ); + + err = DeleteExtents( vcb, destData.hfsPlusFile.fileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! 
- + goto FlushAndReturn; + } + } - + //-- Step 3: Change the data in the catalog nodes //-- find the source cnode and put dest info in it @@ -188,12 +239,12 @@ ExUndo2aPlus: err = DeleteExtents( vcb, srcData.hfsPlusFile.fileID, isHFSPlus ); err = ReplaceBTreeRecord( vcb->catalogRefNum, &srcKey, srcHint, &srcData, sizeof(HFSPlusCatalogFile), &srcHint ); ReturnIfError( err ); - + // find the destination cnode and put source info in it err = LocateCatalogNodeByKey( vcb, destHint, &destKey, &destData, &destHint ); if ( err != noErr ) return( cmBadNews ); - + CopyBigCatalogNodeInfo( &swapData, &destData ); err = ReplaceBTreeRecord( vcb->catalogRefNum, &destKey, destHint, &destData, sizeof(HFSPlusCatalogFile), &destHint ); ReturnIfError( err ); @@ -205,10 +256,10 @@ ExUndo2aPlus: err = DeleteExtents( vcb, srcData.hfsPlusFile.fileID, isHFSPlus ); //-- locate the source file, test for extents in extent file, and copy the cat record for later err = LocateCatalogNodeByKey( vcb, srcHint, &srcKey, &srcData, &srcHint ); ReturnIfError( err ); - + if ( srcData.recordType != kHFSFileRecord ) return( cmFThdDirErr ); // Error "cmFThdDirErr = it is a directory" - + //-- Check if there are any extents in the source file numSrcExtentBlocks = CheckExtents( srcData.hfsFile.dataExtents, srcData.hfsFile.dataPhysicalSize / vcb->blockSize, isHFSPlus ); if ( numSrcExtentBlocks == 0 ) // then check the resource fork extents @@ -217,106 +268,106 @@ ExUndo2aPlus: err = DeleteExtents( vcb, srcData.hfsPlusFile.fileID, isHFSPlus ); // NOTE: Do we save the found source node for later use? - + //-- Check if there are any extents in the destination file err = LocateCatalogNodeByKey( vcb, destHint, &destKey, &destData, &destHint ); ReturnIfError( err ); - + if ( destData.recordType != kHFSFileRecord ) return( cmFThdDirErr ); // Error "cmFThdDirErr = it is a directory" - + numDestExtentBlocks = CheckExtents( destData.hfsFile.dataExtents, destData.hfsFile.dataPhysicalSize / vcb->blockSize, isHFSPlus ); if ( numDestExtentBlocks == 0 ) // then check the resource fork extents numDestExtentBlocks = CheckExtents( destData.hfsFile.rsrcExtents, destData.hfsFile.rsrcPhysicalSize / vcb->blockSize, isHFSPlus ); - + // NOTE: Do we save the found destination node for later use? - - + + //-- Step 2: Exchange the Extent key in the extent file //-- Exchange the extents key in the extent file - err = DeleteExtents( vcb, kHFSBogusExtentFileID, isHFSPlus ); + err = DeleteExtents( vcb, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); ReturnIfError( err ); if ( numSrcExtentBlocks && numDestExtentBlocks ) // if both files have extents { //-- Change the source extents file ids to our known bogus value - err = MoveExtents( vcb, srcData.hfsFile.fileID, kHFSBogusExtentFileID, isHFSPlus ); + err = MoveExtents( vcb, srcData.hfsFile.fileID, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); if ( err != noErr ) { if ( err != dskFulErr ) return( err ); - -ExUndo1a: err = DeleteExtents( vcb, kHFSBogusExtentFileID, isHFSPlus ); + + ExUndo1a: err = DeleteExtents( vcb, kHFSBogusExtentFileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT!
- + err = FlushCatalog( vcb ); // flush the catalog err = FlushExtentFile( vcb ); // flush the extent file (unneeded for common case, but it's cheap) return( dskFulErr ); } //-- Change the destination extents file id's to the source id's - err = MoveExtents( vcb, destData.hfsFile.fileID, srcData.hfsFile.fileID, isHFSPlus ); + err = MoveExtents( vcb, destData.hfsFile.fileID, srcData.hfsFile.fileID, 0, 0, isHFSPlus ); if ( err != noErr ) { if ( err != dskFulErr ) return( err ); - -ExUndo2a: err = DeleteExtents( vcb, srcData.hfsFile.fileID, isHFSPlus ); + + ExUndo2a: err = DeleteExtents( vcb, srcData.hfsFile.fileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! - - err = MoveExtents( vcb, kHFSBogusExtentFileID, srcData.hfsFile.fileID, isHFSPlus ); // Move the extents back + + err = MoveExtents( vcb, kHFSBogusExtentFileID, srcData.hfsFile.fileID, 0, 0, isHFSPlus ); // Move the extents back ReturnIfError( err ); // we are doomed. Just QUIT! - + goto ExUndo1a; } //-- Change the bogus extents file id's to the dest id's - err = MoveExtents( vcb, kHFSBogusExtentFileID, destData.hfsFile.fileID, isHFSPlus ); + err = MoveExtents( vcb, kHFSBogusExtentFileID, destData.hfsFile.fileID, 0, 0, isHFSPlus ); if ( err != noErr ) { if ( err != dskFulErr ) return( err ); - - err = DeleteExtents( vcb, destData.hfsFile.fileID, isHFSPlus ); + + err = DeleteExtents( vcb, destData.hfsFile.fileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! - - err = MoveExtents( vcb, srcData.hfsFile.fileID, destData.hfsFile.fileID, isHFSPlus ); // Move the extents back + + err = MoveExtents( vcb, srcData.hfsFile.fileID, destData.hfsFile.fileID, 0, 0, isHFSPlus ); // Move the extents back ReturnIfError( err ); // we are doomed. Just QUIT! - + goto ExUndo2a; } } else if ( numSrcExtentBlocks ) // just the source file has extents { - err = MoveExtents( vcb, srcData.hfsFile.fileID, destData.hfsFile.fileID, isHFSPlus ); + err = MoveExtents( vcb, srcData.hfsFile.fileID, destData.hfsFile.fileID, 0, 0, isHFSPlus ); if ( err != noErr ) { if ( err != dskFulErr ) return( err ); - - err = DeleteExtents( vcb, srcData.hfsFile.fileID, isHFSPlus ); + + err = DeleteExtents( vcb, srcData.hfsFile.fileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! - + goto FlushAndReturn; } } else if ( numDestExtentBlocks ) // just the destination file has extents { - err = MoveExtents( vcb, destData.hfsFile.fileID, srcData.hfsFile.fileID, isHFSPlus ); + err = MoveExtents( vcb, destData.hfsFile.fileID, srcData.hfsFile.fileID, 0, 0, isHFSPlus ); if ( err != noErr ) { if ( err != dskFulErr ) return( err ); - - err = DeleteExtents( vcb, destData.hfsFile.fileID, isHFSPlus ); + + err = DeleteExtents( vcb, destData.hfsFile.fileID, 0, 0, isHFSPlus ); ReturnIfError( err ); // we are doomed. Just QUIT! 
- + goto FlushAndReturn; } } - + //-- Step 3: Change the data in the catalog nodes //-- find the source cnode and put dest info in it @@ -330,23 +381,23 @@ ExUndo2a: err = DeleteExtents( vcb, srcData.hfsFile.fileID, isHFSPlus ); err = ReplaceBTreeRecord( vcb->catalogRefNum, &srcKey, srcHint, &srcData, sizeof(HFSCatalogFile), &srcHint ); ReturnIfError( err ); - + // find the destination cnode and put source info in it err = LocateCatalogNodeByKey( vcb, destHint, &destKey, &destData, &destHint ); if ( err != noErr ) return( cmBadNews ); - + CopyCatalogNodeInfo( &swapData, &destData ); err = ReplaceBTreeRecord( vcb->catalogRefNum, &destKey, destHint, &destData, sizeof(HFSCatalogFile), &destHint ); ReturnIfError( err ); } err = noErr; - + //-- Step 4: Error Handling section - - + + FlushAndReturn: err = FlushCatalog( vcb ); // flush the catalog err = FlushExtentFile( vcb ); // flush the extent file (unneeded for common case, but it's cheap) @@ -373,7 +424,7 @@ static void CopyBigCatalogNodeInfo( CatalogRecord *src, CatalogRecord *dest ) } -static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t destFileID, Boolean isHFSPlus ) +static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t destFileID, int quitEarly, u_int8_t forkType, Boolean isHFSPlus ) { FCB * fcb; ExtentsRecBuffer extentsBuffer[kNumExtentsToCache]; @@ -386,16 +437,16 @@ static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t dest int16_t i, j; OSErr err; - + fcb = GetFileControlBlock(vcb->extentsRefNum); (void) BTInvalidateHint(&btIterator); extentKeyPtr = (ExtentKey*) &btIterator.key; btRecord.bufferAddress = &extentData; btRecord.itemCount = 1; - + //-- Collect the extent records - + // // A search on the following key will cause the BTree to be positioned immediately // before the first extent record for file #srcFileID, but not actually positioned @@ -408,9 +459,9 @@ static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t dest if (isHFSPlus) { btRecord.itemSize = sizeof(HFSPlusExtentRecord); btKeySize = sizeof(HFSPlusExtentKey); - + extentKeyPtr->hfsPlus.keyLength = kHFSPlusExtentKeyMaximumLength; - extentKeyPtr->hfsPlus.forkType = 0; + extentKeyPtr->hfsPlus.forkType = forkType; extentKeyPtr->hfsPlus.pad = 0; extentKeyPtr->hfsPlus.fileID = srcFileID; extentKeyPtr->hfsPlus.startBlock = 0; @@ -418,7 +469,7 @@ static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t dest else { btRecord.itemSize = sizeof(HFSExtentRecord); btKeySize = sizeof(HFSExtentKey); - + extentKeyPtr->hfs.keyLength = kHFSExtentKeyMaximumLength; extentKeyPtr->hfs.forkType = 0; extentKeyPtr->hfs.fileID = srcFileID; @@ -440,7 +491,7 @@ static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t dest // of BTIterateRecord. We'd need to set up the key for BTSearchRecord to find the last record // we found, so that BTIterateRecord would get the next one (the first we haven't processed). // - + err = BTSearchRecord(fcb, &btIterator, &btRecord, &btRecordSize, &btIterator); // We expect a btNotFound here, since there shouldn't be an extent record with FABN = 0. 
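The comment above describes a positioning trick that MoveExtents and DeleteExtents both depend on, so a condensed sketch may help. This is not part of the patch; the helper name is hypothetical, but the B-tree calls (BTInvalidateHint, BTSearchRecord, BTIterateRecord) are the same ones the patch uses, shown here for the HFS+ case only.

/*
 * Sketch: seeding the key with startBlock = 0 makes BTSearchRecord return
 * btNotFound (no extent record can have FABN 0), leaving the iterator just
 * before the first overflow record for (forkType, fileID); BTIterateRecord
 * then walks the records in key order until the fileID or fork type changes.
 */
static OSErr
for_each_overflow_extent(FCB *fcb, u_int32_t fileID, u_int8_t forkType)
{
	BTreeIterator        btIterator;
	FSBufferDescriptor   btRecord;
	HFSPlusExtentRecord  extentData;
	ExtentKey           *extentKeyPtr;
	u_int16_t            btRecordSize;
	OSErr                err;

	(void) BTInvalidateHint(&btIterator);
	extentKeyPtr = (ExtentKey *) &btIterator.key;
	extentKeyPtr->hfsPlus.keyLength  = kHFSPlusExtentKeyMaximumLength;
	extentKeyPtr->hfsPlus.forkType   = forkType;
	extentKeyPtr->hfsPlus.pad        = 0;
	extentKeyPtr->hfsPlus.fileID     = fileID;
	extentKeyPtr->hfsPlus.startBlock = 0;	/* FABN 0 never exists => btNotFound */

	btRecord.bufferAddress = &extentData;
	btRecord.itemSize      = sizeof(HFSPlusExtentRecord);
	btRecord.itemCount     = 1;

	err = BTSearchRecord(fcb, &btIterator, &btRecord, &btRecordSize, &btIterator);
	if (err != btNotFound)
		return err;	/* noErr here would mean an impossible FABN-0 record */

	for (;;) {
		err = BTIterateRecord(fcb, kBTreeNextRecord, &btIterator, &btRecord, &btRecordSize);
		if (err == btNotFound) {
			err = noErr;	/* ran off the end of the extents tree */
			break;
		}
		if (err != noErr)
			break;		/* I/O or media error */
		if (extentKeyPtr->hfsPlus.fileID != fileID ||
		    extentKeyPtr->hfsPlus.forkType != forkType)
			break;		/* walked past this fork's records */
		/* ... process one overflow extent record (extentData) here ... */
	}
	return err;
}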
@@ -454,16 +505,16 @@ static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t dest return err; } - + do { btRecord.bufferAddress = &extentData; btRecord.itemCount = 1; - + for ( i=0 ; i<kNumExtentsToCache ; i++ ) { HFSCatalogNodeID foundFileID; - + err = BTIterateRecord(fcb, kBTreeNextRecord, &btIterator, &btRecord, &btRecordSize); if ( err == btNotFound ) // Did we run out of extent records in the extents tree? break; // if xkrFNum(A0) is cleared on this error, then this test is bogus! @@ -471,12 +522,17 @@ static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t dest return( err ); // must be ioError foundFileID = isHFSPlus ? extentKeyPtr->hfsPlus.fileID : extentKeyPtr->hfs.fileID; - if ( foundFileID == srcFileID ) - { + if ( foundFileID == srcFileID ) { + /* Check if we need to quit early. */ + if (quitEarly && isHFSPlus) { + if (extentKeyPtr->hfsPlus.forkType != forkType) { + break; + } + } CopyExtentInfo(extentKeyPtr, &extentData, extentsBuffer, i); } - else - { + else{ + /* The fileID's are of a different file. We're done here. */ break; } } @@ -486,21 +542,20 @@ static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t dest btRecordSize = sizeof(HFSPlusExtentRecord); else btRecordSize = sizeof(HFSExtentRecord); - - for ( j=0 ; j<i ; j++ ) - { + + for ( j=0 ; j<i ; j++ ) { BTreeIterator tmpIterator; - + if (isHFSPlus) extentsBuffer[j].extentKey.hfsPlus.fileID = destFileID; // change only the id in the key to dest ID else extentsBuffer[j].extentKey.hfs.fileID = destFileID; // change only the id in the key to dest ID - + // get iterator and buffer descriptor ready... (void) BTInvalidateHint(&tmpIterator); BlockMoveData(&(extentsBuffer[j].extentKey), &tmpIterator.key, btKeySize); btRecord.bufferAddress = &(extentsBuffer[j].extentData); - + err = BTInsertRecord(fcb, &tmpIterator, &btRecord, btRecordSize); if ( err != noErr ) { // parse the error @@ -520,7 +575,7 @@ static OSErr MoveExtents( ExtendedVCB *vcb, u_int32_t srcFileID, u_int32_t dest if ( i != kNumExtentsToCache ) // if the buffer is not full, we must be done { - err = DeleteExtents( vcb, srcFileID, isHFSPlus ); // Now delete all the extent entries with the sourceID + err = DeleteExtents( vcb, srcFileID, forkType, quitEarly, isHFSPlus ); // Now delete all the extent entries with the sourceID if ( DEBUG_BUILD && err != noErr ) DebugStr("Error from DeleteExtents"); break; // we're done! @@ -538,8 +593,10 @@ static void CopyExtentInfo( ExtentKey *key, ExtentRecord *data, ExtentsRecBuffe } + + //-- Delete all extents in extent file that have the ID given. -static OSErr DeleteExtents( ExtendedVCB *vcb, u_int32_t fileID, Boolean isHFSPlus ) +static OSErr DeleteExtents( ExtendedVCB *vcb, u_int32_t fileID, int quitEarly, u_int8_t forkType, Boolean isHFSPlus ) { FCB * fcb; ExtentKey * extentKeyPtr; @@ -548,36 +605,36 @@ static OSErr DeleteExtents( ExtendedVCB *vcb, u_int32_t fileID, Boolean isHFSPl FSBufferDescriptor btRecord; u_int16_t btRecordSize; OSErr err; - + fcb = GetFileControlBlock(vcb->extentsRefNum); - + (void) BTInvalidateHint(&btIterator); extentKeyPtr = (ExtentKey*) &btIterator.key; btRecord.bufferAddress = &extentData; btRecord.itemCount = 1; - + // The algorithm is to position the BTree just before any extent records for fileID. // Then just keep getting successive records. If the record is still for fileID, // then delete it. 
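// Illustrative aside (not part of the patch): with the new parameters, the
// stop condition for this walk is effectively
//
//     done = (foundFileID != fileID) ||
//            (quitEarly && isHFSPlus &&
//             extentKeyPtr->hfsPlus.forkType != forkType);
//
// i.e. a single-fork pass (quitEarly != 0) stops as soon as it crosses from
// the requested fork's records into the other fork's; the patch applies the
// fork-type check only to HFS+ keys.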
if (isHFSPlus) { btRecord.itemSize = sizeof(HFSPlusExtentRecord); - + extentKeyPtr->hfsPlus.keyLength = kHFSPlusExtentKeyMaximumLength; - extentKeyPtr->hfsPlus.forkType = 0; + extentKeyPtr->hfsPlus.forkType = forkType; extentKeyPtr->hfsPlus.pad = 0; extentKeyPtr->hfsPlus.fileID = fileID; extentKeyPtr->hfsPlus.startBlock = 0; } else { btRecord.itemSize = sizeof(HFSExtentRecord); - + extentKeyPtr->hfs.keyLength = kHFSExtentKeyMaximumLength; - extentKeyPtr->hfs.forkType = 0; + extentKeyPtr->hfs.forkType = forkType; extentKeyPtr->hfs.fileID = fileID; extentKeyPtr->hfs.startBlock = 0; } - + err = BTSearchRecord(fcb, &btIterator, &btRecord, &btRecordSize, &btIterator); if ( err != btNotFound ) { @@ -587,25 +644,32 @@ static OSErr DeleteExtents( ExtendedVCB *vcb, u_int32_t fileID, Boolean isHFSPl return err; // Got some unexpected error, so return it } - + do { BTreeIterator tmpIterator; HFSCatalogNodeID foundFileID; - + err = BTIterateRecord(fcb, kBTreeNextRecord, &btIterator, &btRecord, &btRecordSize); if ( err != noErr ) { if (err == btNotFound) // If we hit the end of the BTree err = noErr; // then it's OK - + break; // We're done now. } foundFileID = isHFSPlus ? extentKeyPtr->hfsPlus.fileID : extentKeyPtr->hfs.fileID; - if ( foundFileID != fileID ) + if ( foundFileID != fileID ) { break; // numbers don't match, we must be done - + } + if (quitEarly && isHFSPlus) { + /* If we're only deleting one type of fork, then quit early if it doesn't match */ + if (extentKeyPtr->hfsPlus.forkType != forkType) { + break; + } + } + tmpIterator = btIterator; err = BTDeleteRecord( fcb, &tmpIterator ); if (err != noErr) diff --git a/bsd/hfs/hfscommon/Misc/FileExtentMapping.c b/bsd/hfs/hfscommon/Misc/FileExtentMapping.c index 5d037026b..ec9881da8 100644 --- a/bsd/hfs/hfscommon/Misc/FileExtentMapping.c +++ b/bsd/hfs/hfscommon/Misc/FileExtentMapping.c @@ -180,7 +180,8 @@ static OSErr TruncateExtents( static OSErr UpdateExtentRecord ( ExtendedVCB *vcb, - FCB *fcb, + FCB *fcb, + int deleted, const HFSPlusExtentKey *extentFileKey, const HFSPlusExtentRecord extentData, u_int32_t extentBTreeHint); @@ -456,7 +457,6 @@ static OSErr DeleteExtentRecord( // //_________________________________________________________________________________ -__private_extern__ OSErr MapFileBlockC ( ExtendedVCB *vcb, // volume that file resides on FCB *fcb, // FCB of file @@ -682,7 +682,6 @@ static OSErr DeallocateFork( // Function: Flushes the extent file for a specified volume //_________________________________________________________________________________ -__private_extern__ OSErr FlushExtentFile( ExtendedVCB *vcb ) { FCB * fcb; @@ -842,7 +841,6 @@ int32_t CompareExtentKeysPlus( const HFSPlusExtentKey *searchKey, const HFSPlusE * Used by hfs_extendfs to extend the volume allocation bitmap file.
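 *
 * Illustrative aside (not part of the patch; the call site shown is
 * hypothetical): a resize path would hand freshly created allocation
 * blocks straight to this routine, e.g.
 *
 *     error = AddFileExtent(vcb, fcb, firstNewBlock, newBlockCount);
 *
 * with fcb referring to the allocation bitmap file. AddFileExtent
 * flushes the extents overflow file itself before returning, so the
 * caller does not need a separate FlushExtentFile() call.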
* */ -__private_extern__ int AddFileExtent(ExtendedVCB *vcb, FCB *fcb, u_int32_t startBlock, u_int32_t blockCount) { @@ -896,7 +894,7 @@ AddFileExtent(ExtendedVCB *vcb, FCB *fcb, u_int32_t startBlock, u_int32_t blockC */ foundData[foundIndex].startBlock = startBlock; foundData[foundIndex].blockCount = blockCount; - error = UpdateExtentRecord(vcb, fcb, &foundKey, foundData, hint); + error = UpdateExtentRecord(vcb, fcb, 0, &foundKey, foundData, hint); } (void) FlushExtentFile(vcb); @@ -912,7 +910,6 @@ AddFileExtent(ExtendedVCB *vcb, FCB *fcb, u_int32_t startBlock, u_int32_t blockC // //_________________________________________________________________________________ -__private_extern__ OSErr ExtendFileC ( ExtendedVCB *vcb, // volume that file resides on FCB *fcb, // FCB of file to truncate @@ -1087,21 +1084,44 @@ OSErr ExtendFileC ( * should only be aggressive with re-using once-allocated pieces * if we're not dealing with system files. If we're trying to operate * on behalf of a system file, we need the maximum contiguous amount - * possible. + * possible. For non-system files we favor locality and fragmentation over + * contiguity as it can result in fewer blocks being needed from the underlying + * filesystem that the sparse image resides upon. */ err = noErr; if ( (vcb->hfs_flags & HFS_HAS_SPARSE_DEVICE) - && (fcb->ff_cp->c_fileid >= kHFSFirstUserCatalogNodeID) - && (flags & kEFMetadataMask) == 0) { - if (vcb->hfs_flags & HFS_DID_CONTIG_SCAN) { - wantContig = false; - } else { - // we only want to do this once to scan the bitmap to - // fill in the vcbFreeExt table of free blocks - vcb->hfs_flags |= HFS_DID_CONTIG_SCAN; - wantContig = true; + && (fcb->ff_cp->c_fileid >= kHFSFirstUserCatalogNodeID) + && (flags & kEFMetadataMask) == 0) { + /* + * We want locality over contiguity so by default we set wantContig to + * false unless we hit one of the circumstances below. + */ + wantContig = false; + if (hfs_isrbtree_active(VCBTOHFS(vcb))) { + /* + * If the red-black tree is active, we can always find a suitable contiguous + * chunk. So if the user specifically requests contiguous files, we should + * honor that no matter what kind of device it is. + */ + if (forceContig) { + wantContig = true; + } } - } else { + else { + /* + * If the red-black tree is not active, then only set wantContig to true + * if we have never done a contig scan on the device, which would populate + * the free extent cache. Note that the caller may explicitly unset the + * DID_CONTIG_SCAN bit in order to force us to vend a contiguous extent here + * if the caller wants to get a contiguous chunk. + */ + if ((vcb->hfs_flags & HFS_DID_CONTIG_SCAN) == 0) { + vcb->hfs_flags |= HFS_DID_CONTIG_SCAN; + wantContig = true; + } + } + } + else { wantContig = true; } useMetaZone = flags & kEFMetadataMask; @@ -1163,7 +1183,7 @@ OSErr ExtendFileC ( if ((actualStartBlock == startBlock) && (blockHint == 0)) { // We grew the file's last extent, so just adjust the number of blocks. foundData[foundIndex].blockCount += actualNumBlocks; - err = UpdateExtentRecord(vcb, fcb, &foundKey, foundData, hint); + err = UpdateExtentRecord(vcb, fcb, 0, &foundKey, foundData, hint); if (err != noErr) break; } else { @@ -1217,7 +1237,7 @@ OSErr ExtendFileC ( // Add a new extent into this record and update.
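/*
 * Illustrative aside (not part of the patch): the sparse-device branch added
 * above reduces to this decision, sketched as straight-line code:
 *
 *     wantContig = false;                           // default: locality wins
 *     if (hfs_isrbtree_active(VCBTOHFS(vcb))) {
 *         wantContig = forceContig;                 // tree can always vend contiguous space
 *     } else if ((vcb->hfs_flags & HFS_DID_CONTIG_SCAN) == 0) {
 *         vcb->hfs_flags |= HFS_DID_CONTIG_SCAN;    // one-time scan seeds the extent cache
 *         wantContig = true;
 *     }
 *
 * Metadata allocations and non-sparse volumes keep wantContig = true.
 */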
foundData[foundIndex].startBlock = actualStartBlock; foundData[foundIndex].blockCount = actualNumBlocks; - err = UpdateExtentRecord(vcb, fcb, &foundKey, foundData, hint); + err = UpdateExtentRecord(vcb, fcb, 0, &foundKey, foundData, hint); if (err != noErr) break; } } @@ -1289,12 +1309,15 @@ Overflow: // //_________________________________________________________________________________ -__private_extern__ OSErr TruncateFileC ( ExtendedVCB *vcb, // volume that file resides on FCB *fcb, // FCB of file to truncate int64_t peof, // new physical size for file + int deleted, // if nonzero, the file's catalog record has already been deleted. + int rsrc, // does this represent a resource fork or not? + uint32_t fileid, // the fileid of the file we're manipulating. Boolean truncateToExtent) // if true, truncate to end of extent containing newPEOF + { OSErr err; u_int32_t nextBlock; // next file allocation block to consider @@ -1314,16 +1337,20 @@ OSErr TruncateFileC ( recordDeleted = false; - if (vcb->vcbSigWord == kHFSPlusSigWord) + if (vcb->vcbSigWord == kHFSPlusSigWord) { numExtentsPerRecord = kHFSPlusExtentDensity; - else + } + else { numExtentsPerRecord = kHFSExtentDensity; - - if (FORK_IS_RSRC(fcb)) + } + + if (rsrc) { forkType = kResourceForkType; - else + } + else { forkType = kDataForkType; - + } + temp64 = fcb->ff_blocks; physNumBlocks = (u_int32_t)temp64; @@ -1349,13 +1376,21 @@ OSErr TruncateFileC ( * XXX Any errors could cause ff_blocks and c_blocks to get out of sync... */ numBlocks = peof / vcb->blockSize; - FTOC(fcb)->c_blocks -= (fcb->ff_blocks - numBlocks); + if (!deleted) { + FTOC(fcb)->c_blocks -= (fcb->ff_blocks - numBlocks); + } fcb->ff_blocks = numBlocks; - + // this catalog entry is modified and *must* get forced // to disk when hfs_update() is called - FTOC(fcb)->c_flag |= C_MODIFIED | C_FORCEUPDATE; - + if (!deleted) { + /* + * If the file is already C_NOEXISTS, then the catalog record + * has been removed from disk already. We wouldn't need to force + * another update + */ + FTOC(fcb)->c_flag |= (C_MODIFIED | C_FORCEUPDATE); + } // // If the new PEOF is 0, then truncateToExtent has no meaning (we should always deallocate // all storage). @@ -1364,7 +1399,7 @@ OSErr TruncateFileC ( int i; // Deallocate all the extents for this fork - err = DeallocateFork(vcb, FTOC(fcb)->c_fileid, forkType, fcb->fcbExtents, &recordDeleted); + err = DeallocateFork(vcb, fileid, forkType, fcb->fcbExtents, &recordDeleted); if (err != noErr) goto ErrorExit; // got some error, so return it // Update the catalog extent record (making sure it's zeroed out) @@ -1440,7 +1475,7 @@ OSErr TruncateFileC ( // record (in the FCB, or extents file). // if (extentChanged) { - err = UpdateExtentRecord(vcb, fcb, &key, extentRecord, hint); + err = UpdateExtentRecord(vcb, fcb, deleted, &key, extentRecord, hint); if (err != noErr) goto ErrorExit; } @@ -1450,7 +1485,7 @@ OSErr TruncateFileC ( // blocks. 
// if (nextBlock < physNumBlocks) - err = TruncateExtents(vcb, forkType, FTOC(fcb)->c_fileid, nextBlock, &recordDeleted); + err = TruncateExtents(vcb, forkType, fileid, nextBlock, &recordDeleted); Done: ErrorExit: @@ -1465,7 +1500,6 @@ ErrorExit: * HFS Plus only * */ -__private_extern__ OSErr HeadTruncateFile ( ExtendedVCB *vcb, FCB *fcb, @@ -1824,6 +1858,7 @@ Exit: // // Input: vcb - the volume containing the extents // fcb - the file that owns the extents +// deleted - whether or not the file is already deleted // extentFileKey - pointer to extent key record (xkr) // If the key length is 0, then the extents are actually part // of the catalog record, stored in the FCB. @@ -1834,18 +1869,18 @@ Exit: // (other) = error from BTree //============================================================================ -static OSErr UpdateExtentRecord ( - ExtendedVCB *vcb, - FCB *fcb, - const HFSPlusExtentKey *extentFileKey, - const HFSPlusExtentRecord extentData, - u_int32_t extentBTreeHint) +static OSErr UpdateExtentRecord (ExtendedVCB *vcb, FCB *fcb, int deleted, + const HFSPlusExtentKey *extentFileKey, + const HFSPlusExtentRecord extentData, + u_int32_t extentBTreeHint) { OSErr err = noErr; if (extentFileKey->keyLength == 0) { // keyLength == 0 means the FCB's extent record BlockMoveData(extentData, fcb->fcbExtents, sizeof(HFSPlusExtentRecord)); - FTOC(fcb)->c_flag |= C_MODIFIED; + if (!deleted) { + FTOC(fcb)->c_flag |= C_MODIFIED; + } } else { BTreeIterator btIterator; @@ -2013,7 +2048,6 @@ static Boolean ExtentsAreIntegral( // Called by BTOpenPath during volume mount //_________________________________________________________________________________ -__private_extern__ Boolean NodesAreContiguous( ExtendedVCB *vcb, FCB *fcb, diff --git a/bsd/hfs/hfscommon/Misc/HybridAllocator.c b/bsd/hfs/hfscommon/Misc/HybridAllocator.c new file mode 100644 index 000000000..6e0e1f23a --- /dev/null +++ b/bsd/hfs/hfscommon/Misc/HybridAllocator.c @@ -0,0 +1,533 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#if CONFIG_HFS_ALLOC_RBTREE + +#define assert(a) { if (!(a)) { panic("File "__FILE__", line %d: assertion '%s' failed.\n", __LINE__, #a); } } + +//#include <sys/systm.h> +#include "../../hfs_macos_defs.h" +#include "../headers/HybridAllocator.h" + +#define bool Boolean + +#define ALLOC_DEBUG 0 + +/* + * The rb_wrap macro in RedBlackTree.h automatically generates the source for a variety of functions that + * operate on the red-black trees. The bodies of these automatically generated functions are the corresponding + * macro from RedBlackTree.h. For example, the extent_tree_length_new() function invokes the rb_new() macro. + * We re-define actual wrapper functions around them so that we can re-name them and adjust the functions + * that are available to the allocator in VolumeAllocation.c. + * + * Here are the functions that get automatically generated: + * Offset-Tree Functions: + * + * initialize the tree + * static void extent_tree_offset_new(extent_tree_offset_t * tree) + * + * Get the first node in the tree. If it is empty, return NULL + * static extent_node_t* extent_tree_offset_first (extent_tree_offset_t * tree) + * + * Get the last node in the tree. If it is empty, return NULL + * static extent_node_t* extent_tree_offset_last (extent_tree_offset_t * tree) + * + * From a given extent_node_t, grab the next one. If no next exists, return NULL + * static extent_node_t* extent_tree_offset_next (extent_tree_offset_t * tree, extent_node_t * node) + * + * From a given extent_node_t, grab the previous. If no prev exists, return NULL + * static extent_node_t* extent_tree_offset_prev(extent_tree_offset_t * tree, extent_node_t * node) + * + * Find an extent_node_t with the specified key (search by offset). If it does not exist, return NULL + * static extent_node_t* extent_tree_offset_search(extent_tree_offset_t * tree, extent_node_t * key) + * + * Find an extent_node_t with the specified key (offset). If it does not exist, + * either grab the next node, if possible, or return NULL + * static extent_node_t* extent_tree_offset_nsearch(extent_tree_offset_t * tree, extent_node_t * key) + * + * Find an extent_node_t with the specified key (offset). If it does not exist, + * either grab the previous node, if possible, or return NULL + * static extent_node_t* extent_tree_offset_psearch(extent_tree_offset_t * tree, extent_node_t * key) + * + * Insert the specified node into the tree. + * static void extent_tree_offset_insert(extent_tree_offset_t * tree, extent_node_t * node) + * + * Remove the specified node from the tree. + * static void extent_tree_offset_remove(extent_tree_offset_t * tree, extent_node_t * node) + * + */ + + +/* Static Functions only used in this file */ +static int32_t +extent_tree_internal_alloc_space(extent_tree_offset_t *offset_tree, + u_int32_t size, u_int32_t offset, extent_node_t *node); + +/* + * cmp_offset_node + * + * Compare the extents in two nodes by offset. + * + * Returns: + * -1 if node 1's offset < node 2's offset. + * 1 if node 1's offset > node 2's offset. + */ + +__private_extern__ int +cmp_offset_node(extent_node_t *node_1, extent_node_t *node_2) { + u_int32_t addr_1 = node_1->offset; + u_int32_t addr_2 = node_2->offset; + + return ((addr_1 > addr_2) - (addr_1 < addr_2)); +} + +/* + * Allocate a new red-black tree node. + * + * Currently, we get memory from the M_TEMP zone. + * TODO: Need to get our own zone to avoid bloating the M_TEMP zone.
+ */ +__private_extern__ extent_node_t * +alloc_node(u_int32_t length, u_int32_t offset) { + extent_node_t *node; + MALLOC(node, extent_node_t *, sizeof(extent_node_t), M_TEMP, M_WAITOK); + + if (node) { + node->offset = offset; + node->length = length; + node->offset_next = NULL; + } + return node; +} + +/* + * De-allocate a red-black tree node. + * + * Currently, this goes back to the M_TEMP zone. + * TODO: May need to adjust this if we pull memory out of our own zone. + */ +__private_extern__ void +free_node(extent_node_t *node) { + FREE(node, M_TEMP); +} + +/* + * rb_wrap is a macro found in the rb.h header file. It builds functions that operate on + * the red-black tree based upon the types specified here. This code will build red-black tree + * search functions that operate on extent_node_t's and use cmp_length_node to do length searches. + * It uses cmp_offset_node to do offset searches. Ties are broken by offset. This will generate + * the functions specified above. + */ + +rb_wrap(__attribute__ ((unused)) static, extent_tree_offset_, extent_tree_offset_t, extent_node_t, offset_link, cmp_offset_node) + + +/* + * Create a new extent tree, composed of links sorted by offset. + */ +__private_extern__ void +extent_tree_init(extent_tree_offset_t *offset_tree) +{ + extent_node_t *node = NULL; + extent_tree_offset_new(offset_tree); + + node = extent_tree_off_first (offset_tree); + if (node) { + node->offset_next = NULL; + } +} + +/* + * Destroy an extent tree + * + * This function finds the first node in the specified red-black tree, then + * uses the embedded linked list to walk through the tree in O(n) time and destroy + * all of its nodes. + */ +__private_extern__ void +extent_tree_destroy(extent_tree_offset_t *off_tree) { + extent_node_t *node = NULL; + extent_node_t *next = NULL; + + node = extent_tree_offset_first (off_tree); + + while (node) { + next = node->offset_next; + extent_tree_offset_remove (off_tree, node); + free_node (node); + node = next; + } +} + +/* + * Search the extent tree by offset. The "key" argument is only used to extract + * the offset and length information. Its link fields are not used in the underlying + * tree code. + */ +__private_extern__ extent_node_t * +extent_tree_off_search(extent_tree_offset_t *tree, extent_node_t *key) { + return extent_tree_offset_search(tree, key); +} + +/* + * Search the extent tree by offset, finding the next node in the tree + * if the specified one does not exist. The "key" argument is only used to extract + * the offset and length information. Its link fields are not used in the underlying + * tree code. + */ +__private_extern__ extent_node_t * +extent_tree_off_search_next(extent_tree_offset_t *offset_tree, extent_node_t *key) { + + return extent_tree_offset_nsearch (offset_tree, key); +} + +/* + * Search the extent tree by offset to find a starting position. Then, do a linear search + * through the list of free extents to find the first free extent in the tree that has size + * greater than or equal to the specified size. The "key" argument is only used to extract + * the offset and length information. Its link fields are not used in the underlying + * tree code. 
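 *
 * Worked example (illustrative, not part of the patch): with free extents
 * {offset 10, length 4}, {offset 20, length 32}, {offset 60, length 8} and a
 * key of { .offset = 12, .length = 16 }, the nsearch lands on the node at
 * offset 20 (the first node at or after offset 12); its length (32) meets the
 * minimum (16), so that node is returned. The node at offset 10 is never
 * considered, and the walk would continue via offset_next if 20 were too small.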
+ */ +__private_extern__ extent_node_t * +extent_tree_off_search_nextWithSize (extent_tree_offset_t *offset_tree, extent_node_t *key) { + + extent_node_t *current; + + u_int32_t min_size = key->length; + + current = extent_tree_offset_nsearch (offset_tree, key); + + while (current) { + if (current->length >= min_size) { + return current; + } + current = current->offset_next; + } + + /* return NULL if no free extent of suitable size could be found. */ + return NULL; +} + + +/* + * Search the extent tree by offset, finding the previous node in the tree + * if the specified one does not exist. The "key" argument is only used to extract + * the offset and length information. Its link fields are not used in the underlying + * tree code. + */ +__private_extern__ extent_node_t * +extent_tree_off_search_prev(extent_tree_offset_t *offset_tree, extent_node_t *key) { + + return extent_tree_offset_psearch (offset_tree, key); +} + + +/* + * Find the first node in the extent tree, by offset. This will be the first + * free space region relative to the start of the disk. + */ +__private_extern__ extent_node_t * +extent_tree_off_first (extent_tree_offset_t *offset_tree) { + return extent_tree_offset_first(offset_tree); +} + +/* + * From a given tree node (sorted by offset), get the next node in the tree. + */ +__private_extern__ extent_node_t * +extent_tree_off_next(extent_tree_offset_t * tree, extent_node_t *node) +{ + return extent_tree_offset_next(tree, node); +} + +/* + * From a given tree node (sorted by offset), get the previous node in the tree. + */ +__private_extern__ extent_node_t * +extent_tree_off_prev(extent_tree_offset_t * tree, extent_node_t *node) +{ + return extent_tree_offset_prev(tree, node); +} + + +/* + * For a node of a given offset and size, remove it from the extent tree and + * insert a new node that: + * + * A) increases its offset by the amount of space we just allocated + * B) decreases its size by the amount of space we just allocated. + * + * NOTE: Callers must ensure that the 'size' specified is less than or equal to the + * length of the extent represented by node. The node pointer must point to an + * extant node in the tree, as it will be removed from the tree. + */ +static int32_t +extent_tree_internal_alloc_space(extent_tree_offset_t *offset_tree, u_int32_t size, + u_int32_t offset, extent_node_t *node) +{ + if (node) { + extent_node_t *prev = NULL; + extent_node_t *next = NULL; + + if( ALLOC_DEBUG ) { + assert ((size <= node->length)); + assert ((offset == node->offset)); + } + + prev = extent_tree_offset_prev(offset_tree, node); + + /* + * Note that, unless the node is exactly the size of the amount of space + * requested, we do not need to remove it from the offset tree, no matter + * how much space we remove from the node. Remember that the offset tree is + * sorting the extents based on their offsets, and that each node is a discrete + * chunk of free space. + * + * If node A has offset B, with length C, in the offset tree, by definition, there + * can be no other node in the extent tree within the range {B, B+C}. If there were, + * we'd have overlapped extents. + * + * So in the normal case, we'll just update the offset node in place with the new offset + * and size. + * + * Otherwise, if we have an exact match, then just remove the node altogether. Don't forget + * to update the next pointer for the linked list if applicable.
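 *
 * Worked example (illustrative, not part of the patch): for a free node
 * {offset 100, length 50}, allocating 50 blocks at offset 100 removes the
 * node outright and relinks the embedded list around it, while allocating
 * 20 blocks at offset 100 simply rewrites it in place as {offset 120,
 * length 30}; no removal or re-insertion is needed, since the shrunken
 * node still cannot collide with any neighbor's range.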
+ */ + if (node->length == size) { + next = node->offset_next; + extent_tree_offset_remove(offset_tree, node); + free_node(node); + if (prev) { + prev->offset_next = next; + } + } + else { + node->offset = node->offset + size; + node->length -= size; + /* The next pointer does not change since we keep the node in place */ + } + return 0; + } + return -1; +} + +/* + * Search the extent tree for a region of free space after the specified + * offset and attempt to allocate it. + * + * This is expected to be used by attempts to grow a file contiguously. If we + * start at a file's EOF, then we can try to allocate space immediately after it + * if it's available. This function specifies a tail (the offset), and then passes it + * into extent_tree_offset_search. Note that this is not the search_prev or search_next + * variant, so if no node exists at the specified offset we'll fail out. + * + */ + +__private_extern__ int32_t +extent_tree_offset_alloc_space(extent_tree_offset_t *offset_tree, u_int32_t size, u_int32_t offset) { + extent_node_t search_sentinel = { .offset = offset }; + extent_node_t *node = extent_tree_offset_search(offset_tree, &search_sentinel); + if (node && (node->length < size)) { + /* It's too small. Fail the allocation */ + if ( ALLOC_DEBUG ) { + printf("HFS Allocator: internal_alloc_space, ptr (%p) node->length (%d), node->offset (%d), off(%d), size (%d) \n", + node, node->length, node->offset, offset, size); + } + return -1; + } + return extent_tree_internal_alloc_space(offset_tree, size, offset, node); +} + + +/* + * Search the extent tree for a region of free space at the specified + * offset and attempt to allocate it. + * + * This is a little bit more involved than the previous function. It is intended for use when + * we may be allocating space from the middle of an existing extent node. + * + */ + + +__private_extern__ int32_t +extent_tree_offset_alloc_unaligned(extent_tree_offset_t *offset_tree, u_int32_t size, u_int32_t offset) { + extent_node_t search_sentinel = { .offset = offset }; + extent_node_t *node= NULL; + + node = extent_tree_off_search_prev(offset_tree, &search_sentinel); + + if (node == NULL) { + return -1; + } + + if (node && (node->length < size)) { + /* It's too small. Fail the allocation */ + if ( ALLOC_DEBUG ) { + printf("HFS Allocator: internal_alloc_space, ptr (%p) node->length (%d), node->offset (%d), off(%d), size (%d) \n", + node, node->length, node->offset, offset, size); + } + return -1; + } + + /* Now see if we need to split this node because we're not allocating from the beginning */ + if (offset != node->offset) { + + if (ALLOC_DEBUG) { + assert ((offset + size) <= (node->offset + node->length)); + if (node->offset_next) { + assert ((offset > node->offset) && (offset < node->offset_next->offset)); + } + } + + u_int32_t end = node->offset + node->length; + node->length = offset - node->offset; + + /* + * Do we need to create a new node? If our extent we're carving away ends earlier than + * the current extent's length, then yes - we do. 
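 *
 * Worked example (illustrative, not part of the patch): carving 8 blocks at
 * offset 40 out of a free node {offset 32, length 32} (spanning 32..64)
 * trims the node in place to {offset 32, length 8} and inserts a new tail
 * node {offset 48, length 16}, threading the embedded list through both.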
+ */ + if ((offset + size) < (end)) { + u_int32_t newoff = offset + size; + u_int32_t newlen = end - newoff; + + extent_node_t* newnode = alloc_node(newlen, newoff); + extent_tree_offset_insert(offset_tree, newnode); + + extent_node_t *next = extent_tree_offset_next(offset_tree, newnode); + newnode->offset_next = next; + node->offset_next = newnode; + } + + return 0; + } + else { + return extent_tree_internal_alloc_space(offset_tree, size, offset, node); + } +} + + + +/* + * Mark an extent of space as being free. This means we need to insert + * this extent into our tree. + * + * Search the offset tree, based on the new offset that we construct by adding + * the length of our extent to be freed to its offset. If something exists at + * that offset, then we coalesce the nodes. In this case, we do not need to adjust + * the offset tree because our extent we wanted to add could not have been in the tree. + * + * If no node existed at the specified offset, then create a new one and insert it + * into the tree. + * + * Finally, search based on the node that would precede our newly created/inserted one. + * If possible, coalesce the previous node into our new one. + * + * We return the node which we are modifying in this function. + */ + +__private_extern__ extent_node_t * +extent_tree_free_space(extent_tree_offset_t *offset_tree, u_int32_t size, u_int32_t offset) +{ + extent_node_t *prev = NULL; + extent_node_t *node = NULL; + extent_node_t *next = NULL; + extent_node_t search_sentinel = { .offset = size + offset }; + + node = extent_tree_offset_nsearch(offset_tree, &search_sentinel); + /* Insert our node into the tree, and coalesce with the next one if necessary */ + + if ((node) && (node->offset == search_sentinel.offset)) { + node->offset = offset; + node->length += size; + next = node->offset_next; + } + else { + node = alloc_node(size, offset); + assert(node); + extent_tree_offset_insert(offset_tree, node); + + /* Find the next entry in the tree, if applicable. */ + next = extent_tree_offset_next(offset_tree, node); + node->offset_next = next; + } + + /* Coalesce with the previous if necessary */ + prev = extent_tree_offset_prev(offset_tree, node); + if (prev && (prev->offset + prev->length) == offset) { + extent_tree_offset_remove(offset_tree, prev); + node->offset = prev->offset; + node->length += prev->length; + free_node(prev); + prev = extent_tree_offset_prev(offset_tree, node); + } + + /* Update the next pointer for the previous entry (if necessary) */ + if (prev) { + prev->offset_next = node; + } + + return node; +} + +/* + * Remove the specified node from the offset_tree. Note that the parameter node + * must be an extant node in the tree. This function is used by the allocator when + * we are resizing a volume and need to directly manipulate the contents of the red-black + * tree without going through the normal allocation and deallocation routines. + */ +__private_extern__ void +extent_tree_remove_node (extent_tree_offset_t *offset_tree, extent_node_t * node) { + + if (node) { + /* Just remove the entry from the tree */ + extent_tree_offset_remove(offset_tree, node); + } + return; + +} + + + +#if ALLOC_DEBUG +/* + * For each node in the tree, print out its length and block offset. 
+ */ +__private_extern__ void +extent_tree_offset_print(extent_tree_offset_t *offset_tree) +{ + extent_node_t *node = NULL; + + node = extent_tree_offset_first(offset_tree); + while (node) { + printf("length: %u, offset: %u\n", node->length, node->offset); + node = node->offset_next; + } +} +#endif + +#endif diff --git a/bsd/hfs/hfscommon/Misc/VolumeAllocation.c b/bsd/hfs/hfscommon/Misc/VolumeAllocation.c index bc58bd947..de2858418 100644 --- a/bsd/hfs/hfscommon/Misc/VolumeAllocation.c +++ b/bsd/hfs/hfscommon/Misc/VolumeAllocation.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,7 +32,7 @@ Version: HFS Plus 1.0 - Copyright: © 1996-2001 by Apple Computer, Inc., all rights reserved. + Copyright: © 1996-2009 by Apple Computer, Inc., all rights reserved. */ @@ -45,31 +45,92 @@ Public routines: blocks. (Will only do a single extent???) BlockDeallocate Deallocate a contiguous run of allocation blocks. - - invalidate_free_extent_cache Invalidate free extent cache for a given volume. - -Internal routines: + + BlockMarkAllocated + Exported wrapper to mark blocks as in-use. This will correctly determine + whether or not the red-black tree is enabled and call the appropriate function + if applicable. BlockMarkFree + Exported wrapper to mark blocks as freed. This will correctly determine whether or + not the red-black tree is enabled and call the appropriate function if applicable. + + + ResetVCBFreeExtCache + Since the red-black tree obviates the need to maintain the free extent cache, we do + not update it if the tree is also live. As a result, if we ever need to destroy the trees + we should reset the free extent cache so it doesn't confuse us when we need to fall back to the + bitmap scanning allocator. + We also reset and disable the free extent cache when volume resizing is + in flight. + + UpdateAllocLimit + Adjusts the AllocLimit field in the hfs mount point. This is used when we need to prevent + allocations from occupying space in the region we are modifying during a filesystem resize. + At other times, it should be consistent with the total number of allocation blocks in the + filesystem. It is also used to shrink or grow the number of blocks that the red-black tree should + know about. If growing, scan the new range of bitmap, and if shrinking, reduce the + number of items in the tree that we can allocate from. + +Internal routines: + Note that the RBTree routines are guarded by a cpp check for CONFIG_HFS_ALLOC_RBTREE. This + is to cut down on space for functions that could not possibly be used if they are not planning to + use the red-black tree code. + + BlockMarkFreeRBTree + Make an internal call to BlockMarkFree and then update + and/or create Red-Black Tree allocation tree nodes to correspond + to the free space being generated. + BlockMarkFreeInternal Mark a contiguous range of blocks as free. The corresponding - bits in the volume bitmap will be cleared. - BlockMarkAllocated + bits in the volume bitmap will be cleared. This will actually do the work + of modifying the bitmap for us. + + BlockMarkAllocatedRBTree + Make an internal call to BlockAllocateMarked, which will update the + bitmap on-disk when we allocate blocks. If that is successful, then + we'll remove the appropriate entries from the red-black tree. + BlockMarkAllocatedInternal Mark a contiguous range of blocks as allocated.
The corresponding bits in the volume bitmap are set. Also tests to see - if any of the blocks were previously unallocated. - FindContiguous + if any of the blocks were previously unallocated. + BlockFindContiguous Find a contiguous range of blocks of a given size. The caller specifies where to begin the search (by block number). The - block number of the first block in the range is returned. + block number of the first block in the range is returned. This is only + called by the bitmap scanning logic as the red-black tree should be able + to do this internally by searching its tree. BlockAllocateAny Find and allocate a contiguous range of blocks up to a given size. The first range of contiguous free blocks found are allocated, even if there are fewer blocks than requested (and even if a contiguous range of blocks of the given size exists elsewhere). + BlockAllocateAnyBitmap + Finds a range of blocks per the above requirements without using the + Allocation RB Tree. This relies on the bitmap-scanning logic in order to find + any valid range of free space needed. + BlockAllocateAnyRBTree + Finds a valid range of blocks per the above requirements by searching + the red-black tree. We can just make an internal call to + BlockAllocateContigRBTree to find the valid range. BlockAllocateContig Find and allocate a contiguous range of blocks of a given size. If a contiguous range of free blocks of the given size isn't found, then - the allocation fails (i.e. it is "all or nothing"). - + the allocation fails (i.e. it is "all or nothing"). This routine is + essentially a wrapper function around its related sub-functions, + BlockAllocateContigBitmap and BlockAllocateContigRBTree, which use, + respectively, the original HFS+ bitmap scanning logic and the new + Red-Black Tree to search and manage free-space decisions. This function + contains logic for when to use which of the allocation algorithms, + depending on the free space contained in the volume. + BlockAllocateContigBitmap + Finds and allocates a range of blocks specified by the size parameters + using the original HFS+ bitmap scanning logic. The red-black tree + will not be updated if this function is used. + BlockAllocateContigRBTree + Finds and allocates a range of blocks specified by the size parameters + using the new red/black tree data structure and search algorithms + provided by the tree library. Updates the red/black tree nodes after + the on-disk data structure (bitmap) has been updated. BlockAllocateKnown Try to allocate space from known free space in the volume's free extent cache. @@ -80,6 +141,57 @@ Internal routines: ReleaseBitmapBlock Release a bitmap block back into the buffer cache. + + +Debug/Test Routines + hfs_isallocated + Test to see if any blocks in a range are allocated. Journal or + allocation file lock must be held. + + hfs_isallocated_scan + Test to see if any blocks in a range are allocated. Releases and + invalidates the block used when finished. + + hfs_isrbtree_active + Test to see if the allocation red-black tree is live. This function + requires either an exclusive or shared lock on the allocation bitmap file + in the HFS mount structure, to prevent red-black tree pointers from disappearing. + + hfs_isrbtree_allocated + Test to see if the specified extent is marked as allocated in the red-black tree. + Multiplexes between the metadata zone trees and the normal allocation zone trees + depending on the offset of the extent specified.
+ + check_rbtree_extents + Void function that wraps around the above function (hfs_isrbtree_allocated) + and checks to see that the return value was appropriate based on the assertion we're + trying to validate (whether or not the specified extent should be marked as free + or allocated). + + hfs_validate_rbtree + Exhaustive search function that will check every allocation block for its status in the + red-black tree and then check the corresponding status in the bitmap file. If the two are out + of sync, it will panic. Note that this function is extremely expensive and must NEVER + be run outside of debug code. + + hfs_checktreelinks + Checks the embedded linked list structure of the red black tree for integrity. The next pointer + should always point to whatever extent_tree_offset_next returns. + + +Red Black Tree Specific Routines + GenerateTree + Build a red-black tree for the given filesystem's bitmap. + + DestroyTrees + Destroy the tree on the given filesystem + + + hfs_alloc_scan_block + Given a starting allocation block number, figures out which physical block contains that + allocation block's bit, and scans it from the starting bit until either the ending bit or + the end of the block. Free space extents are inserted into the appropriate red-black tree. + */ #include "../../hfs_macos_defs.h" @@ -89,19 +201,54 @@ Internal routines: #include <sys/systm.h> #include <sys/sysctl.h> #include <sys/disk.h> +#include <sys/ubc.h> +#include <sys/uio.h> #include <kern/kalloc.h> #include "../../hfs.h" #include "../../hfs_dbg.h" #include "../../hfs_format.h" #include "../../hfs_endian.h" - +#include "../../hfs_macos_defs.h" #include "../headers/FileMgrInternal.h" +#include "../headers/HybridAllocator.h" +#include "../../hfs_kdebug.h" #ifndef CONFIG_HFS_TRIM #define CONFIG_HFS_TRIM 0 #endif +/* + * Use sysctl vfs.generic.hfs.kdebug.allocation to control which + * KERNEL_DEBUG_CONSTANT events are enabled at runtime. (They're + * disabled by default because there can be a lot of these events, + * and we don't want to overwhelm the kernel debug buffer. If you + * want to watch these events in particular, just set the sysctl.) + */ +static int hfs_kdebug_allocation = 0; +SYSCTL_DECL(_vfs_generic); +SYSCTL_NODE(_vfs_generic, OID_AUTO, hfs, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "HFS file system"); +SYSCTL_NODE(_vfs_generic_hfs, OID_AUTO, kdebug, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "HFS kdebug"); +SYSCTL_INT(_vfs_generic_hfs_kdebug, OID_AUTO, allocation, CTLFLAG_RW|CTLFLAG_LOCKED, &hfs_kdebug_allocation, 0, "Enable kdebug logging for HFS allocations"); +enum { + /* + * HFSDBG_ALLOC_ENABLED: Log calls to BlockAllocate and + * BlockDeallocate, including the internal BlockAllocateXxx + * routines so we can see how an allocation was satisfied. + * + * HFSDBG_EXT_CACHE_ENABLED: Log routines that read or write the + * free extent cache. + * + * HFSDBG_UNMAP_ENABLED: Log events involving the trim list. + * + * HFSDBG_BITMAP_ENABLED: Log accesses to the volume bitmap (setting + * or clearing bits, scanning the bitmap). 
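 *
 * Illustrative usage (not part of the patch): to trace just trim-list and
 * bitmap activity, for example:
 *
 *     sysctl -w vfs.generic.hfs.kdebug.allocation=12
 *
 * since 12 == HFSDBG_UNMAP_ENABLED | HFSDBG_BITMAP_ENABLED in the enum that
 * follows.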
+ */ + HFSDBG_ALLOC_ENABLED = 1, + HFSDBG_EXT_CACHE_ENABLED = 2, + HFSDBG_UNMAP_ENABLED = 4, + HFSDBG_BITMAP_ENABLED = 8 +}; enum { kBytesPerWord = 4, @@ -116,6 +263,8 @@ enum { #define kAllBitsSetInWord 0xFFFFFFFFul +#define ALLOC_DEBUG 0 + static OSErr ReadBitmapBlock( ExtendedVCB *vcb, u_int32_t bit, @@ -136,6 +285,15 @@ static OSErr BlockAllocateAny( u_int32_t *actualStartBlock, u_int32_t *actualNumBlocks); +static OSErr BlockAllocateAnyBitmap( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t endingBlock, + u_int32_t maxBlocks, + Boolean useMetaZone, + u_int32_t *actualStartBlock, + u_int32_t *actualNumBlocks); + static OSErr BlockAllocateContig( ExtendedVCB *vcb, u_int32_t startingBlock, @@ -145,6 +303,15 @@ static OSErr BlockAllocateContig( u_int32_t *actualStartBlock, u_int32_t *actualNumBlocks); +static OSErr BlockAllocateContigBitmap( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t minBlocks, + u_int32_t maxBlocks, + Boolean useMetaZone, + u_int32_t *actualStartBlock, + u_int32_t *actualNumBlocks); + static OSErr BlockFindContiguous( ExtendedVCB *vcb, u_int32_t startingBlock, @@ -161,9 +328,136 @@ static OSErr BlockAllocateKnown( u_int32_t *actualStartBlock, u_int32_t *actualNumBlocks); -static int free_extent_cache_active( - ExtendedVCB *vcb); +static OSErr BlockMarkAllocatedInternal ( + ExtendedVCB *vcb, + u_int32_t startingBlock, + register u_int32_t numBlocks); + +static OSErr BlockMarkFreeInternal( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t numBlocks, + Boolean do_validate); + +#if CONFIG_HFS_ALLOC_RBTREE + +static OSErr ReleaseRBScanBitmapBlock( struct buf *bp ); + +static OSErr BlockAllocateAnyRBTree( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t maxBlocks, + Boolean useMetaZone, + u_int32_t *actualStartBlock, + u_int32_t *actualNumBlocks); + +static OSErr BlockAllocateContigRBTree( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t minBlocks, + u_int32_t maxBlocks, + Boolean useMetaZone, + u_int32_t *actualStartBlock, + u_int32_t *actualNumBlocks, + u_int32_t forceContig); + +static OSErr BlockMarkAllocatedRBTree( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t numBlocks); + +static OSErr BlockMarkFreeRBTree( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t numBlocks); + +static int +hfs_isrbtree_allocated (struct hfsmount * hfsmp, + u_int32_t startBlock, + u_int32_t numBlocks, + extent_node_t** node1); + +extern void +hfs_validate_rbtree (struct hfsmount *hfsmp, + u_int32_t start, + u_int32_t end); + +static void hfs_checktreelinks (struct hfsmount *hfsmp); + + +void check_rbtree_extents (struct hfsmount *hfsmp, + u_int32_t start, + u_int32_t numBlocks, + int shouldBeFree); + +int hfs_isallocated_scan (struct hfsmount *hfsmp, + u_int32_t startingBlock, + u_int32_t *bp_buf); + +static int hfs_alloc_scan_block(struct hfsmount *hfsmp, + u_int32_t startbit, + u_int32_t endBit, + u_int32_t *bitToScan); + +#define ASSERT_FREE 1 +#define ASSERT_ALLOC 0 + +#endif /* CONFIG_HFS_ALLOC_RBTREE */ + +/* Functions for manipulating free extent cache */ +static void remove_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount); +static Boolean add_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount); +static void sanity_check_free_ext(struct hfsmount *hfsmp, int check_allocated); + +#if ALLOC_DEBUG +/* + * Extra #includes for the debug function below. 
These are not normally #included because + * they would constitute a layering violation + */ +#include <vfs/vfs_journal.h> +#include <sys/disk.h> + +/* + * Validation Routine to verify that the TRIM list maintained by the journal + * is in good shape relative to what we think the bitmap should have. We should + * never encounter allocated blocks in the TRIM list, so if we ever encounter them, + * we panic. + */ +int trim_validate_bitmap (struct hfsmount *hfsmp) { + u_int64_t blockno_offset; + u_int64_t numblocks; + int i; + int count; + u_int32_t startblk; + u_int32_t blks; + int err = 0; + uint32_t alloccount = 0; + + if (hfsmp->jnl) { + struct journal *jnl = (struct journal*)hfsmp->jnl; + if (jnl->active_tr) { + struct jnl_trim_list *trim = &(jnl->active_tr->trim); + count = trim->extent_count; + for (i = 0; i < count; i++) { + blockno_offset = trim->extents[i].offset; + blockno_offset = blockno_offset - (uint64_t)hfsmp->hfsPlusIOPosOffset; + blockno_offset = blockno_offset / hfsmp->blockSize; + numblocks = trim->extents[i].length / hfsmp->blockSize; + + startblk = (u_int32_t)blockno_offset; + blks = (u_int32_t) numblocks; + err = hfs_count_allocated (hfsmp, startblk, blks, &alloccount); + + if (err == 0 && alloccount != 0) { + panic ("trim_validate_bitmap: %d blocks @ ABN %d are allocated!", alloccount, startblk); + } + } + } + } + return 0; +} +#endif /* ;________________________________________________________________________________ @@ -188,22 +482,25 @@ static int free_extent_cache_active( */ static void hfs_unmap_free_extent(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBlocks) { - if (CONFIG_HFS_TRIM) { - u_int64_t offset; - u_int64_t length; - int err; - - if ((hfsmp->hfs_flags & HFS_UNMAP) && (hfsmp->jnl != NULL)) { - offset = (u_int64_t) startingBlock * hfsmp->blockSize + (u_int64_t) hfsmp->hfsPlusIOPosOffset; - length = (u_int64_t) numBlocks * hfsmp->blockSize; - - err = journal_trim_add_extent(hfsmp->jnl, offset, length); - if (err) { - printf("hfs_unmap_free_extent: error %d from journal_trim_add_extent", err); - hfsmp->hfs_flags &= ~HFS_UNMAP; - } + u_int64_t offset; + u_int64_t length; + int err; + + if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_FREE | DBG_FUNC_START, startingBlock, numBlocks, 0, 0, 0); + + if (hfsmp->jnl != NULL) { + offset = (u_int64_t) startingBlock * hfsmp->blockSize + (u_int64_t) hfsmp->hfsPlusIOPosOffset; + length = (u_int64_t) numBlocks * hfsmp->blockSize; + + err = journal_trim_add_extent(hfsmp->jnl, offset, length); + if (err) { + printf("hfs_unmap_free_extent: error %d from journal_trim_add_extent", err); } } + + if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_FREE | DBG_FUNC_END, err, 0, 0, 0, 0); } @@ -225,100 +522,117 @@ static void hfs_unmap_free_extent(struct hfsmount *hfsmp, u_int32_t startingBloc */ static void hfs_unmap_alloc_extent(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBlocks) { - if (CONFIG_HFS_TRIM) { - u_int64_t offset; - u_int64_t length; - int err; + u_int64_t offset; + u_int64_t length; + int err; + + if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_ALLOC | DBG_FUNC_START, startingBlock, numBlocks, 0, 0, 0); + + if (hfsmp->jnl != NULL) { + offset = (u_int64_t) startingBlock * hfsmp->blockSize + (u_int64_t) hfsmp->hfsPlusIOPosOffset; + length = (u_int64_t) numBlocks * hfsmp->blockSize; - if ((hfsmp->hfs_flags & HFS_UNMAP) && (hfsmp->jnl != NULL)) { - offset = (u_int64_t) 
startingBlock * hfsmp->blockSize + (u_int64_t) hfsmp->hfsPlusIOPosOffset; - length = (u_int64_t) numBlocks * hfsmp->blockSize; - - err = journal_trim_remove_extent(hfsmp->jnl, offset, length); - if (err) { - printf("hfs_unmap_alloc_extent: error %d from journal_trim_remove_extent", err); - hfsmp->hfs_flags &= ~HFS_UNMAP; - } + err = journal_trim_remove_extent(hfsmp->jnl, offset, length); + if (err) { + printf("hfs_unmap_alloc_extent: error %d from journal_trim_remove_extent", err); } } + + if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_ALLOC | DBG_FUNC_END, err, 0, 0, 0, 0); } /* ;________________________________________________________________________________ ; -; Routine: BlkAlloc -; -; Function: Allocate space on a volume. If contiguous allocation is requested, -; at least the requested number of bytes will be allocated or an -; error will be returned. If contiguous allocation is not forced, -; the space will be allocated at the first free fragment following -; the requested starting allocation block. If there is not enough -; room there, a block of less than the requested size will be -; allocated. +; Routine: hfs_trim_callback ; -; If the requested starting block is 0 (for new file allocations), -; the volume's allocation block pointer will be used as a starting -; point. +; Function: This function is called when a transaction that freed extents +; (via hfs_unmap_free_extent/journal_trim_add_extent) has been +; written to the on-disk journal. This routine will add those +; extents to the free extent cache so that they can be reused. ; -; Input Arguments: -; vcb - Pointer to ExtendedVCB for the volume to allocate space on -; fcb - Pointer to FCB for the file for which storage is being allocated -; startingBlock - Preferred starting allocation block, 0 = no preference -; forceContiguous - Force contiguous flag - if bit 0 set (NE), allocation is contiguous -; or an error is returned -; useMetaZone - -; minBlocks - Number of blocks requested. If the allocation is non-contiguous, -; less than this may actually be allocated -; maxBlocks - The maximum number of blocks to allocate. If there is additional free -; space after bytesRequested, then up to maxBlocks bytes should really -; be allocated. (Used by ExtendFileC to round up allocations to a multiple -; of the file's clump size.) +; CAUTION: This routine is called while the journal's trim lock +; is held shared, so that no other thread can reuse any portion +; of those extents. We must be very careful about which locks +; we take from within this callback, to avoid deadlock. The +; call to add_free_extent_cache will end up taking the cache's +; lock (just long enough to add these extents to the cache). ; -; Output: -; (result) - Error code, zero for successful allocation -; *startBlock - Actual starting allocation block -; *actualBlocks - Actual number of allocation blocks allocated +; CAUTION: If the journal becomes invalid (eg., due to an I/O +; error when trying to write to the journal), this callback +; will stop getting called, even if extents got freed before +; the journal became invalid! ; -; Side effects: -; The volume bitmap is read and updated; the volume bitmap cache may be changed. +; Input Arguments: +; arg - The hfsmount of the volume containing the extents. +; extent_count - The number of extents freed in the transaction. +; extents - An array of extents (byte ranges) that were freed. 
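+;
+;	    As a worked example (illustrative numbers only): on a volume with
+;	    4096-byte allocation blocks, a freed extent at byte offset
+;	    hfsPlusIOPosOffset + 81920 with length 16384 converts back to
+;	    startBlock = 81920 / 4096 = 20 and numBlocks = 16384 / 4096 = 4
+;	    before being re-added to the free extent cache below.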
;________________________________________________________________________________ */ -static void -sanity_check_free_ext(__unused ExtendedVCB *vcb, __unused int check_allocated) +__private_extern__ void +hfs_trim_callback(void *arg, uint32_t extent_count, const dk_extent_t *extents) { -#if DEBUG - u_int32_t i, j; - - for(i=0; i < vcb->vcbFreeExtCnt; i++) { - u_int32_t start, nblocks; - - start = vcb->vcbFreeExt[i].startBlock; - nblocks = vcb->vcbFreeExt[i].blockCount; - - - if (nblocks == 0) { - panic("hfs: %p: slot %d in the free extent array had a zero count (%d)\n", vcb, i, start); - } - - if (check_allocated && hfs_isallocated(vcb, start, nblocks)) { - panic("hfs: %p: slot %d in the free extent array is bad (%d / %d)\n", - vcb, i, start, nblocks); - } - - for(j=i+1; j < vcb->vcbFreeExtCnt; j++) { - if (start == vcb->vcbFreeExt[j].startBlock) { - panic("hfs: %p: slot %d/%d are dups?! (%d / %d ; %d / %d)\n", - vcb, i, j, start, nblocks, vcb->vcbFreeExt[i].startBlock, - vcb->vcbFreeExt[i].blockCount); - } - } + uint32_t i; + uint32_t startBlock, numBlocks; + struct hfsmount *hfsmp = arg; + + if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_CALLBACK | DBG_FUNC_START, 0, extent_count, 0, 0, 0); + + for (i=0; i<extent_count; ++i) { + /* Convert the byte range in *extents back to a range of allocation blocks. */ + startBlock = (extents[i].offset - hfsmp->hfsPlusIOPosOffset) / hfsmp->blockSize; + numBlocks = extents[i].length / hfsmp->blockSize; + (void) add_free_extent_cache(hfsmp, startBlock, numBlocks); } -#endif + + if (hfs_kdebug_allocation & HFSDBG_UNMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_UNMAP_CALLBACK | DBG_FUNC_END, 0, 0, 0, 0, 0); } -__private_extern__ +/* + ;________________________________________________________________________________ + ; + ; Routine: BlockAllocate + ; + ; Function: Allocate space on a volume. If contiguous allocation is requested, + ; at least the requested number of bytes will be allocated or an + ; error will be returned. If contiguous allocation is not forced, + ; the space will be allocated with the first largest extent available + ; at the requested starting allocation block. If there is not enough + ; room there, a block allocation of less than the requested size will be + ; allocated. + ; + ; If the requested starting block is 0 (for new file allocations), + ; the volume's allocation block pointer will be used as a starting + ; point. + ; + ; Input Arguments: + ; vcb - Pointer to ExtendedVCB for the volume to allocate space on + ; fcb - Pointer to FCB for the file for which storage is being allocated + ; startingBlock - Preferred starting allocation block, 0 = no preference + ; minBlocks - Number of blocks requested. If the allocation is non-contiguous, + ; less than this may actually be allocated + ; maxBlocks - The maximum number of blocks to allocate. If there is additional free + ; space after bytesRequested, then up to maxBlocks bytes should really + ; be allocated. (Used by ExtendFileC to round up allocations to a multiple + ; of the file's clump size.) + ; flags - Flags to specify options like contiguous, use metadata zone, + ; skip free block check, etc. + ; + ; Output: + ; (result) - Error code, zero for successful allocation + ; *startBlock - Actual starting allocation block + ; *actualBlocks - Actual number of allocation blocks allocated + ; + ; Side effects: + ; The volume bitmap is read and updated; the volume bitmap cache may be changed. 
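+ ;
+ ;	Usage sketch (hypothetical caller shown for illustration only; real
+ ;	callers such as ExtendFileC pass flags appropriate to their context):
+ ;
+ ;	    u_int32_t start, count;
+ ;	    err = BlockAllocate(vcb, 0, minBlocks, maxBlocks,
+ ;	                        HFS_ALLOC_FORCECONTIG, &start, &count);
+ ;	    // dskFulErr here means the contiguous request could not be met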
+ ;________________________________________________________________________________ + */ OSErr BlockAllocate ( ExtendedVCB *vcb, /* which volume to allocate space on */ u_int32_t startingBlock, /* preferred starting block, or 0 for no preference */ @@ -332,9 +646,13 @@ OSErr BlockAllocate ( u_int32_t freeBlocks; OSErr err; Boolean updateAllocPtr = false; // true if nextAllocation needs to be updated + struct hfsmount *hfsmp; Boolean useMetaZone; Boolean forceContiguous; + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_BLOCK_ALLOCATE | DBG_FUNC_START, startingBlock, minBlocks, maxBlocks, flags, 0); + if (flags & HFS_ALLOC_FORCECONTIG) { forceContiguous = true; } else { @@ -347,12 +665,27 @@ OSErr BlockAllocate ( useMetaZone = false; } + //TODO: Figure out when we need to re-enable the RB-Tree. + + + //TODO: Make sure we use allocLimit when appropriate. + + /* + * TODO: Update BlockAllocate and its sub-functions to do cooperative allocation and bitmap scanning + * in conjunction with the Generate Tree function. If the red-black tree does not currently contain + * an allocation block of appropriate size, then start scanning blocks FOR the tree generation function until + * we find what we need. We'll update the tree fields when we're done, indicating that we've advanced the + * high water mark for the tree. + */ + // // Initialize outputs in case we get an error // *actualStartBlock = 0; *actualNumBlocks = 0; - freeBlocks = hfs_freeblks(VCBTOHFS(vcb), 0); + hfsmp = VCBTOHFS (vcb); + freeBlocks = hfs_freeblks(hfsmp, 0); + /* Skip free block check if blocks are being allocated for relocating * data during truncating a volume. @@ -394,14 +727,19 @@ OSErr BlockAllocate ( // if (startingBlock == 0) { HFS_MOUNT_LOCK(vcb, TRUE); + + /* Sparse Allocation and nextAllocation are both used even if the R/B Tree is on */ if (vcb->hfs_flags & HFS_HAS_SPARSE_DEVICE) { startingBlock = vcb->sparseAllocation; - } else { + } + else { startingBlock = vcb->nextAllocation; } HFS_MOUNT_UNLOCK(vcb, TRUE); updateAllocPtr = true; } + + if (startingBlock >= vcb->allocLimit) { startingBlock = 0; /* overflow so start at beginning */ } @@ -414,33 +752,91 @@ OSErr BlockAllocate ( err = BlockAllocateContig(vcb, startingBlock, minBlocks, maxBlocks, useMetaZone, actualStartBlock, actualNumBlocks); /* - * If we allocated from a new position then - * also update the roving allocator. + * If we allocated from a new position then also update the roving allocator. + * This will keep the roving allocation pointer up-to-date even + * if we are using the new R/B tree allocator, since + * it doesn't matter to us here, how the underlying allocator found + * the block to vend out. */ if ((err == noErr) && (*actualStartBlock > startingBlock) && ((*actualStartBlock < VCBTOHFS(vcb)->hfs_metazone_start) || (*actualStartBlock > VCBTOHFS(vcb)->hfs_metazone_end))) { - updateAllocPtr = true; } } else { +#if CONFIG_HFS_ALLOC_RBTREE + /* + * If the RB-Tree Allocator is live, just go straight for a + * BlockAllocateAny call and return the result. Otherwise, + * resort to the bitmap scanner. + */ + if (hfs_isrbtree_active(VCBTOHFS(vcb))) { + /* Start by trying to allocate from the starting block forward */ + err = BlockAllocateAny(vcb, startingBlock, vcb->allocLimit, + maxBlocks, useMetaZone, actualStartBlock, + actualNumBlocks); + + /* + * Because the RB-Tree is live, the previous call to BlockAllocateAny + * will use the rbtree variant. 
As a result, it will automatically search the
+			 * metadata zone for a valid extent if needed.  If we get a return value of
+			 * noErr, we found a valid extent and we can skip to the end.  If the error indicates
+			 * the disk is full, that's an equally valid return code and we can skip to the end, too.
+			 */
+			if (err == noErr || err == dskFulErr) {
+				goto Exit;
+			}
+			else {
+				//TODO: only tear down tree if the tree is finished building.
+				//Make sure to handle the ENOSPC condition properly.  We shouldn't error out in that case.
+				/* Tear down tree if we encounter an error */
+				if (hfsmp->extent_tree_flags & HFS_ALLOC_RB_ACTIVE) {
+					hfsmp->extent_tree_flags |= HFS_ALLOC_RB_ERRORED;
+					DestroyTrees(hfsmp);
+					ResetVCBFreeExtCache(hfsmp);
+				}
+				else {
+					goto Exit;
+				}
+				// fall through to the normal allocation since the rb-tree allocation failed.
+			}
+		}
+#endif
+
 		/*
 		 * Scan the bitmap once, gather the N largest free extents, then
 		 * allocate from these largest extents.  Repeat as needed until
 		 * we get all the space we needed.  We could probably build up
 		 * that list when the higher level caller tried (and failed) a
 		 * contiguous allocation first.
+		 *
+		 * Note that the free-extent cache will cease to be updated if
+		 * we are using the red-black tree for allocations.  If we jettison
+		 * the tree, then we will reset the free-extent cache and start over.
 		 */
+
 		err = BlockAllocateKnown(vcb, maxBlocks, actualStartBlock, actualNumBlocks);
-		if (err == dskFulErr)
+		/* dskFulErr out of BlockAllocateKnown indicates an empty Free Extent Cache */
+
+		if (err == dskFulErr) {
+			/*
+			 * Now we have to do a bigger scan.  Start at startingBlock and go up until the
+			 * allocation limit.
+			 */
 			err = BlockAllocateAny(vcb, startingBlock, vcb->allocLimit,
 			                       maxBlocks, useMetaZone, actualStartBlock,
 			                       actualNumBlocks);
-		if (err == dskFulErr)
+		}
+		if (err == dskFulErr) {
+			/*
+			 * We may be out of space in the normal zone; go up to the starting block from
+			 * the start of the volume.
+			 */
 			err = BlockAllocateAny(vcb, 1, startingBlock, maxBlocks,
 			                       useMetaZone, actualStartBlock,
 			                       actualNumBlocks);
+		}
 	}

Exit:
@@ -450,8 +846,6 @@ Exit:
 	// still need to update things like the free block count).
 	//
 	if (*actualNumBlocks != 0) {
-		int i,j;
-
 		//
 		//	If we used the volume's roving allocation pointer, then we need to update it.
// Adding in the length of the current allocation might reduce the next allocate @@ -462,41 +856,24 @@ Exit: // HFS_MOUNT_LOCK(vcb, TRUE); + lck_spin_lock(&hfsmp->vcbFreeExtLock); if (vcb->vcbFreeExtCnt == 0 && vcb->hfs_freed_block_count == 0) { vcb->sparseAllocation = *actualStartBlock; } + lck_spin_unlock(&hfsmp->vcbFreeExtLock); if (*actualNumBlocks < vcb->hfs_freed_block_count) { vcb->hfs_freed_block_count -= *actualNumBlocks; } else { vcb->hfs_freed_block_count = 0; } - + if (updateAllocPtr && - ((*actualStartBlock < VCBTOHFS(vcb)->hfs_metazone_start) || - (*actualStartBlock > VCBTOHFS(vcb)->hfs_metazone_end))) { + ((*actualStartBlock < VCBTOHFS(vcb)->hfs_metazone_start) || + (*actualStartBlock > VCBTOHFS(vcb)->hfs_metazone_end))) { HFS_UPDATE_NEXT_ALLOCATION(vcb, *actualStartBlock); } - for(i=0; i < (int)vcb->vcbFreeExtCnt; i++) { - u_int32_t start, end; - - start = vcb->vcbFreeExt[i].startBlock; - end = start + vcb->vcbFreeExt[i].blockCount; - - if ( (*actualStartBlock >= start && *actualStartBlock < end) - || ((*actualStartBlock + *actualNumBlocks) > start && *actualStartBlock < start)) { - - for(j=i; j < (int)vcb->vcbFreeExtCnt-1; j++) { - vcb->vcbFreeExt[j] = vcb->vcbFreeExt[j+1]; - } - - vcb->vcbFreeExtCnt--; - i--; // so we'll check the guy we just copied down... - - // keep looping because we may have invalidated more - // than one entry in the array - } - } + (void) remove_free_extent_cache(hfsmp, *actualStartBlock, *actualNumBlocks); /* * Update the number of free blocks on the volume @@ -510,11 +887,31 @@ Exit: MarkVCBDirty(vcb); HFS_MOUNT_UNLOCK(vcb, TRUE); - sanity_check_free_ext(vcb, 1); - hfs_generate_volume_notifications(VCBTOHFS(vcb)); } + if (ALLOC_DEBUG) { + if (err == noErr) { + if (*actualStartBlock >= hfsmp->totalBlocks) { + panic ("BlockAllocate: vending invalid blocks!"); + } + if (*actualStartBlock >= hfsmp->allocLimit) { + panic ("BlockAllocate: vending block past allocLimit!"); + } + + if ((*actualStartBlock + *actualNumBlocks) >= hfsmp->totalBlocks) { + panic ("BlockAllocate: vending too many invalid blocks!"); + } + + if ((*actualStartBlock + *actualNumBlocks) >= hfsmp->allocLimit) { + panic ("BlockAllocate: vending too many invalid blocks past allocLimit!"); + } + } + } + + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_BLOCK_ALLOCATE | DBG_FUNC_END, err, *actualStartBlock, *actualNumBlocks, 0, 0); + return err; } @@ -522,7 +919,7 @@ Exit: /* ;________________________________________________________________________________ ; -; Routine: BlkDealloc +; Routine: BlockDeallocate ; ; Function: Update the bitmap to deallocate a run of disk allocation blocks ; @@ -536,10 +933,10 @@ Exit: ; ; Side effects: ; The volume bitmap is read and updated; the volume bitmap cache may be changed. +; The Allocator's red-black trees may also be modified as a result. 
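+;
+;	Usage sketch (hypothetical caller; "extent" here stands for any
+;	previously allocated run of blocks):
+;
+;	    err = BlockDeallocate(vcb, extent.startBlock, extent.blockCount, 0);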
;________________________________________________________________________________
*/
-__private_extern__
 OSErr BlockDeallocate (
 	ExtendedVCB		*vcb,			//	Which volume to deallocate space on
 	u_int32_t		firstBlock,		//	First block in range to deallocate
@@ -547,8 +944,12 @@ OSErr BlockDeallocate (
 	u_int32_t		flags)
 {
 	OSErr			err;
-	u_int32_t		tempWord;
+	struct hfsmount *hfsmp;
+	hfsmp = VCBTOHFS(vcb);

+	if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED)
+		KERNEL_DEBUG_CONSTANT(HFSDBG_BLOCK_DEALLOCATE | DBG_FUNC_START, firstBlock, numBlocks, flags, 0, 0);
+
 	//
 	//	If no blocks to deallocate, then exit early
 	//
@@ -556,11 +957,51 @@ OSErr BlockDeallocate (
 		err = noErr;
 		goto Exit;
 	}
+
+
+	if (ALLOC_DEBUG) {
+		if (firstBlock >= hfsmp->totalBlocks) {
+			panic ("BlockDeallocate: freeing invalid blocks!");
+		}
+
+		if ((firstBlock + numBlocks) >= hfsmp->totalBlocks) {
+			panic ("BlockDeallocate: freeing too many invalid blocks!");
+		}
+	}
+
+
+
+
+	/*
+	 * If we're using the red-black tree code, then try to free the
+	 * blocks by marking them in the red-black tree first.  If the tree
+	 * is not active for whatever reason (or we're not using the
+	 * R/B Tree code at all), then go straight for the BlockMarkFree
+	 * function.
+	 *
+	 * Remember that we can get into this function if the tree isn't finished
+	 * building.  In that case, check to see if the block we're de-allocating is
+	 * past the high watermark.
+	 */
+#if CONFIG_HFS_ALLOC_RBTREE
+	if (hfs_isrbtree_active(VCBTOHFS(vcb))) {
+		/*
+		 * BlockMarkFreeRBTree deals with the case where we are resizing the
+		 * filesystem (shrinking), and we need to manipulate the bitmap beyond the portion
+		 * that is currently controlled by the r/b tree.
+		 */
+
+		//TODO: Update multiplexing code for the half-finished case.
+		err = BlockMarkFreeRBTree(vcb, firstBlock, numBlocks);
+		adjustFreeExtCache = 0;
+	}
+	else {
+		err = BlockMarkFreeInternal(vcb, firstBlock, numBlocks, true);
+	}

-	//
-	//	Call internal routine to free the sequence of blocks
-	//
-	err = BlockMarkFree(vcb, firstBlock, numBlocks);
+#else
+	err = BlockMarkFreeInternal(vcb, firstBlock, numBlocks, true);
+#endif
 	if (err)
 		goto Exit;

@@ -578,75 +1019,39 @@ OSErr BlockDeallocate (
 	}
 	vcb->hfs_freed_block_count += numBlocks;
-	if (firstBlock < vcb->sparseAllocation) {
-		vcb->sparseAllocation = firstBlock;
-	}

 	if (vcb->nextAllocation == (firstBlock + numBlocks)) {
 		HFS_UPDATE_NEXT_ALLOCATION(vcb, (vcb->nextAllocation - numBlocks));
 	}

-	if (free_extent_cache_active(vcb) == 0) {
-		goto skip_cache;
-	}
-
-	tempWord = vcb->vcbFreeExtCnt;
-	//	Add this free chunk to the free extent list
-	if (vcb->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
-		// Sorted by start block
-		if (tempWord == kMaxFreeExtents && vcb->vcbFreeExt[kMaxFreeExtents-1].startBlock > firstBlock)
-			--tempWord;
-		if (tempWord < kMaxFreeExtents)
-		{
-			//	We're going to add this extent.  Bubble any smaller extents down in the list.
-			while (tempWord && vcb->vcbFreeExt[tempWord-1].startBlock > firstBlock)
-			{
-				vcb->vcbFreeExt[tempWord] = vcb->vcbFreeExt[tempWord-1];
-				if (vcb->vcbFreeExt[tempWord].startBlock < vcb->sparseAllocation) {
-					vcb->sparseAllocation = vcb->vcbFreeExt[tempWord].startBlock;
-				}
-				--tempWord;
-			}
-			vcb->vcbFreeExt[tempWord].startBlock = firstBlock;
-			vcb->vcbFreeExt[tempWord].blockCount = numBlocks;
-
-			if (vcb->vcbFreeExtCnt < kMaxFreeExtents) {
-				++vcb->vcbFreeExtCnt;
-			}
-		}
-	} else {
-		// Sorted by num blocks
-		if (tempWord == kMaxFreeExtents && vcb->vcbFreeExt[kMaxFreeExtents-1].blockCount < numBlocks)
-			--tempWord;
-		if (tempWord < kMaxFreeExtents)
-		{
-			//	We're going to add this extent.  Bubble any smaller extents down in the list.
-			while (tempWord && vcb->vcbFreeExt[tempWord-1].blockCount < numBlocks)
-			{
-				vcb->vcbFreeExt[tempWord] = vcb->vcbFreeExt[tempWord-1];
-				if (vcb->vcbFreeExt[tempWord].startBlock < vcb->sparseAllocation) {
-					vcb->sparseAllocation = vcb->vcbFreeExt[tempWord].startBlock;
-				}
-				--tempWord;
-			}
-			vcb->vcbFreeExt[tempWord].startBlock = firstBlock;
-			vcb->vcbFreeExt[tempWord].blockCount = numBlocks;
-
-			if (vcb->vcbFreeExtCnt < kMaxFreeExtents) {
-				++vcb->vcbFreeExtCnt;
-			}
+	if (hfsmp->jnl == NULL) {
+		/*
+		 * In the journal case, we'll add the free extent once the journal
+		 * calls us back to tell us it wrote the transaction to disk.
+		 */
+		(void) add_free_extent_cache(vcb, firstBlock, numBlocks);
+
+		/*
+		 * In the journal case, we'll only update sparseAllocation once the
+		 * free extent cache becomes empty (when we remove the last entry
+		 * from the cache).  Skipping it here means we're less likely to
+		 * find a recently freed extent via the bitmap before it gets added
+		 * to the free extent cache.
+		 */
+		if (firstBlock < vcb->sparseAllocation) {
+			vcb->sparseAllocation = firstBlock;
 		}
 	}
-
-skip_cache:
+
 	MarkVCBDirty(vcb);
 	HFS_MOUNT_UNLOCK(vcb, TRUE);

-	sanity_check_free_ext(vcb, 1);
-
 	hfs_generate_volume_notifications(VCBTOHFS(vcb));
Exit:

+	if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED)
+		KERNEL_DEBUG_CONSTANT(HFSDBG_BLOCK_DEALLOCATE | DBG_FUNC_END, err, 0, 0, 0, 0);
+
 	return err;
}

@@ -656,7 +1061,6 @@ u_int8_t freebitcount[16] = {
 	3, 2, 2, 1, 2, 1, 1, 0,  /* 8 9 A B C D E F */
};

-__private_extern__
u_int32_t
MetaZoneFreeBlocks(ExtendedVCB *vcb)
{
@@ -763,6 +1167,9 @@ static OSErr ReadBitmapBlock(
 	daddr64_t block;
 	u_int32_t blockSize;

+	if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED)
+		KERNEL_DEBUG_CONSTANT(HFSDBG_READ_BITMAP_BLOCK | DBG_FUNC_START, bit, 0, 0, 0, 0);
+
 	/*
 	 * volume bitmap blocks are protected by the allocation file lock
 	 */
@@ -792,6 +1199,9 @@ static OSErr ReadBitmapBlock(
 		}
 	}

+	if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED)
+		KERNEL_DEBUG_CONSTANT(HFSDBG_READ_BITMAP_BLOCK | DBG_FUNC_END, err, 0, 0, 0, 0);
+
 	return err;
}

@@ -816,6 +1226,9 @@ static OSErr ReleaseBitmapBlock(
{
 	struct buf *bp = (struct buf *)blockRef;

+	if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED)
+		KERNEL_DEBUG_CONSTANT(HFSDBG_RELEASE_BITMAP_BLOCK | DBG_FUNC_START, dirty, 0, 0, 0, 0);
+
 	if (blockRef == 0) {
 		if (dirty)
 			panic("hfs: ReleaseBitmapBlock: missing bp");
@@ -837,9 +1250,42 @@ static OSErr ReleaseBitmapBlock(
 		}
 	}

+	if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED)
+		KERNEL_DEBUG_CONSTANT(HFSDBG_RELEASE_BITMAP_BLOCK | DBG_FUNC_END, 0, 0, 0, 0, 0);
+
+
 	return (0);
+}
+
+#if CONFIG_HFS_ALLOC_RBTREE
+/*
+ * ReleaseRBScanBitmapBlock is used to release struct bufs that were
+ * created for use by the Red-Black tree generation code.  We want to force
We want to force + * them to be purged out of the buffer cache ASAP, so we'll release them differently + * than in the ReleaseBitmapBlock case. Alternately, we know that we're only reading + * the blocks, so we will never dirty them as part of the tree building scan. + */ + +static OSErr ReleaseRBScanBitmapBlock(struct buf *bp ) { + + if (bp == NULL) { + return (0); + } + + if (bp) { + /* Mark the buffer invalid if it isn't locked, then release it */ + if ((buf_flags(bp) & B_LOCKED) == 0) { + buf_markinvalid(bp); + } + buf_brelse(bp); + } + return (0); + + } +#endif + /* _______________________________________________________________________ @@ -872,21 +1318,48 @@ static OSErr BlockAllocateContig( u_int32_t *actualStartBlock, u_int32_t *actualNumBlocks) { - OSErr err; - // - // Find a contiguous group of blocks at least minBlocks long. - // Determine the number of contiguous blocks available (up - // to maxBlocks). - // +#if CONFIG_HFS_ALLOC_RBTREE + if (hfs_isrbtree_active(VCBTOHFS(vcb))) { + return BlockAllocateContigRBTree(vcb, startingBlock, minBlocks, maxBlocks, useMetaZone, + actualStartBlock, actualNumBlocks, 1); + } +#endif + return BlockAllocateContigBitmap(vcb, startingBlock, minBlocks, + maxBlocks, useMetaZone, actualStartBlock, actualNumBlocks); +} - /* - * NOTE: If the only contiguous free extent of at least minBlocks - * crosses startingBlock (i.e. starts before, ends after), then we - * won't find it. Earlier versions *did* find this case by letting - * the second search look past startingBlock by minBlocks. But - * with the free extent cache, this can lead to duplicate entries - * in the cache, causing the same blocks to be allocated twice. +/* + * Variant of BlockAllocateContig that uses the original bitmap-searching logic + */ + +static OSErr BlockAllocateContigBitmap( + ExtendedVCB *vcb, + u_int32_t startingBlock, + u_int32_t minBlocks, + u_int32_t maxBlocks, + Boolean useMetaZone, + u_int32_t *actualStartBlock, + u_int32_t *actualNumBlocks) +{ + OSErr err; + + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_CONTIG_BITMAP | DBG_FUNC_START, startingBlock, minBlocks, maxBlocks, useMetaZone, 0); + + // + // Find a contiguous group of blocks at least minBlocks long. + // Determine the number of contiguous blocks available (up + // to maxBlocks). + // + + /* + * NOTE: If the only contiguous free extent of at least minBlocks + * crosses startingBlock (i.e. starts before, ends after), then we + * won't find it. Earlier versions *did* find this case by letting + * the second search look past startingBlock by minBlocks. But + * with the free extent cache, this can lead to duplicate entries + * in the cache, causing the same blocks to be allocated twice. */ err = BlockFindContiguous(vcb, startingBlock, vcb->allocLimit, minBlocks, maxBlocks, useMetaZone, actualStartBlock, actualNumBlocks); @@ -902,10 +1375,234 @@ static OSErr BlockAllocateContig( // Now mark those blocks allocated. // if (err == noErr) - err = BlockMarkAllocated(vcb, *actualStartBlock, *actualNumBlocks); + err = BlockMarkAllocatedInternal(vcb, *actualStartBlock, *actualNumBlocks); + + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_CONTIG_BITMAP | DBG_FUNC_END, err, *actualStartBlock, *actualNumBlocks, 0, 0); + + return err; +} + +#if CONFIG_HFS_ALLOC_RBTREE +/* + * Variant of BlockAllocateContig that uses the newer red-black tree library + * in order to manage free space extents. 
This will search the red-black tree
+ * and return results in the same fashion as BlockAllocateContigBitmap.
+ *
+ * Note that this function is invoked from both the red-black tree variant of BlockAllocateAny
+ * as well as BlockAllocateContig.  In order to determine when we should vend contiguous chunks over
+ * locality-based searches, we use the forceContig argument to determine who called us.
+ */
+
+static OSErr BlockAllocateContigRBTree(
+	ExtendedVCB		*vcb,
+	u_int32_t		startingBlock,
+	u_int32_t		minBlocks,
+	u_int32_t		maxBlocks,
+	Boolean			useMetaZone,
+	u_int32_t		*actualStartBlock,
+	u_int32_t		*actualNumBlocks,
+	u_int32_t 	forceContig)
+{
+	OSErr	err;
+	struct hfsmount *hfsmp = VCBTOHFS(vcb);
+	extent_node_t search_sentinel;
+	extent_node_t *node = NULL;
+	extent_node_t tempnode;
+
+	bzero (&tempnode, sizeof(extent_node_t));
+
+	/* Begin search at the end of the file, via startingBlock */
+	memset (&search_sentinel, 0, sizeof(extent_node_t));
+	search_sentinel.offset = startingBlock;
+
+	*actualStartBlock = 0;
+	*actualNumBlocks = 0;
+
+	/*
+	 * Find the first available extent that satisfies the allocation by searching
+	 * from the starting point and moving forward
+	 */
+	node = extent_tree_off_search_next(&hfsmp->offset_tree, &search_sentinel);
+
+	if (node) {
+		*actualStartBlock = node->offset;
+		*actualNumBlocks = node->length;
+	}
+
+	/* If we managed to grab at least minBlocks of space, then we're done. */
+
+	if (*actualNumBlocks >= minBlocks) {
+		if (*actualNumBlocks > maxBlocks) {
+			*actualNumBlocks = maxBlocks;
+		}
+
+
+		/* Check to see if blocks are already marked as in-use */
+		if (ALLOC_DEBUG) {
+			REQUIRE_FILE_LOCK(vcb->hfs_allocation_vp, false);
+			if (hfs_isallocated(hfsmp, *actualStartBlock, *actualNumBlocks)) {
+				printf("bad node: %p, offset %d, length %d\n", node, node->offset,node->length);
+				panic ("HFS RBTree Allocator: Blocks starting @ %x for %x blocks in use already\n",
+					   *actualStartBlock, *actualNumBlocks);
+			}
+		}
+
+		/*
+		 * BlockMarkAllocatedRBTree is responsible for removing the nodes
+		 * from the red-black tree after the bitmap has been updated on-disk.
+		 */
+		err = BlockMarkAllocatedRBTree(vcb, *actualStartBlock, *actualNumBlocks);
+		if (err == noErr) {
+
+			if ( ALLOC_DEBUG ) {
+				REQUIRE_FILE_LOCK(vcb->hfs_allocation_vp, false);
+				if (!hfs_isallocated(hfsmp, *actualStartBlock, *actualNumBlocks)) {
+					panic ("HFS RBTree Allocator: Blocks starting @ %x for %x blocks not in use yet\n",
+						   *actualStartBlock, *actualNumBlocks);
+				}
+				check_rbtree_extents (VCBTOHFS(vcb), *actualStartBlock, *actualNumBlocks, ASSERT_ALLOC);
+			}
+
+			return err;
+		}
+	}
+
+	/*
+	 * We may have failed to grow at the end of the file.  We'll try to find
+	 * appropriate free extents, searching by size in the normal allocation zone.
+	 *
+	 * However, if we're allocating on behalf of a sparse device that hasn't explicitly
+	 * requested a contiguous chunk, then we try to search by offset, even if it
+	 * means fragmenting the file.  We want all available entries starting
+	 * from the front of the disk to avoid creating new bandfiles.  As a result,
+	 * we'll start by searching the offset tree rather than the normal length
+	 * tree. Note that this function can be invoked from BlockAllocateAny, in
+	 * which case the minimum block size is 1 block, making it easy to succeed.
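+	 *
+	 * (Design note: the length-ordered search minimizes fragmentation of
+	 * the file being extended, while the offset-ordered search keeps
+	 * allocations packed toward the front of the disk so that a sparse
+	 * disk image materializes as few bands as possible; forceContig is
+	 * what selects between the two behaviors.)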
+ */ + search_sentinel.offset = hfsmp->hfs_metazone_end; + search_sentinel.length = minBlocks; + + if ((vcb->hfs_flags & HFS_HAS_SPARSE_DEVICE) && (forceContig == 0)) { + /* just start with the first offset node */ + node = extent_tree_off_search_next(&hfsmp->offset_tree, &search_sentinel); + } + else { + /* + * Otherwise, start from the end of the metadata zone or our next allocation pointer, + * and try to find the first chunk of size >= min. + */ + node = extent_tree_off_search_nextWithSize (&hfsmp->offset_tree, &search_sentinel); + + if (node == NULL) { + extent_node_t *metaend_node; + /* + * Maybe there's a free extent coalesced with the space still in the metadata + * zone. If there is, find it and allocate from the middle of it, starting at + * the end of the metadata zone. + * + * If search_prev yields a result that is not offset == metazone_end, then that + * means no node existed at that offset. If the previous node's offset + length crosses + * the metazone boundary, then allocate from there. If it is too small to + * cross the metazone boundary, then it is of no importance and we'd have to + * report ENOSPC. + */ + metaend_node = extent_tree_off_search_prev(&hfsmp->offset_tree, &search_sentinel); + + if ((metaend_node) && (metaend_node->offset < hfsmp->hfs_metazone_end)) { + u_int32_t node_end = metaend_node->offset + metaend_node->length; + if (node_end > hfsmp->hfs_metazone_end) { + u_int32_t modified_length = node_end - hfsmp->hfs_metazone_end; + if (modified_length >= minBlocks) { + /* + * Then we can allocate it. Fill in the contents into tempnode, + * and BlockMarkAllocatedRBTree below will take care of the rest. + */ + tempnode.offset = hfsmp->hfs_metazone_end; + tempnode.length = MIN(minBlocks, node_end - tempnode.offset); + node = &tempnode; + } + } + } + } + } + + /* If we can't find anything useful, search the metadata zone as a last resort. */ + + if ((!node) && useMetaZone) { + search_sentinel.offset = 0; + search_sentinel.length = minBlocks; + node = extent_tree_off_search_nextWithSize (&hfsmp->offset_tree, &search_sentinel); + } + + /* If we found something useful, then go ahead and update the bitmap */ + if ((node) && (node->length >= minBlocks)) { + *actualStartBlock = node->offset; + if (node->length >= maxBlocks) { + *actualNumBlocks = maxBlocks; + } + else { + *actualNumBlocks = node->length; + } + + err = BlockMarkAllocatedRBTree(vcb, *actualStartBlock, *actualNumBlocks); + + if (err == noErr) { + if ( ALLOC_DEBUG ) { + REQUIRE_FILE_LOCK(vcb->hfs_allocation_vp, false); + if (!hfs_isallocated(hfsmp, *actualStartBlock, *actualNumBlocks)) { + panic ("HFS RBTree Allocator: Blocks starting @ %x for %x blocks not in use yet\n", + *actualStartBlock, *actualNumBlocks); + } + check_rbtree_extents (VCBTOHFS(vcb), *actualStartBlock, *actualNumBlocks, ASSERT_ALLOC); + } + } + } + else { + int destroy_trees = 0; + /* + * TODO: Add High-water mark check here. If we couldn't find anything useful, + * when do we tear down the tree? Or should the logic be in BlockAllocateContig?? + */ + if (destroy_trees) { + DestroyTrees(VCBTOHFS(vcb)); + /* Reset the Free Ext Cache since we'll be using it now. */ + ResetVCBFreeExtCache(VCBTOHFS(vcb)); + } + + if (ALLOC_DEBUG) { + printf("HFS allocator: No space on FS (%s). Node %p Start %d Min %d, Max %d, Tree still alive.\n", + hfsmp->vcbVN, node, startingBlock, minBlocks, maxBlocks); + + /* Dump the list ? */ + extent_tree_offset_print(&hfsmp->offset_tree); + + printf("HFS allocator: Done printing list on FS (%s). 
Min %d, Max %d, Tree still alive.\n",
+				hfsmp->vcbVN, minBlocks, maxBlocks);
+
+
+
+		}
+		err = dskFulErr;
+	}
+
+	if (err == noErr) {
+		if (ALLOC_DEBUG) {
+			if ((*actualStartBlock + *actualNumBlocks) > vcb->allocLimit)
+				panic("hfs: BlockAllocateAny: allocation overflow on \"%s\"", vcb->vcbVN);
+		}
+	}
+	else {
+		*actualStartBlock = 0;
+		*actualNumBlocks = 0;
+	}

 	return err;
+
+}
+#endif
+
+

 /*
 _______________________________________________________________________

@@ -929,6 +1626,12 @@ Outputs:
 	actualNumBlocks		Number of blocks allocated, or 0 if error
 _______________________________________________________________________
 */
+
+/*
+ * BlockAllocateAny acts as a multiplexer between BlockAllocateAnyRBTree
+ * and BlockAllocateAnyBitmap, which uses the bitmap scanning logic.
+ */
+
 static OSErr BlockAllocateAny(
 	ExtendedVCB		*vcb,
 	u_int32_t		startingBlock,
 	register u_int32_t	endingBlock,
 	u_int32_t		maxBlocks,
 	Boolean			useMetaZone,
 	u_int32_t		*actualStartBlock,
 	u_int32_t		*actualNumBlocks)
 {
+
+#if CONFIG_HFS_ALLOC_RBTREE
+	if (hfs_isrbtree_active(VCBTOHFS(vcb))) {
+		return BlockAllocateAnyRBTree(vcb, startingBlock, maxBlocks, useMetaZone, actualStartBlock, actualNumBlocks);
+	}
+#endif
+	return BlockAllocateAnyBitmap(vcb, startingBlock, endingBlock, maxBlocks, useMetaZone, actualStartBlock, actualNumBlocks);
+
+}
+
+
+#if CONFIG_HFS_ALLOC_RBTREE
+/*
+ * BlockAllocateAnyRBTree finds one or more allocation blocks by using
+ * the red-black allocation tree to figure out where the free ranges are.
+ * This function is typically used as a last resort because we were unable to
+ * find the right ranges.  Outputs are the same as BlockAllocateAnyBitmap.
+ */
+static OSErr BlockAllocateAnyRBTree(
+	ExtendedVCB		*vcb,
+	u_int32_t		startingBlock,
+	u_int32_t		maxBlocks,
+	Boolean			useMetaZone,
+	u_int32_t		*actualStartBlock,
+	u_int32_t		*actualNumBlocks)
+{
+	OSErr err;
+
+	/*
+	 * BlockAllocateContig
+	 */
+	/* If we're using the red-black tree, try searching at the specified offsets. */
+	err = BlockAllocateContigRBTree(vcb, startingBlock, 1, maxBlocks, useMetaZone,
+									actualStartBlock, actualNumBlocks, 0);
+	return err;
+
+}
+#endif
+
+/*
+ * BlockAllocateAnyBitmap finds free ranges by scanning the bitmap to figure out
+ * where the free allocation blocks are.  Inputs and outputs are the same as for
+ * BlockAllocateAny and BlockAllocateAnyRBTree.
+ */
+
+static OSErr BlockAllocateAnyBitmap(
+	ExtendedVCB		*vcb,
+	u_int32_t		startingBlock,
+	register u_int32_t	endingBlock,
+	u_int32_t		maxBlocks,
+	Boolean			useMetaZone,
+	u_int32_t		*actualStartBlock,
+	u_int32_t		*actualNumBlocks)
+{
 	OSErr			err;
 	register u_int32_t	block;			//	current block number
 	register u_int32_t	currentWord;	//	Pointer to current word within bitmap block
@@ -951,6 +1708,9 @@ static OSErr BlockAllocateAny(
 	Boolean dirty = false;
 	struct hfsmount *hfsmp = VCBTOHFS(vcb);

+	if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED)
+		KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_ANY_BITMAP | DBG_FUNC_START, startingBlock, endingBlock, maxBlocks, useMetaZone, 0);
+
 	/*
 	 * When we're skipping the metadata zone and the start/end
 	 * range overlaps with the metadata zone then adjust the
@@ -1128,11 +1888,19 @@ Exit:
 		if ((*actualStartBlock + *actualNumBlocks) > vcb->allocLimit) {
 			panic("hfs: BlockAllocateAny: allocation overflow on \"%s\"", vcb->vcbVN);
 		}
-
-		/* Remove these blocks from the TRIM list if applicable */
-		if (CONFIG_HFS_TRIM) {
-			hfs_unmap_alloc_extent(vcb, *actualStartBlock, *actualNumBlocks);
-		}
+
+		/*
+		 * Beware!
+		 * Because this function directly manipulates the bitmap to mark the
+		 * blocks it came across as allocated, we must inform the journal (and
+		 * subsequently, the journal's trim list) that we are allocating these
+		 * blocks, just like in BlockMarkAllocatedInternal.  hfs_unmap_alloc_extent
+		 * and the functions it calls will serialize behind the journal trim list lock
+		 * to ensure that either the asynchronous flush/TRIM/UNMAP happens prior to
+		 * us manipulating the trim list, or we get there first and successfully remove
+		 * these bitmap blocks before the TRIM happens.
+		 */
+		hfs_unmap_alloc_extent (vcb, *actualStartBlock, *actualNumBlocks);
 	}
 	else {
 		*actualStartBlock = 0;
@@ -1142,6 +1910,9 @@ Exit:
 	if (currCache)
 		(void) ReleaseBitmapBlock(vcb, blockRef, dirty);

+	if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED)
+		KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_ANY_BITMAP | DBG_FUNC_END, err, *actualStartBlock, *actualNumBlocks, 0, 0);
+
 	return err;
}

@@ -1178,15 +1949,25 @@ static OSErr BlockAllocateKnown(
 	u_int32_t		foundBlocks;
 	u_int32_t		newStartBlock, newBlockCount;

+	if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED)
+		KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_KNOWN_BITMAP | DBG_FUNC_START, 0, 0, maxBlocks, 0, 0);
+
 	HFS_MOUNT_LOCK(vcb, TRUE);
-	if (free_extent_cache_active(vcb) == 0 ||
-		vcb->vcbFreeExtCnt == 0 ||
+	lck_spin_lock(&vcb->vcbFreeExtLock);
+	if ((hfs_isrbtree_active(vcb) == true) ||
+		vcb->vcbFreeExtCnt == 0 ||
 		vcb->vcbFreeExt[0].blockCount == 0) {
+		lck_spin_unlock(&vcb->vcbFreeExtLock);
 		HFS_MOUNT_UNLOCK(vcb, TRUE);
+		if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED)
+			KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_KNOWN_BITMAP | DBG_FUNC_END, dskFulErr, *actualStartBlock, *actualNumBlocks, 0, 0);
 		return dskFulErr;
 	}
+	lck_spin_unlock(&vcb->vcbFreeExtLock);
 	HFS_MOUNT_UNLOCK(vcb, TRUE);

+	lck_spin_lock(&vcb->vcbFreeExtLock);
+
+	// Just grab up to maxBlocks of the first (largest) free extent.
 	*actualStartBlock = vcb->vcbFreeExt[0].startBlock;
 	foundBlocks = vcb->vcbFreeExt[0].blockCount;
@@ -1246,6 +2027,7 @@ static OSErr BlockAllocateKnown(
 	}

done:
+	lck_spin_unlock(&vcb->vcbFreeExtLock);

 	// sanity check
 	if ((*actualStartBlock + *actualNumBlocks) > vcb->allocLimit) {
@@ -1260,24 +2042,83 @@ done:
 	//
 	//	Now mark the found extent in the bitmap
 	//
-	err = BlockMarkAllocated(vcb, *actualStartBlock, *actualNumBlocks);
+	err = BlockMarkAllocatedInternal(vcb, *actualStartBlock, *actualNumBlocks);
 	}

-	sanity_check_free_ext(vcb, 1);
+	sanity_check_free_ext(vcb, 0);
+
+	if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED)
+		KERNEL_DEBUG_CONSTANT(HFSDBG_ALLOC_KNOWN_BITMAP | DBG_FUNC_END, err, *actualStartBlock, *actualNumBlocks, 0, 0);

 	return err;
}

+/*
+ * BlockMarkAllocated
+ *
+ * This is a wrapper function around the internal calls which will actually mark the blocks
+ * as in-use.  It will mark the blocks in the red-black tree if appropriate.  We need to do
+ * this logic here to avoid callers having to deal with whether or not the red-black tree
+ * is enabled.
+ */
+
+
+OSErr BlockMarkAllocated(
+	ExtendedVCB		*vcb,
+	u_int32_t		startingBlock,
+	register u_int32_t	numBlocks)
+{
+	struct hfsmount *hfsmp;
+
+	hfsmp = VCBTOHFS(vcb);
+#if CONFIG_HFS_ALLOC_RBTREE
+	if (hfs_isrbtree_active(hfsmp)) {
+		int err;
+
+		if ((startingBlock >= hfsmp->offset_block_end) &&
+			(hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS)) {
+			/*
+			 * We're manipulating a portion of the bitmap that is not controlled by the
+			 * red-black tree.
Just update the bitmap and don't bother manipulating the tree + */ + goto justbitmap; + } + + err = BlockMarkAllocatedRBTree(vcb, startingBlock, numBlocks); + if (err == noErr) { + if ( ALLOC_DEBUG ) { + REQUIRE_FILE_LOCK(hfsmp->hfs_allocation_vp, false); + if (!hfs_isallocated(hfsmp, startingBlock, numBlocks)) { + panic ("HFS RBTree Allocator: Blocks starting @ %x for %x blocks not in use yet\n", + startingBlock, numBlocks); + } + check_rbtree_extents (hfsmp, startingBlock, numBlocks, ASSERT_ALLOC); + } + } + return err; + + } +justbitmap: +#endif + + return BlockMarkAllocatedInternal(vcb, startingBlock, numBlocks); + +} + /* _______________________________________________________________________ -Routine: BlockMarkAllocated +Routine: BlockMarkAllocatedInternal Function: Mark a contiguous group of blocks as allocated (set in the bitmap). It assumes those bits are currently marked - deallocated (clear in the bitmap). + deallocated (clear in the bitmap). Note that this function + must be called regardless of whether or not the bitmap or + tree-based allocator is used, as all allocations must correctly + be marked on-disk. If the tree-based approach is running, then + this will be done before the node is removed from the tree. Inputs: vcb Pointer to volume where space is to be allocated @@ -1285,8 +2126,8 @@ Inputs: numBlocks Number of blocks to mark as allocated _______________________________________________________________________ */ -__private_extern__ -OSErr BlockMarkAllocated( +static +OSErr BlockMarkAllocatedInternal ( ExtendedVCB *vcb, u_int32_t startingBlock, register u_int32_t numBlocks) @@ -1304,9 +2145,10 @@ OSErr BlockMarkAllocated( // XXXdbg struct hfsmount *hfsmp = VCBTOHFS(vcb); - if (CONFIG_HFS_TRIM) { - hfs_unmap_alloc_extent(vcb, startingBlock, numBlocks); - } + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_MARK_ALLOC_BITMAP | DBG_FUNC_START, startingBlock, numBlocks, 0, 0, 0); + + hfs_unmap_alloc_extent(vcb, startingBlock, numBlocks); // // Pre-read the bitmap block containing the first word of allocation @@ -1349,7 +2191,7 @@ OSErr BlockMarkAllocated( } #if DEBUG_BUILD if ((*currentWord & SWAP_BE32 (bitMask)) != 0) { - panic("hfs: BlockMarkAllocated: blocks already allocated!"); + panic("hfs: BlockMarkAllocatedInternal: blocks already allocated!"); } #endif *currentWord |= SWAP_BE32 (bitMask); // set the bits in the bitmap @@ -1387,7 +2229,7 @@ OSErr BlockMarkAllocated( } #if DEBUG_BUILD if (*currentWord != 0) { - panic("hfs: BlockMarkAllocated: blocks already allocated!"); + panic("hfs: BlockMarkAllocatedInternal: blocks already allocated!"); } #endif *currentWord = SWAP_BE32 (bitMask); @@ -1425,7 +2267,7 @@ OSErr BlockMarkAllocated( } #if DEBUG_BUILD if ((*currentWord & SWAP_BE32 (bitMask)) != 0) { - panic("hfs: BlockMarkAllocated: blocks already allocated!"); + panic("hfs: BlockMarkAllocatedInternal: blocks already allocated!"); } #endif *currentWord |= SWAP_BE32 (bitMask); // set the bits in the bitmap @@ -1438,73 +2280,322 @@ Exit: if (buffer) (void)ReleaseBitmapBlock(vcb, blockRef, true); + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_MARK_ALLOC_BITMAP | DBG_FUNC_END, err, 0, 0, 0, 0); + return err; } - +#if CONFIG_HFS_ALLOC_RBTREE /* -_______________________________________________________________________ - -Routine: BlockMarkFree - -Function: Mark a contiguous group of blocks as free (clear in the - bitmap). It assumes those bits are currently marked - allocated (set in the bitmap). 
+ * This is a wrapper function around BlockMarkAllocated. This function is + * called when the RB Tree-based allocator needs to mark a block as in-use. + * This function should take the locks that would not normally be + * necessary for the normal bitmap allocator, and then call the function. Once + * the on-disk data structures are updated properly, then this will remove the + * appropriate node from the tree. + */ -Inputs: - vcb Pointer to volume where space is to be freed - startingBlock First block number to mark as freed - numBlocks Number of blocks to mark as freed -_______________________________________________________________________ -*/ -__private_extern__ -OSErr BlockMarkFree( +static OSErr BlockMarkAllocatedRBTree( ExtendedVCB *vcb, - u_int32_t startingBlock_in, - register u_int32_t numBlocks_in) + u_int32_t startingBlock, + u_int32_t numBlocks) { - OSErr err; - u_int32_t startingBlock = startingBlock_in; - u_int32_t numBlocks = numBlocks_in; - register u_int32_t *currentWord; // Pointer to current word within bitmap block - register u_int32_t wordsLeft; // Number of words left in this bitmap block - register u_int32_t bitMask; // Word with given bits already set (ready to OR in) - u_int32_t firstBit; // Bit index within word of first bit to allocate - u_int32_t numBits; // Number of bits in word to allocate - u_int32_t *buffer = NULL; - uintptr_t blockRef; - u_int32_t bitsPerBlock; - u_int32_t wordsPerBlock; - // XXXdbg - struct hfsmount *hfsmp = VCBTOHFS(vcb); - - /* - * NOTE: We use vcb->totalBlocks instead of vcb->allocLimit because we - * need to be able to free blocks being relocated during hfs_truncatefs. - */ - if (startingBlock + numBlocks > vcb->totalBlocks) { - printf ("hfs: BlockMarkFree() trying to free non-existent blocks starting at %u (numBlock=%u) on volume %s\n", startingBlock, numBlocks, vcb->vcbVN); - hfs_mark_volume_inconsistent(vcb); - err = EIO; - goto Exit; - } - - // - // Pre-read the bitmap block containing the first word of allocation - // + OSErr err; + struct hfsmount *hfsmp = VCBTOHFS(vcb); + int rb_err = 0; - err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef); - if (err != noErr) goto Exit; - // XXXdbg - if (hfsmp->jnl) { - journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + + if (ALLOC_DEBUG) { + REQUIRE_FILE_LOCK(vcb->hfs_allocation_vp, false); + if (hfs_isallocated(hfsmp, startingBlock, numBlocks)) { + panic ("HFS RBTree Allocator: Blocks starting @ %x for %x blocks in use already\n", + startingBlock, numBlocks); + } + check_rbtree_extents (VCBTOHFS(vcb), startingBlock, numBlocks, ASSERT_FREE); } + + err = BlockMarkAllocatedInternal (vcb, startingBlock, numBlocks); + + if (err == noErr) { - // - // Initialize currentWord, and wordsLeft. - // - { - u_int32_t wordIndexInBlock; + if (ALLOC_DEBUG) { + if (!hfs_isallocated(hfsmp, startingBlock, numBlocks)) { + panic ("HFS RBTree Allocator: Blocks starting @ %x for %x blocks not in use yet!\n", + startingBlock, numBlocks); + } + } + + /* + * Mark the blocks in the offset tree. + */ + rb_err = extent_tree_offset_alloc_space(&hfsmp->offset_tree, numBlocks, startingBlock); + if (rb_err) { + if (ALLOC_DEBUG) { + printf("HFS RBTree Allocator: Could not mark blocks as in-use! %d \n", rb_err); + } + + /* + * We may be called from the BlockMarkAllocated interface, in which case, they would + * not be picking extents from their start. Do a check here, find if the specified + * extent is free, and if it is, then find the containing node. 
+			 */
+			extent_node_t *node = NULL;
+			extent_node_t search_sentinel;
+			search_sentinel.offset = startingBlock;
+
+			node = extent_tree_off_search_prev(&hfsmp->offset_tree, &search_sentinel);
+
+			if (node) {
+				rb_err = extent_tree_offset_alloc_unaligned (&hfsmp->offset_tree, numBlocks, startingBlock);
+			}
+
+			if (ALLOC_DEBUG) {
+				if (rb_err) {
+					printf ("HFS RBTree Allocator: Still Couldn't mark blocks as in-use! %d\n", rb_err);
+				}
+			}
+		}
+		if (ALLOC_DEBUG) {
+			check_rbtree_extents (VCBTOHFS(vcb), startingBlock, numBlocks, ASSERT_ALLOC);
+		}
+	}
+
+	/*
+	 * If we encountered a red-black tree error, for now, we immediately back off and force
+	 * destruction of rb-tree.  Set the persistent error-detected bit in the mount point.
+	 * That will ensure that even if we reach a low-water-mark in the future we will still
+	 * not allow the rb-tree to be used.  On next mount, we will force a re-construction from
+	 * on-disk state.  As a fallback, we will now resort to the bitmap-scanning behavior.
+	 */
+	if (rb_err) {
+		/* Mark RB-Trees with error */
+		hfsmp->extent_tree_flags |= HFS_ALLOC_RB_ERRORED;
+		DestroyTrees(hfsmp);
+		/* Reset the Free Ext Cache since we'll be using it now. */
+		ResetVCBFreeExtCache(hfsmp);
+		printf("HFS: Red-Black Allocator Tree BlockMarkAllocated error\n");
+	}
+
+	return err;
+}
+#endif
+
+
+
+/*
+ * BlockMarkFree
+ *
+ * This is a wrapper function around the internal calls which will actually mark the blocks
+ * as freed.  It will mark the blocks in the red-black tree if appropriate.  We need to do
+ * this logic here to avoid callers having to deal with whether or not the red-black tree
+ * is enabled.
+ *
+ */
+OSErr BlockMarkFree(
+	ExtendedVCB		*vcb,
+	u_int32_t		startingBlock,
+	register u_int32_t	numBlocks)
+{
+	struct hfsmount *hfsmp;
+	hfsmp = VCBTOHFS(vcb);
+#if CONFIG_HFS_ALLOC_RBTREE
+	if (hfs_isrbtree_active(hfsmp)) {
+		int err;
+
+		if ((startingBlock >= hfsmp->offset_block_end) &&
+			(hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS)) {
+			/*
+			 * We're manipulating a portion of the bitmap that is not controlled by the
+			 * red-black tree.  Just update the bitmap and don't bother manipulating the tree
+			 */
+			goto justbitmap;
+		}
+
+		err = BlockMarkFreeRBTree(vcb, startingBlock, numBlocks);
+		if (err == noErr) {
+			if ( ALLOC_DEBUG ) {
+				REQUIRE_FILE_LOCK(hfsmp->hfs_allocation_vp, false);
+				if (hfs_isallocated(hfsmp, startingBlock, numBlocks)) {
+					panic ("HFS RBTree Allocator: Blocks starting @ %x for %x blocks in use!\n",
+						   startingBlock, numBlocks);
+				}
+				check_rbtree_extents (hfsmp, startingBlock, numBlocks, ASSERT_FREE);
+			}
+		}
+		return err;
+	}
justbitmap:
+#endif
+	return BlockMarkFreeInternal(vcb, startingBlock, numBlocks, true);
+
+}
+
+
+/*
+ * BlockMarkFreeUnused
+ *
+ * Scan the bitmap blocks beyond the end of the current file system for bits
+ * that are marked as used.  If any of the bits are marked as used,
+ * this function marks them free.
+ *
+ * Note:  This was specifically written to mark all bits beyond
+ * end of current file system during hfs_extendfs(), which makes
+ * sure that all the new blocks added to the file system are
+ * marked as free.   We expect that all the blocks beyond end of
+ * current file system are always marked as free, but there might
+ * be cases where they are marked as used.  This function assumes that
+ * the number of blocks marked as used incorrectly are relatively
+ * small, otherwise this can overflow journal transaction size
+ * on certain file system configurations (example, large unused
+ * bitmap with relatively small journal).
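+ *
+ * As a worked example (illustrative numbers only): with a 4096-byte
+ * bitmap I/O size, bitsPerBlock = 4096 * 8 = 32768 bits.  Freeing from
+ * allocation block 40000 first rounds up to the next bitmap-block
+ * boundary, lastBit = ((40000 + 32767) / 32768) * 32768 = 65536, so the
+ * unaligned head covers blocks 40000..65535; the remainder is then
+ * checked and freed one full bitmap block (32768 bits) at a time, as
+ * the code below does.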
+ *
+ * Input:
+ * 	startingBlock: First block of the range to mark unused
+ * 	numBlocks: Number of blocks in the range to mark unused
+ *
+ * Returns: zero on success, non-zero on error.
+ */
+OSErr BlockMarkFreeUnused(ExtendedVCB *vcb, u_int32_t startingBlock, register u_int32_t numBlocks)
+{
+	int error = 0;
+	struct hfsmount *hfsmp = VCBTOHFS(vcb);
+	u_int32_t curNumBlocks;
+	u_int32_t  bitsPerBlock;
+	u_int32_t lastBit;
+
+	/* Use the optimal bitmap I/O size instead of bitmap block size */
+	bitsPerBlock  = hfsmp->vcbVBMIOSize * kBitsPerByte;
+
+	/*
+	 * First clear any bits that are not aligned to a full bitmap block
+	 *
+	 * Calculate the first bit in the bitmap block next to
+	 * the bitmap block containing the bit for startingBlock.
+	 * Using this value, we calculate the total number of
+	 * bits to be marked unused from startingBlock to the
+	 * end of bitmap block containing startingBlock.
+	 */
+	lastBit = ((startingBlock + (bitsPerBlock - 1))/bitsPerBlock) * bitsPerBlock;
+	curNumBlocks = lastBit - startingBlock;
+	if (curNumBlocks > numBlocks) {
+		curNumBlocks = numBlocks;
+	}
+	error = BlockMarkFreeInternal(vcb, startingBlock, curNumBlocks, false);
+	if (error) {
+		return error;
+	}
+	startingBlock += curNumBlocks;
+	numBlocks -= curNumBlocks;
+
+	/*
+	 * Check a full bitmap block for any 'used' bit.  If any bit is used,
+	 * mark all the bits only in that bitmap block as free.  This ensures
+	 * that we do not write unmodified bitmap blocks and do not
+	 * overwhelm the journal.
+	 *
+	 * The code starts by checking one full bitmap block at a time, and
+	 * marks the entire bitmap block as free only if some bit in that bitmap
+	 * block is marked as used.  In the end, it handles the last bitmap
+	 * block, which might be partially full, by checking only up to the
+	 * caller-specified last bit and, if any bit is set, marking only that
+	 * range as free.
+	 */
+	while (numBlocks) {
+		if (numBlocks >= bitsPerBlock) {
+			curNumBlocks = bitsPerBlock;
+		} else {
+			curNumBlocks = numBlocks;
+		}
+		if (hfs_isallocated(hfsmp, startingBlock, curNumBlocks) == true) {
+			error = BlockMarkFreeInternal(vcb, startingBlock, curNumBlocks, false);
+			if (error) {
+				return error;
+			}
+		}
+		startingBlock += curNumBlocks;
+		numBlocks -= curNumBlocks;
+	}
+
+	return error;
+}
+
+/*
+_______________________________________________________________________
+
+Routine:	BlockMarkFreeInternal
+
+Function:	Mark a contiguous group of blocks as free (clear in the
+			bitmap).  It assumes those bits are currently marked
+			allocated (set in the bitmap).
+
+Inputs:
+	vcb			Pointer to volume where space is to be freed
+	startingBlock	First block number to mark as freed
+	numBlocks		Number of blocks to mark as freed
+	do_validate 	If true, validate the blocks being deallocated:
+				check that they are within totalBlocks for the
+				current volume and that they were allocated
+				before they are marked free.
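+
+			For example, BlockMarkFreeUnused above passes do_validate = false,
+			because blocks beyond the current end of the file system are
+			expected to be free already and must not trip the corruption
+			check; ordinary deallocations, such as BlockDeallocate, pass true.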
+_______________________________________________________________________ +*/ +static +OSErr BlockMarkFreeInternal( + ExtendedVCB *vcb, + u_int32_t startingBlock_in, + register u_int32_t numBlocks_in, + Boolean do_validate) +{ + OSErr err; + u_int32_t startingBlock = startingBlock_in; + u_int32_t numBlocks = numBlocks_in; + register u_int32_t *currentWord; // Pointer to current word within bitmap block + register u_int32_t wordsLeft; // Number of words left in this bitmap block + register u_int32_t bitMask; // Word with given bits already set (ready to OR in) + u_int32_t firstBit; // Bit index within word of first bit to allocate + u_int32_t numBits; // Number of bits in word to allocate + u_int32_t *buffer = NULL; + uintptr_t blockRef; + u_int32_t bitsPerBlock; + u_int32_t wordsPerBlock; + // XXXdbg + struct hfsmount *hfsmp = VCBTOHFS(vcb); + + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_MARK_FREE_BITMAP | DBG_FUNC_START, startingBlock_in, numBlocks_in, do_validate, 0, 0); + + /* + * NOTE: We use vcb->totalBlocks instead of vcb->allocLimit because we + * need to be able to free blocks being relocated during hfs_truncatefs. + */ + if ((do_validate == true) && + (startingBlock + numBlocks > vcb->totalBlocks)) { + if (ALLOC_DEBUG) { + panic ("BlockMarkFreeInternal() free non-existent blocks at %u (numBlock=%u) on vol %s\n", startingBlock, numBlocks, vcb->vcbVN); + } + + printf ("hfs: BlockMarkFreeInternal() trying to free non-existent blocks starting at %u (numBlock=%u) on volume %s\n", startingBlock, numBlocks, vcb->vcbVN); + hfs_mark_volume_inconsistent(vcb); + err = EIO; + goto Exit; + } + + // + // Pre-read the bitmap block containing the first word of allocation + // + + err = ReadBitmapBlock(vcb, startingBlock, &buffer, &blockRef); + if (err != noErr) goto Exit; + // XXXdbg + if (hfsmp->jnl) { + journal_modify_block_start(hfsmp->jnl, (struct buf *)blockRef); + } + + // + // Initialize currentWord, and wordsLeft. 
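+	// (currentWord will point at the word that holds startingBlock's bit;
+	// wordsLeft counts the words remaining in this cached bitmap block.)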
+ // + { + u_int32_t wordIndexInBlock; bitsPerBlock = vcb->vcbVBMIOSize * kBitsPerByte; wordsPerBlock = vcb->vcbVBMIOSize / kBytesPerWord; @@ -1528,7 +2619,8 @@ OSErr BlockMarkFree( numBits = numBlocks; // entire allocation is inside this one word bitMask &= ~(kAllBitsSetInWord >> (firstBit + numBits)); // turn off bits after last } - if ((*currentWord & SWAP_BE32 (bitMask)) != SWAP_BE32 (bitMask)) { + if ((do_validate == true) && + (*currentWord & SWAP_BE32 (bitMask)) != SWAP_BE32 (bitMask)) { goto Corruption; } *currentWord &= SWAP_BE32 (~bitMask); // clear the bits in the bitmap @@ -1563,7 +2655,8 @@ OSErr BlockMarkFree( currentWord = buffer; wordsLeft = wordsPerBlock; } - if (*currentWord != SWAP_BE32 (kAllBitsSetInWord)) { + if ((do_validate == true) && + (*currentWord != SWAP_BE32 (kAllBitsSetInWord))) { goto Corruption; } *currentWord = 0; // clear the entire word @@ -1599,7 +2692,8 @@ OSErr BlockMarkFree( currentWord = buffer; wordsLeft = wordsPerBlock; } - if ((*currentWord & SWAP_BE32 (bitMask)) != SWAP_BE32 (bitMask)) { + if ((do_validate == true) && + (*currentWord & SWAP_BE32 (bitMask)) != SWAP_BE32 (bitMask)) { goto Corruption; } *currentWord &= SWAP_BE32 (~bitMask); // clear the bits in the bitmap @@ -1612,24 +2706,128 @@ Exit: if (buffer) (void)ReleaseBitmapBlock(vcb, blockRef, true); - if (CONFIG_HFS_TRIM && err == noErr) { + if (err == noErr) { hfs_unmap_free_extent(vcb, startingBlock_in, numBlocks_in); } + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_MARK_FREE_BITMAP | DBG_FUNC_END, err, 0, 0, 0, 0); return err; Corruption: #if DEBUG_BUILD - panic("hfs: BlockMarkFree: blocks not allocated!"); + panic("hfs: BlockMarkFreeInternal: blocks not allocated!"); #else - printf ("hfs: BlockMarkFree() trying to free unallocated blocks on volume %s\n", vcb->vcbVN); + printf ("hfs: BlockMarkFreeInternal() trying to free unallocated blocks (%u,%u) on volume %s\n", startingBlock, numBlocks, vcb->vcbVN); hfs_mark_volume_inconsistent(vcb); err = EIO; goto Exit; #endif } +#if CONFIG_HFS_ALLOC_RBTREE +/* + * This is a wrapper function around BlockMarkFree. This function is + * called when the RB Tree-based allocator needs to mark a block as no longer + * in use. This function should take the locks that would not normally be + * necessary for the normal bitmap deallocator, and then call the function. Once + * the on-disk data structures are updated properly, then this will update an + * existing rb-tree node if possible, or else create a new one. + */ + +OSErr BlockMarkFreeRBTree( + ExtendedVCB *vcb, + u_int32_t startingBlock, + register u_int32_t numBlocks) +{ + OSErr err; + struct hfsmount *hfsmp = VCBTOHFS(vcb); + int rb_err = 0; + + if (ALLOC_DEBUG) { + REQUIRE_FILE_LOCK(vcb->hfs_allocation_vp, false); + if (!hfs_isallocated(hfsmp, startingBlock, numBlocks)) { + panic ("HFS RBTree Allocator: Trying to free blocks starting @ %x for %x but blocks not in use! \n", + startingBlock, numBlocks); + } + check_rbtree_extents (VCBTOHFS(vcb), startingBlock, numBlocks, ASSERT_ALLOC); + } + + err = BlockMarkFreeInternal(vcb, startingBlock, numBlocks, true); + + if (err == noErr) { + + /* + * During a filesystem truncation, we may need to relocate files out of the + * portion of the bitmap that is no longer controlled by the r/b tree. + * In this case, just update the bitmap and do not attempt to manipulate the tree. 
+ */ + if ((startingBlock >= hfsmp->offset_block_end) && + (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS)) { + goto free_error; + } + + extent_node_t *newnode; + + if (ALLOC_DEBUG) { + /* + * Validate that the blocks in question are not allocated in the bitmap, and that they're + * not in the offset tree, since it should be tracking free extents, rather than allocated + * extents + */ + if (hfs_isallocated(hfsmp, startingBlock, numBlocks)) { + panic ("HFS RBTree Allocator: Blocks starting @ %x for %x blocks still marked in-use!\n", + startingBlock, numBlocks); + } + } + + if ((hfsmp->extent_tree_flags & HFS_ALLOC_RB_ACTIVE) == 0) { + if (startingBlock >= hfsmp->offset_block_end) { + /* + * If the tree generation code has not yet finished scanning the + * bitmap region containing this extent, do nothing. If the start + * of the range to be deallocated is greater than the current high + * watermark on the offset tree, just bail out and let the scanner catch up with us. + */ + rb_err = 0; + goto free_error; + } + } + + newnode = extent_tree_free_space(&hfsmp->offset_tree, numBlocks, startingBlock); + if (newnode == NULL) { + rb_err = 1; + goto free_error; + } + + if (ALLOC_DEBUG) { + check_rbtree_extents (VCBTOHFS(vcb), startingBlock, numBlocks, ASSERT_FREE); + } + + } + +free_error: + /* + * We follow the same principle as in BlockMarkAllocatedRB. + * If we encounter an error in adding the extents to the rb-tree, then immediately + * back off, destroy the trees, and persistently set a bit in the runtime hfsmp flags + * to indicate we should not use the rb-tree until next mount, when we can force a rebuild. + */ + if (rb_err) { + /* Mark RB-Trees with error */ + hfsmp->extent_tree_flags |= HFS_ALLOC_RB_ERRORED; + DestroyTrees(hfsmp); + /* Reset the Free Ext Cache since we'll be using it now. */ + ResetVCBFreeExtCache(hfsmp); + printf("HFS: Red-Black Allocator Tree BlockMarkFree error\n"); + } + + + return err; + +} +#endif /* _______________________________________________________________________ @@ -1639,6 +2837,9 @@ Routine: BlockFindContiguous Function: Find a contiguous range of blocks that are free (bits clear in the bitmap). If a contiguous range of the minimum size can't be found, an error will be returned. + This is only needed to support the bitmap-scanning logic, + as the red-black tree should be able to do this by internally + searching its tree. Inputs: vcb Pointer to volume where space is to be allocated @@ -1680,7 +2881,10 @@ static OSErr BlockFindContiguous( register u_int32_t tempWord; uintptr_t blockRef; u_int32_t wordsPerBlock; - u_int32_t j, updated_free_extents = 0, really_add; + u_int32_t updated_free_extent = 0; + + if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_BLOCK_FIND_CONTIG | DBG_FUNC_START, startingBlock, endingBlock, minBlocks, maxBlocks, 0); /* * When we're skipping the metadata zone and the start/end @@ -1916,79 +3120,10 @@ FoundUsed: if (foundBlocks >= minBlocks) break; // Found what we needed! - HFS_MOUNT_LOCK(vcb, TRUE); - if (free_extent_cache_active(vcb) == 0) { - HFS_MOUNT_UNLOCK(vcb, TRUE); - goto skip_cache; - } - HFS_MOUNT_UNLOCK(vcb, TRUE); - - // This free chunk wasn't big enough. Try inserting it into the free extent cache in case - // the allocation wasn't forced contiguous. 
-		really_add = 0;
-		for(j=0; j < vcb->vcbFreeExtCnt; j++) {
-			u_int32_t start, end;
-
-			start = vcb->vcbFreeExt[j].startBlock;
-			end   = start + vcb->vcbFreeExt[j].blockCount;
-
-			if (   (firstBlock >= start && firstBlock < end)
-			    || ((firstBlock + foundBlocks) > start && firstBlock < start)) {
-
-				// there's overlap with an existing entry so do not add this
-				break;
-			}
-
-		}
-
-		if (j >= vcb->vcbFreeExtCnt) {
-			really_add = 1;
-		}
-
-		tempWord = vcb->vcbFreeExtCnt;
-		if (really_add && (vcb->hfs_flags & HFS_HAS_SPARSE_DEVICE)) {
-			// Sorted by starting block
-			if (tempWord == kMaxFreeExtents && vcb->vcbFreeExt[kMaxFreeExtents-1].startBlock > firstBlock)
-				--tempWord;
-			if (tempWord < kMaxFreeExtents)
-			{
-				// We're going to add this extent.  Bubble any smaller extents down in the list.
-				while (tempWord && vcb->vcbFreeExt[tempWord-1].startBlock > firstBlock)
-				{
-					vcb->vcbFreeExt[tempWord] = vcb->vcbFreeExt[tempWord-1];
-					--tempWord;
-				}
-				vcb->vcbFreeExt[tempWord].startBlock = firstBlock;
-				vcb->vcbFreeExt[tempWord].blockCount = foundBlocks;
-
-				if (vcb->vcbFreeExtCnt < kMaxFreeExtents) {
-					++vcb->vcbFreeExtCnt;
-				}
-				updated_free_extents = 1;
-			}
-		} else if (really_add) {
-			// Sorted by blockCount
-			if (tempWord == kMaxFreeExtents && vcb->vcbFreeExt[kMaxFreeExtents-1].blockCount < foundBlocks)
-				--tempWord;
-			if (tempWord < kMaxFreeExtents)
-			{
-				// We're going to add this extent.  Bubble any smaller extents down in the list.
-				while (tempWord && vcb->vcbFreeExt[tempWord-1].blockCount < foundBlocks)
-				{
-					vcb->vcbFreeExt[tempWord] = vcb->vcbFreeExt[tempWord-1];
-					--tempWord;
-				}
-				vcb->vcbFreeExt[tempWord].startBlock = firstBlock;
-				vcb->vcbFreeExt[tempWord].blockCount = foundBlocks;
-
-				if (vcb->vcbFreeExtCnt < kMaxFreeExtents) {
-					++vcb->vcbFreeExtCnt;
-				}
-				updated_free_extents = 1;
-			}
-		}
-skip_cache:
-		sanity_check_free_ext(vcb, 0);
+		/* We did not find the total blocks we were looking for, but
+		 * let's add this free block run to our free extent cache list
+		 */
+		updated_free_extent = add_free_extent_cache(vcb, firstBlock, foundBlocks);
 	} while (currentBlock < stopBlock);
 LoopExit:
@@ -2017,17 +3152,19 @@ ErrorExit:
 		}
 	}
 
-	if (updated_free_extents && (vcb->hfs_flags & HFS_HAS_SPARSE_DEVICE)) {
+	if (updated_free_extent && (vcb->hfs_flags & HFS_HAS_SPARSE_DEVICE)) {
 		int i;
 		u_int32_t min_start = vcb->totalBlocks;
 
 		// set the nextAllocation pointer to the smallest free block number
 		// we've seen so on the next mount we won't rescan unnecessarily
+		lck_spin_lock(&vcb->vcbFreeExtLock);
 		for(i=0; i < (int)vcb->vcbFreeExtCnt; i++) {
 			if (vcb->vcbFreeExt[i].startBlock < min_start) {
 				min_start = vcb->vcbFreeExt[i].startBlock;
 			}
 		}
+		lck_spin_unlock(&vcb->vcbFreeExtLock);
 		if (min_start != vcb->totalBlocks) {
 			if (min_start < vcb->nextAllocation) {
 				vcb->nextAllocation = min_start;
@@ -2041,71 +3178,296 @@ ErrorExit:
 	if (buffer)
 		(void) ReleaseBitmapBlock(vcb, blockRef, false);
 
-	sanity_check_free_ext(vcb, 1);
+	if (hfs_kdebug_allocation & HFSDBG_ALLOC_ENABLED)
+		KERNEL_DEBUG_CONSTANT(HFSDBG_BLOCK_FIND_CONTIG | DBG_FUNC_END, err, *actualStartBlock, *actualNumBlocks, 0, 0);
 
 	return err;
 }
 
+
+#if CONFIG_HFS_ALLOC_RBTREE
 /*
- * Test to see if any blocks in a range are allocated.
+ * Wrapper function around hfs_isrbtree_allocated.  This just takes the start offset,
+ * the number of blocks, and whether or not we should check if the blocks are free.
+ * This function is designed to be used primarily with the debug #ifdef
+ * enabled, so it results in a panic if anything unexpected occurs.
 *
- * The journal or allocation file lock must be held.
+ * shouldBeFree will be nonzero if the caller expects the zone to be free.
  */
-__private_extern__
-int
-hfs_isallocated(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBlocks)
-{
-	u_int32_t  *currentWord;	//	Pointer to current word within bitmap block
-	u_int32_t  wordsLeft;		//	Number of words left in this bitmap block
-	u_int32_t  bitMask;		//	Word with given bits already set (ready to test)
-	u_int32_t  firstBit;		//	Bit index within word of first bit to allocate
-	u_int32_t  numBits;		//	Number of bits in word to allocate
-	u_int32_t  *buffer = NULL;
-	uintptr_t  blockRef;
-	u_int32_t  bitsPerBlock;
-	u_int32_t  wordsPerBlock;
-	int  inuse = 0;
-	int error;
-
-	/*
-	 * Pre-read the bitmap block containing the first word of allocation
-	 */
-	error = ReadBitmapBlock(hfsmp, startingBlock, &buffer, &blockRef);
-	if (error)
-		return (error);
-
-	/*
-	 * Initialize currentWord, and wordsLeft.
-	 */
-	{
-		u_int32_t wordIndexInBlock;
-
-		bitsPerBlock  = hfsmp->vcbVBMIOSize * kBitsPerByte;
-		wordsPerBlock = hfsmp->vcbVBMIOSize / kBytesPerWord;
-
-		wordIndexInBlock = (startingBlock & (bitsPerBlock-1)) / kBitsPerWord;
-		currentWord = buffer + wordIndexInBlock;
-		wordsLeft = wordsPerBlock - wordIndexInBlock;
+void check_rbtree_extents (struct hfsmount *hfsmp, u_int32_t startBlocks,
+		u_int32_t numBlocks, int shouldBeFree) {
+	int alloc;
+	extent_node_t *node1 = NULL;
+	u_int32_t off1 = 0;
+	u_int32_t len1 = 0;
+	alloc = hfs_isrbtree_allocated (hfsmp, startBlocks, numBlocks, &node1);
+
+	if (node1) {
+		off1 = node1->offset;
+		len1 = node1->length;
 	}
 
-	/*
-	 * First test any non word aligned bits.
-	 */
-	firstBit = startingBlock % kBitsPerWord;
-	if (firstBit != 0) {
-		bitMask = kAllBitsSetInWord >> firstBit;
-		numBits = kBitsPerWord - firstBit;
-		if (numBits > numBlocks) {
-			numBits = numBlocks;
-			bitMask &= ~(kAllBitsSetInWord >> (firstBit + numBits));
-		}
-		if ((*currentWord & SWAP_BE32 (bitMask)) != 0) {
-			inuse = 1;
-			goto Exit;
+	if (shouldBeFree) {
+		/*
+		 * If the region should be free, then we expect to see extents in the tree
+		 * matching this start and length.  Alloc != 0 means some portion of the extent
+		 * specified was allocated.
+		 */
+		if (alloc != 0){
+			panic ("HFS check_rbtree_extents: Node (%p) does not exist! "
+				   "node1 off (%d),len(%d), start(%d) end(%d)\n",
+				   node1, off1, len1, startBlocks, numBlocks);
 		}
-		numBlocks -= numBits;
-		++currentWord;
-		--wordsLeft;
+	}
+	else {
+		/*
+		 * Otherwise, this means that the region should be allocated, and if we find
+		 * an extent matching it, that's bad.
+		 */
+		if (alloc == 0){
+			panic ("HFS check_rbtree_extents: Node (%p) exists! "
+				   "node1 off (%d),len(%d), start(%d) end(%d)\n",
+				   node1, off1, len1, startBlocks, numBlocks);
+		}
+	}
+}
+#endif
+
+#if CONFIG_HFS_ALLOC_RBTREE
+/*
+ * Exhaustive validation search.  This function iterates over all allocation blocks and
+ * compares their status in the red-black tree vs. the allocation bitmap.  If the two are out of sync
+ * then it will panic.  Bitmap lock must be held while this function is run.
+ *
+ * Because this function requires a red-black tree search to validate every allocation block, it is
+ * very expensive and should ONLY be run in debug mode, and even then, infrequently.
+ *
+ * 'end' is non-inclusive, so it should represent the total number of blocks in the volume.
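+ *
+ * For example, the debug code in DestroyTrees below invokes this as
+ * hfs_validate_rbtree (hfsmp, 0, hfsmp->offset_block_end), checking
+ * every block up to the tree's current high watermark.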
+ *
+ */
+void
+hfs_validate_rbtree (struct hfsmount *hfsmp, u_int32_t start, u_int32_t end){
+
+	u_int32_t current;
+	extent_node_t* node1;
+
+	hfs_checktreelinks (hfsmp);
+
+	for (current = start; current < end; current++) {
+		node1 = NULL;
+		int rbtree = hfs_isrbtree_allocated(hfsmp, current, 1, &node1);
+		int bitmap = hfs_isallocated(hfsmp, current, 1);
+
+		if (bitmap != rbtree){
+			panic("HFS: Allocator mismatch @ block %d -- bitmap %d : rbtree %d\n",
+					current, bitmap, rbtree);
+		}
+	}
+}
+
+/*
+ * Exhaustive Red-Black Tree Linked List verification routine.
+ *
+ * This function iterates through the red-black tree's nodes, and then verifies that the linked list
+ * embedded within each of the nodes accurately points to the correct node as its "next" pointer.
+ * The bitmap lock must be held while this function is run.
+ */
+
+void
+hfs_checktreelinks (struct hfsmount *hfsmp) {
+	extent_tree_offset_t *tree = &hfsmp->offset_tree;
+
+	extent_node_t *current = NULL;
+	extent_node_t *next = NULL;
+	extent_node_t *treenext;
+
+	current = extent_tree_off_first (tree);
+
+	while (current) {
+		next = current->offset_next;
+		treenext = extent_tree_off_next (tree, current);
+		if (next != treenext) {
+			panic("hfs_checktreelinks: mismatch for node (%p), next: %p, treenext %p!\n", current, next, treenext);
+		}
+		current = treenext;
+	}
+}
+
+#endif
+
+
+#if CONFIG_HFS_ALLOC_RBTREE
+/*
+ * Test to see if any free blocks exist at a given offset.
+ * If there exists a node at the specified offset, it will return the appropriate
+ * node.
+ *
+ * NULL indicates allocated blocks exist at that offset.
+ *
+ * Allocation file lock must be held.
+ *
+ * Returns:
+ *	1 if blocks in the range are allocated.
+ *	0 if all blocks in the range are free.
+ */
+
+static int
+hfs_isrbtree_allocated (struct hfsmount *hfsmp, u_int32_t startBlock,
+		u_int32_t numBlocks, extent_node_t **ret_node) {
+
+	extent_node_t search_sentinel;
+	extent_node_t *node = NULL;
+	extent_node_t *nextnode = NULL;
+
+	/*
+	 * With only one tree, we just have to validate that there are entries
+	 * in the R/B tree at the specified offset if it really is free.
+	 */
+	search_sentinel.offset = startBlock;
+	search_sentinel.length = numBlocks;
+
+	node = extent_tree_off_search_prev(&hfsmp->offset_tree, &search_sentinel);
+	if (node) {
+
+		*ret_node = node;
+		nextnode = extent_tree_off_next (&hfsmp->offset_tree, node);
+		if (nextnode != node->offset_next) {
+			panic ("hfs_isrbtree_allocated: Next pointers out of sync!\n");
+		}
+
+		/*
+		 * Check to see if it is a superset of our target range.  Because we started
+		 * with the offset or some offset prior to it, then we know the node's offset is
+		 * at least <= startBlock.  So, if the end of the node is greater than the end of
+		 * our target range, then the whole range is free.
+		 */
+
+		if ((node->offset + node->length) >= (startBlock + numBlocks)) {
+			if (node->offset > startBlock) {
+				panic ("hfs_isrbtree_allocated: bad node ordering!");
+			}
+			return 0;
+		}
+	}
+	/*
+	 * We got here if either our node search resulted in a node whose extent
+	 * was strictly before our target offset, or we couldn't find a previous node
+	 * at all (the beginning of the volume).  If the former, then we can infer that
+	 * at least one block in the target range is allocated since the next node's offset
+	 * must be greater than startBlock.
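+ *
+ * (Editor's illustration, hypothetical extents: if the tree holds free
+ * extents (offset=100, length=50) and (offset=200, length=25), a query
+ * for startBlock=110, numBlocks=20 finds the node at 100 and, since
+ * 100 + 50 >= 110 + 20, returns 0: the range is entirely free.  A query
+ * for startBlock=160, numBlocks=20 finds no superset node and falls
+ * through to the cases described here.)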
+ * + * Either way, this means that the target node is unavailable to allocate, so + * just return 1; + */ + return 1; +} + + +#endif + +/* + * Count number of bits set in the given 32-bit unsigned number + * + * Returns: + * Number of bits set + */ +static int num_bits_set(u_int32_t num) +{ + int count; + + for (count = 0; num; count++) { + num &= num - 1; + } + + return count; +} + +/* + * For a given range of blocks, find the total number of blocks + * allocated. If 'stop_on_first' is true, it stops as soon as it + * encounters the first allocated block. This option is useful + * to determine if any block is allocated or not. + * + * Inputs: + * startingBlock First allocation block number of the range to be scanned. + * numBlocks Total number of blocks that need to be scanned. + * stop_on_first Stop the search after the first allocated block is found. + * + * Output: + * allocCount Total number of allocation blocks allocated in the given range. + * + * On error, it is the number of allocated blocks found + * before the function got an error. + * + * If 'stop_on_first' is set, + * allocCount = 1 if any allocated block was found. + * allocCount = 0 if no allocated block was found. + * + * Returns: + * 0 on success, non-zero on failure. + */ +static int +hfs_isallocated_internal(struct hfsmount *hfsmp, u_int32_t startingBlock, + u_int32_t numBlocks, Boolean stop_on_first, u_int32_t *allocCount) +{ + u_int32_t *currentWord; // Pointer to current word within bitmap block + u_int32_t wordsLeft; // Number of words left in this bitmap block + u_int32_t bitMask; // Word with given bits already set (ready to test) + u_int32_t firstBit; // Bit index within word of first bit to allocate + u_int32_t numBits; // Number of bits in word to allocate + u_int32_t *buffer = NULL; + uintptr_t blockRef; + u_int32_t bitsPerBlock; + u_int32_t wordsPerBlock; + u_int32_t blockCount = 0; + int error; + + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_IS_ALLOCATED | DBG_FUNC_START, startingBlock, numBlocks, stop_on_first, 0, 0); + + /* + * Pre-read the bitmap block containing the first word of allocation + */ + error = ReadBitmapBlock(hfsmp, startingBlock, &buffer, &blockRef); + if (error) + goto JustReturn; + + /* + * Initialize currentWord, and wordsLeft. + */ + { + u_int32_t wordIndexInBlock; + + bitsPerBlock = hfsmp->vcbVBMIOSize * kBitsPerByte; + wordsPerBlock = hfsmp->vcbVBMIOSize / kBytesPerWord; + + wordIndexInBlock = (startingBlock & (bitsPerBlock-1)) / kBitsPerWord; + currentWord = buffer + wordIndexInBlock; + wordsLeft = wordsPerBlock - wordIndexInBlock; + } + + /* + * First test any non word aligned bits. 
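+	 *
+	 * (Editor's illustration: for startingBlock % 32 == 5 and numBlocks = 10,
+	 * firstBit = 5 and bitMask starts as 0xFFFFFFFF >> 5 = 0x07FFFFFF; the
+	 * clamp below then clears everything past the last bit of interest,
+	 * bitMask &= ~(0xFFFFFFFF >> 15), leaving 0x07FE0000 -- exactly ten
+	 * bits, positions 5..14 counting from the most significant bit.)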
+ */ + firstBit = startingBlock % kBitsPerWord; + if (firstBit != 0) { + bitMask = kAllBitsSetInWord >> firstBit; + numBits = kBitsPerWord - firstBit; + if (numBits > numBlocks) { + numBits = numBlocks; + bitMask &= ~(kAllBitsSetInWord >> (firstBit + numBits)); + } + if ((*currentWord & SWAP_BE32 (bitMask)) != 0) { + if (stop_on_first) { + blockCount = 1; + goto Exit; + } + blockCount += num_bits_set(*currentWord & SWAP_BE32 (bitMask)); + } + numBlocks -= numBits; + ++currentWord; + --wordsLeft; } /* @@ -2128,8 +3490,11 @@ hfs_isallocated(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBl wordsLeft = wordsPerBlock; } if (*currentWord != 0) { - inuse = 1; - goto Exit; + if (stop_on_first) { + blockCount = 1; + goto Exit; + } + blockCount += num_bits_set(*currentWord); } numBlocks -= kBitsPerWord; ++currentWord; @@ -2156,54 +3521,922 @@ hfs_isallocated(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBl wordsLeft = wordsPerBlock; } if ((*currentWord & SWAP_BE32 (bitMask)) != 0) { - inuse = 1; - goto Exit; + if (stop_on_first) { + blockCount = 1; + goto Exit; + } + blockCount += num_bits_set(*currentWord & SWAP_BE32 (bitMask)); } } Exit: if (buffer) { (void)ReleaseBitmapBlock(hfsmp, blockRef, false); } + if (allocCount) { + *allocCount = blockCount; + } + +JustReturn: + if (hfs_kdebug_allocation & HFSDBG_BITMAP_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_IS_ALLOCATED | DBG_FUNC_END, error, 0, blockCount, 0, 0); + + return (error); +} + +/* + * Count total number of blocks that are allocated in the given + * range from the bitmap. This is used to preflight total blocks + * that need to be relocated during volume resize. + * + * The journal or allocation file lock must be held. + * + * Returns: + * 0 on success, non-zero on failure. + * On failure, allocCount is zero. + */ +int +hfs_count_allocated(struct hfsmount *hfsmp, u_int32_t startBlock, + u_int32_t numBlocks, u_int32_t *allocCount) +{ + return hfs_isallocated_internal(hfsmp, startBlock, numBlocks, false, allocCount); +} + +/* + * Test to see if any blocks in a range are allocated. + * + * Note: On error, this function returns 1, which means that + * one or more blocks in the range are allocated. This function + * is primarily used for volume resize and we do not want + * to report to the caller that the blocks are free when we + * were not able to deterministically find it out. So on error, + * we always report that the blocks are allocated. + * + * The journal or allocation file lock must be held. + * + * Returns + * 0 if all blocks in the range are free. + * 1 if blocks in the range are allocated, or there was an error. + */ +int +hfs_isallocated(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t numBlocks) +{ + int error; + u_int32_t allocCount; + + error = hfs_isallocated_internal(hfsmp, startingBlock, numBlocks, true, &allocCount); + if (error) { + /* On error, we always say that the blocks are allocated + * so that volume resize does not return false success. + */ + return 1; + } else { + /* The function was deterministically able to find out + * if there was any block allocated or not. In that case, + * the value in allocCount is good enough to be returned + * back to the caller. + */ + return allocCount; + } +} + +/* + * Check to see if the red-black tree is live. Allocation file lock must be held + * shared or exclusive to call this function. Note that we may call this even if + * HFS is built without activating the red-black tree code. 
+ */
+__private_extern__
+int
+hfs_isrbtree_active(struct hfsmount *hfsmp){
+
+	//TODO: Update this function to deal with a truncate/resize coming in when the tree
+	//isn't fully finished.  Maybe we need to check the flags for something other than ENABLED?
+
+#if CONFIG_HFS_ALLOC_RBTREE
+	if (ALLOC_DEBUG) {
+		REQUIRE_FILE_LOCK(hfsmp->hfs_allocation_vp, false);
+	}
+	if (hfsmp){
+
+		if (hfsmp->extent_tree_flags & HFS_ALLOC_RB_ENABLED) {
+			return 1;
+		}
+	}
+#else
+	#pragma unused (hfsmp)
+#endif
+	/* If the RB Tree code is not enabled, then just always return 0 */
+	return 0;
+}
+
+#if CONFIG_HFS_ALLOC_RBTREE
+/*
+ * This function is basically the same as hfs_isallocated, except it's designed for
+ * use with the red-black tree validation code.  It assumes we're only checking whether
+ * one bit is active, and that we're going to pass in the buf to use, since GenerateTree
+ * calls ReadBitmapBlock and will have that buf locked down for the duration of its operation.
+ *
+ * This should not be called in general purpose scanning code.
+ */
+int hfs_isallocated_scan(struct hfsmount *hfsmp, u_int32_t startingBlock, u_int32_t *bp_buf) {
+
+	u_int32_t  *currentWord;	// Pointer to current word within bitmap block
+	u_int32_t  bitMask;		// Word with given bits already set (ready to test)
+	u_int32_t  firstBit;		// Bit index within word of first bit to allocate
+	u_int32_t  numBits;		// Number of bits in word to allocate
+	u_int32_t  bitsPerBlock;
+	uintptr_t  blockRef;
+	u_int32_t  wordsPerBlock;
+	u_int32_t  numBlocks = 1;
+	u_int32_t  *buffer = NULL;
+
+	int  inuse = 0;
+	int error;
+
+
+	if (bp_buf) {
+		/* just use passed-in buffer if avail. */
+		buffer = bp_buf;
+	}
+	else {
+		/*
+		 * Pre-read the bitmap block containing the first word of allocation
+		 */
+		error = ReadBitmapBlock(hfsmp, startingBlock, &buffer, &blockRef);
+		if (error)
+			return (error);
+	}
+
+	/*
+	 * Initialize currentWord, and wordsLeft.
+	 */
+	u_int32_t wordIndexInBlock;
+
+	bitsPerBlock  = hfsmp->vcbVBMIOSize * kBitsPerByte;
+	wordsPerBlock = hfsmp->vcbVBMIOSize / kBytesPerWord;
+
+	wordIndexInBlock = (startingBlock & (bitsPerBlock-1)) / kBitsPerWord;
+	currentWord = buffer + wordIndexInBlock;
+
+	/*
+	 * First test any non word aligned bits.
+	 */
+	firstBit = startingBlock % kBitsPerWord;
+	bitMask = kAllBitsSetInWord >> firstBit;
+	numBits = kBitsPerWord - firstBit;
+	if (numBits > numBlocks) {
+		numBits = numBlocks;
+		bitMask &= ~(kAllBitsSetInWord >> (firstBit + numBits));
+	}
+	if ((*currentWord & SWAP_BE32 (bitMask)) != 0) {
+		inuse = 1;
+		goto Exit;
+	}
+	numBlocks -= numBits;
+	++currentWord;
+
+Exit:
+	if(bp_buf == NULL) {
+		if (buffer) {
+			(void)ReleaseBitmapBlock(hfsmp, blockRef, false);
+		}
+	}
+	return (inuse);
+}
+
+/*
+ * This function scans the specified block and adds it to the pair of trees specified
+ * in its arguments.  We break this behavior out of GenerateTree so that an allocating
+ * thread can invoke this if the tree does not have enough extents to satisfy
+ * an allocation request.
+ *
+ * startbit - the allocation block represented by a bit in 'allocblock' where we need to
+ *	start our scan.  For instance, we may need to start the normal allocation scan
+ *	in the middle of an existing allocation block.
+ * endBit - the allocation block where we should end this search (exclusive: the scan
+ *	stops as soon as the current block reaches endBit, so block endBit itself is
+ *	not examined).
+ * bitToScan - output argument for this function to specify the next bit to scan.
+ *
+ * Returns:
+ *	0 on success
+ *	nonzero on failure.
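+ *
+ * Sketch of the calling convention (editor's illustration; GenerateTree
+ * below is the real caller, and region_start/region_end are hypothetical
+ * bounds):
+ *
+ *	u_int32_t next = region_start;
+ *	while (next < region_end) {
+ *		if (hfs_alloc_scan_block (hfsmp, next, region_end, &next))
+ *			break;	// I/O error reading the bitmap
+ *	}
+ *
+ * Each call consumes at most one bitmap block and reports where the next
+ * call should resume via bitToScan.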
+ */ + +static int hfs_alloc_scan_block(struct hfsmount *hfsmp, u_int32_t startbit, + u_int32_t endBit, u_int32_t *bitToScan) { + + int error; + u_int32_t curAllocBlock; + struct buf *blockRef = NULL; + u_int32_t *buffer = NULL; + u_int32_t wordIndexInBlock; + u_int32_t blockSize = (u_int32_t)hfsmp->vcbVBMIOSize; + u_int32_t wordsPerBlock = blockSize / kBytesPerWord; + u_int32_t offset = 0; + u_int32_t size = 0; + + /* + * Read the appropriate block from the bitmap file. ReadBitmapBlock + * figures out which actual on-disk block corresponds to the bit we're + * looking at. + */ + error = ReadBitmapBlock(hfsmp, startbit, &buffer, (uintptr_t*)&blockRef); + if (error) { + return error; + } + + /* curAllocBlock represents the logical block we're analyzing. */ + curAllocBlock = startbit; + + /* Figure out which word curAllocBlock corresponds to in the block we read */ + wordIndexInBlock = (curAllocBlock / kBitsPerWord) % wordsPerBlock; + + /* Scan a word at a time */ + while (wordIndexInBlock < wordsPerBlock) { + u_int32_t currentWord = SWAP_BE32(buffer[wordIndexInBlock]); + u_int32_t curBit; + + /* modulate curBit because it may start in the middle of a word */ + for (curBit = curAllocBlock % kBitsPerWord; curBit < kBitsPerWord; curBit++) { + + u_int32_t is_allocated = currentWord & (1 << (kBitsWithinWordMask - curBit)); + if (ALLOC_DEBUG) { + u_int32_t res = hfs_isallocated_scan (hfsmp, curAllocBlock, buffer); + if ( ((res) && (!is_allocated)) || ((!res) && (is_allocated))) { + panic("hfs_alloc_scan: curAllocBit %u, curBit (%d), word (0x%x), is_allocated (0x%x) res(0x%x) \n", + curAllocBlock, curBit, currentWord, is_allocated, res); + } + } + /* + * If curBit is not allocated, keep track of the start of the free range. + * Increment a running tally on how many free blocks in a row we've seen. + */ + if (!is_allocated) { + size++; + if (offset == 0) { + offset = curAllocBlock; + } + } + else { + /* + * If we hit an allocated block, insert the extent that tracked the range + * we saw, and reset our tally counter. + */ + if (size != 0) { + extent_tree_free_space(&hfsmp->offset_tree, size, offset); + size = 0; + offset = 0; + } + } + curAllocBlock++; + /* + * Exit early if the next bit we'd analyze would take us beyond the end of the + * range that we're supposed to scan. + */ + if (curAllocBlock >= endBit) { + goto DoneScanning; + } + } + wordIndexInBlock++; + } +DoneScanning: + + /* We may have been tracking a range of free blocks that hasn't been inserted yet. */ + if (size != 0) { + extent_tree_free_space(&hfsmp->offset_tree, size, offset); + } + /* + * curAllocBlock represents the next block we need to scan while we're in this + * function. + */ + *bitToScan = curAllocBlock; + + ReleaseRBScanBitmapBlock(blockRef); + + return 0; +} + +/* + * Extern function that is called from mount and upgrade mount routines + * that enable us to initialize the tree. + */ + +__private_extern__ +u_int32_t InitTree(struct hfsmount *hfsmp) { + extent_tree_init (&(hfsmp->offset_tree)); + return 0; +} + + +/* + * This function builds the trees specified in its arguments. It uses + * buf_meta_breads to scan through the bitmap and re-build the tree state. + * It is very important to use buf_meta_bread because we need to ensure that we + * read the most current version of the blocks that we're scanning. If we used + * cluster_io, then journaled transactions could still be sitting in RAM since they are + * written to disk in the proper location asynchronously. 
+ *
+ * Because this could still be running when mount has finished, we need to check
+ * after every allocation block that we're working on if an unmount or some other
+ * operation that would cause us to tear down has come in (think downgrade mount).
+ * If an unmount has come in, then abort whatever we're doing and return -1
+ * to indicate we hit an error.  If we don't do this, we'd hold up unmount for
+ * a very long time.
+ *
+ * This function assumes that the bitmap lock is acquired exclusively before being
+ * called.  It will drop the lock and then re-acquire it during operation, but
+ * will always return with the lock held.
+ */
+__private_extern__
+u_int32_t GenerateTree(struct hfsmount *hfsmp, u_int32_t endBlock, int *flags, int initialscan) {
+
+	REQUIRE_FILE_LOCK(hfsmp->hfs_allocation_vp, false);
+
+	u_int32_t *cur_block_eof;
+	int error = 0;
+
+	int USE_FINE_GRAINED_LOCKING = 0;
+
+	/* Initialize the block counter while we hold the bitmap lock */
+	cur_block_eof = &hfsmp->offset_block_end;
+
+	/*
+	 * This loop advances over all allocation bitmap blocks of the current region
+	 * to scan them and add the results into the red-black tree.  We use the mount point
+	 * variable offset_block_end as our loop counter.  This gives us flexibility
+	 * because we can release the allocation bitmap lock and allow a thread that wants
+	 * to make an allocation to grab the lock and do some scanning on our behalf while we're
+	 * waiting to re-acquire the lock.  Then, the allocating thread will only do as much bitmap
+	 * scanning as needed to fulfill its allocation.
+	 *
+	 * If the other thread does IO for us, then it will update the offset_block_end
+	 * variable as well, since it will use the same hfs_alloc_scan_block function to do its bit
+	 * scanning.  So when we re-grab the lock, our current EOF/loop will immediately skip us to the next
+	 * block that needs scanning.
+	 */
+
+	while (*cur_block_eof < endBlock) {
+
+		/*
+		 * If the filesystem is being resized before the bitmap has been fully scanned, we'll
+		 * update our endBlock to match the current allocation limit in the hfsmp struct.
+		 * The allocLimit field would only be updated while holding the bitmap lock, so we won't
+		 * be executing this code at the same time that the resize is going on.
+		 */
+		if ((initialscan) && (endBlock != hfsmp->allocLimit)) {
+
+			/* If we're past the new/modified allocLimit, then just stop immediately.*/
+			if (*cur_block_eof >= hfsmp->allocLimit ) {
+				break;
+			}
+			endBlock = hfsmp->allocLimit;
+		}
+
+		/*
+		 * TODO: fix unmount stuff!
+		 * See rdar://7391404
+		 *
+		 * Once the RB allocator is checked in, we'll want to augment it to not hold the
+		 * allocation bitmap lock for the entire duration of the tree scan.  For a first check-in
+		 * it's ok to do that but we can't leave it like that forever.
+		 *
+		 * The gist of the new algorithm will work as follows:
+		 * if an unmount is in flight and has been detected:
+		 *	abort tree-build.
+		 *	unset tree-in-progress bit.
+		 *	wakeup unmount thread
+		 *	unlock allocation bitmap lock, fail out.
+		 *
+		 * The corresponding code in the unmount side should already be in place.
+		 */
+
+		error = hfs_alloc_scan_block (hfsmp, *cur_block_eof, endBlock, cur_block_eof);
+
+		//TODO: Fix this below!
+		if (USE_FINE_GRAINED_LOCKING){
+			hfs_systemfile_unlock(hfsmp, *flags);
+			*flags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
+		}
+		//TODO: Infer that if *flags == 0, we don't actually need to lock/unlock.
+	}
+
+	return error;
 }
 
-/* Invalidate free extent cache for a given volume.
- * This cache is invalidated and disabled when a volume is being resized - * (via hfs_trucatefs() or hfs_extendefs()). +/* + * This function destroys the specified rb-trees associated with the mount point. + */ +__private_extern__ +void DestroyTrees(struct hfsmount *hfsmp) { + + if (ALLOC_DEBUG) { + REQUIRE_FILE_LOCK(hfsmp->hfs_allocation_vp, false); + printf("DestroyTrees: Validating red/black tree for vol %s\n", (char*) hfsmp->vcbVN); + hfs_validate_rbtree (hfsmp, 0, hfsmp->offset_block_end ); + } + + /* + * extent_tree_destroy will start with the first entry in the tree (by offset), then + * iterate through the tree quickly using its embedded linked list. This results in tree + * destruction in O(n) time. + */ + + if (hfsmp->extent_tree_flags & HFS_ALLOC_RB_ENABLED) { + extent_tree_destroy(&hfsmp->offset_tree); + + /* Mark Trees as disabled */ + hfsmp->extent_tree_flags &= ~HFS_ALLOC_RB_ENABLED; + } + + return; +} + +#endif + +/* + * This function resets all of the data structures relevant to the + * free extent cache stored in the hfsmount struct. + * + * If we are using the red-black tree code then we need to account for the fact that + * we may encounter situations where we need to jettison the tree. If that is the + * case, then we fail-over to the bitmap scanning logic, but we need to ensure that + * the free ext cache is zeroed before we start using it. * - * Returns: Nothing + * We also reset and disable the cache when allocLimit is updated... which + * is when a volume is being resized (via hfs_truncatefs() or hfs_extendfs()). + * It is independent of the type of allocator being used currently. */ -void invalidate_free_extent_cache(ExtendedVCB *vcb) +void ResetVCBFreeExtCache(struct hfsmount *hfsmp) { - u_int32_t i; + int bytes; + void *freeExt; - HFS_MOUNT_LOCK(vcb, TRUE); - for (i = 0; i < vcb->vcbFreeExtCnt; i++) { - vcb->vcbFreeExt[i].startBlock = 0; - vcb->vcbFreeExt[i].blockCount = 0; + if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_RESET_EXTENT_CACHE | DBG_FUNC_START, 0, 0, 0, 0, 0); + + lck_spin_lock(&hfsmp->vcbFreeExtLock); + + /* reset Free Extent Count */ + hfsmp->vcbFreeExtCnt = 0; + + /* reset the actual array */ + bytes = kMaxFreeExtents * sizeof(HFSPlusExtentDescriptor); + freeExt = (void*)(hfsmp->vcbFreeExt); + + bzero (freeExt, bytes); + + lck_spin_unlock(&hfsmp->vcbFreeExtLock); + + if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_RESET_EXTENT_CACHE | DBG_FUNC_END, 0, 0, 0, 0, 0); + + return; +} + +/* + * This function is used to inform the allocator if we have to effectively shrink + * or grow the total number of allocation blocks via hfs_truncatefs or hfs_extendfs. + * + * The bitmap lock must be held when calling this function. This function also modifies the + * allocLimit field in the hfs mount point structure in the general case. + * + * In the shrinking case, we'll have to remove all free extents from the red-black + * tree past the specified offset new_end_block. In the growth case, we'll have to force + * a re-scan of the new allocation blocks from our current allocLimit to the new end block. + * + * new_end_block represents the total number of blocks available for allocation in the resized + * filesystem. Block #new_end_block should not be allocatable in the resized filesystem since it + * will be out of the (0, n-1) range that are indexable in the bitmap. 
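+ *
+ * Editor's illustration of the shrink case (hypothetical extents): with
+ * free extents (900,50), (980,40) and (1100,10) in the offset tree, a
+ * call with new_end_block = 1000 leaves (900,50) untouched, clips the
+ * straddling (980,40) down to (980,20), and removes (1100,10) outright.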
+ *
+ * Returns	0 on success
+ *		errno on failure
+ */
+__private_extern__
+u_int32_t UpdateAllocLimit (struct hfsmount *hfsmp, u_int32_t new_end_block) {
+
+	/*
+	 * Update allocLimit to the argument specified, but don't do anything else
+	 * if the red/black tree is not enabled.
+	 */
+	hfsmp->allocLimit = new_end_block;
+
+	/* Invalidate the free extent cache completely so that
+	 * it does not have any extents beyond end of current
+	 * volume.
+	 */
+	ResetVCBFreeExtCache(hfsmp);
+
+#if CONFIG_HFS_ALLOC_RBTREE
+	/* Shrinking the existing filesystem */
+	if ((new_end_block < hfsmp->offset_block_end) &&
+		(hfsmp->extent_tree_flags & HFS_ALLOC_RB_ACTIVE)) {
+		extent_node_t search_sentinel;
+		extent_node_t *node = NULL;
+		/* Remover points to the current item to free/remove from the tree */
+		extent_node_t *remover = NULL;
+
+		/* Begin search at the specified offset */
+		memset (&search_sentinel, 0, sizeof(extent_node_t));
+		search_sentinel.offset = new_end_block;
+
+		/*
+		 * Find the first available extent that satisfies the allocation by searching
+		 * from the starting point or 1 earlier.  We may need to split apart an existing node
+		 * if it straddles the new alloc limit.
+		 */
+		node = extent_tree_off_search_prev(&hfsmp->offset_tree, &search_sentinel);
+		if (node) {
+			/* If it's an exact match, then just remove them all from this point forward */
+			if (node->offset == new_end_block) {
+				/*
+				 * Find the previous entry and update its next pointer to NULL
+				 * since this entry is biting the dust.  Update remover to node.
+				 */
+				extent_node_t *prev = NULL;
+				prev = extent_tree_off_prev (&hfsmp->offset_tree, node);
+				if (prev) {
+					prev->offset_next = NULL;
+				}
+				remover = node;
+			}
+			else {
+				/* See if we need to split this node */
+				if ((node->offset + node->length) > new_end_block) {
+					/*
+					 * Update node to reflect its new size up until new_end_block.
+					 */
+					remover = node->offset_next;
+					node->length = new_end_block - node->offset;
+					/* node is becoming the last free extent in the volume. */
+					node->offset_next = NULL;
+				}
+				else {
+					if (node->offset_next == NULL) {
+						/*
+						 * 'node' points to the last free extent in the volume.
+						 * Coincidentally, it is also before the new cut-off point at which
+						 * we will stop representing bitmap values in the tree.  Just bail out now.
+						 */
+						return 0;
+					}
+					/*
+					 * Otherwise, point our temp variable 'remover' to the node where
+					 * we'll need to start yanking things out of the tree, and make 'node'
+					 * the last element in the tree in the linked list.
+					 */
+					remover = node->offset_next;
+					if (remover->offset <= new_end_block) {
+						panic ("UpdateAllocLimit: Invalid RBTree node next ptr!");
+					}
+					node->offset_next = NULL;
+				}
+			}
+
+			/*
+			 * Remover is our "temp" pointer that points to the current node to remove from
+			 * the offset tree.  We'll simply iterate through the tree linked list, removing the current
+			 * element from the tree, freeing them as we come across them.
+			 */
+			while (remover) {
+				extent_node_t *next = remover->offset_next;
+				extent_tree_remove_node (&hfsmp->offset_tree, remover);
+				free_node (remover);
+				remover = next;
+			}
+
+			if (ALLOC_DEBUG) {
+				printf ("UpdateAllocLimit: Validating rbtree after truncation\n");
+				hfs_validate_rbtree (hfsmp, 0, new_end_block-1);
+			}
+
+			/*
+			 * Don't forget to shrink offset_block_end after a successful truncation.
+			 * new_end_block should represent the number of blocks available on the
+			 * truncated volume.
+ */ + + hfsmp->offset_block_end = new_end_block; + + return 0; + } + else { + if (ALLOC_DEBUG) { + panic ("UpdateAllocLimit: no prev!"); + } + return ENOSPC; + } } - vcb->vcbFreeExtCnt = 0; - HFS_MOUNT_UNLOCK(vcb, TRUE); + /* Growing the existing filesystem */ + else if ((new_end_block > hfsmp->offset_block_end) && + (hfsmp->extent_tree_flags & HFS_ALLOC_RB_ACTIVE)) { + int flags = 0; + int retval = 0; + + if (ALLOC_DEBUG) { + printf ("UpdateAllocLimit: Validating rbtree prior to growth\n"); + hfs_validate_rbtree (hfsmp, 0, hfsmp->offset_block_end); + } + + + retval = GenerateTree (hfsmp, new_end_block, &flags, 0); + + /* + * Don't forget to update offset_block_end after a successful tree extension. + */ + if (retval == 0) { + + if (ALLOC_DEBUG) { + printf ("UpdateAllocLimit: Validating rbtree after growth\n"); + hfs_validate_rbtree (hfsmp, 0, new_end_block); + } + + hfsmp->offset_block_end = new_end_block; + } + + return retval; + } + /* Otherwise, do nothing. fall through to the code below. */ + printf ("error : off_block_end: %d, alloclimit: %d, new_end_block: %d\n", + hfsmp->offset_block_end,hfsmp->allocLimit, new_end_block); +#endif + + return 0; + +} + + +/* + * Remove an entry from free extent cache after it has been allocated. + * + * This function does not split extents to remove them from the allocated list. + * + * Inputs: + * hfsmp - mount point structure + * startBlock - starting block of the extent to be removed. + * blockCount - number of blocks of the extent to be removed. + */ +static void remove_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount) +{ + int i, j; + int extentsRemoved = 0; + u_int32_t start, end; + +#if CONFIG_HFS_ALLOC_RBTREE + /* If red-black tree is enabled, no free extent cache is necessary */ + if (hfs_isrbtree_active(hfsmp) == true) { + return; + } +#endif + + if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_REMOVE_EXTENT_CACHE | DBG_FUNC_START, startBlock, blockCount, 0, 0, 0); + + lck_spin_lock(&hfsmp->vcbFreeExtLock); + + for (i = 0; i < (int)hfsmp->vcbFreeExtCnt; i++) { + start = hfsmp->vcbFreeExt[i].startBlock; + end = start + hfsmp->vcbFreeExt[i].blockCount; + + /* If the extent to remove from free extent list starts within + * this free extent, or, if it starts before this free extent + * but ends in this free extent, remove it by shifting all other + * extents. + */ + if (((startBlock >= start) && (startBlock < end)) || + ((startBlock < start) && (startBlock + blockCount) > start)) { + for (j = i; j < (int)hfsmp->vcbFreeExtCnt - 1; j++) { + hfsmp->vcbFreeExt[j] = hfsmp->vcbFreeExt[j+1]; + } + hfsmp->vcbFreeExtCnt--; + /* Decrement the index so that we check the extent + * that just got shifted to the current index. + */ + i--; + extentsRemoved++; + } + /* Continue looping as we might have to invalidate multiple extents, + * probably not possible in normal case, but does not hurt. + */ + } + + lck_spin_unlock(&hfsmp->vcbFreeExtLock); + + sanity_check_free_ext(hfsmp, 0); + + if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED) + KERNEL_DEBUG_CONSTANT(HFSDBG_REMOVE_EXTENT_CACHE | DBG_FUNC_END, 0, 0, 0, extentsRemoved, 0); return; } -/* Check whether free extent cache is active or not. - * This cache is invalidated and disabled when a volume is being resized - * (via hfs_trucatefs() or hfs_extendefs()). +/* + * Add an entry to free extent cache after it has been deallocated. * - * This function assumes that the caller is holding the lock on - * the mount point. 
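+ * (Editor's note: like remove_free_extent_cache above, this function
+ * skips extents that overlap an existing cache entry.  An extent
+ * (startBlock, blockCount) overlaps an entry spanning [start, end) when
+ * either startBlock falls inside [start, end), or startBlock precedes
+ * start but startBlock + blockCount extends past it; for example, a
+ * hypothetical (95, 10) overlaps a cached (100, 50) even though their
+ * start blocks differ.)
+ *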
+ * If the extent provided has blocks beyond current allocLimit, it
+ * is clipped to allocLimit.  This function does not merge contiguous
+ * extents if they already exist in the list.
+ *
+ * Inputs:
+ *	hfsmp		- mount point structure
+ *	startBlock	- starting block of the extent to be added.
+ *	blockCount	- number of blocks of the extent to be added.
+ *
+ * Returns:
+ *	true	- if the extent was added successfully to the list
+ *	false	- if the extent was not added to the list, maybe because
+ *		  the extent was beyond allocLimit, or is not the best
+ *		  candidate to be put in the cache.
+ */
+static Boolean add_free_extent_cache(struct hfsmount *hfsmp, u_int32_t startBlock, u_int32_t blockCount)
+{
+	Boolean retval = false;
+	u_int32_t start, end;
+	int i;
+
+	if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED)
+		KERNEL_DEBUG_CONSTANT(HFSDBG_ADD_EXTENT_CACHE | DBG_FUNC_START, startBlock, blockCount, 0, 0, 0);
+
+	/*
+	 * If using the red-black tree allocator, then there's no need to special case
+	 * for the sparse device case.  We'll simply add the region we've recently freed
+	 * to the red-black tree, where it will get sorted by offset and length.  The only special
+	 * casing will need to be done on the allocation side, where we may favor free extents
+	 * based on offset even if it will cause fragmentation.  This may be true, for example, if
+	 * we are trying to reduce the number of bandfiles created in a sparse bundle disk image.
+	 */
+#if CONFIG_HFS_ALLOC_RBTREE
+	if (hfs_isrbtree_active(hfsmp) == true) {
+		goto out_not_locked;
+	}
+#endif
+
+	/* No need to add extent that is beyond current allocLimit */
+	if (startBlock >= hfsmp->allocLimit) {
+		goto out_not_locked;
+	}
+
+	/* If end of the free extent is beyond current allocLimit, clip the extent */
+	if ((startBlock + blockCount) > hfsmp->allocLimit) {
+		blockCount = hfsmp->allocLimit - startBlock;
+	}
+
+	lck_spin_lock(&hfsmp->vcbFreeExtLock);
+
+	/* If the free extent cache is full and the new extent fails to
+	 * compare with the last extent, skip adding it to the list.
+	 */
+	if (hfsmp->vcbFreeExtCnt == kMaxFreeExtents) {
+		if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
+			/* For sparse disks, free extent cache list is sorted by start block, lowest first */
+			if (startBlock > hfsmp->vcbFreeExt[kMaxFreeExtents-1].startBlock) {
+				goto out;
+			}
+		} else {
+			/* For normal mounts, free extent cache list is sorted by total blocks, highest first */
+			if (blockCount <= hfsmp->vcbFreeExt[kMaxFreeExtents-1].blockCount) {
+				goto out;
+			}
+		}
+	}
+
+	/* Check if the current extent overlaps with any of the existing
+	 * extents.  If yes, just skip adding it to the list.  We have
+	 * to do this check before shifting the extent records.
+	 */
+	for (i = 0; i < (int)hfsmp->vcbFreeExtCnt; i++) {
+
+		start = hfsmp->vcbFreeExt[i].startBlock;
+		end = start + hfsmp->vcbFreeExt[i].blockCount;
+
+		if (((startBlock >= start) && (startBlock < end)) ||
+		    ((startBlock < start) && (startBlock + blockCount) > start)) {
+			goto out;
+		}
+	}
+
+	/* Scan the free extent cache array from tail to head till
+	 * we find the entry after which our new entry should be
+	 * inserted.  After we break out of this loop, the new entry
+	 * will be inserted at 'i+1'.
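+	 *
+	 * Editor's illustration (hypothetical, non-sparse mount, sorted by
+	 * blockCount descending): with cached counts {100, 60, 30} and a new
+	 * 50-block extent, the tail-to-head scan shifts the 30-block entry
+	 * down, stops at the 60-block entry (i), and inserts the new extent
+	 * at i+1, yielding {100, 60, 50, 30}.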
+	 */
+	for (i = (int)hfsmp->vcbFreeExtCnt-1; i >= 0; i--) {
+		if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
+			/* For sparse devices, find entry with smaller start block than ours */
+			if (hfsmp->vcbFreeExt[i].startBlock < startBlock) {
+				break;
+			}
+		} else {
+			/* For normal devices, find entry with greater block count than ours */
+			if (hfsmp->vcbFreeExt[i].blockCount >= blockCount) {
+				break;
+			}
+		}
+
+		/* If this is not the right spot to insert, and this is
+		 * not the last entry in the array, just shift it and
+		 * continue checking the next one.
+		 */
+		if ((i+1) < kMaxFreeExtents) {
+			hfsmp->vcbFreeExt[i+1] = hfsmp->vcbFreeExt[i];
+		}
+	}
+	/* 'i' points to one index offset before which the new extent should be inserted */
+	hfsmp->vcbFreeExt[i+1].startBlock = startBlock;
+	hfsmp->vcbFreeExt[i+1].blockCount = blockCount;
+	if (hfsmp->vcbFreeExtCnt < kMaxFreeExtents) {
+		hfsmp->vcbFreeExtCnt++;
+	}
+	retval = true;
+
+out:
+	lck_spin_unlock(&hfsmp->vcbFreeExtLock);
+out_not_locked:
+	sanity_check_free_ext(hfsmp, 0);
+
+	if (hfs_kdebug_allocation & HFSDBG_EXT_CACHE_ENABLED)
+		KERNEL_DEBUG_CONSTANT(HFSDBG_ADD_EXTENT_CACHE | DBG_FUNC_END, 0, 0, 0, retval, 0);
+
+	return retval;
+}
+
+/* Debug function to check if the free extent cache is good or not */
+static void sanity_check_free_ext(struct hfsmount *hfsmp, int check_allocated)
+{
+	u_int32_t i, j;
+
+	/* Do not do anything if debug is not on, or if we're using the red-black tree */
+	if ((ALLOC_DEBUG == 0) || (hfs_isrbtree_active(hfsmp) == true)) {
+		return;
+	}
+
+	lck_spin_lock(&hfsmp->vcbFreeExtLock);
+
+	/*
+	 * Iterate the Free extent cache and ensure no entries are bogus or refer to
+	 * allocated blocks.
+	 */
+	for(i=0; i < hfsmp->vcbFreeExtCnt; i++) {
+		u_int32_t start, nblocks;
+
+		start   = hfsmp->vcbFreeExt[i].startBlock;
+		nblocks = hfsmp->vcbFreeExt[i].blockCount;
+
+		//printf ("hfs: %p: slot:%d (%u,%u)\n", hfsmp, i, start, nblocks);
+
+		/* Check if any of the blocks in free extent cache are allocated.
+		 * This should not always be enabled because it might take
+		 * very long for large extents that get added to the list.
+		 *
+		 * We have to drop vcbFreeExtLock while we call hfs_isallocated
+		 * because it is going to do I/O.  Note that the free extent
+		 * cache could change.  That's a risk we take when using this
+		 * debugging code.  (Another alternative would be to try to
+		 * detect when the free extent cache changed, and perhaps
+		 * restart if the list changed while we dropped the lock.)
+		 */
+		if (check_allocated) {
+			lck_spin_unlock(&hfsmp->vcbFreeExtLock);
+			if (hfs_isallocated(hfsmp, start, nblocks)) {
+				panic("hfs: %p: slot %d:(%u,%u) in the free extent array is allocated\n",
+					hfsmp, i, start, nblocks);
+			}
+			lck_spin_lock(&hfsmp->vcbFreeExtLock);
+		}
+
+		/* Check if any part of the extent is beyond allocLimit */
+		if ((start > hfsmp->allocLimit) || ((start + nblocks) > hfsmp->allocLimit)) {
+			panic ("hfs: %p: slot %d:(%u,%u) in the free extent array is beyond allocLimit=%u\n",
+				hfsmp, i, start, nblocks, hfsmp->allocLimit);
+		}
+
+		/* Check if there are any duplicate start blocks */
+		for(j=i+1; j < hfsmp->vcbFreeExtCnt; j++) {
+			if (start == hfsmp->vcbFreeExt[j].startBlock) {
+				panic("hfs: %p: slot %d:(%u,%u) and %d:(%u,%u) are duplicates\n",
+					hfsmp, i, start, nblocks, j, hfsmp->vcbFreeExt[j].startBlock,
+					hfsmp->vcbFreeExt[j].blockCount);
+			}
+		}
+
+		/* Check if the entries are out of order */
+		if ((i+1) != hfsmp->vcbFreeExtCnt) {
+			if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
+				/* sparse devices are sorted by starting block number (ascending) */
+				if (hfsmp->vcbFreeExt[i].startBlock > hfsmp->vcbFreeExt[i+1].startBlock) {
+					panic ("hfs: %p: SPARSE %d:(%u,%u) and %d:(%u,%u) are out of order\n",
+						hfsmp, i, start, nblocks, i+1, hfsmp->vcbFreeExt[i+1].startBlock,
+						hfsmp->vcbFreeExt[i+1].blockCount);
+				}
+			} else {
+				/* normally sorted by block count (descending) */
+				if (hfsmp->vcbFreeExt[i].blockCount < hfsmp->vcbFreeExt[i+1].blockCount) {
+					panic ("hfs: %p: %d:(%u,%u) and %d:(%u,%u) are out of order\n",
+						hfsmp, i, start, nblocks, i+1, hfsmp->vcbFreeExt[i+1].startBlock,
+						hfsmp->vcbFreeExt[i+1].blockCount);
+				}
+			}
+		}
+	}
+	lck_spin_unlock(&hfsmp->vcbFreeExtLock);
+}
diff --git a/bsd/hfs/hfscommon/headers/FileMgrInternal.h b/bsd/hfs/hfscommon/headers/FileMgrInternal.h
index 307178907..7276daa26 100644
--- a/bsd/hfs/hfscommon/headers/FileMgrInternal.h
+++ b/bsd/hfs/hfscommon/headers/FileMgrInternal.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -32,7 +32,7 @@
 	Version:	HFS Plus 1.0
 
-	Copyright:	© 1996-2001 by Apple Computer, Inc., all rights reserved.
+	Copyright:	© 1996-2001 by Apple Computer, Inc., all rights reserved.
*/ #ifndef __FILEMGRINTERNAL__ @@ -189,6 +189,8 @@ ExchangeFileIDs (ExtendedVCB * volume, u_int32_t srcHint, u_int32_t destHint ); +EXTERN_API_C( OSErr ) +MoveData( ExtendedVCB *vcb, HFSCatalogNodeID srcID, HFSCatalogNodeID destID, int rsrc); /* BTree Manager Routines*/ @@ -232,7 +234,7 @@ BlockDeallocate (ExtendedVCB * vcb, u_int32_t flags); EXTERN_API_C ( void ) -invalidate_free_extent_cache (ExtendedVCB * vcb); +ResetVCBFreeExtCache(struct hfsmount *hfsmp); EXTERN_API_C( OSErr ) BlockMarkAllocated(ExtendedVCB *vcb, u_int32_t startingBlock, u_int32_t numBlocks); @@ -240,8 +242,28 @@ BlockMarkAllocated(ExtendedVCB *vcb, u_int32_t startingBlock, u_int32_t numBlock EXTERN_API_C( OSErr ) BlockMarkFree( ExtendedVCB *vcb, u_int32_t startingBlock, u_int32_t numBlocks); +EXTERN_API_C( OSErr ) +BlockMarkFreeUnused( ExtendedVCB *vcb, u_int32_t startingBlock, u_int32_t numBlocks); + EXTERN_API_C( u_int32_t ) MetaZoneFreeBlocks(ExtendedVCB *vcb); + +EXTERN_API_C( u_int32_t ) +UpdateAllocLimit (struct hfsmount *hfsmp, u_int32_t new_end_block); + +#if CONFIG_HFS_ALLOC_RBTREE +EXTERN_API_C( u_int32_t ) +GenerateTree( struct hfsmount *hfsmp, u_int32_t end_block, int *flags, int initialscan); + +EXTERN_API_C( void ) +DestroyTrees( struct hfsmount *hfsmp); + +EXTERN_API_C( u_int32_t ) +InitTree(struct hfsmount *hfsmp); +#endif + + + /* File Extent Mapping routines*/ EXTERN_API_C( OSErr ) @@ -256,11 +278,9 @@ CompareExtentKeysPlus (const HFSPlusExtentKey *searchKey, const HFSPlusExtentKey *trialKey); EXTERN_API_C( OSErr ) -TruncateFileC (ExtendedVCB * vcb, - FCB * fcb, - int64_t peof, - Boolean truncateToExtent); - +TruncateFileC (ExtendedVCB *vcb, FCB *fcb, int64_t peof, int deleted, + int rsrc, uint32_t fileid, Boolean truncateToExtent); + EXTERN_API_C( OSErr ) ExtendFileC (ExtendedVCB * vcb, FCB * fcb, diff --git a/bsd/hfs/hfscommon/headers/HybridAllocator.h b/bsd/hfs/hfscommon/headers/HybridAllocator.h new file mode 100644 index 000000000..4add9daee --- /dev/null +++ b/bsd/hfs/hfscommon/headers/HybridAllocator.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + + +#ifndef __HYBRID_ALLOC__ +#define __HYBRID_ALLOC__ + +#include <sys/types.h> +#include "RedBlackTree.h" + +typedef struct extent_node extent_node_t; + +struct extent_node +{ + u_int32_t length; + u_int32_t offset; + struct extent_node *offset_next; + rb_node(extent_node_t) offset_link; +}; + +typedef rb_tree(extent_node_t) extent_tree_offset_t; + +extern extent_node_t * +alloc_node(u_int32_t length, u_int32_t offset); + +extern void +free_node(extent_node_t *node); + +extern extent_node_t * +extent_tree_free_space( extent_tree_offset_t *offset_tree, u_int32_t size, u_int32_t offset); + +extern void +extent_tree_offset_print(extent_tree_offset_t *offset_tree); + +extern int32_t +extent_tree_offset_alloc_space(extent_tree_offset_t *offset_tree, u_int32_t size, u_int32_t offset); + +extern int32_t +extent_tree_offset_alloc_unaligned(extent_tree_offset_t *tree, u_int32_t size, u_int32_t offset); + + +extern void +extent_tree_remove_node (extent_tree_offset_t *offset_tree, extent_node_t * node); + +extern extent_node_t * +extent_tree_off_first (extent_tree_offset_t *offset_tree); + +extern extent_node_t * +extent_tree_off_search(extent_tree_offset_t *offset_tree, extent_node_t *node); + +extern extent_node_t * +extent_tree_off_search_next(extent_tree_offset_t *offset_tree, extent_node_t *node); + +extern extent_node_t* +extent_tree_off_search_nextWithSize (extent_tree_offset_t *offset_tree, extent_node_t *node); + +extern extent_node_t * +extent_tree_off_search_prev(extent_tree_offset_t *offset_tree, extent_node_t *node); + +extern extent_node_t * +extent_tree_off_next(extent_tree_offset_t *offset_tree, extent_node_t *node); + +extern extent_node_t * +extent_tree_off_prev(extent_tree_offset_t *offset_tree, extent_node_t *node); + +extern void +extent_tree_init(extent_tree_offset_t *offset_tree); + +extern void +extent_tree_destroy(extent_tree_offset_t *offset_tree); + +extern int +cmp_offset_node(extent_node_t *node_1, extent_node_t *node_2); + + +#endif diff --git a/bsd/hfs/hfscommon/headers/RedBlackTree.h b/bsd/hfs/hfscommon/headers/RedBlackTree.h new file mode 100644 index 000000000..21342296c --- /dev/null +++ b/bsd/hfs/hfscommon/headers/RedBlackTree.h @@ -0,0 +1,969 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/****************************************************************************** + * + * Copyright (C) 2008 Jason Evans <jasone@FreeBSD.org>. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice(s), this list of conditions and the following disclaimer + * unmodified other than the allowable addition of one or more + * copyright notices. + * 2. Redistributions in binary form must reproduce the above copyright + * notice(s), this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ****************************************************************************** + * + * cpp macro implementation of left-leaning red-black trees. + * + * Usage: + * + * (Optional, see assert(3).) + * #define NDEBUG + * + * (Required.) + * #include <assert.h> + * #include <rb.h> + * ... + * + * All operations are done non-recursively. Parent pointers are not used, and + * color bits are stored in the least significant bit of right-child pointers, + * thus making node linkage as compact as is possible for red-black trees. + * + * Some macros use a comparison function pointer, which is expected to have the + * following prototype: + * + * int (a_cmp *)(a_type *a_node, a_type *a_other); + * ^^^^^^ + * or a_key + * + * Interpretation of comparision function return values: + * + * -1 : a_node < a_other + * 0 : a_node == a_other + * 1 : a_node > a_other + * + * In all cases, the a_node or a_key macro argument is the first argument to the + * comparison function, which makes it possible to write comparison functions + * that treat the first argument specially. + * + ******************************************************************************/ + +#ifndef RB_H_ +#define RB_H_ + +#define RB_COMPACT +#ifdef RB_COMPACT +/* Node structure. */ +#define rb_node(a_type) \ +struct { \ + a_type *rbn_left; \ + a_type *rbn_right_red; \ +} +#else +#define rb_node(a_type) \ +struct { \ + a_type *rbn_left; \ + a_type *rbn_right; \ + bool rbn_red; \ +} +#endif + +/* Root structure. */ +#define rb_tree(a_type) \ +struct { \ + a_type *rbt_root; \ + a_type rbt_nil; \ +} + +/* Left accessors. 
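The comparator contract above (strict -1/0/1, with the first argument possibly a key) is easy to get wrong by subtracting unsigned fields. An offset comparator for the extent nodes, written to the contract, could look like this; the real cmp_offset_node() is defined in HybridAllocator.c, which this patch does not show.

static int
example_cmp_offset(extent_node_t *a, extent_node_t *b)
{
	/* No subtraction: offsets are unsigned, so compare explicitly. */
	if (a->offset < b->offset)
		return (-1);
	if (a->offset > b->offset)
		return (1);
	return (0);
}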
*/ +#define rbp_left_get(a_type, a_field, a_node) \ + ((a_node)->a_field.rbn_left) +#define rbp_left_set(a_type, a_field, a_node, a_left) do { \ + (a_node)->a_field.rbn_left = a_left; \ +} while (0) + +#ifdef RB_COMPACT +/* Right accessors. */ +#define rbp_right_get(a_type, a_field, a_node) \ + ((a_type *) (((intptr_t) (a_node)->a_field.rbn_right_red) \ + & ((ssize_t)-2))) +#define rbp_right_set(a_type, a_field, a_node, a_right) do { \ + (a_node)->a_field.rbn_right_red = (a_type *) (((uintptr_t) a_right) \ + | (((uintptr_t) (a_node)->a_field.rbn_right_red) & ((size_t)1))); \ +} while (0) + +/* Color accessors. */ +#define rbp_red_get(a_type, a_field, a_node) \ + ((bool) (((uintptr_t) (a_node)->a_field.rbn_right_red) \ + & ((size_t)1))) +#define rbp_color_set(a_type, a_field, a_node, a_red) do { \ + (a_node)->a_field.rbn_right_red = (a_type *) ((((intptr_t) \ + (a_node)->a_field.rbn_right_red) & ((ssize_t)-2)) \ + | ((ssize_t)a_red)); \ +} while (0) +#define rbp_red_set(a_type, a_field, a_node) do { \ + (a_node)->a_field.rbn_right_red = (a_type *) (((uintptr_t) \ + (a_node)->a_field.rbn_right_red) | ((size_t)1)); \ +} while (0) +#define rbp_black_set(a_type, a_field, a_node) do { \ + (a_node)->a_field.rbn_right_red = (a_type *) (((intptr_t) \ + (a_node)->a_field.rbn_right_red) & ((ssize_t)-2)); \ +} while (0) +#else +/* Right accessors. */ +#define rbp_right_get(a_type, a_field, a_node) \ + ((a_node)->a_field.rbn_right) +#define rbp_right_set(a_type, a_field, a_node, a_right) do { \ + (a_node)->a_field.rbn_right = a_right; \ +} while (0) + +/* Color accessors. */ +#define rbp_red_get(a_type, a_field, a_node) \ + ((a_node)->a_field.rbn_red) +#define rbp_color_set(a_type, a_field, a_node, a_red) do { \ + (a_node)->a_field.rbn_red = (a_red); \ +} while (0) +#define rbp_red_set(a_type, a_field, a_node) do { \ + (a_node)->a_field.rbn_red = true; \ +} while (0) +#define rbp_black_set(a_type, a_field, a_node) do { \ + (a_node)->a_field.rbn_red = false; \ +} while (0) +#endif + +/* Node initializer. */ +#define rbp_node_new(a_type, a_field, a_tree, a_node) do { \ + rbp_left_set(a_type, a_field, (a_node), &(a_tree)->rbt_nil); \ + rbp_right_set(a_type, a_field, (a_node), &(a_tree)->rbt_nil); \ + rbp_red_set(a_type, a_field, (a_node)); \ +} while (0) + +/* Tree initializer. */ +#define rb_new(a_type, a_field, a_tree) do { \ + (a_tree)->rbt_root = &(a_tree)->rbt_nil; \ + rbp_node_new(a_type, a_field, a_tree, &(a_tree)->rbt_nil); \ + rbp_black_set(a_type, a_field, &(a_tree)->rbt_nil); \ +} while (0) + +/* Tree operations. 
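To use the package, a client embeds rb_node() linkage in its own structure and initializes a root with rb_new(); with RB_COMPACT the linkage costs two pointers, the color bit hiding in the low bit of the right pointer. A minimal sketch with illustrative names:

struct mynode {
	u_int32_t key;
	rb_node(struct mynode) link;    /* embedded linkage, two pointers */
};
typedef rb_tree(struct mynode) mytree_t;

static void
example_tree_setup(mytree_t *t)
{
	/* Empty tree: the embedded rbt_nil sentinel becomes its own black child. */
	rb_new(struct mynode, link, t);
}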
*/ +#define rbp_black_height(a_type, a_field, a_tree, r_height) do { \ + a_type *rbp_bh_t; \ + for (rbp_bh_t = (a_tree)->rbt_root, (r_height) = 0; \ + rbp_bh_t != &(a_tree)->rbt_nil; \ + rbp_bh_t = rbp_left_get(a_type, a_field, rbp_bh_t)) { \ + if (rbp_red_get(a_type, a_field, rbp_bh_t) == false) { \ + (r_height)++; \ + } \ + } \ +} while (0) + +#define rbp_first(a_type, a_field, a_tree, a_root, r_node) do { \ + for ((r_node) = (a_root); \ + rbp_left_get(a_type, a_field, (r_node)) != &(a_tree)->rbt_nil; \ + (r_node) = rbp_left_get(a_type, a_field, (r_node))) { \ + } \ +} while (0) + +#define rbp_last(a_type, a_field, a_tree, a_root, r_node) do { \ + for ((r_node) = (a_root); \ + rbp_right_get(a_type, a_field, (r_node)) != &(a_tree)->rbt_nil; \ + (r_node) = rbp_right_get(a_type, a_field, (r_node))) { \ + } \ +} while (0) + +#define rbp_next(a_type, a_field, a_cmp, a_tree, a_node, r_node) do { \ + if (rbp_right_get(a_type, a_field, (a_node)) != &(a_tree)->rbt_nil) { \ + rbp_first(a_type, a_field, a_tree, rbp_right_get(a_type, \ + a_field, (a_node)), (r_node)); \ + } else { \ + a_type *rbp_n_t = (a_tree)->rbt_root; \ + assert(rbp_n_t != &(a_tree)->rbt_nil); \ + (r_node) = &(a_tree)->rbt_nil; \ + while (true) { \ + int rbp_n_cmp = (a_cmp)((a_node), rbp_n_t); \ + if (rbp_n_cmp < 0) { \ + (r_node) = rbp_n_t; \ + rbp_n_t = rbp_left_get(a_type, a_field, rbp_n_t); \ + } else if (rbp_n_cmp > 0) { \ + rbp_n_t = rbp_right_get(a_type, a_field, rbp_n_t); \ + } else { \ + break; \ + } \ + assert(rbp_n_t != &(a_tree)->rbt_nil); \ + } \ + } \ +} while (0) + +#define rbp_prev(a_type, a_field, a_cmp, a_tree, a_node, r_node) do { \ + if (rbp_left_get(a_type, a_field, (a_node)) != &(a_tree)->rbt_nil) {\ + rbp_last(a_type, a_field, a_tree, rbp_left_get(a_type, \ + a_field, (a_node)), (r_node)); \ + } else { \ + a_type *rbp_p_t = (a_tree)->rbt_root; \ + assert(rbp_p_t != &(a_tree)->rbt_nil); \ + (r_node) = &(a_tree)->rbt_nil; \ + while (true) { \ + int rbp_p_cmp = (a_cmp)((a_node), rbp_p_t); \ + if (rbp_p_cmp < 0) { \ + rbp_p_t = rbp_left_get(a_type, a_field, rbp_p_t); \ + } else if (rbp_p_cmp > 0) { \ + (r_node) = rbp_p_t; \ + rbp_p_t = rbp_right_get(a_type, a_field, rbp_p_t); \ + } else { \ + break; \ + } \ + assert(rbp_p_t != &(a_tree)->rbt_nil); \ + } \ + } \ +} while (0) + +#define rb_first(a_type, a_field, a_tree, r_node) do { \ + rbp_first(a_type, a_field, a_tree, (a_tree)->rbt_root, (r_node)); \ + if ((r_node) == &(a_tree)->rbt_nil) { \ + (r_node) = NULL; \ + } \ +} while (0) + +#define rb_last(a_type, a_field, a_tree, r_node) do { \ + rbp_last(a_type, a_field, a_tree, (a_tree)->rbt_root, r_node); \ + if ((r_node) == &(a_tree)->rbt_nil) { \ + (r_node) = NULL; \ + } \ +} while (0) + +#define rb_next(a_type, a_field, a_cmp, a_tree, a_node, r_node) do { \ + rbp_next(a_type, a_field, a_cmp, a_tree, (a_node), (r_node)); \ + if ((r_node) == &(a_tree)->rbt_nil) { \ + (r_node) = NULL; \ + } \ +} while (0) + +#define rb_prev(a_type, a_field, a_cmp, a_tree, a_node, r_node) do { \ + rbp_prev(a_type, a_field, a_cmp, a_tree, (a_node), (r_node)); \ + if ((r_node) == &(a_tree)->rbt_nil) { \ + (r_node) = NULL; \ + } \ +} while (0) + +#define rb_search(a_type, a_field, a_cmp, a_tree, a_key, r_node) do { \ + int rbp_se_cmp; \ + (r_node) = (a_tree)->rbt_root; \ + while ((r_node) != &(a_tree)->rbt_nil && (rbp_se_cmp = (a_cmp)((a_key), (r_node))) != 0) { \ + if (rbp_se_cmp < 0) { \ + (r_node) = rbp_left_get(a_type, a_field, (r_node)); \ + } else { \ + (r_node) = rbp_right_get(a_type, a_field, (r_node)); \ + } \ + } \ + if 
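rb_first()/rb_next() translate the internal nil sentinel into NULL for callers, giving the usual in-order walk. Note that a_node and r_node must be distinct variables: rbp_next() assigns r_node before it is finished reading a_node. A sketch, reusing the mynode type from the previous example and assuming a mynode_cmp comparator written to the stated contract:

extern int mynode_cmp(struct mynode *a, struct mynode *b); /* assumed */

static void
example_walk(mytree_t *t)
{
	struct mynode *n, *succ;

	rb_first(struct mynode, link, t, n);
	while (n != NULL) {
		/* visit n here */
		rb_next(struct mynode, link, mynode_cmp, t, n, succ);
		n = succ;
	}
}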
((r_node) == &(a_tree)->rbt_nil) { \ + (r_node) = NULL; \ + } \ +} while (0) + +/* + * Find a match if it exists. Otherwise, find the next greater node, if one + * exists. + */ +#define rb_nsearch(a_type, a_field, a_cmp, a_tree, a_key, r_node) do { \ + a_type *rbp_ns_t = (a_tree)->rbt_root; \ + (r_node) = NULL; \ + while (rbp_ns_t != &(a_tree)->rbt_nil) { \ + int rbp_ns_cmp = (a_cmp)((a_key), rbp_ns_t); \ + if (rbp_ns_cmp < 0) { \ + (r_node) = rbp_ns_t; \ + rbp_ns_t = rbp_left_get(a_type, a_field, rbp_ns_t); \ + } else if (rbp_ns_cmp > 0) { \ + rbp_ns_t = rbp_right_get(a_type, a_field, rbp_ns_t); \ + } else { \ + (r_node) = rbp_ns_t; \ + break; \ + } \ + } \ +} while (0) + +/* + * Find a match if it exists. Otherwise, find the previous lesser node, if one + * exists. + */ +#define rb_psearch(a_type, a_field, a_cmp, a_tree, a_key, r_node) do { \ + a_type *rbp_ps_t = (a_tree)->rbt_root; \ + (r_node) = NULL; \ + while (rbp_ps_t != &(a_tree)->rbt_nil) { \ + int rbp_ps_cmp = (a_cmp)((a_key), rbp_ps_t); \ + if (rbp_ps_cmp < 0) { \ + rbp_ps_t = rbp_left_get(a_type, a_field, rbp_ps_t); \ + } else if (rbp_ps_cmp > 0) { \ + (r_node) = rbp_ps_t; \ + rbp_ps_t = rbp_right_get(a_type, a_field, rbp_ps_t); \ + } else { \ + (r_node) = rbp_ps_t; \ + break; \ + } \ + } \ +} while (0) + +#define rbp_rotate_left(a_type, a_field, a_node, r_node) do { \ + (r_node) = rbp_right_get(a_type, a_field, (a_node)); \ + rbp_right_set(a_type, a_field, (a_node), rbp_left_get(a_type, a_field, (r_node))); \ + rbp_left_set(a_type, a_field, (r_node), (a_node)); \ +} while (0) + +#define rbp_rotate_right(a_type, a_field, a_node, r_node) do { \ + (r_node) = rbp_left_get(a_type, a_field, (a_node)); \ + rbp_left_set(a_type, a_field, (a_node), rbp_right_get(a_type, a_field, (r_node))); \ + rbp_right_set(a_type, a_field, (r_node), (a_node)); \ +} while (0) + +#define rbp_lean_left(a_type, a_field, a_node, r_node) do { \ + bool rbp_ll_red; \ + rbp_rotate_left(a_type, a_field, (a_node), (r_node)); \ + rbp_ll_red = rbp_red_get(a_type, a_field, (a_node)); \ + rbp_color_set(a_type, a_field, (r_node), rbp_ll_red); \ + rbp_red_set(a_type, a_field, (a_node)); \ +} while (0) + +#define rbp_lean_right(a_type, a_field, a_node, r_node) do { \ + bool rbp_lr_red; \ + rbp_rotate_right(a_type, a_field, (a_node), (r_node)); \ + rbp_lr_red = rbp_red_get(a_type, a_field, (a_node)); \ + rbp_color_set(a_type, a_field, (r_node), rbp_lr_red); \ + rbp_red_set(a_type, a_field, (a_node)); \ +} while (0) + +#define rbp_move_red_left(a_type, a_field, a_node, r_node) do { \ + a_type *rbp_mrl_t, *rbp_mrl_u; \ + rbp_mrl_t = rbp_left_get(a_type, a_field, (a_node)); \ + rbp_red_set(a_type, a_field, rbp_mrl_t); \ + rbp_mrl_t = rbp_right_get(a_type, a_field, (a_node)); \ + rbp_mrl_u = rbp_left_get(a_type, a_field, rbp_mrl_t); \ + if (rbp_red_get(a_type, a_field, rbp_mrl_u)) { \ + rbp_rotate_right(a_type, a_field, rbp_mrl_t, rbp_mrl_u); \ + rbp_right_set(a_type, a_field, (a_node), rbp_mrl_u); \ + rbp_rotate_left(a_type, a_field, (a_node), (r_node)); \ + rbp_mrl_t = rbp_right_get(a_type, a_field, (a_node)); \ + if (rbp_red_get(a_type, a_field, rbp_mrl_t)) { \ + rbp_black_set(a_type, a_field, rbp_mrl_t); \ + rbp_red_set(a_type, a_field, (a_node)); \ + rbp_rotate_left(a_type, a_field, (a_node), rbp_mrl_t); \ + rbp_left_set(a_type, a_field, (r_node), rbp_mrl_t); \ + } else { \ + rbp_black_set(a_type, a_field, (a_node)); \ + } \ + } else { \ + rbp_red_set(a_type, a_field, (a_node)); \ + rbp_rotate_left(a_type, a_field, (a_node), (r_node)); \ + } \ +} while (0) + +#define 
rbp_move_red_right(a_type, a_field, a_node, r_node) do { \ + a_type *rbp_mrr_t; \ + rbp_mrr_t = rbp_left_get(a_type, a_field, (a_node)); \ + if (rbp_red_get(a_type, a_field, rbp_mrr_t)) { \ + a_type *rbp_mrr_u, *rbp_mrr_v; \ + rbp_mrr_u = rbp_right_get(a_type, a_field, rbp_mrr_t); \ + rbp_mrr_v = rbp_left_get(a_type, a_field, rbp_mrr_u); \ + if (rbp_red_get(a_type, a_field, rbp_mrr_v)) { \ + rbp_color_set(a_type, a_field, rbp_mrr_u, rbp_red_get(a_type, a_field, (a_node))); \ + rbp_black_set(a_type, a_field, rbp_mrr_v); \ + rbp_rotate_left(a_type, a_field, rbp_mrr_t, rbp_mrr_u); \ + rbp_left_set(a_type, a_field, (a_node), rbp_mrr_u); \ + rbp_rotate_right(a_type, a_field, (a_node), (r_node)); \ + rbp_rotate_left(a_type, a_field, (a_node), rbp_mrr_t); \ + rbp_right_set(a_type, a_field, (r_node), rbp_mrr_t); \ + } else { \ + rbp_color_set(a_type, a_field, rbp_mrr_t, rbp_red_get(a_type, a_field, (a_node))); \ + rbp_red_set(a_type, a_field, rbp_mrr_u); \ + rbp_rotate_right(a_type, a_field, (a_node), (r_node)); \ + rbp_rotate_left(a_type, a_field, (a_node), rbp_mrr_t); \ + rbp_right_set(a_type, a_field, (r_node), rbp_mrr_t); \ + } \ + rbp_red_set(a_type, a_field, (a_node)); \ + } else { \ + rbp_red_set(a_type, a_field, rbp_mrr_t); \ + rbp_mrr_t = rbp_left_get(a_type, a_field, rbp_mrr_t); \ + if (rbp_red_get(a_type, a_field, rbp_mrr_t)) { \ + rbp_black_set(a_type, a_field, rbp_mrr_t); \ + rbp_rotate_right(a_type, a_field, (a_node), (r_node)); \ + rbp_rotate_left(a_type, a_field, (a_node), rbp_mrr_t); \ + rbp_right_set(a_type, a_field, (r_node), rbp_mrr_t); \ + } else { \ + rbp_rotate_left(a_type, a_field, (a_node), (r_node)); \ + } \ + } \ +} while (0) + +#define rb_insert(a_type, a_field, a_cmp, a_tree, a_node) do { \ + a_type rbp_i_s; \ + a_type *rbp_i_g, *rbp_i_p, *rbp_i_c, *rbp_i_t, *rbp_i_u; \ + int rbp_i_cmp = 0; \ + rbp_i_g = &(a_tree)->rbt_nil; \ + rbp_left_set(a_type, a_field, &rbp_i_s, (a_tree)->rbt_root); \ + rbp_right_set(a_type, a_field, &rbp_i_s, &(a_tree)->rbt_nil); \ + rbp_black_set(a_type, a_field, &rbp_i_s); \ + rbp_i_p = &rbp_i_s; \ + rbp_i_c = (a_tree)->rbt_root; \ + /* Iteratively search down the tree for the insertion point, */\ + /* splitting 4-nodes as they are encountered. At the end of each */\ + /* iteration, rbp_i_g->rbp_i_p->rbp_i_c is a 3-level path down */\ + /* the tree, assuming a sufficiently deep tree. */\ + while (rbp_i_c != &(a_tree)->rbt_nil) { \ + rbp_i_t = rbp_left_get(a_type, a_field, rbp_i_c); \ + rbp_i_u = rbp_left_get(a_type, a_field, rbp_i_t); \ + if (rbp_red_get(a_type, a_field, rbp_i_t) \ + && rbp_red_get(a_type, a_field, rbp_i_u)) { \ + /* rbp_i_c is the top of a logical 4-node, so split it. */\ + /* This iteration does not move down the tree, due to the */\ + /* disruptiveness of node splitting. */\ + /* */\ + /* Rotate right. */\ + rbp_rotate_right(a_type, a_field, rbp_i_c, rbp_i_t); \ + /* Pass red links up one level. */\ + rbp_i_u = rbp_left_get(a_type, a_field, rbp_i_t); \ + rbp_black_set(a_type, a_field, rbp_i_u); \ + if (rbp_left_get(a_type, a_field, rbp_i_p) == rbp_i_c) { \ + rbp_left_set(a_type, a_field, rbp_i_p, rbp_i_t); \ + rbp_i_c = rbp_i_t; \ + } else { \ + /* rbp_i_c was the right child of rbp_i_p, so rotate */\ + /* left in order to maintain the left-leaning */\ + /* invariant. 
*/\ + assert(rbp_right_get(a_type, a_field, rbp_i_p) == rbp_i_c); \ + rbp_right_set(a_type, a_field, rbp_i_p, rbp_i_t); \ + rbp_lean_left(a_type, a_field, rbp_i_p, rbp_i_u); \ + if (rbp_left_get(a_type, a_field, rbp_i_g) == rbp_i_p) {\ + rbp_left_set(a_type, a_field, rbp_i_g, rbp_i_u); \ + } else { \ + assert(rbp_right_get(a_type, a_field, rbp_i_g) == rbp_i_p); \ + rbp_right_set(a_type, a_field, rbp_i_g, rbp_i_u); \ + } \ + rbp_i_p = rbp_i_u; \ + rbp_i_cmp = (a_cmp)((a_node), rbp_i_p); \ + if (rbp_i_cmp < 0) { \ + rbp_i_c = rbp_left_get(a_type, a_field, rbp_i_p); \ + } else { \ + assert(rbp_i_cmp > 0); \ + rbp_i_c = rbp_right_get(a_type, a_field, rbp_i_p); \ + } \ + continue; \ + } \ + } \ + rbp_i_g = rbp_i_p; \ + rbp_i_p = rbp_i_c; \ + rbp_i_cmp = (a_cmp)((a_node), rbp_i_c); \ + if (rbp_i_cmp < 0) { \ + rbp_i_c = rbp_left_get(a_type, a_field, rbp_i_c); \ + } else { \ + assert(rbp_i_cmp > 0); \ + rbp_i_c = rbp_right_get(a_type, a_field, rbp_i_c); \ + } \ + } \ + /* rbp_i_p now refers to the node under which to insert. */\ + rbp_node_new(a_type, a_field, a_tree, (a_node)); \ + if (rbp_i_cmp > 0) { \ + rbp_right_set(a_type, a_field, rbp_i_p, (a_node)); \ + rbp_lean_left(a_type, a_field, rbp_i_p, rbp_i_t); \ + if (rbp_left_get(a_type, a_field, rbp_i_g) == rbp_i_p) { \ + rbp_left_set(a_type, a_field, rbp_i_g, rbp_i_t); \ + } else if (rbp_right_get(a_type, a_field, rbp_i_g) == rbp_i_p) {\ + rbp_right_set(a_type, a_field, rbp_i_g, rbp_i_t); \ + } \ + } else { \ + rbp_left_set(a_type, a_field, rbp_i_p, (a_node)); \ + } \ + /* Update the root and make sure that it is black. */\ + (a_tree)->rbt_root = rbp_left_get(a_type, a_field, &rbp_i_s); \ + rbp_black_set(a_type, a_field, (a_tree)->rbt_root); \ +} while (0) + +#define rb_remove(a_type, a_field, a_cmp, a_tree, a_node) do { \ + a_type rbp_r_s; \ + a_type *rbp_r_p, *rbp_r_c, *rbp_r_xp, *rbp_r_t, *rbp_r_u; \ + int rbp_r_cmp; \ + rbp_left_set(a_type, a_field, &rbp_r_s, (a_tree)->rbt_root); \ + rbp_right_set(a_type, a_field, &rbp_r_s, &(a_tree)->rbt_nil); \ + rbp_black_set(a_type, a_field, &rbp_r_s); \ + rbp_r_p = &rbp_r_s; \ + rbp_r_c = (a_tree)->rbt_root; \ + rbp_r_xp = &(a_tree)->rbt_nil; \ + /* Iterate down the tree, but always transform 2-nodes to 3- or */\ + /* 4-nodes in order to maintain the invariant that the current */\ + /* node is not a 2-node. This allows simple deletion once a leaf */\ + /* is reached. Handle the root specially though, since there may */\ + /* be no way to convert it from a 2-node to a 3-node. */\ + rbp_r_cmp = (a_cmp)((a_node), rbp_r_c); \ + if (rbp_r_cmp < 0) { \ + rbp_r_t = rbp_left_get(a_type, a_field, rbp_r_c); \ + rbp_r_u = rbp_left_get(a_type, a_field, rbp_r_t); \ + if (rbp_red_get(a_type, a_field, rbp_r_t) == false \ + && rbp_red_get(a_type, a_field, rbp_r_u) == false) { \ + /* Apply standard transform to prepare for left move. */\ + rbp_move_red_left(a_type, a_field, rbp_r_c, rbp_r_t); \ + rbp_black_set(a_type, a_field, rbp_r_t); \ + rbp_left_set(a_type, a_field, rbp_r_p, rbp_r_t); \ + rbp_r_c = rbp_r_t; \ + } else { \ + /* Move left. */\ + rbp_r_p = rbp_r_c; \ + rbp_r_c = rbp_left_get(a_type, a_field, rbp_r_c); \ + } \ + } else { \ + if (rbp_r_cmp == 0) { \ + assert((a_node) == rbp_r_c); \ + if (rbp_right_get(a_type, a_field, rbp_r_c) == &(a_tree)->rbt_nil) { \ + /* Delete root node (which is also a leaf node). 
*/\ + if (rbp_left_get(a_type, a_field, rbp_r_c) != &(a_tree)->rbt_nil) { \ + rbp_lean_right(a_type, a_field, rbp_r_c, rbp_r_t); \ + rbp_right_set(a_type, a_field, rbp_r_t, &(a_tree)->rbt_nil); \ + } else { \ + rbp_r_t = &(a_tree)->rbt_nil; \ + } \ + rbp_left_set(a_type, a_field, rbp_r_p, rbp_r_t); \ + } else { \ + /* This is the node we want to delete, but we will */\ + /* instead swap it with its successor and delete the */\ + /* successor. Record enough information to do the */\ + /* swap later. rbp_r_xp is the a_node's parent. */\ + rbp_r_xp = rbp_r_p; \ + rbp_r_cmp = 1; /* Note that deletion is incomplete. */\ + } \ + } \ + if (rbp_r_cmp == 1) { \ + if (rbp_red_get(a_type, a_field, rbp_left_get(a_type, \ + a_field, rbp_right_get(a_type, a_field, rbp_r_c))) == false) { \ + rbp_r_t = rbp_left_get(a_type, a_field, rbp_r_c); \ + if (rbp_red_get(a_type, a_field, rbp_r_t)) { \ + /* Standard transform. */\ + rbp_move_red_right(a_type, a_field, rbp_r_c, rbp_r_t); \ + } else { \ + /* Root-specific transform. */\ + rbp_red_set(a_type, a_field, rbp_r_c); \ + rbp_r_u = rbp_left_get(a_type, a_field, rbp_r_t); \ + if (rbp_red_get(a_type, a_field, rbp_r_u)) { \ + rbp_black_set(a_type, a_field, rbp_r_u); \ + rbp_rotate_right(a_type, a_field, rbp_r_c, rbp_r_t); \ + rbp_rotate_left(a_type, a_field, rbp_r_c, rbp_r_u); \ + rbp_right_set(a_type, a_field, rbp_r_t, rbp_r_u); \ + } else { \ + rbp_red_set(a_type, a_field, rbp_r_t); \ + rbp_rotate_left(a_type, a_field, rbp_r_c, rbp_r_t); \ + } \ + } \ + rbp_left_set(a_type, a_field, rbp_r_p, rbp_r_t); \ + rbp_r_c = rbp_r_t; \ + } else { \ + /* Move right */\ + rbp_r_p = rbp_r_c; \ + rbp_r_c = rbp_right_get(a_type, a_field, rbp_r_c); \ + } \ + } \ + } \ + if (rbp_r_cmp != 0) { \ + while (true) { \ + assert(rbp_r_p != &(a_tree)->rbt_nil); \ + rbp_r_cmp = (a_cmp)((a_node), rbp_r_c); \ + if (rbp_r_cmp < 0) { \ + rbp_r_t = rbp_left_get(a_type, a_field, rbp_r_c); \ + if (rbp_r_t == &(a_tree)->rbt_nil) { \ + /* rbp_r_c now refers to the successor node to */\ + /* relocate, and rbp_r_xp/a_node refer to the */\ + /* context for the relocation. */\ + if (rbp_left_get(a_type, a_field, rbp_r_xp) == (a_node)) { \ + rbp_left_set(a_type, a_field, rbp_r_xp, rbp_r_c); \ + } else { \ + assert(rbp_right_get(a_type, a_field, rbp_r_xp) == (a_node)); \ + rbp_right_set(a_type, a_field, rbp_r_xp, rbp_r_c); \ + } \ + rbp_left_set(a_type, a_field, rbp_r_c, rbp_left_get(a_type, a_field, (a_node))); \ + rbp_right_set(a_type, a_field, rbp_r_c, rbp_right_get(a_type, a_field, (a_node))); \ + rbp_color_set(a_type, a_field, rbp_r_c, rbp_red_get(a_type, a_field, (a_node))); \ + if (rbp_left_get(a_type, a_field, rbp_r_p) == rbp_r_c) { \ + rbp_left_set(a_type, a_field, rbp_r_p, &(a_tree)->rbt_nil); \ + } else { \ + assert(rbp_right_get(a_type, a_field, rbp_r_p) == rbp_r_c); \ + rbp_right_set(a_type, a_field, rbp_r_p, &(a_tree)->rbt_nil); \ + } \ + break; \ + } \ + rbp_r_u = rbp_left_get(a_type, a_field, rbp_r_t); \ + if (rbp_red_get(a_type, a_field, rbp_r_t) == false \ + && rbp_red_get(a_type, a_field, rbp_r_u) == false) { \ + rbp_move_red_left(a_type, a_field, rbp_r_c, rbp_r_t); \ + if (rbp_left_get(a_type, a_field, rbp_r_p) == rbp_r_c) { \ + rbp_left_set(a_type, a_field, rbp_r_p, rbp_r_t);\ + } else { \ + rbp_right_set(a_type, a_field, rbp_r_p, rbp_r_t); \ + } \ + rbp_r_c = rbp_r_t; \ + } else { \ + rbp_r_p = rbp_r_c; \ + rbp_r_c = rbp_left_get(a_type, a_field, rbp_r_c); \ + } \ + } else { \ + /* Check whether to delete this node (it has to be */\ + /* the correct node and a leaf node). 
*/\ + if (rbp_r_cmp == 0) { \ + assert((a_node) == rbp_r_c); \ + if (rbp_right_get(a_type, a_field, rbp_r_c) == &(a_tree)->rbt_nil) { \ + /* Delete leaf node. */\ + if (rbp_left_get(a_type, a_field, rbp_r_c) != &(a_tree)->rbt_nil) { \ + rbp_lean_right(a_type, a_field, rbp_r_c, rbp_r_t); \ + rbp_right_set(a_type, a_field, rbp_r_t, &(a_tree)->rbt_nil); \ + } else { \ + rbp_r_t = &(a_tree)->rbt_nil; \ + } \ + if (rbp_left_get(a_type, a_field, rbp_r_p) == rbp_r_c) { \ + rbp_left_set(a_type, a_field, rbp_r_p, rbp_r_t); \ + } else { \ + rbp_right_set(a_type, a_field, rbp_r_p, rbp_r_t); \ + } \ + break; \ + } else { \ + /* This is the node we want to delete, but we */\ + /* will instead swap it with its successor */\ + /* and delete the successor. Record enough */\ + /* information to do the swap later. */\ + /* rbp_r_xp is a_node's parent. */\ + rbp_r_xp = rbp_r_p; \ + } \ + } \ + rbp_r_t = rbp_right_get(a_type, a_field, rbp_r_c); \ + rbp_r_u = rbp_left_get(a_type, a_field, rbp_r_t); \ + if (rbp_red_get(a_type, a_field, rbp_r_u) == false) { \ + rbp_move_red_right(a_type, a_field, rbp_r_c, \ + rbp_r_t); \ + if (rbp_left_get(a_type, a_field, rbp_r_p) == rbp_r_c) { \ + rbp_left_set(a_type, a_field, rbp_r_p, rbp_r_t);\ + } else { \ + rbp_right_set(a_type, a_field, rbp_r_p, rbp_r_t); \ + } \ + rbp_r_c = rbp_r_t; \ + } else { \ + rbp_r_p = rbp_r_c; \ + rbp_r_c = rbp_right_get(a_type, a_field, rbp_r_c); \ + } \ + } \ + } \ + } \ + /* Update root. */\ + (a_tree)->rbt_root = rbp_left_get(a_type, a_field, &rbp_r_s); \ +} while (0) + +/* + * The rb_wrap() macro provides a convenient way to wrap functions around the + * cpp macros. The main benefits of wrapping are that 1) repeated macro + * expansion can cause code bloat, especially for rb_{insert,remove)(), and + * 2) type, linkage, comparison functions, etc. need not be specified at every + * call point. + */ + +#define rb_wrap(a_attr, a_prefix, a_tree_type, a_type, a_field, a_cmp) \ +a_attr void \ +a_prefix##new(a_tree_type *tree) { \ + rb_new(a_type, a_field, tree); \ +} \ +a_attr a_type * \ +a_prefix##first(a_tree_type *tree) { \ + a_type *ret; \ + rb_first(a_type, a_field, tree, ret); \ + return (ret); \ +} \ +a_attr a_type * \ +a_prefix##last(a_tree_type *tree) { \ + a_type *ret; \ + rb_last(a_type, a_field, tree, ret); \ + return (ret); \ +} \ +a_attr a_type * \ +a_prefix##next(a_tree_type *tree, a_type *node) { \ + a_type *ret; \ + rb_next(a_type, a_field, a_cmp, tree, node, ret); \ + return (ret); \ +} \ +a_attr a_type * \ +a_prefix##prev(a_tree_type *tree, a_type *node) { \ + a_type *ret; \ + rb_prev(a_type, a_field, a_cmp, tree, node, ret); \ + return (ret); \ +} \ +a_attr a_type * \ +a_prefix##search(a_tree_type *tree, a_type *key) { \ + a_type *ret; \ + rb_search(a_type, a_field, a_cmp, tree, key, ret); \ + return (ret); \ +} \ +a_attr a_type * \ +a_prefix##nsearch(a_tree_type *tree, a_type *key) { \ + a_type *ret; \ + rb_nsearch(a_type, a_field, a_cmp, tree, key, ret); \ + return (ret); \ +} \ +a_attr a_type * \ +a_prefix##psearch(a_tree_type *tree, a_type *key) { \ + a_type *ret; \ + rb_psearch(a_type, a_field, a_cmp, tree, key, ret); \ + return (ret); \ +} \ +a_attr void \ +a_prefix##insert(a_tree_type *tree, a_type *node) { \ + rb_insert(a_type, a_field, a_cmp, tree, node); \ +} \ +a_attr void \ +a_prefix##remove(a_tree_type *tree, a_type *node) { \ + rb_remove(a_type, a_field, a_cmp, tree, node); \ +} + +/* + * The iterators simulate recursion via an array of pointers that store the + * current path. 
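Before the iterator discussion continues below, one concrete note on rb_wrap(): a single expansion binds type, field, and comparator once, so call sites shrink to ordinary function calls. For example, binding it to the extent nodes declared earlier (the ext_ prefix is illustrative; the shipped allocator uses its own names):

rb_wrap(static, ext_, extent_tree_offset_t, extent_node_t, offset_link,
    cmp_offset_node)

static void
example_wrapped_use(extent_tree_offset_t *tree, extent_node_t *n)
{
	extent_node_t *first;

	ext_new(tree);              /* rb_new */
	ext_insert(tree, n);        /* rb_insert, comparator baked in */
	first = ext_first(tree);    /* NULL when the tree is empty */
	(void) first;
	ext_remove(tree, n);        /* rb_remove */
}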
This is critical to performance, since a series of calls to + * rb_{next,prev}() would require time proportional to (n lg n), whereas this + * implementation only requires time proportional to (n). + * + * Since the iterators cache a path down the tree, any tree modification may + * cause the cached path to become invalid. In order to continue iteration, + * use something like the following sequence: + * + * { + * a_type *node, *tnode; + * + * rb_foreach_begin(a_type, a_field, a_tree, node) { + * ... + * rb_next(a_type, a_field, a_cmp, a_tree, node, tnode); + * rb_remove(a_type, a_field, a_cmp, a_tree, node); + * rb_foreach_next(a_type, a_field, a_cmp, a_tree, tnode); + * ... + * } rb_foreach_end(a_type, a_field, a_tree, node) + * } + * + * Note that this idiom is not advised if every iteration modifies the tree, + * since in that case there is no algorithmic complexity improvement over a + * series of rb_{next,prev}() calls, thus making the setup overhead wasted + * effort. + */ + +#define rb_foreach_begin(a_type, a_field, a_tree, a_var) { /* brace A */ \ + /* Compute the maximum possible tree depth (3X the black height). */\ + unsigned rbp_f_height; \ + rbp_black_height(a_type, a_field, a_tree, rbp_f_height); \ + rbp_f_height *= 3; \ + { /* brace B */ \ + /* Initialize the path to contain the left spine. */\ + a_type *rbp_f_path[rbp_f_height]; \ + a_type *rbp_f_node; \ + bool rbp_f_synced = false; \ + unsigned rbp_f_depth = 0; \ + if ((a_tree)->rbt_root != &(a_tree)->rbt_nil) { \ + rbp_f_path[rbp_f_depth] = (a_tree)->rbt_root; \ + rbp_f_depth++; \ + while ((rbp_f_node = rbp_left_get(a_type, a_field, \ + rbp_f_path[rbp_f_depth-1])) != &(a_tree)->rbt_nil) { \ + rbp_f_path[rbp_f_depth] = rbp_f_node; \ + rbp_f_depth++; \ + } \ + } \ + /* While the path is non-empty, iterate. */\ + while (rbp_f_depth > 0) { /* brace C */ \ + (a_var) = rbp_f_path[rbp_f_depth-1]; + +/* + * Note that rb_foreach_begin omits closing }'s because + * it expects that it will be succeeded by a call to + * rb_foreach_end which will have the closing } + */ + +/* Only use if modifying the tree during iteration. */ +#define rb_foreach_next(a_type, a_field, a_cmp, a_tree, a_node) \ + /* Re-initialize the path to contain the path to a_node. */\ + rbp_f_depth = 0; \ + if (a_node != NULL) { \ + if ((a_tree)->rbt_root != &(a_tree)->rbt_nil) { \ + rbp_f_path[rbp_f_depth] = (a_tree)->rbt_root; \ + rbp_f_depth++; \ + rbp_f_node = rbp_f_path[0]; \ + while (true) { \ + int rbp_f_cmp = (a_cmp)((a_node), \ + rbp_f_path[rbp_f_depth-1]); \ + if (rbp_f_cmp < 0) { \ + rbp_f_node = rbp_left_get(a_type, a_field, \ + rbp_f_path[rbp_f_depth-1]); \ + } else if (rbp_f_cmp > 0) { \ + rbp_f_node = rbp_right_get(a_type, a_field, \ + rbp_f_path[rbp_f_depth-1]); \ + } else { \ + break; \ + } \ + assert(rbp_f_node != &(a_tree)->rbt_nil); \ + rbp_f_path[rbp_f_depth] = rbp_f_node; \ + rbp_f_depth++; \ + } \ + } \ + } \ + rbp_f_synced = true; + +#define rb_foreach_end(a_type, a_field, a_tree, a_var) \ + if (rbp_f_synced) { \ + rbp_f_synced = false; \ + continue; \ + } \ + /* Find the successor. */\ + if ((rbp_f_node = rbp_right_get(a_type, a_field, \ + rbp_f_path[rbp_f_depth-1])) != &(a_tree)->rbt_nil) { \ + /* The successor is the left-most node in the right */\ + /* subtree. 
*/\ + rbp_f_path[rbp_f_depth] = rbp_f_node; \ + rbp_f_depth++; \ + while ((rbp_f_node = rbp_left_get(a_type, a_field, \ + rbp_f_path[rbp_f_depth-1])) != &(a_tree)->rbt_nil) { \ + rbp_f_path[rbp_f_depth] = rbp_f_node; \ + rbp_f_depth++; \ + } \ + } else { \ + /* The successor is above the current node. Unwind */\ + /* until a left-leaning edge is removed from the */\ + /* path, or the path is empty. */\ + for (rbp_f_depth--; rbp_f_depth > 0; rbp_f_depth--) { \ + if (rbp_left_get(a_type, a_field, rbp_f_path[rbp_f_depth-1]) \ + == rbp_f_path[rbp_f_depth]) { \ + break; \ + } \ + } \ + } \ + } /* close brace C */ \ + } /* close brace B */ \ +} /* close brace A */ + + + +#define rb_foreach_reverse_begin(a_type, a_field, a_tree, a_var) { /* brace A */ \ + /* Compute the maximum possible tree depth (3X the black height). */\ + unsigned rbp_fr_height; \ + rbp_black_height(a_type, a_field, a_tree, rbp_fr_height); \ + rbp_fr_height *= 3; \ + { /* brace B */ \ + /* Initialize the path to contain the right spine. */\ + a_type *rbp_fr_path[rbp_fr_height]; \ + a_type *rbp_fr_node; \ + bool rbp_fr_synced = false; \ + unsigned rbp_fr_depth = 0; \ + if ((a_tree)->rbt_root != &(a_tree)->rbt_nil) { \ + rbp_fr_path[rbp_fr_depth] = (a_tree)->rbt_root; \ + rbp_fr_depth++; \ + while ((rbp_fr_node = rbp_right_get(a_type, a_field, \ + rbp_fr_path[rbp_fr_depth-1])) != &(a_tree)->rbt_nil) { \ + rbp_fr_path[rbp_fr_depth] = rbp_fr_node; \ + rbp_fr_depth++; \ + } \ + } \ + /* While the path is non-empty, iterate. */\ + while (rbp_fr_depth > 0) { /* brace C */ \ + (a_var) = rbp_fr_path[rbp_fr_depth-1]; + + +/* Only use if modifying the tree during iteration. */ +#define rb_foreach_reverse_prev(a_type, a_field, a_cmp, a_tree, a_node) \ + /* Re-initialize the path to contain the path to a_node. */\ + rbp_fr_depth = 0; \ + if (a_node != NULL) { \ + if ((a_tree)->rbt_root != &(a_tree)->rbt_nil) { \ + rbp_fr_path[rbp_fr_depth] = (a_tree)->rbt_root; \ + rbp_fr_depth++; \ + rbp_fr_node = rbp_fr_path[0]; \ + while (true) { \ + int rbp_fr_cmp = (a_cmp)((a_node), rbp_fr_path[rbp_fr_depth-1]); \ + if (rbp_fr_cmp < 0) { \ + rbp_fr_node = rbp_left_get(a_type, a_field, \ + rbp_fr_path[rbp_fr_depth-1]); \ + } else if (rbp_fr_cmp > 0) { \ + rbp_fr_node = rbp_right_get(a_type, a_field, rbp_fr_path[rbp_fr_depth-1]); \ + } else { \ + break; \ + } \ + assert(rbp_fr_node != &(a_tree)->rbt_nil); \ + rbp_fr_path[rbp_fr_depth] = rbp_fr_node; \ + rbp_fr_depth++; \ + } \ + } \ + } \ + rbp_fr_synced = true; + +#define rb_foreach_reverse_end(a_type, a_field, a_tree, a_var) \ + if (rbp_fr_synced) { \ + rbp_fr_synced = false; \ + continue; \ + } \ + if (rbp_fr_depth == 0) { \ + /* rb_foreach_reverse_sync() was called with a NULL */\ + /* a_node. */\ + break; \ + } \ + /* Find the predecessor. */\ + if ((rbp_fr_node = rbp_left_get(a_type, a_field, \ + rbp_fr_path[rbp_fr_depth-1])) != &(a_tree)->rbt_nil) { \ + /* The predecessor is the right-most node in the left */\ + /* subtree. */\ + rbp_fr_path[rbp_fr_depth] = rbp_fr_node; \ + rbp_fr_depth++; \ + while ((rbp_fr_node = rbp_right_get(a_type, a_field, \ + rbp_fr_path[rbp_fr_depth-1])) != &(a_tree)->rbt_nil) {\ + rbp_fr_path[rbp_fr_depth] = rbp_fr_node; \ + rbp_fr_depth++; \ + } \ + } else { \ + /* The predecessor is above the current node. Unwind */\ + /* until a right-leaning edge is removed from the */\ + /* path, or the path is empty. 
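A read-only pass with the forward iterator looks like the sketch below; begin and end must be textually paired, because rb_foreach_begin() deliberately leaves its braces open.

static u_int32_t
example_count_extents(extent_tree_offset_t *tree)
{
	extent_node_t *node;
	u_int32_t count = 0;

	rb_foreach_begin(extent_node_t, offset_link, tree, node) {
		count++;
	} rb_foreach_end(extent_node_t, offset_link, tree, node)

	return (count);
}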
*/\ + for (rbp_fr_depth--; rbp_fr_depth > 0; rbp_fr_depth--) {\ + if (rbp_right_get(a_type, a_field, rbp_fr_path[rbp_fr_depth-1]) \ + == rbp_fr_path[rbp_fr_depth]) { \ + break; \ + } \ + } \ + } \ + } /* Close brace C */ \ + } /* close brace B */ \ +} /* close brace A*/ + +#endif /* RB_H_ */ diff --git a/bsd/i386/param.h b/bsd/i386/param.h index 03a38d2ce..0eae0fea5 100644 --- a/bsd/i386/param.h +++ b/bsd/i386/param.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -101,9 +101,15 @@ * clusters (MAPPED_MBUFS), MCLBYTES must also be an integral multiple * of the hardware page size. */ -#define MSIZE 256 /* size of an mbuf */ -#define MCLBYTES 2048 /* large enough for ether MTU */ -#define MCLSHIFT 11 +#define MSIZESHIFT 8 /* 256 */ +#define MSIZE (1 << MSIZESHIFT) /* size of an mbuf */ +#define MCLSHIFT 11 /* 2048 */ +#define MCLBYTES (1 << MCLSHIFT) /* size of an mbuf cluster */ +#define MBIGCLSHIFT 12 /* 4096 */ +#define MBIGCLBYTES (1 << MBIGCLSHIFT) /* size of a big cluster */ +#define M16KCLSHIFT 14 /* 16384 */ +#define M16KCLBYTES (1 << M16KCLSHIFT) /* size of a jumbo cluster */ + #define MCLOFSET (MCLBYTES - 1) #ifndef NMBCLUSTERS #ifdef GATEWAY diff --git a/bsd/kern/Makefile b/bsd/kern/Makefile new file mode 100644 index 000000000..c7eecbb12 --- /dev/null +++ b/bsd/kern/Makefile @@ -0,0 +1,26 @@ +export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd +export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def +export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule +export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir + +include $(MakeInc_cmd) +include $(MakeInc_def) + +INSTALL_SHARE_MISC_LIST = \ + trace.codes + +include $(MakeInc_rule) +include $(MakeInc_dir) + +SHARE_MISC_DIR = usr/share/misc + +INSTALL_SHARE_MISC_FILES = \ + $(addprefix $(DSTROOT)/$(SHARE_MISC_DIR)/, $(INSTALL_SHARE_MISC_LIST)) + +$(INSTALL_SHARE_MISC_FILES): $(DSTROOT)/$(SHARE_MISC_DIR)/% : % + @echo Installing $< in $(dir $@) + $(_v) $(MKDIR) $(DSTROOT)/$(SHARE_MISC_DIR); \ + $(RM) $(RMFLAGS) $@; \ + $(INSTALL) $(INSTALL_FLAGS) $< $(dir $@); + +do_build_install: $(INSTALL_SHARE_MISC_FILES) diff --git a/bsd/kern/bsd_init.c b/bsd/kern/bsd_init.c index e2868a40c..dac2c94d8 100644 --- a/bsd/kern/bsd_init.c +++ b/bsd/kern/bsd_init.c @@ -106,6 +106,7 @@ #include <kern/task.h> #include <kern/ast.h> #include <kern/kalloc.h> +#include <mach/mach_host.h> #include <mach/vm_param.h> @@ -132,6 +133,7 @@ #include <sys/mcache.h> /* for mcache_init() */ #include <sys/mbuf.h> /* for mbinit() */ #include <sys/event.h> /* for knote_init() */ +#include <sys/kern_memorystatus.h> /* for kern_memorystatus_init() */ #include <sys/aio_kern.h> /* for aio_init() */ #include <sys/semaphore.h> /* for psem_cache_init() */ #include <net/dlil.h> /* for dlil_init() */ @@ -151,6 +153,8 @@ #include <sys/tty.h> /* for tty_init() */ #include <net/if_utun.h> /* for utun_register_control() */ #include <net/net_str_id.h> /* for net_str_id_init() */ +#include <net/netsrc.h> /* for netsrc_init() */ +#include <kern/assert.h> /* for assert() */ #include <net/init.h> @@ -162,6 +166,10 @@ #include <machine/exec.h> +#if NFSCLIENT +#include <sys/netboot.h> +#endif + #if CONFIG_IMAGEBOOT #include <sys/imageboot.h> #endif @@ -171,6 +179,7 @@ #endif #include <pexpert/pexpert.h> +#include <machine/pal_routines.h> void * get_user_regs(thread_t); /* XXX kludge for <machine/thread.h> */ void IOKitInitializeTime(void); /* XXX */ @@ 
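Back in the bsd/i386/param.h hunk, every mbuf constant is now derived from its shift, which keeps each size/shift pair from drifting apart. The arithmetic, spelled out as C11 compile-time checks purely for illustration (the header itself targets older C):

_Static_assert((1 <<  8) == 256,   "MSIZE: basic mbuf");
_Static_assert((1 << 11) == 2048,  "MCLBYTES: mbuf cluster");
_Static_assert((1 << 12) == 4096,  "MBIGCLBYTES: big cluster");
_Static_assert((1 << 14) == 16384, "M16KCLBYTES: jumbo cluster");
/* MCLOFSET = MCLBYTES - 1 = 0x7ff masks an offset within one cluster. */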
-216,9 +225,8 @@ char domainname[MAXDOMNAMELEN]; int domainnamelen; #if defined(__i386__) || defined(__x86_64__) struct exec_archhandler exec_archhandler_ppc = { - .path = "/usr/libexec/oah/translate", + .path = "/usr/libexec/oah/RosettaNonGrata", }; -const char * const kRosettaStandIn_str = "/usr/libexec/oah/RosettaNonGrata"; #else /* __i386__ */ struct exec_archhandler exec_archhandler_ppc; #endif /* __i386__ */ @@ -243,16 +251,16 @@ extern void file_lock_init(void); extern void kmeminit(void); extern void bsd_bufferinit(void); -extern int srv; +extern int serverperfmode; extern int ncl; vm_map_t bsd_pageable_map; vm_map_t mb_map; -static int bsd_simul_execs = BSD_SIMUL_EXECS; -static int bsd_pageable_map_size = BSD_PAGABLE_MAP_SIZE; -__private_extern__ int execargs_cache_size = BSD_SIMUL_EXECS; -__private_extern__ int execargs_free_count = BSD_SIMUL_EXECS; +static int bsd_simul_execs; +static int bsd_pageable_map_size; +__private_extern__ int execargs_cache_size = 0; +__private_extern__ int execargs_free_count = 0; __private_extern__ vm_offset_t * execargs_cache = NULL; void bsd_exec_setup(int); @@ -262,6 +270,14 @@ void bsd_exec_setup(int); * Intel only. */ __private_extern__ int bootarg_no64exec = 0; +__private_extern__ int bootarg_vnode_cache_defeat = 0; + +/* + * Prevent kernel-based ASLR from being used, for testing. + */ +#if DEVELOPMENT || DEBUG +__private_extern__ int bootarg_disable_aslr = 0; +#endif int cmask = CMASK; extern int customnbuf; @@ -274,6 +290,7 @@ static void parse_bsd_args(void); extern task_t bsd_init_task; extern char init_task_failure_data[]; extern void time_zone_slock_init(void); +extern void select_wait_queue_init(void); static void process_name(const char *, proc_t); static void setconf(void); @@ -289,17 +306,21 @@ extern void sysv_sem_lock_init(void); #if SYSV_MSG extern void sysv_msg_lock_init(void); #endif -extern void pthread_init(void); +#if !defined(SECURE_KERNEL) /* kmem access not enabled by default; can be changed with boot-args */ +/* We don't need to keep this symbol around in RELEASE kernel */ int setup_kmem = 0; +#endif -/* size of kernel trace buffer, disabled by default */ -unsigned int new_nkdbufs = 0; +#if CONFIG_MACF +#if defined (__i386__) || defined (__x86_64__) +/* MACF policy_check configuration flags; see policy_check.c for details */ +int policy_check_flags = 0; -/* mach leak logging */ -int log_leaks = 0; -int turn_on_log_leaks = 0; +extern int check_policy_init(int); +#endif +#endif /* CONFIG_MACF */ extern void stackshot_lock_init(void); @@ -343,8 +364,6 @@ struct rlimit vm_initial_limit_core = { DFLCSIZ, MAXCSIZ }; extern thread_t cloneproc(task_t, proc_t, int); extern int (*mountroot)(void); -extern int netboot_mountroot(void); /* netboot.c */ -extern int netboot_setup(void); lck_grp_t * proc_lck_grp; lck_grp_t * proc_slock_grp; @@ -386,6 +405,10 @@ bsd_init(void) struct vfs_context context; kern_return_t ret; struct ucred temp_cred; + struct posix_cred temp_pcred; +#if NFSCLIENT || CONFIG_IMAGEBOOT + boolean_t netboot = FALSE; +#endif #define bsd_init_kprintf(x...) 
/* kprintf("bsd_init: " x) */ @@ -427,7 +450,7 @@ bsd_init(void) proc_lck_grp_attr= lck_grp_attr_alloc_init(); proc_lck_grp = lck_grp_alloc_init("proc", proc_lck_grp_attr); -#ifndef CONFIG_EMBEDDED +#if CONFIG_FINE_LOCK_GROUPS proc_slock_grp = lck_grp_alloc_init("proc-slock", proc_lck_grp_attr); proc_fdmlock_grp = lck_grp_alloc_init("proc-fdmlock", proc_lck_grp_attr); proc_mlock_grp = lck_grp_alloc_init("proc-mlock", proc_lck_grp_attr); @@ -440,20 +463,21 @@ bsd_init(void) #endif #endif -#ifdef CONFIG_EMBEDDED - proc_list_mlock = lck_mtx_alloc_init(proc_lck_grp, proc_lck_attr); - proc_klist_mlock = lck_mtx_alloc_init(proc_lck_grp, proc_lck_attr); - lck_mtx_init(&kernproc->p_mlock, proc_lck_grp, proc_lck_attr); - lck_mtx_init(&kernproc->p_fdmlock, proc_lck_grp, proc_lck_attr); - lck_spin_init(&kernproc->p_slock, proc_lck_grp, proc_lck_attr); -#else +#if CONFIG_FINE_LOCK_GROUPS proc_list_mlock = lck_mtx_alloc_init(proc_mlock_grp, proc_lck_attr); proc_klist_mlock = lck_mtx_alloc_init(proc_mlock_grp, proc_lck_attr); lck_mtx_init(&kernproc->p_mlock, proc_mlock_grp, proc_lck_attr); lck_mtx_init(&kernproc->p_fdmlock, proc_fdmlock_grp, proc_lck_attr); lck_spin_init(&kernproc->p_slock, proc_slock_grp, proc_lck_attr); +#else + proc_list_mlock = lck_mtx_alloc_init(proc_lck_grp, proc_lck_attr); + proc_klist_mlock = lck_mtx_alloc_init(proc_lck_grp, proc_lck_attr); + lck_mtx_init(&kernproc->p_mlock, proc_lck_grp, proc_lck_attr); + lck_mtx_init(&kernproc->p_fdmlock, proc_lck_grp, proc_lck_attr); + lck_spin_init(&kernproc->p_slock, proc_lck_grp, proc_lck_attr); #endif + assert(bsd_simul_execs != 0); execargs_cache_lock = lck_mtx_alloc_init(proc_lck_grp, proc_lck_attr); execargs_cache_size = bsd_simul_execs; execargs_free_count = bsd_simul_execs; @@ -473,6 +497,14 @@ bsd_init(void) */ mac_policy_initbsd(); kernproc->p_mac_enforce = 0; + +#if defined (__i386__) || defined (__x86_64__) + /* + * We currently only support this on i386/x86_64, as that is the + * only lock code we have instrumented so far. 
+ */ + check_policy_init(policy_check_flags); +#endif #endif /* MAC */ /* @@ -483,15 +515,16 @@ bsd_init(void) kernproc->p_pgrp = &pgrp0; LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); LIST_INIT(&pgrp0.pg_members); -#ifdef CONFIG_EMBEDDED - lck_mtx_init(&pgrp0.pg_mlock, proc_lck_grp, proc_lck_attr); -#else +#ifdef CONFIG_FINE_LOCK_GROUPS lck_mtx_init(&pgrp0.pg_mlock, proc_mlock_grp, proc_lck_attr); +#else + lck_mtx_init(&pgrp0.pg_mlock, proc_lck_grp, proc_lck_attr); #endif /* There is no other bsd thread this point and is safe without pgrp lock */ LIST_INSERT_HEAD(&pgrp0.pg_members, kernproc, p_pglist); kernproc->p_listflag |= P_LIST_INPGRP; kernproc->p_pgrpid = 0; + kernproc->p_uniqueid = 0; pgrp0.pg_session = &session0; pgrp0.pg_membercnt = 1; @@ -499,10 +532,10 @@ bsd_init(void) session0.s_count = 1; session0.s_leader = kernproc; session0.s_listflags = 0; -#ifdef CONFIG_EMBEDDED - lck_mtx_init(&session0.s_mlock, proc_lck_grp, proc_lck_attr); -#else +#ifdef CONFIG_FINE_LOCK_GROUPS lck_mtx_init(&session0.s_mlock, proc_mlock_grp, proc_lck_attr); +#else + lck_mtx_init(&session0.s_mlock, proc_lck_grp, proc_lck_attr); #endif LIST_INSERT_HEAD(SESSHASH(0), &session0, s_hash); proc_list_unlock(); @@ -515,6 +548,14 @@ bsd_init(void) kernproc->p_stat = SRUN; kernproc->p_flag = P_SYSTEM; + kernproc->p_lflag = 0; + kernproc->p_ladvflag = 0; + +#if DEVELOPMENT || DEBUG + if (bootarg_disable_aslr) + kernproc->p_flag |= P_DISABLE_ASLR; +#endif + kernproc->p_nice = NZERO; kernproc->p_pptr = kernproc; @@ -531,15 +572,22 @@ bsd_init(void) */ bsd_init_kprintf("calling bzero\n"); bzero(&temp_cred, sizeof(temp_cred)); - temp_cred.cr_ngroups = 1; + bzero(&temp_pcred, sizeof(temp_pcred)); + temp_pcred.cr_ngroups = 1; - temp_cred.cr_audit.as_aia_p = &audit_default_aia; - /* XXX the following will go away with cr_au */ - temp_cred.cr_au.ai_auid = AU_DEFAUDITID; + temp_cred.cr_audit.as_aia_p = audit_default_aia_p; bsd_init_kprintf("calling kauth_cred_create\n"); + /* + * We have to label the temp cred before we create from it to + * properly set cr_ngroups, or the create will fail. + */ + posix_cred_label(&temp_cred, &temp_pcred); kernproc->p_ucred = kauth_cred_create(&temp_cred); + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(kernproc); + /* give the (already exisiting) initial thread a reference on it */ bsd_init_kprintf("calling kauth_cred_ref\n"); kauth_cred_ref(kernproc->p_ucred); @@ -598,6 +646,7 @@ bsd_init(void) vm_offset_t minimum; bsd_init_kprintf("calling kmem_suballoc\n"); + assert(bsd_pageable_map_size != 0); ret = kmem_suballoc(kernel_map, &minimum, (vm_size_t)bsd_pageable_map_size, @@ -630,15 +679,15 @@ bsd_init(void) bsd_init_kprintf("calling IOKitInitializeTime\n"); IOKitInitializeTime(); - if (turn_on_log_leaks && !new_nkdbufs) - new_nkdbufs = 200000; - start_kern_tracing(new_nkdbufs); - if (turn_on_log_leaks) - log_leaks = 1; - bsd_init_kprintf("calling ubc_init\n"); ubc_init(); + /* + * Initialize device-switches. + */ + bsd_init_kprintf("calling devsw_init() \n"); + devsw_init(); + /* Initialize the file systems. 
*/ bsd_init_kprintf("calling vfsinit\n"); vfsinit(); @@ -702,6 +751,8 @@ bsd_init(void) psem_cache_init(); bsd_init_kprintf("calling time_zone_slock_init\n"); time_zone_slock_init(); + bsd_init_kprintf("calling select_wait_queue_init\n"); + select_wait_queue_init(); /* Stack snapshot facility lock */ stackshot_lock_init(); @@ -729,6 +780,12 @@ bsd_init(void) kernproc->p_fd->fd_cdir = NULL; kernproc->p_fd->fd_rdir = NULL; +#if CONFIG_FREEZE + /* Initialise background hibernation */ + bsd_init_kprintf("calling kern_hibernation_init\n"); + kern_hibernation_init(); +#endif + #if CONFIG_EMBEDDED /* Initialize kernel memory status notifications */ bsd_init_kprintf("calling kern_memorystatus_init\n"); @@ -780,6 +837,10 @@ bsd_init(void) /* register user tunnel kernel control handler */ utun_register_control(); + netsrc_init(); + + /* wait for network domain to finish */ + domainfin(); #endif /* NETWORKING */ bsd_init_kprintf("calling vnode_pager_bootstrap\n"); @@ -794,61 +855,22 @@ bsd_init(void) bsd_init_kprintf("calling inittodr\n"); inittodr(0); -#if CONFIG_EMBEDDED - { - /* print out early VM statistics */ - kern_return_t kr1; - vm_statistics_data_t stat; - mach_msg_type_number_t count; - - count = HOST_VM_INFO_COUNT; - kr1 = host_statistics(host_self(), - HOST_VM_INFO, - (host_info_t)&stat, - &count); - kprintf("Mach Virtual Memory Statistics (page size of 4096) bytes\n" - "Pages free:\t\t\t%u.\n" - "Pages active:\t\t\t%u.\n" - "Pages inactive:\t\t\t%u.\n" - "Pages wired down:\t\t%u.\n" - "\"Translation faults\":\t\t%u.\n" - "Pages copy-on-write:\t\t%u.\n" - "Pages zero filled:\t\t%u.\n" - "Pages reactivated:\t\t%u.\n" - "Pageins:\t\t\t%u.\n" - "Pageouts:\t\t\t%u.\n" - "Object cache: %u hits of %u lookups (%d%% hit rate)\n", - - stat.free_count, - stat.active_count, - stat.inactive_count, - stat.wire_count, - stat.faults, - stat.cow_faults, - stat.zero_fill_count, - stat.reactivations, - stat.pageins, - stat.pageouts, - stat.hits, - stat.lookups, - (stat.hits == 0) ? 100 : - ((stat.lookups * 100) / stat.hits)); - } -#endif /* CONFIG_EMBEDDED */ - /* Mount the root file system. */ while( TRUE) { int err; bsd_init_kprintf("calling setconf\n"); setconf(); +#if NFSCLIENT + netboot = (mountroot == netboot_mountroot); +#endif bsd_init_kprintf("vfs_mountroot\n"); if (0 == (err = vfs_mountroot())) break; rootdevice[0] = '\0'; #if NFSCLIENT - if (mountroot == netboot_mountroot) { + if (netboot) { PE_display_icon( 0, "noroot"); /* XXX a netboot-specific icon would be nicer */ vc_progress_set(FALSE, 0); for (i=1; 1; i*=2) { @@ -880,8 +902,10 @@ bsd_init(void) filedesc0.fd_cdir = rootvnode; #if NFSCLIENT - if (mountroot == netboot_mountroot) { + if (netboot) { int err; + + netboot = TRUE; /* post mount setup */ if ((err = netboot_setup()) != 0) { PE_display_icon( 0, "noroot"); /* XXX a netboot-specific icon would be nicer */ @@ -903,19 +927,12 @@ bsd_init(void) * See if a system disk image is present. If so, mount it and * switch the root vnode to point to it */ - - if(imageboot_needed()) { - int err; - - /* An image was found */ - if((err = imageboot_setup())) { - /* - * this is not fatal. Keep trying to root - * off the original media - */ - printf("%s: imageboot could not find root, %d\n", - __FUNCTION__, err); - } + if (netboot == FALSE && imageboot_needed()) { + /* + * An image was found. No turning back: we're booted + * with a kernel from the disk image. 
+ */ + imageboot_setup(); } #endif /* CONFIG_IMAGEBOOT */ @@ -943,15 +960,12 @@ bsd_init(void) kernproc->p_flag |= P_LP64; printf("Kernel is LP64\n"); #endif + + pal_kernel_announce(); + #if __i386__ || __x86_64__ /* this should be done after the root filesystem is mounted */ error = set_archhandler(kernproc, CPU_TYPE_POWERPC); - // 10/30/08 - gab: <rdar://problem/6324501> - // if default 'translate' can't be found, see if the understudy is available - if (ENOENT == error) { - strlcpy(exec_archhandler_ppc.path, kRosettaStandIn_str, MAXPATHLEN); - error = set_archhandler(kernproc, CPU_TYPE_POWERPC); - } if (error) /* XXX make more generic */ exec_archhandler_ppc.path[0] = 0; #endif @@ -1117,13 +1131,19 @@ parse_bsd_args(void) if (PE_parse_boot_argn("-x", namep, sizeof (namep))) /* safe boot */ boothowto |= RB_SAFEBOOT; - if (PE_parse_boot_argn("-l", namep, sizeof (namep))) /* leaks logging */ - turn_on_log_leaks = 1; - /* disable 64 bit grading */ if (PE_parse_boot_argn("-no64exec", namep, sizeof (namep))) bootarg_no64exec = 1; + /* disable vnode_cache_is_authorized() by setting vnode_cache_defeat */ + if (PE_parse_boot_argn("-vnode_cache_defeat", namep, sizeof (namep))) + bootarg_vnode_cache_defeat = 1; + +#if DEVELOPMENT || DEBUG + if (PE_parse_boot_argn("-disable_aslr", namep, sizeof (namep))) + bootarg_disable_aslr = 1; +#endif + PE_parse_boot_argn("ncl", &ncl, sizeof (ncl)); if (PE_parse_boot_argn("nbuf", &max_nbuf_headers, sizeof (max_nbuf_headers))) { @@ -1132,11 +1152,20 @@ parse_bsd_args(void) #if !defined(SECURE_KERNEL) PE_parse_boot_argn("kmem", &setup_kmem, sizeof (setup_kmem)); #endif - PE_parse_boot_argn("trace", &new_nkdbufs, sizeof (new_nkdbufs)); + +#if CONFIG_MACF +#if defined (__i386__) || defined (__x86_64__) + PE_parse_boot_argn("policy_check", &policy_check_flags, sizeof (policy_check_flags)); +#endif +#endif /* CONFIG_MACF */ if (PE_parse_boot_argn("msgbuf", &msgbuf, sizeof (msgbuf))) { log_setsize(msgbuf); } + + if (PE_parse_boot_argn("-novfscache", namep, sizeof(namep))) { + nc_disabled = 1; + } } void @@ -1165,10 +1194,13 @@ bsd_exec_setup(int scale) break; } - bsd_pageable_map_size = (bsd_simul_execs * (NCARGS + PAGE_SIZE)); + bsd_pageable_map_size = (bsd_simul_execs * BSD_PAGEABLE_SIZE_PER_EXEC); } #if !NFSCLIENT +int +netboot_root(void); + int netboot_root(void) { diff --git a/bsd/kern/bsd_stubs.c b/bsd/kern/bsd_stubs.c index 64127d32a..19da61270 100644 --- a/bsd/kern/bsd_stubs.c +++ b/bsd/kern/bsd_stubs.c @@ -31,8 +31,10 @@ #include <mach/mach_types.h> #include <mach/vm_prot.h> #include <vm/vm_kern.h> +#include <sys/stat.h> #include <vm/vm_map.h> #include <sys/systm.h> +#include <kern/assert.h> #include <sys/conf.h> #include <sys/proc_internal.h> #include <sys/buf.h> /* for SET */ @@ -49,6 +51,9 @@ extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int); void pcb_synch(void); void tbeproc(void *); +TAILQ_HEAD(,devsw_lock) devsw_locks; +lck_mtx_t devsw_lock_list_mtx; +lck_grp_t *devsw_lock_grp; /* Just to satisfy pstat command */ int dmmin, dmmax, dmtext; @@ -280,6 +285,7 @@ cdevsw_remove(int index, struct cdevsw * csw) return(-1); } cdevsw[index] = nocdev; + cdevsw_flags[index] = 0; return(index); } @@ -303,6 +309,28 @@ cdevsw_add_with_bdev(int index, struct cdevsw * csw, int bdev) return (index); } +int +cdevsw_setkqueueok(int index, struct cdevsw *csw, int use_offset) +{ + struct cdevsw *devsw; + uint64_t flags = CDEVSW_SELECT_KQUEUE; + + devsw = &cdevsw[index]; + if ((index < 0) || (index >= nchrdev) || + (memcmp((char *)devsw, + (char *)csw, + sizeof(struct 
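The parse_bsd_args() changes above use PE_parse_boot_argn() in its two usual idioms: presence-only flags read into a scratch buffer whose contents are discarded, and valued arguments read directly into the destination variable. A sketch with made-up argument names:

static int example_flag = 0;
static unsigned int example_val = 0;

static void
example_parse_boot_args(void)
{
	char namep[16];

	/* Flag: existence is the signal; the buffer contents are ignored. */
	if (PE_parse_boot_argn("-example-flag", namep, sizeof (namep)))
		example_flag = 1;

	/* Value: parsed into the variable, left untouched if absent. */
	PE_parse_boot_argn("example-val", &example_val, sizeof (example_val));
}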
cdevsw)) != 0)) { + return(-1); + } + + if (use_offset) { + flags |= CDEVSW_USE_OFFSET; + } + + cdevsw_flags[index] = flags; + return 0; +} + #include <pexpert/pexpert.h> /* for PE_parse_boot_arg */ void @@ -336,3 +364,71 @@ bsd_hostname(char *buf, int bufsize, int *len) } } +void +devsw_lock(dev_t dev, int mode) +{ + devsw_lock_t newlock, tmplock; + int res; + + assert(0 <= major(dev) && major(dev) < nchrdev); + assert(mode == S_IFCHR || mode == S_IFBLK); + + MALLOC(newlock, devsw_lock_t, sizeof(struct devsw_lock), M_TEMP, M_WAITOK | M_ZERO); + newlock->dl_dev = dev; + newlock->dl_thread = current_thread(); + newlock->dl_mode = mode; + + lck_mtx_lock_spin(&devsw_lock_list_mtx); +retry: + TAILQ_FOREACH(tmplock, &devsw_locks, dl_list) { + if (tmplock->dl_dev == dev && tmplock->dl_mode == mode) { + res = msleep(tmplock, &devsw_lock_list_mtx, PVFS, "devsw_lock", NULL); + assert(res == 0); + goto retry; + } + } + + TAILQ_INSERT_TAIL(&devsw_locks, newlock, dl_list); + lck_mtx_unlock(&devsw_lock_list_mtx); + +} +void +devsw_unlock(dev_t dev, int mode) +{ + devsw_lock_t tmplock; + + assert(0 <= major(dev) && major(dev) < nchrdev); + + lck_mtx_lock_spin(&devsw_lock_list_mtx); + + TAILQ_FOREACH(tmplock, &devsw_locks, dl_list) { + if (tmplock->dl_dev == dev && tmplock->dl_mode == mode) { + break; + } + } + + if (tmplock == NULL) { + panic("Trying to unlock, and couldn't find lock."); + } + + if (tmplock->dl_thread != current_thread()) { + panic("Trying to unlock, but I don't hold the lock."); + } + + wakeup(tmplock); + TAILQ_REMOVE(&devsw_locks, tmplock, dl_list); + + lck_mtx_unlock(&devsw_lock_list_mtx); + + FREE(tmplock, M_TEMP); +} + +void +devsw_init() +{ + devsw_lock_grp = lck_grp_alloc_init("devsw", NULL); + assert(devsw_lock_grp != NULL); + + lck_mtx_init(&devsw_lock_list_mtx, devsw_lock_grp, NULL); + TAILQ_INIT(&devsw_locks); +} diff --git a/bsd/kern/decmpfs.c b/bsd/kern/decmpfs.c index d0483c0e4..33e3b3040 100644 --- a/bsd/kern/decmpfs.c +++ b/bsd/kern/decmpfs.c @@ -204,11 +204,12 @@ _decmp_get_func(uint32_t type, int offset) if (IOCatalogueMatchingDriversPresent(providesName)) { // there is a kext that says it will register for this type, so let's wait for it char resourceName[80]; + uint64_t delay = 10000000ULL; // 10 milliseconds. snprintf(resourceName, sizeof(resourceName), "com.apple.AppleFSCompression.Type%u", type); printf("waiting for %s\n", resourceName); while(decompressors[type] == NULL) { lck_rw_done(decompressorsLock); // we have to unlock to allow the kext to register - if (IOServiceWaitForMatchingResource(resourceName, 60)) { + if (IOServiceWaitForMatchingResource(resourceName, delay)) { break; } if (!IOCatalogueMatchingDriversPresent(providesName)) { @@ -217,6 +218,7 @@ _decmp_get_func(uint32_t type, int offset) break; } printf("still waiting for %s\n", resourceName); + delay *= 2; lck_rw_lock_shared(decompressorsLock); } // IOKit says the kext is loaded, so it should be registered too! 
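The decmpfs change just above swaps a fixed wait for geometric backoff: start at 10 ms and double after each miss, so a slow kext load costs a handful of wakeups rather than a tight poll. The pattern in isolation; the cap and both helper routines are illustrative additions, not part of the change.

extern int  example_resource_ready(void);   /* hypothetical predicate */
extern void example_timed_wait(uint64_t);   /* hypothetical timed wait */

static void
example_backoff_wait(void)
{
	uint64_t delay = 10000000ULL;               /* 10 ms in nanoseconds */
	const uint64_t delay_max = 10000000000ULL;  /* 10 s cap, illustrative */

	while (!example_resource_ready()) {
		example_timed_wait(delay);
		if (delay < delay_max)
			delay *= 2;                 /* double on each miss */
	}
}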
@@ -659,11 +661,11 @@ decmpfs_file_is_compressed(vnode_t vp, decmpfs_cnode *cp) return 0; } - if (!vnode_isreg(vp)) { - /* only regular files can be compressed */ - ret = FILE_IS_NOT_COMPRESSED; - goto done; - } +// if (!vnode_isreg(vp)) { +// /* only regular files can be compressed */ +// ret = FILE_IS_NOT_COMPRESSED; +// goto done; +// } mp = vnode_mount(vp); if (mp == NULL) { @@ -1137,7 +1139,7 @@ decompress: else { if (!abort_pagein) { /* commit our pages */ - kr = commit_upl(pl, pl_offset, total_size, UPL_COMMIT_FREE_ON_EMPTY | UPL_COMMIT_INACTIVATE, 0); + kr = commit_upl(pl, pl_offset, total_size, UPL_COMMIT_FREE_ON_EMPTY, 0); } } diff --git a/bsd/kern/imageboot.c b/bsd/kern/imageboot.c index 0ed79dc69..8bc4ede36 100644 --- a/bsd/kern/imageboot.c +++ b/bsd/kern/imageboot.c @@ -35,6 +35,7 @@ #include <sys/filedesc.h> #include <sys/vnode_internal.h> #include <sys/imageboot.h> +#include <kern/assert.h> #include <pexpert/pexpert.h> @@ -52,33 +53,68 @@ extern char rootdevice[]; #endif extern int di_root_image(const char *path, char devname[], dev_t *dev_p); +static boolean_t imageboot_setup_new(void); #define kIBFilePrefix "file://" -int +__private_extern__ int +imageboot_format_is_valid(const char *root_path) +{ + return (strncmp(root_path, kIBFilePrefix, + strlen(kIBFilePrefix)) == 0); +} + +static void +vnode_get_and_drop_always(vnode_t vp) +{ + vnode_getalways(vp); + vnode_rele(vp); + vnode_put(vp); +} + +__private_extern__ int imageboot_needed(void) { int result = 0; char *root_path = NULL; - + DBG_TRACE("%s: checking for presence of root path\n", __FUNCTION__); MALLOC_ZONE(root_path, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); if (root_path == NULL) panic("%s: M_NAMEI zone exhausted", __FUNCTION__); - if(PE_parse_boot_argn("rp", root_path, MAXPATHLEN) == TRUE) { - /* Got it, now verify scheme */ + /* Check for first layer */ + if (!(PE_parse_boot_argn("rp0", root_path, MAXPATHLEN) || + PE_parse_boot_argn("rp", root_path, MAXPATHLEN) || + PE_parse_boot_argn(IMAGEBOOT_ROOT_ARG, root_path, MAXPATHLEN))) { + goto out; + } + + /* Sanity-check first layer */ + if (imageboot_format_is_valid(root_path)) { + DBG_TRACE("%s: Found %s\n", __FUNCTION__, root_path); + } else { + goto out; + } - if (strncmp(root_path, kIBFilePrefix, - strlen(kIBFilePrefix)) == 0) { - DBG_TRACE("%s: Found %s\n", __FUNCTION__, root_path); - result = 1; - } else { - DBG_TRACE("%s: Invalid URL scheme for %s\n", - __FUNCTION__, root_path); - } + result = 1; + + /* Check for second layer */ + if (!(PE_parse_boot_argn("rp1", root_path, MAXPATHLEN) || + PE_parse_boot_argn(IMAGEBOOT_CONTAINER_ARG, root_path, MAXPATHLEN))) { + goto out; + } + + /* Sanity-check second layer */ + if (imageboot_format_is_valid(root_path)) { + DBG_TRACE("%s: Found %s\n", __FUNCTION__, root_path); + } else { + panic("%s: Invalid URL scheme for %s\n", + __FUNCTION__, root_path); } + +out: FREE_ZONE(root_path, MAXPATHLEN, M_NAMEI); return (result); @@ -86,97 +122,193 @@ imageboot_needed(void) /* - * We know there's an image. Attach it, and - * switch over to root off it - * - * NB: p is always kernproc + * Swaps in new root filesystem based on image path. + * Current root filesystem is removed from mount list and + * tagged MNTK_BACKS_ROOT, MNT_ROOTFS is cleared on it, and + * "rootvnode" is reset. Root vnode of currentroot filesystem + * is returned with usecount (no iocount). 
*/ - -int -imageboot_setup() +__private_extern__ int +imageboot_mount_image(const char *root_path, int height) { - dev_t dev; - int error = 0; - char *root_path = NULL; + dev_t dev; + int error; + vnode_t old_rootvnode = NULL; + vnode_t newdp; + mount_t new_rootfs; - DBG_TRACE("%s: entry\n", __FUNCTION__); - - MALLOC_ZONE(root_path, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); - if (root_path == NULL) - return (ENOMEM); - - if(PE_parse_boot_argn("rp", root_path, MAXPATHLEN) == FALSE) { - error = ENOENT; - goto done; - } - - printf("%s: root image url is %s\n", __FUNCTION__, root_path); error = di_root_image(root_path, rootdevice, &dev); - if(error) { - printf("%s: di_root_image failed: %d\n", __FUNCTION__, error); - goto done; + if (error) { + panic("%s: di_root_image failed: %d\n", __FUNCTION__, error); } rootdev = dev; mountroot = NULL; printf("%s: root device 0x%x\n", __FUNCTION__, rootdev); error = vfs_mountroot(); + if (error != 0) { + panic("vfs_mountroot() failed.\n"); + } - if (error == 0 && rootvnode != NULL) { - vnode_t newdp, old_rootvnode; - mount_t new_rootfs, old_rootfs; + /* + * Get the vnode for '/'. + * Set fdp->fd_fd.fd_cdir to reference it. + */ + if (VFS_ROOT(TAILQ_LAST(&mountlist,mntlist), &newdp, vfs_context_kernel())) + panic("%s: cannot find root vnode", __FUNCTION__); - /* - * Get the vnode for '/'. - * Set fdp->fd_fd.fd_cdir to reference it. - */ - if (VFS_ROOT(TAILQ_LAST(&mountlist,mntlist), &newdp, vfs_context_kernel())) - panic("%s: cannot find root vnode", __FUNCTION__); + if (rootvnode != NULL) { + /* remember the old rootvnode, but remove it from mountlist */ + mount_t old_rootfs; old_rootvnode = rootvnode; old_rootfs = rootvnode->v_mount; - + mount_list_remove(old_rootfs); - + mount_lock(old_rootfs); #ifdef CONFIG_IMGSRC_ACCESS old_rootfs->mnt_kern_flag |= MNTK_BACKS_ROOT; #endif /* CONFIG_IMGSRC_ACCESS */ old_rootfs->mnt_flag &= ~MNT_ROOTFS; mount_unlock(old_rootfs); + } - rootvnode = newdp; + /* switch to the new rootvnode */ + rootvnode = newdp; - new_rootfs = rootvnode->v_mount; - mount_lock(new_rootfs); - new_rootfs->mnt_flag |= MNT_ROOTFS; - mount_unlock(new_rootfs); + new_rootfs = rootvnode->v_mount; + mount_lock(new_rootfs); + new_rootfs->mnt_flag |= MNT_ROOTFS; + mount_unlock(new_rootfs); - vnode_ref(newdp); - vnode_put(newdp); - filedesc0.fd_cdir = newdp; - DBG_TRACE("%s: root switched\n", __FUNCTION__); + vnode_ref(newdp); + vnode_put(newdp); + filedesc0.fd_cdir = newdp; + DBG_TRACE("%s: root switched\n", __FUNCTION__); + if (old_rootvnode != NULL) { #ifdef CONFIG_IMGSRC_ACCESS - if (PE_imgsrc_mount_supported()) { - imgsrc_rootvnode = old_rootvnode; - } else { - vnode_getalways(old_rootvnode); - vnode_rele(old_rootvnode); - vnode_put(old_rootvnode); - } + if (height >= 0 && PE_imgsrc_mount_supported()) { + imgsrc_rootvnodes[height] = old_rootvnode; + } else { + vnode_get_and_drop_always(old_rootvnode); + } #else - vnode_getalways(old_rootvnode); - vnode_rele(old_rootvnode); - vnode_put(old_rootvnode); + vnode_get_and_drop_always(old_rootvnode); #endif /* CONFIG_IMGSRC_ACCESS */ + } + return 0; +} +static boolean_t +imageboot_setup_new() +{ + int error; + char *root_path = NULL; + int height = 0; + boolean_t done = FALSE; + + MALLOC_ZONE(root_path, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); + assert(root_path != NULL); + + if(PE_parse_boot_argn(IMAGEBOOT_CONTAINER_ARG, root_path, MAXPATHLEN) == TRUE) { + printf("%s: container image url is %s\n", __FUNCTION__, root_path); + error = imageboot_mount_image(root_path, height); + if (error != 0) { + 
panic("Failed to mount container image."); + } + + height++; + } + + if (PE_parse_boot_argn(IMAGEBOOT_ROOT_ARG, root_path, MAXPATHLEN) == FALSE) { + if (height > 0) { + panic("%s specified without %s?\n", IMAGEBOOT_CONTAINER_ARG, IMAGEBOOT_ROOT_ARG); + } + goto out; } + + printf("%s: root image url is %s\n", __FUNCTION__, root_path); + + error = imageboot_mount_image(root_path, height); + if (error != 0) { + panic("Failed to mount root image."); + } + + done = TRUE; + +out: + FREE_ZONE(root_path, MAXPATHLEN, M_NAMEI); + return done; +} + +__private_extern__ void +imageboot_setup() +{ + int error = 0; + char *root_path = NULL; + + DBG_TRACE("%s: entry\n", __FUNCTION__); + + if (rootvnode == NULL) { + panic("imageboot_setup: rootvnode is NULL."); + } + + /* + * New boot-arg scheme: + * root-dmg : the dmg that will be the root filesystem. + * container-dmg : an optional dmg that contains the root-dmg. + */ + if (imageboot_setup_new()) { + return; + } + + MALLOC_ZONE(root_path, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); + assert(root_path != NULL); + + /* + * Look for outermost disk image to root from. If we're doing a nested boot, + * there's some sense in which the outer image never needs to be the root filesystem, + * but it does need very similar treatment: it must not be unmounted, needs a fake + * device vnode created for it, and should not show up in getfsstat() until exposed + * with MNT_IMGSRC. We just make it the temporary root. + */ + if((PE_parse_boot_argn("rp", root_path, MAXPATHLEN) == FALSE) && + (PE_parse_boot_argn("rp0", root_path, MAXPATHLEN) == FALSE)) { + panic("%s: no valid path to image.\n", __FUNCTION__); + } + + printf("%s: root image url is %s\n", __FUNCTION__, root_path); + + error = imageboot_mount_image(root_path, 0); + if (error) { + panic("Failed on first stage of imageboot."); + } + + /* + * See if we are rooting from a nested image + */ + if(PE_parse_boot_argn("rp1", root_path, MAXPATHLEN) == FALSE) { + goto done; + } + + printf("%s: second level root image url is %s\n", __FUNCTION__, root_path); + + /* + * If we fail to set up second image, it's not a given that we + * can safely root off the first. 
+ */ + error = imageboot_mount_image(root_path, 1); + if (error) { + panic("Failed on second stage of imageboot."); + } + done: FREE_ZONE(root_path, MAXPATHLEN, M_NAMEI); DBG_TRACE("%s: exit\n", __FUNCTION__); - return (error); + return; } diff --git a/bsd/kern/kdebug.c b/bsd/kern/kdebug.c index 3eb9043dd..f7c7fa73a 100644 --- a/bsd/kern/kdebug.c +++ b/bsd/kern/kdebug.c @@ -20,6 +20,7 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ + #include <machine/spl.h> #include <sys/errno.h> @@ -30,6 +31,7 @@ #include <sys/sysctl.h> #include <sys/kdebug.h> #include <sys/sysproto.h> +#include <sys/bsdtask_info.h> #define HZ 100 #include <mach/clock_types.h> @@ -38,11 +40,18 @@ #include <machine/machine_routines.h> #if defined(__i386__) || defined(__x86_64__) -#include <i386/rtclock.h> +#include <i386/rtclock_protos.h> +#include <i386/mp.h> +#include <i386/machine_routines.h> #endif + +#include <kern/clock.h> + #include <kern/thread.h> #include <kern/task.h> #include <kern/debug.h> +#include <kern/kalloc.h> +#include <kern/cpu_data.h> #include <kern/assert.h> #include <vm/vm_kern.h> #include <sys/lock.h> @@ -54,10 +63,14 @@ #include <sys/vnode.h> #include <sys/vnode_internal.h> #include <sys/fcntl.h> +#include <sys/file_internal.h> +#include <sys/ubc.h> #include <mach/mach_host.h> /* for host_info() */ #include <libkern/OSAtomic.h> +#include <machine/pal_routines.h> + /* XXX should have prototypes, but Mach does not provide one */ void task_act_iterate_wth_args(task_t, void(*)(thread_t, void *), void *); int cpu_number(void); /* XXX <machine/...> include path broken */ @@ -74,18 +87,14 @@ int kdbg_setrtcdec(kd_regtype *); int kdbg_setpidex(kd_regtype *); int kdbg_setpid(kd_regtype *); void kdbg_mapinit(void); -int kdbg_reinit(void); -int kdbg_bootstrap(void); +int kdbg_reinit(boolean_t); +int kdbg_bootstrap(boolean_t); -static int create_buffers(void); +static int create_buffers(boolean_t); static void delete_buffers(void); extern void IOSleep(int); -#ifdef ppc -extern uint32_t maxDec; -#endif - /* trace enable status */ unsigned int kdebug_enable = 0; @@ -96,23 +105,38 @@ unsigned int kd_entropy_count = 0; unsigned int kd_entropy_indx = 0; vm_offset_t kd_entropy_buftomem = 0; +#define MAX_ENTROPY_COUNT (128 * 1024) + #define SLOW_NOLOG 0x01 #define SLOW_CHECKS 0x02 #define SLOW_ENTROPY 0x04 - -unsigned int kdebug_slowcheck = SLOW_NOLOG; +#define SLOW_CHUD 0x08 unsigned int kd_cpus; #define EVENTS_PER_STORAGE_UNIT 2048 #define MIN_STORAGE_UNITS_PER_CPU 4 +#define POINTER_FROM_KDS_PTR(x) (&kd_bufs[x.buffer_index].kdsb_addr[x.offset]) + +#define NATIVE_TRACE_FACILITY + +union kds_ptr { + struct { + uint32_t buffer_index:21; + uint16_t offset:11; + }; + uint32_t raw; +}; + struct kd_storage { - struct kd_storage *kds_next; - kd_buf *kds_bufptr; - kd_buf *kds_buflast; - kd_buf *kds_readlast; + union kds_ptr kds_next; + uint32_t kds_bufindx; + uint32_t kds_bufcnt; + uint32_t kds_readlast; + boolean_t kds_lostevents; + uint64_t kds_timestamp; kd_buf kds_records[EVENTS_PER_STORAGE_UNIT]; }; @@ -120,34 +144,52 @@ struct kd_storage { #define MAX_BUFFER_SIZE (1024 * 1024 * 128) #define N_STORAGE_UNITS_PER_BUFFER (MAX_BUFFER_SIZE / sizeof(struct kd_storage)) - struct kd_storage_buffers { struct kd_storage *kdsb_addr; uint32_t kdsb_size; }; - -struct kd_storage *kds_free_list = NULL; +#define KDS_PTR_NULL 0xffffffff struct kd_storage_buffers *kd_bufs = NULL; int n_storage_units = 0; int n_storage_buffers = 0; +int n_storage_threshold = 0; +int kds_waiter = 0; +int kde_waiter = 0; +#pragma pack(0) struct kd_bufinfo 
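Links between kdebug storage units above are no longer raw pointers but 32-bit kds_ptr values: 21 bits of storage-buffer index plus 11 bits of offset within that buffer, with 0xffffffff reserved as the null link, and POINTER_FROM_KDS_PTR() turning a value back into an address. A self-contained sketch of the pack/resolve round trip; the buffer shapes are invented, and both bitfields are declared uint32_t for portability:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define KDS_PTR_NULL 0xffffffffu

    union kds_ptr {
        struct {
            uint32_t buffer_index:21;   /* which storage buffer */
            uint32_t offset:11;         /* element within that buffer */
        };
        uint32_t raw;                   /* whole link, for null tests */
    };

    struct kd_storage { int payload; };

    /* Two tiny "storage buffers"; the real ones hold kd_storage units. */
    static struct kd_storage bufs0[4], bufs1[4];
    static struct kd_storage *kdsb_addr[] = { bufs0, bufs1 };

    /* Mirrors what POINTER_FROM_KDS_PTR() does in the patch. */
    static struct kd_storage *resolve(union kds_ptr p)
    {
        return &kdsb_addr[p.buffer_index][p.offset];
    }

    int main(void)
    {
        union kds_ptr p = { .raw = KDS_PTR_NULL };
        assert(p.raw == KDS_PTR_NULL);  /* null is tested on .raw */

        p.buffer_index = 1;             /* pack: buffer 1, element 2 */
        p.offset = 2;
        resolve(p)->payload = 42;

        printf("bufs1[2].payload = %d\n", bufs1[2].payload);
        return 0;
    }

One plausible motivation: index-based links stay meaningful when the buffers are shared with an external trace handler (the trace_handler_map_* calls elsewhere in this patch), and they halve the link size on LP64.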
{ - struct kd_storage *kd_list_head; - struct kd_storage *kd_list_tail; - struct kd_storage *kd_active; - uint64_t kd_prev_timebase; + union kds_ptr kd_list_head; + union kds_ptr kd_list_tail; + boolean_t kd_lostevents; + uint32_t _pad; + uint64_t kd_prev_timebase; + uint32_t num_bufs; } __attribute__(( aligned(CPU_CACHE_SIZE) )); +struct kd_ctrl_page_t { + union kds_ptr kds_free_list; + uint32_t enabled :1; + uint32_t _pad0 :31; + int kds_inuse_count; + uint32_t kdebug_flags; + uint32_t kdebug_slowcheck; + uint32_t _pad1; + struct { + uint64_t tsc_base; + uint64_t ns_base; + } cpu_timebase[32]; // should be max number of actual logical cpus +} kd_ctrl_page = {.kds_free_list = {.raw = KDS_PTR_NULL}, .enabled = 0, .kds_inuse_count = 0, .kdebug_flags = 0, .kdebug_slowcheck = SLOW_NOLOG}; +#pragma pack() + struct kd_bufinfo *kdbip = NULL; -#define KDCOPYBUF_COUNT 2048 +#define KDCOPYBUF_COUNT 8192 #define KDCOPYBUF_SIZE (KDCOPYBUF_COUNT * sizeof(kd_buf)) kd_buf *kdcopybuf = NULL; unsigned int nkdbufs = 8192; -unsigned int kdebug_flags = 0; unsigned int kdlog_beg=0; unsigned int kdlog_end=0; unsigned int kdlog_value1=0; @@ -155,6 +197,7 @@ unsigned int kdlog_value2=0; unsigned int kdlog_value3=0; unsigned int kdlog_value4=0; +static lck_spin_t * kdw_spin_lock; static lck_spin_t * kds_spin_lock; static lck_mtx_t * kd_trace_mtx_sysctl; static lck_grp_t * kd_trace_mtx_sysctl_grp; @@ -185,10 +228,21 @@ unsigned int kd_mapcount = 0; vm_offset_t kd_maptomem = 0; off_t RAW_file_offset = 0; +int RAW_file_written = 0; + +#define RAW_FLUSH_SIZE (2 * 1024 * 1024) + pid_t global_state_pid = -1; /* Used to control exclusive use of kd_buffer */ -#define DBG_FUNC_MASK 0xfffffffc +#define DBG_FUNC_MASK 0xfffffffc + +#define INTERRUPT 0x01050000 +#define MACH_vmfault 0x01300008 +#define BSC_SysCall 0x040c0000 +#define MACH_SysCall 0x010c0000 +#define DBG_SCALL_MASK 0xffff0000 + /* task to string structure */ struct tts @@ -202,10 +256,10 @@ typedef struct tts tts_t; struct krt { - kd_threadmap *map; /* pointer to the map buffer */ - int count; - int maxcount; - struct tts *atts; + kd_threadmap *map; /* pointer to the map buffer */ + int count; + int maxcount; + struct tts *atts; }; typedef struct krt krt_t; @@ -215,24 +269,102 @@ typedef void (*kd_chudhook_fn) (uint32_t debugid, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5); -kd_chudhook_fn kdebug_chudhook = 0; /* pointer to CHUD toolkit function */ +volatile kd_chudhook_fn kdebug_chudhook = 0; /* pointer to CHUD toolkit function */ __private_extern__ void stackshot_lock_init( void ) __attribute__((section("__TEXT, initcode"))); -/* Support syscall SYS_kdebug_trace */ -int -kdebug_trace(__unused struct proc *p, struct kdebug_trace_args *uap, __unused int32_t *retval) +static void +kdbg_set_tracing_enabled(boolean_t enabled) { - if ( (kdebug_enable == 0) ) - return(EINVAL); - - kernel_debug(uap->code, uap->arg1, uap->arg2, uap->arg3, uap->arg4, 0); - return(0); + int s = ml_set_interrupts_enabled(FALSE); + lck_spin_lock(kds_spin_lock); + + if (enabled) { + kdebug_enable |= KDEBUG_ENABLE_TRACE; + kd_ctrl_page.kdebug_slowcheck &= ~SLOW_NOLOG; + kd_ctrl_page.enabled = 1; + } else { + kdebug_enable &= ~KDEBUG_ENABLE_TRACE; + kd_ctrl_page.kdebug_slowcheck |= SLOW_NOLOG; + kd_ctrl_page.enabled = 0; + } + lck_spin_unlock(kds_spin_lock); + ml_set_interrupts_enabled(s); } +static void +kdbg_set_flags(int slowflag, int enableflag, boolean_t enabled) +{ + int s = ml_set_interrupts_enabled(FALSE); + lck_spin_lock(kds_spin_lock); + + if 
(enabled) { + kd_ctrl_page.kdebug_slowcheck |= slowflag; + kdebug_enable |= enableflag; + } else { + kd_ctrl_page.kdebug_slowcheck &= ~slowflag; + kdebug_enable &= ~enableflag; + } + lck_spin_unlock(kds_spin_lock); + ml_set_interrupts_enabled(s); +} + + +#ifdef NATIVE_TRACE_FACILITY +void +disable_wrap(uint32_t *old_slowcheck, uint32_t *old_flags) +{ + int s = ml_set_interrupts_enabled(FALSE); + lck_spin_lock(kds_spin_lock); + + *old_slowcheck = kd_ctrl_page.kdebug_slowcheck; + *old_flags = kd_ctrl_page.kdebug_flags; + + kd_ctrl_page.kdebug_flags &= ~KDBG_WRAPPED; + kd_ctrl_page.kdebug_flags |= KDBG_NOWRAP; + + lck_spin_unlock(kds_spin_lock); + ml_set_interrupts_enabled(s); +} + +void +enable_wrap(uint32_t old_slowcheck, boolean_t lostevents) +{ + int s = ml_set_interrupts_enabled(FALSE); + lck_spin_lock(kds_spin_lock); + + kd_ctrl_page.kdebug_flags &= ~KDBG_NOWRAP; + + if ( !(old_slowcheck & SLOW_NOLOG)) + kd_ctrl_page.kdebug_slowcheck &= ~SLOW_NOLOG; + + if (lostevents == TRUE) + kd_ctrl_page.kdebug_flags |= KDBG_WRAPPED; + + lck_spin_unlock(kds_spin_lock); + ml_set_interrupts_enabled(s); +} + +void trace_set_timebases(__unused uint64_t tsc, __unused uint64_t ns) +{ +} +#else +/* Begin functions that are defined twice */ +void trace_set_timebases(uint64_t tsc, uint64_t ns) +{ + int cpu = cpu_number(); + kd_ctrl_page.cpu_timebase[cpu].tsc_base = tsc; + kd_ctrl_page.cpu_timebase[cpu].ns_base = ns; +} + +#endif static int -create_buffers(void) +#if defined(__i386__) || defined(__x86_64__) +create_buffers(boolean_t early_trace) +#else +create_buffers(__unused boolean_t early_trace) +#endif { int i; int p_buffer_size; @@ -240,6 +372,42 @@ create_buffers(void) int f_buffers; int error = 0; + /* + * get the number of cpus and cache it + */ +#if defined(__i386__) || defined(__x86_64__) + if (early_trace == TRUE) { + /* + * we've started tracing before the + * IOKit has even started running... 
just + * use the static max value + */ + kd_cpus = max_ncpus; + } else +#endif + { + host_basic_info_data_t hinfo; + mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT; + +#define BSD_HOST 1 + host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count); + kd_cpus = hinfo.logical_cpu_max; + } + if (kmem_alloc(kernel_map, (vm_offset_t *)&kdbip, sizeof(struct kd_bufinfo) * kd_cpus) != KERN_SUCCESS) { + error = ENOSPC; + goto out; + } + + trace_handler_map_bufinfo((uintptr_t)kdbip, sizeof(struct kd_bufinfo) * kd_cpus); + +#if !defined(NATIVE_TRACE_FACILITY) + for(i=0;i<(int)kd_cpus;i++) { + get_nanotime_timebases(i, + &kd_ctrl_page.cpu_timebase[i].tsc_base, + &kd_ctrl_page.cpu_timebase[i].ns_base); + } +#endif + if (nkdbufs < (kd_cpus * EVENTS_PER_STORAGE_UNIT * MIN_STORAGE_UNITS_PER_CPU)) n_storage_units = kd_cpus * MIN_STORAGE_UNITS_PER_CPU; else @@ -275,6 +443,8 @@ create_buffers(void) error = ENOSPC; goto out; } + bzero(kd_bufs[i].kdsb_addr, f_buffer_size); + kd_bufs[i].kdsb_size = f_buffer_size; } if (p_buffer_size) { @@ -282,8 +452,11 @@ create_buffers(void) error = ENOSPC; goto out; } + bzero(kd_bufs[i].kdsb_addr, p_buffer_size); + kd_bufs[i].kdsb_size = p_buffer_size; } + n_storage_units = 0; for (i = 0; i < n_storage_buffers; i++) { struct kd_storage *kds; @@ -293,16 +466,31 @@ create_buffers(void) n_elements = kd_bufs[i].kdsb_size / sizeof(struct kd_storage); kds = kd_bufs[i].kdsb_addr; + trace_handler_map_buffer(i, (uintptr_t)kd_bufs[i].kdsb_addr, kd_bufs[i].kdsb_size); + for (n = 0; n < n_elements; n++) { - kds[n].kds_next = kds_free_list; - kds_free_list = &kds[n]; + kds[n].kds_next.buffer_index = kd_ctrl_page.kds_free_list.buffer_index; + kds[n].kds_next.offset = kd_ctrl_page.kds_free_list.offset; - kds[n].kds_buflast = &kds[n].kds_records[EVENTS_PER_STORAGE_UNIT]; + kd_ctrl_page.kds_free_list.buffer_index = i; + kd_ctrl_page.kds_free_list.offset = n; } + n_storage_units += n_elements; } + bzero((char *)kdbip, sizeof(struct kd_bufinfo) * kd_cpus); - kdebug_flags |= KDBG_BUFINIT; + for (i = 0; i < (int)kd_cpus; i++) { + kdbip[i].kd_list_head.raw = KDS_PTR_NULL; + kdbip[i].kd_list_tail.raw = KDS_PTR_NULL; + kdbip[i].kd_lostevents = FALSE; + kdbip[i].num_bufs = 0; + } + + kd_ctrl_page.kdebug_flags |= KDBG_BUFINIT; + + kd_ctrl_page.kds_inuse_count = 0; + n_storage_threshold = n_storage_units / 2; out: if (error) delete_buffers(); @@ -318,8 +506,10 @@ delete_buffers(void) if (kd_bufs) { for (i = 0; i < n_storage_buffers; i++) { - if (kd_bufs[i].kdsb_addr) + if (kd_bufs[i].kdsb_addr) { kmem_free(kernel_map, (vm_offset_t)kd_bufs[i].kdsb_addr, (vm_size_t)kd_bufs[i].kdsb_size); + trace_handler_unmap_buffer(i); + } } kmem_free(kernel_map, (vm_offset_t)kd_bufs, (vm_size_t)(n_storage_buffers * sizeof(struct kd_storage_buffers))); @@ -331,58 +521,92 @@ delete_buffers(void) kdcopybuf = NULL; } - kds_free_list = NULL; + kd_ctrl_page.kds_free_list.raw = KDS_PTR_NULL; - kdebug_flags &= ~KDBG_BUFINIT; + if (kdbip) { + trace_handler_unmap_bufinfo(); + + kmem_free(kernel_map, (vm_offset_t)kdbip, sizeof(struct kd_bufinfo) * kd_cpus); + + kdbip = NULL; + } + kd_ctrl_page.kdebug_flags &= ~KDBG_BUFINIT; } -static void -release_storage_unit(struct kd_bufinfo *kdbp, struct kd_storage *kdsp) +#ifdef NATIVE_TRACE_FACILITY +void +release_storage_unit(int cpu, uint32_t kdsp_raw) { - int s = 0; + struct kd_storage *kdsp_actual; + struct kd_bufinfo *kdbp; + union kds_ptr kdsp; + + kdsp.raw = kdsp_raw; + s = ml_set_interrupts_enabled(FALSE); lck_spin_lock(kds_spin_lock); - if (kdsp == 
kdbp->kd_list_head) { + kdbp = &kdbip[cpu]; + + if (kdsp.raw == kdbp->kd_list_head.raw) { /* - * its possible for the storage unit pointed to + * it's possible for the storage unit pointed to * by kdsp to have already been stolen... so - * check to see if its still the head of the list + * check to see if it's still the head of the list * now that we're behind the lock that protects * adding and removing from the queue... * since we only ever release and steal units from - * that position, if its no longer the head + * that position, if it's no longer the head * we have nothing to do in this context */ - kdbp->kd_list_head = kdsp->kds_next; + kdsp_actual = POINTER_FROM_KDS_PTR(kdsp); + kdbp->kd_list_head = kdsp_actual->kds_next; - kdsp->kds_next = kds_free_list; - kds_free_list = kdsp; + kdsp_actual->kds_next = kd_ctrl_page.kds_free_list; + kd_ctrl_page.kds_free_list = kdsp; + + kd_ctrl_page.kds_inuse_count--; } lck_spin_unlock(kds_spin_lock); ml_set_interrupts_enabled(s); } -/* - * Interrupts are disabled when we enter this routine. - */ -static struct kd_storage * -allocate_storage_unit(struct kd_bufinfo *kdbp) +boolean_t +allocate_storage_unit(int cpu) { - struct kd_storage *kdsp; - struct kd_bufinfo *kdbp_vict, *kdbp_try; + union kds_ptr kdsp; + struct kd_storage *kdsp_actual; + struct kd_bufinfo *kdbp, *kdbp_vict, *kdbp_try; uint64_t oldest_ts, ts; + boolean_t retval = TRUE; + int s = 0; + s = ml_set_interrupts_enabled(FALSE); lck_spin_lock(kds_spin_lock); - if ((kdsp = kds_free_list)) - kds_free_list = kdsp->kds_next; - else { - if (kdebug_flags & KDBG_NOWRAP) { - kdebug_slowcheck |= SLOW_NOLOG; + kdbp = &kdbip[cpu]; + + /* If someone beat us to the allocate, return success */ + if (kdbp->kd_list_tail.raw != KDS_PTR_NULL) { + kdsp_actual = POINTER_FROM_KDS_PTR(kdbp->kd_list_tail); + + if (kdsp_actual->kds_bufindx < EVENTS_PER_STORAGE_UNIT) + goto out; + } + + if ((kdsp = kd_ctrl_page.kds_free_list).raw != KDS_PTR_NULL) { + kdsp_actual = POINTER_FROM_KDS_PTR(kdsp); + kd_ctrl_page.kds_free_list = kdsp_actual->kds_next; + + kd_ctrl_page.kds_inuse_count++; + } else { + if (kd_ctrl_page.kdebug_flags & KDBG_NOWRAP) { + kd_ctrl_page.kdebug_slowcheck |= SLOW_NOLOG; + kdbp->kd_lostevents = TRUE; + retval = FALSE; goto out; } kdbp_vict = NULL; @@ -390,22 +614,25 @@ allocate_storage_unit(struct kd_bufinfo *kdbp) for (kdbp_try = &kdbip[0]; kdbp_try < &kdbip[kd_cpus]; kdbp_try++) { - if ((kdsp = kdbp_try->kd_list_head) == NULL) { + if (kdbp_try->kd_list_head.raw == KDS_PTR_NULL) { /* * no storage unit to steal */ continue; } - if (kdsp == kdbp_try->kd_active) { + + kdsp_actual = POINTER_FROM_KDS_PTR(kdbp_try->kd_list_head); + + if (kdsp_actual->kds_bufcnt < EVENTS_PER_STORAGE_UNIT) { /* * make sure we don't steal the storage unit - * being actively recorded to... this state - * also implies that this is the only unit assigned - * to this CPU, so we can immediately move on + * being actively recorded to...
need to + * move on because we don't want an out-of-order + * set of events showing up later */ continue; } - ts = kdbg_get_timestamp(&(kdbp_try->kd_list_head->kds_records[0])); + ts = kdbg_get_timestamp(&kdsp_actual->kds_records[0]); if (ts < oldest_ts) { /* @@ -417,37 +644,52 @@ allocate_storage_unit(struct kd_bufinfo *kdbp) kdbp_vict = kdbp_try; } } -#if 1 if (kdbp_vict == NULL) { kdebug_enable = 0; - - panic("allocate_storage_unit: no storage units available\n"); + kd_ctrl_page.enabled = 0; + retval = FALSE; + goto out; } -#endif kdsp = kdbp_vict->kd_list_head; + kdsp_actual = POINTER_FROM_KDS_PTR(kdsp); - kdbp_vict->kd_list_head = kdsp->kds_next; + kdbp_vict->kd_list_head = kdsp_actual->kds_next; - kdebug_flags |= KDBG_WRAPPED; + kd_ctrl_page.kdebug_flags |= KDBG_WRAPPED; } - kdsp->kds_next = NULL; - kdsp->kds_bufptr = &kdsp->kds_records[0]; - kdsp->kds_readlast = kdsp->kds_bufptr; + kdsp_actual->kds_timestamp = mach_absolute_time(); + kdsp_actual->kds_next.raw = KDS_PTR_NULL; + kdsp_actual->kds_bufcnt = 0; + kdsp_actual->kds_readlast = 0; + + kdsp_actual->kds_lostevents = kdbp->kd_lostevents; + kdbp->kd_lostevents = FALSE; + kdsp_actual->kds_bufindx = 0; - if (kdbp->kd_list_head == NULL) + if (kdbp->kd_list_head.raw == KDS_PTR_NULL) kdbp->kd_list_head = kdsp; else - kdbp->kd_list_tail->kds_next = kdsp; + POINTER_FROM_KDS_PTR(kdbp->kd_list_tail)->kds_next = kdsp; kdbp->kd_list_tail = kdsp; out: lck_spin_unlock(kds_spin_lock); + ml_set_interrupts_enabled(s); - return (kdsp); + return (retval); } +#endif +void +kernel_debug_internal( + uint32_t debugid, + uintptr_t arg1, + uintptr_t arg2, + uintptr_t arg3, + uintptr_t arg4, + uintptr_t arg5, + int entropy_flag); - -static void +__attribute__((always_inline)) void kernel_debug_internal( uint32_t debugid, uintptr_t arg1, @@ -459,92 +701,118 @@ kernel_debug_internal( { struct proc *curproc; uint64_t now; - int s; + uint32_t bindx; + boolean_t s; kd_buf *kd; int cpu; struct kd_bufinfo *kdbp; - struct kd_storage *kdsp; + struct kd_storage *kdsp_actual; - s = ml_set_interrupts_enabled(FALSE); - now = mach_absolute_time() & KDBG_TIMESTAMP_MASK; - cpu = cpu_number(); - - if (kdebug_enable & KDEBUG_ENABLE_CHUD) { - if (kdebug_chudhook) - kdebug_chudhook(debugid, arg1, arg2, arg3, arg4, arg5); - - if ( !(kdebug_enable & (KDEBUG_ENABLE_ENTROPY | KDEBUG_ENABLE_TRACE))) - goto out; - } - if (kdebug_slowcheck == 0) - goto record_trace; + if (kd_ctrl_page.kdebug_slowcheck) { - if (entropy_flag && (kdebug_enable & KDEBUG_ENABLE_ENTROPY)) { - if (kd_entropy_indx < kd_entropy_count) { - kd_entropy_buffer [ kd_entropy_indx] = mach_absolute_time(); - kd_entropy_indx++; - } - - if (kd_entropy_indx == kd_entropy_count) { + if (kdebug_enable & KDEBUG_ENABLE_CHUD) { + kd_chudhook_fn chudhook; /* - * Disable entropy collection + * Mask interrupts to minimize the interval across + * which the driver providing the hook could be + * unloaded. 
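kdebug_chudhook is now volatile and, as the comment above explains, is copied into a local with interrupts masked before being called, so the hook cannot be torn down between the NULL test and the call. A user-space analogue of that snapshot-then-call pattern, with a C11 atomic standing in for volatile plus interrupt masking:

    #include <stdatomic.h>
    #include <stddef.h>
    #include <stdio.h>

    typedef void (*hook_fn)(int);

    /* Hot-swappable hook, installed and cleared at any time. */
    static _Atomic hook_fn g_hook;

    static void sample_hook(int v) { printf("hook got %d\n", v); }

    /* Snapshot the pointer once, then test and call the snapshot, never
     * the shared variable, mirroring the chudhook local above. */
    static void fire(int v)
    {
        hook_fn h = atomic_load(&g_hook);
        if (h)
            h(v);
    }

    int main(void)
    {
        atomic_store(&g_hook, sample_hook);
        fire(7);
        atomic_store(&g_hook, NULL);
        fire(8);                        /* safely skipped */
        return 0;
    }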
*/ - kdebug_enable &= ~KDEBUG_ENABLE_ENTROPY; - kdebug_slowcheck &= ~SLOW_ENTROPY; + s = ml_set_interrupts_enabled(FALSE); + chudhook = kdebug_chudhook; + if (chudhook) + chudhook(debugid, arg1, arg2, arg3, arg4, arg5); + ml_set_interrupts_enabled(s); } - } - if ( (kdebug_slowcheck & SLOW_NOLOG) ) - goto out; + if ((kdebug_enable & KDEBUG_ENABLE_ENTROPY) && entropy_flag) { + + now = mach_absolute_time(); + + s = ml_set_interrupts_enabled(FALSE); + lck_spin_lock(kds_spin_lock); + + if (kdebug_enable & KDEBUG_ENABLE_ENTROPY) { + + if (kd_entropy_indx < kd_entropy_count) { + kd_entropy_buffer[kd_entropy_indx] = now; + kd_entropy_indx++; + } + if (kd_entropy_indx == kd_entropy_count) { + /* + * Disable entropy collection + */ + kdebug_enable &= ~KDEBUG_ENABLE_ENTROPY; + kd_ctrl_page.kdebug_slowcheck &= ~SLOW_ENTROPY; + } + } + lck_spin_unlock(kds_spin_lock); + ml_set_interrupts_enabled(s); + } + if ( (kd_ctrl_page.kdebug_slowcheck & SLOW_NOLOG) || !(kdebug_enable & KDEBUG_ENABLE_TRACE)) + goto out1; - if (kdebug_flags & KDBG_PIDCHECK) { - /* - * If kdebug flag is not set for current proc, return - */ - curproc = current_proc(); + if ( !ml_at_interrupt_context()) { + if (kd_ctrl_page.kdebug_flags & KDBG_PIDCHECK) { + /* + * If kdebug flag is not set for current proc, return + */ + curproc = current_proc(); - if ((curproc && !(curproc->p_kdebug)) && - ((debugid & 0xffff0000) != (MACHDBG_CODE(DBG_MACH_SCHED, 0) | DBG_FUNC_NONE))) - goto out; - } - else if (kdebug_flags & KDBG_PIDEXCLUDE) { - /* - * If kdebug flag is set for current proc, return - */ - curproc = current_proc(); + if ((curproc && !(curproc->p_kdebug)) && + ((debugid & 0xffff0000) != (MACHDBG_CODE(DBG_MACH_SCHED, 0) | DBG_FUNC_NONE))) + goto out1; + } + else if (kd_ctrl_page.kdebug_flags & KDBG_PIDEXCLUDE) { + /* + * If kdebug flag is set for current proc, return + */ + curproc = current_proc(); - if ((curproc && curproc->p_kdebug) && - ((debugid & 0xffff0000) != (MACHDBG_CODE(DBG_MACH_SCHED, 0) | DBG_FUNC_NONE))) - goto out; - } - if (kdebug_flags & KDBG_RANGECHECK) { - if ((debugid < kdlog_beg) - || ((debugid >= kdlog_end) && (debugid >> 24 != DBG_TRACE))) - goto out; - } - else if (kdebug_flags & KDBG_VALCHECK) { - if ((debugid & DBG_FUNC_MASK) != kdlog_value1 && - (debugid & DBG_FUNC_MASK) != kdlog_value2 && - (debugid & DBG_FUNC_MASK) != kdlog_value3 && - (debugid & DBG_FUNC_MASK) != kdlog_value4 && - (debugid >> 24 != DBG_TRACE)) - goto out; + if ((curproc && curproc->p_kdebug) && + ((debugid & 0xffff0000) != (MACHDBG_CODE(DBG_MACH_SCHED, 0) | DBG_FUNC_NONE))) + goto out1; + } + } + if (kd_ctrl_page.kdebug_flags & KDBG_RANGECHECK) { + if ((debugid < kdlog_beg) + || ((debugid >= kdlog_end) && (debugid >> 24 != DBG_TRACE))) + goto out1; + } + else if (kd_ctrl_page.kdebug_flags & KDBG_VALCHECK) { + if ((debugid & DBG_FUNC_MASK) != kdlog_value1 && + (debugid & DBG_FUNC_MASK) != kdlog_value2 && + (debugid & DBG_FUNC_MASK) != kdlog_value3 && + (debugid & DBG_FUNC_MASK) != kdlog_value4 && + (debugid >> 24 != DBG_TRACE)) + goto out1; + } } - -record_trace: + disable_preemption(); + cpu = cpu_number(); kdbp = &kdbip[cpu]; - - if ((kdsp = kdbp->kd_active) == NULL) { - if ((kdsp = allocate_storage_unit(kdbp)) == NULL) { +retry_q: + if (kdbp->kd_list_tail.raw != KDS_PTR_NULL) { + kdsp_actual = POINTER_FROM_KDS_PTR(kdbp->kd_list_tail); + bindx = kdsp_actual->kds_bufindx; + } else + kdsp_actual = NULL; + + if (kdsp_actual == NULL || bindx >= EVENTS_PER_STORAGE_UNIT) { + if (allocate_storage_unit(cpu) == FALSE) { /* * this can only happen if 
wrapping * has been disabled */ goto out; } - kdbp->kd_active = kdsp; + goto retry_q; } - kd = kdsp->kds_bufptr; + now = mach_absolute_time() & KDBG_TIMESTAMP_MASK; + + if ( !OSCompareAndSwap(bindx, bindx + 1, &kdsp_actual->kds_bufindx)) + goto retry_q; + + kd = &kdsp_actual->kds_records[bindx]; kd->debugid = debugid; kd->arg1 = arg1; @@ -555,12 +823,56 @@ record_trace: kdbg_set_timestamp_and_cpu(kd, now, cpu); - kdsp->kds_bufptr++; - - if (kdsp->kds_bufptr >= kdsp->kds_buflast) - kdbp->kd_active = NULL; + OSAddAtomic(1, &kdsp_actual->kds_bufcnt); out: - ml_set_interrupts_enabled(s); + enable_preemption(); +out1: + if ((kds_waiter && kd_ctrl_page.kds_inuse_count >= n_storage_threshold) || + (kde_waiter && kd_entropy_indx >= kd_entropy_count)) { + uint32_t etype; + uint32_t stype; + + etype = debugid & DBG_FUNC_MASK; + stype = debugid & DBG_SCALL_MASK; + + if (etype == INTERRUPT || etype == MACH_vmfault || + stype == BSC_SysCall || stype == MACH_SysCall) { + + boolean_t need_kds_wakeup = FALSE; + boolean_t need_kde_wakeup = FALSE; + + /* + * try to take the lock here to synchronize with the + * waiter entering the blocked state... use the try + * mode to prevent deadlocks caused by re-entering this + * routine due to various trace points triggered in the + * lck_spin_sleep_xxxx routines used to actually enter + * one of our 2 wait conditions... no problem if we fail, + * there will be lots of additional events coming in that + * will eventually succeed in grabbing this lock + */ + s = ml_set_interrupts_enabled(FALSE); + + if (lck_spin_try_lock(kdw_spin_lock)) { + + if (kds_waiter && kd_ctrl_page.kds_inuse_count >= n_storage_threshold) { + kds_waiter = 0; + need_kds_wakeup = TRUE; + } + if (kde_waiter && kd_entropy_indx >= kd_entropy_count) { + kde_waiter = 0; + need_kde_wakeup = TRUE; + } + lck_spin_unlock(kdw_spin_lock); + } + ml_set_interrupts_enabled(s); + + if (need_kds_wakeup == TRUE) + wakeup(&kds_waiter); + if (need_kde_wakeup == TRUE) + wakeup(&kde_waiter); + } + } } void @@ -584,27 +896,32 @@ kernel_debug1( uintptr_t arg4, uintptr_t arg5) { - kernel_debug_internal(debugid, arg1, arg2, arg3, arg4, arg5, 0); + kernel_debug_internal(debugid, arg1, arg2, arg3, arg4, arg5, 1); } -static void -kdbg_lock_init(void) +/* + * Support syscall SYS_kdebug_trace + */ +int +kdebug_trace(__unused struct proc *p, struct kdebug_trace_args *uap, __unused int32_t *retval) { - host_basic_info_data_t hinfo; - mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT; + if ( __probable(kdebug_enable == 0) ) + return(EINVAL); + + kernel_debug_internal(uap->code, uap->arg1, uap->arg2, uap->arg3, uap->arg4, (uintptr_t)thread_tid(current_thread()), 0); - if (kdebug_flags & KDBG_LOCKINIT) - return; + return(0); +} - /* get the number of cpus and cache it */ -#define BSD_HOST 1 - host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count); - kd_cpus = hinfo.logical_cpu_max; - if (kmem_alloc(kernel_map, (vm_offset_t *)&kdbip, - sizeof(struct kd_bufinfo) * kd_cpus) != KERN_SUCCESS) - return; +static void +kdbg_lock_init(void) +{ + if (kd_ctrl_page.kdebug_flags & KDBG_LOCKINIT) + return; + trace_handler_map_ctrl_page((uintptr_t)&kd_ctrl_page, sizeof(kd_ctrl_page), sizeof(struct kd_storage), sizeof(union kds_ptr)); + /* * allocate lock group attribute and group */ @@ -618,25 +935,26 @@ kdbg_lock_init(void) /* - * allocate and initialize spin lock and mutex + * allocate and initialize mutexes */ kd_trace_mtx_sysctl = lck_mtx_alloc_init(kd_trace_mtx_sysctl_grp, kd_trace_mtx_sysctl_attr); kds_spin_lock =
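With the storage lock off the fast path, kernel_debug_internal() above claims a record slot by atomically advancing kds_bufindx: only the CPU whose OSCompareAndSwap succeeds owns records[bindx], and on a lost race or a full unit it loops back to retry_q, allocating a fresh unit when needed. A minimal C11 sketch of that reserve-by-CAS step, with an invented unit layout and event type:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define SLOTS 2048                  /* like EVENTS_PER_STORAGE_UNIT */

    struct unit {
        _Atomic uint32_t bufindx;       /* next free slot, CAS-advanced */
        uint64_t records[SLOTS];
    };

    /* Reserve one slot, or report the unit full so the caller can switch
     * to (or allocate) another one, as allocate_storage_unit() does. */
    static bool reserve_slot(struct unit *u, uint32_t *slot)
    {
        uint32_t idx = atomic_load(&u->bufindx);
        do {
            if (idx >= SLOTS)
                return false;           /* full: take the slow path */
        } while (!atomic_compare_exchange_weak(&u->bufindx, &idx, idx + 1));
        *slot = idx;                    /* records[idx] is now exclusively ours */
        return true;
    }

    int main(void)
    {
        static struct unit u;
        uint32_t slot;
        if (reserve_slot(&u, &slot)) {
            u.records[slot] = 0xfeedfaceULL;    /* fill in the owned record */
            printf("reserved slot %u\n", slot);
        }
        return 0;
    }

Roughly speaking, in the patch kds_bufindx marks reservation while kds_bufcnt, bumped with OSAddAtomic only after the record is filled, marks completion; the reader uses the gap between the two to spot records still being written.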
lck_spin_alloc_init(kd_trace_mtx_sysctl_grp, kd_trace_mtx_sysctl_attr); + kdw_spin_lock = lck_spin_alloc_init(kd_trace_mtx_sysctl_grp, kd_trace_mtx_sysctl_attr); - kdebug_flags |= KDBG_LOCKINIT; + kd_ctrl_page.kdebug_flags |= KDBG_LOCKINIT; } int -kdbg_bootstrap(void) +kdbg_bootstrap(boolean_t early_trace) { - kdebug_flags &= ~KDBG_WRAPPED; + kd_ctrl_page.kdebug_flags &= ~KDBG_WRAPPED; - return (create_buffers()); + return (create_buffers(early_trace)); } int -kdbg_reinit(void) +kdbg_reinit(boolean_t early_trace) { int ret = 0; @@ -645,8 +963,7 @@ kdbg_reinit(void) * First make sure we're not in * the middle of cutting a trace */ - kdebug_enable &= ~KDEBUG_ENABLE_TRACE; - kdebug_slowcheck |= SLOW_NOLOG; + kdbg_set_tracing_enabled(FALSE); /* * make sure the SLOW_NOLOG is seen @@ -657,14 +974,17 @@ kdbg_reinit(void) delete_buffers(); - if ((kdebug_flags & KDBG_MAPINIT) && kd_mapsize && kd_mapptr) { + if ((kd_ctrl_page.kdebug_flags & KDBG_MAPINIT) && kd_mapsize && kd_mapptr) { kmem_free(kernel_map, (vm_offset_t)kd_mapptr, kd_mapsize); - kdebug_flags &= ~KDBG_MAPINIT; + kd_ctrl_page.kdebug_flags &= ~KDBG_MAPINIT; kd_mapsize = 0; kd_mapptr = (kd_threadmap *) 0; kd_mapcount = 0; } - ret = kdbg_bootstrap(); + ret = kdbg_bootstrap(early_trace); + + RAW_file_offset = 0; + RAW_file_written = 0; return(ret); } @@ -750,7 +1070,7 @@ kdbg_mapinit(void) vm_offset_t tts_maptomem=0; int i; - if (kdebug_flags & KDBG_MAPINIT) + if (kd_ctrl_page.kdebug_flags & KDBG_MAPINIT) return; /* @@ -821,7 +1141,7 @@ kdbg_mapinit(void) } if (kd_mapptr && tts_mapptr) { - kdebug_flags |= KDBG_MAPINIT; + kd_ctrl_page.kdebug_flags |= KDBG_MAPINIT; /* * Initialize thread map data @@ -847,9 +1167,7 @@ kdbg_clear(void) * First make sure we're not in * the middle of cutting a trace */ - - kdebug_enable &= ~KDEBUG_ENABLE_TRACE; - kdebug_slowcheck = SLOW_NOLOG; + kdbg_set_tracing_enabled(FALSE); /* * make sure the SLOW_NOLOG is seen @@ -858,24 +1176,24 @@ kdbg_clear(void) */ IOSleep(100); - if (kdebug_enable & KDEBUG_ENABLE_ENTROPY) - kdebug_slowcheck |= SLOW_ENTROPY; - global_state_pid = -1; - kdebug_flags &= (unsigned int)~KDBG_CKTYPES; - kdebug_flags &= ~(KDBG_NOWRAP | KDBG_RANGECHECK | KDBG_VALCHECK); - kdebug_flags &= ~(KDBG_PIDCHECK | KDBG_PIDEXCLUDE); + kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; + kd_ctrl_page.kdebug_flags &= ~(KDBG_NOWRAP | KDBG_RANGECHECK | KDBG_VALCHECK); + kd_ctrl_page.kdebug_flags &= ~(KDBG_PIDCHECK | KDBG_PIDEXCLUDE); delete_buffers(); /* Clean up the thread map buffer */ - kdebug_flags &= ~KDBG_MAPINIT; + kd_ctrl_page.kdebug_flags &= ~KDBG_MAPINIT; if (kd_mapptr) { kmem_free(kernel_map, (vm_offset_t)kd_mapptr, kd_mapsize); kd_mapptr = (kd_threadmap *) 0; } kd_mapsize = 0; kd_mapcount = 0; + + RAW_file_offset = 0; + RAW_file_written = 0; } int @@ -896,17 +1214,17 @@ kdbg_setpid(kd_regtype *kdr) /* * turn on pid check for this and all pids */ - kdebug_flags |= KDBG_PIDCHECK; - kdebug_flags &= ~KDBG_PIDEXCLUDE; - kdebug_slowcheck |= SLOW_CHECKS; - + kd_ctrl_page.kdebug_flags |= KDBG_PIDCHECK; + kd_ctrl_page.kdebug_flags &= ~KDBG_PIDEXCLUDE; + kdbg_set_flags(SLOW_CHECKS, 0, TRUE); + p->p_kdebug = 1; } else { /* * turn off pid check for this pid value * Don't turn off all pid checking though * - * kdebug_flags &= ~KDBG_PIDCHECK; + * kd_ctrl_page.kdebug_flags &= ~KDBG_PIDCHECK; */ p->p_kdebug = 0; } @@ -938,9 +1256,9 @@ kdbg_setpidex(kd_regtype *kdr) /* * turn on pid exclusion */ - kdebug_flags |= KDBG_PIDEXCLUDE; - kdebug_flags &= ~KDBG_PIDCHECK; - kdebug_slowcheck |= SLOW_CHECKS; + 
kd_ctrl_page.kdebug_flags |= KDBG_PIDEXCLUDE; + kd_ctrl_page.kdebug_flags &= ~KDBG_PIDCHECK; + kdbg_set_flags(SLOW_CHECKS, 0, TRUE); p->p_kdebug = 1; } @@ -949,7 +1267,7 @@ kdbg_setpidex(kd_regtype *kdr) * turn off pid exclusion for this pid value * Don't turn off all pid exclusion though * - * kdebug_flags &= ~KDBG_PIDEXCLUDE; + * kd_ctrl_page.kdebug_flags &= ~KDBG_PIDEXCLUDE; */ p->p_kdebug = 0; } @@ -975,14 +1293,8 @@ kdbg_setrtcdec(kd_regtype *kdr) if (decval && decval < KDBG_MINRTCDEC) ret = EINVAL; -#ifdef ppc - else { - maxDec = decval ? decval : 0x7FFFFFFF; /* Set or reset the max decrementer */ - } -#else else ret = ENOTSUP; -#endif /* ppc */ return(ret); } @@ -999,10 +1311,10 @@ kdbg_setreg(kd_regtype * kdr) val_2 = (kdr->value2 & 0xff); kdlog_beg = (val_1<<24); kdlog_end = (val_2<<24); - kdebug_flags &= (unsigned int)~KDBG_CKTYPES; - kdebug_flags &= ~KDBG_VALCHECK; /* Turn off specific value check */ - kdebug_flags |= (KDBG_RANGECHECK | KDBG_CLASSTYPE); - kdebug_slowcheck |= SLOW_CHECKS; + kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; + kd_ctrl_page.kdebug_flags &= ~KDBG_VALCHECK; /* Turn off specific value check */ + kd_ctrl_page.kdebug_flags |= (KDBG_RANGECHECK | KDBG_CLASSTYPE); + kdbg_set_flags(SLOW_CHECKS, 0, TRUE); break; case KDBG_SUBCLSTYPE : val_1 = (kdr->value1 & 0xff); @@ -1010,36 +1322,36 @@ kdbg_setreg(kd_regtype * kdr) val = val_2 + 1; kdlog_beg = ((val_1<<24) | (val_2 << 16)); kdlog_end = ((val_1<<24) | (val << 16)); - kdebug_flags &= (unsigned int)~KDBG_CKTYPES; - kdebug_flags &= ~KDBG_VALCHECK; /* Turn off specific value check */ - kdebug_flags |= (KDBG_RANGECHECK | KDBG_SUBCLSTYPE); - kdebug_slowcheck |= SLOW_CHECKS; + kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; + kd_ctrl_page.kdebug_flags &= ~KDBG_VALCHECK; /* Turn off specific value check */ + kd_ctrl_page.kdebug_flags |= (KDBG_RANGECHECK | KDBG_SUBCLSTYPE); + kdbg_set_flags(SLOW_CHECKS, 0, TRUE); break; case KDBG_RANGETYPE : kdlog_beg = (kdr->value1); kdlog_end = (kdr->value2); - kdebug_flags &= (unsigned int)~KDBG_CKTYPES; - kdebug_flags &= ~KDBG_VALCHECK; /* Turn off specific value check */ - kdebug_flags |= (KDBG_RANGECHECK | KDBG_RANGETYPE); - kdebug_slowcheck |= SLOW_CHECKS; + kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; + kd_ctrl_page.kdebug_flags &= ~KDBG_VALCHECK; /* Turn off specific value check */ + kd_ctrl_page.kdebug_flags |= (KDBG_RANGECHECK | KDBG_RANGETYPE); + kdbg_set_flags(SLOW_CHECKS, 0, TRUE); break; case KDBG_VALCHECK: kdlog_value1 = (kdr->value1); kdlog_value2 = (kdr->value2); kdlog_value3 = (kdr->value3); kdlog_value4 = (kdr->value4); - kdebug_flags &= (unsigned int)~KDBG_CKTYPES; - kdebug_flags &= ~KDBG_RANGECHECK; /* Turn off range check */ - kdebug_flags |= KDBG_VALCHECK; /* Turn on specific value check */ - kdebug_slowcheck |= SLOW_CHECKS; + kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; + kd_ctrl_page.kdebug_flags &= ~KDBG_RANGECHECK; /* Turn off range check */ + kd_ctrl_page.kdebug_flags |= KDBG_VALCHECK; /* Turn on specific value check */ + kdbg_set_flags(SLOW_CHECKS, 0, TRUE); break; case KDBG_TYPENONE : - kdebug_flags &= (unsigned int)~KDBG_CKTYPES; + kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; - if ( (kdebug_flags & (KDBG_RANGECHECK | KDBG_VALCHECK | KDBG_PIDCHECK | KDBG_PIDEXCLUDE)) ) - kdebug_slowcheck |= SLOW_CHECKS; + if ( (kd_ctrl_page.kdebug_flags & (KDBG_RANGECHECK | KDBG_VALCHECK | KDBG_PIDCHECK | KDBG_PIDEXCLUDE)) ) + kdbg_set_flags(SLOW_CHECKS, 0, TRUE); else - kdebug_slowcheck &= ~SLOW_CHECKS; + 
kdbg_set_flags(SLOW_CHECKS, 0, FALSE); kdlog_beg = 0; kdlog_end = 0; @@ -1064,8 +1376,8 @@ kdbg_getreg(__unused kd_regtype * kdr) val_2 = val_1 + 1; kdlog_beg = (val_1<<24); kdlog_end = (val_2<<24); - kdebug_flags &= (unsigned int)~KDBG_CKTYPES; - kdebug_flags |= (KDBG_RANGECHECK | KDBG_CLASSTYPE); + kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; + kd_ctrl_page.kdebug_flags |= (KDBG_RANGECHECK | KDBG_CLASSTYPE); break; case KDBG_SUBCLSTYPE : val_1 = (kdr->value1 & 0xff); @@ -1073,17 +1385,17 @@ kdbg_getreg(__unused kd_regtype * kdr) val = val_2 + 1; kdlog_beg = ((val_1<<24) | (val_2 << 16)); kdlog_end = ((val_1<<24) | (val << 16)); - kdebug_flags &= (unsigned int)~KDBG_CKTYPES; - kdebug_flags |= (KDBG_RANGECHECK | KDBG_SUBCLSTYPE); + kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; + kd_ctrl_page.kdebug_flags |= (KDBG_RANGECHECK | KDBG_SUBCLSTYPE); break; case KDBG_RANGETYPE : kdlog_beg = (kdr->value1); kdlog_end = (kdr->value2); - kdebug_flags &= (unsigned int)~KDBG_CKTYPES; - kdebug_flags |= (KDBG_RANGECHECK | KDBG_RANGETYPE); + kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; + kd_ctrl_page.kdebug_flags |= (KDBG_RANGECHECK | KDBG_RANGETYPE); break; case KDBG_TYPENONE : - kdebug_flags &= (unsigned int)~KDBG_CKTYPES; + kd_ctrl_page.kdebug_flags &= (unsigned int)~KDBG_CKTYPES; kdlog_beg = 0; kdlog_end = 0; break; @@ -1107,21 +1419,56 @@ kdbg_readmap(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx) if (count && (count <= kd_mapcount)) { - if ((kdebug_flags & KDBG_MAPINIT) && kd_mapsize && kd_mapptr) + if ((kd_ctrl_page.kdebug_flags & KDBG_MAPINIT) && kd_mapsize && kd_mapptr) { if (*number < kd_mapsize) ret = EINVAL; else { - if (vp) { - vn_rdwr(UIO_WRITE, vp, (caddr_t)&count, sizeof(uint32_t), RAW_file_offset, - UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); - RAW_file_offset += sizeof(uint32_t); - - vn_rdwr(UIO_WRITE, vp, (caddr_t)kd_mapptr, kd_mapsize, RAW_file_offset, - UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); + if (vp) + { + RAW_header header; + clock_sec_t secs; + clock_usec_t usecs; + char *pad_buf; + int pad_size; + + header.version_no = RAW_VERSION1; + header.thread_count = count; + + clock_get_calendar_microtime(&secs, &usecs); + header.TOD_secs = secs; + header.TOD_usecs = usecs; + + ret = vn_rdwr(UIO_WRITE, vp, (caddr_t)&header, sizeof(RAW_header), RAW_file_offset, + UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); + if (ret) + goto write_error; + RAW_file_offset += sizeof(RAW_header); + + ret = vn_rdwr(UIO_WRITE, vp, (caddr_t)kd_mapptr, kd_mapsize, RAW_file_offset, + UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); + if (ret) + goto write_error; RAW_file_offset += kd_mapsize; + pad_size = PAGE_SIZE - (RAW_file_offset & PAGE_MASK_64); + + if (pad_size) + { + pad_buf = (char *)kalloc(pad_size); + memset(pad_buf, 0, pad_size); + + ret = vn_rdwr(UIO_WRITE, vp, (caddr_t)pad_buf, pad_size, RAW_file_offset, + UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); + kfree(pad_buf, pad_size); + + if (ret) + goto write_error; + RAW_file_offset += pad_size; + } + RAW_file_written += sizeof(RAW_header) + kd_mapsize + pad_size; + } else { if (copyout(kd_mapptr, buffer, kd_mapsize)) ret = EINVAL; @@ -1134,22 +1481,24 @@ kdbg_readmap(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx) else ret = 
EINVAL; - if (ret && vp) { + if (ret && vp) + { count = 0; vn_rdwr(UIO_WRITE, vp, (caddr_t)&count, sizeof(uint32_t), RAW_file_offset, UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); RAW_file_offset += sizeof(uint32_t); + RAW_file_written += sizeof(uint32_t); } - if ((kdebug_flags & KDBG_MAPINIT) && kd_mapsize && kd_mapptr) +write_error: + if ((kd_ctrl_page.kdebug_flags & KDBG_MAPINIT) && kd_mapsize && kd_mapptr) { kmem_free(kernel_map, (vm_offset_t)kd_mapptr, kd_mapsize); - kdebug_flags &= ~KDBG_MAPINIT; + kd_ctrl_page.kdebug_flags &= ~KDBG_MAPINIT; kd_mapsize = 0; kd_mapptr = (kd_threadmap *) 0; kd_mapcount = 0; } - return(ret); } @@ -1158,44 +1507,85 @@ kdbg_getentropy (user_addr_t buffer, size_t *number, int ms_timeout) { int avail = *number; int ret = 0; + int s; + u_int64_t abstime; + u_int64_t ns; + int wait_result = THREAD_AWAKENED; + if (kd_entropy_buffer) return(EBUSY); - kd_entropy_count = avail/sizeof(mach_timespec_t); - kd_entropy_bufsize = kd_entropy_count * sizeof(mach_timespec_t); - kd_entropy_indx = 0; + if (ms_timeout < 0) + return(EINVAL); + + kd_entropy_count = avail/sizeof(uint64_t); + + if (kd_entropy_count > MAX_ENTROPY_COUNT || kd_entropy_count == 0) { + /* + * Enforce maximum entropy entries + */ + return(EINVAL); + } + kd_entropy_bufsize = kd_entropy_count * sizeof(uint64_t); /* - * Enforce maximum entropy entries here if needed * allocate entropy buffer */ - if (kmem_alloc(kernel_map, &kd_entropy_buftomem, - (vm_size_t)kd_entropy_bufsize) == KERN_SUCCESS) { + if (kmem_alloc(kernel_map, &kd_entropy_buftomem, (vm_size_t)kd_entropy_bufsize) == KERN_SUCCESS) { kd_entropy_buffer = (uint64_t *) kd_entropy_buftomem; } else { kd_entropy_buffer = (uint64_t *) 0; kd_entropy_count = 0; - kd_entropy_indx = 0; - return (EINVAL); + + return (ENOMEM); } + kd_entropy_indx = 0; - if (ms_timeout < 10) - ms_timeout = 10; + KERNEL_DEBUG_CONSTANT(0xbbbbf000 | DBG_FUNC_START, ms_timeout, kd_entropy_count, 0, 0, 0); /* * Enable entropy sampling */ - kdebug_enable |= KDEBUG_ENABLE_ENTROPY; - kdebug_slowcheck |= SLOW_ENTROPY; + kdbg_set_flags(SLOW_ENTROPY, KDEBUG_ENABLE_ENTROPY, TRUE); - ret = tsleep (kdbg_getentropy, PRIBIO | PCATCH, "kd_entropy", (ms_timeout/(1000/HZ))); + if (ms_timeout) { + ns = (u_int64_t)ms_timeout * (u_int64_t)(1000 * 1000); + nanoseconds_to_absolutetime(ns, &abstime ); + clock_absolutetime_interval_to_deadline( abstime, &abstime ); + } else + abstime = 0; + + s = ml_set_interrupts_enabled(FALSE); + lck_spin_lock(kdw_spin_lock); + + while (wait_result == THREAD_AWAKENED && kd_entropy_indx < kd_entropy_count) { + + kde_waiter = 1; + + if (abstime) { + /* + * wait for the specified timeout or + * until we've hit our sample limit + */ + wait_result = lck_spin_sleep_deadline(kdw_spin_lock, 0, &kde_waiter, THREAD_ABORTSAFE, abstime); + } else { + /* + * wait until we've hit our sample limit + */ + wait_result = lck_spin_sleep(kdw_spin_lock, 0, &kde_waiter, THREAD_ABORTSAFE); + } + kde_waiter = 0; + } + lck_spin_unlock(kdw_spin_lock); + ml_set_interrupts_enabled(s); /* * Disable entropy sampling */ - kdebug_enable &= ~KDEBUG_ENABLE_ENTROPY; - kdebug_slowcheck &= ~SLOW_ENTROPY; + kdbg_set_flags(SLOW_ENTROPY, KDEBUG_ENABLE_ENTROPY, FALSE); + + KERNEL_DEBUG_CONSTANT(0xbbbbf000 | DBG_FUNC_END, ms_timeout, kd_entropy_indx, 0, 0, 0); *number = 0; ret = 0; @@ -1204,10 +1594,10 @@ kdbg_getentropy (user_addr_t buffer, size_t *number, int ms_timeout) /* * copyout the buffer */ - if (copyout(kd_entropy_buffer, buffer, kd_entropy_indx 
* sizeof(mach_timespec_t))) + if (copyout(kd_entropy_buffer, buffer, kd_entropy_indx * sizeof(uint64_t))) ret = EINVAL; else - *number = kd_entropy_indx; + *number = kd_entropy_indx * sizeof(uint64_t); } /* * Always cleanup @@ -1250,14 +1640,16 @@ kdbg_set_nkdbufs(unsigned int value) void kdbg_control_chud(int val, void *fn) { - if (val) { - /* enable chudhook */ + kdbg_lock_init(); + + if (val) { + /* enable chudhook */ kdebug_chudhook = fn; - kdebug_enable |= KDEBUG_ENABLE_CHUD; + kdbg_set_flags(SLOW_CHUD, KDEBUG_ENABLE_CHUD, TRUE); } else { - /* disable chudhook */ - kdebug_enable &= ~KDEBUG_ENABLE_CHUD; + /* disable chudhook */ + kdbg_set_flags(SLOW_CHUD, KDEBUG_ENABLE_CHUD, FALSE); kdebug_chudhook = 0; } } @@ -1272,22 +1664,24 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) kd_regtype kd_Reg; kbufinfo_t kd_bufinfo; pid_t curpid; - struct proc *p, *curproc; + proc_t p, curproc; if (name[0] == KERN_KDGETENTROPY || + name[0] == KERN_KDWRITETR || + name[0] == KERN_KDWRITEMAP || name[0] == KERN_KDEFLAGS || name[0] == KERN_KDDFLAGS || name[0] == KERN_KDENABLE || name[0] == KERN_KDSETBUF) { if ( namelen < 2 ) - return(EINVAL); + return(EINVAL); value = name[1]; } kdbg_lock_init(); - if ( !(kdebug_flags & KDBG_LOCKINIT)) + if ( !(kd_ctrl_page.kdebug_flags & KDBG_LOCKINIT)) return(ENOSPC); lck_mtx_lock(kd_trace_mtx_sysctl); @@ -1308,12 +1702,12 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) kd_bufinfo.nkdbufs = nkdbufs; kd_bufinfo.nkdthreads = kd_mapsize / sizeof(kd_threadmap); - if ( (kdebug_slowcheck & SLOW_NOLOG) ) + if ( (kd_ctrl_page.kdebug_slowcheck & SLOW_NOLOG) ) kd_bufinfo.nolog = 1; else kd_bufinfo.nolog = 0; - kd_bufinfo.flags = kdebug_flags; + kd_bufinfo.flags = kd_ctrl_page.kdebug_flags; #if defined(__LP64__) kd_bufinfo.flags |= KDBG_LP64; #endif @@ -1371,11 +1765,11 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) switch(name[0]) { case KERN_KDEFLAGS: value &= KDBG_USERFLAGS; - kdebug_flags |= value; + kd_ctrl_page.kdebug_flags |= value; break; case KERN_KDDFLAGS: value &= KDBG_USERFLAGS; - kdebug_flags &= ~value; + kd_ctrl_page.kdebug_flags &= ~value; break; case KERN_KDENABLE: /* @@ -1385,25 +1779,22 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) /* * enable only if buffer is initialized */ - if (!(kdebug_flags & KDBG_BUFINIT)) { + if (!(kd_ctrl_page.kdebug_flags & KDBG_BUFINIT)) { ret = EINVAL; break; } kdbg_mapinit(); - kdebug_enable |= KDEBUG_ENABLE_TRACE; - kdebug_slowcheck &= ~SLOW_NOLOG; - } - else { - kdebug_enable &= ~KDEBUG_ENABLE_TRACE; - kdebug_slowcheck |= SLOW_NOLOG; + kdbg_set_tracing_enabled(TRUE); } + else + kdbg_set_tracing_enabled(FALSE); break; case KERN_KDSETBUF: kdbg_set_nkdbufs(value); break; case KERN_KDSETUP: - ret = kdbg_reinit(); + ret = kdbg_reinit(FALSE); break; case KERN_KDREMOVE: kdbg_clear(); @@ -1432,6 +1823,86 @@ kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) case KERN_KDREADTR: ret = kdbg_read(where, sizep, NULL, NULL); break; + case KERN_KDWRITETR: + case KERN_KDWRITEMAP: + { + struct vfs_context context; + struct fileproc *fp; + size_t number; + vnode_t vp; + int fd; + + if (name[0] == KERN_KDWRITETR) { + int s; + int wait_result = THREAD_AWAKENED; + u_int64_t abstime; + u_int64_t ns; + + if (*sizep) { + ns = ((u_int64_t)*sizep) * (u_int64_t)(1000 * 1000); + nanoseconds_to_absolutetime(ns, &abstime ); + clock_absolutetime_interval_to_deadline( abstime, &abstime ); + } else + abstime = 0; + + s = 
ml_set_interrupts_enabled(FALSE); + lck_spin_lock(kdw_spin_lock); + + while (wait_result == THREAD_AWAKENED && kd_ctrl_page.kds_inuse_count < n_storage_threshold) { + + kds_waiter = 1; + + if (abstime) + wait_result = lck_spin_sleep_deadline(kdw_spin_lock, 0, &kds_waiter, THREAD_ABORTSAFE, abstime); + else + wait_result = lck_spin_sleep(kdw_spin_lock, 0, &kds_waiter, THREAD_ABORTSAFE); + + kds_waiter = 0; + } + lck_spin_unlock(kdw_spin_lock); + ml_set_interrupts_enabled(s); + } + p = current_proc(); + fd = value; + + proc_fdlock(p); + if ( (ret = fp_lookup(p, fd, &fp, 1)) ) { + proc_fdunlock(p); + break; + } + context.vc_thread = current_thread(); + context.vc_ucred = fp->f_fglob->fg_cred; + + if (fp->f_fglob->fg_type != DTYPE_VNODE) { + fp_drop(p, fd, fp, 1); + proc_fdunlock(p); + + ret = EBADF; + break; + } + vp = (struct vnode *)fp->f_fglob->fg_data; + proc_fdunlock(p); + + if ((ret = vnode_getwithref(vp)) == 0) { + + if (name[0] == KERN_KDWRITETR) { + number = nkdbufs * sizeof(kd_buf); + + KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_INFO, 3)) | DBG_FUNC_START, 0, 0, 0, 0, 0); + ret = kdbg_read(0, &number, vp, &context); + KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_INFO, 3)) | DBG_FUNC_END, number, 0, 0, 0, 0); + + *sizep = number; + } else { + number = kd_mapsize; + kdbg_readmap(0, &number, vp, &context); + } + vnode_put(vp); + } + fp_drop(p, fd, fp, 0); + + break; + } case KERN_KDPIDTR: if (size < sizeof(kd_regtype)) { ret = EINVAL; @@ -1489,25 +1960,32 @@ int kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx) { unsigned int count; - unsigned int cpu, mincpu; + unsigned int cpu, min_cpu; uint64_t mintime, t; - int error = 0,s = 0; + int error = 0; kd_buf *tempbuf; - kd_buf *rcursor; - kd_buf *min_rcursor; - struct kd_storage *kdsp; + uint32_t rcursor; + kd_buf lostevent; + union kds_ptr kdsp; + struct kd_storage *kdsp_actual; struct kd_bufinfo *kdbp; + struct kd_bufinfo *min_kdbp; uint32_t tempbuf_count; uint32_t tempbuf_number; uint32_t old_kdebug_flags; uint32_t old_kdebug_slowcheck; + boolean_t lostevents = FALSE; + boolean_t out_of_events = FALSE; count = *number/sizeof(kd_buf); *number = 0; - if (count == 0 || !(kdebug_flags & KDBG_BUFINIT) || kdcopybuf == 0) + if (count == 0 || !(kd_ctrl_page.kdebug_flags & KDBG_BUFINIT) || kdcopybuf == 0) return EINVAL; + memset(&lostevent, 0, sizeof(lostevent)); + lostevent.debugid = TRACEDBG_CODE(DBG_TRACE_INFO, 2); + /* * because we hold kd_trace_mtx_sysctl, no other control threads can * be playing with kdebug_flags... the code that cuts new events could @@ -1515,17 +1993,8 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx) * storage chunk which is where it examines kdebug_flags... if it's adding * to the same chunk we're reading from, no problem...
*/ - s = ml_set_interrupts_enabled(FALSE); - lck_spin_lock(kds_spin_lock); - old_kdebug_slowcheck = kdebug_slowcheck; - old_kdebug_flags = kdebug_flags; - - kdebug_flags &= ~KDBG_WRAPPED; - kdebug_flags |= KDBG_NOWRAP; - - lck_spin_unlock(kds_spin_lock); - ml_set_interrupts_enabled(s); + disable_wrap(&old_kdebug_slowcheck, &old_kdebug_flags); if (count > nkdbufs) count = nkdbufs; @@ -1538,66 +2007,86 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx) tempbuf_number = 0; while (tempbuf_count) { - mintime = 0xffffffffffffffffULL; /* all actual timestamps are below */ - mincpu = -1; - min_rcursor = NULL; + mintime = 0xffffffffffffffffULL; + min_kdbp = NULL; + min_cpu = 0; for (cpu = 0, kdbp = &kdbip[0]; cpu < kd_cpus; cpu++, kdbp++) { - if ((kdsp = kdbp->kd_list_head) == NULL) + if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) continue; - rcursor = kdsp->kds_readlast; + kdsp_actual = POINTER_FROM_KDS_PTR(kdsp); + + rcursor = kdsp_actual->kds_readlast; - if (rcursor == kdsp->kds_bufptr) + if (rcursor == kdsp_actual->kds_bufindx) continue; - t = kdbg_get_timestamp(rcursor); + t = kdbg_get_timestamp(&kdsp_actual->kds_records[rcursor]); + + if (t < kdsp_actual->kds_timestamp) { + /* + * indicates we've not yet completed filling + * in this event... + * this should only occur when we're looking + * at the buf that the record head is utilizing + * we'll pick these events up on the next + * call to kdbg_read + * we bail at this point so that we don't + * get an out-of-order timestream by continuing + * to read events from the other CPUs' timestream(s) + */ + out_of_events = TRUE; + break; + } if (t < mintime) { - mincpu = cpu; mintime = t; - min_rcursor = rcursor; + min_kdbp = kdbp; + min_cpu = cpu; } } - if (mincpu == (unsigned int)-1) - /* + if (min_kdbp == NULL || out_of_events == TRUE) { + /* * all buffers ran empty */ - break; - - kdbp = &kdbip[mincpu]; - kdsp = kdbp->kd_list_head; + out_of_events = TRUE; + break; + } + kdsp = min_kdbp->kd_list_head; + kdsp_actual = POINTER_FROM_KDS_PTR(kdsp); - *tempbuf = *min_rcursor; + if (kdsp_actual->kds_lostevents == TRUE) { + lostevent.timestamp = kdsp_actual->kds_records[kdsp_actual->kds_readlast].timestamp; + *tempbuf = lostevent; + + kdsp_actual->kds_lostevents = FALSE; + lostevents = TRUE; - if (mintime != kdbg_get_timestamp(tempbuf)) { - /* - * we stole this storage unit and used it - * before we could slurp the selected event out - * so we need to re-evaluate - */ - continue; + goto nextevent; } + *tempbuf = kdsp_actual->kds_records[kdsp_actual->kds_readlast++]; + + if (kdsp_actual->kds_readlast == EVENTS_PER_STORAGE_UNIT) + release_storage_unit(min_cpu, kdsp.raw); + /* * Watch for out of order timestamps */ - if (mintime < kdbp->kd_prev_timebase) { + if (mintime < min_kdbp->kd_prev_timebase) { /* * if so, use the previous timestamp + 1 cycle */ - kdbp->kd_prev_timebase++; - kdbg_set_timestamp_and_cpu(tempbuf, kdbp->kd_prev_timebase, mincpu); + min_kdbp->kd_prev_timebase++; + kdbg_set_timestamp_and_cpu(tempbuf, min_kdbp->kd_prev_timebase, kdbg_get_cpu(tempbuf)); } else - kdbp->kd_prev_timebase = mintime; - - if (min_rcursor == kdsp->kds_readlast) - kdsp->kds_readlast++; - - if (kdsp->kds_readlast == kdsp->kds_buflast) - release_storage_unit(kdbp, kdsp); - + min_kdbp->kd_prev_timebase = mintime; +nextevent: tempbuf_count--; tempbuf_number++; tempbuf++; + + if ((RAW_file_written += sizeof(kd_buf)) >= RAW_FLUSH_SIZE) + break; } if (tempbuf_number) { @@ -1606,6 +2095,12 @@ kdbg_read(user_addr_t buffer, size_t *number, 
vnode_t vp, vfs_context_t ctx) UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, vfs_context_ucred(ctx), (int *) 0, vfs_context_proc(ctx)); RAW_file_offset += (tempbuf_number * sizeof(kd_buf)); + + if (RAW_file_written >= RAW_FLUSH_SIZE) { + cluster_push(vp, 0); + + RAW_file_written = 0; + } } else { error = copyout(kdcopybuf, buffer, tempbuf_number * sizeof(kd_buf)); buffer += (tempbuf_number * sizeof(kd_buf)); @@ -1618,7 +2113,7 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx) count -= tempbuf_number; *number += tempbuf_number; } - if (tempbuf_count) + if (out_of_events == TRUE) /* * all trace buffers are empty */ @@ -1628,17 +2123,7 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx) tempbuf_count = KDCOPYBUF_COUNT; } if ( !(old_kdebug_flags & KDBG_NOWRAP)) { - - s = ml_set_interrupts_enabled(FALSE); - lck_spin_lock(kds_spin_lock); - - kdebug_flags &= ~KDBG_NOWRAP; - - if ( !(old_kdebug_slowcheck & SLOW_NOLOG)) - kdebug_slowcheck &= ~SLOW_NOLOG; - - lck_spin_unlock(kds_spin_lock); - ml_set_interrupts_enabled(s); + enable_wrap(old_kdebug_slowcheck, lostevents); } return (error); } @@ -1656,9 +2141,6 @@ unsigned char *getProcName(struct proc *proc) { #if defined(__i386__) || defined (__x86_64__) #define TRAP_DEBUGGER __asm__ volatile("int3"); #endif -#ifdef __ppc__ -#define TRAP_DEBUGGER __asm__ volatile("tw 4,r3,r3"); -#endif #define SANE_TRACEBUF_SIZE (8 * 1024 * 1024) @@ -1701,7 +2183,6 @@ int stack_snapshot(struct proc *p, register struct stack_snapshot_args *uap, int32_t *retval) { int error = 0; - if ((error = suser(kauth_cred_get(), &p->p_acflag))) return(error); @@ -1779,14 +2260,13 @@ error_exit: void start_kern_tracing(unsigned int new_nkdbufs) { + if (!new_nkdbufs) return; kdbg_set_nkdbufs(new_nkdbufs); kdbg_lock_init(); - kdbg_reinit(); - kdebug_enable |= KDEBUG_ENABLE_TRACE; - kdebug_slowcheck &= ~SLOW_NOLOG; - kdbg_mapinit(); + kdbg_reinit(TRUE); + kdbg_set_tracing_enabled(TRUE); #if defined(__i386__) || defined(__x86_64__) uint64_t now = mach_absolute_time(); @@ -1808,7 +2288,7 @@ kdbg_dump_trace_to_file(const char *filename) size_t number; - if (kdebug_enable & (KDEBUG_ENABLE_CHUD | KDEBUG_ENABLE_ENTROPY)) + if ( !(kdebug_enable & KDEBUG_ENABLE_TRACE)) return; if (global_state_pid != -1) { @@ -1824,6 +2304,7 @@ kdbg_dump_trace_to_file(const char *filename) KERNEL_DEBUG_CONSTANT((TRACEDBG_CODE(DBG_TRACE_INFO, 0)) | DBG_FUNC_NONE, 0, 0, 0, 0, 0); kdebug_enable = 0; + kd_ctrl_page.enabled = 0; ctx = vfs_context_kernel(); @@ -1840,3 +2321,44 @@ kdbg_dump_trace_to_file(const char *filename) sync(current_proc(), (void *)NULL, (int *)NULL); } + +/* Helper function for filling in the BSD name for an address space + * Defined here because the machine bindings know only Mach threads + * and nothing about BSD processes. + * + * FIXME: need to grab a lock during this? + */ +void kdbg_get_task_name(char* name_buf, int len, task_t task) +{ + proc_t proc; + + /* Note: we can't use thread->task (and functions that rely on it) here + * because it hasn't been initialized yet when this function is called. + * We use the explicitly-passed task parameter instead. 
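The rewritten kdbg_read() above drains the per-CPU lists as a k-way merge: each pass scans every CPU's head record and emits the one with the smallest timestamp, and it stops early (out_of_events) when a head record's timestamp predates its storage unit's creation stamp, which means that record is still being filled in. A compact sketch of the min-head merge over per-CPU sorted lists, with invented timestamps:

    #include <stdint.h>
    #include <stdio.h>

    #define NCPU 3

    /* Each "CPU" has a time-ordered list of event timestamps; 0 ends it. */
    static uint64_t events[NCPU][4] = {
        { 10, 40, 70, 0 },
        { 20, 30, 0,  0 },
        { 15, 60, 0,  0 },
    };
    static int cursor[NCPU];

    int main(void)
    {
        for (;;) {
            int min_cpu = -1;
            uint64_t mintime = UINT64_MAX;

            /* Scan every CPU's head and remember the earliest timestamp. */
            for (int cpu = 0; cpu < NCPU; cpu++) {
                uint64_t t = events[cpu][cursor[cpu]];
                if (t != 0 && t < mintime) {
                    mintime = t;
                    min_cpu = cpu;
                }
            }
            if (min_cpu < 0)
                break;                  /* all buffers ran empty */
            printf("emit %llu from cpu %d\n",
                   (unsigned long long)mintime, min_cpu);
            cursor[min_cpu]++;          /* consume that head record */
        }
        return 0;
    }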
+ */ + proc = get_bsdtask_info(task); + if (proc != PROC_NULL) + snprintf(name_buf, len, "%s/%d", proc->p_comm, proc->p_pid); + else + snprintf(name_buf, len, "%p [!bsd]", task); +} + + + +#if defined(NATIVE_TRACE_FACILITY) +void trace_handler_map_ctrl_page(__unused uintptr_t addr, __unused size_t ctrl_page_size, __unused size_t storage_size, __unused size_t kds_ptr_size) +{ +} +void trace_handler_map_bufinfo(__unused uintptr_t addr, __unused size_t size) +{ +} +void trace_handler_unmap_bufinfo(void) +{ +} +void trace_handler_map_buffer(__unused int index, __unused uintptr_t addr, __unused size_t size) +{ +} +void trace_handler_unmap_buffer(__unused int index) +{ +} +#endif diff --git a/bsd/kern/kern_acct.c b/bsd/kern/kern_acct.c index 747f09221..516de08dc 100644 --- a/bsd/kern/kern_acct.c +++ b/bsd/kern/kern_acct.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -161,7 +161,7 @@ acct(proc_t p, struct acct_args *uap, __unused int *retval) * writing and make sure it's a 'normal'. */ if (uap->path != USER_ADDR_NULL) { - NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, ctx); + NDINIT(&nd, LOOKUP, OP_OPEN, NOFOLLOW, UIO_USERSPACE, uap->path, ctx); if ((error = vn_open(&nd, FWRITE, 0))) return (error); #if CONFIG_MACF @@ -271,8 +271,8 @@ acct_process(proc_t p) /* (6) The UID and GID of the process */ safecred = kauth_cred_proc_ref(p); - an_acct.ac_uid = safecred->cr_ruid; - an_acct.ac_gid = safecred->cr_rgid; + an_acct.ac_uid = kauth_cred_getruid(safecred); + an_acct.ac_gid = kauth_cred_getrgid(safecred); /* (7) The terminal from which the process was started */ diff --git a/bsd/kern/kern_aio.c b/bsd/kern/kern_aio.c index a2983db0a..89a2ba012 100644 --- a/bsd/kern/kern_aio.c +++ b/bsd/kern/kern_aio.c @@ -1394,7 +1394,7 @@ aio_enqueue_work( proc_t procp, aio_workq_entry *entryp, int proc_locked) /* And work queue */ aio_workq_lock_spin(queue); aio_workq_add_entry_locked(queue, entryp); - wait_queue_wakeup_one(queue->aioq_waitq, queue, THREAD_AWAKENED); + wait_queue_wakeup_one(queue->aioq_waitq, queue, THREAD_AWAKENED, -1); aio_workq_unlock(queue); if (proc_locked == 0) { diff --git a/bsd/kern/kern_authorization.c b/bsd/kern/kern_authorization.c index 263fc28b4..f30df6d2d 100644 --- a/bsd/kern/kern_authorization.c +++ b/bsd/kern/kern_authorization.c @@ -185,10 +185,9 @@ kauth_alloc_scope(const char *identifier, kauth_scope_callback_t callback, void /* * Allocate and populate the scope structure. */ - MALLOC(sp, kauth_scope_t, sizeof(*sp), M_KAUTH, M_WAITOK); + MALLOC(sp, kauth_scope_t, sizeof(*sp), M_KAUTH, M_WAITOK | M_ZERO); if (sp == NULL) return(NULL); - bzero(&sp->ks_listeners, sizeof(sp->ks_listeners)); sp->ks_flags = 0; sp->ks_identifier = identifier; sp->ks_idata = idata; @@ -613,7 +612,7 @@ kauth_authorize_generic_callback(kauth_cred_t credential, __unused void *idata, int kauth_acl_evaluate(kauth_cred_t cred, kauth_acl_eval_t eval) { - int applies, error, i; + int applies, error, i, gotguid; kauth_ace_t ace; guid_t guid; uint32_t rights; @@ -632,9 +631,11 @@ kauth_acl_evaluate(kauth_cred_t cred, kauth_acl_eval_t eval) * Get our guid for comparison purposes. 
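	 *
	 * A failure here no longer denies outright: the changes below
	 * carry on with gotguid == 0, and GUID-keyed matching then
	 * degrades to, in sketch form,
	 *
	 *	applies = gotguid ? kauth_guid_equal(&guid, &ace->ace_applicable) : 0;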
*/ if ((error = kauth_cred_getguid(cred, &guid)) != 0) { - eval->ae_result = KAUTH_RESULT_DENY; - KAUTH_DEBUG(" ACL - can't get credential GUID (%d), ACL denied", error); - return(error); + KAUTH_DEBUG(" ACL - can't get credential GUID (%d)", error); + error = 0; + gotguid = 0; + } else { + gotguid = 1; } KAUTH_DEBUG(" ACL - %d entries, initial residual %x", eval->ae_count, eval->ae_residual); @@ -678,7 +679,7 @@ kauth_acl_evaluate(kauth_cred_t cred, kauth_acl_eval_t eval) /* we don't recognise this ACE, skip it */ continue; } - + /* * Verify whether this entry applies to the credential. */ @@ -688,7 +689,10 @@ kauth_acl_evaluate(kauth_cred_t cred, kauth_acl_eval_t eval) applies = eval->ae_options & KAUTH_AEVAL_IS_OWNER; break; case KAUTH_WKG_GROUP: - applies = eval->ae_options & KAUTH_AEVAL_IN_GROUP; + if (!gotguid || (eval->ae_options & KAUTH_AEVAL_IN_GROUP_UNKNOWN)) + applies = ((ace->ace_flags & KAUTH_ACE_KINDMASK) == KAUTH_ACE_DENY); + else + applies = eval->ae_options & KAUTH_AEVAL_IN_GROUP; break; /* we short-circuit these here rather than wasting time calling the group membership code */ case KAUTH_WKG_EVERYBODY: @@ -700,12 +704,12 @@ kauth_acl_evaluate(kauth_cred_t cred, kauth_acl_eval_t eval) default: /* check to see whether it's exactly us, or a group we are a member of */ - applies = kauth_guid_equal(&guid, &ace->ace_applicable); + applies = !gotguid ? 0 : kauth_guid_equal(&guid, &ace->ace_applicable); KAUTH_DEBUG(" ACL - ACE applicable " K_UUID_FMT " caller " K_UUID_FMT " %smatched", K_UUID_ARG(ace->ace_applicable), K_UUID_ARG(guid), applies ? "" : "not "); if (!applies) { - error = kauth_cred_ismember_guid(cred, &ace->ace_applicable, &applies); + error = !gotguid ? ENOENT : kauth_cred_ismember_guid(cred, &ace->ace_applicable, &applies); /* * If we can't resolve group membership, we have to limit misbehaviour. * If the ACE is an 'allow' ACE, assume the cred is not a member (avoid @@ -791,15 +795,37 @@ kauth_acl_inherit(vnode_t dvp, kauth_acl_t initial, kauth_acl_t *product, int is * XXX TODO: <rdar://3634665> wants a "umask ACL" from the process. */ inherit = NULL; - if ((dvp != NULL) && !vfs_authopaque(vnode_mount(dvp))) { + /* + * If there is no initial ACL, or there is, and the initial ACLs + * flags do not request "no inheritance", then we inherit. This allows + * initial object creation via open_extended() and mkdir_extended() + * to reject inheritance for themselves and for inferior nodes by + * specifying a non-NULL inital ACL which has the KAUTH_ACL_NO_INHERIT + * flag set in the flags field. + */ + if ((initial == NULL || !(initial->acl_flags & KAUTH_ACL_NO_INHERIT)) && + (dvp != NULL) && !vfs_authopaque(vnode_mount(dvp))) { VATTR_INIT(&dva); VATTR_WANTED(&dva, va_acl); if ((error = vnode_getattr(dvp, &dva, ctx)) != 0) { KAUTH_DEBUG(" ERROR - could not get parent directory ACL for inheritance"); return(error); } - if (VATTR_IS_SUPPORTED(&dva, va_acl)) + if (VATTR_IS_SUPPORTED(&dva, va_acl)) { inherit = dva.va_acl; + /* + * If there is an ACL on the parent directory, then + * there are potentially inheritable ACE entries, but + * if the flags on the directory ACL say not to + * inherit, then we don't inherit. This allows for + * per directory rerooting of the inheritable ACL + * hierarchy. 
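+			 *
+			 * Net effect, in sketch form (inherit_ok is a name
+			 * invented for this illustration; inherit is the
+			 * parent's va_acl fetched above):
+			 *
+			 *	inherit_ok = (initial == NULL ||
+			 *	        !(initial->acl_flags & KAUTH_ACL_NO_INHERIT)) &&
+			 *	    (inherit != NULL) &&
+			 *	    !(inherit->acl_flags & KAUTH_ACL_NO_INHERIT);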
+ */ + if (inherit != NULL && inherit->acl_flags & KAUTH_ACL_NO_INHERIT) { + kauth_acl_free(inherit); + inherit = NULL; + } + } } /* @@ -852,14 +878,17 @@ kauth_acl_inherit(vnode_t dvp, kauth_acl_t initial, kauth_acl_t *product, int is /* * Composition is simply: - * - initial - * - inherited + * - initial direct ACEs + * - inherited ACEs from new parent */ index = 0; if (initial != NULL) { - for (i = 0; i < initial->acl_entrycount; i++) - result->acl_ace[index++] = initial->acl_ace[i]; - KAUTH_DEBUG(" INHERIT - applied %d initial entries", index); + for (i = 0; i < initial->acl_entrycount; i++) { + if (!(initial->acl_ace[i].ace_flags & KAUTH_ACE_INHERITED)) { + result->acl_ace[index++] = initial->acl_ace[i]; + } + } + KAUTH_DEBUG(" INHERIT - applied %d of %d initial entries", index, initial->acl_entrycount); } if (inherit != NULL) { for (i = 0; i < inherit->acl_entrycount; i++) { diff --git a/bsd/kern/kern_clock.c b/bsd/kern/kern_clock.c index 0cbd41e1b..1aae2df47 100644 --- a/bsd/kern/kern_clock.c +++ b/bsd/kern/kern_clock.c @@ -241,7 +241,7 @@ sysctl_clockrate } SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, - CTLTYPE_STRUCT | CTLFLAG_RD, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_clockrate, "S,clockinfo", ""); diff --git a/bsd/kern/kern_control.c b/bsd/kern/kern_control.c index a76088eb4..92b15bc40 100644 --- a/bsd/kern/kern_control.c +++ b/bsd/kern/kern_control.c @@ -91,6 +91,7 @@ static int ctl_peeraddr(struct socket *so, struct sockaddr **nam); static struct kctl *ctl_find_by_name(const char *); static struct kctl *ctl_find_by_id_unit(u_int32_t id, u_int32_t unit); +static struct socket *kcb_find_socket(struct kctl *, u_int32_t unit); static struct ctl_cb *kcb_find(struct kctl *, u_int32_t unit); static void ctl_post_msg(u_int32_t event_code, u_int32_t id); @@ -255,7 +256,7 @@ ctl_sofreelastref(struct socket *so) if ((kctl = kcb->kctl) != 0) { lck_mtx_lock(ctl_mtx); TAILQ_REMOVE(&kctl->kcb_head, kcb, next); - lck_mtx_lock(ctl_mtx); + lck_mtx_unlock(ctl_mtx); } kcb_delete(kcb); } @@ -364,10 +365,16 @@ ctl_connect(struct socket *so, struct sockaddr *nam, __unused struct proc *p) error = (*kctl->connect)(kctl, &sa, &kcb->userdata); socket_lock(so, 0); if (error) - goto done; + goto end; soisconnected(so); +end: + if (error && kctl->disconnect) { + socket_unlock(so, 0); + (*kctl->disconnect)(kctl, kcb->unit, kcb->userdata); + socket_lock(so, 0); + } done: if (error) { soisdisconnected(so); @@ -393,12 +400,19 @@ ctl_disconnect(struct socket *so) (*kctl->disconnect)(kctl, kcb->unit, kcb->userdata); socket_lock(so, 0); } + + soisdisconnected(so); + + socket_unlock(so, 0); lck_mtx_lock(ctl_mtx); kcb->kctl = 0; kcb->unit = 0; + while (kcb->usecount != 0) { + msleep(&kcb->usecount, ctl_mtx, 0, "kcb->usecount", 0); + } TAILQ_REMOVE(&kctl->kcb_head, kcb, next); - soisdisconnected(so); lck_mtx_unlock(ctl_mtx); + socket_lock(so, 0); } return 0; } @@ -430,23 +444,29 @@ ctl_peeraddr(struct socket *so, struct sockaddr **nam) static int ctl_send(struct socket *so, int flags, struct mbuf *m, - __unused struct sockaddr *addr, __unused struct mbuf *control, + __unused struct sockaddr *addr, struct mbuf *control, __unused struct proc *p) { int error = 0; struct ctl_cb *kcb = (struct ctl_cb *)so->so_pcb; struct kctl *kctl; + if (control) m_freem(control); + if (kcb == NULL) /* sanity check */ - return(ENOTCONN); + error = ENOTCONN; - if ((kctl = kcb->kctl) == NULL) - return(EINVAL); + if (error == 0 && (kctl = kcb->kctl) == NULL) + error = EINVAL; - if (kctl->send) { + if (error == 0 && 
kctl->send) { socket_unlock(so, 0); error = (*kctl->send)(kctl, kcb->unit, kcb->userdata, m, flags); socket_lock(so, 0); + } else { + m_freem(m); + if (error == 0) + error = ENOTSUP; } return error; } @@ -454,23 +474,18 @@ ctl_send(struct socket *so, int flags, struct mbuf *m, errno_t ctl_enqueuembuf(void *kctlref, u_int32_t unit, struct mbuf *m, u_int32_t flags) { - struct ctl_cb *kcb; struct socket *so; errno_t error = 0; struct kctl *kctl = (struct kctl *)kctlref; if (kctl == NULL) return EINVAL; - - kcb = kcb_find(kctl, unit); - if (kcb == NULL) - return EINVAL; - so = (struct socket *)kcb->so; - if (so == NULL) + so = kcb_find_socket(kctl, unit); + + if (so == NULL) return EINVAL; - socket_lock(so, 1); if (sbspace(&so->so_rcv) < m->m_pkthdr.len) { error = ENOBUFS; goto bye; @@ -487,7 +502,6 @@ bye: errno_t ctl_enqueuedata(void *kctlref, u_int32_t unit, void *data, size_t len, u_int32_t flags) { - struct ctl_cb *kcb; struct socket *so; struct mbuf *m; errno_t error = 0; @@ -499,15 +513,10 @@ ctl_enqueuedata(void *kctlref, u_int32_t unit, void *data, size_t len, u_int32_t if (kctlref == NULL) return EINVAL; - kcb = kcb_find(kctl, unit); - if (kcb == NULL) + so = kcb_find_socket(kctl, unit); + if (so == NULL) return EINVAL; - so = (struct socket *)kcb->so; - if (so == NULL) - return EINVAL; - - socket_lock(so, 1); if (sbspace(&so->so_rcv) < (int)len) { error = ENOBUFS; goto bye; @@ -545,27 +554,21 @@ bye: errno_t ctl_getenqueuespace(kern_ctl_ref kctlref, u_int32_t unit, size_t *space) { - struct ctl_cb *kcb; struct kctl *kctl = (struct kctl *)kctlref; struct socket *so; long avail; if (kctlref == NULL || space == NULL) return EINVAL; - - kcb = kcb_find(kctl, unit); - if (kcb == NULL) - return EINVAL; - so = (struct socket *)kcb->so; - if (so == NULL) + so = kcb_find_socket(kctl, unit); + if (so == NULL) return EINVAL; - socket_lock(so, 1); avail = sbspace(&so->so_rcv); *space = (avail < 0) ? 0 : avail; socket_unlock(so, 1); - + return 0; } @@ -624,6 +627,9 @@ ctl_ctloutput(struct socket *so, struct sockopt *sopt) socket_unlock(so, 0); error = (*kctl->getopt)(kcb->kctl, kcb->unit, kcb->userdata, sopt->sopt_name, data, &len); + if (data != NULL && len > sopt->sopt_valsize) + panic_plain("ctl_ctloutput: ctl %s returned len (%lu) > sopt_valsize (%lu)\n", + kcb->kctl->name, len, sopt->sopt_valsize); socket_lock(so, 0); if (error == 0) { if (data != NULL) @@ -858,6 +864,46 @@ ctl_find_by_name(const char *name) return NULL; } +u_int32_t +ctl_id_by_name(const char *name) +{ + u_int32_t ctl_id = 0; + + lck_mtx_lock(ctl_mtx); + struct kctl *kctl = ctl_find_by_name(name); + if (kctl) ctl_id = kctl->id; + lck_mtx_unlock(ctl_mtx); + + return ctl_id; +} + +errno_t +ctl_name_by_id( + u_int32_t id, + char *out_name, + size_t maxsize) +{ + int found = 0; + + lck_mtx_lock(ctl_mtx); + struct kctl *kctl; + TAILQ_FOREACH(kctl, &ctl_head, next) { + if (kctl->id == id) + break; + } + + if (kctl && kctl->name) + { + if (maxsize > MAX_KCTL_NAME) + maxsize = MAX_KCTL_NAME; + strlcpy(out_name, kctl->name, maxsize); + found = 1; + } + lck_mtx_unlock(ctl_mtx); + + return found ? 
0 : ENOENT; +} + /* * Must be called with global ctl_mtx lock taked * @@ -885,21 +931,58 @@ kcb_find(struct kctl *kctl, u_int32_t unit) struct ctl_cb *kcb; TAILQ_FOREACH(kcb, &kctl->kcb_head, next) - if ((kcb->unit == unit)) + if (kcb->unit == unit) return kcb; return NULL; } -/* - * Must be called witout lock - */ +static struct socket * +kcb_find_socket(struct kctl *kctl, u_int32_t unit) +{ + struct socket *so = NULL; + + lck_mtx_lock(ctl_mtx); + struct ctl_cb *kcb = kcb_find(kctl, unit); + if (kcb && kcb->kctl == kctl) { + so = kcb->so; + if (so) { + kcb->usecount++; + } + } + lck_mtx_unlock(ctl_mtx); + + if (so == NULL) { + return NULL; + } + + socket_lock(so, 1); + + lck_mtx_lock(ctl_mtx); + if (kcb->kctl == NULL) + { + lck_mtx_unlock(ctl_mtx); + socket_unlock(so, 1); + so = NULL; + lck_mtx_lock(ctl_mtx); + } + kcb->usecount--; + if (kcb->usecount == 0) + wakeup((event_t)&kcb->usecount); + lck_mtx_unlock(ctl_mtx); + + return so; +} + static void ctl_post_msg(u_int32_t event_code, u_int32_t id) { struct ctl_event_data ctl_ev_data; struct kev_msg ev_msg; + lck_mtx_assert(ctl_mtx, LCK_MTX_ASSERT_NOTOWNED); + + bzero(&ev_msg, sizeof(struct kev_msg)); ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_SYSTEM_CLASS; diff --git a/bsd/kern/kern_core.c b/bsd/kern/kern_core.c index 52c0a3095..cf63621d9 100644 --- a/bsd/kern/kern_core.c +++ b/bsd/kern/kern_core.c @@ -70,24 +70,7 @@ typedef struct { mach_msg_type_number_t count; /* count of ints in this flavor */ } mythread_state_flavor_t; -#if defined (__ppc__) -/* 64 bit */ -mythread_state_flavor_t thread_flavor_array64[]={ - {PPC_THREAD_STATE64 , PPC_THREAD_STATE64_COUNT}, - {PPC_FLOAT_STATE, PPC_FLOAT_STATE_COUNT}, - {PPC_EXCEPTION_STATE64, PPC_EXCEPTION_STATE64_COUNT}, - {PPC_VECTOR_STATE, PPC_VECTOR_STATE_COUNT} - }; - -/* 32 bit */ -mythread_state_flavor_t thread_flavor_array[]={ - {PPC_THREAD_STATE , PPC_THREAD_STATE_COUNT}, - {PPC_FLOAT_STATE, PPC_FLOAT_STATE_COUNT}, - {PPC_EXCEPTION_STATE, PPC_EXCEPTION_STATE_COUNT}, - {PPC_VECTOR_STATE, PPC_VECTOR_STATE_COUNT} - }; - -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) mythread_state_flavor_t thread_flavor_array [] = { {x86_THREAD_STATE, x86_THREAD_STATE_COUNT}, {x86_FLOAT_STATE, x86_FLOAT_STATE_COUNT}, @@ -139,9 +122,6 @@ process_cpu_type(proc_t core_proc) } else { what_we_think = CPU_TYPE_I386; } -#elif defined (__ppc__) - #pragma unused(core_proc) - what_we_think = CPU_TYPE_POWERPC; #endif return what_we_think; } @@ -156,9 +136,6 @@ process_cpu_subtype(proc_t core_proc) } else { what_we_think = CPU_SUBTYPE_I386_ALL; } -#elif defined (__ppc__) - #pragma unused(core_proc) - what_we_think = CPU_SUBTYPE_POWERPC_ALL; #endif return what_we_think; } @@ -261,8 +238,8 @@ coredump(proc_t core_proc) if (do_coredump == 0 || /* Not dumping at all */ ( (sugid_coredump == 0) && /* Not dumping SUID/SGID binaries */ - ( (cred->cr_svuid != cred->cr_ruid) || - (cred->cr_svgid != cred->cr_rgid)))) { + ( (kauth_cred_getsvuid(cred) != kauth_cred_getruid(cred)) || + (kauth_cred_getsvgid(cred) != kauth_cred_getrgid(cred))))) { #if CONFIG_AUDIT audit_proc_coredump(core_proc, NULL, EFAULT); @@ -320,17 +297,8 @@ coredump(proc_t core_proc) thread_count = get_task_numacts(task); segment_count = get_vmmap_entries(map); /* XXX */ -#if defined (__ppc__) - if (is_64) { - tir1.flavor_count = sizeof(thread_flavor_array64)/sizeof(mythread_state_flavor_t); - bcopy(thread_flavor_array64, flavors,sizeof(thread_flavor_array64)); - } else { -#endif /* __ppc __ */ - 
tir1.flavor_count = sizeof(thread_flavor_array)/sizeof(mythread_state_flavor_t); - bcopy(thread_flavor_array, flavors,sizeof(thread_flavor_array)); -#if defined (__ppc__) - } -#endif /* __ppc __ */ + tir1.flavor_count = sizeof(thread_flavor_array)/sizeof(mythread_state_flavor_t); + bcopy(thread_flavor_array, flavors,sizeof(thread_flavor_array)); tstate_size = 0; for (i = 0; i < tir1.flavor_count; i++) tstate_size += sizeof(mythread_state_flavor_t) + diff --git a/bsd/kern/kern_credential.c b/bsd/kern/kern_credential.c index 4ab7311ae..484c86fff 100644 --- a/bsd/kern/kern_credential.c +++ b/bsd/kern/kern_credential.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2007 Apple Inc. All rights reserved. + * Copyright (c) 2004-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -52,6 +52,7 @@ #include <security/audit/audit.h> #include <sys/mount.h> +#include <sys/stat.h> /* For manifest constants in posix_cred_access */ #include <sys/sysproto.h> #include <sys/kern_callout.h> #include <mach/message.h> @@ -150,6 +151,7 @@ static int kauth_resolver_timeout = 30; /* default: 30 seconds */ struct kauth_resolver_work { TAILQ_ENTRY(kauth_resolver_work) kr_link; struct kauth_identity_extlookup kr_work; + uint64_t kr_extend; uint32_t kr_seqno; uint64_t kr_subtime; /* submission time */ int kr_refs; @@ -164,7 +166,7 @@ TAILQ_HEAD(kauth_resolver_unsubmitted_head, kauth_resolver_work) kauth_resolver_ TAILQ_HEAD(kauth_resolver_submitted_head, kauth_resolver_work) kauth_resolver_submitted; TAILQ_HEAD(kauth_resolver_done_head, kauth_resolver_work) kauth_resolver_done; -static int kauth_resolver_submit(struct kauth_identity_extlookup *lkp); +static int kauth_resolver_submit(struct kauth_identity_extlookup *lkp, uint64_t extend_data); static int kauth_resolver_complete(user_addr_t message); static int kauth_resolver_getwork(user_addr_t message); static int kauth_resolver_getwork2(user_addr_t message); @@ -246,21 +248,37 @@ kauth_resolver_init(void) * * Parameters: lkp A pointer to an external * lookup request + * extend_data extended data for kr_extend * * Returns: 0 Success * EWOULDBLOCK No resolver registered * EINTR Operation interrupted (e.g. by * a signal) * ENOMEM Could not allocate work item + * copyinstr:EFAULT Bad message from user space * workp->kr_result:??? An error from the user space * daemon (includes ENOENT!) * + * Implicit returns: + * *lkp Modified + * * Notes: Allocate a work queue entry, submit the work and wait for * the operation to either complete or time out. Outstanding * operations may also be cancelled. + * + * Submission is by means of placing the item on a work queue + * which is serviced by an external resolver thread calling + * into the kernel. The caller then sleeps until timeout, + * cancellation, or an external resolver thread calls in with + * a result message to kauth_resolver_complete(). All of these + * events wake the caller back up. + * + * This code is called from either kauth_cred_ismember_gid() + * for a group membership request, or it is called from + * kauth_cred_cache_lookup() when we get a cache miss. 
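+ *
+ *		A minimal submission sketch (el, uid, gid and error are
+ *		caller locals; this mirrors the group-membership case,
+ *		which passes no extended data):
+ *
+ *			struct kauth_identity_extlookup el;
+ *
+ *			bzero(&el, sizeof(el));
+ *			el.el_flags = KAUTH_EXTLOOKUP_VALID_UID |
+ *			    KAUTH_EXTLOOKUP_VALID_GID |
+ *			    KAUTH_EXTLOOKUP_WANT_MEMBERSHIP;
+ *			el.el_uid = uid;
+ *			el.el_gid = gid;
+ *			error = kauth_resolver_submit(&el, 0ULL);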
*/ static int -kauth_resolver_submit(struct kauth_identity_extlookup *lkp) +kauth_resolver_submit(struct kauth_identity_extlookup *lkp, uint64_t extend_data) { struct kauth_resolver_work *workp, *killp; struct timespec ts; @@ -294,6 +312,7 @@ kauth_resolver_submit(struct kauth_identity_extlookup *lkp) return(ENOMEM); workp->kr_work = *lkp; + workp->kr_extend = extend_data; workp->kr_refs = 1; workp->kr_flags = KAUTH_REQUEST_UNSUBMITTED; workp->kr_result = 0; @@ -307,11 +326,19 @@ kauth_resolver_submit(struct kauth_identity_extlookup *lkp) workp->kr_work.el_result = KAUTH_EXTLOOKUP_INPROG; /* - * XXX As an optimisation, we could check the queue for identical - * XXX items and coalesce them + * XXX We *MUST NOT* attempt to coelesce identical work items due to + * XXX the inability to ensure order of update of the request item + * XXX extended data vs. the wakeup; instead, we let whoever is waiting + * XXX for each item repeat the update when they wake up. */ TAILQ_INSERT_TAIL(&kauth_resolver_unsubmitted, workp, kr_link); + /* + * Wake up an external resolver thread to deal with the new work; one + * may not be available, and if not, then the request will be grabed + * when a resolver thread comes back into the kernel to request new + * work. + */ wakeup_one((caddr_t)&kauth_resolver_unsubmitted); for (;;) { /* we could compute a better timeout here */ @@ -332,8 +359,9 @@ kauth_resolver_submit(struct kauth_identity_extlookup *lkp) } /* - * Update the moving average of how long it took; if it took longer - * than the time threshold, then we complain about it being slow. + * Update the moving average of how long the request took; if it + * took longer than the time threshold, then we complain about it + * being slow. */ duration = mach_absolute_time() - workp->kr_subtime; if (kco_ma_addsample(&resolver_ma, duration)) { @@ -401,15 +429,19 @@ kauth_resolver_submit(struct kauth_identity_extlookup *lkp) /* someone else still has a reference on this request */ shouldfree = 0; } + /* collect request result */ - if (error == 0) + if (error == 0) { error = workp->kr_result; + } KAUTH_RESOLVER_UNLOCK(); + /* * If we dropped the last reference, free the request. */ - if (shouldfree) + if (shouldfree) { FREE(workp, M_KAUTH); + } KAUTH_DEBUG("RESOLVER - returning %d", error); return(error); @@ -473,7 +505,7 @@ identitysvc(__unused struct proc *p, struct identitysvc_args *uap, __unused int3 * Allow user space resolver to override the * external resolution timeout */ - if (message >= 30 && message <= 10000) { + if (message > 30 && message < 10000) { kauth_resolver_timeout = message; KAUTH_DEBUG("RESOLVER - new resolver changes timeout to %d seconds\n", (int)message); } @@ -625,10 +657,54 @@ kauth_resolver_getwork2(user_addr_t message) */ workp = TAILQ_FIRST(&kauth_resolver_unsubmitted); - if ((error = copyout(&workp->kr_work, message, sizeof(workp->kr_work))) != 0) { + /* + * Copy out the external lookup structure for the request, not + * including the el_extend field, which contains the address of the + * external buffer provided by the external resolver into which we + * copy the extension request information. 
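+	 *
+	 * The copy is therefore split around el_extend (a 64-bit,
+	 * resolver-owned user pointer): everything before the field
+	 * goes out first, then everything from el_info_reserved_1
+	 * onward, as the two copyout() calls below show.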
+ */ + /* BEFORE FIELD */ + if ((error = copyout(&workp->kr_work, message, offsetof(struct kauth_identity_extlookup, el_extend))) != 0) { + KAUTH_DEBUG("RESOLVER - error submitting work to resolve"); + goto out; + } + /* AFTER FIELD */ + if ((error = copyout(&workp->kr_work.el_info_reserved_1, + message + offsetof(struct kauth_identity_extlookup, el_info_reserved_1), + sizeof(struct kauth_identity_extlookup) - offsetof(struct kauth_identity_extlookup, el_info_reserved_1))) != 0) { KAUTH_DEBUG("RESOLVER - error submitting work to resolve"); goto out; } + + /* + * Handle extended requests here; if we have a request of a type where + * the kernel wants a translation of extended information, then we need + * to copy it out into the extended buffer, assuming the buffer is + * valid; we only attempt to get the buffer address if we have request + * data to copy into it. + */ + + /* + * translate a user@domain string into a uid/gid/whatever + */ + if (workp->kr_work.el_flags & (KAUTH_EXTLOOKUP_VALID_PWNAM | KAUTH_EXTLOOKUP_VALID_GRNAM)) { + uint64_t uaddr; + + error = copyin(message + offsetof(struct kauth_identity_extlookup, el_extend), &uaddr, sizeof(uaddr)); + if (!error) { + size_t actual; /* not used */ + /* + * Use copyoutstr() to reduce the copy size; we let + * this catch a NULL uaddr because we shouldn't be + * asking in that case anyway. + */ + error = copyoutstr(CAST_DOWN(void *,workp->kr_extend), uaddr, MAXPATHLEN, &actual); + } + if (error) { + KAUTH_DEBUG("RESOLVER - error submitting work to resolve"); + goto out; + } + } TAILQ_REMOVE(&kauth_resolver_unsubmitted, workp, kr_link); workp->kr_flags &= ~KAUTH_REQUEST_UNSUBMITTED; workp->kr_flags |= KAUTH_REQUEST_SUBMITTED; @@ -706,6 +782,10 @@ kauth_resolver_complete(user_addr_t message) struct kauth_resolver_work *killp; int error, result; + /* + * Copy in the mesage, including the extension field, since we are + * copying into a local variable. + */ if ((error = copyin(message, &extl, sizeof(extl))) != 0) { KAUTH_DEBUG("RESOLVER - error getting completed work\n"); return(error); @@ -771,22 +851,66 @@ kauth_resolver_complete(user_addr_t message) } /* - * In the case of a fatal error, we assume that the resolver will restart - * quickly and re-collect all of the outstanding requests. Thus, we don't - * complete the request which returned the fatal error status. + * In the case of a fatal error, we assume that the resolver will + * restart quickly and re-collect all of the outstanding requests. + * Thus, we don't complete the request which returned the fatal + * error status. */ if (extl.el_result != KAUTH_EXTLOOKUP_FATAL) { /* scan our list for this request */ TAILQ_FOREACH(workp, &kauth_resolver_submitted, kr_link) { /* found it? */ if (workp->kr_seqno == extl.el_seqno) { - /* copy result */ - workp->kr_work = extl; - /* move onto completed list and wake up requester(s) */ + + /* + * Get the request of the submitted queue so + * that it is not cleaned up out from under + * us by a timeout. + */ TAILQ_REMOVE(&kauth_resolver_submitted, workp, kr_link); workp->kr_flags &= ~KAUTH_REQUEST_SUBMITTED; workp->kr_flags |= KAUTH_REQUEST_DONE; workp->kr_result = result; + + /* Copy the result message to the work item. */ + memcpy(&workp->kr_work, &extl, sizeof(struct kauth_identity_extlookup)); + + /* + * Check if we have a result in the extension + * field; if we do, then we need to separately + * copy the data from the message el_extend + * into the request buffer that's in the work + * item. 
We have to do it here because we do + * not want to wake up the waiter until the + * data is in their buffer, and because the + * actual request response may be destroyed + * by the time the requester wakes up, and they + * do not have access to the user space buffer + * address. + * + * It is safe to drop and reacquire the lock + * here because we've already removed the item + * from the submission queue, but have not yet + * moved it to the completion queue. Note that + * near simultaneous requests may result in + * duplication of requests for items in this + * window. This should not be a performance + * issue and is easily detectable by comparing + * time to live on last response vs. time of + * next request in the resolver logs. + */ + if (extl.el_flags & (KAUTH_EXTLOOKUP_VALID_PWNAM|KAUTH_EXTLOOKUP_VALID_GRNAM)) { + size_t actual; /* notused */ + + KAUTH_RESOLVER_UNLOCK(); + error = copyinstr(extl.el_extend, CAST_DOWN(void *, workp->kr_extend), MAXPATHLEN, &actual); + KAUTH_RESOLVER_LOCK(); + } + + /* + * Move the completed work item to the + * completion queue and wake up requester(s) + */ TAILQ_INSERT_TAIL(&kauth_resolver_done, workp, kr_link); wakeup(workp); break; @@ -814,14 +938,18 @@ struct kauth_identity { #define KI_VALID_GID (1<<1) #define KI_VALID_GUID (1<<2) #define KI_VALID_NTSID (1<<3) +#define KI_VALID_PWNAM (1<<4) /* Used for translation */ +#define KI_VALID_GRNAM (1<<5) /* Used for translation */ uid_t ki_uid; gid_t ki_gid; guid_t ki_guid; ntsid_t ki_ntsid; + const char *ki_name; /* string name from string cache */ /* - * Expiry times are the earliest time at which we will disregard the cached state and go to - * userland. Before then if the valid bit is set, we will return the cached value. If it's - * not set, we will not go to userland to resolve, just assume that there is no answer + * Expiry times are the earliest time at which we will disregard the + * cached state and go to userland. Before then if the valid bit is + * set, we will return the cached value. If it's not set, we will + * not go to userland to resolve, just assume that there is no answer * available. 
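+ *
+ * Expiry-test sketch (the _expired() helpers are assumed to compare
+ * these fields against microuptime seconds):
+ *
+ *	microuptime(&tv);
+ *	expired = (kip->ki_guid_expiry <= tv.tv_sec);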
*/ time_t ki_guid_expiry; @@ -838,16 +966,17 @@ static lck_mtx_t *kauth_identity_mtx; static struct kauth_identity *kauth_identity_alloc(uid_t uid, gid_t gid, guid_t *guidp, time_t guid_expiry, - ntsid_t *ntsidp, time_t ntsid_expiry); + ntsid_t *ntsidp, time_t ntsid_expiry, const char *name, int nametype); static void kauth_identity_register_and_free(struct kauth_identity *kip); -static void kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_identity *kip); +static void kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_identity *kip, uint64_t extend_data); static void kauth_identity_lru(struct kauth_identity *kip); static int kauth_identity_guid_expired(struct kauth_identity *kip); static int kauth_identity_ntsid_expired(struct kauth_identity *kip); -static int kauth_identity_find_uid(uid_t uid, struct kauth_identity *kir); -static int kauth_identity_find_gid(gid_t gid, struct kauth_identity *kir); -static int kauth_identity_find_guid(guid_t *guidp, struct kauth_identity *kir); -static int kauth_identity_find_ntsid(ntsid_t *ntsid, struct kauth_identity *kir); +static int kauth_identity_find_uid(uid_t uid, struct kauth_identity *kir, char *getname); +static int kauth_identity_find_gid(gid_t gid, struct kauth_identity *kir, char *getname); +static int kauth_identity_find_guid(guid_t *guidp, struct kauth_identity *kir, char *getname); +static int kauth_identity_find_ntsid(ntsid_t *ntsid, struct kauth_identity *kir, char *getname); +static int kauth_identity_find_nam(char *name, int valid, struct kauth_identity *kir); /* @@ -888,11 +1017,11 @@ kauth_identity_init(void) * structure, filled in * * Notes: It is illegal to translate between UID and GID; any given UUID - * or NTSID can oly refer to an NTSIDE or UUID (respectively), + * or NTSID can only refer to an NTSID or UUID (respectively), * and *either* a UID *or* a GID, but not both. */ static struct kauth_identity * -kauth_identity_alloc(uid_t uid, gid_t gid, guid_t *guidp, time_t guid_expiry, ntsid_t *ntsidp, time_t ntsid_expiry) +kauth_identity_alloc(uid_t uid, gid_t gid, guid_t *guidp, time_t guid_expiry, ntsid_t *ntsidp, time_t ntsid_expiry, const char *name, int nametype) { struct kauth_identity *kip; @@ -919,6 +1048,10 @@ kauth_identity_alloc(uid_t uid, gid_t gid, guid_t *guidp, time_t guid_expiry, nt kip->ki_valid |= KI_VALID_NTSID; } kip->ki_ntsid_expiry = ntsid_expiry; + if (name != NULL) { + kip->ki_name = name; + kip->ki_valid |= nametype; + } } return(kip); } @@ -928,7 +1061,7 @@ kauth_identity_alloc(uid_t uid, gid_t gid, guid_t *guidp, time_t guid_expiry, nt * kauth_identity_register_and_free * * Description: Register an association between identity tokens. The passed - * 'kip' is freed by this function. + * 'kip' is consumed by this function. 
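+ *
+ * Typical hand-off, in sketch form (argument values are placeholders):
+ *
+ *	kip = kauth_identity_alloc(uid, gid, guidp, guid_expiry,
+ *	    ntsidp, ntsid_expiry, name, KI_VALID_PWNAM);
+ *	if (kip != NULL)
+ *		kauth_identity_register_and_free(kip);
+ *
+ * after which kip must not be referenced again.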
* * Parameters: kip Pointer to kauth_identity * structure to register @@ -975,11 +1108,22 @@ kauth_identity_register_and_free(struct kauth_identity *kip) ip->ki_valid |= KI_VALID_NTSID; } ip->ki_ntsid_expiry = kip->ki_ntsid_expiry; - /* and discard the incoming identity */ - FREE(kip, M_KAUTH); - ip = NULL; + /* a valid ki_name field overwrites the previous name field */ + if (kip->ki_valid & (KI_VALID_PWNAM | KI_VALID_GRNAM)) { + /* if there's an old one, discard it */ + const char *oname = NULL; + if (ip->ki_valid & (KI_VALID_PWNAM | KI_VALID_GRNAM)) + oname = ip->ki_name; + ip->ki_name = kip->ki_name; + kip->ki_name = oname; + } + /* and discard the incoming entry */ + ip = kip; } else { - /* don't have any information on this identity, so just add it */ + /* + * if we don't have any information on this identity, add it; + * if it pushes us over our limit, discard the oldest one. + */ TAILQ_INSERT_HEAD(&kauth_identities, kip, ki_link); if (++kauth_identity_count > KAUTH_IDENTITY_CACHEMAX) { ip = TAILQ_LAST(&kauth_identities, kauth_identity_head); @@ -988,9 +1132,14 @@ kauth_identity_register_and_free(struct kauth_identity *kip) } } KAUTH_IDENTITY_UNLOCK(); - /* have to drop lock before freeing expired entry */ - if (ip != NULL) + /* have to drop lock before freeing expired entry (it may be in use) */ + if (ip != NULL) { + /* if the ki_name field is used, clear it first */ + if (ip->ki_valid & (KI_VALID_PWNAM | KI_VALID_GRNAM)) + vfs_removename(ip->ki_name); + /* free the expired entry */ FREE(ip, M_KAUTH); + } } @@ -998,25 +1147,51 @@ kauth_identity_register_and_free(struct kauth_identity *kip) * kauth_identity_updatecache * * Description: Given a lookup result, add any associations that we don't - * currently have. + * currently have; replace ones which have changed. * * Parameters: elp External lookup result from * user space daemon to kernel * rkip pointer to returned kauth * identity, or NULL + * extend_data Extended data (can vary) * * Returns: (void) * * Implicit returns: * *rkip Modified (if non-NULL) + * + * Notes: For extended information requests, this code relies on the fact + * that elp->el_flags is never used as an rvalue, and is only + * ever bit-tested for valid lookup information we are willing + * to cache. + * + * XXX: We may have to do the same in the case that extended data was + * passed out to user space to ensure that the request string + * gets cached; we may also be able to use the rkip as an + * input to avoid this. The jury is still out. + * + * XXX: This codes performance could be improved for multiple valid + * results by combining the loop iteration in a single loop. */ static void -kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_identity *rkip) +kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_identity *rkip, uint64_t extend_data) { struct timeval tv; struct kauth_identity *kip; + const char *speculative_name = NULL; microuptime(&tv); + + /* + * If there is extended data, and that data represents a name rather + * than something else, speculatively create an entry for it in the + * string cache. We do this to avoid holding the KAUTH_IDENTITY_LOCK + * over the allocation later. + */ + if (elp->el_flags & (KAUTH_EXTLOOKUP_VALID_PWNAM | KAUTH_EXTLOOKUP_VALID_GRNAM)) { + const char *tmp = CAST_DOWN(const char *,extend_data); + speculative_name = vfs_addname(tmp, strnlen(tmp, MAXPATHLEN - 1), 0, 0); + } /* user identity? 
*/ if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_UID) { @@ -1034,6 +1209,19 @@ kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_id kip->ki_valid |= KI_VALID_NTSID; } kip->ki_ntsid_expiry = tv.tv_sec + elp->el_usid_valid; + if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_PWNAM) { + const char *oname = kip->ki_name; + kip->ki_name = speculative_name; + speculative_name = NULL; + kip->ki_valid |= KI_VALID_PWNAM; + if (oname) { + /* + * free oname (if any) outside + * the lock + */ + speculative_name = oname; + } + } kauth_identity_lru(kip); if (rkip != NULL) *rkip = *kip; @@ -1048,18 +1236,22 @@ kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_id (elp->el_flags & KAUTH_EXTLOOKUP_VALID_UGUID) ? &elp->el_uguid : NULL, tv.tv_sec + elp->el_uguid_valid, (elp->el_flags & KAUTH_EXTLOOKUP_VALID_USID) ? &elp->el_usid : NULL, - tv.tv_sec + elp->el_usid_valid); + tv.tv_sec + elp->el_usid_valid, + (elp->el_flags & KAUTH_EXTLOOKUP_VALID_PWNAM) ? speculative_name : NULL, + KI_VALID_PWNAM); if (kip != NULL) { if (rkip != NULL) *rkip = *kip; + if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_PWNAM) + speculative_name = NULL; KAUTH_DEBUG("CACHE - learned %d is " K_UUID_FMT, kip->ki_uid, K_UUID_ARG(kip->ki_guid)); kauth_identity_register_and_free(kip); } } } - /* group identity? */ - if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_GID) { + /* group identity? (ignore, if we already processed it as a user) */ + if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_GID && !(elp->el_flags & KAUTH_EXTLOOKUP_VALID_UID)) { KAUTH_IDENTITY_LOCK(); TAILQ_FOREACH(kip, &kauth_identities, ki_link) { /* matching record */ @@ -1074,6 +1266,19 @@ kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_id kip->ki_valid |= KI_VALID_NTSID; } kip->ki_ntsid_expiry = tv.tv_sec + elp->el_gsid_valid; + if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_GRNAM) { + const char *oname = kip->ki_name; + kip->ki_name = speculative_name; + speculative_name = NULL; + kip->ki_valid |= KI_VALID_GRNAM; + if (oname) { + /* + * free oname (if any) outside + * the lock + */ + speculative_name = oname; + } + } kauth_identity_lru(kip); if (rkip != NULL) *rkip = *kip; @@ -1088,16 +1293,24 @@ kauth_identity_updatecache(struct kauth_identity_extlookup *elp, struct kauth_id (elp->el_flags & KAUTH_EXTLOOKUP_VALID_GGUID) ? &elp->el_gguid : NULL, tv.tv_sec + elp->el_gguid_valid, (elp->el_flags & KAUTH_EXTLOOKUP_VALID_GSID) ? &elp->el_gsid : NULL, - tv.tv_sec + elp->el_gsid_valid); + tv.tv_sec + elp->el_gsid_valid, + (elp->el_flags & KAUTH_EXTLOOKUP_VALID_GRNAM) ? 
speculative_name : NULL, + KI_VALID_GRNAM); if (kip != NULL) { if (rkip != NULL) *rkip = *kip; + if (elp->el_flags & KAUTH_EXTLOOKUP_VALID_GRNAM) + speculative_name = NULL; KAUTH_DEBUG("CACHE - learned %d is " K_UUID_FMT, kip->ki_uid, K_UUID_ARG(kip->ki_guid)); kauth_identity_register_and_free(kip); } } } + /* If we have a name reference to drop, drop it here */ + if (speculative_name != NULL) { + vfs_removename(speculative_name); + } } @@ -1179,6 +1392,7 @@ kauth_identity_ntsid_expired(struct kauth_identity *kip) * * Parameters: uid UID to find * kir Pointer to return area + * getname Name buffer, if ki_name wanted * * Returns: 0 Found * ENOENT Not found @@ -1187,7 +1401,7 @@ kauth_identity_ntsid_expired(struct kauth_identity *kip) * *klr Modified, if found */ static int -kauth_identity_find_uid(uid_t uid, struct kauth_identity *kir) +kauth_identity_find_uid(uid_t uid, struct kauth_identity *kir, char *getname) { struct kauth_identity *kip; @@ -1197,6 +1411,9 @@ kauth_identity_find_uid(uid_t uid, struct kauth_identity *kir) kauth_identity_lru(kip); /* Copy via structure assignment */ *kir = *kip; + /* If a name is wanted and one exists, copy it out */ + if (getname != NULL && (kip->ki_valid & (KI_VALID_PWNAM | KI_VALID_GRNAM))) + strlcpy(getname, kip->ki_name, MAXPATHLEN); break; } } @@ -1206,12 +1423,13 @@ kauth_identity_find_uid(uid_t uid, struct kauth_identity *kir) /* - * kauth_identity_find_uid + * kauth_identity_find_gid * * Description: Search for an entry by GID * * Parameters: gid GID to find * kir Pointer to return area + * getname Name buffer, if ki_name wanted * * Returns: 0 Found * ENOENT Not found @@ -1220,7 +1438,7 @@ kauth_identity_find_uid(uid_t uid, struct kauth_identity *kir) * *klr Modified, if found */ static int -kauth_identity_find_gid(uid_t gid, struct kauth_identity *kir) +kauth_identity_find_gid(uid_t gid, struct kauth_identity *kir, char *getname) { struct kauth_identity *kip; @@ -1230,6 +1448,9 @@ kauth_identity_find_gid(uid_t gid, struct kauth_identity *kir) kauth_identity_lru(kip); /* Copy via structure assignment */ *kir = *kip; + /* If a name is wanted and one exists, copy it out */ + if (getname != NULL && (kip->ki_valid & (KI_VALID_PWNAM | KI_VALID_GRNAM))) + strlcpy(getname, kip->ki_name, MAXPATHLEN); break; } } @@ -1245,6 +1466,7 @@ kauth_identity_find_gid(uid_t gid, struct kauth_identity *kir) * * Parameters: guidp Pointer to GUID to find * kir Pointer to return area + * getname Name buffer, if ki_name wanted * * Returns: 0 Found * ENOENT Not found @@ -1256,13 +1478,49 @@ kauth_identity_find_gid(uid_t gid, struct kauth_identity *kir) * may elect to call out to userland to revalidate. */ static int -kauth_identity_find_guid(guid_t *guidp, struct kauth_identity *kir) +kauth_identity_find_guid(guid_t *guidp, struct kauth_identity *kir, char *getname) { struct kauth_identity *kip; KAUTH_IDENTITY_LOCK(); TAILQ_FOREACH(kip, &kauth_identities, ki_link) { if ((kip->ki_valid & KI_VALID_GUID) && (kauth_guid_equal(guidp, &kip->ki_guid))) { + kauth_identity_lru(kip); + /* Copy via structure assignment */ + *kir = *kip; + /* If a name is wanted and one exists, copy it out */ + if (getname != NULL && (kip->ki_valid & (KI_VALID_PWNAM | KI_VALID_GRNAM))) + strlcpy(getname, kip->ki_name, MAXPATHLEN); + break; + } + } + KAUTH_IDENTITY_UNLOCK(); + return((kip == NULL) ? 
ENOENT : 0); +} + +/* + * kauth_identity_find_nam + * + * Description: Search for an entry by name + * + * Parameters: name Pointer to name to find + * valid KI_VALID_PWNAM or KI_VALID_GRNAM + * kir Pointer to return aread + * + * Returns: 0 Found + * ENOENT Not found + * + * Implicit returns: + * *klr Modified, if found + */ +static int +kauth_identity_find_nam(char *name, int valid, struct kauth_identity *kir) +{ + struct kauth_identity *kip; + + KAUTH_IDENTITY_LOCK(); + TAILQ_FOREACH(kip, &kauth_identities, ki_link) { + if ((kip->ki_valid & valid) && !strcmp(name, kip->ki_name)) { kauth_identity_lru(kip); /* Copy via structure assignment */ *kir = *kip; @@ -1281,6 +1539,7 @@ kauth_identity_find_guid(guid_t *guidp, struct kauth_identity *kir) * * Parameters: ntsid Pointer to NTSID to find * kir Pointer to return area + * getname Name buffer, if ki_name wanted * * Returns: 0 Found * ENOENT Not found @@ -1292,7 +1551,7 @@ kauth_identity_find_guid(guid_t *guidp, struct kauth_identity *kir) * may elect to call out to userland to revalidate. */ static int -kauth_identity_find_ntsid(ntsid_t *ntsid, struct kauth_identity *kir) +kauth_identity_find_ntsid(ntsid_t *ntsid, struct kauth_identity *kir, char *getname) { struct kauth_identity *kip; @@ -1302,6 +1561,9 @@ kauth_identity_find_ntsid(ntsid_t *ntsid, struct kauth_identity *kir) kauth_identity_lru(kip); /* Copy via structure assignment */ *kir = *kip; + /* If a name is wanted and one exists, copy it out */ + if (getname != NULL && (kip->ki_valid & (KI_VALID_PWNAM | KI_VALID_GRNAM))) + strlcpy(getname, kip->ki_name, MAXPATHLEN); break; } } @@ -1351,7 +1613,7 @@ int kauth_wellknown_guid(guid_t *guid) { static char fingerprint[] = {0xab, 0xcd, 0xef, 0xab, 0xcd, 0xef, 0xab, 0xcd, 0xef, 0xab, 0xcd, 0xef}; - int code; + uint32_t code; /* * All WKGs begin with the same 12 bytes. */ @@ -1359,7 +1621,7 @@ kauth_wellknown_guid(guid_t *guid) /* * The final 4 bytes are our code (in network byte order). */ - code = OSSwapHostToBigInt32(*(u_int32_t *)&guid->g_guid[12]); + code = OSSwapHostToBigInt32(*(uint32_t *)&guid->g_guid[12]); switch(code) { case 0x0000000c: return(KAUTH_WKG_EVERYBODY); @@ -1445,16 +1707,17 @@ kauth_cred_change_egid(kauth_cred_t cred, gid_t new_egid) #if radar_4600026 int is_member; #endif /* radar_4600026 */ - gid_t old_egid = cred->cr_groups[0]; + gid_t old_egid = kauth_cred_getgid(cred); + posix_cred_t pcred = posix_cred_get(cred); /* Ignoring the first entry, scan for a match for the new egid */ - for (i = 1; i < cred->cr_ngroups; i++) { + for (i = 1; i < pcred->cr_ngroups; i++) { /* * If we find a match, swap them so we don't lose overall * group information */ - if (cred->cr_groups[i] == new_egid) { - cred->cr_groups[i] = old_egid; + if (pcred->cr_groups[i] == new_egid) { + pcred->cr_groups[i] = old_egid; DEBUG_CRED_CHANGE("kauth_cred_change_egid: unset displaced\n"); displaced = 0; break; @@ -1480,7 +1743,7 @@ conservative approach (i.e. less likely to cause things to break). * * NB: This is typically a cold code path. */ - if (displaced && !(cred->cr_flags & CRF_NOMEMBERD) && + if (displaced && !(pcred->cr_flags & CRF_NOMEMBERD) && kauth_cred_ismember_gid(cred, new_egid, &is_member) == 0 && is_member) { displaced = 0; @@ -1489,7 +1752,7 @@ conservative approach (i.e. less likely to cause things to break). 
#endif /* radar_4600026 */ /* set the new EGID into the old spot */ - cred->cr_groups[0] = new_egid; + pcred->cr_groups[0] = new_egid; return (displaced); } @@ -1508,7 +1771,41 @@ uid_t kauth_cred_getuid(kauth_cred_t cred) { NULLCRED_CHECK(cred); - return(cred->cr_uid); + return(posix_cred_get(cred)->cr_uid); +} + + +/* + * kauth_cred_getruid + * + * Description: Fetch RUID from credential + * + * Parameters: cred Credential to examine + * + * Returns: (uid_t) RUID associated with credential + */ +uid_t +kauth_cred_getruid(kauth_cred_t cred) +{ + NULLCRED_CHECK(cred); + return(posix_cred_get(cred)->cr_ruid); +} + + +/* + * kauth_cred_getsvuid + * + * Description: Fetch SVUID from credential + * + * Parameters: cred Credential to examine + * + * Returns: (uid_t) SVUID associated with credential + */ +uid_t +kauth_cred_getsvuid(kauth_cred_t cred) +{ + NULLCRED_CHECK(cred); + return(posix_cred_get(cred)->cr_svuid); } @@ -1521,11 +1818,139 @@ kauth_cred_getuid(kauth_cred_t cred) * * Returns: (gid_t) GID associated with credential */ -uid_t +gid_t kauth_cred_getgid(kauth_cred_t cred) { NULLCRED_CHECK(cred); - return(cred->cr_gid); + return(posix_cred_get(cred)->cr_gid); +} + + +/* + * kauth_cred_getrgid + * + * Description: Fetch RGID from credential + * + * Parameters: cred Credential to examine + * + * Returns: (gid_t) RGID associated with credential + */ +gid_t +kauth_cred_getrgid(kauth_cred_t cred) +{ + NULLCRED_CHECK(cred); + return(posix_cred_get(cred)->cr_rgid); +} + + +/* + * kauth_cred_getsvgid + * + * Description: Fetch SVGID from credential + * + * Parameters: cred Credential to examine + * + * Returns: (gid_t) SVGID associated with credential + */ +gid_t +kauth_cred_getsvgid(kauth_cred_t cred) +{ + NULLCRED_CHECK(cred); + return(posix_cred_get(cred)->cr_svgid); +} + + +/* + * kauth_cred_guid2pwnam + * + * Description: Fetch PWNAM from GUID + * + * Parameters: guidp Pointer to GUID to examine + * pwnam Pointer to user@domain buffer + * + * Returns: 0 Success + * kauth_cred_cache_lookup:EINVAL + * + * Implicit returns: + * *pwnam Modified, if successful + * + * Notes: pwnam is assumed to point to a buffer of MAXPATHLEN in size + */ +int +kauth_cred_guid2pwnam(guid_t *guidp, char *pwnam) +{ + return(kauth_cred_cache_lookup(KI_VALID_GUID, KI_VALID_PWNAM, guidp, pwnam)); +} + + +/* + * kauth_cred_guid2grnam + * + * Description: Fetch GRNAM from GUID + * + * Parameters: guidp Pointer to GUID to examine + * grnam Pointer to group@domain buffer + * + * Returns: 0 Success + * kauth_cred_cache_lookup:EINVAL + * + * Implicit returns: + * *grnam Modified, if successful + * + * Notes: grnam is assumed to point to a buffer of MAXPATHLEN in size + */ +int +kauth_cred_guid2grnam(guid_t *guidp, char *grnam) +{ + return(kauth_cred_cache_lookup(KI_VALID_GUID, KI_VALID_GRNAM, guidp, grnam)); +} + + +/* + * kauth_cred_pwnam2guid + * + * Description: Fetch PWNAM from GUID + * + * Parameters: pwnam String containing user@domain + * guidp Pointer to buffer for GUID + * + * Returns: 0 Success + * kauth_cred_cache_lookup:EINVAL + * + * Implicit returns: + * *guidp Modified, if successful + * + * Notes: pwnam should not point to a request larger than MAXPATHLEN + * bytes in size, including the NUL termination of the string. 
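+ *
+ *		Hypothetical call (identifier values invented):
+ *
+ *			char pwnam[MAXPATHLEN] = "user@EXAMPLE.COM";
+ *			guid_t guid;
+ *			int error = kauth_cred_pwnam2guid(pwnam, &guid);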
+ */ +int +kauth_cred_pwnam2guid(char *pwnam, guid_t *guidp) +{ + return(kauth_cred_cache_lookup(KI_VALID_PWNAM, KI_VALID_GUID, pwnam, guidp)); +} + + +/* + * kauth_cred_grnam2guid + * + * Description: Fetch GRNAM from GUID + * + * Parameters: grnam String containing group@domain + * guidp Pointer to buffer for GUID + * + * Returns: 0 Success + * kauth_cred_cache_lookup:EINVAL + * + * Implicit returns: + * *guidp Modified, if successful + * + * Notes: grnam should not point to a request larger than MAXPATHLEN + * bytes in size, including the NUL termination of the string. + */ +int +kauth_cred_grnam2guid(char *grnam, guid_t *guidp) +{ + return(kauth_cred_cache_lookup(KI_VALID_GRNAM, KI_VALID_GUID, grnam, guidp)); } @@ -1806,27 +2231,40 @@ kauth_cred_cache_lookup(int from, int to, void *src, void *dst) struct kauth_identity ki; struct kauth_identity_extlookup el; int error; + uint64_t extend_data = 0ULL; int (* expired)(struct kauth_identity *kip); + char *namebuf = NULL; KAUTH_DEBUG("CACHE - translate %d to %d", from, to); /* * Look for an existing cache entry for this association. * If the entry has not expired, return the cached information. + * We do not cache user@domain translations here; they use too + * much memory to hold onto forever, and can not be updated + * atomically. */ + if (to == KI_VALID_PWNAM || to == KI_VALID_GRNAM) { + namebuf = dst; + } ki.ki_valid = 0; switch(from) { case KI_VALID_UID: - error = kauth_identity_find_uid(*(uid_t *)src, &ki); + error = kauth_identity_find_uid(*(uid_t *)src, &ki, namebuf); break; case KI_VALID_GID: - error = kauth_identity_find_gid(*(gid_t *)src, &ki); + error = kauth_identity_find_gid(*(gid_t *)src, &ki, namebuf); break; case KI_VALID_GUID: - error = kauth_identity_find_guid((guid_t *)src, &ki); + error = kauth_identity_find_guid((guid_t *)src, &ki, namebuf); break; case KI_VALID_NTSID: - error = kauth_identity_find_ntsid((ntsid_t *)src, &ki); + error = kauth_identity_find_ntsid((ntsid_t *)src, &ki, namebuf); + break; + case KI_VALID_PWNAM: + case KI_VALID_GRNAM: + /* Names are unique in their 'from' space */ + error = kauth_identity_find_nam((char *)src, from, &ki); break; default: return(EINVAL); @@ -1862,7 +2300,7 @@ kauth_cred_cache_lookup(int from, int to, void *src, void *dst) expired = NULL; } } - KAUTH_DEBUG("CACHE - found matching entry with valid %d", ki.ki_valid); + KAUTH_DEBUG("CACHE - found matching entry with valid 0x%08x", ki.ki_valid); /* * If no expiry function, or not expired, we have found * a hit. @@ -1882,13 +2320,33 @@ kauth_cred_cache_lookup(int from, int to, void *src, void *dst) * a better-than nothing alternative. */ KAUTH_DEBUG("CACHE - expired entry found"); + } else { + /* + * A guid can't both match a uid and a gid, so if we + * found a cache entry while looking for one or the + * other from a guid, the 'from' is KI_VALID_GUID, + * and the 'to' is one, and the other one is valid, + * then we immediately return ENOENT without calling + * the resolver again. + */ + if (from == KI_VALID_GUID && + (((ki.ki_valid & KI_VALID_UID) && + to == KI_VALID_GID) || + ((ki.ki_valid & KI_VALID_GID) && + to == KI_VALID_UID))) { + return (ENOENT); + } } } /* * We failed to find a cache entry; call the resolver. * - * Note: We ask for as much data as we can get. + * Note: We ask for as much non-extended data as we can get, + * and only provide (or ask for) extended information if + * we have a 'from' (or 'to') which requires it. This + * way we don't pay for the extra transfer overhead for + * data we don't need. 
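+	 *
+	 * For a name result, for instance, the code below simply adds
+	 * (dst being the caller's MAXPATHLEN buffer):
+	 *
+	 *	el.el_flags |= KAUTH_EXTLOOKUP_WANT_PWNAM;
+	 *	extend_data = CAST_USER_ADDR_T(dst);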
*/ bzero(&el, sizeof(el)); el.el_info_pid = current_proc()->p_pid; @@ -1911,6 +2369,16 @@ kauth_cred_cache_lookup(int from, int to, void *src, void *dst) el.el_usid = *(ntsid_t *)src; el.el_gsid = *(ntsid_t *)src; break; + case KI_VALID_PWNAM: + /* extra overhead */ + el.el_flags = KAUTH_EXTLOOKUP_VALID_PWNAM; + extend_data = CAST_USER_ADDR_T(src); + break; + case KI_VALID_GRNAM: + /* extra overhead */ + el.el_flags = KAUTH_EXTLOOKUP_VALID_GRNAM; + extend_data = CAST_USER_ADDR_T(src); + break; default: return(EINVAL); } @@ -1926,25 +2394,53 @@ kauth_cred_cache_lookup(int from, int to, void *src, void *dst) el.el_flags |= KAUTH_EXTLOOKUP_WANT_UID | KAUTH_EXTLOOKUP_WANT_GID | KAUTH_EXTLOOKUP_WANT_UGUID | KAUTH_EXTLOOKUP_WANT_GGUID | KAUTH_EXTLOOKUP_WANT_USID | KAUTH_EXTLOOKUP_WANT_GSID; + if (to == KI_VALID_PWNAM) { + /* extra overhead */ + el.el_flags |= KAUTH_EXTLOOKUP_WANT_PWNAM; + extend_data = CAST_USER_ADDR_T(dst); + } + if (to == KI_VALID_GRNAM) { + /* extra overhead */ + el.el_flags |= KAUTH_EXTLOOKUP_WANT_GRNAM; + extend_data = CAST_USER_ADDR_T(dst); + } + + /* Call resolver */ KAUTH_DEBUG("CACHE - calling resolver for %x", el.el_flags); - error = kauth_resolver_submit(&el); + error = kauth_resolver_submit(&el, extend_data); KAUTH_DEBUG("CACHE - resolver returned %d", error); - /* was the lookup successful? */ + + /* was the external lookup successful? */ if (error == 0) { /* - * Save the results from the lookup - may have other - * information even if we didn't get a guid. + * Save the results from the lookup - we may have other + * information, even if we didn't get a guid or the + * extended data. + * + * If we came from a name, we know the extend_data is valid. + */ + if (from == KI_VALID_PWNAM) + el.el_flags |= KAUTH_EXTLOOKUP_VALID_PWNAM; + else if (from == KI_VALID_GRNAM) + el.el_flags |= KAUTH_EXTLOOKUP_VALID_GRNAM; + + kauth_identity_updatecache(&el, &ki, extend_data); + + /* + * Check to see if we have a valid cache entry + * originating from the result. */ - kauth_identity_updatecache(&el, &ki); + if (!(ki.ki_valid & to)) { + error = ENOENT; + } } - /* - * Check to see if we have a valid result. - */ - if (!error && !(ki.ki_valid & to)) - error = ENOENT; if (error) return(error); found: + /* + * Copy from the appropriate struct kauth_identity cache entry + * structure into the destination buffer area. + */ switch(to) { case KI_VALID_UID: *(uid_t *)dst = ki.ki_uid; @@ -1958,6 +2454,10 @@ found: case KI_VALID_NTSID: *(ntsid_t *)dst = ki.ki_ntsid; break; + case KI_VALID_PWNAM: + case KI_VALID_GRNAM: + /* handled in kauth_resolver_complete() */ + break; default: return(EINVAL); } @@ -2190,6 +2690,7 @@ kauth_groups_updatecache(struct kauth_identity_extlookup *el) int kauth_cred_ismember_gid(kauth_cred_t cred, gid_t gid, int *resultp) { + posix_cred_t pcred = posix_cred_get(cred); struct kauth_group_membership *gm; struct kauth_identity_extlookup el; int i, error; @@ -2200,8 +2701,8 @@ kauth_cred_ismember_gid(kauth_cred_t cred, gid_t gid, int *resultp) * We can conditionalise this on cred->cr_gmuid == KAUTH_UID_NONE since * the cache should be used for that case. */ - for (i = 0; i < cred->cr_ngroups; i++) { - if (gid == cred->cr_groups[i]) { + for (i = 0; i < pcred->cr_ngroups; i++) { + if (gid == pcred->cr_groups[i]) { *resultp = 1; return(0); } @@ -2211,7 +2712,7 @@ kauth_cred_ismember_gid(kauth_cred_t cred, gid_t gid, int *resultp) * If we don't have a UID for group membership checks, the in-cred list * was authoritative and we can stop here. 
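	 *
	 * For reference, the full resolution order in this function is:
	 *	1. the cred's own cr_groups[] (checked above)
	 *	2. the kauth_groups cache, keyed on <gm_uid, gm_gid>
	 *	3. the external resolver, via kauth_resolver_submit()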
*/ - if (cred->cr_gmuid == KAUTH_UID_NONE) { + if (pcred->cr_gmuid == KAUTH_UID_NONE) { *resultp = 0; return(0); } @@ -2236,7 +2737,7 @@ kauth_cred_ismember_gid(kauth_cred_t cred, gid_t gid, int *resultp) */ KAUTH_GROUPS_LOCK(); TAILQ_FOREACH(gm, &kauth_groups, gm_link) { - if ((gm->gm_uid == cred->cr_gmuid) && (gm->gm_gid == gid) && !kauth_groups_expired(gm)) { + if ((gm->gm_uid == pcred->cr_gmuid) && (gm->gm_gid == gid) && !kauth_groups_expired(gm)) { kauth_groups_lru(gm); break; } @@ -2255,10 +2756,10 @@ kauth_cred_ismember_gid(kauth_cred_t cred, gid_t gid, int *resultp) bzero(&el, sizeof(el)); el.el_info_pid = current_proc()->p_pid; el.el_flags = KAUTH_EXTLOOKUP_VALID_UID | KAUTH_EXTLOOKUP_VALID_GID | KAUTH_EXTLOOKUP_WANT_MEMBERSHIP; - el.el_uid = cred->cr_gmuid; + el.el_uid = pcred->cr_gmuid; el.el_gid = gid; el.el_member_valid = 0; /* XXX set by resolver? */ - error = kauth_resolver_submit(&el); + error = kauth_resolver_submit(&el, 0ULL); if (error != 0) return(error); /* save the results from the lookup */ @@ -2332,7 +2833,7 @@ kauth_cred_ismember_guid(kauth_cred_t cred, guid_t *guidp, int *resultp) * this is expected to be a common case. */ ki.ki_valid = 0; - if ((error = kauth_identity_find_guid(guidp, &ki)) == 0 && + if ((error = kauth_identity_find_guid(guidp, &ki, NULL)) == 0 && !kauth_identity_guid_expired(&ki)) { if (ki.ki_valid & KI_VALID_GID) { /* It's a group after all... */ @@ -2395,38 +2896,40 @@ kauth_cred_gid_subset(kauth_cred_t cred1, kauth_cred_t cred2, int *resultp) { int i, err, res = 1; gid_t gid; + posix_cred_t pcred1 = posix_cred_get(cred1); + posix_cred_t pcred2 = posix_cred_get(cred2); /* First, check the local list of groups */ - for (i = 0; i < cred1->cr_ngroups; i++) { - gid = cred1->cr_groups[i]; + for (i = 0; i < pcred1->cr_ngroups; i++) { + gid = pcred1->cr_groups[i]; if ((err = kauth_cred_ismember_gid(cred2, gid, &res)) != 0) { return err; } - if (!res && gid != cred2->cr_rgid && gid != cred2->cr_svgid) { + if (!res && gid != pcred2->cr_rgid && gid != pcred2->cr_svgid) { *resultp = 0; return 0; } } /* Check real gid */ - if ((err = kauth_cred_ismember_gid(cred2, cred1->cr_rgid, &res)) != 0) { + if ((err = kauth_cred_ismember_gid(cred2, pcred1->cr_rgid, &res)) != 0) { return err; } - if (!res && cred1->cr_rgid != cred2->cr_rgid && - cred1->cr_rgid != cred2->cr_svgid) { + if (!res && pcred1->cr_rgid != pcred2->cr_rgid && + pcred1->cr_rgid != pcred2->cr_svgid) { *resultp = 0; return 0; } /* Finally, check saved gid */ - if ((err = kauth_cred_ismember_gid(cred2, cred1->cr_svgid, &res)) != 0){ + if ((err = kauth_cred_ismember_gid(cred2, pcred1->cr_svgid, &res)) != 0){ return err; } - if (!res && cred1->cr_svgid != cred2->cr_rgid && - cred1->cr_svgid != cred2->cr_svgid) { + if (!res && pcred1->cr_svgid != pcred2->cr_rgid && + pcred1->cr_svgid != pcred2->cr_svgid) { *resultp = 0; return 0; } @@ -2453,7 +2956,7 @@ kauth_cred_gid_subset(kauth_cred_t cred1, kauth_cred_t cred2, int *resultp) int kauth_cred_issuser(kauth_cred_t cred) { - return(cred->cr_uid == 0); + return(kauth_cred_getuid(cred) == 0); } @@ -2536,7 +3039,7 @@ kauth_cred_init(void) uid_t kauth_getuid(void) { - return(kauth_cred_get()->cr_uid); + return(kauth_cred_getuid(kauth_cred_get())); } @@ -2553,7 +3056,7 @@ kauth_getuid(void) uid_t kauth_getruid(void) { - return(kauth_cred_get()->cr_ruid); + return(kauth_cred_getruid(kauth_cred_get())); } @@ -2570,7 +3073,7 @@ kauth_getruid(void) gid_t kauth_getgid(void) { - return(kauth_cred_get()->cr_groups[0]); + 
return(kauth_cred_getgid(kauth_cred_get())); } @@ -2587,7 +3090,7 @@ kauth_getgid(void) gid_t kauth_getrgid(void) { - return(kauth_cred_get()->cr_rgid); + return(kauth_cred_getrgid(kauth_cred_get())); } @@ -2823,13 +3326,12 @@ kauth_cred_alloc(void) MALLOC_ZONE(newcred, kauth_cred_t, sizeof(*newcred), M_CRED, M_WAITOK); if (newcred != 0) { + posix_cred_t newpcred = posix_cred_get(newcred); bzero(newcred, sizeof(*newcred)); newcred->cr_ref = 1; - newcred->cr_audit.as_aia_p = &audit_default_aia; - /* XXX the following will go away with cr_au */ - newcred->cr_au.ai_auid = AU_DEFAUDITID; + newcred->cr_audit.as_aia_p = audit_default_aia_p; /* must do this, or cred has same group membership as uid 0 */ - newcred->cr_gmuid = KAUTH_UID_NONE; + newpcred->cr_gmuid = KAUTH_UID_NONE; #if CRED_DIAGNOSTIC } else { panic("kauth_cred_alloc: couldn't allocate credential"); @@ -2878,12 +3380,13 @@ kauth_cred_t kauth_cred_create(kauth_cred_t cred) { kauth_cred_t found_cred, new_cred = NULL; + posix_cred_t pcred = posix_cred_get(cred); int is_member = 0; KAUTH_CRED_HASH_LOCK_ASSERT(); - if (cred->cr_flags & CRF_NOMEMBERD) { - cred->cr_gmuid = KAUTH_UID_NONE; + if (pcred->cr_flags & CRF_NOMEMBERD) { + pcred->cr_gmuid = KAUTH_UID_NONE; } else { /* * If the template credential is not opting out of external @@ -2902,7 +3405,7 @@ kauth_cred_create(kauth_cred_t cred) * the answer, so long as it's something the external * resolver could have vended. */ - cred->cr_gmuid = cred->cr_uid; + pcred->cr_gmuid = pcred->cr_uid; } else { /* * It's not something the external resolver could @@ -2913,13 +3416,13 @@ kauth_cred_create(kauth_cred_t cred) * cost. Since most credentials are used multiple * times, we still get some performance win from this. */ - cred->cr_gmuid = KAUTH_UID_NONE; - cred->cr_flags |= CRF_NOMEMBERD; + pcred->cr_gmuid = KAUTH_UID_NONE; + pcred->cr_flags |= CRF_NOMEMBERD; } } /* Caller *must* specify at least the egid in cr_groups[0] */ - if (cred->cr_ngroups < 1) + if (pcred->cr_ngroups < 1) return(NULL); for (;;) { @@ -2943,22 +3446,20 @@ kauth_cred_create(kauth_cred_t cred) new_cred = kauth_cred_alloc(); if (new_cred != NULL) { int err; - new_cred->cr_uid = cred->cr_uid; - new_cred->cr_ruid = cred->cr_ruid; - new_cred->cr_svuid = cred->cr_svuid; - new_cred->cr_rgid = cred->cr_rgid; - new_cred->cr_svgid = cred->cr_svgid; - new_cred->cr_gmuid = cred->cr_gmuid; - new_cred->cr_ngroups = cred->cr_ngroups; - bcopy(&cred->cr_groups[0], &new_cred->cr_groups[0], sizeof(new_cred->cr_groups)); + posix_cred_t new_pcred = posix_cred_get(new_cred); + new_pcred->cr_uid = pcred->cr_uid; + new_pcred->cr_ruid = pcred->cr_ruid; + new_pcred->cr_svuid = pcred->cr_svuid; + new_pcred->cr_rgid = pcred->cr_rgid; + new_pcred->cr_svgid = pcred->cr_svgid; + new_pcred->cr_gmuid = pcred->cr_gmuid; + new_pcred->cr_ngroups = pcred->cr_ngroups; + bcopy(&pcred->cr_groups[0], &new_pcred->cr_groups[0], sizeof(new_pcred->cr_groups)); #if CONFIG_AUDIT bcopy(&cred->cr_audit, &new_cred->cr_audit, sizeof(new_cred->cr_audit)); - /* XXX the following bcopy() will go away with cr_au */ - bcopy(&cred->cr_au, &new_cred->cr_au, - sizeof(new_cred->cr_au)); #endif - new_cred->cr_flags = cred->cr_flags; + new_pcred->cr_flags = pcred->cr_flags; KAUTH_CRED_HASH_LOCK(); err = kauth_cred_add(new_cred); @@ -3017,6 +3518,8 @@ kauth_cred_t kauth_cred_setresuid(kauth_cred_t cred, uid_t ruid, uid_t euid, uid_t svuid, uid_t gmuid) { struct ucred temp_cred; + posix_cred_t temp_pcred = posix_cred_get(&temp_cred); + posix_cred_t pcred = posix_cred_get(cred); 
NULLCRED_CHECK(cred); @@ -3024,10 +3527,10 @@ kauth_cred_setresuid(kauth_cred_t cred, uid_t ruid, uid_t euid, uid_t svuid, uid * We don't need to do anything if the UIDs we are changing are * already the same as the UIDs passed in */ - if ((euid == KAUTH_UID_NONE || cred->cr_uid == euid) && - (ruid == KAUTH_UID_NONE || cred->cr_ruid == ruid) && - (svuid == KAUTH_UID_NONE || cred->cr_svuid == svuid) && - (cred->cr_gmuid == gmuid)) { + if ((euid == KAUTH_UID_NONE || pcred->cr_uid == euid) && + (ruid == KAUTH_UID_NONE || pcred->cr_ruid == ruid) && + (svuid == KAUTH_UID_NONE || pcred->cr_svuid == svuid) && + (pcred->cr_gmuid == gmuid)) { /* no change needed */ return(cred); } @@ -3038,13 +3541,13 @@ kauth_cred_setresuid(kauth_cred_t cred, uid_t ruid, uid_t euid, uid_t svuid, uid */ bcopy(cred, &temp_cred, sizeof(temp_cred)); if (euid != KAUTH_UID_NONE) { - temp_cred.cr_uid = euid; + temp_pcred->cr_uid = euid; } if (ruid != KAUTH_UID_NONE) { - temp_cred.cr_ruid = ruid; + temp_pcred->cr_ruid = ruid; } if (svuid != KAUTH_UID_NONE) { - temp_cred.cr_svuid = svuid; + temp_pcred->cr_svuid = svuid; } /* @@ -3052,8 +3555,8 @@ kauth_cred_setresuid(kauth_cred_t cred, uid_t ruid, uid_t euid, uid_t svuid, uid * opt out of participation in external group resolution, unless we * unless we explicitly opt back in later. */ - if ((temp_cred.cr_gmuid = gmuid) == KAUTH_UID_NONE) { - temp_cred.cr_flags |= CRF_NOMEMBERD; + if ((temp_pcred->cr_gmuid = gmuid) == KAUTH_UID_NONE) { + temp_pcred->cr_flags |= CRF_NOMEMBERD; } return(kauth_cred_update(cred, &temp_cred, TRUE)); @@ -3090,6 +3593,8 @@ kauth_cred_t kauth_cred_setresgid(kauth_cred_t cred, gid_t rgid, gid_t egid, gid_t svgid) { struct ucred temp_cred; + posix_cred_t temp_pcred = posix_cred_get(&temp_cred); + posix_cred_t pcred = posix_cred_get(cred); NULLCRED_CHECK(cred); DEBUG_CRED_ENTER("kauth_cred_setresgid %p %d %d %d\n", cred, rgid, egid, svgid); @@ -3098,9 +3603,9 @@ kauth_cred_setresgid(kauth_cred_t cred, gid_t rgid, gid_t egid, gid_t svgid) * We don't need to do anything if the given GID are already the * same as the GIDs in the credential. */ - if (cred->cr_groups[0] == egid && - cred->cr_rgid == rgid && - cred->cr_svgid == svgid) { + if (pcred->cr_groups[0] == egid && + pcred->cr_rgid == rgid && + pcred->cr_svgid == svgid) { /* no change needed */ return(cred); } @@ -3114,17 +3619,17 @@ kauth_cred_setresgid(kauth_cred_t cred, gid_t rgid, gid_t egid, gid_t svgid) /* displacing a supplementary group opts us out of memberd */ if (kauth_cred_change_egid(&temp_cred, egid)) { DEBUG_CRED_CHANGE("displaced!\n"); - temp_cred.cr_flags |= CRF_NOMEMBERD; - temp_cred.cr_gmuid = KAUTH_UID_NONE; + temp_pcred->cr_flags |= CRF_NOMEMBERD; + temp_pcred->cr_gmuid = KAUTH_UID_NONE; } else { DEBUG_CRED_CHANGE("not displaced\n"); } } if (rgid != KAUTH_GID_NONE) { - temp_cred.cr_rgid = rgid; + temp_pcred->cr_rgid = rgid; } if (svgid != KAUTH_GID_NONE) { - temp_cred.cr_svgid = svgid; + temp_pcred->cr_svgid = svgid; } return(kauth_cred_update(cred, &temp_cred, TRUE)); @@ -3185,16 +3690,20 @@ kauth_cred_setgroups(kauth_cred_t cred, gid_t *groups, int groupcount, uid_t gmu { int i; struct ucred temp_cred; + posix_cred_t temp_pcred = posix_cred_get(&temp_cred); + posix_cred_t pcred; NULLCRED_CHECK(cred); + pcred = posix_cred_get(cred); + /* * We don't need to do anything if the given list of groups does not * change. 
*/ - if ((cred->cr_gmuid == gmuid) && (cred->cr_ngroups == groupcount)) { + if ((pcred->cr_gmuid == gmuid) && (pcred->cr_ngroups == groupcount)) { for (i = 0; i < groupcount; i++) { - if (cred->cr_groups[i] != groups[i]) + if (pcred->cr_groups[i] != groups[i]) break; } if (i == groupcount) { @@ -3211,17 +3720,46 @@ kauth_cred_setgroups(kauth_cred_t cred, gid_t *groups, int groupcount, uid_t gmu * using initgroups(). This is required for POSIX conformance. */ bcopy(cred, &temp_cred, sizeof(temp_cred)); - temp_cred.cr_ngroups = groupcount; - bcopy(groups, temp_cred.cr_groups, sizeof(temp_cred.cr_groups)); - temp_cred.cr_gmuid = gmuid; + temp_pcred->cr_ngroups = groupcount; + bcopy(groups, temp_pcred->cr_groups, sizeof(temp_pcred->cr_groups)); + temp_pcred->cr_gmuid = gmuid; if (gmuid == KAUTH_UID_NONE) - temp_cred.cr_flags |= CRF_NOMEMBERD; + temp_pcred->cr_flags |= CRF_NOMEMBERD; else - temp_cred.cr_flags &= ~CRF_NOMEMBERD; + temp_pcred->cr_flags &= ~CRF_NOMEMBERD; return(kauth_cred_update(cred, &temp_cred, TRUE)); } +/* + * XXX temporary, for NFS support until we can come up with a better + * XXX enumeration/comparison mechanism + * + * Notes: The return value exists to account for the possibility of a + * kauth_cred_t without a POSIX label. This will be the case in + * the future (see posix_cred_get() below, for more details). + */ +int +kauth_cred_getgroups(kauth_cred_t cred, gid_t *grouplist, int *countp) +{ + int limit = NGROUPS; + + /* + * If they just want a copy of the groups list, they may not care + * about the actual count. If they specify an input count, however, + * treat it as an indicator of the buffer size available in grouplist, + * and limit the returned list to that size. + */ + if (countp) { + limit = MIN(*countp, cred->cr_posix.cr_ngroups); + *countp = limit; + } + + memcpy(grouplist, cred->cr_posix.cr_groups, sizeof(gid_t) * limit); + + return 0; +} + /* * kauth_cred_setuidgid @@ -3262,15 +3800,19 @@ kauth_cred_t kauth_cred_setuidgid(kauth_cred_t cred, uid_t uid, gid_t gid) { struct ucred temp_cred; + posix_cred_t temp_pcred = posix_cred_get(&temp_cred); + posix_cred_t pcred; NULLCRED_CHECK(cred); + pcred = posix_cred_get(cred); + /* * We don't need to do anything if the effective, real and saved * user IDs are already the same as the user ID passed into us. */ - if (cred->cr_uid == uid && cred->cr_ruid == uid && cred->cr_svuid == uid && - cred->cr_groups[0] == gid && cred->cr_rgid == gid && cred->cr_svgid == gid) { + if (pcred->cr_uid == uid && pcred->cr_ruid == uid && pcred->cr_svuid == uid && + pcred->cr_gid == gid && pcred->cr_rgid == gid && pcred->cr_svgid == gid) { /* no change needed */ return(cred); } @@ -3280,26 +3822,26 @@ kauth_cred_setuidgid(kauth_cred_t cred, uid_t uid, gid_t gid) * with the new values. 
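A short sketch of the in/out *countp convention of kauth_cred_getgroups() above, assuming an NGROUPS-sized caller buffer (illustrative only):

	gid_t groups[NGROUPS];
	int count = NGROUPS;	/* in: capacity of groups[]; out: entries copied */

	if (kauth_cred_getgroups(cred, groups, &count) == 0) {
		/* groups[0..count-1] now hold the credential's group list */
	}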
*/ bzero(&temp_cred, sizeof(temp_cred)); - temp_cred.cr_uid = uid; - temp_cred.cr_ruid = uid; - temp_cred.cr_svuid = uid; - temp_cred.cr_flags = cred->cr_flags; + temp_pcred->cr_uid = uid; + temp_pcred->cr_ruid = uid; + temp_pcred->cr_svuid = uid; + temp_pcred->cr_flags = pcred->cr_flags; /* inherit the opt-out of memberd */ - if (cred->cr_flags & CRF_NOMEMBERD) { - temp_cred.cr_gmuid = KAUTH_UID_NONE; - temp_cred.cr_flags |= CRF_NOMEMBERD; + if (pcred->cr_flags & CRF_NOMEMBERD) { + temp_pcred->cr_gmuid = KAUTH_UID_NONE; + temp_pcred->cr_flags |= CRF_NOMEMBERD; } else { - temp_cred.cr_gmuid = uid; - temp_cred.cr_flags &= ~CRF_NOMEMBERD; + temp_pcred->cr_gmuid = uid; + temp_pcred->cr_flags &= ~CRF_NOMEMBERD; } - temp_cred.cr_ngroups = 1; + temp_pcred->cr_ngroups = 1; /* displacing a supplementary group opts us out of memberd */ if (kauth_cred_change_egid(&temp_cred, gid)) { - temp_cred.cr_gmuid = KAUTH_UID_NONE; - temp_cred.cr_flags |= CRF_NOMEMBERD; + temp_pcred->cr_gmuid = KAUTH_UID_NONE; + temp_pcred->cr_flags |= CRF_NOMEMBERD; } - temp_cred.cr_rgid = gid; - temp_cred.cr_svgid = gid; + temp_pcred->cr_rgid = gid; + temp_pcred->cr_svgid = gid; #if CONFIG_MACF temp_cred.cr_label = cred->cr_label; #endif @@ -3336,8 +3878,13 @@ kauth_cred_t kauth_cred_setsvuidgid(kauth_cred_t cred, uid_t uid, gid_t gid) { struct ucred temp_cred; + posix_cred_t temp_pcred = posix_cred_get(&temp_cred); + posix_cred_t pcred; NULLCRED_CHECK(cred); + + pcred = posix_cred_get(cred); + DEBUG_CRED_ENTER("kauth_cred_setsvuidgid: %p u%d->%d g%d->%d\n", cred, cred->cr_svuid, uid, cred->cr_svgid, gid); /* @@ -3345,7 +3892,7 @@ kauth_cred_setsvuidgid(kauth_cred_t cred, uid_t uid, gid_t gid) * uids are already the same as the uid provided. This check is * likely insufficient. */ - if (cred->cr_svuid == uid && cred->cr_svgid == gid) { + if (pcred->cr_svuid == uid && pcred->cr_svgid == gid) { /* no change needed */ return(cred); } @@ -3355,8 +3902,8 @@ kauth_cred_setsvuidgid(kauth_cred_t cred, uid_t uid, gid_t gid) * with new values. 
*/ bcopy(cred, &temp_cred, sizeof(temp_cred)); - temp_cred.cr_svuid = uid; - temp_cred.cr_svgid = gid; + temp_pcred->cr_svuid = uid; + temp_pcred->cr_svgid = gid; return(kauth_cred_update(cred, &temp_cred, TRUE)); } @@ -3402,18 +3949,6 @@ kauth_cred_setauditinfo(kauth_cred_t cred, au_session_t *auditinfo_p) bcopy(cred, &temp_cred, sizeof(temp_cred)); bcopy(auditinfo_p, &temp_cred.cr_audit, sizeof(temp_cred.cr_audit)); - /* XXX the following will go away with cr_au */ - temp_cred.cr_au.ai_auid = auditinfo_p->as_aia_p->ai_auid; - temp_cred.cr_au.ai_mask.am_success = - auditinfo_p->as_mask.am_success; - temp_cred.cr_au.ai_mask.am_failure = - auditinfo_p->as_mask.am_failure; - temp_cred.cr_au.ai_termid.port = - auditinfo_p->as_aia_p->ai_termid.at_port; - temp_cred.cr_au.ai_termid.machine = - auditinfo_p->as_aia_p->ai_termid.at_addr[0]; - temp_cred.cr_au.ai_asid = auditinfo_p->as_aia_p->ai_asid; - /* XXX */ return(kauth_cred_update(cred, &temp_cred, FALSE)); } @@ -3560,6 +4095,9 @@ int kauth_proc_label_update(struct proc *p, struct label *label) continue; } p->p_ucred = my_new_cred; + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); + mac_proc_set_enforce(p, MAC_ALL_ENFORCE); proc_unlock(p); } @@ -3635,6 +4173,8 @@ kauth_proc_label_update_execve(struct proc *p, vfs_context_t ctx, continue; } p->p_ucred = my_new_cred; + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); mac_proc_set_enforce(p, MAC_ALL_ENFORCE); proc_unlock(p); } @@ -3951,10 +4491,12 @@ kauth_cred_copy_real(kauth_cred_t cred) { kauth_cred_t newcred = NULL, found_cred; struct ucred temp_cred; + posix_cred_t temp_pcred = posix_cred_get(&temp_cred); + posix_cred_t pcred = posix_cred_get(cred); /* if the credential is already 'real', just take a reference */ - if ((cred->cr_ruid == cred->cr_uid) && - (cred->cr_rgid == cred->cr_gid)) { + if ((pcred->cr_ruid == pcred->cr_uid) && + (pcred->cr_rgid == pcred->cr_gid)) { kauth_cred_ref(cred); return(cred); } @@ -3964,18 +4506,18 @@ kauth_cred_copy_real(kauth_cred_t cred) * with the new values. 
*/ bcopy(cred, &temp_cred, sizeof(temp_cred)); - temp_cred.cr_uid = cred->cr_ruid; + temp_pcred->cr_uid = pcred->cr_ruid; /* displacing a supplementary group opts us out of memberd */ - if (kauth_cred_change_egid(&temp_cred, cred->cr_rgid)) { - temp_cred.cr_flags |= CRF_NOMEMBERD; - temp_cred.cr_gmuid = KAUTH_UID_NONE; + if (kauth_cred_change_egid(&temp_cred, pcred->cr_rgid)) { + temp_pcred->cr_flags |= CRF_NOMEMBERD; + temp_pcred->cr_gmuid = KAUTH_UID_NONE; } /* * If the cred is not opted out, make sure we are using the r/euid * for group checks */ - if (temp_cred.cr_gmuid != KAUTH_UID_NONE) - temp_cred.cr_gmuid = cred->cr_ruid; + if (temp_pcred->cr_gmuid != KAUTH_UID_NONE) + temp_pcred->cr_gmuid = pcred->cr_ruid; for (;;) { int err; @@ -4063,9 +4605,6 @@ kauth_cred_update(kauth_cred_t old_cred, kauth_cred_t model_cred, if (retain_auditinfo) { bcopy(&old_cred->cr_audit, &model_cred->cr_audit, sizeof(model_cred->cr_audit)); - /* XXX following bcopy will go away with cr_au */ - bcopy(&old_cred->cr_au, &model_cred->cr_au, - sizeof(model_cred->cr_au)); } for (;;) { @@ -4240,6 +4779,7 @@ kauth_cred_find(kauth_cred_t cred) { u_long hash_key; kauth_cred_t found_cred; + posix_cred_t pcred = posix_cred_get(cred); KAUTH_CRED_HASH_LOCK_ASSERT(); @@ -4258,23 +4798,26 @@ kauth_cred_find(kauth_cred_t cred) /* Find cred in the credential hash table */ TAILQ_FOREACH(found_cred, &kauth_cred_table_anchor[hash_key], cr_link) { boolean_t match; + posix_cred_t found_pcred = posix_cred_get(found_cred); /* * don't worry about the label unless the flags in * either credential tell us to. */ - if ((found_cred->cr_flags & CRF_MAC_ENFORCE) != 0 || - (cred->cr_flags & CRF_MAC_ENFORCE) != 0) { + if ((found_pcred->cr_flags & CRF_MAC_ENFORCE) != 0 || + (pcred->cr_flags & CRF_MAC_ENFORCE) != 0) { /* include the label pointer in the compare */ - match = (bcmp(&found_cred->cr_uid, &cred->cr_uid, + match = (bcmp(&found_pcred->cr_uid, &pcred->cr_uid, (sizeof(struct ucred) - - offsetof(struct ucred, cr_uid))) == 0); + offsetof(struct ucred, cr_posix))) == 0); } else { /* flags have to match, but skip the label in bcmp */ - match = (found_cred->cr_flags == cred->cr_flags && - bcmp(&found_cred->cr_uid, &cred->cr_uid, - (offsetof(struct ucred, cr_label) - - offsetof(struct ucred, cr_uid))) == 0); + match = (found_pcred->cr_flags == pcred->cr_flags && + bcmp(&found_pcred->cr_uid, &pcred->cr_uid, + sizeof(struct posix_cred)) == 0 && + bcmp(&found_cred->cr_audit, &cred->cr_audit, + sizeof(cred->cr_audit)) == 0); + } if (match) { /* found a match */ @@ -4326,24 +4869,33 @@ kauth_cred_hash(const uint8_t *datap, int data_len, u_long start_key) * not including the ref count or the TAILQ, which are mutable; * everything else isn't. * - * We also avoid the label (if the flag is not set saying the - * label is actually enforced). - * * Parameters: cred Credential for which hash is * desired * * Returns: (u_long) Returned hash key + * + * Notes: When actually moving the POSIX credential into a real label, + * remember to update this hash computation. */ static u_long kauth_cred_get_hashkey(kauth_cred_t cred) { + posix_cred_t pcred = posix_cred_get(cred); u_long hash_key = 0; - - hash_key = kauth_cred_hash((uint8_t *)&cred->cr_uid, - ((cred->cr_flags & CRF_MAC_ENFORCE) ? 
- sizeof(struct ucred) : offsetof(struct ucred, cr_label)) - - offsetof(struct ucred, cr_uid), - hash_key); + + if (pcred->cr_flags & CRF_MAC_ENFORCE) { + hash_key = kauth_cred_hash((uint8_t *)&cred->cr_posix, + sizeof(struct ucred) - offsetof(struct ucred, cr_posix), + hash_key); + } else { + /* skip label */ + hash_key = kauth_cred_hash((uint8_t *)&cred->cr_posix, + sizeof(struct posix_cred), + hash_key); + hash_key = kauth_cred_hash((uint8_t *)&cred->cr_audit, + sizeof(struct au_session), + hash_key); + } return(hash_key); } @@ -4691,3 +5243,226 @@ sysctl_dump_cred_backtraces( __unused struct sysctl_oid *oidp, __unused void *ar } #endif /* KAUTH_CRED_HASH_DEBUG || DEBUG_CRED */ + + +/* + ********************************************************************** + * The following routines will be moved to a policy_posix.c module at + * some future point. + ********************************************************************** + */ + +/* + * posix_cred_create + * + * Description: Helper function to create a kauth_cred_t credential that is + * initially labelled with a specific POSIX credential label + * + * Parameters: pcred The posix_cred_t to use as the initial + * label value + * + * Returns: (kauth_cred_t) The credential that was found in the + * hash or created + * NULL kauth_cred_add() failed, or there was + * no egid specified, or we failed to + * attach a label to the new credential + * + * Notes: This function currently wraps kauth_cred_create(), and is the + * only consumer of that ill-fated function, apart from bsd_init(). + * It exists solely to support the NFS server code creation of + * credentials based on the over-the-wire RPC calls containing + * traditional POSIX credential information being tunneled to + * the server host from the client machine. + * + * In the future, we hope this function goes away. + * + * In the short term, it creates a temporary credential, puts + * the POSIX information from NFS into it, and then calls + * kauth_cred_create(), as an internal implementation detail. + * + * If we have to keep it around in the medium term, it will + * create a new kauth_cred_t, then label it with a POSIX label + * corresponding to the contents of the kauth_cred_t. If the + * policy_posix MACF module is not loaded, it will instead + * substitute a posix_cred_t which GRANTS all access (effectively + * a "root" credential) in order to not prevent NFS from working + * in the case that we are not supporting POSIX credentials. + */ +kauth_cred_t +posix_cred_create(posix_cred_t pcred) +{ + struct ucred temp_cred; + + bzero(&temp_cred, sizeof(temp_cred)); + temp_cred.cr_posix = *pcred; + + return kauth_cred_create(&temp_cred); +} + + +/* + * posix_cred_get + * + * Description: Given a kauth_cred_t, return the POSIX credential label, if + * any, which is associated with it. + * + * Parameters: cred The credential to obtain the label from + * + * Returns: posix_cred_t The POSIX credential label + * + * Notes: In the event that the policy_posix MACF module IS NOT loaded, + * this function will return a pointer to a posix_cred_t which + * GRANTS all access (effectively, a "root" credential). This is + * necessary to support legacy code which insists on tightly + * integrating POSIX credentials into its APIs, including, but + * not limited to, System V IPC mechanisms, POSIX IPC mechanisms, + * NFSv3, signals, dtrace, and a large number of kauth routines + * used to implement POSIX permissions related system calls. 
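A minimal sketch of the NFS-style use of posix_cred_create() described above; rpc_uid and rpc_gid are placeholders for values decoded from an over-the-wire request, and the CRF_NOMEMBERD opt-out mirrors what kauth_cred_create() would choose for ids the resolver could not have vended:

	struct posix_cred tmp;
	kauth_cred_t cred;

	bzero(&tmp, sizeof(tmp));
	tmp.cr_uid = tmp.cr_ruid = tmp.cr_svuid = rpc_uid;
	tmp.cr_ngroups = 1;
	tmp.cr_groups[0] = rpc_gid;	/* the egid must be in cr_groups[0] */
	tmp.cr_rgid = tmp.cr_svgid = rpc_gid;
	tmp.cr_gmuid = KAUTH_UID_NONE;	/* opt out of memberd resolution */
	tmp.cr_flags = CRF_NOMEMBERD;

	cred = posix_cred_create(&tmp);	/* NULL on failure */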
+ * + * In the event that the policy_posix MACF module IS loaded, and + * there is no POSIX label on the kauth_cred_t credential, this + * function will return a pointer to a posix_cred_t which DENIES + * all access (effectively, a "deny rights granted by POSIX" + * credential). This is necessary to support the concept of a + * transiently loaded POSIX policy, or kauth_cred_t credentials + * which cannot be used in conjunction with POSIX permissions + * checks. + * + * This function currently returns the address of the cr_posix + * field of the supplied kauth_cred_t credential, and as such + * currently cannot fail. In the future, this will not be the + * case. + */ +posix_cred_t +posix_cred_get(kauth_cred_t cred) +{ + return(&cred->cr_posix); +} + + +/* + * posix_cred_label + * + * Description: Label a kauth_cred_t with a POSIX credential label + * + * Parameters: cred The credential to label + * pcred The POSIX credential to label it with + * + * Returns: (void) + * + * Notes: This function is currently void in order to permit it to fit + * in with the current MACF framework label methods which allow + * labelling to fail silently. This is likely acceptable for + * mandatory access controls, but not for POSIX, since those + * access controls are advisory. We will need to consider a + * return value in a future version of the MACF API. + * + * This operation currently cannot fail, as currently the POSIX + * credential is a subfield of the kauth_cred_t (ucred), which + * MUST be valid. In the future, this will not be the case. + */ +void +posix_cred_label(kauth_cred_t cred, posix_cred_t pcred) +{ + cred->cr_posix = *pcred; /* structure assign for now */ +} + + +/* + * posix_cred_access + * + * Description: Perform a POSIX access check for a protected object + * + * Parameters: cred The credential to check + * object_uid The POSIX UID of the protected object + * object_gid The POSIX GID of the protected object + * object_mode The POSIX mode of the protected object + * mode_req The requested POSIX access rights + * + * Returns: 0 Access is granted + * EACCES Access is denied + * + * Notes: This code optimizes the case where the world and group rights + * would both grant the requested rights to avoid making a group + * membership query. This is a big performance win in the case + * where this is true. + */ +int +posix_cred_access(kauth_cred_t cred, id_t object_uid, id_t object_gid, mode_t object_mode, mode_t mode_req) +{ + int is_member; + mode_t mode_owner = (object_mode & S_IRWXU); + mode_t mode_group = (object_mode & S_IRWXG) << 3; + mode_t mode_world = (object_mode & S_IRWXO) << 6; + + /* + * Check first for owner rights + */ + if (kauth_cred_getuid(cred) == object_uid && (mode_req & mode_owner) == mode_req) + return (0); + + /* + * Combined group and world rights check, if we don't have owner rights + * + * OPTIMIZED: If group and world rights would grant the same bits, and + * the set of requested bits is in both, then we can simply check the + * world rights, avoiding a group membership check, which is expensive. + */ + if ((mode_req & mode_group & mode_world) == mode_req) { + return (0); + } else { + /* + * NON-OPTIMIZED: requires group membership check. 
+ */ + if ((mode_req & mode_group) != mode_req) { + /* + * exclusion group : treat errors as "is a member" + * + * NON-OPTIMIZED: +group would deny; must check group + */ + if (!kauth_cred_ismember_gid(cred, object_gid, &is_member) && is_member) { + /* + * DENY: +group denies + */ + return (EACCES); + } else { + if ((mode_req & mode_world) != mode_req) { + /* + * DENY: both -group & world would deny + */ + return (EACCES); + } else { + /* + * ALLOW: allowed by -group and +world + */ + return (0); + } + } + } else { + /* + * inclusion group; treat errors as "not a member" + * + * NON-OPTIMIZED: +group allows, world denies; must + * check group + */ + if (!kauth_cred_ismember_gid(cred, object_gid, &is_member) && is_member) { + /* + * ALLOW: allowed by +group + */ + return (0); + } else { + if ((mode_req & mode_world) != mode_req) { + /* + * DENY: both -group & world would deny + */ + return (EACCES); + } else { + /* + * ALLOW: allowed by -group and +world + */ + return (0); + } + } + } + } +} diff --git a/bsd/kern/kern_descrip.c b/bsd/kern/kern_descrip.c index d3f483d24..3283ee3c0 100644 --- a/bsd/kern/kern_descrip.c +++ b/bsd/kern/kern_descrip.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -101,6 +101,7 @@ #include <sys/kdebug.h> #include <sys/sysproto.h> #include <sys/pipe.h> +#include <sys/spawn.h> #include <kern/kern_types.h> #include <kern/kalloc.h> #include <libkern/OSAtomic.h> @@ -112,6 +113,11 @@ #include <mach/mach_port.h> +#if CONFIG_PROTECT +#include <sys/cprotect.h> +#endif +#include <hfs/hfs.h> + kern_return_t ipc_object_copyin(ipc_space_t, mach_port_name_t, mach_msg_type_name_t, ipc_port_t *); void ipc_port_release_send(ipc_port_t); @@ -119,16 +125,14 @@ void ipc_port_release_send(ipc_port_t); struct psemnode; struct pshmnode; -int fdopen(dev_t dev, int mode, int type, proc_t p); -int finishdup(proc_t p, struct filedesc *fdp, int old, int new, int32_t *retval); +static int finishdup(proc_t p, + struct filedesc *fdp, int old, int new, int flags, int32_t *retval); int falloc_locked(proc_t p, struct fileproc **resultfp, int *resultfd, vfs_context_t ctx, int locked); void fg_drop(struct fileproc * fp); void fg_free(struct fileglob *fg); void fg_ref(struct fileproc * fp); -#if CONFIG_EMBEDDED void fileport_releasefg(struct fileglob *fg); -#endif /* CONFIG_EMBEDDED */ /* flags for close_internal_locked */ #define FD_DUP2RESV 1 @@ -156,6 +160,9 @@ extern kauth_scope_t kauth_scope_fileop; extern int cs_debug; +/* Conflict wait queue for when selects collide (opaque type) */ +extern struct wait_queue select_conflict_queue; + #define f_flag f_fglob->fg_flag #define f_type f_fglob->fg_type #define f_msgcount f_fglob->fg_msgcount @@ -474,21 +481,20 @@ dup(proc_t p, struct dup_args *uap, int32_t *retval) proc_fdunlock(p); return (error); } - error = finishdup(p, fdp, old, new, retval); + error = finishdup(p, fdp, old, new, 0, retval); fp_drop(p, old, fp, 1); proc_fdunlock(p); return (error); } - /* * dup2 * * Description: Duplicate a file descriptor to a particular value. 
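One worked instance of the posix_cred_access() fast path from the hunk above, written as a standalone fragment: for a 0644 object, the group and world read bits both normalize to the same owner-position bit, so a non-owner read is granted without a memberd round trip (values are illustrative):

	mode_t object_mode = 0644;			/* rw-r--r-- */
	mode_t mode_req = S_IRUSR;			/* read, in owner-bit position */
	mode_t mode_group = (object_mode & S_IRWXG) << 3;	/* 0040 << 3 == 0400 */
	mode_t mode_world = (object_mode & S_IRWXO) << 6;	/* 0004 << 6 == 0400 */

	if ((mode_req & mode_group & mode_world) == mode_req) {
		/* OPTIMIZED: group could not deny what world already grants */
	}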
* * Parameters: p Process performing the dup - * uap->fd The fd to dup + * uap->from The fd to dup * uap->to The fd to dup it to * retval Pointer to the call return area * @@ -547,7 +553,8 @@ closeit: goto startover; } - if ((fdp->fd_ofiles[new] != NULL) && ((error = fp_lookup(p, new, &nfp, 1)) == 0)) { + if ((fdp->fd_ofiles[new] != NULL) && + ((error = fp_lookup(p, new, &nfp, 1)) == 0)) { fp_drop(p, old, fp, 1); (void)close_internal_locked(p, new, nfp, FD_DUP2RESV); #if DIAGNOSTIC @@ -558,7 +565,7 @@ closeit: } else { #if DIAGNOSTIC if (fdp->fd_ofiles[new] != NULL) - panic("dup2: unable to get ref on a fileproc %d\n", new); + panic("dup2: no ref on fileproc %d", new); #endif procfdtbl_reservefd(p, new); } @@ -570,11 +577,11 @@ closeit: } #if DIAGNOSTIC if (fdp->fd_ofiles[new] != 0) - panic("dup2-1: overwriting fd_ofiles with new %d\n", new); + panic("dup2: overwriting fd_ofiles with new %d", new); if ((fdp->fd_ofileflags[new] & UF_RESERVED) == 0) - panic("dup2-1: unreserved fileflags with new %d\n", new); + panic("dup2: unreserved fileflags with new %d", new); #endif - error = finishdup(p, fdp, old, new, retval); + error = finishdup(p, fdp, old, new, 0, retval); fp_drop(p, old, fp, 1); proc_fdunlock(p); @@ -678,7 +685,6 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) off_t offset; int newmin; daddr64_t lbn, bn; - int devBlockSize = 0; unsigned int fflag; user_addr_t argp; boolean_t is64bit; @@ -723,6 +729,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) switch (uap->cmd) { case F_DUPFD: + case F_DUPFD_CLOEXEC: newmin = CAST_DOWN_EXPLICIT(int, uap->arg); /* arg is an int, so we won't lose bits */ AUDIT_ARG(value32, newmin); if ((u_int)newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || @@ -732,7 +739,8 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) } if ( (error = fdalloc(p, newmin, &i)) ) goto out; - error = finishdup(p, fdp, fd, i, retval); + error = finishdup(p, fdp, fd, i, + uap->cmd == F_DUPFD_CLOEXEC ? UF_EXCLOSE : 0, retval); goto out; case F_GETFD: @@ -807,6 +815,36 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) error = fo_ioctl(fp, (int)TIOCSPGRP, (caddr_t)&tmp, &context); goto out; + case F_SETNOSIGPIPE: + tmp = CAST_DOWN_EXPLICIT(int, uap->arg); + if (fp->f_type == DTYPE_SOCKET) { + error = sock_setsockopt((struct socket *)fp->f_data, + SOL_SOCKET, SO_NOSIGPIPE, &tmp, sizeof (tmp)); + } else { + struct fileglob *fg = fp->f_fglob; + + lck_mtx_lock_spin(&fg->fg_lock); + if (tmp) + fg->fg_lflags |= FG_NOSIGPIPE; + else + fg->fg_lflags &= ~FG_NOSIGPIPE; + lck_mtx_unlock(&fg->fg_lock); + error = 0; + } + goto out; + + case F_GETNOSIGPIPE: + if (fp->f_type == DTYPE_SOCKET) { + int retsize = sizeof (*retval); + error = sock_getsockopt((struct socket *)fp->f_data, + SOL_SOCKET, SO_NOSIGPIPE, retval, &retsize); + } else { + *retval = (fp->f_fglob->fg_lflags & FG_NOSIGPIPE) ? 
+ 1 : 0; + error = 0; + } + goto out; + case F_SETLKW: flg |= F_WAIT; /* Fall into F_SETLK */ @@ -886,6 +924,9 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) } case F_GETLK: +#if CONFIG_EMBEDDED + case F_GETLKPID: +#endif if (fp->f_type != DTYPE_VNODE) { error = EBADF; goto out; @@ -943,10 +984,10 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) #if CONFIG_MACF error = mac_file_check_lock(proc_ucred(p), fp->f_fglob, - F_GETLK, &fl); + uap->cmd, &fl); if (error == 0) #endif - error = VNOP_ADVLOCK(vp, (caddr_t)p, F_GETLK, &fl, F_POSIX, &context); + error = VNOP_ADVLOCK(vp, (caddr_t)p, uap->cmd, &fl, F_POSIX, &context); (void)vnode_put(vp); @@ -1108,6 +1149,18 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) goto out; + case F_NODIRECT: + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } + if (uap->arg) + fp->f_fglob->fg_flag |= FNODIRECT; + else + fp->f_fglob->fg_flag &= ~FNODIRECT; + + goto out; + case F_GLOBAL_NOCACHE: if (fp->f_type != DTYPE_VNODE) { error = EBADF; @@ -1170,6 +1223,23 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) goto outdrop; } + case F_FLUSH_DATA: + + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } + vp = (struct vnode *)fp->f_data; + proc_fdunlock(p); + + if ( (error = vnode_getwithref(vp)) == 0 ) { + error = cluster_push(vp, 0); + + (void)vnode_put(vp); + } + goto outdrop; + + case F_READBOOTSTRAP: case F_WRITEBOOTSTRAP: { user32_fbootstraptransfer_t user32_fbt_struct; @@ -1221,9 +1291,23 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) (void)vnode_put(vp); goto outdrop; } - case F_LOG2PHYS: { + case F_LOG2PHYS: + case F_LOG2PHYS_EXT: { struct log2phys l2p_struct; /* structure for allocate command */ + int devBlockSize; + off_t file_offset = 0; + size_t a_size = 0; + size_t run = 0; + + if (uap->cmd == F_LOG2PHYS_EXT) { + error = copyin(argp, (caddr_t)&l2p_struct, sizeof(l2p_struct)); + if (error) + goto out; + file_offset = l2p_struct.l2p_devoffset; + } else { + file_offset = fp->f_offset; + } if (fp->f_type != DTYPE_VNODE) { error = EBADF; goto out; @@ -1233,7 +1317,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) if ( (error = vnode_getwithref(vp)) ) { goto outdrop; } - error = VNOP_OFFTOBLK(vp, fp->f_offset, &lbn); + error = VNOP_OFFTOBLK(vp, file_offset, &lbn); if (error) { (void)vnode_put(vp); goto outdrop; @@ -1244,16 +1328,25 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) goto outdrop; } devBlockSize = vfs_devblocksize(vnode_mount(vp)); - - error = VNOP_BLOCKMAP(vp, offset, devBlockSize, &bn, NULL, NULL, 0, &context); + if (uap->cmd == F_LOG2PHYS_EXT) { + a_size = l2p_struct.l2p_contigbytes; + } else { + a_size = devBlockSize; + } + + error = VNOP_BLOCKMAP(vp, offset, a_size, &bn, &run, NULL, 0, &context); (void)vnode_put(vp); if (!error) { l2p_struct.l2p_flags = 0; /* for now */ - l2p_struct.l2p_contigbytes = 0; /* for now */ + if (uap->cmd == F_LOG2PHYS_EXT) { + l2p_struct.l2p_contigbytes = run - (file_offset - offset); + } else { + l2p_struct.l2p_contigbytes = 0; /* for now */ + } l2p_struct.l2p_devoffset = bn * devBlockSize; - l2p_struct.l2p_devoffset += fp->f_offset - offset; + l2p_struct.l2p_devoffset += file_offset - offset; error = copyout((caddr_t)&l2p_struct, argp, sizeof(l2p_struct)); } goto outdrop; @@ -1384,7 +1477,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) 
VATTR_SET(&va, va_mode, cmode & ACCESSPERMS); /* Start the lookup relative to the file descriptor's vnode. */ - NDINIT(&nd, LOOKUP, USEDVP | FOLLOW | AUDITVNPATH1, UIO_USERSPACE, + NDINIT(&nd, LOOKUP, OP_OPEN, USEDVP | FOLLOW | AUDITVNPATH1, UIO_USERSPACE, fopen.o_pathname, &context); nd.ni_dvp = vp; @@ -1429,7 +1522,8 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) } /* Start the lookup relative to the file descriptor's vnode. */ - NDINIT(&nd, DELETE, USEDVP | AUDITVNPATH1, UIO_USERSPACE, pathname, &context); + NDINIT(&nd, DELETE, OP_UNLINK, USEDVP | AUDITVNPATH1, UIO_USERSPACE, + pathname, &context); nd.ni_dvp = vp; error = unlink1(&context, &nd, 0); @@ -1533,6 +1627,9 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) kernel_blob_size); } else { /* ubc_blob_add() has consumed "kernel_blob_addr" */ +#if CHECK_CS_VALIDATION_BITMAP + ubc_cs_validation_bitmap_allocate( vp ); +#endif } (void) vnode_put(vp); @@ -1540,7 +1637,6 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) } case F_MARKDEPENDENCY: { - struct vnode *root_vp; struct vnode_attr va; vfs_context_t ctx = vfs_context_current(); kauth_cred_t cred; @@ -1563,13 +1659,11 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) goto outdrop; } - // the passed in vnode must be the root dir of the file system - if (VFS_ROOT(vp->v_mount, &root_vp, ctx) != 0 || vp != root_vp) { + if (!vnode_isvroot(vp)) { error = EINVAL; vnode_put(vp); goto outdrop; } - vnode_put(root_vp); // get the owner of the root dir VATTR_INIT(&va); @@ -1592,24 +1686,291 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval) // if all those checks pass then we can mark the dependency vfs_markdependency(vp->v_mount); error = 0; + + vnode_put(vp); + + break; + } + +#ifdef CONFIG_PROTECT + case F_GETPROTECTIONCLASS: { + int class = 0; + + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } + vp = (struct vnode *)fp->f_data; + + proc_fdunlock(p); + + if (vnode_getwithref(vp)) { + error = ENOENT; + goto outdrop; + } + + error = cp_vnode_getclass (vp, &class); + if (error == 0) { + *retval = class; + } vnode_put(vp); + break; + } + + case F_SETPROTECTIONCLASS: { + /* tmp must be a valid PROTECTION_CLASS_* */ + tmp = CAST_DOWN_EXPLICIT(uint32_t, uap->arg); + + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } + vp = (struct vnode *)fp->f_data; + + proc_fdunlock(p); + + if (vnode_getwithref(vp)) { + error = ENOENT; + goto outdrop; + } + /* Only go forward if you have write access */ + vfs_context_t ctx = vfs_context_current(); + if(vnode_authorize(vp, NULLVP, (KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA), ctx) != 0) { + vnode_put(vp); + error = EBADF; + goto outdrop; + } + error = cp_vnode_setclass (vp, tmp); + vnode_put(vp); + break; + } +#endif /* CONFIG_PROTECT */ + + case F_MOVEDATAEXTENTS: { + struct fileproc *fp2 = NULL; + struct vnode *src_vp = NULLVP; + struct vnode *dst_vp = NULLVP; + /* We need to grab the 2nd FD out of the arguments before moving on. */ + int fd2 = CAST_DOWN_EXPLICIT(int32_t, uap->arg); + + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } + vp = src_vp = (struct vnode *)fp->f_data; + + /* For now, special case HFS+ only, since this is SPI. 
*/ + if (src_vp->v_tag != VT_HFS) { + error = EINVAL; + goto out; + } + + /* We're still holding the proc FD lock */ + if ( (error = fp_lookup(p, fd2, &fp2, 1)) ) { + error = EBADF; + goto out; + } + if (fp2->f_type != DTYPE_VNODE) { + fp_drop(p, fd2, fp2, 1); + error = EBADF; + goto out; + } + dst_vp = (struct vnode *)fp2->f_data; + + /* For now, special case HFS+ only, since this is SPI. */ + if (dst_vp->v_tag != VT_HFS) { + fp_drop(p, fd2, fp2, 1); + error = EINVAL; + goto out; + } + +#if CONFIG_MACF + /* Re-do MAC checks against the new FD, pass in a fake argument */ + error = mac_file_check_fcntl(proc_ucred(p), fp2->f_fglob, uap->cmd, 0); + if (error) { + fp_drop(p, fd2, fp2, 1); + goto out; + } +#endif + /* Audit the 2nd FD */ + AUDIT_ARG(fd, fd2); + + proc_fdunlock(p); + + /* Proc lock dropped; now we have a legit pair of FDs. Go to work */ + + if (vnode_getwithref(src_vp)) { + fp_drop(p, fd2, fp2, 0); + error = ENOENT; + goto outdrop; + } + if (vnode_getwithref(dst_vp)) { + vnode_put (src_vp); + fp_drop(p, fd2, fp2, 0); + error = ENOENT; + goto outdrop; + } + + /* + * Basic asserts; validate they are not the same and that + * both live on the same filesystem. + */ + + if (dst_vp == src_vp) { + vnode_put (src_vp); + vnode_put (dst_vp); + fp_drop (p, fd2, fp2, 0); + error = EINVAL; + goto outdrop; + } + + if (dst_vp->v_mount != src_vp->v_mount) { + vnode_put (src_vp); + vnode_put (dst_vp); + fp_drop (p, fd2, fp2, 0); + error = EXDEV; + goto outdrop; + } + + /* Now check for write access to the target files */ + if(vnode_authorize(src_vp, NULLVP, + (KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA), &context) != 0) { + vnode_put(src_vp); + vnode_put(dst_vp); + fp_drop(p, fd2, fp2, 0); + error = EBADF; + goto outdrop; + } + + if(vnode_authorize(dst_vp, NULLVP, + (KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA), &context) != 0) { + vnode_put(src_vp); + vnode_put(dst_vp); + fp_drop(p, fd2, fp2, 0); + error = EBADF; + goto outdrop; + } + + /* Verify that both vps point to files and not directories */ + if (!vnode_isreg(src_vp) || !vnode_isreg(dst_vp)) { + vnode_put(src_vp); + vnode_put(dst_vp); + fp_drop(p, fd2, fp2, 0); + error = EINVAL; + goto outdrop; + } + + /* + * The exchangedata syscall handler passes in 0 for the flags to VNOP_EXCHANGE. + * We'll pass in our special bit indicating that the new behavior is expected + */ + + error = VNOP_EXCHANGE(src_vp, dst_vp, FSOPT_EXCHANGE_DATA_ONLY, &context); + + vnode_put (src_vp); + vnode_put (dst_vp); + fp_drop(p, fd2, fp2, 0); break; } - case F_GETPROTECTIONCLASS: { - // stub to make the API work - printf("Reached F_GETPROTECTIONCLASS, returning without action\n"); + /* + * Set the vnode pointed to by 'fd' + * and tag it as the (potentially future) backing store + * for another filesystem + */ + case F_SETBACKINGSTORE: { + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } + vp = (struct vnode *)fp->f_data; + + if (vp->v_tag != VT_HFS) { + error = EINVAL; + goto out; + + } + proc_fdunlock(p); + + if (vnode_getwithref(vp)) { + error = ENOENT; + goto outdrop; + } + + /* only proceed if you have write access */ + vfs_context_t ctx = vfs_context_current(); + if(vnode_authorize(vp, NULLVP, (KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA), ctx) != 0) { + vnode_put(vp); + error = EBADF; + goto outdrop; + } + + + /* If arg != 0, set, otherwise unset */ + if (uap->arg) { + error = hfs_set_backingstore (vp, 1); + } + else { + error = hfs_set_backingstore (vp, 0); + } + /* Success. explicitly set error to 0. 
*/ error = 0; - goto out; + + vnode_put(vp); + break; } - case F_SETPROTECTIONCLASS: { - // stub to make the API work - printf("Reached F_SETPROTECTIONCLASS, returning without action\n"); - error = 0; - goto out; + /* + * like F_GETPATH, but special semantics for + * the mobile time machine handler. + */ + case F_GETPATH_MTMINFO: { + char *pathbufp; + int pathlen; + + if (fp->f_type != DTYPE_VNODE) { + error = EBADF; + goto out; + } + vp = (struct vnode *)fp->f_data; + proc_fdunlock(p); + + pathlen = MAXPATHLEN; + MALLOC(pathbufp, char *, pathlen, M_TEMP, M_WAITOK); + if (pathbufp == NULL) { + error = ENOMEM; + goto outdrop; + } + if ( (error = vnode_getwithref(vp)) == 0 ) { + int backingstore = 0; + + /* Check for error from vn_getpath before moving on */ + if ((error = vn_getpath(vp, pathbufp, &pathlen)) == 0) { + if (vp->v_tag == VT_HFS) { + error = hfs_is_backingstore (vp, &backingstore); + } + (void)vnode_put(vp); + + if (error == 0) { + error = copyout((caddr_t)pathbufp, argp, pathlen); + } + if (error == 0) { + /* + * If the copyout was successful, now check to ensure + * that this vnode is not a BACKINGSTORE vnode. mtmd + * wants the path regardless. + */ + if (backingstore) { + error = EBUSY; + } + } + } else + (void)vnode_put(vp); + } + FREE(pathbufp, M_TEMP); + goto outdrop; } @@ -1730,6 +2091,7 @@ out: * Parameters: p Process performing the dup * old The fd to dup * new The fd to dup it to + * fd_flags Flags to augment the new fd * retval Pointer to the call return area * * Returns: 0 Success @@ -1744,10 +2106,11 @@ out: * * Notes: This function may drop and reacquire this lock; it is unsafe * for a caller to assume that other state protected by the lock - * has not been subsequently changes out from under it. + * has not been subsequently changed out from under it. 
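The fd_flags plumbing documented above is what lets F_DUPFD_CLOEXEC tag the new descriptor atomically in the kernel; a user-space sketch, where fd is any open descriptor:

	#include <fcntl.h>

	int dupfd = fcntl(fd, F_DUPFD_CLOEXEC, 0);	/* lowest free fd >= 0 */
	if (dupfd >= 0) {
		/* dupfd was created with close-on-exec already set, so there
		   is no race against a concurrent fork()/exec() in another
		   thread, unlike a separate F_SETFD step */
	}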
*/ int -finishdup(proc_t p, struct filedesc *fdp, int old, int new, int32_t *retval) +finishdup(proc_t p, + struct filedesc *fdp, int old, int new, int fd_flags, int32_t *retval) { struct fileproc *nfp; struct fileproc *ofp; @@ -1758,9 +2121,8 @@ finishdup(proc_t p, struct filedesc *fdp, int old, int new, int32_t *retval) #if DIAGNOSTIC proc_fdlock_assert(p, LCK_MTX_ASSERT_OWNED); #endif - if ((ofp = fdp->fd_ofiles[old]) == NULL || - (fdp->fd_ofileflags[old] & UF_RESERVED)) { + (fdp->fd_ofileflags[old] & UF_RESERVED)) { fdrelse(p, new); return (EBADF); } @@ -1796,13 +2158,14 @@ finishdup(proc_t p, struct filedesc *fdp, int old, int new, int32_t *retval) #if DIAGNOSTIC if (fdp->fd_ofiles[new] != 0) - panic("finishdup: overwriting fd_ofiles with new %d\n", new); + panic("finishdup: overwriting fd_ofiles with new %d", new); if ((fdp->fd_ofileflags[new] & UF_RESERVED) == 0) - panic("finishdup: unreserved fileflags with new %d\n", new); + panic("finishdup: unreserved fileflags with new %d", new); #endif if (new > fdp->fd_lastfile) fdp->fd_lastfile = new; + *fdflags(p, new) |= fd_flags; procfdtbl_releasefd(p, new, nfp); *retval = new; return (0); @@ -1897,13 +2260,13 @@ close_internal_locked(proc_t p, int fd, struct fileproc *fp, int flags) if ((fp->f_flags & FP_CLOSING) == FP_CLOSING) { - panic("close_internal_locked: being called on already closing fd\n"); + panic("close_internal_locked: being called on already closing fd"); } #if DIAGNOSTIC if ((fdp->fd_ofileflags[fd] & UF_RESERVED) == 0) - panic("close_internal: unreserved fileflags with fd %d\n", fd); + panic("close_internal: unreserved fileflags with fd %d", fd); #endif fp->f_flags |= FP_CLOSING; @@ -1961,7 +2324,7 @@ close_internal_locked(proc_t p, int fd, struct fileproc *fp, int flags) #if DIAGNOSTIC if (resvfd != 0) { if ((fdp->fd_ofileflags[fd] & UF_RESERVED) == 0) - panic("close with reserved fd returns with freed fd:%d: proc: %x\n", fd, (unsigned int)p); + panic("close with reserved fd returns with freed fd:%d: proc: %p", fd, p); } #endif @@ -3150,9 +3513,14 @@ fp_drop(proc_t p, int fd, struct fileproc *fp, int locked) } fp->f_iocount--; - if (p->p_fpdrainwait && fp->f_iocount == 0) { - p->p_fpdrainwait = 0; - needwakeup = 1; + if (fp->f_iocount == 0) { + if (fp->f_flags & FP_SELCONFLICT) + fp->f_flags &= ~FP_SELCONFLICT; + + if (p->p_fpdrainwait) { + p->p_fpdrainwait = 0; + needwakeup = 1; + } } if (!locked) proc_fdunlock(p); @@ -3188,7 +3556,7 @@ fp_drop(proc_t p, int fd, struct fileproc *fp, int locked) * * The fileproc referenced is not returned; because of this, care * must be taken to not drop the last reference (e.g. by closing - * the file). This is inhernely unsafe, since the reference may + * the file). This is inherently unsafe, since the reference may * not be recoverable from the vnode, if there is a subsequent * close that destroys the associate fileproc. The caller should * therefore retain their own reference on the fileproc so that @@ -3249,7 +3617,7 @@ file_vnode(int fd, struct vnode **vpp) * * The fileproc referenced is not returned; because of this, care * must be taken to not drop the last reference (e.g. by closing - * the file). This is inhernely unsafe, since the reference may + * the file). This is inherently unsafe, since the reference may * not be recoverable from the vnode, if there is a subsequent * close that destroys the associate fileproc. 
The caller should * therefore retain their own reference on the fileproc so that @@ -3314,7 +3682,7 @@ file_vnode_withvid(int fd, struct vnode **vpp, uint32_t * vidp) * * The fileproc referenced is not returned; because of this, care * must be taken to not drop the last reference (e.g. by closing - * the file). This is inhernely unsafe, since the reference may + * the file). This is inherently unsafe, since the reference may * not be recoverable from the socket, if there is a subsequent * close that destroys the associate fileproc. The caller should * therefore retain their own reference on the fileproc so that @@ -3445,9 +3813,14 @@ file_drop(int fd) } fp->f_iocount --; - if (p->p_fpdrainwait && fp->f_iocount == 0) { - p->p_fpdrainwait = 0; - needwakeup = 1; + if (fp->f_iocount == 0) { + if (fp->f_flags & FP_SELCONFLICT) + fp->f_flags &= ~FP_SELCONFLICT; + + if (p->p_fpdrainwait) { + p->p_fpdrainwait = 0; + needwakeup = 1; + } } proc_fdunlock(p); @@ -3481,7 +3854,7 @@ file_drop(int fd) * *resultfd (modified) Returned fd * * Locks: This function takes and drops the proc_fdlock; if this lock - * is alread held, use falloc_locked() instead. + * is already held, use falloc_locked() instead. * * Notes: This function takes separate process and context arguments * solely to support kern_exec.c; otherwise, it would take @@ -3505,7 +3878,7 @@ falloc(proc_t p, struct fileproc **resultfp, int *resultfd, vfs_context_t ctx) * falloc_locked * * Create a new open file structure and allocate - * a file decriptor for the process that refers to it. + * a file descriptor for the process that refers to it. * * Returns: 0 Success * @@ -3679,6 +4052,10 @@ fg_free(struct fileglob *fg) * that are either marked as close-on-exec, or which were in the * process of being opened at the time of the execve * + * Also handles the case (via posix_spawn()) where -all- + * files except those marked with "inherit" are treated as + * close-on-exec. + * + * Parameters: p Pointer to process calling * execve * @@ -3693,27 +4070,39 @@ fg_free(struct fileglob *fg) * XXX: We should likely reverse the lock and funnel drop/acquire * order to avoid the small race window; it's also possible that * if the program doing the exec has an outstanding listen socket - * and a network connection is completed asyncrhonously that we + * and a network connection is completed asynchronously that we * will end up with a "ghost" socket reference in the new process. * * This needs reworking to make it safe to remove the funnel from * the execve and posix_spawn system calls. */ void -fdexec(proc_t p) +fdexec(proc_t p, short flags) { struct filedesc *fdp = p->p_fd; int i; - struct fileproc *fp; + boolean_t cloexec_default = (flags & POSIX_SPAWN_CLOEXEC_DEFAULT) != 0; proc_fdlock(p); - i = fdp->fd_lastfile; + for (i = fdp->fd_lastfile; i >= 0; i--) { + + struct fileproc *fp = fdp->fd_ofiles[i]; + char *flagp = &fdp->fd_ofileflags[i]; - while (i >= 0) { + if (cloexec_default) { + /* + * Reverse the usual semantics of file descriptor + * inheritance - all of them should be closed + * except files marked explicitly as "inherit" and + * not marked close-on-exec. 
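A hedged user-space sketch of the reversed default described above; POSIX_SPAWN_CLOEXEC_DEFAULT is the Apple-specific attribute flag named in this change, and a dup2 file action is assumed here as one way to re-establish a descriptor the child should keep:

	posix_spawnattr_t attr;
	posix_spawn_file_actions_t fa;
	pid_t child;
	char *argv[] = { "child", NULL };

	posix_spawnattr_init(&attr);
	posix_spawnattr_setflags(&attr, POSIX_SPAWN_CLOEXEC_DEFAULT);
	posix_spawn_file_actions_init(&fa);
	/* everything is close-on-exec by default; keep stdout explicitly */
	posix_spawn_file_actions_adddup2(&fa, STDOUT_FILENO, STDOUT_FILENO);
	posix_spawn(&child, "/path/to/child", &fa, &attr, argv, NULL);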
+ */ + if ((*flagp & (UF_EXCLOSE|UF_INHERIT)) != UF_INHERIT) + *flagp |= UF_EXCLOSE; + *flagp &= ~UF_INHERIT; + } - fp = fdp->fd_ofiles[i]; if ( - ((fdp->fd_ofileflags[i] & (UF_RESERVED|UF_EXCLOSE)) == UF_EXCLOSE) + ((*flagp & (UF_RESERVED|UF_EXCLOSE)) == UF_EXCLOSE) #if CONFIG_MACF || (fp && mac_file_check_inherit(proc_ucred(p), fp->f_fglob)) #endif @@ -3725,10 +4114,21 @@ fdexec(proc_t p) fdp->fd_lastfile--; if (i < fdp->fd_freefile) fdp->fd_freefile = i; + + /* + * Wait for any third party viewers (e.g., lsof) + * to release their references to this fileproc. + */ + while (fp->f_iocount > 0) { + p->p_fpdrainwait = 1; + msleep(&p->p_fpdrainwait, &p->p_fdmlock, PRIBIO, + "fpdrain", NULL); + } + closef_locked(fp, fp->f_fglob, p); + FREE_ZONE(fp, sizeof(*fp), M_FILEPROC); } - i--; } proc_fdunlock(p); } @@ -3764,7 +4164,7 @@ fdexec(proc_t p) * thread making the call, rather than from the process. * * In the case of a failure to obtain a reference, for most cases, - * the file entry will be silently droppped. There's an exception + * the file entry will be silently dropped. There's an exception * for the case of a chroot dir, since a failure to to obtain a * reference there would constitute an "escape" from the chroot * environment, which must not be allowed. In that case, we will @@ -3822,7 +4222,7 @@ fdcopy(proc_t p, vnode_t uth_cdir) * our reference from the parent also * since the vnode has gone DEAD making * it useless... by dropping it we'll - * be that much closer to recyling it + * be that much closer to recycling it */ vnode_rele(fdp->fd_cdir); fdp->fd_cdir = NULL; @@ -3994,7 +4394,7 @@ fdfree(proc_t p) if ((fp = fdp->fd_ofiles[i]) != NULL) { if (fdp->fd_ofileflags[i] & UF_RESERVED) - panic("fdfree: found fp with UF_RESERVED\n"); + panic("fdfree: found fp with UF_RESERVED"); /* closef drops the iocount ... 
*/ if ((fp->f_flags & FP_INCHRREAD) != 0) @@ -4186,7 +4586,7 @@ closef_locked(struct fileproc *fp, struct fileglob *fg, proc_t p) * Locks: Assumes the caller holds the proc_fdlock * * Notes: For character devices, this occurs on the last close of the - * device; for all other file descriptos, this occurs on each + * device; for all other file descriptors, this occurs on each * close to prevent fd's from being closed out from under * operations currently in progress and blocked * @@ -4210,14 +4610,25 @@ fileproc_drain(proc_t p, struct fileproc * fp) if (fp->f_fglob->fg_ops->fo_drain) { (*fp->f_fglob->fg_ops->fo_drain)(fp, &context); } - if (((fp->f_flags & FP_INSELECT)== FP_INSELECT)) { - wait_queue_wakeup_all((wait_queue_t)fp->f_waddr, NULL, THREAD_INTERRUPTED); + if ((fp->f_flags & FP_INSELECT) == FP_INSELECT) { + if (wait_queue_wakeup_all((wait_queue_t)fp->f_waddr, NULL, THREAD_INTERRUPTED) == KERN_INVALID_ARGUMENT) + panic("bad wait queue for wait_queue_wakeup_all %p", fp->f_waddr); } + if ((fp->f_flags & FP_SELCONFLICT) == FP_SELCONFLICT) { + if (wait_queue_wakeup_all(&select_conflict_queue, NULL, THREAD_INTERRUPTED) == KERN_INVALID_ARGUMENT) + panic("bad select_conflict_queue"); + } p->p_fpdrainwait = 1; msleep(&p->p_fpdrainwait, &p->p_fdmlock, PRIBIO, "fpdrain", NULL); } +#if DIAGNOSTIC + if ((fp->f_flags & FP_INSELECT) != 0) + panic("FP_INSELECT set on drained fp"); +#endif + if ((fp->f_flags & FP_SELCONFLICT) == FP_SELCONFLICT) + fp->f_flags &= ~FP_SELCONFLICT; } @@ -4329,7 +4740,6 @@ out1: } -#if CONFIG_EMBEDDED /* * fileport_makeport * @@ -4465,7 +4875,7 @@ fileport_makefd(proc_t p, struct fileport_makefd_args *uap, int32_t *retval) err = EINVAL; goto out; } - + MALLOC_ZONE(fp, struct fileproc *, sizeof(*fp), M_FILEPROC, M_WAITOK); if (fp == FILEPROC_NULL) { err = ENOMEM; @@ -4483,6 +4893,7 @@ fileport_makefd(proc_t p, struct fileport_makefd_args *uap, int32_t *retval) proc_fdunlock(p); goto out; } + *fdflags(p, fd) |= UF_EXCLOSE; procfdtbl_releasefd(p, fd, fp); proc_fdunlock(p); @@ -4500,7 +4911,6 @@ out: return err; } -#endif /* CONFIG_EMBEDDED */ /* @@ -4524,7 +4934,7 @@ out: * Notes: XXX This is not thread safe; see fdopen() above */ int -dupfdopen(struct filedesc *fdp, int indx, int dfd, int mode, int error) +dupfdopen(struct filedesc *fdp, int indx, int dfd, int flags, int error) { struct fileproc *wfp; struct fileproc *fp; @@ -4575,7 +4985,7 @@ dupfdopen(struct filedesc *fdp, int indx, int dfd, int mode, int error) * Check that the mode the file is being opened for is a * subset of the mode of the existing descriptor. */ - if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) { + if (((flags & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) { proc_fdunlock(p); return (EACCES); } @@ -4587,7 +4997,8 @@ dupfdopen(struct filedesc *fdp, int indx, int dfd, int mode, int error) fg_free(fp->f_fglob); fp->f_fglob = wfp->f_fglob; - fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; + fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd] | + ((flags & O_CLOEXEC) ? 
UF_EXCLOSE : 0); proc_fdunlock(p); return (0); @@ -4623,10 +5034,11 @@ fg_ref(struct fileproc * fp) #if DIAGNOSTIC if ((fp->f_flags & ~((unsigned int)FP_VALID_FLAGS)) != 0) - panic("fg_ref: invalid bits on fp%x\n", (unsigned int)fp); + panic("fg_ref: invalid bits on fp %p", fp); if (fg->fg_count == 0) - panic("fg_ref: adding fgcount to zeroed fg :fp %x, fg%x\n ", (unsigned int)fp, (unsigned int)fg); + panic("fg_ref: adding fgcount to zeroed fg: fp %p fg %p", + fp, fg); #endif fg->fg_count++; lck_mtx_unlock(&fg->fg_lock); diff --git a/bsd/kern/kern_event.c b/bsd/kern/kern_event.c index 5d195dcf0..632501473 100644 --- a/bsd/kern/kern_event.c +++ b/bsd/kern/kern_event.c @@ -92,6 +92,9 @@ #include <libkern/libkern.h> #include "net/net_str_id.h" +#include <mach/task.h> +#include <kern/vm_pressure.h> + MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); #define KQ_EVENT NULL @@ -140,6 +143,8 @@ static void kevent_continue(struct kqueue *kq, void *data, int error); static void kqueue_scan_continue(void *contp, wait_result_t wait_result); static int kqueue_process(struct kqueue *kq, kevent_callback_t callback, void *data, int *countp, struct proc *p); +static int kqueue_begin_processing(struct kqueue *kq); +static void kqueue_end_processing(struct kqueue *kq); static int knote_process(struct knote *kn, kevent_callback_t callback, void *data, struct kqtailq *inprocessp, struct proc *p); static void knote_put(struct knote *kn); @@ -183,6 +188,15 @@ static struct filterops proc_filtops = { .f_event = filt_proc, }; +static int filt_vmattach(struct knote *kn); +static void filt_vmdetach(struct knote *kn); +static int filt_vm(struct knote *kn, long hint); +static struct filterops vm_filtops = { + .f_attach = filt_vmattach, + .f_detach = filt_vmdetach, + .f_event = filt_vm, +}; + extern struct filterops fs_filtops; extern struct filterops sig_filtops; @@ -238,11 +252,6 @@ static struct filterops user_filtops = { .f_touch = filt_usertouch, }; -#if CONFIG_AUDIT -/* Audit session filter */ -extern struct filterops audit_session_filtops; -#endif - /* * Table for all system-defined filters. 
*/ @@ -261,11 +270,8 @@ static struct filterops *sysfilt_ops[] = { &machport_filtops, /* EVFILT_MACHPORT */ &fs_filtops, /* EVFILT_FS */ &user_filtops, /* EVFILT_USER */ -#if CONFIG_AUDIT - &audit_session_filtops, /* EVFILT_SESSION */ -#else - &bad_filtops, -#endif + &bad_filtops, /* unused */ + &vm_filtops, /* EVFILT_VM */ }; /* @@ -455,6 +461,7 @@ static int filt_procattach(struct knote *kn) { struct proc *p; + pid_t selfpid = (pid_t)0; assert(PID_MAX < NOTE_PDATAMASK); @@ -466,6 +473,16 @@ filt_procattach(struct knote *kn) return (ESRCH); } + if ((kn->kn_sfflags & NOTE_EXIT) != 0) { + selfpid = proc_selfpid(); + /* check for validity of NOTE_EXISTATUS */ + if (((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) && + ((p->p_ppid != selfpid) && (((p->p_lflag & P_LTRACED) == 0) || (p->p_oppid != selfpid)))) { + proc_rele(p); + return(EACCES); + } + } + proc_klist_lock(); kn->kn_flags |= EV_CLEAR; /* automatically set */ @@ -524,12 +541,57 @@ filt_proc(struct knote *kn, long hint) if (event == NOTE_REAP || (event == NOTE_EXIT && !(kn->kn_sfflags & NOTE_REAP))) { kn->kn_flags |= (EV_EOF | EV_ONESHOT); } + if ((event == NOTE_EXIT) && ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0)) { + kn->kn_fflags |= NOTE_EXITSTATUS; + kn->kn_data = (hint & NOTE_PDATAMASK); + } + if ((event == NOTE_RESOURCEEND) && ((kn->kn_sfflags & NOTE_RESOURCEEND) != 0)) { + kn->kn_fflags |= NOTE_RESOURCEEND; + kn->kn_data = (hint & NOTE_PDATAMASK); + } } /* atomic check, no locking need when called from above */ return (kn->kn_fflags != 0); } +/* + * Virtual memory kevents + * + * author: Matt Jacobson [matthew_jacobson@apple.com] + */ + +static int +filt_vmattach(struct knote *kn) +{ + /* + * The note will be cleared once the information has been flushed to the client. + * If there is still pressure, we will be re-alerted. + */ + kn->kn_flags |= EV_CLEAR; + + return vm_knote_register(kn); +} + +static void +filt_vmdetach(struct knote *kn) +{ + vm_knote_unregister(kn); +} + +static int +filt_vm(struct knote *kn, long hint) +{ + /* hint == 0 means this is just an alive? check (always true) */ + if (hint != 0) { + /* If this knote is interested in the event specified in hint... */ + if ((kn->kn_sfflags & hint) != 0) { + kn->kn_fflags |= hint; + } + } + + return (kn->kn_fflags != 0); +} /* * filt_timervalidate - process data from user @@ -872,7 +934,7 @@ filt_userattach(struct knote *kn) { /* EVFILT_USER knotes are not attached to anything in the kernel */ kn->kn_hook = NULL; - if (kn->kn_fflags & NOTE_TRIGGER || kn->kn_flags & EV_TRIGGER) { + if (kn->kn_fflags & NOTE_TRIGGER) { kn->kn_hookid = 1; } else { kn->kn_hookid = 0; @@ -895,10 +957,10 @@ filt_user(struct knote *kn, __unused long hint) static void filt_usertouch(struct knote *kn, struct kevent64_s *kev, long type) { - int ffctrl; + uint32_t ffctrl; switch (type) { case EVENT_REGISTER: - if (kev->fflags & NOTE_TRIGGER || kev->flags & EV_TRIGGER) { + if (kev->fflags & NOTE_TRIGGER) { kn->kn_hookid = 1; } @@ -1511,6 +1573,7 @@ kevent_register(struct kqueue *kq, struct kevent64_s *kev, __unused struct proc error = fops->f_attach(kn); kqlock(kq); + if (error != 0) { /* * Failed to attach correctly, so drop. 
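A sketch of registering for the new EVFILT_VM filter from user space; NOTE_VM_PRESSURE is assumed here as the fflags bit the VM code posts as the hint when memory pressure rises, so treat the exact constant as an assumption:

	int kq = kqueue();
	struct kevent req, ev;

	EV_SET(&req, 0, EVFILT_VM, EV_ADD | EV_ENABLE, NOTE_VM_PRESSURE, 0, NULL);
	if (kq >= 0 && kevent(kq, &req, 1, NULL, 0, NULL) == 0) {
		/* blocks until the kernel posts a pressure note; EV_CLEAR is
		   set by filt_vmattach(), so the knote rearms after delivery */
		(void)kevent(kq, NULL, 0, &ev, 1, NULL);
	}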
@@ -1594,11 +1657,6 @@ kevent_register(struct kqueue *kq, struct kevent64_s *kev, __unused struct proc */ if (!fops->f_isfd && fops->f_touch != NULL) fops->f_touch(kn, kev, EVENT_REGISTER); - - /* We may need to push some info down to a networked filesystem */ - if (kn->kn_filter == EVFILT_VNODE) { - vnode_knoteupdate(kn); - } } /* still have use ref on knote */ @@ -1770,6 +1828,47 @@ knote_process(struct knote *kn, return error; } +/* + * Return 0 to indicate that processing should proceed, + * -1 if there is nothing to process. + * + * Called with kqueue locked and returns the same way, + * but may drop lock temporarily. + */ +static int +kqueue_begin_processing(struct kqueue *kq) +{ + for (;;) { + if (kq->kq_count == 0) { + return -1; + } + + /* if someone else is processing the queue, wait */ + if (kq->kq_nprocess != 0) { + wait_queue_assert_wait((wait_queue_t)kq->kq_wqs, &kq->kq_nprocess, THREAD_UNINT, 0); + kq->kq_state |= KQ_PROCWAIT; + kqunlock(kq); + thread_block(THREAD_CONTINUE_NULL); + kqlock(kq); + } else { + kq->kq_nprocess = 1; + return 0; + } + } +} + +/* + * Called with kqueue lock held. + */ +static void +kqueue_end_processing(struct kqueue *kq) +{ + kq->kq_nprocess = 0; + if (kq->kq_state & KQ_PROCWAIT) { + kq->kq_state &= ~KQ_PROCWAIT; + wait_queue_wakeup_all((wait_queue_t)kq->kq_wqs, &kq->kq_nprocess, THREAD_AWAKENED); + } +} /* * kqueue_process - process the triggered events in a kqueue @@ -1799,23 +1898,13 @@ kqueue_process(struct kqueue *kq, int error; TAILQ_INIT(&inprocess); - restart: - if (kq->kq_count == 0) { + + if (kqueue_begin_processing(kq) == -1) { *countp = 0; + /* Nothing to process */ return 0; } - /* if someone else is processing the queue, wait */ - if (hw_atomic_add(&kq->kq_nprocess, 1) != 1) { - hw_atomic_sub(&kq->kq_nprocess, 1); - wait_queue_assert_wait((wait_queue_t)kq->kq_wqs, &kq->kq_nprocess, THREAD_UNINT, 0); - kq->kq_state |= KQ_PROCWAIT; - kqunlock(kq); - thread_block(THREAD_CONTINUE_NULL); - kqlock(kq); - goto restart; - } - /* * Clear any pre-posted status from previous runs, so we only * detect events that occur during this run. @@ -1850,11 +1939,8 @@ kqueue_process(struct kqueue *kq, kn->kn_tq = &kq->kq_head; TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); } - hw_atomic_sub(&kq->kq_nprocess, 1); - if (kq->kq_state & KQ_PROCWAIT) { - kq->kq_state &= ~KQ_PROCWAIT; - wait_queue_wakeup_all((wait_queue_t)kq->kq_wqs, &kq->kq_nprocess, THREAD_AWAKENED); - } + + kqueue_end_processing(kq); *countp = nevents; return error; @@ -2044,11 +2130,15 @@ static int kqueue_select(struct fileproc *fp, int which, void *wql, __unused vfs_context_t ctx) { struct kqueue *kq = (struct kqueue *)fp->f_data; - int again; - + struct knote *kn; + struct kqtailq inprocessq; + int retnum = 0; + if (which != FREAD) return 0; + TAILQ_INIT(&inprocessq); + kqlock(kq); /* * If this is the first pass, link the wait queue associated with the @@ -2067,11 +2157,12 @@ kqueue_select(struct fileproc *fp, int which, void *wql, __unused vfs_context_t (wait_queue_link_t)wql); } - retry: - again = 0; - if (kq->kq_count != 0) { - struct knote *kn; + if (kqueue_begin_processing(kq) == -1) { + kqunlock(kq); + return 0; + } + if (kq->kq_count != 0) { /* * there is something queued - but it might be a * KN_STAYQUEUED knote, which may or may not have @@ -2079,31 +2170,42 @@ kqueue_select(struct fileproc *fp, int which, void *wql, __unused vfs_context_t * list of knotes to see, and peek at the stay- * queued ones to be really sure. 
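kqueue_begin_processing()/kqueue_end_processing() replace the hw_atomic_add() dance previously open-coded in kqueue_process() with a plain single-owner gate over kq_nprocess; KQ_PROCWAIT records that someone is parked on the kqueue's wait queue set. The protocol reduces to this userland shape, with a mutex and condition variable standing in for kqlock and the wait queue (names are illustrative, not kernel API):

#include <pthread.h>
#include <stdbool.h>

struct gate {
	pthread_mutex_t lock;	/* analogue of kqlock */
	pthread_cond_t	wake;	/* analogue of the kq_wqs wait queue */
	bool		busy;	/* analogue of kq_nprocess */
	bool		waiters;/* analogue of KQ_PROCWAIT */
	int		count;	/* analogue of kq_count */
};

/* Call with lock held; returns 0 to proceed, -1 if nothing to process.
 * The lock is dropped while blocked, as in kqueue_begin_processing(). */
int
gate_begin(struct gate *g)
{
	for (;;) {
		if (g->count == 0)
			return -1;
		if (g->busy) {
			g->waiters = true;
			pthread_cond_wait(&g->wake, &g->lock);
		} else {
			g->busy = true;
			return 0;
		}
	}
}

/* Call with lock held, mirroring kqueue_end_processing(). */
void
gate_end(struct gate *g)
{
	g->busy = false;
	if (g->waiters) {
		g->waiters = false;
		pthread_cond_broadcast(&g->wake);
	}
}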
*/ - TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) { - int retnum = 0; - if ((kn->kn_status & KN_STAYQUEUED) == 0 || - (retnum = kn->kn_fop->f_peek(kn)) > 0) { - kqunlock(kq); - return 1; + while ((kn = (struct knote*)TAILQ_FIRST(&kq->kq_head)) != NULL) { + if ((kn->kn_status & KN_STAYQUEUED) == 0) { + retnum = 1; + goto out; } - if (retnum < 0) - again++; + + TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); + TAILQ_INSERT_TAIL(&inprocessq, kn, kn_tqe); + + if (kqlock2knoteuse(kq, kn)) { + unsigned peek; + + peek = kn->kn_fop->f_peek(kn); + if (knoteuse2kqlock(kq, kn)) { + if (peek > 0) { + retnum = 1; + goto out; + } + } else { + retnum = 0; + } + } } } - /* - * If we stumbled across a knote that couldn't be peeked at, - * we have to drop the kq lock and try again. - */ - if (again > 0) { - kqunlock(kq); - mutex_pause(0); - kqlock(kq); - goto retry; +out: + /* Return knotes to active queue */ + while ((kn = TAILQ_FIRST(&inprocessq)) != NULL) { + TAILQ_REMOVE(&inprocessq, kn, kn_tqe); + kn->kn_tq = &kq->kq_head; + TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); } + kqueue_end_processing(kq); kqunlock(kq); - return 0; + return retnum; } /* @@ -2312,10 +2414,7 @@ knote_link_wait_queue(struct knote *kn, struct wait_queue *wq) kr = wait_queue_link(wq, kq->kq_wqs); if (kr == KERN_SUCCESS) { - kqlock(kq); - kn->kn_status |= KN_STAYQUEUED; - knote_enqueue(kn); - kqunlock(kq); + knote_markstayqueued(kn); return 0; } else { return ENOMEM; @@ -2531,6 +2630,7 @@ knote_init(void) /* Initialize the timer filter lock */ lck_mtx_init(&_filt_timerlock, kq_lck_grp, kq_lck_attr); + lck_mtx_init(&vm_pressure_klist_mutex, kq_lck_grp, kq_lck_attr); } SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL) @@ -2843,3 +2943,12 @@ fill_kqueueinfo(struct kqueue *kq, struct kqueue_info * kinfo) return(0); } + +void +knote_markstayqueued(struct knote *kn) +{ + kqlock(kn->kn_kq); + kn->kn_status |= KN_STAYQUEUED; + knote_enqueue(kn); + kqunlock(kn->kn_kq); +} diff --git a/bsd/kern/kern_exec.c b/bsd/kern/kern_exec.c index 722415a70..3de273b36 100644 --- a/bsd/kern/kern_exec.c +++ b/bsd/kern/kern_exec.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -139,6 +139,7 @@ #include <vm/vm_protos.h> #include <vm/vm_kern.h> +#include <machine/pal_routines.h> #if CONFIG_DTRACE /* Do not include dtrace.h, it redefines kmem_[alloc/free] */ @@ -153,8 +154,7 @@ extern void dtrace_lazy_dofs_destroy(proc_t); thread_t fork_create_child(task_t parent_task, proc_t child_proc, int inherit_memory, int is64bit); void vfork_exit(proc_t p, int rv); int setsigvec(proc_t, thread_t, int, struct __kern_sigaction *, boolean_t in_sigstart); -void workqueue_exit(struct proc *); - +extern void proc_apply_task_networkbg_internal(proc_t); /* * Mach things for which prototypes are unavailable from Mach headers @@ -186,16 +186,6 @@ extern struct savearea *get_user_regs(thread_t); #include <sys/sdt.h> -/* - * SIZE_MAXPTR The maximum size of a user space pointer, in bytes - * SIZE_IMG_STRSPACE The available string space, minus two pointers; we - * define it interms of the maximum, since we don't - * know the pointer size going in, until after we've - * parsed the executable image. 
- */ -#define SIZE_MAXPTR 8 /* 64 bits */ -#define SIZE_IMG_STRSPACE (NCARGS - 2 * SIZE_MAXPTR) - /* * EAI_ITERLIMIT The maximum number of times to iterate an image * activator in exec_activate_image() before treating @@ -203,6 +193,12 @@ extern struct savearea *get_user_regs(thread_t); */ #define EAI_ITERLIMIT 10 +/* + * For #! interpreter parsing + */ +#define IS_WHITESPACE(ch) ((ch == ' ') || (ch == '\t')) +#define IS_EOL(ch) ((ch == '#') || (ch == '\n')) + extern vm_map_t bsd_pageable_map; extern struct fileops vnops; @@ -218,9 +214,10 @@ static int execargs_alloc(struct image_params *imgp); static int execargs_free(struct image_params *imgp); static int exec_check_permissions(struct image_params *imgp); static int exec_extract_strings(struct image_params *imgp); +static int exec_add_apple_strings(struct image_params *imgp); static int exec_handle_sugid(struct image_params *imgp); static int sugid_scripts = 0; -SYSCTL_INT (_kern, OID_AUTO, sugid_scripts, CTLFLAG_RW, &sugid_scripts, 0, ""); +SYSCTL_INT (_kern, OID_AUTO, sugid_scripts, CTLFLAG_RW | CTLFLAG_LOCKED, &sugid_scripts, 0, ""); static kern_return_t create_unix_stack(vm_map_t map, user_addr_t user_stack, int customstack, proc_t p); static int copyoutptr(user_addr_t ua, user_addr_t ptr, int ptr_size); @@ -232,12 +229,14 @@ __private_extern__ int open1(vfs_context_t, struct nameidata *, int, struct vnode_attr *, int32_t *); /* - * exec_add_string + * exec_add_user_string * * Add the requested string to the string space area. * * Parameters; struct image_params * image parameter block * user_addr_t string to add to strings area + * int segment from which string comes + * boolean_t TRUE if string contributes to NCARGS * * Returns: 0 Success * !0 Failure errno from copyinstr() @@ -245,29 +244,41 @@ int open1(vfs_context_t, struct nameidata *, int, struct vnode_attr *, int32_t * Implicit returns: * (imgp->ip_strendp) updated location of next add, if any * (imgp->ip_strspace) updated byte count of space remaining + * (imgp->ip_argspace) updated byte count of space in NCARGS */ static int -exec_add_string(struct image_params *imgp, user_addr_t str) +exec_add_user_string(struct image_params *imgp, user_addr_t str, int seg, boolean_t is_ncargs) { - int error = 0; - - do { - size_t len = 0; - if (imgp->ip_strspace <= 0) { + int error = 0; + + do { + size_t len = 0; + int space; + + if (is_ncargs) + space = imgp->ip_argspace; /* by definition smaller than ip_strspace */ + else + space = imgp->ip_strspace; + + if (space <= 0) { error = E2BIG; break; } - if (!UIO_SEG_IS_USER_SPACE(imgp->ip_seg)) { + + if (!UIO_SEG_IS_USER_SPACE(seg)) { char *kstr = CAST_DOWN(char *,str); /* SAFE */ - error = copystr(kstr, imgp->ip_strendp, imgp->ip_strspace, &len); + error = copystr(kstr, imgp->ip_strendp, space, &len); } else { - error = copyinstr(str, imgp->ip_strendp, imgp->ip_strspace, - &len); + error = copyinstr(str, imgp->ip_strendp, space, &len); } + imgp->ip_strendp += len; imgp->ip_strspace -= len; + if (is_ncargs) + imgp->ip_argspace -= len; + } while (error == ENAMETOOLONG); - + return error; } @@ -277,11 +288,10 @@ exec_add_string(struct image_params *imgp, user_addr_t str) * To support new app package launching for Mac OS X, the dyld needs the * first argument to execve() stored on the user stack. 
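exec_add_user_string() now charges two budgets: ip_strspace is what physically remains in the string arena, while ip_argspace counts only what POSIX charges against NCARGS. A userland sketch of the bounded append, with strlcpy() standing in for copyinstr(); unlike the kernel loop, which continues on ENAMETOOLONG, truncation here is simply reported as E2BIG:

#include <string.h>
#include <errno.h>

struct strarea {
	char *end;	/* next free byte (ip_strendp) */
	int strspace;	/* bytes left in the arena (ip_strspace) */
	int argspace;	/* bytes left of the NCARGS budget (ip_argspace) */
};

int
add_string(struct strarea *a, const char *str, int is_ncargs)
{
	/* by definition argspace <= strspace, as noted above */
	int space = is_ncargs ? a->argspace : a->strspace;
	size_t len;

	if (space <= 0)
		return E2BIG;

	len = strlcpy(a->end, str, (size_t)space);
	if (len >= (size_t)space)
		return E2BIG;	/* would have been truncated */
	len += 1;		/* charge the NUL, as copyinstr does */

	a->end += len;
	a->strspace -= (int)len;
	if (is_ncargs)
		a->argspace -= (int)len;
	return 0;
}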
* - * Save the executable path name at the top of the strings area and set + * Save the executable path name at the bottom of the strings area and set * the argument vector pointer to the location following that to indicate * the start of the argument and environment tuples, setting the remaining - * string space count to the size of the string area minus the path length - * and a reserve for two pointers. + * string space count to the size of the string area minus the path length. * * Parameters; struct image_params * image parameter block * char * path used to invoke program @@ -295,8 +305,9 @@ exec_add_string(struct image_params *imgp, user_addr_t str) * Implicit returns: * (imgp->ip_strings) saved path * (imgp->ip_strspace) space remaining in ip_strings - * (imgp->ip_argv) beginning of argument list * (imgp->ip_strendp) start of remaining copy area + * (imgp->ip_argspace) space remaining of NCARGS + * (imgp->ip_applec) Initial applev[0] * * Note: We have to do this before the initial namei() since in the * path contains symbolic links, namei() will overwrite the @@ -310,10 +321,7 @@ exec_save_path(struct image_params *imgp, user_addr_t path, int seg) { int error; size_t len; - char *kpath = CAST_DOWN(char *,path); /* SAFE */ - - imgp->ip_strendp = imgp->ip_strings; - imgp->ip_strspace = SIZE_IMG_STRSPACE; + char *kpath; len = MIN(MAXPATHLEN, imgp->ip_strspace); @@ -323,6 +331,7 @@ exec_save_path(struct image_params *imgp, user_addr_t path, int seg) error = copyinstr(path, imgp->ip_strings, len, &len); break; case UIO_SYSSPACE: + kpath = CAST_DOWN(char *,path); /* SAFE */ error = copystr(kpath, imgp->ip_strings, len, &len); break; default: @@ -333,12 +342,38 @@ exec_save_path(struct image_params *imgp, user_addr_t path, int seg) if (!error) { imgp->ip_strendp += len; imgp->ip_strspace -= len; - imgp->ip_argv = imgp->ip_strendp; } return(error); } +/* + * exec_reset_save_path + * + * If we detect a shell script, we need to reset the string area + * state so that the interpreter can be saved onto the stack. + + * Parameters; struct image_params * image parameter block + * + * Returns: int 0 Success + * + * Implicit returns: + * (imgp->ip_strings) saved path + * (imgp->ip_strspace) space remaining in ip_strings + * (imgp->ip_strendp) start of remaining copy area + * (imgp->ip_argspace) space remaining of NCARGS + * + */ +static int +exec_reset_save_path(struct image_params *imgp) +{ + imgp->ip_strendp = imgp->ip_strings; + imgp->ip_argspace = NCARGS; + imgp->ip_strspace = ( NCARGS + PAGE_SIZE ); + + return (0); +} + #ifdef IMGPF_POWERPC /* * exec_powerpc32_imgact @@ -406,11 +441,15 @@ exec_powerpc32_imgact(struct image_params *imgp) imgp->ip_flags |= IMGPF_POWERPC; /* impute an interpreter */ - error = copystr(exec_archhandler_ppc.path, imgp->ip_interp_name, + error = copystr(exec_archhandler_ppc.path, imgp->ip_interp_buffer, IMG_SHSIZE, &len); if (error) return (error); + exec_reset_save_path(imgp); + exec_save_path(imgp, CAST_USER_ADDR_T(imgp->ip_interp_buffer), + UIO_SYSSPACE); + /* * provide a replacement string for p->p_comm; we have to use an * alternate buffer for this, rather than replacing it directly, @@ -451,14 +490,12 @@ exec_shell_imgact(struct image_params *imgp) { char *vdata = imgp->ip_vdata; char *ihp; - char *line_endp; + char *line_startp, *line_endp; char *interp; - char temp[16]; proc_t p; struct fileproc *fp; int fd; int error; - size_t len; /* * Make sure it's a shell script. 
If we've already redirected @@ -480,65 +517,82 @@ exec_shell_imgact(struct image_params *imgp) #endif /* IMGPF_POWERPC */ imgp->ip_flags |= IMGPF_INTERPRET; + imgp->ip_interp_sugid_fd = -1; + imgp->ip_interp_buffer[0] = '\0'; - /* Check to see if SUGID scripts are permitted. If they aren't then + /* Check to see if SUGID scripts are permitted. If they aren't then * clear the SUGID bits. * imgp->ip_vattr is known to be valid. - */ - if (sugid_scripts == 0) { - imgp->ip_origvattr->va_mode &= ~(VSUID | VSGID); + */ + if (sugid_scripts == 0) { + imgp->ip_origvattr->va_mode &= ~(VSUID | VSGID); } - /* Find the nominal end of the interpreter line */ - for( ihp = &vdata[2]; *ihp != '\n' && *ihp != '#'; ihp++) { - if (ihp >= &vdata[IMG_SHSIZE]) + /* Try to find the first non-whitespace character */ + for( ihp = &vdata[2]; ihp < &vdata[IMG_SHSIZE]; ihp++ ) { + if (IS_EOL(*ihp)) { + /* Did not find interpreter, "#!\n" */ return (ENOEXEC); + } else if (IS_WHITESPACE(*ihp)) { + /* Whitespace, like "#! /bin/sh\n", keep going. */ + } else { + /* Found start of interpreter */ + break; + } } - line_endp = ihp; - ihp = &vdata[2]; - /* Skip over leading spaces - until the interpreter name */ - while ( ihp < line_endp && ((*ihp == ' ') || (*ihp == '\t'))) - ihp++; + if (ihp == &vdata[IMG_SHSIZE]) { + /* All whitespace, like "#! " */ + return (ENOEXEC); + } - /* - * Find the last non-whitespace character before the end of line or - * the beginning of a comment; this is our new end of line. - */ - for (;line_endp > ihp && ((*line_endp == ' ') || (*line_endp == '\t')); line_endp--) - continue; + line_startp = ihp; + + /* Try to find the end of the interpreter+args string */ + for ( ; ihp < &vdata[IMG_SHSIZE]; ihp++ ) { + if (IS_EOL(*ihp)) { + /* Got it */ + break; + } else { + /* Still part of interpreter or args */ + } + } - /* Empty? */ - if (line_endp == ihp) + if (ihp == &vdata[IMG_SHSIZE]) { + /* A long line, like "#! blah blah blah" without end */ return (ENOEXEC); + } + + /* Backtrack until we find the last non-whitespace */ + while (IS_EOL(*ihp) || IS_WHITESPACE(*ihp)) { + ihp--; + } + + /* The character after the last non-whitespace is our logical end of line */ + line_endp = ihp + 1; + + /* + * Now we have pointers to the usable part of: + * + * "#! 
/usr/bin/int first second third \n" + * ^ line_startp ^ line_endp + */ /* copy the interpreter name */ - interp = imgp->ip_interp_name; - while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t')) - *interp++ = *ihp++; + interp = imgp->ip_interp_buffer; + for ( ihp = line_startp; (ihp < line_endp) && !IS_WHITESPACE(*ihp); ihp++) + *interp++ = *ihp; *interp = '\0'; - exec_save_path(imgp, CAST_USER_ADDR_T(imgp->ip_interp_name), + exec_reset_save_path(imgp); + exec_save_path(imgp, CAST_USER_ADDR_T(imgp->ip_interp_buffer), UIO_SYSSPACE); - ihp = &vdata[2]; - while (ihp < line_endp) { - /* Skip leading whitespace before each argument */ - while ((*ihp == ' ') || (*ihp == '\t')) - ihp++; - - if (ihp >= line_endp) - break; - - /* We have an argument; copy it */ - while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t')) { - *imgp->ip_strendp++ = *ihp++; - imgp->ip_strspace--; - } - *imgp->ip_strendp++ = 0; - imgp->ip_strspace--; - imgp->ip_argc++; - } + /* Copy the entire interpreter + args for later processing into argv[] */ + interp = imgp->ip_interp_buffer; + for ( ihp = line_startp; (ihp < line_endp); ihp++) + *interp++ = *ihp; + *interp = '\0'; /* * If we have a SUID or SGID script, create a file descriptor @@ -562,10 +616,7 @@ exec_shell_imgact(struct image_params *imgp) proc_fdunlock(p); vnode_ref(imgp->ip_vp); - snprintf(temp, sizeof(temp), "/dev/fd/%d", fd); - error = copyoutstr(temp, imgp->ip_user_fname, sizeof(temp), &len); - if (error) - return(error); + imgp->ip_interp_sugid_fd = fd; } return (-3); @@ -736,6 +787,7 @@ exec_mach_imgact(struct image_params *imgp) load_result_t load_result; struct _posix_spawnattr *psa = NULL; int spawn = (imgp->ip_flags & IMGPF_SPAWN); + int apptype = 0; /* * make sure it's a Mach-O 1.0 or Mach-O 2.0 binary; the difference @@ -766,7 +818,7 @@ exec_mach_imgact(struct image_params *imgp) /* * Save off the vfexec state up front; we have to do this, because - * we need to know if we were in this state initally subsequent to + * we need to know if we were in this state initially subsequent to * creating the backing task, thread, and uthread for the child * process (from the vfs_context_t in img_parms). */ @@ -813,20 +865,14 @@ grade: if (error) goto bad; - AUDIT_ARG(argv, imgp->ip_argv, imgp->ip_argc, - imgp->ip_strendargvp - imgp->ip_argv); - AUDIT_ARG(envv, imgp->ip_strendargvp, imgp->ip_envc, - imgp->ip_strendp - imgp->ip_strendargvp); + error = exec_add_apple_strings(imgp); + if (error) + goto bad; - /* - * Hack for binary compatability; put three NULs on the end of the - * string area, and round it up to the next word boundary. This - * ensures padding with NULs to the boundary. - */ - imgp->ip_strendp[0] = 0; - imgp->ip_strendp[1] = 0; - imgp->ip_strendp[2] = 0; - imgp->ip_strendp += (((imgp->ip_strendp - imgp->ip_strings) + NBPW-1) & ~(NBPW-1)); + AUDIT_ARG(argv, imgp->ip_startargv, imgp->ip_argc, + imgp->ip_endargv - imgp->ip_startargv); + AUDIT_ARG(envv, imgp->ip_endargv, imgp->ip_envc, + imgp->ip_endenvv - imgp->ip_endargv); #ifdef IMGPF_POWERPC /* @@ -838,7 +884,7 @@ grade: * to the "encapsulated_binary:" label in exec_activate_image().
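The new "#!" scanning above (find the first non-whitespace byte, scan forward to an IS_EOL character, then trim trailing blanks) is easy to exercise standalone. A sketch with IMG_SHSIZE stubbed to a small constant; in the kernel it bounds the ip_vdata header buffer:

#include <stdio.h>

#define IS_WHITESPACE(ch) ((ch == ' ') || (ch == '\t'))
#define IS_EOL(ch)        ((ch == '#') || (ch == '\n'))
#define IMG_SHSIZE 512

/* Returns 0 and sets [startp, endp) to the interpreter+args span,
 * or -1 for the ENOEXEC cases the activator rejects. */
int
parse_shebang(char *vdata, char **startp, char **endp)
{
	char *ihp;

	if (vdata[0] != '#' || vdata[1] != '!')
		return -1;

	for (ihp = &vdata[2]; ihp < &vdata[IMG_SHSIZE]; ihp++) {
		if (IS_EOL(*ihp))
			return -1;		/* "#!\n": no interpreter */
		if (!IS_WHITESPACE(*ihp))
			break;			/* start of interpreter */
	}
	if (ihp == &vdata[IMG_SHSIZE])
		return -1;			/* all whitespace, like "#! " */
	*startp = ihp;

	for (; ihp < &vdata[IMG_SHSIZE]; ihp++)
		if (IS_EOL(*ihp))
			break;
	if (ihp == &vdata[IMG_SHSIZE])
		return -1;			/* unterminated line */

	while (IS_EOL(*ihp) || IS_WHITESPACE(*ihp))
		ihp--;				/* trim trailing blanks */
	*endp = ihp + 1;
	return 0;
}

int
main(void)
{
	char script[IMG_SHSIZE] = "#! /bin/sh -x  \n";
	char *s, *e;

	if (parse_shebang(script, &s, &e) == 0)
		printf("interpreter+args: \"%.*s\"\n", (int)(e - s), s);
	return 0;
}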
*/ if (imgp->ip_vattr->va_fsid == exec_archhandler_ppc.fsid && - imgp->ip_vattr->va_fileid == (uint64_t)((u_long)exec_archhandler_ppc.fileid)) { + imgp->ip_vattr->va_fileid == exec_archhandler_ppc.fileid) { imgp->ip_flags |= IMGPF_POWERPC; } #endif /* IMGPF_POWERPC */ @@ -846,7 +892,7 @@ grade: /* * We are being called to activate an image subsequent to a vfork() * operation; in this case, we know that our task, thread, and - * uthread are actualy those of our parent, and our proc, which we + * uthread are actually those of our parent, and our proc, which we * obtained indirectly from the image_params vfs_context_t, is the * new child process. */ @@ -885,7 +931,7 @@ grade: * Load the Mach-O file. * * NOTE: An error after this point indicates we have potentially - * destroyed or overwrote some process state while attempting an + * destroyed or overwritten some process state while attempting an * execve() following a vfork(), which is an unrecoverable condition. */ @@ -932,10 +978,9 @@ grade: cpu_type()); /* - * Close file descriptors - * which specify close-on-exec. + * Close file descriptors which specify close-on-exec. */ - fdexec(p); + fdexec(p, psa != NULL ? psa->psa_flags : 0); /* * deal with set[ug]id. @@ -959,14 +1004,6 @@ grade: goto badtoolate; } - /* - * There is no continuing workq context during - * vfork exec. So no need to reset then. Otherwise - * clear the workqueue context. - */ - if (vfexec == 0 && spawn == 0) { - (void)workqueue_exit(p); - } if (vfexec || spawn) { old_map = vm_map_switch(get_task_map(task)); } @@ -991,15 +1028,12 @@ grade: if (load_result.dynlinker) { uint64_t ap; + int new_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 8 : 4; /* Adjust the stack */ - if (imgp->ip_flags & IMGPF_IS_64BIT) { - ap = thread_adjuserstack(thread, -8); - error = copyoutptr(load_result.mach_header, ap, 8); - } else { - ap = thread_adjuserstack(thread, -4); - error = suword(ap, load_result.mach_header); - } + ap = thread_adjuserstack(thread, -new_ptr_size); + error = copyoutptr(load_result.mach_header, ap, new_ptr_size); + if (error) { if (vfexec || spawn) vm_map_switch(old_map); @@ -1058,6 +1092,8 @@ grade: p->p_comm[imgp->ip_ndp->ni_cnd.cn_namelen] = '\0'; } + pal_dbg_set_task_name( p->task ); + memcpy(&p->p_uuid[0], &load_result.uuid[0], sizeof(p->p_uuid)); // <rdar://6598155> dtrace code cleanup needed @@ -1143,6 +1179,22 @@ grade: proc_unlock(p); (void) task_suspend(p->task); } + if ((psa->psa_flags & POSIX_SPAWN_OSX_TALAPP_START) || (psa->psa_flags & POSIX_SPAWN_OSX_DBCLIENT_START) || (psa->psa_flags & POSIX_SPAWN_IOS_APP_START)) { + if ((psa->psa_flags & POSIX_SPAWN_OSX_TALAPP_START)) + apptype = PROC_POLICY_OSX_APPTYPE_TAL; + else if (psa->psa_flags & POSIX_SPAWN_OSX_DBCLIENT_START) + apptype = PROC_POLICY_OSX_APPTYPE_DBCLIENT; + else if (psa->psa_flags & POSIX_SPAWN_IOS_APP_START) + apptype = PROC_POLICY_IOS_APPTYPE; + else + apptype = 0; + proc_set_task_apptype(p->task, apptype); + if ((apptype == PROC_POLICY_OSX_APPTYPE_TAL) || + (apptype == PROC_POLICY_OSX_APPTYPE_DBCLIENT)) { + + proc_apply_task_networkbg_internal(p); + } + } } /* @@ -1245,21 +1297,16 @@ exec_activate_image(struct image_params *imgp) if (error) goto bad; - /* - * XXXAUDIT: Note: the double copyin introduces an audit - * race. To correct this race, we must use a single - * copyin(), e.g. by passing a flag to namei to indicate an - * external path buffer is being used. 
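The dynlinker fixup above collapses the old 32/64-bit branches into a single path driven by new_ptr_size. The same shape in userland, with a local buffer standing in for the user stack and memcpy() for thread_adjuserstack() plus copyoutptr(); everything here is an illustrative stand-in:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Grow the "stack" down by one pointer and store value there,
 * narrowing to 32 bits when the image is not 64-bit. */
static uint64_t
push_ptr(unsigned char *stack_base, uint64_t sp, uint64_t value, int ptr_size)
{
	sp -= ptr_size;			/* thread_adjuserstack(thread, -new_ptr_size) */
	if (ptr_size == 8) {
		memcpy(stack_base + sp, &value, 8);
	} else {
		uint32_t v32 = (uint32_t)value;
		memcpy(stack_base + sp, &v32, 4);	/* copyoutptr() narrows */
	}
	return sp;
}

int
main(void)
{
	unsigned char stack[64] = {0};
	uint64_t sp = sizeof(stack);

	sp = push_ptr(stack, sp, 0x100000000ULL, 8);	/* 64-bit image */
	sp = push_ptr(stack, sp, 0x1000, 4);		/* 32-bit image */
	printf("sp now %llu\n", (unsigned long long)sp);
	return 0;
}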
- */ error = exec_save_path(imgp, imgp->ip_user_fname, imgp->ip_seg); if (error) { goto bad_notrans; } + /* Use imgp->ip_strings, which contains the copyin-ed exec path */ DTRACE_PROC1(exec, uintptr_t, imgp->ip_strings); - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, - imgp->ip_seg, imgp->ip_user_fname, imgp->ip_vfs_context); + NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, + UIO_SYSSPACE, CAST_USER_ADDR_T(imgp->ip_strings), imgp->ip_vfs_context); again: error = namei(&nd); @@ -1268,7 +1315,20 @@ again: imgp->ip_ndp = &nd; /* successful namei(); call nameidone() later */ imgp->ip_vp = nd.ni_vp; /* if set, need to vnode_put() at some point */ - error = proc_transstart(p, 0); + /* + * Before we start the transition from binary A to binary B, make + * sure another thread hasn't started exiting the process. We grab + * the proc lock to check p_lflag initially, and the transition + * mechanism ensures that the value doesn't change after we release + * the lock. + */ + proc_lock(p); + if (p->p_lflag & P_LEXIT) { + proc_unlock(p); + goto bad_notrans; + } + error = proc_transstart(p, 1); + proc_unlock(p); if (error) goto bad_notrans; @@ -1322,11 +1382,16 @@ encapsulated_binary: mac_vnode_label_copy(imgp->ip_vp->v_label, imgp->ip_scriptlabelp); #endif + + nameidone(&nd); + vnode_put(imgp->ip_vp); imgp->ip_vp = NULL; /* already put */ - - NDINIT(&nd, LOOKUP, (nd.ni_cnd.cn_flags & HASBUF) | (FOLLOW | LOCKLEAF), - UIO_SYSSPACE, CAST_USER_ADDR_T(imgp->ip_interp_name), imgp->ip_vfs_context); + imgp->ip_ndp = NULL; /* already nameidone */ + + /* Use imgp->ip_strings, which exec_shell_imgact reset to the interpreter */ + NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF, + UIO_SYSSPACE, CAST_USER_ADDR_T(imgp->ip_strings), imgp->ip_vfs_context); #ifdef IMGPF_POWERPC /* @@ -1379,10 +1444,10 @@ bad_notrans: * short psa_flags posix spawn attribute flags * * Returns: 0 Success - * KERN_FAILURE Failure + * EINVAL Failure * ENOTSUP Illegal posix_spawn attr flag was set */ -static int +static errno_t exec_handle_port_actions(struct image_params *imgp, short psa_flags) { _posix_spawn_port_actions_t pacts = imgp->ip_px_spa; @@ -1390,16 +1455,17 @@ exec_handle_port_actions(struct image_params *imgp, short psa_flags) _ps_port_action_t *act = NULL; task_t task = p->task; ipc_port_t port = NULL; - kern_return_t ret = KERN_SUCCESS; + errno_t ret = KERN_SUCCESS; int i; for (i = 0; i < pacts->pspa_count; i++) { act = &pacts->pspa_actions[i]; - ret = ipc_object_copyin(get_task_ipcspace(current_task()), + if (ipc_object_copyin(get_task_ipcspace(current_task()), CAST_MACH_PORT_TO_NAME(act->new_port), MACH_MSG_TYPE_COPY_SEND, - (ipc_object_t *) &port); + (ipc_object_t *) &port) != KERN_SUCCESS) + return EINVAL; if (ret) return ret; @@ -1409,19 +1475,19 @@ exec_handle_port_actions(struct image_params *imgp, short psa_flags) /* Only allowed when not under vfork */ if (!(psa_flags & POSIX_SPAWN_SETEXEC)) return ENOTSUP; - ret = task_set_special_port(task, + ret = (task_set_special_port(task, act->which, - port); + port) == KERN_SUCCESS) ? 0 : EINVAL; break; case PSPA_EXCEPTION: /* Only allowed when not under vfork */ if (!(psa_flags & POSIX_SPAWN_SETEXEC)) return ENOTSUP; - ret = task_set_exception_ports(task, + ret = (task_set_exception_ports(task, act->mask, port, act->behavior, - act->flavor); + act->flavor) == KERN_SUCCESS) ? 
0 : EINVAL; break; #if CONFIG_AUDIT case PSPA_AU_SESSION: @@ -1430,7 +1496,7 @@ exec_handle_port_actions(struct image_params *imgp, short psa_flags) break; #endif default: - ret = KERN_FAILURE; + ret = EINVAL; } /* action failed, so release port resources */ if (ret) { @@ -1461,7 +1527,7 @@ exec_handle_port_actions(struct image_params *imgp, short psa_flags) * normally permitted to perform. */ static int -exec_handle_file_actions(struct image_params *imgp) +exec_handle_file_actions(struct image_params *imgp, short psa_flags) { int error = 0; int action; @@ -1479,7 +1545,7 @@ exec_handle_file_actions(struct image_params *imgp) * a path argument, which is normally copied in from * user space; because of this, we have to support an * open from kernel space that passes an address space - * context oof UIO_SYSSPACE, and casts the address + * context of UIO_SYSSPACE, and casts the address * argument to a user_addr_t. */ struct vnode_attr va; @@ -1494,7 +1560,7 @@ exec_handle_file_actions(struct image_params *imgp) mode = ((mode &~ p->p_fd->fd_cmask) & ALLPERMS) & ~S_ISTXT; VATTR_SET(&va, va_mode, mode & ACCESSPERMS); - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, UIO_SYSSPACE, + NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_SYSSPACE, CAST_USER_ADDR_T(psfa->psfaa_openargs.psfao_path), imgp->ip_vfs_context); @@ -1506,8 +1572,8 @@ exec_handle_file_actions(struct image_params *imgp) /* * If there's an error, or we get the right fd by - * accident, then drop out here. This is easier that - * rearchitecting all the open code to preallocate fd + * accident, then drop out here. This is easier than + * reworking all the open code to preallocate fd * slots, and internally taking one as an argument. */ if (error || ival[0] == psfa->psfaa_filedes) @@ -1566,16 +1632,68 @@ exec_handle_file_actions(struct image_params *imgp) } break; + case PSFA_INHERIT: { + struct fileproc *fp; + int fd = psfa->psfaa_filedes; + + /* + * Check to see if the descriptor exists, and + * ensure it's -not- marked as close-on-exec. + * [Less code than the equivalent F_GETFD/F_SETFD.] + */ + proc_fdlock(p); + if ((error = fp_lookup(p, fd, &fp, 1)) == 0) { + *fdflags(p, fd) &= ~UF_EXCLOSE; + (void) fp_drop(p, fd, fp, 1); + } + proc_fdunlock(p); + } + break; + default: error = EINVAL; break; } + /* All file actions failures are considered fatal, per POSIX */ + if (error) break; } - return (error); + if (error != 0 || (psa_flags & POSIX_SPAWN_CLOEXEC_DEFAULT) == 0) + return (error); + + /* + * If POSIX_SPAWN_CLOEXEC_DEFAULT is set, behave (during + * this spawn only) as if "close on exec" is the default + * disposition of all pre-existing file descriptors. In this case, + * the list of file descriptors mentioned in the file actions + * are the only ones that can be inherited, so mark them now. + * + * The actual closing part comes later, in fdexec(). 
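Seen from userspace, the two additions above correspond to Libc's POSIX_SPAWN_CLOEXEC_DEFAULT attribute flag and the posix_spawn_file_actions_addinherit_np() file action (the PSFA_INHERIT case). A sketch; the log path and the fd 3 redirection are invented for the example:

#include <spawn.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

extern char **environ;

int
main(void)
{
	posix_spawnattr_t attr;
	posix_spawn_file_actions_t fa;
	pid_t pid;
	int fd = open("/tmp/spawn.log", O_CREAT | O_WRONLY, 0644);
	char *argv[] = { "/bin/sh", "-c", "ls /dev/fd >&3", NULL };

	posix_spawnattr_init(&attr);
	posix_spawnattr_setflags(&attr, POSIX_SPAWN_CLOEXEC_DEFAULT);

	posix_spawn_file_actions_init(&fa);
	/* stdio plus fd 3 survive; every other descriptor is closed.
	 * Per the UF_INHERIT marking pass above, the dup2 target (fd 3)
	 * survives even though it is not named in an inherit action. */
	posix_spawn_file_actions_addinherit_np(&fa, STDIN_FILENO);
	posix_spawn_file_actions_addinherit_np(&fa, STDOUT_FILENO);
	posix_spawn_file_actions_addinherit_np(&fa, STDERR_FILENO);
	posix_spawn_file_actions_adddup2(&fa, fd, 3);

	if (posix_spawn(&pid, argv[0], &fa, &attr, argv, environ) == 0)
		printf("spawned %d\n", pid);
	return 0;
}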
+ */ + proc_fdlock(p); + for (action = 0; action < px_sfap->psfa_act_count; action++) { + _psfa_action_t *psfa = &px_sfap->psfa_act_acts[action]; + int fd = psfa->psfaa_filedes; + + switch (psfa->psfaa_type) { + case PSFA_DUP2: + fd = psfa->psfaa_openargs.psfao_oflag; + /*FALLTHROUGH*/ + case PSFA_OPEN: + case PSFA_INHERIT: + *fdflags(p, fd) |= UF_INHERIT; + break; + + case PSFA_CLOSE: + break; + } + } + proc_fdunlock(p); + + return (0); } @@ -1628,10 +1746,12 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) _posix_spawn_port_actions_t px_spap = NULL; struct __kern_sigaction vec; boolean_t spawn_no_exec = FALSE; + boolean_t proc_transit_set = TRUE; + boolean_t exec_done = FALSE; /* * Allocate a big chunk for locals instead of using stack since these - * structures a pretty big. + * structures are pretty big. */ MALLOC(bufp, char *, (sizeof(*imgp) + sizeof(*vap) + sizeof(*origvap)), M_TEMP, M_WAITOK | M_ZERO); imgp = (struct image_params *) bufp; @@ -1740,7 +1860,7 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) } /* - * If we don't have the extention flag that turns "posix_spawn()" + * If we don't have the extension flag that turns "posix_spawn()" * into "execve() with options", then we will be creating a new * process which does not inherit memory from the parent process, * which is one of the most expensive things about using fork() @@ -1755,7 +1875,7 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) if (spawn_no_exec) p = (proc_t)get_bsdthreadtask_info(imgp->ip_new_thread); - + assert(p != NULL); /* By default, the thread everyone plays with is the parent */ context.vc_thread = current_thread(); @@ -1768,17 +1888,22 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) if (spawn_no_exec) context.vc_thread = imgp->ip_new_thread; - /* * Post fdcopy(), pre exec_handle_sugid() - this is where we want * to handle the file_actions. Since vfork() also ends up setting * us into the parent process group, and saved off the signal flags, * this is also where we want to handle the spawn flags. */ + /* Has spawn file actions? */ - if (imgp->ip_px_sfa != NULL && - (error = exec_handle_file_actions(imgp)) != 0) { - goto bad; + if (imgp->ip_px_sfa != NULL) { + /* + * The POSIX_SPAWN_CLOEXEC_DEFAULT flag + * is handled in exec_handle_file_actions(). + */ + if ((error = exec_handle_file_actions(imgp, + imgp->ip_px_sa != NULL ? px_sa.psa_flags : 0)) != 0) + goto bad; } /* Has spawn port actions? */ @@ -1787,7 +1912,8 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) * The check for the POSIX_SPAWN_SETEXEC flag is done in * exec_handle_port_actions(). */ - if((error = exec_handle_port_actions(imgp, px_sa.psa_flags)) != 0) + if ((error = exec_handle_port_actions(imgp, + imgp->ip_px_sa != NULL ? px_sa.psa_flags : 0)) != 0) goto bad; } @@ -1824,12 +1950,36 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) */ if (px_sa.psa_flags & POSIX_SPAWN_RESETIDS) { kauth_cred_t my_cred = p->p_ucred; - kauth_cred_t my_new_cred = kauth_cred_setuidgid(my_cred, my_cred->cr_ruid, my_cred->cr_rgid); - if (my_new_cred != my_cred) + kauth_cred_t my_new_cred = kauth_cred_setuidgid(my_cred, kauth_cred_getruid(my_cred), kauth_cred_getrgid(my_cred)); + if (my_new_cred != my_cred) { p->p_ucred = my_new_cred; + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); + } } + + /* + * Disable ASLR for the spawned process. 
*/ + if (px_sa.psa_flags & _POSIX_SPAWN_DISABLE_ASLR) + OSBitOrAtomic(P_DISABLE_ASLR, &p->p_flag); + + /* + * Forcibly allow execution from data pages for the spawned process + * even if it would otherwise be disallowed by the architecture default. + */ + if (px_sa.psa_flags & _POSIX_SPAWN_ALLOW_DATA_EXEC) + imgp->ip_flags |= IMGPF_ALLOW_DATA_EXEC; } + /* + * Disable ASLR during image activation. This occurs either if the + * _POSIX_SPAWN_DISABLE_ASLR attribute was found above or if + * P_DISABLE_ASLR was inherited from the parent process. + */ + if (p->p_flag & P_DISABLE_ASLR) + imgp->ip_flags |= IMGPF_DISABLE_ASLR; + /* * Clear transition flag so we won't hang if exec_activate_image() causes * an automount (and launchd does a proc sysctl to service it). @@ -1838,6 +1988,7 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) */ if (spawn_no_exec) { proc_transend(p, 0); + proc_transit_set = 0; } #if MAC_SPAWN /* XXX */ @@ -1853,9 +2004,13 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval) */ error = exec_activate_image(imgp); - /* Image not claimed by any activator? */ - if (error == -1) + if (error == 0) { + /* process completed the exec */ + exec_done = TRUE; + } else if (error == -1) { + /* Image not claimed by any activator? */ error = ENOEXEC; + } /* * If we have a spawn attr, and it contains signal related flags, @@ -1938,6 +2093,9 @@ bad: * before check_for_signature(), which uses psignal. */ if (spawn_no_exec) { + if (proc_transit_set) + proc_transend(p, 0); + /* * Drop the signal lock on the child which was taken on our * behalf by forkproc()/cloneproc() to prevent signals being @@ -2040,8 +2198,10 @@ bad: p->exit_thread = current_thread(); proc_unlock(p); exit1(p, 1, (int *)NULL); - task_deallocate(get_threadtask(imgp->ip_new_thread)); - thread_deallocate(imgp->ip_new_thread); + if (exec_done == FALSE) { + task_deallocate(get_threadtask(imgp->ip_new_thread)); + thread_deallocate(imgp->ip_new_thread); + } } else { /* someone is doing it for us; just skip it */ proc_unlock(p); @@ -2165,7 +2325,7 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval) imgp->ip_vattr = vap; imgp->ip_origvattr = origvap; imgp->ip_vfs_context = &context; - imgp->ip_flags = (is_64 ? IMGPF_WAS_64BIT : IMGPF_NONE); + imgp->ip_flags = (is_64 ? IMGPF_WAS_64BIT : IMGPF_NONE) | ((p->p_flag & P_DISABLE_ASLR) ? IMGPF_DISABLE_ASLR : IMGPF_NONE); imgp->ip_p_comm = alt_p_comm; /* for PowerPC */ imgp->ip_seg = (is_64 ? UIO_USERSPACE64 : UIO_USERSPACE32); @@ -2273,8 +2433,6 @@ copyinptr(user_addr_t froma, user_addr_t *toptr, int ptr_size) * Returns: 0 Success * EFAULT Bad 'ua' * - * Implicit returns: - * *ptr_size Modified */ static int copyoutptr(user_addr_t ua, user_addr_t ptr, int ptr_size) @@ -2311,85 +2469,156 @@ copyoutptr(user_addr_t ua, user_addr_t ptr, int ptr_size) * Note: The strings segment layout is backward, from the beginning * of the top of the stack to consume the minimal amount of * space possible; the returned stack pointer points to the - * end of the area consumed (stacks grow upward). + * end of the area consumed (stacks grow downward).
* * argc is an int; arg[i] are pointers; env[i] are pointers; - * exec_path is a pointer; the 0's are (void *)NULL's + * the 0's are (void *)NULL's * * The stack frame layout is: * - * +-------------+ - * sp-> | argc | - * +-------------+ - * | arg[0] | - * +-------------+ - * : - * : - * +-------------+ - * | arg[argc-1] | - * +-------------+ - * | 0 | - * +-------------+ - * | env[0] | - * +-------------+ - * : - * : - * +-------------+ - * | env[n] | - * +-------------+ - * | 0 | - * +-------------+ - * | exec_path | In MacOS X PR2 Beaker2E the path passed to exec() is - * +-------------+ passed on the stack just after the trailing 0 of the - * | 0 | the envp[] array as a pointer to a string. - * +-------------+ - * | PATH AREA | - * +-------------+ - * | STRING AREA | - * : - * : - * | | <- p->user_stack - * +-------------+ + * +-------------+ <- p->user_stack + * | 16b | + * +-------------+ + * | STRING AREA | + * | : | + * | : | + * | : | + * +- -- -- -- --+ + * | PATH AREA | + * +-------------+ + * | 0 | + * +-------------+ + * | applev[n] | + * +-------------+ + * : + * : + * +-------------+ + * | applev[1] | + * +-------------+ + * | exec_path / | + * | applev[0] | + * +-------------+ + * | 0 | + * +-------------+ + * | env[n] | + * +-------------+ + * : + * : + * +-------------+ + * | env[0] | + * +-------------+ + * | 0 | + * +-------------+ + * | arg[argc-1] | + * +-------------+ + * : + * : + * +-------------+ + * | arg[0] | + * +-------------+ + * | argc | + * sp-> +-------------+ * * Although technically a part of the STRING AREA, we treat the PATH AREA as * a separate entity. This allows us to align the beginning of the PATH AREA * to a pointer boundary so that the exec_path, env[i], and argv[i] pointers * which preceed it on the stack are properly aligned. - * - * TODO: argc copied with suword(), which takes a 64 bit address */ + static int exec_copyout_strings(struct image_params *imgp, user_addr_t *stackp) { proc_t p = vfs_context_proc(imgp->ip_vfs_context); int ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 
8 : 4; - char *argv = imgp->ip_argv; /* modifiable copy of argv */ + int ptr_area_size; + void *ptr_buffer_start, *ptr_buffer; + int string_size; + user_addr_t string_area; /* *argv[], *env[] */ - user_addr_t path_area; /* package launch path */ - user_addr_t ptr_area; /* argv[], env[], exec_path */ + user_addr_t ptr_area; /* argv[], env[], applev[] */ + user_addr_t argc_area; /* argc */ user_addr_t stack; - int stringc = imgp->ip_argc + imgp->ip_envc; - size_t len; int error; - ssize_t strspace; + + unsigned i; + struct copyout_desc { + char *start_string; + int count; +#if CONFIG_DTRACE + user_addr_t *dtrace_cookie; +#endif + boolean_t null_term; + } descriptors[] = { + { + .start_string = imgp->ip_startargv, + .count = imgp->ip_argc, +#if CONFIG_DTRACE + .dtrace_cookie = &p->p_dtrace_argv, +#endif + .null_term = TRUE + }, + { + .start_string = imgp->ip_endargv, + .count = imgp->ip_envc, +#if CONFIG_DTRACE + .dtrace_cookie = &p->p_dtrace_envp, +#endif + .null_term = TRUE + }, + { + .start_string = imgp->ip_strings, + .count = 1, +#if CONFIG_DTRACE + .dtrace_cookie = NULL, +#endif + .null_term = FALSE + }, + { + .start_string = imgp->ip_endenvv, + .count = imgp->ip_applec - 1, /* exec_path handled above */ +#if CONFIG_DTRACE + .dtrace_cookie = NULL, +#endif + .null_term = TRUE + } + }; stack = *stackp; - size_t patharea_len = imgp->ip_argv - imgp->ip_strings; - int envc_add = 0; - /* - * Set up pointers to the beginning of the string area, the beginning - * of the path area, and the beginning of the pointer area (actually, - * the location of argc, an int, which may be smaller than a pointer, - * but we use ptr_size worth of space for it, for alignment). + * All previous contributors to the string area + * should have aligned their sub-area */ - string_area = stack - (((imgp->ip_strendp - imgp->ip_strings) + ptr_size-1) & ~(ptr_size-1)) - ptr_size; - path_area = string_area - ((patharea_len + ptr_size-1) & ~(ptr_size-1)); - ptr_area = path_area - ((imgp->ip_argc + imgp->ip_envc + 4 + envc_add) * ptr_size) - ptr_size /*argc*/; + if (imgp->ip_strspace % ptr_size != 0) { + error = EINVAL; + goto bad; + } - /* Return the initial stack address: the location of argc */ - *stackp = ptr_area; + /* Grow the stack down for the strings we've been building up */ + string_size = imgp->ip_strendp - imgp->ip_strings; + stack -= string_size; + string_area = stack; + + /* + * Need room for one pointer for each string, plus + * one for the NULLs terminating the argv, envv, and apple areas. + */ + ptr_area_size = (imgp->ip_argc + imgp->ip_envc + imgp->ip_applec + 3) * + ptr_size; + stack -= ptr_area_size; + ptr_area = stack; + + /* We'll construct all the pointer arrays in our string buffer, + * which we already know is aligned properly, and ip_argspace + * was used to verify we have enough space. + */ + ptr_buffer_start = ptr_buffer = (void *)imgp->ip_strendp; + + /* + * Need room for pointer-aligned argc slot. + */ + stack -= ptr_size; + argc_area = stack; /* * Record the size of the arguments area so that sysctl_procargs() @@ -2397,92 +2626,73 @@ exec_copyout_strings(struct image_params *imgp, user_addr_t *stackp) */ proc_lock(p); p->p_argc = imgp->ip_argc; - p->p_argslen = (int)(stack - path_area); + p->p_argslen = (int)(*stackp - string_area); proc_unlock(p); + /* Return the initial stack address: the location of argc */ + *stackp = stack; /* - * Support for new app package launching for Mac OS X allocates - * the "path" at the begining of the imgp->ip_strings buffer. 
- * copy it just before the string area. - */ - len = 0; - error = copyoutstr(imgp->ip_strings, path_area, - patharea_len, - &len); + * Copy out the entire strings area. + */ + error = copyout(imgp->ip_strings, string_area, + string_size); if (error) goto bad; - - /* Save a NULL pointer below it */ - (void)copyoutptr(0LL, path_area - ptr_size, ptr_size); - - /* Save the pointer to "path" just below it */ - (void)copyoutptr(path_area, path_area - 2*ptr_size, ptr_size); - - /* - * ptr_size for 2 NULL one each ofter arg[argc -1] and env[n] - * ptr_size for argc - * skip over saved path, ptr_size for pointer to path, - * and ptr_size for the NULL after pointer to path. - */ - - /* argc (int32, stored in a ptr_size area) */ - (void)suword(ptr_area, imgp->ip_argc); - ptr_area += sizeof(int); - /* pad to ptr_size, if 64 bit image, to ensure user stack alignment */ - if (imgp->ip_flags & IMGPF_IS_64BIT) { - (void)suword(ptr_area, 0); /* int, not long: ignored */ - ptr_area += sizeof(int); - } + for (i = 0; i < sizeof(descriptors)/sizeof(descriptors[0]); i++) { + char *cur_string = descriptors[i].start_string; + int j; #if CONFIG_DTRACE - p->p_dtrace_argv = ptr_area; /* user_addr_t &argv[0] for dtrace convenience */ + if (descriptors[i].dtrace_cookie) { + proc_lock(p); + *descriptors[i].dtrace_cookie = ptr_area + ((uintptr_t)ptr_buffer - (uintptr_t)ptr_buffer_start); /* dtrace convenience */ + proc_unlock(p); + } #endif /* CONFIG_DTRACE */ - /* - * We use (string_area - path_area) here rather than the more - * intuitive (imgp->ip_argv - imgp->ip_strings) because we are - * interested in the length of the PATH_AREA in user space, - * rather than the actual length of the execution path, since - * it includes alignment padding of the PATH_AREA + STRING_AREA - * to a ptr_size boundary. - */ - strspace = SIZE_IMG_STRSPACE - (string_area - path_area); - for (;;) { - if (stringc == imgp->ip_envc) { - /* argv[n] = NULL */ - (void)copyoutptr(0LL, ptr_area, ptr_size); - ptr_area += ptr_size; -#if CONFIG_DTRACE - p->p_dtrace_envp = ptr_area; /* user_addr_t &env[0] for dtrace convenience */ -#endif /* CONFIG_DTRACE */ + /* + * For each segment (argv, envv, applev), copy as many pointers as requested + * to our pointer buffer. + */ + for (j = 0; j < descriptors[i].count; j++) { + user_addr_t cur_address = string_area + (cur_string - imgp->ip_strings); + + /* Copy out the pointer to the current string. Alignment has been verified */ + if (ptr_size == 8) { + *(uint64_t *)ptr_buffer = (uint64_t)cur_address; + } else { + *(uint32_t *)ptr_buffer = (uint32_t)cur_address; + } + + ptr_buffer = (void *)((uintptr_t)ptr_buffer + ptr_size); + cur_string += strlen(cur_string) + 1; /* Only a NUL between strings in the same area */ } - if (--stringc < 0) - break; - /* pointer: argv[n]/env[n] */ - (void)copyoutptr(string_area, ptr_area, ptr_size); - - /* string : argv[n][]/env[n][] */ - do { - if (strspace <= 0) { - error = E2BIG; - break; + if (descriptors[i].null_term) { + if (ptr_size == 8) { + *(uint64_t *)ptr_buffer = 0ULL; + } else { + *(uint32_t *)ptr_buffer = 0; } - error = copyoutstr(argv, string_area, - strspace, - &len); - string_area += len; - argv += len; - strspace -= len; - } while (error == ENAMETOOLONG); - if (error == EFAULT || error == E2BIG) - break; /* bad stack - user's problem */ - ptr_area += ptr_size; - } - /* env[n] = NULL */ - (void)copyoutptr(0LL, ptr_area, ptr_size); + + ptr_buffer = (void *)((uintptr_t)ptr_buffer + ptr_size); + } + } + + /* + * Copy out all our pointer arrays in bulk. 
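The descriptors[] walk above replaces the old per-string copyoutstr() loop: the argv, envv and applev pointers are assembled in the already-aligned tail of the kernel string buffer, then pushed to the user stack with one bulk copyout(). The same pass in miniature, with an invented user-space base address:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

int
main(void)
{
	/* argv: "ls", "-l"; envv: "PATH=/bin"; applev: "ls" (exec path) */
	char strings[] = "ls\0-l\0PATH=/bin\0ls";
	struct { int offset; int count; } descriptors[] = {
		{ 0, 2 },	/* argv */
		{ 6, 1 },	/* envv */
		{ 16, 1 },	/* applev */
	};
	uint64_t user_base = 0x7fff5fbff000ULL;	/* pretend string_area */
	uint64_t ptrs[8];
	unsigned i, n = 0;

	for (i = 0; i < sizeof(descriptors)/sizeof(descriptors[0]); i++) {
		char *cur = strings + descriptors[i].offset;
		int j;
		for (j = 0; j < descriptors[i].count; j++) {
			ptrs[n++] = user_base + (uint64_t)(cur - strings);
			cur += strlen(cur) + 1;	/* only a NUL between strings */
		}
		ptrs[n++] = 0;			/* NULL terminator per area */
	}

	/* in the kernel, ptrs[] would now go out in a single copyout() */
	for (i = 0; i < n; i++)
		printf("ptr[%u] = 0x%llx\n", i, (unsigned long long)ptrs[i]);
	return 0;
}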
+ */ + error = copyout(ptr_buffer_start, ptr_area, + ptr_area_size); + if (error) + goto bad; + + /* argc (int32, stored in a ptr_size area) */ + error = copyoutptr((user_addr_t)imgp->ip_argc, argc_area, ptr_size); + if (error) + goto bad; bad: return(error); @@ -2495,6 +2705,11 @@ bad: * * Copy arguments and environment from user space into work area; we may * have already copied some early arguments into the work area, and if * so, any arguments copied in are appended to those already there. + * This function is the primary manipulator of ip_argspace, since + * these are the arguments the client of execve(2) knows about. After + * each argv[]/envv[] string is copied, we charge the string length + * and argv[]/envv[] pointer slot to ip_argspace, so that we can + * fully preflight the arg list size. * * Parameters: struct image_params * the image parameter block * @@ -2504,6 +2719,8 @@ bad: * Implicit returns; * (imgp->ip_argc) Count of arguments, updated * (imgp->ip_envc) Count of environment strings, updated + * (imgp->ip_argspace) Count of remaining NCARGS space + * (imgp->ip_interp_buffer) Interpreter and args (mutated in place) * * * Note: The argument and environment vectors are user space pointers @@ -2513,47 +2730,101 @@ static int exec_extract_strings(struct image_params *imgp) { int error = 0; - int strsz = 0; int ptr_size = (imgp->ip_flags & IMGPF_WAS_64BIT) ? 8 : 4; + int new_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 8 : 4; user_addr_t argv = imgp->ip_user_argv; user_addr_t envv = imgp->ip_user_envv; - /* - * If the argument vector is NULL, this is the system startup - * bootstrap from load_init_program(), and there's nothing to do - */ - if (imgp->ip_user_argv == 0LL) - goto bad; - - /* Now, get rest of arguments */ - /* * Adjust space reserved for the path name by however much padding it * needs. Doing this here since we didn't know if this would be a 32- * or 64-bit process back in exec_save_path. */ - strsz = strlen(imgp->ip_strings) + 1; - imgp->ip_strspace -= ((strsz + ptr_size-1) & ~(ptr_size-1)) - strsz; + while (imgp->ip_strspace % new_ptr_size != 0) { + *imgp->ip_strendp++ = '\0'; + imgp->ip_strspace--; + /* imgp->ip_argspace--; not counted towards exec args total */ + } /* - * If we are running an interpreter, replace the av[0] that was - * passed to execve() with the fully qualified path name that was - * passed to execve() for interpreters which do not use the PATH - * to locate their script arguments. + * From now on, we start attributing string space to ip_argspace */ - if((imgp->ip_flags & IMGPF_INTERPRET) != 0 && argv != 0LL) { + imgp->ip_startargv = imgp->ip_strendp; + imgp->ip_argc = 0; + + if((imgp->ip_flags & IMGPF_INTERPRET) != 0) { user_addr_t arg; + char *argstart, *ch; + + /* First, the arguments in the "#!" string are tokenized and extracted. */ + argstart = imgp->ip_interp_buffer; + while (argstart) { + ch = argstart; + while (*ch && !IS_WHITESPACE(*ch)) { + ch++; + } - error = copyinptr(argv, &arg, ptr_size); - if (error) - goto bad; - if (arg != 0LL && arg != (user_addr_t)-1) { - argv += ptr_size; - error = exec_add_string(imgp, imgp->ip_user_fname); + if (*ch == '\0') { + /* last argument, no need to NUL-terminate */ + error = exec_add_user_string(imgp, CAST_USER_ADDR_T(argstart), UIO_SYSSPACE, TRUE); + argstart = NULL; + } else { + /* NUL-terminate */ + *ch = '\0'; + error = exec_add_user_string(imgp, CAST_USER_ADDR_T(argstart), UIO_SYSSPACE, TRUE); + + /* + * Find the next string.
We know spaces at the end of the string have already + * been stripped. + */ + argstart = ch + 1; + while (IS_WHITESPACE(*argstart)) { + argstart++; + } + } + + /* Error-check, regardless of whether this is the last interpreter arg or not */ if (error) goto bad; + if (imgp->ip_argspace < new_ptr_size) { + error = E2BIG; + goto bad; + } + imgp->ip_argspace -= new_ptr_size; /* to hold argv[] entry */ imgp->ip_argc++; } + + if (argv != 0LL) { + /* + * If we are running an interpreter, replace the av[0] that was + * passed to execve() with the path name that was + * passed to execve() for interpreters which do not use the PATH + * to locate their script arguments. + */ + error = copyinptr(argv, &arg, ptr_size); + if (error) + goto bad; + if (arg != 0LL) { + argv += ptr_size; /* consume without using */ + } + } + + if (imgp->ip_interp_sugid_fd != -1) { + char temp[19]; /* "/dev/fd/" + 10 digits + NUL */ + snprintf(temp, sizeof(temp), "/dev/fd/%d", imgp->ip_interp_sugid_fd); + error = exec_add_user_string(imgp, CAST_USER_ADDR_T(temp), UIO_SYSSPACE, TRUE); + } else { + error = exec_add_user_string(imgp, imgp->ip_user_fname, imgp->ip_seg, TRUE); + } + + if (error) + goto bad; + if (imgp->ip_argspace < new_ptr_size) { + error = E2BIG; + goto bad; + } + imgp->ip_argspace -= new_ptr_size; /* to hold argv[] entry */ + imgp->ip_argc++; } while (argv != 0LL) { @@ -2563,25 +2834,36 @@ exec_extract_strings(struct image_params *imgp) if (error) goto bad; - argv += ptr_size; if (arg == 0LL) { break; - } else if (arg == (user_addr_t)-1) { - /* Um... why would it be -1? */ - error = EFAULT; - goto bad; } + + argv += ptr_size; + /* * av[n...] = arg[n] */ - error = exec_add_string(imgp, arg); + error = exec_add_user_string(imgp, arg, imgp->ip_seg, TRUE); if (error) goto bad; + if (imgp->ip_argspace < new_ptr_size) { + error = E2BIG; + goto bad; + } + imgp->ip_argspace -= new_ptr_size; /* to hold argv[] entry */ imgp->ip_argc++; } + + /* Save space for argv[] NULL terminator */ + if (imgp->ip_argspace < new_ptr_size) { + error = E2BIG; + goto bad; + } + imgp->ip_argspace -= new_ptr_size; - /* Note where the args end and env begins. */ - imgp->ip_strendargvp = imgp->ip_strendp; + /* Note where the args end and env begins. */ + imgp->ip_endargv = imgp->ip_strendp; + imgp->ip_envc = 0; /* Now, get the environment */ while (envv != 0LL) { @@ -2594,29 +2876,165 @@ exec_extract_strings(struct image_params *imgp) envv += ptr_size; if (env == 0LL) { break; - } else if (env == (user_addr_t)-1) { - error = EFAULT; - goto bad; } /* * av[n...] = env[n] */ - error = exec_add_string(imgp, env); + error = exec_add_user_string(imgp, env, imgp->ip_seg, TRUE); if (error) goto bad; + if (imgp->ip_argspace < new_ptr_size) { + error = E2BIG; + goto bad; + } + imgp->ip_argspace -= new_ptr_size; /* to hold envv[] entry */ imgp->ip_envc++; } + + /* Save space for envv[] NULL terminator */ + if (imgp->ip_argspace < new_ptr_size) { + error = E2BIG; + goto bad; + } + imgp->ip_argspace -= new_ptr_size; + + /* Align the tail of the combined argv+envv area */ + while (imgp->ip_strspace % new_ptr_size != 0) { + if (imgp->ip_argspace < 1) { + error = E2BIG; + goto bad; + } + *imgp->ip_strendp++ = '\0'; + imgp->ip_strspace--; + imgp->ip_argspace--; + } + + /* Note where the envv ends and applev begins. */ + imgp->ip_endenvv = imgp->ip_strendp; + + /* + * From now on, we are no longer charging argument + * space to ip_argspace.
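The interpreter handling above tokenizes ip_interp_buffer in place, NUL-terminating each "#!" argument before charging it to ip_argspace through exec_add_user_string(). The tokenizer extracted as a standalone test:

#include <stdio.h>

#define IS_WHITESPACE(ch) ((ch == ' ') || (ch == '\t'))

int
main(void)
{
	char interp[] = "/usr/bin/env -i python";
	char *argstart = interp;
	char *ch;
	int argc = 0;

	while (argstart) {
		ch = argstart;
		while (*ch && !IS_WHITESPACE(*ch))
			ch++;

		if (*ch == '\0') {
			/* last argument: already NUL-terminated */
			printf("argv[%d] = \"%s\"\n", argc++, argstart);
			argstart = NULL;
		} else {
			*ch = '\0';	/* NUL-terminate this token */
			printf("argv[%d] = \"%s\"\n", argc++, argstart);
			/* find the next token; trailing blanks were stripped earlier */
			argstart = ch + 1;
			while (IS_WHITESPACE(*argstart))
				argstart++;
		}
	}
	return 0;
}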
+ */ + bad: return error; } +static char * +random_hex_str(char *str, int len) +{ + uint64_t low, high, value; + int idx; + char digit; + + /* A 64-bit value will only take 16 characters, plus '0x' and NULL. */ + if (len > 19) + len = 19; + + /* We need enough room for at least 1 digit */ + if (len < 4) + return (NULL); + + low = random(); + high = random(); + value = high << 32 | low; + + str[0] = '0'; + str[1] = 'x'; + for (idx = 2; idx < len - 1; idx++) { + digit = value & 0xf; + value = value >> 4; + if (digit < 10) + str[idx] = '0' + digit; + else + str[idx] = 'a' + (digit - 10); + } + str[idx] = '\0'; + return (str); +} + +/* + * Libc has an 8-element array set up for stack guard values. It only fills + * in one of those entries, and both gcc and llvm seem to use only a single + * 8-byte guard. Until somebody needs more than an 8-byte guard value, don't + * do the work to construct them. + */ +#define GUARD_VALUES 1 +#define GUARD_KEY "stack_guard=" + +/* + * System malloc needs some entropy when it is initialized. + */ +#define ENTROPY_VALUES 2 +#define ENTROPY_KEY "malloc_entropy=" + +/* + * Build up the contents of the apple[] string vector + */ +static int +exec_add_apple_strings(struct image_params *imgp) +{ + int i, error; + int new_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 8 : 4; + char guard[19]; + char guard_vec[strlen(GUARD_KEY) + 19 * GUARD_VALUES + 1]; + + char entropy[19]; + char entropy_vec[strlen(ENTROPY_KEY) + 19 * ENTROPY_VALUES + 1]; + + /* exec_save_path stored the first string */ + imgp->ip_applec = 1; + + /* + * Supply libc with a collection of random values to use when + * implementing -fstack-protector. + */ + (void)strlcpy(guard_vec, GUARD_KEY, sizeof (guard_vec)); + for (i = 0; i < GUARD_VALUES; i++) { + random_hex_str(guard, sizeof (guard)); + if (i) + (void)strlcat(guard_vec, ",", sizeof (guard_vec)); + (void)strlcat(guard_vec, guard, sizeof (guard_vec)); + } + + error = exec_add_user_string(imgp, CAST_USER_ADDR_T(guard_vec), UIO_SYSSPACE, FALSE); + if (error) + goto bad; + imgp->ip_applec++; + + /* + * Supply libc with entropy for system malloc. + */ + (void)strlcpy(entropy_vec, ENTROPY_KEY, sizeof(entropy_vec)); + for (i = 0; i < ENTROPY_VALUES; i++) { + random_hex_str(entropy, sizeof (entropy)); + if (i) + (void)strlcat(entropy_vec, ",", sizeof (entropy_vec)); + (void)strlcat(entropy_vec, entropy, sizeof (entropy_vec)); + } + + error = exec_add_user_string(imgp, CAST_USER_ADDR_T(entropy_vec), UIO_SYSSPACE, FALSE); + if (error) + goto bad; + imgp->ip_applec++; + + /* Align the tail of the combined applev area */ + while (imgp->ip_strspace % new_ptr_size != 0) { + *imgp->ip_strendp++ = '\0'; + imgp->ip_strspace--; + } + +bad: + return error; +} #define unix_stack_size(p) (p->p_rlimit[RLIMIT_STACK].rlim_cur) /* * exec_check_permissions * - * Decription: Verify that the file that is being attempted to be executed + * Description: Verify that the file that is being attempted to be executed * is in fact allowed to be executed based on it POSIX file * permissions and other access control criteria * @@ -2658,7 +3076,7 @@ exec_check_permissions(struct image_params *imgp) * will always succeed, and we don't want to happen unless the * file really is executable. 
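The applev[] vector assembled by exec_add_apple_strings() reaches userspace as a fourth vector after envp, where Libc consumes the stack_guard= and malloc_entropy= entries before main() runs. The four-argument main() is an Apple extension, used here only to make the vector visible:

#include <stdio.h>

int
main(int argc, char **argv, char **envp, char **apple)
{
	int i;

	(void)argc; (void)argv; (void)envp;
	/* prints the exec path, then entries such as stack_guard=0x...
	 * and malloc_entropy=0x...,0x... */
	for (i = 0; apple[i] != NULL; i++)
		printf("apple[%d] = %s\n", i, apple[i]);
	return 0;
}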
*/ - if ((vap->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) + if (!vfs_authopaque(vnode_mount(vp)) && ((vap->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)) return (EACCES); /* Disallow zero length files */ @@ -2705,7 +3123,7 @@ exec_check_permissions(struct image_params *imgp) * cached values, then we set the PowerPC environment flag. */ if (vap->va_fsid == exec_archhandler_ppc.fsid && - vap->va_fileid == (uint64_t)((uint32_t)exec_archhandler_ppc.fileid)) { + vap->va_fileid == exec_archhandler_ppc.fileid) { imgp->ip_flags |= IMGPF_POWERPC; } #endif /* IMGPF_POWERPC */ @@ -2790,7 +3208,7 @@ exec_handle_sugid(struct image_params *imgp) kauth_cred_getuid(cred) != imgp->ip_origvattr->va_uid) || ((imgp->ip_origvattr->va_mode & VSGID) != 0 && ((kauth_cred_ismember_gid(cred, imgp->ip_origvattr->va_gid, &leave_sugid_clear) || !leave_sugid_clear) || - (cred->cr_gid != imgp->ip_origvattr->va_gid)))) { + (kauth_cred_getgid(cred) != imgp->ip_origvattr->va_gid)))) { #if CONFIG_MACF /* label for MAC transition and neither VSUID nor VSGID */ @@ -2815,9 +3233,13 @@ handle_mac_transition: */ if (imgp->ip_origvattr->va_mode & VSUID) { p->p_ucred = kauth_cred_setresuid(p->p_ucred, KAUTH_UID_NONE, imgp->ip_origvattr->va_uid, imgp->ip_origvattr->va_uid, KAUTH_UID_NONE); + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); } if (imgp->ip_origvattr->va_mode & VSGID) { p->p_ucred = kauth_cred_setresgid(p->p_ucred, KAUTH_GID_NONE, imgp->ip_origvattr->va_gid, imgp->ip_origvattr->va_gid); + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); } #if CONFIG_MACF @@ -2878,7 +3300,7 @@ handle_mac_transition: if (dev_null == NULLVP) { struct nameidata nd1; - NDINIT(&nd1, LOOKUP, FOLLOW, UIO_SYSSPACE, + NDINIT(&nd1, LOOKUP, OP_OPEN, FOLLOW, UIO_SYSSPACE, CAST_USER_ADDR_T("/dev/null"), imgp->ip_vfs_context); @@ -2893,9 +3315,8 @@ handle_mac_transition: } } - /* Radar 2261856; setuid security hole fix */ - /* Patch from OpenBSD: A. Ramesh */ /* + * Radar 2261856; setuid security hole fix * XXX For setuid processes, attempt to ensure that * stdin, stdout, and stderr are already allocated. * We do not want userland to accidentally allocate @@ -2913,7 +3334,7 @@ handle_mac_transition: if ((error = falloc(p, &fp, &indx, imgp->ip_vfs_context)) != 0) continue; - if ((error = vnode_ref_ext(dev_null, FREAD)) != 0) { + if ((error = vnode_ref_ext(dev_null, FREAD, 0)) != 0) { fp_free(p, indx, fp); break; } @@ -2958,7 +3379,9 @@ handle_mac_transition: * Implement the semantic where the effective user and group become * the saved user and group in exec'ed programs. 
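The Radar 2261856 logic above guarantees that a set[ug]id image starts with fds 0-2 allocated (on /dev/null if nothing else), so a privileged program cannot accidentally open() a sensitive file onto its own stderr. Userland daemons apply the same defensive pattern at startup; a sketch:

#include <fcntl.h>
#include <unistd.h>

static void
ensure_stdio(void)
{
	int fd;

	for (fd = 0; fd < 3; fd++) {
		/* F_GETFD fails with EBADF only if the slot is free */
		if (fcntl(fd, F_GETFD) == -1) {
			int nfd = open("/dev/null", fd == 0 ? O_RDONLY : O_WRONLY);
			if (nfd >= 0 && nfd != fd) {
				dup2(nfd, fd);
				close(nfd);
			}
		}
	}
}

int
main(void)
{
	ensure_stdio();
	return 0;
}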
*/ - p->p_ucred = kauth_cred_setsvuidgid(p->p_ucred, kauth_cred_getuid(p->p_ucred), p->p_ucred->cr_gid); + p->p_ucred = kauth_cred_setsvuidgid(p->p_ucred, kauth_cred_getuid(p->p_ucred), kauth_cred_getgid(p->p_ucred)); + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); /* Update the process' identity version and set the security token */ p->p_idversion++; @@ -3131,7 +3554,7 @@ load_init_program(proc_t p) error = execve(p,&init_exec_args,retval); if (error) - panic("Process 1 exec of %s failed, errno %d\n", + panic("Process 1 exec of %s failed, errno %d", init_program_name, error); } @@ -3188,8 +3611,6 @@ load_return_to_errno(load_return_t lrtn) #include <kern/clock.h> #include <mach/kern_return.h> -extern semaphore_t execve_semaphore; - /* * execargs_alloc * @@ -3244,7 +3665,7 @@ execargs_lock_sleep(void) { static kern_return_t execargs_purgeable_allocate(char **execarg_address) { - kern_return_t kr = vm_allocate(bsd_pageable_map, (vm_offset_t *)execarg_address, NCARGS + PAGE_SIZE, VM_FLAGS_ANYWHERE | VM_FLAGS_PURGABLE); + kern_return_t kr = vm_allocate(bsd_pageable_map, (vm_offset_t *)execarg_address, BSD_PAGEABLE_SIZE_PER_EXEC, VM_FLAGS_ANYWHERE | VM_FLAGS_PURGABLE); assert(kr == KERN_SUCCESS); return kr; } @@ -3315,7 +3736,11 @@ execargs_alloc(struct image_params *imgp) return (ENOMEM); } - imgp->ip_vdata = imgp->ip_strings + NCARGS; + /* last page used to read in file headers */ + imgp->ip_vdata = imgp->ip_strings + ( NCARGS + PAGE_SIZE ); + imgp->ip_strendp = imgp->ip_strings; + imgp->ip_argspace = NCARGS; + imgp->ip_strspace = ( NCARGS + PAGE_SIZE ); return (0); } @@ -3404,8 +3829,11 @@ exec_resettextvp(proc_t p, struct image_params *imgp) static int check_for_signature(proc_t p, struct image_params *imgp) { + void *blob = NULL; + size_t length = 0; mach_port_t port = NULL; - kern_return_t error = 0; + kern_return_t kr = KERN_FAILURE; + int error = EACCES; unsigned char hash[SHA1_RESULTLEN]; /* @@ -3422,35 +3850,56 @@ check_for_signature(proc_t p, struct image_params *imgp) vm_map_switch_protect(get_task_map(p->task), TRUE); } - /* - * If the task_access_port is set and the proc isn't signed, - * ask for a code signature from user space. Fail the exec - * if permission is denied. - */ - error = task_get_task_access_port(p->task, &port); - if (error == 0 && IPC_PORT_VALID(port) && !(p->p_csflags & CS_VALID)) { - error = find_code_signature(port, p->p_pid); - if (error == KERN_FAILURE) { - /* Make very sure execution fails */ - psignal(p, SIGKILL); - return EACCES; - } + /* If the process is not signed or if it contains + * entitlements, we need to communicate through the + * task_access_port to taskgated. taskgated will provide a + * detached code signature if present, and will enforce any + * restrictions on entitlements. taskgated returns + * KERN_SUCCESS if it has completed its work and the exec + * should continue, or KERN_FAILURE if the exec should fail. + */ + error = cs_entitlements_blob_get(p, &blob, &length); - /* Only do this if exec_resettextvp() did not fail */ - if (p->p_textvp != NULLVP) { - /* - * If there's a new code directory, mark this process - * as signed. 
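The CS_VALID bit set here is visible from userspace through csops(2); a sketch follows. The explicit prototype is an assumption, since the syscall wrapper is not declared in every SDK's headers.

#include <sys/codesign.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

extern int csops(pid_t pid, unsigned int ops, void *useraddr, size_t usersize);

int
main(void)
{
	uint32_t status = 0;

	if (csops(getpid(), CS_OPS_STATUS, &status, sizeof (status)) != 0) {
		perror("csops");
		return (1);
	}
	printf("CS_VALID is %s\n", (status & CS_VALID) ? "set" : "clear");
	return (0);
}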
- */ - error = ubc_cs_getcdhash(p->p_textvp, p->p_textoff, hash); - if (error == 0) { - proc_lock(p); - p->p_csflags |= CS_VALID; - proc_unlock(p); - } + /* if signed and no entitlements, then we're done here */ + if ((p->p_csflags & CS_VALID) && NULL == blob) { + error = 0; + goto done; + } + + kr = task_get_task_access_port(p->task, &port); + if (KERN_SUCCESS != kr || !IPC_PORT_VALID(port)) { + error = 0; +#if !CONFIG_EMBEDDED + /* fatal on the desktop when entitlements are present */ + if (NULL != blob) + error = EACCES; +#endif + goto done; + } + + kr = find_code_signature(port, p->p_pid); + if (KERN_SUCCESS != kr) { + error = EACCES; + goto done; + } + + /* Only do this if exec_resettextvp() did not fail */ + if (p->p_textvp != NULLVP) { + /* + * If there's a new code directory, mark this process + * as signed. + */ + if (0 == ubc_cs_getcdhash(p->p_textvp, p->p_textoff, hash)) { + proc_lock(p); + p->p_csflags |= CS_VALID; + proc_unlock(p); } } - return KERN_SUCCESS; +done: + if (0 != error) + /* make very sure execution fails */ + psignal(p, SIGKILL); + return error; } diff --git a/bsd/kern/kern_exit.c b/bsd/kern/kern_exit.c index 811b4fb7d..7d5ddd37b 100644 --- a/bsd/kern/kern_exit.c +++ b/bsd/kern/kern_exit.c @@ -158,7 +158,6 @@ int *get_bsduthreadrval(thread_t); kern_return_t sys_perf_notify(thread_t thread, int pid); kern_return_t abnormal_exit_notify(mach_exception_data_type_t code, mach_exception_data_type_t subcode); -void workqueue_exit(struct proc *); void delay(int); /* @@ -256,8 +255,10 @@ exit1(proc_t p, int rv, int *retval) DTRACE_PROC1(exit, int, CLD_EXITED); proc_lock(p); + proc_transstart(p, 1); while (p->exit_thread != self) { if (sig_try_locked(p) <= 0) { + proc_transend(p, 1); if (get_threadtask(self) != task) { proc_unlock(p); return(0); @@ -283,11 +284,12 @@ exit1(proc_t p, int rv, int *retval) p->p_lflag |= P_LEXIT; p->p_xstat = rv; + proc_transend(p, 1); proc_unlock(p); proc_prepareexit(p, rv); - /* task terminate will call proc_terminate and that cleans it up */ + /* Last thread to terminate will call proc_exit() */ task_terminate_internal(task); return(0); @@ -372,21 +374,39 @@ proc_exit(proc_t p) pid_t pid; int exitval; - /* This can happen if thread_terminate of the single thread - * process - */ - uth = (struct uthread *)get_bsdthread_info(current_thread()); proc_lock(p); + proc_transstart(p, 1); if( !(p->p_lflag & P_LEXIT)) { + /* + * This can happen if a thread_terminate() occurs + * in a single-threaded process. + */ p->p_lflag |= P_LEXIT; + proc_transend(p, 1); proc_unlock(p); proc_prepareexit(p, 0); + (void) task_terminate_internal(task); proc_lock(p); + } else { + proc_transend(p, 1); } p->p_lflag |= P_LPEXIT; + + /* + * Other kernel threads may be in the middle of signalling this process. + * Wait for those threads to wrap it up before making the process + * disappear on them. + */ + if ((p->p_lflag & P_LINSIGNAL) || (p->p_sigwaitcnt > 0)) { + p->p_sigwaitcnt++; + while ((p->p_lflag & P_LINSIGNAL) || (p->p_sigwaitcnt > 1)) + msleep(&p->p_sigmask, &p->p_mlock, PWAIT, "proc_sigdrain", NULL); + p->p_sigwaitcnt--; + } + proc_unlock(p); pid = p->p_pid; exitval = p->p_xstat; @@ -429,6 +449,8 @@ proc_exit(proc_t p) MALLOC_ZONE(p->p_ru, struct rusage *, sizeof (*p->p_ru), M_ZOMBIE, M_WAITOK); + nspace_proc_exit(p); + /* * need to cancel async IO requests that can be cancelled and wait for those * already active. MAY BLOCK! @@ -575,7 +597,7 @@ proc_exit(proc_t p) * if the reap is already in progress. 
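The signal drain added above is the usual "in-flight flag plus waiter count" idiom. A generic pthreads restatement of the shape only (a sketch; the kernel version sleeps via msleep on p_sigmask under the proc mutex):

#include <pthread.h>

struct sigdrain {
	pthread_mutex_t	mtx;
	pthread_cond_t	cv;		/* broadcast when in_signal clears */
	int		in_signal;	/* P_LINSIGNAL analogue */
	int		waiters;	/* p_sigwaitcnt analogue */
};

/*
 * Block until no thread is mid-signal and every earlier waiter has
 * drained, mirroring the loop in proc_exit().
 */
static void
sigdrain_wait(struct sigdrain *d)
{
	pthread_mutex_lock(&d->mtx);
	if (d->in_signal || d->waiters > 0) {
		d->waiters++;
		while (d->in_signal || d->waiters > 1)
			pthread_cond_wait(&d->cv, &d->mtx);
		d->waiters--;
	}
	pthread_mutex_unlock(&d->mtx);
}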
So we get * the reference here exclusively and their can be * no waiters. So there is no need for a wakeup - * after we are done. AlsO the reap frees the structure + * after we are done. Also the reap frees the structure * and the proc struct cannot be used for wakeups as well. * It is safe to use q here as this is system reap */ @@ -587,10 +609,21 @@ proc_exit(proc_t p) * since their existence means someone is messing up. */ if (q->p_lflag & P_LTRACED) { + /* + * Take a reference on the child process to + * ensure it doesn't exit and disappear between + * the time we drop the list_lock and attempt + * to acquire its proc_lock. + */ + if (proc_ref_locked(q) != q) + continue; + proc_list_unlock(); proc_lock(q); q->p_lflag &= ~P_LTRACED; if (q->sigwait_thread) { + thread_t thread = q->sigwait_thread; + proc_unlock(q); /* * The sigwait_thread could be stopped at a @@ -599,13 +632,16 @@ proc_exit(proc_t p) * the first thread in the task. So any attempts to kill * the process would result into a deadlock on q->sigwait. */ - thread_resume((thread_t)q->sigwait_thread); - clear_wait(q->sigwait_thread, THREAD_INTERRUPTED); - threadsignal((thread_t)q->sigwait_thread, SIGKILL, 0); - } else + thread_resume(thread); + clear_wait(thread, THREAD_INTERRUPTED); + threadsignal(thread, SIGKILL, 0); + } else { proc_unlock(q); + } + psignal(q, SIGKILL); proc_list_lock(); + proc_rele_locked(q); } } } @@ -629,10 +665,9 @@ proc_exit(proc_t p) */ /* No need for locking here as no one than this thread can access this */ if (p->p_ru != NULL) { + calcru(p, &p->p_stats->p_ru.ru_utime, &p->p_stats->p_ru.ru_stime, NULL); *p->p_ru = p->p_stats->p_ru; - calcru(p, &p->p_ru->ru_utime, &p->p_ru->ru_stime, NULL); - ruadd(p->p_ru, &p->p_stats->p_cru); } @@ -689,7 +724,8 @@ proc_exit(proc_t p) p->task = TASK_NULL; set_bsdtask_info(task, NULL); - proc_knote(p, NOTE_EXIT); + /* exit status will be seen by parent process */ + proc_knote(p, NOTE_EXIT | (p->p_xstat & 0xffff)); /* mark the thread as the one that is doing proc_exit * no need to hold proc lock in uthread_free @@ -737,7 +773,7 @@ proc_exit(proc_t p) * p_ucred usage is safe as it is an exiting process * and reference is dropped in reap */ - pp->si_uid = p->p_ucred->cr_ruid; + pp->si_uid = kauth_cred_getruid(p->p_ucred); proc_unlock(pp); } /* mark as a zombie */ @@ -855,7 +891,7 @@ reap_child_locked(proc_t parent, proc_t child, int deadparent, int locked, int d trace_parent->si_pid = child->p_pid; trace_parent->si_status = child->p_xstat; trace_parent->si_code = CLD_CONTINUED; - trace_parent->si_uid = child->p_ucred->cr_ruid; + trace_parent->si_uid = kauth_cred_getruid(child->p_ucred); proc_unlock(trace_parent); } proc_reparentlocked(child, trace_parent, 1, 0); @@ -899,7 +935,7 @@ reap_child_locked(proc_t parent, proc_t child, int deadparent, int locked, int d printf("Warning : lost p_ru for %s\n", child->p_comm); } - AUDIT_SESSION_PROCEXIT(child->p_ucred); + AUDIT_SESSION_PROCEXIT(child); /* * Decrement the count of procs running with this uid. 
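Because the exit status is now folded into the NOTE_EXIT hint, a kqueue watcher can recover the wait(2)-style status without reaping the child. A sketch follows; requesting NOTE_EXITSTATUS and reading the status back from kev.data reflects my understanding of the delivery path on this release and should be treated as an assumption.

#include <sys/event.h>
#include <stdio.h>
#include <unistd.h>

static void
watch_exit(pid_t pid)
{
	int kq = kqueue();
	struct kevent kev;

	EV_SET(&kev, pid, EVFILT_PROC, EV_ADD | EV_ONESHOT,
	    NOTE_EXIT | NOTE_EXITSTATUS, 0, NULL);
	(void)kevent(kq, &kev, 1, NULL, 0, NULL);	/* register */
	(void)kevent(kq, NULL, 0, &kev, 1, NULL);	/* block until exit */

	/* the low 16 bits mirror p_xstat per the proc_knote() change */
	printf("pid %lu exited, status 0x%04x\n",
	    (unsigned long)kev.ident, (unsigned)(kev.data & 0xffff));
	close(kq);
}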
@@ -907,7 +943,7 @@ reap_child_locked(proc_t parent, proc_t child, int deadparent, int locked, int d * and refernce is dropped after these calls down below * (locking protection is provided by list lock held in chgproccnt) */ - (void)chgproccnt(child->p_ucred->cr_ruid, -1); + (void)chgproccnt(kauth_cred_getruid(child->p_ucred), -1); #if CONFIG_LCTX ALLLCTX_LOCK; @@ -948,22 +984,21 @@ reap_child_locked(proc_t parent, proc_t child, int deadparent, int locked, int d proc_list_unlock(); -#ifdef CONFIG_EMBEDDED - lck_mtx_destroy(&child->p_mlock, proc_lck_grp); - lck_mtx_destroy(&child->p_fdmlock, proc_lck_grp); -#if CONFIG_DTRACE - lck_mtx_destroy(&child->p_dtrace_sprlock, proc_lck_grp); -#endif - lck_spin_destroy(&child->p_slock, proc_lck_grp); - -#else +#if CONFIG_FINE_LOCK_GROUPS lck_mtx_destroy(&child->p_mlock, proc_mlock_grp); lck_mtx_destroy(&child->p_fdmlock, proc_fdmlock_grp); #if CONFIG_DTRACE lck_mtx_destroy(&child->p_dtrace_sprlock, proc_lck_grp); #endif lck_spin_destroy(&child->p_slock, proc_slock_grp); +#else /* CONFIG_FINE_LOCK_GROUPS */ + lck_mtx_destroy(&child->p_mlock, proc_lck_grp); + lck_mtx_destroy(&child->p_fdmlock, proc_lck_grp); +#if CONFIG_DTRACE + lck_mtx_destroy(&child->p_dtrace_sprlock, proc_lck_grp); #endif + lck_spin_destroy(&child->p_slock, proc_lck_grp); +#endif /* CONFIG_FINE_LOCK_GROUPS */ workqueue_destroy_lock(child); FREE_ZONE(child, sizeof *child, M_PROC); @@ -1754,6 +1789,8 @@ vproc_exit(proc_t p) proc_lock(q); q->p_lflag &= ~P_LTRACED; if (q->sigwait_thread) { + thread_t thread = q->sigwait_thread; + proc_unlock(q); /* * The sigwait_thread could be stopped at a @@ -1762,12 +1799,13 @@ vproc_exit(proc_t p) * the first thread in the task. So any attempts to kill * the process would result into a deadlock on q->sigwait. */ - thread_resume((thread_t)q->sigwait_thread); - clear_wait(q->sigwait_thread, THREAD_INTERRUPTED); - threadsignal((thread_t)q->sigwait_thread, SIGKILL, 0); - } else + thread_resume(thread); + clear_wait(thread, THREAD_INTERRUPTED); + threadsignal(thread, SIGKILL, 0); + } else { proc_unlock(q); - + } + psignal(q, SIGKILL); proc_list_lock(); } @@ -1844,6 +1882,10 @@ vproc_exit(proc_t p) } } +#if PSYNCH + pth_proc_hashdelete(p); +#endif /* PSYNCH */ + /* * Other substructures are freed from wait(). */ @@ -1877,7 +1919,7 @@ vproc_exit(proc_t p) * p_ucred usage is safe as it is an exiting process * and reference is dropped in reap */ - pp->si_uid = p->p_ucred->cr_ruid; + pp->si_uid = kauth_cred_getruid(p->p_ucred); proc_unlock(pp); } /* mark as a zombie */ diff --git a/bsd/kern/kern_fork.c b/bsd/kern/kern_fork.c index a5b1350d3..7746398bf 100644 --- a/bsd/kern/kern_fork.c +++ b/bsd/kern/kern_fork.c @@ -129,7 +129,6 @@ extern void dtrace_lazy_dofs_duplicate(proc_t, proc_t); #include <sys/sdt.h> - /* XXX routines which should have Mach prototypes, but don't */ void thread_set_parent(thread_t parent, int pid); extern void act_thread_catt(void *ctx); @@ -365,7 +364,7 @@ fork1(proc_t parent_proc, thread_t *child_threadp, int kind) * exceed the limit. The variable nprocs is the current number of * processes, maxproc is the limit. 
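Restated as a stand-alone predicate (a sketch derived directly from the fork1() check that follows): the last table slot is reserved so root can still fork after ordinary users are refused.

#include <sys/types.h>

static int
fork_refused(int nprocs, int maxproc, uid_t uid)
{
	if (uid != 0 && nprocs >= maxproc - 1)
		return (1);	/* final slot reserved for root */
	return (nprocs >= maxproc);
}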
*/ - uid = kauth_cred_get()->cr_ruid; + uid = kauth_getruid(); proc_list_lock(); if ((nprocs >= maxproc - 1 && uid != 0) || nprocs >= maxproc) { proc_list_unlock(); @@ -466,7 +465,6 @@ fork1(proc_t parent_proc, thread_t *child_threadp, int kind) AUDIT_ARG(pid, child_proc->p_pid); - AUDIT_SESSION_PROCNEW(child_proc->p_ucred); // XXX END: wants to move to be common code (and safe) /* @@ -570,7 +568,6 @@ fork1(proc_t parent_proc, thread_t *child_threadp, int kind) AUDIT_ARG(pid, child_proc->p_pid); - AUDIT_SESSION_PROCNEW(child_proc->p_ucred); // XXX END: wants to move to be common code (and safe) /* @@ -690,7 +687,6 @@ vfork_return(proc_t child_proc, int32_t *retval, int rval) thread_t parent_thread = (thread_t)current_thread(); uthread_t parent_uthread = (uthread_t)get_bsdthread_info(parent_thread); - act_thread_catt(parent_uthread->uu_userstate); /* end vfork in parent */ @@ -948,14 +944,6 @@ cloneproc(task_t parent_task, proc_t parent_proc, int inherit_memory) if (parent_proc->p_flag & P_LP64) { task_set_64bit(child_task, TRUE); OSBitOrAtomic(P_LP64, (UInt32 *)&child_proc->p_flag); -#ifdef __ppc__ - /* - * PPC51: ppc64 is limited to 51-bit addresses. - * Memory above that limit is handled specially at - * the pmap level. - */ - pmap_map_sharedpage(child_task, get_map_pmap(get_task_map(child_task))); -#endif /* __ppc__ */ } else { task_set_64bit(child_task, FALSE); OSBitAndAtomic(~((uint32_t)P_LP64), (UInt32 *)&child_proc->p_flag); @@ -1031,6 +1019,9 @@ forkproc_free(proc_t p) /* Stop the profiling clock */ stopprofclock(p); + /* Update the audit session proc count */ + AUDIT_SESSION_PROCEXIT(p); + /* Release the credential reference */ kauth_cred_unref(&p->p_ucred); @@ -1069,6 +1060,7 @@ forkproc(proc_t parent_proc) { proc_t child_proc; /* Our new process */ static int nextpid = 0, pidwrap = 0, nextpidversion = 0; + static uint64_t nextuniqueid = 0; int error = 0; struct session *sessp; uthread_t parent_uthread = (uthread_t)get_bsdthread_info(current_thread()); @@ -1147,6 +1139,8 @@ retry: nprocs++; child_proc->p_pid = nextpid; child_proc->p_idversion = nextpidversion++; + /* kernel process is handcrafted and not from fork, so start from 1 */ + child_proc->p_uniqueid = ++nextuniqueid; #if 1 if (child_proc->p_pid != 0) { if (pfind_locked(child_proc->p_pid) != PROC_NULL) @@ -1180,7 +1174,7 @@ retry: * Increase reference counts on shared objects. * The p_stats and p_sigacts substructs are set in vm_fork. */ - child_proc->p_flag = (parent_proc->p_flag & (P_LP64 | P_TRANSLATED | P_AFFINITY)); + child_proc->p_flag = (parent_proc->p_flag & (P_LP64 | P_TRANSLATED | P_AFFINITY | P_DISABLE_ASLR)); if (parent_proc->p_flag & P_PROFIL) startprofclock(child_proc); /* @@ -1188,22 +1182,26 @@ retry: * credential will be granted to the new process. 
*/ child_proc->p_ucred = kauth_cred_get_with_ref(); + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(child_proc); + /* update audit session proc count */ + AUDIT_SESSION_PROCNEW(child_proc); -#ifdef CONFIG_EMBEDDED - lck_mtx_init(&child_proc->p_mlock, proc_lck_grp, proc_lck_attr); - lck_mtx_init(&child_proc->p_fdmlock, proc_lck_grp, proc_lck_attr); -#if CONFIG_DTRACE - lck_mtx_init(&child_proc->p_dtrace_sprlock, proc_lck_grp, proc_lck_attr); -#endif - lck_spin_init(&child_proc->p_slock, proc_lck_grp, proc_lck_attr); -#else /* !CONFIG_EMBEDDED */ +#if CONFIG_FINE_LOCK_GROUPS lck_mtx_init(&child_proc->p_mlock, proc_mlock_grp, proc_lck_attr); lck_mtx_init(&child_proc->p_fdmlock, proc_fdmlock_grp, proc_lck_attr); #if CONFIG_DTRACE lck_mtx_init(&child_proc->p_dtrace_sprlock, proc_lck_grp, proc_lck_attr); #endif lck_spin_init(&child_proc->p_slock, proc_slock_grp, proc_lck_attr); -#endif /* !CONFIG_EMBEDDED */ +#else /* !CONFIG_FINE_LOCK_GROUPS */ + lck_mtx_init(&child_proc->p_mlock, proc_lck_grp, proc_lck_attr); + lck_mtx_init(&child_proc->p_fdmlock, proc_lck_grp, proc_lck_attr); +#if CONFIG_DTRACE + lck_mtx_init(&child_proc->p_dtrace_sprlock, proc_lck_grp, proc_lck_attr); +#endif + lck_spin_init(&child_proc->p_slock, proc_lck_grp, proc_lck_attr); +#endif /* !CONFIG_FINE_LOCK_GROUPS */ klist_init(&child_proc->p_klist); if (child_proc->p_textvp != NULLVP) { @@ -1396,6 +1394,7 @@ uthread_alloc(task_t task, thread_t thread, int noinherit) p = (proc_t) get_bsdtask_info(task); uth = (uthread_t)ut; + uth->uu_kwe.kwe_uth = uth; /* * Thread inherits credential from the creating thread, if both diff --git a/bsd/kern/kern_lockf.c b/bsd/kern/kern_lockf.c index 31b25d885..b7775864c 100644 --- a/bsd/kern/kern_lockf.c +++ b/bsd/kern/kern_lockf.c @@ -89,7 +89,7 @@ static int maxlockdepth = MAXDEPTH; void lf_print(const char *tag, struct lockf *lock); void lf_printlist(const char *tag, struct lockf *lock); static int lockf_debug = 2; -SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW, &lockf_debug, 0, ""); +SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &lockf_debug, 0, ""); /* * If there is no mask bit selector, or there is on, and the selector is @@ -129,11 +129,13 @@ static overlap_t lf_findoverlap(struct lockf *, struct lockf *, int, struct lockf ***, struct lockf **); static struct lockf *lf_getblock(struct lockf *); static int lf_getlock(struct lockf *, struct flock *); +#if CONFIG_EMBEDDED +static int lf_getlockpid(struct vnode *, struct flock *); +#endif static int lf_setlock(struct lockf *); static int lf_split(struct lockf *, struct lockf *); static void lf_wakelock(struct lockf *, boolean_t); - /* * lf_advlock * @@ -172,6 +174,11 @@ lf_advlock(struct vnop_advlock_args *ap) /* XXX HFS may need a !vnode_isreg(vp) EISDIR error here */ +#if CONFIG_EMBEDDED + if (ap->a_op == F_GETLKPID) + return lf_getlockpid(vp, fl); +#endif + /* * Avoid the common case of unlocking when inode has no locks. */ @@ -289,7 +296,7 @@ lf_advlock(struct vnop_advlock_args *ap) error = EINVAL; break; } - lck_mtx_unlock(&vp->v_lock); /* done maniplulating the list */ + lck_mtx_unlock(&vp->v_lock); /* done manipulating the list */ LOCKF_DEBUG(0, "lf_advlock: normal exit: %d\n\n", error); return (error); @@ -297,25 +304,42 @@ lf_advlock(struct vnop_advlock_args *ap) /* - * lf_coelesce_adjacent + * Take any lock attempts which are currently blocked by a given lock ("from") + * and mark them as blocked by a different lock ("to"). 
Used in the case + * where a byte range currently occupied by "from" is to be occupied by "to." + */ +static void +lf_move_blocked(struct lockf *to, struct lockf *from) +{ + struct lockf *tlock; + + TAILQ_FOREACH(tlock, &from->lf_blkhd, lf_block) { + tlock->lf_next = to; + } + + TAILQ_CONCAT(&to->lf_blkhd, &from->lf_blkhd, lf_block); +} + +/* + * lf_coalesce_adjacent * - * Description: Helper function: when setting a lock, coelesce adjacent + * Description: Helper function: when setting a lock, coalesce adjacent * locks. Needed because adjacent locks are not overlapping, - * but POSIX requires that they be coelesced. + * but POSIX requires that they be coalesced. * * Parameters: lock The new lock which may be adjacent - * to already locked reagions, and which - * should therefore be coelesced with them + * to already locked regions, and which + * should therefore be coalesced with them * * Returns: <void> */ static void -lf_coelesce_adjacent(struct lockf *lock) +lf_coalesce_adjacent(struct lockf *lock) { struct lockf **lf = lock->lf_head; while (*lf != NOLOCKF) { - /* reject locks that obviously could not be coelesced */ + /* reject locks that obviously could not be coalesced */ if ((*lf == lock) || ((*lf)->lf_id != lock->lf_id) || ((*lf)->lf_type != lock->lf_type)) { @@ -323,27 +347,38 @@ lf_coelesce_adjacent(struct lockf *lock) continue; } + /* + * NOTE: Assumes that if two locks are adjacent on the number line + * and belong to the same owner, then they are adjacent on the list. + */ + /* If the lock ends adjacent to us, we can coelesce it */ if ((*lf)->lf_end != -1 && ((*lf)->lf_end + 1) == lock->lf_start) { struct lockf *adjacent = *lf; - LOCKF_DEBUG(0, "lf_coelesce_adjacent: coelesce adjacent previous\n"); + LOCKF_DEBUG(0, "lf_coalesce_adjacent: coalesce adjacent previous\n"); lock->lf_start = (*lf)->lf_start; *lf = lock; lf = &(*lf)->lf_next; + + lf_move_blocked(lock, adjacent); + FREE(adjacent, M_LOCKF); continue; } - /* If the lock starts adjacent to us, we can coelesce it */ + /* If the lock starts adjacent to us, we can coalesce it */ if (lock->lf_end != -1 && (lock->lf_end + 1) == (*lf)->lf_start) { struct lockf *adjacent = *lf; - LOCKF_DEBUG(0, "lf_coelesce_adjacent: coelesce adjacent following\n"); + LOCKF_DEBUG(0, "lf_coalesce_adjacent: coalesce adjacent following\n"); lock->lf_end = (*lf)->lf_end; lock->lf_next = (*lf)->lf_next; lf = &lock->lf_next; + + lf_move_blocked(lock, adjacent); + FREE(adjacent, M_LOCKF); continue; } @@ -373,7 +408,7 @@ lf_coelesce_adjacent(struct lockf *lock) * msleep:EINTR * * Notes: We add the lock to the provisional lock list. We do not - * coelesce at this time; this has implications for other lock + * coalesce at this time; this has implications for other lock * requestors in the blocker search mechanism. 
*/ static int @@ -518,13 +553,8 @@ lf_setlock(struct lockf *lock) error = msleep(lock, &vp->v_lock, priority, lockstr, 0); if (!TAILQ_EMPTY(&lock->lf_blkhd)) { - struct lockf *tlock; - if ((block = lf_getblock(lock))) { - TAILQ_FOREACH(tlock, &lock->lf_blkhd, lf_block) { - tlock->lf_next = block; - } - TAILQ_CONCAT(&block->lf_blkhd, &lock->lf_blkhd, lf_block); + lf_move_blocked(block, lock); } } if (error) { /* XXX */ @@ -589,7 +619,7 @@ lf_setlock(struct lockf *lock) lf_wakelock(overlap, TRUE); overlap->lf_type = lock->lf_type; FREE(lock, M_LOCKF); - lock = overlap; /* for lf_coelesce_adjacent() */ + lock = overlap; /* for lf_coalesce_adjacent() */ break; case OVERLAP_CONTAINS_LOCK: @@ -598,7 +628,7 @@ lf_setlock(struct lockf *lock) */ if (overlap->lf_type == lock->lf_type) { FREE(lock, M_LOCKF); - lock = overlap; /* for lf_coelesce_adjacent() */ + lock = overlap; /* for lf_coalesce_adjacent() */ break; } if (overlap->lf_start == lock->lf_start) { @@ -676,8 +706,8 @@ lf_setlock(struct lockf *lock) } break; } - /* Coelesce adjacent locks with identical attributes */ - lf_coelesce_adjacent(lock); + /* Coalesce adjacent locks with identical attributes */ + lf_coalesce_adjacent(lock); #ifdef LOCKF_DEBUGGING if (lockf_debug & 1) { lf_print("lf_setlock: got the lock", lock); @@ -825,6 +855,55 @@ lf_getlock(struct lockf *lock, struct flock *fl) return (0); } +#if CONFIG_EMBEDDED +int lf_getlockpid(struct vnode *vp, struct flock *fl) +{ + struct lockf *lf, *blk; + + if (vp == 0) + return EINVAL; + + fl->l_type = F_UNLCK; + + lck_mtx_lock(&vp->v_lock); + + for (lf = vp->v_lockf; lf; lf = lf->lf_next) { + + if (lf->lf_flags & F_POSIX) { + if ((((struct proc *)lf->lf_id)->p_pid) == fl->l_pid) { + fl->l_type = lf->lf_type; + fl->l_whence = SEEK_SET; + fl->l_start = lf->lf_start; + if (lf->lf_end == -1) + fl->l_len = 0; + else + fl->l_len = lf->lf_end - lf->lf_start + 1; + + break; + } + } + + TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) { + if (blk->lf_flags & F_POSIX) { + if ((((struct proc *)blk->lf_id)->p_pid) == fl->l_pid) { + fl->l_type = blk->lf_type; + fl->l_whence = SEEK_SET; + fl->l_start = blk->lf_start; + if (blk->lf_end == -1) + fl->l_len = 0; + else + fl->l_len = blk->lf_end - blk->lf_start + 1; + + break; + } + } + } + } + + lck_mtx_unlock(&vp->v_lock); + return (0); +} +#endif /* * lf_getblock @@ -901,7 +980,7 @@ lf_getblock(struct lockf *lock) * while lf_setlock will iterate over all overlapping locks to * * The check parameter can be SELF, meaning we are looking for - * overelapping locks owned by us, or it can be OTHERS, meaning + * overlapping locks owned by us, or it can be OTHERS, meaning * we are looking for overlapping locks owned by someone else so * we can report a blocking lock on an F_GETLK request. * @@ -913,6 +992,7 @@ lf_findoverlap(struct lockf *lf, struct lockf *lock, int type, struct lockf ***prev, struct lockf **overlap) { off_t start, end; + int found_self = 0; *overlap = lf; if (lf == NOLOCKF) @@ -926,10 +1006,28 @@ lf_findoverlap(struct lockf *lf, struct lockf *lock, int type, while (lf != NOLOCKF) { if (((type & SELF) && lf->lf_id != lock->lf_id) || ((type & OTHERS) && lf->lf_id == lock->lf_id)) { + /* + * Locks belonging to one process are adjacent on the + * list, so if we've found any locks belonging to us, + * and we're now seeing something else, then we've + * examined all "self" locks. Note that bailing out + * here is quite important; for coalescing, we assume + * numerically adjacent locks from the same owner to + * be adjacent on the list. 
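Exercising the embedded-only query path above from userspace (sketch): F_GETLKPID is not in the public desktop fcntl.h, and the fallback value below is an assumption taken from the private header.

#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>

#ifndef F_GETLKPID
#define F_GETLKPID	66	/* assumption: value from the private header */
#endif

static void
show_lock_for_pid(int fd, pid_t pid)
{
	struct flock fl = { .l_type = F_UNLCK, .l_pid = pid };

	if (fcntl(fd, F_GETLKPID, &fl) == 0 && fl.l_type != F_UNLCK)
		printf("pid %d: type %d, start %lld, len %lld\n",
		    (int)pid, fl.l_type, (long long)fl.l_start,
		    (long long)fl.l_len);
}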
+ */ + if ((type & SELF) && found_self) { + return OVERLAP_NONE; + } + *prev = &lf->lf_next; *overlap = lf = lf->lf_next; continue; } + + if ((type & SELF)) { + found_self = 1; + } + #ifdef LOCKF_DEBUGGING if (lockf_debug & 2) lf_print("\tchecking", lf); @@ -941,6 +1039,11 @@ lf_findoverlap(struct lockf *lf, struct lockf *lock, int type, (end != -1 && lf->lf_start > end)) { /* Case 0 */ LOCKF_DEBUG(2, "no overlap\n"); + + /* + * NOTE: assumes that locks for the same process are + * nonintersecting and ordered. + */ if ((type & SELF) && end != -1 && lf->lf_start > end) return (OVERLAP_NONE); *prev = &lf->lf_next; diff --git a/bsd/kern/kern_malloc.c b/bsd/kern/kern_malloc.c index ff86bfff6..c1700ee51 100644 --- a/bsd/kern/kern_malloc.c +++ b/bsd/kern/kern_malloc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -293,9 +293,15 @@ const char *memname[] = { #else "", /* 109 M_DECMPFS_CNODE */ #endif /* HFS_COMPRESSION */ + "ipmfilter", /* 110 M_INMFILTER */ + "ipmsource", /* 111 M_IPMSOURCE */ + "in6mfilter", /* 112 M_IN6MFILTER */ + "ip6mopts", /* 113 M_IP6MOPTS */ + "ip6msource", /* 114 M_IP6MSOURCE */ }; /* for use with kmzones.kz_zalloczone */ +#define KMZ_CREATEZONE_ACCT ((void *)-3) #define KMZ_CREATEZONE ((void *)-2) #define KMZ_LOOKUPZONE ((void *)-1) #define KMZ_MALLOC ((void *)0) @@ -332,7 +338,7 @@ struct kmzones { { 0, KMZ_MALLOC, FALSE }, /* 21 M_FHANDLE */ #if (NFSCLIENT || NFSSERVER) { SOS(nfsreq), KMZ_CREATEZONE, FALSE }, /* 22 M_NFSREQ */ - { SOS(nfsmount), KMZ_CREATEZONE, FALSE },/* 23 M_NFSMNT */ + { SOS(nfsmount),KMZ_CREATEZONE, FALSE }, /* 23 M_NFSMNT */ { SOS(nfsnode), KMZ_CREATEZONE, FALSE }, /* 24 M_NFSNODE */ #else { 0, KMZ_MALLOC, FALSE }, /* 22 M_NFSREQ */ @@ -340,25 +346,25 @@ struct kmzones { { 0, KMZ_MALLOC, FALSE }, /* 24 M_NFSNODE */ #endif { SOS(vnode), KMZ_CREATEZONE, TRUE }, /* 25 M_VNODE */ - { SOS(namecache),KMZ_CREATEZONE, FALSE }, /* 26 M_CACHE */ + { SOS(namecache), KMZ_CREATEZONE, FALSE }, /* 26 M_CACHE */ #if QUOTA { SOX(dquot), KMZ_LOOKUPZONE, FALSE }, /* 27 M_DQUOT */ #else { 0, KMZ_MALLOC, FALSE }, /* 27 M_DQUOT */ #endif { 0, KMZ_MALLOC, FALSE }, /* 28 M_UFSMNT */ - { 0, KMZ_MALLOC, FALSE }, /* 29 M_CGSUM */ + { 0, KMZ_MALLOC, FALSE }, /* 29 M_SHM */ { SOS(plimit), KMZ_CREATEZONE, TRUE }, /* 30 M_PLIMIT */ - { SOS(sigacts), KMZ_CREATEZONE, TRUE }, /* 31 M_SIGACTS */ + { SOS(sigacts), KMZ_CREATEZONE_ACCT, TRUE }, /* 31 M_SIGACTS */ { 0, KMZ_MALLOC, FALSE }, /* 32 M_VMOBJ */ { 0, KMZ_MALLOC, FALSE }, /* 33 M_VMOBJHASH */ { 0, KMZ_MALLOC, FALSE }, /* 34 M_VMPMAP */ { 0, KMZ_MALLOC, FALSE }, /* 35 M_VMPVENT */ { 0, KMZ_MALLOC, FALSE }, /* 36 M_VMPAGER */ { 0, KMZ_MALLOC, FALSE }, /* 37 M_VMPGDATA */ - { SOS(fileproc),KMZ_CREATEZONE, TRUE }, /* 38 M_FILEPROC */ - { SOS(filedesc),KMZ_CREATEZONE, TRUE }, /* 39 M_FILEDESC */ - { SOX(lockf), KMZ_CREATEZONE, TRUE }, /* 40 M_LOCKF */ + { SOS(fileproc),KMZ_CREATEZONE_ACCT, TRUE }, /* 38 M_FILEPROC */ + { SOS(filedesc),KMZ_CREATEZONE_ACCT, TRUE }, /* 39 M_FILEDESC */ + { SOX(lockf), KMZ_CREATEZONE_ACCT, TRUE }, /* 40 M_LOCKF */ { SOS(proc), KMZ_CREATEZONE, FALSE }, /* 41 M_PROC */ { SOS(pstats), KMZ_CREATEZONE, TRUE }, /* 42 M_PSTATS */ { 0, KMZ_MALLOC, FALSE }, /* 43 M_SEGMENT */ @@ -370,10 +376,10 @@ struct kmzones { { 0, KMZ_MALLOC, FALSE }, /* 49 M_NETADDR */ #if (NFSCLIENT || NFSSERVER) { SOX(nfsrv_sock), - KMZ_CREATEZONE, FALSE }, /* 50 M_NFSSVC */ + 
KMZ_CREATEZONE_ACCT, FALSE }, /* 50 M_NFSSVC */ { 0, KMZ_MALLOC, FALSE }, /* 51 M_NFSUID */ { SOX(nfsrvcache), - KMZ_CREATEZONE, FALSE }, /* 52 M_NFSD */ + KMZ_CREATEZONE_ACCT, FALSE }, /* 52 M_NFSD */ #else { 0, KMZ_MALLOC, FALSE }, /* 50 M_NFSSVC */ { 0, KMZ_MALLOC, FALSE }, /* 51 M_NFSUID */ @@ -389,7 +395,7 @@ struct kmzones { { 0, KMZ_MALLOC, FALSE }, /* 58 unused entry */ #if (NFSCLIENT || NFSSERVER) { SOS(nfsrv_descript), - KMZ_CREATEZONE, FALSE }, /* 59 M_NFSRVDESC */ + KMZ_CREATEZONE_ACCT, FALSE }, /* 59 M_NFSRVDESC */ { SOS(nfsdmap), KMZ_CREATEZONE, FALSE }, /* 60 M_NFSDIROFF */ { SOS(fhandle), KMZ_LOOKUPZONE, FALSE }, /* 61 M_NFSBIGFH */ #else @@ -407,9 +413,9 @@ struct kmzones { { 0, KMZ_MALLOC, FALSE }, /* 69 M_ADOSFSMNT */ { 0, KMZ_MALLOC, FALSE }, /* 70 M_ADOSFSNODE */ { 0, KMZ_MALLOC, FALSE }, /* 71 M_ANODE */ - { SOX(buf), KMZ_CREATEZONE, TRUE }, /* 72 M_BUFHDR */ + { 0, KMZ_MALLOC, TRUE }, /* 72 M_BUFHDR */ { (NDFILE * OFILESIZE), - KMZ_CREATEZONE, FALSE }, /* 73 M_OFILETABL */ + KMZ_CREATEZONE_ACCT, FALSE }, /* 73 M_OFILETABL */ { MCLBYTES, KMZ_CREATEZONE, FALSE }, /* 74 M_MCLUST */ #if HFS { SOX(hfsmount),KMZ_LOOKUPZONE, FALSE }, /* 75 M_HFSMNT */ @@ -437,15 +443,15 @@ struct kmzones { { SOS(journal), KMZ_CREATEZONE, FALSE }, /* 91 M_JNL_JNL */ { SOS(transaction), KMZ_CREATEZONE, FALSE }, /* 92 M_JNL_TR */ #else - { 0, KMZ_MALLOC, FALSE }, /* 91 M_JNL_JNL */ - { 0, KMZ_MALLOC, FALSE }, /* 92 M_JNL_TR */ + { 0, KMZ_MALLOC, FALSE }, /* 91 M_JNL_JNL */ + { 0, KMZ_MALLOC, FALSE }, /* 92 M_JNL_TR */ #endif - { SOS(specinfo), KMZ_CREATEZONE, TRUE }, /* 93 M_SPECINFO */ - { SOS(kqueue), KMZ_CREATEZONE, FALSE }, /* 94 M_KQUEUE */ + { SOS(specinfo),KMZ_CREATEZONE, TRUE }, /* 93 M_SPECINFO */ + { SOS(kqueue), KMZ_CREATEZONE, FALSE }, /* 94 M_KQUEUE */ #if HFS - { SOS(directoryhint), KMZ_CREATEZONE, FALSE }, /* 95 M_HFSDIRHINT */ + { SOS(directoryhint), KMZ_CREATEZONE, TRUE }, /* 95 M_HFSDIRHINT */ #else - { 0, KMZ_MALLOC, FALSE }, /* 95 M_HFSDIRHINT */ + { 0, KMZ_MALLOC, FALSE }, /* 95 M_HFSDIRHINT */ #endif { SOS(cl_readahead), KMZ_CREATEZONE, TRUE }, /* 96 M_CLRDAHEAD */ { SOS(cl_writebehind),KMZ_CREATEZONE, TRUE }, /* 97 M_CLWRBEHIND */ @@ -454,7 +460,7 @@ struct kmzones { { 0, KMZ_MALLOC, FALSE }, /* 100 M_KAUTH */ { 0, KMZ_MALLOC, FALSE }, /* 101 M_DUMMYNET */ #ifndef __LP64__ - { SOS(unsafe_fsnode),KMZ_CREATEZONE, FALSE }, /* 102 M_UNSAFEFS */ + { SOS(unsafe_fsnode),KMZ_CREATEZONE, TRUE }, /* 102 M_UNSAFEFS */ #else { 0, KMZ_MALLOC, FALSE }, /* 102 M_UNSAFEFS */ #endif /* __LP64__ */ @@ -465,10 +471,15 @@ struct kmzones { { 0, KMZ_MALLOC, FALSE }, /* 107 M_LCTX */ { 0, KMZ_MALLOC, FALSE }, /* 108 M_TRAFFIC_MGT */ #if HFS_COMPRESSION - { SOS(decmpfs_cnode),KMZ_CREATEZONE, FALSE }, /* 109 M_DECMPFS_CNODE */ + { SOS(decmpfs_cnode),KMZ_CREATEZONE , FALSE}, /* 109 M_DECMPFS_CNODE */ #else { 0, KMZ_MALLOC, FALSE }, /* 109 M_DECMPFS_CNODE */ #endif /* HFS_COMPRESSION */ + { 0, KMZ_MALLOC, FALSE }, /* 110 M_INMFILTER */ + { 0, KMZ_MALLOC, FALSE }, /* 111 M_IPMSOURCE */ + { 0, KMZ_MALLOC, FALSE }, /* 112 M_IN6MFILTER */ + { 0, KMZ_MALLOC, FALSE }, /* 113 M_IP6MOPTS */ + { 0, KMZ_MALLOC, FALSE }, /* 114 M_IP6MSOURCE */ #undef SOS #undef SOX }; @@ -495,10 +506,14 @@ kmeminit(void) ; else /* XXX */ - if (kmz->kz_zalloczone == KMZ_CREATEZONE) { + if (kmz->kz_zalloczone == KMZ_CREATEZONE || + kmz->kz_zalloczone == KMZ_CREATEZONE_ACCT) { kmz->kz_zalloczone = zinit(kmz->kz_elemsize, 1024 * 1024, PAGE_SIZE, memname[kmz - kmzones]); + zone_change(kmz->kz_zalloczone, Z_CALLERACCT, + 
(kmz->kz_zalloczone == KMZ_CREATEZONE_ACCT)); + if (kmz->kz_noencrypt == TRUE) zone_change(kmz->kz_zalloczone, Z_NOENCRYPT, TRUE); } @@ -526,12 +541,6 @@ kmeminit(void) } } -#define MDECL(reqlen) \ -union { \ - struct _mhead hdr; \ - char _m[(reqlen) + sizeof (struct _mhead)]; \ -} - struct _mhead { size_t mlen; char dat[0]; @@ -543,8 +552,8 @@ _MALLOC( int type, int flags) { - MDECL(size) *mem; - size_t memsize = sizeof (*mem); + struct _mhead *hdr; + size_t memsize = sizeof (*hdr) + size; if (type >= M_LAST) panic("_malloc TYPE"); @@ -553,11 +562,11 @@ _MALLOC( return (NULL); if (flags & M_NOWAIT) { - mem = (void *)kalloc_noblock(memsize); + hdr = (void *)kalloc_noblock(memsize); } else { - mem = (void *)kalloc(memsize); + hdr = (void *)kalloc(memsize); - if (mem == NULL) { + if (hdr == NULL) { /* * We get here when the caller told us to block waiting for memory, but @@ -572,15 +581,15 @@ _MALLOC( panic("_MALLOC: kalloc returned NULL (potential leak), size %llu", (uint64_t) size); } } - if (!mem) + if (!hdr) return (0); - mem->hdr.mlen = memsize; + hdr->mlen = memsize; if (flags & M_ZERO) - bzero(mem->hdr.dat, size); + bzero(hdr->dat, size); - return (mem->hdr.dat); + return (hdr->dat); } void @@ -600,6 +609,36 @@ _FREE( kfree(hdr, hdr->mlen); } +void * +_REALLOC( + void *addr, + size_t size, + int type, + int flags) +{ + struct _mhead *hdr; + void *newaddr; + size_t alloc; + + /* realloc(NULL, ...) is equivalent to malloc(...) */ + if (addr == NULL) + return (_MALLOC(size, type, flags)); + + /* Allocate a new, bigger (or smaller) block */ + if ((newaddr = _MALLOC(size, type, flags)) == NULL) + return (NULL); + + hdr = addr; + --hdr; + alloc = hdr->mlen - sizeof (*hdr); + + /* Copy over original contents */ + bcopy(addr, newaddr, MIN(size, alloc)); + _FREE(addr, type); + + return (newaddr); +} + void * _MALLOC_ZONE( size_t size, @@ -660,3 +699,116 @@ _FREE_ZONE( else kfree(elem, size); } + +#if CONFIG_ZLEAKS + +SYSCTL_DECL(_kern_zleak); +SYSCTL_NODE(_kern, OID_AUTO, zleak, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "zleak"); + +/* + * kern.zleak.active + * + * Show the status of the zleak subsystem (0 = enabled, 1 = active, + * and -1 = failed), and if enabled, allow it to be activated immediately. + */ +static int +sysctl_zleak_active SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int oldval, val, error; + + val = oldval = get_zleak_state(); + error = sysctl_handle_int(oidp, &val, 0, req); + if (error || !req->newptr) + return (error); + /* + * Can only be activated if it's off (and not failed.) + * Cannot be deactivated once it's on. + */ + if (val == 1 && oldval == 0) { + kern_return_t kr = zleak_activate(); + + if (KERN_SUCCESS != kr) + printf("zleak_active: failed to activate " + "live zone leak debugging (%d).\n", kr); + } if (val == 0 && oldval == 1) { + printf("zleak_active: active, cannot be disabled.\n"); + return (EINVAL); + } + return (0); +} + +SYSCTL_PROC(_kern_zleak, OID_AUTO, active, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + 0, 0, sysctl_zleak_active, "I", "zleak activity"); + +/* + * kern.zleak.max_zonemap_size + * + * Read the value of the maximum zonemap size in bytes; useful + * as the maximum size that zleak.global_threshold and + * zleak.zone_threshold should be set to. 
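Caller's-eye view of the new _REALLOC (a sketch that assumes the matching prototype is exported via <sys/malloc.h>, with M_TEMP chosen arbitrarily): unlike userland realloc, this version always allocates a fresh block and copies MIN(new, old) bytes, so only the returned pointer remains valid.

#include <sys/malloc.h>

static void *
grow_scratch(void *buf, size_t newsize)
{
	/*
	 * _REALLOC(NULL, ...) degenerates to _MALLOC(...); on success
	 * the original block has already been freed.
	 */
	return (_REALLOC(buf, newsize, M_TEMP, M_WAITOK));
}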
+ */ +static int +sysctl_zleak_max_zonemap_size SYSCTL_HANDLER_ARGS +{ + uint64_t zmap_max_size = *(vm_size_t *)arg1; + + return sysctl_handle_quad(oidp, &zmap_max_size, arg2, req); +} + +SYSCTL_PROC(_kern_zleak, OID_AUTO, max_zonemap_size, + CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED, + &zleak_max_zonemap_size, 0, + sysctl_zleak_max_zonemap_size, "Q", "zleak max zonemap size"); + + +static int +sysctl_zleak_threshold SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg2) + int error; + uint64_t value = *(vm_size_t *)arg1; + + error = sysctl_io_number(req, value, sizeof (value), &value, NULL); + + if (error || !req->newptr) + return (error); + + if (value > (uint64_t)zleak_max_zonemap_size) + return (ERANGE); + + *(vm_size_t *)arg1 = value; + return (0); +} + +/* + * kern.zleak.global_threshold + * + * Set the global zleak threshold size (in bytes). If the zone map + * grows larger than this value, zleaks are automatically activated. + * + * The default value is set in zleak_init(). + */ +SYSCTL_PROC(_kern_zleak, OID_AUTO, global_threshold, + CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, + &zleak_global_tracking_threshold, 0, + sysctl_zleak_threshold, "Q", "zleak global threshold"); + +/* + * kern.zleak.zone_threshold + * + * Set the per-zone threshold size (in bytes) above which any + * zone will automatically start zleak tracking. + * + * The default value is set in zleak_init(). + * + * Setting this variable will have no effect until zleak tracking is + * activated (See above.) + */ +SYSCTL_PROC(_kern_zleak, OID_AUTO, zone_threshold, + CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, + &zleak_per_zone_tracking_threshold, 0, + sysctl_zleak_threshold, "Q", "zleak per-zone threshold"); + +#endif /* CONFIG_ZLEAKS */ diff --git a/bsd/kern/kern_memorystatus.c b/bsd/kern/kern_memorystatus.c index dfbaa794d..489ddd2be 100644 --- a/bsd/kern/kern_memorystatus.c +++ b/bsd/kern/kern_memorystatus.c @@ -31,6 +31,8 @@ #include <sys/kern_memorystatus.h> #include <kern/sched_prim.h> +#include <kern/kalloc.h> +#include <kern/debug.h> #include <kern/lock.h> #include <kern/task.h> #include <kern/thread.h> @@ -42,6 +44,126 @@ #include <sys/signalvar.h> #include <sys/sysctl.h> #include <sys/wait.h> +#include <sys/tree.h> +#include <pexpert/pexpert.h> + +#if CONFIG_FREEZE +#include <vm/vm_protos.h> +#include <vm/vm_map.h> + +enum { + kProcessSuspended = (1 << 0), + kProcessHibernated = (1 << 1), + kProcessNoReclaimWorth = (1 << 2), + kProcessIgnored = (1 << 3), + kProcessBusy = (1 << 4) +}; + +static lck_mtx_t * hibernation_mlock; +static lck_attr_t * hibernation_lck_attr; +static lck_grp_t * hibernation_lck_grp; +static lck_grp_attr_t * hibernation_lck_grp_attr; + +typedef struct hibernation_node { + RB_ENTRY(hibernation_node) link; + pid_t pid; + uint32_t state; + mach_timespec_t hibernation_ts; +} hibernation_node; + +static int hibernation_tree_compare(hibernation_node *n1, hibernation_node *n2) { + if (n1->pid < n2->pid) + return -1; + else if (n1->pid > n2->pid) + return 1; + else + return 0; +} + +static RB_HEAD(hibernation_tree, hibernation_node) hibernation_tree_head; +RB_PROTOTYPE_SC(static, hibernation_tree, hibernation_node, link, hibernation_tree_compare); + +RB_GENERATE(hibernation_tree, hibernation_node, link, hibernation_tree_compare); + +static inline boolean_t kern_hibernation_can_hibernate_processes(void); +static boolean_t kern_hibernation_can_hibernate(void); + +static void kern_hibernation_add_node(hibernation_node *node); +static hibernation_node *kern_hibernation_get_node(pid_t pid); +static void 
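Driving these knobs from userspace (a sketch; it requires a kernel built with CONFIG_ZLEAKS, and activation is one-way as described above):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	uint64_t zmap_max = 0;
	size_t len = sizeof (zmap_max);
	int active = 1;

	if (sysctlbyname("kern.zleak.max_zonemap_size",
	    &zmap_max, &len, NULL, 0) == 0)
		printf("zone map ceiling: %llu bytes\n",
		    (unsigned long long)zmap_max);

	/* flipping kern.zleak.active on sticks until reboot */
	if (sysctlbyname("kern.zleak.active",
	    NULL, NULL, &active, sizeof (active)) != 0)
		perror("zleak activate");
	return (0);
}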
kern_hibernation_release_node(hibernation_node *node); +static void kern_hibernation_free_node(hibernation_node *node, boolean_t unlock); + +static void kern_hibernation_register_pid(pid_t pid); +static void kern_hibernation_unregister_pid(pid_t pid); + +static int kern_hibernation_get_process_state(pid_t pid, uint32_t *state, mach_timespec_t *ts); +static int kern_hibernation_set_process_state(pid_t pid, uint32_t state); + +static void kern_hibernation_cull(void); + +static void kern_hibernation_thread(void); + +extern boolean_t vm_freeze_enabled; + +int kern_hibernation_wakeup = 0; + +static int jetsam_priority_list_hibernation_index = 0; + +/* Thresholds */ +static int kern_memorystatus_level_hibernate = 50; + +#define HIBERNATION_PAGES_MIN ( 1 * 1024 * 1024 / PAGE_SIZE) +#define HIBERNATION_PAGES_MAX (16 * 1024 * 1024 / PAGE_SIZE) + +static unsigned int kern_memorystatus_hibernation_pages_min = HIBERNATION_PAGES_MIN; +static unsigned int kern_memorystatus_hibernation_pages_max = HIBERNATION_PAGES_MAX; + +static unsigned int kern_memorystatus_suspended_count = 0; +static unsigned int kern_memorystatus_hibernated_count = 0; + +static unsigned int kern_memorystatus_hibernation_suspended_minimum = 4; + +static unsigned int kern_memorystatus_low_swap_pages = 0; + +/* Throttling */ +#define HIBERNATION_DAILY_MB_MAX 1024 +#define HIBERNATION_DAILY_PAGEOUTS_MAX (HIBERNATION_DAILY_MB_MAX * (1024 * 1024 / PAGE_SIZE)) + +static struct throttle_interval_t { + uint32_t mins; + uint32_t burst_multiple; + uint32_t pageouts; + uint32_t max_pageouts; + mach_timespec_t ts; + boolean_t throttle; +} throttle_intervals[] = { + { 60, 8, 0, 0, { 0, 0 }, FALSE }, /* 1 hour intermediate interval, 8x burst */ + { 24 * 60, 1, 0, 0, { 0, 0 }, FALSE }, /* 24 hour long interval, no burst */ +}; + +/* Stats */ +static uint64_t kern_memorystatus_hibernation_count = 0; +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_hibernation_count, CTLFLAG_RD, &kern_memorystatus_hibernation_count, ""); + +static uint64_t kern_memorystatus_hibernation_pageouts = 0; +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_hibernation_pageouts, CTLFLAG_RD, &kern_memorystatus_hibernation_pageouts, ""); + +static uint64_t kern_memorystatus_hibernation_throttle_count = 0; +SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_hibernation_throttle_count, CTLFLAG_RD, &kern_memorystatus_hibernation_throttle_count, ""); + +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_hibernation_min_processes, CTLFLAG_RW, &kern_memorystatus_hibernation_suspended_minimum, 0, ""); + +#if DEVELOPMENT || DEBUG +/* Allow parameter tweaking in these builds */ +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_level_hibernate, CTLFLAG_RW, &kern_memorystatus_level_hibernate, 0, ""); + +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_hibernation_pages_min, CTLFLAG_RW, &kern_memorystatus_hibernation_pages_min, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_hibernation_pages_max, CTLFLAG_RW, &kern_memorystatus_hibernation_pages_max, 0, ""); + +boolean_t kern_memorystatus_hibernation_throttle_enabled = TRUE; +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_hibernation_throttle_enabled, CTLFLAG_RW, &kern_memorystatus_hibernation_throttle_enabled, 0, ""); +#endif /* DEVELOPMENT || DEBUG */ +#endif /* CONFIG_FREEZE */ extern unsigned int vm_page_free_count; extern unsigned int vm_page_active_count; @@ -54,6 +176,8 @@ static void kern_memorystatus_thread(void); int kern_memorystatus_wakeup = 0; int kern_memorystatus_level = 0; int kern_memorystatus_last_level = 0; +unsigned int kern_memorystatus_delta; + unsigned int 
kern_memorystatus_kev_failure_count = 0; int kern_memorystatus_level_critical = 5; #define kern_memorystatus_level_highwater (kern_memorystatus_level_critical + 5) @@ -76,16 +200,66 @@ static lck_attr_t * jetsam_lck_attr; static lck_grp_t * jetsam_lck_grp; static lck_grp_attr_t * jetsam_lck_grp_attr; -SYSCTL_INT(_kern, OID_AUTO, memorystatus_level, CTLFLAG_RD, &kern_memorystatus_level, 0, ""); -SYSCTL_UINT(_kern, OID_AUTO, memorystatus_kev_failure_count, CTLFLAG_RD, &kern_memorystatus_kev_failure_count, 0, ""); +SYSCTL_INT(_kern, OID_AUTO, memorystatus_level, CTLFLAG_RD | CTLFLAG_LOCKED, &kern_memorystatus_level, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, memorystatus_kev_failure_count, CTLFLAG_RD | CTLFLAG_LOCKED, &kern_memorystatus_kev_failure_count, 0, ""); + +#if DEVELOPMENT || DEBUG + +enum { + kJetsamDiagnosticModeNone = 0, + kJetsamDiagnosticModeAll = 1, + kJetsamDiagnosticModeStopAtFirstActive = 2 +} jetsam_diagnostic_mode = kJetsamDiagnosticModeNone; + +static int jetsam_diagnostic_suspended_one_active_proc = 0; + +static int +sysctl_jetsam_diagnostic_mode SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error, val = jetsam_diagnostic_mode; + boolean_t disabled; + + error = sysctl_handle_int(oidp, &val, 0, req); + if (error || !req->newptr) + return (error); + if ((val < 0) || (val > 2)) { + printf("jetsam: diagnostic mode: invalid value - %d\n", val); + return (0); + } + + /* + * If jetsam_diagnostic_mode is set, we need to lower memory threshold for jetsam + */ + disabled = (val == 0) && (jetsam_diagnostic_mode != kJetsamDiagnosticModeNone); + + jetsam_diagnostic_mode = val; + + if (disabled) { + kern_memorystatus_level_critical = 5; + printf("jetsam: diagnostic mode: resetting critical level to %d\n", kern_memorystatus_level_critical); + } else { + kern_memorystatus_level_critical = 10; + printf("jetsam: diagnostic mode: %d: increasing critical level to %d\n", (int) jetsam_diagnostic_mode, kern_memorystatus_level_critical); + if (jetsam_diagnostic_mode == kJetsamDiagnosticModeStopAtFirstActive) + printf("jetsam: diagnostic mode: will stop at first active app\n"); + } + + return (0); +} + +SYSCTL_PROC(_debug, OID_AUTO, jetsam_diagnostic_mode, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY, + &jetsam_diagnostic_mode, 0, sysctl_jetsam_diagnostic_mode, "I", "Jetsam Diagnostic Mode"); +#endif /* DEVELOPMENT || DEBUG */ __private_extern__ void kern_memorystatus_init(void) { - jetsam_lck_attr = lck_attr_alloc_init(); - jetsam_lck_grp_attr= lck_grp_attr_alloc_init(); - jetsam_lck_grp = lck_grp_alloc_init("jetsam", jetsam_lck_grp_attr); - jetsam_list_mlock = lck_mtx_alloc_init(jetsam_lck_grp, jetsam_lck_attr); + jetsam_lck_attr = lck_attr_alloc_init(); + jetsam_lck_grp_attr= lck_grp_attr_alloc_init(); + jetsam_lck_grp = lck_grp_alloc_init("jetsam", jetsam_lck_grp_attr); + jetsam_list_mlock = lck_mtx_alloc_init(jetsam_lck_grp, jetsam_lck_attr); + kern_memorystatus_delta = 5 * atop_64(max_mem) / 100; (void)kernel_thread(kernel_task, kern_memorystatus_thread); } @@ -153,48 +327,107 @@ jetsam_snapshot_procs(void) } static void -jetsam_mark_pid_in_snapshot(pid_t pid, int flag) +jetsam_mark_pid_in_snapshot(pid_t pid, int flags) { int i = 0; for (i = 0; i < jetsam_snapshot_list_count; i++) { if (jetsam_snapshot_list[i].pid == pid) { - jetsam_snapshot_list[i].flags |= flag; + jetsam_snapshot_list[i].flags |= flags; return; } } } int -jetsam_kill_top_proc(void) +jetsam_kill_top_proc(boolean_t any, uint32_t cause) { proc_t p; +#ifndef CONFIG_FREEZE +#pragma unused(any) +#endif + if 
(jetsam_snapshot_list_count == 0) { jetsam_snapshot_procs(); } lck_mtx_lock(jetsam_list_mlock); while (jetsam_priority_list_index < jetsam_priority_list_count) { - pid_t aPid; - aPid = jetsam_priority_list[jetsam_priority_list_index].pid; + jetsam_priority_entry_t* jetsam_priority_entry = &jetsam_priority_list[jetsam_priority_list_index]; + pid_t aPid = jetsam_priority_entry->pid; +#if DEVELOPMENT || DEBUG + int activeProcess = jetsam_priority_entry->flags & kJetsamFlagsFrontmost; + int procSuspendedForDiagnosis = jetsam_priority_entry->flags & kJetsamFlagsSuspForDiagnosis; +#endif /* DEVELOPMENT || DEBUG */ jetsam_priority_list_index++; /* skip empty slots in the list */ if (aPid == 0) { continue; // with lock held } lck_mtx_unlock(jetsam_list_mlock); - jetsam_mark_pid_in_snapshot(aPid, kJetsamFlagsKilled); p = proc_find(aPid); if (p != NULL) { - printf("jetsam: killing pid %d [%s] - memory_status_level: %d - ", - aPid, (p->p_comm ? p->p_comm : "(unknown)"), kern_memorystatus_level); - exit1(p, W_EXITCODE(0, SIGKILL), (int *)NULL); - proc_rele(p); + int flags = cause; +#if DEVELOPMENT || DEBUG + if ((jetsam_diagnostic_mode != kJetsamDiagnosticModeNone) && procSuspendedForDiagnosis) { + printf("jetsam: continuing after ignoring proc suspended already for diagnosis - %d\n", aPid); + proc_rele(p); + lck_mtx_lock(jetsam_list_mlock); + continue; + } +#endif /* DEVELOPMENT || DEBUG */ +#if CONFIG_FREEZE + hibernation_node *node; + boolean_t skip; + if ((node = kern_hibernation_get_node(aPid))) { + boolean_t reclaim_proc = !(node->state & (kProcessBusy | kProcessNoReclaimWorth)); + if (any || reclaim_proc) { + if (node->state & kProcessHibernated) { + flags |= kJetsamFlagsHibernated; + } + skip = FALSE; + } else { + skip = TRUE; + } + kern_hibernation_release_node(node); + } else { + skip = FALSE; + } + if (skip) { + proc_rele(p); + } else +#endif + { +#if DEVELOPMENT || DEBUG + if ((jetsam_diagnostic_mode != kJetsamDiagnosticModeNone) && activeProcess) { #if DEBUG - printf("jetsam: pid %d killed - memory_status_level: %d\n", aPid, kern_memorystatus_level); + printf("jetsam: suspending pid %d [%s] (active) for diagnosis - memory_status_level: %d\n", + aPid, (p->p_comm ? p->p_comm: "(unknown)"), kern_memorystatus_level); #endif /* DEBUG */ - return 0; + jetsam_mark_pid_in_snapshot(aPid, kJetsamFlagsSuspForDiagnosis); + jetsam_priority_entry->flags |= kJetsamFlagsSuspForDiagnosis; + task_suspend(p->task); + proc_rele(p); + if (jetsam_diagnostic_mode == kJetsamDiagnosticModeStopAtFirstActive) { + jetsam_diagnostic_suspended_one_active_proc = 1; + printf("jetsam: returning after suspending first active proc - %d\n", aPid); + } + return 0; + } else +#endif /* DEVELOPMENT || DEBUG */ + { + printf("jetsam: killing pid %d [%s] - memory_status_level: %d\n", + aPid, (p->p_comm ? 
p->p_comm : "(unknown)"), kern_memorystatus_level); + jetsam_mark_pid_in_snapshot(aPid, flags); + exit1(p, W_EXITCODE(0, SIGKILL), (int *)NULL); + proc_rele(p); +#if DEBUG + printf("jetsam: pid %d killed - memory_status_level: %d\n", aPid, kern_memorystatus_level); +#endif /* DEBUG */ + return 0; + } + } } lck_mtx_lock(jetsam_list_mlock); } @@ -220,54 +453,235 @@ jetsam_kill_hiwat_proc(void) if (aPid == 0 || (hiwat < 0)) { continue; // with lock held } - lck_mtx_unlock(jetsam_list_mlock); p = proc_find(aPid); if (p != NULL) { int32_t pages = (int32_t)jetsam_task_page_count(p->task); - if (pages > hiwat) { + boolean_t skip = (pages <= hiwat); +#if DEVELOPMENT || DEBUG + if (!skip && (jetsam_diagnostic_mode != kJetsamDiagnosticModeNone)) { + if (jetsam_priority_list[i].flags & kJetsamFlagsSuspForDiagnosis) { + proc_rele(p); + continue; + } + } +#endif /* DEVELOPMENT || DEBUG */ +#if CONFIG_FREEZE + if (!skip) { + hibernation_node *node; + if ((node = kern_hibernation_get_node(aPid))) { + if (node->state & kProcessBusy) { + kern_hibernation_release_node(node); + skip = TRUE; + } else { + kern_hibernation_free_node(node, TRUE); + skip = FALSE; + } + } + } +#endif + if (!skip) { #if DEBUG - printf("jetsam: killing pid %d [%s] - %d pages > hiwat (%d)\n", aPid, p->p_comm, pages, hiwat); + printf("jetsam: %s pid %d [%s] - %d pages > hiwat (%d)\n", + (jetsam_diagnostic_mode != kJetsamDiagnosticModeNone)?"suspending": "killing", aPid, p->p_comm, pages, hiwat); #endif /* DEBUG */ - exit1(p, W_EXITCODE(0, SIGKILL), (int *)NULL); - proc_rele(p); +#if DEVELOPMENT || DEBUG + if (jetsam_diagnostic_mode != kJetsamDiagnosticModeNone) { + lck_mtx_unlock(jetsam_list_mlock); + task_suspend(p->task); + proc_rele(p); #if DEBUG - printf("jetsam: pid %d killed - memory_status_level: %d\n", aPid, kern_memorystatus_level); + printf("jetsam: pid %d suspended for diagnosis - memory_status_level: %d\n", aPid, kern_memorystatus_level); #endif /* DEBUG */ - jetsam_mark_pid_in_snapshot(aPid, kJetsamFlagsKilledHiwat); - jetsam_priority_list[i].pid = 0; + jetsam_mark_pid_in_snapshot(aPid, kJetsamFlagsSuspForDiagnosis); + jetsam_priority_list[i].flags |= kJetsamFlagsSuspForDiagnosis; + } else +#endif /* DEVELOPMENT || DEBUG */ + { + jetsam_priority_list[i].pid = 0; + lck_mtx_unlock(jetsam_list_mlock); + exit1(p, W_EXITCODE(0, SIGKILL), (int *)NULL); + proc_rele(p); +#if DEBUG + printf("jetsam: pid %d killed - memory_status_level: %d\n", aPid, kern_memorystatus_level); +#endif /* DEBUG */ + jetsam_mark_pid_in_snapshot(aPid, kJetsamFlagsKilledHiwat); + } return 0; } else { proc_rele(p); } } - lck_mtx_lock(jetsam_list_mlock); } lck_mtx_unlock(jetsam_list_mlock); return -1; } +#if CONFIG_FREEZE +static void +jetsam_send_hibernation_note(uint32_t flags, pid_t pid, uint32_t pages) { + int ret; + struct kev_msg ev_msg; + jetsam_hibernation_entry_t data; + + ev_msg.vendor_code = KEV_VENDOR_APPLE; + ev_msg.kev_class = KEV_SYSTEM_CLASS; + ev_msg.kev_subclass = KEV_MEMORYSTATUS_SUBCLASS; + + ev_msg.event_code = kMemoryStatusHibernationNote; + + ev_msg.dv[0].data_length = sizeof data; + ev_msg.dv[0].data_ptr = &data; + ev_msg.dv[1].data_length = 0; + + data.pid = pid; + data.flags = flags; + data.pages = pages; + + ret = kev_post_msg(&ev_msg); + if (ret) { + kern_memorystatus_kev_failure_count++; + printf("%s: kev_post_msg() failed, err %d\n", __func__, ret); + } +} + +static int +jetsam_hibernate_top_proc(void) +{ + int hibernate_index; + proc_t p; + uint32_t i; + + lck_mtx_lock(jetsam_list_mlock); + + for (hibernate_index = 
jetsam_priority_list_index; hibernate_index < jetsam_priority_list_count; hibernate_index++) { + pid_t aPid; + uint32_t state = 0; + + aPid = jetsam_priority_list[hibernate_index].pid; + + /* skip empty slots in the list */ + if (aPid == 0) { + continue; // with lock held + } + + if (kern_hibernation_get_process_state(aPid, &state, NULL) != 0) { + continue; // with lock held + } + + /* ensure the process isn't marked as busy and is suspended */ + if ((state & kProcessBusy) || !(state & kProcessSuspended)) { + continue; // with lock held + } + + p = proc_find(aPid); + if (p != NULL) { + hibernation_node *node; + boolean_t skip; + uint32_t purgeable, wired, clean, dirty; + boolean_t shared; + + lck_mtx_unlock(jetsam_list_mlock); + + if ((node = kern_hibernation_get_node(aPid))) { + if (node->state & kProcessBusy) { + skip = TRUE; + } else { + node->state |= kProcessBusy; + /* Whether we hibernate or not, increase the count so can we maintain the gap between hibernated and suspended processes. */ + kern_memorystatus_hibernated_count++; + skip = FALSE; + } + kern_hibernation_release_node(node); + } else { + skip = TRUE; + } + + if (!skip) { + /* Only hibernate processes meeting our size criteria. If not met, mark it as such and return. */ + task_freeze(p->task, &purgeable, &wired, &clean, &dirty, &shared, TRUE); + skip = (dirty < kern_memorystatus_hibernation_pages_min) || (dirty > kern_memorystatus_hibernation_pages_max); + } + + if (!skip) { + unsigned int swap_pages_free = default_pager_swap_pages_free(); + + /* Ensure there's actually enough space free to hibernate this process. */ + if (dirty > swap_pages_free) { + kern_memorystatus_low_swap_pages = swap_pages_free; + skip = TRUE; + } + } + + if (skip) { + kern_hibernation_set_process_state(aPid, kProcessIgnored); + proc_rele(p); + return 0; + } + +#if DEBUG + printf("jetsam: pid %d [%s] hibernating - memory_status_level: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, shared %d, free swap: %d\n", + aPid, (p->p_comm ? p->p_comm : "(unknown)"), kern_memorystatus_level, purgeable, wired, clean, dirty, shared, default_pager_swap_pages_free()); +#endif + + task_freeze(p->task, &purgeable, &wired, &clean, &dirty, &shared, FALSE); + proc_rele(p); + + kern_hibernation_set_process_state(aPid, kProcessHibernated | (shared ? 
0: kProcessNoReclaimWorth)); + + /* Update stats */ + for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) { + throttle_intervals[i].pageouts += dirty; + } + kern_memorystatus_hibernation_pageouts += dirty; + kern_memorystatus_hibernation_count++; + + jetsam_send_hibernation_note(kJetsamFlagsHibernated, aPid, dirty); + + return dirty; + } + } + lck_mtx_unlock(jetsam_list_mlock); + return -1; +} +#endif /* CONFIG_FREEZE */ + static void kern_memorystatus_thread(void) { struct kev_msg ev_msg; jetsam_kernel_stats_t data; + boolean_t post_memorystatus_snapshot = FALSE; int ret; + bzero(&data, sizeof(jetsam_kernel_stats_t)); + bzero(&ev_msg, sizeof(struct kev_msg)); while(1) { - while (kern_memorystatus_level <= kern_memorystatus_level_critical) { - if (jetsam_kill_top_proc() < 0) { +#if DEVELOPMENT || DEBUG + jetsam_diagnostic_suspended_one_active_proc = 0; +#endif /* DEVELOPMENT || DEBUG */ + + while (kern_memorystatus_level <= kern_memorystatus_level_highwater) { + if (jetsam_kill_hiwat_proc() < 0) { break; } + post_memorystatus_snapshot = TRUE; } - while (kern_memorystatus_level <= kern_memorystatus_level_highwater) { - if (jetsam_kill_hiwat_proc() < 0) { + while (kern_memorystatus_level <= kern_memorystatus_level_critical) { + if (jetsam_kill_top_proc(FALSE, kJetsamFlagsKilled) < 0) { break; } + post_memorystatus_snapshot = TRUE; +#if DEVELOPMENT || DEBUG + if ((jetsam_diagnostic_mode == kJetsamDiagnosticModeStopAtFirstActive) && jetsam_diagnostic_suspended_one_active_proc) { + printf("jetsam: stopping killing since 1 active proc suspended already for diagnosis\n"); + break; // we found first active proc, let's not kill any more + } +#endif /* DEVELOPMENT || DEBUG */ } - + kern_memorystatus_last_level = kern_memorystatus_level; ev_msg.vendor_code = KEV_VENDOR_APPLE; @@ -295,7 +709,7 @@ kern_memorystatus_thread(void) printf("%s: kev_post_msg() failed, err %d\n", __func__, ret); } - if (jetsam_snapshot_list_count) { + if (post_memorystatus_snapshot) { size_t snapshot_size = sizeof(jetsam_kernel_stats_t) + sizeof(size_t) + sizeof(jetsam_snapshot_entry_t) * jetsam_snapshot_list_count; ev_msg.event_code = kMemoryStatusSnapshotNote; ev_msg.dv[0].data_length = sizeof snapshot_size; @@ -318,6 +732,349 @@ kern_memorystatus_thread(void) } } +#if CONFIG_FREEZE + +__private_extern__ void +kern_hibernation_init(void) +{ + hibernation_lck_attr = lck_attr_alloc_init(); + hibernation_lck_grp_attr = lck_grp_attr_alloc_init(); + hibernation_lck_grp = lck_grp_alloc_init("hibernation", hibernation_lck_grp_attr); + hibernation_mlock = lck_mtx_alloc_init(hibernation_lck_grp, hibernation_lck_attr); + + RB_INIT(&hibernation_tree_head); + + (void)kernel_thread(kernel_task, kern_hibernation_thread); +} + +static inline boolean_t +kern_hibernation_can_hibernate_processes(void) +{ + boolean_t ret; + + lck_mtx_lock_spin(hibernation_mlock); + ret = (kern_memorystatus_suspended_count - kern_memorystatus_hibernated_count) > + kern_memorystatus_hibernation_suspended_minimum ? TRUE : FALSE; + lck_mtx_unlock(hibernation_mlock); + + return ret; +} + +static boolean_t +kern_hibernation_can_hibernate(void) +{ + /* Only hibernate if we're sufficiently low on memory; this holds off hibernation right after boot, + and is generally is a no-op once we've reached steady state. */ + if (kern_memorystatus_level > kern_memorystatus_level_hibernate) { + return FALSE; + } + + /* Check minimum suspended process threshold. 
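The hibernation notes posted through kev_post_msg() above can be observed from userspace with a PF_SYSTEM kernel-event socket. Sketch only: the subclass constant normally comes from a private header, so the local definition is an assumption.

#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/kern_event.h>
#include <stdio.h>
#include <unistd.h>

#ifndef KEV_MEMORYSTATUS_SUBCLASS
#define KEV_MEMORYSTATUS_SUBCLASS	3	/* assumption: private value */
#endif

int
main(void)
{
	struct kev_request req = {
		.vendor_code  = KEV_VENDOR_APPLE,
		.kev_class    = KEV_SYSTEM_CLASS,
		.kev_subclass = KEV_MEMORYSTATUS_SUBCLASS,
	};
	char buf[1024];
	int s = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);

	if (s < 0 || ioctl(s, SIOCSKEVFILT, &req) < 0)
		return (1);
	for (;;) {
		ssize_t n = recv(s, buf, sizeof (buf), 0);
		if (n <= 0)
			break;
		struct kern_event_msg *ev = (struct kern_event_msg *)(void *)buf;
		printf("memorystatus event code %u\n", ev->event_code);
	}
	close(s);
	return (0);
}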
+#if CONFIG_FREEZE
+
+__private_extern__ void
+kern_hibernation_init(void)
+{
+	hibernation_lck_attr = lck_attr_alloc_init();
+	hibernation_lck_grp_attr = lck_grp_attr_alloc_init();
+	hibernation_lck_grp = lck_grp_alloc_init("hibernation", hibernation_lck_grp_attr);
+	hibernation_mlock = lck_mtx_alloc_init(hibernation_lck_grp, hibernation_lck_attr);
+
+	RB_INIT(&hibernation_tree_head);
+
+	(void)kernel_thread(kernel_task, kern_hibernation_thread);
+}
+
+static inline boolean_t
+kern_hibernation_can_hibernate_processes(void)
+{
+	boolean_t ret;
+
+	lck_mtx_lock_spin(hibernation_mlock);
+	ret = (kern_memorystatus_suspended_count - kern_memorystatus_hibernated_count) >
+		kern_memorystatus_hibernation_suspended_minimum ? TRUE : FALSE;
+	lck_mtx_unlock(hibernation_mlock);
+
+	return ret;
+}
+
+static boolean_t
+kern_hibernation_can_hibernate(void)
+{
+	/* Only hibernate if we're sufficiently low on memory; this holds off hibernation right after boot,
+	   and is generally a no-op once we've reached steady state. */
+	if (kern_memorystatus_level > kern_memorystatus_level_hibernate) {
+		return FALSE;
+	}
+
+	/* Check minimum suspended process threshold. */
+	if (!kern_hibernation_can_hibernate_processes()) {
+		return FALSE;
+	}
+
+	/* Is swap running low? */
+	if (kern_memorystatus_low_swap_pages) {
+		/* If there's been no movement in free swap pages since we last attempted hibernation, return. */
+		if (default_pager_swap_pages_free() <= kern_memorystatus_low_swap_pages) {
+			return FALSE;
+		}
+
+		/* Pages have been freed, so we can retry. */
+		kern_memorystatus_low_swap_pages = 0;
+	}
+
+	/* OK */
+	return TRUE;
+}
+
+static void
+kern_hibernation_add_node(hibernation_node *node)
+{
+	lck_mtx_lock_spin(hibernation_mlock);
+
+	RB_INSERT(hibernation_tree, &hibernation_tree_head, node);
+	kern_memorystatus_suspended_count++;
+
+	lck_mtx_unlock(hibernation_mlock);
+}
+
+/* Returns with the hibernation lock taken */
+static hibernation_node *
+kern_hibernation_get_node(pid_t pid)
+{
+	hibernation_node sought, *found;
+	sought.pid = pid;
+	lck_mtx_lock_spin(hibernation_mlock);
+	found = RB_FIND(hibernation_tree, &hibernation_tree_head, &sought);
+	if (!found) {
+		lck_mtx_unlock(hibernation_mlock);
+	}
+	return found;
+}
+
+static void
+kern_hibernation_release_node(hibernation_node *node)
+{
+#pragma unused(node)
+	lck_mtx_unlock(hibernation_mlock);
+}
+
+static void
+kern_hibernation_free_node(hibernation_node *node, boolean_t unlock)
+{
+	/* make sure we're called with the hibernation_mlock held */
+	lck_mtx_assert(hibernation_mlock, LCK_MTX_ASSERT_OWNED);
+
+	if (node->state & (kProcessHibernated | kProcessIgnored)) {
+		kern_memorystatus_hibernated_count--;
+	}
+
+	kern_memorystatus_suspended_count--;
+
+	RB_REMOVE(hibernation_tree, &hibernation_tree_head, node);
+	kfree(node, sizeof(hibernation_node));
+
+	if (unlock) {
+		lck_mtx_unlock(hibernation_mlock);
+	}
+}
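/*
 * Illustrative sketch (not part of the patch): kern_hibernation_get_node()
 * above returns with hibernation_mlock still held on success, so every
 * successful lookup must be paired with kern_hibernation_release_node()
 * (or a free with unlock) on the same path.  A typical caller looks like
 * this; the state update shown is a placeholder:
 */
static void
example_node_usage(pid_t pid)
{
	hibernation_node *node;

	if ((node = kern_hibernation_get_node(pid))) {
		/* lock held: the node can't be freed out from under us */
		node->state |= kProcessBusy;
		kern_hibernation_release_node(node);	/* drops the lock */
	}
	/* on lookup failure the lock has already been dropped */
}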
+static void
+kern_hibernation_register_pid(pid_t pid)
+{
+	hibernation_node *node;
+
+#if DEVELOPMENT || DEBUG
+	node = kern_hibernation_get_node(pid);
+	if (node) {
+		printf("kern_hibernation_register_pid: pid %d already registered!\n", pid);
+		kern_hibernation_release_node(node);
+		return;
+	}
+#endif
+
+	/* Register as a candidate for hibernation */
+	node = (hibernation_node *)kalloc(sizeof(hibernation_node));
+	if (node) {
+		clock_sec_t sec;
+		clock_nsec_t nsec;
+		mach_timespec_t ts;
+
+		memset(node, 0, sizeof(hibernation_node));
+
+		node->pid = pid;
+		node->state = kProcessSuspended;
+
+		clock_get_system_nanotime(&sec, &nsec);
+		ts.tv_sec = sec;
+		ts.tv_nsec = nsec;
+
+		node->hibernation_ts = ts;
+
+		kern_hibernation_add_node(node);
+	}
+}
+
+static void
+kern_hibernation_unregister_pid(pid_t pid)
+{
+	hibernation_node *node;
+
+	node = kern_hibernation_get_node(pid);
+	if (node) {
+		kern_hibernation_free_node(node, TRUE);
+	}
+}
+
+void
+kern_hibernation_on_pid_suspend(pid_t pid)
+{
+	kern_hibernation_register_pid(pid);
+}
+
+/* If enabled, we bring all the hibernated pages back prior to resumption; otherwise, they're faulted back in on demand */
+#define THAW_ON_RESUME 1
+
+void
+kern_hibernation_on_pid_resume(pid_t pid, task_t task)
+{
+#if THAW_ON_RESUME
+	hibernation_node *node;
+	if ((node = kern_hibernation_get_node(pid))) {
+		if (node->state & kProcessHibernated) {
+			node->state |= kProcessBusy;
+			kern_hibernation_release_node(node);
+			task_thaw(task);
+			jetsam_send_hibernation_note(kJetsamFlagsThawed, pid, 0);
+		} else {
+			kern_hibernation_release_node(node);
+		}
+	}
+#else
+#pragma unused(task)
+#endif
+	kern_hibernation_unregister_pid(pid);
+}
+
+void
+kern_hibernation_on_pid_hibernate(pid_t pid)
+{
+#pragma unused(pid)
+
+	/* Wake the hibernation thread */
+	thread_wakeup((event_t)&kern_hibernation_wakeup);
+}
+
+static int
+kern_hibernation_get_process_state(pid_t pid, uint32_t *state, mach_timespec_t *ts)
+{
+	hibernation_node *found;
+	int err = ESRCH;
+
+	*state = 0;
+
+	found = kern_hibernation_get_node(pid);
+	if (found) {
+		*state = found->state;
+		if (ts) {
+			*ts = found->hibernation_ts;
+		}
+		err = 0;
+		kern_hibernation_release_node(found);
+	}
+
+	return err;
+}
+
+static int
+kern_hibernation_set_process_state(pid_t pid, uint32_t state)
+{
+	hibernation_node *found;
+	int err = ESRCH;
+
+	found = kern_hibernation_get_node(pid);
+	if (found) {
+		found->state = state;
+		err = 0;
+		kern_hibernation_release_node(found);
+	}
+
+	return err;
+}
+
+static void
+kern_hibernation_update_throttle_interval(mach_timespec_t *ts, struct throttle_interval_t *interval)
+{
+	if (CMP_MACH_TIMESPEC(ts, &interval->ts) >= 0) {
+		if (!interval->max_pageouts) {
+			interval->max_pageouts = (interval->burst_multiple * (((uint64_t)interval->mins * HIBERNATION_DAILY_PAGEOUTS_MAX) / (24 * 60)));
+		} else {
+			printf("jetsam: %d minute throttle timeout, resetting\n", interval->mins);
+		}
+		interval->ts.tv_sec = interval->mins * 60;
+		interval->ts.tv_nsec = 0;
+		ADD_MACH_TIMESPEC(&interval->ts, ts);
+		/* Since we update the throttle stats pre-hibernation, adjust for overshoot here */
+		if (interval->pageouts > interval->max_pageouts) {
+			interval->pageouts -= interval->max_pageouts;
+		} else {
+			interval->pageouts = 0;
+		}
+		interval->throttle = FALSE;
+	} else if (!interval->throttle && interval->pageouts >= interval->max_pageouts) {
+		printf("jetsam: %d minute pageout limit exceeded; enabling throttle\n", interval->mins);
+		interval->throttle = TRUE;
+	}
+#ifdef DEBUG
+	printf("jetsam: throttle updated - %d frozen (%d max) within %dm; %dm remaining; throttle %s\n",
+		interval->pageouts, interval->max_pageouts, interval->mins, (interval->ts.tv_sec - ts->tv_sec) / 60,
+		interval->throttle ? "on" : "off");
+#endif
}
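/*
 * Illustrative arithmetic (not part of the patch): each interval's pageout
 * budget is its pro-rata share of the daily maximum, scaled by a burst
 * multiplier:
 *
 *	max_pageouts = burst_multiple * (mins * HIBERNATION_DAILY_PAGEOUTS_MAX / (24 * 60))
 *
 * e.g. assuming, purely for illustration, a daily cap of 24000 pages: a
 * 60-minute interval with burst_multiple 2 gets 2 * (60 * 24000 / 1440) =
 * 2000 pages, while a 1440-minute (daily) interval with burst_multiple 1
 * gets the full 24000.  Short intervals may therefore burst above their
 * strict share, but the daily interval still bounds the running total.
 */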
+
+static boolean_t
+kern_hibernation_throttle_update(void)
+{
+	clock_sec_t sec;
+	clock_nsec_t nsec;
+	mach_timespec_t ts;
+	uint32_t i;
+	boolean_t throttled = FALSE;
+
+#if DEVELOPMENT || DEBUG
+	if (!kern_memorystatus_hibernation_throttle_enabled)
+		return FALSE;
+#endif
+
+	clock_get_system_nanotime(&sec, &nsec);
+	ts.tv_sec = sec;
+	ts.tv_nsec = nsec;
+
+	/* Check hibernation pageouts over multiple intervals and throttle if we've exceeded our budget.
+	 *
+	 * This ensures that periods of inactivity can't be used as 'credit' towards hibernation if the device has
+	 * remained dormant for a long period. We do, however, allow increased thresholds for shorter intervals in
+	 * order to allow for bursts of activity.
+	 */
+	for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
+		kern_hibernation_update_throttle_interval(&ts, &throttle_intervals[i]);
+		if (throttle_intervals[i].throttle == TRUE)
+			throttled = TRUE;
+	}
+
+	return throttled;
+}
+
+static void
+kern_hibernation_cull(void)
+{
+	hibernation_node *node, *next;
+	lck_mtx_lock(hibernation_mlock);
+
+	for (node = RB_MIN(hibernation_tree, &hibernation_tree_head); node != NULL; node = next) {
+		proc_t p;
+
+		next = RB_NEXT(hibernation_tree, &hibernation_tree_head, node);
+
+		/* TODO: probably suboptimal, so revisit should it cause a performance issue */
+		p = proc_find(node->pid);
+		if (p) {
+			proc_rele(p);
+		} else {
+			kern_hibernation_free_node(node, FALSE);
+		}
+	}
+
+	lck_mtx_unlock(hibernation_mlock);
+}
+
+static void
+kern_hibernation_thread(void)
+{
+	if (vm_freeze_enabled) {
+		if (kern_hibernation_can_hibernate()) {
+
+			/* Cull dead processes */
+			kern_hibernation_cull();
+
+			/* Only hibernate if we've not exceeded our pageout budgets */
+			if (!kern_hibernation_throttle_update()) {
+				jetsam_hibernate_top_proc();
+			} else {
+				printf("kern_hibernation_thread: in throttle, ignoring hibernation\n");
+				kern_memorystatus_hibernation_throttle_count++; /* Throttled, update stats */
+			}
+		}
+	}
+
+	assert_wait((event_t) &kern_hibernation_wakeup, THREAD_UNINT);
+	thread_block((thread_continue_t) kern_hibernation_thread);
+}
+
+#endif /* CONFIG_FREEZE */
+
 static int
 sysctl_io_variable(struct sysctl_req *req, void *pValue, size_t currentsize, size_t maxsize, size_t *newsize)
 {
@@ -362,19 +1119,24 @@ sysctl_handle_kern_memorystatus_priority_list(__unused struct sysctl_oid *oid, _
 	ret = sysctl_io_variable(req, &temp_list[0], currentsize, sizeof(temp_list), &newsize);
 	if (!ret && req->newptr) {
-		jetsam_priority_list_count = newsize / sizeof(jetsam_priority_list[0]);
+		int temp_list_count = newsize / sizeof(jetsam_priority_list[0]);
 #if DEBUG
 		printf("set jetsam priority pids = { ");
-		for (i = 0; i < jetsam_priority_list_count; i++) {
+		for (i = 0; i < temp_list_count; i++) {
 			printf("(%d, 0x%08x, %d) ", temp_list[i].pid, temp_list[i].flags, temp_list[i].hiwat_pages);
 		}
 		printf("}\n");
 #endif /* DEBUG */
 		lck_mtx_lock(jetsam_list_mlock);
-		for (i = 0; i < jetsam_priority_list_count; i++) {
+#if CONFIG_FREEZE
+		jetsam_priority_list_hibernation_index = 0;
+#endif
+		jetsam_priority_list_index = 0;
+		jetsam_priority_list_count = temp_list_count;
+		for (i = 0; i < temp_list_count; i++) {
 			jetsam_priority_list[i] = temp_list[i];
 		}
-		for (i = jetsam_priority_list_count; i < kMaxPriorityEntries; i++) {
+		for (i = temp_list_count; i < kMaxPriorityEntries; i++) {
 			jetsam_priority_list[i].pid = 0;
 			jetsam_priority_list[i].flags = 0;
 			jetsam_priority_list[i].hiwat_pages = -1;
@@ -382,7 +1144,6 @@ sysctl_handle_kern_memorystatus_priority_list(__unused struct sysctl_oid *oid, _
 			jetsam_priority_list[i].hiwat_reserved2 = -1;
 			jetsam_priority_list[i].hiwat_reserved3 = -1;
 		}
-		jetsam_priority_list_index = 0;
 		lck_mtx_unlock(jetsam_list_mlock);
 	}
 	return ret;
@@ -421,5 +1182,5 @@ sysctl_handle_kern_memorystatus_snapshot(__unused struct sysctl_oid *oid, __unus
 	return ret;
 }
 
-SYSCTL_PROC(_kern, OID_AUTO, memorystatus_priority_list, CTLTYPE_OPAQUE|CTLFLAG_RW, 0, 0, sysctl_handle_kern_memorystatus_priority_list, "S,jetsam_priorities", "");
+SYSCTL_PROC(_kern, OID_AUTO, memorystatus_priority_list, CTLTYPE_OPAQUE|CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_handle_kern_memorystatus_priority_list, "S,jetsam_priorities", "");
 SYSCTL_PROC(_kern, OID_AUTO,
memorystatus_snapshot, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_handle_kern_memorystatus_snapshot, "S,jetsam_snapshot", ""); diff --git a/bsd/kern/kern_mib.c b/bsd/kern/kern_mib.c index 658f13860..f82717429 100644 --- a/bsd/kern/kern_mib.c +++ b/bsd/kern/kern_mib.c @@ -333,34 +333,34 @@ sysctl_tbfrequency /* * hw.* MIB variables. */ -SYSCTL_PROC (_hw, HW_NCPU, ncpu, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_NCPU, sysctl_hw_generic, "I", ""); -SYSCTL_PROC (_hw, HW_AVAILCPU, activecpu, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_AVAILCPU, sysctl_hw_generic, "I", ""); -SYSCTL_PROC (_hw, OID_AUTO, physicalcpu, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_LOCAL_PHYSICALCPU, sysctl_hw_generic, "I", ""); -SYSCTL_PROC (_hw, OID_AUTO, physicalcpu_max, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_LOCAL_PHYSICALCPUMAX, sysctl_hw_generic, "I", ""); -SYSCTL_PROC (_hw, OID_AUTO, logicalcpu, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_LOCAL_LOGICALCPU, sysctl_hw_generic, "I", ""); -SYSCTL_PROC (_hw, OID_AUTO, logicalcpu_max, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_LOCAL_LOGICALCPUMAX, sysctl_hw_generic, "I", ""); -SYSCTL_INT (_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD | CTLFLAG_KERN, (int *)NULL, BYTE_ORDER, ""); -SYSCTL_INT (_hw, OID_AUTO, cputype, CTLFLAG_RD | CTLFLAG_KERN, &cputype, 0, ""); -SYSCTL_INT (_hw, OID_AUTO, cpusubtype, CTLFLAG_RD | CTLFLAG_KERN, &cpusubtype, 0, ""); -SYSCTL_INT (_hw, OID_AUTO, cpu64bit_capable, CTLFLAG_RD | CTLFLAG_KERN, &cpu64bit, 0, ""); -SYSCTL_INT (_hw, OID_AUTO, cpufamily, CTLFLAG_RD | CTLFLAG_KERN, &cpufamily, 0, ""); -SYSCTL_OPAQUE (_hw, OID_AUTO, cacheconfig, CTLFLAG_RD, &cacheconfig, sizeof(cacheconfig), "Q", ""); -SYSCTL_OPAQUE (_hw, OID_AUTO, cachesize, CTLFLAG_RD, &cachesize, sizeof(cachesize), "Q", ""); -SYSCTL_PROC (_hw, OID_AUTO, pagesize, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN, 0, 0, sysctl_pagesize, "Q", ""); -SYSCTL_QUAD (_hw, OID_AUTO, busfrequency, CTLFLAG_RD | CTLFLAG_KERN, &gPEClockFrequencyInfo.bus_frequency_hz, ""); -SYSCTL_QUAD (_hw, OID_AUTO, busfrequency_min, CTLFLAG_RD | CTLFLAG_KERN, &gPEClockFrequencyInfo.bus_frequency_min_hz, ""); -SYSCTL_QUAD (_hw, OID_AUTO, busfrequency_max, CTLFLAG_RD | CTLFLAG_KERN, &gPEClockFrequencyInfo.bus_frequency_max_hz, ""); -SYSCTL_QUAD (_hw, OID_AUTO, cpufrequency, CTLFLAG_RD | CTLFLAG_KERN, &gPEClockFrequencyInfo.cpu_frequency_hz, ""); -SYSCTL_QUAD (_hw, OID_AUTO, cpufrequency_min, CTLFLAG_RD | CTLFLAG_KERN, &gPEClockFrequencyInfo.cpu_frequency_min_hz, ""); -SYSCTL_QUAD (_hw, OID_AUTO, cpufrequency_max, CTLFLAG_RD | CTLFLAG_KERN, &gPEClockFrequencyInfo.cpu_frequency_max_hz, ""); -SYSCTL_PROC (_hw, OID_AUTO, cachelinesize, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_CACHELINE | CTLHW_RETQUAD, sysctl_hw_generic, "Q", ""); -SYSCTL_PROC (_hw, OID_AUTO, l1icachesize, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_L1ICACHESIZE | CTLHW_RETQUAD, sysctl_hw_generic, "Q", ""); -SYSCTL_PROC (_hw, OID_AUTO, l1dcachesize, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_L1DCACHESIZE | CTLHW_RETQUAD, sysctl_hw_generic, "Q", ""); -SYSCTL_PROC (_hw, OID_AUTO, l2cachesize, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_L2CACHESIZE | CTLHW_RETQUAD, sysctl_hw_generic, "Q", ""); -SYSCTL_PROC (_hw, OID_AUTO, l3cachesize, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN, 0, HW_L3CACHESIZE | CTLHW_RETQUAD, sysctl_hw_generic, "Q", ""); -SYSCTL_PROC(_hw, OID_AUTO, tbfrequency, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN, 0, 0, sysctl_tbfrequency, "Q", ""); -SYSCTL_QUAD (_hw, HW_MEMSIZE, memsize, CTLFLAG_RD | CTLFLAG_KERN, 
&max_mem, ""); -SYSCTL_INT (_hw, OID_AUTO, packages, CTLFLAG_RD | CTLFLAG_KERN, &packages, 0, ""); +SYSCTL_PROC (_hw, HW_NCPU, ncpu, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, HW_NCPU, sysctl_hw_generic, "I", ""); +SYSCTL_PROC (_hw, HW_AVAILCPU, activecpu, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, HW_AVAILCPU, sysctl_hw_generic, "I", ""); +SYSCTL_PROC (_hw, OID_AUTO, physicalcpu, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, HW_LOCAL_PHYSICALCPU, sysctl_hw_generic, "I", ""); +SYSCTL_PROC (_hw, OID_AUTO, physicalcpu_max, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, HW_LOCAL_PHYSICALCPUMAX, sysctl_hw_generic, "I", ""); +SYSCTL_PROC (_hw, OID_AUTO, logicalcpu, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, HW_LOCAL_LOGICALCPU, sysctl_hw_generic, "I", ""); +SYSCTL_PROC (_hw, OID_AUTO, logicalcpu_max, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, HW_LOCAL_LOGICALCPUMAX, sysctl_hw_generic, "I", ""); +SYSCTL_INT (_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (int *)NULL, BYTE_ORDER, ""); +SYSCTL_INT (_hw, OID_AUTO, cputype, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &cputype, 0, ""); +SYSCTL_INT (_hw, OID_AUTO, cpusubtype, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &cpusubtype, 0, ""); +SYSCTL_INT (_hw, OID_AUTO, cpu64bit_capable, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &cpu64bit, 0, ""); +SYSCTL_INT (_hw, OID_AUTO, cpufamily, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &cpufamily, 0, ""); +SYSCTL_OPAQUE (_hw, OID_AUTO, cacheconfig, CTLFLAG_RD | CTLFLAG_LOCKED, &cacheconfig, sizeof(cacheconfig), "Q", ""); +SYSCTL_OPAQUE (_hw, OID_AUTO, cachesize, CTLFLAG_RD | CTLFLAG_LOCKED, &cachesize, sizeof(cachesize), "Q", ""); +SYSCTL_PROC (_hw, OID_AUTO, pagesize, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_pagesize, "Q", ""); +SYSCTL_QUAD (_hw, OID_AUTO, busfrequency, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gPEClockFrequencyInfo.bus_frequency_hz, ""); +SYSCTL_QUAD (_hw, OID_AUTO, busfrequency_min, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gPEClockFrequencyInfo.bus_frequency_min_hz, ""); +SYSCTL_QUAD (_hw, OID_AUTO, busfrequency_max, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gPEClockFrequencyInfo.bus_frequency_max_hz, ""); +SYSCTL_QUAD (_hw, OID_AUTO, cpufrequency, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gPEClockFrequencyInfo.cpu_frequency_hz, ""); +SYSCTL_QUAD (_hw, OID_AUTO, cpufrequency_min, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gPEClockFrequencyInfo.cpu_frequency_min_hz, ""); +SYSCTL_QUAD (_hw, OID_AUTO, cpufrequency_max, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gPEClockFrequencyInfo.cpu_frequency_max_hz, ""); +SYSCTL_PROC (_hw, OID_AUTO, cachelinesize, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, HW_CACHELINE | CTLHW_RETQUAD, sysctl_hw_generic, "Q", ""); +SYSCTL_PROC (_hw, OID_AUTO, l1icachesize, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, HW_L1ICACHESIZE | CTLHW_RETQUAD, sysctl_hw_generic, "Q", ""); +SYSCTL_PROC (_hw, OID_AUTO, l1dcachesize, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, HW_L1DCACHESIZE | CTLHW_RETQUAD, sysctl_hw_generic, "Q", ""); +SYSCTL_PROC (_hw, OID_AUTO, l2cachesize, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, HW_L2CACHESIZE | CTLHW_RETQUAD, sysctl_hw_generic, "Q", ""); +SYSCTL_PROC (_hw, OID_AUTO, l3cachesize, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, HW_L3CACHESIZE | CTLHW_RETQUAD, sysctl_hw_generic, "Q", 
""); +SYSCTL_PROC(_hw, OID_AUTO, tbfrequency, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_tbfrequency, "Q", ""); +SYSCTL_QUAD (_hw, HW_MEMSIZE, memsize, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &max_mem, ""); +SYSCTL_INT (_hw, OID_AUTO, packages, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &packages, 0, ""); /* * Optional features can register nodes below hw.optional. @@ -372,7 +372,7 @@ SYSCTL_INT (_hw, OID_AUTO, packages, CTLFLAG_RD | CTLFLAG_KERN, &packages, 0 */ SYSCTL_NODE(_hw, OID_AUTO, optional, CTLFLAG_RW|CTLFLAG_LOCKED, NULL, "optional features"); -SYSCTL_INT(_hw_optional, OID_AUTO, floatingpoint, CTLFLAG_RD | CTLFLAG_KERN, (int *)NULL, 1, ""); /* always set */ +SYSCTL_INT(_hw_optional, OID_AUTO, floatingpoint, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (int *)NULL, 1, ""); /* always set */ /* * Deprecated variables. These are supported for backwards compatibility @@ -386,44 +386,26 @@ SYSCTL_INT(_hw_optional, OID_AUTO, floatingpoint, CTLFLAG_RD | CTLFLAG_KERN, (in * * The *_compat nodes are *NOT* visible within the kernel. */ -SYSCTL_COMPAT_INT (_hw, HW_PAGESIZE, pagesize_compat, CTLFLAG_RD | CTLFLAG_MASKED, &page_size, 0, ""); -SYSCTL_COMPAT_INT (_hw, HW_BUS_FREQ, busfrequency_compat, CTLFLAG_RD | CTLFLAG_MASKED, &gPEClockFrequencyInfo.bus_clock_rate_hz, 0, ""); -SYSCTL_COMPAT_INT (_hw, HW_CPU_FREQ, cpufrequency_compat, CTLFLAG_RD | CTLFLAG_MASKED, &gPEClockFrequencyInfo.cpu_clock_rate_hz, 0, ""); -SYSCTL_PROC(_hw, HW_CACHELINE, cachelinesize_compat, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_CACHELINE, sysctl_hw_generic, "I", ""); -SYSCTL_PROC(_hw, HW_L1ICACHESIZE, l1icachesize_compat, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_L1ICACHESIZE, sysctl_hw_generic, "I", ""); -SYSCTL_PROC(_hw, HW_L1DCACHESIZE, l1dcachesize_compat, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_L1DCACHESIZE, sysctl_hw_generic, "I", ""); -SYSCTL_PROC(_hw, HW_L2CACHESIZE, l2cachesize_compat, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_L2CACHESIZE, sysctl_hw_generic, "I", ""); -SYSCTL_PROC(_hw, HW_L3CACHESIZE, l3cachesize_compat, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_L3CACHESIZE, sysctl_hw_generic, "I", ""); -SYSCTL_COMPAT_INT (_hw, HW_TB_FREQ, tbfrequency_compat, CTLFLAG_RD | CTLFLAG_MASKED, &gPEClockFrequencyInfo.timebase_frequency_hz, 0, ""); -SYSCTL_PROC(_hw, HW_MACHINE, machine, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_MACHINE, sysctl_hw_generic, "A", ""); -SYSCTL_PROC(_hw, HW_MODEL, model, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_MODEL, sysctl_hw_generic, "A", ""); -SYSCTL_COMPAT_UINT(_hw, HW_PHYSMEM, physmem, CTLFLAG_RD | CTLFLAG_MASKED, &mem_size, 0, ""); -SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_USERMEM, sysctl_hw_generic, "I", ""); -SYSCTL_PROC(_hw, HW_EPOCH, epoch, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_EPOCH, sysctl_hw_generic, "I", ""); -SYSCTL_PROC(_hw, HW_VECTORUNIT, vectorunit, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_VECTORUNIT, sysctl_hw_generic, "I", ""); -SYSCTL_PROC(_hw, HW_L2SETTINGS, l2settings, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_L2SETTINGS, sysctl_hw_generic, "I", ""); -SYSCTL_PROC(_hw, HW_L3SETTINGS, l3settings, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED, 0, HW_L3SETTINGS, sysctl_hw_generic, "I", ""); -SYSCTL_INT (_hw, OID_AUTO, cputhreadtype, CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN, &cputhreadtype, 0, ""); - -#ifdef __ppc__ -int altivec_flag = -1; -int graphicsops_flag = -1; -int x64bitops_flag = -1; -int fsqrt_flag = 
-1; -int stfiwx_flag = -1; -int dcba_flag = -1; -int datastreams_flag = -1; -int dcbtstreams_flag = -1; - -SYSCTL_INT(_hw_optional, OID_AUTO, altivec, CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN, &altivec_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, graphicsops, CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN, &graphicsops_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, 64bitops, CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN, &x64bitops_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, fsqrt, CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN, &fsqrt_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, stfiwx, CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN, &stfiwx_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, dcba, CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN, &dcba_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, datastreams, CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN, &datastreams_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, dcbtstreams, CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN, &dcbtstreams_flag, 0, ""); -#elif defined (__i386__) || defined (__x86_64__) +SYSCTL_COMPAT_INT (_hw, HW_PAGESIZE, pagesize_compat, CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, &page_size, 0, ""); +SYSCTL_COMPAT_INT (_hw, HW_BUS_FREQ, busfrequency_compat, CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, &gPEClockFrequencyInfo.bus_clock_rate_hz, 0, ""); +SYSCTL_COMPAT_INT (_hw, HW_CPU_FREQ, cpufrequency_compat, CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, &gPEClockFrequencyInfo.cpu_clock_rate_hz, 0, ""); +SYSCTL_PROC(_hw, HW_CACHELINE, cachelinesize_compat, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, HW_CACHELINE, sysctl_hw_generic, "I", ""); +SYSCTL_PROC(_hw, HW_L1ICACHESIZE, l1icachesize_compat, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, HW_L1ICACHESIZE, sysctl_hw_generic, "I", ""); +SYSCTL_PROC(_hw, HW_L1DCACHESIZE, l1dcachesize_compat, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, HW_L1DCACHESIZE, sysctl_hw_generic, "I", ""); +SYSCTL_PROC(_hw, HW_L2CACHESIZE, l2cachesize_compat, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, HW_L2CACHESIZE, sysctl_hw_generic, "I", ""); +SYSCTL_PROC(_hw, HW_L3CACHESIZE, l3cachesize_compat, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, HW_L3CACHESIZE, sysctl_hw_generic, "I", ""); +SYSCTL_COMPAT_INT (_hw, HW_TB_FREQ, tbfrequency_compat, CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, &gPEClockFrequencyInfo.timebase_frequency_hz, 0, ""); +SYSCTL_PROC(_hw, HW_MACHINE, machine, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, HW_MACHINE, sysctl_hw_generic, "A", ""); +SYSCTL_PROC(_hw, HW_MODEL, model, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, HW_MODEL, sysctl_hw_generic, "A", ""); +SYSCTL_COMPAT_UINT(_hw, HW_PHYSMEM, physmem, CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, &mem_size, 0, ""); +SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, HW_USERMEM, sysctl_hw_generic, "I", ""); +SYSCTL_PROC(_hw, HW_EPOCH, epoch, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, HW_EPOCH, sysctl_hw_generic, "I", ""); +SYSCTL_PROC(_hw, HW_VECTORUNIT, vectorunit, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, HW_VECTORUNIT, sysctl_hw_generic, "I", ""); +SYSCTL_PROC(_hw, HW_L2SETTINGS, l2settings, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, HW_L2SETTINGS, sysctl_hw_generic, "I", ""); +SYSCTL_PROC(_hw, HW_L3SETTINGS, l3settings, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 
0, HW_L3SETTINGS, sysctl_hw_generic, "I", ""); +SYSCTL_INT (_hw, OID_AUTO, cputhreadtype, CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, &cputhreadtype, 0, ""); + +#if defined (__i386__) || defined (__x86_64__) int mmx_flag = -1; int sse_flag = -1; int sse2_flag = -1; @@ -433,22 +415,27 @@ int sse4_2_flag = -1; int x86_64_flag = -1; int supplementalsse3_flag = -1; int aes_flag = -1; - -SYSCTL_INT(_hw_optional, OID_AUTO, mmx, CTLFLAG_RD | CTLFLAG_KERN, &mmx_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, sse, CTLFLAG_RD | CTLFLAG_KERN, &sse_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, sse2, CTLFLAG_RD | CTLFLAG_KERN, &sse2_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, sse3, CTLFLAG_RD | CTLFLAG_KERN, &sse3_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, supplementalsse3, CTLFLAG_RD | CTLFLAG_KERN, &supplementalsse3_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, sse4_1, CTLFLAG_RD | CTLFLAG_KERN, &sse4_1_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, sse4_2, CTLFLAG_RD | CTLFLAG_KERN, &sse4_2_flag, 0, ""); +int avx1_0_flag = -1; + +SYSCTL_INT(_hw_optional, OID_AUTO, mmx, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &mmx_flag, 0, ""); +SYSCTL_INT(_hw_optional, OID_AUTO, sse, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &sse_flag, 0, ""); +SYSCTL_INT(_hw_optional, OID_AUTO, sse2, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &sse2_flag, 0, ""); +SYSCTL_INT(_hw_optional, OID_AUTO, sse3, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &sse3_flag, 0, ""); +SYSCTL_INT(_hw_optional, OID_AUTO, supplementalsse3, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &supplementalsse3_flag, 0, ""); +SYSCTL_INT(_hw_optional, OID_AUTO, sse4_1, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &sse4_1_flag, 0, ""); +SYSCTL_INT(_hw_optional, OID_AUTO, sse4_2, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &sse4_2_flag, 0, ""); /* "x86_64" is actually a preprocessor symbol on the x86_64 kernel, so we have to hack this */ #undef x86_64 -SYSCTL_INT(_hw_optional, OID_AUTO, x86_64, CTLFLAG_RD | CTLFLAG_KERN, &x86_64_flag, 0, ""); -SYSCTL_INT(_hw_optional, OID_AUTO, aes, CTLFLAG_RD | CTLFLAG_KERN, &aes_flag, 0, ""); -#endif /* __ppc__ */ +SYSCTL_INT(_hw_optional, OID_AUTO, x86_64, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &x86_64_flag, 0, ""); +SYSCTL_INT(_hw_optional, OID_AUTO, aes, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &aes_flag, 0, ""); +SYSCTL_INT(_hw_optional, OID_AUTO, avx1_0, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &avx1_0_flag, 0, ""); +#endif /* !__i386__ && !__x86_64 && !__arm__ */ /* * Debugging interface to the CPU power management code. + * + * Note: Does not need locks because it disables interrupts over + * the call. 
*/ static int pmsSysctl(__unused struct sysctl_oid *oidp, __unused void *arg1, @@ -468,7 +455,7 @@ pmsSysctl(__unused struct sysctl_oid *oidp, __unused void *arg1, return(error); } -SYSCTL_PROC(_hw, OID_AUTO, pms, CTLTYPE_STRUCT | CTLFLAG_WR, 0, 0, pmsSysctl, "S", "Processor Power Management"); +SYSCTL_PROC(_hw, OID_AUTO, pms, CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_LOCKED, 0, 0, pmsSysctl, "S", "Processor Power Management"); @@ -484,9 +471,7 @@ sysctl_mib_init(void) cputype = cpu_type(); cpusubtype = cpu_subtype(); cputhreadtype = cpu_threadtype(); -#if defined(__ppc__) - cpu64bit = (_cpu_capabilities & k64Bit) == k64Bit; -#elif defined(__i386__) || defined (__x86_64__) +#if defined(__i386__) || defined (__x86_64__) cpu64bit = (_get_cpu_capabilities() & k64Bit) == k64Bit; #endif @@ -502,114 +487,18 @@ sysctl_mib_init(void) sysctl_register_oid(&sysctl__hw_cputhreadtype); } -#ifdef __ppc__ -/* - * The convention for these is as follows: - * If the sysctl does not exist, the functionality is not present in the CPU. - * If the sysctl exists, it will not crash, and should otherwise function - * corectly. - * If the sysctl exists and returns 0, we advise against using this feature. - * If the sysctl exists and returns 1, we advise it's use. - */ - - if (_cpu_capabilities & kHasAltivec) { - altivec_flag = 1; - sysctl_register_oid(&sysctl__hw_optional_altivec); - } - if (_cpu_capabilities & kHasGraphicsOps) { - graphicsops_flag = 1; - sysctl_register_oid(&sysctl__hw_optional_graphicsops); - } - if (_cpu_capabilities & k64Bit) { - x64bitops_flag = 1; - sysctl_register_oid(&sysctl__hw_optional_64bitops); - } - if (_cpu_capabilities & kHasFsqrt) { - fsqrt_flag = 1; - sysctl_register_oid(&sysctl__hw_optional_fsqrt); - } - if (_cpu_capabilities & kHasStfiwx) { - stfiwx_flag = 1; - sysctl_register_oid(&sysctl__hw_optional_stfiwx); - } - if (_cpu_capabilities & kDcbaAvailable) - dcba_flag = 0; - if (_cpu_capabilities & kDcbaRecommended) - dcba_flag = 1; - if (dcba_flag >= 0) - sysctl_register_oid(&sysctl__hw_optional_dcba); - if (_cpu_capabilities & kDataStreamsAvailable) - datastreams_flag = 0; - if (_cpu_capabilities & kDataStreamsRecommended) - datastreams_flag = 1; - if (datastreams_flag >= 0) - sysctl_register_oid(&sysctl__hw_optional_datastreams); - if (_cpu_capabilities & kDcbtStreamsAvailable) - dcbtstreams_flag = 0; - if (_cpu_capabilities & kDcbtStreamsRecommended) - dcbtstreams_flag = 1; - if (dcbtstreams_flag >= 0) - sysctl_register_oid(&sysctl__hw_optional_dcbtstreams); - - /* hw.cpufamily */ - switch (cpusubtype) { - case CPU_SUBTYPE_POWERPC_750: - cpufamily = CPUFAMILY_POWERPC_G3; - break; - case CPU_SUBTYPE_POWERPC_7400: - case CPU_SUBTYPE_POWERPC_7450: - cpufamily = CPUFAMILY_POWERPC_G4; - break; - case CPU_SUBTYPE_POWERPC_970: - cpufamily = CPUFAMILY_POWERPC_G5; - break; - default: - cpufamily = CPUFAMILY_UNKNOWN; - } - - ml_cpu_info_t cpu_info; - ml_cpu_get_info(&cpu_info); - - host_basic_info_data_t hinfo; - mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT; - kern_return_t kret = host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count); - if(kret != KERN_SUCCESS) - { - hinfo.max_cpus = 1; - } - - /* hw.cachesize */ - cachesize[0] = max_mem; - cachesize[1] = cpu_info.l1_dcache_size; - cachesize[2] = cpu_info.l2_settings ? cpu_info.l2_cache_size : 0; - cachesize[3] = cpu_info.l3_settings ? cpu_info.l3_cache_size : 0; - cachesize[4] = 0; - - /* hw.cacheconfig */ - cacheconfig[0] = hinfo.max_cpus; - cacheconfig[1] = 1; - cacheconfig[2] = cachesize[2] ? 
1 : 0; - cacheconfig[3] = cachesize[3] ? 1 : 0; - cacheconfig[4] = 0; - - /* hw.packages */ - if (cpusubtype == CPU_SUBTYPE_POWERPC_970 && - cpu_info.l2_cache_size == 1 * 1024 * 1024) - /* The signature of the dual-core G5 */ - packages = roundup(hinfo.max_cpus, 2) / 2; - else - packages = hinfo.max_cpus; - -#elif defined (__i386__) || defined (__x86_64__) - mmx_flag = ((_get_cpu_capabilities() & kHasMMX) == kHasMMX)? 1 : 0; - sse_flag = ((_get_cpu_capabilities() & kHasSSE) == kHasSSE)? 1 : 0; - sse2_flag = ((_get_cpu_capabilities() & kHasSSE2) == kHasSSE2)? 1 : 0; - sse3_flag = ((_get_cpu_capabilities() & kHasSSE3) == kHasSSE3)? 1 : 0; - supplementalsse3_flag = ((_get_cpu_capabilities() & kHasSupplementalSSE3) == kHasSupplementalSSE3)? 1 : 0; - sse4_1_flag = ((_get_cpu_capabilities() & kHasSSE4_1) == kHasSSE4_1)? 1 : 0; - sse4_2_flag = ((_get_cpu_capabilities() & kHasSSE4_2) == kHasSSE4_2)? 1 : 0; - x86_64_flag = ((_get_cpu_capabilities() & k64Bit) == k64Bit)? 1 : 0; - aes_flag = ((_get_cpu_capabilities() & kHasAES) == kHasAES)? 1 : 0; +#if defined (__i386__) || defined (__x86_64__) +#define is_capability_set(k) (((_get_cpu_capabilities() & (k)) == (k)) ? 1 : 0) + mmx_flag = is_capability_set(kHasMMX); + sse_flag = is_capability_set(kHasSSE); + sse2_flag = is_capability_set(kHasSSE2); + sse3_flag = is_capability_set(kHasSSE3); + supplementalsse3_flag = is_capability_set(kHasSupplementalSSE3); + sse4_1_flag = is_capability_set(kHasSSE4_1); + sse4_2_flag = is_capability_set(kHasSSE4_2); + x86_64_flag = is_capability_set(k64Bit); + aes_flag = is_capability_set(kHasAES); + avx1_0_flag = is_capability_set(kHasAVX1_0); /* hw.cpufamily */ cpufamily = cpuid_cpufamily(); @@ -633,7 +522,7 @@ sysctl_mib_init(void) / cpuid_info()->thread_count; #else /* end __arm__ */ -# warning we do not support this platform yet -#endif /* __ppc__ */ +# error unknown architecture +#endif /* !__i386__ && !__x86_64 && !__arm__ */ } diff --git a/bsd/kern/kern_mman.c b/bsd/kern/kern_mman.c index 6da43d2fd..979af3e5d 100644 --- a/bsd/kern/kern_mman.c +++ b/bsd/kern/kern_mman.c @@ -95,6 +95,9 @@ #include <sys/ubc.h> #include <sys/ubc_internal.h> #include <sys/sysproto.h> +#if CONFIG_PROTECT +#include <sys/cprotect.h> +#endif #include <sys/syscall.h> #include <sys/kdebug.h> @@ -156,6 +159,7 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) int fpref=0; int error =0; int fd = uap->fd; + int num_retries = 0; user_addr = (mach_vm_offset_t)uap->addr; user_size = (mach_vm_size_t) uap->len; @@ -203,7 +207,9 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) user_size += pageoff; /* low end... */ user_size = mach_vm_round_page(user_size); /* hi end */ - + if ((flags & MAP_JIT) && ((flags & MAP_FIXED) || (flags & MAP_SHARED) || (flags & MAP_FILE))){ + return EINVAL; + } /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). @@ -216,7 +222,7 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) */ user_addr -= pageoff; if (user_addr & PAGE_MASK) - return (EINVAL); + return (EINVAL); } #ifdef notyet /* DO not have apis to get this info, need to wait till then*/ @@ -236,6 +242,19 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) alloc_flags = 0; if (flags & MAP_ANON) { + + maxprot = VM_PROT_ALL; +#if CONFIG_MACF + /* + * Entitlement check. + * Re-enable once mac* is implemented. 
+ */ + /*error = mac_proc_check_map_anon(p, user_addr, user_size, prot, flags, &maxprot); + if (error) { + return EINVAL; + }*/ +#endif /* MAC */ + /* * Mapping blank space is trivial. Use positive fds as the alias * value for memory tracking. @@ -245,7 +264,7 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) * Use "fd" to pass (some) Mach VM allocation flags, * (see the VM_FLAGS_* definitions). */ - alloc_flags = fd & (VM_FLAGS_ALIAS_MASK | + alloc_flags = fd & (VM_FLAGS_ALIAS_MASK | VM_FLAGS_SUPERPAGE_MASK | VM_FLAGS_PURGABLE); if (alloc_flags != fd) { /* reject if there are any extra flags */ @@ -254,7 +273,6 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) } handle = NULL; - maxprot = VM_PROT_ALL; file_pos = 0; mapanon = 1; } else { @@ -382,6 +400,21 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) goto bad; } #endif /* MAC */ + +#if CONFIG_PROTECT + { + void *cnode; + if ((cnode = cp_get_protected_cnode(vp)) != NULL) { + error = cp_handle_vnop(cnode, CP_READ_ACCESS | CP_WRITE_ACCESS); + if (error) { + (void) vnode_put(vp); + goto bad; + } + } + } +#endif /* CONFIG_PROTECT */ + + } } @@ -434,6 +467,9 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) if (flags & MAP_NOCACHE) alloc_flags |= VM_FLAGS_NO_CACHE; + if (flags & MAP_JIT){ + alloc_flags |= VM_FLAGS_MAP_JIT; + } /* * Lookup/allocate object. */ @@ -455,7 +491,7 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) if (maxprot & (VM_PROT_EXECUTE | VM_PROT_WRITE)) maxprot |= VM_PROT_READ; #endif /* radar 3777787 */ - +map_anon_retry: result = vm_map_enter_mem_object(user_map, &user_addr, user_size, 0, alloc_flags, @@ -464,6 +500,16 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) (flags & MAP_SHARED) ? VM_INHERIT_SHARE : VM_INHERIT_DEFAULT); + + /* If a non-binding address was specified for this anonymous + * mapping, retry the mapping with a zero base + * in the event the mapping operation failed due to + * lack of space between the address and the map's maximum. + */ + if ((result == KERN_NO_SPACE) && ((flags & MAP_FIXED) == 0) && user_addr && (num_retries++ == 0)) { + user_addr = PAGE_SIZE; + goto map_anon_retry; + } } else { if (vnode_isswap(vp)) { /* @@ -514,7 +560,7 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) if (maxprot & (VM_PROT_EXECUTE | VM_PROT_WRITE)) maxprot |= VM_PROT_READ; #endif /* radar 3777787 */ - +map_file_retry: result = vm_map_enter_mem_object_control(user_map, &user_addr, user_size, 0, alloc_flags, @@ -523,6 +569,16 @@ mmap(proc_t p, struct mmap_args *uap, user_addr_t *retval) (flags & MAP_SHARED) ? VM_INHERIT_SHARE : VM_INHERIT_DEFAULT); + + /* If a non-binding address was specified for this file backed + * mapping, retry the mapping with a zero base + * in the event the mapping operation failed due to + * lack of space between the address and the map's maximum. 
+ */ + if ((result == KERN_NO_SPACE) && ((flags & MAP_FIXED) == 0) && user_addr && (num_retries++ == 0)) { + user_addr = PAGE_SIZE; + goto map_file_retry; + } } if (!mapanon) { @@ -855,13 +911,15 @@ madvise(__unused proc_t p, struct madvise_args *uap, __unused int32_t *retval) result = mach_vm_behavior_set(user_map, start, size, new_behavior); switch (result) { - case KERN_SUCCESS: - return (0); - case KERN_INVALID_ADDRESS: - return (ENOMEM); + case KERN_SUCCESS: + return 0; + case KERN_INVALID_ADDRESS: + return EINVAL; + case KERN_NO_SPACE: + return ENOMEM; } - return (EINVAL); + return EINVAL; } int @@ -1034,6 +1092,7 @@ munlockall(__unused proc_t p, __unused struct munlockall_args *uap, __unused int return(ENOSYS); } +#if !defined(CONFIG_EMBEDDED) /* USV: No! need to obsolete map_fd()! mmap() already supports 64 bits */ kern_return_t map_fd(struct map_fd_args *args) @@ -1070,6 +1129,7 @@ map_fd_funneled( vm_offset_t map_addr=0; vm_size_t map_size; int err=0; + vm_prot_t maxprot = VM_PROT_ALL; vm_map_t my_map; proc_t p = current_proc(); struct vnode_attr vattr; @@ -1103,6 +1163,29 @@ map_fd_funneled( goto bad; } +#if CONFIG_MACF + err = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()), + fp->f_fglob, VM_PROT_DEFAULT, MAP_FILE, &maxprot); + if (err) { + (void)vnode_put(vp); + goto bad; + } +#endif /* MAC */ + +#if CONFIG_PROTECT + /* check for content protection access */ + { + void *cnode; + if ((cnode = cp_get_protected_cnode(vp)) != NULL) { + err = cp_handle_vnop(cnode, CP_READ_ACCESS | CP_WRITE_ACCESS); + if (err != 0) { + (void)vnode_put(vp); + goto bad; + } + } + } +#endif /* CONFIG_PROTECT */ + AUDIT_ARG(vnpath, vp, ARG_VNODE1); /* @@ -1148,7 +1231,7 @@ map_fd_funneled( my_map, &map_addr, map_size, (vm_offset_t)0, VM_FLAGS_ANYWHERE, pager, offset, TRUE, - VM_PROT_DEFAULT, VM_PROT_ALL, + VM_PROT_DEFAULT, maxprot, VM_INHERIT_DEFAULT); if (result != KERN_SUCCESS) { (void)vnode_put(vp); @@ -1213,4 +1296,5 @@ bad: fp_drop(p, fd, fp, 0); return (err); } +#endif /* !defined(CONFIG_EMBEDDED) */ diff --git a/bsd/kern/kern_newsysctl.c b/bsd/kern/kern_newsysctl.c index de083965e..2d872d54c 100644 --- a/bsd/kern/kern_newsysctl.c +++ b/bsd/kern/kern_newsysctl.c @@ -86,20 +86,51 @@ struct sysctl_oid_list sysctl__sysctl_children; lck_rw_t * sysctl_geometry_lock = NULL; -static void -sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i); +/* + * Conditionally allow dtrace to see these functions for debugging purposes. 
+ */
+#ifdef STATIC
+#undef STATIC
+#endif
+#if 0
+#define STATIC
+#else
+#define STATIC static
+#endif
+
+/* forward declarations of static functions */
+STATIC funnel_t *spl_kernel_funnel(void);
+STATIC void sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i);
+STATIC int sysctl_sysctl_debug(struct sysctl_oid *oidp, void *arg1,
+	int arg2, struct sysctl_req *req);
+STATIC int sysctl_sysctl_name(struct sysctl_oid *oidp, void *arg1,
+	int arg2, struct sysctl_req *req);
+STATIC int sysctl_sysctl_next_ls (struct sysctl_oid_list *lsp,
+	int *name, u_int namelen, int *next, int *len, int level,
+	struct sysctl_oid **oidpp);
+STATIC int sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l);
+STATIC int sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l);
+STATIC int name2oid (char *name, int *oid, int *len);
+STATIC int sysctl_sysctl_name2oid(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req);
+STATIC int sysctl_sysctl_next(struct sysctl_oid *oidp, void *arg1, int arg2,
+	struct sysctl_req *req);
+STATIC int sysctl_sysctl_oidfmt(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req);
+STATIC void splx_kernel_funnel(funnel_t *saved);
+STATIC int sysctl_old_user(struct sysctl_req *req, const void *p, size_t l);
+STATIC int sysctl_new_user(struct sysctl_req *req, void *p, size_t l);
+STATIC int sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen);
+STATIC int sysctlnametomib(const char *name, int *mibp, size_t *sizep);
 
 /*
  * Locking and stats
  */
-static struct sysctl_lock memlock;
 
 /*
  * XXX this does not belong here
  */
-static funnel_t *
+STATIC funnel_t *
 spl_kernel_funnel(void)
 {
 	funnel_t *cfunnel;
@@ -113,7 +144,7 @@ spl_kernel_funnel(void)
 	return(cfunnel);
 }
 
-static void
+STATIC void
 splx_kernel_funnel(funnel_t *saved)
 {
 	if (saved != kernel_flock) {
@@ -123,7 +154,7 @@ splx_kernel_funnel(funnel_t *saved)
 	}
 }
 
-static int sysctl_root SYSCTL_HANDLER_ARGS;
+STATIC int sysctl_root SYSCTL_HANDLER_ARGS;
 
 struct sysctl_oid_list sysctl__children; /* root list */
 
@@ -133,21 +164,65 @@ struct sysctl_oid_list sysctl__children; /* root list */
  * Order by number in each list.
 */
 
-void sysctl_register_oid(struct sysctl_oid *oidp)
+void
+sysctl_register_oid(struct sysctl_oid *new_oidp)
 {
-	struct sysctl_oid_list *parent = oidp->oid_parent;
+	struct sysctl_oid *oidp = NULL;
+	struct sysctl_oid_list *parent = new_oidp->oid_parent;
 	struct sysctl_oid *p;
 	struct sysctl_oid *q;
 	int n;
-	funnel_t *fnl;
+	funnel_t *fnl = NULL;	/* compiler doesn't notice CTLFLAG_LOCKED */
+
+	/*
+	 * The OID can be old-style (needs copy), new style without an earlier
+	 * version (also needs copy), or new style with a matching version (no
+	 * copy needed). Later versions are rejected (presumably, the OID
+	 * structure was changed for a necessary reason).
+	 */
+	if (!(new_oidp->oid_kind & CTLFLAG_OID2)) {
+		/*
+		 * XXX: M_TEMP is perhaps not the most appropriate zone, as it
+		 * XXX: will subject us to use-after-free by other consumers.
+		 */
+		MALLOC(oidp, struct sysctl_oid *, sizeof(*oidp), M_TEMP, M_WAITOK | M_ZERO);
+		if (oidp == NULL)
+			return;		/* reject: no memory */
+
+		/*
+		 * Copy the structure only through the oid_fmt field, which
+		 * is the last field in a non-OID2 OID structure.
+		 *
+		 * Note:	We may want to set the oid_descr to the
+		 *		oid_name (or "") at some future date.
+		 */
+		memcpy(oidp, new_oidp, offsetof(struct sysctl_oid, oid_descr));
+	} else {
+		/* It's a later version; handle the versions we know about */
+		switch (new_oidp->oid_version) {
+		case SYSCTL_OID_VERSION:
+			/* current version */
+			oidp = new_oidp;
+			break;
+		default:
+			return;			/* rejects unknown version */
+		}
+	}
 
-	fnl = spl_kernel_funnel();
+	/*
+	 * If it's a locked OID being registered, we can assume that the
+	 * caller is doing their own reentrancy locking before calling us.
	 */
+	if (!(oidp->oid_kind & CTLFLAG_LOCKED))
+		fnl = spl_kernel_funnel();
 
 	if(sysctl_geometry_lock == NULL) {
-		/* Initialise the geometry lock for reading/modifying the sysctl tree
-		 * This is done here because IOKit registers some sysctls before bsd_init()
-		 * calls sysctl_register_fixed().
+		/*
+		 * Initialise the geometry lock for reading/modifying the
+		 * sysctl tree. This is done here because IOKit registers
+		 * some sysctl's before bsd_init() calls
+		 * sysctl_register_fixed().
 		 */
 		lck_grp_t* lck_grp  = lck_grp_alloc_init("sysctl", NULL);
@@ -169,6 +244,12 @@ void sysctl_register_oid(struct sysctl_oid *oidp)
 			n = p->oid_number;
 		}
 		oidp->oid_number = n + 1;
+		/*
+		 * Reflect the number in an allocated OID into the template
+		 * of the caller for sysctl_unregister_oid() compares.
+		 */
+		if (oidp != new_oidp)
+			new_oidp->oid_number = oidp->oid_number;
 	}
 
 	/*
@@ -188,30 +269,83 @@ void sysctl_register_oid(struct sysctl_oid *oidp)
 	/* Release the write lock */
 	lck_rw_unlock_exclusive(sysctl_geometry_lock);
 
-	splx_kernel_funnel(fnl);
+	if (!(oidp->oid_kind & CTLFLAG_LOCKED))
+		splx_kernel_funnel(fnl);
 }
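/*
 * Illustrative sketch (not part of the patch): sysctl_register_oid() above
 * accepts legacy, pre-OID2 callers by copying their shorter structure into
 * freshly allocated storage, but only up to the first field the old layout
 * lacks.  The same pattern in miniature, with hypothetical struct names:
 */
#include <stddef.h>
#include <string.h>

struct example_oid_v1 { int number; int kind; const char *fmt; };
struct example_oid_v2 { int number; int kind; const char *fmt; const char *descr; };

static void
example_accept_v1(struct example_oid_v2 *dst, const struct example_oid_v1 *src)
{
	memset(dst, 0, sizeof(*dst));		/* new-only fields default to 0/NULL */
	/* both layouts share a common prefix ending just before 'descr' */
	memcpy(dst, src, offsetof(struct example_oid_v2, descr));
}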
-void sysctl_unregister_oid(struct sysctl_oid *oidp)
+void
+sysctl_unregister_oid(struct sysctl_oid *oidp)
 {
-	funnel_t *fnl;
+	struct sysctl_oid *removed_oidp = NULL;	/* OID removed from tree */
+	struct sysctl_oid *old_oidp = NULL;	/* OID compatibility copy */
+	funnel_t *fnl = NULL;	/* compiler doesn't notice CTLFLAG_LOCKED */
 
-	fnl = spl_kernel_funnel();
+	if (!(oidp->oid_kind & CTLFLAG_LOCKED))
+		fnl = spl_kernel_funnel();
 
 	/* Get the write lock to modify the geometry */
 	lck_rw_lock_exclusive(sysctl_geometry_lock);
 
-	SLIST_REMOVE(oidp->oid_parent, oidp, sysctl_oid, oid_link);
+	if (!(oidp->oid_kind & CTLFLAG_OID2)) {
+		/*
+		 * We're using a copy so we can get the new fields in an
+		 * old structure, so we have to iterate to compare the
+		 * partial structure; when we find a match, we remove it
+		 * normally and free the memory.
+		 */
+		SLIST_FOREACH(old_oidp, oidp->oid_parent, oid_link) {
+			if (!memcmp(&oidp->oid_number, &old_oidp->oid_number, (offsetof(struct sysctl_oid, oid_descr)-offsetof(struct sysctl_oid, oid_number)))) {
+				break;
+			}
+		}
+		if (old_oidp != NULL) {
+			SLIST_REMOVE(old_oidp->oid_parent, old_oidp, sysctl_oid, oid_link);
+			removed_oidp = old_oidp;
+		}
+	} else {
+		/* It's a later version; handle the versions we know about */
+		switch (oidp->oid_version) {
+		case SYSCTL_OID_VERSION:
+			/* We can just remove the OID directly... */
+			SLIST_REMOVE(oidp->oid_parent, oidp, sysctl_oid, oid_link);
+			removed_oidp = oidp;
+			break;
+		default:
+			/* XXX: Can't happen; probably tree corruption.*/
+			break;			/* rejects unknown version */
+		}
+	}
+
+	/*
+	 * We've removed it from the list at this point, but we don't want
+	 * to return to the caller until all handler references have drained
+	 * out. Doing things in this order prevents other people coming in
+	 * and starting new operations against the OID node we want removed.
+	 *
+	 * Note: removed_oidp could be NULL if it wasn't found.
+	 */
+	while(removed_oidp && removed_oidp->oid_refcnt) {
+		lck_rw_sleep(sysctl_geometry_lock, LCK_SLEEP_EXCLUSIVE, &removed_oidp->oid_refcnt, THREAD_UNINT);
+	}
 
 	/* Release the write lock */
 	lck_rw_unlock_exclusive(sysctl_geometry_lock);
 
-	splx_kernel_funnel(fnl);
+	/* If it was allocated, free it after dropping the lock */
+	if (old_oidp != NULL) {
+		FREE(old_oidp, M_TEMP);
+	}
+
+	/* And drop the funnel interlock, if needed */
+	if (!(oidp->oid_kind & CTLFLAG_LOCKED))
+		splx_kernel_funnel(fnl);
 }
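/*
 * Illustrative sketch (not part of the patch): the unregister path above is
 * a remove-then-drain idiom -- unlink the node first so no new lookup can
 * find it, then sleep on its reference count until in-flight handler calls
 * finish, and only then free.  Condensed to its essentials:
 */
static void
example_remove_and_drain(struct sysctl_oid *oid)
{
	lck_rw_lock_exclusive(sysctl_geometry_lock);
	SLIST_REMOVE(oid->oid_parent, oid, sysctl_oid, oid_link);
	/* new callers can no longer reach the node; wait out existing ones */
	while (oid->oid_refcnt)
		lck_rw_sleep(sysctl_geometry_lock, LCK_SLEEP_EXCLUSIVE,
		    &oid->oid_refcnt, THREAD_UNINT);
	lck_rw_unlock_exclusive(sysctl_geometry_lock);
	/* now safe to free outside the lock */
}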
 
 /*
  * Bulk-register all the oids in a linker_set.
  */
-void sysctl_register_set(const char *set)
+void
+sysctl_register_set(const char *set)
 {
 	struct sysctl_oid **oidpp, *oidp;
 
@@ -223,7 +357,8 @@ void sysctl_register_set(const char *set)
 	}
 }
 
-void sysctl_unregister_set(const char *set)
+void
+sysctl_unregister_set(const char *set)
 {
 	struct sysctl_oid **oidpp, *oidp;
 
@@ -401,7 +536,32 @@ int sysctl_io_opaque(struct sysctl_req *req,void *pValue, size_t valueSize, int
  * {0,4,...}	return the kind & format info for the "..." OID.
  */
-static void
+/*
+ * sysctl_sysctl_debug_dump_node
+ *
+ * Description:	Dump debug information for a given sysctl_oid_list at the
+ *		given oid depth out to the kernel log, via printf
+ *
+ * Parameters:	l			sysctl_oid_list pointer
+ *		i			current node depth
+ *
+ * Returns:	(void)
+ *
+ * Implicit:	kernel log, modified
+ *
+ * Locks:	Assumes sysctl_geometry_lock is held prior to calling
+ *
+ * Notes:	This function may call itself recursively to resolve Node
+ *		values, which potentially have an inferior sysctl_oid_list
+ *
+ *		This function is only callable indirectly via the function
+ *		sysctl_sysctl_debug()
+ *
+ * Bugs:	The node depth indentation does not work; this may be an
+ *		artifact of leading space removal by the log daemon itself
+ *		or some intermediate routine.
+ */
+STATIC void
 sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i)
 {
 	int k;
@@ -414,7 +574,8 @@ sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i)
 
 		printf("%d %s ", oidp->oid_number, oidp->oid_name);
 
-		printf("%c%c",
+		printf("%c%c%c",
+			oidp->oid_kind & CTLFLAG_LOCKED ? 'L':' ',
 			oidp->oid_kind & CTLFLAG_RD ? 'R':' ',
 			oidp->oid_kind & CTLFLAG_WR ? 'W':' ');
 
@@ -439,18 +600,83 @@ sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i)
 	}
 }
 
-static int
+/*
+ * sysctl_sysctl_debug
+ *
+ * Description:	This function implements the "sysctl.debug" portion of the
+ *		OID space for sysctl.
+ *
+ * OID:	0, 0
+ *
+ * Parameters:	__unused
+ *
+ * Returns:	ENOENT
+ *
+ * Implicit:	kernel log, modified
+ *
+ * Locks:	Acquires and then releases a read lock on the
+ *		sysctl_geometry_lock
+ */
+STATIC int
 sysctl_sysctl_debug(__unused struct sysctl_oid *oidp, __unused void *arg1,
 	__unused int arg2, __unused struct sysctl_req *req)
 {
+	lck_rw_lock_shared(sysctl_geometry_lock);
 	sysctl_sysctl_debug_dump_node(&sysctl__children, 0);
+	lck_rw_done(sysctl_geometry_lock);
 	return ENOENT;
 }
 
-SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD,
+SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD | CTLFLAG_LOCKED,
 	0, 0, sysctl_sysctl_debug, "-", "");
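/*
 * Illustrative usage (not part of the patch): sysctl.debug is OID {0, 0};
 * poking it from user space dumps the registered tree to the kernel log,
 * and the handler above intentionally answers ENOENT.  Minimal caller:
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <errno.h>

int
example_dump_sysctl_tree(void)
{
	int mib[2] = { 0, 0 };	/* sysctl.debug */
	size_t len = 0;

	if (sysctl(mib, 2, NULL, &len, NULL, 0) == -1 && errno == ENOENT)
		return 0;	/* expected: the dump went to the kernel log */
	return -1;
}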
-static int
+/*
+ * sysctl_sysctl_name
+ *
+ * Description:	Convert an OID into a string name; this is used by the user
+ *		space sysctl() command line utility; this is done in a purely
+ *		advisory capacity (e.g. to provide node names for "sysctl -A"
+ *		output).
+ *
+ * OID:	0, 1
+ *
+ * Parameters:	oidp			__unused
+ *		arg1			A pointer to the OID name list
+ *					integer array, beginning at
+ *					adjusted option base 2
+ *		arg2			The number of elements which
+ *					remain in the name array
+ *
+ * Returns:	0			Success
+ *	SYSCTL_OUT:EPERM		Permission denied
+ *	SYSCTL_OUT:EFAULT		Bad user supplied buffer
+ *	SYSCTL_OUT:???			Return value from user function
+ *					for SYSCTL_PROC leaf node
+ *
+ * Implicit:	Contents of user request buffer, modified
+ *
+ * Locks:	Acquires and then releases a read lock on the
+ *		sysctl_geometry_lock
+ *
+ * Notes:	SPI (System Programming Interface); this is subject to change
+ *		and may not be relied upon by third party applications; use
+ *		a subprocess to communicate with the "sysctl" command line
+ *		command instead, if you believe you need this functionality.
+ *		Preferably, use sysctlbyname() instead.
+ *
+ *		Setting of the NULL termination of the output string is
+ *		delayed until after the geometry lock is dropped. If there
+ *		are no Entries remaining in the OID name list when this
+ *		function is called, it will still write out the termination
+ *		byte.
+ *
+ *		This function differs from other sysctl functions in that
+ *		it can not take an output buffer length of 0 to determine the
+ *		space which will be required. It is suggested that the buffer
+ *		length be PATH_MAX, and that authors of new sysctl's refrain
+ *		from exceeding this string length.
+ */
+STATIC int
 sysctl_sysctl_name(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
 	struct sysctl_req *req)
 {
@@ -461,6 +687,7 @@ sysctl_sysctl_name(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
 	struct sysctl_oid_list *lsp = &sysctl__children, *lsp2;
 	char tempbuf[10];
 
+	lck_rw_lock_shared(sysctl_geometry_lock);
 	while (namelen) {
 		if (!lsp) {
 			snprintf(tempbuf,sizeof(tempbuf),"%d",*name);
@@ -468,8 +695,10 @@ sysctl_sysctl_name(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
 			error = SYSCTL_OUT(req, ".", 1);
 			if (!error)
 				error = SYSCTL_OUT(req, tempbuf, strlen(tempbuf));
-			if (error)
+			if (error) {
+				lck_rw_done(sysctl_geometry_lock);
 				return (error);
+			}
 			namelen--;
 			name++;
 			continue;
@@ -484,8 +713,10 @@ sysctl_sysctl_name(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
 		if (!error)
 			error = SYSCTL_OUT(req, oid->oid_name, strlen(oid->oid_name));
-		if (error)
+		if (error) {
+			lck_rw_done(sysctl_geometry_lock);
 			return (error);
+		}
 		namelen--;
 		name++;
@@ -501,12 +732,45 @@ sysctl_sysctl_name(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
 		}
 		lsp = lsp2;
 	}
+	lck_rw_done(sysctl_geometry_lock);
 	return (SYSCTL_OUT(req, "", 1));
 }
 
-SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD, sysctl_sysctl_name, "");
+SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_sysctl_name, "");
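/*
 * Illustrative usage (not part of the patch): prefixing OID {0, 1} to any
 * MIB turns it back into a dotted string, which is how "sysctl -A" labels
 * its output.  Sketch, with error handling trimmed:
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
example_oid_to_name(void)
{
	int mib[4] = { 0, 1, CTL_KERN, KERN_OSTYPE };
	char name[128];	/* the notes above suggest sizing toward PATH_MAX */
	size_t len = sizeof(name);

	if (sysctl(mib, 4, name, &len, NULL, 0) == -1)
		return -1;
	printf("%s\n", name);	/* expected: "kern.ostype" */
	return 0;
}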
-static int
+/*
+ * sysctl_sysctl_next_ls
+ *
+ * Description:	For a given OID name value, return the next consecutive OID
+ *		name value within the geometry tree
+ *
+ * Parameters:	lsp			The OID list to look in
+ *		name			The OID name to start from
+ *		namelen			The length of the OID name
+ *		next			Pointer to new oid storage to
+ *					fill in
+ *		len			Pointer to receive new OID
+ *					length value of storage written
+ *		level			OID tree depth (used to compute
+ *					len value)
+ *		oidpp			Pointer to OID list entry
+ *					pointer; used to walk the list
+ *					forward across recursion
+ *
+ * Returns:	0			Returning a new entry
+ *		1			End of geometry list reached
+ *
+ * Implicit:	*next			Modified to contain the new OID
+ *		*len			Modified to contain new length
+ *
+ * Locks:	Assumes sysctl_geometry_lock is held prior to calling
+ *
+ * Notes:	This function will not return OID values that have special
+ *		handlers, since we can not tell whether these handlers consume
+ *		elements from the OID space as parameters. For this reason,
+ *		we STRONGLY discourage these types of handlers
+ */
+STATIC int
 sysctl_sysctl_next_ls (struct sysctl_oid_list *lsp, int *name, u_int namelen,
 	int *next, int *len, int level, struct sysctl_oid **oidpp)
 {
@@ -566,7 +830,45 @@ sysctl_sysctl_next_ls (struct sysctl_oid_list *lsp, int *name, u_int namelen,
 	return 1;
 }
 
-static int
+/*
+ * sysctl_sysctl_next
+ *
+ * Description:	This is an iterator function designed to iterate the oid tree
+ *		and provide a list of OIDs for use by the user space "sysctl"
+ *		command line tool
+ *
+ * OID:	0, 2
+ *
+ * Parameters:	oidp			__unused
+ *		arg1			Pointer to start OID name
+ *		arg2			Start OID name length
+ *		req			Pointer to user request buffer
+ *
+ * Returns:	0			Success
+ *		ENOENT			Reached end of OID space
+ *	SYSCTL_OUT:EPERM		Permission denied
+ *	SYSCTL_OUT:EFAULT		Bad user supplied buffer
+ *	SYSCTL_OUT:???			Return value from user function
+ *					for SYSCTL_PROC leaf node
+ *
+ * Implicit:	Contents of user request buffer, modified
+ *
+ * Locks:	Acquires and then releases a read lock on the
+ *		sysctl_geometry_lock
+ *
+ * Notes:	SPI (System Programming Interface); this is subject to change
+ *		and may not be relied upon by third party applications; use
+ *		a subprocess to communicate with the "sysctl" command line
+ *		command instead, if you believe you need this functionality.
+ *		Preferably, use sysctlbyname() instead.
+ *
+ *		This function differs from other sysctl functions in that
+ *		it can not take an output buffer length of 0 to determine the
+ *		space which will be required. It is suggested that the buffer
+ *		length be PATH_MAX, and that authors of new sysctl's refrain
+ *		from exceeding this string length.
+ */
+STATIC int
 sysctl_sysctl_next(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
 	struct sysctl_req *req)
 {
@@ -577,17 +879,38 @@ sysctl_sysctl_next(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
 	struct sysctl_oid_list *lsp = &sysctl__children;
 	int newoid[CTL_MAXNAME];
 
+	lck_rw_lock_shared(sysctl_geometry_lock);
 	i = sysctl_sysctl_next_ls (lsp, name, namelen, newoid, &j, 1, &oid);
+	lck_rw_done(sysctl_geometry_lock);
 	if (i)
 		return ENOENT;
 	error = SYSCTL_OUT(req, newoid, j * sizeof (int));
 	return (error);
 }
 
-SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD, sysctl_sysctl_next, "");
+SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_sysctl_next, "");
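/*
 * Illustrative usage (not part of the patch): OID {0, 2} is the iterator
 * behind "sysctl -A" -- feed it the last MIB you saw and it returns the
 * next one, until ENOENT ends the walk.  Sketch, starting from kern.*:
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <string.h>
#include <errno.h>

int
example_count_oids(void)
{
	int qoid[CTL_MAXNAME + 2] = { 0, 2, CTL_KERN };
	int noid[CTL_MAXNAME];
	u_int qlen = 3;
	int count = 0;

	for (;;) {
		size_t nlen = sizeof(noid);
		if (sysctl(qoid, qlen, noid, &nlen, NULL, 0) == -1) {
			if (errno == ENOENT)
				break;		/* walked off the end of the tree */
			return -1;
		}
		count++;
		/* re-prime the query with the OID just returned */
		memcpy(qoid + 2, noid, nlen);
		qlen = 2 + (u_int)(nlen / sizeof(int));
	}
	return count;
}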
-static int
-name2oid (char *name, int *oid, int *len, struct sysctl_oid **oidpp)
+/*
+ * name2oid
+ *
+ * Description:	Support function for use by sysctl_sysctl_name2oid(); looks
+ *		up an OID name given a string name.
+ *
+ * Parameters:	name			NULL terminated string name
+ *		oid			Pointer to receive OID name
+ *		len			Pointer to receive OID length
+ *					pointer value (see "Notes")
+ *
+ * Returns:	0			Success
+ *		ENOENT			Entry not found
+ *
+ * Implicit:	*oid			Modified to contain OID value
+ *		*len			Modified to contain OID length
+ *
+ * Locks:	Assumes sysctl_geometry_lock is held prior to calling
+ */
+STATIC int
+name2oid (char *name, int *oid, int *len)
 {
 	int i;
 	struct sysctl_oid *oidp;
@@ -620,8 +943,6 @@ name2oid (char *name, int *oid, int *len, struct sysctl_oid **oidpp)
 	(*len)++;
 
 	if (!i) {
-		if (oidpp)
-			*oidpp = oidp;
 		return (0);
 	}
@@ -643,16 +964,54 @@ name2oid (char *name, int *oid, int *len, struct sysctl_oid **oidpp)
 	return ENOENT;
 }
 
-static int
+/*
+ * sysctl_sysctl_name2oid
+ *
+ * Description:	Translate a string name to an OID name value; this is used by
+ *		the sysctlbyname() function as well as by the "sysctl" command
+ *		line command.
+ *
+ * OID:	0, 3
+ *
+ * Parameters:	oidp			__unused
+ *		arg1			__unused
+ *		arg2			__unused
+ *		req			Request structure
+ *
+ * Returns:	ENOENT			Input length too short
+ *		ENAMETOOLONG		Input length too long
+ *		ENOMEM			Could not allocate work area
+ *	SYSCTL_IN/OUT:EPERM		Permission denied
+ *	SYSCTL_IN/OUT:EFAULT		Bad user supplied buffer
+ *	SYSCTL_IN/OUT:???		Return value from user function
+ *	name2oid:ENOENT			Not found
+ *
+ * Implicit:	*req			Contents of request, modified
+ *
+ * Locks:	Acquires and then releases a read lock on the
+ *		sysctl_geometry_lock
+ *
+ * Notes:	SPI (System Programming Interface); this is subject to change
+ *		and may not be relied upon by third party applications; use
+ *		a subprocess to communicate with the "sysctl" command line
+ *		command instead, if you believe you need this functionality.
+ *		Preferably, use sysctlbyname() instead.
+ *
+ *		This function differs from other sysctl functions in that
+ *		it can not take an output buffer length of 0 to determine the
+ *		space which will be required. It is suggested that the buffer
+ *		length be PATH_MAX, and that authors of new sysctl's refrain
+ *		from exceeding this string length.
+ */
+STATIC int
 sysctl_sysctl_name2oid(__unused struct sysctl_oid *oidp, __unused void *arg1,
 	__unused int arg2, struct sysctl_req *req)
 {
 	char *p;
 	int error, oid[CTL_MAXNAME];
 	int len = 0;		/* set by name2oid() */
-	struct sysctl_oid *op = 0;
 
-	if (!req->newlen)
+	if (req->newlen < 1)
 		return ENOENT;
 	if (req->newlen >= MAXPATHLEN)	/* XXX arbitrary, undocumented */
 		return (ENAMETOOLONG);
@@ -669,7 +1028,13 @@ sysctl_sysctl_name2oid(__unused struct sysctl_oid *oidp, __unused void *arg1,
 	p [req->newlen] = '\0';
 
-	error = name2oid(p, oid, &len, &op);
+	/*
+	 * Note:	We acquire and release the geometry lock here to
+	 *		avoid making name2oid needlessly complex.
+	 */
+	lck_rw_lock_shared(sysctl_geometry_lock);
+	error = name2oid(p, oid, &len);
+	lck_rw_done(sysctl_geometry_lock);
 
 	FREE(p, M_TEMP);
 
@@ -680,19 +1045,58 @@ sysctl_sysctl_name2oid(__unused struct sysctl_oid *oidp, __unused void *arg1,
 	return (error);
 }
 
-SYSCTL_PROC(_sysctl, 3, name2oid, CTLFLAG_RW|CTLFLAG_ANYBODY|CTLFLAG_KERN, 0, 0,
+SYSCTL_PROC(_sysctl, 3, name2oid, CTLFLAG_RW|CTLFLAG_ANYBODY|CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0,
 	sysctl_sysctl_name2oid, "I", "");
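/*
 * Illustrative usage (not part of the patch): OID {0, 3} is the lookup that
 * sysctlbyname() is built on -- write the dotted name in, read the MIB
 * array back.  Sketch:
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <string.h>

int
example_name_to_oid(char *name, int *oid, size_t *oidlen)
{
	int qoid[2] = { 0, 3 };
	size_t len = *oidlen * sizeof(int);

	/* new value = the name; old value = the resolved OID array */
	if (sysctl(qoid, 2, oid, &len, name, strlen(name)) == -1)
		return -1;
	*oidlen = len / sizeof(int);
	return 0;
}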
+ * + * OID: 0, 4 + * + * Parameters: oidp __unused + * arg1 The OID name to look up + * arg2 The length of the OID name + * req Pointer to user request buffer + * + * Returns: 0 Success + * EISDIR Malformed request + * ENOENT No such OID name + * SYSCTL_OUT:EPERM Permission denied + * SYSCTL_OUT:EFAULT Bad user supplied buffer + * SYSCTL_OUT:??? Return value from user function + * + * Implicit: Contents of user request buffer, modified + * + * Locks: Acquires and then releases a read lock on the + * sysctl_geometry_lock + * + * Notes: SPI (System Programming Interface); this is subject to change + * and may not be relied upon by third party applications; use + * a subprocess to communicate with the "sysctl" command line + * command instead, if you believe you need this functionality. + * + * This function differs from other sysctl functions in that + * it cannot take an output buffer length of 0 to determine the + * space which will be required. It is suggested that the buffer + * length be PATH_MAX, and that authors of new sysctl's refrain + * from exceeding this string length. + */ +STATIC int sysctl_sysctl_oidfmt(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req) { - int *name = (int *) arg1, error; + int *name = (int *) arg1; + int error = ENOENT; /* default error: not found */ u_int namelen = arg2; u_int indx; struct sysctl_oid *oid; struct sysctl_oid_list *lsp = &sysctl__children; + lck_rw_lock_shared(sysctl_geometry_lock); oid = SLIST_FIRST(lsp); indx = 0; @@ -707,28 +1111,34 @@ sysctl_sysctl_oidfmt(__unused struct sysctl_oid *oidp, void *arg1, int arg2, lsp = (struct sysctl_oid_list *)oid->oid_arg1; oid = SLIST_FIRST(lsp); } else { - if (indx != namelen) - return EISDIR; + if (indx != namelen) { + error = EISDIR; + goto err; + } goto found; } } else { oid = SLIST_NEXT(oid, oid_link); } } - return ENOENT; + /* Not found */ + goto err; + found: if (!oid->oid_fmt) - return ENOENT; + goto err; error = SYSCTL_OUT(req, &oid->oid_kind, sizeof(oid->oid_kind)); if (!error) error = SYSCTL_OUT(req, oid->oid_fmt, strlen(oid->oid_fmt)+1); +err: + lck_rw_done(sysctl_geometry_lock); return (error); } +SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_sysctl_oidfmt, ""); -SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD, sysctl_sysctl_oidfmt, ""); /* * Default "handler" functions. @@ -842,7 +1252,7 @@ sysctl_handle_opaque(__unused struct sysctl_oid *oidp, void *arg1, int arg2, /* * Transfer functions to/from kernel space. */ -static int +STATIC int sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l) { size_t i = 0; @@ -860,7 +1270,7 @@ sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l) return (0); } -static int +STATIC int sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l) { if (!req->newptr) @@ -914,7 +1324,7 @@ kernel_sysctl(struct proc *p, int *name, u_int namelen, void *old, size_t *oldle /* * Transfer function to/from user space. 
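 * These are installed in the request as req->oldfunc and req->newfunc and
 * are reached through the SYSCTL_OUT() and SYSCTL_IN() macros, so a leaf
 * handler typically finishes with something like:
 *
 *	error = SYSCTL_OUT(req, &value, sizeof(value));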
*/ -static int +STATIC int sysctl_old_user(struct sysctl_req *req, const void *p, size_t l) { int error = 0; @@ -937,7 +1347,7 @@ sysctl_old_user(struct sysctl_req *req, const void *p, size_t l) return (0); } -static int +STATIC int sysctl_new_user(struct sysctl_req *req, void *p, size_t l) { int error; @@ -981,10 +1391,28 @@ sysctl_root(__unused struct sysctl_oid *oidp, void *arg1, int arg2, indx++; if (!(oid->oid_kind & CTLFLAG_LOCKED)) { +/* +printf("sysctl_root: missing CTLFLAG_LOCKED: "); +for(i = 0; i < (int)(indx - 1); i++) +printf("oid[%d] = %d ", i, name[i]); +printf("\n"); +*/ funnel_held = TRUE; } if (oid->oid_kind & CTLFLAG_NOLOCK) req->lock = 0; + /* + * For SYSCTL_PROC() functions which are for sysctl's + * which have parameters at the end of their OID + * space, you need to OR CTLTYPE_NODE into their + * access value. + * + * NOTE: For binary backward compatibility ONLY! Do + * NOT add new sysctl's that do this! Existing + * sysctl's which do this will eventually have + * compatibility code in user space, and this method + * will become unsupported. + */ if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { if (oid->oid_handler) goto found; @@ -1028,7 +1456,14 @@ found: goto err; } - /* Most likely only root can write */ + /* + * This is where legacy enforcement of permissions occurs. If the + * flag does not say CTLFLAG_ANYBODY, then we prohibit anyone but + * root from writing new values down. If local enforcement happens + * at the leaf node, then it needs to be set as CTLFLAG_ANYBODY. In + * addition, if the leaf node is set this way, then in order to do + * specific enforcement, it has to be of type SYSCTL_PROC. + */ if (!(oid->oid_kind & CTLFLAG_ANYBODY) && req->newptr && req->p && (error = proc_suser(req->p))) @@ -1039,10 +1474,24 @@ found: goto err; } + /* + * Reference the OID and drop the geometry lock; this prevents the + * OID from being deleted out from under the handler call, but does + * not prevent other calls into handlers or calls to manage the + * geometry elsewhere from blocking... + */ + OSAddAtomic(1, &oid->oid_refcnt); + + lck_rw_done(sysctl_geometry_lock); + + /* + * ...however, we still have to grab the funnel for those calls which + * may be into code whose reentrancy is protected by the funnel; a + * blocking operation should not prevent reentrancy, at this point. + */ if (funnel_held) { fnl = spl_kernel_funnel(); - MEMLOCK_LOCK(); } if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) { @@ -1058,10 +1507,27 @@ found: if (funnel_held) { - MEMLOCK_UNLOCK(); splx_kernel_funnel(fnl); } + /* + * This is tricky... we re-grab the geometry lock in order to drop + * the reference and wake on the address; since the geometry + * lock is a reader/writer lock rather than a mutex, we have to + * wake on all apparent 1->0 transitions. This abuses the drop + * after the reference decrement in order to wake any lck_rw_sleep() + * in progress in sysctl_unregister_oid() that slept because of a + * non-zero reference count. + * + * Note: OSAddAtomic() is defined to return the previous value; + * we use this and the fact that the lock itself is a + * barrier to avoid waking every time through on "hot" + * OIDs. + */ + lck_rw_lock_shared(sysctl_geometry_lock); + if (OSAddAtomic(-1, &oid->oid_refcnt) == 1) + wakeup(&oid->oid_refcnt); + err: lck_rw_done(sysctl_geometry_lock); return (error); @@ -1170,14 +1636,14 @@ userland_sysctl(struct proc *p, int *name, u_int namelen, user_addr_t oldp, * may not work correctly. 
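 *
 * A minimal kernel-internal read, assuming an OID which takes no new
 * value:
 *
 *	int mib[2] = { CTL_KERN, KERN_MAXPROC };
 *	int val;
 *	size_t len = sizeof(val);
 *	error = sysctl(mib, 2, &val, &len, NULL, 0);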
*/ -static int +STATIC int sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { return(kernel_sysctl(current_proc(), name, namelen, oldp, oldlenp, newp, newlen)); } -static int +STATIC int sysctlnametomib(const char *name, int *mibp, size_t *sizep) { int oid[2]; diff --git a/bsd/kern/kern_panicinfo.c b/bsd/kern/kern_panicinfo.c index 024ec5220..1a949de7b 100644 --- a/bsd/kern/kern_panicinfo.c +++ b/bsd/kern/kern_panicinfo.c @@ -47,7 +47,7 @@ extern int panic_dialog_set_image( const unsigned char * ptr, unsigned int size extern void panic_dialog_get_image( unsigned char ** ptr, unsigned int * size ); /* make the compiler happy */ -extern int sysctl_dopanicinfo(int *, u_int, user_addr_t, size_t *, user_addr_t, size_t, struct proc *); +static int sysctl_dopanicinfo SYSCTL_HANDLER_ARGS; #define PANIC_IMAGE_SIZE_LIMIT (32 * 4096) /* 128K - Maximum amount of memory consumed for the panic UI */ @@ -56,11 +56,20 @@ extern int sysctl_dopanicinfo(int *, u_int, user_addr_t, size_t *, user_addr_t, /* Local data */ static int image_size_limit = PANIC_IMAGE_SIZE_LIMIT; -__private_extern__ int -sysctl_dopanicinfo(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, - user_addr_t newp, size_t newlen, struct proc *p) +/* XXX Should be STATIC for dtrace debugging.. */ +static int +sysctl_dopanicinfo SYSCTL_HANDLER_ARGS { + __unused int cmd = oidp->oid_arg2; /* subcommand*/ + int *name = arg1; /* oid element argument vector */ + int namelen = arg2; /* number of oid element arguments */ + user_addr_t oldp = req->oldptr; /* user buffer copy out address */ + size_t *oldlenp = &req->oldlen; /* user buffer copy out size */ + user_addr_t newp = req->newptr; /* user buffer copy in address */ + size_t newlen = req->newlen; /* user buffer copy in size */ int error = 0; + proc_t p = current_proc(); + vm_offset_t newimage = (vm_offset_t )NULL; kern_return_t kret; unsigned char * prev_image_ptr; @@ -70,7 +79,8 @@ sysctl_dopanicinfo(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, if (namelen != 1) return (ENOTDIR); /* overloaded */ - if ( (error = proc_suser(p)) ) /* must be super user to muck with image */ + /* must be super user to muck with image */ + if ( (error = proc_suser(p)) ) return (error); switch (name[0]) { @@ -80,7 +90,7 @@ sysctl_dopanicinfo(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, case KERN_PANICINFO_TEST: panic_dialog_test(); - return (0); + break; case KERN_PANICINFO_MAXSIZE: @@ -91,7 +101,7 @@ sysctl_dopanicinfo(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, error = sysctl_int(oldp, oldlenp, newp, newlen, &image_size_limit); - return (error); + break; case KERN_PANICINFO_IMAGE: @@ -99,8 +109,10 @@ sysctl_dopanicinfo(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, if ( newp != USER_ADDR_NULL ) { /* check the length of the incoming image before allocating space for it. 
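		   (image_size_limit defaults to PANIC_IMAGE_SIZE_LIMIT, i.e.
		   128K, and is tunable through the KERN_PANICINFO_MAXSIZE
		   subcommand above.)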
*/ - if ( newlen > (size_t)image_size_limit ) - return (ENOMEM); + if ( newlen > (size_t)image_size_limit ) { + error = ENOMEM; + break; + } /* allocate some kernel wired memory for the new image */ kret = kmem_alloc(kernel_map, &newimage, (vm_size_t)round_page(newlen)); @@ -118,8 +130,7 @@ sysctl_dopanicinfo(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, error = EPERM; break; } - - return (error); + break; } /* copy the image in from user space */ @@ -169,12 +180,24 @@ sysctl_dopanicinfo(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, } } - return (0); + break; errout: if ( newimage != (vm_offset_t )NULL ) (void)kmem_free(kernel_map, newimage, (vm_size_t)round_page(newlen)); - return (error); + break; } + + /* adjust index so we return the right required/consumed amount */ + if (!error) + req->oldidx += req->oldlen; + + return (error); } +SYSCTL_PROC(_kern, KERN_PANICINFO, panicinfo, CTLTYPE_NODE|CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, + 0, /* Pointer argument (arg1) */ + 0, /* Integer argument (arg2) */ + sysctl_dopanicinfo, /* Handler function */ + NULL, /* Data pointer */ + ""); diff --git a/bsd/kern/kern_priv.c b/bsd/kern/kern_priv.c new file mode 100644 index 000000000..e7ceb6075 --- /dev/null +++ b/bsd/kern/kern_priv.c @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/*- + * Copyright (c) 2006 nCircle Network Security, Inc. + * Copyright (c) 2009 Robert N. M. Watson + * All rights reserved. + * + * This software was developed by Robert N. M. Watson for the TrustedBSD + * Project under contract to nCircle Network Security, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR, NCIRCLE NETWORK SECURITY, + * INC., OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/priv.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/kauth.h> + +#if CONFIG_MACF +#include <security/mac_framework.h> +#endif + +/* + * Check a credential for privilege. Lots of good reasons to deny privilege; + * only a few to grant it. + */ +int +priv_check_cred(kauth_cred_t cred, int priv, __unused int flags) +{ + int error; + + /* + * We first evaluate policies that may deny the granting of + * privilege unilaterally. + */ +#if CONFIG_MACF + error = mac_priv_check(cred, priv); + if (error) + goto out; +#endif + + /* + * Having determined if privilege is restricted by various policies, + * now determine if privilege is granted. At this point, any policy + * may grant privilege. For now, we allow short-circuit boolean + * evaluation, so may not call all policies. Perhaps we should. + */ + if (kauth_cred_getuid(cred) == 0) { + error = 0; + goto out; + } + + /* + * Now check with MAC, if enabled, to see if a policy module grants + * privilege. + */ +#if CONFIG_MACF + if (mac_priv_grant(cred, priv) == 0) { + error = 0; + goto out; + } +#endif + + /* + * The default is deny, so if no policies have granted it, reject + * with a privilege error here. 
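+ *
+ * A typical call site, using one of the PRIV_* identifiers from
+ * sys/priv.h, might look like:
+ *
+ *	if ((error = priv_check_cred(kauth_cred_get(), PRIV_ADJTIME, 0)))
+ *		return (error);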
+ */ + error = EPERM; +out: + return (error); +} diff --git a/bsd/kern/kern_proc.c b/bsd/kern/kern_proc.c index ba7505008..042a3a864 100644 --- a/bsd/kern/kern_proc.c +++ b/bsd/kern/kern_proc.c @@ -96,6 +96,7 @@ #include <sys/kauth.h> #include <sys/codesign.h> #include <sys/kernel_types.h> +#include <sys/ubc.h> #include <kern/kalloc.h> #include <kern/task.h> #include <kern/assert.h> @@ -168,13 +169,10 @@ static void orphanpg(struct pgrp *pg); void proc_name_kdp(task_t t, char * buf, int size); char *proc_name_address(void *p); -static proc_t proc_refinternal_locked(proc_t p); static void pgrp_add(struct pgrp * pgrp, proc_t parent, proc_t child); static void pgrp_remove(proc_t p); static void pgrp_replace(proc_t p, struct pgrp *pgrp); static void pgdelete_dropref(struct pgrp *pgrp); -static proc_t proc_find_zombref(int pid); -static void proc_drop_zombref(proc_t p); extern void pg_rele_dropref(struct pgrp * pgrp); struct fixjob_iterargs { @@ -345,7 +343,7 @@ proc_findinternal(int pid, int locked) } p = pfind_locked(pid); - if ((p == PROC_NULL) || (p != proc_refinternal_locked(p))) + if ((p == PROC_NULL) || (p != proc_ref_locked(p))) p = PROC_NULL; if (locked == 0) { @@ -373,15 +371,15 @@ proc_self(void) p = current_proc(); proc_list_lock(); - if (p != proc_refinternal_locked(p)) + if (p != proc_ref_locked(p)) p = PROC_NULL; proc_list_unlock(); return(p); } -static proc_t -proc_refinternal_locked(proc_t p) +proc_t +proc_ref_locked(proc_t p) { proc_t p1 = p; @@ -412,7 +410,7 @@ proc_rele_locked(proc_t p) } -static proc_t +proc_t proc_find_zombref(int pid) { proc_t p1 = PROC_NULL; @@ -440,7 +438,7 @@ proc_find_zombref(int pid) return(p1); } -static void +void proc_drop_zombref(proc_t p) { proc_list_lock(); @@ -608,7 +606,7 @@ proc_parent(proc_t p) proc_list_lock(); loop: pp = p->p_pptr; - parent = proc_refinternal_locked(pp); + parent = proc_ref_locked(pp); if ((parent == PROC_NULL) && (pp != PROC_NULL) && (pp->p_stat != SZOMB) && ((pp->p_listflag & P_LIST_EXITED) != 0) && ((pp->p_listflag & P_LIST_CHILDDRAINED)== 0)){ pp->p_listflag |= P_LIST_CHILDLKWAIT; msleep(&pp->p_childrencnt, proc_list_mlock, 0, "proc_parent", 0); @@ -781,12 +779,34 @@ proc_pidversion(proc_t p) return(p->p_idversion); } +uint64_t +proc_uniqueid(proc_t p) +{ + return(p->p_uniqueid); +} + +uint64_t +proc_selfuniqueid(void) +{ + proc_t p = current_proc(); + return(p->p_uniqueid); +} + int proc_getcdhash(proc_t p, unsigned char *cdhash) { return vn_getcdhash(p->p_textvp, p->p_textoff, cdhash); } +void +proc_getexecutableuuid(proc_t p, unsigned char *uuidbuf, unsigned long size) +{ + if (size >= sizeof(p->p_uuid)) { + memcpy(uuidbuf, p->p_uuid, sizeof(p->p_uuid)); + } +} + + void bsd_set_dependency_capable(task_t task) { @@ -1029,10 +1049,10 @@ enterpgrp(proc_t p, pid_t pgid, int mksess) sess->s_flags = 0; sess->s_listflags = 0; sess->s_ttypgrpid = NO_PID; -#ifdef CONFIG_EMBEDDED - lck_mtx_init(&sess->s_mlock, proc_lck_grp, proc_lck_attr); -#else +#if CONFIG_FINE_LOCK_GROUPS lck_mtx_init(&sess->s_mlock, proc_mlock_grp, proc_lck_attr); +#else + lck_mtx_init(&sess->s_mlock, proc_lck_grp, proc_lck_attr); #endif bcopy(procsp->s_login, sess->s_login, sizeof(sess->s_login)); @@ -1055,10 +1075,10 @@ enterpgrp(proc_t p, pid_t pgid, int mksess) proc_list_unlock(); } pgrp->pg_id = pgid; -#ifdef CONFIG_EMBEDDED - lck_mtx_init(&pgrp->pg_mlock, proc_lck_grp, proc_lck_attr); -#else +#if CONFIG_FINE_LOCK_GROUPS lck_mtx_init(&pgrp->pg_mlock, proc_mlock_grp, proc_lck_attr); +#else + lck_mtx_init(&pgrp->pg_mlock, proc_lck_grp, proc_lck_attr); 
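+	/*
+	 * With CONFIG_FINE_LOCK_GROUPS the pgrp and session mutexes are
+	 * accounted to a lock group of their own (proc_mlock_grp);
+	 * otherwise they share the general proc_lck_grp with the rest of
+	 * the proc locks.
+	 */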
#endif LIST_INIT(&pgrp->pg_members); pgrp->pg_membercnt = 0; @@ -1178,18 +1198,18 @@ pgdelete_dropref(struct pgrp *pgrp) if (sessp->s_count != 0) panic("pg_deleteref: freeing session in use"); proc_list_unlock(); -#ifdef CONFIG_EMBEDDED - lck_mtx_destroy(&sessp->s_mlock, proc_lck_grp); -#else +#if CONFIG_FINE_LOCK_GROUPS lck_mtx_destroy(&sessp->s_mlock, proc_mlock_grp); +#else + lck_mtx_destroy(&sessp->s_mlock, proc_lck_grp); #endif FREE_ZONE(sessp, sizeof(struct session), M_SESSION); } else proc_list_unlock(); -#ifdef CONFIG_EMBEDDED - lck_mtx_destroy(&pgrp->pg_mlock, proc_lck_grp); -#else +#if CONFIG_FINE_LOCK_GROUPS lck_mtx_destroy(&pgrp->pg_mlock, proc_mlock_grp); +#else + lck_mtx_destroy(&pgrp->pg_mlock, proc_lck_grp); #endif FREE_ZONE(pgrp, sizeof(*pgrp), M_PGRP); } @@ -1650,14 +1670,14 @@ out: SYSCTL_NODE(_kern, KERN_LCTX, lctx, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Login Context"); -SYSCTL_PROC(_kern_lctx, KERN_LCTX_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT, +SYSCTL_PROC(_kern_lctx, KERN_LCTX_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT | CTLFLAG_LOCKED, 0, 0, sysctl_kern_lctx, "S,lctx", "Return entire login context table"); -SYSCTL_NODE(_kern_lctx, KERN_LCTX_LCID, lcid, CTLFLAG_RD, +SYSCTL_NODE(_kern_lctx, KERN_LCTX_LCID, lcid, CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_kern_lctx, "Login Context Table"); -SYSCTL_INT(_kern_lctx, OID_AUTO, last, CTLFLAG_RD, &lastlcid, 0, ""); -SYSCTL_INT(_kern_lctx, OID_AUTO, count, CTLFLAG_RD, &alllctx_cnt, 0, ""); -SYSCTL_INT(_kern_lctx, OID_AUTO, max, CTLFLAG_RW, &maxlcid, 0, ""); +SYSCTL_INT(_kern_lctx, OID_AUTO, last, CTLFLAG_RD | CTLFLAG_LOCKED, &lastlcid, 0, ""); +SYSCTL_INT(_kern_lctx, OID_AUTO, count, CTLFLAG_RD | CTLFLAG_LOCKED, &alllctx_cnt, 0, ""); +SYSCTL_INT(_kern_lctx, OID_AUTO, max, CTLFLAG_RW | CTLFLAG_LOCKED, &maxlcid, 0, ""); #endif /* LCTX */ @@ -1811,7 +1831,33 @@ csops(__unused proc_t p, struct csops_args *uap, __unused int32_t *retval) } return error; - + + case CS_OPS_ENTITLEMENTS_BLOB: { + char zeros[8] = { 0 }; + void *start; + size_t length; + + if (0 != (error = cs_entitlements_blob_get(pt, + &start, &length))) + break; + if (usize < sizeof(zeros) || usize < length) { + error = ERANGE; + break; + } + if (NULL == start) { + start = zeros; + length = sizeof(zeros); + } + error = copyout(start, uaddr, length); + break; + } + + case CS_OPS_MARKRESTRICT: + proc_lock(pt); + pt->p_csflags |= CS_RESTRICT; + proc_unlock(pt); + break; + default: error = EINVAL; break; @@ -1984,7 +2030,7 @@ ps_allprocscan: for (p = allproc.lh_first; (p != 0); p = p->p_list.le_next) { if ( (filterfn == 0 ) || (filterfn(p, filterarg) != 0)) { - p = proc_refinternal_locked(p); + p = proc_ref_locked(p); proc_list_unlock(); lockheld = 0; @@ -2449,10 +2495,10 @@ session_rele(struct session *sess) if (sess->s_count != 0) panic("session_rele: freeing session in use"); proc_list_unlock(); -#ifdef CONFIG_EMBEDDED - lck_mtx_destroy(&sess->s_mlock, proc_lck_grp); -#else +#if CONFIG_FINE_LOCK_GROUPS lck_mtx_destroy(&sess->s_mlock, proc_mlock_grp); +#else + lck_mtx_destroy(&sess->s_mlock, proc_lck_grp); #endif FREE_ZONE(sess, sizeof(struct session), M_SESSION); } else @@ -2575,9 +2621,9 @@ unsigned long cs_procs_invalidated = 0; int cs_force_kill = 0; int cs_force_hard = 0; int cs_debug = 0; -SYSCTL_INT(_vm, OID_AUTO, cs_force_kill, CTLFLAG_RW, &cs_force_kill, 0, ""); -SYSCTL_INT(_vm, OID_AUTO, cs_force_hard, CTLFLAG_RW, &cs_force_hard, 0, ""); -SYSCTL_INT(_vm, OID_AUTO, cs_debug, CTLFLAG_RW, &cs_debug, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, cs_force_kill, CTLFLAG_RW | CTLFLAG_LOCKED, 
&cs_force_kill, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, cs_force_hard, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_force_hard, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, cs_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_debug, 0, ""); int cs_allow_invalid(struct proc *p) @@ -2633,11 +2679,9 @@ cs_invalid_page( if (p->p_csflags & CS_KILL) { p->p_csflags |= CS_KILLED; proc_unlock(p); - if (cs_debug) { - printf("CODE SIGNING: cs_invalid_page(0x%llx): " - "p=%d[%s] honoring CS_KILL, final status 0x%x\n", - vaddr, p->p_pid, p->p_comm, p->p_csflags); - } + printf("CODE SIGNING: cs_invalid_page(0x%llx): " + "p=%d[%s] honoring CS_KILL, final status 0x%x\n", + vaddr, p->p_pid, p->p_comm, p->p_csflags); cs_procs_killed++; psignal(p, SIGKILL); proc_lock(p); @@ -2646,11 +2690,9 @@ cs_invalid_page( /* CS_HARD means fail the mapping operation so the process stays valid. */ if (p->p_csflags & CS_HARD) { proc_unlock(p); - if (cs_debug) { - printf("CODE SIGNING: cs_invalid_page(0x%llx): " - "p=%d[%s] honoring CS_HARD\n", - vaddr, p->p_pid, p->p_comm); - } + printf("CODE SIGNING: cs_invalid_page(0x%llx): " + "p=%d[%s] honoring CS_HARD\n", + vaddr, p->p_pid, p->p_comm); retval = 1; } else { if (p->p_csflags & CS_VALID) { @@ -2773,9 +2815,12 @@ proc_resetpcontrol(int pid) proc_t p; int pcontrol; int error; + proc_t self = current_proc(); - if ((error = suser(kauth_cred_get(), 0))) + /* the process must have been validated to handle resource control, or the caller must be root */ + if (((self->p_lflag & P_LVMRSRCOWNER) == 0) && (error = suser(kauth_cred_get(), 0))) return error; + p = proc_find(pid); if (p == PROC_NULL) return(ESRCH); diff --git a/bsd/kern/kern_prot.c b/bsd/kern/kern_prot.c index a084ddf89..d2408a2f3 100644 --- a/bsd/kern/kern_prot.c +++ b/bsd/kern/kern_prot.c @@ -360,9 +360,9 @@ gettid(__unused proc_t p, struct gettid_args *uap, int32_t *retval) if (!(uthread->uu_flag & UT_SETUID)) return (ESRCH); - if ((error = suword(uap->uidp, uthread->uu_ucred->cr_ruid))) + if ((error = suword(uap->uidp, kauth_cred_getruid(uthread->uu_ucred)))) return (error); - if ((error = suword(uap->gidp, uthread->uu_ucred->cr_rgid))) + if ((error = suword(uap->gidp, kauth_cred_getrgid(uthread->uu_ucred)))) return (error); *retval = 0; @@ -448,21 +448,23 @@ getgroups(__unused proc_t p, struct getgroups_args *uap, int32_t *retval) int ngrp; int error; kauth_cred_t cred; + posix_cred_t pcred; /* grab reference while we muck around with the credential */ cred = kauth_cred_get_with_ref(); + pcred = posix_cred_get(cred); if ((ngrp = uap->gidsetsize) == 0) { - *retval = cred->cr_ngroups; + *retval = pcred->cr_ngroups; kauth_cred_unref(&cred); return (0); } - if (ngrp < cred->cr_ngroups) { + if (ngrp < pcred->cr_ngroups) { kauth_cred_unref(&cred); return (EINVAL); } - ngrp = cred->cr_ngroups; - if ((error = copyout((caddr_t)cred->cr_groups, + ngrp = pcred->cr_ngroups; + if ((error = copyout((caddr_t)pcred->cr_groups, uap->gidset, ngrp * sizeof(gid_t)))) { kauth_cred_unref(&cred); @@ -716,17 +718,19 @@ setuid(proc_t p, struct setuid_args *uap, __unused int32_t *retval) uid_t gmuid = KAUTH_UID_NONE; int error; kauth_cred_t my_cred, my_new_cred; + posix_cred_t my_pcred; uid = uap->uid; my_cred = kauth_cred_proc_ref(p); + my_pcred = posix_cred_get(my_cred); DEBUG_CRED_ENTER("setuid (%d/%d): %p %d\n", p->p_pid, (p->p_pptr ? 
p->p_pptr->p_pid : 0), my_cred, uap->uid); AUDIT_ARG(uid, uid); - if (uid != my_cred->cr_ruid && /* allow setuid(getuid()) */ - uid != my_cred->cr_svuid && /* allow setuid(saved uid) */ + if (uid != my_pcred->cr_ruid && /* allow setuid(getuid()) */ + uid != my_pcred->cr_svuid && /* allow setuid(saved uid) */ (error = suser(my_cred, &p->p_acflag))) { kauth_cred_unref(&my_cred); return (error); @@ -747,7 +751,7 @@ setuid(proc_t p, struct setuid_args *uap, __unused int32_t *retval) * chgproccnt uses list lock for protection */ (void)chgproccnt(uid, 1); - (void)chgproccnt(my_cred->cr_ruid, -1); + (void)chgproccnt(my_pcred->cr_ruid, -1); } /* get current credential and take a reference while we muck with it */ @@ -761,7 +765,7 @@ setuid(proc_t p, struct setuid_args *uap, __unused int32_t *retval) * to something other than the default list for the user, as * in entering a group or leaving an exclusion group). */ - if (!(my_cred->cr_flags & CRF_NOMEMBERD)) + if (!(my_pcred->cr_flags & CRF_NOMEMBERD)) gmuid = uid; /* @@ -774,7 +778,7 @@ setuid(proc_t p, struct setuid_args *uap, __unused int32_t *retval) my_new_cred = kauth_cred_setresuid(my_cred, ruid, uid, svuid, gmuid); if (my_cred != my_new_cred) { - DEBUG_CRED_CHANGE("setuid CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_cred->cr_flags, my_new_cred, my_new_cred->cr_flags); + DEBUG_CRED_CHANGE("setuid CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_pcred->cr_flags, my_new_cred, posix_cred_get(my_new_cred)->cr_flags); proc_lock(p); /* @@ -791,6 +795,9 @@ setuid(proc_t p, struct setuid_args *uap, __unused int32_t *retval) continue; } p->p_ucred = my_new_cred; + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); + OSBitOrAtomic(P_SUGID, &p->p_flag); proc_unlock(p); } @@ -828,6 +835,7 @@ seteuid(proc_t p, struct seteuid_args *uap, __unused int32_t *retval) uid_t euid; int error; kauth_cred_t my_cred, my_new_cred; + posix_cred_t my_pcred; DEBUG_CRED_ENTER("seteuid: %d\n", uap->euid); @@ -835,8 +843,9 @@ seteuid(proc_t p, struct seteuid_args *uap, __unused int32_t *retval) AUDIT_ARG(euid, euid); my_cred = kauth_cred_proc_ref(p); + my_pcred = posix_cred_get(my_cred); - if (euid != my_cred->cr_ruid && euid != my_cred->cr_svuid && + if (euid != my_pcred->cr_ruid && euid != my_pcred->cr_svuid && (error = suser(my_cred, &p->p_acflag))) { kauth_cred_unref(&my_cred); return (error); @@ -855,11 +864,11 @@ seteuid(proc_t p, struct seteuid_args *uap, __unused int32_t *retval) * passed in. The subsequent compare is safe, because it is * a pointer compare rather than a contents compare. 
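 * (kauth_cred_setresuid() hands back its argument unchanged when no
 * field actually changes, so "my_cred != my_new_cred" detects that a
 * new credential was really allocated.)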
*/ - my_new_cred = kauth_cred_setresuid(my_cred, KAUTH_UID_NONE, euid, KAUTH_UID_NONE, my_cred->cr_gmuid); + my_new_cred = kauth_cred_setresuid(my_cred, KAUTH_UID_NONE, euid, KAUTH_UID_NONE, my_pcred->cr_gmuid); if (my_cred != my_new_cred) { - DEBUG_CRED_CHANGE("seteuid CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_cred->cr_flags, my_new_cred, my_new_cred->cr_flags); + DEBUG_CRED_CHANGE("seteuid CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_pcred->cr_flags, my_new_cred, posix_cred_get(my_new_cred)->cr_flags); proc_lock(p); /* @@ -876,6 +885,8 @@ seteuid(proc_t p, struct seteuid_args *uap, __unused int32_t *retval) continue; } p->p_ucred = my_new_cred; + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); OSBitOrAtomic(P_SUGID, &p->p_flag); proc_unlock(p); } @@ -926,6 +937,7 @@ setreuid(proc_t p, struct setreuid_args *uap, __unused int32_t *retval) uid_t ruid, euid; int error; kauth_cred_t my_cred, my_new_cred; + posix_cred_t my_pcred; DEBUG_CRED_ENTER("setreuid %d %d\n", uap->ruid, uap->euid); @@ -939,15 +951,16 @@ setreuid(proc_t p, struct setreuid_args *uap, __unused int32_t *retval) AUDIT_ARG(ruid, ruid); my_cred = kauth_cred_proc_ref(p); + my_pcred = posix_cred_get(my_cred); if (((ruid != KAUTH_UID_NONE && /* allow no change of ruid */ - ruid != my_cred->cr_ruid && /* allow ruid = ruid */ - ruid != my_cred->cr_uid && /* allow ruid = euid */ - ruid != my_cred->cr_svuid) || /* allow ruid = svuid */ + ruid != my_pcred->cr_ruid && /* allow ruid = ruid */ + ruid != my_pcred->cr_uid && /* allow ruid = euid */ + ruid != my_pcred->cr_svuid) || /* allow ruid = svuid */ (euid != KAUTH_UID_NONE && /* allow no change of euid */ - euid != my_cred->cr_uid && /* allow euid = euid */ - euid != my_cred->cr_ruid && /* allow euid = ruid */ - euid != my_cred->cr_svuid)) && /* allow euid = svuid */ + euid != my_pcred->cr_uid && /* allow euid = euid */ + euid != my_pcred->cr_ruid && /* allow euid = ruid */ + euid != my_pcred->cr_svuid)) && /* allow euid = svuid */ (error = suser(my_cred, &p->p_acflag))) { /* allow root user any */ kauth_cred_unref(&my_cred); return (error); @@ -963,8 +976,8 @@ setreuid(proc_t p, struct setreuid_args *uap, __unused int32_t *retval) uid_t new_ruid; uid_t svuid = KAUTH_UID_NONE; - new_euid = my_cred->cr_uid; - new_ruid = my_cred->cr_ruid; + new_euid = my_pcred->cr_uid; + new_ruid = my_pcred->cr_ruid; /* * Set the credential with new info. If there is no change, @@ -973,16 +986,16 @@ setreuid(proc_t p, struct setreuid_args *uap, __unused int32_t *retval) * passed in. The subsequent compare is safe, because it is * a pointer compare rather than a contents compare. 
*/ - if (my_cred->cr_svuid != uap->ruid && - my_cred->cr_svuid != uap->euid) { + if (my_pcred->cr_svuid != uap->ruid && + my_pcred->cr_svuid != uap->euid) { svuid = new_euid; OSBitOrAtomic(P_SUGID, &p->p_flag); } - my_new_cred = kauth_cred_setresuid(my_cred, ruid, euid, svuid, my_cred->cr_gmuid); + my_new_cred = kauth_cred_setresuid(my_cred, ruid, euid, svuid, my_pcred->cr_gmuid); if (my_cred != my_new_cred) { - DEBUG_CRED_CHANGE("setreuid CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_cred->cr_flags, my_new_cred, my_new_cred->cr_flags); + DEBUG_CRED_CHANGE("setreuid CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_pcred->cr_flags, my_new_cred, posix_cred_get(my_new_cred)->cr_flags); proc_lock(p); /* @@ -1019,6 +1032,8 @@ setreuid(proc_t p, struct setreuid_args *uap, __unused int32_t *retval) continue; } p->p_ucred = my_new_cred; + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); OSBitOrAtomic(P_SUGID, &p->p_flag); /* XXX redundant? */ proc_unlock(p); } @@ -1065,6 +1080,7 @@ setgid(proc_t p, struct setgid_args *uap, __unused int32_t *retval) gid_t svgid = KAUTH_GID_NONE; int error; kauth_cred_t my_cred, my_new_cred; + posix_cred_t my_pcred; DEBUG_CRED_ENTER("setgid(%d/%d): %d\n", p->p_pid, (p->p_pptr ? p->p_pptr->p_pid : 0), uap->gid); @@ -1072,9 +1088,10 @@ setgid(proc_t p, struct setgid_args *uap, __unused int32_t *retval) AUDIT_ARG(gid, gid); my_cred = kauth_cred_proc_ref(p); + my_pcred = posix_cred_get(my_cred); - if (gid != my_cred->cr_rgid && /* allow setgid(getgid()) */ - gid != my_cred->cr_svgid && /* allow setgid(saved gid) */ + if (gid != my_pcred->cr_rgid && /* allow setgid(getgid()) */ + gid != my_pcred->cr_svgid && /* allow setgid(saved gid) */ (error = suser(my_cred, &p->p_acflag))) { kauth_cred_unref(&my_cred); return (error); @@ -1119,6 +1136,8 @@ setgid(proc_t p, struct setgid_args *uap, __unused int32_t *retval) continue; } p->p_ucred = my_new_cred; + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); OSBitOrAtomic(P_SUGID, &p->p_flag); proc_unlock(p); } @@ -1161,6 +1180,7 @@ setegid(proc_t p, struct setegid_args *uap, __unused int32_t *retval) gid_t egid; int error; kauth_cred_t my_cred, my_new_cred; + posix_cred_t my_pcred; DEBUG_CRED_ENTER("setegid %d\n", uap->egid); @@ -1168,9 +1188,10 @@ setegid(proc_t p, struct setegid_args *uap, __unused int32_t *retval) AUDIT_ARG(egid, egid); my_cred = kauth_cred_proc_ref(p); + my_pcred = posix_cred_get(my_cred); - if (egid != my_cred->cr_rgid && - egid != my_cred->cr_svgid && + if (egid != my_pcred->cr_rgid && + egid != my_pcred->cr_svgid && (error = suser(my_cred, &p->p_acflag))) { kauth_cred_unref(&my_cred); return (error); @@ -1188,7 +1209,7 @@ setegid(proc_t p, struct setegid_args *uap, __unused int32_t *retval) my_new_cred = kauth_cred_setresgid(my_cred, KAUTH_GID_NONE, egid, KAUTH_GID_NONE); if (my_cred != my_new_cred) { - DEBUG_CRED_CHANGE("setegid(CH)%d: %p/0x%08x->%p/0x%08x\n", p->p_pid, my_cred, my_cred->cr_flags, my_new_cred, my_new_cred->cr_flags); + DEBUG_CRED_CHANGE("setegid(CH)%d: %p/0x%08x->%p/0x%08x\n", p->p_pid, my_cred, my_pcred->cr_flags, my_new_cred, posix_cred_get(my_new_cred)->cr_flags); proc_lock(p); /* @@ -1205,6 +1226,8 @@ setegid(proc_t p, struct setegid_args *uap, __unused int32_t *retval) continue; } p->p_ucred = my_new_cred; + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); OSBitOrAtomic(P_SUGID, &p->p_flag); proc_unlock(p); } @@ -1261,6 +1284,7 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval) gid_t rgid, egid; int error; kauth_cred_t 
my_cred, my_new_cred; + posix_cred_t my_pcred; DEBUG_CRED_ENTER("setregid %d %d\n", uap->rgid, uap->egid); @@ -1275,16 +1299,17 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval) AUDIT_ARG(rgid, rgid); my_cred = kauth_cred_proc_ref(p); + my_pcred = posix_cred_get(my_cred); if (((rgid != KAUTH_UID_NONE && /* allow no change of rgid */ - rgid != my_cred->cr_rgid && /* allow rgid = rgid */ - rgid != my_cred->cr_gid && /* allow rgid = egid */ - rgid != my_cred->cr_svgid) || /* allow rgid = svgid */ + rgid != my_pcred->cr_rgid && /* allow rgid = rgid */ + rgid != my_pcred->cr_gid && /* allow rgid = egid */ + rgid != my_pcred->cr_svgid) || /* allow rgid = svgid */ (egid != KAUTH_UID_NONE && /* allow no change of egid */ - egid != my_cred->cr_groups[0] && /* allow no change of egid */ - egid != my_cred->cr_gid && /* allow egid = egid */ - egid != my_cred->cr_rgid && /* allow egid = rgid */ - egid != my_cred->cr_svgid)) && /* allow egid = svgid */ + egid != my_pcred->cr_groups[0] && /* allow no change of egid */ + egid != my_pcred->cr_gid && /* allow egid = egid */ + egid != my_pcred->cr_rgid && /* allow egid = rgid */ + egid != my_pcred->cr_svgid)) && /* allow egid = svgid */ (error = suser(my_cred, &p->p_acflag))) { /* allow root user any */ kauth_cred_unref(&my_cred); return (error); @@ -1292,8 +1317,8 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval) /* get current credential and take a reference while we muck with it */ for (;;) { - uid_t new_egid = my_cred->cr_gid; - uid_t new_rgid = my_cred->cr_rgid; + uid_t new_egid = my_pcred->cr_gid; + uid_t new_rgid = my_pcred->cr_rgid; uid_t svgid = KAUTH_UID_NONE; @@ -1304,12 +1329,12 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval) * passed in. The subsequent compare is safe, because it is * a pointer compare rather than a contents compare. */ - if (egid == KAUTH_UID_NONE && my_cred->cr_groups[0] != egid) { + if (egid == KAUTH_UID_NONE && my_pcred->cr_gid != egid) { /* changing the effective GID */ new_egid = egid; OSBitOrAtomic(P_SUGID, &p->p_flag); } - if (rgid != KAUTH_UID_NONE && my_cred->cr_rgid != rgid) { + if (rgid != KAUTH_UID_NONE && my_pcred->cr_rgid != rgid) { /* changing the real GID */ new_rgid = rgid; OSBitOrAtomic(P_SUGID, &p->p_flag); @@ -1320,8 +1345,8 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval) * new effective gid. We are protected from escalation * by the prechecking. */ - if (my_cred->cr_svgid != uap->rgid && - my_cred->cr_svgid != uap->egid) { + if (my_pcred->cr_svgid != uap->rgid && + my_pcred->cr_svgid != uap->egid) { svgid = new_egid; OSBitOrAtomic(P_SUGID, &p->p_flag); } @@ -1329,7 +1354,7 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval) my_new_cred = kauth_cred_setresgid(my_cred, rgid, egid, svgid); if (my_cred != my_new_cred) { - DEBUG_CRED_CHANGE("setregid(CH)%d: %p/0x%08x->%p/0x%08x\n", p->p_pid, my_cred, my_cred->cr_flags, my_new_cred, my_new_cred->cr_flags); + DEBUG_CRED_CHANGE("setregid(CH)%d: %p/0x%08x->%p/0x%08x\n", p->p_pid, my_cred, my_pcred->cr_flags, my_new_cred, posix_cred_get(my_new_cred)->cr_flags); proc_lock(p); /* need to protect for a race where another thread @@ -1345,6 +1370,8 @@ setregid(proc_t p, struct setregid_args *uap, __unused int32_t *retval) continue; } p->p_ucred = my_new_cred; + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); OSBitOrAtomic(P_SUGID, &p->p_flag); /* XXX redundant? 
*/ proc_unlock(p); } @@ -1444,6 +1471,7 @@ settid_with_pid(proc_t p, struct settid_with_pid_args *uap, __unused int32_t *re proc_t target_proc; struct uthread *uthread = get_bsdthread_info(current_thread()); kauth_cred_t my_cred, my_target_cred, my_new_cred; + posix_cred_t my_target_pcred; AUDIT_ARG(pid, uap->pid); AUDIT_ARG(value32, uap->assume); @@ -1491,7 +1519,8 @@ settid_with_pid(proc_t p, struct settid_with_pid_args *uap, __unused int32_t *re kauth_cred_ref(uthread->uu_ucred); my_cred = uthread->uu_ucred; my_target_cred = kauth_cred_proc_ref(target_proc); - my_new_cred = kauth_cred_setuidgid(my_cred, my_target_cred->cr_uid, my_target_cred->cr_gid); + my_target_pcred = posix_cred_get(my_target_cred); + my_new_cred = kauth_cred_setuidgid(my_cred, my_target_pcred->cr_uid, my_target_pcred->cr_gid); if (my_cred != my_new_cred) uthread->uu_ucred = my_new_cred; @@ -1647,13 +1676,15 @@ setgroups1(proc_t p, u_int gidsetsize, user_addr_t gidset, uid_t gmuid, __unused continue; } p->p_ucred = my_new_cred; + /* update cred on proc */ + PROC_UPDATE_CREDS_ONPROC(p); OSBitOrAtomic(P_SUGID, &p->p_flag); proc_unlock(p); } break; } /* Drop old proc reference or our extra reference */ - AUDIT_ARG(groupset, my_cred->cr_groups, ngrp); + AUDIT_ARG(groupset, posix_cred_get(my_cred)->cr_groups, ngrp); kauth_cred_unref(&my_cred); @@ -1835,15 +1866,17 @@ is_suser1(void) { proc_t p = current_proc(); kauth_cred_t my_cred; + posix_cred_t my_pcred; int err; if (!p) return (0); my_cred = kauth_cred_proc_ref(p); + my_pcred = posix_cred_get(my_cred); err = (suser(my_cred, &p->p_acflag) == 0 || - my_cred->cr_ruid == 0 || my_cred->cr_svuid == 0); + my_pcred->cr_ruid == 0 || my_pcred->cr_svuid == 0); kauth_cred_unref(&my_cred); return(err); } @@ -1959,6 +1992,7 @@ set_security_token(proc_t p) security_token_t sec_token; audit_token_t audit_token; kauth_cred_t my_cred; + posix_cred_t my_pcred; host_priv_t host_priv; /* @@ -1975,10 +2009,12 @@ set_security_token(proc_t p) } my_cred = kauth_cred_proc_ref(p); + my_pcred = posix_cred_get(my_cred); + /* XXX mach_init doesn't have a p_ucred when it calls this function */ if (IS_VALID_CRED(my_cred)) { sec_token.val[0] = kauth_cred_getuid(my_cred); - sec_token.val[1] = my_cred->cr_gid; + sec_token.val[1] = kauth_cred_getgid(my_cred); } else { sec_token.val[0] = 0; sec_token.val[1] = 0; @@ -1994,10 +2030,10 @@ set_security_token(proc_t p) * changes. 
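 *
 * (The eight slots filled in below are: audit user id, effective uid,
 * effective gid, real uid, real gid, pid, audit session id, and the
 * process id-version, in that order.)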
*/ audit_token.val[0] = my_cred->cr_audit.as_aia_p->ai_auid; - audit_token.val[1] = my_cred->cr_uid; - audit_token.val[2] = my_cred->cr_gid; - audit_token.val[3] = my_cred->cr_ruid; - audit_token.val[4] = my_cred->cr_rgid; + audit_token.val[1] = my_pcred->cr_uid; + audit_token.val[2] = my_pcred->cr_gid; + audit_token.val[3] = my_pcred->cr_ruid; + audit_token.val[4] = my_pcred->cr_rgid; audit_token.val[5] = p->p_pid; audit_token.val[6] = my_cred->cr_audit.as_aia_p->ai_asid; audit_token.val[7] = p->p_idversion; @@ -2028,12 +2064,13 @@ __private_extern__ void cru2x(kauth_cred_t cr, struct xucred *xcr) { + posix_cred_t pcr = posix_cred_get(cr); bzero(xcr, sizeof(*xcr)); xcr->cr_version = XUCRED_VERSION; xcr->cr_uid = kauth_cred_getuid(cr); - xcr->cr_ngroups = cr->cr_ngroups; - bcopy(cr->cr_groups, xcr->cr_groups, sizeof(xcr->cr_groups)); + xcr->cr_ngroups = pcr->cr_ngroups; + bcopy(pcr->cr_groups, xcr->cr_groups, sizeof(xcr->cr_groups)); } #if CONFIG_LCTX diff --git a/bsd/kern/kern_resource.c b/bsd/kern/kern_resource.c index 13b124887..d2473dbf0 100644 --- a/bsd/kern/kern_resource.c +++ b/bsd/kern/kern_resource.c @@ -111,8 +111,9 @@ int donice(struct proc *curp, struct proc *chgp, int n); int dosetrlimit(struct proc *p, u_int which, struct rlimit *limp); int uthread_get_background_state(uthread_t); static void do_background_socket(struct proc *p, thread_t thread, int priority); -static int do_background_thread(struct proc *curp, int priority); -static int do_background_task(struct proc *curp, int priority); +static int do_background_thread(struct proc *curp, thread_t thread, int priority); +static int do_background_proc(struct proc *curp, struct proc *targetp, int priority); +void proc_apply_task_networkbg_internal(proc_t); rlim_t maxdmap = MAXDSIZ; /* XXX */ rlim_t maxsmap = MAXSSIZ - PAGE_SIZE; /* XXX */ @@ -125,10 +126,10 @@ rlim_t maxsmap = MAXSSIZ - PAGE_SIZE; /* XXX */ */ __private_extern__ int maxfilesperproc = OPEN_MAX; /* per-proc open files limit */ -SYSCTL_INT( _kern, KERN_MAXPROCPERUID, maxprocperuid, CTLFLAG_RW, +SYSCTL_INT(_kern, KERN_MAXPROCPERUID, maxprocperuid, CTLFLAG_RW | CTLFLAG_LOCKED, &maxprocperuid, 0, "Maximum processes allowed per userid" ); -SYSCTL_INT( _kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, +SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW | CTLFLAG_LOCKED, &maxfilesperproc, 0, "Maximum files allowed open per process" ); /* Args and fn for proc_iteration callback used in setpriority */ @@ -371,8 +372,10 @@ setpriority(struct proc *curp, struct setpriority_args *uap, __unused int32_t *r if (uap->who != 0) { return (EINVAL); } - error = do_background_thread(curp, uap->prio); - (void) do_background_socket(curp, current_thread(), uap->prio); + error = do_background_thread(curp, current_thread(), uap->prio); + if (!error) { + (void) do_background_socket(curp, current_thread(), uap->prio); + } found++; break; } @@ -387,8 +390,10 @@ setpriority(struct proc *curp, struct setpriority_args *uap, __unused int32_t *r refheld = 1; } - error = do_background_task(p, uap->prio); - (void) do_background_socket(p, NULL, uap->prio); + error = do_background_proc(curp, p, uap->prio); + if (!error) { + (void) do_background_socket(p, NULL, uap->prio); + } found++; if (refheld != 0) @@ -421,9 +426,9 @@ donice(struct proc *curp, struct proc *chgp, int n) ucred = kauth_cred_proc_ref(curp); my_cred = kauth_cred_proc_ref(chgp); - if (suser(ucred, NULL) && ucred->cr_ruid && + if (suser(ucred, NULL) && kauth_cred_getruid(ucred) && kauth_cred_getuid(ucred) != 
kauth_cred_getuid(my_cred) && - ucred->cr_ruid != kauth_cred_getuid(my_cred)) { + kauth_cred_getruid(ucred) != kauth_cred_getuid(my_cred)) { error = EPERM; goto out; } @@ -451,19 +456,53 @@ out: } static int -do_background_task(struct proc *p, int priority) +do_background_proc(struct proc *curp, struct proc *targetp, int priority) { int error = 0; + kauth_cred_t ucred; + kauth_cred_t target_cred; +#if CONFIG_EMBEDDED task_category_policy_data_t info; +#endif + + ucred = kauth_cred_get(); + target_cred = kauth_cred_proc_ref(targetp); + + if (!kauth_cred_issuser(ucred) && kauth_cred_getruid(ucred) && + kauth_cred_getuid(ucred) != kauth_cred_getuid(target_cred) && + kauth_cred_getruid(ucred) != kauth_cred_getuid(target_cred)) + { + error = EPERM; + goto out; + } + +#if CONFIG_MACF + error = mac_proc_check_sched(curp, targetp); + if (error) + goto out; +#endif + +#if !CONFIG_EMBEDDED + if (priority == PRIO_DARWIN_NONUI) + error = proc_apply_task_gpuacc(targetp->task, TASK_POLICY_HWACCESS_GPU_ATTRIBUTE_NOACCESS); + else + error = proc_set1_bgtaskpolicy(targetp->task, priority); + if (error) + goto out; +#else /* !CONFIG_EMBEDDED */ /* set the max scheduling priority on the task */ - if (priority & PRIO_DARWIN_BG) { + if (priority == PRIO_DARWIN_BG) { info.role = TASK_THROTTLE_APPLICATION; - } else { + } + else if (priority == PRIO_DARWIN_NONUI) { + info.role = TASK_NONUI_APPLICATION; + } + else { info.role = TASK_DEFAULT_APPLICATION; } - error = task_policy_set(p->task, + error = task_policy_set(targetp->task, TASK_CATEGORY_POLICY, (task_policy_t) &info, TASK_CATEGORY_POLICY_COUNT); @@ -471,22 +510,24 @@ do_background_task(struct proc *p, int priority) if (error) goto out; - proc_lock(p); + proc_lock(targetp); /* mark proc structure as backgrounded */ - if (priority & PRIO_DARWIN_BG) { - p->p_lflag |= P_LBACKGROUND; + if (priority == PRIO_DARWIN_BG) { + targetp->p_lflag |= P_LBACKGROUND; } else { - p->p_lflag &= ~P_LBACKGROUND; + targetp->p_lflag &= ~P_LBACKGROUND; } /* set or reset the disk I/O priority */ - p->p_iopol_disk = (priority == PRIO_DARWIN_BG ? + targetp->p_iopol_disk = (priority == PRIO_DARWIN_BG ? IOPOL_THROTTLE : IOPOL_DEFAULT); - proc_unlock(p); + proc_unlock(targetp); +#endif /* !CONFIG_EMBEDDED */ out: + kauth_cred_unref(&target_cred); return (error); } @@ -497,7 +538,7 @@ do_background_socket(struct proc *p, thread_t thread, int priority) struct fileproc *fp; int i; - if (priority & PRIO_DARWIN_BG) { + if (priority == PRIO_DARWIN_BG) { /* * For PRIO_DARWIN_PROCESS (thread is NULL), simply mark * the sockets with the background flag. There's nothing @@ -523,12 +564,6 @@ do_background_socket(struct proc *p, thread_t thread, int priority) } } else { - u_int32_t traffic_mgt; - /* - * See comments on do_background_thread(). Deregulate network - * traffics only for setpriority(PRIO_DARWIN_THREAD). - */ - traffic_mgt = (thread == NULL) ? 0 : TRAFFIC_MGT_SO_BG_REGULATE; /* disable networking IO throttle. * NOTE - It is a known limitation of the current design that we @@ -550,7 +585,7 @@ do_background_socket(struct proc *p, thread_t thread, int priority) if ((thread) && (sockp->so_background_thread != thread)) { continue; } - socket_clear_traffic_mgt_flags(sockp, TRAFFIC_MGT_SO_BACKGROUND | traffic_mgt); + socket_clear_traffic_mgt_flags(sockp, TRAFFIC_MGT_SO_BACKGROUND); sockp->so_background_thread = NULL; } proc_fdunlock(p); @@ -572,15 +607,26 @@ do_background_socket(struct proc *p, thread_t thread, int priority) * and only TRAFFIC_MGT_SO_BACKGROUND is set via do_background_socket(). 
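 *
 * Typical user space usage, where the third argument selects background
 * or default scheduling for the calling thread:
 *
 *	setpriority(PRIO_DARWIN_THREAD, 0, PRIO_DARWIN_BG);
 *	setpriority(PRIO_DARWIN_THREAD, 0, 0);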
*/ static int -do_background_thread(struct proc *curp __unused, int priority) +do_background_thread(struct proc *curp __unused, thread_t thread, int priority) { - thread_t thread; struct uthread *ut; +#if !CONFIG_EMBEDDED + int error = 0; +#else /* !CONFIG_EMBEDDED */ thread_precedence_policy_data_t policy; +#endif /* !CONFIG_EMBEDDED */ - thread = current_thread(); ut = get_bsdthread_info(thread); + /* Backgrounding is unsupported for threads in vfork */ + if ( (ut->uu_flag & UT_VFORK) != 0) { + return(EPERM); + } + +#if !CONFIG_EMBEDDED + error = proc_set1_bgthreadpolicy(curp->task, thread_tid(thread), priority); + return(error); +#else /* !CONFIG_EMBEDDED */ if ( (priority & PRIO_DARWIN_BG) == 0 ) { /* turn off backgrounding of thread */ if ( (ut->uu_flag & UT_BACKGROUND) == 0 ) { @@ -630,9 +676,57 @@ do_background_thread(struct proc *curp __unused, int priority) * thread then TRAFFIC_MGT_SO_{BACKGROUND,BG_REGULATE} is set. * Existing sockets are taken care of by do_background_socket(). */ +#endif /* !CONFIG_EMBEDDED */ return(0); } +#if CONFIG_EMBEDDED +int mach_do_background_thread(thread_t thread, int prio); + +int +mach_do_background_thread(thread_t thread, int prio) +{ + int error = 0; + struct proc *curp = NULL; + struct proc *targetp = NULL; + kauth_cred_t ucred; + + targetp = get_bsdtask_info(get_threadtask(thread)); + if (!targetp) { + return KERN_INVALID_ARGUMENT; + } + + curp = proc_self(); + if (curp == PROC_NULL) { + return KERN_FAILURE; + } + + ucred = kauth_cred_proc_ref(curp); + + if (suser(ucred, NULL) && curp != targetp) { + error = KERN_PROTECTION_FAILURE; + goto out; + } + + error = do_background_thread(curp, thread, prio); + if (!error) { + (void) do_background_socket(curp, thread, prio); + } else { + if (error == EPERM) { + error = KERN_PROTECTION_FAILURE; + } else { + error = KERN_FAILURE; + } + } + +out: + proc_rele(curp); + kauth_cred_unref(&ucred); + return error; +} +#endif /* CONFIG_EMBEDDED */ + +#if CONFIG_EMBEDDED /* * If the thread or its proc has been put into the background * with setpriority(PRIO_DARWIN_{THREAD,PROCESS}, *, PRIO_DARWIN_BG), @@ -653,6 +747,7 @@ uthread_get_background_state(uthread_t uth) return 0; } +#endif /* CONFIG_EMBEDDED */ /* * Returns: 0 Success @@ -1234,19 +1329,70 @@ int iopolicysys(__unused struct proc *p, __unused struct iopolicysys_args *uap, __unused int32_t *retval) { int error = 0; + struct _iopol_param_t iop_param; +#if !CONFIG_EMBEDDED + int processwide = 0; +#else /* !CONFIG_EMBEDDED */ thread_t thread = THREAD_NULL; - int *policy; struct uthread *ut = NULL; - struct _iopol_param_t iop_param; + int *policy; +#endif /* !CONFIG_EMBEDDED */ if ((error = copyin(uap->arg, &iop_param, sizeof(iop_param))) != 0) - goto exit; + goto out; if (iop_param.iop_iotype != IOPOL_TYPE_DISK) { error = EINVAL; - goto exit; + goto out; + } + +#if !CONFIG_EMBEDDED + switch (iop_param.iop_scope) { + case IOPOL_SCOPE_PROCESS: + processwide = 1; + break; + case IOPOL_SCOPE_THREAD: + processwide = 0; + break; + default: + error = EINVAL; + goto out; } + + switch(uap->cmd) { + case IOPOL_CMD_SET: + switch (iop_param.iop_policy) { + case IOPOL_DEFAULT: + case IOPOL_NORMAL: + case IOPOL_THROTTLE: + case IOPOL_PASSIVE: + if(processwide != 0) + proc_apply_task_diskacc(current_task(), iop_param.iop_policy); + else + proc_apply_thread_selfdiskacc(iop_param.iop_policy); + + break; + default: + error = EINVAL; + goto out; + } + break; + + case IOPOL_CMD_GET: + if(processwide != 0) + iop_param.iop_policy = proc_get_task_disacc(current_task()); + else + 
iop_param.iop_policy = proc_get_thread_selfdiskacc(); + + error = copyout((caddr_t)&iop_param, uap->arg, sizeof(iop_param)); + break; + default: + error = EINVAL; // unknown command + break; + } + +#else /* !CONFIG_EMBEDDED */ switch (iop_param.iop_scope) { case IOPOL_SCOPE_PROCESS: policy = &p->p_iopol_disk; @@ -1258,7 +1404,7 @@ iopolicysys(__unused struct proc *p, __unused struct iopolicysys_args *uap, __un break; default: error = EINVAL; - goto exit; + goto out; } switch(uap->cmd) { @@ -1274,7 +1420,7 @@ iopolicysys(__unused struct proc *p, __unused struct iopolicysys_args *uap, __un break; default: error = EINVAL; - goto exit; + goto out; } break; case IOPOL_CMD_GET: @@ -1300,7 +1446,8 @@ iopolicysys(__unused struct proc *p, __unused struct iopolicysys_args *uap, __un break; } - exit: +#endif /* !CONFIG_EMBEDDED */ +out: *retval = error; return (error); } @@ -1309,8 +1456,14 @@ iopolicysys(__unused struct proc *p, __unused struct iopolicysys_args *uap, __un boolean_t thread_is_io_throttled(void); boolean_t -thread_is_io_throttled(void) { +thread_is_io_throttled(void) +{ + +#if !CONFIG_EMBEDDED + return(proc_get_task_selfdiskacc() == IOPOL_THROTTLE); + +#else /* !CONFIG_EMBEDDED */ int policy; struct uthread *ut; @@ -1326,4 +1479,54 @@ thread_is_io_throttled(void) { return TRUE; } return FALSE; +#endif /* !CONFIG_EMBEDDED */ +} + +void +proc_apply_task_networkbg(void * bsd_info) +{ + proc_t p = PROC_NULL; + proc_t curp = (proc_t)bsd_info; + pid_t pid; + + pid = curp->p_pid; + p = proc_find(pid); + if (p != PROC_NULL) { + do_background_socket(p, NULL, PRIO_DARWIN_BG); + proc_rele(p); + } +} + +void +proc_restore_task_networkbg(void * bsd_info) +{ + proc_t p = PROC_NULL; + proc_t curp = (proc_t)bsd_info; + pid_t pid; + + pid = curp->p_pid; + p = proc_find(pid); + if (p != PROC_NULL) { + do_background_socket(p, NULL, 0); + proc_rele(p); + } + +} + +void +proc_set_task_networkbg(void * bsdinfo, int setbg) +{ + if (setbg != 0) + proc_apply_task_networkbg(bsdinfo); + else + proc_restore_task_networkbg(bsdinfo); } + +void +proc_apply_task_networkbg_internal(proc_t p) +{ + if (p != PROC_NULL) { + do_background_socket(p, NULL, PRIO_DARWIN_BG); + } +} + diff --git a/bsd/kern/kern_shutdown.c b/bsd/kern/kern_shutdown.c index f8984bb3c..4e231826d 100644 --- a/bsd/kern/kern_shutdown.c +++ b/bsd/kern/kern_shutdown.c @@ -67,16 +67,16 @@ #include <mach/task.h> /* for task_suspend() */ #include <sys/sysproto.h> /* abused for sync() */ #include <kern/clock.h> /* for delay_for_interval() */ +#include <libkern/OSAtomic.h> #include <sys/kdebug.h> -int system_inshutdown = 0; +uint32_t system_inshutdown = 0; /* XXX should be in a header file somewhere, but isn't */ extern void md_prepare_for_shutdown(int, int, char *); extern void (*unmountroot_pre_hook)(void); -int waittime = -1; unsigned int proc_shutdown_exitcount = 0; static int sd_openlog(vfs_context_t); @@ -109,37 +109,34 @@ static int sd_callback1(proc_t p, void * arg); static int sd_callback2(proc_t p, void * arg); static int sd_callback3(proc_t p, void * arg); -void +int boot(int paniced, int howto, char *command) { struct proc *p = current_proc(); /* XXX */ int hostboot_option=0; - int funnel_state; - system_inshutdown = 1; - - funnel_state = thread_funnel_set(kernel_flock, TRUE); - - /* - * Temporary hack to notify the power management root domain - * that the system will shut down. 
- */ + if (!OSCompareAndSwap(0, 1, &system_inshutdown)) { + if ( (howto&RB_QUICK) == RB_QUICK) + goto force_reboot; + return (EBUSY); + } + /* + * Temporary hack to notify the power management root domain + * that the system will shut down. + */ IOSystemShutdownNotification(); md_prepare_for_shutdown(paniced, howto, command); - if ((howto&RB_QUICK)==RB_QUICK && waittime < 0) { - waittime = 0; + if ((howto&RB_QUICK)==RB_QUICK) { printf("Quick reboot...\n"); if ((howto&RB_NOSYNC)==0) { sync(p, (void *)NULL, (int *)NULL); } } - else if ((howto&RB_NOSYNC)==0 && waittime < 0) { + else if ((howto&RB_NOSYNC)==0) { int iter, nbusy; - waittime = 0; - printf("syncing disks... "); /* @@ -150,7 +147,7 @@ boot(int paniced, int howto, char *command) proc_shutdown(); #if CONFIG_AUDIT - audit_shutdown(); + audit_shutdown(); #endif if (unmountroot_pre_hook != NULL) @@ -162,7 +159,7 @@ boot(int paniced, int howto, char *command) * Now that all processes have been terminated and system is * sync'ed up, suspend init */ - + if (initproc && p != initproc) task_suspend(initproc->task); @@ -187,7 +184,6 @@ boot(int paniced, int howto, char *command) else printf("done\n"); } - #if NETWORKING /* * Can't just use an splnet() here to disable the network @@ -197,6 +193,7 @@ if_down_all(); #endif /* NETWORKING */ +force_reboot: if (howto & RB_POWERDOWN) hostboot_option = HOST_REBOOT_HALT; if (howto & RB_HALT) hostboot_option = HOST_REBOOT_HALT; if (paniced == RB_PANIC) hostboot_option = HOST_REBOOT_HALT; - if (howto & RB_UPSDELAY) { - hostboot_option = HOST_REBOOT_UPSDELAY; - } + if (howto & RB_UPSDELAY) { + hostboot_option = HOST_REBOOT_UPSDELAY; + } host_reboot(host_priv_self(), hostboot_option); - - thread_funnel_set(kernel_flock, FALSE); + /* + * should not be reached + */ + return (0); } static int diff --git a/bsd/kern/kern_sig.c b/bsd/kern/kern_sig.c index e0ded6e4c..de5455812 100644 --- a/bsd/kern/kern_sig.c +++ b/bsd/kern/kern_sig.c @@ -334,10 +334,10 @@ cansignal(proc_t p, kauth_cred_t uc, proc_t q, int signum, int zombie) else my_cred = proc_ucred(q); - if (uc->cr_ruid == my_cred->cr_ruid || - uc->cr_ruid == my_cred->cr_svuid || - kauth_cred_getuid(uc) == my_cred->cr_ruid || - kauth_cred_getuid(uc) == my_cred->cr_svuid) { + if (kauth_cred_getruid(uc) == kauth_cred_getruid(my_cred) || + kauth_cred_getruid(uc) == kauth_cred_getsvuid(my_cred) || + kauth_cred_getuid(uc) == kauth_cred_getruid(my_cred) || + kauth_cred_getuid(uc) == kauth_cred_getsvuid(my_cred)) { if (zombie == 0) kauth_cred_unref(&my_cred); return (1); @@ -566,7 +566,7 @@ set_procsigmask(proc_t p, int bit) * process/thread pair. * * We mark thread as unused to allow compilation without warning - * onnon-PPC platforms. + * on non-PPC platforms. */ int setsigvec(proc_t p, __unused thread_t thread, int signum, struct __kern_sigaction *sa, boolean_t in_sigstart) @@ -623,14 +623,6 @@ setsigvec(proc_t p, __unused thread_t thread, int signum, struct __kern_sigactio OSBitAndAtomic(~((uint32_t)P_NOCLDWAIT), &p->p_flag); } -#ifdef __ppc__ - if (signum == SIGFPE) { - if (sa->sa_handler == SIG_DFL || sa->sa_handler == SIG_IGN) - thread_enable_fpe(thread, 0); - else - thread_enable_fpe(thread, 1); - } -#endif /* __ppc__ */ /* * Set bit in p_sigignore for signals that are set to SIG_IGN, * and for signals set to SIG_DFL where the default is to ignore. 
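 * (e.g. SIGCHLD, SIGURG, SIGWINCH, SIGIO and SIGINFO, whose entries in
 * the sigprop[] table carry SA_IGNORE).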
@@ -1749,34 +1741,35 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) if (flavor & PSIG_VFORK) { sig_task = task; sig_thread = thread; - sig_proc= p; + sig_proc = p; } else if (flavor & PSIG_THREAD) { sig_task = get_threadtask(thread); sig_thread = thread; sig_proc = (proc_t)get_bsdtask_info(sig_task); } else { sig_task = p->task; - sig_proc = p; sig_thread = (struct thread *)0; + sig_proc = p; } - if (((sig_task == TASK_NULL) || is_kerneltask(sig_task))) { + + if ((sig_task == TASK_NULL) || is_kerneltask(sig_task)) return; - } /* * do not send signals to the process that has the thread * doing a reboot(). Not doing so will mark that thread aborted - * and can cause IO failures wich will cause data loss. + * and can cause IO failures which will cause data loss. There's + * also no need to send a signal to a process that is in the middle + * of being torn down. */ - if (ISSET(sig_proc->p_flag, P_REBOOT)) { + if (ISSET(sig_proc->p_flag, P_REBOOT) || + ISSET(sig_proc->p_lflag, P_LEXIT)) return; - } if( (flavor & (PSIG_VFORK | PSIG_THREAD)) == 0) { proc_knote(sig_proc, NOTE_SIGNAL | signum); } - if ((flavor & PSIG_LOCKED)== 0) proc_signalstart(sig_proc, 0); @@ -2027,7 +2020,7 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) if (( pp != PROC_NULL) && ((pp->p_flag & P_NOCLDSTOP) == 0)) { my_cred = kauth_cred_proc_ref(sig_proc); - r_uid = my_cred->cr_ruid; + r_uid = kauth_cred_getruid(my_cred); kauth_cred_unref(&my_cred); proc_lock(sig_proc); @@ -2077,6 +2070,14 @@ psignal_internal(proc_t p, task_t task, thread_t thread, int flavor, int signum) */ sig_proc->p_stat = SRUN; proc_unlock(sig_proc); + /* + * In scenarios where suspend/resume are racing + * the signal, we may have missed AST_BSD by the time + * we get here; set it again to avoid the race. This + * was the scenario with spindump-enabled shutdowns. + * We will need to cover this appropriately down the line. + */ + act_set_astbsd(sig_thread); thread_abort(sig_thread); goto psigout; @@ -2281,7 +2282,7 @@ issignal(proc_t p) } else { proc_unlock(p); my_cred = kauth_cred_proc_ref(p); - r_uid = my_cred->cr_ruid; + r_uid = kauth_cred_getruid(my_cred); kauth_cred_unref(&my_cred); pp = proc_parentholdref(p); @@ -2445,7 +2446,7 @@ issignal(proc_t p) stop(p, pp); if ((pp != PROC_NULL) && ((pp->p_flag & P_NOCLDSTOP) == 0)) { my_cred = kauth_cred_proc_ref(p); - r_uid = my_cred->cr_ruid; + r_uid = kauth_cred_getruid(my_cred); kauth_cred_unref(&my_cred); proc_lock(pp); @@ -2501,7 +2502,7 @@ issignal(proc_t p) } /* NOTREACHED */ out: - proc_signalend(p,1); + proc_signalend(p, 1); proc_unlock(p); return(retval); } @@ -2538,6 +2539,7 @@ CURSIG(proc_t p) signum = ffs((long)sigbits); mask = sigmask(signum); prop = sigprop[signum]; + sigbits &= ~mask; /* take the signal out */ /* * We should see pending but ignored signals @@ -2546,14 +2548,8 @@ if (mask & p->p_sigignore && (p->p_lflag & P_LTRACED) == 0) { continue; } + if (p->p_lflag & P_LTRACED && (p->p_lflag & P_LPPWAIT) == 0) { - /* - * Put the new signal into p_siglist. If the - * signal is being masked, look for other signals. - */ - mask = sigmask(signum); - if (ut->uu_sigmask & mask) - continue; return(signum); } @@ -2631,7 +2627,6 @@ CURSIG(proc_t p) */ return (signum); } - sigbits &= ~mask; /* take the signal!
*/ } /* NOTREACHED */ } @@ -2761,12 +2756,6 @@ postsig(int signum) ps->ps_siginfo &= ~mask; ps->ps_signodefer &= ~mask; } -#ifdef __ppc__ - /* Needs to disable to run in user mode */ - if (signum == SIGFPE) { - thread_enable_fpe(current_thread(), 0); - } -#endif /* __ppc__ */ if (ps->ps_sig != signum) { code = 0; @@ -2945,10 +2934,33 @@ bsd_ast(thread_t thread) ut->t_dtrace_sig = 0; psignal(p, dt_action_sig); } + if (ut->t_dtrace_stop) { - ut->t_dtrace_stop = 0; - psignal(p, SIGSTOP); + ut->t_dtrace_stop = 0; + proc_lock(p); + p->p_dtrace_stop = 1; + proc_unlock(p); + (void)task_suspend(p->task); + } + + if (ut->t_dtrace_resumepid) { + proc_t resumeproc = proc_find(ut->t_dtrace_resumepid); + ut->t_dtrace_resumepid = 0; + if (resumeproc != PROC_NULL) { + proc_lock(resumeproc); + /* We only act on processes stopped by dtrace */ + if (resumeproc->p_dtrace_stop) { + resumeproc->p_dtrace_stop = 0; + proc_unlock(resumeproc); + task_resume(resumeproc->task); + } + else { + proc_unlock(resumeproc); + } + proc_rele(resumeproc); + } } + #endif /* CONFIG_DTRACE */ if (CHECK_SIGNALS(p, current_thread(), ut)) { @@ -3066,79 +3078,37 @@ pgsigio(pid_t pgid, int sig) proc_rele(p); } - void proc_signalstart(proc_t p, int locked) { - if (locked == 0) + if (!locked) proc_lock(p); - while ((p->p_lflag & P_LINSIGNAL) == P_LINSIGNAL) { - p->p_lflag |= P_LSIGNALWAIT; + p->p_sigwaitcnt++; + while ((p->p_lflag & P_LINSIGNAL) == P_LINSIGNAL) msleep(&p->p_sigmask, &p->p_mlock, 0, "proc_signstart", NULL); - } + p->p_sigwaitcnt--; + p->p_lflag |= P_LINSIGNAL; -#if DIAGNOSTIC -#if SIGNAL_DEBUG -#ifdef __ppc__ - { - int sp, *fp, numsaved; - - __asm__ volatile("mr %0,r1" : "=r" (sp)); - - fp = (int *)*((int *)sp); - for (numsaved = 0; numsaved < 3; numsaved++) { - p->lockpc[numsaved] = fp[2]; - if ((int)fp <= 0) - break; - fp = (int *)*fp; - } - } -#endif /* __ppc__ */ -#endif /* SIGNAL_DEBUG */ -#endif /* DIAGNOSTIC */ p->p_signalholder = current_thread(); - if (locked == 0) + if (!locked) proc_unlock(p); - } void proc_signalend(proc_t p, int locked) { - if (locked == 0) + if (!locked) proc_lock(p); p->p_lflag &= ~P_LINSIGNAL; -#if DIAGNOSTIC -#if SIGNAL_DEBUG -#ifdef __ppc__ - { - int sp, *fp, numsaved; - - __asm__ volatile("mr %0,r1" : "=r" (sp)); - - fp = (int *)*((int *)sp); - for (numsaved = 0; numsaved < 3; numsaved++) { - p->unlockpc[numsaved] = fp[2]; - if ((int)fp <= 0) - break; - fp = (int *)*fp; - } - } -#endif /* __ppc__ */ -#endif /* SIGNAL_DEBUG */ -#endif /* DIAGNOSTIC */ - - if ((p->p_lflag & P_LSIGNALWAIT) == P_LSIGNALWAIT) { - p->p_lflag &= ~P_LSIGNALWAIT; + if (p->p_sigwaitcnt > 0) wakeup(&p->p_sigmask); - } + p->p_signalholder = NULL; - if (locked == 0) + if (!locked) proc_unlock(p); } - void sig_lock_to_exit(proc_t p) { diff --git a/bsd/kern/kern_symfile.c b/bsd/kern/kern_symfile.c index ffbff213f..dc6531b42 100644 --- a/bsd/kern/kern_symfile.c +++ b/bsd/kern/kern_symfile.c @@ -72,12 +72,13 @@ get_kernel_symfile(__unused proc_t p, __unused char const **symfile) struct kern_direct_file_io_ref_t { - vfs_context_t ctx; - struct vnode *vp; + vfs_context_t ctx; + struct vnode * vp; + dev_t device; }; -static int file_ioctl(void * p1, void * p2, int theIoctl, caddr_t result) +static int file_ioctl(void * p1, void * p2, u_long theIoctl, caddr_t result) { dev_t device = *(dev_t*) p1; @@ -85,7 +86,7 @@ static int file_ioctl(void * p1, void * p2, int theIoctl, caddr_t result) (device, theIoctl, result, S_IFBLK, p2)); } -static int device_ioctl(void * p1, __unused void * p2, int theIoctl, caddr_t result) 
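The only change to file_ioctl() and device_ioctl() here is the type of the request argument, int to u_long, matching the VNOP_IOCTL() prototype: BSD ioctl request codes are built by the _IO/_IOR/_IOW macros, and _IOW sets the IOC_IN direction bit (bit 31), so a request that carries input data does not survive a round trip through a signed 32-bit int on an LP64 kernel. A small sketch of the hazard, assuming the Darwin/BSD <sys/ioccom.h> macros; EX_SETPARAM is a made-up request with the same shape as the DKIOC* codes used later in this file:

#include <stdio.h>
#include <sys/types.h>
#include <sys/ioccom.h>

/* Hypothetical request: _IOW packs IOC_IN (0x80000000) into the code,
 * so the value turns negative when squeezed into a signed int. */
#define EX_SETPARAM _IOW('d', 42, unsigned int)

int
main(void)
{
	u_long req = EX_SETPARAM;
	int narrowed = (int)req;		/* the old prototype's parameter type */
	u_long widened = (u_long)narrowed;	/* sign-extends on LP64 */

	printf("request as u_long:         0x%lx\n", (unsigned long)req);
	printf("round-tripped through int: 0x%lx\n", (unsigned long)widened);
	printf("codes match: %s\n", req == widened ? "yes" : "no");
	return 0;
}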
+static int device_ioctl(void * p1, __unused void * p2, u_long theIoctl, caddr_t result) { return (VNOP_IOCTL(p1, theIoctl, result, 0, p2)); } @@ -94,10 +95,14 @@ struct kern_direct_file_io_ref_t * kern_open_file_for_direct_io(const char * name, kern_get_file_extents_callback_t callback, void * callback_ref, - dev_t * device_result, + dev_t * partition_device_result, + dev_t * image_device_result, uint64_t * partitionbase_result, uint64_t * maxiocount_result, - boolean_t * solid_state) + uint32_t * oflags, + off_t offset, + caddr_t addr, + vm_size_t len) { struct kern_direct_file_io_ref_t * ref; @@ -105,14 +110,21 @@ kern_open_file_for_direct_io(const char * name, struct vnode_attr va; int error; off_t f_offset; - uint32_t blksize; - uint64_t size; + off_t filelength; + uint64_t fileblk; + size_t filechunk; + uint64_t physoffset; dev_t device; + dev_t target = 0; + int isssd = 0; + uint32_t flags = 0; + uint32_t blksize; off_t maxiocount, count; + boolean_t locked = FALSE; - int (*do_ioctl)(void * p1, void * p2, int theIoctl, caddr_t result); - void * p1; - void * p2; + int (*do_ioctl)(void * p1, void * p2, u_long theIoctl, caddr_t result); + void * p1 = NULL; + void * p2 = NULL; error = EFAULT; @@ -124,12 +136,18 @@ kern_open_file_for_direct_io(const char * name, } ref->vp = NULL; - p = current_proc(); // kernproc; + p = kernproc; ref->ctx = vfs_context_create(vfs_context_current()); if ((error = vnode_open(name, (O_CREAT | FWRITE), (0), 0, &ref->vp, ref->ctx))) goto out; + if (addr && len) + { + if ((error = kern_write_file(ref, offset, addr, len))) + goto out; + } + VATTR_INIT(&va); VATTR_WANTED(&va, va_rdev); VATTR_WANTED(&va, va_fsid); @@ -169,6 +187,80 @@ kern_open_file_for_direct_io(const char * name, error = EFAULT; goto out; } + ref->device = device; + + // generate the block list + + error = do_ioctl(p1, p2, DKIOCLOCKPHYSICALEXTENTS, NULL); + if (error) + goto out; + locked = TRUE; + + // get block size + + error = do_ioctl(p1, p2, DKIOCGETBLOCKSIZE, (caddr_t) &blksize); + if (error) + goto out; + + if (ref->vp->v_type == VREG) + filelength = va.va_data_size; + else + { + error = do_ioctl(p1, p2, DKIOCGETBLOCKCOUNT, (caddr_t) &fileblk); + if (error) + goto out; + filelength = fileblk * blksize; + } + + f_offset = 0; + while (f_offset < filelength) + { + if (ref->vp->v_type == VREG) + { + filechunk = 1*1024*1024*1024; + daddr64_t blkno; + + error = VNOP_BLOCKMAP(ref->vp, f_offset, filechunk, &blkno, &filechunk, NULL, 0, NULL); + if (error) + goto out; + + fileblk = blkno * blksize; + } + else if ((ref->vp->v_type == VBLK) || (ref->vp->v_type == VCHR)) + { + fileblk = f_offset; + filechunk = f_offset ? 
0 : filelength; + } + + physoffset = 0; + while (physoffset < filechunk) + { + dk_physical_extent_t getphysreq; + bzero(&getphysreq, sizeof(getphysreq)); + + getphysreq.offset = fileblk + physoffset; + getphysreq.length = (filechunk - physoffset); + error = do_ioctl(p1, p2, DKIOCGETPHYSICALEXTENT, (caddr_t) &getphysreq); + if (error) + goto out; + if (!target) + { + target = getphysreq.dev; + } + else if (target != getphysreq.dev) + { + error = ENOTSUP; + goto out; + } + callback(callback_ref, getphysreq.offset, getphysreq.length); + physoffset += getphysreq.length; + } + f_offset += filechunk; + } + callback(callback_ref, 0ULL, 0ULL); + + if (ref->vp->v_type == VREG) + p1 = ⌖ // get partition base @@ -226,62 +318,37 @@ kern_open_file_for_direct_io(const char * name, if (maxiocount_result) *maxiocount_result = maxiocount; - if (solid_state) - { - int isssd = 0; - error = do_ioctl(p1, p2, DKIOCISSOLIDSTATE, (caddr_t)&isssd); - if (error) - *solid_state = FALSE; - else - *solid_state = isssd; - } - - // generate the block list - - error = 0; - if (ref->vp->v_type == VREG) - { - f_offset = 0; - while(f_offset < (off_t) va.va_data_size) - { - size_t io_size = 1*1024*1024*1024; - daddr64_t blkno; - - error = VNOP_BLOCKMAP(ref->vp, f_offset, io_size, &blkno, (size_t *)&io_size, NULL, 0, NULL); - if (error) - goto out; - callback(callback_ref, ((uint64_t) blkno) * blksize, (uint64_t) io_size); - f_offset += io_size; - } - callback(callback_ref, 0ULL, 0ULL); - } - else if ((ref->vp->v_type == VBLK) || (ref->vp->v_type == VCHR)) - { - error = do_ioctl(p1, p2, DKIOCGETBLOCKCOUNT, (caddr_t) &size); - if (error) - goto out; - size *= blksize; - callback(callback_ref, 0ULL, size); - callback(callback_ref, size, 0ULL); - } + error = do_ioctl(p1, p2, DKIOCISSOLIDSTATE, (caddr_t)&isssd); + if (!error && isssd) + flags |= kIOHibernateOptionSSD; - if (device_result) - *device_result = device; + if (partition_device_result) + *partition_device_result = device; + if (image_device_result) + *image_device_result = target; + if (flags) + *oflags = flags; out: kprintf("kern_open_file_for_direct_io(%d)\n", error); - if (error && ref) { - if (ref->vp) { + if (error && locked) + { + p1 = &device; + (void) do_ioctl(p1, p2, DKIOCUNLOCKPHYSICALEXTENTS, NULL); + } + + if (error && ref) + { + if (ref->vp) + { vnode_close(ref->vp, FWRITE, ref->ctx); ref->vp = NULLVP; } - vfs_context_rele(ref->ctx); kfree(ref, sizeof(struct kern_direct_file_io_ref_t)); ref = NULL; } - return(ref); } @@ -296,21 +363,47 @@ kern_write_file(struct kern_direct_file_io_ref_t * ref, off_t offset, caddr_t ad } void -kern_close_file_for_direct_io(struct kern_direct_file_io_ref_t * ref) +kern_close_file_for_direct_io(struct kern_direct_file_io_ref_t * ref, + off_t offset, caddr_t addr, vm_size_t len) { + int error; kprintf("kern_close_file_for_direct_io\n"); - if (ref) { - int error; + if (!ref) return; - if (ref->vp) { - error = vnode_close(ref->vp, FWRITE, ref->ctx); - ref->vp = NULLVP; - kprintf("vnode_close(%d)\n", error); - } - vfs_context_rele(ref->ctx); - ref->ctx = NULL; - kfree(ref, sizeof(struct kern_direct_file_io_ref_t)); + if (ref->vp) + { + int (*do_ioctl)(void * p1, void * p2, u_long theIoctl, caddr_t result); + void * p1; + void * p2; + + if (ref->vp->v_type == VREG) + { + p1 = &ref->device; + p2 = kernproc; + do_ioctl = &file_ioctl; + } + else + { + /* Partition. 
*/ + p1 = ref->vp; + p2 = ref->ctx; + do_ioctl = &device_ioctl; + } + (void) do_ioctl(p1, p2, DKIOCUNLOCKPHYSICALEXTENTS, NULL); + + if (addr && len) + { + (void) kern_write_file(ref, offset, addr, len); + } + + error = vnode_close(ref->vp, FWRITE, ref->ctx); + + ref->vp = NULLVP; + kprintf("vnode_close(%d)\n", error); } + vfs_context_rele(ref->ctx); + ref->ctx = NULL; + kfree(ref, sizeof(struct kern_direct_file_io_ref_t)); } diff --git a/bsd/kern/kern_synch.c b/bsd/kern/kern_synch.c index 68a45824e..c6b4888c3 100644 --- a/bsd/kern/kern_synch.c +++ b/bsd/kern/kern_synch.c @@ -162,7 +162,7 @@ _sleep( struct proc *p; thread_t self = current_thread(); struct uthread * ut; - int sig, catch = pri & PCATCH; + int sig, catch; int dropmutex = pri & PDROP; int spinmutex = pri & PSPIN; int wait_result; @@ -175,26 +175,39 @@ _sleep( /* It can still block in proc_exit() after the teardown. */ if (p->p_stats != NULL) OSIncrementAtomicLong(&p->p_stats->p_ru.ru_nvcsw); + + if (pri & PCATCH) + catch = THREAD_ABORTSAFE; + else + catch = THREAD_UNINT; /* set wait message & channel */ ut->uu_wchan = chan; ut->uu_wmesg = wmsg ? wmsg : "unknown"; if (mtx != NULL && chan != NULL && (thread_continue_t)continuation == THREAD_CONTINUE_NULL) { + int flags; + + if (dropmutex) + flags = LCK_SLEEP_UNLOCK; + else + flags = LCK_SLEEP_DEFAULT; + + if (spinmutex) + flags |= LCK_SLEEP_SPIN; if (abstime) - wait_result = lck_mtx_sleep_deadline(mtx, (dropmutex) ? LCK_SLEEP_UNLOCK : 0, - chan, (catch) ? THREAD_ABORTSAFE : THREAD_UNINT, abstime); + wait_result = lck_mtx_sleep_deadline(mtx, flags, chan, catch, abstime); else - wait_result = lck_mtx_sleep(mtx, (dropmutex) ? LCK_SLEEP_UNLOCK : 0, - chan, (catch) ? THREAD_ABORTSAFE : THREAD_UNINT); + wait_result = lck_mtx_sleep(mtx, flags, chan, catch); } else { if (chan != NULL) - assert_wait_deadline(chan, (catch) ? THREAD_ABORTSAFE : THREAD_UNINT, abstime); + assert_wait_deadline(chan, catch, abstime); if (mtx) lck_mtx_unlock(mtx); - if (catch) { + + if (catch == THREAD_ABORTSAFE) { if (SHOULDissignal(p,ut)) { if ((sig = CURSIG(p)) != 0) { if (clear_wait(self, THREAD_INTERRUPTED) == KERN_FAILURE) @@ -258,11 +271,11 @@ block: * first, regardless of whether awakened due * to receiving event. */ - if (!catch) + if (catch != THREAD_ABORTSAFE) break; /* else fall through */ case THREAD_INTERRUPTED: - if (catch) { + if (catch == THREAD_ABORTSAFE) { if (thread_should_abort(self)) { error = EINTR; } else if (SHOULDissignal(p, ut)) { @@ -392,7 +405,7 @@ tsleep1( void wakeup(void *chan) { - thread_wakeup_prim((caddr_t)chan, FALSE, THREAD_AWAKENED); + thread_wakeup((caddr_t)chan); } /* @@ -404,7 +417,7 @@ wakeup(void *chan) void wakeup_one(caddr_t chan) { - thread_wakeup_prim((caddr_t)chan, TRUE, THREAD_AWAKENED); + thread_wakeup_one((caddr_t)chan); } /* diff --git a/bsd/kern/kern_sysctl.c b/bsd/kern/kern_sysctl.c index 842a3e572..f2c9c8711 100644 --- a/bsd/kern/kern_sysctl.c +++ b/bsd/kern/kern_sysctl.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -107,11 +107,13 @@ #include <kern/kalloc.h> #include <mach/machine.h> +#include <mach/mach_host.h> #include <mach/mach_types.h> #include <mach/vm_param.h> #include <kern/mach_param.h> #include <kern/task.h> #include <kern/lock.h> +#include <kern/processor.h> #include <kern/debug.h> #include <vm/vm_kern.h> #include <vm/vm_map.h> @@ -128,15 +130,12 @@ #include <machine/exec.h> #include <vm/vm_protos.h> +#include <sys/imgsrc.h> #if defined(__i386__) || defined(__x86_64__) #include <i386/cpuid.h> #endif -sysctlfn kern_sysctl; -#if DEBUG -sysctlfn debug_sysctl; -#endif extern sysctlfn net_sysctl; extern sysctlfn cpu_sysctl; extern int aio_max_requests; @@ -146,25 +145,47 @@ extern int lowpri_IO_window_msecs; extern int lowpri_IO_delay_msecs; extern int nx_enabled; extern int speculative_reads_disabled; +extern int ignore_is_ssd; +extern unsigned int speculative_prefetch_max; extern unsigned int preheat_pages_max; extern unsigned int preheat_pages_min; -extern unsigned int preheat_pages_mult; extern long numvnodes; -static void +extern unsigned int vm_max_delayed_work_limit; +extern unsigned int vm_max_batch; + +extern unsigned int vm_page_free_min; +extern unsigned int vm_page_free_target; +extern unsigned int vm_page_free_reserved; +extern unsigned int vm_page_speculative_percentage; +extern unsigned int vm_page_speculative_q_age_ms; + +/* + * Conditionally allow dtrace to see these functions for debugging purposes. + */ +#ifdef STATIC +#undef STATIC +#endif +#if 0 +#define STATIC +#else +#define STATIC static +#endif + +extern boolean_t mach_timer_coalescing_enabled; + +STATIC void fill_user32_eproc(proc_t p, struct user32_eproc *ep); -static void +STATIC void fill_user32_externproc(proc_t p, struct user32_extern_proc *exp); -static void +STATIC void fill_user64_eproc(proc_t p, struct user64_eproc *ep); -static void +STATIC void fill_user64_proc(proc_t p, struct user64_kinfo_proc *kp); -static void +STATIC void fill_user64_externproc(proc_t p, struct user64_extern_proc *exp); extern int kdbg_control(int *name, u_int namelen, user_addr_t where, size_t * sizep); -int -kdebug_ops(int *name, u_int namelen, user_addr_t where, size_t *sizep, proc_t p); #if NFSCLIENT extern int netboot_root(void); @@ -174,41 +195,94 @@ pcsamples_ops(int *name, u_int namelen, user_addr_t where, size_t *sizep, proc_t p); __private_extern__ kern_return_t reset_vmobjectcache(unsigned int val1, unsigned int val2); -int -sysctl_doproc(int *name, u_int namelen, user_addr_t where, size_t *sizep); -int -sysctl_doprof(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, - user_addr_t newp, size_t newlen); -static void +STATIC void fill_user32_proc(proc_t p, struct user32_kinfo_proc *kp); int sysctl_procargs(int *name, u_int namelen, user_addr_t where, size_t *sizep, proc_t cur_proc); -static int -sysctl_procargs2(int *name, u_int namelen, user_addr_t where, size_t *sizep, - proc_t cur_proc); -static int +STATIC int sysctl_procargsx(int *name, u_int namelen, user_addr_t where, size_t *sizep, proc_t cur_proc, int argc_yes); int sysctl_struct(user_addr_t oldp, size_t *oldlenp, user_addr_t newp, size_t newlen, void *sp, int len); -static int sysdoproc_filt_KERN_PROC_PID(proc_t p, void * arg); -static int sysdoproc_filt_KERN_PROC_PGRP(proc_t p, void * arg); -static int sysdoproc_filt_KERN_PROC_TTY(proc_t p, void * arg); -static int sysdoproc_filt_KERN_PROC_UID(proc_t p, void * arg); -static int sysdoproc_filt_KERN_PROC_RUID(proc_t p, void * arg); +STATIC int 
sysdoproc_filt_KERN_PROC_PID(proc_t p, void * arg); +STATIC int sysdoproc_filt_KERN_PROC_PGRP(proc_t p, void * arg); +STATIC int sysdoproc_filt_KERN_PROC_TTY(proc_t p, void * arg); +STATIC int sysdoproc_filt_KERN_PROC_UID(proc_t p, void * arg); +STATIC int sysdoproc_filt_KERN_PROC_RUID(proc_t p, void * arg); #if CONFIG_LCTX -static int sysdoproc_filt_KERN_PROC_LCID(proc_t p, void * arg); +STATIC int sysdoproc_filt_KERN_PROC_LCID(proc_t p, void * arg); #endif int sysdoproc_callback(proc_t p, void *arg); -static int __sysctl_funneled(proc_t p, struct __sysctl_args *uap, int32_t *retval); + +/* forward declarations for non-static STATIC */ +STATIC void fill_loadavg64(struct loadavg *la, struct user64_loadavg *la64); +STATIC void fill_loadavg32(struct loadavg *la, struct user32_loadavg *la32); +STATIC int sysctl_handle_exec_archhandler_ppc(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_handle_kern_threadname(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_sched_stats(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_sched_stats_enable(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_file(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_kdebug_ops SYSCTL_HANDLER_ARGS; +STATIC int sysctl_dotranslate SYSCTL_HANDLER_ARGS; +STATIC int sysctl_doaffinity SYSCTL_HANDLER_ARGS; +#if COUNT_SYSCALLS +STATIC int sysctl_docountsyscalls SYSCTL_HANDLER_ARGS; +#endif /* COUNT_SYSCALLS */ +#if !CONFIG_EMBEDDED +STATIC int sysctl_doprocargs SYSCTL_HANDLER_ARGS; +#endif /* !CONFIG_EMBEDDED */ +STATIC int sysctl_doprocargs2 SYSCTL_HANDLER_ARGS; +STATIC int sysctl_prochandle SYSCTL_HANDLER_ARGS; +#if DEBUG +STATIC int sysctl_dodebug SYSCTL_HANDLER_ARGS; +#endif +STATIC int sysctl_aiomax(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_aioprocmax(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_aiothreads(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_maxproc(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_osversion(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_sysctl_bootargs(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_maxvnodes(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_securelvl(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_domainname(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_hostname(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_procname(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_boottime(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_symfile(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +#if NFSCLIENT +STATIC int sysctl_netboot(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +#endif +#ifdef CONFIG_IMGSRC_ACCESS +STATIC int sysctl_imgsrcdev(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +#endif +STATIC int sysctl_usrstack(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); 
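/*
 * The block of STATIC forward declarations above and below exists because
 * the old monolithic kern_sysctl()/debug_sysctl() switch statements are
 * being split into per-OID SYSCTL_PROC handlers.  The STATIC macro defined
 * earlier in this file lets a debugging build compile the handlers with
 * external linkage: a static function may be inlined or dropped from the
 * symbol table, which hides it from symbol-based tools such as dtrace's
 * fbt provider.  A minimal sketch of the same pattern follows; the
 * DEBUG_VISIBILITY knob is hypothetical (the kernel hard-codes "#if 0").
 */
#include <stdio.h>

#ifdef STATIC
#undef STATIC
#endif

#define DEBUG_VISIBILITY 0		/* hypothetical build knob */

#if DEBUG_VISIBILITY
#define STATIC				/* external linkage; visible to tracing tools */
#else
#define STATIC static			/* release builds keep file-local linkage */
#endif

STATIC int
helper_add(int a, int b)
{
	return a + b;
}

int
main(void)
{
	printf("%d\n", helper_add(2, 3));	/* prints 5 */
	return 0;
}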
+STATIC int sysctl_usrstack64(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_coredump(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_suid_coredump(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_delayterm(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_rage_vnode(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_kern_check_openevt(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_nx(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_loadavg(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_vm_toggle_address_reuse(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_swapusage(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +#if defined(__i386__) || defined(__x86_64__) +STATIC int sysctl_sysctl_exec_affinity(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +#endif +STATIC int fetch_process_cputype( proc_t cur_proc, int *name, u_int namelen, cpu_type_t *cputype); +STATIC int sysctl_sysctl_native(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_sysctl_cputype(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_safeboot(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); +STATIC int sysctl_singleuser(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); + extern void IORegistrySetOSBuildVersion(char * build_version); -static void +STATIC void fill_loadavg64(struct loadavg *la, struct user64_loadavg *la64) { la64->ldavg[0] = la->ldavg[0]; @@ -217,7 +291,7 @@ fill_loadavg64(struct loadavg *la, struct user64_loadavg *la64) la64->fscale = (user64_long_t)la->fscale; } -static void +STATIC void fill_loadavg32(struct loadavg *la, struct user32_loadavg *la32) { la32->ldavg[0] = la->ldavg[0]; @@ -226,34 +300,76 @@ fill_loadavg32(struct loadavg *la, struct user32_loadavg *la32) la32->fscale = (user32_long_t)la->fscale; } +/* + * sysctl_mem_hold + * + * Description: Wire down the callers address map on behalf of sysctl's + * that perform their own copy operations while holding + * locks e.g. in the paging path, which could lead to a + * deadlock, or while holding a spinlock. + * + * Parameters: addr User buffer address + * len User buffer length + * + * Returns: 0 Success + * vslock:ENOMEM Insufficient physical pages to wire + * vslock:EACCES Bad protection mode + * vslock:EINVAL Invalid parameters + * + * Notes: This code is invoked for the first OID element where the + * CTLFLAG_LOCKED is not specified for a given OID node + * element durng OID traversal, and is held for all + * subsequent node traversals, and only released after the + * leaf node handler invocation is complete. + * + * Legacy: For legacy scyctl's provided by third party code which + * expect funnel protection for calls into their code, this + * routine will also take the funnel, which will also only + * be released after the leaf node handler is complete. + * + * This is to support legacy 32 bit BSD KEXTs and legacy 32 + * bit single threaded filesystem KEXTs and similar code + * which relies on funnel protection, e.g. for things like + * FSID based sysctl's. 
+ * + * NEW CODE SHOULD NOT RELY ON THIS BEHAVIOUR! IT WILL BE + * REMOVED IN A FUTURE RELEASE OF Mac OS X! + * + * Bugs: This routine does nothing with the new_addr and new_len + * at present, but it should, since reads from the user space + * process address space which could potentially trigger + * paging may also be occurring deep down. This is due to + * a current limitation of the vslock() routine, which will + * always request a wired mapping be read/write, due to not + * taking an access mode parameter. Note that this could + * also cause problems for output on architectures where + * write access does not require read access if the current + * mapping lacks read access. + * + * XXX: To be moved to kern_newsysctl.c to avoid __private_extern__ + */ +int sysctl_mem_lock(user_addr_t old_addr, user_size_t old_len, user_addr_t new_addr, user_size_t new_len); +int +sysctl_mem_lock(__unused user_addr_t old_addr, __unused user_size_t old_len, __unused user_addr_t new_addr, __unused user_size_t new_len) +{ + return 0; +} + /* * Locking and stats */ -static struct sysctl_lock memlock; /* sysctl() syscall */ int -__sysctl(proc_t p, struct __sysctl_args *uap, int32_t *retval) +__sysctl(proc_t p, struct __sysctl_args *uap, __unused int32_t *retval) { - boolean_t funnel_state; + boolean_t funnel_state = FALSE; /* not held if unknown */ int error; - - funnel_state = thread_funnel_set(kernel_flock, TRUE); - error = __sysctl_funneled(p, uap, retval); - thread_funnel_set(kernel_flock, funnel_state); - return(error); -} - -static int -__sysctl_funneled(proc_t p, struct __sysctl_args *uap, __unused int32_t *retval) -{ - int error, dolock = 1; size_t savelen = 0, oldlen = 0, newlen; - sysctlfn *fnp = NULL; int name[CTL_MAXNAME]; int error1; - boolean_t memlock_taken = FALSE; boolean_t vslock_taken = FALSE; + boolean_t funnel_taken = FALSE; #if CONFIG_MACF kauth_cred_t my_cred; #endif @@ -279,38 +395,49 @@ __sysctl_funneled(proc_t p, struct __sysctl_args *uap, __unused int32_t *retval) else { newlen = uap->newlen; } - + +/* + * XXX TODO: push down rights check for CTL_HW OIDs; most duplicate + * XXX it anyway, which is a performance sink, and requires use + * XXX of SUID root programs (see <rdar://3915692>). + * + * Note: Opt out of non-leaf node enforcement by removing this + * check for the top level OID value, and then adding + * CTLFLAG_ANYBODY to the leaf nodes in question. Enforce as + * suser for writes in leaf nodes by omitting this flag. + * Enforce with a higher granularity by making the leaf node + * of type SYSCTL_PROC() in order to provide a procedural + * enforcement call site. + * + * NOTE: This function is called prior to any subfunctions being + * called with a fallback to userland_sysctl(); as such, this + * permissions check here will veto the fallback operation. + */ /* CTL_UNSPEC is used to get oid to AUTO_OID */ if (uap->new != USER_ADDR_NULL - && ((name[0] == CTL_KERN - && !(name[1] == KERN_IPC || name[1] == KERN_PANICINFO || name[1] == KERN_PROCDELAYTERM || - name[1] == KERN_PROCNAME || name[1] == KERN_RAGEVNODE || name[1] == KERN_CHECKOPENEVT || name[1] == KERN_THREADNAME)) - || (name[0] == CTL_HW) + && ((name[0] == CTL_HW) || (name[0] == CTL_VM)) && (error = suser(kauth_cred_get(), &p->p_acflag))) return (error); -/* XXX: KERN, VFS and DEBUG are handled by their respective functions, - * but there is a fallback for all sysctls other than VFS to - * userland_sysctl() - KILL THIS!
*/ - switch (name[0]) { - case CTL_KERN: - fnp = kern_sysctl; - if ((name[1] != KERN_VNODE) && (name[1] != KERN_FILE) - && (name[1] != KERN_PROC)) - dolock = 0; - break; - case CTL_VFS: - fnp = vfs_sysctl; - break; -#if DEBUG - case CTL_DEBUG: - fnp = debug_sysctl; - break; +// XXX need to relocate into each terminal instead of leaving this here... +// XXX macf preemptory check. +#if CONFIG_MACF + my_cred = kauth_cred_proc_ref(p); + error = mac_system_check_sysctl( + my_cred, + (int *) name, + uap->namelen, + uap->old, + uap->oldlenp, + 0, /* XXX 1 for CTL_KERN checks */ + uap->new, + newlen + ); + kauth_cred_unref(&my_cred); + if (error) + return (error); #endif - default: - fnp = NULL; - } if (uap->oldlenp != USER_ADDR_NULL) { uint64_t oldlen64 = fuulong(uap->oldlenp); @@ -324,79 +451,82 @@ __sysctl_funneled(proc_t p, struct __sysctl_args *uap, __unused int32_t *retval) oldlen = 0xffffffffUL; } - if (uap->old != USER_ADDR_NULL) { - if (!useracc(uap->old, (user_size_t)oldlen, B_WRITE)) - return (EFAULT); + if ((name[0] == CTL_VFS || name[0] == CTL_VM)) { /* - * The kernel debug mechanism does not need to take this lock, and - * we don't grab the memlock around calls to KERN_PROC because it is reentrant. - * Grabbing the lock for a KERN_PROC sysctl makes a deadlock possible 5024049. + * Always take the funnel for CTL_VFS and CTL_VM + * + * XXX We should also take it for any OID without the + * XXX CTLFLAG_LOCKED set on it; fix this later! + */ + funnel_state = thread_funnel_set(kernel_flock, TRUE); + funnel_taken = TRUE; + + /* + * XXX Take the vslock() only when we are copying out; this + * XXX erroneously assumes that the copy in will not cause + * XXX a fault if caled from the paging path due to the + * XXX having been recently touched in order to establish + * XXX the input data. This is a bad assumption. + * + * Note: This is overkill, but third parties might + * already call sysctl internally in KEXTs that + * implement mass storage drivers. If you are + * writing a new KEXT, don't do that. */ - if (!((name[1] == KERN_KDEBUG) && (name[2] == KERN_KDGETENTROPY)) && - !(name[1] == KERN_PROC)) { - MEMLOCK_LOCK(); - memlock_taken = TRUE; - } - - if (dolock && oldlen) { - if ((error = vslock(uap->old, (user_size_t)oldlen))) { - if (memlock_taken == TRUE) - MEMLOCK_UNLOCK(); - return(error); + if(uap->old != USER_ADDR_NULL) { + if (!useracc(uap->old, (user_size_t)oldlen, B_WRITE)) { + thread_funnel_set(kernel_flock, funnel_state); + return (EFAULT); + } + + if (oldlen) { + if ((error = vslock(uap->old, (user_size_t)oldlen))) { + thread_funnel_set(kernel_flock, funnel_state); + return(error); + } + savelen = oldlen; + vslock_taken = TRUE; } - savelen = oldlen; - vslock_taken = TRUE; } } -#if CONFIG_MACF - my_cred = kauth_cred_proc_ref(p); - error = mac_system_check_sysctl( - my_cred, - (int *) name, - uap->namelen, - uap->old, - uap->oldlenp, - fnp == kern_sysctl ? 1 : 0, - uap->new, - newlen - ); - kauth_cred_unref(&my_cred); - if (!error) { -#endif - if (fnp) { - error = (*fnp)(name + 1, uap->namelen - 1, uap->old, + /* + * XXX convert vfs_sysctl subelements to newsysctl; this is hard + * XXX because of VFS_NUMMNTOPS being top level. 
+ */ + error = ENOTSUP; + if (name[0] == CTL_VFS) { + error = vfs_sysctl(name + 1, uap->namelen - 1, uap->old, &oldlen, uap->new, newlen, p); } - else - error = ENOTSUP; -#if CONFIG_MACF - } -#endif if (vslock_taken == TRUE) { error1 = vsunlock(uap->old, (user_size_t)savelen, B_WRITE); if (!error) error = error1; } - if (memlock_taken == TRUE) - MEMLOCK_UNLOCK(); - if ( (name[0] != CTL_VFS) && (error == ENOTSUP)) { - size_t tmp = oldlen; - boolean_t funnel_state; - - /* - * Drop the funnel when calling new sysctl code, which will conditionally - * grab the funnel if it really needs to. - */ - funnel_state = thread_funnel_set(kernel_flock, FALSE); - + if ( (name[0] != CTL_VFS) && (error == ENOTSUP) ) { + size_t tmp = oldlen; error = userland_sysctl(p, name, uap->namelen, uap->old, &tmp, uap->new, newlen, &oldlen); + } + /* + * If we took the funnel, which we only do for CTL_VFS and CTL_VM on + * 32 bit architectures, then drop it. + * + * XXX the grabbing and dropping need to move into the leaf nodes, + * XXX for sysctl's that are not marked CTLFLAG_LOCKED, but this is + * XXX true for the vslock, as well. We have a start at a routine + * to wrapper this (above), but it's not turned on. The current code + * removed the funnel and the vslock() from all but these two top + * level OIDs. Note that VFS only needs to take the funnel if the FS + * against which it's operating is not thread safe (but since an FS + * can be in the paging path, it still needs to take the vslock()). + */ + if (funnel_taken) thread_funnel_set(kernel_flock, funnel_state); - } if ((error) && (error != ENOMEM)) return (error); @@ -424,21 +554,26 @@ int securelevel = -1; int securelevel; #endif -static int -sysctl_affinity( - int *name, - u_int namelen, - user_addr_t oldBuf, - size_t *oldSize, - user_addr_t newBuf, - __unused size_t newSize, - proc_t cur_proc) +STATIC int +sysctl_doaffinity SYSCTL_HANDLER_ARGS { + __unused int cmd = oidp->oid_arg2; /* subcommand*/ + int *name = arg1; /* oid element argument vector */ + int namelen = arg2; /* number of oid element arguments */ + user_addr_t oldp = req->oldptr; /* user buffer copy out address */ + size_t *oldlenp = &req->oldlen; /* user buffer copy out size */ + user_addr_t newp = req->newptr; /* user buffer copy in address */ +// size_t newlen = req->newlen; /* user buffer copy in size */ + + int error = ENOTSUP; /* Default to failure */ + + proc_t cur_proc = current_proc(); + if (namelen < 1) return (ENOTSUP); if (name[0] == 0 && 1 == namelen) { - return sysctl_rdint(oldBuf, oldSize, newBuf, + error = sysctl_rdint(oldp, oldlenp, newp, (cur_proc->p_flag & P_AFFINITY) ? 
1 : 0); } else if (name[0] == 1 && 2 == namelen) { if (name[1] == 0) { @@ -446,21 +581,35 @@ sysctl_affinity( } else { OSBitOrAtomic(P_AFFINITY, &cur_proc->p_flag); } - return 0; + error = 0; } - return (ENOTSUP); + + /* adjust index so we return the right required/consumed amount */ + if (!error) + req->oldidx += req->oldlen; + + return (error); } +SYSCTL_PROC(_kern, KERN_AFFINITY, affinity, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + 0, /* Integer argument (arg2) */ + sysctl_doaffinity, /* Handler function */ + NULL, /* Data pointer */ + ""); + +STATIC int +sysctl_dotranslate SYSCTL_HANDLER_ARGS +{ + __unused int cmd = oidp->oid_arg2; /* subcommand*/ + int *name = arg1; /* oid element argument vector */ + int namelen = arg2; /* number of oid element arguments */ + user_addr_t oldp = req->oldptr; /* user buffer copy out address */ + size_t *oldlenp = &req->oldlen; /* user buffer copy out size */ + user_addr_t newp = req->newptr; /* user buffer copy in address */ +// size_t newlen = req->newlen; /* user buffer copy in size */ + int error; -static int -sysctl_translate( - int *name, - u_int namelen, - user_addr_t oldBuf, - size_t *oldSize, - user_addr_t newBuf, - __unused size_t newSize, - proc_t cur_proc) -{ + proc_t cur_proc = current_proc(); proc_t p; int istranslated = 0; kauth_cred_t my_cred; @@ -484,9 +633,25 @@ sysctl_translate( istranslated = (p->p_flag & P_TRANSLATED); proc_rele(p); - return sysctl_rdint(oldBuf, oldSize, newBuf, + error = sysctl_rdint(oldp, oldlenp, newp, (istranslated != 0) ? 1 : 0); + + /* adjust index so we return the right required/consumed amount */ + if (!error) + req->oldidx += req->oldlen; + + return (error); } +/* + * XXX make CTLFLAG_RW so sysctl_rdint() will EPERM on attempts to write; + * XXX this may not be necessary. 
+ */ +SYSCTL_PROC(_kern, KERN_TRANSLATE, translate, CTLTYPE_NODE|CTLFLAG_RW | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + 0, /* Integer argument (arg2) */ + sysctl_dotranslate, /* Handler function */ + NULL, /* Data pointer */ + ""); int set_archhandler(__unused proc_t p, int arch) @@ -505,7 +670,7 @@ set_archhandler(__unused proc_t p, int arch) return (EBADARCH); } - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, + NDINIT(&nd, LOOKUP, OP_GETATTR, FOLLOW | LOCKLEAF, UIO_SYSSPACE, CAST_USER_ADDR_T(archhandler->path), ctx); error = namei(&nd); if (error) @@ -530,63 +695,20 @@ set_archhandler(__unused proc_t p, int arch) vnode_put(nd.ni_vp); archhandler->fsid = va.va_fsid; - archhandler->fileid = (u_int32_t)va.va_fileid; + archhandler->fileid = va.va_fileid; return 0; } -/* XXX remove once Rosetta is rev'ed */ -/*****************************************************************************/ -static int -sysctl_exec_archhandler_ppc( - __unused int *name, - __unused u_int namelen, - user_addr_t oldBuf, - size_t *oldSize, - user_addr_t newBuf, - size_t newSize, - proc_t p) -{ - int error; - size_t len; - char handler[sizeof(exec_archhandler_ppc.path)]; - vfs_context_t ctx = vfs_context_current(); - if (oldSize) { - len = strlen(exec_archhandler_ppc.path) + 1; - if (oldBuf) { - if (*oldSize < len) - return (ENOMEM); - error = copyout(exec_archhandler_ppc.path, oldBuf, len); - if (error) - return (error); - } - *oldSize = len - 1; - } - if (newBuf) { - error = suser(vfs_context_ucred(ctx), &p->p_acflag); - if (error) - return (error); - if (newSize >= sizeof(exec_archhandler_ppc.path)) - return (ENAMETOOLONG); - error = copyin(newBuf, handler, newSize); - if (error) - return (error); - handler[newSize] = 0; - strlcpy(exec_archhandler_ppc.path, handler, MAXPATHLEN); - error = set_archhandler(p, CPU_TYPE_POWERPC); - if (error) - return (error); - } - return 0; -} -/*****************************************************************************/ - -static int +STATIC int sysctl_handle_exec_archhandler_ppc(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req) { int error = 0; + if (req->newptr && !kauth_cred_issuser(kauth_cred_get())) + return (EPERM); + error = sysctl_handle_string(oidp, arg1, arg2, req); if (error) @@ -600,7 +722,7 @@ done: } -static int +STATIC int sysctl_handle_kern_threadname( __unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -657,133 +779,153 @@ sysctl_handle_kern_threadname( __unused struct sysctl_oid *oidp, __unused void * return 0; } -SYSCTL_PROC(_kern, KERN_THREADNAME, threadname, CTLFLAG_ANYBODY | CTLTYPE_STRING | CTLFLAG_RW, 0, 0, sysctl_handle_kern_threadname,"A",""); +SYSCTL_PROC(_kern, KERN_THREADNAME, threadname, CTLFLAG_ANYBODY | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_handle_kern_threadname,"A",""); SYSCTL_NODE(_kern, KERN_EXEC, exec, CTLFLAG_RD|CTLFLAG_LOCKED, 0, ""); SYSCTL_NODE(_kern_exec, OID_AUTO, archhandler, CTLFLAG_RD|CTLFLAG_LOCKED, 0, ""); SYSCTL_PROC(_kern_exec_archhandler, OID_AUTO, powerpc, - CTLTYPE_STRING | CTLFLAG_RW, exec_archhandler_ppc.path, 0, - sysctl_handle_exec_archhandler_ppc, "A", ""); + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED, + exec_archhandler_ppc.path, + sizeof(exec_archhandler_ppc.path), + sysctl_handle_exec_archhandler_ppc, "A", ""); + +#define BSD_HOST 1 +STATIC int +sysctl_sched_stats(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + host_basic_info_data_t hinfo; + kern_return_t 
kret; + uint32_t size; + int changed; + mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT; + struct _processor_statistics_np *buf; + int error; -extern int get_kernel_symfile(proc_t, char **); -__private_extern__ int -sysctl_dopanicinfo(int *, u_int, user_addr_t, size_t *, user_addr_t, - size_t, proc_t); + kret = host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count); + if (kret != KERN_SUCCESS) { + return EINVAL; + } -/* - * kernel related system variables. - */ -int -kern_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, - user_addr_t newp, size_t newlen, proc_t p) -{ - /* all sysctl names not listed below are terminal at this level */ - if (namelen != 1 - && !(name[0] == KERN_PROC - || name[0] == KERN_PROF - || name[0] == KERN_KDEBUG -#if !CONFIG_EMBEDDED - || name[0] == KERN_PROCARGS -#endif - || name[0] == KERN_PROCARGS2 - || name[0] == KERN_IPC - || name[0] == KERN_SYSV - || name[0] == KERN_AFFINITY - || name[0] == KERN_TRANSLATE - || name[0] == KERN_EXEC - || name[0] == KERN_PANICINFO - || name[0] == KERN_POSIX - || name[0] == KERN_TFP - || name[0] == KERN_TTY -#if CONFIG_LCTX - || name[0] == KERN_LCTX -#endif - ) - ) - return (ENOTDIR); /* overloaded */ + size = sizeof(struct _processor_statistics_np) * (hinfo.logical_cpu_max + 2); /* One for RT Queue, One for Fair Share Queue */ + + if (req->oldlen < size) { + return EINVAL; + } + + MALLOC(buf, struct _processor_statistics_np*, size, M_TEMP, M_ZERO | M_WAITOK); + + kret = get_sched_statistics(buf, &size); + if (kret != KERN_SUCCESS) { + error = EINVAL; + goto out; + } + + error = sysctl_io_opaque(req, buf, size, &changed); + if (error) { + goto out; + } + + if (changed) { + panic("Sched info changed?!"); + } +out: + FREE(buf, M_TEMP); + return error; +} + +SYSCTL_PROC(_kern, OID_AUTO, sched_stats, CTLFLAG_LOCKED, 0, 0, sysctl_sched_stats, "-", ""); + +STATIC int +sysctl_sched_stats_enable(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, __unused struct sysctl_req *req) +{ + boolean_t active; + int res; + + if (req->newlen != sizeof(active)) { + return EINVAL; + } + + res = copyin(req->newptr, &active, sizeof(active)); + if (res != 0) { + return res; + } + + return set_sched_stats_active(active); +} + +SYSCTL_PROC(_kern, OID_AUTO, sched_stats_enable, CTLFLAG_LOCKED | CTLFLAG_WR, 0, 0, sysctl_sched_stats_enable, "-", ""); + +extern int get_kernel_symfile(proc_t, char **); - switch (name[0]) { - case KERN_PROC: - return (sysctl_doproc(name + 1, namelen - 1, oldp, oldlenp)); -#ifdef GPROF - case KERN_PROF: - return (sysctl_doprof(name + 1, namelen - 1, oldp, oldlenp, - newp, newlen)); -#endif - case KERN_KDEBUG: - return (kdebug_ops(name + 1, namelen - 1, oldp, oldlenp, p)); -#if !CONFIG_EMBEDDED - case KERN_PROCARGS: - /* new one as it does not use kinfo_proc */ - return (sysctl_procargs(name + 1, namelen - 1, oldp, oldlenp, p)); -#endif - case KERN_PROCARGS2: - /* new one as it does not use kinfo_proc */ - return (sysctl_procargs2(name + 1, namelen - 1, oldp, oldlenp, p)); -#if PANIC_INFO - case KERN_PANICINFO: - return(sysctl_dopanicinfo(name + 1, namelen - 1, oldp, oldlenp, - newp, newlen, p)); -#endif - case KERN_AFFINITY: - return sysctl_affinity(name+1, namelen-1, oldp, oldlenp, - newp, newlen, p); - case KERN_TRANSLATE: - return sysctl_translate(name+1, namelen-1, oldp, oldlenp, newp, - newlen, p); - - /* XXX remove once Rosetta has rev'ed */ - case KERN_EXEC: - return sysctl_exec_archhandler_ppc(name+1, namelen-1, oldp, - oldlenp, newp, newlen, p); #if 
COUNT_SYSCALLS - case KERN_COUNT_SYSCALLS: - { - /* valid values passed in: - * = 0 means don't keep called counts for each bsd syscall - * > 0 means keep called counts for each bsd syscall - * = 2 means dump current counts to the system log - * = 3 means reset all counts - * for example, to dump current counts: - * sysctl -w kern.count_calls=2 - */ - error = sysctl_int(oldp, oldlenp, newp, newlen, &tmp); - if ( error != 0 ) { - return (error); - } - - if ( tmp == 1 ) { - do_count_syscalls = 1; - } - else if ( tmp == 0 || tmp == 2 || tmp == 3 ) { - extern int nsysent; - extern int syscalls_log[]; - extern const char * syscallnames[]; - int i; - for ( i = 0; i < nsysent; i++ ) { - if ( syscalls_log[i] != 0 ) { - if ( tmp == 2 ) { - printf("%d calls - name %s \n", syscalls_log[i], syscallnames[i]); - } - else { - syscalls_log[i] = 0; - } +#define KERN_COUNT_SYSCALLS (KERN_OSTYPE + 1000) + +extern int nsysent; +extern int syscalls_log[]; +extern const char *syscallnames[]; + +STATIC int +sysctl_docountsyscalls SYSCTL_HANDLER_ARGS +{ + __unused int cmd = oidp->oid_arg2; /* subcommand*/ + __unused int *name = arg1; /* oid element argument vector */ + __unused int namelen = arg2; /* number of oid element arguments */ + user_addr_t oldp = req->oldptr; /* user buffer copy out address */ + size_t *oldlenp = &req->oldlen; /* user buffer copy out size */ + user_addr_t newp = req->newptr; /* user buffer copy in address */ + size_t newlen = req->newlen; /* user buffer copy in size */ + int error; + + int tmp; + + /* valid values passed in: + * = 0 means don't keep called counts for each bsd syscall + * > 0 means keep called counts for each bsd syscall + * = 2 means dump current counts to the system log + * = 3 means reset all counts + * for example, to dump current counts: + * sysctl -w kern.count_calls=2 + */ + error = sysctl_int(oldp, oldlenp, newp, newlen, &tmp); + if ( error != 0 ) { + return (error); + } + + if ( tmp == 1 ) { + do_count_syscalls = 1; + } + else if ( tmp == 0 || tmp == 2 || tmp == 3 ) { + int i; + for ( i = 0; i < nsysent; i++ ) { + if ( syscalls_log[i] != 0 ) { + if ( tmp == 2 ) { + printf("%d calls - name %s \n", syscalls_log[i], syscallnames[i]); + } + else { + syscalls_log[i] = 0; } - } - if ( tmp != 0 ) { - do_count_syscalls = 1; } } - return (0); - } -#endif - default: - return (ENOTSUP); + if ( tmp != 0 ) { + do_count_syscalls = 1; + } } - /* NOTREACHED */ + + /* adjust index so we return the right required/consumed amount */ + if (!error) + req->oldidx += req->oldlen; + + return (error); } +SYSCTL_PROC(_kern, KERN_COUNT_SYSCALLS, count_syscalls, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + 0, /* Integer argument (arg2) */ + sysctl_docountsyscalls, /* Handler function */ + NULL, /* Data pointer */ + ""); +#endif /* COUNT_SYSCALLS */ #if DEBUG /* @@ -797,36 +939,68 @@ struct ctldebug debug2, debug3, debug4; struct ctldebug debug5, debug6, debug7, debug8, debug9; struct ctldebug debug10, debug11, debug12, debug13, debug14; struct ctldebug debug15, debug16, debug17, debug18, debug19; -static struct ctldebug *debugvars[CTL_DEBUG_MAXID] = { +STATIC struct ctldebug *debugvars[CTL_DEBUG_MAXID] = { &debug0, &debug1, &debug2, &debug3, &debug4, &debug5, &debug6, &debug7, &debug8, &debug9, &debug10, &debug11, &debug12, &debug13, &debug14, &debug15, &debug16, &debug17, &debug18, &debug19, }; -int -debug_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, - user_addr_t newp, size_t newlen, __unused proc_t p) -{ +STATIC int +sysctl_dodebug 
SYSCTL_HANDLER_ARGS +{ + int cmd = oidp->oid_arg2; /* subcommand*/ + int *name = arg1; /* oid element argument vector */ + int namelen = arg2; /* number of oid element arguments */ + user_addr_t oldp = req->oldptr; /* user buffer copy out address */ + size_t *oldlenp = &req->oldlen; /* user buffer copy out size */ + user_addr_t newp = req->newptr; /* user buffer copy in address */ + size_t newlen = req->newlen; /* user buffer copy in size */ + int error; + struct ctldebug *cdp; /* all sysctl names at this level are name and field */ - if (namelen != 2) + if (namelen != 1) return (ENOTSUP); /* overloaded */ - if (name[0] < 0 || name[0] >= CTL_DEBUG_MAXID) + if (cmd < 0 || cmd >= CTL_DEBUG_MAXID) return (ENOTSUP); - cdp = debugvars[name[0]]; + cdp = debugvars[cmd]; if (cdp->debugname == 0) return (ENOTSUP); - switch (name[1]) { + switch (name[0]) { case CTL_DEBUG_NAME: - return (sysctl_rdstring(oldp, oldlenp, newp, cdp->debugname)); + error = sysctl_rdstring(oldp, oldlenp, newp, cdp->debugname); + break; case CTL_DEBUG_VALUE: - return (sysctl_int(oldp, oldlenp, newp, newlen, cdp->debugvar)); + error = sysctl_int(oldp, oldlenp, newp, newlen, cdp->debugvar); + break; default: - return (ENOTSUP); + error = ENOTSUP; + break; } - /* NOTREACHED */ + + /* adjust index so we return the right required/consumed amount */ + if (!error) + req->oldidx += req->oldlen; + + return (error); } +/* + * XXX We mark this RW instead of RD to let sysctl_rdstring() return the + * XXX historical error. + */ +SYSCTL_PROC(_debug, CTL_DEBUG_NAME, name, CTLTYPE_NODE|CTLFLAG_RW | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + CTL_DEBUG_NAME, /* Integer argument (arg2) */ + sysctl_dodebug, /* Handler function */ + NULL, /* Data pointer */ + "Debugging"); +SYSCTL_PROC(_debug, CTL_DEBUG_VALUE, value, CTLTYPE_NODE|CTLFLAG_RW | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + CTL_DEBUG_VALUE, /* Integer argument (arg2) */ + sysctl_dodebug, /* Handler function */ + NULL, /* Data pointer */ + "Debugging"); #endif /* DEBUG */ /* @@ -1073,7 +1247,7 @@ sysctl_rdstruct(user_addr_t oldp, size_t *oldlenp, /* * Get file structures. 
*/ -static int +STATIC int sysctl_file (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -1118,10 +1292,10 @@ sysctl_file } SYSCTL_PROC(_kern, KERN_FILE, file, - CTLTYPE_STRUCT | CTLFLAG_RW, + CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_file, "S,filehead", ""); -static int +STATIC int sysdoproc_filt_KERN_PROC_PID(proc_t p, void * arg) { if (p->p_pid != (pid_t)*(int*)arg) @@ -1130,7 +1304,7 @@ sysdoproc_filt_KERN_PROC_PID(proc_t p, void * arg) return(1); } -static int +STATIC int sysdoproc_filt_KERN_PROC_PGRP(proc_t p, void * arg) { if (p->p_pgrpid != (pid_t)*(int*)arg) @@ -1139,7 +1313,7 @@ sysdoproc_filt_KERN_PROC_PGRP(proc_t p, void * arg) return(1); } -static int +STATIC int sysdoproc_filt_KERN_PROC_TTY(proc_t p, void * arg) { boolean_t funnel_state; @@ -1162,7 +1336,7 @@ sysdoproc_filt_KERN_PROC_TTY(proc_t p, void * arg) return(retval); } -static int +STATIC int sysdoproc_filt_KERN_PROC_UID(proc_t p, void * arg) { kauth_cred_t my_cred; @@ -1181,7 +1355,7 @@ sysdoproc_filt_KERN_PROC_UID(proc_t p, void * arg) } -static int +STATIC int sysdoproc_filt_KERN_PROC_RUID(proc_t p, void * arg) { kauth_cred_t my_cred; @@ -1190,7 +1364,7 @@ sysdoproc_filt_KERN_PROC_RUID(proc_t p, void * arg) if (p->p_ucred == NULL) return(0); my_cred = kauth_cred_proc_ref(p); - ruid = my_cred->cr_ruid; + ruid = kauth_cred_getruid(my_cred); kauth_cred_unref(&my_cred); if (ruid != (uid_t)*(int*)arg) @@ -1200,7 +1374,7 @@ sysdoproc_filt_KERN_PROC_RUID(proc_t p, void * arg) } #if CONFIG_LCTX -static int +STATIC int sysdoproc_filt_KERN_PROC_LCID(proc_t p, void * arg) { if ((p->p_lctx == NULL) || @@ -1263,12 +1437,18 @@ sysdoproc_callback(proc_t p, void * arg) return(PROC_RETURNED); } -int -sysctl_doproc(int *name, u_int namelen, user_addr_t where, size_t *sizep) +SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD | CTLFLAG_LOCKED, 0, ""); +STATIC int +sysctl_prochandle SYSCTL_HANDLER_ARGS { + int cmd = oidp->oid_arg2; /* subcommand for multiple nodes */ + int *name = arg1; /* oid element argument vector */ + int namelen = arg2; /* number of oid element arguments */ + user_addr_t where = req->oldptr;/* user buffer copy out address */ + user_addr_t dp = where; size_t needed = 0; - int buflen = where != USER_ADDR_NULL ? *sizep : 0; + int buflen = where != USER_ADDR_NULL ? 
req->oldlen : 0; int error = 0; boolean_t is_64_bit = FALSE; struct user32_kinfo_proc user32_kproc; @@ -1281,8 +1461,9 @@ sysctl_doproc(int *name, u_int namelen, user_addr_t where, size_t *sizep) int ruidcheck = 0; int ttycheck = 0; - if (namelen != 2 && !(namelen == 1 && name[0] == KERN_PROC_ALL)) + if (namelen != 1 && !(namelen == 0 && cmd == KERN_PROC_ALL)) return (EINVAL); + is_64_bit = proc_is64bit(current_proc()); if (is_64_bit) { sizeof_kproc = sizeof(user_kproc); @@ -1294,7 +1475,7 @@ sysctl_doproc(int *name, u_int namelen, user_addr_t where, size_t *sizep) } - switch (name[0]) { + switch (cmd) { case KERN_PROC_PID: filterfn = sysdoproc_filt_KERN_PROC_PID; @@ -1321,6 +1502,12 @@ sysctl_doproc(int *name, u_int namelen, user_addr_t where, size_t *sizep) filterfn = sysdoproc_filt_KERN_PROC_LCID; break; #endif + case KERN_PROC_ALL: + break; + + default: + /* must be kern.proc.<unknown> */ + return (ENOTSUP); } error = 0; @@ -1334,9 +1521,10 @@ sysctl_doproc(int *name, u_int namelen, user_addr_t where, size_t *sizep) args.ruidcheck = ruidcheck; args.ttycheck = ttycheck; args.sizeof_kproc = sizeof_kproc; - args.uidval = name[1]; + if (namelen) + args.uidval = name[0]; - proc_iterate((PROC_ALLPROCLIST | PROC_ZOMBPROCLIST), sysdoproc_callback, &args, filterfn, &name[1]); + proc_iterate((PROC_ALLPROCLIST | PROC_ZOMBPROCLIST), sysdoproc_callback, &args, filterfn, name); if (error) return(error); @@ -1345,20 +1533,87 @@ sysctl_doproc(int *name, u_int namelen, user_addr_t where, size_t *sizep) needed = args.needed; if (where != USER_ADDR_NULL) { - *sizep = dp - where; - if (needed > *sizep) + req->oldlen = dp - where; + if (needed > req->oldlen) return (ENOMEM); } else { needed += KERN_PROCSLOP; - *sizep = needed; + req->oldlen = needed; } + /* adjust index so we return the right required/consumed amount */ + req->oldidx += req->oldlen; return (0); } +/* + * We specify the subcommand code for multiple nodes as the 'req->arg2' value + * in the sysctl declaration itself, which comes into the handler function + * as 'oidp->oid_arg2'. + * + * For these particular sysctls, since they have well known OIDs, we could + * have just obtained it from the '((int *)arg1)[0]' parameter, but that would + * not demonstrate how to handle multiple sysctls that used OID_AUTO instead + * of a well known value with a common handler function. This is desirable, + * because we want well known values to "go away" at some future date. + * + * It should be noted that the value of '((int *)arg1)[1]' is used for many + * an integer parameter to the subcommand for many of these sysctls; we'd + * rather have used '((int *)arg1)[0]' for that, or even better, an element + * in a structure passed in as the the 'newp' argument to sysctlbyname(3), + * and then use leaf-node permissions enforcement, but that would have + * necessitated modifying user space code to correspond to the interface + * change, and we are striving for binary backward compatibility here; even + * though these are SPI, and not intended for use by user space applications + * which are not themselves system tools or libraries, some applications + * have erroneously used them. 
+ */ +SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + KERN_PROC_ALL, /* Integer argument (arg2) */ + sysctl_prochandle, /* Handler function */ + NULL, /* Data is size variant on ILP32/LP64 */ + ""); +SYSCTL_PROC(_kern_proc, KERN_PROC_PID, pid, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + KERN_PROC_PID, /* Integer argument (arg2) */ + sysctl_prochandle, /* Handler function */ + NULL, /* Data is size variant on ILP32/LP64 */ + ""); +SYSCTL_PROC(_kern_proc, KERN_PROC_TTY, tty, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + KERN_PROC_TTY, /* Integer argument (arg2) */ + sysctl_prochandle, /* Handler function */ + NULL, /* Data is size variant on ILP32/LP64 */ + ""); +SYSCTL_PROC(_kern_proc, KERN_PROC_PGRP, pgrp, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + KERN_PROC_PGRP, /* Integer argument (arg2) */ + sysctl_prochandle, /* Handler function */ + NULL, /* Data is size variant on ILP32/LP64 */ + ""); +SYSCTL_PROC(_kern_proc, KERN_PROC_UID, uid, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + KERN_PROC_UID, /* Integer argument (arg2) */ + sysctl_prochandle, /* Handler function */ + NULL, /* Data is size variant on ILP32/LP64 */ + ""); +SYSCTL_PROC(_kern_proc, KERN_PROC_RUID, ruid, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + KERN_PROC_RUID, /* Integer argument (arg2) */ + sysctl_prochandle, /* Handler function */ + NULL, /* Data is size variant on ILP32/LP64 */ + ""); +SYSCTL_PROC(_kern_proc, KERN_PROC_LCID, lcid, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + KERN_PROC_LCID, /* Integer argument (arg2) */ + sysctl_prochandle, /* Handler function */ + NULL, /* Data is size variant on ILP32/LP64 */ + ""); + /* * Fill in an eproc structure for the specified process. */ -static void +STATIC void fill_user32_eproc(proc_t p, struct user32_eproc *ep) { struct tty *tp; @@ -1396,15 +1651,15 @@ fill_user32_eproc(proc_t p, struct user32_eproc *ep) my_cred = kauth_cred_proc_ref(p); /* A fake historical pcred */ - ep->e_pcred.p_ruid = my_cred->cr_ruid; - ep->e_pcred.p_svuid = my_cred->cr_svuid; - ep->e_pcred.p_rgid = my_cred->cr_rgid; - ep->e_pcred.p_svgid = my_cred->cr_svgid; + ep->e_pcred.p_ruid = kauth_cred_getruid(my_cred); + ep->e_pcred.p_svuid = kauth_cred_getsvuid(my_cred); + ep->e_pcred.p_rgid = kauth_cred_getrgid(my_cred); + ep->e_pcred.p_svgid = kauth_cred_getsvgid(my_cred); /* A fake historical *kauth_cred_t */ ep->e_ucred.cr_ref = my_cred->cr_ref; ep->e_ucred.cr_uid = kauth_cred_getuid(my_cred); - ep->e_ucred.cr_ngroups = my_cred->cr_ngroups; - bcopy(my_cred->cr_groups, ep->e_ucred.cr_groups, NGROUPS*sizeof(gid_t)); + ep->e_ucred.cr_ngroups = posix_cred_get(my_cred)->cr_ngroups; + bcopy(posix_cred_get(my_cred)->cr_groups, ep->e_ucred.cr_groups, NGROUPS*sizeof(gid_t)); kauth_cred_unref(&my_cred); } @@ -1437,7 +1692,7 @@ fill_user32_eproc(proc_t p, struct user32_eproc *ep) /* * Fill in an LP64 version of eproc structure for the specified process. 
*/ -static void +STATIC void fill_user64_eproc(proc_t p, struct user64_eproc *ep) { struct tty *tp; @@ -1476,16 +1731,16 @@ fill_user64_eproc(proc_t p, struct user64_eproc *ep) my_cred = kauth_cred_proc_ref(p); /* A fake historical pcred */ - ep->e_pcred.p_ruid = my_cred->cr_ruid; - ep->e_pcred.p_svuid = my_cred->cr_svuid; - ep->e_pcred.p_rgid = my_cred->cr_rgid; - ep->e_pcred.p_svgid = my_cred->cr_svgid; + ep->e_pcred.p_ruid = kauth_cred_getruid(my_cred); + ep->e_pcred.p_svuid = kauth_cred_getsvuid(my_cred); + ep->e_pcred.p_rgid = kauth_cred_getrgid(my_cred); + ep->e_pcred.p_svgid = kauth_cred_getsvgid(my_cred); /* A fake historical *kauth_cred_t */ ep->e_ucred.cr_ref = my_cred->cr_ref; ep->e_ucred.cr_uid = kauth_cred_getuid(my_cred); - ep->e_ucred.cr_ngroups = my_cred->cr_ngroups; - bcopy(my_cred->cr_groups, ep->e_ucred.cr_groups, NGROUPS*sizeof(gid_t)); + ep->e_ucred.cr_ngroups = posix_cred_get(my_cred)->cr_ngroups; + bcopy(posix_cred_get(my_cred)->cr_groups, ep->e_ucred.cr_groups, NGROUPS*sizeof(gid_t)); kauth_cred_unref(&my_cred); } @@ -1518,7 +1773,7 @@ fill_user64_eproc(proc_t p, struct user64_eproc *ep) /* * Fill in an eproc structure for the specified process. */ -static void +STATIC void fill_user32_externproc(proc_t p, struct user32_extern_proc *exp) { exp->p_forw = exp->p_back = 0; @@ -1583,7 +1838,7 @@ fill_user32_externproc(proc_t p, struct user32_extern_proc *exp) /* * Fill in an LP64 version of extern_proc structure for the specified process. */ -static void +STATIC void fill_user64_externproc(proc_t p, struct user64_extern_proc *exp) { exp->p_forw = exp->p_back = USER_ADDR_NULL; @@ -1649,7 +1904,7 @@ fill_user64_externproc(proc_t p, struct user64_extern_proc *exp) exp->p_ru = CAST_USER_ADDR_T(p->p_ru); /* XXX may be NULL */ } -static void +STATIC void fill_user32_proc(proc_t p, struct user32_kinfo_proc *kp) { /* on a 64 bit kernel, 32 bit users will get some truncated information */ @@ -1657,23 +1912,31 @@ fill_user32_proc(proc_t p, struct user32_kinfo_proc *kp) fill_user32_eproc(p, &kp->kp_eproc); } -static void +STATIC void fill_user64_proc(proc_t p, struct user64_kinfo_proc *kp) { fill_user64_externproc(p, &kp->kp_proc); fill_user64_eproc(p, &kp->kp_eproc); } -int -kdebug_ops(int *name, u_int namelen, user_addr_t where, - size_t *sizep, proc_t p) +STATIC int +sysctl_kdebug_ops SYSCTL_HANDLER_ARGS { + __unused int cmd = oidp->oid_arg2; /* subcommand*/ + int *name = arg1; /* oid element argument vector */ + int namelen = arg2; /* number of oid element arguments */ + user_addr_t oldp = req->oldptr; /* user buffer copy out address */ + size_t *oldlenp = &req->oldlen; /* user buffer copy out size */ +// user_addr_t newp = req->newptr; /* user buffer copy in address */ +// size_t newlen = req->newlen; /* user buffer copy in size */ + + proc_t p = current_proc(); int ret=0; if (namelen == 0) return(ENOTSUP); - ret = suser(kauth_cred_get(), &p->p_acflag); + ret = suser(kauth_cred_get(), &p->p_acflag); if (ret) return(ret); @@ -1687,41 +1950,96 @@ kdebug_ops(int *name, u_int namelen, user_addr_t where, case KERN_KDSETREG: case KERN_KDGETREG: case KERN_KDREADTR: + case KERN_KDWRITETR: + case KERN_KDWRITEMAP: case KERN_KDPIDTR: case KERN_KDTHRMAP: case KERN_KDPIDEX: case KERN_KDSETRTCDEC: case KERN_KDSETBUF: case KERN_KDGETENTROPY: - ret = kdbg_control(name, namelen, where, sizep); + ret = kdbg_control(name, namelen, oldp, oldlenp); break; default: ret= ENOTSUP; break; } - return(ret); + + /* adjust index so we return the right required/consumed amount */ + if (!ret) + req->oldidx 
+= req->oldlen; + + return (ret); } +SYSCTL_PROC(_kern, KERN_KDEBUG, kdebug, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + 0, /* Integer argument (arg2) */ + sysctl_kdebug_ops, /* Handler function */ + NULL, /* Data pointer */ + ""); +#if !CONFIG_EMBEDDED /* * Return the top *sizep bytes of the user stack, or the entire area of the * user stack down through the saved exec_path, whichever is smaller. */ -int -sysctl_procargs(int *name, u_int namelen, user_addr_t where, - size_t *sizep, proc_t cur_proc) -{ - return sysctl_procargsx( name, namelen, where, sizep, cur_proc, 0); +STATIC int +sysctl_doprocargs SYSCTL_HANDLER_ARGS +{ + __unused int cmd = oidp->oid_arg2; /* subcommand*/ + int *name = arg1; /* oid element argument vector */ + int namelen = arg2; /* number of oid element arguments */ + user_addr_t oldp = req->oldptr; /* user buffer copy out address */ + size_t *oldlenp = &req->oldlen; /* user buffer copy out size */ +// user_addr_t newp = req->newptr; /* user buffer copy in address */ +// size_t newlen = req->newlen; /* user buffer copy in size */ + int error; + + error = sysctl_procargsx( name, namelen, oldp, oldlenp, current_proc(), 0); + + /* adjust index so we return the right required/consumed amount */ + if (!error) + req->oldidx += req->oldlen; + + return (error); } +SYSCTL_PROC(_kern, KERN_PROCARGS, procargs, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + 0, /* Integer argument (arg2) */ + sysctl_doprocargs, /* Handler function */ + NULL, /* Data pointer */ + ""); +#endif /* !CONFIG_EMBEDDED */ + +STATIC int +sysctl_doprocargs2 SYSCTL_HANDLER_ARGS +{ + __unused int cmd = oidp->oid_arg2; /* subcommand*/ + int *name = arg1; /* oid element argument vector */ + int namelen = arg2; /* number of oid element arguments */ + user_addr_t oldp = req->oldptr; /* user buffer copy out address */ + size_t *oldlenp = &req->oldlen; /* user buffer copy out size */ +// user_addr_t newp = req->newptr; /* user buffer copy in address */ +// size_t newlen = req->newlen; /* user buffer copy in size */ + int error; -static int -sysctl_procargs2(int *name, u_int namelen, user_addr_t where, - size_t *sizep, proc_t cur_proc) -{ - return sysctl_procargsx( name, namelen, where, sizep, cur_proc, 1); + error = sysctl_procargsx( name, namelen, oldp, oldlenp, current_proc(), 1); + + /* adjust index so we return the right required/consumed amount */ + if (!error) + req->oldidx += req->oldlen; + + return (error); } +SYSCTL_PROC(_kern, KERN_PROCARGS2, procargs2, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + 0, /* Integer argument (arg2) */ + sysctl_doprocargs2, /* Handler function */ + NULL, /* Data pointer */ + ""); -static int +STATIC int sysctl_procargsx(int *name, u_int namelen, user_addr_t where, size_t *sizep, proc_t cur_proc, int argc_yes) { @@ -1977,7 +2295,7 @@ sysctl_procargsx(int *name, u_int namelen, user_addr_t where, /* * Max number of concurrent aio requests */ -static int +STATIC int sysctl_aiomax (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -1997,7 +2315,7 @@ sysctl_aiomax /* * Max number of concurrent aio requests per process */ -static int +STATIC int sysctl_aioprocmax (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2017,7 +2335,7 @@ sysctl_aioprocmax /* * Max number of async IO worker threads */ -static int +STATIC int sysctl_aiothreads (__unused struct sysctl_oid *oidp, __unused 
void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2039,7 +2357,7 @@ sysctl_aiothreads /* * System-wide limit on the max number of processes */ -static int +STATIC int sysctl_maxproc (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2058,26 +2376,30 @@ sysctl_maxproc } SYSCTL_STRING(_kern, KERN_OSTYPE, ostype, - CTLFLAG_RD | CTLFLAG_KERN, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, ostype, 0, ""); SYSCTL_STRING(_kern, KERN_OSRELEASE, osrelease, - CTLFLAG_RD | CTLFLAG_KERN, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, osrelease, 0, ""); SYSCTL_INT(_kern, KERN_OSREV, osrevision, - CTLFLAG_RD | CTLFLAG_KERN, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (int *)NULL, BSD, ""); SYSCTL_STRING(_kern, KERN_VERSION, version, - CTLFLAG_RD | CTLFLAG_KERN, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, version, 0, ""); +SYSCTL_STRING(_kern, OID_AUTO, uuid, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, + &kernel_uuid[0], 0, ""); #if DEBUG int debug_kprint_syscall = 0; char debug_kprint_syscall_process[MAXCOMLEN+1]; +/* Thread safe: bits and string value are not used to reclaim state */ SYSCTL_INT (_debug, OID_AUTO, kprint_syscall, - CTLFLAG_RW, &debug_kprint_syscall, 0, "kprintf syscall tracing"); + CTLFLAG_RW | CTLFLAG_LOCKED, &debug_kprint_syscall, 0, "kprintf syscall tracing"); SYSCTL_STRING(_debug, OID_AUTO, kprint_syscall_process, - CTLFLAG_RW, debug_kprint_syscall_process, sizeof(debug_kprint_syscall_process), + CTLFLAG_RW | CTLFLAG_LOCKED, debug_kprint_syscall_process, sizeof(debug_kprint_syscall_process), "name of process for kprintf syscall tracing"); int debug_kprint_current_process(const char **namep) @@ -2113,7 +2435,7 @@ int debug_kprint_current_process(const char **namep) /* PR-5293665: need to use a callback function for kern.osversion to set * osversion in IORegistry */ -static int +STATIC int sysctl_osversion(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req) { int rval = 0; @@ -2128,11 +2450,11 @@ sysctl_osversion(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct } SYSCTL_PROC(_kern, KERN_OSVERSION, osversion, - CTLFLAG_RW | CTLFLAG_KERN | CTLTYPE_STRING, + CTLFLAG_RW | CTLFLAG_KERN | CTLTYPE_STRING | CTLFLAG_LOCKED, osversion, 256 /* OSVERSIZE*/, sysctl_osversion, "A", ""); -static int +STATIC int sysctl_sysctl_bootargs (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2150,46 +2472,46 @@ SYSCTL_PROC(_kern, OID_AUTO, bootargs, sysctl_sysctl_bootargs, "A", "bootargs"); SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, - CTLFLAG_RW | CTLFLAG_KERN, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, &maxfiles, 0, ""); SYSCTL_INT(_kern, KERN_ARGMAX, argmax, - CTLFLAG_RD | CTLFLAG_KERN, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (int *)NULL, ARG_MAX, ""); SYSCTL_INT(_kern, KERN_POSIX1, posix1version, - CTLFLAG_RD | CTLFLAG_KERN, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (int *)NULL, _POSIX_VERSION, ""); SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, - CTLFLAG_RD | CTLFLAG_KERN, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (int *)NULL, NGROUPS_MAX, ""); SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, - CTLFLAG_RD | CTLFLAG_KERN, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (int *)NULL, 1, ""); #if 1 /* _POSIX_SAVED_IDS from <unistd.h> */ SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, - CTLFLAG_RD | CTLFLAG_KERN, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (int *)NULL, 1, ""); #else SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, - 
CTLFLAG_RD | CTLFLAG_KERN, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, NULL, 0, ""); #endif SYSCTL_INT(_kern, OID_AUTO, num_files, - CTLFLAG_RD, + CTLFLAG_RD | CTLFLAG_LOCKED, &nfiles, 0, ""); SYSCTL_COMPAT_INT(_kern, OID_AUTO, num_vnodes, - CTLFLAG_RD, + CTLFLAG_RD | CTLFLAG_LOCKED, &numvnodes, 0, ""); SYSCTL_INT(_kern, OID_AUTO, num_tasks, - CTLFLAG_RD, + CTLFLAG_RD | CTLFLAG_LOCKED, &task_max, 0, ""); SYSCTL_INT(_kern, OID_AUTO, num_threads, - CTLFLAG_RD, + CTLFLAG_RD | CTLFLAG_LOCKED, &thread_max, 0, ""); SYSCTL_INT(_kern, OID_AUTO, num_taskthreads, - CTLFLAG_RD, + CTLFLAG_RD | CTLFLAG_LOCKED, &task_threadmax, 0, ""); -static int +STATIC int sysctl_maxvnodes (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { int oldval = desiredvnodes; @@ -2203,27 +2525,31 @@ sysctl_maxvnodes (__unused struct sysctl_oid *oidp, __unused void *arg1, __unuse return(error); } +SYSCTL_INT(_kern, OID_AUTO, namecache_disabled, + CTLFLAG_RW | CTLFLAG_LOCKED, + &nc_disabled, 0, ""); + SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, - CTLTYPE_INT | CTLFLAG_RW, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_maxvnodes, "I", ""); SYSCTL_PROC(_kern, KERN_MAXPROC, maxproc, - CTLTYPE_INT | CTLFLAG_RW, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_maxproc, "I", ""); SYSCTL_PROC(_kern, KERN_AIOMAX, aiomax, - CTLTYPE_INT | CTLFLAG_RW, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_aiomax, "I", ""); SYSCTL_PROC(_kern, KERN_AIOPROCMAX, aioprocmax, - CTLTYPE_INT | CTLFLAG_RW, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_aioprocmax, "I", ""); SYSCTL_PROC(_kern, KERN_AIOTHREADS, aiothreads, - CTLTYPE_INT | CTLFLAG_RW, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_aiothreads, "I", ""); -static int +STATIC int sysctl_securelvl (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2242,11 +2568,11 @@ sysctl_securelvl } SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel, - CTLTYPE_INT | CTLFLAG_RW, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_securelvl, "I", ""); -static int +STATIC int sysctl_domainname (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2259,14 +2585,14 @@ sysctl_domainname } SYSCTL_PROC(_kern, KERN_DOMAINNAME, nisdomainname, - CTLTYPE_STRING | CTLFLAG_RW, + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_domainname, "A", ""); SYSCTL_COMPAT_INT(_kern, KERN_HOSTID, hostid, - CTLFLAG_RW | CTLFLAG_KERN, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, &hostid, 0, ""); -static int +STATIC int sysctl_hostname (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2280,10 +2606,10 @@ sysctl_hostname SYSCTL_PROC(_kern, KERN_HOSTNAME, hostname, - CTLTYPE_STRING | CTLFLAG_RW, + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_hostname, "A", ""); -static int +STATIC int sysctl_procname (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2293,26 +2619,59 @@ sysctl_procname } SYSCTL_PROC(_kern, KERN_PROCNAME, procname, - CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_ANYBODY, + CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, 0, 0, sysctl_procname, "A", ""); SYSCTL_INT(_kern, KERN_SPECULATIVE_READS, speculative_reads_disabled, - CTLFLAG_RW | CTLFLAG_KERN, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, &speculative_reads_disabled, 0, ""); +SYSCTL_INT(_kern, OID_AUTO, ignore_is_ssd, 
+ CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, + &ignore_is_ssd, 0, ""); + SYSCTL_UINT(_kern, OID_AUTO, preheat_pages_max, - CTLFLAG_RW | CTLFLAG_KERN, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, &preheat_pages_max, 0, ""); SYSCTL_UINT(_kern, OID_AUTO, preheat_pages_min, - CTLFLAG_RW | CTLFLAG_KERN, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, &preheat_pages_min, 0, ""); -SYSCTL_UINT(_kern, OID_AUTO, preheat_pages_mult, - CTLFLAG_RW | CTLFLAG_KERN, - &preheat_pages_mult, 0, ""); +SYSCTL_UINT(_kern, OID_AUTO, speculative_prefetch_max, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, + &speculative_prefetch_max, 0, ""); -static int +SYSCTL_UINT(_kern, OID_AUTO, vm_page_free_target, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, + &vm_page_free_target, 0, ""); + +SYSCTL_UINT(_kern, OID_AUTO, vm_page_free_min, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, + &vm_page_free_min, 0, ""); + +SYSCTL_UINT(_kern, OID_AUTO, vm_page_free_reserved, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, + &vm_page_free_reserved, 0, ""); + +SYSCTL_UINT(_kern, OID_AUTO, vm_page_speculative_percentage, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, + &vm_page_speculative_percentage, 0, ""); + +SYSCTL_UINT(_kern, OID_AUTO, vm_page_speculative_q_age_ms, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, + &vm_page_speculative_q_age_ms, 0, ""); + +SYSCTL_UINT(_kern, OID_AUTO, vm_max_delayed_work_limit, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, + &vm_max_delayed_work_limit, 0, ""); + +SYSCTL_UINT(_kern, OID_AUTO, vm_max_batch, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, + &vm_max_batch, 0, ""); + + +STATIC int sysctl_boottime (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2333,10 +2692,10 @@ sysctl_boottime } SYSCTL_PROC(_kern, KERN_BOOTTIME, boottime, - CTLTYPE_STRUCT | CTLFLAG_RD, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_boottime, "S,timeval", ""); -static int +STATIC int sysctl_symfile (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2349,11 +2708,11 @@ sysctl_symfile SYSCTL_PROC(_kern, KERN_SYMFILE, symfile, - CTLTYPE_STRING | CTLFLAG_RD, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_symfile, "A", ""); #if NFSCLIENT -static int +STATIC int sysctl_netboot (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2361,12 +2720,15 @@ sysctl_netboot } SYSCTL_PROC(_kern, KERN_NETBOOT, netboot, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_netboot, "I", ""); #endif #ifdef CONFIG_IMGSRC_ACCESS -static int +/* + * Legacy--act as if only one layer of nesting is possible. 
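The vm_page_* and speculative-I/O knobs above are plain unsigned integers, so they are reachable by name from user space. A minimal user-space sketch (the OID name comes from the declarations above; error handling is kept minimal):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
    unsigned int target = 0;
    size_t len = sizeof(target);

    /* declared above as kern.vm_page_free_target (CTLFLAG_RW | CTLFLAG_KERN) */
    if (sysctlbyname("kern.vm_page_free_target", &target, &len, NULL, 0) == -1) {
        perror("sysctlbyname");
        return 1;
    }
    printf("vm_page_free_target = %u pages\n", target);
    return 0;
}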
+ */ +STATIC int sysctl_imgsrcdev (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2378,16 +2740,16 @@ sysctl_imgsrcdev return EPERM; } - if (imgsrc_rootvnode == NULL) { + if (imgsrc_rootvnodes[0] == NULL) { return ENOENT; } - result = vnode_getwithref(imgsrc_rootvnode); + result = vnode_getwithref(imgsrc_rootvnodes[0]); if (result != 0) { return result; } - devvp = vnode_mount(imgsrc_rootvnode)->mnt_devvp; + devvp = vnode_mount(imgsrc_rootvnodes[0])->mnt_devvp; result = vnode_getwithref(devvp); if (result != 0) { goto out; @@ -2397,16 +2759,82 @@ sysctl_imgsrcdev vnode_put(devvp); out: - vnode_put(imgsrc_rootvnode); + vnode_put(imgsrc_rootvnodes[0]); return result; } SYSCTL_PROC(_kern, OID_AUTO, imgsrcdev, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_imgsrcdev, "I", ""); + +STATIC int +sysctl_imgsrcinfo +(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + int error; + struct imgsrc_info info[MAX_IMAGEBOOT_NESTING]; /* 2 for now, no problem */ + uint32_t i; + vnode_t rvp, devvp; + + if (imgsrc_rootvnodes[0] == NULLVP) { + return ENXIO; + } + + for (i = 0; i < MAX_IMAGEBOOT_NESTING; i++) { + /* + * Go get the root vnode. + */ + rvp = imgsrc_rootvnodes[i]; + if (rvp == NULLVP) { + break; + } + + error = vnode_get(rvp); + if (error != 0) { + return error; + } + + /* + * For now, no getting at a non-local volume. + */ + devvp = vnode_mount(rvp)->mnt_devvp; + if (devvp == NULL) { + vnode_put(rvp); + return EINVAL; + } + + error = vnode_getwithref(devvp); + if (error != 0) { + vnode_put(rvp); + return error; + } + + /* + * Fill in info. + */ + info[i].ii_dev = vnode_specrdev(devvp); + info[i].ii_flags = 0; + info[i].ii_height = i; + bzero(info[i].ii_reserved, sizeof(info[i].ii_reserved)); + + vnode_put(devvp); + vnode_put(rvp); + } + + return sysctl_io_opaque(req, info, i * sizeof(info[0]), NULL); +} + +SYSCTL_PROC(_kern, OID_AUTO, imgsrcinfo, + CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, sysctl_imgsrcinfo, "I", ""); + #endif /* CONFIG_IMGSRC_ACCESS */ -static int +SYSCTL_INT(_kern, OID_AUTO, timer_coalescing_enabled, + CTLFLAG_RW | CTLFLAG_LOCKED, + &mach_timer_coalescing_enabled, 0, ""); + +STATIC int sysctl_usrstack (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2414,10 +2842,10 @@ sysctl_usrstack } SYSCTL_PROC(_kern, KERN_USRSTACK32, usrstack, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_usrstack, "I", ""); -static int +STATIC int sysctl_usrstack64 (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2425,14 +2853,14 @@ sysctl_usrstack64 } SYSCTL_PROC(_kern, KERN_USRSTACK64, usrstack64, - CTLTYPE_QUAD | CTLFLAG_RD, + CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_usrstack64, "Q", ""); SYSCTL_STRING(_kern, KERN_COREFILE, corefile, - CTLFLAG_RW | CTLFLAG_KERN, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, corefilename, sizeof(corefilename), ""); -static int +STATIC int sysctl_coredump (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2451,10 +2879,10 @@ sysctl_coredump } SYSCTL_PROC(_kern, KERN_COREDUMP, coredump, - CTLTYPE_INT | CTLFLAG_RW, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_coredump, "I", ""); -static int +STATIC int sysctl_suid_coredump (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused 
int arg2, struct sysctl_req *req) { @@ -2473,10 +2901,10 @@ sysctl_suid_coredump } SYSCTL_PROC(_kern, KERN_SUGID_COREDUMP, sugid_coredump, - CTLTYPE_INT | CTLFLAG_RW, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_suid_coredump, "I", ""); -static int +STATIC int sysctl_delayterm (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2495,11 +2923,11 @@ sysctl_delayterm } SYSCTL_PROC(_kern, KERN_PROCDELAYTERM, delayterm, - CTLTYPE_INT | CTLFLAG_RW, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_delayterm, "I", ""); -static int +STATIC int sysctl_rage_vnode (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2545,11 +2973,11 @@ sysctl_rage_vnode } SYSCTL_PROC(_kern, KERN_RAGEVNODE, rage_vnode, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, 0, 0, sysctl_rage_vnode, "I", ""); -static int +STATIC int sysctl_kern_check_openevt (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2582,12 +3010,12 @@ sysctl_kern_check_openevt return(error); } -SYSCTL_PROC(_kern, KERN_CHECKOPENEVT, check_openevt, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY, +SYSCTL_PROC(_kern, KERN_CHECKOPENEVT, check_openevt, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, 0, 0, sysctl_kern_check_openevt, "I", "set the per-process check-open-evt flag"); -static int +STATIC int sysctl_nx (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2617,10 +3045,10 @@ sysctl_nx SYSCTL_PROC(_kern, KERN_NX_PROTECTION, nx, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_KERN, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_nx, "I", ""); -static int +STATIC int sysctl_loadavg (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2636,10 +3064,30 @@ sysctl_loadavg } SYSCTL_PROC(_vm, VM_LOADAVG, loadavg, - CTLTYPE_STRUCT | CTLFLAG_RD, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_loadavg, "S,loadavg", ""); -static int +/* + * Note: Thread safe; vm_map_lock protects in vm_toggle_entry_reuse() + */ +STATIC int +sysctl_vm_toggle_address_reuse(__unused struct sysctl_oid *oidp, __unused void *arg1, + __unused int arg2, struct sysctl_req *req) +{ + int old_value=0, new_value=0, error=0; + + if(vm_toggle_entry_reuse( VM_TOGGLE_GETVALUE, &old_value )) + return(error); + error = sysctl_io_number(req, old_value, sizeof(int), &new_value, NULL); + if (!error) { + return (vm_toggle_entry_reuse(new_value, NULL)); + } + return(error); +} + +SYSCTL_PROC(_debug, OID_AUTO, toggle_address_reuse, CTLFLAG_ANYBODY | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_vm_toggle_address_reuse,"I",""); + +STATIC int sysctl_swapusage (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2668,17 +3116,53 @@ sysctl_swapusage SYSCTL_PROC(_vm, VM_SWAPUSAGE, swapusage, - CTLTYPE_STRUCT | CTLFLAG_RD, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_swapusage, "S,xsw_usage", ""); +#if CONFIG_EMBEDDED +/* <rdar://problem/7688080> */ +boolean_t vm_freeze_enabled = FALSE; +#endif /* CONFIG_EMBEDDED */ + + +#if CONFIG_FREEZE +extern void vm_page_reactivate_all_throttled(void); + +static int +sysctl_freeze_enabled SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error, val = vm_freeze_enabled ? 
1 : 0; + boolean_t disabled; + + error = sysctl_handle_int(oidp, &val, 0, req); + if (error || !req->newptr) + return (error); + + /* + * If freeze is being disabled, we need to move dirty pages out from the throttle to the active queue. + */ + disabled = (!val && vm_freeze_enabled); + + vm_freeze_enabled = val ? TRUE : FALSE; + + if (disabled) { + vm_page_reactivate_all_throttled(); + } + + return (0); +} + +SYSCTL_PROC(_vm, OID_AUTO, freeze_enabled, CTLTYPE_INT|CTLFLAG_RW, &vm_freeze_enabled, 0, sysctl_freeze_enabled, "I", ""); +#endif /* CONFIG_FREEZE */ /* this kernel does NOT implement shared_region_make_private_np() */ SYSCTL_INT(_kern, KERN_SHREG_PRIVATIZABLE, shreg_private, - CTLFLAG_RD, + CTLFLAG_RD | CTLFLAG_LOCKED, (int *)NULL, 0, ""); #if defined(__i386__) || defined(__x86_64__) -static int +STATIC int sysctl_sysctl_exec_affinity(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) @@ -2706,10 +3190,10 @@ sysctl_sysctl_exec_affinity(__unused struct sysctl_oid *oidp, return 0; } -SYSCTL_PROC(_sysctl, OID_AUTO, proc_exec_affinity, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY, 0, 0, sysctl_sysctl_exec_affinity ,"I","proc_exec_affinity"); +SYSCTL_PROC(_sysctl, OID_AUTO, proc_exec_affinity, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY | CTLFLAG_LOCKED, 0, 0, sysctl_sysctl_exec_affinity ,"I","proc_exec_affinity"); #endif -static int +STATIC int fetch_process_cputype( proc_t cur_proc, int *name, @@ -2752,7 +3236,7 @@ out: return (error); } -static int +STATIC int sysctl_sysctl_native(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req) { @@ -2765,9 +3249,9 @@ sysctl_sysctl_native(__unused struct sysctl_oid *oidp, void *arg1, int arg2, res = 0; return SYSCTL_OUT(req, &res, sizeof(res)); } -SYSCTL_PROC(_sysctl, OID_AUTO, proc_native, CTLTYPE_NODE|CTLFLAG_RD, 0, 0, sysctl_sysctl_native ,"I","proc_native"); +SYSCTL_PROC(_sysctl, OID_AUTO, proc_native, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_sysctl_native ,"I","proc_native"); -static int +STATIC int sysctl_sysctl_cputype(__unused struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req) { @@ -2777,9 +3261,9 @@ sysctl_sysctl_cputype(__unused struct sysctl_oid *oidp, void *arg1, int arg2, return error; return SYSCTL_OUT(req, &proc_cputype, sizeof(proc_cputype)); } -SYSCTL_PROC(_sysctl, OID_AUTO, proc_cputype, CTLTYPE_NODE|CTLFLAG_RD, 0, 0, sysctl_sysctl_cputype ,"I","proc_cputype"); +SYSCTL_PROC(_sysctl, OID_AUTO, proc_cputype, CTLTYPE_NODE|CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_sysctl_cputype ,"I","proc_cputype"); -static int +STATIC int sysctl_safeboot (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2787,10 +3271,10 @@ sysctl_safeboot } SYSCTL_PROC(_kern, KERN_SAFEBOOT, safeboot, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_safeboot, "I", ""); -static int +STATIC int sysctl_singleuser (__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) { @@ -2798,7 +3282,7 @@ sysctl_singleuser } SYSCTL_PROC(_kern, OID_AUTO, singleuser, - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_singleuser, "I", ""); /* @@ -2808,9 +3292,9 @@ extern boolean_t affinity_sets_enabled; extern int affinity_sets_mapping; SYSCTL_INT (_kern, OID_AUTO, affinity_sets_enabled, - CTLFLAG_RW, (int *) &affinity_sets_enabled, 0, "hinting enabled"); + CTLFLAG_RW | CTLFLAG_LOCKED, (int *) &affinity_sets_enabled, 0, 
"hinting enabled"); SYSCTL_INT (_kern, OID_AUTO, affinity_sets_mapping, - CTLFLAG_RW, &affinity_sets_mapping, 0, "mapping policy"); + CTLFLAG_RW | CTLFLAG_LOCKED, &affinity_sets_mapping, 0, "mapping policy"); /* * Limit on total memory users can wire. @@ -2833,9 +3317,9 @@ vm_map_size_t vm_user_wire_limit; * There needs to be a more automatic/elegant way to do this */ -SYSCTL_QUAD(_vm, OID_AUTO, global_no_user_wire_amount, CTLFLAG_RW, &vm_global_no_user_wire_amount, ""); -SYSCTL_QUAD(_vm, OID_AUTO, global_user_wire_limit, CTLFLAG_RW, &vm_global_user_wire_limit, ""); -SYSCTL_QUAD(_vm, OID_AUTO, user_wire_limit, CTLFLAG_RW, &vm_user_wire_limit, ""); +SYSCTL_QUAD(_vm, OID_AUTO, global_no_user_wire_amount, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_global_no_user_wire_amount, ""); +SYSCTL_QUAD(_vm, OID_AUTO, global_user_wire_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_global_user_wire_limit, ""); +SYSCTL_QUAD(_vm, OID_AUTO, user_wire_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_user_wire_limit, ""); @@ -2846,15 +3330,15 @@ SYSCTL_QUAD(_vm, OID_AUTO, user_wire_limit, CTLFLAG_RW, &vm_user_wire_limit, "") extern uint32_t kdebug_thread_block; SYSCTL_INT (_kern, OID_AUTO, kdebug_thread_block, - CTLFLAG_RW, &kdebug_thread_block, 0, "kdebug thread_block"); + CTLFLAG_RW | CTLFLAG_LOCKED, &kdebug_thread_block, 0, "kdebug thread_block"); /* * Kernel stack size and depth */ SYSCTL_INT (_kern, OID_AUTO, stack_size, - CTLFLAG_RD, (int *) &kernel_stack_size, 0, "Kernel stack size"); + CTLFLAG_RD | CTLFLAG_LOCKED, (int *) &kernel_stack_size, 0, "Kernel stack size"); SYSCTL_INT (_kern, OID_AUTO, stack_depth_max, - CTLFLAG_RD, (int *) &kernel_stack_depth_max, 0, "Max kernel stack depth at interrupt or context switch"); + CTLFLAG_RD | CTLFLAG_LOCKED, (int *) &kernel_stack_depth_max, 0, "Max kernel stack depth at interrupt or context switch"); /* * enable back trace for port allocations @@ -2862,6 +3346,21 @@ SYSCTL_INT (_kern, OID_AUTO, stack_depth_max, extern int ipc_portbt; SYSCTL_INT(_kern, OID_AUTO, ipc_portbt, - CTLFLAG_RW | CTLFLAG_KERN, + CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, &ipc_portbt, 0, ""); +/* + * Scheduler sysctls + */ + +/* + * See osfmk/kern/sched_prim.c for the corresponding definition + * in osfmk/. If either version changes, update the other. 
+ */ +#define SCHED_STRING_MAX_LENGTH (48) + +extern char sched_string[SCHED_STRING_MAX_LENGTH]; +SYSCTL_STRING(_kern, OID_AUTO, sched, + CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, + sched_string, sizeof(sched_string), + "Timeshare scheduler implementation"); diff --git a/bsd/kern/kern_time.c b/bsd/kern/kern_time.c index 6f392fb86..315863131 100644 --- a/bsd/kern/kern_time.c +++ b/bsd/kern/kern_time.c @@ -75,6 +75,7 @@ #include <sys/kauth.h> #include <sys/vnode.h> #include <sys/time.h> +#include <sys/priv.h> #include <sys/mount_internal.h> #include <sys/sysproto.h> @@ -215,7 +216,7 @@ adjtime(struct proc *p, struct adjtime_args *uap, __unused int32_t *retval) if (error) return (error); #endif - if ((error = suser(kauth_cred_get(), &p->p_acflag))) + if ((error = priv_check_cred(kauth_cred_get(), PRIV_ADJTIME, 0))) return (error); if (IS_64BIT_PROCESS(p)) { struct user64_timeval user_atv; diff --git a/bsd/kern/kern_xxx.c b/bsd/kern/kern_xxx.c index 0a080dd59..293808838 100644 --- a/bsd/kern/kern_xxx.c +++ b/bsd/kern/kern_xxx.c @@ -116,7 +116,7 @@ reboot(struct proc *p, register struct reboot_args *uap, __unused int32_t *retva #endif if (!error) { OSBitOrAtomic(P_REBOOT, &p->p_flag); /* No more signals for this proc */ - boot(RB_BOOT, uap->opt, command); + error = boot(RB_BOOT, uap->opt, command); } return(error); } diff --git a/bsd/kern/kpi_mbuf.c b/bsd/kern/kpi_mbuf.c index a89cfbef0..70ab53b31 100644 --- a/bsd/kern/kpi_mbuf.c +++ b/bsd/kern/kpi_mbuf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2010 Apple Inc. All rights reserved. + * Copyright (c) 2004-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -40,9 +40,9 @@ #include "net/net_str_id.h" -static const mbuf_flags_t mbuf_flags_mask = MBUF_EXT | MBUF_PKTHDR | MBUF_EOR | - MBUF_BCAST | MBUF_MCAST | MBUF_FRAG | MBUF_FIRSTFRAG | - MBUF_LASTFRAG | MBUF_PROMISC; +static const mbuf_flags_t mbuf_flags_mask = (MBUF_EXT | MBUF_PKTHDR | MBUF_EOR | + MBUF_LOOP | MBUF_BCAST | MBUF_MCAST | MBUF_FRAG | MBUF_FIRSTFRAG | + MBUF_LASTFRAG | MBUF_PROMISC | MBUF_HASFCS); void* mbuf_data(mbuf_t mbuf) { @@ -81,6 +81,10 @@ errno_t mbuf_align_32(mbuf_t mbuf, size_t len) return 0; } +/* This function is used to provide mcl_to_paddr via symbol indirection, + * please avoid any change in behavior or remove the indirection in + * config/Unsupported* + */ addr64_t mbuf_data_to_physical(void* ptr) { return (addr64_t)(uintptr_t)mcl_to_paddr(ptr); @@ -107,10 +111,10 @@ mbuf_attachcluster(mbuf_how_t how, mbuf_type_t type, mbuf_t *mbuf, caddr_t extbuf, void (*extfree)(caddr_t , u_int, caddr_t), size_t extsize, caddr_t extarg) { - if (extbuf == NULL || extfree == NULL || extsize == 0) + if (mbuf == NULL || extbuf == NULL || extfree == NULL || extsize == 0) return (EINVAL); - if ((*mbuf = m_clattach(mbuf != NULL ? *mbuf : NULL, type, extbuf, + if ((*mbuf = m_clattach(*mbuf, type, extbuf, extfree, extsize, extarg, how)) == NULL) return (ENOMEM); @@ -126,15 +130,15 @@ mbuf_alloccluster(mbuf_how_t how, size_t *size, caddr_t *addr) *addr = NULL; /* Jumbo cluster pool not available? 
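Back up in the kern_time.c hunk, adjtime() trades its blanket superuser test for a named privilege. The shape of that migration, as a sketch assembled from the two lines in the diff:

    /* Before: any credential with uid 0 passes, and the check also
     * flags ASU accounting via p->p_acflag. */
    error = suser(kauth_cred_get(), &p->p_acflag);

    /* After: the operation is gated on one named privilege, so a MAC
     * policy can grant or refuse PRIV_ADJTIME independently of uid 0. */
    error = priv_check_cred(kauth_cred_get(), PRIV_ADJTIME, 0);
    if (error)
        return (error);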
*/ - if (*size > NBPG && njcl == 0) + if (*size > MBIGCLBYTES && njcl == 0) return (ENOTSUP); if (*size <= MCLBYTES && (*addr = m_mclalloc(how)) != NULL) *size = MCLBYTES; - else if (*size > MCLBYTES && *size <= NBPG && + else if (*size > MCLBYTES && *size <= MBIGCLBYTES && (*addr = m_bigalloc(how)) != NULL) - *size = NBPG; - else if (*size > NBPG && *size <= M16KCLBYTES && + *size = MBIGCLBYTES; + else if (*size > MBIGCLBYTES && *size <= M16KCLBYTES && (*addr = m_16kalloc(how)) != NULL) *size = M16KCLBYTES; else @@ -149,14 +153,14 @@ mbuf_alloccluster(mbuf_how_t how, size_t *size, caddr_t *addr) void mbuf_freecluster(caddr_t addr, size_t size) { - if (size != MCLBYTES && size != NBPG && size != M16KCLBYTES) + if (size != MCLBYTES && size != MBIGCLBYTES && size != M16KCLBYTES) panic("%s: invalid size (%ld) for cluster %p", __func__, size, (void *)addr); if (size == MCLBYTES) m_mclfree(addr); - else if (size == NBPG) - m_bigfree(addr, NBPG, NULL); + else if (size == MBIGCLBYTES) + m_bigfree(addr, MBIGCLBYTES, NULL); else if (njcl > 0) m_16kfree(addr, M16KCLBYTES, NULL); else @@ -184,7 +188,7 @@ mbuf_getcluster(mbuf_how_t how, mbuf_type_t type, size_t size, mbuf_t* mbuf) */ if (size == MCLBYTES) { *mbuf = m_mclget(*mbuf, how); - } else if (size == NBPG) { + } else if (size == MBIGCLBYTES) { *mbuf = m_mbigget(*mbuf, how); } else if (size == M16KCLBYTES) { if (njcl > 0) { @@ -254,11 +258,17 @@ errno_t mbuf_getpacket(mbuf_how_t how, mbuf_t *mbuf) return error; } +/* This function is used to provide m_free via symbol indirection, please avoid + * any change in behavior or remove the indirection in config/Unsupported* + */ mbuf_t mbuf_free(mbuf_t mbuf) { return m_free(mbuf); } +/* This function is used to provide m_freem via symbol indirection, please avoid + * any change in behavior or remove the indirection in config/Unsupported* + */ void mbuf_freem(mbuf_t mbuf) { m_freem(mbuf); @@ -274,6 +284,10 @@ size_t mbuf_leadingspace(const mbuf_t mbuf) return m_leadingspace(mbuf); } +/* This function is used to provide m_trailingspace via symbol indirection, + * please avoid any change in behavior or remove the indirection in + * config/Unsupported* + */ size_t mbuf_trailingspace(const mbuf_t mbuf) { return m_trailingspace(mbuf); @@ -332,6 +346,9 @@ errno_t mbuf_pulldown(mbuf_t src, size_t *offset, size_t len, mbuf_t *location) return (*location == NULL) ? ENOMEM : 0; } +/* This function is used to provide m_adj via symbol indirection, please avoid + * any change in behavior or remove the indirection in config/Unsupported* + */ void mbuf_adj(mbuf_t mbuf, int len) { m_adj(mbuf, len); @@ -544,7 +561,7 @@ void mbuf_outbound_finalize(mbuf_t mbuf, u_int32_t protocol_family, size_t protocol_offset) { if ((mbuf->m_pkthdr.csum_flags & - (CSUM_DELAY_DATA | CSUM_DELAY_IP | CSUM_TCP_SUM16)) == 0) + (CSUM_DELAY_DATA | CSUM_DELAY_IP | CSUM_TCP_SUM16 | CSUM_DELAY_IPV6_DATA)) == 0) return; /* Generate the packet in software, client needs it */ @@ -573,14 +590,23 @@ mbuf_outbound_finalize(mbuf_t mbuf, u_int32_t protocol_family, size_t protocol_o mbuf->m_pkthdr.csum_flags &= ~(CSUM_DELAY_DATA | CSUM_DELAY_IP); break; + + case PF_INET6: + + if (mbuf->m_pkthdr.csum_flags & CSUM_DELAY_IPV6_DATA) { + in_delayed_cksum_offset(mbuf, protocol_offset); + } + mbuf->m_pkthdr.csum_flags &= ~CSUM_DELAY_IPV6_DATA; + break; + default: /* * Not sure what to do here if anything. - * Hardware checksum code looked pretty IPv4 specific. + * Hardware checksum code looked pretty IPv4/IPv6 specific. 
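mbuf_alloccluster() above now sizes requests against the mbuf constant MBIGCLBYTES rather than the machine page constant NBPG, and rounds every request up to one of three fixed tiers. A hedged caller sketch, assuming the KPI exactly as shown in this hunk:

    size_t size = 3000;     /* larger than MCLBYTES, so rounded up a tier */
    caddr_t addr = NULL;
    errno_t err;

    err = mbuf_alloccluster(MBUF_WAITOK, &size, &addr);
    if (err == 0) {
        /* size now reports the tier actually granted: MCLBYTES,
         * MBIGCLBYTES, or (when jumbo clusters exist) M16KCLBYTES */
        /* ... fill the cluster ... */
        mbuf_freecluster(addr, size);   /* must pass back the granted size */
    }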
*/ - if ((mbuf->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_IP)) != 0) - panic("mbuf_outbound_finalize - CSUM flags set for non-IPv4 packet (%u)!\n", protocol_family); + if ((mbuf->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_IP | CSUM_DELAY_IPV6_DATA)) != 0) + panic("mbuf_outbound_finalize - CSUM flags set for non-IPv4 or IPv6 packet (%u)!\n", protocol_family); } } @@ -619,7 +645,8 @@ mbuf_clear_vlan_tag( } static const mbuf_csum_request_flags_t mbuf_valid_csum_request_flags = - MBUF_CSUM_REQ_IP | MBUF_CSUM_REQ_TCP | MBUF_CSUM_REQ_UDP | MBUF_CSUM_REQ_SUM16; + MBUF_CSUM_REQ_IP | MBUF_CSUM_REQ_TCP | MBUF_CSUM_REQ_UDP | + MBUF_CSUM_REQ_SUM16 | MBUF_CSUM_REQ_TCPIPV6 | MBUF_CSUM_REQ_UDPIPV6; errno_t mbuf_set_csum_requested( @@ -827,7 +854,7 @@ mbuf_tag_allocate( } /* Allocate an mtag */ - tag = m_tag_alloc(id, type, length, how); + tag = m_tag_create(id, type, length, how, mbuf); if (tag == NULL) { return how == M_WAITOK ? ENOMEM : EWOULDBLOCK; } @@ -1072,34 +1099,16 @@ mbuf_get_mhlen(void) return (_MHLEN); } -mbuf_priority_t -mbuf_get_priority(struct mbuf *m) +u_int32_t +mbuf_get_minclsize(void) { -#if !PKT_PRIORITY -#pragma unused(m) - return (MBUF_PRIORITY_NORMAL); -#else /* PKT_PRIORITY */ - mbuf_priority_t prio = MBUF_PRIORITY_NORMAL; - - if (m == NULL || !(m->m_flags & M_PKTHDR)) - return (prio); - - /* Defaults to normal; ignore anything else but background */ - if (m->m_pkthdr.prio == MBUF_PRIORITY_BACKGROUND) - prio = MBUF_PRIORITY_BACKGROUND; - - return (prio); -#endif /* PKT_PRIORITY */ + return (MHLEN + MLEN); } mbuf_traffic_class_t mbuf_get_traffic_class(mbuf_t m) { -#if !PKT_PRIORITY -#pragma unused(m) - return (MBUF_TC_BE); -#else /* PKT_PRIORITY */ - mbuf_priority_t prio = MBUF_TC_BE; + mbuf_traffic_class_t prio = MBUF_TC_BE; if (m == NULL || !(m->m_flags & M_PKTHDR)) return (prio); @@ -1108,17 +1117,11 @@ mbuf_get_traffic_class(mbuf_t m) prio = m->m_pkthdr.prio; return (prio); -#endif /* PKT_PRIORITY */ } errno_t mbuf_set_traffic_class(mbuf_t m, mbuf_traffic_class_t tc) { -#if !PKT_PRIORITY -#pragma unused(m) -#pragma unused(tc) - return 0; -#else /* PKT_PRIORITY */ errno_t error = 0; if (m == NULL || !(m->m_flags & M_PKTHDR)) @@ -1136,5 +1139,4 @@ mbuf_set_traffic_class(mbuf_t m, mbuf_traffic_class_t tc) break; } return error; -#endif /* PKT_PRIORITY */ } diff --git a/bsd/kern/kpi_socket.c b/bsd/kern/kpi_socket.c index ee1fd5af0..70507beff 100644 --- a/bsd/kern/kpi_socket.c +++ b/bsd/kern/kpi_socket.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2010 Apple Inc. All rights reserved. + * Copyright (c) 2003-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -172,18 +172,13 @@ sock_accept( if (sa) FREE(sa, M_SONAME); /* - * If the socket has been marked as inactive by soacceptfilter(), - * disallow further operations on it. We explicitly call shutdown - * on both data directions to ensure that SS_CANT{RCV,SEND}MORE - * states are set for the socket. This would also flush out data - * hanging off the receive list of this socket. + * If the socket has been marked as inactive by sosetdefunct(), + * disallow further operations on it. 
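Also in this hunk, mbuf_get_traffic_class() and mbuf_set_traffic_class() shed their PKT_PRIORITY conditionals and are now unconditionally live. A short caller sketch; mbuf_gethdr() is assumed from the established mbuf KPI, and the rest appears in this diff:

    mbuf_t m = NULL;
    errno_t err;

    err = mbuf_gethdr(MBUF_WAITOK, MBUF_TYPE_DATA, &m);
    if (err == 0) {
        /* only mbufs with a pkthdr carry a traffic class */
        err = mbuf_set_traffic_class(m, MBUF_TC_BK);
        if (err == 0 && mbuf_get_traffic_class(m) == MBUF_TC_BK) {
            /* ... hand the packet down the stack ... */
        }
        mbuf_freem(m);
    }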
*/ if (new_so->so_flags & SOF_DEFUNCT) { - (void) soshutdownlock(new_so, SHUT_RD); - (void) soshutdownlock(new_so, SHUT_WR); - (void) sodisconnectlocked(new_so); + (void) sodefunct(current_proc(), new_so, + SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL); } - *new_sock = new_so; if (dosocklock) socket_unlock(new_so, 1); @@ -195,9 +190,30 @@ sock_bind( socket_t sock, const struct sockaddr *to) { - if (sock == NULL || to == NULL) return EINVAL; + int error = 0; + struct sockaddr *sa = NULL; + struct sockaddr_storage ss; + boolean_t want_free = TRUE; + + if (sock == NULL || to == NULL) + return EINVAL; + + if (to->sa_len > sizeof(ss)) { + MALLOC(sa, struct sockaddr *, to->sa_len, M_SONAME, M_WAITOK); + if (sa == NULL) + return ENOBUFS; + } else { + sa = (struct sockaddr *)&ss; + want_free = FALSE; + } + memcpy(sa, to, to->sa_len); + + error = sobind(sock, sa); - return sobind(sock, (struct sockaddr*)(uintptr_t)to); + if (sa != NULL && want_free == TRUE) + FREE(sa, M_SONAME); + + return error; } errno_t @@ -208,23 +224,37 @@ sock_connect( { int error = 0; lck_mtx_t *mutex_held; + struct sockaddr *sa = NULL; + struct sockaddr_storage ss; + boolean_t want_free = TRUE; if (sock == NULL || to == NULL) return EINVAL; + + if (to->sa_len > sizeof(ss)) { + MALLOC(sa, struct sockaddr *, to->sa_len, M_SONAME, + (flags & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK); + if (sa == NULL) + return ENOBUFS; + } else { + sa = (struct sockaddr *)&ss; + want_free = FALSE; + } + memcpy(sa, to, to->sa_len); socket_lock(sock, 1); if ((sock->so_state & SS_ISCONNECTING) && ((sock->so_state & SS_NBIO) != 0 || (flags & MSG_DONTWAIT) != 0)) { - socket_unlock(sock, 1); - return EALREADY; + error = EALREADY; + goto out; } - error = soconnectlock(sock, (struct sockaddr*)(uintptr_t)to, 0); + error = soconnectlock(sock, sa, 0); if (!error) { if ((sock->so_state & SS_ISCONNECTING) && ((sock->so_state & SS_NBIO) != 0 || (flags & MSG_DONTWAIT) != 0)) { - socket_unlock(sock, 1); - return EINPROGRESS; + error = EINPROGRESS; + goto out; } if (sock->so_proto->pr_getlock != NULL) @@ -247,7 +277,12 @@ sock_connect( else { sock->so_state &= ~SS_ISCONNECTING; } +out: socket_unlock(sock, 1); + + if (sa != NULL && want_free == TRUE) + FREE(sa, M_SONAME); + return error; } @@ -476,6 +511,27 @@ sock_setsockopt( return sosetopt(sock, &sopt); /* will lock socket */ } +/* + * This follows the recommended mappings between DSCP code points and WMM access classes + */ +static u_int8_t so_tc_from_dscp(u_int8_t dscp); +static u_int8_t +so_tc_from_dscp(u_int8_t dscp) +{ + u_int8_t tc; + + if (dscp >= 0x30 && dscp <= 0x3f) + tc = SO_TC_VO; + else if (dscp >= 0x20 && dscp <= 0x2f) + tc = SO_TC_VI; + else if (dscp >= 0x08 && dscp <= 0x17) + tc = SO_TC_BK; + else + tc = SO_TC_BE; + + return tc; +} + errno_t sock_settclassopt( socket_t sock, @@ -484,13 +540,9 @@ sock_settclassopt( errno_t error = 0; struct sockopt sopt; + int sotc; - if (sock == NULL || optval == NULL || optlen == 0) return EINVAL; - - sopt.sopt_dir = SOPT_SET; - sopt.sopt_val = CAST_USER_ADDR_T(optval); - sopt.sopt_valsize = optlen; - sopt.sopt_p = kernproc; + if (sock == NULL || optval == NULL || optlen != sizeof(int)) return EINVAL; socket_lock(sock, 1); if (!(sock->so_state & SS_ISCONNECTED)) { @@ -507,6 +559,28 @@ sock_settclassopt( goto out; } + /* + * Set the socket traffic class based on the passed DSCP code point + * regardless of the scope of the destination + */ + sotc = so_tc_from_dscp((*(const int *)optval) >> 2); + + sopt.sopt_dir = SOPT_SET; + sopt.sopt_val = CAST_USER_ADDR_T(&sotc); + 
sopt.sopt_valsize = sizeof(sotc); + sopt.sopt_p = kernproc; + sopt.sopt_level = SOL_SOCKET; + sopt.sopt_name = SO_TRAFFIC_CLASS; + + socket_unlock(sock, 0); + error = sosetopt(sock, &sopt); + socket_lock(sock, 0); + + if (error != 0) { + printf("sock_settclassopt: sosetopt SO_TRAFFIC_CLASS failed %d\n", error); + goto out; + } + /* Check if the destination address is LAN or link local address. * We do not want to set traffic class bits if the destination * is not local @@ -515,6 +589,11 @@ sock_settclassopt( goto out; } + sopt.sopt_dir = SOPT_SET; + sopt.sopt_val = CAST_USER_ADDR_T(optval); + sopt.sopt_valsize = optlen; + sopt.sopt_p = kernproc; + switch (sock->so_proto->pr_domain->dom_family) { case AF_INET: sopt.sopt_level = IPPROTO_IP; @@ -989,59 +1068,114 @@ sock_getlistener(socket_t sock) return (sock->so_head); } +static inline void +sock_set_tcp_stream_priority(socket_t sock) +{ + if ((sock->so_proto->pr_domain->dom_family == AF_INET || + sock->so_proto->pr_domain->dom_family == AF_INET6) && + sock->so_proto->pr_type == SOCK_STREAM) { + + set_tcp_stream_priority(sock); + + } +} + /* * Caller must have ensured socket is valid and won't be going away. */ void -socket_set_traffic_mgt_flags(socket_t sock, u_int32_t flags) +socket_set_traffic_mgt_flags_locked(socket_t sock, u_int32_t flags) { (void) OSBitOrAtomic(flags, &sock->so_traffic_mgt_flags); + sock_set_tcp_stream_priority(sock); +} + +void +socket_set_traffic_mgt_flags(socket_t sock, u_int32_t flags) +{ + socket_lock(sock, 1); + socket_set_traffic_mgt_flags_locked(sock, flags); + socket_unlock(sock, 1); } /* * Caller must have ensured socket is valid and won't be going away. */ void -socket_clear_traffic_mgt_flags(socket_t sock, u_int32_t flags) +socket_clear_traffic_mgt_flags_locked(socket_t sock, u_int32_t flags) { (void) OSBitAndAtomic(~flags, &sock->so_traffic_mgt_flags); + sock_set_tcp_stream_priority(sock); } -__private_extern__ void -set_traffic_class(struct mbuf *m, struct socket *so, int mtc) +void +socket_clear_traffic_mgt_flags(socket_t sock, u_int32_t flags) { -#if !PKT_PRIORITY -#pragma unused(m) -#pragma unused(so) -#pragma unused(mtc) - return; -#else /* PKT_PRIORITY */ - if (!(m->m_flags & M_PKTHDR)) - return; + socket_lock(sock, 1); + socket_clear_traffic_mgt_flags_locked(sock, flags); + socket_unlock(sock, 1); +} + - if (soisbackground(so)) { - m->m_pkthdr.prio = MBUF_TC_BK; - } else if (mtc != MBUF_TC_NONE) { - if (mtc >= MBUF_TC_BE && mtc <= MBUF_TC_VO) - m->m_pkthdr.prio = mtc; +/* + * Caller must have ensured socket is valid and won't be going away. + */ +errno_t +socket_defunct(struct proc *p, socket_t so, int level) +{ + errno_t retval; + + if (level != SHUTDOWN_SOCKET_LEVEL_DISCONNECT_SVC && + level != SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL) + return (EINVAL); + + socket_lock(so, 1); + /* + * SHUTDOWN_SOCKET_LEVEL_DISCONNECT_SVC level is meant to tear down + * all of mDNSResponder IPC sockets, currently those of AF_UNIX; note + * that this is an implementation artifact of mDNSResponder. We do + * a quick test against the socket buffers for SB_UNIX, since that + * would have been set by unp_attach() at socket creation time. 
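Returning to sock_settclassopt() above: the DSCP bits of the caller's TOS value now pick the socket traffic class via so_tc_from_dscp() before the TOS itself is conditionally applied to local destinations. A hedged in-kernel caller sketch; per the SS_ISCONNECTED check in the function, the socket must already be connected:

    /* 0xE0 carries DSCP 0x38 (0xE0 >> 2), which so_tc_from_dscp()
     * maps into the SO_TC_VO (voice) band. */
    int tos = 0xE0;
    errno_t err;

    err = sock_settclassopt(so, &tos, sizeof(tos));
    if (err != 0)
        printf("sock_settclassopt failed: %d\n", err);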
+ */ + if (level == SHUTDOWN_SOCKET_LEVEL_DISCONNECT_SVC && + (so->so_rcv.sb_flags & so->so_snd.sb_flags & SB_UNIX) != SB_UNIX) { + socket_unlock(so, 1); + return (EOPNOTSUPP); + } + retval = sosetdefunct(p, so, level, TRUE); + if (retval == 0) + retval = sodefunct(p, so, level); + socket_unlock(so, 1); + return (retval); +} + +errno_t +sock_setupcall(socket_t sock, sock_upcall callback, void* context) +{ + if (sock == NULL) + return EINVAL; + + /* + * Note that we don't wait for any in progress upcall to complete. + */ + socket_lock(sock, 1); + + sock->so_upcall = (so_upcall) callback; + sock->so_upcallarg = context; + if (callback) { + sock->so_rcv.sb_flags |= SB_UPCALL; +#if CONFIG_SOWUPCALL + sock->so_snd.sb_flags |= SB_UPCALL; +#endif /* CONFIG_SOWUPCALL */ } else { - switch (so->so_traffic_class) { - case SO_TC_BE: - m->m_pkthdr.prio = MBUF_TC_BE; - break; - case SO_TC_BK: - m->m_pkthdr.prio = MBUF_TC_BK; - break; - case SO_TC_VI: - m->m_pkthdr.prio = MBUF_TC_VI; - break; - case SO_TC_VO: - m->m_pkthdr.prio = MBUF_TC_VO; - break; - default: - break; - } + sock->so_rcv.sb_flags &= ~SB_UPCALL; +#if CONFIG_SOWUPCALL + sock->so_snd.sb_flags &= ~SB_UPCALL; +#endif /* CONFIG_SOWUPCALL */ } - return; -#endif /* PKT_PRIORITY */ + + socket_unlock(sock, 1); + + return 0; } + diff --git a/bsd/kern/kpi_socketfilter.c b/bsd/kern/kpi_socketfilter.c index c8469ab40..67a944c2d 100644 --- a/bsd/kern/kpi_socketfilter.c +++ b/bsd/kern/kpi_socketfilter.c @@ -33,17 +33,56 @@ #include <sys/errno.h> #include <sys/malloc.h> #include <sys/protosw.h> +#include <sys/proc.h> #include <kern/locks.h> +#include <kern/thread.h> +#include <kern/debug.h> #include <net/kext_net.h> #include <libkern/libkern.h> +#include <libkern/OSAtomic.h> #include <string.h> +#define SFEF_ATTACHED 0x1 /* SFE is on socket list */ +#define SFEF_NODETACH 0x2 /* Detach should not be called */ +#define SFEF_NOSOCKET 0x4 /* Socket is gone */ + +struct socket_filter_entry { + struct socket_filter_entry *sfe_next_onsocket; + struct socket_filter_entry *sfe_next_onfilter; + struct socket_filter_entry *sfe_next_oncleanup; + + struct socket_filter *sfe_filter; + struct socket *sfe_socket; + void *sfe_cookie; + + uint32_t sfe_flags; + int32_t sfe_refcount; +}; + +struct socket_filter { + TAILQ_ENTRY(socket_filter) sf_protosw_next; + TAILQ_ENTRY(socket_filter) sf_global_next; + struct socket_filter_entry *sf_entry_head; + + struct protosw *sf_proto; + struct sflt_filter sf_filter; + u_int32_t sf_refcount; +}; + +TAILQ_HEAD(socket_filter_list, socket_filter); + static struct socket_filter_list sock_filter_head; -static lck_mtx_t *sock_filter_lock = 0; +static lck_rw_t *sock_filter_lock = NULL; +static lck_mtx_t *sock_filter_cleanup_lock = NULL; +static struct socket_filter_entry *sock_filter_cleanup_entries = NULL; +static thread_t sock_filter_cleanup_thread = NULL; -static void sflt_detach_private(struct socket_filter_entry *entry, int unregistering); +static void sflt_cleanup_thread(void *, wait_result_t); +static void sflt_detach_locked(struct socket_filter_entry *entry); + +#pragma mark -- Internal State Management -- __private_extern__ void sflt_init(void) @@ -54,70 +93,361 @@ sflt_init(void) TAILQ_INIT(&sock_filter_head); - /* Allocate a spin lock */ + /* Allocate a rw lock */ grp_attrib = lck_grp_attr_alloc_init(); lck_group = lck_grp_alloc_init("socket filter lock", grp_attrib); lck_grp_attr_free(grp_attrib); lck_attrib = lck_attr_alloc_init(); - sock_filter_lock = lck_mtx_alloc_init(lck_group, lck_attrib); + sock_filter_lock = 
lck_rw_alloc_init(lck_group, lck_attrib); + sock_filter_cleanup_lock = lck_mtx_alloc_init(lck_group, lck_attrib); lck_grp_free(lck_group); lck_attr_free(lck_attrib); } -__private_extern__ void -sflt_initsock( - struct socket *so) +static void +sflt_retain_locked( + struct socket_filter *filter) { - struct protosw *proto = so->so_proto; - struct socket_filter *filter; + filter->sf_refcount++; +} + +static void +sflt_release_locked( + struct socket_filter *filter) +{ + filter->sf_refcount--; + if (filter->sf_refcount == 0) + { + // Call the unregistered function + if (filter->sf_filter.sf_unregistered) { + lck_rw_unlock_exclusive(sock_filter_lock); + filter->sf_filter.sf_unregistered(filter->sf_filter.sf_handle); + lck_rw_lock_exclusive(sock_filter_lock); + } + + // Free the entry + FREE(filter, M_IFADDR); + } +} + +static void +sflt_entry_retain( + struct socket_filter_entry *entry) +{ + if (OSIncrementAtomic(&entry->sfe_refcount) <= 0) + panic("sflt_entry_retain - sfe_refcount <= 0\n"); +} + +static void +sflt_entry_release( + struct socket_filter_entry *entry) +{ + SInt32 old = OSDecrementAtomic(&entry->sfe_refcount); + if (old == 1) { + // That was the last reference + + // Take the cleanup lock + lck_mtx_lock(sock_filter_cleanup_lock); + + // Put this item on the cleanup list + entry->sfe_next_oncleanup = sock_filter_cleanup_entries; + sock_filter_cleanup_entries = entry; + + // If the item is the first item in the list + if (entry->sfe_next_oncleanup == NULL) { + if (sock_filter_cleanup_thread == NULL) { + // Create a thread + kernel_thread_start(sflt_cleanup_thread, NULL, &sock_filter_cleanup_thread); + } else { + // Wakeup the thread + wakeup(&sock_filter_cleanup_entries); + } + } + + // Drop the cleanup lock + lck_mtx_unlock(sock_filter_cleanup_lock); + } + else if (old <= 0) + { + panic("sflt_entry_release - sfe_refcount (%d) <= 0\n", (int)old); + } +} + +static void +sflt_cleanup_thread( + __unused void * blah, + __unused wait_result_t blah2) +{ + while (1) { + lck_mtx_lock(sock_filter_cleanup_lock); + while (sock_filter_cleanup_entries == NULL) { + // Sleep until we've got something better to do + msleep(&sock_filter_cleanup_entries, sock_filter_cleanup_lock, PWAIT, "sflt_cleanup", NULL); + } + + // Pull the current list of dead items + struct socket_filter_entry *dead = sock_filter_cleanup_entries; + sock_filter_cleanup_entries = NULL; + + // Drop the lock + lck_mtx_unlock(sock_filter_cleanup_lock); + + // Take the socket filter lock + lck_rw_lock_exclusive(sock_filter_lock); + + // Cleanup every dead item + struct socket_filter_entry *entry; + for (entry = dead; entry; entry = dead) { + struct socket_filter_entry **nextpp; + + dead = entry->sfe_next_oncleanup; + + // Call the detach function if necessary - drop the lock + if ((entry->sfe_flags & SFEF_NODETACH) == 0 && + entry->sfe_filter->sf_filter.sf_detach) { + entry->sfe_flags |= SFEF_NODETACH; + lck_rw_unlock_exclusive(sock_filter_lock); + + // Warning - passing a potentially dead socket may be bad + entry->sfe_filter->sf_filter. 
+ sf_detach(entry->sfe_cookie, entry->sfe_socket); + + lck_rw_lock_exclusive(sock_filter_lock); + } + + // Pull entry off the socket list -- if the socket still exists + if ((entry->sfe_flags & SFEF_NOSOCKET) == 0) { + for (nextpp = &entry->sfe_socket->so_filt; *nextpp; + nextpp = &(*nextpp)->sfe_next_onsocket) { + if (*nextpp == entry) { + *nextpp = entry->sfe_next_onsocket; + break; + } + } + } + + // Pull entry off the filter list + for (nextpp = &entry->sfe_filter->sf_entry_head; *nextpp; + nextpp = &(*nextpp)->sfe_next_onfilter) { + if (*nextpp == entry) { + *nextpp = entry->sfe_next_onfilter; + break; + } + } + + // Release the filter -- may drop lock, but that's okay + sflt_release_locked(entry->sfe_filter); + entry->sfe_socket = NULL; + entry->sfe_filter = NULL; + FREE(entry, M_IFADDR); + } + + // Drop the socket filter lock + lck_rw_unlock_exclusive(sock_filter_lock); + } + // Not reached +} + +static int +sflt_attach_locked( + struct socket *so, + struct socket_filter *filter, + int socklocked) +{ + int error = 0; + struct socket_filter_entry *entry = NULL; - if (TAILQ_FIRST(&proto->pr_filter_head) != NULL) { - lck_mtx_lock(sock_filter_lock); - TAILQ_FOREACH(filter, &proto->pr_filter_head, sf_protosw_next) { - sflt_attach_private(so, filter, 0, 0); + if (filter == NULL) + error = ENOENT; + + if (error == 0) { + /* allocate the socket filter entry */ + MALLOC(entry, struct socket_filter_entry *, sizeof(*entry), M_IFADDR, M_WAITOK); + if (entry == NULL) { + error = ENOMEM; + } + } + + if (error == 0) { + /* Initialize the socket filter entry */ + entry->sfe_cookie = NULL; + entry->sfe_flags = SFEF_ATTACHED; + entry->sfe_refcount = 1; // corresponds to SFEF_ATTACHED flag set + + /* Put the entry in the filter list */ + sflt_retain_locked(filter); + entry->sfe_filter = filter; + entry->sfe_next_onfilter = filter->sf_entry_head; + filter->sf_entry_head = entry; + + /* Put the entry on the socket filter list */ + entry->sfe_socket = so; + entry->sfe_next_onsocket = so->so_filt; + so->so_filt = entry; + + if (entry->sfe_filter->sf_filter.sf_attach) { + // Retain the entry while we call attach + sflt_entry_retain(entry); + + // Release the filter lock -- callers must be aware we will do this + lck_rw_unlock_exclusive(sock_filter_lock); + + // Unlock the socket + if (socklocked) + socket_unlock(so, 0); + + // It's finally safe to call the filter function + error = entry->sfe_filter->sf_filter.sf_attach(&entry->sfe_cookie, so); + + // Lock the socket again + if (socklocked) + socket_lock(so, 0); + + // Lock the filters again + lck_rw_lock_exclusive(sock_filter_lock); + + // If the attach function returns an error, this filter must be detached + if (error) { + entry->sfe_flags |= SFEF_NODETACH; // don't call sf_detach + sflt_detach_locked(entry); + } + + // Release the retain we held through the attach call + sflt_entry_release(entry); } - lck_mtx_unlock(sock_filter_lock); } + + return error; } -__private_extern__ void -sflt_termsock( - struct socket *so) +errno_t +sflt_attach_internal( + socket_t socket, + sflt_handle handle) { - struct socket_filter_entry *filter; - struct socket_filter_entry *filter_next; + if (socket == NULL || handle == 0) + return EINVAL; + + int result = EINVAL; + + lck_rw_lock_exclusive(sock_filter_lock); + + struct socket_filter *filter = NULL; + TAILQ_FOREACH(filter, &sock_filter_head, sf_global_next) { + if (filter->sf_filter.sf_handle == handle) break; + } + + if (filter) { + result = sflt_attach_locked(socket, filter, 1); + } - for (filter = so->so_filt; 
filter; filter = filter_next) { - filter_next = filter->sfe_next_onsocket; - sflt_detach_private(filter, 0); + lck_rw_unlock_exclusive(sock_filter_lock); + + return result; +} + +static void +sflt_detach_locked( + struct socket_filter_entry *entry) +{ + if ((entry->sfe_flags & SFEF_ATTACHED) != 0) { + entry->sfe_flags &= ~SFEF_ATTACHED; + sflt_entry_release(entry); } - so->so_filt = NULL; } +#pragma mark -- Socket Layer Hooks -- + __private_extern__ void -sflt_use( +sflt_initsock( struct socket *so) { - so->so_filteruse++; + struct protosw *proto = so->so_proto; + + lck_rw_lock_shared(sock_filter_lock); + if (TAILQ_FIRST(&proto->pr_filter_head) != NULL) { + // Promote lock to exclusive + if (!lck_rw_lock_shared_to_exclusive(sock_filter_lock)) + lck_rw_lock_exclusive(sock_filter_lock); + + // Warning: A filter unregistering will be pulled out of the list. + // This could happen while we drop the lock in sflt_attach_locked + // or sflt_release_locked. For this reason we retain a reference + // on the filter (or next_filter) while calling this function. + // + // This protects us from a panic, but it could result in a + // socket being created without all of the global filters if + // we're attaching a filter as it is removed, if that's possible. + struct socket_filter *filter = TAILQ_FIRST(&proto->pr_filter_head); + sflt_retain_locked(filter); + + while (filter) + { + struct socket_filter *filter_next; + + // Warning: sflt_attach_locked will drop the lock + sflt_attach_locked(so, filter, 0); + + filter_next = TAILQ_NEXT(filter, sf_protosw_next); + if (filter_next) + sflt_retain_locked(filter_next); + + // Warning: sflt_release_locked may remove the filter from the queue + sflt_release_locked(filter); + filter = filter_next; + } + } + lck_rw_done(sock_filter_lock); } +/* + * sflt_termsock + * + * Detaches all filters from the socket.
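sflt_initsock() above has to survive filters unregistering whenever the lock is dropped, which is why it pins the successor before releasing the node it just visited. The traversal pattern in isolation, with hypothetical node and helper names standing in for the filter-specific ones:

    /* Walk a refcounted list when visiting a node may drop the list lock. */
    struct node *cur = TAILQ_FIRST(&list_head);
    if (cur != NULL)
        node_retain_locked(cur);
    while (cur != NULL) {
        struct node *next;

        visit_locked(cur);              /* may drop and retake the lock */
        next = TAILQ_NEXT(cur, link);
        if (next != NULL)
            node_retain_locked(next);   /* pin next before letting go of cur */
        node_release_locked(cur);       /* dropping the last ref may free cur */
        cur = next;
    }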
+ */ + __private_extern__ void -sflt_unuse( +sflt_termsock( struct socket *so) { - so->so_filteruse--; - if (so->so_filteruse == 0) { - struct socket_filter_entry *filter; - struct socket_filter_entry *next_filter; - // search for detaching filters - for (filter = so->so_filt; filter; filter = next_filter) { - next_filter = filter->sfe_next_onsocket; + lck_rw_lock_exclusive(sock_filter_lock); + + struct socket_filter_entry *entry; + + while ((entry = so->so_filt) != NULL) { + // Pull filter off the socket + so->so_filt = entry->sfe_next_onsocket; + entry->sfe_flags |= SFEF_NOSOCKET; + + // Call detach + sflt_detach_locked(entry); + + // On sflt_termsock, we can't return until the detach function has been called + // Call the detach function - this is gross because the socket filter + // entry could be freed when we drop the lock, so we make copies on + // the stack and retain everything we need before dropping the lock + if ((entry->sfe_flags & SFEF_NODETACH) == 0 && + entry->sfe_filter->sf_filter.sf_detach) { + void *sfe_cookie = entry->sfe_cookie; + struct socket_filter *sfe_filter = entry->sfe_filter; - if (filter->sfe_flags & SFEF_DETACHUSEZERO) { - sflt_detach_private(filter, 0); - } + // Retain the socket filter + sflt_retain_locked(sfe_filter); + + // Mark that we've called the detach function + entry->sfe_flags |= SFEF_NODETACH; + + // Drop the lock around the call to the detach function + lck_rw_unlock_exclusive(sock_filter_lock); + sfe_filter->sf_filter.sf_detach(sfe_cookie, so); + lck_rw_lock_exclusive(sock_filter_lock); + + // Release the filter + sflt_release_locked(sfe_filter); } } + + lck_rw_unlock_exclusive(sock_filter_lock); } __private_extern__ void @@ -126,280 +456,595 @@ sflt_notify( sflt_event_t event, void *param) { - struct socket_filter_entry *filter; - int filtered = 0; + if (so->so_filt == NULL) return; + + struct socket_filter_entry *entry; + int unlocked = 0; - for (filter = so->so_filt; filter; - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_notify) { - if (filtered == 0) { - filtered = 1; - sflt_use(so); + lck_rw_lock_shared(sock_filter_lock); + for (entry = so->so_filt; entry; entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) + && entry->sfe_filter->sf_filter.sf_notify) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); + + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { + unlocked = 1; socket_unlock(so, 0); } - filter->sfe_filter->sf_filter.sf_notify( - filter->sfe_cookie, so, event, param); + + // Finally call the filter + entry->sfe_filter->sf_filter. 
+ sf_notify(entry->sfe_cookie, so, event, param); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); } } + lck_rw_unlock_shared(sock_filter_lock); - if (filtered != 0) { + if (unlocked != 0) { socket_lock(so, 0); - sflt_unuse(so); } } __private_extern__ int -sflt_data_in( - struct socket *so, - const struct sockaddr *from, - mbuf_t *data, - mbuf_t *control, - sflt_data_flag_t flags, - int *filtered) +sflt_ioctl( + struct socket *so, + u_long cmd, + caddr_t data) { - struct socket_filter_entry *filter; + if (so->so_filt == NULL) return 0; + + struct socket_filter_entry *entry; + int unlocked = 0; int error = 0; - int filtered_storage; - - if (filtered == NULL) - filtered = &filtered_storage; - *filtered = 0; - - for (filter = so->so_filt; filter && (error == 0); - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_data_in) { - if (*filtered == 0) { - *filtered = 1; - sflt_use(so); + + lck_rw_lock_shared(sock_filter_lock); + for (entry = so->so_filt; entry && error == 0; + entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) + && entry->sfe_filter->sf_filter.sf_ioctl) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); + + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { socket_unlock(so, 0); + unlocked = 1; } - error = filter->sfe_filter->sf_filter.sf_data_in( - filter->sfe_cookie, so, from, data, control, flags); + + // Call the filter + error = entry->sfe_filter->sf_filter. + sf_ioctl(entry->sfe_cookie, so, cmd, data); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); } } - - if (*filtered != 0) { + lck_rw_unlock_shared(sock_filter_lock); + + if (unlocked) { socket_lock(so, 0); - sflt_unuse(so); } return error; } -/* sflt_attach_private - * - * Assumptions: If filter is not NULL, socket_filter_lock is held. - */ - __private_extern__ int -sflt_attach_private( - struct socket *so, - struct socket_filter *filter, - sflt_handle handle, - int sock_locked) +sflt_bind( + struct socket *so, + const struct sockaddr *nam) { - struct socket_filter_entry *entry = NULL; - int didlock = 0; - int error = 0; + if (so->so_filt == NULL) return 0; - if (filter == NULL) { - /* Find the filter by the handle */ - lck_mtx_lock(sock_filter_lock); - didlock = 1; - - TAILQ_FOREACH(filter, &sock_filter_head, sf_global_next) { - if (filter->sf_filter.sf_handle == handle) - break; + struct socket_filter_entry *entry; + int unlocked = 0; + int error = 0; + + lck_rw_lock_shared(sock_filter_lock); + for (entry = so->so_filt; entry && error == 0; + entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) + && entry->sfe_filter->sf_filter.sf_bind) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); + + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { + socket_unlock(so, 0); + unlocked = 1; + } + + // Call the filter + error = entry->sfe_filter->sf_filter. 
+ sf_bind(entry->sfe_cookie, so, nam); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); } } + lck_rw_unlock_shared(sock_filter_lock); + + if (unlocked) { + socket_lock(so, 0); + } - if (filter == NULL) - error = ENOENT; + return error; +} + +__private_extern__ int +sflt_listen( + struct socket *so) +{ + if (so->so_filt == NULL) return 0; - if (error == 0) { - /* allocate the socket filter entry */ - MALLOC(entry, struct socket_filter_entry *, sizeof(*entry), M_IFADDR, M_WAITOK); - if (entry == NULL) { - error = ENOMEM; + struct socket_filter_entry *entry; + int unlocked = 0; + int error = 0; + + lck_rw_lock_shared(sock_filter_lock); + for (entry = so->so_filt; entry && error == 0; + entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) + && entry->sfe_filter->sf_filter.sf_listen) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); + + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { + socket_unlock(so, 0); + unlocked = 1; + } + + // Call the filter + error = entry->sfe_filter->sf_filter. + sf_listen(entry->sfe_cookie, so); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); } } + lck_rw_unlock_shared(sock_filter_lock); + + if (unlocked) { + socket_lock(so, 0); + } - if (error == 0) { - /* Initialize the socket filter entry and call the attach function */ - entry->sfe_filter = filter; - entry->sfe_socket = so; - entry->sfe_cookie = NULL; - entry->sfe_flags = 0; - if (entry->sfe_filter->sf_filter.sf_attach) { - filter->sf_usecount++; - - if (sock_locked) - socket_unlock(so, 0); - error = entry->sfe_filter->sf_filter.sf_attach(&entry->sfe_cookie, so); - if (sock_locked) - socket_lock(so, 0); - - filter->sf_usecount--; + return error; +} + +__private_extern__ int +sflt_accept( + struct socket *head, + struct socket *so, + const struct sockaddr *local, + const struct sockaddr *remote) +{ + if (so->so_filt == NULL) return 0; + + struct socket_filter_entry *entry; + int unlocked = 0; + int error = 0; + + lck_rw_lock_shared(sock_filter_lock); + for (entry = so->so_filt; entry && error == 0; + entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) + && entry->sfe_filter->sf_filter.sf_accept) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); - /* If the attach function returns an error, this filter is not attached */ - if (error) { - FREE(entry, M_IFADDR); - entry = NULL; + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { + socket_unlock(so, 0); + unlocked = 1; } + + // Call the filter + error = entry->sfe_filter->sf_filter. 
+ sf_accept(entry->sfe_cookie, head, so, local, remote); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); } } - - if (error == 0) { - /* Put the entry in the socket list */ - entry->sfe_next_onsocket = so->so_filt; - so->so_filt = entry; - - /* Put the entry in the filter list */ - entry->sfe_next_onfilter = filter->sf_entry_head; - filter->sf_entry_head = entry; - - /* Incremenet the parent filter's usecount */ - filter->sf_usecount++; + lck_rw_unlock_shared(sock_filter_lock); + + if (unlocked) { + socket_lock(so, 0); } - if (didlock) { - lck_mtx_unlock(sock_filter_lock); + return error; +} + +__private_extern__ int +sflt_getsockname( + struct socket *so, + struct sockaddr **local) +{ + if (so->so_filt == NULL) return 0; + + struct socket_filter_entry *entry; + int unlocked = 0; + int error = 0; + + lck_rw_lock_shared(sock_filter_lock); + for (entry = so->so_filt; entry && error == 0; + entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) + && entry->sfe_filter->sf_filter.sf_getsockname) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); + + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { + socket_unlock(so, 0); + unlocked = 1; + } + + // Call the filter + error = entry->sfe_filter->sf_filter. + sf_getsockname(entry->sfe_cookie, so, local); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); + } + } + lck_rw_unlock_shared(sock_filter_lock); + + if (unlocked) { + socket_lock(so, 0); } return error; } +__private_extern__ int +sflt_getpeername( + struct socket *so, + struct sockaddr **remote) +{ + if (so->so_filt == NULL) return 0; + + struct socket_filter_entry *entry; + int unlocked = 0; + int error = 0; + + lck_rw_lock_shared(sock_filter_lock); + for (entry = so->so_filt; entry && error == 0; + entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) + && entry->sfe_filter->sf_filter.sf_getpeername) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); + + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { + socket_unlock(so, 0); + unlocked = 1; + } + + // Call the filter + error = entry->sfe_filter->sf_filter. + sf_getpeername(entry->sfe_cookie, so, remote); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); + } + } + lck_rw_unlock_shared(sock_filter_lock); -/* sflt_detach_private - * - * Assumptions: if you pass 0 in for the second parameter, you are holding the - * socket lock for the socket the entry is attached to. If you pass 1 in for - * the second parameter, it is assumed that the entry is not on the filter's - * list and the socket lock is not held. 
- */ + if (unlocked) { + socket_lock(so, 0); + } + + return error; +} -static void -sflt_detach_private( - struct socket_filter_entry *entry, - int unregistering) +__private_extern__ int +sflt_connectin( + struct socket *so, + const struct sockaddr *remote) { - struct socket_filter_entry **next_ptr; - int detached = 0; - int found = 0; + if (so->so_filt == NULL) return 0; - if (unregistering) { - socket_lock(entry->sfe_socket, 0); - } + struct socket_filter_entry *entry; + int unlocked = 0; + int error = 0; - /* - * Attempt to find the entry on the filter's list and - * remove it. This prevents a filter detaching at the - * same time from attempting to remove the same entry. - */ - lck_mtx_lock(sock_filter_lock); - if (!unregistering) { - if ((entry->sfe_flags & SFEF_UNREGISTERING) != 0) { - /* - * Another thread is unregistering the filter, we - * need to avoid detaching the filter here so the - * socket won't go away. Bump up the socket's - * usecount so that it won't be freed until after - * the filter unregistration has been completed; - * at this point the caller has already held the - * socket's lock, so we can directly modify the - * usecount. - */ - if (!(entry->sfe_flags & SFEF_DETACHXREF)) { - entry->sfe_socket->so_usecount++; - entry->sfe_flags |= SFEF_DETACHXREF; + lck_rw_lock_shared(sock_filter_lock); + for (entry = so->so_filt; entry && error == 0; + entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) + && entry->sfe_filter->sf_filter.sf_connect_in) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); + + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { + socket_unlock(so, 0); + unlocked = 1; } - lck_mtx_unlock(sock_filter_lock); - return; + + // Call the filter + error = entry->sfe_filter->sf_filter. + sf_connect_in(entry->sfe_cookie, so, remote); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); } - for (next_ptr = &entry->sfe_filter->sf_entry_head; *next_ptr; - next_ptr = &((*next_ptr)->sfe_next_onfilter)) { - if (*next_ptr == entry) { - found = 1; - *next_ptr = entry->sfe_next_onfilter; - break; + } + lck_rw_unlock_shared(sock_filter_lock); + + if (unlocked) { + socket_lock(so, 0); + } + + return error; +} + +__private_extern__ int +sflt_connectout( + struct socket *so, + const struct sockaddr *nam) +{ + if (so->so_filt == NULL) return 0; + + struct socket_filter_entry *entry; + int unlocked = 0; + int error = 0; + + lck_rw_lock_shared(sock_filter_lock); + for (entry = so->so_filt; entry && error == 0; + entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) + && entry->sfe_filter->sf_filter.sf_connect_out) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); + + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { + socket_unlock(so, 0); + unlocked = 1; } - } - - if (!found && (entry->sfe_flags & SFEF_DETACHUSEZERO) == 0) { - lck_mtx_unlock(sock_filter_lock); - return; - } - } else { - /* - * Clear the removing flag. We will perform the detach here or - * request a delayed detach. Since we do an extra ref release - * below, bump up the usecount if we haven't done so. 
- */ - entry->sfe_flags &= ~SFEF_UNREGISTERING; - if (!(entry->sfe_flags & SFEF_DETACHXREF)) { - entry->sfe_socket->so_usecount++; - entry->sfe_flags |= SFEF_DETACHXREF; + + // Call the filter + error = entry->sfe_filter->sf_filter. + sf_connect_out(entry->sfe_cookie, so, nam); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); } } + lck_rw_unlock_shared(sock_filter_lock); - if (entry->sfe_socket->so_filteruse != 0) { - entry->sfe_flags |= SFEF_DETACHUSEZERO; - lck_mtx_unlock(sock_filter_lock); + if (unlocked) { + socket_lock(so, 0); + } - if (unregistering) { -#if DEBUG - printf("sflt_detach_private unregistering SFEF_DETACHUSEZERO " - "so%p so_filteruse %u so_usecount %d\n", - entry->sfe_socket, entry->sfe_socket->so_filteruse, - entry->sfe_socket->so_usecount); -#endif - socket_unlock(entry->sfe_socket, 0); + return error; +} + +__private_extern__ int +sflt_setsockopt( + struct socket *so, + struct sockopt *sopt) +{ + if (so->so_filt == NULL) return 0; + + struct socket_filter_entry *entry; + int unlocked = 0; + int error = 0; + + lck_rw_lock_shared(sock_filter_lock); + for (entry = so->so_filt; entry && error == 0; + entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) + && entry->sfe_filter->sf_filter.sf_setoption) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); + + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { + socket_unlock(so, 0); + unlocked = 1; + } + + // Call the filter + error = entry->sfe_filter->sf_filter. + sf_setoption(entry->sfe_cookie, so, sopt); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); } - return; - } else { - /* - * Check if we are removing the last attached filter and - * the parent filter is being unregistered. - */ - entry->sfe_filter->sf_usecount--; - if ((entry->sfe_filter->sf_usecount == 0) && - (entry->sfe_filter->sf_flags & SFF_DETACHING) != 0) - detached = 1; - } - lck_mtx_unlock(sock_filter_lock); - - /* Remove from the socket list */ - for (next_ptr = &entry->sfe_socket->so_filt; *next_ptr; - next_ptr = &((*next_ptr)->sfe_next_onsocket)) { - if (*next_ptr == entry) { - *next_ptr = entry->sfe_next_onsocket; - break; + } + lck_rw_unlock_shared(sock_filter_lock); + + if (unlocked) { + socket_lock(so, 0); + } + + return error; +} + +__private_extern__ int +sflt_getsockopt( + struct socket *so, + struct sockopt *sopt) +{ + if (so->so_filt == NULL) return 0; + + struct socket_filter_entry *entry; + int unlocked = 0; + int error = 0; + + lck_rw_lock_shared(sock_filter_lock); + for (entry = so->so_filt; entry && error == 0; + entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) + && entry->sfe_filter->sf_filter.sf_getoption) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); + + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { + socket_unlock(so, 0); + unlocked = 1; + } + + // Call the filter + error = entry->sfe_filter->sf_filter. 
+ sf_getoption(entry->sfe_cookie, so, sopt); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); } } + lck_rw_unlock_shared(sock_filter_lock); + + if (unlocked) { + socket_lock(so, 0); + } + + return error; +} + +__private_extern__ int +sflt_data_out( + struct socket *so, + const struct sockaddr *to, + mbuf_t *data, + mbuf_t *control, + sflt_data_flag_t flags) +{ + if (so->so_filt == NULL) return 0; - if (entry->sfe_filter->sf_filter.sf_detach) - entry->sfe_filter->sf_filter.sf_detach(entry->sfe_cookie, entry->sfe_socket); + struct socket_filter_entry *entry; + int unlocked = 0; + int setsendthread = 0; + int error = 0; - if (detached && entry->sfe_filter->sf_filter.sf_unregistered) { - entry->sfe_filter->sf_filter.sf_unregistered(entry->sfe_filter->sf_filter.sf_handle); - FREE(entry->sfe_filter, M_IFADDR); + lck_rw_lock_shared(sock_filter_lock); + for (entry = so->so_filt; entry && error == 0; + entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) + && entry->sfe_filter->sf_filter.sf_data_out) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); + + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { + if (so->so_send_filt_thread == NULL) { + setsendthread = 1; + so->so_send_filt_thread = current_thread(); + } + socket_unlock(so, 0); + unlocked = 1; + } + + // Call the filter + error = entry->sfe_filter->sf_filter. + sf_data_out(entry->sfe_cookie, so, to, data, control, flags); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); + } } + lck_rw_unlock_shared(sock_filter_lock); - if (unregistering) - socket_unlock(entry->sfe_socket, 1); + if (unlocked) { + socket_lock(so, 0); + if (setsendthread) so->so_send_filt_thread = NULL; + } + + return error; +} - FREE(entry, M_IFADDR); +__private_extern__ int +sflt_data_in( + struct socket *so, + const struct sockaddr *from, + mbuf_t *data, + mbuf_t *control, + sflt_data_flag_t flags) +{ + if (so->so_filt == NULL) return 0; + + struct socket_filter_entry *entry; + int error = 0; + int unlocked = 0; + + lck_rw_lock_shared(sock_filter_lock); + + for (entry = so->so_filt; entry && (error == 0); + entry = entry->sfe_next_onsocket) { + if ((entry->sfe_flags & SFEF_ATTACHED) && + entry->sfe_filter->sf_filter.sf_data_in) { + // Retain the filter entry and release the socket filter lock + sflt_entry_retain(entry); + lck_rw_unlock_shared(sock_filter_lock); + + // If the socket isn't already unlocked, unlock it + if (unlocked == 0) { + unlocked = 1; + socket_unlock(so, 0); + } + + // Call the filter + error = entry->sfe_filter->sf_filter.sf_data_in( + entry->sfe_cookie, so, from, data, control, flags); + + // Take the socket filter lock again and release the entry + lck_rw_lock_shared(sock_filter_lock); + sflt_entry_release(entry); + } + } + lck_rw_unlock_shared(sock_filter_lock); + + if (unlocked) { + socket_lock(so, 0); + } + + return error; } +#pragma mark -- KPI -- + errno_t sflt_attach( socket_t socket, sflt_handle handle) { - if (socket == NULL || handle == 0) - return EINVAL; - - return sflt_attach_private(socket, NULL, handle, 0); + socket_lock(socket, 1); + errno_t result = sflt_attach_internal(socket, handle); + socket_unlock(socket, 1); + return result; } errno_t @@ -407,34 +1052,29 @@ sflt_detach( socket_t socket, sflt_handle handle) { - struct socket_filter_entry 
*filter; + struct socket_filter_entry *entry; errno_t result = 0; if (socket == NULL || handle == 0) return EINVAL; - socket_lock(socket, 1); - - for (filter = socket->so_filt; filter; - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_handle == handle) + lck_rw_lock_exclusive(sock_filter_lock); + for (entry = socket->so_filt; entry; + entry = entry->sfe_next_onsocket) { + if (entry->sfe_filter->sf_filter.sf_handle == handle && + (entry->sfe_flags & SFEF_ATTACHED) != 0) { break; + } } - if (filter != NULL) { - sflt_detach_private(filter, 0); + if (entry != NULL) { + sflt_detach_locked(entry); } - else { - socket->so_filt = NULL; - result = ENOENT; - } - - socket_unlock(socket, 1); + lck_rw_unlock_exclusive(sock_filter_lock); return result; } - errno_t sflt_register( const struct sflt_filter *filter, @@ -481,7 +1121,7 @@ sflt_register( } bcopy(filter, &sock_filt->sf_filter, len); - lck_mtx_lock(sock_filter_lock); + lck_rw_lock_exclusive(sock_filter_lock); /* Look for an existing entry */ TAILQ_FOREACH(match, &sock_filter_head, sf_global_next) { if (match->sf_filter.sf_handle == @@ -489,7 +1129,7 @@ sflt_register( break; } } - + /* Add the entry only if there was no existing entry */ if (match == NULL) { TAILQ_INSERT_TAIL(&sock_filter_head, sock_filt, sf_global_next); @@ -498,9 +1138,10 @@ sflt_register( sf_protosw_next); sock_filt->sf_proto = pr; } + sflt_retain_locked(sock_filt); } - lck_mtx_unlock(sock_filter_lock); - + lck_rw_unlock_exclusive(sock_filter_lock); + if (match != NULL) { FREE(sock_filt, M_IFADDR); return EEXIST; @@ -514,62 +1155,39 @@ sflt_unregister( sflt_handle handle) { struct socket_filter *filter; - struct socket_filter_entry *entry_head = NULL; - struct socket_filter_entry *next_entry = NULL; + lck_rw_lock_exclusive(sock_filter_lock); - /* Find the entry and remove it from the global and protosw lists */ - lck_mtx_lock(sock_filter_lock); + /* Find the entry by the handle */ TAILQ_FOREACH(filter, &sock_filter_head, sf_global_next) { if (filter->sf_filter.sf_handle == handle) break; } if (filter) { + // Remove it from the global list TAILQ_REMOVE(&sock_filter_head, filter, sf_global_next); + + // Remove it from the protosw list if ((filter->sf_filter.sf_flags & SFLT_GLOBAL) != 0) { TAILQ_REMOVE(&filter->sf_proto->pr_filter_head, filter, sf_protosw_next); } - entry_head = filter->sf_entry_head; - filter->sf_entry_head = NULL; - filter->sf_flags |= SFF_DETACHING; - - for (next_entry = entry_head; next_entry; - next_entry = next_entry->sfe_next_onfilter) { - /* - * Mark this as "unregistering"; upon dropping the - * lock, another thread may win the race and attempt - * to detach a socket from it (e.g. as part of close) - * before we get a chance to detach. Setting this - * flag practically tells the other thread to go away. - * If the other thread wins, this causes an extra - * reference hold on the socket so that it won't be - * deallocated until after we finish with the detach - * for it below. If we win the race, the extra - * reference hold is also taken to compensate for the - * extra reference release when detach is called - * with a "1" for its second parameter. 
- */ - next_entry->sfe_flags |= SFEF_UNREGISTERING; + + // Detach from any sockets + struct socket_filter_entry *entry = NULL; + + for (entry = filter->sf_entry_head; entry; entry = entry->sfe_next_onfilter) { + sflt_detach_locked(entry); } + + // Release the filter + sflt_release_locked(filter); } - lck_mtx_unlock(sock_filter_lock); + lck_rw_unlock_exclusive(sock_filter_lock); if (filter == NULL) return ENOENT; - /* We need to detach the filter from any sockets it's attached to */ - if (entry_head == 0) { - if (filter->sf_filter.sf_unregistered) - filter->sf_filter.sf_unregistered(filter->sf_filter.sf_handle); - } else { - while (entry_head) { - next_entry = entry_head->sfe_next_onfilter; - sflt_detach_private(entry_head, 1); - entry_head = next_entry; - } - } - return 0; } diff --git a/bsd/kern/mach_loader.c b/bsd/kern/mach_loader.c index a6a0ab766..ab26c40b9 100644 --- a/bsd/kern/mach_loader.c +++ b/bsd/kern/mach_loader.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -59,6 +59,7 @@ #include <machine/vmparam.h> #include <machine/exec.h> +#include <machine/pal_routines.h> #include <kern/kern_types.h> #include <kern/cpu_number.h> @@ -79,7 +80,6 @@ #include <vm/vnode_pager.h> #include <vm/vm_protos.h> - /* * XXX vm/pmap.h should not treat these prototypes as MACH_KERNEL_PRIVATE * when KERNEL is defined. @@ -100,10 +100,6 @@ extern kern_return_t thread_state_initialize(thread_t thread); /* XXX should have prototypes in a shared header file */ extern int get_map_nentries(vm_map_t); -extern kern_return_t thread_userstack(thread_t, int, thread_state_t, - unsigned int, mach_vm_offset_t *, int *); -extern kern_return_t thread_entrypoint(thread_t, int, thread_state_t, - unsigned int, mach_vm_offset_t *); extern kern_return_t memory_object_signed(memory_object_control_t control, boolean_t is_signed); @@ -119,8 +115,11 @@ static load_result_t load_result_null = { .unixproc = 0, .dynlinker = 0, .customstack = 0, + .validentry = 0, .csflags = 0, - .uuid = { 0 } + .uuid = { 0 }, + .min_vm_addr = MACH_VM_MAX_ADDRESS, + .max_vm_addr = MACH_VM_MIN_ADDRESS }; /* @@ -135,6 +134,7 @@ parse_machfile( off_t file_offset, off_t macho_size, int depth, + int64_t slide, load_result_t *result ); @@ -147,10 +147,12 @@ load_segment( off_t macho_size, struct vnode *vp, vm_map_t map, + int64_t slide, load_result_t *result ); -int load_code_signature( +static load_return_t +load_code_signature( struct linkedit_data_command *lcp, struct vnode *vp, off_t macho_offset, @@ -171,13 +173,7 @@ static load_return_t load_unixthread( struct thread_command *tcp, thread_t thread, - load_result_t *result -); - -static load_return_t -load_thread( - struct thread_command *tcp, - thread_t thread, + int64_t slide, load_result_t *result ); @@ -193,7 +189,7 @@ load_threadstack( thread_t thread, uint32_t *ts, uint32_t total_size, - user_addr_t *user_stack, + mach_vm_offset_t *user_stack, int *customstack ); @@ -212,10 +208,12 @@ load_dylinker( vm_map_t map, thread_t thread, int depth, - load_result_t *result, - boolean_t is_64bit + int64_t slide, + load_result_t *result ); +struct macho_data; + static load_return_t get_macho_vnode( char *path, @@ -223,6 +221,7 @@ get_macho_vnode( struct mach_header *mach_header, off_t *file_offset, off_t *macho_size, + struct macho_data *macho_data, struct vnode **vpp ); @@ -246,7 +245,7 @@ widen_segment_command(const struct segment_command *scp32, static void 
note_all_image_info_section(const struct segment_command_64 *scp,
    boolean_t is64, size_t section_size, const void *sections,
-    load_result_t *result)
+    int64_t slide, load_result_t *result)
 {
 	const union {
 		struct section s32;
@@ -263,6 +262,7 @@ note_all_image_info_section(const struct segment_command_64 *scp,
 		    sizeof(sectionp->s64.sectname))) {
 			result->all_image_info_addr = is64 ?
 			    sectionp->s64.addr : sectionp->s32.addr;
+			result->all_image_info_addr += slide;
 			result->all_image_info_size = is64 ?
 			    sectionp->s64.size : sectionp->s32.size;
 			return;
@@ -270,7 +270,6 @@ note_all_image_info_section(const struct segment_command_64 *scp,
 	}
 }
 
-
 load_return_t
 load_machfile(
 	struct image_params	*imgp,
@@ -293,6 +292,8 @@ load_machfile(
 	boolean_t create_map = FALSE;
 	int spawn = (imgp->ip_flags & IMGPF_SPAWN);
 	task_t task = current_task();
+	mach_vm_offset_t	aslr_offset = 0;
+	kern_return_t 		kret;
 
 	if (new_map == VM_MAP_NULL) {
 		create_map = TRUE;
@@ -312,10 +313,12 @@ load_machfile(
 	if (create_map) {
 		pmap = pmap_create((vm_map_size_t) 0,
 				(imgp->ip_flags & IMGPF_IS_64BIT));
+		pal_switch_pmap(thread, pmap, imgp->ip_flags & IMGPF_IS_64BIT);
 		map = vm_map_create(pmap,
 				0,
 				vm_compute_max_offset((imgp->ip_flags & IMGPF_IS_64BIT)),
 				TRUE);
+
 	} else
 		map = new_map;
 
@@ -325,6 +328,20 @@ load_machfile(
 	if ( (header->flags & MH_ALLOW_STACK_EXECUTION) )
 		vm_map_disable_NX(map);
 #endif
+
+	/* Forcibly disallow execution from data pages even if the arch
+	 * normally permits it. */
+	if ((header->flags & MH_NO_HEAP_EXECUTION) && !(imgp->ip_flags & IMGPF_ALLOW_DATA_EXEC))
+		vm_map_disallow_data_exec(map);
+
+	/*
+	 * Compute a random offset for ASLR.
+	 */
+	if (!(imgp->ip_flags & IMGPF_DISABLE_ASLR)) {
+		aslr_offset = random();
+		aslr_offset %= 1 << ((imgp->ip_flags & IMGPF_IS_64BIT) ? 16 : 8);
+		aslr_offset <<= PAGE_SHIFT;
+	}
 
 	if (!result)
 		result = &myresult;
@@ -332,7 +349,7 @@ load_machfile(
 	*result = load_result_null;
 
 	lret = parse_machfile(vp, map, thread, header, file_offset, macho_size,
-			0, result);
+			0, (int64_t)aslr_offset, result);
 
 	if (lret != LOAD_SUCCESS) {
 		if (create_map) {
@@ -362,7 +379,7 @@ load_machfile(
 	if (create_map) {
 		/*
-		 * If this is an exec, then we are going to destory the old
+		 * If this is an exec, then we are going to destroy the old
 		 * task, and it's correct to halt it; if it's spawn, the
 		 * task is not yet running, and it makes no sense.
 		 */
@@ -376,15 +393,16 @@ load_machfile(
 			 * task halting (wait for threads and then cleanup
 			 * task resources).
 			 */
-			task_start_halt(task);
+			kret = task_start_halt(task);
+			if (kret != KERN_SUCCESS) {
+				return(kret);
+			}
 			proc_transcommit(current_proc(), 0);
 			task_complete_halt(task);
+			workqueue_exit(current_proc());
 		}
-		old_map = swap_task_map(old_task, thread, map);
+		old_map = swap_task_map(old_task, thread, map, !spawn);
 		vm_map_clear_4GB_pagezero(old_map);
-		/* XXX L4 : For spawn the current task isn't running... */
-		if (!spawn)
-			pmap_switch(pmap); /* Make sure we are using the new pmap */
 		vm_map_deallocate(old_map);
 	}
 	return(LOAD_SUCCESS);
@@ -397,7 +415,9 @@ load_machfile(
 * bits in the file format itself.  We read into the kernel buffer the
 * commands section, and then parse it in order to parse the mach-o file
 * format load_command segment(s).  We are only interested in a subset of
- * the total set of possible commands.
+ * the total set of possible commands. If "map"==VM_MAP_NULL or
+ * "thread"==THREAD_NULL, do not make permanent VM modifications,
+ * just preflight the parse.
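+ *
+ * The aslr_offset passed down from load_machfile() is random(),
+ * limited to 16 bits (8 for 32-bit tasks) of page-shifted entropy.
+ * A preflight pass is how load_dylinker() below sizes dyld before
+ * choosing a slide; roughly (a sketch of the call as used there):
+ *
+ *	*myresult = load_result_null;
+ *	ret = parse_machfile(vp, VM_MAP_NULL, THREAD_NULL, header,
+ *	    file_offset, macho_size, depth, 0, myresult);
+ *
+ * after which myresult->min_vm_addr and myresult->max_vm_addr
+ * bound the image.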
 */
 
 static
 load_return_t
@@ -409,6 +429,7 @@ parse_machfile(
 	off_t			file_offset,
 	off_t			macho_size,
 	int			depth,
+	int64_t			aslr_offset,
 	load_result_t		*result
 )
 {
@@ -428,10 +449,10 @@ parse_machfile(
 	proc_t			p = current_proc();		/* XXXX */
 	int			error;
 	int resid=0;
-	task_t task;
 	size_t			mach_header_sz = sizeof(struct mach_header);
 	boolean_t		abi64;
 	boolean_t		got_code_signatures = FALSE;
+	int64_t			slide = 0;
 
 	if (header->magic == MH_MAGIC_64 ||
 	    header->magic == MH_CIGAM_64) {
@@ -445,8 +466,6 @@ parse_machfile(
 		return(LOAD_FAILURE);
 	}
 
-	task = (task_t)get_threadtask(thread);
-
 	depth++;
 
 	/*
@@ -522,11 +541,30 @@ parse_machfile(
 		kfree(kl_addr, kl_size);
 		return(LOAD_IOERROR);
 	}
-	
+
+	/*
+	 * For PIE and dyld, slide everything by the ASLR offset.
+	 */
+	if ((header->flags & MH_PIE) || (header->filetype == MH_DYLINKER)) {
+		slide = aslr_offset;
+	}
+
 	/*
 	 *	Scan through the commands, processing each one as necessary.
 	 */
-	for (pass = 1; pass <= 2; pass++) {
+	for (pass = 1; pass <= 3; pass++) {
+
+#if CONFIG_EMBEDDED
+		/*
+		 * Check that the entry point is contained in an executable segment
+		 */
+		if ((pass == 3) && (result->validentry == 0)) {
+			thread_state_initialize(thread);
+			ret = LOAD_FAILURE;
+			break;
+		}
+#endif
+
 		/*
 		 * Loop through each of the load_commands indicated by the
 		 * Mach-O header; if an absurd value is provided, we just
@@ -535,6 +573,7 @@ parse_machfile(
 		 */
 		offset = mach_header_sz;
 		ncmds = header->ncmds;
+
 		while (ncmds--) {
 			/*
 			 *	Get a pointer to the command.
@@ -565,7 +604,7 @@ parse_machfile(
 			switch(lcp->cmd) {
 			case LC_SEGMENT:
 			case LC_SEGMENT_64:
-				if (pass != 1)
+				if (pass != 2)
 					break;
 				ret = load_segment(lcp,
						   header->filetype,
						   file_offset,
						   macho_size,
						   vp,
						   map,
+						   slide,
						   result);
 				break;
-			case LC_THREAD:
-				if (pass != 2)
-					break;
-				ret = load_thread((struct thread_command *)lcp,
-						   thread,
-						  result);
-				break;
 			case LC_UNIXTHREAD:
-				if (pass != 2)
+				if (pass != 1)
 					break;
 				ret = load_unixthread(
						 (struct thread_command *) lcp,
-						   thread,
+						 thread,
+						 slide,
						 result);
 				break;
 			case LC_LOAD_DYLINKER:
-				if (pass != 2)
+				if (pass != 3)
 					break;
 				if ((depth == 1) && (dlp == 0)) {
 					dlp = (struct dylinker_command *)lcp;
@@ -602,14 +636,14 @@ parse_machfile(
 				}
 				break;
 			case LC_UUID:
-				if (pass == 2 && depth == 1) {
+				if (pass == 1 && depth == 1) {
 					uulp = (struct uuid_command *)lcp;
 					memcpy(&result->uuid[0], &uulp->uuid[0], sizeof(result->uuid));
 				}
 				break;
 			case LC_CODE_SIGNATURE:
 				/* CODE SIGNING */
-				if (pass != 2)
+				if (pass != 1)
 					break;
 				/* pager -> uip ->
				   load signatures & store in uip
@@ -633,7 +667,7 @@ parse_machfile(
 				break;
 #if CONFIG_CODE_DECRYPTION
 			case LC_ENCRYPTION_INFO:
-				if (pass != 2)
+				if (pass != 3)
 					break;
 				ret = set_code_unprotect(
					(struct encryption_info_command *) lcp,
@@ -671,24 +705,15 @@ parse_machfile(
 		}
 	}
 
-	if (dlp != 0)
-		ret = load_dylinker(dlp, dlarchbits, map, thread, depth, result, abi64);
+	if (dlp != 0) {
+		/* load the dylinker, and always slide it by the ASLR
+		 * offset regardless of PIE */
+		ret = load_dylinker(dlp, dlarchbits, map, thread, depth, aslr_offset, result);
+	}
 
 	if(depth == 1) {
 		if (result->thread_count == 0) {
 			ret = LOAD_FAILURE;
-		} else if ( abi64 ) {
-#ifdef __ppc__
-			/* Map in 64-bit commpage */
-			/*
-			 * PPC51: ppc64 is limited to 51-bit addresses.
-			 * Memory above that limit is handled specially
-			 * at the pmap level.
- * - * <rdar://6640492> -- wrong task for vfork()/spawn() - */ - pmap_map_sharedpage(current_task(), get_map_pmap(map)); -#endif /* __ppc__ */ } } } @@ -780,6 +805,7 @@ load_segment( off_t macho_size, struct vnode *vp, vm_map_t map, + int64_t slide, load_result_t *result ) { @@ -795,17 +821,21 @@ load_segment( if (LC_SEGMENT_64 == lcp->cmd) { segment_command_size = sizeof(struct segment_command_64); single_section_size = sizeof(struct section_64); - scp = (struct segment_command_64 *)lcp; } else { segment_command_size = sizeof(struct segment_command); single_section_size = sizeof(struct section); - scp = &segment_command; - widen_segment_command((struct segment_command *)lcp, scp); } if (lcp->cmdsize < segment_command_size) return (LOAD_BADMACHO); total_section_size = lcp->cmdsize - segment_command_size; + if (LC_SEGMENT_64 == lcp->cmd) + scp = (struct segment_command_64 *)lcp; + else { + scp = &segment_command; + widen_segment_command((struct segment_command *)lcp, scp); + } + /* * Make sure what we get from the file is really ours (as specified * by macho_size). @@ -833,27 +863,48 @@ load_segment( map_addr = trunc_page_64(scp->vmaddr); /* JVXXX note that in XNU TOT this is round instead of trunc for 64 bits */ if (seg_size == 0) return (KERN_SUCCESS); - /* XXX (4596982) this interferes with Rosetta, so limit to 64-bit tasks */ if (map_addr == 0 && map_size == 0 && seg_size != 0 && - scp->cmd == LC_SEGMENT_64 && (scp->initprot & VM_PROT_ALL) == VM_PROT_NONE && (scp->maxprot & VM_PROT_ALL) == VM_PROT_NONE) { /* - * This is a "page zero" segment: it starts at address 0, - * is not mapped from the binary file and is not accessible. - * User-space should never be able to access that memory, so - * make it completely off limits by raising the VM map's - * minimum offset. + * For PIE, extend page zero rather than moving it. Extending + * page zero keeps early allocations from falling predictably + * between the end of page zero and the beginning of the first + * slid segment. */ - ret = vm_map_raise_min_offset(map, seg_size); - if (ret != KERN_SUCCESS) { - return (LOAD_FAILURE); + seg_size += slide; + slide = 0; + + /* XXX (4596982) this interferes with Rosetta, so limit to 64-bit tasks */ + if (scp->cmd == LC_SEGMENT_64) { + /* + * This is a "page zero" segment: it starts at address 0, + * is not mapped from the binary file and is not accessible. + * User-space should never be able to access that memory, so + * make it completely off limits by raising the VM map's + * minimum offset. 
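+		 *
+		 * Worked example (values illustrative only): a 64-bit
+		 * PIE binary with a 4GB __PAGEZERO and a slide of
+		 * 0x5000 reaches this branch with seg_size already
+		 * grown to 0x100005000 and slide zeroed above, so the
+		 * raised minimum offset covers both page zero and the
+		 * gap up to the first slid segment.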
+ */ + ret = vm_map_raise_min_offset(map, seg_size); + if (ret != KERN_SUCCESS) { + return (LOAD_FAILURE); + } + return (LOAD_SUCCESS); } - return (LOAD_SUCCESS); } + /* If a non-zero slide was specified by the caller, apply now */ + map_addr += slide; + + if (map_addr < result->min_vm_addr) + result->min_vm_addr = map_addr; + if (map_addr+seg_size > result->max_vm_addr) + result->max_vm_addr = map_addr+seg_size; + + if (map == VM_MAP_NULL) + return (LOAD_SUCCESS); + map_offset = pager_offset + scp->fileoff; /* limited to 32 bits */ if (map_size > 0) { @@ -930,77 +981,12 @@ load_segment( result->all_image_info_addr == MACH_VM_MIN_ADDRESS) note_all_image_info_section(scp, LC_SEGMENT_64 == lcp->cmd, single_section_size, - (const char *)lcp + segment_command_size, result); - - return ret; -} - -static -load_return_t -load_thread( - struct thread_command *tcp, - thread_t thread, - load_result_t *result -) -{ - kern_return_t kret; - load_return_t lret; - task_t task; - int customstack=0; + (const char *)lcp + segment_command_size, slide, result); - if (tcp->cmdsize < sizeof(*tcp)) - return (LOAD_BADMACHO); - task = get_threadtask(thread); + if ((result->entry_point >= map_addr) && (result->entry_point < (map_addr + map_size))) + result->validentry = 1; - /* if count is 0; same as thread */ - if (result->thread_count != 0) { - kret = thread_create(task, &thread); - if (kret != KERN_SUCCESS) - return(LOAD_RESOURCE); - thread_deallocate(thread); - } - - lret = load_threadstate(thread, - (uint32_t *)(((vm_offset_t)tcp) + - sizeof(struct thread_command)), - tcp->cmdsize - sizeof(struct thread_command)); - if (lret != LOAD_SUCCESS) - return (lret); - - if (result->thread_count == 0) { - lret = load_threadstack(thread, - (uint32_t *)(((vm_offset_t)tcp) + - sizeof(struct thread_command)), - tcp->cmdsize - sizeof(struct thread_command), - &result->user_stack, - &customstack); - if (customstack) - result->customstack = 1; - else - result->customstack = 0; - - if (lret != LOAD_SUCCESS) - return(lret); - - lret = load_threadentry(thread, - (uint32_t *)(((vm_offset_t)tcp) + - sizeof(struct thread_command)), - tcp->cmdsize - sizeof(struct thread_command), - &result->entry_point); - if (lret != LOAD_SUCCESS) - return(lret); - } - /* - * Resume thread now, note that this means that the thread - * commands should appear after all the load commands to - * be sure they don't reference anything not yet mapped. 
- */ - else - thread_resume(thread); - - result->thread_count++; - - return(LOAD_SUCCESS); + return ret; } static @@ -1008,6 +994,7 @@ load_return_t load_unixthread( struct thread_command *tcp, thread_t thread, + int64_t slide, load_result_t *result ) { @@ -1017,9 +1004,12 @@ load_unixthread( if (tcp->cmdsize < sizeof(*tcp)) return (LOAD_BADMACHO); if (result->thread_count != 0) { -printf("load_unixthread: already have a thread!"); + printf("load_unixthread: already have a thread!"); return (LOAD_FAILURE); } + + if (thread == THREAD_NULL) + return (LOAD_SUCCESS); ret = load_threadstack(thread, (uint32_t *)(((vm_offset_t)tcp) + @@ -1031,9 +1021,12 @@ printf("load_unixthread: already have a thread!"); return(ret); if (customstack) - result->customstack = 1; + result->customstack = 1; else - result->customstack = 0; + result->customstack = 0; + + result->user_stack += slide; + ret = load_threadentry(thread, (uint32_t *)(((vm_offset_t)tcp) + sizeof(struct thread_command)), @@ -1042,6 +1035,8 @@ printf("load_unixthread: already have a thread!"); if (ret != LOAD_SUCCESS) return(ret); + result->entry_point += slide; + ret = load_threadstate(thread, (uint32_t *)(((vm_offset_t)tcp) + sizeof(struct thread_command)), @@ -1107,7 +1102,7 @@ load_threadstack( thread_t thread, uint32_t *ts, uint32_t total_size, - user_addr_t *user_stack, + mach_vm_offset_t *user_stack, int *customstack ) { @@ -1183,31 +1178,40 @@ load_threadentry( return(LOAD_SUCCESS); } +struct macho_data { + struct nameidata __nid; + union macho_vnode_header { + struct mach_header mach_header; + struct fat_header fat_header; + char __pad[512]; + } __header; +}; -static -load_return_t +static load_return_t load_dylinker( struct dylinker_command *lcp, integer_t archbits, vm_map_t map, thread_t thread, int depth, - load_result_t *result, - boolean_t is_64bit + int64_t slide, + load_result_t *result ) { char *name; char *p; struct vnode *vp = NULLVP; /* set by get_macho_vnode() */ - struct mach_header header; + struct mach_header *header; off_t file_offset = 0; /* set by get_macho_vnode() */ off_t macho_size = 0; /* set by get_macho_vnode() */ - vm_map_t copy_map; - load_result_t myresult; + load_result_t *myresult; kern_return_t ret; - vm_map_copy_t tmp; - mach_vm_offset_t dyl_start, map_addr; - mach_vm_size_t dyl_length; + struct macho_data *macho_data; + struct { + struct mach_header __header; + load_result_t __myresult; + struct macho_data __macho_data; + } *dyld_data; if (lcp->cmdsize < sizeof(*lcp)) return (LOAD_BADMACHO); @@ -1222,11 +1226,19 @@ load_dylinker( return(LOAD_BADMACHO); } while (*p++); - ret = get_macho_vnode(name, archbits, &header, &file_offset, &macho_size, &vp); + /* Allocate wad-of-data from heap to reduce excessively deep stacks */ + + MALLOC(dyld_data, void *, sizeof (*dyld_data), M_TEMP, M_WAITOK); + header = &dyld_data->__header; + myresult = &dyld_data->__myresult; + macho_data = &dyld_data->__macho_data; + + ret = get_macho_vnode(name, archbits, header, + &file_offset, &macho_size, macho_data, &vp); if (ret) - return (ret); - - myresult = load_result_null; + goto novp_out; + + *myresult = load_result_null; /* * First try to map dyld in directly. This should work most of @@ -1234,106 +1246,85 @@ load_dylinker( * mapped to its address. 
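 *
 * If the direct attempt fails with LOAD_NOSPACE, the fallback below
 * re-parses dyld with VM_MAP_NULL to learn its vm range, probes for
 * a free range of that size with a throwaway mach_vm_allocate() and
 * mach_vm_deallocate() pair, and then re-parses for real with
 * slide_amount set to the distance from dyl_start to the probed
 * map_addr (plus any ASLR slide already applied to the main image).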
*/ - ret = parse_machfile(vp, map, thread, &header, file_offset, macho_size, - depth, &myresult); + ret = parse_machfile(vp, map, thread, header, file_offset, + macho_size, depth, slide, myresult); /* * If it turned out something was in the way, then we'll take - * take this longer path to map dyld into a temporary map and - * copy it into destination map at a different address. + * take this longer path to preflight dyld's vm ranges, then + * map it at a free location in the address space. */ if (ret == LOAD_NOSPACE) { + mach_vm_offset_t dyl_start, map_addr; + mach_vm_size_t dyl_length; + int64_t slide_amount; + + *myresult = load_result_null; /* - * Load the Mach-O. - * Use a temporary map to do the work. + * Preflight parsing the Mach-O file with a NULL + * map, which will return the ranges needed for a + * subsequent map attempt (with a slide) in "myresult" */ - copy_map = vm_map_create(pmap_create(vm_map_round_page(macho_size), - is_64bit), - get_map_min(map), get_map_max(map), TRUE); - if (VM_MAP_NULL == copy_map) { - ret = LOAD_RESOURCE; + ret = parse_machfile(vp, VM_MAP_NULL, THREAD_NULL, header, + file_offset, macho_size, depth, 0 /* slide */, myresult); + + if (ret != LOAD_SUCCESS) { goto out; } - - myresult = load_result_null; - ret = parse_machfile(vp, copy_map, thread, &header, - file_offset, macho_size, - depth, &myresult); - - if (ret) { - vm_map_deallocate(copy_map); + dyl_start = myresult->min_vm_addr; + dyl_length = myresult->max_vm_addr - myresult->min_vm_addr; + + dyl_length += slide; + + /* To find an appropriate load address, do a quick allocation */ + map_addr = dyl_start; + ret = mach_vm_allocate(map, &map_addr, dyl_length, VM_FLAGS_ANYWHERE); + if (ret != KERN_SUCCESS) { + ret = LOAD_NOSPACE; goto out; } - - if (get_map_nentries(copy_map) > 0) { - - dyl_start = mach_get_vm_start(copy_map); - dyl_length = mach_get_vm_end(copy_map) - dyl_start; - - map_addr = dyl_start; - ret = mach_vm_allocate(map, &map_addr, dyl_length, VM_FLAGS_ANYWHERE); - - if (ret != KERN_SUCCESS) { - vm_map_deallocate(copy_map); - ret = LOAD_NOSPACE; - goto out; - - } - ret = vm_map_copyin(copy_map, - (vm_map_address_t)dyl_start, - (vm_map_size_t)dyl_length, - TRUE, &tmp); - if (ret != KERN_SUCCESS) { - (void) vm_map_remove(map, - vm_map_trunc_page(map_addr), - vm_map_round_page(map_addr + dyl_length), - VM_MAP_NO_FLAGS); - vm_map_deallocate(copy_map); - goto out; - } - - ret = vm_map_copy_overwrite(map, - (vm_map_address_t)map_addr, - tmp, FALSE); - if (ret != KERN_SUCCESS) { - vm_map_copy_discard(tmp); - (void) vm_map_remove(map, - vm_map_trunc_page(map_addr), - vm_map_round_page(map_addr + dyl_length), - VM_MAP_NO_FLAGS); - vm_map_deallocate(copy_map); - goto out; - } - - if (map_addr != dyl_start) { - myresult.entry_point += (map_addr - dyl_start); - myresult.all_image_info_addr += - (map_addr - dyl_start); - } - } else { - ret = LOAD_FAILURE; + ret = mach_vm_deallocate(map, map_addr, dyl_length); + if (ret != KERN_SUCCESS) { + ret = LOAD_NOSPACE; + goto out; } + + if (map_addr < dyl_start) + slide_amount = -(int64_t)(dyl_start - map_addr); + else + slide_amount = (int64_t)(map_addr - dyl_start); + + slide_amount += slide; - vm_map_deallocate(copy_map); + *myresult = load_result_null; + + ret = parse_machfile(vp, map, thread, header, + file_offset, macho_size, depth, slide_amount, myresult); + + if (ret) { + goto out; + } } - + if (ret == LOAD_SUCCESS) { result->dynlinker = TRUE; - result->entry_point = myresult.entry_point; - result->all_image_info_addr = myresult.all_image_info_addr; - 
result->all_image_info_size = myresult.all_image_info_size; + result->entry_point = myresult->entry_point; + result->all_image_info_addr = myresult->all_image_info_addr; + result->all_image_info_size = myresult->all_image_info_size; } out: vnode_put(vp); +novp_out: + FREE(dyld_data, M_TEMP); return (ret); } -int +static load_return_t load_code_signature( struct linkedit_data_command *lcp, struct vnode *vp, @@ -1408,6 +1399,10 @@ load_code_signature( /* ubc_cs_blob_add() has consumed "addr" */ addr = 0; } + +#if CHECK_CS_VALIDATION_BITMAP + ubc_cs_validation_bitmap_allocate( vp ); +#endif blob = ubc_cs_blob_get(vp, cputype, -1); @@ -1435,9 +1430,9 @@ set_code_unprotect( struct vnode *vp) { int result, len; - char vpath[MAXPATHLEN]; pager_crypt_info_t crypt_info; const char * cryptname = 0; + char *vpath; size_t offset; struct segment_command_64 *seg64; @@ -1445,8 +1440,7 @@ set_code_unprotect( vm_map_offset_t map_offset, map_size; kern_return_t kr; - if (eip->cmdsize < sizeof(*eip)) - return LOAD_BADMACHO; + if (eip->cmdsize < sizeof(*eip)) return LOAD_BADMACHO; switch(eip->cryptid) { case 0: @@ -1464,13 +1458,22 @@ set_code_unprotect( return LOAD_BADMACHO; } + if (map == VM_MAP_NULL) return (LOAD_SUCCESS); + if (NULL == text_crypter_create) return LOAD_FAILURE; + + MALLOC_ZONE(vpath, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); + if(vpath == NULL) return LOAD_FAILURE; + len = MAXPATHLEN; result = vn_getpath(vp, vpath, &len); - if(result) return result; + if(result) { + FREE_ZONE(vpath, MAXPATHLEN, M_NAMEI); + return LOAD_FAILURE; + } /* set up decrypter first */ - if(NULL==text_crypter_create) return LOAD_FAILURE; kr=text_crypter_create(&crypt_info, cryptname, (void*)vpath); + FREE_ZONE(vpath, MAXPATHLEN, M_NAMEI); if(kr) { printf("set_code_unprotect: unable to create decrypter %s, kr=%d\n", @@ -1549,6 +1552,7 @@ get_macho_vnode( struct mach_header *mach_header, off_t *file_offset, off_t *macho_size, + struct macho_data *data, struct vnode **vpp ) { @@ -1556,19 +1560,14 @@ get_macho_vnode( vfs_context_t ctx = vfs_context_current(); proc_t p = vfs_context_proc(ctx); kauth_cred_t kerncred; - struct nameidata nid, *ndp; + struct nameidata *ndp = &data->__nid; boolean_t is_fat; struct fat_arch fat_arch; - int error = LOAD_SUCCESS; + int error; int resid; - union { - struct mach_header mach_header; - struct fat_header fat_header; - char pad[512]; - } header; + union macho_vnode_header *header = &data->__header; off_t fsize = (off_t)0; - int err2; - + /* * Capture the kernel credential for use in the actual read of the * file, since the user doing the execution may have execute rights @@ -1579,10 +1578,8 @@ get_macho_vnode( */ kerncred = vfs_context_ucred(vfs_context_kernel()); - ndp = &nid; - /* init the namei data to point the file user's program name */ - NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx); + NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW | LOCKLEAF, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx); if ((error = namei(ndp)) != 0) { if (error == ENOENT) { @@ -1594,7 +1591,7 @@ get_macho_vnode( } nameidone(ndp); vp = ndp->ni_vp; - + /* check for regular file */ if (vp->v_type != VREG) { error = LOAD_PROTECT; @@ -1625,41 +1622,42 @@ get_macho_vnode( goto bad1; } - if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)&header, sizeof(header), 0, + if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)header, sizeof (*header), 0, UIO_SYSSPACE, IO_NODELOCKED, kerncred, &resid, p)) != 0) { error = LOAD_IOERROR; goto bad2; } - - if (header.mach_header.magic == MH_MAGIC || - 
header.mach_header.magic == MH_MAGIC_64) - is_fat = FALSE; - else if (header.fat_header.magic == FAT_MAGIC || - header.fat_header.magic == FAT_CIGAM) - is_fat = TRUE; - else { - error = LOAD_BADMACHO; - goto bad2; + + if (header->mach_header.magic == MH_MAGIC || + header->mach_header.magic == MH_MAGIC_64) { + is_fat = FALSE; + } else if (header->fat_header.magic == FAT_MAGIC || + header->fat_header.magic == FAT_CIGAM) { + is_fat = TRUE; + } else { + error = LOAD_BADMACHO; + goto bad2; } if (is_fat) { /* Look up our architecture in the fat file. */ - error = fatfile_getarch_with_bits(vp, archbits, (vm_offset_t)(&header.fat_header), &fat_arch); + error = fatfile_getarch_with_bits(vp, archbits, + (vm_offset_t)(&header->fat_header), &fat_arch); if (error != LOAD_SUCCESS) goto bad2; /* Read the Mach-O header out of it */ - error = vn_rdwr(UIO_READ, vp, (caddr_t)&header.mach_header, - sizeof(header.mach_header), fat_arch.offset, - UIO_SYSSPACE, IO_NODELOCKED, kerncred, &resid, p); + error = vn_rdwr(UIO_READ, vp, (caddr_t)&header->mach_header, + sizeof (header->mach_header), fat_arch.offset, + UIO_SYSSPACE, IO_NODELOCKED, kerncred, &resid, p); if (error) { error = LOAD_IOERROR; goto bad2; } /* Is this really a Mach-O? */ - if (header.mach_header.magic != MH_MAGIC && - header.mach_header.magic != MH_MAGIC_64) { + if (header->mach_header.magic != MH_MAGIC && + header->mach_header.magic != MH_MAGIC_64) { error = LOAD_BADMACHO; goto bad2; } @@ -1677,25 +1675,23 @@ get_macho_vnode( * required, since the dynamic linker might work, but we will * refuse to load it because of this check. */ - if ((cpu_type_t)(header.mach_header.cputype & CPU_ARCH_MASK) != archbits) - return(LOAD_BADARCH); + if ((cpu_type_t)(header->mach_header.cputype & CPU_ARCH_MASK) != archbits) { + error = LOAD_BADARCH; + goto bad2; + } *file_offset = 0; *macho_size = fsize; } - *mach_header = header.mach_header; + *mach_header = header->mach_header; *vpp = vp; ubc_setsize(vp, fsize); - return (error); bad2: - err2 = VNOP_CLOSE(vp, FREAD, ctx); - vnode_put(vp); - return (error); - + (void) VNOP_CLOSE(vp, FREAD, ctx); bad1: vnode_put(vp); return(error); diff --git a/bsd/kern/mach_loader.h b/bsd/kern/mach_loader.h index 413d1a9a7..fd8e585db 100644 --- a/bsd/kern/mach_loader.h +++ b/bsd/kern/mach_loader.h @@ -60,9 +60,12 @@ typedef struct _load_result { /* boolean_t */ unixproc :1, dynlinker :1, customstack :1, + validentry :1, :0; unsigned int csflags; unsigned char uuid[16]; + mach_vm_address_t min_vm_addr; + mach_vm_address_t max_vm_addr; } load_result_t; struct image_params; diff --git a/bsd/kern/mach_process.c b/bsd/kern/mach_process.c index 0df8a49c8..9aba89b96 100644 --- a/bsd/kern/mach_process.c +++ b/bsd/kern/mach_process.c @@ -94,6 +94,8 @@ #include <vm/vm_protos.h> /* cs_allow_invalid() */ +#include <pexpert/pexpert.h> + /* XXX ken/bsd_kern.c - prototype should be in common header */ int get_task_userstop(task_t); @@ -127,6 +129,10 @@ ptrace(struct proc *p, struct ptrace_args *uap, int32_t *retval) AUDIT_ARG(value32, uap->data); if (uap->req == PT_DENY_ATTACH) { +#if (DEVELOPMENT || DEBUG) && defined(__arm__) + if (PE_i_can_has_debugger(NULL)) + return(0); +#endif proc_lock(p); if (ISSET(p->p_lflag, P_LTRACED)) { proc_unlock(p); @@ -164,8 +170,10 @@ ptrace(struct proc *p, struct ptrace_args *uap, int32_t *retval) struct proc *pproc=proc_find(p->p_oppid); proc_unlock(p); cs_allow_invalid(p); - cs_allow_invalid(pproc); - proc_rele(pproc); + if(pproc) { + cs_allow_invalid(pproc); + proc_rele(pproc); + } return(0); } if (uap->req 
== PT_SIGEXC) { @@ -434,7 +442,7 @@ cantrace(proc_t cur_procp, kauth_cred_t creds, proc_t traced_procp, int *errp) * (3) it's not owned by you, or is set-id on exec * (unless you're root). */ - if ((creds->cr_ruid != proc_ucred(traced_procp)->cr_ruid || + if ((kauth_cred_getruid(creds) != kauth_cred_getruid(proc_ucred(traced_procp)) || ISSET(traced_procp->p_flag, P_SUGID)) && (my_err = suser(creds, &cur_procp->p_acflag)) != 0) { *errp = my_err; diff --git a/bsd/kern/makesyscalls.sh b/bsd/kern/makesyscalls.sh index d8b11ba6c..301905871 100755 --- a/bsd/kern/makesyscalls.sh +++ b/bsd/kern/makesyscalls.sh @@ -190,6 +190,11 @@ s/\$//g printf "#include <mach/shared_region.h>\n" > sysarg printf "\n#ifdef KERNEL\n" > sysarg printf "#ifdef __APPLE_API_PRIVATE\n" > sysarg + printf "/*\n" > sysarg + printf " * The kernel may support multiple userspace ABIs, and must use\n" > sysarg + printf " * argument structures with elements large enough for any of them.\n" > sysarg + printf "*/\n" > sysarg + printf "\n" > sysarg printf "#ifndef __arm__\n" > sysarg printf "#define\tPAD_(t)\t(sizeof(uint64_t) <= sizeof(t) \\\n " > sysarg printf "\t\t? 0 : sizeof(uint64_t) - sizeof(t))\n" > sysarg @@ -205,8 +210,6 @@ s/\$//g printf "#define\tPADR_(t)\t0\n" > sysarg printf "#endif\n" > sysarg printf "\n__BEGIN_DECLS\n" > sysarg - printf "#ifndef __MUNGE_ONCE\n" > sysarg - printf "#define __MUNGE_ONCE\n" > sysarg printf "#ifndef __arm__\n" > sysarg printf "void munge_w(const void *, void *); \n" > sysarg printf "void munge_ww(const void *, void *); \n" > sysarg @@ -218,6 +221,10 @@ s/\$//g printf "void munge_wwwwwwww(const void *, void *); \n" > sysarg printf "void munge_wl(const void *, void *); \n" > sysarg printf "void munge_wlw(const void *, void *); \n" > sysarg + printf "void munge_wlwwwll(const void *, void *); \n" > sysarg + printf "void munge_wlwwwllw(const void *, void *); \n" > sysarg + printf "void munge_wlwwlwlw(const void *, void *); \n" > sysarg + printf "void munge_wllwwll(const void *, void *); \n" > sysarg printf "void munge_wwwl(const void *, void *); \n" > sysarg printf "void munge_wwwlw(const void *, void *); \n" > sysarg printf "void munge_wwwlww(const void *, void *); \n" > sysarg @@ -225,13 +232,18 @@ s/\$//g printf "void munge_wwwwlw(const void *, void *); \n" > sysarg printf "void munge_wwwwl(const void *, void *); \n" > sysarg printf "void munge_wwwwwl(const void *, void *); \n" > sysarg + printf "void munge_wwwwwlww(const void *, void *); \n" > sysarg + printf "void munge_wwwwwllw(const void *, void *); \n" > sysarg + printf "void munge_wwwwwlll(const void *, void *); \n" > sysarg printf "void munge_wwwwwwll(const void *, void *); \n" > sysarg + printf "void munge_wwwwwwl(const void *, void *); \n" > sysarg printf "void munge_wwwwwwlw(const void *, void *); \n" > sysarg printf "void munge_wsw(const void *, void *); \n" > sysarg printf "void munge_wws(const void *, void *); \n" > sysarg printf "void munge_wwwsw(const void *, void *); \n" > sysarg printf "void munge_llllll(const void *, void *); \n" > sysarg printf "#else \n" > sysarg + printf "/* ARM does not need mungers for BSD system calls */\n" > sysarg printf "#define munge_w NULL \n" > sysarg printf "#define munge_ww NULL \n" > sysarg printf "#define munge_www NULL \n" > sysarg @@ -242,6 +254,10 @@ s/\$//g printf "#define munge_wwwwwwww NULL \n" > sysarg printf "#define munge_wl NULL \n" > sysarg printf "#define munge_wlw NULL \n" > sysarg + printf "#define munge_wlwwwll NULL \n" > sysarg + printf "#define munge_wlwwwllw NULL \n" > 
sysarg + printf "#define munge_wlwwlwlw NULL \n" > sysarg + printf "#define munge_wllwwll NULL \n" > sysarg printf "#define munge_wwwl NULL \n" > sysarg printf "#define munge_wwwlw NULL \n" > sysarg printf "#define munge_wwwlww NULL\n" > sysarg @@ -249,22 +265,18 @@ s/\$//g printf "#define munge_wwwwl NULL \n" > sysarg printf "#define munge_wwwwlw NULL \n" > sysarg printf "#define munge_wwwwwl NULL \n" > sysarg + printf "#define munge_wwwwwlww NULL \n" > sysarg + printf "#define munge_wwwwwllw NULL \n" > sysarg + printf "#define munge_wwwwwlll NULL \n" > sysarg + printf "#define munge_wwwwwwl NULL \n" > sysarg printf "#define munge_wwwwwwlw NULL \n" > sysarg printf "#define munge_wsw NULL \n" > sysarg printf "#define munge_wws NULL \n" > sysarg printf "#define munge_wwwsw NULL \n" > sysarg printf "#define munge_llllll NULL \n" > sysarg - printf "#endif // ! __arm__\n" > sysarg - printf "#ifdef __ppc__\n" > sysarg - printf "void munge_d(const void *, void *); \n" > sysarg - printf "void munge_dd(const void *, void *); \n" > sysarg - printf "void munge_ddd(const void *, void *); \n" > sysarg - printf "void munge_dddd(const void *, void *); \n" > sysarg - printf "void munge_ddddd(const void *, void *); \n" > sysarg - printf "void munge_dddddd(const void *, void *); \n" > sysarg - printf "void munge_ddddddd(const void *, void *); \n" > sysarg - printf "void munge_dddddddd(const void *, void *); \n" > sysarg - printf "#else \n" > sysarg + printf "#endif /* __arm__ */\n" > sysarg + printf "\n" > sysarg + printf "/* Active 64-bit user ABIs do not need munging */\n" > sysarg printf "#define munge_d NULL \n" > sysarg printf "#define munge_dd NULL \n" > sysarg printf "#define munge_ddd NULL \n" > sysarg @@ -273,8 +285,6 @@ s/\$//g printf "#define munge_dddddd NULL \n" > sysarg printf "#define munge_ddddddd NULL \n" > sysarg printf "#define munge_dddddddd NULL \n" > sysarg - printf "#endif // __ppc__\n" > sysarg - printf "#endif /* !__MUNGE_ONCE */\n" > sysarg printf "\n" > sysarg @@ -592,7 +602,7 @@ s/\$//g argtype[i] == "socklen_t" || argtype[i] == "uint32_t" || argtype[i] == "int32_t" || argtype[i] == "sigset_t" || argtype[i] == "gid_t" || argtype[i] == "unsigned int" || argtype[i] == "mode_t" || argtype[i] == "key_t" || - argtype[i] == "mach_port_name_t") { + argtype[i] == "mach_port_name_t" || argtype[i] == "au_asid_t") { munge32 = munge32 "w" munge64 = munge64 "d" size32 += 4 diff --git a/bsd/kern/mcache.c b/bsd/kern/mcache.c index 14416f34a..a0c6cfb69 100644 --- a/bsd/kern/mcache.c +++ b/bsd/kern/mcache.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006-2007 Apple Inc. All rights reserved. + * Copyright (c) 2006-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -98,9 +98,6 @@ #define MCACHE_UNLOCK(l) lck_mtx_unlock(l) #define MCACHE_LOCK_TRY(l) lck_mtx_try_lock(l) -/* This should be in a header file */ -#define atomic_add_32(a, n) ((void) OSAddAtomic(n, a)) - static int ncpu; static lck_mtx_t *mcache_llock; static struct thread *mcache_llock_owner; @@ -137,8 +134,8 @@ static mcache_bkttype_t mcache_bkttype[] = { }; static mcache_t *mcache_create_common(const char *, size_t, size_t, - mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_notifyfn_t, - void *, u_int32_t, int, int); + mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_logfn_t, + mcache_notifyfn_t, void *, u_int32_t, int, int); static unsigned int mcache_slab_alloc(void *, mcache_obj_t ***, unsigned int, int); static void mcache_slab_free(void *, mcache_obj_t *, boolean_t); @@ -192,6 +189,7 @@ mcache_init(void) PAGE_SIZE, "mcache"); if (mcache_zone == NULL) panic("mcache_init: failed to allocate mcache zone\n"); + zone_change(mcache_zone, Z_CALLERACCT, FALSE); LIST_INIT(&mcache_head); @@ -233,7 +231,8 @@ mcache_create(const char *name, size_t bufsize, size_t align, u_int32_t flags, int wait) { return (mcache_create_common(name, bufsize, align, mcache_slab_alloc, - mcache_slab_free, mcache_slab_audit, NULL, NULL, flags, 1, wait)); + mcache_slab_free, mcache_slab_audit, NULL, NULL, NULL, flags, 1, + wait)); } /* @@ -244,10 +243,11 @@ mcache_create(const char *name, size_t bufsize, size_t align, __private_extern__ mcache_t * mcache_create_ext(const char *name, size_t bufsize, mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn, - mcache_notifyfn_t notifyfn, void *arg, u_int32_t flags, int wait) + mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg, + u_int32_t flags, int wait) { return (mcache_create_common(name, bufsize, 0, allocfn, - freefn, auditfn, notifyfn, arg, flags, 0, wait)); + freefn, auditfn, logfn, notifyfn, arg, flags, 0, wait)); } /* @@ -256,8 +256,8 @@ mcache_create_ext(const char *name, size_t bufsize, static mcache_t * mcache_create_common(const char *name, size_t bufsize, size_t align, mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn, - mcache_notifyfn_t notifyfn, void *arg, u_int32_t flags, int need_zone, - int wait) + mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg, + u_int32_t flags, int need_zone, int wait) { mcache_bkttype_t *btp; mcache_t *cp = NULL; @@ -267,7 +267,7 @@ mcache_create_common(const char *name, size_t bufsize, size_t align, char lck_name[64]; /* If auditing is on and print buffer is NULL, allocate it now */ - if ((flags & MCF_AUDIT) && mca_dump_buf == NULL) { + if ((flags & MCF_DEBUG) && mca_dump_buf == NULL) { int malloc_wait = (wait & MCR_NOSLEEP) ? M_NOWAIT : M_WAITOK; MALLOC(mca_dump_buf, char *, DUMP_MCA_BUF_SIZE, M_TEMP, malloc_wait | M_ZERO); @@ -313,6 +313,7 @@ mcache_create_common(const char *name, size_t bufsize, size_t align, cp->mc_slab_alloc = allocfn; cp->mc_slab_free = freefn; cp->mc_slab_audit = auditfn; + cp->mc_slab_log = logfn; cp->mc_slab_notify = notifyfn; cp->mc_private = need_zone ? 
cp : arg; cp->mc_bufsize = bufsize; @@ -467,6 +468,11 @@ retry_alloc: /* If we got them all, return to caller */ if ((need -= objs) == 0) { MCACHE_UNLOCK(&ccp->cc_lock); + + if (!(cp->mc_flags & MCF_NOLEAKLOG) && + cp->mc_slab_log != NULL) + (*cp->mc_slab_log)(num, *top, TRUE); + if (cp->mc_flags & MCF_DEBUG) goto debug_alloc; @@ -534,11 +540,14 @@ retry_alloc: } } + if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL) + (*cp->mc_slab_log)((num - need), *top, TRUE); + if (!(cp->mc_flags & MCF_DEBUG)) return (num - need); debug_alloc: - if (cp->mc_flags & MCF_VERIFY) { + if (cp->mc_flags & MCF_DEBUG) { mcache_obj_t **o = top; unsigned int n; @@ -561,7 +570,7 @@ debug_alloc: } /* Invoke the slab layer audit callback if auditing is enabled */ - if ((cp->mc_flags & MCF_AUDIT) && cp->mc_slab_audit != NULL) + if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL) (*cp->mc_slab_audit)(cp->mc_private, *top, TRUE); return (num - need); @@ -678,8 +687,11 @@ mcache_free_ext(mcache_t *cp, mcache_obj_t *list) mcache_obj_t *nlist; mcache_bkt_t *bkt; + if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL) + (*cp->mc_slab_log)(0, list, FALSE); + /* Invoke the slab layer audit callback if auditing is enabled */ - if ((cp->mc_flags & MCF_AUDIT) && cp->mc_slab_audit != NULL) + if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL) (*cp->mc_slab_audit)(cp->mc_private, list, FALSE); MCACHE_LOCK(&ccp->cc_lock); @@ -899,7 +911,7 @@ mcache_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait) * the nearest 64-bit multiply; this is because we use * 64-bit memory access to set/check the pattern. */ - if (flags & MCF_AUDIT) { + if (flags & MCF_DEBUG) { VERIFY(((intptr_t)base + rsize) <= ((intptr_t)buf + cp->mc_chunksize)); mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize); @@ -958,7 +970,7 @@ mcache_slab_free(void *arg, mcache_obj_t *list, __unused boolean_t purged) /* Get the original address since we're about to free it */ pbuf = (void **)((intptr_t)base - sizeof (void *)); - if (flags & MCF_AUDIT) { + if (flags & MCF_DEBUG) { VERIFY(((intptr_t)base + rsize) <= ((intptr_t)*pbuf + cp->mc_chunksize)); mcache_audit_free_verify(NULL, base, offset, rsize); @@ -1156,7 +1168,7 @@ mcache_bkt_destroy(mcache_t *cp, mcache_bkttype_t *btp, mcache_bkt_t *bkt, if (nobjs > 0) { mcache_obj_t *top = bkt->bkt_obj[nobjs - 1]; - if (cp->mc_flags & MCF_VERIFY) { + if (cp->mc_flags & MCF_DEBUG) { mcache_obj_t *o = top; int cnt = 0; diff --git a/bsd/kern/netboot.c b/bsd/kern/netboot.c index 6c4b5437e..664f03ef7 100644 --- a/bsd/kern/netboot.c +++ b/bsd/kern/netboot.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2001-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -57,7 +57,8 @@ #include <kern/kern_types.h> #include <kern/kalloc.h> - +#include <sys/netboot.h> +#include <sys/imageboot.h> #include <pexpert/pexpert.h> //#include <libkern/libkern.h> @@ -81,10 +82,6 @@ const void * IOBSDRegistryEntryGetData(void * entry, const char * property_name, int * packet_length); -extern int vndevice_root_image(const char * path, char devname[], - dev_t * dev_p); -extern int di_root_image(const char *path, char devname[], dev_t *dev_p); - #define BOOTP_RESPONSE "bootp-response" #define BSDP_RESPONSE "bsdp-response" #define DHCP_RESPONSE "dhcp-response" @@ -92,16 +89,6 @@ extern int di_root_image(const char *path, char devname[], dev_t *dev_p); /* forward declarations */ int inet_aton(char * cp, struct in_addr * pin); -boolean_t netboot_iaddr(struct in_addr * iaddr_p); -boolean_t netboot_rootpath(struct in_addr * server_ip, - char * name, int name_len, - char * path, int path_len); -int netboot_setup(void); -int netboot_mountroot(void); -int netboot_root(void); - - - #define IP_FORMAT "%d.%d.%d.%d" #define IP_CH(ip) ((u_char *)ip) #define IP_LIST(ip) IP_CH(ip)[0],IP_CH(ip)[1],IP_CH(ip)[2],IP_CH(ip)[3] @@ -125,29 +112,10 @@ struct netboot_info { char * image_path; int image_path_length; NetBootImageType image_type; - boolean_t use_hdix; + char * second_image_path; + int second_image_path_length; }; -int -inet_aton(char * cp, struct in_addr * pin) -{ - u_char * b = (u_char *)pin; - int i; - char * p; - - for (p = cp, i = 0; i < 4; i++) { - u_long l = strtoul(p, 0, 0); - if (l > 255) - return (FALSE); - b[i] = l; - p = strchr(p, '.'); - if (i < 3 && p == NULL) - return (FALSE); - p++; - } - return (TRUE); -} - /* * Function: parse_booter_path * Purpose: @@ -251,7 +219,7 @@ static __inline__ boolean_t parse_netboot_path(char * path, struct in_addr * iaddr_p, char const * * host, char * * mount_dir, char * * image_path) { - static char tmp[MAX_IPv4_STR_LEN]; /* Danger - not thread safe */ + static char tmp[MAX_IPv4_STR_LEN]; /* Danger - not thread safe */ char * start; char * colon; @@ -346,35 +314,46 @@ get_root_path(char * root_path) } +static void +save_path(char * * str_p, int * length_p, char * path) +{ + *length_p = strlen(path) + 1; + *str_p = (char *)kalloc(*length_p); + strlcpy(*str_p, path, *length_p); + return; +} + static struct netboot_info * netboot_info_init(struct in_addr iaddr) { - struct netboot_info * info; + boolean_t have_root_path = FALSE; + struct netboot_info * info = NULL; char * root_path = NULL; - boolean_t use_hdix = TRUE; - char * vndevice = NULL; - - MALLOC_ZONE(vndevice, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); - if (vndevice == NULL) - panic("netboot_info_init: M_NAMEI zone exhausted"); - if (PE_parse_boot_argn("vndevice", vndevice, MAXPATHLEN) == TRUE) { - use_hdix = FALSE; - } - FREE_ZONE(vndevice, MAXPATHLEN, M_NAMEI); info = (struct netboot_info *)kalloc(sizeof(*info)); bzero(info, sizeof(*info)); info->client_ip = iaddr; info->image_type = kNetBootImageTypeUnknown; - info->use_hdix = use_hdix; /* check for a booter-specified path then a NetBoot path */ MALLOC_ZONE(root_path, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); if (root_path == NULL) panic("netboot_info_init: M_NAMEI zone exhausted"); - if (PE_parse_boot_argn("rp", root_path, MAXPATHLEN) == TRUE - || PE_parse_boot_argn("rootpath", root_path, MAXPATHLEN) == TRUE - || get_root_path(root_path) == TRUE) { + if (PE_parse_boot_argn("rp0", root_path, MAXPATHLEN) == TRUE + || PE_parse_boot_argn("rp", root_path, MAXPATHLEN) == TRUE + || 
PE_parse_boot_argn("rootpath", root_path, MAXPATHLEN) == TRUE) { + if (imageboot_format_is_valid(root_path)) { + printf("netboot_info_init: rp0='%s' isn't a network path," + " ignoring\n", root_path); + } + else { + have_root_path = TRUE; + } + } + if (have_root_path == FALSE) { + have_root_path = get_root_path(root_path); + } + if (have_root_path) { const char * server_name = NULL; char * mount_point = NULL; char * image_path = NULL; @@ -391,11 +370,11 @@ netboot_info_init(struct in_addr iaddr) strlcpy(info->server_name, server_name, info->server_name_length); strlcpy(info->mount_point, mount_point, info->mount_point_length); - printf("Server %s Mount %s", + printf("netboot: NFS Server %s Mount %s", server_name, info->mount_point); if (image_path != NULL) { boolean_t needs_slash = FALSE; - + info->image_path_length = strlen(image_path) + 1; if (image_path[0] != '/') { needs_slash = TRUE; @@ -416,16 +395,27 @@ netboot_info_init(struct in_addr iaddr) } else if (strncmp(root_path, kNetBootRootPathPrefixHTTP, strlen(kNetBootRootPathPrefixHTTP)) == 0) { - /* only HDIX supports HTTP */ info->image_type = kNetBootImageTypeHTTP; - info->use_hdix = TRUE; - info->image_path_length = strlen(root_path) + 1; - info->image_path = (char *)kalloc(info->image_path_length); - strlcpy(info->image_path, root_path, info->image_path_length); + save_path(&info->image_path, &info->image_path_length, + root_path); + printf("netboot: HTTP URL %s\n", info->image_path); } else { printf("netboot: root path uses unrecognized format\n"); } + + /* check for image-within-image */ + if (info->image_path != NULL) { + if (PE_parse_boot_argn(IMAGEBOOT_ROOT_ARG, root_path, MAXPATHLEN) + || PE_parse_boot_argn("rp1", root_path, MAXPATHLEN)) { + /* rp1/root-dmg is the second-level image */ + save_path(&info->second_image_path, &info->second_image_path_length, + root_path); + } + } + if (info->second_image_path != NULL) { + printf("netboot: nested image %s\n", info->second_image_path); + } } FREE_ZONE(root_path, MAXPATHLEN, M_NAMEI); return (info); @@ -446,6 +436,9 @@ netboot_info_free(struct netboot_info * * info_p) if (info->image_path) { kfree(info->image_path, info->image_path_length); } + if (info->second_image_path) { + kfree(info->second_image_path, info->second_image_path_length); + } kfree(info, sizeof(*info)); } *info_p = NULL; @@ -565,13 +558,10 @@ route_cmd(int cmd, struct in_addr d, struct in_addr g, mask.sin_len = sizeof(mask); mask.sin_family = AF_INET; mask.sin_addr = m; - lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED); - lck_mtx_lock(rnh_lock); - error = rtrequest_scoped_locked(cmd, (struct sockaddr *)&dst, - (struct sockaddr *)&gw, - (struct sockaddr *)&mask, - flags, NULL, ifscope); - lck_mtx_unlock(rnh_lock); + + error = rtrequest_scoped(cmd, (struct sockaddr *)&dst, + (struct sockaddr *)&gw, (struct sockaddr *)&mask, flags, NULL, ifscope); + return (error); } @@ -751,53 +741,24 @@ failed: int netboot_setup() { - dev_t dev; int error = 0; if (S_netboot_info_p == NULL || S_netboot_info_p->image_path == NULL) { goto done; } - if (S_netboot_info_p->use_hdix) { - printf("netboot_setup: calling di_root_image\n"); - error = di_root_image(S_netboot_info_p->image_path, - (char *)rootdevice, &dev); - if (error) { - printf("netboot_setup: di_root_image: failed %d\n", error); - goto done; - } + printf("netboot_setup: calling imageboot_mount_image\n"); + error = imageboot_mount_image(S_netboot_info_p->image_path, -1); + if (error != 0) { + printf("netboot: failed to mount root image, %d\n", error); } - else { - 
printf("netboot_setup: calling vndevice_root_image\n"); - error = vndevice_root_image(S_netboot_info_p->image_path, - (char *)rootdevice, &dev); - if (error) { - printf("netboot_setup: vndevice_root_image: failed %d\n", error); - goto done; + else if (S_netboot_info_p->second_image_path != NULL) { + error = imageboot_mount_image(S_netboot_info_p->second_image_path, 0); + if (error != 0) { + printf("netboot: failed to mount second root image, %d\n", error); } } - rootdev = dev; - mountroot = NULL; - printf("netboot: root device 0x%x\n", (int32_t)rootdev); - error = vfs_mountroot(); - if (error == 0 && rootvnode != NULL) { - struct vnode *tvp; - struct vnode *newdp; - - /* Get the vnode for '/'. Set fdp->fd_fd.fd_cdir to reference it. */ - if (VFS_ROOT(TAILQ_LAST(&mountlist,mntlist), &newdp, vfs_context_kernel())) - panic("netboot_setup: cannot find root vnode"); - vnode_ref(newdp); - vnode_put(newdp); - tvp = rootvnode; - vnode_rele(tvp); - filedesc0.fd_cdir = newdp; - rootvnode = newdp; - mount_list_lock(); - TAILQ_REMOVE(&mountlist, TAILQ_FIRST(&mountlist), mnt_list); - mount_list_unlock(); - mountlist.tqh_first->mnt_flag |= MNT_ROOTFS; - } + done: netboot_info_free(&S_netboot_info_p); return (error); diff --git a/bsd/kern/policy_check.c b/bsd/kern/policy_check.c new file mode 100644 index 000000000..e5573a99f --- /dev/null +++ b/bsd/kern/policy_check.c @@ -0,0 +1,511 @@ +#include <sys/param.h> +#include <sys/systm.h> /* XXX printf() */ + +#include <sys/types.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/kauth.h> +#include <sys/mount.h> +#include <sys/msg.h> +#include <sys/proc.h> +#include <sys/socketvar.h> +#include <sys/vnode.h> +#include <security/mac.h> +#include <security/mac_policy.h> + +#include <libkern/OSDebug.h> /* OSBPrintBacktrace */ + + +/* forward declaration; see bsd_init.c */ +errno_t check_policy_init(int); +int get_thread_lock_count(thread_t th); /* forced forward */ + +/* + * Policy flags used when the policy is enabled + * + * Note: CHECK_POLICY_CHECK is probably not very useful unless you + * are kernel debugging and set a breakpoint. + */ +#define CHECK_POLICY_CHECK 0x00000001 /* Check on calls */ +#define CHECK_POLICY_FAIL 0x00000002 /* EPERM on fails */ +#define CHECK_POLICY_BACKTRACE 0x00000004 /* Show call stack on fails */ +#define CHECK_POLICY_PANIC 0x00000008 /* Panic on fails */ +#define CHECK_POLICY_PERIODIC 0x00000010 /* Show fails periodically */ + +static int policy_flags = 0; + + +#define CHECK_SET_INT_HOOK(x) .mpo_##x = (mpo_##x##_t *)common_int_hook, +#define CHECK_SET_VOID_HOOK(x) .mpo_##x = (mpo_##x##_t *)common_void_hook, + + +/* + * Init; currently, we only print our arrival notice. + */ +static void +hook_policy_init(struct mac_policy_conf *mpc) +{ + printf("Policy '%s' = '%s' ready\n", mpc->mpc_name, mpc->mpc_fullname); +} + +static void +hook_policy_initbsd(struct mac_policy_conf *mpc) +{ + /* called with policy_grab_exclusive mutex held; exempt */ + printf("hook_policy_initbsd: %s\n", mpc->mpc_name); +} + + +/* Implementation */ +#define CLASS_PERIOD_LIMIT 10000 +#define CLASS_PERIOD_MULT 20 + +static int policy_check_event = 1; +static int policy_check_period = 1; +static int policy_check_next = CLASS_PERIOD_MULT; + + +static int +common_int_hook(void) +{ + int i; + int rv = 0; + + if ((i = get_thread_lock_count(current_thread())) != 0) { + /* + * fail the MACF check if we hold a lock; this assumes a + * a non-void (authorization) MACF hook. 
+ */ + if (policy_flags & CHECK_POLICY_FAIL) + rv = EPERM; + + /* + * display a backtrace if we hold a lock and we are not + * going to panic + */ + if ((policy_flags & (CHECK_POLICY_BACKTRACE | CHECK_POLICY_PANIC)) == CHECK_POLICY_BACKTRACE) { + if (policy_flags & CHECK_POLICY_PERIODIC) { + /* at exponentially increasing intervals */ + if (!(policy_check_event % policy_check_period)) { + if (policy_check_event <= policy_check_next || policy_check_period == CLASS_PERIOD_LIMIT) { + /* + * According to Derek, we could + * technically get a symbolicated name + * here, if we refactored some code + * and set the "keepsyms=1" boot + * argument... + */ + OSReportWithBacktrace("calling MACF hook with mutex count %d (event %d) ", i, policy_check_event); + } + } else { + if (policy_check_period < CLASS_PERIOD_LIMIT) { + policy_check_next *= CLASS_PERIOD_MULT; + policy_check_period *= CLASS_PERIOD_MULT; + } + } + } else { + /* always */ + OSReportWithBacktrace("calling MACF hook with mutex count %d (event %d) ", i, policy_check_event); + } + } + + /* Panic */ + if (policy_flags & CHECK_POLICY_PANIC) + panic("calling MACF hook with mutex count %d\n", i); + + /* count for non-fatal tracing */ + policy_check_event++; + } + + return rv; +} + +static void +common_void_hook(void) +{ + (void)common_int_hook(); + + return; +} + + +/* + * Policy hooks; one per possible hook + */ +static struct mac_policy_ops policy_ops = { + + /* separate init */ + .mpo_policy_init = hook_policy_init, + .mpo_policy_initbsd = hook_policy_initbsd, + + /* operations which return int */ + CHECK_SET_INT_HOOK(audit_check_postselect) + CHECK_SET_INT_HOOK(audit_check_preselect) + CHECK_SET_INT_HOOK(bpfdesc_check_receive) + CHECK_SET_INT_HOOK(cred_check_label_update_execve) + CHECK_SET_INT_HOOK(cred_check_label_update) + CHECK_SET_INT_HOOK(cred_check_visible) + CHECK_SET_INT_HOOK(cred_label_externalize_audit) + CHECK_SET_INT_HOOK(cred_label_externalize) + CHECK_SET_INT_HOOK(cred_label_internalize) + CHECK_SET_INT_HOOK(file_check_change_offset) + CHECK_SET_INT_HOOK(file_check_create) + CHECK_SET_INT_HOOK(file_check_dup) + CHECK_SET_INT_HOOK(file_check_fcntl) + CHECK_SET_INT_HOOK(file_check_get) + CHECK_SET_INT_HOOK(file_check_get_offset) + CHECK_SET_INT_HOOK(file_check_inherit) + CHECK_SET_INT_HOOK(file_check_ioctl) + CHECK_SET_INT_HOOK(file_check_lock) + CHECK_SET_INT_HOOK(file_check_mmap) + CHECK_SET_INT_HOOK(file_check_receive) + CHECK_SET_INT_HOOK(file_check_set) + CHECK_SET_INT_HOOK(ifnet_check_label_update) + CHECK_SET_INT_HOOK(ifnet_check_transmit) + CHECK_SET_INT_HOOK(ifnet_label_externalize) + CHECK_SET_INT_HOOK(ifnet_label_internalize) + CHECK_SET_INT_HOOK(inpcb_check_deliver) + CHECK_SET_INT_HOOK(inpcb_label_init) + CHECK_SET_INT_HOOK(iokit_check_device) + CHECK_SET_INT_HOOK(iokit_check_open) + CHECK_SET_INT_HOOK(iokit_check_set_properties) + CHECK_SET_INT_HOOK(iokit_check_hid_control) + CHECK_SET_INT_HOOK(ipq_label_compare) + CHECK_SET_INT_HOOK(ipq_label_init) + CHECK_SET_INT_HOOK(lctx_check_label_update) + CHECK_SET_INT_HOOK(lctx_label_externalize) + CHECK_SET_INT_HOOK(lctx_label_internalize) + CHECK_SET_INT_HOOK(mbuf_label_init) + CHECK_SET_INT_HOOK(mount_check_fsctl) + CHECK_SET_INT_HOOK(mount_check_getattr) + CHECK_SET_INT_HOOK(mount_check_label_update) + CHECK_SET_INT_HOOK(mount_check_mount) + CHECK_SET_INT_HOOK(mount_check_remount) + CHECK_SET_INT_HOOK(mount_check_setattr) + CHECK_SET_INT_HOOK(mount_check_stat) + CHECK_SET_INT_HOOK(mount_check_umount) + CHECK_SET_INT_HOOK(mount_label_externalize) +
CHECK_SET_INT_HOOK(mount_label_internalize) + CHECK_SET_INT_HOOK(pipe_check_ioctl) + CHECK_SET_INT_HOOK(pipe_check_kqfilter) + CHECK_SET_INT_HOOK(pipe_check_label_update) + CHECK_SET_INT_HOOK(pipe_check_read) + CHECK_SET_INT_HOOK(pipe_check_select) + CHECK_SET_INT_HOOK(pipe_check_stat) + CHECK_SET_INT_HOOK(pipe_check_write) + CHECK_SET_INT_HOOK(pipe_label_externalize) + CHECK_SET_INT_HOOK(pipe_label_internalize) + CHECK_SET_INT_HOOK(policy_syscall) + CHECK_SET_INT_HOOK(port_check_copy_send) + CHECK_SET_INT_HOOK(port_check_hold_receive) + CHECK_SET_INT_HOOK(port_check_hold_send_once) + CHECK_SET_INT_HOOK(port_check_hold_send) + CHECK_SET_INT_HOOK(port_check_label_update) + CHECK_SET_INT_HOOK(port_check_make_send_once) + CHECK_SET_INT_HOOK(port_check_make_send) + CHECK_SET_INT_HOOK(port_check_method) + CHECK_SET_INT_HOOK(port_check_move_receive) + CHECK_SET_INT_HOOK(port_check_move_send_once) + CHECK_SET_INT_HOOK(port_check_move_send) + CHECK_SET_INT_HOOK(port_check_receive) + CHECK_SET_INT_HOOK(port_check_send) + CHECK_SET_INT_HOOK(port_check_service) + CHECK_SET_INT_HOOK(port_label_compute) + CHECK_SET_INT_HOOK(posixsem_check_create) + CHECK_SET_INT_HOOK(posixsem_check_open) + CHECK_SET_INT_HOOK(posixsem_check_post) + CHECK_SET_INT_HOOK(posixsem_check_unlink) + CHECK_SET_INT_HOOK(posixsem_check_wait) + CHECK_SET_INT_HOOK(posixshm_check_create) + CHECK_SET_INT_HOOK(posixshm_check_mmap) + CHECK_SET_INT_HOOK(posixshm_check_open) + CHECK_SET_INT_HOOK(posixshm_check_stat) + CHECK_SET_INT_HOOK(posixshm_check_truncate) + CHECK_SET_INT_HOOK(posixshm_check_unlink) + CHECK_SET_INT_HOOK(priv_check) + /* relative ordinal location of "priv_grant" */ + CHECK_SET_INT_HOOK(proc_check_debug) + CHECK_SET_INT_HOOK(proc_check_fork) + CHECK_SET_INT_HOOK(proc_check_getaudit) + CHECK_SET_INT_HOOK(proc_check_getauid) + CHECK_SET_INT_HOOK(proc_check_getlcid) + CHECK_SET_INT_HOOK(proc_check_map_anon) + CHECK_SET_INT_HOOK(proc_check_mprotect) + CHECK_SET_INT_HOOK(proc_check_sched) + CHECK_SET_INT_HOOK(proc_check_setaudit) + CHECK_SET_INT_HOOK(proc_check_setauid) + CHECK_SET_INT_HOOK(proc_check_setlcid) + CHECK_SET_INT_HOOK(proc_check_signal) + CHECK_SET_INT_HOOK(proc_check_suspend_resume) + CHECK_SET_INT_HOOK(proc_check_wait) + CHECK_SET_INT_HOOK(socket_check_accept) + CHECK_SET_INT_HOOK(socket_check_accepted) + CHECK_SET_INT_HOOK(socket_check_bind) + CHECK_SET_INT_HOOK(socket_check_connect) + CHECK_SET_INT_HOOK(socket_check_create) + CHECK_SET_INT_HOOK(socket_check_deliver) + CHECK_SET_INT_HOOK(socket_check_kqfilter) + CHECK_SET_INT_HOOK(socket_check_label_update) + CHECK_SET_INT_HOOK(socket_check_listen) + CHECK_SET_INT_HOOK(socket_check_receive) + CHECK_SET_INT_HOOK(socket_check_received) + CHECK_SET_INT_HOOK(socket_check_select) + CHECK_SET_INT_HOOK(socket_check_send) + CHECK_SET_INT_HOOK(socket_check_stat) + CHECK_SET_INT_HOOK(socket_check_setsockopt) + CHECK_SET_INT_HOOK(socket_check_getsockopt) + CHECK_SET_INT_HOOK(socket_label_externalize) + CHECK_SET_INT_HOOK(socket_label_init) + CHECK_SET_INT_HOOK(socket_label_internalize) + CHECK_SET_INT_HOOK(socketpeer_label_externalize) + CHECK_SET_INT_HOOK(socketpeer_label_init) + CHECK_SET_INT_HOOK(system_check_acct) + CHECK_SET_INT_HOOK(system_check_audit) + CHECK_SET_INT_HOOK(system_check_auditctl) + CHECK_SET_INT_HOOK(system_check_auditon) + CHECK_SET_INT_HOOK(system_check_chud) + CHECK_SET_INT_HOOK(system_check_host_priv) + CHECK_SET_INT_HOOK(system_check_nfsd) + CHECK_SET_INT_HOOK(system_check_reboot) + CHECK_SET_INT_HOOK(system_check_settime) + 
CHECK_SET_INT_HOOK(system_check_swapoff) + CHECK_SET_INT_HOOK(system_check_swapon) + CHECK_SET_INT_HOOK(system_check_sysctl) + CHECK_SET_INT_HOOK(sysvmsq_check_enqueue) + CHECK_SET_INT_HOOK(sysvmsq_check_msgrcv) + CHECK_SET_INT_HOOK(sysvmsq_check_msgrmid) + CHECK_SET_INT_HOOK(sysvmsq_check_msqctl) + CHECK_SET_INT_HOOK(sysvmsq_check_msqget) + CHECK_SET_INT_HOOK(sysvmsq_check_msqrcv) + CHECK_SET_INT_HOOK(sysvmsq_check_msqsnd) + CHECK_SET_INT_HOOK(sysvsem_check_semctl) + CHECK_SET_INT_HOOK(sysvsem_check_semget) + CHECK_SET_INT_HOOK(sysvsem_check_semop) + CHECK_SET_INT_HOOK(sysvshm_check_shmat) + CHECK_SET_INT_HOOK(sysvshm_check_shmctl) + CHECK_SET_INT_HOOK(sysvshm_check_shmdt) + CHECK_SET_INT_HOOK(sysvshm_check_shmget) + CHECK_SET_INT_HOOK(proc_check_get_task_name) + CHECK_SET_INT_HOOK(proc_check_get_task) + CHECK_SET_INT_HOOK(task_label_externalize) + CHECK_SET_INT_HOOK(task_label_internalize) + CHECK_SET_INT_HOOK(vnode_check_access) + CHECK_SET_INT_HOOK(vnode_check_chdir) + CHECK_SET_INT_HOOK(vnode_check_chroot) + CHECK_SET_INT_HOOK(vnode_check_create) + CHECK_SET_INT_HOOK(vnode_check_deleteextattr) + CHECK_SET_INT_HOOK(vnode_check_exchangedata) + CHECK_SET_INT_HOOK(vnode_check_exec) + CHECK_SET_INT_HOOK(vnode_check_fsgetpath) + CHECK_SET_INT_HOOK(vnode_check_signature) + CHECK_SET_INT_HOOK(vnode_check_getattrlist) + CHECK_SET_INT_HOOK(vnode_check_getextattr) + CHECK_SET_INT_HOOK(vnode_check_ioctl) + CHECK_SET_INT_HOOK(vnode_check_kqfilter) + CHECK_SET_INT_HOOK(vnode_check_label_update) + CHECK_SET_INT_HOOK(vnode_check_link) + CHECK_SET_INT_HOOK(vnode_check_listextattr) + CHECK_SET_INT_HOOK(vnode_check_lookup) + CHECK_SET_INT_HOOK(vnode_check_open) + CHECK_SET_INT_HOOK(vnode_check_read) + CHECK_SET_INT_HOOK(vnode_check_readdir) + CHECK_SET_INT_HOOK(vnode_check_readlink) + CHECK_SET_INT_HOOK(vnode_check_rename_from) + CHECK_SET_INT_HOOK(vnode_check_rename_to) + CHECK_SET_INT_HOOK(vnode_check_revoke) + CHECK_SET_INT_HOOK(vnode_check_searchfs) + CHECK_SET_INT_HOOK(vnode_check_select) + CHECK_SET_INT_HOOK(vnode_check_setattrlist) + CHECK_SET_INT_HOOK(vnode_check_setextattr) + CHECK_SET_INT_HOOK(vnode_check_setflags) + CHECK_SET_INT_HOOK(vnode_check_setmode) + CHECK_SET_INT_HOOK(vnode_check_setowner) + CHECK_SET_INT_HOOK(vnode_check_setutimes) + CHECK_SET_INT_HOOK(vnode_check_stat) + CHECK_SET_INT_HOOK(vnode_check_truncate) + CHECK_SET_INT_HOOK(vnode_check_uipc_bind) + CHECK_SET_INT_HOOK(vnode_check_uipc_connect) + CHECK_SET_INT_HOOK(vnode_check_unlink) + CHECK_SET_INT_HOOK(vnode_check_write) + CHECK_SET_INT_HOOK(vnode_label_associate_extattr) + CHECK_SET_INT_HOOK(vnode_label_externalize_audit) + CHECK_SET_INT_HOOK(vnode_label_externalize) + CHECK_SET_INT_HOOK(vnode_label_internalize) + CHECK_SET_INT_HOOK(vnode_label_store) + CHECK_SET_INT_HOOK(vnode_label_update_extattr) + CHECK_SET_INT_HOOK(vnode_notify_create) + + /* operations which return void */ + CHECK_SET_VOID_HOOK(bpfdesc_label_init) + CHECK_SET_VOID_HOOK(bpfdesc_label_destroy) + CHECK_SET_VOID_HOOK(bpfdesc_label_associate) + CHECK_SET_VOID_HOOK(cred_label_associate_fork) + CHECK_SET_VOID_HOOK(cred_label_associate_kernel) + CHECK_SET_VOID_HOOK(cred_label_associate) + CHECK_SET_VOID_HOOK(cred_label_associate_user) + CHECK_SET_VOID_HOOK(cred_label_destroy) + CHECK_SET_VOID_HOOK(cred_label_init) + CHECK_SET_VOID_HOOK(cred_label_update_execve) + CHECK_SET_VOID_HOOK(cred_label_update) + CHECK_SET_VOID_HOOK(devfs_label_associate_device) + CHECK_SET_VOID_HOOK(devfs_label_associate_directory) + CHECK_SET_VOID_HOOK(devfs_label_copy) + 
CHECK_SET_VOID_HOOK(devfs_label_destroy) + CHECK_SET_VOID_HOOK(devfs_label_init) + CHECK_SET_VOID_HOOK(devfs_label_update) + CHECK_SET_VOID_HOOK(file_check_mmap_downgrade) + CHECK_SET_VOID_HOOK(file_label_associate) + CHECK_SET_VOID_HOOK(file_label_destroy) + CHECK_SET_VOID_HOOK(file_label_init) + CHECK_SET_VOID_HOOK(ifnet_label_associate) + CHECK_SET_VOID_HOOK(ifnet_label_copy) + CHECK_SET_VOID_HOOK(ifnet_label_destroy) + CHECK_SET_VOID_HOOK(ifnet_label_init) + CHECK_SET_VOID_HOOK(ifnet_label_recycle) + CHECK_SET_VOID_HOOK(ifnet_label_update) + CHECK_SET_VOID_HOOK(inpcb_label_associate) + CHECK_SET_VOID_HOOK(inpcb_label_destroy) + CHECK_SET_VOID_HOOK(inpcb_label_recycle) + CHECK_SET_VOID_HOOK(inpcb_label_update) + CHECK_SET_VOID_HOOK(ipq_label_associate) + CHECK_SET_VOID_HOOK(ipq_label_destroy) + CHECK_SET_VOID_HOOK(ipq_label_update) + CHECK_SET_VOID_HOOK(lctx_label_destroy) + CHECK_SET_VOID_HOOK(lctx_label_init) + CHECK_SET_VOID_HOOK(lctx_label_update) + CHECK_SET_VOID_HOOK(lctx_notify_create) + CHECK_SET_VOID_HOOK(lctx_notify_join) + CHECK_SET_VOID_HOOK(lctx_notify_leave) + CHECK_SET_VOID_HOOK(mbuf_label_associate_bpfdesc) + CHECK_SET_VOID_HOOK(mbuf_label_associate_ifnet) + CHECK_SET_VOID_HOOK(mbuf_label_associate_inpcb) + CHECK_SET_VOID_HOOK(mbuf_label_associate_ipq) + CHECK_SET_VOID_HOOK(mbuf_label_associate_linklayer) + CHECK_SET_VOID_HOOK(mbuf_label_associate_multicast_encap) + CHECK_SET_VOID_HOOK(mbuf_label_associate_netlayer) + CHECK_SET_VOID_HOOK(mbuf_label_associate_socket) + CHECK_SET_VOID_HOOK(mbuf_label_copy) + CHECK_SET_VOID_HOOK(mbuf_label_destroy) + CHECK_SET_VOID_HOOK(mount_label_associate) + CHECK_SET_VOID_HOOK(mount_label_destroy) + CHECK_SET_VOID_HOOK(mount_label_init) + CHECK_SET_VOID_HOOK(netinet_fragment) + CHECK_SET_VOID_HOOK(netinet_icmp_reply) + CHECK_SET_VOID_HOOK(netinet_tcp_reply) + CHECK_SET_VOID_HOOK(pipe_label_associate) + CHECK_SET_VOID_HOOK(pipe_label_copy) + CHECK_SET_VOID_HOOK(pipe_label_destroy) + CHECK_SET_VOID_HOOK(pipe_label_init) + CHECK_SET_VOID_HOOK(pipe_label_update) + CHECK_SET_VOID_HOOK(policy_destroy) + /* relative ordinal location of "policy_init" */ + /* relative ordinal location of "policy_initbsd" */ + CHECK_SET_VOID_HOOK(port_label_associate_kernel) + CHECK_SET_VOID_HOOK(port_label_associate) + CHECK_SET_VOID_HOOK(port_label_copy) + CHECK_SET_VOID_HOOK(port_label_destroy) + CHECK_SET_VOID_HOOK(port_label_init) + CHECK_SET_VOID_HOOK(port_label_update_cred) + CHECK_SET_VOID_HOOK(port_label_update_kobject) + CHECK_SET_VOID_HOOK(posixsem_label_associate) + CHECK_SET_VOID_HOOK(posixsem_label_destroy) + CHECK_SET_VOID_HOOK(posixsem_label_init) + CHECK_SET_VOID_HOOK(posixshm_label_associate) + CHECK_SET_VOID_HOOK(posixshm_label_destroy) + CHECK_SET_VOID_HOOK(posixshm_label_init) + CHECK_SET_VOID_HOOK(proc_label_destroy) + CHECK_SET_VOID_HOOK(proc_label_init) + CHECK_SET_VOID_HOOK(socket_label_associate_accept) + CHECK_SET_VOID_HOOK(socket_label_associate) + CHECK_SET_VOID_HOOK(socket_label_copy) + CHECK_SET_VOID_HOOK(socket_label_destroy) + CHECK_SET_VOID_HOOK(socket_label_update) + CHECK_SET_VOID_HOOK(socketpeer_label_associate_mbuf) + CHECK_SET_VOID_HOOK(socketpeer_label_associate_socket) + CHECK_SET_VOID_HOOK(socketpeer_label_destroy) + CHECK_SET_VOID_HOOK(sysvmsg_label_associate) + CHECK_SET_VOID_HOOK(sysvmsg_label_destroy) + CHECK_SET_VOID_HOOK(sysvmsg_label_init) + CHECK_SET_VOID_HOOK(sysvmsg_label_recycle) + CHECK_SET_VOID_HOOK(sysvmsq_label_associate) + CHECK_SET_VOID_HOOK(sysvmsq_label_destroy) + CHECK_SET_VOID_HOOK(sysvmsq_label_init) 
+ CHECK_SET_VOID_HOOK(sysvmsq_label_recycle) + CHECK_SET_VOID_HOOK(sysvsem_label_associate) + CHECK_SET_VOID_HOOK(sysvsem_label_destroy) + CHECK_SET_VOID_HOOK(sysvsem_label_init) + CHECK_SET_VOID_HOOK(sysvsem_label_recycle) + CHECK_SET_VOID_HOOK(sysvshm_label_associate) + CHECK_SET_VOID_HOOK(sysvshm_label_destroy) + CHECK_SET_VOID_HOOK(sysvshm_label_init) + CHECK_SET_VOID_HOOK(sysvshm_label_recycle) + CHECK_SET_VOID_HOOK(task_label_associate_kernel) + CHECK_SET_VOID_HOOK(task_label_associate) + CHECK_SET_VOID_HOOK(task_label_copy) + CHECK_SET_VOID_HOOK(task_label_destroy) + CHECK_SET_VOID_HOOK(task_label_init) + CHECK_SET_VOID_HOOK(task_label_update) + CHECK_SET_VOID_HOOK(vnode_label_associate_devfs) + CHECK_SET_VOID_HOOK(vnode_label_associate_file) + CHECK_SET_VOID_HOOK(vnode_label_associate_pipe) + CHECK_SET_VOID_HOOK(vnode_label_associate_posixsem) + CHECK_SET_VOID_HOOK(vnode_label_associate_posixshm) + CHECK_SET_VOID_HOOK(vnode_label_associate_singlelabel) + CHECK_SET_VOID_HOOK(vnode_label_associate_socket) + CHECK_SET_VOID_HOOK(vnode_label_copy) + CHECK_SET_VOID_HOOK(vnode_label_destroy) + CHECK_SET_VOID_HOOK(vnode_label_init) + CHECK_SET_VOID_HOOK(vnode_label_recycle) + CHECK_SET_VOID_HOOK(vnode_label_update) + CHECK_SET_VOID_HOOK(vnode_notify_rename) + .mpo_reserved12 = common_void_hook, + .mpo_reserved14 = common_void_hook, + .mpo_reserved15 = common_void_hook, + .mpo_reserved16 = common_void_hook, + .mpo_reserved17 = common_void_hook, + .mpo_reserved18 = common_void_hook, + .mpo_reserved19 = common_void_hook, + .mpo_reserved20 = common_void_hook, + .mpo_reserved21 = common_void_hook, + .mpo_reserved22 = common_void_hook, + .mpo_reserved23 = common_void_hook, + .mpo_reserved24 = common_void_hook, + .mpo_reserved25 = common_void_hook, + .mpo_reserved26 = common_void_hook, + .mpo_reserved27 = common_void_hook, + .mpo_reserved28 = common_void_hook, + .mpo_reserved29 = common_void_hook, +}; + +/* + * Policy definition + */ +static struct mac_policy_conf policy_conf = { + .mpc_name = "CHECK", + .mpc_fullname = "Check Assumptions Policy", + .mpc_field_off = NULL, /* no label slot */ + .mpc_labelnames = NULL, /* no policy label names */ + .mpc_labelname_count = 0, /* count of label names is 0 */ + .mpc_ops = &policy_ops, /* policy operations */ + .mpc_loadtime_flags = 0, + .mpc_runtime_flags = 0, +}; + +static mac_policy_handle_t policy_handle; + +/* + * Init routine; for a loadable policy, this would be called during the KEXT + * initialization; we're going to call this from bsd_init() if the boot + * argument for checking is present. + */ +errno_t +check_policy_init(int flags) +{ + /* Only instantiate the module if we have been asked to do checking */ + if (!flags) + return 0; + + policy_flags = flags; + + return mac_policy_register(&policy_conf, &policy_handle, NULL); +} diff --git a/bsd/kern/posix_sem.c b/bsd/kern/posix_sem.c index a2cd627f1..c312d1b84 100644 --- a/bsd/kern/posix_sem.c +++ b/bsd/kern/posix_sem.c @@ -30,7 +30,7 @@ * All Rights Reserved. 
*/ /* - * posix_shm.c : Support for POSIX semaphore APIs + * posix_sem.c : Support for POSIX semaphore APIs * * File: posix_sem.c * Author: Ananthakrishna Ramesh @@ -155,9 +155,9 @@ u_long psemhash; /* size of hash table - 1 */ long psemnument; /* number of cache entries allocated */ long posix_sem_max = 10000; /* tunable for max POSIX semaphores */ /* 10000 limits to ~1M of memory */ -SYSCTL_NODE(_kern, KERN_POSIX, posix, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Posix"); -SYSCTL_NODE(_kern_posix, OID_AUTO, sem, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Semaphores"); -SYSCTL_LONG (_kern_posix_sem, OID_AUTO, max, CTLFLAG_RW, &posix_sem_max, "max"); +SYSCTL_NODE(_kern, KERN_POSIX, posix, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Posix"); +SYSCTL_NODE(_kern_posix, OID_AUTO, sem, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Semaphores"); +SYSCTL_LONG (_kern_posix_sem, OID_AUTO, max, CTLFLAG_RW | CTLFLAG_LOCKED, &posix_sem_max, "max"); struct psemstats psemstats; /* cache effectiveness statistics */ @@ -524,8 +524,8 @@ sem_open(proc_t p, struct sem_open_args *uap, user_addr_t *retval) pinfo->psem_flags = PSEM_DEFINED | PSEM_INCREATE; pinfo->psem_usecount = 1; pinfo->psem_mode = cmode; - pinfo->psem_uid = kauth_cred_getuid(kauth_cred_get()); - pinfo->psem_gid = kauth_cred_get()->cr_gid; + pinfo->psem_uid = kauth_getuid(); + pinfo->psem_gid = kauth_getgid(); bcopy(pnbuf, &pinfo->psem_name[0], PSEMNAMLEN); pinfo->psem_name[PSEMNAMLEN]= 0; pinfo->psem_flags &= ~PSEM_DEFINED; @@ -643,39 +643,14 @@ bad: static int psem_access(struct pseminfo *pinfo, int mode, kauth_cred_t cred) { - mode_t mask; - int is_member; + int mode_req = ((mode & FREAD) ? S_IRUSR : 0) | + ((mode & FWRITE) ? S_IWUSR : 0); /* Otherwise, user id 0 always gets access. */ if (!suser(cred, NULL)) return (0); - mask = 0; - - /* Otherwise, check the owner. */ - if (kauth_cred_getuid(cred) == pinfo->psem_uid) { - if (mode & FREAD) - mask |= S_IRUSR; - if (mode & FWRITE) - mask |= S_IWUSR; - return ((pinfo->psem_mode & mask) == mask ? 0 : EACCES); - } - - /* Otherwise, check the groups. */ - if (kauth_cred_ismember_gid(cred, pinfo->psem_gid, &is_member) == 0 && is_member) { - if (mode & FREAD) - mask |= S_IRGRP; - if (mode & FWRITE) - mask |= S_IWGRP; - return ((pinfo->psem_mode & mask) == mask ? 0 : EACCES); - } - - /* Otherwise, check everyone else. */ - if (mode & FREAD) - mask |= S_IROTH; - if (mode & FWRITE) - mask |= S_IWOTH; - return ((pinfo->psem_mode & mask) == mask ? 
0 : EACCES); + return(posix_cred_access(cred, pinfo->psem_uid, pinfo->psem_gid, pinfo->psem_mode, mode_req)); } int @@ -809,6 +784,7 @@ sem_close(proc_t p, struct sem_close_args *uap, __unused int32_t *retval) proc_fdunlock(p); return(error); } + procfdtbl_markclosefd(p, fd); fileproc_drain(p, fp); fdrelse(p, fd); error = closef_locked(fp, fp->f_fglob, p); diff --git a/bsd/kern/posix_shm.c b/bsd/kern/posix_shm.c index 985538e69..617d1dc9f 100644 --- a/bsd/kern/posix_shm.c +++ b/bsd/kern/posix_shm.c @@ -178,7 +178,7 @@ static int pshm_write (struct fileproc *fp, struct uio *uio, static int pshm_ioctl (struct fileproc *fp, u_long com, caddr_t data, vfs_context_t ctx); static int pshm_select (struct fileproc *fp, int which, void *wql, vfs_context_t ctx); -static int pshm_close(struct pshmnode *pnode); +static int pshm_close(struct pshminfo *pinfo, int dropref); static int pshm_closefile (struct fileglob *fg, vfs_context_t ctx); static int pshm_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx); @@ -190,7 +190,7 @@ static void pshm_cache_delete(struct pshmcache *pcp); static void pshm_cache_purge(void); #endif /* NOT_USED */ static int pshm_cache_search(struct pshminfo **pshmp, struct pshmname *pnp, - struct pshmcache **pcache); + struct pshmcache **pcache, int addref); struct fileops pshmops = { pshm_read, pshm_write, pshm_ioctl, pshm_select, pshm_closefile, pshm_kqfilter, 0 }; @@ -229,7 +229,7 @@ pshm_lock_init( void ) static int pshm_cache_search(struct pshminfo **pshmp, struct pshmname *pnp, - struct pshmcache **pcache) + struct pshmcache **pcache, int addref) { struct pshmcache *pcp, *nnp; struct pshmhashhead *pcpp; @@ -258,6 +258,8 @@ pshm_cache_search(struct pshminfo **pshmp, struct pshmname *pnp, /* TOUCH(ncp); */ *pshmp = pcp->pshminfo; *pcache = pcp; + if (addref) + pcp->pshminfo->pshm_usecount++; return (-1); } @@ -287,7 +289,7 @@ pshm_cache_add(struct pshminfo *pshmp, struct pshmname *pnp, struct pshmcache *p /* if the entry has already been added by some one else return */ - if (pshm_cache_search(&dpinfo, pnp, &dpcp) == -1) { + if (pshm_cache_search(&dpinfo, pnp, &dpcp, 0) == -1) { return(EEXIST); } pshmnument++; @@ -438,6 +440,14 @@ shm_open(proc_t p, struct shm_open_args *uap, int32_t *retval) if (error) goto bad; + cmode &= ALLPERMS; + + fmode = FFLAGS(uap->oflag); + if ((fmode & (FREAD | FWRITE)) == 0) { + error = EINVAL; + goto bad; + } + /* * We allocate a new entry if we are less than the maximum * allowed and the one at the front of the LRU list is in use. @@ -466,27 +476,42 @@ shm_open(proc_t p, struct shm_open_args *uap, int32_t *retval) PSHM_SUBSYS_LOCK(); - error = pshm_cache_search(&pinfo, &nd, &pcache); + /* + * If we find the entry in the cache, this will take a reference, + * allowing us to unlock it for the permissions check. 
+ */ + error = pshm_cache_search(&pinfo, &nd, &pcache, 1); + + PSHM_SUBSYS_UNLOCK(); if (error == ENOENT) { error = EINVAL; - goto bad_locked; - + goto bad; } + if (!error) { incache = 0; - } else + if (fmode & O_CREAT) { + /* create a new one (commit the allocation) */ + pinfo = new_pinfo; + pinfo->pshm_flags = PSHM_DEFINED | PSHM_INCREATE; + pinfo->pshm_usecount = 1; /* existence reference */ + pinfo->pshm_mode = cmode; + pinfo->pshm_uid = kauth_getuid(); + pinfo->pshm_gid = kauth_getgid(); + bcopy(pnbuf, &pinfo->pshm_name[0], PSHMNAMLEN); + pinfo->pshm_name[PSHMNAMLEN]=0; +#if CONFIG_MACF + error = mac_posixshm_check_create(kauth_cred_get(), nameptr); + if (error) { + goto bad; + } + mac_posixshm_label_associate(kauth_cred_get(), pinfo, nameptr); +#endif + } + } else { incache = 1; - fmode = FFLAGS(uap->oflag); - if ((fmode & (FREAD | FWRITE))==0) { - error = EINVAL; - goto bad_locked; - } - - cmode &= ALLPERMS; - - if (fmode & O_CREAT) { - if (incache) { + if (fmode & O_CREAT) { /* already exists */ if ((fmode & O_EXCL)) { AUDIT_ARG(posix_ipc_perm, pinfo->pshm_uid, @@ -495,65 +520,53 @@ shm_open(proc_t p, struct shm_open_args *uap, int32_t *retval) /* shm obj exists and opened O_EXCL */ error = EEXIST; - goto bad_locked; + goto bad; } if( pinfo->pshm_flags & PSHM_INDELETE) { error = ENOENT; - goto bad_locked; + goto bad; } AUDIT_ARG(posix_ipc_perm, pinfo->pshm_uid, pinfo->pshm_gid, pinfo->pshm_mode); #if CONFIG_MACF if ((error = mac_posixshm_check_open(kauth_cred_get(), pinfo))) { - goto bad_locked; + goto bad; } #endif if ( (error = pshm_access(pinfo, fmode, kauth_cred_get(), p)) ) { - goto bad_locked; - } - } else { - /* create a new one (commit the allocation) */ - pinfo = new_pinfo; - pinfo->pshm_flags = PSHM_DEFINED | PSHM_INCREATE; - pinfo->pshm_usecount = 1; /* existence reference */ - pinfo->pshm_mode = cmode; - pinfo->pshm_uid = kauth_cred_getuid(kauth_cred_get()); - pinfo->pshm_gid = kauth_cred_get()->cr_gid; - bcopy(pnbuf, &pinfo->pshm_name[0], PSHMNAMLEN); - pinfo->pshm_name[PSHMNAMLEN]=0; -#if CONFIG_MACF - error = mac_posixshm_check_create(kauth_cred_get(), nameptr); - if (error) { - goto bad_locked; + goto bad; } - mac_posixshm_label_associate(kauth_cred_get(), pinfo, nameptr); -#endif } - } else { + } + if (!(fmode & O_CREAT)) { if (!incache) { /* O_CREAT is not set and the object does not exist */ error = ENOENT; - goto bad_locked; + goto bad; } if( pinfo->pshm_flags & PSHM_INDELETE) { error = ENOENT; - goto bad_locked; + goto bad; } #if CONFIG_MACF if ((error = mac_posixshm_check_open(kauth_cred_get(), pinfo))) { - goto bad_locked; + goto bad; } #endif if ((error = pshm_access(pinfo, fmode, kauth_cred_get(), p))) { - goto bad_locked; + goto bad; } } if (fmode & O_TRUNC) { error = EINVAL; - goto bad_locked; + goto bad; } + + + PSHM_SUBSYS_LOCK(); + #if DIAGNOSTIC if (fmode & FWRITE) pinfo->pshm_writecount++; @@ -565,9 +578,13 @@ shm_open(proc_t p, struct shm_open_args *uap, int32_t *retval) if ( (error = pshm_cache_add(pinfo, &nd, pcp)) ) { goto bad_locked; } + /* + * add reference for the new entry; otherwise, we obtained + * one from the cache hit earlier. 
+ */ + pinfo->pshm_usecount++; } pinfo->pshm_flags &= ~PSHM_INCREATE; - pinfo->pshm_usecount++; /* extra reference for the new fd */ new_pnode->pinfo = pinfo; PSHM_SUBSYS_UNLOCK(); @@ -604,6 +621,17 @@ shm_open(proc_t p, struct shm_open_args *uap, int32_t *retval) bad_locked: PSHM_SUBSYS_UNLOCK(); bad: + /* + * If we obtained the entry from the cache, we need to drop the + * reference; holding the reference may have prevented unlinking, + * so we need to call pshm_close() to get the full effect. + */ + if (incache) { + PSHM_SUBSYS_LOCK(); + pshm_close(pinfo, 1); + PSHM_SUBSYS_UNLOCK(); + } + if (pcp != NULL) FREE(pcp, M_SHM); @@ -633,7 +661,8 @@ pshm_truncate(__unused proc_t p, struct fileproc *fp, __unused int fd, struct pshmnode * pnode ; kern_return_t kret; mem_entry_name_port_t mem_object; - mach_vm_size_t size, total_size, alloc_size; + mach_vm_size_t total_size, alloc_size; + memory_object_size_t mosize; struct pshmobj *pshmobj, *pshmobj_next, **pshmobj_next_p; #if CONFIG_MACF int error; @@ -658,7 +687,7 @@ pshm_truncate(__unused proc_t p, struct fileproc *fp, __unused int fd, return(EINVAL); } #if CONFIG_MACF - error = mac_posixshm_check_truncate(kauth_cred_get(), pinfo, size); + error = mac_posixshm_check_truncate(kauth_cred_get(), pinfo, length); if (error) { PSHM_SUBSYS_UNLOCK(); return(error); @@ -671,14 +700,14 @@ pshm_truncate(__unused proc_t p, struct fileproc *fp, __unused int fd, for (alloc_size = 0; alloc_size < total_size; - alloc_size += size) { + alloc_size += mosize) { PSHM_SUBSYS_UNLOCK(); - size = MIN(total_size - alloc_size, ANON_MAX_SIZE); + mosize = MIN(total_size - alloc_size, ANON_MAX_SIZE); kret = mach_make_memory_entry_64( VM_MAP_NULL, - &size, + &mosize, 0, MAP_MEM_NAMED_CREATE | VM_PROT_DEFAULT, &mem_object, @@ -699,7 +728,7 @@ pshm_truncate(__unused proc_t p, struct fileproc *fp, __unused int fd, PSHM_SUBSYS_LOCK(); pshmobj->pshmo_memobject = (void *) mem_object; - pshmobj->pshmo_size = size; + pshmobj->pshmo_size = mosize; pshmobj->pshmo_next = NULL; *pshmobj_next_p = pshmobj; @@ -787,39 +816,14 @@ pshm_stat(struct pshmnode *pnode, void *ub, int isstat64) int pshm_access(struct pshminfo *pinfo, int mode, kauth_cred_t cred, __unused proc_t p) { - mode_t mask; - int is_member; + int mode_req = ((mode & FREAD) ? S_IRUSR : 0) | + ((mode & FWRITE) ? S_IWUSR : 0); /* Otherwise, user id 0 always gets access. */ if (!suser(cred, NULL)) return (0); - mask = 0; - - /* Otherwise, check the owner. */ - if (kauth_cred_getuid(cred) == pinfo->pshm_uid) { - if (mode & FREAD) - mask |= S_IRUSR; - if (mode & FWRITE) - mask |= S_IWUSR; - return ((pinfo->pshm_mode & mask) == mask ? 0 : EACCES); - } - - /* Otherwise, check the groups. */ - if (kauth_cred_ismember_gid(cred, pinfo->pshm_gid, &is_member) == 0 && is_member) { - if (mode & FREAD) - mask |= S_IRGRP; - if (mode & FWRITE) - mask |= S_IWGRP; - return ((pinfo->pshm_mode & mask) == mask ? 0 : EACCES); - } - - /* Otherwise, check everyone else. */ - if (mode & FREAD) - mask |= S_IROTH; - if (mode & FWRITE) - mask |= S_IWOTH; - return ((pinfo->pshm_mode & mask) == mask ? 
0 : EACCES); + return(posix_cred_access(cred, pinfo->pshm_uid, pinfo->pshm_gid, pinfo->pshm_mode, mode_req)); } int @@ -1051,7 +1055,7 @@ shm_unlink(__unused proc_t p, struct shm_unlink_args *uap, } PSHM_SUBSYS_LOCK(); - error = pshm_cache_search(&pinfo, &nd, &pcache); + error = pshm_cache_search(&pinfo, &nd, &pcache, 0); if (error == ENOENT) { PSHM_SUBSYS_UNLOCK(); @@ -1132,16 +1136,16 @@ bad: /* already called locked */ static int -pshm_close(struct pshmnode *pnode) +pshm_close(struct pshminfo *pinfo, int dropref) { - int error=0; - struct pshminfo *pinfo; + int error = 0; struct pshmobj *pshmobj, *pshmobj_next; - if ((pinfo = pnode->pinfo) == PSHMINFO_NULL) - return(EINVAL); - - if ((pinfo->pshm_flags & PSHM_ALLOCATED) != PSHM_ALLOCATED) { + /* + * If we are dropping the reference we took on the cache object, don't + * enforce the allocation requirement. + */ + if ( !dropref && ((pinfo->pshm_flags & PSHM_ALLOCATED) != PSHM_ALLOCATED)) { return(EINVAL); } #if DIAGNOSTIC @@ -1170,7 +1174,6 @@ pshm_close(struct pshmnode *pnode) PSHM_SUBSYS_LOCK(); FREE(pinfo,M_SHM); } - FREE(pnode, M_SHM); return (error); } @@ -1178,11 +1181,20 @@ pshm_close(struct pshmnode *pnode) static int pshm_closefile(struct fileglob *fg, __unused vfs_context_t ctx) { - int error; + int error = EINVAL; + struct pshmnode *pnode; PSHM_SUBSYS_LOCK(); - error = pshm_close(((struct pshmnode *)fg->fg_data)); + + if ((pnode = (struct pshmnode *)fg->fg_data) != NULL) { + if (pnode->pinfo != PSHMINFO_NULL) { + error = pshm_close(pnode->pinfo, 0); + } + FREE(pnode, M_SHM); + } + PSHM_SUBSYS_UNLOCK(); + return(error); } diff --git a/bsd/kern/proc_info.c b/bsd/kern/proc_info.c index d13a2df81..a907fad59 100644 --- a/bsd/kern/proc_info.c +++ b/bsd/kern/proc_info.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2005, 2010 Apple Computer, Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -57,6 +57,7 @@ #include <kern/task.h> #include <kern/lock.h> #include <kern/kalloc.h> +#include <kern/assert.h> #include <vm/vm_kern.h> #include <vm/vm_map.h> #include <mach/host_info.h> @@ -75,6 +76,8 @@ #include <machine/machine_routines.h> +#include <kern/ipc_misc.h> + #include <vm/vm_protos.h> struct pshmnode; @@ -92,10 +95,12 @@ int proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t int proc_pidfdinfo(int pid, int flavor,int fd, user_addr_t buffer, uint32_t buffersize, int32_t * retval); int proc_kernmsgbuf(user_addr_t buffer, uint32_t buffersize, int32_t * retval); int proc_setcontrol(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t buffersize, int32_t * retval); +int proc_pidfileportinfo(int pid, int flavor, mach_port_name_t name, user_addr_t buffer, uint32_t buffersize, int32_t *retval); /* protos for procpidinfo calls */ int proc_pidfdlist(proc_t p, user_addr_t buffer, uint32_t buffersize, int32_t *retval); int proc_pidbsdinfo(proc_t p, struct proc_bsdinfo *pbsd, int zombie); +int proc_pidshortbsdinfo(proc_t p, struct proc_bsdshortinfo *pbsd_shortp, int zombie); int proc_pidtaskinfo(proc_t p, struct proc_taskinfo *ptinfo); int proc_pidallinfo(proc_t p, int flavor, uint64_t arg, user_addr_t buffer, uint32_t buffersize, int32_t *retval); int proc_pidthreadinfo(proc_t p, uint64_t arg, struct proc_threadinfo *pthinfo); @@ -106,6 +111,7 @@ int proc_pidregionpathinfo(proc_t p, uint64_t arg, user_addr_t buffer, uint32_t int proc_pidvnodepathinfo(proc_t p, uint64_t arg, user_addr_t buffer, uint32_t buffersize, int32_t *retval); int proc_pidpathinfo(proc_t p, uint64_t arg, user_addr_t buffer, uint32_t buffersize, int32_t *retval); int proc_pidworkqueueinfo(proc_t p, struct proc_workqueueinfo *pwqinfo); +int proc_pidfileportlist(proc_t p, user_addr_t buffer, uint32_t buffersize, int32_t *retval); /* protos for proc_pidfdinfo calls */ @@ -161,7 +167,9 @@ proc_info_internal(int callnum, int pid, int flavor, uint64_t arg, user_addr_t b return(proc_kernmsgbuf(buffer, buffersize, retval)); case 5: /* set on self properties proc_setcontrol */ return(proc_setcontrol(pid, flavor, arg, buffer, buffersize, retval)); - + case 6: /* proc_pidfileportinfo */ + return(proc_pidfileportinfo(pid, flavor, (mach_port_name_t)arg, buffer, buffersize, retval)); + default: return(EINVAL); } @@ -180,6 +188,7 @@ proc_listpids(uint32_t type, uint32_t typeinfo, user_addr_t buffer, uint32_t bu struct proc * p; struct tty * tp; int error = 0; + struct proclist *current_list; /* if the buffer is null, return num of procs */ if (buffer == (user_addr_t)0) { @@ -205,13 +214,20 @@ proc_listpids(uint32_t type, uint32_t typeinfo, user_addr_t buffer, uint32_t bu n = 0; ptr = (int *)kbuf; - LIST_FOREACH(p, &allproc, p_list) { + current_list = &allproc; +proc_loop: + LIST_FOREACH(p, current_list, p_list) { skip = 0; switch (type) { case PROC_PGRP_ONLY: if (p->p_pgrpid != (pid_t)typeinfo) skip = 1; break; + case PROC_PPID_ONLY: + if ((p->p_ppid != (pid_t)typeinfo) && (((p->p_lflag & P_LTRACED) == 0) || (p->p_oppid != (pid_t)typeinfo))) + skip = 1; + break; + case PROC_ALL_PIDS: skip = 0; break; @@ -245,7 +261,7 @@ proc_listpids(uint32_t type, uint32_t typeinfo, user_addr_t buffer, uint32_t bu uid_t uid; my_cred = kauth_cred_proc_ref(p); - uid = my_cred->cr_ruid; + uid = kauth_cred_getruid(my_cred); kauth_cred_unref(&my_cred); if (uid != (uid_t)typeinfo) skip = 1; @@ -256,11 +272,6 @@ proc_listpids(uint32_t type, uint32_t typeinfo, 
user_addr_t buffer, uint32_t bu break; }; - /* Do we have permission to look into this ? */ - if (proc_security_policy(p) != 0) { - skip = 1; - } - if(skip == 0) { *ptr++ = p->p_pid; n++; @@ -269,15 +280,10 @@ proc_listpids(uint32_t type, uint32_t typeinfo, user_addr_t buffer, uint32_t bu break; } - if (n < numprocs) { - LIST_FOREACH(p, &zombproc, p_list) { - *ptr++ = p->p_pid; - n++; - if (n >= numprocs) - break; - } + if ((n < numprocs) && (current_list == &allproc)) { + current_list = &zombproc; + goto proc_loop; } - proc_list_unlock(); @@ -345,6 +351,119 @@ proc_pidfdlist(proc_t p, user_addr_t buffer, uint32_t buffersize, int32_t *retv return(error); } +/* + * Helper functions for proc_pidfileportlist. + */ +static int +proc_fileport_count(__unused mach_port_name_t name, + __unused struct fileglob *fg, void *arg) +{ + uint32_t *counter = arg; + + *counter += 1; + return (0); +} + +struct fileport_fdtype_args { + struct proc_fileportinfo *ffa_pfi; + struct proc_fileportinfo *ffa_pfi_end; +}; + +static int +proc_fileport_fdtype(mach_port_name_t name, struct fileglob *fg, void *arg) +{ + struct fileport_fdtype_args *ffa = arg; + + if (ffa->ffa_pfi != ffa->ffa_pfi_end) { + ffa->ffa_pfi->proc_fdtype = fg->fg_type; + ffa->ffa_pfi->proc_fileport = name; + ffa->ffa_pfi++; + return (0); /* keep walking */ + } else + return (-1); /* stop the walk! */ +} + +int +proc_pidfileportlist(proc_t p, + user_addr_t buffer, uint32_t buffersize, int32_t *retval) +{ + void *kbuf; + vm_size_t kbufsize; + struct proc_fileportinfo *pfi; + uint32_t needfileports, numfileports; + struct fileport_fdtype_args ffa; + int error; + + needfileports = buffersize / sizeof (*pfi); + if ((user_addr_t)0 == buffer || needfileports > (uint32_t)maxfiles) { + /* + * Either (i) the user is asking for a fileport count, + * or (ii) the number of fileports they're asking for is + * larger than the maximum number of open files (!); count + * them to bound subsequent heap allocations. 
+ */ + numfileports = 0; + switch (fileport_walk(p->task, + proc_fileport_count, &numfileports)) { + case KERN_SUCCESS: + break; + case KERN_RESOURCE_SHORTAGE: + return (ENOMEM); + case KERN_INVALID_TASK: + return (ESRCH); + default: + return (EINVAL); + } + + if (numfileports == 0) { + *retval = 0; /* none at all, bail */ + return (0); + } + if ((user_addr_t)0 == buffer) { + numfileports += 20; /* accelerate convergence */ + *retval = numfileports * sizeof (*pfi); + return (0); + } + if (needfileports > numfileports) + needfileports = numfileports; + } + + assert(buffersize >= PROC_PIDLISTFILEPORTS_SIZE); + + kbufsize = (vm_size_t)needfileports * sizeof (*pfi); + pfi = kbuf = kalloc(kbufsize); + if (kbuf == NULL) + return (ENOMEM); + bzero(kbuf, kbufsize); + + ffa.ffa_pfi = pfi; + ffa.ffa_pfi_end = pfi + needfileports; + + switch (fileport_walk(p->task, proc_fileport_fdtype, &ffa)) { + case KERN_SUCCESS: + error = 0; + pfi = ffa.ffa_pfi; + if ((numfileports = pfi - (typeof(pfi))kbuf) == 0) + break; + if (numfileports > needfileports) + panic("more fileports returned than requested"); + error = copyout(kbuf, buffer, numfileports * sizeof (*pfi)); + break; + case KERN_RESOURCE_SHORTAGE: + error = ENOMEM; + break; + case KERN_INVALID_TASK: + error = ESRCH; + break; + default: + error = EINVAL; + break; + } + kfree(kbuf, kbufsize); + if (error == 0) + *retval = numfileports * sizeof (*pfi); + return (error); +} int proc_pidbsdinfo(proc_t p, struct proc_bsdinfo * pbsd, int zombie) @@ -363,19 +482,21 @@ proc_pidbsdinfo(proc_t p, struct proc_bsdinfo * pbsd, int zombie) pbsd->pbi_xstatus = p->p_xstat; pbsd->pbi_pid = p->p_pid; pbsd->pbi_ppid = p->p_ppid; - pbsd->pbi_uid = my_cred->cr_uid; - pbsd->pbi_gid = my_cred->cr_gid; - pbsd->pbi_ruid = my_cred->cr_ruid; - pbsd->pbi_rgid = my_cred->cr_rgid; - pbsd->pbi_svuid = my_cred->cr_svuid; - pbsd->pbi_svgid = my_cred->cr_svgid; + pbsd->pbi_uid = kauth_cred_getuid(my_cred); + pbsd->pbi_gid = kauth_cred_getgid(my_cred); + pbsd->pbi_ruid = kauth_cred_getruid(my_cred); + pbsd->pbi_rgid = kauth_cred_getrgid(my_cred); + pbsd->pbi_svuid = kauth_cred_getsvuid(my_cred); + pbsd->pbi_svgid = kauth_cred_getsvgid(my_cred); kauth_cred_unref(&my_cred); pbsd->pbi_nice = p->p_nice; pbsd->pbi_start_tvsec = p->p_start.tv_sec; pbsd->pbi_start_tvusec = p->p_start.tv_usec; - bcopy(&p->p_comm, &pbsd->pbi_comm[0], MAXCOMLEN-1); - bcopy(&p->p_name, &pbsd->pbi_name[0], 2*MAXCOMLEN-1); + bcopy(&p->p_comm, &pbsd->pbi_comm[0], MAXCOMLEN); + pbsd->pbi_comm[MAXCOMLEN - 1] = '\0'; + bcopy(&p->p_name, &pbsd->pbi_name[0], 2*MAXCOMLEN); + pbsd->pbi_name[(2*MAXCOMLEN) - 1] = '\0'; pbsd->pbi_flags = 0; if ((p->p_flag & P_SYSTEM) == P_SYSTEM) @@ -392,6 +513,10 @@ proc_pidbsdinfo(proc_t p, struct proc_bsdinfo * pbsd, int zombie) pbsd->pbi_flags |= PROC_FLAG_CONTROLT; if ((p->p_flag & P_THCWD) == P_THCWD) pbsd->pbi_flags |= PROC_FLAG_THCWD; + if ((p->p_flag & P_SUGID) == P_SUGID) + pbsd->pbi_flags |= PROC_FLAG_PSUGID; + if ((p->p_flag & P_EXEC) == P_EXEC) + pbsd->pbi_flags |= PROC_FLAG_EXEC; if (sessionp != SESSION_NULL) { if (SESS_LEADER(p, sessionp)) @@ -422,6 +547,10 @@ proc_pidbsdinfo(proc_t p, struct proc_bsdinfo * pbsd, int zombie) break; }; + /* if process is a zombie skip bg state */ + if ((zombie == 0) && (p->p_stat != SZOMB) && (p->task != TASK_NULL)) + proc_get_darwinbgstate(p->task, &pbsd->pbi_flags); + if (zombie == 0) pbsd->pbi_nfiles = p->p_fd->fd_nfiles; if (pg != PGRP_NULL) { @@ -441,6 +570,72 @@ proc_pidbsdinfo(proc_t p, struct proc_bsdinfo * pbsd, int zombie) } +int 
+proc_pidshortbsdinfo(proc_t p, struct proc_bsdshortinfo * pbsd_shortp, int zombie) +{ + bzero(pbsd_shortp, sizeof(struct proc_bsdshortinfo)); + pbsd_shortp->pbsi_pid = p->p_pid; + pbsd_shortp->pbsi_ppid = p->p_ppid; + pbsd_shortp->pbsi_pgid = p->p_pgrpid; + pbsd_shortp->pbsi_status = p->p_stat; + bcopy(&p->p_comm, &pbsd_shortp->pbsi_comm[0], MAXCOMLEN); + pbsd_shortp->pbsi_comm[MAXCOMLEN - 1] = '\0'; + + pbsd_shortp->pbsi_flags = 0; + if ((p->p_flag & P_SYSTEM) == P_SYSTEM) + pbsd_shortp->pbsi_flags |= PROC_FLAG_SYSTEM; + if ((p->p_lflag & P_LTRACED) == P_LTRACED) + pbsd_shortp->pbsi_flags |= PROC_FLAG_TRACED; + if ((p->p_lflag & P_LEXIT) == P_LEXIT) + pbsd_shortp->pbsi_flags |= PROC_FLAG_INEXIT; + if ((p->p_lflag & P_LPPWAIT) == P_LPPWAIT) + pbsd_shortp->pbsi_flags |= PROC_FLAG_PPWAIT; + if ((p->p_flag & P_LP64) == P_LP64) + pbsd_shortp->pbsi_flags |= PROC_FLAG_LP64; + if ((p->p_flag & P_CONTROLT) == P_CONTROLT) + pbsd_shortp->pbsi_flags |= PROC_FLAG_CONTROLT; + if ((p->p_flag & P_THCWD) == P_THCWD) + pbsd_shortp->pbsi_flags |= PROC_FLAG_THCWD; + if ((p->p_flag & P_SUGID) == P_SUGID) + pbsd_shortp->pbsi_flags |= PROC_FLAG_PSUGID; + if ((p->p_flag & P_EXEC) == P_EXEC) + pbsd_shortp->pbsi_flags |= PROC_FLAG_EXEC; + + switch(PROC_CONTROL_STATE(p)) { + case P_PCTHROTTLE: + pbsd_shortp->pbsi_flags |= PROC_FLAG_PC_THROTTLE; + break; + case P_PCSUSP: + pbsd_shortp->pbsi_flags |= PROC_FLAG_PC_SUSP; + break; + case P_PCKILL: + pbsd_shortp->pbsi_flags |= PROC_FLAG_PC_KILL; + break; + }; + + switch(PROC_ACTION_STATE(p)) { + case P_PCTHROTTLE: + pbsd_shortp->pbsi_flags |= PROC_FLAG_PA_THROTTLE; + break; + case P_PCSUSP: + pbsd_shortp->pbsi_flags |= PROC_FLAG_PA_SUSP; + break; + }; + + /* if process is a zombie skip bg state */ + if ((zombie == 0) && (p->p_stat != SZOMB) && (p->task != TASK_NULL)) + proc_get_darwinbgstate(p->task, &pbsd_shortp->pbsi_flags); + + pbsd_shortp->pbsi_uid = p->p_uid; + pbsd_shortp->pbsi_gid = p->p_gid; + pbsd_shortp->pbsi_ruid = p->p_ruid; + pbsd_shortp->pbsi_rgid = p->p_rgid; + pbsd_shortp->pbsi_svuid = p->p_svuid; + pbsd_shortp->pbsi_svgid = p->p_svgid; + + return(0); +} + int proc_pidtaskinfo(proc_t p, struct proc_taskinfo * ptinfo) { @@ -739,7 +934,7 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu int error = ENOTSUP; int gotref = 0; int findzomb = 0; - int refheld = 0; + int refheld = 0, shortversion = 0; uint32_t size; int zombie = 0; @@ -786,6 +981,14 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu else size = PROC_PIDWORKQUEUEINFO_SIZE; break; + case PROC_PIDT_SHORTBSDINFO: + size = PROC_PIDT_SHORTBSDINFO_SIZE; + break; + case PROC_PIDLISTFILEPORTS: + size = PROC_PIDLISTFILEPORTS_SIZE; + if (buffer == (user_addr_t)0) + size = 0; + break; default: return(EINVAL); } @@ -797,7 +1000,7 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu return(EOVERFLOW); } - if ((flavor != PROC_PIDTBSDINFO) && (flavor != PROC_PIDPATHINFO)) { + if ((flavor != PROC_PIDTBSDINFO) && (flavor != PROC_PIDPATHINFO) && (flavor != PROC_PIDT_SHORTBSDINFO)) { if ((p = proc_find(pid)) == PROC_NULL) { error = ESRCH; goto out; @@ -816,8 +1019,11 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu } break; + case PROC_PIDT_SHORTBSDINFO: + shortversion = 1; case PROC_PIDTBSDINFO: { struct proc_bsdinfo pbsd; + struct proc_bsdshortinfo pbsd_short; zombie = 0; if (arg) @@ -825,27 +1031,45 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu p = 
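Note the deliberate fall-through here: PROC_PIDT_SHORTBSDINFO merely sets shortversion = 1 and reuses the PROC_PIDTBSDINFO code path, and (as the permission check below shows) the short flavor is exempt from proc_security_policy(), so unprivileged tools can enumerate basic process state. A hedged sketch of the caller's side, assuming the flavor and struct proc_bsdshortinfo are exported through <libproc.h>/<sys/proc_info.h>:

    #include <sys/types.h>
    #include <libproc.h>
    #include <stdio.h>

    static void show_short_info(pid_t pid)
    {
        struct proc_bsdshortinfo bsi;

        /* arg == 0: do not search the zombie list (mirrors findzomb). */
        int n = proc_pidinfo(pid, PROC_PIDT_SHORTBSDINFO, 0, &bsi, sizeof(bsi));
        if (n == (int)sizeof(bsi))
            printf("pid %u comm %s ppid %u pgid %u uid %u\n",
                (unsigned)bsi.pbsi_pid, bsi.pbsi_comm,
                (unsigned)bsi.pbsi_ppid, (unsigned)bsi.pbsi_pgid,
                (unsigned)bsi.pbsi_uid);
    }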
proc_find(pid); if (p == PROC_NULL) { if (findzomb) - p = pzfind(pid); + p = proc_find_zombref(pid); if (p == NULL) { error = ESRCH; goto out; } zombie = 1; - } else - refheld = 1; + } + refheld = 1; /* Do we have permission to look into this ? */ - if ((error = proc_security_policy(p)) != 0) { - if (refheld != 0) - proc_rele(p); + if ((flavor != PROC_PIDT_SHORTBSDINFO) && ((error = proc_security_policy(p)) != 0)) { + if (refheld != 0) { + if (zombie != 0) + proc_drop_zombref(p); + else + proc_rele(p); + } goto out; } - error = proc_pidbsdinfo(p, &pbsd, zombie); - if (refheld != 0) - proc_rele(p); + if (shortversion != 0) { + error = proc_pidshortbsdinfo(p, &pbsd_short, zombie); + } else { + error = proc_pidbsdinfo(p, &pbsd, zombie); + } + if (refheld != 0) { + if (zombie != 0) + proc_drop_zombref(p); + else + proc_rele(p); + } if (error == 0) { - error = copyout(&pbsd, buffer, sizeof(struct proc_bsdinfo)); - if (error == 0) - *retval = sizeof(struct proc_bsdinfo); + if (shortversion != 0) { + error = copyout(&pbsd_short, buffer, sizeof(struct proc_bsdshortinfo)); + if (error == 0) + *retval = sizeof(struct proc_bsdshortinfo); + } else { + error = copyout(&pbsd, buffer, sizeof(struct proc_bsdinfo)); + if (error == 0) + *retval = sizeof(struct proc_bsdinfo); + } } } break; @@ -945,6 +1169,12 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t bu } break; + case PROC_PIDLISTFILEPORTS: { + error = proc_pidfileportlist(p, buffer, buffersize, + retval); + } + break; + default: error = ENOTSUP; } @@ -1297,9 +1527,8 @@ proc_pidfdinfo(int pid, int flavor, int fd, user_addr_t buffer, uint32_t buffer #endif /* NETAT */ default: { error = EINVAL; + goto out1; } - break; - } fp_drop(p, fd, fp , 0); @@ -1309,6 +1538,137 @@ out: return(error); } +/* + * Helper function for proc_pidfileportinfo + */ + +struct fileport_info_args { + int fia_flavor; + user_addr_t fia_buffer; + uint32_t fia_buffersize; + int32_t *fia_retval; +}; + +static kern_return_t +proc_fileport_info(__unused mach_port_name_t name, + struct fileglob *fg, void *arg) +{ + struct fileport_info_args *fia = arg; + struct fileproc __fileproc, *fp = &__fileproc; + int error; + + bzero(fp, sizeof (*fp)); + fp->f_fglob = fg; + + switch (fia->fia_flavor) { + case PROC_PIDFILEPORTVNODEPATHINFO: { + vnode_t vp; + + if (fg->fg_type != DTYPE_VNODE) { + error = ENOTSUP; + break; + } + vp = (struct vnode *)fg->fg_data; + error = pid_vnodeinfopath(vp, vnode_vid(vp), fp, 0, + fia->fia_buffer, fia->fia_buffersize, fia->fia_retval); + } break; + + case PROC_PIDFILEPORTSOCKETINFO: { + socket_t so; + + if (fg->fg_type != DTYPE_SOCKET) { + error = EOPNOTSUPP; + break; + } + so = (socket_t)fg->fg_data; + error = pid_socketinfo(so, fp, 0, + fia->fia_buffer, fia->fia_buffersize, fia->fia_retval); + } break; + + case PROC_PIDFILEPORTPSHMINFO: { + struct pshmnode *pshm; + + if (fg->fg_type != DTYPE_PSXSHM) { + error = EBADF; /* ick - mirror fp_getfpshm */ + break; + } + pshm = (struct pshmnode *)fg->fg_data; + error = pid_pshminfo(pshm, fp, 0, + fia->fia_buffer, fia->fia_buffersize, fia->fia_retval); + } break; + + case PROC_PIDFILEPORTPIPEINFO: { + struct pipe *cpipe; + + if (fg->fg_type != DTYPE_PIPE) { + error = EBADF; /* ick - mirror fp_getfpipe */ + break; + } + cpipe = (struct pipe *)fg->fg_data; + error = pid_pipeinfo(cpipe, fp, 0, + fia->fia_buffer, fia->fia_buffersize, fia->fia_retval); + } break; + + default: + error = EINVAL; + break; + } + + return (error); +} + +/************************* proc_pidfileportinfo routine 
*********************/ +int +proc_pidfileportinfo(int pid, int flavor, mach_port_name_t name, + user_addr_t buffer, uint32_t buffersize, int32_t *retval) +{ + proc_t p; + int error = ENOTSUP; + uint32_t size; + struct fileport_info_args fia; + + /* fileport types are restricted by filetype_issendable() */ + + switch (flavor) { + case PROC_PIDFILEPORTVNODEPATHINFO: + size = PROC_PIDFILEPORTVNODEPATHINFO_SIZE; + break; + case PROC_PIDFILEPORTSOCKETINFO: + size = PROC_PIDFILEPORTSOCKETINFO_SIZE; + break; + case PROC_PIDFILEPORTPSHMINFO: + size = PROC_PIDFILEPORTPSHMINFO_SIZE; + break; + case PROC_PIDFILEPORTPIPEINFO: + size = PROC_PIDFILEPORTPIPEINFO_SIZE; + break; + default: + return (EINVAL); + } + + if (buffersize < size) + return (ENOMEM); + if ((p = proc_find(pid)) == PROC_NULL) { + error = ESRCH; + goto out; + } + if ((error = proc_security_policy(p)) != 0) { + goto out1; + } + + fia.fia_flavor = flavor; + fia.fia_buffer = buffer; + fia.fia_buffersize = buffersize; + fia.fia_retval = retval; + + if (fileport_invoke(p->task, name, + proc_fileport_info, &fia, &error) != KERN_SUCCESS) + error = EINVAL; +out1: + proc_rele(p); +out: + return (error); +} static int proc_security_policy(proc_t p) @@ -1339,22 +1699,23 @@ proc_kernmsgbuf(user_addr_t buffer, uint32_t buffersize, int32_t * retval) /* ********* process control sets on self only */ int -proc_setcontrol(int pid, int flavor, uint64_t arg, __unused user_addr_t buffer, __unused uint32_t buffersize, __unused int32_t * retval) +proc_setcontrol(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t buffersize, __unused int32_t * retval) { struct proc * pself = PROC_NULL; int error = 0; uint32_t pcontrol = (uint32_t)arg; + struct uthread *ut = NULL; pself = current_proc(); if (pid != pself->p_pid) return(EINVAL); - if (pcontrol > P_PCMAX) - return(EINVAL); switch (flavor) { case PROC_SELFSET_PCONTROL: { + if (pcontrol > P_PCMAX) + return(EINVAL); proc_lock(pself); /* reset existing control setting while retaining action state */ pself->p_pcaction &= PROC_ACTION_MASK; @@ -1364,10 +1725,42 @@ proc_setcontrol(int pid, int flavor, uint64_t arg, __unused user_addr_t buffer, } break; + case PROC_SELFSET_THREADNAME: { + /* PROC_SELFSET_THREADNAME_SIZE = (MAXTHREADNAMESIZE -1) */ + if(buffersize > PROC_SELFSET_THREADNAME_SIZE) + return ENAMETOOLONG; + ut = current_uthread(); + + if(!ut->pth_name) + { + ut->pth_name = (char*)kalloc(MAXTHREADNAMESIZE ); + if(!ut->pth_name) + return ENOMEM; + } + bzero(ut->pth_name, MAXTHREADNAMESIZE); + error = copyin(buffer, ut->pth_name, buffersize); + } + break; + + case PROC_SELFSET_VMRSRCOWNER: { + /* need to to be superuser */ + if (suser(kauth_cred_get(), (u_short *)0) != 0) { + error = EPERM; + goto out; + } + + proc_lock(pself); + /* reset existing control setting while retaining action state */ + pself->p_lflag |= P_LVMRSRCOWNER; + proc_unlock(pself); + } + break; + default: error = ENOTSUP; } +out: return(error); } diff --git a/bsd/kern/process_policy.c b/bsd/kern/process_policy.c new file mode 100644 index 000000000..e6596dad4 --- /dev/null +++ b/bsd/kern/process_policy.c @@ -0,0 +1,460 @@ +/* + * Copyright (c) 2005, 2010 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
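PROC_SELFSET_THREADNAME above is the kernel half of thread naming: the string is copied into the calling uthread's pth_name (at most PROC_SELFSET_THREADNAME_SIZE, i.e. MAXTHREADNAMESIZE - 1, bytes, else ENAMETOOLONG), and a thread can only name itself. A small sketch of the user-visible effect; the assumption here is that the libpthread of this era routes pthread_setname_np() into proc_setcontrol():

    #include <pthread.h>

    static void *worker(void *arg)
    {
        /* Kernel rejects names longer than PROC_SELFSET_THREADNAME_SIZE
         * bytes with ENAMETOOLONG (see the copyin above). */
        pthread_setname_np("com.example.worker");
        return arg;
    }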
The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/*
+ * process policy syscall implementation
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/proc_internal.h>
+#include <sys/kauth.h>
+#include <sys/unistd.h>
+#include <sys/buf.h>
+#include <sys/ioctl.h>
+#include <sys/vm.h>
+#include <sys/user.h>
+
+#include <security/audit/audit.h>
+
+#include <mach/machine.h>
+#include <mach/mach_types.h>
+#include <mach/vm_param.h>
+#include <kern/task.h>
+#include <kern/lock.h>
+#include <kern/kalloc.h>
+#include <kern/assert.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+#include <mach/host_info.h>
+#include <mach/task_info.h>
+#include <mach/thread_info.h>
+#include <mach/vm_region.h>
+
+#include <sys/process_policy.h>
+#include <sys/proc_info.h>
+#include <sys/bsdtask_info.h>
+#include <sys/kdebug.h>
+#include <sys/sysproto.h>
+#include <sys/msgbuf.h>
+
+#include <machine/machine_routines.h>
+
+#include <kern/ipc_misc.h>
+#include <vm/vm_protos.h>
+
+static int handle_background(int scope, int action, int policy, int policy_subtype, user_addr_t attrp, proc_t proc, uint64_t target_threadid);
+static int handle_hwaccess(int scope, int action, int policy, int policy_subtype, user_addr_t attrp, proc_t proc, uint64_t target_threadid);
+static int handle_lowresource(int scope, int action, int policy, int policy_subtype, user_addr_t attrp, proc_t proc, uint64_t target_threadid);
+static int handle_resourceuse(int scope, int action, int policy, int policy_subtype, user_addr_t attrp, proc_t proc, uint64_t target_threadid);
+static int handle_apptype(int scope, int action, int policy, int policy_subtype, user_addr_t attrp, proc_t proc, uint64_t target_threadid);
+
+extern kern_return_t task_suspend(task_t);
+extern kern_return_t task_resume(task_t);
+
+/***************************** process_policy ********************/
+
+/*
+ *int process_policy(int scope, int action, int policy, int policy_subtype,
+ *                   proc_policy_attribute_t * attrp, pid_t target_pid,
+ *                   uint64_t target_threadid)
+ *{ int process_policy(int scope, int action, int policy, int policy_subtype,
+ *  user_addr_t attrp, pid_t target_pid, uint64_t target_threadid); }
+ */
+
+/* system call implementation */
+int
+process_policy(struct proc *p, struct process_policy_args * uap, __unused int32_t *retval)
+{
+	int error = 0;
+	int scope = uap->scope;
+	int policy = uap->policy;
+	int action = uap->action;
+	int policy_subtype = uap->policy_subtype;
+	user_addr_t attrp = uap->attrp;
+	pid_t target_pid = uap->target_pid;
+	uint64_t target_threadid = uap->target_threadid;
+	proc_t proc = PROC_NULL;
+	proc_t curp = current_proc();
+	kauth_cred_t my_cred;
+#if CONFIG_EMBEDDED
+	kauth_cred_t target_cred;
+#endif
+
+	if ((scope != PROC_POLICY_SCOPE_PROCESS) && (scope != PROC_POLICY_SCOPE_THREAD)) {
+		return(EINVAL);
+	}
+	proc = proc_find(target_pid);
+	if (proc == PROC_NULL) {
+		return(EINVAL);
+	}
+
+	my_cred = kauth_cred_proc_ref(curp);
+
+#if CONFIG_EMBEDDED
+	target_cred = kauth_cred_proc_ref(proc);
+
+	if (suser(my_cred, NULL) && kauth_cred_getruid(my_cred) &&
+	    kauth_cred_getuid(my_cred) != kauth_cred_getuid(target_cred) &&
+	    kauth_cred_getruid(my_cred) != kauth_cred_getuid(target_cred))
+#else
+	/*
+	 * Resource starvation control can be used by an unprivileged resource
+	 * owner, provided the owner was privileged at the time of the
+	 * ownership claim.  This is checked in the low resource handler
+	 * routine, so bypass the checks here.
+	 */
+	if ((policy != PROC_POLICY_RESOURCE_STARVATION) &&
+	    (policy != PROC_POLICY_APPTYPE) &&
+	    (suser(my_cred, NULL) && curp != p))
+#endif
+	{
+		error = EPERM;
+		goto out;
+	}
+
+#if CONFIG_MACF
+	error = mac_proc_check_sched(curp, p);
+	if (error)
+		goto out;
+#endif
+
+	switch(policy) {
+	case PROC_POLICY_BACKGROUND:
+		error = handle_background(scope, action, policy, policy_subtype, attrp, proc, target_threadid);
+		break;
+	case PROC_POLICY_HARDWARE_ACCESS:
+		error = handle_hwaccess(scope, action, policy, policy_subtype, attrp, proc, target_threadid);
+		break;
+	case PROC_POLICY_RESOURCE_STARVATION:
+		error = handle_lowresource(scope, action, policy, policy_subtype, attrp, proc, target_threadid);
+		break;
+	case PROC_POLICY_RESOURCE_USAGE:
+		error = handle_resourceuse(scope, action, policy, policy_subtype, attrp, proc, target_threadid);
+		break;
+	case PROC_POLICY_APPTYPE:
+		error = handle_apptype(scope, action, policy, policy_subtype, attrp, proc, target_threadid);
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+
+out:
+	proc_rele(proc);
+	kauth_cred_unref(&my_cred);
+#if CONFIG_EMBEDDED
+	kauth_cred_unref(&target_cred);
+#endif
+	return(error);
+}
+
+
+/* darwin background handling code */
+static int
+handle_background(int scope, int action, __unused int policy, __unused int policy_subtype, user_addr_t attrp, proc_t proc, uint64_t target_threadid)
+{
+	int intval, error = 0;
+
+	switch (action) {
+	case PROC_POLICY_ACTION_GET:
+		if (scope == PROC_POLICY_SCOPE_PROCESS) {
+			intval = proc_get_task_bg_policy(proc->task);
+		} else {
+			/* thread scope */
+			intval = proc_get_thread_bg_policy(proc->task, target_threadid);
+		}
+		error = copyout((int *)&intval, (user_addr_t)attrp, sizeof(int));
+		break;
+
+	case PROC_POLICY_ACTION_SET:
+		error = copyin((user_addr_t)attrp, (int *)&intval, sizeof(int));
+		if (error != 0)
+			goto out;
+		if (intval > PROC_POLICY_BG_ALL) {
+			error = EINVAL;
+			goto out;
+		}
+		if (scope == PROC_POLICY_SCOPE_PROCESS) {
+			error = proc_set_bgtaskpolicy(proc->task, intval);
+		} else {
+			/* thread scope */
+			error = proc_set_bgthreadpolicy(proc->task, target_threadid, intval);
+		}
+		break;
+
+	case PROC_POLICY_ACTION_ADD:
+		error = copyin((user_addr_t)attrp, (int *)&intval, sizeof(int));
+		if (error != 0)
+			goto out;
+		if (intval > PROC_POLICY_BG_ALL) {
+			error = EINVAL;
+			goto out;
+		}
+		if (scope == PROC_POLICY_SCOPE_PROCESS) {
+			error = proc_add_bgtaskpolicy(proc->task, intval);
+		} else {
+			/* thread scope */
+			error = proc_add_bgthreadpolicy(proc->task, target_threadid, intval);
+		}
+		break;
+
+	case PROC_POLICY_ACTION_REMOVE:
+		error = copyin((user_addr_t)attrp, (int *)&intval, sizeof(int));
+		if (error != 0)
+			goto out;
+		if (intval > PROC_POLICY_BG_ALL) {
+			error = EINVAL;
+			goto out;
+		}
+		if (scope == PROC_POLICY_SCOPE_PROCESS) {
+			error = proc_remove_bgtaskpolicy(proc->task, intval);
+		} else {
+			/* thread scope */
+			error = proc_remove_bgthreadpolicy(proc->task, target_threadid, intval);
+		}
+		break;
+
+	case PROC_POLICY_ACTION_APPLY:
+		if (scope == PROC_POLICY_SCOPE_PROCESS) {
+			error = proc_apply_bgtaskpolicy(proc->task);
+		} else {
+			/* thread scope */
+			error = proc_apply_bgthreadpolicy(proc->task, target_threadid);
+		}
+		break;
+
+	case PROC_POLICY_ACTION_RESTORE:
+		if (scope == PROC_POLICY_SCOPE_PROCESS) {
+			error = proc_restore_bgtaskpolicy(proc->task);
+		} else {
+			/* thread scope */
+			error = proc_restore_bgthreadpolicy(proc->task, target_threadid);
+		}
+		break;
+
+	case PROC_POLICY_ACTION_DENYINHERIT:
+		error = proc_denyinherit_policy(proc->task);
+		break;
+
+	case PROC_POLICY_ACTION_DENYSELFSET:
+		error = proc_denyselfset_policy(proc->task);
+		break;
+
+	default:
+		return(EINVAL);
+	}
+
+out:
+	return(error);
+}
+
+static int
+handle_hwaccess(__unused int scope, __unused int action, __unused int policy, int policy_subtype, __unused user_addr_t attrp, __unused proc_t proc, __unused uint64_t target_threadid)
+{
+	switch(policy_subtype) {
+	case PROC_POLICY_HWACCESS_NONE:
+	case PROC_POLICY_HWACCESS_DISK:
+	case PROC_POLICY_HWACCESS_GPU:
+	case PROC_POLICY_HWACCESS_NETWORK:
+	case PROC_POLICY_HWACCESS_CPU:
+		break;
+	default:
+		return(EINVAL);
+	}
+	return(0);
+}
+
+static int
+handle_lowresource(__unused int scope, int action, __unused int policy, int policy_subtype, __unused user_addr_t attrp, proc_t proc, __unused uint64_t target_threadid)
+{
+	int error = 0;
+
+	switch(policy_subtype) {
+	case PROC_POLICY_RS_NONE:
+	case PROC_POLICY_RS_VIRTUALMEM:
+		break;
+	default:
+		return(EINVAL);
+	}
+
+	if (action == PROC_POLICY_ACTION_RESTORE)
+		error = proc_resetpcontrol(proc_pid(proc));
+	else
+		error = EINVAL;
+
+	return(error);
+}
+
+
+static int
+handle_resourceuse(__unused int scope, __unused int action, __unused int policy, int policy_subtype, user_addr_t attrp, proc_t proc, __unused uint64_t target_threadid)
+{
+	proc_policy_cpuusage_attr_t cpuattr;
+	int error = 0;
+
+	switch(policy_subtype) {
+	case PROC_POLICY_RUSAGE_NONE:
+	case PROC_POLICY_RUSAGE_WIREDMEM:
+	case PROC_POLICY_RUSAGE_VIRTMEM:
+	case PROC_POLICY_RUSAGE_DISK:
+	case PROC_POLICY_RUSAGE_NETWORK:
+	case PROC_POLICY_RUSAGE_POWER:
+		return(ENOTSUP);
+		break;
+	default:
+		return(EINVAL);
+	case PROC_POLICY_RUSAGE_CPU:
+		break;
+	}
+
+	switch (action) {
+	case PROC_POLICY_ACTION_GET:
+		error = proc_get_task_ruse_cpu(proc->task, &cpuattr.ppattr_cpu_attr,
+		    &cpuattr.ppattr_cpu_percentage,
+		    &cpuattr.ppattr_cpu_attr_interval,
+		    &cpuattr.ppattr_cpu_attr_deadline);
+		if (error == 0)
+			error = copyout((proc_policy_cpuusage_attr_t *)&cpuattr, (user_addr_t)attrp, sizeof(proc_policy_cpuusage_attr_t));
+		break;
+
+	case PROC_POLICY_ACTION_APPLY:
+	case PROC_POLICY_ACTION_SET:
+		error = copyin((user_addr_t)attrp, (proc_policy_cpuusage_attr_t *)&cpuattr, sizeof(proc_policy_cpuusage_attr_t));
+		if (error == 0) {
+			error = proc_set_task_ruse_cpu(proc->task, cpuattr.ppattr_cpu_attr,
+			    cpuattr.ppattr_cpu_percentage,
+			    cpuattr.ppattr_cpu_attr_interval,
+			    cpuattr.ppattr_cpu_attr_deadline);
+		}
+		break;
+
+	default:
+		error = EINVAL;
+		break;
+	}
+
+	return(error);
+}
+
+
+static int
+handle_apptype(__unused int scope, int action, __unused int policy, int policy_subtype, __unused user_addr_t attrp, proc_t proc, __unused uint64_t target_threadid)
+{
+	int
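Taken together, process_policy() is a thin demultiplexer: scope selects process vs. thread, policy selects one of the handlers above, action selects GET/SET/ADD/REMOVE/APPLY/RESTORE within it, and attrp is an opaque copyin/copyout buffer. A hedged sketch of backgrounding the current process, assuming a libc stub matching the prototype quoted in the comment block above (the stub's name and availability are assumptions):

    #include <sys/process_policy.h>
    #include <unistd.h>

    static int self_background(void)
    {
        /* handle_background() copies the attribute in and out as a plain int. */
        int attr = PROC_POLICY_BG_ALL;
        pid_t me = getpid();

        /* SET records the background policy on the task... */
        if (process_policy(PROC_POLICY_SCOPE_PROCESS, PROC_POLICY_ACTION_SET,
                PROC_POLICY_BACKGROUND, 0,
                (proc_policy_attribute_t *)&attr, me, 0) != 0)
            return -1;

        /* ...and APPLY makes it take effect. */
        return process_policy(PROC_POLICY_SCOPE_PROCESS,
            PROC_POLICY_ACTION_APPLY, PROC_POLICY_BACKGROUND, 0,
            NULL, me, 0);
    }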
error = 0; + + switch(policy_subtype) { + case PROC_POLICY_OSX_APPTYPE_TAL: + /* need to be super user to do this */ + if (kauth_cred_issuser(kauth_cred_get()) == 0) { + error = EPERM; + goto out; + } + break; + case PROC_POLICY_OSX_APPTYPE_DASHCLIENT: + /* no special priv needed */ + break; + case PROC_POLICY_OSX_APPTYPE_NONE: + case PROC_POLICY_IOS_APPTYPE: + case PROC_POLICY_IOS_NONUITYPE: + return(ENOTSUP); + break; + default: + return(EINVAL); + } + + switch (action) { + case PROC_POLICY_ACTION_ENABLE: + /* reapply the app foreground/background policy */ + error = proc_enable_task_apptype(proc->task, policy_subtype); + break; + case PROC_POLICY_ACTION_DISABLE: + /* remove the app foreground/background policy */ + error = proc_disable_task_apptype(proc->task, policy_subtype); + break; + default: + error = EINVAL; + break; + } + +out: + return(error); +} + +int +proc_apply_resource_actions(void * bsdinfo, int type, int action) +{ + proc_t p = (proc_t)bsdinfo; + + switch(action) { + case PROC_POLICY_RSRCACT_THROTTLE: + /* no need to do anything */ + break; + + case PROC_POLICY_RSRCACT_SUSPEND: + task_suspend(p->task); + break; + + case PROC_POLICY_RSRCACT_TERMINATE: + psignal(p, SIGKILL); + break; + + case PROC_POLICY_RSRCACT_NOTIFY: + proc_lock(p); + proc_knote(p, NOTE_RESOURCEEND | (type & 0xff)); + proc_unlock(p); + break; + } + + return(0); +} + + +int +proc_restore_resource_actions(void * bsdinfo, __unused int type, int action) +{ + proc_t p = (proc_t)bsdinfo; + + switch(action) { + case PROC_POLICY_RSRCACT_THROTTLE: + case PROC_POLICY_RSRCACT_TERMINATE: + case PROC_POLICY_RSRCACT_NOTIFY: + /* no need to do anything */ + break; + + case PROC_POLICY_RSRCACT_SUSPEND: + task_resume(p->task); + break; + + } + + return(0); +} + diff --git a/bsd/kern/pthread_support.c b/bsd/kern/pthread_support.c index 2691813a4..e5626dfa2 100644 --- a/bsd/kern/pthread_support.c +++ b/bsd/kern/pthread_support.c @@ -67,10 +67,12 @@ #include <kern/sched_prim.h> #include <kern/thread_call.h> #include <kern/kalloc.h> +#include <kern/zalloc.h> #include <kern/sched_prim.h> #include <kern/processor.h> #include <kern/affinity.h> #include <kern/wait_queue.h> +#include <kern/mach_param.h> #include <mach/mach_vm.h> #include <mach/mach_param.h> #include <mach/thread_policy.h> @@ -82,9 +84,35 @@ #include <libkern/OSAtomic.h> -#define _PSYNCH_TRACE_ 0 /* kdebug trace */ -#define __TESTPANICS__ 0 /* panics for error conditions */ -#define COND_MTX_WAITQUEUEMOVE 0 /* auto move from cvar wait queue to mutex waitqueue */ +#include <pexpert/pexpert.h> + +#define __PSYNCH_DEBUG__ 0 /* debug panic actions */ +#define _PSYNCH_TRACE_ 1 /* kdebug trace */ + +#define __TESTMODE__ 2 /* 0 - return error on user error conditions */ + /* 1 - log error on user error conditions */ + /* 2 - abort caller on user error conditions */ + /* 3 - panic on user error conditions */ +static int __test_panics__; +static int __test_aborts__; +static int __test_prints__; + +static inline void __FAILEDUSERTEST__(const char *str) +{ + proc_t p; + + if (__test_panics__ != 0) + panic(str); + + if (__test_aborts__ != 0 || __test_prints__ != 0) + p = current_proc(); + + if (__test_prints__ != 0) + printf("PSYNCH: pid[%d]: %s\n", p->p_pid, str); + + if (__test_aborts__ != 0) + psignal(p, SIGABRT); +} #if _PSYNCH_TRACE_ #define _PSYNCH_TRACE_MLWAIT 0x9000000 @@ -103,6 +131,10 @@ #define _PSYNCH_TRACE_RWUNLOCK2 0x9000034 #define _PSYNCH_TRACE_RWHANDLEU 0x9000038 #define _PSYNCH_TRACE_FSEQTILL 0x9000040 +#define _PSYNCH_TRACE_CLRPRE 0x9000044 +#define 
_PSYNCH_TRACE_CVHBROAD 0x9000048 +#define _PSYNCH_TRACE_CVSEQ 0x900004c +#define _PSYNCH_TRACE_THWAKEUP 0x9000050 /* user side */ #define _PSYNCH_TRACE_UM_LOCK 0x9000060 #define _PSYNCH_TRACE_UM_UNLOCK 0x9000064 @@ -112,8 +144,24 @@ #define _PSYNCH_TRACE_UM_CVSIG 0x9000074 #define _PSYNCH_TRACE_UM_CVBRD 0x9000078 +proc_t pthread_debug_proc = PROC_NULL; +static inline void __PTHREAD_TRACE_DEBUG(uint32_t debugid, uintptr_t arg1, + uintptr_t arg2, + uintptr_t arg3, + uintptr_t arg4, + uintptr_t arg5) +{ + proc_t p = current_proc(); + + if ((pthread_debug_proc != NULL) && (p == pthread_debug_proc)) + KERNEL_DEBUG_CONSTANT(debugid, arg1, arg2, arg3, arg4, arg5); +} + #endif /* _PSYNCH_TRACE_ */ +#define ECVCERORR 256 +#define ECVPERORR 512 + lck_mtx_t * pthread_list_mlock; #define PTHHASH(addr) (&pthashtbl[(addr) & pthhash]) @@ -122,19 +170,28 @@ struct pthhashhead * pth_glob_hashtbl; u_long pthhash; LIST_HEAD(, ksyn_wait_queue) pth_free_list; +int num_total_kwq = 0; /* number of kwq in use currently */ +int num_infreekwq = 0; /* number of kwq in free list */ +int num_freekwq = 0; /* number of kwq actually freed from the free the list */ +int num_reusekwq = 0; /* number of kwq pulled back for reuse from free list */ +int num_addedfreekwq = 0; /* number of added free kwq from the last instance */ +int num_lastfreekwqcount = 0; /* the free count from the last time */ static int PTH_HASHSIZE = 100; +static zone_t kwq_zone; /* zone for allocation of ksyn_queue */ +static zone_t kwe_zone; /* zone for allocation of ksyn_waitq_element */ #define SEQFIT 0 #define FIRSTFIT 1 struct ksyn_queue { - TAILQ_HEAD(, uthread) ksynq_uthlist; + TAILQ_HEAD(ksynq_kwelist_head, ksyn_waitq_element) ksynq_kwelist; uint32_t ksynq_count; /* number of entries in queue */ uint32_t ksynq_firstnum; /* lowest seq in queue */ uint32_t ksynq_lastnum; /* highest seq in queue */ }; +typedef struct ksyn_queue * ksyn_queue_t; #define KSYN_QUEUE_READ 0 #define KSYN_QUEUE_LREAD 1 @@ -146,9 +203,6 @@ struct ksyn_queue { struct ksyn_wait_queue { LIST_ENTRY(ksyn_wait_queue) kw_hash; LIST_ENTRY(ksyn_wait_queue) kw_list; -#if USE_WAITQUEUE - struct wait_queue kw_wq; -#endif /* USE_WAITQUEUE */ user_addr_t kw_addr; uint64_t kw_owner; uint64_t kw_object; /* object backing in shared mode */ @@ -157,78 +211,113 @@ struct ksyn_wait_queue { int kw_pflags; /* flags under listlock protection */ struct timeval kw_ts; /* timeval need for upkeep before free */ int kw_iocount; /* inuse reference */ + int kw_dropcount; /* current users unlocking... 
*/ int kw_type; /* queue type like mutex, cvar, etc */ uint32_t kw_inqueue; /* num of waiters held */ + uint32_t kw_fakecount; /* number of error/prepost fakes */ uint32_t kw_highseq; /* highest seq in the queue */ uint32_t kw_lowseq; /* lowest seq in the queue */ + uint32_t kw_lword; /* L value from userland */ + uint32_t kw_uword; /* U world value from userland */ + uint32_t kw_sword; /* S word value from userland */ uint32_t kw_lastunlockseq; /* the last seq that unlocked */ +/* for CV to be used as the seq kernel has seen so far */ +#define kw_cvkernelseq kw_lastunlockseq + uint32_t kw_lastseqword; /* the last seq that unlocked */ +/* for mutex and cvar we need to track I bit values */ + uint32_t kw_nextseqword; /* the last seq that unlocked; with num of waiters */ +#define kw_initrecv kw_nextseqword /* number of incoming waiters with Ibit seen sofar */ + uint32_t kw_overlapwatch; /* chance for overlaps */ +#define kw_initcount kw_overlapwatch /* number of incoming waiters with Ibit expected */ + uint32_t kw_initcountseq; /* highest seq with Ibit on for mutex and cvar*/ uint32_t kw_pre_rwwc; /* prepost count */ uint32_t kw_pre_lockseq; /* prepost target seq */ - uint32_t kw_pre_cvretval; /* retval for cwait on prepost */ - uint32_t kw_pre_limrd; /* prepost read only(rwlock) */ - uint32_t kw_pre_limrdseq; /* prepost limit seq for reads(rwlock) */ - uint32_t kw_pre_limrdbits; /* seqbit needed for updates on prepost */ + uint32_t kw_pre_sseq; /* prepost target sword, in cvar used for mutexowned */ uint32_t kw_pre_intrcount; /* prepost of missed wakeup due to intrs */ uint32_t kw_pre_intrseq; /* prepost of missed wakeup limit seq */ uint32_t kw_pre_intrretbits; /* return bits value for missed wakeup threads */ uint32_t kw_pre_intrtype; /* type of failed wakueps*/ int kw_kflags; - TAILQ_HEAD(, uthread) kw_uthlist; /* List of uthreads */ struct ksyn_queue kw_ksynqueues[KSYN_QUEUE_MAX]; /* queues to hold threads */ lck_mtx_t kw_lock; /* mutex lock protecting this structure */ - struct ksyn_wait_queue * kw_attq; /* attached queue (cvar->mutex, need in prepost */ }; - -typedef struct ksyn_queue * ksyn_queue_t; typedef struct ksyn_wait_queue * ksyn_wait_queue_t; -#define PTHRW_EBIT 0x01 -#define PTHRW_LBIT 0x02 -#define PTHRW_YBIT 0x04 -#define PTHRW_WBIT 0x08 -#define PTHRW_UBIT 0x10 -#define PTHRW_RETRYBIT 0x20 -/* same as 0x20, shadow W bit for rwlock */ -#define PTHRW_SHADOW_W 0x20 - -#define PTHRW_TRYLKBIT 0x40 -#define PTHRW_RW_HUNLOCK 0x40 /* returning read thread responsible to handle unlock */ - -#define PTHRW_MTX_NONE 0x80 -#define PTHRW_RW_INIT 0x80 /* reset on the lock bits */ -/* same as 0x80, spurious rwlock unlock ret from kernel */ -#define PTHRW_RW_SPURIOUS 0x80 - #define PTHRW_INC 0x100 - -#define PTHRW_BIT_MASK 0x000000ff; +#define PTHRW_BIT_MASK 0x000000ff #define PTHRW_COUNT_SHIFT 8 #define PTHRW_COUNT_MASK 0xffffff00 #define PTHRW_MAX_READERS 0xffffff00 +/* New model bits on Lword */ +#define PTH_RWL_KBIT 0x01 /* users cannot acquire in user mode */ +#define PTH_RWL_EBIT 0x02 /* exclusive lock in progress */ +#define PTH_RWL_WBIT 0x04 /* write waiters pending in kernel */ +#define PTH_RWL_PBIT 0x04 /* prepost (cv) pending in kernel */ +#define PTH_RWL_YBIT 0x08 /* yielding write waiters pending in kernel */ +#define PTH_RWL_RETRYBIT 0x08 /* mutex retry wait */ +#define PTH_RWL_LBIT 0x10 /* long read in progress */ +#define PTH_RWL_MTXNONE 0x10 /* indicates the cvwait does not have mutex held */ +#define PTH_RWL_UBIT 0x20 /* upgrade request pending */ +#define 
PTH_RWL_MTX_WAIT 0x20 /* in cvar in mutex wait */ +#define PTH_RWL_RBIT 0x40 /* reader pending in kernel(not used) */ +#define PTH_RWL_MBIT 0x40 /* overlapping grants from kernel */ +#define PTH_RWL_TRYLKBIT 0x40 /* trylock attempt (mutex only) */ +#define PTH_RWL_IBIT 0x80 /* lcok reset, held untill first succeesful unlock */ + + +/* UBIT values for mutex, cvar */ +#define PTH_RWU_SBIT 0x01 +#define PTH_RWU_BBIT 0x02 + +#define PTHRW_RWL_INIT PTH_RWL_IBIT /* reset state on the lock bits (U)*/ + +/* New model bits on Sword */ +#define PTH_RWS_SBIT 0x01 /* kernel transition seq not set yet*/ +#define PTH_RWS_IBIT 0x02 /* Sequence is not set on return from kernel */ +#define PTH_RWS_CV_CBIT PTH_RWS_SBIT /* kernel has cleared all info w.r.s.t CV */ +#define PTH_RWS_CV_PBIT PTH_RWS_IBIT /* kernel has prepost/fake structs only,no waiters */ +#define PTH_RWS_CV_MBIT PTH_RWL_MBIT /* to indicate prepost return */ +#define PTH_RWS_WSVBIT 0x04 /* save W bit */ +#define PTH_RWS_USVBIT 0x08 /* save U bit */ +#define PTH_RWS_YSVBIT 0x10 /* save Y bit */ +#define PTHRW_RWS_INIT PTH_RWS_SBIT /* reset on the lock bits (U)*/ +#define PTHRW_RWS_SAVEMASK (PTH_RWS_WSVBIT|PTH_RWS_USVBIT|PTH_RWS_YSVBIT) /*save bits mask*/ +#define PTHRW_SW_Reset_BIT_MASK 0x000000fe /* remove S bit and get rest of the bits */ + +#define PTHRW_RWS_INIT PTH_RWS_SBIT /* reset on the lock bits (U)*/ + + +#define PTHRW_UN_BIT_MASK 0x000000bf /* remove overlap bit */ + + +#define PTHREAD_MTX_TID_SWITCHING (uint64_t)-1 + +/* new L word defns */ +#define is_rwl_readinuser(x) ((((x) & (PTH_RWL_UBIT | PTH_RWL_KBIT)) == 0)||(((x) & PTH_RWL_LBIT) != 0)) +#define is_rwl_ebit_set(x) (((x) & PTH_RWL_EBIT) != 0) +#define is_rwl_lbit_set(x) (((x) & PTH_RWL_LBIT) != 0) +#define is_rwl_readoverlap(x) (((x) & PTH_RWL_MBIT) != 0) +#define is_rw_ubit_set(x) (((x) & PTH_RWL_UBIT) != 0) + +/* S word checks */ +#define is_rws_setseq(x) (((x) & PTH_RWS_SBIT)) +#define is_rws_setunlockinit(x) (((x) & PTH_RWS_IBIT)) + /* first contended seq that kernel sees */ #define KW_MTXFIRST_KSEQ 0x200 #define KW_CVFIRST_KSEQ 1 #define KW_RWFIRST_KSEQ 0x200 -#define is_rw_ewubit_set(x) ((x & (PTHRW_EBIT | PTHRW_WBIT | PTHRW_UBIT)) != 0) -#define is_rw_lybit_set(x) ((x & (PTHRW_LBIT | PTHRW_YBIT)) != 0) -#define is_rw_ebit_set(x) ((x & PTHRW_EBIT) != 0) -#define is_rw_uebit_set(x) ((x & (PTHRW_EBIT | PTHRW_UBIT)) != 0) -#define is_rw_ubit_set(x) ((x & PTHRW_UBIT) != 0) -#define is_rw_either_ewyubit_set(x) ((x & (PTHRW_EBIT | PTHRW_WBIT | PTHRW_UBIT | PTHRW_YBIT)) != 0) - - -/* is x lower than Y */ -#define is_seqlower(x, y) ((x < y) || ((x - y) > (PTHRW_MAX_READERS/2))) -/* is x lower than or eq Y */ -#define is_seqlower_eq(x, y) ((x <= y) || ((x - y) > (PTHRW_MAX_READERS/2))) +int is_seqlower(uint32_t x, uint32_t y); +int is_seqlower_eq(uint32_t x, uint32_t y); +int is_seqhigher(uint32_t x, uint32_t y); +int is_seqhigher_eq(uint32_t x, uint32_t y); +int find_diff(uint32_t upto, uint32_t lowest); -/* is x greater than Y */ -#define is_seqhigher(x, y) ((x > y) || ((y - x) > (PTHRW_MAX_READERS/2))) static inline int diff_genseq(uint32_t x, uint32_t y) { if (x > y) { @@ -292,27 +381,39 @@ static inline int diff_genseq(uint32_t x, uint32_t y) { #define PTHREAD_POLICY_FLAGS_MASK 0x1c0 #define _PTHREAD_MTX_OPT_HOLDLOCK 0x200 -#define _PTHREAD_MTX_OPT_NOHOLDLOCK 0x400 -#define _PTHREAD_MTX_OPT_LASTDROP (_PTHREAD_MTX_OPT_HOLDLOCK | _PTHREAD_MTX_OPT_NOHOLDLOCK) +#define _PTHREAD_MTX_OPT_NOMTX 0x400 + +#define _PTHREAD_MTX_OPT_NOTIFY 0x1000 +#define _PTHREAD_MTX_OPT_MUTEX 0x2000 
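All of these generation words share one layout: the low byte carries the flag bits defined above, and the upper 24 bits are a wrapping sequence counter advanced in PTHRW_INC steps. Illustrative helpers (not part of the patch) that make the split explicit:

    #include <stdint.h>

    #define PTHRW_BIT_MASK   0x000000ffU   /* flag bits, e.g. PTH_RWL_EBIT */
    #define PTHRW_COUNT_MASK 0xffffff00U   /* 24-bit sequence number */
    #define PTHRW_INC        0x100U        /* +1 in sequence space */

    static inline uint32_t word_bits(uint32_t w) { return w & PTHRW_BIT_MASK; }
    static inline uint32_t word_seq(uint32_t w)  { return w & PTHRW_COUNT_MASK; }

    /* One waiter arriving or departing moves the count by PTHRW_INC;
     * the 24-bit field wraps naturally under the mask. */
    static inline uint32_t seq_advance(uint32_t w)
    {
        return (w & PTHRW_BIT_MASK) | ((w + PTHRW_INC) & PTHRW_COUNT_MASK);
    }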
/* this is a mutex type */ +#define _PTHREAD_RWLOCK_UPGRADE_TRY 0x10000 + +/* pflags */ #define KSYN_WQ_INLIST 1 #define KSYN_WQ_INHASH 2 #define KSYN_WQ_SHARED 4 +#define KSYN_WQ_WAITING 8 /* threads waiting for this wq to be available */ #define KSYN_WQ_FLIST 0X10 /* in free list to be freed after a short delay */ +/* kflags */ +#define KSYN_KWF_INITCLEARED 1 /* the init status found and preposts cleared */ +#define KSYN_KWF_ZEROEDOUT 2 /* the lword, etc are inited to 0 */ + #define KSYN_CLEANUP_DEADLINE 10 int psynch_cleanupset; thread_call_t psynch_thcall; #define KSYN_WQTYPE_INWAIT 0x1000 +#define KSYN_WQTYPE_INDROP 0x2000 #define KSYN_WQTYPE_MTX 0x1 #define KSYN_WQTYPE_CVAR 0x2 #define KSYN_WQTYPE_RWLOCK 0x4 #define KSYN_WQTYPE_SEMA 0x8 #define KSYN_WQTYPE_BARR 0x10 -#define KSYN_WQTYPE_MASK 0xffff +#define KSYN_WQTYPE_MASK 0x00ff #define KSYN_MTX_MAX 0x0fffffff +#define KSYN_WQTYPE_MUTEXDROP (KSYN_WQTYPE_INDROP | KSYN_WQTYPE_MTX) #define KW_UNLOCK_PREPOST 0x01 #define KW_UNLOCK_PREPOST_UPGRADE 0x02 @@ -324,14 +425,14 @@ thread_call_t psynch_thcall; #define CLEAR_PREPOST_BITS(kwq) {\ kwq->kw_pre_lockseq = 0; \ + kwq->kw_pre_sseq = PTHRW_RWS_INIT; \ kwq->kw_pre_rwwc = 0; \ - kwq->kw_pre_cvretval = 0; \ } -#define CLEAR_READ_PREPOST_BITS(kwq) {\ - kwq->kw_pre_limrd = 0; \ - kwq->kw_pre_limrdseq = 0; \ - kwq->kw_pre_limrdbits = 0; \ +#define CLEAR_INITCOUNT_BITS(kwq) {\ + kwq->kw_initcount = 0; \ + kwq->kw_initrecv = 0; \ + kwq->kw_initcountseq = 0; \ } #define CLEAR_INTR_PREPOST_BITS(kwq) {\ @@ -340,7 +441,30 @@ thread_call_t psynch_thcall; kwq->kw_pre_intrretbits = 0; \ kwq->kw_pre_intrtype = 0; \ } - + +#define CLEAR_REINIT_BITS(kwq) {\ + if ((kwq->kw_type & KSYN_WQTYPE_MASK) == KSYN_WQTYPE_CVAR) { \ + if((kwq->kw_inqueue != 0) && (kwq->kw_inqueue != kwq->kw_fakecount)) \ + panic("CV:entries in queue durinmg reinit %d:%d\n",kwq->kw_inqueue, kwq->kw_fakecount); \ + };\ + if ((kwq->kw_type & KSYN_WQTYPE_MASK) == KSYN_WQTYPE_RWLOCK) { \ + kwq->kw_nextseqword = PTHRW_RWS_INIT; \ + kwq->kw_overlapwatch = 0; \ + }; \ + kwq->kw_pre_lockseq = 0; \ + kwq->kw_pre_rwwc = 0; \ + kwq->kw_pre_sseq = PTHRW_RWS_INIT; \ + kwq->kw_lastunlockseq = PTHRW_RWL_INIT; \ + kwq->kw_lastseqword = PTHRW_RWS_INIT; \ + kwq->kw_pre_intrcount = 0; \ + kwq->kw_pre_intrseq = 0; \ + kwq->kw_pre_intrretbits = 0; \ + kwq->kw_pre_intrtype = 0; \ + kwq->kw_lword = 0; \ + kwq->kw_uword = 0; \ + kwq->kw_sword = PTHRW_RWS_INIT; \ + } + void pthread_list_lock(void); void pthread_list_unlock(void); void pthread_list_lock_spin(void); @@ -349,41 +473,69 @@ void ksyn_wqlock(ksyn_wait_queue_t kwq); void ksyn_wqunlock(ksyn_wait_queue_t kwq); ksyn_wait_queue_t ksyn_wq_hash_lookup(user_addr_t mutex, proc_t p, int flags, uint64_t object, uint64_t offset); int ksyn_wqfind(user_addr_t mutex, uint32_t mgen, uint32_t ugen, uint32_t rw_wc, uint64_t tid, int flags, int wqtype , ksyn_wait_queue_t * wq); -void ksyn_wqrelease(ksyn_wait_queue_t mkwq, ksyn_wait_queue_t ckwq); -int ksyn_block_thread_locked(ksyn_wait_queue_t kwq, uint64_t abstime, uthread_t uth); -kern_return_t ksyn_wakeup_thread(ksyn_wait_queue_t kwq, uthread_t uth); -void ksyn_move_wqthread(ksyn_wait_queue_t ckwq, ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t updateval, int diffgen, int nomutex); -extern thread_t port_name_to_thread(mach_port_name_t port_name); +void ksyn_wqrelease(ksyn_wait_queue_t mkwq, ksyn_wait_queue_t ckwq, int qfreenow, int wqtype); extern int ksyn_findobj(uint64_t mutex, uint64_t * object, uint64_t * offset); -static void UPDATE_KWQ(ksyn_wait_queue_t 
kwq, uint32_t mgen, uint32_t ugen, uint32_t rw_wc, uint64_t tid, int wqtype, int retry); -void psynch_mutexdrop_internal(ksyn_wait_queue_t kwq, uint32_t lkseq, uint32_t ugen, int flags); - -#if USE_WAITQUEUE -kern_return_t wait_queue_move_all(wait_queue_t from, event64_t eventfrom, wait_queue_t to, event64_t eventto); -kern_return_t wait_queue_move_thread(wait_queue_t from, event64_t eventfrom, thread_t th, wait_queue_t to, event64_t eventto, thread_t * mthp); -#endif /* USE_WAITQUEUE */ -int kwq_handle_unlock(ksyn_wait_queue_t, uint32_t mgen, uint32_t * updatep, int flags, int *blockp, uint32_t premgen); +static void UPDATE_CVKWQ(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t ugen, uint32_t rw_wc, uint64_t tid, int wqtype); +extern thread_t port_name_to_thread(mach_port_name_t port_name); + +int ksyn_block_thread_locked(ksyn_wait_queue_t kwq, uint64_t abstime, ksyn_waitq_element_t kwe, int log); +kern_return_t ksyn_wakeup_thread(ksyn_wait_queue_t kwq, ksyn_waitq_element_t kwe); +void ksyn_freeallkwe(ksyn_queue_t kq); + +uint32_t psynch_mutexdrop_internal(ksyn_wait_queue_t kwq, uint32_t lkseq, uint32_t ugen, int flags); +int kwq_handle_unlock(ksyn_wait_queue_t, uint32_t mgen, uint32_t rw_wc, uint32_t * updatep, int flags, int *blockp, uint32_t premgen); + void ksyn_queue_init(ksyn_queue_t kq); -int ksyn_queue_insert(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t mgen, struct uthread * uth, int firstfit); -struct uthread * ksyn_queue_removefirst(ksyn_queue_t kq, ksyn_wait_queue_t kwq); -void ksyn_queue_removeitem(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uthread_t uth); +int ksyn_queue_insert(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t mgen, struct uthread * uth, ksyn_waitq_element_t kwe, int firstfit); +ksyn_waitq_element_t ksyn_queue_removefirst(ksyn_queue_t kq, ksyn_wait_queue_t kwq); +void ksyn_queue_removeitem(ksyn_wait_queue_t kwq, ksyn_queue_t kq, ksyn_waitq_element_t kwe); +int ksyn_queue_move_tofree(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t upto, ksyn_queue_t freeq, int all, int reease); void update_low_high(ksyn_wait_queue_t kwq, uint32_t lockseq); uint32_t find_nextlowseq(ksyn_wait_queue_t kwq); uint32_t find_nexthighseq(ksyn_wait_queue_t kwq); + int find_seq_till(ksyn_wait_queue_t kwq, uint32_t upto, uint32_t nwaiters, uint32_t *countp); -int find_diff(uint32_t upto, uint32_t lowest); uint32_t ksyn_queue_count_tolowest(ksyn_queue_t kq, uint32_t upto); + +ksyn_waitq_element_t ksyn_queue_find_cvpreposeq(ksyn_queue_t kq, uint32_t cgen); +uint32_t ksyn_queue_cvcount_entries(ksyn_queue_t kq, uint32_t upto, uint32_t from, int * numwaitersp, int * numintrp, int * numprepop); +void ksyn_handle_cvbroad(ksyn_wait_queue_t ckwq, uint32_t upto, uint32_t *updatep); +void ksyn_cvupdate_fixup(ksyn_wait_queue_t ckwq, uint32_t *updatep, ksyn_queue_t kfreeq, int release); +ksyn_waitq_element_t ksyn_queue_find_signalseq(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t toseq, uint32_t lockseq); +ksyn_waitq_element_t ksyn_queue_find_threadseq(ksyn_wait_queue_t ckwq, ksyn_queue_t kq, thread_t th, uint32_t toseq); + int ksyn_wakeupreaders(ksyn_wait_queue_t kwq, uint32_t limitread, int longreadset, int allreaders, uint32_t updatebits, int * wokenp); int kwq_find_rw_lowest(ksyn_wait_queue_t kwq, int flags, uint32_t premgen, int * type, uint32_t lowest[]); -uthread_t ksyn_queue_find_seq(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t seq); +ksyn_waitq_element_t ksyn_queue_find_seq(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t seq, int remove); +int kwq_handle_overlap(ksyn_wait_queue_t kwq, 
uint32_t lgenval, uint32_t ugenval, uint32_t rw_wc, uint32_t *updatebitsp, int flags , int * blockp); int kwq_handle_downgrade(ksyn_wait_queue_t kwq, uint32_t mgen, int flags, uint32_t premgen, int * blockp); - static void -UPDATE_KWQ(__unused ksyn_wait_queue_t kwq, __unused uint32_t mgen, __unused uint32_t ugen, __unused uint32_t rw_wc, __unused uint64_t tid, __unused int wqtype, __unused int retry) +UPDATE_CVKWQ(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t ugen, uint32_t rw_wc, __unused uint64_t tid, __unused int wqtype) { + if ((kwq->kw_type & KSYN_WQTYPE_MASK) == KSYN_WQTYPE_CVAR) { + if ((kwq->kw_kflags & KSYN_KWF_ZEROEDOUT) != 0) { + /* the values of L,U and S are cleared out due to L==S in previous transition */ + kwq->kw_lword = mgen; + kwq->kw_uword = ugen; + kwq->kw_sword = rw_wc; + kwq->kw_kflags &= ~KSYN_KWF_ZEROEDOUT; + } + if (is_seqhigher((mgen & PTHRW_COUNT_MASK), (kwq->kw_lword & PTHRW_COUNT_MASK)) != 0) + kwq->kw_lword = mgen; + if (is_seqhigher((ugen & PTHRW_COUNT_MASK), (kwq->kw_uword & PTHRW_COUNT_MASK)) != 0) + kwq->kw_uword = ugen; + if ((rw_wc & PTH_RWS_CV_CBIT) != 0) { + if(is_seqlower(kwq->kw_cvkernelseq, (rw_wc & PTHRW_COUNT_MASK)) != 0) { + kwq->kw_cvkernelseq = (rw_wc & PTHRW_COUNT_MASK); + } + if (is_seqhigher((rw_wc & PTHRW_COUNT_MASK), (kwq->kw_sword & PTHRW_COUNT_MASK)) != 0) + kwq->kw_sword = rw_wc; + } + } } + /* to protect the hashes, iocounts, freelist */ void pthread_list_lock(void) @@ -426,51 +578,43 @@ ksyn_wqunlock(ksyn_wait_queue_t kwq) /* routine to drop the mutex unlocks , used both for mutexunlock system call and drop during cond wait */ -void +uint32_t psynch_mutexdrop_internal(ksyn_wait_queue_t kwq, uint32_t lkseq, uint32_t ugen, int flags) { - uint32_t nextgen, low_writer, updatebits; + uint32_t nextgen, low_writer, updatebits, returnbits = 0; int firstfit = flags & _PTHREAD_MUTEX_POLICY_FIRSTFIT; - uthread_t uth; + ksyn_waitq_element_t kwe = NULL; kern_return_t kret = KERN_SUCCESS; - nextgen = (ugen + PTHRW_INC); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_KMDROP | DBG_FUNC_START, kwq, lkseq, ugen, flags, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_KMDROP | DBG_FUNC_START, (uint32_t)kwq->kw_addr, lkseq, ugen, flags, 0); #endif /* _PSYNCH_TRACE_ */ ksyn_wqlock(kwq); redrive: - -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, kwq, 1, kwq->kw_inqueue, nextgen, 0); -#endif /* _PSYNCH_TRACE_ */ + if (kwq->kw_inqueue != 0) { - updatebits = (kwq->kw_highseq & PTHRW_COUNT_MASK) | PTHRW_EBIT; - kwq->kw_lastunlockseq = ugen; + updatebits = (kwq->kw_highseq & PTHRW_COUNT_MASK) | (PTH_RWL_EBIT | PTH_RWL_KBIT); + kwq->kw_lastunlockseq = (ugen & PTHRW_COUNT_MASK); if (firstfit != 0) { -#if __TESTPANICS__ - panic("psynch_mutexdrop_internal: first fit mutex arrives, not enabled yet \n"); -#endif /* __TESTPANICS__ */ /* first fit , pick any one */ - uth = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], kwq); + kwe = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], kwq); + kwe->kwe_psynchretval = updatebits; + kwe->kwe_kwqqueue = NULL; - if (kwq->kw_ksynqueues[KSYN_QUEUE_WRITER].ksynq_count != 0) - updatebits |= PTHRW_WBIT; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, kwq, 2, uth, updatebits, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 0xcafecaf1, (uint32_t)(thread_tid((struct thread *)(((struct uthread *)(kwe->kwe_uth))->uu_context.vc_thread))), kwe->kwe_psynchretval, 0); #endif /* _PSYNCH_TRACE_ */ - - 
uth->uu_psynchretval = updatebits; - uth->uu_kwqqueue = NULL; - - kret = ksyn_wakeup_thread(kwq, uth); + + kret = ksyn_wakeup_thread(kwq, kwe); +#if __TESTPANICS__ if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) panic("psynch_mutexdrop_internal: panic unable to wakeup firstfit mutex thread\n"); +#endif /* __TESTPANICS__ */ if (kret == KERN_NOT_WAITING) goto redrive; } else { @@ -479,86 +623,124 @@ redrive: low_writer &= PTHRW_COUNT_MASK; if (low_writer == nextgen) { -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, kwq, 3, low_writer, nextgen, 0); -#endif /* _PSYNCH_TRACE_ */ /* next seq to be granted found */ - uth = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], kwq); - if (kwq->kw_ksynqueues[KSYN_QUEUE_WRITER].ksynq_count != 0) - updatebits |= PTHRW_WBIT; + kwe = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], kwq); - uth->uu_psynchretval = updatebits; - uth->uu_kwqqueue = NULL; + /* since the grant could be cv, make sure mutex wait is set incase the thread interrupted out */ + kwe->kwe_psynchretval = updatebits | PTH_RWL_MTX_WAIT; + kwe->kwe_kwqqueue = NULL; - kret = ksyn_wakeup_thread(kwq, uth); +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 0xcafecaf2, (uint32_t)(thread_tid((struct thread *)(((struct uthread *)(kwe->kwe_uth))->uu_context.vc_thread))), kwe->kwe_psynchretval, 0); +#endif /* _PSYNCH_TRACE_ */ + + kret = ksyn_wakeup_thread(kwq, kwe); +#if __TESTPANICS__ if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) panic("psynch_mutexdrop_internal: panic unable to wakeup fairshare mutex thread\n"); - if (kret == KERN_NOT_WAITING) - goto redrive; +#endif /* __TESTPANICS__ */ + if (kret == KERN_NOT_WAITING) { + /* interrupt post */ + kwq->kw_pre_intrcount = 1; + kwq->kw_pre_intrseq = nextgen; + kwq->kw_pre_intrretbits = updatebits; + kwq->kw_pre_intrtype = PTH_RW_TYPE_WRITE; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 0xfafafaf1, nextgen, kwq->kw_pre_intrretbits, 0); +#endif /* _PSYNCH_TRACE_ */ + } } else if (is_seqhigher(low_writer, nextgen) != 0) { -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, kwq, 4, low_writer, nextgen, 0); -#endif /* _PSYNCH_TRACE_ */ kwq->kw_pre_rwwc++; + + if (kwq->kw_pre_rwwc > 1) { + __FAILEDUSERTEST__("psynch_mutexdrop_internal: prepost more than one (1)\n"); + goto out; + } + kwq->kw_pre_lockseq = (nextgen & PTHRW_COUNT_MASK); - } else { -#if __TESTPANICS__ - panic("psynch_mutexdrop_internal: FS mutex unlock sequence higher than the lowest one is queue\n"); -#endif /* __TESTPANICS__ */ #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, kwq, 5, low_writer, nextgen, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 0xfefefef1, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); #endif /* _PSYNCH_TRACE_ */ - uth = ksyn_queue_find_seq(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], nextgen); - if (uth != NULL) { - /* next seq to be granted found */ + } else { - if (kwq->kw_ksynqueues[KSYN_QUEUE_WRITER].ksynq_count != 0) - updatebits |= PTHRW_WBIT; - + //__FAILEDUSERTEST__("psynch_mutexdrop_internal: FS mutex unlock sequence higher than the lowest one is queue\n"); + + kwe = ksyn_queue_find_seq(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], (nextgen & PTHRW_COUNT_MASK), 1); + if (kwe != NULL) { + /* next seq to be granted found */ + /* since the grant could be cv, make sure 
mutex wait is set incase the thread interrupted out */ + kwe->kwe_psynchretval = updatebits | PTH_RWL_MTX_WAIT; + kwe->kwe_kwqqueue = NULL; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, kwq, 6, updatebits, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 0xcafecaf3, (uint32_t)(thread_tid((struct thread *)(((struct uthread *)(kwe->kwe_uth))->uu_context.vc_thread))), kwe->kwe_psynchretval, 0); #endif /* _PSYNCH_TRACE_ */ - uth->uu_psynchretval = updatebits; - uth->uu_kwqqueue = NULL; - - kret = ksyn_wakeup_thread(kwq, uth); + kret = ksyn_wakeup_thread(kwq, kwe); +#if __TESTPANICS__ if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) panic("psynch_mutexdrop_internal: panic unable to wakeup fairshare mutex thread\n"); +#endif /* __TESTPANICS__ */ if (kret == KERN_NOT_WAITING) goto redrive; } else { /* next seq to be granted not found, prepost */ -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, kwq, 7, 0, 0, 0); -#endif /* _PSYNCH_TRACE_ */ kwq->kw_pre_rwwc++; + + if (kwq->kw_pre_rwwc > 1) { + __FAILEDUSERTEST__("psynch_mutexdrop_internal: prepost more than one (2)\n"); + goto out; + } + kwq->kw_pre_lockseq = (nextgen & PTHRW_COUNT_MASK); +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 0xfefefef2, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); +#endif /* _PSYNCH_TRACE_ */ } } } } else { + + /* if firstfit the last one could be spurious */ + if (firstfit == 0) { + kwq->kw_lastunlockseq = (ugen & PTHRW_COUNT_MASK); + kwq->kw_pre_rwwc++; + + if (kwq->kw_pre_rwwc > 1) { + __FAILEDUSERTEST__("psynch_mutexdrop_internal: prepost more than one (3)\n"); + goto out; + } + + kwq->kw_pre_lockseq = (nextgen & PTHRW_COUNT_MASK); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, kwq, 8, 0, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 0xfefefef3, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); #endif /* _PSYNCH_TRACE_ */ - /* if firstfit the last one could be spurious */ - if ((firstfit == 0) || ((lkseq & PTHRW_COUNT_MASK) != nextgen)) { + } else { + /* first fit case */ #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, kwq, 9, 0, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 0xfefefef3, kwq->kw_lastunlockseq, kwq->kw_pre_lockseq, 0); #endif /* _PSYNCH_TRACE_ */ - kwq->kw_lastunlockseq = ugen; - kwq->kw_pre_rwwc++; - kwq->kw_pre_lockseq = (nextgen & PTHRW_COUNT_MASK); + kwq->kw_lastunlockseq = (ugen & PTHRW_COUNT_MASK); + /* not set or the new lkseq is higher */ + if ((kwq->kw_pre_rwwc == 0) || (is_seqlower(kwq->kw_pre_lockseq, lkseq) == 0)) + kwq->kw_pre_lockseq = (lkseq & PTHRW_COUNT_MASK); + kwq->kw_pre_rwwc = 1; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_KMDROP | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 0xfefefef3, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); +#endif /* _PSYNCH_TRACE_ */ + + /* indicate prepost content in kernel */ + returnbits = lkseq | PTH_RWL_PBIT; } } +out: ksyn_wqunlock(kwq); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_KMDROP | DBG_FUNC_END, kwq, 0, 0, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_KMDROP | DBG_FUNC_END, (uint32_t)kwq->kw_addr, 0xeeeeeeed, 0, 0, 0); #endif /* _PSYNCH_TRACE_ */ - ksyn_wqrelease(kwq, NULL); - return; + ksyn_wqrelease(kwq, NULL, 1, (KSYN_WQTYPE_INDROP | KSYN_WQTYPE_MTX)); + return(returnbits); } /* @@ -575,19 
+757,24 @@ psynch_mutexwait(__unused proc_t p, struct psynch_mutexwait_args * uap, uint32_t int flags = uap->flags; ksyn_wait_queue_t kwq; int error=0; - int ins_flags; + int ins_flags, retry; uthread_t uth; int firstfit = flags & _PTHREAD_MUTEX_POLICY_FIRSTFIT; - uint32_t lockseq, updatebits; - + uint32_t lockseq, updatebits=0; + ksyn_waitq_element_t kwe; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_START, (uint32_t)mutex, mgen, ugen, flags, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_START, (uint32_t)mutex, mgen, ugen, flags, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_NONE, (uint32_t)mutex, mgen, ugen, (uint32_t)tid, 0); #endif /* _PSYNCH_TRACE_ */ uth = current_uthread(); - uth->uu_lockseq = uap->mgen; + kwe = &uth->uu_kwe; + kwe->kwe_lockseq = uap->mgen; + kwe->kwe_uth = uth; + kwe->kwe_psynchretval = 0; + kwe->kwe_kwqqueue = NULL; lockseq = (uap->mgen & PTHRW_COUNT_MASK); if (firstfit == 0) { @@ -600,67 +787,105 @@ psynch_mutexwait(__unused proc_t p, struct psynch_mutexwait_args * uap, uint32_t error = ksyn_wqfind(mutex, mgen, ugen, 0, tid, flags, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_MTX), &kwq); if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_END, (uint32_t)mutex, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_END, (uint32_t)mutex, 1, 0xdeadbeef, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } ksyn_wqlock(kwq); - - if ((kwq->kw_pre_rwwc != 0) && ((ins_flags == FIRSTFIT) || (lockseq == kwq->kw_pre_lockseq ))) { + + if ((mgen & PTH_RWL_RETRYBIT) != 0) { + retry = 1; + mgen &= ~PTH_RWL_RETRYBIT; + } + + /* handle first the missed wakeups */ + if ((kwq->kw_pre_intrcount != 0) && + ((kwq->kw_pre_intrtype == PTH_RW_TYPE_WRITE)) && + (is_seqlower_eq(lockseq, (kwq->kw_pre_intrseq & PTHRW_COUNT_MASK)) != 0)) { + kwq->kw_pre_intrcount--; + kwe->kwe_psynchretval = kwq->kw_pre_intrretbits; + if (kwq->kw_pre_intrcount==0) + CLEAR_INTR_PREPOST_BITS(kwq); + ksyn_wqunlock(kwq); + *retval = kwe->kwe_psynchretval; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_NONE, (uint32_t)mutex, 0xfafafaf1, kwe->kwe_psynchretval, kwq->kw_pre_intrcount, 0); +#endif /* _PSYNCH_TRACE_ */ + goto out; + } + + if ((kwq->kw_pre_rwwc != 0) && ((ins_flags == FIRSTFIT) || ((lockseq & PTHRW_COUNT_MASK) == (kwq->kw_pre_lockseq & PTHRW_COUNT_MASK) ))) { /* got preposted lock */ kwq->kw_pre_rwwc--; if (kwq->kw_pre_rwwc == 0) { CLEAR_PREPOST_BITS(kwq); - kwq->kw_lastunlockseq = 0; + kwq->kw_lastunlockseq = PTHRW_RWL_INIT; + if (kwq->kw_inqueue == 0) { + updatebits = lockseq | (PTH_RWL_KBIT | PTH_RWL_EBIT); + } else { + updatebits = (kwq->kw_highseq & PTHRW_COUNT_MASK) | (PTH_RWL_KBIT | PTH_RWL_EBIT); + } + updatebits &= ~PTH_RWL_MTX_WAIT; + + kwe->kwe_psynchretval = updatebits; + + if (updatebits == 0) { + __FAILEDUSERTEST__("psynch_mutexwait(prepost): returning 0 lseq in mutexwait with no EBIT \n"); + } + ksyn_wqunlock(kwq); + *retval = updatebits; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 0xfefefef1, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); +#endif /* _PSYNCH_TRACE_ */ + goto out; } else { - panic("psynch_mutexwait: more than one prepost %d\n", (kwq->kw_pre_rwwc + 1)); + __FAILEDUSERTEST__("psynch_mutexwait: more than one prepost\n"); kwq->kw_pre_lockseq += PTHRW_INC; /* look for next one */ + ksyn_wqunlock(kwq); + error = EINVAL; + goto out; } - if (kwq->kw_inqueue == 0) { - updatebits = lockseq | 
PTHRW_EBIT; - } else { - updatebits = (kwq->kw_highseq & PTHRW_COUNT_MASK) | (PTHRW_EBIT | PTHRW_WBIT); - } - - uth->uu_psynchretval = updatebits; -#if __TESTPANICS__ - if ((updatebits & PTHRW_COUNT_MASK) == 0) - panic("psynch_mutexwait: (prepost)returning 0 lseq in mutexwait with EBIT \n"); -#endif /* __TESTPANICS__ */ - ksyn_wqunlock(kwq); - *retval = updatebits; - goto out; } - error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], mgen, uth, ins_flags); - if (error != 0) - panic("psynch_mutexwait: failed to enqueue\n"); +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 0xfeedfeed, mgen, ins_flags, 0); +#endif /* _PSYNCH_TRACE_ */ - error = ksyn_block_thread_locked(kwq, (uint64_t)0, uth); + error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], mgen, uth, kwe, ins_flags); + if (error != 0) { + ksyn_wqunlock(kwq); +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_END, (uint32_t)mutex, 2, 0xdeadbeef, error, 0); +#endif /* _PSYNCH_TRACE_ */ + goto out; + } + + error = ksyn_block_thread_locked(kwq, (uint64_t)0, kwe, 0); /* drops the wq lock */ if (error != 0) { ksyn_wqlock(kwq); + #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_NONE, (uint32_t)mutex, 2, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_NONE, (uint32_t)mutex, 3, 0xdeadbeef, error, 0); #endif /* _PSYNCH_TRACE_ */ - if (uth->uu_kwqqueue != NULL) - ksyn_queue_removeitem(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], uth); + if (kwe->kwe_kwqqueue != NULL) + ksyn_queue_removeitem(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], kwe); ksyn_wqunlock(kwq); } else { - updatebits = uth->uu_psynchretval; + updatebits = kwe->kwe_psynchretval; + updatebits &= ~PTH_RWL_MTX_WAIT; *retval = updatebits; -#if __TESTPANICS__ - if ((updatebits & PTHRW_COUNT_MASK) == 0) - panic("psynch_mutexwait: returning 0 lseq in mutexwait with EBIT \n"); -#endif /* __TESTPANICS__ */ + + if (updatebits == 0) + __FAILEDUSERTEST__("psynch_mutexwait: returning 0 lseq in mutexwait with no EBIT \n"); } out: - ksyn_wqrelease(kwq, NULL); + ksyn_wqrelease(kwq, NULL, 1, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_MTX)); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_END, (uint32_t)mutex, 0, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_MLWAIT | DBG_FUNC_END, (uint32_t)mutex, 0xeeeeeeed, updatebits, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); @@ -670,32 +895,26 @@ out: * psynch_mutexdrop: This system call is used for unlock postings on contended psynch mutexes. 
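The comparison macros deleted earlier in this hunk (and reintroduced above as the functions is_seqlower(), is_seqhigher(), and friends) pin down the ordering that all of this sequence arithmetic relies on: with only 24 significant bits, "lower" and "higher" are decided within a half-range window so the answers stay correct across wraparound. A direct transcription of the removed definitions:

    #include <stdint.h>

    #define PTHRW_MAX_READERS 0xffffff00U

    /* x precedes y either numerically, or because the gap is so large
     * that x must already have wrapped past y. */
    static int is_seqlower(uint32_t x, uint32_t y)
    {
        return (x < y) || ((x - y) > (PTHRW_MAX_READERS / 2));
    }

    static int is_seqhigher(uint32_t x, uint32_t y)
    {
        return (x > y) || ((y - x) > (PTHRW_MAX_READERS / 2));
    }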
*/ int -psynch_mutexdrop(__unused proc_t p, struct psynch_mutexdrop_args * uap, __unused uint32_t * retval) +psynch_mutexdrop(__unused proc_t p, struct psynch_mutexdrop_args * uap, uint32_t * retval) { user_addr_t mutex = uap->mutex; uint32_t mgen = uap->mgen; - uint32_t lkseq = mgen & PTHRW_COUNT_MASK; uint32_t ugen = uap->ugen; uint64_t tid = uap->tid; int flags = uap->flags; ksyn_wait_queue_t kwq; + uint32_t updateval; int error=0; -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_MLDROP | DBG_FUNC_START, (uint32_t)mutex, mgen, ugen, flags, 0); -#endif /* _PSYNCH_TRACE_ */ - error = ksyn_wqfind(mutex, mgen, ugen, 0, tid, flags, KSYN_WQTYPE_MTX, &kwq); + error = ksyn_wqfind(mutex, mgen, ugen, 0, tid, flags, (KSYN_WQTYPE_INDROP | KSYN_WQTYPE_MTX), &kwq); if (error != 0) { -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_MLDROP | DBG_FUNC_END, (uint32_t)mutex, 1, 0, error, 0); -#endif /* _PSYNCH_TRACE_ */ return(error); } - psynch_mutexdrop_internal(kwq, lkseq, ugen, flags); + + updateval = psynch_mutexdrop_internal(kwq, mgen, ugen, flags); /* drops the kwq reference */ -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_MLDROP | DBG_FUNC_END, (uint32_t)mutex, 0, 0, error, 0); -#endif /* _PSYNCH_TRACE_ */ + + *retval = updateval; return(0); } @@ -704,350 +923,261 @@ psynch_mutexdrop(__unused proc_t p, struct psynch_mutexdrop_args * uap, __unused * psynch_cvbroad: This system call is used for broadcast posting on blocked waiters of psynch cvars. */ int -psynch_cvbroad(__unused proc_t p, struct psynch_cvbroad_args * uap, int * retval) +psynch_cvbroad(__unused proc_t p, struct psynch_cvbroad_args * uap, uint32_t * retval) { user_addr_t cond = uap->cv; - uint32_t cgen = uap->cvgen; - uint32_t diffgen = uap->diffgen; - uint32_t mgen = uap->mgen; + uint64_t cvlsgen = uap->cvlsgen; + uint64_t cvudgen = uap->cvudgen; + uint32_t cgen, cugen, csgen, diffgen; + uint32_t uptoseq, fromseq; int flags = uap->flags; - ksyn_wait_queue_t kwq, ckwq; + ksyn_wait_queue_t ckwq; int error=0; -#if COND_MTX_WAITQUEUEMOVE - int mutexowned = flags & _PTHREAD_MTX_OPT_HOLDLOCK; - int nomutex = flags & _PTHREAD_MTX_OPT_NOHOLDLOCK; - user_addr_t mutex = uap->mutex; - uint32_t ugen = uap->ugen; - uint64_t tid = uap->tid; - uthread_t uth; - kern_return_t kret = KERN_SUCCESS; -#else /* COND_MTX_WAITQUEUEMOVE */ - int nomutex = _PTHREAD_MTX_OPT_NOHOLDLOCK; -#endif /* COND_MTX_WAITQUEUEMOVE */ - uint32_t nextgen, ngen; - int updatebits = 0; + uint32_t updatebits = 0; + uint32_t count; + struct ksyn_queue kfreeq; + + csgen = (uint32_t)((cvlsgen >> 32) & 0xffffffff); + cgen = ((uint32_t)(cvlsgen & 0xffffffff)); + cugen = (uint32_t)((cvudgen >> 32) & 0xffffffff); + diffgen = ((uint32_t)(cvudgen & 0xffffffff)); + count = (diffgen >> PTHRW_COUNT_SHIFT); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVBROAD | DBG_FUNC_START, (uint32_t)cond, (uint32_t) 0, cgen, mgen, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVBROAD | DBG_FUNC_START, (uint32_t)cond, cgen, cugen, csgen, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVBROAD | DBG_FUNC_NONE, (uint32_t)cond, 0xcbcbcbc1, diffgen,flags, 0); #endif /* _PSYNCH_TRACE_ */ - error = ksyn_wqfind(cond, cgen, cgen, 0, 0, flags, KSYN_WQTYPE_CVAR, &ckwq); + + uptoseq = cgen & PTHRW_COUNT_MASK; + fromseq = (cugen & PTHRW_COUNT_MASK) + PTHRW_INC; + + if (is_seqhigher(fromseq, uptoseq) || is_seqhigher((csgen & PTHRW_COUNT_MASK), uptoseq)) { + __FAILEDUSERTEST__("cvbroad: invalid L, U and S values\n"); + return EINVAL; + } + if (count > (uint32_t)task_threadmax) { + 
__FAILEDUSERTEST__("cvbroad: difference greater than maximum possible thread count\n"); + return EBUSY; + } + + ckwq = NULL; + + error = ksyn_wqfind(cond, cgen, cugen, csgen, 0, flags, (KSYN_WQTYPE_CVAR | KSYN_WQTYPE_INDROP), &ckwq); if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVBROAD | DBG_FUNC_END, (uint32_t)cond, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVBROAD | DBG_FUNC_END, (uint32_t)cond, 0, 0xdeadbeef, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } -#if COND_MTX_WAITQUEUEMOVE - ngen = mgen + (PTHRW_INC * diffgen); - if (nomutex ==0) { - error = ksyn_wqfind(mutex, ngen, ugen, 0, tid, flags, KSYN_WQTYPE_MTX, &kwq); - if (error != 0) { - kwq = NULL; - goto out; - } - } -#else /* COND_MTX_WAITQUEUEMOVE */ - nomutex = _PTHREAD_MTX_OPT_NOHOLDLOCK; - kwq= NULL; - ngen = 0; -#endif /* COND_MTX_WAITQUEUEMOVE */ - + *retval = 0; ksyn_wqlock(ckwq); -#if COND_MTX_WAITQUEUEMOVE -redrive: -#endif /* COND_MTX_WAITQUEUEMOVE */ - if (diffgen > ckwq->kw_inqueue) { - ckwq->kw_pre_rwwc = diffgen - ckwq->kw_inqueue; - ckwq->kw_pre_lockseq = cgen & PTHRW_BIT_MASK; - updatebits = ckwq->kw_pre_rwwc; /* unused mutex refs */ - nextgen = (mgen + (ckwq->kw_pre_rwwc * PTHRW_INC)); - } else { - updatebits = 0; - nextgen = mgen + PTHRW_INC; - } - - if (ckwq->kw_inqueue != 0) { -#if COND_MTX_WAITQUEUEMOVE - if (mutexowned != 0) { -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVBROAD | DBG_FUNC_NONE, (uint32_t)cond, 0, 1, ckwq->kw_inqueue, 0); -#endif /* _PSYNCH_TRACE_ */ - uth = ksyn_queue_removefirst(&ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER],ckwq); - uth->uu_psynchretval = ngen; - uth->uu_kwqqueue = NULL; - kret = ksyn_wakeup_thread(ckwq, uth); - if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) - panic("cvbraoad: failed to remove\n"); - if (kret == KERN_NOT_WAITING) { - /* - * trying to wake one thread to return, so if - * failed to wakeup get the next one.. - */ - goto redrive; - } - nextgen = nextgen + PTHRW_INC; - diffgen -= 1; - } -#else /* COND_MTX_WAITQUEUEMOVE */ - updatebits = 0; -#endif /* COND_MTX_WAITQUEUEMOVE */ - - /* nomutex case or in mutexowned case after the first one */ - /* move them all to the mutex waitqueue */ - if ((ckwq->kw_inqueue != 0) && (diffgen > 0)) { - /* atleast one more posting needed and there are waiting threads */ - /* drops the ckwq lock */ -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVBROAD | DBG_FUNC_NONE, (uint32_t)cond, 0, 2, diffgen, 0); -#endif /* _PSYNCH_TRACE_ */ - /* move threads from ckwq to kwq if COND_MTX_WAITQUEUEMOVE, else wakeup */ - ksyn_move_wqthread(ckwq, kwq, nextgen, ngen, diffgen, nomutex); - } else - ksyn_wqunlock(ckwq); - } else { - /* no need for prepost as it is covered before */ - ksyn_wqunlock(ckwq); - } + /* update L, U and S... 
*/ + UPDATE_CVKWQ(ckwq, cgen, cugen, csgen, 0, KSYN_WQTYPE_CVAR); - if (error == 0) { - *retval = updatebits; - } + /* broadcast wakeups/prepost handling */ + ksyn_handle_cvbroad(ckwq, uptoseq, &updatebits); -#if COND_MTX_WAITQUEUEMOVE -out: -#endif /* COND_MTX_WAITQUEUEMOVE */ - ksyn_wqrelease(ckwq, kwq); + /* set C or P bits and free if needed */ + ckwq->kw_sword += (updatebits & PTHRW_COUNT_MASK); + ksyn_cvupdate_fixup(ckwq, &updatebits, &kfreeq, 1); + ksyn_wqunlock(ckwq); + + *retval = updatebits; + + ksyn_wqrelease(ckwq, NULL, 1, (KSYN_WQTYPE_INDROP | KSYN_WQTYPE_CVAR)); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVBROAD | DBG_FUNC_END, (uint32_t)cond, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVBROAD | DBG_FUNC_END, (uint32_t)cond, 0xeeeeeeed, (uint32_t)*retval, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } +ksyn_waitq_element_t +ksyn_queue_find_threadseq(ksyn_wait_queue_t ckwq, __unused ksyn_queue_t kq, thread_t th, uint32_t upto) +{ + uthread_t uth = get_bsdthread_info(th); + ksyn_waitq_element_t kwe = &uth->uu_kwe; + + if (kwe->kwe_kwqqueue != ckwq || + is_seqhigher((kwe->kwe_lockseq & PTHRW_COUNT_MASK), upto)) { + /* the thread is not waiting in the cv (or wasn't when the wakeup happened) */ + return NULL; + } + return kwe; +} + /* * psynch_cvsignal: This system call is used for signalling the blocked waiters of psynch cvars. */ int -psynch_cvsignal(__unused proc_t p, struct psynch_cvsignal_args * uap, int * retval) +psynch_cvsignal(__unused proc_t p, struct psynch_cvsignal_args * uap, uint32_t * retval) { user_addr_t cond = uap->cv; - uint32_t cgen = uap->cvgen; + uint64_t cvlsgen = uap->cvlsgen; + uint32_t cgen, csgen, signalseq, uptoseq; uint32_t cugen = uap->cvugen; - uint32_t mgen = uap->mgen; int threadport = uap->thread_port; int flags = uap->flags; - ksyn_wait_queue_t kwq, ckwq; - int error=0, kret; - uthread_t uth; -#if USE_WAITQUEUE - thread_t th = THREAD_NULL, mth; -#else /* USE_WAITQUEUE */ + ksyn_wait_queue_t ckwq = NULL; + ksyn_waitq_element_t kwe, nkwe = NULL; + ksyn_queue_t kq; + int error=0; thread_t th = THREAD_NULL; -#endif /* USE_WAITQUEUE */ -#if COND_MTX_WAITQUEUEMOVE - user_addr_t mutex = uap->mutex; - uint32_t ugen = uap->ugen; - int mutexowned = flags & _PTHREAD_MTX_OPT_HOLDLOCK; - int nomutex = flags & _PTHREAD_MTX_OPT_NOHOLDLOCK; -#else /* COND_MTX_WAITQUEUEMOVE */ - int nomutex = _PTHREAD_MTX_OPT_NOHOLDLOCK; -#endif /* COND_MTX_WAITQUEUEMOVE */ - uint32_t retbits, ngen, lockseq; + uint32_t updatebits = 0; + kern_return_t kret; + struct ksyn_queue kfreeq; - if (nomutex != 0) - retbits = 0; - else - retbits = 1; -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_START, (uint32_t)cond, (uint32_t) 0, cgen, mgen, 0); - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_NONE, (uint32_t)cond, (uint32_t)cugen , flags, mgen, 0); -#endif /* _PSYNCH_TRACE_ */ + csgen = (uint32_t)((cvlsgen >> 32) & 0xffffffff); + cgen = ((uint32_t)(cvlsgen & 0xffffffff)); - error = ksyn_wqfind(cond, cgen, cugen, 0, 0, flags, KSYN_WQTYPE_CVAR, &ckwq); - if (error != 0) { - *retval = retbits; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_END, (uint32_t)cond, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_START, (uint32_t)cond, cgen, cugen, threadport, 0); #endif /* _PSYNCH_TRACE_ */ - return(error); - } - - - if ((flags & _PTHREAD_MTX_OPT_LASTDROP) == _PTHREAD_MTX_OPT_LASTDROP) { - - ksyn_wqlock(ckwq); - lockseq = cgen & PTHRW_COUNT_MASK; - /* do we need to 
check for lockseq as this is from last waiter, may be race ? */ - if ((ckwq->kw_pre_rwwc != 0) && (is_seqlower_eq(lockseq, ckwq->kw_pre_lockseq) != 0)) { - ckwq->kw_pre_rwwc--; - if (ckwq->kw_pre_rwwc == 0) - CLEAR_PREPOST_BITS(ckwq); - } - ksyn_wqunlock(ckwq); - /* no mutex or thread is associated with this, just notificaion */ - th = THREAD_NULL; - error = 0; - goto out; - } - ngen = mgen + PTHRW_INC; + uptoseq = cgen & PTHRW_COUNT_MASK; + signalseq = (cugen & PTHRW_COUNT_MASK) + PTHRW_INC; -#if COND_MTX_WAITQUEUEMOVE - if (nomutex == 0) { - /* mutex was not operated on, ignore it */ - error = ksyn_wqfind(mutex, ngen, ugen, 0, 0, flags, KSYN_WQTYPE_MTX, &kwq); - if (error != 0) { - *retval = retbits; - kwq = NULL; - goto out; - } - } else { -#endif /* COND_MTX_WAITQUEUEMOVE */ - kwq = NULL; -#if COND_MTX_WAITQUEUEMOVE + /* validate sane L, U, and S values */ + if (((threadport == 0) && (is_seqhigher(signalseq, uptoseq))) || is_seqhigher((csgen & PTHRW_COUNT_MASK), uptoseq)) { + __FAILEDUSERTEST__("psync_cvsignal; invalid sequence numbers\n"); + error = EINVAL; + goto out; } -#endif /* COND_MTX_WAITQUEUEMOVE */ - + /* If we are looking for a specific thread, grab a reference for it */ if (threadport != 0) { th = (thread_t)port_name_to_thread((mach_port_name_t)threadport); if (th == THREAD_NULL) { - *retval = retbits; error = ESRCH; goto out; } } - ksyn_wqlock(ckwq); -redrive: - if (ckwq->kw_inqueue != 0) { - *retval = 0; -#if COND_MTX_WAITQUEUEMOVE - if ((mutexowned != 0) || (nomutex != 0)) { + error = ksyn_wqfind(cond, cgen, cugen, csgen, 0, flags, (KSYN_WQTYPE_CVAR | KSYN_WQTYPE_INDROP), &ckwq); + if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_NONE, (uint32_t)cond, 0, 1, ckwq->kw_inqueue, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_END, (uint32_t)cond, 0, 0xdeadbeef, error, 0); +#endif /* _PSYNCH_TRACE_ */ + goto out; + } + + ksyn_wqlock(ckwq); + + /* update L, U and S... */ + UPDATE_CVKWQ(ckwq, cgen, cugen, csgen, 0, KSYN_WQTYPE_CVAR); + + kq = &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER]; + +retry: + /* Only bother if we aren't already balanced */ + if ((ckwq->kw_lword & PTHRW_COUNT_MASK) != (ckwq->kw_sword & PTHRW_COUNT_MASK)) { + + kwe = (th != NULL) ? ksyn_queue_find_threadseq(ckwq, kq, th, uptoseq) : + ksyn_queue_find_signalseq(ckwq, kq, uptoseq, signalseq); + if (kwe != NULL) { + switch (kwe->kwe_flags) { + + case KWE_THREAD_BROADCAST: + /* broadcasts swallow our signal */ + break; + + case KWE_THREAD_PREPOST: + /* merge in with existing prepost at our same uptoseq */ + kwe->kwe_count += 1; + break; + + case KWE_THREAD_INWAIT: + if (is_seqlower((kwe->kwe_lockseq & PTHRW_COUNT_MASK), signalseq)) { + /* + * A valid thread in our range, but lower than our signal. + * Matching it may leave our match with nobody to wake it if/when + * it arrives (the signal originally meant for this thread might + * not successfully wake it). + * + * Convert to broadcast - may cause some spurious wakeups + * (allowed by spec), but avoids starvation (better choice). 
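+ * The is_seqlower()/is_seqhigher() tests used throughout are
+ * wraparound-safe orderings on the 32-bit sequence words. One common
+ * way to write such a comparison, a sketch of the idea only and not
+ * necessarily the exact helper defined in this file:
+ *
+ *	#include <stdint.h>
+ *
+ *	/* true when a precedes b on the 32-bit sequence ring */
+ *	static int seq_lower(uint32_t a, uint32_t b)
+ *	{
+ *		return (int32_t)(a - b) < 0;
+ *	}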
+ */
+#if _PSYNCH_TRACE_
+ __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_NONE, (uint32_t)ckwq->kw_addr, 0xc1c1c1c1, uptoseq, 0, 0);
#endif /* _PSYNCH_TRACE_ */
- if (th != THREAD_NULL) {
- uth = get_bsdthread_info(th);
- if (nomutex != 0)
- ngen |= PTHRW_MTX_NONE;
- uth->uu_psynchretval = ngen;
- uth->uu_kwqqueue = NULL;
- ksyn_queue_removeitem(ckwq, &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER], uth);
- kret = ksyn_wakeup_thread(ckwq, uth);
- if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING))
- panic("psynch_cvsignal: panic waking in cvsignal\n");
- if (kret == KERN_NOT_WAITING) {
- if (threadport != 0) {
- error = 0;
- } else
- goto redrive;
- }
- } else {
- uth = ksyn_queue_removefirst(&ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER],ckwq);
- if (nomutex != 0)
- ngen |= PTHRW_MTX_NONE;
- uth->uu_psynchretval = ngen;
- uth->uu_kwqqueue = NULL;
- kret = ksyn_wakeup_thread(ckwq, uth);
- if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING))
- panic("psynch_cvsignal: panic waking in cvsignal\n");
- if (kret == KERN_NOT_WAITING) {
- if (threadport != 0) {
- error = 0;
- } else
- goto redrive;
- }
- }
- ksyn_wqunlock(ckwq);
- } else {
-#endif /* COND_MTX_WAITQUEUEMOVE */
- /* need to move a thread to another queue */
+ ksyn_handle_cvbroad(ckwq, uptoseq, &updatebits);
+ } else {
+ ksyn_queue_removeitem(ckwq, kq, kwe);
+ kwe->kwe_psynchretval = PTH_RWL_MTX_WAIT;
+ kwe->kwe_kwqqueue = NULL;
#if _PSYNCH_TRACE_
- KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_NONE, (uint32_t)cond, 0, 2, ckwq->kw_inqueue, 0);
+ __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_NONE, (uint32_t)ckwq->kw_addr, 0xcafecaf2, (uint32_t)(thread_tid((struct thread *)(((struct uthread *)(kwe->kwe_uth))->uu_context.vc_thread))), kwe->kwe_psynchretval, 0);
#endif /* _PSYNCH_TRACE_ */
- if (th != THREAD_NULL) {
- uth = get_bsdthread_info(th);
- /* if given thread not blocked in cvwait, return error */
- if (uth->uu_kwqqueue != ckwq) {
- error = EINVAL;
- ksyn_wqunlock(ckwq);
- goto out
+ kret = ksyn_wakeup_thread(ckwq, kwe);
+#if __TESTPANICS__
+ if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING))
+ panic("ksyn_wakeup_thread: panic waking up condition waiter\n");
+#endif /* __TESTPANICS__ */
+ updatebits += PTHRW_INC;
}
- ksyn_queue_removeitem(ckwq, &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER], uth);
- } else {
- uth = ksyn_queue_removefirst(&ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER],ckwq);
- if (uth == NULL)
- panic("cvsign: null uthread after rem");
- }
-#if COND_MTX_WAITQUEUEMOVE
- ksyn_wqunlock(ckwq);
-#else /* COND_MTX_WAITQUEUEMOVE */
- uth->uu_psynchretval = 0;
- uth->uu_kwqqueue = NULL;
- kret = ksyn_wakeup_thread(ckwq, uth);
- if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING))
- panic("psynch_cvsignal: panic waking in cvsignal\n");
- if (kret == KERN_NOT_WAITING) {
- error = 0;
- if (threadport == 0)
- goto redrive;
+
+ ckwq->kw_sword += (updatebits & PTHRW_COUNT_MASK);
+ break;
+
+ default:
+ panic("unknown kweflags\n");
+ break;
}
-
+
+ } else if (th != NULL) {
+ /*
+ * Could not find the thread, post a broadcast,
+ * otherwise the waiter will be stuck. We used to send
+ * ESRCH here, but that led to rare hangs.
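+ * The target can legitimately be gone by the time the signal arrives:
+ * it may have timed out, been interrupted, or already been woken for a
+ * different sequence. Falling back to a broadcast is conservative; at
+ * worst it produces a spurious wakeup, which waiters must tolerate
+ * anyway.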
+ */ + ksyn_handle_cvbroad(ckwq, uptoseq, &updatebits); + ckwq->kw_sword += (updatebits & PTHRW_COUNT_MASK); + } else if (nkwe == NULL) { ksyn_wqunlock(ckwq); - error = 0; -#endif /* COND_MTX_WAITQUEUEMOVE */ - -#if COND_MTX_WAITQUEUEMOVE - ksyn_wqlock(kwq); - ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], ngen, uth, SEQFIT); -#if USE_WAITQUEUE - kret = wait_queue_move_thread(&ckwq->kw_wq, ckwq->kw_addr, th, &kwq->kw_wq, kwq->kw_addr, &mth); - if (kret == KERN_SUCCESS) { - if (mth != THREAD_NULL) { - uth = (struct uthread *)get_bsdthread_info(mth); - uth->uu_lockseq = ngen; - TAILQ_INSERT_TAIL(&kwq->kw_ksynqueues[KSYN_QUEUE_WRITER].ksynq_uthlist, uth, uu_mtxlist); - } - } -#else /* USE_WAITQUEUE */ - /* no need to move anything, just update the sequence */ - uth->uu_lockseq = ngen; - -#endif /* USE_WAITQUEUE */ - ksyn_wqunlock(kwq); - } -#endif /* COND_MTX_WAITQUEUEMOVE */ - } else { - /* prepost */ + nkwe = (ksyn_waitq_element_t)zalloc(kwe_zone); + ksyn_wqlock(ckwq); + goto retry; + + } else { + /* no eligible entries - add prepost */ + bzero(nkwe, sizeof(struct ksyn_waitq_element)); + nkwe->kwe_kwqqueue = ckwq; + nkwe->kwe_flags = KWE_THREAD_PREPOST; + nkwe->kwe_lockseq = uptoseq; + nkwe->kwe_count = 1; + nkwe->kwe_uth = NULL; + nkwe->kwe_psynchretval = 0; + #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_NONE, (uint32_t)cond, 0, 3, ckwq->kw_inqueue, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_NONE, (uint32_t)ckwq->kw_addr, 0xfeedfefe, uptoseq, 0, 0); #endif /* _PSYNCH_TRACE_ */ - if (threadport != 0) { - error = EINVAL; - ksyn_wqunlock(ckwq); - goto out; + + (void)ksyn_queue_insert(ckwq, &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER], uptoseq, NULL, nkwe, SEQFIT); + ckwq->kw_fakecount++; + nkwe = NULL; } - - ckwq->kw_pre_rwwc++; - ckwq->kw_attq = kwq; - ckwq->kw_pre_lockseq = cgen & PTHRW_BIT_MASK; - ckwq->kw_pre_cvretval = ngen; - *retval = retbits; - ksyn_wqunlock(ckwq); + + /* set C or P bits and free if needed */ + ksyn_cvupdate_fixup(ckwq, &updatebits, &kfreeq, 1); } - /* ckwq is unlocked here */ - + + ksyn_wqunlock(ckwq); + if (nkwe != NULL) + zfree(kwe_zone, nkwe); + + ksyn_wqrelease(ckwq, NULL, 1, (KSYN_WQTYPE_INDROP | KSYN_WQTYPE_CVAR)); + out: - ksyn_wqrelease(ckwq, kwq); - if (th != THREAD_NULL) + if (th != NULL) thread_deallocate(th); + if (error == 0) + *retval = updatebits; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_END, (uint32_t)cond, 0, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSIGNAL | DBG_FUNC_END, (uint32_t)cond, 0xeeeeeeed, updatebits, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); @@ -1060,112 +1190,318 @@ int psynch_cvwait(__unused proc_t p, struct psynch_cvwait_args * uap, uint32_t * retval) { user_addr_t cond = uap->cv; - uint32_t cgen = uap->cvgen; + uint64_t cvlsgen = uap->cvlsgen; + uint32_t cgen, csgen; uint32_t cugen = uap->cvugen; user_addr_t mutex = uap->mutex; - uint32_t mgen =0, ugen; - int flags = 0; + uint64_t mugen = uap->mugen; + uint32_t mgen, ugen; + int flags = uap->flags; ksyn_wait_queue_t kwq, ckwq; - int error=0; + int error=0, local_error = 0; uint64_t abstime = 0; - uint32_t lockseq, updatebits; + uint32_t lockseq, updatebits=0; struct timespec ts; uthread_t uth; - + ksyn_waitq_element_t kwe, nkwe = NULL; + struct ksyn_queue *kq, kfreeq; +#if __TESTPANICS__ + //int timeoutval = 3; /* 3 secs */ + //u_int64_t ntime = 0; +#endif /* __TESTPANICS__ */ + /* for conformance reasons */ __pthread_testcancel(0); + csgen = (uint32_t)((cvlsgen >> 
32) & 0xffffffff); + cgen = ((uint32_t)(cvlsgen & 0xffffffff)); + ugen = (uint32_t)((mugen >> 32) & 0xffffffff); + mgen = ((uint32_t)(mugen & 0xffffffff)); + #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_START, (uint32_t)cond, (uint32_t) mutex, cgen, mgen, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_START, (uint32_t)cond, cgen, cugen, csgen, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_NONE, (uint32_t)mutex, mgen, ugen, flags, 0); #endif /* _PSYNCH_TRACE_ */ - flags = 0; - if ((uap->usec & 0xc0000000) != 0) { - if (uap->usec & 0x40000000) - flags |= PTHREAD_PROCESS_SHARED; - if (uap->usec & 0x80000000) - flags |= _PTHREAD_MUTEX_POLICY_FIRSTFIT; + + lockseq = (cgen & PTHRW_COUNT_MASK); + /* + * In cvwait U word can be out of range as cond could be used only for + * timeouts. However S word needs to be within bounds and validated at + * user level as well. + */ + if (is_seqhigher_eq((csgen & PTHRW_COUNT_MASK), lockseq) != 0) { + __FAILEDUSERTEST__("psync_cvwait; invalid sequence numbers\n"); + return EINVAL; } - - error = ksyn_wqfind(cond, cgen, cugen, 0, 0, flags, KSYN_WQTYPE_CVAR | KSYN_WQTYPE_INWAIT, &ckwq); + + ckwq = kwq = NULL; + error = ksyn_wqfind(cond, cgen, cugen, csgen, 0, flags, KSYN_WQTYPE_CVAR | KSYN_WQTYPE_INWAIT, &ckwq); if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_END, (uint32_t)cond, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_END, (uint32_t)cond, 1, 0xdeadbeef, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } - if (mutex != (user_addr_t)0) { - mgen = uap->mgen; - ugen = uap->ugen; +#if __TESTPANICS__ + //clock_interval_to_deadline(timeoutval, NSEC_PER_SEC, &ntime); +#endif /* __TESTPANICS__ */ - error = ksyn_wqfind(mutex, mgen, ugen, 0, 0, flags, KSYN_WQTYPE_MTX, &kwq); { - if (error != 0) + if (mutex != (user_addr_t)0) { + error = ksyn_wqfind(mutex, mgen, ugen, 0, 0, flags, (KSYN_WQTYPE_INDROP | KSYN_WQTYPE_MTX), &kwq); + if (error != 0) { + local_error = error; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_END, (uint32_t)mutex, 2, 0xdeadbeef, error, 0); +#endif /* _PSYNCH_TRACE_ */ goto out; } - psynch_mutexdrop_internal(kwq, mgen, ugen, flags); + (void)psynch_mutexdrop_internal(kwq, mgen, ugen, flags); /* drops kwq reference */ + kwq = NULL; } - uth = current_uthread(); - uth->uu_lockseq = cgen; - lockseq = (cgen & PTHRW_COUNT_MASK); - - if (uap->sec != 0 || (uap->usec & 0x3fffffff) != 0) { + if (uap->sec != 0 || (uap->nsec & 0x3fffffff) != 0) { ts.tv_sec = uap->sec; - ts.tv_nsec = (uap->usec & 0xc0000000); - nanoseconds_to_absolutetime((uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec, &abstime ); - clock_absolutetime_interval_to_deadline( abstime, &abstime ); + ts.tv_nsec = (uap->nsec & 0x3fffffff); + nanoseconds_to_absolutetime((uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec, &abstime ); + clock_absolutetime_interval_to_deadline( abstime, &abstime ); } + ksyn_wqlock(ckwq); - if ((ckwq->kw_pre_rwwc != 0) && (is_seqlower_eq(lockseq, ckwq->kw_pre_lockseq) != 0)) { + + /* update L, U and S... 
*/ + UPDATE_CVKWQ(ckwq, cgen, cugen, csgen, 0, KSYN_WQTYPE_CVAR); + + /* Look for the sequence for prepost (or conflicting thread */ + kq = &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER]; + kwe = ksyn_queue_find_cvpreposeq(kq, lockseq); + + if (kwe != NULL) { + switch (kwe->kwe_flags) { + + case KWE_THREAD_INWAIT: + ksyn_wqunlock(ckwq); + __FAILEDUSERTEST__("cvwait: thread entry with same sequence already present\n"); + local_error = EBUSY; + goto out; + + case KWE_THREAD_BROADCAST: + break; + + case KWE_THREAD_PREPOST: + if ((kwe->kwe_lockseq & PTHRW_COUNT_MASK) == lockseq) { + /* we can safely consume a reference, so do so */ + if (--kwe->kwe_count == 0) { + ksyn_queue_removeitem(ckwq, kq, kwe); + ckwq->kw_fakecount--; + nkwe = kwe; + } + } else { + /* + * consuming a prepost higher than our lock sequence is valid, but + * can leave the higher thread without a match. Convert the entry + * to a broadcast to compensate for this. + */ #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_NONE, (uint32_t)cond, 0, 1, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_NONE, (uint32_t)ckwq->kw_addr, 0xc2c2c2c2, kwe->kwe_lockseq, 0, 0); #endif /* _PSYNCH_TRACE_ */ + + ksyn_handle_cvbroad(ckwq, kwe->kwe_lockseq, &updatebits); +#if __TESTPANICS__ + if (updatebits != 0) + panic("psync_cvwait: convert pre-post to broadcast: woke up %d threads that shouldn't be there\n", + updatebits); +#endif /* __TESTPANICS__ */ + } + + break; -#if COND_MTX_WAITQUEUEMOVE - updatebits = ckwq->kw_pre_cvretval | PTHRW_MTX_NONE; -#else /* COND_MTX_WAITQUEUEMOVE */ - updatebits = 0; -#endif /* COND_MTX_WAITQUEUEMOVE */ - ckwq->kw_pre_rwwc--; - if (ckwq->kw_pre_rwwc == 0) - CLEAR_PREPOST_BITS(ckwq); - *retval = updatebits; + default: + panic("psync_cvwait: unexpected wait queue element type\n"); + } + +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_NONE, (uint32_t)ckwq->kw_addr, 0xfefefefe, kwe->kwe_lockseq, 0, 0); +#endif /* _PSYNCH_TRACE_ */ + + + updatebits = PTHRW_INC; + ckwq->kw_sword += PTHRW_INC; + + /* set C or P bits and free if needed */ + ksyn_cvupdate_fixup(ckwq, &updatebits, &kfreeq, 1); + error = 0; + local_error = 0; + + *retval = updatebits; + ksyn_wqunlock(ckwq); + + if (nkwe != NULL) + zfree(kwe_zone, nkwe); + goto out; + + } - } else { + uth = current_uthread(); + kwe = &uth->uu_kwe; + kwe->kwe_kwqqueue = ckwq; + kwe->kwe_flags = KWE_THREAD_INWAIT; + kwe->kwe_lockseq = lockseq; + kwe->kwe_count = 1; + kwe->kwe_uth = uth; + kwe->kwe_psynchretval = 0; + #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_NONE, (uint32_t)cond, 0, 2, cgen, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_NONE, (uint32_t)ckwq->kw_addr, 0xfeedfeed, cgen, 0, 0); #endif /* _PSYNCH_TRACE_ */ - error = ksyn_queue_insert(ckwq, &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER], cgen, uth, FIRSTFIT); - if (error != 0) - panic("psynch_cvwait: failed to enqueue\n"); - error = ksyn_block_thread_locked(ckwq, abstime, uth); - /* drops the lock */ + + error = ksyn_queue_insert(ckwq, kq, cgen, uth, kwe, SEQFIT); + if (error != 0) { + ksyn_wqunlock(ckwq); + local_error = error; + goto out; } + +#if 0 /* __TESTPANICS__ */ + /* if no timeout is passed, set 5 secs timeout to catch hangs */ + error = ksyn_block_thread_locked(ckwq, (abstime == 0) ? 
ntime : abstime, kwe, 1);
+#else
+ error = ksyn_block_thread_locked(ckwq, abstime, kwe, 1);
+#endif /* __TESTPANICS__ */
+ /* lock dropped */
+
+ local_error = error;
if (error != 0) {
ksyn_wqlock(ckwq);
+ /* just in case it got woken up as we were granting */
+ *retval = kwe->kwe_psynchretval;
+
+#if __TESTPANICS__
+ if ((kwe->kwe_kwqqueue != NULL) && (kwe->kwe_kwqqueue != ckwq))
+ panic("cvwait waiting on some other kwq\n");
+
+#endif /* __TESTPANICS__ */
+
+
+ if (kwe->kwe_kwqqueue != NULL) {
+ ksyn_queue_removeitem(ckwq, &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER], kwe);
+ kwe->kwe_kwqqueue = NULL;
+ }
+ if ((kwe->kwe_psynchretval & PTH_RWL_MTX_WAIT) != 0) {
+ /* the condition variable was granted;
+ * reset the error so that the thread returns success.
+ */
+ local_error = 0;
+ /* no need to set any bits; just return, as cvsignal/cvbroad covers this */
+ ksyn_wqunlock(ckwq);
+ *retval = 0;
+ goto out;
+ }
+
+ ckwq->kw_sword += PTHRW_INC;
+
+ /* set C and P bits, in the local error as well as updatebits */
+ if ((ckwq->kw_lword & PTHRW_COUNT_MASK) == (ckwq->kw_sword & PTHRW_COUNT_MASK)) {
+ updatebits |= PTH_RWS_CV_CBIT;
+ local_error |= ECVCERORR;
+ if (ckwq->kw_inqueue != 0) {
+ (void)ksyn_queue_move_tofree(ckwq, kq, (ckwq->kw_lword & PTHRW_COUNT_MASK), &kfreeq, 1, 1);
+ }
+ ckwq->kw_lword = ckwq->kw_uword = ckwq->kw_sword = 0;
+ ckwq->kw_kflags |= KSYN_KWF_ZEROEDOUT;
+ } else {
+ /* is everything in the queue a fake entry? */
+ if ((ckwq->kw_inqueue != 0) && (ckwq->kw_fakecount == ckwq->kw_inqueue)) {
+ updatebits |= PTH_RWS_CV_PBIT;
+ local_error |= ECVPERORR;
+ }
+ }
+ ksyn_wqunlock(ckwq);
+
+ } else {
+ /* PTH_RWL_MTX_WAIT is removed */
+ if ((kwe->kwe_psynchretval & PTH_RWS_CV_MBIT) != 0)
+ *retval = PTHRW_INC | PTH_RWS_CV_CBIT;
+ else
+ *retval = 0;
+ local_error = 0;
+ }
out:
+#if _PSYNCH_TRACE_
+ __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_END, (uint32_t)cond, 0xeeeeeeed, (uint32_t)*retval, local_error, 0);
+#endif /* _PSYNCH_TRACE_ */
+ ksyn_wqrelease(ckwq, NULL, 1, (KSYN_WQTYPE_INWAIT | KSYN_WQTYPE_CVAR));
+ return(local_error);
+}
+
+/*
+ * psynch_cvclrprepost: This system call clears pending prepost if present.
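+ * This backs the C ("clear") and P ("prepost") bits that psynch_cvwait
+ * and psynch_cvbroad can hand back to user space: on seeing those bits
+ * the library calls down here so the kernel can drop prepost records
+ * for the condvar, or stale FIRSTFIT preposts for a mutex (selected by
+ * _PTHREAD_MTX_OPT_MUTEX in flags), that no waiter will ever consume.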
+ */ +int +psynch_cvclrprepost(__unused proc_t p, struct psynch_cvclrprepost_args * uap, __unused int * retval) +{ + user_addr_t cond = uap->cv; + uint32_t cgen = uap->cvgen; + uint32_t cugen = uap->cvugen; + uint32_t csgen = uap->cvsgen; + uint32_t pseq = uap->preposeq; + uint32_t flags = uap->flags; + int error; + ksyn_wait_queue_t ckwq = NULL; + struct ksyn_queue kfreeq; + #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_NONE, (uint32_t)cond, 0, 3, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CLRPRE | DBG_FUNC_START, (uint32_t)cond, cgen, cugen, csgen, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CLRPRE | DBG_FUNC_NONE, (uint32_t)cond, 0xcececece, pseq, flags, 0); #endif /* _PSYNCH_TRACE_ */ - if (uth->uu_kwqqueue != NULL) { - ksyn_queue_removeitem(ckwq, &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER], uth); + + if ((flags & _PTHREAD_MTX_OPT_MUTEX) == 0) { + error = ksyn_wqfind(cond, cgen, cugen, csgen, 0, flags, (KSYN_WQTYPE_CVAR | KSYN_WQTYPE_INDROP), &ckwq); + if (error != 0) { + *retval = 0; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CLRPRE | DBG_FUNC_END, (uint32_t)cond, 0, 0xdeadbeef, error, 0); +#endif /* _PSYNCH_TRACE_ */ + return(error); } + + ksyn_wqlock(ckwq); + (void)ksyn_queue_move_tofree(ckwq, &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER], (pseq & PTHRW_COUNT_MASK), &kfreeq, 0, 1); ksyn_wqunlock(ckwq); - } else { - *retval = uth->uu_psynchretval; + ksyn_wqrelease(ckwq, NULL, 1, (KSYN_WQTYPE_CVAR | KSYN_WQTYPE_INDROP)); + } else { + /* mutex type */ + error = ksyn_wqfind(cond, cgen, cugen, 0, 0, flags, (KSYN_WQTYPE_MTX | KSYN_WQTYPE_INDROP), &ckwq); + if (error != 0) { + *retval = 0; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CLRPRE | DBG_FUNC_END, (uint32_t)cond, 0, 0xdeadbeef, error, 0); +#endif /* _PSYNCH_TRACE_ */ + return(error); + } + ksyn_wqlock(ckwq); + if (((flags & _PTHREAD_MUTEX_POLICY_FIRSTFIT) != 0) && (ckwq->kw_pre_rwwc != 0)) { + if (is_seqlower_eq(ckwq->kw_pre_lockseq, cgen) != 0) { + /* clear prepost */ + ckwq->kw_pre_rwwc = 0; + ckwq->kw_pre_lockseq = 0; + } + } + ksyn_wqunlock(ckwq); + ksyn_wqrelease(ckwq, NULL, 1, (KSYN_WQTYPE_MTX | KSYN_WQTYPE_INDROP)); } -out: + #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_CVWAIT | DBG_FUNC_END, (uint32_t)cond, 0, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CLRPRE | DBG_FUNC_END, (uint32_t)cond, 0xeeeeeeed, 0, 0, 0); #endif /* _PSYNCH_TRACE_ */ - ksyn_wqrelease(ckwq, NULL); - return(error); + return(0); } /* ***************** pthread_rwlock ************************ */ @@ -1182,67 +1518,106 @@ psynch_rw_rdlock(__unused proc_t p, struct psynch_rw_rdlock_args * uap, uint32_t //uint64_t tid = uap->tid; int flags = uap->flags; int error = 0, block; - uint32_t lockseq = 0, updatebits = 0, preseq = 0; + uint32_t lockseq = 0, updatebits = 0, preseq = 0, prerw_wc = 0; ksyn_wait_queue_t kwq; uthread_t uth; + int isinit = lgen & PTHRW_RWL_INIT; + uint32_t returnbits = 0; + ksyn_waitq_element_t kwe; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); #endif /* _PSYNCH_TRACE_ */ uth = current_uthread(); /* preserve the seq number */ - uth->uu_lockseq = lgen; + kwe = &uth->uu_kwe; + kwe->kwe_lockseq = lgen; + kwe->kwe_uth = uth; + kwe->kwe_psynchretval = 0; + kwe->kwe_kwqqueue = NULL; + lockseq = lgen & PTHRW_COUNT_MASK; + error = ksyn_wqfind(rwlock, lgen, ugen, rw_wc, TID_ZERO, flags, 
(KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_RWLOCK), &kwq); if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } ksyn_wqlock(kwq); + if (isinit != 0) { + lgen &= ~PTHRW_RWL_INIT; + if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) == 0) { + /* first to notice the reset of the lock, clear preposts */ + CLEAR_REINIT_BITS(kwq); + kwq->kw_kflags |= KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 1, 0); +#endif /* _PSYNCH_TRACE_ */ + } + } + /* handle first the missed wakeups */ if ((kwq->kw_pre_intrcount != 0) && ((kwq->kw_pre_intrtype == PTH_RW_TYPE_READ) || (kwq->kw_pre_intrtype == PTH_RW_TYPE_LREAD)) && (is_seqlower_eq(lockseq, (kwq->kw_pre_intrseq & PTHRW_COUNT_MASK)) != 0)) { kwq->kw_pre_intrcount--; - uth->uu_psynchretval = kwq->kw_pre_intrretbits; + kwe->kwe_psynchretval = kwq->kw_pre_intrretbits; if (kwq->kw_pre_intrcount==0) CLEAR_INTR_PREPOST_BITS(kwq); ksyn_wqunlock(kwq); goto out; } - /* handle unlock2/downgrade first */ - if ((kwq->kw_pre_limrd != 0) && (is_seqlower_eq(lockseq, (kwq->kw_pre_limrdseq & PTHRW_COUNT_MASK)) != 0)) { + /* handle overlap first as they are not counted against pre_rwwc */ + + /* check for overlap and if no pending W bit (indicates writers) */ + if ((kwq->kw_overlapwatch != 0) && ((rw_wc & PTHRW_RWS_SAVEMASK) == 0) && ((lgen & PTH_RWL_WBIT) == 0)) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 1, kwq->kw_pre_limrd, kwq->kw_pre_limrdseq, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 10, kwq->kw_nextseqword, kwq->kw_lastseqword, 0); #endif /* _PSYNCH_TRACE_ */ - kwq->kw_pre_limrd--; - /* acquired the locks, so return */ - uth->uu_psynchretval = kwq->kw_pre_limrdbits; - if (kwq->kw_pre_limrd == 0) - CLEAR_READ_PREPOST_BITS(kwq); - ksyn_wqunlock(kwq); - goto out; + error = kwq_handle_overlap(kwq, lgen, ugen, rw_wc, &updatebits, (KW_UNLOCK_PREPOST_READLOCK|KW_UNLOCK_PREPOST), &block); +#if __TESTPANICS__ + if (error != 0) + panic("rw_rdlock: kwq_handle_overlap failed %d\n",error); +#endif /* __TESTPANICS__ */ + if (block == 0) { + error = 0; + kwe->kwe_psynchretval = updatebits; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 0xff, updatebits, 0xee, 0); +#endif /* _PSYNCH_TRACE_ */ + ksyn_wqunlock(kwq); + goto out; + } } if ((kwq->kw_pre_rwwc != 0) && (is_seqlower_eq(lockseq, (kwq->kw_pre_lockseq & PTHRW_COUNT_MASK)) != 0)) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 2, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 2, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); #endif /* _PSYNCH_TRACE_ */ kwq->kw_pre_rwwc--; if (kwq->kw_pre_rwwc == 0) { preseq = kwq->kw_pre_lockseq; + prerw_wc = kwq->kw_pre_sseq; CLEAR_PREPOST_BITS(kwq); - error = kwq_handle_unlock(kwq, preseq, &updatebits, (KW_UNLOCK_PREPOST_READLOCK|KW_UNLOCK_PREPOST), &block, lgen); + if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) != 0){ + kwq->kw_kflags &= ~KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 0, 0); +#endif /* _PSYNCH_TRACE_ */ + } + error = kwq_handle_unlock(kwq, 
preseq, prerw_wc, &updatebits, (KW_UNLOCK_PREPOST_READLOCK|KW_UNLOCK_PREPOST), &block, lgen); +#if __TESTPANICS__ if (error != 0) - panic("kwq_handle_unlock failed %d\n",error); + panic("rw_rdlock: kwq_handle_unlock failed %d\n",error); +#endif /* __TESTPANICS__ */ if (block == 0) { ksyn_wqunlock(kwq); goto out; @@ -1251,31 +1626,35 @@ psynch_rw_rdlock(__unused proc_t p, struct psynch_rw_rdlock_args * uap, uint32_t } } + #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0); #endif /* _PSYNCH_TRACE_ */ - error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_READ], lgen, uth, SEQFIT); + error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_READ], lgen, uth, kwe, SEQFIT); +#if __TESTPANICS__ if (error != 0) panic("psynch_rw_rdlock: failed to enqueue\n"); - error = ksyn_block_thread_locked(kwq, (uint64_t)0, uth); +#endif /* __TESTPANICS__ */ + error = ksyn_block_thread_locked(kwq, (uint64_t)0, kwe, 0); /* drops the kwq lock */ out: if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 4, error, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 4, error, 0, 0); #endif /* _PSYNCH_TRACE_ */ ksyn_wqlock(kwq); - if (uth->uu_kwqqueue != NULL) - ksyn_queue_removeitem(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_READ], uth); + if (kwe->kwe_kwqqueue != NULL) + ksyn_queue_removeitem(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_READ], kwe); ksyn_wqunlock(kwq); } else { /* update bits */ - *retval = uth->uu_psynchretval; + *retval = kwe->kwe_psynchretval; + returnbits = kwe->kwe_psynchretval; } - ksyn_wqrelease(kwq, NULL); + ksyn_wqrelease(kwq, NULL, 0, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_RWLOCK)); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, returnbits, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } @@ -1284,7 +1663,7 @@ out: * psynch_rw_longrdlock: This system call is used for psync rwlock long readers to block. 
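 * Like the other rwlock entry points, this one first honours a
 * userland re-initialization: when PTHRW_RWL_INIT is set in lgenval,
 * the first syscall to notice it clears stale prepost state. The
 * recurring pattern, as it appears in each of these functions:
 *
 *	if (lgen & PTHRW_RWL_INIT) {
 *		lgen &= ~PTHRW_RWL_INIT;
 *		if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) == 0) {
 *			CLEAR_REINIT_BITS(kwq);	/* drop stale prepost state */
 *			kwq->kw_kflags |= KSYN_KWF_INITCLEARED;
 *		}
 *	}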
*/ int -psynch_rw_longrdlock(__unused proc_t p, struct psynch_rw_longrdlock_args * uap, uint32_t * retval) +psynch_rw_longrdlock(__unused proc_t p, __unused struct psynch_rw_longrdlock_args * uap, __unused uint32_t * retval) { user_addr_t rwlock = uap->rwlock; uint32_t lgen = uap->lgenval; @@ -1292,65 +1671,82 @@ psynch_rw_longrdlock(__unused proc_t p, struct psynch_rw_longrdlock_args * uap, uint32_t rw_wc = uap->rw_wc; //uint64_t tid = uap->tid; int flags = uap->flags; + int isinit = lgen & PTHRW_RWL_INIT; + uint32_t returnbits=0; + ksyn_waitq_element_t kwe; ksyn_wait_queue_t kwq; int error=0, block = 0 ; uthread_t uth; - uint32_t lockseq = 0, updatebits = 0, preseq = 0; + uint32_t lockseq = 0, updatebits = 0, preseq = 0, prerw_wc = 0; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); #endif /* _PSYNCH_TRACE_ */ uth = current_uthread(); - - uth->uu_lockseq = lgen; + kwe = &uth->uu_kwe; + kwe->kwe_lockseq = lgen; + kwe->kwe_uth = uth; + kwe->kwe_psynchretval = 0; + kwe->kwe_kwqqueue = NULL; lockseq = (lgen & PTHRW_COUNT_MASK); - + error = ksyn_wqfind(rwlock, lgen, ugen, rw_wc, TID_ZERO, flags, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_RWLOCK), &kwq); if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } ksyn_wqlock(kwq); + if (isinit != 0) { + lgen &= ~PTHRW_RWL_INIT; + if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) == 0) { + /* first to notice the reset of the lock, clear preposts */ + CLEAR_REINIT_BITS(kwq); + kwq->kw_kflags |= KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 1, 0); +#endif /* _PSYNCH_TRACE_ */ + } + } + /* handle first the missed wakeups */ if ((kwq->kw_pre_intrcount != 0) && (kwq->kw_pre_intrtype == PTH_RW_TYPE_LREAD) && (is_seqlower_eq(lockseq, (kwq->kw_pre_intrseq & PTHRW_COUNT_MASK)) != 0)) { kwq->kw_pre_intrcount--; - uth->uu_psynchretval = kwq->kw_pre_intrretbits; + kwe->kwe_psynchretval = kwq->kw_pre_intrretbits; if (kwq->kw_pre_intrcount==0) CLEAR_INTR_PREPOST_BITS(kwq); ksyn_wqunlock(kwq); goto out; } - /* handle unlock2/downgrade first */ - if ((kwq->kw_pre_limrd != 0) && (is_seqlower_eq(lockseq, (kwq->kw_pre_limrdseq & PTHRW_COUNT_MASK)) != 0)) { -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 1, kwq->kw_pre_limrd, kwq->kw_pre_limrdseq, 0); -#endif /* _PSYNCH_TRACE_ */ - kwq->kw_pre_limrd--; - if (kwq->kw_pre_limrd == 0) - CLEAR_READ_PREPOST_BITS(kwq); - /* not a read proceed */ - } if ((kwq->kw_pre_rwwc != 0) && (is_seqlower_eq(lockseq, (kwq->kw_pre_lockseq & PTHRW_COUNT_MASK)) != 0)) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 2, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 2, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); #endif /* _PSYNCH_TRACE_ */ kwq->kw_pre_rwwc--; if (kwq->kw_pre_rwwc == 0) { preseq = kwq->kw_pre_lockseq; + prerw_wc = kwq->kw_pre_sseq; CLEAR_PREPOST_BITS(kwq); - error = kwq_handle_unlock(kwq, preseq, &updatebits, (KW_UNLOCK_PREPOST_LREADLOCK|KW_UNLOCK_PREPOST), &block, lgen); + if 
((kwq->kw_kflags & KSYN_KWF_INITCLEARED) != 0){ + kwq->kw_kflags &= ~KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 0, 0); +#endif /* _PSYNCH_TRACE_ */ + } + error = kwq_handle_unlock(kwq, preseq, prerw_wc, &updatebits, (KW_UNLOCK_PREPOST_LREADLOCK|KW_UNLOCK_PREPOST), &block, lgen); +#if __TESTPANICS__ if (error != 0) panic("kwq_handle_unlock failed %d\n",error); +#endif /* __TESTPANICS__ */ if (block == 0) { ksyn_wqunlock(kwq); goto out; @@ -1360,32 +1756,35 @@ psynch_rw_longrdlock(__unused proc_t p, struct psynch_rw_longrdlock_args * uap, } #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0); #endif /* _PSYNCH_TRACE_ */ - error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_LREAD], lgen, uth, SEQFIT); + error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_LREAD], lgen, uth, kwe, SEQFIT); +#if __TESTPANICS__ if (error != 0) panic("psynch_rw_longrdlock: failed to enqueue\n"); +#endif /* __TESTPANICS__ */ - error = ksyn_block_thread_locked(kwq, (uint64_t)0, uth); + error = ksyn_block_thread_locked(kwq, (uint64_t)0, kwe, 0); /* drops the kwq lock */ out: if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); #endif /* _PSYNCH_TRACE_ */ ksyn_wqlock(kwq); - if (uth->uu_kwqqueue != NULL) - ksyn_queue_removeitem(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_LREAD], uth); + if (kwe->kwe_kwqqueue != NULL) + ksyn_queue_removeitem(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_LREAD], kwe); ksyn_wqunlock(kwq); } else { /* update bits */ - *retval = uth->uu_psynchretval; + *retval = kwe->kwe_psynchretval; + returnbits = kwe->kwe_psynchretval; } - ksyn_wqrelease(kwq, NULL); + ksyn_wqrelease(kwq, NULL, 0, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_RWLOCK)); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_END, (uint32_t)rwlock, 0, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWLRDLOCK | DBG_FUNC_END, (uint32_t)rwlock, 0, returnbits, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } @@ -1406,97 +1805,122 @@ psynch_rw_wrlock(__unused proc_t p, struct psynch_rw_wrlock_args * uap, uint32_t ksyn_wait_queue_t kwq; int error=0; uthread_t uth; - uint32_t lockseq = 0, updatebits = 0, preseq = 0; + uint32_t lockseq = 0, updatebits = 0, preseq = 0, prerw_wc = 0; + int isinit = lgen & PTHRW_RWL_INIT; + uint32_t returnbits = 0; + ksyn_waitq_element_t kwe; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWWRLOCK | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWWRLOCK | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); #endif /* _PSYNCH_TRACE_ */ uth = current_uthread(); - - uth->uu_lockseq = lgen; + kwe = &uth->uu_kwe; + kwe->kwe_lockseq = lgen; + kwe->kwe_uth = uth; + kwe->kwe_psynchretval = 0; + kwe->kwe_kwqqueue = NULL; lockseq = (lgen & PTHRW_COUNT_MASK); error = ksyn_wqfind(rwlock, lgen, ugen, rw_wc, TID_ZERO, flags, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_RWLOCK), &kwq); if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWWRLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWWRLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); #endif /* 
_PSYNCH_TRACE_ */
return(error);
}
ksyn_wqlock(kwq);
+
+ if (isinit != 0) {
+ lgen &= ~PTHRW_RWL_INIT;
+ if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) == 0) {
+ /* first to notice the reset of the lock, clear preposts */
+ CLEAR_REINIT_BITS(kwq);
+ kwq->kw_kflags |= KSYN_KWF_INITCLEARED;
+#if _PSYNCH_TRACE_
+ __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 1, 0);
+#endif /* _PSYNCH_TRACE_ */
+ }
+ }
+
+ /* handle first the missed wakeups */
if ((kwq->kw_pre_intrcount != 0) &&
(kwq->kw_pre_intrtype == PTH_RW_TYPE_WRITE) &&
(is_seqlower_eq(lockseq, (kwq->kw_pre_intrseq & PTHRW_COUNT_MASK)) != 0)) {
kwq->kw_pre_intrcount--;
- uth->uu_psynchretval = kwq->kw_pre_intrretbits;
+ kwe->kwe_psynchretval = kwq->kw_pre_intrretbits;
if (kwq->kw_pre_intrcount==0)
CLEAR_INTR_PREPOST_BITS(kwq);
ksyn_wqunlock(kwq);
goto out;
}
- /* handle unlock2/downgrade first */
- if ((kwq->kw_pre_limrd != 0) && (is_seqlower_eq(lockseq, (kwq->kw_pre_limrdseq & PTHRW_COUNT_MASK)) != 0)) {
-#if _PSYNCH_TRACE_
- KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 1, kwq->kw_pre_limrd, kwq->kw_pre_limrdseq, 0);
-#endif /* _PSYNCH_TRACE_ */
- kwq->kw_pre_limrd--;
- if (kwq->kw_pre_limrd == 0)
- CLEAR_READ_PREPOST_BITS(kwq);
- /* not a read proceed */
- }
if ((kwq->kw_pre_rwwc != 0) && (is_seqlower_eq(lockseq, (kwq->kw_pre_lockseq & PTHRW_COUNT_MASK)) != 0)) {
#if _PSYNCH_TRACE_
- KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 2, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0);
+ __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 2, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0);
#endif /* _PSYNCH_TRACE_ */
kwq->kw_pre_rwwc--;
if (kwq->kw_pre_rwwc == 0) {
preseq = kwq->kw_pre_lockseq;
+ prerw_wc = kwq->kw_pre_sseq;
CLEAR_PREPOST_BITS(kwq);
- error = kwq_handle_unlock(kwq, preseq, &updatebits, (KW_UNLOCK_PREPOST_WRLOCK|KW_UNLOCK_PREPOST), &block, lgen);
+ if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) != 0){
+ kwq->kw_kflags &= ~KSYN_KWF_INITCLEARED;
+#if _PSYNCH_TRACE_
+ __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 0, 0);
+#endif /* _PSYNCH_TRACE_ */
+ }
+ error = kwq_handle_unlock(kwq, preseq, prerw_wc, &updatebits, (KW_UNLOCK_PREPOST_WRLOCK|KW_UNLOCK_PREPOST), &block, lgen);
+#if __TESTPANICS__
if (error != 0)
- panic("kwq_handle_unlock failed %d\n",error);
+ panic("rw_wrlock: kwq_handle_unlock failed %d\n",error);
+#endif /* __TESTPANICS__ */
if (block == 0) {
ksyn_wqunlock(kwq);
- goto out;
+ *retval = updatebits;
+ goto out1;
}
/* insert to q and proceed as usual */
}
}
+ /* No overlap watch needed; go ahead and block */
+
#if _PSYNCH_TRACE_
- KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0);
+ __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0);
#endif /* _PSYNCH_TRACE_ */
- error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], lgen, uth, SEQFIT);
+ error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], lgen, uth, kwe, SEQFIT);
+#if __TESTPANICS__
if (error != 0)
panic("psynch_rw_wrlock: failed to enqueue\n");
+#endif /* __TESTPANICS__ */
- error = ksyn_block_thread_locked(kwq, (uint64_t)0, uth);
+ error = ksyn_block_thread_locked(kwq, (uint64_t)0, kwe, 0);
/* drops the wq lock */
out:
if (error != 0) {
#if _PSYNCH_TRACE_
- KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWWRLOCK |
DBG_FUNC_NONE, (uint32_t)rwlock, 4, error, 0, 0); #endif /* _PSYNCH_TRACE_ */ ksyn_wqlock(kwq); - if (uth->uu_kwqqueue != NULL) - ksyn_queue_removeitem(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], uth); + if (kwe->kwe_kwqqueue != NULL) + ksyn_queue_removeitem(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], kwe); ksyn_wqunlock(kwq); } else { /* update bits */ - *retval = uth->uu_psynchretval; + *retval = kwe->kwe_psynchretval; + returnbits = kwe->kwe_psynchretval; } - - ksyn_wqrelease(kwq, NULL); +out1: + ksyn_wqrelease(kwq, NULL, 0, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_RWLOCK)); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWWRLOCK | DBG_FUNC_END, (uint32_t)rwlock, 0, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWWRLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, returnbits, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } @@ -1505,7 +1929,7 @@ out: * psynch_rw_yieldwrlock: This system call is used for psync rwlock yielding writers to block. */ int -psynch_rw_yieldwrlock(__unused proc_t p, struct psynch_rw_yieldwrlock_args * uap, uint32_t * retval) +psynch_rw_yieldwrlock(__unused proc_t p, __unused struct psynch_rw_yieldwrlock_args * uap, __unused uint32_t * retval) { user_addr_t rwlock = uap->rwlock; uint32_t lgen = uap->lgenval; @@ -1516,65 +1940,82 @@ psynch_rw_yieldwrlock(__unused proc_t p, struct psynch_rw_yieldwrlock_args * ua int block; ksyn_wait_queue_t kwq; int error=0; + int isinit = lgen & PTHRW_RWL_INIT; uthread_t uth; + uint32_t returnbits=0; + ksyn_waitq_element_t kwe; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); #endif /* _PSYNCH_TRACE_ */ - uint32_t lockseq = 0, updatebits = 0, preseq = 0; + uint32_t lockseq = 0, updatebits = 0, preseq = 0, prerw_wc = 0; uth = current_uthread(); - - uth->uu_lockseq = lgen; + kwe = &uth->uu_kwe; + kwe->kwe_lockseq = lgen; + kwe->kwe_uth = uth; + kwe->kwe_psynchretval = 0; + kwe->kwe_kwqqueue = NULL; lockseq = (lgen & PTHRW_COUNT_MASK); error = ksyn_wqfind(rwlock, lgen, ugen, rw_wc, TID_ZERO, flags, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_RWLOCK), &kwq); if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } ksyn_wqlock(kwq); + if (isinit != 0) { + lgen &= ~PTHRW_RWL_INIT; + if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) == 0) { + /* first to notice the reset of the lock, clear preposts */ + CLEAR_REINIT_BITS(kwq); + kwq->kw_kflags |= KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 1, 0); +#endif /* _PSYNCH_TRACE_ */ + } + } + /* handle first the missed wakeups */ if ((kwq->kw_pre_intrcount != 0) && (kwq->kw_pre_intrtype == PTH_RW_TYPE_YWRITE) && (is_seqlower_eq(lockseq, (kwq->kw_pre_intrseq & PTHRW_COUNT_MASK)) != 0)) { kwq->kw_pre_intrcount--; - uth->uu_psynchretval = kwq->kw_pre_intrretbits; + kwe->kwe_psynchretval = kwq->kw_pre_intrretbits; if (kwq->kw_pre_intrcount==0) CLEAR_INTR_PREPOST_BITS(kwq); ksyn_wqunlock(kwq); goto out; } - /* handle unlock2/downgrade first */ - if ((kwq->kw_pre_limrd != 0) && (is_seqlower_eq(lockseq, (kwq->kw_pre_limrdseq & PTHRW_COUNT_MASK)) != 0)) { -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWYWRLOCK | 
DBG_FUNC_NONE, (uint32_t)rwlock, 1, kwq->kw_pre_limrd, kwq->kw_pre_limrdseq, 0); -#endif /* _PSYNCH_TRACE_ */ - kwq->kw_pre_limrd--; - if (kwq->kw_pre_limrd == 0) - CLEAR_READ_PREPOST_BITS(kwq); - /* not a read proceed */ - } - if ((kwq->kw_pre_rwwc != 0) && (is_seqlower_eq(lockseq, (kwq->kw_pre_lockseq & PTHRW_COUNT_MASK)) != 0)) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 2, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 2, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); #endif /* _PSYNCH_TRACE_ */ kwq->kw_pre_rwwc--; if (kwq->kw_pre_rwwc == 0) { preseq = kwq->kw_pre_lockseq; + prerw_wc = kwq->kw_pre_sseq; CLEAR_PREPOST_BITS(kwq); - error = kwq_handle_unlock(kwq, preseq, &updatebits, (KW_UNLOCK_PREPOST_YWRLOCK|KW_UNLOCK_PREPOST), &block, lgen); + if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) != 0){ + kwq->kw_kflags &= ~KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 0, 0); +#endif /* _PSYNCH_TRACE_ */ + } + error = kwq_handle_unlock(kwq, preseq, prerw_wc, &updatebits, (KW_UNLOCK_PREPOST_YWRLOCK|KW_UNLOCK_PREPOST), &block, lgen); +#if __TESTPANICS__ if (error != 0) panic("kwq_handle_unlock failed %d\n",error); +#endif /* __TESTPANICS__ */ if (block == 0) { ksyn_wqunlock(kwq); + *retval = updatebits; goto out; } /* insert to q and proceed as ususal */ @@ -1582,37 +2023,40 @@ psynch_rw_yieldwrlock(__unused proc_t p, struct psynch_rw_yieldwrlock_args * ua } #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0); #endif /* _PSYNCH_TRACE_ */ - error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_YWRITER], lgen, uth, SEQFIT); + error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_YWRITER], lgen, uth, kwe, SEQFIT); +#if __TESTPANICS__ if (error != 0) panic("psynch_rw_yieldwrlock: failed to enqueue\n"); +#endif /* __TESTPANICS__ */ - error = ksyn_block_thread_locked(kwq, (uint64_t)0, uth); + error = ksyn_block_thread_locked(kwq, (uint64_t)0, kwe, 0); out: if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 4, error, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 4, error, 0, 0); #endif /* _PSYNCH_TRACE_ */ ksyn_wqlock(kwq); - if (uth->uu_kwqqueue != NULL) - ksyn_queue_removeitem(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_YWRITER], uth); + if (kwe->kwe_kwqqueue != NULL) + ksyn_queue_removeitem(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_YWRITER], kwe); ksyn_wqunlock(kwq); } else { /* update bits */ - *retval = uth->uu_psynchretval; + *retval = kwe->kwe_psynchretval; + returnbits = kwe->kwe_psynchretval; } - ksyn_wqrelease(kwq, NULL); + ksyn_wqrelease(kwq, NULL, 0, (KSYN_WQTYPE_INWAIT | KSYN_WQTYPE_RWLOCK)); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWYWRLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, returnbits, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } - +#if NOTYET /* * psynch_rw_downgrade: This system call is used for wakeup blocked readers who are eligible to run due to downgrade. 
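 * A downgrading writer must account for every waiter implied by the
 * L-U distance except itself (hence the diff-- below); if not all of
 * those waiters have reached the kernel queue yet, the remainder is
 * recorded as a prepost and the wakeup is replayed when they arrive.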
*/ @@ -1626,72 +2070,93 @@ psynch_rw_downgrade(__unused proc_t p, struct psynch_rw_downgrade_args * uap, __ //uint64_t tid = uap->tid; int flags = uap->flags; uint32_t count = 0; - + int isinit = lgen & PTHRW_RWL_INIT; ksyn_wait_queue_t kwq; int error=0; uthread_t uth; uint32_t curgen = 0; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWDOWNGRADE | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWDOWNGRADE | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); #endif /* _PSYNCH_TRACE_ */ uth = current_uthread(); curgen = (lgen & PTHRW_COUNT_MASK); - error = ksyn_wqfind(rwlock, lgen, ugen, rw_wc, TID_ZERO, flags, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_RWLOCK), &kwq); + error = ksyn_wqfind(rwlock, lgen, ugen, rw_wc, TID_ZERO, flags, (KSYN_WQTYPE_INDROP | KSYN_WQTYPE_RWLOCK), &kwq); if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWDOWNGRADE | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWDOWNGRADE | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } ksyn_wqlock(kwq); - if (is_seqlower(ugen, kwq->kw_lastunlockseq)!= 0) { + if ((lgen & PTHRW_RWL_INIT) != 0) { + lgen &= ~PTHRW_RWL_INIT; + if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) == 0){ + CLEAR_REINIT_BITS(kwq); + kwq->kw_kflags |= KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 1, 0); +#endif /* _PSYNCH_TRACE_ */ + } + isinit = 1; + } + + /* if lastunlock seq is set, ensure the current one is not lower than that, as it would be spurious */ + if ((kwq->kw_lastunlockseq != PTHRW_RWL_INIT) && (is_seqlower(ugen, kwq->kw_lastunlockseq)!= 0)) { /* spurious updatebits?? */ + error = 0; goto out; } - /* fast path for default case */ - if((rw_wc == kwq->kw_inqueue) && (kwq->kw_highseq == curgen)) - goto dounlock; - /* have we seen all the waiters? */ - if(rw_wc > kwq->kw_inqueue) { - goto prepost; + + + /* If L-U != num of waiters, then it needs to be preposted or spr */ + diff = find_diff(lgen, ugen); + /* take count of the downgrade thread itself */ + diff--; + + +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 1, kwq->kw_inqueue, curgen, 0); +#endif /* _PSYNCH_TRACE_ */ + if (find_seq_till(kwq, curgen, diff, &count) == 0) { + if (count < (uint32_t)diff) + goto prepost; } - - if (is_seqhigher(curgen, kwq->kw_highseq) != 0) { - goto prepost; - } else { - if (find_seq_till(kwq, curgen, rw_wc, &count) == 0) { - if (count < rw_wc) { - kwq->kw_pre_limrd = rw_wc - count; - kwq->kw_pre_limrdseq = lgen; - kwq->kw_pre_limrdbits = lgen; - /* found none ? 
*/ - if (count == 0) - goto out; - } - } + + /* no prepost and all threads are in place, reset the bit */ + if ((isinit != 0) && ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) != 0)){ + kwq->kw_kflags &= ~KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 0, 0); +#endif /* _PSYNCH_TRACE_ */ } + + /* can handle unlock now */ + CLEAR_PREPOST_BITS(kwq); + dounlock: #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWDOWNGRADE | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWDOWNGRADE | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0); #endif /* _PSYNCH_TRACE_ */ error = kwq_handle_downgrade(kwq, lgen, 0, 0, NULL); +#if __TESTPANICS__ if (error != 0) panic("psynch_rw_downgrade: failed to wakeup\n"); +#endif /* __TESTPANICS__ */ out: ksyn_wqunlock(kwq); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWDOWNGRADE | DBG_FUNC_END, (uint32_t)rwlock, 0, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWDOWNGRADE | DBG_FUNC_END, (uint32_t)rwlock, 0, 0, error, 0); #endif /* _PSYNCH_TRACE_ */ - ksyn_wqrelease(kwq, NULL); + ksyn_wqrelease(kwq, NULL, 0, (KSYN_WQTYPE_INDROP | KSYN_WQTYPE_RWLOCK)); return(error); @@ -1699,7 +2164,7 @@ prepost: kwq->kw_pre_rwwc = (rw_wc - count); kwq->kw_pre_lockseq = lgen; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWDOWNGRADE | DBG_FUNC_NONE, (uint32_t)rwlock, 1, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWDOWNGRADE | DBG_FUNC_NONE, (uint32_t)rwlock, 1, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); #endif /* _PSYNCH_TRACE_ */ error = 0; goto out; @@ -1723,32 +2188,49 @@ psynch_rw_upgrade(__unused proc_t p, struct psynch_rw_upgrade_args * uap, uint32 int error=0; uthread_t uth; uint32_t lockseq = 0, updatebits = 0, preseq = 0; + int isinit = lgen & PTHRW_RWL_INIT; + ksyn_waitq_element_t kwe; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUPGRADE | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUPGRADE | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); #endif /* _PSYNCH_TRACE_ */ uth = current_uthread(); - - uth->uu_lockseq = lgen; + kwe = &uth->uu_kwe; + kwe->kwe_lockseq = lgen; + kwe->kwe_uth = uth; + kwe->kwe_psynchretval = 0; + kwe->kwe_kwqqueue = NULL; lockseq = (lgen & PTHRW_COUNT_MASK); - - error = ksyn_wqfind(rwlock, lgen, ugen, rw_wc, TID_ZERO, flags, (KSYN_WQTYPE_INWAIT|KSYN_WQTYPE_RWLOCK), &kwq); + + error = ksyn_wqfind(rwlock, lgen, ugen, rw_wc, TID_ZERO, flags, (KSYN_WQTYPE_INWAIT | KSYN_WQTYPE_RWLOCK), &kwq); if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUPGRADE | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUPGRADE | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } ksyn_wqlock(kwq); - + + if (isinit != 0) { + lgen &= ~PTHRW_RWL_INIT; + if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) == 0) { + /* first to notice the reset of the lock, clear preposts */ + CLEAR_REINIT_BITS(kwq); + kwq->kw_kflags |= KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 1, 0); +#endif /* _PSYNCH_TRACE_ */ + } + } + /* handle first the missed wakeups */ if ((kwq->kw_pre_intrcount != 0) && - (kwq->kw_pre_intrtype == PTH_RW_TYPE_UPGRADE) && + ((kwq->kw_pre_intrtype == PTH_RW_TYPE_READ) || (kwq->kw_pre_intrtype == PTH_RW_TYPE_LREAD)) && (is_seqlower_eq(lockseq, 
(kwq->kw_pre_intrseq & PTHRW_COUNT_MASK)) != 0)) { kwq->kw_pre_intrcount--; - uth->uu_psynchretval = kwq->kw_pre_intrretbits; + kwe->kwe_psynchretval = kwq->kw_pre_intrretbits; if (kwq->kw_pre_intrcount==0) CLEAR_INTR_PREPOST_BITS(kwq); ksyn_wqunlock(kwq); @@ -1757,15 +2239,24 @@ psynch_rw_upgrade(__unused proc_t p, struct psynch_rw_upgrade_args * uap, uint32 if ((kwq->kw_pre_rwwc != 0) && (is_seqlower_eq(lockseq, (kwq->kw_pre_lockseq & PTHRW_COUNT_MASK)) != 0)) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUPGRADE | DBG_FUNC_NONE, (uint32_t)rwlock, 2, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWRDLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 2, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); #endif /* _PSYNCH_TRACE_ */ kwq->kw_pre_rwwc--; if (kwq->kw_pre_rwwc == 0) { preseq = kwq->kw_pre_lockseq; + prerw_wc = kwq->kw_pre_sseq; CLEAR_PREPOST_BITS(kwq); - error = kwq_handle_unlock(kwq, preseq, &updatebits, (KW_UNLOCK_PREPOST_UPGRADE|KW_UNLOCK_PREPOST), &block, lgen); + if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) != 0){ + kwq->kw_kflags &= ~KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 0, 0); +#endif /* _PSYNCH_TRACE_ */ + } + error = kwq_handle_unlock(kwq, preseq, prerw_wc, &updatebits, (KW_UNLOCK_PREPOST_UPGRADE|KW_UNLOCK_PREPOST), &block, lgen); +#if __TESTPANICS__ if (error != 0) - panic("kwq_handle_unlock failed %d\n",error); + panic("rw_rdlock: kwq_handle_unlock failed %d\n",error); +#endif /* __TESTPANICS__ */ if (block == 0) { ksyn_wqunlock(kwq); goto out; @@ -1776,37 +2267,52 @@ psynch_rw_upgrade(__unused proc_t p, struct psynch_rw_upgrade_args * uap, uint32 #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUPGRADE | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUPGRADE | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0); #endif /* _PSYNCH_TRACE_ */ - error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_UPGRADE], lgen, uth, SEQFIT); + error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_UPGRADE], lgen, uth, kwe, SEQFIT); +#if __TESTPANICS__ if (error != 0) panic("psynch_rw_upgrade: failed to enqueue\n"); +#endif /* __TESTPANICS__ */ - error = ksyn_block_thread_locked(kwq, (uint64_t)0, uth); + error = ksyn_block_thread_locked(kwq, (uint64_t)0, kwe, 0); /* drops the lock */ out: if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUPGRADE | DBG_FUNC_NONE, (uint32_t)rwlock, 4, error, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUPGRADE | DBG_FUNC_NONE, (uint32_t)rwlock, 4, error, 0, 0); #endif /* _PSYNCH_TRACE_ */ ksyn_wqlock(kwq); - if (uth->uu_kwqqueue != NULL) - ksyn_queue_removeitem(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_UPGRADE], uth); + if (kwe->kwe_kwqqueue != NULL) + ksyn_queue_removeitem(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_UPGRADE], kwe); ksyn_wqunlock(kwq); } else { /* update bits */ - *retval = uth->uu_psynchretval; + *retval = kwe->kwe_psynchretval; } - ksyn_wqrelease(kwq, NULL); + ksyn_wqrelease(kwq, NULL, 0, (KSYN_WQTYPE_INWAIT | KSYN_WQTYPE_RWLOCK)); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUPGRADE | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUPGRADE | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); #endif /* _PSYNCH_TRACE_ */ + return(error); } +#else /* NOTYET */ +int +psynch_rw_upgrade(__unused proc_t p, __unused struct psynch_rw_upgrade_args * uap, __unused uint32_t * retval) +{ + return(0); +} +int 
+psynch_rw_downgrade(__unused proc_t p, __unused struct psynch_rw_downgrade_args * uap, __unused int * retval) +{ + return(0); +} +#endif /* NOTYET */ /* * psynch_rw_unlock: This system call is used for unlock state postings. It grants the appropriate * reader/writer lock variant. */ @@ -1825,19 +2331,20 @@ psynch_rw_unlock(__unused proc_t p, struct psynch_rw_unlock_args * uap, uint32_ uthread_t uth; ksyn_wait_queue_t kwq; uint32_t updatebits = 0; - int error=0; + int error=0, diff; uint32_t count = 0; + int isinit = 0; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); #endif /* _PSYNCH_TRACE_ */ uth = current_uthread(); - error = ksyn_wqfind(rwlock, lgen, ugen, rw_wc, TID_ZERO, flags, (KSYN_WQTYPE_RWLOCK), &kwq); + error = ksyn_wqfind(rwlock, lgen, ugen, rw_wc, TID_ZERO, flags, (KSYN_WQTYPE_INDROP | KSYN_WQTYPE_RWLOCK), &kwq); if (error != 0) { #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } @@ -1846,59 +2353,87 @@ psynch_rw_unlock(__unused proc_t p, struct psynch_rw_unlock_args * uap, uint32_ ksyn_wqlock(kwq); - if ((lgen & PTHRW_RW_INIT) != 0) { - kwq->kw_lastunlockseq = 0; - lgen &= ~PTHRW_RW_INIT; - } else if (is_seqlower(ugen, kwq->kw_lastunlockseq) != 0) { - /* spurious updatebits set */ - updatebits = PTHRW_RW_SPURIOUS; + if ((lgen & PTHRW_RWL_INIT) != 0) { + lgen &= ~PTHRW_RWL_INIT; + if ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) == 0){ + CLEAR_REINIT_BITS(kwq); + kwq->kw_kflags |= KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 1, 0); +#endif /* _PSYNCH_TRACE_ */ + } + isinit = 1; + } + + /* if lastunlock seq is set, ensure the current one is not lower than that, as it would be spurious */ + if ((kwq->kw_lastunlockseq != PTHRW_RWL_INIT) && (is_seqlower(ugen, kwq->kw_lastunlockseq)!= 0)) { +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, (uint32_t)0xeeeeeeee, rw_wc, kwq->kw_lastunlockseq, 0); +#endif /* _PSYNCH_TRACE_ */ + error = 0; goto out; } + /* If L-U != num of waiters, then it needs to be preposted or spr */ + diff = find_diff(lgen, ugen); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 1, kwq->kw_inqueue, curgen, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 1, kwq->kw_inqueue, curgen, 0); #endif /* _PSYNCH_TRACE_ */ - if (find_seq_till(kwq, curgen, rw_wc, &count) == 0) { - if (count < rw_wc) + if (find_seq_till(kwq, curgen, diff, &count) == 0) { + if ((count == 0) || (count < (uint32_t)diff)) goto prepost; } + /* no prepost and all threads are in place, reset the bit */ + if ((isinit != 0) && ((kwq->kw_kflags & KSYN_KWF_INITCLEARED) != 0)){ + kwq->kw_kflags &= ~KSYN_KWF_INITCLEARED; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, lgen, ugen, rw_wc, 0, 0); +#endif /* _PSYNCH_TRACE_ */ + } /* can handle unlock now */ CLEAR_PREPOST_BITS(kwq); - kwq->kw_lastunlockseq = ugen; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 2, 0, 0, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUNLOCK | 
DBG_FUNC_NONE, (uint32_t)rwlock, 2, 0, 0, 0); #endif /* _PSYNCH_TRACE_ */ - error = kwq_handle_unlock(kwq, lgen, &updatebits, 0, NULL, 0); + error = kwq_handle_unlock(kwq, lgen, rw_wc, &updatebits, 0, NULL, 0); +#if __TESTPANICS__ if (error != 0) panic("psynch_rw_unlock: kwq_handle_unlock failed %d\n",error); +#endif /* __TESTPANICS__ */ out: if (error == 0) { /* update bits?? */ *retval = updatebits; } + + ksyn_wqunlock(kwq); - ksyn_wqrelease(kwq, NULL); + ksyn_wqrelease(kwq, NULL, 0, (KSYN_WQTYPE_INDROP | KSYN_WQTYPE_RWLOCK)); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_END, (uint32_t)rwlock, 0, 0, error, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_END, (uint32_t)rwlock, 0, updatebits, error, 0); #endif /* _PSYNCH_TRACE_ */ return(error); prepost: - kwq->kw_pre_rwwc = (rw_wc - count); - kwq->kw_pre_lockseq = curgen; - kwq->kw_lastunlockseq = ugen; + /* update if the new seq is higher than prev prepost, or first set */ + if ((is_rws_setseq(kwq->kw_pre_sseq) != 0) || + (is_seqhigher_eq((rw_wc & PTHRW_COUNT_MASK), (kwq->kw_pre_sseq & PTHRW_COUNT_MASK)) != 0)) { + kwq->kw_pre_rwwc = (diff - count); + kwq->kw_pre_lockseq = curgen; + kwq->kw_pre_sseq = rw_wc; #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 3, rw_wc, count, 0); - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 4, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 3, rw_wc, count, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWUNLOCK | DBG_FUNC_NONE, (uint32_t)rwlock, 4, kwq->kw_pre_rwwc, kwq->kw_pre_lockseq, 0); #endif /* _PSYNCH_TRACE_ */ - updatebits = (lgen | PTHRW_RW_SPURIOUS);/* let this not do unlock handling */ + updatebits = lgen; /* let this not do unlock handling */ + } error = 0; goto out; } @@ -1909,86 +2444,9 @@ prepost: * to new reader arrival races */ int -psynch_rw_unlock2(__unused proc_t p, struct psynch_rw_unlock2_args * uap, uint32_t * retval) +psynch_rw_unlock2(__unused proc_t p, __unused struct psynch_rw_unlock2_args * uap, __unused uint32_t * retval) { - user_addr_t rwlock = uap->rwlock; - uint32_t lgen = uap->lgenval; - uint32_t ugen = uap->ugenval; - uint32_t rw_wc = uap->rw_wc; - //uint64_t tid = uap->tid; - int flags = uap->flags; - uthread_t uth; - uint32_t num_lreader, limitread, curgen, updatebits; - ksyn_wait_queue_t kwq; - int error=0, longreadset = 0; - int diff; - uint32_t count=0; - -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUNLOCK2 | DBG_FUNC_START, (uint32_t)rwlock, lgen, ugen, rw_wc, 0); -#endif /* _PSYNCH_TRACE_ */ - uth = current_uthread(); - - error = ksyn_wqfind(rwlock, lgen, ugen, rw_wc, TID_ZERO, flags, (KSYN_WQTYPE_RWLOCK), &kwq); - if (error != 0) { -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUNLOCK2 | DBG_FUNC_END, (uint32_t)rwlock, 1, 0, error, 0); -#endif /* _PSYNCH_TRACE_ */ - return(error); - } - - ksyn_wqlock(kwq); - - curgen = (lgen & PTHRW_COUNT_MASK); - diff = find_diff(lgen, ugen); - - limitread = lgen & PTHRW_COUNT_MASK; - - if (find_seq_till(kwq, curgen, diff, &count) == 0) { - kwq->kw_pre_limrd = diff - count; - kwq->kw_pre_limrdseq = lgen; - kwq->kw_pre_limrdbits = lgen; - /* found none ? 
*/ - if (count == 0) - goto out; - } - - if (kwq->kw_ksynqueues[KSYN_QUEUE_LREAD].ksynq_count != 0) { - num_lreader = kwq->kw_ksynqueues[KSYN_QUEUE_LREAD].ksynq_firstnum; - if (is_seqlower_eq(num_lreader, limitread) != 0) - longreadset = 1; - } - - updatebits = lgen; -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUNLOCK2 | DBG_FUNC_NONE, (uint32_t)rwlock, 3, 0, 0, 0); -#endif /* _PSYNCH_TRACE_ */ - count = ksyn_wakeupreaders(kwq, limitread, longreadset, 0, updatebits, NULL); - - if (count != 0) { - if (kwq->kw_pre_limrd != 0) { - kwq->kw_pre_limrd += count; - } else { - kwq->kw_pre_limrd = count; - kwq->kw_pre_limrdseq = lgen; - kwq->kw_pre_limrdbits = lgen; - } - } - error = 0; - -out: - if (error == 0) { - /* update bits?? */ - *retval = uth->uu_psynchretval; - } - ksyn_wqunlock(kwq); - - ksyn_wqrelease(kwq, NULL); -#if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWUNLOCK2 | DBG_FUNC_END, (uint32_t)rwlock, 0, 0, error, 0); -#endif /* _PSYNCH_TRACE_ */ - - return(error); + return(ENOTSUP); } @@ -1996,7 +2454,31 @@ out: void pth_global_hashinit() { + int arg; + pth_glob_hashtbl = hashinit(PTH_HASHSIZE * 4, M_PROC, &pthhash); + + /* + * pthtest={0,1,2,3} (override default aborting behavior on pthread sync failures) + * 0 - just return errors + * 1 - print and return errors + * 2 - abort user, print and return errors + * 3 - panic + */ + if (!PE_parse_boot_argn("pthtest", &arg, sizeof(arg))) + arg = __TESTMODE__; + + if (arg == 3) { + __test_panics__ = 1; + printf("Pthread support PANICS when sync kernel primitives misused\n"); + } else if (arg == 2) { + __test_aborts__ = 1; + __test_prints__ = 1; + printf("Pthread support ABORTS when sync kernel primitives misused\n"); + } else if (arg == 1) { + __test_prints__ = 1; + printf("Pthread support LOGS when sync kernel primitives misused\n"); + } } void @@ -2046,6 +2528,10 @@ pth_proc_hashdelete(proc_t p) int hashsize = pthhash + 1; int i; +#if _PSYNCH_TRACE_ + if ((pthread_debug_proc != NULL) && (p == pthread_debug_proc)) + pthread_debug_proc = PROC_NULL; +#endif /* _PSYNCH_TRACE_ */ hashptr = p->p_pthhash; if (hashptr == NULL) return; @@ -2060,16 +2546,39 @@ pth_proc_hashdelete(proc_t p) if ((kwq->kw_pflags & KSYN_WQ_FLIST) != 0) { kwq->kw_pflags &= ~KSYN_WQ_FLIST; LIST_REMOVE(kwq, kw_list); + num_infreekwq--; } + num_freekwq++; pthread_list_unlock(); + /* release fake entries if present for cvars */ + if (((kwq->kw_type & KSYN_WQTYPE_MASK) == KSYN_WQTYPE_CVAR) && (kwq->kw_inqueue != 0)) + ksyn_freeallkwe(&kwq->kw_ksynqueues[KSYN_QUEUE_WRITER]); lck_mtx_destroy(&kwq->kw_lock, pthread_lck_grp); - kfree(kwq, sizeof(struct ksyn_wait_queue)); + zfree(kwq_zone, kwq); } } FREE(p->p_pthhash, M_PROC); p->p_pthhash = NULL; } +/* no lock held for this as the waitqueue is getting freed */ +void +ksyn_freeallkwe(ksyn_queue_t kq) +{ + ksyn_waitq_element_t kwe; + + /* free all the fake entries, dequeue rest */ + kwe = TAILQ_FIRST(&kq->ksynq_kwelist); + while (kwe != NULL) { + if (kwe->kwe_flags != KWE_THREAD_INWAIT) { + TAILQ_REMOVE(&kq->ksynq_kwelist, kwe, kwe_list); + zfree(kwe_zone, kwe); + } else { + TAILQ_REMOVE(&kq->ksynq_kwelist, kwe, kwe_list); + } + kwe = TAILQ_FIRST(&kq->ksynq_kwelist); + } +} /* find kernel waitqueue, if not present create one. 
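
   How an entry is identified, in outline (this mirrors ksyn_wq_hash_lookup
   and the insertion below; "bucket" and "match" are descriptive labels,
   not kernel names):

     hashptr = process-shared ? pth_glob_hashtbl : p->p_pthhash;
     bucket  = &hashptr[mutex & pthhash];
     match   = process-shared ? the (kw_object, kw_offset) pair
                              : the (kw_addr, owning process) pair

   Process-shared objects are first resolved to a VM object/offset pair via
   ksyn_findobj(), so two mappings of the same memory find the same queue.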
Grants a reference */ int @@ -2081,7 +2590,8 @@ ksyn_wqfind(user_addr_t mutex, uint32_t mgen, uint32_t ugen, uint32_t rw_wc, uin uint64_t object = 0, offset = 0; uint64_t hashhint; proc_t p = current_proc(); - int retry = mgen & PTHRW_RETRYBIT; + int retry = mgen & PTH_RWL_RETRYBIT; + struct ksyn_queue kfreeq; int i; if ((flags & PTHREAD_PSHARED_FLAGS_MASK) == PTHREAD_PROCESS_SHARED) @@ -2093,18 +2603,60 @@ ksyn_wqfind(user_addr_t mutex, uint32_t mgen, uint32_t ugen, uint32_t rw_wc, uin hashptr = p->p_pthhash; } + ksyn_queue_init(&kfreeq); + + if (((wqtype & KSYN_WQTYPE_MASK) == KSYN_WQTYPE_MTX) && (retry != 0)) + mgen &= ~PTH_RWL_RETRYBIT; + +loop: //pthread_list_lock_spin(); pthread_list_lock(); kwq = ksyn_wq_hash_lookup(mutex, p, flags, object, offset); if (kwq != NULL) { - kwq->kw_iocount++; if ((kwq->kw_pflags & KSYN_WQ_FLIST) != 0) { LIST_REMOVE(kwq, kw_list); kwq->kw_pflags &= ~KSYN_WQ_FLIST; + num_infreekwq--; + num_reusekwq++; + } + if ((kwq->kw_type & KSYN_WQTYPE_MASK) != (wqtype &KSYN_WQTYPE_MASK)) { + if ((kwq->kw_inqueue == 0) && (kwq->kw_pre_rwwc ==0) && (kwq->kw_pre_intrcount == 0)) { + if (kwq->kw_iocount == 0) { + kwq->kw_addr = mutex; + kwq->kw_flags = flags; + kwq->kw_object = object; + kwq->kw_offset = offset; + kwq->kw_type = (wqtype & KSYN_WQTYPE_MASK); + CLEAR_REINIT_BITS(kwq); + CLEAR_INTR_PREPOST_BITS(kwq); + CLEAR_PREPOST_BITS(kwq); + kwq->kw_lword = mgen; + kwq->kw_uword = ugen; + kwq->kw_sword = rw_wc; + kwq->kw_owner = tid; + } else if ((kwq->kw_iocount == 1) && (kwq->kw_dropcount == kwq->kw_iocount)) { + /* if all users are unlockers then wait for it to finish */ + kwq->kw_pflags |= KSYN_WQ_WAITING; + /* wait for the wq to be free */ + (void)msleep(&kwq->kw_pflags, pthread_list_mlock, PDROP, "ksyn_wqfind", 0); + /* does not have list lock */ + goto loop; + } else { + __FAILEDUSERTEST__("address already known to kernel for another (busy) synchronizer type\n"); + pthread_list_unlock(); + return EBUSY; + } + } else { + __FAILEDUSERTEST__("address already known to kernel for another (busy) synchronizer type(1)\n"); + pthread_list_unlock(); + return EBUSY; + } } - UPDATE_KWQ(kwq, mgen, ugen, rw_wc, tid, wqtype, retry); + kwq->kw_iocount++; + if (wqtype == KSYN_WQTYPE_MUTEXDROP) + kwq->kw_dropcount++; if (kwqp != NULL) *kwqp = kwq; pthread_list_unlock(); @@ -2113,23 +2665,34 @@ ksyn_wqfind(user_addr_t mutex, uint32_t mgen, uint32_t ugen, uint32_t rw_wc, uin pthread_list_unlock(); - nkwq = kalloc(sizeof(struct ksyn_wait_queue)); + nkwq = (ksyn_wait_queue_t)zalloc(kwq_zone); bzero(nkwq, sizeof(struct ksyn_wait_queue)); nkwq->kw_addr = mutex; nkwq->kw_flags = flags; nkwq->kw_iocount = 1; + if (wqtype == KSYN_WQTYPE_MUTEXDROP) + nkwq->kw_dropcount++; nkwq->kw_object = object; nkwq->kw_offset = offset; nkwq->kw_type = (wqtype & KSYN_WQTYPE_MASK); - TAILQ_INIT(&nkwq->kw_uthlist); + nkwq->kw_lastseqword = PTHRW_RWS_INIT; + if (nkwq->kw_type == KSYN_WQTYPE_RWLOCK) + nkwq->kw_nextseqword = PTHRW_RWS_INIT; + + nkwq->kw_pre_sseq = PTHRW_RWS_INIT; + + CLEAR_PREPOST_BITS(nkwq); + CLEAR_INTR_PREPOST_BITS(nkwq); + CLEAR_REINIT_BITS(nkwq); + nkwq->kw_lword = mgen; + nkwq->kw_uword = ugen; + nkwq->kw_sword = rw_wc; + nkwq->kw_owner = tid; + for (i=0; i< KSYN_QUEUE_MAX; i++) ksyn_queue_init(&nkwq->kw_ksynqueues[i]); - UPDATE_KWQ(nkwq, mgen, ugen, rw_wc, tid, wqtype, retry); -#if USE_WAITQUEUE - wait_queue_init(&nkwq->kw_wq, SYNC_POLICY_FIFO); -#endif /* USE_WAITQUEUE */ lck_mtx_init(&nkwq->kw_lock, pthread_lck_grp, pthread_lck_attr); //pthread_list_lock_spin(); @@ -2138,21 +2701,67 
@@ ksyn_wqfind(user_addr_t mutex, uint32_t mgen, uint32_t ugen, uint32_t rw_wc, uin kwq = ksyn_wq_hash_lookup(mutex, p, flags, object, offset); if (kwq != NULL) { - kwq->kw_iocount++; if ((kwq->kw_pflags & KSYN_WQ_FLIST) != 0) { LIST_REMOVE(kwq, kw_list); kwq->kw_pflags &= ~KSYN_WQ_FLIST; + num_infreekwq--; + num_reusekwq++; + } + if ((kwq->kw_type & KSYN_WQTYPE_MASK) != (wqtype &KSYN_WQTYPE_MASK)) { + if ((kwq->kw_inqueue == 0) && (kwq->kw_pre_rwwc ==0) && (kwq->kw_pre_intrcount == 0)) { + if (kwq->kw_iocount == 0) { + kwq->kw_addr = mutex; + kwq->kw_flags = flags; + kwq->kw_object = object; + kwq->kw_offset = offset; + kwq->kw_type = (wqtype & KSYN_WQTYPE_MASK); + CLEAR_REINIT_BITS(kwq); + CLEAR_INTR_PREPOST_BITS(kwq); + CLEAR_PREPOST_BITS(kwq); + kwq->kw_lword = mgen; + kwq->kw_uword = ugen; + kwq->kw_sword = rw_wc; + kwq->kw_owner = tid; + } else if ((kwq->kw_iocount == 1) && (kwq->kw_dropcount == kwq->kw_iocount)) { + kwq->kw_pflags |= KSYN_WQ_WAITING; + /* wait for the wq to be free */ + (void)msleep(&kwq->kw_pflags, pthread_list_mlock, PDROP, "ksyn_wqfind", 0); + + lck_mtx_destroy(&nkwq->kw_lock, pthread_lck_grp); + zfree(kwq_zone, nkwq); + /* will acquire lock again */ + + goto loop; + } else { + __FAILEDUSERTEST__("address already known to kernel for another [busy] synchronizer type(2)\n"); + pthread_list_unlock(); + lck_mtx_destroy(&nkwq->kw_lock, pthread_lck_grp); + zfree(kwq_zone, nkwq); + return EBUSY; + } + } else { + __FAILEDUSERTEST__("address already known to kernel for another [busy] synchronizer type(3)\n"); + pthread_list_unlock(); + lck_mtx_destroy(&nkwq->kw_lock, pthread_lck_grp); + zfree(kwq_zone, nkwq); + return EBUSY; + } } - UPDATE_KWQ(kwq, mgen, ugen, rw_wc, tid, wqtype, retry); + kwq->kw_iocount++; + if (wqtype == KSYN_WQTYPE_MUTEXDROP) + kwq->kw_dropcount++; if (kwqp != NULL) *kwqp = kwq; pthread_list_unlock(); lck_mtx_destroy(&nkwq->kw_lock, pthread_lck_grp); - kfree(nkwq, sizeof(struct ksyn_wait_queue)); + zfree(kwq_zone, nkwq); return (0); } kwq = nkwq; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVSEQ | DBG_FUNC_NONE, kwq->kw_lword, kwq->kw_uword, kwq->kw_sword, 0xffff, 0); +#endif /* _PSYNCH_TRACE_ */ if ((flags & PTHREAD_PSHARED_FLAGS_MASK) == PTHREAD_PROCESS_SHARED) { kwq->kw_pflags |= KSYN_WQ_SHARED; @@ -2161,6 +2770,7 @@ ksyn_wqfind(user_addr_t mutex, uint32_t mgen, uint32_t ugen, uint32_t rw_wc, uin LIST_INSERT_HEAD(&hashptr[mutex & pthhash], kwq, kw_hash); kwq->kw_pflags |= KSYN_WQ_INHASH; + num_total_kwq++; pthread_list_unlock(); @@ -2171,34 +2781,81 @@ ksyn_wqfind(user_addr_t mutex, uint32_t mgen, uint32_t ugen, uint32_t rw_wc, uin /* Reference from find is dropped here. 
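
   The lifetime rules, condensed from the body below:

     kwq->kw_iocount--;
     if (iocount == 0 && no queued waiters, preposts or intr counts) {
         if (qfreenow)  ->  unhash immediately and zfree(kwq_zone, kwq)
         else           ->  timestamp via microuptime(), park the entry on
                            pth_free_list and arm the psynch_wq_cleanup
                            thread call
     }

   Parking instead of freeing lets a quickly reused synchronizer pick its
   queue back up without an allocate/initialize cycle.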
Starts the free process if needed */ void -ksyn_wqrelease(ksyn_wait_queue_t kwq, ksyn_wait_queue_t ckwq) +ksyn_wqrelease(ksyn_wait_queue_t kwq, ksyn_wait_queue_t ckwq, int qfreenow, int wqtype) { uint64_t deadline; struct timeval t; int sched = 0; - + ksyn_wait_queue_t free_elem = NULL; + ksyn_wait_queue_t free_elem1 = NULL; //pthread_list_lock_spin(); pthread_list_lock(); kwq->kw_iocount--; + if (wqtype == KSYN_WQTYPE_MUTEXDROP) { + kwq->kw_dropcount--; + } if (kwq->kw_iocount == 0) { - if ((kwq->kw_pre_rwwc == 0) && (kwq->kw_inqueue == 0)) { - microuptime(&kwq->kw_ts); - LIST_INSERT_HEAD(&pth_free_list, kwq, kw_list); - kwq->kw_pflags |= KSYN_WQ_FLIST; + if ((kwq->kw_pflags & KSYN_WQ_WAITING) != 0) { + /* someone is waiting for the waitqueue, wake them up */ + kwq->kw_pflags &= ~KSYN_WQ_WAITING; + wakeup(&kwq->kw_pflags); } - sched = 1; + + if ((kwq->kw_pre_rwwc == 0) && (kwq->kw_inqueue == 0) && (kwq->kw_pre_intrcount == 0)) { + if (qfreenow == 0) { + microuptime(&kwq->kw_ts); + LIST_INSERT_HEAD(&pth_free_list, kwq, kw_list); + kwq->kw_pflags |= KSYN_WQ_FLIST; + num_infreekwq++; + free_elem = NULL; + } else { + /* remove from the only list it is in, i.e. the hash */ + kwq->kw_pflags &= ~(KSYN_WQ_FLIST | KSYN_WQ_INHASH); + LIST_REMOVE(kwq, kw_hash); + lck_mtx_destroy(&kwq->kw_lock, pthread_lck_grp); + num_total_kwq--; + num_freekwq++; + free_elem = kwq; + } + } else + free_elem = NULL; + if (qfreenow == 0) + sched = 1; } - if (ckwq != NULL){ + + if (ckwq != NULL) { ckwq->kw_iocount--; + if (wqtype == KSYN_WQTYPE_MUTEXDROP) { + ckwq->kw_dropcount--; + } if ( ckwq->kw_iocount == 0) { - if ((ckwq->kw_pre_rwwc == 0) && (ckwq->kw_inqueue == 0)) { - /* mark for free if we can */ - microuptime(&ckwq->kw_ts); - LIST_INSERT_HEAD(&pth_free_list, ckwq, kw_list); - ckwq->kw_pflags |= KSYN_WQ_FLIST; + if ((ckwq->kw_pflags & KSYN_WQ_WAITING) != 0) { + /* someone is waiting for the waitqueue, wake them up */ + ckwq->kw_pflags &= ~KSYN_WQ_WAITING; + wakeup(&ckwq->kw_pflags); } - sched = 1; + if ((ckwq->kw_pre_rwwc == 0) && (ckwq->kw_inqueue == 0) && (ckwq->kw_pre_intrcount == 0)) { + if (qfreenow == 0) { + /* mark for free if we can */ + microuptime(&ckwq->kw_ts); + LIST_INSERT_HEAD(&pth_free_list, ckwq, kw_list); + ckwq->kw_pflags |= KSYN_WQ_FLIST; + num_infreekwq++; + free_elem1 = NULL; + } else { + /* remove from the only list it is in, i.e. the hash */ + ckwq->kw_pflags &= ~(KSYN_WQ_FLIST | KSYN_WQ_INHASH); + LIST_REMOVE(ckwq, kw_hash); + lck_mtx_destroy(&ckwq->kw_lock, pthread_lck_grp); + num_total_kwq--; + num_freekwq++; + free_elem1 = ckwq; + } + } else + free_elem1 = NULL; + if (qfreenow == 0) + sched = 1; } } @@ -2211,6 +2868,10 @@ ksyn_wqrelease(ksyn_wait_queue_t kwq, ksyn_wait_queue_t ckwq) thread_call_enter_delayed(psynch_thcall, deadline); } pthread_list_unlock(); + if (free_elem != NULL) + zfree(kwq_zone, free_elem); + if (free_elem1 != NULL) + zfree(kwq_zone, free_elem1); } /* responsible for freeing the waitqueues */ @@ -2226,16 +2887,13 @@ psynch_wq_cleanup(__unused void * param, __unused void * param1) //pthread_list_lock_spin(); pthread_list_lock(); + num_addedfreekwq = num_infreekwq - num_lastfreekwqcount; + num_lastfreekwqcount = num_infreekwq; microuptime(&t); LIST_FOREACH(kwq, &pth_free_list, kw_list) { - - if (count > 100) { - delayed = 1; - break; - } - if ((kwq->kw_iocount != 0) && (kwq->kw_inqueue != 0)) { - /* still in freelist ??? 
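
   The reaper's decision per parked entry, schematically (t is the
   microuptime() sample taken on entry):

     if (kw_iocount || kw_pre_rwwc || kw_inqueue || kw_pre_intrcount)
         skip it, still in use;
     else if (t.tv_sec - kw_ts.tv_sec >= KSYN_CLEANUP_DEADLINE)
         unhash it and move it to the local freelist for zfree below;
     else
         delayed = 1, rearm the thread call and retry later;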
*/ + if ((kwq->kw_iocount != 0) || (kwq->kw_pre_rwwc != 0) || (kwq->kw_inqueue != 0) || (kwq->kw_pre_intrcount != 0)) { + /* still in use */ continue; } diff = t.tv_sec - kwq->kw_ts.tv_sec; @@ -2244,10 +2902,13 @@ psynch_wq_cleanup(__unused void * param, __unused void * param1) if (diff >= KSYN_CLEANUP_DEADLINE) { /* out of hash */ kwq->kw_pflags &= ~(KSYN_WQ_FLIST | KSYN_WQ_INHASH); + num_infreekwq--; + num_freekwq++; LIST_REMOVE(kwq, kw_hash); LIST_REMOVE(kwq, kw_list); LIST_INSERT_HEAD(&freelist, kwq, kw_list); count ++; + num_total_kwq--; } else { delayed = 1; } @@ -2268,23 +2929,22 @@ psynch_wq_cleanup(__unused void * param, __unused void * param1) while ((kwq = LIST_FIRST(&freelist)) != NULL) { LIST_REMOVE(kwq, kw_list); lck_mtx_destroy(&kwq->kw_lock, pthread_lck_grp); - kfree(kwq, sizeof(struct ksyn_wait_queue)); + zfree(kwq_zone, kwq); } } int -ksyn_block_thread_locked(ksyn_wait_queue_t kwq, uint64_t abstime, uthread_t uth) +ksyn_block_thread_locked(ksyn_wait_queue_t kwq, uint64_t abstime, ksyn_waitq_element_t kwe, int mylog) { kern_return_t kret; int error = 0; +#if _PSYNCH_TRACE_ + uthread_t uth = NULL; +#endif /* _PSYNCH_TRACE_ */ - uth->uu_kwqqueue = (void *)kwq; -#if USE_WAITQUEUE - kret = wait_queue_assert_wait64(&kwq->kw_wq, kwq->kw_addr, THREAD_ABORTSAFE, abstime); -#else /* USE_WAITQUEUE */ - assert_wait_deadline(&uth->uu_psynchretval, THREAD_ABORTSAFE, abstime); -#endif /* USE_WAITQUEUE */ + kwe->kwe_kwqqueue = (void *)kwq; + assert_wait_deadline(&kwe->kwe_psynchretval, THREAD_ABORTSAFE, abstime); ksyn_wqunlock(kwq); kret = thread_block(NULL); @@ -2296,116 +2956,42 @@ ksyn_block_thread_locked(ksyn_wait_queue_t kwq, uint64_t abstime, uthread_t uth) error = EINTR; break; } +#if _PSYNCH_TRACE_ + uth = current_uthread(); +#if defined(__i386__) + if (mylog != 0) + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_THWAKEUP | DBG_FUNC_NONE, 0xf4f3f2f1, (uint32_t)uth, kret, 0, 0); +#else + if (mylog != 0) + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_THWAKEUP | DBG_FUNC_NONE, 0xeeeeeeee, kret, error, 0xeeeeeeee, 0); +#endif +#endif /* _PSYNCH_TRACE_ */ + return(error); } kern_return_t -#if USE_WAITQUEUE -ksyn_wakeup_thread(ksyn_wait_queue_t kwq, uthread_t uth) -#else /* USE_WAITQUEUE */ -ksyn_wakeup_thread(__unused ksyn_wait_queue_t kwq, uthread_t uth) -#endif /* USE_WAITQUEUE */ +ksyn_wakeup_thread(__unused ksyn_wait_queue_t kwq, ksyn_waitq_element_t kwe) { - thread_t th; kern_return_t kret; - th = uth->uu_context.vc_thread; +#if _PSYNCH_TRACE_ + uthread_t uth = NULL; +#endif /* _PSYNCH_TRACE_ */ -#if USE_WAITQUEUE - kret = wait_queue_wakeup64_thread(&kwq->kw_wq, kwq->kw_addr, th, THREAD_AWAKENED); -#else /* USE_WAITQUEUE */ - kret = thread_wakeup_prim((caddr_t)&uth->uu_psynchretval, TRUE, THREAD_AWAKENED); -#endif /* USE_WAITQUEUE */ + kret = thread_wakeup_one((caddr_t)&kwe->kwe_psynchretval); if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) panic("ksyn_wakeup_thread: panic waking up thread %x\n", kret); +#if _PSYNCH_TRACE_ + uth = kwe->kwe_uth; +#if defined(__i386__) + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_THWAKEUP | DBG_FUNC_NONE, 0xf1f2f3f4, (uint32_t)uth, kret, 0, 0); +#endif +#endif /* _PSYNCH_TRACE_ */ - - return(kret); } -/* move from one waitqueue to another */ -#if COND_MTX_WAITQUEUEMOVE -void -ksyn_move_wqthread( ksyn_wait_queue_t ckwq, ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t updateval, int diffgen, int nomutex) -#else /* COND_MTX_WAITQUEUEMOVE */ -void -ksyn_move_wqthread( ksyn_wait_queue_t ckwq, __unused ksyn_wait_queue_t kwq, __unused uint32_t mgen, uint32_t updateval, 
__unused int diffgen, int nomutex) -#endif /* COND_MTX_WAITQUEUEMOVE */ -{ - kern_return_t kret; - uthread_t uth; -#if COND_MTX_WAITQUEUEMOVE - int count = 0, error, kret; - uint32_t nextgen = mgen; -#endif /* COND_MTX_WAITQUEUEMOVE */ - struct ksyn_queue kq; - uint32_t upgen; - - ksyn_queue_init(&kq); -#if USE_WAITQUEUE - /* TBD wq move */ - kret = wait_queue_move_all(&ckwq->kw_wq, ckwq->kw_addr, &kwq->kw_wq, kwq->kw_addr); -#else /* USE_WAITQUEUE */ - /* no need to move as the thread is blocked at uthread address */ - kret = KERN_SUCCESS; -#endif /* USE_WAITQUEUE */ - - if (nomutex != 0) - upgen = updateval | PTHRW_MTX_NONE; - else - upgen = updateval; - - if (kret== KERN_SUCCESS) { -redrive: - while ((uth = ksyn_queue_removefirst(&ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER], ckwq)) != NULL) { - if (nomutex != 0) { -#if COND_MTX_WAITQUEUEMOVE - uth->uu_psynchretval = upgen; -#else /* COND_MTX_WAITQUEUEMOVE */ - uth->uu_psynchretval = 0; - uth->uu_kwqqueue = NULL; - kret = ksyn_wakeup_thread(ckwq, uth); - if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) - panic("ksyn_move_wqthread: panic waking up \n"); - if (kret == KERN_NOT_WAITING) - goto redrive; -#endif /* COND_MTX_WAITQUEUEMOVE */ - } -#if COND_MTX_WAITQUEUEMOVE - else { - count++; - if (count >diffgen) - panic("movethread inserting more than expected\n"); - TAILQ_INSERT_TAIL(&kq.ksynq_uthlist, uth, uu_mtxlist); - } -#endif /* COND_MTX_WAITQUEUEMOVE */ - - } - ksyn_wqunlock(ckwq); - -#if COND_MTX_WAITQUEUEMOVE - if ( (nomutex == 0) && (count > 0)) { - ksyn_wqlock(kwq); - uth = TAILQ_FIRST(&kq.ksynq_uthlist); - while(uth != NULL) { - TAILQ_REMOVE(&kq.ksynq_uthlist, uth, uu_mtxlist); - error = ksyn_queue_insert(kwq, &kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], nextgen, uth, SEQFIT); - if (error != 0) { - panic("movethread insert failed\n"); - } - uth->uu_lockseq = nextgen; - nextgen += PTHRW_INC; - uth = TAILQ_FIRST(&kq.ksynq_uthlist); - } - ksyn_wqunlock(kwq); - } -#endif /* COND_MTX_WAITQUEUEMOVE */ - } else - panic("movethread : wq move all failed\n"); - return; -} - /* find the true shared object/offset for shared mutexes */ int ksyn_findobj(uint64_t mutex, uint64_t * objectp, uint64_t * offsetp) @@ -2509,12 +3095,13 @@ kwq_find_rw_lowest(ksyn_wait_queue_t kwq, int flags, uint32_t premgen, int * typ } else lowest[KSYN_QUEUE_YWRITER] = 0; - +#if __TESTPANICS__ if (count == 0) panic("nothing in the queue???\n"); +#endif /* __TESTPANICS__ */ - low = numbers[0]; + low = numbers[0]; lowtype = typenum[0]; if (count > 1) { for (i = 1; i< count; i++) { @@ -2535,44 +3122,39 @@ kwq_find_rw_lowest(ksyn_wait_queue_t kwq, int flags, uint32_t premgen, int * typ int ksyn_wakeupreaders(ksyn_wait_queue_t kwq, uint32_t limitread, int longreadset, int allreaders, uint32_t updatebits, int * wokenp) { - uthread_t uth; + ksyn_waitq_element_t kwe = NULL; ksyn_queue_t kq; int failedwakeup = 0; int numwoken = 0; kern_return_t kret = KERN_SUCCESS; - int resetbit = updatebits & PTHRW_RW_HUNLOCK; uint32_t lbits = 0; lbits = updatebits; if (longreadset != 0) { /* clear all read and longreads */ - while ((uth = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_READ], kwq)) != NULL) { - uth->uu_psynchretval = lbits; - /* set on one thread */ - if (resetbit != 0) { - lbits &= ~PTHRW_RW_HUNLOCK; - resetbit = 0; - } + while ((kwe = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_READ], kwq)) != NULL) { + kwe->kwe_psynchretval = lbits; + kwe->kwe_kwqqueue = NULL; + numwoken++; - uth->uu_kwqqueue = NULL; - kret = ksyn_wakeup_thread(kwq, uth); + kret = 
ksyn_wakeup_thread(kwq, kwe); +#if __TESTPANICS__ if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) panic("ksyn_wakeupreaders: panic waking up readers\n"); +#endif /* __TESTPANICS__ */ if (kret == KERN_NOT_WAITING) { failedwakeup++; } } - while ((uth = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_LREAD], kwq)) != NULL) { - uth->uu_psynchretval = lbits; - uth->uu_kwqqueue = NULL; - if (resetbit != 0) { - lbits &= ~PTHRW_RW_HUNLOCK; - resetbit = 0; - } + while ((kwe = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_LREAD], kwq)) != NULL) { + kwe->kwe_psynchretval = lbits; + kwe->kwe_kwqqueue = NULL; numwoken++; - kret = ksyn_wakeup_thread(kwq, uth); + kret = ksyn_wakeup_thread(kwq, kwe); +#if __TESTPANICS__ if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) panic("ksyn_wakeupreaders: panic waking up lreaders\n"); +#endif /* __TESTPANICS__ */ if (kret == KERN_NOT_WAITING) { failedwakeup++; } @@ -2580,17 +3162,15 @@ ksyn_wakeupreaders(ksyn_wait_queue_t kwq, uint32_t limitread, int longreadset, i } else { kq = &kwq->kw_ksynqueues[KSYN_QUEUE_READ]; while ((kq->ksynq_count != 0) && (allreaders || (is_seqlower(kq->ksynq_firstnum, limitread) != 0))) { - uth = ksyn_queue_removefirst(kq, kwq); - uth->uu_psynchretval = lbits; - if (resetbit != 0) { - lbits &= ~PTHRW_RW_HUNLOCK; - resetbit = 0; - } + kwe = ksyn_queue_removefirst(kq, kwq); + kwe->kwe_psynchretval = lbits; + kwe->kwe_kwqqueue = NULL; numwoken++; - uth->uu_kwqqueue = NULL; - kret = ksyn_wakeup_thread(kwq, uth); + kret = ksyn_wakeup_thread(kwq, kwe); +#if __TESTPANICS__ if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) panic("ksyn_wakeupreaders: panic waking up readers\n"); +#endif /* __TESTPANICS__ */ if (kret == KERN_NOT_WAITING) { failedwakeup++; } @@ -2605,32 +3185,45 @@ ksyn_wakeupreaders(ksyn_wait_queue_t kwq, uint32_t limitread, int longreadset, i /* This handles the unlock grants for the next set on rw_unlock() or on arrival of all preposted waiters */ int -kwq_handle_unlock(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t * updatep, int flags, int * blockp, uint32_t premgen) +kwq_handle_unlock(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t rw_wc, uint32_t * updatep, int flags, int * blockp, uint32_t premgen) { uint32_t low_reader, low_writer, low_ywriter, low_lreader,limitrdnum; int rwtype, error=0; int longreadset = 0, allreaders, failed; - uint32_t updatebits; + uint32_t updatebits=0, numneeded = 0; int prepost = flags & KW_UNLOCK_PREPOST; thread_t preth = THREAD_NULL; + ksyn_waitq_element_t kwe; uthread_t uth; thread_t th; int woken = 0; int block = 1; - uint32_t lowest[KSYN_QUEUE_MAX]; /* np need for upgrade as it is handled separately */ + uint32_t lowest[KSYN_QUEUE_MAX]; /* no need for upgrade as it is handled separately */ kern_return_t kret = KERN_SUCCESS; + ksyn_queue_t kq; + int curthreturns = 0; #if _PSYNCH_TRACE_ -#if defined(__i386__) - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWHANDLEU | DBG_FUNC_START, (uint32_t)kwq, mgen, premgen, 0, 0); -#endif + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWHANDLEU | DBG_FUNC_START, (uint32_t)kwq->kw_addr, mgen, premgen, rw_wc, 0); #endif /* _PSYNCH_TRACE_ */ if (prepost != 0) { preth = current_thread(); } + kq = &kwq->kw_ksynqueues[KSYN_QUEUE_READ]; + kwq->kw_lastseqword = rw_wc; + kwq->kw_lastunlockseq = (rw_wc & PTHRW_COUNT_MASK); + kwq->kw_overlapwatch = 0; + /* upgrade pending */ if (is_rw_ubit_set(mgen)) { +#if __TESTPANICS__ + panic("NO UBIT SHOULD BE SET\n"); + updatebits = PTH_RWL_EBIT | PTH_RWL_KBIT; + if 
(kwq->kw_ksynqueues[KSYN_QUEUE_WRITER].ksynq_count != 0) + updatebits |= PTH_RWL_WBIT; + if (kwq->kw_ksynqueues[KSYN_QUEUE_YWRITER].ksynq_count != 0) + updatebits |= PTH_RWL_YBIT; if (prepost != 0) { if((flags & KW_UNLOCK_PREPOST_UPGRADE) != 0) { /* upgrade thread calling the prepost */ @@ -2641,34 +3234,37 @@ kwq_handle_unlock(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t * updatep, int } if (kwq->kw_ksynqueues[KSYN_QUEUE_UPGRADE].ksynq_count > 0) { - uth = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_UPGRADE], kwq); - uth->uu_psynchretval = (mgen | PTHRW_EBIT) & ~PTHRW_UBIT; - uth->uu_kwqqueue = NULL; - kret = ksyn_wakeup_thread(kwq, uth); + kwe = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_UPGRADE], kwq); + + kwq->kw_nextseqword = (rw_wc & PTHRW_COUNT_MASK) + updatebits; + kwe->kwe_psynchretval = updatebits; + kwe->kwe_kwqqueue = NULL; + kret = ksyn_wakeup_thread(kwq, kwe); if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) panic("kwq_handle_unlock: panic waking up the upgrade thread \n"); if (kret == KERN_NOT_WAITING) { kwq->kw_pre_intrcount = 1; /* actually a count */ kwq->kw_pre_intrseq = mgen; - kwq->kw_pre_intrretbits = uth->uu_psynchretval; + kwq->kw_pre_intrretbits = kwe->kwe_psynchretval; kwq->kw_pre_intrtype = PTH_RW_TYPE_UPGRADE; } error = 0; } else { panic("panic unable to find the upgrade thread\n"); } +#endif /* __TESTPANICS__ */ ksyn_wqunlock(kwq); goto out; } error = kwq_find_rw_lowest(kwq, flags, premgen, &rwtype, lowest); +#if __TESTPANICS__ if (error != 0) panic("rwunlock: failed to slot next round of threads"); +#endif /* __TESTPANICS__ */ #if _PSYNCH_TRACE_ -#if defined(__i386__) - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWHANDLEU | DBG_FUNC_NONE, (uint32_t)kwq, 1, rwtype, lowest, 0); -#endif + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWHANDLEU | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 1, rwtype, 0, 0); #endif /* _PSYNCH_TRACE_ */ low_reader = lowest[KSYN_QUEUE_READ]; low_lreader = lowest[KSYN_QUEUE_LREAD]; @@ -2676,24 +3272,36 @@ kwq_handle_unlock(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t * updatep, int low_ywriter = lowest[KSYN_QUEUE_YWRITER]; - updatebits = mgen & ~( PTHRW_EBIT | PTHRW_WBIT |PTHRW_YBIT | PTHRW_UBIT | PTHRW_LBIT); - longreadset = 0; allreaders = 0; + updatebits = 0; + + switch (rwtype & PTH_RW_TYPE_MASK) { case PTH_RW_TYPE_LREAD: longreadset = 1; + case PTH_RW_TYPE_READ: { + /* what about the preflight which is LREAD or READ ?? */ + if ((rwtype & PTH_RWSHFT_TYPE_MASK) != 0) { + if (rwtype & PTH_RWSHFT_TYPE_WRITE) + updatebits |= (PTH_RWL_WBIT | PTH_RWL_KBIT); + if (rwtype & PTH_RWSHFT_TYPE_YWRITE) + updatebits |= PTH_RWL_YBIT; + } limitrdnum = 0; if (longreadset == 0) { switch (rwtype & (PTH_RWSHFT_TYPE_WRITE | PTH_RWSHFT_TYPE_YWRITE)) { case PTH_RWSHFT_TYPE_WRITE: limitrdnum = low_writer; if (((rwtype & PTH_RWSHFT_TYPE_LREAD) != 0) && - (is_seqlower(low_lreader, low_writer) != 0)) { + (is_seqlower(low_lreader, limitrdnum) != 0)) { + longreadset = 1; + } + if (((flags & KW_UNLOCK_PREPOST_LREADLOCK) != 0) && + (is_seqlower(premgen, limitrdnum) != 0)) { longreadset = 1; } - break; case PTH_RWSHFT_TYPE_YWRITE: /* all read ? 
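     with only yielding writers pending, every queued reader may go.  The
     general rule, schematically: limitrdnum is the sequence of the oldest
     queued (non-yielding) writer, and a reader is granted only while it
     stays behind that limit,

       grant the reader iff is_seqlower(reader_seq, limitrdnum);

     a long-read waiter or LREAD prepost that is itself behind the limit
     flips longreadset, switching the wakeup into long-read mode.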
*/ @@ -2702,11 +3310,25 @@ kwq_handle_unlock(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t * updatep, int longreadset = 1; } else allreaders = 1; + if (((flags & KW_UNLOCK_PREPOST_LREADLOCK) != 0) && + (is_seqlower(premgen, low_ywriter) != 0)) { + longreadset = 1; + allreaders = 0; + } + + break; case (PTH_RWSHFT_TYPE_WRITE | PTH_RWSHFT_TYPE_YWRITE): - limitrdnum = low_writer; + if (is_seqlower(low_ywriter, low_writer) != 0) { + limitrdnum = low_ywriter; + } else + limitrdnum = low_writer; if (((rwtype & PTH_RWSHFT_TYPE_LREAD) != 0) && - (is_seqlower(low_lreader, low_ywriter) != 0)) { + (is_seqlower(low_lreader, limitrdnum) != 0)) { + longreadset = 1; + } + if (((flags & KW_UNLOCK_PREPOST_LREADLOCK) != 0) && + (is_seqlower(premgen, limitrdnum) != 0)) { longreadset = 1; } break; @@ -2718,35 +3340,71 @@ kwq_handle_unlock(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t * updatep, int }; } + numneeded = 0; + if (longreadset != 0) { + updatebits |= PTH_RWL_LBIT; + updatebits &= ~PTH_RWL_KBIT; + if ((flags & (KW_UNLOCK_PREPOST_READLOCK | KW_UNLOCK_PREPOST_LREADLOCK)) != 0) + numneeded += 1; + numneeded += kwq->kw_ksynqueues[KSYN_QUEUE_READ].ksynq_count; + numneeded += kwq->kw_ksynqueues[KSYN_QUEUE_LREAD].ksynq_count; + updatebits += (numneeded << PTHRW_COUNT_SHIFT); + kwq->kw_overlapwatch = 1; + } else { + /* no longread, evaluate number of readers */ - if ((rwtype & PTH_RWSHFT_TYPE_WRITE) != 0) - updatebits |= PTHRW_WBIT; - else if ((rwtype & PTH_RWSHFT_TYPE_YWRITE) != 0) - updatebits |= PTHRW_YBIT; + switch (rwtype & (PTH_RWSHFT_TYPE_WRITE | PTH_RWSHFT_TYPE_YWRITE)) { + case PTH_RWSHFT_TYPE_WRITE: + limitrdnum = low_writer; + numneeded = ksyn_queue_count_tolowest(kq, limitrdnum); + if (((flags & KW_UNLOCK_PREPOST_READLOCK) != 0) && (is_seqlower(premgen, limitrdnum) != 0)) { + curthreturns = 1; + numneeded += 1; + } + break; + case PTH_RWSHFT_TYPE_YWRITE: + /* all read ? 
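+     yes: every queued reader is counted.  The reply word then packs the
+     grant count above the state bits, e.g. (illustrative ordering; the
+     exact bits depend on the case taken):
+
+       updatebits |= PTH_RWL_WBIT | PTH_RWL_KBIT;
+       updatebits += (numneeded << PTHRW_COUNT_SHIFT);
+
+     so a woken reader sees both how many readers were admitted and
+     whether writers remain queued.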
*/ + numneeded += kwq->kw_ksynqueues[KSYN_QUEUE_READ].ksynq_count; + if ((flags & KW_UNLOCK_PREPOST_READLOCK) != 0) { + curthreturns = 1; + numneeded += 1; + } + break; + case (PTH_RWSHFT_TYPE_WRITE | PTH_RWSHFT_TYPE_YWRITE): + limitrdnum = low_writer; + numneeded = ksyn_queue_count_tolowest(kq, limitrdnum); + if (((flags & KW_UNLOCK_PREPOST_READLOCK) != 0) && (is_seqlower(premgen, limitrdnum) != 0)) { + curthreturns = 1; + numneeded += 1; + } + break; + default: /* no writers at all */ + /* no other waiters, only readers */ + kwq->kw_overlapwatch = 1; + numneeded += kwq->kw_ksynqueues[KSYN_QUEUE_READ].ksynq_count; + if ((flags & KW_UNLOCK_PREPOST_READLOCK) != 0) { + curthreturns = 1; + numneeded += 1; + } + }; + + updatebits += (numneeded << PTHRW_COUNT_SHIFT); + } + kwq->kw_nextseqword = (rw_wc & PTHRW_COUNT_MASK) + updatebits; - if (longreadset == 0) { - if((prepost != 0) && - ((flags & KW_UNLOCK_PREPOST_READLOCK) != 0) && - ((allreaders != 0) || (is_seqlower(premgen, limitrdnum) != 0))) { - block = 0; - uth = current_uthread(); - uth->uu_psynchretval = updatebits; - } - } else { - updatebits |= PTHRW_LBIT; - if ((prepost != 0) && - ((flags & (KW_UNLOCK_PREPOST_READLOCK | KW_UNLOCK_PREPOST_LREADLOCK)) != 0)) { - block = 0; - uth = current_uthread(); - uth->uu_psynchretval = updatebits; - } + if (curthreturns != 0) { + block = 0; + uth = current_uthread(); + kwe = &uth->uu_kwe; + kwe->kwe_psynchretval = updatebits; } - if (prepost != 0) { - updatebits |= PTHRW_RW_HUNLOCK; - } failed = ksyn_wakeupreaders(kwq, limitrdnum, longreadset, allreaders, updatebits, &woken); +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWHANDLEU | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 2, woken, failed, 0); +#endif /* _PSYNCH_TRACE_ */ + if (failed != 0) { kwq->kw_pre_intrcount = failed; /* actually a count */ kwq->kw_pre_intrseq = limitrdnum; @@ -2757,43 +3415,49 @@ kwq_handle_unlock(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t * updatep, int kwq->kw_pre_intrtype = PTH_RW_TYPE_READ; } - /* if we woken up no one and the current thread is returning, ensure it is doing unlock */ - if ((prepost != 0) && (woken == 0) && (block == 0)&& ((updatebits & PTHRW_RW_HUNLOCK) != 0)) { - uth = current_uthread(); - uth->uu_psynchretval = updatebits; - } - error = 0; + if ((kwq->kw_ksynqueues[KSYN_QUEUE_WRITER].ksynq_count != 0) && ((updatebits & PTH_RWL_WBIT) == 0)) + panic("kwq_handle_unlock: writer pending but no writebit set %x\n", updatebits); } break; case PTH_RW_TYPE_WRITE: { - updatebits |= PTHRW_EBIT; + + /* only one thread is going to be granted */ + updatebits |= (PTHRW_INC); + updatebits |= PTH_RWL_KBIT| PTH_RWL_EBIT; + if (((flags & KW_UNLOCK_PREPOST_WRLOCK) != 0) && (low_writer == premgen)) { block = 0; if (kwq->kw_ksynqueues[KSYN_QUEUE_WRITER].ksynq_count != 0) - updatebits |= PTHRW_WBIT; - else if ((rwtype & PTH_RWSHFT_TYPE_YWRITE) != 0) - updatebits |= PTHRW_YBIT; + updatebits |= PTH_RWL_WBIT; + if ((rwtype & PTH_RWSHFT_TYPE_YWRITE) != 0) + updatebits |= PTH_RWL_YBIT; th = preth; uth = get_bsdthread_info(th); - uth->uu_psynchretval = updatebits; + kwe = &uth->uu_kwe; + kwe->kwe_psynchretval = updatebits; } else { /* we are not granting writelock to the preposting thread */ - uth = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], kwq); + kwe = ksyn_queue_removefirst(&kwq->kw_ksynqueues[KSYN_QUEUE_WRITER], kwq); /* if there are writers present or the preposting write thread then W bit is to be set */ if ((kwq->kw_ksynqueues[KSYN_QUEUE_WRITER].ksynq_count != 0) || ((flags & 
KW_UNLOCK_PREPOST_WRLOCK) != 0) ) - updatebits |= PTHRW_WBIT; - else if ((rwtype & PTH_RWSHFT_TYPE_YWRITE) != 0) - updatebits |= PTHRW_YBIT; - uth->uu_psynchretval = updatebits; - uth->uu_kwqqueue = NULL; + updatebits |= PTH_RWL_WBIT; + if ((rwtype & PTH_RWSHFT_TYPE_YWRITE) != 0) + updatebits |= PTH_RWL_YBIT; + kwe->kwe_psynchretval = updatebits; + kwe->kwe_kwqqueue = NULL; /* setup next in the queue */ - kret = ksyn_wakeup_thread(kwq, uth); + kret = ksyn_wakeup_thread(kwq, kwe); +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWHANDLEU | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 3, kret, 0, 0); +#endif /* _PSYNCH_TRACE_ */ +#if __TESTPANICS__ if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) panic("kwq_handle_unlock: panic waking up writer\n"); +#endif /* __TESTPANICS__ */ if (kret == KERN_NOT_WAITING) { kwq->kw_pre_intrcount = 1; /* actually a count */ kwq->kw_pre_intrseq = low_writer; @@ -2802,6 +3466,9 @@ kwq_handle_unlock(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t * updatep, int } error = 0; } + kwq->kw_nextseqword = (rw_wc & PTHRW_COUNT_MASK) + updatebits; + if ((updatebits & (PTH_RWL_KBIT | PTH_RWL_EBIT)) != (PTH_RWL_KBIT | PTH_RWL_EBIT)) + panic("kwq_handle_unlock: writer lock granted but no ke set %x\n", updatebits); } break; @@ -2809,26 +3476,36 @@ kwq_handle_unlock(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t * updatep, int case PTH_RW_TYPE_YWRITE: { /* can reader locks be granted ahead of this write? */ if ((rwtype & PTH_RWSHFT_TYPE_READ) != 0) { - if ((rwtype & PTH_RWSHFT_TYPE_WRITE) != 0) - updatebits |= PTHRW_WBIT; - else if ((rwtype & PTH_RWSHFT_TYPE_WRITE) != 0) - updatebits |= PTHRW_YBIT; + if ((rwtype & PTH_RWSHFT_TYPE_MASK) != 0) { + if (rwtype & PTH_RWSHFT_TYPE_WRITE) + updatebits |= (PTH_RWL_WBIT | PTH_RWL_KBIT); + if (rwtype & PTH_RWSHFT_TYPE_YWRITE) + updatebits |= PTH_RWL_YBIT; + } if ((rwtype & PTH_RWSHFT_TYPE_WRITE) != 0) { /* is lowest reader less than the low writer? 
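     Note these comparisons are circular, not plain integer compares:
     generation numbers wrap in 32 bits, so "lower" means "behind by less
     than half the window".  With the is_seqlower() definition at the end
     of this file (and assuming PTHRW_MAX_READERS/2 exceeds the distances
     shown):

       is_seqlower(0x00000100, 0xfffffe00) == 0   0x100 is logically ahead
       is_seqlower(0xfffffe00, 0x00000100) == 1   the wrap is accounted for

     so a waiter queued just before the counter wrapped is still treated
     as older than one queued just after.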
*/ if (is_seqlower(low_reader,low_writer) == 0) goto yielditis; + + numneeded = ksyn_queue_count_tolowest(kq, low_writer); + updatebits += (numneeded << PTHRW_COUNT_SHIFT); if (((flags & KW_UNLOCK_PREPOST_READLOCK) != 0) && (is_seqlower(premgen, low_writer) != 0)) { uth = current_uthread(); - uth->uu_psynchretval = updatebits; + kwe = &uth->uu_kwe; + /* add one more */ + updatebits += PTHRW_INC; + kwe->kwe_psynchretval = updatebits; block = 0; } - if (prepost != 0) { - updatebits |= PTHRW_RW_HUNLOCK; - } + kwq->kw_nextseqword = (rw_wc & PTHRW_COUNT_MASK) + updatebits; + /* there will be readers to wakeup, no need to check for woken */ failed = ksyn_wakeupreaders(kwq, low_writer, 0, 0, updatebits, NULL); +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWHANDLEU | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 2, woken, failed, 0); +#endif /* _PSYNCH_TRACE_ */ if (failed != 0) { kwq->kw_pre_intrcount = failed; /* actually a count */ kwq->kw_pre_intrseq = low_writer; @@ -2838,32 +3515,33 @@ kwq_handle_unlock(ksyn_wait_queue_t kwq, uint32_t mgen, uint32_t * updatep, int error = 0; } else { /* wakeup all readers */ + numneeded = kwq->kw_ksynqueues[KSYN_QUEUE_READ].ksynq_count; + updatebits += (numneeded << PTHRW_COUNT_SHIFT); if ((prepost != 0) && ((flags & KW_UNLOCK_PREPOST_READLOCK) != 0)) { uth = current_uthread(); - uth->uu_psynchretval = updatebits; + kwe = &uth->uu_kwe; + updatebits += PTHRW_INC; + kwe->kwe_psynchretval = updatebits; block = 0; } - if (prepost != 0) { - updatebits |= PTHRW_RW_HUNLOCK; - } + kwq->kw_nextseqword = (rw_wc & PTHRW_COUNT_MASK) + updatebits; failed = ksyn_wakeupreaders(kwq, low_writer, 0, 1, updatebits, &woken); +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWHANDLEU | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 2, woken, failed, 0); +#endif /* _PSYNCH_TRACE_ */ if (failed != 0) { kwq->kw_pre_intrcount = failed; /* actually a count */ kwq->kw_pre_intrseq = kwq->kw_highseq; kwq->kw_pre_intrretbits = updatebits; kwq->kw_pre_intrtype = PTH_RW_TYPE_READ; } - /* if we woken up no one and the current thread is returning, ensure it is doing unlock */ - if ((prepost != 0) && (woken ==0) && (block == 0)&& ((updatebits & PTHRW_RW_HUNLOCK) != 0)) { - uth = current_uthread(); - uth->uu_psynchretval = updatebits; - } error = 0; } } else { yielditis: /* no reads, so granting yielding writes */ - updatebits |= PTHRW_EBIT; + updatebits |= PTHRW_INC; + updatebits |= PTH_RWL_KBIT| PTH_RWL_EBIT; if (((flags & KW_UNLOCK_PREPOST_YWRLOCK) != 0) && (low_writer == premgen)) { /* preposting yielding write thread is being granted exclusive lock */ @@ -2871,29 +3549,35 @@ yielditis: block = 0; if ((rwtype & PTH_RWSHFT_TYPE_WRITE) != 0) - 
updatebits |= PTHRW_YBIT; + updatebits |= PTH_RWL_YBIT; - uth->uu_psynchretval = updatebits; - uth->uu_kwqqueue = NULL; + kwe->kwe_psynchretval = updatebits; + kwe->kwe_kwqqueue = NULL; - kret = ksyn_wakeup_thread(kwq, uth); + kret = ksyn_wakeup_thread(kwq, kwe); +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWHANDLEU | DBG_FUNC_NONE, (uint32_t)kwq->kw_addr, 3, kret, 0, 0); +#endif /* _PSYNCH_TRACE_ */ +#if __TESTPANICS__ if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) panic("kwq_handle_unlock : panic waking up readers\n"); +#endif /* __TESTPANICS__ */ if (kret == KERN_NOT_WAITING) { kwq->kw_pre_intrcount = 1; /* actually a count */ kwq->kw_pre_intrseq = low_ywriter; @@ -2902,6 +3586,7 @@ yielditis: } error = 0; } + kwq->kw_nextseqword = (rw_wc & PTHRW_COUNT_MASK) + updatebits; } } break; @@ -2911,21 +3596,58 @@ yielditis: }; - if (updatep != NULL) - *updatep = updatebits; out: + if (updatep != NULL) + *updatep = updatebits; if (blockp != NULL) *blockp = block; #if _PSYNCH_TRACE_ -#if defined(__i386__) - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_RWHANDLEU | DBG_FUNC_END, (uint32_t)kwq, 0, 0, block, 0); -#endif + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_RWHANDLEU | DBG_FUNC_END, (uint32_t)kwq->kw_addr, 0, updatebits, block, 0); #endif /* _PSYNCH_TRACE_ */ return(error); } +int +kwq_handle_overlap(ksyn_wait_queue_t kwq, uint32_t lgenval, __unused uint32_t ugenval, uint32_t rw_wc, uint32_t *updatebitsp, __unused int flags , int * blockp) +{ + uint32_t highword = kwq->kw_nextseqword & PTHRW_COUNT_MASK; + uint32_t lowword = kwq->kw_lastseqword & PTHRW_COUNT_MASK; + uint32_t val=0; + int withinseq; + + + /* overlap is set, so no need to check for valid state for overlap */ + + withinseq = ((is_seqlower_eq(rw_wc, highword) != 0) || (is_seqhigher_eq(lowword, rw_wc) != 0)); + + if (withinseq != 0) { + if ((kwq->kw_nextseqword & PTH_RWL_LBIT) == 0) { + /* if no writers ahead, overlap granted */ + if ((lgenval & PTH_RWL_WBIT) == 0) { + goto grantoverlap; + } + } else { + /* Lbit is set, and writers ahead does not count */ + goto grantoverlap; + } + } + + *blockp = 1; + return(0); + +grantoverlap: + /* increase the next expected seq by one */ + kwq->kw_nextseqword += PTHRW_INC; + /* set count by one & bits from the nextseq and add M bit */ + val = PTHRW_INC; + val |= ((kwq->kw_nextseqword & PTHRW_BIT_MASK) | PTH_RWL_MBIT); + *updatebitsp = val; + *blockp = 0; + return(0); +} +#if NOTYET /* handle downgrade actions */ int kwq_handle_downgrade(ksyn_wait_queue_t kwq, uint32_t mgen, __unused int flags, __unused uint32_t premgen, __unused int * blockp) @@ -2964,33 +3686,38 @@ kwq_handle_downgrade(ksyn_wait_queue_t kwq, uint32_t mgen, __unused int flags, _ } return(0); } + +#endif /* NOTYET */ + /************* Indiv queue support routines ************************/ void ksyn_queue_init(ksyn_queue_t kq) { - TAILQ_INIT(&kq->ksynq_uthlist); + TAILQ_INIT(&kq->ksynq_kwelist); kq->ksynq_count = 0; kq->ksynq_firstnum = 0; kq->ksynq_lastnum = 0; } - int -ksyn_queue_insert(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t mgen, struct uthread * uth, int fit) +ksyn_queue_insert(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t mgen, struct uthread * uth, ksyn_waitq_element_t kwe, int fit) { uint32_t lockseq = mgen & PTHRW_COUNT_MASK; - struct uthread * q_uth, * r_uth; + ksyn_waitq_element_t q_kwe, r_kwe; + int res = 0; + uthread_t nuth = NULL; if (kq->ksynq_count == 0) { - TAILQ_INSERT_HEAD(&kq->ksynq_uthlist, uth, uu_mtxlist); + TAILQ_INSERT_HEAD(&kq->ksynq_kwelist, kwe, kwe_list); kq->ksynq_firstnum = 
lockseq; kq->ksynq_lastnum = lockseq; goto out; } if (fit == FIRSTFIT) { + /* TBD: if retry bit is set for mutex, add it to the head */ /* firstfit, arriving order */ - TAILQ_INSERT_TAIL(&kq->ksynq_uthlist, uth, uu_mtxlist); + TAILQ_INSERT_TAIL(&kq->ksynq_kwelist, kwe, kwe_list); if (is_seqlower (lockseq, kq->ksynq_firstnum) != 0) kq->ksynq_firstnum = lockseq; if (is_seqhigher (lockseq, kq->ksynq_lastnum) != 0) @@ -2998,55 +3725,79 @@ ksyn_queue_insert(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t mgen, struct goto out; } - if ((lockseq == kq->ksynq_firstnum) || (lockseq == kq->ksynq_lastnum)) - panic("ksyn_queue_insert: two threads with same lockseq "); + if ((lockseq == kq->ksynq_firstnum) || (lockseq == kq->ksynq_lastnum)) { + /* During prepost when a thread is getting cancelled, we could have two with same seq */ + if (kwe->kwe_flags == KWE_THREAD_PREPOST) { + q_kwe = ksyn_queue_find_seq(kwq, kq, lockseq, 0); + if ((q_kwe != NULL) && ((nuth = (uthread_t)q_kwe->kwe_uth) != NULL) && + ((nuth->uu_flag & (UT_CANCELDISABLE | UT_CANCEL | UT_CANCELED)) == UT_CANCEL)) { + TAILQ_INSERT_TAIL(&kq->ksynq_kwelist, kwe, kwe_list); + goto out; + + } else { + __FAILEDUSERTEST__("ksyn_queue_insert: two threads with same lockseq "); + res = EBUSY; + goto out1; + } + } else { + __FAILEDUSERTEST__("ksyn_queue_insert: two threads with same lockseq "); + res = EBUSY; + goto out1; + } + } /* check for next seq one */ if (is_seqlower(kq->ksynq_lastnum, lockseq) != 0) { - TAILQ_INSERT_TAIL(&kq->ksynq_uthlist, uth, uu_mtxlist); + TAILQ_INSERT_TAIL(&kq->ksynq_kwelist, kwe, kwe_list); kq->ksynq_lastnum = lockseq; goto out; } if (is_seqlower(lockseq, kq->ksynq_firstnum) != 0) { - TAILQ_INSERT_HEAD(&kq->ksynq_uthlist, uth, uu_mtxlist); + TAILQ_INSERT_HEAD(&kq->ksynq_kwelist, kwe, kwe_list); kq->ksynq_firstnum = lockseq; goto out; } /* goto slow insert mode */ - TAILQ_FOREACH_SAFE(q_uth, &kq->ksynq_uthlist, uu_mtxlist, r_uth) { - if (is_seqhigher(q_uth->uu_lockseq, lockseq) != 0) { - TAILQ_INSERT_BEFORE(q_uth, uth, uu_mtxlist); + TAILQ_FOREACH_SAFE(q_kwe, &kq->ksynq_kwelist, kwe_list, r_kwe) { + if (is_seqhigher(q_kwe->kwe_lockseq, lockseq) != 0) { + TAILQ_INSERT_BEFORE(q_kwe, kwe, kwe_list); goto out; } } +#if __TESTPANICS__ panic("failed to insert \n"); +#endif /* __TESTPANICS__ */ + out: + if (uth != NULL) + kwe->kwe_uth = uth; kq->ksynq_count++; kwq->kw_inqueue++; update_low_high(kwq, lockseq); - return(0); +out1: + return(res); } -struct uthread * +ksyn_waitq_element_t ksyn_queue_removefirst(ksyn_queue_t kq, ksyn_wait_queue_t kwq) { - uthread_t uth = NULL; - uthread_t q_uth; + ksyn_waitq_element_t kwe = NULL; + ksyn_waitq_element_t q_kwe; uint32_t curseq; if (kq->ksynq_count != 0) { - uth = TAILQ_FIRST(&kq->ksynq_uthlist); - TAILQ_REMOVE(&kq->ksynq_uthlist, uth, uu_mtxlist); - curseq = uth->uu_lockseq & PTHRW_COUNT_MASK; + kwe = TAILQ_FIRST(&kq->ksynq_kwelist); + TAILQ_REMOVE(&kq->ksynq_kwelist, kwe, kwe_list); + curseq = kwe->kwe_lockseq & PTHRW_COUNT_MASK; kq->ksynq_count--; kwq->kw_inqueue--; if(kq->ksynq_count != 0) { - q_uth = TAILQ_FIRST(&kq->ksynq_uthlist); - kq->ksynq_firstnum = (q_uth->uu_lockseq & PTHRW_COUNT_MASK); + q_kwe = TAILQ_FIRST(&kq->ksynq_kwelist); + kq->ksynq_firstnum = (q_kwe->kwe_lockseq & PTHRW_COUNT_MASK); } else { kq->ksynq_firstnum = 0; kq->ksynq_lastnum = 0; @@ -3062,28 +3813,30 @@ ksyn_queue_removefirst(ksyn_queue_t kq, ksyn_wait_queue_t kwq) kwq->kw_highseq = find_nexthighseq(kwq); } } - return(uth); + return(kwe); } void -ksyn_queue_removeitem(ksyn_wait_queue_t kwq, 
ksyn_queue_t kq, uthread_t uth) +ksyn_queue_removeitem(ksyn_wait_queue_t kwq, ksyn_queue_t kq, ksyn_waitq_element_t kwe) { - uthread_t q_uth; + ksyn_waitq_element_t q_kwe; uint32_t curseq; if (kq->ksynq_count > 0) { - TAILQ_REMOVE(&kq->ksynq_uthlist, uth, uu_mtxlist); + TAILQ_REMOVE(&kq->ksynq_kwelist, kwe, kwe_list); kq->ksynq_count--; if(kq->ksynq_count != 0) { - q_uth = TAILQ_FIRST(&kq->ksynq_uthlist); - kq->ksynq_firstnum = (q_uth->uu_lockseq & PTHRW_COUNT_MASK); + q_kwe = TAILQ_FIRST(&kq->ksynq_kwelist); + kq->ksynq_firstnum = (q_kwe->kwe_lockseq & PTHRW_COUNT_MASK); + q_kwe = TAILQ_LAST(&kq->ksynq_kwelist, ksynq_kwelist_head); + kq->ksynq_lastnum = (q_kwe->kwe_lockseq & PTHRW_COUNT_MASK); } else { kq->ksynq_firstnum = 0; kq->ksynq_lastnum = 0; } kwq->kw_inqueue--; - curseq = uth->uu_lockseq & PTHRW_COUNT_MASK; + curseq = kwe->kwe_lockseq & PTHRW_COUNT_MASK; if (kwq->kw_inqueue == 0) { kwq->kw_lowseq = 0; kwq->kw_highseq = 0; @@ -3096,6 +3849,168 @@ ksyn_queue_removeitem(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uthread_t uth) } } +/* find the thread at the given seq and optionally remove it from the queue */ +ksyn_waitq_element_t +ksyn_queue_find_seq(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t seq, int remove) +{ + ksyn_waitq_element_t q_kwe, r_kwe; + + /* TBD: bail out if higher seq is seen */ + /* case where wrap in the tail of the queue exists */ + TAILQ_FOREACH_SAFE(q_kwe, &kq->ksynq_kwelist, kwe_list, r_kwe) { + if ((q_kwe->kwe_lockseq & PTHRW_COUNT_MASK) == seq) { + if (remove != 0) + ksyn_queue_removeitem(kwq, kq, q_kwe); + return(q_kwe); + } + } + return(NULL); +} + + +/* find the thread at the target sequence (or a broadcast/prepost at or above) */ +ksyn_waitq_element_t +ksyn_queue_find_cvpreposeq(ksyn_queue_t kq, uint32_t cgen) +{ + ksyn_waitq_element_t q_kwe, r_kwe; + uint32_t lgen = (cgen & PTHRW_COUNT_MASK); + + /* case where wrap in the tail of the queue exists */ + TAILQ_FOREACH_SAFE(q_kwe, &kq->ksynq_kwelist, kwe_list, r_kwe) { + + /* skip the lower entries */ + if (is_seqlower((q_kwe->kwe_lockseq & PTHRW_COUNT_MASK), cgen) != 0) + continue; + + switch (q_kwe->kwe_flags) { + + case KWE_THREAD_INWAIT: + if ((q_kwe->kwe_lockseq & PTHRW_COUNT_MASK) != lgen) + break; + /* fall thru */ + + case KWE_THREAD_BROADCAST: + case KWE_THREAD_PREPOST: + return (q_kwe); + } + } + return(NULL); +} + +/* look for a thread at signalseq, or the first eligible waiter at or below uptoseq */ +ksyn_waitq_element_t +ksyn_queue_find_signalseq(__unused ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t uptoseq, uint32_t signalseq) +{ + ksyn_waitq_element_t q_kwe, r_kwe, t_kwe = NULL; + + /* case where wrap in the tail of the queue exists */ + TAILQ_FOREACH_SAFE(q_kwe, &kq->ksynq_kwelist, kwe_list, r_kwe) { + + switch (q_kwe->kwe_flags) { + + case KWE_THREAD_PREPOST: + if (is_seqhigher((q_kwe->kwe_lockseq & PTHRW_COUNT_MASK), uptoseq)) + return t_kwe; + /* fall thru */ + + case KWE_THREAD_BROADCAST: + /* match any prepost at our same uptoseq or any broadcast above */ + if (is_seqlower((q_kwe->kwe_lockseq & PTHRW_COUNT_MASK), uptoseq)) + continue; + return q_kwe; + + case KWE_THREAD_INWAIT: + /* + * Match any (non-cancelled) thread at or below our upto sequence - + * but prefer an exact match to our signal sequence (if present) to + * keep exact matches happening.
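+ * (Illustrative walk-through, not part of the original comment: with waiters queued at seqs 4, 8 and 12, signalseq == 8 and uptoseq == 12, the scan returns the waiter at 8; if that waiter were cancelled, the next eligible one at or above signalseq (12) is taken, and only when none remains does the scan fall back to the remembered lower waiter (4).)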
+ */ + if (is_seqhigher((q_kwe->kwe_lockseq & PTHRW_COUNT_MASK), uptoseq)) + return t_kwe; + + if (q_kwe->kwe_kwqqueue == kwq) { + uthread_t ut = q_kwe->kwe_uth; + if ((ut->uu_flag & ( UT_CANCELDISABLE | UT_CANCEL | UT_CANCELED)) != UT_CANCEL) { + /* if equal or higher than our signal sequence, return this one */ + if (is_seqhigher_eq((q_kwe->kwe_lockseq & PTHRW_COUNT_MASK), signalseq)) + return q_kwe; + + /* otherwise, just remember this eligible thread and move on */ + if (t_kwe == NULL) + t_kwe = q_kwe; + } + } + break; + + default: + panic("ksyn_queue_find_signalseq(): unknown wait queue element type (%d)\n", q_kwe->kwe_flags); + break; + } + } + return t_kwe; +} + + +int +ksyn_queue_move_tofree(ksyn_wait_queue_t ckwq, ksyn_queue_t kq, uint32_t upto, ksyn_queue_t kfreeq, int all, int release) +{ + ksyn_waitq_element_t kwe; + int count = 0; + uint32_t tseq = upto & PTHRW_COUNT_MASK; +#if _PSYNCH_TRACE_ + uthread_t ut; +#endif /* _PSYNCH_TRACE_ */ + + ksyn_queue_init(kfreeq); + + /* free all the entries, must be only fakes.. */ + kwe = TAILQ_FIRST(&kq->ksynq_kwelist); + while (kwe != NULL) { + if ((all == 0) && (is_seqhigher((kwe->kwe_lockseq & PTHRW_COUNT_MASK), tseq) != 0)) + break; + if (kwe->kwe_flags == KWE_THREAD_INWAIT) { + /* + * This scenario is typically noticed when the cvar is + * reinited and the new waiters are waiting. We can + * return them as spurious wait so the cvar state gets + * reset correctly. + */ +#if _PSYNCH_TRACE_ + ut = (uthread_t)kwe->kwe_uth; +#endif /* _PSYNCH_TRACE_ */ + + /* skip canceled ones */ + /* wake the rest */ + ksyn_queue_removeitem(ckwq, kq, kwe); + /* set M bit to indicate to waking CV to return Inc val */ + kwe->kwe_psynchretval = PTHRW_INC | (PTH_RWS_CV_MBIT | PTH_RWL_MTX_WAIT); + kwe->kwe_kwqqueue = NULL; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVHBROAD | DBG_FUNC_NONE, (uint32_t)ckwq->kw_addr, 0xcafecaf3, (uint32_t)(thread_tid((struct thread *)(((struct uthread *)(kwe->kwe_uth))->uu_context.vc_thread))), kwe->kwe_psynchretval, 0); +#endif /* _PSYNCH_TRACE_ */ + (void)ksyn_wakeup_thread(ckwq, kwe); + } else { + ksyn_queue_removeitem(ckwq, kq, kwe); + TAILQ_INSERT_TAIL(&kfreeq->ksynq_kwelist, kwe, kwe_list); + ckwq->kw_fakecount--; + count++; + } + kwe = TAILQ_FIRST(&kq->ksynq_kwelist); + } + + if ((release != 0) && (count != 0)) { + kwe = TAILQ_FIRST(&kfreeq->ksynq_kwelist); + while (kwe != NULL) { + TAILQ_REMOVE(&kfreeq->ksynq_kwelist, kwe, kwe_list); + zfree(kwe_zone, kwe); + kwe = TAILQ_FIRST(&kfreeq->ksynq_kwelist); + } + } + + return(count); +} + +/*************************************************************************/ void update_low_high(ksyn_wait_queue_t kwq, uint32_t lockseq) @@ -3167,6 +4082,51 @@ find_nexthighseq(ksyn_wait_queue_t kwq) return(highest); } +int +is_seqlower(uint32_t x, uint32_t y) +{ + if (x < y) { + if ((y-x) < (PTHRW_MAX_READERS/2)) + return(1); + } else { + if ((x-y) > (PTHRW_MAX_READERS/2)) + return(1); + } + return(0); +} + +int +is_seqlower_eq(uint32_t x, uint32_t y) +{ + if (x==y) + return(1); + else + return(is_seqlower(x,y)); } + +int +is_seqhigher(uint32_t x, uint32_t y) +{ + if (x > y) { + if ((x-y) < (PTHRW_MAX_READERS/2)) + return(1); + } else { + if ((y-x) > (PTHRW_MAX_READERS/2)) + return(1); + } + return(0); +} + +int +is_seqhigher_eq(uint32_t x, uint32_t y) +{ + if (x==y) + return(1); + else + return(is_seqhigher(x,y)); +} + + int find_diff(uint32_t upto, uint32_t lowest) { @@ -3174,7 +4134,14 @@ find_diff(uint32_t upto, uint32_t lowest) if (upto == lowest) return(0); +#if 0 diff 
= diff_genseq(upto, lowest); +#else + if (is_seqlower(upto, lowest) != 0) + diff = diff_genseq(lowest, upto); + else + diff = diff_genseq(upto, lowest); +#endif diff = (diff >> PTHRW_COUNT_SHIFT); return(diff); } @@ -3188,13 +4155,13 @@ find_seq_till(ksyn_wait_queue_t kwq, uint32_t upto, uint32_t nwaiters, uint32_t #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_FSEQTILL | DBG_FUNC_START, 0, 0, upto, nwaiters, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_FSEQTILL | DBG_FUNC_START, 0, 0, upto, nwaiters, 0); #endif /* _PSYNCH_TRACE_ */ for (i= 0; i< KSYN_QUEUE_MAX; i++) { count += ksyn_queue_count_tolowest(&kwq->kw_ksynqueues[i], upto); #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_FSEQTILL | DBG_FUNC_NONE, 0, 1, i, count, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_FSEQTILL | DBG_FUNC_NONE, 0, 1, i, count, 0); #endif /* _PSYNCH_TRACE_ */ if (count >= nwaiters) { break; @@ -3205,9 +4172,11 @@ find_seq_till(ksyn_wait_queue_t kwq, uint32_t upto, uint32_t nwaiters, uint32_t *countp = count; } #if _PSYNCH_TRACE_ - KERNEL_DEBUG_CONSTANT(_PSYNCH_TRACE_FSEQTILL | DBG_FUNC_END, 0, 0, count, nwaiters, 0); + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_FSEQTILL | DBG_FUNC_END, 0, 0, count, nwaiters, 0); #endif /* _PSYNCH_TRACE_ */ - if (count >= nwaiters) + if (count == 0) + return(0); + else if (count >= nwaiters) return(1); else return(0); @@ -3218,7 +4187,7 @@ uint32_t ksyn_queue_count_tolowest(ksyn_queue_t kq, uint32_t upto) { uint32_t i = 0; - uthread_t uth, newuth; + ksyn_waitq_element_t kwe, newkwe; uint32_t curval; /* if nothing or the first num is greater than upto, return none */ @@ -3227,8 +4196,8 @@ ksyn_queue_count_tolowest(ksyn_queue_t kq, uint32_t upto) if (upto == kq->ksynq_firstnum) return(1); - TAILQ_FOREACH_SAFE(uth, &kq->ksynq_uthlist, uu_mtxlist, newuth) { - curval = (uth->uu_lockseq & PTHRW_COUNT_MASK); + TAILQ_FOREACH_SAFE(kwe, &kq->ksynq_kwelist, kwe_list, newkwe) { + curval = (kwe->kwe_lockseq & PTHRW_COUNT_MASK); if (upto == curval) { i++; break; @@ -3242,19 +4211,147 @@ ksyn_queue_count_tolowest(ksyn_queue_t kq, uint32_t upto) return(i); } -/* find the thread and removes from the queue */ -uthread_t -ksyn_queue_find_seq(ksyn_wait_queue_t kwq, ksyn_queue_t kq, uint32_t seq) -{ - uthread_t q_uth, r_uth; - /* case where wrap in the tail of the queue exists */ - TAILQ_FOREACH_SAFE(q_uth, &kq->ksynq_uthlist, uu_mtxlist, r_uth) { - if (q_uth->uu_lockseq == seq) { - ksyn_queue_removeitem(kwq, kq, q_uth); - return(q_uth); - } + +/* handles the cond broadcast of cvar and returns number of woken threads and bits for syscall return */ +void +ksyn_handle_cvbroad(ksyn_wait_queue_t ckwq, uint32_t upto, uint32_t * updatep) +{ + kern_return_t kret; + ksyn_queue_t kq; + ksyn_waitq_element_t kwe, newkwe; + uint32_t updatebits = 0; + struct ksyn_queue kfreeq; + uthread_t ut; + +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVHBROAD | DBG_FUNC_START, 0xcbcbcbc2, upto, 0, 0, 0); +#endif /* _PSYNCH_TRACE_ */ + + ksyn_queue_init(&kfreeq); + kq = &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER]; + + retry: + TAILQ_FOREACH_SAFE(kwe, &kq->ksynq_kwelist, kwe_list, newkwe) { + + if (is_seqhigher((kwe->kwe_lockseq & PTHRW_COUNT_MASK), upto)) /* outside our range */ + break; + + /* now handle the one we found (inside the range) */ + switch (kwe->kwe_flags) { + + case KWE_THREAD_INWAIT: + ut = (uthread_t)kwe->kwe_uth; + + /* skip canceled ones */ + if (kwe->kwe_kwqqueue != ckwq || + (ut->uu_flag & (UT_CANCELDISABLE | UT_CANCEL | UT_CANCELED)) == UT_CANCEL) + break; + + /* wake the rest */ + 
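+ /* (Annotation: each in-range waiter is detached from the queue, has its syscall return bits parked in kwe_psynchretval, and is then woken; the resuming thread consumes those bits on its way back to user space.) */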
ksyn_queue_removeitem(ckwq, kq, kwe); + kwe->kwe_psynchretval = PTH_RWL_MTX_WAIT; + kwe->kwe_kwqqueue = NULL; +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVHBROAD | DBG_FUNC_NONE, (uint32_t)ckwq->kw_addr, 0xcafecaf2, (uint32_t)(thread_tid((struct thread *)(((struct uthread *)(kwe->kwe_uth))->uu_context.vc_thread))), kwe->kwe_psynchretval, 0); +#endif /* _PSYNCH_TRACE_ */ + kret = ksyn_wakeup_thread(ckwq, kwe); +#if __TESTPANICS__ + if ((kret != KERN_SUCCESS) && (kret != KERN_NOT_WAITING)) + panic("ksyn_wakeupreaders: panic waking up readers\n"); +#endif /* __TESTPANICS__ */ + updatebits += PTHRW_INC; + break; + + case KWE_THREAD_BROADCAST: + case KWE_THREAD_PREPOST: + ksyn_queue_removeitem(ckwq, kq, kwe); + TAILQ_INSERT_TAIL(&kfreeq.ksynq_kwelist, kwe, kwe_list); + ckwq->kw_fakecount--; + break; + + default: + panic("unknown kweflags\n"); + break; } - return(NULL); + } + + /* Need to enter a broadcast in the queue (if not already at L == S) */ + + if ((ckwq->kw_lword & PTHRW_COUNT_MASK) != (ckwq->kw_sword & PTHRW_COUNT_MASK)) { + + newkwe = TAILQ_FIRST(&kfreeq.ksynq_kwelist); + if (newkwe == NULL) { + ksyn_wqunlock(ckwq); + newkwe = (ksyn_waitq_element_t)zalloc(kwe_zone); + TAILQ_INSERT_TAIL(&kfreeq.ksynq_kwelist, newkwe, kwe_list); + ksyn_wqlock(ckwq); + goto retry; + } + + TAILQ_REMOVE(&kfreeq.ksynq_kwelist, newkwe, kwe_list); + bzero(newkwe, sizeof(struct ksyn_waitq_element)); + newkwe->kwe_kwqqueue = ckwq; + newkwe->kwe_flags = KWE_THREAD_BROADCAST; + newkwe->kwe_lockseq = upto; + newkwe->kwe_count = 0; + newkwe->kwe_uth = NULL; + newkwe->kwe_psynchretval = 0; + +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVHBROAD | DBG_FUNC_NONE, (uint32_t)ckwq->kw_addr, 0xfeedfeed, upto, 0, 0); +#endif /* _PSYNCH_TRACE_ */ + + (void)ksyn_queue_insert(ckwq, &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER], upto, NULL, newkwe, SEQFIT); + ckwq->kw_fakecount++; + } + + /* free up any remaining things stumbled across above */ + kwe = TAILQ_FIRST(&kfreeq.ksynq_kwelist); + while (kwe != NULL) { + TAILQ_REMOVE(&kfreeq.ksynq_kwelist, kwe, kwe_list); + zfree(kwe_zone, kwe); + kwe = TAILQ_FIRST(&kfreeq.ksynq_kwelist); + } + + if (updatep != NULL) + *updatep = updatebits; + +#if _PSYNCH_TRACE_ + __PTHREAD_TRACE_DEBUG(_PSYNCH_TRACE_CVHBROAD | DBG_FUNC_END, 0xeeeeeeed, updatebits, 0, 0, 0); +#endif /* _PSYNCH_TRACE_ */ } +void +ksyn_cvupdate_fixup(ksyn_wait_queue_t ckwq, uint32_t *updatep, ksyn_queue_t kfreeq, int release) +{ + uint32_t updatebits = 0; + + if (updatep != NULL) + updatebits = *updatep; + if ((ckwq->kw_lword & PTHRW_COUNT_MASK) == (ckwq->kw_sword & PTHRW_COUNT_MASK)) { + updatebits |= PTH_RWS_CV_CBIT; + if (ckwq->kw_inqueue != 0) { + /* FREE THE QUEUE */ + ksyn_queue_move_tofree(ckwq, &ckwq->kw_ksynqueues[KSYN_QUEUE_WRITER], ckwq->kw_lword, kfreeq, 0, release); +#if __TESTPANICS__ + if (ckwq->kw_inqueue != 0) + panic("ksyn_cvupdate_fixup: L == S, but entries in queue beyond S"); +#endif /* __TESTPANICS__ */ + } + ckwq->kw_lword = ckwq->kw_uword = ckwq->kw_sword = 0; + ckwq->kw_kflags |= KSYN_KWF_ZEROEDOUT; + } else if ((ckwq->kw_inqueue != 0) && (ckwq->kw_fakecount == ckwq->kw_inqueue)) { + /* only fake entries are present in the queue */ + updatebits |= PTH_RWS_CV_PBIT; + } + if (updatep != NULL) + *updatep = updatebits; +} + +void +psynch_zoneinit(void) +{ + kwq_zone = (zone_t)zinit(sizeof(struct ksyn_wait_queue), 8192 * sizeof(struct ksyn_wait_queue), 4096, "ksyn_waitqueue zone"); + kwe_zone = (zone_t)zinit(sizeof(struct ksyn_waitq_element), 8192 * sizeof(struct 
ksyn_waitq_element), 4096, "ksyn_waitq_element zone"); +} #endif /* PSYNCH */ diff --git a/bsd/kern/pthread_synch.c b/bsd/kern/pthread_synch.c index 7a00399cc..3fbed7532 100644 --- a/bsd/kern/pthread_synch.c +++ b/bsd/kern/pthread_synch.c @@ -91,6 +91,7 @@ #include <mach/port.h> #include <vm/vm_protos.h> #include <vm/vm_map.h> /* for current_map() */ +#include <vm/vm_fault.h> #include <mach/thread_act.h> /* for thread_resume */ #include <machine/machine_routines.h> #if defined(__i386__) @@ -109,12 +110,6 @@ #define KERNEL_DEBUG1 KERNEL_DEBUG_CONSTANT1 #endif - -#if defined(__ppc__) || defined(__ppc64__) -#include <architecture/ppc/cframe.h> -#endif - - lck_grp_attr_t *pthread_lck_grp_attr; lck_grp_t *pthread_lck_grp; lck_attr_t *pthread_lck_attr; @@ -130,7 +125,6 @@ extern kern_return_t semaphore_signal_internal_trap(mach_port_name_t); extern void workqueue_thread_yielded(void); static int workqueue_additem(struct workqueue *wq, int prio, user_addr_t item, int affinity); -static int workqueue_removeitem(struct workqueue *wq, int prio, user_addr_t item); static boolean_t workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t th, user_addr_t oc_item, int oc_prio, int oc_affinity); static void wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlist *tl, @@ -138,7 +132,7 @@ static void wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlis static void wq_unpark_continue(void); static void wq_unsuspend_continue(void); static int setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct threadlist *tl); -static boolean_t workqueue_addnewthread(struct workqueue *wq); +static boolean_t workqueue_addnewthread(struct workqueue *wq, boolean_t oc_thread); static void workqueue_removethread(struct threadlist *tl); static void workqueue_lock_spin(proc_t); static void workqueue_unlock(proc_t); @@ -215,9 +209,7 @@ bsdthread_create(__unused struct proc *p, struct bsdthread_create_args *uap, us isLP64 = IS_64BIT_PROCESS(p); -#if defined(__ppc__) - stackaddr = 0xF0000000; -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) stackaddr = 0xB0000000; #else #error Need to define a stack address hint for this architecture @@ -266,6 +258,22 @@ bsdthread_create(__unused struct proc *p, struct bsdthread_create_args *uap, us th_stack = (stackaddr + th_stacksize + PTH_DEFAULT_GUARDSIZE); th_pthread = (stackaddr + th_stacksize + PTH_DEFAULT_GUARDSIZE); user_stacksize = th_stacksize; + + /* + * Pre-fault the first page of the new thread's stack and the page that will + * contain the pthread_t structure. + */ + vm_fault( vmap, + vm_map_trunc_page(th_stack - PAGE_SIZE_64), + VM_PROT_READ | VM_PROT_WRITE, + FALSE, + THREAD_UNINT, NULL, 0); + + vm_fault( vmap, + vm_map_trunc_page(th_pthread), + VM_PROT_READ | VM_PROT_WRITE, + FALSE, + THREAD_UNINT, NULL, 0); } else { th_stack = user_stack; user_stacksize = user_stack; @@ -275,31 +283,7 @@ bsdthread_create(__unused struct proc *p, struct bsdthread_create_args *uap, us #endif } -#if defined(__ppc__) - /* - * Set up PowerPC registers... 
- * internally they are always kept as 64 bit and - * since the register set is the same between 32 and 64bit modes - * we don't need 2 different methods for setting the state - */ - { - ppc_thread_state64_t state64; - ppc_thread_state64_t *ts64 = &state64; - - ts64->srr0 = (uint64_t)p->p_threadstart; - ts64->r1 = (uint64_t)(th_stack - C_ARGSAVE_LEN - C_RED_ZONE); - ts64->r3 = (uint64_t)th_pthread; - ts64->r4 = (uint64_t)(th_thport); - ts64->r5 = (uint64_t)user_func; - ts64->r6 = (uint64_t)user_funcarg; - ts64->r7 = (uint64_t)user_stacksize; - ts64->r8 = (uint64_t)uap->flags; - - thread_set_wq_state64(th, (thread_state_t)ts64); - - thread_set_cthreadself(th, (uint64_t)th_pthread, isLP64); - } -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) { /* * Set up i386 registers & function call. @@ -453,26 +437,33 @@ uint32_t wq_stalled_window_usecs = WQ_STALLED_WINDOW_USECS; uint32_t wq_reduce_pool_window_usecs = WQ_REDUCE_POOL_WINDOW_USECS; uint32_t wq_max_timer_interval_usecs = WQ_MAX_TIMER_INTERVAL_USECS; uint32_t wq_max_threads = WORKQUEUE_MAXTHREADS; +uint32_t wq_max_constrained_threads = WORKQUEUE_MAXTHREADS / 8; -SYSCTL_INT(_kern, OID_AUTO, wq_yielded_threshold, CTLFLAG_RW, +SYSCTL_INT(_kern, OID_AUTO, wq_yielded_threshold, CTLFLAG_RW | CTLFLAG_LOCKED, &wq_yielded_threshold, 0, ""); -SYSCTL_INT(_kern, OID_AUTO, wq_yielded_window_usecs, CTLFLAG_RW, +SYSCTL_INT(_kern, OID_AUTO, wq_yielded_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED, &wq_yielded_window_usecs, 0, ""); -SYSCTL_INT(_kern, OID_AUTO, wq_stalled_window_usecs, CTLFLAG_RW, +SYSCTL_INT(_kern, OID_AUTO, wq_stalled_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED, &wq_stalled_window_usecs, 0, ""); -SYSCTL_INT(_kern, OID_AUTO, wq_reduce_pool_window_usecs, CTLFLAG_RW, +SYSCTL_INT(_kern, OID_AUTO, wq_reduce_pool_window_usecs, CTLFLAG_RW | CTLFLAG_LOCKED, &wq_reduce_pool_window_usecs, 0, ""); -SYSCTL_INT(_kern, OID_AUTO, wq_max_timer_interval_usecs, CTLFLAG_RW, +SYSCTL_INT(_kern, OID_AUTO, wq_max_timer_interval_usecs, CTLFLAG_RW | CTLFLAG_LOCKED, &wq_max_timer_interval_usecs, 0, ""); -SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW, +SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW | CTLFLAG_LOCKED, &wq_max_threads, 0, ""); +SYSCTL_INT(_kern, OID_AUTO, wq_max_constrained_threads, CTLFLAG_RW | CTLFLAG_LOCKED, + &wq_max_constrained_threads, 0, ""); + + +static uint32_t wq_init_constrained_limit = 1; + void workqueue_init_lock(proc_t p) @@ -542,11 +533,9 @@ wq_thread_is_busy(uint64_t cur_ts, uint64_t *lastblocked_tsp) */ lastblocked_ts = *lastblocked_tsp; -#if defined(__ppc__) -#else if ( !OSCompareAndSwap64((UInt64)lastblocked_ts, (UInt64)lastblocked_ts, lastblocked_tsp)) return (TRUE); -#endif + if (lastblocked_ts >= cur_ts) { /* * because the update of the timestamp when a thread blocks isn't @@ -682,7 +671,7 @@ again: } } if (add_thread == TRUE) { - retval = workqueue_addnewthread(wq); + retval = workqueue_addnewthread(wq, FALSE); break; } } @@ -774,7 +763,7 @@ workqueue_thread_yielded(void) if (secs == 0 && usecs < wq_yielded_window_usecs) { if (wq->wq_thidlecount == 0) { - workqueue_addnewthread(wq); + workqueue_addnewthread(wq, TRUE); /* * 'workqueue_addnewthread' drops the workqueue lock * when creating the new thread and then retakes it before @@ -876,14 +865,9 @@ workqueue_callback(int type, thread_t thread) * since another thread would have to get scheduled and then block after we start down * this path), it's not a problem. 
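+ * (The OSCompareAndSwap64() below appears to be used purely as an atomic 64-bit store of curtime rather than as a true compare-and-swap; a lost race is tolerated for the reasons given here.)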
Either timestamp is adequate, so no need to retry */ -#if defined(__ppc__) - /* - * this doesn't have to actually work reliablly for PPC, it just has to compile/link - */ - *lastblocked_ptr = (UInt64)curtime; -#else + OSCompareAndSwap64(*lastblocked_ptr, (UInt64)curtime, lastblocked_ptr); -#endif + if (wq->wq_itemcount) WQ_TIMER_NEEDED(wq, start_timer); @@ -963,9 +947,13 @@ workqueue_removethread(struct threadlist *tl) } - +/* + * called with workq lock held + * dropped and retaken around thread creation + * return with workq lock held + */ static boolean_t -workqueue_addnewthread(struct workqueue *wq) +workqueue_addnewthread(struct workqueue *wq, boolean_t oc_thread) { struct threadlist *tl; struct uthread *uth; @@ -975,8 +963,25 @@ workqueue_addnewthread(struct workqueue *wq) void *sright; mach_vm_offset_t stackaddr; - if (wq->wq_nthreads >= wq_max_threads || wq->wq_nthreads >= (CONFIG_THREAD_MAX - 20)) + if (wq->wq_nthreads >= wq_max_threads || wq->wq_nthreads >= (CONFIG_THREAD_MAX - 20)) { + wq->wq_lflags |= WQL_EXCEEDED_TOTAL_THREAD_LIMIT; return (FALSE); + } + wq->wq_lflags &= ~WQL_EXCEEDED_TOTAL_THREAD_LIMIT; + + if (oc_thread == FALSE && wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) { + /* + * if we're not creating this thread to service an overcommit request, + * then check the size of the constrained thread pool... if we've already + * reached our max for threads scheduled from this pool, don't create a new + * one... the callers of this function are prepared for failure. + */ + wq->wq_lflags |= WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT; + return (FALSE); + } + if (wq->wq_constrained_threads_scheduled < wq_max_constrained_threads) + wq->wq_lflags &= ~WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT; + wq->wq_nthreads++; p = wq->wq_proc; @@ -990,9 +995,7 @@ workqueue_addnewthread(struct workqueue *wq) tl = kalloc(sizeof(struct threadlist)); bzero(tl, sizeof(struct threadlist)); -#if defined(__ppc__) - stackaddr = 0xF0000000; -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) stackaddr = 0xB0000000; #else #error Need to define a stack address hint for this architecture @@ -1023,6 +1026,7 @@ workqueue_addnewthread(struct workqueue *wq) } if (kret != KERN_SUCCESS) { (void) thread_terminate(th); + thread_deallocate(th); kfree(tl, sizeof(struct threadlist)); goto failed; @@ -1043,11 +1047,6 @@ workqueue_addnewthread(struct workqueue *wq) tl->th_priority = WORKQUEUE_NUMPRIOS; tl->th_policy = -1; -#if defined(__ppc__) - //ml_fp_setvalid(FALSE); - thread_set_cthreadself(th, (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE), IS_64BIT_PROCESS(p)); -#endif /* __ppc__ */ - uth = get_bsdthread_info(tl->th_thread); uth->uu_threadlist = (void *)tl; @@ -1087,6 +1086,22 @@ workq_open(struct proc *p, __unused struct workq_open_args *uap, __unused int32 if ((p->p_lflag & P_LREGISTER) == 0) return(EINVAL); + num_cpus = ml_get_max_cpus(); + + if (wq_init_constrained_limit) { + uint32_t limit; + /* + * set up the limit for the constrained pool + * this is a virtual pool in that we don't + * maintain it on a separate idle and run list + */ + limit = num_cpus * (WORKQUEUE_NUMPRIOS + 1); + + if (limit > wq_max_constrained_threads) + wq_max_constrained_threads = limit; + + wq_init_constrained_limit = 0; + } workqueue_lock_spin(p); if (p->p_wqptr == NULL) { @@ -1107,8 +1122,6 @@ workq_open(struct proc *p, __unused struct workq_open_args *uap, __unused int32 workqueue_unlock(p); - num_cpus = ml_get_max_cpus(); - wq_size = 
sizeof(struct workqueue) + (num_cpus * WORKQUEUE_NUMPRIOS * sizeof(uint32_t)) + (num_cpus * WORKQUEUE_NUMPRIOS * sizeof(uint32_t)) + @@ -1153,7 +1166,7 @@ workq_open(struct proc *p, __unused struct workq_open_args *uap, __unused int32 * the size for the allocation of the workqueue struct */ nptr += (sizeof(uint64_t) - 1); - nptr = (char *)((long)nptr & ~(sizeof(uint64_t) - 1)); + nptr = (char *)((uintptr_t)nptr & ~(sizeof(uint64_t) - 1)); for (i = 0; i < WORKQUEUE_NUMPRIOS; i++) { wq->wq_lastblocked_ts[i] = (uint64_t *)nptr; @@ -1217,9 +1230,9 @@ workq_kernreturn(struct proc *p, struct workq_kernreturn_args *uap, __unused in workqueue_unlock(p); return (EINVAL); } - if (wq->wq_thidlecount == 0 && (oc_item || (wq->wq_nthreads < wq->wq_affinity_max))) { + if (wq->wq_thidlecount == 0 && (oc_item || (wq->wq_constrained_threads_scheduled < wq->wq_affinity_max))) { - workqueue_addnewthread(wq); + workqueue_addnewthread(wq, oc_item ? TRUE : FALSE); if (wq->wq_thidlecount == 0) oc_item = 0; @@ -1230,20 +1243,6 @@ workq_kernreturn(struct proc *p, struct workq_kernreturn_args *uap, __unused in KERNEL_DEBUG(0xefffd008 | DBG_FUNC_NONE, wq, prio, affinity, oc_item, 0); } break; - case WQOPS_QUEUE_REMOVE: { - - if ((prio < 0) || (prio >= WORKQUEUE_NUMPRIOS)) - return (EINVAL); - - workqueue_lock_spin(p); - - if ((wq = (struct workqueue *)p->p_wqptr) == NULL) { - workqueue_unlock(p); - return (EINVAL); - } - error = workqueue_removeitem(wq, prio, item); - } - break; case WQOPS_THREAD_RETURN: { th = current_thread(); @@ -1423,42 +1422,16 @@ workqueue_additem(struct workqueue *wq, int prio, user_addr_t item, int affinity return (0); } -static int -workqueue_removeitem(struct workqueue *wq, int prio, user_addr_t item) -{ - struct workitem *witem; - struct workitemlist *wl; - int error = ESRCH; - - wl = (struct workitemlist *)&wq->wq_list[prio]; - - TAILQ_FOREACH(witem, &wl->wl_itemlist, wi_entry) { - if (witem->wi_item == item) { - TAILQ_REMOVE(&wl->wl_itemlist, witem, wi_entry); - - if (TAILQ_EMPTY(&wl->wl_itemlist)) - wq->wq_list_bitmap &= ~(1 << prio); - wq->wq_itemcount--; - - witem->wi_item = (user_addr_t)0; - witem->wi_affinity = 0; - TAILQ_INSERT_HEAD(&wl->wl_freelist, witem, wi_entry); - - error = 0; - break; - } - } - return (error); -} - static int workqueue_importance[WORKQUEUE_NUMPRIOS] = { - 2, 0, -2, + 2, 0, -2, INT_MIN, }; +#define WORKQ_POLICY_TIMESHARE 1 + static int workqueue_policy[WORKQUEUE_NUMPRIOS] = { - 1, 1, 1, + WORKQ_POLICY_TIMESHARE, WORKQ_POLICY_TIMESHARE, WORKQ_POLICY_TIMESHARE, WORKQ_POLICY_TIMESHARE }; @@ -1536,10 +1509,20 @@ workqueue_run_nextitem(proc_t p, struct workqueue *wq, thread_t thread, user_add } goto grab_idle_thread; } - if (wq->wq_itemcount == 0) { + /* + * if we get here, the work should be handled by a constrained thread + */ + if (wq->wq_itemcount == 0 || wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) { + /* + * no work to do, or we're already at or over the scheduling limit for + * constrained threads... just return or park the thread... + * do not start the timer for this condition... if we don't have any work, + * we'll check again when new work arrives... 
if we're over the limit, we need 1 or more + * constrained threads to return to the kernel before we can dispatch work from our queue + */ if ((th_to_park = thread) == THREAD_NULL) goto out_of_work; - goto parkit; + goto parkit; } for (priority = 0; priority < WORKQUEUE_NUMPRIOS; priority++) { if (wq->wq_list_bitmap & (1 << priority)) { @@ -1727,6 +1710,16 @@ pick_up_work: witem->wi_item = (user_addr_t)0; witem->wi_affinity = 0; TAILQ_INSERT_HEAD(&wl->wl_freelist, witem, wi_entry); + + if ( !(tl->th_flags & TH_LIST_CONSTRAINED)) { + wq->wq_constrained_threads_scheduled++; + tl->th_flags |= TH_LIST_CONSTRAINED; + } + } else { + if (tl->th_flags & TH_LIST_CONSTRAINED) { + wq->wq_constrained_threads_scheduled--; + tl->th_flags &= ~TH_LIST_CONSTRAINED; + } } orig_priority = tl->th_priority; orig_affinity_tag = tl->th_affinity_tag; @@ -1775,16 +1768,47 @@ pick_up_work: KERNEL_DEBUG(0xefffd120 | DBG_FUNC_START, wq, orig_priority, tl->th_policy, 0, 0); - if (tl->th_policy != policy) { + if ((orig_priority == WORKQUEUE_BG_PRIOQUEUE) || (priority == WORKQUEUE_BG_PRIOQUEUE)) { + struct uthread *ut = NULL; + + ut = get_bsdthread_info(th_to_run); + if (orig_priority == WORKQUEUE_BG_PRIOQUEUE) { + /* remove the disk throttle, importance will be reset in any case */ +#if !CONFIG_EMBEDDED + proc_restore_workq_bgthreadpolicy(th_to_run); +#else /* !CONFIG_EMBEDDED */ + if ((ut->uu_flag & UT_BACKGROUND) != 0) { + ut->uu_flag &= ~UT_BACKGROUND; + ut->uu_iopol_disk = IOPOL_NORMAL; + } +#endif /* !CONFIG_EMBEDDED */ + } + + if (priority == WORKQUEUE_BG_PRIOQUEUE) { +#if !CONFIG_EMBEDDED + proc_apply_workq_bgthreadpolicy(th_to_run); +#else /* !CONFIG_EMBEDDED */ + if ((ut->uu_flag & UT_BACKGROUND) == 0) { + /* set diskthrottling */ + ut->uu_flag |= UT_BACKGROUND; + ut->uu_iopol_disk = IOPOL_THROTTLE; + } +#endif /* !CONFIG_EMBEDDED */ + } + } + + if (tl->th_policy != policy) { extinfo.timeshare = policy; (void)thread_policy_set_internal(th_to_run, THREAD_EXTENDED_POLICY, (thread_policy_t)&extinfo, THREAD_EXTENDED_POLICY_COUNT); tl->th_policy = policy; } + precedinfo.importance = workqueue_importance[priority]; (void)thread_policy_set_internal(th_to_run, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT); + KERNEL_DEBUG(0xefffd120 | DBG_FUNC_END, wq, priority, policy, 0, 0); } if (kdebug_enable) { @@ -1858,12 +1882,18 @@ parkit: wq->wq_thscheduled_count[tl->th_priority][tl->th_affinity_tag]--; wq->wq_threads_scheduled--; + if (tl->th_flags & TH_LIST_CONSTRAINED) { + wq->wq_constrained_threads_scheduled--; + wq->wq_lflags &= ~WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT; + tl->th_flags &= ~TH_LIST_CONSTRAINED; + } if (wq->wq_thidlecount < 100) us_to_wait = wq_reduce_pool_window_usecs - (wq->wq_thidlecount * (wq_reduce_pool_window_usecs / 100)); else us_to_wait = wq_reduce_pool_window_usecs / 100; wq->wq_thidlecount++; + wq->wq_lflags &= ~WQL_EXCEEDED_TOTAL_THREAD_LIMIT; assert_wait_timeout((caddr_t)tl, (THREAD_INTERRUPTIBLE), us_to_wait, NSEC_PER_USEC); @@ -2080,34 +2110,11 @@ wq_runitem(proc_t p, user_addr_t item, thread_t th, struct threadlist *tl, } } + int setup_wqthread(proc_t p, thread_t th, user_addr_t item, int reuse_thread, struct threadlist *tl) { -#if defined(__ppc__) - /* - * Set up PowerPC registers... 
- * internally they are always kept as 64 bit and - * since the register set is the same between 32 and 64bit modes - * we don't need 2 different methods for setting the state - */ - { - ppc_thread_state64_t state64; - ppc_thread_state64_t *ts64 = &state64; - - ts64->srr0 = (uint64_t)p->p_wqthread; - ts64->r1 = (uint64_t)((tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE) - C_ARGSAVE_LEN - C_RED_ZONE); - ts64->r3 = (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_STACKSIZE + PTH_DEFAULT_GUARDSIZE); - ts64->r4 = (uint64_t)(tl->th_thport); - ts64->r5 = (uint64_t)(tl->th_stackaddr + PTH_DEFAULT_GUARDSIZE); - ts64->r6 = (uint64_t)item; - ts64->r7 = (uint64_t)reuse_thread; - ts64->r8 = (uint64_t)0; - - if ((reuse_thread != 0) && (ts64->r3 == (uint64_t)0)) - panic("setup_wqthread: setting reuse thread with null pthread\n"); - thread_set_wq_state64(th, (thread_state_t)ts64); - } -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) int isLP64 = 0; isLP64 = IS_64BIT_PROCESS(p); @@ -2183,6 +2190,14 @@ fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo) pwqinfo->pwq_nthreads = wq->wq_nthreads; pwqinfo->pwq_runthreads = activecount; pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount; + pwqinfo->pwq_state = 0; + + if (wq->wq_lflags & WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT) + pwqinfo->pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT; + + if (wq->wq_lflags & WQL_EXCEEDED_TOTAL_THREAD_LIMIT) + pwqinfo->pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT; + out: workqueue_unlock(p); return(error); @@ -2308,5 +2323,6 @@ pthread_init(void) pth_global_hashinit(); psynch_thcall = thread_call_allocate(psynch_wq_cleanup, NULL); + psynch_zoneinit(); #endif /* PSYNCH */ } diff --git a/bsd/kern/subr_log.c b/bsd/kern/subr_log.c index d39eccd5d..2cd5c3ac2 100644 --- a/bsd/kern/subr_log.c +++ b/bsd/kern/subr_log.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -81,6 +81,7 @@ #include <sys/conf.h> #include <sys/sysctl.h> #include <kern/kalloc.h> +#include <pexpert/pexpert.h> /* XXX should be in a common header somewhere */ extern void klogwakeup(void); @@ -92,6 +93,9 @@ extern void logwakeup(void); #define LOG_ASYNC 0x04 #define LOG_RDWAIT 0x08 +/* All globals should be accessed under LOG_LOCK() */ + +/* logsoftc only valid while log_open=1 */ struct logsoftc { int sc_state; /* see above for possibilities */ struct selinfo sc_selp; /* thread waiting for select */ @@ -99,11 +103,11 @@ struct logsoftc { } logsoftc; int log_open; /* also used in log() */ -char smsg_bufc[MSG_BSIZE]; /* static buffer */ -struct msgbuf temp_msgbuf = {0,MSG_BSIZE,0,0,smsg_bufc}; -struct msgbuf *msgbufp; -static int _logentrypend = 0; -static int log_inited = 0; +char smsg_bufc[CONFIG_MSG_BSIZE]; /* static buffer */ +struct msgbuf msgbuf = {MSG_MAGIC,sizeof(smsg_bufc),0,0,smsg_bufc}; +struct msgbuf *msgbufp = &msgbuf; +static int logentrypend = 0; + /* the following are implemented in osfmk/kern/printf.c */ extern void bsd_log_lock(void); extern void bsd_log_unlock(void); @@ -125,6 +129,16 @@ extern d_select_t logselect; #define LOG_LOCK() bsd_log_lock() #define LOG_UNLOCK() bsd_log_unlock() +#if DEBUG +#define LOG_SETSIZE_DEBUG(x...) kprintf(x) +#else +#define LOG_SETSIZE_DEBUG(x...) 
do { } while(0) +#endif + +static int sysctl_kern_msgbuf(struct sysctl_oid *oidp, + void *arg1, + int arg2, + struct sysctl_req *req); /*ARGSUSED*/ int @@ -135,21 +149,9 @@ logopen(__unused dev_t dev, __unused int flags, __unused int mode, struct proc * LOG_UNLOCK(); return (EBUSY); } - log_open = 1; logsoftc.sc_pgid = p->p_pid; /* signal process only */ - /* - * Potential race here with putchar() but since putchar should be - * called by autoconf, msg_magic should be initialized by the time - * we get here. - */ - if (msgbufp->msg_magic != MSG_MAGIC) { - register int i; + log_open = 1; - msgbufp->msg_magic = MSG_MAGIC; - msgbufp->msg_bufx = msgbufp->msg_bufr = 0; - for (i=0; i < MSG_BSIZE; i++) - msgbufp->msg_bufc[i] = 0; - } LOG_UNLOCK(); return (0); @@ -160,9 +162,9 @@ int logclose(__unused dev_t dev, __unused int flag, __unused int devtype, __unused struct proc *p) { LOG_LOCK(); - log_open = 0; selwakeup(&logsoftc.sc_selp); selthreadclear(&logsoftc.sc_selp); + log_open = 0; LOG_UNLOCK(); return (0); } @@ -171,7 +173,7 @@ logclose(__unused dev_t dev, __unused int flag, __unused int devtype, __unused s int logread(__unused dev_t dev, struct uio *uio, int flag) { - register long l; + int l; int error = 0; LOG_LOCK(); @@ -202,20 +204,24 @@ logread(__unused dev_t dev, struct uio *uio, int flag) logsoftc.sc_state &= ~LOG_RDWAIT; while (uio_resid(uio) > 0) { + int readpos; + l = msgbufp->msg_bufx - msgbufp->msg_bufr; if (l < 0) l = msgbufp->msg_size - msgbufp->msg_bufr; l = min(l, uio_resid(uio)); if (l == 0) break; + + readpos = msgbufp->msg_bufr; LOG_UNLOCK(); - error = uiomove((caddr_t)&msgbufp->msg_bufc[msgbufp->msg_bufr], - (int)l, uio); + error = uiomove((caddr_t)&msgbufp->msg_bufc[readpos], + l, uio); LOG_LOCK(); if (error) break; - msgbufp->msg_bufr += l; - if (msgbufp->msg_bufr < 0 || msgbufp->msg_bufr >= msgbufp->msg_size) + msgbufp->msg_bufr = readpos + l; + if (msgbufp->msg_bufr >= msgbufp->msg_size) msgbufp->msg_bufr = 0; } out: @@ -272,9 +278,13 @@ logwakeup(void) void klogwakeup(void) { - if (_logentrypend) { - _logentrypend = 0; + LOG_LOCK(); + if (logentrypend && log_open) { + logentrypend = 0; /* only reset if someone will be reading */ + LOG_UNLOCK(); logwakeup(); + } else { + LOG_UNLOCK(); } } @@ -282,7 +292,7 @@ klogwakeup(void) int logioctl(__unused dev_t dev, u_long com, caddr_t data, __unused int flag, __unused struct proc *p) { - long l; + int l; LOG_LOCK(); switch (com) { @@ -328,10 +338,7 @@ logioctl(__unused dev_t dev, u_long com, caddr_t data, __unused int flag, __unus void bsd_log_init(void) { - if (!log_inited) { - msgbufp = &temp_msgbuf; - log_inited = 1; - } + /* After this point, we must be ready to accept characters */ } @@ -353,24 +360,12 @@ bsd_log_init(void) void log_putc_locked(char c) { - register struct msgbuf *mbp; - - if (!log_inited) { - panic("bsd log is not inited"); - } + struct msgbuf *mbp; mbp = msgbufp; - if (mbp-> msg_magic != MSG_MAGIC) { - register int i; - - mbp->msg_magic = MSG_MAGIC; - mbp->msg_bufx = mbp->msg_bufr = 0; - for (i=0; i < MSG_BSIZE; i++) - mbp->msg_bufc[i] = 0; - } mbp->msg_bufc[mbp->msg_bufx++] = c; - _logentrypend = 1; - if (mbp->msg_bufx < 0 || mbp->msg_bufx >= msgbufp->msg_size) + logentrypend = 1; + if (mbp->msg_bufx >= msgbufp->msg_size) mbp->msg_bufx = 0; } @@ -391,9 +386,6 @@ log_putc_locked(char c) void log_putc(char c) { - if (!log_inited) { - panic("bsd log is not inited"); - } LOG_LOCK(); log_putc_locked(c); LOG_UNLOCK(); @@ -406,59 +398,143 @@ log_putc(char c) * to the kernel command line, and to read the 
current size using * sysctl kern.msgbuf * If there is no parameter on the kernel command line, the buffer is - * allocated statically and is MSG_BSIZE characters in size, otherwise - * memory is dynamically allocated. - * This function may only be called once, during kernel initialization. - * Memory management must already be up. The buffer must not have - * overflown yet. + * allocated statically and is CONFIG_MSG_BSIZE characters in size, otherwise + * memory is dynamically allocated. Memory management must already be up. */ -void -log_setsize(long size) { +int +log_setsize(int size) { char *new_logdata; - if (msgbufp->msg_size!=MSG_BSIZE) { - printf("log_setsize: attempt to change size more than once\n"); - return; - } - if (size==MSG_BSIZE) - return; - if (size<MSG_BSIZE) { /* we don't support reducing the log size */ - printf("log_setsize: can't decrease log size\n"); - return; - } + int new_logsize, new_bufr, new_bufx; + char *old_logdata; + int old_logsize, old_bufr, old_bufx; + int i, count; + char *p, ch; + + if (size > MAX_MSG_BSIZE) + return (EINVAL); + + if (size <= 0) + return (EINVAL); + + new_logsize = size; if (!(new_logdata = (char*)kalloc(size))) { printf("log_setsize: unable to allocate memory\n"); - return; + return (ENOMEM); } + bzero(new_logdata, new_logsize); + LOG_LOCK(); - bcopy(smsg_bufc, new_logdata, MSG_BSIZE); - bzero(new_logdata+MSG_BSIZE, size - MSG_BSIZE); + + old_logsize = msgbufp->msg_size; + old_logdata = msgbufp->msg_bufc; + old_bufr = msgbufp->msg_bufr; + old_bufx = msgbufp->msg_bufx; + + LOG_SETSIZE_DEBUG("log_setsize(%d): old_logdata %p old_logsize %d old_bufr %d old_bufx %d\n", + size, old_logdata, old_logsize, old_bufr, old_bufx); + + /* start "new_logsize" bytes before the write pointer */ + if (new_logsize <= old_bufx) { + count = new_logsize; + p = old_logdata + old_bufx - count; + } else { + /* + * if new buffer is bigger, copy what we have and let the + * bzero above handle the difference + */ + count = MIN(new_logsize, old_logsize); + p = old_logdata + old_logsize - (count - old_bufx); + } + for (i = 0; i < count; i++) { + if (p >= old_logdata + old_logsize) + p = old_logdata; + + ch = *p++; + new_logdata[i] = ch; + } + + new_bufx = i; + if (new_bufx >= new_logsize) + new_bufx = 0; + msgbufp->msg_bufx = new_bufx; + + new_bufr = old_bufx - old_bufr; /* how much were we trailing bufx by? */ + if (new_bufr < 0) + new_bufr += old_logsize; + new_bufr = new_bufx - new_bufr; /* now relative to oldest data in new buffer */ + if (new_bufr < 0) + new_bufr += new_logsize; + msgbufp->msg_bufr = new_bufr; + + msgbufp->msg_size = new_logsize; + msgbufp->msg_bufc = new_logdata; + + LOG_SETSIZE_DEBUG("log_setsize(%d): new_logdata %p new_logsize %d new_bufr %d new_bufx %d\n", + size, new_logdata, new_logsize, new_bufr, new_bufx); + + LOG_UNLOCK(); + /* this memory is now dead - clear it so that it compresses better in case of suspend to disk etc. 
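+ * (Note: the static seed buffer, smsg_bufc, is never kfree()d; the comparison below distinguishes it from buffers allocated by earlier resizes.)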
*/ - bzero(smsg_bufc, MSG_BSIZE); - msgbufp->msg_size = size; - msgbufp->msg_bufc = new_logdata; + bzero(old_logdata, old_logsize); + if (old_logdata != smsg_bufc) { + /* dynamic memory that must be freed */ + kfree(old_logdata, old_logsize); + } + + printf("set system log size to %d bytes\n", new_logsize); + + return 0; +} + +SYSCTL_PROC(_kern, OID_AUTO, msgbuf, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_kern_msgbuf, "I", ""); + +static int sysctl_kern_msgbuf(struct sysctl_oid *oidp __unused, + void *arg1 __unused, + int arg2 __unused, + struct sysctl_req *req) +{ + int old_bufsize, bufsize; + int error; + + LOG_LOCK(); + old_bufsize = bufsize = msgbufp->msg_size; LOG_UNLOCK(); - printf("set system log size to %ld bytes\n", msgbufp->msg_size); + + error = sysctl_io_number(req, bufsize, sizeof(bufsize), &bufsize, NULL); + if (error) + return (error); + + if (bufsize != old_bufsize) { + error = log_setsize(bufsize); + } + + return (error); } -SYSCTL_LONG(_kern, OID_AUTO, msgbuf, CTLFLAG_RD, &temp_msgbuf.msg_size, ""); /* - * This should be called by single user mode /sbin/dmesg only. + * This should be called by /sbin/dmesg only via libproc. * It returns as much data still in the buffer as possible. */ int log_dmesg(user_addr_t buffer, uint32_t buffersize, int32_t * retval) { uint32_t i; - uint32_t localbuff_size = (msgbufp->msg_size + 2); + uint32_t localbuff_size; int error = 0, newl, skip; char *localbuff, *p, *copystart, ch; - long copysize; + size_t copysize; + LOG_LOCK(); + localbuff_size = (msgbufp->msg_size + 2); /* + '\n' + '\0' */ + LOG_UNLOCK(); + + /* Allocate a temporary non-circular buffer for copyout */ if (!(localbuff = (char *)kalloc(localbuff_size))) { printf("log_dmesg: unable to allocate memory\n"); return (ENOMEM); } + /* in between here, the log could become bigger, but that's fine */ LOG_LOCK(); @@ -483,7 +559,7 @@ log_dmesg(user_addr_t buffer, uint32_t buffersize, int32_t * retval) { } if (ch == '\0') continue; - newl = ch == '\n'; + newl = (ch == '\n'); localbuff[i++] = ch; /* The original version of this routine contained a buffer * overflow. At the time, a "small" targeted fix was desired diff --git a/bsd/kern/subr_prof.c b/bsd/kern/subr_prof.c index 5b1024141..4d07853d9 100644 --- a/bsd/kern/subr_prof.c +++ b/bsd/kern/subr_prof.c @@ -152,14 +152,28 @@ kmstartup(void) } /* - * Return kernel profiling information. + * XXX These should be broken out into per-argument OID values, + * XXX since there are no sub-OID parameter values, but unfortunately + * XXX there is barely enough time for an initial conversion. + * + * Note: These items appear to be read/write. 
*/ -int +STATIC int +sysctl_doprofhandle SYSCTL_HANDLER_ARGS +{ sysctl_doprof(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, user_addr_t newp, size_t newlen) { + __unused int cmd = oidp->oid_arg2; /* subcommand*/ + int *name = arg1; /* oid element argument vector */ + int namelen = arg2; /* number of oid element arguments */ + user_addr_t oldp = req->oldptr; /* user buffer copy out address */ + size_t *oldlenp = req->oldlen; /* user buffer copy out size */ + user_addr_t newp = req->newptr; /* user buffer copy in address */ + size_t newlen = req->newlen; /* user buffer copy in size */ + struct gmonparam *gp = &_gmonparam; - int error; + int error = 0; /* all sysctl names at this level are terminal */ if (namelen != 1) @@ -169,28 +183,44 @@ sysctl_doprof(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, case GPROF_STATE: error = sysctl_int(oldp, oldlenp, newp, newlen, &gp->state); if (error) - return (error); + break; if (gp->state == GMON_PROF_OFF) stopprofclock(kernproc); else startprofclock(kernproc); - return (0); + break; case GPROF_COUNT: - return (sysctl_struct(oldp, oldlenp, newp, newlen, - gp->kcount, gp->kcountsize)); + error = sysctl_struct(oldp, oldlenp, newp, newlen, + gp->kcount, gp->kcountsize); + break; case GPROF_FROMS: - return (sysctl_struct(oldp, oldlenp, newp, newlen, - gp->froms, gp->fromssize)); + error = sysctl_struct(oldp, oldlenp, newp, newlen, + gp->froms, gp->fromssize); + break; case GPROF_TOS: - return (sysctl_struct(oldp, oldlenp, newp, newlen, - gp->tos, gp->tossize)); + error = sysctl_struct(oldp, oldlenp, newp, newlen, + gp->tos, gp->tossize); + break; case GPROF_GMONPARAM: - return (sysctl_rdstruct(oldp, oldlenp, newp, gp, sizeof *gp)); + error = sysctl_rdstruct(oldp, oldlenp, newp, gp, sizeof *gp); + break; default: - return (ENOTSUP); + error = ENOTSUP; + break; } - /* NOTREACHED */ + + /* adjust index so we return the right required/consumed amount */ + if (!error) + req->oldidx += req->oldlen; + + return(error); } +SYSCTL_PROC(_kern, KERN_PROF, prof, STLFLAG_NODE|CTLFLAG_RW | CTLFLAG_LOCKED, + 0, /* Pointer argument (arg1) */ + 0, /* Integer argument (arg2) */ + sysctl_doprofhandle, /* Handler function */ + NULL, /* No explicit data */ + ""); /* diff --git a/bsd/kern/sys_generic.c b/bsd/kern/sys_generic.c index 11a276bbd..bacd02b79 100644 --- a/bsd/kern/sys_generic.c +++ b/bsd/kern/sys_generic.c @@ -152,6 +152,21 @@ __private_extern__ int dofilewrite(vfs_context_t ctx, struct fileproc *fp, __private_extern__ int preparefileread(struct proc *p, struct fileproc **fp_ret, int fd, int check_for_vnode); __private_extern__ void donefileread(struct proc *p, struct fileproc *fp_ret, int fd); + +/* Conflict wait queue for when selects collide (opaque type) */ +struct wait_queue select_conflict_queue; + +/* + * Init routine called from bsd_init.c + */ +void select_wait_queue_init(void); +void +select_wait_queue_init(void) +{ + wait_queue_init(&select_conflict_queue, SYNC_POLICY_FIFO); +} + + #if NETAT extern int appletalk_inited; #endif /* NETAT */ @@ -570,7 +585,8 @@ dofilewrite(vfs_context_t ctx, struct fileproc *fp, error == EINTR || error == EWOULDBLOCK)) error = 0; /* The socket layer handles SIGPIPE */ - if (error == EPIPE && fp->f_type != DTYPE_SOCKET) { + if (error == EPIPE && fp->f_type != DTYPE_SOCKET && + (fp->f_fglob->fg_lflags & FG_NOSIGPIPE) == 0) { /* XXX Raise the signal on the thread? 
*/ psignal(vfs_context_proc(ctx), SIGPIPE); } @@ -662,13 +678,14 @@ wr_uio(struct proc *p, int fdes, uio_t uio, user_ssize_t *retval) error == EINTR || error == EWOULDBLOCK)) error = 0; /* The socket layer handles SIGPIPE */ - if (error == EPIPE && fp->f_type != DTYPE_SOCKET) + if (error == EPIPE && fp->f_type != DTYPE_SOCKET && + (fp->f_fglob->fg_lflags & FG_NOSIGPIPE) == 0) psignal(p, SIGPIPE); } *retval = count - uio_resid(uio); out: - if ( (error == 0) ) + if (error == 0) fp_drop_written(p, fdes, fp); else fp_drop(p, fdes, fp, 0); @@ -937,8 +954,8 @@ extern int selcontinue(int error); extern int selprocess(int error, int sel_pass); static int selscan(struct proc *p, struct _select * sel, int nfd, int32_t *retval, int sel_pass, wait_queue_sub_t wqsub); -static int selcount(struct proc *p, u_int32_t *ibits, u_int32_t *obits, - int nfd, int * count, int *kfcount); +static int selcount(struct proc *p, u_int32_t *ibits, int nfd, int *count); +static int seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup, int fromselcount); static int seldrop(struct proc *p, u_int32_t *ibits, int nfd); /* @@ -966,7 +983,6 @@ select_nocancel(struct proc *p, struct select_nocancel_args *uap, int32_t *retva struct _select *sel; int needzerofill = 1; int count = 0; - int kfcount = 0; th_act = current_thread(); uth = get_bsdthread_info(th_act); @@ -1070,13 +1086,11 @@ select_nocancel(struct proc *p, struct select_nocancel_args *uap, int32_t *retva else sel->abstime = 0; - sel->kfcount = 0; - if ( (error = selcount(p, sel->ibits, sel->obits, uap->nd, &count, &kfcount)) ) { + if ( (error = selcount(p, sel->ibits, uap->nd, &count)) ) { goto continuation; } sel->count = count; - sel->kfcount = kfcount; size = SIZEOF_WAITQUEUE_SET + (count * SIZEOF_WAITQUEUE_LINK); if (uth->uu_allocsize) { if (uth->uu_wqset == 0) @@ -1090,7 +1104,6 @@ select_nocancel(struct proc *p, struct select_nocancel_args *uap, int32_t *retva panic("failed to allocate memory for waitqueue\n"); } } else { - sel->count = count; uth->uu_allocsize = size; uth->uu_wqset = (wait_queue_set_t)kalloc(uth->uu_allocsize); if (uth->uu_wqset == (wait_queue_set_t)NULL) @@ -1101,7 +1114,18 @@ select_nocancel(struct proc *p, struct select_nocancel_args *uap, int32_t *retva wait_queue_set_init(uth->uu_wqset, (SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST)); continuation: - return selprocess(error, SEL_FIRSTPASS); + + if (error) { + /* + * We have already cleaned up any state we established, + * either locally or as a result of selcount(). We don't + * need to wait_subqueue_unlink_all(), since we haven't set + * anything at this point. + */ + return (error); + } + + return selprocess(0, SEL_FIRSTPASS); } int @@ -1110,6 +1134,13 @@ selcontinue(int error) return selprocess(error, SEL_SECONDPASS); } + +/* + * selprocess + * + * Parameters: error The error code from our caller + * sel_pass The pass we are on + */ int selprocess(int error, int sel_pass) { @@ -1134,20 +1165,24 @@ selprocess(int error, int sel_pass) uth = get_bsdthread_info(th_act); sel = &uth->uu_select; - /* if it is first pass wait queue is not setup yet */ if ((error != 0) && (sel_pass == SEL_FIRSTPASS)) unwind = 0; if (sel->count == 0) unwind = 0; retry: if (error != 0) { - goto done; + sel_pass = SEL_FIRSTPASS; /* Reset for seldrop */ + goto done; } ncoll = nselcoll; OSBitOrAtomic(P_SELECT, &p->p_flag); /* skip scans if the select is just for timeouts */ if (sel->count) { + /* + * Clear out any dangling refs from prior calls; technically + * there should not be any. 
+ */ if (sel_pass == SEL_FIRSTPASS) wait_queue_sub_clearrefs(uth->uu_wqset); @@ -1215,10 +1250,10 @@ retry: error = 0; } - sel_pass = SEL_SECONDPASS; if (error == 0) { + sel_pass = SEL_SECONDPASS; if (!prepost) - somewakeup =1; + somewakeup = 1; goto retry; } done: @@ -1253,6 +1288,23 @@ done: return(error); } + +/* + * selscan + * + * Parameters: p Process performing the select + * sel The per-thread select context structure + * nfd The number of file descriptors to scan + * retval The per thread system call return area + * sel_pass Which pass this is; allowed values are + * SEL_FIRSTPASS and SEL_SECONDPASS + * wqsub The per thread wait queue set + * + * Returns: 0 Success + * EIO Invalid p->p_fd field XXX Obsolete? + * EBADF One of the files in the bit vector is + * invalid. + */ static int selscan(struct proc *p, struct _select *sel, int nfd, int32_t *retval, int sel_pass, wait_queue_sub_t wqsub) @@ -1261,16 +1313,15 @@ selscan(struct proc *p, struct _select *sel, int nfd, int32_t *retval, int msk, i, j, fd; u_int32_t bits; struct fileproc *fp; - int n = 0; - int nc = 0; + int n = 0; /* count of bits */ + int nc = 0; /* bit vector offset (nc'th bit) */ static int flag[3] = { FREAD, FWRITE, 0 }; u_int32_t *iptr, *optr; u_int nw; u_int32_t *ibits, *obits; char * wql; char * wql_ptr; - int count, kfcount; - vnode_t vp; + int count; struct vfs_context context = *vfs_context_current(); /* @@ -1288,57 +1339,9 @@ selscan(struct proc *p, struct _select *sel, int nfd, int32_t *retval, nw = howmany(nfd, NFDBITS); count = sel->count; - kfcount = sel->kfcount; - - if (kfcount > count) - panic("selscan: count < kfcount"); - - if (kfcount != 0) { - proc_fdlock(p); - for (msk = 0; msk < 3; msk++) { - iptr = (u_int32_t *)&ibits[msk * nw]; - optr = (u_int32_t *)&obits[msk * nw]; - - for (i = 0; i < nfd; i += NFDBITS) { - bits = iptr[i/NFDBITS]; - - while ((j = ffs(bits)) && (fd = i + --j) < nfd) { - bits &= ~(1 << j); - fp = fdp->fd_ofiles[fd]; - - if (fp == NULL || - (fdp->fd_ofileflags[fd] & UF_RESERVED)) { - proc_fdunlock(p); - return(EBADF); - } - if (sel_pass == SEL_SECONDPASS) { - wql_ptr = (char *)0; - fp->f_flags &= ~FP_INSELECT; - fp->f_waddr = (void *)0; - } else { - wql_ptr = (wql + nc * SIZEOF_WAITQUEUE_LINK); - fp->f_flags |= FP_INSELECT; - fp->f_waddr = (void *)wqsub; - } - - context.vc_ucred = fp->f_cred; - - if (fp->f_ops && (fp->f_type == DTYPE_VNODE) - && ((vp = (struct vnode *)fp->f_data) != NULLVP) - && (vp->v_type == VCHR) - && fo_select(fp, flag[msk], wql_ptr, &context)) { - optr[fd/NFDBITS] |= (1 << (fd % NFDBITS)); - n++; - } - nc++; - } - } - } - proc_fdunlock(p); - } nc = 0; - if (kfcount != count) { + if (count) { proc_fdlock(p); for (msk = 0; msk < 3; msk++) { iptr = (u_int32_t *)&ibits[msk * nw]; @@ -1351,29 +1354,37 @@ selscan(struct proc *p, struct _select *sel, int nfd, int32_t *retval, bits &= ~(1 << j); fp = fdp->fd_ofiles[fd]; - if (fp == NULL || - (fdp->fd_ofileflags[fd] & UF_RESERVED)) { + if (fp == NULL || (fdp->fd_ofileflags[fd] & UF_RESERVED)) { + /* + * If we abort because of a bad + * fd, let the caller unwind... 
+ */ proc_fdunlock(p); return(EBADF); } if (sel_pass == SEL_SECONDPASS) { wql_ptr = (char *)0; - fp->f_flags &= ~FP_INSELECT; - fp->f_waddr = (void *)0; + if ((fp->f_flags & FP_INSELECT) && (fp->f_waddr == (void *)wqsub)) { + fp->f_flags &= ~FP_INSELECT; + fp->f_waddr = (void *)0; + } } else { wql_ptr = (wql + nc * SIZEOF_WAITQUEUE_LINK); - fp->f_flags |= FP_INSELECT; - fp->f_waddr = (void *)wqsub; + if (fp->f_flags & FP_INSELECT) { + /* someone is already in select on this fp */ + fp->f_flags |= FP_SELCONFLICT; + wait_queue_link(&select_conflict_queue, (wait_queue_set_t)wqsub); + } else { + fp->f_flags |= FP_INSELECT; + fp->f_waddr = (void *)wqsub; + } } context.vc_ucred = fp->f_cred; - if ((fp->f_ops && - ((fp->f_type != DTYPE_VNODE) - || (((vp = (struct vnode *)fp->f_data) != NULLVP) - && (vp->v_type != VCHR)) - ) - && fo_select(fp, flag[msk], wql_ptr, &context))) { + /* The select; set the bit, if true */ + if (fp->f_ops + && fo_select(fp, flag[msk], wql_ptr, &context)) { optr[fd/NFDBITS] |= (1 << (fd % NFDBITS)); n++; } @@ -1476,9 +1487,9 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval) /* convert the poll event into a kqueue kevent */ kev.ident = fds[i].fd; kev.flags = EV_ADD | EV_ONESHOT | EV_POLL; - kev.fflags = NOTE_LOWAT; - kev.data = 1; /* efficiency be damned: any data should trigger */ kev.udata = CAST_USER_ADDR_T(&fds[i]); + kev.fflags = 0; + kev.data = 0; kev.ext[0] = 0; kev.ext[1] = 0; @@ -1608,9 +1619,32 @@ seltrue(__unused dev_t dev, __unused int flag, __unused struct proc *p) return (1); } +/* + * selcount + * + * Count the number of bits set in the input bit vector, and establish an + * outstanding fp->f_iocount for each of the descriptors which will be in + * use in the select operation. + * + * Parameters: p The process doing the select + * ibits The input bit vector + * nfd The number of fd's in the vector + * countp Pointer to where to store the bit count + * + * Returns: 0 Success + * EIO Bad per process open file table + * EBADF One of the bits in the input bit vector + * references an invalid fd + * + * Implicit: *countp (modified) Count of fd's + * + * Notes: This function is the first pass under the proc_fdlock() that + * permits us to recognize invalid descriptors in the bit vector; + * they may, however, not remain valid through the drop and + * later reacquisition of the proc_fdlock(). 
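+ * (This is also why seldrop_locked() must tolerate NULL fps: a descriptor counted here can be closed out from under us before the drop.)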
+ */ static int -selcount(struct proc *p, u_int32_t *ibits, __unused u_int32_t *obits, - int nfd, int *countp, int * kfcountp) +selcount(struct proc *p, u_int32_t *ibits, int nfd, int *countp) { struct filedesc *fdp = p->p_fd; int msk, i, j, fd; @@ -1620,9 +1654,8 @@ selcount(struct proc *p, u_int32_t *ibits, __unused u_int32_t *obits, u_int32_t *iptr; u_int nw; int error=0; - int kfc = 0; int dropcount; - vnode_t vp; + int need_wakeup = 0; /* * Problems when reboot; due to MacOSX signal probs */ if (fdp == NULL) { *countp = 0; - *kfcountp = 0; return(EIO); } nw = howmany(nfd, NFDBITS); @@ -1646,16 +1678,10 @@ selcount(struct proc *p, u_int32_t *ibits, __unused u_int32_t *obits, if (fp == NULL || (fdp->fd_ofileflags[fd] & UF_RESERVED)) { *countp = 0; - *kfcountp = 0; error = EBADF; goto bad; } fp->f_iocount++; - if ((fp->f_type == DTYPE_VNODE) - && ((vp = (struct vnode *)fp->f_data) != NULLVP) - && (vp->v_type == VCHR) ) - kfc++; - n++; } } @@ -1663,48 +1689,64 @@ selcount(struct proc *p, u_int32_t *ibits, __unused u_int32_t *obits, proc_fdunlock(p); *countp = n; - *kfcountp = kfc; return (0); + bad: dropcount = 0; if (n== 0) goto out; - /* undo the iocounts */ - for (msk = 0; msk < 3; msk++) { - iptr = (u_int32_t *)&ibits[msk * nw]; - for (i = 0; i < nfd; i += NFDBITS) { - bits = iptr[i/NFDBITS]; - while ((j = ffs(bits)) && (fd = i + --j) < nfd) { - bits &= ~(1 << j); - fp = fdp->fd_ofiles[fd]; - if (dropcount >= n) - goto out; - fp->f_iocount--; + /* Ignore error return; it's already EBADF */ + (void)seldrop_locked(p, ibits, nfd, n, &need_wakeup, 1); - if (p->p_fpdrainwait && fp->f_iocount == 0) { - p->p_fpdrainwait = 0; - wakeup(&p->p_fpdrainwait); - } - dropcount++; - } - } - } out: proc_fdunlock(p); + if (need_wakeup) { + wakeup(&p->p_fpdrainwait); + } return(error); } + +/* + * seldrop_locked + * + * Drop outstanding wait queue references set up during selscan(); drop the + * outstanding per fileproc f_iocount() picked up during the selcount(). + * + * Parameters: p Process performing the select + * ibits Input bit vector of fd's + * nfd Number of fd's + * lim Limit to number of vector entries to + * consider, or -1 for "all" + * fromselcount True if called from selcount() + * need_wakeup Pointer to flag to set to do a wakeup + * if f_iocount on any descriptor goes to 0 + * + * Returns: 0 Success + * EBADF One or more fds in the bit vector + * were invalid, but the rest + * were successfully dropped + * + * Notes: An fd may become bad while the proc_fdlock() is not held, + * if a multithreaded application closes the fd out from under + * the in progress select. In this case, we still have to + * clean up after the set up on the remaining fds. 
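+ * (The lim argument exists for this unwind path: when called from selcount(), only the first lim descriptors actually hold iocounts that need to be returned.)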
+/* + * seldrop_locked + * + * Drop outstanding wait queue references set up during selscan(); drop the + * outstanding per fileproc f_iocount() picked up during the selcount(). + * + * Parameters: p Process performing the select + * ibits Input bit vector of fd's + * nfd Number of fd's + * lim Limit to number of vector entries to + * consider, or -1 for "all" + * fromselcount True if called from selcount(); if so, + * at most 'lim' entries are dropped + * need_wakeup Pointer to flag to set to do a wakeup + * if f_iocount on any descriptor goes to 0 + * + * Returns: 0 Success + * EBADF One or more fds in the bit vector + * were invalid, but the rest + * were successfully dropped + * + * Notes: An fd may become bad while the proc_fdlock() is not held, + * if a multithreaded application closes the fd out from under + * the in progress select. In this case, we still have to + * clean up after the set up on the remaining fds. + */ static int -seldrop(struct proc *p, u_int32_t *ibits, int nfd) +seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wakeup, int fromselcount) { struct filedesc *fdp = p->p_fd; int msk, i, j, fd; u_int32_t bits; struct fileproc *fp; - int n = 0; u_int32_t *iptr; u_int nw; + int error = 0; + int dropcount = 0; + uthread_t uth = get_bsdthread_info(current_thread()); + + *need_wakeup = 0; /* * Problems when reboot; due to MacOSX signal probs @@ -1716,8 +1758,6 @@ seldrop(struct proc *p, u_int32_t *ibits, int nfd) nw = howmany(nfd, NFDBITS); - - proc_fdlock(p); for (msk = 0; msk < 3; msk++) { iptr = (u_int32_t *)&ibits[msk * nw]; for (i = 0; i < nfd; i += NFDBITS) { @@ -1725,28 +1765,67 @@ bits = iptr[i/NFDBITS]; while ((j = ffs(bits)) && (fd = i + --j) < nfd) { bits &= ~(1 << j); fp = fdp->fd_ofiles[fd]; - if (fp == NULL -#if 0 - /* if you are here then it is being closed */ - || (fdp->fd_ofileflags[fd] & UF_RESERVED) -#endif - ) { - proc_fdunlock(p); - return(EBADF); + /* + * If we've already dropped as many as were + * counted/scanned, then we are done. + */ + if ((fromselcount != 0) && (++dropcount > lim)) + goto done; + + if (fp == NULL) { + /* skip (now) bad fds */ + error = EBADF; + continue; + } + /* + * Only clear the flag if we set it. We'll + * only find that we set it if we had made + * at least one [partial] pass through selscan(). + */ + if ((fp->f_flags & FP_INSELECT) && (fp->f_waddr == (void *)uth->uu_wqset)) { + fp->f_flags &= ~FP_INSELECT; + fp->f_waddr = (void *)0; } - n++; - fp->f_iocount--; - fp->f_flags &= ~FP_INSELECT; - if (p->p_fpdrainwait && fp->f_iocount == 0) { - p->p_fpdrainwait = 0; - wakeup(&p->p_fpdrainwait); + fp->f_iocount--; + if (fp->f_iocount < 0) + panic("f_iocount overdecrement!"); + + if (fp->f_iocount == 0) { + /* + * The last iocount is responsible for clearing + * the selconflict flag - even if we didn't set it - + * and is also responsible for waking up anyone + * waiting on iocounts to drain.
*/ + if (fp->f_flags & FP_SELCONFLICT) + fp->f_flags &= ~FP_SELCONFLICT; + if (p->p_fpdrainwait) { + p->p_fpdrainwait = 0; + *need_wakeup = 1; + } } } } +done: + return (error); +} + + +static int +seldrop(struct proc *p, u_int32_t *ibits, int nfd) +{ + int error; + int need_wakeup = 0; + + proc_fdlock(p); + error = seldrop_locked(p, ibits, nfd, nfd, &need_wakeup, 0); proc_fdunlock(p); - return (0); + if (need_wakeup) { + wakeup(&p->p_fpdrainwait); + } + return (error); } /* @@ -1760,12 +1839,8 @@ selrecord(__unused struct proc *selector, struct selinfo *sip, void * p_wql) /* need to look at collisions */ - if ((p_wql == (void *)0) && ((sip->si_flags & SI_INITED) == 0)) { - return; - } - /*do not record if this is second pass of select */ - if((p_wql == (void *)0)) { + if(p_wql == (void *)0) { return; } diff --git a/bsd/kern/sys_pipe.c b/bsd/kern/sys_pipe.c index f86ad5a11..c374ea07e 100644 --- a/bsd/kern/sys_pipe.c +++ b/bsd/kern/sys_pipe.c @@ -231,17 +231,17 @@ int maxpipekva = 1024 * 1024 * 16; #if PIPE_SYSCTLS SYSCTL_DECL(_kern_ipc); -SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RD, +SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RD|CTLFLAG_LOCKED, &maxpipekva, 0, "Pipe KVA limit"); -SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekvawired, CTLFLAG_RW, +SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekvawired, CTLFLAG_RW|CTLFLAG_LOCKED, &maxpipekvawired, 0, "Pipe KVA wired limit"); -SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD, +SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD|CTLFLAG_LOCKED, &amountpipes, 0, "Current # of pipes"); -SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD, +SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD|CTLFLAG_LOCKED, &nbigpipe, 0, "Current # of big pipes"); -SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD, +SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD|CTLFLAG_LOCKED, &amountpipekva, 0, "Pipe KVA usage"); -SYSCTL_INT(_kern_ipc, OID_AUTO, pipekvawired, CTLFLAG_RD, +SYSCTL_INT(_kern_ipc, OID_AUTO, pipekvawired, CTLFLAG_RD|CTLFLAG_LOCKED, &amountpipekvawired, 0, "Pipe wired KVA usage"); #endif @@ -1332,6 +1332,16 @@ pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags, error = EAGAIN; break; } + + /* + * If read side wants to go away, we just issue a signal + * to ourselves. + */ + if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) { + error = EPIPE; + break; + } + /* * We have no more space and have something to offer, * wake up select/poll. @@ -1344,14 +1354,6 @@ pipe_write(struct fileproc *fp, struct uio *uio, __unused int flags, if (error != 0) break; - /* - * If read side wants to go away, we just issue a signal - * to ourselves. - */ - if (wpipe->pipe_state & (PIPE_DRAIN | PIPE_EOF)) { - error = EPIPE; - break; - } } } --wpipe->pipe_busy;
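The pipe_write() hunk above moves the PIPE_DRAIN|PIPE_EOF test ahead of the select wakeup and sleep, so a writer whose reader has vanished fails immediately with EPIPE instead of blocking first. A userland sketch, illustrative only, of the visible behavior:

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int pfd[2];
	char c = 'x';

	signal(SIGPIPE, SIG_IGN);	/* take the EPIPE return instead */
	if (pipe(pfd) == -1)
		return (1);
	close(pfd[0]);			/* the read side goes away */
	if (write(pfd[1], &c, 1) == -1 && errno == EPIPE)
		printf("EPIPE returned, no hang\n");
	return (0);
}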
@@ -1741,8 +1743,14 @@ filt_piperead(struct knote *kn, long hint) kn->kn_flags |= EV_EOF; retval = 1; } else { - retval = (kn->kn_sfflags & NOTE_LOWAT) ? - (kn->kn_data >= kn->kn_sdata) : (kn->kn_data > 0); + int64_t lowwat = 1; + if (kn->kn_sfflags & NOTE_LOWAT) { + if (rpipe->pipe_buffer.size && kn->kn_sdata > rpipe->pipe_buffer.size) + lowwat = rpipe->pipe_buffer.size; + else if (kn->kn_sdata > lowwat) + lowwat = kn->kn_sdata; + } + retval = kn->kn_data >= lowwat; } if (hint == 0) @@ -1779,17 +1787,24 @@ filt_pipewrite(struct knote *kn, long hint) } kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; if (!kn->kn_data && wpipe->pipe_buffer.size == 0) - kn->kn_data = 1; /* unwritten pipe is ready for write */ + kn->kn_data = PIPE_BUF; /* unwritten pipe is ready for write */ #ifndef PIPE_NODIRECT if (wpipe->pipe_state & PIPE_DIRECTW) kn->kn_data = 0; #endif + int64_t lowwat = PIPE_BUF; + if (kn->kn_sfflags & NOTE_LOWAT) { + if (wpipe->pipe_buffer.size && kn->kn_sdata > wpipe->pipe_buffer.size) + lowwat = wpipe->pipe_buffer.size; + else if (kn->kn_sdata > lowwat) + lowwat = kn->kn_sdata; + } + if (hint == 0) PIPE_UNLOCK(rpipe); - return (kn->kn_data >= ((kn->kn_sfflags & NOTE_LOWAT) ? - kn->kn_sdata : PIPE_BUF)); + return (kn->kn_data >= lowwat); } int
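Both pipe knote filters above now clamp a NOTE_LOWAT threshold to the pipe's buffer size, so an oversized kn_sdata can no longer make the knote permanently unsatisfiable. A userland sketch, illustrative only (the exact buffer size is implementation-dependent), that depends on that clamping:

#include <sys/event.h>
#include <sys/time.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int kq, pfd[2];
	struct kevent kev;
	struct timespec ts = { 0, 0 };
	char chunk[1024] = { 0 };

	if (pipe(pfd) == -1 || (kq = kqueue()) == -1)
		return (1);
	/* a low-water mark far beyond any pipe buffer; the filter clamps it */
	EV_SET(&kev, pfd[0], EVFILT_READ, EV_ADD, NOTE_LOWAT, 1 << 30, 0);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		return (1);
	(void)fcntl(pfd[1], F_SETFL, fcntl(pfd[1], F_GETFL) | O_NONBLOCK);
	while (write(pfd[1], chunk, sizeof(chunk)) > 0)
		;	/* fill the buffer completely */
	/* the clamped mark equals the buffer size, which is now met */
	printf("%d event(s)\n", kevent(kq, NULL, 0, &kev, 1, &ts));
	return (0);
}

Under the pre-change code the unclamped 1 << 30 mark could never be satisfied and the event would never fire.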
diff --git a/bsd/kern/sys_socket.c b/bsd/kern/sys_socket.c index 471cac76a..431e47658 100644 --- a/bsd/kern/sys_socket.c +++ b/bsd/kern/sys_socket.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -194,27 +194,7 @@ soioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) /* Call the socket filter's ioctl handler for most ioctls */ if (IOCGROUP(cmd) != 'i' && IOCGROUP(cmd) != 'r') { - int filtered = 0; - struct socket_filter_entry *filter; - - for (filter = so->so_filt; filter && error == 0; - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_ioctl) { - if (filtered == 0) { - sflt_use(so); - socket_unlock(so, 0); - filtered = 1; - } - error = filter->sfe_filter->sf_filter. - sf_ioctl(filter->sfe_cookie, so, cmd, data); - } - } - - if (filtered) { - socket_lock(so, 0); - sflt_unuse(so); - } - + error = sflt_ioctl(so, cmd, data); if (error != 0) goto out; } @@ -462,7 +442,7 @@ soo_stat(struct socket *so, void *ub, int isstat64) sb64->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH; sb64->st_size = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; sb64->st_uid = so->so_uid; - sb64->st_gid = -1; /* XXX -- what else to do? */ + sb64->st_gid = so->so_gid; } else { sb->st_mode = S_IFSOCK; if ((so->so_state & SS_CANTRCVMORE) == 0 || @@ -472,7 +452,7 @@ soo_stat(struct socket *so, void *ub, int isstat64) sb->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH; sb->st_size = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; sb->st_uid = so->so_uid; - sb->st_gid = -1; /* XXX -- what else to do? */ + sb->st_gid = so->so_gid; } ret = (*so->so_proto->pr_usrreqs->pru_sense)(so, ub, isstat64); diff --git a/bsd/kern/syscalls.master b/bsd/kern/syscalls.master index 00fb84082..009dd377b 100644 --- a/bsd/kern/syscalls.master +++ b/bsd/kern/syscalls.master @@ -39,21 +39,21 @@ #include <sys/sysproto.h> 0 AUE_NULL ALL { int nosys(void); } { indirect syscall } -1 AUE_EXIT ALL { void exit(int rval); } -2 AUE_FORK ALL { int fork(void); } +1 AUE_EXIT ALL { void exit(int rval) NO_SYSCALL_STUB; } +2 AUE_FORK ALL { int fork(void) NO_SYSCALL_STUB; } 3 AUE_NULL ALL { user_ssize_t read(int fd, user_addr_t cbuf, user_size_t nbyte); } 4 AUE_NULL ALL { user_ssize_t write(int fd, user_addr_t cbuf, user_size_t nbyte); } -5 AUE_OPEN_RWTC ALL { int open(user_addr_t path, int flags, int mode); } +5 AUE_OPEN_RWTC ALL { int open(user_addr_t path, int flags, int mode) NO_SYSCALL_STUB; } 6 AUE_CLOSE ALL { int close(int fd); } -7 AUE_WAIT4 ALL { int wait4(int pid, user_addr_t status, int options, user_addr_t rusage); } +7 AUE_WAIT4 ALL { int wait4(int pid, user_addr_t status, int options, user_addr_t rusage) NO_SYSCALL_STUB; } 8 AUE_NULL ALL { int nosys(void); } { old creat } 9 AUE_LINK ALL { int link(user_addr_t path, user_addr_t link); } -10 AUE_UNLINK ALL { int unlink(user_addr_t path); } +10 AUE_UNLINK ALL { int unlink(user_addr_t path) NO_SYSCALL_STUB; } 11 AUE_NULL ALL { int nosys(void); } { old execv } 12 AUE_CHDIR ALL { int chdir(user_addr_t path); } 13 AUE_FCHDIR ALL { int fchdir(int fd); } 14 AUE_MKNOD ALL { int mknod(user_addr_t path, int mode, int dev); } -15 AUE_CHMOD ALL { int chmod(user_addr_t path, int mode); } +15 AUE_CHMOD ALL { int chmod(user_addr_t path, int mode) NO_SYSCALL_STUB; } 16 AUE_CHOWN ALL { int chown(user_addr_t path, int uid, int gid); } 17 AUE_NULL ALL { int nosys(void); } { old break } 18 AUE_GETFSSTAT ALL { int getfsstat(user_addr_t buf, int bufsize, int flags); } @@ -66,12 +66,12 @@ 25 AUE_GETEUID ALL { int geteuid(void); } 26 AUE_PTRACE ALL { int ptrace(int req, pid_t pid, caddr_t addr, int data); } #if SOCKETS -27 AUE_RECVMSG ALL { int recvmsg(int s, struct msghdr *msg, int flags); } -28 AUE_SENDMSG ALL { int sendmsg(int s, caddr_t msg, int flags); } -29 AUE_RECVFROM ALL { int recvfrom(int s, void *buf, size_t len, int flags, struct sockaddr *from, int *fromlenaddr); } -30 AUE_ACCEPT ALL { int accept(int s, caddr_t name, socklen_t *anamelen); } -31 AUE_GETPEERNAME ALL { int getpeername(int fdes, caddr_t asa, socklen_t *alen); } -32 AUE_GETSOCKNAME ALL { int getsockname(int fdes, caddr_t asa, socklen_t *alen); } +27 AUE_RECVMSG ALL { int recvmsg(int s, struct msghdr *msg, int flags) NO_SYSCALL_STUB; } +28 AUE_SENDMSG ALL { int sendmsg(int s, caddr_t msg, int flags) NO_SYSCALL_STUB; } +29 AUE_RECVFROM ALL { int recvfrom(int s, void *buf, size_t len, int flags, struct sockaddr *from, int *fromlenaddr) NO_SYSCALL_STUB; } +30 AUE_ACCEPT ALL { int accept(int s, caddr_t name, socklen_t *anamelen) NO_SYSCALL_STUB; } +31 AUE_GETPEERNAME ALL { int getpeername(int fdes, caddr_t asa, socklen_t *alen) NO_SYSCALL_STUB; } +32 AUE_GETSOCKNAME ALL { int getsockname(int fdes, caddr_t asa, socklen_t *alen) NO_SYSCALL_STUB; } #else 27 AUE_NULL ALL { int nosys(void); } 28 AUE_NULL ALL { int nosys(void); } @@ -84,7 +84,7 @@ 34 AUE_CHFLAGS ALL { int chflags(char *path, int flags); } 35 AUE_FCHFLAGS ALL { int fchflags(int fd, int flags); } 36 AUE_SYNC ALL { int sync(void); } -37 AUE_KILL ALL { int kill(int pid, int signum, int posix); } +37 AUE_KILL ALL { int kill(int pid, int signum, int posix)
NO_SYSCALL_STUB; } 38 AUE_NULL ALL { int nosys(void); } { old stat } 39 AUE_GETPPID ALL { int getppid(void); } 40 AUE_NULL ALL { int nosys(void); } { old lstat } @@ -93,15 +93,15 @@ 43 AUE_GETEGID ALL { int getegid(void); } 44 AUE_PROFILE ALL { int profil(short *bufbase, size_t bufsize, u_long pcoffset, u_int pcscale); } 45 AUE_NULL ALL { int nosys(void); } { old ktrace } -46 AUE_SIGACTION ALL { int sigaction(int signum, struct __sigaction *nsa, struct sigaction *osa); } +46 AUE_SIGACTION ALL { int sigaction(int signum, struct __sigaction *nsa, struct sigaction *osa) NO_SYSCALL_STUB; } 47 AUE_GETGID ALL { int getgid(void); } 48 AUE_SIGPROCMASK ALL { int sigprocmask(int how, user_addr_t mask, user_addr_t omask); } -49 AUE_GETLOGIN ALL { int getlogin(char *namebuf, u_int namelen); } -50 AUE_SETLOGIN ALL { int setlogin(char *namebuf); } +49 AUE_GETLOGIN ALL { int getlogin(char *namebuf, u_int namelen) NO_SYSCALL_STUB; } +50 AUE_SETLOGIN ALL { int setlogin(char *namebuf) NO_SYSCALL_STUB; } 51 AUE_ACCT ALL { int acct(char *path); } 52 AUE_SIGPENDING ALL { int sigpending(struct sigvec *osv); } -53 AUE_SIGALTSTACK ALL { int sigaltstack(struct sigaltstack *nss, struct sigaltstack *oss); } -54 AUE_IOCTL ALL { int ioctl(int fd, u_long com, caddr_t data); } +53 AUE_SIGALTSTACK ALL { int sigaltstack(struct sigaltstack *nss, struct sigaltstack *oss) NO_SYSCALL_STUB; } +54 AUE_IOCTL ALL { int ioctl(int fd, u_long com, caddr_t data) NO_SYSCALL_STUB; } 55 AUE_REBOOT ALL { int reboot(int opt, char *command); } 56 AUE_REVOKE ALL { int revoke(char *path); } 57 AUE_SYMLINK ALL { int symlink(char *path, char *link); } @@ -112,7 +112,7 @@ 62 AUE_NULL ALL { int nosys(void); } { old fstat } 63 AUE_NULL ALL { int nosys(void); } { used internally, reserved } 64 AUE_NULL ALL { int nosys(void); } { old getpagesize } -65 AUE_MSYNC ALL { int msync(caddr_t addr, size_t len, int flags); } +65 AUE_MSYNC ALL { int msync(caddr_t addr, size_t len, int flags) NO_SYSCALL_STUB; } 66 AUE_VFORK ALL { int vfork(void); } 67 AUE_NULL ALL { int nosys(void); } { old vread } 68 AUE_NULL ALL { int nosys(void); } { old vwrite } @@ -120,8 +120,8 @@ 70 AUE_NULL ALL { int nosys(void); } { old sstk } 71 AUE_NULL ALL { int nosys(void); } { old mmap } 72 AUE_NULL ALL { int nosys(void); } { old vadvise } -73 AUE_MUNMAP ALL { int munmap(caddr_t addr, size_t len); } -74 AUE_MPROTECT ALL { int mprotect(caddr_t addr, size_t len, int prot); } +73 AUE_MUNMAP ALL { int munmap(caddr_t addr, size_t len) NO_SYSCALL_STUB; } +74 AUE_MPROTECT ALL { int mprotect(caddr_t addr, size_t len, int prot) NO_SYSCALL_STUB; } 75 AUE_MADVISE ALL { int madvise(caddr_t addr, size_t len, int behav); } 76 AUE_NULL ALL { int nosys(void); } { old vhangup } 77 AUE_NULL ALL { int nosys(void); } { old vlimit } @@ -139,14 +139,14 @@ 89 AUE_GETDTABLESIZE ALL { int getdtablesize(void); } 90 AUE_DUP2 ALL { int dup2(u_int from, u_int to); } 91 AUE_NULL ALL { int nosys(void); } { old getdopt } -92 AUE_FCNTL ALL { int fcntl(int fd, int cmd, long arg); } -93 AUE_SELECT ALL { int select(int nd, u_int32_t *in, u_int32_t *ou, u_int32_t *ex, struct timeval *tv); } +92 AUE_FCNTL ALL { int fcntl(int fd, int cmd, long arg) NO_SYSCALL_STUB; } +93 AUE_SELECT ALL { int select(int nd, u_int32_t *in, u_int32_t *ou, u_int32_t *ex, struct timeval *tv) NO_SYSCALL_STUB; } 94 AUE_NULL ALL { int nosys(void); } { old setdopt } 95 AUE_FSYNC ALL { int fsync(int fd); } 96 AUE_SETPRIORITY ALL { int setpriority(int which, id_t who, int prio); } #if SOCKETS 97 AUE_SOCKET ALL { int socket(int domain, int type,
int protocol); } -98 AUE_CONNECT ALL { int connect(int s, caddr_t name, socklen_t namelen); } +98 AUE_CONNECT ALL { int connect(int s, caddr_t name, socklen_t namelen) NO_SYSCALL_STUB; } #else 97 AUE_NULL ALL { int nosys(void); } 98 AUE_NULL ALL { int nosys(void); } @@ -157,9 +157,9 @@ 102 AUE_NULL ALL { int nosys(void); } { old recv } 103 AUE_NULL ALL { int nosys(void); } { old sigreturn } #if SOCKETS -104 AUE_BIND ALL { int bind(int s, caddr_t name, socklen_t namelen); } +104 AUE_BIND ALL { int bind(int s, caddr_t name, socklen_t namelen) NO_SYSCALL_STUB; } 105 AUE_SETSOCKOPT ALL { int setsockopt(int s, int level, int name, caddr_t val, socklen_t valsize); } -106 AUE_LISTEN ALL { int listen(int s, int backlog); } +106 AUE_LISTEN ALL { int listen(int s, int backlog) NO_SYSCALL_STUB; } #else 104 AUE_NULL ALL { int nosys(void); } 105 AUE_NULL ALL { int nosys(void); } @@ -169,7 +169,7 @@ 108 AUE_NULL ALL { int nosys(void); } { old sigvec } 109 AUE_NULL ALL { int nosys(void); } { old sigblock } 110 AUE_NULL ALL { int nosys(void); } { old sigsetmask } -111 AUE_NULL ALL { int sigsuspend(sigset_t mask); } +111 AUE_NULL ALL { int sigsuspend(sigset_t mask) NO_SYSCALL_STUB; } 112 AUE_NULL ALL { int nosys(void); } { old sigstack } #if SOCKETS 113 AUE_NULL ALL { int nosys(void); } { old recvmsg } @@ -179,7 +179,7 @@ 114 AUE_NULL ALL { int nosys(void); } #endif /* SOCKETS */ 115 AUE_NULL ALL { int nosys(void); } { old vtrace } -116 AUE_GETTIMEOFDAY ALL { int gettimeofday(struct timeval *tp, struct timezone *tzp); } +116 AUE_GETTIMEOFDAY ALL { int gettimeofday(struct timeval *tp, struct timezone *tzp) NO_SYSCALL_STUB; } 117 AUE_GETRUSAGE ALL { int getrusage(int who, struct rusage *rusage); } #if SOCKETS 118 AUE_GETSOCKOPT ALL { int getsockopt(int s, int level, int name, caddr_t val, socklen_t *avalsize); } @@ -189,28 +189,28 @@ 119 AUE_NULL ALL { int nosys(void); } { old resuba } 120 AUE_READV ALL { user_ssize_t readv(int fd, struct iovec *iovp, u_int iovcnt); } 121 AUE_WRITEV ALL { user_ssize_t writev(int fd, struct iovec *iovp, u_int iovcnt); } -122 AUE_SETTIMEOFDAY ALL { int settimeofday(struct timeval *tv, struct timezone *tzp); } +122 AUE_SETTIMEOFDAY ALL { int settimeofday(struct timeval *tv, struct timezone *tzp) NO_SYSCALL_STUB; } 123 AUE_FCHOWN ALL { int fchown(int fd, int uid, int gid); } -124 AUE_FCHMOD ALL { int fchmod(int fd, int mode); } +124 AUE_FCHMOD ALL { int fchmod(int fd, int mode) NO_SYSCALL_STUB; } 125 AUE_NULL ALL { int nosys(void); } { old recvfrom } -126 AUE_SETREUID ALL { int setreuid(uid_t ruid, uid_t euid); } -127 AUE_SETREGID ALL { int setregid(gid_t rgid, gid_t egid); } -128 AUE_RENAME ALL { int rename(char *from, char *to); } +126 AUE_SETREUID ALL { int setreuid(uid_t ruid, uid_t euid) NO_SYSCALL_STUB; } +127 AUE_SETREGID ALL { int setregid(gid_t rgid, gid_t egid) NO_SYSCALL_STUB; } +128 AUE_RENAME ALL { int rename(char *from, char *to) NO_SYSCALL_STUB; } 129 AUE_NULL ALL { int nosys(void); } { old truncate } 130 AUE_NULL ALL { int nosys(void); } { old ftruncate } 131 AUE_FLOCK ALL { int flock(int fd, int how); } 132 AUE_MKFIFO ALL { int mkfifo(user_addr_t path, int mode); } #if SOCKETS -133 AUE_SENDTO ALL { int sendto(int s, caddr_t buf, size_t len, int flags, caddr_t to, socklen_t tolen); } +133 AUE_SENDTO ALL { int sendto(int s, caddr_t buf, size_t len, int flags, caddr_t to, socklen_t tolen) NO_SYSCALL_STUB; } 134 AUE_SHUTDOWN ALL { int shutdown(int s, int how); } -135 AUE_SOCKETPAIR ALL { int socketpair(int domain, int type, int protocol, int *rsv); } +135 
AUE_SOCKETPAIR ALL { int socketpair(int domain, int type, int protocol, int *rsv) NO_SYSCALL_STUB; } #else 133 AUE_NULL ALL { int nosys(void); } 134 AUE_NULL ALL { int nosys(void); } 135 AUE_NULL ALL { int nosys(void); } #endif /* SOCKETS */ 136 AUE_MKDIR ALL { int mkdir(user_addr_t path, int mode); } -137 AUE_RMDIR ALL { int rmdir(char *path); } +137 AUE_RMDIR ALL { int rmdir(char *path) NO_SYSCALL_STUB; } 138 AUE_UTIMES ALL { int utimes(char *path, struct timeval *tptr); } 139 AUE_FUTIMES ALL { int futimes(int fd, struct timeval *tptr); } 140 AUE_ADJTIME ALL { int adjtime(struct timeval *delta, struct timeval *olddelta); } @@ -279,10 +279,10 @@ 191 AUE_PATHCONF ALL { int pathconf(char *path, int name); } 192 AUE_FPATHCONF ALL { int fpathconf(int fd, int name); } 193 AUE_NULL ALL { int nosys(void); } -194 AUE_GETRLIMIT ALL { int getrlimit(u_int which, struct rlimit *rlp); } -195 AUE_SETRLIMIT ALL { int setrlimit(u_int which, struct rlimit *rlp); } +194 AUE_GETRLIMIT ALL { int getrlimit(u_int which, struct rlimit *rlp) NO_SYSCALL_STUB; } +195 AUE_SETRLIMIT ALL { int setrlimit(u_int which, struct rlimit *rlp) NO_SYSCALL_STUB; } 196 AUE_GETDIRENTRIES ALL { int getdirentries(int fd, char *buf, u_int count, long *basep); } -197 AUE_MMAP ALL { user_addr_t mmap(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos); } +197 AUE_MMAP ALL { user_addr_t mmap(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos) NO_SYSCALL_STUB; } 198 AUE_NULL ALL { int nosys(void); } { __syscall } 199 AUE_LSEEK ALL { off_t lseek(int fd, off_t offset, int whence); } 200 AUE_TRUNCATE ALL { int truncate(char *path, off_t length); } @@ -326,8 +326,8 @@ 217 AUE_STATV UHN { int statv(const char *path, struct vstat *vsb); } { soon to be obsolete } 218 AUE_LSTATV UHN { int lstatv(const char *path, struct vstat *vsb); } { soon to be obsolete } 219 AUE_FSTATV UHN { int fstatv(int fd, struct vstat *vsb); } { soon to be obsolete } -220 AUE_GETATTRLIST ALL { int getattrlist(const char *path, struct attrlist *alist, void *attributeBuffer, size_t bufferSize, u_long options); } -221 AUE_SETATTRLIST ALL { int setattrlist(const char *path, struct attrlist *alist, void *attributeBuffer, size_t bufferSize, u_long options); } +220 AUE_GETATTRLIST ALL { int getattrlist(const char *path, struct attrlist *alist, void *attributeBuffer, size_t bufferSize, u_long options) NO_SYSCALL_STUB; } +221 AUE_SETATTRLIST ALL { int setattrlist(const char *path, struct attrlist *alist, void *attributeBuffer, size_t bufferSize, u_long options) NO_SYSCALL_STUB; } 222 AUE_GETDIRENTRIESATTR ALL { int getdirentriesattr(int fd, struct attrlist *alist, void *buffer, size_t buffersize, u_long *count, u_long *basep, u_long *newstate, u_long options); } 223 AUE_EXCHANGEDATA ALL { int exchangedata(const char *path1, const char *path2, u_long options); } 224 AUE_NULL ALL { int nosys(void); } { old checkuseraccess / fsgetpath (which moved to 427) } @@ -349,8 +349,8 @@ 240 AUE_LISTXATTR ALL { user_ssize_t listxattr(user_addr_t path, user_addr_t namebuf, size_t bufsize, int options); } 241 AUE_FLISTXATTR ALL { user_ssize_t flistxattr(int fd, user_addr_t namebuf, size_t bufsize, int options); } 242 AUE_FSCTL ALL { int fsctl(const char *path, u_long cmd, caddr_t data, u_int options); } -243 AUE_INITGROUPS ALL { int initgroups(u_int gidsetsize, gid_t *gidset, int gmuid); } -244 AUE_POSIX_SPAWN ALL { int posix_spawn(pid_t *pid, const char *path, const struct _posix_spawn_args_desc *adesc, char **argv, char **envp); } +243 AUE_INITGROUPS ALL { int 
initgroups(u_int gidsetsize, gid_t *gidset, int gmuid) NO_SYSCALL_STUB; } +244 AUE_POSIX_SPAWN ALL { int posix_spawn(pid_t *pid, const char *path, const struct _posix_spawn_args_desc *adesc, char **argv, char **envp) NO_SYSCALL_STUB; } 245 AUE_FFSCTL ALL { int ffsctl(int fd, u_long cmd, caddr_t data, u_int options); } 246 AUE_NULL ALL { int nosys(void); } @@ -383,7 +383,7 @@ 253 AUE_NULL ALL { int nosys(void); } #endif #if SYSV_SEM -254 AUE_SEMCTL ALL { int semctl(int semid, int semnum, int cmd, semun_t arg); } +254 AUE_SEMCTL ALL { int semctl(int semid, int semnum, int cmd, semun_t arg) NO_SYSCALL_STUB; } 255 AUE_SEMGET ALL { int semget(key_t key, int nsems, int semflg); } 256 AUE_SEMOP ALL { int semop(int semid, struct sembuf *sops, int nsops); } 257 AUE_NULL ALL { int nosys(void); } @@ -394,7 +394,7 @@ 257 AUE_NULL ALL { int nosys(void); } #endif #if SYSV_MSG -258 AUE_MSGCTL ALL { int msgctl(int msqid, int cmd, struct msqid_ds *buf); } +258 AUE_MSGCTL ALL { int msgctl(int msqid, int cmd, struct msqid_ds *buf) NO_SYSCALL_STUB; } 259 AUE_MSGGET ALL { int msgget(key_t key, int msgflg); } 260 AUE_MSGSND ALL { int msgsnd(int msqid, void *msgp, size_t msgsz, int msgflg); } 261 AUE_MSGRCV ALL { user_ssize_t msgrcv(int msqid, void *msgp, size_t msgsz, long msgtyp, int msgflg); } @@ -406,7 +406,7 @@ #endif #if SYSV_SHM 262 AUE_SHMAT ALL { user_addr_t shmat(int shmid, void *shmaddr, int shmflg); } -263 AUE_SHMCTL ALL { int shmctl(int shmid, int cmd, struct shmid_ds *buf); } +263 AUE_SHMCTL ALL { int shmctl(int shmid, int cmd, struct shmid_ds *buf) NO_SYSCALL_STUB; } 264 AUE_SHMDT ALL { int shmdt(void *shmaddr); } 265 AUE_SHMGET ALL { int shmget(key_t key, size_t size, int shmflg); } #else @@ -444,7 +444,7 @@ 292 AUE_MKDIR_EXTENDED ALL { int mkdir_extended(user_addr_t path, uid_t uid, gid_t gid, int mode, user_addr_t xsecurity) NO_SYSCALL_STUB; } 293 AUE_IDENTITYSVC ALL { int identitysvc(int opcode, user_addr_t message) NO_SYSCALL_STUB; } 294 AUE_NULL ALL { int shared_region_check_np(uint64_t *start_address) NO_SYSCALL_STUB; } -295 AUE_NULL ALL { int shared_region_map_np(int fd, uint32_t count, const struct shared_file_mapping_np *mappings) NO_SYSCALL_STUB; } +295 AUE_NULL ALL { int nosys(void); } { old shared_region_map_np } 296 AUE_NULL ALL { int vm_pressure_monitor(int wait_for_pressure, int nsecs_monitored, uint32_t *pages_reclaimed); } #if PSYNCH 297 AUE_NULL ALL { uint32_t psynch_rw_longrdlock(user_addr_t rwlock, uint32_t lgenval, uint32_t ugenval, uint32_t rw_wc, int flags) NO_SYSCALL_STUB; } @@ -453,9 +453,9 @@ 300 AUE_NULL ALL { uint32_t psynch_rw_upgrade(user_addr_t rwlock, uint32_t lgenval, uint32_t ugenval, uint32_t rw_wc, int flags) NO_SYSCALL_STUB; } 301 AUE_NULL ALL { uint32_t psynch_mutexwait(user_addr_t mutex, uint32_t mgen, uint32_t ugen, uint64_t tid, uint32_t flags) NO_SYSCALL_STUB; } 302 AUE_NULL ALL { uint32_t psynch_mutexdrop(user_addr_t mutex, uint32_t mgen, uint32_t ugen, uint64_t tid, uint32_t flags) NO_SYSCALL_STUB; } -303 AUE_NULL ALL { int psynch_cvbroad(user_addr_t cv, uint32_t cvgen, uint32_t diffgen, user_addr_t mutex, uint32_t mgen, uint32_t ugen, uint64_t tid, uint32_t flags) NO_SYSCALL_STUB; } -304 AUE_NULL ALL { int psynch_cvsignal(user_addr_t cv, uint32_t cvgen, uint32_t cvugen, user_addr_t mutex, uint32_t mgen, uint32_t ugen, int thread_port, uint32_t flags) NO_SYSCALL_STUB; } -305 AUE_NULL ALL { uint32_t psynch_cvwait(user_addr_t cv, uint32_t cvgen, uint32_t cvugen, user_addr_t mutex, uint32_t mgen, uint32_t ugen, uint64_t sec, uint64_t usec) NO_SYSCALL_STUB; 
} +303 AUE_NULL ALL { uint32_t psynch_cvbroad(user_addr_t cv, uint64_t cvlsgen, uint64_t cvudgen, uint32_t flags, user_addr_t mutex, uint64_t mugen, uint64_t tid) NO_SYSCALL_STUB; } +304 AUE_NULL ALL { uint32_t psynch_cvsignal(user_addr_t cv, uint64_t cvlsgen, uint32_t cvugen, int thread_port, user_addr_t mutex, uint64_t mugen, uint64_t tid, uint32_t flags) NO_SYSCALL_STUB; } +305 AUE_NULL ALL { uint32_t psynch_cvwait(user_addr_t cv, uint64_t cvlsgen, uint32_t cvugen, user_addr_t mutex, uint64_t mugen, uint32_t flags, int64_t sec, uint32_t nsec) NO_SYSCALL_STUB; } 306 AUE_NULL ALL { uint32_t psynch_rw_rdlock(user_addr_t rwlock, uint32_t lgenval, uint32_t ugenval, uint32_t rw_wc, int flags) NO_SYSCALL_STUB; } 307 AUE_NULL ALL { uint32_t psynch_rw_wrlock(user_addr_t rwlock, uint32_t lgenval, uint32_t ugenval, uint32_t rw_wc, int flags) NO_SYSCALL_STUB; } 308 AUE_NULL ALL { uint32_t psynch_rw_unlock(user_addr_t rwlock, uint32_t lgenval, uint32_t ugenval, uint32_t rw_wc, int flags) NO_SYSCALL_STUB; } @@ -477,7 +477,11 @@ #endif 310 AUE_GETSID ALL { int getsid(pid_t pid); } 311 AUE_SETTIDWITHPID ALL { int settid_with_pid(pid_t pid, int assume) NO_SYSCALL_STUB; } +#if PSYNCH +312 AUE_NULL ALL { int psynch_cvclrprepost(user_addr_t cv, uint32_t cvgen, uint32_t cvugen, uint32_t cvsgen, uint32_t prepocnt, uint32_t preposeq, uint32_t flags) NO_SYSCALL_STUB; } +#else 312 AUE_NULL ALL { int nosys(void); } { old __pthread_cond_timedwait } +#endif 313 AUE_NULL ALL { int aio_fsync(int op, user_addr_t aiocbp); } 314 AUE_NULL ALL { user_ssize_t aio_return(user_addr_t aiocbp); } 315 AUE_NULL ALL { int aio_suspend(user_addr_t aiocblist, int nent, user_addr_t timeoutp); } @@ -488,7 +492,7 @@ 320 AUE_LIOLISTIO ALL { int lio_listio(int mode, user_addr_t aiocblist, int nent, user_addr_t sigp); } 321 AUE_NULL ALL { int nosys(void); } { old __pthread_cond_wait } 322 AUE_IOPOLICYSYS ALL { int iopolicysys(int cmd, void *arg) NO_SYSCALL_STUB; } -323 AUE_NULL ALL { int nosys(void); } +323 AUE_NULL ALL { int process_policy(int scope, int action, int policy, int policy_subtype, user_addr_t attrp, pid_t target_pid, uint64_t target_threadid) NO_SYSCALL_STUB; } 324 AUE_MLOCKALL ALL { int mlockall(int how); } 325 AUE_MUNLOCKALL ALL { int munlockall(int how); } 326 AUE_NULL ALL { int nosys(void); } @@ -544,7 +548,7 @@ #endif /* CONFIG_WORKQUEUE */ 362 AUE_KQUEUE ALL { int kqueue(void); } 363 AUE_NULL ALL { int kevent(int fd, const struct kevent *changelist, int nchanges, struct kevent *eventlist, int nevents, const struct timespec *timeout); } -364 AUE_LCHOWN ALL { int lchown(user_addr_t path, uid_t owner, gid_t group); } +364 AUE_LCHOWN ALL { int lchown(user_addr_t path, uid_t owner, gid_t group) NO_SYSCALL_STUB; } 365 AUE_STACKSNAPSHOT ALL { int stack_snapshot(pid_t pid, user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t flags, uint32_t dispatch_offset) NO_SYSCALL_STUB; } #if CONFIG_WORKQUEUE 366 AUE_NULL ALL { int bsdthread_register(user_addr_t threadstart, user_addr_t wqthread, int pthsize,user_addr_t dummy_value, user_addr_t targetconc_ptr, uint64_t dispatchqueue_offset) NO_SYSCALL_STUB; } @@ -637,7 +641,7 @@ ;#if OLD_SEMWAIT_SIGNAL ;423 AUE_NULL ALL { int nosys(void); } { old __semwait_signal_nocancel } ;#else -423 AUE_SEMWAITSIGNAL ALL { int __semwait_signal_nocancel(int cond_sem, int mutex_sem, int timeout, int relative, int64_t tv_sec, int32_t tv_nsec) NO_SYSCALL_STUB;} +423 AUE_SEMWAITSIGNAL ALL { int __semwait_signal_nocancel(int cond_sem, int mutex_sem, int timeout, int relative, int64_t tv_sec, int32_t 
tv_nsec); } ;#endif 424 AUE_MAC_MOUNT ALL { int __mac_mount(char *type, char *path, int flags, caddr_t data, struct mac *mac_p); } 425 AUE_MAC_GET_MOUNT ALL { int __mac_get_mount(char *path, struct mac *mac_p); } @@ -645,12 +649,18 @@ 427 AUE_FSGETPATH ALL { user_ssize_t fsgetpath(user_addr_t buf, size_t bufsize, user_addr_t fsid, uint64_t objid) NO_SYSCALL_STUB; } { private fsgetpath (File Manager SPI) } 428 AUE_NULL ALL { mach_port_name_t audit_session_self(void); } 429 AUE_NULL ALL { int audit_session_join(mach_port_name_t port); } -430 AUE_NULL ALL { int pid_suspend(int pid); } -431 AUE_NULL ALL { int pid_resume(int pid); } +430 AUE_NULL ALL { int fileport_makeport(int fd, user_addr_t portnamep); } +431 AUE_NULL ALL { int fileport_makefd(mach_port_name_t port); } +432 AUE_NULL ALL { int audit_session_port(au_asid_t asid, user_addr_t portnamep); } +433 AUE_NULL ALL { int pid_suspend(int pid); } +434 AUE_NULL ALL { int pid_resume(int pid); } #if CONFIG_EMBEDDED -432 AUE_NULL ALL { int fileport_makeport(int fd, user_addr_t portnamep); } -433 AUE_NULL ALL { int fileport_makefd(mach_port_name_t port); } +435 AUE_NULL ALL { int pid_hibernate(int pid); } +436 AUE_NULL ALL { int pid_shutdown_sockets(int pid, int level); } #else -432 AUE_NULL ALL { int nosys(void); } -433 AUE_NULL ALL { int nosys(void); } +435 AUE_NULL ALL { int nosys(void); } +436 AUE_NULL ALL { int nosys(void); } #endif +437 AUE_NULL ALL { int nosys(void); } { old shared_region_slide_np } +438 AUE_NULL ALL { int shared_region_map_and_slide_np(int fd, uint32_t count, const struct shared_file_mapping_np *mappings, uint32_t slide, uint64_t* slide_start, uint32_t slide_size) NO_SYSCALL_STUB; } + diff --git a/bsd/kern/sysv_ipc.c b/bsd/kern/sysv_ipc.c index 95c23d418..8f56757c4 100644 --- a/bsd/kern/sysv_ipc.c +++ b/bsd/kern/sysv_ipc.c @@ -60,46 +60,101 @@ #include <sys/param.h> #include <sys/ipc.h> +#include <sys/stat.h> /* mode constants */ #include <sys/ucred.h> #include <sys/kauth.h> /* * Check for ipc permission - * - * XXX: Should pass proc argument so that we can pass - * XXX: proc->p_acflag to suser() */ + /* + * ipcperm + * + * perm->mode mode of the object + * mode_req mode bits we want to test + * * Returns: 0 Success * EPERM * EACCES + * + * Notes: The IPC_M bit is special, in that it may only be granted to + * root, the creating user, or the owning user. + * + * This code does not use posix_cred_access() because of the + * need to check both creator and owner separately when we are + * considering a rights grant. Because of this, we need to do + * two evaluations when the values are unequal, which can lead + * us to defeat the callout avoidance optimization. So we do + * the work here, inline. This is less than optimal for any + * future work involving opacity of POSIX credentials. + * + * Setting up the mode_owner / mode_group / mode_world implicitly + * masks the IPC_M bit off. This is intentional. + * + * See the posix_cred_access() implementation for algorithm + * information. */ int -ipcperm(kauth_cred_t cred, struct ipc_perm *perm, int mode) +ipcperm(kauth_cred_t cred, struct ipc_perm *perm, int mode_req) { + uid_t uid = kauth_cred_getuid(cred); /* avoid multiple calls */ + int want_mod_controlinfo = (mode_req & IPC_M); + int is_member; + mode_t mode_owner = (perm->mode & S_IRWXU); + mode_t mode_group = (perm->mode & S_IRWXG) << 3; + mode_t mode_world = (perm->mode & S_IRWXO) << 6; + /* Grant all rights to super user */ if (!suser(cred, (u_short *)NULL)) return (0); - /* Check for user match.
*/ - if (kauth_cred_getuid(cred) != perm->cuid && kauth_cred_getuid(cred) != perm->uid) { - int is_member; + /* Grant or deny rights based on ownership */ + if (uid == perm->cuid || uid == perm->uid) { + if (want_mod_controlinfo) + return (0); - if (mode & IPC_M) + return ((mode_req & mode_owner) == mode_req ? 0 : EACCES); + } else { + /* everyone else who wants to modify control info is denied */ + if (want_mod_controlinfo) return (EPERM); - /* Check for group match. */ - mode >>= 3; - if ((kauth_cred_ismember_gid(cred, perm->gid, &is_member) || !is_member) && - (kauth_cred_ismember_gid(cred, perm->cgid, &is_member) || !is_member)) { - /* Check for `other' match. */ - mode >>= 3; - } } - if (mode & IPC_M) + /* + * Combined group and world rights check when there are no owner + * rights; a positive assertion of gid/cgid equality avoids an extra + * callout in the common case. + */ + if ((mode_req & mode_group & mode_world) == mode_req) { return (0); - - return ((mode & perm->mode) == mode ? 0 : EACCES); + } else { + if ((mode_req & mode_group) != mode_req) { + if ((!kauth_cred_ismember_gid(cred, perm->gid, &is_member) && is_member) && + ((perm->gid == perm->cgid) || + (!kauth_cred_ismember_gid(cred, perm->cgid, &is_member) && is_member))) { + return (EACCES); + } else { + if ((mode_req & mode_world) != mode_req) { + return (EACCES); + } else { + return (0); + } + } + } else { + if ((!kauth_cred_ismember_gid(cred, perm->gid, &is_member) && is_member) || + ((perm->gid != perm->cgid) && + (!kauth_cred_ismember_gid(cred, perm->cgid, &is_member) && is_member))) { + return (0); + } else { + if ((mode_req & mode_world) != mode_req) { + return (EACCES); + } else { + return (0); + } + } + } + } }
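A simplified userland sketch of the grant logic ipcperm() above implements: primary gid equality stands in for the kauth group-membership callouts (the kernel version also consults supplementary groups for both gid and cgid), and the superuser short-circuit is omitted. IPC_R, IPC_W, and IPC_M are the classic BSD constants from <sys/ipc.h>; the S_IRWX* masks come from <sys/stat.h>.

#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/stat.h>
#include <errno.h>
#include <stdio.h>

static int
sketch_ipcperm(uid_t uid, gid_t gid, struct ipc_perm *perm, int mode_req)
{
	/* shift group/world bits into the owner positions, as above */
	mode_t mode_owner = (perm->mode & S_IRWXU);
	mode_t mode_group = (perm->mode & S_IRWXG) << 3;
	mode_t mode_world = (perm->mode & S_IRWXO) << 6;

	if (uid == perm->cuid || uid == perm->uid) {
		if (mode_req & IPC_M)
			return (0);	/* owner/creator may modify control info */
		return ((mode_req & mode_owner) == mode_req ? 0 : EACCES);
	}
	if (mode_req & IPC_M)
		return (EPERM);		/* everyone else may not */
	if (gid == perm->gid || gid == perm->cgid)
		return ((mode_req & mode_group) == mode_req ? 0 : EACCES);
	return ((mode_req & mode_world) == mode_req ? 0 : EACCES);
}

int
main(void)
{
	struct ipc_perm p = { 0 };

	p.uid = p.cuid = 501;
	p.gid = p.cgid = 20;
	p.mode = 0640;		/* rw- r-- --- */
	/* owner read: 0; group write: EACCES; world read: EACCES */
	printf("%d %d %d\n",
	    sketch_ipcperm(501, 20, &p, IPC_R),
	    sketch_ipcperm(502, 20, &p, IPC_W),
	    sketch_ipcperm(502, 21, &p, IPC_R));
	return (0);
}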
diff --git a/bsd/kern/sysv_msg.c b/bsd/kern/sysv_msg.c index 7ed083eb9..daca44630 100644 --- a/bsd/kern/sysv_msg.c +++ b/bsd/kern/sysv_msg.c @@ -667,8 +667,8 @@ msgget(__unused struct proc *p, struct msgget_args *uap, int32_t *retval) msqptr->u.msg_perm._key = key; msqptr->u.msg_perm.cuid = kauth_cred_getuid(cred); msqptr->u.msg_perm.uid = kauth_cred_getuid(cred); - msqptr->u.msg_perm.cgid = cred->cr_gid; - msqptr->u.msg_perm.gid = cred->cr_gid; + msqptr->u.msg_perm.cgid = kauth_cred_getgid(cred); + msqptr->u.msg_perm.gid = kauth_cred_getgid(cred); msqptr->u.msg_perm.mode = (msgflg & 0777); /* Make sure that the returned msqid is unique */ msqptr->u.msg_perm._seq++; @@ -1576,7 +1576,7 @@ IPCS_msg_sysctl(__unused struct sysctl_oid *oidp, __unused void *arg1, } SYSCTL_DECL(_kern_sysv_ipcs); -SYSCTL_PROC(_kern_sysv_ipcs, OID_AUTO, msg, CTLFLAG_RW|CTLFLAG_ANYBODY, +SYSCTL_PROC(_kern_sysv_ipcs, OID_AUTO, msg, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, 0, 0, IPCS_msg_sysctl, "S,IPCS_msg_command", "ipcs msg command interface"); diff --git a/bsd/kern/sysv_sem.c b/bsd/kern/sysv_sem.c index 0e44029cf..ed43ec893 100644 --- a/bsd/kern/sysv_sem.c +++ b/bsd/kern/sysv_sem.c @@ -999,8 +999,8 @@ semget(__unused struct proc *p, struct semget_args *uap, int32_t *retval) sema[semid].u.sem_perm._key = key; sema[semid].u.sem_perm.cuid = kauth_cred_getuid(cred); sema[semid].u.sem_perm.uid = kauth_cred_getuid(cred); - sema[semid].u.sem_perm.cgid = cred->cr_gid; - sema[semid].u.sem_perm.gid = cred->cr_gid; + sema[semid].u.sem_perm.cgid = kauth_cred_getgid(cred); + sema[semid].u.sem_perm.gid = kauth_cred_getgid(cred); sema[semid].u.sem_perm.mode = (semflg & 0777) | SEM_ALLOC; sema[semid].u.sem_perm._seq = (sema[semid].u.sem_perm._seq + 1) & 0x7fff; @@ -1092,6 +1092,15 @@ semop(struct proc *p, struct semop_args *uap, int32_t *retval) goto semopout; } + /* OK for LP64, since sizeof(struct sembuf) is currently invariant */ + if ((eval = copyin(uap->sops, &sops, nsops * sizeof(struct sembuf))) != 0) { +#ifdef SEM_DEBUG + printf("eval = %d from copyin(%08x, %08x, %ld)\n", eval, + uap->sops, &sops, nsops * sizeof(struct sembuf)); +#endif + goto semopout; + } + #if CONFIG_MACF /* * Initial pass thru sops to see what permissions are needed. @@ -1110,15 +1119,6 @@ semop(struct proc *p, struct semop_args *uap, int32_t *retval) goto semopout; #endif - /* OK for LP64, since sizeof(struct sembuf) is currently invariant */ - if ((eval = copyin(uap->sops, &sops, nsops * sizeof(struct sembuf))) != 0) { -#ifdef SEM_DEBUG - printf("eval = %d from copyin(%08x, %08x, %ld)\n", eval, - uap->sops, &sops, nsops * sizeof(struct sembuf)); -#endif - goto semopout; - } - /* * Loop trying to satisfy the vector of requests. * If we reach a point where we must wait, any requests already @@ -1539,19 +1539,19 @@ out: /* SYSCTL_NODE(_kern, KERN_SYSV, sysv, CTLFLAG_RW, 0, "SYSV"); */ extern struct sysctl_oid_list sysctl__kern_sysv_children; -SYSCTL_PROC(_kern_sysv, OID_AUTO, semmni, CTLTYPE_INT | CTLFLAG_RW, +SYSCTL_PROC(_kern_sysv, OID_AUTO, semmni, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &limitseminfo.semmni, 0, &sysctl_seminfo ,"I","semmni"); -SYSCTL_PROC(_kern_sysv, OID_AUTO, semmns, CTLTYPE_INT | CTLFLAG_RW, +SYSCTL_PROC(_kern_sysv, OID_AUTO, semmns, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &limitseminfo.semmns, 0, &sysctl_seminfo ,"I","semmns"); -SYSCTL_PROC(_kern_sysv, OID_AUTO, semmnu, CTLTYPE_INT | CTLFLAG_RW, +SYSCTL_PROC(_kern_sysv, OID_AUTO, semmnu, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &limitseminfo.semmnu, 0, &sysctl_seminfo ,"I","semmnu"); -SYSCTL_PROC(_kern_sysv, OID_AUTO, semmsl, CTLTYPE_INT | CTLFLAG_RW, +SYSCTL_PROC(_kern_sysv, OID_AUTO, semmsl, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &limitseminfo.semmsl, 0, &sysctl_seminfo ,"I","semmsl"); -SYSCTL_PROC(_kern_sysv, OID_AUTO, semume, CTLTYPE_INT | CTLFLAG_RW, +SYSCTL_PROC(_kern_sysv, OID_AUTO, semume, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &limitseminfo.semume, 0, &sysctl_seminfo ,"I","semume"); @@ -1662,7 +1662,7 @@ IPCS_sem_sysctl(__unused struct sysctl_oid *oidp, __unused void *arg1, } SYSCTL_DECL(_kern_sysv_ipcs); -SYSCTL_PROC(_kern_sysv_ipcs, OID_AUTO, sem, CTLFLAG_RW|CTLFLAG_ANYBODY, +SYSCTL_PROC(_kern_sysv_ipcs, OID_AUTO, sem, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, 0, 0, IPCS_sem_sysctl, "S,IPCS_sem_command", "ipcs sem command interface"); diff --git a/bsd/kern/sysv_shm.c b/bsd/kern/sysv_shm.c index 4a93dc597..25a484798 100644 --- a/bsd/kern/sysv_shm.c +++ b/bsd/kern/sysv_shm.c @@ -774,7 +774,7 @@ shmget_allocate_segment(struct proc *p, struct shmget_args *uap, int mode, shmid = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm); shmseg->u.shm_perm.cuid = shmseg->u.shm_perm.uid = kauth_cred_getuid(cred); - shmseg->u.shm_perm.cgid = shmseg->u.shm_perm.gid = cred->cr_gid; + shmseg->u.shm_perm.cgid = shmseg->u.shm_perm.gid = kauth_cred_getgid(cred); shmseg->u.shm_perm.mode = (shmseg->u.shm_perm.mode & SHMSEG_WANTED) | (mode & ACCESSPERMS) | SHMSEG_ALLOCATED; shmseg->u.shm_segsz = uap->size; @@ -1165,26 +1165,26 @@ ipcs_shm_sysctl_out: return(error); } -SYSCTL_NODE(_kern, KERN_SYSV, sysv, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "SYSV"); +SYSCTL_NODE(_kern, KERN_SYSV, sysv, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, 0, "SYSV"); -SYSCTL_PROC(_kern_sysv, OID_AUTO, shmmax, CTLTYPE_QUAD | CTLFLAG_RW, +SYSCTL_PROC(_kern_sysv, OID_AUTO, shmmax, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &shminfo.shmmax, 0, &sysctl_shminfo ,"Q","shmmax"); -SYSCTL_PROC(_kern_sysv, OID_AUTO, shmmin, CTLTYPE_QUAD | CTLFLAG_RW, +SYSCTL_PROC(_kern_sysv, OID_AUTO, shmmin, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &shminfo.shmmin, 0, &sysctl_shminfo ,"Q","shmmin"); -SYSCTL_PROC(_kern_sysv, OID_AUTO, shmmni, CTLTYPE_QUAD | CTLFLAG_RW, +SYSCTL_PROC(_kern_sysv, OID_AUTO, shmmni, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &shminfo.shmmni, 0, &sysctl_shminfo ,"Q","shmmni"); -SYSCTL_PROC(_kern_sysv, OID_AUTO, shmseg, CTLTYPE_QUAD | CTLFLAG_RW, +SYSCTL_PROC(_kern_sysv, OID_AUTO, shmseg, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &shminfo.shmseg, 0, &sysctl_shminfo ,"Q","shmseg"); -SYSCTL_PROC(_kern_sysv, OID_AUTO, shmall, CTLTYPE_QUAD | CTLFLAG_RW, +SYSCTL_PROC(_kern_sysv, OID_AUTO, shmall, CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &shminfo.shmall, 0, &sysctl_shminfo ,"Q","shmall"); -SYSCTL_NODE(_kern_sysv, OID_AUTO, ipcs, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "SYSVIPCS"); +SYSCTL_NODE(_kern_sysv, OID_AUTO, ipcs, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, 0, "SYSVIPCS"); -SYSCTL_PROC(_kern_sysv_ipcs, OID_AUTO, shm, CTLFLAG_RW|CTLFLAG_ANYBODY, +SYSCTL_PROC(_kern_sysv_ipcs, OID_AUTO, shm, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, 0, 0, IPCS_shm_sysctl, "S,IPCS_shm_command", "ipcs shm command interface");
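The new bsd/kern/trace.codes file that follows maps kdebug debugids to human-readable names. Each 32-bit value is composed as (class << 24) | (subclass << 16) | (code << 2), with the low two bits reserved for the DBG_FUNC_START/DBG_FUNC_END qualifiers, the KDBG_CODE() convention of <sys/kdebug.h>. A small sketch of that composition; the SKETCH_ macro names are hypothetical stand-ins for the real ones.

#include <stdio.h>
#include <stdint.h>

#define SKETCH_KDBG_CODE(cls, sub, code) \
	((uint32_t)(((cls) << 24) | ((sub) << 16) | ((code) << 2)))
#define SKETCH_DBG_FUNC_START	1
#define SKETCH_DBG_FUNC_END	2

int
main(void)
{
	/* class 4 = BSD, subclass 0x0C = syscalls: 0x40c0004 is BSC_exit below */
	uint32_t id = SKETCH_KDBG_CODE(4, 0x0C, 1);

	printf("0x%x start=0x%x end=0x%x\n",
	    id, id | SKETCH_DBG_FUNC_START, id | SKETCH_DBG_FUNC_END);
	return (0);
}

The table lists the base ids; tracing tools OR in the start/end qualifier bits at emit time.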
diff --git a/bsd/kern/trace.codes b/bsd/kern/trace.codes new file mode 100644 index 000000000..3792f3d37 --- /dev/null +++ b/bsd/kern/trace.codes @@ -0,0 +1,2149 @@ +0x1020000 KTrap_DivideError +0x1020004 KTrap_Debug +0x1020008 KTrap_NMI +0x102000c KTrap_Int3 +0x1020010 KTrap_Overflow +0x1020014 KTrap_BoundRange +0x1020018 KTrap_InvalidOpcode +0x102001c KTrap_DeviceNotAvail +0x1020020 KTrap_DoubleFault +0x1020024 KTrap_Coprocessor +0x1020028 KTrap_InvalidTSS +0x102002c KTrap_SegmentNotPresent +0x1020030 KTrap_StackFault +0x1020034 KTrap_GeneralProtection +0x1020038 KTrap_PageFault +0x102003c KTrap_unknown +0x1020040 KTrap_FloatPointError +0x1020044 KTrap_AlignmentCheck +0x1020048 KTrap_MachineCheck +0x102004c KTrap_SIMD_FP +0x10203fc KTrap_Preempt +0x1050000 INTERRUPT +0x1070000 UTrap_DivideError +0x1070004 UTrap_Debug +0x1070008 UTrap_NMI +0x107000c UTrap_Int3 +0x1070010 UTrap_Overflow +0x1070014 UTrap_BoundRange +0x1070018 UTrap_InvalidOpcode +0x107001c UTrap_DeviceNotAvail +0x1070020 UTrap_DoubleFault +0x1070024 UTrap_Coprocessor +0x1070028 UTrap_InvalidTSS +0x107002c UTrap_SegmentNotPresent +0x1070030 UTrap_StackFault +0x1070034 UTrap_GeneralProtection +0x1070038 UTrap_PageFault +0x107003c UTrap_unknown +0x1070040 UTrap_FloatPointError +0x1070044 UTrap_AlignmentCheck +0x1070048 UTrap_MachineCheck +0x107004c UTrap_SIMD_FP +0x1090000 DecrTrap +0x1090004 DecrSet +0x1090008 TimerCallIntr +0x109000c pmsStep +0x1090010 TimerMigration +0x1090014 rdHPET +0x1090018 set_tsc_deadline +0x10c0000 MACH_SysCall +0x10c0004 MSC_kern_invalid_#1 +0x10c0008 MSC_kern_invalid_#2 +0x10c000c MSC_kern_invalid_#3 +0x10c0010 MSC_kern_invalid_#4 +0x10c0014 MSC_kern_invalid_#5 +0x10c0018 MSC_kern_invalid_#6 +0x10c001c MSC_kern_invalid_#7 +0x10c0020 MSC_kern_invalid_#8 +0x10c0024 MSC_kern_invalid_#9 +0x10c0028 MSC_kern_invalid_#10 +0x10c002c MSC_kern_invalid_#11 +0x10c0030 MSC_kern_invalid_#12 +0x10c0034 MSC_kern_invalid_#13 +0x10c0038 MSC_kern_invalid_#14 +0x10c003c MSC_kern_invalid_#15 +0x10c0040 MSC_kern_invalid_#16 +0x10c0044 MSC_kern_invalid_#17 +0x10c0048 MSC_kern_invalid_#18 +0x10c004c MSC_kern_invalid_#19 +0x10c0050 MSC_kern_invalid_#20 +0x10c0054 MSC_kern_invalid_#21 +0x10c0058 MSC_kern_invalid_#22
+0x10c005c MSC_kern_invalid_#23 +0x10c0060 MSC_kern_invalid_#24 +0x10c0064 MSC_kern_invalid_#25 +0x10c0068 MSC_mach_reply_port +0x10c006c MSC_thread_self_trap +0x10c0070 MSC_task_self_trap +0x10c0074 MSC_host_self_trap +0x10c0078 MSC_kern_invalid_#30 +0x10c007c MSC_mach_msg_trap +0x10c0080 MSC_mach_msg_overwrite_trap +0x10c0084 MSC_semaphore_signal_trap +0x10c0088 MSC_semaphore_signal_all_trap +0x10c008c MSC_semaphore_signal_thread_trap +0x10c0090 MSC_semaphore_wait_trap +0x10c0094 MSC_semaphore_wait_signal_trap +0x10c0098 MSC_semaphore_timedwait_trap +0x10c009c MSC_semaphore_timedwait_signal_trap +0x10c00a0 MSC_kern_invalid_#40 +0x10c00a4 MSC_kern_invalid_#41 +0x10c00a8 MSC_kern_invalid_#42 +0x10c00ac MSC_map_fd +0x10c00b0 MSC_task_name_for_pid +0x10c00b4 MSC_task_for_pid +0x10c00b8 MSC_pid_for_task +0x10c00bc MSC_kern_invalid_#47 +0x10c00c0 MSC_macx_swapon +0x10c00c4 MSC_macx_swapoff +0x10c00c8 MSC_kern_invalid_#50 +0x10c00cc MSC_macx_triggers +0x10c00d0 MSC_macx_backing_store_suspend +0x10c00d4 MSC_macx_backing_store_recovery +0x10c00d8 MSC_kern_invalid_#54 +0x10c00dc MSC_kern_invalid_#55 +0x10c00e0 MSC_kern_invalid_#56 +0x10c00e4 MSC_kern_invalid_#57 +0x10c00e8 MSC_pfz_exit +0x10c00ec MSC_swtch_pri +0x10c00f0 MSC_swtch +0x10c00f4 MSC_thread_switch +0x10c00f8 MSC_clock_sleep_trap +0x10c00fc MSC_kern_invalid_#63 +0x10c0100 MSC_kern_invalid_#64 +0x10c0104 MSC_kern_invalid_#65 +0x10c0108 MSC_kern_invalid_#66 +0x10c010c MSC_kern_invalid_#67 +0x10c0110 MSC_kern_invalid_#68 +0x10c0114 MSC_kern_invalid_#69 +0x10c0118 MSC_kern_invalid_#70 +0x10c011c MSC_kern_invalid_#71 +0x10c0120 MSC_kern_invalid_#72 +0x10c0124 MSC_kern_invalid_#73 +0x10c0128 MSC_kern_invalid_#74 +0x10c012c MSC_kern_invalid_#75 +0x10c0130 MSC_kern_invalid_#76 +0x10c0134 MSC_kern_invalid_#77 +0x10c0138 MSC_kern_invalid_#78 +0x10c013c MSC_kern_invalid_#79 +0x10c0140 MSC_kern_invalid_#80 +0x10c0144 MSC_kern_invalid_#81 +0x10c0148 MSC_kern_invalid_#82 +0x10c014c MSC_kern_invalid_#83 +0x10c0150 MSC_kern_invalid_#84 +0x10c0154 MSC_kern_invalid_#85 +0x10c0158 MSC_kern_invalid_#86 +0x10c015c MSC_kern_invalid_#87 +0x10c0160 MSC_kern_invalid_#88 +0x10c0164 MSC_mach_timebase_info +0x10c0168 MSC_mach_wait_until +0x10c016c MSC_mk_timer_create +0x10c0170 MSC_mk_timer_destroy +0x10c0174 MSC_mk_timer_arm +0x10c0178 MSC_mk_timer_cancel +0x10c017c MSC_kern_invalid_#95 +0x10c0180 MSC_kern_invalid_#96 +0x10c0184 MSC_kern_invalid_#97 +0x10c0188 MSC_kern_invalid_#98 +0x10c018c MSC_kern_invalid_#99 +0x10c0190 MSC_iokit_user_client +0x10c0194 MSC_kern_invalid_#101 +0x10c0198 MSC_kern_invalid_#102 +0x10c019c MSC_kern_invalid_#103 +0x10c01a0 MSC_kern_invalid_#104 +0x10c01a4 MSC_kern_invalid_#105 +0x10c01a8 MSC_kern_invalid_#106 +0x10c01ac MSC_kern_invalid_#107 +0x10c01b0 MSC_kern_invalid_#108 +0x10c01b4 MSC_kern_invalid_#109 +0x10c01b8 MSC_kern_invalid_#110 +0x10c01bc MSC_kern_invalid_#111 +0x10c01c0 MSC_kern_invalid_#112 +0x10c01c4 MSC_kern_invalid_#113 +0x10c01c8 MSC_kern_invalid_#114 +0x10c01cc MSC_kern_invalid_#115 +0x10c01d0 MSC_kern_invalid_#116 +0x10c01d4 MSC_kern_invalid_#117 +0x10c01d8 MSC_kern_invalid_#118 +0x10c01dc MSC_kern_invalid_#119 +0x10c01e0 MSC_kern_invalid_#120 +0x10c01e4 MSC_kern_invalid_#121 +0x10c01e8 MSC_kern_invalid_#122 +0x10c01ec MSC_kern_invalid_#123 +0x10c01f0 MSC_kern_invalid_#124 +0x10c01f4 MSC_kern_invalid_#125 +0x10c01f8 MSC_kern_invalid_#126 +0x10c01fc MSC_kern_invalid_#127 +0x1300004 MACH_Pageout +0x1300008 MACH_vmfault +0x1300100 MACH_purgable_token_add +0x1300104 MACH_purgable_token_delete +0x1300108 
MACH_purgable_token_ripened +0x130010c MACH_purgable_token_purged +0x1300400 MACH_vm_check_zf_delay +0x1300404 MACH_vm_cow_delay +0x1300408 MACH_vm_zf_delay +0x1300410 MACH_vm_pageout_scan +0x1300414 MACH_vm_pageout_balanceQ +0x1300418 MACH_vm_pageout_freelist +0x130041c MACH_vm_pageout_purge_one +0x1300420 MACH_vm_pageout_cache_evict +0x1300424 MACH_vm_pageout_thread_block +0x1300480 MACH_vm_upl_page_wait +0x1300484 MACH_vm_iopl_page_wait +0x1400000 MACH_SCHED +0x1400004 MACH_STKATTACH +0x1400008 MACH_STKHANDOFF +0x140000c MACH_CALLCONT +0x1400010 MACH_CALLOUT +0x1400014 MACH_ServiceT +0x1400018 MACH_MKRUNNABLE +0x140001c MACH_PROMOTE +0x1400020 MACH_DEMOTE +0x1400024 MACH_IDLE +0x1400028 MACH_STACK_DEPTH +0x140002c MACH_MOVED +0x1400030 MACH_FAIRSHARE_ENTER +0x1400034 MACH_FAIRSHARE_EXIT +0x1400038 MACH_FAILSAFE +0x1400040 MACH_STKHANDOFF_BT +0x1400044 MACH_SCHED_BT +0x1400048 MACH_IDLE_BT +0x1400050 MACH_SCHED_GET_URGENCY +0x1400054 MACH_SCHED_URGENCY +0x1400058 MACH_SCHED_REDISPATCH +0x140005C MACH_SCHED_REMOTE_AST +0x1400060 MACH_SCHED_LPA_BROKEN +0x1500000 MACH_MSGID_INVALID +0x1600000 MTX_SLEEP +0x1600004 MTX_SLEEP_DEADLINE +0x1600008 MTX_WAIT +0x160000c MTX_WAKEUP +0x1600010 MTX_LOCK +0x1600014 MTX_UNLOCK +0x1600080 MTX_x86_wait +0x1600084 MTX_x86_wakeup +0x1600088 MTX_x86_spin +0x160008c MTX_x86_acquire +0x1600090 MTX_x86_demote +0x1600200 MTX_full_lock +0x1600400 RW_EXCL_WaitForWriter +0x1600404 RW_EXCL_WaitForReaders +0x1600408 RW_SHRD_WaitForWriter +0x160040c RW_SHRDtoEXCL_FailedUpgrade +0x1600410 RW_SHRDtoEXCL_WaitForReaders +0x1600414 RW_EXCLtoSHRD +0x1600418 RW_EXCL_SpinForWriter +0x160041c RW_EXCL_WaitForWriter +0x1600420 RW_EXCL_SpinForReaders +0x1600424 RW_EXCL_WaitForReaders +0x1600428 RW_SHRD_unlock +0x160042c RW_EXCL_unlock +0x1600440 RW_SHRD_SpinForWriter +0x1600444 RW_SHRD_WaitForWriter +0x1600448 RW_SHRDtoEXCL_SpinForReaders +0x160044c RW_SHRDtoEXCL_WaitForReaders +0x1700000 PMAP_create +0x1700004 PMAP_destroy +0x1700008 PMAP_protect +0x170000c PMAP_page_protect +0x1700010 PMAP_enter +0x1700014 PMAP_remove +0x1700018 PMAP_nest +0x170001c PMAP_unnest +0x1700020 PMAP_flush_TLBS +0x1700024 PMAP_update_interrupt +0x1700028 PMAP_attribute_clear +0x2010000 L_IP_In_Beg +0x2010004 L_IP_Out_Beg +0x2010008 L_IP_In_End +0x201000c L_IP_Out_End +0x2010404 F_IP_Output +0x2010800 F_IP_Input +0x2010c00 F_In_CkSum +0x2020000 L_ARP_Req +0x2020004 L_ARP_Resp +0x2020008 L_ARP_Reply +0x202000c L_ARP_Timo +0x2020010 L_ARP_Look +0x2020014 L_ARP_Input +0x2030000 L_UDP_In_Beg +0x2030004 L_UDP_Out_Beg +0x2030008 L_UDP_In_End +0x203000c L_UDP_Out_End +0x2031400 F_UDP_Input +0x2031804 F_UDP_Output +0x2040000 L_TCP_In_Beg +0x2040004 L_TCP_Out_Beg +0x2040008 L_TCP_In_End +0x204000c L_TCP_Out_End +0x2040c00 F_TCP_Input +0x2041004 F_TCP_Output +0x2041400 F_TCP_FastT +0x2041404 F_TCP_SlowT +0x2041408 F_TCP_Close +0x2041800 F_PCB_Lookup +0x2041804 F_PCB_HshLkup +0x2041c00 F_TCP_NewConn +0x2041d00 F_TCP_gotSync +0x20b0010 F_SBDrop +0x20b0014 F_SBAppend +0x20b0404 F_SendMsg +0x20b0804 F_SendTo +0x20b0c04 F_SendIt +0x20b1004 F_SoSend +0x20b1008 F_SoSend_CopyD +0x20b1400 F_RecvFrom +0x20b1800 F_RecvMsg +0x20b1c00 F_RecvIt +0x20b2000 F_SoReceive +0x20b2100 F_SoShutdown +0x20b2400 F_SoAccept +0x20b2800 F_sendfile +0x20b2804 F_sendfile_wait +0x20b2808 F_sendfile_read +0x20b280c F_sendfile_send +0x2650004 AT_DDPinput +0x2f00000 F_FreemList +0x2f00004 F_m_copym +0x2f00008 F_getpackets +0x2f0000c F_getpackethdrs +0x3010000 HFS_Write +0x3010004 HFS_Fsync +0x3010008 HFS_Close +0x301000c HFS_Remove +0x3010010 
HFS_Create +0x3010014 HFS_Inactive +0x3010018 HFS_Reclaim +0x301001C HFS_Truncate +0x3010028 vinvalbuf +0x3010030 HFS_Read +0x3010034 HFS_RL_ADD +0x3010038 HFS_RL_REMOVE +0x301003c MACH_copyiostr +0x3010040 UIO_copyout +0x3010044 UIO_copyin +0x3010048 MACH_copyio +0x301004c Cl_bp +0x3010050 Cl_iodone +0x3010054 Cl_ubc_dump +0x3010058 Cl_io +0x301005c Cl_zero +0x3010060 Cl_cmap +0x3010068 Cl_ioread +0x301006c Cl_iowrite +0x3010070 Cl_ioabort +0x3010074 Cl_zero_commit +0x3010078 Cl_wrdel_commit +0x301007c Cl_read_abort +0x3010080 Cl_read_copy +0x3010084 Cl_read_list_req +0x3010088 Cl_phys_uiomove +0x301008c Cl_read_commit +0x3010090 VFS_LOOKUP +0x3010094 Cl_read_uplmap +0x3010098 Cl_read_uplunmap +0x301009C VFS_LOOKUP_DONE +0x30100a0 Cl_write_copy +0x30100a4 Cl_write_list_req +0x30100a8 Cl_write_uiomove +0x30100ac Cl_write_zeros +0x30100b0 Cl_write_delayed +0x30100b4 Cl_write_abort +0x30100b8 Cl_zero_info +0x30100c0 Cl_rd_ahead +0x30100c4 Cl_rd_prefetch +0x30100c8 Cl_rd_prefabort +0x30100cc Cl_writepush +0x30100d0 Cl_pageout +0x30100d4 Cl_push +0x30100e0 Cl_pagein +0x30100f0 Cl_advisory_rd +0x30100f4 Cl_adv_fault_list +0x30100f8 Cl_adv_abort1 +0x30100fc Cl_adv_abort2 +0x3010118 Cl_read_direct +0x301011c Cl_ncpr_uiomv +0x3010120 Cl_ncpr_getupl +0x3010124 Cl_ncpr_clio +0x301012c Cl_write_direct +0x3010130 Cl_ncpw_getupl +0x3010134 Cl_ncpw_clio +0x3010138 Cl_sparse_collect +0x301013c Cl_sparse_push +0x3010140 Cl_sparse_add +0x3010144 Cl_release +0x3010148 Cl_drt_emptyfree +0x301014c Cl_drt_retcluster +0x3010150 Cl_drt_alloctable +0x3010154 Cl_drt_insert +0x3010158 Cl_drt_mark +0x301015c Cl_drt_6 +0x3010160 Cl_drt_freetable +0x3010170 Cl_read_contig_getupl +0x3010174 Cl_write_contig_getupl +0x3010178 Cl_io_type +0x301017c Cl_wait_IO +0x3010180 Vnode_Pagein +0x3010184 throttle_lowpri_io +0x3010200 Vnode_Pageout +0x3010280 Vnode_WaitForWrites +0x3010300 PageoutThrottle +0x3010340 SuperCluster +0x3010344 PS_Offsets +0x3010348 PS_Indexes +0x301034c Dirty_Indexes +0x3010350 PS_Write +0x3010354 PS_WriteComplete +0x3010380 PageoutCollect +0x3010384 PagesOnInactive_Q +0x3010388 PagesOnActive_Q +0x301038c PageoutScan +0x3010390 PageoutWait +0x3010394 PageoutWakeup1 +0x3010398 PageoutWakeup2 +0x301039c PageoutWakeup3 +0x3010400 NFS_doio +0x3010404 NFS_doio_offsets +0x3010408 NFS_doio_zero_read +0x301040c NFS_doio_zero_write +0x3010410 NFS_doio_invalidate +0x3010414 NFS_doio_retry +0x3010418 NFS_doio_done +0x3010500 NFS_pagein_zero +0x3010504 NFS_pageout_zero +0x3010508 NFS_pagein +0x301050c NFS_pageout +0x3010600 BIO_write_list_req +0x3010604 BIO_getblk_list_req +0x3010608 BIO_getblk +0x301060c BIO_biodone +0x3010610 BIO_brelse +0x3010614 BIO_recovered_buf +0x3010618 BIO_dumped_buf +0x301061c BIO_write_delayed +0x3010620 BIO_acquire_error +0x3010624 BIO_write_async +0x3010628 BIO_write_sync +0x301062c BIO_flushdirty +0x3010630 BIO_getblk_msleep +0x3010700 VM_pageout_list_req +0x3010704 VM_pagein_list_req +0x3010800 NFS_setattr +0x3010804 NFS_getattr +0x3010808 NFS_read +0x301080c NFS_write +0x3010810 NFS_truncate +0x3010814 NFS_flush +0x3010818 NFS_flush_again +0x301081c NFS_flush_bvec +0x3010820 NFS_flush_upls +0x3010824 NFS_commit +0x3010828 NFS_flush_commit +0x301082c NFS_flush_done +0x3010830 NFS_flush_busy +0x3010834 NFS_flush_bwrite +0x3010838 NFS_flush_normal +0x301083c NFS_loadattrcache +0x3010840 NFS_getattrcache +0x3010844 NFS_connect +0x3010848 NFS_reply +0x301084c NFS_request +0x3010850 NFS_softterm +0x3010854 NFS_rcvunlock +0x3010858 NFS_rcvlock +0x301085c NFS_timer +0x3010860 NFS_vinvalbuf 
+0x3010864 NFS_srvcommit +0x3010868 NFS_srvfsync +0x301086c NFS_RdAhead +0x3010870 NFS_srvread +0x3010874 NFS_srvVOPREAD +0x3010900 UBC_setsize +0x3010904 UBC_sync_range +0x3010908 UBC_upl_abort_range +0x301090c UBC_upl_commit_range +0x3011000 UPL_iopl_req +0x3011004 UPL_upl_req +0x3011008 UPL_abort_range +0x301100c UPL_abort +0x3011010 UPL_commit_range +0x3011014 UPL_commit +0x3011018 UPL_destroy +0x301101c UPL_commit_range_active +0x3011020 UPL_commit_range_inactive +0x3011024 UPL_map_enter_upl +0x3011028 UPL_map_remove_upl +0x301102c UPL_commit_range_speculative +0x3020000 P_WrData +0x3020004 P_WrDataDone +0x3020008 P_RdData +0x302000C P_RdDataDone +0x3020010 P_WrDataAsync +0x3020014 P_WrDataAsyncDone +0x3020018 P_RdDataAsync +0x302001C P_RdDataAsyncDone +0x3020020 P_WrMeta +0x3020024 P_WrMetaDone +0x3020028 P_RdMeta +0x302002C P_RdMetaDone +0x3020030 P_WrMetaAsync +0x3020034 P_WrMetaAsyncDone +0x3020038 P_RdMetaAsync +0x302003C P_RdMetaAsyncDone +0x3020040 P_PgOut +0x3020044 P_PgOutDone +0x3020048 P_PgIn +0x302004C P_PgInDone +0x3020050 P_PgOutAsync +0x3020054 P_PgOutAsyncDone +0x3020058 P_PgInAsync +0x302005C P_PgInAsyncDone +0x3020080 P_WrDataT +0x3020084 P_WrDataTDone +0x3020088 P_RdDataT +0x302008C P_RdDataTDone +0x3020090 P_WrDataAsyncT +0x3020094 P_WrDataAsyncTDone +0x3020098 P_RdDataAsyncT +0x302009C P_RdDataAsyncTDone +0x30200a0 P_WrMetaT +0x30200A4 P_WrMetaTDone +0x30200a8 P_RdMetaT +0x30200AC P_RdMetaTDone +0x30200b0 P_WrMetaAsyncT +0x30200B4 P_WrMetaAsyncTDone +0x30200b8 P_RdMetaAsyncT +0x30200BC P_RdMetaAsyncTDone +0x30200c0 P_PgOutT +0x30200C4 P_PgOutTDone +0x30200c8 P_PgInT +0x30200CC P_PgInTDone +0x30200d0 P_PgOutAsyncT +0x30200D4 P_PgOutAsyncTDone +0x30200d8 P_PgInAsyncT +0x30200DC P_PgInAsyncTDone +0x3020100 P_WrDataP +0x3020104 P_WrDataPDone +0x3020108 P_RdDataP +0x302010C P_RdDataPDone +0x3020110 P_WrDataAsyncP +0x3020114 P_WrDataAsyncPDone +0x3020118 P_RdDataAsyncP +0x302011C P_RdDataAsyncPDone +0x3020120 P_WrMetaP +0x3020124 P_WrMetaPDone +0x3020128 P_RdMetaP +0x302012C P_RdMetaPDone +0x3020130 P_WrMetaAsyncP +0x3020134 P_WrMetaAsyncPDone +0x3020138 P_RdMetaAsyncP +0x302013C P_RdMetaAsyncPDone +0x3020140 P_PgOutP +0x3020144 P_PgOutPDone +0x3020148 P_PgInP +0x302014C P_PgInPDone +0x3020150 P_PgOutAsyncP +0x3020154 P_PgOutAsyncPDone +0x3020158 P_PgInAsyncP +0x302015C P_PgInAsyncPDone +0x3050004 journal_flush +0x3070004 BootCache_tag +0x3070008 BootCache_batch +0x4010004 proc_exit +0x4010008 force_exit +0x40c0000 BSC_SysCall +0x40c0004 BSC_exit +0x40c0008 BSC_fork +0x40c000c BSC_read +0x40c0010 BSC_write +0x40c0014 BSC_open +0x40c0018 BSC_close +0x40c001c BSC_wait4 +0x40c0020 BSC_obs_creat +0x40c0024 BSC_link +0x40c0028 BSC_unlink +0x40c002c BSC_obs_execv +0x40c0030 BSC_chdir +0x40c0034 BSC_fchdir +0x40c0038 BSC_mknod +0x40c003c BSC_chmod +0x40c0040 BSC_chown +0x40c0044 BSC_obs_break +0x40c0048 BSC_getfsstat +0x40c004c BSC_obs_lseek +0x40c0050 BSC_getpid +0x40c0054 BSC_obs_mount +0x40c0058 BSC_obs_unmount +0x40c005c BSC_setuid +0x40c0060 BSC_getuid +0x40c0064 BSC_geteuid +0x40c0068 BSC_ptrace +0x40c006c BSC_recvmsg +0x40c0070 BSC_sendmsg +0x40c0074 BSC_recvfrom +0x40c0078 BSC_accept +0x40c007c BSC_getpeername +0x40c0080 BSC_getsockname +0x40c0084 BSC_access +0x40c0088 BSC_chflags +0x40c008c BSC_fchflags +0x40c0090 BSC_sync +0x40c0094 BSC_kill +0x40c0098 BSC_obs_stat +0x40c009c BSC_getppid +0x40c00a0 BSC_obs_lstat +0x40c00a4 BSC_dup +0x40c00a8 BSC_pipe +0x40c00ac BSC_getegid +0x40c00b0 BSC_profil +0x40c00b4 BSC_obs_ktrace +0x40c00b8 BSC_sigaction +0x40c00bc BSC_getgid 
+0x40c00c0 BSC_sigprocmask +0x40c00c4 BSC_getlogin +0x40c00c8 BSC_setlogin +0x40c00cc BSC_acct +0x40c00d0 BSC_sigpending +0x40c00d4 BSC_sigaltstack +0x40c00d8 BSC_ioctl +0x40c00dc BSC_reboot +0x40c00e0 BSC_revoke +0x40c00e4 BSC_symlink +0x40c00e8 BSC_readlink +0x40c00ec BSC_execve +0x40c00f0 BSC_umask +0x40c00f4 BSC_chroot +0x40c00f8 BSC_obs_fstat +0x40c00fc BSC_#63 +0x40c0100 BSC_obs_getpagesize +0x40c0104 BSC_msync +0x40c0108 BSC_vfork +0x40c010c BSC_obs_vread +0x40c0110 BSC_obs_vwrite +0x40c0114 BSC_obs_sbrk +0x40c0118 BSC_obs_sstk +0x40c011c BSC_obs_mmap +0x40c0120 BSC_obs_vadvise +0x40c0124 BSC_munmap +0x40c0128 BSC_mprotect +0x40c012c BSC_madvise +0x40c0130 BSC_obs_vhangup +0x40c0134 BSC_obs_vlimit +0x40c0138 BSC_mincore +0x40c013c BSC_getgroups +0x40c0140 BSC_setgroups +0x40c0144 BSC_getpgrp +0x40c0148 BSC_setpgid +0x40c014c BSC_setitimer +0x40c0150 BSC_obs_wait +0x40c0154 BSC_swapon +0x40c0158 BSC_getitimer +0x40c015c BSC_obs_gethostname +0x40c0160 BSC_obs_sethostname +0x40c0164 BSC_getdtablesize +0x40c0168 BSC_dup2 +0x40c016c BSC_obs_getdopt +0x40c0170 BSC_fcntl +0x40c0174 BSC_select +0x40c0178 BSC_obs_setdopt +0x40c017c BSC_fsync +0x40c0180 BSC_setpriority +0x40c0184 BSC_socket +0x40c0188 BSC_connect +0x40c018c BSC_obs_accept +0x40c0190 BSC_getpriority +0x40c0194 BSC_obs_send +0x40c0198 BSC_obs_recv +0x40c019c BSC_obs_sigreturn +0x40c01a0 BSC_bind +0x40c01a4 BSC_setsockopt +0x40c01a8 BSC_listen +0x40c01ac BSC_obs_vtimes +0x40c01b0 BSC_obs_sigvec +0x40c01b4 BSC_obs_sigblock +0x40c01b8 BSC_obs_sigsetmask +0x40c01bc BSC_sigsuspend +0x40c01c0 BSC_obs_sigstack +0x40c01c4 BSC_obs_recvmsg +0x40c01c8 BSC_obs_sendmsg +0x40c01cc BSC_obs_vtrace +0x40c01d0 BSC_gettimeofday +0x40c01d4 BSC_getrusage +0x40c01d8 BSC_getsockopt +0x40c01dc BSC_obs_resuba +0x40c01e0 BSC_readv +0x40c01e4 BSC_writev +0x40c01e8 BSC_settimeofday +0x40c01ec BSC_fchown +0x40c01f0 BSC_fchmod +0x40c01f4 BSC_obs_recvfrom +0x40c01f8 BSC_setreuid +0x40c01fc BSC_setregid +0x40c0200 BSC_rename +0x40c0204 BSC_obs_truncate +0x40c0208 BSC_obs_ftruncate +0x40c020c BSC_flock +0x40c0210 BSC_mkfifo +0x40c0214 BSC_sendto +0x40c0218 BSC_shutdown +0x40c021c BSC_socketpair +0x40c0220 BSC_mkdir +0x40c0224 BSC_rmdir +0x40c0228 BSC_utimes +0x40c022c BSC_futimes +0x40c0230 BSC_adjtime +0x40c0234 BSC_obs_getpeername +0x40c0238 BSC_gethostuuid +0x40c023c BSC_obs_sethostid +0x40c0240 BSC_obs_getrlimit +0x40c0244 BSC_obs_setrlimit +0x40c0248 BSC_obs_killpg +0x40c024c BSC_setsid +0x40c0250 BSC_obs_setquota +0x40c0254 BSC_obs_qquota +0x40c0258 BSC_obs_getsockname +0x40c025c BSC_getpgid +0x40c0260 BSC_setprivexec +0x40c0264 BSC_pread +0x40c0268 BSC_pwrite +0x40c026c BSC_nfssvc +0x40c0270 BSC_obs_getdirentries +0x40c0274 BSC_statfs +0x40c0278 BSC_fstatfs +0x40c027c BSC_unmount +0x40c0280 BSC_obs_async_daemon +0x40c0284 BSC_getfh +0x40c0288 BSC_obs_getdomainname +0x40c028c BSC_obs_setdomainname +0x40c0290 BSC_#164 +0x40c0294 BSC_quotactl +0x40c0298 BSC_obs_exportfs +0x40c029c BSC_mount +0x40c02a0 BSC_obs_ustat +0x40c02a4 BSC_csops +0x40c02a8 BSC_obs_table +0x40c02ac BSC_obs_wait3 +0x40c02b0 BSC_obs_rpause +0x40c02b4 BSC_waitid +0x40c02b8 BSC_obs_getdents +0x40c02bc BSC_obs_gc_control +0x40c02c0 BSC_add_profil +0x40c02c4 BSC_#177 +0x40c02c8 BSC_#178 +0x40c02cc BSC_#179 +0x40c02d0 BSC_kdebug_trace +0x40c02d4 BSC_setgid +0x40c02d8 BSC_setegid +0x40c02dc BSC_seteuid +0x40c02e0 BSC_sigreturn +0x40c02e4 BSC_chud +0x40c02e8 BSC_#186 +0x40c02ec BSC_fdatasync +0x40c02f0 BSC_stat +0x40c02f4 BSC_fstat +0x40c02f8 BSC_lstat +0x40c02fc BSC_pathconf +0x40c0300 
BSC_fpathconf +0x40c0304 BSC_#193 +0x40c0308 BSC_getrlimit +0x40c030c BSC_setrlimit +0x40c0310 BSC_getdirentries +0x40c0314 BSC_mmap +0x40c0318 BSC_obs__syscall +0x40c031c BSC_lseek +0x40c0320 BSC_truncate +0x40c0324 BSC_ftruncate +0x40c0328 BSC_sysctl +0x40c032c BSC_mlock +0x40c0330 BSC_munlock +0x40c0334 BSC_undelete +0x40c0338 BSC_ATsocket +0x40c033c BSC_ATgetmsg +0x40c0340 BSC_ATputmsg +0x40c0344 BSC_ATPsndreq +0x40c0348 BSC_ATPsndrsp +0x40c034c BSC_ATPgetreq +0x40c0350 BSC_ATPgetrsp +0x40c0354 BSC_#213 +0x40c0358 BSC_#214 +0x40c035c BSC_#215 +0x40c0360 BSC_mkcomplex +0x40c0364 BSC_statv +0x40c0368 BSC_lstatv +0x40c036c BSC_fstatv +0x40c0370 BSC_getattrlist +0x40c0374 BSC_setattrlist +0x40c0378 BSC_getdirentriesattr +0x40c037c BSC_exchangedata +0x40c0380 BSC_#224 +0x40c0384 BSC_searchfs +0x40c0388 BSC_delete_Carbon +0x40c038c BSC_copyfile +0x40c0390 BSC_fgetattrlist +0x40c0394 BSC_fsetattrlist +0x40c0398 BSC_poll +0x40c039c BSC_watchevent +0x40c03a0 BSC_waitevent +0x40c03a4 BSC_modwatch +0x40c03a8 BSC_getxattr +0x40c03ac BSC_fgetxattr +0x40c03b0 BSC_setxattr +0x40c03b4 BSC_fsetxattr +0x40c03b8 BSC_removexattr +0x40c03bc BSC_fremovexattr +0x40c03c0 BSC_listxattr +0x40c03c4 BSC_flistxattr +0x40c03c8 BSC_fsctl +0x40c03cc BSC_initgroups +0x40c03d0 BSC_posix_spawn +0x40c03d4 BSC_ffsctl +0x40c03d8 BSC_#246 +0x40c03dc BSC_nfsclnt +0x40c03e0 BSC_fhopen +0x40c03e4 BSC_#249 +0x40c03e8 BSC_minherit +0x40c03ec BSC_semsys +0x40c03f0 BSC_msgsys +0x40c03f4 BSC_shmsys +0x40c03f8 BSC_semctl +0x40c03fc BSC_semget +0x40c0400 BSC_semop +0x40c0404 BSC_#257 +0x40c0408 BSC_msgctl +0x40c040c BSC_msgget +0x40c0410 BSC_msgsnd +0x40c0414 BSC_msgrcv +0x40c0418 BSC_shmat +0x40c041c BSC_shmctl +0x40c0420 BSC_shmdt +0x40c0424 BSC_shmget +0x40c0428 BSC_shm_open +0x40c042c BSC_shm_unlink +0x40c0430 BSC_sem_open +0x40c0434 BSC_sem_close +0x40c0438 BSC_sem_unlink +0x40c043c BSC_sem_wait +0x40c0440 BSC_sem_trywait +0x40c0444 BSC_sem_post +0x40c0448 BSC_sem_getvalue +0x40c044c BSC_sem_init +0x40c0450 BSC_sem_destroy +0x40c0454 BSC_open_extended +0x40c0458 BSC_umask_extended +0x40c045c BSC_stat_extended +0x40c0460 BSC_lstat_extended +0x40c0464 BSC_fstat_extended +0x40c0468 BSC_chmod_extended +0x40c046c BSC_fchmod_extended +0x40c0470 BSC_access_extended +0x40c0474 BSC_settid +0x40c0478 BSC_gettid +0x40c047c BSC_setsgroups +0x40c0480 BSC_getsgroups +0x40c0484 BSC_setwgroups +0x40c0488 BSC_getwgroups +0x40c048c BSC_mkfifo_extended +0x40c0490 BSC_mkdir_extended +0x40c0494 BSC_identitysvc +0x40c0498 BSC_shared_region_chk_np +0x40c049c BSC_shared_region_map_np +0x40c04a0 BSC_vm_pressure_monitor +0x40c04a4 BSC_psynch_rw_longrdlock +0x40c04a8 BSC_psynch_rw_yieldwrlock +0x40c04ac BSC_psynch_rw_downgrade +0x40c04b0 BSC_psynch_rw_upgrade +0x40c04b4 BSC_psynch_mutexwait +0x40c04b8 BSC_psynch_mutexdrop +0x40c04bc BSC_psynch_cvbroad +0x40c04c0 BSC_psynch_cvsignal +0x40c04c4 BSC_psynch_cvwait +0x40c04c8 BSC_psynch_rw_rdlock +0x40c04cc BSC_psynch_rw_wrlock +0x40c04d0 BSC_psynch_rw_unlock +0x40c04d4 BSC_psynch_rw_unlock2 +0x40c04d8 BSC_getsid +0x40c04dc BSC_settid_with_pid +0x40c04e0 BSC_psynch_cvclrprepost +0x40c04e4 BSC_aio_fsync +0x40c04e8 BSC_aio_return +0x40c04ec BSC_aio_suspend +0x40c04f0 BSC_aio_cancel +0x40c04f4 BSC_aio_error +0x40c04f8 BSC_aio_read +0x40c04fc BSC_aio_write +0x40c0500 BSC_lio_listio +0x40c0504 BSC_obs_pthread_cond_wait +0x40c0508 BSC_iopolicysys +0x40c050c BSC_process_policy +0x40c0510 BSC_mlockall +0x40c0514 BSC_munlockall +0x40c0518 BSC_#326 +0x40c051c BSC_issetugid +0x40c0520 BSC_pthread_kill +0x40c0524 
BSC_pthread_sigmask +0x40c0528 BSC_sigwait +0x40c052c BSC_disable_threadsignal +0x40c0530 BSC_pthread_markcancel +0x40c0534 BSC_pthread_canceled +0x40c0538 BSC_semwait_signal +0x40c053c BSC_obs_utrace +0x40c0540 BSC_proc_info +0x40c0544 BSC_sendfile +0x40c0548 BSC_stat64 +0x40c054c BSC_fstat64 +0x40c0550 BSC_lstat64 +0x40c0554 BSC_stat64_extended +0x40c0558 BSC_lstat64_extended +0x40c055c BSC_fstat64_extended +0x40c0560 BSC_getdirentries64 +0x40c0564 BSC_statfs64 +0x40c0568 BSC_fstatfs64 +0x40c056c BSC_getfsstat64 +0x40c0570 BSC_pthread_chdir +0x40c0574 BSC_pthread_fchdir +0x40c0578 BSC_audit +0x40c057c BSC_auditon +0x40c0580 BSC_#352 +0x40c0584 BSC_getauid +0x40c0588 BSC_setauid +0x40c058c BSC_getaudit +0x40c0590 BSC_setaudit +0x40c0594 BSC_getaudit_addr +0x40c0598 BSC_setaudit_addr +0x40c059c BSC_auditctl +0x40c05a0 BSC_bsdthread_create +0x40c05a4 BSC_bsdthread_terminate +0x40c05a8 BSC_kqueue +0x40c05ac BSC_kevent +0x40c05b0 BSC_lchown +0x40c05b4 BSC_stack_snapshot +0x40c05b8 BSC_bsdthread_register +0x40c05bc BSC_workq_open +0x40c05c0 BSC_workq_kernreturn +0x40c05c4 BSC_kevent64 +0x40c05c8 BSC_obs_semwait_signal +0x40c05cc BSC_obs_semwait_signal_nocancel +0x40c05d0 BSC_thread_selfid +0x40c05d4 BSC_#373 +0x40c05d8 BSC_#374 +0x40c05dc BSC_#375 +0x40c05e0 BSC_#376 +0x40c05e4 BSC_#377 +0x40c05e8 BSC_#378 +0x40c05ec BSC_#379 +0x40c05f0 BSC_mac_execve +0x40c05f4 BSC_mac_syscall +0x40c05f8 BSC_mac_get_file +0x40c0600 BSC_mac_get_link +0x40c0604 BSC_mac_set_link +0x40c0608 BSC_mac_get_proc +0x40c060c BSC_mac_set_proc +0x40c0610 BSC_mac_get_fd +0x40c0614 BSC_mac_set_fd +0x40c0618 BSC_mac_get_pid +0x40c061c BSC_mac_get_lcid +0x40c0620 BSC_mac_get_lctx +0x40c0624 BSC_mac_set_lctx +0x40c0628 BSC_setlcid +0x40c062c BSC_getlcid +0x40c0630 BSC_read_nocancel +0x40c0634 BSC_write_nocancel +0x40c0638 BSC_open_nocancel +0x40c063c BSC_close_nocancel +0x40c0640 BSC_wait4_nocancel +0x40c0644 BSC_recvmsg_nocancel +0x40c0648 BSC_sendmsg_nocancel +0x40c064c BSC_recvfrom_nocancel +0x40c0650 BSC_accept_nocancel +0x40c0654 BSC_msync_nocancel +0x40c0658 BSC_fcntl_nocancel +0x40c065c BSC_select_nocancel +0x40c0660 BSC_fsync_nocancel +0x40c0664 BSC_connect_nocancel +0x40c0668 BSC_sigsuspend_nocancel +0x40c066c BSC_readv_nocancel +0x40c0670 BSC_writev_nocancel +0x40c0674 BSC_sendto_nocancel +0x40c0678 BSC_pread_nocancel +0x40c067c BSC_pwrite_nocancel +0x40c0680 BSC_waitid_nocancel +0x40c0684 BSC_poll_nocancel +0x40c0688 BSC_msgsnd_nocancel +0x40c068c BSC_msgrcv_nocancel +0x40c0690 BSC_sem_wait_nocancel +0x40c0694 BSC_aio_suspend_nocancel +0x40c0698 BSC_sigwait_nocancel +0x40c069c BSC_semwait_signal_nocancel +0x40c06a0 BSC_mac_mount +0x40c06a4 BSC_mac_get_mount +0x40c06a8 BSC_mac_getfsstat +0x40c06ac BSC_fsgetpath +0x40c06b0 BSC_audit_session +0x40c06b4 BSC_audit_session_join +0x40c06b8 BSC_fileport_makeport +0x40c06bc BSC_fileport_makefd +0x40c06c0 BSC_audit_session_port +0x40c06c4 BSC_pid_suspend +0x40c06c8 BSC_pid_resume +0x40c06cc BSC_pid_hibernate +0x40c06d0 BSC_pid_shutdown_sockets +0x40c06d4 BSC_shared_region_slide_np +0x40c06fc BSC_shared_region_map_and_slide_np +0x40e0104 BSC_msync_extended_info +0x40e0264 BSC_pread_extended_info +0x40e0268 BSC_pwrite_extended_info +0x40e0314 BSC_mmap_extended_info +0x40f0314 BSC_mmap_extended_info2 +0x5000004 INTC_Handler +0x5010004 WL_CheckForWork +0x5010008 WL_RunEventSources +0x5020004 IES_client +0x5020008 IES_latency +0x502000c IES_sema +0x5020010 IES_intctxt +0x5020018 IES_action +0x502001c IES_filter +0x5030004 TES_client +0x5030008 TES_latency +0x503000c TES_sema 
+0x5030010 TES_action +0x5040004 CQ_client +0x5040008 CQ_latency +0x504000c CQ_sema +0x5040010 CQ_psema +0x5040014 CQ_plock +0x5040018 CG_action +0x5080004 IOSERVICE_BUSY +0x5080008 IOSERVICE_NONBUSY +0x508000c IOSERVICE_MODULESTALL +0x5080010 IOSERVICE_MODULEUNSTALL +0x5080014 IOSERVICE_TERM_PHASE1 +0x5080018 IOSERVICE_TERM_REQUEST_OK +0x508001c IOSERVICE_TERM_REQUEST_FAIL +0x5080020 IOSERVICE_TERM_SCHEDULE_STOP +0x5080024 IOSERVICE_TERM_SCHEDULE_FINALIZE +0x5080028 IOSERVICE_TERM_WILL +0x508002c IOSERVICE_TERM_DID +0x5080030 IOSERVICE_TERM_DID_DEFER +0x5080034 IOSERVICE_TERM_FINALIZE +0x5080038 IOSERVICE_TERM_STOP +0x508003c IOSERVICE_TERM_STOP_NOP +0x5080040 IOSERVICE_TERM_STOP_DEFER +0x5080044 IOSERVICE_TERM_DONE +0x5080048 IOSERVICE_KEXTD_ALIVE +0x508004C IOSERVICE_KEXTD_READY +0x5080050 IOSERVICE_REGISTRY_QUIET +0x5100004 PM_SetParent +0x5100008 PM_AddChild +0x510000c PM_RemoveChild +0x5100010 PM_CtrlDriver +0x5100014 PM_CtrlDrvrE1 +0x5100018 PM_CtrlDrvrE2 +0x510001c PM_CtrlDrvrE3 +0x5100020 PM_CtrlDrvrE4 +0x5100024 PM_IntDriver +0x5100028 PM_AckE1 +0x510002c PM_ChildAck +0x5100030 PM_DriverAck +0x5100034 PM_AckE2 +0x5100038 PM_AckE3 +0x510003c PM_AckE4 +0x5100040 PM_DrvrAckSPwr +0x5100044 PM_WillChange +0x5100048 PM_DidChange +0x510004c PM_ReqstDomain +0x5100050 PM_MakeUsable +0x5100054 PM_ChangeTo +0x5100058 PM_ChngeToPriv +0x510005c PM_SetAggrssvs +0x5100060 PM_CritclTemp +0x5100064 PM_OverrideOn +0x5100068 PM_OverrideOff +0x510006c PM_EnqueueErr +0x5100070 PM_CollapseQ +0x5100074 PM_ChangeDone +0x5100078 PM_CtrlDrvTrdy +0x510007c PM_IntDrvrTrdy +0x5100080 PM_StartAckTmr +0x5100084 PM_ParentChnge +0x5100088 PM_AmndPrnChng +0x510008c PM_DeviceChnge +0x5100090 PM_ReqDenied +0x5100094 PM_CtrlDrvrE45 +0x5100098 PM_PrgrmHrdwre +0x510009c PM_InfDrvrPre +0x51000a0 PM_InfDrvrPost +0x51000a4 PM_RemoveDrivr +0x51000a8 PM_IdlTimerPrd +0x51000ac PM_SystemWake +0x51000b0 PM_AckE5 +0x51000b4 PM_ClientAck +0x51000b8 PM_ClientTardy +0x51000bc PM_ClientCancl +0x51000c0 PM_ClientNotfy +0x51000c4 PM_AppNotify +0x5230000 HID_Unexpected +0x5230004 HID_KeyboardLEDThreadTrigger +0x5230008 HID_KeyboardLEDThreadActive +0x523000c HID_KeyboardSetParam +0x5230010 HID_KeyboardCapsThreadTrigger +0x5230014 HID_KeyboardCapsThreadActive +0x5230018 HID_PostEvent +0x523001c HID_NewUserClient +0x5230020 HID_InturruptReport +0x5230024 HID_DispatchScroll +0x5230028 HID_DispatchRelativePointer +0x523002c HID_DispatchAbsolutePointer +0x5230030 HID_DispatchKeyboard +0x5230034 HID_EjectCallback +0x5230038 HID_CapsCallback +0x523003c HID_#3c +0x523004c HID_#4c +0x5310004 CPUPM_PSTATE +0x5310008 CPUPM_IDLE_CSTATE +0x531000c CPUPM_IDLE_HALT +0x5310010 CPUPM_IDLE_LOOP +0x5310014 CPUPM_HPET_START +0x5310018 CPUPM_HPET_END +0x531001c CPUPM_HPET_INTR +0x5310020 CPUPM_PSTATE_HW +0x5310024 CPUPM_PSTATE_LIMIT +0x5310028 CPUPM_PSTATE_PARK +0x531002c CPUPM_PSTATE_START +0x5310030 CPUPM_PSTATE_PAUSE +0x5310034 CPUPM_PSTATE_RESUME +0x5310038 CPUPM_PSTATE_DOWN +0x531003c CPUPM_PSTATE_UP +0x5310040 CPUPM_PSTATE_NORM +0x5310044 CPUPM_PSTATE_FORCE +0x5310048 CPUPM_PSTATE_TIMEOUT +0x531004c CPUPM_PSTATE_SETTO +0x5310050 CPUPM_SET_DEADLINE +0x5310054 CPUPM_GET_DEADLINE +0x5310058 CPUPM_DEADLINE +0x531005c CPUPM_IDLE_SNOOP +0x5310060 CPUPM_IDLE_LATENCY +0x5310064 CPUPM_IDLE_WAKEUP +0x5310068 CPUPM_IDLE_SW_WAKEUP +0x531006c CPUPM_IDLE_SELECT +0x5310070 CPUPM_IDLE_SELECTED +0x5310074 CPUPM_IDLE_INTSKIP +0x5310078 CPUPM_IDLE_LOCK +0x531007c CPUPM_IDLE_UNLOCK +0x5310080 CPUPM_IDLE_NO_HPET +0x5310084 CPUPM_FI_UP +0x5310088 CPUPM_FI_UP_CPU 
+0x531008c CPUPM_FI_MP +0x5310090 CPUPM_FI_MP_CPU +0x5310094 CPUPM_FI_PAUSE +0x5310098 CPUPM_FI_RUN +0x531009c CPUPM_PROC_HALT +0x53100a0 CPUPM_TRACE_STOPPED +0x53100a4 CPUPM_HPET_INT_LOCK +0x53100a8 CPUPM_HPET_INT_UNLOCK +0x53100ac CPUPM_HPET_TRY_AGAIN +0x53100b0 CPUPM_HPET_SETDEADLINE +0x53100b4 CPUPM_LOCK_HELDBY +0x53100b8 CPUPM_HPET_DELTA +0x53100bc CPUPM_HPET_TOO_LATE +0x53100c0 CPUPM_HPET_NO_DEADLINE +0x53100c4 CPUPM_IDLE +0x53100c8 CPUPM_CORE_CHK_DEADLINE +0x53100cc CPUPM_SET_HPET_DEADLINE +0x53100d0 CPUPM_HPET_READ +0x53100d4 CPUPM_TIME_ADJUST +0x53100d8 CPUPM_IDLE_MWAIT +0x53100dc CPUPM_FI_SLAVE_IDLE +0x53100e0 CPUPM_FI_SLAVE_BLOCK +0x53100e4 CPUPM_FI_MAST_SIGNAL +0x53100e8 CPUPM_CORE_DEADLINE +0x53100ec CPUPM_IDLE_FAST +0x53100f0 CPUPM_IDLE_PAUSE +0x53100f4 CPUPM_IDLE_SHORT +0x53100f8 CPUPM_IDLE_NORMAL +0x53100fc CPUPM_IDLE_SPURIOUS +0x5310100 CPUPM_PSTATE_INFO +0x5310104 CPUPM_PSTATE_INFO_HW +0x5310108 CPUPM_PSTATE_FSM +0x531010c CPUPM_PSTATE_FSM_STEP +0x5310110 CPUPM_PSTATE_FSM_EVAL +0x5310114 CPUPM_PSTATE_FSM_MAP +0x5310118 CPUPM_CPUSTEP_STEP +0x531011c CPUPM_CPUSTEP_STEP_UP +0x5310120 CPUPM_CPUSTEP_STEP_DOWN +0x5310124 CPUPM_CPUSTEP_AVAIL +0x5310128 CPUPM_CPUSTEP_AVAIL_STEP +0x531012c CPUPM_CPUSTEP_AVAIL_CHNG +0x5310130 CPUPM_CPUSTEP_LOAD +0x5310134 CPUPM_CPUSTEP_START +0x5310138 CPUPM_CPUSTEP_STOP +0x531013c CPUPM_CPUSTEP_COPY +0x5310140 CPUPM_CPUSTEP_CLEAR +0x5310144 CPUPM_CPUSTEP_RUNCOUNT +0x5310148 CPUPM_CPUSTEP_WAKEUP +0x531014c CPUPM_PSTATE_TRACE +0x5310150 CPUPM_PSTATE_EVENT +0x5310154 CPUPM_IDLE_RATE +0x5310158 CPUPM_PSTATE_FSM_RESUME +0x531015c CPUPM_PSTATE_FSM_PAUSE +0x5310160 CPUPM_PSTATE_INSTRUCTION +0x5310164 CPUPM_PSTATE_INST_ARG +0x5310168 CPUPM_PSTATE_STACK_PUSH +0x531016c CPUPM_PSTATE_STACK_POP +0x5310170 CPUPM_IDLE_PREFIRE +0x5310174 CPUPM_PSTATE_VERIFY +0x5310178 CPUPM_TIMER_MIGRATE +0x531017c CPUPM_RING_LIMIT +0x5310180 CPUPM_CONTEXT_PAUSE +0x5310184 CPUPM_CONTEXT_RESUME +0x5310188 CPUPM_CONTEXT_RESUME_INFO +0x531018c CPUPM_THREAD_RESUME +0x5310190 CPUPM_THREAD_PAUSE_INFO +0x5310194 CPUPM_THREAD_RESUME_INFO +0x5310198 CPUPM_TEST_MASTER_INFO +0x531019c CPUPM_TEST_SLAVE_INFO +0x53101a0 CPUPM_TEST_INFO +0x53101a4 CPUPM_TEST_RUN_INFO +0x53101a8 CPUPM_TEST_SLAVE_INFO +0x5330000 HIBERNATE +0x5330004 HIBERNATE_WRITE_IMAGE +0x5330008 HIBERNATE_MACHINE_INIT +0x533000c HIBERNATE_FLUSH_MEMORY +0x5330010 HIBERNATE_flush_queue +0x5330014 HIBERNATE_flush_wait +0x5330018 HIBERNATE_flush_in_progress +0x533001c HIBERNATE_flush_bufs +0x5330020 HIBERNATE_page_list_setall +0x5330024 HIBERNATE_aes_decrypt_cbc +0x7000004 TRACE_DATA_NEWTHREAD +0x7000008 TRACE_DATA_EXEC +0x7010004 TRACE_STRING_NEWTHREAD +0x7010008 TRACE_STRING_EXEC +0x7020000 TRACE_PANIC +0x7020004 TRACE_TIMESTAMPS +0x7020008 TRACE_LOST_EVENTS +0x702000c TRACE_WRITING_EVENTS +0x8000000 USER_TEST +0x8000004 USER_run +0x8000008 USER_join +0x800000c USER_create +0x8000010 USER_pthread_create +0x8000014 USER_pthread_exit +0x8000018 USER_pthread_join +0x800001c USER_pthread_run +0x8000020 USER_pthread_cleanup_push +0x8000100 FW_underrun +0x8000104 FW_interrupt +0x8000108 FW_workloop +0x8010400 F_DLIL_Input +0x8010800 F_DLIL_Output +0x8010c00 F_DLIL_IfOut +0x8040000 USER_STOP +0x9000084 wq_deallocate_stack +0x9000088 wq_allocate_stack +0x9008070 wq_run_item +0x9008074 wq_clean_thread +0x9008078 wq_post_done +0x900807c wq_stk_cleanup +0x9008080 wq_tsd_cleanup +0x9008084 wq_tsd_destructor +0x9008088 wq_pthread_exit +0x900808c wq_workqueue_exit +0xa000100 P_CS_Read +0xa000110 P_CS_Write +0xa000180 P_CS_ReadDone +0xa000190 
P_CS_WriteDone +0xa000200 P_CS_ReadChunk +0xa000210 P_CS_WriteChunk +0xa000280 P_CS_ReadChunkDone +0xa000290 P_CS_WriteChunkDone +0xa000300 P_CS_ReadCrypto +0xa000310 P_CS_WriteCrypto +0xa000500 P_CS_Originated_Read +0xa000510 P_CS_Originated_Write +0xa000580 P_CS_Originated_ReadDone +0xa000590 P_CS_Originated_WriteDone +0xa000900 P_CS_MetaRead +0xa000910 P_CS_MetaWrite +0xa000980 P_CS_MetaReadDone +0xa000990 P_CS_MetaWriteDone +0xa008000 P_CS_SYNC_DISK +0xa008004 P_CS_WaitForBuffer +0xa008008 P_CS_NoBuffer +0xb000000 AFP_asp_tcp_usr_send +0xb000004 AFP_asp_tcp_usr_send_after_Request +0xb000008 AFP_asp_tcp_usr_send_after_FindDSIReq +0xb00000c AFP_asp_tcp_usr_send_after_Reply +0xb000010 AFP_asp_tcp_slowtimo +0xb000014 AFP_asp_tcp_usr_control +0xb000018 AFP_asp_tcp_fasttimo +0xb000020 AFP_Send +0xb000024 AFP_Send_before_sosend +0xb000028 AFP_Send_after_sosend +0xb00002c AFP_Send_before_write +0xb000030 AFP_Send_after_write +0xb000040 AFP_Reply +0xb000044 AFP_Reply_rcvdAlready +0xb000048 AFP_Reply_before_RcvLock +0xb00004c AFP_Reply_fail_RcvLock +0xb000050 AFP_Reply_before_ReadDSIHdr +0xb000054 AFP_Reply_after_ReadDSIHdr +0xb000058 AFP_Reply_fail_ReadDSIHdr +0xb00005c AFP_Reply_after_FindDSIReqInfo +0xb000060 AFP_Reply_SetAFPCmd +0xb000064 AFP_Reply_before_ReadDSIPacket +0xb000068 AFP_Reply_setRcvdReplyLen +0xb000070 AFP_SendReply +0xb000080 AFP_CreateDSIHeader +0xb000084 AFP_CreateDSIHeader_after_GetReqID +0xb000090 AFP_Request +0xb0000a0 AFP_ReceiveLock +0xb0000b0 AFP_ReceiveWakeUp +0xb0000c0 AFP_ReceiveUnLock +0xb0000e0 AFP_SendLock +0xb0000e4 AFP_SendUnLock +0xb0000f0 AFP_SendQueueLock +0xb000100 AFP_SendQueueUnLock +0xb000110 AFP_ReadDSIHeader +0xb000120 AFP_Receive +0xb000124 AFP_Receive_before_sorcv +0xb000128 AFP_Receive_after_sorcv +0xb000130 AFP_ReadDSIPacket +0xb000140 AFP_DoCopyOut +0xb000150 AFP_DoCopyIn +0xb000160 AFP_CheckRcvTickle +0xb000164 AFP_CheckRcvTickleTO +0xb000170 AFP_CheckSendTickle +0xb000180 AFP_CheckIncomingPkts +0xb000190 AFP_ProcessOptions +0xb000200 AFP_FindDSIReqInfo +0xb000204 AFP_FindDSIReqInfo_foundReqInfo +0xb000208 AFP_FindDSIReqInfo_flags +0xb00020c AFP_FindDSIReqLeave +0xb000210 AFP_UsrDisconnect +0xc000000 AFPVFS_UserReply +0xc000004 AFPVFS_UserReplyGetMbuf +0xc000008 AFPVFS_UserReplysosend +0xc000010 AFPVFS_UserCommand +0xc000018 AFPVFS_UserCommandsosend +0xc000020 AFPVFS_ReadFork +0xc000024 AFPVFS_ReadForkFillQPB +0xc000028 AFPVFS_ReadForkNbrRequests +0xc00002c AFPVFS_ReadForkSendQPB +0xc000030 AFPVFS_ReadForkSendErr +0xc000040 AFPVFS_ReadForkGetReply +0xc000044 AFPVFS_ReadForkGetReplyResult +0xc000050 AFPVFS_WriteFork +0xc000054 AFPVFS_WriteForkFillQPB +0xc000058 AFPVFS_WriteForkNbrRequests +0xc00005c AFPVFS_WriteForkSendQPB +0xc000060 AFPVFS_WriteForkSendErr +0xc000064 AFPVFS_WriteForkGetReply +0xc000068 AFPVFS_WriteForkGetReplyResult +0xc000070 AFPVFS_GetAttr +0xc000080 AFPVFS_SetAttr +0xc000090 AFPVFS_GetAttrList +0xc0000a0 AFPVFS_SetAttrList +0xc0000b0 AFPVFS_FSCTL +0xc0000c0 AFPVFS_LookUp +0xc0000d0 AFPVFS_CacheLookUp +0xc0000e0 AFPVFS_Write +0xc0000e4 AFPVFS_WriteNoCluster +0xc0000e8 AFPVFS_WriteDone +0xc0000f0 AFPVFS_DoWrite +0xc000100 AFPVFS_Lock +0xc000110 AFPVFS_Statfs +0xc000120 AFPVFS_Sync +0xc000130 AFPVFS_VGet +0xc000140 AFPVFS_FlushFiles +0xc000150 AFPVFS_Create +0xc000160 AFPVFS_Mknod +0xc000170 AFPVFS_Open +0xc000180 AFPVFS_Close +0xc000190 AFPVFS_Access +0xc000194 AFPVFS_AccessUID +0xc000198 AFPVFS_AccessGID +0xc00019c AFPVFS_AccessWID +0xc0001a0 AFPVFS_Writeperm +0xc0001b0 AFPVFS_Chmod +0xc0001c0 AFPVFS_Chflags +0xc0001d0 
AFPVFS_Exchange +0xc0001e0 AFPVFS_Chid +0xc0001f0 AFPVFS_Fsync +0xc000200 AFPVFS_Remove +0xc000210 AFPVFS_Rename +0xc000220 AFPVFS_Copyfile +0xc000230 AFPVFS_Mkdir +0xc000240 AFPVFS_Symlink +0xc000250 AFPVFS_Readdir +0xc000260 AFPVFS_Readdirattr +0xc000264 AFPVFS_Readdirattr1 +0xc000268 AFPVFS_Readdirattr2 +0xc00026c AFPVFS_Readdirattr3 +0xc000270 AFPVFS_Readlink +0xc000280 AFPVFS_Abortop +0xc000290 AFPVFS_Inactive +0xc0002a0 AFPVFS_Reclaim +0xc0002b0 AFPVFS_Unlock +0xc0002c0 AFPVFS_Islocked +0xc0002d0 AFPVFS_Pathconf +0xc0002e0 AFPVFS_Update +0xc0002f0 AFPVFS_Makenode +0xc000300 AFPVFS_Allocate +0xc000310 AFPVFS_Search +0xc000320 AFPVFS_Reconnect +0xc0003e0 AFPVFS_Rmdir +0xc0003f0 AFPVFS_Vinit +0x11000000 DNC_PURGE1 +0x11000004 DNC_PURGE2 +0x11000008 DNC_FOUND +0x1100000c DNC_FAILED +0x11000010 DNC_ENTER +0x11000014 DNC_remove_name +0x11000018 DNC_ENTER_CREATE +0x1100001c DNC_update_identity +0x11000020 DNC_PURGE +0x11000030 DNC_LOOKUP_PATH +0x11000034 HFS_vnop_lookup +0x11000038 NAMEI +0x11000048 VFS_SUSPENDED +0x1100004C VFS_CACHEPURGE +0x11000050 VFS_CACHELOOKUP_SUCCESS +0x11000054 VFS_CACHELOOKUP_FAILED +0x11000058 VFS_CACHELOOKUP_ENTER +0x1100005c VFS_CACHELOOKUP +0x11000060 VFS_GETIOCOUNT +0x11000064 VFS_vnode_recycle +0x11000068 VFS_vnode_reclaim +0x11000070 HFS_getnewvnode1 +0x11000074 HFS_getnewvnode2 +0x11000078 HFS_chash_getcnode +0x1100007c HFS_vfs_getpath +0x11000080 VOLFS_lookup +0x11000084 lookup_mountedhere +0x11000088 VNOP_LOOKUP +0x1100008c HFS_chash_getvnode +0x11000090 VFS_vnode_rele +0x11000094 VFS_vnode_put +0x11004100 NC_lock_shared +0x11004104 NC_lock_exclusive +0x11004108 NC_unlock +0x1f000000 DYLD_initialize +0x1f010000 DYLD_CALL_image_init_routine +0x1f010004 DYLD_CALL_dependent_init_routine +0x1f010008 DYLD_CALL_lazy_init_routine +0x1f01000c DYLD_CALL_module_init_for_library +0x1f010010 DYLD_CALL_module_init_for_object +0x1f010014 DYLD_CALL_module_terminator_for_object +0x1f010018 DYLD_CALL_module_init_for_dylib +0x1f01001c DYLD_CALL_mod_term_func +0x1f010020 DYLD_CALL_object_func +0x1f010024 DYLD_CALL_library_func +0x1f010028 DYLD_CALL_add_image_func +0x1f01002c DYLD_CALL_remove_image_func +0x1f010030 DYLD_CALL_link_object_module_func +0x1f010034 DYLD_CALL_link_library_module_func +0x1f010038 DYLD_CALL_link_module_func +0x1f020000 DYLD_lookup_and_bind_with_hint +0x1f020004 DYLD_lookup_and_bind_fully +0x1f020008 DYLD_link_module +0x1f02000c DYLD_ulink_module +0x1f020010 DYLD_bind_objc_module +0x1f020014 DYLD_bind_fully_image_containing_address +0x1f020018 DYLD_make_delayed_module_initializer_calls +0x1f02001c DYLD_NSNameOfSymbol +0x1f020020 DYLD_NSAddressOfSymbol +0x1f020024 DYLD_NSModuleForSymbol +0x1f020028 DYLD_NSLookupAndBindSymbolWithHint +0x1f02002c DYLD_NSLookupSymbolInModule +0x1f020030 DYLD_NSLookupSymbolInImage +0x1f020034 DYLD_NSIsSymbolNameDefined +0x1f020038 DYLD_NSIsSymbolNameDefinedWithHint +0x1f02003c DYLD_NSIsSymbolNameDefinedInImage +0x1f020040 DYLD_NSNameOfModule +0x1f020044 DYLD_NSLibraryNameForModule +0x1f020048 DYLD_NSAddLibrary +0x1f02004c DYLD_NSAddLibraryWithSearching +0x1f020050 DYLD_NSAddImage +0x1f030000 DYLD_lookup_symbol +0x1f030004 DYLD_bind_lazy_symbol_reference +0x1f030008 DYLD_bind_symbol_by_name +0x1f03000c DYLD_link_in_need_modules +0x1f040000 DYLD_map_image +0x1f040004 DYLD_load_executable_image +0x1f040008 DYLD_load_library_image +0x1f04000c DYLD_map_library_image +0x1f040010 DYLD_map_bundle_image +0x1f040014 DYLD_load_dependent_libraries +0x1f040018 DYLD_notify_prebinding_agent +0x1ff10000 SCROLL_BEGIN_obs +0x1ff10100 
SCROLL_END_obs +0x1ff20000 BOOT_BEGIN_obs +0x1ff20100 BOOT_END_obs +0x1ff20400 APP_DidActivateWindow_obs +0x1ff20500 TOOL_PRIVATE_1_obs +0x1ff20504 TOOL_PRIVATE_2_obs +0x1ff20508 TOOL_PRIVATE_3_obs +0x1ff2050c TOOL_PRIVATE_4_obs +0x1fff0000 LAUNCH_START_FINDER +0x1fff0100 LAUNCH_START_DOCK +0x1fff0200 LAUNCH_LSOpen +0x1fff0204 LAUNCH_LSRegisterItem +0x1fff0208 LAUNCH_LSGetApplicationAndFlagsForInfo +0x1fff0300 LAUNCH_CPSLaunch +0x1fff0304 LAUNCH_CPSRegisterwithServer +0x1fff0308 LAUNCH_CGSCheckInNewProcess +0x1fff030c LAUNCH_CPSExecProcess +0x1fff0310 LAUNCH_APP_EnterEventLoop +0x1fff0314 LAUNCH_APP_WillOpenUntitled +0x1fff031c LAUNCH_APP_DidOpenUntitled +0x1fff1000 LAUNCH_END +0x1fffffff LAUNCH_END +0x20000004 RTC_sync_TBR +0x21010000 SCROLL_BEGIN +0x21020000 BOOT_BEGIN +0x21030200 LOGIN_BEGIN +0x21030204 LOGINWINDOW_LAUNCHED +0x21030208 LOGINWINDOW_LAUNCHES_SA +0x2103020c LOGINWINDOW_GUI_APPEARS +0x21030210 LOGINWINDOW_LOGIN_CLICKED +0x21030214 LOGINWINDOW_ASKS_AUTH +0x21030218 LOGINWINDOW_AUTH_SUCCEEDED +0x2103021c LOGINWINDOW_LAUNCHES_DOCK +0x21030220 LOGINWINDOW_LAUNCHES_SUIS +0x21030224 LOGINWINDOW_LAUNCHES_FINDER +0x21030228 LOGINWINDOW_DOCK_LAUNCHED +0x2103022c LOGINWINDOW_SUIS_LAUNCHED +0x21030230 LOGINWINDOW_FINDER_LAUNCHED +0x21030234 LOGINWINDOW_LOGOUT_CLICKED +0x21030238 LOGINWINDOW_QUIT_FGAPPS +0x2103023c LOGINWINDOW_FGAPPS_QUIT +0x21030240 LOGINWINDOW_QUIT_SUIS +0x21030244 LOGINWINDOW_SUIS_DIES +0x21030248 LOGINWINDOW_QUIT_FINDER +0x2103024c LOGINWINDOW_FINDER_DIES +0x21030250 LOGINWINDOW_QUIT_DOCK +0x21030254 LOGINWINDOW_DOCK_DIES +0x21030258 LOGINWINDOW_EXIT +0x2103025c LOGINWINDOW_FUS_SELUSERNAME +0x21030260 LOGINWINDOW_FUS_SELLOGINWIND +0x21030270 LOGIN_APPLICATION_EXECUTING +0x21030274 LOGIN_APPLICATION_USABLE +0x21030300 LOGIN_END +0x21030500 LOGINWINDOW_APP_TERMINATION_REQUEST +0x21030504 LOGINWINDOW_LOGOUT_START +0x21030508 LOGINWINDOW_DESKTOP_UP +0x2103050c LOGINWINDOW_DESKTOP_UP_NOTIFICATION +0x21040000 APP_DIDActivateWindow +0x21050000 TOOL_PRIVATE_1 +0x21050004 TOOL_PRIVATE_2 +0x21050008 TOOL_PRIVATE_3 +0x2105000c TOOL_PRIVATE_4 +0x21060000 LAUNCH_CPSTraceLineNum +0x21060004 LAUNCH_CPSLaunch +0x21060008 LAUNCH_CPSRegisterwithServer +0x2106000c LAUNCH_CPSCheckInNewProcess +0x21060010 LAUNCH_CPSServerSideLaunch +0x21060014 LAUNCH_CPSExecProcess +0x21070000 LAUNCH_LSOpen +0x21070004 LAUNCH_LSRegisterItem +0x21070008 LAUNCH_LSGetApplicationAndFlagsForInfo +0x21080000 MCX_DAEMON_START +0x21080004 MCX_DAEMON_FINISH +0x21080008 MCX_STARTMCX_START +0x2108000C MCX_STARTMCX_FINISH +0x21080010 MCX_POSTCMP_DOCK_START +0x21080014 MCX_POSTCMP_DOCK_FINISH +0x21080020 MCX_POSTCMP_ENERGYSVR_START +0x21080024 MCX_POSTCMP_ENERGYSVR_FINISH +0x21080030 MCX_POSTCMP_LOGINITMS_START +0x21080034 MCX_POSTCMP_LOGINITMS_FINISH +0x21080040 MCX_CMP_COMPUTERINFO_START +0x21080044 MCX_CMP_COMPUTERINFO_FINISH +0x21080050 MCX_CMP_USERINFO_START +0x21080054 MCX_CMP_USERINFO_FINISH +0x21080060 MCX_POSTCMP_USER_START +0x21080064 MCX_POSTCMP_USER_FINISH +0x210800A0 MCX_MECHANISM_START +0x210800A4 MCX_MECHANISM_FINISH +0x210800C0 MCX_MECHANISM_PICKER_START +0x210800C4 MCX_MECHANISM_PICKER_FINISH +0x21080100 MCX_APPITEMS_START +0x21080104 MCX_APPITEMS_FINISH +0x21080200 MCX_CACHER_START +0x21080204 MCX_CACHER_FINISH +0x21080300 MCX_COMPOSITOR_START +0x21080304 MCX_COMPOSITOR_FINISH +0x21080400 MCX_DISKSETUP_START +0x21080404 MCX_DISKSETUP_FINISH +0x21090000 PHD_DAEMON_START +0x21090004 PHD_DAEMON_FINISH +0x21090010 PHD_SYNCNOW_START +0x21090014 PHD_SYNCNOW_FINISH +0x210b0000 TAL_APP_LAUNCH_START 
+0x210b0004 TAL_APP_LAUNCH_UNSUSPENDED +0x210b0008 TAL_APP_LAUNCH_UNTHROTTLED +0x210b000c TAL_APP_LAUNCH_VISIBLE +0x210b0010 TAL_APP_LAUNCH_READY +0x210b0014 TAL_ALL_LAUNCH_READY +0x21800000 SMB_smbd_idle +0x21800004 SMB_syscall_opendir +0x21800008 SMB_syscall_readdir +0x2180000c SMB_syscall_seekdir +0x21800010 SMB_syscall_telldir +0x21800014 SMB_syscall_rewinddir +0x21800018 SMB_syscall_mkdir +0x2180001c SMB_syscall_rmdir +0x21800020 SMB_syscall_closedir +0x21800024 SMB_syscall_open +0x21800028 SMB_syscall_close +0x2180002c SMB_syscall_read +0x21800030 SMB_syscall_pread +0x21800034 SMB_syscall_write +0x21800038 SMB_syscall_pwrite +0x2180003c SMB_syscall_lseek +0x21800040 SMB_syscall_sendfile +0x21800044 SMB_syscall_rename +0x21800048 SMB_syscall_fsync +0x2180004c SMB_syscall_stat +0x21800050 SMB_syscall_fstat +0x21800054 SMB_syscall_lstat +0x21800058 SMB_syscall_unlink +0x2180005c SMB_syscall_chmod +0x21800060 SMB_syscall_fchmod +0x21800064 SMB_syscall_chown +0x21800068 SMB_syscall_fchown +0x2180006c SMB_syscall_chdir +0x21800070 SMB_syscall_getwd +0x21800074 SMB_syscall_utime +0x21800078 SMB_syscall_ftruncate +0x2180007c SMB_syscall_fcntl_lock +0x21800080 SMB_syscall_kernel_flock +0x21800084 SMB_syscall_fcntl_getlock +0x21800088 SMB_syscall_readlink +0x2180008c SMB_syscall_symlink +0x21800090 SMB_syscall_link +0x21800094 SMB_syscall_mknod +0x21800098 SMB_syscall_realpath +0x2180009c SMB_syscall_get_quota +0x218000a0 SMB_syscall_set_quota +0x218000a4 SMB_smbmkdir +0x218000a8 SMB_smbrmdir +0x218000ac SMB_smbopen +0x218000b0 SMB_smbcreate +0x218000b4 SMB_smbclose +0x218000b8 SMB_smbflush +0x218000bc SMB_smbunlink +0x218000c0 SMB_smbmv +0x218000c4 SMB_smbgetatr +0x218000c8 SMB_smbsetatr +0x218000cc SMB_smbread +0x218000d0 SMB_smbwrite +0x218000d4 SMB_smblock +0x218000d8 SMB_smbunlock +0x218000dc SMB_smbctemp +0x218000e0 SMB_smbmknew +0x218000e4 SMB_smbcheckpath +0x218000e8 SMB_smbexit +0x218000ec SMB_smblseek +0x218000f0 SMB_smblockread +0x218000f4 SMB_smbwriteunlock +0x218000f8 SMB_smbreadbraw +0x218000fc SMB_smbreadbmpx +0x21800100 SMB_smbreadbs +0x21800104 SMB_smbwritebraw +0x21800108 SMB_smbwritebmpx +0x2180010c SMB_smbwritebs +0x21800110 SMB_smbwritec +0x21800114 SMB_smbsetattre +0x21800118 SMB_smbgetattre +0x2180011c SMB_smblockingx +0x21800120 SMB_smbtrans +0x21800124 SMB_smbtranss +0x21800128 SMB_smbioctl +0x2180012c SMB_smbioctls +0x21800130 SMB_smbcopy +0x21800134 SMB_smbmove +0x21800138 SMB_smbecho +0x2180013c SMB_smbwriteclose +0x21800140 SMB_smbopenx +0x21800144 SMB_smbreadx +0x21800148 SMB_smbwritex +0x2180014c SMB_smbtrans2 +0x21800150 SMB_smbtranss2 +0x21800154 SMB_smbfindclose +0x21800158 SMB_smbfindnclose +0x2180015c SMB_smbtcon +0x21800160 SMB_smbtdis +0x21800164 SMB_smbnegprot +0x21800168 SMB_smbsesssetupx +0x2180016c SMB_smbulogoffx +0x21800170 SMB_smbtconx +0x21800174 SMB_smbdskattr +0x21800178 SMB_smbsearch +0x2180017c SMB_smbffirst +0x21800180 SMB_smbfunique +0x21800184 SMB_smbfclose +0x21800188 SMB_smbnttrans +0x2180018c SMB_smbnttranss +0x21800190 SMB_smbntcreatex +0x21800194 SMB_smbntcancel +0x21800198 SMB_smbntrename +0x2180019c SMB_smbsplopen +0x218001a0 SMB_smbsplwr +0x218001a4 SMB_smbsplclose +0x218001a8 SMB_smbsplretq +0x218001ac SMB_smbsends +0x218001b0 SMB_smbsendb +0x218001b4 SMB_smbfwdname +0x218001b8 SMB_smbcancelf +0x218001bc SMB_smbgetmac +0x218001c0 SMB_smbsendstrt +0x218001c4 SMB_smbsendend +0x218001c8 SMB_smbsendtxt +0x218001cc SMB_smbinvalid +0x218001d0 SMB_pathworks_setdir +0x218001d4 SMB_trans2_open +0x218001d8 SMB_trans2_findfirst +0x218001dc 
SMB_trans2_findnext +0x218001e0 SMB_trans2_qfsinfo +0x218001e4 SMB_trans2_setfsinfo +0x218001e8 SMB_trans2_qpathinfo +0x218001ec SMB_trans2_setpathinfo +0x218001f0 SMB_trans2_qfileinfo +0x218001f4 SMB_trans2_setfileinfo +0x218001f8 SMB_trans2_fsctl +0x218001fc SMB_trans2_ioctl +0x21800200 SMB_trans2_findnotifyfirst +0x21800204 SMB_trans2_findnotifynext +0x21800208 SMB_trans2_mkdir +0x2180020c SMB_trans2_session_setup +0x21800210 SMB_trans2_get_dfs_referral +0x21800214 SMB_trans2_report_dfs_inconsistancy +0x21800218 SMB_nt_transact_create +0x2180021c SMB_nt_transact_ioctl +0x21800220 SMB_nt_transact_set_security_desc +0x21800224 SMB_nt_transact_notify_change +0x21800228 SMB_nt_transact_rename +0x2180022c SMB_nt_transact_query_security_desc +0x21800230 SMB_nt_transact_get_user_quota +0x21800234 SMB_nt_transact_set_user_quota +0x21800238 SMB_get_nt_acl +0x2180023c SMB_fget_nt_acl +0x21800240 SMB_set_nt_acl +0x21800244 SMB_fset_nt_acl +0x21800248 SMB_chmod_acl +0x2180024c SMB_fchmod_acl +0x21800250 SMB_name_release +0x21800254 SMB_name_refresh +0x21800258 SMB_name_registration +0x2180025c SMB_node_status +0x21800260 SMB_name_query +0x21800264 SMB_host_announce +0x21800268 SMB_workgroup_announce +0x2180026c SMB_local_master_announce +0x21800270 SMB_master_browser_announce +0x21800274 SMB_lm_host_announce +0x21800278 SMB_get_backup_list +0x2180027c SMB_reset_browser +0x21800280 SMB_announce_request +0x21800284 SMB_lm_announce_request +0x21800288 SMB_domain_logon +0x2180028c SMB_sync_browse_lists +0x21800290 SMB_run_elections +0x21800294 SMB_election +0x22000004 LAUNCHD_starting +0x22000008 LAUNCHD_exiting +0x2200000c LAUNCHD_finding_stray_pg +0x22000010 LAUNCHD_finding_all_strays +0x22000014 LAUNCHD_finding_execless +0x22000018 LAUNCHD_finding_weird_uids +0x2200001c LAUNCHD_data_pack +0x22000020 LAUNCHD_data_unpack +0x22000024 LAUNCHD_bug +0x22000028 LAUNCHD_mach_ipc +0x2200002c LAUNCHD_bsd_kevent +0x22000030 LAUNCHD_vproc_trans_incr +0x22000034 LAUNCHD_vproc_trans_decr +0xff000104 MSG_mach_notify_port_deleted +0xff000114 MSG_mach_notify_port_destroyed +0xff000118 MSG_mach_notify_no_senders +0xff00011c MSG_mach_notify_send_once +0xff000120 MSG_mach_notify_dead_name +0xff0001ec MSG_audit_triggers +0xff000320 MSG_host_info +0xff000324 MSG_host_kernel_version +0xff000328 MSG_host_page_size +0xff00032c MSG_mach_memory_object_memory_entry +0xff000330 MSG_host_processor_info +0xff000334 MSG_host_get_io_master +0xff000338 MSG_host_get_clock_service +0xff00033c MSG_kmod_get_info +0xff000340 MSG_host_zone_info +0xff000344 MSG_host_virtual_physical_table_info +0xff000348 MSG_host_ipc_hash_info +0xff00034c MSG_enable_bluebox +0xff000350 MSG_disable_bluebox +0xff000354 MSG_processor_set_default +0xff000358 MSG_processor_set_create +0xff00035c MSG_mach_memory_object_memory_entry_64 +0xff000360 MSG_host_statistics +0xff000364 MSG_host_request_notification +0xff000368 MSG_host_lockgroup_info +0xff00036c MSG_host_statistics64 +0xff000370 MSG_mach_zone_info +0xff000640 MSG_host_get_boot_info +0xff000644 MSG_host_reboot +0xff000648 MSG_host_priv_statistics +0xff00064c MSG_host_default_memory_manager +0xff000650 MSG_vm_wire +0xff000654 MSG_thread_wire +0xff000658 MSG_vm_allocate_cpm +0xff00065c MSG_host_processors +0xff000660 MSG_host_get_clock_control +0xff000664 MSG_kmod_create +0xff000668 MSG_kmod_destroy +0xff00066c MSG_kmod_control +0xff000670 MSG_host_get_special_port +0xff000674 MSG_host_set_special_port +0xff000678 MSG_host_set_exception_ports +0xff00067c MSG_host_get_exception_ports +0xff000680 
MSG_host_swap_exception_ports +0xff000684 MSG_host_load_symbol_table +0xff000688 MSG_mach_vm_wire +0xff00068c MSG_host_processor_sets +0xff000690 MSG_host_processor_set_priv +0xff000694 MSG_set_dp_control_port +0xff000698 MSG_get_dp_control_port +0xff00069c MSG_host_set_UNDServer +0xff0006a0 MSG_host_get_UNDServer +0xff0006a4 MSG_kext_request +0xff000960 MSG_host_security_create_task_token +0xff000964 MSG_host_security_set_task_token +0xff000f9c MSG_mach_gss_init_sec_context +0xff000fa0 MSG_clock_get_time +0xff000fa0 MSG_mach_gss_accept_sec_context +0xff000fa4 MSG_clock_get_attributes +0xff000fa4 MSG_mach_gss_log_error +0xff000fa8 MSG_clock_alarm +0xff000fa8 MSG_mach_gss_init_sec_context_v2 +0xff000fac MSG_mach_gss_accept_sec_context_v2 +0xff000fb0 MSG_mach_gss_hold_cred +0xff000fb4 MSG_mach_gss_unhold_cred +0xff000ffc MSG_lockd_request +0xff001000 MSG_lockd_ping +0xff001004 MSG_lockd_shutdown +0xff0012c0 MSG_clock_set_time +0xff0012c4 MSG_clock_set_attributes +0xff001f40 MSG_memory_object_get_attributes +0xff001f44 MSG_memory_object_change_attributes +0xff001f48 MSG_memory_object_synchronize_completed +0xff001f4c MSG_memory_object_lock_request +0xff001f50 MSG_memory_object_destroy +0xff001f54 MSG_memory_object_upl_request +0xff001f58 MSG_memory_object_super_upl_request +0xff001f5c MSG_memory_object_cluster_size +0xff001f60 MSG_memory_object_page_op +0xff001f64 MSG_memory_object_recover_named +0xff001f68 MSG_memory_object_release_name +0xff001f6c MSG_memory_object_range_op +0xff002008 MSG_upl_abort +0xff00200c MSG_upl_abort_range +0xff002010 MSG_upl_commit +0xff002014 MSG_upl_commit_range +0xff002260 MSG_memory_object_init +0xff002264 MSG_memory_object_terminate +0xff002268 MSG_memory_object_data_request +0xff00226c MSG_memory_object_data_return +0xff002270 MSG_memory_object_data_initialize +0xff002274 MSG_memory_object_data_unlock +0xff002278 MSG_memory_object_synchronize +0xff00227c MSG_memory_object_map +0xff002280 MSG_memory_object_last_unmap +0xff002284 MSG_memory_object_data_reclaim +0xff002328 MSG_memory_object_create +0xff00238c MSG_default_pager_object_create +0xff002390 MSG_default_pager_info +0xff002394 MSG_default_pager_objects +0xff002398 MSG_default_pager_object_pages +0xff0023a0 MSG_default_pager_backing_store_create +0xff0023a4 MSG_default_pager_backing_store_delete +0xff0023a8 MSG_default_pager_backing_store_info +0xff0023ac MSG_default_pager_add_file +0xff0023b0 MSG_default_pager_triggers +0xff0023b4 MSG_default_pager_info_64 +0xff0023dc MSG_default_pager_space_alert +0xff002584 MSG_exception_raise +0xff002588 MSG_exception_raise_state +0xff00258c MSG_exception_raise_state_identity +0xff002594 MSG_mach_exception_raise +0xff002598 MSG_mach_exception_raise_state +0xff00259c MSG_mach_exception_raise_state_identity +0xff002bc0 MSG_io_object_get_class +0xff002bc4 MSG_io_object_conforms_to +0xff002bc8 MSG_io_iterator_next +0xff002bcc MSG_io_iterator_reset +0xff002bd0 MSG_io_service_get_matching_services +0xff002bd4 MSG_io_registry_entry_get_property +0xff002bd8 MSG_io_registry_create_iterator +0xff002bdc MSG_io_registry_iterator_enter_entry +0xff002be0 MSG_io_registry_iterator_exit_entry +0xff002be4 MSG_io_registry_entry_from_path +0xff002be8 MSG_io_registry_entry_get_name +0xff002bec MSG_io_registry_entry_get_properties +0xff002bf0 MSG_io_registry_entry_get_property_bytes +0xff002bf4 MSG_io_registry_entry_get_child_iterator +0xff002bf8 MSG_io_registry_entry_get_parent_iterator +0xff002c00 MSG_io_service_close +0xff002c04 MSG_io_connect_get_service +0xff002c08 
MSG_io_connect_set_notification_port +0xff002c0c MSG_io_connect_map_memory +0xff002c10 MSG_io_connect_add_client +0xff002c14 MSG_io_connect_set_properties +0xff002c18 MSG_io_connect_method_scalarI_scalarO +0xff002c1c MSG_io_connect_method_scalarI_structureO +0xff002c20 MSG_io_connect_method_scalarI_structureI +0xff002c24 MSG_io_connect_method_structureI_structureO +0xff002c28 MSG_io_registry_entry_get_path +0xff002c2c MSG_io_registry_get_root_entry +0xff002c30 MSG_io_registry_entry_set_properties +0xff002c34 MSG_io_registry_entry_in_plane +0xff002c38 MSG_io_object_get_retain_count +0xff002c3c MSG_io_service_get_busy_state +0xff002c40 MSG_io_service_wait_quiet +0xff002c44 MSG_io_registry_entry_create_iterator +0xff002c48 MSG_io_iterator_is_valid +0xff002c4c MSG_io_make_matching +0xff002c50 MSG_io_catalog_send_data +0xff002c54 MSG_io_catalog_terminate +0xff002c58 MSG_io_catalog_get_data +0xff002c5c MSG_io_catalog_get_gen_count +0xff002c60 MSG_io_catalog_module_loaded +0xff002c64 MSG_io_catalog_reset +0xff002c68 MSG_io_service_request_probe +0xff002c6c MSG_io_registry_entry_get_name_in_plane +0xff002c70 MSG_io_service_match_property_table +0xff002c74 MSG_io_async_method_scalarI_scalarO +0xff002c78 MSG_io_async_method_scalarI_structureO +0xff002c7c MSG_io_async_method_scalarI_structureI +0xff002c80 MSG_io_async_method_structureI_structureO +0xff002c84 MSG_io_service_add_notification +0xff002c88 MSG_io_service_add_interest_notification +0xff002c8c MSG_io_service_acknowledge_notification +0xff002c90 MSG_io_connect_get_notification_semaphore +0xff002c94 MSG_io_connect_unmap_memory +0xff002c98 MSG_io_registry_entry_get_location_in_plane +0xff002c9c MSG_io_registry_entry_get_property_recursively +0xff002ca0 MSG_io_service_get_state +0xff002ca4 MSG_io_service_get_matching_services_ool +0xff002ca8 MSG_io_service_match_property_table_ool +0xff002cac MSG_io_service_add_notification_ool +0xff002cb0 MSG_io_object_get_superclass +0xff002cb4 MSG_io_object_get_bundle_identifier +0xff002cb8 MSG_io_service_open_extended +0xff002cbc MSG_io_connect_map_memory_into_task +0xff002cc0 MSG_io_connect_unmap_memory_from_task +0xff002cc4 MSG_io_connect_method +0xff002cc8 MSG_io_connect_async_method +0xff002ccc MSG_io_connect_set_notification_port_64 +0xff002cd0 MSG_io_service_add_notification_64 +0xff002cd4 MSG_io_service_add_interest_notification_64 +0xff002cd8 MSG_io_service_add_notification_ool_64 +0xff002cdc MSG_io_registry_entry_get_registry_entry_id +0xff002ee0 MSG_processor_start +0xff002ee4 MSG_processor_exit +0xff002ee8 MSG_processor_info +0xff002eec MSG_processor_control +0xff002ef0 MSG_processor_assign +0xff002ef4 MSG_processor_get_assignment +0xff003200 MSG_mach_port_names +0xff003204 MSG_mach_port_type +0xff003208 MSG_mach_port_rename +0xff00320c MSG_mach_port_allocate_name +0xff003210 MSG_mach_port_allocate +0xff003214 MSG_mach_port_destroy +0xff003218 MSG_mach_port_deallocate +0xff00321c MSG_mach_port_get_refs +0xff003220 MSG_mach_port_mod_refs +0xff003228 MSG_mach_port_set_mscount +0xff00322c MSG_mach_port_get_set_status +0xff003230 MSG_mach_port_move_member +0xff003234 MSG_mach_port_request_notification +0xff003238 MSG_mach_port_insert_right +0xff00323c MSG_mach_port_extract_right +0xff003240 MSG_mach_port_set_seqno +0xff003244 MSG_mach_port_get_attributes +0xff003248 MSG_mach_port_set_attributes +0xff00324c MSG_mach_port_allocate_qos +0xff003250 MSG_mach_port_allocate_full +0xff003254 MSG_task_set_port_space +0xff003258 MSG_mach_port_get_srights +0xff00325c MSG_mach_port_space_info +0xff003260 
MSG_mach_port_dnrequest_info +0xff003264 MSG_mach_port_kernel_object +0xff003268 MSG_mach_port_insert_member +0xff00326c MSG_mach_port_extract_member +0xff003270 MSG_mach_port_get_context +0xff003274 MSG_mach_port_set_context +0xff003278 MSG_mach_port_kobject +0xff003520 MSG_task_create +0xff003524 MSG_task_terminate +0xff003528 MSG_task_threads +0xff00352c MSG_mach_ports_register +0xff003530 MSG_mach_ports_lookup +0xff003534 MSG_task_info +0xff003538 MSG_task_set_info +0xff00353c MSG_task_suspend +0xff003540 MSG_task_resume +0xff003544 MSG_task_get_special_port +0xff003548 MSG_task_set_special_port +0xff00354c MSG_thread_create +0xff003550 MSG_thread_create_running +0xff003554 MSG_task_set_exception_ports +0xff003558 MSG_task_get_exception_ports +0xff00355c MSG_task_swap_exception_ports +0xff003560 MSG_lock_set_create +0xff003564 MSG_lock_set_destroy +0xff003568 MSG_semaphore_create +0xff00356c MSG_semaphore_destroy +0xff003570 MSG_task_policy_set +0xff003574 MSG_task_policy_get +0xff003578 MSG_task_sample +0xff00357c MSG_task_policy +0xff003580 MSG_task_set_emulation +0xff003584 MSG_task_get_emulation_vector +0xff003588 MSG_task_set_emulation_vector +0xff00358c MSG_task_set_ras_pc +0xff003590 MSG_task_zone_info +0xff003594 MSG_task_assign +0xff003598 MSG_task_assign_default +0xff00359c MSG_task_get_assignment +0xff0035a0 MSG_task_set_policy +0xff0035a4 MSG_task_get_state +0xff0035a8 MSG_task_set_state +0xff003840 MSG_thread_terminate +0xff003844 MSG_act_get_state +0xff003848 MSG_act_set_state +0xff00384c MSG_thread_get_state +0xff003850 MSG_thread_set_state +0xff003854 MSG_thread_suspend +0xff003858 MSG_thread_resume +0xff00385c MSG_thread_abort +0xff003860 MSG_thread_abort_safely +0xff003864 MSG_thread_depress_abort +0xff003868 MSG_thread_get_special_port +0xff00386c MSG_thread_set_special_port +0xff003870 MSG_thread_info +0xff003874 MSG_thread_set_exception_ports +0xff003878 MSG_thread_get_exception_ports +0xff00387c MSG_thread_swap_exception_ports +0xff003880 MSG_thread_policy +0xff003884 MSG_thread_policy_set +0xff003888 MSG_thread_policy_get +0xff00388c MSG_thread_sample +0xff003890 MSG_etap_trace_thread +0xff003894 MSG_thread_assign +0xff003898 MSG_thread_assign_default +0xff00389c MSG_thread_get_assignment +0xff0038a0 MSG_thread_set_policy +0xff003b60 MSG_vm_region +0xff003b64 MSG_vm_allocate +0xff003b68 MSG_vm_deallocate +0xff003b6c MSG_vm_protect +0xff003b70 MSG_vm_inherit +0xff003b74 MSG_vm_read +0xff003b78 MSG_vm_read_list +0xff003b7c MSG_vm_write +0xff003b80 MSG_vm_copy +0xff003b84 MSG_vm_read_overwrite +0xff003b88 MSG_vm_msync +0xff003b8c MSG_vm_behavior_set +0xff003b90 MSG_vm_map +0xff003b94 MSG_vm_machine_attribute +0xff003b98 MSG_vm_remap +0xff003b9c MSG_task_wire +0xff003ba0 MSG_mach_make_memory_entry +0xff003ba4 MSG_vm_map_page_query +0xff003ba8 MSG_mach_vm_region_info +0xff003bac MSG_vm_mapped_pages_info +0xff003bb4 MSG_vm_region_recurse +0xff003bb8 MSG_vm_region_recurse_64 +0xff003bbc MSG_mach_vm_region_info_64 +0xff003bc0 MSG_vm_region_64 +0xff003bc4 MSG_mach_make_memory_entry_64 +0xff003bc8 MSG_vm_map_64 +0xff003bcc MSG_vm_map_get_upl +0xff003bd8 MSG_vm_purgable_control +0xff003e80 MSG_processor_set_statistics +0xff003e84 MSG_processor_set_destroy +0xff003e88 MSG_processor_set_max_priority +0xff003e8c MSG_processor_set_policy_enable +0xff003e90 MSG_processor_set_policy_disable +0xff003e94 MSG_processor_set_tasks +0xff003e98 MSG_processor_set_threads +0xff003e9c MSG_processor_set_policy_control +0xff003ea0 MSG_processor_set_stack_usage +0xff003ea4 
MSG_processor_set_info +0xff004b00 MSG_mach_vm_allocate +0xff004b04 MSG_mach_vm_deallocate +0xff004b08 MSG_mach_vm_protect +0xff004b0c MSG_mach_vm_inherit +0xff004b10 MSG_mach_vm_read +0xff004b14 MSG_mach_vm_read_list +0xff004b18 MSG_mach_vm_write +0xff004b1c MSG_mach_vm_copy +0xff004b20 MSG_mach_vm_read_overwrite +0xff004b24 MSG_mach_vm_msync +0xff004b28 MSG_mach_vm_behavior_set +0xff004b2c MSG_mach_vm_map +0xff004b30 MSG_mach_vm_machine_attribute +0xff004b34 MSG_mach_vm_remap +0xff004b38 MSG_mach_vm_page_query +0xff004b3c MSG_mach_vm_region_recurse +0xff004b40 MSG_mach_vm_region +0xff004b44 MSG__mach_make_memory_entry +0xff004b48 MSG_mach_vm_purgable_control +0xff004b4c MSG_mach_vm_page_info +0xff004e20 MSG_ledger_create +0xff004e24 MSG_ledger_terminate +0xff004e28 MSG_ledger_transfer +0xff004e2c MSG_ledger_read +0xff005140 MSG_mach_get_task_label +0xff005144 MSG_mach_get_task_label_text +0xff005148 MSG_mach_get_label +0xff00514c MSG_mach_get_label_text +0xff005150 MSG_mach_set_port_label +0xff005154 MSG_mac_check_service +0xff005158 MSG_mac_port_check_service_obj +0xff00515c MSG_mac_port_check_access +0xff005160 MSG_mac_label_new +0xff005164 MSG_mac_request_label +0xff005dc0 MSG_UNDExecute_rpc +0xff005dc4 MSG_UNDDisplayNoticeFromBundle_rpc +0xff005dc8 MSG_UNDDisplayAlertFromBundle_rpc +0xff005dcc MSG_UNDDisplayCustomFromBundle_rpc +0xff005dd0 MSG_UNDDisplayCustomFromDictionary_rpc +0xff005dd4 MSG_UNDCancelNotification_rpc +0xff005dd8 MSG_UNDDisplayNoticeSimple_rpc +0xff005ddc MSG_UNDDisplayAlertSimple_rpc +0xff0060e0 MSG_UNDAlertCompletedWithResult_rpc +0xff0060e4 MSG_UNDNotificationCreated_rpc +0xff01a5e0 MSG_check_task_access +0xff01a5e4 MSG_find_code_signature +0xff04b320 MSG_kextd_ping +0xff25a8a0 MSG_lock_acquire +0xff25a8a4 MSG_lock_release +0xff25a8a8 MSG_lock_try +0xff25a8ac MSG_lock_make_stable +0xff25a8b0 MSG_lock_handoff +0xff25a8b4 MSG_lock_handoff_accept +0xff25abc0 MSG_semaphore_signal +0xff25abc4 MSG_semaphore_signal_all +0xff25abc8 MSG_semaphore_wait +0xff25abcc MSG_semaphore_signal_thread +0xff25abd0 MSG_semaphore_timedwait +0xff25abd4 MSG_semaphore_wait_signal +0xff25abd8 MSG_semaphore_timedwait_signal +0xffbebdcc MSG_clock_alarm_reply
diff --git a/bsd/kern/tty.c b/bsd/kern/tty.c
index 841781612..b97eb780e 100644
--- a/bsd/kern/tty.c
+++ b/bsd/kern/tty.c
@@ -1498,6 +1498,8 @@ out:
 int
 ttyselect(struct tty *tp, int rw, void *wql, proc_t p)
 {
+	int retval = 0;
+
 	if (tp == NULL)
 		return (ENXIO);
 
@@ -1505,20 +1507,32 @@ ttyselect(struct tty *tp, int rw, void *wql, proc_t p)
 
 	switch (rw) {
 	case FREAD:
-		if (ttnread(tp) > 0 || ISSET(tp->t_state, TS_ZOMBIE))
+		if (ISSET(tp->t_state, TS_ZOMBIE)) {
 			return(1);
+		}
+
+		retval = ttnread(tp);
+		if (retval > 0) {
+			break;
+		}
+
 		selrecord(p, &tp->t_rsel, wql);
 		break;
 	case FWRITE:
-		if ((tp->t_outq.c_cc <= tp->t_lowat &&
-		    ISSET(tp->t_state, TS_CONNECTED))
-		    || ISSET(tp->t_state, TS_ZOMBIE)) {
-			return (1);
+		if (ISSET(tp->t_state, TS_ZOMBIE)) {
+			return(1);
 		}
+
+		if ((tp->t_outq.c_cc <= tp->t_lowat) &&
+		    ISSET(tp->t_state, TS_CONNECTED)) {
+			retval = tp->t_hiwat - tp->t_outq.c_cc;
+			break;
+		}
+
 		selrecord(p, &tp->t_wsel, wql);
 		break;
 	}
-	return (0);
+	return retval;
 }
 
@@ -3040,6 +3054,12 @@ ttyfree(struct tty *tp)
 {
 	TTY_LOCK_NOTOWNED(tp);	/* debug assert */
 
+#if DEBUG
+	if (!(SLIST_EMPTY(&tp->t_rsel.si_note) && SLIST_EMPTY(&tp->t_wsel.si_note))) {
+		panic("knotes hooked into a tty when the tty is freed.\n");
+	}
+#endif /* DEBUG */
+
 	clfree(&tp->t_rawq);
 	clfree(&tp->t_canq);
 	clfree(&tp->t_outq);
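/*
 * [Editorial sketch, not part of the original patch.] The ttyselect()
 * rework above stops collapsing readiness to a bare 1: for FREAD it now
 * returns the ttnread() byte count, and for FWRITE the remaining output
 * headroom (t_hiwat - t_outq.c_cc). A minimal stand-alone restatement of
 * the FWRITE arithmetic, with hypothetical names:
 */
#include <stdbool.h>

struct outq_state {
	int c_cc;	/* bytes currently queued for output */
	int lowat;	/* low-water mark: writable at or below this */
	int hiwat;	/* high-water mark: nominal queue capacity */
};

/* Returns 0 when not writable, else the byte count a caller may queue. */
static int
tty_writable_space(const struct outq_state *q, bool connected)
{
	if (!connected || q->c_cc > q->lowat)
		return 0;
	return q->hiwat - q->c_cc;
}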
diff --git a/bsd/kern/tty_ptmx.c b/bsd/kern/tty_ptmx.c
index 19c8e5bcc..a0be4feb5 100644
--- a/bsd/kern/tty_ptmx.c
+++ b/bsd/kern/tty_ptmx.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997-2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1997-2010 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
  *
@@ -178,7 +178,7 @@ _devfs_setattr(void * handle, unsigned short mode, uid_t uid, gid_t gid)
 	char name[128];
 
 	snprintf(name, sizeof(name), "/dev/%s", direntp->de_name);
-	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, CAST_USER_ADDR_T(name), ctx);
+	NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW, UIO_SYSSPACE, CAST_USER_ADDR_T(name), ctx);
 	error = namei(&nd);
 	if (error)
 		goto out;
@@ -229,7 +229,7 @@ sysctl_ptmx_max(__unused struct sysctl_oid *oidp, __unused void *arg1,
 SYSCTL_NODE(_kern, KERN_TTY, tty, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "TTY");
 SYSCTL_PROC(_kern_tty, OID_AUTO, ptmx_max,
-		CTLTYPE_INT | CTLFLAG_RW,
+		CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
 		&ptmx_max, 0, &sysctl_ptmx_max, "I", "ptmx_max");
 
@@ -259,6 +259,39 @@ struct ptmx_ioctl {
 
 static int ptmx_clone(dev_t dev, int minor);
 
+/*
+ * Set of locks to keep the interaction between kevents and revoke
+ * from causing havoc.
+ */
+
+#define LOG2_PTSD_KE_NLCK	2
+#define PTSD_KE_NLCK		(1l << LOG2_PTSD_KE_NLCK)
+#define PTSD_KE_LOCK_INDEX(x)	((x) & (PTSD_KE_NLCK - 1))
+
+static lck_mtx_t ptsd_kevent_lock[PTSD_KE_NLCK];
+
+static void
+ptsd_kevent_lock_init(void)
+{
+	int i;
+	lck_grp_t *lgrp = lck_grp_alloc_init("ptsd kevent", LCK_GRP_ATTR_NULL);
+
+	for (i = 0; i < PTSD_KE_NLCK; i++)
+		lck_mtx_init(&ptsd_kevent_lock[i], lgrp, LCK_ATTR_NULL);
+}
+
+static void
+ptsd_kevent_mtx_lock(int minor)
+{
+	lck_mtx_lock(&ptsd_kevent_lock[PTSD_KE_LOCK_INDEX(minor)]);
+}
+
+static void
+ptsd_kevent_mtx_unlock(int minor)
+{
+	lck_mtx_unlock(&ptsd_kevent_lock[PTSD_KE_LOCK_INDEX(minor)]);
+}
+
 int
 ptmx_init( __unused int config_count)
 {
@@ -273,12 +306,25 @@ ptmx_init( __unused int config_count)
 		return (ENOENT);
 	}
 
+	if (cdevsw_setkqueueok(ptmx_major, &ptmx_cdev, 0) == -1) {
+		panic("Failed to set flags on ptmx cdevsw entry.");
+	}
+
 	/* Get a major number for /dev/pts/nnn */
 	if ((ptsd_major = cdevsw_add(-15, &ptsd_cdev)) == -1) {
 		(void)cdevsw_remove(ptmx_major, &ptmx_cdev);
 		printf("ptmx_init: failed to obtain /dev/ptmx major number\n");
 		return (ENOENT);
 	}
+
+	if (cdevsw_setkqueueok(ptsd_major, &ptsd_cdev, 0) == -1) {
+		panic("Failed to set flags on ptmx cdevsw entry.");
+	}
+
+	/*
+	 * Locks to guard against races between revoke and kevents
+	 */
+	ptsd_kevent_lock_init();
 
 	/* Create the /dev/ptmx device {<major>,0} */
 	(void)devfs_make_node_clone(makedev(ptmx_major, 0),
@@ -549,6 +595,7 @@ ptsd_open(dev_t dev, int flag, __unused int devtype, __unused proc_t p)
 	error = (*linesw[tp->t_line].l_open)(dev, tp);
 	/* Successful open; mark as open by the slave */
 	pti->pt_flags |= PF_OPEN_S;
+	CLR(tp->t_state, TS_IOCTL_NOT_OK);
 	if (error == 0)
 		ptmx_wakeup(tp, FREAD|FWRITE);
 out:
@@ -556,6 +603,8 @@ out:
 	return (error);
 }
 
+static void ptsd_revoke_knotes(dev_t, struct tty *);
+
 FREE_BSDSTATIC int
 ptsd_close(dev_t dev, int flag, __unused int mode, __unused proc_t p)
 {
@@ -587,9 +636,11 @@ ptsd_close(dev_t dev, int flag, __unused int mode, __unused proc_t p)
 #ifdef FIX_VSX_HANG
 	tp->t_timeout = save_timeout;
 #endif
-
 	tty_unlock(tp);
+	if ((flag & IO_REVOKE) == IO_REVOKE)
+		ptsd_revoke_knotes(dev, tp);
+
 	/* unconditional, just like ttyclose() */
 	ptmx_free_ioctl(minor(dev), PF_OPEN_S);
 
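/*
 * [Editorial sketch, not part of the original patch.] The PTSD_KE_LOCK_INDEX
 * scheme above is plain lock striping: a pty's minor number is hashed by its
 * low bits onto a small fixed pool of mutexes, so revoke/kevent races are
 * serialized per device without allocating a lock per pty. The same pattern
 * in user-space pthreads, with hypothetical names:
 */
#include <pthread.h>

#define LOG2_NSTRIPES	2
#define NSTRIPES	(1 << LOG2_NSTRIPES)
#define STRIPE(x)	((x) & (NSTRIPES - 1))

static pthread_mutex_t stripe_lock[NSTRIPES] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

/* Two minors that share low bits share a lock; that aliasing is the cost. */
static void
stripe_lock_minor(int minor_no)
{
	pthread_mutex_lock(&stripe_lock[STRIPE(minor_no)]);
}

static void
stripe_unlock_minor(int minor_no)
{
	pthread_mutex_unlock(&stripe_lock[STRIPE(minor_no)]);
}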
@@ -786,6 +837,7 @@ ptmx_open(dev_t dev, __unused int flag, __unused int devtype, __unused proc_t p)
 	}
 	tp->t_oproc = ptsd_start;
 	CLR(tp->t_state, TS_ZOMBIE);
+	SET(tp->t_state, TS_IOCTL_NOT_OK);
 #ifdef sun4c
 	tp->t_stop = ptsd_stop;
 #endif
@@ -1000,19 +1052,30 @@ ptsd_select(dev_t dev, int rw, void *wql, proc_t p)
 
 	switch (rw) {
 	case FREAD:
-		if (ttnread(tp) > 0 || ISSET(tp->t_state, TS_ZOMBIE)) {
+		if (ISSET(tp->t_state, TS_ZOMBIE)) {
 			retval = 1;
 			break;
 		}
+
+		retval = ttnread(tp);
+		if (retval > 0) {
+			break;
+		}
+
 		selrecord(p, &tp->t_rsel, wql);
 		break;
 	case FWRITE:
-		if ((tp->t_outq.c_cc <= tp->t_lowat &&
-		    ISSET(tp->t_state, TS_CONNECTED))
-		    || ISSET(tp->t_state, TS_ZOMBIE)) {
+		if (ISSET(tp->t_state, TS_ZOMBIE)) {
 			retval = 1;
 			break;
 		}
+
+		if ((tp->t_outq.c_cc <= tp->t_lowat) &&
+		    ISSET(tp->t_state, TS_CONNECTED)) {
+			retval = tp->t_hiwat - tp->t_outq.c_cc;
+			break;
+		}
+
 		selrecord(p, &tp->t_wsel, wql);
 		break;
 	}
@@ -1044,7 +1107,7 @@ ptmx_select(dev_t dev, int rw, void *wql, proc_t p)
 		 */
 		if ((tp->t_state&TS_ISOPEN) &&
 		    tp->t_outq.c_cc && (tp->t_state&TS_TTSTOP) == 0) {
-			retval = 1;
+			retval = tp->t_outq.c_cc;
 			break;
 		}
 		/* FALLTHROUGH */
@@ -1063,18 +1126,19 @@ ptmx_select(dev_t dev, int rw, void *wql, proc_t p)
 		if (tp->t_state&TS_ISOPEN) {
 			if (pti->pt_flags & PF_REMOTE) {
 				if (tp->t_canq.c_cc == 0) {
-					retval = 1;
+					retval = (TTYHOG -1) ;
 					break;
 				}
 			} else {
-				if (tp->t_rawq.c_cc + tp->t_canq.c_cc < TTYHOG-2) {
-					retval = 1;
+				retval = (TTYHOG - 2) - (tp->t_rawq.c_cc + tp->t_canq.c_cc);
+				if (retval > 0) {
 					break;
 				}
 				if (tp->t_canq.c_cc == 0 && (tp->t_lflag&ICANON)) {
 					retval = 1;
 					break;
 				}
+				retval = 0;
 			}
 		}
 		selrecord(p, &pti->pt_selw, wql);
@@ -1225,6 +1289,7 @@ cptyioctl(dev_t dev, u_long cmd, caddr_t data, int flag, proc_t p)
 	struct ptmx_ioctl *pti;
 	u_char *cc;
 	int stop, error = 0;
+	int allow_ext_ioctl = 1;
 
 	pti = ptmx_get_ioctl(minor(dev), 0);
 
@@ -1233,11 +1298,18 @@ cptyioctl(dev_t dev, u_long cmd, caddr_t data, int flag, proc_t p)
 	cc = tp->t_cc;
 
+	/*
+	 * Do not permit extended ioctls on the master side of the pty unless
+	 * the slave side has been successfully opened and initialized.
+	 */
+	if (cdevsw[major(dev)].d_open == ptmx_open && ISSET(tp->t_state, TS_IOCTL_NOT_OK))
+		allow_ext_ioctl = 0;
+
 	/*
 	 * IF CONTROLLER STTY THEN MUST FLUSH TO PREVENT A HANG.
 	 * ttywflush(tp) will hang if there are characters in the outq.
 	 */
-	if (cmd == TIOCEXT) {
+	if (cmd == TIOCEXT && allow_ext_ioctl) {
 		/*
 		 * When the EXTPROC bit is being toggled, we need
 		 * to send an TIOCPKT_IOCTL if the packet driver
@@ -1259,7 +1331,7 @@ cptyioctl(dev_t dev, u_long cmd, caddr_t data, int flag, proc_t p)
 		}
 		goto out;
 	} else
-	if (cdevsw[major(dev)].d_open == ptmx_open)
+	if (cdevsw[major(dev)].d_open == ptmx_open) {
 		switch (cmd) {
 
 		case TIOCGPGRP:
@@ -1363,6 +1435,17 @@ cptyioctl(dev_t dev, u_long cmd, caddr_t data, int flag, proc_t p)
 			error = 0;
 			goto out;
 		}
+
+		/*
+		 * Fail all other calls; pty masters are not serial devices;
+		 * we only pretend they are when the slave side of the pty is
+		 * already open.
+		 */
+		if (!allow_ext_ioctl) {
+			error = ENOTTY;
+			goto out;
+		}
+	}
 	error = (*linesw[tp->t_line].l_ioctl)(tp, cmd, data, flag, p);
 	if (error == ENOTTY) {
 		error = ttioctl_locked(tp, cmd, data, flag, p);
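/*
 * [Editorial sketch, not part of the original patch.] The ptmx_select()
 * changes above likewise report byte counts: how much the master may write
 * before the slave's input queues fill. The arithmetic, restated with
 * hypothetical names (the TTYHOG value below is an assumption; the kernel
 * defines its own):
 */
#include <stdbool.h>

#define TTYHOG 1024	/* assumed input-queue bound for illustration */

static int
master_write_space(int rawq_cc, int canq_cc, bool remote, bool icanon)
{
	int space;

	/* Remote mode: one record at a time; writable only when canq is empty. */
	if (remote)
		return (canq_cc == 0) ? TTYHOG - 1 : 0;

	/* Normal mode: room left under the TTYHOG-2 ceiling, if any. */
	space = (TTYHOG - 2) - (rawq_cc + canq_cc);
	if (space > 0)
		return space;

	/* Canonical mode can still accept a line terminator. */
	return (canq_cc == 0 && icanon) ? 1 : 0;
}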
@@ -1440,127 +1523,110 @@ out:
  * kqueue support.
  */
 int ptsd_kqfilter(dev_t, struct knote *);
-static void ptsd_kqops_read_detach(struct knote *);
-static int ptsd_kqops_read_event(struct knote *, long);
-static void ptsd_kqops_write_detach(struct knote *);
-static int ptsd_kqops_write_event(struct knote *, long);
+static void ptsd_kqops_detach(struct knote *);
+static int ptsd_kqops_event(struct knote *, long);
 
-static struct filterops ptsd_kqops_read = {
+static struct filterops ptsd_kqops = {
 	.f_isfd = 1,
-	.f_detach = ptsd_kqops_read_detach,
-	.f_event = ptsd_kqops_read_event,
+	.f_detach = ptsd_kqops_detach,
+	.f_event = ptsd_kqops_event,
 };
-static struct filterops ptsd_kqops_write = {
-	.f_isfd = 1,
-	.f_detach = ptsd_kqops_write_detach,
-	.f_event = ptsd_kqops_write_event,
-};
 
-static void
-ptsd_kqops_read_detach(struct knote *kn)
-{
-	struct ptmx_ioctl *pti;
-	struct tty *tp;
-	dev_t dev = (dev_t) kn->kn_hookid;
-
-	pti = ptmx_get_ioctl(minor(dev), 0);
-	tp = pti->pt_tty;
+#define PTSD_KNOTE_VALID	NULL
+#define PTSD_KNOTE_REVOKED	((void *)-911l)
 
-	if (tp == NULL)
-		return;
-
-	tty_lock(tp);
-	KNOTE_DETACH(&tp->t_rsel.si_note, kn);
-	tty_unlock(tp);
-
-	kn->kn_hookid = 0;
-}
+/*
+ * In the normal case, by the time the driver_close() routine is called
+ * on the slave, all knotes have been detached. However in the revoke(2)
+ * case, the driver's close routine is called while there are knotes active
+ * that reference the handlers below. And we have no obvious means to
+ * reach from the driver out to the kqueue's that reference them to get
+ * them to stop.
+ */
 
-static int
-ptsd_kqops_read_event(struct knote *kn, long hint)
+static void
+ptsd_kqops_detach(struct knote *kn)
 {
 	struct ptmx_ioctl *pti;
 	struct tty *tp;
-	dev_t dev = (dev_t) kn->kn_hookid;
-	int retval = 0;
-
-	pti = ptmx_get_ioctl(minor(dev), 0);
-	tp = pti->pt_tty;
-
-	if (tp == NULL)
-		return (ENXIO);
-
-	if (hint == 0)
-		tty_lock(tp);
+	dev_t dev, lockdev = (dev_t)kn->kn_hookid;
 
-	kn->kn_data = ttnread(tp);
-	if (kn->kn_data > 0) {
-		retval = 1;
-	}
+	ptsd_kevent_mtx_lock(minor(lockdev));
 
-	if (ISSET(tp->t_state, TS_ZOMBIE)) {
-		kn->kn_flags |= EV_EOF;
-		retval = 1;
+	if ((dev = (dev_t)kn->kn_hookid) != 0) {
+		pti = ptmx_get_ioctl(minor(dev), 0);
+		if (pti != NULL && (tp = pti->pt_tty) != NULL) {
+			tty_lock(tp);
+			if (kn->kn_filter == EVFILT_READ)
+				KNOTE_DETACH(&tp->t_rsel.si_note, kn);
+			else
+				KNOTE_DETACH(&tp->t_wsel.si_note, kn);
+			tty_unlock(tp);
+			kn->kn_hookid = 0;
+		}
 	}
 
-	if (hint == 0)
-		tty_unlock(tp);
-	return (retval);
-}
-static void
-ptsd_kqops_write_detach(struct knote *kn)
-{
-	struct ptmx_ioctl *pti;
-	struct tty *tp;
-	dev_t dev = (dev_t) kn->kn_hookid;
-
-	pti = ptmx_get_ioctl(minor(dev), 0);
-	tp = pti->pt_tty;
-
-	if (tp == NULL)
-		return;
-
-	tty_lock(tp);
-	KNOTE_DETACH(&tp->t_wsel.si_note, kn);
-	tty_unlock(tp);
-
-	kn->kn_hookid = 0;
+	ptsd_kevent_mtx_unlock(minor(lockdev));
 }
 
 static int
-ptsd_kqops_write_event(struct knote *kn, long hint)
+ptsd_kqops_event(struct knote *kn, long hint)
 {
 	struct ptmx_ioctl *pti;
 	struct tty *tp;
-	dev_t dev = (dev_t) kn->kn_hookid;
+	dev_t dev = (dev_t)kn->kn_hookid;
 	int retval = 0;
 
-	pti = ptmx_get_ioctl(minor(dev), 0);
-	tp = pti->pt_tty;
+	ptsd_kevent_mtx_lock(minor(dev));
 
-	if (tp == NULL)
-		return (ENXIO);
+	do {
+		if (kn->kn_hook != PTSD_KNOTE_VALID ) {
+			/* We were revoked */
+			kn->kn_data = 0;
+			kn->kn_flags |= EV_EOF;
+			retval = 1;
+			break;
+		}
 
-	if (hint == 0)
-		tty_lock(tp);
+		pti = ptmx_get_ioctl(minor(dev), 0);
+		if (pti == NULL || (tp = pti->pt_tty) == NULL) {
+			kn->kn_data = ENXIO;
kn->kn_flags |= EV_ERROR; + retval = 1; + break; + } - if ((tp->t_outq.c_cc <= tp->t_lowat) && - ISSET(tp->t_state, TS_CONNECTED)) { - kn->kn_data = tp->t_outq.c_cn - tp->t_outq.c_cc; - retval = 1; - } + if (hint == 0) + tty_lock(tp); - if (ISSET(tp->t_state, TS_ZOMBIE)) { - kn->kn_flags |= EV_EOF; - retval = 1; - } + if (kn->kn_filter == EVFILT_READ) { + kn->kn_data = ttnread(tp); + if (kn->kn_data > 0) + retval = 1; + if (ISSET(tp->t_state, TS_ZOMBIE)) { + kn->kn_flags |= EV_EOF; + retval = 1; + } + } else { /* EVFILT_WRITE */ + if ((tp->t_outq.c_cc <= tp->t_lowat) && + ISSET(tp->t_state, TS_CONNECTED)) { + kn->kn_data = tp->t_outq.c_cn - tp->t_outq.c_cc; + retval = 1; + } + if (ISSET(tp->t_state, TS_ZOMBIE)) { + kn->kn_flags |= EV_EOF; + retval = 1; + } + } - if (hint == 0) - tty_unlock(tp); - return (retval); + if (hint == 0) + tty_unlock(tp); + } while (0); -} + ptsd_kevent_mtx_unlock(minor(dev)); + return (retval); +} int ptsd_kqfilter(dev_t dev, struct knote *kn) { @@ -1581,14 +1647,14 @@ ptsd_kqfilter(dev_t dev, struct knote *kn) tty_lock(tp); kn->kn_hookid = dev; + kn->kn_hook = PTSD_KNOTE_VALID; + kn->kn_fop = &ptsd_kqops; switch (kn->kn_filter) { case EVFILT_READ: - kn->kn_fop = &ptsd_kqops_read; KNOTE_ATTACH(&tp->t_rsel.si_note, kn); break; case EVFILT_WRITE: - kn->kn_fop = &ptsd_kqops_write; KNOTE_ATTACH(&tp->t_wsel.si_note, kn); break; default: @@ -1600,3 +1666,59 @@ ptsd_kqfilter(dev_t dev, struct knote *kn) return (retval); } +/* + * Support for revoke(2). + * + * Mark all the kn_hook fields so that future invocations of the + * f_event op will just say "EOF" *without* looking at the + * ptmx_ioctl structure (which may disappear or be recycled at + * the end of ptsd_close). Issue wakeups to post that EOF to + * anyone listening. And finally remove the knotes from the + * tty's klists to keep ttyclose() happy, and set the hookid to + * zero to make the final detach passively successful. + */ +static void +ptsd_revoke_knotes(dev_t dev, struct tty *tp) +{ + struct klist *list; + struct knote *kn, *tkn; + + /* (Hold and drop the right locks in the right order.) 
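+	 * Concretely: take ptsd_kevent_mtx before the tty lock (the same
+	 * order used by ptsd_kqops_detach/_event above), drop both before
+	 * posting the wakeups, then retake them for the final detach pass,
+	 * so that ttwakeup()/ttwwakeup() never run with the kevent mutex
+	 * held.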
*/ + + ptsd_kevent_mtx_lock(minor(dev)); + tty_lock(tp); + + list = &tp->t_rsel.si_note; + SLIST_FOREACH(kn, list, kn_selnext) + kn->kn_hook = PTSD_KNOTE_REVOKED; + + list = &tp->t_wsel.si_note; + SLIST_FOREACH(kn, list, kn_selnext) + kn->kn_hook = PTSD_KNOTE_REVOKED; + + tty_unlock(tp); + ptsd_kevent_mtx_unlock(minor(dev)); + + tty_lock(tp); + ttwakeup(tp); + ttwwakeup(tp); + tty_unlock(tp); + + ptsd_kevent_mtx_lock(minor(dev)); + tty_lock(tp); + + list = &tp->t_rsel.si_note; + SLIST_FOREACH_SAFE(kn, list, kn_selnext, tkn) { + (void) KNOTE_DETACH(list, kn); + kn->kn_hookid = 0; + } + + list = &tp->t_wsel.si_note; + SLIST_FOREACH_SAFE(kn, list, kn_selnext, tkn) { + (void) KNOTE_DETACH(list, kn); + kn->kn_hookid = 0; + } + + tty_unlock(tp); + ptsd_kevent_mtx_unlock(minor(dev)); +} diff --git a/bsd/kern/tty_subr.c b/bsd/kern/tty_subr.c index c2abc010d..89bc09fe0 100644 --- a/bsd/kern/tty_subr.c +++ b/bsd/kern/tty_subr.c @@ -340,7 +340,9 @@ clrbits(u_char *cp, int off, int len) cp[sby++] &= mask; mask = (1<<ebi) - 1; - cp[eby] &= ~mask; + /* handle remainder bits, if any, for a non-0 ebi value */ + if (mask) + cp[eby] &= ~mask; for (i = sby; i < eby; i++) cp[i] = 0x00; diff --git a/bsd/kern/tty_tty.c b/bsd/kern/tty_tty.c index 29868fadc..ec7bee445 100644 --- a/bsd/kern/tty_tty.c +++ b/bsd/kern/tty_tty.c @@ -82,9 +82,8 @@ int cttyioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, proc_t p); int cttyselect(dev_t dev, int flag, void* wql, proc_t p); static vnode_t cttyvp(proc_t p); - int -cttyopen(__unused dev_t dev, int flag, __unused int mode, proc_t p) +cttyopen(dev_t dev, int flag, __unused int mode, proc_t p) { vnode_t ttyvp = cttyvp(p); struct vfs_context context; @@ -96,7 +95,18 @@ cttyopen(__unused dev_t dev, int flag, __unused int mode, proc_t p) context.vc_thread = current_thread(); context.vc_ucred = kauth_cred_proc_ref(p); + /* + * A little hack--this device, used by many processes, + * happens to do an open on another device, which can + * cause unhappiness if the second-level open blocks indefinitely + * (as could be the case if the master side has hung up). Since + * we know that this driver doesn't care about the serializing + * opens and closes, we can drop the lock. 
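+	 * The vnode reference obtained through cttyvp() keeps ttyvp valid
+	 * while the devsw lock is dropped, and vnode_put() below releases
+	 * it once the open has completed either way.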
+ */ + devsw_unlock(dev, S_IFCHR); error = VNOP_OPEN(ttyvp, flag, &context); + devsw_lock(dev, S_IFCHR); + vnode_put(ttyvp); kauth_cred_unref(&context.vc_ucred); diff --git a/bsd/kern/ubc_subr.c b/bsd/kern/ubc_subr.c index 473ae0a78..c7661e41b 100644 --- a/bsd/kern/ubc_subr.c +++ b/bsd/kern/ubc_subr.c @@ -74,6 +74,8 @@ extern kern_return_t memory_object_pages_resident(memory_object_control_t, boolean_t *); extern kern_return_t memory_object_signed(memory_object_control_t control, boolean_t is_signed); +extern boolean_t memory_object_is_slid(memory_object_control_t control); + extern void Debugger(const char *message); @@ -140,9 +142,11 @@ enum { CSMAGIC_CODEDIRECTORY = 0xfade0c02, /* CodeDirectory blob */ CSMAGIC_EMBEDDED_SIGNATURE = 0xfade0cc0, /* embedded form of signature data */ CSMAGIC_EMBEDDED_SIGNATURE_OLD = 0xfade0b02, /* XXX */ + CSMAGIC_EMBEDDED_ENTITLEMENTS = 0xfade7171, /* embedded entitlements */ CSMAGIC_DETACHED_SIGNATURE = 0xfade0cc1, /* multi-arch collection of embedded signatures */ CSSLOT_CODEDIRECTORY = 0, /* slot index for CodeDirectory */ + CSSLOT_ENTITLEMENTS = 5 }; static const uint32_t supportsScatter = 0x20100; // first version to support scatter option @@ -163,6 +167,12 @@ typedef struct __SuperBlob { /* followed by Blobs in no particular order as indicated by offsets in index */ } CS_SuperBlob; +typedef struct __GenericBlob { + uint32_t magic; /* magic number */ + uint32_t length; /* total length of blob */ + char data[]; +} CS_GenericBlob; + struct Scatter { uint32_t count; // number of pages; zero for sentinel (only) uint32_t base; // first page number @@ -353,6 +363,113 @@ hashes( * End of routines to navigate code signing data structures in the kernel. */ +/* + * ENTITLEMENTS + * Routines to navigate entitlements in the kernel. + */ + +/* Retrieve the entitlements blob for a process. + * Returns: + * EINVAL no text vnode associated with the process + * EBADEXEC invalid code signing data + * ENOMEM you should reboot + * 0 no error occurred + * + * On success, out_start and out_length will point to the + * entitlements blob if found; or will be set to NULL/zero + * if there were no entitlements. 
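+ * A hypothetical caller (names illustrative, not part of this change)
+ * might consume the result as:
+ *
+ *	void *blob;
+ *	size_t len;
+ *	if (0 == cs_entitlements_blob_get(p, &blob, &len) && NULL != blob) {
+ *		const CS_GenericBlob *gb = blob;
+ *		... consume ntohl(gb->length) bytes starting at gb->data ...
+ *	}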
+ */ +int +cs_entitlements_blob_get(proc_t p, void **out_start, size_t *out_length) +{ + SHA1_CTX context; /* XXX hash agility */ + int error = 0; + struct cs_blob *blob_list_entry; + CS_SuperBlob *super_blob; + CS_BlobIndex *blob_index; + CS_GenericBlob *blob; + CS_CodeDirectory *code_dir; + unsigned char *computed_hash = NULL; + unsigned char *embedded_hash = NULL; + void *start = NULL; + size_t length = 0; + size_t hash_size = 0; + unsigned int i, count; + + if (NULL == p->p_textvp) { + error = EINVAL; + goto out; + } + if (NULL == (blob_list_entry = ubc_cs_blob_get(p->p_textvp, -1, + p->p_textoff))) + goto out; + super_blob = (void *)blob_list_entry->csb_mem_kaddr; + if (CSMAGIC_EMBEDDED_SIGNATURE != ntohl(super_blob->magic)) { + error = EBADEXEC; + goto out; + } + count = ntohl(super_blob->count); + for (i = 0; i < count; ++i) { + blob_index = &super_blob->index[i]; + blob = (void *)((char *)super_blob + ntohl(blob_index->offset)); + switch (ntohl(blob_index->type)) { + case CSSLOT_CODEDIRECTORY: + if (CSMAGIC_CODEDIRECTORY != ntohl(blob->magic)) + break; + code_dir = (void *)blob; + hash_size = code_dir->hashSize; + if (CSSLOT_ENTITLEMENTS <= + ntohl(code_dir->nSpecialSlots)) { + embedded_hash = (void *)((char *)code_dir + + ntohl(code_dir->hashOffset) - + (hash_size * CSSLOT_ENTITLEMENTS)); + } + break; + case CSSLOT_ENTITLEMENTS: + if (CSMAGIC_EMBEDDED_ENTITLEMENTS != ntohl(blob->magic)) + break; + start = (void *)blob; + length = ntohl(blob->length); + break; + default: + break; + } + } + if (NULL == start && NULL == embedded_hash) { + error = 0; + goto out; + } else if (NULL == start || NULL == embedded_hash) { + error = EBADEXEC; + goto out; + } + if (NULL == (computed_hash = kalloc(hash_size))) { + error = ENOMEM; + goto out; + } + SHA1Init(&context); + SHA1Update(&context, start, length); + SHA1Final(computed_hash, &context); + if (0 != memcmp(computed_hash, embedded_hash, hash_size)) { + error = EBADEXEC; + goto out; + } + error = 0; +out: + if (NULL != computed_hash) + kfree(computed_hash, hash_size); + if (0 == error) { + *out_start = start; + *out_length = length; + } + return error; +} + +/* + * ENTITLEMENTS + * End of routines to navigate entitlements in the kernel. 
+ */ + + /* * ubc_init @@ -626,7 +743,10 @@ ubc_setsize(struct vnode *vp, off_t nsize) uip->ui_size = nsize; if (nsize >= osize) { /* Nothing more to do */ - lock_vnode_and_post(vp, NOTE_EXTEND); + if (nsize > osize) { + lock_vnode_and_post(vp, NOTE_EXTEND); + } + return (1); /* return success */ } @@ -986,6 +1106,16 @@ ubc_getobject(struct vnode *vp, __unused int flags) return (MEMORY_OBJECT_CONTROL_NULL); } +boolean_t +ubc_strict_uncached_IO(struct vnode *vp) +{ + boolean_t result = FALSE; + + if (UBCINFOEXISTS(vp)) { + result = memory_object_is_slid(vp->v_ubcinfo->ui_control); + } + return result; +} /* * ubc_blktooff @@ -1834,6 +1964,9 @@ ubc_create_upl( if (bufsize & 0xfff) return KERN_INVALID_ARGUMENT; + if (bufsize > MAX_UPL_SIZE * PAGE_SIZE) + return KERN_INVALID_ARGUMENT; + if (uplflags & (UPL_UBC_MSYNC | UPL_UBC_PAGEOUT | UPL_UBC_PAGEIN)) { if (uplflags & UPL_UBC_MSYNC) { @@ -2223,12 +2356,12 @@ static SInt32 cs_blob_count_peak = 0; int cs_validation = 1; -SYSCTL_INT(_vm, OID_AUTO, cs_validation, CTLFLAG_RW, &cs_validation, 0, "Do validate code signatures"); -SYSCTL_INT(_vm, OID_AUTO, cs_blob_count, CTLFLAG_RD, &cs_blob_count, 0, "Current number of code signature blobs"); -SYSCTL_INT(_vm, OID_AUTO, cs_blob_size, CTLFLAG_RD, &cs_blob_size, 0, "Current size of all code signature blobs"); -SYSCTL_INT(_vm, OID_AUTO, cs_blob_count_peak, CTLFLAG_RD, &cs_blob_count_peak, 0, "Peak number of code signature blobs"); -SYSCTL_INT(_vm, OID_AUTO, cs_blob_size_peak, CTLFLAG_RD, &cs_blob_size_peak, 0, "Peak size of code signature blobs"); -SYSCTL_INT(_vm, OID_AUTO, cs_blob_size_max, CTLFLAG_RD, &cs_blob_size_max, 0, "Size of biggest code signature blob"); +SYSCTL_INT(_vm, OID_AUTO, cs_validation, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_validation, 0, "Do validate code signatures"); +SYSCTL_INT(_vm, OID_AUTO, cs_blob_count, CTLFLAG_RD | CTLFLAG_LOCKED, (int *)(uintptr_t)&cs_blob_count, 0, "Current number of code signature blobs"); +SYSCTL_INT(_vm, OID_AUTO, cs_blob_size, CTLFLAG_RD | CTLFLAG_LOCKED, (int *)(uintptr_t)&cs_blob_size, 0, "Current size of all code signature blobs"); +SYSCTL_INT(_vm, OID_AUTO, cs_blob_count_peak, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_blob_count_peak, 0, "Peak number of code signature blobs"); +SYSCTL_INT(_vm, OID_AUTO, cs_blob_size_peak, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_blob_size_peak, 0, "Peak size of code signature blobs"); +SYSCTL_INT(_vm, OID_AUTO, cs_blob_size_max, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_blob_size_max, 0, "Size of biggest code signature blob"); kern_return_t ubc_cs_blob_allocate( @@ -2335,7 +2468,7 @@ ubc_cs_blob_add( blob->csb_start_offset = 0; blob->csb_end_offset = 0; } else { - unsigned char *sha1_base; + const unsigned char *sha1_base; int sha1_size; blob->csb_flags = ntohl(cd->flags) | CS_VALID; @@ -2582,6 +2715,9 @@ ubc_cs_free( OSAddAtomic((SInt32) -blob->csb_mem_size, &cs_blob_size); kfree(blob, sizeof (*blob)); } +#if CHECK_CS_VALIDATION_BITMAP + ubc_cs_validation_bitmap_deallocate( uip->ui_vnode ); +#endif uip->cs_blobs = NULL; } @@ -2820,3 +2956,127 @@ ubc_cs_getcdhash( return ret; } + +#if CHECK_CS_VALIDATION_BITMAP +#define stob(s) ((atop_64((s)) + 07) >> 3) +extern boolean_t root_fs_upgrade_try; + +/* + * Should we use the code-sign bitmap to avoid repeated code-sign validation? + * Depends: + * a) Is the target vnode on the root filesystem? + * b) Has someone tried to mount the root filesystem read-write? + * If answers are (a) yes AND (b) no, then we can use the bitmap. 
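+ * stob(s) sizes the bitmap: one bit per page, rounded up to whole
+ * bytes. For illustration, assuming 4KB pages, a 1MB vnode spans 256
+ * pages, so stob() yields a 32-byte bitmap.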
+ */ +#define USE_CODE_SIGN_BITMAP(vp) ( (vp != NULL) && (vp->v_mount != NULL) && (vp->v_mount->mnt_flag & MNT_ROOTFS) && !root_fs_upgrade_try) +kern_return_t +ubc_cs_validation_bitmap_allocate( + vnode_t vp) +{ + kern_return_t kr = KERN_SUCCESS; + struct ubc_info *uip; + char *target_bitmap; + vm_object_size_t bitmap_size; + + if ( ! USE_CODE_SIGN_BITMAP(vp) || (! UBCINFOEXISTS(vp))) { + kr = KERN_INVALID_ARGUMENT; + } else { + uip = vp->v_ubcinfo; + + if ( uip->cs_valid_bitmap == NULL ) { + bitmap_size = stob(uip->ui_size); + target_bitmap = (char*) kalloc( (vm_size_t)bitmap_size ); + if (target_bitmap == 0) { + kr = KERN_NO_SPACE; + } else { + kr = KERN_SUCCESS; + } + if( kr == KERN_SUCCESS ) { + memset( target_bitmap, 0, (size_t)bitmap_size); + uip->cs_valid_bitmap = (void*)target_bitmap; + uip->cs_valid_bitmap_size = bitmap_size; + } + } + } + return kr; +} + +kern_return_t +ubc_cs_check_validation_bitmap ( + vnode_t vp, + memory_object_offset_t offset, + int optype) +{ + kern_return_t kr = KERN_SUCCESS; + + if ( ! USE_CODE_SIGN_BITMAP(vp) || ! UBCINFOEXISTS(vp)) { + kr = KERN_INVALID_ARGUMENT; + } else { + struct ubc_info *uip = vp->v_ubcinfo; + char *target_bitmap = uip->cs_valid_bitmap; + + if ( target_bitmap == NULL ) { + kr = KERN_INVALID_ARGUMENT; + } else { + uint64_t bit, byte; + bit = atop_64( offset ); + byte = bit >> 3; + + if ( byte > uip->cs_valid_bitmap_size ) { + kr = KERN_INVALID_ARGUMENT; + } else { + + if (optype == CS_BITMAP_SET) { + target_bitmap[byte] |= (1 << (bit & 07)); + kr = KERN_SUCCESS; + } else if (optype == CS_BITMAP_CLEAR) { + target_bitmap[byte] &= ~(1 << (bit & 07)); + kr = KERN_SUCCESS; + } else if (optype == CS_BITMAP_CHECK) { + if ( target_bitmap[byte] & (1 << (bit & 07))) { + kr = KERN_SUCCESS; + } else { + kr = KERN_FAILURE; + } + } + } + } + } + return kr; +} + +void +ubc_cs_validation_bitmap_deallocate( + vnode_t vp) +{ + struct ubc_info *uip; + void *target_bitmap; + vm_object_size_t bitmap_size; + + if ( UBCINFOEXISTS(vp)) { + uip = vp->v_ubcinfo; + + if ( (target_bitmap = uip->cs_valid_bitmap) != NULL ) { + bitmap_size = uip->cs_valid_bitmap_size; + kfree( target_bitmap, (vm_size_t) bitmap_size ); + uip->cs_valid_bitmap = NULL; + } + } +} +#else +kern_return_t ubc_cs_validation_bitmap_allocate(__unused vnode_t vp){ + return KERN_INVALID_ARGUMENT; +} + +kern_return_t ubc_cs_check_validation_bitmap( + __unused struct vnode *vp, + __unused memory_object_offset_t offset, + __unused int optype){ + + return KERN_INVALID_ARGUMENT; +} + +void ubc_cs_validation_bitmap_deallocate(__unused vnode_t vp){ + return; +} +#endif /* CHECK_CS_VALIDATION_BITMAP */ diff --git a/bsd/kern/uipc_domain.c b/bsd/kern/uipc_domain.c index 37a040c86..1065d3683 100644 --- a/bsd/kern/uipc_domain.c +++ b/bsd/kern/uipc_domain.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2009 Apple Inc. All rights reserved. + * Copyright (c) 1998-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -77,15 +77,14 @@ #include <pexpert/pexpert.h> void init_domain(struct domain *dp) __attribute__((section("__TEXT, initcode"))); -void concat_domain(struct domain *dp) __attribute__((section("__TEXT, initcode"))); +void prepend_domain(struct domain *dp) __attribute__((section("__TEXT, initcode"))); - -void pffasttimo(void *); void pfslowtimo(void *); struct protosw *pffindprotonotype(int, int); struct protosw *pffindprotonotype_locked(int , int , int); struct domain *pffinddomain(int); +static void net_update_uptime(void); /* * Add/delete 'domain': Link structure into system list, @@ -101,6 +100,12 @@ extern int do_reclaim; extern sysctlfn net_sysctl; +static u_int64_t uptime; + +#ifdef INET6 +extern void ip6_fin(void); +#endif + static void init_proto(struct protosw *pr) { @@ -133,6 +138,16 @@ init_domain(struct domain *dp) dp->dom_name, (int)(pr - dp->dom_protosw)); +#if __APPLE__ + /* + * Warn that pr_fasttimo (now pr_unused) is deprecated since rdar://7617868 + */ + if (pr->pr_unused != NULL) { + printf("init_domain: warning %s, proto %d: pr_fasttimo is deprecated and won't be called\n", + dp->dom_name, pr->pr_protocol); + } +#endif + init_proto(pr); } @@ -147,8 +162,8 @@ init_domain(struct domain *dp) } void -concat_domain(struct domain *dp) -{ +prepend_domain(struct domain *dp) +{ lck_mtx_assert(domain_proto_mtx, LCK_MTX_ASSERT_OWNED); dp->dom_next = domains; domains = dp; @@ -162,7 +177,7 @@ net_add_domain(struct domain *dp) /* First, link in the domain */ lck_mtx_lock(domain_proto_mtx); - concat_domain(dp); + prepend_domain(dp); init_domain(dp); lck_mtx_unlock(domain_proto_mtx); @@ -302,31 +317,32 @@ domaininit(void) lck_mtx_lock(domain_proto_mtx); - concat_domain(&localdomain); - concat_domain(&routedomain); - concat_domain(&inetdomain); + prepend_domain(&localdomain); + prepend_domain(&inetdomain); #if NETAT - concat_domain(&atalkdomain); + prepend_domain(&atalkdomain); #endif #if INET6 - concat_domain(&inet6domain); + prepend_domain(&inet6domain); #endif + prepend_domain(&routedomain); + #if IPSEC - concat_domain(&keydomain); + prepend_domain(&keydomain); #endif #if NS - concat_domain(&nsdomain); + prepend_domain(&nsdomain); #endif #if ISO - concat_domain(&isodomain); + prepend_domain(&isodomain); #endif #if CCITT - concat_domain(&ccittdomain); + prepend_domain(&ccittdomain); #endif - concat_domain(&ndrvdomain); + prepend_domain(&ndrvdomain); - concat_domain(&systemdomain); + prepend_domain(&systemdomain); /* * Now ask them all to init (XXX including the routing domain, @@ -336,10 +352,17 @@ domaininit(void) init_domain(dp); lck_mtx_unlock(domain_proto_mtx); - timeout(pffasttimo, NULL, 1); timeout(pfslowtimo, NULL, 1); } +void +domainfin(void) +{ +#ifdef INET6 + ip6_fin(); +#endif +} + static __inline__ struct domain * pffinddomain_locked(int pf) { @@ -525,6 +548,13 @@ pfslowtimo(__unused void *arg) register struct domain *dp; register struct protosw *pr; + /* + * Update coarse-grained networking timestamp (in sec.); the idea + * is to piggy-back on the periodic slow timeout callout to update + * the counter returnable via net_uptime(). 
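+	 * For illustration (names hypothetical), a protocol that only
+	 * needs second-granularity aging can then test
+	 *
+	 *	net_uptime() - e->last_used > threshold
+	 *
+	 * per entry instead of paying for a microuptime() call each time.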
+ */ + net_update_uptime(); + lck_mtx_lock(domain_proto_mtx); for (dp = domains; dp; dp = dp->dom_next) for (pr = dp->dom_protosw; pr; pr = pr->pr_next) { @@ -539,17 +569,26 @@ pfslowtimo(__unused void *arg) timeout(pfslowtimo, NULL, hz/PR_SLOWHZ); } -void -pffasttimo(__unused void *arg) +static void +net_update_uptime(void) { - register struct domain *dp; - register struct protosw *pr; + struct timeval tv; - lck_mtx_lock(domain_proto_mtx); - for (dp = domains; dp; dp = dp->dom_next) - for (pr = dp->dom_protosw; pr; pr = pr->pr_next) - if (pr->pr_fasttimo) - (*pr->pr_fasttimo)(); - lck_mtx_unlock(domain_proto_mtx); - timeout(pffasttimo, NULL, hz/PR_FASTHZ); + microuptime(&tv); + uptime = tv.tv_sec; +} + +/* + * An alternative way to obtain the coarse-grained uptime (in seconds) + * for networking code which do not require high-precision timestamp, + * as this is significantly cheaper than microuptime(). + */ +u_int64_t +net_uptime(void) +{ + /* If we get here before pfslowtimo() fires for the first time */ + if (uptime == 0) + net_update_uptime(); + + return (uptime); } diff --git a/bsd/kern/uipc_mbuf.c b/bsd/kern/uipc_mbuf.c index 627cd9926..d8d3ce857 100644 --- a/bsd/kern/uipc_mbuf.c +++ b/bsd/kern/uipc_mbuf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 1998-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -84,6 +84,7 @@ #include <kern/queue.h> #include <kern/sched_prim.h> #include <kern/cpu_number.h> +#include <kern/zalloc.h> #include <libkern/OSAtomic.h> #include <libkern/libkern.h> @@ -115,7 +116,7 @@ * preserve the contents of the objects during its transactions. * * MC_BIGCL: - * This is a cache of rudimentary objects of NBPG in size; each + * This is a cache of rudimentary objects of MBIGCLBYTES in size; each * object represents a mbigcluster structure. This cache does not * preserve the contents of the objects during its transaction. * @@ -264,8 +265,9 @@ * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally, * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag, - * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Note - * that debugging consumes more CPU and memory. + * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Leak + * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g. + * "mbuf_debug=0x113". Note that debugging consumes more CPU and memory. * * Each object is associated with exactly one mcache_audit_t structure that * contains the information related to its last buffer transaction. Given @@ -276,9 +278,9 @@ * | mbuf addr | | mclaudit[i] | * +------------+ +=============+ * | | cl_audit[0] | - * i = MTOCL(addr) +-------------+ + * i = MTOBG(addr) +-------------+ * | +-----> | cl_audit[1] | -----> mcache_audit_t - * b = CLTOM(i) | +-------------+ + * b = BGTOM(i) | +-------------+ * | | | ... | * x = MCLIDX(b, addr) | +-------------+ * | | | cl_audit[7] | @@ -286,12 +288,12 @@ * (e.g. x == 1) * * The mclaudit[] array is allocated at initialization time, but its contents - * get populated when the corresponding cluster is created. Because a cluster - * can be turned into NMBPCL number of mbufs, we preserve enough space for the - * mbufs so that there is a 1-to-1 mapping between them. A cluster that never + * get populated when the corresponding cluster is created. 
Because a page + * can be turned into NMBPBG number of mbufs, we preserve enough space for the + * mbufs so that there is a 1-to-1 mapping between them. A page that never * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the - * remaining entries unused. For big clusters, only one entry is allocated - * and used for the entire cluster pair. + * remaining entries unused. For 16KB cluster, only one entry from the first + * page is allocated and used for the entire object. */ /* TODO: should be in header file */ @@ -311,7 +313,7 @@ static void *mbuf_worker_run; /* wait channel for worker thread */ static int mbuf_worker_ready; /* worker thread is runnable */ static int mbuf_expand_mcl; /* number of cluster creation requets */ static int mbuf_expand_big; /* number of big cluster creation requests */ -static int mbuf_expand_16k; /* number of 16K cluster creation requests */ +static int mbuf_expand_16k; /* number of 16KB cluster creation requests */ static int ncpu; /* number of CPUs */ static ppnum_t *mcl_paddr; /* Array of cluster physical addresses */ static ppnum_t mcl_pages; /* Size of array (# physical pages) */ @@ -320,19 +322,18 @@ static mcache_t *ref_cache; /* Cache of cluster reference & flags */ static mcache_t *mcl_audit_con_cache; /* Audit contents cache */ static unsigned int mbuf_debug; /* patchable mbuf mcache flags */ static unsigned int mb_normalized; /* number of packets "normalized" */ -static unsigned int mbuf_gscale; /* Power-of-two growth scale for m_howmany */ #define MB_GROWTH_AGGRESSIVE 1 /* Threshold: 1/2 of total */ -#define MB_GROWTH_NORMAL 4 /* Threshold: 15/16 of total */ +#define MB_GROWTH_NORMAL 2 /* Threshold: 3/4 of total */ typedef enum { MC_MBUF = 0, /* Regular mbuf */ MC_CL, /* Cluster */ - MC_BIGCL, /* Large (4K) cluster */ - MC_16KCL, /* Jumbo (16K) cluster */ + MC_BIGCL, /* Large (4KB) cluster */ + MC_16KCL, /* Jumbo (16KB) cluster */ MC_MBUF_CL, /* mbuf + cluster */ - MC_MBUF_BIGCL, /* mbuf + large (4K) cluster */ - MC_MBUF_16KCL /* mbuf + jumbo (16K) cluster */ + MC_MBUF_BIGCL, /* mbuf + large (4KB) cluster */ + MC_MBUF_16KCL /* mbuf + jumbo (16KB) cluster */ } mbuf_class_t; #define MBUF_CLASS_MIN MC_MBUF @@ -371,6 +372,8 @@ typedef enum { * a cluster's size. In this case, only the slab of the first cluster is * used. The rest of the slabs are marked with SLF_PARTIAL to indicate * that they are part of the larger slab. + * + * Each slab controls a page of memory. */ typedef struct mcl_slab { struct mcl_slab *sl_next; /* neighboring slab */ @@ -394,23 +397,24 @@ typedef struct mcl_slab { * whenever a new piece of memory mapped in from the VM crosses the 1MB * boundary. */ -#define NSLABSPMB ((1 << MBSHIFT) >> MCLSHIFT) /* 512 slabs/grp */ +#define NSLABSPMB ((1 << MBSHIFT) >> PGSHIFT) /* 256 slabs/grp */ typedef struct mcl_slabg { mcl_slab_t slg_slab[NSLABSPMB]; /* group of slabs */ } mcl_slabg_t; +/* + * Number of slabs needed to control a 16KB cluster object. + */ +#define NSLABSP16KB (M16KCLBYTES >> PGSHIFT) + /* * Per-cluster audit structure. */ typedef struct { - mcache_audit_t *cl_audit[NMBPCL]; /* array of audits */ + mcache_audit_t *cl_audit[NMBPBG]; /* array of audits */ } mcl_audit_t; -#if CONFIG_MBUF_NOEXPAND -static unsigned int maxmbufcl; -#endif /* CONFIG_MBUF_NOEXPAND */ - /* * Size of data from the beginning of an mbuf that covers m_hdr, pkthdr * and m_ext structures. 
If auditing is enabled, we allocate a shadow @@ -434,6 +438,7 @@ static unsigned int maxmbufcl; * Each of the following two arrays hold up to nmbclusters elements. */ static mcl_audit_t *mclaudit; /* array of cluster audit information */ +static unsigned int maxclaudit; /* max # of entries in audit table */ static mcl_slabg_t **slabstbl; /* cluster slabs table */ static unsigned int maxslabgrp; /* max # of entries in slabs table */ static unsigned int slabgrp; /* # of entries in slabs table */ @@ -442,13 +447,68 @@ static unsigned int slabgrp; /* # of entries in slabs table */ int nclusters; /* # of clusters for non-jumbo (legacy) sizes */ int njcl; /* # of clusters for jumbo sizes */ int njclbytes; /* size of a jumbo cluster */ -union mcluster *mbutl; /* first mapped cluster address */ -union mcluster *embutl; /* ending virtual address of mclusters */ +union mbigcluster *mbutl; /* first mapped cluster address */ +union mbigcluster *embutl; /* ending virtual address of mclusters */ int max_linkhdr; /* largest link-level header */ int max_protohdr; /* largest protocol header */ int max_hdr; /* largest link+protocol header */ int max_datalen; /* MHLEN - max_hdr */ +static boolean_t mclverify; /* debug: pattern-checking */ +static boolean_t mcltrace; /* debug: stack tracing */ +static boolean_t mclfindleak; /* debug: leak detection */ + +/* mbuf leak detection variables */ +static struct mleak_table mleak_table; +static mleak_stat_t *mleak_stat; + +#define MLEAK_STAT_SIZE(n) \ + ((size_t)(&((mleak_stat_t *)0)->ml_trace[n])) + +struct mallocation { + mcache_obj_t *element; /* the alloc'ed element, NULL if unused */ + u_int32_t trace_index; /* mtrace index for corresponding backtrace */ + u_int32_t count; /* How many objects were requested */ + u_int64_t hitcount; /* for determining hash effectiveness */ +}; + +struct mtrace { + u_int64_t collisions; + u_int64_t hitcount; + u_int64_t allocs; + u_int64_t depth; + uintptr_t addr[MLEAK_STACK_DEPTH]; +}; + +/* Size must be a power of two for the zhash to be able to just mask off bits */ +#define MLEAK_ALLOCATION_MAP_NUM 512 +#define MLEAK_TRACE_MAP_NUM 256 + +/* + * Sample factor for how often to record a trace. This is overwritable + * by the boot-arg mleak_sample_factor. + */ +#define MLEAK_SAMPLE_FACTOR 500 + +/* + * Number of top leakers recorded. 
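+ * These are surfaced to userland by mleak_top_trace_sysctl() as an
+ * mleak_stat_t whose ml_cnt is MLEAK_NUM_TRACES, each entry carrying
+ * up to MLEAK_STACK_DEPTH return addresses of the suspect backtrace.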
+ */ +#define MLEAK_NUM_TRACES 5 + +static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM; +static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM; + +/* Hashmaps of allocations and their corresponding traces */ +static struct mallocation *mleak_allocations; +static struct mtrace *mleak_traces; +static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES]; + +/* Lock to protect mleak tables from concurrent modification */ +static lck_mtx_t *mleak_lock; +static lck_attr_t *mleak_lock_attr; +static lck_grp_t *mleak_lock_grp; +static lck_grp_attr_t *mleak_lock_grp_attr; + extern u_int32_t high_sb_max; /* TODO: should be in header file */ @@ -460,7 +520,6 @@ int do_reclaim = 0; #define MIN16KCL (MINCL >> 2) /* Low watermarks (only map in pages once free counts go below) */ -#define MCL_LOWAT MINCL #define MBIGCL_LOWAT MINBIGCL #define M16KCL_LOWAT MIN16KCL @@ -525,15 +584,34 @@ static mbuf_table_t mbuf_table[] = { #define NELEM(a) (sizeof (a) / sizeof ((a)[0])) static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */ -static int mb_waiters; /* number of sleepers */ +static int mb_waiters; /* number of waiters */ + +#define MB_WDT_MAXTIME 10 /* # of secs before watchdog panic */ +static struct timeval mb_wdtstart; /* watchdog start timestamp */ +static char mbuf_dump_buf[256]; + +/* + * mbuf watchdog is enabled by default on embedded platforms. It is + * also toggeable via the kern.ipc.mb_watchdog sysctl. + */ +#if CONFIG_EMBEDDED +static unsigned int mb_watchdog = 1; +#else +static unsigned int mb_watchdog = 0; +#endif /* CONFIG_EMBEDDED */ /* The following are used to serialize m_clalloc() */ static boolean_t mb_clalloc_busy; static void *mb_clalloc_waitchan = &mb_clalloc_busy; static int mb_clalloc_waiters; +static void mbuf_mtypes_sync(boolean_t); static int mbstat_sysctl SYSCTL_HANDLER_ARGS; +static void mbuf_stat_sync(void); static int mb_stat_sysctl SYSCTL_HANDLER_ARGS; +static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS; +static int mleak_table_sysctl SYSCTL_HANDLER_ARGS; +static char *mbuf_dump(void); static void mbuf_table_init(void); static inline void m_incref(struct mbuf *); static inline u_int32_t m_decref(struct mbuf *); @@ -554,11 +632,13 @@ static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***, static void mbuf_cslab_free(void *, mcache_obj_t *, int); static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t); static int freelist_populate(mbuf_class_t, unsigned int, int); +static void freelist_init(mbuf_class_t); static boolean_t mbuf_cached_above(mbuf_class_t, int); static boolean_t mbuf_steal(mbuf_class_t, unsigned int); static void m_reclaim(mbuf_class_t, unsigned int, boolean_t); static int m_howmany(int, size_t); static void mbuf_worker_thread(void); +static void mbuf_watchdog(void); static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int); static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **, @@ -572,6 +652,11 @@ static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *); static void mcl_audit_mcheck_panic(struct mbuf *); static void mcl_audit_verify_nextptr(void *, mcache_audit_t *); +static void mleak_activate(void); +static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t); +static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int); +static void mleak_free(mcache_obj_t *); + static mcl_slab_t *slab_get(void *); static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t, void *, void *, unsigned int, int, int); @@ -582,7 +667,6 @@ static void slab_nextptr_panic(mcl_slab_t *, void *); 
static void slab_detach(mcl_slab_t *); static boolean_t slab_is_detached(mcl_slab_t *); -static unsigned int m_length(struct mbuf *); static int m_copyback0(struct mbuf **, int, int, const void *, int, int); static struct mbuf *m_split0(struct mbuf *, int, int, int); @@ -605,11 +689,19 @@ static struct mbuf *m_split0(struct mbuf *, int, int, int); */ #define EXTF_COMPOSITE 0x1 +/* + * This flag indicates that the external cluster is read-only, i.e. it is + * or was referred to by more than one mbufs. Once set, this flag is never + * cleared. + */ +#define EXTF_READONLY 0x2 +#define EXTF_MASK (EXTF_COMPOSITE | EXTF_READONLY) + #define MEXT_RFA(m) ((m)->m_ext.ext_refflags) #define MEXT_REF(m) (MEXT_RFA(m)->refcnt) #define MEXT_FLAGS(m) (MEXT_RFA(m)->flags) #define MBUF_IS_COMPOSITE(m) \ - (MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_COMPOSITE)) + (MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE) /* * Macros used to verify the integrity of the mbuf. @@ -638,15 +730,21 @@ static struct mbuf *m_split0(struct mbuf *, int, int, int); #define MTOD(m, t) ((t)((m)->m_data)) /* - * Macros to obtain cluster index and base cluster address. + * Macros to obtain (4KB) cluster index and base cluster address. + */ + +#define MTOBG(x) (((char *)(x) - (char *)mbutl) >> MBIGCLSHIFT) +#define BGTOM(x) ((union mbigcluster *)(mbutl + (x))) + +/* + * Macro to find the mbuf index relative to a base. */ -#define MTOCL(x) (((char *)(x) - (char *)mbutl) >> MCLSHIFT) -#define CLTOM(x) ((union mcluster *)(mbutl + (x))) +#define MCLIDX(c, m) (((char *)(m) - (char *)(c)) >> MSIZESHIFT) /* - * Macro to find the mbuf index relative to the cluster base. + * Same thing for 2KB cluster index. */ -#define MCLIDX(c, m) (((char *)(m) - (char *)(c)) >> 8) +#define CLBGIDX(c, m) (((char *)(m) - (char *)(c)) >> MCLSHIFT) /* * Macros used during mbuf and cluster initialization. 
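The 4KB-cluster index macros above are pure shift arithmetic. The
following standalone sketch models MTOBG/BGTOM/MCLIDX under assumed
constants (a 4096-byte big cluster and 256-byte mbufs, i.e. shifts of
12 and 8; mbutl is replaced by a local array) so the math can be
compiled and checked outside the kernel:

	#include <assert.h>
	#include <stddef.h>

	#define MBIGCLSHIFT	12	/* assumed: log2(4096-byte big cluster) */
	#define MSIZESHIFT	8	/* assumed: log2(256-byte mbuf) */

	static char map_base[1 << 20];		/* stand-in for mbutl */

	/* 4KB-cluster index of an address, and back */
	#define MTOBG(x)	((size_t)((char *)(x) - map_base) >> MBIGCLSHIFT)
	#define BGTOM(i)	(map_base + ((size_t)(i) << MBIGCLSHIFT))
	/* mbuf index relative to its 4KB cluster base */
	#define MCLIDX(c, m)	(((char *)(m) - (char *)(c)) >> MSIZESHIFT)

	int
	main(void)
	{
		char *m = map_base + (3 << MBIGCLSHIFT) + (2 << MSIZESHIFT);

		assert(MTOBG(m) == 3);			/* 4th page of the map */
		assert(MCLIDX(BGTOM(MTOBG(m)), m) == 2);	/* 3rd mbuf in it */
		return (0);
	}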
@@ -670,6 +768,7 @@ static struct mbuf *m_split0(struct mbuf *, int, int, int); (m)->m_pkthdr.tso_segsz = 0; \ (m)->m_pkthdr.vlan_tag = 0; \ (m)->m_pkthdr.socket_id = 0; \ + (m)->m_pkthdr.vt_nrecs = 0; \ m_tag_init(m); \ m_prio_init(m); \ } \ @@ -759,16 +858,12 @@ static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */ #define MTYPES_CPU(p) \ ((mtypes_cpu_t *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number()))) -/* This should be in a header file */ -#define atomic_add_16(a, n) ((void) OSAddAtomic16(n, a)) -#define atomic_add_32(a, n) ((void) OSAddAtomic(n, a)) - #define mtype_stat_add(type, n) { \ if ((unsigned)(type) < MT_MAX) { \ mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes); \ atomic_add_32(&mbs->cpu_mtypes[type], n); \ - } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \ - atomic_add_16((int16_t*)&mbstat.m_mtypes[type], n); \ + } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \ + atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n); \ } \ } @@ -776,13 +871,15 @@ static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */ #define mtype_stat_inc(t) mtype_stat_add(t, 1) #define mtype_stat_dec(t) mtype_stat_sub(t, 1) -static int -mbstat_sysctl SYSCTL_HANDLER_ARGS +static void +mbuf_mtypes_sync(boolean_t locked) { -#pragma unused(oidp, arg1, arg2) int m, n; mtypes_cpu_t mtc; + if (locked) + lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + bzero(&mtc, sizeof (mtc)); for (m = 0; m < ncpu; m++) { mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m]; @@ -794,25 +891,33 @@ mbstat_sysctl SYSCTL_HANDLER_ARGS for (n = 0; n < MT_MAX; n++) mtc.cpu_mtypes[n] += temp.cpu_mtypes[n]; } - lck_mtx_lock(mbuf_mlock); + if (!locked) + lck_mtx_lock(mbuf_mlock); for (n = 0; n < MT_MAX; n++) mbstat.m_mtypes[n] = mtc.cpu_mtypes[n]; - lck_mtx_unlock(mbuf_mlock); - - return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat))); + if (!locked) + lck_mtx_unlock(mbuf_mlock); } static int -mb_stat_sysctl SYSCTL_HANDLER_ARGS +mbstat_sysctl SYSCTL_HANDLER_ARGS { #pragma unused(oidp, arg1, arg2) - mcache_t *cp; - mcache_cpu_t *ccp; + mbuf_mtypes_sync(FALSE); + + return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat))); +} + +static void +mbuf_stat_sync(void) +{ mb_class_stat_t *sp; - void *statp; - int k, m, bktsize, statsz, proc64 = proc_is64bit(req->p); + mcache_cpu_t *ccp; + mcache_t *cp; + int k, m, bktsize; + + lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); - lck_mtx_lock(mbuf_mlock); for (k = 0; k < NELEM(mbuf_table); k++) { cp = m_cache(k); ccp = &cp->mc_cpu[0]; @@ -854,9 +959,8 @@ mb_stat_sysctl SYSCTL_HANDLER_ARGS break; case MC_CL: - /* Deduct clusters used in composite cache and mbufs */ - sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) + - (P2ROUNDUP(m_total(MC_MBUF), NMBPCL)/NMBPCL)); + /* Deduct clusters used in composite cache */ + sp->mbcl_ctotal -= m_total(MC_MBUF_CL); break; case MC_BIGCL: @@ -873,6 +977,17 @@ mb_stat_sysctl SYSCTL_HANDLER_ARGS break; } } +} + +static int +mb_stat_sysctl SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + void *statp; + int k, statsz, proc64 = proc_is64bit(req->p); + + lck_mtx_lock(mbuf_mlock); + mbuf_stat_sync(); if (!proc64) { struct omb_class_stat *oc; @@ -913,6 +1028,69 @@ mb_stat_sysctl SYSCTL_HANDLER_ARGS return (SYSCTL_OUT(req, statp, statsz)); } +static int +mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + mleak_trace_stat_t *mltr; + int i; + + /* Ensure leak tracing turned on */ + if (!mclfindleak) + return (ENXIO); + + VERIFY(mleak_stat != NULL); +#ifdef __LP64__ + VERIFY(mleak_stat->ml_isaddr64); +#else + 
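+	/* 32-bit kernel: traces were recorded with 32-bit addresses */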
VERIFY(!mleak_stat->ml_isaddr64); +#endif /* !__LP64__ */ + VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES); + + lck_mtx_lock(mleak_lock); + mltr = &mleak_stat->ml_trace[0]; + bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES); + for (i = 0; i < MLEAK_NUM_TRACES; i++) { + int j; + + if (mleak_top_trace[i] == NULL || + mleak_top_trace[i]->allocs == 0) + continue; + + mltr->mltr_collisions = mleak_top_trace[i]->collisions; + mltr->mltr_hitcount = mleak_top_trace[i]->hitcount; + mltr->mltr_allocs = mleak_top_trace[i]->allocs; + mltr->mltr_depth = mleak_top_trace[i]->depth; + + VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH); + for (j = 0; j < mltr->mltr_depth; j++) + mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j]; + + mltr++; + } + i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES)); + lck_mtx_unlock(mleak_lock); + + return (i); +} + +static int +mleak_table_sysctl SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int i = 0; + + /* Ensure leak tracing turned on */ + if (!mclfindleak) + return (ENXIO); + + lck_mtx_lock(mleak_lock); + i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table)); + lck_mtx_unlock(mleak_lock); + + return (i); +} + static inline void m_incref(struct mbuf *m) { @@ -924,6 +1102,14 @@ m_incref(struct mbuf *m) new = old + 1; ASSERT(new != 0); } while (!OSCompareAndSwap(old, new, addr)); + + /* + * If cluster is shared, mark it with (sticky) EXTF_READONLY; + * we don't clear the flag when the refcount goes back to 1 + * to simplify code calling m_mclhasreference(). + */ + if (new > 1 && !(MEXT_FLAGS(m) & EXTF_READONLY)) + (void) OSBitOrAtomic(EXTF_READONLY, &MEXT_FLAGS(m)); } static inline u_int32_t @@ -944,6 +1130,7 @@ m_decref(struct mbuf *m) static void mbuf_table_init(void) { + unsigned int b, c, s; int m; MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)), @@ -968,66 +1155,78 @@ mbuf_table_init(void) #endif /* CONFIG_MBUF_JUMBO */ /* - * nclusters is going to be split in 2 to hold both the 2K - * and the 4K pools, so make sure each half is even. + * nclusters holds both the 2KB and 4KB pools, so ensure it's + * a multiple of 4KB clusters. */ - nclusters = P2ROUNDDOWN(nmbclusters - njcl, 4); + nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG); if (njcl > 0) { /* - * Each jumbo cluster takes 8 2K clusters, so make - * sure that the pool size is evenly divisible by 8. + * Each jumbo cluster takes 8 2KB clusters, so make + * sure that the pool size is evenly divisible by 8; + * njcl is in 2KB unit, hence treated as such. */ njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8); - } -#if CONFIG_MBUF_NOEXPAND - /* Only use 4k clusters if we're setting aside more than 256k */ - if (nmbclusters <= 128) { - maxmbufcl = nmbclusters / 4; - } else { - /* Half to big clusters, half to small */ - maxmbufcl = (nmbclusters / 4) * 3; + /* Update nclusters with rounded down value of njcl */ + nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG); } -#endif /* CONFIG_MBUF_NOEXPAND */ /* - * 1/2 of the map is reserved for 2K clusters. Out of this, 1/16th - * of the total number of 2K clusters allocated is reserved and cannot - * be turned into mbufs. It can only be used for pure cluster objects. + * njcl is valid only on platforms with 16KB jumbo clusters, where + * it is configured to 1/3 of the pool size. On these platforms, + * the remaining is used for 2KB and 4KB clusters. On platforms + * without 16KB jumbo clusters, the entire pool is used for both + * 2KB and 4KB clusters. A 4KB cluster can either be splitted into + * 16 mbufs, or into 2 2KB clusters. 
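+ * For a concrete illustration (figures assumed, not from this change):
+ * with nclusters = 24576 2KB units, c = 24576/64 = 384 2KB clusters,
+ * b = 24576/128 = 192 4KB clusters, and s = 24576 - 384 - 2*192 = 23808
+ * 2KB units remain for the general-purpose region.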
+ * + * +---+---+------------ ... -----------+------- ... -------+ + * | c | b | s | njcl | + * +---+---+------------ ... -----------+------- ... -------+ + * + * 1/32th of the shared region is reserved for pure 2KB and 4KB + * clusters (1/64th each.) + */ + c = P2ROUNDDOWN((nclusters >> 6), 2); /* in 2KB unit */ + b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), 2); /* in 4KB unit */ + s = nclusters - (c + (b << NCLPBGSHIFT)); /* in 2KB unit */ + + /* + * 1/64th (c) is reserved for 2KB clusters. */ - m_minlimit(MC_CL) = (nclusters >> 5); - m_maxlimit(MC_CL) = (nclusters >> 1); + m_minlimit(MC_CL) = c; + m_maxlimit(MC_CL) = s + c; /* in 2KB unit */ m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES; (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl"); /* - * The remaining (15/16th) can be turned into mbufs. + * Another 1/64th (b) of the map is reserved for 4KB clusters. + * It cannot be turned into 2KB clusters or mbufs. */ - m_minlimit(MC_MBUF) = 0; - m_maxlimit(MC_MBUF) = (m_maxlimit(MC_CL) - m_minlimit(MC_CL)) * NMBPCL; - m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE; - (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf"); + m_minlimit(MC_BIGCL) = b; + m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b; /* in 4KB unit */ + m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES; + (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl"); /* - * The other 1/2 of the map is reserved for 4K clusters. + * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB) */ - m_minlimit(MC_BIGCL) = 0; - m_maxlimit(MC_BIGCL) = m_maxlimit(MC_CL) >> 1; - m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = NBPG; - (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl"); + m_minlimit(MC_MBUF) = 0; + m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT); /* in mbuf unit */ + m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE; + (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf"); /* * Set limits for the composite classes. */ m_minlimit(MC_MBUF_CL) = 0; - m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL) - m_minlimit(MC_CL); + m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL); m_maxsize(MC_MBUF_CL) = MCLBYTES; m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL); (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl"); m_minlimit(MC_MBUF_BIGCL) = 0; m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL); - m_maxsize(MC_MBUF_BIGCL) = NBPG; + m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES; m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL); (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl"); @@ -1035,7 +1234,7 @@ mbuf_table_init(void) * And for jumbo classes. */ m_minlimit(MC_16KCL) = 0; - m_maxlimit(MC_16KCL) = (njcl >> 3); + m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT); /* in 16KB unit */ m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES; (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl"); @@ -1084,19 +1283,19 @@ static ncl_tbl_t ncl_table_srv[] = { #endif /* __LP64__ */ __private_extern__ unsigned int -mbuf_default_ncl(int srv, uint64_t mem) +mbuf_default_ncl(int server, uint64_t mem) { #if !defined(__LP64__) -#pragma unused(srv) +#pragma unused(server) unsigned int n; /* * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM). */ - if ((n = ((mem / 16) / MCLBYTES)) > 32768) - n = 32768; + if ((n = ((mem / 16) / MCLBYTES)) > 32768) + n = 32768; #else unsigned int n, i; - ncl_tbl_t *tbl = (srv ? ncl_table_srv : ncl_table); + ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table); /* * 64-bit kernel (mbuf pool size based on table). 
*/ @@ -1115,13 +1314,16 @@ __private_extern__ void mbinit(void) { unsigned int m; - int initmcl = MINCL; + unsigned int initmcl = 0; void *buf; thread_t thread = THREAD_NULL; if (nmbclusters == 0) nmbclusters = NMBCLUSTERS; + /* This should be a sane (at least even) value by now */ + VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1)); + /* Setup the mbuf table */ mbuf_table_init(); @@ -1131,25 +1333,51 @@ mbinit(void) mbuf_mlock_attr = lck_attr_alloc_init(); mbuf_mlock = lck_mtx_alloc_init(mbuf_mlock_grp, mbuf_mlock_attr); - /* Allocate cluster slabs table */ - maxslabgrp = P2ROUNDUP(nmbclusters, NSLABSPMB) / NSLABSPMB; + /* + * Allocate cluster slabs table: + * + * maxslabgrp = (N * 2048) / (1024 * 1024) + * + * Where N is nmbclusters rounded up to the nearest 512. This yields + * mcl_slab_g_t units, each one representing a MB of memory. + */ + maxslabgrp = + (P2ROUNDUP(nmbclusters, (MBSIZE >> 11)) << MCLSHIFT) >> MBSHIFT; MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *), M_TEMP, M_WAITOK | M_ZERO); VERIFY(slabstbl != NULL); - /* Allocate audit structures if needed */ + /* + * Allocate audit structures, if needed: + * + * maxclaudit = (maxslabgrp * 1024 * 1024) / 4096 + * + * This yields mcl_audit_t units, each one representing a page. + */ PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug)); mbuf_debug |= mcache_getflags(); - if (mbuf_debug & MCF_AUDIT) { - MALLOC(mclaudit, mcl_audit_t *, - nmbclusters * sizeof (*mclaudit), M_TEMP, - M_WAITOK | M_ZERO); + if (mbuf_debug & MCF_DEBUG) { + maxclaudit = ((maxslabgrp << MBSHIFT) >> PGSHIFT); + MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit), + M_TEMP, M_WAITOK | M_ZERO); VERIFY(mclaudit != NULL); mcl_audit_con_cache = mcache_create("mcl_audit_contents", AUDIT_CONTENTS_SIZE, 0, 0, MCR_SLEEP); VERIFY(mcl_audit_con_cache != NULL); } + mclverify = (mbuf_debug & MCF_VERIFY); + mcltrace = (mbuf_debug & MCF_TRACE); + mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG); + + /* Enable mbuf leak logging, with a lock to protect the tables */ + + mleak_lock_grp_attr = lck_grp_attr_alloc_init(); + mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr); + mleak_lock_attr = lck_attr_alloc_init(); + mleak_lock = lck_mtx_alloc_init(mleak_lock_grp, mleak_lock_attr); + + mleak_activate(); /* Calculate the number of pages assigned to the cluster pool */ mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES; @@ -1161,19 +1389,41 @@ mbinit(void) mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages); bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t)); - embutl = (union mcluster *) + embutl = (union mbigcluster *) ((unsigned char *)mbutl + (nmbclusters * MCLBYTES)); + VERIFY((((char *)embutl - (char *)mbutl) % MBIGCLBYTES) == 0); + /* Prime up the freelist */ PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl)); + if (initmcl != 0) { + initmcl >>= NCLPBGSHIFT; /* become a 4K unit */ + if (initmcl > m_maxlimit(MC_BIGCL)) + initmcl = m_maxlimit(MC_BIGCL); + } + if (initmcl < m_minlimit(MC_BIGCL)) + initmcl = m_minlimit(MC_BIGCL); lck_mtx_lock(mbuf_mlock); - if (m_clalloc(MAX(NBPG/CLBYTES, 1) * initmcl, M_WAIT, MCLBYTES) == 0) - panic("mbinit: m_clalloc failed\n"); + /* + * For classes with non-zero minimum limits, populate their freelists + * so that m_total(class) is at least m_minlimit(class). 
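+	 * Here that means priming MC_BIGCL through freelist_populate()
+	 * first, then letting freelist_init() below take care of the 2KB
+	 * cluster class.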
+ */ + VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0); + freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT); + VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL)); + freelist_init(m_class(MC_CL)); + + for (m = 0; m < NELEM(mbuf_table); m++) { + /* Make sure we didn't miss any */ + VERIFY(m_minlimit(m_class(m)) == 0 || + m_total(m_class(m)) >= m_minlimit(m_class(m))); + } lck_mtx_unlock(mbuf_mlock); - (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init, NULL, &thread); + (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init, + NULL, &thread); thread_deallocate(thread); ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref), @@ -1181,7 +1431,7 @@ mbinit(void) /* Create the cache for each class */ for (m = 0; m < NELEM(mbuf_table); m++) { - void *allocfunc, *freefunc, *auditfunc; + void *allocfunc, *freefunc, *auditfunc, *logfunc; u_int32_t flags; flags = mbuf_debug; @@ -1190,10 +1440,12 @@ mbinit(void) allocfunc = mbuf_cslab_alloc; freefunc = mbuf_cslab_free; auditfunc = mbuf_cslab_audit; + logfunc = mleak_logger; } else { allocfunc = mbuf_slab_alloc; freefunc = mbuf_slab_free; auditfunc = mbuf_slab_audit; + logfunc = mleak_logger; } /* @@ -1206,8 +1458,11 @@ mbinit(void) njcl == 0) flags |= MCF_NOCPUCACHE; + if (!mclfindleak) + flags |= MCF_NOLEAKLOG; + m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m), - allocfunc, freefunc, auditfunc, mbuf_slab_notify, + allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify, (void *)(uintptr_t)m, flags, MCR_SLEEP); } @@ -1225,30 +1480,31 @@ mbinit(void) mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, CPU_CACHE_SIZE); bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu)); - mbuf_gscale = MB_GROWTH_NORMAL; - - /* - * Set the max limit on sb_max to be 1/16 th of the size of + /* + * Set the max limit on sb_max to be 1/16 th of the size of * memory allocated for mbuf clusters. */ - high_sb_max = (nmbclusters << (MCLSHIFT - 4)); + high_sb_max = (nmbclusters << (MCLSHIFT - 4)); if (high_sb_max < sb_max) { /* sb_max is too large for this configuration, scale it down */ - if (high_sb_max > (1 << MBSHIFT)) { + if (high_sb_max > (1 << MBSHIFT)) { /* We have atleast 16 M of mbuf pool */ sb_max = high_sb_max; } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) { - /* If we have more than 1M of mbufpool, cap the size of + /* + * If we have more than 1M of mbufpool, cap the size of * max sock buf at 1M - */ + */ sb_max = high_sb_max = (1 << MBSHIFT); } else { sb_max = high_sb_max; } } - printf("mbinit: done (%d MB memory set for mbuf pool)\n", - (nmbclusters << MCLSHIFT) >> MBSHIFT); + printf("mbinit: done [%d MB total pool size, (%d/%d) split]\n", + (nmbclusters << MCLSHIFT) >> MBSHIFT, + (nclusters << MCLSHIFT) >> MBSHIFT, + (njcl << MCLSHIFT) >> MBSHIFT); } /* @@ -1274,7 +1530,7 @@ slab_alloc(mbuf_class_t class, int wait) * more than one buffer chunks (e.g. mbuf slabs). For other * slabs, this probably doesn't make much of a difference. 
*/ - if (class == MC_MBUF && (wait & MCR_COMP)) + if ((class == MC_MBUF || class == MC_CL) && (wait & MCR_COMP)) sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead); else sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class)); @@ -1294,7 +1550,10 @@ slab_alloc(mbuf_class_t class, int wait) if (class == MC_MBUF) { sp->sl_head = buf->obj_next; - VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPCL - 1)); + VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPBG - 1)); + } else if (class == MC_CL) { + sp->sl_head = buf->obj_next; + VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NCLPBG - 1)); } else { sp->sl_head = NULL; } @@ -1319,41 +1578,33 @@ slab_alloc(mbuf_class_t class, int wait) if (class == MC_CL) { mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL); /* - * A 2K cluster slab can have at most 1 reference. + * A 2K cluster slab can have at most NCLPBG references. */ - VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 && - sp->sl_len == m_maxsize(MC_CL) && sp->sl_head == NULL); + VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPBG && + sp->sl_chunks == NCLPBG && + sp->sl_len == m_maxsize(MC_BIGCL)); + VERIFY(sp->sl_refcnt < NCLPBG || sp->sl_head == NULL); } else if (class == MC_BIGCL) { - mcl_slab_t *nsp = sp->sl_next; mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) + m_infree(MC_MBUF_BIGCL); /* - * Increment 2nd slab. A 4K big cluster takes - * 2 slabs, each having at most 1 reference. + * A 4K cluster slab can have at most 1 reference. */ VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 && - sp->sl_len == m_maxsize(MC_BIGCL) && sp->sl_head == NULL); - /* Next slab must already be present */ - VERIFY(nsp != NULL); - nsp->sl_refcnt++; - VERIFY(!slab_is_detached(nsp)); - VERIFY(nsp->sl_class == MC_BIGCL && - nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) && - nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 && - nsp->sl_len == 0 && nsp->sl_base == sp->sl_base && - nsp->sl_head == NULL); + sp->sl_len == m_maxsize(class) && sp->sl_head == NULL); } else if (class == MC_16KCL) { mcl_slab_t *nsp; int k; --m_infree(MC_16KCL); VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 && - sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL); + sp->sl_len == m_maxsize(class) && sp->sl_head == NULL); /* - * Increment 2nd-8th slab. A 16K big cluster takes - * 8 cluster slabs, each having at most 1 reference. + * Increment 2nd-Nth slab reference, where N is NSLABSP16KB. + * A 16KB big cluster takes NSLABSP16KB slabs, each having at + * most 1 reference. */ - for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) { + for (nsp = sp, k = 1; k < NSLABSP16KB; k++) { nsp = nsp->sl_next; /* Next slab must already be present */ VERIFY(nsp != NULL); @@ -1366,7 +1617,7 @@ slab_alloc(mbuf_class_t class, int wait) nsp->sl_head == NULL); } } else { - ASSERT(class == MC_MBUF); + VERIFY(class == MC_MBUF); --m_infree(MC_MBUF); /* * If auditing is turned on, this check is @@ -1376,20 +1627,20 @@ slab_alloc(mbuf_class_t class, int wait) _MCHECK((struct mbuf *)buf); /* * Since we have incremented the reference count above, - * an mbuf slab (formerly a 2K cluster slab that was cut + * an mbuf slab (formerly a 4KB cluster slab that was cut * up into mbufs) must have a reference count between 1 - * and NMBPCL at this point. + * and NMBPBG at this point. 
*/ - VERIFY(sp->sl_refcnt >= 1 && - (unsigned short)sp->sl_refcnt <= NMBPCL && - sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL)); - VERIFY((unsigned short)sp->sl_refcnt < NMBPCL || - sp->sl_head == NULL); + VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPBG && + sp->sl_chunks == NMBPBG && + sp->sl_len == m_maxsize(MC_BIGCL)); + VERIFY(sp->sl_refcnt < NMBPBG || sp->sl_head == NULL); } /* If empty, remove this slab from the class's freelist */ if (sp->sl_head == NULL) { - VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPCL); + VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPBG); + VERIFY(class != MC_CL || sp->sl_refcnt == NCLPBG); slab_remove(sp, class); } @@ -1415,45 +1666,38 @@ slab_free(mbuf_class_t class, mcache_obj_t *buf) /* Decrement slab reference */ sp->sl_refcnt--; - if (class == MC_CL || class == MC_BIGCL) { + if (class == MC_CL) { VERIFY(IS_P2ALIGNED(buf, MCLBYTES)); /* - * A 2K cluster slab can have at most 1 reference + * A slab that has been splitted for 2KB clusters can have + * at most 1 outstanding reference at this point. + */ + VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPBG - 1) && + sp->sl_chunks == NCLPBG && + sp->sl_len == m_maxsize(MC_BIGCL)); + VERIFY(sp->sl_refcnt < (NCLPBG - 1) || + (slab_is_detached(sp) && sp->sl_head == NULL)); + } else if (class == MC_BIGCL) { + VERIFY(IS_P2ALIGNED(buf, MCLBYTES)); + /* + * A 4KB cluster slab can have at most 1 reference * which must be 0 at this point. */ VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 && sp->sl_len == m_maxsize(class) && sp->sl_head == NULL); VERIFY(slab_is_detached(sp)); - if (class == MC_BIGCL) { - mcl_slab_t *nsp = sp->sl_next; - VERIFY(IS_P2ALIGNED(buf, NBPG)); - /* Next slab must already be present */ - VERIFY(nsp != NULL); - /* Decrement 2nd slab reference */ - nsp->sl_refcnt--; - /* - * A 4K big cluster takes 2 slabs, both - * must now have 0 reference. - */ - VERIFY(slab_is_detached(nsp)); - VERIFY(nsp->sl_class == MC_BIGCL && - (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) && - nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 && - nsp->sl_len == 0 && nsp->sl_base == sp->sl_base && - nsp->sl_head == NULL); - } } else if (class == MC_16KCL) { mcl_slab_t *nsp; int k; /* - * A 16K cluster takes 8 cluster slabs, all must + * A 16KB cluster takes NSLABSP16KB slabs, all must * now have 0 reference. */ - VERIFY(IS_P2ALIGNED(buf, NBPG)); + VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES)); VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 && - sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL); + sp->sl_len == m_maxsize(class) && sp->sl_head == NULL); VERIFY(slab_is_detached(sp)); - for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) { + for (nsp = sp, k = 1; k < NSLABSP16KB; k++) { nsp = nsp->sl_next; /* Next slab must already be present */ VERIFY(nsp != NULL); @@ -1467,14 +1711,15 @@ slab_free(mbuf_class_t class, mcache_obj_t *buf) } } else { /* - * An mbuf slab has a total of NMBPL reference counts. - * Since we have decremented the reference above, it - * must now be between 0 and NMBPCL-1. + * A slab that has been splitted for mbufs has at most NMBPBG + * reference counts. Since we have decremented one reference + * above, it must now be between 0 and NMBPBG-1. 
*/ - VERIFY(sp->sl_refcnt >= 0 && - (unsigned short)sp->sl_refcnt <= (NMBPCL - 1) && - sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL)); - VERIFY(sp->sl_refcnt < (NMBPCL - 1) || + VERIFY(class == MC_MBUF); + VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NMBPBG - 1) && + sp->sl_chunks == NMBPBG && + sp->sl_len == m_maxsize(MC_BIGCL)); + VERIFY(sp->sl_refcnt < (NMBPBG - 1) || (slab_is_detached(sp) && sp->sl_head == NULL)); } @@ -1485,12 +1730,15 @@ slab_free(mbuf_class_t class, mcache_obj_t *buf) */ if (mclaudit != NULL) { mcache_audit_t *mca = mcl_audit_buf2mca(class, buf); - mcache_audit_free_verify(mca, buf, 0, m_maxsize(class)); + if (mclverify) { + mcache_audit_free_verify(mca, buf, 0, m_maxsize(class)); + } mca->mca_uflags &= ~MB_SCVALID; } if (class == MC_CL) { mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL); + buf->obj_next = sp->sl_head; } else if (class == MC_BIGCL) { mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) + m_infree(MC_MBUF_BIGCL); @@ -1502,14 +1750,25 @@ slab_free(mbuf_class_t class, mcache_obj_t *buf) } sp->sl_head = buf; - /* All mbufs are freed; return the cluster that we stole earlier */ - if (sp->sl_refcnt == 0 && class == MC_MBUF) { - int i = NMBPCL; - - m_total(MC_MBUF) -= NMBPCL; + /* + * If a slab has been splitted to either one which holds 2KB clusters, + * or one which holds mbufs, turn it back to one which holds a 4KB + * cluster. + */ + if (class == MC_MBUF && sp->sl_refcnt == 0 && + m_total(class) > m_minlimit(class) && + m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) { + int i = NMBPBG; + + m_total(MC_BIGCL)++; + mbstat.m_bigclusters = m_total(MC_BIGCL); + m_total(MC_MBUF) -= NMBPBG; mbstat.m_mbufs = m_total(MC_MBUF); - m_infree(MC_MBUF) -= NMBPCL; - mtype_stat_add(MT_FREE, -((unsigned)NMBPCL)); + m_infree(MC_MBUF) -= NMBPBG; + mtype_stat_add(MT_FREE, -((unsigned)NMBPBG)); + + VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL)); + VERIFY(m_total(MC_MBUF) >= m_minlimit(MC_MBUF)); while (i--) { struct mbuf *m = sp->sl_head; @@ -1522,19 +1781,58 @@ slab_free(mbuf_class_t class, mcache_obj_t *buf) /* Remove the slab from the mbuf class's slab list */ slab_remove(sp, class); - /* Reinitialize it as a 2K cluster slab */ - slab_init(sp, MC_CL, sp->sl_flags, sp->sl_base, sp->sl_base, + /* Reinitialize it as a 4KB cluster slab */ + slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base, sp->sl_len, 0, 1); - if (mclaudit != NULL) + if (mclverify) { mcache_set_pattern(MCACHE_FREE_PATTERN, - (caddr_t)sp->sl_head, m_maxsize(MC_CL)); + (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL)); + } + mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) + + m_infree(MC_MBUF_BIGCL); - mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL); + VERIFY(slab_is_detached(sp)); + /* And finally switch class */ + class = MC_BIGCL; + } else if (class == MC_CL && sp->sl_refcnt == 0 && + m_total(class) > m_minlimit(class) && + m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) { + int i = NCLPBG; + + m_total(MC_BIGCL)++; + mbstat.m_bigclusters = m_total(MC_BIGCL); + m_total(MC_CL) -= NCLPBG; + mbstat.m_clusters = m_total(MC_CL); + m_infree(MC_CL) -= NCLPBG; + VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL)); + VERIFY(m_total(MC_CL) >= m_minlimit(MC_CL)); + + while (i--) { + union mcluster *c = sp->sl_head; + VERIFY(c != NULL); + sp->sl_head = c->mcl_next; + c->mcl_next = NULL; + } + VERIFY(sp->sl_head == NULL); + + /* Remove the slab from the 2KB cluster class's slab list */ + slab_remove(sp, class); + + /* Reinitialize it as a 4KB cluster slab */ + slab_init(sp, MC_BIGCL, sp->sl_flags, 
sp->sl_base, sp->sl_base, + sp->sl_len, 0, 1); + + if (mclverify) { + mcache_set_pattern(MCACHE_FREE_PATTERN, + (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL)); + } + mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) + + m_infree(MC_MBUF_BIGCL); VERIFY(slab_is_detached(sp)); /* And finally switch class */ - class = MC_CL; + class = MC_BIGCL; } /* Reinsert the slab to the class's slab list */ @@ -1593,6 +1891,9 @@ mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait) if (mbuf_cached_above(class, wait)) break; + /* watchdog checkpoint */ + mbuf_watchdog(); + /* We have nothing and cannot block; give up */ if (wait & MCR_NOSLEEP) { if (!(wait & MCR_TRYHARD)) { @@ -1689,7 +1990,9 @@ mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc) ASSERT(!(mca->mca_uflags & MB_SCVALID)); } /* Record this transaction */ - mcache_buffer_log(mca, list, m_cache(class)); + if (mcltrace) + mcache_buffer_log(mca, list, m_cache(class)); + if (alloc) mca->mca_uflags |= MB_INUSE; else @@ -1756,16 +2059,17 @@ cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num) clsp = slab_get(cl); VERIFY(m->m_flags == M_EXT && cl != NULL); VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m)); - VERIFY(clsp->sl_refcnt == 1); - if (class == MC_MBUF_BIGCL) { - nsp = clsp->sl_next; - /* Next slab must already be present */ - VERIFY(nsp != NULL); - VERIFY(nsp->sl_refcnt == 1); - } else if (class == MC_MBUF_16KCL) { + + if (class == MC_MBUF_CL) { + VERIFY(clsp->sl_refcnt >= 1 && + clsp->sl_refcnt <= NCLPBG); + } else { + VERIFY(clsp->sl_refcnt == 1); + } + + if (class == MC_MBUF_16KCL) { int k; - for (nsp = clsp, k = 1; - k < (M16KCLBYTES / MCLBYTES); k++) { + for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) { nsp = nsp->sl_next; /* Next slab must already be present */ VERIFY(nsp != NULL); @@ -1802,11 +2106,21 @@ cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged) mcache_obj_t *ref_list = NULL; mcl_slab_t *clsp, *nsp; void *cl; + mbuf_class_t cl_class; ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); VERIFY(class != MC_MBUF_16KCL || njcl > 0); lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + if (class == MC_MBUF_CL) { + cl_class = MC_CL; + } else if (class == MC_MBUF_BIGCL) { + cl_class = MC_BIGCL; + } else { + VERIFY(class == MC_MBUF_16KCL); + cl_class = MC_16KCL; + } + o = tail = list; while ((m = ms = (struct mbuf *)o) != NULL) { @@ -1815,37 +2129,33 @@ cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged) /* Do the mbuf sanity checks */ if (mclaudit != NULL) { mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); - mcache_audit_free_verify(mca, m, 0, m_maxsize(MC_MBUF)); + if (mclverify) { + mcache_audit_free_verify(mca, m, 0, + m_maxsize(MC_MBUF)); + } ms = (struct mbuf *)mca->mca_contents; } /* Do the cluster sanity checks */ cl = ms->m_ext.ext_buf; clsp = slab_get(cl); - if (mclaudit != NULL) { - size_t size; - if (class == MC_MBUF_CL) - size = m_maxsize(MC_CL); - else if (class == MC_MBUF_BIGCL) - size = m_maxsize(MC_BIGCL); - else - size = m_maxsize(MC_16KCL); - mcache_audit_free_verify(mcl_audit_buf2mca(MC_CL, + if (mclverify) { + size_t size = m_maxsize(cl_class); + mcache_audit_free_verify(mcl_audit_buf2mca(cl_class, (mcache_obj_t *)cl), cl, 0, size); } VERIFY(ms->m_type == MT_FREE); VERIFY(ms->m_flags == M_EXT); VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms)); - VERIFY(clsp->sl_refcnt == 1); - if (class == MC_MBUF_BIGCL) { - nsp = clsp->sl_next; - /* Next slab must already be present */ - VERIFY(nsp != NULL); - VERIFY(nsp->sl_refcnt == 1); - } 
else if (class == MC_MBUF_16KCL) { + if (cl_class == MC_CL) { + VERIFY(clsp->sl_refcnt >= 1 && + clsp->sl_refcnt <= NCLPBG); + } else { + VERIFY(clsp->sl_refcnt == 1); + } + if (cl_class == MC_16KCL) { int k; - for (nsp = clsp, k = 1; - k < (M16KCLBYTES / MCLBYTES); k++) { + for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) { nsp = nsp->sl_next; /* Next slab must already be present */ VERIFY(nsp != NULL); @@ -1926,7 +2236,7 @@ mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed, int wait) { mbuf_class_t class = (mbuf_class_t)arg; - mcache_t *cp = NULL; + mbuf_class_t cl_class = 0; unsigned int num = 0, cnum = 0, want = needed; mcache_obj_t *ref_list = NULL; mcache_obj_t *mp_list = NULL; @@ -1977,22 +2287,28 @@ mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed, if (!(wait & MCR_NOSLEEP)) wait |= MCR_FAILOK; + /* allocate mbufs */ needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait); if (needed == 0) { ASSERT(mp_list == NULL); goto fail; } - if (class == MC_MBUF_CL) - cp = m_cache(MC_CL); - else if (class == MC_MBUF_BIGCL) - cp = m_cache(MC_BIGCL); - else - cp = m_cache(MC_16KCL); - needed = mcache_alloc_ext(cp, &clp_list, needed, wait); + + /* allocate clusters */ + if (class == MC_MBUF_CL) { + cl_class = MC_CL; + } else if (class == MC_MBUF_BIGCL) { + cl_class = MC_BIGCL; + } else { + VERIFY(class == MC_MBUF_16KCL); + cl_class = MC_16KCL; + } + needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait); if (needed == 0) { ASSERT(clp_list == NULL); goto fail; } + needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait); if (needed == 0) { ASSERT(ref_list == NULL); @@ -2025,7 +2341,6 @@ mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed, */ if (mclaudit != NULL) { mcache_audit_t *mca, *cl_mca; - size_t size; lck_mtx_lock(mbuf_mlock); mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); @@ -2048,15 +2363,22 @@ mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed, lck_mtx_unlock(mbuf_mlock); /* Technically, they are in the freelist */ - mcache_set_pattern(MCACHE_FREE_PATTERN, m, - m_maxsize(MC_MBUF)); - if (class == MC_MBUF_CL) - size = m_maxsize(MC_CL); - else if (class == MC_MBUF_BIGCL) - size = m_maxsize(MC_BIGCL); - else - size = m_maxsize(MC_16KCL); - mcache_set_pattern(MCACHE_FREE_PATTERN, cl, size); + if (mclverify) { + size_t size; + + mcache_set_pattern(MCACHE_FREE_PATTERN, m, + m_maxsize(MC_MBUF)); + + if (class == MC_MBUF_CL) + size = m_maxsize(MC_CL); + else if (class == MC_MBUF_BIGCL) + size = m_maxsize(MC_BIGCL); + else + size = m_maxsize(MC_16KCL); + + mcache_set_pattern(MCACHE_FREE_PATTERN, cl, + size); + } } MBUF_INIT(ms, 0, MT_FREE); @@ -2082,7 +2404,7 @@ fail: if (mp_list != NULL) mcache_free_ext(m_cache(MC_MBUF), mp_list); if (clp_list != NULL) - mcache_free_ext(cp, clp_list); + mcache_free_ext(m_cache(cl_class), clp_list); if (ref_list != NULL) mcache_free_ext(ref_cache, ref_list); @@ -2152,7 +2474,9 @@ mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc) /* Do the mbuf sanity checks and record its transaction */ mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); mcl_audit_mbuf(mca, m, TRUE, alloc); - mcache_buffer_log(mca, m, m_cache(class)); + if (mcltrace) + mcache_buffer_log(mca, m, m_cache(class)); + if (alloc) mca->mca_uflags |= MB_COMP_INUSE; else @@ -2163,7 +2487,7 @@ mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc) * freeing, since the contents of the actual mbuf has been * pattern-filled by the above call to mcl_audit_mbuf(). 
*/ - if (!alloc) + if (!alloc && mclverify) ms = (struct mbuf *)mca->mca_contents; /* Do the cluster sanity checks and record its transaction */ @@ -2171,16 +2495,15 @@ mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc) clsp = slab_get(cl); VERIFY(ms->m_flags == M_EXT && cl != NULL); VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms)); - VERIFY(clsp->sl_refcnt == 1); - if (class == MC_MBUF_BIGCL) { - nsp = clsp->sl_next; - /* Next slab must already be present */ - VERIFY(nsp != NULL); - VERIFY(nsp->sl_refcnt == 1); - } else if (class == MC_MBUF_16KCL) { + if (class == MC_MBUF_CL) + VERIFY(clsp->sl_refcnt >= 1 && + clsp->sl_refcnt <= NCLPBG); + else + VERIFY(clsp->sl_refcnt == 1); + + if (class == MC_MBUF_16KCL) { int k; - for (nsp = clsp, k = 1; - k < (M16KCLBYTES / MCLBYTES); k++) { + for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) { nsp = nsp->sl_next; /* Next slab must already be present */ VERIFY(nsp != NULL); @@ -2196,7 +2519,9 @@ mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc) else size = m_maxsize(MC_16KCL); mcl_audit_cluster(mca, cl, size, alloc, FALSE); - mcache_buffer_log(mca, cl, m_cache(class)); + if (mcltrace) + mcache_buffer_log(mca, cl, m_cache(class)); + if (alloc) mca->mca_uflags |= MB_COMP_INUSE; else @@ -2221,8 +2546,8 @@ m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) mcache_obj_t *con_list = NULL; mcl_slab_t *sp; - VERIFY(bufsize == m_maxsize(MC_CL) || - bufsize == m_maxsize(MC_BIGCL) || bufsize == m_maxsize(MC_16KCL)); + VERIFY(bufsize == m_maxsize(MC_BIGCL) || + bufsize == m_maxsize(MC_16KCL)); lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); @@ -2258,7 +2583,7 @@ m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) page = kmem_mb_alloc(mb_map, size, large_buffer); /* - * If we did ask for "n" 16K physically contiguous chunks + * If we did ask for "n" 16KB physically contiguous chunks * and didn't get them, then please try again without this * restriction. */ @@ -2266,8 +2591,8 @@ m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) page = kmem_mb_alloc(mb_map, size, 0); if (page == 0) { - if (bufsize <= m_maxsize(MC_BIGCL)) { - /* Try for 1 page if failed, only for 2KB/4KB request */ + if (bufsize == m_maxsize(MC_BIGCL)) { + /* Try for 1 page if failed, only 4KB request */ size = NBPG; page = kmem_mb_alloc(mb_map, size, 0); } @@ -2288,24 +2613,20 @@ m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) /* * Yes, I realize this is a waste of memory for clusters * that never get transformed into mbufs, as we may end - * up with NMBPCL-1 unused audit structures per cluster. + * up with NMBPBG-1 unused audit structures per cluster. * But doing so tremendously simplifies the allocation * strategy, since at this point we are not holding the - * mbuf lock and the caller is okay to be blocked. For - * the case of big clusters, we allocate one structure - * for each as we never turn them into mbufs. + * mbuf lock and the caller is okay to be blocked. 
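+ * Put differently: each 4KB page is fronted with NMBPBG audit + * structures, so no matter how the page is later carved up (mbufs, + * 2KB or 4KB clusters), an audit structure is on hand for each object.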
*/ - if (bufsize == m_maxsize(MC_CL)) { - needed = numpages * 2 * NMBPCL; + if (bufsize == m_maxsize(MC_BIGCL)) { + needed = numpages * NMBPBG; i = mcache_alloc_ext(mcl_audit_con_cache, &con_list, needed, MCR_SLEEP); VERIFY(con_list != NULL && i == needed); - } else if (bufsize == m_maxsize(MC_BIGCL)) { - needed = numpages; } else { - needed = numpages / (M16KCLBYTES / NBPG); + needed = numpages / NSLABSP16KB; } i = mcache_alloc_ext(mcache_audit_cache, @@ -2331,68 +2652,23 @@ m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) mcl_paddr[offset] = new_page << PGSHIFT; /* Pattern-fill this fresh page */ - if (mclaudit != NULL) + if (mclverify) { mcache_set_pattern(MCACHE_FREE_PATTERN, (caddr_t)page, NBPG); - - if (bufsize == m_maxsize(MC_CL)) { - union mcluster *mcl = (union mcluster *)page; - - /* 1st cluster in the page */ - sp = slab_get(mcl); - if (mclaudit != NULL) - mcl_audit_init(mcl, &mca_list, &con_list, - AUDIT_CONTENTS_SIZE, NMBPCL); - - VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0); - slab_init(sp, MC_CL, SLF_MAPPED, - mcl, mcl, bufsize, 0, 1); - - /* Insert this slab */ - slab_insert(sp, MC_CL); - - /* Update stats now since slab_get() drops the lock */ - mbstat.m_clfree = ++m_infree(MC_CL) + - m_infree(MC_MBUF_CL); - mbstat.m_clusters = ++m_total(MC_CL); - VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL)); - - /* 2nd cluster in the page */ - sp = slab_get(++mcl); - if (mclaudit != NULL) - mcl_audit_init(mcl, &mca_list, &con_list, - AUDIT_CONTENTS_SIZE, NMBPCL); - - VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0); - slab_init(sp, MC_CL, SLF_MAPPED, - mcl, mcl, bufsize, 0, 1); - - /* Insert this slab */ - slab_insert(sp, MC_CL); - - /* Update stats now since slab_get() drops the lock */ - mbstat.m_clfree = ++m_infree(MC_CL) + - m_infree(MC_MBUF_CL); - mbstat.m_clusters = ++m_total(MC_CL); - VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL)); - } else if (bufsize == m_maxsize(MC_BIGCL)) { + } + if (bufsize == m_maxsize(MC_BIGCL)) { union mbigcluster *mbc = (union mbigcluster *)page; - mcl_slab_t *nsp; /* One for the entire page */ sp = slab_get(mbc); - if (mclaudit != NULL) - mcl_audit_init(mbc, &mca_list, NULL, 0, 1); - + if (mclaudit != NULL) { + mcl_audit_init(mbc, &mca_list, &con_list, + AUDIT_CONTENTS_SIZE, NMBPBG); + } VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0); slab_init(sp, MC_BIGCL, SLF_MAPPED, mbc, mbc, bufsize, 0, 1); - /* 2nd cluster's slab is part of the previous one */ - nsp = slab_get(((union mcluster *)page) + 1); - slab_init(nsp, MC_BIGCL, SLF_MAPPED | SLF_PARTIAL, - mbc, NULL, 0, 0, 0); - /* Insert this slab */ slab_insert(sp, MC_BIGCL); @@ -2401,7 +2677,7 @@ m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) m_infree(MC_MBUF_BIGCL); mbstat.m_bigclusters = ++m_total(MC_BIGCL); VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL)); - } else if ((i % (M16KCLBYTES / NBPG)) == 0) { + } else if ((i % NSLABSP16KB) == 0) { union m16kcluster *m16kcl = (union m16kcluster *)page; mcl_slab_t *nsp; int k; @@ -2416,9 +2692,12 @@ m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) slab_init(sp, MC_16KCL, SLF_MAPPED, m16kcl, m16kcl, bufsize, 0, 1); - /* 2nd-8th cluster's slab is part of the first one */ - for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) { - nsp = slab_get(((union mcluster *)page) + k); + /* + * 2nd-Nth page's slab is part of the first one, + * where N is NSLABSP16KB. 
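+ * (These follow-on slabs are initialized below with SLF_PARTIAL set; + * they share the leading slab's base address and carry no length or + * chunk count of their own.)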
+ */ + for (k = 1; k < NSLABSP16KB; k++) { + nsp = slab_get(((union mbigcluster *)page) + k); VERIFY(nsp->sl_refcnt == 0 && nsp->sl_flags == 0); slab_init(nsp, MC_16KCL, @@ -2444,13 +2723,11 @@ m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) wakeup(mb_clalloc_waitchan); } - if (bufsize == m_maxsize(MC_CL)) - return (numpages << 1); - else if (bufsize == m_maxsize(MC_BIGCL)) + if (bufsize == m_maxsize(MC_BIGCL)) return (numpages); VERIFY(bufsize == m_maxsize(MC_16KCL)); - return (numpages / (M16KCLBYTES / NBPG)); + return (numpages / NSLABSP16KB); out: lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); @@ -2466,23 +2743,7 @@ out: * When non-blocking we kick a thread if we have to grow the * pool or if the number of free clusters is less than requested. */ - if (bufsize == m_maxsize(MC_CL)) { - if (i > 0) { - /* - * Remember total number of clusters needed - * at this time. - */ - i += m_total(MC_CL); - if (i > mbuf_expand_mcl) { - mbuf_expand_mcl = i; - if (mbuf_worker_ready) - wakeup((caddr_t)&mbuf_worker_run); - } - } - - if (m_infree(MC_CL) >= num) - return (1); - } else if (bufsize == m_maxsize(MC_BIGCL)) { + if (bufsize == m_maxsize(MC_BIGCL)) { if (i > 0) { /* * Remember total number of 4KB clusters needed @@ -2525,44 +2786,30 @@ static int freelist_populate(mbuf_class_t class, unsigned int num, int wait) { mcache_obj_t *o = NULL; - int i; + int i, numpages = 0, count; VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL || class == MC_16KCL); -#if CONFIG_MBUF_NOEXPAND - if ((mbstat.m_mbufs / NMBPCL) >= maxmbufcl) { -#if DEBUG - static int printonce = 1; - if (printonce == 1) { - printonce = 0; - printf("m_expand failed, allocated %ld out of %d " - "clusters\n", mbstat.m_mbufs / NMBPCL, - nmbclusters); - } -#endif /* DEBUG */ - return (0); - } -#endif /* CONFIG_MBUF_NOEXPAND */ - lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); switch (class) { case MC_MBUF: case MC_CL: - i = m_clalloc(num, wait, m_maxsize(MC_CL)); + case MC_BIGCL: + numpages = (num * m_size(class) + NBPG - 1) / NBPG; + i = m_clalloc(numpages, wait, m_maxsize(MC_BIGCL)); - /* Respect the 2K clusters minimum limit */ - if (m_total(MC_CL) == m_maxlimit(MC_CL) && - m_infree(MC_CL) <= m_minlimit(MC_CL)) { - if (class != MC_CL || (wait & MCR_COMP)) + /* Respect the 4KB clusters minimum limit */ + if (m_total(MC_BIGCL) == m_maxlimit(MC_BIGCL) && + m_infree(MC_BIGCL) <= m_minlimit(MC_BIGCL)) { + if (class != MC_BIGCL || (wait & MCR_COMP)) return (0); } - if (class == MC_CL) + if (class == MC_BIGCL) return (i != 0); break; - case MC_BIGCL: case MC_16KCL: return (m_clalloc(num, wait, m_maxsize(class)) != 0); /* NOTREACHED */ @@ -2572,66 +2819,119 @@ freelist_populate(mbuf_class_t class, unsigned int num, int wait) /* NOTREACHED */ } - /* Steal a cluster and cut it up to create NMBPCL mbufs */ - if ((o = slab_alloc(MC_CL, wait)) != NULL) { + VERIFY(class == MC_MBUF || class == MC_CL); + + /* how many objects will we cut the page into? */ + int numobj = (class == MC_MBUF ? 
NMBPBG : NCLPBG); + + for (count = 0; count < numpages; count++) { + + /* respect totals, minlimit, maxlimit */ + if (m_total(MC_BIGCL) <= m_minlimit(MC_BIGCL) || + m_total(class) >= m_maxlimit(class)) + break; + + if ((o = slab_alloc(MC_BIGCL, wait)) == NULL) + break; + struct mbuf *m = (struct mbuf *)o; - mcache_audit_t *mca = NULL; + union mcluster *c = (union mcluster *)o; mcl_slab_t *sp = slab_get(o); + mcache_audit_t *mca = NULL; VERIFY(slab_is_detached(sp) && (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED); - /* Make sure that the cluster is unmolested while in freelist */ - if (mclaudit != NULL) { - mca = mcl_audit_buf2mca(MC_CL, o); - mcache_audit_free_verify(mca, o, 0, m_maxsize(MC_CL)); + /* + * Make sure that the cluster is unmolested + * while in freelist + */ + if (mclverify) { + mca = mcl_audit_buf2mca(MC_BIGCL, o); + mcache_audit_free_verify(mca, o, 0, + m_maxsize(MC_BIGCL)); } - /* Reinitialize it as an mbuf slab */ - slab_init(sp, MC_MBUF, sp->sl_flags, sp->sl_base, NULL, - sp->sl_len, 0, NMBPCL); + /* Reinitialize it as an mbuf or 2K slab */ + slab_init(sp, class, sp->sl_flags, + sp->sl_base, NULL, sp->sl_len, 0, numobj); - VERIFY(m == (struct mbuf *)sp->sl_base); + VERIFY(o == (mcache_obj_t *)sp->sl_base); VERIFY(sp->sl_head == NULL); - m_total(MC_MBUF) += NMBPCL; - mbstat.m_mbufs = m_total(MC_MBUF); - m_infree(MC_MBUF) += NMBPCL; - mtype_stat_add(MT_FREE, NMBPCL); + VERIFY(m_total(MC_BIGCL) > 0); + m_total(MC_BIGCL)--; + mbstat.m_bigclusters = m_total(MC_BIGCL); - i = NMBPCL; - while (i--) { - /* - * If auditing is enabled, construct the shadow mbuf - * in the audit structure instead of the actual one. - * mbuf_slab_audit() will take care of restoring the - * contents after the integrity check. - */ - if (mclaudit != NULL) { - struct mbuf *ms; - mca = mcl_audit_buf2mca(MC_MBUF, - (mcache_obj_t *)m); - ms = ((struct mbuf *)mca->mca_contents); - ms->m_type = MT_FREE; - } else { - m->m_type = MT_FREE; + m_total(class) += numobj; + m_infree(class) += numobj; + + VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL)); + VERIFY(m_total(class) <= m_maxlimit(class)); + + i = numobj; + if (class == MC_MBUF) { + mbstat.m_mbufs = m_total(MC_MBUF); + mtype_stat_add(MT_FREE, NMBPBG); + while (i--) { + /* + * If auditing is enabled, construct the + * shadow mbuf in the audit structure + * instead of the actual one. + * mbuf_slab_audit() will take care of + * restoring the contents after the + * integrity check. + */ + if (mclaudit != NULL) { + struct mbuf *ms; + mca = mcl_audit_buf2mca(MC_MBUF, + (mcache_obj_t *)m); + ms = ((struct mbuf *) + mca->mca_contents); + ms->m_type = MT_FREE; + } else { + m->m_type = MT_FREE; + } + m->m_next = sp->sl_head; + sp->sl_head = (void *)m++; + } + } else { /* MC_CL */ + mbstat.m_clfree = + m_infree(MC_CL) + m_infree(MC_MBUF_CL); + mbstat.m_clusters = m_total(MC_CL); + while (i--) { + c->mcl_next = sp->sl_head; + sp->sl_head = (void *)c++; } - m->m_next = sp->sl_head; - sp->sl_head = (void *)m++; } - /* Insert it into the mbuf class's slab list */ - slab_insert(sp, MC_MBUF); + /* Insert into the mbuf or 2k slab list */ + slab_insert(sp, class); if ((i = mb_waiters) > 0) mb_waiters = 0; if (i != 0) wakeup(mb_waitchan); - - return (1); } + return (count != 0); +} - return (0); +/* + * For each class, initialize the freelist to hold m_minlimit() objects. 
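+ * Expected to be called with the mbuf lock held; it simply loops on + * freelist_populate() with M_WAIT until m_minlimit() is satisfied.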
+ */ +static void +freelist_init(mbuf_class_t class) +{ + lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + + VERIFY(class == MC_CL || class == MC_BIGCL); + VERIFY(m_total(class) == 0); + VERIFY(m_minlimit(class) > 0); + + while (m_total(class) < m_minlimit(class)) + (void) freelist_populate(class, m_minlimit(class), M_WAIT); + + VERIFY(m_total(class) >= m_minlimit(class)); } /* @@ -2736,17 +3036,23 @@ m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp) switch (class) { case MC_MBUF: m_wantpurge(MC_CL)++; + m_wantpurge(MC_BIGCL)++; m_wantpurge(MC_MBUF_CL)++; m_wantpurge(MC_MBUF_BIGCL)++; break; case MC_CL: m_wantpurge(MC_MBUF)++; + m_wantpurge(MC_BIGCL)++; + m_wantpurge(MC_MBUF_BIGCL)++; if (!comp) m_wantpurge(MC_MBUF_CL)++; break; case MC_BIGCL: + m_wantpurge(MC_MBUF)++; + m_wantpurge(MC_CL)++; + m_wantpurge(MC_MBUF_CL)++; if (!comp) m_wantpurge(MC_MBUF_BIGCL)++; break; @@ -2894,11 +3200,11 @@ m_free(struct mbuf *m) if (m->m_flags & M_EXT) { u_int32_t refcnt; - u_int32_t flags; + u_int32_t composite; refcnt = m_decref(m); - flags = MEXT_FLAGS(m); - if (refcnt == 0 && flags == 0) { + composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE); + if (refcnt == 0 && !composite) { if (m->m_ext.ext_free == NULL) { mcache_free(m_cache(MC_CL), m->m_ext.ext_buf); } else if (m->m_ext.ext_free == m_bigfree) { @@ -2913,7 +3219,7 @@ m_free(struct mbuf *m) } mcache_free(ref_cache, MEXT_RFA(m)); MEXT_RFA(m) = NULL; - } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) { + } else if (refcnt == 0 && composite) { VERIFY(m->m_type != MT_FREE); mtype_stat_dec(m->m_type); @@ -2924,6 +3230,8 @@ m_free(struct mbuf *m) m->m_len = 0; m->m_next = m->m_nextpkt = NULL; + MEXT_FLAGS(m) &= ~EXTF_READONLY; + /* "Free" into the intermediate cache */ if (m->m_ext.ext_free == NULL) { mcache_free(m_cache(MC_MBUF_CL), m); @@ -2963,11 +3271,11 @@ m_clattach(struct mbuf *m, int type, caddr_t extbuf, if (m->m_flags & M_EXT) { u_int32_t refcnt; - u_int32_t flags; + u_int32_t composite; refcnt = m_decref(m); - flags = MEXT_FLAGS(m); - if (refcnt == 0 && flags == 0) { + composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE); + if (refcnt == 0 && !composite) { if (m->m_ext.ext_free == NULL) { mcache_free(m_cache(MC_CL), m->m_ext.ext_buf); } else if (m->m_ext.ext_free == m_bigfree) { @@ -2982,7 +3290,7 @@ m_clattach(struct mbuf *m, int type, caddr_t extbuf, } /* Re-use the reference structure */ rfa = MEXT_RFA(m); - } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) { + } else if (refcnt == 0 && composite) { VERIFY(m->m_type != MT_FREE); mtype_stat_dec(m->m_type); @@ -2992,6 +3300,9 @@ m_clattach(struct mbuf *m, int type, caddr_t extbuf, m->m_flags = M_EXT; m->m_len = 0; m->m_next = m->m_nextpkt = NULL; + + MEXT_FLAGS(m) &= ~EXTF_READONLY; + /* "Free" into the intermediate cache */ if (m->m_ext.ext_free == NULL) { mcache_free(m_cache(MC_MBUF_CL), m); @@ -3036,14 +3347,29 @@ m_getcl(int wait, int type, int flags) if (mcflags & MCR_NOSLEEP) mcflags |= MCR_TRYHARD; - m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags); - if (m != NULL) { + m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags); + if (m != NULL) { + u_int32_t flag; + struct ext_ref *rfa; + void *cl; + + VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT); + cl = m->m_ext.ext_buf; + rfa = MEXT_RFA(m); + + ASSERT(cl != NULL && rfa != NULL); + VERIFY(MBUF_IS_COMPOSITE(m) && m->m_ext.ext_free == NULL); + + flag = MEXT_FLAGS(m); + MBUF_INIT(m, hdr, type); + MBUF_CL_INIT(m, cl, rfa, 1, flag); + mtype_stat_inc(type); mtype_stat_dec(MT_FREE); #if CONFIG_MACF_NET if (hdr && mac_init_mbuf(m, wait) != 
0) { - m_free(m); + m_freem(m); return (NULL); } #endif /* MAC_NET */ @@ -3091,7 +3417,7 @@ m_mclfree(caddr_t p) /* * mcl_hasreference() checks if a cluster of an mbuf is referenced by - * another mbuf + * another mbuf; see comments in m_incref() regarding EXTF_READONLY. */ int m_mclhasreference(struct mbuf *m) @@ -3101,7 +3427,7 @@ m_mclhasreference(struct mbuf *m) ASSERT(MEXT_RFA(m) != NULL); - return (MEXT_REF(m) > 1); + return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0); } __private_extern__ caddr_t @@ -3292,7 +3618,7 @@ m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs, --num_with_pkthdrs; #if CONFIG_MACF_NET if (mac_mbuf_label_init(m, wait) != 0) { - m_free(m); + m_freem(m); break; } #endif /* MAC_NET */ @@ -3608,7 +3934,7 @@ m_allocpacket_internal(unsigned int *numlist, size_t packetlen, #if CONFIG_MACF_NET if (pkthdr && mac_init_mbuf(m, wait) != 0) { --num; - m_free(m); + m_freem(m); break; } #endif /* MAC_NET */ @@ -3745,7 +4071,7 @@ m_freem_list(struct mbuf *m) while (m != NULL) { struct mbuf *next = m->m_next; mcache_obj_t *o, *rfa; - u_int32_t refcnt, flags; + u_int32_t refcnt, composite; if (m->m_type == MT_FREE) panic("m_free: freeing an already freed mbuf"); @@ -3762,8 +4088,8 @@ m_freem_list(struct mbuf *m) o = (mcache_obj_t *)m->m_ext.ext_buf; refcnt = m_decref(m); - flags = MEXT_FLAGS(m); - if (refcnt == 0 && flags == 0) { + composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE); + if (refcnt == 0 && !composite) { if (m->m_ext.ext_free == NULL) { o->obj_next = mcl_list; mcl_list = o; @@ -3782,7 +4108,7 @@ m_freem_list(struct mbuf *m) rfa->obj_next = ref_list; ref_list = rfa; MEXT_RFA(m) = NULL; - } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) { + } else if (refcnt == 0 && composite) { VERIFY(m->m_type != MT_FREE); /* * Amortize the costs of atomic operations @@ -3804,6 +4130,8 @@ m_freem_list(struct mbuf *m) m->m_len = 0; m->m_next = m->m_nextpkt = NULL; + MEXT_FLAGS(m) &= ~EXTF_READONLY; + /* "Free" into the intermediate cache */ o = (mcache_obj_t *)m; if (m->m_ext.ext_free == NULL) { @@ -4067,7 +4395,7 @@ nospace: */ struct mbuf * m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait, - struct mbuf **m_last, int *m_off) + struct mbuf **m_lastm, int *m_off) { struct mbuf *n, **np = NULL; int off = off0, len = len0; @@ -4081,8 +4409,8 @@ m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait, if (off == 0 && (m->m_flags & M_PKTHDR)) copyhdr = 1; - if (*m_last != NULL) { - m = *m_last; + if (*m_lastm != NULL) { + m = *m_lastm; off = *m_off; } else { while (off >= m->m_len) { @@ -4159,10 +4487,10 @@ m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait, if (len == 0) { if ((off + n->m_len) == m->m_len) { - *m_last = m->m_next; + *m_lastm = m->m_next; *m_off = 0; } else { - *m_last = m; + *m_lastm = m; *m_off = off + n->m_len; } break; @@ -4385,6 +4713,56 @@ bad: return (0); } +/* + * Like m_pullup(), except a new mbuf is always allocated, and we allow + * the amount of empty space before the data in the new mbuf to be specified + * (in the event that the caller expects to prepend later). 
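+ * On failure the original chain is freed, the MSFail counter is + * incremented and NULL is returned.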
+ */ +__private_extern__ int MSFail = 0; + +__private_extern__ struct mbuf * +m_copyup(struct mbuf *n, int len, int dstoff) +{ + struct mbuf *m; + int count, space; + + if (len > (MHLEN - dstoff)) + goto bad; + MGET(m, M_DONTWAIT, n->m_type); + if (m == NULL) + goto bad; + m->m_len = 0; + if (n->m_flags & M_PKTHDR) { + m_copy_pkthdr(m, n); + n->m_flags &= ~M_PKTHDR; + } + m->m_data += dstoff; + space = &m->m_dat[MLEN] - (m->m_data + m->m_len); + do { + count = min(min(max(len, max_protohdr), space), n->m_len); + memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t), + (unsigned)count); + len -= count; + m->m_len += count; + n->m_len -= count; + space -= count; + if (n->m_len) + n->m_data += count; + else + n = m_free(n); + } while (len > 0 && n); + if (len > 0) { + (void) m_free(m); + goto bad; + } + m->m_next = n; + return (m); +bad: + m_freem(n); + MSFail++; + return (NULL); +} + /* * Partition an mbuf chain in two pieces, returning the tail -- * all but the first len0 bytes. In case of failure, it returns NULL and @@ -4531,29 +4909,9 @@ m_devget(char *buf, int totlen, int off0, struct ifnet *ifp, return (top); } -void -mbuf_growth_aggressive(void) -{ - lck_mtx_lock(mbuf_mlock); - /* - * Don't start to grow the pool until we are at least - * 1/2 (50%) of current total capacity. - */ - mbuf_gscale = MB_GROWTH_AGGRESSIVE; - lck_mtx_unlock(mbuf_mlock); -} - -void -mbuf_growth_normal(void) -{ - lck_mtx_lock(mbuf_mlock); - /* - * Don't start to grow the pool until we are at least - * 15/16 (93.75%) of current total capacity. - */ - mbuf_gscale = MB_GROWTH_NORMAL; - lck_mtx_unlock(mbuf_mlock); -} +#ifndef MBUF_GROWTH_NORMAL_THRESH +#define MBUF_GROWTH_NORMAL_THRESH 25 +#endif /* * Cluster freelist allocation check. @@ -4562,94 +4920,121 @@ static int m_howmany(int num, size_t bufsize) { int i = 0, j = 0; - u_int32_t m_clusters, m_bigclusters, m_16kclusters; - u_int32_t m_clfree, m_bigclfree, m_16kclfree; - u_int32_t s = mbuf_gscale; + u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters; + u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree; + u_int32_t sumclusters, freeclusters; + u_int32_t percent_pool, percent_kmem; + u_int32_t mb_growth, mb_growth_thresh; + + VERIFY(bufsize == m_maxsize(MC_BIGCL) || + bufsize == m_maxsize(MC_16KCL)); lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); + /* Numbers in 2K cluster units */ + m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT; m_clusters = m_total(MC_CL); - m_bigclusters = m_total(MC_BIGCL); + m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT; m_16kclusters = m_total(MC_16KCL); + sumclusters = m_mbclusters + m_clusters + m_bigclusters; + + m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT; m_clfree = m_infree(MC_CL); - m_bigclfree = m_infree(MC_BIGCL); + m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT; m_16kclfree = m_infree(MC_16KCL); + freeclusters = m_mbfree + m_clfree + m_bigclfree; /* Bail if we've maxed out the mbuf memory map */ - if ((bufsize != m_maxsize(MC_16KCL) && - (m_clusters + (m_bigclusters << 1) >= nclusters)) || + if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) || (njcl > 0 && bufsize == m_maxsize(MC_16KCL) && - (m_16kclusters << 3) >= njcl)) { -#if DEBUG - if (bufsize == MCLBYTES && num > m_clfree) { - printf("m_howmany - out of small clusters, " - "%d short\n", num - mbstat.m_clfree); - } -#endif /* DEBUG */ + (m_16kclusters << NCLPJCLSHIFT) >= njcl)) { return (0); } - if (bufsize == m_maxsize(MC_CL)) { + if (bufsize == m_maxsize(MC_BIGCL)) { /* Under minimum */ - if (m_clusters < MINCL) - return (MINCL - 
m_clusters); - /* Too few (free < threshold) and not over maximum */ - if (m_clusters < m_maxlimit(MC_CL)) { - if (m_clfree >= MCL_LOWAT) + if (m_bigclusters < m_minlimit(MC_BIGCL)) + return (m_minlimit(MC_BIGCL) - m_bigclusters); + + percent_pool = + ((sumclusters - freeclusters) * 100) / sumclusters; + percent_kmem = (sumclusters * 100) / nclusters; + + /* + * If a light/normal user, grow conservatively (75%) + * If a heavy user, grow aggressively (50%) + */ + if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH) + mb_growth = MB_GROWTH_NORMAL; + else + mb_growth = MB_GROWTH_AGGRESSIVE; + + if (percent_kmem < 5) { + /* For initial allocations */ + i = num; + } else { + /* Return if >= MBIGCL_LOWAT clusters available */ + if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT && + m_total(MC_BIGCL) >= + MBIGCL_LOWAT + m_minlimit(MC_BIGCL)) return (0); - if (num >= m_clfree) - i = num - m_clfree; - if (((m_clusters + num) >> s) > m_clfree) - j = ((m_clusters + num) >> s) - m_clfree; + + /* Ensure at least num clusters are accessible */ + if (num >= m_infree(MC_BIGCL)) + i = num - m_infree(MC_BIGCL); + if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL)) + j = num - (m_total(MC_BIGCL) - + m_minlimit(MC_BIGCL)); + i = MAX(i, j); - if (i + m_clusters >= m_maxlimit(MC_CL)) - i = m_maxlimit(MC_CL) - m_clusters; - } - VERIFY((m_total(MC_CL) + i) <= m_maxlimit(MC_CL)); - } else if (bufsize == m_maxsize(MC_BIGCL)) { - /* Under minimum */ - if (m_bigclusters < MINBIGCL) - return (MINBIGCL - m_bigclusters); - /* Too few (free < 1/16 total) and not over maximum */ - if (m_bigclusters < m_maxlimit(MC_BIGCL)) { - if (m_bigclfree >= MBIGCL_LOWAT) - return (0); - if (num >= m_bigclfree) - i = num - m_bigclfree; - if (((m_bigclusters + num) >> 4) > m_bigclfree) - j = ((m_bigclusters + num) >> 4) - m_bigclfree; + + /* + * Grow pool if percent_pool > 75 (normal growth) + * or percent_pool > 50 (aggressive growth). 
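+ * (mb_growth_thresh below works out to 100 - 100 / (1 << mb_growth), + * i.e. 75 for normal and 50 for aggressive growth; this presumes + * MB_GROWTH_NORMAL == 2 and MB_GROWTH_AGGRESSIVE == 1.)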
+ */ + mb_growth_thresh = 100 - (100 / (1 << mb_growth)); + if (percent_pool > mb_growth_thresh) + j = ((sumclusters + num) >> mb_growth) - + freeclusters; i = MAX(i, j); - if (i + m_bigclusters >= m_maxlimit(MC_BIGCL)) - i = m_maxlimit(MC_BIGCL) - m_bigclusters; } + + /* Check to ensure we didn't go over limits */ + if (i + m_bigclusters >= m_maxlimit(MC_BIGCL)) + i = m_maxlimit(MC_BIGCL) - m_bigclusters; + if ((i << 1) + sumclusters >= nclusters) + i = (nclusters - sumclusters) >> 1; VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL)); - } else { + VERIFY(sumclusters + (i << 1) <= nclusters); + + } else { /* 16K CL */ VERIFY(njcl > 0); /* Under minimum */ if (m_16kclusters < MIN16KCL) return (MIN16KCL - m_16kclusters); - /* Too few (free < 1/16 total) and not over maximum */ - if (m_16kclusters < m_maxlimit(MC_16KCL)) { - if (m_16kclfree >= M16KCL_LOWAT) - return (0); - if (num >= m_16kclfree) - i = num - m_16kclfree; - if (((m_16kclusters + num) >> 4) > m_16kclfree) - j = ((m_16kclusters + num) >> 4) - m_16kclfree; - i = MAX(i, j); - if (i + m_16kclusters >= m_maxlimit(MC_16KCL)) - i = m_maxlimit(MC_16KCL) - m_16kclusters; - } + if (m_16kclfree >= M16KCL_LOWAT) + return (0); + + /* Ensure at least num clusters are available */ + if (num >= m_16kclfree) + i = num - m_16kclfree; + + /* Always grow 16KCL pool aggressively */ + if (((m_16kclusters + num) >> 1) > m_16kclfree) + j = ((m_16kclusters + num) >> 1) - m_16kclfree; + i = MAX(i, j); + + /* Check to ensure we don't go over limit */ + if (i + m_16kclusters >= m_maxlimit(MC_16KCL)) + i = m_maxlimit(MC_16KCL) - m_16kclusters; VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL)); } - return (i); } - /* * Return the number of bytes in the mbuf chain, m. - */ -static unsigned int + */ +unsigned int m_length(struct mbuf *m) { struct mbuf *m0; @@ -5157,6 +5542,61 @@ m_normalize(struct mbuf *m) return (top); } +/* + * Append the specified data to the indicated mbuf chain. + * Extend the mbuf chain if the new data does not fit in + * existing space. + * + * Return 1 if able to complete the job; otherwise 0. + */ +int +m_append(struct mbuf *m0, int len, caddr_t cp) +{ + struct mbuf *m, *n; + int remainder, space; + + for (m = m0; m->m_next != NULL; m = m->m_next) + ; + remainder = len; + space = M_TRAILINGSPACE(m); + if (space > 0) { + /* + * Copy into available space. + */ + if (space > remainder) + space = remainder; + bcopy(cp, mtod(m, caddr_t) + m->m_len, space); + m->m_len += space; + cp += space, remainder -= space; + } + while (remainder > 0) { + /* + * Allocate a new mbuf; could check space + * and allocate a cluster instead. + */ + n = m_get(M_WAITOK, m->m_type); + if (n == NULL) + break; + n->m_len = min(MLEN, remainder); + bcopy(cp, mtod(n, caddr_t), n->m_len); + cp += n->m_len; + remainder -= n->m_len; + m->m_next = n; + m = n; + } + if (m0->m_flags & M_PKTHDR) + m0->m_pkthdr.len += len - remainder; + return (remainder == 0); +} + +struct mbuf * +m_last(struct mbuf *m) +{ + while (m->m_next != NULL) + m = m->m_next; + return (m); +} + void m_mchtype(struct mbuf *m, int t) { @@ -5183,6 +5623,34 @@ m_mcheck(struct mbuf *m) _MCHECK(m); } +/* + * Return a pointer to the mbuf/offset of a location in an mbuf chain. + */ +struct mbuf * +m_getptr(struct mbuf *m, int loc, int *off) +{ + + while (loc >= 0) { + /* Normal end of search. */ + if (m->m_len > loc) { + *off = loc; + return (m); + } else { + loc -= m->m_len; + if (m->m_next == NULL) { + if (loc == 0) { + /* Point at the end of valid data.
*/ + *off = m->m_len; + return (m); + } + return (NULL); + } + m = m->m_next; + } + } + return (NULL); +} + /* * Inform the corresponding mcache(s) that there's a waiter below. */ @@ -5225,6 +5693,29 @@ mbuf_waiter_dec(mbuf_class_t class, boolean_t comp) } } +/* + * Called during slab (blocking and non-blocking) allocation. If there + * is at least one waiter, and the time since the first waiter was blocked + * is greater than the watchdog timeout, panic the system. + */ +static void +mbuf_watchdog(void) +{ + struct timeval now; + unsigned int since; + + if (mb_waiters == 0 || !mb_watchdog) + return; + + microuptime(&now); + since = now.tv_sec - mb_wdtstart.tv_sec; + if (since >= MB_WDT_MAXTIME) { + panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__, + mb_waiters, since, mbuf_dump()); + /* NOTREACHED */ + } +} + /* * Called during blocking allocation. Returns TRUE if one or more objects * are available at the per-CPU caches layer and that allocation should be @@ -5266,6 +5757,16 @@ mbuf_sleep(mbuf_class_t class, unsigned int num, int wait) mbuf_waiter_inc(class, (wait & MCR_COMP)); VERIFY(!(wait & MCR_NOSLEEP)); + + /* + * If this is the first waiter, arm the watchdog timer. Otherwise + * check if we need to panic the system due to watchdog timeout. + */ + if (mb_waiters == 0) + microuptime(&mb_wdtstart); + else + mbuf_watchdog(); + + mb_waiters++; (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL); @@ -5420,7 +5921,7 @@ slab_get(void *buf) } } - ix = MTOCL(buf) % NSLABSPMB; + ix = MTOBG(buf) % NSLABSPMB; VERIFY(ix < NSLABSPMB); return (&slg->slg_slab[ix]); @@ -5447,15 +5948,9 @@ slab_insert(mcl_slab_t *sp, mbuf_class_t class) m_slab_cnt(class)++; TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link); sp->sl_flags &= ~SLF_DETACHED; - if (class == MC_BIGCL) { - sp = sp->sl_next; - /* Next slab must already be present */ - VERIFY(sp != NULL); - VERIFY(slab_is_detached(sp)); - sp->sl_flags &= ~SLF_DETACHED; - } else if (class == MC_16KCL) { + if (class == MC_16KCL) { int k; - for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) { + for (k = 1; k < NSLABSP16KB; k++) { sp = sp->sl_next; /* Next slab must already be present */ VERIFY(sp != NULL); @@ -5473,15 +5968,9 @@ slab_remove(mcl_slab_t *sp, mbuf_class_t class) m_slab_cnt(class)--; TAILQ_REMOVE(&m_slablist(class), sp, sl_link); slab_detach(sp); - if (class == MC_BIGCL) { - sp = sp->sl_next; - /* Next slab must already be present */ - VERIFY(sp != NULL); - VERIFY(!slab_is_detached(sp)); - slab_detach(sp); - } else if (class == MC_16KCL) { + if (class == MC_16KCL) { int k; - for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) { + for (k = 1; k < NSLABSP16KB; k++) { sp = sp->sl_next; /* Next slab must already be present */ VERIFY(sp != NULL); @@ -5511,7 +6000,7 @@ slab_nextptr_panic(mcl_slab_t *sp, void *addr) void *next = ((mcache_obj_t *)buf)->obj_next; if (next != addr) continue; - if (mclaudit == NULL) { + if (!mclverify) { if (next != NULL && !MBUF_IN_MAP(next)) { mcache_t *cp = m_cache(sp->sl_class); panic("%s: %s buffer %p in slab %p modified " @@ -5553,12 +6042,14 @@ mcl_audit_init(void *buf, mcache_audit_t **mca_list, boolean_t save_contents = (con_list != NULL); unsigned int i, ix; - ASSERT(num <= NMBPCL); + ASSERT(num <= NMBPBG); ASSERT(con_list == NULL || con_size != 0); - ix = MTOCL(buf); + ix = MTOBG(buf); + VERIFY(ix < maxclaudit); + /* Make sure we haven't been here before */ - for (i = 0; i < NMBPCL; i++) + for (i = 0; i < NMBPBG; i++) VERIFY(mclaudit[ix].cl_audit[i] == NULL); mca = mca_tail = *mca_list; @@
-5594,31 +6085,39 @@ mcl_audit_init(void *buf, mcache_audit_t **mca_list, } /* - * Given an address of a buffer (mbuf/cluster/big cluster), return + * Given an address of a buffer (mbuf/2KB/4KB/16KB), return * the corresponding audit structure for that buffer. */ static mcache_audit_t * mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o) { mcache_audit_t *mca = NULL; - int ix = MTOCL(o); + int ix = MTOBG(o); + VERIFY(ix < maxclaudit); VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG))); switch (class) { case MC_MBUF: /* - * For the mbuf case, find the index of the cluster + * For the mbuf case, find the index of the page * used by the mbuf and use that index to locate the - * base address of the cluster. Then find out the - * mbuf index relative to the cluster base and use + * base address of the page. Then find out the + * mbuf index relative to the page base and use * it to locate the audit structure. */ - VERIFY(MCLIDX(CLTOM(ix), o) < (int)NMBPCL); - mca = mclaudit[ix].cl_audit[MCLIDX(CLTOM(ix), o)]; + VERIFY(MCLIDX(BGTOM(ix), o) < (int)NMBPBG); + mca = mclaudit[ix].cl_audit[MCLIDX(BGTOM(ix), o)]; break; case MC_CL: + /* + * Same thing as above, but for 2KB clusters in a page. + */ + VERIFY(CLBGIDX(BGTOM(ix), o) < (int)NCLPBG); + mca = mclaudit[ix].cl_audit[CLBGIDX(BGTOM(ix), o)]; + break; + case MC_BIGCL: case MC_16KCL: /* @@ -5645,19 +6144,24 @@ mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite, VERIFY(mca->mca_contents != NULL && mca->mca_contents_size == AUDIT_CONTENTS_SIZE); - mcl_audit_verify_nextptr(next, mca); + if (mclverify) + mcl_audit_verify_nextptr(next, mca); if (!alloc) { /* Save constructed mbuf fields */ mcl_audit_save_mbuf(m, mca); - mcache_set_pattern(MCACHE_FREE_PATTERN, m, m_maxsize(MC_MBUF)); + if (mclverify) { + mcache_set_pattern(MCACHE_FREE_PATTERN, m, + m_maxsize(MC_MBUF)); + } ((mcache_obj_t *)m)->obj_next = next; return; } /* Check if the buffer has been corrupted while in freelist */ - mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF)); - + if (mclverify) { + mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF)); + } /* Restore constructed mbuf fields */ mcl_audit_restore_mbuf(m, mca, composite); } @@ -5704,12 +6208,14 @@ mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc, mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next; if (!alloc) { - mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size); + if (mclverify) { + mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size); + } if (save_next) { mcl_audit_verify_nextptr(next, mca); ((mcache_obj_t *)addr)->obj_next = next; } - } else { + } else if (mclverify) { /* Check if the buffer has been corrupted while in freelist */ mcl_audit_verify_nextptr(next, mca); mcache_audit_free_verify_set(mca, addr, 0, size); @@ -5732,8 +6238,8 @@ mcl_audit_mcheck_panic(struct mbuf *m) static void mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca) { - if (next != NULL && next != (void *)MCACHE_FREE_PATTERN && - !MBUF_IN_MAP(next)) { + if (next != NULL && !MBUF_IN_MAP(next) && + (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) { panic("mcl_audit: buffer %p modified after free at offset 0: " "%p out of range [%p-%p)\n%s\n", mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca)); @@ -5741,10 +6247,358 @@ mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca) } } +/* This function turns on mbuf leak detection */ +static void +mleak_activate(void) +{ + mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR; + PE_parse_boot_argn("mleak_sample_factor", + 
&mleak_table.mleak_sample_factor, + sizeof (mleak_table.mleak_sample_factor)); + + if (mleak_table.mleak_sample_factor == 0) + mclfindleak = 0; + + if (mclfindleak == 0) + return; + + vm_size_t alloc_size = + mleak_alloc_buckets * sizeof (struct mallocation); + vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace); + + MALLOC(mleak_allocations, struct mallocation *, alloc_size, + M_TEMP, M_WAITOK | M_ZERO); + VERIFY(mleak_allocations != NULL); + + MALLOC(mleak_traces, struct mtrace *, trace_size, + M_TEMP, M_WAITOK | M_ZERO); + VERIFY(mleak_traces != NULL); + + MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES), + M_TEMP, M_WAITOK | M_ZERO); + VERIFY(mleak_stat != NULL); + mleak_stat->ml_cnt = MLEAK_NUM_TRACES; +#ifdef __LP64__ + mleak_stat->ml_isaddr64 = 1; +#endif /* __LP64__ */ +} + +static void +mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc) +{ + int temp; + + if (mclfindleak == 0) + return; + + if (!alloc) + return (mleak_free(addr)); + + temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1); + + if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) { + uintptr_t bt[MLEAK_STACK_DEPTH]; + int logged = fastbacktrace(bt, MLEAK_STACK_DEPTH); + mleak_log(bt, addr, logged, num); + } +} + +/* + * This function records the allocation in the mleak_allocations table + * and the backtrace in the mleak_traces table. If the allocation slot + * is already in use, the old allocation is replaced with the new one; + * if the trace slot is already in use with a different trace, bail out + * (or just increment the refcount if it is the same trace). + */ +static boolean_t +mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num) +{ + struct mallocation *allocation; + struct mtrace *trace; + uint32_t trace_index; + int i; + + /* Quit if someone else is modifying the tables */ + if (!lck_mtx_try_lock_spin(mleak_lock)) { + mleak_table.total_conflicts++; + return (FALSE); + } + + allocation = &mleak_allocations[hashaddr((uintptr_t)addr, + mleak_alloc_buckets)]; + trace_index = hashbacktrace(bt, depth, mleak_trace_buckets); + trace = &mleak_traces[trace_index]; + + VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]); + VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]); + + allocation->hitcount++; + trace->hitcount++; + + /* + * If the allocation bucket we want is occupied + * and the occupier has the same trace, just bail. + */ + if (allocation->element != NULL && + trace_index == allocation->trace_index) { + mleak_table.alloc_collisions++; + lck_mtx_unlock(mleak_lock); + return (TRUE); + } + + /* + * Store the backtrace in the traces array; + * an allocs count of zero means the trace bucket is free. + */ + if (trace->allocs > 0 && + bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) { + /* Different, unique trace, but the same hash! Bail out. */ + trace->collisions++; + mleak_table.trace_collisions++; + lck_mtx_unlock(mleak_lock); + return (TRUE); + } else if (trace->allocs > 0) { + /* Same trace, already added, so increment refcount */ + trace->allocs++; + } else { + /* Found an unused trace bucket, so record the trace here */ + if (trace->depth != 0) { + /* this slot previously used but not currently in use */ + mleak_table.trace_overwrites++; + } + mleak_table.trace_recorded++; + trace->allocs = 1; + memcpy(trace->addr, bt, (depth * sizeof (uintptr_t))); + trace->depth = depth; + trace->collisions = 0; + } + + /* Step 2: Store the allocation record in the allocations array */ + if (allocation->element != NULL) { + /* + * Replace an existing allocation.
No need to preserve + * because only a subset of the allocations are being + * recorded anyway. + */ + mleak_table.alloc_collisions++; + } else if (allocation->trace_index != 0) { + mleak_table.alloc_overwrites++; + } + allocation->element = addr; + allocation->trace_index = trace_index; + allocation->count = num; + mleak_table.alloc_recorded++; + mleak_table.outstanding_allocs++; + + /* keep the top MLEAK_NUM_TRACES traces, ordered by allocation count */ + for (i = 0; i < MLEAK_NUM_TRACES; i++) { + if (mleak_top_trace[i] == NULL || + mleak_top_trace[i]->allocs <= trace->allocs) { + if (mleak_top_trace[i] != trace) { + int j = MLEAK_NUM_TRACES; + while (--j > i) { + mleak_top_trace[j] = + mleak_top_trace[j - 1]; + } + mleak_top_trace[i] = trace; + } + break; + } + } + + lck_mtx_unlock(mleak_lock); + return (TRUE); +} + +static void +mleak_free(mcache_obj_t *addr) +{ + while (addr != NULL) { + struct mallocation *allocation = &mleak_allocations + [hashaddr((uintptr_t)addr, mleak_alloc_buckets)]; + + if (allocation->element == addr && + allocation->trace_index < mleak_trace_buckets) { + lck_mtx_lock_spin(mleak_lock); + if (allocation->element == addr && + allocation->trace_index < mleak_trace_buckets) { + struct mtrace *trace; + trace = &mleak_traces[allocation->trace_index]; + /* allocs = 0 means trace bucket is unused */ + if (trace->allocs > 0) + trace->allocs--; + if (trace->allocs == 0) + trace->depth = 0; + /* NULL element means alloc bucket is unused */ + allocation->element = NULL; + mleak_table.outstanding_allocs--; + } + lck_mtx_unlock(mleak_lock); + } + addr = addr->obj_next; + } +} + +static struct mbtypes { + int mt_type; + const char *mt_name; +} mbtypes[] = { + { MT_DATA, "data" }, + { MT_OOBDATA, "oob data" }, + { MT_CONTROL, "ancillary data" }, + { MT_HEADER, "packet headers" }, + { MT_SOCKET, "socket structures" }, + { MT_PCB, "protocol control blocks" }, + { MT_RTABLE, "routing table entries" }, + { MT_HTABLE, "IMP host table entries" }, + { MT_ATABLE, "address resolution tables" }, + { MT_FTABLE, "fragment reassembly queue headers" }, + { MT_SONAME, "socket names and addresses" }, + { MT_SOOPTS, "socket options" }, + { MT_RIGHTS, "access rights" }, + { MT_IFADDR, "interface addresses" }, + { MT_TAG, "packet tags" }, + { 0, NULL } +}; + +#define MBUF_DUMP_BUF_CHK() { \ + clen -= k; \ + if (clen < 1) \ + goto done; \ + c += k; \ +} + +static char * +mbuf_dump(void) +{ + unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct; + u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0; + u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0; + u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0; + int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short); + uint8_t seen[256]; + struct mbtypes *mp; + mb_class_stat_t *sp; + char *c = mbuf_dump_buf; + int i, k, clen = sizeof (mbuf_dump_buf); + + mbuf_dump_buf[0] = '\0'; + + /* synchronize all statistics in the mbuf table */ + mbuf_stat_sync(); + mbuf_mtypes_sync(TRUE); + + sp = &mb_stat->mbs_class[0]; + for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) { + u_int32_t mem; + + if (m_class(i) == MC_MBUF) { + m_mbufs = sp->mbcl_active; + } else if (m_class(i) == MC_CL) { + m_clfree = sp->mbcl_total - sp->mbcl_active; + } else if (m_class(i) == MC_BIGCL) { + m_bigclfree = sp->mbcl_total - sp->mbcl_active; + } else if (njcl > 0 && m_class(i) == MC_16KCL) { + m_16kclfree = sp->mbcl_total - sp->mbcl_active; + m_16kclusters = sp->mbcl_total; + } else if (m_class(i) == MC_MBUF_CL) { + m_mbufclfree = sp->mbcl_total - sp->mbcl_active; + }
else if (m_class(i) == MC_MBUF_BIGCL) { + m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active; + } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) { + m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active; + } + + mem = sp->mbcl_ctotal * sp->mbcl_size; + totmem += mem; + totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) * + sp->mbcl_size; + + } + + /* adjust free counts to include composite caches */ + m_clfree += m_mbufclfree; + m_bigclfree += m_mbufbigclfree; + m_16kclfree += m_mbuf16kclfree; + + totmbufs = 0; + for (mp = mbtypes; mp->mt_name != NULL; mp++) + totmbufs += mbstat.m_mtypes[mp->mt_type]; + if (totmbufs > m_mbufs) + totmbufs = m_mbufs; + k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs); + MBUF_DUMP_BUF_CHK(); + + bzero(&seen, sizeof (seen)); + for (mp = mbtypes; mp->mt_name != NULL; mp++) { + if (mbstat.m_mtypes[mp->mt_type] != 0) { + seen[mp->mt_type] = 1; + k = snprintf(c, clen, "\t%u mbufs allocated to %s\n", + mbstat.m_mtypes[mp->mt_type], mp->mt_name); + MBUF_DUMP_BUF_CHK(); + } + } + seen[MT_FREE] = 1; + for (i = 0; i < nmbtypes; i++) + if (!seen[i] && mbstat.m_mtypes[i] != 0) { + k = snprintf(c, clen, "\t%u mbufs allocated to " + "<mbuf type %d>\n", mbstat.m_mtypes[i], i); + MBUF_DUMP_BUF_CHK(); + } + if ((m_mbufs - totmbufs) > 0) { + k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n", + m_mbufs - totmbufs); + MBUF_DUMP_BUF_CHK(); + } + k = snprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n" + "%u/%u mbuf 4KB clusters in use\n", + (unsigned int)(mbstat.m_clusters - m_clfree), + (unsigned int)mbstat.m_clusters, + (unsigned int)(mbstat.m_bigclusters - m_bigclfree), + (unsigned int)mbstat.m_bigclusters); + MBUF_DUMP_BUF_CHK(); + + if (njcl > 0) { + k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n", + m_16kclusters - m_16kclfree, m_16kclusters, + njclbytes / 1024); + MBUF_DUMP_BUF_CHK(); + } + totused = totmem - totfree; + if (totmem == 0) { + totpct = 0; + } else if (totused < (ULONG_MAX / 100)) { + totpct = (totused * 100) / totmem; + } else { + u_long totmem1 = totmem / 100; + u_long totused1 = totused / 100; + totpct = (totused1 * 100) / totmem1; + } + k = snprintf(c, clen, "%lu KB allocated to network (approx. 
%lu%% " + "in use)\n", totmem / 1024, totpct); + MBUF_DUMP_BUF_CHK(); + +done: + return (mbuf_dump_buf); +} + +#undef MBUF_DUMP_BUF_CHK + SYSCTL_DECL(_kern_ipc); -SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RD | CTLFLAG_LOCKED, +SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, + CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, mbstat_sysctl, "S,mbstat", ""); -SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat, CTLFLAG_RD | CTLFLAG_LOCKED, +SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat, + CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, mb_stat_sysctl, "S,mb_stat", ""); -SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized, CTLFLAG_RD | CTLFLAG_LOCKED, - &mb_normalized, 0, ""); +SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace, + CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", ""); +SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table, + CTLFLAG_RD | CTLFLAG_LOCKED, + 0, 0, mleak_table_sysctl, "S,mleak_table", ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor, + CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized, + CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog, + CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, ""); diff --git a/bsd/kern/uipc_mbuf2.c b/bsd/kern/uipc_mbuf2.c index 5276ce659..386238460 100644 --- a/bsd/kern/uipc_mbuf2.c +++ b/bsd/kern/uipc_mbuf2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -105,11 +105,12 @@ #include <sys/proc_internal.h> #include <sys/malloc.h> #include <sys/mbuf.h> -#if defined(PULLDOWN_STAT) && defined(INET6) +#include <sys/mcache.h> +#if INET6 #include <netinet/in.h> #include <netinet/ip6.h> #include <netinet6/ip6_var.h> -#endif +#endif /* INET6 */ #if CONFIG_MACF_NET #include <security/mac_framework.h> @@ -131,7 +132,7 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp) struct mbuf *n, *o; int hlen, tlen, olen; int sharedcluster; -#if defined(PULLDOWN_STAT) && defined(INET6) +#if defined(PULLDOWN_STAT) && INET6 static struct mbuf *prev = NULL; int prevlen = 0, prevmlen = 0; #endif @@ -144,11 +145,11 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp) return NULL; /* impossible */ } -#if defined(PULLDOWN_STAT) && defined(INET6) +#if defined(PULLDOWN_STAT) && INET6 ip6stat.ip6s_pulldown++; #endif -#if defined(PULLDOWN_STAT) && defined(INET6) +#if defined(PULLDOWN_STAT) && INET6 /* statistics for m_pullup */ ip6stat.ip6s_pullup++; if (off + len > MHLEN) @@ -241,7 +242,7 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp) if ((off == 0 || offp) && len <= n->m_len - off) goto ok; -#if defined(PULLDOWN_STAT) && defined(INET6) +#if defined(PULLDOWN_STAT) && INET6 ip6stat.ip6s_pulldown_copy++; #endif @@ -321,7 +322,7 @@ m_pulldown(struct mbuf *m, int off, int len, int *offp) * now, we need to do the hard way. don't m_copy as there's no room * on both end. 
*/ -#if defined(PULLDOWN_STAT) && defined(INET6) +#if defined(PULLDOWN_STAT) && INET6 ip6stat.ip6s_pulldown_alloc++; #endif MGET(o, M_DONTWAIT, m->m_type); @@ -365,6 +366,67 @@ ok: return n; } +/* + * Create and return an m_tag, either by re-using space in a previous tag + * or by allocating a new mbuf/cluster. + */ +struct m_tag * +m_tag_create(u_int32_t id, u_int16_t type, int len, int wait, struct mbuf *buf) +{ + struct m_tag *t = NULL; + struct m_tag *p; + + if (len < 0) + return (NULL); + + if (len + sizeof (struct m_tag) + sizeof (struct m_taghdr) > MLEN) + return (m_tag_alloc(id, type, len, wait)); + + /* + * We've exhausted all external cases. Now, go through the m_tag + * chain and see if we can fit it in any of them. + * If not (t == NULL), call m_tag_alloc to store it in a new mbuf. + */ + p = SLIST_FIRST(&buf->m_pkthdr.tags); + while (p != NULL) { + /* Skip tags that live in their own 2KB cluster */ + if (M_TAG_ALIGN(p->m_tag_len) + + sizeof (struct m_taghdr) > MLEN) { + p = SLIST_NEXT(p, m_tag_link); + continue; + } + + VERIFY(p->m_tag_cookie == M_TAG_VALID_PATTERN); + + struct mbuf *m = m_dtom(p); + struct m_taghdr *hdr = (struct m_taghdr *)m->m_data; + + VERIFY(m->m_flags & M_TAGHDR && !(m->m_flags & M_EXT)); + + /* The mbuf can store this m_tag */ + if (M_TAG_ALIGN(len) <= MLEN - m->m_len) { + t = (struct m_tag *)(m->m_data + m->m_len); + hdr->refcnt++; + m->m_len += M_TAG_ALIGN(len); + VERIFY(m->m_len <= MLEN); + break; + } + + p = SLIST_NEXT(p, m_tag_link); + } + + if (t == NULL) + return (m_tag_alloc(id, type, len, wait)); + + t->m_tag_cookie = M_TAG_VALID_PATTERN; + t->m_tag_type = type; + t->m_tag_len = len; + t->m_tag_id = id; + if (len > 0) + bzero(t + 1, len); + return (t); +} + /* Get a packet tag structure along with specified data following. */ struct m_tag * m_tag_alloc(u_int32_t id, u_int16_t type, int len, int wait) @@ -372,26 +434,39 @@ m_tag_alloc(u_int32_t id, u_int16_t type, int len, int wait) struct m_tag *t; if (len < 0) - return NULL; -#if CONFIG_MBUF_TAGS_MALLOC - t = _MALLOC(len + sizeof (struct m_tag), M_TEMP, wait); -#else - if (len + sizeof(struct m_tag) <= MLEN) { + return (NULL); + + if (M_TAG_ALIGN(len) + sizeof (struct m_taghdr) <= MLEN) { struct mbuf *m = m_get(wait, MT_TAG); + struct m_taghdr *hdr; + if (m == NULL) - return NULL; - t = mtod(m, struct m_tag *); - } else if (len + sizeof(struct m_tag) <= MCLBYTES) { - t = (struct m_tag *) m_mclalloc(wait); - } else + return (NULL); + + m->m_flags |= M_TAGHDR; + + hdr = (struct m_taghdr *)m->m_data; + hdr->refcnt = 1; + m->m_len += sizeof (struct m_taghdr); + t = (struct m_tag *)(m->m_data + m->m_len); + m->m_len += M_TAG_ALIGN(len); + VERIFY(m->m_len <= MLEN); + } else if (len + sizeof (struct m_tag) <= MCLBYTES) { + t = (struct m_tag *)m_mclalloc(wait); + } else { t = NULL; -#endif + } + if (t == NULL) - return NULL; + return (NULL); + + t->m_tag_cookie = M_TAG_VALID_PATTERN; t->m_tag_type = type; t->m_tag_len = len; t->m_tag_id = id; - return t; + if (len > 0) + bzero(t + 1, len); + return (t); } @@ -405,25 +480,44 @@ m_tag_free(struct m_tag *t) t->m_tag_type == KERNEL_TAG_TYPE_MACLABEL) mac_mbuf_tag_destroy(t); #endif -#if CONFIG_MBUF_TAGS_MALLOC - _FREE(t, M_TEMP); -#else +#if INET6 + if (t != NULL && + t->m_tag_id == KERNEL_MODULE_TAG_ID && + t->m_tag_type == KERNEL_TAG_TYPE_INET6 && + t->m_tag_len == sizeof (struct ip6aux)) + ip6_destroyaux((struct ip6aux *)(t + 1)); +#endif /* INET6 */ if (t == NULL) return; - if (t->m_tag_len + sizeof(struct m_tag) <= MLEN) { + if (M_TAG_ALIGN(t->m_tag_len) + sizeof (struct m_taghdr) <=
MLEN) { struct mbuf * m = m_dtom(t); - m_free(m); + VERIFY(m->m_flags & M_TAGHDR); + struct m_taghdr *hdr = (struct m_taghdr *)m->m_data; + + /* No other tags in this mbuf */ + if(--hdr->refcnt == 0) { + m_free(m); + return; + } + + /* Pattern-fill the header */ + u_int64_t *fill_ptr = (u_int64_t *)t; + u_int64_t *end_ptr = (u_int64_t *)(t + 1); + while (fill_ptr < end_ptr) { + *fill_ptr = M_TAG_FREE_PATTERN; + fill_ptr++; + } } else { - MCLFREE((caddr_t)t); + m_mclfree((caddr_t)t); } -#endif } /* Prepend a packet tag. */ void m_tag_prepend(struct mbuf *m, struct m_tag *t) { - KASSERT(m && t, ("m_tag_prepend: null argument, m %p t %p", m, t)); + VERIFY(m != NULL && t != NULL); + SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link); } @@ -431,7 +525,9 @@ m_tag_prepend(struct mbuf *m, struct m_tag *t) void m_tag_unlink(struct mbuf *m, struct m_tag *t) { - KASSERT(m && t, ("m_tag_unlink: null argument, m %p t %p", m, t)); + VERIFY(m != NULL && t != NULL); + VERIFY(t->m_tag_cookie == M_TAG_VALID_PATTERN); + SLIST_REMOVE(&m->m_pkthdr.tags, t, m_tag, m_tag_link); } @@ -439,7 +535,8 @@ m_tag_unlink(struct mbuf *m, struct m_tag *t) void m_tag_delete(struct mbuf *m, struct m_tag *t) { - KASSERT(m && t, ("m_tag_delete: null argument, m %p t %p", m, t)); + VERIFY(m != NULL && t != NULL); + m_tag_unlink(m, t); m_tag_free(t); } @@ -450,15 +547,21 @@ m_tag_delete_chain(struct mbuf *m, struct m_tag *t) { struct m_tag *p, *q; - KASSERT(m, ("m_tag_delete_chain: null mbuf")); - if (t != NULL) + VERIFY(m != NULL); + + if (t != NULL) { p = t; - else + } else { p = SLIST_FIRST(&m->m_pkthdr.tags); + } if (p == NULL) return; - while ((q = SLIST_NEXT(p, m_tag_link)) != NULL) + + VERIFY(p->m_tag_cookie == M_TAG_VALID_PATTERN); + while ((q = SLIST_NEXT(p, m_tag_link)) != NULL) { + VERIFY(q->m_tag_cookie == M_TAG_VALID_PATTERN); m_tag_delete(m, q); + } m_tag_delete(m, p); } @@ -468,17 +571,21 @@ m_tag_locate(struct mbuf *m, u_int32_t id, u_int16_t type, struct m_tag *t) { struct m_tag *p; - KASSERT(m, ("m_tag_find: null mbuf")); - if (t == NULL) + VERIFY(m != NULL); + + if (t == NULL) { p = SLIST_FIRST(&m->m_pkthdr.tags); - else + } else { + VERIFY(t->m_tag_cookie == M_TAG_VALID_PATTERN); p = SLIST_NEXT(t, m_tag_link); + } while (p != NULL) { + VERIFY(p->m_tag_cookie == M_TAG_VALID_PATTERN); if (p->m_tag_id == id && p->m_tag_type == type) - return p; + return (p); p = SLIST_NEXT(p, m_tag_link); } - return NULL; + return (NULL); } /* Copy a single tag. 
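The tag hunks above replace the old one-mbuf-per-tag (or _MALLOC) scheme: small tags now share a single MT_TAG mbuf whose data area begins with a reference-counted struct m_taghdr, each tag is laid out on an M_TAG_ALIGN() boundary, and a freed slot is filled with M_TAG_FREE_PATTERN so a stale tag pointer trips the m_tag_cookie VERIFY checks. Below is a user-space sketch of just the fit arithmetic; the 8-byte rounding and the 224-byte MLEN stand-in are assumptions made for illustration, not values taken from this patch.

/*
 * Model of the shared-tag-mbuf arithmetic in m_tag_create()/m_tag_alloc().
 * MODEL_MLEN and the 8-byte alignment are assumed stand-ins, not kernel
 * definitions.
 */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define MODEL_MLEN	224u			/* stand-in for MLEN */
#define ALIGN8(x)	(((x) + 7u) & ~(size_t)7)

struct model_taghdr { uint64_t refcnt; };	/* models struct m_taghdr */
struct model_tag { uint64_t cookie; uint32_t id; uint16_t type; uint16_t len; };

/* models M_TAG_ALIGN(): rounded payload plus the tag header itself */
static size_t
tag_align(size_t len)
{
	return (ALIGN8(len) + sizeof (struct model_tag));
}

/* can a tag with 'len' payload bytes join an mbuf already holding 'used'? */
static int
fits(size_t len, size_t used)
{
	return (used + tag_align(len) <= MODEL_MLEN);
}

int
main(void)
{
	size_t used = sizeof (struct model_taghdr);	/* fresh tag mbuf */

	printf("24-byte tag fits: %d\n", fits(24, used));	/* prints 1 */
	used += tag_align(24);
	printf("500-byte tag fits: %d\n", fits(500, used));	/* prints 0 */
	return (0);
}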
*/ @@ -487,7 +594,8 @@ m_tag_copy(struct m_tag *t, int how) { struct m_tag *p; - KASSERT(t, ("m_tag_copy: null tag")); + VERIFY(t != NULL); + p = m_tag_alloc(t->m_tag_id, t->m_tag_type, t->m_tag_len, how); if (p == NULL) return (NULL); @@ -507,8 +615,16 @@ m_tag_copy(struct m_tag *t, int how) mac_mbuf_tag_copy(t, p); } else #endif +#if INET6 + if (t != NULL && + t->m_tag_id == KERNEL_MODULE_TAG_ID && + t->m_tag_type == KERNEL_TAG_TYPE_INET6 && + t->m_tag_len == sizeof (struct ip6aux)) { + ip6_copyaux((struct ip6aux *)(t + 1), (struct ip6aux *)(p + 1)); + } else +#endif /* INET6 */ bcopy(t + 1, p + 1, t->m_tag_len); /* Copy the data */ - return p; + return (p); } /* @@ -522,29 +638,32 @@ m_tag_copy_chain(struct mbuf *to, struct mbuf *from, int how) { struct m_tag *p, *t, *tprev = NULL; - KASSERT(to && from, - ("m_tag_copy: null argument, to %p from %p", to, from)); + VERIFY(to != NULL && from != NULL); + m_tag_delete_chain(to, NULL); SLIST_FOREACH(p, &from->m_pkthdr.tags, m_tag_link) { + VERIFY(p->m_tag_cookie == M_TAG_VALID_PATTERN); t = m_tag_copy(p, how); if (t == NULL) { m_tag_delete_chain(to, NULL); - return 0; + return (0); } - if (tprev == NULL) + if (tprev == NULL) { SLIST_INSERT_HEAD(&to->m_pkthdr.tags, t, m_tag_link); - else { + } else { SLIST_INSERT_AFTER(tprev, t, m_tag_link); tprev = t; } } - return 1; + return (1); } /* Initialize tags on an mbuf. */ void m_tag_init(struct mbuf *m) { + VERIFY(m != NULL); + SLIST_INIT(&m->m_pkthdr.tags); #if PF_PKTHDR bzero(&m->m_pkthdr.pf_mtag, sizeof (m->m_pkthdr.pf_mtag)); @@ -555,34 +674,25 @@ m_tag_init(struct mbuf *m) struct m_tag * m_tag_first(struct mbuf *m) { - return SLIST_FIRST(&m->m_pkthdr.tags); + VERIFY(m != NULL); + + return (SLIST_FIRST(&m->m_pkthdr.tags)); } /* Get next tag in chain. */ struct m_tag * -m_tag_next(__unused struct mbuf *m, struct m_tag *t) +m_tag_next(struct mbuf *m, struct m_tag *t) { - return SLIST_NEXT(t, m_tag_link); -} - -void -m_prio_init(struct mbuf *m) -{ -#if !PKT_PRIORITY #pragma unused(m) -#else /* PKT_PRIORITY */ - if (m->m_flags & M_PKTHDR) - m->m_pkthdr.prio = MBUF_PRIORITY_NORMAL; -#endif /* PKT_PRIORITY */ + VERIFY(t != NULL); + VERIFY(t->m_tag_cookie == M_TAG_VALID_PATTERN); + + return (SLIST_NEXT(t, m_tag_link)); } void -m_prio_background(struct mbuf *m) +m_prio_init(struct mbuf *m) { -#if !PKT_PRIORITY -#pragma unused(m) -#else /* PKT_PRIORITY */ if (m->m_flags & M_PKTHDR) - m->m_pkthdr.prio = MBUF_PRIORITY_BACKGROUND; -#endif /* PKT_PRIORITY */ + m->m_pkthdr.prio = MBUF_TC_BE; } diff --git a/bsd/kern/uipc_socket.c b/bsd/kern/uipc_socket.c index 4b2c8a79b..b496895f6 100644 --- a/bsd/kern/uipc_socket.c +++ b/bsd/kern/uipc_socket.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2008 Apple Inc. All rights reserved. + * Copyright (c) 1998-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -96,18 +96,25 @@ #include <net/route.h> #include <netinet/in.h> #include <netinet/in_pcb.h> +#include <netinet/ip6.h> +#include <netinet6/ip6_var.h> #include <kern/zalloc.h> #include <kern/locks.h> #include <machine/limits.h> #include <libkern/OSAtomic.h> #include <pexpert/pexpert.h> #include <kern/assert.h> +#include <kern/task.h> + +#include <sys/mcache.h> #if CONFIG_MACF #include <security/mac.h> #include <security/mac_framework.h> #endif /* MAC */ +extern int in6_init_done; + int so_cache_hw = 0; int so_cache_timeouts = 0; int so_cache_max_freed = 0; @@ -170,15 +177,15 @@ MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); SYSCTL_DECL(_kern_ipc); int somaxconn = SOMAXCONN; -SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, ""); /* Should we get a maximum also ??? */ static int sosendmaxchain = 65536; static int sosendminchain = 16384; static int sorecvmincopy = 16384; -SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain, CTLFLAG_RW, &sosendminchain, +SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain, CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, ""); -SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW, &sorecvmincopy, +SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, ""); /* @@ -186,7 +193,7 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW, &sorecvmincopy, * the socket is marked with SOF_MULTIPAGES; see below. */ int sosendjcl = 1; -SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl, CTLFLAG_RW, &sosendjcl, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl, CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, ""); /* * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large @@ -200,9 +207,13 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl, CTLFLAG_RW, &sosendjcl, 0, ""); * capable. Set this to 1 only for testing/debugging purposes. */ int sosendjcl_ignore_capab = 0; -SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab, CTLFLAG_RW, +SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab, CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, ""); +int sodefunctlog = 0; +SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED, + &sodefunctlog, 0, ""); + /* * Socket operation routines. 
* These routines are called by the routines in @@ -223,7 +234,9 @@ extern struct protosw *pffindprotonotype(int, int); extern int soclose_locked(struct socket *); extern int soo_kqfilter(struct fileproc *, struct knote *, struct proc *); +#if CONFIG_EMBEDDED extern int uthread_get_background_state(uthread_t); +#endif /*CONFIG_EMBEDDED */ #ifdef __APPLE__ @@ -237,6 +250,9 @@ static void so_cache_timer(void *); void soclose_wait_locked(struct socket *so); int so_isdstlocal(struct socket *so); +__private_extern__ u_int32_t sotcdb = 0; +SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED, + &sotcdb, 0, ""); void socketinit(void) @@ -275,6 +291,7 @@ socketinit(void) get_inpcb_str_size() + 4 + get_tcp_str_size()); so_cache_zone = zinit(str_size, 120000*str_size, 8192, "socache zone"); + zone_change(so_cache_zone, Z_CALLERACCT, FALSE); zone_change(so_cache_zone, Z_NOENCRYPT, TRUE); #if TEMPDEBUG printf("cached_sock_alloc -- so_cache_zone size is %x\n", str_size); @@ -284,6 +301,10 @@ socketinit(void) so_cache_zone_element_size = str_size; sflt_init(); + + VERIFY(SO_TC_MAX == SO_TC_STATS_MAX); + + socket_tclass_init(); } static void @@ -398,6 +419,21 @@ cached_sock_free(struct socket *so) #endif } +static void +so_update_last_owner_locked( + struct socket *so, + proc_t self) +{ + if (self == NULL) + self = current_proc(); + + if (self) + { + so->last_upid = proc_uniqueid(self); + so->last_pid = proc_pid(self); + } +} + static void so_cache_timer(__unused void *dummy) { @@ -464,6 +500,7 @@ soalloc(int waitok, int dom, int type) return (NULL); } #endif /* MAC_SOCKET */ + so_update_last_owner_locked(so, NULL); } return (so); @@ -488,8 +525,10 @@ socreate(int dom, struct socket **aso, int type, int proto) register struct protosw *prp; register struct socket *so; register int error = 0; +#if CONFIG_EMBEDDED thread_t thread; struct uthread *ut; +#endif /* CONFIG_EMBEDDED */ #if TCPDEBUG extern int tcpconsdebug; @@ -521,6 +560,7 @@ socreate(int dom, struct socket **aso, int type, int proto) so->so_type = type; so->so_uid = kauth_cred_getuid(kauth_cred_get()); + so->so_gid = kauth_cred_getgid(kauth_cred_get()); if (!suser(kauth_cred_get(), NULL)) so->so_state = SS_PRIV; @@ -566,22 +606,42 @@ socreate(int dom, struct socket **aso, int type, int proto) so->so_options |= SO_DEBUG; #endif #endif + so_set_default_traffic_class(so); /* * If this is a background thread/task, mark the socket as such. */ +#if !CONFIG_EMBEDDED + if (proc_get_self_isbackground() != 0) +#else /* !CONFIG_EMBEDDED */ thread = current_thread(); ut = get_bsdthread_info(thread); - if (uthread_get_background_state(ut)) { + if (uthread_get_background_state(ut)) +#endif /* !CONFIG_EMBEDDED */ + { socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND); - so->so_background_thread = thread; - /* - * In case setpriority(PRIO_DARWIN_THREAD) was called - * on this thread, regulate network (TCP) traffics. - */ - if (ut->uu_flag & UT_BACKGROUND_TRAFFIC_MGT) { - socket_set_traffic_mgt_flags(so, - TRAFFIC_MGT_SO_BG_REGULATE); - } + so->so_background_thread = current_thread(); + } + + switch (dom) { + /* + * Don't mark Unix domain sockets as eligible for defunct by default. + */ + case PF_LOCAL: + so->so_flags |= SOF_NODEFUNCT; + break; + /* + * Radar 9119053 + * Since v6 initialization is asynchronous and we can't hold + * up the main boot path, we need to at least hold off any + * sockets attempting to be created until the v6 stack is + * up and ready. 
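The PF_LOCAL arm above is why Unix domain sockets are never defuncted by default; the SO_DEFUNCTOK and SO_ISDEFUNCT socket options handled later in this patch expose the same SOF_NODEFUNCT flag to user space. A hypothetical probe of those knobs, assuming the constants are exported through <sys/socket.h> on a system carrying this change:

#include <stdio.h>
#include <sys/socket.h>

int
main(void)
{
	int s = socket(AF_INET, SOCK_STREAM, 0);
	int val = 0;
	socklen_t len = sizeof (val);

	if (s < 0) {
		perror("socket");
		return (1);
	}
#ifdef SO_ISDEFUNCT
	/* expect 0 on a freshly created socket */
	if (getsockopt(s, SOL_SOCKET, SO_ISDEFUNCT, &val, &len) == 0)
		printf("defunct: %d\n", val);
#endif
#ifdef SO_DEFUNCTOK
	/* opting out (optval == 0) is root-only per the sosetopt() hunk */
	val = 0;
	if (setsockopt(s, SOL_SOCKET, SO_DEFUNCTOK, &val, sizeof (val)) != 0)
		perror("SO_DEFUNCTOK");	/* EPERM for non-root */
#endif
	return (0);
}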
+ */ + case PF_INET6: + if (in6_init_done == 0) + ip6_fin(); + break; + default: + break; } *aso = so; @@ -615,40 +675,25 @@ sobind(struct socket *so, struct sockaddr *nam) { struct proc *p = current_proc(); int error = 0; - struct socket_filter_entry *filter; - int filtered = 0; socket_lock(so, 1); + + so_update_last_owner_locked(so, p); /* - * If this is a bind request on a previously-accepted socket - * that has been marked as inactive, reject it now before - * we go any further. + * If this is a bind request on a socket that has been marked + * as inactive, reject it now before we go any further. */ if (so->so_flags & SOF_DEFUNCT) { error = EINVAL; + SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", + __func__, proc_pid(p), so, INP_SOCKAF(so), INP_SOCKTYPE(so), + error)); goto out; } /* Socket filter */ - error = 0; - for (filter = so->so_filt; filter && (error == 0); - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_bind) { - if (filtered == 0) { - filtered = 1; - sflt_use(so); - socket_unlock(so, 0); - } - error = filter->sfe_filter->sf_filter. - sf_bind(filter->sfe_cookie, so, nam); - } - } - if (filtered != 0) { - socket_lock(so, 0); - sflt_unuse(so); - } - /* End socket filter */ + error = sflt_bind(so, nam); if (error == 0) error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p); @@ -664,6 +709,9 @@ out: void sodealloc(struct socket *so) { + /* Remove any filters */ + sflt_termsock(so); + so->so_gencnt = ++so_gencnt; #if CONFIG_MACF_SOCKET @@ -703,10 +751,11 @@ solisten(struct socket *so, int backlog) { struct proc *p = current_proc(); int error = 0; - struct socket_filter_entry *filter; - int filtered = 0; socket_lock(so, 1); + + so_update_last_owner_locked(so, p); + if (so->so_proto == NULL) { error = EINVAL; goto out; @@ -718,13 +767,18 @@ solisten(struct socket *so, int backlog) /* * If the listen request is made on a socket that is not fully - * disconnected, or on a previously-accepted socket that has - * been marked as inactive, reject the request now. + * disconnected, or on a socket that has been marked as inactive, + * reject the request now. */ if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) || (so->so_flags & SOF_DEFUNCT)) { error = EINVAL; + if (so->so_flags & SOF_DEFUNCT) { + SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", + __func__, proc_pid(p), so, INP_SOCKAF(so), + INP_SOCKTYPE(so), error)); + } goto out; } @@ -733,23 +787,7 @@ solisten(struct socket *so, int backlog) goto out; } - error = 0; - for (filter = so->so_filt; filter && (error == 0); - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_listen) { - if (filtered == 0) { - filtered = 1; - sflt_use(so); - socket_unlock(so, 0); - } - error = filter->sfe_filter->sf_filter. 
- sf_listen(filter->sfe_cookie, so); - } - } - if (filtered != 0) { - socket_lock(so, 0); - sflt_unuse(so); - } + error = sflt_listen(so); if (error == 0) { error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p); @@ -793,9 +831,6 @@ sofreelastref(struct socket *so, int dealloc) /* Assume socket is locked */ - /* Remove any filters - may be called more than once */ - sflt_termsock(so); - if ((!(so->so_flags & SOF_PCBCLEARING)) || ((so->so_state & SS_NOFDREF) == 0)) { #ifdef __APPLE__ @@ -1104,8 +1139,7 @@ int soacceptfilter(struct socket *so) { struct sockaddr *local = NULL, *remote = NULL; - struct socket_filter_entry *filter; - int error = 0, filtered = 0; + int error = 0; struct socket *head = so->so_head; /* @@ -1126,29 +1160,7 @@ soacceptfilter(struct socket *so) goto done; } - /* - * At this point, we have a reference on the listening socket - * so we know it won't be going away. Do the same for the newly - * accepted socket while we invoke the accept callback routine. - */ - for (filter = so->so_filt; filter != NULL && error == 0; - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_accept != NULL) { - if (!filtered) { - filtered = 1; - sflt_use(so); - socket_unlock(so, 0); - } - error = filter->sfe_filter->sf_filter. - sf_accept(filter->sfe_cookie, - head, so, local, remote); - } - } - - if (filtered) { - socket_lock(so, 0); - sflt_unuse(so); - } + error = sflt_accept(head, so, local, remote); /* * If we get EJUSTRETURN from one of the filters, mark this socket @@ -1157,10 +1169,8 @@ soacceptfilter(struct socket *so) */ if (error == EJUSTRETURN) { error = 0; - so->so_flags |= SOF_DEFUNCT; - /* Prevent data from being appended to the socket buffers */ - so->so_snd.sb_flags |= SB_DROP; - so->so_rcv.sb_flags |= SB_DROP; + (void) sosetdefunct(current_proc(), so, + SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE); } if (error != 0) { @@ -1207,14 +1217,22 @@ soconnectlock(struct socket *so, struct sockaddr *nam, int dolock) if (dolock) socket_lock(so, 1); + so_update_last_owner_locked(so, p); + /* * If this is a listening socket or if this is a previously-accepted * socket that has been marked as inactive, reject the connect request. */ if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) { + error = EOPNOTSUPP; + if (so->so_flags & SOF_DEFUNCT) { + SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", + __func__, proc_pid(p), so, INP_SOCKAF(so), + INP_SOCKTYPE(so), error)); + } if (dolock) socket_unlock(so, 1); - return (EOPNOTSUPP); + return (error); } if ((so->so_restrictions & SO_RESTRICT_DENYOUT) != 0) { @@ -1238,36 +1256,14 @@ soconnectlock(struct socket *so, struct sockaddr *nam, int dolock) * Run connect filter before calling protocol: * - non-blocking connect returns before completion; */ - struct socket_filter_entry *filter; - int filtered = 0; - - error = 0; - for (filter = so->so_filt; filter && (error == 0); - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_connect_out) { - if (filtered == 0) { - filtered = 1; - sflt_use(so); - socket_unlock(so, 0); - } - error = filter->sfe_filter->sf_filter. 
- sf_connect_out(filter->sfe_cookie, so, nam); - } - } - if (filtered != 0) { - socket_lock(so, 0); - sflt_unuse(so); - } + error = sflt_connectout(so, nam); if (error) { if (error == EJUSTRETURN) error = 0; - if (dolock) - socket_unlock(so, 1); - return (error); + } else { + error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p); } - - error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p); } if (dolock) socket_unlock(so, 1); @@ -1377,6 +1373,8 @@ restart: } else { error = sblock(&so->so_snd, SBLOCKWAIT(flags)); if (error) { + if (so->so_flags & SOF_DEFUNCT) + goto defunct; return (error); } *sblocked = 1; @@ -1384,12 +1382,17 @@ restart: } /* - * If a send attempt is made on a previously-accepted socket - * that has been marked as inactive (disconnected), reject - * the request. + * If a send attempt is made on a socket that has been marked + * as inactive (disconnected), reject the request. */ - if (so->so_flags & SOF_DEFUNCT) - return (ENOTCONN); + if (so->so_flags & SOF_DEFUNCT) { +defunct: + error = EPIPE; + SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", __func__, + proc_selfpid(), so, INP_SOCKAF(so), INP_SOCKTYPE(so), + error)); + return (error); + } if (so->so_state & SS_CANTSENDMORE) return (EPIPE); @@ -1423,8 +1426,11 @@ restart: return (EWOULDBLOCK); } sbunlock(&so->so_snd, 1); + *sblocked = 0; error = sbwait(&so->so_snd); if (error) { + if (so->so_flags & SOF_DEFUNCT) + goto defunct; return (error); } goto restart; @@ -1515,6 +1521,8 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat); socket_lock(so, 1); + so_update_last_owner_locked(so, p); + if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) { error = EOPNOTSUPP; socket_unlock(so, 1); @@ -1555,10 +1563,6 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 1024 : 0); do { - struct socket_filter_entry *filter; - int filtered; - boolean_t recursive; - if (uio == NULL) { /* * Data is prepackaged in "top". @@ -1611,7 +1615,8 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, * haven't yet consumed. */ if (freelist == NULL && - bytes_to_copy > NBPG && jumbocl) { + bytes_to_copy > MBIGCLBYTES && + jumbocl) { num_needed = bytes_to_copy / M16KCLBYTES; @@ -1634,10 +1639,10 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, if (freelist == NULL && bytes_to_copy > MCLBYTES) { num_needed = - bytes_to_copy / NBPG; + bytes_to_copy / MBIGCLBYTES; if ((bytes_to_copy - - (num_needed * NBPG)) >= + (num_needed * MBIGCLBYTES)) >= MINCLSIZE) num_needed++; @@ -1645,7 +1650,7 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, m_getpackets_internal( (unsigned int *)&num_needed, hdrs_needed, M_WAIT, 0, - NBPG); + MBIGCLBYTES); /* * Fall back to cluster size * if allocation failed @@ -1783,65 +1788,24 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, /* * Socket filter processing */ - recursive = (so->so_send_filt_thread != NULL); - filtered = 0; - error = 0; - for (filter = so->so_filt; filter && (error == 0); - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_data_out) { - int so_flags = 0; - if (filtered == 0) { - filtered = 1; - so->so_send_filt_thread = - current_thread(); - sflt_use(so); - socket_unlock(so, 0); - so_flags = - (sendflags & MSG_OOB) ? - sock_data_filt_flag_oob : 0; - } - error = filter->sfe_filter->sf_filter. 
- sf_data_out(filter->sfe_cookie, so, - addr, &top, &control, so_flags); + error = sflt_data_out(so, addr, &top, &control, + (sendflags & MSG_OOB) ? sock_data_filt_flag_oob : 0); + if (error) { + if (error == EJUSTRETURN) { + error = 0; + clen = 0; + control = 0; + top = 0; } - } - if (filtered) { - /* - * At this point, we've run at least one - * filter. The socket is unlocked as is - * the socket buffer. Clear the recorded - * filter thread only when we are outside - * of a filter's context. This allows for - * a filter to issue multiple inject calls - * from its sf_data_out callback routine. - */ - socket_lock(so, 0); - sflt_unuse(so); - if (!recursive) - so->so_send_filt_thread = 0; - if (error) { - if (error == EJUSTRETURN) { - error = 0; - clen = 0; - control = 0; - top = 0; - } - - goto release; - } + goto release; } /* * End Socket filter processing */ - if (error == EJUSTRETURN) { - /* A socket filter handled this data */ - error = 0; - } else { - error = (*so->so_proto->pr_usrreqs->pru_send) - (so, sendflags, top, addr, control, p); - } + error = (*so->so_proto->pr_usrreqs->pru_send) + (so, sendflags, top, addr, control, p); #ifdef __APPLE__ if (flags & MSG_SEND) so->so_temp = NULL; @@ -1935,6 +1899,7 @@ soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat); socket_lock(so, 1); + so_update_last_owner_locked(so, p); #ifdef MORE_LOCKING_DEBUG if (so->so_usecount == 1) @@ -1958,14 +1923,18 @@ soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, if (so->so_flags & SOF_DEFUNCT) { struct sockbuf *sb = &so->so_rcv; + error = ENOTCONN; + SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", __func__, + proc_pid(p), so, INP_SOCKAF(so), INP_SOCKTYPE(so), error)); /* * This socket should have been disconnected and flushed - * prior to being returned from accept; there should be - * no data on its receive list, so panic otherwise. + * prior to being returned from sodefunct(); there should + * be no data on its receive list, so panic otherwise. */ - sb_empty_assert(sb, __func__); + if (so->so_state & SS_DEFUNCT) + sb_empty_assert(sb, __func__); socket_unlock(so, 1); - return (ENOTCONN); + return (error); } /* @@ -2197,6 +2166,14 @@ dontblock: goto restart; } socket_lock(so, 0); + /* + * If the socket has been defunct'd, drop it. + */ + if (so->so_flags & SOF_DEFUNCT) { + m_freem(m); + error = ENOTCONN; + goto release; + } /* * Re-adjust the socket receive list and re-enqueue * the record in front of any packets which may have @@ -2253,6 +2230,7 @@ dontblock: struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; struct sockbuf *sb_rcv = &so->so_rcv; + struct mbuf **msgpcm = NULL; /* * Externalizing the control messages would require us to @@ -2265,7 +2243,23 @@ dontblock: do { if (flags & MSG_PEEK) { if (controlp != NULL) { + if (*controlp == NULL) { + msgpcm = controlp; + } *controlp = m_copy(m, 0, m->m_len); + + /* If we failed to allocate an mbuf, + * release any previously allocated + * mbufs for control data. Return + * an error. Keep the mbufs in the + * socket as this is using + * MSG_PEEK flag. 
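The MSG_PEEK path here keeps msgpcm pointing at the first copied control mbuf so that a failed m_copy() can unwind the partial chain in one m_freem() instead of leaking it. The same unwind shape on a plain singly linked list, as a self-contained sketch for illustration only:

#include <stdlib.h>
#include <errno.h>

struct node { struct node *next; int payload; };

/*
 * Duplicate 'src' into '*out'; on allocation failure free the partial
 * copy (as the kernel frees *msgpcm) and report ENOBUFS.
 */
static int
dup_chain(const struct node *src, struct node **out)
{
	struct node **tail = out;

	*out = NULL;
	for (; src != NULL; src = src->next) {
		struct node *n = malloc(sizeof (*n));

		if (n == NULL) {
			while (*out != NULL) {	/* unwind the partial copy */
				struct node *d = *out;
				*out = d->next;
				free(d);
			}
			return (ENOBUFS);
		}
		n->payload = src->payload;
		n->next = NULL;
		*tail = n;
		tail = &n->next;
	}
	return (0);
}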
+ */ + if (*controlp == NULL) { + m_freem(*msgpcm); + error = ENOBUFS; + goto release; + } controlp = &(*controlp)->m_next; } m = m->m_next; @@ -2499,8 +2493,25 @@ dontblock: if (flags & MSG_PEEK) { moff += len; } else { - if (mp) - *mp = m_copym(m, 0, len, M_WAIT); + if (mp != NULL) { + int copy_flag; + + if (flags & MSG_DONTWAIT) + copy_flag = M_DONTWAIT; + else + copy_flag = M_WAIT; + *mp = m_copym(m, 0, len, copy_flag); + if (*mp == NULL) { + /* + * Failed to allocate an mbuf. + * Adjust uio_resid back, it was + * adjusted down by len bytes which + * we didn't copy over + */ + uio_setresid(uio, (uio_resid(uio) + len)); + break; + } + } m->m_data += len; m->m_len -= len; so->so_rcv.sb_cc -= len; @@ -2959,13 +2970,13 @@ sosetopt(struct socket *so, struct sockopt *sopt) int error, optval; struct linger l; struct timeval tv; - struct socket_filter_entry *filter; - int filtered = 0; #if CONFIG_MACF_SOCKET struct mac extmac; #endif /* MAC_SOCKET */ socket_lock(so, 1); + so_update_last_owner_locked(so, NULL); + if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) == (SS_CANTRCVMORE | SS_CANTSENDMORE) && (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) { @@ -2978,29 +2989,11 @@ sosetopt(struct socket *so, struct sockopt *sopt) sopt->sopt_dir = SOPT_SET; } - error = 0; - for (filter = so->so_filt; filter && (error == 0); - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_setoption) { - if (filtered == 0) { - filtered = 1; - sflt_use(so); - socket_unlock(so, 0); - } - error = filter->sfe_filter->sf_filter. - sf_setoption(filter->sfe_cookie, so, sopt); - } - } - - if (filtered != 0) { - socket_lock(so, 0); - sflt_unuse(so); - - if (error) { - if (error == EJUSTRETURN) - error = 0; - goto bad; - } + error = sflt_setsockopt(so, sopt); + if (error) { + if (error == EJUSTRETURN) + error = 0; + goto bad; } error = 0; @@ -3036,6 +3029,7 @@ sosetopt(struct socket *so, struct sockopt *sopt) case SO_REUSEPORT: case SO_OOBINLINE: case SO_TIMESTAMP: + case SO_TIMESTAMP_MONOTONIC: #ifdef __APPLE__ case SO_DONTTRUNC: case SO_WANTMORE: @@ -3126,8 +3120,7 @@ sosetopt(struct socket *so, struct sockopt *sopt) if (error) goto bad; - error = sflt_attach_private(so, NULL, - nke.nke_handle, 1); + error = sflt_attach_internal(so, nke.nke_handle); break; } @@ -3253,19 +3246,76 @@ sosetopt(struct socket *so, struct sockopt *sopt) break; } -#if PKT_PRIORITY case SO_TRAFFIC_CLASS: { error = sooptcopyin(sopt, &optval, sizeof (optval), sizeof (optval)); if (error) goto bad; - if (optval < SO_TC_BE || optval > SO_TC_VO) { - error = EINVAL; + error = so_set_traffic_class(so, optval); + if (error) goto bad; - } - so->so_traffic_class = optval; + break; } -#endif /* PKT_PRIORITY */ + + case SO_RECV_TRAFFIC_CLASS: { + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + if (error) + goto bad; + if (optval == 0) + so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS; + else + so->so_flags |= SOF_RECV_TRAFFIC_CLASS; + break; + } + + case SO_TRAFFIC_CLASS_DBG: { + struct so_tcdbg so_tcdbg; + + error = sooptcopyin(sopt, &so_tcdbg, sizeof (struct so_tcdbg), + sizeof (struct so_tcdbg)); + if (error) + goto bad; + error = so_set_tcdbg(so, &so_tcdbg); + if (error) + goto bad; + break; + } + + case SO_DEFUNCTOK: + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + if (error != 0 || (so->so_flags & SOF_DEFUNCT)) { + if (error == 0) + error = EBADF; + goto bad; + } + /* + * Any process can set SO_DEFUNCTOK (clear + * SOF_NODEFUNCT), but only root can clear + * SO_DEFUNCTOK (set 
SOF_NODEFUNCT). + */ + if (optval == 0 && + kauth_cred_issuser(kauth_cred_get()) == 0) { + error = EPERM; + goto bad; + } + if (optval) + so->so_flags &= ~SOF_NODEFUNCT; + else + so->so_flags |= SOF_NODEFUNCT; + + SODEFUNCTLOG(("%s[%d]: so %p [%d,%d] is now marked as " + "%seligible for defunct\n", __func__, + proc_selfpid(), so, INP_SOCKAF(so), + INP_SOCKTYPE(so), + (so->so_flags & SOF_NODEFUNCT) ? "not " : "")); + break; + + case SO_ISDEFUNCT: + /* This option is not settable */ + error = EINVAL; + break; default: error = ENOPROTOOPT; @@ -3355,8 +3405,6 @@ sogetopt(struct socket *so, struct sockopt *sopt) int error, optval; struct linger l; struct timeval tv; - struct socket_filter_entry *filter; - int filtered = 0; #if CONFIG_MACF_SOCKET struct mac extmac; #endif /* MAC_SOCKET */ @@ -3366,32 +3414,16 @@ sogetopt(struct socket *so, struct sockopt *sopt) } socket_lock(so, 1); + so_update_last_owner_locked(so, NULL); - error = 0; - for (filter = so->so_filt; filter && (error == 0); - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_getoption) { - if (filtered == 0) { - filtered = 1; - sflt_use(so); - socket_unlock(so, 0); - } - error = filter->sfe_filter->sf_filter. - sf_getoption(filter->sfe_cookie, so, sopt); - } - } - if (filtered != 0) { - socket_lock(so, 0); - sflt_unuse(so); - - if (error) { - if (error == EJUSTRETURN) - error = 0; - socket_unlock(so, 1); - return (error); - } + error = sflt_getsockopt(so, sopt); + if (error) { + if (error == EJUSTRETURN) + error = 0; + socket_unlock(so, 1); + return (error); } - + error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto && so->so_proto->pr_ctloutput) { @@ -3421,6 +3453,7 @@ sogetopt(struct socket *so, struct sockopt *sopt) case SO_BROADCAST: case SO_OOBINLINE: case SO_TIMESTAMP: + case SO_TIMESTAMP_MONOTONIC: #ifdef __APPLE__ case SO_DONTTRUNC: case SO_WANTMORE: @@ -3556,11 +3589,29 @@ integer: error = sooptcopyout(sopt, &sonpx, sizeof(struct so_np_extensions)); break; } -#if PKT_PRIORITY + case SO_TRAFFIC_CLASS: optval = so->so_traffic_class; goto integer; -#endif /* PKT_PRIORITY */ + + case SO_RECV_TRAFFIC_CLASS: + optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS); + goto integer; + + case SO_TRAFFIC_CLASS_STATS: + error = sooptcopyout(sopt, &so->so_tc_stats, sizeof(so->so_tc_stats)); + break; + + case SO_TRAFFIC_CLASS_DBG: + error = sogetopt_tcdbg(so, sopt); + break; + + case SO_DEFUNCTOK: + optval = !(so->so_flags & SOF_NODEFUNCT); + goto integer; + + case SO_ISDEFUNCT: + optval = (so->so_flags & SOF_DEFUNCT); + goto integer; default: error = ENOPROTOOPT; @@ -3570,8 +3621,10 @@ integer: return (error); } } - -/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */ +/* The size limit on our soopt_getm is different from that on FreeBSD. + * We limit the size of options to MCLBYTES. This will have to change + * if we need to define options that need more space than MCLBYTES. + */ int soopt_getm(struct sockopt *sopt, struct mbuf **mp) { @@ -3579,7 +3632,7 @@ soopt_getm(struct sockopt *sopt, struct mbuf **mp) int sopt_size = sopt->sopt_valsize; int how; - if (sopt_size > MAX_SOOPTGETM_SIZE) + if (sopt_size <= 0 || sopt_size > MCLBYTES) return (EMSGSIZE); how = sopt->sopt_p != kernproc ?
M_WAIT : M_DONTWAIT; @@ -3600,7 +3653,7 @@ soopt_getm(struct sockopt *sopt, struct mbuf **mp) *mp = m; m_prev = m; - while (sopt_size) { + while (sopt_size > 0) { MGET(m, how, MT_DATA); if (m == 0) { m_freem(*mp); @@ -3610,6 +3663,7 @@ soopt_getm(struct sockopt *sopt, struct mbuf **mp) MCLGET(m, how); if ((m->m_flags & M_EXT) == 0) { m_freem(*mp); + m_freem(m); return (ENOBUFS); } m->m_len = min(MCLBYTES, sopt_size); @@ -3623,7 +3677,7 @@ soopt_getm(struct sockopt *sopt, struct mbuf **mp) return (0); } -/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */ +/* copyin sopt data into mbuf chain */ int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) { @@ -3654,7 +3708,7 @@ soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) return (0); } -/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */ +/* copyout mbuf chain data into soopt */ int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) { @@ -3709,6 +3763,7 @@ sopoll(struct socket *so, int events, __unused kauth_cred_t cred, void * wql) int revents = 0; socket_lock(so, 1); + so_update_last_owner_locked(so, p); if (events & (POLLIN | POLLRDNORM)) if (soreadable(so)) @@ -3863,12 +3918,19 @@ filt_soread(struct knote *kn, long hint) return (1); } + int64_t lowwat = so->so_rcv.sb_lowat; + if (kn->kn_sfflags & NOTE_LOWAT) + { + if (kn->kn_sdata > so->so_rcv.sb_hiwat) + lowwat = so->so_rcv.sb_hiwat; + else if (kn->kn_sdata > lowwat) + lowwat = kn->kn_sdata; + } + if ((hint & SO_FILT_HINT_LOCKED) == 0) socket_unlock(so, 1); - - return ((kn->kn_flags & EV_OOBAND) || - kn->kn_data >= ((kn->kn_sfflags & NOTE_LOWAT) ? - kn->kn_sdata : so->so_rcv.sb_lowat)); + + return ((kn->kn_flags & EV_OOBAND) || kn->kn_data >= lowwat); } static void @@ -3911,14 +3973,20 @@ filt_sowrite(struct knote *kn, long hint) socket_unlock(so, 1); return (0); } + int64_t lowwat = so->so_snd.sb_lowat; + if (kn->kn_sfflags & NOTE_LOWAT) + { + if (kn->kn_sdata > so->so_snd.sb_hiwat) + lowwat = so->so_snd.sb_hiwat; + else if (kn->kn_sdata > lowwat) + lowwat = kn->kn_sdata; + } if ((hint & SO_FILT_HINT_LOCKED) == 0) socket_unlock(so, 1); - if (kn->kn_sfflags & NOTE_LOWAT) - return (kn->kn_data >= kn->kn_sdata); - return (kn->kn_data >= so->so_snd.sb_lowat); + return (kn->kn_data >= lowwat); } -#define SO_LOCK_HISTORY_STR_LEN (2 * SO_LCKDBG_MAX * (2 + sizeof(void *) + 1) + 1) +#define SO_LOCK_HISTORY_STR_LEN (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof(void *)) + 1) + 1) __private_extern__ const char * solockhistory_nr(struct socket *so) { @@ -3926,6 +3994,7 @@ __private_extern__ const char * solockhistory_nr(struct socket *so) int i; static char lock_history_str[SO_LOCK_HISTORY_STR_LEN]; + bzero(lock_history_str, sizeof(lock_history_str)); for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) { n += snprintf(lock_history_str + n, SO_LOCK_HISTORY_STR_LEN - n, "%lx:%lx ", (uintptr_t) so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX], @@ -4054,3 +4123,107 @@ so_isdstlocal(struct socket *so) { } return 0; } + +int +sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce) +{ + int err = 0, defunct; + + defunct = (so->so_flags & SOF_DEFUNCT); + if (defunct) { + if (!(so->so_snd.sb_flags & so->so_rcv.sb_flags & SB_DROP)) + panic("%s: SB_DROP not set", __func__); + goto done; + } + + if (so->so_flags & SOF_NODEFUNCT) { + if (noforce) { + err = EOPNOTSUPP; + SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p " + "[%d,%d] is not eligible for defunct (%d)\n", + __func__, proc_selfpid(), proc_pid(p), level, so, + INP_SOCKAF(so), 
INP_SOCKTYPE(so), err)); + return (err); + } + so->so_flags &= ~SOF_NODEFUNCT; + SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p [%d,%d] " + "defunct by force\n", __func__, proc_selfpid(), proc_pid(p), + level, so, INP_SOCKAF(so), INP_SOCKTYPE(so))); + } + + so->so_flags |= SOF_DEFUNCT; + /* Prevent further data from being appended to the socket buffers */ + so->so_snd.sb_flags |= SB_DROP; + so->so_rcv.sb_flags |= SB_DROP; + +done: + SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p [%d,%d] %s " + "defunct\n", __func__, proc_selfpid(), proc_pid(p), level, so, + INP_SOCKAF(so), INP_SOCKTYPE(so), + defunct ? "is already" : "marked as")); + + return (err); +} + +int +sodefunct(struct proc *p, struct socket *so, int level) +{ + struct sockbuf *rcv, *snd; + + if (!(so->so_flags & SOF_DEFUNCT)) + panic("%s improperly called", __func__); + + if (so->so_state & SS_DEFUNCT) + goto done; + + rcv = &so->so_rcv; + snd = &so->so_snd; + + SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p [%d,%d] is now " + "defunct [rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", + __func__, proc_selfpid(), proc_pid(p), level, so, + INP_SOCKAF(so), INP_SOCKTYPE(so), + (uint32_t)rcv->sb_sel.si_flags, (uint32_t)snd->sb_sel.si_flags, + (uint16_t)rcv->sb_flags, (uint16_t)snd->sb_flags)); + + /* + * Unwedge threads blocked on sbwait() and sb_lock(). + */ + sbwakeup(rcv); + sbwakeup(snd); + + if (rcv->sb_flags & SB_LOCK) + sbunlock(rcv, 1); + if (snd->sb_flags & SB_LOCK) + sbunlock(snd, 1); + + /* + * Flush the buffers and disconnect. We explicitly call shutdown + * on both data directions to ensure that SS_CANT{RCV,SEND}MORE + * states are set for the socket. This would also flush out data + * hanging off the receive list of this socket. + */ + (void) soshutdownlock(so, SHUT_RD); + (void) soshutdownlock(so, SHUT_WR); + (void) sodisconnectlocked(so); + + /* + * Explicitly handle connectionless-protocol disconnection + * and release any remaining data in the socket buffers. + */ + if (!(so->so_flags & SS_ISDISCONNECTED)) + (void) soisdisconnected(so); + + if (so->so_error == 0) + so->so_error = EBADF; + + if (rcv->sb_cc != 0) + sbrelease(rcv); + if (snd->sb_cc != 0) + sbrelease(snd); + + so->so_state |= SS_DEFUNCT; + +done: + return (0); +} diff --git a/bsd/kern/uipc_socket2.c b/bsd/kern/uipc_socket2.c index a6b2af000..4b71dd80c 100644 --- a/bsd/kern/uipc_socket2.c +++ b/bsd/kern/uipc_socket2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2010 Apple Inc. All rights reserved. + * Copyright (c) 1998-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -222,6 +222,20 @@ soisdisconnected(struct socket *so) sorwakeup(so); } +/* This function will issue a wakeup like soisdisconnected but it will not + * notify the socket filters. This will avoid unlocking the socket + * in the midst of closing it. + */ +void +sodisconnectwakeup(struct socket *so) +{ + so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); + so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED); + wakeup((caddr_t)&so->so_timeo); + sowwakeup(so); + sorwakeup(so); +} + /* * When an attempt at a new connection is noted on a socket * which accepts connections, sonewconn is called. 
If the @@ -276,7 +290,6 @@ sonewconn_internal(struct socket *head, int connstatus) return ((struct socket *)0); } - so->so_head = head; so->so_type = head->so_type; so->so_options = head->so_options &~ SO_ACCEPTCONN; so->so_linger = head->so_linger; @@ -285,13 +298,15 @@ sonewconn_internal(struct socket *head, int connstatus) so->so_timeo = head->so_timeo; so->so_pgid = head->so_pgid; so->so_uid = head->so_uid; + so->so_gid = head->so_gid; /* inherit socket options stored in so_flags */ so->so_flags = head->so_flags & (SOF_NOSIGPIPE | SOF_NOADDRAVAIL | SOF_REUSESHAREUID | SOF_NOTIFYCONFLICT | SOF_BINDRANDOMPORT | - SOF_NPX_SETOPTSHUT); + SOF_NPX_SETOPTSHUT | + SOF_NODEFUNCT); so->so_usecount = 1; so->next_lock_lr = 0; so->next_unlock_lr = 0; @@ -307,15 +322,11 @@ sonewconn_internal(struct socket *head, int connstatus) #endif /* inherit traffic management properties of listener */ - so->so_traffic_mgt_flags = head->so_traffic_mgt_flags & - (TRAFFIC_MGT_SO_BACKGROUND | TRAFFIC_MGT_SO_BG_REGULATE); + so->so_traffic_mgt_flags = head->so_traffic_mgt_flags & (TRAFFIC_MGT_SO_BACKGROUND); so->so_background_thread = head->so_background_thread; -#if PKT_PRIORITY so->so_traffic_class = head->so_traffic_class; -#endif /* PKT_PRIORITY */ if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { - sflt_termsock(so); sodealloc(so); return ((struct socket *)0); } @@ -328,17 +339,36 @@ sonewconn_internal(struct socket *head, int connstatus) socket_unlock(head, 0); if (((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL) != 0) || error) { - sflt_termsock(so); sodealloc(so); if (head->so_proto->pr_unlock) socket_lock(head, 0); return ((struct socket *)0); } - if (head->so_proto->pr_unlock) + if (head->so_proto->pr_unlock) { socket_lock(head, 0); + /* Radar 7385998 Recheck that the head is still accepting + * to avoid race condition when head is getting closed. + */ + if ((head->so_options & SO_ACCEPTCONN) == 0) { + so->so_state &= ~SS_NOFDREF; + soclose(so); + return ((struct socket *)0); + } + } + #ifdef __APPLE__ so->so_proto->pr_domain->dom_refs++; #endif + /* Insert in head appropriate lists */ + so->so_head = head; + + /* Since this socket is going to be inserted into the incomp + * queue, it can be picked up by another thread in + * tcp_dropdropablreq to get dropped before it is setup.. + * To prevent this race, set in-progress flag which can be + * cleared later + */ + so->so_flags |= SOF_INCOMP_INPROGRESS; if (connstatus) { TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); @@ -367,27 +397,7 @@ sonewconn_internal(struct socket *head, int connstatus) struct socket * sonewconn(struct socket *head, int connstatus, const struct sockaddr *from) { - int error = 0; - struct socket_filter_entry *filter; - int filtered = 0; - - for (filter = head->so_filt; filter && (error == 0); - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_connect_in) { - if (filtered == 0) { - filtered = 1; - sflt_use(head); - socket_unlock(head, 0); - } - error = filter->sfe_filter->sf_filter. 
- sf_connect_in(filter->sfe_cookie, head, from); - } - } - if (filtered != 0) { - socket_lock(head, 0); - sflt_unuse(head); - } - + int error = sflt_connectin(head, from); if (error) { return (NULL); } @@ -443,6 +453,7 @@ sbwait(struct sockbuf *sb) mutex_held = (*so->so_proto->pr_getlock)(so, 0); else mutex_held = so->so_proto->pr_domain->dom_mtx; + lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); sb->sb_flags |= SB_WAIT; @@ -458,8 +469,13 @@ sbwait(struct sockbuf *sb) if (so->so_usecount < 1) panic("sbwait: so=%p refcount=%d\n", so, so->so_usecount); - if ((so->so_state & SS_DRAINING)) { + if ((so->so_state & SS_DRAINING) || (so->so_flags & SOF_DEFUNCT)) { error = EBADF; + if (so->so_flags & SOF_DEFUNCT) { + SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", + __func__, proc_selfpid(), so, INP_SOCKAF(so), + INP_SOCKTYPE(so), error)); + } } return (error); @@ -484,10 +500,13 @@ sb_lock(struct sockbuf *sb) while (sb->sb_flags & SB_LOCK) { sb->sb_flags |= SB_WANT; + if (so->so_proto->pr_getlock != NULL) mutex_held = (*so->so_proto->pr_getlock)(so, 0); else mutex_held = so->so_proto->pr_domain->dom_mtx; + lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); + if (so->so_usecount < 1) panic("sb_lock: so=%p refcount=%d\n", so, so->so_usecount); @@ -498,6 +517,14 @@ sb_lock(struct sockbuf *sb) if (so->so_usecount < 1) panic("sb_lock: 2 so=%p refcount=%d\n", so, so->so_usecount); + + if (error == 0 && (so->so_flags & SOF_DEFUNCT)) { + error = EBADF; + SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", + __func__, proc_selfpid(), so, INP_SOCKAF(so), + INP_SOCKTYPE(so), error)); + } + if (error) return (error); } @@ -505,6 +532,15 @@ sb_lock(struct sockbuf *sb) return (0); } +void +sbwakeup(struct sockbuf *sb) +{ + if (sb->sb_flags & SB_WAIT) { + sb->sb_flags &= ~SB_WAIT; + wakeup((caddr_t)&sb->sb_cc); + } +} + /* * Wakeup processes waiting on a socket buffer. * Do asynchronous notification via SIGIO @@ -513,12 +549,17 @@ sb_lock(struct sockbuf *sb) void sowakeup(struct socket *so, struct sockbuf *sb) { + if (so->so_flags & SOF_DEFUNCT) { + SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] si 0x%x, " + "fl 0x%x [%s]\n", __func__, proc_selfpid(), so, + INP_SOCKAF(so), INP_SOCKTYPE(so), + (uint32_t)sb->sb_sel.si_flags, (uint16_t)sb->sb_flags, + (sb->sb_flags & SB_RECV) ? 
"rcv" : "snd")); + } + sb->sb_flags &= ~SB_SEL; selwakeup(&sb->sb_sel); - if (sb->sb_flags & SB_WAIT) { - sb->sb_flags &= ~SB_WAIT; - wakeup((caddr_t)&sb->sb_cc); - } + sbwakeup(sb); if (so->so_state & SS_ASYNC) { if (so->so_pgid < 0) gsignal(-so->so_pgid, SIGIO); @@ -685,7 +726,7 @@ sbappend(struct sockbuf *sb, struct mbuf *m) return (sbappendrecord(sb, m)); if (sb->sb_flags & SB_RECV) { - int error = sflt_data_in(so, NULL, &m, NULL, 0, NULL); + int error = sflt_data_in(so, NULL, &m, NULL, 0); SBLASTRECORDCHK(sb, "sbappend 2"); if (error != 0) { if (error != EJUSTRETURN) @@ -724,7 +765,7 @@ sbappendstream(struct sockbuf *sb, struct mbuf *m) } if (sb->sb_flags & SB_RECV) { - int error = sflt_data_in(so, NULL, &m, NULL, 0, NULL); + int error = sflt_data_in(so, NULL, &m, NULL, 0); SBLASTRECORDCHK(sb, "sbappendstream 1"); if (error != 0) { if (error != EJUSTRETURN) @@ -844,7 +885,7 @@ sbappendrecord(struct sockbuf *sb, struct mbuf *m0) if (sb->sb_flags & SB_RECV) { int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL, - sock_data_filt_flag_record, NULL); + sock_data_filt_flag_record); if (error != 0) { SBLASTRECORDCHK(sb, "sbappendrecord 1"); if (error != EJUSTRETURN) @@ -895,7 +936,7 @@ sbinsertoob(struct sockbuf *sb, struct mbuf *m0) if ((sb->sb_flags & SB_RECV) != 0) { int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL, - sock_data_filt_flag_oob, NULL); + sock_data_filt_flag_oob); SBLASTRECORDCHK(sb, "sbinsertoob 2"); if (error) { @@ -1040,7 +1081,7 @@ sbappendaddr(struct sockbuf *sb, struct sockaddr *asa, struct mbuf *m0, /* Call socket data in filters */ if ((sb->sb_flags & SB_RECV) != 0) { int error; - error = sflt_data_in(sb->sb_so, asa, &m0, &control, 0, NULL); + error = sflt_data_in(sb->sb_so, asa, &m0, &control, 0); SBLASTRECORDCHK(sb, __func__); if (error) { if (error != EJUSTRETURN) { @@ -1135,7 +1176,7 @@ sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control, if (sb->sb_flags & SB_RECV) { int error; - error = sflt_data_in(sb->sb_so, NULL, &m0, &control, 0, NULL); + error = sflt_data_in(sb->sb_so, NULL, &m0, &control, 0); SBLASTRECORDCHK(sb, __func__); if (error) { if (error != EJUSTRETURN) { @@ -1413,6 +1454,38 @@ sbcreatecontrol(caddr_t p, int size, int type, int level) return (m); } +struct mbuf** +sbcreatecontrol_mbuf(caddr_t p, int size, int type, int level, struct mbuf** mp) +{ + struct mbuf* m; + struct cmsghdr *cp; + + if (*mp == NULL){ + *mp = sbcreatecontrol(p, size, type, level); + return mp; + } + + if (CMSG_SPACE((u_int)size) + (*mp)->m_len > MLEN){ + mp = &(*mp)->m_next; + *mp = sbcreatecontrol(p, size, type, level); + return mp; + } + + m = *mp; + + cp = (struct cmsghdr *) (mtod(m, char *) + m->m_len); + m->m_len += CMSG_SPACE(size); + + /* XXX check size? */ + (void) memcpy(CMSG_DATA(cp), p, size); + cp->cmsg_len = CMSG_LEN(size); + cp->cmsg_level = level; + cp->cmsg_type = type; + + return mp; +} + + /* * Some routines that return EOPNOTSUPP for entry points that are not * supported by a protocol. Fill in as needed. 
@@ -1858,72 +1931,12 @@ soisbackground(struct socket *so) return (so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BACKGROUND); } -#if PKT_PRIORITY -#define _MIN_NXT_CMSGHDR_PTR(cmsg) \ - ((char *)(cmsg) + \ - __DARWIN_ALIGN32((__uint32_t)(cmsg)->cmsg_len) + \ - __DARWIN_ALIGN32(sizeof(struct cmsghdr))) - -#define M_FIRST_CMSGHDR(m) \ - ((char *)(m) != (char *)0L && (size_t)(m)->m_len >= sizeof(struct cmsghdr) && \ - (socklen_t)(m)->m_len >= __DARWIN_ALIGN32(((struct cmsghdr *)(m)->m_data)->cmsg_len) ?\ - (struct cmsghdr *)(m)->m_data : \ - (struct cmsghdr *)0L) - -#define M_NXT_CMSGHDR(m, cmsg) \ - ((char *)(cmsg) == (char *)0L ? M_FIRST_CMSGHDR(m) : \ - _MIN_NXT_CMSGHDR_PTR(cmsg) > ((char *)(m)->m_data) + (m)->m_len || \ - _MIN_NXT_CMSGHDR_PTR(cmsg) < (char *)(m)->m_data ? \ - (struct cmsghdr *)0L /* NULL */ : \ - (struct cmsghdr *)((unsigned char *)(cmsg) + \ - __DARWIN_ALIGN32((__uint32_t)(cmsg)->cmsg_len))) -#endif /* PKT_PRIORITY */ - -__private_extern__ int -mbuf_traffic_class_from_control(struct mbuf *control) -{ -#if !PKT_PRIORITY -#pragma unused(control) - return MBUF_TC_NONE; -#else /* PKT_PRIORITY */ - struct cmsghdr *cm; - - for (cm = M_FIRST_CMSGHDR(control); cm; cm = M_NXT_CMSGHDR(control, cm)) { - int tc; - - if (cm->cmsg_len < sizeof(struct cmsghdr)) - break; - - if (cm->cmsg_level != SOL_SOCKET || cm->cmsg_type != SO_TRAFFIC_CLASS) - continue; - if (cm->cmsg_len != CMSG_LEN(sizeof(int))) - continue; - - tc = *(int *)CMSG_DATA(cm); - - switch (tc) { - case SO_TC_BE: - return MBUF_TC_BE; - case SO_TC_BK: - return MBUF_TC_BK; - case SO_TC_VI: - return MBUF_TC_VI; - case SO_TC_VO: - return MBUF_TC_VO; - default: - break; - } - } - - return MBUF_TC_NONE; -#endif /* PKT_PRIORITY */ -} /* * Here is the definition of some of the basic objects in the kern.ipc * branch of the MIB. 
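With mbuf_traffic_class_from_control() and its cmsg-walking macros removed above, the per-message PKT_PRIORITY path is retired; a sender now classifies traffic per socket through the SO_TRAFFIC_CLASS option validated by so_set_traffic_class() elsewhere in this patch. A sketch of that usage, assuming SO_TRAFFIC_CLASS and the SO_TC_* constants are visible to user space on a system with this change:

#include <stdio.h>
#include <sys/socket.h>

int
main(void)
{
	int s = socket(AF_INET, SOCK_DGRAM, 0);

	if (s < 0) {
		perror("socket");
		return (1);
	}
#if defined(SO_TRAFFIC_CLASS) && defined(SO_TC_VO)
	int tc = SO_TC_VO;	/* voice class */

	if (setsockopt(s, SOL_SOCKET, SO_TRAFFIC_CLASS, &tc, sizeof (tc)) != 0)
		perror("SO_TRAFFIC_CLASS");
#endif
	return (0);
}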
*/ -SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "IPC"); +SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW|CTLFLAG_LOCKED|CTLFLAG_ANYBODY, 0, "IPC"); /* Check that the maximum socket buffer size is within a range */ @@ -1946,20 +1959,20 @@ sysctl_sb_max(__unused struct sysctl_oid *oidp, __unused void *arg1, return error; } -SYSCTL_PROC(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_INT | CTLFLAG_RW, +SYSCTL_PROC(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &sb_max, 0, &sysctl_sb_max, "IU", "Maximum socket buffer size"); -SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD, +SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD | CTLFLAG_LOCKED, &maxsockets, 0, "Maximum number of sockets available"); -SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW, +SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW | CTLFLAG_LOCKED, &sb_efficiency, 0, ""); -SYSCTL_INT(_kern_ipc, OID_AUTO, sbspace_factor, CTLFLAG_RW, +SYSCTL_INT(_kern_ipc, OID_AUTO, sbspace_factor, CTLFLAG_RW | CTLFLAG_LOCKED, &sbspace_factor, 0, "Ratio of mbuf/cluster use for socket layers"); -SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD, +SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD | CTLFLAG_LOCKED, &nmbclusters, 0, ""); -SYSCTL_INT(_kern_ipc, OID_AUTO, njcl, CTLFLAG_RD, &njcl, 0, ""); -SYSCTL_INT(_kern_ipc, OID_AUTO, njclbytes, CTLFLAG_RD, &njclbytes, 0, ""); -SYSCTL_INT(_kern_ipc, KIPC_SOQLIMITCOMPAT, soqlimitcompat, CTLFLAG_RW, +SYSCTL_INT(_kern_ipc, OID_AUTO, njcl, CTLFLAG_RD | CTLFLAG_LOCKED, &njcl, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, njclbytes, CTLFLAG_RD | CTLFLAG_LOCKED, &njclbytes, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_SOQLIMITCOMPAT, soqlimitcompat, CTLFLAG_RW | CTLFLAG_LOCKED, &soqlimitcompat, 1, "Enable socket queue limit compatibility"); -SYSCTL_INT(_kern_ipc, OID_AUTO, soqlencomp, CTLFLAG_RW, +SYSCTL_INT(_kern_ipc, OID_AUTO, soqlencomp, CTLFLAG_RW | CTLFLAG_LOCKED, &soqlencomp, 0, "Listen backlog represents only complete queue"); diff --git a/bsd/kern/uipc_syscalls.c b/bsd/kern/uipc_syscalls.c index 3c0aec400..521de769e 100644 --- a/bsd/kern/uipc_syscalls.c +++ b/bsd/kern/uipc_syscalls.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -86,6 +86,7 @@ #include <sys/kernel.h> #include <sys/uio_internal.h> #include <sys/kauth.h> +#include <kern/task.h> #include <security/audit/audit.h> @@ -191,6 +192,22 @@ socket(struct proc *p, struct socket_args *uap, int32_t *retval) if (error) { fp_free(p, fd, fp); } else { + thread_t thread; + struct uthread *ut; + + thread = current_thread(); + ut = get_bsdthread_info(thread); + + /* if this is a backgrounded thread then throttle all new sockets */ +#if !CONFIG_EMBEDDED + if (proc_get_selfthread_isbackground() != 0) +#else /* !CONFIG_EMBEDDED */ + if ( (ut->uu_flag & UT_BACKGROUND) != 0 ) +#endif /* !CONFIG_EMBEDDED */ + { + so->so_traffic_mgt_flags |= TRAFFIC_MGT_SO_BACKGROUND; + so->so_background_thread = thread; + } fp->f_data = (caddr_t)so; proc_fdlock(p); @@ -510,16 +527,12 @@ gotnoname: releasefd: /* - * If the socket has been marked as inactive by soacceptfilter(), - * disallow further operations on it. We explicitly call shutdown - * on both data directions to ensure that SS_CANT{RCV,SEND}MORE - * states are set for the socket.
This would also flush out data - * hanging off the receive list of this socket. + * If the socket has been marked as inactive by sosetdefunct(), + * disallow further operations on it. */ if (so->so_flags & SOF_DEFUNCT) { - (void) soshutdownlock(so, SHUT_RD); - (void) soshutdownlock(so, SHUT_WR); - (void) sodisconnectlocked(so); + sodefunct(current_proc(), so, + SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL); } if (dosocklock) @@ -735,6 +748,9 @@ socketpair(struct proc *p, struct socketpair_args *uap, } } + if ((error = copyout(sv, uap->rsv, 2 * sizeof (int))) != 0) + goto free4; + proc_fdlock(p); procfdtbl_releasefd(p, sv[0], NULL); procfdtbl_releasefd(p, sv[1], NULL); @@ -742,8 +758,7 @@ socketpair(struct proc *p, struct socketpair_args *uap, fp_drop(p, sv[1], fp2, 1); proc_fdunlock(p); - error = copyout((caddr_t)sv, uap->rsv, 2 * sizeof (int)); - return (error); + return (0); free4: fp_free(p, sv[1], fp2); free3: @@ -1194,63 +1209,79 @@ recvit(struct proc *p, int s, struct user_msghdr *mp, uio_t uiop, while (m && len > 0) { unsigned int tocopy; struct cmsghdr *cp = mtod(m, struct cmsghdr *); - - /* - * SCM_TIMESTAMP hack because struct timeval has a - * different size for 32 bits and 64 bits processes - */ - if (cp->cmsg_level == SOL_SOCKET && cp->cmsg_type == SCM_TIMESTAMP) { - unsigned char tmp_buffer[CMSG_SPACE(sizeof(struct user64_timeval))]; - struct cmsghdr *tmp_cp = (struct cmsghdr *)tmp_buffer; - int tmp_space; - struct timeval *tv = (struct timeval *)CMSG_DATA(cp); - - tmp_cp->cmsg_level = SOL_SOCKET; - tmp_cp->cmsg_type = SCM_TIMESTAMP; + int cp_size = CMSG_ALIGN(cp->cmsg_len); + int buflen = m->m_len; + + while (buflen > 0 && len > 0) { - if (proc_is64bit(p)) { - struct user64_timeval *tv64 = (struct user64_timeval *)CMSG_DATA(tmp_cp); - - tv64->tv_sec = tv->tv_sec; - tv64->tv_usec = tv->tv_usec; - - tmp_cp->cmsg_len = CMSG_LEN(sizeof(struct user64_timeval)); - tmp_space = CMSG_SPACE(sizeof(struct user64_timeval)); - } else { - struct user32_timeval *tv32 = (struct user32_timeval *)CMSG_DATA(tmp_cp); + /* + * SCM_TIMESTAMP hack because struct timeval has a + * different size for 32-bit and 64-bit processes + */ + if (cp->cmsg_level == SOL_SOCKET && cp->cmsg_type == SCM_TIMESTAMP) { + unsigned char tmp_buffer[CMSG_SPACE(sizeof(struct user64_timeval))]; + struct cmsghdr *tmp_cp = (struct cmsghdr *)tmp_buffer; + int tmp_space; + struct timeval *tv = (struct timeval *)CMSG_DATA(cp); + + tmp_cp->cmsg_level = SOL_SOCKET; + tmp_cp->cmsg_type = SCM_TIMESTAMP; + + if (proc_is64bit(p)) { + struct user64_timeval *tv64 = (struct user64_timeval *)CMSG_DATA(tmp_cp); + + tv64->tv_sec = tv->tv_sec; + tv64->tv_usec = tv->tv_usec; + + tmp_cp->cmsg_len = CMSG_LEN(sizeof(struct user64_timeval)); + tmp_space = CMSG_SPACE(sizeof(struct user64_timeval)); + } else { + struct user32_timeval *tv32 = (struct user32_timeval *)CMSG_DATA(tmp_cp); + + tv32->tv_sec = tv->tv_sec; + tv32->tv_usec = tv->tv_usec; + + tmp_cp->cmsg_len = CMSG_LEN(sizeof(struct user32_timeval)); + tmp_space = CMSG_SPACE(sizeof(struct user32_timeval)); + } + if (len >= tmp_space) { + tocopy = tmp_space; + } else { + mp->msg_flags |= MSG_CTRUNC; + tocopy = len; + } + error = copyout(tmp_buffer, ctlbuf, tocopy); + if (error) + goto out; - tv32->tv_sec = tv->tv_sec; - tv32->tv_usec = tv->tv_usec; - - tmp_cp->cmsg_len = CMSG_LEN(sizeof(struct user32_timeval)); - tmp_space = CMSG_SPACE(sizeof(struct user32_timeval)); - } - if (len >= tmp_space) { - tocopy = tmp_space; - } else { - mp->msg_flags |= MSG_CTRUNC; - tocopy = len; - } - error =
copyout(tmp_buffer, ctlbuf, tocopy); - if (error) - goto out; - - } else { - if (len >= m->m_len) { - tocopy = m->m_len; } else { - mp->msg_flags |= MSG_CTRUNC; - tocopy = len; + + if (cp_size > buflen) { + panic("cp_size > buflen, something wrong with alignment!"); + } + + if (len >= cp_size) { + tocopy = cp_size; + } else { + mp->msg_flags |= MSG_CTRUNC; + tocopy = len; + } + + error = copyout((caddr_t) cp, ctlbuf, + tocopy); + if (error) + goto out; } - - error = copyout((caddr_t)mtod(m, caddr_t), ctlbuf, - tocopy); - if (error) - goto out; + + + ctlbuf += tocopy; + len -= tocopy; + + buflen -= cp_size; + cp = (struct cmsghdr *) ((unsigned char *) cp + cp_size); + cp_size = CMSG_ALIGN(cp->cmsg_len); } - - ctlbuf += tocopy; - len -= tocopy; + m = m->m_next; } mp->msg_controllen = ctlbuf - mp->msg_control; @@ -1266,7 +1297,6 @@ out1: return (error); } - /* * Returns: 0 Success * ENOMEM @@ -1698,28 +1728,9 @@ getsockname(__unused struct proc *p, struct getsockname_args *uap, socket_lock(so, 1); error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa); if (error == 0) { - struct socket_filter_entry *filter; - int filtered = 0; - for (filter = so->so_filt; filter && error == 0; - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_getsockname) { - if (!filtered) { - filtered = 1; - sflt_use(so); - socket_unlock(so, 0); - } - error = filter->sfe_filter->sf_filter. - sf_getsockname(filter->sfe_cookie, so, &sa); - } - } - + error = sflt_getsockname(so, &sa); if (error == EJUSTRETURN) error = 0; - - if (filtered) { - socket_lock(so, 0); - sflt_unuse(so); - } } socket_unlock(so, 1); if (error) @@ -1802,28 +1813,9 @@ getpeername(__unused struct proc *p, struct getpeername_args *uap, sa = 0; error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa); if (error == 0) { - struct socket_filter_entry *filter; - int filtered = 0; - for (filter = so->so_filt; filter && error == 0; - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_getpeername) { - if (!filtered) { - filtered = 1; - sflt_use(so); - socket_unlock(so, 0); - } - error = filter->sfe_filter->sf_filter. - sf_getpeername(filter->sfe_cookie, so, &sa); - } - } - + error = sflt_getpeername(so, &sa); if (error == EJUSTRETURN) error = 0; - - if (filtered) { - socket_lock(so, 0); - sflt_unuse(so); - } } socket_unlock(so, 1); if (error) @@ -1983,7 +1975,7 @@ SYSCTL_DECL(_kern_ipc); #define SFUIOBUFS 64 static int sendfileuiobufs = SFUIOBUFS; -SYSCTL_INT(_kern_ipc, OID_AUTO, sendfileuiobufs, CTLFLAG_RW, &sendfileuiobufs, +SYSCTL_INT(_kern_ipc, OID_AUTO, sendfileuiobufs, CTLFLAG_RW | CTLFLAG_LOCKED, &sendfileuiobufs, 0, ""); /* Macros to compute the number of mbufs needed depending on cluster size */ @@ -2026,13 +2018,13 @@ alloc_sendpkt(int how, size_t pktlen, unsigned int *maxchunks, * use mbuf_allocpacket(). The logic below is similar to sosend(). 
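Both halves of the control-message plumbing touched by this patch are visible from user space: sbcreatecontrol_mbuf() packs cmsghdrs back to back the way a msghdr control buffer is laid out, and the reworked recvit() loop above walks them in CMSG_ALIGN(cmsg_len) strides. A self-contained sketch of that layout and walk using the portable CMSG_* macros from <sys/socket.h>; the cmsg_type values are illustrative only:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

int
main(void)
{
	union {
		unsigned char buf[2 * CMSG_SPACE(sizeof (int))];
		struct cmsghdr align;	/* force cmsghdr alignment */
	} u;
	struct msghdr msg;
	struct cmsghdr *cm = (struct cmsghdr *)u.buf;
	int one = 1, two = 2;

	/* pack two messages back to back, CMSG_SPACE() apart */
	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type = 1;		/* illustrative */
	cm->cmsg_len = CMSG_LEN(sizeof (int));
	memcpy(CMSG_DATA(cm), &one, sizeof (int));

	cm = (struct cmsghdr *)(u.buf + CMSG_SPACE(sizeof (int)));
	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type = 2;		/* illustrative */
	cm->cmsg_len = CMSG_LEN(sizeof (int));
	memcpy(CMSG_DATA(cm), &two, sizeof (int));

	/* walk them back, as a receiver would after recvmsg(2) */
	memset(&msg, 0, sizeof (msg));
	msg.msg_control = u.buf;
	msg.msg_controllen = sizeof (u.buf);
	for (cm = CMSG_FIRSTHDR(&msg); cm != NULL; cm = CMSG_NXTHDR(&msg, cm))
		printf("cmsg level=%d type=%d len=%u\n", cm->cmsg_level,
		    cm->cmsg_type, (unsigned int)cm->cmsg_len);
	return (0);
}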
*/ *m = NULL; - if (pktlen > NBPG && jumbocl) { + if (pktlen > MBIGCLBYTES && jumbocl) { needed = MIN(SENDFILE_MAX_16K, HOWMANY_16K(pktlen)); *m = m_getpackets_internal(&needed, 1, how, 0, M16KCLBYTES); } if (*m == NULL) { needed = MIN(SENDFILE_MAX_4K, HOWMANY_4K(pktlen)); - *m = m_getpackets_internal(&needed, 1, how, 0, NBPG); + *m = m_getpackets_internal(&needed, 1, how, 0, MBIGCLBYTES); } /* @@ -2043,7 +2035,7 @@ alloc_sendpkt(int how, size_t pktlen, unsigned int *maxchunks, */ if (*m == NULL) { needed = 1; - *m = m_getpackets_internal(&needed, 1, M_WAIT, 1, NBPG); + *m = m_getpackets_internal(&needed, 1, M_WAIT, 1, MBIGCLBYTES); } if (*m == NULL) panic("%s: blocking allocation returned NULL\n", __func__); @@ -2295,7 +2287,7 @@ sendfile(struct proc *p, struct sendfile_args *uap, __unused int *retval) if (xfsize != uio_resid(auio)) printf("sendfile: xfsize: %lld != uio_resid(auio): " - "%lld\n", xfsize, uio_resid(auio)); + "%lld\n", xfsize, (long long)uio_resid(auio)); KERNEL_DEBUG_CONSTANT((DBG_FNC_SENDFILE_READ | DBG_FUNC_START), uap->s, (unsigned int)((xfsize >> 32) & 0x0ffffffff), @@ -2385,53 +2377,20 @@ retry_space: } goto retry_space; } + + struct mbuf *control = NULL; { /* * Socket filter processing */ - struct socket_filter_entry *filter; - int filtered = 0; - struct mbuf *control = NULL; - boolean_t recursive = (so->so_send_filt_thread != NULL); - error = 0; - for (filter = so->so_filt; filter && (error == 0); - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_data_out) { - if (filtered == 0) { - filtered = 1; - so->so_send_filt_thread = - current_thread(); - sflt_use(so); - socket_unlock(so, 0); - } - error = filter->sfe_filter->sf_filter. - sf_data_out(filter->sfe_cookie, so, - NULL, &m0, &control, 0); - } - } - - if (filtered) { - /* - * At this point, we've run at least one filter. - * The socket is unlocked as is the socket - * buffer. Clear the recorded filter thread - * only when we are outside of a filter's - * context. This allows for a filter to issue - * multiple inject calls from its sf_data_out - * callback routine. - */ - socket_lock(so, 0); - sflt_unuse(so); - if (!recursive) - so->so_send_filt_thread = 0; - if (error) { - if (error == EJUSTRETURN) { - error = 0; - continue; - } - goto done3; + error = sflt_data_out(so, NULL, &m0, &control, 0); + if (error) { + if (error == EJUSTRETURN) { + error = 0; + continue; } + goto done3; } /* * End Socket filter processing @@ -2440,7 +2399,7 @@ retry_space: KERNEL_DEBUG_CONSTANT((DBG_FNC_SENDFILE_SEND | DBG_FUNC_START), uap->s, 0, 0, 0, 0); error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m0, - 0, 0, p); + 0, control, p); KERNEL_DEBUG_CONSTANT((DBG_FNC_SENDFILE_SEND | DBG_FUNC_START), uap->s, 0, 0, 0, 0); if (error) { diff --git a/bsd/kern/uipc_usrreq.c b/bsd/kern/uipc_usrreq.c index 202f2d858..c64053a2c 100644 --- a/bsd/kern/uipc_usrreq.c +++ b/bsd/kern/uipc_usrreq.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -117,6 +117,32 @@ static u_int disconnect_in_progress; extern lck_mtx_t *uipc_lock; static struct unp_head unp_shead, unp_dhead; +/* + * mDNSResponder tracing. When enabled, endpoints connected to + * /var/run/mDNSResponder will be traced; during each send on + * the traced socket, we log the PID and process name of the + * sending process. 
We also print out a bit of info related + * to the data itself; this assumes ipc_msg_hdr in dnssd_ipc.h + * of mDNSResponder stays the same. + */ +#define MDNSRESPONDER_PATH "/var/run/mDNSResponder" + +static int unpst_tracemdns; /* enable tracing */ + +#define MDNS_IPC_MSG_HDR_VERSION_1 1 + +struct mdns_ipc_msg_hdr { + uint32_t version; + uint32_t datalen; + uint32_t ipc_flags; + uint32_t op; + union { + void *context; + uint32_t u32[2]; + } __attribute__((packed)); + uint32_t reg_index; +} __attribute__((packed)); + /* * Unix communications domain. * @@ -271,7 +297,7 @@ uipc_detach(struct socket *so) if (unp == 0) return (EINVAL); - lck_mtx_assert(unp->unp_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&unp->unp_mtx, LCK_MTX_ASSERT_OWNED); unp_detach(unp); return (0); } @@ -428,7 +454,8 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, } so2 = unp->unp_conn->unp_socket; - unp_get_locks_in_order(so, so2); + if (so != so2) + unp_get_locks_in_order(so, so2); if (unp->unp_addr) from = (struct sockaddr *)unp->unp_addr; @@ -450,7 +477,8 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, control = NULL; } - socket_unlock(so2, 1); + if (so != so2) + socket_unlock(so2, 1); m = NULL; if (nam) @@ -498,6 +526,16 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, break; } + if (unp->unp_flags & UNP_TRACE_MDNS) { + struct mdns_ipc_msg_hdr hdr; + + if (mbuf_copydata(m, 0, sizeof (hdr), &hdr) == 0 && + hdr.version == ntohl(MDNS_IPC_MSG_HDR_VERSION_1)) { + printf("%s[mDNSResponder] pid=%d (%s): op=0x%x\n", + __func__, p->p_pid, p->p_comm, ntohl(hdr.op)); + } + } + /* * Send to paired receive port, and then reduce send buffer * hiwater marks to maintain backpressure. Wake up readers. 
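The tracing above assumes that the packed mdns_ipc_msg_hdr mirrors mDNSResponder's ipc_msg_hdr byte for byte. As a rough stand-alone check of that assumption (a sketch only; the 28-byte size and the offsets asserted below are computed from the packed struct in this patch, not taken from dnssd_ipc.h):

/*
 * Sketch: verify the mdns_ipc_msg_hdr layout assumption in user space.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct mdns_ipc_msg_hdr {
	uint32_t version;
	uint32_t datalen;
	uint32_t ipc_flags;
	uint32_t op;
	union {
		void *context;
		uint32_t u32[2];
	} __attribute__((packed));
	uint32_t reg_index;
} __attribute__((packed));

int
main(void)
{
	/* The union is 8 bytes on both ILP32 and LP64 (u32[2] dominates). */
	assert(sizeof(struct mdns_ipc_msg_hdr) == 28);
	assert(offsetof(struct mdns_ipc_msg_hdr, op) == 12);
	assert(offsetof(struct mdns_ipc_msg_hdr, reg_index) == 24);
	return (0);
}

If these offsets ever drift from dnssd_ipc.h, the kernel's mbuf_copydata() in uipc_send() would read the wrong fields, so a check of this kind is a cheap safeguard.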
@@ -694,17 +732,19 @@ static int unp_rights; /* file descriptors in flight */ static int unp_disposed; /* discarded file descriptors */ SYSCTL_DECL(_net_local_stream); -SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW, +SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW | CTLFLAG_LOCKED, &unpst_sendspace, 0, ""); -SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW, +SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW | CTLFLAG_LOCKED, &unpst_recvspace, 0, ""); +SYSCTL_INT(_net_local_stream, OID_AUTO, tracemdns, CTLFLAG_RW | CTLFLAG_LOCKED, + &unpst_tracemdns, 0, ""); SYSCTL_DECL(_net_local_dgram); -SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW, +SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW | CTLFLAG_LOCKED, &unpdg_sendspace, 0, ""); -SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW, +SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW | CTLFLAG_LOCKED, &unpdg_recvspace, 0, ""); SYSCTL_DECL(_net_local); -SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, ""); +SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD | CTLFLAG_LOCKED, &unp_rights, 0, ""); /* * Returns: 0 Success @@ -739,11 +779,8 @@ unp_attach(struct socket *so) return (ENOBUFS); bzero(unp, sizeof (*unp)); - unp->unp_mtx = lck_mtx_alloc_init(unp_mtx_grp, unp_mtx_attr); - if (unp->unp_mtx == NULL) { - zfree(unp_zone, unp); - return(ENOBUFS); - } + lck_mtx_init(&unp->unp_mtx, + unp_mtx_grp, unp_mtx_attr); lck_rw_lock_exclusive(unp_list_mtx); LIST_INIT(&unp->unp_refs); @@ -892,7 +929,7 @@ unp_bind( socket_unlock(so, 0); strlcpy(buf, soun->sun_path, namelen+1); - NDINIT(&nd, CREATE, FOLLOW | LOCKPARENT, UIO_SYSSPACE, + NDINIT(&nd, CREATE, OP_MKFIFO, FOLLOW | LOCKPARENT, UIO_SYSSPACE, CAST_USER_ADDR_T(buf), ctx); /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ error = namei(&nd); @@ -938,7 +975,7 @@ unp_bind( if (!error) { /* create the socket */ - error = vn_create(dvp, &vp, &nd.ni_cnd, &va, 0, ctx); + error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx); } nameidone(&nd); @@ -1001,7 +1038,7 @@ unp_connect(struct socket *so, struct sockaddr *nam, __unused proc_t p) strlcpy(buf, soun->sun_path, len+1); socket_unlock(so, 0); - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, + NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, CAST_USER_ADDR_T(buf), ctx); error = namei(&nd); if (error) { @@ -1046,8 +1083,13 @@ unp_connect(struct socket *so, struct sockaddr *nam, __unused proc_t p) if (so2->so_pcb == NULL) { error = ECONNREFUSED; - socket_unlock(so2, 1); - socket_lock(so, 0); + if (so != so2) { + socket_unlock(so2, 1); + socket_lock(so, 0); + } else { + /* Release the reference held for the listen socket */ + so2->so_usecount--; + } goto out; } @@ -1055,7 +1097,7 @@ unp_connect(struct socket *so, struct sockaddr *nam, __unused proc_t p) socket_unlock(so2, 0); socket_lock(so, 0); socket_lock(so2, 0); - } else { + } else if (so > so2) { socket_lock(so, 0); } /* @@ -1064,15 +1106,13 @@ unp_connect(struct socket *so, struct sockaddr *nam, __unused proc_t p) * XXX - probably shouldn't return an error for SOCK_DGRAM */ if ((so->so_state & SS_ISCONNECTED) != 0) { - socket_unlock(so2, 1); error = EISCONN; - goto out; + goto decref_out; } if (so->so_type != so2->so_type) { - socket_unlock(so2, 1); error = EPROTOTYPE; - goto out; + goto decref_out; } if (so->so_proto->pr_flags & PR_CONNREQUIRED) { @@ -1149,19 +1189,41 @@ unp_connect(struct socket *so, struct sockaddr *nam, __unused proc_t p) 
socket_lock(so3, 1); so2 = so3; + /* + * Enable tracing for mDNSResponder endpoints. (The use + * of sizeof instead of strlen below takes the null + * terminating character into account.) + */ + if (unpst_tracemdns && + !strncmp(soun->sun_path, MDNSRESPONDER_PATH, + sizeof (MDNSRESPONDER_PATH))) { + unp->unp_flags |= UNP_TRACE_MDNS; + unp2->unp_flags |= UNP_TRACE_MDNS; + } } error = unp_connect2(so, so2); + +decref_out: if (so2 != NULL) { - socket_unlock(so2, 1); + if (so != so2) { + socket_unlock(so2, 1); + } else { + /* Release the extra reference held for the listen socket. + * This is possible only for SOCK_DGRAM sockets. We refuse + * connecting to the same socket for SOCK_STREAM sockets. + */ + so2->so_usecount--; + } } if (list_so != NULL) { socket_lock(list_so, 0); socket_unlock(list_so, 1); } + out: - lck_mtx_assert(unp->unp_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&unp->unp_mtx, LCK_MTX_ASSERT_OWNED); vnode_put(vp); return (error); } @@ -1182,8 +1244,8 @@ unp_connect2(struct socket *so, struct socket *so2) unp2 = sotounpcb(so2); - lck_mtx_assert(unp->unp_mtx, LCK_MTX_ASSERT_OWNED); - lck_mtx_assert(unp2->unp_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&unp->unp_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&unp2->unp_mtx, LCK_MTX_ASSERT_OWNED); /* Verify both sockets are still opened */ if (unp == 0 || unp2 == 0) @@ -1197,15 +1259,18 @@ unp_connect2(struct socket *so, struct socket *so2) case SOCK_DGRAM: LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink); - - /* Avoid lock order reversals due to drop/acquire in soisconnected. */ - /* Keep an extra reference on so2 that will be dropped - * soon after getting the locks in order - */ - socket_unlock(so2, 0); - soisconnected(so); - unp_get_locks_in_order(so, so2); - so2->so_usecount--; + if (so != so2) { + /* Avoid lock order reversals due to drop/acquire in soisconnected. 
*/ + /* Keep an extra reference on so2 that will be dropped + * soon after getting the locks in order + */ + socket_unlock(so2, 0); + soisconnected(so); + unp_get_locks_in_order(so, so2); + so2->so_usecount--; + } else { + soisconnected(so); + } break; @@ -1242,8 +1307,8 @@ unp_connect2(struct socket *so, struct socket *so2) default: panic("unknown socket type %d in unp_connect2", so->so_type); } - lck_mtx_assert(unp->unp_mtx, LCK_MTX_ASSERT_OWNED); - lck_mtx_assert(unp2->unp_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&unp->unp_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&unp2->unp_mtx, LCK_MTX_ASSERT_OWNED); return (0); } @@ -1284,7 +1349,12 @@ unp_disconnect(struct unpcb *unp) so2 = unp2->unp_socket; try_again: - if (so < so2) { + if (so == so2) { + if (so_locked == 0) { + socket_lock(so, 0); + } + waitso = so; + } else if (so < so2) { if (so_locked == 0) { socket_lock(so, 0); } @@ -1298,19 +1368,22 @@ try_again: socket_lock(so, 0); waitso = so; } + so_locked = 1; - lck_mtx_assert(unp->unp_mtx, LCK_MTX_ASSERT_OWNED); - lck_mtx_assert(unp2->unp_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&unp->unp_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&unp2->unp_mtx, LCK_MTX_ASSERT_OWNED); /* Check for the UNP_DONTDISCONNECT flag, if it * is set, release both sockets and go to sleep */ if ((((struct unpcb *)waitso->so_pcb)->unp_flags & UNP_DONTDISCONNECT) != 0) { - socket_unlock(so2, 1); + if (so != so2) { + socket_unlock(so2, 1); + } so_locked = 0; - (void)msleep(waitso->so_pcb, unp->unp_mtx, + (void)msleep(waitso->so_pcb, &unp->unp_mtx, PSOCK | PDROP, "unpdisconnect", NULL); goto try_again; } @@ -1322,12 +1395,16 @@ try_again: unp->unp_conn = NULL; so2->so_usecount--; + if (unp->unp_flags & UNP_TRACE_MDNS) + unp->unp_flags &= ~UNP_TRACE_MDNS; + switch (unp->unp_socket->so_type) { case SOCK_DGRAM: LIST_REMOVE(unp, unp_reflink); unp->unp_socket->so_state &= ~SS_ISCONNECTED; - socket_unlock(so2, 1); + if (so != so2) + socket_unlock(so2, 1); break; case SOCK_STREAM: @@ -1343,6 +1420,10 @@ try_again: unp2->unp_socket->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); unp->unp_socket->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED); + + if (unp2->unp_flags & UNP_TRACE_MDNS) + unp2->unp_flags &= ~UNP_TRACE_MDNS; + strdisconn = 1; break; default: @@ -1362,7 +1443,7 @@ out: socket_lock(so,0); soisdisconnected(so); } - lck_mtx_assert(unp->unp_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&unp->unp_mtx, LCK_MTX_ASSERT_OWNED); return; } @@ -1519,10 +1600,10 @@ unp_pcblist SYSCTL_HANDLER_ARGS return (error); } -SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD, +SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED, (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb", "List of active local datagram sockets"); -SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD, +SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED, (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb", "List of active local stream sockets"); @@ -1662,10 +1743,10 @@ unp_pcblist64 SYSCTL_HANDLER_ARGS return (error); } -SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist64, CTLFLAG_RD, +SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist64, CTLFLAG_RD | CTLFLAG_LOCKED, (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist64, "S,xunpcb64", "List of active local datagram sockets 64 bit"); -SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist64, CTLFLAG_RD, +SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist64, CTLFLAG_RD | CTLFLAG_LOCKED, (caddr_t)(long)SOCK_STREAM, 0, 
unp_pcblist64, "S,xunpcb64", "List of active local stream sockets 64 bit"); @@ -2195,7 +2276,7 @@ unp_lock(struct socket *so, int refcount, void * lr) else lr_saved = lr; if (so->so_pcb) { - lck_mtx_lock(((struct unpcb *)so->so_pcb)->unp_mtx); + lck_mtx_lock(&((struct unpcb *)so->so_pcb)->unp_mtx); } else { panic("unp_lock: so=%p NO PCB! lr=%p ref=0x%x\n", so, lr_saved, so->so_usecount); @@ -2232,7 +2313,7 @@ unp_unlock(struct socket *so, int refcount, void * lr) if (so->so_pcb == NULL) { panic("unp_unlock: so=%p NO PCB usecount=%x\n", so, so->so_usecount); } else { - mutex_held = ((struct unpcb *)so->so_pcb)->unp_mtx; + mutex_held = &((struct unpcb *)so->so_pcb)->unp_mtx; } lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); so->unlock_lr[so->next_unlock_lr] = lr_saved; @@ -2245,8 +2326,6 @@ unp_unlock(struct socket *so, int refcount, void * lr) FREE(unp->unp_addr, M_SONAME); lck_mtx_unlock(mutex_held); - if (unp->unp_mtx) - lck_mtx_free(unp->unp_mtx, unp_mtx_grp); unp->unp_gencnt = ++unp_gencnt; zfree(unp_zone, unp); @@ -2269,7 +2348,7 @@ unp_getlock(struct socket *so, __unused int locktype) if (so->so_pcb) { if (so->so_usecount < 0) panic("unp_getlock: so=%p usecount=%x\n", so, so->so_usecount); - return(unp->unp_mtx); + return(&unp->unp_mtx); } else { panic("unp_getlock: so=%p NULL so_pcb\n", so); return (so->so_proto->pr_domain->dom_mtx); diff --git a/bsd/kern/vm_pressure.c b/bsd/kern/vm_pressure.c new file mode 100644 index 000000000..b5fc2f072 --- /dev/null +++ b/bsd/kern/vm_pressure.c @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2009-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include <libkern/libkern.h> +#include <mach/mach_types.h> +#include <mach/task.h> +#include <sys/proc_internal.h> +#include <sys/event.h> +#include <sys/eventvar.h> +#include <kern/locks.h> +#include <sys/queue.h> +#include <kern/vm_pressure.h> +#include <sys/malloc.h> +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/sysctl.h> + +void vm_pressure_klist_lock(void); +void vm_pressure_klist_unlock(void); + +void vm_dispatch_memory_pressure(void); +int vm_try_terminate_candidates(void); +int vm_try_pressure_candidates(void); +void vm_recharge_active_list(void); + +struct klist vm_pressure_klist; +struct klist vm_pressure_klist_dormant; + +void vm_pressure_klist_lock(void) { + lck_mtx_lock(&vm_pressure_klist_mutex); +} + +void vm_pressure_klist_unlock(void) { + lck_mtx_unlock(&vm_pressure_klist_mutex); +} + +int vm_knote_register(struct knote *kn) { + int rv = 0; + + vm_pressure_klist_lock(); + + if ((kn->kn_sfflags & (NOTE_VM_PRESSURE))) { +#if DEBUG + printf("[vm_pressure] process %d registering pressure notification\n", kn->kn_kq->kq_p->p_pid); +#endif + KNOTE_ATTACH(&vm_pressure_klist, kn); + } else + rv = ENOTSUP; + + vm_pressure_klist_unlock(); + + return rv; +} + +void vm_knote_unregister(struct knote *kn) { + struct knote *kn_temp; + + vm_pressure_klist_lock(); + +#if DEBUG + printf("[vm_pressure] process %d cancelling pressure notification\n", kn->kn_kq->kq_p->p_pid); +#endif + + SLIST_FOREACH(kn_temp, &vm_pressure_klist, kn_selnext) { + if (kn_temp == kn) { + KNOTE_DETACH(&vm_pressure_klist, kn); + vm_pressure_klist_unlock(); + return; + } + } + KNOTE_DETACH(&vm_pressure_klist_dormant, kn); + + vm_pressure_klist_unlock(); +} + +/* Interface for event dispatch from vm_pageout_garbage_collect thread */ +void consider_pressure_events(void) { + vm_dispatch_memory_pressure(); +} + +void vm_dispatch_memory_pressure(void) { + vm_pressure_klist_lock(); + + if (!SLIST_EMPTY(&vm_pressure_klist)) { + +#if DEBUG + printf("[vm_pressure] vm_dispatch_memory_pressure\n"); +#endif + + if (vm_try_pressure_candidates()) { + vm_pressure_klist_unlock(); + return; + } + + } + + /* Else... */ + +#if DEBUG + printf("[vm_pressure] could not find suitable event candidate\n"); +#endif + + vm_recharge_active_list(); + + vm_pressure_klist_unlock(); +} + +/* + * Try standard pressure event candidates. Called with klist lock held. + */ +int vm_try_pressure_candidates(void) { + /* + * This value is the threshold that a process must meet to be considered for scavenging. + * If a process has sufficiently little resident memory, there is probably no use scavenging it. + * At best, we'll scavenge very little memory. At worst, we'll page in code pages or malloc metadata. 
+ */ + +#define VM_PRESSURE_MINIMUM_RSIZE (10 * 1024 * 1024) + + struct proc *p_max = NULL; + unsigned int resident_max = 0; + struct knote *kn_max = NULL; + struct knote *kn; + + SLIST_FOREACH(kn, &vm_pressure_klist, kn_selnext) { + if ( (kn != NULL ) && ( kn->kn_kq != NULL ) && ( kn->kn_kq->kq_p != NULL ) ) { + if (kn->kn_sfflags & NOTE_VM_PRESSURE) { + struct proc *p = kn->kn_kq->kq_p; + if (!(kn->kn_status & KN_DISABLED)) { + kern_return_t kr = KERN_SUCCESS; + struct task *t = (struct task *)(p->task); + struct task_basic_info basic_info; + mach_msg_type_number_t size = TASK_BASIC_INFO_COUNT; + if( ( kr = task_info(t, TASK_BASIC_INFO, (task_info_t)(&basic_info), &size)) == KERN_SUCCESS ) { + unsigned int resident_size = basic_info.resident_size; + /* + * We don't want a small process to block large processes from + * being notified again. <rdar://problem/7955532> + */ + if (resident_size >= VM_PRESSURE_MINIMUM_RSIZE) { + if (resident_size > resident_max) { + p_max = p; + resident_max = resident_size; + kn_max = kn; + } + } else { +#if DEBUG + /* There was no candidate with enough resident memory to scavenge */ + /* This debug print makes too much noise now */ + //printf("[vm_pressure] threshold failed for pid %d with %u resident, skipping...\n", p->p_pid, resident_size); +#endif + } + } else { +#if DEBUG + printf("[vm_pressure] task_info for pid %d failed with %d\n", p->p_pid, kr); +#endif + } + } else { +#if DEBUG + printf("[vm_pressure] pid %d currently disabled, skipping...\n", p->p_pid); +#endif + } + } + } else { +#if DEBUG + if (kn == NULL) { + printf("[vm_pressure] kn is NULL\n"); + } else if (kn->kn_kq == NULL) { + printf("[vm_pressure] kn->kn_kq is NULL\n"); + } else if (kn->kn_kq->kq_p == NULL) { + printf("[vm_pressure] kn->kn_kq->kq_p is NULL\n"); + } +#endif + } + } + + if (kn_max == NULL) return 0; + +#if DEBUG + printf("[vm_pressure] sending event to pid %d with %u resident\n", kn_max->kn_kq->kq_p->p_pid, resident_max); +#endif + + KNOTE_DETACH(&vm_pressure_klist, kn_max); + struct klist dispatch_klist = { NULL }; + KNOTE_ATTACH(&dispatch_klist, kn_max); + KNOTE(&dispatch_klist, NOTE_VM_PRESSURE); + KNOTE_ATTACH(&vm_pressure_klist_dormant, kn_max); + + return 1; +} + + +/* + * Remove all elements from the dormant list and place them on the active list. + * Called with klist lock held. + */ +void vm_recharge_active_list(void) { + /* Re-charge the main list from the dormant list if possible */ + if (!SLIST_EMPTY(&vm_pressure_klist_dormant)) { +#if DEBUG + printf("[vm_pressure] recharging main list from dormant list\n"); +#endif + struct knote *kn; + while (!SLIST_EMPTY(&vm_pressure_klist_dormant)) { + kn = SLIST_FIRST(&vm_pressure_klist_dormant); + SLIST_REMOVE_HEAD(&vm_pressure_klist_dormant, kn_selnext); + SLIST_INSERT_HEAD(&vm_pressure_klist, kn, kn_selnext); + } + } +} diff --git a/bsd/ppc/reg.h b/bsd/kern/vm_pressure.h similarity index 80% rename from bsd/ppc/reg.h rename to bsd/kern/vm_pressure.h index 0449be6df..8063c820a 100644 --- a/bsd/ppc/reg.h +++ b/bsd/kern/vm_pressure.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,21 +25,17 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * Copyright 1993, NeXT Computer, Inc. 
- */ - -#ifndef _BSD_PPC_REG_H_ -#define _BSD_PPC_REG_H_ +#ifndef VM_PRESSURE_H +#define VM_PRESSURE_H -#ifdef BSD_KERNEL_PRIVATE +#include <sys/queue.h> -/* Index into the thread_state */ -#define SP 3 -#define PC 0 +static lck_mtx_t vm_pressure_klist_mutex; -#endif /* KERNEL_PRIVATE */ +int vm_knote_register(struct knote *); +void vm_knote_unregister(struct knote *); -#endif /* _BSD_PPC_REG_H_ */ +void consider_pressure_events(void); +#endif /* VM_PRESSURE_H */ diff --git a/bsd/libkern/libkern.h b/bsd/libkern/libkern.h index 6fd1f7a86..0d9cff919 100644 --- a/bsd/libkern/libkern.h +++ b/bsd/libkern/libkern.h @@ -203,16 +203,7 @@ extern void flush_dcache64(addr64_t, unsigned, int); static __inline__ unsigned int clz(unsigned int num) { -#if __ppc__ - unsigned int result; - __asm__ volatile( - "cntlzw %0, %1" - : "=r" (result) - : "r" (num) - ); - return result; - -#elif __i386__ +#if __i386__ unsigned int result; __asm__ volatile( "bsrl %1, %0\n\t" diff --git a/bsd/machine/_limits.h b/bsd/machine/_limits.h index dd32b6197..c1d8abd07 100644 --- a/bsd/machine/_limits.h +++ b/bsd/machine/_limits.h @@ -28,9 +28,7 @@ #ifndef _BSD_MACHINE__LIMITS_H_ #define _BSD_MACHINE__LIMITS_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/_limits.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/_limits.h" #else #error architecture not supported diff --git a/bsd/machine/_param.h b/bsd/machine/_param.h index 844370744..beb2cb939 100644 --- a/bsd/machine/_param.h +++ b/bsd/machine/_param.h @@ -25,9 +25,7 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/_param.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/_param.h" #else #error architecture not supported diff --git a/bsd/machine/_structs.h b/bsd/machine/_structs.h index a0e15996e..509d5f618 100644 --- a/bsd/machine/_structs.h +++ b/bsd/machine/_structs.h @@ -25,9 +25,7 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/_structs.h" -#elif defined (__i386__) || defined (__x86_64__) +#if defined (__i386__) || defined (__x86_64__) #include "i386/_structs.h" #else #error architecture not supported diff --git a/bsd/machine/_types.h b/bsd/machine/_types.h index ceac56ea0..92c65bf6c 100644 --- a/bsd/machine/_types.h +++ b/bsd/machine/_types.h @@ -28,9 +28,7 @@ #ifndef _BSD_MACHINE__TYPES_H_ #define _BSD_MACHINE__TYPES_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/_types.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/_types.h" #else #error architecture not supported diff --git a/bsd/machine/dis_tables.h b/bsd/machine/dis_tables.h index 6eaff8106..7ac37dd7e 100644 --- a/bsd/machine/dis_tables.h +++ b/bsd/machine/dis_tables.h @@ -28,9 +28,7 @@ #ifndef _BSD_MACHINE_DIS_TABLES_H_ #define _BSD_MACHINE_DIS_TABLES_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/dis_tables.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/dis_tables.h" #else #error architecture not supported diff --git a/bsd/machine/disklabel.h b/bsd/machine/disklabel.h index 93fa986ed..490bbda8a 100644 --- a/bsd/machine/disklabel.h +++ b/bsd/machine/disklabel.h @@ -28,9 +28,7 @@ #ifndef _BSD_MACHINE_CPU_H_ #define _BSD_MACHINE_CPU_H_ -#if defined (__ppc__) || defined (__ppc64__) 
-#include "ppc/disklabel.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/disklabel.h" #else #error architecture not supported diff --git a/bsd/machine/endian.h b/bsd/machine/endian.h index 879cf17bd..871af6483 100644 --- a/bsd/machine/endian.h +++ b/bsd/machine/endian.h @@ -31,9 +31,7 @@ #ifndef _BSD_MACHINE_ENDIAN_H_ #define _BSD_MACHINE_ENDIAN_H_ -#if defined (__ppc__) || defined(__ppc64__) -#include "ppc/endian.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/endian.h" #else #error architecture not supported diff --git a/bsd/machine/exec.h b/bsd/machine/exec.h index fc8a27279..1a6417179 100644 --- a/bsd/machine/exec.h +++ b/bsd/machine/exec.h @@ -44,16 +44,14 @@ struct exec_info { struct exec_archhandler { char path[MAXPATHLEN]; uint32_t fsid; - long fileid; + uint64_t fileid; }; extern struct exec_archhandler exec_archhandler_ppc; int set_archhandler(struct proc *, int); int grade_binary(cpu_type_t, cpu_subtype_t); -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/exec.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/exec.h" #else #error architecture not supported diff --git a/bsd/machine/fasttrap_isa.h b/bsd/machine/fasttrap_isa.h index d57bac1ba..cfe9e297a 100644 --- a/bsd/machine/fasttrap_isa.h +++ b/bsd/machine/fasttrap_isa.h @@ -28,9 +28,7 @@ #ifndef _BSD_MACHINE_FASTTRAP_ISA_H_ #define _BSD_MACHINE_FASTTRAP_ISA_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/fasttrap_isa.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/fasttrap_isa.h" #else #error architecture not supported diff --git a/bsd/machine/limits.h b/bsd/machine/limits.h index 0f40842f9..e96709f89 100644 --- a/bsd/machine/limits.h +++ b/bsd/machine/limits.h @@ -2,9 +2,7 @@ compiler. GCC provides its own limits.h which can be found in /usr/lib/gcc, although it is not very informative. This file is public domain. 
*/ -#if defined (__ppc__) || defined (__ppc64__) -#include <ppc/limits.h> -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include <i386/limits.h> #else #error architecture not supported diff --git a/bsd/machine/param.h b/bsd/machine/param.h index 6253a5fb4..2724da7e1 100644 --- a/bsd/machine/param.h +++ b/bsd/machine/param.h @@ -31,9 +31,7 @@ #ifndef _BSD_MACHINE_PARAM_H_ #define _BSD_MACHINE_PARAM_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/param.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/param.h" #else #error architecture not supported diff --git a/bsd/machine/profile.h b/bsd/machine/profile.h index ea28264c6..cc8a5eac0 100644 --- a/bsd/machine/profile.h +++ b/bsd/machine/profile.h @@ -33,9 +33,7 @@ #ifndef _BSD_MACHINE_PROFILE_H_ #define _BSD_MACHINE_PROFILE_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/profile.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/profile.h" #else #error architecture not supported diff --git a/bsd/machine/psl.h b/bsd/machine/psl.h index 711639e4f..01c6e0a25 100644 --- a/bsd/machine/psl.h +++ b/bsd/machine/psl.h @@ -28,9 +28,7 @@ #ifndef _BSD_MACHINE_PSL_H_ #define _BSD_MACHINE_PSL_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/psl.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/psl.h" #else #error architecture not supported diff --git a/bsd/machine/ptrace.h b/bsd/machine/ptrace.h index 031327fe4..3320c2226 100644 --- a/bsd/machine/ptrace.h +++ b/bsd/machine/ptrace.h @@ -31,9 +31,7 @@ #ifndef _BSD_MACHINE_PTRACE_H_ #define _BSD_MACHINE_PTRACE_H_ -#if defined (__ppc__) || defined(__ppc64__) -#include "ppc/ptrace.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/ptrace.h" #else #error architecture not supported diff --git a/bsd/machine/reboot.h b/bsd/machine/reboot.h index cf91c27da..864f1970c 100644 --- a/bsd/machine/reboot.h +++ b/bsd/machine/reboot.h @@ -28,9 +28,7 @@ #ifndef _BSD_MACHINE_REBOOT_H_ #define _BSD_MACHINE_REBOOT_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/reboot.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/reboot.h" #else #error architecture not supported diff --git a/bsd/machine/reg.h b/bsd/machine/reg.h index 95ec0f7d1..30e5dc524 100644 --- a/bsd/machine/reg.h +++ b/bsd/machine/reg.h @@ -28,9 +28,7 @@ #ifndef _BSD_MACHINE_REG_H_ #define _BSD_MACHINE_REG_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/reg.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/reg.h" #else #error architecture not supported diff --git a/bsd/machine/setjmp.h b/bsd/machine/setjmp.h index 4f37be1d3..262acfbc8 100644 --- a/bsd/machine/setjmp.h +++ b/bsd/machine/setjmp.h @@ -31,9 +31,7 @@ #ifndef _MACHINE_SETJMP_H_ #define _MACHINE_SETJMP_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/setjmp.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/setjmp.h" #else #error architecture not supported diff --git a/bsd/machine/signal.h b/bsd/machine/signal.h index 227d4182e..4b7f69c19 100644 --- a/bsd/machine/signal.h +++ b/bsd/machine/signal.h @@ -28,9 +28,7 @@ #ifndef 
_BSD_MACHINE_SIGNAL_H_ #define _BSD_MACHINE_SIGNAL_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/signal.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/signal.h" #else #error architecture not supported diff --git a/bsd/machine/types.h b/bsd/machine/types.h index ed113ddde..5d6d4db44 100644 --- a/bsd/machine/types.h +++ b/bsd/machine/types.h @@ -31,9 +31,7 @@ #ifndef _BSD_MACHINE_TYPES_H_ #define _BSD_MACHINE_TYPES_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/types.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/types.h" #else #error architecture not supported diff --git a/bsd/machine/ucontext.h b/bsd/machine/ucontext.h index a0e91489a..60e157643 100644 --- a/bsd/machine/ucontext.h +++ b/bsd/machine/ucontext.h @@ -28,9 +28,7 @@ #ifndef _MACHINE_UCONTEXT_H_ #define _MACHINE_UCONTEXT_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/ucontext.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/ucontext.h" #else #error architecture not supported diff --git a/bsd/machine/vmparam.h b/bsd/machine/vmparam.h index 8911ea054..54b212382 100644 --- a/bsd/machine/vmparam.h +++ b/bsd/machine/vmparam.h @@ -28,9 +28,7 @@ #ifndef _BSD_MACHINE_VMPARAM_H_ #define _BSD_MACHINE_VMPARAM_H_ -#if defined (__ppc__) || defined (__ppc64__) -#include "ppc/vmparam.h" -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) #include "i386/vmparam.h" #else #error architecture not supported diff --git a/bsd/man/man2/Makefile b/bsd/man/man2/Makefile index 12cc26329..93d247a32 100644 --- a/bsd/man/man2/Makefile +++ b/bsd/man/man2/Makefile @@ -67,6 +67,7 @@ DATAFILES = \ getauid.2 \ getdirentries.2 \ getdirentriesattr.2 \ + getdtablesize.2 \ getegid.2 \ geteuid.2 \ getfh.2 \ @@ -145,6 +146,11 @@ DATAFILES = \ rmdir.2 \ searchfs.2 \ select.2 \ + sem_close.2 \ + sem_open.2 \ + sem_post.2 \ + sem_unlink.2 \ + sem_wait.2 \ semctl.2 \ semget.2 \ semop.2 \ @@ -166,12 +172,16 @@ DATAFILES = \ setpgid.2 \ setpgrp.2 \ setpriority.2 \ + setregid.2 \ + setreuid.2 \ setrlimit.2 \ setsid.2 \ setsockopt.2 \ settimeofday.2 \ setuid.2 \ setxattr.2 \ + shm_open.2 \ + shm_unlink.2 \ shmat.2 \ shmctl.2 \ shmdt.2 \ @@ -194,6 +204,7 @@ DATAFILES = \ syscall.2 \ truncate.2 \ umask.2 \ + undelete.2 \ unlink.2 \ unmount.2 \ utimes.2 \ @@ -207,7 +218,8 @@ DATAFILES = \ posix_spawn.2 INSTALL_MAN_LINKS = \ - posix_spawn.2 posix_spawnp.2 + posix_spawn.2 posix_spawnp.2 \ + sem_wait.2 sem_trywait.2 INSTALL_MAN_LIST = ${DATAFILES} diff --git a/bsd/man/man2/auditon.2 b/bsd/man/man2/auditon.2 index bf37e6ab4..4d551ba7a 100644 --- a/bsd/man/man2/auditon.2 +++ b/bsd/man/man2/auditon.2 @@ -243,6 +243,15 @@ structure with the field set to the maximum audit log file size. A value of 0 indicates no limit to the size. +.It Dv A_SETSFLAGS +Set the audit session flags for the current session. +The +.Fa data +argument must point to an +.Vt au_asflgs_t +value containing the new audit session flags. +Audit session flags may be updated only according to local +access control policy. .It Dv A_GETCLASS Return the event to class mapping for the designated audit event. The @@ -376,6 +385,13 @@ The .Va af_currsz field will be set to the current audit log file size. +.It Dv A_GETSFLAGS +Returns the audit session flags for the current session.
+The +.Fa data +argument must point to an +.Vt au_asflgs_t +value which will be set with the current session flags. .It Dv A_GETCWD .\" [COMMENTED OUT]: Valid description, not yet implemented. .\" Return the current working directory as stored in the audit subsystem. diff --git a/bsd/man/man2/dup.2 b/bsd/man/man2/dup.2 index c13ca0bb5..897966a52 100644 --- a/bsd/man/man2/dup.2 +++ b/bsd/man/man2/dup.2 @@ -33,7 +33,7 @@ .\" .\" @(#)dup.2 8.1 (Berkeley) 6/4/93 .\" -.Dd June 4, 1993 +.Dd December 1, 2010 .Dt DUP 2 .Os BSD 4 .Sh NAME @@ -100,8 +100,18 @@ In the value of the new descriptor .Fa fildes2 is specified. -If this descriptor is already in use, -the descriptor is first deallocated as if a +If +.Fa fildes +and +.Fa fildes2 +are equal, then +.Fn dup2 +just returns +.Fa fildes2 ; +no other changes are made to the existing descriptor. +Otherwise, if descriptor +.Fa fildes2 +is already in use, it is first deallocated as if a .Xr close 2 call had been done first. .Sh RETURN VALUES diff --git a/bsd/man/man2/exchangedata.2 b/bsd/man/man2/exchangedata.2 index cc2111ea4..83dc23c1b 100644 --- a/bsd/man/man2/exchangedata.2 +++ b/bsd/man/man2/exchangedata.2 @@ -24,8 +24,9 @@ .Nd atomically exchange data between two files .Sh SYNOPSIS .Fd #include <unistd.h> +.Fd #include <sys/attr.h> .Ft int -.Fn exchangedata "const char * path1" "const char * path2" "unsigned long options" +.Fn exchangedata "const char * path1" "const char * path2" "unsigned int options" . .Sh DESCRIPTION The diff --git a/bsd/man/man2/fcntl.2 b/bsd/man/man2/fcntl.2 index d6d1ce8cd..b53a38be1 100644 --- a/bsd/man/man2/fcntl.2 +++ b/bsd/man/man2/fcntl.2 @@ -1,5 +1,5 @@ .\" -.\" Copyright (c) 2008 Apple Inc. All rights reserved. +.\" Copyright (c) 2011 Apple Inc. All rights reserved. .\" .\" @APPLE_LICENSE_HEADER_START@ .\" @@ -56,7 +56,7 @@ .\" .\" @(#)fcntl.2 8.2 (Berkeley) 1/12/94 .\" -.Dd October 2, 2008 +.Dd February 17, 2011 .Dt FCNTL 2 .Os BSD 4.2 .Sh NAME @@ -98,26 +98,24 @@ Same file status flags (i.e., both file descriptors share the same file status flags). .It The close-on-exec flag associated with the new file descriptor -is set to remain open across +is cleared so that the descriptor remains open across an .Xr execv 2 -system calls. +system call. .El +.It Dv F_DUPFD_CLOEXEC +Like +.Dv F_DUPFD , +except that the close-on-exec flag associated with the new file descriptor +is set. .It Dv F_GETFD -Get the close-on-exec flag associated with the file descriptor -.Fa fildes . -If the low-order bit of the returned value is 0, -the file will remain open across -.Fn exec , -otherwise the file will be closed upon execution of -.Fn exec +Get the flags associated with the file descriptor +.Fa fildes , +as described below .Fa ( arg is ignored). .It Dv F_SETFD -Set the close-on-exec flag associated with -.Fa fildes -to the low order bit of -.Fa arg -(0 or 1 as above). +Set the file descriptor flags to +.Fa arg . .It Dv F_GETFL Get descriptor status flags, as described below .Fa ( arg @@ -187,6 +185,9 @@ Get disk device information. Currently this only includes the disk device address that corresponds to the current file offset. +.It Dv F_LOG2PHYS_EXT +Variant of F_LOG2PHYS that uses the passed in +file offset and length. .It Dv F_FULLFSYNC Does the same thing as .Xr fsync 2 @@ -200,6 +201,43 @@ and Universal Disk Format (UDF) file systems. The operation may take quite a while to complete. Certain FireWire drives have also been known to ignore the request to flush their buffered data. 
+.It Dv F_SETNOSIGPIPE +Determines whether a +.Dv SIGPIPE +signal will be generated when a write fails on a pipe or socket for +which there is no reader. If +.Fa arg +is non-zero, +.Dv SIGPIPE +generation is disabled for descriptor +.Fa fildes , +while an +.Fa arg +of zero enables it (the default). +.It Dv F_GETNOSIGPIPE +Returns whether a +.Dv SIGPIPE +signal will be generated when a write fails on a pipe or socket +for which there is no reader. The semantics of the return value +match those of the +.Fa arg +of +.Dv F_SETNOSIGPIPE . +.El +.Pp +The flags for the +.Dv F_GETFD +and +.Dv F_SETFD +commands are as follows: +.Bl -tag -width FD_CLOEXECX -offset indent +.It Dv FD_CLOEXEC +Close-on-exec; the given file descriptor will be automatically +closed in the successor process image when one of the +.Xr execv 2 +or +.Xr posix_spawn 2 +family of system calls is invoked. .El .Pp The flags for the @@ -476,15 +514,43 @@ commands operate on the following structure. .Pp The .Dv F_LOG2PHYS -command operates on the following structure. +command operates on the following structure: .ne 7v .Bd -literal struct log2phys { - u_int32_t l2p_flags; /* unused so far */ - off_t l2p_contigbytes; /* unused so far */ - off_t l2p_devoffset; /* bytes into device */ + u_int32_t l2p_flags; /* unused so far */ + off_t l2p_contigbytes; /* unused so far */ + off_t l2p_devoffset; /* bytes into device */ }; .Ed +.Pp +The +.Dv F_LOG2PHYS_EXT +command operates on the same structure as F_LOG2PHYS but treats it as an in/out: +.ne 7v +.Bd -literal + struct log2phys { + u_int32_t l2p_flags; /* unused so far */ + off_t l2p_contigbytes; /* IN: number of bytes to be queried; + OUT: number of contiguous bytes allocated at this position */ + off_t l2p_devoffset; /* IN: bytes into file; + OUT: bytes into device */ + }; +.Ed +.Pp +If +.Fa fildes +is a socket, then the +.Dv F_SETNOSIGPIPE +and +.Dv F_GETNOSIGPIPE +commands are directly analogous, and fully interoperate with the +.Dv SO_NOSIGPIPE +option of +.Xr setsockopt 2 +and +.Xr getsockopt 2 +respectively. .Sh RETURN VALUES Upon successful completion, the value returned depends on .Fa cmd @@ -579,6 +645,8 @@ The argument .Fa cmd is .Dv F_LOG2PHYS +or +.Dv F_LOG2PHYS_EXT and .Fa fildes is not a valid file descriptor open for reading. @@ -696,6 +764,9 @@ the process ID given as argument is not in use. .Xr flock 2 , .Xr getdtablesize 2 , .Xr open 2 , +.Xr pipe 2 , +.Xr socket 2 , +.Xr setsockopt 2 , .Xr sigaction 3 .Sh HISTORY The diff --git a/bsd/man/man2/getattrlist.2 b/bsd/man/man2/getattrlist.2 index 856f1a110..e2af8fc60 100644 --- a/bsd/man/man2/getattrlist.2 +++ b/bsd/man/man2/getattrlist.2 @@ -354,8 +354,11 @@ An structure containing the name of the file system object as UTF-8 encoded, null terminated C string. The attribute data length will not be greater than -.Dv NAME_MAX + -1. +.Dv NAME_MAX ++ 1 characters, which is +.Dv NAME_MAX +* 3 + 1 bytes (as one UTF-8-encoded character may +take up to three bytes). .Pp . .It ATTR_CMN_DEVID @@ -570,6 +573,11 @@ field of the .Vt stat structure returned by .Xr stat 2 . +Only the permission bits of +.Fa st_mode +are valid; other bits should be ignored, +e.g., by masking with +.Dv ~S_IFMT . . .It ATTR_CMN_NAMEDATTRCOUNT A @@ -665,6 +673,13 @@ The attribute data length will not be greater than Inconsistent behavior may be observed when this attribute is requested on hard-linked items, particularly when the file system does not support ATTR_CMN_PARENTID natively. 
Callers should be aware of this when requesting the full path of a hard-linked item. +. +.It ATTR_CMN_ADDEDTIME +A +.Vt timespec +that contains the time that the file system object was created or renamed into +its containing directory. Note that inconsistent behavior may be observed +when this attribute is requested on hard-linked items. .Pp . .El @@ -1288,6 +1303,13 @@ that did not support them. .Pp Introduced with Darwin 10.0 (Mac OS X version 10.6). . +.It VOL_CAP_FMT_64BIT_OBJECT_IDS +If this bit is set, the volume format uses object IDs that are 64-bit. +This means that ATTR_CMN_FILEID and ATTR_CMN_PARENTID are the only +legitimate attributes for obtaining object IDs from this volume and the +32-bit fid_objno fields of the fsobj_id_t returned by ATTR_CMN_OBJID, +ATTR_CMN_OBJPERMANENTID, and ATTR_CMN_PAROBJID are undefined. +. +.El .Pp . @@ -1602,6 +1624,10 @@ structure is 64-bits (two 32-bit elements) in 32-bit code, and 128-bits (two 64-bit elements) in 64-bit code; however, it is aligned on a 4-byte (32-bit) boundary, even in 64-bit code. .Pp +If you use a structure +for the attribute data, it must be correctly packed and aligned (see +examples). +.Pp . Inconsistent behavior may be observed when the ATTR_CMN_FULLPATH attribute is requested on hard-linked items, particularly when the file system does not support ATTR_CMN_PARENTID @@ -1633,7 +1659,7 @@ struct FInfoAttrBuf { u_int32_t length; fsobj_type_t objType; char finderInfo[32]; -}; +} __attribute__((aligned(4), packed)); typedef struct FInfoAttrBuf FInfoAttrBuf; .Pp . @@ -1700,14 +1726,14 @@ typedef struct attrlist attrlist_t; struct FInfo2CommonAttrBuf { fsobj_type_t objType; char finderInfo[32]; -}; +} __attribute__((aligned(4), packed)); typedef struct FInfo2CommonAttrBuf FInfo2CommonAttrBuf; .Pp . struct FInfo2AttrBuf { u_int32_t length; FInfo2CommonAttrBuf common; -}; +} __attribute__((aligned(4), packed)); typedef struct FInfo2AttrBuf FInfo2AttrBuf; .Pp . @@ -1790,7 +1816,7 @@ struct VolAttrBuf { attrreference_t volNameRef; char mountPointSpace[MAXPATHLEN]; char volNameSpace[MAXPATHLEN]; -}; +} __attribute__((aligned(4), packed)); typedef struct VolAttrBuf VolAttrBuf; .Pp . @@ -1843,6 +1869,53 @@ static int VolDemo(const char *path) } .Ed .Pp +The following sample demonstrates the need to use packing and alignment +controls; without the attribute, in 64-bit code, the fields of the structure are not +placed at the locations that the kernel expects. +. +.Bd -literal +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <err.h> +#include <time.h> +#include <sys/attr.h> +.Pp +/* The alignment and packing attribute is necessary in 64-bit code */ +struct AttrListTimes { + u_int32_t length; + struct timespec st_crtime; + struct timespec st_modtime; +} __attribute__((aligned(4), packed)); +.Pp +int main(int argc, char **argv) +{ + int rv; + int i; +.Pp + for (i = 1; i < argc; i++) { + struct attrlist attrList; + struct AttrListTimes myStat = {0}; + char *path = argv[i]; +.Pp + memset(&attrList, 0, sizeof(attrList)); + attrList.bitmapcount = ATTR_BIT_MAP_COUNT; + attrList.commonattr = ATTR_CMN_CRTIME | + ATTR_CMN_MODTIME; +.Pp + rv = getattrlist(path, &attrList, &myStat, sizeof(myStat), 0); +.Pp + if (rv == -1) { + warn("getattrlist(%s)", path); + continue; + } + printf("%s: Modification time = %s", argv[i], ctime(&myStat.st_modtime.tv_sec)); + } + return 0; +} +.Ed +.Pp . .Sh SEE ALSO .
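To tie the packing rules above to the worst-case UTF-8 name sizing discussed earlier for ATTR_CMN_NAME (NAME_MAX * 3 + 1 bytes), here is a minimal sketch that fetches a file's name; the struct name NameAttrBuf and its layout are illustrative assumptions, not part of the man page:

#include <sys/types.h>
#include <sys/attr.h>
#include <err.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/*
 * Length word, the attrreference_t for ATTR_CMN_NAME, then worst-case
 * room for a UTF-8 name.  Packing and alignment as described above.
 */
struct NameAttrBuf {
	u_int32_t       length;
	attrreference_t nameRef;
	char            nameSpace[NAME_MAX * 3 + 1];
} __attribute__((aligned(4), packed));

int
main(int argc, char **argv)
{
	struct attrlist    attrList;
	struct NameAttrBuf buf;
	int                i;

	for (i = 1; i < argc; i++) {
		memset(&attrList, 0, sizeof(attrList));
		attrList.bitmapcount = ATTR_BIT_MAP_COUNT;
		attrList.commonattr = ATTR_CMN_NAME;

		if (getattrlist(argv[i], &attrList, &buf,
		    sizeof(buf), 0) == -1) {
			warn("getattrlist(%s)", argv[i]);
			continue;
		}
		/* The name is stored at an offset relative to nameRef. */
		printf("%s: name = %s\n", argv[i],
		    (char *)&buf.nameRef + buf.nameRef.attr_dataoffset);
	}
	return (0);
}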
diff --git a/bsd/man/man2/getaudit.2 b/bsd/man/man2/getaudit.2 index 10f84aaf6..d2895cd33 100644 --- a/bsd/man/man2/getaudit.2 +++ b/bsd/man/man2/getaudit.2 @@ -59,7 +59,6 @@ The data structure is defined as follows: .nf .in +4n - struct auditinfo { au_id_t ai_auid; /* Audit user ID */ au_mask_t ai_mask; /* Audit masks */ @@ -74,15 +73,13 @@ The .Fa ai_auid variable contains the audit identifier which is recorded in the audit log for each event the process caused. -.PP - +.Pp The .Fa au_mask_t data structure defines the bit mask for auditing successful and failed events out of the predefined list of event classes. It is defined as follows: .nf .in +4n - struct au_mask { unsigned int am_success; /* success bits */ unsigned int am_failure; /* failure bits */ @@ -90,15 +87,13 @@ struct au_mask { typedef struct au_mask au_mask_t; .in .fi -.PP - +.Pp The .Fa au_termid_t data structure defines the Terminal ID recorded with every event caused by the process. It is defined as follows: .nf .in +4n - struct au_tid { dev_t port; u_int32_t machine; @@ -106,8 +101,7 @@ struct au_tid { typedef struct au_tid au_tid_t; .in .fi -.PP - +.Pp The .Fa ai_asid variable contains the audit session ID which is recorded with every event @@ -122,7 +116,6 @@ data structure supports Terminal IDs with larger addresses such as those used in IP version 6. It is defined as follows: .nf .in +4n - struct auditinfo_addr { au_id_t ai_auid; /* Audit user ID. */ au_mask_t ai_mask; /* Audit masks. */ @@ -134,14 +127,12 @@ typedef struct auditinfo_addr auditinfo_addr_t; .in .fi .Pp - The .Fa au_tid_addr_t data structure which includes a larger address storage field and an additional field with the type of address stored: .nf .in +4n - struct au_tid_addr { dev_t at_port; u_int32_t at_type; diff --git a/bsd/man/man2/getauid.2 b/bsd/man/man2/getauid.2 index a89d98aae..373deb2a0 100644 --- a/bsd/man/man2/getauid.2 +++ b/bsd/man/man2/getauid.2 @@ -25,7 +25,7 @@ .Os .Sh NAME .Nm getauid -.Nd "retrieve audit session ID" +.Nd "retrieve audit user ID" .Sh SYNOPSIS .In bsm/audit.h .Ft int @@ -34,7 +34,7 @@ The .Fn getauid system call -retrieves the active audit session ID for the current process via the +retrieves the active audit user ID for the current process via the .Vt au_id_t pointed to by .Fa auid . diff --git a/bsd/man/man2/getdirentries.2 b/bsd/man/man2/getdirentries.2 index a77a5d8b6..a513ea8e8 100644 --- a/bsd/man/man2/getdirentries.2 +++ b/bsd/man/man2/getdirentries.2 @@ -67,14 +67,8 @@ with buffers smaller than this size. .Pp The data in the buffer is a series of .Em dirent -structures each containing the following entries: -.Bd -literal -offset indent -u_int32_t d_fileno; /* file number of entry */ -u_int16_t d_reclen; /* length of this record */ -u_int8_t d_type; /* file type, see below */ -u_int8_t d_namlen; /* length of string in d_name */ -char d_name[MAXNAMELEN + 1]; /* see below */ -.Ed +structures (see +.Xr dir 5 ) .Pp The .Fa d_fileno @@ -166,7 +160,11 @@ will not work with 64-bit inodes; in order to use .Fn getdirentries , .Dv _DARWIN_NO_64_BIT_INODE -must be defined. +must be defined. See +.Xr stat 2 +for more information on +.Dv _DARWIN_NO_64_BIT_INODE +and its other effects. .Sh RETURN VALUES If successful, the number of bytes actually transferred is returned. Otherwise, -1 is returned and the global variable @@ -193,8 +191,10 @@ error occurred while reading from or writing to the file system. 
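A minimal sketch of the legacy interface described above, assuming _DARWIN_NO_64_BIT_INODE is defined before any header is included so that the 32-bit-inode variants of getdirentries and struct dirent are selected:

#define _DARWIN_NO_64_BIT_INODE	/* required, per the text above */

#include <dirent.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[4096];		/* at least one file system block */
	long base = 0;
	int fd, nbytes;

	if ((fd = open(".", O_RDONLY)) == -1)
		err(1, "open");

	while ((nbytes = getdirentries(fd, buf, sizeof(buf), &base)) > 0) {
		char *p = buf;

		while (p < buf + nbytes) {
			struct dirent *dp = (struct dirent *)p;

			printf("%u %s\n", dp->d_fileno, dp->d_name);
			p += dp->d_reclen;	/* d_reclen includes padding */
		}
	}
	if (nbytes == -1)
		err(1, "getdirentries");
	close(fd);
	return (0);
}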
.Sh SEE ALSO .Xr lseek 2 , .Xr open 2 , +.Xr stat 2 , .Xr opendir 3 , -.Xr readdir 3 +.Xr readdir 3 , +.Xr dir 5 .Sh HISTORY The .Fn getdirentries diff --git a/bsd/man/man2/getdirentriesattr.2 b/bsd/man/man2/getdirentriesattr.2 index a2cc333ff..78a839766 100644 --- a/bsd/man/man2/getdirentriesattr.2 +++ b/bsd/man/man2/getdirentriesattr.2 @@ -163,7 +163,7 @@ However, since the variable is too small to hold an .Vt off_t , you should use .Xr lseek 2 -to get the directoy's current position instead of using this parameter. +to get the directory's current position instead of using this parameter. The initial value of the variable is ignored. .Pp . @@ -245,6 +245,16 @@ If you're implementing a volume format that supports .Fn getdirentriesattr , you should be careful to support the behaviour specified by this document. . +.Pp +If the directory contains a mount point, then +.Dv DIR_MNTSTATUS_MNTPOINT +will be set in the +.Dv ATTR_DIR_MOUNTSTATUS +for that entry; all other attributes for that entry, however, +will be for the underlying file system (as opposed to the mounted +file system). +.Xr getattrlist 2 +should be used to get the attributes for the mount point. .Sh ERRORS .Fn getdirentriesattr will fail if: @@ -315,7 +325,8 @@ struct FInfoAttrBuf { attrreference_t name; fsobj_type_t objType; char finderInfo[32]; -}; + u_int32_t dirStatus; +} __attribute__((aligned(4), packed)); typedef struct FInfoAttrBuf FInfoAttrBuf; .Pp . @@ -358,6 +369,7 @@ static int FInfoDemo(const char *dirPath) attrList.commonattr = ATTR_CMN_NAME | ATTR_CMN_OBJTYPE | ATTR_CMN_FNDRINFO; + attrList.dirattr = ATTR_DIR_MOUNTSTATUS; .Pp err = 0; @@ -411,7 +423,10 @@ static int FInfoDemo(const char *dirPath) ); break; case VDIR: - printf("directory "); + if (thisEntry->dirStatus & DIR_MNTSTATUS_MNTPOINT) + printf("mount-point "); + else + printf("directory "); break; default: printf( @@ -428,7 +443,7 @@ static int FInfoDemo(const char *dirPath) .Pp // Advance to the next entry. .Pp - ((char *) thisEntry) += thisEntry->length; + thisEntry = (FInfoAttrBuf*)((char*)thisEntry + thisEntry->length); } } } while ( err == 0 && ! done ); diff --git a/bsd/man/man2/getdtablesize.2 b/bsd/man/man2/getdtablesize.2 new file mode 100644 index 000000000..7465f9aeb --- /dev/null +++ b/bsd/man/man2/getdtablesize.2 @@ -0,0 +1,63 @@ +.\" Copyright (c) 1983, 1991, 1993 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. All advertising materials mentioning features or use of this software +.\" must display the following acknowledgement: +.\" This product includes software developed by the University of +.\" California, Berkeley and its contributors. +.\" 4. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. 
+.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)getdtablesize.2 8.1 (Berkeley) 6/4/93 +.\" $FreeBSD: src/lib/libc/sys/getdtablesize.2,v 1.4.2.3 2001/12/14 18:34:00 ru Exp $ +.\" +.Dd June 4, 1993 +.Dt GETDTABLESIZE 2 +.Os +.Sh NAME +.Nm getdtablesize +.Nd get descriptor table size +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In unistd.h +.Ft int +.Fn getdtablesize void +.Sh DESCRIPTION +Each process has a fixed size descriptor table, +which is guaranteed to have at least 20 slots. The entries in +the descriptor table are numbered with small integers starting at 0. +The call +.Fn getdtablesize +returns the size of this table. +.Sh SEE ALSO +.Xr close 2 , +.Xr dup 2 , +.Xr open 2 , +.Xr select 2 +.Sh HISTORY +The +.Fn getdtablesize +function call appeared in +.Bx 4.2 . diff --git a/bsd/man/man2/getfsstat.2 b/bsd/man/man2/getfsstat.2 index 47e792b60..99e2abaf6 100644 --- a/bsd/man/man2/getfsstat.2 +++ b/bsd/man/man2/getfsstat.2 @@ -56,111 +56,15 @@ function returns information about all mounted file systems. The .Fa buf argument is a pointer to an array of -.Xr statfs -structures. -.Pp -As of Mac OS X 10.6, the default size of the -.Ft ino_t -type is 64 bits (the macro -.Dv _DARWIN_FEATURE_64_BIT_INODE -will be defined). -While there is no -.Ft ino_t -type used in the -.Xr statfs -structure, the changes to -.Fn getfsstat -are grouped together with the 64-bit inode changes. -The string fields in the -.Xr statfs -structure are larger and the variant symbol -.Li _getfsstat$INODE64 -will be automatically used. 
-The -.Xr statfs -structure is defined as: -.Bd -literal -typedef struct { int32_t val[2]; } fsid_t; - -#define MFSTYPENAMELEN 16 /* length of fs type name including null */ -#define MAXPATHLEN 1024 -#define MNAMELEN MAXPATHLEN - -struct statfs { /* when _DARWIN_FEATURE_64_BIT_INODE is defined */ - uint32_t f_bsize; /* fundamental file system block size */ - int32_t f_iosize; /* optimal transfer block size */ - uint64_t f_blocks; /* total data blocks in file system */ - uint64_t f_bfree; /* free blocks in fs */ - uint64_t f_bavail; /* free blocks avail to non-superuser */ - uint64_t f_files; /* total file nodes in file system */ - uint64_t f_ffree; /* free file nodes in fs */ - fsid_t f_fsid; /* file system id */ - uid_t f_owner; /* user that mounted the filesystem */ - uint32_t f_type; /* type of filesystem */ - uint32_t f_flags; /* copy of mount exported flags */ - uint32_t f_fssubtype; /* fs sub-type (flavor) */ - char f_fstypename[MFSTYPENAMELEN]; /* fs type name */ - char f_mntonname[MAXPATHLEN]; /* directory on which mounted */ - char f_mntfromname[MAXPATHLEN]; /* mounted filesystem */ - uint32_t f_reserved[8]; /* For future use */ -}; -.Ed -.Pp -(In 10.5, 64-bit -.Ft ino_t , -larger -.Xr statfs -structure and variant symbol were available if the macro -.Dv _DARWIN_USE_64_BIT_INODE -is defined before any header files are included; this macro is optional in -10.6.) -.Pp -If the macro -.Dv _DARWIN_NO_64_BIT_INODE -is defined before any header files are included, or if the deployment target -is less than 10.6, the legacy -.Xr statfs -structure will be in effect. -The -.Ft ino_t -type will be 32 bits (the -.Dv _DARWIN_FEATURE_64_BIT_INODE -macro will not be defined), the strings in the -.Xr statfs -structure will be their smaller legacy size (and long mount paths may no longer -fit) and the undecorated symbol -.Li _getfsstat -will be used. -This legacy .Fa statfs -structure is defined as: -.Bd -literal -#define MFSNAMELEN 15 /* length of fs type name, not inc. nul */ -#define MNAMELEN 90 /* length of buffer for returned name */ - -struct statfs { /* when _DARWIN_FEATURE_64_BIT_INODE is NOT defined */ - short f_otype; /* type of file system (reserved: zero) */ - short f_oflags; /* copy of mount flags (reserved: zero) */ - long f_bsize; /* fundamental file system block size */ - long f_iosize; /* optimal transfer block size */ - long f_blocks; /* total data blocks in file system */ - long f_bfree; /* free blocks in fs */ - long f_bavail; /* free blocks avail to non-superuser */ - long f_files; /* total file nodes in file system */ - long f_ffree; /* free file nodes in fs */ - fsid_t f_fsid; /* file system id */ - uid_t f_owner; /* user that mounted the file system */ - short f_reserved1; /* reserved for future use */ - short f_type; /* type of file system (reserved) */ - long f_flags; /* copy of mount flags (reserved) */ - long f_reserved2[2]; /* reserved for future use */ - char f_fstypename[MFSNAMELEN]; /* fs type name */ - char f_mntonname[MNAMELEN]; /* directory on which mounted */ - char f_mntfromname[MNAMELEN]; /* mounted file system */ - char f_reserved3; /* reserved for future use */ - long f_reserved4[4]; /* reserved for future use */ -}; -.Ed +structures (see +.Xr statfs 2 ) . +As +.Xr statfs 2 +indicates, the structure is defined differently depending on +whether the macro _DARWIN_FEATURE_64_BIT_INODE is defined (see +.Xr stat 2 +for more information on this macro). .Pp Fields that are undefined for a particular file system are set to -1. 
The buffer is filled with an array of @@ -223,6 +127,7 @@ routine is equivalent to the default is defined), so there is no longer any reason to use it (it will be removed in the future). .Sh SEE ALSO +.Xr stat 2 , .Xr statfs 2 , .Xr fstab 5 , .Xr mount 8 diff --git a/bsd/man/man2/getgroups.2 b/bsd/man/man2/getgroups.2 index f2a9e995d..a941bc389 100644 --- a/bsd/man/man2/getgroups.2 +++ b/bsd/man/man2/getgroups.2 @@ -1,5 +1,5 @@ .\" -.\" Copyright (c) 2008 Apple Inc. All rights reserved. +.\" Copyright (c) 2008, 2010 Apple Inc. All rights reserved. .\" .\" @APPLE_LICENSE_HEADER_START@ .\" @@ -56,7 +56,7 @@ .\" .\" @(#)getgroups.2 8.2 (Berkeley) 4/16/94 .\" -.Dd October 2, 2008 +.Dd September 17, 2010 .Dt GETGROUPS 2 .Os BSD 4.2 .Sh NAME @@ -90,6 +90,28 @@ is 0, returns the number of groups without modifying the .Fa grouplist[] array. +.Pp +To provide compatibility with applications that use +.Fn getgroups +in environments where users may be in more than +.Dv {NGROUPS_MAX} +groups, a variant of +.Fn getgroups , +obtained when compiling with either of the macros +.Dv _DARWIN_UNLIMITED_GETGROUPS +or +.Dv _DARWIN_C_SOURCE +defined, can be used that is not limited to +.Dv {NGROUPS_MAX} +groups. +However, this variant only returns the user's default group access list and +not the group list modified by a call to +.Xr setgroups 2 +(either in the current process or an ancestor process). +Use of +.Xr setgroups 2 +is highly discouraged, and there is no foolproof way to determine if it has +been previously called. .Sh RETURN VALUES A successful call returns the number of groups in the group set. Otherwise, a value of -1 is returned and the global integer variable @@ -112,12 +134,6 @@ The argument although non-zero, is smaller than the number of groups in the group set. .El -.Sh LEGACY DESCRIPTION -If _DARWIN_C_SOURCE is defined, -.Fn getgroups -can return more than -.Dv {NGROUPS_MAX} -groups. .Sh LEGACY SYNOPSIS .Fd #include <sys/param.h> .Fd #include <sys/types.h> diff --git a/bsd/man/man2/gettimeofday.2 b/bsd/man/man2/gettimeofday.2 index 96659f100..a9b300555 100644 --- a/bsd/man/man2/gettimeofday.2 +++ b/bsd/man/man2/gettimeofday.2 @@ -53,7 +53,6 @@ .Fa "const struct timezone *tzp" .Fc .Sh DESCRIPTION -.Ef .Pp The system's notion of the current Greenwich time and the current time zone is obtained with the diff --git a/bsd/man/man2/kqueue.2 b/bsd/man/man2/kqueue.2 index f7a12d523..6ab998c5a 100644 --- a/bsd/man/man2/kqueue.2 +++ b/bsd/man/man2/kqueue.2 @@ -74,7 +74,7 @@ and The .Fn kqueue system call -provides a generic method of notifying the user when an kernel +provides a generic method of notifying the user when a kernel event (kevent) happens or a condition holds, based on the results of small pieces of kernel code termed filters. A kevent is identified by an (ident, filter) pair and specifies @@ -267,7 +267,7 @@ the descriptor. .It EV_RECEIPT This flag is useful for making bulk changes to a kqueue without draining any pending events. When passed as input, it forces EV_ERROR to always be returned. -When a filter is successfully added. The +When a filter is successfully added, the .Va data field will be zero. .It EV_ONESHOT @@ -433,6 +433,8 @@ The events to monitor are: .Bl -tag -width NOTE_SIGNAL .It NOTE_EXIT The process has exited. +.It NOTE_EXITSTATUS +The process has exited and its exit status is in filter-specific data. Valid only on child processes and to be used along with NOTE_EXIT.
.It NOTE_FORK The process created a child process via .Xr fork 2 @@ -507,42 +509,6 @@ contains the number of times the timeout has expired since the last call to or .Fn kevent64 . This filter automatically sets the EV_CLEAR flag internally. -.It EVFILT_SESSION -Takes the audit session ID to monitor as the identifier and the events to watch for in -.Va fflags , -and returns when one or more of the requested session events occurs. -To monitor for events for any audit session the value AS_ANY_ASID -should be used as the identifier. With AS_ANY_ASID, as new audit -sessions are created they are included as if the were added -individually. The events to monitor are: -.Bl -tag -width NOTE_AS_UPDATE -.It NOTE_AS_START -A new audit session has started. -.It NOTE_AS_END -All the processes in the audit session have exited. -.It NOTE_AS_CLOSE -This audit session is no longer valid in the kernel. In other words, it -is now safe to dispose of any cached information about this session or -reuse its session ID for a new audit session. -.It NOTE_AS_UPDATE -The audit session information was updated. The audit session information is -considered immutable once initially set. If this becomes enforced in -the kernel then this event may no longer be needed and may become -obsolete. -.It NOTE_AS_ERR -This flag is returned if the system was unable to attach an event to a -new session when the audit session ID of AS_ANY_ASID -is used. This is usually due to resource limitations. -.El -.Pp -On return, -.Va fflags -contains the events which triggered the filter, -.Va ident -contains the audit session ID, and -.Va data -contains the audit user ID. -This filter automatically sets the EV_CLEAR flag internally. .El .Pp ---- diff --git a/bsd/man/man2/madvise.2 b/bsd/man/man2/madvise.2 index a4b4d415d..9f89c32ba 100644 --- a/bsd/man/man2/madvise.2 +++ b/bsd/man/man2/madvise.2 @@ -126,7 +126,7 @@ This is used with system call. .It Dv MADV_ZERO_WIRED_PAGES Indicates that the application would like the wired pages in this address -range to be zeroed out if the address range is dellocated without first +range to be zeroed out if the address range is deallocated without first unwiring the pages (i.e. a munmap(2) without a preceding munlock(2) or the application quits). This is used with diff --git a/bsd/man/man2/mmap.2 b/bsd/man/man2/mmap.2 index af9de8c04..b55d054e1 100644 --- a/bsd/man/man2/mmap.2 +++ b/bsd/man/man2/mmap.2 @@ -148,6 +148,15 @@ VM_MAKE_TAG(tag) to associate an 8-bit tag with the region <mach/vm_statistics.h> defines some preset tags (with a VM_MEMORY_ prefix). Users are encouraged to use tags between 240 and 255. Tags are used by tools such as vmmap(1) to help identify specific memory regions. +.Pp +VM_FLAGS_SUPERPAGE_SIZE_* to use superpages for the allocation. +See <mach/vm_statistics.h> for supported architectures and sizes (or use +VM_FLAGS_SUPERPAGE_SIZE_ANY to have the kernel choose a size). +The specified size must be divisible by the superpage size (except for +VM_FLAGS_SUPERPAGE_SIZE_ANY), and if you use MAP_FIXED, the specified address +must be properly aligned. If the system cannot satisfy the request with superpages, +the call will fail. Note that currently, superpages are always wired and not +inherited by children of the process. .It Dv MAP_FILE Mapped from a regular file. (This is the default mapping type, and need not be specified.) 
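The superpage flags documented in the mmap(2) hunk above are easiest to see in context. The following is a minimal sketch (not part of the patch); it assumes a Darwin host where <mach/vm_statistics.h> declares VM_FLAGS_SUPERPAGE_SIZE_ANY and where, for anonymous mappings, mmap's fd argument carries the VM flags:

#include <mach/vm_statistics.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 2 * 1024 * 1024;   /* must be a multiple of the superpage size */

    /* For MAP_ANON mappings on Darwin, the fd argument carries VM flags. */
    void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_ANON | MAP_PRIVATE, VM_FLAGS_SUPERPAGE_SIZE_ANY, 0);
    if (p == MAP_FAILED) {
        perror("mmap");             /* fails if superpages cannot satisfy the request */
        return 1;
    }
    munmap(p, len);
    return 0;
}

As the page notes, such memory is currently always wired and is not inherited by children of the process, so this is only appropriate for process-local buffers.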
diff --git a/bsd/man/man2/open.2 b/bsd/man/man2/open.2 index 2d121402d..80c293626 100644 --- a/bsd/man/man2/open.2 +++ b/bsd/man/man2/open.2 @@ -1,5 +1,5 @@ .\" -.\" Copyright (c) 2008 Apple Inc. All rights reserved. +.\" Copyright (c) 2010 Apple Inc. All rights reserved. .\" .\" @APPLE_LICENSE_HEADER_START@ .\" @@ -56,7 +56,7 @@ .\" .\" @(#)open.2 8.2 (Berkeley) 11/16/93 .\" -.Dd October 7, 2008 +.Dd November 10, 2010 .Dt OPEN 2 .Os BSD 4 .Sh NAME @@ -114,6 +114,7 @@ O_EXLOCK atomically obtain an exclusive lock O_NOFOLLOW do not follow symlinks O_SYMLINK allow open of symlinks O_EVTONLY descriptor requested for event notifications only +O_CLOEXEC mark as close-on-exec .Ed .Pp Opening a file with @@ -133,7 +134,9 @@ returns an error. This may be used to implement a simple exclusive-access locking mechanism. If .Dv O_EXCL -is set and the last component of the pathname is a symbolic link, +is set with +.Dv O_CREAT +and the last component of the pathname is a symbolic link, .Fn open will fail even if the symbolic link points to a non-existent name. .Pp @@ -184,6 +187,15 @@ flag is only intended for monitoring a file for changes (e.g. kqueue). Note: when this flag is used, the opened file will not prevent an unmount of the volume that contains the file. .Pp +The +.Dv O_CLOEXEC +flag causes the file descriptor to be marked as close-on-exec, +setting the +.Dv FD_CLOEXEC +flag. The state of the file descriptor flags can be inspected +using the F_GETFD fcntl. See +.Xr fcntl 2 . +.Pp If successful, .Fn open returns a non-negative integer, termed a file descriptor. diff --git a/bsd/man/man2/pathconf.2 b/bsd/man/man2/pathconf.2 index afe640327..9743384ce 100644 --- a/bsd/man/man2/pathconf.2 +++ b/bsd/man/man2/pathconf.2 @@ -103,6 +103,16 @@ system call, otherwise 0. Return 1 if file names longer than KERN_NAME_MAX are truncated. .It Li _PC_VDISABLE Returns the terminal character disabling value. +.It Li _PC_XATTR_SIZE_BITS +Returns the number of bits used to store the maximum extended +attribute size in bytes. For example, if the maximum +attribute size supported by a file system is 128K, the +value returned will be 18. However, a value of 18 only bounds the +limit: the actual maximum attribute size can be anywhere from +128KB to (256KB - 1). As a special case, the resource +fork can have a much larger size, and some file-system-specific +extended attributes can have a smaller, preset +size; for example, Finder Info is always 32 bytes. .El .Sh RETURN VALUES If the call to diff --git a/bsd/man/man2/pipe.2 b/bsd/man/man2/pipe.2 index 03f12c196..df5b9d85c 100644 --- a/bsd/man/man2/pipe.2 +++ b/bsd/man/man2/pipe.2 @@ -33,7 +33,7 @@ .\" .\" @(#)pipe.2 8.1 (Berkeley) 6/4/93 .\" -.Dd June 4, 1993 +.Dd February 17, 2011 .Dt PIPE 2 .Os BSD 4 .Sh NAME @@ -82,6 +82,12 @@ signal. Widowing a pipe is the only way to deliver end-of-file to a reader: after the reader consumes any buffered data, reading a widowed pipe returns a zero count. +.Pp +The generation of the +.Dv SIGPIPE +signal can be suppressed using the +.Dv F_SETNOSIGPIPE +fcntl command. .Sh RETURN VALUES On successful creation of the pipe, zero is returned. Otherwise, a value of -1 is returned and the variable @@ -111,6 +117,7 @@ The system file table is full. .Xr fork 2 , .Xr read 2 , .Xr socketpair 2 , +.Xr fcntl 2 , .Xr write 2 .Sh HISTORY A diff --git a/bsd/man/man2/posix_spawn.2 b/bsd/man/man2/posix_spawn.2 index 76bfa055b..6a940d5c1 100644 --- a/bsd/man/man2/posix_spawn.2 +++ b/bsd/man/man2/posix_spawn.2 @@ -1,5 +1,5 @@ .\" -.\" Copyright (c) 2000-2007 Apple Inc.
All rights reserved. +.\" Copyright (c) 2000-2010 Apple Inc. All rights reserved. .\" .\" @APPLE_OSREFERENCE_LICENSE_HEADER_START@ .\" @@ -27,7 +27,7 @@ .\" .\" @(#)posix_spawn.2 . -.Dd August 9, 2007 +.Dd November 2, 2010 .Dt POSIX_SPAWN 2 .Os "Mac OS X" .Sh NAME @@ -95,7 +95,7 @@ spawned process. The value is undefined in the case of a failure. .Pp The argument .Fa file_actions -is either NULL, or it is a a pointer to a file actions object that was +is either NULL, or it is a pointer to a file actions object that was initialized by a call to .Xr posix_spawn_file_actions_init 3 and represents zero or more file actions. @@ -108,9 +108,12 @@ and .Xr fcntl 2 ) . Descriptors that remain open are unaffected by .Fn posix_spawn -unless their behaviour is modified by a file action; see +unless their behaviour is modified by particular spawn flags +or a file action; see +.Xr posix_spawnattr_setflags 3 +and .Xr posix_spawn_file_actions_init 3 -for more information. +for additional information. .Pp The argument .Fa attrp diff --git a/bsd/man/man2/quotactl.2 b/bsd/man/man2/quotactl.2 index c60519313..4a4760dc6 100644 --- a/bsd/man/man2/quotactl.2 +++ b/bsd/man/man2/quotactl.2 @@ -64,7 +64,6 @@ The address of an optional command specific data structure, may be given; its interpretation is discussed below with each command. .Pp -Currently quotas are supported only for the "ffs" and "hfs" filesystems. A command is composed of a primary command (see below) and a command type used to interpret the .Fa id . diff --git a/bsd/man/man2/sem_close.2 b/bsd/man/man2/sem_close.2 new file mode 100644 index 000000000..cdff87c7c --- /dev/null +++ b/bsd/man/man2/sem_close.2 @@ -0,0 +1,60 @@ +.\" $Darwin$ +.\" +.\" Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. +.\" +.\" @APPLE_LICENSE_HEADER_START@ +.\" +.\" The contents of this file constitute Original Code as defined in and +.\" are subject to the Apple Public Source License Version 1.1 (the +.\" "License"). You may not use this file except in compliance with the +.\" License. Please obtain a copy of the License at +.\" http://www.apple.com/publicsource and read it before using this file. +.\" +.\" This Original Code and all software distributed under the License are +.\" distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the +.\" License for the specific language governing rights and limitations +.\" under the License. +.\" +.\" @APPLE_LICENSE_HEADER_END@ +.\" +.Dd June 8, 2000 +.Dt SEM_CLOSE 2 +.Os Darwin +.Sh NAME +.Nm sem_close +.Nd close a named semaphore +.Sh SYNOPSIS +.Fd #include <semaphore.h> +.Ft int +.Fn sem_close "sem_t *sem" +.Sh DESCRIPTION +The system resources associated with the named semaphore referenced by +.Fa sem +are deallocated and the descriptor is invalidated. +.Pp +If successful, +.Fn sem_close +will return 0. Otherwise, -1 is returned and +.Va errno +is set. +.Sh ERRORS +.Fn sem_close +succeeds unless: +.Bl -tag -width Er +.It Bq Er EINVAL +.Fa sem +is not a valid semaphore descriptor. +.El +.Sh SEE ALSO +.Xr sem_init 2 , +.Xr sem_open 2 , +.Xr sem_unlink 2 , +.Xr semctl 2 , +.Xr semget 2 , +.Xr semop 2 +.Sh HISTORY +.Fn sem_close +is specified in the POSIX Realtime Extension (1003.1b-1993/1003.1i-1995). 
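The O_CLOEXEC and F_SETNOSIGPIPE additions documented in the open(2) and pipe(2) hunks earlier in this patch combine naturally. A minimal sketch (not from the man pages), assuming a Darwin host where <fcntl.h> declares both; the file path is arbitrary:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    /* Mark the descriptor close-on-exec at open time (sets FD_CLOEXEC). */
    int fd = open("/etc/hosts", O_RDONLY | O_CLOEXEC);
    if (fd < 0) { perror("open"); return 1; }
    printf("FD_CLOEXEC set: %d\n", (fcntl(fd, F_GETFD) & FD_CLOEXEC) != 0);
    close(fd);

    /* Suppress SIGPIPE on the write end of a pipe; writing to a widowed
     * pipe then fails with EPIPE instead of raising the signal. */
    int p[2];
    if (pipe(p) < 0) { perror("pipe"); return 1; }
    if (fcntl(p[1], F_SETNOSIGPIPE, 1) < 0)
        perror("fcntl(F_SETNOSIGPIPE)");  /* Darwin-specific fcntl command */
    close(p[0]);
    close(p[1]);
    return 0;
}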
diff --git a/bsd/man/man2/sem_open.2 b/bsd/man/man2/sem_open.2 new file mode 100644 index 000000000..423e98ae4 --- /dev/null +++ b/bsd/man/man2/sem_open.2 @@ -0,0 +1,169 @@ +.\" $Darwin$ +.\" +.\" Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. +.\" +.\" @APPLE_LICENSE_HEADER_START@ +.\" +.\" The contents of this file constitute Original Code as defined in and +.\" are subject to the Apple Public Source License Version 1.1 (the +.\" "License"). You may not use this file except in compliance with the +.\" License. Please obtain a copy of the License at +.\" http://www.apple.com/publicsource and read it before using this file. +.\" +.\" This Original Code and all software distributed under the License are +.\" distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the +.\" License for the specific language governing rights and limitations +.\" under the License. +.\" +.\" @APPLE_LICENSE_HEADER_END@ +.\" +.Dd June 8, 2000 +.Dt SEM_OPEN 2 +.Os Darwin +.Sh NAME +.Nm sem_open +.Nd initialize and open a named semaphore +.Sh SYNOPSIS +.Fd #include <semaphore.h> +.Ft sem_t * +.Fo sem_open +.Fa "const char *name" +.Fa "int oflag" +.Fa "..." +.Fc +.Pp +The parameters "mode_t mode" and "unsigned int value" +are optional. +.Sh DESCRIPTION +The named semaphore specified by +.Fa name +is initialized and opened as specified by the argument +.Fa oflag +and a semaphore descriptor is returned to the calling process. +.Pp +The value of +.Fa oflag +is formed by +.Em or Ns 'ing +the following values: +.Pp +.Bd -literal -offset indent -compact +O_CREAT create the semaphore if it does not exist +O_EXCL error if create and semaphore exists +.Ed +.Pp +If +.Dv O_CREAT +is specified, +.Fn sem_open +requires an additional two arguments. +.Fa mode +specifies the permissions for the semaphore as described in +.Xr chmod 2 +and modified by the process' umask value (see +.Xr umask 2 ) . +The semaphore is created with an initial +.Fa value , +which must be less than or equal to +.Dv SEM_VALUE_MAX . +.Pp +If +.Dv O_EXCL +is specified and the semaphore exists, +.Fn sem_open +fails. The check for the existence of the semaphore and the creation +of the semaphore are atomic with respect to all processes calling +.Fn sem_open +with +.Dv O_CREAT +and +.Dv O_EXCL +set. +.Pp +When a new semaphore is created, it is given the user ID and group ID +which correspond to the effective user and group IDs of the calling +process. There is no visible entry in the file system for the created +object in this implementation. +.Pp +The returned semaphore descriptor is available to the calling process +until it is closed with +.Fn sem_close , +or until the caller exits or execs. +.Pp +If a process makes repeated calls to +.Fn sem_open , +with the same +.Fa name +argument, the same descriptor is returned for each successful call, +unless +.Fn sem_unlink +has been called on the semaphore in the interim. +.Pp +If +.Fn sem_open +fails for any reason, it will return a value of +.Dv SEM_FAILED +and set +.Va errno . +On success, it returns a semaphore descriptor.
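A minimal usage sketch of sem_open() as described above (not from the page itself); the semaphore name "/example.sem" is hypothetical, and with O_CREAT the extra mode and value arguments are required, as the text states:

#include <fcntl.h>      /* O_CREAT */
#include <semaphore.h>
#include <stdio.h>

int main(void)
{
    /* Create-or-open a named semaphore with mode 0644 and initial value 1. */
    sem_t *sem = sem_open("/example.sem", O_CREAT, 0644, 1);
    if (sem == SEM_FAILED) {
        perror("sem_open");
        return 1;
    }
    sem_close(sem);                 /* drop this process's descriptor */
    sem_unlink("/example.sem");     /* remove the name system-wide */
    return 0;
}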
+.Sh ERRORS +The named semaphore is opened unless: +.Bl -tag -width Er +.It Bq Er EACCES +The required permissions (for reading and/or writing) +are denied for the given flags; or +.Dv O_CREAT +is specified, the object does not exist, and permission to +create the semaphore is denied. +.It Bq Er EEXIST +.Dv O_CREAT +and +.Dv O_EXCL +were specified and the semaphore exists. +.It Bq Er EINTR +The +.Fn sem_open +operation was interrupted by a signal. +.It Bq Er EINVAL +The +.Fn sem_open +operation is not supported; or +.Dv O_CREAT +is specified and +.Fa value +exceeds +.Dv SEM_VALUE_MAX . +.It Bq Er EMFILE +The process has already reached its limit for semaphores or file +descriptors in use. +.It Bq Er ENAMETOOLONG +.Fa name +exceeded +.Dv SEM_NAME_LEN +characters. +.It Bq Er ENFILE +Too many semaphores or file descriptors are open on the system. +.It Bq Er ENOENT +.Dv O_CREAT +is not set and the named semaphore does not exist. +.It Bq Er ENOSPC +.Dv O_CREAT +is specified, the file does not exist, and there is insufficient +space available to create the semaphore. +.El +.Sh SEE ALSO +.Xr sem_close 2 , +.Xr sem_post 2 , +.Xr sem_trywait 2 , +.Xr sem_unlink 2 , +.Xr sem_wait 2 , +.Xr semctl 2 , +.Xr semget 2 , +.Xr semop 2 , +.Xr umask 2 +.Sh HISTORY +.Fn sem_open +is specified in the POSIX Realtime Extension (1003.1b-1993/1003.1i-1995). diff --git a/bsd/man/man2/sem_post.2 b/bsd/man/man2/sem_post.2 new file mode 100644 index 000000000..36d06fde8 --- /dev/null +++ b/bsd/man/man2/sem_post.2 @@ -0,0 +1,65 @@ +.\" $Darwin$ +.\" +.\" Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. +.\" +.\" @APPLE_LICENSE_HEADER_START@ +.\" +.\" The contents of this file constitute Original Code as defined in and +.\" are subject to the Apple Public Source License Version 1.1 (the +.\" "License"). You may not use this file except in compliance with the +.\" License. Please obtain a copy of the License at +.\" http://www.apple.com/publicsource and read it before using this file. +.\" +.\" This Original Code and all software distributed under the License are +.\" distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the +.\" License for the specific language governing rights and limitations +.\" under the License. +.\" +.\" @APPLE_LICENSE_HEADER_END@ +.\" +.Dd June 8, 2000 +.Dt SEM_POST 2 +.Os Darwin +.Sh NAME +.Nm sem_post +.Nd unlock a semaphore +.Sh SYNOPSIS +.Fd #include <semaphore.h> +.Ft int +.Fn sem_post "sem_t *sem" +.Sh DESCRIPTION +The semaphore referenced by +.Fa sem +is unlocked, the value of the semaphore is incremented, and all +threads which are waiting on the semaphore are awakened. +.Pp +.Fn sem_post +is reentrant with respect to signals and may be called from within a +signal handler. +.Pp +If successful, +.Fn sem_post +will return 0. Otherwise, -1 is returned and +.Va errno +is set. +.Sh ERRORS +.Fn sem_post +succeeds unless: +.Bl -tag -width Er +.It Bq Er EINVAL +.Fa sem +is not a valid semaphore descriptor. +.El +.Sh SEE ALSO +.Xr sem_open 2 , +.Xr sem_trywait 2 , +.Xr sem_wait 2 , +.Xr semctl 2 , +.Xr semget 2 , +.Xr semop 2 +.Sh HISTORY +.Fn sem_post +is specified in the POSIX Realtime Extension (1003.1b-1993/1003.1i-1995).
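Pairing the calls gives the usual cross-process lock idiom. A sketch (not from the pages; sem_wait() is documented in sem_wait(2) below, and the name "/example.lock" is hypothetical):

#include <fcntl.h>
#include <semaphore.h>
#include <stdio.h>

int main(void)
{
    sem_t *lock = sem_open("/example.lock", O_CREAT, 0644, 1);
    if (lock == SEM_FAILED) { perror("sem_open"); return 1; }

    if (sem_wait(lock) == 0) {   /* blocks until the lock is acquired */
        /* ... critical section shared across processes ... */
        sem_post(lock);          /* increment the value; wake any waiters */
    }
    sem_close(lock);
    sem_unlink("/example.lock");
    return 0;
}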
diff --git a/bsd/man/man2/sem_unlink.2 b/bsd/man/man2/sem_unlink.2 new file mode 100644 index 000000000..7fc7e9c4d --- /dev/null +++ b/bsd/man/man2/sem_unlink.2 @@ -0,0 +1,74 @@ +.\" $Darwin$ +.\" +.\" Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. +.\" +.\" @APPLE_LICENSE_HEADER_START@ +.\" +.\" The contents of this file constitute Original Code as defined in and +.\" are subject to the Apple Public Source License Version 1.1 (the +.\" "License"). You may not use this file except in compliance with the +.\" License. Please obtain a copy of the License at +.\" http://www.apple.com/publicsource and read it before using this file. +.\" +.\" This Original Code and all software distributed under the License are +.\" distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the +.\" License for the specific language governing rights and limitations +.\" under the License. +.\" +.\" @APPLE_LICENSE_HEADER_END@ +.\" +.Dd June 8, 2000 +.Dt SEM_UNLINK 2 +.Os Darwin +.Sh NAME +.Nm sem_unlink +.Nd remove a named semaphore +.Sh SYNOPSIS +.Fd #include <semaphore.h> +.Ft int +.Fn sem_unlink "const char *name" +.Sh DESCRIPTION +The named semaphore specified by +.Fa name +is removed. If the semaphore is in use by other processes, then +.Fa name +is immediately disassociated from the semaphore, but the semaphore +itself will not be removed until all references to it have been +closed. Subsequent calls to +.Fn sem_open +using +.Fa name +will refer to or create a new semaphore named +.Fa name . +.Pp +If successful, +.Fn sem_unlink +will return 0. Otherwise, -1 is returned and +.Va errno +is set, and the state of the semaphore is unchanged. +.Sh ERRORS +.Fn sem_unlink +succeeds unless: +.Bl -tag -width Er +.It Bq Er EACCES +Permission is denied to remove the semaphore. +.It Bq Er ENAMETOOLONG +.Fa name +exceeded +.Dv SEM_NAME_LEN +characters. +.It Bq Er ENOENT +The named semaphore does not exist. +.El +.Sh SEE ALSO +.Xr sem_close 2 , +.Xr sem_open 2 , +.Xr semctl 2 , +.Xr semget 2 , +.Xr semop 2 +.Sh HISTORY +.Fn sem_unlink +is specified in the POSIX Realtime Extension (1003.1b-1993/1003.1i-1995). diff --git a/bsd/man/man2/sem_wait.2 b/bsd/man/man2/sem_wait.2 new file mode 100644 index 000000000..02f8d8586 --- /dev/null +++ b/bsd/man/man2/sem_wait.2 @@ -0,0 +1,88 @@ +.\" $Darwin$ +.\" +.\" Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. +.\" +.\" @APPLE_LICENSE_HEADER_START@ +.\" +.\" The contents of this file constitute Original Code as defined in and +.\" are subject to the Apple Public Source License Version 1.1 (the +.\" "License"). You may not use this file except in compliance with the +.\" License. Please obtain a copy of the License at +.\" http://www.apple.com/publicsource and read it before using this file. +.\" +.\" This Original Code and all software distributed under the License are +.\" distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the +.\" License for the specific language governing rights and limitations +.\" under the License.
+.\" +.\" @APPLE_LICENSE_HEADER_END@ +.\" +.Dd June 8, 2000 +.Dt SEM_WAIT 2 +.Os Darwin +.Sh NAME +.Nm sem_trywait, sem_wait +.Nd lock a semaphore +.Sh SYNOPSIS +.Fd #include <semaphore.h> +.Ft int +.Fn sem_trywait "sem_t *sem" +.Ft int +.Fn sem_wait "sem_t *sem" +.Sh DESCRIPTION +The semaphore referenced by +.Fa sem +is locked. When calling +.Fn sem_wait , +if the semaphore's value is zero, the calling thread will block until +the lock is acquired or until the call is interrupted by a +signal. Alternatively, the +.Fn sem_trywait +function will fail if the semaphore is already locked, rather than +blocking on the semaphore. +.Pp +If successful (the lock was acquired), +.Fn sem_wait +and +.Fn sem_trywait +will return 0. Otherwise, -1 is returned and +.Va errno +is set, and the state of the semaphore is unchanged. +.Sh ERRORS +.Fn sem_wait +and +.Fn sem_trywait +succeed unless: +.Bl -tag -width Er +.It Bq Er EAGAIN +The semaphore is already locked. +.It Bq Er EDEADLK +A deadlock was detected. +.It Bq Er EINTR +The call was interrupted by a signal. +.It Bq Er EINVAL +.Fa sem +is not a valid semaphore descriptor. +.El +.Sh NOTES +Applications may encounter a priority inversion while using +semaphores. When a thread is waiting on a semaphore which is about to +be posted by a lower-priority thread and the lower-priority thread is +preempted by another thread (of medium priority), a priority inversion +has occured, and the higher-priority thread will be blocked for an +unlimited time period. Programmers using the realtime functionality +of the system should take care to avoid priority inversions. +.Sh SEE ALSO +.Xr sem_open 2 , +.Xr sem_post 2 , +.Xr semctl 2 , +.Xr semget 2 , +.Xr semop 2 +.Sh HISTORY +.Fn sem_wait +and +.Fn sem_trywait +are specified in the POSIX Realtime Extension (1003.1b-1993/1003.1i-1995). diff --git a/bsd/man/man2/sendfile.2 b/bsd/man/man2/sendfile.2 index d2919e3d2..1e5e537f3 100644 --- a/bsd/man/man2/sendfile.2 +++ b/bsd/man/man2/sendfile.2 @@ -104,14 +104,16 @@ arrays is specified by and .Fa trl_cnt . .Pp -When a header or trailer is specified the value of +When a header or trailer is specified, the value of .Fa len -returned will include the size of header or trailer sent. The user should -provide sufficiently large value of +argument indicates the maximum number of bytes in the header and/or file to be sent. +It does not control the trailer; if a trailer exists, all of it will be sent. +If the value of .Fa len -as argument including the size of header or trailer, -otherwise only part of file data will be sent -following the header. +argument is 0, all of the header and/or file will be sent before the entire trailer is sent. +On return, the +.Fa len +argument specifies the total number of bytes sent. .Pp The .Fa flags diff --git a/bsd/man/man2/setaudit.2 b/bsd/man/man2/setaudit.2 index 6b1979f5d..b626e0cf8 100644 --- a/bsd/man/man2/setaudit.2 +++ b/bsd/man/man2/setaudit.2 @@ -54,7 +54,6 @@ The data structure is defined as follows: .nf .in +4n - struct auditinfo { au_id_t ai_auid; /* Audit user ID */ au_mask_t ai_mask; /* Audit masks */ @@ -77,15 +76,13 @@ Until is set to something other than AU_DEFAUDITID any audit events generated by the system with be filtered by the non-attributed audit mask. -.PP - +.Pp The .Fa au_mask_t data structure defines the bit mask for auditing successful and failed events out of the predefined list of event classes. 
It is defined as follows: .nf .in +4n - struct au_mask { unsigned int am_success; /* success bits */ unsigned int am_failure; /* failure bits */ @@ -93,24 +90,21 @@ struct au_mask { typedef struct au_mask au_mask_t; .in .fi -.PP - +.Pp The .Fa au_termid_t data structure defines the Terminal ID recorded with every event caused by the process. It is defined as follows: .nf .in +4n - struct au_tid { dev_t port; u_int32_t machine; }; typedef struct au_tid au_tid_t; - .in .fi -.PP +.Pp The .Fa ai_asid variable contains the audit session ID which is recorded with every event @@ -118,7 +112,7 @@ caused by the process. It can be any value in the range 1 to PID_MAX (99999). If the value of AU_ASSIGN_ASID is used for .Fa ai_asid a unique session ID will be generated by the kernel. -The audit session ID will be returned in +The audit session ID will be returned in the .Fa ai_asid field on success. .Pp @@ -127,11 +121,10 @@ The system call uses the expanded .Fa auditinfo_addr_t -data structure supports Terminal IDs with larger addresses such as those used -in IP version 6. It is defined as follows: +data structure which supports Terminal IDs with larger addresses +such as those used in IP version 6. It is defined as follows: .nf .in +4n - struct auditinfo_addr { au_id_t ai_auid; /* Audit user ID. */ au_mask_t ai_mask; /* Audit masks. */ @@ -145,11 +138,10 @@ typedef struct auditinfo_addr auditinfo_addr_t; .Pp The .Fa au_tid_addr_t -data structure which includes a larger address storage field and an additional +data structure includes a larger address storage field and an additional field with the type of address stored: .nf .in +4n - struct au_tid_addr { dev_t at_port; u_int32_t at_type; @@ -183,18 +175,20 @@ field in is set to AU_IPv4 and the other .Fa ai_tid_addr fields are all set to zero. -The -.Fa ai_flags -field can only be set when a new session is initially created. Creating a new session is done by setting the .Fa ai_asid field to a unique session value or AU_ASSIGN_ASID. These system calls will fail when attempting to change the -.Fa ai_auid , -.Fa ai_termid , -or -.Fa ai_flags +.Fa ai_auid +or +.Fa ai_termid fields once set to something other than the default values. +The +.Fa ai_flags +field may be updated only according to local access control +policy; this is usually accomplished with +.Xr auditon 2 +using the A_SETSFLAGS command. The audit preselection masks may be changed at any time but are usually updated with .Xr auditon 2 diff --git a/bsd/man/man2/setgroups.2 b/bsd/man/man2/setgroups.2 index 1547f7027..0ec3c3086 100644 --- a/bsd/man/man2/setgroups.2 +++ b/bsd/man/man2/setgroups.2 @@ -33,7 +33,7 @@ .\" .\" @(#)setgroups.2 8.2 (Berkeley) 4/16/94 .\" -.Dd April 16, 1994 +.Dd September 15, 2010 .Dt SETGROUPS 2 .Os BSD 4.2 .Sh NAME @@ -56,6 +56,10 @@ more than .Dv {NGROUPS_MAX} . .Pp Only the super-user may set new groups. +.Pp +Use of +.Fn setgroups +is highly discouraged. .Sh RETURN VALUES A 0 value is returned on success, -1 on error, with an error code stored in diff --git a/bsd/man/man2/setregid.2 b/bsd/man/man2/setregid.2 new file mode 100644 index 000000000..47d791647 --- /dev/null +++ b/bsd/man/man2/setregid.2 @@ -0,0 +1,92 @@ +.\" Copyright (c) 1980, 1991, 1993, 1994 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1.
Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. All advertising materials mentioning features or use of this software +.\" must display the following acknowledgement: +.\" This product includes software developed by the University of +.\" California, Berkeley and its contributors. +.\" 4. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)setregid.2 8.2 (Berkeley) 4/16/94 +.\" $FreeBSD: src/lib/libc/sys/setregid.2,v 1.6.2.4 2001/12/14 18:34:01 ru Exp $ +.\" +.Dd April 16, 1994 +.Dt SETREGID 2 +.Os +.Sh NAME +.Nm setregid +.Nd set real and effective group ID +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In unistd.h +.Ft int +.Fn setregid "gid_t rgid" "gid_t egid" +.Sh DESCRIPTION +The real and effective group ID's of the current process +are set to the arguments. +Unprivileged users may change the real group +ID to the effective group ID and vice-versa; only the super-user may +make other changes. +.Pp +Supplying a value of -1 for either the real or effective +group ID forces the system to substitute the current +ID in place of the -1 parameter. +.Pp +The +.Fn setregid +function was intended to allow swapping +the real and effective group IDs +in set-group-ID programs to temporarily relinquish the set-group-ID value. +This function did not work correctly; +its purpose is now better served by the use of the +.Fn setegid +function (see +.Xr setuid 2 ) . +.Pp +When setting the real and effective group IDs to the same value, +the standard +.Fn setgid +function is preferred. +.Sh RETURN VALUES +.Rv -std setregid +.Sh ERRORS +.Bl -tag -width Er +.It Bq Er EPERM +The current process is not the super-user and a change +other than changing the effective group-id to the real group-id +was specified. +.El +.Sh SEE ALSO +.Xr getgid 2 , +.Xr issetugid 2 , +.Xr setegid 2 , +.Xr setgid 2 , +.Xr setuid 2 +.Sh HISTORY +The +.Fn setregid +system call appeared in +.Bx 4.2 . diff --git a/bsd/man/man2/setreuid.2 b/bsd/man/man2/setreuid.2 new file mode 100644 index 000000000..13cfeadf4 --- /dev/null +++ b/bsd/man/man2/setreuid.2 @@ -0,0 +1,90 @@ +.\" Copyright (c) 1980, 1991, 1993, 1994 +.\" The Regents of the University of California. All rights reserved. 
+.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. All advertising materials mentioning features or use of this software +.\" must display the following acknowledgement: +.\" This product includes software developed by the University of +.\" California, Berkeley and its contributors. +.\" 4. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)setreuid.2 8.2 (Berkeley) 4/16/94 +.\" $FreeBSD: src/lib/libc/sys/setreuid.2,v 1.6.2.6 2001/12/14 18:34:01 ru Exp $ +.\" +.Dd February 8, 2001 +.Dt SETREUID 2 +.Os +.Sh NAME +.Nm setreuid +.Nd set real and effective user IDs +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In unistd.h +.Ft int +.Fn setreuid "uid_t ruid" "uid_t euid" +.Sh DESCRIPTION +The real and effective user IDs of the +current process are set according to the arguments. +If +.Fa ruid +or +.Fa euid +is -1, the current uid is filled in by the system. +Unprivileged users may change the real user +ID to the effective user ID and vice-versa; only the super-user may +make other changes. +.Pp +The +.Fn setreuid +function has been used to swap the real and effective user IDs +in set-user-ID programs to temporarily relinquish the set-user-ID value. +This purpose is now better served by the use of the +.Fn seteuid +function (see +.Xr setuid 2 ) . +.Pp +When setting the real and effective user IDs to the same value, +the standard +.Fn setuid +function is preferred. +.Sh RETURN VALUES +.Rv -std setreuid +.Sh ERRORS +.Bl -tag -width Er +.It Bq Er EPERM +The current process is not the super-user and a change +other than changing the effective user-id to the real user-id +was specified. +.El +.Sh SEE ALSO +.Xr getuid 2 , +.Xr issetugid 2 , +.Xr seteuid 2 , +.Xr setuid 2 +.Sh HISTORY +The +.Fn setreuid +system call appeared in +.Bx 4.2 . diff --git a/bsd/man/man2/setxattr.2 b/bsd/man/man2/setxattr.2 index 6fe4f86b8..957c5bd77 100644 --- a/bsd/man/man2/setxattr.2 +++ b/bsd/man/man2/setxattr.2 @@ -91,6 +91,13 @@ is identical to except that it sets an extended attribute on an open file referenced by file descriptor .Fa fd . 
+.Sh NOTE +On some filesystems, such as +.Dv HFS+ , +setting the extended attribute +.Dv com.apple.ResourceFork +will update the modification time (``mtime'') of +the file. .Sh RETURN VALUES On success, 0 is returned. On failure, -1 is returned and the global variable diff --git a/bsd/man/man2/shm_open.2 b/bsd/man/man2/shm_open.2 new file mode 100644 index 000000000..1b4bfc685 --- /dev/null +++ b/bsd/man/man2/shm_open.2 @@ -0,0 +1,179 @@ +.\" $Darwin$ +.\" +.\" Copyright (c) 1999-2002 Apple Computer, Inc. All rights reserved. +.\" +.\" @APPLE_LICENSE_HEADER_START@ +.\" +.\" The contents of this file constitute Original Code as defined in and +.\" are subject to the Apple Public Source License Version 1.1 (the +.\" "License"). You may not use this file except in compliance with the +.\" License. Please obtain a copy of the License at +.\" http://www.apple.com/publicsource and read it before using this file. +.\" +.\" This Original Code and all software distributed under the License are +.\" distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the +.\" License for the specific language governing rights and limitations +.\" under the License. +.\" +.\" @APPLE_LICENSE_HEADER_END@ +.\" +.Dd August 29, 2008 +.Dt SHM_OPEN 2 +.Os Darwin +.Sh NAME +.Nm shm_open +.Nd open a shared memory object +.Sh SYNOPSIS +.Fd #include <sys/mman.h> +.Ft int +.Fo shm_open +.Fa "const char *name" +.Fa "int oflag" +.Fa "..." +.Fc +.Pp +The parameter "mode_t mode" is optional. +.Sh DESCRIPTION +The shared memory object referenced by +.Fa name +is opened for reading and/or writing as specified by the argument +.Fa oflag +and the file descriptor is returned to the calling process. +The returned file descriptor will be the lowest non-open file +descriptor for the calling process, and is not shared with any +other processes, as it is a new file descriptor. The new file +descriptor will have the +.Dv FD_CLOEXEC +flag set. +Repeated calls +to +.Nm shm_open +with the same string value for +.Fa name +will return a file descriptor referring to the same shared memory +object, provided that the object has not been unlinked by a call to +.Fn shm_unlink . +The +.Fa oflag +argument may indicate the file is to be +created if it does not exist (by specifying the +.Dv O_CREAT +flag), in which case the file is created with mode +.Fa mode +as described in +.Xr chmod 2 +and modified by the process' umask value (see +.Xr umask 2 ) . +.Pp +The value of +.Fa oflag +is formed by +.Em or Ns 'ing +the following values: +.Pp +.Bd -literal -offset indent -compact +O_RDONLY open for reading only +O_RDWR open for reading and writing +O_CREAT create object if it does not exist +O_EXCL error if create and object exists +O_TRUNC truncate size to 0 +.Ed +.Pp +Exactly one of +.Dv O_RDONLY +or +.Dv O_RDWR +must be specified. +.Pp +If +.Dv O_TRUNC +is specified and the +file exists, the file is truncated to zero length. +If +.Dv O_EXCL +is set with +.Dv O_CREAT +and the file already +exists, +.Fn shm_open +returns an error. This may be used to +implement a simple exclusive access locking mechanism. +.Pp +If successful, +.Fn shm_open +returns a non-negative integer, termed a file descriptor. +It returns -1 and sets +.Va errno +on failure.
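A minimal sketch of the create, size, and map sequence this page describes (not from the page itself); the object name "/example.shm" is hypothetical:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    int fd = shm_open("/example.shm", O_RDWR | O_CREAT, 0600);
    if (fd < 0) { perror("shm_open"); return 1; }

    if (ftruncate(fd, 4096) == 0) {           /* size the new object */
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                       MAP_SHARED, fd, 0);
        if (p != MAP_FAILED) {
            p[0] = 'A';                        /* visible to other mappers */
            munmap(p, 4096);
        }
    }
    close(fd);
    shm_unlink("/example.shm");
    return 0;
}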
+The file pointer used to mark the current position within the +memory object is set to the beginning of the object. +.Pp +When a new shared memory object is created, it is given the +owner and group corresponding to the effective user and +group of the calling process. There is no visible entry in the +file system for the created object in this implementation. +.Pp +When a shared memory object is created, it persists until it +is unlinked and all other references are gone. Objects do +not persist across a system reboot. +.Pp +The system imposes a limit on the number of file descriptors +open simultaneously by one process. +.Xr getdtablesize 2 +returns the current system limit. +.Sh ERRORS +The named object is opened unless: +.Bl -tag -width Er +.It Bq Er EACCES +The required permissions (for reading and/or writing) +are denied for the given flags. +.It Bq Er EACCES +.Dv O_CREAT +is specified, the object does not exist, and permission to +create the object is denied. +.It Bq Er EEXIST +.Dv O_CREAT +and +.Dv O_EXCL +were specified and the object exists. +.It Bq Er EINTR +The +.Fn shm_open +operation was interrupted by a signal. +.It Bq Er EINVAL +The +.Fn shm_open +operation is not supported. +.It Bq Er EMFILE +The process has already reached its limit for open file descriptors. +.It Bq Er ENAMETOOLONG +.Fa name +exceeded the name size limit. +This is currently +.Dv PSHMNAMLEN +characters (defined in +.In sys/posix_shm.h ) , +but this may change in the future. +.It Bq Er ENFILE +The system file table is full. +.It Bq Er ENOENT +.Dv O_CREAT +is not set and the named object does not exist. +.It Bq Er ENOSPC +.Dv O_CREAT +is specified, the file does not exist, and there is insufficient +space available to create the object. +.El +.Sh SEE ALSO +.Xr chmod 2 , +.Xr close 2 , +.Xr getdtablesize 2 , +.Xr mmap 2 , +.Xr shm_unlink 2 , +.Xr umask 2 +.Sh HISTORY +.Fn shm_open +is specified in the POSIX Realtime Extension (1003.1b-1993/1003.1i-1995). diff --git a/bsd/man/man2/shm_unlink.2 b/bsd/man/man2/shm_unlink.2 new file mode 100644 index 000000000..7ecc66fd4 --- /dev/null +++ b/bsd/man/man2/shm_unlink.2 @@ -0,0 +1,87 @@ +.\" $Darwin$ +.\" +.\" Copyright (c) 1999-2002 Apple Computer, Inc. All rights reserved. +.\" +.\" @APPLE_LICENSE_HEADER_START@ +.\" +.\" The contents of this file constitute Original Code as defined in and +.\" are subject to the Apple Public Source License Version 1.1 (the +.\" "License"). You may not use this file except in compliance with the +.\" License. Please obtain a copy of the License at +.\" http://www.apple.com/publicsource and read it before using this file. +.\" +.\" This Original Code and all software distributed under the License are +.\" distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER +.\" EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +.\" INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +.\" FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the +.\" License for the specific language governing rights and limitations +.\" under the License. +.\" +.\" @APPLE_LICENSE_HEADER_END@ +.\" +.Dd August 31, 2006 +.Dt SHM_UNLINK 2 +.Os Darwin +.Sh NAME +.Nm shm_unlink +.Nd remove shared memory object +.Sh SYNOPSIS +.Fd #include <sys/mman.h> +.Ft int +.Fn shm_unlink "const char *name" +.Sh DESCRIPTION +The +.Fn shm_unlink +function disassociates the shared memory object specified by +.Fa name +from that name.
+The resources associated with the shared memory object remain intact +until the last file descriptor reference is removed, e.g., by +.Xr close 2 +or +.Xr munmap 2 , +at which point the resources are reclaimed +(if no references exist at the time of the call to +.Fn shm_unlink , +the resources are reclaimed immediately). +The name can only be reused +when it is bound to a new shared memory object with a call to +.Xr shm_open 2 +with the +.Dv O_CREAT +flag. +.Sh RETURN VALUES +Upon successful completion, a value of 0 is returned. +Otherwise, a value of -1 is returned and +.Va errno +is set to indicate the error, +and the named shared memory object will remain unchanged. +.Sh ERRORS +The +.Fn shm_unlink +function succeeds unless: +.Bl -tag -width Er +.It Bq Er EACCES +Permission is denied to remove the object. +.It Bq Er ENAMETOOLONG +.Fa name +exceeded the name size limit. +This is currently +.Dv PSHMNAMLEN +characters (defined in +.In sys/posix_shm.h ) , +but this may change in the future. +.It Bq Er ENOENT +The named object does not exist. +.El +.Sh SEE ALSO +.Xr close 2 , +.Xr mmap 2 , +.Xr munmap 2 , +.Xr shm_open 2 , +.Xr shmat 2 , +.Xr shmctl 2 +.Sh HISTORY +.Fn shm_unlink +is specified in the POSIX Realtime Extension (1003.1b-1993/1003.1i-1995). diff --git a/bsd/man/man2/stat.2 b/bsd/man/man2/stat.2 index 76cca02f1..02de79c72 100644 --- a/bsd/man/man2/stat.2 +++ b/bsd/man/man2/stat.2 @@ -127,9 +127,7 @@ as defined by and into which information is placed concerning the file. When the macro .Dv _DARWIN_FEATURE_64_BIT_INODE -is not defined (the -.Ft ino_t -type is 32-bits), the +is not defined (see below for more information about this macro), the .Fa stat structure is defined as: .Bd -literal @@ -137,7 +135,7 @@ struct stat { /* when _DARWIN_FEATURE_64_BIT_INODE is NOT defined */ dev_t st_dev; /* device inode resides on */ ino_t st_ino; /* inode's number */ mode_t st_mode; /* inode protection mode */ - nlink_t st_nlink; /* number or hard links to the file */ + nlink_t st_nlink; /* number of hard links to the file */ uid_t st_uid; /* user-id of owner */ gid_t st_gid; /* group-id of owner */ dev_t st_rdev; /* device type, for special file inode */ @@ -155,16 +153,6 @@ struct stat { /* when _DARWIN_FEATURE_64_BIT_INODE is NOT defined */ However, when the macro .Dv _DARWIN_FEATURE_64_BIT_INODE is defined, the -.Ft ino_t -type will be 64-bits (force 64-bit inode mode by defining the -.Dv _DARWIN_USE_64_BIT_INODE -macro before including header files). -This will cause symbol variants of the -.Fa stat -family, with the -.Fa $INODE64 -suffixes, to be automatically linked in. -In addition, the .Fa stat structure will now be defined as: .Bd -literal @@ -274,6 +262,141 @@ field, see .Aq Pa sys/stat.h and .Xr chflags 2 . +.Sh _DARWIN_FEATURE_64_BIT_INODE +In order to accommodate advanced capabilities of newer file systems, the +.Fa struct stat , +.Fa struct statfs , +and +.Fa struct dirent +data structures were updated in Mac OS X 10.5. +.Pp +The most obvious change is the increased size of +.Fa ino_t +from 32 bits to 64 bits. As a consequence, storing an ino_t in an int is +no longer safe, and file formats storing ino_t as 32-bit values may need to +be updated. There are other changes as well, such as the widening of +.Fa f_fstypename , +.Fa f_mntonname , +and +.Fa f_mntfromname +in +.Fa struct statfs . +Please refer to +.Xr stat 2 +and +.Xr dir 5 +for more detail on the specific changes to the other affected data structures.
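The macro mechanics described in this section compress into a short sketch (not from the page). This assumes a pre-10.6 deployment target, where _DARWIN_USE_64_BIT_INODE must be defined before any header is included to opt into the expanded structure:

#define _DARWIN_USE_64_BIT_INODE 1   /* must precede all includes */

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
    struct stat st;                  /* expanded structure; st_ino is 64-bit */
    if (stat("/etc/hosts", &st) != 0) {
        perror("stat");
        return 1;
    }
    /* Store inode numbers in a 64-bit type, never an int. */
    printf("inode: %llu\n", (unsigned long long)st.st_ino);
    return 0;
}

With the macro defined, the compiler toolchain links the $INODE64 symbol variants automatically, as described in the paragraphs that follow.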
+.Pp +On platforms that existed before these updates were available, ABI +compatibility is achieved by providing two implementations for related +functions: one using the legacy data structures and one using the updated +data structures. Variants which make use of the newer structures have their +symbols suffixed with $INODE64. These $INODE64 suffixes are automatically +appended by the compiler tool-chain and should not be used directly. +.Pp +Platforms that were released after these updates only have the newer variants +available to them. These platforms have the macro +.Dv _DARWIN_FEATURE_ONLY_64_BIT_INODE +defined. +.Pp +The +.Dv _DARWIN_FEATURE_64_BIT_INODE +macro should not be set directly. Instead, developers should make use of the +.Dv _DARWIN_NO_64_BIT_INODE +or +.Dv _DARWIN_USE_64_BIT_INODE +macros when the default variant is not desired. The following table details +the effects of defining these macros for different deployment targets. +.Pp +.TS +center; +c s s s +l | c s s +c | c c c +c | c c c +l | c c c. +T{ +.Dv _DARWIN_FEATURE_ONLY_64_BIT_INODE Sy not defined +T} += + Deployment Target +user defines: < 10.5 10.5 > 10.5 +_ +T{ +.Em (none) +T} 32-bit 32-bit 64-bit +T{ +.Dv _DARWIN_NO_64_BIT_INODE +T} 32-bit 32-bit 32-bit +T{ +.Dv _DARWIN_USE_64_BIT_INODE +T} 32-bit 64-bit 64-bit +_ +.T& +c s s s +c s s s +c | l s s +c | c c c +l | c c c. + +T{ +.Dv _DARWIN_FEATURE_ONLY_64_BIT_INODE Sy defined +T} += +user defines: Any Deployment Target +_ +T{ +.Em (none) +T} 64-bit-only +T{ +.Dv _DARWIN_NO_64_BIT_INODE +T} T{ +.Em (error) +T} +T{ +.Dv _DARWIN_USE_64_BIT_INODE +T} 64-bit-only +_ +.TE +.Pp +.Bl -tag -width 64-bit-only -offset indent +.It 32-bit +32-bit inode values are enabled, and the legacy structures involving the +.Vt ino_t +type are in use. +The macro +.Dv _DARWIN_FEATURE_64_BIT_INODE +is not defined. +.It 64-bit +64-bit inode values are enabled, and the expanded structures involving the +.Vt ino_t +type are in use. +The macro +.Dv _DARWIN_FEATURE_64_BIT_INODE +is defined, and loader symbols will contain the +.Li $INODE64 +suffix. +.It 64-bit-only +Like 64-bit, except loader symbols do not have the +.Li $INODE64 +suffix. +.It Em (error) +A compile time error is generated. +.El +.Pp +Due to the increased benefits of the larger structure, it is highly +recommended that developers not define +.Dv _DARWIN_NO_64_BIT_INODE +and make use of +.Dv _DARWIN_USE_64_BIT_INODE +when targeting Mac OS X 10.5. +.Pp +In addition to the $INODE64 suffixed symbols, variants suffixed with 64 are +also available for related functions. These functions were provided as a way +for developers to use the updated structures in code that also made use of +the legacy structures. The enlarged stat structures were also suffixed with +64 to distinguish them from their legacy variants. These functions have been +deprecated and should be avoided. .Sh RETURN VALUES Upon successful completion a value of 0 is returned. Otherwise, a value of -1 is returned and @@ -399,6 +522,7 @@ structure when 64-bit inodes are in effect (see above). .Xr chown 2 , .Xr utimes 2 , .Xr compat 5 , +.Xr statfs 2 , .Xr symlink 7 .Sh BUGS Applying diff --git a/bsd/man/man2/statfs.2 b/bsd/man/man2/statfs.2 index 16e80f5d4..85dce6a80 100644 --- a/bsd/man/man2/statfs.2 +++ b/bsd/man/man2/statfs.2 @@ -71,9 +71,11 @@ argument is a pointer to a structure.
When the macro .Dv _DARWIN_FEATURE_64_BIT_INODE -is not defined (the -.Ft ino_t -type is 32-bits), that structure is defined as: +is not defined (see +.Xr stat 2 +for more information on this macro), the +.Fa statfs +structure is defined as: .Bd -literal typedef struct { int32_t val[2]; } fsid_t; @@ -107,18 +109,8 @@ struct statfs { /* when _DARWIN_FEATURE_64_BIT_INODE is NOT defined */ However, when the macro .Dv _DARWIN_FEATURE_64_BIT_INODE is defined, the -.Ft ino_t -type will be 64-bits (force 64-bit inode mode by defining the -.Dv _DARWIN_USE_64_BIT_INODE -macro before including header files). -This will cause symbol variants of the -.Fa statfs -family, with the -.Fa $INODE64 -suffixes, to be automatically linked in. -In addition, the .Fa statfs -structure will now be defined as: +structure is defined as: .Bd -literal #define MFSTYPENAMELEN 16 /* length of fs type name including null */ #define MAXPATHLEN 1024 @@ -144,6 +136,13 @@ struct statfs { /* when _DARWIN_FEATURE_64_BIT_INODE is defined */ }; .Ed .Pp +Note that the +.Fa f_fstypename , +.Fa f_mntonname , +and +.Fa f_mntfromname +fields are also wider in this variant. +.Pp Fields that are undefined for a particular file system are set to -1. The .Fn fstatfs diff --git a/bsd/man/man2/undelete.2 b/bsd/man/man2/undelete.2 new file mode 100644 index 000000000..b85ecdcae --- /dev/null +++ b/bsd/man/man2/undelete.2 @@ -0,0 +1,108 @@ +.\" Copyright (c) 1994 +.\" Jan-Simon Pendry +.\" The Regents of the University of California. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. All advertising materials mentioning features or use of this software +.\" must display the following acknowledgement: +.\" This product includes software developed by the University of +.\" California, Berkeley and its contributors. +.\" 4. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. 
+.\" +.\" @(#)undelete.2 8.4 (Berkeley) 10/18/94 +.\" $FreeBSD: src/lib/libc/sys/undelete.2,v 1.17 2006/01/22 19:49:37 truckman Exp $ +.\" +.Dd January 22, 2006 +.Dt UNDELETE 2 +.Os +.Sh NAME +.Nm undelete +.Nd attempt to recover a deleted file +.Sh LIBRARY +.Lb libc +.Sh SYNOPSIS +.In unistd.h +.Ft int +.Fn undelete "const char *path" +.Sh DESCRIPTION +The +.Fn undelete +system call attempts to recover the deleted file named by +.Fa path . +Currently, this works only when the named object +is a whiteout in a union file system. +The system call removes the whiteout causing +any objects in a lower layer of the +union stack to become visible once more. +.Pp +Eventually, the +.Fn undelete +functionality may be expanded to other file systems able to recover +deleted files such as the log-structured file system. +.Sh RETURN VALUES +.Rv -std undelete +.Sh ERRORS +The +.Fn undelete +succeeds unless: +.Bl -tag -width Er +.It Bq Er ENOTDIR +A component of the path prefix is not a directory. +.It Bq Er ENAMETOOLONG +A component of a pathname exceeded 255 characters, +or an entire path name exceeded 1023 characters. +.It Bq Er EEXIST +The path does not reference a whiteout. +.It Bq Er ENOENT +The named whiteout does not exist. +.It Bq Er EACCES +Search permission is denied for a component of the path prefix. +.It Bq Er EACCES +Write permission is denied on the directory containing the name +to be undeleted. +.It Bq Er ELOOP +Too many symbolic links were encountered in translating the pathname. +.It Bq Er EPERM +The directory containing the name is marked sticky, +and the containing directory is not owned by the effective user ID. +.It Bq Er EINVAL +The last component of the path is +.Ql .. . +.It Bq Er EIO +An I/O error occurred while updating the directory entry. +.It Bq Er EROFS +The name resides on a read-only file system. +.It Bq Er EFAULT +The +.Fa path +argument +points outside the process's allocated address space. +.El +.Sh SEE ALSO +.Xr unlink 2 +.Sh HISTORY +The +.Fn undelete +system call first appeared in +.Bx 4.4 Lite . diff --git a/bsd/man/man3/posix_spawn_file_actions_addclose.3 b/bsd/man/man3/posix_spawn_file_actions_addclose.3 index 6cd2033c3..36c64715d 100644 --- a/bsd/man/man3/posix_spawn_file_actions_addclose.3 +++ b/bsd/man/man3/posix_spawn_file_actions_addclose.3 @@ -1,5 +1,5 @@ .\" -.\" Copyright (c) 2000-2007 Apple Inc. All rights reserved. +.\" Copyright (c) 2000-2010 Apple Inc. All rights reserved. .\" .\" @APPLE_OSREFERENCE_LICENSE_HEADER_START@ .\" @@ -27,7 +27,7 @@ .\" .\" @(#)posix_spawn_file_actions_addclose.3 . -.Dd August 22, 2007 +.Dd November 2, 2010 .Dt POSIX_SPAWN_FILE_ACTIONS_ADDCLOSE 3 .Os "Mac OS X" .Sh NAME @@ -56,6 +56,11 @@ .Fa "int filedes" .Fa "int newfiledes" .Fc +.Ft int +.Fo posix_spawn_file_actions_addinherit_np +.Fa "posix_spawn_file_actions_t *file_actions" +.Fa "int filedes" +.Fc .Sh DESCRIPTION The .Fn posix_spawn_file_actions_addclose @@ -115,6 +120,42 @@ is created as if had been called on .Em filedes prior to the new child process starting execution. +.Pp +The +.Fn posix_spawn_file_actions_addinherit_np +function adds an abstract inheritance operation to the +list of operations associated with the object referenced by +.Em file_actions , +for subsequent use in a call to +.Xr posix_spawn 2 +or +.Xr posix_spawnp 2 . +The pre-existing descriptor referred to by +.Em filedes +is marked for inheritance into the new process image, and the +.Em FD_CLOEXEC +flag is cleared from the file descriptor in the new process image. 
+.Pp +Normally, for +.Xr posix_spawn 2 +and +.Xr posix_spawnp 2 , +all file descriptors are inherited from the parent process +into the spawned process, except for those explicitly +marked as close-on-exec. However, if the flag +.Em POSIX_SPAWN_CLOEXEC_DEFAULT +is set, then during the spawn operation, all pre-existing +file descriptors in the parent process are treated as if they +had been marked close-on-exec, i.e., none of them are automatically +inherited. See +.Xr posix_spawnattr_setflags 3 . +Only file descriptors explicitly manipulated via +.Em file_actions +are made available in the spawned process. In that case, +.Fn posix_spawn_file_actions_addinherit_np +can be used to make specific pre-existing file +descriptors from the parent process +available in the spawned process. .Sh RETURN VALUES On success, these functions return 0; on failure they return an error number from @@ -127,7 +168,7 @@ These functions may fail if: The value specified by .Fa filedes is negative or would cause the process to exceed the maximum number of -open files it is allowed.. +open files it is allowed. .\" ========== .It Bq Er EINVAL The value of @@ -135,7 +176,7 @@ The value of is invalid. .\" ========== .It Bq Er ENOMEM -Insufficient memory was available eo add to the +Insufficient memory was available to add the new action to .Fa file_actions . .El .Sh SEE ALSO @@ -143,6 +184,7 @@ Insufficient memory was available eo add to the .Xr posix_spawnp 2 , .Xr posix_spawn_file_actions_init 3 , .Xr posix_spawn_file_actions_destroy 3 , +.Xr posix_spawnattr_setflags 3 . .Sh STANDARDS .St -susv3 [SPN] .Sh HISTORY diff --git a/bsd/man/man3/posix_spawnattr_setflags.3 b/bsd/man/man3/posix_spawnattr_setflags.3 index 8828a83df..3359497ec 100644 --- a/bsd/man/man3/posix_spawnattr_setflags.3 +++ b/bsd/man/man3/posix_spawnattr_setflags.3 @@ -1,5 +1,5 @@ .\" -.\" Copyright (c) 2000-2007 Apple Inc. All rights reserved. +.\" Copyright (c) 2000-2010 Apple Inc. All rights reserved. .\" .\" @APPLE_OSREFERENCE_LICENSE_HEADER_START@ .\" @@ -27,7 +27,7 @@ .\" .\" @(#)posix_spawnattr_setflags.3 . -.Dd August 22, 2007 +.Dd October 28, 2010 .Dt POSIX_SPAWNATTR_SETFLAGS 3 .Os "Mac OS X" .Sh NAME @@ -119,6 +119,13 @@ manipulate the process before it begins execution in user space. This permits, for example, obtaining exact instruction counts, or debugging very early in .Xr dyld 1 . +.It Dv POSIX_SPAWN_CLOEXEC_DEFAULT +.Em Apple Extension : +If this bit is set, then only file descriptors explicitly described by the +.Fa file_actions +argument are available in the spawned process; all +of the other file descriptors are +automatically closed in the spawned process. .El .Sh RETURN VALUES On success, these functions return 0; on failure they return an error @@ -154,6 +161,7 @@ is invalid. .Xr posix_spawnattr_setpgroup 3 , .Xr posix_spawnattr_setsigdefault 3 , .Xr posix_spawnattr_setsigmask 3 , +.Xr posix_spawn_file_actions_init 3 , .Xr setpgid 2 , .Xr execve 2 , .Xr dyld 1 diff --git a/bsd/man/man4/auditpipe.4 b/bsd/man/man4/auditpipe.4 index 7e0d7cc3e..e3a7a9427 100644 --- a/bsd/man/man4/auditpipe.4 +++ b/bsd/man/man4/auditpipe.4 @@ -24,7 +24,7 @@ .\" .\" $FreeBSD: src/share/man/man4/auditpipe.4,v 1.6 2008/05/02 17:36:22 rwatson Exp $ .\" -.Dd May 5, 2006 +.Dd Oct 18, 2010 .Os .Dt AUDITPIPE 4 .Sh NAME @@ -156,7 +156,7 @@ These flags correspond to the field in .Xr audit_control 5 . The ioctl argument should be of type -.Vt u_int . +.Vt au_mask_t .
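To make the interaction between POSIX_SPAWN_CLOEXEC_DEFAULT and posix_spawn_file_actions_addinherit_np described in the two pages above concrete, here is a minimal user-space sketch (not part of this patch; "data.txt" is a hypothetical input file). With the flag set, the child sees only the descriptors named in the file actions, so the standard descriptors must be made available explicitly:

#include <fcntl.h>
#include <spawn.h>
#include <stdio.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>

extern char **environ;

int
main(void)
{
	posix_spawnattr_t attr;
	posix_spawn_file_actions_t fa;
	pid_t pid;
	int fd, err, status;
	char *argv[] = { "cat", NULL };

	fd = open("data.txt", O_RDONLY);	/* hypothetical input file */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	posix_spawnattr_init(&attr);
	posix_spawnattr_setflags(&attr, POSIX_SPAWN_CLOEXEC_DEFAULT);

	posix_spawn_file_actions_init(&fa);
	/* With CLOEXEC_DEFAULT, only descriptors named here are available. */
	posix_spawn_file_actions_adddup2(&fa, fd, STDIN_FILENO);	/* child's stdin */
	posix_spawn_file_actions_addinherit_np(&fa, STDOUT_FILENO);
	posix_spawn_file_actions_addinherit_np(&fa, STDERR_FILENO);

	err = posix_spawn(&pid, "/bin/cat", &fa, &attr, argv, environ);
	if (err != 0)
		fprintf(stderr, "posix_spawn: %s\n", strerror(err));
	else
		waitpid(pid, &status, 0);

	posix_spawn_file_actions_destroy(&fa);
	posix_spawnattr_destroy(&attr);
	return err ? 1 : 0;
}

Without the flag, the two addinherit_np calls would be unnecessary, since descriptors not marked close-on-exec are inherited by default.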
.It Dv AUDITPIPE_SET_PRESELECT_FLAGS Set the current default preselection flags for attributable events on the pipe. @@ -165,7 +165,7 @@ These flags correspond to the field in .Xr audit_control 5 . The ioctl argument should be of type -.Vt u_int . +.Vt au_mask_t . .It Dv AUDITPIPE_GET_PRESELECT_NAFLAGS Retrieve the current default preselection flags for non-attributable events on the pipe. @@ -174,7 +174,7 @@ These flags correspond to the field in .Xr audit_control 5 . The ioctl argument should be of type -.Vt u_int . +.Vt au_mask_t . .It Dv AUDITPIPE_SET_PRESELECT_NAFLAGS Set the current default preselection flags for non-attributable events on the pipe. @@ -183,7 +183,7 @@ These flags correspond to the field in .Xr audit_control 5 . The ioctl argument should be of type -.Vt u_int . +.Vt au_mask_t . .It Dv AUDITPIPE_GET_PRESELECT_AUID Query the current preselection masks for a specific auid on the pipe. The ioctl argument should be of type @@ -252,7 +252,5 @@ It might be desirable to provide a more flexible selection model. The per-pipe audit event queue is fifo, with drops occurring if either the user thread provides insufficient space for the record on the queue head, or on enqueue if there is insufficient room. -It might be desirable to support partial reads of records, which would be -more compatible with buffered I/O as implemented in system libraries, and to -allow applications to select which records are dropped, possibly in the style -of preselection. +It might be desirable to allow applications to select which records are +dropped, possibly in the style of preselection. diff --git a/bsd/man/man4/gif.4 b/bsd/man/man4/gif.4 index fe42f42cd..00e63f8eb 100644 --- a/bsd/man/man4/gif.4 +++ b/bsd/man/man4/gif.4 @@ -39,7 +39,7 @@ .Sh DESCRIPTION The .Nm -interface is a generic tunnelling pseudo device for IPv4 and IPv6. +interface is a generic tunneling pseudo device for IPv4 and IPv6. It can tunnel IPv[46] traffic over IPv[46]. Therefore, there can be four possible configurations. The behavior of @@ -195,7 +195,7 @@ The device first appeared in WIDE hydrangea IPv6 kit. .\" .Sh BUGS -There are many tunnelling protocol specifications, +There are many tunneling protocol specifications, defined differently from each other. .Nm may not interoperate with peers which are based on different specifications, diff --git a/bsd/man/man4/icmp6.4 b/bsd/man/man4/icmp6.4 index 40a30a31f..f41f7216d 100644 --- a/bsd/man/man4/icmp6.4 +++ b/bsd/man/man4/icmp6.4 @@ -235,7 +235,7 @@ sockets can be opened with the .Dv SOCK_DGRAM socket type without requiring root privileges. The synopsis is the following: .Pp -.Fn socket AF_INET6 SOCK_DGRAM IPPROTO_ICMP6 +.Fn socket AF_INET6 SOCK_DGRAM IPPROTO_ICMPV6 .Pp This can only be used to send .Tn ICMPv6 diff --git a/bsd/man/man4/netintro.4 b/bsd/man/man4/netintro.4 index 725797146..ab2b1b277 100644 --- a/bsd/man/man4/netintro.4 +++ b/bsd/man/man4/netintro.4 @@ -145,7 +145,8 @@ are known to the system (and additional formats are defined for possible future implementation): .Bd -literal #define AF_UNIX 1 /* local to host (pipes) */ -#define AF_INET 2 /* internetwork: UDP, TCP, etc. */ +#define AF_INET 2 /* IPv4: UDP, TCP, etc. */ +#define AF_INET6 30 /* IPv6: UDP, TCP, etc.
*/ #define AF_NS 6 /* Xerox NS protocols */ #define AF_CCITT 10 /* CCITT protocols, X.25 etc */ #define AF_HYLINK 15 /* NSC Hyperchannel */ diff --git a/bsd/man/man4/random.4 b/bsd/man/man4/random.4 index bc0dbc76c..ed72fa315 100644 --- a/bsd/man/man4/random.4 +++ b/bsd/man/man4/random.4 @@ -72,4 +72,4 @@ directly before obtaining important random numbers. .Sh HISTORY A .Nm -device appeared in Linux operating system. +device appeared in the Linux operating system. diff --git a/bsd/man/man5/Makefile b/bsd/man/man5/Makefile index 0780eadfa..bf6093b3f 100644 --- a/bsd/man/man5/Makefile +++ b/bsd/man/man5/Makefile @@ -11,8 +11,6 @@ DATAFILES = \ core.5 \ dir.5 \ dirent.5 \ - fs.5 \ - inode.5 \ types.5 INSTALL_MAN_LIST = ${DATAFILES} diff --git a/bsd/man/man5/dir.5 b/bsd/man/man5/dir.5 index 6f2eacb60..c9e37b3b5 100644 --- a/bsd/man/man5/dir.5 +++ b/bsd/man/man5/dir.5 @@ -87,9 +87,9 @@ and further in the file .Aq dirent.h . When the macro .Dv _DARWIN_FEATURE_64_BIT_INODE -is not defined (the -.Ft ino_t -type is 32-bits), the +is not defined (see +.Xr stat 2 +for more information on this macro), the .Fa dirent structure is defined as: .Bd -literal @@ -116,16 +116,8 @@ struct dirent { /* when _DARWIN_FEATURE_64_BIT_INODE is NOT defined */ However, when the macro .Dv _DARWIN_FEATURE_64_BIT_INODE is defined, the -.Ft ino_t -type will be 64-bits (force 64-bit inode mode by defining the -.Dv _DARWIN_USE_64_BIT_INODE -macro before including header files). -This will cause symbol variants of the directory routines, with the -.Fa $INODE64 -suffixes, to be automatically linked in. -In addition, the .Fa dirent -structure will now be defined as: +structure is defined as: .Bd -literal /* * The dirent structure defines the format of directory entries. diff --git a/bsd/man/man5/fs.5 b/bsd/man/man5/fs.5 deleted file mode 100644 index 18da833e6..000000000 --- a/bsd/man/man5/fs.5 +++ /dev/null @@ -1,343 +0,0 @@ -.\" $NetBSD: fs.5,v 1.3 1994/11/30 19:31:17 jtc Exp $ -.\" -.\" Copyright (c) 1983, 1991, 1993 -.\" The Regents of the University of California. All rights reserved. -.\" -.\" Redistribution and use in source and binary forms, with or without -.\" modification, are permitted provided that the following conditions -.\" are met: -.\" 1. Redistributions of source code must retain the above copyright -.\" notice, this list of conditions and the following disclaimer. -.\" 2. Redistributions in binary form must reproduce the above copyright -.\" notice, this list of conditions and the following disclaimer in the -.\" documentation and/or other materials provided with the distribution. -.\" 3. All advertising materials mentioning features or use of this software -.\" must display the following acknowledgement: -.\" This product includes software developed by the University of -.\" California, Berkeley and its contributors. -.\" 4. Neither the name of the University nor the names of its contributors -.\" may be used to endorse or promote products derived from this software -.\" without specific prior written permission. -.\" -.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND -.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -.\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -.\" SUCH DAMAGE. -.\" -.\" @(#)fs.5 8.2 (Berkeley) 4/19/94 -.\" -.Dd April 19, 1994 -.Dt FS 5 -.Os BSD 4.2 -.Sh NAME -.Nm fs , -.Nm inode -.Nd format of file system volume -.Sh SYNOPSIS -.Fd #include <sys/types.h> -.Fd #include <ufs/fs.h> -.Fd #include <ufs/inode.h> -.Sh DESCRIPTION -The files -.Aq Pa fs.h -and -.Aq Pa inode.h -declare several structures, defined variables and macros -which are used to create and manage the underlying format of -file system objects on random access devices (disks). -.Pp -The block size and number of blocks which -comprise a file system are parameters of the file system. -Sectors beginning at -.Dv BBLOCK -and continuing for -.Dv BBSIZE -are used -for a disklabel and for some hardware primary -and secondary bootstrapping programs. -.Pp -The actual file system begins at sector -.Dv SBLOCK -with the -.Em super-block -that is of size -.Dv SBSIZE . -The following structure described the super-block and is -from the file -.Aq Pa ufs/fs.h : -.Bd -literal -#define FS_MAGIC 0x011954 -struct fs { - struct fs *fs_link; /* linked list of file systems */ - struct fs *fs_rlink; /* used for incore super blocks */ - daddr_t fs_sblkno; /* addr of super-block in filesys */ - daddr_t fs_cblkno; /* offset of cyl-block in filesys */ - daddr_t fs_iblkno; /* offset of inode-blocks in filesys */ - daddr_t fs_dblkno; /* offset of first data after cg */ - long fs_cgoffset; /* cylinder group offset in cylinder */ - long fs_cgmask; /* used to calc mod fs_ntrak */ - time_t fs_time; /* last time written */ - long fs_size; /* number of blocks in fs */ - long fs_dsize; /* number of data blocks in fs */ - long fs_ncg; /* number of cylinder groups */ - long fs_bsize; /* size of basic blocks in fs */ - long fs_fsize; /* size of frag blocks in fs */ - long fs_frag; /* number of frags in a block in fs */ -/* these are configuration parameters */ - long fs_minfree; /* minimum percentage of free blocks */ - long fs_rotdelay; /* num of ms for optimal next block */ - long fs_rps; /* disk revolutions per second */ -/* these fields can be computed from the others */ - long fs_bmask; /* ``blkoff'' calc of blk offsets */ - long fs_fmask; /* ``fragoff'' calc of frag offsets */ - long fs_bshift; /* ``lblkno'' calc of logical blkno */ - long fs_fshift; /* ``numfrags'' calc number of frags */ -/* these are configuration parameters */ - long fs_maxcontig; /* max number of contiguous blks */ - long fs_maxbpg; /* max number of blks per cyl group */ -/* these fields can be computed from the others */ - long fs_fragshift; /* block to frag shift */ - long fs_fsbtodb; /* fsbtodb and dbtofsb shift constant */ - long fs_sbsize; /* actual size of super block */ - long fs_csmask; /* csum block offset */ - long fs_csshift; /* csum block number */ - long fs_nindir; /* value of NINDIR */ - long fs_inopb; /* value of INOPB */ - long fs_nspf; /* value of NSPF */ -/* yet another configuration parameter */ - long fs_optim; /* optimization preference, see below */ -/* these fields are derived from the hardware 
*/ - long fs_npsect; /* # sectors/track including spares */ - long fs_interleave; /* hardware sector interleave */ - long fs_trackskew; /* sector 0 skew, per track */ - long fs_headswitch; /* head switch time, usec */ - long fs_trkseek; /* track-to-track seek, usec */ -/* sizes determined by number of cylinder groups and their sizes */ - daddr_t fs_csaddr; /* blk addr of cyl grp summary area */ - long fs_cssize; /* size of cyl grp summary area */ - long fs_cgsize; /* cylinder group size */ -/* these fields are derived from the hardware */ - long fs_ntrak; /* tracks per cylinder */ - long fs_nsect; /* sectors per track */ - long fs_spc; /* sectors per cylinder */ -/* this comes from the disk driver partitioning */ - long fs_ncyl; /* cylinders in file system */ -/* these fields can be computed from the others */ - long fs_cpg; /* cylinders per group */ - long fs_ipg; /* inodes per group */ - long fs_fpg; /* blocks per group * fs_frag */ -/* this data must be re-computed after crashes */ - struct csum fs_cstotal; /* cylinder summary information */ -/* these fields are cleared at mount time */ - char fs_fmod; /* super block modified flag */ - char fs_clean; /* file system is clean flag */ - char fs_ronly; /* mounted read-only flag */ - char fs_flags; /* currently unused flag */ - char fs_fsmnt[MAXMNTLEN]; /* name mounted on */ -/* these fields retain the current block allocation info */ - long fs_cgrotor; /* last cg searched */ - struct csum *fs_csp[MAXCSBUFS]; /* list of fs_cs info buffers */ - long fs_cpc; /* cyl per cycle in postbl */ - short fs_opostbl[16][8]; /* old rotation block list head */ - long fs_sparecon[56]; /* reserved for future constants */ - quad fs_qbmask; /* ~fs_bmask - for use with quad size */ - quad fs_qfmask; /* ~fs_fmask - for use with quad size */ - long fs_postblformat; /* format of positional layout tables */ - long fs_nrpos; /* number of rotational positions */ - long fs_postbloff; /* (short) rotation block list head */ - long fs_rotbloff; /* (u_char) blocks for each rotation */ - long fs_magic; /* magic number */ - u_char fs_space[1]; /* list of blocks for each rotation */ -/* actually longer */ -}; -.Ed -.Pp -Each disk drive contains some number of file systems. -A file system consists of a number of cylinder groups. -Each cylinder group has inodes and data. -.Pp -A file system is described by its super-block, which in turn -describes the cylinder groups. The super-block is critical -data and is replicated in each cylinder group to protect against -catastrophic loss. This is done at file system creation -time and the critical -super-block data does not change, so the copies need not be -referenced further unless disaster strikes. -.Pp -Addresses stored in inodes are capable of addressing fragments -of `blocks'. File system blocks of at most size -.Dv MAXBSIZE -can -be optionally broken into 2, 4, or 8 pieces, each of which is -addressable; these pieces may be -.Dv DEV_BSIZE , -or some multiple of -a -.Dv DEV_BSIZE -unit. -.Pp -Large files consist of exclusively large data blocks. To avoid -undue wasted disk space, the last data block of a small file is -allocated as only as many fragments of a large block as are -necessary. The file system format retains only a single pointer -to such a fragment, which is a piece of a single large block that -has been divided. The size of such a fragment is determinable from -information in the inode, using the -.Fn blksize fs ip lbn -macro. 
-.Pp -The file system records space availability at the fragment level; -to determine block availability, aligned fragments are examined. -.Pp -The root inode is the root of the file system. -Inode 0 can't be used for normal purposes and -historically bad blocks were linked to inode 1, -thus the root inode is 2 (inode 1 is no longer used for -this purpose, however numerous dump tapes make this -assumption, so we are stuck with it). -.Pp -The -.Fa fs_minfree -element gives the minimum acceptable percentage of file system -blocks that may be free. If the freelist drops below this level -only the super-user may continue to allocate blocks. -The -.Fa fs_minfree -element -may be set to 0 if no reserve of free blocks is deemed necessary, -however severe performance degradations will be observed if the -file system is run at greater than 90% full; thus the default -value of -.Fa fs_minfree -is 10%. -.Pp -Empirically the best trade-off between block fragmentation and -overall disk utilization at a loading of 90% comes with a -fragmentation of 8, thus the default fragment size is an eighth -of the block size. -.Pp -The element -.Fa fs_optim -specifies whether the file system should try to minimize the time spent -allocating blocks, or if it should attempt to minimize the space -fragmentation on the disk. -If the value of fs_minfree (see above) is less than 10%, -then the file system defaults to optimizing for space to avoid -running out of full sized blocks. -If the value of minfree is greater than or equal to 10%, -fragmentation is unlikely to be problematical, and -the file system defaults to optimizing for time. -.Pp -.Em Cylinder group related limits : -Each cylinder keeps track of the availability of blocks at different -rotational positions, so that sequential blocks can be laid out -with minimum rotational latency. With the default of 8 distinguished -rotational positions, the resolution of the -summary information is 2ms for a typical 3600 rpm drive. -.Pp -The element -.Fa fs_rotdelay -gives the minimum number of milliseconds to initiate -another disk transfer on the same cylinder. -It is used in determining the rotationally optimal -layout for disk blocks within a file; -the default value for -.Fa fs_rotdelay -is 2ms. -.Pp -Each file system has a statically allocated number of inodes. -An inode is allocated for each -.Dv NBPI -bytes of disk space. -The inode allocation strategy is extremely conservative. -.Pp -.Dv MINBSIZE -is the smallest allowable block size. -With a -.Dv MINBSIZE -of 4096 -it is possible to create files of size -2^32 with only two levels of indirection. -.Dv MINBSIZE -must be big enough to hold a cylinder group block, -thus changes to -.Pq Fa struct cg -must keep its size within -.Dv MINBSIZE . -Note that super-blocks are never more than size -.Dv SBSIZE . -.Pp -The path name on which the file system is mounted is maintained in -.Fa fs_fsmnt . -.Dv MAXMNTLEN -defines the amount of space allocated in -the super-block for this name. -The limit on the amount of summary information per file system -is defined by -.Dv MAXCSBUFS. -For a 4096 byte block size, it is currently parameterized for a -maximum of two million cylinders. -.Pp -Per cylinder group information is summarized in blocks allocated -from the first cylinder group's data blocks. -These blocks are read in from -.Fa fs_csaddr -(size -.Fa fs_cssize ) -in addition to the super-block. -.Pp -.Sy N.B.: -.Xr sizeof Pq Fa struct csum -must be a power of two in order for -the -.Fn fs_cs -macro to work. 
-.Pp -The -.Em "Super-block for a file system" : -The size of the rotational layout tables -is limited by the fact that the super-block is of size -.Dv SBSIZE . -The size of these tables is -.Em inversely -proportional to the block -size of the file system. The size of the tables is -increased when sector sizes are not powers of two, -as this increases the number of cylinders -included before the rotational pattern repeats -.Pq Fa fs_cpc . -The size of the rotational layout -tables is derived from the number of bytes remaining in -.Pq Fa struct fs . -.Pp -The number of blocks of data per cylinder group -is limited because cylinder groups are at most one block. -The inode and free block tables -must fit into a single block after deducting space for -the cylinder group structure -.Pq Fa struct cg . -.Pp -The -.Em Inode : -The inode is the focus of all file activity in the -file system. -There is a unique inode allocated -for each active file, -each current directory, each mounted-on file, -text file, and the root. -An inode is `named' by its device/i-number pair. -For further information, see the include file -.Aq Pa sys/inode.h . -.Sh HISTORY -A super-block structure named filsys appeared in -.At v6 . -The file system described in this manual appeared -in -.Bx 4.2 . diff --git a/bsd/man/man5/inode.5 b/bsd/man/man5/inode.5 deleted file mode 100644 index 1b47f6228..000000000 --- a/bsd/man/man5/inode.5 +++ /dev/null @@ -1 +0,0 @@ -.so man5/fs.5 diff --git a/bsd/miscfs/Makefile b/bsd/miscfs/Makefile index 009da4c3f..ece064108 100644 --- a/bsd/miscfs/Makefile +++ b/bsd/miscfs/Makefile @@ -13,8 +13,6 @@ INSTINC_SUBDIRS = \ specfs \ union -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ @@ -22,8 +20,6 @@ EXPINC_SUBDIRS = \ fifofs \ specfs -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ SETUP_SUBDIRS = \ diff --git a/bsd/miscfs/devfs/Makefile b/bsd/miscfs/devfs/Makefile index bb2e43304..9d29f42e1 100644 --- a/bsd/miscfs/devfs/Makefile +++ b/bsd/miscfs/devfs/Makefile @@ -9,14 +9,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ DATAFILES = \ diff --git a/bsd/miscfs/devfs/devfs_tree.c b/bsd/miscfs/devfs/devfs_tree.c index 58aea8eb9..daf8c8ace 100644 --- a/bsd/miscfs/devfs/devfs_tree.c +++ b/bsd/miscfs/devfs/devfs_tree.c @@ -148,6 +148,7 @@ lck_grp_t * devfs_lck_grp; lck_grp_attr_t * devfs_lck_grp_attr; lck_attr_t * devfs_lck_attr; lck_mtx_t devfs_mutex; +lck_mtx_t devfs_attr_mutex; devdirent_t * dev_root = NULL; /* root of backing tree */ struct devfs_stats devfs_stats; /* hold stats */ @@ -185,6 +186,7 @@ devfs_sinit(void) devfs_lck_attr = lck_attr_alloc_init(); lck_mtx_init(&devfs_mutex, devfs_lck_grp, devfs_lck_attr); + lck_mtx_init(&devfs_attr_mutex, devfs_lck_grp, devfs_lck_attr); DEVFS_LOCK(); error = dev_add_entry("root", NULL, DEV_DIR, NULL, NULL, NULL, &dev_root); diff --git a/bsd/miscfs/devfs/devfs_vfsops.c b/bsd/miscfs/devfs/devfs_vfsops.c index c5875bd55..f34edc2c5 100644 --- a/bsd/miscfs/devfs/devfs_vfsops.c +++ b/bsd/miscfs/devfs/devfs_vfsops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -93,9 +93,10 @@ static int devfs_statfs( struct mount *mp, struct vfsstatfs *sbp, vfs_context_t ctx); static int devfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t ctx); +#if !defined(SECURE_KERNEL) extern int setup_kmem; __private_extern__ void devfs_setup_kmem(void); - +#endif /*- * Called from the generic VFS startups. @@ -114,9 +115,11 @@ devfs_init(__unused struct vfsconf *vfsp) UID_ROOT, GID_WHEEL, 0622, "console"); devfs_make_node(makedev(2, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666, "tty"); +#if !defined(SECURE_KERNEL) if (setup_kmem) { devfs_setup_kmem(); } +#endif devfs_make_node(makedev(3, 2), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666, "null"); devfs_make_node(makedev(3, 3), DEVFS_CHAR, @@ -131,6 +134,7 @@ devfs_init(__unused struct vfsconf *vfsp) return 0; } +#if !defined(SECURE_KERNEL) __private_extern__ void devfs_setup_kmem(void) { @@ -139,6 +143,7 @@ devfs_setup_kmem(void) devfs_make_node(makedev(3, 1), DEVFS_CHAR, UID_ROOT, GID_KMEM, 0640, "kmem"); } +#endif /*- @@ -495,7 +500,7 @@ devfs_kernel_mount(char * mntname) /* * Get vnode to be covered */ - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, + NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | LOCKLEAF, UIO_SYSSPACE, CAST_USER_ADDR_T(mntname), ctx); if ((error = namei(&nd))) { printf("devfs_kernel_mount: failed to find directory '%s', %d", diff --git a/bsd/miscfs/devfs/devfs_vnops.c b/bsd/miscfs/devfs/devfs_vnops.c index 58746bb73..207a50c01 100644 --- a/bsd/miscfs/devfs/devfs_vnops.c +++ b/bsd/miscfs/devfs/devfs_vnops.c @@ -110,9 +110,79 @@ #include "fdesc.h" #endif /* FDESC */ -static int devfs_update(struct vnode *vp, struct timeval *access, - struct timeval *modify); -void devfs_rele_node(devnode_t *); +static int devfs_update(struct vnode *vp, struct timeval *access, + struct timeval *modify); +void devfs_rele_node(devnode_t *); +static void devfs_consider_time_update(devnode_t *dnp, uint32_t just_changed_flags); +static boolean_t devfs_update_needed(long now_s, long last_s); +void dn_times_locked(devnode_t * dnp, struct timeval *t1, struct timeval *t2, struct timeval *t3, uint32_t just_changed_flags); +void dn_times_now(devnode_t *dnp, uint32_t just_changed_flags); +void dn_mark_for_delayed_times_update(devnode_t *dnp, uint32_t just_changed_flags); + +void +dn_times_locked(devnode_t * dnp, struct timeval *t1, struct timeval *t2, struct timeval *t3, uint32_t just_changed_flags) +{ + + lck_mtx_assert(&devfs_attr_mutex, LCK_MTX_ASSERT_OWNED); + + if (just_changed_flags & DEVFS_UPDATE_ACCESS) { + dnp->dn_atime.tv_sec = t1->tv_sec; + dnp->dn_atime.tv_nsec = t1->tv_usec * 1000; + dnp->dn_access = 0; + } else if (dnp->dn_access) { + dnp->dn_atime.tv_sec = MIN(t1->tv_sec, dnp->dn_atime.tv_sec + DEVFS_LAZY_UPDATE_SECONDS); + dnp->dn_atime.tv_nsec = t1->tv_usec * 1000; + dnp->dn_access = 0; + } + + if (just_changed_flags & DEVFS_UPDATE_MOD) { + dnp->dn_mtime.tv_sec = t2->tv_sec; + dnp->dn_mtime.tv_nsec = t2->tv_usec * 1000; + dnp->dn_update = 0; + } else if (dnp->dn_update) { + dnp->dn_mtime.tv_sec = MIN(t2->tv_sec, dnp->dn_mtime.tv_sec + DEVFS_LAZY_UPDATE_SECONDS); + dnp->dn_mtime.tv_nsec = t2->tv_usec * 1000; + dnp->dn_update = 0; + } + + if (just_changed_flags & DEVFS_UPDATE_CHANGE) { + dnp->dn_ctime.tv_sec = t3->tv_sec; + dnp->dn_ctime.tv_nsec = t3->tv_usec * 1000; + dnp->dn_change = 0; + } else if (dnp->dn_change) { + dnp->dn_ctime.tv_sec = MIN(t3->tv_sec, dnp->dn_ctime.tv_sec + DEVFS_LAZY_UPDATE_SECONDS); + dnp->dn_ctime.tv_nsec = t3->tv_usec * 1000; + 
dnp->dn_change = 0; + } +} + +void +dn_mark_for_delayed_times_update(devnode_t *dnp, uint32_t just_changed_flags) +{ + if (just_changed_flags & DEVFS_UPDATE_CHANGE) { + dnp->dn_change = 1; + } + if (just_changed_flags & DEVFS_UPDATE_ACCESS) { + dnp->dn_access = 1; + } + if (just_changed_flags & DEVFS_UPDATE_MOD) { + dnp->dn_update = 1; + } +} + +/* + * Update times based on pending updates and optionally a set of new changes. + */ +void +dn_times_now(devnode_t * dnp, uint32_t just_changed_flags) +{ + struct timeval now; + + DEVFS_ATTR_LOCK_SPIN(); + microtime(&now); + dn_times_locked(dnp, &now, &now, &now, just_changed_flags); + DEVFS_ATTR_UNLOCK(); +} /* @@ -353,9 +423,6 @@ devfs_getattr(struct vnop_getattr_args *ap) DEVFS_LOCK(); file_node = VTODN(vp); - microtime(&now); - dn_times(file_node, &now, &now, &now); - VATTR_RETURN(vap, va_mode, file_node->dn_mode); /* @@ -402,6 +469,13 @@ devfs_getattr(struct vnop_getattr_args *ap) VATTR_RETURN(vap, va_iosize, MAXPHYSIO); else VATTR_RETURN(vap, va_iosize, vp->v_mount->mnt_vfsstat.f_iosize); + + + DEVFS_ATTR_LOCK_SPIN(); + + microtime(&now); + dn_times_locked(file_node, &now, &now, &now, 0); + /* if the time is bogus, set it to the boot time */ if (file_node->dn_ctime.tv_sec == 0) { file_node->dn_ctime.tv_sec = boottime_sec(); @@ -414,6 +488,9 @@ devfs_getattr(struct vnop_getattr_args *ap) VATTR_RETURN(vap, va_change_time, file_node->dn_ctime); VATTR_RETURN(vap, va_modify_time, file_node->dn_mtime); VATTR_RETURN(vap, va_access_time, file_node->dn_atime); + + DEVFS_ATTR_UNLOCK(); + VATTR_RETURN(vap, va_gen, 0); VATTR_RETURN(vap, va_filerev, 0); VATTR_RETURN(vap, va_acl, NULL); @@ -557,13 +634,11 @@ devfs_close(struct vnop_close_args *ap) { struct vnode * vp = ap->a_vp; register devnode_t * dnp; - struct timeval now; if (vnode_isinuse(vp, 1)) { DEVFS_LOCK(); dnp = VTODN(vp); - microtime(&now); - dn_times(dnp, &now, &now, &now); + dn_times_now(dnp, 0); DEVFS_UNLOCK(); } return (0); @@ -579,19 +654,68 @@ devfsspec_close(struct vnop_close_args *ap) { struct vnode * vp = ap->a_vp; register devnode_t * dnp; - struct timeval now; if (vnode_isinuse(vp, 0)) { DEVFS_LOCK(); - microtime(&now); dnp = VTODN(vp); - dn_times(dnp, &now, &now, &now); + dn_times_now(dnp, 0); DEVFS_UNLOCK(); } return (VOCALL (spec_vnodeop_p, VOFFSET(vnop_close), ap)); } +static boolean_t +devfs_update_needed(long now_s, long last_s) +{ + if (now_s > last_s) { + if (now_s - last_s >= DEVFS_LAZY_UPDATE_SECONDS) { + return TRUE; + } + } + + return FALSE; +} + +/* + * Given a set of time updates required [to happen at some point], either + * make those changes now (and resolve other pending updates) or mark + * the devnode for a subsequent update.
+ */ +static void +devfs_consider_time_update(devnode_t *dnp, uint32_t just_changed_flags) +{ + struct timeval now; + long now_s; + + microtime(&now); + now_s = now.tv_sec; + + if (dnp->dn_change || (just_changed_flags & DEVFS_UPDATE_CHANGE)) { + if (devfs_update_needed(now_s, dnp->dn_ctime.tv_sec)) { + dn_times_now(dnp, just_changed_flags); + return; + } + } + if (dnp->dn_access || (just_changed_flags & DEVFS_UPDATE_ACCESS)) { + if (devfs_update_needed(now_s, dnp->dn_atime.tv_sec)) { + dn_times_now(dnp, just_changed_flags); + return; + } + } + if (dnp->dn_update || (just_changed_flags & DEVFS_UPDATE_MOD)) { + if (devfs_update_needed(now_s, dnp->dn_mtime.tv_sec)) { + dn_times_now(dnp, just_changed_flags); + return; + } + } + + /* Not going to do anything now--mark for later update */ + dn_mark_for_delayed_times_update(dnp, just_changed_flags); + + return; +} + static int devfsspec_read(struct vnop_read_args *ap) /* struct vnop_read_args { @@ -603,7 +727,7 @@ devfsspec_read(struct vnop_read_args *ap) { register devnode_t * dnp = VTODN(ap->a_vp); - dnp->dn_access = 1; + devfs_consider_time_update(dnp, DEVFS_UPDATE_ACCESS); return (VOCALL (spec_vnodeop_p, VOFFSET(vnop_read), ap)); } @@ -619,8 +743,7 @@ devfsspec_write(struct vnop_write_args *ap) { register devnode_t * dnp = VTODN(ap->a_vp); - dnp->dn_change = 1; - dnp->dn_update = 1; + devfs_consider_time_update(dnp, DEVFS_UPDATE_CHANGE | DEVFS_UPDATE_MOD); return (VOCALL (spec_vnodeop_p, VOFFSET(vnop_write), ap)); } @@ -704,8 +827,7 @@ devfs_vnop_remove(struct vnop_remove_args *ap) /*********************************** * Start actually doing things.... * ***********************************/ - tdp->dn_change = 1; - tdp->dn_update = 1; + devfs_consider_time_update(tdp, DEVFS_UPDATE_CHANGE | DEVFS_UPDATE_MOD); /* * Target must be empty if a directory and have no links @@ -741,7 +863,6 @@ devfs_link(struct vnop_link_args *ap) devnode_t * tdp; devdirent_t * tnp; int error = 0; - struct timeval now; /* * First catch an arbitrary restriction for this FS @@ -770,10 +891,7 @@ devfs_link(struct vnop_link_args *ap) /*********************************** * Start actually doing things.... * ***********************************/ - fp->dn_change = 1; - - microtime(&now); - error = devfs_update(vp, &now, &now); + dn_times_now(fp, DEVFS_UPDATE_CHANGE); if (!error) { error = dev_add_name(cnp->cn_nameptr, tdp, NULL, fp, &tnp); @@ -833,7 +951,6 @@ devfs_rename(struct vnop_rename_args *ap) devdirent_t *fnp,*tnp; int doingdirectory = 0; int error = 0; - struct timeval now; DEVFS_LOCK(); /* @@ -914,12 +1031,8 @@ devfs_rename(struct vnop_rename_args *ap) /*********************************** * Start actually doing things.... * ***********************************/ - fp->dn_change = 1; - microtime(&now); + dn_times_now(fp, DEVFS_UPDATE_CHANGE); - if ( (error = devfs_update(fvp, &now, &now)) ) { - goto out; - } /* * Check if just deleting a link name. 
*/ @@ -1192,8 +1305,6 @@ devfs_readdir(struct vnop_readdir_args *ap) name_node = dir_node->dn_typeinfo.Dir.dirlist; nodenumber = 0; - dir_node->dn_access = 1; - while ((name_node || (nodenumber < 2)) && (uio_resid(uio) > 0)) { switch(nodenumber) @@ -1256,6 +1367,8 @@ devfs_readdir(struct vnop_readdir_args *ap) DEVFS_UNLOCK(); uio->uio_offset = pos; + devfs_consider_time_update(dir_node, DEVFS_UPDATE_ACCESS); + return (error); } @@ -1405,8 +1518,11 @@ devfs_update(struct vnode *vp, struct timeval *access, struct timeval *modify) return (0); } + + DEVFS_ATTR_LOCK_SPIN(); microtime(&now); - dn_times(ip, access, modify, &now); + dn_times_locked(ip, access, modify, &now, DEVFS_UPDATE_ACCESS | DEVFS_UPDATE_MOD); + DEVFS_ATTR_UNLOCK(); return (0); } diff --git a/bsd/miscfs/devfs/devfsdefs.h b/bsd/miscfs/devfs/devfsdefs.h index ce85cf853..e8b12000a 100644 --- a/bsd/miscfs/devfs/devfsdefs.h +++ b/bsd/miscfs/devfs/devfsdefs.h @@ -178,6 +178,7 @@ struct devdirent extern devdirent_t * dev_root; extern struct devfs_stats devfs_stats; extern lck_mtx_t devfs_mutex; +extern lck_mtx_t devfs_attr_mutex; /* * Rules for front nodes: @@ -214,9 +215,10 @@ struct devfsmount #define VTODN(vp) ((devnode_t *)(vp)->v_data) #define DEVFS_LOCK() lck_mtx_lock(&devfs_mutex) - #define DEVFS_UNLOCK() lck_mtx_unlock(&devfs_mutex) +#define DEVFS_ATTR_LOCK_SPIN() lck_mtx_lock_spin(&devfs_attr_mutex); +#define DEVFS_ATTR_UNLOCK() lck_mtx_unlock(&devfs_attr_mutex); /* * XXX all the (SInt32 *) casts below assume sizeof(int) == sizeof(long) @@ -269,34 +271,32 @@ DEVFS_DECR_STRINGSPACE(int space) OSAddAtomic(-space, &devfs_stats.stringspace); } -static __inline__ void -dn_times(devnode_t * dnp, struct timeval *t1, struct timeval *t2, struct timeval *t3) -{ - if (dnp->dn_access) { - dnp->dn_atime.tv_sec = t1->tv_sec; - dnp->dn_atime.tv_nsec = t1->tv_usec * 1000; - dnp->dn_access = 0; - } - if (dnp->dn_update) { - dnp->dn_mtime.tv_sec = t2->tv_sec; - dnp->dn_mtime.tv_nsec = t2->tv_usec * 1000; - dnp->dn_update = 0; - } - if (dnp->dn_change) { - dnp->dn_ctime.tv_sec = t3->tv_sec; - dnp->dn_ctime.tv_nsec = t3->tv_usec * 1000; - dnp->dn_change = 0; - } - - return; -} +/* + * Access, change, and modify times are protected by a separate lock, + * which allows tty times to be updated (no more than once per second) + * in the I/O path without too much fear of contention. + * + * For getattr, update times to current time if the last update was recent; + * preserve legacy behavior that frequent stats can yield sub-second resolutions. + * If the last time is old, however, we know that the event that triggered + * the need for an update was no more than 1s after the last update. In that case, + * use (last update + 1s) as the time, avoiding the illusion that last update happened + * much later than it really did. 
+ */ +#define DEVFS_LAZY_UPDATE_SECONDS 1 + +#define DEVFS_UPDATE_CHANGE 0x1 +#define DEVFS_UPDATE_MOD 0x2 +#define DEVFS_UPDATE_ACCESS 0x4 static __inline__ void dn_copy_times(devnode_t * target, devnode_t * source) { + DEVFS_ATTR_LOCK_SPIN(); target->dn_atime = source->dn_atime; target->dn_mtime = source->dn_mtime; target->dn_ctime = source->dn_ctime; + DEVFS_ATTR_UNLOCK(); return; } diff --git a/bsd/miscfs/fifofs/Makefile b/bsd/miscfs/fifofs/Makefile index ff18c9388..d70a3ab16 100644 --- a/bsd/miscfs/fifofs/Makefile +++ b/bsd/miscfs/fifofs/Makefile @@ -9,14 +9,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ DATAFILES = \ diff --git a/bsd/miscfs/nullfs/null.h b/bsd/miscfs/nullfs/null.h deleted file mode 100644 index 3209be3d9..000000000 --- a/bsd/miscfs/nullfs/null.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* - * Copyright (c) 1992, 1993 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software donated to Berkeley by - * Jan-Simon Pendry. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)null.h 8.3 (Berkeley) 8/20/94 - * - * null.h 8.2 (Berkeley) 1/21/94 - */ -#ifdef __NULLFS_NULL_H__ -#define __NULLFS_NULL_H__ - -#include <sys/appleapiopts.h> - -#ifdef __APPLE_API_PRIVATE -struct null_args { - char *target; /* Target of loopback */ -}; - -struct null_mount { - struct mount *nullm_vfs; - struct vnode *nullm_rootvp; /* Reference to root null_node */ -}; - -#ifdef KERNEL -/* LP64 version of null_args. all pointers - * grow when we're dealing with a 64-bit process. - * WARNING - keep in sync with null_args - */ -struct user_null_args { - user_addr_t target; /* Target of loopback */ -}; - -/* - * A cache of vnode references - */ -struct null_node { - LIST_ENTRY(null_node) null_hash; /* Hash list */ - struct vnode *null_lowervp; /* VREFed once */ - struct vnode *null_vnode; /* Back pointer */ -}; - -extern int null_node_create(struct mount *mp, struct vnode *target, struct vnode **vpp); - -#define MOUNTTONULLMOUNT(mp) ((struct null_mount *)((mp)->mnt_data)) -#define VTONULL(vp) ((struct null_node *)(vp)->v_data) -#define NULLTOV(xp) ((xp)->null_vnode) -#ifdef NULLFS_DIAGNOSTIC -extern struct vnode *null_checkvp(struct vnode *vp, char *fil, int lno); -#define NULLVPTOLOWERVP(vp) null_checkvp((vp), __FILE__, __LINE__) -#else -#define NULLVPTOLOWERVP(vp) (VTONULL(vp)->null_lowervp) -#endif - -extern int (**null_vnodeop_p)(void *); -extern struct vfsops null_vfsops; -#endif /* KERNEL */ - -#endif /* __APPLE_API_PRIVATE */ -#endif /* __NULLFS_NULL_H__ */ diff --git a/bsd/miscfs/nullfs/null_subr.c b/bsd/miscfs/nullfs/null_subr.c deleted file mode 100644 index d061bb77f..000000000 --- a/bsd/miscfs/nullfs/null_subr.c +++ /dev/null @@ -1,304 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* - * Copyright (c) 1992, 1993 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software donated to Berkeley by - * Jan-Simon Pendry. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)null_subr.c 8.7 (Berkeley) 5/14/95 - * - * null_subr.c 8.4 (Berkeley) 1/21/94 - */ - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/proc.h> -#include <sys/time.h> -#include <sys/types.h> -#include <sys/vnode.h> -#include <sys/mount_internal.h> -#include <sys/namei.h> -#include <sys/malloc.h> -#include <sys/ubc.h> -#include <miscfs/nullfs/null.h> - -#define LOG2_SIZEVNODE 7 /* log2(sizeof struct vnode) */ -#define NNULLNODECACHE 16 - -/* - * Null layer cache: - * Each cache entry holds a reference to the lower vnode - * along with a pointer to the alias vnode. When an - * entry is added the lower vnode is vnode_get'd. When the - * alias is removed the lower vnode is vnode_put'd. 
- */ - -#define NULL_NHASH(vp) \ - (&null_node_hashtbl[(((uintptr_t)vp)>>LOG2_SIZEVNODE) & null_node_hash]) -LIST_HEAD(null_node_hashhead, null_node) *null_node_hashtbl; -u_long null_node_hash; - -/* - * Initialise cache headers - */ -nullfs_init() -{ - -#ifdef NULLFS_DIAGNOSTIC - printf("nullfs_init\n"); /* printed during system boot */ -#endif - null_node_hashtbl = hashinit(NNULLNODECACHE, M_CACHE, &null_node_hash); -} - -/* - * Return a vnode_get'ed alias for lower vnode if already exists, else 0. - */ -static struct vnode * -null_node_find(mp, lowervp) - struct mount *mp; - struct vnode *lowervp; -{ - struct proc *p = curproc; /* XXX */ - struct null_node_hashhead *hd; - struct null_node *a; - struct vnode *vp; - - /* - * Find hash base, and then search the (two-way) linked - * list looking for a null_node structure which is referencing - * the lower vnode. If found, the increment the null_node - * reference count (but NOT the lower vnode's vnode_get counter). - */ - hd = NULL_NHASH(lowervp); -loop: - for (a = hd->lh_first; a != 0; a = a->null_hash.le_next) { - if (a->null_lowervp == lowervp && NULLTOV(a)->v_mount == mp) { - vp = NULLTOV(a); - - if (vnode_get(vp)) { - printf ("null_node_find: vget failed.\n"); - goto loop; - }; - return (vp); - } - } - - return NULL; -} - - -/* - * Make a new null_node node. - * Vp is the alias vnode, lofsvp is the lower vnode. - * Maintain a reference to (lowervp). - */ -static int -null_node_alloc(mp, lowervp, vpp) - struct mount *mp; - struct vnode *lowervp; - struct vnode **vpp; -{ - struct null_node_hashhead *hd; - struct null_node *xp; - struct vnode *othervp, *vp; - int error; - - MALLOC(xp, struct null_node *, sizeof(struct null_node), M_TEMP, M_WAITOK); - if (error = getnewvnode(VT_NULL, mp, null_vnodeop_p, vpp)) { - FREE(xp, M_TEMP); - return (error); - } - vp = *vpp; - - vp->v_type = lowervp->v_type; - xp->null_vnode = vp; - vp->v_data = xp; - xp->null_lowervp = lowervp; - /* - * Before we insert our new node onto the hash chains, - * check to see if someone else has beaten us to it. - */ - if (othervp = null_node_find(lowervp)) { - FREE(xp, M_TEMP); - vp->v_type = VBAD; /* node is discarded */ - vp->v_usecount = 0; /* XXX */ - vp->v_data = 0; /* prevent access to freed data */ - *vpp = othervp; - return 0; - }; - if (vp->v_type == VREG) - ubc_info_init(vp); - vnode_get(lowervp); /* Extra vnode_get will be vnode_put'd in null_node_create */ - hd = NULL_NHASH(lowervp); - LIST_INSERT_HEAD(hd, xp, null_hash); - return 0; -} - - -/* - * Try to find an existing null_node vnode refering - * to it, otherwise make a new null_node vnode which - * contains a reference to the lower vnode. - */ -int -null_node_create(mp, lowervp, newvpp) - struct mount *mp; - struct vnode *lowervp; - struct vnode **newvpp; -{ - struct vnode *aliasvp; - - if (aliasvp = null_node_find(mp, lowervp)) { - /* - * null_node_find has taken another reference - * to the alias vnode. - */ -#ifdef NULLFS_DIAGNOSTIC - vprint("null_node_create: exists", NULLTOV(ap)); -#endif - /* vnode_get(aliasvp); --- done in null_node_find */ - } else { - int error; - - /* - * Get new vnode. - */ -#ifdef NULLFS_DIAGNOSTIC - printf("null_node_create: create new alias vnode\n"); -#endif - - /* - * Make new vnode reference the null_node. - */ - if (error = null_node_alloc(mp, lowervp, &aliasvp)) - return error; - - /* - * aliasvp is already vnode_get'd by getnewvnode() - */ - } - - vnode_put(lowervp); - -#if DIAGNOSTIC - if (lowervp->v_usecount < 1) { - /* Should never happen... 
*/ - vprint ("null_node_create: alias ", aliasvp); - vprint ("null_node_create: lower ", lowervp); - panic ("null_node_create: lower has 0 usecount."); - }; -#endif - -#ifdef NULLFS_DIAGNOSTIC - vprint("null_node_create: alias", aliasvp); - vprint("null_node_create: lower", lowervp); -#endif - - *newvpp = aliasvp; - return (0); -} -#ifdef NULLFS_DIAGNOSTIC -struct vnode * -null_checkvp(vp, fil, lno) - struct vnode *vp; - char *fil; - int lno; -{ - struct null_node *a = VTONULL(vp); -#ifdef notyet - /* - * Can't do this check because vnop_reclaim runs - * with a funny vop vector. - */ - if (vp->v_op != null_vnodeop_p) { - printf ("null_checkvp: on non-null-node\n"); - while (null_checkvp_barrier) /*WAIT*/ ; - panic("null_checkvp"); - }; -#endif - if (a->null_lowervp == NULL) { - /* Should never happen */ - int i; uint32_t *p; - printf("vp = %x, ZERO ptr\n", vp); - for (p = (uint32_t *) a, i = 0; i < 8; i++) - printf(" %x", p[i]); - printf("\n"); - /* wait for debugger */ - while (null_checkvp_barrier) /*WAIT*/ ; - panic("null_checkvp"); - } - if (a->null_lowervp->v_usecount < 1) { - int i; uint32_t *p; - printf("vp = %x, unref'ed lowervp\n", vp); - for (p = (uint32_t *) a, i = 0; i < 8; i++) - printf(" %x", p[i]); - printf("\n"); - /* wait for debugger */ - while (null_checkvp_barrier) /*WAIT*/ ; - panic ("null with unref'ed lowervp"); - }; -#ifdef notyet - printf("null %x/%d -> %x/%d [%s, %d]\n", - NULLTOV(a), NULLTOV(a)->v_usecount, - a->null_lowervp, a->null_lowervp->v_usecount, - fil, lno); -#endif - return a->null_lowervp; -} -#endif diff --git a/bsd/miscfs/nullfs/null_vfsops.c b/bsd/miscfs/nullfs/null_vfsops.c deleted file mode 100644 index e81c64059..000000000 --- a/bsd/miscfs/nullfs/null_vfsops.c +++ /dev/null @@ -1,382 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* - * Copyright (c) 1992, 1993 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software donated to Berkeley by - * Jan-Simon Pendry. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)null_vfsops.c 8.7 (Berkeley) 5/14/95 - * - * @(#)lofs_vfsops.c 1.2 (Berkeley) 6/18/92 - */ - -/* - * Null Layer - * (See null_vnops.c for a description of what this does.) - */ - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/proc.h> -#include <sys/kauth.h> -#include <sys/time.h> -#include <sys/types.h> -#include <sys/vnode.h> -#include <sys/mount_internal.h> -#include <sys/namei.h> -#include <sys/malloc.h> -#include <miscfs/nullfs/null.h> - -/* - * Mount null layer - */ -static int -nullfs_mount(mp, devvp, data, context) - struct mount *mp; - vnode_t devvp; - user_addr_t data; - vfs_context_t context; -{ - int error = 0; - struct user_null_args args; - struct vnode *lowerrootvp, *vp; - struct vnode *nullm_rootvp; - struct null_mount *xmp; - u_int size; - -#ifdef NULLFS_DIAGNOSTIC - printf("nullfs_mount(mp = %x)\n", mp); -#endif - - /* - * Update is a no-op - */ - if (mp->mnt_flag & MNT_UPDATE) { - return (ENOTSUP); - /* return VFS_MOUNT(MOUNTTONULLMOUNT(mp)->nullm_vfs, devvp, data, p);*/ - } - - /* - * Get argument - */ - if (vfs_context_is64bit(context)) { - error = copyin(data, (caddr_t)&args, sizeof (args)); - } - else { - struct null_args temp; - error = copyin(data, (caddr_t)&temp, sizeof (temp)); - args.target = CAST_USER_ADDR_T(temp.target); - } - if (error) - return (error); - - /* - * Find lower node - */ - NDINIT(ndp, LOOKUP, FOLLOW|WANTPARENT|LOCKLEAF, - UIO_USERSPACE, args.target, context); - if (error = namei(ndp)) - return (error); - nameidone(ndp); - /* - * Sanity check on lower vnode - */ - lowerrootvp = ndp->ni_vp; - - vnode_put(ndp->ni_dvp); - ndp->ni_dvp = NULL; - - xmp = (struct null_mount *) _MALLOC(sizeof(struct null_mount), - M_UFSMNT, M_WAITOK); /* XXX */ - - /* - * Save reference to underlying FS - */ - xmp->nullm_vfs = lowerrootvp->v_mount; - - /* - * Save reference. Each mount also holds - * a reference on the root vnode. 
- */ - error = null_node_create(mp, lowerrootvp, &vp); - /* - * Make sure the node alias worked - */ - if (error) { - vnode_put(lowerrootvp); - FREE(xmp, M_UFSMNT); /* XXX */ - return (error); - } - - /* - * Keep a held reference to the root vnode. - * It is vnode_put'd in nullfs_unmount. - */ - nullm_rootvp = vp; - nullm_rootvp->v_flag |= VROOT; - xmp->nullm_rootvp = nullm_rootvp; - if (NULLVPTOLOWERVP(nullm_rootvp)->v_mount->mnt_flag & MNT_LOCAL) - mp->mnt_flag |= MNT_LOCAL; - mp->mnt_data = (qaddr_t) xmp; - vfs_getnewfsid(mp); - - (void) copyinstr(args.target, mp->mnt_vfsstat.f_mntfromname, MAXPATHLEN - 1, - &size); - bzero(mp->mnt_vfsstat.f_mntfromname + size, MNAMELEN - size); -#ifdef NULLFS_DIAGNOSTIC - printf("nullfs_mount: lower %s, alias at %s\n", - mp->mnt_vfsstat.f_mntfromname, mp->mnt_vfsstat.f_mntonname); -#endif - return (0); -} - -/* - * VFS start. Nothing needed here - the start routine - * on the underlying filesystem will have been called - * when that filesystem was mounted. - */ -static int -nullfs_start(mp, flags, context) - struct mount *mp; - int flags; - vfs_context_t context; -{ - return (0); - /* return VFS_START(MOUNTTONULLMOUNT(mp)->nullm_vfs, flags, context); */ -} - -/* - * Free reference to null layer - */ -static int -nullfs_unmount(mp, mntflags, context) - struct mount *mp; - int mntflags; - vfs_context_t context; -{ - struct vnode *nullm_rootvp = MOUNTTONULLMOUNT(mp)->nullm_rootvp; - int error; - int flags = 0; - int force = 0; - -#ifdef NULLFS_DIAGNOSTIC - printf("nullfs_unmount(mp = %x)\n", mp); -#endif - - if (mntflags & MNT_FORCE) { - flags |= FORCECLOSE; - force = 1; - } - - if ( (nullm_rootvp->v_usecount > 1) && !force ) - return (EBUSY); - if ( (error = vflush(mp, nullm_rootvp, flags)) && !force ) - return (error); - -#ifdef NULLFS_DIAGNOSTIC - vprint("alias root of lower", nullm_rootvp); -#endif - /* - * Release reference on underlying root vnode - */ - vnode_put(nullm_rootvp); - /* - * And blow it away for future re-use - */ - vnode_reclaim(nullm_rootvp); - /* - * Finally, throw away the null_mount structure - */ - FREE(mp->mnt_data, M_UFSMNT); /* XXX */ - mp->mnt_data = 0; - return 0; -} - -static int -nullfs_root(mp, vpp, context) - struct mount *mp; - struct vnode **vpp; - vfs_context_t context; -{ - struct proc *p = curproc; /* XXX */ - struct vnode *vp; - -#ifdef NULLFS_DIAGNOSTIC - printf("nullfs_root(mp = %x, vp = %x->%x)\n", mp, - MOUNTTONULLMOUNT(mp)->nullm_rootvp, - NULLVPTOLOWERVP(MOUNTTONULLMOUNT(mp)->nullm_rootvp) - ); -#endif - - /* - * Return locked reference to root. 
- */ - vp = MOUNTTONULLMOUNT(mp)->nullm_rootvp; - vnode_get(vp); - *vpp = vp; - return 0; -} - -static int -nullfs_quotactl(mp, cmd, uid, datap, context) - struct mount *mp; - int cmd; - uid_t uid; - caddr_t datap; - vfs_context_t context; -{ - return VFS_QUOTACTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, uid, datap, context); -} - -static int -nullfs_statfs(mp, sbp, context) - struct mount *mp; - struct vfsstatfs *sbp; - vfs_context_t context; -{ - int error; - struct vfsstatfs mstat; - -#ifdef NULLFS_DIAGNOSTIC - printf("nullfs_statfs(mp = %x, vp = %x->%x)\n", mp, - MOUNTTONULLMOUNT(mp)->nullm_rootvp, - NULLVPTOLOWERVP(MOUNTTONULLMOUNT(mp)->nullm_rootvp) - ); -#endif - - bzero(&mstat, sizeof(mstat)); - - error = VFS_STATFS(MOUNTTONULLMOUNT(mp)->nullm_vfs, &mstat, context); - if (error) - return (error); - - /* now copy across the "interesting" information and fake the rest */ - //sbp->f_type = mstat.f_type; - sbp->f_flags = mstat.f_flags; - sbp->f_bsize = mstat.f_bsize; - sbp->f_iosize = mstat.f_iosize; - sbp->f_blocks = mstat.f_blocks; - sbp->f_bfree = mstat.f_bfree; - sbp->f_bavail = mstat.f_bavail; - sbp->f_files = mstat.f_files; - sbp->f_ffree = mstat.f_ffree; - return (0); -} - -static int -nullfs_sync(__unused struct mount *mp, __unused int waitfor, - __unused kauth_cred_t cred, __unused vfs_context_t context) -{ - /* - * XXX - Assumes no data cached at null layer. - */ - return (0); -} - -static int -nullfs_vget(mp, ino, vpp, context) - struct mount *mp; - ino64_t ino; - struct vnode **vpp; - vfs_context_t context; -{ - - return VFS_VGET(MOUNTTONULLMOUNT(mp)->nullm_vfs, ino, vpp, context); -} - -static int -nullfs_fhtovp(mp, fhlen, fhp, vpp, context) - struct mount *mp; - int fhlen; - unsigned char *fhp; - struct vnode **vpp; - vfs_context_t context; -{ - - return VFS_FHTOVP(MOUNTTONULLMOUNT(mp)->nullm_vfs, fhlen, fhp, vpp, context); -} - -static int -nullfs_vptofh(vp, fhlenp, fhp, context) - struct vnode *vp; - int *fhlenp; - unsigned char *fhp; - vfs_context_t context; -{ - return VFS_VPTOFH(NULLVPTOLOWERVP(vp), fhlenp, fhp, context); -} - -int nullfs_init (struct vfsconf *); - -#define nullfs_sysctl (int (*) (int *, u_int, user_addr_t, size_t *, user_addr_t, size_t, proc_t))eopnotsupp - -struct vfsops null_vfsops = { - nullfs_mount, - nullfs_start, - nullfs_unmount, - nullfs_root, - nullfs_quotactl, - nullfs_statfs, - nullfs_sync, - nullfs_vget, - nullfs_fhtovp, - nullfs_vptofh, - nullfs_init, - nullfs_sysctl -}; diff --git a/bsd/miscfs/nullfs/null_vnops.c b/bsd/miscfs/nullfs/null_vnops.c deleted file mode 100644 index 4b2fb2bbf..000000000 --- a/bsd/miscfs/nullfs/null_vnops.c +++ /dev/null @@ -1,570 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* - * Copyright (c) 1992, 1993 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * John Heidemann of the UCLA Ficus project. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)null_vnops.c 8.6 (Berkeley) 5/27/95 - * - * Ancestors: - * @(#)lofs_vnops.c 1.2 (Berkeley) 6/18/92 - * ...and... - * @(#)null_vnodeops.c 1.20 92/07/07 UCLA Ficus project - */ - -/* - * Null Layer - * - * (See mount_null(8) for more information.) - * - * The null layer duplicates a portion of the file system - * name space under a new name. In this respect, it is - * similar to the loopback file system. It differs from - * the loopback fs in two respects: it is implemented using - * a stackable layers techniques, and it's "null-node"s stack above - * all lower-layer vnodes, not just over directory vnodes. - * - * The null layer has two purposes. First, it serves as a demonstration - * of layering by proving a layer which does nothing. (It actually - * does everything the loopback file system does, which is slightly - * more than nothing.) Second, the null layer can serve as a prototype - * layer. 
Since it provides all necessary layer framework,
- * new file system layers can be created very easily by starting
- * with a null layer.
- *
- * The remainder of this man page examines the null layer as a basis
- * for constructing new layers.
- *
- *
- * INSTANTIATING NEW NULL LAYERS
- *
- * New null layers are created with mount_null(8).
- * Mount_null(8) takes two arguments, the pathname
- * of the lower vfs (target-pn) and the pathname where the null
- * layer will appear in the namespace (alias-pn). After
- * the null layer is put into place, the contents
- * of target-pn subtree will be aliased under alias-pn.
- *
- *
- * OPERATION OF A NULL LAYER
- *
- * The null layer is the minimum file system layer,
- * simply bypassing all possible operations to the lower layer
- * for processing there. The majority of its activity centers
- * on the bypass routine, through which nearly all vnode operations
- * pass.
- *
- * The bypass routine accepts arbitrary vnode operations for
- * handling by the lower layer. It begins by examining vnode
- * operation arguments and replacing any null-nodes by their
- * lower-layer equivalents. It then invokes the operation
- * on the lower layer. Finally, it replaces the null-nodes
- * in the arguments and, if a vnode is returned by the operation,
- * stacks a null-node on top of the returned vnode.
- *
- * Although bypass handles most operations, vnop_getattr, vnop_lock,
- * vnop_unlock, vnop_inactive, vnop_reclaim, and vnop_print are not
- * bypassed. Vop_getattr must change the fsid being returned.
- * Vop_lock and vnop_unlock must handle any locking for the
- * current vnode as well as pass the lock request down.
- * Vop_inactive and vnop_reclaim are not bypassed so that
- * they can handle freeing null-layer specific data. Vop_print
- * is not bypassed to avoid excessive debugging information.
- * Also, certain vnode operations change the locking state within
- * the operation (create, mknod, remove, link, rename, mkdir, rmdir,
- * and symlink). Ideally these operations should not change the
- * lock state, but should be changed to let the caller of the
- * function unlock them. Otherwise all intermediate vnode layers
- * (such as union, umapfs, etc) must catch these functions to do
- * the necessary locking at their layer.
- *
- *
- * INSTANTIATING VNODE STACKS
- *
- * Mounting associates the null layer with a lower layer,
- * in effect stacking two VFSes. Vnode stacks are instead
- * created on demand as files are accessed.
- *
- * The initial mount creates a single vnode stack for the
- * root of the new null layer. All other vnode stacks
- * are created as a result of vnode operations on
- * this or other null vnode stacks.
- *
- * New vnode stacks come into existence as a result of
- * an operation which returns a vnode.
- * The bypass routine stacks a null-node above the new
- * vnode before returning it to the caller.
- *
- * For example, imagine mounting a null layer with
- * "mount_null /usr/include /dev/layer/null".
- * Changing directory to /dev/layer/null will assign
- * the root null-node (which was created when the null layer was mounted).
- * Now consider opening "sys". A vnop_lookup would be
- * done on the root null-node. This operation would bypass through
- * to the lower layer which would return a vnode representing
- * the UFS "sys". Null_bypass then builds a null-node
- * aliasing the UFS "sys" and returns this to the caller.
- * Later operations on the null-node "sys" will repeat this
- * process when constructing other vnode stacks.
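[Editor's note] The one-alias-per-lower-vnode behavior described above is usually backed by a small hash keyed on the lower vnode's address: before stacking a new null-node, the layer checks whether an alias already exists. A simplified sketch of that lookup, with purely illustrative ex_* names (not the actual null_node_create() internals):

    /*
     * Sketch: find an existing null-layer alias for a lower vnode.
     * If none exists, the caller creates a new null-node whose private
     * data records the lower vnode it shadows.
     */
    #define EX_NHASH 16
    #define EX_HASH(lvp) ((((uintptr_t)(lvp)) >> 8) & (EX_NHASH - 1))

    struct ex_node {
        struct ex_node *next;
        struct vnode   *lowervp;    /* shadowed lower-layer vnode */
        struct vnode   *nullvp;     /* its null-layer alias */
    };

    static struct ex_node *ex_hash_table[EX_NHASH];

    static struct vnode *
    ex_find_alias(struct vnode *lowervp)
    {
        struct ex_node *np;

        for (np = ex_hash_table[EX_HASH(lowervp)]; np != NULL; np = np->next) {
            if (np->lowervp == lowervp)
                return (np->nullvp);    /* reuse the existing stack */
        }
        return (NULL);    /* caller allocates a new null-node */
    }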
- *
- *
- * CREATING OTHER FILE SYSTEM LAYERS
- *
- * One of the easiest ways to construct new file system layers is to make
- * a copy of the null layer, rename all files and variables, and
- * then begin modifying the copy. Sed can be used to easily rename
- * all variables.
- *
- * The umap layer is an example of a layer descended from the
- * null layer.
- *
- *
- * INVOKING OPERATIONS ON LOWER LAYERS
- *
- * There are two techniques to invoke operations on a lower layer
- * when the operation cannot be completely bypassed. Each method
- * is appropriate in different situations. In both cases,
- * it is the responsibility of the aliasing layer to make
- * the operation arguments "correct" for the lower layer
- * by mapping any vnode arguments to the lower layer.
- *
- * The first approach is to call the aliasing layer's bypass routine.
- * This method is most suitable when you wish to invoke the operation
- * currently being handled on the lower layer. It has the advantage
- * that the bypass routine already must do argument mapping.
- * An example of this is null_getattrs in the null layer.
- *
- * A second approach is to directly invoke vnode operations on
- * the lower layer with the VOP_OPERATIONNAME interface.
- * The advantage of this method is that it is easy to invoke
- * arbitrary operations on the lower layer. The disadvantage
- * is that vnode arguments must be manually mapped.
- *
- */
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/proc.h>
-#include <sys/kauth.h>
-#include <sys/time.h>
-#include <sys/types.h>
-#include <sys/vnode.h>
-#include <sys/mount_internal.h>
-#include <sys/namei.h>
-#include <sys/malloc.h>
-#include <sys/buf.h>
-#include <miscfs/nullfs/null.h>
-
-
-int null_bug_bypass = 0;   /* for debugging: enables bypass printf'ing */
-
-/*
- * This is the 10-Apr-92 bypass routine.
- * This version has been optimized for speed, throwing away some
- * safety checks. It should still always work, but it's not as
- * robust to programmer errors.
- * Define SAFETY to include some error checking code.
- *
- * In general, we map all vnodes going down and unmap them on the way back.
- * As an exception to this, vnodes can be marked "unmapped" by setting
- * the Nth bit in operation's vdesc_flags.
- *
- * Also, some BSD vnode operations have the side effect of vnode_put'ing
- * their arguments. With stacking, the reference counts are held
- * by the upper node, not the lower one, so we must handle these
- * side-effects here. This is not of concern in Sun-derived systems
- * since there are no such side-effects.
- *
- * This makes the following assumptions:
- * - only one returned vpp
- * - no INOUT vpp's (Sun's vnop_open has one of these)
- * - the vnode operation vector of the first vnode should be used
- *   to determine what implementation of the op should be invoked
- * - all mapped vnodes are of our vnode-type (NEEDSWORK:
- *   problems on rmdir'ing mount points and renaming?)
- */
-int
-null_bypass(ap)
-	struct vnop_generic_args /* {
-		struct vnodeop_desc *a_desc;
-		<other random data follows, presumably>
-	} */ *ap;
-{
-	extern int (**null_vnodeop_p)(void *);  /* not extern, really "forward" */
-	register struct vnode **this_vp_p;
-	int error;
-	struct vnode *old_vps[VDESC_MAX_VPS];
-	struct vnode **vps_p[VDESC_MAX_VPS];
-	struct vnode ***vppp;
-	struct vnodeop_desc *descp = ap->a_desc;
-	int reles, i;
-
-	if (null_bug_bypass)
-		printf ("null_bypass: %s\n", descp->vdesc_name);
-
-#ifdef SAFETY
-	/*
-	 * We require at least one vp.
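[Editor's note] The generic mapping that follows works because each vnodeop_desc records the byte offsets of every vnode pointer inside the operation's argument structure, so the bypass routine can rewrite them without knowing the structure's type. A self-contained sketch of that offset trick, with illustrative EX_* stand-ins for the real VOPARG_OFFSETTO/VDESC_NO_OFFSET machinery:

    /*
     * Sketch: locate a vnode pointer inside an opaque argument block
     * given only a table of byte offsets, as null_bypass() does.
     */
    #include <stddef.h>

    #define EX_NO_OFFSET (-1)
    #define EX_OFFSETTO(type, off, ap) ((type)(void *)((char *)(ap) + (off)))

    struct ex_open_args {
        void         *a_desc;
        struct vnode *a_vp;
        int           a_mode;
    };

    /* offsets a description table might record for ex_open_args */
    static const int ex_open_vp_offsets[] = {
        offsetof(struct ex_open_args, a_vp),
        EX_NO_OFFSET            /* terminator, like VDESC_NO_OFFSET */
    };

    static struct vnode **
    ex_first_vp(void *ap)
    {
        /* the same computation VOPARG_OFFSETTO performs in null_bypass() */
        return (EX_OFFSETTO(struct vnode **, ex_open_vp_offsets[0], ap));
    }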
- */ - if (descp->vdesc_vp_offsets == NULL || - descp->vdesc_vp_offsets[0] == VDESC_NO_OFFSET) - panic ("null_bypass: no vp's in map.\n"); -#endif - - /* - * Map the vnodes going in. - * Later, we'll invoke the operation based on - * the first mapped vnode's operation vector. - */ - reles = descp->vdesc_flags; - for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) { - if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET) - break; /* bail out at end of list */ - vps_p[i] = this_vp_p = - VOPARG_OFFSETTO(struct vnode**,descp->vdesc_vp_offsets[i],ap); - /* - * We're not guaranteed that any but the first vnode - * are of our type. Check for and don't map any - * that aren't. (We must always map first vp or vclean fails.) - */ - if (i && (*this_vp_p == NULL || - (*this_vp_p)->v_op != null_vnodeop_p)) { - old_vps[i] = NULL; - } else { - old_vps[i] = *this_vp_p; - *(vps_p[i]) = NULLVPTOLOWERVP(*this_vp_p); - /* - * XXX - Several operations have the side effect - * of vnode_put'ing their vp's. We must account for - * that. (This should go away in the future.) - */ - if (reles & 1) - vnode_get(*this_vp_p); - } - - } - - /* - * Call the operation on the lower layer - * with the modified argument structure. - */ - error = VCALL(*(vps_p[0]), descp->vdesc_offset, ap); - - /* - * Maintain the illusion of call-by-value - * by restoring vnodes in the argument structure - * to their original value. - */ - reles = descp->vdesc_flags; - for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) { - if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET) - break; /* bail out at end of list */ - if (old_vps[i]) { - *(vps_p[i]) = old_vps[i]; - if (reles & 1) - vnode_put(*(vps_p[i])); - } - } - - /* - * Map the possible out-going vpp - * (Assumes that the lower layer always returns - * a vnode_get'ed vpp unless it gets an error.) - */ - if (descp->vdesc_vpp_offset != VDESC_NO_OFFSET && - !(descp->vdesc_flags & VDESC_NOMAP_VPP) && - !error) { - /* - * XXX - even though some ops have vpp returned vp's, - * several ops actually vnode_put this before returning. - * We must avoid these ops. - * (This should go away when these ops are regularized.) - */ - if (descp->vdesc_flags & VDESC_VPP_WILLRELE) - goto out; - vppp = VOPARG_OFFSETTO(struct vnode***, - descp->vdesc_vpp_offset,ap); - error = null_node_create(old_vps[0]->v_mount, **vppp, *vppp); - } - - out: - return (error); -} - -/* - * We have to carry on the locking protocol on the null layer vnodes - * as we progress through the tree. We also have to enforce read-only - * if this layer is mounted read-only. - */ -null_lookup(ap) - struct vnop_lookup_args /* { - struct vnode * a_dvp; - struct vnode ** a_vpp; - struct componentname * a_cnp; - vfs_context_t a_context; - } */ *ap; -{ - struct componentname *cnp = ap->a_cnp; - struct proc *p = cnp->cn_proc; - int flags = cnp->cn_flags; - struct vnode *dvp, *vp; - int error; - - error = null_bypass(ap); - - /* - * We must do the same locking and unlocking at this layer as - * is done in the layers below us. We could figure this out - * based on the error return and the LASTCN, LOCKPARENT, and - * LOCKLEAF flags. However, it is more expidient to just find - * out the state of the lower level vnodes and set ours to the - * same state. - */ - dvp = ap->a_dvp; - vp = *ap->a_vpp; - if (dvp == vp) - return (error); - return (error); -} - -/* - * Setattr call. 
- */ -int -null_setattr( - struct vnop_setattr_args /* { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - struct vnode_attr *a_vap; - kauth_cred_t a_cred; - struct proc *a_p; - } */ *ap) -{ - struct vnode *vp = ap->a_vp; - struct vnode_attr *vap = ap->a_vap; - - if (VATTR_IS_ACTIVE(vap, va_data_size)) { - switch (vp->v_type) { - case VDIR: - return (EISDIR); - case VCHR: - case VBLK: - case VSOCK: - case VFIFO: - return (0); - case VREG: - case VLNK: - default: - } - } - return (null_bypass(ap)); -} - -/* - * We handle getattr only to change the fsid. - */ -int -null_getattr(ap) - struct vnop_getattr_args /* { - struct vnode *a_vp; - struct vnode_attr *a_vap; - vfs_context_t a_context; - } */ *ap; -{ - int error; - - if (error = null_bypass(ap)) - return (error); - /* Requires that arguments be restored. */ - VATTR_RETURN(ap->a_vap, va_fsid, ap->a_vp->v_mount->mnt_vfsstat.f_fsid.val[0]); - return (0); -} - -int -null_access(ap) - struct vnop_access_args /* { - struct vnode *a_vp; - int a_action; - vfs_context_t a_context; - } */ *ap; -{ - return (null_bypass(ap)); -} - -int -null_inactive(ap) - struct vnop_inactive_args /* { - struct vnode *a_vp; - vfs_context_t a_context; - } */ *ap; -{ - /* - * Do nothing (and _don't_ bypass). - * Wait to vnode_put lowervp until reclaim, - * so that until then our null_node is in the - * cache and reusable. - * - * NEEDSWORK: Someday, consider inactive'ing - * the lowervp and then trying to reactivate it - * with capabilities (v_id) - * like they do in the name lookup cache code. - * That's too much work for now. - */ - return (0); -} - -int -null_reclaim(ap) - struct vnop_reclaim_args /* { - struct vnode *a_vp; - vfs_context_t a_context; - } */ *ap; -{ - struct vnode *vp = ap->a_vp; - struct null_node *xp = VTONULL(vp); - struct vnode *lowervp = xp->null_lowervp; - - /* - * Note: in vnop_reclaim, vp->v_op == dead_vnodeop_p, - * so we can't call VOPs on ourself. - */ - /* After this assignment, this node will not be re-used. */ - xp->null_lowervp = NULL; - LIST_REMOVE(xp, null_hash); - FREE(vp->v_data, M_TEMP); - vp->v_data = NULL; - vnode_put (lowervp); - return (0); -} - -/* - * XXX - vnop_strategy must be hand coded because it has no - * vnode in its arguments. - * This goes away with a merged VM/buffer cache. - */ -int -null_strategy(ap) - struct vnop_strategy_args /* { - struct buf *a_bp; - } */ *ap; -{ - struct buf *bp = ap->a_bp; - int error; - struct vnode *savedvp; - - savedvp = vnode(bp); - buf_setvnode(bp, NULLVPTOLOWERVP(savedvp)); - - error = VNOP_STRATEGY(bp); - - buf_setvnode(bp, savedvp); - - return (error); -} - -/* - * XXX - like vnop_strategy, vnop_bwrite must be hand coded because it has no - * vnode in its arguments. - * This goes away with a merged VM/buffer cache. 
- */ -int -null_bwrite(ap) - struct vnop_bwrite_args /* { - struct buf *a_bp; - } */ *ap; -{ - struct buf *bp = ap->a_bp; - int error; - struct vnode *savedvp; - - savedvp = buf_vnode(bp); - buf_setvnode(bp, NULLVPTOLOWERVP(savedvp)); - - error = VNOP_BWRITE(bp); - - buf_setvnode(bp, savedvp); - - return (error); -} - -/* - * Global vfs data structures - */ - -#define VOPFUNC int (*)(void *) - -int (**null_vnodeop_p)(void *); -struct vnodeopv_entry_desc null_vnodeop_entries[] = { - { &vnop_default_desc, (VOPFUNC)null_bypass }, - - { &vnop_lookup_desc, (VOPFUNC)null_lookup }, - { &vnop_setattr_desc, (VOPFUNC)null_setattr }, - { &vnop_getattr_desc, (VOPFUNC)null_getattr }, - { &vnop_access_desc, (VOPFUNC)null_access }, - { &vnop_inactive_desc, (VOPFUNC)null_inactive }, - { &vnop_reclaim_desc, (VOPFUNC)null_reclaim }, - - { &vnop_strategy_desc, (VOPFUNC)null_strategy }, - { &vnop_bwrite_desc, (VOPFUNC)null_bwrite }, - - { (struct vnodeop_desc*)NULL, (int(*)())NULL } -}; -struct vnodeopv_desc null_vnodeop_opv_desc = - { &null_vnodeop_p, null_vnodeop_entries }; diff --git a/bsd/miscfs/specfs/Makefile b/bsd/miscfs/specfs/Makefile index 52832cc71..7c6f583e4 100644 --- a/bsd/miscfs/specfs/Makefile +++ b/bsd/miscfs/specfs/Makefile @@ -9,14 +9,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ DATAFILES = \ diff --git a/bsd/miscfs/specfs/spec_vnops.c b/bsd/miscfs/specfs/spec_vnops.c index cbd0de6d9..8050679f8 100644 --- a/bsd/miscfs/specfs/spec_vnops.c +++ b/bsd/miscfs/specfs/spec_vnops.c @@ -69,8 +69,9 @@ #include <sys/conf.h> #include <sys/buf_internal.h> #include <sys/mount_internal.h> -#include <sys/namei.h> #include <sys/vnode_internal.h> +#include <sys/file_internal.h> +#include <sys/namei.h> #include <sys/stat.h> #include <sys/errno.h> #include <sys/ioctl.h> @@ -82,6 +83,8 @@ #include <sys/resource.h> #include <miscfs/specfs/specdev.h> #include <vfs/vfs_support.h> +#include <kern/assert.h> +#include <kern/task.h> #include <sys/kdebug.h> @@ -247,7 +250,15 @@ spec_open(struct vnop_open_args *ap) vp->v_flag |= VISTTY; vnode_unlock(vp); } + + devsw_lock(dev, S_IFCHR); error = (*cdevsw[maj].d_open)(dev, ap->a_mode, S_IFCHR, p); + + if (error == 0) { + vp->v_specinfo->si_opencount++; + } + + devsw_unlock(dev, S_IFCHR); return (error); case VBLK: @@ -266,7 +277,14 @@ spec_open(struct vnop_open_args *ap) */ if ( (error = vfs_mountedon(vp)) ) return (error); + + devsw_lock(dev, S_IFBLK); error = (*bdevsw[maj].d_open)(dev, ap->a_mode, S_IFBLK, p); + if (!error) { + vp->v_specinfo->si_opencount++; + } + devsw_unlock(dev, S_IFBLK); + if (!error) { u_int64_t blkcnt; u_int32_t blksize; @@ -382,7 +400,7 @@ spec_read(struct vnop_read_args *ap) } n = min((unsigned)(n - on), uio_resid(uio)); - error = uiomove((char *)0 + buf_dataptr(bp) + on, n, uio); + error = uiomove((char *)buf_dataptr(bp) + on, n, uio); if (n + on == bsize) buf_markaged(bp); buf_brelse(bp); @@ -484,7 +502,7 @@ spec_write(struct vnop_write_args *ap) } n = min(n, bsize - buf_resid(bp)); - error = uiomove((char *)0 + buf_dataptr(bp) + on, n, uio); + error = uiomove((char *)buf_dataptr(bp) + on, n, uio); if (error) { buf_brelse(bp); return (error); @@ -562,6 +580,8 @@ spec_select(struct vnop_select_args *ap) } } +static int filt_specattach(struct knote *kn); + int spec_kqfilter(vnode_t vp, struct knote *kn) { @@ -575,8 +595,8 @@ spec_kqfilter(vnode_t vp, struct knote *kn) dev = vnode_specrdev(vp); if (vnode_istty(vp)) { - /* 
We can hook into the slave side of a tty */ - err = ptsd_kqfilter(dev, kn); + /* We can hook into TTYs... */ + err = filt_specattach(kn); } else { /* Try a bpf device, as defined in bsd/net/bpf.c */ err = bpfkqfilter(dev, kn); @@ -618,8 +638,12 @@ void IOSleep(int); #define LOWPRI_WINDOW_MSECS_INC 50 #define LOWPRI_MAX_WINDOW_MSECS 200 #define LOWPRI_MAX_WAITING_MSECS 200 -#define LOWPRI_SLEEP_INTERVAL 5 +#if CONFIG_EMBEDDED +#define LOWPRI_SLEEP_INTERVAL 5 +#else +#define LOWPRI_SLEEP_INTERVAL 2 +#endif struct _throttle_io_info_t { struct timeval last_normal_IO_timestamp; @@ -627,7 +651,6 @@ struct _throttle_io_info_t { SInt32 numthreads_throttling; SInt32 refcnt; SInt32 alloc; - }; struct _throttle_io_info_t _throttle_io_info[LOWPRI_MAX_NUM_DEV]; @@ -647,10 +670,31 @@ int lowpri_max_waiting_msecs = LOWPRI_MAX_WAITING_MSECS; #define DEBUG_ALLOC_THROTTLE_INFO(format, debug_info, args...) #endif -SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_initial_window_msecs, CTLFLAG_RW, &lowpri_IO_initial_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, ""); -SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW, &lowpri_IO_window_msecs_inc, LOWPRI_INITIAL_WINDOW_MSECS, ""); -SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW, &lowpri_max_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, ""); -SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW, &lowpri_max_waiting_msecs, LOWPRI_INITIAL_WINDOW_MSECS, ""); +SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_initial_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_IO_initial_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, ""); +SYSCTL_INT(_debug, OID_AUTO, lowpri_IO_window_inc, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_IO_window_msecs_inc, LOWPRI_INITIAL_WINDOW_MSECS, ""); +SYSCTL_INT(_debug, OID_AUTO, lowpri_max_window_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_max_window_msecs, LOWPRI_INITIAL_WINDOW_MSECS, ""); +SYSCTL_INT(_debug, OID_AUTO, lowpri_max_waiting_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &lowpri_max_waiting_msecs, LOWPRI_INITIAL_WINDOW_MSECS, ""); + +/* + * throttled I/O helper function + * convert the index of the lowest set bit to a device index + */ +int +num_trailing_0(uint64_t n) +{ + /* + * since in most cases the number of trailing 0s is very small, + * we simply counting sequentially from the lowest bit + */ + if (n == 0) + return sizeof(n) * 8; + int count = 0; + while (!ISSET(n, 1)) { + n >>= 1; + ++count; + } + return count; +} /* * Release the reference and if the item was allocated and this is the last @@ -760,6 +804,41 @@ throttle_info_mount_ref(mount_t mp, void *throttle_info) mp->mnt_throttle_info = throttle_info; } +/* + * Private KPI routine + * + * return a handle for accessing throttle_info given a throttle_mask. 
The + * handle must be released by throttle_info_rel_by_mask + */ +int +throttle_info_ref_by_mask(uint64_t throttle_mask, + throttle_info_handle_t *throttle_info_handle) +{ + int dev_index; + struct _throttle_io_info_t *info; + + if (throttle_info_handle == NULL) + return EINVAL; + + dev_index = num_trailing_0(throttle_mask); + info = &_throttle_io_info[dev_index]; + throttle_info_ref(info); + *(struct _throttle_io_info_t**)throttle_info_handle = info; + return 0; +} + +/* + * Private KPI routine + * + * release the handle obtained by throttle_info_ref_by_mask + */ +void +throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle) +{ + /* for now the handle is just a pointer to _throttle_io_info_t */ + throttle_info_rel((struct _throttle_io_info_t*)throttle_info_handle); +} + /* * KPI routine * @@ -804,12 +883,51 @@ update_last_io_time(mount_t mp) microuptime(&info->last_IO_timestamp); } + +#if CONFIG_EMBEDDED + +int throttle_get_io_policy(struct uthread **ut) +{ + int policy = IOPOL_DEFAULT; + proc_t p = current_proc(); + + *ut = get_bsdthread_info(current_thread()); + + if (p != NULL) + policy = p->p_iopol_disk; + + if (*ut != NULL) { + // the I/O policy of the thread overrides that of the process + // unless the I/O policy of the thread is default + if ((*ut)->uu_iopol_disk != IOPOL_DEFAULT) + policy = (*ut)->uu_iopol_disk; + } + return policy; +} +#else + +int throttle_get_io_policy(__unused struct uthread **ut) +{ + *ut = get_bsdthread_info(current_thread()); + + return (proc_get_task_selfdiskacc()); +} +#endif + + static int throttle_io_will_be_throttled_internal(int lowpri_window_msecs, void * throttle_info) { struct _throttle_io_info_t *info = throttle_info; struct timeval elapsed; int elapsed_msecs; + int policy; + struct uthread *ut; + + policy = throttle_get_io_policy(&ut); + + if (ut->uu_throttle_bc == FALSE && policy != IOPOL_THROTTLE) + return (0); microuptime(&elapsed); timevalsub(&elapsed, &info->last_normal_IO_timestamp); @@ -841,12 +959,15 @@ throttle_io_will_be_throttled(int lowpri_window_msecs, mount_t mp) return throttle_io_will_be_throttled_internal(lowpri_window_msecs, info); } -void throttle_lowpri_io(boolean_t ok_to_sleep) +uint32_t +throttle_lowpri_io(int sleep_amount) { - int i; + int sleep_cnt = 0; + int numthreads_throttling; int max_try_num; struct uthread *ut; struct _throttle_io_info_t *info; + int max_waiting_msecs; ut = get_bsdthread_info(current_thread()); @@ -854,23 +975,39 @@ void throttle_lowpri_io(boolean_t ok_to_sleep) goto done; info = ut->uu_throttle_info; - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, - ut->uu_lowpri_window, ok_to_sleep, 0, 0, 0); - if (ok_to_sleep == TRUE) { - max_try_num = lowpri_max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, info->numthreads_throttling); + if (sleep_amount != 0) { +#if CONFIG_EMBEDDED + max_waiting_msecs = lowpri_max_waiting_msecs; +#else + if (ut->uu_throttle_isssd == TRUE) + max_waiting_msecs = lowpri_max_waiting_msecs / 100; + else + max_waiting_msecs = lowpri_max_waiting_msecs; +#endif + if (max_waiting_msecs < LOWPRI_SLEEP_INTERVAL) + max_waiting_msecs = LOWPRI_SLEEP_INTERVAL; - for (i=0; i<max_try_num; i++) { + numthreads_throttling = info->numthreads_throttling + MIN(10, MAX(1, sleep_amount)) - 1; + max_try_num = max_waiting_msecs / LOWPRI_SLEEP_INTERVAL * MAX(1, numthreads_throttling); + + for (sleep_cnt = 0; sleep_cnt < max_try_num; sleep_cnt++) { if (throttle_io_will_be_throttled_internal(ut->uu_lowpri_window, info)) { + if (sleep_cnt == 0) { + 
KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_START, + ut->uu_lowpri_window, max_try_num, numthreads_throttling, 0, 0); + } IOSleep(LOWPRI_SLEEP_INTERVAL); DEBUG_ALLOC_THROTTLE_INFO("sleeping because of info = %p\n", info, info ); } else { break; } } + if (sleep_cnt) { + KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, + ut->uu_lowpri_window, sleep_cnt, 0, 0, 0); + } } - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 97)) | DBG_FUNC_END, - ut->uu_lowpri_window, i*5, 0, 0, 0); SInt32 oldValue; oldValue = OSDecrementAtomic(&info->numthreads_throttling); @@ -882,35 +1019,72 @@ done: if (ut->uu_throttle_info) throttle_info_rel(ut->uu_throttle_info); ut->uu_throttle_info = NULL; + ut->uu_throttle_bc = FALSE; + + return (sleep_cnt * LOWPRI_SLEEP_INTERVAL); } -int throttle_get_io_policy(struct uthread **ut) +/* + * KPI routine + * + * set a kernel thread's IO policy. policy can be: + * IOPOL_NORMAL, IOPOL_THROTTLE, IOPOL_PASSIVE + * + * explanations about these policies are in the man page of setiopolicy_np + */ +void throttle_set_thread_io_policy(int policy) { - int policy = IOPOL_DEFAULT; - proc_t p = current_proc(); +#if !CONFIG_EMBEDDED + proc_apply_thread_selfdiskacc(policy); +#else /* !CONFIG_EMBEDDED */ + struct uthread *ut; + ut = get_bsdthread_info(current_thread()); + ut->uu_iopol_disk = policy; +#endif /* !CONFIG_EMBEDDED */ +} - *ut = get_bsdthread_info(current_thread()); - - if (p != NULL) - policy = p->p_iopol_disk; - if (*ut != NULL) { - // the I/O policy of the thread overrides that of the process - // unless the I/O policy of the thread is default - if ((*ut)->uu_iopol_disk != IOPOL_DEFAULT) - policy = (*ut)->uu_iopol_disk; +static +void throttle_info_reset_window(struct uthread *ut) +{ + struct _throttle_io_info_t *info; + + info = ut->uu_throttle_info; + + OSDecrementAtomic(&info->numthreads_throttling); + throttle_info_rel(info); + ut->uu_throttle_info = NULL; + ut->uu_lowpri_window = 0; +} + +static +void throttle_info_set_initial_window(struct uthread *ut, struct _throttle_io_info_t *info, boolean_t isssd, boolean_t BC_throttle) +{ + SInt32 oldValue; + + ut->uu_throttle_info = info; + throttle_info_ref(info); + DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info ); + + oldValue = OSIncrementAtomic(&info->numthreads_throttling); + if (oldValue < 0) { + panic("%s: numthreads negative", __func__); } - return policy; + ut->uu_lowpri_window = lowpri_IO_initial_window_msecs; + ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * oldValue; + ut->uu_throttle_isssd = isssd; + ut->uu_throttle_bc = BC_throttle; } -void throttle_info_update(void *throttle_info, int flags) + +static +void throttle_info_update_internal(void *throttle_info, int flags, boolean_t isssd) { struct _throttle_io_info_t *info = throttle_info; struct uthread *ut; int policy; int is_throttleable_io = 0; int is_passive_io = 0; - SInt32 oldValue; if (!lowpri_IO_initial_window_msecs || (info == NULL)) return; @@ -949,28 +1123,19 @@ void throttle_info_update(void *throttle_info, int flags) * do the delay just before we return from the system * call that triggered this I/O or from vnode_pagein */ - if (ut->uu_lowpri_window == 0) { - ut->uu_throttle_info = info; - throttle_info_ref(ut->uu_throttle_info); - DEBUG_ALLOC_THROTTLE_INFO("updating info = %p\n", info, info ); - - oldValue = OSIncrementAtomic(&info->numthreads_throttling); - if (oldValue < 0) { - panic("%s: numthreads negative", __func__); - } - ut->uu_lowpri_window = lowpri_IO_initial_window_msecs; - ut->uu_lowpri_window += 
lowpri_IO_window_msecs_inc * oldValue; - } else { + if (ut->uu_lowpri_window == 0) + throttle_info_set_initial_window(ut, info, isssd, FALSE); + else { /* The thread sends I/Os to different devices within the same system call */ if (ut->uu_throttle_info != info) { - struct _throttle_io_info_t *old_info = ut->uu_throttle_info; + struct _throttle_io_info_t *old_info = ut->uu_throttle_info; // keep track of the numthreads in the right device OSDecrementAtomic(&old_info->numthreads_throttling); OSIncrementAtomic(&info->numthreads_throttling); - DEBUG_ALLOC_THROTTLE_INFO("switching from info = %p\n", old_info, old_info ); - DEBUG_ALLOC_THROTTLE_INFO("switching to info = %p\n", info, info ); + DEBUG_ALLOC_THROTTLE_INFO("switching from info = %p\n", old_info, old_info ); + DEBUG_ALLOC_THROTTLE_INFO("switching to info = %p\n", info, info ); /* This thread no longer needs a reference on that throttle info */ throttle_info_rel(ut->uu_throttle_info); ut->uu_throttle_info = info; @@ -981,26 +1146,76 @@ void throttle_info_update(void *throttle_info, int flags) ut->uu_lowpri_window += lowpri_IO_window_msecs_inc * numthreads; if (ut->uu_lowpri_window > lowpri_max_window_msecs * numthreads) ut->uu_lowpri_window = lowpri_max_window_msecs * numthreads; + + if (isssd == FALSE) { + /* + * we're here because we've actually issued I/Os to different devices... + * if at least one of them was a non SSD, then thottle the thread + * using the policy for non SSDs + */ + ut->uu_throttle_isssd = FALSE; + } } } } +/* + * KPI routine + * + * this is usually called before every I/O, used for throttled I/O + * book keeping. This routine has low overhead and does not sleep + */ +void throttle_info_update(void *throttle_info, int flags) +{ + throttle_info_update_internal(throttle_info, flags, FALSE); +} + +/* + * KPI routine + * + * this is usually called before every I/O, used for throttled I/O + * book keeping. This routine has low overhead and does not sleep + */ +void throttle_info_update_by_mask(void *throttle_info_handle, int flags) +{ + void *throttle_info = throttle_info_handle; + /* for now we only use the lowest bit of the throttle mask, so the + * handle is the same as the throttle_info. 
Later if we store a + * set of throttle infos in the handle, we will want to loop through + * them and call throttle_info_update in a loop + */ + throttle_info_update(throttle_info, flags); +} + +extern int ignore_is_ssd; + int spec_strategy(struct vnop_strategy_args *ap) { buf_t bp; int bflags; - int policy; + int policy; dev_t bdev; uthread_t ut; - size_t devbsdunit; mount_t mp; + int strategy_ret; + struct _throttle_io_info_t *throttle_info; + boolean_t isssd = FALSE; bp = ap->a_bp; bdev = buf_device(bp); - bflags = buf_flags(bp); mp = buf_vnode(bp)->v_mount; + policy = throttle_get_io_policy(&ut); + + if (policy == IOPOL_THROTTLE) { + bp->b_flags |= B_THROTTLED_IO; + bp->b_flags &= ~B_PASSIVE; + } else if (policy == IOPOL_PASSIVE) + bp->b_flags |= B_PASSIVE; + + bflags = bp->b_flags; + if (kdebug_enable) { int code = 0; @@ -1014,6 +1229,11 @@ spec_strategy(struct vnop_strategy_args *ap) else if (bflags & B_PAGEIO) code |= DKIO_PAGING; + if (bflags & B_THROTTLED_IO) + code |= DKIO_THROTTLE; + else if (bflags & B_PASSIVE) + code |= DKIO_PASSIVE; + KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE, bp, bdev, (int)buf_blkno(bp), buf_count(bp), 0); } @@ -1021,29 +1241,63 @@ spec_strategy(struct vnop_strategy_args *ap) mp && (mp->mnt_kern_flag & MNTK_ROOTDEV)) hard_throttle_on_root = 1; + if (mp != NULL) { + if ((mp->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) + isssd = TRUE; + throttle_info = &_throttle_io_info[mp->mnt_devbsdunit]; + } else + throttle_info = &_throttle_io_info[LOWPRI_MAX_NUM_DEV - 1]; - if (mp != NULL) - devbsdunit = mp->mnt_devbsdunit; - else - devbsdunit = LOWPRI_MAX_NUM_DEV - 1; - - throttle_info_update(&_throttle_io_info[devbsdunit], bflags); - if ((policy = throttle_get_io_policy(&ut)) == IOPOL_THROTTLE) { - bp->b_flags |= B_THROTTLED_IO; - } - + throttle_info_update_internal(throttle_info, bflags, isssd); if ((bflags & B_READ) == 0) { - microuptime(&_throttle_io_info[devbsdunit].last_IO_timestamp); + microuptime(&throttle_info->last_IO_timestamp); if (mp) { INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_write_size); } } else if (mp) { INCR_PENDING_IO(buf_count(bp), mp->mnt_pending_read_size); } - - (*bdevsw[major(bdev)].d_strategy)(bp); + /* + * The BootCache may give us special information about + * the IO, so it returns special values that we check + * for here. + * + * IO_SATISFIED_BY_CACHE + * The read has been satisfied by the boot cache. Don't + * throttle the thread unnecessarily. + * + * IO_SHOULD_BE_THROTTLED + * The boot cache is playing back a playlist and this IO + * cut through. Throttle it so we're not cutting through + * the boot cache too often. + * + * Note that typical strategy routines are defined with + * a void return so we'll get garbage here. In the + * unlikely case the garbage matches our special return + * value, it's not a big deal since we're only adjusting + * the throttling delay. + */ +#define IO_SATISFIED_BY_CACHE ((int)0xcafefeed) +#define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef) + typedef int strategy_fcn_ret_t(struct buf *bp); + strategy_ret = (*(strategy_fcn_ret_t*)bdevsw[major(bdev)].d_strategy)(bp); + + if ((IO_SATISFIED_BY_CACHE == strategy_ret) && (ut->uu_lowpri_window != 0) && (ut->uu_throttle_info != NULL)) { + /* + * If this was a throttled IO satisfied by the boot cache, + * don't delay the thread. 
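[Editor's note] The mask-based throttle KPI added above resolves a device from the lowest set bit of throttle_mask (via num_trailing_0()) and hands back an opaque handle. A hedged sketch of how a client might drive it, assuming throttle_info_handle_t is the opaque handle type these routines declare; the function name, error handling, and origin of throttle_mask are illustrative:

    /*
     * Sketch: take a throttle_info reference for a device mask, do the
     * per-I/O bookkeeping, then drop the reference.
     */
    static int
    example_issue_throttled_io(uint64_t throttle_mask, int bflags)
    {
        throttle_info_handle_t handle;
        int error;

        /* resolve the device's throttle_info from the mask's lowest bit */
        error = throttle_info_ref_by_mask(throttle_mask, &handle);
        if (error)
            return (error);

        /* bookkeeping before the I/O; cheap and never sleeps */
        throttle_info_update_by_mask(handle, bflags);

        /* ... issue the actual I/O here ... */

        throttle_info_rel_by_mask(handle);    /* drop the reference */
        return (0);
    }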
+ */ + throttle_info_reset_window(ut); + + } else if ((IO_SHOULD_BE_THROTTLED == strategy_ret) && (ut->uu_lowpri_window == 0) && (ut->uu_throttle_info == NULL)) { + /* + * If the boot cache indicates this IO should be throttled, + * delay the thread. + */ + throttle_info_set_initial_window(ut, throttle_info, isssd, TRUE); + } return (0); } @@ -1066,11 +1320,11 @@ spec_close(struct vnop_close_args *ap) { struct vnode *vp = ap->a_vp; dev_t dev = vp->v_rdev; - int (*devclose)(dev_t, int, int, struct proc *); - int mode, error; + int error = 0; int flags = ap->a_fflag; struct proc *p = vfs_context_proc(ap->a_context); struct session *sessp; + int do_rele = 0; switch (vp->v_type) { @@ -1088,38 +1342,56 @@ spec_close(struct vnop_close_args *ap) if (sessp != SESSION_NULL) { if ((vcount(vp) == 1) && (vp == sessp->s_ttyvp)) { + session_lock(sessp); - sessp->s_ttyvp = NULL; - sessp->s_ttyvid = 0; - sessp->s_ttyp = TTY_NULL; - sessp->s_ttypgrpid = NO_PID; + if (vp == sessp->s_ttyvp) { + sessp->s_ttyvp = NULL; + sessp->s_ttyvid = 0; + sessp->s_ttyp = TTY_NULL; + sessp->s_ttypgrpid = NO_PID; + do_rele = 1; + } session_unlock(sessp); - vnode_rele(vp); + + if (do_rele) { + vnode_rele(vp); + } } session_rele(sessp); } - devclose = cdevsw[major(dev)].d_close; - mode = S_IFCHR; + devsw_lock(dev, S_IFCHR); + + vp->v_specinfo->si_opencount--; + + if (vp->v_specinfo->si_opencount < 0) { + panic("Negative open count?"); + } /* * close on last reference or on vnode revoke call */ - if ((flags & IO_REVOKE) != 0) - break; - if (vcount(vp) > 0) + if ((vcount(vp) > 0) && ((flags & IO_REVOKE) == 0)) { + devsw_unlock(dev, S_IFCHR); return (0); + } + + error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p); + + devsw_unlock(dev, S_IFCHR); break; case VBLK: /* - * Since every use (buffer, vnode, swap, blockmap) - * holds a reference to the vnode, and because we mark - * any other vnodes that alias this device, when the - * sum of the reference counts on all the aliased - * vnodes descends to zero, we are on last close. + * If there is more than one outstanding open, don't + * send the close to the device. 
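[Editor's note] The spec_open()/spec_close() changes above introduce a per-vnode si_opencount that is only touched under devsw_lock(), so opens and closes of the same device are serialized and only the last close (or a revoke) reaches the driver. A condensed sketch of that discipline for the character-device case, mirroring the patched logic with illustrative example_* wrappers:

    static int
    example_chr_open(struct vnode *vp, dev_t dev, int mode, struct proc *p)
    {
        int error;

        devsw_lock(dev, S_IFCHR);               /* serialize against close */
        error = (*cdevsw[major(dev)].d_open)(dev, mode, S_IFCHR, p);
        if (error == 0)
            vp->v_specinfo->si_opencount++;     /* count successful opens */
        devsw_unlock(dev, S_IFCHR);
        return (error);
    }

    static int
    example_chr_close(struct vnode *vp, dev_t dev, int flags, struct proc *p)
    {
        int error = 0;

        devsw_lock(dev, S_IFCHR);
        vp->v_specinfo->si_opencount--;
        if (vp->v_specinfo->si_opencount < 0)
            panic("Negative open count?");
        /* forward the close only on the last reference or on revoke */
        if ((vcount(vp) > 0) && ((flags & IO_REVOKE) == 0)) {
            devsw_unlock(dev, S_IFCHR);
            return (0);
        }
        error = cdevsw[major(dev)].d_close(dev, flags, S_IFCHR, p);
        devsw_unlock(dev, S_IFCHR);
        return (error);
    }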
*/ - if (vcount(vp) > 0) + devsw_lock(dev, S_IFBLK); + if (vcount(vp) > 1) { + vp->v_specinfo->si_opencount--; + devsw_unlock(dev, S_IFBLK); return (0); + } + devsw_unlock(dev, S_IFBLK); /* * On last close of a block device (that isn't mounted) @@ -1133,8 +1405,22 @@ spec_close(struct vnop_close_args *ap) if (error) return (error); - devclose = bdevsw[major(dev)].d_close; - mode = S_IFBLK; + devsw_lock(dev, S_IFBLK); + + vp->v_specinfo->si_opencount--; + + if (vp->v_specinfo->si_opencount < 0) { + panic("Negative open count?"); + } + + if (vcount(vp) > 0) { + devsw_unlock(dev, S_IFBLK); + return (0); + } + + error = bdevsw[major(dev)].d_close(dev, flags, S_IFBLK, p); + + devsw_unlock(dev, S_IFBLK); break; default: @@ -1142,7 +1428,7 @@ spec_close(struct vnop_close_args *ap) return(EBADF); } - return ((*devclose)(dev, flags, mode, p)); + return error; } /* @@ -1234,3 +1520,171 @@ spec_offtoblk(struct vnop_offtoblk_args *ap) return (0); } + +static void filt_specdetach(struct knote *kn); +static int filt_spec(struct knote *kn, long hint); +static unsigned filt_specpeek(struct knote *kn); + +struct filterops spec_filtops = { + .f_isfd = 1, + .f_attach = filt_specattach, + .f_detach = filt_specdetach, + .f_event = filt_spec, + .f_peek = filt_specpeek +}; + +static int +filter_to_seltype(int16_t filter) +{ + switch (filter) { + case EVFILT_READ: + return FREAD; + case EVFILT_WRITE: + return FWRITE; + break; + default: + panic("filt_to_seltype(): invalid filter %d\n", filter); + return 0; + } +} + +static int +filt_specattach(struct knote *kn) +{ + vnode_t vp; + dev_t dev; + + vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; /* Already have iocount, and vnode is alive */ + + assert(vnode_ischr(vp)); + + dev = vnode_specrdev(vp); + + if (major(dev) > nchrdev) { + return ENXIO; + } + + if ((cdevsw_flags[major(dev)] & CDEVSW_SELECT_KQUEUE) == 0) { + return EINVAL; + } + + /* Resulting wql is safe to unlink even if it has never been linked */ + kn->kn_hook = wait_queue_link_allocate(); + if (kn->kn_hook == NULL) { + return EAGAIN; + } + + kn->kn_fop = &spec_filtops; + kn->kn_hookid = vnode_vid(vp); + + knote_markstayqueued(kn); + + return 0; +} + +static void +filt_specdetach(struct knote *kn) +{ + kern_return_t ret; + + /* + * Given wait queue link and wait queue set, unlink. This is subtle. + * If the device has been revoked from under us, selclearthread() will + * have removed our link from the kqueue's wait queue set, which + * wait_queue_set_unlink_one() will detect and handle. 
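[Editor's note] filt_specattach() above is what makes a plain kevent registration on a character device work, provided the device's cdevsw entry advertises CDEVSW_SELECT_KQUEUE. For orientation, a small userspace illustration of the call path that ends up there, using only the standard kqueue(2)/kevent(2) API; /dev/tty is just an example device:

    #include <sys/event.h>
    #include <sys/time.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
        struct kevent kev, out;
        int fd = open("/dev/tty", O_RDONLY);  /* any supporting chr device */
        int kq = kqueue();

        if (fd < 0 || kq < 0)
            return (1);
        /* this EV_ADD is what reaches filt_specattach() in the kernel */
        EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
        if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0)
            return (1);
        if (kevent(kq, NULL, 0, &out, 1, NULL) == 1)  /* wait for data */
            printf("%ld bytes readable\n", (long)out.data);
        close(kq);
        close(fd);
        return (0);
    }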
+ */ + ret = wait_queue_set_unlink_one(kn->kn_kq->kq_wqs, kn->kn_hook); + if (ret != KERN_SUCCESS) { + panic("filt_specdetach(): failed to unlink wait queue link."); + } + + (void)wait_queue_link_free(kn->kn_hook); + kn->kn_hook = NULL; + kn->kn_status &= ~KN_STAYQUEUED; +} + +static int +filt_spec(struct knote *kn, long hint) +{ + vnode_t vp; + uthread_t uth; + wait_queue_set_t old_wqs; + vfs_context_t ctx; + int selres; + int error; + int use_offset; + dev_t dev; + uint64_t flags; + + assert(kn->kn_hook != NULL); + + if (hint != 0) { + panic("filt_spec(): nonzero hint?"); + } + + uth = get_bsdthread_info(current_thread()); + ctx = vfs_context_current(); + vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; + + error = vnode_getwithvid(vp, kn->kn_hookid); + if (error != 0) { + kn->kn_flags |= (EV_EOF | EV_ONESHOT); + return 1; + } + + dev = vnode_specrdev(vp); + flags = cdevsw_flags[major(dev)]; + use_offset = ((flags & CDEVSW_USE_OFFSET) != 0); + assert((flags & CDEVSW_SELECT_KQUEUE) != 0); + + /* Trick selrecord() into hooking kqueue's wait queue set into device wait queue */ + old_wqs = uth->uu_wqset; + uth->uu_wqset = kn->kn_kq->kq_wqs; + selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx); + uth->uu_wqset = old_wqs; + + if (use_offset) { + if (kn->kn_fp->f_fglob->fg_offset >= (uint32_t)selres) { + kn->kn_data = 0; + } else { + kn->kn_data = ((uint32_t)selres) - kn->kn_fp->f_fglob->fg_offset; + } + } else { + kn->kn_data = selres; + } + + vnode_put(vp); + + return (kn->kn_data != 0); +} + +static unsigned +filt_specpeek(struct knote *kn) +{ + vnode_t vp; + uthread_t uth; + wait_queue_set_t old_wqs; + vfs_context_t ctx; + int error, selres; + + uth = get_bsdthread_info(current_thread()); + ctx = vfs_context_current(); + vp = (vnode_t)kn->kn_fp->f_fglob->fg_data; + + error = vnode_getwithvid(vp, kn->kn_hookid); + if (error != 0) { + return 1; /* Just like VNOP_SELECT() on recycled vnode */ + } + + /* + * Why pass the link here? Because we may not have registered in the past... 
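[Editor's note] Both filt_spec() and filt_specpeek() rely on the same selrecord() redirection: the uthread's wait-queue set is temporarily pointed at the kqueue's set, so the driver's selrecord() links the device wait queue to the kqueue rather than to a selinfo. A condensed sketch of that trick, with an illustrative example_* name (FREAD stands in for filter_to_seltype()'s result):

    static int
    example_select_into_kqueue(vnode_t vp, struct knote *kn, vfs_context_t ctx)
    {
        uthread_t uth = get_bsdthread_info(current_thread());
        wait_queue_set_t old_wqs = uth->uu_wqset;
        int selres;

        uth->uu_wqset = kn->kn_kq->kq_wqs;     /* redirect selrecord() */
        selres = VNOP_SELECT(vp, FREAD, 0, kn->kn_hook, ctx);
        uth->uu_wqset = old_wqs;               /* restore */
        return (selres);
    }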
+ */ + old_wqs = uth->uu_wqset; + uth->uu_wqset = kn->kn_kq->kq_wqs; + selres = VNOP_SELECT(vp, filter_to_seltype(kn->kn_filter), 0, kn->kn_hook, ctx); + uth->uu_wqset = old_wqs; + + vnode_put(vp); + return selres; +} + diff --git a/bsd/miscfs/specfs/specdev.h b/bsd/miscfs/specfs/specdev.h index dfe9c9945..3394fedbf 100644 --- a/bsd/miscfs/specfs/specdev.h +++ b/bsd/miscfs/specfs/specdev.h @@ -79,6 +79,7 @@ struct specinfo { struct vnode *si_specnext; long si_flags; dev_t si_rdev; + int32_t si_opencount; daddr_t si_size; /* device block size in bytes */ daddr64_t si_lastr; /* last read blkno (read-ahead) */ u_int64_t si_devsize; /* actual device size in bytes */ diff --git a/bsd/miscfs/union/Makefile b/bsd/miscfs/union/Makefile index 72ccd7707..513e6bbb9 100644 --- a/bsd/miscfs/union/Makefile +++ b/bsd/miscfs/union/Makefile @@ -9,14 +9,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ DATAFILES = \ diff --git a/bsd/miscfs/union/union.h b/bsd/miscfs/union/union.h index 5411f538d..eee3e5a87 100644 --- a/bsd/miscfs/union/union.h +++ b/bsd/miscfs/union/union.h @@ -67,156 +67,5 @@ #ifndef __UNION_UNION_H__ #define __UNION_UNION_H__ -#include <sys/appleapiopts.h> -#include <sys/cdefs.h> - -#ifdef __APPLE_API_PRIVATE -struct union_args { - char *target; /* Target of loopback */ - int mntflags; /* Options on the mount */ -}; - -#define UNMNT_ABOVE 0x0001 /* Target appears above mount point */ -#define UNMNT_BELOW 0x0002 /* Target appears below mount point */ -#define UNMNT_REPLACE 0x0003 /* Target replaces mount point */ -#ifdef FAULTFS -#define UNMNT_FAULTIN 0x0004 /* get the files to TOT on lookup */ -#define UNMNT_OPMASK 0x0007 -#else -#define UNMNT_OPMASK 0x0003 -#endif - -#ifdef BSD_KERNEL_PRIVATE - -struct union_mount { - struct vnode *um_uppervp; /* */ - int um_uppervid; /* vid of upper vnode */ - struct vnode *um_lowervp; /* Left unlocked */ - int um_lowervid; /* vid of lower vnode */ - kauth_cred_t um_cred; /* Credentials of user calling mount */ - int um_cmode; /* cmask from mount process */ - int um_op; /* Operation mode */ - dev_t um_upperdev; /* Upper root node fsid[0]*/ -}; - - -#define UNION_ABOVE(x) (x->um_op == UNMNT_ABOVE) -#define UNION_LOWER(x) (x->um_op == UNMNT_BELOW) -#define UNION_REPLACE(x) (x->um_op == UNMNT_REPLACE) -#ifdef FAULTFS -#define UNION_FAULTIN(x) (x->um_op == UNMNT_FAULTIN) -#else -#define UNION_FAULTIN(x) (0) - -#endif - -/* LP64 version of union_args. all pointers - * grow when we're dealing with a 64-bit process. - * WARNING - keep in sync with union_args - */ - -struct user_union_args { - user_addr_t target; /* Target of loopback */ - int mntflags; /* Options on the mount */ - char _pad[4]; -}; - -/* - * DEFDIRMODE is the mode bits used to create a shadow directory. 
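[Editor's note] The UN_DIRMODE/UN_FILEMODE macros deleted below replicate the owner permission bits into the group and other positions by shifting right 3 and 6 bits. A worked example: with VREAD=0400, VWRITE=0200, VEXEC=0100, VRWXMODE is 0700, VRWXMODE>>3 is 0070, VRWXMODE>>6 is 0007, so UN_DIRMODE ORs to 0777 (and UN_FILEMODE to 0666). A standalone check with EX_* stand-in constants:

    #include <assert.h>

    #define EX_VREAD    0400
    #define EX_VWRITE   0200
    #define EX_VEXEC    0100
    #define EX_VRWXMODE (EX_VREAD|EX_VWRITE|EX_VEXEC)
    #define EX_VRWMODE  (EX_VREAD|EX_VWRITE)

    int
    main(void)
    {
        /* owner bits shifted into group and other positions */
        assert(((EX_VRWXMODE)|(EX_VRWXMODE >> 3)|(EX_VRWXMODE >> 6)) == 0777);
        assert(((EX_VRWMODE) |(EX_VRWMODE  >> 3)|(EX_VRWMODE  >> 6)) == 0666);
        return (0);
    }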
- */ -#define VRWXMODE (VREAD|VWRITE|VEXEC) -#define VRWMODE (VREAD|VWRITE) -#define UN_DIRMODE ((VRWXMODE)|(VRWXMODE>>3)|(VRWXMODE>>6)) -#define UN_FILEMODE ((VRWMODE)|(VRWMODE>>3)|(VRWMODE>>6)) - -/* - * A cache of vnode references - */ -struct union_node { - LIST_ENTRY(union_node) un_cache; /* Hash chain */ - struct vnode *un_vnode; /* Back pointer */ - - struct vnode *un_uppervp; /* overlaying object */ - int un_uppervid; /* vid of upper vnode */ - off_t un_uppersz; /* size of upper object */ - - struct vnode *un_lowervp; /* underlying object */ - int un_lowervid; /* vid of upper vnode */ - off_t un_lowersz; /* size of lower object */ - - struct vnode *un_dirvp; /* Parent dir of uppervp */ - struct vnode *un_pvp; /* Parent vnode */ - - char *un_path; /* saved component name */ - int un_hash; /* saved un_path hash value */ - int un_openl; /* # of opens on lowervp */ - int un_exclcnt; /* exclusive count */ - unsigned int un_flags; - mount_t un_mount; - struct vnode **un_dircache; /* cached union stack */ -}; - -#define UN_WANT 0x01 /* union node is needed */ -#define UN_LOCKED 0x02 /* union node is locked */ -#define UN_CACHED 0x04 /* In union cache */ -#define UN_TRANSIT 0x08 /* The union node is in creation */ -#define UN_DELETED 0x10 /* The union node is deleted */ -#ifdef FAULTFS -#define UN_FAULTFS 0x80 /* The union node is for faultfs */ -#endif -#define UN_DIRENVN 0x100 /* The union node is created for dir enumeration */ - - -#ifdef FAULTFS -#define UNNODE_FAULTIN(x) ((x->un_flags & UN_FAULTFS)== UN_FAULTFS) -#else -#define UNNODE_FAULTIN(x) (0) -#endif -/* - * Hash table locking flags - */ - -#define UNVP_WANT 0x01 -#define UNVP_LOCKED 0x02 - -#define MOUNTTOUNIONMOUNT(mp) ((struct union_mount *)((mp)->mnt_data)) -#define VTOUNION(vp) ((struct union_node *)(vp)->v_data) -#define UNIONTOV(un) ((un)->un_vnode) -#define LOWERVP(vp) (VTOUNION(vp)->un_lowervp) -#define UPPERVP(vp) (VTOUNION(vp)->un_uppervp) -#define OTHERVP(vp) (UPPERVP(vp) ? 
UPPERVP(vp) : LOWERVP(vp)) - - -extern int union_allocvp(struct vnode **, struct mount *, - struct vnode *, struct vnode *, - struct componentname *, struct vnode *, - struct vnode *, int); -extern int union_freevp(struct vnode *); -extern struct vnode * union_dircache(struct vnode *, vfs_context_t); -extern int union_copyfile(struct vnode *, struct vnode *,vfs_context_t ); -extern int union_copyup(struct union_node *, int, vfs_context_t ); -extern int union_dowhiteout(struct union_node *, vfs_context_t); -extern int union_mkshadow(struct union_mount *, struct vnode *, - struct componentname *, struct vnode **); -extern int union_mkwhiteout(struct union_mount *, struct vnode *, - struct componentname *, char *); -extern int union_vn_create(struct vnode **, struct union_node *, mode_t mode, vfs_context_t context); -extern int union_cn_close(struct vnode *, int, vfs_context_t context); -extern void union_removed_upper(struct union_node *un); -extern struct vnode *union_lowervp(struct vnode *); -extern void union_newsize(struct vnode *, off_t, off_t); -extern int union_init(struct vfsconf *); -extern void union_updatevp(struct union_node *, struct vnode *, struct vnode *); -extern void union_dircache_free(struct union_node *); -extern int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t); -extern int union_faultin_copyup(struct vnode ** uvpp, vnode_t udvp, vnode_t lvp, struct componentname * cnp, vfs_context_t context); -extern int (**union_vnodeop_p)(void *); -extern struct vfsops union_vfsops; -void union_lock(void); -void union_unlock(void); - -#endif /* BSD_KERNEL_PRIVATE */ - -#endif /* __APPLE_API_PRIVATE */ #endif /* __UNION_UNION_H__ */ diff --git a/bsd/miscfs/union/union_subr.c b/bsd/miscfs/union/union_subr.c deleted file mode 100644 index 34dbe14f3..000000000 --- a/bsd/miscfs/union/union_subr.c +++ /dev/null @@ -1,1604 +0,0 @@ -/* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* - * Copyright (c) 1994 Jan-Simon Pendry - * Copyright (c) 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Jan-Simon Pendry. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)union_subr.c 8.20 (Berkeley) 5/20/95 - */ -/* - * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce - * support for mandatory and extensible security protections. This notice - * is included in support of clause 2.2 (b) of the Apple Public License, - * Version 2.0. - */ - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/proc_internal.h> -#include <sys/kauth.h> -#include <sys/time.h> -#include <sys/kernel.h> -#include <sys/vnode_internal.h> -#include <sys/namei.h> -#include <sys/malloc.h> -#include <sys/file_internal.h> -#include <sys/filedesc.h> -#include <sys/queue.h> -#include <sys/mount_internal.h> -#include <sys/stat.h> -#include <sys/ubc.h> -#include <sys/uio_internal.h> -#include <miscfs/union/union.h> -#include <sys/lock.h> -#include <sys/kdebug.h> -#if CONFIG_MACF -#include <security/mac_framework.h> -#endif - - -static int union_vn_close(struct vnode *vp, int fmode, vfs_context_t ctx); - -/* must be power of two, otherwise change UNION_HASH() */ -#define NHASH 32 - -/* unsigned int ... 
*/ -#define UNION_HASH(u, l) \ - (((((uintptr_t) (u)) + ((uintptr_t) l)) >> 8) & (NHASH-1)) - -static LIST_HEAD(unhead, union_node) unhead[NHASH]; -static int unvplock[NHASH]; - -static lck_grp_t * union_lck_grp; -static lck_grp_attr_t * union_lck_grp_attr; -static lck_attr_t * union_lck_attr; -static lck_mtx_t * union_mtxp; - -static int union_dircheck(struct vnode **, struct fileproc *, vfs_context_t ctx); -static void union_newlower(struct union_node *, struct vnode *); -static void union_newupper(struct union_node *, struct vnode *); - - -int -union_init(__unused struct vfsconf *vfsp) -{ - int i; - - union_lck_grp_attr= lck_grp_attr_alloc_init(); -#if DIAGNOSTIC - lck_grp_attr_setstat(union_lck_grp_attr); -#endif - union_lck_grp = lck_grp_alloc_init("union", union_lck_grp_attr); - union_lck_attr = lck_attr_alloc_init(); -#if DIAGNOSTIC - lck_attr_setdebug(union_lck_attr); -#endif - union_mtxp = lck_mtx_alloc_init(union_lck_grp, union_lck_attr); - - for (i = 0; i < NHASH; i++) - LIST_INIT(&unhead[i]); - bzero((caddr_t) unvplock, sizeof(unvplock)); - /* add the hook for getdirentries */ - union_dircheckp = union_dircheck; - - return (0); -} - -void -union_lock() -{ - lck_mtx_lock(union_mtxp); -} - -void -union_unlock() -{ - lck_mtx_unlock(union_mtxp); -} - - -static int -union_list_lock(int ix) -{ - - if (unvplock[ix] & UNVP_LOCKED) { - unvplock[ix] |= UNVP_WANT; - msleep((caddr_t) &unvplock[ix], union_mtxp, PINOD, "union_list_lock", NULL); - return (1); - } - - unvplock[ix] |= UNVP_LOCKED; - - return (0); -} - -static void -union_list_unlock(int ix) -{ - - unvplock[ix] &= ~UNVP_LOCKED; - - if (unvplock[ix] & UNVP_WANT) { - unvplock[ix] &= ~UNVP_WANT; - wakeup((caddr_t) &unvplock[ix]); - } -} - -/* - * union_updatevp: - * - * The uppervp, if not NULL, must be referenced and not locked by us - * The lowervp, if not NULL, must be referenced. - * - * If uppervp and lowervp match pointers already installed, then - * nothing happens. The passed vp's (when matching) are not adjusted. - * - * This routine may only be called by union_newupper() and - * union_newlower(). - */ - -/* always called with union lock held */ -void -union_updatevp(struct union_node *un, struct vnode *uppervp, - struct vnode *lowervp) -{ - int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp); - int nhash = UNION_HASH(uppervp, lowervp); - int docache = (lowervp != NULLVP || uppervp != NULLVP); - int lhash, uhash; - vnode_t freevp; - vnode_t freedirvp; - caddr_t freepath; - - /* - * Ensure locking is ordered from lower to higher - * to avoid deadlocks. 
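
/*
 * Editorial sketch (not part of the original sources): the deadlock-
 * avoidance discipline just described, modeled in user space with
 * pthread mutexes standing in for the unvplock[] flags. The two hash
 * buckets involved in a move are always locked in ascending index
 * order, so two threads shuffling nodes between the same pair of
 * buckets can never hold one bucket each while waiting for the other.
 * The mutexes are assumed to have been initialized at startup.
 */
#include <pthread.h>

#define NHASH 32                       /* must stay a power of two */
static pthread_mutex_t bucket_lock[NHASH];

static void
lock_bucket_pair(int ohash, int nhash)
{
	int lo = ohash < nhash ? ohash : nhash;
	int hi = ohash < nhash ? nhash : ohash;

	pthread_mutex_lock(&bucket_lock[lo]);
	if (hi != lo)
		pthread_mutex_lock(&bucket_lock[hi]);
}
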
- */ - if (nhash < ohash) { - lhash = nhash; - uhash = ohash; - } else { - lhash = ohash; - uhash = nhash; - } - - if (lhash != uhash) { - while (union_list_lock(lhash)) - continue; - } - - while (union_list_lock(uhash)) - continue; - - if (ohash != nhash || !docache) { - if (un->un_flags & UN_CACHED) { - un->un_flags &= ~UN_CACHED; - LIST_REMOVE(un, un_cache); - } - } - - if (ohash != nhash) - union_list_unlock(ohash); - - if (un->un_lowervp != lowervp) { - freevp = freedirvp = NULLVP; - freepath = (caddr_t)0; - if (un->un_lowervp) { - freevp = un->un_lowervp; - un->un_lowervp = lowervp; - if (un->un_path) { - freepath = un->un_path; - un->un_path = 0; - } - if (un->un_dirvp) { - freedirvp = un->un_dirvp; - un->un_dirvp = NULLVP; - } - union_unlock(); - if (freevp) - vnode_put(freevp); - if (freedirvp) - vnode_put(freedirvp); - if (freepath) - _FREE(un->un_path, M_TEMP); - union_lock(); - } else - un->un_lowervp = lowervp; - if (lowervp != NULLVP) - un->un_lowervid = vnode_vid(lowervp); - un->un_lowersz = VNOVAL; - } - - if (un->un_uppervp != uppervp) { - freevp = NULLVP; - if (un->un_uppervp) { - freevp = un->un_uppervp; - } - un->un_uppervp = uppervp; - if (uppervp != NULLVP) - un->un_uppervid = vnode_vid(uppervp); - un->un_uppersz = VNOVAL; - union_unlock(); - if (freevp) - vnode_put(freevp); - union_lock(); - } - - if (docache && (ohash != nhash)) { - LIST_INSERT_HEAD(&unhead[nhash], un, un_cache); - un->un_flags |= UN_CACHED; - } - - union_list_unlock(nhash); -} - -/* - * Set a new lowervp. The passed lowervp must be referenced and will be - * stored in the vp in a referenced state. - */ -/* always called with union lock held */ - -static void -union_newlower(un, lowervp) - struct union_node *un; - struct vnode *lowervp; -{ - union_updatevp(un, un->un_uppervp, lowervp); -} - -/* - * Set a new uppervp. The passed uppervp must be locked and will be - * stored in the vp in a locked state. The caller should not unlock - * uppervp. - */ - -/* always called with union lock held */ -static void -union_newupper(un, uppervp) - struct union_node *un; - struct vnode *uppervp; -{ - union_updatevp(un, uppervp, un->un_lowervp); -} - -/* - * Keep track of size changes in the underlying vnodes. - * If the size changes, then callback to the vm layer - * giving priority to the upper layer size. - */ -/* always called with union lock held */ -void -union_newsize(vp, uppersz, lowersz) - struct vnode *vp; - off_t uppersz, lowersz; -{ - struct union_node *un; - off_t sz; - - /* only interested in regular files */ - if (vp->v_type != VREG) - return; - - un = VTOUNION(vp); - sz = VNOVAL; - - if ((uppersz != VNOVAL) && (un->un_uppersz != uppersz)) { - un->un_uppersz = uppersz; - if (sz == VNOVAL) - sz = un->un_uppersz; - } - - if ((lowersz != VNOVAL) && (un->un_lowersz != lowersz)) { - un->un_lowersz = lowersz; - if (sz == VNOVAL) - sz = un->un_lowersz; - } - - if (sz != VNOVAL) { -#ifdef UNION_DIAGNOSTIC - printf("union: %s size now %ld\n", - uppersz != VNOVAL ? "upper" : "lower", (long) sz); -#endif - union_unlock(); - ubc_setsize(vp, sz); - union_lock(); - } -} - -/* - * union_allocvp: allocate a union_node and associate it with a - * parent union_node and one or two vnodes. - * - * vpp Holds the returned vnode locked and referenced if no - * error occurs. - * - * mp Holds the mount point. mp may or may not be busied. - * allocvp() makes no changes to mp. - * - * dvp Holds the parent union_node to the one we wish to create. - * XXX may only be used to traverse an uncopied lowervp-based - * tree? 
XXX - * - * dvp may or may not be locked. allocvp() makes no changes - * to dvp. - * - * upperdvp Holds the parent vnode to uppervp, generally used along - * with path component information to create a shadow of - * lowervp when uppervp does not exist. - * - * upperdvp is referenced but unlocked on entry, and will be - * dereferenced on return. - * - * uppervp Holds the new uppervp vnode to be stored in the - * union_node we are allocating. uppervp is referenced but - * not locked, and will be dereferenced on return. - * - * lowervp Holds the new lowervp vnode to be stored in the - * union_node we are allocating. lowervp is referenced but - * not locked, and will be dereferenced on return. - * - * cnp Holds path component information to be coupled with - * lowervp and upperdvp to allow unionfs to create an uppervp - * later on. Only used if lowervp is valid. The contents - * of cnp is only valid for the duration of the call. - * - * docache Determine whether this node should be entered in the - * cache or whether it should be destroyed as soon as possible. - * - * All union_nodes are maintained on a singly-linked - * list. New nodes are only allocated when they cannot - * be found on this list. Entries on the list are - * removed when the vfs reclaim entry is called. - * - * A single lock is kept for the entire list. This is - * needed because the getnewvnode() function can block - * waiting for a vnode to become free, in which case there - * may be more than one process trying to get the same - * vnode. This lock is only taken if we are going to - * call getnewvnode(), since the kernel itself is single-threaded. - * - * If an entry is found on the list, then call vget() to - * take a reference. This is done because there may be - * zero references to it and so it needs to removed from - * the vnode free list. - */ - -/* always called with union lock held */ - -int -union_allocvp(struct vnode **vpp, - struct mount *mp, - struct vnode *undvp, - struct vnode *dvp, - struct componentname *cnp, - struct vnode *uppervp, - struct vnode *lowervp, - int docache) -{ - int error; - struct union_node *un = NULL; - struct union_node *unp; - struct vnode *xlowervp = NULLVP; - struct union_mount *um = MOUNTTOUNIONMOUNT(mp); - int hash = 0; /* protected by docache */ - int markroot; - int try; - struct vnode_fsparam vfsp; - enum vtype vtype; - - if (uppervp == NULLVP && lowervp == NULLVP) - panic("union: unidentifiable allocation"); - - /* - * if both upper and lower vp are provided and are off different type - * consider lowervp as NULL - */ - if (uppervp && lowervp && (uppervp->v_type != lowervp->v_type)) { - xlowervp = lowervp; - lowervp = NULLVP; - } - - /* detect the root vnode (and aliases) */ - markroot = 0; - if ((uppervp == um->um_uppervp) && - ((lowervp == NULLVP) || lowervp == um->um_lowervp)) { - if (lowervp == NULLVP) { - lowervp = um->um_lowervp; - if (lowervp != NULLVP) { - union_unlock(); - vnode_get(lowervp); - union_lock(); - } - } - markroot = VROOT; - } - -loop: - if (!docache) { - un = NULL; - } else for (try = 0; try < 3; try++) { - switch (try) { - case 0: - if (lowervp == NULLVP) - continue; - hash = UNION_HASH(uppervp, lowervp); - break; - - case 1: - if (uppervp == NULLVP) - continue; - hash = UNION_HASH(uppervp, NULLVP); - break; - - case 2: - if (lowervp == NULLVP) - continue; - /* Not sure how this path gets exercised ? 
*/ - hash = UNION_HASH(NULLVP, lowervp); - break; - } - - while (union_list_lock(hash)) - continue; - - for (un = unhead[hash].lh_first; un != 0; - un = un->un_cache.le_next) { - if ((un->un_lowervp == lowervp || - un->un_lowervp == NULLVP) && - (un->un_uppervp == uppervp || - un->un_uppervp == NULLVP) && - (un->un_mount == mp)) { - break; - } - } - - union_list_unlock(hash); - - if (un) - break; - } - - if (un) { - /* - * Obtain a lock on the union_node. - * uppervp is locked, though un->un_uppervp - * may not be. this doesn't break the locking - * hierarchy since in the case that un->un_uppervp - * is not yet locked it will be vnode_put'd and replaced - * with uppervp. - */ - - if (un->un_flags & UN_LOCKED) { - un->un_flags |= UN_WANT; - msleep((caddr_t) &un->un_flags, union_mtxp, PINOD, "union node locked", 0); - goto loop; - } - un->un_flags |= UN_LOCKED; - - union_unlock(); - if (UNIONTOV(un) == NULLVP) - panic("null vnode in union node\n"); - if (vnode_get(UNIONTOV(un))) { - union_lock(); - un->un_flags &= ~UN_LOCKED; - if ((un->un_flags & UN_WANT) == UN_WANT) { - un->un_flags &= ~UN_LOCKED; - wakeup(&un->un_flags); - } - goto loop; - } - union_lock(); - - /* - * At this point, the union_node is locked, - * un->un_uppervp may not be locked, and uppervp - * is locked or nil. - */ - - /* - * Save information about the upper layer. - */ - if (uppervp != un->un_uppervp) { - union_newupper(un, uppervp); - } else if (uppervp) { - union_unlock(); - vnode_put(uppervp); - union_lock(); - } - - /* - * Save information about the lower layer. - * This needs to keep track of pathname - * and directory information which union_vn_create - * might need. - */ - if (lowervp != un->un_lowervp) { - union_newlower(un, lowervp); - if (cnp && (lowervp != NULLVP)) { - un->un_hash = cnp->cn_hash; - union_unlock(); - MALLOC(un->un_path, caddr_t, cnp->cn_namelen+1, - M_TEMP, M_WAITOK); - bcopy(cnp->cn_nameptr, un->un_path, - cnp->cn_namelen); - vnode_get(dvp); - union_lock(); - un->un_path[cnp->cn_namelen] = '\0'; - un->un_dirvp = dvp; - } - } else if (lowervp) { - union_unlock(); - vnode_put(lowervp); - union_lock(); - } - *vpp = UNIONTOV(un); - un->un_flags &= ~UN_LOCKED; - if ((un->un_flags & UN_WANT) == UN_WANT) { - un->un_flags &= ~UN_WANT; - wakeup(&un->un_flags); - } - return (0); - } - - if (docache) { - /* - * otherwise lock the vp list while we call getnewvnode - * since that can block. 
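
/*
 * Editorial sketch (not part of the original sources): the wildcard
 * cache probe performed in the loop above. A cached node matches when
 * each of its layers is either the vnode being sought or still unset
 * (NULLVP acts as a wildcard), so a node allocated before one of its
 * layers existed can still be found. The un_mount comparison and the
 * bucket locking are omitted; types are simplified stand-ins.
 */
#include <stddef.h>

struct unode {
	void         *upper;           /* NULL stands in for NULLVP */
	void         *lower;
	struct unode *next;            /* hash-chain link */
};

static struct unode *
cache_probe(struct unode *bucket, void *upper, void *lower)
{
	struct unode *un;

	for (un = bucket; un != NULL; un = un->next) {
		if ((un->lower == lower || un->lower == NULL) &&
		    (un->upper == upper || un->upper == NULL))
			return (un);
	}
	return (NULL);
}
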
- */ - hash = UNION_HASH(uppervp, lowervp); - - if (union_list_lock(hash)) - goto loop; - } - - union_unlock(); - MALLOC(unp, void *, sizeof(struct union_node), M_TEMP, M_WAITOK); - union_lock(); - - bzero(unp, sizeof(struct union_node)); - un = unp; - un->un_uppervp = uppervp; - if (uppervp != NULLVP) - un->un_uppervid = vnode_vid(uppervp); - un->un_uppersz = VNOVAL; - un->un_lowervp = lowervp; - if (lowervp != NULLVP) - un->un_lowervid = vnode_vid(lowervp); - un->un_lowersz = VNOVAL; - un->un_pvp = undvp; - if (undvp != NULLVP) - vnode_get(undvp); - un->un_dircache = 0; - un->un_openl = 0; - un->un_mount = mp; - un->un_flags = UN_LOCKED; -#ifdef FAULTFS - if (UNION_FAULTIN(um)) - un->un_flags |= UN_FAULTFS; -#endif - - if (docache) { - /* Insert with lock held */ - LIST_INSERT_HEAD(&unhead[hash], un, un_cache); - un->un_flags |= UN_CACHED; - union_list_unlock(hash); - } - - union_unlock(); - - if (uppervp) - vtype = uppervp->v_type; - else - vtype = lowervp->v_type; - - bzero(&vfsp, sizeof(struct vnode_fsparam)); - vfsp.vnfs_mp = mp; - vfsp.vnfs_vtype = vtype; - vfsp.vnfs_str = "unionfs"; - vfsp.vnfs_dvp = undvp; - vfsp.vnfs_fsnode = unp; - vfsp.vnfs_cnp = cnp; - vfsp.vnfs_vops = union_vnodeop_p; - vfsp.vnfs_rdev = 0; - vfsp.vnfs_filesize = 0; - vfsp.vnfs_flags = VNFS_NOCACHE | VNFS_CANTCACHE; - vfsp.vnfs_marksystem = 0; - vfsp.vnfs_markroot = markroot; - - error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, vpp); - if (error) { - /* XXXXX Is this right ???? XXXXXXX */ - if (uppervp) { - vnode_put(uppervp); - } - if (lowervp) - vnode_put(lowervp); - - union_lock(); - if (un->un_flags & UN_CACHED) { - un->un_flags &= ~UN_CACHED; - LIST_REMOVE(un, un_cache); - } - if (docache) - union_list_unlock(hash); - - FREE(unp, M_TEMP); - - return (error); - } - - if (cnp && (lowervp != NULLVP)) { - un->un_hash = cnp->cn_hash; - un->un_path = _MALLOC(cnp->cn_namelen+1, M_TEMP, M_WAITOK); - bcopy(cnp->cn_nameptr, un->un_path, cnp->cn_namelen); - un->un_path[cnp->cn_namelen] = '\0'; - vnode_get(dvp); - un->un_dirvp = dvp; - } else { - un->un_hash = 0; - un->un_path = 0; - un->un_dirvp = 0; - } - - if (xlowervp) - vnode_put(xlowervp); - - union_lock(); - - vnode_settag(*vpp, VT_UNION); - un->un_vnode = *vpp; - if (un->un_vnode->v_type == VDIR) { - if (un->un_uppervp == NULLVP) { - panic("faulting fs and no upper vp for dir?"); - } - - } - - - un->un_flags &= ~UN_LOCKED; - if ((un->un_flags & UN_WANT) == UN_WANT) { - un->un_flags &= ~UN_WANT; - wakeup(&un->un_flags); - } - - return(error); - -} - -/* always called with union lock held */ -int -union_freevp(struct vnode *vp) -{ - struct union_node *un = VTOUNION(vp); - - if (un->un_flags & UN_CACHED) { - un->un_flags &= ~UN_CACHED; - LIST_REMOVE(un, un_cache); - } - - union_unlock(); - if (un->un_pvp != NULLVP) - vnode_put(un->un_pvp); - if (un->un_uppervp != NULLVP) - vnode_put(un->un_uppervp); - if (un->un_lowervp != NULLVP) - vnode_put(un->un_lowervp); - if (un->un_dirvp != NULLVP) - vnode_put(un->un_dirvp); - if (un->un_path) - _FREE(un->un_path, M_TEMP); - - FREE(vp->v_data, M_TEMP); - vp->v_data = 0; - union_lock(); - - return (0); -} - -/* - * copyfile. copy the vnode (fvp) to the vnode (tvp) - * using a sequence of reads and writes. both (fvp) - * and (tvp) are locked on entry and exit. 
- */ -/* called with no union lock held */ -int -union_copyfile(struct vnode *fvp, struct vnode *tvp, vfs_context_t context) -{ - char *bufp; - struct uio *auio; - char uio_buf [ UIO_SIZEOF(1) ]; - int error = 0; - - /* - * strategy: - * allocate a buffer of size MAXPHYSIO. - * loop doing reads and writes, keeping track - * of the current uio offset. - * give up at the first sign of trouble. - */ - - auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, - UIO_READ /* will change */, &uio_buf, sizeof(uio_buf)); - - bufp = _MALLOC(MAXPHYSIO, M_TEMP, M_WAITOK); - if (bufp == NULL) { - return ENOMEM; - } - - /* ugly loop follows... */ - do { - off_t offset = uio_offset(auio); - - uio_reset(auio, offset, UIO_SYSSPACE, UIO_READ); - uio_addiov(auio, (uintptr_t)bufp, MAXPHYSIO); - error = VNOP_READ(fvp, auio, 0, context); - - if (error == 0) { - user_ssize_t resid = uio_resid(auio); - - uio_reset(auio, offset, UIO_SYSSPACE, UIO_WRITE); - uio_addiov(auio, (uintptr_t)bufp, MAXPHYSIO - resid); - - if (uio_resid(auio) == 0) - break; - - do { - error = VNOP_WRITE(tvp, auio, 0, context); - } while ((uio_resid(auio) > 0) && (error == 0)); - } - - } while (error == 0); - - _FREE(bufp, M_TEMP); - return (error); -} - -/* - * (un) is assumed to be locked on entry and remains - * locked on exit. - */ -/* always called with union lock held */ -int -union_copyup(struct union_node *un, int docopy, vfs_context_t context) -{ - int error; - struct vnode *lvp, *uvp; - struct vnode_attr vattr; - mode_t cmode = 0; - - - lvp = un->un_lowervp; - - union_unlock(); - - if (UNNODE_FAULTIN(un)) { - /* Need to inherit exec mode in faulting fs */ - VATTR_INIT(&vattr); - VATTR_WANTED(&vattr, va_flags); - if (vnode_getattr(lvp, &vattr, context) == 0 ) - cmode = vattr.va_mode; - - } - error = union_vn_create(&uvp, un, cmode, context); - if (error) { - union_lock(); - if (error == EEXIST) { - if (uvp != NULLVP) { - union_newupper(un, uvp); - error = 0; - } - } - return (error); - } - - union_lock(); - /* at this point, uppervp is locked */ - union_newupper(un, uvp); - union_unlock(); - - - if (docopy) { - /* - * XX - should not ignore errors - * from vnop_close - */ - error = VNOP_OPEN(lvp, FREAD, context); - if (error == 0) { - error = union_copyfile(lvp, uvp, context); - (void) VNOP_CLOSE(lvp, FREAD, context); - } -#ifdef UNION_DIAGNOSTIC - if (error == 0) - uprintf("union: copied up %s\n", un->un_path); -#endif - - } - union_vn_close(uvp, FWRITE, context); - - /* - * Subsequent IOs will go to the top layer, so - * call close on the lower vnode and open on the - * upper vnode to ensure that the filesystem keeps - * its references counts right. This doesn't do - * the right thing with (cred) and (FREAD) though. - * Ignoring error returns is not right, either. 
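
/*
 * Editorial sketch (not part of the original sources): the read/write
 * copy strategy union_copyfile() spells out above, transplanted to
 * user space with file descriptors. Read a buffer, then keep writing
 * until that buffer has drained; give up at the first sign of
 * trouble. The 64 KB buffer size is illustrative (the kernel code
 * uses MAXPHYSIO).
 */
#include <unistd.h>

static int
copy_fd(int from, int to)
{
	char    buf[64 * 1024];
	ssize_t nread, nwritten, off;

	for (;;) {
		nread = read(from, buf, sizeof(buf));
		if (nread < 0)
			return (-1);           /* read error: give up */
		if (nread == 0)
			return (0);            /* EOF: copy complete */
		for (off = 0; off < nread; off += nwritten) {
			nwritten = write(to, buf + off, nread - off);
			if (nwritten < 0)
				return (-1);
		}
	}
}
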
- */ - - /* No need to hold the lock as the union node should be locked for this(it is in faultin mode) */ - if (error == 0) { - int i; - - for (i = 0; i < un->un_openl; i++) { - (void) VNOP_CLOSE(lvp, FREAD, context); - (void) VNOP_OPEN(uvp, FREAD, context); - } - un->un_openl = 0; - } - - union_lock(); - - return (error); - -} - - -int -union_faultin_copyup(struct vnode **vpp, vnode_t udvp, vnode_t lvp, struct componentname * cnp, vfs_context_t context) -{ - int error; - struct vnode *uvp; - struct vnode_attr vattr; - struct vnode_attr *vap; - mode_t cmode = 0; - int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL); - struct proc * p = vfs_context_proc(context); - struct componentname cn; - - - vap = &vattr; - VATTR_INIT(vap); - VATTR_WANTED(vap, va_flags); - if (vnode_getattr(lvp, vap, context) == 0 ) - cmode = vattr.va_mode; - - *vpp = NULLVP; - - - if (cmode == (mode_t)0) - cmode = UN_FILEMODE & ~p->p_fd->fd_cmask; - else - cmode = cmode & ~p->p_fd->fd_cmask; - - - /* - * Build a new componentname structure (for the same - * reasons outlines in union_mkshadow()). - * The difference here is that the file is owned by - * the current user, rather than by the person who - * did the mount, since the current user needs to be - * able to write the file (that's why it is being - * copied in the first place). - */ - bzero(&cn, sizeof(struct componentname)); - - cn.cn_namelen = cnp->cn_namelen; - cn.cn_pnbuf = (caddr_t) _MALLOC_ZONE(cn.cn_namelen+1, - M_NAMEI, M_WAITOK); - cn.cn_pnlen = cn.cn_namelen+1; - bcopy(cnp->cn_nameptr, cn.cn_pnbuf, cn.cn_namelen+1); - cn.cn_nameiop = CREATE; - cn.cn_flags = (HASBUF|SAVENAME|SAVESTART|ISLASTCN|UNIONCREATED); - cn.cn_context = context; - cn.cn_nameptr = cn.cn_pnbuf; - cn.cn_hash = 0; - cn.cn_consume = 0; - - /* - * Pass dvp unlocked and referenced on call to relookup(). - * - * If an error occurs, dvp will be returned unlocked and dereferenced. - */ - if ((error = relookup(udvp, &uvp, &cn)) != 0) { - goto out; - } - - /* - * If no error occurs, dvp will be returned locked with the reference - * left as before, and vpp will be returned referenced and locked. - */ - if (uvp) { - *vpp = uvp; - error = EEXIST; - goto out; - } - - /* - * Good - there was no race to create the file - * so go ahead and create it. The permissions - * on the file will be 0666 modified by the - * current user's umask. Access to the file, while - * it is unioned, will require access to the top *and* - * bottom files. Access when not unioned will simply - * require access to the top-level file. - * - * TODO: confirm choice of access permissions. 
- * decide on authorisation behaviour - */ - - VATTR_INIT(vap); - VATTR_SET(vap, va_type, VREG); - VATTR_SET(vap, va_mode, cmode); - - cn.cn_flags |= (UNIONCREATED); - if ((error = vn_create(udvp, &uvp, &cn, vap, 0, context)) != 0) { - goto out; - } - - - if ((error = VNOP_OPEN(uvp, fmode, context)) != 0) { - vn_clearunionwait(uvp, 0); - vnode_recycle(uvp); - vnode_put(uvp); - goto out; - } - - error = vnode_ref_ext(uvp, fmode); - if (error ) { - vn_clearunionwait(uvp, 0); - VNOP_CLOSE(uvp, fmode, context); - vnode_recycle(uvp); - vnode_put(uvp); - goto out; - } - - - /* - * XX - should not ignore errors - * from vnop_close - */ - error = VNOP_OPEN(lvp, FREAD, context); - if (error == 0) { - error = union_copyfile(lvp, uvp, context); - (void) VNOP_CLOSE(lvp, FREAD, context); - } - - VNOP_CLOSE(uvp, fmode, context); - vnode_rele_ext(uvp, fmode, 0); - vn_clearunionwait(uvp, 0); - - *vpp = uvp; -out: - if ((cn.cn_flags & HASBUF) == HASBUF) { - FREE_ZONE(cn.cn_pnbuf, cn.cn_pnlen, M_NAMEI); - cn.cn_flags &= ~HASBUF; - } - return (error); -} - - -/* - * union_relookup: - * - * dvp should be locked on entry and will be locked on return. No - * net change in the ref count will occur. - * - * If an error is returned, *vpp will be invalid, otherwise it - * will hold a locked, referenced vnode. If *vpp == dvp then - * remember that only one exclusive lock is held. - */ - -/* No union lock held for this call */ -static int -union_relookup( -#ifdef XXX_HELP_ME - struct union_mount *um, -#else /* !XXX_HELP_ME */ - __unused struct union_mount *um, -#endif /* !XXX_HELP_ME */ - struct vnode *dvp, - struct vnode **vpp, - struct componentname *cnp, - struct componentname *cn, - char *path, - int pathlen) -{ - int error; - - /* - * A new componentname structure must be faked up because - * there is no way to know where the upper level cnp came - * from or what it is being used for. This must duplicate - * some of the work done by NDINIT, some of the work done - * by namei, some of the work done by lookup and some of - * the work done by vnop_lookup when given a CREATE flag. - * Conclusion: Horrible. - */ - cn->cn_namelen = pathlen; - cn->cn_pnbuf = _MALLOC_ZONE(cn->cn_namelen+1, M_NAMEI, M_WAITOK); - cn->cn_pnlen = cn->cn_namelen+1; - bcopy(path, cn->cn_pnbuf, cn->cn_namelen); - cn->cn_pnbuf[cn->cn_namelen] = '\0'; - - cn->cn_nameiop = CREATE; - cn->cn_flags = (HASBUF|SAVENAME|SAVESTART|ISLASTCN ); -#ifdef XXX_HELP_ME - cn->cn_proc = cnp->cn_proc; - if (um->um_op == UNMNT_ABOVE) - cn->cn_cred = cnp->cn_cred; - else - cn->cn_cred = um->um_cred; -#endif - cn->cn_context = cnp->cn_context; /* XXX !UNMNT_ABOVE case ??? */ - cn->cn_nameptr = cn->cn_pnbuf; - cn->cn_hash = 0; - cn->cn_consume = cnp->cn_consume; - - vnode_get(dvp); - error = relookup(dvp, vpp, cn); - vnode_put(dvp); - - return (error); -} - -/* - * Create a shadow directory in the upper layer. - * The new vnode is returned locked. - * - * (um) points to the union mount structure for access to the - * the mounting process's credentials. - * (dvp) is the directory in which to create the shadow directory, - * It is locked (but not ref'd) on entry and return. - * (cnp) is the component name to be created. 
- * (vpp) is the returned newly created shadow directory, which - * is returned locked and ref'd - */ -/* No union lock held for this call */ -int -union_mkshadow(um, dvp, cnp, vpp) - struct union_mount *um; - struct vnode *dvp; - struct componentname *cnp; - struct vnode **vpp; -{ - int error; - struct vnode_attr va; - struct componentname cn; - - bzero(&cn, sizeof(struct componentname)); - - - error = union_relookup(um, dvp, vpp, cnp, &cn, - cnp->cn_nameptr, cnp->cn_namelen); - if (error) - goto out; - - if (*vpp) { - error = EEXIST; - goto out; - } - - /* - * Policy: when creating the shadow directory in the - * upper layer, create it owned by the user who did - * the mount, group from parent directory, and mode - * 777 modified by umask (ie mostly identical to the - * mkdir syscall). (jsp, kb) - */ - - VATTR_INIT(&va); - VATTR_SET(&va, va_type, VDIR); - VATTR_SET(&va, va_mode, um->um_cmode); - - error = vn_create(dvp, vpp, &cn, &va, 0, cnp->cn_context); -out: - if ((cn.cn_flags & HASBUF) == HASBUF) { - FREE_ZONE(cn.cn_pnbuf, cn.cn_pnlen, M_NAMEI); - cn.cn_flags &= ~HASBUF; - } - return (error); -} - -/* - * Create a whiteout entry in the upper layer. - * - * (um) points to the union mount structure for access to the - * the mounting process's credentials. - * (dvp) is the directory in which to create the whiteout. - * it is locked on entry and exit. - * (cnp) is the componentname to be created. - */ -/* No union lock held for this call */ -int -union_mkwhiteout(um, dvp, cnp, path) - struct union_mount *um; - struct vnode *dvp; - struct componentname *cnp; - char *path; -{ - int error; - struct vnode *wvp; - struct componentname cn; - - bzero(&cn, sizeof(struct componentname)); - - error = union_relookup(um, dvp, &wvp, cnp, &cn, path, strlen(path)); - if (error) { - goto out; - } - if (wvp) { - error = EEXIST; - goto out; - } - - error = VNOP_WHITEOUT(dvp, &cn, CREATE, cnp->cn_context); - -out: - if ((cn.cn_flags & HASBUF) == HASBUF) { - FREE_ZONE(cn.cn_pnbuf, cn.cn_pnlen, M_NAMEI); - cn.cn_flags &= ~HASBUF; - } - return (error); -} - - -/* - * union_vn_create: creates and opens a new shadow file - * on the upper union layer. This function is similar - * in spirit to calling vn_open() but it avoids calling namei(). - * The problem with calling namei() is that a) it locks too many - * things, and b) it doesn't start at the "right" directory, - * whereas relookup() is told where to start. - * - * On entry, the vnode associated with un is locked. It remains locked - * on return. - * - * If no error occurs, *vpp contains a locked referenced vnode for your - * use. If an error occurs *vpp iis undefined. - */ -/* called with no union lock held */ -int -union_vn_create(struct vnode **vpp, struct union_node *un, mode_t cmode, vfs_context_t context) -{ - struct vnode *vp; - struct vnode_attr vat; - struct vnode_attr *vap = &vat; - int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL); - int error; - struct proc * p = vfs_context_proc(context); - struct componentname cn; - - bzero(&cn, sizeof(struct componentname)); - *vpp = NULLVP; - - if (cmode == (mode_t)0) - cmode = UN_FILEMODE & ~p->p_fd->fd_cmask; - else - cmode = cmode & ~p->p_fd->fd_cmask; - - - /* - * Build a new componentname structure (for the same - * reasons outlines in union_mkshadow()). - * The difference here is that the file is owned by - * the current user, rather than by the person who - * did the mount, since the current user needs to be - * able to write the file (that's why it is being - * copied in the first place). 
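
/*
 * Editorial sketch (not part of the original sources): the permission
 * arithmetic used above when creating the shadow file. Assuming
 * UN_FILEMODE is 0666 (the comment promises "0666 modified by the
 * current user's umask"), a typical umask of 022 yields 0644.
 */
#include <stdio.h>
#include <sys/types.h>

int
main(void)
{
	mode_t un_filemode = 0666;     /* assumed value of UN_FILEMODE */
	mode_t cmask = 022;            /* example umask */

	printf("0%o\n", un_filemode & ~cmask);  /* prints 0644 */
	return (0);
}
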
- */ - cn.cn_namelen = strlen(un->un_path); - cn.cn_pnbuf = (caddr_t) _MALLOC_ZONE(cn.cn_namelen+1, - M_NAMEI, M_WAITOK); - cn.cn_pnlen = cn.cn_namelen+1; - bcopy(un->un_path, cn.cn_pnbuf, cn.cn_namelen+1); - cn.cn_nameiop = CREATE; - if (UNNODE_FAULTIN(un)) - cn.cn_flags = (HASBUF|SAVENAME|SAVESTART|ISLASTCN|UNIONCREATED); - else - cn.cn_flags = (HASBUF|SAVENAME|SAVESTART|ISLASTCN); - cn.cn_context = context; - cn.cn_nameptr = cn.cn_pnbuf; - cn.cn_hash = un->un_hash; - cn.cn_consume = 0; - - /* - * Pass dvp unlocked and referenced on call to relookup(). - * - * If an error occurs, dvp will be returned unlocked and dereferenced. - */ - vnode_get(un->un_dirvp); - if ((error = relookup(un->un_dirvp, &vp, &cn)) != 0) { - vnode_put(un->un_dirvp); - goto out; - } - vnode_put(un->un_dirvp); - - /* - * If no error occurs, dvp will be returned locked with the reference - * left as before, and vpp will be returned referenced and locked. - */ - if (vp) { - *vpp = vp; - error = EEXIST; - goto out; - } - - /* - * Good - there was no race to create the file - * so go ahead and create it. The permissions - * on the file will be 0666 modified by the - * current user's umask. Access to the file, while - * it is unioned, will require access to the top *and* - * bottom files. Access when not unioned will simply - * require access to the top-level file. - * - * TODO: confirm choice of access permissions. - * decide on authorisation behaviour - */ - - VATTR_INIT(vap); - VATTR_SET(vap, va_type, VREG); - VATTR_SET(vap, va_mode, cmode); - - if ((error = vn_create(un->un_dirvp, &vp, &cn, vap, 0, context)) != 0) { - goto out; - } - - if ((error = VNOP_OPEN(vp, fmode, context)) != 0) { - vnode_put(vp); - goto out; - } - - vnode_lock(vp); - if (++vp->v_writecount <= 0) - panic("union: v_writecount"); - vnode_unlock(vp); - *vpp = vp; - error = 0; - -out: - if ((cn.cn_flags & HASBUF) == HASBUF) { - FREE_ZONE(cn.cn_pnbuf, cn.cn_pnlen, M_NAMEI); - cn.cn_flags &= ~HASBUF; - } - return(error); -} - -/* called with no union lock held */ -static int -union_vn_close(struct vnode *vp, int fmode, vfs_context_t context) -{ - - if (fmode & FWRITE) { - vnode_lock(vp); - --vp->v_writecount; - vnode_unlock(vp); - } - return (VNOP_CLOSE(vp, fmode, context)); -} - -/* - * union_removed_upper: - * - * An upper-only file/directory has been removed; un-cache it so - * that unionfs vnode gets reclaimed and the last uppervp reference - * disappears. - * - * Called with union_node unlocked. - */ -/* always called with union lock held */ -void -union_removed_upper(un) - struct union_node *un; -{ - union_newupper(un, NULLVP); - if (un->un_flags & UN_CACHED) { - un->un_flags &= ~UN_CACHED; - LIST_REMOVE(un, un_cache); - } - -} - -#if 0 -struct vnode * -union_lowervp(vp) - struct vnode *vp; -{ - struct union_node *un = VTOUNION(vp); - - if ((un->un_lowervp != NULLVP) && - (vp->v_type == un->un_lowervp->v_type)) { - if (vnode_get(un->un_lowervp) == 0) - return (un->un_lowervp); - } - - return (NULLVP); -} -#endif - -/* - * Determine whether a whiteout is needed - * during a remove/rmdir operation. 
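
/*
 * Editorial sketch (not part of the original sources): the decision
 * union_dowhiteout() implements just below. A whiteout is needed when
 * the name being removed is still backed by a lower-layer object, or
 * when the upper directory is marked opaque; otherwise the lower name
 * would reappear after the remove. Faulting mounts never whiteout.
 */
static int
needs_whiteout(int has_lower, int upper_is_opaque, int is_faultin)
{
	if (is_faultin)
		return (0);
	return (has_lower || upper_is_opaque);
}
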
- */ -/* called with no union lock held */ -int -union_dowhiteout(struct union_node *un, vfs_context_t ctx) -{ - struct vnode_attr va; - - if (UNNODE_FAULTIN(un)) - return(0); - - if ((un->un_lowervp != NULLVP) ) - return (1); - - VATTR_INIT(&va); - VATTR_WANTED(&va, va_flags); - if (vnode_getattr(un->un_uppervp, &va, ctx) == 0 && - (va.va_flags & OPAQUE)) - return (1); - - return (0); -} - -/* called with no union lock held */ -static void -union_dircache_r(struct vnode *vp, struct vnode ***vppp, int *cntp) -{ - struct union_node *un; - - if (vp->v_op != union_vnodeop_p) { - if (vppp) { - vnode_get(vp); - *(*vppp)++ = vp; - if (--(*cntp) == 0) - panic("union: dircache table too small"); - } else { - (*cntp)++; - } - - return; - } - - un = VTOUNION(vp); - if (un->un_uppervp != NULLVP) - union_dircache_r(un->un_uppervp, vppp, cntp); - if (un->un_lowervp != NULLVP) - union_dircache_r(un->un_lowervp, vppp, cntp); -} - -/* called with no union lock held */ -struct vnode * -union_dircache(struct vnode *vp, __unused vfs_context_t context) -{ - int count; - struct vnode *nvp, *lvp; - struct vnode **vpp; - struct vnode **dircache, **newdircache; - struct union_node *un; - int error; - int alloced = 0; - - union_lock(); - newdircache = NULL; - - nvp = NULLVP; - un = VTOUNION(vp); - - dircache = un->un_dircache; - if (dircache == 0) { - union_unlock(); - count = 0; - union_dircache_r(vp, 0, &count); - count++; -#if 0 - /* too bad; we need Union now! */ -#if MAC_XXX - panic("MAC Framework doesn't support unionfs (yet)\n"); -#endif /* MAC */ -#endif - - dircache = (struct vnode **) - _MALLOC(count * sizeof(struct vnode *), - M_TEMP, M_WAITOK); - if (dircache == NULL) { - goto out; - } - newdircache = dircache; - alloced = 1; - vpp = dircache; - union_dircache_r(vp, &vpp, &count); - *vpp = NULLVP; - vpp = dircache + 1; - union_lock(); - } else { - vpp = dircache; - do { - if (*vpp++ == un->un_uppervp) - break; - } while (*vpp != NULLVP); - } - - lvp = *vpp; - union_unlock(); - if (lvp == NULLVP) { - goto out; - } - - vnode_get(lvp); - union_lock(); - - error = union_allocvp(&nvp, vp->v_mount, NULLVP, NULLVP, 0, lvp, NULLVP, 0); - if (error) { - union_unlock(); - vnode_put(lvp); - goto out; - } - - un->un_dircache = 0; - un = VTOUNION(nvp); -#if 0 - if ((alloced != 0) && (un->un_dircache != 0)) { - union_unlock(); - for (vpp = newdircache; *vpp != NULLVP; vpp++) - vnode_put(*vpp); - _FREE(newdircache, M_TEMP); - newdircache = NULL; - union_lock(); - if (nvp != NULLVP) - union_freevp(nvp); - goto loop; - } -#endif - un->un_dircache = dircache; - un->un_flags |= UN_DIRENVN; - - newdircache = NULL; - union_unlock(); - return (nvp); - -out: - /* - * If we allocated a new dircache and couldn't attach - * it to a new vp, free the resources we allocated. 
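
/*
 * Editorial sketch (not part of the original sources): the two-pass
 * pattern union_dircache_r() uses above. The same recursion runs once
 * with no output array to count the non-union layers, and once more
 * to fill a NULLVP-terminated table sized from that count, so the
 * table never needs resizing mid-walk. Reference counting is omitted.
 */
#include <stddef.h>

struct vn {
	int        is_union;           /* stand-in for the v_op check */
	struct vn *upper, *lower;
};

static void
collect(struct vn *vp, struct vn ***out, int *cnt)
{
	if (vp == NULL)
		return;
	if (!vp->is_union) {           /* real layer: emit or count */
		if (out)
			*(*out)++ = vp;
		else
			(*cnt)++;
		return;
	}
	collect(vp->upper, out, cnt);  /* union node: walk both layers */
	collect(vp->lower, out, cnt);
}
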
- */ - if (newdircache) { - for (vpp = newdircache; *vpp != NULLVP; vpp++) - vnode_put(*vpp); - _FREE(newdircache, M_TEMP); - } - return (NULLVP); -} - -/* - * Module glue to remove #ifdef UNION from vfs_syscalls.c - */ -/* Called with no union lock, the union_dircache takes locks when necessary */ -static int -union_dircheck(struct vnode **vpp, struct fileproc *fp, vfs_context_t ctx) -{ - int error = 0; - vnode_t vp = *vpp; - - if (vp->v_op == union_vnodeop_p) { - struct vnode *lvp; - - lvp = union_dircache(vp, ctx); - if (lvp != NULLVP) { - struct vnode_attr va; - /* - * If the directory is opaque, - * then don't show lower entries - */ - VATTR_INIT(&va); - VATTR_WANTED(&va, va_flags); - error = vnode_getattr(vp, &va, ctx); - if (va.va_flags & OPAQUE) { - vnode_put(lvp); - lvp = NULL; - } - } - - if (lvp != NULLVP) { -#if CONFIG_MACF - error = mac_vnode_check_open(ctx, lvp, FREAD); - if (error) { - vnode_put(lvp); - return(error); - } -#endif /* MAC */ - error = VNOP_OPEN(lvp, FREAD, ctx); - if (error) { - vnode_put(lvp); - return(error); - } - vnode_ref(lvp); - fp->f_fglob->fg_data = (caddr_t) lvp; - fp->f_fglob->fg_offset = 0; - - error = VNOP_CLOSE(vp, FREAD, ctx); - vnode_rele(vp); - vnode_put(vp); - if (error) - return(error); - - *vpp = lvp; - return -1; /* goto unionread */ - } - } - return error; -} - -/* called from inactive with union lock held */ -void -union_dircache_free(struct union_node *un) -{ - struct vnode **vpp; - - vpp = un->un_dircache; - un->un_dircache = NULL; - union_unlock(); - - for (; *vpp != NULLVP; vpp++) - vnode_put(*vpp); - _FREE(un->un_dircache, M_TEMP); - union_lock(); -} - diff --git a/bsd/miscfs/union/union_vfsops.c b/bsd/miscfs/union/union_vfsops.c deleted file mode 100644 index 6924e2f67..000000000 --- a/bsd/miscfs/union/union_vfsops.c +++ /dev/null @@ -1,563 +0,0 @@ -/* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* - * Copyright (c) 1994, 1995 The Regents of the University of California. - * Copyright (c) 1994, 1995 Jan-Simon Pendry. - * All rights reserved. - * - * This code is derived from software donated to Berkeley by - * Jan-Simon Pendry. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)union_vfsops.c 8.20 (Berkeley) 5/20/95 - */ - -/* - * Union Layer - */ - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/time.h> -#include <sys/types.h> -#include <sys/proc_internal.h> -#include <sys/kauth.h> -#include <sys/vnode_internal.h> -#include <sys/mount_internal.h> -#include <sys/namei.h> -#include <sys/malloc.h> -#include <sys/filedesc.h> -#include <sys/queue.h> -#include <miscfs/union/union.h> - -static int union_itercallback(vnode_t, void *); -static int union_root(mount_t, vnode_t *, vfs_context_t); - -/* - * Mount union filesystem - */ -static int -union_mount(mount_t mp, __unused vnode_t devvp, user_addr_t data, vfs_context_t context) -{ - proc_t p = vfs_context_proc(context); - int error = 0; - struct user_union_args args; - struct vnode *lowerrootvp = NULLVP; - struct vnode *upperrootvp = NULLVP; - struct union_mount *um = NULL; - kauth_cred_t cred = NOCRED; - const char *cp = NULL; - char *vcp; - int len; - u_int size; - struct nameidata nd; - -#ifdef UNION_DIAGNOSTIC - printf("union_mount(mp = %x)\n", mp); -#endif - - /* - * Update is a no-op - */ - if (mp->mnt_flag & MNT_UPDATE) { - /* - * Need to provide. - * 1. a way to convert between rdonly and rdwr mounts. - * 2. support for nfs exports. - */ - error = ENOTSUP; - goto bad; - } - - /* - * Get argument - */ - if (vfs_context_is64bit(context)) { - error = copyin(data, (caddr_t)&args, sizeof(args)); - } - else { - struct union_args temp; - error = copyin(data, (caddr_t)&temp, sizeof (temp)); - args.target = CAST_USER_ADDR_T(temp.target); - args.mntflags = temp.mntflags; - } - if (error) - goto bad; - - lowerrootvp = mp->mnt_vnodecovered; - vnode_get(lowerrootvp); - - /* - * Find upper node. - */ - NDINIT(&nd, LOOKUP, FOLLOW|WANTPARENT, - (IS_64BIT_PROCESS(p) ? 
UIO_USERSPACE64 : UIO_USERSPACE32), - args.target, context); - - if ((error = namei(&nd))) - goto bad; - - nameidone(&nd); - upperrootvp = nd.ni_vp; - vnode_put(nd.ni_dvp); - nd.ni_dvp = NULL; - - if (upperrootvp->v_type != VDIR) { - error = EINVAL; - goto bad; - } - - MALLOC(um, struct union_mount *, sizeof(struct union_mount), - M_UFSMNT, M_WAITOK); - - /* - * Keep a held reference to the target vnodes. - * They are vnode_put'd in union_unmount. - * - * Depending on the _BELOW flag, the filesystems are - * viewed in a different order. In effect, this is the - * same as providing a mount under option to the mount syscall. - */ - - um->um_op = args.mntflags & UNMNT_OPMASK; - switch (um->um_op) { - case UNMNT_ABOVE: - um->um_lowervp = lowerrootvp; - um->um_uppervp = upperrootvp; - break; - - case UNMNT_BELOW: - um->um_lowervp = upperrootvp; - um->um_uppervp = lowerrootvp; - break; - - case UNMNT_REPLACE: - vnode_put(lowerrootvp); - lowerrootvp = NULLVP; - um->um_uppervp = upperrootvp; - um->um_lowervp = lowerrootvp; - break; - -#ifdef FAULTFS - case UNMNT_FAULTIN: - um->um_lowervp = upperrootvp; - um->um_uppervp = lowerrootvp; - break; -#endif - - default: - error = EINVAL; - goto bad; - } - - if (um->um_lowervp != NULLVP) - um->um_lowervid = vnode_vid(um->um_lowervp); - if (um->um_uppervp != NULLVP) - um->um_uppervid = vnode_vid(um->um_uppervp); - /* - * Unless the mount is readonly, ensure that the top layer - * supports whiteout operations - */ -#ifdef FAULTFS - if ((um->um_op != UNMNT_FAULTIN) && (mp->mnt_flag & MNT_RDONLY) == 0) -#else - if ((mp->mnt_flag & MNT_RDONLY) == 0) -#endif - { - error = VNOP_WHITEOUT(um->um_uppervp, (struct componentname *) 0, - LOOKUP, context); - if (error) - goto bad; - } - - um->um_cred = kauth_cred_get_with_ref(); - um->um_cmode = UN_DIRMODE &~ p->p_fd->fd_cmask; - - /* - * Depending on what you think the MNT_LOCAL flag might mean, - * you may want the && to be || on the conditional below. - * At the moment it has been defined that the filesystem is - * only local if it is all local, ie the MNT_LOCAL flag implies - * that the entire namespace is local. If you think the MNT_LOCAL - * flag implies that some of the files might be stored locally - * then you will want to change the conditional. - */ - if (um->um_op == UNMNT_ABOVE) { - if (((um->um_lowervp == NULLVP) || - (um->um_lowervp->v_mount->mnt_flag & MNT_LOCAL)) && - (um->um_uppervp->v_mount->mnt_flag & MNT_LOCAL)) - mp->mnt_flag |= MNT_LOCAL; - } - - /* - * Copy in the upper layer's RDONLY flag. This is for the benefit - * of lookup() which explicitly checks the flag, rather than asking - * the filesystem for it's own opinion. This means, that an update - * mount of the underlying filesystem to go from rdonly to rdwr - * will leave the unioned view as read-only. 
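
/*
 * Editorial sketch (not part of the original sources): how the switch
 * above orients the two root vnodes for each mount flavor. "mounted"
 * is the directory named in the mount arguments, "covered" is the
 * mount point being stacked on; UNMNT_FAULTIN (when built in) pairs
 * the layers the same way as UNMNT_BELOW.
 */
#include <stddef.h>

enum unmnt_op { OP_ABOVE, OP_BELOW, OP_REPLACE };

static void
assign_layers(enum unmnt_op op, void *mounted, void *covered,
    void **upper, void **lower)
{
	switch (op) {
	case OP_ABOVE:                 /* mounted dir hides the cover */
		*upper = mounted; *lower = covered; break;
	case OP_BELOW:                 /* mount point stays on top */
		*upper = covered; *lower = mounted; break;
	case OP_REPLACE:               /* no lower layer at all */
		*upper = mounted; *lower = NULL;    break;
	}
}
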
- */ - mp->mnt_flag |= (um->um_uppervp->v_mount->mnt_flag & MNT_RDONLY); - - mp->mnt_data = (qaddr_t) um; - vfs_getnewfsid(mp); - - - switch (um->um_op) { - case UNMNT_ABOVE: - cp = "<above>:"; - break; - case UNMNT_BELOW: - cp = "<below>:"; - break; - case UNMNT_REPLACE: - cp = ""; - break; -#ifdef FAULTFS - case UNMNT_FAULTIN: - cp = "/FaultingFS/"; - break; -#endif - } - len = strlen(cp); - bcopy(cp, mp->mnt_vfsstat.f_mntfromname, len); - - vcp = mp->mnt_vfsstat.f_mntfromname + len; - len = MNAMELEN - len; - - (void) copyinstr(args.target, vcp, len - 1, (size_t *)&size); - bzero(vcp + size, len - size); - -#ifdef UNION_DIAGNOSTIC - printf("union_mount: from %s, on %s\n", - mp->mnt_vfsstat.f_mntfromname, mp->mnt_vfsstat.f_mntonname); -#endif - return (0); - -bad: - if (um) - _FREE(um, M_UFSMNT); - if (IS_VALID_CRED(cred)) - kauth_cred_unref(&cred); - if (upperrootvp) - vnode_put(upperrootvp); - if (lowerrootvp) - vnode_put(lowerrootvp); - return (error); -} - -/* - * VFS start. Nothing needed here - the start routine - * on the underlying filesystem(s) will have been called - * when that filesystem was mounted. - */ -static int -union_start(__unused struct mount *mp, __unused int flags, __unused vfs_context_t context) -{ - - return (0); -} - -static int -union_itercallback(__unused vnode_t vp, void *args) -{ - int num = *(int *)args; - - *(int *)args = num + 1; - return(VNODE_RETURNED); -} - - - -/* - * Free reference to union layer - */ -static int -union_unmount(mount_t mp, int mntflags, vfs_context_t context) -{ - struct union_mount *um = MOUNTTOUNIONMOUNT(mp); - struct vnode *um_rootvp; - int error; - int freeing; - int flags = 0; - -#ifdef UNION_DIAGNOSTIC - printf("union_unmount(mp = %x)\n", mp); -#endif - - if (mntflags & MNT_FORCE) - flags |= FORCECLOSE; - - if ((error = union_root(mp, &um_rootvp, context))) - return (error); - - /* - * Keep flushing vnodes from the mount list. - * This is needed because of the un_pvp held - * reference to the parent vnode. - * If more vnodes have been freed on a given pass, - * the try again. The loop will iterate at most - * (d) times, where (d) is the maximum tree depth - * in the filesystem. - */ - for (freeing = 0; vflush(mp, um_rootvp, flags) != 0;) { - int n = 0; - - vnode_iterate(mp, VNODE_NOLOCK_INTERNAL, union_itercallback, &n); - - /* if this is unchanged then stop */ - if (n == freeing) - break; - - /* otherwise try once more time */ - freeing = n; - } - - /* At this point the root vnode should have a single reference */ - if (vnode_isinuse(um_rootvp, 0)) { - vnode_put(um_rootvp); - return (EBUSY); - } - -#ifdef UNION_DIAGNOSTIC - vprint("union root", um_rootvp); -#endif - /* - * Discard references to upper and lower target vnodes. - */ - if (um->um_lowervp) - vnode_put(um->um_lowervp); - vnode_put(um->um_uppervp); - if (IS_VALID_CRED(um->um_cred)) { - kauth_cred_unref(&um->um_cred); - } - /* - * Release reference on underlying root vnode - */ - vnode_put(um_rootvp); - /* - * And blow it away for future re-use - */ - vnode_reclaim(um_rootvp); - /* - * Finally, throw away the union_mount structure - */ - _FREE(mp->mnt_data, M_UFSMNT); /* XXX */ - mp->mnt_data = NULL; - return (0); -} - -static int -union_root(mount_t mp, vnode_t *vpp, __unused vfs_context_t context) -{ - struct union_mount *um = MOUNTTOUNIONMOUNT(mp); - int error; - - /* - * Return locked reference to root. 
- */ - vnode_get(um->um_uppervp); - if (um->um_lowervp) - vnode_get(um->um_lowervp); - - union_lock(); - error = union_allocvp(vpp, mp, - (struct vnode *) 0, - (struct vnode *) 0, - (struct componentname *) 0, - um->um_uppervp, - um->um_lowervp, - 1); - union_unlock(); - - if (error) { - vnode_put(um->um_uppervp); - if (um->um_lowervp) - vnode_put(um->um_lowervp); - } - - return (error); -} - -static int -union_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t context) -{ - int error; - struct union_mount *um = MOUNTTOUNIONMOUNT(mp); - struct vfs_attr attr; - uint32_t lbsize = 0; - -#ifdef UNION_DIAGNOSTIC - printf("union_vfs_getattr(mp = %x, lvp = %x, uvp = %x)\n", mp, - um->um_lowervp, - um->um_uppervp); -#endif - - /* Get values from lower file system (if any) */ - if (um->um_lowervp) { - VFSATTR_INIT(&attr); - VFSATTR_WANTED(&attr, f_bsize); - VFSATTR_WANTED(&attr, f_blocks); - VFSATTR_WANTED(&attr, f_bused); - VFSATTR_WANTED(&attr, f_files); - error = vfs_getattr(um->um_lowervp->v_mount, &attr, context); - if (error) - return (error); - - /* now copy across the "interesting" information and fake the rest */ - if (VFSATTR_IS_SUPPORTED(&attr, f_bsize)) - lbsize = attr.f_bsize; - else - lbsize = um->um_lowervp->v_mount->mnt_devblocksize; - fsap->f_blocks = VFSATTR_IS_SUPPORTED(&attr, f_blocks) ? attr.f_blocks : 0; - fsap->f_bused = VFSATTR_IS_SUPPORTED(&attr, f_bused) ? attr.f_bused : 0; - fsap->f_files = VFSATTR_IS_SUPPORTED(&attr, f_files) ? attr.f_files : 0; - } else { - fsap->f_blocks = 0; - fsap->f_bused = 0; - fsap->f_files = 0; - } - - VFSATTR_INIT(&attr); - VFSATTR_WANTED(&attr, f_bsize); - VFSATTR_WANTED(&attr, f_blocks); - VFSATTR_WANTED(&attr, f_bfree); - VFSATTR_WANTED(&attr, f_bavail); - VFSATTR_WANTED(&attr, f_files); - VFSATTR_WANTED(&attr, f_ffree); - error = vfs_getattr(um->um_uppervp->v_mount, &attr, context); - if (error) - return (error); - - if (VFSATTR_IS_SUPPORTED(&attr, f_bsize)) { - fsap->f_bsize = attr.f_bsize; - VFSATTR_SET_SUPPORTED(fsap, f_bsize); - } - if (VFSATTR_IS_SUPPORTED(&attr, f_iosize)) { - fsap->f_iosize = attr.f_iosize; - VFSATTR_SET_SUPPORTED(fsap, f_iosize); - } - - /* - * if the lower and upper blocksizes differ, then frig the - * block counts so that the sizes reported by df make some - * kind of sense. none of this makes sense though. - */ - if (VFSATTR_IS_SUPPORTED(&attr, f_bsize)) - fsap->f_bsize = attr.f_bsize; - else - fsap->f_bsize = um->um_uppervp->v_mount->mnt_devblocksize; - VFSATTR_RETURN(fsap, f_bsize, attr.f_bsize); - if (fsap->f_bsize != lbsize) - fsap->f_blocks = fsap->f_blocks * lbsize / attr.f_bsize; - - /* - * The "total" fields count total resources in all layers, - * the "free" fields count only those resources which are - * free in the upper layer (since only the upper layer - * is writeable). 
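
/*
 * Editorial sketch (not part of the original sources): the rescaling
 * union_vfs_getattr() applies above so the lower layer's block count
 * is expressed in the upper layer's block size before the two layers
 * are summed. For example, 1000000 lower blocks of 512 bytes become
 * 125000 blocks of 4096 bytes; the byte total is unchanged (overflow
 * of the intermediate product is ignored here, as in the original).
 */
#include <stdint.h>

static uint64_t
rescale_blocks(uint64_t blocks, uint32_t from_bsize, uint32_t to_bsize)
{
	return (blocks * from_bsize / to_bsize);
}
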
- */ - if (VFSATTR_IS_SUPPORTED(&attr, f_blocks)) - fsap->f_blocks += attr.f_blocks; - if (VFSATTR_IS_SUPPORTED(&attr, f_bfree)) - fsap->f_bfree = attr.f_bfree; - if (VFSATTR_IS_SUPPORTED(&attr, f_bavail)) - fsap->f_bavail = attr.f_bavail; - if (VFSATTR_IS_SUPPORTED(&attr, f_bused)) - fsap->f_bused += attr.f_bused; - if (VFSATTR_IS_SUPPORTED(&attr, f_files)) - fsap->f_files += attr.f_files; - if (VFSATTR_IS_SUPPORTED(&attr, f_ffree)) - fsap->f_ffree = attr.f_ffree; - - VFSATTR_SET_SUPPORTED(fsap, f_bsize); - VFSATTR_SET_SUPPORTED(fsap, f_blocks); - VFSATTR_SET_SUPPORTED(fsap, f_bfree); - VFSATTR_SET_SUPPORTED(fsap, f_bavail); - VFSATTR_SET_SUPPORTED(fsap, f_bused); - VFSATTR_SET_SUPPORTED(fsap, f_files); - VFSATTR_SET_SUPPORTED(fsap, f_ffree); - - return (0); -} - -/* - * XXX - Assumes no data cached at union layer. - */ -#define union_sync (int (*) (mount_t, int, vfs_context_t))nullop - -#define union_fhtovp (int (*) (mount_t, int, unsigned char *, vnode_t *, vfs_context_t))eopnotsupp -#define union_sysctl (int (*) (int *, u_int, user_addr_t, size_t *, user_addr_t, size_t, vfs_context_t))eopnotsupp -#define union_vget (int (*) (mount_t, ino64_t, vnode_t *, vfs_context_t))eopnotsupp -#define union_vptofh (int (*) (vnode_t, int *, unsigned char *, vfs_context_t))eopnotsupp - -struct vfsops union_vfsops = { - union_mount, - union_start, - union_unmount, - union_root, - NULL, /* quotactl */ - union_vfs_getattr, - union_sync, - union_vget, - union_fhtovp, - union_vptofh, - union_init, - union_sysctl, - NULL, - {NULL} -}; - - diff --git a/bsd/miscfs/union/union_vnops.c b/bsd/miscfs/union/union_vnops.c deleted file mode 100644 index ddc374dea..000000000 --- a/bsd/miscfs/union/union_vnops.c +++ /dev/null @@ -1,1726 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* - * Copyright (c) 1992, 1993, 1994, 1995 Jan-Simon Pendry. - * Copyright (c) 1992, 1993, 1994, 1995 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Jan-Simon Pendry. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)union_vnops.c 8.32 (Berkeley) 6/23/95 - */ - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/proc.h> -#include <sys/kauth.h> -#include <sys/file.h> -#include <sys/time.h> -#include <sys/stat.h> -#include <sys/types.h> -#include <sys/vnode_internal.h> -#include <sys/mount_internal.h> -#include <sys/namei.h> -#include <sys/malloc.h> -#include <sys/buf_internal.h> -#include <sys/queue.h> -#include <sys/lock.h> -#include <miscfs/union/union.h> -#include <vfs/vfs_support.h> -#include <sys/ubc.h> -#include <sys/kdebug.h> -#include <sys/uio_internal.h> - -/* called with no union lock held */ -static int -union_lookup1(struct vnode *udvp, struct vnode **dvpp, struct vnode **vpp, - struct componentname *cnp) -{ - int error; - vfs_context_t ctx = cnp->cn_context; - struct vnode *tdvp; - struct vnode *dvp; - struct mount *mp; - - dvp = *dvpp; - - /* - * If stepping up the directory tree, check for going - * back across the mount point, in which case do what - * lookup would do by stepping back down the mount - * hierarchy. - */ - if (cnp->cn_flags & ISDOTDOT) { - while ((dvp != udvp) && (dvp->v_flag & VROOT)) { - /* - * Don't do the NOCROSSMOUNT check - * at this level. By definition, - * union fs deals with namespaces, not - * filesystems. - */ - tdvp = dvp; - *dvpp = dvp = dvp->v_mount->mnt_vnodecovered; - vnode_put(tdvp); - vnode_get(dvp); - } - } - - error = VNOP_LOOKUP(dvp, &tdvp, cnp, ctx); - if (error) - return (error); - - dvp = tdvp; - /* - * Lastly check if the current node is a mount point in - * which case walk up the mount hierarchy making sure not to - * bump into the root of the mount tree (ie. dvp != udvp). 
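
/*
 * Editorial sketch (not part of the original sources): the ".."
 * traversal performed at the top of union_lookup1() above. Stepping
 * up from the root of a mounted filesystem continues at the vnode
 * that mount covers, stopping at the union mount's own root; the
 * complementary walk, descending into mount points after the lookup
 * via VFS_ROOT(), follows just below. Fields are simplified stand-ins.
 */
#include <stddef.h>

struct vdir {
	int          is_root;          /* stand-in for the VROOT flag */
	struct vdir *covered;          /* vnode this mount covers */
};

static struct vdir *
step_up(struct vdir *dvp, struct vdir *union_root)
{
	while (dvp != union_root && dvp->is_root)
		dvp = dvp->covered;    /* cross back over the mount */
	return (dvp);
}
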
- */ - while (dvp != udvp && (dvp->v_type == VDIR) && - (mp = dvp->v_mountedhere)) { - if (vfs_busy(mp, LK_NOWAIT)) { - vnode_put(dvp); - return(ENOENT); - } - error = VFS_ROOT(mp, &tdvp, ctx); - vfs_unbusy(mp); - if (error) { - vnode_put(dvp); - return (error); - } - - vnode_put(dvp); - dvp = tdvp; - } - - *vpp = dvp; - return (0); -} - -static int -union_lookup(struct vnop_lookup_args *ap) -/* - struct vnop_lookup_args { - struct vnodeop_desc *a_desc; - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - vfs_context_t a_context; - } *ap) -*/ -{ - int error; - int uerror = 0; - int lerror = 0; - struct vnode *uppervp, *lowervp; - struct vnode *upperdvp, *lowerdvp; - struct vnode *dvp = ap->a_dvp; - struct union_node *dun; - struct componentname *cnp = ap->a_cnp; - vfs_context_t ctx = cnp->cn_context; - int lockparent = cnp->cn_flags & LOCKPARENT; - struct union_mount *um; - kauth_cred_t saved_cred; - int iswhiteout; - struct vnode_attr va; - int isfaultfs = 0; - int upperlookup = 0; - int retry_count = 0; - -#ifdef notyet - if (cnp->cn_namelen == 3 && - cnp->cn_nameptr[2] == '.' && - cnp->cn_nameptr[1] == '.' && - cnp->cn_nameptr[0] == '.') { - dvp = *ap->a_vpp = LOWERVP(ap->a_dvp); - if (dvp == NULLVP) - return (ENOENT); - vnode_get(dvp); - - return (0); - } -#endif - - - -retry: - union_lock(); - um = MOUNTTOUNIONMOUNT(dvp->v_mount); - dun = VTOUNION(dvp); - upperdvp = dun->un_uppervp; - lowerdvp = dun->un_lowervp; - uppervp = NULLVP; - lowervp = NULLVP; - iswhiteout = 0; - - union_unlock(); - - if(UNION_FAULTIN(um)) - isfaultfs = 1; - - if (isfaultfs == 0) - cnp->cn_flags |= LOCKPARENT; - - /* - * do the lookup in the upper level. - * if that level comsumes additional pathnames, - * then assume that something special is going - * on and just return that vnode. - */ - if (upperdvp != NULLVP) { - if (lockparent != 0) - cnp->cn_flags &= ~LOCKPARENT; - uerror = union_lookup1(um->um_uppervp, &upperdvp, - &uppervp, cnp); - upperlookup = 1; - - if (cnp->cn_consume != 0) { - *ap->a_vpp = uppervp; - if (!lockparent) - cnp->cn_flags &= ~LOCKPARENT; - else - cnp->cn_flags |= LOCKPARENT; - return (uerror); - } - if (uerror == ENOENT || uerror == EJUSTRETURN) { - if (cnp->cn_flags & ISWHITEOUT) { - iswhiteout = 1; - } else if (lowerdvp != NULLVP) { - VATTR_INIT(&va); - VATTR_WANTED(&va, va_flags); - lerror = vnode_getattr(upperdvp, &va, ap->a_context); - if (lerror == 0 && (va.va_flags & OPAQUE)) - iswhiteout = 1; - } - } - } else { - uerror = ENOENT; - } - - /* - * faultingfs: If upper layer lookup is succesful - * we will return that vp if it is regular file. - * So so skip lower level lookup - */ - - if ((isfaultfs == 1) && (upperlookup == 1) && (uerror == 0) && ((vnode_isreg(uppervp) != 0))) - goto donelowerlookup; - - /* - * in a similar way to the upper layer, do the lookup - * in the lower layer. this time, if there is some - * component magic going on, then vnode_put whatever we got - * back from the upper layer and return the lower vnode - * instead. - */ - if (lowerdvp != NULLVP && !iswhiteout) { - int nameiop; - - /* - * Only do a LOOKUP on the bottom node, since - * we won't be making changes to it anyway. 
- */ - nameiop = cnp->cn_nameiop; - cnp->cn_nameiop = LOOKUP; - if (um->um_op == UNMNT_BELOW) { - /* XXX BOGUS */ - saved_cred = cnp->cn_context->vc_ucred; - cnp->cn_context->vc_ucred = um->um_cred; - if (lockparent != 0) - cnp->cn_flags &= ~LOCKPARENT; - lerror = union_lookup1(um->um_lowervp, &lowerdvp, - &lowervp, cnp); - cnp->cn_context->vc_ucred = saved_cred; - } else { - if (lockparent != 0) - cnp->cn_flags &= ~LOCKPARENT; - lerror = union_lookup1(um->um_lowervp, &lowerdvp, - &lowervp, cnp); - } - cnp->cn_nameiop = nameiop; - - if (cnp->cn_consume != 0) { - if (uppervp != NULLVP) { - vnode_put(uppervp); - uppervp = NULLVP; - } - *ap->a_vpp = lowervp; - if (!lockparent) - cnp->cn_flags &= ~LOCKPARENT; - else - cnp->cn_flags |= LOCKPARENT; - return (lerror); - } - } else { - lerror = ENOENT; - if ((cnp->cn_flags & ISDOTDOT) && dun->un_pvp != NULLVP) { - lowervp = LOWERVP(dun->un_pvp); - if (lowervp != NULLVP) { - lerror = 0; - } - } - } - -donelowerlookup: - - if (!lockparent) - cnp->cn_flags &= ~LOCKPARENT; - - /* - * at this point, we have uerror and lerror indicating - * possible errors with the lookups in the upper and lower - * layers. additionally, uppervp and lowervp are (locked) - * references to existing vnodes in the upper and lower layers. - * - * there are now three cases to consider. - * 1. if both layers returned an error, then return whatever - * error the upper layer generated. - * - * 2. if the top layer failed and the bottom layer succeeded - * then two subcases occur. - * a. the bottom vnode is not a directory, in which - * case just return a new union vnode referencing - * an empty top layer and the existing bottom layer. - * b. the bottom vnode is a directory, in which case - * create a new directory in the top-level and - * continue as in case 3. - * - * 3. if the top layer succeeded then return a new union - * vnode referencing whatever the new top layer and - * whatever the bottom layer returned. - */ - - *ap->a_vpp = NULLVP; - - /* case 1. */ - if ((uerror != 0) && (lerror != 0)) { - if (!lockparent) - cnp->cn_flags &= ~LOCKPARENT; - else - cnp->cn_flags |= LOCKPARENT; - return (uerror); - } - - /* case 2. */ - if (uerror != 0 /* && (lerror == 0) */ ) { - if (lowervp->v_type == VDIR) { /* case 2b. 
*/ - /* No need to lock the union here */ - /* if the vnode exists it returns it even if it marks error */ - - uppervp = NULLVP; - - uerror = union_mkshadow(um, upperdvp, cnp, &uppervp); - - if ((uerror == EEXIST)){ - if (uppervp == NULLVP) { - retry_count++; - if (retry_count <= 2) { - if (lowervp != NULLVP) - vnode_put(lowervp); - goto retry; - } - } - uerror = 0; - } - - if (uerror) { - if (uppervp != NULLVP) { - vnode_put(uppervp); - } - if (lowervp != NULLVP) { - vnode_put(lowervp); - } - if (!lockparent) - cnp->cn_flags &= ~LOCKPARENT; - else - cnp->cn_flags |= LOCKPARENT; - return (uerror); - } - } else if ((lowervp->v_type == VREG) && (isfaultfs == 1)) { - error = union_faultin_copyup(&uppervp, upperdvp, lowervp, cnp, ctx); - uerror = 0; - } - } - - - /* if this is faulting filesystem and upper vp exisits skip allocation of union node */ - if ((isfaultfs == 1) && (uerror == 0) && (uppervp != NULLVP) && ((vnode_isreg(uppervp) != 0)|| (vnode_islnk(uppervp) != 0))) { - vn_checkunionwait(uppervp); - *ap->a_vpp = uppervp; - if (lowervp != NULLVP) - vnode_put(lowervp); - if (!lockparent) - cnp->cn_flags &= ~LOCKPARENT; - else - cnp->cn_flags |= LOCKPARENT; - return(0); - } - - union_lock(); - error = union_allocvp(ap->a_vpp, dvp->v_mount, dvp, upperdvp, cnp, - uppervp, lowervp, 1); - union_unlock(); - - if (error) { - if (uppervp != NULLVP) - vnode_put(uppervp); - if (lowervp != NULLVP) - vnode_put(lowervp); - } - - if (!lockparent) - cnp->cn_flags &= ~LOCKPARENT; - else - cnp->cn_flags |= LOCKPARENT; - return (error); -} - -static int -union_create(struct vnop_create_args *ap) -/* - struct vnop_create_args { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - struct vnode_attr *a_vap; - vfs_context_t a_context; - } *ap; -*/ -{ - struct union_node *un = VTOUNION(ap->a_dvp); - struct vnode *dvp = un->un_uppervp; - struct componentname *cnp = ap->a_cnp; - - if (dvp != NULLVP) { - int error; - struct vnode *vp; - struct mount *mp; - - - mp = ap->a_dvp->v_mount; - - /* note that this is a direct passthrough to the filesystem */ - error = VNOP_CREATE(dvp, &vp, cnp, ap->a_vap, ap->a_context); - if (error) - return (error); - - /* if this is faulting filesystem and is a reg file, skip allocation of union node */ - if (UNNODE_FAULTIN(un) && (vp != NULLVP) && ((vnode_isreg(vp) != 0)|| (vnode_islnk(vp) != 0))) { - *ap->a_vpp = vp; - return(0); - } - - - union_lock(); - error = union_allocvp(ap->a_vpp, mp, NULLVP, NULLVP, cnp, vp, - NULLVP, 1); - union_unlock(); - if (error) - vnode_put(vp); - return (error); - } - - return (EROFS); -} - -static int -union_whiteout(struct vnop_whiteout_args *ap) -/* - struct vnop_whiteout_args { - struct vnode *a_dvp; - struct componentname *a_cnp; - int a_flags; - vfs_context_t a_context; - } *ap; -*/ -{ - struct union_node *un = VTOUNION(ap->a_dvp); - struct componentname *cnp = ap->a_cnp; - int error; - - if (un->un_uppervp == NULLVP) { - return (ENOTSUP); - } - - error = (VNOP_WHITEOUT(un->un_uppervp, cnp, ap->a_flags, ap->a_context)); - return(error); -} - -/* mknod can do fifos, chr, blk or whiteout entries */ -static int -union_mknod(struct vnop_mknod_args *ap) -/* - struct vnop_mknod_args { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - struct vnode_attr *a_vap; - vfs_context_t a_context; - } *ap; -*/ -{ - struct union_node *un = VTOUNION(ap->a_dvp); - struct vnode *dvp = un->un_uppervp; - struct componentname *cnp = ap->a_cnp; - - if (dvp != NULLVP) { - int error; - struct vnode *vp; - struct 
mount *mp; - - - mp = ap->a_dvp->v_mount; - - /* note that this is a direct passthrough to the filesystem */ - error = VNOP_MKNOD(dvp, &vp, cnp, ap->a_vap, ap->a_context); - if (error) - return (error); - - if (vp != NULLVP) { - union_lock(); - error = union_allocvp(ap->a_vpp, mp, NULLVP, NULLVP, - cnp, vp, NULLVP, 1); - union_unlock(); - if (error) - vnode_put(vp); - } - return (error); - } - return (EROFS); -} - -static int -union_open(struct vnop_open_args *ap) -/* - struct vnop_open_args { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - int a_mode; - vfs_context_t a_context; - } *ap; -*/ -{ - struct union_node *un = VTOUNION(ap->a_vp); - struct vnode *tvp; - int mode = ap->a_mode; - int error; - - /* - * If there is an existing upper vp then simply open that. - */ - - tvp = un->un_uppervp; - if (tvp == NULLVP) { - - /* - * If the lower vnode is being opened for writing, then - * copy the file contents to the upper vnode and open that, - * otherwise can simply open the lower vnode. - */ - tvp = un->un_lowervp; - if ((ap->a_mode & FWRITE) && (tvp->v_type == VREG)) { - /* For above below mounts we need draining.. */ - /* This path is not taken for faultin mode */ - /* LOCK the union node as well **/ - union_lock(); - un->un_flags |= UN_LOCKED; - - error = union_copyup(un, (mode&O_TRUNC) == 0, ap->a_context); - un->un_flags &= ~UN_LOCKED; - if ((un->un_flags & UN_WANT) == UN_WANT) { - un->un_flags &= ~UN_WANT; - wakeup(&un->un_flags); - } - union_unlock(); - if (error == 0) - error = VNOP_OPEN(un->un_uppervp, mode, ap->a_context); - return (error); - } - - /* - * Just open the lower vnode - */ - un->un_openl++; - - error = VNOP_OPEN(tvp, mode, ap->a_context); - - return (error); - } - - error = VNOP_OPEN(tvp, mode, ap->a_context); - - return (error); -} - -static int -union_close(struct vnop_close_args *ap) -/* - struct vnop_close_args { - struct vnode *a_vp; - int a_fflag; - vfs_context_t a_context; - } *ap; -*/ -{ - struct union_node *un = VTOUNION(ap->a_vp); - struct vnode *vp; - int error = 0; - - if ((vp = un->un_uppervp) == NULLVP) { -#ifdef UNION_DIAGNOSTIC - if (un->un_openl <= 0) - panic("union: un_openl cnt"); -#endif - --un->un_openl; - vp = un->un_lowervp; - } - - ap->a_vp = vp; - error = (VCALL(vp, VOFFSET(vnop_close), ap)); - return(error); -} - -/* - * Check access permission on the union vnode. - * The access check being enforced is to check - * against both the underlying vnode, and any - * copied vnode. This ensures that no additional - * file permissions are given away simply because - * the user caused an implicit file copy. 
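Editor's note: the copy-up rule in union_open() above is the union mount's copy-on-write trigger — opening a lower-layer regular file for writing first materializes an upper copy via union_copyup() (whose second argument suppresses the data copy when O_TRUNC makes copying pointless). A sketch of just the predicate, with a hypothetical helper name that is not in the patch:

static int
union_open_needs_copyup(struct union_node *un, int mode)
{
	/* copy up only when there is no upper copy yet and a write is requested */
	return (un->un_uppervp == NULLVP &&
	    un->un_lowervp != NULLVP &&
	    (mode & FWRITE) &&
	    un->un_lowervp->v_type == VREG);
}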
- */ -static int -union_access(struct vnop_access_args *ap) -/* - struct vnop_access_args { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - int a_action; - vfs_context_t a_context; - } *ap; -*/ -{ - struct union_node *un = VTOUNION(ap->a_vp); - int error = EACCES; - struct vnode *vp; - - if ((vp = un->un_uppervp) != NULLVP) { - ap->a_vp = vp; - return (VCALL(vp, VOFFSET(vnop_access), ap)); - } - - if ((vp = un->un_lowervp) != NULLVP) { - ap->a_vp = vp; - error = VCALL(vp, VOFFSET(vnop_access), ap); - if (error == 0) { - struct union_mount *um = MOUNTTOUNIONMOUNT(vp->v_mount); - - if (um->um_op == UNMNT_BELOW) { - error = VCALL(vp, VOFFSET(vnop_access), ap); - } - } - if (error) - return (error); - } - - return (error); -} - -/* - * We handle getattr only to change the fsid and - * track object sizes - */ -static int -union_getattr(struct vnop_getattr_args *ap) -/* - struct vnop_getattr_args { - struct vnode *a_vp; - struct vnode_attr *a_vap; - vfs_context_t a_context; - } *ap; -*/ -{ - int error=0; - struct union_node *un = VTOUNION(ap->a_vp); - struct vnode *vp = un->un_uppervp; - struct vnode_attr *vap; - struct vnode_attr va; - - - /* - * Some programs walk the filesystem hierarchy by counting - * links to directories to avoid stat'ing all the time. - * This means the link count on directories needs to be "correct". - * The only way to do that is to call getattr on both layers - * and fix up the link count. The link count will not necessarily - * be accurate but will be large enough to defeat the tree walkers. - */ - - vap = ap->a_vap; - - vp = un->un_uppervp; - if (vp != NULLVP) { - /* - * It's not clear whether vnop_getattr is to be - * called with the vnode locked or not. stat() calls - * it with (vp) locked, and fstat calls it with - * (vp) unlocked. - * In the mean time, compensate here by checking - * the union_node's lock flag. - */ - - error = vnode_getattr(vp, vap, ap->a_context); - if (error) { - return (error); - } - union_lock(); - union_newsize(ap->a_vp, vap->va_data_size, VNOVAL); - union_unlock(); - } - - if (vp == NULLVP) { - vp = un->un_lowervp; - } else if (vp->v_type == VDIR) { - vp = un->un_lowervp; - VATTR_INIT(&va); - /* all we want from the lower node is the link count */ - VATTR_WANTED(&va, va_nlink); - vap = &va; - } else { - vp = NULLVP; - } - - if (vp != NULLVP) { - error = vnode_getattr(vp, vap, ap->a_context); - if (error) { - return (error); - } - union_lock(); - union_newsize(ap->a_vp, VNOVAL, vap->va_data_size); - union_unlock(); - } - - if ((vap != ap->a_vap) && (vap->va_type == VDIR)) - ap->a_vap->va_nlink += vap->va_nlink; - - VATTR_RETURN(ap->a_vap, va_fsid, ap->a_vp->v_mount->mnt_vfsstat.f_fsid.val[0]); - return (0); -} - -static int -union_setattr(struct vnop_setattr_args *ap) -/* - struct vnop_setattr_args { - struct vnode *a_vp; - struct vnode_attr *a_vap; - vfs_context_t a_context; - } *ap; -*/ -{ - struct union_node *un = VTOUNION(ap->a_vp); - int error; - - /* - * Handle case of truncating lower object to zero size, - * by creating a zero length upper object. This is to - * handle the case of open with O_TRUNC and O_CREAT. - */ - if (VATTR_IS_ACTIVE(ap->a_vap, va_data_size) && - (un->un_uppervp == NULLVP) && - /* assert(un->un_lowervp != NULLVP) */ - (un->un_lowervp->v_type == VREG)) { - union_lock(); - error = union_copyup(un, (ap->a_vap->va_data_size != 0), ap->a_context); - union_unlock(); - if (error) { - return (error); - } - } - - /* - * Try to set attributes in upper layer, - * otherwise return read-only filesystem error. 
- */ - if (un->un_uppervp != NULLVP) { - error = vnode_setattr(un->un_uppervp, ap->a_vap, ap->a_context); - if ((error == 0) && VATTR_IS_ACTIVE(ap->a_vap, va_data_size)) { - union_lock(); - union_newsize(ap->a_vp, ap->a_vap->va_data_size, VNOVAL); - union_unlock(); - } - } else { - error = EROFS; - } - - return (error); -} - -static int -union_read(struct vnop_read_args *ap) -/* - struct vnop_read_args { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - vfs_context_t a_context; - } *ap; -*/ -{ - int error; - struct vnode *vp = OTHERVP(ap->a_vp); - - error = VNOP_READ(vp, ap->a_uio, ap->a_ioflag, ap->a_context); - - /* - * XXX - * perhaps the size of the underlying object has changed under - * our feet. take advantage of the offset information present - * in the uio structure. - */ - if (error == 0) { - struct union_node *un = VTOUNION(ap->a_vp); - off_t cur = ap->a_uio->uio_offset; - - if (vp == un->un_uppervp) { - if (cur > un->un_uppersz) { - union_lock(); - union_newsize(ap->a_vp, cur, VNOVAL); - union_unlock(); - } - } else { - if (cur > un->un_lowersz) { - union_lock(); - union_newsize(ap->a_vp, VNOVAL, cur); - union_unlock(); - } - } - } - - return (error); -} - -static int -union_write(struct vnop_write_args *ap) -/* - struct vnop_write_args { - struct vnode *a_vp; - struct uio *a_uio; - int a_ioflag; - vfs_context_t a_context; - } *ap; -*/ -{ - int error; - struct vnode *vp; - struct union_node *un = VTOUNION(ap->a_vp); - - vp = UPPERVP(ap->a_vp); - if (vp == NULLVP) - panic("union: missing upper layer in write"); - - error = VNOP_WRITE(vp, ap->a_uio, ap->a_ioflag, ap->a_context); - - /* - * the size of the underlying object may be changed by the - * write. - */ - if (error == 0) { - off_t cur = ap->a_uio->uio_offset; - - if (cur > un->un_uppersz) { - union_lock(); - union_newsize(ap->a_vp, cur, VNOVAL); - union_unlock(); - } - } - - return (error); -} - - -static int -union_ioctl(struct vnop_ioctl_args *ap) -/* - struct vnop_ioctl_args { - struct vnode *a_vp; - int a_command; - caddr_t a_data; - int a_fflag; - vfs_context_t a_context; - } *ap; -*/ -{ - register struct vnode *ovp = OTHERVP(ap->a_vp); - - ap->a_vp = ovp; - return (VCALL(ovp, VOFFSET(vnop_ioctl), ap)); -} - -static int -union_select(struct vnop_select_args *ap) -/* - struct vnop_select_args { - struct vnode *a_vp; - int a_which; - int a_fflags; - void * a_wql; - vfs_context_t a_context; - } *ap; -*/ -{ - register struct vnode *ovp = OTHERVP(ap->a_vp); - - ap->a_vp = ovp; - return (VCALL(ovp, VOFFSET(vnop_select), ap)); -} - -static int -union_revoke(struct vnop_revoke_args *ap) -/* - struct vnop_revoke_args { - struct vnode *a_vp; - int a_flags; - vfs_context_t a_context; - } *ap; -*/ -{ - struct vnode *vp = ap->a_vp; - - if (UPPERVP(vp)) - VNOP_REVOKE(UPPERVP(vp), ap->a_flags, ap->a_context); - if (LOWERVP(vp)) - VNOP_REVOKE(LOWERVP(vp), ap->a_flags, ap->a_context); - vnode_reclaim(vp); - - return (0); -} - -static int -union_mmap(struct vnop_mmap_args *ap) -/* - struct vnop_mmap_args { - struct vnode *a_vp; - int a_fflags; - kauth_cred_t a_cred; - struct proc *a_p; - } *ap; -*/ -{ - register struct vnode *ovp = OTHERVP(ap->a_vp); - - ap->a_vp = ovp; - return (VCALL(ovp, VOFFSET(vnop_mmap), ap)); -} - -static int -union_mnomap(struct vnop_mnomap_args *ap) -/* - struct vnop_mnomap_args { - struct vnode *a_vp; - int a_fflags; - kauth_cred_t a_cred; - struct proc *a_p; - } *ap; -*/ -{ - register struct vnode *ovp = OTHERVP(ap->a_vp); - - ap->a_vp = ovp; - return (VCALL(ovp, VOFFSET(vnop_mnomap), ap)); -} - 
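Editor's note: union_read() and union_write() above share one size-tracking idiom — after a successful transfer, uio_offset points one byte past the last byte moved, so it is a lower bound on the object's current size, and the cached upper/lower sizes are grown to match. A minimal sketch of that idiom factored into a hypothetical helper (the code itself keeps it inline):

static void
union_track_size(struct vnode *unionvp, struct vnode *iovp,
    struct union_node *un, off_t cur)
{
	if (iovp == un->un_uppervp) {
		if (cur > un->un_uppersz) {
			union_lock();
			union_newsize(unionvp, cur, VNOVAL);	/* grow cached upper size */
			union_unlock();
		}
	} else {
		if (cur > un->un_lowersz) {
			union_lock();
			union_newsize(unionvp, VNOVAL, cur);	/* grow cached lower size */
			union_unlock();
		}
	}
}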
-static int -union_fsync(struct vnop_fsync_args *ap) -/* - struct vnop_fsync_args { - struct vnode *a_vp; - int a_waitfor; - vfs_context_t a_context; - } *ap; -*/ -{ - int error = 0; - struct vnode *targetvp = OTHERVP(ap->a_vp); - - if (targetvp != NULLVP) { - - error = VNOP_FSYNC(targetvp, ap->a_waitfor, ap->a_context); - } - - return (error); -} - -static int -union_remove(struct vnop_remove_args *ap) -/* - struct vnop_remove_args { - struct vnode *a_dvp; - struct vnode *a_vp; - struct componentname *a_cnp; - vfs_context_t a_context; - } *ap; -*/ -{ - int error, flags; - struct union_node *dun = VTOUNION(ap->a_dvp); - struct union_node *un = VTOUNION(ap->a_vp); - struct componentname *cnp = ap->a_cnp; - int busydel = 0; - - if (dun->un_uppervp == NULLVP) - panic("union remove: null upper vnode"); - - if (UNNODE_FAULTIN(dun) && ((ap->a_vp != NULLVP) && - ((vnode_isreg(ap->a_vp) != 0) || (vnode_islnk(ap->a_vp) != 0)))) { - return(VNOP_REMOVE(dun->un_uppervp, ap->a_vp, ap->a_cnp, ap->a_flags, ap->a_context)); - } - - if (un->un_uppervp != NULLVP) { - struct vnode *dvp = dun->un_uppervp; - struct vnode *vp = un->un_uppervp; - - flags = ap->a_flags; - if (vnode_isinuse(ap->a_vp, 0)) - busydel = 1; - if ((flags & VNODE_REMOVE_NODELETEBUSY) && (busydel != 0)) { - return(EBUSY); - } - if (union_dowhiteout(un, cnp->cn_context)) - cnp->cn_flags |= DOWHITEOUT; - - if (busydel != 0) { - union_lock(); - un->un_flags |= UN_DELETED; - if (un->un_flags & UN_CACHED) { - un->un_flags &= ~UN_CACHED; - LIST_REMOVE(un, un_cache); - } - union_unlock(); - vnode_ref(vp); - } - error = VNOP_REMOVE(dvp, vp, cnp, 0, ap->a_context); - if (!error) { - union_lock(); - if (busydel == 0) - union_removed_upper(un); - union_unlock(); - } - } else { - if (UNNODE_FAULTIN(un)) - panic("faultfs: No uppervp"); - error = union_mkwhiteout( - MOUNTTOUNIONMOUNT(UNIONTOV(dun)->v_mount), - dun->un_uppervp, ap->a_cnp, un->un_path); - } - - return (error); -} - -static int -union_link(struct vnop_link_args *ap) -/* - struct vnop_link_args { - struct vnode *a_vp; - struct vnode *a_tdvp; - struct componentname *a_cnp; - vfs_context_t a_context; - } *ap; -*/ -{ - int error = 0; - struct componentname *cnp = ap->a_cnp; - struct union_node *un; - struct vnode *vp; - struct vnode *tdvp; - - un = VTOUNION(ap->a_tdvp); - - if (ap->a_tdvp->v_op != ap->a_vp->v_op) { - vp = ap->a_vp; - } else { - struct union_node *tun = VTOUNION(ap->a_vp); - if (tun->un_uppervp == NULLVP) { - if (UNNODE_FAULTIN(tun)) - panic("faultfs: No uppervp"); - if (un->un_uppervp == tun->un_dirvp) { - } - union_lock(); - /* Would need to drain for above,below mount and faulin does not enter this path */ - un->un_flags |= UN_LOCKED; - error = union_copyup(tun, 1, ap->a_context); - un->un_flags &= ~UN_LOCKED; - if ((un->un_flags & UN_WANT) == UN_WANT) { - un->un_flags &= ~UN_WANT; - wakeup(&un->un_flags); - } - union_unlock(); - } - vp = tun->un_uppervp; - } - tdvp = un->un_uppervp; - if (tdvp == NULLVP) - error = EROFS; - - if (error) { - return (error); - } - - - error = (VNOP_LINK(vp, tdvp, cnp, ap->a_context)); - return(error); -} - -static int -union_rename(struct vnop_rename_args *ap) -/* - struct vnop_rename_args { - struct vnode *a_fdvp; - struct vnode *a_fvp; - struct componentname *a_fcnp; - struct vnode *a_tdvp; - struct vnode *a_tvp; - struct componentname *a_tcnp; - vfs_context_t a_context; - } *ap; -*/ -{ - int error; - - struct vnode *fdvp = ap->a_fdvp; - struct vnode *fvp = ap->a_fvp; - struct vnode *tdvp = ap->a_tdvp; - struct vnode *tvp = ap->a_tvp; - - - 
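/*
 * Editor's note (not part of the patch): the stanza below maps each of the
 * four vnodes involved in the rename to its upper-layer vnode.  A missing
 * upper vnode means the writable shadow copy was never created, so the
 * rename cannot be carried out in the upper layer and EXDEV is returned;
 * a source that also exists in the lower layer gets a whiteout
 * (DOWHITEOUT) so the old name does not reappear from below.
 */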
if (fdvp->v_op == union_vnodeop_p) { /* always true */ - struct union_node *un = VTOUNION(fdvp); - if (un->un_uppervp == NULLVP) { - if (UNNODE_FAULTIN(un)) - panic("faultfs rename: No uppervp"); - /* - * this should never happen in normal - * operation but might if there was - * a problem creating the top-level shadow - * directory. - */ - error = EXDEV; - goto bad; - } - - fdvp = un->un_uppervp; - } - - if (fvp->v_op == union_vnodeop_p) { /* always true */ - struct union_node *un = VTOUNION(fvp); - if (un->un_uppervp == NULLVP) { - if (UNNODE_FAULTIN(un)) - panic("faultfs rename: No uppervp"); - /* XXX: should do a copyup */ - error = EXDEV; - goto bad; - } - - if (un->un_lowervp != NULLVP) - ap->a_fcnp->cn_flags |= DOWHITEOUT; - - fvp = un->un_uppervp; - } - - if (tdvp->v_op == union_vnodeop_p) { - struct union_node *un = VTOUNION(tdvp); - if (un->un_uppervp == NULLVP) { - /* - * this should never happen in normal - * operation but might if there was - * a problem creating the top-level shadow - * directory. - */ - if (UNNODE_FAULTIN(un)) - panic("faultfs rename: No uppervp"); - error = EXDEV; - goto bad; - } - - tdvp = un->un_uppervp; - } - - if (tvp != NULLVP && tvp->v_op == union_vnodeop_p) { - struct union_node *un = VTOUNION(tvp); - - tvp = un->un_uppervp; - } - - return (VNOP_RENAME(fdvp, fvp, ap->a_fcnp, tdvp, tvp, ap->a_tcnp, ap->a_context)); - -bad: - return (error); -} - -static int -union_mkdir(struct vnop_mkdir_args *ap) -/* - struct vnop_mkdir_args { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - struct vnode_attr *a_vap; - vfs_context_t a_context; - } *ap; -*/ -{ - struct union_node *un = VTOUNION(ap->a_dvp); - struct vnode *dvp = un->un_uppervp; - struct componentname *cnp = ap->a_cnp; - - if (dvp != NULLVP) { - int error; - struct vnode *vp; - - - /* note that this is a direct fallthrough to the filesystem */ - error = VNOP_MKDIR(dvp, &vp, cnp, ap->a_vap, ap->a_context); - if (error) - return (error); - - union_lock(); - error = union_allocvp(ap->a_vpp, ap->a_dvp->v_mount, ap->a_dvp, - NULLVP, cnp, vp, NULLVP, 1); - union_unlock(); - if (error) - vnode_put(vp); - return (error); - } - return (EROFS); -} - -static int -union_rmdir(struct vnop_rmdir_args *ap) -/* - struct vnop_rmdir_args { - struct vnode *a_dvp; - struct vnode *a_vp; - struct componentname *a_cnp; - vfs_context_t a_context; - } *ap; -*/ -{ - int error; - struct union_node *dun = VTOUNION(ap->a_dvp); - struct union_node *un = VTOUNION(ap->a_vp); - struct componentname *cnp = ap->a_cnp; - int busydel = 0; - - /******* NODE HAS TO BE LOCKED ******/ - if (dun->un_uppervp == NULLVP) - panic("union rmdir: null upper vnode"); - - if (un->un_uppervp != NULLVP) { - struct vnode *dvp = dun->un_uppervp; - struct vnode *vp = un->un_uppervp; - - if (vnode_isinuse(ap->a_vp, 0)) { - busydel = 1; - union_lock(); - un->un_flags |= UN_DELETED; - if (un->un_flags & UN_CACHED) { - un->un_flags &= ~UN_CACHED; - LIST_REMOVE(un, un_cache); - } - union_unlock(); - vnode_ref(vp); - } - - - if (union_dowhiteout(un, cnp->cn_context)) - cnp->cn_flags |= DOWHITEOUT; - error = VNOP_RMDIR(dvp, vp, ap->a_cnp, ap->a_context); - if (!error) { - union_lock(); - if (busydel == 0) - union_removed_upper(un); - union_unlock(); - } - } else { - if (UNNODE_FAULTIN(un)) - panic("faultfs: No uppervp"); - error = union_mkwhiteout( - MOUNTTOUNIONMOUNT(UNIONTOV(dun)->v_mount), - dun->un_uppervp, ap->a_cnp, un->un_path); - } - return (error); -} - -static int -union_symlink(struct vnop_symlink_args *ap) -/* - struct 
vnop_symlink_args { - struct vnode *a_dvp; - struct vnode **a_vpp; - struct componentname *a_cnp; - struct vnode_attr *a_vap; - char *a_target; - vfs_context_t a_context; - } *ap; -*/ -{ - struct union_node *un = VTOUNION(ap->a_dvp); - struct vnode *dvp = un->un_uppervp; - struct componentname *cnp = ap->a_cnp; - - if (dvp != NULLVP) { - int error; - struct vnode *vp; - - error = VNOP_SYMLINK(dvp, &vp, cnp, ap->a_vap, ap->a_target, ap->a_context); - *ap->a_vpp = vp; - return (error); - } - return (EROFS); -} - -/* - * union_readdir works in concert with getdirentries and - * readdir(3) to provide a list of entries in the unioned - * directories. getdirentries is responsible for walking - * down the union stack. readdir(3) is responsible for - * eliminating duplicate names from the returned data stream. - */ -static int -union_readdir(struct vnop_readdir_args *ap) -/* - struct vnop_readdir_args { - struct vnodeop_desc *a_desc; - struct vnode *a_vp; - struct uio *a_uio; - int a_flags; - int *a_eofflag; - int *a_numdirent; - vfs_context_t a_context; - } *ap; -*/ -{ - struct union_node *un = VTOUNION(ap->a_vp); - struct vnode *uvp = un->un_uppervp; - - if (ap->a_flags & (VNODE_READDIR_EXTENDED | VNODE_READDIR_REQSEEKOFF)) - return (EINVAL); - - if (uvp == NULLVP) - return (0); - - ap->a_vp = uvp; - return (VCALL(uvp, VOFFSET(vnop_readdir), ap)); -} - -static int -union_readlink(struct vnop_readlink_args *ap) -/* - struct vnop_readlink_args { - struct vnode *a_vp; - struct uio *a_uio; - vfs_context_t a_context; - } *ap; -*/ -{ - int error; - struct vnode *vp = OTHERVP(ap->a_vp); - - ap->a_vp = vp; - error = VCALL(vp, VOFFSET(vnop_readlink), ap); - - return (error); -} - -static int -union_inactive(struct vnop_inactive_args *ap) -/* - struct vnop_inactive_args { - struct vnode *a_vp; - vfs_context_t a_context; - } *ap; -*/ -{ - struct vnode *vp = ap->a_vp; - struct union_node *un = VTOUNION(vp); - - /* - * Do nothing (and _don't_ bypass). - * Wait to vnode_put lowervp until reclaim, - * so that until then our union_node is in the - * cache and reusable. - * - * NEEDSWORK: Someday, consider inactive'ing - * the lowervp and then trying to reactivate it - * with capabilities (v_id) - * like they do in the name lookup cache code. - * That's too much work for now. 
- */ - - union_lock(); - if (un->un_flags & UN_DELETED) { - if(un->un_uppervp != NULLVP) { - vnode_rele(un->un_uppervp); - } - union_removed_upper(un); - } - - if (un->un_dircache != 0) { - union_dircache_free(un); - } - if (un->un_flags & UN_DIRENVN) { - vnode_recycle(vp); - } - - union_unlock(); - - return (0); -} - -static int -union_reclaim(struct vnop_reclaim_args *ap) -/* - struct vnop_reclaim_args { - struct vnode *a_vp; - vfs_context_t a_context; - } *ap; -*/ -{ - - union_lock(); - union_freevp(ap->a_vp); - union_unlock(); - - return (0); -} - -static int -union_blockmap(struct vnop_blockmap_args *ap) -/* - struct vnop_blockmap_args { - struct vnode *a_vp; - off_t a_offset; - size_t a_size; - daddr64_t *a_bpn; - size_t *a_run; - void *a_poff; - int a_flags; - } *ap; -*/ -{ - int error; - struct vnode *vp = OTHERVP(ap->a_vp); - - ap->a_vp = vp; - error = VCALL(vp, VOFFSET(vnop_blockmap), ap); - - return (error); -} - -static int -union_pathconf(struct vnop_pathconf_args *ap) -/* - struct vnop_pathconf_args { - struct vnode *a_vp; - int a_name; - int *a_retval; - vfs_context_t a_context; - } *ap; -*/ -{ - int error; - struct vnode *vp = OTHERVP(ap->a_vp); - - ap->a_vp = vp; - error = VCALL(vp, VOFFSET(vnop_pathconf), ap); - - return (error); -} - -static int -union_advlock(struct vnop_advlock_args *ap) -/* - struct vnop_advlock_args { - struct vnode *a_vp; - caddr_t a_id; - int a_op; - struct flock *a_fl; - int a_flags; - vfs_context_t a_context; - } *ap; -*/ -{ - register struct vnode *ovp = OTHERVP(ap->a_vp); - - ap->a_vp = ovp; - return (VCALL(ovp, VOFFSET(vnop_advlock), ap)); -} - - -/* - * XXX - vnop_strategy must be hand coded because it has no - * vnode in its arguments. - * This goes away with a merged VM/buffer cache. - */ -static int -union_strategy(struct vnop_strategy_args *ap) -/* - struct vnop_strategy_args { - struct buf *a_bp; - } *ap; -*/ -{ - struct buf *bp = ap->a_bp; - int error; - struct vnode *savedvp; - - savedvp = buf_vnode(bp); - buf_setvnode(bp, OTHERVP(savedvp)); - -#if DIAGNOSTIC - if (buf_vnode(bp) == NULLVP) - panic("union_strategy: nil vp"); - if (((buf_flags(bp) & B_READ) == 0) && - (buf_vnode(bp) == LOWERVP(savedvp))) - panic("union_strategy: writing to lowervp"); -#endif - - error = VNOP_STRATEGY(bp); - buf_setvnode(bp, savedvp); - - return (error); -} - -/* Pagein */ -static int -union_pagein(struct vnop_pagein_args *ap) -/* - struct vnop_pagein_args { - struct vnode *a_vp, - upl_t a_pl, - upl_offset_t a_pl_offset, - off_t a_f_offset, - size_t a_size, - int a_flags - vfs_context_t a_context; - } *ap; -*/ -{ - int error; - struct vnode *vp = OTHERVP(ap->a_vp); - - error = VNOP_PAGEIN(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, - ap->a_size, ap->a_flags, ap->a_context); - - /* - * XXX - * perhaps the size of the underlying object has changed under - * our feet. take advantage of the offset information present - * in the uio structure. 
- */ - if (error == 0) { - struct union_node *un = VTOUNION(ap->a_vp); - off_t cur = ap->a_f_offset + (off_t)ap->a_pl_offset; - - if (vp == un->un_uppervp) { - if (cur > un->un_uppersz) { - union_lock(); - union_newsize(ap->a_vp, cur, VNOVAL); - union_unlock(); - } - } else { - if (cur > un->un_lowersz) { - union_lock(); - union_newsize(ap->a_vp, VNOVAL, cur); - union_unlock(); - } - } - } - - return (error); -} - -/* Pageout */ -static int -union_pageout(struct vnop_pageout_args *ap) -/* - struct vnop_pageout_args { - struct vnode *a_vp, - upl_t a_pl, - vm_offset_t a_pl_offset, - off_t a_f_offset, - size_t a_size, - int a_flags - vfs_context_t a_context; - } *ap; -*/ -{ - int error; - struct vnode *vp; - struct union_node *un = VTOUNION(ap->a_vp); - - vp = UPPERVP(ap->a_vp); - if (vp == NULLVP) - panic("union: missing upper layer in pageout"); - - error = VNOP_PAGEOUT(vp, ap->a_pl, ap->a_pl_offset, ap->a_f_offset, - ap->a_size, ap->a_flags, ap->a_context); - - /* - * the size of the underlying object may be changed by the - * write. - */ - if (error == 0) { - off_t cur = ap->a_f_offset + (off_t)ap->a_pl_offset; - - if (cur > un->un_uppersz) { - union_lock(); - union_newsize(ap->a_vp, cur, VNOVAL); - union_unlock(); - } - } - - return (error); -} - -/* Blktooff derives file offset for the given logical block number */ -static int -union_blktooff(struct vnop_blktooff_args *ap) -/* - struct vnop_blktooff_args { - struct vnode *a_vp; - daddr64_t a_lblkno; - off_t *a_offset; - } *ap; -*/ -{ - int error; - struct vnode *vp = OTHERVP(ap->a_vp); - - error = VNOP_BLKTOOFF(vp, ap->a_lblkno, ap->a_offset); - - return(error); -} - -/* offtoblk derives file offset for the given logical block number */ -static int -union_offtoblk(struct vnop_offtoblk_args *ap) -/* - struct vnop_offtoblk_args { - struct vnode *a_vp; - off_t a_offset; - daddr64_t *a_lblkno; - } *ap; -*/ -{ - int error; - struct vnode *vp = OTHERVP(ap->a_vp); - - error = VNOP_OFFTOBLK(vp, ap->a_offset, ap->a_lblkno); - - return(error); -} - -#define VOPFUNC int (*)(void *) - -/* - * Global vfs data structures - */ -int (**union_vnodeop_p)(void *); -struct vnodeopv_entry_desc union_vnodeop_entries[] = { - { &vnop_default_desc, (VOPFUNC)vn_default_error }, - { &vnop_lookup_desc, (VOPFUNC)union_lookup }, /* lookup */ - { &vnop_create_desc, (VOPFUNC)union_create }, /* create */ - { &vnop_whiteout_desc, (VOPFUNC)union_whiteout }, /* whiteout */ - { &vnop_mknod_desc, (VOPFUNC)union_mknod }, /* mknod */ - { &vnop_open_desc, (VOPFUNC)union_open }, /* open */ - { &vnop_close_desc, (VOPFUNC)union_close }, /* close */ - { &vnop_access_desc, (VOPFUNC)union_access }, /* access */ - { &vnop_getattr_desc, (VOPFUNC)union_getattr }, /* getattr */ - { &vnop_setattr_desc, (VOPFUNC)union_setattr }, /* setattr */ - { &vnop_read_desc, (VOPFUNC)union_read }, /* read */ - { &vnop_write_desc, (VOPFUNC)union_write }, /* write */ - { &vnop_ioctl_desc, (VOPFUNC)union_ioctl }, /* ioctl */ - { &vnop_select_desc, (VOPFUNC)union_select }, /* select */ - { &vnop_revoke_desc, (VOPFUNC)union_revoke }, /* revoke */ - { &vnop_mmap_desc, (VOPFUNC)union_mmap }, /* mmap */ - { &vnop_mnomap_desc, (VOPFUNC)union_mnomap }, /* mnomap */ - { &vnop_fsync_desc, (VOPFUNC)union_fsync }, /* fsync */ - { &vnop_remove_desc, (VOPFUNC)union_remove }, /* remove */ - { &vnop_link_desc, (VOPFUNC)union_link }, /* link */ - { &vnop_rename_desc, (VOPFUNC)union_rename }, /* rename */ - { &vnop_mkdir_desc, (VOPFUNC)union_mkdir }, /* mkdir */ - { &vnop_rmdir_desc, (VOPFUNC)union_rmdir }, /* rmdir */ 
- { &vnop_symlink_desc, (VOPFUNC)union_symlink }, /* symlink */ - { &vnop_readdir_desc, (VOPFUNC)union_readdir }, /* readdir */ - { &vnop_readlink_desc, (VOPFUNC)union_readlink }, /* readlink */ - { &vnop_inactive_desc, (VOPFUNC)union_inactive }, /* inactive */ - { &vnop_reclaim_desc, (VOPFUNC)union_reclaim }, /* reclaim */ - { &vnop_strategy_desc, (VOPFUNC)union_strategy }, /* strategy */ - { &vnop_pathconf_desc, (VOPFUNC)union_pathconf }, /* pathconf */ - { &vnop_advlock_desc, (VOPFUNC)union_advlock }, /* advlock */ -#ifdef notdef - { &vnop_bwrite_desc, (VOPFUNC)union_bwrite }, /* bwrite */ -#endif - { &vnop_pagein_desc, (VOPFUNC)union_pagein }, /* Pagein */ - { &vnop_pageout_desc, (VOPFUNC)union_pageout }, /* Pageout */ - { &vnop_copyfile_desc, (VOPFUNC)err_copyfile }, /* Copyfile */ - { &vnop_blktooff_desc, (VOPFUNC)union_blktooff }, /* blktooff */ - { &vnop_offtoblk_desc, (VOPFUNC)union_offtoblk }, /* offtoblk */ - { &vnop_blockmap_desc, (VOPFUNC)union_blockmap }, /* blockmap */ - { (struct vnodeop_desc*)NULL, (int(*)())NULL } -}; -struct vnodeopv_desc union_vnodeop_opv_desc = - { &union_vnodeop_p, union_vnodeop_entries }; diff --git a/bsd/net/Makefile b/bsd/net/Makefile index 2f77f154b..79c622bf8 100644 --- a/bsd/net/Makefile +++ b/bsd/net/Makefile @@ -9,14 +9,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ DATAFILES= \ @@ -32,14 +28,15 @@ KERNELFILES= \ if_ether.h init.h radix.h PRIVATE_DATAFILES = \ - if_atm.h if_vlan_var.h if_ppp.h firewire.h \ + if_vlan_var.h if_ppp.h firewire.h \ ppp_defs.h radix.h if_bond_var.h lacp.h ndrv_var.h \ - raw_cb.h etherdefs.h iso88025.h if_pflog.h pfvar.h \ - if_bridgevar.h + netsrc.h raw_cb.h etherdefs.h iso88025.h if_pflog.h pfvar.h \ + if_bridgevar.h ntstat.h if_llreach.h PRIVATE_KERNELFILES = ${KERNELFILES} \ bpfdesc.h dlil_pvt.h ppp_comp.h \ - zlib.h bpf_compat.h net_osdep.h + zlib.h bpf_compat.h net_osdep.h \ + ntstat.h if_llreach.h INSTALL_MI_LIST = ${DATAFILES} diff --git a/bsd/net/bpf.c b/bsd/net/bpf.c index d9ec5b137..e370dfc5e 100644 --- a/bsd/net/bpf.c +++ b/bsd/net/bpf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -119,6 +119,7 @@ #include <net/dlil.h> #include <kern/locks.h> +#include <kern/thread_call.h> #if CONFIG_MACF_NET #include <security/mac_framework.h> @@ -147,13 +148,13 @@ static caddr_t bpf_alloc(); * The default read buffer size is patchable. 
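Editor's note: the three tunables defined just below (bpf_bufsize, bpf_maxbufsize, bpf_maxdevices) are published as ordinary sysctls under the debug node. A hypothetical user-space probe, assuming only sysctlbyname(3); not part of the patch:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	unsigned int v;
	size_t len = sizeof(v);

	/* debug.bpf_maxbufsize is the same cap bpf_validate() consults */
	if (sysctlbyname("debug.bpf_maxbufsize", &v, &len, NULL, 0) == 0)
		printf("debug.bpf_maxbufsize = %u\n", v);
	return (0);
}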
*/ static unsigned int bpf_bufsize = BPF_BUFSIZE; -SYSCTL_INT(_debug, OID_AUTO, bpf_bufsize, CTLFLAG_RW, +SYSCTL_INT(_debug, OID_AUTO, bpf_bufsize, CTLFLAG_RW | CTLFLAG_LOCKED, &bpf_bufsize, 0, ""); -static unsigned int bpf_maxbufsize = BPF_MAXBUFSIZE; -SYSCTL_INT(_debug, OID_AUTO, bpf_maxbufsize, CTLFLAG_RW, +__private_extern__ unsigned int bpf_maxbufsize = BPF_MAXBUFSIZE; +SYSCTL_INT(_debug, OID_AUTO, bpf_maxbufsize, CTLFLAG_RW | CTLFLAG_LOCKED, &bpf_maxbufsize, 0, ""); static unsigned int bpf_maxdevices = 256; -SYSCTL_UINT(_debug, OID_AUTO, bpf_maxdevices, CTLFLAG_RW, +SYSCTL_UINT(_debug, OID_AUTO, bpf_maxdevices, CTLFLAG_RW | CTLFLAG_LOCKED, &bpf_maxdevices, 0, ""); /* @@ -196,6 +197,7 @@ static void bpf_mcopy(const void *, void *, size_t); static int bpf_movein(struct uio *, int, struct mbuf **, struct sockaddr *, int *); static int bpf_setif(struct bpf_d *, ifnet_t ifp, u_int32_t dlt); +static void bpf_timed_out(void *, void *); static void bpf_wakeup(struct bpf_d *); static void catchpacket(struct bpf_d *, u_char *, u_int, u_int, void (*)(const void *, void *, size_t)); @@ -216,26 +218,26 @@ static int bpf_tap_callback(struct ifnet *ifp, struct mbuf *m); * Darwin differs from BSD here, the following are static * on BSD and not static on Darwin. */ - d_open_t bpfopen; - d_close_t bpfclose; - d_read_t bpfread; - d_write_t bpfwrite; - ioctl_fcn_t bpfioctl; - select_fcn_t bpfpoll; + d_open_t bpfopen; + d_close_t bpfclose; + d_read_t bpfread; + d_write_t bpfwrite; + ioctl_fcn_t bpfioctl; + select_fcn_t bpfselect; /* Darwin's cdevsw struct differs slightly from BSDs */ #define CDEV_MAJOR 23 static struct cdevsw bpf_cdevsw = { - /* open */ bpfopen, - /* close */ bpfclose, - /* read */ bpfread, - /* write */ bpfwrite, - /* ioctl */ bpfioctl, + /* open */ bpfopen, + /* close */ bpfclose, + /* read */ bpfread, + /* write */ bpfwrite, + /* ioctl */ bpfioctl, /* stop */ eno_stop, /* reset */ eno_reset, /* tty */ NULL, - /* select */ bpfpoll, + /* select */ bpfselect, /* mmap */ eno_mmap, /* strategy*/ eno_strat, /* getc */ eno_getc, @@ -314,6 +316,11 @@ bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, struct sockaddr *soc sa_family = AF_IEEE80211; hlen = 0; break; + + case DLT_IEEE802_11_RADIO: + sa_family = AF_IEEE80211; + hlen = 0; + break; default: return (EIO); @@ -367,6 +374,7 @@ bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, struct sockaddr *soc m->m_pkthdr.len = m->m_len = len; m->m_pkthdr.rcvif = NULL; *mp = m; + /* * Make room for link header. */ @@ -383,8 +391,25 @@ bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, struct sockaddr *soc goto bad; } error = UIOMOVE(mtod(m, caddr_t), len - hlen, UIO_WRITE, uio); - if (!error) - return (0); + if (error) + goto bad; + + /* Check for multicast destination */ + switch (linktype) { + case DLT_EN10MB: { + struct ether_header *eh = mtod(m, struct ether_header *); + + if (ETHER_IS_MULTICAST(eh->ether_dhost)) { + if (_ether_cmp(etherbroadcastaddr, eh->ether_dhost) == 0) + m->m_flags |= M_BCAST; + else + m->m_flags |= M_MCAST; + } + break; + } + } + + return 0; bad: m_freem(m); return (error); @@ -550,6 +575,59 @@ bpf_detachd(struct bpf_d *d) } +/* + * Start asynchronous timer, if necessary. + * Must be called with bpf_mlock held. 
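Editor's note: bd_rtout is stored in scheduler ticks, so bpf_start_timer() below first converts it back to microseconds (hz ticks per second, `tick' microseconds per tick) and only then to an absolute thread-call deadline. A sketch of just that arithmetic, mirroring the patch's computation; the helper name is hypothetical:

static uint64_t
bpf_rtout_to_deadline(u_int32_t rtout)
{
	uint64_t deadline;
	uint64_t usecs;

	/* ticks -> microseconds -> absolute deadline, as the patch does inline */
	usecs = (uint64_t)(rtout / hz) * USEC_PER_SEC + (rtout % hz) * tick;
	clock_interval_to_deadline(usecs, NSEC_PER_USEC, &deadline);
	return (deadline);
}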
+ */ +static void +bpf_start_timer(struct bpf_d *d) +{ + uint64_t deadline; + struct timeval tv; + + if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) { + tv.tv_sec = d->bd_rtout / hz; + tv.tv_usec = (d->bd_rtout % hz) * tick; + + clock_interval_to_deadline((uint64_t)tv.tv_sec * USEC_PER_SEC + tv.tv_usec, + NSEC_PER_USEC, + &deadline); + /* + * The state is BPF_IDLE, so the timer hasn't + * been started yet, and hasn't gone off yet; + * there is no thread call scheduled, so this + * won't change the schedule. + * + * XXX - what if, by the time it gets entered, + * the deadline has already passed? + */ + thread_call_enter_delayed(d->bd_thread_call, deadline); + d->bd_state = BPF_WAITING; + } +} + +/* + * Cancel asynchronous timer. + * Must be called with bpf_mlock held. + */ +static boolean_t +bpf_stop_timer(struct bpf_d *d) +{ + /* + * If the timer has already gone off, this does nothing. + * Our caller is expected to set d->bd_state to BPF_IDLE, + * with the bpf_mlock, after we are called. bpf_timed_out() + * also grabs bpf_mlock, so, if the timer has gone off and + * bpf_timed_out() hasn't finished, it's waiting for the + * lock; when this thread releases the lock, it will + * find the state is BPF_IDLE, and just release the + * lock and return. + */ + return (thread_call_cancel(d->bd_thread_call)); +} + + + /* * Open ethernet device. Returns ENXIO for illegal minor device number, * EBUSY if file is open by another process. @@ -612,6 +690,16 @@ bpfopen(dev_t dev, int flags, __unused int fmt, d->bd_sig = SIGIO; d->bd_seesent = 1; d->bd_oflags = flags; + d->bd_state = BPF_IDLE; + d->bd_thread_call = thread_call_allocate(bpf_timed_out, d); + + if (d->bd_thread_call == NULL) { + printf("bpfopen: malloc thread call failed\n"); + bpf_dtab[minor(dev)] = NULL; + lck_mtx_unlock(bpf_mlock); + _FREE(d, M_DEVBUF); + return ENOMEM; + } #if CONFIG_MACF_NET mac_bpfdesc_label_init(d); mac_bpfdesc_label_associate(kauth_cred_get(), d); @@ -643,12 +731,67 @@ bpfclose(dev_t dev, __unused int flags, __unused int fmt, } bpf_dtab[minor(dev)] = (void *)1; /* Mark closing */ + /* + * Deal with any in-progress timeouts. + */ + switch (d->bd_state) { + case BPF_IDLE: + /* + * Not waiting for a timeout, and no timeout happened. + */ + break; + + case BPF_WAITING: + /* + * Waiting for a timeout. + * Cancel any timer that has yet to go off, + * and mark the state as "closing". + * Then drop the lock to allow any timers that + * *have* gone off to run to completion, and wait + * for them to finish. + */ + if (!bpf_stop_timer(d)) { + /* + * There was no pending call, so the call must + * have been in progress. Wait for the call to + * complete; we have to drop the lock while + * waiting, to let the in-progress call complete. + */ + d->bd_state = BPF_DRAINING; + while (d->bd_state == BPF_DRAINING) + msleep((caddr_t)d, bpf_mlock, PRINET, + "bpfdraining", NULL); + } + d->bd_state = BPF_IDLE; + break; + + case BPF_TIMED_OUT: + /* + * Timer went off, and the timeout routine finished. + */ + d->bd_state = BPF_IDLE; + break; + + case BPF_DRAINING: + /* + * Another thread is blocked on a close waiting for + * a timeout to finish. + * This "shouldn't happen", as the first thread to enter + * bpfclose() will set bpf_dtab[minor(dev)] to 1, and + * all subsequent threads should see that and fail with + * ENXIO.
+ */ + panic("Two threads blocked in a BPF close"); + break; + } + if (d->bd_bif) bpf_detachd(d); selthreadclear(&d->bd_sel); #if CONFIG_MACF_NET mac_bpfdesc_label_destroy(d); #endif + thread_call_free(d->bd_thread_call); bpf_freed(d); /* Mark free in same context as bpfopen comes to check */ @@ -666,15 +809,12 @@ bpfclose(dev_t dev, __unused int flags, __unused int fmt, static int bpf_sleep(struct bpf_d *d, int pri, const char *wmesg, int timo) { - int st; + u_int64_t abstime = 0; - lck_mtx_unlock(bpf_mlock); - - st = tsleep((caddr_t)d, pri, wmesg, timo); - - lck_mtx_lock(bpf_mlock); + if(timo) + clock_interval_to_deadline(timo, NSEC_PER_SEC / hz, &abstime); - return st; + return msleep1((caddr_t)d, bpf_mlock, pri, wmesg, abstime); } /* @@ -695,6 +835,7 @@ int bpfread(dev_t dev, struct uio *uio, int ioflag) { struct bpf_d *d; + int timed_out; int error; lck_mtx_lock(bpf_mlock); @@ -705,7 +846,6 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) return (ENXIO); } - /* * Restrict application to use a buffer the same size as * as kernel buffers. @@ -714,6 +854,12 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) lck_mtx_unlock(bpf_mlock); return (EINVAL); } + + if (d->bd_state == BPF_WAITING) + bpf_stop_timer(d); + + timed_out = (d->bd_state == BPF_TIMED_OUT); + d->bd_state = BPF_IDLE; /* * If the hold buffer is empty, then do a timed sleep, which @@ -721,9 +867,14 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) * have arrived to fill the store buffer. */ while (d->bd_hbuf == 0) { - if (d->bd_immediate && d->bd_slen != 0) { + if ((d->bd_immediate || timed_out || (ioflag & IO_NDELAY)) + && d->bd_slen != 0) { /* - * A packet(s) either arrived since the previous + * We're in immediate mode, or are reading + * in non-blocking mode, or a timer was + * started before the read (e.g., by select() + * or poll()) and has expired and a packet(s) + * either arrived since the previous * read or arrived while we were asleep. * Rotate the buffers and return what's here. */ @@ -806,6 +957,10 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) static void bpf_wakeup(struct bpf_d *d) { + if (d->bd_state == BPF_WAITING) { + bpf_stop_timer(d); + d->bd_state = BPF_IDLE; + } wakeup((caddr_t)d); if (d->bd_async && d->bd_sig && d->bd_sigio) pgsigio(d->bd_sigio, d->bd_sig); @@ -826,6 +981,36 @@ bpf_wakeup(struct bpf_d *d) #endif } + +static void +bpf_timed_out(void *arg, __unused void *dummy) +{ + struct bpf_d *d = (struct bpf_d *)arg; + + lck_mtx_lock(bpf_mlock); + if (d->bd_state == BPF_WAITING) { + /* + * There's a select or kqueue waiting for this; if there's + * now stuff to read, wake it up. + */ + d->bd_state = BPF_TIMED_OUT; + if (d->bd_slen != 0) + bpf_wakeup(d); + } else if (d->bd_state == BPF_DRAINING) { + /* + * A close is waiting for this to finish. + * Mark it as finished, and wake the close up. 
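/*
 * Editor's note (summary, not part of the patch): bd_state forms a small
 * state machine across bpf_start_timer(), bpf_stop_timer(), bpf_timed_out()
 * and bpfclose():
 *
 *   BPF_IDLE     -> BPF_WAITING    select()/kqueue armed the timer
 *   BPF_WAITING  -> BPF_TIMED_OUT  the timer fired with a reader waiting
 *   BPF_WAITING  -> BPF_IDLE       read()/ioctl()/wakeup cancelled it
 *   BPF_WAITING  -> BPF_DRAINING   close() caught the callout mid-flight
 *   BPF_DRAINING -> BPF_IDLE       bpf_timed_out() wakes the closing thread
 */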
+ */ + d->bd_state = BPF_IDLE; + bpf_wakeup(d); + } + lck_mtx_unlock(bpf_mlock); +} + + + + + /* keep in sync with bpf_movein above: */ #define MAX_DATALINK_HDR_LEN (sizeof(struct firewire_header)) @@ -838,6 +1023,8 @@ bpfwrite(dev_t dev, struct uio *uio, __unused int ioflag) int error; char dst_buf[SOCKADDR_HDR_LEN + MAX_DATALINK_HDR_LEN]; int datlen = 0; + int bif_dlt; + int bd_hdrcmplt; lck_mtx_lock(bpf_mlock); @@ -853,31 +1040,56 @@ bpfwrite(dev_t dev, struct uio *uio, __unused int ioflag) ifp = d->bd_bif->bif_ifp; + if ((ifp->if_flags & IFF_UP) == 0) { + lck_mtx_unlock(bpf_mlock); + return (ENETDOWN); + } if (uio_resid(uio) == 0) { lck_mtx_unlock(bpf_mlock); return (0); } ((struct sockaddr *)dst_buf)->sa_len = sizeof(dst_buf); - error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, &m, - d->bd_hdrcmplt ? NULL : (struct sockaddr *)dst_buf, - &datlen); - if (error) { - lck_mtx_unlock(bpf_mlock); + + /* + * fix for PR-6849527 + * getting variables onto the stack before dropping the lock for bpf_movein() + */ + bif_dlt = (int)d->bd_bif->bif_dlt; + bd_hdrcmplt = d->bd_hdrcmplt; + + /* bpf_movein may allocate mbufs; drop the lock */ + lck_mtx_unlock(bpf_mlock); + + error = bpf_movein(uio, bif_dlt, &m, + bd_hdrcmplt ? NULL : (struct sockaddr *)dst_buf, + &datlen); + + if (error) { return (error); } - if ((unsigned)datlen > ifp->if_mtu) { + /* taking the lock again and verifying whether device is open */ + lck_mtx_lock(bpf_mlock); + d = bpf_dtab[minor(dev)]; + if (d == 0 || d == (void *)1) { lck_mtx_unlock(bpf_mlock); m_freem(m); - return (EMSGSIZE); + return (ENXIO); } - - if ((error = ifp_use(ifp, kIfNetUseCount_MustNotBeZero)) != 0) { + + if (d->bd_bif == NULL) { + lck_mtx_unlock(bpf_mlock); + m_free(m); + return (ENXIO); + } + + if ((unsigned)datlen > ifp->if_mtu) { lck_mtx_unlock(bpf_mlock); m_freem(m); - return (error); + return (EMSGSIZE); } + #if CONFIG_MACF_NET mac_mbuf_label_associate_bpfdesc(d, m); #endif @@ -892,10 +1104,7 @@ bpfwrite(dev_t dev, struct uio *uio, __unused int ioflag) else { error = dlil_output(ifp, PF_INET, m, NULL, (struct sockaddr *)dst_buf, 0); } - - if (ifp_unuse(ifp) != 0) - ifp_use_reached_zero(ifp); - + /* * The driver frees the mbuf. */ @@ -956,6 +1165,10 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags, return (ENXIO); } + if (d->bd_state == BPF_WAITING) + bpf_stop_timer(d); + d->bd_state = BPF_IDLE; + switch (cmd) { default: @@ -1124,34 +1337,60 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags, /* * Set read timeout. */ - case BIOCSRTIMEOUT: - { - struct BPF_TIMEVAL *_tv = (struct BPF_TIMEVAL *)addr; + case BIOCSRTIMEOUT32: + { + struct user32_timeval *_tv = (struct user32_timeval *)addr; struct timeval tv; tv.tv_sec = _tv->tv_sec; tv.tv_usec = _tv->tv_usec; - /* + /* * Subtract 1 tick from tvtohz() since this isn't * a one-shot timer. */ if ((error = itimerfix(&tv)) == 0) d->bd_rtout = tvtohz(&tv) - 1; break; - } + } - /* + case BIOCSRTIMEOUT64: + { + struct user64_timeval *_tv = (struct user64_timeval *)addr; + struct timeval tv; + + tv.tv_sec = _tv->tv_sec; + tv.tv_usec = _tv->tv_usec; + + /* + * Subtract 1 tick from tvtohz() since this isn't + * a one-shot timer. + */ + if ((error = itimerfix(&tv)) == 0) + d->bd_rtout = tvtohz(&tv) - 1; + break; + } + + /* * Get read timeout.
*/ - case BIOCGRTIMEOUT: + case BIOCGRTIMEOUT32: { - struct BPF_TIMEVAL *tv = (struct BPF_TIMEVAL *)addr; + struct user32_timeval *tv = (struct user32_timeval *)addr; tv->tv_sec = d->bd_rtout / hz; tv->tv_usec = (d->bd_rtout % hz) * tick; break; - } + } + + case BIOCGRTIMEOUT64: + { + struct user64_timeval *tv = (struct user64_timeval *)addr; + + tv->tv_sec = d->bd_rtout / hz; + tv->tv_usec = (d->bd_rtout % hz) * tick; + break; + } /* * Get packet stats. @@ -1320,14 +1559,10 @@ bpf_setif(struct bpf_d *d, ifnet_t theywant, u_int32_t dlt) continue; /* * We found the requested interface. - * If it's not up, return an error. * Allocate the packet buffers if we need to. * If we're already attached to requested interface, * just flush the buffer. */ - if ((ifp->if_flags & IFF_UP) == 0) - return (ENETDOWN); - if (d->bd_sbuf == 0) { error = bpf_allocbufs(d); if (error != 0) @@ -1441,10 +1676,10 @@ bpf_setdlt(struct bpf_d *d, uint32_t dlt) * Otherwise, return false but make a note that a selwakeup() must be done. */ int -bpfpoll(dev_t dev, int events, void * wql, struct proc *p) +bpfselect(dev_t dev, int which, void * wql, struct proc *p) { struct bpf_d *d; - int revents = 0; + int ret = 0; lck_mtx_lock(bpf_mlock); @@ -1454,25 +1689,38 @@ return (ENXIO); } - /* - * An imitation of the FIONREAD ioctl code. - */ if (d->bd_bif == NULL) { lck_mtx_unlock(bpf_mlock); return (ENXIO); } - if (events & (POLLIN | POLLRDNORM)) { - if (d->bd_hlen != 0 || (d->bd_immediate && d->bd_slen != 0)) - revents |= events & (POLLIN | POLLRDNORM); - else - selrecord(p, &d->bd_sel, wql); + switch (which) { + case FREAD: + if (d->bd_hlen != 0 || + ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) && + d->bd_slen != 0)) + ret = 1; /* read has data to return */ + else { + /* + * Read has no data to return. + * Make the select wait, and start a timer if + * necessary. + */ + selrecord(p, &d->bd_sel, wql); + bpf_start_timer(d); + } + break; + + case FWRITE: + ret = 1; /* can't determine whether a write would block */ + break; } lck_mtx_unlock(bpf_mlock); - return (revents); + return (ret); } + /* * Support for kevent() system call. Register EVFILT_READ filters and * reject all others. @@ -1511,9 +1759,6 @@ bpfkqfilter(dev_t dev, struct knote *kn) return (ENXIO); } - /* - * An imitation of the FIONREAD ioctl code. - */ if (d->bd_bif == NULL) { lck_mtx_unlock(bpf_mlock); return (ENXIO); @@ -1546,13 +1791,52 @@ filt_bpfread(struct knote *kn, long hint) lck_mtx_lock(bpf_mlock); if (d->bd_immediate) { + /* + * If there's data in the hold buffer, it's the + * amount of data a read will return. + * + * If there's no data in the hold buffer, but + * there's data in the store buffer, a read will + * immediately rotate the store buffer to the + * hold buffer, so the amount of data in the store + * buffer is the amount of data a read will + * return. + * + * If there's no data in either buffer, we're not + * ready to read. + */ kn->kn_data = (d->bd_hlen == 0 ? d->bd_slen : d->bd_hlen); - ready = (kn->kn_data >= ((kn->kn_sfflags & NOTE_LOWAT) ? - kn->kn_sdata : 1)); + int64_t lowwat = 1; + if (kn->kn_sfflags & NOTE_LOWAT) + { + if (kn->kn_sdata > d->bd_bufsize) + lowwat = d->bd_bufsize; + else if (kn->kn_sdata > lowwat) + lowwat = kn->kn_sdata; + } + ready = (kn->kn_data >= lowwat); } else { - kn->kn_data = d->bd_hlen; + /* + * If there's data in the hold buffer, it's the + * amount of data a read will return.
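Editor's note: from user space the timeout driving all of this is still set with the single BIOCSRTIMEOUT ioctl; the BIOCSRTIMEOUT32/64 variants above share its command number and exist only so the kernel can pick the right user_timeval layout for 32- and 64-bit callers. A hypothetical user-space sketch, not part of the patch:

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <net/bpf.h>
#include <fcntl.h>
#include <unistd.h>

/* Open a bpf device and arm a read timeout of `ms' milliseconds. */
static int
open_bpf_with_timeout(const char *dev, long ms)
{
	struct timeval tv;
	int fd;

	if ((fd = open(dev, O_RDWR)) < 0)
		return (-1);
	tv.tv_sec = ms / 1000;
	tv.tv_usec = (ms % 1000) * 1000;
	if (ioctl(fd, BIOCSRTIMEOUT, &tv) < 0) {
		close(fd);
		return (-1);
	}
	return (fd);
}

With this set and immediate mode off, a select() on the descriptor returns either when the store buffer rotates or when the thread-call timer fires — e.g. open_bpf_with_timeout("/dev/bpf0", 250) for a quarter-second poll.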
+ * + * If there's no data in the hold buffer, but + * there's data in the store buffer, if the + * timer has expired a read will immediately + * rotate the store buffer to the hold buffer, + * so the amount of data in the store buffer is + * the amount of data a read will return. + * + * If there's no data in either buffer, or there's + * no data in the hold buffer and the timer hasn't + * expired, we're not ready to read. + */ + kn->kn_data = (d->bd_hlen == 0 && d->bd_state == BPF_TIMED_OUT ? + d->bd_slen : d->bd_hlen); ready = (kn->kn_data > 0); } + if (!ready) + bpf_start_timer(d); if (hint == 0) lck_mtx_unlock(bpf_mlock); @@ -1721,6 +2005,7 @@ catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, struct bpf_hdr *hp; int totlen, curlen; int hdrlen = d->bd_bif->bif_hdrlen; + int do_wakeup = 0; /* * Figure out how many bytes to move. If the packet is * greater or equal to the snapshot length, transfer that @@ -1741,7 +2026,7 @@ catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, * Rotate the buffers if we can, then wakeup any * pending reads. */ - if (d->bd_fbuf == 0) { + if (d->bd_fbuf == NULL) { /* * We haven't completed the previous read yet, * so drop the packet. @@ -1750,15 +2035,16 @@ catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, return; } ROTATE_BUFFERS(d); - bpf_wakeup(d); + do_wakeup = 1; curlen = 0; } - else if (d->bd_immediate) + else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) /* - * Immediate mode is set. A packet arrived so any - * reads should be woken up. + * Immediate mode is set, or the read timeout has + * already expired during a select call. A packet + * arrived, so the reader should be woken up. */ - bpf_wakeup(d); + do_wakeup = 1; /* * Append the bpf header. @@ -1775,6 +2061,9 @@ catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen, */ (*cpfn)(pkt, (u_char *)hp + hdrlen, (hp->bh_caplen = totlen - hdrlen)); d->bd_slen = curlen + totlen; + + if (do_wakeup) + bpf_wakeup(d); } /* diff --git a/bsd/net/bpf.h b/bsd/net/bpf.h index a07cfe28e..92a5f31a0 100644 --- a/bsd/net/bpf.h +++ b/bsd/net/bpf.h @@ -172,8 +172,16 @@ struct bpf_version { #define BIOCGDLT _IOR('B',106, u_int) #define BIOCGETIF _IOR('B',107, struct ifreq) #define BIOCSETIF _IOW('B',108, struct ifreq) -#define BIOCSRTIMEOUT _IOW('B',109, struct BPF_TIMEVAL) -#define BIOCGRTIMEOUT _IOR('B',110, struct BPF_TIMEVAL) +#define BIOCSRTIMEOUT _IOW('B',109, struct timeval) +#ifdef KERNEL_PRIVATE +#define BIOCSRTIMEOUT64 _IOW('B',109, struct user64_timeval) +#define BIOCSRTIMEOUT32 _IOW('B',109, struct user32_timeval) +#endif /* KERNEL_PRIVATE */ +#define BIOCGRTIMEOUT _IOR('B',110, struct timeval) +#ifdef KERNEL_PRIVATE +#define BIOCGRTIMEOUT64 _IOR('B',110, struct user64_timeval) +#define BIOCGRTIMEOUT32 _IOR('B',110, struct user32_timeval) +#endif /* KERNEL_PRIVATE */ #define BIOCGSTATS _IOR('B',111, struct bpf_stat) #define BIOCIMMEDIATE _IOW('B',112, u_int) #define BIOCVERSION _IOR('B',113, struct bpf_version) diff --git a/bsd/net/bpf_filter.c b/bsd/net/bpf_filter.c index 31ce77023..69d35371f 100644 --- a/bsd/net/bpf_filter.c +++ b/bsd/net/bpf_filter.c @@ -110,6 +110,8 @@ } \ } +extern unsigned int bpf_maxbufsize; + static u_int16_t m_xhalf(struct mbuf *m, bpf_u_int32 k, int *err); static u_int32_t m_xword(struct mbuf *m, bpf_u_int32 k, int *err); @@ -528,9 +530,10 @@ bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) /* * Return true if the 'fcode' is a valid filter program. 
* The constraints are that each jump be forward and to a valid - * code. The code must terminate with either an accept or reject. - * 'valid' is an array for use by the routine (it must be at least - * 'len' bytes long). + * code, that memory accesses are within valid ranges (to the + * extent that this can be checked statically; loads of packet data + * have to be, and are, also checked at run time), and that + * the code terminates with either an accept or reject. * * The kernel needs to be able to verify an application's filter code. * Otherwise, a bogus program could easily crash the system. @@ -538,40 +541,112 @@ bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen) int bpf_validate(const struct bpf_insn *f, int len) { - register int i; + u_int i, from; const struct bpf_insn *p; - for (i = 0; i < len; ++i) { - /* - * Check that that jumps are forward, and within - * the code block. - */ + if (len < 1 || len > BPF_MAXINSNS) + return 0; + + for (i = 0; i < ((u_int)len); ++i) { p = &f[i]; - if (BPF_CLASS(p->code) == BPF_JMP) { - register int from = i + 1; - - if (BPF_OP(p->code) == BPF_JA) { - if (from >= len || p->k >= (bpf_u_int32)(len - from)) + switch (BPF_CLASS(p->code)) { + /* + * Check that memory operations use valid addresses + */ + case BPF_LD: + case BPF_LDX: + switch (BPF_MODE(p->code)) { + case BPF_IMM: + break; + case BPF_ABS: + case BPF_IND: + case BPF_MSH: + /* + * A stricter check against the actual packet + * length is done at run time. + */ + if (p->k >= bpf_maxbufsize) + return 0; + break; + case BPF_MEM: + if (p->k >= BPF_MEMWORDS) + return 0; + break; + case BPF_LEN: + break; + default: + return 0; + } + break; + case BPF_ST: + case BPF_STX: + if (p->k >= BPF_MEMWORDS) return 0; - } - else if (from >= len || p->jt >= len - from || - p->jf >= len - from) + break; + case BPF_ALU: + switch (BPF_OP(p->code)) { + case BPF_ADD: + case BPF_SUB: + case BPF_MUL: + case BPF_OR: + case BPF_AND: + case BPF_LSH: + case BPF_RSH: + case BPF_NEG: + break; + case BPF_DIV: + /* + * Check for constant division by 0 + */ + if(BPF_SRC(p->code) == BPF_K && p->k == 0) + return 0; + break; + default: + return 0; + } + break; + case BPF_JMP: + /* + * Check that jumps are within the code block, + * and that unconditional branches don't go + * backwards as a result of an overflow. + * Unconditional branches have a 32-bit offset, + * so they could overflow; we check to make + * sure they don't. Conditional branches have + * an 8-bit offset, and the from address is + * less than or equal to BPF_MAXINSNS, and we assume that + * BPF_MAXINSNS is sufficiently small that adding 255 + * to it won't overflow. + * + * We know that len is <= BPF_MAXINSNS, and we + * assume that BPF_MAXINSNS is less than the maximum + * size of a u_int, so that i+1 doesn't overflow. + */ + from = i+1; + switch (BPF_OP(p->code)) { + case BPF_JA: + if (from + p->k < from || from + p->k >= ((u_int)len)) + return 0; + break; + case BPF_JEQ: + case BPF_JGT: + case BPF_JGE: + case BPF_JSET: + if (from + p->jt >= ((u_int)len) || from + p->jf >= ((u_int)len)) + return 0; + break; + default: + return 0; + } + break; + case BPF_RET: + break; + case BPF_MISC: + break; + default: return 0; } - /* - * Check that memory operations use valid addresses. - */ - if ((BPF_CLASS(p->code) == BPF_ST || - (BPF_CLASS(p->code) == BPF_LD && - (p->code & 0xe0) == BPF_MEM)) && - p->k >= BPF_MEMWORDS) - return 0; - /* - * Check for constant division by 0.
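Editor's note: two throwaway programs illustrating what the rewritten bpf_validate() above now decides statically; both are hypothetical examples, not part of the patch. The first is accepted; the second is rejected up front for its constant division by zero.

#include <sys/types.h>
#include <net/bpf.h>

/* Accepted: load the first word of the packet, return it as the snap length. */
static struct bpf_insn ok_prog[] = {
	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, 0),
	BPF_STMT(BPF_RET | BPF_K, (u_int)-1),
};

/* Rejected: BPF_DIV with a constant (BPF_K) operand of zero. */
static struct bpf_insn bad_prog[] = {
	BPF_STMT(BPF_LD | BPF_IMM, 4),
	BPF_STMT(BPF_ALU | BPF_DIV | BPF_K, 0),
	BPF_STMT(BPF_RET | BPF_A, 0),
};

The absolute load in ok_prog also has to satisfy the new static bound (p->k < bpf_maxbufsize); out-of-range packet offsets are still caught again at run time in bpf_filter().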
- */ - if (p->code == (BPF_ALU|BPF_DIV|BPF_K) && p->k == 0) - return 0; } - return BPF_CLASS(f[len - 1].code) == BPF_RET; + return BPF_CLASS(f[len - 1].code) == BPF_RET; } #endif diff --git a/bsd/net/bpfdesc.h b/bsd/net/bpfdesc.h index 2a5cd1aaf..e0507f935 100644 --- a/bsd/net/bpfdesc.h +++ b/bsd/net/bpfdesc.h @@ -76,6 +76,7 @@ */ #include <sys/select.h> +#include <kern/thread_call.h> /* * Descriptor associated with each open bpf file. @@ -99,7 +100,7 @@ struct bpf_d { int bd_bufsize; /* absolute length of buffers */ - struct bpf_if * bd_bif; /* interface descriptor */ + struct bpf_if *bd_bif; /* interface descriptor */ u_int32_t bd_rtout; /* Read timeout in 'ticks' */ struct bpf_insn *bd_filter; /* filter code */ u_int32_t bd_rcount; /* number of packets received */ @@ -127,11 +128,24 @@ struct bpf_d { int bd_hdrcmplt; /* false to fill in src lladdr automatically */ int bd_seesent; /* true if bpf should see sent packets */ int bd_oflags; /* device open flags */ + thread_call_t bd_thread_call; /* for BPF timeouts with select */ #if CONFIG_MACF_NET struct label * bd_label; /* MAC label for descriptor */ #endif }; +/* Values for bd_state */ +#define BPF_IDLE 0 /* no select in progress or kqueue pending */ +#define BPF_WAITING 1 /* waiting for read timeout in select/kqueue */ +#define BPF_TIMED_OUT 2 /* read timeout has expired in select/kqueue */ +#define BPF_DRAINING 3 /* waiting for timeout routine to finish during close */ + +/* Test whether a BPF is ready for read(). */ +#define bpf_ready(bd) ((bd)->bd_hlen != 0 || \ + (((bd)->bd_immediate || (bd)->bd_state == BPF_TIMED_OUT) && \ + (bd)->bd_slen != 0)) + + /* * Descriptor associated with each attached hardware interface. */ diff --git a/bsd/net/bridgestp.c b/bsd/net/bridgestp.c new file mode 100644 index 000000000..1d6922f28 --- /dev/null +++ b/bsd/net/bridgestp.c @@ -0,0 +1,2425 @@ +/* $NetBSD: bridgestp.c,v 1.5 2003/11/28 08:56:48 keihan Exp $ */ + +/* + * Copyright (c) 2009-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Copyright (c) 2000 Jason L. Wright (jason@thought.net) + * Copyright (c) 2006 Andrew Thompson (thompsa@FreeBSD.org) + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * OpenBSD: bridgestp.c,v 1.5 2001/03/22 03:48:29 jason Exp + */ + +/* + * Implementation of the spanning tree protocol as defined in + * ISO/IEC 802.1D-2004, June 9, 2004. + */ + +#include <sys/cdefs.h> +//__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/sockio.h> +#include <sys/kernel.h> +//#include <sys/callout.h> +//#include <sys/module.h> +#include <sys/proc.h> +#include <sys/lock.h> +//#include <sys/mutex.h> +//#include <sys/taskqueue.h> + +#include <net/if.h> +#include <net/if_dl.h> +#include <net/if_types.h> +#include <net/if_llc.h> +#include <net/if_media.h> + +#include <net/kpi_interface.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/in_var.h> +#include <netinet/if_ether.h> +#include <net/bridgestp.h> + +#include <kern/thread.h> + +static lck_mtx_t *bstp_task_mtx = NULL; +static lck_grp_t *bstp_task_grp = NULL; +static lck_attr_t *bstp_task_attr = NULL; +static thread_t bstp_task_thread; +static TAILQ_HEAD(bstp_task_queue, bstp_task) + bstp_task_queue = TAILQ_HEAD_INITIALIZER(bstp_task_queue); +static struct bstp_task *bstp_task_queue_running = NULL; + +static void bstp_create_task_thread(void); +static void bstp_task_thread_func(void); + +static void bstp_task_enqueue(struct bstp_task *); +static void bstp_task_drain(struct bstp_task *); + +#define BSTP_TASK_INIT(bt, func, context) do { \ + (bt)->bt_count = 0; \ + (bt)->bt_func = func; \ + (bt)->bt_context = context; \ +} while(0) + + + +#define BSTP_LOCK_INIT(_bs) (_bs)->bs_mtx = lck_mtx_alloc_init(bstp_lock_grp, bstp_lock_attr) +#define BSTP_LOCK_DESTROY(_bs) lck_mtx_free((_bs)->bs_mtx, bstp_lock_grp) +#define BSTP_LOCK(_bs) lck_mtx_lock((_bs)->bs_mtx) +#define BSTP_UNLOCK(_bs) lck_mtx_unlock((_bs)->bs_mtx) +#define BSTP_LOCK_ASSERT(_bs) lck_mtx_assert((_bs)->bs_mtx, LCK_MTX_ASSERT_OWNED) + + +#ifdef BRIDGESTP_DEBUG +#define DPRINTF(fmt, arg...) printf("bstp: " fmt, ##arg) +#else +#define DPRINTF(fmt, arg...) 
+#endif + +#define PV2ADDR(pv, eaddr) do { \ + eaddr[0] = pv >> 40; \ + eaddr[1] = pv >> 32; \ + eaddr[2] = pv >> 24; \ + eaddr[3] = pv >> 16; \ + eaddr[4] = pv >> 8; \ + eaddr[5] = pv >> 0; \ +} while (0) + +#define INFO_BETTER 1 +#define INFO_SAME 0 +#define INFO_WORSE -1 + +const uint8_t bstp_etheraddr[] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; + +LIST_HEAD(, bstp_state) bstp_list; +static lck_mtx_t *bstp_list_mtx; +static lck_grp_t *bstp_lock_grp = NULL; +static lck_attr_t *bstp_lock_attr = NULL; + +static void bstp_transmit(struct bstp_state *, struct bstp_port *); +static void bstp_transmit_bpdu(struct bstp_state *, struct bstp_port *); +static void bstp_transmit_tcn(struct bstp_state *, struct bstp_port *); +static void bstp_decode_bpdu(struct bstp_port *, struct bstp_cbpdu *, + struct bstp_config_unit *); +static void bstp_send_bpdu(struct bstp_state *, struct bstp_port *, + struct bstp_cbpdu *); +static void bstp_enqueue(struct ifnet *, struct mbuf *); +static int bstp_pdu_flags(struct bstp_port *); +static void bstp_received_stp(struct bstp_state *, struct bstp_port *, + struct mbuf **, struct bstp_tbpdu *); +static void bstp_received_rstp(struct bstp_state *, struct bstp_port *, + struct mbuf **, struct bstp_tbpdu *); +static void bstp_received_tcn(struct bstp_state *, struct bstp_port *, + struct bstp_tcn_unit *); +static void bstp_received_bpdu(struct bstp_state *, struct bstp_port *, + struct bstp_config_unit *); +static int bstp_pdu_rcvtype(struct bstp_port *, struct bstp_config_unit *); +static int bstp_pdu_bettersame(struct bstp_port *, int); +static int bstp_info_cmp(struct bstp_pri_vector *, + struct bstp_pri_vector *); +static int bstp_info_superior(struct bstp_pri_vector *, + struct bstp_pri_vector *); +static void bstp_assign_roles(struct bstp_state *); +static void bstp_update_roles(struct bstp_state *, struct bstp_port *); +static void bstp_update_state(struct bstp_state *, struct bstp_port *); +static void bstp_update_tc(struct bstp_port *); +static void bstp_update_info(struct bstp_port *); +static void bstp_set_other_tcprop(struct bstp_port *); +static void bstp_set_all_reroot(struct bstp_state *); +static void bstp_set_all_sync(struct bstp_state *); +static void bstp_set_port_state(struct bstp_port *, int); +static void bstp_set_port_role(struct bstp_port *, int); +static void bstp_set_port_proto(struct bstp_port *, int); +static void bstp_set_port_tc(struct bstp_port *, int); +static void bstp_set_timer_tc(struct bstp_port *); +static void bstp_set_timer_msgage(struct bstp_port *); +static int bstp_rerooted(struct bstp_state *, struct bstp_port *); +static uint32_t bstp_calc_path_cost(struct bstp_port *); +static void bstp_notify_state(void *, int); +static void bstp_notify_rtage(void *, int); +static void bstp_ifupdstatus(struct bstp_state *, struct bstp_port *); +static void bstp_enable_port(struct bstp_state *, struct bstp_port *); +static void bstp_disable_port(struct bstp_state *, struct bstp_port *); +static void bstp_tick(void *); +static void bstp_timer_start(struct bstp_timer *, uint16_t); +static void bstp_timer_stop(struct bstp_timer *); +static void bstp_timer_latch(struct bstp_timer *); +static int bstp_timer_expired(struct bstp_timer *); +static void bstp_hello_timer_expiry(struct bstp_state *, + struct bstp_port *); +static void bstp_message_age_expiry(struct bstp_state *, + struct bstp_port *); +static void bstp_migrate_delay_expiry(struct bstp_state *, + struct bstp_port *); +static void bstp_edge_delay_expiry(struct bstp_state *, + struct 
bstp_port *); +static int bstp_addr_cmp(const uint8_t *, const uint8_t *); +static int bstp_same_bridgeid(uint64_t, uint64_t); +static void bstp_reinit(struct bstp_state *); + +static void +bstp_transmit(struct bstp_state *bs, struct bstp_port *bp) +{ + if (bs->bs_running == 0) + return; + + /* + * a PDU can only be sent if we have tx quota left and the + * hello timer is running. + */ + if (bp->bp_hello_timer.active == 0) { + /* Test if it needs to be reset */ + bstp_hello_timer_expiry(bs, bp); + return; + } + if (bp->bp_txcount > bs->bs_txholdcount) + /* Ran out of karma */ + return; + + if (bp->bp_protover == BSTP_PROTO_RSTP) { + bstp_transmit_bpdu(bs, bp); + bp->bp_tc_ack = 0; + } else { /* STP */ + switch (bp->bp_role) { + case BSTP_ROLE_DESIGNATED: + bstp_transmit_bpdu(bs, bp); + bp->bp_tc_ack = 0; + break; + + case BSTP_ROLE_ROOT: + bstp_transmit_tcn(bs, bp); + break; + } + } + bstp_timer_start(&bp->bp_hello_timer, bp->bp_desg_htime); + bp->bp_flags &= ~BSTP_PORT_NEWINFO; +} + +static void +bstp_transmit_bpdu(struct bstp_state *bs, struct bstp_port *bp) +{ + struct bstp_cbpdu bpdu; + + BSTP_LOCK_ASSERT(bs); + + bpdu.cbu_rootpri = htons(bp->bp_desg_pv.pv_root_id >> 48); + PV2ADDR(bp->bp_desg_pv.pv_root_id, bpdu.cbu_rootaddr); + + bpdu.cbu_rootpathcost = htonl(bp->bp_desg_pv.pv_cost); + + bpdu.cbu_bridgepri = htons(bp->bp_desg_pv.pv_dbridge_id >> 48); + PV2ADDR(bp->bp_desg_pv.pv_dbridge_id, bpdu.cbu_bridgeaddr); + + bpdu.cbu_portid = htons(bp->bp_port_id); + bpdu.cbu_messageage = htons(bp->bp_desg_msg_age); + bpdu.cbu_maxage = htons(bp->bp_desg_max_age); + bpdu.cbu_hellotime = htons(bp->bp_desg_htime); + bpdu.cbu_forwarddelay = htons(bp->bp_desg_fdelay); + + bpdu.cbu_flags = bstp_pdu_flags(bp); + + switch (bp->bp_protover) { + case BSTP_PROTO_STP: + bpdu.cbu_bpdutype = BSTP_MSGTYPE_CFG; + break; + + case BSTP_PROTO_RSTP: + bpdu.cbu_bpdutype = BSTP_MSGTYPE_RSTP; + break; + } + + bstp_send_bpdu(bs, bp, &bpdu); +} + +static void +bstp_transmit_tcn(struct bstp_state *bs, struct bstp_port *bp) +{ + struct bstp_tbpdu bpdu; + struct ifnet *ifp = bp->bp_ifp; + struct ether_header *eh; + struct mbuf *m; + int touched = bs ? 
1 : 0; + + touched++; + + KASSERT(bp == bs->bs_root_port, ("%s: bad root port\n", __func__)); + + if ((ifp->if_flags & IFF_RUNNING) == 0) + return; + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) + return; + + m->m_pkthdr.rcvif = ifp; + m->m_pkthdr.len = sizeof(*eh) + sizeof(bpdu); + m->m_len = m->m_pkthdr.len; + + eh = mtod(m, struct ether_header *); + + memcpy(eh->ether_shost, ifnet_lladdr(ifp), ETHER_ADDR_LEN); + memcpy(eh->ether_dhost, bstp_etheraddr, ETHER_ADDR_LEN); + eh->ether_type = htons(sizeof(bpdu)); + + bpdu.tbu_ssap = bpdu.tbu_dsap = LLC_8021D_LSAP; + bpdu.tbu_ctl = LLC_UI; + bpdu.tbu_protoid = 0; + bpdu.tbu_protover = 0; + bpdu.tbu_bpdutype = BSTP_MSGTYPE_TCN; + + memcpy(mtod(m, caddr_t) + sizeof(*eh), &bpdu, sizeof(bpdu)); + + bp->bp_txcount++; + bstp_enqueue(ifp, m); +} + +static void +bstp_decode_bpdu(struct bstp_port *bp, struct bstp_cbpdu *cpdu, + struct bstp_config_unit *cu) +{ + int flags; + + cu->cu_pv.pv_root_id = + (((uint64_t)ntohs(cpdu->cbu_rootpri)) << 48) | + (((uint64_t)cpdu->cbu_rootaddr[0]) << 40) | + (((uint64_t)cpdu->cbu_rootaddr[1]) << 32) | + (((uint64_t)cpdu->cbu_rootaddr[2]) << 24) | + (((uint64_t)cpdu->cbu_rootaddr[3]) << 16) | + (((uint64_t)cpdu->cbu_rootaddr[4]) << 8) | + (((uint64_t)cpdu->cbu_rootaddr[5]) << 0); + + cu->cu_pv.pv_dbridge_id = + (((uint64_t)ntohs(cpdu->cbu_bridgepri)) << 48) | + (((uint64_t)cpdu->cbu_bridgeaddr[0]) << 40) | + (((uint64_t)cpdu->cbu_bridgeaddr[1]) << 32) | + (((uint64_t)cpdu->cbu_bridgeaddr[2]) << 24) | + (((uint64_t)cpdu->cbu_bridgeaddr[3]) << 16) | + (((uint64_t)cpdu->cbu_bridgeaddr[4]) << 8) | + (((uint64_t)cpdu->cbu_bridgeaddr[5]) << 0); + + cu->cu_pv.pv_cost = ntohl(cpdu->cbu_rootpathcost); + cu->cu_message_age = ntohs(cpdu->cbu_messageage); + cu->cu_max_age = ntohs(cpdu->cbu_maxage); + cu->cu_hello_time = ntohs(cpdu->cbu_hellotime); + cu->cu_forward_delay = ntohs(cpdu->cbu_forwarddelay); + cu->cu_pv.pv_dport_id = ntohs(cpdu->cbu_portid); + cu->cu_pv.pv_port_id = bp->bp_port_id; + cu->cu_message_type = cpdu->cbu_bpdutype; + + /* Strip off unused flags in STP mode */ + flags = cpdu->cbu_flags; + switch (cpdu->cbu_protover) { + case BSTP_PROTO_STP: + flags &= BSTP_PDU_STPMASK; + /* A STP BPDU explicitly conveys a Designated Port */ + cu->cu_role = BSTP_ROLE_DESIGNATED; + break; + + case BSTP_PROTO_RSTP: + flags &= BSTP_PDU_RSTPMASK; + break; + } + + cu->cu_topology_change_ack = + (flags & BSTP_PDU_F_TCA) ? 1 : 0; + cu->cu_proposal = + (flags & BSTP_PDU_F_P) ? 1 : 0; + cu->cu_agree = + (flags & BSTP_PDU_F_A) ? 1 : 0; + cu->cu_learning = + (flags & BSTP_PDU_F_L) ? 1 : 0; + cu->cu_forwarding = + (flags & BSTP_PDU_F_F) ? 1 : 0; + cu->cu_topology_change = + (flags & BSTP_PDU_F_TC) ? 
1 : 0; + + switch ((flags & BSTP_PDU_PRMASK) >> BSTP_PDU_PRSHIFT) { + case BSTP_PDU_F_ROOT: + cu->cu_role = BSTP_ROLE_ROOT; + break; + case BSTP_PDU_F_ALT: + cu->cu_role = BSTP_ROLE_ALTERNATE; + break; + case BSTP_PDU_F_DESG: + cu->cu_role = BSTP_ROLE_DESIGNATED; + break; + } +} + +static void +bstp_send_bpdu(struct bstp_state *bs, struct bstp_port *bp, + struct bstp_cbpdu *bpdu) +{ + struct ifnet *ifp; + struct mbuf *m; + struct ether_header *eh; + + BSTP_LOCK_ASSERT(bs); + + ifp = bp->bp_ifp; + + if ((ifp->if_flags & IFF_RUNNING) == 0) + return; + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) + return; + + eh = mtod(m, struct ether_header *); + + bpdu->cbu_ssap = bpdu->cbu_dsap = LLC_8021D_LSAP; + bpdu->cbu_ctl = LLC_UI; + bpdu->cbu_protoid = htons(BSTP_PROTO_ID); + + memcpy(eh->ether_shost, ifnet_lladdr(ifp), ETHER_ADDR_LEN); + memcpy(eh->ether_dhost, bstp_etheraddr, ETHER_ADDR_LEN); + + switch (bpdu->cbu_bpdutype) { + case BSTP_MSGTYPE_CFG: + bpdu->cbu_protover = BSTP_PROTO_STP; + m->m_pkthdr.len = sizeof(*eh) + BSTP_BPDU_STP_LEN; + eh->ether_type = htons(BSTP_BPDU_STP_LEN); + memcpy(mtod(m, caddr_t) + sizeof(*eh), bpdu, + BSTP_BPDU_STP_LEN); + break; + + case BSTP_MSGTYPE_RSTP: + bpdu->cbu_protover = BSTP_PROTO_RSTP; + bpdu->cbu_versionlen = htons(0); + m->m_pkthdr.len = sizeof(*eh) + BSTP_BPDU_RSTP_LEN; + eh->ether_type = htons(BSTP_BPDU_RSTP_LEN); + memcpy(mtod(m, caddr_t) + sizeof(*eh), bpdu, + BSTP_BPDU_RSTP_LEN); + break; + + default: + panic("not implemented"); + } + m->m_pkthdr.rcvif = ifp; + m->m_len = m->m_pkthdr.len; + + bp->bp_txcount++; + bstp_enqueue(ifp, m); +} + +static void +bstp_enqueue(struct ifnet *dst_ifp, struct mbuf *m) +{ + errno_t error = 0; + u_int32_t len = m->m_pkthdr.len; + + m->m_flags |= M_PROTO1; //set to avoid loops + + error = ifnet_output_raw(dst_ifp, 0, m); + if (error == 0) { + (void) ifnet_stat_increment_out(dst_ifp, 1, len, 0); + } else { + (void) ifnet_stat_increment_out(dst_ifp, 0, 0, 1); + } +} + +static int +bstp_pdu_flags(struct bstp_port *bp) +{ + int flags = 0; + + if (bp->bp_proposing && bp->bp_state != BSTP_IFSTATE_FORWARDING) + flags |= BSTP_PDU_F_P; + + if (bp->bp_agree) + flags |= BSTP_PDU_F_A; + + if (bp->bp_tc_timer.active) + flags |= BSTP_PDU_F_TC; + + if (bp->bp_tc_ack) + flags |= BSTP_PDU_F_TCA; + + switch (bp->bp_state) { + case BSTP_IFSTATE_LEARNING: + flags |= BSTP_PDU_F_L; + break; + + case BSTP_IFSTATE_FORWARDING: + flags |= (BSTP_PDU_F_L | BSTP_PDU_F_F); + break; + } + + switch (bp->bp_role) { + case BSTP_ROLE_ROOT: + flags |= + (BSTP_PDU_F_ROOT << BSTP_PDU_PRSHIFT); + break; + + case BSTP_ROLE_ALTERNATE: + case BSTP_ROLE_BACKUP: /* fall through */ + flags |= + (BSTP_PDU_F_ALT << BSTP_PDU_PRSHIFT); + break; + + case BSTP_ROLE_DESIGNATED: + flags |= + (BSTP_PDU_F_DESG << BSTP_PDU_PRSHIFT); + break; + } + + /* Strip off unused flags in either mode */ + switch (bp->bp_protover) { + case BSTP_PROTO_STP: + flags &= BSTP_PDU_STPMASK; + break; + case BSTP_PROTO_RSTP: + flags &= BSTP_PDU_RSTPMASK; + break; + } + return (flags); +} + +struct mbuf * +bstp_input(struct bstp_port *bp, __unused struct ifnet *ifp, struct mbuf *m) +{ + struct bstp_state *bs = bp->bp_bs; + struct ether_header *eh; + struct bstp_tbpdu tpdu; + uint16_t len; + + if (bp->bp_active == 0) { + m_freem(m); + return (NULL); + } + + BSTP_LOCK(bs); + + eh = mtod(m, struct ether_header *); + + len = ntohs(eh->ether_type); + if (len < sizeof(tpdu)) + goto out; + + m_adj(m, ETHER_HDR_LEN); + + if (m->m_pkthdr.len > len) + m_adj(m, len - m->m_pkthdr.len); + if 
((unsigned int)m->m_len < sizeof(tpdu) && + (m = m_pullup(m, sizeof(tpdu))) == NULL) + goto out; + + memcpy(&tpdu, mtod(m, caddr_t), sizeof(tpdu)); + + /* basic packet checks */ + if (tpdu.tbu_dsap != LLC_8021D_LSAP || + tpdu.tbu_ssap != LLC_8021D_LSAP || + tpdu.tbu_ctl != LLC_UI) + goto out; + if (tpdu.tbu_protoid != BSTP_PROTO_ID) + goto out; + + /* + * We can treat later versions of the PDU as the same as the maximum + * version we implement. All additional parameters/flags are ignored. + */ + if (tpdu.tbu_protover > BSTP_PROTO_MAX) + tpdu.tbu_protover = BSTP_PROTO_MAX; + + if (tpdu.tbu_protover != bp->bp_protover) { + /* + * Wait for the migration delay timer to expire before changing + * protocol version to avoid flip-flops. + */ + if (bp->bp_flags & BSTP_PORT_CANMIGRATE) + bstp_set_port_proto(bp, tpdu.tbu_protover); + else + goto out; + } + + /* Clear operedge upon receiving a PDU on the port */ + bp->bp_operedge = 0; + bstp_timer_start(&bp->bp_edge_delay_timer, + BSTP_DEFAULT_MIGRATE_DELAY); + + switch (tpdu.tbu_protover) { + case BSTP_PROTO_STP: + bstp_received_stp(bs, bp, &m, &tpdu); + break; + + case BSTP_PROTO_RSTP: + bstp_received_rstp(bs, bp, &m, &tpdu); + break; + } +out: + BSTP_UNLOCK(bs); + if (m) + m_freem(m); + return (NULL); +} + +static void +bstp_received_stp(struct bstp_state *bs, struct bstp_port *bp, + struct mbuf **mp, struct bstp_tbpdu *tpdu) +{ + struct bstp_cbpdu cpdu; + struct bstp_config_unit *cu = &bp->bp_msg_cu; + struct bstp_tcn_unit tu; + + switch (tpdu->tbu_bpdutype) { + case BSTP_MSGTYPE_TCN: + tu.tu_message_type = tpdu->tbu_bpdutype; + bstp_received_tcn(bs, bp, &tu); + break; + case BSTP_MSGTYPE_CFG: + if ((*mp)->m_len < BSTP_BPDU_STP_LEN && + (*mp = m_pullup(*mp, BSTP_BPDU_STP_LEN)) == NULL) + return; + memcpy(&cpdu, mtod(*mp, caddr_t), BSTP_BPDU_STP_LEN); + + bstp_decode_bpdu(bp, &cpdu, cu); + bstp_received_bpdu(bs, bp, cu); + break; + } +} + +static void +bstp_received_rstp(struct bstp_state *bs, struct bstp_port *bp, + struct mbuf **mp, struct bstp_tbpdu *tpdu) +{ + struct bstp_cbpdu cpdu; + struct bstp_config_unit *cu = &bp->bp_msg_cu; + + if (tpdu->tbu_bpdutype != BSTP_MSGTYPE_RSTP) + return; + + if ((*mp)->m_len < BSTP_BPDU_RSTP_LEN && + (*mp = m_pullup(*mp, BSTP_BPDU_RSTP_LEN)) == NULL) + return; + memcpy(&cpdu, mtod(*mp, caddr_t), BSTP_BPDU_RSTP_LEN); + + bstp_decode_bpdu(bp, &cpdu, cu); + bstp_received_bpdu(bs, bp, cu); +} + +static void +bstp_received_tcn(__unused struct bstp_state *bs, struct bstp_port *bp, + __unused struct bstp_tcn_unit *tcn) +{ + bp->bp_rcvdtcn = 1; + bstp_update_tc(bp); +} + +static void +bstp_received_bpdu(struct bstp_state *bs, struct bstp_port *bp, + struct bstp_config_unit *cu) +{ + int type; + + BSTP_LOCK_ASSERT(bs); + + /* We need to have transitioned to INFO_MINE before proceeding */ + switch (bp->bp_infois) { + case BSTP_INFO_DISABLED: + case BSTP_INFO_AGED: + return; + } + + type = bstp_pdu_rcvtype(bp, cu); + + switch (type) { + case BSTP_PDU_SUPERIOR: + bs->bs_allsynced = 0; + bp->bp_agreed = 0; + bp->bp_proposing = 0; + + if (cu->cu_proposal && cu->cu_forwarding == 0) + bp->bp_proposed = 1; + if (cu->cu_topology_change) + bp->bp_rcvdtc = 1; + if (cu->cu_topology_change_ack) + bp->bp_rcvdtca = 1; + + if (bp->bp_agree && + !bstp_pdu_bettersame(bp, BSTP_INFO_RECEIVED)) + bp->bp_agree = 0; + + /* copy the received priority and timers to the port */ + bp->bp_port_pv = cu->cu_pv; + bp->bp_port_msg_age = cu->cu_message_age; + bp->bp_port_max_age = cu->cu_max_age; + bp->bp_port_fdelay = cu->cu_forward_delay; + 
bp->bp_port_htime = + (cu->cu_hello_time > BSTP_MIN_HELLO_TIME ? + cu->cu_hello_time : BSTP_MIN_HELLO_TIME); + + /* set expiry for the new info */ + bstp_set_timer_msgage(bp); + + bp->bp_infois = BSTP_INFO_RECEIVED; + bstp_assign_roles(bs); + break; + + case BSTP_PDU_REPEATED: + if (cu->cu_proposal && cu->cu_forwarding == 0) + bp->bp_proposed = 1; + if (cu->cu_topology_change) + bp->bp_rcvdtc = 1; + if (cu->cu_topology_change_ack) + bp->bp_rcvdtca = 1; + + /* rearm the age timer */ + bstp_set_timer_msgage(bp); + break; + + case BSTP_PDU_INFERIOR: + if (cu->cu_learning) { + bp->bp_agreed = 1; + bp->bp_proposing = 0; + } + break; + + case BSTP_PDU_INFERIORALT: + /* + * only point to point links are allowed fast + * transitions to forwarding. + */ + if (cu->cu_agree && bp->bp_ptp_link) { + bp->bp_agreed = 1; + bp->bp_proposing = 0; + } else + bp->bp_agreed = 0; + + if (cu->cu_topology_change) + bp->bp_rcvdtc = 1; + if (cu->cu_topology_change_ack) + bp->bp_rcvdtca = 1; + break; + + case BSTP_PDU_OTHER: + return; /* do nothing */ + } + /* update the state machines with the new data */ + bstp_update_state(bs, bp); +} + +static int +bstp_pdu_rcvtype(struct bstp_port *bp, struct bstp_config_unit *cu) +{ + int type; + + /* default return type */ + type = BSTP_PDU_OTHER; + + switch (cu->cu_role) { + case BSTP_ROLE_DESIGNATED: + if (bstp_info_superior(&bp->bp_port_pv, &cu->cu_pv)) + /* bpdu priority is superior */ + type = BSTP_PDU_SUPERIOR; + else if (bstp_info_cmp(&bp->bp_port_pv, &cu->cu_pv) == + INFO_SAME) { + if (bp->bp_port_msg_age != cu->cu_message_age || + bp->bp_port_max_age != cu->cu_max_age || + bp->bp_port_fdelay != cu->cu_forward_delay || + bp->bp_port_htime != cu->cu_hello_time) + /* bpdu priority is equal and timers differ */ + type = BSTP_PDU_SUPERIOR; + else + /* bpdu is equal */ + type = BSTP_PDU_REPEATED; + } else + /* bpdu priority is worse */ + type = BSTP_PDU_INFERIOR; + + break; + + case BSTP_ROLE_ROOT: + case BSTP_ROLE_ALTERNATE: + case BSTP_ROLE_BACKUP: + if (bstp_info_cmp(&bp->bp_port_pv, &cu->cu_pv) <= INFO_SAME) + /* + * not a designated port and priority is the same or + * worse + */ + type = BSTP_PDU_INFERIORALT; + break; + } + + return (type); +} + +static int +bstp_pdu_bettersame(struct bstp_port *bp, int newinfo) +{ + if (newinfo == BSTP_INFO_RECEIVED && + bp->bp_infois == BSTP_INFO_RECEIVED && + bstp_info_cmp(&bp->bp_port_pv, &bp->bp_msg_cu.cu_pv) >= INFO_SAME) + return (1); + + if (newinfo == BSTP_INFO_MINE && + bp->bp_infois == BSTP_INFO_MINE && + bstp_info_cmp(&bp->bp_port_pv, &bp->bp_desg_pv) >= INFO_SAME) + return (1); + + return (0); +} + +static int +bstp_info_cmp(struct bstp_pri_vector *pv, + struct bstp_pri_vector *cpv) +{ + if (cpv->pv_root_id < pv->pv_root_id) + return (INFO_BETTER); + if (cpv->pv_root_id > pv->pv_root_id) + return (INFO_WORSE); + + if (cpv->pv_cost < pv->pv_cost) + return (INFO_BETTER); + if (cpv->pv_cost > pv->pv_cost) + return (INFO_WORSE); + + if (cpv->pv_dbridge_id < pv->pv_dbridge_id) + return (INFO_BETTER); + if (cpv->pv_dbridge_id > pv->pv_dbridge_id) + return (INFO_WORSE); + + if (cpv->pv_dport_id < pv->pv_dport_id) + return (INFO_BETTER); + if (cpv->pv_dport_id > pv->pv_dport_id) + return (INFO_WORSE); + + return (INFO_SAME); +} + +/* + * This message priority vector is superior to the port priority vector and + * will replace it if, and only if, the message priority vector is better than + * the port priority vector, or the message has been transmitted from the same + * designated bridge and designated port as the port priority 
vector. + */ +static int +bstp_info_superior(struct bstp_pri_vector *pv, + struct bstp_pri_vector *cpv) +{ + if (bstp_info_cmp(pv, cpv) == INFO_BETTER || + (bstp_same_bridgeid(pv->pv_dbridge_id, cpv->pv_dbridge_id) && + (cpv->pv_dport_id & 0xfff) == (pv->pv_dport_id & 0xfff))) + return (1); + return (0); +} + +static void +bstp_assign_roles(struct bstp_state *bs) +{ + struct bstp_port *bp, *rbp = NULL; + struct bstp_pri_vector pv; + + /* default to our priority vector */ + bs->bs_root_pv = bs->bs_bridge_pv; + bs->bs_root_msg_age = 0; + bs->bs_root_max_age = bs->bs_bridge_max_age; + bs->bs_root_fdelay = bs->bs_bridge_fdelay; + bs->bs_root_htime = bs->bs_bridge_htime; + bs->bs_root_port = NULL; + + /* check if any recieved info supersedes us */ + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + if (bp->bp_infois != BSTP_INFO_RECEIVED) + continue; + + pv = bp->bp_port_pv; + pv.pv_cost += bp->bp_path_cost; + + /* + * The root priority vector is the best of the set comprising + * the bridge priority vector plus all root path priority + * vectors whose bridge address is not equal to us. + */ + if (bstp_same_bridgeid(pv.pv_dbridge_id, + bs->bs_bridge_pv.pv_dbridge_id) == 0 && + bstp_info_cmp(&bs->bs_root_pv, &pv) == INFO_BETTER) { + /* the port vector replaces the root */ + bs->bs_root_pv = pv; + bs->bs_root_msg_age = bp->bp_port_msg_age + + BSTP_MESSAGE_AGE_INCR; + bs->bs_root_max_age = bp->bp_port_max_age; + bs->bs_root_fdelay = bp->bp_port_fdelay; + bs->bs_root_htime = bp->bp_port_htime; + rbp = bp; + } + } + + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + /* calculate the port designated vector */ + bp->bp_desg_pv.pv_root_id = bs->bs_root_pv.pv_root_id; + bp->bp_desg_pv.pv_cost = bs->bs_root_pv.pv_cost; + bp->bp_desg_pv.pv_dbridge_id = bs->bs_bridge_pv.pv_dbridge_id; + bp->bp_desg_pv.pv_dport_id = bp->bp_port_id; + bp->bp_desg_pv.pv_port_id = bp->bp_port_id; + + /* calculate designated times */ + bp->bp_desg_msg_age = bs->bs_root_msg_age; + bp->bp_desg_max_age = bs->bs_root_max_age; + bp->bp_desg_fdelay = bs->bs_root_fdelay; + bp->bp_desg_htime = bs->bs_bridge_htime; + + + switch (bp->bp_infois) { + case BSTP_INFO_DISABLED: + bstp_set_port_role(bp, BSTP_ROLE_DISABLED); + break; + + case BSTP_INFO_AGED: + bstp_set_port_role(bp, BSTP_ROLE_DESIGNATED); + bstp_update_info(bp); + break; + + case BSTP_INFO_MINE: + bstp_set_port_role(bp, BSTP_ROLE_DESIGNATED); + /* update the port info if stale */ + if (bstp_info_cmp(&bp->bp_port_pv, + &bp->bp_desg_pv) != INFO_SAME || + (rbp != NULL && + (bp->bp_port_msg_age != rbp->bp_port_msg_age || + bp->bp_port_max_age != rbp->bp_port_max_age || + bp->bp_port_fdelay != rbp->bp_port_fdelay || + bp->bp_port_htime != rbp->bp_port_htime))) + bstp_update_info(bp); + break; + + case BSTP_INFO_RECEIVED: + if (bp == rbp) { + /* + * root priority is derived from this + * port, make it the root port. + */ + bstp_set_port_role(bp, BSTP_ROLE_ROOT); + bs->bs_root_port = bp; + } else if (bstp_info_cmp(&bp->bp_port_pv, + &bp->bp_desg_pv) == INFO_BETTER) { + /* + * the port priority is lower than the root + * port. + */ + bstp_set_port_role(bp, BSTP_ROLE_DESIGNATED); + bstp_update_info(bp); + } else { + if (bstp_same_bridgeid( + bp->bp_port_pv.pv_dbridge_id, + bs->bs_bridge_pv.pv_dbridge_id)) { + /* + * the designated bridge refers to + * another port on this bridge. + */ + bstp_set_port_role(bp, + BSTP_ROLE_BACKUP); + } else { + /* + * the port is an inferior path to the + * root bridge. 
+ */ + bstp_set_port_role(bp, + BSTP_ROLE_ALTERNATE); + } + } + break; + } + } +} + +static void +bstp_update_state(struct bstp_state *bs, struct bstp_port *bp) +{ + struct bstp_port *bp2; + int synced; + + BSTP_LOCK_ASSERT(bs); + + /* check if all the ports have syncronised again */ + if (!bs->bs_allsynced) { + synced = 1; + LIST_FOREACH(bp2, &bs->bs_bplist, bp_next) { + if (!(bp2->bp_synced || + bp2->bp_role == BSTP_ROLE_ROOT)) { + synced = 0; + break; + } + } + bs->bs_allsynced = synced; + } + + bstp_update_roles(bs, bp); + bstp_update_tc(bp); +} + +static void +bstp_update_roles(struct bstp_state *bs, struct bstp_port *bp) +{ + switch (bp->bp_role) { + case BSTP_ROLE_DISABLED: + /* Clear any flags if set */ + if (bp->bp_sync || !bp->bp_synced || bp->bp_reroot) { + bp->bp_sync = 0; + bp->bp_synced = 1; + bp->bp_reroot = 0; + } + break; + + case BSTP_ROLE_ALTERNATE: + case BSTP_ROLE_BACKUP: + if ((bs->bs_allsynced && !bp->bp_agree) || + (bp->bp_proposed && bp->bp_agree)) { + bp->bp_proposed = 0; + bp->bp_agree = 1; + bp->bp_flags |= BSTP_PORT_NEWINFO; + DPRINTF("%s -> ALTERNATE_AGREED\n", + bp->bp_ifp->if_xname); + } + + if (bp->bp_proposed && !bp->bp_agree) { + bstp_set_all_sync(bs); + bp->bp_proposed = 0; + DPRINTF("%s -> ALTERNATE_PROPOSED\n", + bp->bp_ifp->if_xname); + } + + /* Clear any flags if set */ + if (bp->bp_sync || !bp->bp_synced || bp->bp_reroot) { + bp->bp_sync = 0; + bp->bp_synced = 1; + bp->bp_reroot = 0; + DPRINTF("%s -> ALTERNATE_PORT\n", bp->bp_ifp->if_xname); + } + break; + + case BSTP_ROLE_ROOT: + if (bp->bp_state != BSTP_IFSTATE_FORWARDING && !bp->bp_reroot) { + bstp_set_all_reroot(bs); + DPRINTF("%s -> ROOT_REROOT\n", bp->bp_ifp->if_xname); + } + + if ((bs->bs_allsynced && !bp->bp_agree) || + (bp->bp_proposed && bp->bp_agree)) { + bp->bp_proposed = 0; + bp->bp_sync = 0; + bp->bp_agree = 1; + bp->bp_flags |= BSTP_PORT_NEWINFO; + DPRINTF("%s -> ROOT_AGREED\n", bp->bp_ifp->if_xname); + } + + if (bp->bp_proposed && !bp->bp_agree) { + bstp_set_all_sync(bs); + bp->bp_proposed = 0; + DPRINTF("%s -> ROOT_PROPOSED\n", bp->bp_ifp->if_xname); + } + + if (bp->bp_state != BSTP_IFSTATE_FORWARDING && + (bp->bp_forward_delay_timer.active == 0 || + (bstp_rerooted(bs, bp) && + bp->bp_recent_backup_timer.active == 0 && + bp->bp_protover == BSTP_PROTO_RSTP))) { + switch (bp->bp_state) { + case BSTP_IFSTATE_DISCARDING: + bstp_set_port_state(bp, BSTP_IFSTATE_LEARNING); + break; + case BSTP_IFSTATE_LEARNING: + bstp_set_port_state(bp, + BSTP_IFSTATE_FORWARDING); + break; + } + } + + if (bp->bp_state == BSTP_IFSTATE_FORWARDING && bp->bp_reroot) { + bp->bp_reroot = 0; + DPRINTF("%s -> ROOT_REROOTED\n", bp->bp_ifp->if_xname); + } + break; + + case BSTP_ROLE_DESIGNATED: + if (bp->bp_recent_root_timer.active == 0 && bp->bp_reroot) { + bp->bp_reroot = 0; + DPRINTF("%s -> DESIGNATED_RETIRED\n", + bp->bp_ifp->if_xname); + } + + if ((bp->bp_state == BSTP_IFSTATE_DISCARDING && + !bp->bp_synced) || (bp->bp_agreed && !bp->bp_synced) || + (bp->bp_operedge && !bp->bp_synced) || + (bp->bp_sync && bp->bp_synced)) { + bstp_timer_stop(&bp->bp_recent_root_timer); + bp->bp_synced = 1; + bp->bp_sync = 0; + DPRINTF("%s -> DESIGNATED_SYNCED\n", + bp->bp_ifp->if_xname); + } + + if (bp->bp_state != BSTP_IFSTATE_FORWARDING && + !bp->bp_agreed && !bp->bp_proposing && + !bp->bp_operedge) { + bp->bp_proposing = 1; + bp->bp_flags |= BSTP_PORT_NEWINFO; + bstp_timer_start(&bp->bp_edge_delay_timer, + (bp->bp_ptp_link ? 
BSTP_DEFAULT_MIGRATE_DELAY : + bp->bp_desg_max_age)); + DPRINTF("%s -> DESIGNATED_PROPOSE\n", + bp->bp_ifp->if_xname); + } + + if (bp->bp_state != BSTP_IFSTATE_FORWARDING && + (bp->bp_forward_delay_timer.active == 0 || bp->bp_agreed || + bp->bp_operedge) && + (bp->bp_recent_root_timer.active == 0 || !bp->bp_reroot) && + !bp->bp_sync) { +#ifdef BRIDGESTP_DEBUG + if (bp->bp_agreed) + DPRINTF("%s -> AGREED\n", bp->bp_ifp->if_xname); +#endif /* BRIDGESTP_DEBUG */ + /* + * If agreed|operedge then go straight to forwarding, + * otherwise follow discard -> learn -> forward. + */ + if (bp->bp_agreed || bp->bp_operedge || + bp->bp_state == BSTP_IFSTATE_LEARNING) { + bstp_set_port_state(bp, + BSTP_IFSTATE_FORWARDING); + bp->bp_agreed = bp->bp_protover; + } else if (bp->bp_state == BSTP_IFSTATE_DISCARDING) + bstp_set_port_state(bp, BSTP_IFSTATE_LEARNING); + } + + if (((bp->bp_sync && !bp->bp_synced) || + (bp->bp_reroot && bp->bp_recent_root_timer.active) || + (bp->bp_flags & BSTP_PORT_DISPUTED)) && !bp->bp_operedge && + bp->bp_state != BSTP_IFSTATE_DISCARDING) { + bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING); + bp->bp_flags &= ~BSTP_PORT_DISPUTED; + bstp_timer_start(&bp->bp_forward_delay_timer, + bp->bp_protover == BSTP_PROTO_RSTP ? + bp->bp_desg_htime : bp->bp_desg_fdelay); + DPRINTF("%s -> DESIGNATED_DISCARD\n", + bp->bp_ifp->if_xname); + } + break; + } + + if (bp->bp_flags & BSTP_PORT_NEWINFO) + bstp_transmit(bs, bp); +} + +static void +bstp_update_tc(struct bstp_port *bp) +{ + switch (bp->bp_tcstate) { + case BSTP_TCSTATE_ACTIVE: + if ((bp->bp_role != BSTP_ROLE_DESIGNATED && + bp->bp_role != BSTP_ROLE_ROOT) || bp->bp_operedge) + bstp_set_port_tc(bp, BSTP_TCSTATE_LEARNING); + + if (bp->bp_rcvdtcn) + bstp_set_port_tc(bp, BSTP_TCSTATE_TCN); + if (bp->bp_rcvdtc) + bstp_set_port_tc(bp, BSTP_TCSTATE_TC); + + if (bp->bp_tc_prop && !bp->bp_operedge) + bstp_set_port_tc(bp, BSTP_TCSTATE_PROPAG); + + if (bp->bp_rcvdtca) + bstp_set_port_tc(bp, BSTP_TCSTATE_ACK); + break; + + case BSTP_TCSTATE_INACTIVE: + if ((bp->bp_state == BSTP_IFSTATE_LEARNING || + bp->bp_state == BSTP_IFSTATE_FORWARDING) && + bp->bp_fdbflush == 0) + bstp_set_port_tc(bp, BSTP_TCSTATE_LEARNING); + break; + + case BSTP_TCSTATE_LEARNING: + if (bp->bp_rcvdtc || bp->bp_rcvdtcn || bp->bp_rcvdtca || + bp->bp_tc_prop) + bstp_set_port_tc(bp, BSTP_TCSTATE_LEARNING); + else if (bp->bp_role != BSTP_ROLE_DESIGNATED && + bp->bp_role != BSTP_ROLE_ROOT && + bp->bp_state == BSTP_IFSTATE_DISCARDING) + bstp_set_port_tc(bp, BSTP_TCSTATE_INACTIVE); + + if ((bp->bp_role == BSTP_ROLE_DESIGNATED || + bp->bp_role == BSTP_ROLE_ROOT) && + bp->bp_state == BSTP_IFSTATE_FORWARDING && + !bp->bp_operedge) + bstp_set_port_tc(bp, BSTP_TCSTATE_DETECTED); + break; + + /* these are transient states and go straight back to ACTIVE */ + case BSTP_TCSTATE_DETECTED: + case BSTP_TCSTATE_TCN: + case BSTP_TCSTATE_TC: + case BSTP_TCSTATE_PROPAG: + case BSTP_TCSTATE_ACK: + DPRINTF("Invalid TC state for %s\n", + bp->bp_ifp->if_xname); + break; + } + +} + +static void +bstp_update_info(struct bstp_port *bp) +{ + struct bstp_state *bs = bp->bp_bs; + + bp->bp_proposing = 0; + bp->bp_proposed = 0; + + if (bp->bp_agreed && !bstp_pdu_bettersame(bp, BSTP_INFO_MINE)) + bp->bp_agreed = 0; + + if (bp->bp_synced && !bp->bp_agreed) { + bp->bp_synced = 0; + bs->bs_allsynced = 0; + } + + /* copy the designated pv to the port */ + bp->bp_port_pv = bp->bp_desg_pv; + bp->bp_port_msg_age = bp->bp_desg_msg_age; + bp->bp_port_max_age = bp->bp_desg_max_age; + bp->bp_port_fdelay = bp->bp_desg_fdelay; + 
bp->bp_port_htime = bp->bp_desg_htime; + bp->bp_infois = BSTP_INFO_MINE; + + /* Set transmit flag but do not immediately send */ + bp->bp_flags |= BSTP_PORT_NEWINFO; +} + +/* set tcprop on every port other than the caller */ +static void +bstp_set_other_tcprop(struct bstp_port *bp) +{ + struct bstp_state *bs = bp->bp_bs; + struct bstp_port *bp2; + + BSTP_LOCK_ASSERT(bs); + + LIST_FOREACH(bp2, &bs->bs_bplist, bp_next) { + if (bp2 == bp) + continue; + bp2->bp_tc_prop = 1; + } +} + +static void +bstp_set_all_reroot(struct bstp_state *bs) +{ + struct bstp_port *bp; + + BSTP_LOCK_ASSERT(bs); + + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) + bp->bp_reroot = 1; +} + +static void +bstp_set_all_sync(struct bstp_state *bs) +{ + struct bstp_port *bp; + + BSTP_LOCK_ASSERT(bs); + + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + bp->bp_sync = 1; + bp->bp_synced = 0; /* Not explicit in spec */ + } + + bs->bs_allsynced = 0; +} + +static void +bstp_set_port_state(struct bstp_port *bp, int state) +{ + if (bp->bp_state == state) + return; + + bp->bp_state = state; + + switch (bp->bp_state) { + case BSTP_IFSTATE_DISCARDING: + DPRINTF("state changed to DISCARDING on %s\n", + bp->bp_ifp->if_xname); + break; + + case BSTP_IFSTATE_LEARNING: + DPRINTF("state changed to LEARNING on %s\n", + bp->bp_ifp->if_xname); + + bstp_timer_start(&bp->bp_forward_delay_timer, + bp->bp_protover == BSTP_PROTO_RSTP ? + bp->bp_desg_htime : bp->bp_desg_fdelay); + break; + + case BSTP_IFSTATE_FORWARDING: + DPRINTF("state changed to FORWARDING on %s\n", + bp->bp_ifp->if_xname); + + bstp_timer_stop(&bp->bp_forward_delay_timer); + /* Record that we enabled forwarding */ + bp->bp_forward_transitions++; + break; + } + + /* notify the parent bridge */ + bstp_task_enqueue(&bp->bp_statetask); +} + +static void +bstp_set_port_role(struct bstp_port *bp, int role) +{ + struct bstp_state *bs = bp->bp_bs; + + if (bp->bp_role == role) + return; + + /* perform pre-change tasks */ + switch (bp->bp_role) { + case BSTP_ROLE_DISABLED: + bstp_timer_start(&bp->bp_forward_delay_timer, + bp->bp_desg_max_age); + break; + + case BSTP_ROLE_BACKUP: + bstp_timer_start(&bp->bp_recent_backup_timer, + bp->bp_desg_htime * 2); + /* fall through */ + case BSTP_ROLE_ALTERNATE: + bstp_timer_start(&bp->bp_forward_delay_timer, + bp->bp_desg_fdelay); + bp->bp_sync = 0; + bp->bp_synced = 1; + bp->bp_reroot = 0; + break; + + case BSTP_ROLE_ROOT: + bstp_timer_start(&bp->bp_recent_root_timer, + BSTP_DEFAULT_FORWARD_DELAY); + break; + } + + bp->bp_role = role; + /* clear values not carried between roles */ + bp->bp_proposing = 0; + bs->bs_allsynced = 0; + + /* initialise the new role */ + switch (bp->bp_role) { + case BSTP_ROLE_DISABLED: + case BSTP_ROLE_ALTERNATE: + case BSTP_ROLE_BACKUP: + DPRINTF("%s role -> ALT/BACK/DISABLED\n", + bp->bp_ifp->if_xname); + bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING); + bstp_timer_stop(&bp->bp_recent_root_timer); + bstp_timer_latch(&bp->bp_forward_delay_timer); + bp->bp_sync = 0; + bp->bp_synced = 1; + bp->bp_reroot = 0; + break; + + case BSTP_ROLE_ROOT: + DPRINTF("%s role -> ROOT\n", + bp->bp_ifp->if_xname); + bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING); + bstp_timer_latch(&bp->bp_recent_root_timer); + bp->bp_proposing = 0; + break; + + case BSTP_ROLE_DESIGNATED: + DPRINTF("%s role -> DESIGNATED\n", + bp->bp_ifp->if_xname); + bstp_timer_start(&bp->bp_hello_timer, + bp->bp_desg_htime); + bp->bp_agree = 0; + break; + } + + /* let the TC state know that the role changed */ + bstp_update_tc(bp); +} + +static void 
+bstp_set_port_proto(struct bstp_port *bp, int proto) +{ + struct bstp_state *bs = bp->bp_bs; + + /* supported protocol versions */ + switch (proto) { + case BSTP_PROTO_STP: + /* we can downgrade protocols only */ + bstp_timer_stop(&bp->bp_migrate_delay_timer); + /* clear unsupported features */ + bp->bp_operedge = 0; + /* STP compat mode only uses 16 bits of the 32 */ + if (bp->bp_path_cost > 65535) + bp->bp_path_cost = 65535; + break; + + case BSTP_PROTO_RSTP: + bstp_timer_start(&bp->bp_migrate_delay_timer, + bs->bs_migration_delay); + break; + + default: + DPRINTF("Unsupported STP version %d\n", proto); + return; + } + + bp->bp_protover = proto; + bp->bp_flags &= ~BSTP_PORT_CANMIGRATE; +} + +static void +bstp_set_port_tc(struct bstp_port *bp, int state) +{ + struct bstp_state *bs = bp->bp_bs; + + bp->bp_tcstate = state; + + /* initialise the new state */ + switch (bp->bp_tcstate) { + case BSTP_TCSTATE_ACTIVE: + DPRINTF("%s -> TC_ACTIVE\n", bp->bp_ifp->if_xname); + /* nothing to do */ + break; + + case BSTP_TCSTATE_INACTIVE: + bstp_timer_stop(&bp->bp_tc_timer); + /* flush routes on the parent bridge */ + bp->bp_fdbflush = 1; + bstp_task_enqueue(&bp->bp_rtagetask); + bp->bp_tc_ack = 0; + DPRINTF("%s -> TC_INACTIVE\n", bp->bp_ifp->if_xname); + break; + + case BSTP_TCSTATE_LEARNING: + bp->bp_rcvdtc = 0; + bp->bp_rcvdtcn = 0; + bp->bp_rcvdtca = 0; + bp->bp_tc_prop = 0; + DPRINTF("%s -> TC_LEARNING\n", bp->bp_ifp->if_xname); + break; + + case BSTP_TCSTATE_DETECTED: + bstp_set_timer_tc(bp); + bstp_set_other_tcprop(bp); + /* send out notification */ + bp->bp_flags |= BSTP_PORT_NEWINFO; + bstp_transmit(bs, bp); + /* reviewed for getmicrotime usage */ + getmicrotime(&bs->bs_last_tc_time); + DPRINTF("%s -> TC_DETECTED\n", bp->bp_ifp->if_xname); + bp->bp_tcstate = BSTP_TCSTATE_ACTIVE; /* UCT */ + break; + + case BSTP_TCSTATE_TCN: + bstp_set_timer_tc(bp); + DPRINTF("%s -> TC_TCN\n", bp->bp_ifp->if_xname); + /* fall through */ + case BSTP_TCSTATE_TC: + bp->bp_rcvdtc = 0; + bp->bp_rcvdtcn = 0; + if (bp->bp_role == BSTP_ROLE_DESIGNATED) + bp->bp_tc_ack = 1; + + bstp_set_other_tcprop(bp); + DPRINTF("%s -> TC_TC\n", bp->bp_ifp->if_xname); + bp->bp_tcstate = BSTP_TCSTATE_ACTIVE; /* UCT */ + break; + + case BSTP_TCSTATE_PROPAG: + /* flush routes on the parent bridge */ + bp->bp_fdbflush = 1; + bstp_task_enqueue(&bp->bp_rtagetask); + bp->bp_tc_prop = 0; + bstp_set_timer_tc(bp); + DPRINTF("%s -> TC_PROPAG\n", bp->bp_ifp->if_xname); + bp->bp_tcstate = BSTP_TCSTATE_ACTIVE; /* UCT */ + break; + + case BSTP_TCSTATE_ACK: + bstp_timer_stop(&bp->bp_tc_timer); + bp->bp_rcvdtca = 0; + DPRINTF("%s -> TC_ACK\n", bp->bp_ifp->if_xname); + bp->bp_tcstate = BSTP_TCSTATE_ACTIVE; /* UCT */ + break; + } +} + +static void +bstp_set_timer_tc(struct bstp_port *bp) +{ + struct bstp_state *bs = bp->bp_bs; + + if (bp->bp_tc_timer.active) + return; + + switch (bp->bp_protover) { + case BSTP_PROTO_RSTP: + bstp_timer_start(&bp->bp_tc_timer, + bp->bp_desg_htime + BSTP_TICK_VAL); + bp->bp_flags |= BSTP_PORT_NEWINFO; + break; + + case BSTP_PROTO_STP: + bstp_timer_start(&bp->bp_tc_timer, + bs->bs_root_max_age + bs->bs_root_fdelay); + break; + } +} + +static void +bstp_set_timer_msgage(struct bstp_port *bp) +{ + if (bp->bp_port_msg_age + BSTP_MESSAGE_AGE_INCR <= + bp->bp_port_max_age) { + bstp_timer_start(&bp->bp_message_age_timer, + bp->bp_port_htime * 3); + } else + /* expires immediately */ + bstp_timer_start(&bp->bp_message_age_timer, 0); +} + +static int +bstp_rerooted(struct bstp_state *bs, struct bstp_port *bp) +{ + struct bstp_port 
*bp2; + int rr_set = 0; + + LIST_FOREACH(bp2, &bs->bs_bplist, bp_next) { + if (bp2 == bp) + continue; + if (bp2->bp_recent_root_timer.active) { + rr_set = 1; + break; + } + } + return (!rr_set); +} + +int +bstp_set_htime(struct bstp_state *bs, int t) +{ + /* convert seconds to ticks */ + t *= BSTP_TICK_VAL; + + /* value can only be changed in leagacy stp mode */ + if (bs->bs_protover != BSTP_PROTO_STP) + return (EPERM); + + if (t < BSTP_MIN_HELLO_TIME || t > BSTP_MAX_HELLO_TIME) + return (EINVAL); + + BSTP_LOCK(bs); + bs->bs_bridge_htime = t; + bstp_reinit(bs); + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_fdelay(struct bstp_state *bs, int t) +{ + /* convert seconds to ticks */ + t *= BSTP_TICK_VAL; + + if (t < BSTP_MIN_FORWARD_DELAY || t > BSTP_MAX_FORWARD_DELAY) + return (EINVAL); + + BSTP_LOCK(bs); + bs->bs_bridge_fdelay = t; + bstp_reinit(bs); + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_maxage(struct bstp_state *bs, int t) +{ + /* convert seconds to ticks */ + t *= BSTP_TICK_VAL; + + if (t < BSTP_MIN_MAX_AGE || t > BSTP_MAX_MAX_AGE) + return (EINVAL); + + BSTP_LOCK(bs); + bs->bs_bridge_max_age = t; + bstp_reinit(bs); + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_holdcount(struct bstp_state *bs, int count) +{ + struct bstp_port *bp; + + if (count < BSTP_MIN_HOLD_COUNT || + count > BSTP_MAX_HOLD_COUNT) + return (EINVAL); + + BSTP_LOCK(bs); + bs->bs_txholdcount = count; + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) + bp->bp_txcount = 0; + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_protocol(struct bstp_state *bs, int proto) +{ + struct bstp_port *bp; + + switch (proto) { + /* Supported protocol versions */ + case BSTP_PROTO_STP: + case BSTP_PROTO_RSTP: + break; + + default: + return (EINVAL); + } + + BSTP_LOCK(bs); + bs->bs_protover = proto; + bs->bs_bridge_htime = BSTP_DEFAULT_HELLO_TIME; + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + /* reinit state */ + bp->bp_infois = BSTP_INFO_DISABLED; + bp->bp_txcount = 0; + bstp_set_port_proto(bp, bs->bs_protover); + bstp_set_port_role(bp, BSTP_ROLE_DISABLED); + bstp_set_port_tc(bp, BSTP_TCSTATE_INACTIVE); + bstp_timer_stop(&bp->bp_recent_backup_timer); + } + bstp_reinit(bs); + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_priority(struct bstp_state *bs, int pri) +{ + if (pri < 0 || pri > BSTP_MAX_PRIORITY) + return (EINVAL); + + /* Limit to steps of 4096 */ + pri -= pri % 4096; + + BSTP_LOCK(bs); + bs->bs_bridge_priority = pri; + bstp_reinit(bs); + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_port_priority(struct bstp_port *bp, int pri) +{ + struct bstp_state *bs = bp->bp_bs; + + if (pri < 0 || pri > BSTP_MAX_PORT_PRIORITY) + return (EINVAL); + + /* Limit to steps of 16 */ + pri -= pri % 16; + + BSTP_LOCK(bs); + bp->bp_priority = pri; + bstp_reinit(bs); + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_path_cost(struct bstp_port *bp, uint32_t path_cost) +{ + struct bstp_state *bs = bp->bp_bs; + + if (path_cost > BSTP_MAX_PATH_COST) + return (EINVAL); + + /* STP compat mode only uses 16 bits of the 32 */ + if (bp->bp_protover == BSTP_PROTO_STP && path_cost > 65535) + path_cost = 65535; + + BSTP_LOCK(bs); + + if (path_cost == 0) { /* use auto */ + bp->bp_flags &= ~BSTP_PORT_ADMCOST; + bp->bp_path_cost = bstp_calc_path_cost(bp); + } else { + bp->bp_path_cost = path_cost; + bp->bp_flags |= BSTP_PORT_ADMCOST; + } + bstp_reinit(bs); + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_edge(struct bstp_port *bp, int set) +{ + struct bstp_state *bs = bp->bp_bs; + + BSTP_LOCK(bs); + if ((bp->bp_operedge = set) == 
0) + bp->bp_flags &= ~BSTP_PORT_ADMEDGE; + else + bp->bp_flags |= BSTP_PORT_ADMEDGE; + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_autoedge(struct bstp_port *bp, int set) +{ + struct bstp_state *bs = bp->bp_bs; + + BSTP_LOCK(bs); + if (set) { + bp->bp_flags |= BSTP_PORT_AUTOEDGE; + /* we may be able to transition straight to edge */ + if (bp->bp_edge_delay_timer.active == 0) + bstp_edge_delay_expiry(bs, bp); + } else + bp->bp_flags &= ~BSTP_PORT_AUTOEDGE; + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_ptp(struct bstp_port *bp, int set) +{ + struct bstp_state *bs = bp->bp_bs; + + BSTP_LOCK(bs); + bp->bp_ptp_link = set; + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_set_autoptp(struct bstp_port *bp, int set) +{ + struct bstp_state *bs = bp->bp_bs; + + BSTP_LOCK(bs); + if (set) { + bp->bp_flags |= BSTP_PORT_AUTOPTP; + if (bp->bp_role != BSTP_ROLE_DISABLED) + bstp_ifupdstatus(bs, bp); + } else + bp->bp_flags &= ~BSTP_PORT_AUTOPTP; + BSTP_UNLOCK(bs); + return (0); +} + +/* + * Calculate the path cost according to the link speed. + */ +static uint32_t +bstp_calc_path_cost(struct bstp_port *bp) +{ + struct ifnet *ifp = bp->bp_ifp; + uint32_t path_cost; + + /* If the priority has been manually set then retain the value */ + if (bp->bp_flags & BSTP_PORT_ADMCOST) + return bp->bp_path_cost; + + if (bp->bp_if_link_state == LINK_STATE_DOWN) { + /* Recalc when the link comes up again */ + bp->bp_flags |= BSTP_PORT_PNDCOST; + return (BSTP_DEFAULT_PATH_COST); + } + + if (ifp->if_baudrate < 1000) + return (BSTP_DEFAULT_PATH_COST); + + /* formula from section 17.14, IEEE Std 802.1D-2004 */ + path_cost = 20000000000ULL / (ifp->if_baudrate / 1000); + + if (path_cost > BSTP_MAX_PATH_COST) + path_cost = BSTP_MAX_PATH_COST; + + /* STP compat mode only uses 16 bits of the 32 */ + if (bp->bp_protover == BSTP_PROTO_STP && path_cost > 65535) + path_cost = 65535; + + return (path_cost); +} + +/* + * Notify the bridge that a port state has changed, we need to do this from a + * taskqueue to avoid a LOR. + */ +static void +bstp_notify_state(void *arg, __unused int pending) +{ + struct bstp_port *bp = (struct bstp_port *)arg; + struct bstp_state *bs = bp->bp_bs; + + if (bp->bp_active == 1 && bs->bs_state_cb != NULL) + (*bs->bs_state_cb)(bp->bp_ifp, bp->bp_state); +} + +/* + * Flush the routes on the bridge port, we need to do this from a + * taskqueue to avoid a LOR. 
+ */ +static void +bstp_notify_rtage(void *arg, __unused int pending) +{ + struct bstp_port *bp = (struct bstp_port *)arg; + struct bstp_state *bs = bp->bp_bs; + int age = 0; + + BSTP_LOCK(bs); + switch (bp->bp_protover) { + case BSTP_PROTO_STP: + /* convert to seconds */ + age = bp->bp_desg_fdelay / BSTP_TICK_VAL; + break; + + case BSTP_PROTO_RSTP: + age = 0; + break; + } + BSTP_UNLOCK(bs); + + if (bp->bp_active == 1 && bs->bs_rtage_cb != NULL) + (*bs->bs_rtage_cb)(bp->bp_ifp, age); + + /* flush is complete */ + BSTP_LOCK(bs); + bp->bp_fdbflush = 0; + BSTP_UNLOCK(bs); +} + +void +bstp_linkstate(struct ifnet *ifp, __unused int state) +{ + struct bstp_state *bs; + struct bstp_port *bp; + + /* search for the stp port */ + lck_mtx_lock(bstp_list_mtx); + LIST_FOREACH(bs, &bstp_list, bs_list) { + BSTP_LOCK(bs); + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + if (bp->bp_ifp == ifp) { + bstp_ifupdstatus(bs, bp); + bstp_update_state(bs, bp); + /* it only exists once so return */ + BSTP_UNLOCK(bs); + lck_mtx_unlock(bstp_list_mtx); + return; + } + } + BSTP_UNLOCK(bs); + } + lck_mtx_unlock(bstp_list_mtx); +} + +static void +bstp_ifupdstatus(struct bstp_state *bs, struct bstp_port *bp) +{ + struct ifnet *ifp = bp->bp_ifp; + struct ifmediareq ifmr; + int error = 0; + + BSTP_LOCK_ASSERT(bs); + + bzero((char *)&ifmr, sizeof(ifmr)); + error = (*ifp->if_ioctl)(ifp, SIOCGIFMEDIA, (caddr_t)&ifmr); + + if ((error == 0) && (ifp->if_flags & IFF_UP)) { + if (ifmr.ifm_status & IFM_ACTIVE) { + /* A full-duplex link is assumed to be point to point */ + if (bp->bp_flags & BSTP_PORT_AUTOPTP) { + bp->bp_ptp_link = + ifmr.ifm_active & IFM_FDX ? 1 : 0; + } + + /* Calc the cost if the link was down previously */ + if (bp->bp_flags & BSTP_PORT_PNDCOST) { + bp->bp_path_cost = bstp_calc_path_cost(bp); + bp->bp_flags &= ~BSTP_PORT_PNDCOST; + } + + if (bp->bp_role == BSTP_ROLE_DISABLED) + bstp_enable_port(bs, bp); + } else { + if (bp->bp_role != BSTP_ROLE_DISABLED) { + bstp_disable_port(bs, bp); + if ((bp->bp_flags & BSTP_PORT_ADMEDGE) && + bp->bp_protover == BSTP_PROTO_RSTP) + bp->bp_operedge = 1; + } + } + return; + } + + if (bp->bp_infois != BSTP_INFO_DISABLED) + bstp_disable_port(bs, bp); +} + +static void +bstp_enable_port(struct bstp_state *bs, struct bstp_port *bp) +{ + bp->bp_infois = BSTP_INFO_AGED; + bstp_assign_roles(bs); +} + +static void +bstp_disable_port(struct bstp_state *bs, struct bstp_port *bp) +{ + bp->bp_infois = BSTP_INFO_DISABLED; + bstp_assign_roles(bs); +} + +static void +bstp_tick(void *arg) +{ + struct bstp_state *bs = arg; + struct bstp_port *bp; + struct timespec ts; + + BSTP_LOCK(bs); + + if (bs->bs_running == 0) + return; + + /* slow timer to catch missed link events */ + if (bstp_timer_expired(&bs->bs_link_timer)) { + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) + bstp_ifupdstatus(bs, bp); + bstp_timer_start(&bs->bs_link_timer, BSTP_LINK_TIMER); + } + + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + /* no events need to happen for these */ + bstp_timer_expired(&bp->bp_tc_timer); + bstp_timer_expired(&bp->bp_recent_root_timer); + bstp_timer_expired(&bp->bp_forward_delay_timer); + bstp_timer_expired(&bp->bp_recent_backup_timer); + + if (bstp_timer_expired(&bp->bp_hello_timer)) + bstp_hello_timer_expiry(bs, bp); + + if (bstp_timer_expired(&bp->bp_message_age_timer)) + bstp_message_age_expiry(bs, bp); + + if (bstp_timer_expired(&bp->bp_migrate_delay_timer)) + bstp_migrate_delay_expiry(bs, bp); + + if (bstp_timer_expired(&bp->bp_edge_delay_timer)) + bstp_edge_delay_expiry(bs, bp); + + /* update the 
various state machines for the port */ + bstp_update_state(bs, bp); + + if (bp->bp_txcount > 0) + bp->bp_txcount--; + } + + BSTP_UNLOCK(bs); + + ts.tv_sec = 1; + ts.tv_nsec = 0; + bsd_timeout(bstp_tick, bs, &ts); +} + +static void +bstp_timer_start(struct bstp_timer *t, uint16_t v) +{ + t->value = v; + t->active = 1; + t->latched = 0; +} + +static void +bstp_timer_stop(struct bstp_timer *t) +{ + t->value = 0; + t->active = 0; + t->latched = 0; +} + +static void +bstp_timer_latch(struct bstp_timer *t) +{ + t->latched = 1; + t->active = 1; +} + +static int +bstp_timer_expired(struct bstp_timer *t) +{ + if (t->active == 0 || t->latched) + return (0); + t->value -= BSTP_TICK_VAL; + if (t->value <= 0) { + bstp_timer_stop(t); + return (1); + } + return (0); +} + +static void +bstp_hello_timer_expiry(struct bstp_state *bs, struct bstp_port *bp) +{ + if ((bp->bp_flags & BSTP_PORT_NEWINFO) || + bp->bp_role == BSTP_ROLE_DESIGNATED || + (bp->bp_role == BSTP_ROLE_ROOT && + bp->bp_tc_timer.active == 1)) { + bstp_timer_start(&bp->bp_hello_timer, bp->bp_desg_htime); + bp->bp_flags |= BSTP_PORT_NEWINFO; + bstp_transmit(bs, bp); + } +} + +static void +bstp_message_age_expiry(struct bstp_state *bs, struct bstp_port *bp) +{ + if (bp->bp_infois == BSTP_INFO_RECEIVED) { + bp->bp_infois = BSTP_INFO_AGED; + bstp_assign_roles(bs); + DPRINTF("aged info on %s\n", bp->bp_ifp->if_xname); + } +} + +static void +bstp_migrate_delay_expiry(__unused struct bstp_state *bs, struct bstp_port *bp) +{ + bp->bp_flags |= BSTP_PORT_CANMIGRATE; +} + +static void +bstp_edge_delay_expiry(__unused struct bstp_state *bs, struct bstp_port *bp) +{ + if ((bp->bp_flags & BSTP_PORT_AUTOEDGE) && + bp->bp_protover == BSTP_PROTO_RSTP && bp->bp_proposing && + bp->bp_role == BSTP_ROLE_DESIGNATED) { + bp->bp_operedge = 1; + DPRINTF("%s -> edge port\n", bp->bp_ifp->if_xname); + } +} + +static int +bstp_addr_cmp(const uint8_t *a, const uint8_t *b) +{ + int i, d; + + for (i = 0, d = 0; i < ETHER_ADDR_LEN && d == 0; i++) { + d = ((int)a[i]) - ((int)b[i]); + } + + return (d); +} + +/* + * compare the bridge address component of the bridgeid + */ +static int +bstp_same_bridgeid(uint64_t id1, uint64_t id2) +{ + u_char addr1[ETHER_ADDR_LEN]; + u_char addr2[ETHER_ADDR_LEN]; + + PV2ADDR(id1, addr1); + PV2ADDR(id2, addr2); + + if (bstp_addr_cmp(addr1, addr2) == 0) + return (1); + + return (0); +} + +void +bstp_reinit(struct bstp_state *bs) +{ + struct bstp_port *bp; + struct ifnet *ifp, *mif; + u_char *e_addr; + static const u_char llzero[ETHER_ADDR_LEN]; /* 00:00:00:00:00:00 */ + + BSTP_LOCK_ASSERT(bs); + + mif = NULL; + /* + * Search through the Ethernet adapters and find the one with the + * lowest value. The adapter which we take the MAC address from does + * not need to be part of the bridge, it just needs to be a unique + * value. 
+ */ + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + if (ifp->if_type != IFT_ETHER) + continue; + + if (bstp_addr_cmp(ifnet_lladdr(ifp), llzero) == 0) + continue; + + if (mif == NULL) { + mif = ifp; + continue; + } + if (bstp_addr_cmp(ifnet_lladdr(ifp), ifnet_lladdr(mif)) < 0) { + mif = ifp; + continue; + } + } + ifnet_head_done(); + + if (LIST_EMPTY(&bs->bs_bplist) || mif == NULL) { + /* Set the bridge and root id (lower bits) to zero */ + bs->bs_bridge_pv.pv_dbridge_id = + ((uint64_t)bs->bs_bridge_priority) << 48; + bs->bs_bridge_pv.pv_root_id = bs->bs_bridge_pv.pv_dbridge_id; + bs->bs_root_pv = bs->bs_bridge_pv; + /* Disable any remaining ports, they will have no MAC address */ + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + bp->bp_infois = BSTP_INFO_DISABLED; + bstp_set_port_role(bp, BSTP_ROLE_DISABLED); + } + bsd_untimeout(bstp_tick, bs); + return; + } + + e_addr = ifnet_lladdr(mif); + bs->bs_bridge_pv.pv_dbridge_id = + (((uint64_t)bs->bs_bridge_priority) << 48) | + (((uint64_t)e_addr[0]) << 40) | + (((uint64_t)e_addr[1]) << 32) | + (((uint64_t)e_addr[2]) << 24) | + (((uint64_t)e_addr[3]) << 16) | + (((uint64_t)e_addr[4]) << 8) | + (((uint64_t)e_addr[5])); + + bs->bs_bridge_pv.pv_root_id = bs->bs_bridge_pv.pv_dbridge_id; + bs->bs_bridge_pv.pv_cost = 0; + bs->bs_bridge_pv.pv_dport_id = 0; + bs->bs_bridge_pv.pv_port_id = 0; + + if (bs->bs_running) + bsd_untimeout(bstp_tick, bs); + + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) { + bp->bp_port_id = (bp->bp_priority << 8) | + (bp->bp_ifp->if_index & 0xfff); + bstp_ifupdstatus(bs, bp); + } + + bstp_assign_roles(bs); + bstp_timer_start(&bs->bs_link_timer, BSTP_LINK_TIMER); +} + +void +bstp_attach(struct bstp_state *bs, struct bstp_cb_ops *cb) +{ + BSTP_LOCK_INIT(bs); + LIST_INIT(&bs->bs_bplist); + + bs->bs_bridge_max_age = BSTP_DEFAULT_MAX_AGE; + bs->bs_bridge_htime = BSTP_DEFAULT_HELLO_TIME; + bs->bs_bridge_fdelay = BSTP_DEFAULT_FORWARD_DELAY; + bs->bs_bridge_priority = BSTP_DEFAULT_BRIDGE_PRIORITY; + bs->bs_hold_time = BSTP_DEFAULT_HOLD_TIME; + bs->bs_migration_delay = BSTP_DEFAULT_MIGRATE_DELAY; + bs->bs_txholdcount = BSTP_DEFAULT_HOLD_COUNT; + bs->bs_protover = BSTP_PROTO_RSTP; + bs->bs_state_cb = cb->bcb_state; + bs->bs_rtage_cb = cb->bcb_rtage; + + /* reviewed for getmicrotime usage */ + getmicrotime(&bs->bs_last_tc_time); + + lck_mtx_lock(bstp_list_mtx); + LIST_INSERT_HEAD(&bstp_list, bs, bs_list); + lck_mtx_unlock(bstp_list_mtx); +} + +void +bstp_detach(struct bstp_state *bs) +{ + KASSERT(LIST_EMPTY(&bs->bs_bplist), ("bstp still active")); + + lck_mtx_lock(bstp_list_mtx); + LIST_REMOVE(bs, bs_list); + lck_mtx_unlock(bstp_list_mtx); + bsd_untimeout(bstp_tick, bs); + BSTP_LOCK_DESTROY(bs); +} + +void +bstp_init(struct bstp_state *bs) +{ + struct timespec ts; + + ts.tv_sec = 1; + ts.tv_nsec = 0; + + BSTP_LOCK(bs); + bsd_timeout(bstp_tick, bs, &ts); + bs->bs_running = 1; + bstp_reinit(bs); + BSTP_UNLOCK(bs); +} + +void +bstp_stop(struct bstp_state *bs) +{ + struct bstp_port *bp; + + BSTP_LOCK(bs); + + LIST_FOREACH(bp, &bs->bs_bplist, bp_next) + bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING); + + bs->bs_running = 0; + bsd_untimeout(bstp_tick, bs); + BSTP_UNLOCK(bs); +} + +int +bstp_create(struct bstp_state *bs, struct bstp_port *bp, struct ifnet *ifp) +{ + bzero(bp, sizeof(struct bstp_port)); + + BSTP_LOCK(bs); + bp->bp_ifp = ifp; + bp->bp_bs = bs; + bp->bp_priority = BSTP_DEFAULT_PORT_PRIORITY; + BSTP_TASK_INIT(&bp->bp_statetask, bstp_notify_state, bp); + BSTP_TASK_INIT(&bp->bp_rtagetask, bstp_notify_rtage, 
bp); + + /* Init state */ + bp->bp_infois = BSTP_INFO_DISABLED; + bp->bp_flags = BSTP_PORT_AUTOEDGE|BSTP_PORT_AUTOPTP; + bstp_set_port_state(bp, BSTP_IFSTATE_DISCARDING); + bstp_set_port_proto(bp, bs->bs_protover); + bstp_set_port_role(bp, BSTP_ROLE_DISABLED); + bstp_set_port_tc(bp, BSTP_TCSTATE_INACTIVE); + bp->bp_path_cost = bstp_calc_path_cost(bp); + BSTP_UNLOCK(bs); + return (0); +} + +int +bstp_enable(struct bstp_port *bp) +{ + struct bstp_state *bs = bp->bp_bs; + struct ifnet *ifp = bp->bp_ifp; + + KASSERT(bp->bp_active == 0, ("already a bstp member")); + + switch (ifp->if_type) { + case IFT_ETHER: /* These can do spanning tree. */ + break; + default: + /* Nothing else can. */ + return (EINVAL); + } + + BSTP_LOCK(bs); + LIST_INSERT_HEAD(&bs->bs_bplist, bp, bp_next); + bp->bp_active = 1; + bp->bp_flags |= BSTP_PORT_NEWINFO; + bstp_reinit(bs); + bstp_update_roles(bs, bp); + BSTP_UNLOCK(bs); + return (0); +} + +void +bstp_disable(struct bstp_port *bp) +{ + struct bstp_state *bs = bp->bp_bs; + + KASSERT(bp->bp_active == 1, ("not a bstp member")); + + BSTP_LOCK(bs); + bstp_disable_port(bs, bp); + LIST_REMOVE(bp, bp_next); + bp->bp_active = 0; + bstp_reinit(bs); + BSTP_UNLOCK(bs); +} + +/* + * The bstp_port structure is about to be freed by the parent bridge. + */ +void +bstp_destroy(struct bstp_port *bp) +{ + KASSERT(bp->bp_active == 0, ("port is still attached")); + bstp_task_drain(&bp->bp_statetask); + bstp_task_drain(&bp->bp_rtagetask); +} + + +__private_extern__ void +bstp_sys_init(void) +{ + lck_grp_attr_t *lck_grp_attr = NULL; + + lck_grp_attr = lck_grp_attr_alloc_init(); + bstp_lock_grp = lck_grp_alloc_init("bstp", lck_grp_attr); + bstp_lock_attr = lck_attr_alloc_init(); +#if BRIDGE_DEBUG + lck_attr_setdebug(bstp_lock_attr); +#endif + bstp_list_mtx = lck_mtx_alloc_init(bstp_lock_grp, bstp_lock_attr); + lck_grp_attr_free(lck_grp_attr); + + LIST_INIT(&bstp_list); + + bstp_create_task_thread(); +} + + + +static void +bstp_create_task_thread(void) +{ + kern_return_t error; + + lck_grp_attr_t *lck_grp_attr = NULL; + + lck_grp_attr = lck_grp_attr_alloc_init(); + bstp_task_grp = lck_grp_alloc_init("bstp_task", lck_grp_attr); + bstp_task_attr = lck_attr_alloc_init(); +#if BRIDGE_DEBUG + lck_attr_setdebug(bstp_task_attr); +#endif + bstp_task_mtx = lck_mtx_alloc_init(bstp_lock_grp, bstp_lock_attr); + lck_grp_attr_free(lck_grp_attr); + + error = kernel_thread_start((thread_continue_t)bstp_task_thread_func, NULL, &bstp_task_thread); +} + + +static void +bstp_task_thread_func(void) +{ + struct bstp_task *bt, *tvar; + + lck_mtx_lock(bstp_task_mtx); + + do { + while(TAILQ_EMPTY(&bstp_task_queue)) { + wakeup(&bstp_task_queue_running); + msleep(&bstp_task_queue, bstp_task_mtx, PZERO, "bstp_task_queue", NULL); + } + + TAILQ_FOREACH_SAFE(bt, &bstp_task_queue, bt_next, tvar) { + int count = bt->bt_count; + + bt->bt_count = 0; + + bstp_task_queue_running = bt; + lck_mtx_unlock(bstp_task_mtx); + + (*bt->bt_func)(bt->bt_context, count); + + lck_mtx_lock(bstp_task_mtx); + bstp_task_queue_running = NULL; + + if (bt->bt_count == 0) + TAILQ_REMOVE(&bstp_task_queue, bt, bt_next); + } + } while (1); + + /* UNREACHED */ +} + +static void +bstp_task_enqueue(struct bstp_task *bt) +{ + lck_mtx_lock(bstp_task_mtx); + + if (bt->bt_count) { + bt->bt_count++; + lck_mtx_unlock(bstp_task_mtx); + wakeup(&bstp_task_queue); + return; + } + + bt->bt_count = 1; + TAILQ_INSERT_TAIL(&bstp_task_queue, bt, bt_next); + + lck_mtx_unlock(bstp_task_mtx); + + wakeup(&bstp_task_queue); +} + +static void +bstp_task_drain(struct 
bstp_task *bt) +{ + lck_mtx_lock(bstp_task_mtx); + + while (bt->bt_count != 0 || bstp_task_queue_running == bt) { + wakeup(&bstp_task_queue); + msleep(&bstp_task_queue_running, bstp_task_mtx, PZERO, "bstp_task_queue", NULL); + } + lck_mtx_unlock(bstp_task_mtx); +} + + diff --git a/bsd/net/bridgestp.h b/bsd/net/bridgestp.h new file mode 100644 index 000000000..a70f7aaba --- /dev/null +++ b/bsd/net/bridgestp.h @@ -0,0 +1,441 @@ +/* $NetBSD: if_bridgevar.h,v 1.4 2003/07/08 07:13:50 itojun Exp $ */ + +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Copyright 2001 Wasabi Systems, Inc. + * All rights reserved. + * + * Written by Jason R. Thorpe for Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed for the NetBSD Project by + * Wasabi Systems, Inc. + * 4. The name of Wasabi Systems, Inc. may not be used to endorse + * or promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL WASABI SYSTEMS, INC + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1999, 2000 Jason L. Wright (jason@thought.net) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Jason L. Wright + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * OpenBSD: if_bridge.h,v 1.14 2001/03/22 03:48:29 jason Exp + * + * $FreeBSD$ + */ + +#ifndef __BRIDGESTP_H__ +#define __BRIDGESTP_H__ + +/* + * Data structure and control definitions for STP interfaces. 
+ */ + +#include <sys/queue.h> +#include <kern/locks.h> +/* STP port states */ +#define BSTP_IFSTATE_DISABLED 0 +#define BSTP_IFSTATE_LISTENING 1 +#define BSTP_IFSTATE_LEARNING 2 +#define BSTP_IFSTATE_FORWARDING 3 +#define BSTP_IFSTATE_BLOCKING 4 +#define BSTP_IFSTATE_DISCARDING 5 + +#define BSTP_TCSTATE_ACTIVE 1 +#define BSTP_TCSTATE_DETECTED 2 +#define BSTP_TCSTATE_INACTIVE 3 +#define BSTP_TCSTATE_LEARNING 4 +#define BSTP_TCSTATE_PROPAG 5 +#define BSTP_TCSTATE_ACK 6 +#define BSTP_TCSTATE_TC 7 +#define BSTP_TCSTATE_TCN 8 + +#define BSTP_ROLE_DISABLED 0 +#define BSTP_ROLE_ROOT 1 +#define BSTP_ROLE_DESIGNATED 2 +#define BSTP_ROLE_ALTERNATE 3 +#define BSTP_ROLE_BACKUP 4 + +#ifdef XNU_KERNEL_PRIVATE + +/* STP port flags */ +#define BSTP_PORT_CANMIGRATE 0x0001 +#define BSTP_PORT_NEWINFO 0x0002 +#define BSTP_PORT_DISPUTED 0x0004 +#define BSTP_PORT_ADMCOST 0x0008 +#define BSTP_PORT_AUTOEDGE 0x0010 +#define BSTP_PORT_AUTOPTP 0x0020 +#define BSTP_PORT_ADMEDGE 0x0040 +#define BSTP_PORT_PNDCOST 0x0080 + +/* BPDU priority */ +#define BSTP_PDU_SUPERIOR 1 +#define BSTP_PDU_REPEATED 2 +#define BSTP_PDU_INFERIOR 3 +#define BSTP_PDU_INFERIORALT 4 +#define BSTP_PDU_OTHER 5 + +/* BPDU flags */ +#define BSTP_PDU_PRMASK 0x0c /* Port Role */ +#define BSTP_PDU_PRSHIFT 2 /* Port Role offset */ +#define BSTP_PDU_F_UNKN 0x00 /* Unknown port (00) */ +#define BSTP_PDU_F_ALT 0x01 /* Alt/Backup port (01) */ +#define BSTP_PDU_F_ROOT 0x02 /* Root port (10) */ +#define BSTP_PDU_F_DESG 0x03 /* Designated port (11) */ + +#define BSTP_PDU_STPMASK 0x81 /* strip unused STP flags */ +#define BSTP_PDU_RSTPMASK 0x7f /* strip unused RSTP flags */ +#define BSTP_PDU_F_TC 0x01 /* Topology change */ +#define BSTP_PDU_F_P 0x02 /* Proposal flag */ +#define BSTP_PDU_F_L 0x10 /* Learning flag */ +#define BSTP_PDU_F_F 0x20 /* Forwarding flag */ +#define BSTP_PDU_F_A 0x40 /* Agreement flag */ +#define BSTP_PDU_F_TCA 0x80 /* Topology change ack */ + +/* + * Spanning tree defaults. 
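The BPDU flag masks above pack the port role into bits 2-3 of the flags octet. A minimal sketch of encoding and decoding a role with those masks (values copied from the defines above):

#include <stdint.h>
#include <stdio.h>

#define BSTP_PDU_PRMASK		0x0c	/* Port Role */
#define BSTP_PDU_PRSHIFT	2	/* Port Role offset */
#define BSTP_PDU_F_DESG		0x03	/* Designated port (11) */
#define BSTP_PDU_F_P		0x02	/* Proposal flag */

int
main(void)
{
	/* Encode: designated role in bits 2-3, plus the proposal bit. */
	uint8_t flags = (BSTP_PDU_F_DESG << BSTP_PDU_PRSHIFT) | BSTP_PDU_F_P;

	/* Decode: mask, then shift the role back down. */
	uint8_t role = (flags & BSTP_PDU_PRMASK) >> BSTP_PDU_PRSHIFT;

	printf("flags=0x%02x role=%u proposing=%d\n",
	    (unsigned)flags, (unsigned)role, (flags & BSTP_PDU_F_P) != 0);
	return (0);
}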
+ */
+#define	BSTP_DEFAULT_MAX_AGE		(20 * 256)
+#define	BSTP_DEFAULT_HELLO_TIME		(2 * 256)
+#define	BSTP_DEFAULT_FORWARD_DELAY	(15 * 256)
+#define	BSTP_DEFAULT_HOLD_TIME		(1 * 256)
+#define	BSTP_DEFAULT_MIGRATE_DELAY	(3 * 256)
+#define	BSTP_DEFAULT_HOLD_COUNT		6
+#define	BSTP_DEFAULT_BRIDGE_PRIORITY	0x8000
+#define	BSTP_DEFAULT_PORT_PRIORITY	0x80
+#define	BSTP_DEFAULT_PATH_COST		55
+#define	BSTP_MIN_HELLO_TIME		(1 * 256)
+#define	BSTP_MIN_MAX_AGE		(6 * 256)
+#define	BSTP_MIN_FORWARD_DELAY		(4 * 256)
+#define	BSTP_MIN_HOLD_COUNT		1
+#define	BSTP_MAX_HELLO_TIME		(2 * 256)
+#define	BSTP_MAX_MAX_AGE		(40 * 256)
+#define	BSTP_MAX_FORWARD_DELAY		(30 * 256)
+#define	BSTP_MAX_HOLD_COUNT		10
+#define	BSTP_MAX_PRIORITY		61440
+#define	BSTP_MAX_PORT_PRIORITY		240
+#define	BSTP_MAX_PATH_COST		200000000
+
+/* BPDU message types */
+#define	BSTP_MSGTYPE_CFG	0x00	/* Configuration */
+#define	BSTP_MSGTYPE_RSTP	0x02	/* Rapid STP */
+#define	BSTP_MSGTYPE_TCN	0x80	/* Topology chg notification */
+
+/* Protocol versions */
+#define	BSTP_PROTO_ID		0x00
+#define	BSTP_PROTO_STP		0x00
+#define	BSTP_PROTO_RSTP		0x02
+#define	BSTP_PROTO_MAX		BSTP_PROTO_RSTP
+
+#define	BSTP_INFO_RECEIVED	1
+#define	BSTP_INFO_MINE		2
+#define	BSTP_INFO_AGED		3
+#define	BSTP_INFO_DISABLED	4
+
+#define	BSTP_MESSAGE_AGE_INCR	(1 * 256)	/* in 256ths of a second */
+#define	BSTP_TICK_VAL		(1 * 256)	/* in 256ths of a second */
+#define	BSTP_LINK_TIMER		(BSTP_TICK_VAL * 15)
+
+/*
+ * Driver callbacks for STP state changes
+ */
+typedef void (*bstp_state_cb_t)(struct ifnet *, int);
+typedef void (*bstp_rtage_cb_t)(struct ifnet *, int);
+struct bstp_cb_ops {
+	bstp_state_cb_t	bcb_state;
+	bstp_rtage_cb_t	bcb_rtage;
+};
+
+/*
+ * Because BPDUs do not make nicely aligned structures, two different
+ * declarations are used: bstp_?bpdu (wire representation, packed) and
+ * bstp_*_unit (internal, nicely aligned version).
+ */
+
+/* configuration bridge protocol data unit */
+struct bstp_cbpdu {
+	uint8_t		cbu_dsap;		/* LLC: destination sap */
+	uint8_t		cbu_ssap;		/* LLC: source sap */
+	uint8_t		cbu_ctl;		/* LLC: control */
+	uint16_t	cbu_protoid;		/* protocol id */
+	uint8_t		cbu_protover;		/* protocol version */
+	uint8_t		cbu_bpdutype;		/* message type */
+	uint8_t		cbu_flags;		/* flags (below) */
+
+	/* root id */
+	uint16_t	cbu_rootpri;		/* root priority */
+	uint8_t		cbu_rootaddr[6];	/* root address */
+
+	uint32_t	cbu_rootpathcost;	/* root path cost */
+
+	/* bridge id */
+	uint16_t	cbu_bridgepri;		/* bridge priority */
+	uint8_t		cbu_bridgeaddr[6];	/* bridge address */
+
+	uint16_t	cbu_portid;		/* port id */
+	uint16_t	cbu_messageage;		/* current message age */
+	uint16_t	cbu_maxage;		/* maximum age */
+	uint16_t	cbu_hellotime;		/* hello time */
+	uint16_t	cbu_forwarddelay;	/* forwarding delay */
+	uint8_t		cbu_versionlen;		/* version 1 length */
+} __attribute__((__packed__));
+#define	BSTP_BPDU_STP_LEN	(3 + 35)	/* LLC + STP pdu */
+#define	BSTP_BPDU_RSTP_LEN	(3 + 36)	/* LLC + RSTP pdu */
+
+/* topology change notification bridge protocol data unit */
+struct bstp_tbpdu {
+	uint8_t		tbu_dsap;		/* LLC: destination sap */
+	uint8_t		tbu_ssap;		/* LLC: source sap */
+	uint8_t		tbu_ctl;		/* LLC: control */
+	uint16_t	tbu_protoid;		/* protocol id */
+	uint8_t		tbu_protover;		/* protocol version */
+	uint8_t		tbu_bpdutype;		/* message type */
+} __attribute__((__packed__));
+
+/*
+ * Timekeeping structure used in spanning tree code.
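All of the interval constants above are expressed in 1/256ths of a second, as the comments at BSTP_MESSAGE_AGE_INCR and BSTP_TICK_VAL note; BSTP_DEFAULT_HELLO_TIME, for instance, is 2 seconds. A small sketch of the conversion (BSTP_TICK_SHIFT is a hypothetical name for the 256-per-second scale):

#include <stdio.h>

#define BSTP_TICK_SHIFT	8	/* hypothetical: 256 ticks per second */

static int
secs_to_ticks(int secs)
{
	return (secs << BSTP_TICK_SHIFT);
}

static int
ticks_to_secs(int ticks)
{
	return (ticks >> BSTP_TICK_SHIFT);
}

int
main(void)
{
	/* 2 s -> 512, matching BSTP_DEFAULT_HELLO_TIME (2 * 256) */
	printf("hello time: %d ticks\n", secs_to_ticks(2));
	/* (20 * 256) -> 20 s, matching BSTP_DEFAULT_MAX_AGE */
	printf("max age: %d s\n", ticks_to_secs(20 * 256));
	return (0);
}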
+ */ + +typedef void bstp_task_func_t(void *context, int count); + +struct bstp_task { + TAILQ_ENTRY(bstp_task) bt_next; + int bt_count; + bstp_task_func_t *bt_func; + void *bt_context; +}; + +struct bstp_timer { + int active; + int latched; + int value; +}; + +struct bstp_pri_vector { + uint64_t pv_root_id; + uint32_t pv_cost; + uint64_t pv_dbridge_id; + uint16_t pv_dport_id; + uint16_t pv_port_id; +}; + +struct bstp_config_unit { + struct bstp_pri_vector cu_pv; + uint16_t cu_message_age; + uint16_t cu_max_age; + uint16_t cu_forward_delay; + uint16_t cu_hello_time; + uint8_t cu_message_type; + uint8_t cu_topology_change_ack; + uint8_t cu_topology_change; + uint8_t cu_proposal; + uint8_t cu_agree; + uint8_t cu_learning; + uint8_t cu_forwarding; + uint8_t cu_role; +}; + +struct bstp_tcn_unit { + uint8_t tu_message_type; +}; + +struct bstp_port { + LIST_ENTRY(bstp_port) bp_next; + struct ifnet *bp_ifp; /* parent if */ + struct bstp_state *bp_bs; + uint8_t bp_active; + uint8_t bp_protover; + uint32_t bp_flags; + uint32_t bp_path_cost; + uint16_t bp_port_msg_age; + uint16_t bp_port_max_age; + uint16_t bp_port_fdelay; + uint16_t bp_port_htime; + uint16_t bp_desg_msg_age; + uint16_t bp_desg_max_age; + uint16_t bp_desg_fdelay; + uint16_t bp_desg_htime; + struct bstp_timer bp_edge_delay_timer; + struct bstp_timer bp_forward_delay_timer; + struct bstp_timer bp_hello_timer; + struct bstp_timer bp_message_age_timer; + struct bstp_timer bp_migrate_delay_timer; + struct bstp_timer bp_recent_backup_timer; + struct bstp_timer bp_recent_root_timer; + struct bstp_timer bp_tc_timer; + struct bstp_config_unit bp_msg_cu; + struct bstp_pri_vector bp_desg_pv; + struct bstp_pri_vector bp_port_pv; + uint16_t bp_port_id; + uint8_t bp_state; + uint8_t bp_tcstate; + uint8_t bp_role; + uint8_t bp_infois; + uint8_t bp_tc_ack; + uint8_t bp_tc_prop; + uint8_t bp_fdbflush; + uint8_t bp_priority; + uint8_t bp_ptp_link; + uint8_t bp_agree; + uint8_t bp_agreed; + uint8_t bp_sync; + uint8_t bp_synced; + uint8_t bp_proposing; + uint8_t bp_proposed; + uint8_t bp_operedge; + uint8_t bp_reroot; + uint8_t bp_rcvdtc; + uint8_t bp_rcvdtca; + uint8_t bp_rcvdtcn; + uint32_t bp_forward_transitions; + uint8_t bp_txcount; + struct bstp_task bp_statetask; + struct bstp_task bp_rtagetask; + uint32_t bp_if_link_state; /* cache of the parent if link state */ +}; + +/* + * Values for bp_if_link_state. + */ +#define LINK_STATE_UNKNOWN 0 /* link invalid/unknown */ +#define LINK_STATE_DOWN 1 /* link is down */ +#define LINK_STATE_UP 2 /* link is up */ + +/* + * Software state for each bridge STP. 
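The bt_count field in struct bstp_task is what lets the worker thread coalesce postings: bstp_task_enqueue() in bridgestp.c above only links a task onto the queue when its count is zero, otherwise it just bumps the count. A single-threaded userland sketch of that rule (no locking; names are hypothetical):

#include <stdio.h>

struct task {
	int count;	/* pending postings; 0 means "not queued" */
};

/*
 * Mirrors the branch structure of bstp_task_enqueue(): a task that
 * is already pending is only counted, never queued a second time.
 */
static void
task_enqueue(struct task *t, int *queue_len)
{
	if (t->count) {
		t->count++;		/* coalesce with the pending run */
		return;
	}
	t->count = 1;
	(*queue_len)++;			/* first posting joins the queue */
}

int
main(void)
{
	struct task t = { 0 };
	int qlen = 0;

	task_enqueue(&t, &qlen);
	task_enqueue(&t, &qlen);
	task_enqueue(&t, &qlen);
	printf("count=%d queue length=%d\n", t.count, qlen);	/* 3 and 1 */
	return (0);
}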
+ */ +struct bstp_state { + LIST_ENTRY(bstp_state) bs_list; + uint8_t bs_running; + lck_mtx_t *bs_mtx; + struct bstp_pri_vector bs_bridge_pv; + struct bstp_pri_vector bs_root_pv; + struct bstp_port *bs_root_port; + uint8_t bs_protover; + uint16_t bs_migration_delay; + uint16_t bs_edge_delay; + uint16_t bs_bridge_max_age; + uint16_t bs_bridge_fdelay; + uint16_t bs_bridge_htime; + uint16_t bs_root_msg_age; + uint16_t bs_root_max_age; + uint16_t bs_root_fdelay; + uint16_t bs_root_htime; + uint16_t bs_hold_time; + uint16_t bs_bridge_priority; + uint8_t bs_txholdcount; + uint8_t bs_allsynced; + struct bstp_timer bs_link_timer; + struct timeval bs_last_tc_time; + LIST_HEAD(, bstp_port) bs_bplist; + bstp_state_cb_t bs_state_cb; + bstp_rtage_cb_t bs_rtage_cb; +}; + +extern const uint8_t bstp_etheraddr[]; + +void bstp_attach(struct bstp_state *, struct bstp_cb_ops *); +void bstp_detach(struct bstp_state *); +void bstp_init(struct bstp_state *); +void bstp_stop(struct bstp_state *); +int bstp_create(struct bstp_state *, struct bstp_port *, struct ifnet *); +int bstp_enable(struct bstp_port *); +void bstp_disable(struct bstp_port *); +void bstp_destroy(struct bstp_port *); +void bstp_linkstate(struct ifnet *, int); +int bstp_set_htime(struct bstp_state *, int); +int bstp_set_fdelay(struct bstp_state *, int); +int bstp_set_maxage(struct bstp_state *, int); +int bstp_set_holdcount(struct bstp_state *, int); +int bstp_set_protocol(struct bstp_state *, int); +int bstp_set_priority(struct bstp_state *, int); +int bstp_set_port_priority(struct bstp_port *, int); +int bstp_set_path_cost(struct bstp_port *, uint32_t); +int bstp_set_edge(struct bstp_port *, int); +int bstp_set_autoedge(struct bstp_port *, int); +int bstp_set_ptp(struct bstp_port *, int); +int bstp_set_autoptp(struct bstp_port *, int); +struct mbuf *bstp_input(struct bstp_port *, struct ifnet *, struct mbuf *); + +void bstp_sys_init(void); + +#endif /* XNU_KERNEL_PRIVATE */ + +#endif /* __BRIDGESTP_H__ */ + diff --git a/bsd/net/dlil.c b/bsd/net/dlil.c index 848b3b3f1..272388f02 100644 --- a/bsd/net/dlil.c +++ b/bsd/net/dlil.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999-2010 Apple Inc. All rights reserved. + * Copyright (c) 1999-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,10 +25,6 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * Data Link Inteface Layer - * Author: Ted Walker - */ /* * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce * support for mandatory and extensible security protections. 
This notice @@ -53,22 +49,41 @@ #include <net/if_arp.h> #include <sys/kern_event.h> #include <sys/kdebug.h> +#include <sys/mcache.h> #include <kern/assert.h> #include <kern/task.h> #include <kern/thread.h> #include <kern/sched_prim.h> #include <kern/locks.h> +#include <kern/zalloc.h> #include <net/kpi_protocol.h> #include <net/if_types.h> +#include <net/if_llreach.h> #include <net/kpi_interfacefilter.h> +#if INET +#include <netinet/in_var.h> +#include <netinet/igmp_var.h> +#endif /* INET */ + +#if INET6 +#include <netinet6/in6_var.h> +#include <netinet6/nd6.h> +#include <netinet6/mld6_var.h> +#endif /* INET6 */ + +#if NETAT +#include <netat/at_var.h> +#endif /* NETAT */ + #include <libkern/OSAtomic.h> #include <machine/machine_routines.h> #include <mach/thread_act.h> +#include <mach/sdt.h> #if CONFIG_MACF_NET #include <security/mac_framework.h> @@ -78,8 +93,8 @@ #include <net/pfvar.h> #endif /* PF */ -#define DBG_LAYER_BEG DLILDBG_CODE(DBG_DLIL_STATIC, 0) -#define DBG_LAYER_END DLILDBG_CODE(DBG_DLIL_STATIC, 2) +#define DBG_LAYER_BEG DLILDBG_CODE(DBG_DLIL_STATIC, 0) +#define DBG_LAYER_END DLILDBG_CODE(DBG_DLIL_STATIC, 2) #define DBG_FNC_DLIL_INPUT DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8)) #define DBG_FNC_DLIL_OUTPUT DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8)) #define DBG_FNC_DLIL_IFOUT DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8)) @@ -95,48 +110,51 @@ #define DLIL_PRINTF kprintf #endif -#define atomic_add_32(a, n) \ - ((void) OSAddAtomic(n, (volatile SInt32 *)a)) - -#if PKT_PRIORITY #define _CASSERT(x) \ switch (0) { case 0: case (x): ; } -#define IF_DATA_REQUIRE_ALIGNED_32(f) \ - _CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int32_t))) +#define IF_DATA_REQUIRE_ALIGNED_64(f) \ + _CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t))) -#define IFNET_IF_DATA_REQUIRE_ALIGNED_32(f) \ - _CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int32_t))) -#endif /* PKT_PRIORITY */ +#define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f) \ + _CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t))) + +#define IFNET_IF_TC_REQUIRE_ALIGNED_64(f) \ + _CASSERT(!(offsetof(struct ifnet, if_tc.f) % sizeof (u_int64_t))) enum { kProtoKPI_v1 = 1, kProtoKPI_v2 = 2 }; +/* + * List of if_proto structures in if_proto_hash[] is protected by + * the ifnet lock. The rest of the fields are initialized at protocol + * attach time and never change, thus no lock required as long as + * a reference to it is valid, via if_proto_ref(). 
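The locking comment above captures a common kernel pattern: fields that are immutable after attach need no lock, only a reference count to keep the object alive. A minimal sketch of the take-a-reference, drop-and-maybe-free discipline using C11 atomics (the kernel code uses atomic_add_32; the names here are hypothetical):

#include <stdatomic.h>
#include <stdio.h>

struct proto {
	atomic_uint refcount;
	int family;		/* immutable after attach */
};

static void
proto_ref(struct proto *p)
{
	atomic_fetch_add(&p->refcount, 1);
}

/*
 * Returns 1 when the last reference was just dropped and the object
 * may be freed, which is the decision if_proto_free() makes.
 */
static int
proto_rele(struct proto *p)
{
	return (atomic_fetch_sub(&p->refcount, 1) == 1);
}

int
main(void)
{
	struct proto p = { 1, 2 };	/* one ref held by the hash table */

	proto_ref(&p);				/* a lookup takes a ref */
	printf("last=%d\n", proto_rele(&p));	/* 0: table still holds one */
	printf("last=%d\n", proto_rele(&p));	/* 1: detach drops the last */
	return (0);
}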
+ */ struct if_proto { - SLIST_ENTRY(if_proto) next_hash; - int refcount; - int detaching; - struct ifnet *ifp; - struct domain *dl_domain; + SLIST_ENTRY(if_proto) next_hash; + u_int32_t refcount; + u_int32_t detached; + struct ifnet *ifp; protocol_family_t protocol_family; - int proto_kpi; + int proto_kpi; union { struct { - proto_media_input input; - proto_media_preout pre_output; - proto_media_event event; - proto_media_ioctl ioctl; + proto_media_input input; + proto_media_preout pre_output; + proto_media_event event; + proto_media_ioctl ioctl; proto_media_detached detached; proto_media_resolve_multi resolve_multi; proto_media_send_arp send_arp; } v1; struct { proto_media_input_v2 input; - proto_media_preout pre_output; - proto_media_event event; - proto_media_ioctl ioctl; + proto_media_preout pre_output; + proto_media_event event; + proto_media_ioctl ioctl; proto_media_detached detached; proto_media_resolve_multi resolve_multi; proto_media_send_arp send_arp; @@ -146,51 +164,118 @@ struct if_proto { SLIST_HEAD(proto_hash_entry, if_proto); +#define DLIL_SDLMAXLEN 64 +#define DLIL_SDLDATALEN \ + (DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0])) struct dlil_ifnet { - /* ifnet and drvr_ext are used by the stack and drivers - drvr_ext extends the public ifnet and must follow dl_if */ - struct ifnet dl_if; /* public ifnet */ - - /* dlil private fields */ - TAILQ_ENTRY(dlil_ifnet) dl_if_link; /* dlil_ifnet are link together */ - /* it is not the ifnet list */ - void *if_uniqueid; /* unique id identifying the interface */ - size_t if_uniqueid_len;/* length of the unique id */ - char if_namestorage[IFNAMSIZ]; /* interface name storage */ + struct ifnet dl_if; /* public ifnet */ + /* + * dlil private fields, protected by dl_if_lock + */ + decl_lck_mtx_data(, dl_if_lock); + TAILQ_ENTRY(dlil_ifnet) dl_if_link; /* dlil_ifnet link */ + u_int32_t dl_if_flags; /* flags (below) */ + u_int32_t dl_if_refcnt; /* refcnt */ + void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */ + void *dl_if_uniqueid; /* unique interface id */ + size_t dl_if_uniqueid_len; /* length of the unique id */ + char dl_if_namestorage[IFNAMSIZ]; /* interface name storage */ + struct { + struct ifaddr ifa; /* lladdr ifa */ + u_int8_t asdl[DLIL_SDLMAXLEN]; /* addr storage */ + u_int8_t msdl[DLIL_SDLMAXLEN]; /* mask storage */ + } dl_if_lladdr; + ctrace_t dl_if_attach; /* attach PC stacktrace */ + ctrace_t dl_if_detach; /* detach PC stacktrace */ +}; + +/* Values for dl_if_flags (private to DLIL) */ +#define DLIF_INUSE 0x1 /* DLIL ifnet recycler, ifnet in use */ +#define DLIF_REUSE 0x2 /* DLIL ifnet recycles, ifnet is not new */ +#define DLIF_DEBUG 0x4 /* has debugging info */ + +#define IF_REF_TRACE_HIST_SIZE 8 /* size of ref trace history */ + +/* For gdb */ +__private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE; + +struct dlil_ifnet_dbg { + struct dlil_ifnet dldbg_dlif; /* dlil_ifnet */ + u_int16_t dldbg_if_refhold_cnt; /* # ifnet references */ + u_int16_t dldbg_if_refrele_cnt; /* # ifnet releases */ + /* + * Circular lists of ifnet_{reference,release} callers. 
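The refhold/refrele arrays in struct dlil_ifnet_dbg are fixed-size circular histories: each event advances a counter whose low bits select the slot, so the newest records overwrite the oldest. A sketch of that bookkeeping, with a plain integer standing in for ctrace_t (assumed semantics; names are hypothetical):

#include <stdio.h>

#define IF_REF_TRACE_HIST_SIZE	8	/* size of ref trace history */

struct ref_dbg {
	unsigned short hold_cnt;	/* total holds, keeps growing */
	unsigned long hold[IF_REF_TRACE_HIST_SIZE]; /* stand-in for ctrace_t */
};

static void
record_hold(struct ref_dbg *dbg, unsigned long caller)
{
	/*
	 * Low bits of the running counter pick the slot, so the
	 * newest entries overwrite the oldest.
	 */
	unsigned int slot = dbg->hold_cnt++ % IF_REF_TRACE_HIST_SIZE;

	dbg->hold[slot] = caller;
}

int
main(void)
{
	struct ref_dbg dbg = { 0, { 0 } };
	unsigned long pc;

	for (pc = 1; pc <= 10; pc++)	/* ten holds into eight slots */
		record_hold(&dbg, pc);
	/* slots 0 and 1 now hold events 9 and 10 */
	printf("count=%u slot0=%lu slot1=%lu\n",
	    (unsigned)dbg.hold_cnt, dbg.hold[0], dbg.hold[1]);
	return (0);
}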
+ */ + ctrace_t dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE]; + ctrace_t dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE]; }; +#define DLIL_TO_IFP(s) (&s->dl_if) +#define IFP_TO_DLIL(s) ((struct dlil_ifnet *)s) + struct ifnet_filter { TAILQ_ENTRY(ifnet_filter) filt_next; - ifnet_t filt_ifp; - int filt_detaching; - - const char *filt_name; - void *filt_cookie; - protocol_family_t filt_protocol; - iff_input_func filt_input; - iff_output_func filt_output; - iff_event_func filt_event; - iff_ioctl_func filt_ioctl; - iff_detached_func filt_detached; + u_int32_t filt_skip; + ifnet_t filt_ifp; + const char *filt_name; + void *filt_cookie; + protocol_family_t filt_protocol; + iff_input_func filt_input; + iff_output_func filt_output; + iff_event_func filt_event; + iff_ioctl_func filt_ioctl; + iff_detached_func filt_detached; }; struct proto_input_entry; static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head; static lck_grp_t *dlil_lock_group; -static lck_grp_t *ifnet_lock_group; +lck_grp_t *ifnet_lock_group; static lck_grp_t *ifnet_head_lock_group; -static lck_attr_t *ifnet_lock_attr; -static lck_rw_t *ifnet_head_mutex; -static lck_mtx_t *dlil_ifnet_mutex; -static lck_mtx_t *dlil_mutex; -static u_int32_t dlil_read_count = 0; -static u_int32_t dlil_detach_waiting = 0; +lck_attr_t *ifnet_lock_attr; +decl_lck_rw_data(, ifnet_head_lock); +decl_lck_mtx_data(, dlil_ifnet_lock); u_int32_t dlil_filter_count = 0; extern u_int32_t ipv4_ll_arp_aware; -#if IFNET_ROUTE_REFCNT +#if DEBUG +static unsigned int ifnet_debug = 1; /* debugging (enabled) */ +#else +static unsigned int ifnet_debug; /* debugging (disabled) */ +#endif /* !DEBUG */ +static unsigned int dlif_size; /* size of dlil_ifnet to allocate */ +static unsigned int dlif_bufsize; /* size of dlif_size + headroom */ +static struct zone *dlif_zone; /* zone for dlil_ifnet */ + +#define DLIF_ZONE_MAX 64 /* maximum elements in zone */ +#define DLIF_ZONE_NAME "ifnet" /* zone name */ + +static unsigned int dlif_filt_size; /* size of ifnet_filter */ +static struct zone *dlif_filt_zone; /* zone for ifnet_filter */ + +#define DLIF_FILT_ZONE_MAX 8 /* maximum elements in zone */ +#define DLIF_FILT_ZONE_NAME "ifnet_filter" /* zone name */ + +static unsigned int dlif_inp_size; /* size of dlil_threading_info */ +static struct zone *dlif_inp_zone; /* zone for dlil_threading_info */ + +#define DLIF_INP_ZONE_MAX DLIF_ZONE_MAX /* maximum elements in zone */ +#define DLIF_INP_ZONE_NAME "ifnet_thread" /* zone name */ + +static unsigned int dlif_phash_size; /* size of ifnet proto hash table */ +static struct zone *dlif_phash_zone; /* zone for ifnet proto hash table */ + +#define DLIF_PHASH_ZONE_MAX DLIF_ZONE_MAX /* maximum elements in zone */ +#define DLIF_PHASH_ZONE_NAME "ifnet_proto_hash" /* zone name */ + +static unsigned int dlif_proto_size; /* size of if_proto */ +static struct zone *dlif_proto_zone; /* zone for if_proto */ + +#define DLIF_PROTO_ZONE_MAX (DLIF_ZONE_MAX*2) /* maximum elements in zone */ +#define DLIF_PROTO_ZONE_NAME "ifnet_proto" /* zone name */ + /* * Updating this variable should be done by first acquiring the global * radix node head (rnh_lock), in tandem with settting/clearing the @@ -198,7 +283,6 @@ extern u_int32_t ipv4_ll_arp_aware; */ u_int32_t ifnet_aggressive_drainers; static u_int32_t net_rtref; -#endif /* IFNET_ROUTE_REFCNT */ static struct dlil_threading_info dlil_lo_thread; __private_extern__ struct dlil_threading_info *dlil_lo_thread_ptr = &dlil_lo_thread; @@ -206,135 +290,117 @@ __private_extern__ struct dlil_threading_info *dlil_lo_thread_ptr = &dlil_lo_th 
static struct mbuf *dlil_lo_input_mbuf_head = NULL; static struct mbuf *dlil_lo_input_mbuf_tail = NULL; -#if IFNET_INPUT_SANITY_CHK -static int dlil_lo_input_mbuf_count = 0; -int dlil_input_sanity_check = 0; /* sanity checking of input packet lists received */ -#endif -int dlil_multithreaded_input = 1; -static int cur_dlil_input_threads = 0; - static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg); static int dlil_detach_filter_internal(interface_filter_t filter, int detached); -static void dlil_call_delayed_detach_thread(void); +static void dlil_if_trace(struct dlil_ifnet *, int); +static void if_proto_ref(struct if_proto *); +static void if_proto_free(struct if_proto *); +static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t); +static int dlil_ifp_proto_count(struct ifnet *); +static void if_flt_monitor_busy(struct ifnet *); +static void if_flt_monitor_unbusy(struct ifnet *); +static void if_flt_monitor_enter(struct ifnet *); +static void if_flt_monitor_leave(struct ifnet *); +static int dlil_interface_filters_input(struct ifnet *, struct mbuf **, + char **, protocol_family_t); +static int dlil_interface_filters_output(struct ifnet *, struct mbuf **, + protocol_family_t); +static struct ifaddr *dlil_alloc_lladdr(struct ifnet *, + const struct sockaddr_dl *); +static int ifnet_lookup(struct ifnet *); +static void if_purgeaddrs(struct ifnet *); + +static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t, + struct mbuf *, char *); +static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t, + struct mbuf *); +static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t, + mbuf_t *, const struct sockaddr *, void *, char *, char *); +static void ifproto_media_event(struct ifnet *, protocol_family_t, + const struct kev_msg *); +static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t, + unsigned long, void *); +static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *, + struct sockaddr_dl *, size_t); +static errno_t ifproto_media_send_arp(struct ifnet *, u_short, + const struct sockaddr_dl *, const struct sockaddr *, + const struct sockaddr_dl *, const struct sockaddr *); + +static errno_t ifp_if_output(struct ifnet *, struct mbuf *); +static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *, + protocol_family_t *); +static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t, + const struct ifnet_demux_desc *, u_int32_t); +static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t); +static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *); +static errno_t ifp_if_framer(struct ifnet *, struct mbuf **, + const struct sockaddr *, const char *, const char *); +static errno_t ifp_if_ioctl(struct ifnet *, unsigned long, void *); +static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func); +static void ifp_if_free(struct ifnet *); +static void ifp_if_event(struct ifnet *, const struct kev_msg *); + +static void dlil_input_thread_func(struct dlil_threading_info *inpthread); +static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *); + +static void ifnet_delayed_thread_func(void); +static void ifnet_detach_final(struct ifnet *); +static void ifnet_detaching_enqueue(struct ifnet *); +static struct ifnet *ifnet_detaching_dequeue(void); + +static void ifp_src_route_copyout(struct ifnet *, struct route *); +static void ifp_src_route_copyin(struct ifnet *, struct route *); +#if INET6 +static void ifp_src_route6_copyout(struct 
ifnet *, struct route_in6 *); +static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *); +#endif /* INET6 */ + +/* The following are protected by dlil_ifnet_lock */ +static TAILQ_HEAD(, ifnet) ifnet_detaching_head; +static u_int32_t ifnet_detaching_cnt; +static void *ifnet_delayed_run; /* wait channel for detaching thread */ + +extern void bpfdetach(struct ifnet*); +extern void proto_input_run(void); -static void dlil_read_begin(void); -static __inline__ void dlil_read_end(void); -static int dlil_write_begin(void); -static void dlil_write_end(void); +__private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *); #if DEBUG -__private_extern__ int dlil_verbose = 1; +static int dlil_verbose = 1; #else -__private_extern__ int dlil_verbose = 0; +static int dlil_verbose = 0; #endif /* DEBUG */ +static int dlil_multithreaded_input = 1; +static int cur_dlil_input_threads = 0; +#if IFNET_INPUT_SANITY_CHK +static int dlil_lo_input_mbuf_count = 0; +/* sanity checking of input packet lists received */ +static int dlil_input_sanity_check = 0; +#endif -unsigned int net_affinity = 1; -static kern_return_t dlil_affinity_set(struct thread *, u_int32_t); +SYSCTL_DECL(_net_link_generic_system); -extern void bpfdetach(struct ifnet*); -extern void proto_input_run(void); // new run_netisr +SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose, CTLFLAG_RW, + &dlil_verbose, 0, "Log DLIL error messages"); -void dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m); -static void dlil_input_thread_func(struct dlil_threading_info *inpthread); -__private_extern__ int dlil_create_input_thread( - ifnet_t, struct dlil_threading_info *); -__private_extern__ void dlil_terminate_input_thread( - struct dlil_threading_info *); +SYSCTL_INT(_net_link_generic_system, OID_AUTO, multi_threaded_input, CTLFLAG_RW, + &dlil_multithreaded_input , 0, "Uses multiple input thread for DLIL input"); -__private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *); +#if IFNET_INPUT_SANITY_CHK +SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check, + CTLFLAG_RW, &dlil_input_sanity_check , 0, + "Turn on sanity checking in DLIL input"); +#endif -int dlil_expand_mcl; +unsigned int net_affinity = 1; +static kern_return_t dlil_affinity_set(struct thread *, u_int32_t); extern u_int32_t inject_buckets; -static const u_int32_t dlil_writer_waiting = 0x80000000; static lck_grp_attr_t *dlil_grp_attributes = NULL; static lck_attr_t *dlil_lck_attributes = NULL; static lck_grp_t *dlil_input_lock_grp = NULL; -static inline void* -_cast_non_const(const void * ptr) { - union { - const void* cval; - void* val; - } ret; - - ret.cval = ptr; - return (ret.val); -} - -/* Should these be inline? 
*/ -static void -dlil_read_begin(void) -{ - u_int32_t new_value; - u_int32_t old_value; - struct uthread *uth = get_bsdthread_info(current_thread()); - - if (uth->dlil_incremented_read == dlil_writer_waiting) - panic("dlil_read_begin - thread is already a writer"); - - do { -again: - old_value = dlil_read_count; - - if ((old_value & dlil_writer_waiting) != 0 && uth->dlil_incremented_read == 0) - { - tsleep(&dlil_read_count, PRIBIO, "dlil_read_count", 1); - goto again; - } - - new_value = old_value + 1; - } while (!OSCompareAndSwap((UInt32)old_value, (UInt32)new_value, (UInt32*)&dlil_read_count)); - - uth->dlil_incremented_read++; -} - -static void -dlil_read_end(void) -{ - struct uthread *uth = get_bsdthread_info(current_thread()); - - OSDecrementAtomic(&dlil_read_count); - uth->dlil_incremented_read--; - if (dlil_read_count == dlil_writer_waiting) - wakeup(_cast_non_const(&dlil_writer_waiting)); -} - -static int -dlil_write_begin(void) -{ - struct uthread *uth = get_bsdthread_info(current_thread()); - - if (uth->dlil_incremented_read != 0) { - return EDEADLK; - } - lck_mtx_lock(dlil_mutex); - OSBitOrAtomic((UInt32)dlil_writer_waiting, &dlil_read_count); -again: - if (dlil_read_count == dlil_writer_waiting) { - uth->dlil_incremented_read = dlil_writer_waiting; - return 0; - } - else { - tsleep(_cast_non_const(&dlil_writer_waiting), PRIBIO, "dlil_writer_waiting", 1); - goto again; - } -} - -static void -dlil_write_end(void) -{ - struct uthread *uth = get_bsdthread_info(current_thread()); - - if (uth->dlil_incremented_read != dlil_writer_waiting) - panic("dlil_write_end - thread is not a writer"); - OSBitAndAtomic((UInt32)~dlil_writer_waiting, &dlil_read_count); - lck_mtx_unlock(dlil_mutex); - uth->dlil_incremented_read = 0; - wakeup(&dlil_read_count); -} - #define PROTO_HASH_SLOTS 0x5 /* @@ -351,192 +417,248 @@ proto_hash_value(u_int32_t protocol_family) */ switch(protocol_family) { case PF_INET: - return 0; + return (0); case PF_INET6: - return 1; + return (1); case PF_APPLETALK: - return 2; + return (2); case PF_VLAN: - return 3; + return (3); + case PF_UNSPEC: default: - return 4; + return (4); } } -static struct if_proto* +/* + * Caller must already be holding ifnet lock. 
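proto_hash_value() above folds the handful of interesting protocol families into five buckets, and find_attached_proto() below walks one bucket's singly-linked list for an exact match. A compact userland sketch of the same two-step lookup (PF_INET and PF_INET6 use their Darwin values; the table contents are hypothetical):

#include <stdio.h>

#define PF_INET			2	/* Darwin value */
#define PF_INET6		30	/* Darwin value */
#define PROTO_HASH_SLOTS	5

struct proto {
	struct proto *next_hash;
	unsigned int family;
};

static int
proto_hash(unsigned int family)
{
	switch (family) {
	case PF_INET:
		return (0);
	case PF_INET6:
		return (1);
	default:
		return (4);	/* everything else shares one bucket */
	}
}

static struct proto *
proto_find(struct proto *hash[], unsigned int family)
{
	struct proto *p = hash[proto_hash(family)];

	/* same walk as find_attached_proto() */
	while (p != NULL && p->family != family)
		p = p->next_hash;
	return (p);
}

int
main(void)
{
	struct proto inet = { NULL, PF_INET };
	struct proto *hash[PROTO_HASH_SLOTS] = { &inet };

	printf("inet found=%d\n", proto_find(hash, PF_INET) != NULL);
	printf("inet6 found=%d\n", proto_find(hash, PF_INET6) != NULL);
	return (0);
}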
+ */ +static struct if_proto * find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family) { struct if_proto *proto = NULL; u_int32_t i = proto_hash_value(protocol_family); - if (ifp->if_proto_hash) { + + ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED); + + if (ifp->if_proto_hash != NULL) proto = SLIST_FIRST(&ifp->if_proto_hash[i]); - } - - while(proto && proto->protocol_family != protocol_family) { + + while (proto != NULL && proto->protocol_family != protocol_family) proto = SLIST_NEXT(proto, next_hash); - } - - return proto; + + if (proto != NULL) + if_proto_ref(proto); + + return (proto); } static void if_proto_ref(struct if_proto *proto) { - OSAddAtomic(1, &proto->refcount); + atomic_add_32(&proto->refcount, 1); } +extern void if_rtproto_del(struct ifnet *ifp, int protocol); + static void if_proto_free(struct if_proto *proto) { - int oldval = OSAddAtomic(-1, &proto->refcount); - - if (oldval == 1) { /* This was the last reference */ - FREE(proto, M_IFADDR); + u_int32_t oldval; + struct ifnet *ifp = proto->ifp; + u_int32_t proto_family = proto->protocol_family; + struct kev_dl_proto_data ev_pr_data; + + oldval = atomic_add_32_ov(&proto->refcount, -1); + if (oldval > 1) + return; + + /* No more reference on this, protocol must have been detached */ + VERIFY(proto->detached); + + if (proto->proto_kpi == kProtoKPI_v1) { + if (proto->kpi.v1.detached) + proto->kpi.v1.detached(ifp, proto->protocol_family); + } + if (proto->proto_kpi == kProtoKPI_v2) { + if (proto->kpi.v2.detached) + proto->kpi.v2.detached(ifp, proto->protocol_family); } + + /* + * Cleanup routes that may still be in the routing table for that + * interface/protocol pair. + */ + if_rtproto_del(ifp, proto_family); + + /* + * The reserved field carries the number of protocol still attached + * (subject to change) + */ + ifnet_lock_shared(ifp); + ev_pr_data.proto_family = proto_family; + ev_pr_data.proto_remaining_count = dlil_ifp_proto_count(ifp); + ifnet_lock_done(ifp); + + dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED, + (struct net_event_data *)&ev_pr_data, + sizeof(struct kev_dl_proto_data)); + + zfree(dlif_proto_zone, proto); } __private_extern__ void -ifnet_lock_assert( - __unused struct ifnet *ifp, - __unused int what) +ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what) { -#if IFNET_RW_LOCK - /* - * Not implemented for rw locks. - * - * Function exists so when/if we use mutex we can - * enable this check. 
- */ -#else - lck_mtx_assert(ifp->if_lock, what); -#endif + unsigned int type = 0; + int ass = 1; + + switch (what) { + case IFNET_LCK_ASSERT_EXCLUSIVE: + type = LCK_RW_ASSERT_EXCLUSIVE; + break; + + case IFNET_LCK_ASSERT_SHARED: + type = LCK_RW_ASSERT_SHARED; + break; + + case IFNET_LCK_ASSERT_OWNED: + type = LCK_RW_ASSERT_HELD; + break; + + case IFNET_LCK_ASSERT_NOTOWNED: + /* nothing to do here for RW lock; bypass assert */ + ass = 0; + break; + + default: + panic("bad ifnet assert type: %d", what); + /* NOTREACHED */ + } + if (ass) + lck_rw_assert(&ifp->if_lock, type); } __private_extern__ void -ifnet_lock_shared( - struct ifnet *ifp) +ifnet_lock_shared(struct ifnet *ifp) { -#if IFNET_RW_LOCK - lck_rw_lock_shared(ifp->if_lock); -#else - lck_mtx_assert(ifp->if_lock, LCK_MTX_ASSERT_NOTOWNED); - lck_mtx_lock(ifp->if_lock); -#endif + lck_rw_lock_shared(&ifp->if_lock); } __private_extern__ void -ifnet_lock_exclusive( - struct ifnet *ifp) +ifnet_lock_exclusive(struct ifnet *ifp) { -#if IFNET_RW_LOCK - lck_rw_lock_exclusive(ifp->if_lock); -#else - lck_mtx_assert(ifp->if_lock, LCK_MTX_ASSERT_NOTOWNED); - lck_mtx_lock(ifp->if_lock); -#endif + lck_rw_lock_exclusive(&ifp->if_lock); } __private_extern__ void -ifnet_lock_done( - struct ifnet *ifp) +ifnet_lock_done(struct ifnet *ifp) { -#if IFNET_RW_LOCK - lck_rw_done(ifp->if_lock); -#else - lck_mtx_assert(ifp->if_lock, LCK_MTX_ASSERT_OWNED); - lck_mtx_unlock(ifp->if_lock); -#endif + lck_rw_done(&ifp->if_lock); } __private_extern__ void ifnet_head_lock_shared(void) { - lck_rw_lock_shared(ifnet_head_mutex); + lck_rw_lock_shared(&ifnet_head_lock); } __private_extern__ void ifnet_head_lock_exclusive(void) { - lck_rw_lock_exclusive(ifnet_head_mutex); + lck_rw_lock_exclusive(&ifnet_head_lock); } __private_extern__ void ifnet_head_done(void) { - lck_rw_done(ifnet_head_mutex); + lck_rw_done(&ifnet_head_lock); } -static int dlil_ifp_proto_count(struct ifnet * ifp) +/* + * Caller must already be holding ifnet lock. 
+ */ +static int +dlil_ifp_proto_count(struct ifnet * ifp) { - int count = 0; - int i; - - if (ifp->if_proto_hash != NULL) { - for (i = 0; i < PROTO_HASH_SLOTS; i++) { - struct if_proto *proto; - SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) { - count++; - } + int i, count = 0; + + ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED); + + if (ifp->if_proto_hash == NULL) + goto done; + + for (i = 0; i < PROTO_HASH_SLOTS; i++) { + struct if_proto *proto; + SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) { + count++; } } - - return count; +done: + return (count); } __private_extern__ void -dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass, u_int32_t event_code, - struct net_event_data *event_data, u_int32_t event_data_len) +dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass, + u_int32_t event_code, struct net_event_data *event_data, + u_int32_t event_data_len) { - struct net_event_data ev_data; - struct kev_msg ev_msg; - - /* + struct net_event_data ev_data; + struct kev_msg ev_msg; + + bzero(&ev_msg, sizeof (ev_msg)); + bzero(&ev_data, sizeof (ev_data)); + /* * a net event always starts with a net_event_data structure * but the caller can generate a simple net event or * provide a longer event structure to post */ - - ev_msg.vendor_code = KEV_VENDOR_APPLE; - ev_msg.kev_class = KEV_NETWORK_CLASS; - ev_msg.kev_subclass = event_subclass; - ev_msg.event_code = event_code; - - if (event_data == 0) { + ev_msg.vendor_code = KEV_VENDOR_APPLE; + ev_msg.kev_class = KEV_NETWORK_CLASS; + ev_msg.kev_subclass = event_subclass; + ev_msg.event_code = event_code; + + if (event_data == NULL) { event_data = &ev_data; event_data_len = sizeof(struct net_event_data); } - + strncpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ); event_data->if_family = ifp->if_family; event_data->if_unit = (u_int32_t) ifp->if_unit; - + ev_msg.dv[0].data_length = event_data_len; - ev_msg.dv[0].data_ptr = event_data; + ev_msg.dv[0].data_ptr = event_data; ev_msg.dv[1].data_length = 0; - + dlil_event_internal(ifp, &ev_msg); } -__private_extern__ int -dlil_create_input_thread( - ifnet_t ifp, struct dlil_threading_info *inputthread) +static int +dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inputthread) { int error; bzero(inputthread, sizeof(*inputthread)); - // loopback ifp may not be configured at dlil_init time. - if (ifp == lo_ifp) - strlcat(inputthread->input_name, "dlil_input_main_thread_mtx", 32); - else - snprintf(inputthread->input_name, 32, "dlil_input_%s%d_mtx", ifp->if_name, ifp->if_unit); + /* loopback ifp may not be configured at dlil_init time. 
*/ + if (ifp == lo_ifp) { + (void) strlcat(inputthread->input_name, + "dlil_input_main_thread_mtx", DLIL_THREADNAME_LEN); + } else { + (void) snprintf(inputthread->input_name, DLIL_THREADNAME_LEN, + "dlil_input_%s%d_mtx", ifp->if_name, ifp->if_unit); + } - inputthread->lck_grp = lck_grp_alloc_init(inputthread->input_name, dlil_grp_attributes); - inputthread->input_lck = lck_mtx_alloc_init(inputthread->lck_grp, dlil_lck_attributes); + inputthread->lck_grp = lck_grp_alloc_init(inputthread->input_name, + dlil_grp_attributes); + lck_mtx_init(&inputthread->input_lck, inputthread->lck_grp, + dlil_lck_attributes); - error= kernel_thread_start((thread_continue_t)dlil_input_thread_func, inputthread, &inputthread->input_thread); + error= kernel_thread_start((thread_continue_t)dlil_input_thread_func, + inputthread, &inputthread->input_thread); if (error == 0) { - ml_thread_policy(inputthread->input_thread, MACHINE_GROUP, - (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_NETISR)); + ml_thread_policy(inputthread->input_thread, MACHINE_GROUP, + (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_NETISR)); /* * Except for the loopback dlil input thread, we create * an affinity set so that the matching workloop thread @@ -557,31 +679,16 @@ dlil_create_input_thread( } } } else { - panic("dlil_create_input_thread: couldn't create thread\n"); + panic("%s: couldn't create thread", __func__); + /* NOTREACHED */ } OSAddAtomic(1, &cur_dlil_input_threads); #if DLIL_DEBUG - printf("dlil_create_input_thread: threadinfo: %p input_thread=%p threads: cur=%d max=%d\n", - inputthread, inputthread->input_thread, dlil_multithreaded_input, cur_dlil_input_threads); + printf("%s: threadinfo: %p input_thread=%p threads: cur=%d max=%d\n", + __func__, inputthread, inputthread->input_thread, + dlil_multithreaded_input, cur_dlil_input_threads); #endif - return error; -} -__private_extern__ void -dlil_terminate_input_thread( - struct dlil_threading_info *inputthread) -{ - OSAddAtomic(-1, &cur_dlil_input_threads); - - lck_mtx_unlock(inputthread->input_lck); - lck_mtx_free(inputthread->input_lck, inputthread->lck_grp); - lck_grp_free(inputthread->lck_grp); - - FREE(inputthread, M_NKE); - - /* For the extra reference count from kernel_thread_start() */ - thread_deallocate(current_thread()); - - thread_terminate(current_thread()); + return (error); } static kern_return_t @@ -598,65 +705,246 @@ dlil_affinity_set(struct thread *tp, u_int32_t tag) void dlil_init(void) { - thread_t thread = THREAD_NULL; + thread_t thread = THREAD_NULL; + + /* + * The following fields must be 64-bit aligned for atomic operations. 
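The alignment checks that follow all rest on the _CASSERT macro defined earlier in this file: a false (zero) condition makes the `case 0' and `case (x)' labels collide, so a violation is a compile-time error rather than a runtime check. A self-contained sketch of the trick:

#include <stddef.h>
#include <stdint.h>

/*
 * Duplicate case labels are a constraint violation, so if x is false
 * (zero), `case 0' and `case (x)' collide and the compile fails.
 */
#define _CASSERT(x) \
	switch (0) { case 0: case (x): ; }

struct sample {
	uint64_t counter;	/* must stay 64-bit aligned */
	uint32_t flags;
};

int
main(void)
{
	_CASSERT(!(offsetof(struct sample, counter) % sizeof (uint64_t)));
	/* _CASSERT(0); would abort the build: duplicate case value 0 */
	return (0);
}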
+	 */
+	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
+	IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
+	IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
+	IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
+	IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
+	IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
+	IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
+	IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
+	IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
+	IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
+	IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
+
+	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
+	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
+	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
+	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
+	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
+	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
+	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
+	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
+	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
+	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
+	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
+
+	IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ibkpackets);
+	IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ibkbytes);
+	IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_obkpackets);
+	IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_obkbytes);
+	IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ivipackets);
+	IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ivibytes);
+	IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ovipackets);
+	IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ovibytes);
+	IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ivopackets);
+	IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ivobytes);
+	IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ovopackets);
+	IFNET_IF_TC_REQUIRE_ALIGNED_64(ifi_ovobytes);
+
+	/*
+	 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
+	 */
+	_CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
+	_CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
+	_CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
+	_CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
+	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
+	_CASSERT(IF_HWASSIST_CSUM_TCP_SUM16 == IFNET_CSUM_SUM16);
+	_CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
+	_CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
+	_CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
+	_CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);
+
+	/*
+	 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
+	 */
+	_CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
+
+	PE_parse_boot_argn("net_affinity", &net_affinity,
+	    sizeof (net_affinity));
-	PE_parse_boot_argn("net_affinity", &net_affinity, sizeof (net_affinity));
-#if IFNET_ROUTE_REFCNT
 	PE_parse_boot_argn("net_rtref", &net_rtref, sizeof (net_rtref));
-#endif /* IFNET_ROUTE_REFCNT */
+
+	PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof (ifnet_debug));
+
+	dlif_size = (ifnet_debug == 0) ?
sizeof (struct dlil_ifnet) : + sizeof (struct dlil_ifnet_dbg); + /* Enforce 64-bit alignment for dlil_ifnet structure */ + dlif_bufsize = dlif_size + sizeof (void *) + sizeof (u_int64_t); + dlif_bufsize = P2ROUNDUP(dlif_bufsize, sizeof (u_int64_t)); + dlif_zone = zinit(dlif_bufsize, DLIF_ZONE_MAX * dlif_bufsize, + 0, DLIF_ZONE_NAME); + if (dlif_zone == NULL) { + panic("%s: failed allocating %s", __func__, DLIF_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(dlif_zone, Z_EXPAND, TRUE); + zone_change(dlif_zone, Z_CALLERACCT, FALSE); + + dlif_filt_size = sizeof (struct ifnet_filter); + dlif_filt_zone = zinit(dlif_filt_size, + DLIF_FILT_ZONE_MAX * dlif_filt_size, 0, DLIF_FILT_ZONE_NAME); + if (dlif_filt_zone == NULL) { + panic("%s: failed allocating %s", __func__, + DLIF_FILT_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(dlif_filt_zone, Z_EXPAND, TRUE); + zone_change(dlif_filt_zone, Z_CALLERACCT, FALSE); + + dlif_inp_size = sizeof (struct dlil_threading_info); + dlif_inp_zone = zinit(dlif_inp_size, + DLIF_INP_ZONE_MAX * dlif_inp_size, 0, DLIF_INP_ZONE_NAME); + if (dlif_inp_zone == NULL) { + panic("%s: failed allocating %s", __func__, DLIF_INP_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(dlif_inp_zone, Z_EXPAND, TRUE); + zone_change(dlif_inp_zone, Z_CALLERACCT, FALSE); + + dlif_phash_size = sizeof (struct proto_hash_entry) * PROTO_HASH_SLOTS; + dlif_phash_zone = zinit(dlif_phash_size, + DLIF_PHASH_ZONE_MAX * dlif_phash_size, 0, DLIF_PHASH_ZONE_NAME); + if (dlif_phash_zone == NULL) { + panic("%s: failed allocating %s", __func__, + DLIF_PHASH_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(dlif_phash_zone, Z_EXPAND, TRUE); + zone_change(dlif_phash_zone, Z_CALLERACCT, FALSE); + + dlif_proto_size = sizeof (struct if_proto); + dlif_proto_zone = zinit(dlif_proto_size, + DLIF_PROTO_ZONE_MAX * dlif_proto_size, 0, DLIF_PROTO_ZONE_NAME); + if (dlif_proto_zone == NULL) { + panic("%s: failed allocating %s", __func__, + DLIF_PROTO_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(dlif_proto_zone, Z_EXPAND, TRUE); + zone_change(dlif_proto_zone, Z_CALLERACCT, FALSE); + + ifnet_llreach_init(); TAILQ_INIT(&dlil_ifnet_head); TAILQ_INIT(&ifnet_head); - + TAILQ_INIT(&ifnet_detaching_head); + /* Setup the lock groups we will use */ dlil_grp_attributes = lck_grp_attr_alloc_init(); - dlil_lock_group = lck_grp_alloc_init("dlil internal locks", dlil_grp_attributes); - ifnet_lock_group = lck_grp_alloc_init("ifnet locks", dlil_grp_attributes); - ifnet_head_lock_group = lck_grp_alloc_init("ifnet head lock", dlil_grp_attributes); - dlil_input_lock_grp = lck_grp_alloc_init("dlil input lock", dlil_grp_attributes); - + dlil_lock_group = lck_grp_alloc_init("dlil internal locks", + dlil_grp_attributes); + ifnet_lock_group = lck_grp_alloc_init("ifnet locks", + dlil_grp_attributes); + ifnet_head_lock_group = lck_grp_alloc_init("ifnet head lock", + dlil_grp_attributes); + dlil_input_lock_grp = lck_grp_alloc_init("dlil input lock", + dlil_grp_attributes); + /* Setup the lock attributes we will use */ dlil_lck_attributes = lck_attr_alloc_init(); - + ifnet_lock_attr = lck_attr_alloc_init(); - - - ifnet_head_mutex = lck_rw_alloc_init(ifnet_head_lock_group, dlil_lck_attributes); - dlil_ifnet_mutex = lck_mtx_alloc_init(dlil_lock_group, dlil_lck_attributes); - dlil_mutex = lck_mtx_alloc_init(dlil_lock_group, dlil_lck_attributes); - + + lck_rw_init(&ifnet_head_lock, ifnet_head_lock_group, + dlil_lck_attributes); + lck_mtx_init(&dlil_ifnet_lock, dlil_lock_group, dlil_lck_attributes); + lck_attr_free(dlil_lck_attributes); 
dlil_lck_attributes = NULL; - + + ifa_init(); + /* - * Create and start up the first dlil input thread once everything is initialized + * Create and start up the first dlil input thread once everything + * is initialized. */ - dlil_create_input_thread(0, dlil_lo_thread_ptr); + dlil_create_input_thread(lo_ifp, dlil_lo_thread_ptr); - (void) kernel_thread_start((thread_continue_t)dlil_call_delayed_detach_thread, NULL, &thread); + if (kernel_thread_start((thread_continue_t)ifnet_delayed_thread_func, + NULL, &thread) != 0) { + panic("%s: couldn't create detach thread", __func__); + /* NOTREACHED */ + } thread_deallocate(thread); + #if PF /* Initialize the packet filter */ pfinit(); #endif /* PF */ } +static void +if_flt_monitor_busy(struct ifnet *ifp) +{ + lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED); + + ++ifp->if_flt_busy; + VERIFY(ifp->if_flt_busy != 0); +} + +static void +if_flt_monitor_unbusy(struct ifnet *ifp) +{ + if_flt_monitor_leave(ifp); +} + +static void +if_flt_monitor_enter(struct ifnet *ifp) +{ + lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED); + + while (ifp->if_flt_busy) { + ++ifp->if_flt_waiters; + (void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock, + (PZERO - 1), "if_flt_monitor", NULL); + } + if_flt_monitor_busy(ifp); +} + +static void +if_flt_monitor_leave(struct ifnet *ifp) +{ + lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED); + + VERIFY(ifp->if_flt_busy != 0); + --ifp->if_flt_busy; + + if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) { + ifp->if_flt_waiters = 0; + wakeup(&ifp->if_flt_head); + } +} + __private_extern__ int -dlil_attach_filter( - struct ifnet *ifp, - const struct iff_filter *if_filter, - interface_filter_t *filter_ref) -{ - int retval = 0; - struct ifnet_filter *filter; - - MALLOC(filter, struct ifnet_filter *, sizeof(*filter), M_NKE, M_WAITOK); - if (filter == NULL) - return ENOMEM; - bzero(filter, sizeof(*filter)); +dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter, + interface_filter_t *filter_ref) +{ + int retval = 0; + struct ifnet_filter *filter = NULL; - + ifnet_head_lock_shared(); + /* Check that the interface is in the global list */ + if (!ifnet_lookup(ifp)) { + retval = ENXIO; + goto done; + } + + filter = zalloc(dlif_filt_zone); + if (filter == NULL) { + retval = ENOMEM; + goto done; + } + bzero(filter, dlif_filt_size); + + /* refcnt held above during lookup */ filter->filt_ifp = ifp; filter->filt_cookie = if_filter->iff_cookie; filter->filt_name = if_filter->iff_name; @@ -666,14 +954,16 @@ dlil_attach_filter( filter->filt_event = if_filter->iff_event; filter->filt_ioctl = if_filter->iff_ioctl; filter->filt_detached = if_filter->iff_detached; - - if ((retval = dlil_write_begin()) != 0) { - /* Failed to acquire the write lock */ - FREE(filter, M_NKE); - return retval; - } + + lck_mtx_lock(&ifp->if_flt_lock); + if_flt_monitor_enter(ifp); + + lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED); TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next); - dlil_write_end(); + + if_flt_monitor_leave(ifp); + lck_mtx_unlock(&ifp->if_flt_lock); + *filter_ref = filter; /* @@ -684,71 +974,88 @@ dlil_attach_filter( if (use_routegenid) routegenid_update(); - return retval; + if (dlil_verbose) { + printf("%s%d: %s filter attached\n", ifp->if_name, + ifp->if_unit, if_filter->iff_name); + } +done: + ifnet_head_done(); + if (retval != 0 && ifp != NULL) { + DLIL_PRINTF("%s%d: failed to attach %s (err=%d)\n", + ifp->if_name, ifp->if_unit, if_filter->iff_name, retval); + } + if (retval != 0 && filter != NULL) 
+ zfree(dlif_filt_zone, filter); + + return (retval); } static int -dlil_detach_filter_internal( - interface_filter_t filter, - int detached) +dlil_detach_filter_internal(interface_filter_t filter, int detached) { int retval = 0; - + if (detached == 0) { - ifnet_t ifp = NULL; - interface_filter_t entry = NULL; - - /* Take the write lock */ - retval = dlil_write_begin(); - if (retval != 0 && retval != EDEADLK) - return retval; - - /* - * At this point either we have the write lock (retval == 0) - * or we couldn't get it (retval == EDEADLK) because someone - * else up the stack is holding the read lock. It is safe to - * read, either the read or write is held. Verify the filter - * parameter before proceeding. - */ + ifnet_t ifp = NULL; + ifnet_head_lock_shared(); TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + interface_filter_t entry = NULL; + + lck_mtx_lock(&ifp->if_flt_lock); TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) { - if (entry == filter) - break; + if (entry != filter || entry->filt_skip) + continue; + /* + * We've found a match; since it's possible + * that the thread gets blocked in the monitor, + * we do the lock dance. Interface should + * not be detached since we still have a use + * count held during filter attach. + */ + entry->filt_skip = 1; /* skip input/output */ + lck_mtx_unlock(&ifp->if_flt_lock); + ifnet_head_done(); + + lck_mtx_lock(&ifp->if_flt_lock); + if_flt_monitor_enter(ifp); + lck_mtx_assert(&ifp->if_flt_lock, + LCK_MTX_ASSERT_OWNED); + + /* Remove the filter from the list */ + TAILQ_REMOVE(&ifp->if_flt_head, filter, + filt_next); + + if_flt_monitor_leave(ifp); + lck_mtx_unlock(&ifp->if_flt_lock); + if (dlil_verbose) { + printf("%s%d: %s filter detached\n", + ifp->if_name, ifp->if_unit, + filter->filt_name); + } + goto destroy; } - if (entry == filter) - break; + lck_mtx_unlock(&ifp->if_flt_lock); } ifnet_head_done(); - - if (entry != filter) { - /* filter parameter is not a valid filter ref */ - if (retval == 0) { - dlil_write_end(); - } - return EINVAL; - } - - if (retval == EDEADLK) { - /* Perform a delayed detach */ - filter->filt_detaching = 1; - dlil_detach_waiting = 1; - wakeup(&dlil_detach_waiting); - return 0; - } - - /* Remove the filter from the list */ - TAILQ_REMOVE(&ifp->if_flt_head, filter, filt_next); - dlil_write_end(); + + /* filter parameter is not a valid filter ref */ + retval = EINVAL; + goto done; } - - /* Call the detached funciton if there is one */ + + if (dlil_verbose) + printf("%s filter detached\n", filter->filt_name); + +destroy: + + /* Call the detached function if there is one */ if (filter->filt_detached) filter->filt_detached(filter->filt_cookie, filter->filt_ifp); /* Free the filter */ - FREE(filter, M_NKE); - + zfree(dlif_filt_zone, filter); + /* * Decrease filter count and route_generation ID to let TCP * know it should reevalute doing TSO or not @@ -757,7 +1064,12 @@ dlil_detach_filter_internal( if (use_routegenid) routegenid_update(); - return retval; +done: + if (retval != 0) { + DLIL_PRINTF("failed to detach %s filter (err=%d)\n", + filter->filt_name, retval); + } + return (retval); } __private_extern__ void @@ -769,8 +1081,7 @@ dlil_detach_filter(interface_filter_t filter) } static void -dlil_input_thread_func( - struct dlil_threading_info *inputthread) +dlil_input_thread_func(struct dlil_threading_info *inputthread) { while (1) { struct mbuf *m = NULL, *m_loop = NULL; @@ -779,28 +1090,44 @@ dlil_input_thread_func( int count; struct mbuf *m1; #endif /* IFNET_INPUT_SANITY_CHK */ - - 
lck_mtx_lock(inputthread->input_lck); - + + lck_mtx_lock_spin(&inputthread->input_lck); + /* Wait until there is work to be done */ - while ((inputthread->input_waiting & ~DLIL_INPUT_RUNNING) == 0) { + while (!(inputthread->input_waiting & ~DLIL_INPUT_RUNNING)) { inputthread->input_waiting &= ~DLIL_INPUT_RUNNING; - msleep(&inputthread->input_waiting, inputthread->input_lck, 0, inputthread->input_name, 0); + msleep(&inputthread->input_waiting, + &inputthread->input_lck, 0, + inputthread->input_name, 0); } - - lck_mtx_assert(inputthread->input_lck, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&inputthread->input_lck, LCK_MTX_ASSERT_OWNED); m = inputthread->mbuf_head; inputthread->mbuf_head = NULL; inputthread->mbuf_tail = NULL; if (inputthread->input_waiting & DLIL_INPUT_TERMINATE) { - if (m) - mbuf_freem_list(m); - /* this is the end */ - dlil_terminate_input_thread(inputthread); - return; + lck_mtx_unlock(&inputthread->input_lck); + + if (m != NULL) + mbuf_freem_list(m); + + OSAddAtomic(-1, &cur_dlil_input_threads); + + lck_mtx_destroy(&inputthread->input_lck, + inputthread->lck_grp); + lck_grp_free(inputthread->lck_grp); + + zfree(dlif_inp_zone, inputthread); + + /* for the extra refcnt from kernel_thread_start() */ + thread_deallocate(current_thread()); + + /* this is the end */ + thread_terminate(current_thread()); + /* NOTREACHED */ + return; } inputthread->input_waiting |= DLIL_INPUT_RUNNING; @@ -820,69 +1147,76 @@ dlil_input_thread_func( loop_cnt = dlil_lo_input_mbuf_count; dlil_lo_input_mbuf_count = 0; } - - lck_mtx_unlock(inputthread->input_lck); - + + lck_mtx_unlock(&inputthread->input_lck); + for (m1 = m, count = 0; m1; m1 = mbuf_nextpkt(m1)) { count++; } if (count != mbuf_cnt) { - panic("dlil_input_func - thread=%p reg. loop queue has %d packets, should have %d\n", - inputthread, count, mbuf_cnt); + panic("%s - thread=%p reg. loop queue " + "has %d packets, should have %d\n", + __func__, inputthread, count, mbuf_cnt); + /* NOTREACHED */ } - + if (inputthread == dlil_lo_thread_ptr) { - for (m1 = m_loop, count = 0; m1; m1 = mbuf_nextpkt(m1)) { + for (m1 = m_loop, count = 0; m1; + m1 = mbuf_nextpkt(m1)) { count++; } if (count != loop_cnt) { - panic("dlil_input_func - thread=%p loop queue has %d packets, should have %d\n", - inputthread, count, loop_cnt); + panic("%s - thread=%p loop queue " + "has %d packets, should have %d\n", + __func__, inputthread, count, + loop_cnt); + /* NOTREACHED */ } } - } else + } else #endif /* IFNET_INPUT_SANITY_CHK */ { - lck_mtx_unlock(inputthread->input_lck); + lck_mtx_unlock(&inputthread->input_lck); } /* * NOTE warning %%% attention !!!! - * We should think about putting some thread starvation safeguards if - * we deal with long chains of packets. + * We should think about putting some thread starvation + * safeguards if we deal with long chains of packets. 
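On DLIL_INPUT_TERMINATE, the thread above frees any undelivered chain, destroys its own lock, returns its zone allocation, drops the extra reference taken by kernel_thread_start(), and terminates itself. A hedged user-space analogue of a thread that owns and reclaims its own context on the way out (names invented; a detached pthread stands in for the thread_deallocate()/thread_terminate() pair):

#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>

struct thr_ctx {
	pthread_mutex_t	lock;
	int		terminate;
};

static void *
self_cleaning_thread(void *arg)
{
	struct thr_ctx *ctx = arg;
	int done = 0;

	while (!done) {
		pthread_mutex_lock(&ctx->lock);
		done = ctx->terminate;
		pthread_mutex_unlock(&ctx->lock);
		if (!done)
			usleep(1000);	/* polling keeps the sketch short */
	}
	/*
	 * The thread owns its context: destroy the lock and free the
	 * memory ourselves.  Nothing may touch ctx past this point,
	 * which is why the thread runs detached and is never joined.
	 */
	pthread_mutex_destroy(&ctx->lock);
	free(ctx);
	return (NULL);
}

int
main(void)
{
	struct thr_ctx *ctx = calloc(1, sizeof (*ctx));
	pthread_attr_t attr;
	pthread_t t;

	pthread_mutex_init(&ctx->lock, NULL);
	pthread_attr_init(&attr);
	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
	pthread_create(&t, &attr, self_cleaning_thread, ctx);

	pthread_mutex_lock(&ctx->lock);
	ctx->terminate = 1;		/* ask the thread to go away */
	pthread_mutex_unlock(&ctx->lock);

	sleep(1);			/* give it time to free itself */
	return (0);
}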
*/ if (m_loop) { - if (inputthread == dlil_lo_thread_ptr) + if (inputthread == dlil_lo_thread_ptr) { dlil_input_packet_list(lo_ifp, m_loop); + } #if IFNET_INPUT_SANITY_CHK - else - panic("dlil_input_func - thread=%p loop queue has %d packets, should have none!\n", - inputthread, loop_cnt); + else { + panic("%s - thread=%p loop queue has %d " + "packets, should have none!\n", __func__, + inputthread, loop_cnt); + /* NOTREACHED */ + } #endif /* IFNET_INPUT_SANITY_CHK */ } - - if (m) + if (m != NULL) dlil_input_packet_list(0, m); + lck_mtx_lock_spin(&inputthread->input_lck); - lck_mtx_lock(inputthread->input_lck); - - if ((inputthread->input_waiting & (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)) != 0) { - lck_mtx_unlock(inputthread->input_lck); + if (inputthread->input_waiting & + (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)) { + lck_mtx_unlock(&inputthread->input_lck); proto_input_run(); - } - else - lck_mtx_unlock(inputthread->input_lck); + } else { + lck_mtx_unlock(&inputthread->input_lck); + } } } errno_t -ifnet_input( - ifnet_t ifp, - mbuf_t m_head, - const struct ifnet_stat_increment_param *stats) +ifnet_input(ifnet_t ifp, mbuf_t m_head, + const struct ifnet_stat_increment_param *stats) { struct thread *tp = current_thread(); mbuf_t m_tail; @@ -892,9 +1226,9 @@ ifnet_input( #endif /* IFNET_INPUT_SANITY_CHK */ if (ifp == NULL || m_head == NULL) { - if (m_head) + if (m_head != NULL) mbuf_freem_list(m_head); - return EINVAL; + return (EINVAL); } m_tail = m_head; @@ -902,14 +1236,16 @@ ifnet_input( #if IFNET_INPUT_SANITY_CHK if (dlil_input_sanity_check != 0) { ifnet_t rcvif; - + rcvif = mbuf_pkthdr_rcvif(m_tail); pkt_count++; - + if (rcvif == NULL || - (ifp->if_type != IFT_LOOP && rcvif != ifp) || - (mbuf_flags(m_head) & MBUF_PKTHDR) == 0) { - panic("ifnet_input - invalid mbuf %p\n", m_tail); + (ifp->if_type != IFT_LOOP && rcvif != ifp) || + !(mbuf_flags(m_head) & MBUF_PKTHDR)) { + panic("%s - invalid mbuf %p\n", + __func__, m_tail); + /* NOTREACHED */ } } #endif /* IFNET_INPUT_SANITY_CHK */ @@ -920,7 +1256,7 @@ ifnet_input( inp = ifp->if_input_thread; - if (dlil_multithreaded_input == 0 || inp == NULL) + if (dlil_multithreaded_input == 0 || inp == NULL) inp = dlil_lo_thread_ptr; /* @@ -928,11 +1264,11 @@ ifnet_input( * affinity set, associate this workloop thread with the same set. * We will only do this once. */ - lck_mtx_lock(inp->input_lck); + lck_mtx_lock_spin(&inp->input_lck); if (inp->net_affinity && inp->workloop_thread == NULL) { u_int32_t tag = inp->tag; inp->workloop_thread = tp; - lck_mtx_unlock(inp->input_lck); + lck_mtx_unlock(&inp->input_lck); /* Associated the current thread with the new affinity tag */ (void) dlil_affinity_set(tp, tag); @@ -943,7 +1279,7 @@ ifnet_input( * its affinity. 
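The affinity step in ifnet_input below tags the driver's workloop thread with the input thread's affinity set so the two run on nearby CPUs. dlil_affinity_set() is kernel-internal, but the same policy is reachable from user space through the Mach thread affinity API; this sketch assumes OS X, an arbitrary tag value of 1, and a platform that honors the policy:

#include <mach/mach.h>
#include <mach/thread_policy.h>
#include <stdio.h>

/*
 * Tag the calling thread; threads sharing a non-zero tag are
 * scheduled to share a cache where the hardware allows it.
 */
static kern_return_t
set_affinity_tag(integer_t tag)
{
	struct thread_affinity_policy policy = { tag };

	return (thread_policy_set(mach_thread_self(),
	    THREAD_AFFINITY_POLICY, (thread_policy_t)&policy,
	    THREAD_AFFINITY_POLICY_COUNT));
}

int
main(void)
{
	if (set_affinity_tag(1) != KERN_SUCCESS)
		fprintf(stderr, "affinity tag not honored here\n");
	return (0);
}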
*/ thread_reference(tp); - lck_mtx_lock(inp->input_lck); + lck_mtx_lock_spin(&inp->input_lck); } /* WARNING @@ -964,11 +1300,10 @@ ifnet_input( inp->input_mbuf_cnt += pkt_count; inp->input_wake_cnt++; - lck_mtx_assert(inp->input_lck, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&inp->input_lck, LCK_MTX_ASSERT_OWNED); } #endif - } - else { + } else { if (inp->mbuf_head == NULL) inp->mbuf_head = m_head; else if (inp->mbuf_tail != NULL) @@ -980,58 +1315,71 @@ ifnet_input( inp->input_mbuf_cnt += pkt_count; inp->input_wake_cnt++; - lck_mtx_assert(inp->input_lck, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&inp->input_lck, LCK_MTX_ASSERT_OWNED); } #endif } - inp->input_waiting |= DLIL_INPUT_WAITING; if ((inp->input_waiting & DLIL_INPUT_RUNNING) == 0) { wakeup((caddr_t)&inp->input_waiting); } + lck_mtx_unlock(&inp->input_lck); + if (stats) { - ifp->if_data.ifi_ipackets += stats->packets_in; - ifp->if_data.ifi_ibytes += stats->bytes_in; - ifp->if_data.ifi_ierrors += stats->errors_in; - - ifp->if_data.ifi_opackets += stats->packets_out; - ifp->if_data.ifi_obytes += stats->bytes_out; - ifp->if_data.ifi_oerrors += stats->errors_out; - - ifp->if_data.ifi_collisions += stats->collisions; - ifp->if_data.ifi_iqdrops += stats->dropped; + atomic_add_64(&ifp->if_data.ifi_ipackets, stats->packets_in); + atomic_add_64(&ifp->if_data.ifi_ibytes, stats->bytes_in); + atomic_add_64(&ifp->if_data.ifi_ierrors, stats->errors_in); + + atomic_add_64(&ifp->if_data.ifi_opackets, stats->packets_out); + atomic_add_64(&ifp->if_data.ifi_obytes, stats->bytes_out); + atomic_add_64(&ifp->if_data.ifi_oerrors, stats->errors_out); + + atomic_add_64(&ifp->if_data.ifi_collisions, stats->collisions); + atomic_add_64(&ifp->if_data.ifi_iqdrops, stats->dropped); } - lck_mtx_unlock(inp->input_lck); - - return 0; + return (0); } static int -dlil_interface_filters_input(struct ifnet * ifp, struct mbuf * * m_p, - char * * frame_header_p, - protocol_family_t protocol_family) +dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p, + char **frame_header_p, protocol_family_t protocol_family) { - struct ifnet_filter * filter; + struct ifnet_filter *filter; + /* + * Pass the inbound packet to the interface filters + */ + lck_mtx_lock_spin(&ifp->if_flt_lock); + /* prevent filter list from changing in case we drop the lock */ + if_flt_monitor_busy(ifp); TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) { int result; - if (filter->filt_input - && (filter->filt_protocol == 0 - || filter->filt_protocol == protocol_family)) { + if (!filter->filt_skip && filter->filt_input != NULL && + (filter->filt_protocol == 0 || + filter->filt_protocol == protocol_family)) { + lck_mtx_unlock(&ifp->if_flt_lock); + result = (*filter->filt_input)(filter->filt_cookie, - ifp, protocol_family, - m_p, frame_header_p); + ifp, protocol_family, m_p, frame_header_p); + + lck_mtx_lock_spin(&ifp->if_flt_lock); if (result != 0) { + /* we're done with the filter list */ + if_flt_monitor_unbusy(ifp); + lck_mtx_unlock(&ifp->if_flt_lock); return (result); } } } + /* we're done with the filter list */ + if_flt_monitor_unbusy(ifp); + lck_mtx_unlock(&ifp->if_flt_lock); /* - * Strip away M_PROTO1 bit prior to sending packet up the stack as + * Strip away M_PROTO1 bit prior to sending packet up the stack as * it is meant to be local to a subsystem -- if_bridge for M_PROTO1 */ if (*m_p != NULL) @@ -1040,6 +1388,45 @@ dlil_interface_filters_input(struct ifnet * ifp, struct mbuf * * m_p, return (0); } +static int +dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p, + 
protocol_family_t protocol_family) +{ + struct ifnet_filter *filter; + + /* + * Pass the outbound packet to the interface filters + */ + lck_mtx_lock_spin(&ifp->if_flt_lock); + /* prevent filter list from changing in case we drop the lock */ + if_flt_monitor_busy(ifp); + TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) { + int result; + + if (!filter->filt_skip && filter->filt_output != NULL && + (filter->filt_protocol == 0 || + filter->filt_protocol == protocol_family)) { + lck_mtx_unlock(&ifp->if_flt_lock); + + result = filter->filt_output(filter->filt_cookie, ifp, + protocol_family, m_p); + + lck_mtx_lock_spin(&ifp->if_flt_lock); + if (result != 0) { + /* we're done with the filter list */ + if_flt_monitor_unbusy(ifp); + lck_mtx_unlock(&ifp->if_flt_lock); + return (result); + } + } + } + /* we're done with the filter list */ + if_flt_monitor_unbusy(ifp); + lck_mtx_unlock(&ifp->if_flt_lock); + + return (0); +} + static void dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m) { @@ -1050,24 +1437,21 @@ dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m) while (m != NULL) { char * frame_header; mbuf_t next_packet; - + next_packet = m->m_nextpkt; m->m_nextpkt = NULL; frame_header = m->m_pkthdr.header; m->m_pkthdr.header = NULL; - error = (*ifproto->kpi.v1.input)(ifproto->ifp, - ifproto->protocol_family, - m, frame_header); + error = (*ifproto->kpi.v1.input)(ifproto->ifp, + ifproto->protocol_family, m, frame_header); if (error != 0 && error != EJUSTRETURN) m_freem(m); m = next_packet; } - } - else if (ifproto->proto_kpi == kProtoKPI_v2) { + } else if (ifproto->proto_kpi == kProtoKPI_v2) { /* Version 2 protocols support packet lists */ error = (*ifproto->kpi.v2.input)(ifproto->ifp, - ifproto->protocol_family, - m); + ifproto->protocol_family, m); if (error != 0 && error != EJUSTRETURN) m_freem_list(m); } @@ -1078,7 +1462,6 @@ __private_extern__ void dlil_input_packet_list(struct ifnet * ifp_param, struct mbuf *m) { int error = 0; - int locked = 0; protocol_family_t protocol_family; mbuf_t next_packet; ifnet_t ifp = ifp_param; @@ -1089,66 +1472,71 @@ dlil_input_packet_list(struct ifnet * ifp_param, struct mbuf *m) KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START,0,0,0,0,0); + while (m != NULL) { - struct if_proto * ifproto = NULL; + struct if_proto *ifproto = NULL; + int iorefcnt = 0; - next_packet = m->m_nextpkt; - m->m_nextpkt = NULL; if (ifp_param == NULL) ifp = m->m_pkthdr.rcvif; + + /* Check if this mbuf looks valid */ + MBUF_INPUT_CHECK(m, ifp); + + next_packet = m->m_nextpkt; + m->m_nextpkt = NULL; frame_header = m->m_pkthdr.header; m->m_pkthdr.header = NULL; - if (locked == 0) { - /* dlil lock protects the demux and interface filters */ - locked = 1; - dlil_read_begin(); + /* Get an IO reference count if the interface is not + * loopback and it is attached. 
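The input and output filter walks share one discipline: take if_flt_lock, mark the list busy via if_flt_monitor_busy(), drop the lock around each potentially blocking filter callback, retake it, and finally unbusy; a detacher in if_flt_monitor_enter() sleeps until the busy count drains. A minimal sketch of that monitor, assuming pthreads and invented names (the kernel's spin/full-mutex distinction is ignored):

#include <pthread.h>

struct flt_monitor {
	pthread_mutex_t	lock;
	pthread_cond_t	cv;
	unsigned int	busy;		/* walkers currently on the list */
	unsigned int	waiters;	/* detachers waiting for busy == 0 */
};

/* Lock held: mark the list in use so it cannot be mutated */
static void
monitor_busy(struct flt_monitor *mon)
{
	mon->busy++;
}

/* Lock held: drop our claim; wake detachers when the count drains */
static void
monitor_unbusy(struct flt_monitor *mon)
{
	if (--mon->busy == 0 && mon->waiters > 0) {
		mon->waiters = 0;
		pthread_cond_broadcast(&mon->cv);
	}
}

/* Lock held: wait out every walker, then claim the list exclusively */
static void
monitor_enter(struct flt_monitor *mon)
{
	while (mon->busy != 0) {
		mon->waiters++;
		pthread_cond_wait(&mon->cv, &mon->lock);
	}
	monitor_busy(mon);
}

static void nop(void) { }

/* Walk pattern: the busy count, not the mutex, protects the list
 * while the lock is dropped around the callback. */
static void
walk_filters(struct flt_monitor *mon, void (*cb)(void))
{
	pthread_mutex_lock(&mon->lock);
	monitor_busy(mon);
	/* ... for each filter: ... */
	pthread_mutex_unlock(&mon->lock);
	cb();
	pthread_mutex_lock(&mon->lock);
	/* ... next filter ... */
	monitor_unbusy(mon);
	pthread_mutex_unlock(&mon->lock);
}

int
main(void)
{
	struct flt_monitor mon = { PTHREAD_MUTEX_INITIALIZER,
	    PTHREAD_COND_INITIALIZER, 0, 0 };

	walk_filters(&mon, nop);
	pthread_mutex_lock(&mon.lock);
	monitor_enter(&mon);	/* a detacher would edit the list here */
	monitor_unbusy(&mon);
	pthread_mutex_unlock(&mon.lock);
	return (0);
}

Because walkers only hold the busy count across callbacks, a filter is free to sleep in its handler without stalling unrelated threads on if_flt_lock.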
+ */ + if (ifp != lo_ifp) { + if (!ifnet_is_attached(ifp, 1)) { + m_freem(m); + goto next; + } + iorefcnt = 1; } -#if PKT_PRIORITY switch (m->m_pkthdr.prio) { case MBUF_TC_BK: - ifp->if_tc.ifi_ibkpackets++; - ifp->if_tc.ifi_ibkbytes += m->m_pkthdr.len; + atomic_add_64(&ifp->if_tc.ifi_ibkpackets, 1); + atomic_add_64(&ifp->if_tc.ifi_ibkbytes, m->m_pkthdr.len); break; case MBUF_TC_VI: - ifp->if_tc.ifi_ivipackets++; - ifp->if_tc.ifi_ivibytes += m->m_pkthdr.len; + atomic_add_64(&ifp->if_tc.ifi_ivipackets, 1); + atomic_add_64(&ifp->if_tc.ifi_ivibytes, m->m_pkthdr.len); break; case MBUF_TC_VO: - ifp->if_tc.ifi_ivopackets++; - ifp->if_tc.ifi_ivobytes += m->m_pkthdr.len; + atomic_add_64(&ifp->if_tc.ifi_ivopackets, 1); + atomic_add_64(&ifp->if_tc.ifi_ivobytes, m->m_pkthdr.len); break; default: break; } -#endif PKT_PRIORITY /* find which protocol family this packet is for */ + ifnet_lock_shared(ifp); error = (*ifp->if_demux)(ifp, m, frame_header, - &protocol_family); + &protocol_family); + ifnet_lock_done(ifp); if (error != 0) { - if (error == EJUSTRETURN) { + if (error == EJUSTRETURN) goto next; - } protocol_family = 0; } - - /* DANGER!!! */ + if (m->m_flags & (M_BCAST|M_MCAST)) - ifp->if_imcasts++; + atomic_add_64(&ifp->if_imcasts, 1); /* run interface filters, exclude VLAN packets PR-3586856 */ if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) == 0) { - int filter_result; - - filter_result = dlil_interface_filters_input(ifp, &m, - &frame_header, - protocol_family); - if (filter_result != 0) { - if (filter_result != EJUSTRETURN) { + error = dlil_interface_filters_input(ifp, &m, + &frame_header, protocol_family); + if (error != 0) { + if (error != EJUSTRETURN) m_freem(m); - } goto next; } } @@ -1156,19 +1544,21 @@ dlil_input_packet_list(struct ifnet * ifp_param, struct mbuf *m) m_freem(m); goto next; } - + /* Lookup the protocol attachment to this interface */ if (protocol_family == 0) { ifproto = NULL; - } - else if (last_ifproto != NULL - && last_ifproto->ifp == ifp - && (last_ifproto->protocol_family - == protocol_family)) { + } else if (last_ifproto != NULL && last_ifproto->ifp == ifp && + (last_ifproto->protocol_family == protocol_family)) { + VERIFY(ifproto == NULL); ifproto = last_ifproto; - } - else { + if_proto_ref(last_ifproto); + } else { + VERIFY(ifproto == NULL); + ifnet_lock_shared(ifp); + /* callee holds a proto refcnt upon success */ ifproto = find_attached_proto(ifp, protocol_family); + ifnet_lock_done(ifp); } if (ifproto == NULL) { /* no protocol for this packet, discard */ @@ -1176,18 +1566,14 @@ dlil_input_packet_list(struct ifnet * ifp_param, struct mbuf *m) goto next; } if (ifproto != last_ifproto) { - /* make sure ifproto can't go away during input */ - if_proto_ref(ifproto); if (last_ifproto != NULL) { /* pass up the list for the previous protocol */ - dlil_read_end(); - dlil_ifproto_input(last_ifproto, pkt_first); pkt_first = NULL; if_proto_free(last_ifproto); - dlil_read_begin(); } last_ifproto = ifproto; + if_proto_ref(ifproto); } /* extend the list */ m->m_pkthdr.header = frame_header; @@ -1198,78 +1584,127 @@ dlil_input_packet_list(struct ifnet * ifp_param, struct mbuf *m) } pkt_next = &m->m_nextpkt; - next: +next: if (next_packet == NULL && last_ifproto != NULL) { /* pass up the last list of packets */ - dlil_read_end(); - dlil_ifproto_input(last_ifproto, pkt_first); if_proto_free(last_ifproto); - locked = 0; + last_ifproto = NULL; + } + if (ifproto != NULL) { + if_proto_free(ifproto); + ifproto = NULL; } + m = next_packet; + /* update the driver's multicast filter, if 
needed */ + if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) + ifp->if_updatemcasts = 0; + if (iorefcnt == 1) + ifnet_decr_iorefcnt(ifp); } - if (locked != 0) { - dlil_read_end(); - } + KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END,0,0,0,0,0); return; } +errno_t +if_mcasts_update(struct ifnet *ifp) +{ + errno_t err; + + err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL); + if (err == EAFNOSUPPORT) + err = 0; + printf("%s%d: %s %d suspended link-layer multicast membership(s) " + "(err=%d)\n", ifp->if_name, ifp->if_unit, + (err == 0 ? "successfully restored" : "failed to restore"), + ifp->if_updatemcasts, err); + + /* just return success */ + return (0); +} + static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *event) { struct ifnet_filter *filter; - - if (ifp_use(ifp, kIfNetUseCount_MustNotBeZero) == 0) { - dlil_read_begin(); - - /* Pass the event to the interface filters */ - TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) { - if (filter->filt_event) - filter->filt_event(filter->filt_cookie, ifp, filter->filt_protocol, event); + + /* Get an io ref count if the interface is attached */ + if (!ifnet_is_attached(ifp, 1)) + goto done; + + /* + * Pass the event to the interface filters + */ + lck_mtx_lock_spin(&ifp->if_flt_lock); + /* prevent filter list from changing in case we drop the lock */ + if_flt_monitor_busy(ifp); + TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) { + if (filter->filt_event != NULL) { + lck_mtx_unlock(&ifp->if_flt_lock); + + filter->filt_event(filter->filt_cookie, ifp, + filter->filt_protocol, event); + + lck_mtx_lock_spin(&ifp->if_flt_lock); } - - if (ifp->if_proto_hash) { - int i; - - for (i = 0; i < PROTO_HASH_SLOTS; i++) { - struct if_proto *proto; - - SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) { - proto_media_event eventp = proto->proto_kpi == kProtoKPI_v1 - ? proto->kpi.v1.event : proto->kpi.v2.event; - - if (eventp) - eventp(ifp, proto->protocol_family, event); + } + /* we're done with the filter list */ + if_flt_monitor_unbusy(ifp); + lck_mtx_unlock(&ifp->if_flt_lock); + + ifnet_lock_shared(ifp); + if (ifp->if_proto_hash != NULL) { + int i; + + for (i = 0; i < PROTO_HASH_SLOTS; i++) { + struct if_proto *proto; + + SLIST_FOREACH(proto, &ifp->if_proto_hash[i], + next_hash) { + proto_media_event eventp = + (proto->proto_kpi == kProtoKPI_v1 ? 
+ proto->kpi.v1.event : + proto->kpi.v2.event); + + if (eventp != NULL) { + if_proto_ref(proto); + ifnet_lock_done(ifp); + + eventp(ifp, proto->protocol_family, + event); + + ifnet_lock_shared(ifp); + if_proto_free(proto); } } } - - dlil_read_end(); - - /* Pass the event to the interface */ - if (ifp->if_event) - ifp->if_event(ifp, event); - - if (ifp_unuse(ifp)) - ifp_use_reached_zero(ifp); } - - return kev_post_msg(event); + ifnet_lock_done(ifp); + + /* Pass the event to the interface */ + if (ifp->if_event != NULL) + ifp->if_event(ifp, event); + + /* Release the io ref count */ + ifnet_decr_iorefcnt(ifp); + +done: + return (kev_post_msg(event)); } errno_t -ifnet_event( - ifnet_t ifp, - struct kern_event_msg *event) +ifnet_event(ifnet_t ifp, struct kern_event_msg *event) { struct kev_msg kev_msg; int result = 0; - if (ifp == NULL || event == NULL) return EINVAL; + if (ifp == NULL || event == NULL) + return (EINVAL); + bzero(&kev_msg, sizeof (kev_msg)); kev_msg.vendor_code = event->vendor_code; kev_msg.kev_class = event->kev_class; kev_msg.kev_subclass = event->kev_subclass; @@ -1277,16 +1712,17 @@ ifnet_event( kev_msg.dv[0].data_ptr = &event->event_data[0]; kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE; kev_msg.dv[1].data_length = 0; - + result = dlil_event_internal(ifp, &kev_msg); - return result; + return (result); } #if CONFIG_MACF_NET #include <netinet/ip6.h> #include <netinet/ip.h> -static int dlil_get_socket_type(struct mbuf **mp, int family, int raw) +static int +dlil_get_socket_type(struct mbuf **mp, int family, int raw) { struct mbuf *m; struct ip *ip; @@ -1327,184 +1763,26 @@ static int dlil_get_socket_type(struct mbuf **mp, int family, int raw) static void if_inc_traffic_class_out(ifnet_t ifp, mbuf_t m) { -#if !PKT_PRIORITY -#pragma unused(ifp) -#pragma unused(m) - return; -#else if (!(m->m_flags & M_PKTHDR)) return; switch (m->m_pkthdr.prio) { case MBUF_TC_BK: - ifp->if_tc.ifi_obkpackets++; - ifp->if_tc.ifi_obkbytes += m->m_pkthdr.len; + atomic_add_64(&ifp->if_tc.ifi_obkpackets, 1); + atomic_add_64(&ifp->if_tc.ifi_obkbytes, m->m_pkthdr.len); break; case MBUF_TC_VI: - ifp->if_tc.ifi_ovipackets++; - ifp->if_tc.ifi_ovibytes += m->m_pkthdr.len; + atomic_add_64(&ifp->if_tc.ifi_ovipackets, 1); + atomic_add_64(&ifp->if_tc.ifi_ovibytes, m->m_pkthdr.len); break; case MBUF_TC_VO: - ifp->if_tc.ifi_ovopackets++; - ifp->if_tc.ifi_ovobytes += m->m_pkthdr.len; + atomic_add_64(&ifp->if_tc.ifi_ovopackets, 1); + atomic_add_64(&ifp->if_tc.ifi_ovobytes, m->m_pkthdr.len); break; default: break; } -#endif PKT_PRIORITY -} - -#if 0 -int -dlil_output_list( - struct ifnet* ifp, - u_long proto_family, - struct mbuf *packetlist, - caddr_t route, - const struct sockaddr *dest, - int raw) -{ - char *frame_type = NULL; - char *dst_linkaddr = NULL; - int retval = 0; - char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4]; - char dst_linkaddr_buffer[MAX_LINKADDR * 4]; - struct ifnet_filter *filter; - struct if_proto *proto = 0; - mbuf_t m; - mbuf_t send_head = NULL; - mbuf_t *send_tail = &send_head; - - KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START,0,0,0,0,0); - - dlil_read_begin(); - - frame_type = frame_type_buffer; - dst_linkaddr = dst_linkaddr_buffer; - - if (raw == 0) { - proto = find_attached_proto(ifp, proto_family); - if (proto == NULL) { - retval = ENXIO; - goto cleanup; - } - } - -preout_again: - if (packetlist == NULL) - goto cleanup; - m = packetlist; - packetlist = packetlist->m_nextpkt; - m->m_nextpkt = NULL; - - if (raw == 0) { - proto_media_preout preoutp = 
proto->proto_kpi == kProtoKPI_v1 - ? proto->kpi.v1.pre_output : proto->kpi.v2.pre_output; - retval = 0; - if (preoutp) - retval = preoutp(ifp, proto_family, &m, dest, route, frame_type, dst_linkaddr); - - if (retval) { - if (retval == EJUSTRETURN) { - goto preout_again; - } - - m_freem(m); - goto cleanup; - } - } - - do { -#if CONFIG_MACF_NET - retval = mac_ifnet_check_transmit(ifp, m, proto_family, - dlil_get_socket_type(&m, proto_family, raw)); - if (retval) { - m_freem(m); - goto cleanup; - } -#endif - - if (raw == 0 && ifp->if_framer) { - retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr, frame_type); - if (retval) { - if (retval != EJUSTRETURN) { - m_freem(m); - } - goto next; - } - } - - /* - * Let interface filters (if any) do their thing ... - */ - /* Do not pass VLAN tagged packets to filters PR-3586856 */ - if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) == 0) { - TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) { - if ((filter->filt_protocol == 0 || (filter->filt_protocol == proto_family)) && - filter->filt_output) { - retval = filter->filt_output(filter->filt_cookie, ifp, proto_family, &m); - if (retval) { - if (retval != EJUSTRETURN) - m_freem(m); - goto next; - } - } - } - } - /* - * Strip away M_PROTO1 bit prior to sending packet to the driver - * as this field may be used by the driver - */ - m->m_flags &= ~M_PROTO1; - - /* - * Finally, call the driver. - */ - - if ((ifp->if_eflags & IFEF_SENDLIST) != 0) { - *send_tail = m; - send_tail = &m->m_nextpkt; - } - else { - KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START, 0,0,0,0,0); - retval = ifp->if_output(ifp, m); - if (retval && dlil_verbose) { - printf("dlil_output: output error on %s%d retval = %d\n", - ifp->if_name, ifp->if_unit, retval); - } - KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0,0,0,0,0); - } - KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0,0,0,0,0); - -next: - m = packetlist; - if (m) { - packetlist = packetlist->m_nextpkt; - m->m_nextpkt = NULL; - } - } while (m); - - if (send_head) { - KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START, 0,0,0,0,0); - retval = ifp->if_output(ifp, send_head); - if (retval && dlil_verbose) { - printf("dlil_output: output error on %s%d retval = %d\n", - ifp->if_name, ifp->if_unit, retval); - } - KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0,0,0,0,0); - } - - KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END,0,0,0,0,0); - -cleanup: - dlil_read_end(); - if (packetlist) /* if any packet left, clean up */ - mbuf_freem_list(packetlist); - if (retval == EJUSTRETURN) - retval = 0; - return retval; } -#endif /* * dlil_output @@ -1519,62 +1797,72 @@ cleanup: * because a protocol is likely to interact with an ifp while it * is under the protocol lock. 
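The rewritten dlil_output below pushes each packet through a fixed sequence of hooks: the protocol's pre_output routine, the optional if_framer, the interface filters, and finally the driver's if_output. A compact sketch of that staged pipeline with the two ways a stage can end processing early; the hook names and the EJUSTRETURN stand-in are illustrative, not the kernel's:

#include <stdio.h>

#define STAGE_CONSUMED	(-2)	/* stand-in for the kernel's EJUSTRETURN */

struct pkt {
	int	id;
};

typedef int (*stage_fn)(struct pkt *);

static int pre_output(struct pkt *p) { (void)p; return (0); }	/* resolve */
static int frame(struct pkt *p)      { (void)p; return (0); }	/* header */
static int filters(struct pkt *p)    { (void)p; return (0); }	/* NKEs */
static int driver(struct pkt *p)     { printf("tx %d\n", p->id); return (0); }

static int
output_one(struct pkt *p)
{
	stage_fn stages[] = { pre_output, frame, filters, driver };
	unsigned int i;
	int error;

	for (i = 0; i < sizeof (stages) / sizeof (stages[0]); i++) {
		error = stages[i](p);
		if (error == STAGE_CONSUMED)
			return (0);	/* a stage took ownership */
		if (error != 0)
			return (error);	/* caller frees the packet */
	}
	return (0);
}

int
main(void)
{
	struct pkt p = { 1 };

	return (output_one(&p));
}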
*/ -__private_extern__ errno_t -dlil_output( - ifnet_t ifp, - protocol_family_t proto_family, - mbuf_t packetlist, - void *route, - const struct sockaddr *dest, - int raw) -{ - char *frame_type = NULL; - char *dst_linkaddr = NULL; - int retval = 0; - char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4]; - char dst_linkaddr_buffer[MAX_LINKADDR * 4]; - struct ifnet_filter *filter; - struct if_proto *proto = 0; +errno_t +dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist, + void *route, const struct sockaddr *dest, int raw) +{ + char *frame_type = NULL; + char *dst_linkaddr = NULL; + int retval = 0; + char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4]; + char dst_linkaddr_buffer[MAX_LINKADDR * 4]; + struct if_proto *proto = NULL; mbuf_t m; mbuf_t send_head = NULL; mbuf_t *send_tail = &send_head; - + int iorefcnt = 0; + KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START,0,0,0,0,0); - - dlil_read_begin(); - - frame_type = frame_type_buffer; - dst_linkaddr = dst_linkaddr_buffer; - + + /* Get an io refcnt if the interface is attached to prevent ifnet_detach + * from happening while this operation is in progress */ + if (!ifnet_is_attached(ifp, 1)) { + retval = ENXIO; + goto cleanup; + } + iorefcnt = 1; + + /* update the driver's multicast filter, if needed */ + if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) + ifp->if_updatemcasts = 0; + + frame_type = frame_type_buffer; + dst_linkaddr = dst_linkaddr_buffer; + if (raw == 0) { + ifnet_lock_shared(ifp); + /* callee holds a proto refcnt upon success */ proto = find_attached_proto(ifp, proto_family); if (proto == NULL) { + ifnet_lock_done(ifp); retval = ENXIO; goto cleanup; } + ifnet_lock_done(ifp); } - + preout_again: if (packetlist == NULL) goto cleanup; + m = packetlist; packetlist = packetlist->m_nextpkt; m->m_nextpkt = NULL; - + if (raw == 0) { - proto_media_preout preoutp = proto->proto_kpi == kProtoKPI_v1 - ? proto->kpi.v1.pre_output : proto->kpi.v2.pre_output; + proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ? 
+ proto->kpi.v1.pre_output : proto->kpi.v2.pre_output); retval = 0; - if (preoutp) - retval = preoutp(ifp, proto_family, &m, dest, route, frame_type, dst_linkaddr); - - if (retval) { - if (retval == EJUSTRETURN) { - goto preout_again; + if (preoutp != NULL) { + retval = preoutp(ifp, proto_family, &m, dest, route, + frame_type, dst_linkaddr); + + if (retval != 0) { + if (retval == EJUSTRETURN) + goto preout_again; + m_freem(m); + goto cleanup; } - - m_freem(m); - goto cleanup; } } @@ -1588,6 +1876,21 @@ preout_again: #endif do { +#if CONFIG_DTRACE + if (proto_family == PF_INET) { + struct ip *ip = mtod(m, struct ip*); + DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL, + struct ip *, ip, struct ifnet *, ifp, + struct ip *, ip, struct ip6_hdr *, NULL); + + } else if (proto_family == PF_INET6) { + struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr*); + DTRACE_IP6(send, struct mbuf*, m, struct inpcb *, NULL, + struct ip6_hdr *, ip6, struct ifnet*, ifp, + struct ip*, NULL, struct ip6_hdr *, ip6); + } +#endif /* CONFIG_DTRACE */ + if (raw == 0 && ifp->if_framer) { int rcvif_set = 0; @@ -1605,11 +1908,11 @@ preout_again: rcvif_set = 1; } - retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr, frame_type); + retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr, + frame_type); if (retval) { - if (retval != EJUSTRETURN) { + if (retval != EJUSTRETURN) m_freem(m); - } goto next; } @@ -1625,25 +1928,20 @@ preout_again: if (rcvif_set && m->m_pkthdr.rcvif == ifp) m->m_pkthdr.rcvif = NULL; } - - /* + + /* * Let interface filters (if any) do their thing ... */ /* Do not pass VLAN tagged packets to filters PR-3586856 */ if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) == 0) { - TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) { - if ((filter->filt_protocol == 0 || (filter->filt_protocol == proto_family)) && - filter->filt_output) { - retval = filter->filt_output(filter->filt_cookie, ifp, proto_family, &m); - if (retval) { - if (retval != EJUSTRETURN) - m_freem(m); - goto next; - } - } + retval = dlil_interface_filters_output(ifp, + &m, proto_family); + if (retval != 0) { + if (retval != EJUSTRETURN) + m_freem(m); + goto next; } } - /* * Strip away M_PROTO1 bit prior to sending packet to the driver * as this field may be used by the driver @@ -1663,40 +1961,43 @@ preout_again: goto next; } - /* - * If this is a TSO packet, make sure the interface still advertise TSO capability + /* + * If this is a TSO packet, make sure the interface still + * advertise TSO capability. */ - if ((m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) && !(ifp->if_hwassist & IFNET_TSO_IPV4)) { - retval = EMSGSIZE; - m_freem(m); - goto cleanup; + if ((m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) && + !(ifp->if_hwassist & IFNET_TSO_IPV4)) { + retval = EMSGSIZE; + m_freem(m); + goto cleanup; } - if ((m->m_pkthdr.csum_flags & CSUM_TSO_IPV6) && !(ifp->if_hwassist & IFNET_TSO_IPV6)) { - retval = EMSGSIZE; - m_freem(m); - goto cleanup; + if ((m->m_pkthdr.csum_flags & CSUM_TSO_IPV6) && + !(ifp->if_hwassist & IFNET_TSO_IPV6)) { + retval = EMSGSIZE; + m_freem(m); + goto cleanup; } + /* * Finally, call the driver. 
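For drivers that set IFEF_SENDLIST, the code below does not call if_output() once per packet; it links packets onto send_head with a tail pointer and hands the driver the whole chain in one call. The tail-pointer append is worth seeing in isolation (the node type here is a stand-in for an mbuf):

#include <stdio.h>
#include <stddef.h>

struct node {
	int		id;
	struct node	*nextpkt;
};

int
main(void)
{
	struct node pkts[4] = {
		{ 0, NULL }, { 1, NULL }, { 2, NULL }, { 3, NULL }
	};
	struct node *send_head = NULL;
	struct node **send_tail = &send_head;
	struct node *n;
	int i;

	/*
	 * O(1) append: store through the tail pointer, then advance it
	 * to the new node's nextpkt field; no walk to the end is needed.
	 */
	for (i = 0; i < 4; i++) {
		*send_tail = &pkts[i];
		send_tail = &pkts[i].nextpkt;
	}

	/* One "driver" call for the entire chain */
	for (n = send_head; n != NULL; n = n->nextpkt)
		printf("tx %d\n", n->id);
	return (0);
}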
*/ - if ((ifp->if_eflags & IFEF_SENDLIST) != 0) { *send_tail = m; send_tail = &m->m_nextpkt; - } - else { - KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START, 0,0,0,0,0); - + } else { if_inc_traffic_class_out(ifp, m); - + KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START, + 0,0,0,0,0); retval = ifp->if_output(ifp, m); if (retval && dlil_verbose) { - printf("dlil_output: output error on %s%d retval = %d\n", - ifp->if_name, ifp->if_unit, retval); + printf("%s: output error on %s%d retval = %d\n", + __func__, ifp->if_name, ifp->if_unit, + retval); } - KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0,0,0,0,0); + KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, + 0,0,0,0,0); } KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0,0,0,0,0); @@ -1709,115 +2010,121 @@ next: } while (m); if (send_head) { - KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START, 0,0,0,0,0); - if_inc_traffic_class_out(ifp, send_head); + KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START, 0,0,0,0,0); retval = ifp->if_output(ifp, send_head); if (retval && dlil_verbose) { - printf("dlil_output: output error on %s%d retval = %d\n", - ifp->if_name, ifp->if_unit, retval); + printf("%s: output error on %s%d retval = %d\n", + __func__, ifp->if_name, ifp->if_unit, retval); } KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0,0,0,0,0); } - + KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END,0,0,0,0,0); cleanup: - dlil_read_end(); - if (packetlist) /* if any packet left, clean up */ + if (proto != NULL) + if_proto_free(proto); + if (packetlist) /* if any packets are left, clean up */ mbuf_freem_list(packetlist); if (retval == EJUSTRETURN) retval = 0; - return retval; + if (iorefcnt == 1) + ifnet_decr_iorefcnt(ifp); + + return (retval); } errno_t -ifnet_ioctl( - ifnet_t ifp, - protocol_family_t proto_fam, - u_long ioctl_code, - void *ioctl_arg) -{ - struct ifnet_filter *filter; - int retval = EOPNOTSUPP; - int result = 0; - int holding_read = 0; - +ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code, + void *ioctl_arg) +{ + struct ifnet_filter *filter; + int retval = EOPNOTSUPP; + int result = 0; + if (ifp == NULL || ioctl_code == 0) - return EINVAL; - - /* Attempt to increment the use count. If it's zero, bail out, the ifp is invalid */ - result = ifp_use(ifp, kIfNetUseCount_MustNotBeZero); - if (result != 0) - return EOPNOTSUPP; - - dlil_read_begin(); - holding_read = 1; - + return (EINVAL); + + /* Get an io ref count if the interface is attached */ + if (!ifnet_is_attached(ifp, 1)) + return (EOPNOTSUPP); + /* Run the interface filters first. * We want to run all filters before calling the protocol, * interface family, or interface. 
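ifnet_ioctl offers the request to every filter, then the protocol, then the interface, folding their answers with one rule: a handler's result only replaces retval while retval is still EOPNOTSUPP, ENOTSUP is normalized to EOPNOTSUPP, and any other error stops the walk. A sketch of just that folding rule; the handler results are made up and the kernel-only EJUSTRETURN case is omitted:

#include <errno.h>
#include <stdio.h>

/* Only update retval while nobody has handled the request yet */
static int
fold(int retval, int result)
{
	if (retval == EOPNOTSUPP) {
		if (result == ENOTSUP)
			result = EOPNOTSUPP;
		retval = result;
	}
	return (retval);
}

int
main(void)
{
	int handlers[] = { EOPNOTSUPP, ENOTSUP, 0, EINVAL };
	unsigned int i;
	int retval = EOPNOTSUPP;

	for (i = 0; i < sizeof (handlers) / sizeof (handlers[0]); i++) {
		retval = fold(retval, handlers[i]);
		if (retval != 0 && retval != EOPNOTSUPP)
			break;	/* a handler failed hard: stop the walk */
	}
	/* Prints 0: the third handler claimed the request, and the
	 * EINVAL after it can no longer overwrite that answer. */
	printf("final result: %d\n", retval);
	return (0);
}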
*/ + lck_mtx_lock_spin(&ifp->if_flt_lock); + /* prevent filter list from changing in case we drop the lock */ + if_flt_monitor_busy(ifp); TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) { - if ((filter->filt_protocol == 0 || (filter->filt_protocol == proto_fam)) && - filter->filt_ioctl != NULL) { - result = filter->filt_ioctl(filter->filt_cookie, ifp, proto_fam, ioctl_code, ioctl_arg); + if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 || + filter->filt_protocol == proto_fam)) { + lck_mtx_unlock(&ifp->if_flt_lock); + + result = filter->filt_ioctl(filter->filt_cookie, ifp, + proto_fam, ioctl_code, ioctl_arg); + + lck_mtx_lock_spin(&ifp->if_flt_lock); + /* Only update retval if no one has handled the ioctl */ if (retval == EOPNOTSUPP || result == EJUSTRETURN) { if (result == ENOTSUP) result = EOPNOTSUPP; retval = result; - if (retval && retval != EOPNOTSUPP) { + if (retval != 0 && retval != EOPNOTSUPP) { + /* we're done with the filter list */ + if_flt_monitor_unbusy(ifp); + lck_mtx_unlock(&ifp->if_flt_lock); goto cleanup; } } } } - + /* we're done with the filter list */ + if_flt_monitor_unbusy(ifp); + lck_mtx_unlock(&ifp->if_flt_lock); + /* Allow the protocol to handle the ioctl */ - if (proto_fam) { - struct if_proto *proto = find_attached_proto(ifp, proto_fam); - - if (proto != 0) { - proto_media_ioctl ioctlp = proto->proto_kpi == kProtoKPI_v1 - ? proto->kpi.v1.ioctl : proto->kpi.v2.ioctl; + if (proto_fam != 0) { + struct if_proto *proto; + + /* callee holds a proto refcnt upon success */ + ifnet_lock_shared(ifp); + proto = find_attached_proto(ifp, proto_fam); + ifnet_lock_done(ifp); + if (proto != NULL) { + proto_media_ioctl ioctlp = + (proto->proto_kpi == kProtoKPI_v1 ? + proto->kpi.v1.ioctl : proto->kpi.v2.ioctl); result = EOPNOTSUPP; - if (ioctlp) - result = ioctlp(ifp, proto_fam, ioctl_code, ioctl_arg); - + if (ioctlp != NULL) + result = ioctlp(ifp, proto_fam, ioctl_code, + ioctl_arg); + if_proto_free(proto); + /* Only update retval if no one has handled the ioctl */ if (retval == EOPNOTSUPP || result == EJUSTRETURN) { if (result == ENOTSUP) result = EOPNOTSUPP; retval = result; - if (retval && retval != EOPNOTSUPP) { + if (retval && retval != EOPNOTSUPP) goto cleanup; - } } } } - - /* - * Since we have incremented the use count on the ifp, we are guaranteed - * that the ifp will not go away (the function pointers may not be changed). - * We release the dlil read lock so the interface ioctl may trigger a - * protocol attach. This happens with vlan and may occur with other virtual - * interfaces. - */ - dlil_read_end(); - holding_read = 0; - + /* retval is either 0 or EOPNOTSUPP */ - + /* * Let the interface handle this ioctl. * If it returns EOPNOTSUPP, ignore that, we may have * already handled this in the protocol or family. 
*/ - if (ifp->if_ioctl) + if (ifp->if_ioctl) result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg); - + /* Only update retval if no one has handled the ioctl */ if (retval == EOPNOTSUPP || result == EJUSTRETURN) { if (result == ENOTSUP) @@ -1827,60 +2134,59 @@ ifnet_ioctl( goto cleanup; } } - -cleanup: - if (holding_read) - dlil_read_end(); - if (ifp_unuse(ifp)) - ifp_use_reached_zero(ifp); +cleanup: if (retval == EJUSTRETURN) retval = 0; - return retval; + + ifnet_decr_iorefcnt(ifp); + + return (retval); } __private_extern__ errno_t -dlil_set_bpf_tap( - ifnet_t ifp, - bpf_tap_mode mode, - bpf_packet_func callback) +dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback) { errno_t error = 0; - - dlil_read_begin(); - if (ifp->if_set_bpf_tap) + + + if (ifp->if_set_bpf_tap) { + /* Get an io reference on the interface if it is attached */ + if (!ifnet_is_attached(ifp, 1)) + return ENXIO; error = ifp->if_set_bpf_tap(ifp, mode, callback); - dlil_read_end(); - - return error; + ifnet_decr_iorefcnt(ifp); + } + return (error); } errno_t -dlil_resolve_multi( - struct ifnet *ifp, - const struct sockaddr *proto_addr, - struct sockaddr *ll_addr, - size_t ll_len) +dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr, + struct sockaddr *ll_addr, size_t ll_len) { errno_t result = EOPNOTSUPP; struct if_proto *proto; const struct sockaddr *verify; proto_media_resolve_multi resolvep; - - dlil_read_begin(); - + + if (!ifnet_is_attached(ifp, 1)) + return result; + bzero(ll_addr, ll_len); - - /* Call the protocol first */ + + /* Call the protocol first; callee holds a proto refcnt upon success */ + ifnet_lock_shared(ifp); proto = find_attached_proto(ifp, proto_addr->sa_family); + ifnet_lock_done(ifp); if (proto != NULL) { - resolvep = proto->proto_kpi == kProtoKPI_v1 - ? proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi; + resolvep = (proto->proto_kpi == kProtoKPI_v1 ? + proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi); if (resolvep != NULL) - result = resolvep(ifp, proto_addr,(struct sockaddr_dl*)ll_addr, - ll_len); + result = resolvep(ifp, proto_addr, + (struct sockaddr_dl*)ll_addr, ll_len); + if_proto_free(proto); } - + /* Let the interface verify the multicast address */ if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) { if (result == 0) @@ -1889,73 +2195,63 @@ dlil_resolve_multi( verify = proto_addr; result = ifp->if_check_multi(ifp, verify); } - - dlil_read_end(); - - return result; + + ifnet_decr_iorefcnt(ifp); + return (result); } __private_extern__ errno_t -dlil_send_arp_internal( - ifnet_t ifp, - u_short arpop, - const struct sockaddr_dl* sender_hw, - const struct sockaddr* sender_proto, - const struct sockaddr_dl* target_hw, - const struct sockaddr* target_proto) +dlil_send_arp_internal(ifnet_t ifp, u_short arpop, + const struct sockaddr_dl* sender_hw, const struct sockaddr* sender_proto, + const struct sockaddr_dl* target_hw, const struct sockaddr* target_proto) { struct if_proto *proto; errno_t result = 0; - - dlil_read_begin(); - + + /* callee holds a proto refcnt upon success */ + ifnet_lock_shared(ifp); proto = find_attached_proto(ifp, target_proto->sa_family); + ifnet_lock_done(ifp); if (proto == NULL) { result = ENOTSUP; - } - else { + } else { proto_media_send_arp arpp; - arpp = proto->proto_kpi == kProtoKPI_v1 - ? proto->kpi.v1.send_arp : proto->kpi.v2.send_arp; + arpp = (proto->proto_kpi == kProtoKPI_v1 ? 
+ proto->kpi.v1.send_arp : proto->kpi.v2.send_arp); if (arpp == NULL) result = ENOTSUP; else - result = arpp(ifp, arpop, sender_hw, sender_proto, target_hw, - target_proto); + result = arpp(ifp, arpop, sender_hw, sender_proto, + target_hw, target_proto); + if_proto_free(proto); } - - dlil_read_end(); - - return result; + + return (result); } static __inline__ int _is_announcement(const struct sockaddr_in * sender_sin, - const struct sockaddr_in * target_sin) + const struct sockaddr_in * target_sin) { if (sender_sin == NULL) { - return FALSE; + return (FALSE); } return (sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr); } __private_extern__ errno_t -dlil_send_arp( - ifnet_t ifp, - u_short arpop, - const struct sockaddr_dl* sender_hw, - const struct sockaddr* sender_proto, - const struct sockaddr_dl* target_hw, - const struct sockaddr* target_proto) +dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl* sender_hw, + const struct sockaddr* sender_proto, const struct sockaddr_dl* target_hw, + const struct sockaddr* target_proto) { errno_t result = 0; const struct sockaddr_in * sender_sin; const struct sockaddr_in * target_sin; - - if (target_proto == NULL || (sender_proto && - sender_proto->sa_family != target_proto->sa_family)) - return EINVAL; - + + if (target_proto == NULL || (sender_proto != NULL && + sender_proto->sa_family != target_proto->sa_family)) + return (EINVAL); + /* * If this is an ARP request and the target IP is IPv4LL, * send the request on all interfaces. The exception is @@ -1964,281 +2260,293 @@ dlil_send_arp( */ sender_sin = (const struct sockaddr_in *)sender_proto; target_sin = (const struct sockaddr_in *)target_proto; - if (target_proto->sa_family == AF_INET - && IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) - && ipv4_ll_arp_aware != 0 - && arpop == ARPOP_REQUEST - && !_is_announcement(target_sin, sender_sin)) { + if (target_proto->sa_family == AF_INET && + IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) && + ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST && + !_is_announcement(target_sin, sender_sin)) { ifnet_t *ifp_list; u_int32_t count; u_int32_t ifp_on; - + result = ENOTSUP; if (ifnet_list_get(IFNET_FAMILY_ANY, &ifp_list, &count) == 0) { for (ifp_on = 0; ifp_on < count; ifp_on++) { - errno_t new_result; - ifaddr_t source_hw = NULL; - ifaddr_t source_ip = NULL; - struct sockaddr_in source_ip_copy; - + errno_t new_result; + ifaddr_t source_hw = NULL; + ifaddr_t source_ip = NULL; + struct sockaddr_in source_ip_copy; + struct ifnet *cur_ifp = ifp_list[ifp_on]; + /* - * Only arp on interfaces marked for IPv4LL ARPing. This may - * mean that we don't ARP on the interface the subnet route - * points to. + * Only arp on interfaces marked for IPv4LL + * ARPing. This may mean that we don't ARP on + * the interface the subnet route points to. 
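The IPv4LL branch above fires only for ARP requests whose target sits in 169.254/16; IN_LINKLOCAL performs that test on a host-order address, hence the ntohl(). A self-contained version of the check, assuming a local macro equivalent to BSD's IN_LINKLOCAL and an arbitrary sample address:

#include <stdio.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/* 169.254.0.0/16 (RFC 3927); mirrors BSD's IN_LINKLOCAL() */
#define ADDR_IS_LINKLOCAL(a)	(((a) & 0xffff0000U) == 0xa9fe0000U)

int
main(void)
{
	struct in_addr target;

	inet_pton(AF_INET, "169.254.12.34", &target);
	/* s_addr is network byte order; the test wants host order */
	if (ADDR_IS_LINKLOCAL(ntohl(target.s_addr)))
		printf("IPv4LL target: try every IFEF_ARPLL interface\n");
	else
		printf("routable target: ARP on the given interface\n");
	return (0);
}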
*/ - if ((ifp_list[ifp_on]->if_eflags & IFEF_ARPLL) == 0) { + if (!(cur_ifp->if_eflags & IFEF_ARPLL)) continue; - } /* Find the source IP address */ - ifnet_lock_shared(ifp_list[ifp_on]); - source_hw = TAILQ_FIRST(&ifp_list[ifp_on]->if_addrhead); - TAILQ_FOREACH(source_ip, &ifp_list[ifp_on]->if_addrhead, - ifa_link) { - if (source_ip->ifa_addr && - source_ip->ifa_addr->sa_family == AF_INET) { + ifnet_lock_shared(cur_ifp); + source_hw = cur_ifp->if_lladdr; + TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead, + ifa_link) { + IFA_LOCK(source_ip); + if (source_ip->ifa_addr != NULL && + source_ip->ifa_addr->sa_family == + AF_INET) { + /* Copy the source IP address */ + source_ip_copy = + *(struct sockaddr_in *) + source_ip->ifa_addr; + IFA_UNLOCK(source_ip); break; } + IFA_UNLOCK(source_ip); } - + /* No IP Source, don't arp */ if (source_ip == NULL) { - ifnet_lock_done(ifp_list[ifp_on]); + ifnet_lock_done(cur_ifp); continue; } - - /* Copy the source IP address */ - source_ip_copy = *(struct sockaddr_in*)source_ip->ifa_addr; - ifaref(source_hw); - ifnet_lock_done(ifp_list[ifp_on]); - + + IFA_ADDREF(source_hw); + ifnet_lock_done(cur_ifp); + /* Send the ARP */ - new_result = dlil_send_arp_internal(ifp_list[ifp_on], arpop, - (struct sockaddr_dl*)source_hw->ifa_addr, - (struct sockaddr*)&source_ip_copy, NULL, - target_proto); + new_result = dlil_send_arp_internal(cur_ifp, + arpop, + (struct sockaddr_dl *)source_hw->ifa_addr, + (struct sockaddr *)&source_ip_copy, NULL, + target_proto); - ifafree(source_hw); + IFA_REMREF(source_hw); if (result == ENOTSUP) { result = new_result; } } + ifnet_list_free(ifp_list); } - - ifnet_list_free(ifp_list); - } - else { - result = dlil_send_arp_internal(ifp, arpop, sender_hw, sender_proto, - target_hw, target_proto); + } else { + result = dlil_send_arp_internal(ifp, arpop, sender_hw, + sender_proto, target_hw, target_proto); } - - return result; + + return (result); } -__private_extern__ int -ifp_use( - struct ifnet *ifp, - int handle_zero) +/* + * Caller must hold ifnet head lock. + */ +static int +ifnet_lookup(struct ifnet *ifp) { - int old_value; - int retval = 0; - - do { - old_value = ifp->if_usecnt; - if (old_value == 0 && handle_zero == kIfNetUseCount_MustNotBeZero) { - retval = ENXIO; // ifp is invalid + struct ifnet *_ifp; + + lck_rw_assert(&ifnet_head_lock, LCK_RW_ASSERT_HELD); + TAILQ_FOREACH(_ifp, &ifnet_head, if_link) { + if (_ifp == ifp) break; - } - } while (!OSCompareAndSwap((UInt32)old_value, (UInt32)old_value + 1, (UInt32*)&ifp->if_usecnt)); - - return retval; + } + return (_ifp != NULL); } - -/* ifp_unuse is broken into two pieces. - * - * ifp_use and ifp_unuse must be called between when the caller calls - * dlil_write_begin and dlil_write_end. ifp_unuse needs to perform some - * operations after dlil_write_end has been called. For this reason, - * anyone calling ifp_unuse must call ifp_use_reached_zero if ifp_unuse - * returns a non-zero value. The caller must call ifp_use_reached_zero - * after the caller has called dlil_write_end. +/* + * Caller has to pass a non-zero refio argument to get a + * IO reference count. This will prevent ifnet_detach from + * being called when there are outstanding io reference counts. 
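ifnet_is_attached() and ifnet_decr_iorefcnt(), defined next, are the two halves of that guarantee: references are granted only while the interface is attached and not detaching, and the release side wakes the detach path once the count drains to zero with the detaching flag set. A user-space sketch of the same drain gate, with flag and field names invented and a plain mutex modeling the kernel's spin lock and lck_mtx_convert_spin():

#include <pthread.h>
#include <assert.h>

#define RF_ATTACHED	0x1
#define RF_DETACHING	0x2

struct ref_gate {
	pthread_mutex_t	lock;
	pthread_cond_t	drained;
	unsigned int	flags;
	unsigned int	refio;
};

/* Take an I/O reference iff the object is attached and not detaching */
static int
gate_enter(struct ref_gate *g)
{
	int ok;

	pthread_mutex_lock(&g->lock);
	ok = ((g->flags & (RF_ATTACHED | RF_DETACHING)) == RF_ATTACHED);
	if (ok)
		g->refio++;
	pthread_mutex_unlock(&g->lock);
	return (ok);
}

/* Drop an I/O reference; wake the detacher when the last one goes */
static void
gate_exit(struct ref_gate *g)
{
	pthread_mutex_lock(&g->lock);
	assert(g->refio > 0);
	if (--g->refio == 0 && (g->flags & RF_DETACHING))
		pthread_cond_signal(&g->drained);
	pthread_mutex_unlock(&g->lock);
}

/* Detach: turn new entries away, then wait for in-flight users */
static void
gate_detach(struct ref_gate *g)
{
	pthread_mutex_lock(&g->lock);
	g->flags |= RF_DETACHING;
	while (g->refio != 0)
		pthread_cond_wait(&g->drained, &g->lock);
	g->flags &= ~RF_ATTACHED;
	pthread_mutex_unlock(&g->lock);
}

int
main(void)
{
	struct ref_gate g = { PTHREAD_MUTEX_INITIALIZER,
	    PTHREAD_COND_INITIALIZER, RF_ATTACHED, 0 };

	if (gate_enter(&g))	/* I/O path */
		gate_exit(&g);
	gate_detach(&g);	/* safe: no references outstanding */
	return (0);
}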
*/ -__private_extern__ void -ifp_use_reached_zero( - struct ifnet *ifp) -{ - ifnet_detached_func free_func; - - dlil_read_begin(); - - if (ifp->if_usecnt != 0) - panic("ifp_use_reached_zero: ifp->if_usecnt != 0"); - - ifnet_head_lock_exclusive(); - ifnet_lock_exclusive(ifp); - - /* Remove ourselves from the list */ - TAILQ_REMOVE(&ifnet_head, ifp, if_link); - ifnet_addrs[ifp->if_index - 1] = NULL; - - /* ifp should be removed from the interface list */ - while (ifp->if_multiaddrs.lh_first) { - struct ifmultiaddr *ifma = ifp->if_multiaddrs.lh_first; - - /* - * When the interface is gone, we will no longer - * be listening on these multicasts. Various bits - * of the stack may be referencing these multicasts, - * release only our reference. +int +ifnet_is_attached(struct ifnet *ifp, int refio) +{ + int ret; + + lck_mtx_lock_spin(&ifp->if_ref_lock); + if ((ret = ((ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING)) == + IFRF_ATTACHED))) { + if (refio > 0) + ifp->if_refio++; + } + lck_mtx_unlock(&ifp->if_ref_lock); + + return (ret); +} + +void +ifnet_decr_iorefcnt(struct ifnet *ifp) +{ + lck_mtx_lock_spin(&ifp->if_ref_lock); + VERIFY(ifp->if_refio > 0); + VERIFY((ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING)) != 0); + ifp->if_refio--; + + /* if there are no more outstanding io references, wakeup the + * ifnet_detach thread if detaching flag is set. + */ + if (ifp->if_refio == 0 && + (ifp->if_refflags & IFRF_DETACHING) != 0) { + /* Convert the spinlock to a regular mutex if we have + * to wait for any reason while doing a wakeup. */ - LIST_REMOVE(ifma, ifma_link); - ifma->ifma_ifp = NULL; - ifma_release(ifma); + lck_mtx_convert_spin(&ifp->if_ref_lock); + wakeup(&(ifp->if_refio)); } + lck_mtx_unlock(&ifp->if_ref_lock); +} - ifp->if_eflags &= ~IFEF_DETACHING; // clear the detaching flag - ifnet_lock_done(ifp); - ifnet_head_done(); +static void +dlil_if_trace(struct dlil_ifnet *dl_if, int refhold) +{ + struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if; + ctrace_t *tr; + u_int32_t idx; + u_int16_t *cnt; - free_func = ifp->if_free; - dlil_read_end(); - dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0); - - if (free_func) - free_func(ifp); + if (!(dl_if->dl_if_flags & DLIF_DEBUG)) { + panic("%s: dl_if %p has no debug structure", __func__, dl_if); + /* NOTREACHED */ + } + + if (refhold) { + cnt = &dl_if_dbg->dldbg_if_refhold_cnt; + tr = dl_if_dbg->dldbg_if_refhold; + } else { + cnt = &dl_if_dbg->dldbg_if_refrele_cnt; + tr = dl_if_dbg->dldbg_if_refrele; + } + + idx = atomic_add_16_ov(cnt, 1) % IF_REF_TRACE_HIST_SIZE; + ctrace_record(&tr[idx]); } -__private_extern__ int -ifp_unuse( - struct ifnet *ifp) -{ - int oldval; - oldval = OSDecrementAtomic(&ifp->if_usecnt); - if (oldval == 0) - panic("ifp_unuse: ifp(%s%d)->if_usecnt was zero\n", ifp->if_name, ifp->if_unit); - - if (oldval > 1) - return 0; - - if ((ifp->if_eflags & IFEF_DETACHING) == 0) - panic("ifp_unuse: use count reached zero but detching flag is not set!"); - - return 1; /* caller must call ifp_use_reached_zero */ +errno_t +dlil_if_ref(struct ifnet *ifp) +{ + struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp; + + if (dl_if == NULL) + return (EINVAL); + + lck_mtx_lock_spin(&dl_if->dl_if_lock); + ++dl_if->dl_if_refcnt; + if (dl_if->dl_if_refcnt == 0) { + panic("%s: wraparound refcnt for ifp=%p", __func__, ifp); + /* NOTREACHED */ + } + if (dl_if->dl_if_trace != NULL) + (*dl_if->dl_if_trace)(dl_if, TRUE); + lck_mtx_unlock(&dl_if->dl_if_lock); + + return (0); } -extern lck_mtx_t *domain_proto_mtx; +errno_t 
+dlil_if_free(struct ifnet *ifp) +{ + struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp; + + if (dl_if == NULL) + return (EINVAL); + + lck_mtx_lock_spin(&dl_if->dl_if_lock); + if (dl_if->dl_if_refcnt == 0) { + panic("%s: negative refcnt for ifp=%p", __func__, ifp); + /* NOTREACHED */ + } + --dl_if->dl_if_refcnt; + if (dl_if->dl_if_trace != NULL) + (*dl_if->dl_if_trace)(dl_if, FALSE); + lck_mtx_unlock(&dl_if->dl_if_lock); + + return (0); +} static errno_t -dlil_attach_protocol_internal( - struct if_proto *proto, - const struct ifnet_demux_desc *demux_list, - u_int32_t demux_count) +dlil_attach_protocol_internal(struct if_proto *proto, + const struct ifnet_demux_desc *demux_list, u_int32_t demux_count) { - struct kev_dl_proto_data ev_pr_data; + struct kev_dl_proto_data ev_pr_data; struct ifnet *ifp = proto->ifp; int retval = 0; u_int32_t hash_value = proto_hash_value(proto->protocol_family); - - /* setup some of the common values */ - { - struct domain *dp; - lck_mtx_lock(domain_proto_mtx); - dp = domains; - while (dp && (protocol_family_t)dp->dom_family != proto->protocol_family) - dp = dp->dom_next; - proto->dl_domain = dp; - lck_mtx_unlock(domain_proto_mtx); - } - - /* - * Take the write lock to protect readers and exclude other writers. - */ - if ((retval = dlil_write_begin()) != 0) { - printf("dlil_attach_protocol_internal - dlil_write_begin returned %d\n", retval); - return retval; - } - - /* Check that the interface isn't currently detaching */ - ifnet_lock_shared(ifp); - if ((ifp->if_eflags & IFEF_DETACHING) != 0) { + struct if_proto *prev_proto; + struct if_proto *_proto; + + /* callee holds a proto refcnt upon success */ + ifnet_lock_exclusive(ifp); + _proto = find_attached_proto(ifp, proto->protocol_family); + if (_proto != NULL) { ifnet_lock_done(ifp); - dlil_write_end(); - return ENXIO; + if_proto_free(_proto); + return (EEXIST); } - ifnet_lock_done(ifp); - - if (find_attached_proto(ifp, proto->protocol_family) != NULL) { - dlil_write_end(); - return EEXIST; - } - + /* * Call family module add_proto routine so it can refine the * demux descriptors as it wishes. */ - retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list, demux_count); + retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list, + demux_count); if (retval) { - dlil_write_end(); - return retval; + ifnet_lock_done(ifp); + return (retval); } - - /* - * We can't fail from this point on. - * Increment the number of uses (protocol attachments + interface attached). 
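dlil_attach_protocol_internal, below, hashes the protocol family into if_proto_hash, fails with EEXIST if the family is already attached, and appends the new entry at the tail of its bucket so attach order is preserved. A reduced sketch of that bucket insert; the hash width, types, and the -1 error code are placeholders:

#include <stdio.h>

#define HASH_SLOTS	4

struct proto {
	unsigned int	family;
	struct proto	*next;	/* bucket chain, like SLIST next_hash */
};

static struct proto *hash[HASH_SLOTS];

static unsigned int
proto_hash(unsigned int family)
{
	return (family % HASH_SLOTS);
}

/* Returns 0 on success, -1 (EEXIST in the kernel) on duplicates */
static int
proto_attach(struct proto *p)
{
	struct proto **tail = &hash[proto_hash(p->family)];

	/* Walk to the tail, failing on a duplicate family */
	for (; *tail != NULL; tail = &(*tail)->next) {
		if ((*tail)->family == p->family)
			return (-1);
	}
	p->next = NULL;
	*tail = p;	/* append, preserving attach order */
	return (0);
}

int
main(void)
{
	struct proto a = { 2, NULL }, b = { 30, NULL }, dup = { 2, NULL };

	/* 2 and 30 collide in this tiny table, so the chain append and
	 * the duplicate check are both exercised; prints "0 0 -1" */
	printf("%d %d %d\n", proto_attach(&a), proto_attach(&b),
	    proto_attach(&dup));
	return (0);
}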
- */ - ifp_use(ifp, kIfNetUseCount_MustNotBeZero); - + /* * Insert the protocol in the hash */ - { - struct if_proto* prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]); - while (prev_proto && SLIST_NEXT(prev_proto, next_hash) != NULL) - prev_proto = SLIST_NEXT(prev_proto, next_hash); - if (prev_proto) - SLIST_INSERT_AFTER(prev_proto, proto, next_hash); - else - SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value], proto, next_hash); - } + prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]); + while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) + prev_proto = SLIST_NEXT(prev_proto, next_hash); + if (prev_proto) + SLIST_INSERT_AFTER(prev_proto, proto, next_hash); + else + SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value], + proto, next_hash); + + /* hold a proto refcnt for attach */ + if_proto_ref(proto); /* - * Add to if_proto list for this interface + * The reserved field carries the number of protocol still attached + * (subject to change) */ - if_proto_ref(proto); - dlil_write_end(); - - /* the reserved field carries the number of protocol still attached (subject to change) */ ev_pr_data.proto_family = proto->protocol_family; ev_pr_data.proto_remaining_count = dlil_ifp_proto_count(ifp); - dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED, - (struct net_event_data *)&ev_pr_data, - sizeof(struct kev_dl_proto_data)); -#if 0 - DLIL_PRINTF("dlil. Attached protocol %d to %s%d - %d\n", proto->protocol_family, - ifp->if_name, ifp->if_unit, retval); -#endif - return retval; + ifnet_lock_done(ifp); + + dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED, + (struct net_event_data *)&ev_pr_data, + sizeof (struct kev_dl_proto_data)); + return (retval); } errno_t ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol, - const struct ifnet_attach_proto_param *proto_details) + const struct ifnet_attach_proto_param *proto_details) { int retval = 0; struct if_proto *ifproto = NULL; - - if (ifp == NULL || protocol == 0 || proto_details == NULL) - return EINVAL; - - ifproto = _MALLOC(sizeof(struct if_proto), M_IFADDR, M_WAITOK); - if (ifproto == 0) { - DLIL_PRINTF("ERROR - dlil failed if_proto allocation\n"); + + ifnet_head_lock_shared(); + if (ifp == NULL || protocol == 0 || proto_details == NULL) { + retval = EINVAL; + goto end; + } + /* Check that the interface is in the global list */ + if (!ifnet_lookup(ifp)) { + retval = ENXIO; + goto end; + } + + ifproto = zalloc(dlif_proto_zone); + if (ifproto == NULL) { retval = ENOMEM; goto end; } - bzero(ifproto, sizeof(*ifproto)); - + bzero(ifproto, dlif_proto_size); + + /* refcnt held above during lookup */ ifproto->ifp = ifp; ifproto->protocol_family = protocol; ifproto->proto_kpi = kProtoKPI_v1; @@ -2249,34 +2557,52 @@ ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol, ifproto->kpi.v1.detached = proto_details->detached; ifproto->kpi.v1.resolve_multi = proto_details->resolve; ifproto->kpi.v1.send_arp = proto_details->send_arp; - + retval = dlil_attach_protocol_internal(ifproto, - proto_details->demux_list, proto_details->demux_count); - + proto_details->demux_list, proto_details->demux_count); + + if (dlil_verbose) { + printf("%s%d: attached v1 protocol %d\n", ifp->if_name, + ifp->if_unit, protocol); + } + end: - if (retval && ifproto) - FREE(ifproto, M_IFADDR); - return retval; + if (retval != 0 && retval != EEXIST && ifp != NULL) { + DLIL_PRINTF("%s%d: failed to attach v1 protocol %d (err=%d)\n", + ifp->if_name, ifp->if_unit, protocol, retval); + } + ifnet_head_done(); + if (retval != 0 && 
ifproto != NULL) + zfree(dlif_proto_zone, ifproto); + return (retval); } errno_t ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol, - const struct ifnet_attach_proto_param_v2 *proto_details) + const struct ifnet_attach_proto_param_v2 *proto_details) { int retval = 0; struct if_proto *ifproto = NULL; - - if (ifp == NULL || protocol == 0 || proto_details == NULL) - return EINVAL; - - ifproto = _MALLOC(sizeof(struct if_proto), M_IFADDR, M_WAITOK); - if (ifproto == 0) { - DLIL_PRINTF("ERROR - dlil failed if_proto allocation\n"); + + ifnet_head_lock_shared(); + if (ifp == NULL || protocol == 0 || proto_details == NULL) { + retval = EINVAL; + goto end; + } + /* Check that the interface is in the global list */ + if (!ifnet_lookup(ifp)) { + retval = ENXIO; + goto end; + } + + ifproto = zalloc(dlif_proto_zone); + if (ifproto == NULL) { retval = ENOMEM; goto end; } bzero(ifproto, sizeof(*ifproto)); - + + /* refcnt held above during lookup */ ifproto->ifp = ifp; ifproto->protocol_family = protocol; ifproto->proto_kpi = kProtoKPI_v2; @@ -2287,49 +2613,24 @@ ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol, ifproto->kpi.v2.detached = proto_details->detached; ifproto->kpi.v2.resolve_multi = proto_details->resolve; ifproto->kpi.v2.send_arp = proto_details->send_arp; - - retval = dlil_attach_protocol_internal(ifproto, - proto_details->demux_list, proto_details->demux_count); - -end: - if (retval && ifproto) - FREE(ifproto, M_IFADDR); - return retval; -} -extern void if_rtproto_del(struct ifnet *ifp, int protocol); + retval = dlil_attach_protocol_internal(ifproto, + proto_details->demux_list, proto_details->demux_count); -static int -dlil_detach_protocol_internal( - struct if_proto *proto) -{ - struct ifnet *ifp = proto->ifp; - u_int32_t proto_family = proto->protocol_family; - struct kev_dl_proto_data ev_pr_data; - - if (proto->proto_kpi == kProtoKPI_v1) { - if (proto->kpi.v1.detached) - proto->kpi.v1.detached(ifp, proto->protocol_family); + if (dlil_verbose) { + printf("%s%d: attached v2 protocol %d\n", ifp->if_name, + ifp->if_unit, protocol); } - if (proto->proto_kpi == kProtoKPI_v2) { - if (proto->kpi.v2.detached) - proto->kpi.v2.detached(ifp, proto->protocol_family); + +end: + if (retval != 0 && retval != EEXIST && ifp != NULL) { + DLIL_PRINTF("%s%d: failed to attach v2 protocol %d (err=%d)\n", + ifp->if_name, ifp->if_unit, protocol, retval); } - if_proto_free(proto); - - /* - * Cleanup routes that may still be in the routing table for that interface/protocol pair. 
- */ - - if_rtproto_del(ifp, proto_family); - - /* the reserved field carries the number of protocol still attached (subject to change) */ - ev_pr_data.proto_family = proto_family; - ev_pr_data.proto_remaining_count = dlil_ifp_proto_count(ifp); - dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED, - (struct net_event_data *)&ev_pr_data, - sizeof(struct kev_dl_proto_data)); - return 0; + ifnet_head_done(); + if (retval != 0 && ifproto != NULL) + zfree(dlif_proto_zone, ifproto); + return (retval); } errno_t @@ -2337,260 +2638,169 @@ ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family) { struct if_proto *proto = NULL; int retval = 0; - int use_reached_zero = 0; - - if (ifp == NULL || proto_family == 0) return EINVAL; - - if ((retval = dlil_write_begin()) != 0) { - if (retval == EDEADLK) { - retval = 0; - dlil_read_begin(); - proto = find_attached_proto(ifp, proto_family); - if (proto == 0) { - retval = ENXIO; - } - else { - proto->detaching = 1; - dlil_detach_waiting = 1; - wakeup(&dlil_detach_waiting); - } - dlil_read_end(); - } + + if (ifp == NULL || proto_family == 0) { + retval = EINVAL; goto end; } - + + ifnet_lock_exclusive(ifp); + /* callee holds a proto refcnt upon success */ proto = find_attached_proto(ifp, proto_family); - if (proto == NULL) { retval = ENXIO; - dlil_write_end(); + ifnet_lock_done(ifp); goto end; } - - /* - * Call family module del_proto - */ - + + /* call family module del_proto */ if (ifp->if_del_proto) ifp->if_del_proto(ifp, proto->protocol_family); - SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)], proto, if_proto, next_hash); - - /* - * We can do the rest of the work outside of the write lock. - */ - use_reached_zero = ifp_unuse(ifp); - dlil_write_end(); - - dlil_detach_protocol_internal(proto); + SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)], + proto, if_proto, next_hash); + + if (proto->proto_kpi == kProtoKPI_v1) { + proto->kpi.v1.input = ifproto_media_input_v1; + proto->kpi.v1.pre_output= ifproto_media_preout; + proto->kpi.v1.event = ifproto_media_event; + proto->kpi.v1.ioctl = ifproto_media_ioctl; + proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi; + proto->kpi.v1.send_arp = ifproto_media_send_arp; + } else { + proto->kpi.v2.input = ifproto_media_input_v2; + proto->kpi.v2.pre_output = ifproto_media_preout; + proto->kpi.v2.event = ifproto_media_event; + proto->kpi.v2.ioctl = ifproto_media_ioctl; + proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi; + proto->kpi.v2.send_arp = ifproto_media_send_arp; + } + proto->detached = 1; + ifnet_lock_done(ifp); + + if (dlil_verbose) { + printf("%s%d: detached %s protocol %d\n", ifp->if_name, + ifp->if_unit, (proto->proto_kpi == kProtoKPI_v1) ? + "v1" : "v2", proto_family); + } + + /* release proto refcnt held during protocol attach */ + if_proto_free(proto); /* - * Only handle the case where the interface will go away after - * we've sent the message. This way post message can send the - * message to the interface safely. + * Release proto refcnt held during lookup; the rest of + * protocol detach steps will happen when the last proto + * reference is released. */ - - if (use_reached_zero) - ifp_use_reached_zero(ifp); - + if_proto_free(proto); + end: - return retval; + return (retval); } -/* - * dlil_delayed_detach_thread is responsible for detaching - * protocols, protocol filters, and interface filters after - * an attempt was made to detach one of those items while - * it was not safe to do so (i.e. called dlil_read_begin). 
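Rather than wait for every outstanding caller, ifnet_detach_protocol above repoints the protocol's KPI vectors at the ifproto_media_* stubs defined further down, so a racing caller lands in a harmless routine that returns ENXIO instead of jumping through a stale pointer. A sketch of that swap on a small ops table, with types and names invented (the kernel performs the swap under ifnet_lock_exclusive()):

#include <errno.h>
#include <stdio.h>

struct proto_ops {
	int	(*input)(int pkt);
	int	(*ioctl)(unsigned long cmd);
};

static int real_input(int pkt)         { printf("in %d\n", pkt); return (0); }
static int real_ioctl(unsigned long c) { (void)c; return (0); }

/* Stubs installed at detach: safe to call, always refuse */
static int stub_input(int pkt)         { (void)pkt; return (ENXIO); }
static int stub_ioctl(unsigned long c) { (void)c; return (ENXIO); }

static void
proto_detach(struct proto_ops *ops)
{
	/* From here on, racing callers hit the stubs, never freed code */
	ops->input = stub_input;
	ops->ioctl = stub_ioctl;
}

int
main(void)
{
	struct proto_ops ops = { real_input, real_ioctl };

	ops.input(1);			/* dispatched to the real handler */
	proto_detach(&ops);
	printf("post-detach input: %d\n", ops.input(2));	/* ENXIO */
	return (0);
}

The actual teardown then happens only when the last if_proto_free() drops the reference count to zero, so no caller ever races the free itself.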
- * - * This function will take the dlil write lock and walk - * through each of the interfaces looking for items with - * the detaching flag set. When an item is found, it is - * detached from the interface and placed on a local list. - * After all of the items have been collected, we drop the - * write lock and performed the post detach. This is done - * so we only have to take the write lock once. - * - * When detaching a protocol filter, if we find that we - * have detached the very last protocol and we need to call - * ifp_use_reached_zero, we have to break out of our work - * to drop the write lock so we can call ifp_use_reached_zero. - */ - -static void -dlil_delayed_detach_thread(__unused void* foo, __unused wait_result_t wait) + +static errno_t +ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol, + struct mbuf *packet, char *header) { - thread_t self = current_thread(); - int asserted = 0; - - ml_thread_policy(self, MACHINE_GROUP, - (MACHINE_NETWORK_GROUP|MACHINE_NETWORK_NETISR)); +#pragma unused(ifp, protocol, packet, header) + return (ENXIO); +} + +static errno_t +ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol, + struct mbuf *packet) +{ +#pragma unused(ifp, protocol, packet) + return (ENXIO); + +} + +static errno_t +ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol, + mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type, + char *link_layer_dest) +{ +#pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest) + return (ENXIO); - - while (1) { - if (dlil_detach_waiting != 0 && dlil_write_begin() == 0) { - struct ifnet *ifp; - struct proto_hash_entry detached_protos; - struct ifnet_filter_head detached_filters; - struct if_proto *proto; - struct if_proto *next_proto; - struct ifnet_filter *filt; - struct ifnet_filter *next_filt; - int reached_zero; - - reached_zero = 0; - - /* Clear the detach waiting flag */ - dlil_detach_waiting = 0; - TAILQ_INIT(&detached_filters); - SLIST_INIT(&detached_protos); - - ifnet_head_lock_shared(); - TAILQ_FOREACH(ifp, &ifnet_head, if_link) { - int i; - - // Look for protocols and protocol filters - for (i = 0; i < PROTO_HASH_SLOTS && !reached_zero; i++) { - struct if_proto **prev_nextptr = &SLIST_FIRST(&ifp->if_proto_hash[i]); - for (proto = *prev_nextptr; proto; proto = *prev_nextptr) { - - // Detach this protocol - if (proto->detaching) { - if (ifp->if_del_proto) - ifp->if_del_proto(ifp, proto->protocol_family); - *prev_nextptr = SLIST_NEXT(proto, next_hash); - SLIST_INSERT_HEAD(&detached_protos, proto, next_hash); - reached_zero = ifp_unuse(ifp); - if (reached_zero) { - break; - } - } - else { - // Update prev_nextptr to point to our next ptr - prev_nextptr = &SLIST_NEXT(proto, next_hash); - } - } - } - - // look for interface filters that need to be detached - for (filt = TAILQ_FIRST(&ifp->if_flt_head); filt; filt = next_filt) { - next_filt = TAILQ_NEXT(filt, filt_next); - if (filt->filt_detaching != 0) { - // take this interface filter off the interface filter list - TAILQ_REMOVE(&ifp->if_flt_head, filt, filt_next); - - // put this interface filter on the detached filters list - TAILQ_INSERT_TAIL(&detached_filters, filt, filt_next); - } - } - - if (ifp->if_delayed_detach) { - ifp->if_delayed_detach = 0; - reached_zero = ifp_unuse(ifp); - } - - if (reached_zero) - break; - } - ifnet_head_done(); - dlil_write_end(); - - for (filt = TAILQ_FIRST(&detached_filters); filt; filt = next_filt) { - next_filt = TAILQ_NEXT(filt, filt_next); - /* - * 
dlil_detach_filter_internal won't remove an item from - * the list if it is already detached (second parameter). - * The item will be freed though. - */ - dlil_detach_filter_internal(filt, 1); - } - - for (proto = SLIST_FIRST(&detached_protos); proto; proto = next_proto) { - next_proto = SLIST_NEXT(proto, next_hash); - dlil_detach_protocol_internal(proto); - } - - if (reached_zero) { - ifp_use_reached_zero(ifp); - dlil_detach_waiting = 1; // we may have missed something - } - } - - if (!asserted && dlil_detach_waiting == 0) { - asserted = 1; - assert_wait(&dlil_detach_waiting, THREAD_UNINT); - } - - if (dlil_detach_waiting == 0) { - asserted = 0; - thread_block(dlil_delayed_detach_thread); - } - } } static void -dlil_call_delayed_detach_thread(void) { - dlil_delayed_detach_thread(NULL, THREAD_RESTART); +ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol, + const struct kev_msg *event) +{ +#pragma unused(ifp, protocol, event) +} + +static errno_t +ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol, + unsigned long command, void *argument) +{ +#pragma unused(ifp, protocol, command, argument) + return (ENXIO); +} + +static errno_t +ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr, + struct sockaddr_dl *out_ll, size_t ll_len) +{ +#pragma unused(ifp, proto_addr, out_ll, ll_len) + return (ENXIO); +} + +static errno_t +ifproto_media_send_arp(struct ifnet *ifp, u_short arpop, + const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto, + const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto) +{ +#pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto) + return (ENXIO); } extern int if_next_index(void); errno_t -ifnet_attach( - ifnet_t ifp, - const struct sockaddr_dl *ll_addr) +ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr) { - u_int32_t interface_family; struct ifnet *tmp_if; - struct proto_hash_entry *new_proto_list = NULL; - int locked = 0; - - if (ifp == NULL) return EINVAL; - if (ll_addr && ifp->if_addrlen == 0) { - ifp->if_addrlen = ll_addr->sdl_alen; - } - else if (ll_addr && ll_addr->sdl_alen != ifp->if_addrlen) { - return EINVAL; - } - - interface_family = ifp->if_family; - - ifnet_head_lock_shared(); + struct ifaddr *ifa; + struct if_data_internal if_data_saved; + struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp; + if (ifp == NULL) + return (EINVAL); + + ifnet_head_lock_exclusive(); /* Verify we aren't already on the list */ TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) { if (tmp_if == ifp) { ifnet_head_done(); - return EEXIST; + return (EEXIST); } } - - ifnet_head_done(); - - if ((ifp->if_eflags & IFEF_REUSE) == 0 || ifp->if_lock == 0) -#if IFNET_RW_LOCK - ifp->if_lock = lck_rw_alloc_init(ifnet_lock_group, ifnet_lock_attr); -#else - ifp->if_lock = lck_mtx_alloc_init(ifnet_lock_group, ifnet_lock_attr); -#endif - if (ifp->if_lock == 0) { - return ENOMEM; + lck_mtx_lock_spin(&ifp->if_ref_lock); + if (ifp->if_refflags & IFRF_ATTACHED) { + panic("%s: flags mismatch (attached set) ifp=%p", + __func__, ifp); + /* NOTREACHED */ } + lck_mtx_unlock(&ifp->if_ref_lock); - if (!(ifp->if_eflags & IFEF_REUSE) || ifp->if_fwd_route_lock == NULL) { - if (ifp->if_fwd_route_lock == NULL) - ifp->if_fwd_route_lock = lck_mtx_alloc_init( - ifnet_lock_group, ifnet_lock_attr); + ifnet_lock_exclusive(ifp); - if (ifp->if_fwd_route_lock == NULL) { -#if IFNET_RW_LOCK - lck_rw_free(ifp->if_lock, ifnet_lock_group); -#else - lck_mtx_free(ifp->if_lock, ifnet_lock_group); -#endif - ifp->if_lock = 
NULL; - return (ENOMEM); + /* Sanity check */ + VERIFY(ifp->if_detaching_link.tqe_next == NULL); + VERIFY(ifp->if_detaching_link.tqe_prev == NULL); + + if (ll_addr != NULL) { + if (ifp->if_addrlen == 0) { + ifp->if_addrlen = ll_addr->sdl_alen; + } else if (ll_addr->sdl_alen != ifp->if_addrlen) { + ifnet_lock_done(ifp); + ifnet_head_done(); + return (EINVAL); } } @@ -2598,251 +2808,606 @@ ifnet_attach( * Allow interfaces without protocol families to attach * only if they have the necessary fields filled out. */ - - if (ifp->if_add_proto == 0 || ifp->if_del_proto == 0) { - DLIL_PRINTF("dlil Attempt to attach interface without family module - %d\n", - interface_family); - return ENODEV; - } - - if ((ifp->if_eflags & IFEF_REUSE) == 0 || ifp->if_proto_hash == NULL) { - MALLOC(new_proto_list, struct proto_hash_entry*, sizeof(struct proto_hash_entry) * PROTO_HASH_SLOTS, - M_NKE, M_WAITOK); - - if (new_proto_list == 0) { - return ENOBUFS; - } + if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) { + DLIL_PRINTF("%s: Attempt to attach interface without " + "family module - %d\n", __func__, ifp->if_family); + ifnet_lock_done(ifp); + ifnet_head_done(); + return (ENODEV); } - dlil_write_begin(); - locked = 1; + /* Allocate protocol hash table */ + VERIFY(ifp->if_proto_hash == NULL); + ifp->if_proto_hash = zalloc(dlif_phash_zone); + if (ifp->if_proto_hash == NULL) { + ifnet_lock_done(ifp); + ifnet_head_done(); + return (ENOBUFS); + } + bzero(ifp->if_proto_hash, dlif_phash_size); + lck_mtx_lock_spin(&ifp->if_flt_lock); + VERIFY(TAILQ_EMPTY(&ifp->if_flt_head)); TAILQ_INIT(&ifp->if_flt_head); - - - if (new_proto_list) { - bzero(new_proto_list, (PROTO_HASH_SLOTS * sizeof(struct proto_hash_entry))); - ifp->if_proto_hash = new_proto_list; - new_proto_list = NULL; - } - - /* old_if_attach */ - { - char workbuf[64]; - int namelen, masklen, socksize, ifasize; - struct ifaddr *ifa = NULL; - - if (ifp->if_snd.ifq_maxlen == 0) - ifp->if_snd.ifq_maxlen = ifqmaxlen; - TAILQ_INIT(&ifp->if_prefixhead); + VERIFY(ifp->if_flt_busy == 0); + VERIFY(ifp->if_flt_waiters == 0); + lck_mtx_unlock(&ifp->if_flt_lock); + + VERIFY(TAILQ_EMPTY(&ifp->if_prefixhead)); + TAILQ_INIT(&ifp->if_prefixhead); + + if (!(dl_if->dl_if_flags & DLIF_REUSE)) { + VERIFY(LIST_EMPTY(&ifp->if_multiaddrs)); LIST_INIT(&ifp->if_multiaddrs); - ifnet_touch_lastchange(ifp); - - /* usecount to track attachment to the ifnet list */ - ifp_use(ifp, kIfNetUseCount_MayBeZero); - - /* Lock the list of interfaces */ - ifnet_head_lock_exclusive(); - ifnet_lock_exclusive(ifp); - - if ((ifp->if_eflags & IFEF_REUSE) == 0 || ifp->if_index == 0) { - int idx = if_next_index(); - - if (idx == -1) { - ifnet_lock_done(ifp); - ifnet_head_done(); - ifp_unuse(ifp); - dlil_write_end(); - - return ENOBUFS; - } - ifp->if_index = idx; - } else { - ifa = TAILQ_FIRST(&ifp->if_addrhead); - } - namelen = snprintf(workbuf, sizeof(workbuf), "%s%d", ifp->if_name, ifp->if_unit); -#define _offsetof(t, m) ((uintptr_t)((caddr_t)&((t *)0)->m)) - masklen = _offsetof(struct sockaddr_dl, sdl_data[0]) + namelen; - socksize = masklen + ifp->if_addrlen; -#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof(u_int32_t) - 1))) - if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) - socksize = sizeof(struct sockaddr_dl); - socksize = ROUNDUP(socksize); - ifasize = sizeof(struct ifaddr) + 2 * socksize; - - /* - * Allocate a new ifa if we don't have one - * or the old one is too small. 
- */ - if (ifa == NULL || socksize > ifa->ifa_addr->sa_len) { - if (ifa) - if_detach_ifa(ifp, ifa); - ifa = (struct ifaddr*)_MALLOC(ifasize, M_IFADDR, M_WAITOK); - } - - if (ifa) { - struct sockaddr_dl *sdl = (struct sockaddr_dl *)(ifa + 1); - ifnet_addrs[ifp->if_index - 1] = ifa; - bzero(ifa, ifasize); - ifa->ifa_debug |= IFD_ALLOC; - sdl->sdl_len = socksize; - sdl->sdl_family = AF_LINK; - bcopy(workbuf, sdl->sdl_data, namelen); - sdl->sdl_nlen = namelen; - sdl->sdl_index = ifp->if_index; - sdl->sdl_type = ifp->if_type; - if (ll_addr) { - sdl->sdl_alen = ll_addr->sdl_alen; - if (ll_addr->sdl_alen != ifp->if_addrlen) - panic("ifnet_attach - ll_addr->sdl_alen != ifp->if_addrlen"); - bcopy(CONST_LLADDR(ll_addr), LLADDR(sdl), sdl->sdl_alen); - } - ifa->ifa_ifp = ifp; - ifa->ifa_rtrequest = link_rtrequest; - ifa->ifa_addr = (struct sockaddr*)sdl; - sdl = (struct sockaddr_dl*)(socksize + (caddr_t)sdl); - ifa->ifa_netmask = (struct sockaddr*)sdl; - sdl->sdl_len = masklen; - while (namelen != 0) - sdl->sdl_data[--namelen] = 0xff; - } + } - TAILQ_INIT(&ifp->if_addrhead); - ifa = ifnet_addrs[ifp->if_index - 1]; - - if (ifa) { - /* - * We don't use if_attach_ifa because we want - * this address to be first on the list. - */ - ifaref(ifa); - ifa->ifa_debug |= IFD_ATTACHED; - TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link); + VERIFY(ifp->if_allhostsinm == NULL); + VERIFY(TAILQ_EMPTY(&ifp->if_addrhead)); + TAILQ_INIT(&ifp->if_addrhead); + + if (ifp->if_snd.ifq_maxlen == 0) + ifp->if_snd.ifq_maxlen = ifqmaxlen; + + if (ifp->if_index == 0) { + int idx = if_next_index(); + + if (idx == -1) { + ifp->if_index = 0; + ifnet_lock_done(ifp); + ifnet_head_done(); + return (ENOBUFS); } + ifp->if_index = idx; + } + /* There should not be anything occupying this slot */ + VERIFY(ifindex2ifnet[ifp->if_index] == NULL); + + /* allocate (if needed) and initialize a link address */ + VERIFY(!(dl_if->dl_if_flags & DLIF_REUSE) || ifp->if_lladdr != NULL); + ifa = dlil_alloc_lladdr(ifp, ll_addr); + if (ifa == NULL) { + ifnet_lock_done(ifp); + ifnet_head_done(); + return (ENOBUFS); + } + + VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL); + ifnet_addrs[ifp->if_index - 1] = ifa; + + /* make this address the first on the list */ + IFA_LOCK(ifa); + /* hold a reference for ifnet_addrs[] */ + IFA_ADDREF_LOCKED(ifa); + /* if_attach_link_ifa() holds a reference for ifa_link */ + if_attach_link_ifa(ifp, ifa); + IFA_UNLOCK(ifa); + #if CONFIG_MACF_NET - mac_ifnet_label_associate(ifp); + mac_ifnet_label_associate(ifp); #endif - - TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link); - ifindex2ifnet[ifp->if_index] = ifp; - } - /* - * A specific dlil input thread is created per Ethernet/PDP interface. - * pseudo interfaces or other types of interfaces use the main ("loopback") thread. - * If the sysctl "net.link.generic.system.multi_threaded_input" is set to zero, all packets will - * be handled by the main loopback thread, reverting to 10.4.x behaviour. - * - */ + TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link); + ifindex2ifnet[ifp->if_index] = ifp; - if (ifp->if_type == IFT_ETHER || ifp->if_type == IFT_PDP) { + /* Hold a reference to the underlying dlil_ifnet */ + ifnet_reference(ifp); + + /* + * A specific dlil input thread is created per Ethernet/cellular + * interface. pseudo interfaces or other types of interfaces use + * the main ("loopback") thread. + * + * If the sysctl "net.link.generic.system.multi_threaded_input" is set + * to zero, all packets will be handled by the main loopback thread, + * reverting to 10.4.x behaviour. 
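+ * Allocation and thread-creation failures below are treated as
+ * fatal: the thread-info structure comes out of the dlif_inp_zone
+ * zone, and a NULL return (or an error from dlil_create_input_thread())
+ * ends in panic() rather than an error return from ifnet_attach().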
+ */ + if (dlil_multithreaded_input && + (ifp->if_type == IFT_ETHER || ifp->if_type == IFT_CELLULAR)) { int err; - if (dlil_multithreaded_input > 0) { - ifp->if_input_thread = _MALLOC(sizeof(struct dlil_threading_info), M_NKE, M_WAITOK); - if (ifp->if_input_thread == NULL) - panic("ifnet_attach ifp=%p couldn't alloc threading\n", ifp); - if ((err = dlil_create_input_thread(ifp, ifp->if_input_thread)) != 0) - panic("ifnet_attach ifp=%p couldn't get a thread. err=%d\n", ifp, err); + ifp->if_input_thread = zalloc(dlif_inp_zone); + if (ifp->if_input_thread == NULL) { + panic("%s: ifp=%p couldn't alloc threading", + __func__, ifp); + /* NOTREACHED */ + } + bzero(ifp->if_input_thread, dlif_inp_size); + err = dlil_create_input_thread(ifp, ifp->if_input_thread); + if (err != 0) { + panic("%s: ifp=%p couldn't get a thread. " + "err=%d", __func__, ifp, err); + /* NOTREACHED */ + } #ifdef DLIL_DEBUG - printf("ifnet_attach: dlil thread for ifp=%p if_index=%d\n", ifp, ifp->if_index); + printf("%s: dlil thread for ifp=%p if_index=%d\n", + __func__, ifp, ifp->if_index); #endif - } } + + /* Clear stats (save and restore other fields that we care about) */ + if_data_saved = ifp->if_data; + bzero(&ifp->if_data, sizeof (ifp->if_data)); + ifp->if_data.ifi_type = if_data_saved.ifi_type; + ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen; + ifp->if_data.ifi_physical = if_data_saved.ifi_physical; + ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen; + ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen; + ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu; + ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate; + ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist; + ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu; + ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu; + ifnet_touch_lastchange(ifp); + + /* Record attach PC stacktrace */ + ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach); + + ifp->if_updatemcasts = 0; + if (!LIST_EMPTY(&ifp->if_multiaddrs)) { + struct ifmultiaddr *ifma; + LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + IFMA_LOCK(ifma); + if (ifma->ifma_addr->sa_family == AF_LINK || + ifma->ifma_addr->sa_family == AF_UNSPEC) + ifp->if_updatemcasts++; + IFMA_UNLOCK(ifma); + } + + printf("%s%d: attached with %d suspended link-layer multicast " + "membership(s)\n", ifp->if_name, ifp->if_unit, + ifp->if_updatemcasts); + } + ifnet_lock_done(ifp); ifnet_head_done(); -#if PF + + lck_mtx_lock(&ifp->if_cached_route_lock); + /* Enable forwarding cached route */ + ifp->if_fwd_cacheok = 1; + /* Clean up any existing cached routes */ + if (ifp->if_fwd_route.ro_rt != NULL) + rtfree(ifp->if_fwd_route.ro_rt); + bzero(&ifp->if_fwd_route, sizeof (ifp->if_fwd_route)); + if (ifp->if_src_route.ro_rt != NULL) + rtfree(ifp->if_src_route.ro_rt); + bzero(&ifp->if_src_route, sizeof (ifp->if_src_route)); + if (ifp->if_src_route6.ro_rt != NULL) + rtfree(ifp->if_src_route6.ro_rt); + bzero(&ifp->if_src_route6, sizeof (ifp->if_src_route6)); + lck_mtx_unlock(&ifp->if_cached_route_lock); + + ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE)); + /* - * Attach packet filter to this interface, if enaled. + * Allocate and attach IGMPv3/MLDv2 interface specific variables + * and trees; do this before the ifnet is marked as attached. + * The ifnet keeps the reference to the info structures even after + * the ifnet is detached, since the network-layer records still + * refer to the info structures even after that.
This also + * makes it possible for them to still function after the ifnet + * is recycled or reattached. + */ +#if INET + if (IGMP_IFINFO(ifp) == NULL) { + IGMP_IFINFO(ifp) = igmp_domifattach(ifp, M_WAITOK); + VERIFY(IGMP_IFINFO(ifp) != NULL); + } else { + VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp); + igmp_domifreattach(IGMP_IFINFO(ifp)); + } +#endif /* INET */ +#if INET6 + if (MLD_IFINFO(ifp) == NULL) { + MLD_IFINFO(ifp) = mld_domifattach(ifp, M_WAITOK); + VERIFY(MLD_IFINFO(ifp) != NULL); + } else { + VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp); + mld_domifreattach(MLD_IFINFO(ifp)); + } +#endif /* INET6 */ -#if IFNET_ROUTE_REFCNT + /* + * Finally, mark this ifnet as attached. + */ + lck_mtx_lock(rnh_lock); + ifnet_lock_exclusive(ifp); + lck_mtx_lock_spin(&ifp->if_ref_lock); + ifp->if_refflags = IFRF_ATTACHED; + lck_mtx_unlock(&ifp->if_ref_lock); if (net_rtref) { - (void) ifnet_set_idle_flags(ifp, IFRF_IDLE_NOTIFY, + /* boot-args override; enable idle notification */ + (void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY, IFRF_IDLE_NOTIFY); + } else { + /* apply previous request(s) to set the idle flags, if any */ + (void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags, + ifp->if_idle_new_flags_mask); + } -#endif /* IFNET_ROUTE_REFCNT */ + ifnet_lock_done(ifp); + lck_mtx_unlock(rnh_lock); + +#if PF + /* + * Attach packet filter to this interface, if enabled. + */ + pf_ifnet_hook(ifp, 1); +#endif /* PF */ dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, 0); - return 0; + if (dlil_verbose) { + printf("%s%d: attached%s\n", ifp->if_name, ifp->if_unit, + (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : ""); + } + + return (0); +} + +/* + * Prepare the storage for the first/permanent link address, which must + * have the same lifetime as the ifnet itself. Although the link + * address gets removed from if_addrhead and ifnet_addrs[] at detach time, + * its location in memory must never change as it may still be referred + * to by some parts of the system afterwards (unfortunate implementation + * artifacts inherited from BSD.) + * + * Caller must hold ifnet lock as writer. + */ +static struct ifaddr * +dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr) +{ + struct ifaddr *ifa, *oifa; + struct sockaddr_dl *asdl, *msdl; + char workbuf[IFNAMSIZ*2]; + int namelen, masklen, socksize; + struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp; + + ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE); + VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen); + + namelen = snprintf(workbuf, sizeof (workbuf), "%s%d", + ifp->if_name, ifp->if_unit); + masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + namelen; + socksize = masklen + ifp->if_addrlen; +#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1))) + if ((u_int32_t)socksize < sizeof (struct sockaddr_dl)) + socksize = sizeof (struct sockaddr_dl); + socksize = ROUNDUP(socksize); +#undef ROUNDUP + + ifa = ifp->if_lladdr; + if (socksize > DLIL_SDLMAXLEN || + (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) { + /* + * Rare, but in the event that the link address requires + * more storage space than DLIL_SDLMAXLEN, allocate the + * largest possible storage for address and mask, such + * that we can reuse the same space when if_addrlen grows. + * This same space will be used when if_addrlen shrinks.
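+ * In that case the ifaddr below is carved out of an M_IFADDR
+ * allocation with room for SOCK_MAXADDRLEN-sized address and mask
+ * sockaddrs; like the embedded dl_if_lladdr storage it is treated
+ * as permanent, which is why IFD_ALLOC is deliberately left unset.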
+ */ + if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) { + int ifasize = sizeof (*ifa) + 2 * SOCK_MAXADDRLEN; + ifa = _MALLOC(ifasize, M_IFADDR, M_WAITOK | M_ZERO); + if (ifa == NULL) + return (NULL); + ifa_lock_init(ifa); + /* Don't set IFD_ALLOC, as this is permanent */ + ifa->ifa_debug = IFD_LINK; + } + IFA_LOCK(ifa); + /* address and mask sockaddr_dl locations */ + asdl = (struct sockaddr_dl *)(ifa + 1); + bzero(asdl, SOCK_MAXADDRLEN); + msdl = (struct sockaddr_dl *)((char *)asdl + SOCK_MAXADDRLEN); + bzero(msdl, SOCK_MAXADDRLEN); + } else { + VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa); + /* + * Use the storage areas for address and mask within the + * dlil_ifnet structure. This is the most common case. + */ + if (ifa == NULL) { + ifa = &dl_if->dl_if_lladdr.ifa; + ifa_lock_init(ifa); + /* Don't set IFD_ALLOC, as this is permanent */ + ifa->ifa_debug = IFD_LINK; + } + IFA_LOCK(ifa); + /* address and mask sockaddr_dl locations */ + asdl = (struct sockaddr_dl *)&dl_if->dl_if_lladdr.asdl; + bzero(asdl, sizeof (dl_if->dl_if_lladdr.asdl)); + msdl = (struct sockaddr_dl *)&dl_if->dl_if_lladdr.msdl; + bzero(msdl, sizeof (dl_if->dl_if_lladdr.msdl)); + } + + /* hold a permanent reference for the ifnet itself */ + IFA_ADDREF_LOCKED(ifa); + oifa = ifp->if_lladdr; + ifp->if_lladdr = ifa; + + VERIFY(ifa->ifa_debug == IFD_LINK); + ifa->ifa_ifp = ifp; + ifa->ifa_rtrequest = link_rtrequest; + ifa->ifa_addr = (struct sockaddr *)asdl; + asdl->sdl_len = socksize; + asdl->sdl_family = AF_LINK; + bcopy(workbuf, asdl->sdl_data, namelen); + asdl->sdl_nlen = namelen; + asdl->sdl_index = ifp->if_index; + asdl->sdl_type = ifp->if_type; + if (ll_addr != NULL) { + asdl->sdl_alen = ll_addr->sdl_alen; + bcopy(CONST_LLADDR(ll_addr), LLADDR(asdl), asdl->sdl_alen); + } else { + asdl->sdl_alen = 0; + } + ifa->ifa_netmask = (struct sockaddr*)msdl; + msdl->sdl_len = masklen; + while (namelen != 0) + msdl->sdl_data[--namelen] = 0xff; + IFA_UNLOCK(ifa); + + if (oifa != NULL) + IFA_REMREF(oifa); + + return (ifa); +} + +static void +if_purgeaddrs(struct ifnet *ifp) +{ +#if INET + in_purgeaddrs(ifp); +#endif /* INET */ +#if INET6 + in6_purgeaddrs(ifp); +#endif /* INET6 */ +#if NETAT + at_purgeaddrs(ifp); +#endif } errno_t -ifnet_detach( - ifnet_t ifp) +ifnet_detach(ifnet_t ifp) { - struct ifnet_filter *filter; - struct ifnet_filter *filter_next; - int zeroed = 0; - int retval = 0; - struct ifnet_filter_head fhead; - struct dlil_threading_info *inputthread; - - if (ifp == NULL) return EINVAL; - + if (ifp == NULL) + return (EINVAL); + + ifnet_head_lock_exclusive(); + lck_mtx_lock(rnh_lock); ifnet_lock_exclusive(ifp); - - if ((ifp->if_eflags & IFEF_DETACHING) != 0) { + + /* + * Check to see if this interface has previously triggered + * aggressive protocol draining; if so, decrement the global + * refcnt and clear PR_AGGDRAIN on the route domain if + * there are no more of such an interface around. 
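+ * Passing a zero flags value with an all-ones mask below clears
+ * every idle-notification flag on the interface in a single call.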
+ */ + (void) ifnet_set_idle_flags_locked(ifp, 0, ~0); + + lck_mtx_lock_spin(&ifp->if_ref_lock); + if (!(ifp->if_refflags & IFRF_ATTACHED)) { + lck_mtx_unlock(&ifp->if_ref_lock); + ifnet_lock_done(ifp); + lck_mtx_unlock(rnh_lock); + ifnet_head_done(); + return (EINVAL); + } else if (ifp->if_refflags & IFRF_DETACHING) { /* Interface has already been detached */ + lck_mtx_unlock(&ifp->if_ref_lock); ifnet_lock_done(ifp); - return ENXIO; + lck_mtx_unlock(rnh_lock); + ifnet_head_done(); + return (ENXIO); } - + /* Indicate this interface is being detached */ + ifp->if_refflags &= ~IFRF_ATTACHED; + ifp->if_refflags |= IFRF_DETACHING; + lck_mtx_unlock(&ifp->if_ref_lock); + + if (dlil_verbose) + printf("%s%d: detaching\n", ifp->if_name, ifp->if_unit); + /* - * Indicate this interface is being detached. - * - * This should prevent protocols from attaching - * from this point on. Interface will remain on - * the list until all of the protocols are detached. + * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will + * no longer be visible during lookups from this point. */ - ifp->if_eflags |= IFEF_DETACHING; + VERIFY(ifindex2ifnet[ifp->if_index] == ifp); + TAILQ_REMOVE(&ifnet_head, ifp, if_link); + ifp->if_link.tqe_next = NULL; + ifp->if_link.tqe_prev = NULL; + ifindex2ifnet[ifp->if_index] = NULL; + + /* Record detach PC stacktrace */ + ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach); + ifnet_lock_done(ifp); - - dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0); - + lck_mtx_unlock(rnh_lock); + ifnet_head_done(); + /* Let BPF know we're detaching */ bpfdetach(ifp); - -#if IFNET_ROUTE_REFCNT + + /* Mark the interface as DOWN */ + if_down(ifp); + + /* Disable forwarding cached route */ + lck_mtx_lock(&ifp->if_cached_route_lock); + ifp->if_fwd_cacheok = 0; + lck_mtx_unlock(&ifp->if_cached_route_lock); + /* - * Check to see if this interface has previously triggered - * aggressive protocol draining; if so, decrement the global - * refcnt and clear PR_AGGDRAIN on the route domain if - * there are no more of such an interface around. + * Drain any deferred IGMPv3/MLDv2 query responses, but keep the + * references to the info structures and leave them attached to + * this ifnet. 
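+ * The attach path depends on this: igmp_domifreattach() and
+ * mld_domifreattach() revive these same info structures if a
+ * recycled ifnet is attached again.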
*/ - if (ifp->if_want_aggressive_drain != 0) - (void) ifnet_set_idle_flags(ifp, 0, ~0); -#endif /* IFNET_ROUTE_REFCNT */ - - if ((retval = dlil_write_begin()) != 0) { - if (retval == EDEADLK) { - retval = 0; - - /* We need to perform a delayed detach */ - ifp->if_delayed_detach = 1; - dlil_detach_waiting = 1; - wakeup(&dlil_detach_waiting); +#if INET + igmp_domifdetach(ifp); +#endif /* INET */ +#if INET6 + mld_domifdetach(ifp); +#endif /* INET6 */ + + dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, 0); + + /* Let worker thread take care of the rest, to avoid reentrancy */ + lck_mtx_lock(&dlil_ifnet_lock); + ifnet_detaching_enqueue(ifp); + lck_mtx_unlock(&dlil_ifnet_lock); + + return (0); +} + +static void +ifnet_detaching_enqueue(struct ifnet *ifp) +{ + lck_mtx_assert(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED); + + ++ifnet_detaching_cnt; + VERIFY(ifnet_detaching_cnt != 0); + TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link); + wakeup((caddr_t)&ifnet_delayed_run); +} + +static struct ifnet * +ifnet_detaching_dequeue(void) +{ + struct ifnet *ifp; + + lck_mtx_assert(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED); + + ifp = TAILQ_FIRST(&ifnet_detaching_head); + VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL); + if (ifp != NULL) { + VERIFY(ifnet_detaching_cnt != 0); + --ifnet_detaching_cnt; + TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link); + ifp->if_detaching_link.tqe_next = NULL; + ifp->if_detaching_link.tqe_prev = NULL; + } + return (ifp); +} + +static void +ifnet_delayed_thread_func(void) +{ + struct ifnet *ifp; + + for (;;) { + lck_mtx_lock(&dlil_ifnet_lock); + while (ifnet_detaching_cnt == 0) { + (void) msleep(&ifnet_delayed_run, &dlil_ifnet_lock, + (PZERO - 1), "ifnet_delayed_thread", NULL); + } + + VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL); + + /* Take care of detaching ifnet */ + ifp = ifnet_detaching_dequeue(); + if (ifp != NULL) { + lck_mtx_unlock(&dlil_ifnet_lock); + ifnet_detach_final(ifp); + } else { + lck_mtx_unlock(&dlil_ifnet_lock); } - return retval; } +} -#if PF - /* - * Detach this interface from packet filter, if enabled. 
+static void +ifnet_detach_final(struct ifnet *ifp) +{ + struct ifnet_filter *filter, *filter_next; + struct ifnet_filter_head fhead; + struct dlil_threading_info *inputthread; + struct ifaddr *ifa; + ifnet_detached_func if_free; + int i; + + lck_mtx_lock(&ifp->if_ref_lock); + if (!(ifp->if_refflags & IFRF_DETACHING)) { + panic("%s: flags mismatch (detaching not set) ifp=%p", + __func__, ifp); + /* NOTREACHED */ + } + + /* Wait until the existing IO references get released + * before we proceed with ifnet_detach - */ - pf_ifnet_hook(ifp, 0); -#endif /* PF */ + while (ifp->if_refio > 0) { + printf("%s: Waiting for IO references on %s%d interface " + "to be released\n", __func__, ifp->if_name, ifp->if_unit); + (void) msleep(&(ifp->if_refio), &ifp->if_ref_lock, + (PZERO - 1), "ifnet_ioref_wait", NULL); + } + lck_mtx_unlock(&ifp->if_ref_lock); + + /* Detach interface filters */ + lck_mtx_lock(&ifp->if_flt_lock); + if_flt_monitor_enter(ifp); - /* Steal the list of interface filters */ + lck_mtx_assert(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED); fhead = ifp->if_flt_head; TAILQ_INIT(&ifp->if_flt_head); - /* unuse the interface */ - zeroed = ifp_unuse(ifp); + for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) { + filter_next = TAILQ_NEXT(filter, filt_next); + lck_mtx_unlock(&ifp->if_flt_lock); + + dlil_detach_filter_internal(filter, 1); + lck_mtx_lock(&ifp->if_flt_lock); + } + if_flt_monitor_leave(ifp); + lck_mtx_unlock(&ifp->if_flt_lock); + + /* Tell upper layers to drop their network addresses */ + if_purgeaddrs(ifp); + + ifnet_lock_exclusive(ifp); + + /* Unplumb all protocols */ + for (i = 0; i < PROTO_HASH_SLOTS; i++) { + struct if_proto *proto; + + proto = SLIST_FIRST(&ifp->if_proto_hash[i]); + while (proto != NULL) { + protocol_family_t family = proto->protocol_family; + ifnet_lock_done(ifp); + proto_unplumb(family, ifp); + ifnet_lock_exclusive(ifp); + proto = SLIST_FIRST(&ifp->if_proto_hash[i]); + } + /* There should not be any protocols left */ + VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i])); + } + zfree(dlif_phash_zone, ifp->if_proto_hash); + ifp->if_proto_hash = NULL; + + /* Detach (permanent) link address from if_addrhead */ + ifa = TAILQ_FIRST(&ifp->if_addrhead); + VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa); + IFA_LOCK(ifa); + if_detach_link_ifa(ifp, ifa); + IFA_UNLOCK(ifa); + + /* Remove (permanent) link address from ifnet_addrs[] */ + IFA_REMREF(ifa); + ifnet_addrs[ifp->if_index - 1] = NULL; + + /* This interface should not be on {ifnet_head,detaching} */ + VERIFY(ifp->if_link.tqe_next == NULL); + VERIFY(ifp->if_link.tqe_prev == NULL); + VERIFY(ifp->if_detaching_link.tqe_next == NULL); + VERIFY(ifp->if_detaching_link.tqe_prev == NULL); + + /* Prefix list should be empty by now */ + VERIFY(TAILQ_EMPTY(&ifp->if_prefixhead)); + + /* The slot should have been emptied */ + VERIFY(ifindex2ifnet[ifp->if_index] == NULL); + + /* There should not be any addresses left */ + VERIFY(TAILQ_EMPTY(&ifp->if_addrhead)); /* * If thread affinity was set for the workloop thread, we will need @@ -2853,16 +3418,19 @@ ifnet_detach( if (inputthread->net_affinity) { struct thread *tp; - if (inputthread == dlil_lo_thread_ptr) - panic("Thread affinity should not be enabled " - "on the loopback dlil input thread\n"); + if (inputthread == dlil_lo_thread_ptr) { + panic("%s: Thread affinity should not be " + "enabled on the loopback dlil input " + "thread", __func__); + /* NOTREACHED */ + } - lck_mtx_lock(inputthread->input_lck); + lck_mtx_lock_spin(&inputthread->input_lck); tp = 
inputthread->workloop_thread; inputthread->workloop_thread = NULL; inputthread->tag = 0; inputthread->net_affinity = FALSE; - lck_mtx_unlock(inputthread->input_lck); + lck_mtx_unlock(&inputthread->input_lck); /* Tear down workloop thread affinity */ if (tp != NULL) { @@ -2882,183 +3450,290 @@ ifnet_detach( if (inputthread != dlil_lo_thread_ptr) { #ifdef DLIL_DEBUG - printf("ifnet_detach: wakeup thread threadinfo: %p " + printf("%s: wakeup thread threadinfo: %p " "input_thread=%p threads: cur=%d max=%d\n", - inputthread, inputthread->input_thread, + __func__, inputthread, inputthread->input_thread, dlil_multithreaded_input, cur_dlil_input_threads); #endif - lck_mtx_lock(inputthread->input_lck); + lck_mtx_lock_spin(&inputthread->input_lck); inputthread->input_waiting |= DLIL_INPUT_TERMINATE; - if ((inputthread->input_waiting & DLIL_INPUT_RUNNING) == 0) { + if (!(inputthread->input_waiting & DLIL_INPUT_RUNNING)) wakeup((caddr_t)&inputthread->input_waiting); - } - lck_mtx_unlock(inputthread->input_lck); + + lck_mtx_unlock(&inputthread->input_lck); } } - /* last chance to clean up IPv4 forwarding cached route */ - lck_mtx_lock(ifp->if_fwd_route_lock); - if (ifp->if_fwd_route.ro_rt != NULL) { + + /* The driver might unload, so point these to ourselves */ + if_free = ifp->if_free; + ifp->if_output = ifp_if_output; + ifp->if_ioctl = ifp_if_ioctl; + ifp->if_set_bpf_tap = ifp_if_set_bpf_tap; + ifp->if_free = ifp_if_free; + ifp->if_demux = ifp_if_demux; + ifp->if_event = ifp_if_event; + ifp->if_framer = ifp_if_framer; + ifp->if_add_proto = ifp_if_add_proto; + ifp->if_del_proto = ifp_if_del_proto; + ifp->if_check_multi = ifp_if_check_multi; + + ifnet_lock_done(ifp); + +#if PF + /* + * Detach this interface from packet filter, if enabled. + */ + pf_ifnet_hook(ifp, 0); +#endif /* PF */ + + /* Filter list should be empty */ + lck_mtx_lock_spin(&ifp->if_flt_lock); + VERIFY(TAILQ_EMPTY(&ifp->if_flt_head)); + VERIFY(ifp->if_flt_busy == 0); + VERIFY(ifp->if_flt_waiters == 0); + lck_mtx_unlock(&ifp->if_flt_lock); + + /* Last chance to cleanup any cached route */ + lck_mtx_lock(&ifp->if_cached_route_lock); + VERIFY(!ifp->if_fwd_cacheok); + if (ifp->if_fwd_route.ro_rt != NULL) rtfree(ifp->if_fwd_route.ro_rt); - ifp->if_fwd_route.ro_rt = NULL; - } - lck_mtx_unlock(ifp->if_fwd_route_lock); - dlil_write_end(); - - for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) { - filter_next = TAILQ_NEXT(filter, filt_next); - dlil_detach_filter_internal(filter, 1); - } - - if (zeroed != 0) { - ifp_use_reached_zero(ifp); + bzero(&ifp->if_fwd_route, sizeof (ifp->if_fwd_route)); + if (ifp->if_src_route.ro_rt != NULL) + rtfree(ifp->if_src_route.ro_rt); + bzero(&ifp->if_src_route, sizeof (ifp->if_src_route)); + if (ifp->if_src_route6.ro_rt != NULL) + rtfree(ifp->if_src_route6.ro_rt); + bzero(&ifp->if_src_route6, sizeof (ifp->if_src_route6)); + lck_mtx_unlock(&ifp->if_cached_route_lock); + + ifnet_llreach_ifdetach(ifp); + + dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, 0); + + if (if_free != NULL) + if_free(ifp); + + /* + * Finally, mark this ifnet as detached. 
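+ * As on the attach side, the flag transition happens under
+ * if_ref_lock, and a missing IFRF_DETACHING here is treated as
+ * a fatal inconsistency.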
+ */ + lck_mtx_lock_spin(&ifp->if_ref_lock); + if (!(ifp->if_refflags & IFRF_DETACHING)) { + panic("%s: flags mismatch (detaching not set) ifp=%p", + __func__, ifp); + /* NOTREACHED */ } - - return retval; + ifp->if_refflags &= ~IFRF_DETACHING; + lck_mtx_unlock(&ifp->if_ref_lock); + + if (dlil_verbose) + printf("%s%d: detached\n", ifp->if_name, ifp->if_unit); + + /* Release reference held during ifnet attach */ + ifnet_release(ifp); } static errno_t -dlil_recycle_ioctl( - __unused ifnet_t ifnet_ptr, - __unused u_long ioctl_code, - __unused void *ioctl_arg) +ifp_if_output(struct ifnet *ifp, struct mbuf *m) { - return EOPNOTSUPP; +#pragma unused(ifp) + m_freem(m); + return (0); } -static int -dlil_recycle_output( - __unused struct ifnet *ifnet_ptr, - struct mbuf *m) +static errno_t +ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf) { - m_freem(m); - return 0; +#pragma unused(ifp, fh, pf) + m_freem(m); + return (EJUSTRETURN); } -static void -dlil_recycle_free( - __unused ifnet_t ifnet_ptr) +static errno_t +ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf, + const struct ifnet_demux_desc *da, u_int32_t dc) { +#pragma unused(ifp, pf, da, dc) + return (EINVAL); } static errno_t -dlil_recycle_set_bpf_tap( - __unused ifnet_t ifp, - __unused bpf_tap_mode mode, - __unused bpf_packet_func callback) +ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf) { - /* XXX not sure what to do here */ - return 0; +#pragma unused(ifp, pf) + return (EINVAL); +} + +static errno_t +ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa) +{ +#pragma unused(ifp, sa) + return (EOPNOTSUPP); +} + +static errno_t +ifp_if_framer(struct ifnet *ifp, struct mbuf **m, + const struct sockaddr *sa, const char *ll, const char *t) +{ +#pragma unused(ifp, m, sa, ll, t) + m_freem(*m); + *m = NULL; + return (EJUSTRETURN); +} + +static errno_t +ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg) +{ +#pragma unused(ifp, cmd, arg) + return (EOPNOTSUPP); +} + +static errno_t +ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f) +{ +#pragma unused(ifp, tm, f) + /* XXX not sure what to do here */ + return (0); +} + +static void +ifp_if_free(struct ifnet *ifp) +{ +#pragma unused(ifp) +} + +static void +ifp_if_event(struct ifnet *ifp, const struct kev_msg *e) +{ +#pragma unused(ifp, e) } __private_extern__ -int dlil_if_acquire( - u_int32_t family, - const void *uniqueid, - size_t uniqueid_len, - struct ifnet **ifp) -{ - struct ifnet *ifp1 = NULL; - struct dlil_ifnet *dlifp1 = NULL; - int ret = 0; - - lck_mtx_lock(dlil_ifnet_mutex); - TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) { - - ifp1 = (struct ifnet *)dlifp1; - - if (ifp1->if_family == family) { - - /* same uniqueid and same len or no unique id specified */ - if ((uniqueid_len == dlifp1->if_uniqueid_len) - && !bcmp(uniqueid, dlifp1->if_uniqueid, uniqueid_len)) { - - /* check for matching interface in use */ - if (ifp1->if_eflags & IFEF_INUSE) { - if (uniqueid_len) { - ret = EBUSY; - goto end; - } - } - else { - if (!ifp1->if_lock) - panic("ifp's lock is gone\n"); - ifnet_lock_exclusive(ifp1); - ifp1->if_eflags |= (IFEF_INUSE | IFEF_REUSE); - ifnet_lock_done(ifp1); - *ifp = ifp1; +int dlil_if_acquire(u_int32_t family, const void *uniqueid, + size_t uniqueid_len, struct ifnet **ifp) +{ + struct ifnet *ifp1 = NULL; + struct dlil_ifnet *dlifp1 = NULL; + void *buf, *base, **pbuf; + int ret = 0; + + lck_mtx_lock(&dlil_ifnet_lock); + TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) { + ifp1 = (struct 
ifnet *)dlifp1; + + if (ifp1->if_family != family) + continue; + + lck_mtx_lock(&dlifp1->dl_if_lock); + /* same uniqueid and same len or no unique id specified */ + if ((uniqueid_len == dlifp1->dl_if_uniqueid_len) && + !bcmp(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len)) { + /* check for matching interface in use */ + if (dlifp1->dl_if_flags & DLIF_INUSE) { + if (uniqueid_len) { + ret = EBUSY; + lck_mtx_unlock(&dlifp1->dl_if_lock); goto end; - } - } - } - } - - /* no interface found, allocate a new one */ - MALLOC(dlifp1, struct dlil_ifnet *, sizeof(*dlifp1), M_NKE, M_WAITOK); - if (dlifp1 == 0) { - ret = ENOMEM; - goto end; - } - - bzero(dlifp1, sizeof(*dlifp1)); - - if (uniqueid_len) { - MALLOC(dlifp1->if_uniqueid, void *, uniqueid_len, M_NKE, M_WAITOK); - if (dlifp1->if_uniqueid == 0) { - FREE(dlifp1, M_NKE); - ret = ENOMEM; - goto end; - } - bcopy(uniqueid, dlifp1->if_uniqueid, uniqueid_len); - dlifp1->if_uniqueid_len = uniqueid_len; - } - - ifp1 = (struct ifnet *)dlifp1; - ifp1->if_eflags |= IFEF_INUSE; - ifp1->if_name = dlifp1->if_namestorage; + } + } else { + dlifp1->dl_if_flags |= (DLIF_INUSE|DLIF_REUSE); + lck_mtx_unlock(&dlifp1->dl_if_lock); + *ifp = ifp1; + goto end; + } + } + lck_mtx_unlock(&dlifp1->dl_if_lock); + } + + /* no interface found, allocate a new one */ + buf = zalloc(dlif_zone); + if (buf == NULL) { + ret = ENOMEM; + goto end; + } + bzero(buf, dlif_bufsize); + + /* Get the 64-bit aligned base address for this object */ + base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t), + sizeof (u_int64_t)); + VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize)); + + /* + * Wind back a pointer size from the aligned base and + * save the original address so we can free it later. + */ + pbuf = (void **)((intptr_t)base - sizeof (void *)); + *pbuf = buf; + dlifp1 = base; + + if (uniqueid_len) { + MALLOC(dlifp1->dl_if_uniqueid, void *, uniqueid_len, + M_NKE, M_WAITOK); + if (dlifp1->dl_if_uniqueid == NULL) { + zfree(dlif_zone, dlifp1); + ret = ENOMEM; + goto end; + } + bcopy(uniqueid, dlifp1->dl_if_uniqueid, uniqueid_len); + dlifp1->dl_if_uniqueid_len = uniqueid_len; + } + + ifp1 = (struct ifnet *)dlifp1; + dlifp1->dl_if_flags = DLIF_INUSE; + if (ifnet_debug) { + dlifp1->dl_if_flags |= DLIF_DEBUG; + dlifp1->dl_if_trace = dlil_if_trace; + } + ifp1->if_name = dlifp1->dl_if_namestorage; #if CONFIG_MACF_NET - mac_ifnet_label_init(ifp1); + mac_ifnet_label_init(ifp1); #endif - TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link); - - *ifp = ifp1; + lck_mtx_init(&dlifp1->dl_if_lock, ifnet_lock_group, ifnet_lock_attr); + lck_rw_init(&ifp1->if_lock, ifnet_lock_group, ifnet_lock_attr); + lck_mtx_init(&ifp1->if_ref_lock, ifnet_lock_group, ifnet_lock_attr); + lck_mtx_init(&ifp1->if_flt_lock, ifnet_lock_group, ifnet_lock_attr); + lck_mtx_init(&ifp1->if_cached_route_lock, ifnet_lock_group, + ifnet_lock_attr); + lck_mtx_init(&ifp1->if_addrconfig_lock, ifnet_lock_group, + ifnet_lock_attr); + lck_rw_init(&ifp1->if_llreach_lock, ifnet_lock_group, ifnet_lock_attr); + + TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link); + + *ifp = ifp1; end: - lck_mtx_unlock(dlil_ifnet_mutex); + lck_mtx_unlock(&dlil_ifnet_lock); - return ret; + VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof (u_int64_t)) && + IS_P2ALIGNED(&ifp1->if_data, sizeof (u_int64_t)))); + + return (ret); } __private_extern__ void -dlil_if_release( - ifnet_t ifp) -{ - struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp; - - /* Interface does not have a lock until it is attached - radar 3713951 */ - if 
(ifp->if_lock) - ifnet_lock_exclusive(ifp); - ifp->if_eflags &= ~IFEF_INUSE; - ifp->if_ioctl = dlil_recycle_ioctl; - ifp->if_output = dlil_recycle_output; - ifp->if_free = dlil_recycle_free; - ifp->if_set_bpf_tap = dlil_recycle_set_bpf_tap; - - strncpy(dlifp->if_namestorage, ifp->if_name, IFNAMSIZ); - ifp->if_name = dlifp->if_namestorage; +dlil_if_release(ifnet_t ifp) +{ + struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp; + + ifnet_lock_exclusive(ifp); + lck_mtx_lock(&dlifp->dl_if_lock); + dlifp->dl_if_flags &= ~DLIF_INUSE; + strncpy(dlifp->dl_if_namestorage, ifp->if_name, IFNAMSIZ); + ifp->if_name = dlifp->dl_if_namestorage; + lck_mtx_unlock(&dlifp->dl_if_lock); #if CONFIG_MACF_NET - /* - * We can either recycle the MAC label here or in dlil_if_acquire(). - * It seems logical to do it here but this means that anything that - * still has a handle on ifp will now see it as unlabeled. - * Since the interface is "dead" that may be OK. Revisit later. - */ - mac_ifnet_label_recycle(ifp); + /* + * We can either recycle the MAC label here or in dlil_if_acquire(). + * It seems logical to do it here but this means that anything that + * still has a handle on ifp will now see it as unlabeled. + * Since the interface is "dead" that may be OK. Revisit later. + */ + mac_ifnet_label_recycle(ifp); #endif - if (ifp->if_lock) - ifnet_lock_done(ifp); - + ifnet_lock_done(ifp); } __private_extern__ void @@ -3081,3 +3756,138 @@ dlil_proto_unplumb_all(struct ifnet *ifp) (void) proto_unplumb(PF_APPLETALK, ifp); #endif /* NETAT */ } + +static void +ifp_src_route_copyout(struct ifnet *ifp, struct route *dst) +{ + lck_mtx_lock_spin(&ifp->if_cached_route_lock); + lck_mtx_convert_spin(&ifp->if_cached_route_lock); + + route_copyout(dst, &ifp->if_src_route, sizeof (*dst)); + + lck_mtx_unlock(&ifp->if_cached_route_lock); +} + +static void +ifp_src_route_copyin(struct ifnet *ifp, struct route *src) +{ + lck_mtx_lock_spin(&ifp->if_cached_route_lock); + lck_mtx_convert_spin(&ifp->if_cached_route_lock); + + if (ifp->if_fwd_cacheok) { + route_copyin(src, &ifp->if_src_route, sizeof (*src)); + } else { + rtfree(src->ro_rt); + src->ro_rt = NULL; + } + lck_mtx_unlock(&ifp->if_cached_route_lock); +} + +#if INET6 +static void +ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst) +{ + lck_mtx_lock_spin(&ifp->if_cached_route_lock); + lck_mtx_convert_spin(&ifp->if_cached_route_lock); + + route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6, + sizeof (*dst)); + + lck_mtx_unlock(&ifp->if_cached_route_lock); +} + +static void +ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src) +{ + lck_mtx_lock_spin(&ifp->if_cached_route_lock); + lck_mtx_convert_spin(&ifp->if_cached_route_lock); + + if (ifp->if_fwd_cacheok) { + route_copyin((struct route *)src, + (struct route *)&ifp->if_src_route6, sizeof (*src)); + } else { + rtfree(src->ro_rt); + src->ro_rt = NULL; + } + lck_mtx_unlock(&ifp->if_cached_route_lock); +} +#endif /* INET6 */ + +struct rtentry * +ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip) +{ + struct route src_rt; + struct sockaddr_in *dst = (struct sockaddr_in *)(&src_rt.ro_dst); + + ifp_src_route_copyout(ifp, &src_rt); + + if (src_rt.ro_rt == NULL || !(src_rt.ro_rt->rt_flags & RTF_UP) || + src_ip.s_addr != dst->sin_addr.s_addr || + src_rt.ro_rt->generation_id != route_generation) { + if (src_rt.ro_rt != NULL) { + rtfree(src_rt.ro_rt); + src_rt.ro_rt = NULL; + } else if (dst->sin_family != AF_INET) { + bzero(&src_rt.ro_dst, sizeof (src_rt.ro_dst)); + dst->sin_len = 
sizeof (src_rt.ro_dst); + dst->sin_family = AF_INET; + } + dst->sin_addr = src_ip; + + if (src_rt.ro_rt == NULL) { + src_rt.ro_rt = rtalloc1_scoped((struct sockaddr *)dst, + 0, 0, ifp->if_index); + + if (src_rt.ro_rt != NULL) { + /* retain a ref, copyin consumes one */ + struct rtentry *rte = src_rt.ro_rt; + RT_ADDREF(rte); + ifp_src_route_copyin(ifp, &src_rt); + src_rt.ro_rt = rte; + } + } + } + + return (src_rt.ro_rt); +} + +#if INET6 +struct rtentry* +ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6) +{ + struct route_in6 src_rt; + + ifp_src_route6_copyout(ifp, &src_rt); + + if (src_rt.ro_rt == NULL || !(src_rt.ro_rt->rt_flags & RTF_UP) || + !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr) || + src_rt.ro_rt->generation_id != route_generation) { + if (src_rt.ro_rt != NULL) { + rtfree(src_rt.ro_rt); + src_rt.ro_rt = NULL; + } else if (src_rt.ro_dst.sin6_family != AF_INET6) { + bzero(&src_rt.ro_dst, sizeof (src_rt.ro_dst)); + src_rt.ro_dst.sin6_len = sizeof (src_rt.ro_dst); + src_rt.ro_dst.sin6_family = AF_INET6; + } + src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6); + src_rt.ro_dst.sin6_addr = *src_ip6; + + if (src_rt.ro_rt == NULL) { + src_rt.ro_rt = rtalloc1_scoped( + (struct sockaddr *)&src_rt.ro_dst, 0, 0, + ifp->if_index); + + if (src_rt.ro_rt != NULL) { + /* retain a ref, copyin consumes one */ + struct rtentry *rte = src_rt.ro_rt; + RT_ADDREF(rte); + ifp_src_route6_copyin(ifp, &src_rt); + src_rt.ro_rt = rte; + } + } + } + + return (src_rt.ro_rt); +} +#endif /* INET6 */ diff --git a/bsd/net/dlil.h b/bsd/net/dlil.h index e91dc7a01..db1060db8 100644 --- a/bsd/net/dlil.h +++ b/bsd/net/dlil.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 Apple Inc. All rights reserved. + * Copyright (c) 1999-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,12 +25,6 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * Copyright (c) 1999 Apple Computer, Inc. - * - * Data Link Inteface Layer - * Author: Ted Walker - */ #ifndef DLIL_H #define DLIL_H #ifdef KERNEL @@ -82,22 +76,22 @@ struct sockaddr_dl; #endif -#ifdef BSD_KERNEL_PRIVATE -struct ifnet_stat_increment_param; struct iff_filter; +#define DLIL_THREADNAME_LEN 32 + struct dlil_threading_info { - mbuf_t mbuf_head; /* start of mbuf list from if */ - mbuf_t mbuf_tail; - u_int32_t mbuf_count; + decl_lck_mtx_data(, input_lck); + lck_grp_t *lck_grp; /* lock group (for lock stats) */ + mbuf_t mbuf_head; /* start of mbuf list from if */ + mbuf_t mbuf_tail; + u_int32_t mbuf_count; boolean_t net_affinity; /* affinity set is available */ - u_int32_t input_waiting; /* DLIL condition of thread */ + u_int32_t input_waiting; /* DLIL condition of thread */ struct thread *input_thread; /* thread data for this input */ struct thread *workloop_thread; /* current workloop thread */ u_int32_t tag; /* current affinity tag */ - lck_mtx_t *input_lck; - lck_grp_t *lck_grp; /* lock group (for lock stats) */ - char input_name[32]; + char input_name[DLIL_THREADNAME_LEN]; #if IFNET_INPUT_SANITY_CHK u_int32_t input_wake_cnt; /* number of times the thread was awaken with packets to process */ u_long input_mbuf_cnt; /* total number of mbuf packets processed by this thread */ @@ -105,8 +99,8 @@ struct dlil_threading_info { }; /* - The following are shared with kpi_protocol.c so that it may wakeup - the input thread to run through packets queued for protocol input. 
+ * The following are shared with kpi_protocol.c so that it may wakeup + * the input thread to run through packets queued for protocol input. */ #define DLIL_INPUT_RUNNING 0x80000000 #define DLIL_INPUT_WAITING 0x40000000 @@ -114,79 +108,52 @@ struct dlil_threading_info { #define DLIL_PROTO_WAITING 0x10000000 #define DLIL_INPUT_TERMINATE 0x08000000 -void dlil_init(void); +extern void dlil_init(void); -errno_t dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, - bpf_packet_func callback); +extern errno_t dlil_set_bpf_tap(ifnet_t, bpf_tap_mode, bpf_packet_func); /* - * Send arp internal bypasses the check for - * IPv4LL. + * Send arp internal bypasses the check for IPv4LL. */ -errno_t -dlil_send_arp_internal( - ifnet_t ifp, - u_int16_t arpop, - const struct sockaddr_dl* sender_hw, - const struct sockaddr* sender_proto, - const struct sockaddr_dl* target_hw, - const struct sockaddr* target_proto); - -int -dlil_output( - ifnet_t ifp, - protocol_family_t proto_family, - mbuf_t packetlist, - void *route, - const struct sockaddr *dest, - int raw); - -errno_t -dlil_resolve_multi( - struct ifnet *ifp, - const struct sockaddr *proto_addr, - struct sockaddr *ll_addr, - size_t ll_len); - -errno_t -dlil_send_arp( - ifnet_t ifp, - u_int16_t arpop, - const struct sockaddr_dl* sender_hw, - const struct sockaddr* sender_proto, - const struct sockaddr_dl* target_hw, - const struct sockaddr* target_proto); - -int dlil_attach_filter(ifnet_t ifp, const struct iff_filter *if_filter, - interface_filter_t *filter_ref); -void dlil_detach_filter(interface_filter_t filter); -int dlil_detach_protocol(ifnet_t ifp, u_int32_t protocol); -extern void dlil_proto_unplumb_all(ifnet_t); +extern errno_t dlil_send_arp_internal(ifnet_t, u_int16_t, + const struct sockaddr_dl *, const struct sockaddr *, + const struct sockaddr_dl *, const struct sockaddr *); -#endif /* BSD_KERNEL_PRIVATE */ +extern int dlil_output(ifnet_t, protocol_family_t, mbuf_t, void *, + const struct sockaddr *, int); -void -dlil_post_msg(struct ifnet *ifp,u_int32_t event_subclass, u_int32_t event_code, - struct net_event_data *event_data, u_int32_t event_data_len); +extern void dlil_input_packet_list(struct ifnet *, struct mbuf *); -/* - * dlil_if_acquire is obsolete. Use ifnet_allocate. - */ +extern errno_t dlil_resolve_multi(struct ifnet *, + const struct sockaddr *, struct sockaddr *, size_t); + +extern errno_t dlil_send_arp(ifnet_t, u_int16_t, const struct sockaddr_dl *, + const struct sockaddr *, const struct sockaddr_dl *, + const struct sockaddr *); -int dlil_if_acquire(u_int32_t family, const void *uniqueid, size_t uniqueid_len, - struct ifnet **ifp); - +extern int dlil_attach_filter(ifnet_t, const struct iff_filter *, + interface_filter_t *); +extern void dlil_detach_filter(interface_filter_t); -/* +extern void dlil_proto_unplumb_all(ifnet_t); + +extern void dlil_post_msg(struct ifnet *, u_int32_t, u_int32_t, + struct net_event_data *, u_int32_t); + +/* + * dlil_if_acquire is obsolete. Use ifnet_allocate. + */ +extern int dlil_if_acquire(u_int32_t, const void *, size_t, struct ifnet **); +/* * dlil_if_release is obsolete. The equivalent is called automatically when * an interface is detached. 
*/ +extern void dlil_if_release(struct ifnet *ifp); -void dlil_if_release(struct ifnet *ifp); - -#if IFNET_ROUTE_REFCNT extern u_int32_t ifnet_aggressive_drainers; -#endif /* IFNET_ROUTE_REFCNT */ + +extern errno_t dlil_if_ref(struct ifnet *); +extern errno_t dlil_if_free(struct ifnet *); #endif /* KERNEL_PRIVATE */ #endif /* KERNEL */ diff --git a/bsd/net/ether_if_module.c b/bsd/net/ether_if_module.c index fc1d9e4cf..a1cbfb3d1 100644 --- a/bsd/net/ether_if_module.c +++ b/bsd/net/ether_if_module.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -98,6 +98,9 @@ #include <sys/socketvar.h> #include <net/if_vlan_var.h> #include <net/if_bond_var.h> +#if IF_BRIDGE +#include <net/if_bridgevar.h> +#endif /* IF_BRIDGE */ #include <net/dlil.h> @@ -133,7 +136,7 @@ struct en_desc { #endif /* - * Header for the demux list, hangs off of IFP at family_cookie + * Header for the demux list, hangs off of IFP at if_family_cookie */ struct ether_desc_blk_str { @@ -147,19 +150,6 @@ struct ether_desc_blk_str { __private_extern__ u_char etherbroadcastaddr[ETHER_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; -static __inline__ int -_ether_cmp(const void * a, const void * b) -{ - const u_int16_t * a_s = (const u_int16_t *)a; - const u_int16_t * b_s = (const u_int16_t *)b; - - if (a_s[0] != b_s[0] - || a_s[1] != b_s[1] - || a_s[2] != b_s[2]) { - return (1); - } - return (0); -} /* * Release all descriptor entries owned by this protocol (there may be several). @@ -171,7 +161,7 @@ ether_del_proto( ifnet_t ifp, protocol_family_t protocol_family) { - struct ether_desc_blk_str *desc_blk = (struct ether_desc_blk_str *)ifp->family_cookie; + struct ether_desc_blk_str *desc_blk = (struct ether_desc_blk_str *)ifp->if_family_cookie; u_int32_t current = 0; int found = 0; @@ -187,8 +177,8 @@ ether_del_proto( } if (desc_blk->n_used == 0) { - FREE(ifp->family_cookie, M_IFADDR); - ifp->family_cookie = 0; + FREE(ifp->if_family_cookie, M_IFADDR); + ifp->if_family_cookie = 0; } else { /* Decrement n_max_used */ @@ -207,7 +197,7 @@ ether_add_proto_internal( const struct ifnet_demux_desc *demux) { struct en_desc *ed; - struct ether_desc_blk_str *desc_blk = (struct ether_desc_blk_str *)ifp->family_cookie; + struct ether_desc_blk_str *desc_blk = (struct ether_desc_blk_str *)ifp->if_family_cookie; u_int32_t i; switch (demux->type) { @@ -291,7 +281,7 @@ ether_add_proto_internal( FREE(desc_blk, M_IFADDR); } desc_blk = tmp; - ifp->family_cookie = (uintptr_t)desc_blk; + ifp->if_family_cookie = (uintptr_t)desc_blk; desc_blk->n_count = new_count; } else { @@ -372,7 +362,7 @@ ether_demux( u_int16_t type; u_int8_t *data; u_int32_t i = 0; - struct ether_desc_blk_str *desc_blk = (struct ether_desc_blk_str *)ifp->family_cookie; + struct ether_desc_blk_str *desc_blk = (struct ether_desc_blk_str *)ifp->if_family_cookie; u_int32_t maxd = desc_blk ? desc_blk->n_max_used : 0; struct en_desc *ed = desc_blk ? desc_blk->block_ptr : NULL; u_int32_t extProto1 = 0; @@ -386,6 +376,16 @@ ether_demux( m->m_flags |= M_MCAST; } + if (m->m_flags & M_HASFCS) { + /* + * If the M_HASFCS is set by the driver we want to make sure + * that we strip off the trailing FCS data before handing it + * up the stack. 
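+ * (m_adj() with a negative length trims from the tail of the
+ * chain, so this strips the ETHER_CRC_LEN bytes of FCS; the flag
+ * is cleared afterwards so the frame cannot be trimmed twice.)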
+ */ + m_adj(m, -ETHER_CRC_LEN); + m->m_flags &= ~M_HASFCS; + } + if (ifp->if_eflags & IFEF_BOND) { /* if we're bonded, bond "protocol" gets all the packets */ *protocol_family = PF_BOND; @@ -632,6 +632,9 @@ __private_extern__ int ether_family_init(void) #if BOND bond_family_init(); #endif /* BOND */ +#if IF_BRIDGE + bridgeattach(0); +#endif /* IF_BRIDGE */ done: diff --git a/bsd/net/ether_inet6_pr_module.c b/bsd/net/ether_inet6_pr_module.c index 371cccfd6..e8411dec6 100644 --- a/bsd/net/ether_inet6_pr_module.c +++ b/bsd/net/ether_inet6_pr_module.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -59,8 +59,6 @@ * */ - - #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> @@ -69,6 +67,7 @@ #include <sys/socket.h> #include <sys/sockio.h> #include <sys/sysctl.h> +#include <sys/socketvar.h> #include <kern/lock.h> #include <net/if.h> @@ -78,6 +77,7 @@ #include <net/if_types.h> #include <net/ndrv.h> #include <net/kpi_protocol.h> +#include <net/dlil.h> #include <netinet/in.h> #include <netinet/in_var.h> @@ -90,13 +90,6 @@ #include <netinet6/in6_ifattach.h> #endif - - -#include <sys/socketvar.h> - -#include <net/dlil.h> - - #if LLC && CCITT extern struct ifqueue pkintrq; #endif @@ -114,70 +107,83 @@ extern struct ifqueue pkintrq; * the ether header, which is provided separately. */ static errno_t -ether_inet6_input( - __unused ifnet_t ifp, - protocol_family_t protocol, - mbuf_t packet, - __unused char *header) +ether_inet6_input(ifnet_t ifp, protocol_family_t protocol, + mbuf_t packet, char *header) { - errno_t error; +#pragma unused(ifp, protocol) + struct ether_header *eh = (struct ether_header *)header; + + if (eh->ether_type == htons(ETHERTYPE_IPV6)) { + struct ifnet *mifp; + /* + * Trust the ifp in the mbuf, rather than ifproto's + * since the packet could have been injected via + * a dlil_input_packet_list() using an ifp that is + * different than the one where the packet really + * came from. 
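+ * The receive interface recorded in the packet header is what the
+ * reachability update below gets charged to, rather than the ifp
+ * passed in by the protocol attachment.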
+ */ + mifp = mbuf_pkthdr_rcvif(packet); + + /* Update L2 reachability record, if present (and not bcast) */ + if (bcmp(eh->ether_shost, etherbroadcastaddr, + ETHER_ADDR_LEN) != 0) { + nd6_llreach_set_reachable(mifp, eh->ether_shost, + ETHER_ADDR_LEN); + } - if ((error = proto_input(protocol, packet))) + if (proto_input(protocol, packet) != 0) + m_freem(packet); + } else { m_freem(packet); - return error; + } + + return (EJUSTRETURN); } static errno_t -ether_inet6_pre_output( - ifnet_t ifp, - __unused protocol_family_t protocol_family, - mbuf_t *m0, - const struct sockaddr *dst_netaddr, - void *route, - char *type, - char *edst) +ether_inet6_pre_output(ifnet_t ifp, protocol_family_t protocol_family, + mbuf_t *m0, const struct sockaddr *dst_netaddr, void *route, + char *type, char *edst) { +#pragma unused(protocol_family) errno_t result; - struct sockaddr_dl sdl; - register struct mbuf *m = *m0; + struct sockaddr_dl sdl; + struct mbuf *m = *m0; /* * Tell ether_frameout it's ok to loop packet if necessary */ m->m_flags |= M_LOOP; - - result = nd6_lookup_ipv6(ifp, (const struct sockaddr_in6*)dst_netaddr, - &sdl, sizeof(sdl), route, *m0); - + + result = nd6_lookup_ipv6(ifp, (const struct sockaddr_in6 *)dst_netaddr, + &sdl, sizeof (sdl), route, *m0); + if (result == 0) { - *(u_int16_t*)type = htons(ETHERTYPE_IPV6); + *(u_int16_t *)type = htons(ETHERTYPE_IPV6); bcopy(LLADDR(&sdl), edst, sdl.sdl_alen); } - - - return result; + return (result); } static int -ether_inet6_resolve_multi( - ifnet_t ifp, - const struct sockaddr *proto_addr, - struct sockaddr_dl *out_ll, - size_t ll_len) +ether_inet6_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr, + struct sockaddr_dl *out_ll, size_t ll_len) { - static const size_t minsize = offsetof(struct sockaddr_dl, sdl_data[0]) + ETHER_ADDR_LEN; - const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6*)proto_addr; - + static const size_t minsize = + offsetof(struct sockaddr_dl, sdl_data[0]) + ETHER_ADDR_LEN; + const struct sockaddr_in6 *sin6 = + (const struct sockaddr_in6 *)proto_addr; + if (proto_addr->sa_family != AF_INET6) - return EAFNOSUPPORT; - - if (proto_addr->sa_len < sizeof(struct sockaddr_in6)) - return EINVAL; - + return (EAFNOSUPPORT); + + if (proto_addr->sa_len < sizeof (struct sockaddr_in6)) + return (EINVAL); + if (ll_len < minsize) - return EMSGSIZE; - + return (EMSGSIZE); + bzero(out_ll, minsize); out_ll->sdl_len = minsize; out_ll->sdl_family = AF_LINK; @@ -187,20 +193,17 @@ ether_inet6_resolve_multi( out_ll->sdl_alen = ETHER_ADDR_LEN; out_ll->sdl_slen = 0; ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, LLADDR(out_ll)); - - return 0; -} + return (0); +} static errno_t -ether_inet6_prmod_ioctl( - ifnet_t ifp, - __unused protocol_family_t protocol_family, - u_long command, - void *data) +ether_inet6_prmod_ioctl(ifnet_t ifp, protocol_family_t protocol_family, + u_long command, void *data) { - struct ifreq *ifr = (struct ifreq *) data; - int error = 0; +#pragma unused(protocol_family) + struct ifreq *ifr = (struct ifreq *)data; + int error = 0; switch (command) { case SIOCSIFADDR: @@ -211,30 +214,30 @@ ether_inet6_prmod_ioctl( break; case SIOCGIFADDR: - ifnet_lladdr_copy_bytes(ifp, ifr->ifr_addr.sa_data, ETHER_ADDR_LEN); - break; + (void) ifnet_lladdr_copy_bytes(ifp, ifr->ifr_addr.sa_data, + ETHER_ADDR_LEN); + break; - default: - error = EOPNOTSUPP; - break; - } - return (error); + default: + error = EOPNOTSUPP; + break; + } + return (error); } errno_t -ether_attach_inet6( - struct ifnet *ifp, - __unused protocol_family_t 
protocol_family) +ether_attach_inet6(struct ifnet *ifp, protocol_family_t protocol_family) { +#pragma unused(protocol_family) struct ifnet_attach_proto_param proto; struct ifnet_demux_desc demux[1]; - u_short en_6native=htons(ETHERTYPE_IPV6); + u_short en_6native = htons(ETHERTYPE_IPV6); errno_t error; - - bzero(&proto, sizeof(proto)); + + bzero(&proto, sizeof (proto)); demux[0].type = DLIL_DESC_ETYPE2; demux[0].data = &en_6native; - demux[0].datalen = sizeof(en_6native); + demux[0].datalen = sizeof (en_6native); proto.demux_list = demux; proto.demux_count = 1; proto.input = ether_inet6_input; @@ -243,24 +246,15 @@ ether_attach_inet6( proto.resolve = ether_inet6_resolve_multi; error = ifnet_attach_protocol(ifp, protocol_family, &proto); if (error && error != EEXIST) { - printf("WARNING: ether_attach_inet6 can't attach ipv6 to %s%d\n", - ifp->if_name, ifp->if_unit); + printf("WARNING: %s can't attach ipv6 to %s%d\n", __func__, + ifp->if_name, ifp->if_unit); } - - return error; + + return (error); } void -ether_detach_inet6( - struct ifnet *ifp, - protocol_family_t protocol_family) +ether_detach_inet6(struct ifnet *ifp, protocol_family_t protocol_family) { - errno_t error; - - error = ifnet_detach_protocol(ifp, protocol_family); - if (error && error != ENOENT) { - printf("WARNING: ether_detach_inet6 can't detach ipv6 from %s%d\n", - ifp->if_name, ifp->if_unit); - } + (void) ifnet_detach_protocol(ifp, protocol_family); } - diff --git a/bsd/net/ether_inet_pr_module.c b/bsd/net/ether_inet_pr_module.c index 422866e73..12a8ead3c 100644 --- a/bsd/net/ether_inet_pr_module.c +++ b/bsd/net/ether_inet_pr_module.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -103,64 +103,62 @@ #include <security/mac_framework.h> #endif -/* Local function declerations */ +/* Local function declarations */ extern void *kdp_get_interface(void); extern void kdp_set_ip_and_mac_addresses(struct in_addr *ipaddr, - struct ether_addr *macaddr); + struct ether_addr *macaddr); -static __inline__ void -_ip_copy(struct in_addr * dst, const struct in_addr * src) -{ - *dst = *src; - return; -} +#define _ip_copy(dst, src) \ + (*(dst) = *(src)) static void -ether_inet_arp_input( - struct mbuf *m) +ether_inet_arp_input(struct ifnet *ifp, struct mbuf *m) { struct ether_arp *ea; struct sockaddr_dl sender_hw; struct sockaddr_in sender_ip; struct sockaddr_in target_ip; - - if (mbuf_len(m) < sizeof(*ea) && - mbuf_pullup(&m, sizeof(*ea)) != 0) + + if (mbuf_len(m) < sizeof (*ea) && mbuf_pullup(&m, sizeof (*ea)) != 0) return; - + ea = mbuf_data(m); - + /* Verify this is an ethernet/ip arp and address lengths are correct */ if (ntohs(ea->arp_hrd) != ARPHRD_ETHER || - ntohs(ea->arp_pro) != ETHERTYPE_IP || - ea->arp_pln != sizeof(struct in_addr) || - ea->arp_hln != ETHER_ADDR_LEN) { - mbuf_free(m); + ntohs(ea->arp_pro) != ETHERTYPE_IP || + ea->arp_pln != sizeof (struct in_addr) || + ea->arp_hln != ETHER_ADDR_LEN) { + mbuf_freem(m); return; } - + /* Verify the sender is not broadcast */ if (bcmp(ea->arp_sha, etherbroadcastaddr, ETHER_ADDR_LEN) == 0) { - mbuf_free(m); + mbuf_freem(m); return; } - - bzero(&sender_ip, sizeof(sender_ip)); - sender_ip.sin_len = sizeof(sender_ip); + + bzero(&sender_ip, sizeof (sender_ip)); + sender_ip.sin_len = sizeof (sender_ip); sender_ip.sin_family = AF_INET; _ip_copy(&sender_ip.sin_addr, (const struct in_addr *)ea->arp_spa); target_ip = sender_ip; 
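Aside (illustrative, not part of the patch): the _ip_copy() steps in this hunk lift the IPv4 addresses out of the ARP payload, where arp_spa/arp_tpa are bare byte arrays with no alignment guarantee. A minimal user-space sketch of the same extraction, with a helper name of ours; memcpy stays safe even on strict-alignment CPUs:

    #include <string.h>
    #include <netinet/in.h>
    #include <net/if_arp.h>
    #include <netinet/if_ether.h>

    /* Pull the sender/target IPv4 addresses out of an ARP payload. */
    static void
    extract_arp_addrs(const struct ether_arp *ea,
        struct in_addr *spa, struct in_addr *tpa)
    {
        /* arp_spa/arp_tpa are u_char[4]; memcpy avoids unaligned loads */
        memcpy(spa, ea->arp_spa, sizeof (*spa));
        memcpy(tpa, ea->arp_tpa, sizeof (*tpa));
    }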
_ip_copy(&target_ip.sin_addr, (const struct in_addr *)ea->arp_tpa); - - bzero(&sender_hw, sizeof(sender_hw)); - sender_hw.sdl_len = sizeof(sender_hw); + + bzero(&sender_hw, sizeof (sender_hw)); + sender_hw.sdl_len = sizeof (sender_hw); sender_hw.sdl_family = AF_LINK; sender_hw.sdl_type = IFT_ETHER; sender_hw.sdl_alen = ETHER_ADDR_LEN; bcopy(ea->arp_sha, LLADDR(&sender_hw), ETHER_ADDR_LEN); - - arp_ip_handle_input(mbuf_pkthdr_rcvif(m), ntohs(ea->arp_op), &sender_hw, &sender_ip, &target_ip); - mbuf_free(m); + + /* update L2 reachability record, if present */ + arp_llreach_set_reachable(ifp, LLADDR(&sender_hw), ETHER_ADDR_LEN); + + arp_ip_handle_input(ifp, ntohs(ea->arp_op), &sender_hw, &sender_ip, + &target_ip); + mbuf_freem(m); } /* @@ -169,120 +167,131 @@ ether_inet_arp_input( * the ether header, which is provided separately. */ static errno_t -ether_inet_input( - __unused ifnet_t ifp, - __unused protocol_family_t protocol_family, - mbuf_t m_list) +ether_inet_input(ifnet_t ifp, protocol_family_t protocol_family, + mbuf_t m_list) { +#pragma unused(ifp, protocol_family) mbuf_t m; mbuf_t *tailptr = &m_list; mbuf_t nextpkt; - + /* Strip ARP and non-IP packets out of the list */ for (m = m_list; m; m = nextpkt) { - struct ether_header *eh = mbuf_pkthdr_header(m); - - nextpkt = m->m_nextpkt; - - if (eh->ether_type == htons(ETHERTYPE_IP)) { - /* put this packet in the list */ - *tailptr = m; - tailptr = &m->m_nextpkt; - } - else { - /* Pass ARP packets to arp input */ + struct ether_header *eh = mbuf_pkthdr_header(m); + struct ifnet *mifp; + + /* + * Trust the ifp in the mbuf, rather than ifproto's + * since the packet could have been injected via + * a dlil_input_packet_list() using an ifp that is + * different than the one where the packet really + * came from. + */ + mifp = mbuf_pkthdr_rcvif(m); + + nextpkt = m->m_nextpkt; + + if (eh->ether_type == htons(ETHERTYPE_IP)) { + /* + * Update L2 reachability record, if present + * (and if not a broadcast sender). + */ + if (bcmp(eh->ether_shost, etherbroadcastaddr, + ETHER_ADDR_LEN) != 0) { + arp_llreach_set_reachable(mifp, eh->ether_shost, + ETHER_ADDR_LEN); + } + /* put this packet in the list */ + *tailptr = m; + tailptr = &m->m_nextpkt; + } else { + /* Pass ARP packets to arp input */ m->m_nextpkt = NULL; - if (eh->ether_type == htons(ETHERTYPE_ARP)) - ether_inet_arp_input(m); - else - mbuf_freem(m); - } + if (eh->ether_type == htons(ETHERTYPE_ARP)) + ether_inet_arp_input(mifp, m); + else + mbuf_freem(m); + } } - + *tailptr = NULL; - + /* Pass IP list to ip input */ - if (m_list != NULL && proto_input(PF_INET, m_list) != 0) - { + if (m_list != NULL && proto_input(PF_INET, m_list) != 0) { mbuf_freem_list(m_list); } - - return 0; + + return (EJUSTRETURN); } static errno_t -ether_inet_pre_output( - ifnet_t ifp, - __unused protocol_family_t protocol_family, - mbuf_t *m0, - const struct sockaddr *dst_netaddr, - void* route, - char *type, - char *edst) +ether_inet_pre_output(ifnet_t ifp, protocol_family_t protocol_family, + mbuf_t *m0, const struct sockaddr *dst_netaddr, + void *route, char *type, char *edst) { - register struct mbuf *m = *m0; - const struct ether_header *eh; - errno_t result = 0; - - - if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING)) - return ENETDOWN; - - /* - * Tell ether_frameout it's ok to loop packet unless negated below. 
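Aside (illustrative, not part of the patch): ether_inet_input() above filters the incoming chain in place with a tail pointer -- IP packets are relinked through *tailptr while ARP and unknown frames are unhooked and handed off or freed. A stand-alone sketch of that tail-pointer filtering; the types and names are ours:

    #include <stdlib.h>

    struct pkt {
        struct pkt *next;
        int is_ip;
    };

    /* Keep only IP packets, preserving order; everything else is freed. */
    static struct pkt *
    filter_ip(struct pkt *list)
    {
        struct pkt **tailptr = &list;
        struct pkt *p, *next;

        for (p = list; p != NULL; p = next) {
            next = p->next;
            if (p->is_ip) {
                *tailptr = p;       /* relink into the kept list */
                tailptr = &p->next;
            } else {
                p->next = NULL;
                free(p);            /* stand-in for mbuf_freem() */
            }
        }
        *tailptr = NULL;            /* terminate the kept list */
        return (list);
    }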
- */ - m->m_flags |= M_LOOP; - - switch (dst_netaddr->sa_family) { - - case AF_INET: { - struct sockaddr_dl ll_dest; - result = arp_lookup_ip(ifp, (const struct sockaddr_in*)dst_netaddr, - &ll_dest, sizeof(ll_dest), (route_t)route, *m0); - if (result == 0) { - bcopy(LLADDR(&ll_dest), edst, ETHER_ADDR_LEN); - *(u_int16_t*)type = htons(ETHERTYPE_IP); - } - } - break; +#pragma unused(protocol_family) + struct mbuf *m = *m0; + const struct ether_header *eh; + errno_t result = 0; - case pseudo_AF_HDRCMPLT: - case AF_UNSPEC: - m->m_flags &= ~M_LOOP; - eh = (const struct ether_header *)dst_netaddr->sa_data; - (void)memcpy(edst, eh->ether_dhost, 6); - *(u_short *)type = eh->ether_type; - break; - - default: - printf("%s%d: can't handle af%d\n", ifp->if_name, ifp->if_unit, - dst_netaddr->sa_family); - - result = EAFNOSUPPORT; - } - - return result; + if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING)) + return (ENETDOWN); + + /* + * Tell ether_frameout it's ok to loop packet unless negated below. + */ + m->m_flags |= M_LOOP; + + switch (dst_netaddr->sa_family) { + case AF_INET: { + struct sockaddr_dl ll_dest; + + result = arp_lookup_ip(ifp, + (const struct sockaddr_in *)dst_netaddr, &ll_dest, + sizeof (ll_dest), (route_t)route, *m0); + if (result == 0) { + bcopy(LLADDR(&ll_dest), edst, ETHER_ADDR_LEN); + *(u_int16_t *)type = htons(ETHERTYPE_IP); + } + break; + } + + case pseudo_AF_HDRCMPLT: + case AF_UNSPEC: + m->m_flags &= ~M_LOOP; + eh = (const struct ether_header *)dst_netaddr->sa_data; + (void) memcpy(edst, eh->ether_dhost, 6); + *(u_short *)type = eh->ether_type; + break; + + default: + printf("%s%d: can't handle af%d\n", ifp->if_name, ifp->if_unit, + dst_netaddr->sa_family); + + result = EAFNOSUPPORT; + break; + } + + return (result); } static errno_t -ether_inet_resolve_multi( - ifnet_t ifp, - const struct sockaddr *proto_addr, - struct sockaddr_dl *out_ll, - size_t ll_len) +ether_inet_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr, + struct sockaddr_dl *out_ll, size_t ll_len) { - static const size_t minsize = offsetof(struct sockaddr_dl, sdl_data[0]) + ETHER_ADDR_LEN; - const struct sockaddr_in *sin = (const struct sockaddr_in*)proto_addr; - + static const size_t minsize = + offsetof(struct sockaddr_dl, sdl_data[0]) + ETHER_ADDR_LEN; + const struct sockaddr_in *sin = (const struct sockaddr_in *)proto_addr; + if (proto_addr->sa_family != AF_INET) - return EAFNOSUPPORT; - - if (proto_addr->sa_len < sizeof(struct sockaddr_in)) - return EINVAL; + return (EAFNOSUPPORT); + + if (proto_addr->sa_len < sizeof (struct sockaddr_in)) + return (EINVAL); if (ll_len < minsize) - return EMSGSIZE; - + return (EMSGSIZE); + bzero(out_ll, minsize); out_ll->sdl_len = minsize; out_ll->sdl_family = AF_LINK; @@ -292,141 +301,128 @@ ether_inet_resolve_multi( out_ll->sdl_alen = ETHER_ADDR_LEN; out_ll->sdl_slen = 0; ETHER_MAP_IP_MULTICAST(&sin->sin_addr, LLADDR(out_ll)); - - return 0; + + return (0); } static errno_t -ether_inet_prmod_ioctl( - ifnet_t ifp, - __unused protocol_family_t protocol_family, - u_long command, - void *data) +ether_inet_prmod_ioctl(ifnet_t ifp, protocol_family_t protocol_family, + u_long command, void *data) { - ifaddr_t ifa = data; - struct ifreq *ifr = data; - int error = 0; - +#pragma unused(protocol_family) + ifaddr_t ifa = data; + struct ifreq *ifr = data; + int error = 0; + + switch (command) { + case SIOCSIFADDR: + case SIOCAIFADDR: + if (!(ifnet_flags(ifp) & IFF_RUNNING)) { + ifnet_set_flags(ifp, IFF_UP, IFF_UP); + ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL); 
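Aside (illustrative, not part of the patch): ether_inet_resolve_multi() above ends in ETHER_MAP_IP_MULTICAST(), the RFC 1112 mapping from an IPv4 group to its link-layer multicast address: a fixed 01:00:5e prefix followed by the low 23 bits of the group. The mapping in isolation, with a helper name of ours:

    #include <stdint.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>

    /* RFC 1112: 01:00:5e + low 23 bits of the IPv4 group address. */
    static void
    ip_mcast_to_ether(struct in_addr grp, uint8_t mac[6])
    {
        uint32_t a = ntohl(grp.s_addr);

        mac[0] = 0x01;
        mac[1] = 0x00;
        mac[2] = 0x5e;
        mac[3] = (a >> 16) & 0x7f;  /* 24th bit of the group is dropped */
        mac[4] = (a >> 8) & 0xff;
        mac[5] = a & 0xff;
    }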
+ } - switch (command) { - case SIOCSIFADDR: - case SIOCAIFADDR: - if ((ifnet_flags(ifp) & IFF_RUNNING) == 0) { - ifnet_set_flags(ifp, IFF_UP, IFF_UP); - ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL); - } + if (ifaddr_address_family(ifa) != AF_INET) + break; - switch (ifaddr_address_family(ifa)) { - - case AF_INET: - - inet_arp_init_ifaddr(ifp, ifa); - /* - * Register new IP and MAC addresses with the kernel - * debugger if the interface is the same as was registered - * by IOKernelDebugger. If no interface was registered, - * fall back and just match against en0 interface. - * Do this only for the first address of the interface - * and not for aliases. - */ - if (command == SIOCSIFADDR && - ((kdp_get_interface() != 0 && - kdp_get_interface() == ifp->if_softc) || - (kdp_get_interface() == 0 && ifp->if_unit == 0))) + inet_arp_init_ifaddr(ifp, ifa); + /* + * Register new IP and MAC addresses with the kernel + * debugger if the interface is the same as was registered + * by IOKernelDebugger. If no interface was registered, + * fall back and just match against en0 interface. + * Do this only for the first address of the interface + * and not for aliases. + */ + if (command == SIOCSIFADDR && + ((kdp_get_interface() != 0 && + kdp_get_interface() == ifp->if_softc) || + (kdp_get_interface() == 0 && ifp->if_unit == 0))) kdp_set_ip_and_mac_addresses(&(IA_SIN(ifa)->sin_addr), ifnet_lladdr(ifp)); + break; - break; - - default: - break; - } - - break; - - case SIOCGIFADDR: - ifnet_lladdr_copy_bytes(ifp, ifr->ifr_addr.sa_data, ETHER_ADDR_LEN); + case SIOCGIFADDR: + ifnet_lladdr_copy_bytes(ifp, ifr->ifr_addr.sa_data, + ETHER_ADDR_LEN); break; - default: + default: error = EOPNOTSUPP; break; - } + } - return (error); + return (error); } static void -ether_inet_event( - ifnet_t ifp, - __unused protocol_family_t protocol, - const struct kev_msg *event) +ether_inet_event(ifnet_t ifp, protocol_family_t protocol, + const struct kev_msg *event) { - ifaddr_t *addresses; - +#pragma unused(protocol) + ifaddr_t *addresses; + if (event->vendor_code != KEV_VENDOR_APPLE || - event->kev_class != KEV_NETWORK_CLASS || - event->kev_subclass != KEV_DL_SUBCLASS || - event->event_code != KEV_DL_LINK_ADDRESS_CHANGED) { + event->kev_class != KEV_NETWORK_CLASS || + event->kev_subclass != KEV_DL_SUBCLASS || + event->event_code != KEV_DL_LINK_ADDRESS_CHANGED) { return; } - + if (ifnet_get_address_list_family(ifp, &addresses, AF_INET) == 0) { int i; - + for (i = 0; addresses[i] != NULL; i++) { inet_arp_init_ifaddr(ifp, addresses[i]); } - + ifnet_free_address_list(addresses); } } static errno_t -ether_inet_arp( - ifnet_t ifp, - u_short arpop, - const struct sockaddr_dl* sender_hw, - const struct sockaddr* sender_proto, - const struct sockaddr_dl* target_hw, - const struct sockaddr* target_proto) +ether_inet_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw, + const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw, + const struct sockaddr *target_proto) { mbuf_t m; errno_t result; struct ether_header *eh; struct ether_arp *ea; - const struct sockaddr_in* sender_ip = (const struct sockaddr_in*)sender_proto; - const struct sockaddr_in* target_ip = (const struct sockaddr_in*)target_proto; + const struct sockaddr_in *sender_ip = + (const struct sockaddr_in *)sender_proto; + const struct sockaddr_in *target_ip = + (const struct sockaddr_in *)target_proto; char *datap; - + if (target_ip == NULL) - return EINVAL; - + return (EINVAL); + if ((sender_ip && sender_ip->sin_family != AF_INET) || 
target_ip->sin_family != AF_INET) - return EAFNOSUPPORT; - + return (EAFNOSUPPORT); + result = mbuf_gethdr(MBUF_DONTWAIT, MBUF_TYPE_DATA, &m); if (result != 0) - return result; - - mbuf_setlen(m, sizeof(*ea)); - mbuf_pkthdr_setlen(m, sizeof(*ea)); - + return (result); + + mbuf_setlen(m, sizeof (*ea)); + mbuf_pkthdr_setlen(m, sizeof (*ea)); + /* Move the data pointer in the mbuf to the end, aligned to 4 bytes */ datap = mbuf_datastart(m); datap += mbuf_trailingspace(m); datap -= (((uintptr_t)datap) & 0x3); - mbuf_setdata(m, datap, sizeof(*ea)); + mbuf_setdata(m, datap, sizeof (*ea)); ea = mbuf_data(m); - + /* * Prepend the ethernet header, we will send the raw frame; * callee frees the original mbuf when allocation fails. */ - result = mbuf_prepend(&m, sizeof(*eh), MBUF_DONTWAIT); + result = mbuf_prepend(&m, sizeof (*eh), MBUF_DONTWAIT); if (result != 0) - return result; + return (result); eh = mbuf_data(m); eh->ether_type = htons(ETHERTYPE_ARP); @@ -434,108 +430,108 @@ ether_inet_arp( #if CONFIG_MACF_NET mac_mbuf_label_associate_linklayer(ifp, m); #endif - + /* Fill out the arp header */ ea->arp_pro = htons(ETHERTYPE_IP); - ea->arp_hln = sizeof(ea->arp_sha); - ea->arp_pln = sizeof(ea->arp_spa); + ea->arp_hln = sizeof (ea->arp_sha); + ea->arp_pln = sizeof (ea->arp_spa); ea->arp_hrd = htons(ARPHRD_ETHER); ea->arp_op = htons(arpop); - + /* Sender Hardware */ if (sender_hw != NULL) { - bcopy(CONST_LLADDR(sender_hw), ea->arp_sha, sizeof(ea->arp_sha)); - } - else { + bcopy(CONST_LLADDR(sender_hw), ea->arp_sha, + sizeof (ea->arp_sha)); + } else { ifnet_lladdr_copy_bytes(ifp, ea->arp_sha, ETHER_ADDR_LEN); } - ifnet_lladdr_copy_bytes(ifp, eh->ether_shost, sizeof(eh->ether_shost)); - + ifnet_lladdr_copy_bytes(ifp, eh->ether_shost, sizeof (eh->ether_shost)); + /* Sender IP */ if (sender_ip != NULL) { - bcopy(&sender_ip->sin_addr, ea->arp_spa, sizeof(ea->arp_spa)); - } - else { + bcopy(&sender_ip->sin_addr, ea->arp_spa, sizeof (ea->arp_spa)); + } else { struct ifaddr *ifa; - + /* Look for an IP address to use as our source */ ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET) + IFA_LOCK(ifa); + if (ifa->ifa_addr != NULL && + ifa->ifa_addr->sa_family == AF_INET) { + bcopy(&((struct sockaddr_in *)ifa->ifa_addr)-> + sin_addr, ea->arp_spa, sizeof(ea->arp_spa)); + IFA_UNLOCK(ifa); break; - } - if (ifa) { - bcopy(&((struct sockaddr_in*)ifa->ifa_addr)->sin_addr, ea->arp_spa, - sizeof(ea->arp_spa)); + } + IFA_UNLOCK(ifa); } ifnet_lock_done(ifp); - + if (ifa == NULL) { - mbuf_free(m); - return ENXIO; + mbuf_freem(m); + return (ENXIO); } } - + /* Target Hardware */ - if (target_hw == 0) { - bzero(ea->arp_tha, sizeof(ea->arp_tha)); - bcopy(etherbroadcastaddr, eh->ether_dhost, sizeof(eh->ether_dhost)); - } - else { - bcopy(CONST_LLADDR(target_hw), ea->arp_tha, sizeof(ea->arp_tha)); - bcopy(CONST_LLADDR(target_hw), eh->ether_dhost, sizeof(eh->ether_dhost)); + if (target_hw == NULL) { + bzero(ea->arp_tha, sizeof (ea->arp_tha)); + bcopy(etherbroadcastaddr, eh->ether_dhost, + sizeof (eh->ether_dhost)); + } else { + bcopy(CONST_LLADDR(target_hw), ea->arp_tha, + sizeof (ea->arp_tha)); + bcopy(CONST_LLADDR(target_hw), eh->ether_dhost, + sizeof (eh->ether_dhost)); } - + /* Target IP */ - bcopy(&target_ip->sin_addr, ea->arp_tpa, sizeof(ea->arp_tpa)); - + bcopy(&target_ip->sin_addr, ea->arp_tpa, sizeof (ea->arp_tpa)); + ifnet_output_raw(ifp, PF_INET, m); - - return 0; + + return (0); } errno_t -ether_attach_inet( - struct ifnet *ifp, - 
__unused protocol_family_t proto_family) +ether_attach_inet(struct ifnet *ifp, protocol_family_t proto_family) { +#pragma unused(proto_family) struct ifnet_attach_proto_param_v2 proto; struct ifnet_demux_desc demux[2]; - u_short en_native=htons(ETHERTYPE_IP); - u_short arp_native=htons(ETHERTYPE_ARP); + u_short en_native = htons(ETHERTYPE_IP); + u_short arp_native = htons(ETHERTYPE_ARP); errno_t error; - - bzero(&demux[0], sizeof(demux)); + + bzero(&demux[0], sizeof (demux)); demux[0].type = DLIL_DESC_ETYPE2; demux[0].data = &en_native; - demux[0].datalen = sizeof(en_native); + demux[0].datalen = sizeof (en_native); demux[1].type = DLIL_DESC_ETYPE2; demux[1].data = &arp_native; - demux[1].datalen = sizeof(arp_native); + demux[1].datalen = sizeof (arp_native); - bzero(&proto, sizeof(proto)); + bzero(&proto, sizeof (proto)); proto.demux_list = demux; - proto.demux_count = sizeof(demux) / sizeof(demux[0]); + proto.demux_count = sizeof (demux) / sizeof (demux[0]); proto.input = ether_inet_input; proto.pre_output = ether_inet_pre_output; proto.ioctl = ether_inet_prmod_ioctl; proto.event = ether_inet_event; proto.resolve = ether_inet_resolve_multi; proto.send_arp = ether_inet_arp; - + error = ifnet_attach_protocol_v2(ifp, proto_family, &proto); if (error && error != EEXIST) { - printf("WARNING: ether_attach_inet can't attach ip to %s%d\n", - ifp->if_name, ifp->if_unit); + printf("WARNING: %s can't attach ip to %s%d\n", __func__, + ifp->if_name, ifp->if_unit); } - return error; + return (error); } void -ether_detach_inet( - struct ifnet *ifp, - protocol_family_t proto_family) +ether_detach_inet(struct ifnet *ifp, protocol_family_t proto_family) { - (void)ifnet_detach_protocol(ifp, proto_family); + (void) ifnet_detach_protocol(ifp, proto_family); } - diff --git a/bsd/net/ethernet.h b/bsd/net/ethernet.h index 00b7fa5fb..aea52bc20 100644 --- a/bsd/net/ethernet.h +++ b/bsd/net/ethernet.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000,2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -65,6 +65,14 @@ */ #define ETHER_MAX_LEN 1518 +/* + * Mbuf adjust factor to force 32-bit alignment of IP header. + * Drivers should do m_adj(m, ETHER_ALIGN) when setting up a + * receive so the upper layers get the IP header properly aligned + * past the 14-byte Ethernet header. + */ +#define ETHER_ALIGN 2 /* driver adjust for IP hdr alignment */ + /* * A macro to validate a length with */ @@ -120,7 +128,23 @@ struct ether_addr *ether_aton(const char *); #ifdef BSD_KERNEL_PRIVATE extern u_char etherbroadcastaddr[ETHER_ADDR_LEN]; -#endif + + +static __inline__ int +_ether_cmp(const void * a, const void * b) +{ + const u_int16_t * a_s = (const u_int16_t *)a; + const u_int16_t * b_s = (const u_int16_t *)b; + + if (a_s[0] != b_s[0] + || a_s[1] != b_s[1] + || a_s[2] != b_s[2]) { + return (1); + } + return (0); +} + +#endif /* BSD_KERNEL_PRIVATE */ #define ETHER_IS_MULTICAST(addr) (*(addr) & 0x01) /* is address mcast/bcast? 
*/ diff --git a/bsd/net/if.c b/bsd/net/if.c index 02b698007..26314b948 100644 --- a/bsd/net/if.c +++ b/bsd/net/if.c @@ -81,9 +81,13 @@ #include <sys/sockio.h> #include <sys/syslog.h> #include <sys/sysctl.h> +#include <sys/mcache.h> +#include <kern/zalloc.h> #include <machine/endian.h> +#include <pexpert/pexpert.h> + #include <net/if.h> #include <net/if_arp.h> #include <net/if_dl.h> @@ -106,15 +110,14 @@ #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet/ip_var.h> +#include <netinet/ip6.h> #if INET6 #include <netinet6/in6_var.h> #include <netinet6/in6_ifattach.h> +#include <netinet6/ip6_var.h> #endif #endif -extern int dlil_multithreaded_input; -extern struct dlil_threading_info *dlil_lo_thread_ptr; - #if CONFIG_MACF_NET #include <security/mac_framework.h> #endif @@ -124,11 +127,21 @@ extern struct dlil_threading_info *dlil_lo_thread_ptr; * System initialization */ +/* Lock group and attribute for ifaddr lock */ +lck_attr_t *ifa_mtx_attr; +lck_grp_t *ifa_mtx_grp; +static lck_grp_attr_t *ifa_mtx_grp_attr; + static int ifconf(u_long cmd, user_addr_t ifrp, int * ret_space); static void if_qflush(struct ifqueue *); __private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *); void if_rtproto_del(struct ifnet *ifp, int protocol); +static int if_addmulti_common(struct ifnet *, const struct sockaddr *, + struct ifmultiaddr **, int); +static int if_delmulti_common(struct ifmultiaddr *, struct ifnet *, + const struct sockaddr *, int); + static int if_rtmtu(struct radix_node *, void *); static void if_rtmtu_update(struct ifnet *); @@ -137,7 +150,6 @@ static int if_clone_list(int count, int * total, user_addr_t dst); #endif /* IF_CLONE_LIST */ MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address"); -MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address"); int ifqmaxlen = IFQ_MAXLEN; struct ifnethead ifnet_head = TAILQ_HEAD_INITIALIZER(ifnet_head); @@ -147,6 +159,50 @@ LIST_HEAD(, if_clone) if_cloners = LIST_HEAD_INITIALIZER(if_cloners); static struct ifaddr *ifa_ifwithnet_common(const struct sockaddr *, unsigned int); +static void if_attach_ifa_common(struct ifnet *, struct ifaddr *, int); +static void if_detach_ifa_common(struct ifnet *, struct ifaddr *, int); + +static void if_attach_ifma(struct ifnet *, struct ifmultiaddr *, int); +static int if_detach_ifma(struct ifnet *, struct ifmultiaddr *, int); + +static struct ifmultiaddr *ifma_alloc(int); +static void ifma_free(struct ifmultiaddr *); +static void ifma_trace(struct ifmultiaddr *, int); + +#if DEBUG +static unsigned int ifma_debug = 1; /* debugging (enabled) */ +#else +static unsigned int ifma_debug; /* debugging (disabled) */ +#endif /* !DEBUG */ +static unsigned int ifma_size; /* size of zone element */ +static struct zone *ifma_zone; /* zone for ifmultiaddr */ + +#define IFMA_TRACE_HIST_SIZE 32 /* size of trace history */ + +/* For gdb */ +__private_extern__ unsigned int ifma_trace_hist_size = IFMA_TRACE_HIST_SIZE; + +struct ifmultiaddr_dbg { + struct ifmultiaddr ifma; /* ifmultiaddr */ + u_int16_t ifma_refhold_cnt; /* # of ref */ + u_int16_t ifma_refrele_cnt; /* # of rele */ + /* + * Circular lists of IFA_ADDREF and IFA_REMREF callers. 
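Aside (illustrative, not part of the patch): each ifmultiaddr_dbg above carries fixed-size circular histories of its refcount callers; every hold and release records into the next slot modulo the history size, so the most recent IFMA_TRACE_HIST_SIZE events survive for gdb. The ring discipline on its own, with types of ours:

    #include <stdint.h>

    #define TRACE_HIST_SIZE 32      /* mirrors IFMA_TRACE_HIST_SIZE */

    struct trace_ring {
        uint16_t cnt;                   /* total events recorded */
        void *hist[TRACE_HIST_SIZE];    /* circular caller history */
    };

    /* Record one caller, overwriting the oldest entry once full. */
    static void
    trace_record(struct trace_ring *r, void *caller)
    {
        r->hist[r->cnt % TRACE_HIST_SIZE] = caller;
        r->cnt++;
    }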
+ */ + ctrace_t ifma_refhold[IFMA_TRACE_HIST_SIZE]; + ctrace_t ifma_refrele[IFMA_TRACE_HIST_SIZE]; + /* + * Trash list linkage + */ + TAILQ_ENTRY(ifmultiaddr_dbg) ifma_trash_link; +}; + +/* List of trash ifmultiaddr entries protected by ifma_trash_lock */ +static TAILQ_HEAD(, ifmultiaddr_dbg) ifma_trash_head; +static decl_lck_mtx_data(, ifma_trash_lock); + +#define IFMA_ZONE_MAX 64 /* maximum elements in zone */ +#define IFMA_ZONE_NAME "ifmultiaddr" /* zone name */ #if INET6 /* @@ -154,9 +210,36 @@ static struct ifaddr *ifa_ifwithnet_common(const struct sockaddr *, * should be more generalized? */ extern void nd6_setmtu(struct ifnet *); +extern lck_mtx_t *nd6_mutex; #endif +void +ifa_init(void) +{ + /* Setup lock group and attribute for ifaddr */ + ifa_mtx_grp_attr = lck_grp_attr_alloc_init(); + ifa_mtx_grp = lck_grp_alloc_init("ifaddr", ifa_mtx_grp_attr); + ifa_mtx_attr = lck_attr_alloc_init(); + + PE_parse_boot_argn("ifa_debug", &ifma_debug, sizeof (ifma_debug)); + + ifma_size = (ifma_debug == 0) ? sizeof (struct ifmultiaddr) : + sizeof (struct ifmultiaddr_dbg); + + ifma_zone = zinit(ifma_size, IFMA_ZONE_MAX * ifma_size, 0, + IFMA_ZONE_NAME); + if (ifma_zone == NULL) { + panic("%s: failed allocating %s", __func__, IFMA_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(ifma_zone, Z_EXPAND, TRUE); + zone_change(ifma_zone, Z_CALLERACCT, FALSE); + + lck_mtx_init(&ifma_trash_lock, ifa_mtx_grp, ifa_mtx_attr); + TAILQ_INIT(&ifma_trash_head); +} + /* * Network interface utility routines. * @@ -169,45 +252,106 @@ struct ifaddr **ifnet_addrs; struct ifnet **ifindex2ifnet; __private_extern__ void -if_attach_ifa( - struct ifnet *ifp, - struct ifaddr *ifa) +if_attach_ifa(struct ifnet *ifp, struct ifaddr *ifa) +{ + if_attach_ifa_common(ifp, ifa, 0); +} + +__private_extern__ void +if_attach_link_ifa(struct ifnet *ifp, struct ifaddr *ifa) +{ + if_attach_ifa_common(ifp, ifa, 1); +} + +static void +if_attach_ifa_common(struct ifnet *ifp, struct ifaddr *ifa, int link) { - ifnet_lock_assert(ifp, LCK_MTX_ASSERT_OWNED); - if (ifa->ifa_debug & IFD_ATTACHED) { - panic("if_attach_ifa: Attempted to attach address that's already attached!\n"); + ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE); + IFA_LOCK_ASSERT_HELD(ifa); + + if (ifa->ifa_ifp != ifp) { + panic("%s: Mismatch ifa_ifp=%p != ifp=%p", __func__, + ifa->ifa_ifp, ifp); + /* NOTREACHED */ + } else if (ifa->ifa_debug & IFD_ATTACHED) { + panic("%s: Attempt to attach an already attached ifa=%p", + __func__, ifa); + /* NOTREACHED */ + } else if (link && !(ifa->ifa_debug & IFD_LINK)) { + panic("%s: Unexpected non-link address ifa=%p", __func__, ifa); + /* NOTREACHED */ + } else if (!link && (ifa->ifa_debug & IFD_LINK)) { + panic("%s: Unexpected link address ifa=%p", __func__, ifa); + /* NOTREACHED */ } - ifaref(ifa); + IFA_ADDREF_LOCKED(ifa); ifa->ifa_debug |= IFD_ATTACHED; - TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); + if (link) + TAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link); + else + TAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link); + + if (ifa->ifa_attached != NULL) + (*ifa->ifa_attached)(ifa); } __private_extern__ void -if_detach_ifa( - struct ifnet *ifp, - struct ifaddr *ifa) -{ - ifnet_lock_assert(ifp, LCK_MTX_ASSERT_OWNED); -#if 1 - /* Debugging code */ - if ((ifa->ifa_debug & IFD_ATTACHED) == 0) { - printf("if_detach_ifa: ifa is not attached to any interface! 
flags=%u\n", ifa->ifa_debug); - return; - } - else { +if_detach_ifa(struct ifnet *ifp, struct ifaddr *ifa) +{ + if_detach_ifa_common(ifp, ifa, 0); +} + +__private_extern__ void +if_detach_link_ifa(struct ifnet *ifp, struct ifaddr *ifa) +{ + if_detach_ifa_common(ifp, ifa, 1); +} + +static void +if_detach_ifa_common(struct ifnet *ifp, struct ifaddr *ifa, int link) +{ + ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE); + IFA_LOCK_ASSERT_HELD(ifa); + + if (link && !(ifa->ifa_debug & IFD_LINK)) { + panic("%s: Unexpected non-link address ifa=%p", __func__, ifa); + /* NOTREACHED */ + } else if (link && ifa != TAILQ_FIRST(&ifp->if_addrhead)) { + panic("%s: Link address ifa=%p not first", __func__, ifa); + /* NOTREACHED */ + } else if (!link && (ifa->ifa_debug & IFD_LINK)) { + panic("%s: Unexpected link address ifa=%p", __func__, ifa); + /* NOTREACHED */ + } else if (!(ifa->ifa_debug & IFD_ATTACHED)) { + panic("%s: Attempt to detach an unattached address ifa=%p", + __func__, ifa); + /* NOTREACHED */ + } else if (ifa->ifa_ifp != ifp) { + panic("%s: Mismatch ifa_ifp=%p, ifp=%p", __func__, + ifa->ifa_ifp, ifp); + /* NOTREACHED */ + } else if (ifa->ifa_debug & IFD_DEBUG) { struct ifaddr *ifa2; TAILQ_FOREACH(ifa2, &ifp->if_addrhead, ifa_link) { if (ifa2 == ifa) break; } if (ifa2 != ifa) { - printf("if_detach_ifa: Attempted to detach IFA that was not attached!\n"); - } + panic("%s: Attempt to detach a stray address ifa=%p", + __func__, ifa); + /* NOTREACHED */ + } } -#endif TAILQ_REMOVE(&ifp->if_addrhead, ifa, ifa_link); + /* This must not be the last reference to the ifaddr */ + if (IFA_REMREF_LOCKED(ifa) == NULL) { + panic("%s: unexpected (missing) refcnt ifa=%p", __func__, ifa); + /* NOTREACHED */ + } ifa->ifa_debug &= ~IFD_ATTACHED; - ifafree(ifa); + + if (ifa->ifa_detached != NULL) + (*ifa->ifa_detached)(ifa); } #define INITIAL_IF_INDEXLIM 8 @@ -346,7 +490,8 @@ if_clone_create(char *name, int len, void *params) * there's no straightforward way to recover if * it happens. */ - panic("if_clone_create(): interface name too long"); + panic("%s: interface name too long", __func__); + /* NOTREACHED */ } } @@ -548,36 +693,72 @@ ifa_foraddr_scoped(unsigned int addr, unsigned int scope) lck_rw_lock_shared(in_ifaddr_rwlock); TAILQ_FOREACH(ia, INADDR_HASH(addr), ia_hash) { + IFA_LOCK_SPIN(&ia->ia_ifa); if (ia->ia_addr.sin_addr.s_addr == addr && - (scope == IFSCOPE_NONE || ia->ia_ifp->if_index == scope)) + (scope == IFSCOPE_NONE || ia->ia_ifp->if_index == scope)) { + IFA_ADDREF_LOCKED(&ia->ia_ifa); /* for caller */ + IFA_UNLOCK(&ia->ia_ifa); break; + } + IFA_UNLOCK(&ia->ia_ifa); } - if (ia != NULL) - ifaref(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); return (ia); } +#if INET6 +/* + * Similar to ifa_foraddr, except that this for IPv6. 
+ */ +__private_extern__ struct in6_ifaddr * +ifa_foraddr6(struct in6_addr *addr6) +{ + return (ifa_foraddr6_scoped(addr6, IFSCOPE_NONE)); +} + +__private_extern__ struct in6_ifaddr * +ifa_foraddr6_scoped(struct in6_addr *addr6, unsigned int scope) +{ + struct in6_ifaddr *ia = NULL; + + lck_rw_lock_shared(&in6_ifaddr_rwlock); + for (ia = in6_ifaddrs; ia; ia = ia->ia_next) { + IFA_LOCK(&ia->ia_ifa); + if (IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, addr6) && + (scope == IFSCOPE_NONE || ia->ia_ifp->if_index == scope)) { + IFA_ADDREF_LOCKED(&ia->ia_ifa); /* for caller */ + IFA_UNLOCK(&ia->ia_ifa); + break; + } + IFA_UNLOCK(&ia->ia_ifa); + } + lck_rw_done(&in6_ifaddr_rwlock); + + return (ia); +} +#endif /* INET6 */ + /* * Return the first (primary) address of a given family on an interface. */ __private_extern__ struct ifaddr * ifa_ifpgetprimary(struct ifnet *ifp, int family) { - struct ifaddr *ifa0 = NULL, *ifa; + struct ifaddr *ifa; ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr->sa_family == family && ifa0 == NULL) { - ifa0 = ifa; + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr->sa_family == family) { + IFA_ADDREF_LOCKED(ifa); /* for caller */ + IFA_UNLOCK(ifa); break; } + IFA_UNLOCK(ifa); } - if (ifa0 != NULL) - ifaref(ifa0); ifnet_lock_done(ifp); - return (ifa0); + return (ifa); } /* @@ -585,75 +766,89 @@ ifa_ifpgetprimary(struct ifnet *ifp, int family) */ /*ARGSUSED*/ struct ifaddr * -ifa_ifwithaddr( - const struct sockaddr *addr) +ifa_ifwithaddr(const struct sockaddr *addr) { struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *result = NULL; -#define equal(a1, a2) \ - (bcmp((const void*)(a1), (const void*)(a2), ((const struct sockaddr *)(a1))->sa_len) == 0) - +#define equal(a1, a2) \ + (bcmp((const void*)(a1), (const void*)(a2), \ + ((const struct sockaddr *)(a1))->sa_len) == 0) + ifnet_head_lock_shared(); - for (ifp = ifnet_head.tqh_first; ifp && !result; ifp = ifp->if_link.tqe_next) { + for (ifp = ifnet_head.tqh_first; ifp && !result; + ifp = ifp->if_link.tqe_next) { ifnet_lock_shared(ifp); for (ifa = ifp->if_addrhead.tqh_first; ifa; - ifa = ifa->ifa_link.tqe_next) { - if (ifa->ifa_addr->sa_family != addr->sa_family) + ifa = ifa->ifa_link.tqe_next) { + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr->sa_family != addr->sa_family) { + IFA_UNLOCK(ifa); continue; + } if (equal(addr, ifa->ifa_addr)) { result = ifa; + IFA_ADDREF_LOCKED(ifa); /* for caller */ + IFA_UNLOCK(ifa); break; } - if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr && - /* IP6 doesn't have broadcast */ - ifa->ifa_broadaddr->sa_len != 0 && - equal(ifa->ifa_broadaddr, addr)) { + if ((ifp->if_flags & IFF_BROADCAST) && + ifa->ifa_broadaddr != NULL && + /* IP6 doesn't have broadcast */ + ifa->ifa_broadaddr->sa_len != 0 && + equal(ifa->ifa_broadaddr, addr)) { result = ifa; + IFA_ADDREF_LOCKED(ifa); /* for caller */ + IFA_UNLOCK(ifa); break; } + IFA_UNLOCK(ifa); } - if (result) - ifaref(result); ifnet_lock_done(ifp); } ifnet_head_done(); - - return result; + + return (result); } /* * Locate the point to point interface with a given destination address. 
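Aside (illustrative, not part of the patch): the equal() macro above compares two sockaddrs with a single bcmp() across sa_len bytes, so the family, length field and address bytes must all coincide. An isolated, slightly stricter user-space equivalent of ours that also insists the lengths agree before comparing (sa_len is the BSD sockaddr layout):

    #include <string.h>
    #include <sys/socket.h>

    /* Byte-wise sockaddr equality in the spirit of if.c's equal(). */
    static int
    sockaddr_equal(const struct sockaddr *a, const struct sockaddr *b)
    {
        return (a->sa_len == b->sa_len &&
            memcmp(a, b, a->sa_len) == 0);
    }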
*/ /*ARGSUSED*/ struct ifaddr * -ifa_ifwithdstaddr( - const struct sockaddr *addr) +ifa_ifwithdstaddr(const struct sockaddr *addr) { struct ifnet *ifp; struct ifaddr *ifa; struct ifaddr *result = NULL; ifnet_head_lock_shared(); - for (ifp = ifnet_head.tqh_first; ifp && !result; ifp = ifp->if_link.tqe_next) { - if (ifp->if_flags & IFF_POINTOPOINT) { + for (ifp = ifnet_head.tqh_first; ifp && !result; + ifp = ifp->if_link.tqe_next) { + if ((ifp->if_flags & IFF_POINTOPOINT)) { ifnet_lock_shared(ifp); for (ifa = ifp->if_addrhead.tqh_first; ifa; - ifa = ifa->ifa_link.tqe_next) { - if (ifa->ifa_addr->sa_family != addr->sa_family) + ifa = ifa->ifa_link.tqe_next) { + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr->sa_family != + addr->sa_family) { + IFA_UNLOCK(ifa); continue; - if (ifa->ifa_dstaddr && equal(addr, ifa->ifa_dstaddr)) { + } + if (ifa->ifa_dstaddr && + equal(addr, ifa->ifa_dstaddr)) { result = ifa; + IFA_ADDREF_LOCKED(ifa); /* for caller */ + IFA_UNLOCK(ifa); break; } + IFA_UNLOCK(ifa); } - if (result) - ifaref(result); ifnet_lock_done(ifp); } } ifnet_head_done(); - return result; + return (result); } /* @@ -686,10 +881,15 @@ ifa_ifwithaddr_scoped(const struct sockaddr *addr, unsigned int ifscope) ifnet_lock_shared(ifp); for (ifa = ifp->if_addrhead.tqh_first; ifa != NULL; ifa = ifa->ifa_link.tqe_next) { - if (ifa->ifa_addr->sa_family != addr->sa_family) + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr->sa_family != addr->sa_family) { + IFA_UNLOCK(ifa); continue; + } if (equal(addr, ifa->ifa_addr)) { result = ifa; + IFA_ADDREF_LOCKED(ifa); /* for caller */ + IFA_UNLOCK(ifa); break; } if ((ifp->if_flags & IFF_BROADCAST) && @@ -698,11 +898,12 @@ ifa_ifwithaddr_scoped(const struct sockaddr *addr, unsigned int ifscope) ifa->ifa_broadaddr->sa_len != 0 && equal(ifa->ifa_broadaddr, addr)) { result = ifa; + IFA_ADDREF_LOCKED(ifa); /* for caller */ + IFA_UNLOCK(ifa); break; } + IFA_UNLOCK(ifa); } - if (result != NULL) - ifaref(result); ifnet_lock_done(ifp); } ifnet_head_done(); @@ -731,11 +932,17 @@ ifa_ifwithnet_common(const struct sockaddr *addr, unsigned int ifscope) { struct ifnet *ifp; struct ifaddr *ifa = NULL; - struct ifaddr *ifa_maybe = (struct ifaddr *) 0; + struct ifaddr *ifa_maybe = NULL; u_int af = addr->sa_family; const char *addr_data = addr->sa_data, *cplim; - if (!ip_doscopedroute || addr->sa_family != AF_INET) +#if INET6 + if ((af != AF_INET && af != AF_INET6) || + (af == AF_INET && !ip_doscopedroute) || + (af == AF_INET6 && !ip6_doscopedroute)) +#else + if (af != AF_INET || !ip_doscopedroute) +#endif /* !INET6 */ ifscope = IFSCOPE_NONE; ifnet_head_lock_shared(); @@ -744,15 +951,14 @@ ifa_ifwithnet_common(const struct sockaddr *addr, unsigned int ifscope) * so do that if we can. 
*/ if (af == AF_LINK) { - const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)addr; - if (sdl->sdl_index && sdl->sdl_index <= if_index) { + const struct sockaddr_dl *sdl = (const struct sockaddr_dl *)addr; + if (sdl->sdl_index && sdl->sdl_index <= if_index) { ifa = ifnet_addrs[sdl->sdl_index - 1]; - - if (ifa) - ifaref(ifa); - + if (ifa != NULL) + IFA_ADDREF(ifa); + ifnet_head_done(); - return ifa; + return (ifa); } } @@ -766,15 +972,19 @@ ifa_ifwithnet_common(const struct sockaddr *addr, unsigned int ifscope) ifa = ifa->ifa_link.tqe_next) { const char *cp, *cp2, *cp3; - if (ifa->ifa_addr->sa_family != af) -next: continue; + IFA_LOCK(ifa); + if (ifa->ifa_addr == NULL || + ifa->ifa_addr->sa_family != af) { +next: + IFA_UNLOCK(ifa); + continue; + } #ifndef __APPLE__ /* This breaks tunneling application trying to install a route with * a specific subnet and the local address as the destination * It's breaks binary compatibility with previous version of MacOS X */ if ( - #if INET6 /* XXX: for maching gif tunnel dst as routing entry gateway */ addr->sa_family != AF_INET6 && #endif @@ -787,10 +997,13 @@ next: continue; * The trouble is that we don't know the * netmask for the remote end. */ - if (ifa->ifa_dstaddr != 0 - && equal(addr, ifa->ifa_dstaddr)) { - break; - } + if (ifa->ifa_dstaddr != 0 && + equal(addr, ifa->ifa_dstaddr)) { + IFA_ADDREF_LOCKED(ifa); + IFA_UNLOCK(ifa); + break; + } + IFA_UNLOCK(ifa); } else #endif /* __APPLE__*/ { @@ -799,8 +1012,10 @@ next: continue; * find using a matching interface. */ if (ifscope != IFSCOPE_NONE && - ifp->if_index != ifscope) + ifp->if_index != ifscope) { + IFA_UNLOCK(ifa); continue; + } /* * Scan all the bits in the ifa's address. @@ -809,8 +1024,10 @@ next: continue; * to see if it really matters. * (A byte at a time) */ - if (ifa->ifa_netmask == 0) + if (ifa->ifa_netmask == 0) { + IFA_UNLOCK(ifa); continue; + } cp = addr_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; @@ -826,40 +1043,33 @@ next: continue; * before continuing to search * for an even better one. */ - if (ifa_maybe == 0 || + if (ifa_maybe == NULL || rn_refines((caddr_t)ifa->ifa_netmask, (caddr_t)ifa_maybe->ifa_netmask)) { - ifaref(ifa); - if (ifa_maybe) - ifafree(ifa_maybe); + IFA_ADDREF_LOCKED(ifa); /* ifa_maybe */ + IFA_UNLOCK(ifa); + if (ifa_maybe != NULL) + IFA_REMREF(ifa_maybe); ifa_maybe = ifa; + } else { + IFA_UNLOCK(ifa); } } + IFA_LOCK_ASSERT_NOTHELD(ifa); } - - if (ifa) { - ifaref(ifa); - } - - /* - * ifa is set if we found an exact match. - * take a reference to the ifa before - * releasing the ifp lock - */ ifnet_lock_done(ifp); - - if (ifa) { + + if (ifa != NULL) break; - } } ifnet_head_done(); - if (!ifa) + + if (ifa == NULL) ifa = ifa_maybe; - else if (ifa_maybe) { - ifafree(ifa_maybe); - ifa_maybe = NULL; - } - return ifa; + else if (ifa_maybe != NULL) + IFA_REMREF(ifa_maybe); + + return (ifa); } /* @@ -867,9 +1077,7 @@ next: continue; * a given address. 
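Aside (illustrative, not part of the patch): the scan described above tests a candidate against an interface address one byte at a time under the netmask; any bit that differs where the mask is set disqualifies the entry. The test in isolation -- the kernel walks the sa_data buffers inside the sockaddrs:

    #include <stddef.h>

    /*
     * Nonzero when addr and ifaddr agree on every bit the netmask
     * covers -- the byte-at-a-time comparison ifa_ifwithnet uses.
     */
    static int
    masked_match(const unsigned char *addr, const unsigned char *ifaddr,
        const unsigned char *mask, size_t masklen)
    {
        size_t i;

        for (i = 0; i < masklen; i++) {
            if ((addr[i] ^ ifaddr[i]) & mask[i])
                return (0);
        }
        return (1);
    }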
*/ struct ifaddr * -ifaof_ifpforaddr( - const struct sockaddr *addr, - struct ifnet *ifp) +ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp) { struct ifaddr *ifa = NULL; const char *cp, *cp2, *cp3; @@ -880,55 +1088,80 @@ ifaof_ifpforaddr( if (af >= AF_MAX) return (NULL); - + ifnet_lock_shared(ifp); for (ifa = ifp->if_addrhead.tqh_first; ifa; ifa = ifa->ifa_link.tqe_next) { - if (ifa->ifa_addr->sa_family != af) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != af) { + IFA_UNLOCK(ifa); continue; - if (ifa_maybe == 0) + } + if (ifa_maybe == NULL) { + IFA_ADDREF_LOCKED(ifa); /* for ifa_maybe */ ifa_maybe = ifa; + } if (ifa->ifa_netmask == 0) { - if (equal(addr, ifa->ifa_addr) || - (ifa->ifa_dstaddr && equal(addr, ifa->ifa_dstaddr))) - break; + if (equal(addr, ifa->ifa_addr) || (ifa->ifa_dstaddr && + equal(addr, ifa->ifa_dstaddr))) { + IFA_ADDREF_LOCKED(ifa); /* for caller */ + IFA_UNLOCK(ifa); + break; + } + IFA_UNLOCK(ifa); continue; } if (ifp->if_flags & IFF_POINTOPOINT) { - if (ifa->ifa_dstaddr && equal(addr, ifa->ifa_dstaddr)) + if (ifa->ifa_dstaddr && equal(addr, ifa->ifa_dstaddr)) { + IFA_ADDREF_LOCKED(ifa); /* for caller */ + IFA_UNLOCK(ifa); break; + } } else { - if (equal(addr, ifa->ifa_addr)) { + if (equal(addr, ifa->ifa_addr)) { /* exact match */ + IFA_ADDREF_LOCKED(ifa); /* for caller */ + IFA_UNLOCK(ifa); break; } cp = addr->sa_data; cp2 = ifa->ifa_addr->sa_data; cp3 = ifa->ifa_netmask->sa_data; - cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask; + cplim = ifa->ifa_netmask->sa_len + + (char *)ifa->ifa_netmask; for (; cp3 < cplim; cp3++) if ((*cp++ ^ *cp2++) & *cp3) break; if (cp3 == cplim) { /* subnet match */ if (better_ifa_maybe == NULL) { + /* for better_ifa_maybe */ + IFA_ADDREF_LOCKED(ifa); better_ifa_maybe = ifa; } } } + IFA_UNLOCK(ifa); } - + if (ifa == NULL) { if (better_ifa_maybe != NULL) { ifa = better_ifa_maybe; + better_ifa_maybe = NULL; } else { ifa = ifa_maybe; + ifa_maybe = NULL; } } - if (ifa) ifaref(ifa); - + ifnet_lock_done(ifp); - return ifa; + + if (better_ifa_maybe != NULL) + IFA_REMREF(better_ifa_maybe); + if (ifa_maybe != NULL) + IFA_REMREF(ifa_maybe); + + return (ifa); } #include <net/route.h> @@ -944,6 +1177,7 @@ link_rtrequest(int cmd, struct rtentry *rt, struct sockaddr *sa) struct ifaddr *ifa; struct sockaddr *dst; struct ifnet *ifp; + void (*ifa_rtrequest)(int, struct rtentry *, struct sockaddr *); lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); RT_LOCK_ASSERT_HELD(rt); @@ -951,12 +1185,19 @@ link_rtrequest(int cmd, struct rtentry *rt, struct sockaddr *sa) if (cmd != RTM_ADD || ((ifa = rt->rt_ifa) == 0) || ((ifp = ifa->ifa_ifp) == 0) || ((dst = rt_key(rt)) == 0)) return; + + /* Become a regular mutex, just in case */ + RT_CONVERT_LOCK(rt); + ifa = ifaof_ifpforaddr(dst, ifp); if (ifa) { rtsetifa(rt, ifa); - if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest) - ifa->ifa_rtrequest(cmd, rt, sa); - ifafree(ifa); + IFA_LOCK_SPIN(ifa); + ifa_rtrequest = ifa->ifa_rtrequest; + IFA_UNLOCK(ifa); + if (ifa_rtrequest != NULL && ifa_rtrequest != link_rtrequest) + ifa_rtrequest(cmd, rt, sa); + IFA_REMREF(ifa); } } @@ -1088,19 +1329,19 @@ ifunit(const char *name) len = strlen(name); if (len < 2 || len > IFNAMSIZ) - return NULL; + return (NULL); cp = name + len - 1; c = *cp; if (c < '0' || c > '9') - return NULL; /* trailing garbage */ + return (NULL); /* trailing garbage */ unit = 0; m = 1; do { if (cp == name) - return NULL; /* no interface name */ + return (NULL); /* no interface name */ unit += (c - '0') * m; if (unit > 
1000000) - return NULL; /* number is unreasonable */ + return (NULL); /* number is unreasonable */ m *= 10; c = *--cp; } while (c >= '0' && c <= '9'); @@ -1134,7 +1375,7 @@ if_withname(struct sockaddr *sa) if ( (sa->sa_family != AF_LINK) || (sdl->sdl_nlen == 0) || (sdl->sdl_nlen > IFNAMSIZ) ) - return NULL; + return (NULL); /* * ifunit wants a null-terminated name. It may not be null-terminated @@ -1145,7 +1386,7 @@ if_withname(struct sockaddr *sa) bcopy(sdl->sdl_data, ifname, sdl->sdl_nlen); ifname[sdl->sdl_nlen] = '\0'; - return ifunit(ifname); + return (ifunit(ifname)); } @@ -1163,6 +1404,8 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) struct kev_msg ev_msg; struct net_event_data ev_data; + bzero(&ev_data, sizeof(struct net_event_data)); + bzero(&ev_msg, sizeof(struct kev_msg)); switch (cmd) { case OSIOCGIFCONF32: case SIOCGIFCONF32: { @@ -1210,21 +1453,26 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) } ifp = ifunit(ifr->ifr_name); - if (ifp == 0) + if (ifp == NULL) return (ENXIO); - switch (cmd) { + switch (cmd) { case SIOCGIFFLAGS: ifnet_lock_shared(ifp); ifr->ifr_flags = ifp->if_flags; ifnet_lock_done(ifp); break; + case SIOCGIFCAP: + ifnet_lock_shared(ifp); + ifr->ifr_reqcap = ifp->if_capabilities; + ifr->ifr_curcap = ifp->if_capenable; + ifnet_lock_done(ifp); + break; + #if CONFIG_MACF_NET case SIOCGIFMAC: error = mac_ifnet_label_get(kauth_cred_get(), ifr, ifp); - if (error) - return (error); break; #endif case SIOCGIFMETRIC: @@ -1247,19 +1495,27 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) case SIOCSIFFLAGS: error = proc_suser(p); - if (error) - return (error); + if (error != 0) + break; - ifnet_set_flags(ifp, ifr->ifr_flags, (u_int16_t)~IFF_CANTCHANGE); + (void) ifnet_set_flags(ifp, ifr->ifr_flags, + (u_int16_t)~IFF_CANTCHANGE); - error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, - cmd, data); + /* + * Note that we intentionally ignore any error from below + * for the SIOCSIFFLAGS case. + */ + (void) ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, + cmd, data); - /* Send the event even upon error from the driver because we changed the flags */ + /* + * Send the event even upon error from the driver because + * we changed the flags. 
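Aside (illustrative, not part of the patch): ifunit() above parses the trailing unit number of a name like "en0" by walking backwards from the last character and accumulating digits with a growing power of ten, rejecting units over 1000000. A simplified stand-alone version of that parse, with a name of ours; the kernel additionally rejects all-digit names:

    #include <string.h>

    /* Extract the unit from "en12" -> 12; -1 on malformed names. */
    static int
    parse_unit(const char *name)
    {
        size_t len = strlen(name);
        const char *cp;
        int unit = 0, m = 1;

        if (len < 2)
            return (-1);
        cp = name + len - 1;
        if (*cp < '0' || *cp > '9')
            return (-1);            /* trailing garbage */
        while (cp > name && *cp >= '0' && *cp <= '9') {
            unit += (*cp - '0') * m;
            if (unit > 1000000)
                return (-1);        /* number is unreasonable */
            m *= 10;
            cp--;
        }
        return (unit);
    }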
+ */ ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_NETWORK_CLASS; ev_msg.kev_subclass = KEV_DL_SUBCLASS; - + ev_msg.event_code = KEV_DL_SIFFLAGS; strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ); ev_data.if_family = ifp->if_family; @@ -1272,24 +1528,37 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) ifnet_touch_lastchange(ifp); break; + case SIOCSIFCAP: + error = proc_suser(p); + if (error != 0) + break; + + if ((ifr->ifr_reqcap & ~ifp->if_capabilities)) { + error = EINVAL; + break; + } + error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, + cmd, data); + + ifnet_touch_lastchange(ifp); + break; + #if CONFIG_MACF_NET case SIOCSIFMAC: error = mac_ifnet_label_set(kauth_cred_get(), ifr, ifp); - if (error) - return (error); break; #endif case SIOCSIFMETRIC: error = proc_suser(p); - if (error) - return (error); - ifp->if_metric = ifr->ifr_metric; + if (error != 0) + break; + ifp->if_metric = ifr->ifr_metric; ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_NETWORK_CLASS; ev_msg.kev_subclass = KEV_DL_SUBCLASS; - + ev_msg.event_code = KEV_DL_SIFMETRICS; strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ); ev_data.if_family = ifp->if_family; @@ -1305,115 +1574,135 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) case SIOCSIFPHYS: error = proc_suser(p); - if (error) - return error; + if (error != 0) + break; - error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, - cmd, data); + error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, + cmd, data); + if (error != 0) + break; - if (error == 0) { - ev_msg.vendor_code = KEV_VENDOR_APPLE; - ev_msg.kev_class = KEV_NETWORK_CLASS; - ev_msg.kev_subclass = KEV_DL_SUBCLASS; - - ev_msg.event_code = KEV_DL_SIFPHYS; - strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ); - ev_data.if_family = ifp->if_family; - ev_data.if_unit = (u_int32_t) ifp->if_unit; - ev_msg.dv[0].data_length = sizeof(struct net_event_data); - ev_msg.dv[0].data_ptr = &ev_data; - ev_msg.dv[1].data_length = 0; - kev_post_msg(&ev_msg); - - ifnet_touch_lastchange(ifp); - } - return(error); + ev_msg.vendor_code = KEV_VENDOR_APPLE; + ev_msg.kev_class = KEV_NETWORK_CLASS; + ev_msg.kev_subclass = KEV_DL_SUBCLASS; + + ev_msg.event_code = KEV_DL_SIFPHYS; + strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ); + ev_data.if_family = ifp->if_family; + ev_data.if_unit = (u_int32_t) ifp->if_unit; + ev_msg.dv[0].data_length = sizeof(struct net_event_data); + ev_msg.dv[0].data_ptr = &ev_data; + ev_msg.dv[1].data_length = 0; + kev_post_msg(&ev_msg); + + ifnet_touch_lastchange(ifp); + break; case SIOCSIFMTU: { u_int32_t oldmtu = ifp->if_mtu; error = proc_suser(p); - if (error) - return (error); - if (ifp->if_ioctl == NULL) - return (EOPNOTSUPP); - if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) - return (EINVAL); - - error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, - cmd, data); + if (error != 0) + break; - if (error == 0) { - ev_msg.vendor_code = KEV_VENDOR_APPLE; - ev_msg.kev_class = KEV_NETWORK_CLASS; - ev_msg.kev_subclass = KEV_DL_SUBCLASS; - - ev_msg.event_code = KEV_DL_SIFMTU; - strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ); - ev_data.if_family = ifp->if_family; - ev_data.if_unit = (u_int32_t) ifp->if_unit; - ev_msg.dv[0].data_length = sizeof(struct net_event_data); - ev_msg.dv[0].data_ptr = &ev_data; - ev_msg.dv[1].data_length = 0; - kev_post_msg(&ev_msg); - - ifnet_touch_lastchange(ifp); - rt_ifmsg(ifp); + if (ifp->if_ioctl == NULL) { + error = EOPNOTSUPP; + break; + } + 
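Aside (illustrative, not part of the patch): the new SIOCSIFCAP handler above refuses any request that turns on a capability bit the driver never advertised -- the requested mask must be a subset of if_capabilities. That subset test, isolated:

    #include <stdint.h>
    #include <errno.h>

    /* Accept reqcap only if it is a subset of the advertised mask. */
    static int
    validate_caps(uint32_t reqcap, uint32_t capabilities)
    {
        if (reqcap & ~capabilities)
            return (EINVAL);        /* unsupported bits requested */
        return (0);
    }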
if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) { + error = EINVAL; + break; } + error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, + cmd, data); + if (error != 0) + break; + + ev_msg.vendor_code = KEV_VENDOR_APPLE; + ev_msg.kev_class = KEV_NETWORK_CLASS; + ev_msg.kev_subclass = KEV_DL_SUBCLASS; + + ev_msg.event_code = KEV_DL_SIFMTU; + strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ); + ev_data.if_family = ifp->if_family; + ev_data.if_unit = (u_int32_t) ifp->if_unit; + ev_msg.dv[0].data_length = sizeof(struct net_event_data); + ev_msg.dv[0].data_ptr = &ev_data; + ev_msg.dv[1].data_length = 0; + kev_post_msg(&ev_msg); + + ifnet_touch_lastchange(ifp); + rt_ifmsg(ifp); + /* * If the link MTU changed, do network layer specific procedure * and update all route entries associated with the interface, * so that their MTU metric gets updated. */ - if (error == 0 && ifp->if_mtu != oldmtu) { + if (ifp->if_mtu != oldmtu) { if_rtmtu_update(ifp); #if INET6 nd6_setmtu(ifp); #endif } - return (error); + break; } case SIOCADDMULTI: case SIOCDELMULTI: error = proc_suser(p); - if (error) - return (error); + if (error != 0) + break; /* Don't allow group membership on non-multicast interfaces. */ - if ((ifp->if_flags & IFF_MULTICAST) == 0) - return EOPNOTSUPP; + if ((ifp->if_flags & IFF_MULTICAST) == 0) { + error = EOPNOTSUPP; + break; + } -#ifndef __APPLE__ /* Don't let users screw up protocols' entries. */ - if (ifr->ifr_addr.sa_family != AF_LINK) - return EINVAL; -#endif + if (ifr->ifr_addr.sa_family != AF_UNSPEC && + ifr->ifr_addr.sa_family != AF_LINK) { + error = EINVAL; + break; + } + /* + * User is permitted to anonymously join a particular link + * multicast group via SIOCADDMULTI. Subsequent join requested + * for the same record which has an outstanding refcnt from a + * past if_addmulti_anon() will not result in EADDRINUSE error + * (unlike other BSDs.) Anonymously leaving a group is also + * allowed only as long as there is an outstanding refcnt held + * by a previous anonymous request, or else ENOENT (even if the + * link-layer multicast membership exists for a network-layer + * membership.) 
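Aside (illustrative, not part of the patch): the comment above fixes the semantics of anonymous SIOCADDMULTI/SIOCDELMULTI -- repeated anonymous joins of one group stack refcounts on a single record (no EADDRINUSE, unlike other BSDs), and an anonymous leave succeeds only while such a refcount is outstanding (else ENOENT). A toy model of that bookkeeping, with names of ours:

    #include <errno.h>

    struct anon_grp {
        int refcnt;     /* outstanding anonymous joins */
    };

    static int
    anon_join(struct anon_grp *g)
    {
        g->refcnt++;    /* duplicate joins just stack up */
        return (0);
    }

    static int
    anon_leave(struct anon_grp *g)
    {
        if (g->refcnt == 0)
            return (ENOENT);        /* nothing anonymous to undo */
        g->refcnt--;
        return (0);
    }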
+ */ if (cmd == SIOCADDMULTI) { - error = if_addmulti(ifp, &ifr->ifr_addr, NULL); + error = if_addmulti_anon(ifp, &ifr->ifr_addr, NULL); ev_msg.event_code = KEV_DL_ADDMULTI; } else { - error = if_delmulti(ifp, &ifr->ifr_addr); + error = if_delmulti_anon(ifp, &ifr->ifr_addr); ev_msg.event_code = KEV_DL_DELMULTI; } - if (error == 0) { - ev_msg.vendor_code = KEV_VENDOR_APPLE; - ev_msg.kev_class = KEV_NETWORK_CLASS; - ev_msg.kev_subclass = KEV_DL_SUBCLASS; - strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ); - - ev_data.if_family = ifp->if_family; - ev_data.if_unit = (u_int32_t) ifp->if_unit; - ev_msg.dv[0].data_length = sizeof(struct net_event_data); - ev_msg.dv[0].data_ptr = &ev_data; - ev_msg.dv[1].data_length = 0; - kev_post_msg(&ev_msg); - - ifnet_touch_lastchange(ifp); - } - return error; + if (error != 0) + break; + + ev_msg.vendor_code = KEV_VENDOR_APPLE; + ev_msg.kev_class = KEV_NETWORK_CLASS; + ev_msg.kev_subclass = KEV_DL_SUBCLASS; + strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ); + + ev_data.if_family = ifp->if_family; + ev_data.if_unit = (u_int32_t) ifp->if_unit; + ev_msg.dv[0].data_length = sizeof(struct net_event_data); + ev_msg.dv[0].data_ptr = &ev_data; + ev_msg.dv[1].data_length = 0; + kev_post_msg(&ev_msg); + + ifnet_touch_lastchange(ifp); + break; case SIOCSIFPHYADDR: case SIOCDIFPHYADDR: @@ -1429,20 +1718,21 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) case SIOCSIFVLAN: case SIOCSIFBOND: error = proc_suser(p); - if (error) - return (error); + if (error != 0) + break; - error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, - cmd, data); + error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, + cmd, data); + if (error != 0) + break; - if (error == 0) - ifnet_touch_lastchange(ifp); - return error; + ifnet_touch_lastchange(ifp); + break; case SIOCGIFSTATUS: ifs = (struct ifstat *)data; ifs->ascii[0] = '\0'; - + case SIOCGIFPSRCADDR: case SIOCGIFPDSTADDR: case SIOCGLIFPHYADDR: @@ -1450,12 +1740,15 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) case SIOCGIFMEDIA64: case SIOCGIFGENERIC: case SIOCGIFDEVMTU: - return ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, - cmd, data); + error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, + cmd, data); + break; + case SIOCGIFVLAN: case SIOCGIFBOND: - return ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, - cmd, data); + error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, + cmd, data); + break; case SIOCGIFWAKEFLAGS: ifnet_lock_shared(ifp); @@ -1464,24 +1757,21 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) break; case SIOCGIFGETRTREFCNT: -#if IFNET_ROUTE_REFCNT ifnet_lock_shared(ifp); ifr->ifr_route_refcnt = ifp->if_route_refcnt; ifnet_lock_done(ifp); break; -#else - return (EOPNOTSUPP); -#endif /* IFNET_ROUTE_REFCNT */ default: oif_flags = ifp->if_flags; - if (so->so_proto == 0) - return (EOPNOTSUPP); + if (so->so_proto == NULL) { + error = EOPNOTSUPP; + break; + } { - int ocmd = cmd; + u_long ocmd = cmd; switch (cmd) { - case SIOCSIFDSTADDR: case SIOCSIFADDR: case SIOCSIFBRDADDR: @@ -1513,12 +1803,13 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) case OSIOCGIFNETMASK: cmd = SIOCGIFNETMASK; } + socket_lock(so, 1); - error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, - data, ifp, p)); + error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, + data, ifp, p)); socket_unlock(so, 1); - switch (ocmd) { + switch (ocmd) { case OSIOCGIFADDR: case OSIOCGIFDSTADDR: case 
OSIOCGIFBRDADDR: @@ -1534,12 +1825,12 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p) } if (error == EOPNOTSUPP || error == ENOTSUP) - error = ifnet_ioctl(ifp, so->so_proto->pr_domain->dom_family, - cmd, data); + error = ifnet_ioctl(ifp, + so->so_proto->pr_domain->dom_family, cmd, data); - return (error); + break; } - return (0); + return (error); } int @@ -1564,47 +1855,43 @@ ifnet_set_promiscuous( ifnet_t ifp, int pswitch) { - struct ifreq ifr; int error = 0; - int oldflags; - int locked = 0; - int changed = 0; + int oldflags = 0; + int newflags = 0; ifnet_lock_exclusive(ifp); - locked = 1; oldflags = ifp->if_flags; - if (pswitch) { - /* - * If the device is not configured up, we cannot put it in - * promiscuous mode. - */ - if ((ifp->if_flags & IFF_UP) == 0) { - error = ENETDOWN; - goto done; - } - if (ifp->if_pcount++ != 0) { - goto done; - } + ifp->if_pcount += pswitch ? 1 : -1; + + if (ifp->if_pcount > 0) ifp->if_flags |= IFF_PROMISC; - } else { - if (--ifp->if_pcount > 0) - goto done; + else ifp->if_flags &= ~IFF_PROMISC; - } - ifr.ifr_flags = ifp->if_flags; - locked = 0; + + newflags = ifp->if_flags; ifnet_lock_done(ifp); - error = ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, &ifr); - if (error == 0) - rt_ifmsg(ifp); - else - ifp->if_flags = oldflags; -done: - if (locked) ifnet_lock_done(ifp); - if (changed) { - log(LOG_INFO, "%s%d: promiscuous mode %s\n", + + if (newflags != oldflags && (newflags & IFF_UP) != 0) { + error = ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL); + if (error == 0) { + rt_ifmsg(ifp); + } else { + ifnet_lock_exclusive(ifp); + // revert the flags + ifp->if_pcount -= pswitch ? 1 : -1; + if (ifp->if_pcount > 0) + ifp->if_flags |= IFF_PROMISC; + else + ifp->if_flags &= ~IFF_PROMISC; + ifnet_lock_done(ifp); + } + } + + if (newflags != oldflags) { + log(LOG_INFO, "%s%d: promiscuous mode %s%s\n", ifp->if_name, ifp->if_unit, - pswitch != 0 ? "enabled" : "disabled"); + (newflags & IFF_PROMISC) != 0 ? "enable" : "disable", + error != 0 ? " failed" : " succeeded"); } return error; } @@ -1624,7 +1911,7 @@ ifconf(u_long cmd, user_addr_t ifrp, int * ret_space) struct ifreq ifr; int error = 0; size_t space; - + /* * Zero the ifr buffer to make sure we don't * disclose the contents of the stack. 
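
[Editorial illustration] The ifioctl() switch above is driven from userspace through plain socket ioctls. A minimal userspace sketch exercising SIOCSIFMTU (rejected with EINVAL outside IF_MINMTU..IF_MAXMTU) and the anonymous SIOCADDMULTI/SIOCDELMULTI join described in the comment earlier; the interface name "en0" and the group address are illustrative, error handling is trimmed, and strlcpy/sa_len assume a BSD-flavored libc:

#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

int
main(void)
{
	struct ifreq ifr;
	/* illustrative link-layer group (mDNS) */
	const unsigned char grp[6] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0xfb };
	int s = socket(AF_INET, SOCK_DGRAM, 0);

	if (s < 0)
		return (1);

	/* SIOCSIFMTU: validated, passed to the driver, then KEV_DL_SIFMTU */
	memset(&ifr, 0, sizeof (ifr));
	strlcpy(ifr.ifr_name, "en0", sizeof (ifr.ifr_name));
	ifr.ifr_mtu = 1500;
	(void) ioctl(s, SIOCSIFMTU, &ifr);

	/* Anonymous join: AF_UNSPEC (or AF_LINK) only; a repeat join bumps
	 * the record's anon refcnt instead of failing with EADDRINUSE. */
	memset(&ifr, 0, sizeof (ifr));
	strlcpy(ifr.ifr_name, "en0", sizeof (ifr.ifr_name));
	ifr.ifr_addr.sa_family = AF_UNSPEC;
	ifr.ifr_addr.sa_len = sizeof (struct sockaddr);
	memcpy(ifr.ifr_addr.sa_data, grp, sizeof (grp));
	(void) ioctl(s, SIOCADDMULTI, &ifr);
	(void) ioctl(s, SIOCDELMULTI, &ifr);	/* ENOENT once the anon refcnt drains */

	close(s);
	return (0);
}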
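[Editorial illustration] The rewritten ifnet_set_promiscuous() above replaces the early-return bookkeeping with a straight if_pcount adjustment, reprogramming the driver only on 0-to-1 and 1-to-0 transitions. A standalone model of that counting (plain C, no locking, values and names illustrative):

#include <assert.h>
#include <stdio.h>

#define IFF_PROMISC 0x100

struct ifmodel { int pcount; int flags; };

/* Flags flip only when the reference count crosses zero. */
static int
promisc(struct ifmodel *ifp, int pswitch)
{
	int old = ifp->flags;

	ifp->pcount += pswitch ? 1 : -1;
	if (ifp->pcount > 0)
		ifp->flags |= IFF_PROMISC;
	else
		ifp->flags &= ~IFF_PROMISC;
	return (ifp->flags != old);	/* 1 if the driver must be reprogrammed */
}

int
main(void)
{
	struct ifmodel ifm = { 0, 0 };

	assert(promisc(&ifm, 1) == 1);	/* 0 -> 1: enable */
	assert(promisc(&ifm, 1) == 0);	/* 1 -> 2: no change */
	assert(promisc(&ifm, 0) == 0);	/* 2 -> 1: no change */
	assert(promisc(&ifm, 0) == 1);	/* 1 -> 0: disable */
	printf("promiscuous refcount model OK\n");
	return (0);
}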
@@ -1633,7 +1920,8 @@ ifconf(u_long cmd, user_addr_t ifrp, int * ret_space) space = *ret_space; ifnet_head_lock_shared(); - for (ifp = ifnet_head.tqh_first; space > sizeof(ifr) && ifp; ifp = ifp->if_link.tqe_next) { + for (ifp = ifnet_head.tqh_first; space > sizeof(ifr) && + ifp; ifp = ifp->if_link.tqe_next) { char workbuf[64]; size_t ifnlen, addrs; @@ -1645,17 +1933,22 @@ ifconf(u_long cmd, user_addr_t ifrp, int * ret_space) } else { strlcpy(ifr.ifr_name, workbuf, IFNAMSIZ); } - + ifnet_lock_shared(ifp); addrs = 0; ifa = ifp->if_addrhead.tqh_first; for ( ; space > sizeof (ifr) && ifa; ifa = ifa->ifa_link.tqe_next) { - struct sockaddr *sa = ifa->ifa_addr; + struct sockaddr *sa; + + IFA_LOCK(ifa); + sa = ifa->ifa_addr; #ifndef __APPLE__ - if (curproc->p_prison && prison_if(curproc, sa)) + if (curproc->p_prison && prison_if(curproc, sa)) { + IFA_UNLOCK(ifa); continue; + } #endif addrs++; if (cmd == OSIOCGIFCONF32 || cmd == OSIOCGIFCONF64) { @@ -1663,30 +1956,38 @@ ifconf(u_long cmd, user_addr_t ifrp, int * ret_space) (struct osockaddr *)&ifr.ifr_addr; ifr.ifr_addr = *sa; osa->sa_family = sa->sa_family; - error = copyout((caddr_t)&ifr, ifrp, sizeof(ifr)); + error = copyout((caddr_t)&ifr, ifrp, + sizeof (ifr)); ifrp += sizeof(struct ifreq); } else if (sa->sa_len <= sizeof(*sa)) { ifr.ifr_addr = *sa; - error = copyout((caddr_t)&ifr, ifrp, sizeof(ifr)); + error = copyout((caddr_t)&ifr, ifrp, + sizeof (ifr)); ifrp += sizeof(struct ifreq); } else { - if (space < sizeof (ifr) + sa->sa_len - sizeof(*sa)) + if (space < + sizeof (ifr) + sa->sa_len - sizeof(*sa)) { + IFA_UNLOCK(ifa); break; + } space -= sa->sa_len - sizeof(*sa); - error = copyout((caddr_t)&ifr, ifrp, sizeof (ifr.ifr_name)); + error = copyout((caddr_t)&ifr, ifrp, + sizeof (ifr.ifr_name)); if (error == 0) { - error = copyout((caddr_t)sa, - (ifrp + offsetof(struct ifreq, ifr_addr)), - sa->sa_len); + error = copyout((caddr_t)sa, (ifrp + + offsetof(struct ifreq, ifr_addr)), + sa->sa_len); } - ifrp += (sa->sa_len + offsetof(struct ifreq, ifr_addr)); + ifrp += (sa->sa_len + offsetof(struct ifreq, + ifr_addr)); } + IFA_UNLOCK(ifa); if (error) break; space -= sizeof (ifr); } ifnet_lock_done(ifp); - + if (error) break; if (!addrs) { @@ -1738,64 +2039,300 @@ if_allmulti(struct ifnet *ifp, int onswitch) return error; } +static struct ifmultiaddr * +ifma_alloc(int how) +{ + struct ifmultiaddr *ifma; + + ifma = (how == M_WAITOK) ? 
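[Editorial illustration] The ifconf() copyout loop above emits variable-length records whenever sa_len exceeds sizeof(struct sockaddr), so a consumer must advance the same way. A small userspace sketch of the walk, assuming the _SIZEOF_ADDR_IFREQ() helper from net/if.h (visible later in this patch):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

int
main(void)
{
	struct ifconf ifc;
	char buf[4096], *p;
	int s = socket(AF_INET, SOCK_DGRAM, 0);

	if (s < 0)
		return (1);
	memset(&ifc, 0, sizeof (ifc));
	ifc.ifc_len = sizeof (buf);
	ifc.ifc_buf = buf;
	if (ioctl(s, SIOCGIFCONF, &ifc) < 0)
		return (1);
	for (p = buf; p < buf + ifc.ifc_len; ) {
		struct ifreq *ifr = (struct ifreq *)(void *)p;

		printf("%s family %d\n", ifr->ifr_name,
		    ifr->ifr_addr.sa_family);
		/* mirrors the kernel's sa_len-aware advance above */
		p += _SIZEOF_ADDR_IFREQ(*ifr);
	}
	close(s);
	return (0);
}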
zalloc(ifma_zone) : + zalloc_noblock(ifma_zone); + + if (ifma != NULL) { + bzero(ifma, ifma_size); + lck_mtx_init(&ifma->ifma_lock, ifa_mtx_grp, ifa_mtx_attr); + ifma->ifma_debug |= IFD_ALLOC; + if (ifma_debug != 0) { + ifma->ifma_debug |= IFD_DEBUG; + ifma->ifma_trace = ifma_trace; + } + } + return (ifma); +} + +static void +ifma_free(struct ifmultiaddr *ifma) +{ + IFMA_LOCK(ifma); + + if (ifma->ifma_protospec != NULL) { + panic("%s: Protospec not NULL for ifma=%p", __func__, ifma); + /* NOTREACHED */ + } else if ((ifma->ifma_flags & IFMAF_ANONYMOUS) || + ifma->ifma_anoncnt != 0) { + panic("%s: Freeing ifma=%p with outstanding anon req", + __func__, ifma); + /* NOTREACHED */ + } else if (ifma->ifma_debug & IFD_ATTACHED) { + panic("%s: ifma=%p attached to ifma_ifp=%p is being freed", + __func__, ifma, ifma->ifma_ifp); + /* NOTREACHED */ + } else if (!(ifma->ifma_debug & IFD_ALLOC)) { + panic("%s: ifma %p cannot be freed", __func__, ifma); + /* NOTREACHED */ + } else if (ifma->ifma_refcount != 0) { + panic("%s: non-zero refcount ifma=%p", __func__, ifma); + /* NOTREACHED */ + } else if (ifma->ifma_reqcnt != 0) { + panic("%s: non-zero reqcnt ifma=%p", __func__, ifma); + /* NOTREACHED */ + } else if (ifma->ifma_ifp != NULL) { + panic("%s: non-NULL ifma_ifp=%p for ifma=%p", __func__, + ifma->ifma_ifp, ifma); + /* NOTREACHED */ + } else if (ifma->ifma_ll != NULL) { + panic("%s: non-NULL ifma_ll=%p for ifma=%p", __func__, + ifma->ifma_ll, ifma); + /* NOTREACHED */ + } + ifma->ifma_debug &= ~IFD_ALLOC; + if ((ifma->ifma_debug & (IFD_DEBUG | IFD_TRASHED)) == + (IFD_DEBUG | IFD_TRASHED)) { + lck_mtx_lock(&ifma_trash_lock); + TAILQ_REMOVE(&ifma_trash_head, (struct ifmultiaddr_dbg *)ifma, + ifma_trash_link); + lck_mtx_unlock(&ifma_trash_lock); + ifma->ifma_debug &= ~IFD_TRASHED; + } + IFMA_UNLOCK(ifma); + + if (ifma->ifma_addr != NULL) { + FREE(ifma->ifma_addr, M_IFADDR); + ifma->ifma_addr = NULL; + } + lck_mtx_destroy(&ifma->ifma_lock, ifa_mtx_grp); + zfree(ifma_zone, ifma); +} + +static void +ifma_trace(struct ifmultiaddr *ifma, int refhold) +{ + struct ifmultiaddr_dbg *ifma_dbg = (struct ifmultiaddr_dbg *)ifma; + ctrace_t *tr; + u_int32_t idx; + u_int16_t *cnt; + + if (!(ifma->ifma_debug & IFD_DEBUG)) { + panic("%s: ifma %p has no debug structure", __func__, ifma); + /* NOTREACHED */ + } + if (refhold) { + cnt = &ifma_dbg->ifma_refhold_cnt; + tr = ifma_dbg->ifma_refhold; + } else { + cnt = &ifma_dbg->ifma_refrele_cnt; + tr = ifma_dbg->ifma_refrele; + } + + idx = atomic_add_16_ov(cnt, 1) % IFMA_TRACE_HIST_SIZE; + ctrace_record(&tr[idx]); +} + void -ifma_reference( - struct ifmultiaddr *ifma) +ifma_addref(struct ifmultiaddr *ifma, int locked) { - if (OSIncrementAtomic(&ifma->ifma_refcount) <= 0) - panic("ifma_reference: ifma already released or invalid\n"); + if (!locked) + IFMA_LOCK(ifma); + else + IFMA_LOCK_ASSERT_HELD(ifma); + + if (++ifma->ifma_refcount == 0) { + panic("%s: ifma=%p wraparound refcnt", __func__, ifma); + /* NOTREACHED */ + } else if (ifma->ifma_trace != NULL) { + (*ifma->ifma_trace)(ifma, TRUE); + } + if (!locked) + IFMA_UNLOCK(ifma); } void -ifma_release( - struct ifmultiaddr *ifma) -{ - while (ifma) { - struct ifmultiaddr *next; - int32_t prevValue = OSDecrementAtomic(&ifma->ifma_refcount); - if (prevValue < 1) - panic("ifma_release: ifma already released or invalid\n"); - if (prevValue != 1) - break; - - /* Allow the allocator of the protospec to free it */ - if (ifma->ifma_protospec && ifma->ifma_free) { - ifma->ifma_free(ifma->ifma_protospec); - } - - next = ifma->ifma_ll; - 
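[Editorial illustration] ifma_trace() above keeps wrapping histories of refcount holds and releases when IFD_DEBUG is set. A userspace model of that ring; a plain post-increment stands in for the kernel's atomic_add_16_ov(), and an integer tag stands in for the ctrace_record() backtrace:

#include <stdint.h>
#include <stdio.h>

#define TRACE_HIST_SIZE 16	/* stands in for IFMA_TRACE_HIST_SIZE */

struct trace_ring {
	uint16_t cnt;			/* total events; slot = cnt % size */
	uintptr_t ev[TRACE_HIST_SIZE];
};

static void
trace_record(struct trace_ring *r, uintptr_t tag)
{
	uint32_t idx = r->cnt++ % TRACE_HIST_SIZE;

	r->ev[idx] = tag;		/* older entries get overwritten */
}

int
main(void)
{
	struct trace_ring holds = { 0, { 0 } };

	for (uintptr_t i = 0; i < 40; i++)
		trace_record(&holds, i);
	printf("%u holds, most recent tag %lu\n", holds.cnt,
	    (unsigned long)holds.ev[(holds.cnt - 1) % TRACE_HIST_SIZE]);
	return (0);
}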
FREE(ifma->ifma_addr, M_IFMADDR); - FREE(ifma, M_IFMADDR); - ifma = next; +ifma_remref(struct ifmultiaddr *ifma) +{ + struct ifmultiaddr *ll; + + IFMA_LOCK(ifma); + + if (ifma->ifma_refcount == 0) { + panic("%s: ifma=%p negative refcnt", __func__, ifma); + /* NOTREACHED */ + } else if (ifma->ifma_trace != NULL) { + (*ifma->ifma_trace)(ifma, FALSE); + } + + --ifma->ifma_refcount; + if (ifma->ifma_refcount > 0) { + IFMA_UNLOCK(ifma); + return; } + + ll = ifma->ifma_ll; + ifma->ifma_ifp = NULL; + ifma->ifma_ll = NULL; + IFMA_UNLOCK(ifma); + ifma_free(ifma); /* deallocate it */ + + if (ll != NULL) + IFMA_REMREF(ll); +} + +static void +if_attach_ifma(struct ifnet *ifp, struct ifmultiaddr *ifma, int anon) +{ + ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE); + IFMA_LOCK_ASSERT_HELD(ifma); + + if (ifma->ifma_ifp != ifp) { + panic("%s: Mismatch ifma_ifp=%p != ifp=%p", __func__, + ifma->ifma_ifp, ifp); + /* NOTREACHED */ + } else if (ifma->ifma_debug & IFD_ATTACHED) { + panic("%s: Attempt to attach an already attached ifma=%p", + __func__, ifma); + /* NOTREACHED */ + } else if (anon && (ifma->ifma_flags & IFMAF_ANONYMOUS)) { + panic("%s: ifma=%p unexpected IFMAF_ANONYMOUS", __func__, ifma); + /* NOTREACHED */ + } else if (ifma->ifma_debug & IFD_TRASHED) { + panic("%s: Attempt to reattach a detached ifma=%p", + __func__, ifma); + /* NOTREACHED */ + } + + ifma->ifma_reqcnt++; + VERIFY(ifma->ifma_reqcnt == 1); + IFMA_ADDREF_LOCKED(ifma); + ifma->ifma_debug |= IFD_ATTACHED; + if (anon) { + ifma->ifma_anoncnt++; + VERIFY(ifma->ifma_anoncnt == 1); + ifma->ifma_flags |= IFMAF_ANONYMOUS; + } + + LIST_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link); } - /* - * Find an ifmultiaddr that matches a socket address on an interface. - * - * Caller is responsible for holding the ifnet_lock while calling - * this function. 
- */ static int -if_addmulti_doesexist( - struct ifnet *ifp, - const struct sockaddr *sa, - struct ifmultiaddr **retifma) +if_detach_ifma(struct ifnet *ifp, struct ifmultiaddr *ifma, int anon) +{ + ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE); + IFMA_LOCK_ASSERT_HELD(ifma); + + if (ifma->ifma_reqcnt == 0) { + panic("%s: ifma=%p negative reqcnt", __func__, ifma); + /* NOTREACHED */ + } else if (anon && !(ifma->ifma_flags & IFMAF_ANONYMOUS)) { + panic("%s: ifma=%p missing IFMAF_ANONYMOUS", __func__, ifma); + /* NOTREACHED */ + } else if (anon && ifma->ifma_anoncnt == 0) { + panic("%s: ifma=%p negative anonreqcnt", __func__, ifma); + /* NOTREACHED */ + } else if (ifma->ifma_ifp != ifp) { + panic("%s: Mismatch ifma_ifp=%p, ifp=%p", __func__, + ifma->ifma_ifp, ifp); + /* NOTREACHED */ + } + + if (anon) { + --ifma->ifma_anoncnt; + if (ifma->ifma_anoncnt > 0) + return (0); + ifma->ifma_flags &= ~IFMAF_ANONYMOUS; + } + + --ifma->ifma_reqcnt; + if (ifma->ifma_reqcnt > 0) + return (0); + + if (ifma->ifma_protospec != NULL) { + panic("%s: Protospec not NULL for ifma=%p", __func__, ifma); + /* NOTREACHED */ + } else if ((ifma->ifma_flags & IFMAF_ANONYMOUS) || + ifma->ifma_anoncnt != 0) { + panic("%s: Detaching ifma=%p with outstanding anon req", + __func__, ifma); + /* NOTREACHED */ + } else if (!(ifma->ifma_debug & IFD_ATTACHED)) { + panic("%s: Attempt to detach an unattached address ifma=%p", + __func__, ifma); + /* NOTREACHED */ + } else if (ifma->ifma_debug & IFD_TRASHED) { + panic("%s: ifma %p is already in trash list", __func__, ifma); + /* NOTREACHED */ + } + + /* + * NOTE: Caller calls IFMA_REMREF + */ + ifma->ifma_debug &= ~IFD_ATTACHED; + LIST_REMOVE(ifma, ifma_link); + if (LIST_EMPTY(&ifp->if_multiaddrs)) + ifp->if_updatemcasts = 0; + + if (ifma->ifma_debug & IFD_DEBUG) { + /* Become a regular mutex, just in case */ + IFMA_CONVERT_LOCK(ifma); + lck_mtx_lock(&ifma_trash_lock); + TAILQ_INSERT_TAIL(&ifma_trash_head, + (struct ifmultiaddr_dbg *)ifma, ifma_trash_link); + lck_mtx_unlock(&ifma_trash_lock); + ifma->ifma_debug |= IFD_TRASHED; + } + + return (1); +} + +/* + * Find an ifmultiaddr that matches a socket address on an interface. + * + * Caller is responsible for holding the ifnet_lock while calling + * this function. + */ +static int +if_addmulti_doesexist(struct ifnet *ifp, const struct sockaddr *sa, + struct ifmultiaddr **retifma, int anon) { struct ifmultiaddr *ifma; - for (ifma = ifp->if_multiaddrs.lh_first; ifma; - ifma = ifma->ifma_link.le_next) { - if (equal(sa, ifma->ifma_addr)) { - ifma->ifma_usecount++; - if (retifma) { - *retifma = ifma; - ifma_reference(*retifma); + + for (ifma = LIST_FIRST(&ifp->if_multiaddrs); ifma != NULL; + ifma = LIST_NEXT(ifma, ifma_link)) { + IFMA_LOCK_SPIN(ifma); + if (!equal(sa, ifma->ifma_addr)) { + IFMA_UNLOCK(ifma); + continue; + } + if (anon) { + VERIFY(!(ifma->ifma_flags & IFMAF_ANONYMOUS) || + ifma->ifma_anoncnt != 0); + VERIFY((ifma->ifma_flags & IFMAF_ANONYMOUS) || + ifma->ifma_anoncnt == 0); + ifma->ifma_anoncnt++; + if (!(ifma->ifma_flags & IFMAF_ANONYMOUS)) { + VERIFY(ifma->ifma_anoncnt == 1); + ifma->ifma_flags |= IFMAF_ANONYMOUS; } - return 0; } + if (!anon || ifma->ifma_anoncnt == 1) { + ifma->ifma_reqcnt++; + VERIFY(ifma->ifma_reqcnt > 1); + } + if (retifma != NULL) { + *retifma = ifma; + IFMA_ADDREF_LOCKED(ifma); + } + IFMA_UNLOCK(ifma); + return (0); } - - return ENOENT; + return (ENOENT); } /* @@ -1864,67 +2401,114 @@ copy_and_normalize( } /* - * Add a multicast listenership to the interface in question. 
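[Editorial illustration] if_attach_ifma()/if_detach_ifma() above maintain two counters per record: ifma_reqcnt for distinct requests and ifma_anoncnt for the anonymous subset, with all anonymous joins collapsing onto a single reqcnt. A compact standalone model of those transitions (panics and locking omitted):

#include <assert.h>
#include <stdio.h>

struct ifma_model { int reqcnt; int anoncnt; };

static void
mjoin(struct ifma_model *m, int anon)
{
	if (anon && m->anoncnt++ > 0)
		return;			/* repeat anon joins share one reqcnt */
	m->reqcnt++;
}

static int				/* returns 1 when the record detaches */
mleave(struct ifma_model *m, int anon)
{
	if (anon && --m->anoncnt > 0)
		return (0);
	return (--m->reqcnt == 0);
}

int
main(void)
{
	struct ifma_model m = { 0, 0 };

	mjoin(&m, 1);			/* SIOCADDMULTI */
	mjoin(&m, 1);			/* repeat: no EADDRINUSE, anoncnt = 2 */
	mjoin(&m, 0);			/* protocol join: reqcnt = 2 */
	assert(mleave(&m, 1) == 0);
	assert(mleave(&m, 1) == 0);	/* anon side drained, protocol ref remains */
	assert(mleave(&m, 0) == 1);	/* last reference: detach */
	printf("ifma counter model OK\n");
	return (0);
}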
- * The link layer provides a routine which converts + * Network-layer protocol domains which hold references to the underlying + * link-layer record must use this routine. */ int -if_addmulti( - struct ifnet *ifp, /* interface to manipulate */ - const struct sockaddr *sa, /* address to add */ - struct ifmultiaddr **retifma) +if_addmulti(struct ifnet *ifp, const struct sockaddr *sa, + struct ifmultiaddr **retifma) +{ + return (if_addmulti_common(ifp, sa, retifma, 0)); +} + +/* + * Anything other than network-layer protocol domains which hold references + * to the underlying link-layer record must use this routine: SIOCADDMULTI + * ioctl, ifnet_add_multicast(), AppleTalk, if_bond. + */ +int +if_addmulti_anon(struct ifnet *ifp, const struct sockaddr *sa, + struct ifmultiaddr **retifma) +{ + return (if_addmulti_common(ifp, sa, retifma, 1)); +} + +/* + * Register an additional multicast address with a network interface. + * + * - If the address is already present, bump the reference count on the + * address and return. + * - If the address is not link-layer, look up a link layer address. + * - Allocate address structures for one or both addresses, and attach to the + * multicast address list on the interface. If automatically adding a link + * layer address, the protocol address will own a reference to the link + * layer address, to be freed when it is freed. + * - Notify the network device driver of an addition to the multicast address + * list. + * + * 'sa' points to caller-owned memory with the desired multicast address. + * + * 'retifma' will be used to return a pointer to the resulting multicast + * address reference, if desired. + * + * 'anon' indicates a link-layer address with no protocol address reference + * made to it. Anything other than network-layer protocol domain requests + * are considered as anonymous. + */ +static int +if_addmulti_common(struct ifnet *ifp, const struct sockaddr *sa, + struct ifmultiaddr **retifma, int anon) { struct sockaddr_storage storage; struct sockaddr *llsa = NULL; struct sockaddr *dupsa = NULL; - int error = 0; + int error = 0, ll_firstref = 0, lladdr; struct ifmultiaddr *ifma = NULL; struct ifmultiaddr *llifma = NULL; - + + /* Only AF_UNSPEC/AF_LINK is allowed for an "anonymous" address */ + VERIFY(!anon || sa->sa_family == AF_UNSPEC || + sa->sa_family == AF_LINK); + /* If sa is a AF_LINK or AF_UNSPEC, duplicate and normalize it */ if (sa->sa_family == AF_LINK || sa->sa_family == AF_UNSPEC) { dupsa = copy_and_normalize(sa); if (dupsa == NULL) { - return ENOMEM; + error = ENOMEM; + goto cleanup; } sa = dupsa; } - + ifnet_lock_exclusive(ifp); - error = if_addmulti_doesexist(ifp, sa, retifma); - ifnet_lock_done(ifp); - - if (error == 0) { + if (!(ifp->if_flags & IFF_MULTICAST)) { + error = EADDRNOTAVAIL; + ifnet_lock_done(ifp); goto cleanup; } + /* If the address is already present, return a new reference to it */ + error = if_addmulti_doesexist(ifp, sa, retifma, anon); + ifnet_lock_done(ifp); + if (error == 0) + goto cleanup; + /* - * Give the link layer a chance to accept/reject it, and also - * find out which AF_LINK address this maps to, if it isn't one - * already. + * The address isn't already present; give the link layer a chance + * to accept/reject it, and also find out which AF_LINK address this + * maps to, if it isn't one already. 
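[Editorial illustration] Per the routing described here, ifnet_add_multicast() is one of the callers sent through the anonymous path. A compile-only kext-side sketch of such a join, modeled on ifbond_add_slow_proto_multicast() later in this patch; the 01:80:c2:00:00:02 slow-protocols group is the same one if_bond uses, and error handling is trimmed:

#include <sys/socket.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/kpi_interface.h>

static ifmultiaddr_t our_ifma;

static errno_t
join_link_group(ifnet_t ifp)
{
	const u_char grp[6] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x02 };
	struct sockaddr_dl sdl;

	bzero(&sdl, sizeof (sdl));
	sdl.sdl_len = sizeof (sdl);
	sdl.sdl_family = AF_LINK;
	sdl.sdl_type = IFT_ETHER;
	sdl.sdl_alen = sizeof (grp);
	bcopy(grp, LLADDR(&sdl), sizeof (grp));

	/* resolves to if_addmulti_anon() in the kernel proper */
	return (ifnet_add_multicast(ifp, (struct sockaddr *)&sdl, &our_ifma));
}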
*/ - error = dlil_resolve_multi(ifp, sa, (struct sockaddr*)&storage, - sizeof(storage)); + error = dlil_resolve_multi(ifp, sa, (struct sockaddr *)&storage, + sizeof (storage)); if (error == 0 && storage.ss_len != 0) { - llsa = copy_and_normalize((struct sockaddr*)&storage); + llsa = copy_and_normalize((struct sockaddr *)&storage); if (llsa == NULL) { error = ENOMEM; goto cleanup; } - - MALLOC(llifma, struct ifmultiaddr *, sizeof *llifma, M_IFMADDR, M_WAITOK); + + llifma = ifma_alloc(M_WAITOK); if (llifma == NULL) { error = ENOMEM; goto cleanup; } } - + /* to be similar to FreeBSD */ - if (error == EOPNOTSUPP) { + if (error == EOPNOTSUPP) error = 0; - } - else if (error) { + else if (error != 0) goto cleanup; - } /* Allocate while we aren't holding any locks */ if (dupsa == NULL) { @@ -1934,185 +2518,212 @@ if_addmulti( goto cleanup; } } - MALLOC(ifma, struct ifmultiaddr *, sizeof *ifma, M_IFMADDR, M_WAITOK); + ifma = ifma_alloc(M_WAITOK); if (ifma == NULL) { error = ENOMEM; goto cleanup; } - + ifnet_lock_exclusive(ifp); /* * Check again for the matching multicast. */ - if ((error = if_addmulti_doesexist(ifp, sa, retifma)) == 0) { + error = if_addmulti_doesexist(ifp, sa, retifma, anon); + if (error == 0) { ifnet_lock_done(ifp); goto cleanup; } - bzero(ifma, sizeof(*ifma)); - ifma->ifma_addr = dupsa; - ifma->ifma_ifp = ifp; - ifma->ifma_usecount = 1; - ifma->ifma_refcount = 1; - - if (llifma != 0) { - if (if_addmulti_doesexist(ifp, llsa, &ifma->ifma_ll) == 0) { - FREE(llsa, M_IFMADDR); - FREE(llifma, M_IFMADDR); + if (llifma != NULL) { + VERIFY(!anon); /* must not get here if "anonymous" */ + if (if_addmulti_doesexist(ifp, llsa, &ifma->ifma_ll, 0) == 0) { + FREE(llsa, M_IFADDR); + llsa = NULL; + ifma_free(llifma); + llifma = NULL; + VERIFY(ifma->ifma_ll->ifma_ifp == ifp); } else { - bzero(llifma, sizeof(*llifma)); + ll_firstref = 1; llifma->ifma_addr = llsa; llifma->ifma_ifp = ifp; - llifma->ifma_usecount = 1; - llifma->ifma_refcount = 1; - LIST_INSERT_HEAD(&ifp->if_multiaddrs, llifma, ifma_link); - + IFMA_LOCK(llifma); + if_attach_ifma(ifp, llifma, 0); + /* add extra refcnt for ifma */ + IFMA_ADDREF_LOCKED(llifma); + IFMA_UNLOCK(llifma); ifma->ifma_ll = llifma; - ifma_reference(ifma->ifma_ll); } } - - LIST_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link); - - if (retifma) { + + /* "anonymous" request should not result in network address */ + VERIFY(!anon || ifma->ifma_ll == NULL); + + ifma->ifma_addr = dupsa; + ifma->ifma_ifp = ifp; + IFMA_LOCK(ifma); + if_attach_ifma(ifp, ifma, anon); + IFMA_ADDREF_LOCKED(ifma); /* for this routine */ + if (retifma != NULL) { *retifma = ifma; - ifma_reference(*retifma); + IFMA_ADDREF_LOCKED(*retifma); /* for caller */ } - + lladdr = (ifma->ifma_addr->sa_family == AF_UNSPEC || + ifma->ifma_addr->sa_family == AF_LINK); + IFMA_UNLOCK(ifma); ifnet_lock_done(ifp); - - if (llsa != 0) - rt_newmaddrmsg(RTM_NEWMADDR, ifma); + + rt_newmaddrmsg(RTM_NEWMADDR, ifma); + IFMA_REMREF(ifma); /* for this routine */ /* * We are certain we have added something, so call down to the - * interface to let them know about it. + * interface to let them know about it. Do this only for newly- + * added AF_LINK/AF_UNSPEC address in the if_multiaddrs set. 
*/ - ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL); - - return 0; - + if (lladdr || ll_firstref) + (void) ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL); + + if (ifp->if_updatemcasts > 0) + ifp->if_updatemcasts = 0; + + return (0); + cleanup: - if (ifma) - FREE(ifma, M_IFADDR); - if (dupsa) + if (ifma != NULL) + ifma_free(ifma); + if (dupsa != NULL) FREE(dupsa, M_IFADDR); - if (llifma) - FREE(llifma, M_IFADDR); - if (llsa) + if (llifma != NULL) + ifma_free(llifma); + if (llsa != NULL) FREE(llsa, M_IFADDR); - - return error; + + return (error); } +/* + * Delete a multicast group membership by network-layer group address. + * This routine is deprecated. + */ int -if_delmultiaddr( - struct ifmultiaddr *ifma, - int locked) +if_delmulti(struct ifnet *ifp, const struct sockaddr *sa) { - struct ifnet *ifp; - int do_del_multi = 0; - - ifp = ifma->ifma_ifp; - - if (!locked && ifp) { - ifnet_lock_exclusive(ifp); - } - - while (ifma != NULL) { - struct ifmultiaddr *ll_ifma; - - if (ifma->ifma_usecount > 1) { - ifma->ifma_usecount--; - break; - } - - if (ifp) - LIST_REMOVE(ifma, ifma_link); - - ll_ifma = ifma->ifma_ll; - - if (ll_ifma) { /* send a routing msg for network addresses only */ - if (ifp) - ifnet_lock_done(ifp); - rt_newmaddrmsg(RTM_DELMADDR, ifma); - if (ifp) - ifnet_lock_exclusive(ifp); - } - - /* - * Make sure the interface driver is notified - * in the case of a link layer mcast group being left. - */ - if (ll_ifma == 0) { - if (ifp && ifma->ifma_addr->sa_family == AF_LINK) - do_del_multi = 1; - break; - } - - if (ifp) - ifma_release(ifma); - - ifma = ll_ifma; - } - - if (!locked && ifp) { - /* This wasn't initially locked, we should unlock it */ - ifnet_lock_done(ifp); - } - - if (do_del_multi) { - if (locked) - ifnet_lock_done(ifp); - ifnet_ioctl(ifp, 0, SIOCDELMULTI, NULL); - if (locked) - ifnet_lock_exclusive(ifp); - } - - return 0; + return (if_delmulti_common(NULL, ifp, sa, 0)); } /* - * Remove a reference to a multicast address on this interface. Yell - * if the request does not match an existing membership. + * Delete a multicast group membership by group membership pointer. + * Network-layer protocol domains must use this routine. */ int -if_delmulti( - struct ifnet *ifp, - const struct sockaddr *sa) +if_delmulti_ifma(struct ifmultiaddr *ifma) +{ + return (if_delmulti_common(ifma, NULL, NULL, 0)); +} + +/* + * Anything other than network-layer protocol domains which hold references + * to the underlying link-layer record must use this routine: SIOCDELMULTI + * ioctl, ifnet_remove_multicast(), AppleTalk, if_bond. + */ +int +if_delmulti_anon(struct ifnet *ifp, const struct sockaddr *sa) +{ + return (if_delmulti_common(NULL, ifp, sa, 1)); +} + +/* + * Delete a multicast group membership by network-layer group address. + * + * Returns ENOENT if the entry could not be found. 
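[Editorial illustration] The companion teardown for the join sketch above, again compile-only. ifnet_remove_multicast() is named in the if_delmulti_anon() comment as one of its callers, and ifmaddr_release() drops the reference that ifnet_add_multicast() returned:

#include <net/kpi_interface.h>

static void
leave_link_group(ifmultiaddr_t *ifmap)
{
	if (*ifmap == NULL)
		return;
	/* by-reference delete; takes the anonymous path internally */
	(void) ifnet_remove_multicast(*ifmap);
	ifmaddr_release(*ifmap);	/* drop the add-time reference */
	*ifmap = NULL;
}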
+ */ +static int +if_delmulti_common(struct ifmultiaddr *ifma, struct ifnet *ifp, + const struct sockaddr *sa, int anon) { - struct ifmultiaddr *ifma; struct sockaddr *dupsa = NULL; - int retval = 0; + int lastref, ll_lastref = 0, lladdr; + struct ifmultiaddr *ll = NULL; - if (sa->sa_family == AF_LINK || sa->sa_family == AF_UNSPEC) { + /* sanity check for callers */ + VERIFY(ifma != NULL || (ifp != NULL && sa != NULL)); + + if (ifma != NULL) + ifp = ifma->ifma_ifp; + + if (sa != NULL && + (sa->sa_family == AF_LINK || sa->sa_family == AF_UNSPEC)) { dupsa = copy_and_normalize(sa); - if (dupsa == NULL) { - return ENOMEM; - } + if (dupsa == NULL) + return (ENOMEM); sa = dupsa; } - + ifnet_lock_exclusive(ifp); - for (ifma = ifp->if_multiaddrs.lh_first; ifma; - ifma = ifma->ifma_link.le_next) - if (equal(sa, ifma->ifma_addr)) + if (ifma == NULL) { + for (ifma = LIST_FIRST(&ifp->if_multiaddrs); ifma != NULL; + ifma = LIST_NEXT(ifma, ifma_link)) { + IFMA_LOCK(ifma); + if (!equal(sa, ifma->ifma_addr) || + (anon && !(ifma->ifma_flags & IFMAF_ANONYMOUS))) { + VERIFY(!(ifma->ifma_flags & IFMAF_ANONYMOUS) || + ifma->ifma_anoncnt != 0); + IFMA_UNLOCK(ifma); + continue; + } + /* found; keep it locked */ break; - if (ifma == 0) { - ifnet_lock_done(ifp); - if (dupsa) - FREE(dupsa, M_IFADDR); - return ENOENT; + } + if (ifma == NULL) { + if (dupsa != NULL) + FREE(dupsa, M_IFADDR); + ifnet_lock_done(ifp); + return (ENOENT); + } + } else { + IFMA_LOCK(ifma); + } + IFMA_LOCK_ASSERT_HELD(ifma); + IFMA_ADDREF_LOCKED(ifma); /* for this routine */ + lastref = if_detach_ifma(ifp, ifma, anon); + VERIFY(!lastref || (!(ifma->ifma_debug & IFD_ATTACHED) && + ifma->ifma_reqcnt == 0)); + VERIFY(!anon || ifma->ifma_ll == NULL); + ll = ifma->ifma_ll; + lladdr = (ifma->ifma_addr->sa_family == AF_UNSPEC || + ifma->ifma_addr->sa_family == AF_LINK); + IFMA_UNLOCK(ifma); + if (lastref && ll != NULL) { + IFMA_LOCK(ll); + ll_lastref = if_detach_ifma(ifp, ll, 0); + IFMA_UNLOCK(ll); } - - retval = if_delmultiaddr(ifma, 1); ifnet_lock_done(ifp); - if (dupsa) + + if (lastref) + rt_newmaddrmsg(RTM_DELMADDR, ifma); + + if ((ll == NULL && lastref && lladdr) || ll_lastref) { + /* + * Make sure the interface driver is notified in the + * case of a link layer mcast group being left. Do + * this only for a AF_LINK/AF_UNSPEC address that has + * been removed from the if_multiaddrs set. 
+ */ + ifnet_ioctl(ifp, 0, SIOCDELMULTI, NULL); + } + + if (lastref) + IFMA_REMREF(ifma); /* for if_multiaddrs list */ + if (ll_lastref) + IFMA_REMREF(ll); /* for if_multiaddrs list */ + + IFMA_REMREF(ifma); /* for this routine */ + if (dupsa != NULL) FREE(dupsa, M_IFADDR); - - return retval; -} + return (0); +} /* * We don't use if_setlladdr, our interfaces are responsible for @@ -2126,21 +2737,6 @@ if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len) } #endif -struct ifmultiaddr * -ifmaof_ifpforaddr(const struct sockaddr *sa, struct ifnet *ifp) -{ - struct ifmultiaddr *ifma; - - ifnet_lock_shared(ifp); - for (ifma = ifp->if_multiaddrs.lh_first; ifma; - ifma = ifma->ifma_link.le_next) - if (equal(ifma->ifma_addr, sa)) - break; - ifnet_lock_done(ifp); - - return ifma; -} - SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Link layers"); SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Generic link-management"); @@ -2279,17 +2875,18 @@ void if_rtmtu_update(struct ifnet *ifp) } __private_extern__ void -if_data_internal_to_if_data( - struct ifnet *ifp, - const struct if_data_internal *if_data_int, - struct if_data *if_data) +if_data_internal_to_if_data(struct ifnet *ifp, + const struct if_data_internal *if_data_int, struct if_data *if_data) { - struct dlil_threading_info *thread; - if ((thread = ifp->if_input_thread) == NULL || (dlil_multithreaded_input == 0)) - thread = dlil_lo_thread_ptr; - -#define COPYFIELD(fld) if_data->fld = if_data_int->fld +#pragma unused(ifp) +#define COPYFIELD(fld) if_data->fld = if_data_int->fld #define COPYFIELD32(fld) if_data->fld = (u_int32_t)(if_data_int->fld) +/* compiler will cast down to 32-bit */ +#define COPYFIELD32_ATOMIC(fld) do { \ + atomic_get_64(if_data->fld, \ + (u_int64_t *)(void *)(uintptr_t)&if_data_int->fld); \ +} while (0) + COPYFIELD(ifi_type); COPYFIELD(ifi_typelen); COPYFIELD(ifi_physical); @@ -2302,29 +2899,28 @@ if_data_internal_to_if_data( COPYFIELD(ifi_metric); if (if_data_int->ifi_baudrate & 0xFFFFFFFF00000000LL) { if_data->ifi_baudrate = 0xFFFFFFFF; - } - else { + } else { COPYFIELD32(ifi_baudrate); } + + COPYFIELD32_ATOMIC(ifi_ipackets); + COPYFIELD32_ATOMIC(ifi_ierrors); + COPYFIELD32_ATOMIC(ifi_opackets); + COPYFIELD32_ATOMIC(ifi_oerrors); + COPYFIELD32_ATOMIC(ifi_collisions); + COPYFIELD32_ATOMIC(ifi_ibytes); + COPYFIELD32_ATOMIC(ifi_obytes); + COPYFIELD32_ATOMIC(ifi_imcasts); + COPYFIELD32_ATOMIC(ifi_omcasts); + COPYFIELD32_ATOMIC(ifi_iqdrops); + COPYFIELD32_ATOMIC(ifi_noproto); + + COPYFIELD(ifi_recvtiming); + COPYFIELD(ifi_xmittiming); - lck_mtx_lock(thread->input_lck); - COPYFIELD32(ifi_ipackets); - COPYFIELD32(ifi_ierrors); - COPYFIELD32(ifi_opackets); - COPYFIELD32(ifi_oerrors); - COPYFIELD32(ifi_collisions); - COPYFIELD32(ifi_ibytes); - COPYFIELD32(ifi_obytes); - COPYFIELD32(ifi_imcasts); - COPYFIELD32(ifi_omcasts); - COPYFIELD32(ifi_iqdrops); - COPYFIELD32(ifi_noproto); - COPYFIELD32(ifi_recvtiming); - COPYFIELD32(ifi_xmittiming); if_data->ifi_lastchange.tv_sec = if_data_int->ifi_lastchange.tv_sec; if_data->ifi_lastchange.tv_usec = if_data_int->ifi_lastchange.tv_usec; - lck_mtx_unlock(thread->input_lck); - + #if IF_LASTCHANGEUPTIME if_data->ifi_lastchange.tv_sec += boottime_sec(); #endif @@ -2333,70 +2929,103 @@ if_data_internal_to_if_data( COPYFIELD(ifi_hwassist); if_data->ifi_reserved1 = 0; if_data->ifi_reserved2 = 0; +#undef COPYFIELD32_ATOMIC #undef COPYFIELD32 #undef COPYFIELD } __private_extern__ void -if_data_internal_to_if_data64( - struct ifnet *ifp, - const struct 
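[Editorial illustration] The COPYFIELD32_ATOMIC/COPYFIELD64_ATOMIC macros above replace the old input-thread mutex with a single atomic 64-bit load per counter. A standalone C11 model of that snapshot-and-truncate pattern, with atomic_load standing in for the kernel's atomic_get_64():

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct stats64 { _Atomic uint64_t ipackets, ibytes; };
struct stats32 { uint32_t ipackets, ibytes; };

/* One atomic 64-bit load per field avoids torn reads on 32-bit CPUs;
 * the result is then deliberately truncated for the 32-bit if_data ABI. */
static void
snapshot32(struct stats64 *in, struct stats32 *out)
{
	out->ipackets = (uint32_t)atomic_load(&in->ipackets);
	out->ibytes = (uint32_t)atomic_load(&in->ibytes);
}

int
main(void)
{
	struct stats64 s;
	struct stats32 o;

	atomic_init(&s.ipackets, 0x100000002ULL);	/* wrapped past 2^32 */
	atomic_init(&s.ibytes, 42);
	snapshot32(&s, &o);
	printf("ipackets=%u ibytes=%u\n", o.ipackets, o.ibytes);	/* 2 42 */
	return (0);
}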
if_data_internal *if_data_int, - struct if_data64 *if_data64) +if_data_internal_to_if_data64(struct ifnet *ifp, + const struct if_data_internal *if_data_int, + struct if_data64 *if_data64) { - struct dlil_threading_info *thread; - if ((thread = ifp->if_input_thread) == NULL || (dlil_multithreaded_input == 0)) - thread = dlil_lo_thread_ptr; - -#define COPYFIELD(fld) if_data64->fld = if_data_int->fld - COPYFIELD(ifi_type); - COPYFIELD(ifi_typelen); - COPYFIELD(ifi_physical); - COPYFIELD(ifi_addrlen); - COPYFIELD(ifi_hdrlen); - COPYFIELD(ifi_recvquota); - COPYFIELD(ifi_xmitquota); +#pragma unused(ifp) +#define COPYFIELD64(fld) if_data64->fld = if_data_int->fld +#define COPYFIELD64_ATOMIC(fld) do { \ + atomic_get_64(if_data64->fld, \ + (u_int64_t *)(void *)(uintptr_t)&if_data_int->fld); \ +} while (0) + + COPYFIELD64(ifi_type); + COPYFIELD64(ifi_typelen); + COPYFIELD64(ifi_physical); + COPYFIELD64(ifi_addrlen); + COPYFIELD64(ifi_hdrlen); + COPYFIELD64(ifi_recvquota); + COPYFIELD64(ifi_xmitquota); if_data64->ifi_unused1 = 0; - COPYFIELD(ifi_mtu); - COPYFIELD(ifi_metric); - COPYFIELD(ifi_baudrate); - - lck_mtx_lock(thread->input_lck); - COPYFIELD(ifi_ipackets); - COPYFIELD(ifi_ierrors); - COPYFIELD(ifi_opackets); - COPYFIELD(ifi_oerrors); - COPYFIELD(ifi_collisions); - COPYFIELD(ifi_ibytes); - COPYFIELD(ifi_obytes); - COPYFIELD(ifi_imcasts); - COPYFIELD(ifi_omcasts); - COPYFIELD(ifi_iqdrops); - COPYFIELD(ifi_noproto); - COPYFIELD(ifi_recvtiming); - COPYFIELD(ifi_xmittiming); + COPYFIELD64(ifi_mtu); + COPYFIELD64(ifi_metric); + COPYFIELD64(ifi_baudrate); + + COPYFIELD64_ATOMIC(ifi_ipackets); + COPYFIELD64_ATOMIC(ifi_ierrors); + COPYFIELD64_ATOMIC(ifi_opackets); + COPYFIELD64_ATOMIC(ifi_oerrors); + COPYFIELD64_ATOMIC(ifi_collisions); + COPYFIELD64_ATOMIC(ifi_ibytes); + COPYFIELD64_ATOMIC(ifi_obytes); + COPYFIELD64_ATOMIC(ifi_imcasts); + COPYFIELD64_ATOMIC(ifi_omcasts); + COPYFIELD64_ATOMIC(ifi_iqdrops); + COPYFIELD64_ATOMIC(ifi_noproto); + + /* Note these two fields are actually 32 bit, so doing COPYFIELD64_ATOMIC will + * cause them to be misaligned + */ + COPYFIELD64(ifi_recvtiming); + COPYFIELD64(ifi_xmittiming); + if_data64->ifi_lastchange.tv_sec = if_data_int->ifi_lastchange.tv_sec; if_data64->ifi_lastchange.tv_usec = if_data_int->ifi_lastchange.tv_usec; - lck_mtx_unlock(thread->input_lck); - + #if IF_LASTCHANGEUPTIME if_data64->ifi_lastchange.tv_sec += boottime_sec(); #endif -#undef COPYFIELD +#undef COPYFIELD64 } -void -ifafree(struct ifaddr *ifa) +__private_extern__ void +if_copy_traffic_class(struct ifnet *ifp, + struct if_traffic_class *if_tc) { - int oldval; +#define COPY_IF_TC_FIELD64_ATOMIC(fld) do { \ + atomic_get_64(if_tc->fld, \ + (u_int64_t *)(void *)(uintptr_t)&ifp->if_tc.fld); \ +} while (0) + + COPY_IF_TC_FIELD64_ATOMIC(ifi_ibkpackets); + COPY_IF_TC_FIELD64_ATOMIC(ifi_ibkbytes); + COPY_IF_TC_FIELD64_ATOMIC(ifi_obkpackets); + COPY_IF_TC_FIELD64_ATOMIC(ifi_obkbytes); + COPY_IF_TC_FIELD64_ATOMIC(ifi_ivipackets); + COPY_IF_TC_FIELD64_ATOMIC(ifi_ivibytes); + COPY_IF_TC_FIELD64_ATOMIC(ifi_ovipackets); + COPY_IF_TC_FIELD64_ATOMIC(ifi_ovibytes); + COPY_IF_TC_FIELD64_ATOMIC(ifi_ivopackets); + COPY_IF_TC_FIELD64_ATOMIC(ifi_ivobytes); + COPY_IF_TC_FIELD64_ATOMIC(ifi_ovopackets); + COPY_IF_TC_FIELD64_ATOMIC(ifi_ovobytes); + +#undef COPY_IF_TC_FIELD64_ATOMIC +} - oldval = OSAddAtomic(-1, &ifa->ifa_refcnt); - if (oldval >= 1 && ifa->ifa_trace != NULL) - (*ifa->ifa_trace)(ifa, FALSE); - if (oldval == 0) { + +struct ifaddr * +ifa_remref(struct ifaddr *ifa, int locked) +{ + if (!locked) + 
IFA_LOCK_SPIN(ifa); + else + IFA_LOCK_ASSERT_HELD(ifa); + + if (ifa->ifa_refcnt == 0) panic("%s: ifa %p negative refcnt\n", __func__, ifa); - } else if (oldval == 1) { - if (ifa->ifa_debug & IFD_ATTACHED) + else if (ifa->ifa_trace != NULL) + (*ifa->ifa_trace)(ifa, FALSE); + if (--ifa->ifa_refcnt == 0) { + if (ifa->ifa_debug & IFD_ATTACHED) panic("ifa %p attached to ifp is being freed\n", ifa); /* * Some interface addresses are allocated either statically @@ -2406,22 +3035,54 @@ ifafree(struct ifaddr *ifa) * leave it alone. */ if (ifa->ifa_debug & IFD_ALLOC) { - if (ifa->ifa_free == NULL) + if (ifa->ifa_free == NULL) { + IFA_UNLOCK(ifa); FREE(ifa, M_IFADDR); - else + } else { + /* Become a regular mutex */ + IFA_CONVERT_LOCK(ifa); + /* callee will unlock */ (*ifa->ifa_free)(ifa); + } + } else { + IFA_UNLOCK(ifa); } + ifa = NULL; } + + if (!locked && ifa != NULL) + IFA_UNLOCK(ifa); + + return (ifa); } void -ifaref(struct ifaddr *ifa) +ifa_addref(struct ifaddr *ifa, int locked) { - int oldval; + if (!locked) + IFA_LOCK_SPIN(ifa); + else + IFA_LOCK_ASSERT_HELD(ifa); - oldval = OSAddAtomic(1, &ifa->ifa_refcnt); - if (oldval < 0) - panic("%s: ifa %p negative refcnt\n", __func__, ifa); - else if (ifa->ifa_trace != NULL) + if (++ifa->ifa_refcnt == 0) { + panic("%s: ifa %p wraparound refcnt\n", __func__, ifa); + /* NOTREACHED */ + } else if (ifa->ifa_trace != NULL) { (*ifa->ifa_trace)(ifa, TRUE); + } + if (!locked) + IFA_UNLOCK(ifa); +} + +void +ifa_lock_init(struct ifaddr *ifa) +{ + lck_mtx_init(&ifa->ifa_lock, ifa_mtx_grp, ifa_mtx_attr); +} + +void +ifa_lock_destroy(struct ifaddr *ifa) +{ + IFA_LOCK_ASSERT_NOTHELD(ifa); + lck_mtx_destroy(&ifa->ifa_lock, ifa_mtx_grp); } diff --git a/bsd/net/if.h b/bsd/net/if.h index 1e847ab04..a7974460c 100644 --- a/bsd/net/if.h +++ b/bsd/net/if.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -96,6 +96,7 @@
 #define KEV_DL_LINK_ADDRESS_CHANGED	16
 #define KEV_DL_WAKEFLAGS_CHANGED	17
 #define KEV_DL_IF_IDLE_ROUTE_REFCNT	18
+#define KEV_DL_IFCAP_CHANGED		19

 #include <net/if_var.h>
 #include <sys/types.h>
@@ -146,16 +147,17 @@ struct if_clonereq32 {
 #define IFEF_AUTOCONFIGURING	0x1
 #define IFEF_DVR_REENTRY_OK	0x20	/* When set, driver may be reentered from its own thread */
 #define IFEF_ACCEPT_RTADVD	0x40	/* set to accept IPv6 router advertisement on the interface */
-#define IFEF_DETACHING		0x80	/* Set when interface is detaching */
-#define IFEF_USEKPI		0x100	/* Set when interface is created through the KPIs */
+#define _IFEF_DETACHING		0x80	/* deprecated */
+#define IFEF_USEKPI		0x100	/* Set when interface is created through the KPIs */
 #define IFEF_VLAN		0x200	/* interface has one or more vlans */
 #define IFEF_BOND		0x400	/* interface is part of bond */
 #define IFEF_ARPLL		0x800	/* ARP for IPv4LL addresses on this port */
 #define IFEF_NOWINDOWSCALE	0x1000	/* Don't scale TCP window on iface */
 #define IFEF_NOAUTOIPV6LL	0x2000	/* Interface IPv6 LinkLocal address not provided by kernel */
-#define IFEF_SENDLIST		0x10000000 /* Interface supports sending a list of packets */
-#define IFEF_REUSE		0x20000000 /* DLIL ifnet recycler, ifnet is not new */
-#define IFEF_INUSE		0x40000000 /* DLIL ifnet recycler, ifnet in use */
+#define IFEF_SERVICE_TRIGGERED	0x20000	/* interface is on-demand dynamically created/destroyed */
+#define IFEF_SENDLIST		0x10000000 /* Interface supports sending a list of packets */
+#define _IFEF_REUSE		0x20000000 /* deprecated */
+#define _IFEF_INUSE		0x40000000 /* deprecated */
 #define IFEF_UPDOWNCHANGE	0x80000000 /* Interface's up/down state is changing */

 /*
@@ -177,6 +179,40 @@ struct if_clonereq32 {
 #endif /* KERNEL_PRIVATE */

+/*
+ * Capabilities that interfaces can advertise.
+ *
+ * struct ifnet.if_capabilities
+ *	contains the optional features & capabilities a particular interface
+ *	supports (not only the driver but also the detected hw revision).
+ *	Capabilities are defined by IFCAP_* below.
+ * struct ifnet.if_capenable
+ *	contains the enabled (either by default or through ifconfig) optional
+ *	features & capabilities on this interface.
+ *	Capabilities are defined by IFCAP_* below.
+ * struct if_data.ifi_hwassist in IFNET_* form, defined in net/kpi_interface.h,
+ *	contains the enabled optional features & capabilities that can be used
+ *	individually per packet and are specified in the mbuf pkthdr.csum_flags
+ *	field.  IFCAP_* and IFNET_* do not match one to one and IFNET_* may be
+ *	more detailed or differentiated than IFCAP_*.
+ * IFNET_* hwassist flags have corresponding CSUM_* in sys/mbuf.h + */ +#define IFCAP_RXCSUM 0x00001 /* can offload checksum on RX */ +#define IFCAP_TXCSUM 0x00002 /* can offload checksum on TX */ +#define IFCAP_VLAN_MTU 0x00004 /* VLAN-compatible MTU */ +#define IFCAP_VLAN_HWTAGGING 0x00008 /* hardware VLAN tag support */ +#define IFCAP_JUMBO_MTU 0x00010 /* 9000 byte MTU supported */ +#define IFCAP_TSO4 0x00020 /* can do TCP Segmentation Offload */ +#define IFCAP_TSO6 0x00040 /* can do TCP6 Segmentation Offload */ +#define IFCAP_LRO 0x00080 /* can do Large Receive Offload */ +#define IFCAP_AV 0x00100 /* can do 802.1 AV Bridging */ + +#define IFCAP_HWCSUM (IFCAP_RXCSUM | IFCAP_TXCSUM) +#define IFCAP_TSO (IFCAP_TSO4 | IFCAP_TSO6) + +#define IFCAP_VALID (IFCAP_HWCSUM | IFCAP_TSO | IFCAP_LRO | IFCAP_VLAN_MTU | \ + IFCAP_VLAN_HWTAGGING | IFCAP_JUMBO_MTU | IFCAP_AV) + #define IFQ_MAXLEN 50 #define IFNET_SLOWHZ 1 /* granularity is 1 second */ @@ -341,6 +377,7 @@ struct ifreq { struct ifkpi ifru_kpi; u_int32_t ifru_wake_flags; u_int32_t ifru_route_refcnt; + int ifru_cap[2]; } ifr_ifru; #define ifr_addr ifr_ifru.ifru_addr /* address */ #define ifr_dstaddr ifr_ifru.ifru_dstaddr /* other end of p-to-p link */ @@ -364,6 +401,8 @@ struct ifreq { #define ifr_kpi ifr_ifru.ifru_kpi #define ifr_wake_flags ifr_ifru.ifru_wake_flags /* wake capabilities of devive */ #define ifr_route_refcnt ifr_ifru.ifru_route_refcnt /* route references on interface */ +#define ifr_reqcap ifr_ifru.ifru_cap[0] /* requested capabilities */ +#define ifr_curcap ifr_ifru.ifru_cap[1] /* current capabilities */ }; #define _SIZEOF_ADDR_IFREQ(ifr) \ diff --git a/bsd/net/if_atm.h b/bsd/net/if_atm.h deleted file mode 100644 index dc4689b56..000000000 --- a/bsd/net/if_atm.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* $NetBSD: if_atm.h,v 1.7 1996/11/09 23:02:27 chuck Exp $ */ -/* $FreeBSD: src/sys/net/if_atm.h,v 1.4 1999/12/29 04:38:34 peter Exp $ */ - -/* - * - * Copyright (c) 1996 Charles D. Cranor and Washington University. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
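[Editorial illustration] The IFCAP_* scheme above splits what the hardware supports (if_capabilities) from what is switched on (if_capenable), with ifr_reqcap/ifr_curcap as the ifreq view. A standalone sketch of the conventional BSD subset check; the values are copied from the header, but the enforcement shown is illustrative, not code from this patch:

#include <stdio.h>

#define IFCAP_RXCSUM	0x00001
#define IFCAP_TXCSUM	0x00002
#define IFCAP_TSO4	0x00020
#define IFCAP_TSO6	0x00040
#define IFCAP_HWCSUM	(IFCAP_RXCSUM | IFCAP_TXCSUM)
#define IFCAP_TSO	(IFCAP_TSO4 | IFCAP_TSO6)

int
main(void)
{
	int if_capabilities = IFCAP_HWCSUM | IFCAP_TSO4;	/* hw can do */
	int if_capenable = IFCAP_HWCSUM;			/* switched on */
	int reqcap = IFCAP_TXCSUM | IFCAP_TSO6;			/* ifr_reqcap */

	/* a request must be a subset of what the interface advertises */
	if ((reqcap & ~if_capabilities) != 0)
		printf("rejecting: unsupported bits 0x%x\n",
		    reqcap & ~if_capabilities);
	else
		if_capenable = reqcap;
	printf("enabled (ifr_curcap view): 0x%x\n", if_capenable);
	return (0);
}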
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by Charles D. Cranor and - * Washington University. - * 4. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * net/if_atm.h - */ - -#ifdef KERNEL_PRIVATE -#if defined(__NetBSD__) || defined(__OpenBSD__) || defined(__bsdi__) -#define RTALLOC1(A,B) rtalloc1((A),(B)) -#elif defined(__FreeBSD__) || defined(__APPLE__) -#define RTALLOC1(A,B) rtalloc1((A),(B),0UL) -#endif -#endif /* KERNEL_PRIVATE */ - -/* - * pseudo header for packet transmission - */ -struct atm_pseudohdr { - u_int8_t atm_ph[4]; /* flags+VPI+VCI1(msb)+VCI2(lsb) */ -}; - -#define ATM_PH_FLAGS(X) ((X)->atm_ph[0]) -#define ATM_PH_VPI(X) ((X)->atm_ph[1]) -#define ATM_PH_VCI(X) ((((X)->atm_ph[2]) << 8) | ((X)->atm_ph[3])) -#define ATM_PH_SETVCI(X,V) { \ - (X)->atm_ph[2] = ((V) >> 8) & 0xff; \ - (X)->atm_ph[3] = ((V) & 0xff); \ -} - -#define ATM_PH_AAL5 0x01 /* use AAL5? (0 == aal0) */ -#define ATM_PH_LLCSNAP 0x02 /* use the LLC SNAP encoding (iff aal5) */ - -#define ATM_PH_DRIVER7 0x40 /* reserve for driver's use */ -#define ATM_PH_DRIVER8 0x80 /* reserve for driver's use */ - -#define ATMMTU 9180 /* ATM MTU size for IP */ - /* XXX: could be 9188 with LLC/SNAP according - to comer */ - -/* user's ioctl hook for raw atm mode */ -#define SIOCRAWATM _IOWR('a', 122, int) /* set driver's raw mode */ - -/* atm_pseudoioctl: turns on and off RX VCIs [for internal use only!] 
*/ -struct atm_pseudoioctl { - struct atm_pseudohdr aph; - void *rxhand; -}; -#define SIOCATMENA _IOWR('a', 123, struct atm_pseudoioctl) /* enable */ -#define SIOCATMDIS _IOWR('a', 124, struct atm_pseudoioctl) /* disable */ - - -/* - * XXX forget all the garbage in if_llc.h and do it the easy way - */ - -#define ATMLLC_HDR "\252\252\3\0\0\0" -struct atmllc { - u_int8_t llchdr[6]; /* aa.aa.03.00.00.00 */ - u_int8_t type[2]; /* "ethernet" type */ -}; - -/* ATM_LLC macros: note type code in host byte order */ -#define ATM_LLC_TYPE(X) (((X)->type[0] << 8) | ((X)->type[1])) -#define ATM_LLC_SETTYPE(X,V) { \ - (X)->type[1] = ((V) >> 8) & 0xff; \ - (X)->type[0] = ((V) & 0xff); \ -} - -#ifdef KERNEL_PRIVATE -void atm_ifattach(struct ifnet *); -void atm_input(struct ifnet *, struct atm_pseudohdr *, - struct mbuf *, void *); -int atm_output(struct ifnet *, struct mbuf *, struct sockaddr *, - struct rtentry *); -#endif /* KERNEL_PRIVATE */ - diff --git a/bsd/net/if_bond.c b/bsd/net/if_bond.c index fa07935a6..91790bd3a 100644 --- a/bsd/net/if_bond.c +++ b/bsd/net/if_bond.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2010 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2004-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,8 +79,6 @@ #include <net/if_media.h> #include <net/multicast_list.h> -extern void dlil_input_packet_list(struct ifnet *, struct mbuf *); - static struct ether_addr slow_proto_multicast = { IEEE8023AD_SLOW_PROTO_MULTICAST }; @@ -724,8 +722,9 @@ ifbond_release(ifbond_ref ifb) printf("ifbond_release(%s) removing multicast\n", ifb->ifb_name); } - (void)if_delmultiaddr(ifb->ifb_ifma_slow_proto, 0); - ifma_release(ifb->ifb_ifma_slow_proto); + (void) if_delmulti_anon(ifb->ifb_ifma_slow_proto->ifma_ifp, + ifb->ifb_ifma_slow_proto->ifma_addr); + IFMA_REMREF(ifb->ifb_ifma_slow_proto); } if (ifb->ifb_distributing_array != NULL) { FREE(ifb->ifb_distributing_array, M_BOND); @@ -885,10 +884,6 @@ if_siflladdr(struct ifnet * ifp, const struct ether_addr * ea_p) ifr.ifr_addr.sa_family = AF_UNSPEC; ifr.ifr_addr.sa_len = ETHER_ADDR_LEN; ether_addr_copy(ifr.ifr_addr.sa_data, ea_p); -#if 0 - snprintf(ifr.ifr_name, sizeof(ifr.ifr_name), "%s%d", ifnet_name(ifp), - ifnet_unit(ifp)); -#endif return (ifnet_ioctl(ifp, 0, SIOCSIFLLADDR, &ifr)); } @@ -909,9 +904,6 @@ bond_globals_create(lacp_system_priority sys_pri, TAILQ_INIT(&b->ifbond_list); b->system = *sys; b->system_priority = sys_pri; -#if 0 - b->verbose = 1; -#endif return (b); } @@ -936,7 +928,6 @@ bond_globals_init(void) for (i = 0; i < 4; i++) { char ifname[IFNAMSIZ+1]; snprintf(ifname, sizeof(ifname), "en%d", i); - /* XXX ifunit() needs to return a reference on the ifp */ ifp = ifunit(ifname); if (ifp != NULL) { break; @@ -1108,8 +1099,7 @@ ifbond_add_slow_proto_multicast(ifbond_ref ifb) sdl.sdl_nlen = 0; sdl.sdl_alen = sizeof(slow_proto_multicast); bcopy(&slow_proto_multicast, sdl.sdl_data, sizeof(slow_proto_multicast)); - error = if_addmulti(ifb->ifb_ifp, (struct sockaddr *)&sdl, - &ifma); + error = if_addmulti_anon(ifb->ifb_ifp, (struct sockaddr *)&sdl, &ifma); if (error == 0) { ifb->ifb_ifma_slow_proto = ifma; } @@ -1236,10 +1226,10 @@ bond_if_detach(struct ifnet * ifp) int error; error = ifnet_detach(ifp); - if (error) { - printf("bond_if_detach %s%d: ifnet_detach failed, %d\n", - ifnet_name(ifp), ifnet_unit(ifp), error); - } + if (error) { + printf("bond_if_detach %s%d: ifnet_detach failed, %d\n", + ifnet_name(ifp), ifnet_unit(ifp), error); + } return; } @@ -2571,24 +2561,10 @@ static int 
bond_set_promisc(__unused struct ifnet *ifp) { int error = 0; -#if 0 - ifbond_ref ifb = ifnet_softc(ifp); - - - if ((ifnet_flags(ifp) & IFF_PROMISC) != 0) { - if ((ifb->ifb_flags & IFBF_PROMISC) == 0) { - error = ifnet_set_promiscuous(ifb->ifb_p, 1); - if (error == 0) - ifb->ifb_flags |= IFBF_PROMISC; - } - } else { - if ((ifb->ifb_flags & IFBF_PROMISC) != 0) { - error = ifnet_set_promiscuous(ifb->ifb_p, 0); - if (error == 0) - ifb->ifb_flags &= ~IFBF_PROMISC; - } - } -#endif + /* + * The benefit of doing this currently does not warrant + * the added code complexity. Do nothing and return. + */ return (error); } @@ -2812,7 +2788,6 @@ bond_ioctl(struct ifnet *ifp, u_long cmd, void * data) switch (ibr.ibr_op) { case IF_BOND_OP_ADD_INTERFACE: case IF_BOND_OP_REMOVE_INTERFACE: - /* XXX ifunit() needs to return a reference on the ifp */ port_ifp = ifunit(ibr.ibr_ibru.ibru_if_name); if (port_ifp == NULL) { error = ENXIO; @@ -2947,23 +2922,16 @@ bond_if_free(struct ifnet * ifp) } static void -bond_event(struct ifnet * port_ifp, __unused protocol_family_t protocol, - const struct kev_msg * event) +bond_handle_event(struct ifnet * port_ifp, int event_code) { struct ifnet * bond_ifp = NULL; - int event_code = 0; ifbond_ref ifb; int old_distributing_count; bondport_ref p; struct media_info media_info = { 0, 0}; - if (event->vendor_code != KEV_VENDOR_APPLE - || event->kev_class != KEV_NETWORK_CLASS - || event->kev_subclass != KEV_DL_SUBCLASS) { - return; - } - switch (event->event_code) { - case KEV_DL_IF_DETACHING: + switch (event_code) { + case KEV_DL_IF_DETACHED: break; case KEV_DL_LINK_OFF: case KEV_DL_LINK_ON: @@ -2980,8 +2948,8 @@ bond_event(struct ifnet * port_ifp, __unused protocol_family_t protocol, } ifb = p->po_bond; old_distributing_count = ifb->ifb_distributing_count; - switch (event->event_code) { - case KEV_DL_IF_DETACHING: + switch (event_code) { + case KEV_DL_IF_DETACHED: bond_remove_interface(ifb, p->po_ifp); break; case KEV_DL_LINK_OFF: @@ -3042,6 +3010,37 @@ bond_event(struct ifnet * port_ifp, __unused protocol_family_t protocol, return; } +static void +bond_event(struct ifnet * port_ifp, __unused protocol_family_t protocol, + const struct kev_msg * event) +{ + int event_code; + + if (event->vendor_code != KEV_VENDOR_APPLE + || event->kev_class != KEV_NETWORK_CLASS + || event->kev_subclass != KEV_DL_SUBCLASS) { + return; + } + event_code = event->event_code; + switch (event_code) { + case KEV_DL_LINK_OFF: + case KEV_DL_LINK_ON: + /* we only care about link status changes */ + bond_handle_event(port_ifp, event_code); + break; + default: + break; + } + return; +} + +static errno_t +bond_detached(ifnet_t port_ifp, __unused protocol_family_t protocol) +{ + bond_handle_event(port_ifp, KEV_DL_IF_DETACHED); + return (0); +} + static void interface_link_event(struct ifnet * ifp, u_int32_t event_code) { @@ -3051,6 +3050,7 @@ interface_link_event(struct ifnet * ifp, u_int32_t event_code) char if_name[IFNAMSIZ]; } event; + bzero(&event, sizeof(event)); event.header.total_size = sizeof(event); event.header.vendor_code = KEV_VENDOR_APPLE; event.header.kev_class = KEV_NETWORK_CLASS; @@ -3082,6 +3082,7 @@ bond_attach_protocol(struct ifnet *ifp) bzero(®, sizeof(reg)); reg.input = bond_input; reg.event = bond_event; + reg.detached = bond_detached; error = ifnet_attach_protocol(ifp, PF_BOND, ®); if (error) { diff --git a/bsd/net/if_bridge.c b/bsd/net/if_bridge.c new file mode 100644 index 000000000..fd546fa0e --- /dev/null +++ b/bsd/net/if_bridge.c @@ -0,0 +1,5138 @@ +/* $NetBSD: if_bridge.c,v 1.31 
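[Editorial illustration] The bond_event()/bond_detached() split above narrows the kernel-event path to link-state codes and delivers interface detach through the protocol's detached callback instead of the old KEV_DL_IF_DETACHING event. A standalone model of that envelope-then-code demux; the constant values are illustrative, not the real header values:

#include <stdio.h>

#define KEV_VENDOR_APPLE	1
#define KEV_NETWORK_CLASS	1
#define KEV_DL_SUBCLASS		2
#define KEV_DL_LINK_OFF		12
#define KEV_DL_LINK_ON		13

struct kev { int vendor, klass, subclass, code; };

static void
handle(int code)			/* stands in for bond_handle_event() */
{
	printf("handle %d\n", code);
}

/* filter on the event envelope first, then dispatch only link codes */
static void
bond_event_model(const struct kev *e)
{
	if (e->vendor != KEV_VENDOR_APPLE || e->klass != KEV_NETWORK_CLASS ||
	    e->subclass != KEV_DL_SUBCLASS)
		return;
	switch (e->code) {
	case KEV_DL_LINK_OFF:
	case KEV_DL_LINK_ON:
		handle(e->code);
		break;
	default:
		break;
	}
}

int
main(void)
{
	struct kev up = { 1, 1, 2, KEV_DL_LINK_ON };

	bond_event_model(&up);
	return (0);
}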
2005/06/01 19:45:34 jdc Exp $ */ +/* + * Copyright (c) 2004-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Copyright 2001 Wasabi Systems, Inc. + * All rights reserved. + * + * Written by Jason R. Thorpe for Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed for the NetBSD Project by + * Wasabi Systems, Inc. + * 4. The name of Wasabi Systems, Inc. may not be used to endorse + * or promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1999, 2000 Jason L. Wright (jason@thought.net) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * OpenBSD: if_bridge.c,v 1.60 2001/06/15 03:38:33 itojun Exp + */ + +/* + * Network interface bridge support. + * + * TODO: + * + * - Currently only supports Ethernet-like interfaces (Ethernet, + * 802.11, VLANs on Ethernet, etc.) Figure out a nice way + * to bridge other types of interfaces (FDDI-FDDI, and maybe + * consider heterogenous bridges). + */ + +#include <sys/cdefs.h> +//__FBSDID("$FreeBSD$"); + +//#include "opt_inet.h" +//#include "opt_inet6.h" +//#include "opt_carp.h" + +#define BRIDGE_DEBUG 1 +#ifndef BRIDGE_DEBUG +#define BRIDGE_DEBUG 0 +#endif /* BRIDGE_DEBUG */ + +#include <sys/param.h> +#include <sys/mbuf.h> +#include <sys/malloc.h> +#include <sys/protosw.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/socket.h> /* for net/if.h */ +#include <sys/sockio.h> +//#include <sys/ctype.h> /* string functions */ +#include <sys/kernel.h> +#include <sys/random.h> +#include <sys/syslog.h> +#include <sys/sysctl.h> +//#include <vm/uma.h> +//#include <sys/module.h> +//#include <sys/priv.h> +#include <sys/proc.h> +#include <sys/lock.h> +//#include <sys/mutex.h> +#include <sys/mcache.h> + +#include <sys/kauth.h> + +#include <libkern/libkern.h> + +#include <kern/zalloc.h> + +#if NBPFILTER > 0 +#include <net/bpf.h> +#endif +#include <net/if.h> +//#include <net/if_clone.h> +#include <net/if_dl.h> +#include <net/if_types.h> +#include <net/if_var.h> +//#include <net/pfil.h> + +#include <netinet/in.h> /* for struct arpcom */ +#include <netinet/in_systm.h> +#include <netinet/in_var.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#ifdef INET6 +#include <netinet/ip6.h> +#include <netinet6/ip6_var.h> +#endif +#ifdef DEV_CARP +#include <netinet/ip_carp.h> +#endif +//#include <machine/in_cksum.h> +#include <netinet/if_ether.h> /* for struct arpcom */ +#include <net/bridgestp.h> +#include <net/if_bridgevar.h> +#include <net/if_llc.h> +#include <net/if_vlan_var.h> + +#include <net/if_ether.h> +#include <net/dlil.h> +#include <net/kpi_interfacefilter.h> + +#include <net/route.h> +#ifdef PFIL_HOOKS +#include <netinet/ip_fw2.h> +#include <netinet/ip_dummynet.h> +#endif /* PFIL_HOOKS */ + +#if BRIDGE_DEBUG + +#define BR_LCKDBG_MAX 4 + +#define BRIDGE_LOCK(_sc) bridge_lock(_sc) +#define BRIDGE_UNLOCK(_sc) bridge_unlock(_sc) +#define BRIDGE_LOCK_ASSERT(_sc) lck_mtx_assert((_sc)->sc_mtx, LCK_MTX_ASSERT_OWNED) +#define BRIDGE_LOCK2REF(_sc, _err) _err = bridge_lock2ref(_sc) +#define BRIDGE_UNREF(_sc) 
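[Editorial illustration] The BRIDGE_LOCK2REF/BRIDGE_UNREF/BRIDGE_XLOCK macros above (spelled out in the non-debug variants just below) let readers drop the softc mutex while holding a count, and let a writer block new readers and drain the count. A compile-only pthread model of the same discipline, assuming standard condvar semantics:

#include <errno.h>
#include <pthread.h>

struct brmodel {
	pthread_mutex_t mtx;
	pthread_cond_t cv;
	int iflist_ref;			/* readers currently off-lock */
	int iflist_xcnt;		/* writers draining readers */
};

static int
br_lock2ref(struct brmodel *b)		/* BRIDGE_LOCK2REF analogue */
{
	int err = 0;

	pthread_mutex_lock(&b->mtx);
	if (b->iflist_xcnt > 0)
		err = EBUSY;		/* a writer is draining; back off */
	else
		b->iflist_ref++;
	pthread_mutex_unlock(&b->mtx);
	return (err);
}

static void
br_unref(struct brmodel *b)		/* BRIDGE_UNREF analogue */
{
	pthread_mutex_lock(&b->mtx);
	if (--b->iflist_ref == 0 && b->iflist_xcnt > 0)
		pthread_cond_broadcast(&b->cv);	/* wakeup(&sc_cv) analogue */
	pthread_mutex_unlock(&b->mtx);
}

static void
br_xlock(struct brmodel *b)		/* BRIDGE_XLOCK; caller holds mtx */
{
	b->iflist_xcnt++;
	while (b->iflist_ref > 0)
		pthread_cond_wait(&b->cv, &b->mtx);	/* msleep analogue */
}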
bridge_unref(_sc) +#define BRIDGE_XLOCK(_sc) bridge_xlock(_sc) +#define BRIDGE_XDROP(_sc) bridge_xdrop(_sc) + +#else /* BRIDGE_DEBUG */ + +#define BRIDGE_LOCK(_sc) lck_mtx_lock((_sc)->sc_mtx) +#define BRIDGE_UNLOCK(_sc) lck_mtx_unlock((_sc)->sc_mtx) +#define BRIDGE_LOCK_ASSERT(_sc) lck_mtx_assert((_sc)->sc_mtx, LCK_MTX_ASSERT_OWNED) +#define BRIDGE_LOCK2REF(_sc, _err) do { \ + lck_mtx_assert((_sc)->sc_mtx, LCK_MTX_ASSERT_OWNED); \ + if ((_sc)->sc_iflist_xcnt > 0) \ + (_err) = EBUSY; \ + else \ + (_sc)->sc_iflist_ref++; \ + lck_mtx_unlock((_sc)->sc_mtx); \ +} while (0) +#define BRIDGE_UNREF(_sc) do { \ + lck_mtx_lock((_sc)->sc_mtx); \ + (_sc)->sc_iflist_ref--; \ + if (((_sc)->sc_iflist_xcnt > 0) && ((_sc)->sc_iflist_ref == 0)) { \ + lck_mtx_unlock((_sc)->sc_mtx); \ + wakeup(&(_sc)->sc_cv); \ + } else \ + lck_mtx_unlock((_sc)->sc_mtx); \ +} while (0) +#define BRIDGE_XLOCK(_sc) do { \ + lck_mtx_assert((_sc)->sc_mtx, LCK_MTX_ASSERT_OWNED); \ + (_sc)->sc_iflist_xcnt++; \ + while ((_sc)->sc_iflist_ref > 0) \ + msleep(&(_sc)->sc_cv, (_sc)->sc_mtx, PZERO, "BRIDGE_XLOCK", NULL); \ +} while (0) +#define BRIDGE_XDROP(_sc) do { \ + lck_mtx_assert((_sc)->sc_mtx, LCK_MTX_ASSERT_OWNED); \ + (_sc)->sc_iflist_xcnt--; \ +} while (0) + +#endif /* BRIDGE_DEBUG */ + +#if NBPFILTER > 0 +#define BRIDGE_BPF_MTAP_INPUT(sc, m) \ + if (sc->sc_bpf_input) \ + bridge_bpf_input(sc->sc_ifp, m) +#else /* NBPFILTER */ +#define BRIDGE_BPF_MTAP_INPUT(ifp, m) +#endif /* NBPFILTER */ + +/* + * Size of the route hash table. Must be a power of two. + */ +/* APPLE MODIFICATION - per Wasabi performance improvement, change the hash table size */ +#if 0 +#ifndef BRIDGE_RTHASH_SIZE +#define BRIDGE_RTHASH_SIZE 1024 +#endif +#else +#ifndef BRIDGE_RTHASH_SIZE +#define BRIDGE_RTHASH_SIZE 256 +#endif +#endif + +/* APPLE MODIFICATION - support for HW checksums */ +#if APPLE_BRIDGE_HWCKSUM_SUPPORT +#include <netinet/udp.h> +#include <netinet/tcp.h> +#endif + +#define BRIDGE_RTHASH_MASK (BRIDGE_RTHASH_SIZE - 1) + +/* + * Maximum number of addresses to cache. + */ +#ifndef BRIDGE_RTABLE_MAX +#define BRIDGE_RTABLE_MAX 100 +#endif + + +/* + * Timeout (in seconds) for entries learned dynamically. + */ +#ifndef BRIDGE_RTABLE_TIMEOUT +#define BRIDGE_RTABLE_TIMEOUT (20 * 60) /* same as ARP */ +#endif + +/* + * Number of seconds between walks of the route list. + */ +#ifndef BRIDGE_RTABLE_PRUNE_PERIOD +#define BRIDGE_RTABLE_PRUNE_PERIOD (5 * 60) +#endif + +/* + * List of capabilities to possibly mask on the member interface. + */ +#define BRIDGE_IFCAPS_MASK (IFCAP_TOE|IFCAP_TSO|IFCAP_TXCSUM) +/* + * List of capabilities to disable on the member interface. + */ +#define BRIDGE_IFCAPS_STRIP IFCAP_LRO + +/* + * Bridge interface list entry. + */ +struct bridge_iflist { + TAILQ_ENTRY(bridge_iflist) bif_next; + struct ifnet *bif_ifp; /* member if */ + struct bstp_port bif_stp; /* STP state */ + uint32_t bif_flags; /* member if flags */ + int bif_savedcaps; /* saved capabilities */ + uint32_t bif_addrmax; /* max # of addresses */ + uint32_t bif_addrcnt; /* cur. # of addresses */ + uint32_t bif_addrexceeded;/* # of address violations */ + + interface_filter_t bif_iff_ref; + struct bridge_softc *bif_sc; + char bif_promisc; /* promiscuous mode set */ + char bif_proto_attached; /* protocol attached */ + char bif_filter_attached; /* interface filter attached */ +}; + +/* + * Bridge route node. 
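+ *
+ *	One node per learned MAC address.  Each node is linked into both
+ *	the hash table (brt_hash, used for lookups) and the flat list
+ *	(brt_list, walked when aging out or flushing entries), so the two
+ *	views must be kept in sync by bridge_rtnode_insert() and
+ *	bridge_rtnode_destroy().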
+ */ +struct bridge_rtnode { + LIST_ENTRY(bridge_rtnode) brt_hash; /* hash table linkage */ + LIST_ENTRY(bridge_rtnode) brt_list; /* list linkage */ + struct bridge_iflist *brt_dst; /* destination if */ + unsigned long brt_expire; /* expiration time */ + uint8_t brt_flags; /* address flags */ + uint8_t brt_addr[ETHER_ADDR_LEN]; + uint16_t brt_vlan; /* vlan id */ + +}; +#define brt_ifp brt_dst->bif_ifp + +/* + * Software state for each bridge. + */ +struct bridge_softc { + struct ifnet *sc_ifp; /* make this an interface */ + LIST_ENTRY(bridge_softc) sc_list; + lck_mtx_t *sc_mtx; + void *sc_cv; + uint32_t sc_brtmax; /* max # of addresses */ + uint32_t sc_brtcnt; /* cur. # of addresses */ + uint32_t sc_brttimeout; /* rt timeout in seconds */ + uint32_t sc_iflist_ref; /* refcount for sc_iflist */ + uint32_t sc_iflist_xcnt; /* refcount for sc_iflist */ + TAILQ_HEAD(, bridge_iflist) sc_iflist; /* member interface list */ + LIST_HEAD(, bridge_rtnode) *sc_rthash; /* our forwarding table */ + LIST_HEAD(, bridge_rtnode) sc_rtlist; /* list version of above */ + uint32_t sc_rthash_key; /* key for hash */ + TAILQ_HEAD(, bridge_iflist) sc_spanlist; /* span ports list */ + struct bstp_state sc_stp; /* STP state */ + uint32_t sc_brtexceeded; /* # of cache drops */ + uint32_t sc_filter_flags; /* ipf and flags */ + + char sc_if_xname[IFNAMSIZ]; + bpf_packet_func sc_bpf_input; + bpf_packet_func sc_bpf_output; + u_int32_t sc_flags; + +#if BRIDGE_DEBUG + void *lock_lr[BR_LCKDBG_MAX]; /* locking calling history */ + int next_lock_lr; + void *unlock_lr[BR_LCKDBG_MAX]; /* unlocking caller history */ + int next_unlock_lr; +#endif /* BRIDGE_DEBUG */ +}; + +#define SCF_DETACHING 0x1 + +static lck_mtx_t *bridge_list_mtx; +//eventhandler_tag bridge_detach_cookie = NULL; + +int bridge_rtable_prune_period = BRIDGE_RTABLE_PRUNE_PERIOD; + +static zone_t bridge_rtnode_pool = NULL; + +static int bridge_clone_create(struct if_clone *, uint32_t, void *); +static int bridge_clone_destroy(struct ifnet *); + +static errno_t bridge_ioctl(struct ifnet *, u_long, void *); +#if HAS_IF_CAP +static void bridge_mutecaps(struct bridge_softc *); +static void bridge_set_ifcap(struct bridge_softc *, struct bridge_iflist *, + int); +#endif +__private_extern__ void bridge_ifdetach(struct bridge_iflist *, struct ifnet *); +static int bridge_init(struct ifnet *); +#if HAS_BRIDGE_DUMMYNET +static void bridge_dummynet(struct mbuf *, struct ifnet *); +#endif +static void bridge_stop(struct ifnet *, int); +static errno_t bridge_start(struct ifnet *, struct mbuf *); +__private_extern__ errno_t bridge_input(struct ifnet *, struct mbuf *, void *); +#if BRIDGE_MEMBER_OUT_FILTER +static errno_t bridge_iff_output(void *, ifnet_t , protocol_family_t , mbuf_t *); +static int bridge_output(struct ifnet *, struct mbuf *, struct sockaddr *, + struct rtentry *); +#endif +static void bridge_enqueue(struct bridge_softc *, struct ifnet *, + struct mbuf *); +static void bridge_rtdelete(struct bridge_softc *, struct ifnet *ifp, int); + +static void bridge_forward(struct bridge_softc *, struct bridge_iflist *, + struct mbuf *m); + +static void bridge_timer(void *); + +static void bridge_broadcast(struct bridge_softc *, struct ifnet *, + struct mbuf *, int); +static void bridge_span(struct bridge_softc *, struct mbuf *); + +static int bridge_rtupdate(struct bridge_softc *, const uint8_t *, + uint16_t, struct bridge_iflist *, int, uint8_t); +static struct ifnet *bridge_rtlookup(struct bridge_softc *, const uint8_t *, + uint16_t); +static void bridge_rttrim(struct 
bridge_softc *); +static void bridge_rtage(struct bridge_softc *); +static void bridge_rtflush(struct bridge_softc *, int); +static int bridge_rtdaddr(struct bridge_softc *, const uint8_t *, + uint16_t); + +static int bridge_rtable_init(struct bridge_softc *); +static void bridge_rtable_fini(struct bridge_softc *); + +static int bridge_rtnode_addr_cmp(const uint8_t *, const uint8_t *); +static struct bridge_rtnode *bridge_rtnode_lookup(struct bridge_softc *, + const uint8_t *, uint16_t); +static int bridge_rtnode_insert(struct bridge_softc *, + struct bridge_rtnode *); +static void bridge_rtnode_destroy(struct bridge_softc *, + struct bridge_rtnode *); +static void bridge_rtable_expire(struct ifnet *, int); +static void bridge_state_change(struct ifnet *, int); + +static struct bridge_iflist *bridge_lookup_member(struct bridge_softc *, + const char *name); +static struct bridge_iflist *bridge_lookup_member_if(struct bridge_softc *, + struct ifnet *ifp); +static void bridge_delete_member(struct bridge_softc *, + struct bridge_iflist *, int); +static void bridge_delete_span(struct bridge_softc *, + struct bridge_iflist *); + +static int bridge_ioctl_add(struct bridge_softc *, void *); +static int bridge_ioctl_del(struct bridge_softc *, void *); +static int bridge_ioctl_gifflags(struct bridge_softc *, void *); +static int bridge_ioctl_sifflags(struct bridge_softc *, void *); +static int bridge_ioctl_scache(struct bridge_softc *, void *); +static int bridge_ioctl_gcache(struct bridge_softc *, void *); +static int bridge_ioctl_gifs32(struct bridge_softc *, void *); +static int bridge_ioctl_gifs64(struct bridge_softc *, void *); +static int bridge_ioctl_rts32(struct bridge_softc *, void *); +static int bridge_ioctl_rts64(struct bridge_softc *, void *); +static int bridge_ioctl_saddr32(struct bridge_softc *, void *); +static int bridge_ioctl_saddr64(struct bridge_softc *, void *); +static int bridge_ioctl_sto(struct bridge_softc *, void *); +static int bridge_ioctl_gto(struct bridge_softc *, void *); +static int bridge_ioctl_daddr32(struct bridge_softc *, void *); +static int bridge_ioctl_daddr64(struct bridge_softc *, void *); +static int bridge_ioctl_flush(struct bridge_softc *, void *); +static int bridge_ioctl_gpri(struct bridge_softc *, void *); +static int bridge_ioctl_spri(struct bridge_softc *, void *); +static int bridge_ioctl_ght(struct bridge_softc *, void *); +static int bridge_ioctl_sht(struct bridge_softc *, void *); +static int bridge_ioctl_gfd(struct bridge_softc *, void *); +static int bridge_ioctl_sfd(struct bridge_softc *, void *); +static int bridge_ioctl_gma(struct bridge_softc *, void *); +static int bridge_ioctl_sma(struct bridge_softc *, void *); +static int bridge_ioctl_sifprio(struct bridge_softc *, void *); +static int bridge_ioctl_sifcost(struct bridge_softc *, void *); +static int bridge_ioctl_sifmaxaddr(struct bridge_softc *, void *); +static int bridge_ioctl_addspan(struct bridge_softc *, void *); +static int bridge_ioctl_delspan(struct bridge_softc *, void *); +static int bridge_ioctl_gbparam32(struct bridge_softc *, void *); +static int bridge_ioctl_gbparam64(struct bridge_softc *, void *); +static int bridge_ioctl_grte(struct bridge_softc *, void *); +static int bridge_ioctl_gifsstp32(struct bridge_softc *, void *); +static int bridge_ioctl_gifsstp64(struct bridge_softc *, void *); +static int bridge_ioctl_sproto(struct bridge_softc *, void *); +static int bridge_ioctl_stxhc(struct bridge_softc *, void *); +static int bridge_ioctl_purge(struct bridge_softc *sc, 
void *arg); +static int bridge_ioctl_gfilt(struct bridge_softc *, void *); +static int bridge_ioctl_sfilt(struct bridge_softc *, void *); +#ifdef PFIL_HOOKS +static int bridge_pfil(struct mbuf **, struct ifnet *, struct ifnet *, + int); +static int bridge_ip_checkbasic(struct mbuf **mp); +#ifdef INET6 +static int bridge_ip6_checkbasic(struct mbuf **mp); +#endif /* INET6 */ +static int bridge_fragment(struct ifnet *, struct mbuf *, + struct ether_header *, int, struct llc *); +#endif /* PFIL_HOOKS */ + +static errno_t bridge_set_bpf_tap(ifnet_t ifn, bpf_tap_mode mode, bpf_packet_func bpf_callback); +__private_extern__ errno_t bridge_bpf_input(ifnet_t ifp, struct mbuf *m); +__private_extern__ errno_t bridge_bpf_output(ifnet_t ifp, struct mbuf *m); + +static void bridge_detach(ifnet_t ifp); + +#define m_copypacket(m, how) m_copym(m, 0, M_COPYALL, how) + +/* The default bridge vlan is 1 (IEEE 802.1Q-2003 Table 9-2) */ +#define VLANTAGOF(_m) 0 + +static struct bstp_cb_ops bridge_ops = { + .bcb_state = bridge_state_change, + .bcb_rtage = bridge_rtable_expire +}; + +SYSCTL_DECL(_net_link); +SYSCTL_NODE(_net_link, IFT_BRIDGE, bridge, CTLFLAG_RW, 0, "Bridge"); + +#if defined(PFIL_HOOKS) +static int pfil_onlyip = 1; /* only pass IP[46] packets when pfil is enabled */ +static int pfil_bridge = 1; /* run pfil hooks on the bridge interface */ +static int pfil_member = 1; /* run pfil hooks on the member interface */ +static int pfil_ipfw = 0; /* layer2 filter with ipfw */ +static int pfil_ipfw_arp = 0; /* layer2 filter with ipfw */ +static int pfil_local_phys = 0; /* run pfil hooks on the physical interface for + locally destined packets */ +SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_onlyip, CTLFLAG_RW, + &pfil_onlyip, 0, "Only pass IP packets when pfil is enabled"); +SYSCTL_INT(_net_link_bridge, OID_AUTO, ipfw_arp, CTLFLAG_RW, + &pfil_ipfw_arp, 0, "Filter ARP packets through IPFW layer2"); +SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_bridge, CTLFLAG_RW, + &pfil_bridge, 0, "Packet filter on the bridge interface"); +SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_member, CTLFLAG_RW, + &pfil_member, 0, "Packet filter on the member interface"); +SYSCTL_INT(_net_link_bridge, OID_AUTO, pfil_local_phys, CTLFLAG_RW, + &pfil_local_phys, 0, + "Packet filter on the physical interface for locally destined packets"); +#endif /* PFIL_HOOKS */ + +static int log_stp = 0; /* log STP state changes */ +SYSCTL_INT(_net_link_bridge, OID_AUTO, log_stp, CTLFLAG_RW, + &log_stp, 0, "Log STP state changes"); + +struct bridge_control { + int (*bc_func)(struct bridge_softc *, void *); + unsigned int bc_argsize; + unsigned int bc_flags; +}; + +#define BC_F_COPYIN 0x01 /* copy arguments in */ +#define BC_F_COPYOUT 0x02 /* copy arguments out */ +#define BC_F_SUSER 0x04 /* do super-user check */ + +static const struct bridge_control bridge_control_table32[] = { + { bridge_ioctl_add, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_del, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gifflags, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_COPYOUT }, + { bridge_ioctl_sifflags, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_scache, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_gcache, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + + { bridge_ioctl_gifs32, sizeof(struct ifbifconf32), + BC_F_COPYIN|BC_F_COPYOUT }, + { bridge_ioctl_rts32, sizeof(struct ifbaconf32), + BC_F_COPYIN|BC_F_COPYOUT }, + + { bridge_ioctl_saddr32, sizeof(struct ifbareq32), + 
BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sto, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_gto, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + + { bridge_ioctl_daddr32, sizeof(struct ifbareq32), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_flush, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gpri, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_spri, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_ght, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sht, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gfd, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sfd, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gma, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sma, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sifprio, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sifcost, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gfilt, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sfilt, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_purge, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_addspan, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_delspan, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gbparam32, sizeof(struct ifbropreq32), + BC_F_COPYOUT }, + + { bridge_ioctl_grte, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + + { bridge_ioctl_gifsstp32, sizeof(struct ifbpstpconf32), + BC_F_COPYIN|BC_F_COPYOUT }, + + { bridge_ioctl_sproto, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_stxhc, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sifmaxaddr, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, +}; + +static const struct bridge_control bridge_control_table64[] = { + { bridge_ioctl_add, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_del, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gifflags, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_COPYOUT }, + { bridge_ioctl_sifflags, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_scache, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_gcache, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + + { bridge_ioctl_gifs64, sizeof(struct ifbifconf64), + BC_F_COPYIN|BC_F_COPYOUT }, + { bridge_ioctl_rts64, sizeof(struct ifbaconf64), + BC_F_COPYIN|BC_F_COPYOUT }, + + { bridge_ioctl_saddr64, sizeof(struct ifbareq64), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sto, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_gto, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + + { bridge_ioctl_daddr64, sizeof(struct ifbareq64), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_flush, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gpri, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_spri, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_ght, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sht, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gfd, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sfd, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gma, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sma, sizeof(struct ifbrparam), + 
BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sifprio, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sifcost, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gfilt, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + { bridge_ioctl_sfilt, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_purge, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_addspan, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + { bridge_ioctl_delspan, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_gbparam64, sizeof(struct ifbropreq64), + BC_F_COPYOUT }, + + { bridge_ioctl_grte, sizeof(struct ifbrparam), + BC_F_COPYOUT }, + + { bridge_ioctl_gifsstp64, sizeof(struct ifbpstpconf64), + BC_F_COPYIN|BC_F_COPYOUT }, + + { bridge_ioctl_sproto, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_stxhc, sizeof(struct ifbrparam), + BC_F_COPYIN|BC_F_SUSER }, + + { bridge_ioctl_sifmaxaddr, sizeof(struct ifbreq), + BC_F_COPYIN|BC_F_SUSER }, +}; + +static const unsigned int bridge_control_table_size = +sizeof(bridge_control_table32) / sizeof(bridge_control_table32[0]); + +static LIST_HEAD(, bridge_softc) bridge_list = LIST_HEAD_INITIALIZER(bridge_list); + +static lck_grp_t *bridge_lock_grp = NULL; +static lck_attr_t *bridge_lock_attr = NULL; + +static if_clone_t bridge_cloner = NULL; + +__private_extern__ int _if_brige_debug = 0; + +SYSCTL_INT(_net_link_bridge, OID_AUTO, debug, CTLFLAG_RW, + &_if_brige_debug, 0, "Bridge debug"); + +#if BRIDGE_DEBUG + +static void printf_ether_header(struct ether_header *eh); +static void printf_mbuf_data(mbuf_t m, size_t offset, size_t len); +static void printf_mbuf_pkthdr(mbuf_t m, const char *prefix, const char *suffix); +static void printf_mbuf(mbuf_t m, const char *prefix, const char *suffix); +static void link_print(struct sockaddr_dl * dl_p); + +static void bridge_lock(struct bridge_softc *); +static void bridge_unlock(struct bridge_softc *); +static int bridge_lock2ref(struct bridge_softc *); +static void bridge_unref(struct bridge_softc *); +static void bridge_xlock(struct bridge_softc *); +static void bridge_xdrop(struct bridge_softc *); + +static void bridge_lock(struct bridge_softc *sc) +{ + void *lr_saved = __builtin_return_address(0); + + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED); + + lck_mtx_lock(sc->sc_mtx); + + sc->lock_lr[sc->next_lock_lr] = lr_saved; + sc->next_lock_lr = (sc->next_lock_lr+1) % SO_LCKDBG_MAX; +} + +static void bridge_unlock(struct bridge_softc *sc) +{ + void *lr_saved = __builtin_return_address(0); + + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED); + + sc->unlock_lr[sc->next_unlock_lr] = lr_saved; + sc->next_unlock_lr = (sc->next_unlock_lr+1) % SO_LCKDBG_MAX; + + lck_mtx_unlock(sc->sc_mtx); +} + +static int bridge_lock2ref(struct bridge_softc *sc) +{ + int error = 0; + void *lr_saved = __builtin_return_address(0); + + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED); + + if (sc->sc_iflist_xcnt > 0) + error = EBUSY; + else + sc->sc_iflist_ref++; + + sc->unlock_lr[sc->next_unlock_lr] = lr_saved; + sc->next_unlock_lr = (sc->next_unlock_lr+1) % SO_LCKDBG_MAX; + lck_mtx_unlock(sc->sc_mtx); + + return error; +} + +static void bridge_unref(struct bridge_softc *sc) +{ + void *lr_saved = __builtin_return_address(0); + + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED); + + lck_mtx_lock(sc->sc_mtx); + sc->lock_lr[sc->next_lock_lr] = lr_saved; + sc->next_lock_lr = (sc->next_lock_lr+1) % SO_LCKDBG_MAX; + + sc->sc_iflist_ref--; + + 
sc->unlock_lr[sc->next_unlock_lr] = lr_saved; + sc->next_unlock_lr = (sc->next_unlock_lr+1) % SO_LCKDBG_MAX; + if ((sc->sc_iflist_xcnt > 0) && (sc->sc_iflist_ref == 0)) { + lck_mtx_unlock(sc->sc_mtx); + wakeup(&sc->sc_cv); + } else + lck_mtx_unlock(sc->sc_mtx); +} + +static void bridge_xlock(struct bridge_softc *sc) +{ + void *lr_saved = __builtin_return_address(0); + + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED); + + sc->sc_iflist_xcnt++; + while (sc->sc_iflist_ref > 0) { + sc->unlock_lr[sc->next_unlock_lr] = lr_saved; + sc->next_unlock_lr = (sc->next_unlock_lr+1) % SO_LCKDBG_MAX; + + msleep(&sc->sc_cv, sc->sc_mtx, PZERO, "BRIDGE_XLOCK", NULL); + + sc->lock_lr[sc->next_lock_lr] = lr_saved; + sc->next_lock_lr = (sc->next_lock_lr+1) % SO_LCKDBG_MAX; + } +} + +static void bridge_xdrop(struct bridge_softc *sc) +{ + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED); + + sc->sc_iflist_xcnt--; +} + +void +printf_mbuf_pkthdr(mbuf_t m, const char *prefix, const char *suffix) +{ + if (m) + printf("%spktlen: %u rcvif: %p header: %p nextpkt: %p%s", + prefix ? prefix : "", + (unsigned int)mbuf_pkthdr_len(m), mbuf_pkthdr_rcvif(m), mbuf_pkthdr_header(m), mbuf_nextpkt(m), + suffix ? suffix : ""); + else + printf("%s<NULL>%s\n", prefix, suffix); +} + +void +printf_mbuf(mbuf_t m, const char *prefix, const char *suffix) +{ + if (m) { + printf("%s%p type: %u flags: 0x%x len: %u data: %p maxlen: %u datastart: %p next: %p%s", + prefix ? prefix : "", + m, mbuf_type(m), mbuf_flags(m), (unsigned int)mbuf_len(m), mbuf_data(m), + (unsigned int)mbuf_maxlen(m), mbuf_datastart(m), mbuf_next(m), + !suffix || (mbuf_flags(m) & MBUF_PKTHDR) ? "" : suffix); + if ((mbuf_flags(m) & MBUF_PKTHDR)) + printf_mbuf_pkthdr(m, " ", suffix); + } else + printf("%s<NULL>%s\n", prefix, suffix); +} + +void +printf_mbuf_data(mbuf_t m, size_t offset, size_t len) +{ + mbuf_t n; + size_t i, j; + size_t pktlen, mlen, maxlen; + unsigned char *ptr; + + pktlen = mbuf_pkthdr_len(m); + + if (offset > pktlen) + return; + + maxlen = (pktlen - offset > len) ? len : pktlen; + n = m; + mlen = mbuf_len(n); + ptr = mbuf_data(n); + for (i = 0, j = 0; i < maxlen; i++, j++) { + if (j >= mlen) { + n = mbuf_next(n); + if (n == 0) + break; + ptr = mbuf_data(n); + mlen = mbuf_len(n); + j = 0; + } + if (i >= offset) { + printf("%02x%s", ptr[j], i % 2 ? " " : ""); + } + } + return; +} + +static void +printf_ether_header(struct ether_header *eh) +{ + printf("%02x:%02x:%02x:%02x:%02x:%02x > %02x:%02x:%02x:%02x:%02x:%02x 0x%04x ", + eh->ether_shost[0], eh->ether_shost[1], eh->ether_shost[2], + eh->ether_shost[3], eh->ether_shost[4], eh->ether_shost[5], + eh->ether_dhost[0], eh->ether_dhost[1], eh->ether_dhost[2], + eh->ether_dhost[3], eh->ether_dhost[4], eh->ether_dhost[5], + eh->ether_type); +} + +static void +link_print(struct sockaddr_dl * dl_p) +{ + int i; + +#if 1 + printf("sdl len %d index %d family %d type 0x%x nlen %d alen %d" + " slen %d addr ", dl_p->sdl_len, + dl_p->sdl_index, dl_p->sdl_family, dl_p->sdl_type, + dl_p->sdl_nlen, dl_p->sdl_alen, dl_p->sdl_slen); +#endif + for (i = 0; i < dl_p->sdl_alen; i++) + printf("%s%x", i ? ":" : "", + (CONST_LLADDR(dl_p))[i]); + printf("\n"); + return; +} + +#endif /* BRIDGE_DEBUG */ + +/* + * bridgeattach: + * + * Pseudo-device attach routine. 
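+ *
+ *	Called once at startup: carves out the zone used for route nodes,
+ *	sets up the lock group and the bridge list mutex, initializes the
+ *	STP subsystem (bstp_sys_init) and registers the "bridge"
+ *	interface cloner.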
+ */ +__private_extern__ int +bridgeattach(__unused int n) +{ + int error; + lck_grp_attr_t *lck_grp_attr = NULL; + struct ifnet_clone_params ifnet_clone_params; + + bridge_rtnode_pool = zinit(sizeof(struct bridge_rtnode), 1024 * sizeof(struct bridge_rtnode), + 0, "bridge_rtnode"); + zone_change(bridge_rtnode_pool, Z_CALLERACCT, FALSE); + + lck_grp_attr = lck_grp_attr_alloc_init(); + + bridge_lock_grp = lck_grp_alloc_init("if_bridge", lck_grp_attr); + + bridge_lock_attr = lck_attr_alloc_init(); + +#if BRIDGE_DEBUG + lck_attr_setdebug(bridge_lock_attr); +#endif + + bridge_list_mtx = lck_mtx_alloc_init(bridge_lock_grp, bridge_lock_attr); + + // can free the attributes once we've allocated the group lock + lck_grp_attr_free(lck_grp_attr); + + LIST_INIT(&bridge_list); + + bstp_sys_init(); + + ifnet_clone_params.ifc_name = "bridge"; + ifnet_clone_params.ifc_create = bridge_clone_create; + ifnet_clone_params.ifc_destroy = bridge_clone_destroy; + + error = ifnet_clone_attach(&ifnet_clone_params, &bridge_cloner); + if (error != 0) + printf("bridgeattach: ifnet_clone_attach failed %d\n", error); + + return error; +} + +#if defined(PFIL_HOOKS) +/* + * handler for net.link.bridge.pfil_ipfw + */ +static int +sysctl_pfil_ipfw SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1,arg2) + int enable = pfil_ipfw; + int error; + + error = sysctl_handle_int(oidp, &enable, 0, req); + enable = (enable) ? 1 : 0; + + if (enable != pfil_ipfw) { + pfil_ipfw = enable; + + /* + * Disable pfil so that ipfw doesnt run twice, if the user + * really wants both then they can re-enable pfil_bridge and/or + * pfil_member. Also allow non-ip packets as ipfw can filter by + * layer2 type. + */ + if (pfil_ipfw) { + pfil_onlyip = 0; + pfil_bridge = 0; + pfil_member = 0; + } + } + + return (error); +} +SYSCTL_PROC(_net_link_bridge, OID_AUTO, ipfw, CTLTYPE_INT|CTLFLAG_RW, + &pfil_ipfw, 0, &sysctl_pfil_ipfw, "I", "Layer2 filter with IPFW"); +#endif /* PFIL_HOOKS */ + +/* + * bridge_clone_create: + * + * Create a new bridge instance. + */ +static int +bridge_clone_create(struct if_clone *ifc, uint32_t unit, __unused void *params) +{ + struct ifnet *ifp = NULL; + struct bridge_softc *sc; + u_char eaddr[6]; + struct ifnet_init_params init_params; + errno_t error = 0; + uint32_t sdl_buffer[offsetof(struct sockaddr_dl, sdl_data) + IFNAMSIZ + ETHER_ADDR_LEN]; + struct sockaddr_dl *sdl = (struct sockaddr_dl *)sdl_buffer; + + sc = _MALLOC(sizeof(*sc), M_DEVBUF, M_WAITOK); + memset(sc, 0, sizeof(*sc)); + + sc->sc_mtx = lck_mtx_alloc_init(bridge_lock_grp, bridge_lock_attr); + sc->sc_brtmax = BRIDGE_RTABLE_MAX; + sc->sc_brttimeout = BRIDGE_RTABLE_TIMEOUT; + sc->sc_filter_flags = IFBF_FILT_DEFAULT; +#ifndef BRIDGE_IPF + /* + * For backwards compatibility with previous behaviour... + * Switch off filtering on the bridge itself if BRIDGE_IPF is + * not defined. + */ + sc->sc_filter_flags &= ~IFBF_FILT_USEIPF; +#endif + + /* Initialize our routing table. 
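+	 * A non-zero return means the hash table could not be set up,
+	 * in which case the create bails out to "done" below.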
*/ + error = bridge_rtable_init(sc); + if (error != 0) { + printf("bridge_clone_create: bridge_rtable_init failed %d\n", error); + goto done; + } + + TAILQ_INIT(&sc->sc_iflist); + TAILQ_INIT(&sc->sc_spanlist); + + /* use the interface name as the unique id for ifp recycle */ + snprintf(sc->sc_if_xname, sizeof(sc->sc_if_xname), "%s%d", + ifc->ifc_name, unit); + memset(&init_params, 0, sizeof(struct ifnet_init_params)); + init_params.uniqueid = sc->sc_if_xname; + init_params.uniqueid_len = strlen(sc->sc_if_xname); + init_params.name = ifc->ifc_name; + init_params.unit = unit; + init_params.family = IFNET_FAMILY_ETHERNET; + init_params.type = IFT_BRIDGE; + init_params.output = bridge_start; + init_params.demux = ether_demux; + init_params.add_proto = ether_add_proto; + init_params.del_proto = ether_del_proto; + init_params.check_multi = ether_check_multi; + init_params.framer = ether_frameout; + init_params.softc = sc; + init_params.ioctl = bridge_ioctl; + init_params.set_bpf_tap = bridge_set_bpf_tap; + init_params.detach = bridge_detach; + init_params.broadcast_addr = etherbroadcastaddr; + init_params.broadcast_len = ETHER_ADDR_LEN; + error = ifnet_allocate(&init_params, &ifp); + if (error != 0) { + printf("bridge_clone_create: ifnet_allocate failed %d\n", error); + goto done; + } + sc->sc_ifp = ifp; + + error = ifnet_set_mtu(ifp, ETHERMTU); + if (error != 0) { + printf("bridge_clone_create: ifnet_set_mtu failed %d\n", error); + goto done; + } + error = ifnet_set_addrlen(ifp, ETHER_ADDR_LEN); + if (error != 0) { + printf("bridge_clone_create: ifnet_set_addrlen failed %d\n", error); + goto done; + } + error = ifnet_set_baudrate(ifp, 10000000) ; // XXX: this is what IONetworking does + if (error != 0) { + printf("bridge_clone_create: ifnet_set_baudrate failed %d\n", error); + goto done; + } + error = ifnet_set_hdrlen(ifp, ETHER_HDR_LEN); + if (error != 0) { + printf("bridge_clone_create: ifnet_set_hdrlen failed %d\n", error); + goto done; + } + error = ifnet_set_flags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_NOTRAILERS | IFF_MULTICAST, + 0xffff); + if (error != 0) { + printf("bridge_clone_create: ifnet_set_flags failed %d\n", error); + goto done; + } + +#if 0 + /* + * Generate a random ethernet address with a locally administered + * address. + * + * Since we are using random ethernet addresses for the bridge, it is + * possible that we might have address collisions, so make sure that + * this hardware address isn't already in use on another bridge. + */ + { + int retry; + + for (retry = 1; retry != 0;) { + struct ifnet *bifp; + struct bridge_softc *sc2; + + read_random(eaddr, ETHER_ADDR_LEN); + eaddr[0] &= ~1; /* clear multicast bit */ + eaddr[0] |= 2; /* set the LAA bit */ + retry = 0; + lck_mtx_lock(bridge_list_mtx); + LIST_FOREACH(sc2, &bridge_list, sc_list) { + bifp = sc2->sc_ifp; + if (memcmp(eaddr, ifnet_lladdr(bifp), ETHER_ADDR_LEN) == 0) + retry = 1; + } + lck_mtx_unlock(bridge_list_mtx); + } + } +#else + /* + * Generate a random ethernet address and use the private AC:DE:48 + * OUI code. 
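+	 *
+	 * Only the low three octets are randomized; AC:DE:48 is a private
+	 * OUI, so the address is recognizably synthetic but, unlike the
+	 * disabled branch above, no collision check against other bridges
+	 * is performed.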
+ */ + { + uint32_t r; + + read_random(&r, sizeof(r)); + eaddr[0] = 0xAC; + eaddr[1] = 0xDE; + eaddr[2] = 0x48; + eaddr[3] = (r >> 0) & 0xffu; + eaddr[4] = (r >> 8) & 0xffu; + eaddr[5] = (r >> 16) & 0xffu; + } +#endif + + memset(sdl, 0, sizeof(sdl_buffer)); + sdl->sdl_family = AF_LINK; + sdl->sdl_nlen = strlen(sc->sc_if_xname); + sdl->sdl_alen = ETHER_ADDR_LEN; + sdl->sdl_len = offsetof(struct sockaddr_dl, sdl_data); + memcpy(sdl->sdl_data, sc->sc_if_xname, sdl->sdl_nlen); + memcpy(LLADDR(sdl), eaddr, ETHER_ADDR_LEN); + +#if BRIDGE_DEBUG + link_print(sdl); +#endif + + error = ifnet_attach(ifp, NULL); + if (error != 0) { + printf("bridge_clone_create: ifnet_attach failed %d\n", error); + goto done; + } + + error = ifnet_set_lladdr_and_type(ifp, eaddr, ETHER_ADDR_LEN, IFT_ETHER); + if (error != 0) { + printf("bridge_clone_create: ifnet_set_lladdr_and_type failed %d\n", error); + goto done; + } + +#if APPLE_BRIDGE_HWCKSUM_SUPPORT + /* + * APPLE MODIFICATION - our bridge can support HW checksums + * (useful if underlying interfaces support them) on TX, + * RX is not that interesting, since the stack just looks to + * see if the packet has been checksummed already (I think) + * but we might as well indicate we support it + */ + ifp->if_capabilities = + IFCAP_CSUM_IPv4_Tx | IFCAP_CSUM_TCPv4_Tx | IFCAP_CSUM_UDPv4_Tx | + IFCAP_CSUM_IPv4_Rx | IFCAP_CSUM_TCPv4_Rx | IFCAP_CSUM_UDPv4_Rx ; +#endif + + bstp_attach(&sc->sc_stp, &bridge_ops); + + lck_mtx_lock(bridge_list_mtx); + LIST_INSERT_HEAD(&bridge_list, sc, sc_list); + lck_mtx_unlock(bridge_list_mtx); + + /* attach as ethernet */ + error = bpf_attach(ifp, DLT_EN10MB, sizeof(struct ether_header), NULL, NULL); + +done: + if (error != 0) { + printf("bridge_clone_create failed error %d\n", error); + /* Cleanup TBD */ + } + + return error; +} + +/* + * bridge_clone_destroy: + * + * Destroy a bridge instance. 
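+ *
+ *	The SCF_DETACHING flag makes this idempotent: a second destroy
+ *	request while one is already in flight returns immediately.
+ *	Member and span interfaces are deleted before the ifnet itself
+ *	is detached.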
+ */ +static int +bridge_clone_destroy(struct ifnet *ifp) +{ + struct bridge_softc *sc = ifp->if_softc; + struct bridge_iflist *bif; + errno_t error; + + BRIDGE_LOCK(sc); + if ((sc->sc_flags & SCF_DETACHING)) { + BRIDGE_UNLOCK(sc); + return 0; + } + sc->sc_flags |= SCF_DETACHING; + + bridge_stop(ifp, 1); + + error = ifnet_set_flags(ifp, 0, IFF_UP); + if (error != 0) { + printf("bridge_clone_destroy: ifnet_set_flags failed %d\n", error); + } + + while ((bif = TAILQ_FIRST(&sc->sc_iflist)) != NULL) + bridge_delete_member(sc, bif, 0); + + while ((bif = TAILQ_FIRST(&sc->sc_spanlist)) != NULL) { + bridge_delete_span(sc, bif); + } + + BRIDGE_UNLOCK(sc); + + error = ifnet_detach(ifp); + if (error != 0) { + panic("bridge_clone_destroy: ifnet_detach(%p) failed %d\n", ifp, error); + if ((sc = (struct bridge_softc *)ifnet_softc(ifp)) != NULL) { + BRIDGE_LOCK(sc); + sc->sc_flags &= ~SCF_DETACHING; + BRIDGE_UNLOCK(sc); + } + return 0; + } + + return 0; +} + +#define DRVSPEC do { \ + if (ifd->ifd_cmd >= bridge_control_table_size) { \ + error = EINVAL; \ + break; \ + } \ + bc = &bridge_control_table[ifd->ifd_cmd]; \ + \ + if (cmd == SIOCGDRVSPEC && \ + (bc->bc_flags & BC_F_COPYOUT) == 0) { \ + error = EINVAL; \ + break; \ + } \ + else if (cmd == SIOCSDRVSPEC && \ + (bc->bc_flags & BC_F_COPYOUT) != 0) { \ + error = EINVAL; \ + break; \ + } \ + \ + if (bc->bc_flags & BC_F_SUSER) { \ + error = kauth_authorize_generic(kauth_cred_get(), KAUTH_GENERIC_ISSUSER); \ + if (error) \ + break; \ + } \ + \ + if (ifd->ifd_len != bc->bc_argsize || \ + ifd->ifd_len > sizeof(args)) { \ + error = EINVAL; \ + break; \ + } \ + \ + bzero(&args, sizeof(args)); \ + if (bc->bc_flags & BC_F_COPYIN) { \ + error = copyin(ifd->ifd_data, &args, ifd->ifd_len); \ + if (error) \ + break; \ + } \ + \ + BRIDGE_LOCK(sc); \ + error = (*bc->bc_func)(sc, &args); \ + BRIDGE_UNLOCK(sc); \ + if (error) \ + break; \ + \ + if (bc->bc_flags & BC_F_COPYOUT) \ + error = copyout(&args, ifd->ifd_data, ifd->ifd_len); \ +} while (0) + + +/* + * bridge_ioctl: + * + * Handle a control request from the operator. + */ +static errno_t +bridge_ioctl(struct ifnet *ifp, u_long cmd, void *data) +{ + struct bridge_softc *sc = ifp->if_softc; + struct ifreq *ifr = (struct ifreq *) data; + int error = 0; + + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED); + +#if BRIDGE_DEBUG + if (_if_brige_debug) + printf("bridge_ioctl: ifp %p cmd 0x%08lx (%c%c [%lu] %c %lu)\n", + ifp, + cmd, + (cmd & IOC_IN) ? 'I' : ' ', + (cmd & IOC_OUT) ? 
'O' : ' ', + IOCPARM_LEN(cmd), + (char)IOCGROUP(cmd), + cmd & 0xff); +#endif + + switch (cmd) { + + case SIOCSIFADDR: + case SIOCAIFADDR: + ifnet_set_flags(ifp, IFF_UP, IFF_UP); + break; + + case SIOCGIFMEDIA32: + case SIOCGIFMEDIA64: + error = EINVAL; + break; + + case SIOCADDMULTI: + case SIOCDELMULTI: + break; + + case SIOCSDRVSPEC32: + case SIOCGDRVSPEC32: { + union { + struct ifbreq ifbreq; + struct ifbifconf32 ifbifconf; + struct ifbareq32 ifbareq; + struct ifbaconf32 ifbaconf; + struct ifbrparam ifbrparam; + struct ifbropreq32 ifbropreq; + } args; + struct ifdrv32 *ifd = (struct ifdrv32 *) data; + const struct bridge_control *bridge_control_table = bridge_control_table32, *bc; + + DRVSPEC; + + break; + } + case SIOCSDRVSPEC64: + case SIOCGDRVSPEC64: { + union { + struct ifbreq ifbreq; + struct ifbifconf64 ifbifconf; + struct ifbareq64 ifbareq; + struct ifbaconf64 ifbaconf; + struct ifbrparam ifbrparam; + struct ifbropreq64 ifbropreq; + } args; + struct ifdrv64 *ifd = (struct ifdrv64 *) data; + const struct bridge_control *bridge_control_table = bridge_control_table64, *bc; + + DRVSPEC; + + break; + } + + case SIOCSIFFLAGS: + if (!(ifp->if_flags & IFF_UP) && + (ifp->if_flags & IFF_RUNNING)) { + /* + * If interface is marked down and it is running, + * then stop and disable it. + */ + BRIDGE_LOCK(sc); + bridge_stop(ifp, 1); + BRIDGE_UNLOCK(sc); + } else if ((ifp->if_flags & IFF_UP) && + !(ifp->if_flags & IFF_RUNNING)) { + /* + * If interface is marked up and it is stopped, then + * start it. + */ + BRIDGE_LOCK(sc); + error = bridge_init(ifp); + BRIDGE_UNLOCK(sc); + } + break; + + case SIOCSIFLLADDR: + error = ifnet_set_lladdr(ifp, ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len); + if (error != 0) + printf("bridge_ioctl: ifnet_set_lladdr failed %d\n", error); + break; + + case SIOCSIFMTU: + /* Do not allow the MTU to be changed on the bridge */ + error = EINVAL; + break; + + default: + /* + * drop the lock as ether_ioctl() will call bridge_start() and + * cause the lock to be recursed. + */ + error = ether_ioctl(ifp, cmd, data); +#if BRIDGE_DEBUG + if (error != 0) + printf("bridge_ioctl: ether_ioctl ifp %p cmd 0x%08lx (%c%c [%lu] %c %lu) failed error: %d\n", + ifp, + cmd, + (cmd & IOC_IN) ? 'I' : ' ', + (cmd & IOC_OUT) ? 
'O' : ' ',
+			IOCPARM_LEN(cmd),
+			(char) IOCGROUP(cmd),
+			cmd & 0xff,
+			error);
+#endif /* BRIDGE_DEBUG */
+		break;
+	}
+	lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED);
+
+	return (error);
+}
+
+#if HAS_IF_CAP
+/*
+ * bridge_mutecaps:
+ *
+ *	Clear or restore unwanted capabilities on the member interface
+ */
+static void
+bridge_mutecaps(struct bridge_softc *sc)
+{
+	struct bridge_iflist *bif;
+	int enabled, mask;
+
+	/* Initial bitmask of capabilities to test */
+	mask = BRIDGE_IFCAPS_MASK;
+
+	TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) {
+		/* Every member must support it or it's disabled */
+		mask &= bif->bif_savedcaps;
+	}
+
+	TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) {
+		enabled = bif->bif_ifp->if_capenable;
+		enabled &= ~BRIDGE_IFCAPS_STRIP;
+		/* strip off mask bits and enable them again if allowed */
+		enabled &= ~BRIDGE_IFCAPS_MASK;
+		enabled |= mask;
+
+		bridge_set_ifcap(sc, bif, enabled);
+	}
+}
+
+static void
+bridge_set_ifcap(struct bridge_softc *sc, struct bridge_iflist *bif, int set)
+{
+	struct ifnet *ifp = bif->bif_ifp;
+	struct ifreq ifr;
+	int error;
+
+	bzero(&ifr, sizeof(ifr));
+	ifr.ifr_reqcap = set;
+
+	if (ifp->if_capenable != set) {
+		IFF_LOCKGIANT(ifp);
+		error = (*ifp->if_ioctl)(ifp, SIOCSIFCAP, (caddr_t)&ifr);
+		IFF_UNLOCKGIANT(ifp);
+		if (error)
+			printf("%s%d: error setting interface capabilities on %s\n",
+				ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp),
+				ifp->if_xname);
+	}
+}
+#endif /* HAS_IF_CAP */
+
+/*
+ * bridge_lookup_member:
+ *
+ *	Lookup a bridge member interface.
+ */
+static struct bridge_iflist *
+bridge_lookup_member(struct bridge_softc *sc, const char *name)
+{
+	struct bridge_iflist *bif;
+	struct ifnet *ifp;
+	char if_xname[IFNAMSIZ];
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) {
+		ifp = bif->bif_ifp;
+		snprintf(if_xname, sizeof(if_xname), "%s%d",
+			ifnet_name(ifp), ifnet_unit(ifp));
+		if (strncmp(if_xname, name, sizeof(if_xname)) == 0)
+			return (bif);
+	}
+
+	return (NULL);
+}
+
+/*
+ * bridge_lookup_member_if:
+ *
+ *	Lookup a bridge member interface by ifnet*.
+ */
+static struct bridge_iflist *
+bridge_lookup_member_if(struct bridge_softc *sc, struct ifnet *member_ifp)
+{
+	struct bridge_iflist *bif;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) {
+		if (bif->bif_ifp == member_ifp)
+			return (bif);
+	}
+
+	return (NULL);
+}
+
+static errno_t
+bridge_iff_input(void* cookie, ifnet_t ifp, __unused protocol_family_t protocol,
+	mbuf_t *data, char **frame_ptr)
+{
+	errno_t error = 0;
+	struct bridge_iflist *bif = (struct bridge_iflist *)cookie;
+	struct bridge_softc *sc = bif->bif_sc;
+	int included = 0;
+	size_t frmlen = 0;
+	mbuf_t m = *data;
+
+	if ((m->m_flags & M_PROTO1))
+		goto out;
+
+	if (*frame_ptr >= (char *)mbuf_datastart(m) && *frame_ptr <= (char *)mbuf_data(m)) {
+		included = 1;
+		frmlen = (char *)mbuf_data(m) - *frame_ptr;
+	}
+#if BRIDGE_DEBUG
+	if (_if_brige_debug) {
+		printf("bridge_iff_input %s%d from %s%d m %p data %p frame %p %s frmlen %lu\n",
+			ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp),
+			ifnet_name(ifp), ifnet_unit(ifp),
+			m, mbuf_data(m), *frame_ptr, included ?
"inside" : "outside", frmlen); + + if (_if_brige_debug > 1) { + printf_mbuf(m, "bridge_iff_input[", "\n"); + printf_ether_header((struct ether_header *)*frame_ptr); + printf_mbuf_data(m, 0, 20); + printf("\n"); + } + } +#endif /* BRIDGE_DEBUG */ + + /* Move data pointer to start of frame to the link layer header */ + if (included) { + (void) mbuf_setdata(m, (char *)mbuf_data(m) - frmlen, mbuf_len(m) + frmlen); + (void) mbuf_pkthdr_adjustlen(m, frmlen); + } else { + printf("bridge_iff_input: frame_ptr outside mbuf\n"); + goto out; + } + + error = bridge_input(ifp, m, *frame_ptr); + + /* Adjust packet back to original */ + if (error == 0) { + (void) mbuf_setdata(m, (char *)mbuf_data(m) + frmlen, mbuf_len(m) - frmlen); + (void) mbuf_pkthdr_adjustlen(m, -frmlen); + } +#if BRIDGE_DEBUG + if (_if_brige_debug > 1) { + printf("\n"); + printf_mbuf(m, "bridge_iff_input]", "\n"); + } +#endif /* BRIDGE_DEBUG */ + +out: + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED); + + return error; +} + + +#if BRIDGE_MEMBER_OUT_FILTER +static errno_t +bridge_iff_output(void *cookie, ifnet_t ifp, __unused protocol_family_t protocol, mbuf_t *data) +{ + errno_t error = 0; + struct bridge_iflist *bif = (struct bridge_iflist *)cookie; + struct bridge_softc *sc = bif->bif_sc; + mbuf_t m = *data; + + if ((m->m_flags & M_PROTO1)) + goto out; + +#if BRIDGE_DEBUG + if (_if_brige_debug) { + printf("bridge_iff_output %s%d from %s%d m %p data %p\n", + ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp), + ifnet_name(ifp), ifnet_unit(ifp), + m, mbuf_data(m)); + } +#endif /* BRIDGE_DEBUG */ + + error = bridge_output(sc, ifp, m); + if (error != 0) { + printf("bridge_iff_output: bridge_output failed error %d\n", error); + } + +out: + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED); + + return error; +} +#endif /* BRIDGE_MEMBER_OUT_FILTER */ + + +static void +bridge_iff_event(void* cookie, ifnet_t ifp, __unused protocol_family_t protocol, + const struct kev_msg *event_msg) +{ + struct bridge_iflist *bif = (struct bridge_iflist *)cookie; + + if (event_msg->vendor_code == KEV_VENDOR_APPLE && + event_msg->kev_class == KEV_NETWORK_CLASS && + event_msg->kev_subclass == KEV_DL_SUBCLASS) { + switch (event_msg->event_code) { + case KEV_DL_IF_DETACHING: + case KEV_DL_IF_DETACHED: + bridge_ifdetach(bif, ifp); + break; + + case KEV_DL_LINK_OFF: + case KEV_DL_LINK_ON: { + bstp_linkstate(ifp, event_msg->event_code); + break; + } + + case KEV_DL_SIFFLAGS: { + if (bif->bif_promisc == 0 && (ifp->if_flags & IFF_UP)) { + errno_t error = ifnet_set_promiscuous(ifp, 1); + if (error != 0) { + printf("bridge_iff_event: ifnet_set_promiscuous(%s%d) failed %d\n", + ifnet_name(ifp), ifnet_unit(ifp), error); + } else { + bif->bif_promisc = 1; + } + } + break; + } + + default: + break; + } + } +} + +/* + * bridge_iff_detached: + * + * Detach an interface from a bridge. Called when a member + * interface is detaching. 
+ */
+static void
+bridge_iff_detached(void* cookie, __unused ifnet_t ifp)
+{
+	struct bridge_iflist *bif = (struct bridge_iflist *)cookie;
+
+#if BRIDGE_DEBUG
+	printf("bridge_iff_detached: %s%d\n",
+		ifnet_name(ifp), ifnet_unit(ifp));
+#endif
+
+	bridge_ifdetach(bif, ifp);
+
+	_FREE(bif, M_DEVBUF);
+
+	return;
+}
+
+static errno_t
+bridge_proto_input(ifnet_t ifp, __unused protocol_family_t protocol,
+	__unused mbuf_t packet, __unused char *header)
+{
+	printf("bridge_proto_input: unexpected packet from %s%d\n",
+		ifnet_name(ifp), ifnet_unit(ifp));
+	return 0;
+}
+
+static int
+bridge_attach_protocol(struct ifnet *ifp)
+{
+	int error;
+	struct ifnet_attach_proto_param reg;
+
+	printf("bridge_attach_protocol: %s%d\n",
+		ifnet_name(ifp), ifnet_unit(ifp));
+
+	bzero(&reg, sizeof(reg));
+	reg.input = bridge_proto_input;
+
+	error = ifnet_attach_protocol(ifp, PF_BRIDGE, &reg);
+	if (error)
+		printf("bridge_attach_protocol: ifnet_attach_protocol(%s%d) failed, %d\n",
+			ifnet_name(ifp), ifnet_unit(ifp), error);
+
+	return (error);
+}
+
+static int
+bridge_detach_protocol(struct ifnet *ifp)
+{
+	int error;
+
+	printf("bridge_detach_protocol: %s%d\n",
+		ifnet_name(ifp), ifnet_unit(ifp));
+
+	error = ifnet_detach_protocol(ifp, PF_BRIDGE);
+	if (error)
+		printf("bridge_detach_protocol: ifnet_detach_protocol(%s%d) failed, %d\n",
+			ifnet_name(ifp), ifnet_unit(ifp), error);
+
+	return (error);
+}
+
+/*
+ * bridge_delete_member:
+ *
+ *	Delete the specified member interface.
+ */
+static void
+bridge_delete_member(struct bridge_softc *sc, struct bridge_iflist *bif,
+    int gone)
+{
+	struct ifnet *ifs = bif->bif_ifp;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	if (!gone) {
+		switch (ifs->if_type) {
+		case IFT_ETHER:
+		case IFT_L2VLAN:
+			/*
+			 * Take the interface out of promiscuous mode.
+			 */
+			if (bif->bif_promisc)
+				(void) ifnet_set_promiscuous(ifs, 0);
+			break;
+
+		case IFT_GIF:
+			break;
+
+		default:
+#ifdef DIAGNOSTIC
+			panic("bridge_delete_member: impossible");
+#endif
+			break;
+		}
+
+#if HAS_IF_CAP
+		/* re-enable any interface capabilities */
+		bridge_set_ifcap(sc, bif, bif->bif_savedcaps);
+#endif
+	}
+
+	if (bif->bif_proto_attached) {
+		/* Respect lock ordering with DLIL lock */
+		BRIDGE_UNLOCK(sc);
+		(void) bridge_detach_protocol(ifs);
+		BRIDGE_LOCK(sc);
+	}
+	if (bif->bif_flags & IFBIF_STP)
+		bstp_disable(&bif->bif_stp);
+
+	ifs->if_bridge = NULL;
+	BRIDGE_XLOCK(sc);
+	TAILQ_REMOVE(&sc->sc_iflist, bif, bif_next);
+	BRIDGE_XDROP(sc);
+
+	ifnet_release(ifs);
+
+#if HAS_IF_CAP
+	bridge_mutecaps(sc);	/* recalculate now that this interface is removed */
+#endif /* HAS_IF_CAP */
+	bridge_rtdelete(sc, ifs, IFBF_FLUSHALL);
+	KASSERT(bif->bif_addrcnt == 0,
+	    ("%s: %d bridge routes referenced", __func__, bif->bif_addrcnt));
+
+	BRIDGE_UNLOCK(sc);
+	bstp_destroy(&bif->bif_stp);	/* prepare to free */
+	BRIDGE_LOCK(sc);
+
+	if (bif->bif_filter_attached) {
+		/* Respect lock ordering with DLIL lock */
+		BRIDGE_UNLOCK(sc);
+		iflt_detach(bif->bif_iff_ref);
+		BRIDGE_LOCK(sc);
+	} else {
+		_FREE(bif, M_DEVBUF);
+	}
+}
+
+/*
+ * bridge_delete_span:
+ *
+ *	Delete the specified span interface.
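+ *
+ *	A span interface never points back at the bridge (if_bridge
+ *	stays NULL, as the KASSERT below checks), so only the list entry
+ *	and the ifnet reference need to be released.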
+ */ +static void +bridge_delete_span(struct bridge_softc *sc, struct bridge_iflist *bif) +{ + BRIDGE_LOCK_ASSERT(sc); + + KASSERT(bif->bif_ifp->if_bridge == NULL, + ("%s: not a span interface", __func__)); + + ifnet_release(bif->bif_ifp); + + TAILQ_REMOVE(&sc->sc_spanlist, bif, bif_next); + _FREE(bif, M_DEVBUF); +} + +static int +bridge_ioctl_add(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif = NULL; + struct ifnet *ifs; + int error = 0; + struct iff_filter iff; + + ifs = ifunit(req->ifbr_ifsname); + if (ifs == NULL) + return (ENOENT); + if (ifs->if_ioctl == NULL) /* must be supported */ + return (EINVAL); + + /* If it's in the span list, it can't be a member. */ + TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) + if (ifs == bif->bif_ifp) + return (EBUSY); + + /* Allow the first Ethernet member to define the MTU */ + if (ifs->if_type != IFT_GIF) { + if (TAILQ_EMPTY(&sc->sc_iflist)) + sc->sc_ifp->if_mtu = ifs->if_mtu; + else if (sc->sc_ifp->if_mtu != ifs->if_mtu) { + printf("%s%d: invalid MTU for %s%d", + ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp), + ifnet_name(ifs), ifnet_unit(ifs)); + return (EINVAL); + } + } + + if (ifs->if_bridge == sc) + return (EEXIST); + + if (ifs->if_bridge != NULL) + return (EBUSY); + + bif = _MALLOC(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO); + if (bif == NULL) + return (ENOMEM); + + bif->bif_ifp = ifs; + bif->bif_flags = IFBIF_LEARNING | IFBIF_DISCOVER; +#if HAS_IF_CAP + bif->bif_savedcaps = ifs->if_capenable; +#endif /* HAS_IF_CAP */ + bif->bif_sc = sc; + + ifnet_reference(ifs); + + ifs->if_bridge = sc; + bstp_create(&sc->sc_stp, &bif->bif_stp, bif->bif_ifp); + /* + * XXX: XLOCK HERE!?! + */ + TAILQ_INSERT_TAIL(&sc->sc_iflist, bif, bif_next); + +#if HAS_IF_CAP + /* Set interface capabilities to the intersection set of all members */ + bridge_mutecaps(sc); +#endif /* HAS_IF_CAP */ + + + switch (ifs->if_type) { + case IFT_ETHER: + case IFT_L2VLAN: + /* + * Place the interface into promiscuous mode. 
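+		 * ENETDOWN is tolerated here: promiscuous mode is enabled
+		 * later from the KEV_DL_SIFFLAGS handler once the member
+		 * interface comes up.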
+		 */
+		error = ifnet_set_promiscuous(ifs, 1);
+		if (error) {
+			/* Ignore error when device is not up */
+			if (error != ENETDOWN)
+				goto out;
+			error = 0;
+		} else {
+			bif->bif_promisc = 1;
+		}
+		break;
+
+	case IFT_GIF:
+		break;
+
+	default:
+		error = EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Respect lock ordering with DLIL lock for the following operations
+	 */
+	BRIDGE_UNLOCK(sc);
+
+	/*
+	 * install an interface filter
+	 */
+	memset(&iff, 0, sizeof(struct iff_filter));
+	iff.iff_cookie = bif;
+	iff.iff_name = "com.apple.kernel.bsd.net.if_bridge";
+	iff.iff_input = bridge_iff_input;
+#if BRIDGE_MEMBER_OUT_FILTER
+	iff.iff_output = bridge_iff_output;
+#endif /* BRIDGE_MEMBER_OUT_FILTER */
+	iff.iff_event = bridge_iff_event;
+	iff.iff_detached = bridge_iff_detached;
+	error = iflt_attach(ifs, &iff, &bif->bif_iff_ref);
+	if (error != 0) {
+		printf("bridge_ioctl_add: iflt_attach failed %d\n", error);
+		BRIDGE_LOCK(sc);
+		goto out;
+	}
+	bif->bif_filter_attached = 1;
+
+	/*
+	 * install a dummy "bridge" protocol
+	 */
+	if ((error = bridge_attach_protocol(ifs)) != 0) {
+		printf("bridge_ioctl_add: bridge_attach_protocol failed %d\n", error);
+		BRIDGE_LOCK(sc);
+		goto out;
+	}
+	bif->bif_proto_attached = 1;
+
+	BRIDGE_LOCK(sc);
+
+out:
+	if (error && bif != NULL)
+		bridge_delete_member(sc, bif, 1);
+
+	return (error);
+}
+
+static int
+bridge_ioctl_del(struct bridge_softc *sc, void *arg)
+{
+	struct ifbreq *req = arg;
+	struct bridge_iflist *bif;
+
+	bif = bridge_lookup_member(sc, req->ifbr_ifsname);
+	if (bif == NULL)
+		return (ENOENT);
+
+	bridge_delete_member(sc, bif, 0);
+
+	return (0);
+}
+
+static int
+bridge_ioctl_purge(__unused struct bridge_softc *sc, __unused void *arg)
+{
+	return (0);
+}
+
+static int
+bridge_ioctl_gifflags(struct bridge_softc *sc, void *arg)
+{
+	struct ifbreq *req = arg;
+	struct bridge_iflist *bif;
+	struct bstp_port *bp;
+
+	bif = bridge_lookup_member(sc, req->ifbr_ifsname);
+	if (bif == NULL)
+		return (ENOENT);
+
+	bp = &bif->bif_stp;
+	req->ifbr_ifsflags = bif->bif_flags;
+	req->ifbr_state = bp->bp_state;
+	req->ifbr_priority = bp->bp_priority;
+	req->ifbr_path_cost = bp->bp_path_cost;
+	req->ifbr_portno = bif->bif_ifp->if_index & 0xfff;
+	req->ifbr_proto = bp->bp_protover;
+	req->ifbr_role = bp->bp_role;
+	req->ifbr_stpflags = bp->bp_flags;
+	req->ifbr_addrcnt = bif->bif_addrcnt;
+	req->ifbr_addrmax = bif->bif_addrmax;
+	req->ifbr_addrexceeded = bif->bif_addrexceeded;
+
+	/* Copy STP state options as flags */
+	if (bp->bp_operedge)
+		req->ifbr_ifsflags |= IFBIF_BSTP_EDGE;
+	if (bp->bp_flags & BSTP_PORT_AUTOEDGE)
+		req->ifbr_ifsflags |= IFBIF_BSTP_AUTOEDGE;
+	if (bp->bp_ptp_link)
+		req->ifbr_ifsflags |= IFBIF_BSTP_PTP;
+	if (bp->bp_flags & BSTP_PORT_AUTOPTP)
+		req->ifbr_ifsflags |= IFBIF_BSTP_AUTOPTP;
+	if (bp->bp_flags & BSTP_PORT_ADMEDGE)
+		req->ifbr_ifsflags |= IFBIF_BSTP_ADMEDGE;
+	if (bp->bp_flags & BSTP_PORT_ADMCOST)
+		req->ifbr_ifsflags |= IFBIF_BSTP_ADMCOST;
+	return (0);
+}
+
+static int
+bridge_ioctl_sifflags(struct bridge_softc *sc, void *arg)
+{
+	struct ifbreq *req = arg;
+	struct bridge_iflist *bif;
+	struct bstp_port *bp;
+	int error;
+
+	bif = bridge_lookup_member(sc, req->ifbr_ifsname);
+	if (bif == NULL)
+		return (ENOENT);
+	bp = &bif->bif_stp;
+
+	if (req->ifbr_ifsflags & IFBIF_SPAN)
+		/* SPAN is read-only */
+		return (EINVAL);
+
+	if (req->ifbr_ifsflags & IFBIF_STP) {
+		if ((bif->bif_flags & IFBIF_STP) == 0) {
+			error = bstp_enable(&bif->bif_stp);
+			if (error)
+				return (error);
+		}
+	} else {
+		if ((bif->bif_flags &
IFBIF_STP) != 0) + bstp_disable(&bif->bif_stp); + } + + /* Pass on STP flags */ + bstp_set_edge(bp, req->ifbr_ifsflags & IFBIF_BSTP_EDGE ? 1 : 0); + bstp_set_autoedge(bp, req->ifbr_ifsflags & IFBIF_BSTP_AUTOEDGE ? 1 : 0); + bstp_set_ptp(bp, req->ifbr_ifsflags & IFBIF_BSTP_PTP ? 1 : 0); + bstp_set_autoptp(bp, req->ifbr_ifsflags & IFBIF_BSTP_AUTOPTP ? 1 : 0); + + /* Save the bits relating to the bridge */ + bif->bif_flags = req->ifbr_ifsflags & IFBIFMASK; + + + return (0); +} + +static int +bridge_ioctl_scache(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + sc->sc_brtmax = param->ifbrp_csize; + bridge_rttrim(sc); + + return (0); +} + +static int +bridge_ioctl_gcache(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + param->ifbrp_csize = sc->sc_brtmax; + + return (0); +} + + +#define BRIDGE_IOCTL_GIFS do { \ + struct bridge_iflist *bif; \ + struct ifbreq breq; \ + char *buf, *outbuf; \ + unsigned int count, buflen, len; \ + \ + count = 0; \ + TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) \ + count++; \ + TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) \ + count++; \ + \ + buflen = sizeof(breq) * count; \ + if (bifc->ifbic_len == 0) { \ + bifc->ifbic_len = buflen; \ + return (0); \ + } \ + BRIDGE_UNLOCK(sc); \ + outbuf = _MALLOC(buflen, M_TEMP, M_WAITOK | M_ZERO); \ + BRIDGE_LOCK(sc); \ + \ + count = 0; \ + buf = outbuf; \ + len = min(bifc->ifbic_len, buflen); \ + bzero(&breq, sizeof(breq)); \ + TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) { \ + if (len < sizeof(breq)) \ + break; \ + \ + snprintf(breq.ifbr_ifsname, sizeof(breq.ifbr_ifsname), "%s%d", \ + ifnet_name(bif->bif_ifp), ifnet_unit(bif->bif_ifp)); \ + /* Fill in the ifbreq structure */ \ + error = bridge_ioctl_gifflags(sc, &breq); \ + if (error) \ + break; \ + memcpy(buf, &breq, sizeof(breq)); \ + count++; \ + buf += sizeof(breq); \ + len -= sizeof(breq); \ + } \ + TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) { \ + if (len < sizeof(breq)) \ + break; \ + \ + snprintf(breq.ifbr_ifsname, sizeof(breq.ifbr_ifsname), "%s%d", \ + ifnet_name(bif->bif_ifp), ifnet_unit(bif->bif_ifp)); \ + breq.ifbr_ifsflags = bif->bif_flags; \ + breq.ifbr_portno = bif->bif_ifp->if_index & 0xfff; \ + memcpy(buf, &breq, sizeof(breq)); \ + count++; \ + buf += sizeof(breq); \ + len -= sizeof(breq); \ + } \ + \ + BRIDGE_UNLOCK(sc); \ + bifc->ifbic_len = sizeof(breq) * count; \ + error = copyout(outbuf, bifc->ifbic_req, bifc->ifbic_len); \ + BRIDGE_LOCK(sc); \ + _FREE(outbuf, M_TEMP); \ +} while (0) + +static int +bridge_ioctl_gifs64(struct bridge_softc *sc, void *arg) +{ + struct ifbifconf64 *bifc = arg; + int error = 0; + + BRIDGE_IOCTL_GIFS; + + return (error); +} + +static int +bridge_ioctl_gifs32(struct bridge_softc *sc, void *arg) +{ + struct ifbifconf32 *bifc = arg; + int error = 0; + + BRIDGE_IOCTL_GIFS; + + return (error); +} + + +#define BRIDGE_IOCTL_RTS do { \ + struct bridge_rtnode *brt; \ + char *buf, *outbuf; \ + unsigned int count, buflen, len; \ + struct timespec now; \ + \ + if (bac->ifbac_len == 0) \ + return (0); \ + \ + count = 0; \ + LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) \ + count++; \ + buflen = sizeof(bareq) * count; \ + \ + BRIDGE_UNLOCK(sc); \ + outbuf = _MALLOC(buflen, M_TEMP, M_WAITOK | M_ZERO); \ + BRIDGE_LOCK(sc); \ + \ + count = 0; \ + buf = outbuf; \ + len = min(bac->ifbac_len, buflen); \ + bzero(&bareq, sizeof(bareq)); \ + LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) { \ + if (len < sizeof(bareq)) \ + goto out; \ + snprintf(bareq.ifba_ifsname, sizeof(bareq.ifba_ifsname), "%s%d", 
\ + ifnet_name(brt->brt_ifp), ifnet_unit(brt->brt_ifp)); \ + memcpy(bareq.ifba_dst, brt->brt_addr, sizeof(brt->brt_addr)); \ + bareq.ifba_vlan = brt->brt_vlan; \ + if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) { \ + nanouptime(&now); \ + if ((unsigned long)now.tv_sec < brt->brt_expire) \ + bareq.ifba_expire = brt->brt_expire - now.tv_sec; \ + } else \ + bareq.ifba_expire = 0; \ + bareq.ifba_flags = brt->brt_flags; \ + \ + memcpy(buf, &bareq, sizeof(bareq)); \ + count++; \ + buf += sizeof(bareq); \ + len -= sizeof(bareq); \ + } \ +out: \ + BRIDGE_UNLOCK(sc); \ + bac->ifbac_len = sizeof(bareq) * count; \ + error = copyout(outbuf, bac->ifbac_req, bac->ifbac_len); \ + BRIDGE_LOCK(sc); \ + _FREE(outbuf, M_TEMP); \ + return (error); \ +} while (0) + +static int +bridge_ioctl_rts64(struct bridge_softc *sc, void *arg) +{ + struct ifbaconf64 *bac = arg; + struct ifbareq64 bareq; + int error = 0; + + BRIDGE_IOCTL_RTS; + + return (error); +} + +static int +bridge_ioctl_rts32(struct bridge_softc *sc, void *arg) +{ + struct ifbaconf32 *bac = arg; + struct ifbareq32 bareq; + int error = 0; + + BRIDGE_IOCTL_RTS; + + return (error); +} + +static int +bridge_ioctl_saddr32(struct bridge_softc *sc, void *arg) +{ + struct ifbareq32 *req = arg; + struct bridge_iflist *bif; + int error; + + bif = bridge_lookup_member(sc, req->ifba_ifsname); + if (bif == NULL) + return (ENOENT); + + error = bridge_rtupdate(sc, req->ifba_dst, req->ifba_vlan, bif, 1, + req->ifba_flags); + + return (error); +} + +static int +bridge_ioctl_saddr64(struct bridge_softc *sc, void *arg) +{ + struct ifbareq64 *req = arg; + struct bridge_iflist *bif; + int error; + + bif = bridge_lookup_member(sc, req->ifba_ifsname); + if (bif == NULL) + return (ENOENT); + + error = bridge_rtupdate(sc, req->ifba_dst, req->ifba_vlan, bif, 1, + req->ifba_flags); + + return (error); +} + +static int +bridge_ioctl_sto(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + sc->sc_brttimeout = param->ifbrp_ctime; + return (0); +} + +static int +bridge_ioctl_gto(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + param->ifbrp_ctime = sc->sc_brttimeout; + return (0); +} + +static int +bridge_ioctl_daddr32(struct bridge_softc *sc, void *arg) +{ + struct ifbareq32 *req = arg; + + return (bridge_rtdaddr(sc, req->ifba_dst, req->ifba_vlan)); +} + +static int +bridge_ioctl_daddr64(struct bridge_softc *sc, void *arg) +{ + struct ifbareq64 *req = arg; + + return (bridge_rtdaddr(sc, req->ifba_dst, req->ifba_vlan)); +} + +static int +bridge_ioctl_flush(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + + bridge_rtflush(sc, req->ifbr_ifsflags); + return (0); +} + +static int +bridge_ioctl_gpri(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + struct bstp_state *bs = &sc->sc_stp; + + param->ifbrp_prio = bs->bs_bridge_priority; + return (0); +} + +static int +bridge_ioctl_spri(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + return (bstp_set_priority(&sc->sc_stp, param->ifbrp_prio)); +} + +static int +bridge_ioctl_ght(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + struct bstp_state *bs = &sc->sc_stp; + + param->ifbrp_hellotime = bs->bs_bridge_htime >> 8; + return (0); +} + +static int +bridge_ioctl_sht(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + return (bstp_set_htime(&sc->sc_stp, param->ifbrp_hellotime)); +} + +static int +bridge_ioctl_gfd(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam 
*param = arg; + struct bstp_state *bs = &sc->sc_stp; + + param->ifbrp_fwddelay = bs->bs_bridge_fdelay >> 8; + return (0); +} + +static int +bridge_ioctl_sfd(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + return (bstp_set_fdelay(&sc->sc_stp, param->ifbrp_fwddelay)); +} + +static int +bridge_ioctl_gma(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + struct bstp_state *bs = &sc->sc_stp; + + param->ifbrp_maxage = bs->bs_bridge_max_age >> 8; + return (0); +} + +static int +bridge_ioctl_sma(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + return (bstp_set_maxage(&sc->sc_stp, param->ifbrp_maxage)); +} + +static int +bridge_ioctl_sifprio(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + + bif = bridge_lookup_member(sc, req->ifbr_ifsname); + if (bif == NULL) + return (ENOENT); + + return (bstp_set_port_priority(&bif->bif_stp, req->ifbr_priority)); +} + +static int +bridge_ioctl_sifcost(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + + bif = bridge_lookup_member(sc, req->ifbr_ifsname); + if (bif == NULL) + return (ENOENT); + + return (bstp_set_path_cost(&bif->bif_stp, req->ifbr_path_cost)); +} + +static int +bridge_ioctl_gfilt(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + param->ifbrp_filter = sc->sc_filter_flags; + + return (0); +} + +static int +bridge_ioctl_sfilt(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + if (param->ifbrp_filter & ~IFBF_FILT_MASK) + return (EINVAL); + +#ifndef BRIDGE_IPF + if (param->ifbrp_filter & IFBF_FILT_USEIPF) + return (EINVAL); +#endif + + sc->sc_filter_flags = param->ifbrp_filter; + + return (0); +} + +static int +bridge_ioctl_sifmaxaddr(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + + bif = bridge_lookup_member(sc, req->ifbr_ifsname); + if (bif == NULL) + return (ENOENT); + + bif->bif_addrmax = req->ifbr_addrmax; + return (0); +} + +static int +bridge_ioctl_addspan(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif = NULL; + struct ifnet *ifs; + + ifs = ifunit(req->ifbr_ifsname); + if (ifs == NULL) + return (ENOENT); + + TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) + if (ifs == bif->bif_ifp) + return (EBUSY); + + if (ifs->if_bridge != NULL) + return (EBUSY); + + switch (ifs->if_type) { + case IFT_ETHER: + case IFT_GIF: + case IFT_L2VLAN: + break; + default: + return (EINVAL); + } + + bif = _MALLOC(sizeof(*bif), M_DEVBUF, M_NOWAIT|M_ZERO); + if (bif == NULL) + return (ENOMEM); + + bif->bif_ifp = ifs; + bif->bif_flags = IFBIF_SPAN; + + ifnet_reference(bif->bif_ifp); + + TAILQ_INSERT_HEAD(&sc->sc_spanlist, bif, bif_next); + + return (0); +} + +static int +bridge_ioctl_delspan(struct bridge_softc *sc, void *arg) +{ + struct ifbreq *req = arg; + struct bridge_iflist *bif; + struct ifnet *ifs; + + ifs = ifunit(req->ifbr_ifsname); + if (ifs == NULL) + return (ENOENT); + + TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) + if (ifs == bif->bif_ifp) + break; + + if (bif == NULL) + return (ENOENT); + + bridge_delete_span(sc, bif); + + return (0); +} + +#define BRIDGE_IOCTL_GBPARAM do { \ + struct bstp_state *bs = &sc->sc_stp; \ + struct bstp_port *root_port; \ + \ + req->ifbop_maxage = bs->bs_bridge_max_age >> 8; \ + req->ifbop_hellotime = bs->bs_bridge_htime >> 8; \ + req->ifbop_fwddelay = bs->bs_bridge_fdelay >> 8; \ + \ + root_port = 
bs->bs_root_port; \ + if (root_port == NULL) \ + req->ifbop_root_port = 0; \ + else \ + req->ifbop_root_port = root_port->bp_ifp->if_index; \ + \ + req->ifbop_holdcount = bs->bs_txholdcount; \ + req->ifbop_priority = bs->bs_bridge_priority; \ + req->ifbop_protocol = bs->bs_protover; \ + req->ifbop_root_path_cost = bs->bs_root_pv.pv_cost; \ + req->ifbop_bridgeid = bs->bs_bridge_pv.pv_dbridge_id; \ + req->ifbop_designated_root = bs->bs_root_pv.pv_root_id; \ + req->ifbop_designated_bridge = bs->bs_root_pv.pv_dbridge_id; \ + req->ifbop_last_tc_time.tv_sec = bs->bs_last_tc_time.tv_sec; \ + req->ifbop_last_tc_time.tv_usec = bs->bs_last_tc_time.tv_usec; \ +} while (0) + +static int +bridge_ioctl_gbparam32(struct bridge_softc *sc, void *arg) +{ + struct ifbropreq32 *req = arg; + + BRIDGE_IOCTL_GBPARAM; + + return (0); +} + +static int +bridge_ioctl_gbparam64(struct bridge_softc *sc, void *arg) +{ + struct ifbropreq64 *req = arg; + + BRIDGE_IOCTL_GBPARAM; + + return (0); +} + + +static int +bridge_ioctl_grte(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + param->ifbrp_cexceeded = sc->sc_brtexceeded; + return (0); +} + +#define BRIDGE_IOCTL_GIFSSTP do { \ + struct bridge_iflist *bif; \ + struct bstp_port *bp; \ + struct ifbpstpreq bpreq; \ + char *buf, *outbuf; \ + unsigned int count, buflen, len; \ + \ + count = 0; \ + TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) { \ + if ((bif->bif_flags & IFBIF_STP) != 0) \ + count++; \ + } \ + \ + buflen = sizeof(bpreq) * count; \ + if (bifstp->ifbpstp_len == 0) { \ + bifstp->ifbpstp_len = buflen; \ + return (0); \ + } \ + \ + BRIDGE_UNLOCK(sc); \ + outbuf = _MALLOC(buflen, M_TEMP, M_WAITOK | M_ZERO); \ + BRIDGE_LOCK(sc); \ + \ + count = 0; \ + buf = outbuf; \ + len = min(bifstp->ifbpstp_len, buflen); \ + bzero(&bpreq, sizeof(bpreq)); \ + TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) { \ + if (len < sizeof(bpreq)) \ + break; \ + \ + if ((bif->bif_flags & IFBIF_STP) == 0) \ + continue; \ + \ + bp = &bif->bif_stp; \ + bpreq.ifbp_portno = bif->bif_ifp->if_index & 0xfff; \ + bpreq.ifbp_fwd_trans = bp->bp_forward_transitions; \ + bpreq.ifbp_design_cost = bp->bp_desg_pv.pv_cost; \ + bpreq.ifbp_design_port = bp->bp_desg_pv.pv_port_id; \ + bpreq.ifbp_design_bridge = bp->bp_desg_pv.pv_dbridge_id; \ + bpreq.ifbp_design_root = bp->bp_desg_pv.pv_root_id; \ + \ + memcpy(buf, &bpreq, sizeof(bpreq)); \ + count++; \ + buf += sizeof(bpreq); \ + len -= sizeof(bpreq); \ + } \ + \ + BRIDGE_UNLOCK(sc); \ + bifstp->ifbpstp_len = sizeof(bpreq) * count; \ + error = copyout(outbuf, bifstp->ifbpstp_req, bifstp->ifbpstp_len); \ + BRIDGE_LOCK(sc); \ + _FREE(outbuf, M_TEMP); \ + return (error); \ +} while (0) + +static int +bridge_ioctl_gifsstp32(struct bridge_softc *sc, void *arg) +{ + struct ifbpstpconf32 *bifstp = arg; + int error = 0; + + BRIDGE_IOCTL_GIFSSTP; + + return (error); +} + +static int +bridge_ioctl_gifsstp64(struct bridge_softc *sc, void *arg) +{ + struct ifbpstpconf64 *bifstp = arg; + int error = 0; + + BRIDGE_IOCTL_GIFSSTP; + + return (error); +} + +static int +bridge_ioctl_sproto(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + return (bstp_set_protocol(&sc->sc_stp, param->ifbrp_proto)); +} + +static int +bridge_ioctl_stxhc(struct bridge_softc *sc, void *arg) +{ + struct ifbrparam *param = arg; + + return (bstp_set_holdcount(&sc->sc_stp, param->ifbrp_txhc)); +} + +/* + * bridge_ifdetach: + * + * Detach an interface from a bridge. Called when a member + * interface is detaching. 
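+ *
+ *	The interface may be either a regular bridge member or a span
+ *	port; the body below handles the member case first and then
+ *	walks the global bridge list looking for a matching span port.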
+ */ +__private_extern__ void +bridge_ifdetach(struct bridge_iflist *bif, struct ifnet *ifp) +{ + struct bridge_softc *sc = ifp->if_bridge; + +#if BRIDGE_DEBUG + printf("bridge_ifdetach %s%d\n", ifnet_name(ifp), ifnet_unit(ifp)); +#endif + + /* Check if the interface is a bridge member */ + if (sc != NULL) { + BRIDGE_LOCK(sc); + + bif = bridge_lookup_member_if(sc, ifp); + if (bif != NULL) + bridge_delete_member(sc, bif, 1); + + BRIDGE_UNLOCK(sc); + return; + } + + /* Check if the interface is a span port */ + lck_mtx_lock(bridge_list_mtx); + LIST_FOREACH(sc, &bridge_list, sc_list) { + BRIDGE_LOCK(sc); + TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) + if (ifp == bif->bif_ifp) { + bridge_delete_span(sc, bif); + break; + } + + BRIDGE_UNLOCK(sc); + } + lck_mtx_unlock(bridge_list_mtx); +} + +/* + * bridge_init: + * + * Initialize a bridge interface. + */ +static int +bridge_init(struct ifnet *ifp) +{ + struct bridge_softc *sc = (struct bridge_softc *)ifp->if_softc; + struct timespec ts; + errno_t error; + + BRIDGE_LOCK_ASSERT(sc); + + if ((ifnet_flags(ifp) & IFF_RUNNING)) + return 0; + + ts.tv_sec = bridge_rtable_prune_period; + ts.tv_nsec = 0; + bsd_timeout(bridge_timer, sc, &ts); + + error = ifnet_set_flags(ifp, IFF_RUNNING, IFF_RUNNING); + if (error == 0) + bstp_init(&sc->sc_stp); /* Initialize Spanning Tree */ + + return error; +} + +/* + * bridge_stop: + * + * Stop the bridge interface. + */ +static void +bridge_stop(struct ifnet *ifp, __unused int disable) +{ + struct bridge_softc *sc = ifp->if_softc; + + BRIDGE_LOCK_ASSERT(sc); + + if ((ifnet_flags(ifp) & IFF_RUNNING) == 0) + return; + + bsd_untimeout(bridge_timer, sc); + bstp_stop(&sc->sc_stp); + + bridge_rtflush(sc, IFBF_FLUSHDYN); + + (void) ifnet_set_flags(ifp, 0, IFF_RUNNING); +} + +/* + * bridge_enqueue: + * + * Enqueue a packet on a bridge member interface. + * + */ +static void +bridge_enqueue(struct bridge_softc *sc, struct ifnet *dst_ifp, struct mbuf *m) +{ + int len, error = 0; + short mflags; + struct mbuf *m0; + + /* We may be sending a fragment so traverse the mbuf */ + for (; m; m = m0) { + m0 = m->m_nextpkt; + m->m_nextpkt = NULL; + + len = m->m_pkthdr.len; + mflags = m->m_flags; + m->m_flags |= M_PROTO1; //set to avoid loops + +#if HAS_IF_CAP + /* + * If underlying interface can not do VLAN tag insertion itself + * then attach a packet tag that holds it. + */ + if ((m->m_flags & M_VLANTAG) && + (dst_ifp->if_capenable & IFCAP_VLAN_HWTAGGING) == 0) { + m = ether_vlanencap(m, m->m_pkthdr.ether_vtag); + if (m == NULL) { + printf("%s%d: unable to prepend VLAN header\n", + ifnet_name(dst_ifp), ifnet_unit(dst_ifp)); + (void) ifnet_stat_increment_out(dst_ifp, 0, 0, 1); + continue; + } + m->m_flags &= ~M_VLANTAG; + } +#endif /* HAS_IF_CAP */ + + error = ifnet_output_raw(dst_ifp, 0, m); + if (error == 0) { + (void) ifnet_stat_increment_out(sc->sc_ifp, 1, len, 0); + } else { + (void) ifnet_stat_increment_out(sc->sc_ifp, 0, 0, 1); + } + } + + return; +} + +#if HAS_BRIDGE_DUMMYNET +/* + * bridge_dummynet: + * + * Receive a queued packet from dummynet and pass it on to the output + * interface. + * + * The mbuf has the Ethernet header already attached. + */ +static void +bridge_dummynet(struct mbuf *m, struct ifnet *ifp) +{ + struct bridge_softc *sc; + + sc = ifp->if_bridge; + + /* + * The packet didnt originate from a member interface. This should only + * ever happen if a member interface is removed while packets are + * queued for it. 
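+	 * When that happens the packet is simply dropped.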
+ */ + if (sc == NULL) { + m_freem(m); + return; + } + + if (PFIL_HOOKED(&inet_pfil_hook) +#ifdef INET6 + || PFIL_HOOKED(&inet6_pfil_hook) +#endif + ) { + if (bridge_pfil(&m, sc->sc_ifp, ifp, PFIL_OUT) != 0) + return; + if (m == NULL) + return; + } + + bridge_enqueue(sc, ifp, m); +} +#endif /* HAS_BRIDGE_DUMMYNET */ + +#if BRIDGE_MEMBER_OUT_FILTER +/* + * bridge_output: + * + * Send output from a bridge member interface. This + * performs the bridging function for locally originated + * packets. + * + * The mbuf has the Ethernet header already attached. We must + * enqueue or free the mbuf before returning. + */ +static int +bridge_output(struct ifnet *ifp, struct mbuf *m, __unused struct sockaddr *sa, + __unused struct rtentry *rt) +{ + struct ether_header *eh; + struct ifnet *dst_if; + struct bridge_softc *sc; + uint16_t vlan; + +#if BRIDGE_DEBUG + if (_if_brige_debug) + printf("bridge_output ifp %p %s%d\n", ifp, ifnet_name(ifp), ifnet_unit(ifp)); +#endif /* BRIDGE_DEBUG */ + + if (m->m_len < ETHER_HDR_LEN) { + m = m_pullup(m, ETHER_HDR_LEN); + if (m == NULL) + return (0); + } + + eh = mtod(m, struct ether_header *); + sc = ifp->if_bridge; + vlan = VLANTAGOF(m); + + BRIDGE_LOCK(sc); + + /* APPLE MODIFICATION + * If the packet is an 802.1X ethertype, then only send on the + * original output interface. + */ + if (eh->ether_type == htons(ETHERTYPE_PAE)) { + dst_if = ifp; + goto sendunicast; + } + + /* + * If bridge is down, but the original output interface is up, + * go ahead and send out that interface. Otherwise, the packet + * is dropped below. + */ + if ((sc->sc_ifp->if_flags & IFF_RUNNING) == 0) { + dst_if = ifp; + goto sendunicast; + } + + /* + * If the packet is a multicast, or we don't know a better way to + * get there, send to all interfaces. + */ + if (ETHER_IS_MULTICAST(eh->ether_dhost)) + dst_if = NULL; + else + dst_if = bridge_rtlookup(sc, eh->ether_dhost, vlan); + if (dst_if == NULL) { + struct bridge_iflist *bif; + struct mbuf *mc; + int error = 0, used = 0; + + bridge_span(sc, m); + + BRIDGE_LOCK2REF(sc, error); + if (error) { + m_freem(m); + return (0); + } + + TAILQ_FOREACH(bif, &sc->sc_iflist, bif_next) { + dst_if = bif->bif_ifp; + + if (dst_if->if_type == IFT_GIF) + continue; + if ((dst_if->if_flags & IFF_RUNNING) == 0) + continue; + + /* + * If this is not the original output interface, + * and the interface is participating in spanning + * tree, make sure the port is in a state that + * allows forwarding. + */ + if (dst_if != ifp && (bif->bif_flags & IFBIF_STP) && + bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) + continue; + + if (LIST_NEXT(bif, bif_next) == NULL) { + used = 1; + mc = m; + } else { + mc = m_copypacket(m, M_DONTWAIT); + if (mc == NULL) { + (void) ifnet_stat_increment_out(sc->sc_ifp, 0, 0, 1); + continue; + } + } + + bridge_enqueue(sc, dst_if, mc); + } + if (used == 0) + m_freem(m); + BRIDGE_UNREF(sc); + return (0); + } + +sendunicast: + /* + * XXX Spanning tree consideration here? 
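+	 * For now the frame is simply handed to the chosen interface,
+	 * provided that interface is up and running.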
+ */ + + bridge_span(sc, m); + if ((dst_if->if_flags & IFF_RUNNING) == 0) { + m_freem(m); + BRIDGE_UNLOCK(sc); + return (0); + } + + BRIDGE_UNLOCK(sc); + bridge_enqueue(sc, dst_if, m); + return (0); +} +#endif /* BRIDGE_MEMBER_OUT_FILTER */ + +#if APPLE_BRIDGE_HWCKSUM_SUPPORT +static struct mbuf* bridge_fix_txcsum( struct mbuf *m ) +{ + // basic tests indicate that the vast majority of packets being processed + // here have an Ethernet header mbuf pre-pended to them (the first case below) + // the second highest are those where the Ethernet and IP/TCP/UDP headers are + // all in one mbuf (second case below) + // the third case has, in fact, never hit for me -- although if I comment out + // the first two cases, that code works for them, so I consider it a + // decent general solution + + int amt = ETHER_HDR_LEN; + int hlen = M_CSUM_DATA_IPv4_IPHL( m->m_pkthdr.csum_data ); + int off = M_CSUM_DATA_IPv4_OFFSET( m->m_pkthdr.csum_data ); + + /* + * NOTE we should never get vlan-attached packets here; + * support for those COULD be added, but we don't use them + * and it really kinda slows things down to worry about them + */ + +#ifdef DIAGNOSTIC + if ( m_tag_find( m, PACKET_TAG_VLAN, NULL ) != NULL ) + { + printf( "bridge: transmitting packet tagged with VLAN?\n" ); + KASSERT( 0 ); + m_freem( m ); + return NULL; + } +#endif + + if ( m->m_pkthdr.csum_flags & M_CSUM_IPv4 ) + { + amt += hlen; + } + if ( m->m_pkthdr.csum_flags & M_CSUM_TCPv4 ) + { + amt += off + sizeof( uint16_t ); + } + + if ( m->m_pkthdr.csum_flags & M_CSUM_UDPv4 ) + { + amt += off + sizeof( uint16_t ); + } + + if ( m->m_len == ETHER_HDR_LEN ) + { + // this is the case where there's an Ethernet header in an mbuf + + // the first mbuf is the Ethernet header -- just strip it off and do the checksum + struct mbuf *m_ip = m->m_next; + + // set up m_ip so the cksum operations work + /* APPLE MODIFICATION 22 Apr 2008 <mvega@apple.com> + * <rdar://5817385> Clear the m_tag list before setting + * M_PKTHDR. + * + * If this m_buf chain was extended via M_PREPEND(), then + * m_ip->m_pkthdr is identical to m->m_pkthdr (see + * M_MOVE_PKTHDR()). The only thing preventing access to this + * invalid packet header data is the fact that the M_PKTHDR + * flag is clear, i.e., m_ip->m_flag & M_PKTHDR == 0, but we're + * about to set the M_PKTHDR flag, so to be safe we initialize, + * more accurately, we clear, m_ip->m_pkthdr.tags via + * m_tag_init(). + * + * Suppose that we do not do this; if m_pullup(), below, fails, + * then m_ip will be freed along with m_ip->m_pkthdr.tags, but + * we will also free m soon after, via m_freem(), and + * consequently attempt to free m->m_pkthdr.tags in the + * process. The problem is that m->m_pkthdr.tags will have + * already been freed by virtue of being equal to + * m_ip->m_pkthdr.tags. Attempts to dereference + * m->m_pkthdr.tags in m_tag_delete_chain() will result in a + * panic. 
+ */ + m_tag_init(m_ip); + /* END MODIFICATION */ + m_ip->m_flags |= M_PKTHDR; + m_ip->m_pkthdr.csum_flags = m->m_pkthdr.csum_flags; + m_ip->m_pkthdr.csum_data = m->m_pkthdr.csum_data; + m_ip->m_pkthdr.len = m->m_pkthdr.len - ETHER_HDR_LEN; + + // set up the header mbuf so we can prepend it back on again later + m->m_pkthdr.csum_flags = 0; + m->m_pkthdr.csum_data = 0; + m->m_pkthdr.len = ETHER_HDR_LEN; + m->m_next = NULL; + + + // now do the checksums we need -- first IP + if ( m_ip->m_pkthdr.csum_flags & M_CSUM_IPv4 ) + { + // make sure the IP header (or at least the part with the cksum) is there + m_ip = m_pullup( m_ip, sizeof( struct ip ) ); + if ( m_ip == NULL ) + { + printf( "bridge: failed to flatten header\n "); + m_freem( m ); + return NULL; + } + + // now do the checksum + { + struct ip *ip = mtod( m_ip, struct ip* ); + ip->ip_sum = in_cksum( m_ip, hlen ); + +#ifdef VERY_VERY_VERY_DIAGNOSTIC + printf( "bridge: performed IPv4 checksum\n" ); +#endif + } + } + + // now do a TCP or UDP delayed checksum + if ( m_ip->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4) ) + { + in_delayed_cksum( m_ip ); + +#ifdef VERY_VERY_VERY_DIAGNOSTIC + printf( "bridge: performed TCPv4/UDPv4 checksum\n" ); +#endif + } + + // now attach the ethernet header back onto the IP packet + m->m_next = m_ip; + m->m_pkthdr.len += m_length( m_ip ); + + // clear the M_PKTHDR flags on the ip packet (again, we re-attach later) + m_ip->m_flags &= ~M_PKTHDR; + + // and clear any csum flags + m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4); + } + else if ( m->m_len >= amt ) + { + // everything fits in the first mbuf, so futz with m->m_data, m->m_len and m->m_pkthdr.len to + // make it work + m->m_len -= ETHER_HDR_LEN; + m->m_data += ETHER_HDR_LEN; + m->m_pkthdr.len -= ETHER_HDR_LEN; + + // now do the checksums we need -- first IP + if ( m->m_pkthdr.csum_flags & M_CSUM_IPv4 ) + { + struct ip *ip = mtod( m, struct ip* ); + ip->ip_sum = in_cksum( m, hlen ); + +#ifdef VERY_VERY_VERY_DIAGNOSTIC + printf( "bridge: performed IPv4 checksum\n" ); +#endif + } + + // now do a TCP or UDP delayed checksum + if ( m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4) ) + { + in_delayed_cksum( m ); + +#ifdef VERY_VERY_VERY_DIAGNOSTIC + printf( "bridge: performed TCPv4/UDPv4 checksum\n" ); +#endif + } + + // now stick the ethernet header back on + m->m_len += ETHER_HDR_LEN; + m->m_data -= ETHER_HDR_LEN; + m->m_pkthdr.len += ETHER_HDR_LEN; + + // and clear any csum flags + m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4); + } + else + { + struct mbuf *m_ip; + + // general case -- need to simply split it off and deal + + // first, calculate how much needs to be made writable (we may have a read-only mbuf here) + hlen = M_CSUM_DATA_IPv4_IPHL( m->m_pkthdr.csum_data ); +#if PARANOID + off = M_CSUM_DATA_IPv4_OFFSET( m->m_pkthdr.csum_data ); + + if ( m->m_pkthdr.csum_flags & M_CSUM_IPv4 ) + { + amt += hlen; + } + + if ( m->m_pkthdr.csum_flags & M_CSUM_TCPv4 ) + { + amt += sizeof( struct tcphdr * ); + amt += off; + } + + if ( m->m_pkthdr.csum_flags & M_CSUM_UDPv4 ) + { + amt += sizeof( struct udphdr * ); + amt += off; + } +#endif + + // now split the ethernet header off of the IP packet (we'll re-attach later) + m_ip = m_split( m, ETHER_HDR_LEN, M_NOWAIT ); + if ( m_ip == NULL ) + { + printf( "bridge_fix_txcsum: could not split ether header\n" ); + + m_freem( m ); + return NULL; + } + +#if PARANOID + // make sure that the IP packet is writable for the portion we need + if ( m_makewritable( &m_ip, 0, amt, 
M_DONTWAIT ) != 0 ) + { + printf( "bridge_fix_txcsum: could not make %d bytes writable\n", amt ); + + m_freem( m ); + m_freem( m_ip ); + return NULL; + } +#endif + + m_ip->m_pkthdr.csum_flags = m->m_pkthdr.csum_flags; + m_ip->m_pkthdr.csum_data = m->m_pkthdr.csum_data; + + m->m_pkthdr.csum_flags = 0; + m->m_pkthdr.csum_data = 0; + + // now do the checksums we need -- first IP + if ( m_ip->m_pkthdr.csum_flags & M_CSUM_IPv4 ) + { + // make sure the IP header (or at least the part with the cksum) is there + m_ip = m_pullup( m_ip, sizeof( struct ip ) ); + if ( m_ip == NULL ) + { + printf( "bridge: failed to flatten header\n "); + m_freem( m ); + return NULL; + } + + // now do the checksum + { + struct ip *ip = mtod( m_ip, struct ip* ); + ip->ip_sum = in_cksum( m_ip, hlen ); + +#ifdef VERY_VERY_VERY_DIAGNOSTIC + printf( "bridge: performed IPv4 checksum\n" ); +#endif + } + } + + // now do a TCP or UDP delayed checksum + if ( m_ip->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4) ) + { + in_delayed_cksum( m_ip ); + +#ifdef VERY_VERY_VERY_DIAGNOSTIC + printf( "bridge: performed TCPv4/UDPv4 checksum\n" ); +#endif + } + + // now attach the ethernet header back onto the IP packet + m->m_next = m_ip; + m->m_pkthdr.len += m_length( m_ip ); + + // clear the M_PKTHDR flags on the ip packet (again, we re-attach later) + m_ip->m_flags &= ~M_PKTHDR; + + // and clear any csum flags + m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4); + } + + return m; +} +#endif + +/* + * bridge_start: + * + * Start output on a bridge. + * + */ +static errno_t +bridge_start(struct ifnet *ifp, struct mbuf *m) +{ + struct bridge_softc *sc = ifnet_softc(ifp); + struct ether_header *eh; + struct ifnet *dst_if; + + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_NOTOWNED); + + eh = mtod(m, struct ether_header *); + + BRIDGE_LOCK(sc); + + if ((m->m_flags & (M_BCAST|M_MCAST)) == 0 && + (dst_if = bridge_rtlookup(sc, eh->ether_dhost, 0)) != NULL) { + + { +#if APPLE_BRIDGE_HWCKSUM_SUPPORT + /* + * APPLE MODIFICATION - if the packet needs a checksum (i.e., + * checksum has been deferred for HW support) AND the destination + * interface doesn't support HW checksums, then we + * need to fix-up the checksum here + */ + if ( + ( (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4) ) != 0 ) && + ( (dst_if->if_csum_flags_tx & m->m_pkthdr.csum_flags ) != m->m_pkthdr.csum_flags ) + ) + { + m = bridge_fix_txcsum( m ); + if ( m == NULL ) + { + goto done; + } + } + +#else + if (eh->ether_type == htons(ETHERTYPE_IP)) + mbuf_outbound_finalize(m, PF_INET, sizeof(struct ether_header)); + else + m->m_pkthdr.csum_flags = 0; +#endif + #if NBPFILTER > 0 + if (sc->sc_bpf_output) + bridge_bpf_output(ifp, m); + #endif + BRIDGE_UNLOCK(sc); + bridge_enqueue(sc, dst_if, m); + } + } else + { +#if APPLE_BRIDGE_HWCKSUM_SUPPORT + + /* + * APPLE MODIFICATION - if the MULTICAST packet needs a checksum (i.e., + * checksum has been deferred for HW support) AND at least one destination + * interface doesn't support HW checksums, then we go ahead and fix it up + * here, since it doesn't make sense to do it more than once + */ + + if ( + (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_IPv4)) && + /* + * XXX FIX ME: keep track of whether or not we have any interfaces that + * do not support checksums (for now, assume we do) + */ + ( 1 ) + ) + { + m = bridge_fix_txcsum( m ); + if ( m == NULL ) + { + goto done; + } + } +#else + if (eh->ether_type == htons(ETHERTYPE_IP)) + mbuf_outbound_finalize(m, PF_INET, sizeof(struct ether_header)); + 
else + m->m_pkthdr.csum_flags = 0; +#endif + + #if NBPFILTER > 0 + if (sc->sc_bpf_output) + bridge_bpf_output(ifp, m); + #endif + bridge_broadcast(sc, ifp, m, 0); + } +#if APPLE_BRIDGE_HWCKSUM_SUPPORT +done: +#endif + + return 0; +} + +/* + * bridge_forward: + * + * The forwarding function of the bridge. + * + * NOTE: Releases the lock on return. + */ +static void +bridge_forward(struct bridge_softc *sc, struct bridge_iflist *sbif, + struct mbuf *m) +{ + struct bridge_iflist *dbif; + struct ifnet *src_if, *dst_if, *ifp; + struct ether_header *eh; + uint16_t vlan; + uint8_t *dst; + int error; + + lck_mtx_assert(sc->sc_mtx, LCK_MTX_ASSERT_OWNED); + +#if BRIDGE_DEBUG + if (_if_brige_debug) + printf("bridge_forward %s%d m%p\n", ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp), m); +#endif /* BRIDGE_DEBUG */ + + src_if = m->m_pkthdr.rcvif; + ifp = sc->sc_ifp; + + (void) ifnet_stat_increment_in(ifp, 1, m->m_pkthdr.len, 0); + vlan = VLANTAGOF(m); + + + if ((sbif->bif_flags & IFBIF_STP) && + sbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) + goto drop; + + eh = mtod(m, struct ether_header *); + dst = eh->ether_dhost; + + /* If the interface is learning, record the address. */ + if (sbif->bif_flags & IFBIF_LEARNING) { + error = bridge_rtupdate(sc, eh->ether_shost, vlan, + sbif, 0, IFBAF_DYNAMIC); + /* + * If the interface has addresses limits then deny any source + * that is not in the cache. + */ + if (error && sbif->bif_addrmax) + goto drop; + } + + if ((sbif->bif_flags & IFBIF_STP) != 0 && + sbif->bif_stp.bp_state == BSTP_IFSTATE_LEARNING) + goto drop; + + /* + * At this point, the port either doesn't participate + * in spanning tree or it is in the forwarding state. + */ + + /* + * If the packet is unicast, destined for someone on + * "this" side of the bridge, drop it. + */ + if ((m->m_flags & (M_BCAST|M_MCAST)) == 0) { + dst_if = bridge_rtlookup(sc, dst, vlan); + if (src_if == dst_if) + goto drop; + } else { + /* + * Check if its a reserved multicast address, any address + * listed in 802.1D section 7.12.6 may not be forwarded by the + * bridge. + * This is currently 01-80-C2-00-00-00 to 01-80-C2-00-00-0F + */ + if (dst[0] == 0x01 && dst[1] == 0x80 && + dst[2] == 0xc2 && dst[3] == 0x00 && + dst[4] == 0x00 && dst[5] <= 0x0f) + goto drop; + + + /* ...forward it to all interfaces. */ + atomic_add_64(&ifp->if_imcasts, 1); + dst_if = NULL; + } + + /* + * If we have a destination interface which is a member of our bridge, + * OR this is a unicast packet, push it through the bpf(4) machinery. + * For broadcast or multicast packets, don't bother because it will + * be reinjected into ether_input. We do this before we pass the packets + * through the pfil(9) framework, as it is possible that pfil(9) will + * drop the packet, or possibly modify it, making it difficult to debug + * firewall issues on the bridge. + */ +#if NBPFILTER > 0 + if (eh->ether_type == htons(ETHERTYPE_RSN_PREAUTH) || + dst_if != NULL || (m->m_flags & (M_BCAST | M_MCAST)) == 0) { + m->m_pkthdr.rcvif = ifp; + if (sc->sc_bpf_input) + bridge_bpf_input(ifp, m); + } +#endif /* NBPFILTER */ + +#if defined(PFIL_HOOKS) + /* run the packet filter */ + if (PFIL_HOOKED(&inet_pfil_hook) +#ifdef INET6 + || PFIL_HOOKED(&inet6_pfil_hook) +#endif /* INET6 */ + ) { + BRIDGE_UNLOCK(sc); + if (bridge_pfil(&m, ifp, src_if, PFIL_IN) != 0) + return; + if (m == NULL) + return; + BRIDGE_LOCK(sc); + } +#endif /* PFIL_HOOKS */ + + if (dst_if == NULL) { + /* + * Clear any in-bound checksum flags for this packet. 
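+		 * They describe receive-side offload state from the inbound
+		 * interface and would be stale on the members we are about
+		 * to flood.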
+ */ + mbuf_inbound_modified(m); + + bridge_broadcast(sc, src_if, m, 1); + + return; + } + + /* + * At this point, we're dealing with a unicast frame + * going to a different interface. + */ + if ((dst_if->if_flags & IFF_RUNNING) == 0) + goto drop; + + dbif = bridge_lookup_member_if(sc, dst_if); + if (dbif == NULL) + /* Not a member of the bridge (anymore?) */ + goto drop; + + /* Private segments can not talk to each other */ + if (sbif->bif_flags & dbif->bif_flags & IFBIF_PRIVATE) + goto drop; + + if ((dbif->bif_flags & IFBIF_STP) && + dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) + goto drop; + +#if HAS_DHCPRA_MASK + /* APPLE MODIFICATION <rdar://6985737> */ + if ((dst_if->if_extflags & IFEXTF_DHCPRA_MASK) != 0) { + m = ip_xdhcpra_output(dst_if, m); + if (!m) { + ++sc->sc_sc.sc_ifp.if_xdhcpra; + return; + } + } +#endif /* HAS_DHCPRA_MASK */ + + BRIDGE_UNLOCK(sc); + +#if defined(PFIL_HOOKS) + if (PFIL_HOOKED(&inet_pfil_hook) +#ifdef INET6 + || PFIL_HOOKED(&inet6_pfil_hook) +#endif + ) { + if (bridge_pfil(&m, ifp, dst_if, PFIL_OUT) != 0) + return; + if (m == NULL) + return; + } +#endif /* PFIL_HOOKS */ + + /* + * Clear any in-bound checksum flags for this packet. + */ + mbuf_inbound_modified(m); + + bridge_enqueue(sc, dst_if, m); + return; + +drop: + BRIDGE_UNLOCK(sc); + m_freem(m); +} + +#if BRIDGE_DEBUG + +char * ether_ntop(char *, size_t , const u_char *); + +__private_extern__ char * +ether_ntop(char *buf, size_t len, const u_char *ap) +{ + snprintf(buf, len, "%02x:%02x:%02x:%02x:%02x:%02x", + ap[0], ap[1], ap[2], ap[3], ap[4], ap[5]); + + return buf; +} + +#endif /* BRIDGE_DEBUG */ + +/* + * bridge_input: + * + * Filter input from a member interface. Queue the packet for + * bridging if it is not for us. + */ +__private_extern__ errno_t +bridge_input(struct ifnet *ifp, struct mbuf *m, __unused void *frame_header) +{ + struct bridge_softc *sc = ifp->if_bridge; + struct bridge_iflist *bif, *bif2; + struct ifnet *bifp; + struct ether_header *eh; + struct mbuf *mc, *mc2; + uint16_t vlan; + int error; + +#if BRIDGE_DEBUG + if (_if_brige_debug) + printf("bridge_input: %s%d from %s%d m %p data %p\n", + ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp), + ifnet_name(ifp), ifnet_unit(ifp), + m, mbuf_data(m)); +#endif /* BRIDGE_DEBUG */ + + if ((sc->sc_ifp->if_flags & IFF_RUNNING) == 0) { +#if BRIDGE_DEBUG + if (_if_brige_debug) + printf( "bridge_input: %s%d not running passing along\n", + ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp)); +#endif /* BRIDGE_DEBUG */ + return 0; + } + + bifp = sc->sc_ifp; + vlan = VLANTAGOF(m); + +#ifdef IFF_MONITOR + /* + * Implement support for bridge monitoring. If this flag has been + * set on this interface, discard the packet once we push it through + * the bpf(4) machinery, but before we do, increment the byte and + * packet counters associated with this interface. 
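+	 * Returning EJUSTRETURN stops any further input processing of
+	 * the mbuf, which has already been freed here.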
+ */
+	if ((bifp->if_flags & IFF_MONITOR) != 0) {
+		m->m_pkthdr.rcvif = bifp;
+		BRIDGE_BPF_MTAP_INPUT(sc, m);
+		(void) ifnet_stat_increment_in(bifp, 1, m->m_pkthdr.len, 0);
+		m_freem(m);
+		return EJUSTRETURN;
+	}
+#endif /* IFF_MONITOR */
+
+	/*
+	 * Need to clear the promiscuous flag otherwise the packet will be
+	 * dropped by DLIL after processing filters
+	 */
+	if ((mbuf_flags(m) & MBUF_PROMISC))
+		mbuf_setflags_mask(m, 0, MBUF_PROMISC);
+
+	BRIDGE_LOCK(sc);
+	bif = bridge_lookup_member_if(sc, ifp);
+	if (bif == NULL) {
+		BRIDGE_UNLOCK(sc);
+#if BRIDGE_DEBUG
+		if (_if_brige_debug)
+			printf( "bridge_input: %s%d bridge_lookup_member_if failed\n",
+				ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp));
+#endif /* BRIDGE_DEBUG */
+		return 0;
+	}
+
+	eh = mtod(m, struct ether_header *);
+
+	bridge_span(sc, m);
+
+	if (m->m_flags & (M_BCAST|M_MCAST)) {
+
+#if BRIDGE_DEBUG
+		if (_if_brige_debug)
+			if ((m->m_flags & M_MCAST))
+				printf("multicast: %02x:%02x:%02x:%02x:%02x:%02x\n",
+					eh->ether_dhost[0], eh->ether_dhost[1], eh->ether_dhost[2],
+					eh->ether_dhost[3], eh->ether_dhost[4], eh->ether_dhost[5]);
+
+#endif /* BRIDGE_DEBUG */
+
+		/* Tap off 802.1D packets; they do not get forwarded. */
+		if (memcmp(eh->ether_dhost, bstp_etheraddr,
+		    ETHER_ADDR_LEN) == 0) {
+			m = bstp_input(&bif->bif_stp, ifp, m);
+			if (m == NULL) {
+				BRIDGE_UNLOCK(sc);
+				return EJUSTRETURN;
+			}
+		}
+
+		if ((bif->bif_flags & IFBIF_STP) &&
+		    bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) {
+			BRIDGE_UNLOCK(sc);
+			return 0;
+		}
+
+		/*
+		 * Make a deep copy of the packet and enqueue the copy
+		 * for bridge processing; return the original packet for
+		 * local processing.
+		 */
+		mc = m_dup(m, M_DONTWAIT);
+		if (mc == NULL) {
+			BRIDGE_UNLOCK(sc);
+			return 0;
+		}
+
+		/*
+		 * Perform the bridge forwarding function with the copy.
+		 *
+		 * Note that bridge_forward calls BRIDGE_UNLOCK
+		 */
+		bridge_forward(sc, bif, mc);
+
+		/*
+		 * Reinject the mbuf as arriving on the bridge so we have a
+		 * chance at claiming multicast packets. We can not loop back
+		 * here from ether_input as a bridge is never a member of a
+		 * bridge.
+		 */
+		KASSERT(bifp->if_bridge == NULL,
+		    ("loop created in bridge_input"));
+		mc2 = m_dup(m, M_DONTWAIT);
+		if (mc2 != NULL) {
+			/* Keep the layer3 header aligned */
+			int i = min(mc2->m_pkthdr.len, max_protohdr);
+			mc2 = m_copyup(mc2, i, ETHER_ALIGN);
+		}
+		if (mc2 != NULL) {
+			/* Mark the packet as arriving on the bridge */
+			mc2->m_pkthdr.rcvif = bifp;
+			mc2->m_pkthdr.header = mbuf_data(mc2);
+
+#if NBPFILTER > 0
+			if (sc->sc_bpf_input)
+				bridge_bpf_input(bifp, mc2);
+#endif /* NBPFILTER */
+			(void) mbuf_setdata(mc2, (char *)mbuf_data(mc2) + ETHER_HDR_LEN,
+			    mbuf_len(mc2) - ETHER_HDR_LEN);
+			(void) mbuf_pkthdr_adjustlen(mc2, - ETHER_HDR_LEN);
+
+			(void) ifnet_stat_increment_in(bifp, 1, mbuf_pkthdr_len(mc2), 0);
+
+#if BRIDGE_DEBUG
+			if (_if_brige_debug)
+				printf( "bridge_input: %s%d mcast for us\n",
+					ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp));
+#endif /* BRIDGE_DEBUG */
+
+			dlil_input_packet_list(bifp, mc2);
+		}
+
+		/* Return the original packet for local processing.
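+		 * Returning 0 hands it back to the caller for normal
+		 * input processing.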
*/ + return 0; + } + + if ((bif->bif_flags & IFBIF_STP) && + bif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) { + BRIDGE_UNLOCK(sc); + return 0; + } + +#ifdef DEV_CARP +# define OR_CARP_CHECK_WE_ARE_DST(iface) \ + || ((iface)->if_carp \ + && carp_forus((iface)->if_carp, eh->ether_dhost)) +# define OR_CARP_CHECK_WE_ARE_SRC(iface) \ + || ((iface)->if_carp \ + && carp_forus((iface)->if_carp, eh->ether_shost)) +#else +# define OR_CARP_CHECK_WE_ARE_DST(iface) +# define OR_CARP_CHECK_WE_ARE_SRC(iface) +#endif + +#ifdef INET6 +# define OR_PFIL_HOOKED_INET6 \ + || PFIL_HOOKED(&inet6_pfil_hook) +#else +# define OR_PFIL_HOOKED_INET6 +#endif + +#if defined(PFIL_HOOKS) +#define PFIL_PHYS(sc, ifp, m) do { \ + if (pfil_local_phys && \ + (PFIL_HOOKED(&inet_pfil_hook) \ + OR_PFIL_HOOKED_INET6)) { \ + if (bridge_pfil(&m, NULL, ifp, \ + PFIL_IN) != 0 || m == NULL) { \ + BRIDGE_UNLOCK(sc); \ + return (NULL); \ + } \ + } \ + } while (0) +#else /* PFIL_HOOKS */ +#define PFIL_PHYS(sc, ifp, m) +#endif /* PFIL_HOOKS */ + +#define GRAB_OUR_PACKETS(iface) \ + if ((iface)->if_type == IFT_GIF) \ + continue; \ + /* It is destined for us. */ \ + if (memcmp(ifnet_lladdr((iface)), eh->ether_dhost, ETHER_ADDR_LEN) == 0 \ + OR_CARP_CHECK_WE_ARE_DST((iface)) \ + ) { \ + if ((iface)->if_type == IFT_BRIDGE) { \ + BRIDGE_BPF_MTAP_INPUT(sc, m); \ + /* Filter on the physical interface. */ \ + PFIL_PHYS(sc, iface, m); \ + } \ + if (bif->bif_flags & IFBIF_LEARNING) { \ + error = bridge_rtupdate(sc, eh->ether_shost, \ + vlan, bif, 0, IFBAF_DYNAMIC); \ + if (error && bif->bif_addrmax) { \ + BRIDGE_UNLOCK(sc); \ + return EJUSTRETURN; \ + } \ + } \ + m->m_pkthdr.rcvif = iface; \ + BRIDGE_UNLOCK(sc); \ + return 0; \ + } \ + \ + /* We just received a packet that we sent out. */ \ + if (memcmp(ifnet_lladdr((iface)), eh->ether_shost, ETHER_ADDR_LEN) == 0 \ + OR_CARP_CHECK_WE_ARE_SRC((iface)) \ + ) { \ + BRIDGE_UNLOCK(sc); \ + return EJUSTRETURN; \ + } + + /* + * Unicast. + */ + /* + * If the packet is for us, set the packets source as the + * bridge, and return the packet back to ether_input for + * local processing. + */ + if (memcmp(eh->ether_dhost, ifnet_lladdr(bifp), + ETHER_ADDR_LEN) == 0 + OR_CARP_CHECK_WE_ARE_DST(bifp)) { + + /* Mark the packet as arriving on the bridge interface */ + (void) mbuf_pkthdr_setrcvif(m, bifp); + mbuf_pkthdr_setheader(m, frame_header); + + /* + * If the interface is learning, and the source + * address is valid and not multicast, record + * the address. + */ + if ((bif->bif_flags & IFBIF_LEARNING) != 0 && + ETHER_IS_MULTICAST(eh->ether_shost) == 0 && + (eh->ether_shost[0] | eh->ether_shost[1] | + eh->ether_shost[2] | eh->ether_shost[3] | + eh->ether_shost[4] | eh->ether_shost[5]) != 0) { + (void) bridge_rtupdate(sc, eh->ether_shost, + vlan, bif, 0, IFBAF_DYNAMIC); + } + + BRIDGE_BPF_MTAP_INPUT(sc, m); + + (void) mbuf_setdata(m, (char *)mbuf_data(m) + ETHER_HDR_LEN, mbuf_len(m) - ETHER_HDR_LEN); + (void) mbuf_pkthdr_adjustlen(m, - ETHER_HDR_LEN); + + (void) ifnet_stat_increment_in(bifp, 1, mbuf_pkthdr_len(m), 0); + + BRIDGE_UNLOCK(sc); + +#if BRIDGE_DEBUG + if (_if_brige_debug) + printf( "bridge_input: %s%d packet for bridge\n", + ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp)); +#endif /* BRIDGE_DEBUG */ + + dlil_input_packet_list(bifp, m); + + return EJUSTRETURN; + } + + /* + * if the destination of the packet is for the MAC address of + * the member interface itself, then we don't need to forward + * it -- just pass it back. 
Note that it'll likely just be + * dropped by the stack, but if something else is bound to + * the interface directly (for example, the wireless stats + * protocol -- although that actually uses BPF right now), + * then it will consume the packet + * + * ALSO, note that we do this check AFTER checking for the + * bridge's own MAC address, because the bridge may be + * using the SAME MAC address as one of its interfaces + */ + if (memcmp(eh->ether_dhost, ifnet_lladdr(ifp), + ETHER_ADDR_LEN) == 0) { + +#ifdef VERY_VERY_VERY_DIAGNOSTIC + printf("bridge_input: not forwarding packet bound for member interface\n" ); +#endif + BRIDGE_UNLOCK(sc); + return 0; + } + + /* Now check the all bridge members. */ + TAILQ_FOREACH(bif2, &sc->sc_iflist, bif_next) { + GRAB_OUR_PACKETS(bif2->bif_ifp) + } + +#undef OR_CARP_CHECK_WE_ARE_DST +#undef OR_CARP_CHECK_WE_ARE_SRC +#undef OR_PFIL_HOOKED_INET6 +#undef GRAB_OUR_PACKETS + + /* + * Perform the bridge forwarding function. + * + * Note that bridge_forward calls BRIDGE_UNLOCK + */ + bridge_forward(sc, bif, m); + + return EJUSTRETURN; +} + +/* + * bridge_broadcast: + * + * Send a frame to all interfaces that are members of + * the bridge, except for the one on which the packet + * arrived. + * + * NOTE: Releases the lock on return. + */ +static void +bridge_broadcast(struct bridge_softc *sc, struct ifnet *src_if, + struct mbuf *m, int runfilt) +{ +#ifndef PFIL_HOOKS +#pragma unused(runfilt) +#endif + struct bridge_iflist *dbif, *sbif; + struct mbuf *mc; + struct ifnet *dst_if; + int error = 0, used = 0; + + sbif = bridge_lookup_member_if(sc, src_if); + + BRIDGE_LOCK2REF(sc, error); + if (error) { + m_freem(m); + return; + } + +#ifdef PFIL_HOOKS + /* Filter on the bridge interface before broadcasting */ + if (runfilt && (PFIL_HOOKED(&inet_pfil_hook) +#ifdef INET6 + || PFIL_HOOKED(&inet6_pfil_hook) +#endif /* INET6 */ + )) { + if (bridge_pfil(&m, sc->sc_ifp, NULL, PFIL_OUT) != 0) + goto out; + if (m == NULL) + goto out; + } +#endif /* PFIL_HOOKS */ + + TAILQ_FOREACH(dbif, &sc->sc_iflist, bif_next) { + dst_if = dbif->bif_ifp; + if (dst_if == src_if) + continue; + + /* Private segments can not talk to each other */ + if (sbif && (sbif->bif_flags & dbif->bif_flags & IFBIF_PRIVATE)) + continue; + + if ((dbif->bif_flags & IFBIF_STP) && + dbif->bif_stp.bp_state == BSTP_IFSTATE_DISCARDING) + continue; + + if ((dbif->bif_flags & IFBIF_DISCOVER) == 0 && + (m->m_flags & (M_BCAST|M_MCAST)) == 0) + continue; + + if ((dst_if->if_flags & IFF_RUNNING) == 0) + continue; + + if (TAILQ_NEXT(dbif, bif_next) == NULL) { + mc = m; + used = 1; + } else { + mc = m_dup(m, M_DONTWAIT); + if (mc == NULL) { + (void) ifnet_stat_increment_out(sc->sc_ifp, 0, 0, 1); + continue; + } + } + +#ifdef PFIL_HOOKS + /* + * Filter on the output interface. Pass a NULL bridge interface + * pointer so we do not redundantly filter on the bridge for + * each interface we broadcast on. 
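+		 * The bridge-level pass already happened once, before
+		 * this member loop started.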
+ */ + if (runfilt && (PFIL_HOOKED(&inet_pfil_hook) +#ifdef INET6 + || PFIL_HOOKED(&inet6_pfil_hook) +#endif + )) { + if (used == 0) { + /* Keep the layer3 header aligned */ + int i = min(mc->m_pkthdr.len, max_protohdr); + mc = m_copyup(mc, i, ETHER_ALIGN); + if (mc == NULL) { + (void) ifnet_stat_increment_out(sc->sc_ifp, 0, 0, 1); + continue; + } + } + if (bridge_pfil(&mc, NULL, dst_if, PFIL_OUT) != 0) + continue; + if (mc == NULL) + continue; + } +#endif /* PFIL_HOOKS */ + + bridge_enqueue(sc, dst_if, mc); + } + if (used == 0) + m_freem(m); + +#ifdef PFIL_HOOKS +out: +#endif /* PFIL_HOOKS */ + + BRIDGE_UNREF(sc); +} + +/* + * bridge_span: + * + * Duplicate a packet out one or more interfaces that are in span mode, + * the original mbuf is unmodified. + */ +static void +bridge_span(struct bridge_softc *sc, struct mbuf *m) +{ + struct bridge_iflist *bif; + struct ifnet *dst_if; + struct mbuf *mc; + + if (TAILQ_EMPTY(&sc->sc_spanlist)) + return; + + TAILQ_FOREACH(bif, &sc->sc_spanlist, bif_next) { + dst_if = bif->bif_ifp; + + if ((dst_if->if_flags & IFF_RUNNING) == 0) + continue; + + mc = m_copypacket(m, M_DONTWAIT); + if (mc == NULL) { + (void) ifnet_stat_increment_out(sc->sc_ifp, 0, 0, 1); + continue; + } + + bridge_enqueue(sc, dst_if, mc); + } +} + + + +/* + * bridge_rtupdate: + * + * Add a bridge routing entry. + */ +static int +bridge_rtupdate(struct bridge_softc *sc, const uint8_t *dst, uint16_t vlan, + struct bridge_iflist *bif, int setflags, uint8_t flags) +{ + struct bridge_rtnode *brt; + int error; + + BRIDGE_LOCK_ASSERT(sc); + + /* Check the source address is valid and not multicast. */ + if (ETHER_IS_MULTICAST(dst) || + (dst[0] == 0 && dst[1] == 0 && dst[2] == 0 && + dst[3] == 0 && dst[4] == 0 && dst[5] == 0) != 0) + return (EINVAL); + + + /* 802.1p frames map to vlan 1 */ + if (vlan == 0) + vlan = 1; + + /* + * A route for this destination might already exist. If so, + * update it, otherwise create a new one. + */ + if ((brt = bridge_rtnode_lookup(sc, dst, vlan)) == NULL) { + if (sc->sc_brtcnt >= sc->sc_brtmax) { + sc->sc_brtexceeded++; + return (ENOSPC); + } + /* Check per interface address limits (if enabled) */ + if (bif->bif_addrmax && bif->bif_addrcnt >= bif->bif_addrmax) { + bif->bif_addrexceeded++; + return (ENOSPC); + } + + /* + * Allocate a new bridge forwarding node, and + * initialize the expiration time and Ethernet + * address. + */ + brt = zalloc_noblock(bridge_rtnode_pool); + if (brt == NULL) + return (ENOMEM); + + if (bif->bif_flags & IFBIF_STICKY) + brt->brt_flags = IFBAF_STICKY; + else + brt->brt_flags = IFBAF_DYNAMIC; + + memcpy(brt->brt_addr, dst, ETHER_ADDR_LEN); + brt->brt_vlan = vlan; + + + if ((error = bridge_rtnode_insert(sc, brt)) != 0) { + zfree(bridge_rtnode_pool, brt); + return (error); + } + brt->brt_dst = bif; + bif->bif_addrcnt++; + } + + if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC && + brt->brt_dst != bif) { + brt->brt_dst->bif_addrcnt--; + brt->brt_dst = bif; + brt->brt_dst->bif_addrcnt++; + } + + if ((flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) { + struct timespec now; + + nanouptime(&now); + brt->brt_expire = now.tv_sec + sc->sc_brttimeout; + } + if (setflags) + brt->brt_flags = flags; + + + return (0); +} + +/* + * bridge_rtlookup: + * + * Lookup the destination interface for an address. 
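+ *
+ *	Returns the member ifnet the address was learned on, or NULL when
+ *	the address is unknown; a NULL result is what makes callers fall
+ *	back to flooding. A minimal sketch of the pattern (as used by
+ *	bridge_forward above):
+ *
+ *		dst_if = bridge_rtlookup(sc, eh->ether_dhost, vlan);
+ *		if (dst_if == NULL)
+ *			bridge_broadcast(sc, src_if, m, 1);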
+ */
+static struct ifnet *
+bridge_rtlookup(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan)
+{
+	struct bridge_rtnode *brt;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	if ((brt = bridge_rtnode_lookup(sc, addr, vlan)) == NULL)
+		return (NULL);
+
+	return (brt->brt_ifp);
+}
+
+/*
+ * bridge_rttrim:
+ *
+ *	Trim the routing table so that we have a number
+ *	of routing entries less than or equal to the
+ *	maximum number.
+ */
+static void
+bridge_rttrim(struct bridge_softc *sc)
+{
+	struct bridge_rtnode *brt, *nbrt;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	/* Make sure we actually need to do this. */
+	if (sc->sc_brtcnt <= sc->sc_brtmax)
+		return;
+
+	/* Force an aging cycle; this might trim enough addresses. */
+	bridge_rtage(sc);
+	if (sc->sc_brtcnt <= sc->sc_brtmax)
+		return;
+
+	LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) {
+		if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) {
+			bridge_rtnode_destroy(sc, brt);
+			if (sc->sc_brtcnt <= sc->sc_brtmax)
+				return;
+		}
+	}
+}
+
+/*
+ * bridge_timer:
+ *
+ *	Aging timer for the bridge.
+ */
+static void
+bridge_timer(void *arg)
+{
+	struct bridge_softc *sc = arg;
+
+	BRIDGE_LOCK(sc);
+
+	bridge_rtage(sc);
+
+	BRIDGE_UNLOCK(sc);
+
+	if (sc->sc_ifp->if_flags & IFF_RUNNING) {
+		struct timespec ts;
+
+		ts.tv_sec = bridge_rtable_prune_period;
+		ts.tv_nsec = 0;
+		bsd_timeout(bridge_timer, sc, &ts);
+	}
+}
+
+/*
+ * bridge_rtage:
+ *
+ *	Perform an aging cycle.
+ */
+static void
+bridge_rtage(struct bridge_softc *sc)
+{
+	struct bridge_rtnode *brt, *nbrt;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) {
+		if ((brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) {
+			struct timespec now;
+
+			nanouptime(&now);
+			if ((unsigned long)now.tv_sec >= brt->brt_expire)
+				bridge_rtnode_destroy(sc, brt);
+		}
+	}
+}
+
+/*
+ * bridge_rtflush:
+ *
+ *	Remove all dynamic addresses from the bridge.
+ */
+static void
+bridge_rtflush(struct bridge_softc *sc, int full)
+{
+	struct bridge_rtnode *brt, *nbrt;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) {
+		if (full || (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC)
+			bridge_rtnode_destroy(sc, brt);
+	}
+}
+
+/*
+ * bridge_rtdaddr:
+ *
+ *	Remove an address from the table.
+ */
+static int
+bridge_rtdaddr(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan)
+{
+	struct bridge_rtnode *brt;
+	int found = 0;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	/*
+	 * If vlan is zero then we want to delete for all vlans so the lookup
+	 * may return more than one.
+	 */
+	while ((brt = bridge_rtnode_lookup(sc, addr, vlan)) != NULL) {
+		bridge_rtnode_destroy(sc, brt);
+		found = 1;
+	}
+
+	return (found ? 0 : ENOENT);
+}
+
+/*
+ * bridge_rtdelete:
+ *
+ *	Delete routes to a specific member interface.
+ */
+static void
+bridge_rtdelete(struct bridge_softc *sc, struct ifnet *ifp, int full)
+{
+	struct bridge_rtnode *brt, *nbrt;
+
+	BRIDGE_LOCK_ASSERT(sc);
+
+	LIST_FOREACH_SAFE(brt, &sc->sc_rtlist, brt_list, nbrt) {
+		if (brt->brt_ifp == ifp && (full ||
+		    (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC))
+			bridge_rtnode_destroy(sc, brt);
+	}
+}
+
+/*
+ * bridge_rtable_init:
+ *
+ *	Initialize the route table for this bridge.
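+ *
+ *	The table consists of BRIDGE_RTHASH_SIZE LIST-head buckets keyed
+ *	by a randomized hash of the Ethernet address (bridge_rthash
+ *	below), plus the single sc_rtlist that the aging and flush
+ *	routines walk.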
+ */ +static int +bridge_rtable_init(struct bridge_softc *sc) +{ + int i; + + sc->sc_rthash = _MALLOC(sizeof(*sc->sc_rthash) * BRIDGE_RTHASH_SIZE, + M_DEVBUF, M_NOWAIT); + if (sc->sc_rthash == NULL) + return (ENOMEM); + + for (i = 0; i < BRIDGE_RTHASH_SIZE; i++) + LIST_INIT(&sc->sc_rthash[i]); + + sc->sc_rthash_key = random(); + + LIST_INIT(&sc->sc_rtlist); + + return (0); +} + +/* + * bridge_rtable_fini: + * + * Deconstruct the route table for this bridge. + */ +static void +bridge_rtable_fini(struct bridge_softc *sc) +{ + + KASSERT(sc->sc_brtcnt == 0, + ("%s: %d bridge routes referenced", __func__, sc->sc_brtcnt)); + _FREE(sc->sc_rthash, M_DEVBUF); +} + +/* + * The following hash function is adapted from "Hash Functions" by Bob Jenkins + * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). + */ +#define mix(a, b, c) \ +do { \ + a -= b; a -= c; a ^= (c >> 13); \ + b -= c; b -= a; b ^= (a << 8); \ + c -= a; c -= b; c ^= (b >> 13); \ + a -= b; a -= c; a ^= (c >> 12); \ + b -= c; b -= a; b ^= (a << 16); \ + c -= a; c -= b; c ^= (b >> 5); \ + a -= b; a -= c; a ^= (c >> 3); \ + b -= c; b -= a; b ^= (a << 10); \ + c -= a; c -= b; c ^= (b >> 15); \ +} while (/*CONSTCOND*/0) + +static __inline uint32_t +bridge_rthash(struct bridge_softc *sc, const uint8_t *addr) +{ + uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = sc->sc_rthash_key; + + b += addr[5] << 8; + b += addr[4]; + a += addr[3] << 24; + a += addr[2] << 16; + a += addr[1] << 8; + a += addr[0]; + + mix(a, b, c); + + return (c & BRIDGE_RTHASH_MASK); +} + +#undef mix + +static int +bridge_rtnode_addr_cmp(const uint8_t *a, const uint8_t *b) +{ + int i, d; + + for (i = 0, d = 0; i < ETHER_ADDR_LEN && d == 0; i++) { + d = ((int)a[i]) - ((int)b[i]); + } + + return (d); +} + +/* + * bridge_rtnode_lookup: + * + * Look up a bridge route node for the specified destination. Compare the + * vlan id or if zero then just return the first match. + */ +static struct bridge_rtnode * +bridge_rtnode_lookup(struct bridge_softc *sc, const uint8_t *addr, uint16_t vlan) +{ + struct bridge_rtnode *brt; + uint32_t hash; + int dir; + + BRIDGE_LOCK_ASSERT(sc); + + hash = bridge_rthash(sc, addr); + LIST_FOREACH(brt, &sc->sc_rthash[hash], brt_hash) { + dir = bridge_rtnode_addr_cmp(addr, brt->brt_addr); + if (dir == 0 && (brt->brt_vlan == vlan || vlan == 0)) + return (brt); + if (dir > 0) + return (NULL); + } + + return (NULL); +} + +/* + * bridge_rtnode_insert: + * + * Insert the specified bridge node into the route table. We + * assume the entry is not already in the table. + */ +static int +bridge_rtnode_insert(struct bridge_softc *sc, struct bridge_rtnode *brt) +{ + struct bridge_rtnode *lbrt; + uint32_t hash; + int dir; + + BRIDGE_LOCK_ASSERT(sc); + + hash = bridge_rthash(sc, brt->brt_addr); + + lbrt = LIST_FIRST(&sc->sc_rthash[hash]); + if (lbrt == NULL) { + LIST_INSERT_HEAD(&sc->sc_rthash[hash], brt, brt_hash); + goto out; + } + + do { + dir = bridge_rtnode_addr_cmp(brt->brt_addr, lbrt->brt_addr); + if (dir == 0 && brt->brt_vlan == lbrt->brt_vlan) + return (EEXIST); + if (dir > 0) { + LIST_INSERT_BEFORE(lbrt, brt, brt_hash); + goto out; + } + if (LIST_NEXT(lbrt, brt_hash) == NULL) { + LIST_INSERT_AFTER(lbrt, brt, brt_hash); + goto out; + } + lbrt = LIST_NEXT(lbrt, brt_hash); + } while (lbrt != NULL); + +#ifdef DIAGNOSTIC + panic("bridge_rtnode_insert: impossible"); +#endif + +out: + LIST_INSERT_HEAD(&sc->sc_rtlist, brt, brt_list); + sc->sc_brtcnt++; + + return (0); +} + +/* + * bridge_rtnode_destroy: + * + * Destroy a bridge rtnode. 
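+ *
+ *	The node is unlinked from both its hash chain and the aging
+ *	list, the owning member's address count is dropped, and the
+ *	node is returned to bridge_rtnode_pool.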
+ */ +static void +bridge_rtnode_destroy(struct bridge_softc *sc, struct bridge_rtnode *brt) +{ + BRIDGE_LOCK_ASSERT(sc); + + LIST_REMOVE(brt, brt_hash); + + LIST_REMOVE(brt, brt_list); + sc->sc_brtcnt--; + brt->brt_dst->bif_addrcnt--; + zfree(bridge_rtnode_pool, brt); +} + +/* + * bridge_rtable_expire: + * + * Set the expiry time for all routes on an interface. + */ +static void +bridge_rtable_expire(struct ifnet *ifp, int age) +{ + struct bridge_softc *sc = ifp->if_bridge; + struct bridge_rtnode *brt; + + BRIDGE_LOCK(sc); + + /* + * If the age is zero then flush, otherwise set all the expiry times to + * age for the interface + */ + if (age == 0) + bridge_rtdelete(sc, ifp, IFBF_FLUSHDYN); + else { + LIST_FOREACH(brt, &sc->sc_rtlist, brt_list) { + struct timespec now; + + nanouptime(&now); + /* Cap the expiry time to 'age' */ + if (brt->brt_ifp == ifp && + brt->brt_expire > (unsigned long)now.tv_sec + age && + (brt->brt_flags & IFBAF_TYPEMASK) == IFBAF_DYNAMIC) + brt->brt_expire = (unsigned long)now.tv_sec + age; + } + } + BRIDGE_UNLOCK(sc); +} + +/* + * bridge_state_change: + * + * Callback from the bridgestp code when a port changes states. + */ +static void +bridge_state_change(struct ifnet *ifp, int state) +{ + struct bridge_softc *sc = ifp->if_bridge; + static const char *stpstates[] = { + "disabled", + "listening", + "learning", + "forwarding", + "blocking", + "discarding" + }; + + if (log_stp) + log(LOG_NOTICE, "%s%d: state changed to %s on %s%d\n", + ifnet_name(sc->sc_ifp), ifnet_unit(sc->sc_ifp), + stpstates[state], + ifnet_name(ifp), ifnet_unit(ifp)); +} + +#ifdef PFIL_HOOKS +/* + * Send bridge packets through pfil if they are one of the types pfil can deal + * with, or if they are ARP or REVARP. (pfil will pass ARP and REVARP without + * question.) If *bifp or *ifp are NULL then packet filtering is skipped for + * that interface. + */ +static int +bridge_pfil(struct mbuf **mp, struct ifnet *bifp, struct ifnet *ifp, int dir) +{ + int snap, error, i, hlen; + struct ether_header *eh1, eh2; + struct ip_fw_args args; + struct ip *ip; + struct llc llc1; + u_int16_t ether_type; + + snap = 0; + error = -1; /* Default error if not error == 0 */ + +#if 0 + /* we may return with the IP fields swapped, ensure its not shared */ + KASSERT(M_WRITABLE(*mp), ("%s: modifying a shared mbuf", __func__)); +#endif + + if (pfil_bridge == 0 && pfil_member == 0 && pfil_ipfw == 0) + return (0); /* filtering is disabled */ + + i = min((*mp)->m_pkthdr.len, max_protohdr); + if ((*mp)->m_len < i) { + *mp = m_pullup(*mp, i); + if (*mp == NULL) { + printf("%s: m_pullup failed\n", __func__); + return (-1); + } + } + + eh1 = mtod(*mp, struct ether_header *); + ether_type = ntohs(eh1->ether_type); + + /* + * Check for SNAP/LLC. + */ + if (ether_type < ETHERMTU) { + struct llc *llc2 = (struct llc *)(eh1 + 1); + + if ((*mp)->m_len >= ETHER_HDR_LEN + 8 && + llc2->llc_dsap == LLC_SNAP_LSAP && + llc2->llc_ssap == LLC_SNAP_LSAP && + llc2->llc_control == LLC_UI) { + ether_type = htons(llc2->llc_un.type_snap.ether_type); + snap = 1; + } + } + + /* + * If we're trying to filter bridge traffic, don't look at anything + * other than IP and ARP traffic. If the filter doesn't understand + * IPv6, don't allow IPv6 through the bridge either. This is lame + * since if we really wanted, say, an AppleTalk filter, we are hosed, + * but of course we don't have an AppleTalk filter to begin with. + * (Note that since pfil doesn't understand ARP it will pass *ALL* + * ARP traffic.) 
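+ * The ethertype switch below implements exactly this policy; anything
+ * other than IP, IPv6, ARP and REVARP is dropped when pfil_onlyip is
+ * set.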
+ */ + switch (ether_type) { + case ETHERTYPE_ARP: + case ETHERTYPE_REVARP: + if (pfil_ipfw_arp == 0) + return (0); /* Automatically pass */ + break; + + case ETHERTYPE_IP: +#ifdef INET6 + case ETHERTYPE_IPV6: +#endif /* INET6 */ + break; + default: + /* + * Check to see if the user wants to pass non-ip + * packets, these will not be checked by pfil(9) and + * passed unconditionally so the default is to drop. + */ + if (pfil_onlyip) + goto bad; + } + + /* Strip off the Ethernet header and keep a copy. */ + m_copydata(*mp, 0, ETHER_HDR_LEN, (caddr_t) &eh2); + m_adj(*mp, ETHER_HDR_LEN); + + /* Strip off snap header, if present */ + if (snap) { + m_copydata(*mp, 0, sizeof(struct llc), (caddr_t) &llc1); + m_adj(*mp, sizeof(struct llc)); + } + + /* + * Check the IP header for alignment and errors + */ + if (dir == PFIL_IN) { + switch (ether_type) { + case ETHERTYPE_IP: + error = bridge_ip_checkbasic(mp); + break; +#ifdef INET6 + case ETHERTYPE_IPV6: + error = bridge_ip6_checkbasic(mp); + break; +#endif /* INET6 */ + default: + error = 0; + } + if (error) + goto bad; + } + + if (IPFW_LOADED && pfil_ipfw != 0 && dir == PFIL_OUT && ifp != NULL) { + error = -1; + args.rule = ip_dn_claim_rule(*mp); + if (args.rule != NULL && fw_one_pass) + goto ipfwpass; /* packet already partially processed */ + + args.m = *mp; + args.oif = ifp; + args.next_hop = NULL; + args.eh = &eh2; + args.inp = NULL; /* used by ipfw uid/gid/jail rules */ + i = ip_fw_chk_ptr(&args); + *mp = args.m; + + if (*mp == NULL) + return (error); + + if (DUMMYNET_LOADED && (i == IP_FW_DUMMYNET)) { + + /* put the Ethernet header back on */ + M_PREPEND(*mp, ETHER_HDR_LEN, M_DONTWAIT); + if (*mp == NULL) + return (error); + bcopy(&eh2, mtod(*mp, caddr_t), ETHER_HDR_LEN); + + /* + * Pass the pkt to dummynet, which consumes it. The + * packet will return to us via bridge_dummynet(). + */ + args.oif = ifp; + ip_dn_io_ptr(mp, DN_TO_IFB_FWD, &args); + return (error); + } + + if (i != IP_FW_PASS) /* drop */ + goto bad; + } + +ipfwpass: + error = 0; + + /* + * Run the packet through pfil + */ + switch (ether_type) { + case ETHERTYPE_IP: + /* + * before calling the firewall, swap fields the same as + * IP does. here we assume the header is contiguous + */ + ip = mtod(*mp, struct ip *); + + ip->ip_len = ntohs(ip->ip_len); + ip->ip_off = ntohs(ip->ip_off); + + /* + * Run pfil on the member interface and the bridge, both can + * be skipped by clearing pfil_member or pfil_bridge. 
+ * + * Keep the order: + * in_if -> bridge_if -> out_if + */ + if (pfil_bridge && dir == PFIL_OUT && bifp != NULL) + error = pfil_run_hooks(&inet_pfil_hook, mp, bifp, + dir, NULL); + + if (*mp == NULL || error != 0) /* filter may consume */ + break; + + if (pfil_member && ifp != NULL) + error = pfil_run_hooks(&inet_pfil_hook, mp, ifp, + dir, NULL); + + if (*mp == NULL || error != 0) /* filter may consume */ + break; + + if (pfil_bridge && dir == PFIL_IN && bifp != NULL) + error = pfil_run_hooks(&inet_pfil_hook, mp, bifp, + dir, NULL); + + if (*mp == NULL || error != 0) /* filter may consume */ + break; + + /* check if we need to fragment the packet */ + if (pfil_member && ifp != NULL && dir == PFIL_OUT) { + i = (*mp)->m_pkthdr.len; + if (i > ifp->if_mtu) { + error = bridge_fragment(ifp, *mp, &eh2, snap, + &llc1); + return (error); + } + } + + /* Recalculate the ip checksum and restore byte ordering */ + ip = mtod(*mp, struct ip *); + hlen = ip->ip_hl << 2; + if (hlen < sizeof(struct ip)) + goto bad; + if (hlen > (*mp)->m_len) { + if ((*mp = m_pullup(*mp, hlen)) == 0) + goto bad; + ip = mtod(*mp, struct ip *); + if (ip == NULL) + goto bad; + } + ip->ip_len = htons(ip->ip_len); + ip->ip_off = htons(ip->ip_off); + ip->ip_sum = 0; + if (hlen == sizeof(struct ip)) + ip->ip_sum = in_cksum_hdr(ip); + else + ip->ip_sum = in_cksum(*mp, hlen); + + break; +#ifdef INET6 + case ETHERTYPE_IPV6: + if (pfil_bridge && dir == PFIL_OUT && bifp != NULL) + error = pfil_run_hooks(&inet6_pfil_hook, mp, bifp, + dir, NULL); + + if (*mp == NULL || error != 0) /* filter may consume */ + break; + + if (pfil_member && ifp != NULL) + error = pfil_run_hooks(&inet6_pfil_hook, mp, ifp, + dir, NULL); + + if (*mp == NULL || error != 0) /* filter may consume */ + break; + + if (pfil_bridge && dir == PFIL_IN && bifp != NULL) + error = pfil_run_hooks(&inet6_pfil_hook, mp, bifp, + dir, NULL); + break; +#endif + default: + error = 0; + break; + } + + if (*mp == NULL) + return (error); + if (error != 0) + goto bad; + + error = -1; + + /* + * Finally, put everything back the way it was and return + */ + if (snap) { + M_PREPEND(*mp, sizeof(struct llc), M_DONTWAIT); + if (*mp == NULL) + return (error); + bcopy(&llc1, mtod(*mp, caddr_t), sizeof(struct llc)); + } + + M_PREPEND(*mp, ETHER_HDR_LEN, M_DONTWAIT); + if (*mp == NULL) + return (error); + bcopy(&eh2, mtod(*mp, caddr_t), ETHER_HDR_LEN); + + return (0); + +bad: + m_freem(*mp); + *mp = NULL; + return (error); +} + + +/* + * Perform basic checks on header size since + * pfil assumes ip_input has already processed + * it for it. Cut-and-pasted from ip_input.c. + * Given how simple the IPv6 version is, + * does the IPv4 version really need to be + * this complicated? + * + * XXX Should we update ipstat here, or not? + * XXX Right now we update ipstat but not + * XXX csum_counter. 
+ */ +static int +bridge_ip_checkbasic(struct mbuf **mp) +{ + struct mbuf *m = *mp; + struct ip *ip; + int len, hlen; + u_short sum; + + if (*mp == NULL) + return (-1); + + if (IP_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) { + if ((m = m_copyup(m, sizeof(struct ip), + (max_linkhdr + 3) & ~3)) == NULL) { + /* XXXJRT new stat, please */ + ipstat.ips_toosmall++; + goto bad; + } + } else if (__predict_false(m->m_len < sizeof (struct ip))) { + if ((m = m_pullup(m, sizeof (struct ip))) == NULL) { + ipstat.ips_toosmall++; + goto bad; + } + } + ip = mtod(m, struct ip *); + if (ip == NULL) goto bad; + + if (ip->ip_v != IPVERSION) { + ipstat.ips_badvers++; + goto bad; + } + hlen = ip->ip_hl << 2; + if (hlen < sizeof(struct ip)) { /* minimum header length */ + ipstat.ips_badhlen++; + goto bad; + } + if (hlen > m->m_len) { + if ((m = m_pullup(m, hlen)) == 0) { + ipstat.ips_badhlen++; + goto bad; + } + ip = mtod(m, struct ip *); + if (ip == NULL) goto bad; + } + + if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { + sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); + } else { + if (hlen == sizeof(struct ip)) { + sum = in_cksum_hdr(ip); + } else { + sum = in_cksum(m, hlen); + } + } + if (sum) { + ipstat.ips_badsum++; + goto bad; + } + + /* Retrieve the packet length. */ + len = ntohs(ip->ip_len); + + /* + * Check for additional length bogosity + */ + if (len < hlen) { + ipstat.ips_badlen++; + goto bad; + } + + /* + * Check that the amount of data in the buffers + * is at least as much as the IP header would have us expect. + * Drop packet if shorter than we expect. + */ + if (m->m_pkthdr.len < len) { + ipstat.ips_tooshort++; + goto bad; + } + + /* Checks out, proceed */ + *mp = m; + return (0); + +bad: + *mp = m; + return (-1); +} + +#ifdef INET6 +/* + * Same as above, but for IPv6. + * Cut-and-pasted from ip6_input.c. + * XXX Should we update ip6stat, or not? + */ +static int +bridge_ip6_checkbasic(struct mbuf **mp) +{ + struct mbuf *m = *mp; + struct ip6_hdr *ip6; + + /* + * If the IPv6 header is not aligned, slurp it up into a new + * mbuf with space for link headers, in the event we forward + * it. Otherwise, if it is aligned, make sure the entire base + * IPv6 header is in the first mbuf of the chain. + */ + if (IP6_HDR_ALIGNED_P(mtod(m, caddr_t)) == 0) { + struct ifnet *inifp = m->m_pkthdr.rcvif; + if ((m = m_copyup(m, sizeof(struct ip6_hdr), + (max_linkhdr + 3) & ~3)) == NULL) { + /* XXXJRT new stat, please */ + ip6stat.ip6s_toosmall++; + in6_ifstat_inc(inifp, ifs6_in_hdrerr); + goto bad; + } + } else if (__predict_false(m->m_len < sizeof(struct ip6_hdr))) { + struct ifnet *inifp = m->m_pkthdr.rcvif; + if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { + ip6stat.ip6s_toosmall++; + in6_ifstat_inc(inifp, ifs6_in_hdrerr); + goto bad; + } + } + + ip6 = mtod(m, struct ip6_hdr *); + + if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { + ip6stat.ip6s_badvers++; + in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); + goto bad; + } + + /* Checks out, proceed */ + *mp = m; + return (0); + +bad: + *mp = m; + return (-1); +} +#endif /* INET6 */ + +/* + * bridge_fragment: + * + * Return a fragmented mbuf chain.
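+ * + * (The chain is produced by ip_fragment(); the saved Ethernet header, + * and the SNAP header if one was present, are re-prepended to each + * fragment in the loop below.)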
+ */ +static int +bridge_fragment(struct ifnet *ifp, struct mbuf *m, struct ether_header *eh, + int snap, struct llc *llc) +{ + struct mbuf *m0; + struct ip *ip; + int error = -1; + + if (m->m_len < sizeof(struct ip) && + (m = m_pullup(m, sizeof(struct ip))) == NULL) + goto out; + ip = mtod(m, struct ip *); + + error = ip_fragment(ip, &m, ifp->if_mtu, ifp->if_hwassist, + CSUM_DELAY_IP); + if (error) + goto out; + + /* walk the chain and re-add the Ethernet header */ + for (m0 = m; m0; m0 = m0->m_nextpkt) { + if (error == 0) { + if (snap) { + M_PREPEND(m0, sizeof(struct llc), M_DONTWAIT); + if (m0 == NULL) { + error = ENOBUFS; + continue; + } + bcopy(llc, mtod(m0, caddr_t), + sizeof(struct llc)); + } + M_PREPEND(m0, ETHER_HDR_LEN, M_DONTWAIT); + if (m0 == NULL) { + error = ENOBUFS; + continue; + } + bcopy(eh, mtod(m0, caddr_t), ETHER_HDR_LEN); + } else + m_freem(m); + } + + if (error == 0) + ipstat.ips_fragmented++; + + return (error); + +out: + if (m != NULL) + m_freem(m); + return (error); +} +#endif /* PFIL_HOOKS */ + +static errno_t +bridge_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func bpf_callback) +{ + struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp); + + //printf("bridge_set_bpf_tap ifp %p mode %d\n", ifp, mode); + + /* TBD locking */ + if (sc == NULL || (sc->sc_flags & SCF_DETACHING)) { + return ENODEV; + } + + switch (mode) { + case BPF_TAP_DISABLE: + sc->sc_bpf_input = sc->sc_bpf_output = NULL; + break; + + case BPF_TAP_INPUT: + sc->sc_bpf_input = bpf_callback; + break; + + case BPF_TAP_OUTPUT: + sc->sc_bpf_output = bpf_callback; + break; + + case BPF_TAP_INPUT_OUTPUT: + sc->sc_bpf_input = sc->sc_bpf_output = bpf_callback; + break; + + default: + break; + } + + return 0; +} + +static void +bridge_detach(ifnet_t ifp) +{ + struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp); + + bstp_detach(&sc->sc_stp); + + /* Tear down the routing table. */ + bridge_rtable_fini(sc); + + lck_mtx_lock(bridge_list_mtx); + LIST_REMOVE(sc, sc_list); + lck_mtx_unlock(bridge_list_mtx); + + ifnet_release(ifp); + + lck_mtx_free(sc->sc_mtx, bridge_lock_grp); + + _FREE(sc, M_DEVBUF); + return; +} + +__private_extern__ errno_t bridge_bpf_input(ifnet_t ifp, struct mbuf *m) +{ + struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp); + + if (sc->sc_bpf_input) { + if (mbuf_pkthdr_rcvif(m) != ifp) + printf("bridge_bpf_input rcvif: %p != ifp %p\n", mbuf_pkthdr_rcvif(m), ifp); + (*sc->sc_bpf_input)(ifp, m); + } + return 0; +} + +__private_extern__ errno_t bridge_bpf_output(ifnet_t ifp, struct mbuf *m) +{ + struct bridge_softc *sc = (struct bridge_softc *)ifnet_softc(ifp); + + if (sc->sc_bpf_output) { + (*sc->sc_bpf_output)(ifp, m); + } + return 0; +} diff --git a/bsd/net/if_bridgevar.h b/bsd/net/if_bridgevar.h new file mode 100644 index 000000000..3d1375aed --- /dev/null +++ b/bsd/net/if_bridgevar.h @@ -0,0 +1,499 @@ +/* $NetBSD: if_bridgevar.h,v 1.4 2003/07/08 07:13:50 itojun Exp $ */ +/* + * Copyright (c) 2004-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/* + * Copyright 2001 Wasabi Systems, Inc. + * All rights reserved. + * + * Written by Jason R. Thorpe for Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed for the NetBSD Project by + * Wasabi Systems, Inc. + * 4. The name of Wasabi Systems, Inc. may not be used to endorse + * or promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1999, 2000 Jason L. Wright (jason@thought.net) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Jason L. Wright + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * OpenBSD: if_bridge.h,v 1.14 2001/03/22 03:48:29 jason Exp + * + * $FreeBSD$ + */ + +/* + * Data structure and control definitions for bridge interfaces. + */ + +#ifndef _NET_IF_BRIDGEVAR_H_ +#define _NET_IF_BRIDGEVAR_H_ + +#ifdef PRIVATE + +#include <sys/queue.h> + +#include <net/if.h> +#include <net/ethernet.h> + +/* + * Commands used in the SIOCSDRVSPEC ioctl. Note the lookup of the + * bridge interface itself is keyed off the ifdrv structure. + */ +#define BRDGADD 0 /* add bridge member (ifbreq) */ +#define BRDGDEL 1 /* delete bridge member (ifbreq) */ +#define BRDGGIFFLGS 2 /* get member if flags (ifbreq) */ +#define BRDGSIFFLGS 3 /* set member if flags (ifbreq) */ +#define BRDGSCACHE 4 /* set cache size (ifbrparam) */ +#define BRDGGCACHE 5 /* get cache size (ifbrparam) */ +#define BRDGGIFS 6 /* get member list (ifbifconf) */ +#define BRDGRTS 7 /* get address list (ifbaconf) */ +#define BRDGSADDR 8 /* set static address (ifbareq) */ +#define BRDGSTO 9 /* set cache timeout (ifbrparam) */ +#define BRDGGTO 10 /* get cache timeout (ifbrparam) */ +#define BRDGDADDR 11 /* delete address (ifbareq) */ +#define BRDGFLUSH 12 /* flush address cache (ifbreq) */ + +#define BRDGGPRI 13 /* get priority (ifbrparam) */ +#define BRDGSPRI 14 /* set priority (ifbrparam) */ +#define BRDGGHT 15 /* get hello time (ifbrparam) */ +#define BRDGSHT 16 /* set hello time (ifbrparam) */ +#define BRDGGFD 17 /* get forward delay (ifbrparam) */ +#define BRDGSFD 18 /* set forward delay (ifbrparam) */ +#define BRDGGMA 19 /* get max age (ifbrparam) */ +#define BRDGSMA 20 /* set max age (ifbrparam) */ +#define BRDGSIFPRIO 21 /* set if priority (ifbreq) */ +#define BRDGSIFCOST 22 /* set if path cost (ifbreq) */ +#define BRDGGFILT 23 /* get filter flags (ifbrparam) */ +#define BRDGSFILT 24 /* set filter flags (ifbrparam) */ +#define BRDGPURGE 25 /* purge address cache for a particular interface (ifbreq) */ +#define BRDGADDS 26 /* add bridge span member (ifbreq) */ +#define BRDGDELS 27 /* delete bridge span member (ifbreq) */ +#define BRDGPARAM 28 /* get bridge STP params (ifbropreq) */ +#define BRDGGRTE 29 /* get cache drops (ifbrparam) */ +#define BRDGGIFSSTP 30 /* get member STP params list (ifbpstpconf) */ +#define BRDGSPROTO 31 /* set protocol (ifbrparam) */ +#define BRDGSTXHC 32 /* set tx hold count (ifbrparam) */ +#define BRDGSIFAMAX 33 /* set max interface addrs (ifbreq) */ + +/* + * Generic bridge control request. 
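+ * + * Illustrative userland sketch only; the command choice, interface + * names and socket below are example assumptions, not part of this + * header. Most of the commands above take a struct ifbreq as their + * argument and are issued through the SIOCSDRVSPEC/SIOCGDRVSPEC + * ioctls, with the bridge name carried in a struct ifdrv, roughly: + * + * struct ifbreq req; + * struct ifdrv ifd; + * + * memset(&req, 0, sizeof(req)); + * strlcpy(req.ifbr_ifsname, "en0", sizeof(req.ifbr_ifsname)); + * memset(&ifd, 0, sizeof(ifd)); + * strlcpy(ifd.ifd_name, "bridge0", sizeof(ifd.ifd_name)); + * ifd.ifd_cmd = BRDGADD; + * ifd.ifd_len = sizeof(req); + * ifd.ifd_data = &req; + * ioctl(s, SIOCSDRVSPEC, &ifd); + * + * where s is a datagram socket, e.g. socket(AF_INET, SOCK_DGRAM, 0).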
+ */ +#pragma pack(4) + +struct ifbreq { + char ifbr_ifsname[IFNAMSIZ]; /* member if name */ + uint32_t ifbr_ifsflags; /* member if flags */ + uint32_t ifbr_stpflags; /* member if STP flags */ + uint32_t ifbr_path_cost; /* member if STP cost */ + uint8_t ifbr_portno; /* member if port number */ + uint8_t ifbr_priority; /* member if STP priority */ + uint8_t ifbr_proto; /* member if STP protocol */ + uint8_t ifbr_role; /* member if STP role */ + uint8_t ifbr_state; /* member if STP state */ + uint32_t ifbr_addrcnt; /* member if addr number */ + uint32_t ifbr_addrmax; /* member if addr max */ + uint32_t ifbr_addrexceeded; /* member if addr violations */ + uint8_t pad[32]; +}; + +#pragma pack() + +/* BRDGGIFFLGS, BRDGSIFFLGS */ +#define IFBIF_LEARNING 0x0001 /* if can learn */ +#define IFBIF_DISCOVER 0x0002 /* if sends packets w/ unknown dest. */ +#define IFBIF_STP 0x0004 /* if participates in spanning tree */ +#define IFBIF_SPAN 0x0008 /* if is a span port */ +#define IFBIF_STICKY 0x0010 /* if learned addresses stick */ +#define IFBIF_BSTP_EDGE 0x0020 /* member stp edge port */ +#define IFBIF_BSTP_AUTOEDGE 0x0040 /* member stp autoedge enabled */ +#define IFBIF_BSTP_PTP 0x0080 /* member stp point to point */ +#define IFBIF_BSTP_AUTOPTP 0x0100 /* member stp autoptp enabled */ +#define IFBIF_BSTP_ADMEDGE 0x0200 /* member stp admin edge enabled */ +#define IFBIF_BSTP_ADMCOST 0x0400 /* member stp admin path cost */ +#define IFBIF_PRIVATE 0x0800 /* if is a private segment */ + +#define IFBIFBITS "\020\001LEARNING\002DISCOVER\003STP\004SPAN" \ + "\005STICKY\014PRIVATE\006EDGE\007AUTOEDGE\010PTP" \ + "\011AUTOPTP" +#define IFBIFMASK ~(IFBIF_BSTP_EDGE|IFBIF_BSTP_AUTOEDGE|IFBIF_BSTP_PTP| \ + IFBIF_BSTP_AUTOPTP|IFBIF_BSTP_ADMEDGE| \ + IFBIF_BSTP_ADMCOST) /* not saved */ + +/* BRDGFLUSH */ +#define IFBF_FLUSHDYN 0x00 /* flush learned addresses only */ +#define IFBF_FLUSHALL 0x01 /* flush all addresses */ + +/* BRDGSFILT */ +#define IFBF_FILT_USEIPF 0x00000001 /* run pfil hooks on the bridge +interface */ +#define IFBF_FILT_MEMBER 0x00000002 /* run pfil hooks on the member +interfaces */ +#define IFBF_FILT_ONLYIP 0x00000004 /* only pass IP[46] packets when +pfil is enabled */ +#define IFBF_FILT_MASK 0x00000007 /* mask of valid values */ + + +/* APPLE MODIFICATION <jhw@apple.com>: Default is to pass non-IP packets. */ +#define IFBF_FILT_DEFAULT ( IFBF_FILT_USEIPF | IFBF_FILT_MEMBER ) +#if 0 +#define IFBF_FILT_DEFAULT (IFBF_FILT_USEIPF | \ +IFBF_FILT_MEMBER | \ +IFBF_FILT_ONLYIP) +#endif + +/* + * Interface list structure. + */ + +#pragma pack(4) + +#ifndef XNU_KERNEL_PRIVATE + +struct ifbifconf { + uint32_t ifbic_len; /* buffer size */ + union { + caddr_t ifbicu_buf; + struct ifbreq *ifbicu_req; +#define ifbic_buf ifbic_ifbicu.ifbicu_buf +#define ifbic_req ifbic_ifbicu.ifbicu_req + } ifbic_ifbicu; +}; + +#else /* XNU_KERNEL_PRIVATE */ + +struct ifbifconf32 { + uint32_t ifbic_len; /* buffer size */ + union { + user32_addr_t ifbicu_buf; + user32_addr_t ifbicu_req; +#define ifbic_buf ifbic_ifbicu.ifbicu_buf +#define ifbic_req ifbic_ifbicu.ifbicu_req + } ifbic_ifbicu; +}; + +struct ifbifconf64 { + uint32_t ifbic_len; /* buffer size */ + union { + user64_addr_t ifbicu_buf; + user64_addr_t ifbicu_req; + } ifbic_ifbicu; +}; +#endif /* XNU_KERNEL_PRIVATE */ + +#pragma pack() + +/* + * Bridge address request.
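+ * + * (As with ifbifconf above, separate 32-bit and 64-bit kernel layouts + * follow under XNU_KERNEL_PRIVATE so that a single ioctl handler can + * serve both 32-bit and 64-bit user processes.)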
+ */ + +#pragma pack(4) + +#ifndef XNU_KERNEL_PRIVATE + +struct ifbareq { + char ifba_ifsname[IFNAMSIZ]; /* member if name */ + unsigned long ifba_expire; /* address expire time */ + uint8_t ifba_flags; /* address flags */ + uint8_t ifba_dst[ETHER_ADDR_LEN];/* destination address */ + uint16_t ifba_vlan; /* vlan id */ +}; + +#else /* XNU_KERNEL_PRIVATE */ + +struct ifbareq32 { + char ifba_ifsname[IFNAMSIZ]; /* member if name */ + uint32_t ifba_expire; /* address expire time */ + uint8_t ifba_flags; /* address flags */ + uint8_t ifba_dst[ETHER_ADDR_LEN];/* destination address */ + uint16_t ifba_vlan; /* vlan id */ +}; + +struct ifbareq64 { + char ifba_ifsname[IFNAMSIZ]; /* member if name */ + uint64_t ifba_expire; /* address expire time */ + uint8_t ifba_flags; /* address flags */ + uint8_t ifba_dst[ETHER_ADDR_LEN];/* destination address */ + uint16_t ifba_vlan; /* vlan id */ +}; +#endif /* XNU_KERNEL_PRIVATE */ + +#pragma pack() + +#define IFBAF_TYPEMASK 0x03 /* address type mask */ +#define IFBAF_DYNAMIC 0x00 /* dynamically learned address */ +#define IFBAF_STATIC 0x01 /* static address */ +#define IFBAF_STICKY 0x02 /* sticky address */ + +#define IFBAFBITS "\020\1STATIC\2STICKY" + +/* + * Address list structure. + */ + +#pragma pack(4) + +#ifndef XNU_KERNEL_PRIVATE + +struct ifbaconf { + uint32_t ifbac_len; /* buffer size */ + union { + caddr_t ifbacu_buf; + struct ifbareq *ifbacu_req; +#define ifbac_buf ifbac_ifbacu.ifbacu_buf +#define ifbac_req ifbac_ifbacu.ifbacu_req + } ifbac_ifbacu; +}; + +#else /* XNU_KERNEL_PRIVATE */ + +struct ifbaconf32 { + uint32_t ifbac_len; /* buffer size */ + union { + user32_addr_t ifbacu_buf; + user32_addr_t ifbacu_req; +#define ifbac_buf ifbac_ifbacu.ifbacu_buf +#define ifbac_req ifbac_ifbacu.ifbacu_req + } ifbac_ifbacu; +}; + +struct ifbaconf64 { + uint32_t ifbac_len; /* buffer size */ + union { + user64_addr_t ifbacu_buf; + user64_addr_t ifbacu_req; + } ifbac_ifbacu; +}; +#endif /* XNU_KERNEL_PRIVATE */ + +#pragma pack() + +/* + * Bridge parameter structure. + */ + +#pragma pack(4) + +struct ifbrparam { + union { + uint32_t ifbrpu_int32; + uint16_t ifbrpu_int16; + uint8_t ifbrpu_int8; + } ifbrp_ifbrpu; +}; + +#pragma pack() + +#define ifbrp_csize ifbrp_ifbrpu.ifbrpu_int32 /* cache size */ +#define ifbrp_ctime ifbrp_ifbrpu.ifbrpu_int32 /* cache time (sec) */ +#define ifbrp_prio ifbrp_ifbrpu.ifbrpu_int16 /* bridge priority */ +#define ifbrp_proto ifbrp_ifbrpu.ifbrpu_int8 /* bridge protocol */ +#define ifbrp_txhc ifbrp_ifbrpu.ifbrpu_int8 /* bpdu tx holdcount */ +#define ifbrp_hellotime ifbrp_ifbrpu.ifbrpu_int8 /* hello time (sec) */ +#define ifbrp_fwddelay ifbrp_ifbrpu.ifbrpu_int8 /* fwd time (sec) */ +#define ifbrp_maxage ifbrp_ifbrpu.ifbrpu_int8 /* max age (sec) */ +#define ifbrp_cexceeded ifbrp_ifbrpu.ifbrpu_int32 /* # of cache dropped + * addresses */ +#define ifbrp_filter ifbrp_ifbrpu.ifbrpu_int32 /* filtering flags */ + +/* + * Bridge current operational parameters structure.
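+ * + * (This is the ifbropreq structure reported by the BRDGPARAM command + * above; it carries the running STP timers and identifiers for the + * bridge as a whole.)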
+ */ + +#pragma pack(4) + +#ifndef XNU_KERNEL_PRIVATE + +struct ifbropreq { + uint8_t ifbop_holdcount; + uint8_t ifbop_maxage; + uint8_t ifbop_hellotime; + uint8_t ifbop_fwddelay; + uint8_t ifbop_protocol; + uint16_t ifbop_priority; + uint16_t ifbop_root_port; + uint32_t ifbop_root_path_cost; + uint64_t ifbop_bridgeid; + uint64_t ifbop_designated_root; + uint64_t ifbop_designated_bridge; + struct timeval ifbop_last_tc_time; +}; + +#else /* XNU_KERNEL_PRIVATE */ + +struct ifbropreq32 { + uint8_t ifbop_holdcount; + uint8_t ifbop_maxage; + uint8_t ifbop_hellotime; + uint8_t ifbop_fwddelay; + uint8_t ifbop_protocol; + uint16_t ifbop_priority; + uint16_t ifbop_root_port; + uint32_t ifbop_root_path_cost; + uint64_t ifbop_bridgeid; + uint64_t ifbop_designated_root; + uint64_t ifbop_designated_bridge; + struct timeval ifbop_last_tc_time; +}; + +struct ifbropreq64 { + uint8_t ifbop_holdcount; + uint8_t ifbop_maxage; + uint8_t ifbop_hellotime; + uint8_t ifbop_fwddelay; + uint8_t ifbop_protocol; + uint16_t ifbop_priority; + uint16_t ifbop_root_port; + uint32_t ifbop_root_path_cost; + uint64_t ifbop_bridgeid; + uint64_t ifbop_designated_root; + uint64_t ifbop_designated_bridge; + struct timeval ifbop_last_tc_time; +}; + +#endif + +#pragma pack() + +/* + * Bridge member operational STP params structure. + */ + +#pragma pack(4) + +struct ifbpstpreq { + uint8_t ifbp_portno; /* bp STP port number */ + uint32_t ifbp_fwd_trans; /* bp STP fwd transitions */ + uint32_t ifbp_design_cost; /* bp STP designated cost */ + uint32_t ifbp_design_port; /* bp STP designated port */ + uint64_t ifbp_design_bridge; /* bp STP designated bridge */ + uint64_t ifbp_design_root; /* bp STP designated root */ +}; + +#pragma pack() + +/* + * Bridge STP ports list structure. + */ + +#pragma pack(4) + +#ifndef XNU_KERNEL_PRIVATE + +struct ifbpstpconf { + uint32_t ifbpstp_len; /* buffer size */ + union { + caddr_t ifbpstpu_buf; + struct ifbpstpreq *ifbpstpu_req; + } ifbpstp_ifbpstpu; +#define ifbpstp_buf ifbpstp_ifbpstpu.ifbpstpu_buf +#define ifbpstp_req ifbpstp_ifbpstpu.ifbpstpu_req +}; + +#else /* XNU_KERNEL_PRIVATE */ + +struct ifbpstpconf32 { + uint32_t ifbpstp_len; /* buffer size */ + union { + user32_addr_t ifbpstpu_buf; + user32_addr_t ifbpstpu_req; +#define ifbpstp_buf ifbpstp_ifbpstpu.ifbpstpu_buf +#define ifbpstp_req ifbpstp_ifbpstpu.ifbpstpu_req + } ifbpstp_ifbpstpu; +}; + +struct ifbpstpconf64 { + uint32_t ifbpstp_len; /* buffer size */ + union { + user64_addr_t ifbpstpu_buf; + user64_addr_t ifbpstpu_req; + } ifbpstp_ifbpstpu; +}; + +#endif /* XNU_KERNEL_PRIVATE */ + +#pragma pack() + + +#ifdef XNU_KERNEL_PRIVATE + +int bridgeattach(int); + +#endif /* XNU_KERNEL_PRIVATE */ +#endif /* PRIVATE */ +#endif /* !_NET_IF_BRIDGEVAR_H_ */ diff --git a/bsd/net/if_disc.c b/bsd/net/if_disc.c deleted file mode 100644 index 229e281f6..000000000 --- a/bsd/net/if_disc.c +++ /dev/null @@ -1,240 +0,0 @@ -/* - * Copyright (c) 2000-2008 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1982, 1986, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * From: @(#)if_loop.c 8.1 (Berkeley) 6/10/93 - * $FreeBSD: src/sys/net/if_disc.c,v 1.26.2.1 2001/03/06 00:32:09 obrien Exp $ - */ - -/* - * Discard interface driver for protocol testing and timing. - * (Based on the loopback.) 
- */ - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/kernel.h> -#include <sys/mbuf.h> -#include <sys/socket.h> -#include <sys/sockio.h> - -#include <net/if.h> -#include <net/if_types.h> -#include <net/route.h> -#include <net/bpf.h> - -#ifdef TINY_DSMTU -#define DSMTU (1024+512) -#else -#define DSMTU 65532 -#endif - -static void discattach(void); - -static struct ifnet discif; -static int discoutput(struct ifnet *, struct mbuf *, struct sockaddr *, - struct rtentry *); -static void discrtrequest(int cmd, struct rtentry *rt, struct sockaddr *sa); -static int discioctl(struct ifnet *, u_long, caddr_t); - -/* ARGSUSED */ -static void -discattach() -{ - register struct ifnet *ifp = &discif; - - ifp->if_name = "ds"; - ifp->if_family = APPLE_IF_FAM_DISC; - ifp->if_mtu = DSMTU; - ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST; - ifp->if_ioctl = discioctl; - ifp->if_output = discoutput; - ifp->if_type = IFT_LOOP; - ifp->if_hdrlen = 0; - ifp->if_addrlen = 0; - if_attach(ifp); - bpfattach(ifp, DLT_NULL, sizeof(u_int)); -} - -#ifndef __APPLE__ -static int -disc_modevent(module_t mod, int type, void *data) -{ - switch (type) { - case MOD_LOAD: - discattach(); - break; - case MOD_UNLOAD: - printf("if_disc module unload - not possible for this module type\n"); - return EINVAL; - } - return 0; -} - -static moduledata_t disc_mod = { - "if_disc", - disc_modevent, - NULL -}; - -DECLARE_MODULE(if_disc, disc_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); -#endif - -static int -discoutput(ifp, m, dst, rt) - struct ifnet *ifp; - register struct mbuf *m; - struct sockaddr *dst; - register struct rtentry *rt; -{ - if ((m->m_flags & M_PKTHDR) == 0) - panic("discoutput no HDR"); - /* BPF write needs to be handled specially */ - if (dst->sa_family == AF_UNSPEC) { - dst->sa_family = *(mtod(m, int *)); - m->m_len -= sizeof(int); - m->m_pkthdr.len -= sizeof(int); - m->m_data += sizeof(int); - } - - if (discif.if_bpf) { - /* We need to prepend the address family as a four byte field. */ - u_int af = dst->sa_family; - - bpf_tap_out(ifp, 0, m, &af, sizeof(af)); - } - m->m_pkthdr.rcvif = ifp; - - ifp->if_opackets++; - ifp->if_obytes += m->m_pkthdr.len; - - m_freem(m); - return 0; -} - -/* ARGSUSED */ -static void -discrtrequest(cmd, rt, sa) - int cmd; - struct rtentry *rt; - struct sockaddr *sa; -{ - if (rt != NULL) { - RT_LOCK_ASSERT_HELD(rt); - rt->rt_rmx.rmx_mtu = DSMTU; - } -} - -/* - * Process an ioctl request. - */ -/* ARGSUSED */ -static int -discioctl(ifp, cmd, data) - register struct ifnet *ifp; - u_long cmd; - caddr_t data; -{ - register struct ifaddr *ifa; - register struct ifreq *ifr = (struct ifreq *)data; - register int error = 0; - - switch (cmd) { - - case SIOCSIFADDR: - ifnet_set_flags(ifp, IFF_UP, IFF_UP); - ifa = (struct ifaddr *)data; - if (ifa != 0) - ifa->ifa_rtrequest = discrtrequest; - /* - * Everything else is done at a higher level. - */ - break; - - case SIOCADDMULTI: - case SIOCDELMULTI: - if (ifr == 0) { - error = EAFNOSUPPORT; /* XXX */ - break; - } - switch (ifr->ifr_addr.sa_family) { - -#if INET - case AF_INET: - break; -#endif -#if INET6 - case AF_INET6: - break; -#endif - - default: - error = EAFNOSUPPORT; - break; - } - break; - - case SIOCSIFMTU: - ifp->if_mtu = ifr->ifr_mtu; - break; - - default: - error = EINVAL; - } - return (error); -} diff --git a/bsd/net/if_dummy.c b/bsd/net/if_dummy.c deleted file mode 100644 index 68dac9c9d..000000000 --- a/bsd/net/if_dummy.c +++ /dev/null @@ -1,290 +0,0 @@ -/* - * Copyright (c) 2000-2008 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1982, 1986, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ -/* - * derived from - * @(#)if_loop.c 8.1 (Berkeley) 6/10/93 - * Id: if_loop.c,v 1.22 1996/06/19 16:24:10 wollman Exp - */ - -/* - * Loopback interface driver for protocol testing and timing. 
- */ -#if BSD310 -#include "opt_inet.h" -#endif -#include "dummy.h" -#if NDUMMY > 0 - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/kernel.h> -#include <sys/mbuf.h> -#include <sys/socket.h> -#include <sys/sockio.h> -#include <sys/errno.h> -#include <sys/time.h> - -#include <net/if.h> -#include <net/if_types.h> -#include <net/route.h> -#include <net/bpf.h> - -#if INET -#include <netinet/in.h> -#include <netinet/in_systm.h> -#include <netinet/in_var.h> -#include <netinet/ip.h> -#endif - -#if INET6 -#if !INET -#include <netinet/in.h> -#endif -#include <netinet6/in6_var.h> -#include <netinet6/ip6.h> -#endif - -#if NETATALK -#include <netinet/if_ether.h> -#include <netatalk/at.h> -#include <netatalk/at_var.h> -#endif /* NETATALK */ - -#include "bpfilter.h" - -static int dummyioctl(struct ifnet *, u_long, caddr_t); -int dummyoutput(struct ifnet *, register struct mbuf *, struct sockaddr *, - register struct rtentry *); -static void dummyrtrequest(int, struct rtentry *, struct sockaddr *); - -static void dummyattach(void *); -PSEUDO_SET(dummyattach, if_dummy); - -#if TINY_DUMMYMTU -#define DUMMYMTU (1024+512) -#else -#define DUMMYMTU 16384 -#endif -#define HAVE_OLD_BPF 1 - -static struct ifnet dummyif[NDUMMY]; - -/* ARGSUSED */ -static void -dummyattach(dummy) - void *dummy; -{ - register struct ifnet *ifp; - register int i = 0; - - for (i = 0; i < NDUMMY; i++) { - ifp = &dummyif[i]; -#if defined(__NetBSD__) || defined(__OpenBSD__) - sprintf(ifp->if_xname, "dummy%d", i); -#else - ifp->if_name = "dummy"; - ifp->if_unit = i; -#endif -#ifndef __bsdi__ - ifp->if_softc = NULL; -#endif - ifp->if_mtu = DUMMYMTU; - /* Change to BROADCAST experimentaly to announce its prefix. */ - ifp->if_flags = /* IFF_LOOPBACK */ IFF_BROADCAST | IFF_MULTICAST; - ifp->if_ioctl = dummyioctl; - ifp->if_output = dummyoutput; - ifp->if_type = IFT_DUMMY; - ifp->if_hdrlen = 0; - ifp->if_addrlen = 0; - if_attach(ifp); -#if NBPFILTER > 0 -#ifdef HAVE_OLD_BPF - bpfattach(ifp, DLT_NULL, sizeof(u_int)); -#else - bpfattach(&ifp->if_bpf, ifp, DLT_NULL, sizeof(u_int)); -#endif -#endif - } -} - -int -dummyoutput(ifp, m, dst, rt) - struct ifnet *ifp; - register struct mbuf *m; - struct sockaddr *dst; - register struct rtentry *rt; -{ - if ((m->m_flags & M_PKTHDR) == 0) - panic("dummyoutput no HDR"); -#if NBPFILTER > 0 - /* BPF write needs to be handled specially */ - if (dst->sa_family == AF_UNSPEC) { - dst->sa_family = *(mtod(m, int *)); - m->m_len -= sizeof(int); - m->m_pkthdr.len -= sizeof(int); - m->m_data += sizeof(int); - } - - if (ifp->if_bpf) { - /* We need to prepend the address family as a four byte field. */ - u_int af = dst->sa_family; - - bpf_tap_out(ifp, 0, m, &af, sizeof(af)); - } -#endif - m->m_pkthdr.rcvif = ifp; - - if (rt != NULL) { - u_int32_t rt_flags = rt->rt_flags; - if (rt_flags & (RTF_REJECT | RTF_BLACKHOLE)) { - m_freem(m); - return ((rt_flags & RTF_BLACKHOLE) ? 0 : - (rt_flags & RTF_HOST) ? EHOSTUNREACH : ENETUNREACH); - } - } - ifp->if_opackets++; - ifp->if_obytes += m->m_pkthdr.len; - proto_inject(dst->sa_family, m); - ifp->if_ipackets++; - ifp->if_ibytes += m->m_pkthdr.len; - return (0); -} - -/* ARGSUSED */ -static void -dummyrtrequest(cmd, rt, sa) - int cmd; - struct rtentry *rt; - struct sockaddr *sa; -{ - if (rt != NULL) { - RT_LOCK_ASSERT_HELD(rt); - rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; /* for ISO */ - /* - * For optimal performance, the send and receive buffers - * should be at least twice the MTU plus a little more for - * overhead. 
- */ - rt->rt_rmx.rmx_recvpipe = - rt->rt_rmx.rmx_sendpipe = 3 * DUMMYMTU; - } -} - -/* - * Process an ioctl request. - */ -/* ARGSUSED */ -static int -dummyioctl(ifp, cmd, data) - register struct ifnet *ifp; - u_long cmd; - caddr_t data; -{ - register struct ifaddr *ifa; - register struct ifreq *ifr = (struct ifreq *)data; - register int error = 0; - - switch (cmd) { - - case SIOCSIFADDR: - ifnet_set_flags(ifp, IFF_UP | IFF_RUNNING, IFF_UP | IFF_RUNNING); - ifa = (struct ifaddr *)data; - ifa->ifa_rtrequest = dummyrtrequest; - /* - * Everything else is done at a higher level. - */ - break; - - case SIOCADDMULTI: - case SIOCDELMULTI: - if (ifr == 0) { - error = EAFNOSUPPORT; /* XXX */ - break; - } - switch (ifr->ifr_addr.sa_family) { - -#if INET - case AF_INET: - break; -#endif -#if INET6 - case AF_INET6: - break; -#endif - - default: - error = EAFNOSUPPORT; - break; - } - break; - - case SIOCSIFMTU: - ifp->if_mtu = ifr->ifr_mtu; - break; - - case SIOCSIFFLAGS: - break; - - default: - error = EINVAL; - } - return (error); -} -#endif /* NDUMMY > 0 */ diff --git a/bsd/net/if_ethersubr.c b/bsd/net/if_ethersubr.c deleted file mode 100644 index e407e009f..000000000 --- a/bsd/net/if_ethersubr.c +++ /dev/null @@ -1,229 +0,0 @@ -/* - * Copyright (c) 2000, 2009 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1982, 1989, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. 
Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)if_ethersubr.c 8.1 (Berkeley) 6/10/93 - * $FreeBSD: src/sys/net/if_ethersubr.c,v 1.70.2.17 2001/08/01 00:47:49 fenner Exp $ - */ - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/kernel.h> -#include <sys/malloc.h> -#include <sys/mbuf.h> -#include <sys/socket.h> -#include <sys/sockio.h> -#include <sys/sysctl.h> - -#include <net/if.h> -#include <net/route.h> -#include <net/if_llc.h> -#include <net/if_dl.h> -#include <net/if_types.h> - -#if INET || INET6 -#include <netinet/in.h> -#include <netinet/in_var.h> -#include <netinet/if_ether.h> -#include <netinet/in_systm.h> -#include <netinet/ip.h> -#endif - -#if IPX -#include <netipx/ipx.h> -#include <netipx/ipx_if.h> -#endif - -#include <sys/socketvar.h> - -#if LLC && CCITT -extern struct ifqueue pkintrq; -#endif - -/* #include "vlan.h" */ -#if NVLAN > 0 -#include <net/if_vlan_var.h> -#endif /* NVLAN > 0 */ - -extern u_char etherbroadcastaddr[]; -#define senderr(e) do { error = (e); goto bad;} while (0) - -/* - * Perform common duties while attaching to interface list - */ - -int -ether_resolvemulti( - struct ifnet *ifp, - struct sockaddr **llsa, - struct sockaddr *sa) -{ - struct sockaddr_dl *sdl; - struct sockaddr_in *sin; - u_char *e_addr; -#if INET6 - struct sockaddr_in6 *sin6; -#endif - - - switch(sa->sa_family) { - case AF_UNSPEC: - /* AppleTalk uses AF_UNSPEC for multicast registration. - * No mapping needed. Just check that it's a valid MC address. - */ - e_addr = &sa->sa_data[0]; - if ((e_addr[0] & 1) != 1) - return EADDRNOTAVAIL; - *llsa = 0; - return 0; - - case AF_LINK: - /* - * No mapping needed. Just check that it's a valid MC address. - */ - sdl = (struct sockaddr_dl *)sa; - e_addr = LLADDR(sdl); - if ((e_addr[0] & 1) != 1) - return EADDRNOTAVAIL; - *llsa = 0; - return 0; - -#if INET - case AF_INET: - sin = (struct sockaddr_in *)sa; - if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) - return EADDRNOTAVAIL; - MALLOC(sdl, struct sockaddr_dl *, sizeof *sdl, M_IFMADDR, - M_WAITOK); - if (sdl == NULL) - return ENOBUFS; - sdl->sdl_len = sizeof *sdl; - sdl->sdl_family = AF_LINK; - sdl->sdl_index = ifp->if_index; - sdl->sdl_type = IFT_ETHER; - sdl->sdl_nlen = 0; - sdl->sdl_alen = ETHER_ADDR_LEN; - sdl->sdl_slen = 0; - e_addr = LLADDR(sdl); - ETHER_MAP_IP_MULTICAST(&sin->sin_addr, e_addr); - *llsa = (struct sockaddr *)sdl; - return 0; -#endif -#if INET6 - case AF_INET6: - sin6 = (struct sockaddr_in6 *)sa; - if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { - /* - * An IP6 address of 0 means listen to all - * of the Ethernet multicast address used for IP6. 
- * (This is used for multicast routers.) - */ - ifp->if_flags |= IFF_ALLMULTI; - *llsa = 0; - return 0; - } - MALLOC(sdl, struct sockaddr_dl *, sizeof *sdl, M_IFMADDR, - M_WAITOK); - if (sdl == NULL) - return ENOBUFS; - sdl->sdl_len = sizeof *sdl; - sdl->sdl_family = AF_LINK; - sdl->sdl_index = ifp->if_index; - sdl->sdl_type = IFT_ETHER; - sdl->sdl_nlen = 0; - sdl->sdl_alen = ETHER_ADDR_LEN; - sdl->sdl_slen = 0; - e_addr = LLADDR(sdl); - ETHER_MAP_IPV6_MULTICAST(&sin6->sin6_addr, e_addr); -#if 0 - printf("ether_resolvemulti Adding %x:%x:%x:%x:%x:%x\n", - e_addr[0], e_addr[1], e_addr[2], e_addr[3], e_addr[4], e_addr[5]); -#endif - *llsa = (struct sockaddr *)sdl; - return 0; -#endif - - default: - /* - * Well, the text isn't quite right, but it's the name - * that counts... - */ - return EAFNOSUPPORT; - } -} - - -/* - * Convert Ethernet address to printable (loggable) representation. - */ -static u_char digits[] = "0123456789abcdef"; -char * -ether_sprintf(p, ap) - register u_char *p; - register u_char *ap; -{ register char *cp; - register i; - - for (cp = p, i = 0; i < 6; i++) { - *cp++ = digits[*ap >> 4]; - *cp++ = digits[*ap++ & 0xf]; - *cp++ = ':'; - } - *--cp = 0; - return (p); -} diff --git a/bsd/net/if_fddisubr.c b/bsd/net/if_fddisubr.c deleted file mode 100644 index 1de331796..000000000 --- a/bsd/net/if_fddisubr.c +++ /dev/null @@ -1,637 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1995, 1996 - * Matt Thomas <matt@3am-software.com>. All rights reserved. - * Copyright (c) 1982, 1989, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * from: if_ethersubr.c,v 1.5 1994/12/13 22:31:45 wollman Exp - */ - -#include "opt_atalk.h" -#include "opt_inet.h" -#include "opt_ipx.h" - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/mbuf.h> -#include <sys/socket.h> - -#include <net/if.h> -#include <net/netisr.h> -#include <net/route.h> -#include <net/if_llc.h> -#include <net/if_dl.h> -#include <net/if_types.h> - -#if INET -#include <netinet/in.h> -#include <netinet/in_var.h> -#include <netinet/if_ether.h> -#endif -#if defined(__FreeBSD__) -#include <netinet/if_fddi.h> -#else -#include <net/if_fddi.h> -#endif - -#if IPX -#include <netipx/ipx.h> -#include <netipx/ipx_if.h> -#endif - -#if DECNET -#include <netdnet/dn.h> -#endif - -#include "bpfilter.h" - -#define senderr(e) { error = (e); goto bad;} - -/* - * This really should be defined in if_llc.h but in case it isn't. - */ -#ifndef llc_snap -#define llc_snap llc_un.type_snap -#endif - -#if defined(__bsdi__) || defined(__NetBSD__) -#define RTALLOC1(a, b) rtalloc1(a, b) -#define ARPRESOLVE(a, b, c, d, e, f) arpresolve(a, b, c, d, e) -#elif defined(__FreeBSD__) -#define RTALLOC1(a, b) rtalloc1(a, b, 0UL) -#define ARPRESOLVE(a, b, c, d, e, f) arpresolve(a, b, c, d, e, f) -#endif -/* - * FDDI output routine. - * Encapsulate a packet of type family for the local net. - * Use trailer local net encapsulation if enough data in first - * packet leaves a multiple of 512 bytes of data in remainder. - * Assumes that ifp is actually pointer to arpcom structure. 
- */ -int -fddi_output(ifp, m0, dst, rt0) - register struct ifnet *ifp; - struct mbuf *m0; - struct sockaddr *dst; - struct rtentry *rt0; -{ - u_int16_t type; - int s, loop_copy = 0, error = 0; - u_char edst[6]; - register struct mbuf *m = m0; - register struct rtentry *rt; - register struct fddi_header *fh; - struct arpcom *ac = (struct arpcom *)ifp; - - if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING)) - senderr(ENETDOWN); - getmicrotime(&ifp->if_lastchange); -#if !defined(__bsdi__) || _BSDI_VERSION >= 199401 - if (rt = rt0) { - if ((rt->rt_flags & RTF_UP) == 0) { - if (rt0 = rt = RTALLOC1(dst, 1)) - rtunref(rt); - else - senderr(EHOSTUNREACH); - } - if (rt->rt_flags & RTF_GATEWAY) { - if (rt->rt_gwroute == 0) - goto lookup; - if (((rt = rt->rt_gwroute)->rt_flags & RTF_UP) == 0) { - rtfree(rt); rt = rt0; - lookup: rt->rt_gwroute = RTALLOC1(rt->rt_gateway, 1); - if ((rt = rt->rt_gwroute) == 0) - senderr(EHOSTUNREACH); - } - } - if (rt->rt_flags & RTF_REJECT) - if (rt->rt_rmx.rmx_expire == 0 || - time_second < rt->rt_rmx.rmx_expire) - senderr(rt == rt0 ? EHOSTDOWN : EHOSTUNREACH); - } -#endif - switch (dst->sa_family) { - -#if INET - case AF_INET: { -#if !defined(__bsdi__) || _BSDI_VERSION >= 199401 - if (!ARPRESOLVE(ac, rt, m, dst, edst, rt0)) - return (0); /* if not yet resolved */ -#else - int usetrailers; - if (!arpresolve(ac, m, &((struct sockaddr_in *)dst)->sin_addr, edst, &usetrailers)) - return (0); /* if not yet resolved */ -#endif - type = htons(ETHERTYPE_IP); - break; - } -#endif -#if IPX - case AF_IPX: - type = htons(ETHERTYPE_IPX); - bcopy((caddr_t)&(((struct sockaddr_ipx *)dst)->sipx_addr.x_host), - (caddr_t)edst, sizeof (edst)); - break; -#endif - -#if NS - case AF_NS: - type = htons(ETHERTYPE_NS); - bcopy((caddr_t)&(((struct sockaddr_ns *)dst)->sns_addr.x_host), - (caddr_t)edst, sizeof (edst)); - break; -#endif -#if ISO - case AF_ISO: { - int snpalen; - struct llc *l; - register struct sockaddr_dl *sdl; - - if (rt && (sdl = (struct sockaddr_dl *)rt->rt_gateway) && - sdl->sdl_family == AF_LINK && sdl->sdl_alen > 0) { - bcopy(LLADDR(sdl), (caddr_t)edst, sizeof(edst)); - } else if (error = - iso_snparesolve(ifp, (struct sockaddr_iso *)dst, - (char *)edst, &snpalen)) - goto bad; /* Not Resolved */ - /* If broadcasting on a simplex interface, loopback a copy */ - if (*edst & 1) - m->m_flags |= (M_BCAST|M_MCAST); - M_PREPEND(m, 3, M_DONTWAIT); - if (m == NULL) - return (0); - type = 0; - l = mtod(m, struct llc *); - l->llc_dsap = l->llc_ssap = LLC_ISO_LSAP; - l->llc_control = LLC_UI; - IFDEBUG(D_ETHER) - int i; - printf("unoutput: sending pkt to: "); - for (i=0; i<6; i++) - printf("%x ", edst[i] & 0xff); - printf("\n"); - ENDDEBUG - } break; -#endif /* ISO */ -#if LLC -/* case AF_NSAP: */ - case AF_CCITT: { - register struct sockaddr_dl *sdl = - (struct sockaddr_dl *) rt -> rt_gateway; - - if (sdl && sdl->sdl_family != AF_LINK && sdl->sdl_alen <= 0) - goto bad; /* Not a link interface ? Funny ... 
*/ - bcopy(LLADDR(sdl), (char *)edst, sizeof(edst)); - if (*edst & 1) - loop_copy = 1; - type = 0; -#if LLC_DEBUG - { - int i; - register struct llc *l = mtod(m, struct llc *); - - printf("fddi_output: sending LLC2 pkt to: "); - for (i=0; i<6; i++) - printf("%x ", edst[i] & 0xff); - printf(" len 0x%x dsap 0x%x ssap 0x%x control 0x%x\n", - type & 0xff, l->llc_dsap & 0xff, l->llc_ssap &0xff, - l->llc_control & 0xff); - - } -#endif /* LLC_DEBUG */ - } break; -#endif /* LLC */ - - case AF_UNSPEC: - { - struct ether_header *eh; - loop_copy = -1; - eh = (struct ether_header *)dst->sa_data; - (void)memcpy((caddr_t)edst, (caddr_t)eh->ether_dhost, sizeof (edst)); - if (*edst & 1) - m->m_flags |= (M_BCAST|M_MCAST); - type = eh->ether_type; - break; - } - -#if NBPFILTER > 0 - case AF_IMPLINK: - { - fh = mtod(m, struct fddi_header *); - error = EPROTONOSUPPORT; - switch (fh->fddi_fc & (FDDIFC_C|FDDIFC_L|FDDIFC_F)) { - case FDDIFC_LLC_ASYNC: { - /* legal priorities are 0 through 7 */ - if ((fh->fddi_fc & FDDIFC_Z) > 7) - goto bad; - break; - } - case FDDIFC_LLC_SYNC: { - /* FDDIFC_Z bits reserved, must be zero */ - if (fh->fddi_fc & FDDIFC_Z) - goto bad; - break; - } - case FDDIFC_SMT: { - /* FDDIFC_Z bits must be non zero */ - if ((fh->fddi_fc & FDDIFC_Z) == 0) - goto bad; - break; - } - default: { - /* anything else is too dangerous */ - goto bad; - } - } - error = 0; - if (fh->fddi_dhost[0] & 1) - m->m_flags |= (M_BCAST|M_MCAST); - goto queue_it; - } -#endif - default: - printf("%s%d: can't handle af%d\n", ifp->if_name, ifp->if_unit, - dst->sa_family); - senderr(EAFNOSUPPORT); - } - - if (type != 0) { - register struct llc *l; - M_PREPEND(m, sizeof (struct llc), M_DONTWAIT); - if (m == 0) - senderr(ENOBUFS); - l = mtod(m, struct llc *); - l->llc_control = LLC_UI; - l->llc_dsap = l->llc_ssap = LLC_SNAP_LSAP; - l->llc_snap.org_code[0] = l->llc_snap.org_code[1] = l->llc_snap.org_code[2] = 0; - (void)memcpy((caddr_t) &l->llc_snap.ether_type, (caddr_t) &type, - sizeof(u_int16_t)); - } - - /* - * Add local net header. If no space in first mbuf, - * allocate another. - */ - M_PREPEND(m, sizeof (struct fddi_header), M_DONTWAIT); - if (m == 0) - senderr(ENOBUFS); - fh = mtod(m, struct fddi_header *); - fh->fddi_fc = FDDIFC_LLC_ASYNC|FDDIFC_LLC_PRIO4; - (void)memcpy((caddr_t)fh->fddi_dhost, (caddr_t)edst, sizeof (edst)); - queue_it: - (void)memcpy((caddr_t)fh->fddi_shost, (caddr_t)ac->ac_enaddr, - sizeof(fh->fddi_shost)); - - /* - * If a simplex interface, and the packet is being sent to our - * Ethernet address or a broadcast address, loopback a copy. - * XXX To make a simplex device behave exactly like a duplex - * device, we should copy in the case of sending to our own - * ethernet address (thus letting the original actually appear - * on the wire). However, we don't do that here for security - * reasons and compatibility with the original behavior. - */ - if ((ifp->if_flags & IFF_SIMPLEX) && - (loop_copy != -1)) { - if ((m->m_flags & M_BCAST) || loop_copy) { - struct mbuf *n = m_copy(m, 0, (int)M_COPYALL); - - (void) if_simloop(ifp, - n, dst, sizeof(struct fddi_header)); - } else if (bcmp(fh->fddi_dhost, - fh->fddi_shost, sizeof(fh->fddi_shost)) == 0) { - (void) if_simloop(ifp, - m, dst, sizeof(struct fddi_header)); - return(0); /* XXX */ - } - } - - s = splimp(); - /* - * Queue message on interface, and start output if interface - * not yet active. 
- */ - if (IF_QFULL(&ifp->if_snd)) { - IF_DROP(&ifp->if_snd); - splx(s); - senderr(ENOBUFS); - } - ifp->if_obytes += m->m_pkthdr.len; - IF_ENQUEUE(&ifp->if_snd, m); - if ((ifp->if_flags & IFF_OACTIVE) == 0) - (*ifp->if_start)(ifp); - splx(s); - if (m->m_flags & M_MCAST) - ifp->if_omcasts++; - return (error); - -bad: - if (m) - m_freem(m); - return (error); -} - -/* - * Process a received FDDI packet; - * the packet is in the mbuf chain m without - * the fddi header, which is provided separately. - */ -void -fddi_input(ifp, fh, m) - struct ifnet *ifp; - register struct fddi_header *fh; - struct mbuf *m; -{ - register struct ifqueue *inq; - register struct llc *l; - int s; - - if ((ifp->if_flags & IFF_UP) == 0) { - m_freem(m); - return; - } - getmicrotime(&ifp->if_lastchange); - ifp->if_ibytes += m->m_pkthdr.len + sizeof (*fh); - if (fh->fddi_dhost[0] & 1) { - if (bcmp((caddr_t)fddibroadcastaddr, (caddr_t)fh->fddi_dhost, - sizeof(fddibroadcastaddr)) == 0) - m->m_flags |= M_BCAST; - else - m->m_flags |= M_MCAST; - ifp->if_imcasts++; - } else if ((ifp->if_flags & IFF_PROMISC) - && bcmp(((struct arpcom *)ifp)->ac_enaddr, (caddr_t)fh->fddi_dhost, - sizeof(fh->fddi_dhost)) != 0) { - m_freem(m); - return; - } - -#ifdef M_LINK0 - /* - * If this has a LLC priority of 0, then mark it so upper - * layers have a hint that it really came via a FDDI/Ethernet - * bridge. - */ - if ((fh->fddi_fc & FDDIFC_LLC_PRIO7) == FDDIFC_LLC_PRIO0) - m->m_flags |= M_LINK0; -#endif - - l = mtod(m, struct llc *); - switch (l->llc_dsap) { -#if defined(INET) || NS || IPX || defined(NETATALK) - case LLC_SNAP_LSAP: - { - u_int16_t type; - if (l->llc_control != LLC_UI || l->llc_ssap != LLC_SNAP_LSAP) - goto dropanyway; - - if (l->llc_snap.org_code[0] != 0 || l->llc_snap.org_code[1] != 0|| l->llc_snap.org_code[2] != 0) - goto dropanyway; - type = ntohs(l->llc_snap.ether_type); - m_adj(m, 8); - switch (type) { -#if INET - case ETHERTYPE_IP: - if (ipflow_fastforward(m)) - return; - schednetisr(NETISR_IP); - inq = &ipintrq; - break; - - case ETHERTYPE_ARP: -#if !defined(__bsdi__) || _BSDI_VERSION >= 199401 - schednetisr(NETISR_ARP); - inq = &arpintrq; - break; -#else - arpinput((struct arpcom *)ifp, m); - return; -#endif -#endif -#if IPX - case ETHERTYPE_IPX: - schednetisr(NETISR_IPX); - inq = &ipxintrq; - break; -#endif -#if NS - case ETHERTYPE_NS: - schednetisr(NETISR_NS); - inq = &nsintrq; - break; -#endif -#if DECNET - case ETHERTYPE_DECNET: - schednetisr(NETISR_DECNET); - inq = &decnetintrq; - break; -#endif - - default: - /* printf("fddi_input: unknown protocol 0x%x\n", type); */ - ifp->if_noproto++; - goto dropanyway; - } - break; - } -#endif /* INET || NS */ -#if ISO - case LLC_ISO_LSAP: - switch (l->llc_control) { - case LLC_UI: - /* LLC_UI_P forbidden in class 1 service */ - if ((l->llc_dsap == LLC_ISO_LSAP) && - (l->llc_ssap == LLC_ISO_LSAP)) { - /* LSAP for ISO */ - m->m_data += 3; /* XXX */ - m->m_len -= 3; /* XXX */ - m->m_pkthdr.len -= 3; /* XXX */ - M_PREPEND(m, sizeof *fh, M_DONTWAIT); - if (m == 0) - return; - *mtod(m, struct fddi_header *) = *fh; - IFDEBUG(D_ETHER) - printf("clnp packet"); - ENDDEBUG - schednetisr(NETISR_ISO); - inq = &clnlintrq; - break; - } - goto dropanyway; - - case LLC_XID: - case LLC_XID_P: - if(m->m_len < 6) - goto dropanyway; - l->llc_window = 0; - l->llc_fid = 9; - l->llc_class = 1; - l->llc_dsap = l->llc_ssap = 0; - /* Fall through to */ - case LLC_TEST: - case LLC_TEST_P: - { - struct sockaddr sa; - register struct ether_header *eh; - struct arpcom *ac = (struct arpcom *) ifp; - int 
i; - u_char c = l->llc_dsap; - - l->llc_dsap = l->llc_ssap; - l->llc_ssap = c; - if (m->m_flags & (M_BCAST | M_MCAST)) - bcopy((caddr_t)ac->ac_enaddr, - (caddr_t)eh->ether_dhost, 6); - sa.sa_family = AF_UNSPEC; - sa.sa_len = sizeof(sa); - eh = (struct ether_header *)sa.sa_data; - for (i = 0; i < 6; i++) { - eh->ether_shost[i] = fh->fddi_dhost[i]; - eh->ether_dhost[i] = fh->fddi_shost[i]; - } - eh->ether_type = 0; - ifp->if_output(ifp, m, &sa, NULL); - return; - } - default: - m_freem(m); - return; - } - break; -#endif /* ISO */ -#if LLC - case LLC_X25_LSAP: - { - M_PREPEND(m, sizeof(struct sdl_hdr) , M_DONTWAIT); - if (m == 0) - return; - if ( !sdl_sethdrif(ifp, fh->fddi_shost, LLC_X25_LSAP, - fh->fddi_dhost, LLC_X25_LSAP, 6, - mtod(m, struct sdl_hdr *))) - panic("ETHER cons addr failure"); - mtod(m, struct sdl_hdr *)->sdlhdr_len = m->m_pkthdr.len - sizeof(struct sdl_hdr); -#if LLC_DEBUG - printf("llc packet\n"); -#endif /* LLC_DEBUG */ - schednetisr(NETISR_CCITT); - inq = &llcintrq; - break; - } -#endif /* LLC */ - - default: - /* printf("fddi_input: unknown dsap 0x%x\n", l->llc_dsap); */ - ifp->if_noproto++; - dropanyway: - m_freem(m); - return; - } - - s = splimp(); - if (IF_QFULL(inq)) { - IF_DROP(inq); - m_freem(m); - } else - IF_ENQUEUE(inq, m); - splx(s); -} -/* - * Perform common duties while attaching to interface list - */ -#ifdef __NetBSD__ -#define ifa_next ifa_list.tqe_next -#endif - -void -fddi_ifattach(ifp) - register struct ifnet *ifp; -{ - register struct ifaddr *ifa; - register struct sockaddr_dl *sdl; - - ifp->if_type = IFT_FDDI; - ifp->if_addrlen = 6; - ifp->if_hdrlen = 21; - ifp->if_mtu = FDDIMTU; - ifp->if_baudrate = 100000000; -#if IFF_NOTRAILERS - ifp->if_flags |= IFF_NOTRAILERS; -#endif -#if defined(__FreeBSD__) - ifa = ifnet_addrs[ifp->if_index - 1]; - sdl = (struct sockaddr_dl *)ifa->ifa_addr; - sdl->sdl_type = IFT_FDDI; - sdl->sdl_alen = ifp->if_addrlen; - bcopy(((struct arpcom *)ifp)->ac_enaddr, LLADDR(sdl), ifp->if_addrlen); -#elif defined(__NetBSD__) - LIST_INIT(&((struct arpcom *)ifp)->ac_multiaddrs); - for (ifa = ifp->if_addrlist.tqh_first; ifa != NULL; ifa = ifa->ifa_list.tqe_next) -#else - for (ifa = ifp->if_addrlist; ifa != NULL; ifa = ifa->ifa_next) -#endif -#if !defined(__FreeBSD__) - if ((sdl = (struct sockaddr_dl *)ifa->ifa_addr) && - sdl->sdl_family == AF_LINK) { - sdl->sdl_type = IFT_FDDI; - sdl->sdl_alen = ifp->if_addrlen; - bcopy((caddr_t)((struct arpcom *)ifp)->ac_enaddr, - LLADDR(sdl), ifp->if_addrlen); - break; - } -#endif -} diff --git a/bsd/net/if_gif.c b/bsd/net/if_gif.c index 38e876d6d..b25ecb3a5 100644 --- a/bsd/net/if_gif.c +++ b/bsd/net/if_gif.c @@ -122,7 +122,6 @@ TAILQ_HEAD(gifhead, gif_softc) gifs = TAILQ_HEAD_INITIALIZER(gifs); #ifdef __APPLE__ void gifattach(void); -static void gif_create_dev(void); static int gif_encapcheck(const struct mbuf*, int, int, void*); static errno_t gif_output(ifnet_t ifp, mbuf_t m); static errno_t gif_input(ifnet_t ifp, protocol_family_t protocol_family, @@ -156,6 +155,11 @@ struct ip6protosw in6_gif_protosw = }; #endif +static if_clone_t gif_cloner = NULL; +static int gif_clone_create(struct if_clone *, uint32_t, void *); +static int gif_clone_destroy(struct ifnet *); +static void gif_delete_tunnel(struct gif_softc *); + #ifdef __APPLE__ /* * Theory of operation: initially, one gif interface is created. 
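 * (Editor's sketch, assuming the cloner API used by this patch: the
 * registration performed in gifattach() below amounts to
 *
 *	struct ifnet_clone_params p;
 *	if_clone_t cloner = NULL;
 *
 *	p.ifc_name = "gif";
 *	p.ifc_create = gif_clone_create;
 *	p.ifc_destroy = gif_clone_destroy;
 *	if (ifnet_clone_attach(&p, &cloner) != 0)
 *		printf("gif: ifnet_clone_attach failed\n");
 *
 * after which create/destroy requests for "gifN" interfaces are
 * dispatched to gif_clone_create()/gif_clone_destroy().)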
@@ -237,6 +241,8 @@ __private_extern__ void gifattach(void) { errno_t result; + struct ifnet_clone_params ifnet_clone_params; + struct if_clone *ifc = NULL; /* Init the list of interfaces */ TAILQ_INIT(&gifs); @@ -252,8 +258,17 @@ gifattach(void) if (result != 0) printf("proto_register_plumber failed for AF_INET6 error=%d\n", result); + ifnet_clone_params.ifc_name = "gif"; + ifnet_clone_params.ifc_create = gif_clone_create; + ifnet_clone_params.ifc_destroy = gif_clone_destroy; + + result = ifnet_clone_attach(&ifnet_clone_params, &gif_cloner); + if (result != 0) + printf("gifattach: ifnet_clone_attach failed %d\n", result); + /* Create first device */ - gif_create_dev(); + ifc = if_clone_lookup("gif", NULL); + gif_clone_create(ifc, 0, NULL); } static errno_t @@ -270,35 +285,34 @@ gif_set_bpf_tap( return 0; } -/* Creates another gif device if there are none free */ -static void -gif_create_dev(void) + +static int +gif_clone_create(struct if_clone *ifc, uint32_t unit, __unused void *params) { - struct gif_softc *sc; - struct ifnet_init_params gif_init; - errno_t result = 0; - - + struct gif_softc *sc = NULL; + struct ifnet_init_params gif_init; + errno_t result = 0; + /* Can't create more than GIF_MAXUNIT */ if (ngif >= GIF_MAXUNIT) - return; - - /* Check for unused gif interface */ - TAILQ_FOREACH(sc, &gifs, gif_link) { - /* If unused, return, no need to create a new interface */ - if ((ifnet_flags(sc->gif_if) & IFF_RUNNING) == 0) - return; - } + return (ENXIO); sc = _MALLOC(sizeof(struct gif_softc), M_DEVBUF, M_WAITOK); if (sc == NULL) { - log(LOG_ERR, "gifattach: failed to allocate gif%d\n", ngif); - return; + log(LOG_ERR, "gif_clone_create: failed to allocate gif%d\n", unit); + return ENOBUFS; } - + bzero(sc, sizeof(struct gif_softc)); + + /* use the interface name as the unique id for ifp recycle */ + snprintf(sc->gif_ifname, sizeof(sc->gif_ifname), "%s%d", + ifc->ifc_name, unit); + bzero(&gif_init, sizeof(gif_init)); + gif_init.uniqueid = sc->gif_ifname; + gif_init.uniqueid_len = strlen(sc->gif_ifname); gif_init.name = GIFNAME; - gif_init.unit = ngif; + gif_init.unit = unit; gif_init.type = IFT_GIF; gif_init.family = IFNET_FAMILY_GIF; gif_init.output = gif_output; @@ -309,22 +323,22 @@ gif_create_dev(void) gif_init.ioctl = gif_ioctl; gif_init.set_bpf_tap = gif_set_bpf_tap; - bzero(sc, sizeof(struct gif_softc)); result = ifnet_allocate(&gif_init, &sc->gif_if); if (result != 0) { - printf("gif_create_dev, ifnet_allocate failed - %d\n", result); + printf("gif_clone_create, ifnet_allocate failed - %d\n", result); _FREE(sc, M_DEVBUF); - return; + return ENOBUFS; } + sc->encap_cookie4 = sc->encap_cookie6 = NULL; #if INET sc->encap_cookie4 = encap_attach_func(AF_INET, -1, - gif_encapcheck, &in_gif_protosw, sc); + gif_encapcheck, &in_gif_protosw, sc); if (sc->encap_cookie4 == NULL) { printf("%s: unable to attach encap4\n", if_name(sc->gif_if)); ifnet_release(sc->gif_if); FREE(sc, M_DEVBUF); - return; + return ENOBUFS; } #endif #if INET6 @@ -338,7 +352,7 @@ gif_create_dev(void) printf("%s: unable to attach encap6\n", if_name(sc->gif_if)); ifnet_release(sc->gif_if); FREE(sc, M_DEVBUF); - return; + return ENOBUFS; } #endif sc->gif_called = 0; @@ -350,10 +364,18 @@ gif_create_dev(void) #endif result = ifnet_attach(sc->gif_if, NULL); if (result != 0) { - printf("gif_create_dev - ifnet_attach failed - %d\n", result); + printf("gif_clone_create - ifnet_attach failed - %d\n", result); ifnet_release(sc->gif_if); + if (sc->encap_cookie4) { + encap_detach(sc->encap_cookie4); + sc->encap_cookie4 = NULL; + 
} + if (sc->encap_cookie6) { + encap_detach(sc->encap_cookie6); + sc->encap_cookie6 = NULL; + } FREE(sc, M_DEVBUF); - return; + return result; } #if CONFIG_MACF_NET mac_ifnet_label_init(&sc->gif_if); @@ -361,6 +383,43 @@ gif_create_dev(void) bpfattach(sc->gif_if, DLT_NULL, sizeof(u_int)); TAILQ_INSERT_TAIL(&gifs, sc, gif_link); ngif++; + return 0; +} + +static int +gif_clone_destroy(struct ifnet *ifp) +{ +#if defined(INET) || defined(INET6) + int err = 0; +#endif + struct gif_softc *sc = ifp->if_softc; + + TAILQ_REMOVE(&gifs, sc, gif_link); + + gif_delete_tunnel(sc); +#ifdef INET6 + if (sc->encap_cookie6 != NULL) { + err = encap_detach(sc->encap_cookie6); + KASSERT(err == 0, ("gif_clone_destroy: Unexpected error detaching encap_cookie6")); + } +#endif +#ifdef INET + if (sc->encap_cookie4 != NULL) { + err = encap_detach(sc->encap_cookie4); + KASSERT(err == 0, ("gif_clone_destroy: Unexpected error detaching encap_cookie4")); + } +#endif + err = ifnet_set_flags(ifp, 0, IFF_UP); + if (err != 0) { + printf("gif_clone_destroy: ifnet_set_flags failed %d\n", err); + } + + err = ifnet_detach(ifp); + if (err != 0) + panic("gif_clone_destroy: ifnet_detach(%p) failed %d\n", ifp, err); + FREE(sc, M_DEVBUF); + ngif--; + return 0; } static int @@ -488,7 +547,6 @@ gif_input( mbuf_t m, __unused char *frame_header) { - errno_t error; struct gif_softc *sc = ifnet_softc(ifp); bpf_tap_in(ifp, 0, m, &sc->gif_proto, sizeof(sc->gif_proto)); @@ -505,8 +563,11 @@ gif_input( * it occurs more times than we thought, we may change the policy * again. */ - error = proto_input(protocol_family, m); - ifnet_stat_increment_in(ifp, 1, m->m_pkthdr.len, 0); + if (proto_input(protocol_family, m) != 0) { + ifnet_stat_increment_in(ifp, 0, 0, 1); + m_freem(m); + } else + ifnet_stat_increment_in(ifp, 1, m->m_pkthdr.len, 0); return (0); } @@ -716,11 +777,6 @@ gif_ioctl( ifnet_set_flags(ifp, IFF_RUNNING | IFF_UP, IFF_RUNNING | IFF_UP); -#ifdef __APPLE__ - /* Make sure at least one unused device is still available */ - gif_create_dev(); -#endif - error = 0; break; @@ -839,7 +895,6 @@ gif_ioctl( return error; } -#ifndef __APPLE__ /* This function is not used in our stack */ void gif_delete_tunnel(sc) @@ -857,4 +912,3 @@ gif_delete_tunnel(sc) } /* change the IFF_UP flag as well? */ } -#endif diff --git a/bsd/net/if_gif.h b/bsd/net/if_gif.h index dc193f74c..dfba33645 100644 --- a/bsd/net/if_gif.h +++ b/bsd/net/if_gif.h @@ -90,6 +90,7 @@ struct gif_softc { TAILQ_ENTRY(gif_softc) gif_link; /* all gif's are linked */ bpf_tap_mode tap_mode; bpf_packet_func tap_callback; + char gif_ifname[IFNAMSIZ]; }; #define gif_ro gifsc_gifscr.gifscr_ro diff --git a/bsd/net/if_llreach.c b/bsd/net/if_llreach.c new file mode 100644 index 000000000..669beb0f4 --- /dev/null +++ b/bsd/net/if_llreach.c @@ -0,0 +1,565 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. 
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/*
+ * Link-layer Reachability Record
+ *
+ * Each interface maintains a red-black tree which contains records related
+ * to the on-link nodes which we are interested in communicating with. Each
+ * record gets allocated and inserted into the tree in the following manner:
+ * upon processing an ARP announcement or reply from a known node (i.e. there
+ * exists an ARP route entry for the node), and if a link-layer reachability
+ * record for the node doesn't yet exist; and, upon processing an ND6 RS/RA/
+ * NS/NA/redirect from a node, and if a link-layer reachability record for the
+ * node doesn't yet exist.
+ *
+ * Each newly created record is then referred to by the resolver route entry;
+ * if a record already exists, its reference count gets increased for the new
+ * resolver entry which now refers to it. A record gets removed from the tree
+ * and freed once its reference count drops to zero, i.e. when there are no
+ * more resolver entries referring to it.
+ *
+ * A record contains the link-layer protocol (e.g. Ethertype IP/IPv6), the
+ * HW address of the sender, the "last heard from" timestamp (lr_lastrcvd) and
+ * the number of references made to it (lr_reqcnt). Because the key for each
+ * record in the red-black tree includes the link-layer protocol, the
+ * namespace for the records is partitioned based on the type of link-layer
+ * protocol, i.e. an Ethertype IP link-layer record is only referred to by one
+ * or more ARP entries; an Ethertype IPv6 link-layer record is only referred to
+ * by one or more ND6 entries. Therefore, lr_reqcnt represents the number of
+ * resolver entry references to the record for the same protocol family.
+ *
+ * Upon receiving packets from the network, the protocol's input callback
+ * (e.g. ether_inet{6}_input) informs the corresponding resolver (ARP/ND6)
+ * about the (link-layer) origin of the packet. This results in searching
+ * for a matching record in the red-black tree of the interface on which the
+ * packet arrived. If there's no match, no further processing takes place.
+ * Otherwise, the lr_lastrcvd timestamp of the record is updated.
+ *
+ * When an IP/IPv6 packet is transmitted to the resolver (i.e. the destination
+ * is on-link), ARP/ND6 records the "last spoken to" timestamp in the route
+ * entry ({la,ln}_lastused).
+ *
+ * The reachability of the on-link node is determined by the following logic,
+ * upon sending a packet thru the resolver:
+ *
+ * a) If the record is used by exactly one resolver entry (lr_reqcnt
+ * is 1), i.e. the target host does not have IP/IPv6 aliases that we know
+ * of, check if lr_lastrcvd is "recent." If so, simply send the packet;
+ * otherwise, re-resolve the target node.
+ *
+ * b) If the record is shared by multiple resolver entries (lr_reqcnt is
+ * greater than 1), i.e. the target host has more than one IP/IPv6 alias
+ * on the same network interface, we can't rely on lr_lastrcvd alone, as
+ * one of the IP/IPv6 aliases could have been silently moved to another
+ * node for which we don't have a link-layer record. If lr_lastrcvd is
+ * not "recent", we re-resolve the target node. Otherwise, we perform
+ * an additional check against {la,ln}_lastused to see whether it is also
+ * "recent", relative to lr_lastrcvd. If so, simply send the packet;
+ * otherwise, re-resolve the target node.
+ *
+ * The value for "recent" is configurable by adjusting the basetime value for
+ * net.link.ether.inet.arp_llreach_base or net.inet6.icmp6.nd6_llreach_base.
+ * The default basetime value is 30 seconds, and the actual expiration time
+ * is calculated by multiplying the basetime value by a random factor,
+ * which results in a number between 15 and 45 seconds. Setting the basetime
+ * value to 0 effectively disables this feature for the corresponding resolver.
+ *
+ * Assumptions:
+ *
+ * The above logic is based upon the following assumptions:
+ *
+ * i) Network traffic is mostly bi-directional, i.e. the act of sending
+ * packets to an on-link node would most likely cause us to receive
+ * packets from that node.
+ *
+ * ii) If the on-link node's IP/IPv6 address silently moves to another
+ * on-link node that we are not aware of, non-unicast packets
+ * from the old node would trigger the record's lr_lastrcvd to be
+ * kept recent.
+ *
+ * We can mitigate the above by having the resolver check its {la,ln}_lastused
+ * timestamp at all times, i.e. not only when lr_reqcnt is greater than 1; but
+ * we currently optimize for the common cases.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/tree.h>
+#include <sys/sysctl.h>
+#include <sys/mcache.h>
+#include <sys/protosw.h>
+
+#include <net/if_dl.h>
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/if_llreach.h>
+#include <net/dlil.h>
+
+#include <kern/assert.h>
+#include <kern/locks.h>
+#include <kern/zalloc.h>
+
+#if INET6
+#include <netinet6/in6_var.h>
+#include <netinet6/nd6.h>
+#endif /* INET6 */
+
+static unsigned int iflr_size;		/* size of if_llreach */
+static struct zone *iflr_zone;		/* zone for if_llreach */
+
+#define IFLR_ZONE_MAX 128		/* maximum elements in zone */
+#define IFLR_ZONE_NAME "if_llreach"	/* zone name */
+
+static struct if_llreach *iflr_alloc(int);
+static void iflr_free(struct if_llreach *);
+static __inline int iflr_cmp(const struct if_llreach *,
+    const struct if_llreach *);
+static __inline int iflr_reachable(struct if_llreach *, int, u_int64_t);
+static int sysctl_llreach_ifinfo SYSCTL_HANDLER_ARGS;
+
+/* The following is protected by if_llreach_lock */
+RB_GENERATE_PREV(ll_reach_tree, if_llreach, lr_link, iflr_cmp);
+
+SYSCTL_DECL(_net_link_generic_system);
+
+SYSCTL_NODE(_net_link_generic_system, OID_AUTO, llreach_info,
+    CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_llreach_ifinfo,
+    "Per-interface tree of source link-layer reachability records");
+
+/*
+ * Link-layer reachability is based on the node constants in RFC4861.
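+ *
+ * As a worked example of the randomization (an editor's sketch, derived
+ * from the non-INET6 LL_COMPUTE_RTIME() definition just below): with the
+ * default basetime of 30000 msec, 30000 >> 10 == 29, so the computed
+ * reachable time is
+ *
+ *	(512 * 29 + (random() & (1024 * 29))) / 1000
+ *
+ * seconds, i.e. roughly 15 to 45 seconds, matching the window described
+ * earlier.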
+ */ +#if INET6 +#define LL_COMPUTE_RTIME(x) ND_COMPUTE_RTIME(x) +#else +#define LL_MIN_RANDOM_FACTOR 512 /* 1024 * 0.5 */ +#define LL_MAX_RANDOM_FACTOR 1536 /* 1024 * 1.5 */ +#define LL_COMPUTE_RTIME(x) \ + (((LL_MIN_RANDOM_FACTOR * (x >> 10)) + (random() & \ + ((LL_MAX_RANDOM_FACTOR - LL_MIN_RANDOM_FACTOR) * (x >> 10)))) / 1000) +#endif /* !INET6 */ + +void +ifnet_llreach_init(void) +{ + iflr_size = sizeof (struct if_llreach); + iflr_zone = zinit(iflr_size, + IFLR_ZONE_MAX * iflr_size, 0, IFLR_ZONE_NAME); + if (iflr_zone == NULL) { + panic("%s: failed allocating %s", __func__, IFLR_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(iflr_zone, Z_EXPAND, TRUE); + zone_change(iflr_zone, Z_CALLERACCT, FALSE); +} + +void +ifnet_llreach_ifattach(struct ifnet *ifp, boolean_t reuse) +{ + lck_rw_lock_exclusive(&ifp->if_llreach_lock); + /* Initialize link-layer source tree (if not already) */ + if (!reuse) + RB_INIT(&ifp->if_ll_srcs); + lck_rw_done(&ifp->if_llreach_lock); +} + +void +ifnet_llreach_ifdetach(struct ifnet *ifp) +{ +#pragma unused(ifp) + /* + * Nothing to do for now; the link-layer source tree might + * contain entries at this point, that are still referred + * to by route entries pointing to this ifp. + */ +} + +/* + * Link-layer source tree comparison function. + * + * An ordered predicate is necessary; bcmp() is not documented to return + * an indication of order, memcmp() is, and is an ISO C99 requirement. + */ +static __inline int +iflr_cmp(const struct if_llreach *a, const struct if_llreach *b) +{ + return (memcmp(&a->lr_key, &b->lr_key, sizeof (a->lr_key))); +} + +static __inline int +iflr_reachable(struct if_llreach *lr, int cmp_delta, u_int64_t tval) +{ + u_int64_t now; + u_int64_t expire; + + now = net_uptime(); /* current approx. uptime */ + /* + * No need for lr_lock; atomically read the last rcvd uptime. + */ + expire = lr->lr_lastrcvd + lr->lr_reachable; + /* + * If we haven't heard back from the local host for over + * lr_reachable seconds, consider that the host is no + * longer reachable. + */ + if (!cmp_delta) + return (expire >= now); + /* + * If the caller supplied a reference time, consider the + * host is reachable if the record hasn't expired (see above) + * and if the reference time is within the past lr_reachable + * seconds. + */ + return ((expire >= now) && (now - tval) < lr->lr_reachable); +} + +int +ifnet_llreach_reachable(struct if_llreach *lr) +{ + /* + * Check whether the cache is too old to be trusted. + */ + return (iflr_reachable(lr, 0, 0)); +} + +int +ifnet_llreach_reachable_delta(struct if_llreach *lr, u_int64_t tval) +{ + /* + * Check whether the cache is too old to be trusted. + */ + return (iflr_reachable(lr, 1, tval)); +} + +void +ifnet_llreach_set_reachable(struct ifnet *ifp, u_int16_t llproto, void *addr, + unsigned int alen) +{ + struct if_llreach find, *lr; + + VERIFY(alen == IF_LLREACH_MAXLEN); /* for now */ + + find.lr_key.proto = llproto; + bcopy(addr, &find.lr_key.addr, IF_LLREACH_MAXLEN); + + lck_rw_lock_shared(&ifp->if_llreach_lock); + lr = RB_FIND(ll_reach_tree, &ifp->if_ll_srcs, &find); + if (lr == NULL) { + lck_rw_done(&ifp->if_llreach_lock); + return; + } + /* + * No need for lr_lock; atomically update the last rcvd uptime. 
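+ *
+ * (iflr_reachable() relies on the same convention when it reads
+ * lr_lastrcvd without taking lr_lock.)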
+ */ + lr->lr_lastrcvd = net_uptime(); + lck_rw_done(&ifp->if_llreach_lock); +} + +struct if_llreach * +ifnet_llreach_alloc(struct ifnet *ifp, u_int16_t llproto, void *addr, + unsigned int alen, u_int64_t llreach_base) +{ + struct if_llreach find, *lr; + struct timeval now; + + if (llreach_base == 0) + return (NULL); + + VERIFY(alen == IF_LLREACH_MAXLEN); /* for now */ + + find.lr_key.proto = llproto; + bcopy(addr, &find.lr_key.addr, IF_LLREACH_MAXLEN); + + lck_rw_lock_shared(&ifp->if_llreach_lock); + lr = RB_FIND(ll_reach_tree, &ifp->if_ll_srcs, &find); + if (lr != NULL) { +found: + IFLR_LOCK(lr); + VERIFY(lr->lr_reqcnt >= 1); + lr->lr_reqcnt++; + VERIFY(lr->lr_reqcnt != 0); + IFLR_ADDREF_LOCKED(lr); /* for caller */ + lr->lr_lastrcvd = net_uptime(); /* current approx. uptime */ + IFLR_UNLOCK(lr); + lck_rw_done(&ifp->if_llreach_lock); + return (lr); + } + + if (!lck_rw_lock_shared_to_exclusive(&ifp->if_llreach_lock)) + lck_rw_lock_exclusive(&ifp->if_llreach_lock); + + lck_rw_assert(&ifp->if_llreach_lock, LCK_RW_ASSERT_EXCLUSIVE); + + /* in case things have changed while becoming writer */ + lr = RB_FIND(ll_reach_tree, &ifp->if_ll_srcs, &find); + if (lr != NULL) + goto found; + + lr = iflr_alloc(M_WAITOK); + if (lr == NULL) { + lck_rw_done(&ifp->if_llreach_lock); + return (NULL); + } + IFLR_LOCK(lr); + lr->lr_reqcnt++; + VERIFY(lr->lr_reqcnt == 1); + IFLR_ADDREF_LOCKED(lr); /* for RB tree */ + IFLR_ADDREF_LOCKED(lr); /* for caller */ + lr->lr_lastrcvd = net_uptime(); /* current approx. uptime */ + lr->lr_baseup = lr->lr_lastrcvd; /* base uptime */ + microtime(&now); + lr->lr_basecal = now.tv_sec; /* base calendar time */ + lr->lr_basereachable = llreach_base; + lr->lr_reachable = LL_COMPUTE_RTIME(lr->lr_basereachable * 1000); + lr->lr_debug |= IFD_ATTACHED; + lr->lr_ifp = ifp; + lr->lr_key.proto = llproto; + bcopy(addr, &lr->lr_key.addr, IF_LLREACH_MAXLEN); + RB_INSERT(ll_reach_tree, &ifp->if_ll_srcs, lr); + IFLR_UNLOCK(lr); + lck_rw_done(&ifp->if_llreach_lock); + + return (lr); +} + +void +ifnet_llreach_free(struct if_llreach *lr) +{ + struct ifnet *ifp; + + /* no need to lock here; lr_ifp never changes */ + ifp = lr->lr_ifp; + + lck_rw_lock_exclusive(&ifp->if_llreach_lock); + IFLR_LOCK(lr); + if (lr->lr_reqcnt == 0) { + panic("%s: lr=%p negative reqcnt", __func__, lr); + /* NOTREACHED */ + } + --lr->lr_reqcnt; + if (lr->lr_reqcnt > 0) { + IFLR_UNLOCK(lr); + lck_rw_done(&ifp->if_llreach_lock); + IFLR_REMREF(lr); /* for caller */ + return; + } + if (!(lr->lr_debug & IFD_ATTACHED)) { + panic("%s: Attempt to detach an unattached llreach lr=%p", + __func__, lr); + /* NOTREACHED */ + } + lr->lr_debug &= ~IFD_ATTACHED; + RB_REMOVE(ll_reach_tree, &ifp->if_ll_srcs, lr); + IFLR_UNLOCK(lr); + lck_rw_done(&ifp->if_llreach_lock); + + IFLR_REMREF(lr); /* for RB tree */ + IFLR_REMREF(lr); /* for caller */ +} + +u_int64_t +ifnet_llreach_up2cal(struct if_llreach *lr, u_int64_t uptime) +{ + u_int64_t calendar = 0; + + if (uptime != 0) { + struct timeval cnow; + u_int64_t unow; + + getmicrotime(&cnow); /* current calendar time */ + unow = net_uptime(); /* current approx. uptime */ + /* + * Take into account possible calendar time changes; + * adjust base calendar value if necessary, i.e. + * the calendar skew should equate to the uptime skew. 
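+ *
+ * Worked example (editor's illustration, not part of the original
+ * source): with lr_baseup = 100 s recorded at lr_basecal = 1000 s,
+ * lr_reachable = 30 s and no clock skew, a record last heard from at
+ * uptime 160 s is exported below as 1000 + 30 + (160 - 100) == 1090,
+ * i.e. 30 s past the calendar time of the last receive.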
+ */ + lr->lr_basecal += (cnow.tv_sec - lr->lr_basecal) - + (unow - lr->lr_baseup); + + calendar = lr->lr_basecal + lr->lr_reachable + + (uptime - lr->lr_baseup); + } + + return (calendar); +} + +static struct if_llreach * +iflr_alloc(int how) +{ + struct if_llreach *lr; + + lr = (how == M_WAITOK) ? zalloc(iflr_zone) : zalloc_noblock(iflr_zone); + if (lr != NULL) { + bzero(lr, iflr_size); + lck_mtx_init(&lr->lr_lock, ifnet_lock_group, ifnet_lock_attr); + lr->lr_debug |= IFD_ALLOC; + } + return (lr); +} + +static void +iflr_free(struct if_llreach *lr) +{ + IFLR_LOCK(lr); + if (lr->lr_debug & IFD_ATTACHED) { + panic("%s: attached lr=%p is being freed", __func__, lr); + /* NOTREACHED */ + } else if (!(lr->lr_debug & IFD_ALLOC)) { + panic("%s: lr %p cannot be freed", __func__, lr); + /* NOTREACHED */ + } else if (lr->lr_refcnt != 0) { + panic("%s: non-zero refcount lr=%p", __func__, lr); + /* NOTREACHED */ + } else if (lr->lr_reqcnt != 0) { + panic("%s: non-zero reqcnt lr=%p", __func__, lr); + /* NOTREACHED */ + } + lr->lr_debug &= ~IFD_ALLOC; + IFLR_UNLOCK(lr); + + lck_mtx_destroy(&lr->lr_lock, ifnet_lock_group); + zfree(iflr_zone, lr); +} + +void +iflr_addref(struct if_llreach *lr, int locked) +{ + if (!locked) + IFLR_LOCK(lr); + else + IFLR_LOCK_ASSERT_HELD(lr); + + if (++lr->lr_refcnt == 0) { + panic("%s: lr=%p wraparound refcnt", __func__, lr); + /* NOTREACHED */ + } + if (!locked) + IFLR_UNLOCK(lr); +} + +void +iflr_remref(struct if_llreach *lr) +{ + IFLR_LOCK(lr); + if (lr->lr_refcnt == 0) { + panic("%s: lr=%p negative refcnt", __func__, lr); + /* NOTREACHED */ + } + --lr->lr_refcnt; + if (lr->lr_refcnt > 0) { + IFLR_UNLOCK(lr); + return; + } + IFLR_UNLOCK(lr); + + iflr_free(lr); /* deallocate it */ +} + +void +ifnet_lr2ri(struct if_llreach *lr, struct rt_reach_info *ri) +{ + struct if_llreach_info lri; + + IFLR_LOCK_ASSERT_HELD(lr); + + bzero(ri, sizeof (*ri)); + ifnet_lr2lri(lr, &lri); + ri->ri_refcnt = lri.lri_refcnt; + ri->ri_probes = lri.lri_probes; + ri->ri_rcv_expire = lri.lri_expire; +} + +void +ifnet_lr2lri(struct if_llreach *lr, struct if_llreach_info *lri) +{ + IFLR_LOCK_ASSERT_HELD(lr); + + bzero(lri, sizeof (*lri)); + /* + * Note here we return request count, not actual memory refcnt. 
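+ *
+ * (That is, lr_reqcnt, the number of resolver entries referring to
+ * this record, rather than lr_refcnt, which also counts internal
+ * references such as the RB tree's own.)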
+ */ + lri->lri_refcnt = lr->lr_reqcnt; + lri->lri_ifindex = lr->lr_ifp->if_index; + lri->lri_probes = lr->lr_probes; + lri->lri_expire = ifnet_llreach_up2cal(lr, lr->lr_lastrcvd); + lri->lri_proto = lr->lr_key.proto; + bcopy(&lr->lr_key.addr, &lri->lri_addr, IF_LLREACH_MAXLEN); +} + +static int +sysctl_llreach_ifinfo SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp) + int *name, retval = 0; + unsigned int namelen; + uint32_t ifindex; + struct if_llreach *lr; + struct if_llreach_info lri; + struct ifnet *ifp; + + name = (int *)arg1; + namelen = (unsigned int)arg2; + + if (req->newptr != USER_ADDR_NULL) + return (EPERM); + + if (namelen != 1) + return (EINVAL); + + ifindex = name[0]; + ifnet_head_lock_shared(); + if (ifindex <= 0 || ifindex > (u_int)if_index) { + printf("%s: ifindex %u out of range\n", __func__, ifindex); + ifnet_head_done(); + return (ENOENT); + } + + ifp = ifindex2ifnet[ifindex]; + ifnet_head_done(); + if (ifp == NULL) { + printf("%s: no ifp for ifindex %u\n", __func__, ifindex); + return (ENOENT); + } + + lck_rw_lock_shared(&ifp->if_llreach_lock); + RB_FOREACH(lr, ll_reach_tree, &ifp->if_ll_srcs) { + /* Export to if_llreach_info structure */ + IFLR_LOCK(lr); + ifnet_lr2lri(lr, &lri); + IFLR_UNLOCK(lr); + + if ((retval = SYSCTL_OUT(req, &lri, sizeof (lri))) != 0) + break; + } + lck_rw_done(&ifp->if_llreach_lock); + + return (retval); +} diff --git a/bsd/net/if_llreach.h b/bsd/net/if_llreach.h new file mode 100644 index 000000000..e922fb0e4 --- /dev/null +++ b/bsd/net/if_llreach.h @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _NET_IF_LLREACH_H_ +#define _NET_IF_LLREACH_H_ + +#ifdef PRIVATE +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> + +/* + * Per-interface link-layer reachability information (private). 
+ */ +#define IF_LLREACHINFO_ADDRLEN 64 /* max ll addr len */ + +struct if_llreach_info { + u_int32_t lri_refcnt; /* reference count */ + u_int32_t lri_ifindex; /* interface index */ + u_int64_t lri_expire; /* expiration (calendar) time */ + u_int32_t lri_probes; /* total # of probes */ + u_int16_t lri_reserved; /* for future use */ + u_int16_t lri_proto; /* ll proto */ + u_int8_t lri_addr[IF_LLREACHINFO_ADDRLEN]; /* ll addr */ +}; + +#ifdef XNU_KERNEL_PRIVATE +#include <sys/tree.h> +#include <kern/lock.h> +#include <net/ethernet.h> +#include <netinet/in.h> +#if INET6 +#include <netinet6/in6_var.h> +#include <netinet6/nd6.h> +#endif /* INET6 */ + +/* + * Link-layer reachability is based off node constants in RFC4861. + */ +#if INET6 +#define LL_BASE_REACHABLE REACHABLE_TIME +#else +#define LL_BASE_REACHABLE 30000 /* msec */ +#endif /* !INET6 */ + +/* + * Per-interface link-layer reachability. (Currently only for ARP/Ethernet.) + */ +#define IF_LLREACH_MAXLEN ETHER_ADDR_LEN + +struct if_llreach { + decl_lck_mtx_data(, lr_lock); + RB_ENTRY(if_llreach) lr_link; /* RB tree links */ + struct ifnet *lr_ifp; /* back pointer to ifnet */ + u_int32_t lr_refcnt; /* reference count */ + u_int32_t lr_reqcnt; /* RB tree request count */ + u_int32_t lr_debug; /* see ifa_debug flags */ + u_int32_t lr_probes; /* number of probes so far */ + u_int64_t lr_basecal; /* base calendar time */ + u_int64_t lr_baseup; /* base uptime */ + u_int64_t lr_lastrcvd; /* last-heard-of timestamp */ + u_int32_t lr_basereachable; /* baseline time */ + u_int32_t lr_reachable; /* reachable time */ + struct lr_key_s { + u_int16_t proto; /* ll proto */ + u_int8_t addr[IF_LLREACH_MAXLEN]; /* ll addr */ + } lr_key; +}; + +RB_PROTOTYPE_SC_PREV(__private_extern__, ll_reach_tree, if_llreach, + ls_link, ifllr_cmp); + +#define IFLR_LOCK_ASSERT_HELD(_iflr) \ + lck_mtx_assert(&(_iflr)->lr_lock, LCK_MTX_ASSERT_OWNED) + +#define IFLR_LOCK_ASSERT_NOTHELD(_iflr) \ + lck_mtx_assert(&(_iflr)->lr_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define IFLR_LOCK(_iflr) \ + lck_mtx_lock(&(_iflr)->lr_lock) + +#define IFLR_LOCK_SPIN(_iflr) \ + lck_mtx_lock_spin(&(_iflr)->lr_lock) + +#define IFLR_CONVERT_LOCK(_iflr) do { \ + IFLR_LOCK_ASSERT_HELD(_iflr); \ + lck_mtx_convert_spin(&(_iflr)->lr_lock); \ +} while (0) + +#define IFLR_UNLOCK(_iflr) \ + lck_mtx_unlock(&(_iflr)->lr_lock) + +#define IFLR_ADDREF(_iflr) \ + iflr_addref(_iflr, 0) + +#define IFLR_ADDREF_LOCKED(_iflr) \ + iflr_addref(_iflr, 1) + +#define IFLR_REMREF(_iflr) \ + iflr_remref(_iflr) + +extern void ifnet_llreach_init(void); +extern void ifnet_llreach_ifattach(struct ifnet *, boolean_t); +extern void ifnet_llreach_ifdetach(struct ifnet *); +extern struct if_llreach *ifnet_llreach_alloc(struct ifnet *, u_int16_t, void *, + unsigned int, u_int64_t); +extern void ifnet_llreach_free(struct if_llreach *); +extern int ifnet_llreach_reachable(struct if_llreach *); +extern int ifnet_llreach_reachable_delta(struct if_llreach *, u_int64_t); +extern void ifnet_llreach_set_reachable(struct ifnet *, u_int16_t, void *, + unsigned int); +extern u_int64_t ifnet_llreach_up2cal(struct if_llreach *, u_int64_t); +extern void ifnet_lr2ri(struct if_llreach *, struct rt_reach_info *); +extern void ifnet_lr2lri(struct if_llreach *, struct if_llreach_info *); +extern void iflr_addref(struct if_llreach *, int); +extern void iflr_remref(struct if_llreach *); +#endif /* XNU_KERNEL_PRIVATE */ + +#ifdef __cplusplus +} +#endif +#endif /* PRIVATE */ +#endif /* !_NET_IF_LLREACH_H_ */ diff --git a/bsd/net/if_loop.c 
b/bsd/net/if_loop.c index f62bdc362..5ba5b11a5 100644 --- a/bsd/net/if_loop.c +++ b/bsd/net/if_loop.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,6 +79,7 @@ #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/sockio.h> +#include <sys/mcache.h> #include <net/if.h> #include <net/if_types.h> @@ -106,11 +107,6 @@ extern struct ifqueue atalkintrq; #endif -#include "bpfilter.h" -#if NBPFILTER > 0 -#include <net/bpfdesc.h> -#endif - #if CONFIG_MACF_NET #include <security/mac_framework.h> #endif @@ -214,11 +210,11 @@ lo_output( if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = ifp; - ifp->if_ibytes += m->m_pkthdr.len; - ifp->if_obytes += m->m_pkthdr.len; + atomic_add_64(&ifp->if_ibytes, m->m_pkthdr.len); + atomic_add_64(&ifp->if_obytes, m->m_pkthdr.len); - ifp->if_opackets++; - ifp->if_ipackets++; + atomic_add_64(&ifp->if_opackets, 1); + atomic_add_64(&ifp->if_ipackets, 1); m->m_pkthdr.header = mtod(m, char *); if (apple_hwcksum_tx != 0) { @@ -339,7 +335,9 @@ loioctl( case SIOCSIFADDR: ifnet_set_flags(ifp, IFF_UP | IFF_RUNNING, IFF_UP | IFF_RUNNING); ifa = (struct ifaddr *)data; + IFA_LOCK_SPIN(ifa); ifa->ifa_rtrequest = lortrequest; + IFA_UNLOCK(ifa); /* * Everything else is done at a higher level. */ @@ -475,7 +473,9 @@ More than one loopback interface is not supported. ifnet_set_mtu(lo_ifp, LOMTU); ifnet_set_flags(lo_ifp, IFF_LOOPBACK | IFF_MULTICAST, IFF_LOOPBACK | IFF_MULTICAST); - ifnet_set_offload(lo_ifp, IFNET_CSUM_IP | IFNET_CSUM_TCP | IFNET_CSUM_UDP | IFNET_CSUM_FRAGMENT | IFNET_IP_FRAGMENT | IFNET_MULTIPAGES); + ifnet_set_offload(lo_ifp, IFNET_CSUM_IP | IFNET_CSUM_TCP | IFNET_CSUM_UDP | + IFNET_CSUM_TCPIPV6 | IFNET_CSUM_UDPIPV6 | IFNET_IPV6_FRAGMENT | + IFNET_CSUM_FRAGMENT | IFNET_IP_FRAGMENT | IFNET_MULTIPAGES); ifnet_set_hdrlen(lo_ifp, sizeof(struct loopback_header)); ifnet_set_eflags(lo_ifp, IFEF_SENDLIST, IFEF_SENDLIST); diff --git a/bsd/net/if_media.h b/bsd/net/if_media.h index 12cbc871b..32afe224d 100644 --- a/bsd/net/if_media.h +++ b/bsd/net/if_media.h @@ -221,7 +221,7 @@ int ifmedia_ioctl(struct ifnet *ifp, struct ifreq *ifr, #define IFM_FDX 0x00100000 /* Force full duplex */ #define IFM_HDX 0x00200000 /* Force half duplex */ #define IFM_FLOW 0x00400000 /* enable hardware flow control */ -#define IFM_EEE 0x00800000 /* Support energy efficient ethernet */ +#define IFM_EEE 0x00800000 /* Driver defined flag */ #define IFM_FLAG0 0x01000000 /* Driver defined flag */ #define IFM_FLAG1 0x02000000 /* Driver defined flag */ #define IFM_FLAG2 0x04000000 /* Driver defined flag */ diff --git a/bsd/net/if_mib.c b/bsd/net/if_mib.c index b21529b2a..9ab76f698 100644 --- a/bsd/net/if_mib.c +++ b/bsd/net/if_mib.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -67,8 +67,6 @@ #include <net/if_mib.h> #include <net/if_var.h> -#if NETMIBS - /* * A sysctl(3) MIB for generic interface information. 
This information * is exported in the net.link.generic branch, which has the following @@ -97,31 +95,17 @@ SYSCTL_DECL(_net_link_generic); SYSCTL_NODE(_net_link_generic, IFMIB_SYSTEM, system, CTLFLAG_RD|CTLFLAG_LOCKED, 0, "Variables global to all interfaces"); -SYSCTL_INT(_net_link_generic_system, IFMIB_IFCOUNT, ifcount, CTLFLAG_RD, +SYSCTL_INT(_net_link_generic_system, IFMIB_IFCOUNT, ifcount, CTLFLAG_RD | CTLFLAG_LOCKED, &if_index, 0, "Number of configured interfaces"); static int sysctl_ifdata SYSCTL_HANDLER_ARGS; -SYSCTL_NODE(_net_link_generic, IFMIB_IFDATA, ifdata, CTLFLAG_RD, +SYSCTL_NODE(_net_link_generic, IFMIB_IFDATA, ifdata, CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_ifdata, "Interface table"); static int sysctl_ifalldata SYSCTL_HANDLER_ARGS; -SYSCTL_NODE(_net_link_generic, IFMIB_IFALLDATA, ifalldata, CTLFLAG_RD, +SYSCTL_NODE(_net_link_generic, IFMIB_IFALLDATA, ifalldata, CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_ifalldata, "Interface table"); -extern int dlil_multithreaded_input; -SYSCTL_INT(_net_link_generic_system, OID_AUTO, multi_threaded_input, CTLFLAG_RW, - &dlil_multithreaded_input , 0, "Uses multiple input thread for DLIL input"); -#ifdef IFNET_INPUT_SANITY_CHK -extern int dlil_input_sanity_check; -SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check, CTLFLAG_RW, - &dlil_input_sanity_check , 0, "Turn on sanity checking in DLIL input"); -#endif - -extern int dlil_verbose; -SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose, CTLFLAG_RW, - &dlil_verbose, 0, "Log DLIL error messages"); - - static int make_ifmibdata(struct ifnet *, int *, struct sysctl_req *); int @@ -140,7 +124,7 @@ make_ifmibdata(struct ifnet *ifp, int *name, struct sysctl_req *req) /* * Make sure the interface is in use */ - if (ifp->if_refcnt > 0) { + if (ifnet_is_attached(ifp, 0)) { snprintf(ifmd.ifmd_name, sizeof(ifmd.ifmd_name), "%s%d", ifp->if_name, ifp->if_unit); @@ -191,11 +175,14 @@ make_ifmibdata(struct ifnet *ifp, int *name, struct sysctl_req *req) #endif /* IF_MIB_WR */ break; -#if PKT_PRIORITY - case IFDATA_SUPPLEMENTAL: - error = SYSCTL_OUT(req, &ifp->if_tc, sizeof(struct if_traffic_class)); + case IFDATA_SUPPLEMENTAL: { + struct if_traffic_class if_tc; + + if_copy_traffic_class(ifp, &if_tc); + + error = SYSCTL_OUT(req, &if_tc, sizeof(struct if_traffic_class)); break; -#endif /* PKT_PRIORITY */ + } } return error; @@ -211,23 +198,24 @@ sysctl_ifdata SYSCTL_HANDLER_ARGS /* XXX bad syntax! */ struct ifnet *ifp; if (namelen != 2) - return EINVAL; + return (EINVAL); + ifnet_head_lock_shared(); if (name[0] <= 0 || name[0] > if_index || - (ifp = ifindex2ifnet[name[0]]) == NULL || - ifp->if_refcnt == 0) { + (ifp = ifindex2ifnet[name[0]]) == NULL) { ifnet_head_done(); - return ENOENT; + return (ENOENT); } + ifnet_reference(ifp); ifnet_head_done(); ifnet_lock_shared(ifp); - error = make_ifmibdata(ifp, name, req); - ifnet_lock_done(ifp); - - return error; + + ifnet_release(ifp); + + return (error); } int @@ -240,20 +228,18 @@ sysctl_ifalldata SYSCTL_HANDLER_ARGS /* XXX bad syntax! */ struct ifnet *ifp; if (namelen != 2) - return EINVAL; + return (EINVAL); ifnet_head_lock_shared(); TAILQ_FOREACH(ifp, &ifnet_head, if_link) { ifnet_lock_shared(ifp); - + error = make_ifmibdata(ifp, name, req); - + ifnet_lock_done(ifp); - if (error) + if (error != 0) break; } ifnet_head_done(); return error; } - -#endif diff --git a/bsd/net/if_mib.h b/bsd/net/if_mib.h index 36d2667a4..5b773bddf 100644 --- a/bsd/net/if_mib.h +++ b/bsd/net/if_mib.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Computer, Inc. 
All rights reserved. + * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * diff --git a/bsd/net/if_pflog.c b/bsd/net/if_pflog.c index 8e7480911..ae2f9254c 100644 --- a/bsd/net/if_pflog.c +++ b/bsd/net/if_pflog.c @@ -68,6 +68,7 @@ #include <sys/proc_internal.h> #include <sys/socket.h> #include <sys/ioctl.h> +#include <sys/mcache.h> #include <net/if.h> #include <net/if_var.h> @@ -344,8 +345,8 @@ pflog_packet(struct pfi_kif *kif, struct mbuf *m, sa_family_t af, u_int8_t dir, } #endif /* INET */ - ifn->if_opackets++; - ifn->if_obytes += m->m_pkthdr.len; + atomic_add_64(&ifn->if_opackets, 1); + atomic_add_64(&ifn->if_obytes, m->m_pkthdr.len); switch (dir) { case PF_IN: diff --git a/bsd/net/if_stf.c b/bsd/net/if_stf.c index 96b9664b5..c9d24e249 100644 --- a/bsd/net/if_stf.c +++ b/bsd/net/if_stf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -120,6 +120,8 @@ #include <sys/malloc.h> +#include <kern/locks.h> + #include <net/if.h> #include <net/route.h> #include <net/if_types.h> @@ -148,9 +150,10 @@ #include <security/mac_framework.h> #endif -#define IN6_IS_ADDR_6TO4(x) (ntohs((x)->s6_addr16[0]) == 0x2002) #define GET_V4(x) ((const struct in_addr *)(&(x)->s6_addr16[1])) +static lck_grp_t *stf_mtx_grp; + struct stf_softc { ifnet_t sc_if; /* common area */ u_int32_t sc_protocol_family; /* dlil protocol attached */ @@ -159,6 +162,7 @@ struct stf_softc { struct route_in6 __sc_ro6; /* just for safety */ } __sc_ro46; #define sc_ro __sc_ro46.__sc_ro4 + decl_lck_mtx_data(, sc_ro_mtx); const struct encaptab *encap_cookie; bpf_tap_mode tap_mode; bpf_packet_func tap_callback; @@ -167,14 +171,16 @@ struct stf_softc { void stfattach (void); static int ip_stf_ttl = 40; +static int stf_init_done; static void in_stf_input(struct mbuf *, int); +static void stfinit(void); extern struct domain inetdomain; struct protosw in_stf_protosw = { SOCK_RAW, &inetdomain, IPPROTO_IPV6, PR_ATOMIC|PR_ADDR, in_stf_input, NULL, NULL, rip_ctloutput, NULL, - NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, &rip_usrreqs, NULL, rip_unlock, NULL, {NULL, NULL}, NULL, {0} @@ -192,6 +198,15 @@ static void stf_rtrequest(int, struct rtentry *, struct sockaddr *); static errno_t stf_ioctl(ifnet_t ifp, u_long cmd, void *data); static errno_t stf_output(ifnet_t ifp, mbuf_t m); +static void +stfinit(void) +{ + if (!stf_init_done) { + stf_mtx_grp = lck_grp_alloc_init("stf", LCK_GRP_ATTR_NULL); + stf_init_done = 1; + } +} + /* * gif_input is the input handler for IP and IPv6 attached to gif */ @@ -202,7 +217,8 @@ stf_media_input( mbuf_t m, __unused char *frame_header) { - proto_input(protocol_family, m); + if (proto_input(protocol_family, m) != 0) + m_freem(m); return (0); } @@ -297,6 +313,8 @@ stfattach(void) const struct encaptab *p; struct ifnet_init_params stf_init; + stfinit(); + error = proto_register_plumber(PF_INET6, APPLE_IF_FAM_STF, stf_attach_inet6, NULL); if (error != 0) @@ -318,6 +336,7 @@ stfattach(void) return; } sc->encap_cookie = p; + lck_mtx_init(&sc->sc_ro_mtx, stf_mtx_grp, LCK_ATTR_NULL); bzero(&stf_init, sizeof(stf_init)); stf_init.name = "stf"; @@ -336,6 +355,7 @@ stfattach(void) if (error != 0) { printf("stfattach, ifnet_allocate failed - %d\n", error); encap_detach(sc->encap_cookie); + lck_mtx_destroy(&sc->sc_ro_mtx, stf_mtx_grp); FREE(sc, M_DEVBUF); return; } @@ -355,6 +375,7 @@ stfattach(void) printf("stfattach: 
ifnet_attach returned error=%d\n", error); encap_detach(sc->encap_cookie); ifnet_release(sc->sc_if); + lck_mtx_destroy(&sc->sc_ro_mtx, stf_mtx_grp); FREE(sc, M_DEVBUF); return; } @@ -404,9 +425,11 @@ stf_encapcheck( * local 6to4 address. * success on: dst = 10.1.1.1, ia6->ia_addr = 2002:0a01:0101:... */ + IFA_LOCK(&ia6->ia_ifa); if (bcmp(GET_V4(&ia6->ia_addr.sin6_addr), &ip.ip_dst, sizeof(ip.ip_dst)) != 0) { - ifafree(&ia6->ia_ifa); + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); return 0; } /* @@ -421,11 +444,13 @@ stf_encapcheck( b = ip.ip_src; b.s_addr &= GET_V4(&ia6->ia_prefixmask.sin6_addr)->s_addr; if (a.s_addr != b.s_addr) { - ifafree(&ia6->ia_ifa); + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); return 0; } /* stf interface makes single side match only */ - ifafree(&ia6->ia_ifa); + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); return 32; } @@ -438,38 +463,46 @@ stf_getsrcifa6(struct ifnet *ifp) struct in_addr in; ifnet_lock_shared(ifp); - for (ia = ifp->if_addrlist.tqh_first; - ia; - ia = ia->ifa_list.tqe_next) - { - if (ia->ifa_addr == NULL) + for (ia = ifp->if_addrlist.tqh_first; ia; ia = ia->ifa_list.tqe_next) { + IFA_LOCK(ia); + if (ia->ifa_addr == NULL) { + IFA_UNLOCK(ia); continue; - if (ia->ifa_addr->sa_family != AF_INET6) + } + if (ia->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ia); continue; + } sin6 = (struct sockaddr_in6 *)ia->ifa_addr; - if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) + if (!IN6_IS_ADDR_6TO4(&sin6->sin6_addr)) { + IFA_UNLOCK(ia); continue; - + } bcopy(GET_V4(&sin6->sin6_addr), &in, sizeof(in)); + IFA_UNLOCK(ia); lck_rw_lock_shared(in_ifaddr_rwlock); for (ia4 = TAILQ_FIRST(&in_ifaddrhead); ia4; ia4 = TAILQ_NEXT(ia4, ia_link)) { - if (ia4->ia_addr.sin_addr.s_addr == in.s_addr) + IFA_LOCK(&ia4->ia_ifa); + if (ia4->ia_addr.sin_addr.s_addr == in.s_addr) { + IFA_UNLOCK(&ia4->ia_ifa); break; + } + IFA_UNLOCK(&ia4->ia_ifa); } lck_rw_done(in_ifaddr_rwlock); if (ia4 == NULL) continue; - ifaref(ia); + IFA_ADDREF(ia); /* for caller */ ifnet_lock_done(ifp); - return (struct in6_ifaddr *)ia; + return ((struct in6_ifaddr *)ia); } ifnet_lock_done(ifp); - return NULL; + return (NULL); } int @@ -491,6 +524,7 @@ stf_pre_output( struct ip6_hdr *ip6; struct in6_ifaddr *ia6; struct sockaddr_in *dst4; + struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; errno_t result = 0; sc = ifnet_softc(ifp); @@ -516,7 +550,7 @@ stf_pre_output( m = m_pullup(m, sizeof(*ip6)); if (!m) { *m0 = NULL; /* makes sure this won't be double freed */ - ifafree(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); return ENOBUFS; } } @@ -532,7 +566,7 @@ stf_pre_output( else if (IN6_IS_ADDR_6TO4(&dst6->sin6_addr)) in4 = GET_V4(&dst6->sin6_addr); else { - ifafree(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); return ENETUNREACH; } @@ -548,15 +582,17 @@ stf_pre_output( m = m_pullup(m, sizeof(struct ip)); if (m == NULL) { *m0 = NULL; - ifafree(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); return ENOBUFS; } ip = mtod(m, struct ip *); bzero(ip, sizeof(*ip)); + IFA_LOCK_SPIN(&ia6->ia_ifa); bcopy(GET_V4(&((struct sockaddr_in6 *)&ia6->ia_addr)->sin6_addr), &ip->ip_src, sizeof(ip->ip_src)); + IFA_UNLOCK(&ia6->ia_ifa); bcopy(in4, &ip->ip_dst, sizeof(ip->ip_dst)); ip->ip_p = IPPROTO_IPV6; ip->ip_ttl = ip_stf_ttl; @@ -566,11 +602,11 @@ stf_pre_output( else ip_ecn_ingress(ECN_NOCARE, &ip->ip_tos, &tos); + lck_mtx_lock(&sc->sc_ro_mtx); dst4 = (struct sockaddr_in *)&sc->sc_ro.ro_dst; if (dst4->sin_family != AF_INET || bcmp(&dst4->sin_addr, &ip->ip_dst, sizeof(ip->ip_dst)) != 0) { - /* cache route doesn't match */ - 
printf("stf_output: cached route doesn't match \n"); + /* cache route doesn't match: always the case during the first use */ dst4->sin_family = AF_INET; dst4->sin_len = sizeof(struct sockaddr_in); bcopy(&ip->ip_dst, &dst4->sin_addr, sizeof(dst4->sin_addr)); @@ -580,21 +616,15 @@ stf_pre_output( } } - if (sc->sc_ro.ro_rt == NULL) { - rtalloc(&sc->sc_ro); - if (sc->sc_ro.ro_rt == NULL) { - ifafree(&ia6->ia_ifa); - return ENETUNREACH; - } - } + result = ip_output_list(m, 0, NULL, &sc->sc_ro, IP_OUTARGS, NULL, &ipoa); + lck_mtx_unlock(&sc->sc_ro_mtx); - result = ip_output_list(m, 0, NULL, &sc->sc_ro, 0, NULL, NULL); /* Assumption: ip_output will free mbuf on errors */ /* All the output processing is done here, don't let stf_output be called */ if (result == 0) result = EJUSTRETURN; *m0 = NULL; - ifafree(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); return result; } static errno_t @@ -635,12 +665,17 @@ stf_checkaddr4( ia4; ia4 = TAILQ_NEXT(ia4, ia_link)) { - if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) + IFA_LOCK(&ia4->ia_ifa); + if ((ia4->ia_ifa.ifa_ifp->if_flags & IFF_BROADCAST) == 0) { + IFA_UNLOCK(&ia4->ia_ifa); continue; + } if (in->s_addr == ia4->ia_broadaddr.sin_addr.s_addr) { + IFA_UNLOCK(&ia4->ia_ifa); lck_rw_done(in_ifaddr_rwlock); return -1; } + IFA_UNLOCK(&ia4->ia_ifa); } lck_rw_done(in_ifaddr_rwlock); @@ -820,7 +855,13 @@ stf_ioctl( switch (cmd) { case SIOCSIFADDR: ifa = (struct ifaddr *)data; - if (ifa == NULL || ifa->ifa_addr->sa_family != AF_INET6) { + if (ifa == NULL) { + error = EAFNOSUPPORT; + break; + } + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); error = EAFNOSUPPORT; break; } @@ -829,10 +870,16 @@ stf_ioctl( if ( !(ifnet_flags( ifp ) & IFF_UP) ) { /* do this only if the interface is not already up */ ifa->ifa_rtrequest = stf_rtrequest; + IFA_UNLOCK(ifa); ifnet_set_flags(ifp, IFF_UP, IFF_UP); + } else { + IFA_UNLOCK(ifa); } - } else + } else { + IFA_UNLOCK(ifa); error = EINVAL; + } + IFA_LOCK_ASSERT_NOTHELD(ifa); break; case SIOCADDMULTI: diff --git a/bsd/net/if_types.h b/bsd/net/if_types.h index 4eced169b..b3f8e5b65 100644 --- a/bsd/net/if_types.h +++ b/bsd/net/if_types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -148,6 +148,7 @@ #define IFT_PFSYNC 0xf6 /* Packet filter state syncing */ #define IFT_CARP 0xf8 /* Common Address Redundancy Protocol */ -#define IFT_PDP 0xff /* GPRS Packet Data Protocol */ +#define IFT_CELLULAR 0xff /* Packet Data over Cellular */ +#define IFT_PDP IFT_CELLULAR /* deprecated; use IFT_CELLULAR */ #endif diff --git a/bsd/net/if_utun.c b/bsd/net/if_utun.c index 1b35e44b4..a8667845b 100644 --- a/bsd/net/if_utun.c +++ b/bsd/net/if_utun.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2009 Apple Inc. All rights reserved. + * Copyright (c) 2008-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -366,6 +366,9 @@ utun_cleanup_family( goto cleanup; } + /* always set SS_PRIV, we want to close and detach regardless */ + sock_setpriv(pf_socket, 1); + result = utun_detach_ip(interface, protocol, pf_socket); if (result == 0 || result == ENXIO) { /* We are done! We either detached or weren't attached. 
*/ @@ -705,7 +708,6 @@ utun_ioctl( void *data) { errno_t result = 0; - struct ifaddr *ifa = (struct ifaddr *)data; switch(command) { case SIOCSIFMTU: @@ -716,13 +718,6 @@ utun_ioctl( /* ifioctl() takes care of it */ break; - case SIOCSIFADDR: - case SIOCAIFADDR: - /* This will be called for called for IPv6 Address additions */ - if (ifa->ifa_addr->sa_family == AF_INET6) - break; - /* Fall though for other families like IPv4 */ - default: result = EOPNOTSUPP; } @@ -754,7 +749,8 @@ utun_proto_input( // remove protocol family first mbuf_adj(m, sizeof(u_int32_t)); - proto_input(protocol, m); + if (proto_input(protocol, m) != 0) + m_freem(m); return 0; } @@ -770,8 +766,8 @@ utun_proto_pre_output( __unused char *link_layer_dest) { - *(protocol_family_t *)(void *)frame_type = protocol; - return 0; + *(protocol_family_t *)(void *)frame_type = protocol; + return 0; } static errno_t diff --git a/bsd/net/if_var.h b/bsd/net/if_var.h index 51b48d27c..a76aa7dbb 100644 --- a/bsd/net/if_var.h +++ b/bsd/net/if_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -223,8 +223,6 @@ struct if_data64 { struct IF_DATA_TIMEVAL ifi_lastchange; /* time of last administrative change */ }; -#pragma pack() - #ifdef PRIVATE struct if_traffic_class { u_int64_t ifi_ibkpackets; /* TC_BK packets received on interface */ @@ -240,11 +238,28 @@ struct if_traffic_class { u_int64_t ifi_ovopackets; /* TC_VO packets sent on interface */ u_int64_t ifi_ovobytes; /* TC_VO bytes sent on interface */ }; +#endif /* PRIVATE */ + +#pragma pack() +/* + * Structure defining a queue for a network interface. + */ +struct ifqueue { + void *ifq_head; + void *ifq_tail; + int ifq_len; + int ifq_maxlen; + int ifq_drops; +}; + +#ifdef XNU_KERNEL_PRIVATE /* * Internal storage of if_data. This is bound to change. Various places in the * stack will translate this data structure in to the externally visible - * if_data structure above. + * if_data structure above. Note that during interface attach time, the + * embedded if_data structure in ifnet is cleared, with the exception of + * some non-statistics related fields. 
  */
 struct if_data_internal {
 	/* generic interface information */
@@ -259,6 +274,7 @@ struct if_data_internal {
 	u_int32_t ifi_mtu;	/* maximum transmission unit */
 	u_int32_t ifi_metric;	/* routing metric (external only) */
 	u_int32_t ifi_baudrate;	/* linespeed */
+	u_int32_t _pad;
 	/* volatile statistics */
 	u_int64_t ifi_ipackets;	/* packets received on interface */
 	u_int64_t ifi_ierrors;	/* input errors on interface */
@@ -279,7 +295,9 @@ struct if_data_internal {
 	u_int32_t ifi_tso_v4_mtu; /* TCP Segment Offload IPv4 maximum segment size */
 	u_int32_t ifi_tso_v6_mtu; /* TCP Segment Offload IPv6 maximum segment size */
 };
+#endif /* XNU_KERNEL_PRIVATE */
 
+#ifdef PRIVATE
 #define if_mtu if_data.ifi_mtu
 #define if_type if_data.ifi_type
 #define if_typelen if_data.ifi_typelen
@@ -303,47 +321,57 @@ struct if_data_internal {
 #define if_lastchange if_data.ifi_lastchange
 #define if_recvquota if_data.ifi_recvquota
 #define if_xmitquota if_data.ifi_xmitquota
-#define if_iflags if_data.ifi_iflags
+#endif /* PRIVATE */
+#ifdef XNU_KERNEL_PRIVATE
 #define if_tso_v4_mtu if_data.ifi_tso_v4_mtu
 #define if_tso_v6_mtu if_data.ifi_tso_v6_mtu
+#endif /* XNU_KERNEL_PRIVATE */
 
-struct mbuf;
-struct ifaddr;
-TAILQ_HEAD(ifnethead, ifnet); /* we use TAILQs so that the order of */
-TAILQ_HEAD(ifaddrhead, ifaddr); /* instantiation is preserved in the list */
-TAILQ_HEAD(ifprefixhead, ifprefix);
-LIST_HEAD(ifmultihead, ifmultiaddr);
-struct tqdummy;
-TAILQ_HEAD(tailq_head, tqdummy);
-
+#ifdef XNU_KERNEL_PRIVATE
 /*
  * Forward structure declarations for function prototypes [sic].
  */
-struct proc;
-struct rtentry;
-struct socket;
-struct ether_header;
-struct sockaddr_dl;
+struct proc;
+struct rtentry;
+struct socket;
 struct ifnet_filter;
+struct mbuf;
+struct ifaddr;
+struct tqdummy;
+struct proto_hash_entry;
+struct dlil_threading_info;
+#if PF
+struct pfi_kif;
+#endif /* PF */
+/* we use TAILQs so that the order of instantiation is preserved in the list */
+TAILQ_HEAD(ifnethead, ifnet);
+TAILQ_HEAD(ifaddrhead, ifaddr);
+TAILQ_HEAD(ifprefixhead, ifprefix);
+LIST_HEAD(ifmultihead, ifmultiaddr);
+TAILQ_HEAD(tailq_head, tqdummy);
 TAILQ_HEAD(ifnet_filter_head, ifnet_filter);
 TAILQ_HEAD(ddesc_head_name, dlil_demux_desc);
+#endif /* XNU_KERNEL_PRIVATE */
 
-/* All of the following IF_HWASSIST_* flags are defined
- * in kpi_inteface.h as IFNET_* flags. These are redefined
- * here as constants to avoid failures to build user level
- * programs that can not include kpi_interface.h. It is
- * important to keep this in sync with the definitions in
- * kpi_interface.h. The corresponding constant for each
- * definition is mentioned in the comment.
+#ifdef PRIVATE
+/*
+ * All of the following IF_HWASSIST_* flags are defined in kpi_interface.h as
+ * IFNET_* flags. These are redefined here as constants to avoid failures to
+ * build user level programs that cannot include kpi_interface.h. It is
+ * important to keep this in sync with the definitions in kpi_interface.h.
+ * The corresponding constant for each definition is mentioned in the comment.
 *
- * Bottom 16 bits reserved for hardware checksum
+ * Bottom 16 bits reserved for hardware checksum
 */
 #define IF_HWASSIST_CSUM_IP 0x0001 /* will csum IP, IFNET_CSUM_IP */
 #define IF_HWASSIST_CSUM_TCP 0x0002 /* will csum TCP, IFNET_CSUM_TCP */
 #define IF_HWASSIST_CSUM_UDP 0x0004 /* will csum UDP, IFNET_CSUM_UDP */
 #define IF_HWASSIST_CSUM_IP_FRAGS 0x0008 /* will csum IP fragments, IFNET_CSUM_FRAGMENT */
 #define IF_HWASSIST_CSUM_FRAGMENT 0x0010 /* will do IP fragmentation, IFNET_IP_FRAGMENT */
+#define IF_HWASSIST_CSUM_TCPIPV6 0x0020 /* will csum TCPv6, IFNET_CSUM_TCPIPV6 */
+#define IF_HWASSIST_CSUM_UDPIPV6 0x0040 /* will csum UDPv6, IFNET_CSUM_UDPIPV6 */
+#define IF_HWASSIST_CSUM_FRAGMENT_IPV6 0x0080 /* will do IPv6 fragmentation, IFNET_IPV6_FRAGMENT */
 #define IF_HWASSIST_CSUM_TCP_SUM16 0x1000 /* simple TCP Sum16 computation, IFNET_CSUM_SUM16 */
 #define IF_HWASSIST_CSUM_MASK 0xffff
 #define IF_HWASSIST_CSUM_FLAGS(hwassist) ((hwassist) & IF_HWASSIST_CSUM_MASK)
@@ -356,30 +384,13 @@ TAILQ_HEAD(ddesc_head_name, dlil_demux_desc);
 #define IF_HWASSIST_TSO_V4 0x00200000 /* will do TCP Segment offload for IPv4, IFNET_TSO_IPV4 */
 #define IF_HWASSIST_TSO_V6 0x00400000 /* will do TCP Segment offload for IPv6, IFNET_TSO_IPV6 */
-
-#define IFNET_RW_LOCK 1
-
 #endif /* PRIVATE */
-/*
- * Structure defining a queue for a network interface.
- */
-struct ifqueue {
-	void *ifq_head;
-	void *ifq_tail;
-	int ifq_len;
-	int ifq_maxlen;
-	int ifq_drops;
-};
-#ifdef PRIVATE
+#ifdef XNU_KERNEL_PRIVATE
+#include <sys/tree.h>
+#include <netinet/in.h>
-struct ddesc_head_str;
-struct proto_hash_entry;
-struct kev_msg;
-struct dlil_threading_info;
-#if PF
-struct pfi_kif;
-#endif /* PF */
+RB_HEAD(ll_reach_tree, if_llreach); /* define struct ll_reach_tree */
 
 /*
  * Structure defining a network interface.
@@ -387,34 +398,42 @@ struct pfi_kif;
  * (Would like to call this struct ``if'', but C isn't PL/1.)
  */
 struct ifnet {
-	void *if_softc; /* pointer to driver state */
-	const char *if_name; /* name, e.g. ``en'' or ``lo'' */
-	TAILQ_ENTRY(ifnet) if_link; /* all struct ifnets are chained */
-	struct ifaddrhead if_addrhead; /* linked list of addresses per if */
-	u_int32_t if_refcnt;
-#ifdef __KPI_INTERFACE__
-	ifnet_check_multi if_check_multi;
-#else
-	void* if_check_multi;
-#endif /* __KPI_INTERFACE__ */
-	int if_pcount; /* number of promiscuous listeners */
-	struct bpf_if *if_bpf; /* packet filter structure */
-	u_short if_index; /* numeric abbreviation for this if */
-	short if_unit; /* sub-unit for lower level driver */
-	short if_timer; /* time 'til if_watchdog called */
-	short if_flags; /* up/down, broadcast, etc. */
-	int if_ipending; /* interrupts pending */
-	void *if_linkmib; /* link-type-specific MIB data */
-	size_t if_linkmiblen; /* length of above data */
-	struct if_data_internal if_data;
-
-/* New with DLIL */
-#ifdef BSD_KERNEL_PRIVATE
-	int if_usecnt;
-#else
-	int refcnt;
-#endif
-#ifdef __KPI_INTERFACE__
+	/*
+	 * Lock (RW or mutex) to protect this data structure (static storage.)
+	 */
+	decl_lck_rw_data(, if_lock);
+	void *if_softc; /* pointer to driver state */
+	const char *if_name; /* name, e.g.
``en'' or ``lo'' */ + TAILQ_ENTRY(ifnet) if_link; /* all struct ifnets are chained */ + TAILQ_ENTRY(ifnet) if_detaching_link; /* list of detaching ifnets */ + + decl_lck_mtx_data(, if_ref_lock) + u_int32_t if_refflags; + u_int32_t if_refio; /* number of io ops to the underlying driver */ + +#define if_list if_link + struct ifaddrhead if_addrhead; /* linked list of addresses per if */ +#define if_addrlist if_addrhead + struct ifaddr *if_lladdr; /* link address (first/permanent) */ + + int if_pcount; /* number of promiscuous listeners */ + struct bpf_if *if_bpf; /* packet filter structure */ + u_short if_index; /* numeric abbreviation for this if */ + short if_unit; /* sub-unit for lower level driver */ + short if_timer; /* time 'til if_watchdog called */ + short if_flags; /* up/down, broadcast, etc. */ + u_int32_t if_eflags; /* see <net/if.h> */ + + int if_capabilities; /* interface features & capabilities */ + int if_capenable; /* enabled features & capabilities */ + + void *if_linkmib; /* link-type-specific MIB data */ + size_t if_linkmiblen; /* length of above data */ + + struct if_data_internal if_data __attribute__((aligned(8))); + + ifnet_family_t if_family; /* value assigned by Apple */ + uintptr_t if_family_cookie; ifnet_output_func if_output; ifnet_ioctl_func if_ioctl; ifnet_set_bpf_tap if_set_bpf_tap; @@ -422,64 +441,28 @@ struct ifnet { ifnet_demux_func if_demux; ifnet_event_func if_event; ifnet_framer_func if_framer; - ifnet_family_t if_family; /* value assigned by Apple */ -#else - void* if_output; - void* if_ioctl; - void* if_set_bpf_tap; - void* if_free; - void* if_demux; - void* if_event; - void* if_framer; - u_int32_t if_family; /* value assigned by Apple */ -#endif + ifnet_add_proto_func if_add_proto; + ifnet_del_proto_func if_del_proto; + ifnet_check_multi if_check_multi; + struct proto_hash_entry *if_proto_hash; + void *if_kpi_storage; + decl_lck_mtx_data(, if_flt_lock) + u_int32_t if_flt_busy; + u_int32_t if_flt_waiters; struct ifnet_filter_head if_flt_head; -/* End DLIL specific */ + struct ifmultihead if_multiaddrs; /* multicast addresses */ + u_int32_t if_updatemcasts; /* mcast addrs need updating */ + int if_amcount; /* # of all-multicast reqs */ + decl_lck_mtx_data(, if_addrconfig_lock); /* for serializing addr config */ + struct in_multi *if_allhostsinm; /* store all-hosts inm for this ifp */ - u_int32_t if_delayed_detach; /* need to perform delayed detach */ - void *if_private; /* private to interface */ - long if_eflags; /* autoaddr, autoaddr done, etc. 
*/ - - struct ifmultihead if_multiaddrs; /* multicast addresses configured */ - int if_amcount; /* number of all-multicast requests */ -/* procedure handles */ -#ifdef __KPI_INTERFACE__ - ifnet_add_proto_func if_add_proto; - ifnet_del_proto_func if_del_proto; -#else /* !__KPI_INTERFACE__ */ - void* if_add_proto; - void* if_del_proto; -#endif /* !__KPI_INTERFACE__ */ - struct proto_hash_entry *if_proto_hash; - void *if_kpi_storage; -#if 0 - void *unused_was_init; -#else struct dlil_threading_info *if_input_thread; -#endif - void *unused_was_resolvemulti; - - struct ifqueue if_snd; - u_int32_t unused_2[1]; -#ifdef __APPLE__ - uintptr_t family_cookie; - struct ifprefixhead if_prefixhead; /* list of prefixes per if */ -#ifdef _KERN_LOCKS_H_ -#if IFNET_RW_LOCK - lck_rw_t *if_lock; /* Lock to protect this interface */ -#else - lck_mtx_t *if_lock; /* Lock to protect this interface */ -#endif -#else - void *if_lock; -#endif + struct ifqueue if_snd; -#else - struct ifprefixhead if_prefixhead; /* list of prefixes per if */ -#endif /* __APPLE__ */ + struct ifprefixhead if_prefixhead; /* list of prefixes per if */ struct { u_int32_t length; union { @@ -488,133 +471,134 @@ struct ifnet { } u; } if_broadcast; #if CONFIG_MACF_NET - struct label *if_label; /* interface MAC label */ + struct label *if_label; /* interface MAC label */ #endif - u_int32_t if_wake_properties; + u_int32_t if_wake_properties; #if PF - struct thread *if_pf_curthread; - struct pfi_kif *if_pf_kif; + struct thread *if_pf_curthread; + struct pfi_kif *if_pf_kif; #endif /* PF */ -#ifdef _KERN_LOCKS_H_ - lck_mtx_t *if_fwd_route_lock; -#else - void *if_fwd_route_lock; -#endif - struct route if_fwd_route; /* cached IPv4 forwarding route */ - void *if_bridge; /* bridge glue */ -#if IFNET_ROUTE_REFCNT - u_int32_t if_want_aggressive_drain; - u_int32_t if_idle_flags; /* idle flags */ - u_int32_t if_route_refcnt; /* idle: route ref count */ -#endif /* IFNET_ROUTE_REFCNT */ -#if PKT_PRIORITY - struct if_traffic_class if_tc __attribute__((aligned(8))); -#endif /* PKT_PRIORITY */ -}; -#ifndef __APPLE__ -/* for compatibility with other BSDs */ -#define if_addrlist if_addrhead -#define if_list if_link -#endif /* !__APPLE__ */ + decl_lck_mtx_data(, if_cached_route_lock); + u_int32_t if_fwd_cacheok; + struct route if_fwd_route; /* cached forwarding route */ + struct route if_src_route; /* cached ipv4 source route */ + struct route_in6 if_src_route6; /* cached ipv6 source route */ + decl_lck_rw_data(, if_llreach_lock); + struct ll_reach_tree if_ll_srcs; /* source link-layer tree */ -#endif /* PRIVATE */ + void *if_bridge; /* bridge glue */ + + u_int32_t if_want_aggressive_drain; + u_int32_t if_idle_flags; /* idle flags */ + u_int32_t if_idle_new_flags; /* temporary idle flags */ + u_int32_t if_idle_new_flags_mask; /* temporary mask */ + u_int32_t if_route_refcnt; /* idle: route ref count */ + + struct if_traffic_class if_tc __attribute__((aligned(8))); +#if INET + struct igmp_ifinfo *if_igi; /* for IGMPv3 */ +#endif /* INET */ +#if INET6 + struct mld_ifinfo *if_mli; /* for MLDv2 */ +#endif /* INET6 */ +}; + +/* + * Valid values for if_refflags + */ +#define IFRF_ATTACHED 0x1 /* ifnet attach is completely done */ +#define IFRF_DETACHING 0x2 /* detach has been requested */ -#ifdef KERNEL_PRIVATE /* * Structure describing a `cloning' interface. */ struct if_clone { LIST_ENTRY(if_clone) ifc_list; /* on list of cloners */ - const char *ifc_name; /* name of device, e.g. 
`vlan' */ - size_t ifc_namelen; /* length of name */ - u_int32_t ifc_minifs; /* minimum number of interfaces */ - u_int32_t ifc_maxunit; /* maximum unit number */ - unsigned char *ifc_units; /* bitmap to handle units */ - u_int32_t ifc_bmlen; /* bitmap length */ - - int (*ifc_create)(struct if_clone *, u_int32_t, void *); - int (*ifc_destroy)(struct ifnet *); + const char *ifc_name; /* name of device, e.g. `vlan' */ + size_t ifc_namelen; /* length of name */ + u_int32_t ifc_minifs; /* minimum number of interfaces */ + u_int32_t ifc_maxunit; /* maximum unit number */ + unsigned char *ifc_units; /* bitmap to handle units */ + u_int32_t ifc_bmlen; /* bitmap length */ + + int (*ifc_create)(struct if_clone *, u_int32_t, void *); + int (*ifc_destroy)(struct ifnet *); }; -#define IF_CLONE_INITIALIZER(name, create, destroy, minifs, maxunit) \ - { { NULL, NULL }, name, sizeof(name) - 1, minifs, maxunit, NULL, 0, create, destroy } +#define IF_CLONE_INITIALIZER(name, create, destroy, minifs, maxunit) { \ + { NULL, NULL }, name, (sizeof (name) - 1), minifs, maxunit, NULL, 0, \ + create, destroy \ +} #define M_CLONE M_IFADDR /* - * Bit values in if_ipending - */ -#define IFI_RECV 1 /* I want to receive */ -#define IFI_XMIT 2 /* I want to transmit */ - -/* - * Output queues (ifp->if_snd) and slow device input queues (*ifp->if_slowq) - * are queues of messages stored on ifqueue structures - * (defined above). Entries are added to and deleted from these structures - * by these macros, which should be called with ipl raised to splimp(). + * Macros to manipulate ifqueue. Users of these macros are responsible + * for serialization, by holding whatever lock is appropriate for the + * corresponding structure that is referring to the ifqueue. */ #define IF_QFULL(ifq) ((ifq)->ifq_len >= (ifq)->ifq_maxlen) #define IF_DROP(ifq) ((ifq)->ifq_drops++) -#define IF_ENQUEUE(ifq, m) { \ - (m)->m_nextpkt = 0; \ - if ((ifq)->ifq_tail == 0) \ - (ifq)->ifq_head = m; \ - else \ - ((struct mbuf*)(ifq)->ifq_tail)->m_nextpkt = m; \ - (ifq)->ifq_tail = m; \ - (ifq)->ifq_len++; \ +#define IF_ENQUEUE(ifq, m) { \ + (m)->m_nextpkt = NULL; \ + if ((ifq)->ifq_tail == NULL) \ + (ifq)->ifq_head = m; \ + else \ + ((struct mbuf*)(ifq)->ifq_tail)->m_nextpkt = m; \ + (ifq)->ifq_tail = m; \ + (ifq)->ifq_len++; \ } -#define IF_PREPEND(ifq, m) { \ - (m)->m_nextpkt = (ifq)->ifq_head; \ - if ((ifq)->ifq_tail == 0) \ - (ifq)->ifq_tail = (m); \ - (ifq)->ifq_head = (m); \ - (ifq)->ifq_len++; \ +#define IF_PREPEND(ifq, m) { \ + (m)->m_nextpkt = (ifq)->ifq_head; \ + if ((ifq)->ifq_tail == NULL) \ + (ifq)->ifq_tail = (m); \ + (ifq)->ifq_head = (m); \ + (ifq)->ifq_len++; \ } -#define IF_DEQUEUE(ifq, m) { \ - (m) = (ifq)->ifq_head; \ - if (m) { \ - if (((ifq)->ifq_head = (m)->m_nextpkt) == 0) \ - (ifq)->ifq_tail = 0; \ - (m)->m_nextpkt = 0; \ - (ifq)->ifq_len--; \ - } \ +#define IF_DEQUEUE(ifq, m) { \ + (m) = (ifq)->ifq_head; \ + if (m != NULL) { \ + if (((ifq)->ifq_head = (m)->m_nextpkt) == NULL) \ + (ifq)->ifq_tail = NULL; \ + (m)->m_nextpkt = NULL; \ + (ifq)->ifq_len--; \ + } \ } - -#define IF_ENQ_DROP(ifq, m) if_enq_drop(ifq, m) - -#if defined(__GNUC__) && defined(MT_HEADER) -static __inline int -if_queue_drop(struct ifqueue *ifq, __unused struct mbuf *m) -{ - IF_DROP(ifq); - return 0; +#define IF_REMQUEUE(ifq, m) { \ + struct mbuf *_p = (ifq)->ifq_head; \ + struct mbuf *_n = (m)->m_nextpkt; \ + if ((m) == _p) \ + _p = NULL; \ + while (_p != NULL) { \ + if (_p->m_nextpkt == (m)) \ + break; \ + _p = _p->m_nextpkt; \ + } \ + VERIFY(_p != NULL || ((m) == 
(ifq)->ifq_head)); \ + if ((m) == (ifq)->ifq_head) \ + (ifq)->ifq_head = _n; \ + if ((m) == (ifq)->ifq_tail) \ + (ifq)->ifq_tail = _p; \ + VERIFY((ifq)->ifq_tail != NULL || (ifq)->ifq_head == NULL); \ + VERIFY((ifq)->ifq_len != 0); \ + --(ifq)->ifq_len; \ + if (_p != NULL) \ + _p->m_nextpkt = _n; \ + (m)->m_nextpkt = NULL; \ } +#define IF_DRAIN(ifq) do { \ + struct mbuf *m; \ + for (;;) { \ + IF_DEQUEUE(ifq, m); \ + if (m == NULL) \ + break; \ + m_freem(m); \ + } \ +} while (0) -static __inline int -if_enq_drop(struct ifqueue *ifq, struct mbuf *m) -{ - if (IF_QFULL(ifq) && - !if_queue_drop(ifq, m)) - return 0; - IF_ENQUEUE(ifq, m); - return 1; -} -#else - -#ifdef MT_HEADER -int if_enq_drop(struct ifqueue *, struct mbuf *); -#endif /* MT_HEADER */ - -#endif /* defined(__GNUC__) && defined(MT_HEADER) */ - -#endif /* KERNEL_PRIVATE */ - - -#ifdef PRIVATE /* * The ifaddr structure contains information about one address * of an interface. They are maintained by the different address families, @@ -622,21 +606,24 @@ int if_enq_drop(struct ifqueue *, struct mbuf *); * together so all addresses for an interface can be located. */ struct ifaddr { - struct sockaddr *ifa_addr; /* address of interface */ - struct sockaddr *ifa_dstaddr; /* other end of p-to-p link */ + decl_lck_mtx_data(, ifa_lock); /* lock for ifaddr */ + uint32_t ifa_refcnt; /* ref count, use IFA_{ADD,REM}REF */ + uint32_t ifa_debug; /* debug flags */ + struct sockaddr *ifa_addr; /* address of interface */ + struct sockaddr *ifa_dstaddr; /* other end of p-to-p link */ #define ifa_broadaddr ifa_dstaddr /* broadcast address interface */ - struct sockaddr *ifa_netmask; /* used to determine subnet */ - struct ifnet *ifa_ifp; /* back-pointer to interface */ + struct sockaddr *ifa_netmask; /* used to determine subnet */ + struct ifnet *ifa_ifp; /* back-pointer to interface */ TAILQ_ENTRY(ifaddr) ifa_link; /* queue macro glue */ void (*ifa_rtrequest) /* check or clean routes (+ or -)'d */ (int, struct rtentry *, struct sockaddr *); - uint32_t ifa_flags; /* mostly rt_flags for cloning */ - int32_t ifa_refcnt; /* ref count, use ifaref, ifafree */ - int32_t ifa_metric; /* cost of going out this interface */ + uint32_t ifa_flags; /* mostly rt_flags for cloning */ + int32_t ifa_metric; /* cost of going out this interface */ void (*ifa_free)(struct ifaddr *); /* callback fn for freeing */ void (*ifa_trace) /* callback fn for tracing refs */ (struct ifaddr *, int); - uint32_t ifa_debug; /* debug flags */ + void (*ifa_attached)(struct ifaddr *); /* callback fn for attaching */ + void (*ifa_detached)(struct ifaddr *); /* callback fn for detaching */ }; /* @@ -648,13 +635,47 @@ struct ifaddr { /* * Valid values for ifa_debug */ -#define IFD_ATTACHED 0x1 /* attached to an interface */ +#define IFD_ATTACHED 0x1 /* attached to list */ #define IFD_ALLOC 0x2 /* dynamically allocated */ #define IFD_DEBUG 0x4 /* has debugging info */ +#define IFD_LINK 0x8 /* link address */ +#define IFD_TRASHED 0x10 /* in trash list */ +#define IFD_SKIP 0x20 /* skip this entry */ +#define IFD_NOTREADY 0x40 /* embryonic; not yet ready */ -#endif /* PRIVATE */ +#define IFA_LOCK_ASSERT_HELD(_ifa) \ + lck_mtx_assert(&(_ifa)->ifa_lock, LCK_MTX_ASSERT_OWNED) + +#define IFA_LOCK_ASSERT_NOTHELD(_ifa) \ + lck_mtx_assert(&(_ifa)->ifa_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define IFA_LOCK(_ifa) \ + lck_mtx_lock(&(_ifa)->ifa_lock) + +#define IFA_LOCK_SPIN(_ifa) \ + lck_mtx_lock_spin(&(_ifa)->ifa_lock) + +#define IFA_CONVERT_LOCK(_ifa) do { \ + IFA_LOCK_ASSERT_HELD(_ifa); \ + 
lck_mtx_convert_spin(&(_ifa)->ifa_lock); \ +} while (0) + +#define IFA_UNLOCK(_ifa) \ + lck_mtx_unlock(&(_ifa)->ifa_lock) + +#define IFA_ADDREF(_ifa) \ + ifa_addref(_ifa, 0) + +#define IFA_ADDREF_LOCKED(_ifa) \ + ifa_addref(_ifa, 1) + +#define IFA_REMREF(_ifa) do { \ + (void) ifa_remref(_ifa, 0); \ +} while (0) + +#define IFA_REMREF_LOCKED(_ifa) \ + ifa_remref(_ifa, 1) -#ifdef KERNEL_PRIVATE /* * The prefix structure contains information about one prefix * of an interface. They are maintained by the different address families, @@ -668,115 +689,169 @@ struct ifprefix { u_char ifpr_plen; /* prefix length in bits */ u_char ifpr_type; /* protocol dependent prefix type */ }; -#endif /* KERNEL_PRIVATE */ - -#ifdef PRIVATE -typedef void (*ifma_protospec_free_func)(void* ifma_protospec); /* * Multicast address structure. This is analogous to the ifaddr * structure except that it keeps track of multicast addresses. - * Also, the reference count here is a count of requests for this - * address, not a count of pointers to this structure. + * Also, the request count here is a count of requests for this + * address, not a count of pointers to this structure; anonymous + * membership(s) holds one outstanding request count. */ struct ifmultiaddr { + decl_lck_mtx_data(, ifma_lock); + u_int32_t ifma_refcount; /* reference count */ + u_int32_t ifma_anoncnt; /* # of anonymous requests */ + u_int32_t ifma_reqcnt; /* total requests for this address */ + u_int32_t ifma_debug; /* see ifa_debug flags */ + u_int32_t ifma_flags; /* see below */ LIST_ENTRY(ifmultiaddr) ifma_link; /* queue macro glue */ - struct sockaddr *ifma_addr; /* address this membership is for */ + struct sockaddr *ifma_addr; /* address this membership is for */ struct ifmultiaddr *ifma_ll; /* link-layer translation, if any */ - struct ifnet *ifma_ifp; /* back-pointer to interface */ - u_int ifma_usecount; /* use count, protected by ifp's lock */ - void *ifma_protospec; /* protocol-specific state, if any */ - int32_t ifma_refcount; /* reference count, atomically protected */ - ifma_protospec_free_func ifma_free; /* function called to free ifma_protospec */ + struct ifnet *ifma_ifp; /* back-pointer to interface */ + void *ifma_protospec; /* protocol-specific state, if any */ + void (*ifma_trace) /* callback fn for tracing refs */ + (struct ifmultiaddr *, int); }; -#endif /* PRIVATE */ - -#ifdef KERNEL_PRIVATE -#define IFAREF(ifa) ifaref(ifa) -#define IFAFREE(ifa) ifafree(ifa) /* - * To preserve kmem compatibility, we define - * ifnet_head to ifnet. This should be temp. 
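Note: the IFA_* macros above encode a locking and reference discipline: take the per-address mutex with the spin variant for short sections, convert it to a full mutex before any operation that might block, and hold a reference across any window where the lock is dropped. A usage sketch (hypothetical caller, not part of this change):

static void
ifa_example(struct ifaddr *ifa)
{
	IFA_LOCK_SPIN(ifa);	/* short, non-blocking critical section */
	IFA_ADDREF_LOCKED(ifa);	/* pin the ifaddr while working on it */
	IFA_CONVERT_LOCK(ifa);	/* upgrade before potentially blocking */
	/* ... examine or update ifa_addr, ifa_flags, etc. ... */
	IFA_UNLOCK(ifa);
	IFA_REMREF(ifa);	/* may free the ifaddr on last release */
}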
+ * Values for ifma_flags */ -#define ifnet_head ifnet -extern struct ifnethead ifnet_head; -extern struct ifnet **ifindex2ifnet; -extern int ifqmaxlen; -extern ifnet_t lo_ifp; -extern int if_index; -extern struct ifaddr **ifnet_addrs; - -int if_addmulti(struct ifnet *, const struct sockaddr *, struct ifmultiaddr **); -int if_allmulti(struct ifnet *, int); -void if_attach(struct ifnet *); -int if_delmultiaddr(struct ifmultiaddr *ifma, int locked); -int if_delmulti(struct ifnet *, const struct sockaddr *); -void if_down(struct ifnet *); -int if_down_all(void); -void if_route(struct ifnet *, int flag, int fam); -void if_unroute(struct ifnet *, int flag, int fam); -void if_up(struct ifnet *); -void if_updown(struct ifnet *ifp, int up); -/*void ifinit(void));*/ /* declared in systm.h for main( */ -int ifioctl(struct socket *, u_long, caddr_t, struct proc *); -int ifioctllocked(struct socket *, u_long, caddr_t, struct proc *); -struct ifnet *ifunit(const char *); -struct ifnet *if_withname(struct sockaddr *); - -int if_clone_attach(struct if_clone *); -void if_clone_detach(struct if_clone *); -struct if_clone * - if_clone_lookup(const char *, u_int32_t *); - -void ifnet_lock_assert(struct ifnet *ifp, int what); -void ifnet_lock_shared(struct ifnet *ifp); -void ifnet_lock_exclusive(struct ifnet *ifp); -void ifnet_lock_done(struct ifnet *ifp); - -void ifnet_head_lock_shared(void); -void ifnet_head_lock_exclusive(void); -void ifnet_head_done(void); - -void if_attach_ifa(struct ifnet * ifp, struct ifaddr *ifa); -void if_detach_ifa(struct ifnet * ifp, struct ifaddr *ifa); - -void ifma_reference(struct ifmultiaddr *ifma); -void ifma_release(struct ifmultiaddr *ifma); - -struct ifaddr *ifa_ifwithaddr(const struct sockaddr *); -struct ifaddr *ifa_ifwithaddr_scoped(const struct sockaddr *, unsigned int); -struct ifaddr *ifa_ifwithdstaddr(const struct sockaddr *); -struct ifaddr *ifa_ifwithnet(const struct sockaddr *); -struct ifaddr *ifa_ifwithnet_scoped(const struct sockaddr *, unsigned int); -struct ifaddr *ifa_ifwithroute(int, const struct sockaddr *, const struct sockaddr *); -struct ifaddr *ifa_ifwithroute_locked(int, const struct sockaddr *, const struct sockaddr *); -struct ifaddr *ifa_ifwithroute_scoped_locked(int, const struct sockaddr *, +#define IFMAF_ANONYMOUS 0x1 /* has anonymous request ref(s) held */ + +#define IFMA_LOCK_ASSERT_HELD(_ifma) \ + lck_mtx_assert(&(_ifma)->ifma_lock, LCK_MTX_ASSERT_OWNED) + +#define IFMA_LOCK_ASSERT_NOTHELD(_ifma) \ + lck_mtx_assert(&(_ifma)->ifma_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define IFMA_LOCK(_ifma) \ + lck_mtx_lock(&(_ifma)->ifma_lock) + +#define IFMA_LOCK_SPIN(_ifma) \ + lck_mtx_lock_spin(&(_ifma)->ifma_lock) + +#define IFMA_CONVERT_LOCK(_ifma) do { \ + IFMA_LOCK_ASSERT_HELD(_ifma); \ + lck_mtx_convert_spin(&(_ifma)->ifma_lock); \ +} while (0) + +#define IFMA_UNLOCK(_ifma) \ + lck_mtx_unlock(&(_ifma)->ifma_lock) + +#define IFMA_ADDREF(_ifma) \ + ifma_addref(_ifma, 0) + +#define IFMA_ADDREF_LOCKED(_ifma) \ + ifma_addref(_ifma, 1) + +#define IFMA_REMREF(_ifma) \ + ifma_remref(_ifma) + +__private_extern__ struct ifnethead ifnet_head; +__private_extern__ struct ifnet **ifindex2ifnet; +__private_extern__ int ifqmaxlen; +__private_extern__ int if_index; +__private_extern__ struct ifaddr **ifnet_addrs; +__private_extern__ lck_attr_t *ifa_mtx_attr; +__private_extern__ lck_grp_t *ifa_mtx_grp; +__private_extern__ lck_grp_t *ifnet_lock_group; +__private_extern__ lck_attr_t *ifnet_lock_attr; +extern ifnet_t lo_ifp; + +extern int if_addmulti(struct ifnet *, const 
struct sockaddr *, + struct ifmultiaddr **); +extern int if_addmulti_anon(struct ifnet *, const struct sockaddr *, + struct ifmultiaddr **); +extern int if_allmulti(struct ifnet *, int); +extern int if_delmulti(struct ifnet *, const struct sockaddr *); +extern int if_delmulti_ifma(struct ifmultiaddr *); +extern int if_delmulti_anon(struct ifnet *, const struct sockaddr *); +extern void if_down(struct ifnet *); +extern int if_down_all(void); +extern void if_up(struct ifnet *); +__private_extern__ void if_updown(struct ifnet *ifp, int up); +extern int ifioctl(struct socket *, u_long, caddr_t, struct proc *); +extern int ifioctllocked(struct socket *, u_long, caddr_t, struct proc *); +extern struct ifnet *ifunit(const char *); +extern struct ifnet *if_withname(struct sockaddr *); + +extern struct if_clone *if_clone_lookup(const char *, u_int32_t *); +extern int if_clone_attach(struct if_clone *); +extern void if_clone_detach(struct if_clone *); + +extern errno_t if_mcasts_update(struct ifnet *); + +typedef enum { + IFNET_LCK_ASSERT_EXCLUSIVE, /* RW: held as writer */ + IFNET_LCK_ASSERT_SHARED, /* RW: held as reader */ + IFNET_LCK_ASSERT_OWNED, /* RW: writer/reader, MTX: held */ + IFNET_LCK_ASSERT_NOTOWNED /* not held */ +} ifnet_lock_assert_t; + +__private_extern__ void ifnet_lock_assert(struct ifnet *, ifnet_lock_assert_t); +__private_extern__ void ifnet_lock_shared(struct ifnet *ifp); +__private_extern__ void ifnet_lock_exclusive(struct ifnet *ifp); +__private_extern__ void ifnet_lock_done(struct ifnet *ifp); + +__private_extern__ void ifnet_head_lock_shared(void); +__private_extern__ void ifnet_head_lock_exclusive(void); +__private_extern__ void ifnet_head_done(void); + +__private_extern__ errno_t ifnet_set_idle_flags_locked(ifnet_t, u_int32_t, + u_int32_t); +__private_extern__ int ifnet_is_attached(struct ifnet *, int refio); +__private_extern__ void ifnet_decr_iorefcnt(struct ifnet *); + +__private_extern__ void if_attach_ifa(struct ifnet *, struct ifaddr *); +__private_extern__ void if_attach_link_ifa(struct ifnet *, struct ifaddr *); +__private_extern__ void if_detach_ifa(struct ifnet *, struct ifaddr *); +__private_extern__ void if_detach_link_ifa(struct ifnet *, struct ifaddr *); + +extern struct ifaddr *ifa_ifwithaddr(const struct sockaddr *); +extern struct ifaddr *ifa_ifwithaddr_scoped(const struct sockaddr *, unsigned int); +extern struct ifaddr *ifa_ifwithdstaddr(const struct sockaddr *); +extern struct ifaddr *ifa_ifwithnet(const struct sockaddr *); +extern struct ifaddr *ifa_ifwithnet_scoped(const struct sockaddr *, unsigned int); +extern struct ifaddr *ifa_ifwithroute(int, const struct sockaddr *, + const struct sockaddr *); +extern struct ifaddr *ifa_ifwithroute_locked(int, const struct sockaddr *, const struct sockaddr *); +extern struct ifaddr *ifa_ifwithroute_scoped_locked(int, const struct sockaddr *, const struct sockaddr *, unsigned int); -struct ifaddr *ifaof_ifpforaddr(const struct sockaddr *, struct ifnet *); -struct ifaddr *ifa_ifpgetprimary(struct ifnet *, int); -void ifafree(struct ifaddr *); -void ifaref(struct ifaddr *); - -struct ifmultiaddr *ifmaof_ifpforaddr(const struct sockaddr *, struct ifnet *); - -extern struct in_ifaddr *ifa_foraddr(unsigned int); -extern struct in_ifaddr *ifa_foraddr_scoped(unsigned int, unsigned int); - -#ifdef BSD_KERNEL_PRIVATE -enum { - kIfNetUseCount_MayBeZero = 0, - kIfNetUseCount_MustNotBeZero = 1 -}; - -int ifp_use(struct ifnet *ifp, int handle_zero); -int ifp_unuse(struct ifnet *ifp); -void ifp_use_reached_zero(struct ifnet 
*ifp); - -void if_data_internal_to_if_data(struct ifnet *ifp, const struct if_data_internal *if_data_int, - struct if_data *if_data); -void if_data_internal_to_if_data64(struct ifnet *ifp, const struct if_data_internal *if_data_int, - struct if_data64 *if_data64); -#endif /* BSD_KERNEL_PRIVATE */ -#endif /* KERNEL_PRIVATE */ +extern struct ifaddr *ifaof_ifpforaddr(const struct sockaddr *, struct ifnet *); +__private_extern__ struct ifaddr *ifa_ifpgetprimary(struct ifnet *, int); +extern void ifa_addref(struct ifaddr *, int); +extern struct ifaddr *ifa_remref(struct ifaddr *, int); +extern void ifa_lock_init(struct ifaddr *); +extern void ifa_lock_destroy(struct ifaddr *); +extern void ifma_addref(struct ifmultiaddr *, int); +extern void ifma_remref(struct ifmultiaddr *); + +extern void ifa_init(void); + +__private_extern__ struct in_ifaddr *ifa_foraddr(unsigned int); +__private_extern__ struct in_ifaddr *ifa_foraddr_scoped(unsigned int, + unsigned int); + +#if INET6 +struct in6_addr; +__private_extern__ struct in6_ifaddr *ifa_foraddr6(struct in6_addr *); +__private_extern__ struct in6_ifaddr *ifa_foraddr6_scoped(struct in6_addr *, + unsigned int); +#endif /* INET6 */ + +__private_extern__ void if_data_internal_to_if_data(struct ifnet *ifp, + const struct if_data_internal *if_data_int, struct if_data *if_data); +__private_extern__ void if_data_internal_to_if_data64(struct ifnet *ifp, + const struct if_data_internal *if_data_int, struct if_data64 *if_data64); +__private_extern__ void if_copy_traffic_class(struct ifnet *ifp, + struct if_traffic_class *if_tc); + +__private_extern__ struct rtentry *ifnet_cached_rtlookup_inet(struct ifnet *, + struct in_addr); +#if INET6 +__private_extern__ struct rtentry *ifnet_cached_rtlookup_inet6(struct ifnet *, + struct in6_addr *); +#endif /* INET6 */ + +#endif /* XNU_KERNEL_PRIVATE */ #endif /* !_NET_IF_VAR_H_ */ diff --git a/bsd/net/if_vlan.c b/bsd/net/if_vlan.c index 0179f102c..cf090602d 100644 --- a/bsd/net/if_vlan.c +++ b/bsd/net/if_vlan.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2010 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2003-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,6 +79,7 @@ #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/kern_event.h> +#include <sys/mcache.h> #include <net/bpf.h> #include <net/ethernet.h> @@ -185,8 +186,14 @@ LIST_HEAD(vlan_parent_list, vlan_parent); struct ifvlan; LIST_HEAD(ifvlan_list, ifvlan); +typedef LIST_ENTRY(vlan_parent) +vlan_parent_entry; +typedef LIST_ENTRY(ifvlan) +ifvlan_entry; + +#define VLP_SIGNATURE 0xfaceface typedef struct vlan_parent { - LIST_ENTRY(vlan_parent) vlp_parent_list;/* list of parents */ + vlan_parent_entry vlp_parent_list;/* list of parents */ struct ifnet * vlp_ifp; /* interface */ struct ifvlan_list vlp_vlan_list; /* list of VLAN's */ #define VLPF_SUPPORTS_VLAN_MTU 0x1 @@ -195,10 +202,12 @@ typedef struct vlan_parent { u_int32_t vlp_flags; struct ifdevmtu vlp_devmtu; SInt32 vlp_retain_count; + UInt32 vlp_signature; /* VLP_SIGNATURE */ } vlan_parent, * vlan_parent_ref; +#define IFV_SIGNATURE 0xbeefbeef struct ifvlan { - LIST_ENTRY(ifvlan) ifv_vlan_list; + ifvlan_entry ifv_vlan_list; char ifv_name[IFNAMSIZ]; /* our unique id */ struct ifnet * ifv_ifp; /* our interface */ vlan_parent_ref ifv_vlp; /* parent information */ @@ -215,6 +224,8 @@ struct ifvlan { u_int32_t ifv_flags; bpf_packet_func ifv_bpf_input; bpf_packet_func ifv_bpf_output; + SInt32 ifv_retain_count; + UInt32 ifv_signature; /* IFV_SIGNATURE */ }; typedef struct ifvlan * ifvlan_ref; @@ -230,6 +241,11 @@ static vlan_globals_ref g_vlan; #define ifv_encaplen ifv_mib.ifvm_encaplen #define ifv_mtufudge ifv_mib.ifvm_mtufudge +static void +vlan_parent_retain(vlan_parent_ref vlp); + +static void +vlan_parent_release(vlan_parent_ref vlp); /** ** vlan_parent_ref vlp_flags in-lines @@ -363,12 +379,10 @@ static int vlan_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, static int vlan_attach_protocol(struct ifnet *ifp); static int vlan_detach_protocol(struct ifnet *ifp); static int vlan_setmulti(struct ifnet *ifp); -static int vlan_unconfig(struct ifnet *ifp); +static int vlan_unconfig(ifvlan_ref ifv, int need_to_wait); static int vlan_config(struct ifnet * ifp, struct ifnet * p, int tag); static void vlan_if_free(struct ifnet * ifp); -static void vlan_remove(ifvlan_ref ifv); -static void vlan_if_detach(struct ifnet * ifp); -static int vlan_new_mtu(struct ifnet * ifp, int mtu); +static int vlan_remove(ifvlan_ref ifv, int need_to_wait); static struct if_clone vlan_cloner = IF_CLONE_INITIALIZER(VLANNAME, vlan_clone_create, @@ -376,9 +390,118 @@ static struct if_clone vlan_cloner = IF_CLONE_INITIALIZER(VLANNAME, 0, IF_MAXUNIT); static void interface_link_event(struct ifnet * ifp, u_int32_t event_code); -static void vlan_parent_link_event(vlan_parent_ref vlp, +static void vlan_parent_link_event(struct ifnet * p, u_int32_t event_code); -extern void dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m); + +static int ifvlan_new_mtu(ifvlan_ref ifv, int mtu); + +/** + ** ifvlan_ref routines + **/ +static void +ifvlan_retain(ifvlan_ref ifv) +{ + if (ifv->ifv_signature != IFV_SIGNATURE) { + panic("ifvlan_retain: bad signature\n"); + } + if (ifv->ifv_retain_count == 0) { + panic("ifvlan_retain: retain count is 0\n"); + } + OSIncrementAtomic(&ifv->ifv_retain_count); +} + +static void +ifvlan_release(ifvlan_ref ifv) +{ + UInt32 old_retain_count; + + if (ifv->ifv_signature != IFV_SIGNATURE) { + panic("ifvlan_release: bad signature\n"); + } + old_retain_count = OSDecrementAtomic(&ifv->ifv_retain_count); + switch (old_retain_count) { + case 0: + panic("ifvlan_release: retain count is 0\n"); + break; + 
case 1: + if (g_vlan->verbose) { + printf("ifvlan_release(%s)\n", ifv->ifv_name); + } + ifv->ifv_signature = 0; + FREE(ifv, M_VLAN); + break; + default: + break; + } + return; +} + +static vlan_parent_ref +ifvlan_get_vlan_parent_retained(ifvlan_ref ifv) +{ + vlan_parent_ref vlp = ifv->ifv_vlp; + + if (vlan_parent_flags_detaching(vlp)) { + return (NULL); + } + vlan_parent_retain(vlp); + return (vlp); +} + +/** + ** ifnet_* routines + **/ + +static ifvlan_ref +ifnet_get_ifvlan(struct ifnet * ifp) +{ + ifvlan_ref ifv; + + ifv = (ifvlan_ref)ifnet_softc(ifp); + return (ifv); +} + +static ifvlan_ref +ifnet_get_ifvlan_retained(struct ifnet * ifp) +{ + ifvlan_ref ifv; + + ifv = ifnet_get_ifvlan(ifp); + if (ifv == NULL) { + return (NULL); + } + if (ifvlan_flags_detaching(ifv)) { + return (NULL); + } + ifvlan_retain(ifv); + return (ifv); +} + +static int +ifnet_ifvlan_vlan_parent_ok(struct ifnet * ifp, ifvlan_ref ifv, + vlan_parent_ref vlp) +{ + ifvlan_ref check_ifv; + + check_ifv = ifnet_get_ifvlan(ifp); + if (check_ifv != ifv || ifvlan_flags_detaching(ifv)) { + /* ifvlan_ref no longer valid */ + return (FALSE); + } + if (ifv->ifv_vlp != vlp) { + /* vlan_parent no longer valid */ + return (FALSE); + } + if (vlan_parent_flags_detaching(vlp)) { + /* parent is detaching */ + return (FALSE); + } + return (TRUE); +} + +/** + ** vlan, etc. routines + **/ static int vlan_globals_init(void) @@ -471,17 +594,26 @@ vlan_bpf_input(struct ifnet * ifp, struct mbuf * m, /** ** vlan_parent synchronization routines **/ -static __inline__ void +static void vlan_parent_retain(vlan_parent_ref vlp) { + if (vlp->vlp_signature != VLP_SIGNATURE) { + panic("vlan_parent_retain: signature is bad\n"); + } + if (vlp->vlp_retain_count == 0) { + panic("vlan_parent_retain: retain count is 0\n"); + } OSIncrementAtomic(&vlp->vlp_retain_count); } -static __inline__ void +static void vlan_parent_release(vlan_parent_ref vlp) { UInt32 old_retain_count; + if (vlp->vlp_signature != VLP_SIGNATURE) { + panic("vlan_parent_release: signature is bad\n"); + } old_retain_count = OSDecrementAtomic(&vlp->vlp_retain_count); switch (old_retain_count) { case 0: @@ -493,6 +625,7 @@ vlan_parent_release(vlan_parent_ref vlp) printf("vlan_parent_release(%s%d)\n", ifnet_name(ifp), ifnet_unit(ifp)); } + vlp->vlp_signature = 0; FREE(vlp, M_VLAN); break; default: @@ -561,7 +694,6 @@ vlan_parent_signal(vlan_parent_ref vlp, const char * msg) return; } - /* * Program our multicast filter. What we're actually doing is * programming the multicast filter of the parent. 
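Note: the retain/release routines above pair an atomic reference count with a signature word that is validated on every retain and release and zeroed just before the object is freed, so a stale pointer panics immediately instead of silently corrupting freed memory. A generic sketch of the pattern (hypothetical names; M_TEMP stands in for the real malloc tag):

#define OBJ_SIGNATURE 0xfeedf00d		/* hypothetical */

struct obj {
	SInt32	obj_retain_count;
	UInt32	obj_signature;		/* OBJ_SIGNATURE while alive */
};

static void
obj_release(struct obj *o)
{
	if (o->obj_signature != OBJ_SIGNATURE) {
		panic("obj_release: bad signature\n");
	}
	/* OSDecrementAtomic returns the value before the decrement */
	if (OSDecrementAtomic(&o->obj_retain_count) == 1) {
		o->obj_signature = 0;	/* poison; later use panics above */
		FREE(o, M_TEMP);
	}
}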
This has the @@ -576,35 +708,22 @@ vlan_setmulti(struct ifnet * ifp) int error = 0; ifvlan_ref ifv; struct ifnet * p; - vlan_parent_ref vlp; + vlan_parent_ref vlp = NULL; vlan_lock(); - ifv = (ifvlan_ref)ifnet_softc(ifp); - if (ifv == NULL || ifvlan_flags_detaching(ifv)) { + ifv = ifnet_get_ifvlan_retained(ifp); + if (ifv == NULL) { goto unlock_done; } - vlp = ifv->ifv_vlp; + vlp = ifvlan_get_vlan_parent_retained(ifv); if (vlp == NULL) { /* no parent, no need to program the multicast filter */ goto unlock_done; } - if (vlan_parent_flags_detaching(vlp)) { - goto unlock_done; - } - vlan_parent_retain(vlp); vlan_parent_wait(vlp, "vlan_setmulti"); /* check again, things could have changed */ - ifv = (ifvlan_ref)ifnet_softc(ifp); - if (ifv == NULL || ifvlan_flags_detaching(ifv)) { - goto signal_done; - } - if (ifv->ifv_vlp != vlp) { - /* vlan parent changed */ - goto signal_done; - } - if (vlp == NULL) { - /* no parent, no need to program the multicast filter */ + if (ifnet_ifvlan_vlan_parent_ok(ifp, ifv, vlp) == FALSE) { goto signal_done; } p = vlp->vlp_ifp; @@ -620,6 +739,12 @@ vlan_setmulti(struct ifnet * ifp) unlock_done: vlan_unlock(); + if (ifv != NULL) { + ifvlan_release(ifv); + } + if (vlp != NULL) { + vlan_parent_release(vlp); + } return (error); } @@ -711,7 +836,8 @@ vlan_parent_create(struct ifnet * p, vlan_parent_ref * ret_vlp) } LIST_INIT(&vlp->vlp_vlan_list); vlp->vlp_ifp = p; - vlan_parent_retain(vlp); + vlp->vlp_retain_count = 1; + vlp->vlp_signature = VLP_SIGNATURE; if (ifnet_offload(p) & (IF_HWASSIST_VLAN_MTU | IF_HWASSIST_VLAN_TAGGING)) { vlan_parent_flags_set_supports_vlan_mtu(vlp); @@ -721,28 +847,57 @@ vlan_parent_create(struct ifnet * p, vlan_parent_ref * ret_vlp) } static void -vlan_parent_remove_all_vlans(vlan_parent_ref vlp) +vlan_parent_remove_all_vlans(struct ifnet * p) { ifvlan_ref ifv; - struct ifnet * p; - - vlan_assert_lock_held(); + int need_vlp_release = 0; + ifvlan_ref next; + vlan_parent_ref vlp; - while ((ifv = LIST_FIRST(&vlp->vlp_vlan_list)) != NULL) { - vlan_remove(ifv); + vlan_lock(); + vlp = parent_list_lookup(p); + if (vlp == NULL || vlan_parent_flags_detaching(vlp)) { + /* no VLAN's */ vlan_unlock(); - vlan_if_detach(ifv->ifv_ifp); - vlan_lock(); + return; + } + vlan_parent_flags_set_detaching(vlp); + vlan_parent_retain(vlp); + vlan_parent_wait(vlp, "vlan_parent_remove_all_vlans"); + need_vlp_release++; + vlp = parent_list_lookup(p); + /* check again */ + if (vlp == NULL) { + goto signal_done; + } + + for (ifv = LIST_FIRST(&vlp->vlp_vlan_list); ifv != NULL; ifv = next) { + struct ifnet * ifp = ifv->ifv_ifp; + int removed; + + next = LIST_NEXT(ifv, ifv_vlan_list); + removed = vlan_remove(ifv, FALSE); + if (removed) { + vlan_unlock(); + ifnet_detach(ifp); + vlan_lock(); + } } /* the vlan parent has no more VLAN's */ - p = vlp->vlp_ifp; ifnet_set_eflags(p, 0, IFEF_VLAN); /* clear IFEF_VLAN */ + LIST_REMOVE(vlp, vlp_parent_list); + need_vlp_release++; /* one for being in the list */ + need_vlp_release++; /* final reference */ + + signal_done: + vlan_parent_signal(vlp, "vlan_parent_remove_all_vlans"); vlan_unlock(); - vlan_parent_release(vlp); - vlan_lock(); + while (need_vlp_release--) { + vlan_parent_release(vlp); + } return; } @@ -797,13 +952,16 @@ vlan_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params) if (ifv == NULL) return ENOBUFS; bzero(ifv, sizeof(struct ifvlan)); + ifv->ifv_retain_count = 1; + ifv->ifv_signature = IFV_SIGNATURE; multicast_list_init(&ifv->ifv_multicast); /* use the interface name as the unique id for ifp 
recycle */ - if ((unsigned int)snprintf(ifv->ifv_name, sizeof(ifv->ifv_name), "%s%d", - ifc->ifc_name, unit) >= sizeof(ifv->ifv_name)) { - FREE(ifv, M_VLAN); - return (EINVAL); + if ((unsigned int) + snprintf(ifv->ifv_name, sizeof(ifv->ifv_name), "%s%d", + ifc->ifc_name, unit) >= sizeof(ifv->ifv_name)) { + ifvlan_release(ifv); + return (EINVAL); } bzero(&vlan_init, sizeof(vlan_init)); @@ -828,16 +986,10 @@ vlan_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params) error = ifnet_allocate(&vlan_init, &ifp); if (error) { - FREE(ifv, M_VLAN); - return (error); + ifvlan_release(ifv); + return (error); } -#if 0 - /* NB: flags are not set here */ - ifnet_set_link_mib_data(ifp, &ifv->ifv_mib, sizeof ifv->ifv_mib); - /* NB: mtu is not set here */ -#endif - ifnet_set_offload(ifp, 0); ifnet_set_addrlen(ifp, ETHER_ADDR_LEN); /* XXX ethernet specific */ ifnet_set_baudrate(ifp, 0); @@ -845,9 +997,9 @@ vlan_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params) error = ifnet_attach(ifp, NULL); if (error) { - ifnet_release(ifp); - FREE(ifv, M_VLAN); - return (error); + ifnet_release(ifp); + ifvlan_release(ifv); + return (error); } ifv->ifv_ifp = ifp; @@ -856,21 +1008,18 @@ vlan_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params) return (0); } -static void -vlan_remove(ifvlan_ref ifv) +static int +vlan_remove(ifvlan_ref ifv, int need_to_wait) { vlan_assert_lock_held(); + if (ifvlan_flags_detaching(ifv)) { + return (0); + } ifvlan_flags_set_detaching(ifv); - vlan_unconfig(ifv->ifv_ifp); - return; + vlan_unconfig(ifv, need_to_wait); + return (1); } -static void -vlan_if_detach(struct ifnet * ifp) -{ - ifnet_detach(ifp); - return; -} static int vlan_clone_destroy(struct ifnet *ifp) @@ -878,18 +1027,19 @@ vlan_clone_destroy(struct ifnet *ifp) ifvlan_ref ifv; vlan_lock(); - ifv = ifnet_softc(ifp); - if (ifv == NULL || ifnet_type(ifp) != IFT_L2VLAN) { + ifv = ifnet_get_ifvlan_retained(ifp); + if (ifv == NULL) { vlan_unlock(); return 0; } - if (ifvlan_flags_detaching(ifv)) { + if (vlan_remove(ifv, TRUE) == 0) { vlan_unlock(); + ifvlan_release(ifv); return 0; } - vlan_remove(ifv); vlan_unlock(); - vlan_if_detach(ifp); + ifvlan_release(ifv); + ifnet_detach(ifp); return 0; } @@ -900,8 +1050,8 @@ vlan_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func func) ifvlan_ref ifv; vlan_lock(); - ifv = ifnet_softc(ifp); - if (ifv == NULL || ifvlan_flags_detaching(ifv)) { + ifv = ifnet_get_ifvlan_retained(ifp); + if (ifv == NULL) { vlan_unlock(); return (ENODEV); } @@ -925,6 +1075,7 @@ vlan_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func func) break; } vlan_unlock(); + ifvlan_release(ifv); return 0; } @@ -938,7 +1089,7 @@ vlan_output(struct ifnet * ifp, struct mbuf * m) struct ifnet * p; int soft_vlan; u_short tag; - vlan_parent_ref vlp; + vlan_parent_ref vlp = NULL; if (m == 0) { return (0); @@ -948,18 +1099,13 @@ vlan_output(struct ifnet * ifp, struct mbuf * m) return (0); } vlan_lock(); - ifv = (ifvlan_ref)ifnet_softc(ifp); - if (ifv == NULL || ifvlan_flags_detaching(ifv) - || ifvlan_flags_ready(ifv) == 0) { - vlan_unlock(); - m_freem_list(m); - return (0); + ifv = ifnet_get_ifvlan_retained(ifp); + if (ifv == NULL || ifvlan_flags_ready(ifv) == 0) { + goto unlock_done; } - vlp = ifv->ifv_vlp; + vlp = ifvlan_get_vlan_parent_retained(ifv); if (vlp == NULL) { - vlan_unlock(); - m_freem_list(m); - return (0); + goto unlock_done; } p = vlp->vlp_ifp; (void)ifnet_stat_increment_out(ifp, 1, m->m_pkthdr.len, 0); @@ -968,12 +1114,16 @@ vlan_output(struct 
ifnet * ifp, struct mbuf * m) tag = ifv->ifv_tag; encaplen = ifv->ifv_encaplen; vlan_unlock(); + + ifvlan_release(ifv); + vlan_parent_release(vlp); + vlan_bpf_output(ifp, m, bpf_func); /* do not run parent's if_output() if the parent is not up */ if ((ifnet_flags(p) & (IFF_UP | IFF_RUNNING)) != (IFF_UP | IFF_RUNNING)) { m_freem(m); - ifp->if_collisions++; + atomic_add_64(&ifp->if_collisions, 1); return (0); } /* @@ -992,7 +1142,7 @@ vlan_output(struct ifnet * ifp, struct mbuf * m) if (m == NULL) { printf("%s%d: unable to prepend VLAN header\n", ifnet_name(ifp), ifnet_unit(ifp)); - ifp->if_oerrors++; + atomic_add_64(&ifp->if_oerrors, 1); return (0); } /* M_PREPEND takes care of m_len, m_pkthdr.len for us */ @@ -1001,7 +1151,7 @@ vlan_output(struct ifnet * ifp, struct mbuf * m) if (m == NULL) { printf("%s%d: unable to pullup VLAN header\n", ifnet_name(ifp), ifnet_unit(ifp)); - ifp->if_oerrors++; + atomic_add_64(&ifp->if_oerrors, 1); return (0); } } @@ -1017,7 +1167,19 @@ vlan_output(struct ifnet * ifp, struct mbuf * m) evl->evl_encap_proto = htons(ETHERTYPE_VLAN); evl->evl_tag = htons(tag); } - return ifnet_output_raw(p, PF_VLAN, m); + return (ifnet_output_raw(p, PF_VLAN, m)); + + unlock_done: + vlan_unlock(); + if (ifv != NULL) { + ifvlan_release(ifv); + } + if (vlp != NULL) { + vlan_parent_release(vlp); + } + m_freem_list(m); + return (0); + } static int @@ -1120,20 +1282,17 @@ vlan_input(ifnet_t p, __unused protocol_family_t protocol, return 0; } -#define VLAN_CONFIG_PROGRESS_VLP_RETAINED 0x1 -#define VLAN_CONFIG_PROGRESS_IN_LIST 0x2 - static int vlan_config(struct ifnet * ifp, struct ifnet * p, int tag) { int error; - int first_vlan = 0; + int first_vlan = FALSE; ifvlan_ref ifv = NULL; - vlan_parent_ref new_vlp = NULL; + int ifv_added = FALSE; int need_vlp_release = 0; + vlan_parent_ref new_vlp = NULL; ifnet_offload_t offload; u_int16_t parent_flags; - u_int32_t progress = 0; vlan_parent_ref vlp = NULL; /* pre-allocate space for vlan_parent, in case we're first */ @@ -1143,14 +1302,19 @@ vlan_config(struct ifnet * ifp, struct ifnet * p, int tag) } vlan_lock(); - ifv = (ifvlan_ref)ifnet_softc(ifp); - if (ifv != NULL && ifv->ifv_vlp != NULL) { + ifv = ifnet_get_ifvlan_retained(ifp); + if (ifv == NULL || ifv->ifv_vlp != NULL) { vlan_unlock(); + if (ifv != NULL) { + ifvlan_release(ifv); + } vlan_parent_release(new_vlp); return (EBUSY); } vlp = parent_list_lookup(p); if (vlp != NULL) { + vlan_parent_retain(vlp); + need_vlp_release++; if (vlan_parent_lookup_tag(vlp, tag) != NULL) { /* already a VLAN with that tag on this interface */ error = EADDRINUSE; @@ -1158,28 +1322,38 @@ vlan_config(struct ifnet * ifp, struct ifnet * p, int tag) } } else { + /* one for being in the list */ + vlan_parent_retain(new_vlp); + /* we're the first VLAN on this interface */ LIST_INSERT_HEAD(&g_vlan->parent_list, new_vlp, vlp_parent_list); vlp = new_vlp; + + vlan_parent_retain(vlp); + need_vlp_release++; } /* need to wait to ensure no one else is trying to add/remove */ - vlan_parent_retain(vlp); - progress |= VLAN_CONFIG_PROGRESS_VLP_RETAINED; vlan_parent_wait(vlp, "vlan_config"); - ifv = (ifvlan_ref)ifnet_softc(ifp); - if (ifv == NULL) { - error = EOPNOTSUPP; + if (ifnet_get_ifvlan(ifp) != ifv) { + error = EINVAL; goto signal_done; } + + /* check again because someone might have gotten in */ + if (parent_list_lookup(p) != vlp) { + error = EBUSY; + goto signal_done; + } + if (vlan_parent_flags_detaching(vlp) || ifvlan_flags_detaching(ifv) || ifv->ifv_vlp != NULL) { error = EBUSY; goto signal_done; } - /* check 
again because someone might have gotten in */ + /* check again because someone might have gotten the tag */ if (vlan_parent_lookup_tag(vlp, tag) != NULL) { /* already a VLAN with that tag on this interface */ error = EADDRINUSE; @@ -1187,10 +1361,11 @@ vlan_config(struct ifnet * ifp, struct ifnet * p, int tag) } if (vlan_parent_no_vlans(vlp)) { - first_vlan = 1; + first_vlan = TRUE; } vlan_parent_add_vlan(vlp, ifv, tag); - progress |= VLAN_CONFIG_PROGRESS_IN_LIST; + ifvlan_retain(ifv); /* parent references ifv */ + ifv_added = TRUE; /* check whether bond interface is using parent interface */ ifnet_lock_exclusive(p); @@ -1271,34 +1446,44 @@ vlan_config(struct ifnet * ifp, struct ifnet * p, int tag) /* throw it away, it wasn't needed */ vlan_parent_release(new_vlp); } + if (ifv != NULL) { + ifvlan_release(ifv); + } return 0; signal_done: vlan_assert_lock_held(); - vlan_parent_signal(vlp, "vlan_config"); - unlock_done: - if ((progress & VLAN_CONFIG_PROGRESS_IN_LIST) != 0) { + if (ifv_added) { vlan_parent_remove_vlan(vlp, ifv); + if (!vlan_parent_flags_detaching(vlp) && vlan_parent_no_vlans(vlp)) { + /* the vlan parent has no more VLAN's */ + ifnet_set_eflags(p, 0, IFEF_VLAN); + LIST_REMOVE(vlp, vlp_parent_list); + /* release outside of the lock below */ + need_vlp_release++; + + /* one for being in the list */ + need_vlp_release++; + } } - if (!vlan_parent_flags_detaching(vlp) && vlan_parent_no_vlans(vlp)) { - /* the vlan parent has no more VLAN's */ - ifnet_set_eflags(p, 0, IFEF_VLAN); - LIST_REMOVE(vlp, vlp_parent_list); - /* release outside of the lock below */ - need_vlp_release = 1; - } + vlan_parent_signal(vlp, "vlan_config"); + + unlock_done: vlan_unlock(); - if ((progress & VLAN_CONFIG_PROGRESS_VLP_RETAINED) != 0) { - vlan_parent_release(vlp); - } - if (need_vlp_release) { + while (need_vlp_release--) { vlan_parent_release(vlp); } if (new_vlp != vlp) { vlan_parent_release(new_vlp); } + if (ifv != NULL) { + if (ifv_added) { + ifvlan_release(ifv); + } + ifvlan_release(ifv); + } return (error); } @@ -1311,7 +1496,7 @@ vlan_link_event(struct ifnet * ifp, struct ifnet * p) bzero(&ifmr, sizeof(ifmr)); snprintf(ifmr.ifm_name, sizeof(ifmr.ifm_name), "%s%d", ifnet_name(p), ifnet_unit(p)); - if (ifnet_ioctl(p, 0, SIOCGIFMEDIA, &ifmr) == 0 + if (ifnet_ioctl(p, 0, SIOCGIFMEDIA, &ifmr) == 0 && ifmr.ifm_count > 0 && ifmr.ifm_status & IFM_AVALID) { u_int32_t event; @@ -1323,36 +1508,36 @@ vlan_link_event(struct ifnet * ifp, struct ifnet * p) } static int -vlan_unconfig(struct ifnet * ifp) +vlan_unconfig(ifvlan_ref ifv, int need_to_wait) { - int error = 0; - ifvlan_ref ifv; - int last_vlan = 0; + struct ifnet * ifp = ifv->ifv_ifp; + int last_vlan = FALSE; + int need_ifv_release = 0; int need_vlp_release = 0; struct ifnet * p; vlan_parent_ref vlp; vlan_assert_lock_held(); - ifv = (ifvlan_ref)ifnet_softc(ifp); - if (ifv == NULL) { - return (0); - } vlp = ifv->ifv_vlp; if (vlp == NULL) { return (0); } - vlan_parent_retain(vlp); - vlan_parent_wait(vlp, "vlan_unconfig"); + if (need_to_wait) { + need_vlp_release++; + vlan_parent_retain(vlp); + vlan_parent_wait(vlp, "vlan_unconfig"); - /* check again because another thread could be in vlan_unconfig */ - ifv = (ifvlan_ref)ifnet_softc(ifp); - if (ifv == NULL) { - goto signal_done; - } - if (ifv->ifv_vlp != vlp) { - /* vlan parent changed */ - goto signal_done; + /* check again because another thread could be in vlan_unconfig */ + if (ifv != ifnet_get_ifvlan(ifp)) { + goto signal_done; + } + if (ifv->ifv_vlp != vlp) { + /* vlan parent changed */ + goto 
signal_done; + } } + + /* ifv has a reference on vlp, need to remove it */ need_vlp_release++; p = vlp->vlp_ifp; @@ -1362,56 +1547,67 @@ vlan_unconfig(struct ifnet * ifp) printf("vlan_unconfig: last vlan on %s%d\n", ifnet_name(p), ifnet_unit(p)); } - last_vlan = 1; + last_vlan = TRUE; } /* back-out any effect our mtu might have had on the parent */ - (void)vlan_new_mtu(ifp, ETHERMTU - ifv->ifv_mtufudge); + (void)ifvlan_new_mtu(ifv, ETHERMTU - ifv->ifv_mtufudge); vlan_unlock(); - /* detach VLAN "protocol" */ - if (last_vlan) { - (void)vlan_detach_protocol(p); - } - /* un-join multicast on parent interface */ (void)multicast_list_remove(&ifv->ifv_multicast); /* Clear our MAC address. */ ifnet_set_lladdr_and_type(ifp, NULL, 0, IFT_L2VLAN); - vlan_lock(); + /* detach VLAN "protocol" */ + if (last_vlan) { + (void)vlan_detach_protocol(p); + } - /* Disconnect from parent. */ - vlan_parent_remove_vlan(vlp, ifv); + vlan_lock(); /* return to the state we were in before SIFVLAN */ ifnet_set_mtu(ifp, 0); ifnet_set_flags(ifp, 0, IFF_BROADCAST | IFF_MULTICAST | IFF_SIMPLEX | IFF_RUNNING); ifnet_set_offload(ifp, 0); - ifv->ifv_flags = 0; ifv->ifv_mtufudge = 0; - if (!vlan_parent_flags_detaching(vlp) && vlan_parent_no_vlans(vlp)) { + /* Disconnect from parent. */ + vlan_parent_remove_vlan(vlp, ifv); + ifv->ifv_flags = 0; + + /* vlan_parent has reference to ifv, remove it */ + need_ifv_release++; + + /* from this point on, no more referencing ifv */ + if (last_vlan && !vlan_parent_flags_detaching(vlp)) { /* the vlan parent has no more VLAN's */ ifnet_set_eflags(p, 0, IFEF_VLAN); LIST_REMOVE(vlp, vlp_parent_list); + + /* one for being in the list */ + need_vlp_release++; + /* release outside of the lock below */ need_vlp_release++; } signal_done: - vlan_parent_signal(vlp, "vlan_unconfig"); + if (need_to_wait) { + vlan_parent_signal(vlp, "vlan_unconfig"); + } vlan_unlock(); - vlan_parent_release(vlp); /* one because we waited */ - - while (need_vlp_release--) { + while (need_ifv_release--) { + ifvlan_release(ifv); + } + while (need_vlp_release--) { /* references to vlp */ vlan_parent_release(vlp); } vlan_lock(); - return (error); + return (0); } static int @@ -1422,9 +1618,9 @@ vlan_set_promisc(struct ifnet * ifp) vlan_parent_ref vlp; vlan_lock(); - ifv = (ifvlan_ref)ifnet_softc(ifp); - if (ifv == NULL || ifvlan_flags_detaching(ifv)) { - error = (ifv == NULL) ? EOPNOTSUPP : EBUSY; + ifv = ifnet_get_ifvlan_retained(ifp); + if (ifv == NULL) { + error = EBUSY; goto done; } @@ -1449,22 +1645,24 @@ vlan_set_promisc(struct ifnet * ifp) } done: vlan_unlock(); + if (ifv != NULL) { + ifvlan_release(ifv); + } return (error); } static int -vlan_new_mtu(struct ifnet * ifp, int mtu) +ifvlan_new_mtu(ifvlan_ref ifv, int mtu) { struct ifdevmtu * devmtu_p; int error = 0; - ifvlan_ref ifv; + struct ifnet * ifp = ifv->ifv_ifp; int max_mtu; int new_mtu = 0; int req_mtu; vlan_parent_ref vlp; vlan_assert_lock_held(); - ifv = (ifvlan_ref)ifnet_softc(ifp); vlp = ifv->ifv_vlp; devmtu_p = &vlp->vlp_devmtu; req_mtu = mtu + ifv->ifv_mtufudge; @@ -1504,44 +1702,45 @@ vlan_set_mtu(struct ifnet * ifp, int mtu) return (EINVAL); } vlan_lock(); - ifv = (ifvlan_ref)ifnet_softc(ifp); - if (ifv == NULL || ifvlan_flags_detaching(ifv)) { + ifv = ifnet_get_ifvlan_retained(ifp); + if (ifv == NULL) { vlan_unlock(); - return ((ifv == NULL) ? 
EOPNOTSUPP : EBUSY); + return (EBUSY); } - vlp = ifv->ifv_vlp; - if (vlp == NULL || vlan_parent_flags_detaching(vlp)) { + vlp = ifvlan_get_vlan_parent_retained(ifv); + if (vlp == NULL) { vlan_unlock(); + ifvlan_release(ifv); if (mtu != 0) { return (EINVAL); } return (0); } - vlan_parent_retain(vlp); vlan_parent_wait(vlp, "vlan_set_mtu"); /* check again, something might have changed */ - ifv = (ifvlan_ref)ifnet_softc(ifp); - if (ifv == NULL || ifvlan_flags_detaching(ifv)) { - error = (ifv == NULL) ? EOPNOTSUPP : EBUSY; + if (ifnet_get_ifvlan(ifp) != ifv + || ifvlan_flags_detaching(ifv)) { + error = EBUSY; goto signal_done; } if (ifv->ifv_vlp != vlp) { /* vlan parent changed */ goto signal_done; } - if (vlp == NULL || vlan_parent_flags_detaching(vlp)) { + if (vlan_parent_flags_detaching(vlp)) { if (mtu != 0) { error = EINVAL; } goto signal_done; } - error = vlan_new_mtu(ifp, mtu); + error = ifvlan_new_mtu(ifv, mtu); signal_done: vlan_parent_signal(vlp, "vlan_set_mtu"); vlan_unlock(); vlan_parent_release(vlp); + ifvlan_release(ifv); return (error); } @@ -1685,7 +1884,10 @@ vlan_ioctl(ifnet_t ifp, u_long cmd, void * data) /* generate a link event based on the state of the parent */ vlan_link_event(ifp, p); - } else { + } + else { + int need_link_event = FALSE; + vlan_lock(); ifv = (ifvlan_ref)ifnet_softc(ifp); if (ifv == NULL || ifvlan_flags_detaching(ifv)) { @@ -1693,9 +1895,9 @@ vlan_ioctl(ifnet_t ifp, u_long cmd, void * data) error = (ifv == NULL ? EOPNOTSUPP : EBUSY); break; } - error = vlan_unconfig(ifp); + need_link_event = vlan_remove(ifv, TRUE); vlan_unlock(); - if (error == 0) { + if (need_link_event) { interface_link_event(ifp, KEV_DL_LINK_OFF); } } @@ -1748,22 +1950,20 @@ vlan_if_free(struct ifnet * ifp) if (ifp == NULL) { return; } - vlan_lock(); ifv = (ifvlan_ref)ifnet_softc(ifp); if (ifv == NULL) { - vlan_unlock(); return; } - vlan_unlock(); + ifvlan_release(ifv); ifnet_release(ifp); - FREE(ifv, M_VLAN); + return; } static void vlan_event(struct ifnet * p, __unused protocol_family_t protocol, const struct kev_msg * event) { - vlan_parent_ref vlp; + int event_code; /* Check if the interface we are attached to is being detached */ if (event->vendor_code != KEV_VENDOR_APPLE @@ -1771,43 +1971,28 @@ vlan_event(struct ifnet * p, __unused protocol_family_t protocol, || event->kev_subclass != KEV_DL_SUBCLASS) { return; } - switch (event->event_code) { - case KEV_DL_IF_DETACHING: + event_code = event->event_code; + switch (event_code) { case KEV_DL_LINK_OFF: case KEV_DL_LINK_ON: + vlan_parent_link_event(p, event_code); break; default: return; } - vlan_lock(); - if ((ifnet_eflags(p) & IFEF_VLAN) == 0) { - vlan_unlock(); - /* no VLAN's */ - return; - } - vlp = parent_list_lookup(p); - if (vlp == NULL) { - /* no VLAN's */ - vlan_unlock(); - return; - } - switch (event->event_code) { - case KEV_DL_IF_DETACHING: - vlan_parent_flags_set_detaching(vlp); - vlan_parent_remove_all_vlans(vlp); - break; - - case KEV_DL_LINK_OFF: - case KEV_DL_LINK_ON: - vlan_parent_link_event(vlp, event->event_code); - break; - default: - break; - } - vlan_unlock(); return; } +static errno_t +vlan_detached(ifnet_t p, __unused protocol_family_t protocol) +{ + if (ifnet_is_attached(p, 0) == 0) { + /* if the parent isn't attached, remove all VLANs */ + vlan_parent_remove_all_vlans(p); + } + return (0); +} + static void interface_link_event(struct ifnet * ifp, u_int32_t event_code) { @@ -1817,6 +2002,7 @@ interface_link_event(struct ifnet * ifp, u_int32_t event_code) char if_name[IFNAMSIZ]; } event; + bzero(&event, 
sizeof(event)); event.header.total_size = sizeof(event); event.header.vendor_code = KEV_VENDOR_APPLE; event.header.kev_class = KEV_NETWORK_CLASS; @@ -1830,13 +2016,45 @@ interface_link_event(struct ifnet * ifp, u_int32_t event_code) } static void -vlan_parent_link_event(vlan_parent_ref vlp, u_int32_t event_code) +vlan_parent_link_event(struct ifnet * p, u_int32_t event_code) { - ifvlan_ref ifv; + ifvlan_ref ifv; + vlan_parent_ref vlp; + vlan_lock(); + if ((ifnet_eflags(p) & IFEF_VLAN) == 0) { + vlan_unlock(); + /* no VLAN's */ + return; + } + vlp = parent_list_lookup(p); + if (vlp == NULL) { + /* no VLAN's */ + vlan_unlock(); + return; + } + + vlan_parent_retain(vlp); + vlan_parent_wait(vlp, "vlan_parent_link_event"); + if (vlan_parent_flags_detaching(vlp)) { + goto signal_done; + } + + vlan_unlock(); + + /* vlan_parent_wait() gives us exclusive access to the list */ LIST_FOREACH(ifv, &vlp->vlp_vlan_list, ifv_vlan_list) { - interface_link_event(ifv->ifv_ifp, event_code); + struct ifnet * ifp = ifv->ifv_ifp; + + interface_link_event(ifp, event_code); } + + vlan_lock(); + + signal_done: + vlan_parent_signal(vlp, "vlan_parent_link_event"); + vlan_unlock(); + vlan_parent_release(vlp); return; } @@ -1860,6 +2078,7 @@ vlan_attach_protocol(struct ifnet *ifp) bzero(®, sizeof(reg)); reg.input = vlan_input; reg.event = vlan_event; + reg.detached = vlan_detached; error = ifnet_attach_protocol(ifp, PF_VLAN, ®); if (error) { printf("vlan_proto_attach(%s%d) ifnet_attach_protocol failed, %d\n", diff --git a/bsd/net/kext_net.h b/bsd/net/kext_net.h index 6215515a3..48ade0710 100644 --- a/bsd/net/kext_net.h +++ b/bsd/net/kext_net.h @@ -46,48 +46,29 @@ * Internal implementation bits */ -struct socket_filter; - -#define SFEF_DETACHUSEZERO 0x1 /* Detach when use reaches zero */ -#define SFEF_UNREGISTERING 0x2 /* Remove due to unregister */ -#define SFEF_DETACHXREF 0x4 /* Extra reference held for detach */ - -struct socket_filter_entry { - struct socket_filter_entry *sfe_next_onsocket; - struct socket_filter_entry *sfe_next_onfilter; - - struct socket_filter *sfe_filter; - struct socket *sfe_socket; - void *sfe_cookie; - - u_int32_t sfe_flags; -}; - -#define SFF_DETACHING 0x1 - -struct socket_filter { - TAILQ_ENTRY(socket_filter) sf_protosw_next; - TAILQ_ENTRY(socket_filter) sf_global_next; - struct socket_filter_entry *sf_entry_head; - - struct protosw *sf_proto; - struct sflt_filter sf_filter; - u_int32_t sf_flags; - u_int32_t sf_usecount; -}; - -TAILQ_HEAD(socket_filter_list, socket_filter); - /* Private, internal implementation functions */ -void sflt_init(void) __attribute__((section("__TEXT, initcode"))); -void sflt_initsock(struct socket *so); -void sflt_termsock(struct socket *so); -void sflt_use(struct socket *so); -void sflt_unuse(struct socket *so); -void sflt_notify(struct socket *so, sflt_event_t event, void *param); -int sflt_data_in(struct socket *so, const struct sockaddr *from, mbuf_t *data, - mbuf_t *control, sflt_data_flag_t flags, int *filtered); -int sflt_attach_private(struct socket *so, struct socket_filter *filter, sflt_handle handle, int locked); +extern void sflt_init(void) __attribute__((section("__TEXT, initcode"))); +extern void sflt_initsock(struct socket *so); +extern void sflt_termsock(struct socket *so); +extern errno_t sflt_attach_internal(struct socket *so, sflt_handle handle); +extern void sflt_notify(struct socket *so, sflt_event_t event, void *param); +extern int sflt_ioctl(struct socket *so, u_long cmd, caddr_t data); +extern int sflt_bind(struct socket *so, const struct 
sockaddr *nam); +extern int sflt_listen(struct socket *so); +extern int sflt_accept(struct socket *head, struct socket *so, + const struct sockaddr *local, + const struct sockaddr *remote); +extern int sflt_getsockname(struct socket *so, struct sockaddr **local); +extern int sflt_getpeername(struct socket *so, struct sockaddr **remote); +extern int sflt_connectin(struct socket *head, const struct sockaddr *remote); +extern int sflt_connectout(struct socket *so, const struct sockaddr *nam); +extern int sflt_setsockopt(struct socket *so, struct sockopt *sopt); +extern int sflt_getsockopt(struct socket *so, struct sockopt *sopt); +extern int sflt_data_out(struct socket *so, const struct sockaddr *to, + mbuf_t *data, mbuf_t *control, + sflt_data_flag_t flags); +extern int sflt_data_in(struct socket *so, const struct sockaddr *from, + mbuf_t *data, mbuf_t *control, sflt_data_flag_t flags); #endif /* BSD_KERNEL_PRIVATE */ diff --git a/bsd/net/kpi_interface.c b/bsd/net/kpi_interface.c index e56564c58..82ba11b03 100644 --- a/bsd/net/kpi_interface.c +++ b/bsd/net/kpi_interface.c @@ -37,6 +37,7 @@ #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/kpi_mbuf.h> +#include <sys/mcache.h> #include <net/if_var.h> #include <net/if_dl.h> #include <net/dlil.h> @@ -55,9 +56,6 @@ #define TOUCHLASTCHANGE(__if_lastchange) microtime(__if_lastchange) #endif -extern struct dlil_threading_info *dlil_lo_thread_ptr; -extern int dlil_multithreaded_input; - static errno_t ifnet_list_get_common(ifnet_family_t, boolean_t, ifnet_t **, u_int32_t *); @@ -184,31 +182,15 @@ ifnet_allocate( } errno_t -ifnet_reference( - ifnet_t ifp) +ifnet_reference(ifnet_t ifp) { - int oldval; - - if (ifp == NULL) return EINVAL; - - oldval = OSIncrementAtomic(&ifp->if_refcnt); - - return 0; + return (dlil_if_ref(ifp)); } errno_t -ifnet_release( - ifnet_t ifp) +ifnet_release(ifnet_t ifp) { - int oldval; - - if (ifp == NULL) return EINVAL; - - oldval = OSDecrementAtomic(&ifp->if_refcnt); - if (oldval == 0) - panic("ifnet_release - refcount decremented past zero!"); - - return 0; + return (dlil_if_free(ifp)); } errno_t @@ -256,27 +238,22 @@ ifnet_index( } errno_t -ifnet_set_flags( - ifnet_t interface, - u_int16_t new_flags, - u_int16_t mask) +ifnet_set_flags(ifnet_t interface, u_int16_t new_flags, u_int16_t mask) { - int lock; - - if (interface == NULL) return EINVAL; - lock = (interface->if_lock != 0); - - if (lock) ifnet_lock_exclusive(interface); - + if (interface == NULL) + return (EINVAL); + + ifnet_lock_exclusive(interface); + /* If we are modifying the up/down state, call if_updown */ - if (lock && (mask & IFF_UP) != 0) { + if ((mask & IFF_UP) != 0) { if_updown(interface, (new_flags & IFF_UP) == IFF_UP); } - + interface->if_flags = (new_flags & mask) | (interface->if_flags & ~mask); - if (lock) ifnet_lock_done(interface); - - return 0; + ifnet_lock_done(interface); + + return (0); } u_int16_t @@ -287,21 +264,16 @@ ifnet_flags( } errno_t -ifnet_set_eflags( - ifnet_t interface, - u_int32_t new_flags, - u_int32_t mask) +ifnet_set_eflags(ifnet_t interface, u_int32_t new_flags, u_int32_t mask) { - int lock; - - if (interface == NULL) return EINVAL; - lock = (interface->if_lock != 0); - - if (lock) ifnet_lock_exclusive(interface); + if (interface == NULL) + return (EINVAL); + + ifnet_lock_exclusive(interface); interface->if_eflags = (new_flags & mask) | (interface->if_eflags & ~mask); - if (lock) ifnet_lock_done(interface); - - return 0; + ifnet_lock_done(interface); + + return (0); } u_int32_t @@ -312,19 +284,28 @@ ifnet_eflags( } 
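Note: ifnet_set_flags() and ifnet_set_eflags() above both apply the same read-modify-write rule under the exclusive ifnet lock: bits selected by `mask' are replaced by the corresponding bits of `new_flags', and all other bits are preserved. A stand-alone illustration with made-up values:

static u_int32_t
apply_masked(u_int32_t cur, u_int32_t new_flags, u_int32_t mask)
{
	/* replace only the bits selected by mask */
	return ((new_flags & mask) | (cur & ~mask));
}

/*
 * apply_masked(0x0011, 0x0100, 0x0101) == 0x0110:
 * bit 0x0100 is set, bit 0x0001 is cleared, bit 0x0010 is untouched.
 */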
errno_t -ifnet_set_idle_flags(ifnet_t ifp, u_int32_t new_flags, u_int32_t mask) +ifnet_set_idle_flags_locked(ifnet_t ifp, u_int32_t new_flags, u_int32_t mask) { -#if IFNET_ROUTE_REFCNT - int lock, before, after; + int before, after; if (ifp == NULL) return (EINVAL); - lck_mtx_lock(rnh_lock); + lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); + ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE); - lock = (ifp->if_lock != NULL); - if (lock) - ifnet_lock_exclusive(ifp); + /* + * If this is called prior to ifnet attach, the actual work will + * be done at attach time. Otherwise, if it is called after + * ifnet detach, then it is a no-op. + */ + if (!ifnet_is_attached(ifp, 0)) { + ifp->if_idle_new_flags = new_flags; + ifp->if_idle_new_flags_mask = mask; + return (0); + } else { + ifp->if_idle_new_flags = ifp->if_idle_new_flags_mask = 0; + } before = ifp->if_idle_flags; ifp->if_idle_flags = (new_flags & mask) | (ifp->if_idle_flags & ~mask); @@ -345,49 +326,140 @@ ifnet_set_idle_flags(ifnet_t ifp, u_int32_t new_flags, u_int32_t mask) rt_aggdrain(1); } - if (lock) - ifnet_lock_done(ifp); + return (0); +} + +errno_t +ifnet_set_idle_flags(ifnet_t ifp, u_int32_t new_flags, u_int32_t mask) +{ + errno_t err; + lck_mtx_lock(rnh_lock); + ifnet_lock_exclusive(ifp); + err = ifnet_set_idle_flags_locked(ifp, new_flags, mask); + ifnet_lock_done(ifp); lck_mtx_unlock(rnh_lock); - return (0); -#else -#pragma unused(ifp, new_flags, mask) - return (ENOTSUP); -#endif /* IFNET_ROUTE_REFCNT */ + return (err); } u_int32_t ifnet_idle_flags(ifnet_t ifp) { -#if IFNET_ROUTE_REFCNT return ((ifp == NULL) ? 0 : ifp->if_idle_flags); -#else -#pragma unused(ifp) - return (0); -#endif /* IFNET_ROUTE_REFCNT */ +} + +errno_t ifnet_set_capabilities_supported(ifnet_t ifp, u_int32_t new_caps, + u_int32_t mask) +{ + errno_t error = 0; + int tmp; + + if (ifp == NULL) + return EINVAL; + + ifnet_lock_exclusive(ifp); + tmp = (new_caps & mask) | (ifp->if_capabilities & ~mask); + if ((tmp & ~IFCAP_VALID)) + error = EINVAL; + else + ifp->if_capabilities = tmp; + ifnet_lock_done(ifp); + + return error; +} + +u_int32_t ifnet_capabilities_supported(ifnet_t ifp) +{ + return ((ifp == NULL) ? 0 : ifp->if_capabilities); +} + + +errno_t ifnet_set_capabilities_enabled(ifnet_t ifp, u_int32_t new_caps, + u_int32_t mask) +{ + errno_t error = 0; + int tmp; + struct kev_msg ev_msg; + struct net_event_data ev_data; + + if (ifp == NULL) + return EINVAL; + + ifnet_lock_exclusive(ifp); + tmp = (new_caps & mask) | (ifp->if_capenable & ~mask); + if ((tmp & ~IFCAP_VALID) || (tmp & ~ifp->if_capabilities)) + error = EINVAL; + else + ifp->if_capenable = tmp; + ifnet_lock_done(ifp); + + /* Notify application of the change */ + bzero(&ev_data, sizeof(struct net_event_data)); + bzero(&ev_msg, sizeof(struct kev_msg)); + ev_msg.vendor_code = KEV_VENDOR_APPLE; + ev_msg.kev_class = KEV_NETWORK_CLASS; + ev_msg.kev_subclass = KEV_DL_SUBCLASS; + + ev_msg.event_code = KEV_DL_IFCAP_CHANGED; + strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ); + ev_data.if_family = ifp->if_family; + ev_data.if_unit = (u_int32_t) ifp->if_unit; + ev_msg.dv[0].data_length = sizeof(struct net_event_data); + ev_msg.dv[0].data_ptr = &ev_data; + ev_msg.dv[1].data_length = 0; + kev_post_msg(&ev_msg); + + return error; +} + +u_int32_t ifnet_capabilities_enabled(ifnet_t ifp) +{ + return ((ifp == NULL) ? 
0 : ifp->if_capenable); } static const ifnet_offload_t offload_mask = IFNET_CSUM_IP | IFNET_CSUM_TCP | IFNET_CSUM_UDP | IFNET_CSUM_FRAGMENT | IFNET_IP_FRAGMENT | + IFNET_CSUM_TCPIPV6 | IFNET_CSUM_UDPIPV6 | IFNET_IPV6_FRAGMENT | IFNET_CSUM_SUM16 | IFNET_VLAN_TAGGING | IFNET_VLAN_MTU | IFNET_MULTIPAGES | IFNET_TSO_IPV4 | IFNET_TSO_IPV6; +static const ifnet_offload_t any_offload_csum = IFNET_CSUM_IP | IFNET_CSUM_TCP | + IFNET_CSUM_UDP | IFNET_CSUM_FRAGMENT | + IFNET_CSUM_TCPIPV6 | IFNET_CSUM_UDPIPV6 | + IFNET_CSUM_SUM16; + + errno_t -ifnet_set_offload( - ifnet_t interface, - ifnet_offload_t offload) +ifnet_set_offload(ifnet_t interface, ifnet_offload_t offload) { - int lock; - - if (interface == NULL) return EINVAL; - lock = (interface->if_lock != 0); + u_int32_t ifcaps = 0; - if (lock) ifnet_lock_exclusive(interface); - interface->if_hwassist = (offload & offload_mask); - if (lock) ifnet_lock_done(interface); - - return 0; + if (interface == NULL) + return (EINVAL); + + ifnet_lock_exclusive(interface); + interface->if_hwassist = (offload & offload_mask); + ifnet_lock_done(interface); + + if ((offload & any_offload_csum)) + ifcaps |= IFCAP_HWCSUM; + if ((offload & IFNET_TSO_IPV4)) + ifcaps |= IFCAP_TSO4; + if ((offload & IFNET_TSO_IPV6)) + ifcaps |= IFCAP_TSO6; + if ((offload & IFNET_VLAN_MTU)) + ifcaps |= IFCAP_VLAN_MTU; + if ((offload & IFNET_VLAN_TAGGING)) + ifcaps |= IFCAP_VLAN_HWTAGGING; + if (ifcaps != 0) { + (void) ifnet_set_capabilities_supported(interface, ifcaps, IFCAP_VALID); + (void) ifnet_set_capabilities_enabled(interface, ifcaps, IFCAP_VALID); + } + + return (0); } ifnet_offload_t @@ -466,13 +538,14 @@ ifnet_get_tso_mtu( return error; } -errno_t +errno_t ifnet_set_wake_flags(ifnet_t interface, u_int32_t properties, u_int32_t mask) { - int lock; - struct kev_msg ev_msg; - struct net_event_data ev_data; - + struct kev_msg ev_msg; + struct net_event_data ev_data; + + bzero(&ev_data, sizeof(struct net_event_data)); + bzero(&ev_msg, sizeof(struct kev_msg)); if (interface == NULL) return EINVAL; @@ -480,15 +553,11 @@ ifnet_set_wake_flags(ifnet_t interface, u_int32_t properties, u_int32_t mask) if ((properties & mask) & ~IF_WAKE_VALID_FLAGS) return EINVAL; - lock = (interface->if_lock != 0); - - if (lock) - ifnet_lock_exclusive(interface); + ifnet_lock_exclusive(interface); interface->if_wake_properties = (properties & mask) | (interface->if_wake_properties & ~mask); - if (lock) - ifnet_lock_done(interface); + ifnet_lock_done(interface); (void) ifnet_touch_lastchange(interface); @@ -505,7 +574,7 @@ ifnet_set_wake_flags(ifnet_t interface, u_int32_t properties, u_int32_t mask) ev_msg.dv[0].data_ptr = &ev_data; ev_msg.dv[1].data_length = 0; kev_post_msg(&ev_msg); - + return 0; } @@ -515,55 +584,43 @@ ifnet_get_wake_flags(ifnet_t interface) { return interface == NULL ? 0 : interface->if_wake_properties; } - - - /* * Should MIB data store a copy?
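* As implemented below, ifnet_set_link_mib_data() stores the caller's pointer and length rather than a copy, so the caller must keep mibData valid while it is registered; ifnet_get_link_mib_data() copies the data out under the shared ifnet lock.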
*/ errno_t -ifnet_set_link_mib_data( - ifnet_t interface, - void* mibData, - u_int32_t mibLen) +ifnet_set_link_mib_data(ifnet_t interface, void *mibData, u_int32_t mibLen) { - int lock; - - if (interface == NULL) return EINVAL; - lock = (interface->if_lock != 0); - - if (lock) ifnet_lock_exclusive(interface); + if (interface == NULL) + return (EINVAL); + + ifnet_lock_exclusive(interface); interface->if_linkmib = (void*)mibData; interface->if_linkmiblen = mibLen; - if (lock) ifnet_lock_done(interface); - return 0; + ifnet_lock_done(interface); + return (0); } errno_t -ifnet_get_link_mib_data( - ifnet_t interface, - void *mibData, - u_int32_t *mibLen) +ifnet_get_link_mib_data(ifnet_t interface, void *mibData, u_int32_t *mibLen) { errno_t result = 0; - int lock; - - if (interface == NULL) return EINVAL; - lock = (interface->if_lock != NULL); - - if (lock) ifnet_lock_shared(interface); + + if (interface == NULL) + return (EINVAL); + + ifnet_lock_shared(interface); if (*mibLen < interface->if_linkmiblen) result = EMSGSIZE; if (result == 0 && interface->if_linkmib == NULL) result = ENOTSUP; - + if (result == 0) { *mibLen = interface->if_linkmiblen; bcopy(interface->if_linkmib, mibData, *mibLen); } - if (lock) ifnet_lock_done(interface); - - return result; + ifnet_lock_done(interface); + + return (result); } u_int32_t @@ -634,15 +691,12 @@ ifnet_type( #if 0 errno_t -ifnet_set_typelen( - ifnet_t interface, - u_char typelen) +ifnet_set_typelen(ifnet_t interface, u_char typelen) { - int lock = (interface->if_lock != 0); - if (lock) ifnet_lock_exclusive(interface); + ifnet_lock_exclusive(interface); interface->if_data.ifi_typelen = typelen; - if (lock) ifnet_lock_done(interface); - return 0; + ifnet_lock_done(interface); + return (0); } u_char @@ -733,310 +787,283 @@ ifnet_baudrate( } errno_t -ifnet_stat_increment( - ifnet_t interface, - const struct ifnet_stat_increment_param *counts) +ifnet_stat_increment(ifnet_t interface, + const struct ifnet_stat_increment_param *counts) { - struct dlil_threading_info *thread; - if (interface == NULL) return EINVAL; - - if ((thread = interface->if_input_thread) == NULL || (dlil_multithreaded_input == 0)) - thread = dlil_lo_thread_ptr; + if (interface == NULL) + return (EINVAL); - lck_mtx_lock(thread->input_lck); + atomic_add_64(&interface->if_data.ifi_ipackets, counts->packets_in); + atomic_add_64(&interface->if_data.ifi_ibytes, counts->bytes_in); + atomic_add_64(&interface->if_data.ifi_ierrors, counts->errors_in); - interface->if_data.ifi_ipackets += counts->packets_in; - interface->if_data.ifi_ibytes += counts->bytes_in; - interface->if_data.ifi_ierrors += counts->errors_in; + atomic_add_64(&interface->if_data.ifi_opackets, counts->packets_out); + atomic_add_64(&interface->if_data.ifi_obytes, counts->bytes_out); + atomic_add_64(&interface->if_data.ifi_oerrors, counts->errors_out); - interface->if_data.ifi_opackets += counts->packets_out; - interface->if_data.ifi_obytes += counts->bytes_out; - interface->if_data.ifi_oerrors += counts->errors_out; + atomic_add_64(&interface->if_data.ifi_collisions, counts->collisions); + atomic_add_64(&interface->if_data.ifi_iqdrops, counts->dropped); - interface->if_data.ifi_collisions += counts->collisions; - interface->if_data.ifi_iqdrops += counts->dropped; - /* Touch the last change time. 
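The per-counter updates above use lock-free 64-bit atomics (atomic_add_64), so the input-thread lock taken by the old code is no longer needed here.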
*/ TOUCHLASTCHANGE(&interface->if_lastchange); - lck_mtx_unlock(thread->input_lck); - - return 0; + return (0); } errno_t -ifnet_stat_increment_in( - ifnet_t interface, - u_int32_t packets_in, - u_int32_t bytes_in, - u_int32_t errors_in) +ifnet_stat_increment_in(ifnet_t interface, u_int32_t packets_in, + u_int32_t bytes_in, u_int32_t errors_in) { - struct dlil_threading_info *thread; - - if (interface == NULL) return EINVAL; - - if ((thread = interface->if_input_thread) == NULL || (dlil_multithreaded_input == 0)) - thread = dlil_lo_thread_ptr; - - lck_mtx_lock(thread->input_lck); + if (interface == NULL) + return (EINVAL); - interface->if_data.ifi_ipackets += packets_in; - interface->if_data.ifi_ibytes += bytes_in; - interface->if_data.ifi_ierrors += errors_in; + atomic_add_64(&interface->if_data.ifi_ipackets, packets_in); + atomic_add_64(&interface->if_data.ifi_ibytes, bytes_in); + atomic_add_64(&interface->if_data.ifi_ierrors, errors_in); TOUCHLASTCHANGE(&interface->if_lastchange); - lck_mtx_unlock(thread->input_lck); - - return 0; + return (0); } errno_t -ifnet_stat_increment_out( - ifnet_t interface, - u_int32_t packets_out, - u_int32_t bytes_out, - u_int32_t errors_out) +ifnet_stat_increment_out(ifnet_t interface, u_int32_t packets_out, + u_int32_t bytes_out, u_int32_t errors_out) { - struct dlil_threading_info *thread; - if (interface == NULL) return EINVAL; - - if ((thread = interface->if_input_thread) == NULL || (dlil_multithreaded_input == 0)) - thread = dlil_lo_thread_ptr; - - lck_mtx_lock(thread->input_lck); + if (interface == NULL) + return (EINVAL); - interface->if_data.ifi_opackets += packets_out; - interface->if_data.ifi_obytes += bytes_out; - interface->if_data.ifi_oerrors += errors_out; + atomic_add_64(&interface->if_data.ifi_opackets, packets_out); + atomic_add_64(&interface->if_data.ifi_obytes, bytes_out); + atomic_add_64(&interface->if_data.ifi_oerrors, errors_out); TOUCHLASTCHANGE(&interface->if_lastchange); - lck_mtx_unlock(thread->input_lck); - - return 0; + return (0); } errno_t -ifnet_set_stat( - ifnet_t interface, - const struct ifnet_stats_param *stats) +ifnet_set_stat(ifnet_t interface, const struct ifnet_stats_param *stats) { - struct dlil_threading_info *thread; + if (interface == NULL) + return (EINVAL); - if (interface == NULL) return EINVAL; - - if ((thread = interface->if_input_thread) == NULL || (dlil_multithreaded_input == 0)) - thread = dlil_lo_thread_ptr; + atomic_set_64(&interface->if_data.ifi_ipackets, stats->packets_in); + atomic_set_64(&interface->if_data.ifi_ibytes, stats->bytes_in); + atomic_set_64(&interface->if_data.ifi_imcasts, stats->multicasts_in); + atomic_set_64(&interface->if_data.ifi_ierrors, stats->errors_in); - lck_mtx_lock(thread->input_lck); + atomic_set_64(&interface->if_data.ifi_opackets, stats->packets_out); + atomic_set_64(&interface->if_data.ifi_obytes, stats->bytes_out); + atomic_set_64(&interface->if_data.ifi_omcasts, stats->multicasts_out); + atomic_set_64(&interface->if_data.ifi_oerrors, stats->errors_out); - interface->if_data.ifi_ipackets = stats->packets_in; - interface->if_data.ifi_ibytes = stats->bytes_in; - interface->if_data.ifi_imcasts = stats->multicasts_in; - interface->if_data.ifi_ierrors = stats->errors_in; - - interface->if_data.ifi_opackets = stats->packets_out; - interface->if_data.ifi_obytes = stats->bytes_out; - interface->if_data.ifi_omcasts = stats->multicasts_out; - interface->if_data.ifi_oerrors = stats->errors_out; - - interface->if_data.ifi_collisions = stats->collisions; - 
interface->if_data.ifi_iqdrops = stats->dropped; - interface->if_data.ifi_noproto = stats->no_protocol; + atomic_set_64(&interface->if_data.ifi_collisions, stats->collisions); + atomic_set_64(&interface->if_data.ifi_iqdrops, stats->dropped); + atomic_set_64(&interface->if_data.ifi_noproto, stats->no_protocol); /* Touch the last change time. */ TOUCHLASTCHANGE(&interface->if_lastchange); - lck_mtx_unlock(thread->input_lck); - return 0; } errno_t -ifnet_stat( - ifnet_t interface, - struct ifnet_stats_param *stats) +ifnet_stat(ifnet_t interface, struct ifnet_stats_param *stats) { - struct dlil_threading_info *thread; - - if (interface == NULL) return EINVAL; - - if ((thread = interface->if_input_thread) == NULL || (dlil_multithreaded_input == 0)) - thread = dlil_lo_thread_ptr; - - lck_mtx_lock(thread->input_lck); + if (interface == NULL) + return (EINVAL); - stats->packets_in = interface->if_data.ifi_ipackets; - stats->bytes_in = interface->if_data.ifi_ibytes; - stats->multicasts_in = interface->if_data.ifi_imcasts; - stats->errors_in = interface->if_data.ifi_ierrors; + atomic_get_64(stats->packets_in, &interface->if_data.ifi_ipackets); + atomic_get_64(stats->bytes_in, &interface->if_data.ifi_ibytes); + atomic_get_64(stats->multicasts_in, &interface->if_data.ifi_imcasts); + atomic_get_64(stats->errors_in, &interface->if_data.ifi_ierrors); - stats->packets_out = interface->if_data.ifi_opackets; - stats->bytes_out = interface->if_data.ifi_obytes; - stats->multicasts_out = interface->if_data.ifi_omcasts; - stats->errors_out = interface->if_data.ifi_oerrors; + atomic_get_64(stats->packets_out, &interface->if_data.ifi_opackets); + atomic_get_64(stats->bytes_out, &interface->if_data.ifi_obytes); + atomic_get_64(stats->multicasts_out, &interface->if_data.ifi_omcasts); + atomic_get_64(stats->errors_out, &interface->if_data.ifi_oerrors); - stats->collisions = interface->if_data.ifi_collisions; - stats->dropped = interface->if_data.ifi_iqdrops; - stats->no_protocol = interface->if_data.ifi_noproto; + atomic_get_64(stats->collisions, &interface->if_data.ifi_collisions); + atomic_get_64(stats->dropped, &interface->if_data.ifi_iqdrops); + atomic_get_64(stats->no_protocol, &interface->if_data.ifi_noproto); - lck_mtx_unlock(thread->input_lck); - - return 0; + return (0); } errno_t -ifnet_touch_lastchange( - ifnet_t interface) +ifnet_touch_lastchange(ifnet_t interface) { - struct dlil_threading_info *thread; - - if (interface == NULL) return EINVAL; - - if ((thread = interface->if_input_thread) == NULL || (dlil_multithreaded_input == 0)) - thread = dlil_lo_thread_ptr; - - lck_mtx_lock(thread->input_lck); + if (interface == NULL) + return (EINVAL); TOUCHLASTCHANGE(&interface->if_lastchange); - lck_mtx_unlock(thread->input_lck); - - return 0; + return (0); } errno_t -ifnet_lastchange( - ifnet_t interface, - struct timeval *last_change) +ifnet_lastchange(ifnet_t interface, struct timeval *last_change) { - struct dlil_threading_info *thread; - - if (interface == NULL) return EINVAL; - - if ((thread = interface->if_input_thread) == NULL || (dlil_multithreaded_input == 0)) - thread = dlil_lo_thread_ptr; - - lck_mtx_lock(thread->input_lck); + if (interface == NULL) + return (EINVAL); *last_change = interface->if_data.ifi_lastchange; - - lck_mtx_unlock(thread->input_lck); - #if IF_LASTCHANGEUPTIME /* Crude conversion from uptime to calendar time */ last_change->tv_sec += boottime_sec(); #endif - - return 0; + return (0); } errno_t -ifnet_get_address_list( - ifnet_t interface, - ifaddr_t **addresses) 
+ifnet_get_address_list(ifnet_t interface, ifaddr_t **addresses) { - if (addresses == NULL) return EINVAL; - return ifnet_get_address_list_family(interface, addresses, 0); + return (addresses == NULL ? EINVAL : + ifnet_get_address_list_family(interface, addresses, 0)); } +struct ifnet_addr_list { + SLIST_ENTRY(ifnet_addr_list) ifal_le; + struct ifaddr *ifal_ifa; +}; + errno_t -ifnet_get_address_list_family( - ifnet_t interface, - ifaddr_t **addresses, - sa_family_t family) +ifnet_get_address_list_family(ifnet_t interface, ifaddr_t **addresses, + sa_family_t family) +{ + return (ifnet_get_address_list_family_internal(interface, addresses, + family, 0, M_NOWAIT)); +} + +__private_extern__ errno_t +ifnet_get_address_list_family_internal(ifnet_t interface, ifaddr_t **addresses, + sa_family_t family, int detached, int how) { + SLIST_HEAD(, ifnet_addr_list) ifal_head; + struct ifnet_addr_list *ifal, *ifal_tmp; struct ifnet *ifp; int count = 0; - int cmax = 0; - - if (addresses == NULL) return EINVAL; + errno_t err = 0; + + SLIST_INIT(&ifal_head); + + if (addresses == NULL) { + err = EINVAL; + goto done; + } *addresses = NULL; - + + if (detached) { + /* + * Interface has been detached, so skip the lookup + * at ifnet_head and go directly to inner loop. + */ + ifp = interface; + if (ifp == NULL) { + err = EINVAL; + goto done; + } + goto one; + } + ifnet_head_lock_shared(); - TAILQ_FOREACH(ifp, &ifnet, if_link) - { - if (interface && ifp != interface) continue; - + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + if (interface != NULL && ifp != interface) + continue; +one: ifnet_lock_shared(ifp); - if ((ifp->if_eflags & IFEF_DETACHING) == 0) { - if (interface == NULL || interface == ifp) - { - struct ifaddr *addr; - TAILQ_FOREACH(addr, &ifp->if_addrhead, ifa_link) - { - if (family == 0 || addr->ifa_addr->sa_family == family) - cmax++; + if (interface == NULL || interface == ifp) { + struct ifaddr *ifa; + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + IFA_LOCK(ifa); + if (family != 0 && + ifa->ifa_addr->sa_family != family) { + IFA_UNLOCK(ifa); + continue; } + MALLOC(ifal, struct ifnet_addr_list *, + sizeof (*ifal), M_TEMP, how); + if (ifal == NULL) { + IFA_UNLOCK(ifa); + ifnet_lock_done(ifp); + if (!detached) + ifnet_head_done(); + err = ENOMEM; + goto done; + } + ifal->ifal_ifa = ifa; + IFA_ADDREF_LOCKED(ifa); + SLIST_INSERT_HEAD(&ifal_head, ifal, ifal_le); + ++count; + IFA_UNLOCK(ifa); } } - else if (interface != NULL) { - ifnet_lock_done(ifp); - ifnet_head_done(); - return ENXIO; - } ifnet_lock_done(ifp); + if (detached) + break; } - - MALLOC(*addresses, ifaddr_t*, sizeof(ifaddr_t) * (cmax + 1), M_TEMP, M_NOWAIT); - if (*addresses == NULL) { + if (!detached) ifnet_head_done(); - return ENOMEM; + + if (count == 0) { + err = ENXIO; + goto done; } - - TAILQ_FOREACH(ifp, &ifnet, if_link) - { - if (interface && ifp != interface) continue; - - ifnet_lock_shared(ifp); - if ((ifp->if_eflags & IFEF_DETACHING) == 0) { - if (interface == NULL || (struct ifnet*)interface == ifp) - { - struct ifaddr *addr; - TAILQ_FOREACH(addr, &ifp->if_addrhead, ifa_link) - { - if (count + 1 > cmax) break; - if (family == 0 || addr->ifa_addr->sa_family == family) { - (*addresses)[count] = (ifaddr_t)addr; - ifaddr_reference((*addresses)[count]); - count++; - } - } - } - } - ifnet_lock_done(ifp); - if (interface || count == cmax) - break; + MALLOC(*addresses, ifaddr_t *, sizeof (ifaddr_t) * (count + 1), + M_TEMP, how); + if (*addresses == NULL) { + err = ENOMEM; + goto done; } - ifnet_head_done(); - (*addresses)[cmax] = 0; - 
- return 0; + bzero(*addresses, sizeof (ifaddr_t) * (count + 1)); + +done: + SLIST_FOREACH_SAFE(ifal, &ifal_head, ifal_le, ifal_tmp) { + SLIST_REMOVE(&ifal_head, ifal, ifnet_addr_list, ifal_le); + if (err == 0) + (*addresses)[--count] = ifal->ifal_ifa; + else + IFA_REMREF(ifal->ifal_ifa); + FREE(ifal, M_TEMP); + } + + return (err); } void -ifnet_free_address_list( - ifaddr_t *addresses) +ifnet_free_address_list(ifaddr_t *addresses) { int i; - - if (addresses == NULL) return; - + + if (addresses == NULL) + return; + for (i = 0; addresses[i] != NULL; i++) - { - ifaddr_release(addresses[i]); - } - + IFA_REMREF(addresses[i]); + FREE(addresses, M_TEMP); } -void* -ifnet_lladdr( - ifnet_t interface) +void * +ifnet_lladdr(ifnet_t interface) { - if (interface == NULL) return NULL; - return LLADDR(SDL(interface->if_addrhead.tqh_first->ifa_addr)); + struct ifaddr *ifa; + void *lladdr; + + if (interface == NULL) + return (NULL); + + /* + * if_lladdr points to the permanent link address of + * the interface; it never gets deallocated. + */ + ifa = interface->if_lladdr; + IFA_LOCK_SPIN(ifa); + lladdr = LLADDR(SDL(ifa->ifa_addr)); + IFA_UNLOCK(ifa); + + return (lladdr); } errno_t @@ -1068,74 +1095,80 @@ ifnet_llbroadcast_copy_bytes( } errno_t -ifnet_lladdr_copy_bytes( - ifnet_t interface, - void* lladdr, - size_t lladdr_len) +ifnet_lladdr_copy_bytes(ifnet_t interface, void *lladdr, size_t lladdr_len) { struct sockaddr_dl *sdl; - if (interface == NULL || lladdr == NULL) return EINVAL; - - sdl = SDL(interface->if_addrhead.tqh_first->ifa_addr); - - while (1) { - if (lladdr_len != sdl->sdl_alen) { - bzero(lladdr, lladdr_len); - return EMSGSIZE; - } - bcopy(LLADDR(sdl), lladdr, lladdr_len); - if (bcmp(lladdr, LLADDR(sdl), lladdr_len) == 0 && - lladdr_len == sdl->sdl_alen) - break; + struct ifaddr *ifa; + + if (interface == NULL || lladdr == NULL) + return (EINVAL); + + /* + * if_lladdr points to the permanent link address of + * the interface; it never gets deallocated. 
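+ * An illustrative caller (hypothetical, not part of this patch): + * uint8_t mac[6]; ifnet_lladdr_copy_bytes(ifp, mac, sizeof (mac)); + * which succeeds only when the buffer length equals sdl_alen.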
+ */ + ifa = interface->if_lladdr; + IFA_LOCK_SPIN(ifa); + sdl = SDL(ifa->ifa_addr); + if (lladdr_len != sdl->sdl_alen) { + bzero(lladdr, lladdr_len); + IFA_UNLOCK(ifa); + return (EMSGSIZE); } - return 0; + bcopy(LLADDR(sdl), lladdr, lladdr_len); + IFA_UNLOCK(ifa); + + return (0); } static errno_t -ifnet_set_lladdr_internal( - ifnet_t interface, - const void *lladdr, - size_t lladdr_len, - u_char new_type, - int apply_type) +ifnet_set_lladdr_internal(ifnet_t interface, const void *lladdr, + size_t lladdr_len, u_char new_type, int apply_type) { struct ifaddr *ifa; - struct sockaddr_dl *sdl; errno_t error = 0; - - if (interface == NULL) return EINVAL; - - if (lladdr_len != 0 && (lladdr_len != interface->if_addrlen || lladdr == 0)) - return EINVAL; - + + if (interface == NULL) + return (EINVAL); + ifnet_head_lock_shared(); + ifnet_lock_exclusive(interface); + if (lladdr_len != 0 && + (lladdr_len != interface->if_addrlen || lladdr == 0)) { + ifnet_lock_done(interface); + ifnet_head_done(); + return (EINVAL); + } ifa = ifnet_addrs[interface->if_index - 1]; if (ifa != NULL) { + struct sockaddr_dl *sdl; + + IFA_LOCK_SPIN(ifa); sdl = (struct sockaddr_dl*)ifa->ifa_addr; if (lladdr_len != 0) { bcopy(lladdr, LLADDR(sdl), lladdr_len); - } - else { + } else { bzero(LLADDR(sdl), interface->if_addrlen); } sdl->sdl_alen = lladdr_len; - + if (apply_type) { sdl->sdl_type = new_type; } - } - else { + IFA_UNLOCK(ifa); + } else { error = ENXIO; } + ifnet_lock_done(interface); ifnet_head_done(); - + /* Generate a kernel event */ if (error == 0) { dlil_post_msg(interface, KEV_DL_SUBCLASS, KEV_DL_LINK_ADDRESS_CHANGED, NULL, 0); } - - return error; + + return (error); } errno_t @@ -1158,64 +1191,68 @@ ifnet_set_lladdr_and_type( } errno_t -ifnet_add_multicast( - ifnet_t interface, - const struct sockaddr *maddr, - ifmultiaddr_t *address) +ifnet_add_multicast(ifnet_t interface, const struct sockaddr *maddr, + ifmultiaddr_t *ifmap) { - if (interface == NULL || maddr == NULL) return EINVAL; - return if_addmulti(interface, maddr, address); + if (interface == NULL || maddr == NULL) + return (EINVAL); + + /* Don't let users screw up protocols' entries. */ + if (maddr->sa_family != AF_UNSPEC && maddr->sa_family != AF_LINK) + return (EINVAL); + + return (if_addmulti_anon(interface, maddr, ifmap)); } errno_t -ifnet_remove_multicast( - ifmultiaddr_t address) +ifnet_remove_multicast(ifmultiaddr_t ifma) { - if (address == NULL) return EINVAL; - return if_delmultiaddr(address, 0); + struct sockaddr *maddr; + + if (ifma == NULL) + return (EINVAL); + + maddr = ifma->ifma_addr; + /* Don't let users screw up protocols' entries. 
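+ * Only AF_UNSPEC and AF_LINK addresses, i.e. entries added anonymously + * through ifnet_add_multicast(), may be removed via this KPI.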
*/ + if (maddr->sa_family != AF_UNSPEC && maddr->sa_family != AF_LINK) + return (EINVAL); + + return (if_delmulti_anon(ifma->ifma_ifp, maddr)); } -errno_t ifnet_get_multicast_list(ifnet_t interface, ifmultiaddr_t **addresses) +errno_t +ifnet_get_multicast_list(ifnet_t ifp, ifmultiaddr_t **addresses) { int count = 0; int cmax = 0; struct ifmultiaddr *addr; - int lock; - - if (interface == NULL || addresses == NULL) - return EINVAL; - - lock = (interface->if_lock != 0); - if (lock) ifnet_lock_shared(interface); - if ((interface->if_eflags & IFEF_DETACHING) == 0) { - LIST_FOREACH(addr, &interface->if_multiaddrs, ifma_link) - { - cmax++; - } - } - else { - if (lock) ifnet_lock_done(interface); - return ENXIO; + + if (ifp == NULL || addresses == NULL) + return (EINVAL); + + ifnet_lock_shared(ifp); + LIST_FOREACH(addr, &ifp->if_multiaddrs, ifma_link) { + cmax++; } - - MALLOC(*addresses, ifmultiaddr_t*, sizeof(ifmultiaddr_t) * (cmax + 1), M_TEMP, M_NOWAIT); + + MALLOC(*addresses, ifmultiaddr_t *, sizeof (ifmultiaddr_t) * (cmax + 1), + M_TEMP, M_NOWAIT); if (*addresses == NULL) { - if (lock) ifnet_lock_done(interface); - return ENOMEM; + ifnet_lock_done(ifp); + return (ENOMEM); } - - LIST_FOREACH(addr, &interface->if_multiaddrs, ifma_link) - { + + LIST_FOREACH(addr, &ifp->if_multiaddrs, ifma_link) { if (count + 1 > cmax) break; (*addresses)[count] = (ifmultiaddr_t)addr; ifmaddr_reference((*addresses)[count]); count++; } - (*addresses)[cmax] = 0; - if (lock) ifnet_lock_done(interface); - - return 0; + (*addresses)[cmax] = NULL; + ifnet_lock_done(ifp); + + return (0); } void @@ -1235,44 +1272,42 @@ ifnet_free_multicast_list( } errno_t -ifnet_find_by_name( - const char *ifname, - ifnet_t *interface) +ifnet_find_by_name(const char *ifname, ifnet_t *ifpp) { struct ifnet *ifp; int namelen; - - if (ifname == NULL) return EINVAL; - + + if (ifname == NULL) + return (EINVAL); + namelen = strlen(ifname); - - *interface = NULL; - + + *ifpp = NULL; + ifnet_head_lock_shared(); - TAILQ_FOREACH(ifp, &ifnet, if_link) - { - struct ifaddr *ifa = ifnet_addrs[ifp->if_index - 1]; + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + struct ifaddr *ifa; struct sockaddr_dl *ll_addr; - if (!ifa || !ifa->ifa_addr) + ifa = ifnet_addrs[ifp->if_index - 1]; + if (ifa == NULL) continue; + IFA_LOCK(ifa); ll_addr = (struct sockaddr_dl *)ifa->ifa_addr; - if ((ifp->if_eflags & IFEF_DETACHING) == 0 && - namelen == ll_addr->sdl_nlen && - (strncmp(ll_addr->sdl_data, ifname, ll_addr->sdl_nlen) == 0)) - { + if (namelen == ll_addr->sdl_nlen && + !strncmp(ll_addr->sdl_data, ifname, ll_addr->sdl_nlen)) { + IFA_UNLOCK(ifa); + *ifpp = ifp; + ifnet_reference(*ifpp); break; } - } - if (ifp) { - *interface = ifp; - ifnet_reference(*interface); + IFA_UNLOCK(ifa); } ifnet_head_done(); - - return (ifp == NULL) ? ENXIO : 0; + + return ((ifp == NULL) ? 
ENXIO : 0); } errno_t @@ -1287,54 +1322,74 @@ ifnet_list_get_all(ifnet_family_t family, ifnet_t **list, u_int32_t *count) return (ifnet_list_get_common(family, TRUE, list, count)); } +struct ifnet_list { + SLIST_ENTRY(ifnet_list) ifl_le; + struct ifnet *ifl_ifp; +}; + static errno_t ifnet_list_get_common(ifnet_family_t family, boolean_t get_all, ifnet_t **list, u_int32_t *count) { +#pragma unused(get_all) + SLIST_HEAD(, ifnet_list) ifl_head; + struct ifnet_list *ifl, *ifl_tmp; struct ifnet *ifp; - u_int32_t cmax = 0; - *count = 0; - errno_t result = 0; - - if (list == NULL || count == NULL) - return (EINVAL); - - ifnet_head_lock_shared(); - TAILQ_FOREACH(ifp, &ifnet, if_link) { - if ((ifp->if_eflags & IFEF_DETACHING) && !get_all) - continue; - if (family == IFNET_FAMILY_ANY || ifp->if_family == family) - cmax++; - } + int cnt = 0; + errno_t err = 0; - if (cmax == 0) - result = ENXIO; + SLIST_INIT(&ifl_head); - if (result == 0) { - MALLOC(*list, ifnet_t*, sizeof(ifnet_t) * (cmax + 1), - M_TEMP, M_NOWAIT); - if (*list == NULL) - result = ENOMEM; + if (list == NULL || count == NULL) { + err = EINVAL; + goto done; } + *count = 0; + *list = NULL; - if (result == 0) { - TAILQ_FOREACH(ifp, &ifnet, if_link) { - if ((ifp->if_eflags & IFEF_DETACHING) && !get_all) - continue; - if (*count + 1 > cmax) - break; - if (family == IFNET_FAMILY_ANY || - ((ifnet_family_t)ifp->if_family) == family) { - (*list)[*count] = (ifnet_t)ifp; - ifnet_reference((*list)[*count]); - (*count)++; + ifnet_head_lock_shared(); + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + if (family == IFNET_FAMILY_ANY || ifp->if_family == family) { + MALLOC(ifl, struct ifnet_list *, sizeof (*ifl), + M_TEMP, M_NOWAIT); + if (ifl == NULL) { + ifnet_head_done(); + err = ENOMEM; + goto done; } + ifl->ifl_ifp = ifp; + ifnet_reference(ifp); + SLIST_INSERT_HEAD(&ifl_head, ifl, ifl_le); + ++cnt; } - (*list)[*count] = NULL; } ifnet_head_done(); - return (result); + if (cnt == 0) { + err = ENXIO; + goto done; + } + + MALLOC(*list, ifnet_t *, sizeof (ifnet_t) * (cnt + 1), + M_TEMP, M_NOWAIT); + if (*list == NULL) { + err = ENOMEM; + goto done; + } + bzero(*list, sizeof (ifnet_t) * (cnt + 1)); + *count = cnt; + +done: + SLIST_FOREACH_SAFE(ifl, &ifl_head, ifl_le, ifl_tmp) { + SLIST_REMOVE(&ifl_head, ifl, ifnet_list, ifl_le); + if (err == 0) + (*list)[--cnt] = ifl->ifl_ifp; + else + ifnet_release(ifl->ifl_ifp); + FREE(ifl, M_TEMP); + } + + return (err); } void @@ -1345,9 +1400,8 @@ ifnet_list_free(ifnet_t *interfaces) if (interfaces == NULL) return; - for (i = 0; interfaces[i]; i++) { + for (i = 0; interfaces[i]; i++) ifnet_release(interfaces[i]); - } FREE(interfaces, M_TEMP); } @@ -1357,97 +1411,132 @@ ifnet_list_free(ifnet_t *interfaces) /****************************************************************************/ errno_t -ifaddr_reference( - ifaddr_t ifa) +ifaddr_reference(ifaddr_t ifa) { - if (ifa == NULL) return EINVAL; - ifaref(ifa); - return 0; + if (ifa == NULL) + return (EINVAL); + + IFA_ADDREF(ifa); + return (0); } errno_t -ifaddr_release( - ifaddr_t ifa) +ifaddr_release(ifaddr_t ifa) { - if (ifa == NULL) return EINVAL; - ifafree(ifa); - return 0; + if (ifa == NULL) + return (EINVAL); + + IFA_REMREF(ifa); + return (0); } sa_family_t -ifaddr_address_family( - ifaddr_t ifa) +ifaddr_address_family(ifaddr_t ifa) { - if (ifa && ifa->ifa_addr) - return ifa->ifa_addr->sa_family; - - return 0; + sa_family_t family = 0; + + if (ifa != NULL) { + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr != NULL) + family = ifa->ifa_addr->sa_family; + IFA_UNLOCK(ifa); + } 
+ return (family); } errno_t -ifaddr_address( - ifaddr_t ifa, - struct sockaddr *out_addr, - u_int32_t addr_size) +ifaddr_address(ifaddr_t ifa, struct sockaddr *out_addr, u_int32_t addr_size) { u_int32_t copylen; - - if (ifa == NULL || out_addr == NULL) return EINVAL; - if (ifa->ifa_addr == NULL) return ENOTSUP; - - copylen = (addr_size >= ifa->ifa_addr->sa_len) ? ifa->ifa_addr->sa_len : addr_size; + + if (ifa == NULL || out_addr == NULL) + return (EINVAL); + + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr == NULL) { + IFA_UNLOCK(ifa); + return (ENOTSUP); + } + + copylen = (addr_size >= ifa->ifa_addr->sa_len) ? + ifa->ifa_addr->sa_len : addr_size; bcopy(ifa->ifa_addr, out_addr, copylen); - - if (ifa->ifa_addr->sa_len > addr_size) return EMSGSIZE; - - return 0; + + if (ifa->ifa_addr->sa_len > addr_size) { + IFA_UNLOCK(ifa); + return (EMSGSIZE); + } + + IFA_UNLOCK(ifa); + return (0); } errno_t -ifaddr_dstaddress( - ifaddr_t ifa, - struct sockaddr *out_addr, - u_int32_t addr_size) +ifaddr_dstaddress(ifaddr_t ifa, struct sockaddr *out_addr, u_int32_t addr_size) { u_int32_t copylen; - if (ifa == NULL || out_addr == NULL) return EINVAL; - if (ifa->ifa_dstaddr == NULL) return ENOTSUP; - - copylen = (addr_size >= ifa->ifa_dstaddr->sa_len) ? ifa->ifa_dstaddr->sa_len : addr_size; + + if (ifa == NULL || out_addr == NULL) + return (EINVAL); + + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_dstaddr == NULL) { + IFA_UNLOCK(ifa); + return (ENOTSUP); + } + + copylen = (addr_size >= ifa->ifa_dstaddr->sa_len) ? + ifa->ifa_dstaddr->sa_len : addr_size; bcopy(ifa->ifa_dstaddr, out_addr, copylen); - if (ifa->ifa_dstaddr->sa_len > addr_size) return EMSGSIZE; - - return 0; + if (ifa->ifa_dstaddr->sa_len > addr_size) { + IFA_UNLOCK(ifa); + return (EMSGSIZE); + } + + IFA_UNLOCK(ifa); + return (0); } errno_t -ifaddr_netmask( - ifaddr_t ifa, - struct sockaddr *out_addr, - u_int32_t addr_size) +ifaddr_netmask(ifaddr_t ifa, struct sockaddr *out_addr, u_int32_t addr_size) { u_int32_t copylen; - if (ifa == NULL || out_addr == NULL) return EINVAL; - if (ifa->ifa_netmask == NULL) return ENOTSUP; - - copylen = addr_size >= ifa->ifa_netmask->sa_len ? ifa->ifa_netmask->sa_len : addr_size; + + if (ifa == NULL || out_addr == NULL) + return (EINVAL); + + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_netmask == NULL) { + IFA_UNLOCK(ifa); + return (ENOTSUP); + } + + copylen = addr_size >= ifa->ifa_netmask->sa_len ? 
+ ifa->ifa_netmask->sa_len : addr_size; bcopy(ifa->ifa_netmask, out_addr, copylen); - - if (ifa->ifa_netmask->sa_len > addr_size) return EMSGSIZE; - - return 0; + + if (ifa->ifa_netmask->sa_len > addr_size) { + IFA_UNLOCK(ifa); + return (EMSGSIZE); + } + + IFA_UNLOCK(ifa); + return (0); } ifnet_t -ifaddr_ifnet( - ifaddr_t ifa) +ifaddr_ifnet(ifaddr_t ifa) { struct ifnet *ifp; - if (ifa == NULL) return NULL; + + if (ifa == NULL) + return (NULL); + + /* ifa_ifp is set once at creation time; it is never changed */ ifp = ifa->ifa_ifp; - - return (ifnet_t)ifp; + + return (ifp); } ifaddr_t @@ -1494,60 +1583,70 @@ ifaddr_findbestforaddr( } errno_t -ifmaddr_reference( - ifmultiaddr_t ifmaddr) +ifmaddr_reference(ifmultiaddr_t ifmaddr) { - if (ifmaddr == NULL) return EINVAL; - ifma_reference(ifmaddr); - return 0; + if (ifmaddr == NULL) + return (EINVAL); + + IFMA_ADDREF(ifmaddr); + return (0); } errno_t -ifmaddr_release( - ifmultiaddr_t ifmaddr) +ifmaddr_release(ifmultiaddr_t ifmaddr) { - if (ifmaddr == NULL) return EINVAL; - ifma_release(ifmaddr); - return 0; + if (ifmaddr == NULL) + return (EINVAL); + + IFMA_REMREF(ifmaddr); + return (0); } errno_t -ifmaddr_address( - ifmultiaddr_t ifmaddr, - struct sockaddr *out_addr, - u_int32_t addr_size) +ifmaddr_address(ifmultiaddr_t ifma, struct sockaddr *out_addr, + u_int32_t addr_size) { u_int32_t copylen; - - if (ifmaddr == NULL || out_addr == NULL) return EINVAL; - if (ifmaddr->ifma_addr == NULL) return ENOTSUP; - - copylen = addr_size >= ifmaddr->ifma_addr->sa_len ? ifmaddr->ifma_addr->sa_len : addr_size; - bcopy(ifmaddr->ifma_addr, out_addr, copylen); - - if (ifmaddr->ifma_addr->sa_len > addr_size) return EMSGSIZE; - - return 0; + + if (ifma == NULL || out_addr == NULL) + return (EINVAL); + + IFMA_LOCK(ifma); + if (ifma->ifma_addr == NULL) { + IFMA_UNLOCK(ifma); + return (ENOTSUP); + } + + copylen = (addr_size >= ifma->ifma_addr->sa_len ? + ifma->ifma_addr->sa_len : addr_size); + bcopy(ifma->ifma_addr, out_addr, copylen); + + if (ifma->ifma_addr->sa_len > addr_size) { + IFMA_UNLOCK(ifma); + return (EMSGSIZE); + } + IFMA_UNLOCK(ifma); + return (0); } errno_t -ifmaddr_lladdress( - ifmultiaddr_t ifmaddr, - struct sockaddr *out_addr, - u_int32_t addr_size) +ifmaddr_lladdress(ifmultiaddr_t ifma, struct sockaddr *out_addr, + u_int32_t addr_size) { - if (ifmaddr == NULL || out_addr == NULL) return EINVAL; - if (ifmaddr->ifma_ll == NULL) return ENOTSUP; - - return ifmaddr_address(ifmaddr->ifma_ll, out_addr, addr_size); + struct ifmultiaddr *ifma_ll; + + if (ifma == NULL || out_addr == NULL) + return (EINVAL); + if ((ifma_ll = ifma->ifma_ll) == NULL) + return (ENOTSUP); + + return (ifmaddr_address(ifma_ll, out_addr, addr_size)); } ifnet_t -ifmaddr_ifnet( - ifmultiaddr_t ifmaddr) +ifmaddr_ifnet(ifmultiaddr_t ifma) { - if (ifmaddr == NULL || ifmaddr->ifma_ifp == NULL) return NULL; - return ifmaddr->ifma_ifp; + return (ifma == NULL ? NULL : ifma->ifma_ifp); } /******************************************************************************/ diff --git a/bsd/net/kpi_interface.h b/bsd/net/kpi_interface.h index e22eba1c7..e2fd084b6 100644 --- a/bsd/net/kpi_interface.h +++ b/bsd/net/kpi_interface.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2010 Apple Inc. All rights reserved. + * Copyright (c) 2004-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -66,6 +66,7 @@ struct ifnet_demux_desc; @constant IFNET_FAMILY_STF A 6to4 interface. @constant IFNET_FAMILY_FIREWIRE An IEEE 1394 (firewire) interface. 
@constant IFNET_FAMILY_BOND A virtual bonded interface. + @constant IFNET_FAMILY_CELLULAR A cellular interface. */ enum { @@ -83,7 +84,8 @@ enum { IFNET_FAMILY_FAITH = 11, IFNET_FAMILY_STF = 12, IFNET_FAMILY_FIREWIRE = 13, - IFNET_FAMILY_BOND = 14 + IFNET_FAMILY_BOND = 14, + IFNET_FAMILY_CELLULAR = 15 }; /*! @typedef ifnet_family_t @@ -129,6 +131,9 @@ typedef u_int32_t protocol_family_t; @constant IFNET_CSUM_UDP Hardware will calculate UDP checksums. @constant IFNET_CSUM_FRAGMENT Hardware will checksum IP fragments. @constant IFNET_IP_FRAGMENT Hardware will fragment IP packets. + @constant IFNET_CSUM_TCPIPV6 Hardware will calculate TCP IPv6 checksums. + @constant IFNET_CSUM_UDPIPV6 Hardware will calculate UDP IPv6 checksums. + @constant IFNET_IPV6_FRAGMENT Hardware will fragment IPv6 packets. @constant IFNET_VLAN_TAGGING Hardware will generate VLAN headers. @constant IFNET_VLAN_MTU Hardware supports VLAN MTU. @constant IFNET_MULTIPAGES Driver is capable of handling packets @@ -147,8 +152,15 @@ typedef u_int32_t protocol_family_t; If the Interface driver sets this flag, TCP will send larger frames (up to 64KB) as one frame to the adapter which will perform the final packetization. The maximum TSO segment supported by the interface can be set with "ifnet_set_tso_mtu". To retrieve the real MTU - for the TCP connection the function "mbuf_get_tso_requested" is used by the driver. + for the TCP connection the function "mbuf_get_tso_requested" is used by the driver. Note + that if TSO is active, all the packets will be flagged for TSO, not just large packets. @constant IFNET_TSO_IPV6 Hardware supports IPv6 TCP Segment Offloading. + If the Interface driver sets this flag, TCP IPv6 will send larger frames (up to 64KB) as one + frame to the adapter which will perform the final packetization. The maximum TSO segment + supported by the interface can be set with "ifnet_set_tso_mtu". To retrieve the real MTU + for the TCP IPv6 connection the function "mbuf_get_tso_requested" is used by the driver. + Note that if TSO is active, all the packets will be flagged for TSO, not just large packets. + */ enum { @@ -157,6 +169,9 @@ enum { IFNET_CSUM_UDP = 0x00000004, IFNET_CSUM_FRAGMENT = 0x00000008, IFNET_IP_FRAGMENT = 0x00000010, + IFNET_CSUM_TCPIPV6 = 0x00000020, + IFNET_CSUM_UDPIPV6 = 0x00000040, + IFNET_IPV6_FRAGMENT = 0x00000080, #ifdef KERNEL_PRIVATE IFNET_CSUM_SUM16 = 0x00001000, #endif /* KERNEL_PRIVATE */ @@ -839,7 +854,7 @@ extern const char *ifnet_name(ifnet_t interface); @function ifnet_family @discussion Returns the family of the interface. @param interface Interface to retrieve the family from. - @result Unit number. + @result Interface family type. */ extern ifnet_family_t ifnet_family(ifnet_t interface); @@ -942,16 +957,92 @@ extern u_int32_t ifnet_idle_flags(ifnet_t interface); #endif /* KERNEL_PRIVATE */ +/*! + @function ifnet_set_capabilities_supported + @discussion Specify the capabilities supported by the interface. + @discussion This function lets you specify which capabilities are supported + by the interface. Typically this function is called by the driver when + the interface gets attached to the system. + The mask controls which capabilities to set or unset. + The kernel will effectively take the lock, then set the + interface's flags to (if_capabilities & ~mask) | (new_caps & mask). + + This function is intended to be called by the driver. A kext + must not call this function on an interface the kext does not + own. + @param interface Interface to set the capabilities on.
+ @param new_caps The value of the capabilities that should be set or unset. These + flags are defined in net/if.h + @param mask Which capabilities should be affected. These + flags are defined in net/if.h + @result 0 on success, otherwise an errno error. + */ +extern errno_t ifnet_set_capabilities_supported(ifnet_t interface, u_int32_t new_caps, + u_int32_t mask); + +/*! + @function ifnet_capabilities_supported + @discussion Retrieve the interface capabilities supported by the interface. + @param interface Interface to retrieve the capabilities from. + @result Flags. Capabilities flags are defined in net/if.h + */ +extern u_int32_t ifnet_capabilities_supported(ifnet_t interface); + +/*! + @function ifnet_set_capabilities_enabled + @discussion Enable and/or disable the interface capabilities to match new_caps. + @discussion Sets the interface capabilities to new_caps. This function + lets you specify which capabilities you want to change using the mask. + The kernel will effectively take the lock, then set the + interface's flags to (if_capenable & ~mask) | (new_caps & mask). + + This function is intended to be called by the driver. A kext + must not call this function on an interface the kext does not + own. + + Typically this function is called by the driver when the interface is + created to specify which of the supported capabilities are enabled by + default. This function is also meant to be called when the driver handles + the interface ioctl SIOCSIFCAP. + + The driver should call ifnet_set_offload() to indicate the corresponding + hardware offload bits that will be used by the networking stack. + + It is an error to enable a capability that is not marked as + supported by the interface. + @param interface Interface to set the capabilities on. + @param new_caps The value of the capabilities that should be set or unset. These + flags are defined in net/if.h + @param mask Which capabilities should be affected. These + flags are defined in net/if.h + @result 0 on success, otherwise an errno error. + */ +extern errno_t ifnet_set_capabilities_enabled(ifnet_t interface, u_int32_t new_caps, + u_int32_t mask); + +/*! + @function ifnet_capabilities_enabled + @discussion Retrieve the interface capabilities enabled on the interface. + @param interface Interface to retrieve the capabilities from. + @result Flags. Capabilities flags are defined in net/if.h + */ +extern u_int32_t ifnet_capabilities_enabled(ifnet_t interface); + + /*! @function ifnet_set_offload @discussion Sets a bitfield to indicate special hardware offload support provided by the interface such as hardware checksums and VLAN. This replaces the if_hwassist flags field. Any flags unrecognized by the stack will not be set. + + Note the system will automatically set the interface capabilities + that correspond to the offload flags modified -- i.e. the driver + does not have to call ifnet_set_capabilities_enabled() and + ifnet_set_capabilities_supported(). @param interface The interface. @param offload The new set of flags indicating which offload options the device supports. - @param mask The mask of flags to be modified. @result 0 on success, otherwise an errno error.
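+ + Illustrative usage (a sketch; the flag combination is hypothetical): a + driver with IPv4 checksum offload and TSO support might call + ifnet_set_offload(ifp, IFNET_CSUM_IP | IFNET_CSUM_TCP | IFNET_CSUM_UDP | IFNET_TSO_IPV4); + after which the stack marks IFCAP_HWCSUM and IFCAP_TSO4 as supported + and enabled automatically.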
*/ extern errno_t ifnet_set_offload(ifnet_t interface, ifnet_offload_t offload); @@ -1446,6 +1537,11 @@ extern errno_t ifnet_get_address_list(ifnet_t interface, ifaddr_t **addresses); extern errno_t ifnet_get_address_list_family(ifnet_t interface, ifaddr_t **addresses, sa_family_t family); +#ifdef KERNEL_PRIVATE +__private_extern__ errno_t ifnet_get_address_list_family_internal(ifnet_t, + ifaddr_t **, sa_family_t, int, int); +#endif /* KERNEL_PRIVATE */ + /*! @function ifnet_free_address_list @discussion Free a list of addresses returned from @@ -1543,9 +1639,9 @@ extern errno_t ifnet_resolve_multicast(ifnet_t ifp, ifnet_remove_multicast and making sure you no longer have any references to the multicast. @param interface The interface. - @param maddr The multicast address to join. Either a physical - address or logical address to be translated to a physical - address. + @param maddr The multicast address (AF_UNSPEC/AF_LINK) to join. Either + a physical address or logical address to be translated to a + physical address. @param multicast The resulting ifmultiaddr_t multicast address. @result 0 on success otherwise the errno error. */ diff --git a/bsd/net/kpi_protocol.c b/bsd/net/kpi_protocol.c index a48cd249a..6c3043c94 100644 --- a/bsd/net/kpi_protocol.c +++ b/bsd/net/kpi_protocol.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2004-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -154,14 +154,14 @@ proto_register_input( } - lck_mtx_lock(thread->input_lck); + lck_mtx_lock(&thread->input_lck); entry->next = proto_input_add_list; proto_input_add_list = entry; thread->input_waiting |= DLIL_PROTO_REGISTER; if ((thread->input_waiting & DLIL_INPUT_RUNNING) == 0) wakeup((caddr_t)&thread->input_waiting); - lck_mtx_unlock(thread->input_lck); + lck_mtx_unlock(&thread->input_lck); return 0; } @@ -219,14 +219,14 @@ proto_input_run(void) mbuf_t packet_list; int i, locked = 0; - lck_mtx_assert(thread->input_lck, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_assert(&thread->input_lck, LCK_MTX_ASSERT_NOTOWNED); if ((thread->input_waiting & DLIL_PROTO_REGISTER) != 0) { - lck_mtx_lock(thread->input_lck); + lck_mtx_lock_spin(&thread->input_lck); entry = proto_input_add_list; proto_input_add_list = NULL; thread->input_waiting &= ~DLIL_PROTO_REGISTER; - lck_mtx_unlock(thread->input_lck); + lck_mtx_unlock(&thread->input_lck); proto_delayed_attach(entry); } /* @@ -237,7 +237,7 @@ proto_input_run(void) for (entry = proto_hash[i]; entry && proto_total_waiting; entry = entry->next) { if (entry->inject_first) { - lck_mtx_lock(thread->input_lck); + lck_mtx_lock_spin(&thread->input_lck); thread->input_waiting &= ~DLIL_PROTO_WAITING; packet_list = entry->inject_first; @@ -246,7 +246,7 @@ proto_input_run(void) entry->inject_last = NULL; proto_total_waiting--; - lck_mtx_unlock(thread->input_lck); + lck_mtx_unlock(&thread->input_lck); if (entry->domain && (entry->domain->dom_flags & DOM_REENTRANT) == 0) { lck_mtx_lock(entry->domain->dom_mtx); @@ -333,7 +333,7 @@ proto_inject( } if (entry) { - lck_mtx_lock(thread->input_lck); + lck_mtx_lock(&thread->input_lck); if (entry->inject_first == NULL) { proto_total_waiting++; thread->input_waiting |= DLIL_PROTO_WAITING; @@ -346,7 +346,7 @@ proto_inject( if ((thread->input_waiting & DLIL_INPUT_RUNNING) == 0) { wakeup((caddr_t)&thread->input_waiting); } - lck_mtx_unlock(thread->input_lck); + lck_mtx_unlock(&thread->input_lck); } else { diff --git a/bsd/net/multicast_list.c b/bsd/net/multicast_list.c 
index 68fbf23b0..e91aeeb11 100644 --- a/bsd/net/multicast_list.c +++ b/bsd/net/multicast_list.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2004-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -151,5 +151,6 @@ multicast_list_program(struct multicast_list * mc_list, (void)multicast_list_remove(mc_list); *mc_list = new_mc_list; } + ifnet_free_multicast_list(source_multicast_list); return (error); } diff --git a/bsd/net/ndrv.c b/bsd/net/ndrv.c index 1797d16f4..51c218910 100644 --- a/bsd/net/ndrv.c +++ b/bsd/net/ndrv.c @@ -100,6 +100,8 @@ extern struct domain ndrvdomain; extern struct protosw ndrvsw; extern lck_mtx_t *domain_proto_mtx; +#define NDRV_PROTODEMUX_COUNT 10 + /* * Verify these values match. * To keep clients from including dlil.h, we define @@ -703,6 +705,8 @@ ndrv_setspec(struct ndrv_cb *np, struct sockopt *sopt) return ENOTSUP; // version is too new! else if (ndrvSpec.version < 1) return EINVAL; // version is not valid + else if (ndrvSpec.demux_count > NDRV_PROTODEMUX_COUNT || ndrvSpec.demux_count == 0) + return EINVAL; // demux_count is not valid bzero(&proto_param, sizeof(proto_param)); proto_param.demux_count = ndrvSpec.demux_count; diff --git a/bsd/net/ndrv.h b/bsd/net/ndrv.h index 6f61df9f5..7e9fc9700 100644 --- a/bsd/net/ndrv.h +++ b/bsd/net/ndrv.h @@ -109,7 +109,7 @@ struct ndrv_demux_desc * Field: * version : must be NDRV_PROTOCOL_DESC_VERS * protocol_family : unique identifier for this protocol - * demux_count : number of demux_list descriptors in demux_list + * demux_count : number of demux_list descriptors in demux_list; maximum of 10 * demux_list : pointer to array of demux descriptors */ struct ndrv_protocol_desc diff --git a/bsd/net/net_osdep.h b/bsd/net/net_osdep.h index 4208380cf..a17921f57 100644 --- a/bsd/net/net_osdep.h +++ b/bsd/net/net_osdep.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -244,9 +244,10 @@ * NetBSD 1.5: always use IFAREF whenever reference gets added. * always use IFAFREE whenever reference gets freed. * IFAFREE frees ifaddr when ifa_refcnt reaches 0. - * Darwin: always use ifaref whenever reference gets added. - * always use ifafree whenever reference gets freed. - * ifaref and ifafree are responsible for determining when to free. + * Darwin: always use IFA_ADDREF whenever reference gets added. + * always use IFA_REMREF whenever reference gets freed. + * IFA_ADDREF and IFA_REMREF are responsible for determining + * when to free. * others: do not increase refcnt for ifp->if_addrlist and in_ifaddr. * use IFAFREE once when ifaddr is disconnected from * ifp->if_addrlist and in_ifaddr. IFAFREE frees ifaddr when @@ -267,11 +268,6 @@ extern const char *if_name(struct ifnet *); #define if_addrlist if_addrhead #define if_list if_link -/* sys/net/if.h */ -#ifndef __APPLE__ -#define IFAREF(ifa) do { ++(ifa)->ifa_refcnt; } while (0) -#endif - #define WITH_CONVERT_AND_STRIP_IP_LEN #if 1 /* at this moment, all OSes do this */ diff --git a/bsd/net/net_str_id.c b/bsd/net/net_str_id.c index 7f4fcd52f..bc28f03c4 100644 --- a/bsd/net/net_str_id.c +++ b/bsd/net/net_str_id.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008,2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -52,8 +52,6 @@ static lck_mtx_t *net_str_id_lock = NULL; static u_int32_t nsi_kind_next[NSI_MAX_KIND] = { FIRST_NET_STR_ID, FIRST_NET_STR_ID, FIRST_NET_STR_ID }; static u_int32_t nsi_next_id = FIRST_NET_STR_ID; -#if NETMIBS - extern int sysctl_if_family_ids SYSCTL_HANDLER_ARGS; SYSCTL_DECL(_net_link_generic_system); @@ -61,9 +59,6 @@ SYSCTL_DECL(_net_link_generic_system); SYSCTL_PROC(_net_link_generic_system, OID_AUTO, if_family_ids, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, sysctl_if_family_ids, "S, if_family_id", "Interface Family ID table"); -#endif /* NETMIBS */ - - __private_extern__ void net_str_id_init(void) { @@ -153,8 +148,6 @@ net_str_id_find_internal(const char *string, u_int32_t *out_id, } -#if NETMIBS - #define ROUNDUP32(a) \ ((a) > 0 ? (1 + (((a) - 1) | (sizeof(uint32_t) - 1))) : sizeof(uint32_t)) @@ -210,6 +203,3 @@ done: _FREE(iffmid, M_TEMP); return error; } - -#endif /* NETMIBS */ - diff --git a/bsd/net/netsrc.c b/bsd/net/netsrc.c new file mode 100644 index 000000000..2c1037c26 --- /dev/null +++ b/bsd/net/netsrc.c @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/kpi_mbuf.h> +#include <sys/socket.h> +#include <sys/kern_control.h> +#include <sys/mcache.h> +#include <sys/socketvar.h> + +#include <kern/debug.h> + +#include <libkern/libkern.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/in_var.h> +#include <netinet/ip6.h> +#include <netinet6/ip6_var.h> + +#include <net/netsrc.h> + +static errno_t netsrc_ctlsend(kern_ctl_ref, uint32_t, void *, mbuf_t, int); +static errno_t netsrc_ctlconnect(kern_ctl_ref, struct sockaddr_ctl *, void **); +static errno_t netsrc_ipv4(kern_ctl_ref, uint32_t, struct netsrc_req *); +static errno_t netsrc_ipv6(kern_ctl_ref, uint32_t, struct netsrc_req *); + +static kern_ctl_ref netsrc_ctlref = NULL; + +__private_extern__ void +netsrc_init(void) +{ + errno_t error; + struct kern_ctl_reg netsrc_ctl = { + .ctl_connect = netsrc_ctlconnect, + .ctl_send = netsrc_ctlsend, + }; + + strlcpy(netsrc_ctl.ctl_name, NETSRC_CTLNAME, sizeof(NETSRC_CTLNAME)); + + if ((error = ctl_register(&netsrc_ctl, &netsrc_ctlref))) + printf("%s: ctl_register failed %d\n", __func__, error); +} + +static errno_t +netsrc_ctlconnect(kern_ctl_ref kctl, struct sockaddr_ctl *sac, void **uinfo) +{ +#pragma unused(kctl, sac, uinfo) + + /* + * We don't need to do anything here. This callback is only necessary + * for ctl_register() to succeed. + */ + return (0); +} + +static errno_t +netsrc_ctlsend(kern_ctl_ref kctl, uint32_t unit, void *uinfo, mbuf_t m, + int flags) +{ +#pragma unused(uinfo, flags) + errno_t error; + struct netsrc_req *nrq, storage; + + if (mbuf_pkthdr_len(m) < sizeof(*nrq)) { + error = EINVAL; + goto out; + } + if (mbuf_len(m) >= sizeof(*nrq)) + nrq = mbuf_data(m); + else { + mbuf_copydata(m, 0, sizeof(storage), &storage); + nrq = &storage; + } + /* We only have one version right now. 
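+ * A user-space client (illustrative) reaches this handler by opening a + * PF_SYSTEM/SYSPROTO_CONTROL socket, resolving NETSRC_CTLNAME to a control + * id with the CTLIOCGINFO ioctl, connecting, and sending a struct + * netsrc_req with nrq_ver set to NETSRC_VERSION1.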
*/ + if (nrq->nrq_ver != NETSRC_VERSION1) { + error = EINVAL; + goto out; + } + switch (nrq->nrq_sin.sin_family) { + case AF_INET: + error = netsrc_ipv4(kctl, unit, nrq); + break; + case AF_INET6: + error = netsrc_ipv6(kctl, unit, nrq); + break; + default: + printf("%s: invalid family\n", __func__); + error = EINVAL; + } +out: + mbuf_freem(m); + + return (error); + +} + +static errno_t +netsrc_ipv4(kern_ctl_ref kctl, uint32_t unit, struct netsrc_req *nrq) +{ + errno_t error = EHOSTUNREACH; + struct sockaddr_in *dstsin; + struct rtentry *rt; + struct in_ifaddr *ia; + struct netsrc_rep nrp; + struct sockaddr_in6 v4entry = { + .sin6_family = AF_INET6, + .sin6_len = sizeof(struct sockaddr_in6), + .sin6_addr = IN6ADDR_V4MAPPED_INIT, + }; + struct in6_addrpolicy *policy; + + dstsin = &nrq->nrq_sin; + + if (dstsin->sin_len < sizeof (*dstsin) || + dstsin->sin_addr.s_addr == INADDR_ANY) + return (EINVAL); + + lck_mtx_lock(rnh_lock); + rt = rt_lookup(TRUE, (struct sockaddr *)dstsin, NULL, + rt_tables[AF_INET], nrq->nrq_ifscope); + lck_mtx_unlock(rnh_lock); + if (!rt) + return (EHOSTUNREACH); + lck_rw_lock_shared(in_ifaddr_rwlock); + TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { + IFA_LOCK_SPIN(&ia->ia_ifa); + if (ia->ia_ifp == rt->rt_ifp) { + memset(&nrp, 0, sizeof(nrp)); + memcpy(&nrp.nrp_sin, IA_SIN(ia), sizeof(nrp.nrp_sin)); + IFA_UNLOCK(&ia->ia_ifa); + v4entry.sin6_addr.s6_addr32[3] = + nrp.nrp_sin.sin_addr.s_addr; + policy = in6_addrsel_lookup_policy(&v4entry); + if (policy->label != -1) { + nrp.nrp_label = policy->label; + nrp.nrp_precedence = policy->preced; + /* XXX might not be true */ + nrp.nrp_dstlabel = policy->label; + nrp.nrp_dstprecedence = policy->preced; + } + error = ctl_enqueuedata(kctl, unit, &nrp, + sizeof(nrp), CTL_DATA_EOR); + break; + } + IFA_UNLOCK(&ia->ia_ifa); + } + lck_rw_done(in_ifaddr_rwlock); + if (rt) + rtfree(rt); + + return (error); +} + +static errno_t +netsrc_ipv6(kern_ctl_ref kctl, uint32_t unit, struct netsrc_req *nrq) +{ + struct sockaddr_in6 *dstsin6; + struct in6_addr *in6, storage; + struct in6_ifaddr *ia; + struct route_in6 ro; + int error = EHOSTUNREACH; + struct netsrc_rep nrp; + + dstsin6 = &nrq->nrq_sin6; + + if (dstsin6->sin6_len < sizeof (*dstsin6) || + IN6_IS_ADDR_UNSPECIFIED(&dstsin6->sin6_addr)) + return (EINVAL); + + memset(&ro, 0, sizeof(ro)); + lck_mtx_lock(rnh_lock); + ro.ro_rt = rt_lookup(TRUE, (struct sockaddr *)dstsin6, NULL, + rt_tables[AF_INET6], nrq->nrq_ifscope); + lck_mtx_unlock(rnh_lock); + if (!ro.ro_rt) + return (EHOSTUNREACH); + in6 = in6_selectsrc(dstsin6, NULL, NULL, &ro, NULL, &storage, + nrq->nrq_ifscope, &error); + if (ro.ro_rt) + rtfree(ro.ro_rt); + if (!in6 || error) + return (error); + memset(&nrp, 0, sizeof(nrp)); + nrp.nrp_sin6.sin6_family = AF_INET6; + nrp.nrp_sin6.sin6_len = sizeof(nrp.nrp_sin6); + memcpy(&nrp.nrp_sin6.sin6_addr, in6, sizeof(nrp.nrp_sin6.sin6_addr)); + lck_rw_lock_shared(&in6_ifaddr_rwlock); + for (ia = in6_ifaddrs; ia; ia = ia->ia_next) { + if (memcmp(&ia->ia_addr.sin6_addr, in6, sizeof(*in6)) == 0) { + struct sockaddr_in6 sin6; + struct in6_addrpolicy *policy; + + if (ia->ia6_flags & IN6_IFF_TEMPORARY) + nrp.nrp_flags |= NETSRC_IP6_FLAG_TEMPORARY; + if (ia->ia6_flags & IN6_IFF_TENTATIVE) + nrp.nrp_flags |= NETSRC_IP6_FLAG_TENTATIVE; + if (ia->ia6_flags & IN6_IFF_DEPRECATED) + nrp.nrp_flags |= NETSRC_IP6_FLAG_DEPRECATED; + sin6.sin6_family = AF_INET6; + sin6.sin6_len = sizeof(sin6); + memcpy(&sin6.sin6_addr, in6, sizeof(*in6)); + policy = in6_addrsel_lookup_policy(&sin6); + if (policy->label != -1) { + 
nrp.nrp_label = policy->label;
+				nrp.nrp_precedence = policy->preced;
+			}
+			memcpy(&sin6.sin6_addr, &dstsin6->sin6_addr,
+			    sizeof(dstsin6->sin6_addr));
+			policy = in6_addrsel_lookup_policy(&sin6);
+			if (policy->label != -1) {
+				nrp.nrp_dstlabel = policy->label;
+				nrp.nrp_dstprecedence = policy->preced;
+			}
+			break;
+		}
+	}
+	lck_rw_done(&in6_ifaddr_rwlock);
+	error = ctl_enqueuedata(kctl, unit, &nrp, sizeof(nrp),
+	    CTL_DATA_EOR);
+
+	return (error);
+}
diff --git a/EXTERNAL_HEADERS/architecture/ppc/cframe.h b/bsd/net/netsrc.h
similarity index 58%
rename from EXTERNAL_HEADERS/architecture/ppc/cframe.h
rename to bsd/net/netsrc.h
index 0db3fce7d..54ba8d8be 100644
--- a/EXTERNAL_HEADERS/architecture/ppc/cframe.h
+++ b/bsd/net/netsrc.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2011 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -25,26 +25,46 @@
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
-/* Copyright (c) 1991 NeXT Software, Inc. All rights reserved.
- *
- * File: architecture/ppc/cframe.h
- * Author: Mike DeMoney, NeXT Software, Inc.
- *
- * This include file defines C calling sequence defines
- * for ppc port.
- */
-#ifndef _ARCH_PPC_CFRAME_H_
-#define _ARCH_PPC_CFRAME_H_
-
-#if defined (__ppc64__)
-#define C_ARGSAVE_LEN	64	/* at least 64 bytes of arg save */
-#define C_STACK_ALIGN	32	/* stack must be 32 byte aligned */
-#define C_RED_ZONE	320	/* 320 bytes to skip over saved registers */
-#else
-#define C_ARGSAVE_LEN	32	/* at least 32 bytes of arg save */
-#define C_STACK_ALIGN	16	/* stack must be 16 byte aligned */
-#define C_RED_ZONE	224	/* 224 bytes to skip over saved registers */
+#ifndef __NET_NETSRC_H__
+#define __NET_NETSRC_H__
+
+#define NETSRC_CTLNAME	"com.apple.netsrc"
+
+#define NETSRC_VERSION1	1
+#define NETSRC_CURVERS	NETSRC_VERSION1
+
+struct netsrc_req {
+	unsigned int nrq_ver;
+	unsigned int nrq_ifscope;
+	union {
+		struct sockaddr_in  _usin;
+		struct sockaddr_in6 _usin6;
+	} _usa;
+};
+
+#define nrq_sin		_usa._usin
+#define nrq_sin6	_usa._usin6
+
+struct netsrc_rep {
+	union {
+		struct sockaddr_in  _usin;
+		struct sockaddr_in6 _usin6;
+	} _usa;
+#define NETSRC_IP6_FLAG_TENTATIVE	0x0001
+#define NETSRC_IP6_FLAG_TEMPORARY	0x0002
+#define NETSRC_IP6_FLAG_DEPRECATED	0x0004
+	uint16_t nrp_flags;
+	uint16_t nrp_label;
+	uint16_t nrp_precedence;
+	uint16_t nrp_dstlabel;
+	uint16_t nrp_dstprecedence;
+};
+
+#define nrp_sin		_usa._usin
+#define nrp_sin6	_usa._usin6
+
+#ifdef KERNEL_PRIVATE
+__private_extern__ void netsrc_init(void);
 #endif
-#endif /* _ARCH_PPC_CFRAME_H_ */
+#endif /* __NET_NETSRC_H__ */
diff --git a/bsd/net/ntstat.c b/bsd/net/ntstat.c
new file mode 100644
index 000000000..4bb6e1c28
--- /dev/null
+++ b/bsd/net/ntstat.c
@@ -0,0 +1,1954 @@
+/*
+ * Copyright (c) 2010-2011 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/kpi_mbuf.h> +#include <sys/socket.h> +#include <sys/kern_control.h> +#include <sys/mcache.h> +#include <sys/socketvar.h> +#include <sys/sysctl.h> + +#include <kern/clock.h> +#include <kern/debug.h> + +#include <libkern/libkern.h> +#include <libkern/OSMalloc.h> +#include <libkern/OSAtomic.h> +#include <libkern/locks.h> + +#include <net/if.h> +#include <net/route.h> +#include <net/ntstat.h> + +#include <netinet/ip_var.h> +#include <netinet/in_pcb.h> +#include <netinet/in_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_fsm.h> +#include <netinet/udp.h> +#include <netinet/udp_var.h> +#include <netinet6/in6_pcb.h> +#include <netinet6/in6_var.h> + +__private_extern__ int nstat_collect = 1; +SYSCTL_INT(_net, OID_AUTO, statistics, CTLFLAG_RW | CTLFLAG_LOCKED, + &nstat_collect, 0, "Collect detailed statistics"); + +typedef struct nstat_control_state +{ + struct nstat_control_state *next; + u_int32_t watching; + decl_lck_mtx_data(, mtx); + kern_ctl_ref kctl; + u_int32_t unit; + nstat_src_ref_t next_srcref; + struct nstat_src *srcs; + int cleanup; + int suser; +} nstat_control_state; + +static void nstat_control_register(void); + +static volatile OSMallocTag nstat_malloc_tag = NULL; +static nstat_control_state *nstat_controls = NULL; +static uint64_t nstat_idle_time = 0ULL; +static decl_lck_mtx_data(, nstat_mtx); + +static void +nstat_copy_sa_out( + const struct sockaddr *src, + struct sockaddr *dst, + int maxlen) +{ + if (src->sa_len > maxlen) return; + + bcopy(src, dst, src->sa_len); + if (src->sa_family == AF_INET6 && + src->sa_len >= sizeof(struct sockaddr_in6)) + { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6*)dst; + if (IN6_IS_SCOPE_EMBED(&sin6->sin6_addr)) + { + if (sin6->sin6_scope_id == 0) + sin6->sin6_scope_id = ntohs(sin6->sin6_addr.__u6_addr.__u6_addr16[1]); + sin6->sin6_addr.__u6_addr.__u6_addr16[1] = 0; + } + } +} + +static void +nstat_ip_to_sockaddr( + const struct in_addr *ip, + u_int16_t port, + struct sockaddr_in *sin, + u_int32_t maxlen) +{ + if (maxlen < sizeof(struct sockaddr_in)) + return; + + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_port = port; + sin->sin_addr = *ip; +} + +static void +nstat_ip6_to_sockaddr( + const struct in6_addr *ip6, + u_int16_t port, + struct sockaddr_in6 *sin6, + u_int32_t maxlen) +{ + if (maxlen < sizeof(struct sockaddr_in6)) + return; + + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(*sin6); + sin6->sin6_port = port; + sin6->sin6_addr = *ip6; + if (IN6_IS_SCOPE_EMBED(&sin6->sin6_addr)) + { + sin6->sin6_scope_id = ntohs(sin6->sin6_addr.__u6_addr.__u6_addr16[1]); + sin6->sin6_addr.__u6_addr.__u6_addr16[1] = 0; + } +} + +#pragma mark -- Network Statistic Providers -- + +typedef struct nstat_provider +{ + struct nstat_provider *next; + nstat_provider_id_t 
nstat_provider_id;
+	size_t					nstat_descriptor_length;
+	errno_t					(*nstat_lookup)(const void *data, u_int32_t length, nstat_provider_cookie_t *out_cookie);
+	int						(*nstat_gone)(nstat_provider_cookie_t cookie);
+	errno_t					(*nstat_counts)(nstat_provider_cookie_t cookie, struct nstat_counts *out_counts, int *out_gone);
+	errno_t					(*nstat_watcher_add)(nstat_control_state *state);
+	void					(*nstat_watcher_remove)(nstat_control_state *state);
+	errno_t					(*nstat_copy_descriptor)(nstat_provider_cookie_t cookie, void *data, u_int32_t len);
+	void					(*nstat_release)(nstat_provider_cookie_t cookie);
+} nstat_provider;
+
+static errno_t nstat_control_source_add(u_int64_t context, nstat_control_state *state, nstat_provider *provider, nstat_provider_cookie_t cookie);
+struct nstat_provider	*nstat_providers = NULL;
+
+static struct nstat_provider*
+nstat_find_provider_by_id(
+	nstat_provider_id_t	id)
+{
+	struct nstat_provider	*provider;
+
+	for (provider = nstat_providers; provider != NULL; provider = provider->next)
+	{
+		if (provider->nstat_provider_id == id)
+			break;
+	}
+
+	return provider;
+}
+
+static errno_t
+nstat_lookup_entry(
+	nstat_provider_id_t		id,
+	const void				*data,
+	u_int32_t				length,
+	nstat_provider			**out_provider,
+	nstat_provider_cookie_t	*out_cookie)
+{
+	*out_provider = nstat_find_provider_by_id(id);
+	if (*out_provider == NULL)
+	{
+		printf("%s:%d: provider %u not found\n", __FUNCTION__, __LINE__, id);
+		return ENOENT;
+	}
+
+	return (*out_provider)->nstat_lookup(data, length, out_cookie);
+}
+
+static void nstat_init_route_provider(void);
+static void nstat_init_tcp_provider(void);
+static void nstat_init_udp_provider(void);
+
+static void
+nstat_init(void)
+{
+	if (nstat_malloc_tag != NULL) return;
+
+	OSMallocTag tag = OSMalloc_Tagalloc(NET_STAT_CONTROL_NAME, OSMT_DEFAULT);
+	if (!OSCompareAndSwapPtr(NULL, tag, &nstat_malloc_tag))
+	{
+		OSMalloc_Tagfree(tag);
+		tag = nstat_malloc_tag;
+	}
+	else
+	{
+		// We won the race, so do the rest of the one-time initialization
+		// here; this path is only ever hit once.
+		nstat_init_route_provider();
+		nstat_init_tcp_provider();
+		nstat_init_udp_provider();
+		nstat_control_register();
+	}
+}
+
+#pragma mark -- Aligned Buffer Allocation --
+
+struct align_header
+{
+	u_int32_t	offset;
+	u_int32_t	length;
+};
+
+static void*
+nstat_malloc_aligned(
+	u_int32_t	length,
+	u_int8_t	alignment,
+	OSMallocTag	tag)
+{
+	struct align_header	*hdr = NULL;
+	u_int32_t	size = length + sizeof(*hdr) + alignment - 1;
+
+	u_int8_t	*buffer = OSMalloc(size, tag);
+	if (buffer == NULL) return NULL;
+
+	u_int8_t	*aligned = buffer + sizeof(*hdr);
+	aligned = (u_int8_t*)P2ROUNDUP(aligned, alignment);
+
+	hdr = (struct align_header*)(aligned - sizeof(*hdr));
+	hdr->offset = aligned - buffer;
+	hdr->length = size;
+
+	return aligned;
+}
+
+static void
+nstat_free_aligned(
+	void		*buffer,
+	OSMallocTag	tag)
+{
+	struct align_header	*hdr = (struct align_header*)((u_int8_t*)buffer - sizeof(*hdr));
+	OSFree(((char*)buffer) - hdr->offset, hdr->length, tag);
+}
+
+#pragma mark -- Route Provider --
+
+static nstat_provider	nstat_route_provider;
+
+static errno_t
+nstat_route_lookup(
+	const void				*data,
+	u_int32_t				length,
+	nstat_provider_cookie_t	*out_cookie)
+{
+	// rt_lookup doesn't take const params, but it doesn't modify the
+	// parameters for the lookup either, so we use a union to eliminate
+	// the warning.
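+	// (The union gives rt_lookup the non-const view of the very pointer
+	//  the caller handed in as const, without a bare cast that would
+	//  trip const-qualifier warnings.)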
+	union
+	{
+		struct sockaddr			*sa;
+		const struct sockaddr	*const_sa;
+	} dst, mask;
+
+	const nstat_route_add_param	*param = (const nstat_route_add_param*)data;
+	*out_cookie = NULL;
+
+	if (length < sizeof(*param))
+	{
+		printf("%s:%d: expected %lu byte param, received %u\n", __FUNCTION__, __LINE__, sizeof(*param), length);
+		return EINVAL;
+	}
+
+	if (param->dst.v4.sin_family == 0 ||
+		param->dst.v4.sin_family > AF_MAX ||
+		(param->mask.v4.sin_family != 0 && param->mask.v4.sin_family != param->dst.v4.sin_family))
+	{
+		printf("%s:%d invalid family (dst=%d, mask=%d)\n", __FUNCTION__, __LINE__,
+			param->dst.v4.sin_family, param->mask.v4.sin_family);
+		return EINVAL;
+	}
+
+	if (param->dst.v4.sin_len > sizeof(param->dst) ||
+		(param->mask.v4.sin_family && param->mask.v4.sin_len > sizeof(param->mask)))
+	{
+		printf("%s:%d invalid length (dst=%d, mask=%d)\n", __FUNCTION__, __LINE__,
+			param->dst.v4.sin_len, param->mask.v4.sin_len);
+		return EINVAL;
+	}
+
+	// TBD: Need to validate length of sockaddr for different families?
+	dst.const_sa = (const struct sockaddr*)&param->dst;
+	mask.const_sa = param->mask.v4.sin_family ? (const struct sockaddr*)&param->mask : NULL;
+
+	struct radix_node_head	*rnh = rt_tables[dst.sa->sa_family];
+	if (rnh == NULL) return EAFNOSUPPORT;
+
+	lck_mtx_lock(rnh_lock);
+	struct rtentry *rt = rt_lookup(TRUE, dst.sa, mask.sa, rnh, param->ifindex);
+	lck_mtx_unlock(rnh_lock);
+
+	if (rt) *out_cookie = (nstat_provider_cookie_t)rt;
+
+	return rt ? 0 : ENOENT;
+}
+
+static int
+nstat_route_gone(
+	nstat_provider_cookie_t	cookie)
+{
+	struct rtentry	*rt = (struct rtentry*)cookie;
+	return ((rt->rt_flags & RTF_UP) == 0) ? 1 : 0;
+}
+
+static errno_t
+nstat_route_counts(
+	nstat_provider_cookie_t	cookie,
+	struct nstat_counts		*out_counts,
+	int						*out_gone)
+{
+	struct rtentry		*rt = (struct rtentry*)cookie;
+	struct nstat_counts	*rt_stats = rt->rt_stats;
+
+	*out_gone = 0;
+
+	if ((rt->rt_flags & RTF_UP) == 0) *out_gone = 1;
+
+	if (rt_stats)
+	{
+		atomic_get_64(out_counts->nstat_rxpackets, &rt_stats->nstat_rxpackets);
+		atomic_get_64(out_counts->nstat_rxbytes, &rt_stats->nstat_rxbytes);
+		atomic_get_64(out_counts->nstat_txpackets, &rt_stats->nstat_txpackets);
+		atomic_get_64(out_counts->nstat_txbytes, &rt_stats->nstat_txbytes);
+		out_counts->nstat_rxduplicatebytes = rt_stats->nstat_rxduplicatebytes;
+		out_counts->nstat_rxoutoforderbytes = rt_stats->nstat_rxoutoforderbytes;
+		out_counts->nstat_txretransmit = rt_stats->nstat_txretransmit;
+		out_counts->nstat_connectattempts = rt_stats->nstat_connectattempts;
+		out_counts->nstat_connectsuccesses = rt_stats->nstat_connectsuccesses;
+		out_counts->nstat_min_rtt = rt_stats->nstat_min_rtt;
+		out_counts->nstat_avg_rtt = rt_stats->nstat_avg_rtt;
+		out_counts->nstat_var_rtt = rt_stats->nstat_var_rtt;
+	}
+	else
+		bzero(out_counts, sizeof(*out_counts));
+
+	return 0;
+}
+
+static void
+nstat_route_release(
+	nstat_provider_cookie_t cookie)
+{
+	rtfree((struct rtentry*)cookie);
+}
+
+static u_int32_t	nstat_route_watchers = 0;
+
+static int
+nstat_route_walktree_add(
+	struct radix_node	*rn,
+	void				*context)
+{
+	errno_t	result = 0;
+	struct rtentry *rt = (struct rtentry *)rn;
+	nstat_control_state	*state = (nstat_control_state*)context;
+
+	lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED);
+
+	/* RTF_UP can't change while rnh_lock is held */
+	if ((rt->rt_flags & RTF_UP) != 0)
+	{
+		/* Clear RTPRF_OURS if the route is still usable */
+		RT_LOCK(rt);
+		if (rt_validate(rt)) {
+			RT_ADDREF_LOCKED(rt);
+			RT_UNLOCK(rt);
+		} else {
+			RT_UNLOCK(rt);
+			rt = NULL;
+ } + + /* Otherwise if RTF_CONDEMNED, treat it as if it were down */ + if (rt == NULL) + return (0); + + result = nstat_control_source_add(0, state, &nstat_route_provider, rt); + if (result != 0) + rtfree_locked(rt); + } + + return result; +} + +static errno_t +nstat_route_add_watcher( + nstat_control_state *state) +{ + int i; + errno_t result = 0; + OSIncrementAtomic(&nstat_route_watchers); + + lck_mtx_lock(rnh_lock); + for (i = 1; i < AF_MAX; i++) + { + struct radix_node_head *rnh; + rnh = rt_tables[i]; + if (!rnh) continue; + + result = rnh->rnh_walktree(rnh, nstat_route_walktree_add, state); + if (result != 0) + { + printf("%s:%d rnh_walktree failed: %d\n", __FUNCTION__, __LINE__, result); + break; + } + } + lck_mtx_unlock(rnh_lock); + + return result; +} + +__private_extern__ void +nstat_route_new_entry( + struct rtentry *rt) +{ + if (nstat_route_watchers == 0) + return; + + lck_mtx_lock(&nstat_mtx); + if ((rt->rt_flags & RTF_UP) != 0) + { + nstat_control_state *state; + for (state = nstat_controls; state; state = state->next) + { + if ((state->watching & (1 << NSTAT_PROVIDER_ROUTE)) != 0) + { + // this client is watching routes + // acquire a reference for the route + RT_ADDREF(rt); + + // add the source, if that fails, release the reference + if (nstat_control_source_add(0, state, &nstat_route_provider, rt) != 0) + RT_REMREF(rt); + } + } + } + lck_mtx_unlock(&nstat_mtx); +} + +static void +nstat_route_remove_watcher( + __unused nstat_control_state *state) +{ + OSDecrementAtomic(&nstat_route_watchers); +} + +static errno_t +nstat_route_copy_descriptor( + nstat_provider_cookie_t cookie, + void *data, + u_int32_t len) +{ + nstat_route_descriptor *desc = (nstat_route_descriptor*)data; + if (len < sizeof(*desc)) + { + printf("%s:%d invalid length, wanted %lu, got %d\n", __FUNCTION__, __LINE__, sizeof(*desc), len); + return EINVAL; + } + bzero(desc, sizeof(*desc)); + + struct rtentry *rt = (struct rtentry*)cookie; + desc->id = (uintptr_t)rt; + desc->parent_id = (uintptr_t)rt->rt_parent; + desc->gateway_id = (uintptr_t)rt->rt_gwroute; + + + // key/dest + struct sockaddr *sa; + if ((sa = rt_key(rt))) + nstat_copy_sa_out(sa, &desc->dst.sa, sizeof(desc->dst)); + + // mask + if ((sa = rt_mask(rt)) && sa->sa_len <= sizeof(desc->mask)) + memcpy(&desc->mask, sa, sa->sa_len); + + // gateway + if ((sa = rt->rt_gateway)) + nstat_copy_sa_out(sa, &desc->gateway.sa, sizeof(desc->gateway)); + + if (rt->rt_ifp) + desc->ifindex = rt->rt_ifp->if_index; + + desc->flags = rt->rt_flags; + + return 0; +} + +static void +nstat_init_route_provider(void) +{ + bzero(&nstat_route_provider, sizeof(nstat_route_provider)); + nstat_route_provider.nstat_descriptor_length = sizeof(nstat_route_descriptor); + nstat_route_provider.nstat_provider_id = NSTAT_PROVIDER_ROUTE; + nstat_route_provider.nstat_lookup = nstat_route_lookup; + nstat_route_provider.nstat_gone = nstat_route_gone; + nstat_route_provider.nstat_counts = nstat_route_counts; + nstat_route_provider.nstat_release = nstat_route_release; + nstat_route_provider.nstat_watcher_add = nstat_route_add_watcher; + nstat_route_provider.nstat_watcher_remove = nstat_route_remove_watcher; + nstat_route_provider.nstat_copy_descriptor = nstat_route_copy_descriptor; + nstat_route_provider.next = nstat_providers; + nstat_providers = &nstat_route_provider; +} + +#pragma mark -- Route Collection -- + +static struct nstat_counts* +nstat_route_attach( + struct rtentry *rte) +{ + struct nstat_counts *result = rte->rt_stats; + if (result) return result; + + if (nstat_malloc_tag == 
NULL) nstat_init(); + + result = nstat_malloc_aligned(sizeof(*result), sizeof(u_int64_t), nstat_malloc_tag); + if (!result) return result; + + bzero(result, sizeof(*result)); + + if (!OSCompareAndSwapPtr(NULL, result, &rte->rt_stats)) + { + nstat_free_aligned(result, nstat_malloc_tag); + result = rte->rt_stats; + } + + return result; +} + +__private_extern__ void +nstat_route_detach( + struct rtentry *rte) +{ + if (rte->rt_stats) + { + nstat_free_aligned(rte->rt_stats, nstat_malloc_tag); + rte->rt_stats = NULL; + } +} + +__private_extern__ void +nstat_route_connect_attempt( + struct rtentry *rte) +{ + while (rte) + { + struct nstat_counts* stats = nstat_route_attach(rte); + if (stats) + { + OSIncrementAtomic(&stats->nstat_connectattempts); + } + + rte = rte->rt_parent; + } +} + +__private_extern__ void +nstat_route_connect_success( + struct rtentry *rte) +{ + // This route + while (rte) + { + struct nstat_counts* stats = nstat_route_attach(rte); + if (stats) + { + OSIncrementAtomic(&stats->nstat_connectsuccesses); + } + + rte = rte->rt_parent; + } +} + +__private_extern__ void +nstat_route_tx( + struct rtentry *rte, + u_int32_t packets, + u_int32_t bytes, + u_int32_t flags) +{ + while (rte) + { + struct nstat_counts* stats = nstat_route_attach(rte); + if (stats) + { + if ((flags & NSTAT_TX_FLAG_RETRANSMIT) != 0) + { + OSAddAtomic(bytes, &stats->nstat_txretransmit); + } + else + { + OSAddAtomic64((SInt64)packets, (SInt64*)&stats->nstat_txpackets); + OSAddAtomic64((SInt64)bytes, (SInt64*)&stats->nstat_txbytes); + } + } + + rte = rte->rt_parent; + } +} + +__private_extern__ void +nstat_route_rx( + struct rtentry *rte, + u_int32_t packets, + u_int32_t bytes, + u_int32_t flags) +{ + while (rte) + { + struct nstat_counts* stats = nstat_route_attach(rte); + if (stats) + { + if (flags == 0) + { + OSAddAtomic64((SInt64)packets, (SInt64*)&stats->nstat_rxpackets); + OSAddAtomic64((SInt64)bytes, (SInt64*)&stats->nstat_rxbytes); + } + else + { + if (flags & NSTAT_RX_FLAG_OUT_OF_ORDER) + OSAddAtomic(bytes, &stats->nstat_rxoutoforderbytes); + if (flags & NSTAT_RX_FLAG_DUPLICATE) + OSAddAtomic(bytes, &stats->nstat_rxduplicatebytes); + } + } + + rte = rte->rt_parent; + } +} + +__private_extern__ void +nstat_route_rtt( + struct rtentry *rte, + u_int32_t rtt, + u_int32_t rtt_var) +{ + const int32_t factor = 8; + + while (rte) + { + struct nstat_counts* stats = nstat_route_attach(rte); + if (stats) + { + int32_t oldrtt; + int32_t newrtt; + + // average + do + { + oldrtt = stats->nstat_avg_rtt; + if (oldrtt == 0) + { + newrtt = rtt; + } + else + { + newrtt = oldrtt - (oldrtt - (int32_t)rtt) / factor; + } + if (oldrtt == newrtt) break; + } while (!OSCompareAndSwap(oldrtt, newrtt, &stats->nstat_avg_rtt)); + + // minimum + do + { + oldrtt = stats->nstat_min_rtt; + if (oldrtt != 0 && oldrtt < (int32_t)rtt) + { + break; + } + } while (!OSCompareAndSwap(oldrtt, rtt, &stats->nstat_min_rtt)); + + // variance + do + { + oldrtt = stats->nstat_var_rtt; + if (oldrtt == 0) + { + newrtt = rtt_var; + } + else + { + newrtt = oldrtt - (oldrtt - (int32_t)rtt_var) / factor; + } + if (oldrtt == newrtt) break; + } while (!OSCompareAndSwap(oldrtt, newrtt, &stats->nstat_var_rtt)); + } + + rte = rte->rt_parent; + } +} + +#pragma mark -- TCP Provider -- + +static nstat_provider nstat_tcp_provider; + +static errno_t +nstat_tcpudp_lookup( + struct inpcbinfo *inpinfo, + const void *data, + u_int32_t length, + nstat_provider_cookie_t *out_cookie) +{ + // parameter validation + const nstat_tcp_add_param *param = (const 
nstat_tcp_add_param*)data;
+	if (length < sizeof(*param))
+	{
+		printf("%s:%d expected %lu byte param, received %u\n", __FUNCTION__, __LINE__, sizeof(*param), length);
+		return EINVAL;
+	}
+
+	// src and dst must match
+	if (param->remote.v4.sin_family != 0 &&
+		param->remote.v4.sin_family != param->local.v4.sin_family)
+	{
+		printf("%s:%d src family (%d) and dst family (%d) don't match\n",
+			__FUNCTION__, __LINE__, param->local.v4.sin_family, param->remote.v4.sin_family);
+		return EINVAL;
+	}
+
+	struct inpcb *inp = NULL;
+
+	switch (param->local.v4.sin_family)
+	{
+		case AF_INET:
+		{
+			if (param->local.v4.sin_len != sizeof(param->local.v4) ||
+				(param->remote.v4.sin_family != 0 &&
+				 param->remote.v4.sin_len != sizeof(param->remote.v4)))
+			{
+				printf("%s:%d invalid length for v4 src (%d) or dst (%d), should be %lu\n",
+					__FUNCTION__, __LINE__, param->local.v4.sin_len, param->remote.v4.sin_len,
+					sizeof(param->remote.v4));
+				return EINVAL;
+			}
+
+			inp = in_pcblookup_hash(inpinfo, param->remote.v4.sin_addr, param->remote.v4.sin_port,
+				param->local.v4.sin_addr, param->local.v4.sin_port, 1, NULL);
+		}
+		break;
+
+#if INET6
+		case AF_INET6:
+		{
+			union
+			{
+				const struct in6_addr	*in6c;
+				struct in6_addr			*in6;
+			} local, remote;
+
+			if (param->local.v6.sin6_len != sizeof(param->local.v6) ||
+				(param->remote.v6.sin6_family != 0 &&
+				 param->remote.v6.sin6_len != sizeof(param->remote.v6)))
+			{
+				printf("%s:%d invalid length for v6 src (%d) or dst (%d), should be %lu\n",
+					__FUNCTION__, __LINE__, param->local.v6.sin6_len, param->remote.v6.sin6_len,
+					sizeof(param->remote.v6));
+				return EINVAL;
+			}
+
+			local.in6c = &param->local.v6.sin6_addr;
+			remote.in6c = &param->remote.v6.sin6_addr;
+
+			inp = in6_pcblookup_hash(inpinfo, remote.in6, param->remote.v6.sin6_port,
+				local.in6, param->local.v6.sin6_port, 1, NULL);
+		}
+		break;
+#endif
+
+		default:
+			printf("%s:%d unsupported address family %d\n", __FUNCTION__, __LINE__, param->local.v4.sin_family);
+			return EINVAL;
+	}
+
+	if (inp == NULL) return ENOENT;
+
+	// At this point we have a ref to the inpcb
+	*out_cookie = inp;
+	return 0;
+}
+
+static errno_t
+nstat_tcp_lookup(
+	const void				*data,
+	u_int32_t				length,
+	nstat_provider_cookie_t	*out_cookie)
+{
+	return nstat_tcpudp_lookup(&tcbinfo, data, length, out_cookie);
+}
+
+static int
+nstat_tcp_gone(
+	nstat_provider_cookie_t	cookie)
+{
+	struct inpcb	*inp = (struct inpcb*)cookie;
+	struct tcpcb	*tp = intotcpcb(inp);
+	return (inp->inp_state == INPCB_STATE_DEAD || tp->t_state == TCPS_TIME_WAIT) ?
1 : 0; +} + +static errno_t +nstat_tcp_counts( + nstat_provider_cookie_t cookie, + struct nstat_counts *out_counts, + int *out_gone) +{ + struct inpcb *inp = (struct inpcb*)cookie; + struct tcpcb *tp = intotcpcb(inp); + + bzero(out_counts, sizeof(*out_counts)); + + *out_gone = 0; + + // if the pcb is in the dead state, we should stop using it + if (inp->inp_state == INPCB_STATE_DEAD || tp->t_state == TCPS_TIME_WAIT) + { + *out_gone = 1; + } + + if (tp->t_state > TCPS_LISTEN) + { + atomic_get_64(out_counts->nstat_rxpackets, &inp->inp_stat->rxpackets); + atomic_get_64(out_counts->nstat_rxbytes, &inp->inp_stat->rxbytes); + atomic_get_64(out_counts->nstat_txpackets, &inp->inp_stat->txpackets); + atomic_get_64(out_counts->nstat_txbytes, &inp->inp_stat->txbytes); + out_counts->nstat_rxduplicatebytes = tp->t_stat.rxduplicatebytes; + out_counts->nstat_rxoutoforderbytes = tp->t_stat.rxoutoforderbytes; + out_counts->nstat_txretransmit = tp->t_stat.txretransmitbytes; + out_counts->nstat_connectattempts = tp->t_state >= TCPS_SYN_SENT ? 1 : 0; + out_counts->nstat_connectsuccesses = tp->t_state >= TCPS_ESTABLISHED ? 1 : 0; + out_counts->nstat_avg_rtt = tp->t_srtt; + out_counts->nstat_min_rtt = tp->t_rttbest; + out_counts->nstat_var_rtt = tp->t_rttvar; + } + + return 0; +} + +static void +nstat_tcp_release( + nstat_provider_cookie_t cookie) +{ + struct inpcb *inp = (struct inpcb*)cookie; + in_pcb_checkstate(inp, WNT_RELEASE, 0); +} + +static u_int32_t nstat_tcp_watchers = 0; + +static errno_t +nstat_tcp_add_watcher( + nstat_control_state *state) +{ + OSIncrementAtomic(&nstat_tcp_watchers); + + lck_rw_lock_shared(tcbinfo.mtx); + + // Add all current tcp inpcbs. Ignore those in timewait + struct inpcb *inp; + for (inp = LIST_FIRST(tcbinfo.listhead); inp; inp = LIST_NEXT(inp, inp_list)) + { + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) + continue; + + if (nstat_control_source_add(0, state, &nstat_tcp_provider, inp) != 0) + { + in_pcb_checkstate(inp, WNT_RELEASE, 0); + break; + } + } + + lck_rw_done(tcbinfo.mtx); + + return 0; +} + +static void +nstat_tcp_remove_watcher( + __unused nstat_control_state *state) +{ + OSDecrementAtomic(&nstat_tcp_watchers); +} + +__private_extern__ void +nstat_tcp_new_pcb( + struct inpcb *inp) +{ + if (nstat_tcp_watchers == 0) + return; + + lck_mtx_lock(&nstat_mtx); + nstat_control_state *state; + for (state = nstat_controls; state; state = state->next) + { + if ((state->watching & (1 << NSTAT_PROVIDER_TCP)) != 0) + { + // this client is watching tcp + // acquire a reference for it + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) + break; + + // add the source, if that fails, release the reference + if (nstat_control_source_add(0, state, &nstat_tcp_provider, inp) != 0) + { + in_pcb_checkstate(inp, WNT_RELEASE, 0); + break; + } + } + } + lck_mtx_unlock(&nstat_mtx); +} + +static errno_t +nstat_tcp_copy_descriptor( + nstat_provider_cookie_t cookie, + void *data, + u_int32_t len) +{ + if (len < sizeof(nstat_tcp_descriptor)) + { + printf("%s:%d invalid length, wanted %lu, got %d\n", __FUNCTION__, __LINE__, sizeof(nstat_tcp_descriptor), len); + return EINVAL; + } + + nstat_tcp_descriptor *desc = (nstat_tcp_descriptor*)data; + struct inpcb *inp = (struct inpcb*)cookie; + struct tcpcb *tp = intotcpcb(inp); + + bzero(desc, sizeof(*desc)); + + if (inp->inp_vflag & INP_IPV6) + { + nstat_ip6_to_sockaddr(&inp->in6p_laddr, inp->inp_lport, + &desc->local.v6, sizeof(desc->local)); + nstat_ip6_to_sockaddr(&inp->in6p_faddr, inp->inp_fport, + &desc->remote.v6, 
sizeof(desc->remote));
+	}
+	else if (inp->inp_vflag & INP_IPV4)
+	{
+		nstat_ip_to_sockaddr(&inp->inp_laddr, inp->inp_lport,
+			&desc->local.v4, sizeof(desc->local));
+		nstat_ip_to_sockaddr(&inp->inp_faddr, inp->inp_fport,
+			&desc->remote.v4, sizeof(desc->remote));
+	}
+
+	desc->state = intotcpcb(inp)->t_state;
+	if (inp->inp_route.ro_rt && inp->inp_route.ro_rt->rt_ifp)
+		desc->ifindex = inp->inp_route.ro_rt->rt_ifp->if_index;
+
+	// danger - not locked, values could be bogus
+	desc->txunacked = tp->snd_max - tp->snd_una;
+	desc->txwindow = tp->snd_wnd;
+	desc->txcwindow = tp->snd_cwnd;
+
+	struct socket *so = inp->inp_socket;
+	if (so)
+	{
+		// TBD - take the socket lock around these to make sure
+		// they're in sync?
+		desc->upid = so->last_upid;
+		desc->pid = so->last_pid;
+
+		proc_name(desc->pid, desc->pname, sizeof(desc->pname));
+		desc->pname[sizeof(desc->pname) - 1] = 0;
+
+		desc->sndbufsize = so->so_snd.sb_hiwat;
+		desc->sndbufused = so->so_snd.sb_cc;
+		desc->rcvbufsize = so->so_rcv.sb_hiwat;
+		desc->rcvbufused = so->so_rcv.sb_cc;
+	}
+
+	return 0;
+}
+
+static void
+nstat_init_tcp_provider(void)
+{
+	bzero(&nstat_tcp_provider, sizeof(nstat_tcp_provider));
+	nstat_tcp_provider.nstat_descriptor_length = sizeof(nstat_tcp_descriptor);
+	nstat_tcp_provider.nstat_provider_id = NSTAT_PROVIDER_TCP;
+	nstat_tcp_provider.nstat_lookup = nstat_tcp_lookup;
+	nstat_tcp_provider.nstat_gone = nstat_tcp_gone;
+	nstat_tcp_provider.nstat_counts = nstat_tcp_counts;
+	nstat_tcp_provider.nstat_release = nstat_tcp_release;
+	nstat_tcp_provider.nstat_watcher_add = nstat_tcp_add_watcher;
+	nstat_tcp_provider.nstat_watcher_remove = nstat_tcp_remove_watcher;
+	nstat_tcp_provider.nstat_copy_descriptor = nstat_tcp_copy_descriptor;
+	nstat_tcp_provider.next = nstat_providers;
+	nstat_providers = &nstat_tcp_provider;
+}
+
+#pragma mark -- UDP Provider --
+
+static nstat_provider	nstat_udp_provider;
+
+static errno_t
+nstat_udp_lookup(
+	const void				*data,
+	u_int32_t				length,
+	nstat_provider_cookie_t	*out_cookie)
+{
+	return nstat_tcpudp_lookup(&udbinfo, data, length, out_cookie);
+}
+
+static int
+nstat_udp_gone(
+	nstat_provider_cookie_t	cookie)
+{
+	struct inpcb	*inp = (struct inpcb*)cookie;
+	return (inp->inp_state == INPCB_STATE_DEAD) ? 1 : 0;
+}
+
+static errno_t
+nstat_udp_counts(
+	nstat_provider_cookie_t	cookie,
+	struct nstat_counts		*out_counts,
+	int						*out_gone)
+{
+	struct inpcb	*inp = (struct inpcb*)cookie;
+
+	*out_gone = 0;
+
+	// if the pcb is in the dead state, we should stop using it
+	if (inp->inp_state == INPCB_STATE_DEAD)
+	{
+		*out_gone = 1;
+	}
+
+	atomic_get_64(out_counts->nstat_rxpackets, &inp->inp_stat->rxpackets);
+	atomic_get_64(out_counts->nstat_rxbytes, &inp->inp_stat->rxbytes);
+	atomic_get_64(out_counts->nstat_txpackets, &inp->inp_stat->txpackets);
+	atomic_get_64(out_counts->nstat_txbytes, &inp->inp_stat->txbytes);
+
+	return 0;
+}
+
+static void
+nstat_udp_release(
+	nstat_provider_cookie_t cookie)
+{
+	struct inpcb *inp = (struct inpcb*)cookie;
+	in_pcb_checkstate(inp, WNT_RELEASE, 0);
+}
+
+static u_int32_t	nstat_udp_watchers = 0;
+
+static errno_t
+nstat_udp_add_watcher(
+	nstat_control_state	*state)
+{
+	OSIncrementAtomic(&nstat_udp_watchers);
+
+	lck_rw_lock_shared(udbinfo.mtx);
+
+	// Add all current UDP inpcbs.
+	struct inpcb *inp;
+	for (inp = LIST_FIRST(udbinfo.listhead); inp; inp = LIST_NEXT(inp, inp_list))
+	{
+		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
+			continue;
+
+		if (nstat_control_source_add(0, state, &nstat_udp_provider, inp) != 0)
+		{
+			in_pcb_checkstate(inp, WNT_RELEASE, 0);
+			break;
+		}
+	}
+
+	lck_rw_done(udbinfo.mtx);
+
+	return 0;
+}
+
+static void
+nstat_udp_remove_watcher(
+	__unused nstat_control_state	*state)
+{
+	OSDecrementAtomic(&nstat_udp_watchers);
+}
+
+__private_extern__ void
+nstat_udp_new_pcb(
+	struct inpcb	*inp)
+{
+	if (nstat_udp_watchers == 0)
+		return;
+
+	lck_mtx_lock(&nstat_mtx);
+	nstat_control_state	*state;
+	for (state = nstat_controls; state; state = state->next)
+	{
+		if ((state->watching & (1 << NSTAT_PROVIDER_UDP)) != 0)
+		{
+			// this client is watching udp
+			// acquire a reference for it
+			if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
+				break;
+
+			// add the source, if that fails, release the reference
+			if (nstat_control_source_add(0, state, &nstat_udp_provider, inp) != 0)
+			{
+				in_pcb_checkstate(inp, WNT_RELEASE, 0);
+				break;
+			}
+		}
+	}
+	lck_mtx_unlock(&nstat_mtx);
+}
+
+static errno_t
+nstat_udp_copy_descriptor(
+	nstat_provider_cookie_t	cookie,
+	void					*data,
+	u_int32_t				len)
+{
+	if (len < sizeof(nstat_udp_descriptor))
+	{
+		printf("%s:%d invalid length, wanted %lu, got %d\n", __FUNCTION__, __LINE__, sizeof(nstat_udp_descriptor), len);
+		return EINVAL;
+	}
+
+	nstat_udp_descriptor	*desc = (nstat_udp_descriptor*)data;
+	struct inpcb			*inp = (struct inpcb*)cookie;
+
+	bzero(desc, sizeof(*desc));
+
+	if (inp->inp_vflag & INP_IPV6)
+	{
+		nstat_ip6_to_sockaddr(&inp->in6p_laddr, inp->inp_lport,
+			&desc->local.v6, sizeof(desc->local));
+		nstat_ip6_to_sockaddr(&inp->in6p_faddr, inp->inp_fport,
+			&desc->remote.v6, sizeof(desc->remote));
+	}
+	else if (inp->inp_vflag & INP_IPV4)
+	{
+		nstat_ip_to_sockaddr(&inp->inp_laddr, inp->inp_lport,
+			&desc->local.v4, sizeof(desc->local));
+		nstat_ip_to_sockaddr(&inp->inp_faddr, inp->inp_fport,
+			&desc->remote.v4, sizeof(desc->remote));
+	}
+
+	if (inp->inp_route.ro_rt && inp->inp_route.ro_rt->rt_ifp)
+		desc->ifindex = inp->inp_route.ro_rt->rt_ifp->if_index;
+
+	struct socket *so = inp->inp_socket;
+	if (so)
+	{
+		// TBD - take the socket lock around these to make sure
+		// they're in sync?
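+		// last_upid/last_pid record the most recent process to act on the
+		// socket, which is not necessarily the process that created it.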
+ desc->upid = so->last_upid; + desc->pid = so->last_pid; + + desc->rcvbufsize = so->so_rcv.sb_hiwat; + desc->rcvbufused = so->so_rcv.sb_cc; + + proc_name(desc->pid, desc->pname, sizeof(desc->pname)); + desc->pname[sizeof(desc->pname) - 1] = 0; + } + + return 0; +} + +static void +nstat_init_udp_provider(void) +{ + bzero(&nstat_udp_provider, sizeof(nstat_udp_provider)); + nstat_udp_provider.nstat_provider_id = NSTAT_PROVIDER_UDP; + nstat_udp_provider.nstat_descriptor_length = sizeof(nstat_udp_descriptor); + nstat_udp_provider.nstat_lookup = nstat_udp_lookup; + nstat_udp_provider.nstat_gone = nstat_udp_gone; + nstat_udp_provider.nstat_counts = nstat_udp_counts; + nstat_udp_provider.nstat_watcher_add = nstat_udp_add_watcher; + nstat_udp_provider.nstat_watcher_remove = nstat_udp_remove_watcher; + nstat_udp_provider.nstat_copy_descriptor = nstat_udp_copy_descriptor; + nstat_udp_provider.nstat_release = nstat_udp_release; + nstat_udp_provider.next = nstat_providers; + nstat_providers = &nstat_udp_provider; +} + +#pragma mark -- Kernel Control Socket -- + +typedef struct nstat_src +{ + struct nstat_src *next; + nstat_src_ref_t srcref; + nstat_provider *provider; + nstat_provider_cookie_t cookie; +} nstat_src; + +static kern_ctl_ref nstat_ctlref = NULL; +static lck_grp_t *nstat_lck_grp = NULL; + +static errno_t nstat_control_connect(kern_ctl_ref kctl, struct sockaddr_ctl *sac, void **uinfo); +static errno_t nstat_control_disconnect(kern_ctl_ref kctl, u_int32_t unit, void *uinfo); +static errno_t nstat_control_send(kern_ctl_ref kctl, u_int32_t unit, void *uinfo, mbuf_t m, int flags); +static int nstat_control_send_description(nstat_control_state *state, nstat_src *src, u_int64_t context); +static void nstat_control_cleanup_source(nstat_control_state *state, struct nstat_src *src); + + +static void* +nstat_idle_check( + __unused thread_call_param_t p0, + __unused thread_call_param_t p1) +{ + lck_mtx_lock(&nstat_mtx); + + nstat_idle_time = 0ULL; + + nstat_control_state *control; + nstat_src *dead = NULL; + nstat_src *dead_list = NULL; + for (control = nstat_controls; control; control = control->next) + { + lck_mtx_lock(&control->mtx); + nstat_src **srcpp = &control->srcs; + + while(*srcpp != NULL) + { + if ((*srcpp)->provider->nstat_gone((*srcpp)->cookie)) + { + // Pull it off the list + dead = *srcpp; + *srcpp = (*srcpp)->next; + + // send a last description + nstat_control_send_description(control, dead, 0ULL); + + // send the source removed notification + nstat_msg_src_removed removed; + removed.hdr.type = NSTAT_MSG_TYPE_SRC_REMOVED; + removed.hdr.context = 0; + removed.srcref = dead->srcref; + errno_t result = ctl_enqueuedata(control->kctl, control->unit, &removed, sizeof(removed), CTL_DATA_EOR); + if (result != 0) printf("%s:%d ctl_enqueuedata failed: %d\n", __FUNCTION__, __LINE__, result); + + // Put this on the list to release later + dead->next = dead_list; + dead_list = dead; + } + else + { + srcpp = &(*srcpp)->next; + } + } + lck_mtx_unlock(&control->mtx); + } + + if (nstat_controls) + { + clock_interval_to_deadline(60, NSEC_PER_SEC, &nstat_idle_time); + thread_call_func_delayed((thread_call_func_t)nstat_idle_check, NULL, nstat_idle_time); + } + + lck_mtx_unlock(&nstat_mtx); + + // Release the sources now that we aren't holding lots of locks + while (dead_list) + { + dead = dead_list; + dead_list = dead->next; + + nstat_control_cleanup_source(NULL, dead); + } + + return NULL; +} + +static void +nstat_control_register(void) +{ + // Create our lock group first + lck_grp_attr_t *grp_attr = 
lck_grp_attr_alloc_init(); + lck_grp_attr_setdefault(grp_attr); + nstat_lck_grp = lck_grp_alloc_init("network statistics kctl", grp_attr); + lck_grp_attr_free(grp_attr); + + lck_mtx_init(&nstat_mtx, nstat_lck_grp, NULL); + + // Register the control + struct kern_ctl_reg nstat_control; + bzero(&nstat_control, sizeof(nstat_control)); + strlcpy(nstat_control.ctl_name, NET_STAT_CONTROL_NAME, sizeof(nstat_control.ctl_name)); + nstat_control.ctl_connect = nstat_control_connect; + nstat_control.ctl_disconnect = nstat_control_disconnect; + nstat_control.ctl_send = nstat_control_send; + + errno_t result = ctl_register(&nstat_control, &nstat_ctlref); + if (result != 0) + printf("%s:%d ctl_register failed: %d", __FUNCTION__, __LINE__, result); +} + +static void +nstat_control_cleanup_source( + nstat_control_state *state, + struct nstat_src *src) +{ + if (state) + { + nstat_msg_src_removed removed; + removed.hdr.type = NSTAT_MSG_TYPE_SRC_REMOVED; + removed.hdr.context = 0; + removed.srcref = src->srcref; + errno_t result = ctl_enqueuedata(state->kctl, state->unit, &removed, sizeof(removed), CTL_DATA_EOR); + if (result != 0) printf("%s:%d ctl_enqueuedata failed: %d\n", __FUNCTION__, __LINE__, result); + } + + // Cleanup the source if we found it. + src->provider->nstat_release(src->cookie); + OSFree(src, sizeof(*src), nstat_malloc_tag); +} + +static errno_t +nstat_control_connect( + kern_ctl_ref kctl, + struct sockaddr_ctl *sac, + void **uinfo) +{ + nstat_control_state *state = OSMalloc(sizeof(*state), nstat_malloc_tag); + if (state == NULL) return ENOMEM; + + bzero(state, sizeof(*state)); + lck_mtx_init(&state->mtx, nstat_lck_grp, NULL); + state->kctl = kctl; + state->unit = sac->sc_unit; + *uinfo = state; + + // check if we're super user + proc_t pself = proc_self(); + state->suser = proc_suser(pself) == 0; + proc_rele(pself); + + lck_mtx_lock(&nstat_mtx); + state->next = nstat_controls; + nstat_controls = state; + + if (nstat_idle_time == 0ULL) + { + clock_interval_to_deadline(60, NSEC_PER_SEC, &nstat_idle_time); + thread_call_func_delayed((thread_call_func_t)nstat_idle_check, NULL, nstat_idle_time); + } + + lck_mtx_unlock(&nstat_mtx); + + return 0; +} + +static errno_t +nstat_control_disconnect( + __unused kern_ctl_ref kctl, + __unused u_int32_t unit, + __unused void *uinfo) +{ + u_int32_t watching; + nstat_control_state *state = (nstat_control_state*)uinfo; + + // pull it out of the global list of states + lck_mtx_lock(&nstat_mtx); + nstat_control_state **statepp; + for (statepp = &nstat_controls; *statepp; statepp = &(*statepp)->next) + { + if (*statepp == state) + { + *statepp = state->next; + break; + } + } + lck_mtx_unlock(&nstat_mtx); + + lck_mtx_lock(&state->mtx); + // Stop watching for sources + nstat_provider *provider; + watching = state->watching; + state->watching = 0; + for (provider = nstat_providers; provider && watching; provider = provider->next) + { + if ((watching & (1 << provider->nstat_provider_id)) != 0) + { + watching &= ~(1 << provider->nstat_provider_id); + provider->nstat_watcher_remove(state); + } + } + + // set cleanup flags + state->cleanup = TRUE; + + // Copy out the list of sources + nstat_src *srcs = state->srcs; + state->srcs = NULL; + lck_mtx_unlock(&state->mtx); + + while (srcs) + { + nstat_src *src; + + // pull it out of the list + src = srcs; + srcs = src->next; + + // clean it up + nstat_control_cleanup_source(NULL, src); + } + + OSFree(state, sizeof(*state), nstat_malloc_tag); + + return 0; +} + +static nstat_src_ref_t +nstat_control_next_src_ref( + 
nstat_control_state *state) +{ + int i = 0; + nstat_src_ref_t toReturn = NSTAT_SRC_REF_INVALID; + + for (i = 0; i < 1000 && toReturn == NSTAT_SRC_REF_INVALID; i++) + { + if (state->next_srcref == NSTAT_SRC_REF_INVALID || + state->next_srcref == NSTAT_SRC_REF_ALL) + { + state->next_srcref = 1; + } + + nstat_src *src; + for (src = state->srcs; src; src = src->next) + { + if (src->srcref == state->next_srcref) + break; + } + + if (src == NULL) toReturn = state->next_srcref; + state->next_srcref++; + } + + return toReturn; +} + +static int +nstat_control_send_description( + nstat_control_state *state, + nstat_src *src, + u_int64_t context) +{ + // Provider doesn't support getting the descriptor? Done. + if (src->provider->nstat_descriptor_length == 0 || + src->provider->nstat_copy_descriptor == NULL) + { + lck_mtx_unlock(&state->mtx); + printf("%s:%d - provider doesn't support descriptions\n", __FUNCTION__, __LINE__); + return EOPNOTSUPP; + } + + // Allocate storage for the descriptor message + mbuf_t msg; + unsigned int one = 1; + u_int32_t size = offsetof(nstat_msg_src_description, data) + src->provider->nstat_descriptor_length; + if (mbuf_allocpacket(MBUF_WAITOK, size, &one, &msg) != 0) + { + lck_mtx_unlock(&state->mtx); + printf("%s:%d - failed to allocate response\n", __FUNCTION__, __LINE__); + return ENOMEM; + } + + nstat_msg_src_description *desc = (nstat_msg_src_description*)mbuf_data(msg); + mbuf_setlen(msg, size); + mbuf_pkthdr_setlen(msg, mbuf_len(msg)); + + // Query the provider for the provider specific bits + errno_t result = src->provider->nstat_copy_descriptor(src->cookie, desc->data, src->provider->nstat_descriptor_length); + + if (result != 0) + { + mbuf_freem(msg); + printf("%s:%d - provider failed to copy descriptor %d\n", __FUNCTION__, __LINE__, result); + return result; + } + + desc->hdr.context = context; + desc->hdr.type = NSTAT_MSG_TYPE_SRC_DESC; + desc->srcref = src->srcref; + desc->provider = src->provider->nstat_provider_id; + + result = ctl_enqueuembuf(state->kctl, state->unit, msg, CTL_DATA_EOR); + if (result != 0) + { + printf("%s:%d ctl_enqueuembuf returned error %d\n", __FUNCTION__, __LINE__, result); + mbuf_freem(msg); + } + + return result; +} + +static errno_t +nstat_control_handle_add_request( + nstat_control_state *state, + mbuf_t m) +{ + errno_t result; + + // Verify the header fits in the first mbuf + if (mbuf_len(m) < offsetof(nstat_msg_add_src_req, param)) + { + printf("mbuf_len(m)=%lu, offsetof(nstat_msg_add_src_req*, param)=%lu\n", + mbuf_len(m), offsetof(nstat_msg_add_src_req, param)); + return EINVAL; + } + + // Calculate the length of the parameter field + int32_t paramlength = mbuf_pkthdr_len(m) - offsetof(nstat_msg_add_src_req, param); + if (paramlength < 0 || paramlength > 2 * 1024) + { + printf("invalid paramlength=%d\n", paramlength); + return EINVAL; + } + + nstat_provider *provider; + nstat_provider_cookie_t cookie; + nstat_msg_add_src_req *req = mbuf_data(m); + if (mbuf_pkthdr_len(m) > mbuf_len(m)) + { + // parameter is too large, we need to make a contiguous copy + void *data = OSMalloc(paramlength, nstat_malloc_tag); + + if (!data) return ENOMEM; + result = mbuf_copydata(m, offsetof(nstat_msg_add_src_req, param), paramlength, data); + if (result == 0) + result = nstat_lookup_entry(req->provider, data, paramlength, &provider, &cookie); + OSFree(data, paramlength, nstat_malloc_tag); + } + else + { + result = nstat_lookup_entry(req->provider, (void*)&req->param, paramlength, &provider, &cookie); + } + + if (result != 0) + { + 
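+		// the lookup failed, so no provider cookie exists yet and there is
+		// nothing to release on this path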
printf("nstat_lookup_entry failed: %d\n", result); + return result; + } + + result = nstat_control_source_add(req->hdr.context, state, provider, cookie); + if (result != 0) + provider->nstat_release(cookie); + + return result; +} + +static int +nstat_perm_check( + __unused nstat_control_state *state) +{ + int allow = 0; +#if !REQUIRE_ROOT_FOR_STATS + allow = 1; +#else + // If the socket was created by a priv process, allow + if (state->suser) return 1; + + // If the current process is priv, allow + proc_t self = proc_self(); + allow = proc_suser(self) == 0; + proc_rele(self); + + // TBD: check for entitlement, root check is too coarse +#endif /* REQUIRE_ROOT_FOR_STATS */ + + return allow; +} + +static errno_t +nstat_control_handle_add_all( + nstat_control_state *state, + mbuf_t m) +{ + errno_t result = 0; + + if (!nstat_perm_check(state)) + { + return EPERM; + } + + // Verify the header fits in the first mbuf + if (mbuf_len(m) < sizeof(nstat_msg_add_all_srcs)) + { + printf("mbuf_len(m)=%lu, sizeof(nstat_msg_add_all_srcs)=%lu\n", + mbuf_len(m), sizeof(nstat_msg_add_all_srcs)); + return EINVAL; + } + + nstat_msg_add_all_srcs *req = mbuf_data(m); + nstat_provider *provider = nstat_find_provider_by_id(req->provider); + + if (!provider) return ENOENT; + if (provider->nstat_watcher_add == NULL) return ENOTSUP; + + // Make sure we don't add the provider twice + lck_mtx_lock(&state->mtx); + if ((state->watching & (1 << provider->nstat_provider_id)) != 0) + result = EALREADY; + state->watching |= (1 << provider->nstat_provider_id); + lck_mtx_unlock(&state->mtx); + if (result != 0) return result; + + result = provider->nstat_watcher_add(state); + if (result != 0) + { + lck_mtx_lock(&state->mtx); + state->watching &= ~(1 << provider->nstat_provider_id); + lck_mtx_unlock(&state->mtx); + } + + if (result == 0) + { + // Notify the client + nstat_msg_hdr success; + success.context = req->hdr.context; + success.type = NSTAT_MSG_TYPE_SUCCESS; + success.pad = 0; + if (ctl_enqueuedata(state->kctl, state->unit, &success, sizeof(success), CTL_DATA_EOR) != 0) + printf("%s:%d - failed to enqueue success message\n", __FUNCTION__, __LINE__); + } + + return result; +} + +static errno_t +nstat_control_source_add( + u_int64_t context, + nstat_control_state *state, + nstat_provider *provider, + nstat_provider_cookie_t cookie) +{ + // Fill out source added message + mbuf_t msg = NULL; + unsigned int one = 1; + + if (mbuf_allocpacket(MBUF_WAITOK, sizeof(nstat_msg_src_added), &one, &msg) != 0) + return ENOMEM; + + mbuf_setlen(msg, sizeof(nstat_msg_src_added)); + mbuf_pkthdr_setlen(msg, mbuf_len(msg)); + nstat_msg_src_added *add = mbuf_data(msg); + bzero(add, sizeof(*add)); + add->hdr.type = NSTAT_MSG_TYPE_SRC_ADDED; + add->hdr.context = context; + add->provider = provider->nstat_provider_id; + + // Allocate storage for the source + nstat_src *src = OSMalloc(sizeof(*src), nstat_malloc_tag); + if (src == NULL) + { + mbuf_freem(msg); + return ENOMEM; + } + + // Fill in the source, including picking an unused source ref + lck_mtx_lock(&state->mtx); + + add->srcref = src->srcref = nstat_control_next_src_ref(state); + if (state->cleanup || src->srcref == NSTAT_SRC_REF_INVALID) + { + lck_mtx_unlock(&state->mtx); + OSFree(src, sizeof(*src), nstat_malloc_tag); + mbuf_freem(msg); + return EINVAL; + } + src->provider = provider; + src->cookie = cookie; + + // send the source added message + errno_t result = ctl_enqueuembuf(state->kctl, state->unit, msg, CTL_DATA_EOR); + if (result != 0) + { + lck_mtx_unlock(&state->mtx); + 
printf("%s:%d ctl_enqueuembuf failed: %d\n", __FUNCTION__, __LINE__, result); + OSFree(src, sizeof(*src), nstat_malloc_tag); + mbuf_freem(msg); + return result; + } + + // Put the source in the list + src->next = state->srcs; + state->srcs = src; + + // send the description message + // not useful as the source is often not complete +// nstat_control_send_description(state, src, 0ULL); + + lck_mtx_unlock(&state->mtx); + + return 0; +} + +static errno_t +nstat_control_handle_remove_request( + nstat_control_state *state, + mbuf_t m) +{ + nstat_src_ref_t srcref = NSTAT_SRC_REF_INVALID; + + if (mbuf_copydata(m, offsetof(nstat_msg_rem_src_req, srcref), sizeof(srcref), &srcref) != 0) + { + printf("%s:%d - invalid length %u, expected %lu\n", __FUNCTION__, __LINE__, (u_int32_t)mbuf_pkthdr_len(m), sizeof(nstat_msg_rem_src_req)); + return EINVAL; + } + + lck_mtx_lock(&state->mtx); + + // Remove this source as we look for it + nstat_src **nextp; + nstat_src *src = NULL; + for (nextp = &state->srcs; *nextp; nextp = &(*nextp)->next) + { + if ((*nextp)->srcref == srcref) + { + src = *nextp; + *nextp = src->next; + break; + } + } + + lck_mtx_unlock(&state->mtx); + + if (src) nstat_control_cleanup_source(state, src); + + return src ? 0 : ENOENT; +} + +static errno_t +nstat_control_handle_query_request( + nstat_control_state *state, + mbuf_t m) +{ + // TBD: handle this from another thread so we can enqueue a lot of data + // As written, if a client requests query all, this function will be + // called from their send of the request message. We will attempt to write + // responses and succeed until the buffer fills up. Since the clients thread + // is blocked on send, it won't be reading unless the client has two threads + // using this socket, one for read and one for write. Two threads probably + // won't work with this code anyhow since we don't have proper locking in + // place yet. 
+	nstat_src	*dead_srcs = NULL;
+	errno_t		result = ENOENT;
+	nstat_msg_query_src_req	req;
+	if (mbuf_copydata(m, 0, sizeof(req), &req) != 0)
+	{
+		printf("%s:%d - invalid length %u, expected %lu\n", __FUNCTION__, __LINE__, (u_int32_t)mbuf_pkthdr_len(m), sizeof(req));
+		return EINVAL;
+	}
+
+	lck_mtx_lock(&state->mtx);
+	nstat_src	**srcpp = &state->srcs;
+	while (*srcpp != NULL)
+	{
+		int	gone = 0;
+
+		if (req.srcref == NSTAT_SRC_REF_ALL ||
+			(*srcpp)->srcref == req.srcref)
+		{
+			nstat_msg_src_counts	counts;
+			counts.hdr.type = NSTAT_MSG_TYPE_SRC_COUNTS;
+			counts.hdr.context = req.hdr.context;
+			counts.srcref = (*srcpp)->srcref;
+			bzero(&counts.counts, sizeof(counts.counts));
+			result = (*srcpp)->provider->nstat_counts((*srcpp)->cookie, &counts.counts, &gone);
+
+			if (result == 0)
+			{
+				result = ctl_enqueuedata(state->kctl, state->unit, &counts, sizeof(counts), CTL_DATA_EOR);
+				if (result != 0)
+				{
+					printf("%s:%d ctl_enqueuedata failed: %d\n", __FUNCTION__, __LINE__, result);
+				}
+			}
+			else
+			{
+				printf("%s:%d provider->nstat_counts failed: %d\n", __FUNCTION__, __LINE__, result);
+			}
+
+			if (gone)
+			{
+				// send one last descriptor message so client may see last state
+				nstat_control_send_description(state, *srcpp, 0ULL);
+
+				// pull src out of the list
+				nstat_src	*src = *srcpp;
+				*srcpp = src->next;
+
+				src->next = dead_srcs;
+				dead_srcs = src;
+			}
+
+			if (req.srcref != NSTAT_SRC_REF_ALL)
+				break;
+		}
+
+		if (!gone)
+			srcpp = &(*srcpp)->next;
+	}
+	lck_mtx_unlock(&state->mtx);
+
+	while (dead_srcs)
+	{
+		nstat_src	*src;
+
+		src = dead_srcs;
+		dead_srcs = src->next;
+
+		// release src and send notification
+		nstat_control_cleanup_source(state, src);
+	}
+
+	if (req.srcref == NSTAT_SRC_REF_ALL)
+	{
+		nstat_msg_hdr	success;
+		success.context = req.hdr.context;
+		success.type = NSTAT_MSG_TYPE_SUCCESS;
+		success.pad = 0;
+		if (ctl_enqueuedata(state->kctl, state->unit, &success, sizeof(success), CTL_DATA_EOR) != 0)
+			printf("%s:%d - failed to enqueue success message\n", __FUNCTION__, __LINE__);
+		result = 0;
+	}
+
+	return result;
+}
+
+static errno_t
+nstat_control_handle_get_src_description(
+	nstat_control_state	*state,
+	mbuf_t				m)
+{
+	nstat_msg_get_src_description	req;
+	if (mbuf_copydata(m, 0, sizeof(req), &req) != 0)
+	{
+		printf("%s:%d - invalid length %u, expected %lu\n", __FUNCTION__, __LINE__, (u_int32_t)mbuf_pkthdr_len(m), sizeof(req));
+		return EINVAL;
+	}
+
+	// Find the source
+	lck_mtx_lock(&state->mtx);
+	nstat_src	*src;
+	for (src = state->srcs; src; src = src->next)
+	{
+		if (src->srcref == req.srcref)
+			break;
+	}
+
+	// No source? Done.
+	if (!src)
+	{
+		lck_mtx_unlock(&state->mtx);
+		printf("%s:%d - no matching source\n", __FUNCTION__, __LINE__);
+		return ENOENT;
+	}
+
+	errno_t result = nstat_control_send_description(state, src, req.hdr.context);
+	lck_mtx_unlock(&state->mtx);
+
+	return result;
+}
+
+static errno_t
+nstat_control_send(
+	kern_ctl_ref	kctl,
+	u_int32_t		unit,
+	void			*uinfo,
+	mbuf_t			m,
+	__unused int	flags)
+{
+	nstat_control_state	*state = (nstat_control_state*)uinfo;
+	struct nstat_msg_hdr	*hdr;
+	struct nstat_msg_hdr	storage;
+	errno_t					result = 0;
+
+	if (mbuf_pkthdr_len(m) < sizeof(*hdr))
+	{
+		// Is this the right thing to do?
+ printf("%s:%d - message too short, was %ld expected %lu\n", __FUNCTION__, __LINE__, + mbuf_pkthdr_len(m), sizeof(*hdr)); + mbuf_freem(m); + return EINVAL; + } + + if (mbuf_len(m) >= sizeof(*hdr)) + { + hdr = mbuf_data(m); + } + else + { + mbuf_copydata(m, 0, sizeof(storage), &storage); + hdr = &storage; + } + + switch (hdr->type) + { + case NSTAT_MSG_TYPE_ADD_SRC: + result = nstat_control_handle_add_request(state, m); + break; + + case NSTAT_MSG_TYPE_ADD_ALL_SRCS: + result = nstat_control_handle_add_all(state, m); + break; + + case NSTAT_MSG_TYPE_REM_SRC: + result = nstat_control_handle_remove_request(state, m); + break; + + case NSTAT_MSG_TYPE_QUERY_SRC: + result = nstat_control_handle_query_request(state, m); + break; + + case NSTAT_MSG_TYPE_GET_SRC_DESC: + result = nstat_control_handle_get_src_description(state, m); + break; + + default: + printf("%s:%d - unknown message type %d\n", __FUNCTION__, __LINE__, hdr->type); + result = EINVAL; + break; + } + + if (result != 0) + { + struct nstat_msg_error err; + + err.hdr.type = NSTAT_MSG_TYPE_ERROR; + err.hdr.context = hdr->context; + err.error = result; + + result = ctl_enqueuedata(kctl, unit, &err, sizeof(err), CTL_DATA_EOR); + } + + mbuf_freem(m); + + return result; +} diff --git a/bsd/net/ntstat.h b/bsd/net/ntstat.h new file mode 100644 index 000000000..4bbb3dc1b --- /dev/null +++ b/bsd/net/ntstat.h @@ -0,0 +1,348 @@ +/* + * Copyright (c) 2010-2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#ifndef __NTSTAT_H__
+#define __NTSTAT_H__
+#include <netinet/in.h>
+
+#ifdef PRIVATE
+#pragma pack(push, 4)
+#pragma mark -- Common Data Structures --
+
+#define __NSTAT_REVISION__	1
+
+typedef u_int32_t	nstat_provider_id_t;
+typedef u_int32_t	nstat_src_ref_t;
+
+typedef struct nstat_counts
+{
+	/* Counters */
+	u_int64_t	nstat_rxpackets	__attribute__((aligned(8)));
+	u_int64_t	nstat_rxbytes	__attribute__((aligned(8)));
+	u_int64_t	nstat_txpackets	__attribute__((aligned(8)));
+	u_int64_t	nstat_txbytes	__attribute__((aligned(8)));
+
+	u_int32_t	nstat_rxduplicatebytes;
+	u_int32_t	nstat_rxoutoforderbytes;
+	u_int32_t	nstat_txretransmit;
+
+	u_int32_t	nstat_connectattempts;
+	u_int32_t	nstat_connectsuccesses;
+
+	u_int32_t	nstat_min_rtt;
+	u_int32_t	nstat_avg_rtt;
+	u_int32_t	nstat_var_rtt;
+} nstat_counts;
+
+#pragma mark -- Network Statistics Providers --
+
+enum
+{
+	NSTAT_PROVIDER_ROUTE	= 1
+	,NSTAT_PROVIDER_TCP		= 2
+	,NSTAT_PROVIDER_UDP		= 3
+};
+
+typedef struct nstat_route_add_param
+{
+	union
+	{
+		struct sockaddr_in	v4;
+		struct sockaddr_in6	v6;
+	} dst;
+	union
+	{
+		struct sockaddr_in	v4;
+		struct sockaddr_in6	v6;
+	} mask;
+	u_int32_t	ifindex;
+} nstat_route_add_param;
+
+typedef struct nstat_tcp_add_param
+{
+	union
+	{
+		struct sockaddr_in	v4;
+		struct sockaddr_in6	v6;
+	} local;
+	union
+	{
+		struct sockaddr_in	v4;
+		struct sockaddr_in6	v6;
+	} remote;
+} nstat_tcp_add_param;
+
+typedef struct nstat_tcp_descriptor
+{
+	union
+	{
+		struct sockaddr_in	v4;
+		struct sockaddr_in6	v6;
+	} local;
+
+	union
+	{
+		struct sockaddr_in	v4;
+		struct sockaddr_in6	v6;
+	} remote;
+
+	u_int32_t	ifindex;
+
+	u_int32_t	state;
+
+	u_int32_t	sndbufsize;
+	u_int32_t	sndbufused;
+	u_int32_t	rcvbufsize;
+	u_int32_t	rcvbufused;
+	u_int32_t	txunacked;
+	u_int32_t	txwindow;
+	u_int32_t	txcwindow;
+
+	u_int64_t	upid;
+	u_int32_t	pid;
+	char		pname[64];
+} nstat_tcp_descriptor;
+
+typedef struct nstat_tcp_add_param	nstat_udp_add_param;
+
+typedef struct nstat_udp_descriptor
+{
+	union
+	{
+		struct sockaddr_in	v4;
+		struct sockaddr_in6	v6;
+	} local;
+
+	union
+	{
+		struct sockaddr_in	v4;
+		struct sockaddr_in6	v6;
+	} remote;
+
+	u_int32_t	ifindex;
+
+	u_int32_t	rcvbufsize;
+	u_int32_t	rcvbufused;
+
+	u_int64_t	upid;
+	u_int32_t	pid;
+	char		pname[64];
+} nstat_udp_descriptor;
+
+typedef struct nstat_route_descriptor
+{
+	u_int64_t	id;
+	u_int64_t	parent_id;
+	u_int64_t	gateway_id;
+
+	union
+	{
+		struct sockaddr_in	v4;
+		struct sockaddr_in6	v6;
+		struct sockaddr		sa;
+	} dst;
+
+	union
+	{
+		struct sockaddr_in	v4;
+		struct sockaddr_in6	v6;
+		struct sockaddr		sa;
+	} mask;
+
+	union
+	{
+		struct sockaddr_in	v4;
+		struct sockaddr_in6	v6;
+		struct sockaddr		sa;
+	} gateway;
+
+	u_int32_t	ifindex;
+	u_int32_t	flags;
+
+} nstat_route_descriptor;
+
+#pragma mark -- Network Statistics User Client --
+
+#define NET_STAT_CONTROL_NAME	"com.apple.network.statistics"
+
+enum
+{
+	// generic response messages
+	NSTAT_MSG_TYPE_SUCCESS			= 0
+	,NSTAT_MSG_TYPE_ERROR			= 1
+
+	// Requests
+	,NSTAT_MSG_TYPE_ADD_SRC			= 1001
+	,NSTAT_MSG_TYPE_ADD_ALL_SRCS	= 1002
+	,NSTAT_MSG_TYPE_REM_SRC			= 1003
+	,NSTAT_MSG_TYPE_QUERY_SRC		= 1004
+	,NSTAT_MSG_TYPE_GET_SRC_DESC	= 1005
+
+	// Responses/Notifications
+	,NSTAT_MSG_TYPE_SRC_ADDED		= 10001
+	,NSTAT_MSG_TYPE_SRC_REMOVED		= 10002
+	,NSTAT_MSG_TYPE_SRC_DESC		= 10003
+	,NSTAT_MSG_TYPE_SRC_COUNTS		= 10004
+};
+
+enum
+{
+	NSTAT_SRC_REF_ALL		= 0xffffffff
+	,NSTAT_SRC_REF_INVALID	= 0
+};
+
+typedef struct nstat_msg_hdr
+{
+	u_int64_t	context;
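+	// (echoed back verbatim in replies and errors so callers can match
+	//  responses to their requests)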
u_int32_t type; + u_int32_t pad; // unused for now +} nstat_msg_hdr; + +typedef struct nstat_msg_error +{ + nstat_msg_hdr hdr; + u_int32_t error; // errno error +} nstat_msg_error; + +typedef struct nstat_msg_add_src +{ + nstat_msg_hdr hdr; + nstat_provider_id_t provider; + u_int8_t param[]; +} nstat_msg_add_src_req; + +typedef struct nstat_msg_add_all_srcs +{ + nstat_msg_hdr hdr; + nstat_provider_id_t provider; +} nstat_msg_add_all_srcs; + +typedef struct nstat_msg_src_added +{ + nstat_msg_hdr hdr; + nstat_provider_id_t provider; + nstat_src_ref_t srcref; +} nstat_msg_src_added; + +typedef struct nstat_msg_rem_src +{ + nstat_msg_hdr hdr; + nstat_src_ref_t srcref; +} nstat_msg_rem_src_req; + +typedef struct nstat_msg_get_src_description +{ + nstat_msg_hdr hdr; + nstat_src_ref_t srcref; +} nstat_msg_get_src_description; + +typedef struct nstat_msg_src_description +{ + nstat_msg_hdr hdr; + nstat_src_ref_t srcref; + nstat_provider_id_t provider; + u_int8_t data[]; +} nstat_msg_src_description; + +typedef struct nstat_msg_query_src +{ + nstat_msg_hdr hdr; + nstat_src_ref_t srcref; +} nstat_msg_query_src_req; + +typedef struct nstat_msg_src_counts +{ + nstat_msg_hdr hdr; + nstat_src_ref_t srcref; + nstat_counts counts; +} nstat_msg_src_counts; + +typedef struct nstat_msg_src_removed +{ + nstat_msg_hdr hdr; + nstat_src_ref_t srcref; +} nstat_msg_src_removed; + +#pragma pack(pop) + +#endif /* PRIVATE */ + +#ifdef XNU_KERNEL_PRIVATE +#include <sys/mcache.h> + +#pragma mark -- Generic Network Statistics Provider -- + +typedef void * nstat_provider_cookie_t; + +#pragma mark -- Route Statistics Gathering Functions -- +struct rtentry; + +enum +{ + NSTAT_TX_FLAG_RETRANSMIT = 1 +}; + +enum +{ + NSTAT_RX_FLAG_DUPLICATE = 1, + NSTAT_RX_FLAG_OUT_OF_ORDER = 2 +}; + +// indicates whether or not collection of statistics is enabled +extern int nstat_collect; + +// Route collection routines +void nstat_route_connect_attempt(struct rtentry *rte); +void nstat_route_connect_success(struct rtentry *rte); +void nstat_route_tx(struct rtentry *rte, u_int32_t packets, u_int32_t bytes, u_int32_t flags); +void nstat_route_rx(struct rtentry *rte, u_int32_t packets, u_int32_t bytes, u_int32_t flags); +void nstat_route_rtt(struct rtentry *rte, u_int32_t rtt, u_int32_t rtt_var); +void nstat_route_detach(struct rtentry *rte); + +// watcher support +struct inpcb; +void nstat_tcp_new_pcb(struct inpcb *inp); +void nstat_udp_new_pcb(struct inpcb *inp); +void nstat_route_new_entry(struct rtentry *rt); + +// locked_add_64 uses atomic operations on 32bit so the 64bit +// value can be properly read. The values are only ever incremented +// while under the socket lock, so on 64bit we don't actually need +// atomic operations to increment. +#if defined(__LP64__) +#define locked_add_64(__addr, __count) do { \ + *(__addr) += (__count); \ +} while (0) +#else +#define locked_add_64(__addr, __count) do { \ + atomic_add_64((__addr), (__count)); \ +} while (0) +#endif + +#endif /* XNU_KERNEL_PRIVATE */ + +#endif /* __NTSTAT_H__ */ diff --git a/bsd/net/pf.c b/bsd/net/pf.c index 47134bda3..62b39f2ad 100644 --- a/bsd/net/pf.c +++ b/bsd/net/pf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2009 Apple Inc. All rights reserved. + * Copyright (c) 2007-2011 Apple Inc. All rights reserved. 
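The ntstat.h interface above is a self-contained request/response protocol spoken over a kernel control socket. The following is a minimal userland sketch of how a client could drive it, assuming the usual kern_control lookup-and-connect sequence; it is an illustration against the structures above, not shipping Apple code, and error handling is mostly elided:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/sys_domain.h>
#include <sys/kern_control.h>
#include "ntstat.h"    /* hypothetical include path for the header above */

int
main(void)
{
    struct ctl_info info;
    struct sockaddr_ctl sc;
    struct nstat_msg_add_all_srcs req;
    int fd;

    fd = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL);
    if (fd < 0)
        return (1);

    /* resolve "com.apple.network.statistics" to a control ID */
    memset(&info, 0, sizeof (info));
    strlcpy(info.ctl_name, NET_STAT_CONTROL_NAME, sizeof (info.ctl_name));
    if (ioctl(fd, CTLIOCGINFO, &info) == -1)
        return (1);

    memset(&sc, 0, sizeof (sc));
    sc.sc_len = sizeof (sc);
    sc.sc_family = AF_SYSTEM;
    sc.ss_sysaddr = AF_SYS_CONTROL;
    sc.sc_id = info.ctl_id;
    sc.sc_unit = 0;        /* let the kernel pick a unit */
    if (connect(fd, (struct sockaddr *)&sc, sizeof (sc)) == -1)
        return (1);

    /* watch every TCP source; the kernel answers with SRC_ADDED messages */
    memset(&req, 0, sizeof (req));
    req.hdr.type = NSTAT_MSG_TYPE_ADD_ALL_SRCS;
    req.hdr.context = 1;    /* echoed back in nstat_msg_error on failure */
    req.provider = NSTAT_PROVIDER_TCP;
    if (send(fd, &req, sizeof (req), 0) != sizeof (req))
        return (1);

    close(fd);
    return (0);
}

Replies such as nstat_msg_src_added and nstat_msg_src_counts then arrive as discrete datagrams on the same descriptor and can be read with recv(2), dispatching on hdr.type just as nstat_control_send() does for requests.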
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -379,6 +379,7 @@ pf_lazy_makewritable(struct pf_pdesc *pd, struct mbuf *m, int len)
 	pd->lmw = len;
 	if (len >= 0 && m != pd->mp) {
 		pd->mp = m;
+		pd->pf_mtag = pf_find_mtag(m);
 		switch (pd->af) {
 		case AF_INET: {
@@ -2356,18 +2357,42 @@ pf_change_ap(int dir, struct mbuf *m, struct pf_addr *a, u_int16_t *p,
 #endif /* INET */
 #if INET6
	case AF_INET6:
-		*pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
-		    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
-		    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc,
-		    ao.addr16[0], an->addr16[0], u),
-		    ao.addr16[1], an->addr16[1], u),
-		    ao.addr16[2], an->addr16[2], u),
-		    ao.addr16[3], an->addr16[3], u),
-		    ao.addr16[4], an->addr16[4], u),
-		    ao.addr16[5], an->addr16[5], u),
-		    ao.addr16[6], an->addr16[6], u),
-		    ao.addr16[7], an->addr16[7], u),
-		    po, pn, u);
+		/*
+		 * If the packet originates from an ALG on the NAT gateway
+		 * (source address is loopback or local), the TCP/UDP
+		 * checksum field contains the pseudo-header checksum,
+		 * which is not yet complemented.
+		 */
+		if (dir == PF_OUT && m != NULL &&
+		    (m->m_flags & M_PKTHDR) &&
+		    (m->m_pkthdr.csum_flags & (CSUM_TCPIPV6 | CSUM_UDPIPV6))) {
+			/* Pseudo-header checksum does not include ports */
+			*pc = ~pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
+			    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
+			    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(~*pc,
+			    ao.addr16[0], an->addr16[0], u),
+			    ao.addr16[1], an->addr16[1], u),
+			    ao.addr16[2], an->addr16[2], u),
+			    ao.addr16[3], an->addr16[3], u),
+			    ao.addr16[4], an->addr16[4], u),
+			    ao.addr16[5], an->addr16[5], u),
+			    ao.addr16[6], an->addr16[6], u),
+			    ao.addr16[7], an->addr16[7], u),
+			    po, pn, u);
+		} else {
+			*pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
+			    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
+			    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc,
+			    ao.addr16[0], an->addr16[0], u),
+			    ao.addr16[1], an->addr16[1], u),
+			    ao.addr16[2], an->addr16[2], u),
+			    ao.addr16[3], an->addr16[3], u),
+			    ao.addr16[4], an->addr16[4], u),
+			    ao.addr16[5], an->addr16[5], u),
+			    ao.addr16[6], an->addr16[6], u),
+			    ao.addr16[7], an->addr16[7], u),
+			    po, pn, u);
+		}
 		break;
 #endif /* INET6 */
 	}
@@ -2721,7 +2746,7 @@ pf_send_tcp(const struct pf_rule *r, sa_family_t af,
 		h6->ip6_hlim = IPV6_DEFHLIM;
 
 		bzero(&ro6, sizeof (ro6));
-		ip6_output(m, NULL, &ro6, 0, NULL, NULL, 0);
+		ip6_output(m, NULL, &ro6, 0, NULL, NULL, NULL);
 		if (ro6.ro_rt != NULL)
 			rtfree(ro6.ro_rt);
 		break;
@@ -3959,8 +3984,8 @@ pf_socket_lookup(int direction, struct pf_pdesc *pd)
 {
 	struct pf_addr *saddr, *daddr;
 	u_int16_t sport, dport;
-	struct inpcbinfo *pi;
-	struct inpcb *inp = NULL;
+	struct inpcbinfo *pi;
+	int inp = 0;
 
 	if (pd == NULL)
 		return (-1);
@@ -4001,10 +4026,10 @@ pf_socket_lookup(int direction, struct pf_pdesc *pd)
 	switch (pd->af) {
 #if INET
 	case AF_INET:
-		inp = in_pcblookup_hash(pi, saddr->v4, sport, daddr->v4, dport,
-		    0, NULL);
+		inp = in_pcblookup_hash_exists(pi, saddr->v4, sport, daddr->v4, dport,
+		    0, &pd->lookup.uid, &pd->lookup.gid, NULL);
 #if INET6
-		if (inp == NULL) {
+		if (inp == 0) {
 			struct in6_addr s6, d6;
 
 			memset(&s6, 0, sizeof (s6));
@@ -4017,25 +4042,26 @@ pf_socket_lookup(int direction, struct pf_pdesc *pd)
 			memcpy(&d6.s6_addr32[3], &daddr->v4,
 			    sizeof (daddr->v4));
 
-			inp = in6_pcblookup_hash(pi, &s6, sport,
-			    &d6, dport, 0, NULL);
-			if (inp == NULL) {
-				inp = in_pcblookup_hash(pi, saddr->v4, sport,
-				    daddr->v4, dport, INPLOOKUP_WILDCARD, NULL);
-				if (inp == NULL) {
-					inp = in6_pcblookup_hash(pi, &s6, sport,
+			inp =
in6_pcblookup_hash_exists(pi, &s6, sport, + &d6, dport, 0, &pd->lookup.uid, &pd->lookup.gid, NULL); + if (inp == 0) { + inp = in_pcblookup_hash_exists(pi, saddr->v4, sport, + daddr->v4, dport, INPLOOKUP_WILDCARD, &pd->lookup.uid, &pd->lookup.gid, NULL); + if (inp == 0) { + inp = in6_pcblookup_hash_exists(pi, &s6, sport, &d6, dport, INPLOOKUP_WILDCARD, - NULL); - if (inp == NULL) + &pd->lookup.uid, &pd->lookup.gid, NULL); + if (inp == 0) return (-1); } } } #else - if (inp == NULL) { - inp = in_pcblookup_hash(pi, saddr->v4, sport, - daddr->v4, dport, INPLOOKUP_WILDCARD, NULL); - if (inp == NULL) + if (inp == 0) { + inp = in_pcblookup_hash_exists(pi, saddr->v4, sport, + daddr->v4, dport, INPLOOKUP_WILDCARD, + &pd->lookup.uid, &pd->lookup.gid, NULL); + if (inp == 0) return (-1); } #endif /* !INET6 */ @@ -4043,24 +4069,22 @@ pf_socket_lookup(int direction, struct pf_pdesc *pd) #endif /* INET */ #if INET6 case AF_INET6: - inp = in6_pcblookup_hash(pi, &saddr->v6, sport, &daddr->v6, - dport, 0, NULL); - if (inp == NULL) { - inp = in6_pcblookup_hash(pi, &saddr->v6, sport, - &daddr->v6, dport, INPLOOKUP_WILDCARD, NULL); - if (inp == NULL) + inp = in6_pcblookup_hash_exists(pi, &saddr->v6, sport, &daddr->v6, + dport, 0, &pd->lookup.uid, &pd->lookup.gid, NULL); + if (inp == 0) { + inp = in6_pcblookup_hash_exists(pi, &saddr->v6, sport, + &daddr->v6, dport, INPLOOKUP_WILDCARD, + &pd->lookup.uid, &pd->lookup.gid, NULL); + if (inp == 0) return (-1); } break; #endif /* INET6 */ - + default: return (-1); } - if (inp != NULL) - in_pcb_checkstate(inp, WNT_RELEASE, 0); - return (1); } @@ -8162,10 +8186,12 @@ pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, } ifp = ro->ro_rt->rt_ifp; + RT_LOCK(ro->ro_rt); ro->ro_rt->rt_use++; if (ro->ro_rt->rt_flags & RTF_GATEWAY) dst = satosin(ro->ro_rt->rt_gateway); + RT_UNLOCK(ro->ro_rt); } else { if (TAILQ_EMPTY(&r->rpool.list)) { DPFPRINTF(PF_DEBUG_URGENT, @@ -8277,7 +8303,14 @@ pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, } m1 = m0; + + /* PR-8933605: send ip_len,ip_off to ip_fragment in host byte order */ +#if BYTE_ORDER != BIG_ENDIAN + NTOHS(ip->ip_off); + NTOHS(ip->ip_len); +#endif error = ip_fragment(m0, ifp, ifp->if_mtu, sw_csum); + if (error) { m0 = NULL; goto bad; @@ -8365,7 +8398,7 @@ pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, if ((pf_mtag = pf_get_mtag(m0)) == NULL) goto bad; pf_mtag->flags |= PF_TAG_GENERATED; - ip6_output(m0, NULL, NULL, 0, NULL, NULL, 0); + ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); return; } @@ -8410,7 +8443,7 @@ pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp, if (IN6_IS_SCOPE_EMBED(&dst->sin6_addr)) dst->sin6_addr.s6_addr16[1] = htons(ifp->if_index); if ((unsigned)m0->m_pkthdr.len <= ifp->if_mtu) { - error = nd6_output(ifp, ifp, m0, dst, NULL, 0); + error = nd6_output(ifp, ifp, m0, dst, NULL); } else { in6_ifstat_inc(ifp, ifs6_in_toobig); if (r->rt != PF_DUPTO) @@ -9536,6 +9569,7 @@ pool_init(struct pool *pp, size_t size, unsigned int align, unsigned int ioff, pp->pool_zone = zinit(size, 1024 * size, PAGE_SIZE, wchan); if (pp->pool_zone != NULL) { zone_change(pp->pool_zone, Z_EXPAND, TRUE); + zone_change(pp->pool_zone, Z_CALLERACCT, FALSE); pp->pool_hiwat = pp->pool_limit = (unsigned int)-1; pp->pool_name = wchan; } @@ -9622,8 +9656,8 @@ pf_get_mtag(struct mbuf *m) if ((mtag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_PF, NULL)) == NULL) { - mtag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_PF, - sizeof 
(struct pf_mtag), M_NOWAIT); + mtag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_PF, + sizeof (struct pf_mtag), M_NOWAIT, m); if (mtag == NULL) return (NULL); bzero(mtag + 1, sizeof (struct pf_mtag)); diff --git a/bsd/net/pf_if.c b/bsd/net/pf_if.c index 06873fce6..4c05205ba 100644 --- a/bsd/net/pf_if.c +++ b/bsd/net/pf_if.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2009 Apple Inc. All rights reserved. + * Copyright (c) 2007-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -436,28 +436,45 @@ pfi_instance_add(struct ifnet *ifp, int net, int flags) return; ifnet_lock_shared(ifp); TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { - if (ia->ifa_addr == NULL) + IFA_LOCK(ia); + if (ia->ifa_addr == NULL) { + IFA_UNLOCK(ia); continue; + } af = ia->ifa_addr->sa_family; - if (af != AF_INET && af != AF_INET6) + if (af != AF_INET && af != AF_INET6) { + IFA_UNLOCK(ia); continue; - if ((flags & PFI_AFLAG_BROADCAST) && af == AF_INET6) + } + if ((flags & PFI_AFLAG_BROADCAST) && af == AF_INET6) { + IFA_UNLOCK(ia); continue; + } if ((flags & PFI_AFLAG_BROADCAST) && - !(ifp->if_flags & IFF_BROADCAST)) + !(ifp->if_flags & IFF_BROADCAST)) { + IFA_UNLOCK(ia); continue; + } if ((flags & PFI_AFLAG_PEER) && - !(ifp->if_flags & IFF_POINTOPOINT)) + !(ifp->if_flags & IFF_POINTOPOINT)) { + IFA_UNLOCK(ia); continue; + } if ((flags & PFI_AFLAG_NETWORK) && af == AF_INET6 && IN6_IS_ADDR_LINKLOCAL( - &((struct sockaddr_in6 *)ia->ifa_addr)->sin6_addr)) + &((struct sockaddr_in6 *)ia->ifa_addr)->sin6_addr)) { + IFA_UNLOCK(ia); continue; + } if (flags & PFI_AFLAG_NOALIAS) { - if (af == AF_INET && got4) + if (af == AF_INET && got4) { + IFA_UNLOCK(ia); continue; - if (af == AF_INET6 && got6) + } + if (af == AF_INET6 && got6) { + IFA_UNLOCK(ia); continue; + } } if (af == AF_INET) got4 = 1; @@ -480,6 +497,7 @@ pfi_instance_add(struct ifnet *ifp, int net, int flags) pfi_address_add(ia->ifa_dstaddr, af, net2); else pfi_address_add(ia->ifa_addr, af, net2); + IFA_UNLOCK(ia); } ifnet_lock_done(ifp); } diff --git a/bsd/net/pf_ioctl.c b/bsd/net/pf_ioctl.c index 9165abfd4..25763d8f5 100644 --- a/bsd/net/pf_ioctl.c +++ b/bsd/net/pf_ioctl.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2009 Apple Inc. All rights reserved. + * Copyright (c) 2007-2011 Apple Inc. All rights reserved. 
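Note the switch from m_tag_alloc() to m_tag_create() in pf_get_mtag() just above: the extra mbuf argument lets the allocator place the tag in unused space inside the mbuf itself when it fits, avoiding a separate allocation on a hot path. A sketch of the resulting lookup-or-create pattern, assuming the surrounding pf_get_mtag() logic (including the m_tag_prepend() call elided from this hunk) stays as in stock xnu:

/*
 * Sketch of the lookup-or-create idiom used by pf_get_mtag();
 * an illustration of the KPI shape, not a drop-in replacement.
 */
static struct pf_mtag *
get_tag(struct mbuf *m)
{
	struct m_tag *mtag;

	mtag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_PF, NULL);
	if (mtag == NULL) {
		/* m_tag_create() may embed the tag storage in the mbuf */
		mtag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_PF,
		    sizeof (struct pf_mtag), M_NOWAIT, m);
		if (mtag == NULL)
			return (NULL);
		/* payload immediately follows the m_tag header */
		bzero(mtag + 1, sizeof (struct pf_mtag));
		m_tag_prepend(m, mtag);
	}
	return ((struct pf_mtag *)(mtag + 1));
}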
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,6 +79,7 @@ #include <sys/kauth.h> #include <sys/conf.h> #include <sys/mcache.h> +#include <sys/queue.h> #include <mach/vm_param.h> @@ -134,6 +135,8 @@ static int pf_rollback_altq(u_int32_t); static int pf_commit_altq(u_int32_t); static int pf_enable_altq(struct pf_altq *); static int pf_disable_altq(struct pf_altq *); +static void pf_altq_copyin(struct pf_altq *, struct pf_altq *); +static void pf_altq_copyout(struct pf_altq *, struct pf_altq *); #endif /* ALTQ */ static int pf_begin_rules(u_int32_t *, int, const char *); static int pf_rollback_rules(u_int32_t, int, char *); @@ -145,10 +148,14 @@ static void pf_hash_rule_addr(MD5_CTX *, struct pf_rule_addr *, u_int8_t); static void pf_hash_rule_addr(MD5_CTX *, struct pf_rule_addr *); #endif static int pf_commit_rules(u_int32_t, int, char *); +static void pf_rule_copyin(struct pf_rule *, struct pf_rule *, struct proc *); +static void pf_rule_copyout(struct pf_rule *, struct pf_rule *); static void pf_state_export(struct pfsync_state *, struct pf_state_key *, struct pf_state *); static void pf_state_import(struct pfsync_state *, struct pf_state_key *, struct pf_state *); +static void pf_pooladdr_copyin(struct pf_pooladdr *, struct pf_pooladdr *); +static void pf_pooladdr_copyout(struct pf_pooladdr *, struct pf_pooladdr *); #define PF_CDEV_MAJOR (-1) @@ -180,7 +187,16 @@ static void pf_detach_hooks(void); * and used in pf_af_hook() for performance optimization, such that packets * will enter pf_test() or pf_test6() only when PF is running. */ -static int pf_is_enabled; +int pf_is_enabled = 0; + +/* + * These are the pf enabled reference counting variables + */ +static u_int64_t pf_enabled_ref_count; +static u_int32_t nr_tokens = 0; + +SLIST_HEAD(list_head, pfioc_kernel_token); +static struct list_head token_list_head; struct pf_rule pf_default_rule; #if ALTQ @@ -230,6 +246,78 @@ struct thread *pf_purge_thread; extern void pfi_kifaddr_update(void *); +/* pf enable ref-counting helper functions */ +static u_int64_t generate_token(void); +static int remove_token(struct pfioc_remove_token *); +static void invalidate_all_tokens(void); + +static u_int64_t +generate_token(void) +{ + u_int64_t token_value; + struct pfioc_kernel_token *new_token; + + new_token = _MALLOC(sizeof (struct pfioc_kernel_token), M_TEMP, M_WAITOK|M_ZERO); + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + if (new_token == NULL) { + /* malloc failed! bail! 
*/ + printf("%s: unable to allocate pf token structure!", __func__); + return 0; + } + + token_value = (u_int64_t)(uintptr_t)new_token; + + new_token->token.token_value = token_value; + new_token->token.pid = proc_pid(current_proc()); + proc_name(new_token->token.pid, new_token->token.proc_name, + sizeof (new_token->token.proc_name)); + new_token->token.timestamp = pf_calendar_time_second(); + + SLIST_INSERT_HEAD(&token_list_head, new_token, next); + nr_tokens++; + + return token_value; +} + +static int +remove_token(struct pfioc_remove_token *tok) +{ + struct pfioc_kernel_token *entry, *tmp; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + SLIST_FOREACH_SAFE(entry, &token_list_head, next, tmp) { + if (tok->token_value == entry->token.token_value) { + SLIST_REMOVE(&token_list_head, entry, pfioc_kernel_token, next); + _FREE(entry, M_TEMP); + nr_tokens--; + return 0; /* success */ + } + } + + printf("pf : remove failure\n"); + return ESRCH; /* failure */ +} + +static void +invalidate_all_tokens(void) +{ + struct pfioc_kernel_token *entry, *tmp; + + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + SLIST_FOREACH_SAFE(entry, &token_list_head, next, tmp) { + SLIST_REMOVE(&token_list_head, entry, pfioc_kernel_token, next); + _FREE(entry, M_TEMP); + } + + nr_tokens = 0; + + return; +} + void pfinit(void) { @@ -859,6 +947,27 @@ pf_disable_altq(struct pf_altq *altq) return (error); } + +static void +pf_altq_copyin(struct pf_altq *src, struct pf_altq *dst) +{ + bcopy(src, dst, sizeof (struct pf_altq)); + + dst->ifname[sizeof (dst->ifname) - 1] = '\0'; + dst->qname[sizeof (dst->qname) - 1] = '\0'; + dst->parent[sizeof (dst->parent) - 1] = '\0'; + dst->altq_disc = NULL; + TAILQ_INIT(&dst->entries); +} + +static void +pf_altq_copyout(struct pf_altq *src, struct pf_altq *dst) +{ + bcopy(src, dst, sizeof (struct pf_altq)); + + dst->altq_disc = NULL; + TAILQ_INIT(&dst->entries); +} #endif /* ALTQ */ static int @@ -951,7 +1060,7 @@ pf_hash_rule_addr(MD5_CTX *ctx, struct pf_rule_addr *pfr) PF_MD5_UPD(pfr, xport.range.port[0]); PF_MD5_UPD(pfr, xport.range.port[1]); PF_MD5_UPD(pfr, xport.range.op); - break; + break; default: break; @@ -1067,6 +1176,53 @@ pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor) return (0); } +static void +pf_rule_copyin(struct pf_rule *src, struct pf_rule *dst, struct proc *p) +{ + bcopy(src, dst, sizeof (struct pf_rule)); + + dst->label[sizeof (dst->label) - 1] = '\0'; + dst->ifname[sizeof (dst->ifname) - 1] = '\0'; + dst->qname[sizeof (dst->qname) - 1] = '\0'; + dst->pqname[sizeof (dst->pqname) - 1] = '\0'; + dst->tagname[sizeof (dst->tagname) - 1] = '\0'; + dst->match_tagname[sizeof (dst->match_tagname) - 1] = '\0'; + dst->overload_tblname[sizeof (dst->overload_tblname) - 1] = '\0'; + + dst->cuid = kauth_cred_getuid(p->p_ucred); + dst->cpid = p->p_pid; + + dst->anchor = NULL; + dst->kif = NULL; + dst->overload_tbl = NULL; + + TAILQ_INIT(&dst->rpool.list); + dst->rpool.cur = NULL; + + /* initialize refcounting */ + dst->states = 0; + dst->src_nodes = 0; + + dst->entries.tqe_prev = NULL; + dst->entries.tqe_next = NULL; +} + +static void +pf_rule_copyout(struct pf_rule *src, struct pf_rule *dst) +{ + bcopy(src, dst, sizeof (struct pf_rule)); + + dst->anchor = NULL; + dst->kif = NULL; + dst->overload_tbl = NULL; + + TAILQ_INIT(&dst->rpool.list); + dst->rpool.cur = NULL; + + dst->entries.tqe_prev = NULL; + dst->entries.tqe_next = NULL; +} + static void pf_state_export(struct pfsync_state *sp, struct pf_state_key *sk, struct pf_state *s) @@ -1176,6 +1332,27 @@ 
pf_state_import(struct pfsync_state *sp, struct pf_state_key *sk, s->bytes[0] = s->bytes[1] = 0; } +static void +pf_pooladdr_copyin(struct pf_pooladdr *src, struct pf_pooladdr *dst) +{ + bcopy(src, dst, sizeof (struct pf_pooladdr)); + + dst->entries.tqe_prev = NULL; + dst->entries.tqe_next = NULL; + dst->ifname[sizeof (dst->ifname) - 1] = '\0'; + dst->kif = NULL; +} + +static void +pf_pooladdr_copyout(struct pf_pooladdr *src, struct pf_pooladdr *dst) +{ + bcopy(src, dst, sizeof (struct pf_pooladdr)); + + dst->entries.tqe_prev = NULL; + dst->entries.tqe_next = NULL; + dst->kif = NULL; +} + static int pf_setup_pfsync_matching(struct pf_ruleset *rs) { @@ -1216,6 +1393,38 @@ pf_setup_pfsync_matching(struct pf_ruleset *rs) return (0); } +static void +pf_start(void) +{ + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + VERIFY(pf_is_enabled == 0); + + pf_is_enabled = 1; + pf_status.running = 1; + pf_status.since = pf_calendar_time_second(); + if (pf_status.stateid == 0) { + pf_status.stateid = pf_time_second(); + pf_status.stateid = pf_status.stateid << 32; + } + wakeup(pf_purge_thread_fn); + DPFPRINTF(PF_DEBUG_MISC, ("pf: started\n")); +} + +static void +pf_stop(void) +{ + lck_mtx_assert(pf_lock, LCK_MTX_ASSERT_OWNED); + + VERIFY(pf_is_enabled); + + pf_status.running = 0; + pf_is_enabled = 0; + pf_status.since = pf_calendar_time_second(); + wakeup(pf_purge_thread_fn); + DPFPRINTF(PF_DEBUG_MISC, ("pf: stopped\n")); +} + static int pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) { @@ -1282,7 +1491,10 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) if (!(flags & FWRITE)) switch (cmd) { case DIOCSTART: + case DIOCSTARTREF: case DIOCSTOP: + case DIOCSTOPREF: + case DIOCGETSTARTERS: case DIOCGETRULES: case DIOCGETADDRS: case DIOCGETADDR: @@ -1316,7 +1528,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) case DIOCRSETADDRS: case DIOCRSETTFLAGS: if (((struct pfioc_table *)addr)->pfrio_flags & - PFR_FLAG_DUMMY) { + PFR_FLAG_DUMMY) { flags |= FWRITE; /* need write lock for dummy */ break; /* dummy operation ok */ } @@ -1341,20 +1553,41 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) case DIOCSTART: if (pf_status.running) { + /* + * Increment the reference for a simple -e enable, so + * that even if other processes drop their references, + * pf will still be available to processes that turned + * it on without taking a reference + */ + if (nr_tokens == pf_enabled_ref_count) { + pf_enabled_ref_count++; + VERIFY(pf_enabled_ref_count != 0); + } error = EEXIST; } else if (pf_purge_thread == NULL) { error = ENOMEM; } else { - pf_is_enabled = 1; - pf_status.running = 1; - pf_status.since = pf_calendar_time_second(); - if (pf_status.stateid == 0) { - pf_status.stateid = pf_time_second(); - pf_status.stateid = pf_status.stateid << 32; + pf_start(); + pf_enabled_ref_count++; + VERIFY(pf_enabled_ref_count != 0); + } + break; + + case DIOCSTARTREF: /* returns a token */ + if (pf_purge_thread == NULL) { + error = ENOMEM; + } else { + if ((*(u_int64_t *)addr = generate_token()) != 0) { + if (pf_is_enabled == 0) { + pf_start(); + } + pf_enabled_ref_count++; + VERIFY(pf_enabled_ref_count != 0); + } else { + error = ENOMEM; + DPFPRINTF(PF_DEBUG_URGENT, + ("pf: unable to generate token\n")); } - mbuf_growth_aggressive(); - wakeup(pf_purge_thread_fn); - DPFPRINTF(PF_DEBUG_MISC, ("pf: started\n")); } break; @@ -1362,23 +1595,102 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) if 
(!pf_status.running) { error = ENOENT; } else { - mbuf_growth_normal(); - pf_status.running = 0; - pf_is_enabled = 0; - pf_status.since = pf_calendar_time_second(); - wakeup(pf_purge_thread_fn); - DPFPRINTF(PF_DEBUG_MISC, ("pf: stopped\n")); + pf_stop(); + pf_enabled_ref_count = 0; + invalidate_all_tokens(); } break; + case DIOCSTOPREF: + if (!pf_status.running) { + error = ENOENT; + } else { + if ((error = remove_token( + (struct pfioc_remove_token*)addr))==0) { + VERIFY(pf_enabled_ref_count != 0); + pf_enabled_ref_count--; + // return currently held references + ((struct pfioc_remove_token *)addr)->refcount + = pf_enabled_ref_count; + DPFPRINTF(PF_DEBUG_MISC, + ("pf: enabled refcount decremented\n")); + } else { + error = EINVAL; + DPFPRINTF(PF_DEBUG_URGENT, + ("pf: token mismatch\n")); + break; + } + + if (pf_enabled_ref_count == 0) + pf_stop(); + } + break; + + case DIOCGETSTARTERS: { + struct pfioc_tokens *g_token = (struct pfioc_tokens *)addr; + struct pfioc_token *tokens; + struct pfioc_kernel_token *entry, *tmp; + user_addr_t token_buf; + int g_token_size_copy; + char *ptr; + + if (nr_tokens == 0) { + error = ENOENT; + break; + } + + g_token_size_copy = g_token->size; + + if (g_token->size == 0) { + g_token->size = sizeof (struct pfioc_token) * nr_tokens; + break; + } + + token_buf = PF_USER_ADDR(addr, pfioc_tokens, pgt_buf); + tokens = _MALLOC(sizeof(struct pfioc_token) * nr_tokens, + M_TEMP, M_WAITOK); + + if (tokens == NULL) { + error = ENOMEM; + break; + } + + ptr = (void *)tokens; + SLIST_FOREACH_SAFE(entry, &token_list_head, next, tmp) { + if ((unsigned)g_token_size_copy + < sizeof(struct pfioc_token)) + break; /* no more buffer space left */ + + ((struct pfioc_token *)(ptr))->token_value = entry->token.token_value; + ((struct pfioc_token *)(ptr))->timestamp = entry->token.timestamp; + ((struct pfioc_token *)(ptr))->pid = entry->token.pid; + memcpy(((struct pfioc_token *)(ptr))->proc_name, entry->token.proc_name, + PFTOK_PROCNAME_LEN); + ptr += sizeof(struct pfioc_token); + + g_token_size_copy -= sizeof(struct pfioc_token); + } + + if (g_token_size_copy < g_token->size) { + error = copyout(tokens, token_buf, + g_token->size - g_token_size_copy); + } + + g_token->size -= g_token_size_copy; + _FREE(tokens, M_TEMP); + + break; + } + case DIOCADDRULE: { struct pfioc_rule *pr = (struct pfioc_rule *)addr; struct pf_ruleset *ruleset; struct pf_rule *rule, *tail; struct pf_pooladdr *apa; - int rs_num; + int rs_num; - pr->anchor[sizeof (pr->anchor) - 1] = 0; + pr->anchor[sizeof (pr->anchor) - 1] = '\0'; + pr->anchor_call[sizeof (pr->anchor_call) - 1] = '\0'; ruleset = pf_find_ruleset(pr->anchor); if (ruleset == NULL) { error = EINVAL; @@ -1406,16 +1718,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENOMEM; break; } - bcopy(&pr->rule, rule, sizeof (struct pf_rule)); - rule->cuid = kauth_cred_getuid(p->p_ucred); - rule->cpid = p->p_pid; - rule->anchor = NULL; - rule->kif = NULL; - TAILQ_INIT(&rule->rpool.list); - /* initialize refcounting */ - rule->states = 0; - rule->src_nodes = 0; - rule->entries.tqe_prev = NULL; + pf_rule_copyin(&pr->rule, rule, p); #if !INET if (rule->af == AF_INET) { pool_put(&pf_rule_pl, rule); @@ -1526,7 +1829,8 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) struct pf_rule *tail; int rs_num; - pr->anchor[sizeof (pr->anchor) - 1] = 0; + pr->anchor[sizeof (pr->anchor) - 1] = '\0'; + pr->anchor_call[sizeof (pr->anchor_call) - 1] = '\0'; ruleset = pf_find_ruleset(pr->anchor); if (ruleset == NULL) { 
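The three ioctls above form a small reference-counting protocol: DIOCSTARTREF enables pf if necessary and hands back a token, DIOCSTOPREF returns the token and stops pf once the count drains to zero, and DIOCGETSTARTERS enumerates the current holders using the common two-pass sizing convention (a call with size 0 queries the required buffer size). A hypothetical userland caller, sketched against the definitions added to pfvar.h further below, with error checks elided:

#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <net/pfvar.h>    /* DIOCSTARTREF and friends; PRIVATE header */

static void
pf_enable_with_token(void)
{
	int fd = open("/dev/pf", O_RDWR);
	u_int64_t token = 0;
	struct pfioc_remove_token rt;
	struct pfioc_tokens gt;

	/* enable pf (if not already running) and obtain a reference token */
	ioctl(fd, DIOCSTARTREF, &token);

	/* two-pass read of the current starters: size query, then data */
	memset(&gt, 0, sizeof (gt));
	ioctl(fd, DIOCGETSTARTERS, &gt);    /* gt.size = bytes needed */
	gt.pgt_buf = malloc(gt.size);
	ioctl(fd, DIOCGETSTARTERS, &gt);    /* fills pgt_buf, adjusts size */

	/* ... later: drop the reference; pf stops when the count hits 0 */
	memset(&rt, 0, sizeof (rt));
	rt.token_value = token;
	ioctl(fd, DIOCSTOPREF, &rt);        /* rt.refcount = holders left */

	free(gt.pgt_buf);
	close(fd);
}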
error = EINVAL; @@ -1553,7 +1857,8 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) struct pf_rule *rule; int rs_num, i; - pr->anchor[sizeof (pr->anchor) - 1] = 0; + pr->anchor[sizeof (pr->anchor) - 1] = '\0'; + pr->anchor_call[sizeof (pr->anchor_call) - 1] = '\0'; ruleset = pf_find_ruleset(pr->anchor); if (ruleset == NULL) { error = EINVAL; @@ -1575,7 +1880,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = EBUSY; break; } - bcopy(rule, &pr->rule, sizeof (struct pf_rule)); + pf_rule_copyout(rule, &pr->rule); if (pf_anchor_copyout(ruleset, rule, pr)) { error = EBUSY; break; @@ -1620,6 +1925,8 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = EINVAL; break; } + pcr->anchor[sizeof (pcr->anchor) - 1] = '\0'; + pcr->anchor_call[sizeof (pcr->anchor_call) - 1] = '\0'; ruleset = pf_find_ruleset(pcr->anchor); if (ruleset == NULL) { error = EINVAL; @@ -1652,13 +1959,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENOMEM; break; } - bcopy(&pcr->rule, newrule, sizeof (struct pf_rule)); - newrule->cuid = kauth_cred_getuid(p->p_ucred); - newrule->cpid = p->p_pid; - TAILQ_INIT(&newrule->rpool.list); - /* initialize refcounting */ - newrule->states = 0; - newrule->entries.tqe_prev = NULL; + pf_rule_copyin(&pcr->rule, newrule, p); #if !INET if (newrule->af == AF_INET) { pool_put(&pf_rule_pl, newrule); @@ -1816,6 +2117,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) struct pfioc_state_kill *psk = (struct pfioc_state_kill *)addr; int killed = 0; + psk->psk_ifname[sizeof (psk->psk_ifname) - 1] = '\0'; for (s = RB_MIN(pf_state_tree_id, &tree_id); s; s = nexts) { nexts = RB_NEXT(pf_state_tree_id, &tree_id, s); @@ -2268,7 +2570,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENOMEM; break; } - bcopy(&pa->altq, altq, sizeof (struct pf_altq)); + pf_altq_copyin(&pa->altq, altq); /* * if this is for a queue, find the discipline and @@ -2297,7 +2599,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) } TAILQ_INSERT_TAIL(pf_altqs_inactive, altq, entries); - bcopy(altq, &pa->altq, sizeof (struct pf_altq)); + pf_altq_copyout(altq, &pa->altq); break; } @@ -2331,7 +2633,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = EBUSY; break; } - bcopy(altq, &pa->altq, sizeof (struct pf_altq)); + pf_altq_copyout(altq, &pa->altq); break; } @@ -2381,6 +2683,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) case DIOCADDADDR: { struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; + pp->anchor[sizeof (pp->anchor) - 1] = '\0'; if (pp->ticket != ticket_pabuf) { error = EBUSY; break; @@ -2408,7 +2711,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENOMEM; break; } - bcopy(&pp->addr, pa, sizeof (struct pf_pooladdr)); + pf_pooladdr_copyin(&pp->addr, pa); if (pa->ifname[0]) { pa->kif = pfi_kif_get(pa->ifname); if (pa->kif == NULL) { @@ -2433,6 +2736,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; pp->nr = 0; + pp->anchor[sizeof (pp->anchor) - 1] = '\0'; pool = pf_get_pool(pp->anchor, pp->ticket, pp->r_action, pp->r_num, 0, 1, 0); if (pool == NULL) { @@ -2448,6 +2752,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) struct pfioc_pooladdr *pp = (struct pfioc_pooladdr *)addr; u_int32_t nr = 0; + 
pp->anchor[sizeof (pp->anchor) - 1] = '\0'; pool = pf_get_pool(pp->anchor, pp->ticket, pp->r_action, pp->r_num, 0, 1, 1); if (pool == NULL) { @@ -2463,7 +2768,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = EBUSY; break; } - bcopy(pa, &pp->addr, sizeof (struct pf_pooladdr)); + pf_pooladdr_copyout(pa, &pp->addr); pfi_dynaddr_copyout(&pp->addr.addr); pf_tbladdr_copyout(&pp->addr.addr); pf_rtlabel_copyout(&pp->addr.addr); @@ -2487,6 +2792,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) break; } + pca->anchor[sizeof (pca->anchor) - 1] = '\0'; ruleset = pf_find_ruleset(pca->anchor); if (ruleset == NULL) { error = EBUSY; @@ -2504,7 +2810,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENOMEM; break; } - bcopy(&pca->addr, newpa, sizeof (struct pf_pooladdr)); + pf_pooladdr_copyin(&pca->addr, newpa); #if !INET if (pca->af == AF_INET) { pool_put(&pf_pooladdr_pl, newpa); @@ -2585,7 +2891,8 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) struct pf_ruleset *ruleset; struct pf_anchor *anchor; - pr->path[sizeof (pr->path) - 1] = 0; + pr->path[sizeof (pr->path) - 1] = '\0'; + pr->name[sizeof (pr->name) - 1] = '\0'; if ((ruleset = pf_find_ruleset(pr->path)) == NULL) { error = EINVAL; break; @@ -2610,7 +2917,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) struct pf_anchor *anchor; u_int32_t nr = 0; - pr->path[sizeof (pr->path) - 1] = 0; + pr->path[sizeof (pr->path) - 1] = '\0'; if ((ruleset = pf_find_ruleset(pr->path)) == NULL) { error = EINVAL; break; @@ -2645,6 +2952,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENODEV; break; } + pfr_table_copyin_cleanup(&io->pfrio_table); error = pfr_clr_tables(&io->pfrio_table, &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL); break; @@ -2684,6 +2992,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENODEV; break; } + pfr_table_copyin_cleanup(&io->pfrio_table); error = pfr_get_tables(&io->pfrio_table, buf, &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); break; @@ -2697,6 +3006,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENODEV; break; } + pfr_table_copyin_cleanup(&io->pfrio_table); error = pfr_get_tstats(&io->pfrio_table, buf, &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); break; @@ -2736,6 +3046,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENODEV; break; } + pfr_table_copyin_cleanup(&io->pfrio_table); error = pfr_clr_addrs(&io->pfrio_table, &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL); break; @@ -2749,6 +3060,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENODEV; break; } + pfr_table_copyin_cleanup(&io->pfrio_table); error = pfr_add_addrs(&io->pfrio_table, buf, io->pfrio_size, &io->pfrio_nadd, io->pfrio_flags | PFR_FLAG_USERIOCTL); @@ -2763,6 +3075,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENODEV; break; } + pfr_table_copyin_cleanup(&io->pfrio_table); error = pfr_del_addrs(&io->pfrio_table, buf, io->pfrio_size, &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL); @@ -2777,6 +3090,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENODEV; break; } + pfr_table_copyin_cleanup(&io->pfrio_table); error = pfr_set_addrs(&io->pfrio_table, buf, io->pfrio_size, &io->pfrio_size2, &io->pfrio_nadd, &io->pfrio_ndel, 
&io->pfrio_nchange, io->pfrio_flags | @@ -2792,6 +3106,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENODEV; break; } + pfr_table_copyin_cleanup(&io->pfrio_table); error = pfr_get_addrs(&io->pfrio_table, buf, &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); break; @@ -2805,6 +3120,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENODEV; break; } + pfr_table_copyin_cleanup(&io->pfrio_table); error = pfr_get_astats(&io->pfrio_table, buf, &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL); break; @@ -2818,6 +3134,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENODEV; break; } + pfr_table_copyin_cleanup(&io->pfrio_table); error = pfr_clr_astats(&io->pfrio_table, buf, io->pfrio_size, &io->pfrio_nzero, io->pfrio_flags | PFR_FLAG_USERIOCTL); @@ -2832,6 +3149,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENODEV; break; } + pfr_table_copyin_cleanup(&io->pfrio_table); error = pfr_tst_addrs(&io->pfrio_table, buf, io->pfrio_size, &io->pfrio_nmatch, io->pfrio_flags | PFR_FLAG_USERIOCTL); @@ -2846,6 +3164,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = ENODEV; break; } + pfr_table_copyin_cleanup(&io->pfrio_table); error = pfr_ina_define(&io->pfrio_table, buf, io->pfrio_size, &io->pfrio_nadd, &io->pfrio_naddr, io->pfrio_ticket, io->pfrio_flags | PFR_FLAG_USERIOCTL); @@ -2885,6 +3204,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = EFAULT; goto fail; } + ioe->anchor[sizeof (ioe->anchor) - 1] = '\0'; switch (ioe->rs_num) { case PF_RULESET_ALTQ: #if ALTQ @@ -2954,6 +3274,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = EFAULT; goto fail; } + ioe->anchor[sizeof (ioe->anchor) - 1] = '\0'; switch (ioe->rs_num) { case PF_RULESET_ALTQ: #if ALTQ @@ -3019,6 +3340,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = EFAULT; goto fail; } + ioe->anchor[sizeof (ioe->anchor) - 1] = '\0'; switch (ioe->rs_num) { case PF_RULESET_ALTQ: #if ALTQ @@ -3077,6 +3399,7 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) error = EFAULT; goto fail; } + ioe->anchor[sizeof (ioe->anchor) - 1] = '\0'; switch (ioe->rs_num) { case PF_RULESET_ALTQ: #if ALTQ @@ -3155,6 +3478,10 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct proc *p) n->conn_rate.count * diff / n->conn_rate.seconds; + _RB_PARENT(pstore, entry) = NULL; + RB_LEFT(pstore, entry) = RB_RIGHT(pstore, entry) = NULL; + pstore->kif = NULL; + error = copyout(pstore, buf, sizeof (*pstore)); if (error) { _FREE(pstore, M_TEMP); @@ -3387,9 +3714,11 @@ pf_inet_hook(struct ifnet *ifp, struct mbuf **mp, int input) } #if BYTE_ORDER != BIG_ENDIAN else { - ip = mtod(*mp, struct ip *); - NTOHS(ip->ip_len); - NTOHS(ip->ip_off); + if (*mp != NULL) { + ip = mtod(*mp, struct ip *); + NTOHS(ip->ip_len); + NTOHS(ip->ip_off); + } } #endif return (error); @@ -3402,10 +3731,6 @@ pf_inet6_hook(struct ifnet *ifp, struct mbuf **mp, int input) { int error = 0; -#if 0 - /* - * TODO: once we support IPv6 hardware checksum offload - */ /* * If the packet is outbound, is originated locally, is flagged for * delayed UDP/TCP checksum calculation, and is about to be processed @@ -3414,16 +3739,15 @@ pf_inet6_hook(struct ifnet *ifp, struct mbuf **mp, int input) * it properly. 
*/ if (!input && (*mp)->m_pkthdr.rcvif == NULL) { - static const int mask = CSUM_DELAY_DATA; + static const int mask = CSUM_DELAY_IPV6_DATA; const int flags = (*mp)->m_pkthdr.csum_flags & ~IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist); if (flags & mask) { - in6_delayed_cksum(*mp); + in6_delayed_cksum(*mp, sizeof(struct ip6_hdr)); (*mp)->m_pkthdr.csum_flags &= ~mask; } } -#endif if (pf_test6(input ? PF_IN : PF_OUT, ifp, mp, NULL) != PF_PASS) { if (*mp != NULL) { @@ -3449,7 +3773,8 @@ pf_ifaddr_hook(struct ifnet *ifp, unsigned long cmd) case SIOCAIFADDR: case SIOCDIFADDR: #if INET6 - case SIOCAIFADDR_IN6: + case SIOCAIFADDR_IN6_32: + case SIOCAIFADDR_IN6_64: case SIOCDIFADDR_IN6: #endif /* INET6 */ if (ifp->if_pf_kif != NULL) diff --git a/bsd/net/pf_osfp.c b/bsd/net/pf_osfp.c index b7e579d5c..89d71e889 100644 --- a/bsd/net/pf_osfp.c +++ b/bsd/net/pf_osfp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2009 Apple Inc. All rights reserved. + * Copyright (c) 2007-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -312,7 +312,7 @@ int pf_osfp_add(struct pf_osfp_ioctl *fpioc) { struct pf_os_fingerprint *fp, fpadd; - struct pf_osfp_entry *entry; + struct pf_osfp_entry *entry, *uentry; memset(&fpadd, 0, sizeof (fpadd)); fpadd.fp_tcpopts = fpioc->fp_tcpopts; @@ -324,6 +324,12 @@ pf_osfp_add(struct pf_osfp_ioctl *fpioc) fpadd.fp_wscale = fpioc->fp_wscale; fpadd.fp_ttl = fpioc->fp_ttl; + uentry = &fpioc->fp_os; + uentry->fp_entry.sle_next = NULL; + uentry->fp_class_nm[sizeof (uentry->fp_class_nm) - 1] = '\0'; + uentry->fp_version_nm[sizeof (uentry->fp_version_nm) - 1] = '\0'; + uentry->fp_subtype_nm[sizeof (uentry->fp_subtype_nm) - 1] = '\0'; + DPFPRINTF("adding osfp %s %s %s = %s%d:%d:%d:%s%d:0x%llx %d " "(TS=%s,M=%s%d,W=%s%d) %x\n", fpioc->fp_os.fp_class_nm, fpioc->fp_os.fp_version_nm, @@ -527,6 +533,7 @@ pf_osfp_get(struct pf_osfp_ioctl *fpioc) fpioc->fp_getnum = num; memcpy(&fpioc->fp_os, entry, sizeof (fpioc->fp_os)); + fpioc->fp_os.fp_entry.sle_next = NULL; return (0); } } diff --git a/bsd/net/pf_table.c b/bsd/net/pf_table.c index 735c65b81..ea3b529f5 100644 --- a/bsd/net/pf_table.c +++ b/bsd/net/pf_table.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2009 Apple Inc. All rights reserved. + * Copyright (c) 2007-2010 Apple Inc. All rights reserved. 
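A pattern worth calling out across these hunks: nearly every ioctl path now forcibly NUL-terminates fixed-width strings coming from userland (anchor names, interface names, table names) before using them, because a copied-in structure need not contain a terminator. Reduced to a helper, the idea is simply this (a sketch; the patch open-codes the assignment per field):

/*
 * Force NUL-termination on a fixed-width string copied in from
 * userspace; without this, a hostile ioctl argument could make
 * later strlen()/printf()-style uses run off the end of the buffer.
 */
static void
kstr_sanitize(char *buf, size_t bufsize)
{
	if (bufsize > 0)
		buf[bufsize - 1] = '\0';
}

/* usage, mirroring pfr_table_copyin_cleanup() below: */
/*   kstr_sanitize(tbl->pfrt_anchor, sizeof (tbl->pfrt_anchor)); */
/*   kstr_sanitize(tbl->pfrt_name, sizeof (tbl->pfrt_name));    */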
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1190,6 +1190,7 @@ pfr_add_tables(user_addr_t tbl, int size, int *nadd, int flags) for (i = 0; i < size; i++, tbl += sizeof (key.pfrkt_t)) { if (COPYIN(tbl, &key.pfrkt_t, sizeof (key.pfrkt_t), flags)) senderr(EFAULT); + pfr_table_copyin_cleanup(&key.pfrkt_t); if (pfr_validate_table(&key.pfrkt_t, PFR_TFLAG_USRMASK, flags & PFR_FLAG_USERIOCTL)) senderr(EINVAL); @@ -1266,6 +1267,7 @@ pfr_del_tables(user_addr_t tbl, int size, int *ndel, int flags) for (i = 0; i < size; i++, tbl += sizeof (key.pfrkt_t)) { if (COPYIN(tbl, &key.pfrkt_t, sizeof (key.pfrkt_t), flags)) return (EFAULT); + pfr_table_copyin_cleanup(&key.pfrkt_t); if (pfr_validate_table(&key.pfrkt_t, 0, flags & PFR_FLAG_USERIOCTL)) return (EINVAL); @@ -1385,6 +1387,7 @@ pfr_clr_tstats(user_addr_t tbl, int size, int *nzero, int flags) for (i = 0; i < size; i++, tbl += sizeof (key.pfrkt_t)) { if (COPYIN(tbl, &key.pfrkt_t, sizeof (key.pfrkt_t), flags)) return (EFAULT); + pfr_table_copyin_cleanup(&key.pfrkt_t); if (pfr_validate_table(&key.pfrkt_t, 0, 0)) return (EINVAL); p = RB_FIND(pfr_ktablehead, &pfr_ktables, &key); @@ -1420,6 +1423,7 @@ pfr_set_tflags(user_addr_t tbl, int size, int setflag, int clrflag, for (i = 0; i < size; i++, tbl += sizeof (key.pfrkt_t)) { if (COPYIN(tbl, &key.pfrkt_t, sizeof (key.pfrkt_t), flags)) return (EFAULT); + pfr_table_copyin_cleanup(&key.pfrkt_t); if (pfr_validate_table(&key.pfrkt_t, 0, flags & PFR_FLAG_USERIOCTL)) return (EINVAL); @@ -1730,6 +1734,13 @@ pfr_commit_ktable(struct pfr_ktable *kt, u_int64_t tzero) pfr_setflags_ktable(kt, nflags); } +void +pfr_table_copyin_cleanup(struct pfr_table *tbl) +{ + tbl->pfrt_anchor[sizeof (tbl->pfrt_anchor) - 1] = '\0'; + tbl->pfrt_name[sizeof (tbl->pfrt_name) - 1] = '\0'; +} + static int pfr_validate_table(struct pfr_table *tbl, int allowedflags, int no_reserved) { diff --git a/bsd/net/pfkeyv2.h b/bsd/net/pfkeyv2.h index fa89f14c7..e452e1d2e 100644 --- a/bsd/net/pfkeyv2.h +++ b/bsd/net/pfkeyv2.h @@ -412,6 +412,7 @@ struct sadb_sastat { #define SADB_X_EXT_NATT_KEEPALIVE 0x0004 /* Local node is behind NAT, send keepalives */ /* Should only be set for outbound SAs */ #define SADB_X_EXT_NATT_MULTIPLEUSERS 0x0008 /* For use on VPN server - support multiple users */ +#define SADB_X_EXT_NATT_DETECTED_PEER 0x0010 #endif /* PRIVATE */ diff --git a/bsd/net/pfvar.h b/bsd/net/pfvar.h index fc35db9e7..58c4b3969 100644 --- a/bsd/net/pfvar.h +++ b/bsd/net/pfvar.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2009 Apple Inc. All rights reserved. + * Copyright (c) 2007-2010 Apple Inc. All rights reserved. 
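The pgt_u union below comes in three flavors because the same ioctl payload has different layouts for 32-bit and 64-bit callers; the kernel handler (see the PF_USER_ADDR() use in DIOCGETSTARTERS above) must pick the matching shadow structure before dereferencing the user pointer. A sketch of what such an accessor does, with the helper name being illustrative rather than the macro pf_ioctl.c actually defines:

/*
 * Illustrative kernel-side accessor: select the 32- or 64-bit
 * shadow layout based on the calling process before reading the
 * user buffer pointer out of the ioctl argument.
 */
static user_addr_t
pfioc_tokens_user_buf(caddr_t addr, struct proc *p)
{
	if (proc_is64bit(p)) {
		struct pfioc_tokens_64 *t64 = (void *)addr;
		return (t64->pgt_u.pgtu_buf);
	} else {
		struct pfioc_tokens_32 *t32 = (void *)addr;
		return (CAST_USER_ADDR_T(t32->pgt_u.pgtu_buf));
	}
}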
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -77,7 +77,7 @@ extern "C" { #include <sys/param.h> #include <sys/types.h> #include <sys/queue.h> -#include <sys/tree.h> +#include <libkern/tree.h> #include <net/radix.h> #include <netinet/in.h> @@ -1771,6 +1771,55 @@ struct pfioc_states_64 { }; #endif /* KERNEL */ +#define PFTOK_PROCNAME_LEN 64 +#pragma pack(1) +struct pfioc_token { + u_int64_t token_value; + u_int64_t timestamp; + pid_t pid; + char proc_name[PFTOK_PROCNAME_LEN]; +}; +#pragma pack() + +struct pfioc_kernel_token { + SLIST_ENTRY(pfioc_kernel_token) next; + struct pfioc_token token; +}; + +struct pfioc_remove_token { + u_int64_t token_value; + u_int64_t refcount; +}; + +struct pfioc_tokens { + int size; + union { + caddr_t pgtu_buf; + struct pfioc_token *pgtu_tokens; + } pgt_u __attribute__((aligned(8))); +#define pgt_buf pgt_u.pgtu_buf +#define pgt_tokens pgt_u.pgtu_tokens +}; + +#ifdef KERNEL +struct pfioc_tokens_32 { + int size; + union { + user32_addr_t pgtu_buf; + user32_addr_t pgtu_tokens; + } pgt_u __attribute__((aligned(8))); +}; + +struct pfioc_tokens_64 { + int size; + union { + user64_addr_t pgtu_buf; + user64_addr_t pgtu_tokens; + } pgt_u __attribute__((aligned(8))); +}; +#endif /* KERNEL */ + + struct pfioc_src_nodes { int psn_len; union { @@ -1860,6 +1909,7 @@ struct pfioc_trans_64 { }; #endif /* KERNEL */ + #define PFR_FLAG_ATOMIC 0x00000001 #define PFR_FLAG_DUMMY 0x00000002 #define PFR_FLAG_FEEDBACK 0x00000004 @@ -1955,12 +2005,15 @@ struct pfioc_iface_64 { #define DIOCSTART _IO ('D', 1) #define DIOCSTOP _IO ('D', 2) #define DIOCADDRULE _IOWR('D', 4, struct pfioc_rule) +#define DIOCGETSTARTERS _IOWR('D', 5, struct pfioc_tokens) #define DIOCGETRULES _IOWR('D', 6, struct pfioc_rule) #define DIOCGETRULE _IOWR('D', 7, struct pfioc_rule) -/* XXX cut 8 - 17 */ +#define DIOCSTARTREF _IOR ('D', 8, u_int64_t) +#define DIOCSTOPREF _IOWR('D', 9, struct pfioc_remove_token) +/* XXX cut 10 - 17 */ #define DIOCCLRSTATES _IOWR('D', 18, struct pfioc_state_kill) #define DIOCGETSTATE _IOWR('D', 19, struct pfioc_state) -#define DIOCSETSTATUSIF _IOWR('D', 20, struct pfioc_if) +#define DIOCSETSTATUSIF _IOWR('D', 20, struct pfioc_if) #define DIOCGETSTATUS _IOWR('D', 21, struct pf_status) #define DIOCCLRSTATUS _IO ('D', 22) #define DIOCNATLOOK _IOWR('D', 23, struct pfioc_natlook) @@ -1995,23 +2048,23 @@ struct pfioc_iface_64 { #define DIOCRDELTABLES _IOWR('D', 62, struct pfioc_table) #define DIOCRGETTABLES _IOWR('D', 63, struct pfioc_table) #define DIOCRGETTSTATS _IOWR('D', 64, struct pfioc_table) -#define DIOCRCLRTSTATS _IOWR('D', 65, struct pfioc_table) +#define DIOCRCLRTSTATS _IOWR('D', 65, struct pfioc_table) #define DIOCRCLRADDRS _IOWR('D', 66, struct pfioc_table) #define DIOCRADDADDRS _IOWR('D', 67, struct pfioc_table) #define DIOCRDELADDRS _IOWR('D', 68, struct pfioc_table) #define DIOCRSETADDRS _IOWR('D', 69, struct pfioc_table) #define DIOCRGETADDRS _IOWR('D', 70, struct pfioc_table) #define DIOCRGETASTATS _IOWR('D', 71, struct pfioc_table) -#define DIOCRCLRASTATS _IOWR('D', 72, struct pfioc_table) +#define DIOCRCLRASTATS _IOWR('D', 72, struct pfioc_table) #define DIOCRTSTADDRS _IOWR('D', 73, struct pfioc_table) #define DIOCRSETTFLAGS _IOWR('D', 74, struct pfioc_table) #define DIOCRINADEFINE _IOWR('D', 77, struct pfioc_table) #define DIOCOSFPFLUSH _IO('D', 78) #define DIOCOSFPADD _IOWR('D', 79, struct pf_osfp_ioctl) #define DIOCOSFPGET _IOWR('D', 80, struct pf_osfp_ioctl) -#define DIOCXBEGIN _IOWR('D', 81, struct pfioc_trans) -#define DIOCXCOMMIT _IOWR('D', 82, struct 
pfioc_trans) -#define DIOCXROLLBACK _IOWR('D', 83, struct pfioc_trans) +#define DIOCXBEGIN _IOWR('D', 81, struct pfioc_trans) +#define DIOCXCOMMIT _IOWR('D', 82, struct pfioc_trans) +#define DIOCXROLLBACK _IOWR('D', 83, struct pfioc_trans) #define DIOCGETSRCNODES _IOWR('D', 84, struct pfioc_src_nodes) #define DIOCCLRSRCNODES _IO('D', 85) #define DIOCSETHOSTID _IOWR('D', 86, u_int32_t) @@ -2158,6 +2211,7 @@ __private_extern__ int pfr_pool_get(struct pfr_ktable *, int *, struct pf_addr *, struct pf_addr **, struct pf_addr **, sa_family_t); __private_extern__ void pfr_dynaddr_update(struct pfr_ktable *, struct pfi_dynaddr *); +__private_extern__ void pfr_table_copyin_cleanup(struct pfr_table *); __private_extern__ struct pfr_ktable *pfr_attach_table(struct pf_ruleset *, char *); __private_extern__ void pfr_detach_table(struct pfr_ktable *); @@ -2248,6 +2302,9 @@ __private_extern__ struct pf_anchor_global pf_anchors; __private_extern__ struct pf_anchor pf_main_anchor; #define pf_main_ruleset pf_main_anchor.ruleset +__private_extern__ int pf_is_enabled; +#define PF_IS_ENABLED (pf_is_enabled != 0) + /* these ruleset functions can be linked into userland programs (pfctl) */ __private_extern__ int pf_get_ruleset_number(u_int8_t); __private_extern__ void pf_init_ruleset(struct pf_ruleset *); diff --git a/bsd/net/ppp_deflate.c b/bsd/net/ppp_deflate.c index 5968578b0..4541def29 100644 --- a/bsd/net/ppp_deflate.c +++ b/bsd/net/ppp_deflate.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -316,7 +316,7 @@ z_compress(arg, mret, mp, orig_len, maxolen) } ++state->seqno; - rptr += (proto > 0xff)? 2: 3; /* skip 1st proto byte if 0 */ + rptr += (proto > 0xff)? 2: 3; /* skip 1st proto byte if 0 */ state->strm.next_in = rptr; state->strm.avail_in = mtod(mp, u_char *) + mp->m_len - rptr; mp = mp->m_next; diff --git a/bsd/net/route.c b/bsd/net/route.c index 5f1580e8f..5ed681a0f 100644 --- a/bsd/net/route.c +++ b/bsd/net/route.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
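The route.c changes that follow extend the IPv4-only scoped routing machinery to IPv6, where a scope ID can live in two places: the sin6_scope_id field, and, for link-local addresses, embedded in the second 16-bit word of the address in network byte order (the KAME convention that the new sin6_{set,get}_embedded_ifscope() helpers below encode). A small worked example of building such a scoped link-local sockaddr, as a sketch in kernel style (s6_addr16 is the kernel-visible name for that word):

#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>

static void
scope_fe80(struct sockaddr_in6 *sin6, unsigned int ifindex)
{
	memset(sin6, 0, sizeof (*sin6));
	sin6->sin6_len = sizeof (*sin6);
	sin6->sin6_family = AF_INET6;
	sin6->sin6_addr.s6_addr[0] = 0xfe;    /* fe80::/10 link-local */
	sin6->sin6_addr.s6_addr[1] = 0x80;
	sin6->sin6_scope_id = ifindex;        /* the API-visible scope */
	/* the routing-table-internal embedded form of the same scope: */
	sin6->sin6_addr.s6_addr16[1] = htons(ifindex);
}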
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -70,16 +70,25 @@ #include <sys/domain.h> #include <sys/syslog.h> #include <sys/queue.h> +#include <sys/mcache.h> +#include <sys/protosw.h> #include <kern/lock.h> #include <kern/zalloc.h> #include <net/if.h> #include <net/route.h> +#include <net/ntstat.h> #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet/ip_mroute.h> #include <netinet/ip_var.h> +#include <netinet/ip6.h> + +#if INET6 +#include <netinet6/ip6_var.h> +#include <netinet6/in6_var.h> +#endif /* INET6 */ #include <net/if_dl.h> @@ -187,7 +196,6 @@ */ #define equal(a1, a2) (bcmp((caddr_t)(a1), (caddr_t)(a2), (a1)->sa_len) == 0) -#define SA(p) ((struct sockaddr *)(p)) extern void kdp_set_gateway_mac (void *gatewaymac); @@ -261,11 +269,6 @@ struct rtentry_dbg { TAILQ_ENTRY(rtentry_dbg) rtd_trash_link; }; -#define atomic_add_16_ov(a, n) \ - ((uint16_t) OSAddAtomic16(n, (volatile SInt16 *)a)) -#define atomic_add_32_ov(a, n) \ - ((uint32_t) OSAddAtomic(n, a)) - /* List of trash route entries protected by rnh_lock */ static TAILQ_HEAD(, rtentry_dbg) rttrash_head; @@ -285,33 +288,34 @@ static struct rtentry *rtalloc1_common_locked(struct sockaddr *, int, uint32_t, static int rtrequest_common_locked(int, struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct rtentry **, unsigned int); +static struct rtentry *rtalloc1_locked(struct sockaddr *, int, uint32_t); static void rtalloc_ign_common_locked(struct route *, uint32_t, unsigned int); -static inline void sa_set_ifscope(struct sockaddr *, unsigned int); -static struct sockaddr *sin_copy(struct sockaddr_in *, struct sockaddr_in *, - unsigned int); -static struct sockaddr *mask_copy(struct sockaddr *, struct sockaddr_in *, - unsigned int); +static inline void sin6_set_ifscope(struct sockaddr *, unsigned int); +static inline void sin6_set_embedded_ifscope(struct sockaddr *, unsigned int); +static inline unsigned int sin6_get_embedded_ifscope(struct sockaddr *); +static struct sockaddr *sa_copy(struct sockaddr *, struct sockaddr_storage *, + unsigned int *); +static struct sockaddr *ma_copy(int, struct sockaddr *, + struct sockaddr_storage *, unsigned int); static struct sockaddr *sa_trim(struct sockaddr *, int); static struct radix_node *node_lookup(struct sockaddr *, struct sockaddr *, unsigned int); -static struct radix_node *node_lookup_default(void); +static struct radix_node *node_lookup_default(int); static int rn_match_ifscope(struct radix_node *, void *); static struct ifaddr *ifa_ifwithroute_common_locked(int, const struct sockaddr *, const struct sockaddr *, unsigned int); static struct rtentry *rte_alloc(void); static void rte_free(struct rtentry *); static void rtfree_common(struct rtentry *, boolean_t); -#if IFNET_ROUTE_REFCNT static void rte_if_ref(struct ifnet *, int); -#endif /* IFNET_ROUTE_REFCNT */ uint32_t route_generation = 0; /* - * sockaddr_in with embedded interface scope; this is used internally - * to keep track of scoped route entries in the routing table. The - * fact that such a scope is embedded in the structure is an artifact - * of the current implementation which could change in future. + * sockaddr_in with scope ID field; this is used internally to keep + * track of scoped route entries in the routing table. The fact that + * such a value is embedded in the structure is an artifact of the + * current implementation which could change in future. 
 */
 struct sockaddr_inifscope {
 	__uint8_t	sin_len;
@@ -330,11 +334,14 @@ struct sockaddr_inifscope {
 		__uint32_t	ifscope;
 	} _in_index;
 } un;
-#define sin_ifscope un._in_index.ifscope
+#define sin_scope_id un._in_index.ifscope
 };
 
+#define SA(sa) ((struct sockaddr *)(size_t)(sa))
 #define SIN(sa) ((struct sockaddr_in *)(size_t)(sa))
+#define SIN6(sa) ((struct sockaddr_in6 *)(size_t)(sa))
 #define SINIFSCOPE(sa) ((struct sockaddr_inifscope *)(size_t)(sa))
+#define SIN6IFSCOPE(sa) SIN6(sa)
 
 #define ASSERT_SINIFSCOPE(sa) { \
 	if ((sa)->sa_family != AF_INET || \
@@ -342,6 +349,12 @@ struct sockaddr_inifscope {
 		panic("%s: bad sockaddr_in %p\n", __func__, sa); \
 }
 
+#define ASSERT_SIN6IFSCOPE(sa) { \
+	if ((sa)->sa_family != AF_INET6 || \
+	    (sa)->sa_len < sizeof (struct sockaddr_in6)) \
+		panic("%s: bad sockaddr_in6 %p\n", __func__, sa); \
+}
+
 /*
  * Argument to leaf-matching routine; at present it is scoped routing
  * specific but can be expanded in future to include other search filters.
@@ -358,27 +371,36 @@ static struct sockaddr sin_def = {
 	sizeof (struct sockaddr_in), AF_INET, { 0, }
 };
 
+static struct sockaddr_in6 sin6_def = {
+	sizeof (struct sockaddr_in6), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0
+};
+
 /*
  * Interface index (scope) of the primary interface; determined at
  * the time when the default, non-scoped route gets added, changed
  * or deleted.  Protected by rnh_lock.
 */
 static unsigned int primary_ifscope = IFSCOPE_NONE;
+static unsigned int primary6_ifscope = IFSCOPE_NONE;
+
+#define INET_DEFAULT(sa) \
+	((sa)->sa_family == AF_INET && SIN(sa)->sin_addr.s_addr == 0)
 
-#define INET_DEFAULT(dst) \
-	((dst)->sa_family == AF_INET && SIN(dst)->sin_addr.s_addr == 0)
+#define INET6_DEFAULT(sa) \
+	((sa)->sa_family == AF_INET6 && \
+	IN6_IS_ADDR_UNSPECIFIED(&SIN6(sa)->sin6_addr))
 
+#define SA_DEFAULT(sa) (INET_DEFAULT(sa) || INET6_DEFAULT(sa))
 #define RT(r) ((struct rtentry *)r)
+#define RN(r) ((struct radix_node *)r)
 #define RT_HOST(r) (RT(r)->rt_flags & RTF_HOST)
 
-#if IFNET_ROUTE_REFCNT
 SYSCTL_DECL(_net_idle_route);
 
 static int rt_if_idle_expire_timeout = RT_IF_IDLE_EXPIRE_TIMEOUT;
 SYSCTL_INT(_net_idle_route, OID_AUTO, expire_timeout, CTLFLAG_RW,
     &rt_if_idle_expire_timeout, 0, "Default expiration time on routes for "
     "interface idle reference counting");
-#endif /* IFNET_ROUTE_REFCNT */
 
 /*
  * Given a route, determine whether or not it is the non-scoped default
@@ -386,88 +408,189 @@ SYSCTL_INT(_net_idle_route, OID_AUTO, expire_timeout, CTLFLAG_RW,
  * a separate place when rt is in the process of being created.
 */
 boolean_t
-rt_inet_default(struct rtentry *rt, struct sockaddr *dst)
+rt_primary_default(struct rtentry *rt, struct sockaddr *dst)
 {
-	return (INET_DEFAULT(dst) && !(rt->rt_flags & RTF_IFSCOPE));
+	return (SA_DEFAULT(dst) && !(rt->rt_flags & RTF_IFSCOPE));
 }
 
 /*
  * Set the ifscope of the primary interface; caller holds rnh_lock.
 */
 void
-set_primary_ifscope(unsigned int ifscope)
+set_primary_ifscope(int af, unsigned int ifscope)
 {
-	primary_ifscope = ifscope;
+	if (af == AF_INET)
+		primary_ifscope = ifscope;
+	else
+		primary6_ifscope = ifscope;
 }
 
 /*
  * Return the ifscope of the primary interface; caller holds rnh_lock.
 */
 unsigned int
-get_primary_ifscope(void)
+get_primary_ifscope(int af)
 {
-	return (primary_ifscope);
+	return (af == AF_INET ? primary_ifscope : primary6_ifscope);
 }
 
 /*
- * Embed ifscope into a given a sockaddr_in.
+ * Set the scope ID of a given sockaddr_in.
 */
-static inline void
-sa_set_ifscope(struct sockaddr *sa, unsigned int ifscope)
+void
+sin_set_ifscope(struct sockaddr *sa, unsigned int ifscope)
 {
 	/* Caller must pass in sockaddr_in */
 	ASSERT_SINIFSCOPE(sa);
 
-	SINIFSCOPE(sa)->sin_ifscope = ifscope;
+	SINIFSCOPE(sa)->sin_scope_id = ifscope;
 }
 
 /*
- * Given a sockaddr_in, return the embedded ifscope to the caller.
+ * Set the scope ID of a given sockaddr_in6.
+ */
+static inline void
+sin6_set_ifscope(struct sockaddr *sa, unsigned int ifscope)
+{
+	/* Caller must pass in sockaddr_in6 */
+	ASSERT_SIN6IFSCOPE(sa);
+
+	SIN6IFSCOPE(sa)->sin6_scope_id = ifscope;
+}
+
+/*
+ * Given a sockaddr_in, return the scope ID to the caller.
 */
 unsigned int
-sa_get_ifscope(struct sockaddr *sa)
+sin_get_ifscope(struct sockaddr *sa)
 {
 	/* Caller must pass in sockaddr_in */
 	ASSERT_SINIFSCOPE(sa);
 
-	return (SINIFSCOPE(sa)->sin_ifscope);
+	return (SINIFSCOPE(sa)->sin_scope_id);
 }
 
 /*
- * Copy a sockaddr_in src to dst and embed ifscope into dst.
+ * Given a sockaddr_in6, return the scope ID to the caller.
+ */
+unsigned int
+sin6_get_ifscope(struct sockaddr *sa)
+{
+	/* Caller must pass in sockaddr_in6 */
+	ASSERT_SIN6IFSCOPE(sa);
+
+	return (SIN6IFSCOPE(sa)->sin6_scope_id);
+}
+
+static inline void
+sin6_set_embedded_ifscope(struct sockaddr *sa, unsigned int ifscope)
+{
+	/* Caller must pass in sockaddr_in6 */
+	ASSERT_SIN6IFSCOPE(sa);
+	VERIFY(IN6_IS_SCOPE_EMBED(&(SIN6(sa)->sin6_addr)));
+
+	SIN6(sa)->sin6_addr.s6_addr16[1] = htons(ifscope);
+}
+
+static inline unsigned int
+sin6_get_embedded_ifscope(struct sockaddr *sa)
+{
+	/* Caller must pass in sockaddr_in6 */
+	ASSERT_SIN6IFSCOPE(sa);
+
+	return (ntohs(SIN6(sa)->sin6_addr.s6_addr16[1]));
+}
+
+/*
+ * Copy a sockaddr_{in,in6} src to a dst storage and set scope ID into dst.
+ *
+ * To clear the scope ID, pass in a NULL pifscope.  To set the scope ID, pass
+ * in a non-NULL pifscope with non-zero ifscope.  Otherwise if pifscope is
+ * non-NULL and ifscope is IFSCOPE_NONE, the existing scope ID is left intact.
+ * In any case, the effective scope ID value is returned to the caller via
+ * pifscope, if it is non-NULL.
 */
 static struct sockaddr *
-sin_copy(struct sockaddr_in *src, struct sockaddr_in *dst, unsigned int ifscope)
+sa_copy(struct sockaddr *src, struct sockaddr_storage *dst,
+    unsigned int *pifscope)
 {
-	*dst = *src;
-	sa_set_ifscope(SA(dst), ifscope);
+	int af = src->sa_family;
+	unsigned int ifscope = (pifscope != NULL) ? *pifscope : IFSCOPE_NONE;
+
+	VERIFY(af == AF_INET || af == AF_INET6);
+
+	bzero(dst, sizeof (*dst));
+
+	if (af == AF_INET) {
+		bcopy(src, dst, sizeof (struct sockaddr_in));
+		if (pifscope == NULL || ifscope != IFSCOPE_NONE)
+			sin_set_ifscope(SA(dst), ifscope);
+	} else {
+		bcopy(src, dst, sizeof (struct sockaddr_in6));
+		if (pifscope != NULL &&
+		    IN6_IS_SCOPE_EMBED(&SIN6(dst)->sin6_addr)) {
+			unsigned int eifscope;
+			/*
+			 * If the address contains the embedded scope ID,
+			 * use that as the value for sin6_scope_id as long
+			 * as the caller doesn't insist on clearing it (by
+			 * passing NULL) or setting it.
+			 */
+			eifscope = sin6_get_embedded_ifscope(SA(dst));
+			if (eifscope != IFSCOPE_NONE && ifscope == IFSCOPE_NONE)
+				ifscope = eifscope;
+			sin6_set_ifscope(SA(dst), ifscope);
+			/*
+			 * If sin6_scope_id is set but the address doesn't
+			 * contain the equivalent embedded value, set it.
+ */ + if (ifscope != IFSCOPE_NONE && eifscope != ifscope) + sin6_set_embedded_ifscope(SA(dst), ifscope); + } else if (pifscope == NULL || ifscope != IFSCOPE_NONE) { + sin6_set_ifscope(SA(dst), ifscope); + } + } + + if (pifscope != NULL) { + *pifscope = (af == AF_INET) ? sin_get_ifscope(SA(dst)) : + sin6_get_ifscope(SA(dst)); + } return (SA(dst)); } /* - * Copy a mask from src to a sockaddr_in dst and embed ifscope into dst. + * Copy a mask from src to a dst storage and set scope ID into dst. */ static struct sockaddr * -mask_copy(struct sockaddr *src, struct sockaddr_in *dst, unsigned int ifscope) +ma_copy(int af, struct sockaddr *src, struct sockaddr_storage *dst, + unsigned int ifscope) { - /* We know dst is at least the size of sockaddr{_in} */ + VERIFY(af == AF_INET || af == AF_INET6); + bzero(dst, sizeof (*dst)); rt_maskedcopy(src, SA(dst), src); /* * The length of the mask sockaddr would need to be adjusted - * to cover the additional sin_ifscope field; when ifscope is - * IFSCOPE_NONE, we'd end up clearing the embedded ifscope on + * to cover the additional {sin,sin6}_ifscope field; when ifscope + * is IFSCOPE_NONE, we'd end up clearing the scope ID field on * the destination mask in addition to extending the length * of the sockaddr, as a side effect. This is okay, as any * trailing zeroes would be skipped by rn_addmask prior to * inserting or looking up the mask in the mask tree. */ - SINIFSCOPE(dst)->sin_ifscope = ifscope; - SINIFSCOPE(dst)->sin_len = - offsetof(struct sockaddr_inifscope, sin_ifscope) + - sizeof (SINIFSCOPE(dst)->sin_ifscope); + if (af == AF_INET) { + SINIFSCOPE(dst)->sin_scope_id = ifscope; + SINIFSCOPE(dst)->sin_len = + offsetof(struct sockaddr_inifscope, sin_scope_id) + + sizeof (SINIFSCOPE(dst)->sin_scope_id); + } else { + SIN6IFSCOPE(dst)->sin6_scope_id = ifscope; + SIN6IFSCOPE(dst)->sin6_len = + offsetof(struct sockaddr_in6, sin6_scope_id) + + sizeof (SIN6IFSCOPE(dst)->sin6_scope_id); + } return (SA(dst)); } @@ -501,15 +624,15 @@ sa_trim(struct sockaddr *sa, int skip) } /* - * Called by rtm_msg{1,2} routines to "scrub" the embedded interface scope - * away from the socket address structure, so that clients of the routing - * socket will not be confused by the presence of the embedded scope, or the - * side effect of the increased length due to that. The source sockaddr is - * not modified; instead, the scrubbing happens on the destination sockaddr - * storage that is passed in by the caller. + * Called by rtm_msg{1,2} routines to "scrub" the scope ID field away from + * the socket address structure, so that clients of the routing socket will + * not be confused by the presence of the information, or the side effect of + * the increased length due to that. The source sockaddr is not modified; + * instead, the scrubbing happens on the destination sockaddr storage that + * is passed in by the caller. */ struct sockaddr * -rtm_scrub_ifscope(int idx, struct sockaddr *hint, struct sockaddr *sa, +rtm_scrub_ifscope(int type, int idx, struct sockaddr *hint, struct sockaddr *sa, struct sockaddr_storage *ss) { struct sockaddr *ret = sa; @@ -517,39 +640,64 @@ rtm_scrub_ifscope(int idx, struct sockaddr *hint, struct sockaddr *sa, switch (idx) { case RTAX_DST: /* - * If this is for an AF_INET destination address, call - * sin_copy() with IFSCOPE_NONE as it does what we need. + * If this is for an AF_INET/AF_INET6 destination address, + * call sa_copy() to clear the scope ID field. 
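+		 *
+		 * Passing a NULL pifscope to sa_copy() is what clears the
+		 * scope ID here.  For the netmask case below, the scrub
+		 * must also reverse the length extension performed by
+		 * ma_copy(); as a worked example (assuming the usual
+		 * 16-byte sockaddr_in and 28-byte sockaddr_in6 layouts),
+		 * ma_copy() sets the mask length to
+		 *
+		 *	offsetof(sockaddr_inifscope, sin_scope_id) + 4
+		 *	    == 12 + 4 == 16		(AF_INET)
+		 *	offsetof(sockaddr_in6, sin6_scope_id) + 4
+		 *	    == 24 + 4 == 28		(AF_INET6)
+		 *
+		 * which sa_trim() undoes by dropping trailing zeroes.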
 		 */
 		if (sa->sa_family == AF_INET &&
-		    SINIFSCOPE(sa)->sin_ifscope != IFSCOPE_NONE) {
-			bzero(ss, sizeof (*ss));
-			ret = sin_copy(SIN(sa), SIN(ss), IFSCOPE_NONE);
+		    SINIFSCOPE(sa)->sin_scope_id != IFSCOPE_NONE) {
+			ret = sa_copy(sa, ss, NULL);
+		} else if (sa->sa_family == AF_INET6 &&
+		    SIN6IFSCOPE(sa)->sin6_scope_id != IFSCOPE_NONE) {
+			ret = sa_copy(sa, ss, NULL);
 		}
 		break;
 
 	case RTAX_NETMASK: {
+		int skip, af;
 		/*
-		 * If this is for a mask, we can't tell whether or not
-		 * there is an embedded interface scope, as the span of
-		 * bytes between sa_len and the beginning of the mask
-		 * (offset of sin_addr in the case of AF_INET) may be
-		 * filled with all-ones by rn_addmask(), and hence we
-		 * cannot rely on sa_family.  Because of this, we use
-		 * the sa_family of the hint sockaddr (RTAX_{DST,IFA})
-		 * as indicator as to whether or not the mask is to be
-		 * treated as one for AF_INET.  Clearing the embedded
-		 * scope involves setting it to IFSCOPE_NONE followed
-		 * by calling sa_trim() to trim trailing zeroes from
-		 * the storage sockaddr, which reverses what was done
-		 * earlier by mask_copy() on the source sockaddr.
+		 * If this is for a mask, we can't tell whether or not there
+		 * is a valid scope ID value, as the span of bytes between
+		 * sa_len and the beginning of the mask (offset of sin_addr in
+		 * the case of AF_INET, or sin6_addr for AF_INET6) may be
+		 * filled with all-ones by rn_addmask(), and hence we cannot
+		 * rely on sa_family.  Because of this, we use the sa_family
+		 * of the hint sockaddr (RTAX_{DST,IFA}) as an indicator as to
+		 * whether or not the mask is to be treated as one for AF_INET
+		 * or AF_INET6.  Clearing the scope ID field involves setting
+		 * it to IFSCOPE_NONE followed by calling sa_trim() to trim
+		 * trailing zeroes from the storage sockaddr, which reverses
+		 * what was done earlier by ma_copy() on the source sockaddr.
 		 */
-		int skip = offsetof(struct sockaddr_in, sin_addr);
-		if (sa->sa_len > skip && sa->sa_len <= sizeof (*ss) &&
-		    hint != NULL && hint->sa_family == AF_INET) {
+		if (hint == NULL ||
+		    ((af = hint->sa_family) != AF_INET && af != AF_INET6))
+			break;	/* nothing to do */
+
+		skip = (af == AF_INET) ?
+		    offsetof(struct sockaddr_in, sin_addr) :
+		    offsetof(struct sockaddr_in6, sin6_addr);
+
+		if (sa->sa_len > skip && sa->sa_len <= sizeof (*ss)) {
 			bzero(ss, sizeof (*ss));
 			bcopy(sa, ss, sa->sa_len);
-			SINIFSCOPE(ss)->sin_ifscope = IFSCOPE_NONE;
+			/*
+			 * Don't use {sin,sin6}_set_ifscope() as sa_family
+			 * and sa_len for the netmask might not be set to
+			 * the corresponding expected values of the hint.
+			 */
+			if (hint->sa_family == AF_INET)
+				SINIFSCOPE(ss)->sin_scope_id = IFSCOPE_NONE;
+			else
+				SIN6IFSCOPE(ss)->sin6_scope_id = IFSCOPE_NONE;
 			ret = sa_trim(SA(ss), skip);
+
+			/*
+			 * For AF_INET6 mask, set sa_len appropriately unless
+			 * this is requested via sysctl_dumpentry(), in which
+			 * case we return the raw value.
+			 */
+			if (hint->sa_family == AF_INET6 &&
+			    type != RTM_GET && type != RTM_GET2)
+				SA(ret)->sa_len = sizeof (struct sockaddr_in6);
 		}
 		break;
 	}
@@ -569,11 +717,14 @@ rn_match_ifscope(struct radix_node *rn, void *arg)
 {
 	struct rtentry *rt = (struct rtentry *)rn;
 	struct matchleaf_arg *ma = arg;
+	int af = rt_key(rt)->sa_family;
 
-	if (!(rt->rt_flags & RTF_IFSCOPE) || rt_key(rt)->sa_family != AF_INET)
+	if (!(rt->rt_flags & RTF_IFSCOPE) || (af != AF_INET && af != AF_INET6))
		return (0);
 
-	return (SINIFSCOPE(rt_key(rt))->sin_ifscope == ma->ifscope);
+	return (af == AF_INET ?
+ (SINIFSCOPE(rt_key(rt))->sin_scope_id == ma->ifscope) : + (SIN6IFSCOPE(rt_key(rt))->sin6_scope_id == ma->ifscope)); } static void @@ -624,6 +775,7 @@ route_init(void) panic("route_init: failed allocating rte_zone"); zone_change(rte_zone, Z_EXPAND, TRUE); + zone_change(rte_zone, Z_CALLERACCT, FALSE); zone_change(rte_zone, Z_NOENCRYPT, TRUE); TAILQ_INIT(&rttrash_head); @@ -648,16 +800,9 @@ rtalloc(struct route *ro) } void -rtalloc_ign_locked(struct route *ro, uint32_t ignore) +rtalloc_scoped(struct route *ro, unsigned int ifscope) { - return (rtalloc_ign_common_locked(ro, ignore, IFSCOPE_NONE)); -} - -void -rtalloc_scoped_ign_locked(struct route *ro, uint32_t ignore, - unsigned int ifscope) -{ - return (rtalloc_ign_common_locked(ro, ignore, ifscope)); + rtalloc_scoped_ign(ro, 0, ifscope); } static void @@ -689,7 +834,7 @@ rtalloc_ign(struct route *ro, uint32_t ignore) { lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED); lck_mtx_lock(rnh_lock); - rtalloc_ign_locked(ro, ignore); + rtalloc_ign_common_locked(ro, ignore, IFSCOPE_NONE); lck_mtx_unlock(rnh_lock); } @@ -698,11 +843,11 @@ rtalloc_scoped_ign(struct route *ro, uint32_t ignore, unsigned int ifscope) { lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED); lck_mtx_lock(rnh_lock); - rtalloc_scoped_ign_locked(ro, ignore, ifscope); + rtalloc_ign_common_locked(ro, ignore, ifscope); lck_mtx_unlock(rnh_lock); } -struct rtentry * +static struct rtentry * rtalloc1_locked(struct sockaddr *dst, int report, uint32_t ignflags) { return (rtalloc1_common_locked(dst, report, ignflags, IFSCOPE_NONE)); @@ -910,6 +1055,9 @@ rtfree_common(struct rtentry *rt, boolean_t locked) * resources associated with the route. */ if (!(rt->rt_flags & RTF_UP)) { + struct rtentry *rt_parent; + struct ifaddr *rt_ifa; + if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic("rt %p freed while in radix tree\n", rt); /* @@ -922,25 +1070,15 @@ rtfree_common(struct rtentry *rt, boolean_t locked) rtd_trash_link); } - /* - * Route is no longer in the tree and refcnt is 0; - * we have exclusive access, so destroy it. - */ - RT_UNLOCK(rt); - /* * release references on items we hold them on.. * e.g other routes and ifaddrs. */ - if (rt->rt_parent != NULL) { - rtfree_locked(rt->rt_parent); + if ((rt_parent = rt->rt_parent) != NULL) rt->rt_parent = NULL; - } - if (rt->rt_ifa != NULL) { - ifafree(rt->rt_ifa); + if ((rt_ifa = rt->rt_ifa) != NULL) rt->rt_ifa = NULL; - } /* * Now free any attached link-layer info. @@ -953,6 +1091,18 @@ rtfree_common(struct rtentry *rt, boolean_t locked) rt->rt_llinfo = NULL; } + /* + * Route is no longer in the tree and refcnt is 0; + * we have exclusive access, so destroy it. + */ + RT_UNLOCK(rt); + + if (rt_parent != NULL) + rtfree_locked(rt_parent); + + if (rt_ifa != NULL) + IFA_REMREF(rt_ifa); + /* * The key is separately alloc'd so free it (see rt_setgate()). 
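 	 *
 	 * (The ordering above is deliberate: rt_parent and rt_ifa are
 	 * detached while rt_lock is still held, but their references are
 	 * only dropped after RT_UNLOCK(), since releasing them may take
 	 * other rtentry locks.  A minimal sketch of the pattern, with
 	 * rt_parent standing in for either resource:
 	 *
 	 *	RT_LOCK(rt);
 	 *	rt_parent = rt->rt_parent;	// detach under the lock
 	 *	rt->rt_parent = NULL;
 	 *	RT_UNLOCK(rt);
 	 *	if (rt_parent != NULL)
 	 *		rtfree_locked(rt_parent);	// rt_lock not held
 	 *
 	 * mirroring what the code above now does.)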
* This also frees the gateway, as they are always malloc'd @@ -960,6 +1110,11 @@ rtfree_common(struct rtentry *rt, boolean_t locked) */ R_Free(rt_key(rt)); + /* + * Free any statistics that may have been allocated + */ + nstat_route_detach(rt); + /* * and the rtentry itself of course */ @@ -1057,16 +1212,19 @@ rtsetifa(struct rtentry *rt, struct ifaddr* ifa) if (rt->rt_ifa == ifa) return; + /* Become a regular mutex, just in case */ + RT_CONVERT_LOCK(rt); + /* Release the old ifa */ if (rt->rt_ifa) - ifafree(rt->rt_ifa); + IFA_REMREF(rt->rt_ifa); /* Set rt_ifa */ rt->rt_ifa = ifa; /* Take a reference to the ifa */ if (rt->rt_ifa) - ifaref(rt->rt_ifa); + IFA_ADDREF(rt->rt_ifa); } /* @@ -1086,11 +1244,23 @@ rtredirect(struct ifnet *ifp, struct sockaddr *dst, struct sockaddr *gateway, struct rt_addrinfo info; struct ifaddr *ifa = NULL; unsigned int ifscope = (ifp != NULL) ? ifp->if_index : IFSCOPE_NONE; - struct sockaddr_in sin; + struct sockaddr_storage ss; lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED); lck_mtx_lock(rnh_lock); + /* + * Transform src into the internal routing table form for + * comparison against rt_gateway below. + */ +#if INET6 + if ((src->sa_family == AF_INET && ip_doscopedroute) || + (src->sa_family == AF_INET6 && ip6_doscopedroute)) +#else + if (src->sa_family == AF_INET && ip_doscopedroute) +#endif /* !INET6 */ + src = sa_copy(src, &ss, &ifscope); + /* * Verify the gateway is directly reachable; if scoped routing * is enabled, verify that it is reachable from the interface @@ -1106,31 +1276,29 @@ rtredirect(struct ifnet *ifp, struct sockaddr *dst, struct sockaddr *gateway, if (rt != NULL) RT_LOCK(rt); - /* Embed scope in src for comparison against rt_gateway below */ - if (ip_doscopedroute && src->sa_family == AF_INET) - src = sin_copy(SIN(src), &sin, ifscope); - /* * If the redirect isn't from our current router for this dst, * it's either old or wrong. If it redirects us to ourselves, * we have a routing loop, perhaps as a result of an interface - * going down recently. + * going down recently. Holding rnh_lock here prevents the + * possibility of rt_ifa/ifa's ifa_addr from changing (e.g. + * in_ifinit), so okay to access ifa_addr without locking. */ if (!(flags & RTF_DONE) && rt != NULL && (!equal(src, rt->rt_gateway) || !equal(rt->rt_ifa->ifa_addr, ifa->ifa_addr))) { error = EINVAL; } else { - ifafree(ifa); + IFA_REMREF(ifa); if ((ifa = ifa_ifwithaddr(gateway))) { - ifafree(ifa); + IFA_REMREF(ifa); ifa = NULL; error = EHOSTUNREACH; } } if (ifa) { - ifafree(ifa); + IFA_REMREF(ifa); ifa = NULL; } @@ -1265,25 +1433,36 @@ ifa_ifwithroute_scoped_locked(int flags, const struct sockaddr *dst, static struct ifaddr * ifa_ifwithroute_common_locked(int flags, const struct sockaddr *dst, - const struct sockaddr *gateway, unsigned int ifscope) + const struct sockaddr *gw, unsigned int ifscope) { struct ifaddr *ifa = NULL; struct rtentry *rt = NULL; - struct sockaddr_in dst_in, gw_in; + struct sockaddr_storage dst_ss, gw_ss; lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); - if (ip_doscopedroute) { - /* - * Just in case the sockaddr passed in by the caller - * contains embedded scope, make sure to clear it since - * IPv4 interface addresses aren't scoped. 
- */ - if (dst != NULL && dst->sa_family == AF_INET) - dst = sin_copy(SIN(dst), &dst_in, IFSCOPE_NONE); - if (gateway != NULL && gateway->sa_family == AF_INET) - gateway = sin_copy(SIN(gateway), &gw_in, IFSCOPE_NONE); - } + /* + * Just in case the sockaddr passed in by the caller + * contains a scope ID, make sure to clear it since + * interface addresses aren't scoped. + */ +#if INET6 + if (dst != NULL && + ((dst->sa_family == AF_INET && ip_doscopedroute) || + (dst->sa_family == AF_INET6 && ip6_doscopedroute))) +#else + if (dst != NULL && dst->sa_family == AF_INET && ip_doscopedroute) +#endif /* !INET6 */ + dst = sa_copy(SA(dst), &dst_ss, NULL); + +#if INET6 + if (gw != NULL && + ((gw->sa_family == AF_INET && ip_doscopedroute) || + (gw->sa_family == AF_INET6 && ip6_doscopedroute))) +#else + if (gw != NULL && gw->sa_family == AF_INET && ip_doscopedroute) +#endif /* !INET6 */ + gw = sa_copy(SA(gw), &gw_ss, NULL); if (!(flags & RTF_GATEWAY)) { /* @@ -1297,17 +1476,17 @@ ifa_ifwithroute_common_locked(int flags, const struct sockaddr *dst, ifa = ifa_ifwithdstaddr(dst); } if (ifa == NULL) - ifa = ifa_ifwithaddr_scoped(gateway, ifscope); + ifa = ifa_ifwithaddr_scoped(gw, ifscope); } else { /* * If we are adding a route to a remote net * or host, the gateway may still be on the * other end of a pt to pt link. */ - ifa = ifa_ifwithdstaddr(gateway); + ifa = ifa_ifwithdstaddr(gw); } if (ifa == NULL) - ifa = ifa_ifwithnet_scoped(gateway, ifscope); + ifa = ifa_ifwithnet_scoped(gw, ifscope); if (ifa == NULL) { /* Workaround to avoid gcc warning regarding const variable */ rt = rtalloc1_scoped_locked((struct sockaddr *)(size_t)dst, @@ -1315,19 +1494,27 @@ ifa_ifwithroute_common_locked(int flags, const struct sockaddr *dst, if (rt != NULL) { RT_LOCK_SPIN(rt); ifa = rt->rt_ifa; - if (ifa != NULL) - ifaref(ifa); + if (ifa != NULL) { + /* Become a regular mutex */ + RT_CONVERT_LOCK(rt); + IFA_ADDREF(ifa); + } RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); rt = NULL; } } + /* + * Holding rnh_lock here prevents the possibility of ifa from + * changing (e.g. in_ifinit), so it is safe to access its + * ifa_addr (here and down below) without locking. + */ if (ifa != NULL && ifa->ifa_addr->sa_family != dst->sa_family) { struct ifaddr *newifa; /* Callee adds reference to newifa upon success */ newifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp); if (newifa != NULL) { - ifafree(ifa); + IFA_REMREF(ifa); ifa = newifa; } } @@ -1337,18 +1524,21 @@ ifa_ifwithroute_common_locked(int flags, const struct sockaddr *dst, * that may not agree with info garnered from the interfaces. * The routing table should carry more precedence than the * interfaces in this matter. Must be careful not to stomp - * on new entries from rtinit, hence (ifa->ifa_addr != gateway). + * on new entries from rtinit, hence (ifa->ifa_addr != gw). 
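+	 *
+	 * (Aside: the paired (sa_family == AF_INET && ip_doscopedroute) /
+	 * (sa_family == AF_INET6 && ip6_doscopedroute) tests earlier in
+	 * this function recur throughout the file; a hypothetical helper,
+	 * not defined in this file, could fold them into one predicate:
+	 *
+	 *	static inline boolean_t
+	 *	sa_doscopedroute(int af)
+	 *	{
+	 *	#if INET6
+	 *		return ((af == AF_INET && ip_doscopedroute) ||
+	 *		    (af == AF_INET6 && ip6_doscopedroute));
+	 *	#else
+	 *		return (af == AF_INET && ip_doscopedroute);
+	 *	#endif
+	 *	}
+	 *
+	 * and would behave identically to the open-coded tests.)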
 	 */
 	if ((ifa == NULL ||
-	    !equal(ifa->ifa_addr, (struct sockaddr *)(size_t)gateway)) &&
-	    (rt = rtalloc1_scoped_locked((struct sockaddr *)(size_t)gateway,
+	    !equal(ifa->ifa_addr, (struct sockaddr *)(size_t)gw)) &&
+	    (rt = rtalloc1_scoped_locked((struct sockaddr *)(size_t)gw,
 	    0, 0, ifscope)) != NULL) {
 		if (ifa != NULL)
-			ifafree(ifa);
+			IFA_REMREF(ifa);
 		RT_LOCK_SPIN(rt);
 		ifa = rt->rt_ifa;
-		if (ifa != NULL)
-			ifaref(ifa);
+		if (ifa != NULL) {
+			/* Become a regular mutex */
+			RT_CONVERT_LOCK(rt);
+			IFA_ADDREF(ifa);
+		}
 		RT_REMREF_LOCKED(rt);
 		RT_UNLOCK(rt);
 	}
@@ -1359,7 +1549,7 @@ ifa_ifwithroute_common_locked(int flags, const struct sockaddr *dst,
 	 */
 	if ((flags & RTF_IFSCOPE) && ifa != NULL &&
 	    ifa->ifa_ifp->if_index != ifscope) {
-		ifafree(ifa);
+		IFA_REMREF(ifa);
 		ifa = NULL;
 	}
 
@@ -1400,7 +1590,7 @@ rtrequest_scoped_locked(int req, struct sockaddr *dst,
 * Do appropriate manipulations of a routing tree given all the bits of
 * info needed.
 *
- * Embedding the scope in the radix key is an internal job that should be
+ * Storing the scope ID in the radix key is an internal job that should be
 * left to routines in this module.  Callers should specify the scope value
 * to the "scoped" variants of route routines instead of manipulating the
 * key itself.  This is typically done when creating a scoped route, e.g.
@@ -1422,59 +1612,79 @@ rtrequest_common_locked(int req, struct sockaddr *dst0,
 	struct radix_node_head *rnh;
 	struct ifaddr *ifa = NULL;
 	struct sockaddr *ndst, *dst = dst0;
-	struct sockaddr_in sin, mask;
+	struct sockaddr_storage ss, mask;
+	struct timeval curr_calendartime;
+	int af = dst->sa_family;
+	void (*ifa_rtrequest)(int, struct rtentry *, struct sockaddr *);
+
 #define senderr(x) { error = x ; goto bad; }
 
 	lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED);
 	/*
 	 * Find the correct routing tree to use for this Address Family
 	 */
-	if ((rnh = rt_tables[dst->sa_family]) == 0)
+	if ((rnh = rt_tables[af]) == NULL)
 		senderr(ESRCH);
 	/*
 	 * If we are adding a host route then we don't want to put
 	 * a netmask in the tree
 	 */
 	if (flags & RTF_HOST)
-		netmask = 0;
+		netmask = NULL;
+
 	/*
-	 * If RTF_IFSCOPE is specified, use a local copy of the destination
-	 * address to embed the scope into.  This logic is repeated below
+	 * If Scoped Routing is enabled, use a local copy of the destination
+	 * address to store the scope ID into.  This logic is repeated below
 	 * in the RTM_RESOLVE handler since the caller does not normally
-	 * specify such a flag during a resolve; instead it passes in the
-	 * route used for cloning for which the scope info is derived from.
-	 * Note also that in the case of RTM_DELETE, the address passed in
-	 * by the caller might already contain the embedded scope info when
-	 * it is the key itself, thus making RTF_IFSCOPE unnecessary; one
-	 * instance where it is explicitly set is inside route_output()
-	 * as part of handling a routing socket request.
+	 * specify such a flag during a resolve, as well as for the handling
+	 * of IPv4 link-local addresses; instead, it passes in the route used
+	 * for cloning, from which the scope info is derived.  Note also that
+	 * in the case of RTM_DELETE, the address passed in by the caller
+	 * might already contain the scope ID info when it is the key itself,
+	 * thus making RTF_IFSCOPE unnecessary; one instance where it is
+	 * explicitly set is inside route_output() as part of handling a
+	 * routing socket request.
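+	 *
+	 * For illustration, a minimal sketch of creating such a scoped
+	 * route through the unlocked wrapper added further below; dst, gw
+	 * and mask are assumed to be caller-prepared sockaddrs for
+	 * 0.0.0.0/0 and the gateway, and ifp the interface to scope to:
+	 *
+	 *	struct rtentry *nrt = NULL;
+	 *	int error;
+	 *
+	 *	error = rtrequest_scoped(RTM_ADD, dst, gw, mask,
+	 *	    RTF_UP | RTF_GATEWAY | RTF_STATIC, &nrt,
+	 *	    ifp->if_index);
+	 *	if (error == 0 && nrt != NULL) {
+	 *		RT_LOCK(nrt);
+	 *		RT_REMREF_LOCKED(nrt);	// keep route, drop our ref
+	 *		RT_UNLOCK(nrt);
+	 *	}
+	 *
+	 * RTF_IFSCOPE need not be passed in; it is set internally whenever
+	 * the supplied ifscope is not IFSCOPE_NONE.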
*/ - if (req != RTM_RESOLVE && (flags & RTF_IFSCOPE)) { - /* Scoped routing is for AF_INET only */ - if (dst->sa_family != AF_INET || - (req == RTM_ADD && !ip_doscopedroute)) - senderr(EINVAL); +#if INET6 + if (req != RTM_RESOLVE && + ((af == AF_INET && ip_doscopedroute) || + (af == AF_INET6 && ip6_doscopedroute))) { +#else + if (req != RTM_RESOLVE && af == AF_INET && ip_doscopedroute) { +#endif /* !INET6 */ + /* Transform dst into the internal routing table form */ + dst = sa_copy(dst, &ss, &ifscope); - if (ifscope == IFSCOPE_NONE) { - flags &= ~RTF_IFSCOPE; - } else { - /* Embed ifscope into the key (local copy) */ - dst = sin_copy(SIN(dst), &sin, ifscope); + /* Transform netmask into the internal routing table form */ + if (netmask != NULL) + netmask = ma_copy(af, netmask, &mask, ifscope); - /* Embed ifscope into netmask (local copy) */ - if (netmask != NULL) - netmask = mask_copy(netmask, &mask, ifscope); - } + if (ifscope != IFSCOPE_NONE) + flags |= RTF_IFSCOPE; + } else { + if ((flags & RTF_IFSCOPE) && (af != AF_INET && af != AF_INET6)) + senderr(EINVAL); + +#if INET6 + if ((af == AF_INET && !ip_doscopedroute) || + (af == AF_INET6 && !ip6_doscopedroute)) +#else + if (af == AF_INET && !ip_doscopedroute) +#endif /* !INET6 */ + ifscope = IFSCOPE_NONE; } + if (ifscope == IFSCOPE_NONE) + flags &= ~RTF_IFSCOPE; + switch (req) { - case RTM_DELETE: + case RTM_DELETE: { + struct rtentry *gwrt = NULL; /* * Remove the item from the tree and return it. * Complain if it is not there and do no more processing. */ - if ((rn = rnh->rnh_deladdr(dst, netmask, rnh)) == 0) + if ((rn = rnh->rnh_deladdr(dst, netmask, rnh)) == NULL) senderr(ESRCH); if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) panic ("rtrequest delete"); @@ -1512,20 +1722,22 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, /* * Remove any external references we may have. - * This might result in another rtentry being freed if - * we held its last reference. */ - if (rt->rt_gwroute != NULL) { - rtfree_locked(rt->rt_gwroute); + if ((gwrt = rt->rt_gwroute) != NULL) rt->rt_gwroute = NULL; - } /* * give the protocol a chance to keep things in sync. */ - if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) - ifa->ifa_rtrequest(RTM_DELETE, rt, SA(0)); - ifa = NULL; + if ((ifa = rt->rt_ifa) != NULL) { + IFA_LOCK_SPIN(ifa); + ifa_rtrequest = ifa->ifa_rtrequest; + IFA_UNLOCK(ifa); + if (ifa_rtrequest != NULL) + ifa_rtrequest(RTM_DELETE, rt, NULL); + /* keep reference on rt_ifa */ + ifa = NULL; + } /* * one more rtentry floating around that is not @@ -1541,18 +1753,23 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, * If this is the (non-scoped) default route, clear * the interface index used for the primary ifscope. */ - if (rt_inet_default(rt, rt_key(rt))) - set_primary_ifscope(IFSCOPE_NONE); - -#if IFNET_ROUTE_REFCNT - if (rt->rt_if_ref_fn != NULL) { - rt->rt_if_ref_fn(rt->rt_ifp, -1); - rt->rt_flags &= ~RTF_IFREF; + if (rt_primary_default(rt, rt_key(rt))) { + set_primary_ifscope(rt_key(rt)->sa_family, + IFSCOPE_NONE); } -#endif /* IFNET_ROUTE_REFCNT */ + rt_clear_idleref(rt); RT_UNLOCK(rt); + /* + * This might result in another rtentry being freed if + * we held its last reference. Do this after the rtentry + * lock is dropped above, as it could lead to the same + * lock being acquired if gwrt is a clone of rt. 
+ */ + if (gwrt != NULL) + rtfree_locked(gwrt); + /* * If the caller wants it, then it can have it, * but it's up to it to free the rtentry as we won't be @@ -1566,9 +1783,9 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, rtfree_locked(rt); } break; - + } case RTM_RESOLVE: - if (ret_nrt == 0 || (rt = *ret_nrt) == 0) + if (ret_nrt == NULL || (rt = *ret_nrt) == NULL) senderr(EINVAL); /* * If cloning, we have the parent route given by the caller @@ -1581,40 +1798,55 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, * of rt_rmx. */ ifa = rt->rt_ifa; - ifaref(ifa); + IFA_ADDREF(ifa); flags = rt->rt_flags & ~(RTF_CLONING | RTF_PRCLONING | RTF_STATIC); flags |= RTF_WASCLONED; gateway = rt->rt_gateway; - if ((netmask = rt->rt_genmask) == 0) + if ((netmask = rt->rt_genmask) == NULL) flags |= RTF_HOST; - if (!ip_doscopedroute || dst->sa_family != AF_INET) +#if INET6 + if ((af != AF_INET && af != AF_INET6) || + (af == AF_INET && !ip_doscopedroute) || + (af == AF_INET6 && !ip6_doscopedroute)) +#else + if (af != AF_INET || !ip_doscopedroute) +#endif /* !INET6 */ goto makeroute; + /* * When scoped routing is enabled, cloned entries are * always scoped according to the interface portion of * the parent route. The exception to this are IPv4 * link local addresses. */ - if (!IN_LINKLOCAL(ntohl(SIN(dst)->sin_addr.s_addr))) { + if (af == AF_INET && + IN_LINKLOCAL(ntohl(SIN(dst)->sin_addr.s_addr))) { + ifscope = IFSCOPE_NONE; + flags &= ~RTF_IFSCOPE; + } else { if (flags & RTF_IFSCOPE) { - ifscope = sa_get_ifscope(rt_key(rt)); + ifscope = (af == AF_INET) ? + sin_get_ifscope(rt_key(rt)) : + sin6_get_ifscope(rt_key(rt)); } else { ifscope = rt->rt_ifp->if_index; flags |= RTF_IFSCOPE; } - } else { - ifscope = IFSCOPE_NONE; - flags &= ~RTF_IFSCOPE; + VERIFY(ifscope != IFSCOPE_NONE); } - /* Embed or clear ifscope into/from the key (local copy) */ - dst = sin_copy(SIN(dst), &sin, ifscope); + /* + * Transform dst into the internal routing table form, + * clearing out the scope ID field if ifscope isn't set. + */ + dst = sa_copy(dst, &ss, (ifscope == IFSCOPE_NONE) ? 
+ NULL : &ifscope); - /* Embed or clear ifscope into/from netmask (local copy) */ + /* Transform netmask into the internal routing table form */ if (netmask != NULL) - netmask = mask_copy(netmask, &mask, ifscope); + netmask = ma_copy(af, netmask, &mask, ifscope); goto makeroute; @@ -1631,10 +1863,13 @@ rtrequest_common_locked(int req, struct sockaddr *dst0, if (ifa == NULL) senderr(ENETUNREACH); makeroute: + getmicrotime(&curr_calendartime); if ((rt = rte_alloc()) == NULL) senderr(ENOBUFS); Bzero(rt, sizeof(*rt)); rte_lock_init(rt); + rt->base_calendartime = curr_calendartime.tv_sec; + rt->base_uptime = net_uptime(); RT_LOCK(rt); rt->rt_flags = RTF_UP | flags; @@ -1644,6 +1879,7 @@ makeroute: */ if ((error = rt_setgate(rt, dst, gateway)) != 0) { RT_UNLOCK(rt); + nstat_route_detach(rt); rte_lock_destroy(rt); rte_free(rt); senderr(error); @@ -1712,23 +1948,24 @@ makeroute: * If it still failed to go into the tree, * then un-make it (this should be a function) */ - if (rn == 0) { + if (rn == NULL) { if (rt->rt_gwroute) { rtfree_locked(rt->rt_gwroute); rt->rt_gwroute = NULL; } if (rt->rt_ifa) { - ifafree(rt->rt_ifa); + IFA_REMREF(rt->rt_ifa); rt->rt_ifa = NULL; } R_Free(rt_key(rt)); RT_UNLOCK(rt); + nstat_route_detach(rt); rte_lock_destroy(rt); rte_free(rt); senderr(EEXIST); } - rt->rt_parent = 0; + rt->rt_parent = NULL; /* * If we got here from RESOLVE, then we are cloning so clone @@ -1741,42 +1978,46 @@ makeroute: */ if (req == RTM_RESOLVE) { RT_LOCK_SPIN(*ret_nrt); - rt->rt_rmx = (*ret_nrt)->rt_rmx; /* copy metrics */ + VERIFY((*ret_nrt)->rt_expire == 0 || (*ret_nrt)->rt_rmx.rmx_expire != 0); + VERIFY((*ret_nrt)->rt_expire != 0 || (*ret_nrt)->rt_rmx.rmx_expire == 0); + rt->rt_rmx = (*ret_nrt)->rt_rmx; + rt_setexpire(rt, (*ret_nrt)->rt_expire); if ((*ret_nrt)->rt_flags & (RTF_CLONING | RTF_PRCLONING)) { rt->rt_parent = (*ret_nrt); RT_ADDREF_LOCKED(*ret_nrt); } RT_UNLOCK(*ret_nrt); -#if IFNET_ROUTE_REFCNT /* * Enable interface reference counting for unicast * cloned routes and bump up the reference count. */ if (rt->rt_parent != NULL && !(rt->rt_flags & (RTF_BROADCAST | RTF_MULTICAST))) { - rt->rt_if_ref_fn = rte_if_ref; - rt->rt_if_ref_fn(rt->rt_ifp, 1); - rt->rt_flags |= RTF_IFREF; + rt_set_idleref(rt); } -#endif /* IFNET_ROUTE_REFCNT */ } /* * if this protocol has something to add to this then * allow it to do that as well. */ - if (ifa->ifa_rtrequest) - ifa->ifa_rtrequest(req, rt, SA(ret_nrt ? *ret_nrt : 0)); - ifafree(ifa); - ifa = 0; + IFA_LOCK_SPIN(ifa); + ifa_rtrequest = ifa->ifa_rtrequest; + IFA_UNLOCK(ifa); + if (ifa_rtrequest != NULL) + ifa_rtrequest(req, rt, SA(ret_nrt ? *ret_nrt : NULL)); + IFA_REMREF(ifa); + ifa = NULL; /* * If this is the (non-scoped) default route, record * the interface index used for the primary ifscope. */ - if (rt_inet_default(rt, rt_key(rt))) - set_primary_ifscope(rt->rt_ifp->if_index); + if (rt_primary_default(rt, rt_key(rt))) { + set_primary_ifscope(rt_key(rt)->sa_family, + rt->rt_ifp->if_index); + } /* * actually return a resultant rtentry and @@ -1793,7 +2034,7 @@ makeroute: * hasn't been added to the tree yet. 
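 	 *
 	 * (On the VERIFY pair in the RTM_RESOLVE block above: rt_expire is
 	 * kept in uptime seconds while rmx_expire remains in calendar
 	 * seconds, so the two must be zero or non-zero together.  With
 	 * rt_setexpire() from rtsock.c, illustratively:
 	 *
 	 *	rt_setexpire(rt, 0);			// no expiration
 	 *	VERIFY(rt->rt_rmx.rmx_expire == 0);
 	 *	rt_setexpire(rt, net_uptime() + 60);	// expires in 60s
 	 *	VERIFY(rt->rt_rmx.rmx_expire != 0);	// calendar time
 	 *
 	 * the conversion adding base_calendartime - base_uptime.)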
*/ if (req == RTM_ADD && - !(rt->rt_flags & RTF_HOST) && rt_mask(rt) != 0) { + !(rt->rt_flags & RTF_HOST) && rt_mask(rt) != NULL) { struct rtfc_arg arg; arg.rnh = rnh; arg.rt0 = rt; @@ -1803,22 +2044,19 @@ makeroute: } else { RT_UNLOCK(rt); } + + nstat_route_new_entry(rt); break; } bad: if (ifa) - ifafree(ifa); + IFA_REMREF(ifa); return (error); } int -rtrequest( - int req, - struct sockaddr *dst, - struct sockaddr *gateway, - struct sockaddr *netmask, - int flags, - struct rtentry **ret_nrt) +rtrequest(int req, struct sockaddr *dst, struct sockaddr *gateway, + struct sockaddr *netmask, int flags, struct rtentry **ret_nrt) { int error; lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED); @@ -1827,6 +2065,21 @@ rtrequest( lck_mtx_unlock(rnh_lock); return (error); } + +int +rtrequest_scoped(int req, struct sockaddr *dst, struct sockaddr *gateway, + struct sockaddr *netmask, int flags, struct rtentry **ret_nrt, + unsigned int ifscope) +{ + int error; + lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(rnh_lock); + error = rtrequest_scoped_locked(req, dst, gateway, netmask, flags, + ret_nrt, ifscope); + lck_mtx_unlock(rnh_lock); + return (error); +} + /* * Called from rtrequest(RTM_DELETE, ...) to fix up the route's ``family'' * (i.e., the routes related to it by the operation of cloning). This @@ -2018,11 +2271,16 @@ rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) struct rtentry *gwrt; unsigned int ifscope; - ifscope = (dst->sa_family == AF_INET) ? - sa_get_ifscope(dst) : IFSCOPE_NONE; + if (dst->sa_family == AF_INET) + ifscope = sin_get_ifscope(dst); + else if (dst->sa_family == AF_INET6) + ifscope = sin6_get_ifscope(dst); + else + ifscope = IFSCOPE_NONE; RT_UNLOCK(rt); - gwrt = rtalloc1_scoped_locked(gate, 1, RTF_PRCLONING, ifscope); + gwrt = rtalloc1_scoped_locked(gate, 1, + RTF_CLONING | RTF_PRCLONING, ifscope); if (gwrt != NULL) RT_LOCK_ASSERT_NOTHELD(gwrt); RT_LOCK(rt); @@ -2082,8 +2340,10 @@ rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) * primary ifscope. Also done in rt_setif() to take care * of the non-redirect cases. */ - if (rt_inet_default(rt, dst) && rt->rt_ifp != NULL) - set_primary_ifscope(rt->rt_ifp->if_index); + if (rt_primary_default(rt, dst) && rt->rt_ifp != NULL) { + set_primary_ifscope(dst->sa_family, + rt->rt_ifp->if_index); + } /* * Tell the kernel debugger about the new default gateway @@ -2095,8 +2355,8 @@ rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) */ if ((dst->sa_family == AF_INET) && gwrt != NULL && gwrt->rt_gateway->sa_family == AF_LINK && - (gwrt->rt_ifp->if_index == get_primary_ifscope() || - get_primary_ifscope() == IFSCOPE_NONE)) + (gwrt->rt_ifp->if_index == get_primary_ifscope(AF_INET) || + get_primary_ifscope(AF_INET) == IFSCOPE_NONE)) kdp_set_gateway_mac(SDL(gwrt->rt_gateway)->sdl_data); } @@ -2142,11 +2402,16 @@ rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) * For consistency between rt_gateway and rt_key(gwrt). 
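 	 *
 	 * For example (illustrative only): if gwrt is scoped to an
 	 * interface with if_index 4, rt_key(gwrt) carries scope ID 4, and
 	 * the block below copies that same 4 into rt_gateway via
 	 * sin_set_ifscope()/sin6_set_ifscope(), so that both sockaddrs
 	 * name the same interface.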
*/ if ((rt->rt_flags & RTF_GATEWAY) && rt->rt_gwroute != NULL && - (rt->rt_gwroute->rt_flags & RTF_IFSCOPE) && - rt->rt_gateway->sa_family == AF_INET && - rt_key(rt->rt_gwroute)->sa_family == AF_INET) { - sa_set_ifscope(rt->rt_gateway, - sa_get_ifscope(rt_key(rt->rt_gwroute))); + (rt->rt_gwroute->rt_flags & RTF_IFSCOPE)) { + if (rt->rt_gateway->sa_family == AF_INET && + rt_key(rt->rt_gwroute)->sa_family == AF_INET) { + sin_set_ifscope(rt->rt_gateway, + sin_get_ifscope(rt_key(rt->rt_gwroute))); + } else if (rt->rt_gateway->sa_family == AF_INET6 && + rt_key(rt->rt_gwroute)->sa_family == AF_INET6) { + sin6_set_ifscope(rt->rt_gateway, + sin6_get_ifscope(rt_key(rt->rt_gwroute))); + } } /* @@ -2192,32 +2457,35 @@ rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, } /* - * Lookup an AF_INET scoped or non-scoped route depending on the ifscope - * value passed in by the caller (IFSCOPE_NONE implies non-scoped). + * Lookup an AF_INET/AF_INET6 scoped or non-scoped route depending on the + * ifscope value passed in by the caller (IFSCOPE_NONE implies non-scoped). */ static struct radix_node * node_lookup(struct sockaddr *dst, struct sockaddr *netmask, unsigned int ifscope) { - struct radix_node_head *rnh = rt_tables[AF_INET]; + struct radix_node_head *rnh; struct radix_node *rn; - struct sockaddr_in sin, mask; + struct sockaddr_storage ss, mask; + int af = dst->sa_family; struct matchleaf_arg ma = { ifscope }; rn_matchf_t *f = rn_match_ifscope; void *w = &ma; - if (dst->sa_family != AF_INET) + if (af != AF_INET && af != AF_INET6) return (NULL); + rnh = rt_tables[af]; + /* - * Embed ifscope into the search key; for a non-scoped - * search this will clear out any embedded scope value. + * Transform dst into the internal routing table form, + * clearing out the scope ID field if ifscope isn't set. */ - dst = sin_copy(SIN(dst), &sin, ifscope); + dst = sa_copy(dst, &ss, (ifscope == IFSCOPE_NONE) ? NULL : &ifscope); - /* Embed (or clear) ifscope into netmask */ + /* Transform netmask into the internal routing table form */ if (netmask != NULL) - netmask = mask_copy(netmask, &mask, ifscope); + netmask = ma_copy(af, netmask, &mask, ifscope); if (ifscope == IFSCOPE_NONE) f = w = NULL; @@ -2230,13 +2498,18 @@ node_lookup(struct sockaddr *dst, struct sockaddr *netmask, } /* - * Lookup the AF_INET non-scoped default route. + * Lookup the AF_INET/AF_INET6 non-scoped default route. */ static struct radix_node * -node_lookup_default(void) +node_lookup_default(int af) { - struct radix_node_head *rnh = rt_tables[AF_INET]; - return (rnh->rnh_lookup(&sin_def, NULL, rnh)); + struct radix_node_head *rnh; + + VERIFY(af == AF_INET || af == AF_INET6); + rnh = rt_tables[af]; + + return (af == AF_INET ? rnh->rnh_lookup(&sin_def, NULL, rnh) : + rnh->rnh_lookup(&sin6_def, NULL, rnh)); } /* @@ -2250,10 +2523,10 @@ node_lookup_default(void) * per-interface route instance. This permits multiple route entries having * the same destination (but not necessarily the same gateway) to exist in * the routing table; each of these entries is specific to the corresponding - * interface. This is made possible by embedding the scope value into the + * interface. This is made possible by storing the scope ID value into the * radix key, thus making each route entry unique. 
These scoped entries * exist along with the regular, non-scoped entries in the same radix tree - * for a given address family (currently AF_INET only); the scope logically + * for a given address family (AF_INET/AF_INET6); the scope logically * partitions it into multiple per-interface sub-trees. * * When a scoped route lookup is performed, the routing table is searched for @@ -2267,7 +2540,9 @@ rt_lookup(boolean_t lookup_only, struct sockaddr *dst, struct sockaddr *netmask, struct radix_node_head *rnh, unsigned int ifscope) { struct radix_node *rn0, *rn; - boolean_t dontcare = (ifscope == IFSCOPE_NONE); + boolean_t dontcare; + int af = dst->sa_family; + struct sockaddr_storage dst_ss, mask_ss; lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); @@ -2277,11 +2552,14 @@ rt_lookup(boolean_t lookup_only, struct sockaddr *dst, struct sockaddr *netmask, /* * Non-scoped route lookup. */ - if (!ip_doscopedroute || dst->sa_family != AF_INET) { - if (lookup_only) - rn = rnh->rnh_lookup(dst, netmask, rnh); - else - rn = rnh->rnh_matchaddr(dst, rnh); +#if INET6 + if ((af != AF_INET && af != AF_INET6) || + (af == AF_INET && !ip_doscopedroute) || + (af == AF_INET6 && !ip6_doscopedroute)) { +#else + if (af != AF_INET || !ip_doscopedroute) { +#endif /* !INET6 */ + rn = rnh->rnh_matchaddr(dst, rnh); /* * Don't return a root node; also, rnh_matchaddr callback @@ -2303,6 +2581,12 @@ rt_lookup(boolean_t lookup_only, struct sockaddr *dst, struct sockaddr *netmask, return (RT(rn)); } + /* Transform dst/netmask into the internal routing table form */ + dst = sa_copy(dst, &dst_ss, &ifscope); + if (netmask != NULL) + netmask = ma_copy(af, netmask, &mask_ss, ifscope); + dontcare = (ifscope == IFSCOPE_NONE); + /* * Scoped route lookup: * @@ -2316,10 +2600,13 @@ rt_lookup(boolean_t lookup_only, struct sockaddr *dst, struct sockaddr *netmask, /* * If the caller did not specify a scope, use the primary scope * derived from the system's non-scoped default route. If, for - * any reason, there is no primary interface, return what we have. + * any reason, there is no primary interface, ifscope will be + * set to IFSCOPE_NONE; if the above lookup resulted in a route, + * we'll do a more-specific search below, scoped to the interface + * of that route. */ - if (dontcare && (ifscope = get_primary_ifscope()) == IFSCOPE_NONE) - goto done; + if (dontcare) + ifscope = get_primary_ifscope(af); /* * Keep the original result if either of the following is true: @@ -2381,7 +2668,7 @@ rt_lookup(boolean_t lookup_only, struct sockaddr *dst, struct sockaddr *netmask, * as a more specific route. */ if (rn == NULL || (rn0 != NULL && - ((INET_DEFAULT(rt_key(RT(rn))) && !INET_DEFAULT(rt_key(RT(rn0)))) || + ((SA_DEFAULT(rt_key(RT(rn))) && !SA_DEFAULT(rt_key(RT(rn0)))) || (!RT_HOST(rn) && RT_HOST(rn0))))) rn = rn0; @@ -2389,23 +2676,20 @@ rt_lookup(boolean_t lookup_only, struct sockaddr *dst, struct sockaddr *netmask, * If we still don't have a route, use the non-scoped default * route as long as the interface portion satistifes the scope. */ - if (rn == NULL && (rn = node_lookup_default()) != NULL && + if (rn == NULL && (rn = node_lookup_default(af)) != NULL && RT(rn)->rt_ifp->if_index != ifscope) rn = NULL; -done: if (rn != NULL) { /* - * Manually clear RTPRF_OURS using in_validate() and + * Manually clear RTPRF_OURS using rt_validate() and * bump up the reference count after, and not before; - * we only get here for AF_INET. node_lookup() has - * done the check against RNF_ROOT, so we can be sure + * we only get here for AF_INET/AF_INET6. 
node_lookup() + * has done the check against RNF_ROOT, so we can be sure * that we're not returning a root node here. */ RT_LOCK_SPIN(RT(rn)); - if (!(RT(rn)->rt_flags & RTF_CONDEMNED)) { - if (!lookup_only) - (void) in_validate(rn); + if (rt_validate(RT(rn))) { RT_ADDREF_LOCKED(RT(rn)); RT_UNLOCK(RT(rn)); } else { @@ -2417,6 +2701,25 @@ done: return (RT(rn)); } +boolean_t +rt_validate(struct rtentry *rt) +{ + RT_LOCK_ASSERT_HELD(rt); + + if (!(rt->rt_flags & RTF_CONDEMNED)) { + int af = rt_key(rt)->sa_family; + + if (af == AF_INET) + (void) in_validate(RN(rt)); + else if (af == AF_INET6) + (void) in6_validate(RN(rt)); + } else { + rt = NULL; + } + + return (rt != NULL); +} + /* * Set up a routing table entry, normally * for an interface. @@ -2440,8 +2743,14 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) struct sockaddr *deldst; struct mbuf *m = 0; struct rtentry *nrt = 0; + u_int32_t ifa_flags; int error; + /* + * Holding rnh_lock here prevents the possibility of ifa from + * changing (e.g. in_ifinit), so it is safe to access its + * ifa_{dst}addr (here and down below) without locking. + */ dst = flags & RTF_HOST ? ifa->ifa_dstaddr : ifa->ifa_addr; /* * If it's a delete, check that if it exists, it's on the correct @@ -2513,8 +2822,11 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) /* * Do the actual request */ + IFA_LOCK_SPIN(ifa); + ifa_flags = ifa->ifa_flags; + IFA_UNLOCK(ifa); error = rtrequest_locked(cmd, dst, ifa->ifa_addr, ifa->ifa_netmask, - flags | ifa->ifa_flags, &nrt); + flags | ifa_flags, &nrt); if (m) (void) m_free(m); /* @@ -2544,6 +2856,9 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) * have already existed or something. (XXX) */ if (rt->rt_ifa != ifa) { + void (*ifa_rtrequest) + (int, struct rtentry *, struct sockaddr *); + if (!(rt->rt_ifa->ifa_ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK))) printf("rtinit: wrong ifa (%p) was (%p)\n", @@ -2553,22 +2868,31 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) * remove anything it has associated with * this route and ifaddr. */ - if (rt->rt_ifa->ifa_rtrequest) - rt->rt_ifa->ifa_rtrequest(RTM_DELETE, rt, SA(0)); + IFA_LOCK_SPIN(rt->rt_ifa); + ifa_rtrequest = rt->rt_ifa->ifa_rtrequest; + IFA_UNLOCK(rt->rt_ifa); + if (ifa_rtrequest != NULL) + ifa_rtrequest(RTM_DELETE, rt, SA(0)); /* * Set the route's ifa. */ rtsetifa(rt, ifa); -#if IFNET_ROUTE_REFCNT - /* - * Adjust route ref count for the interfaces. - */ - if (rt->rt_if_ref_fn != NULL && - rt->rt_ifp != ifa->ifa_ifp) { - rt->rt_if_ref_fn(ifa->ifa_ifp, 1); - rt->rt_if_ref_fn(rt->rt_ifp, -1); + + if (rt->rt_ifp != ifa->ifa_ifp) { + /* + * Purge any link-layer info caching. + */ + if (rt->rt_llinfo_purge != NULL) + rt->rt_llinfo_purge(rt); + /* + * Adjust route ref count for the interfaces. + */ + if (rt->rt_if_ref_fn != NULL) { + rt->rt_if_ref_fn(ifa->ifa_ifp, 1); + rt->rt_if_ref_fn(rt->rt_ifp, -1); + } } -#endif /* IFNET_ROUTE_REFCNT */ + /* * And substitute in references to the ifaddr * we are adding. @@ -2579,8 +2903,11 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) * Now ask the protocol to check if it needs * any special processing in its new form. 
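 			 *
 			 * (The IFA_LOCK_SPIN/IFA_UNLOCK sequence below
 			 * samples the ifa_rtrequest pointer rather than
 			 * invoking it with the address lock held; sketched
 			 * generically:
 			 *
 			 *	IFA_LOCK_SPIN(ifa);
 			 *	ifa_rtrequest = ifa->ifa_rtrequest;
 			 *	IFA_UNLOCK(ifa);
 			 *	if (ifa_rtrequest != NULL)
 			 *		ifa_rtrequest(RTM_ADD, rt, SA(0));
 			 *
 			 * so the callback runs without the ifa lock held.)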
*/ - if (ifa->ifa_rtrequest) - ifa->ifa_rtrequest(RTM_ADD, rt, SA(0)); + IFA_LOCK_SPIN(ifa); + ifa_rtrequest = ifa->ifa_rtrequest; + IFA_UNLOCK(ifa); + if (ifa_rtrequest != NULL) + ifa_rtrequest(RTM_ADD, rt, SA(0)); } /* * notify any listenning routing agents of the change @@ -2604,7 +2931,6 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags) u_int64_t rt_expiry(struct rtentry *rt, u_int64_t base, u_int32_t delta) { -#if IFNET_ROUTE_REFCNT u_int64_t retval; /* @@ -2619,10 +2945,29 @@ rt_expiry(struct rtentry *rt, u_int64_t base, u_int32_t delta) retval = base + MIN(rt_if_idle_expire_timeout, delta); return (retval); -#else -#pragma unused(rt) - return (base + delta); -#endif /* IFNET_ROUTE_REFCNT */ +} + +void +rt_set_idleref(struct rtentry *rt) +{ + RT_LOCK_ASSERT_HELD(rt); + + rt_clear_idleref(rt); + rt->rt_if_ref_fn = rte_if_ref; + rt->rt_if_ref_fn(rt->rt_ifp, 1); + rt->rt_flags |= RTF_IFREF; +} + +void +rt_clear_idleref(struct rtentry *rt) +{ + RT_LOCK_ASSERT_HELD(rt); + + if (rt->rt_if_ref_fn != NULL) { + rt->rt_if_ref_fn(rt->rt_ifp, -1); + rt->rt_flags &= ~RTF_IFREF; + rt->rt_if_ref_fn = NULL; + } } static void @@ -2703,7 +3048,6 @@ rte_free(struct rtentry *p) zfree(rte_zone, p); } -#if IFNET_ROUTE_REFCNT static void rte_if_ref(struct ifnet *ifp, int cnt) { @@ -2749,7 +3093,6 @@ rte_if_ref(struct ifnet *ifp, int cnt) kev_post_msg(&ev_msg); } } -#endif /* IFNET_ROUTE_REFCNT */ static inline struct rtentry * rte_alloc_debug(void) @@ -2799,3 +3142,50 @@ ctrace_record(ctrace_t *tr) bzero(tr->pc, sizeof (tr->pc)); (void) OSBacktrace(tr->pc, CTRACE_STACK_SIZE); } + +__private_extern__ void +route_copyout( + struct route *dst, + const struct route *src, + size_t length) +{ + /* Copy everything (rt, dst, flags) from ifnet */ + bcopy(src, dst, length); + + /* Hold one reference for the local copy of struct route */ + if (dst->ro_rt != NULL) + RT_ADDREF(dst->ro_rt); +} + +__private_extern__ void +route_copyin( + struct route *src, + struct route *dst, + size_t length) +{ + /* No cached route in the ifnet? */ + if (dst->ro_rt == NULL) { + /* + * Copy everything (rt, dst, flags) from ip_forward(); + * the reference to the route was held at the time + * it was allocated and is kept intact. + */ + bcopy(src, dst, length); + } else if (src->ro_rt != NULL) { + /* + * If the same, update just the ro_flags and ditch the one + * in the local copy. Else ditch the one that is currently + * cached, and cache the new route. + */ + if (dst->ro_rt == src->ro_rt) { + dst->ro_flags = src->ro_flags; + rtfree(src->ro_rt); + } else { + rtfree(dst->ro_rt); + bcopy(src, dst, length); + } + } + + /* This function consumes the reference */ + src->ro_rt = NULL; +} diff --git a/bsd/net/route.h b/bsd/net/route.h index 71eb4f8f8..47aa3f902 100644 --- a/bsd/net/route.h +++ b/bsd/net/route.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
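 *
 * (Usage note for the route_copyout()/route_copyin() pair added at the end
 * of route.c above, as a minimal sketch; the cache and its lock are
 * assumptions, names illustrative only:
 *
 *	struct route ro;
 *
 *	lck_mtx_lock(cache_lock);
 *	route_copyout(&ro, &cached_route, sizeof (ro));	// borrow + ref
 *	lck_mtx_unlock(cache_lock);
 *	// ... use ro.ro_rt without holding cache_lock ...
 *	lck_mtx_lock(cache_lock);
 *	route_copyin(&ro, &cached_route, sizeof (ro));	// return the ref
 *	lck_mtx_unlock(cache_lock);
 *
 * route_copyin() consumes the reference held by the local copy.)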
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -95,6 +95,15 @@ struct route { #define ROF_SRCIF_SELECTED 0x1 /* source interface was selected */ +/* + * Route reachability info (private) + */ +struct rt_reach_info { + u_int32_t ri_refcnt; /* reference count */ + u_int32_t ri_probes; /* total # of probes */ + u_int64_t ri_snd_expire; /* transmit expiration (calendar) time */ + u_int64_t ri_rcv_expire; /* receive expiration (calendar) time */ +}; #else struct route; #endif /* PRIVATE */ @@ -159,6 +168,9 @@ struct rtentry { struct ifaddr *rt_ifa; /* the answer: interface addr to use */ struct sockaddr *rt_genmask; /* for generation of cloned routes */ void *rt_llinfo; /* pointer to link level info cache */ + void (*rt_llinfo_get_ri) /* llinfo get reachability info fn */ + (struct rtentry *, struct rt_reach_info *); + void (*rt_llinfo_purge)(struct rtentry *); /* llinfo purge fn */ void (*rt_llinfo_free)(void *); /* link level info free function */ struct rt_metrics rt_rmx; /* metrics used by rx'ing protocols */ struct rtentry *rt_gwroute; /* implied entry for gatewayed routes */ @@ -168,10 +180,15 @@ struct rtentry { * See bsd/net/route.c for synchronization notes. */ decl_lck_mtx_data(, rt_lock); /* lock for routing entry */ -#if IFNET_ROUTE_REFCNT + struct nstat_counts *rt_stats; void (*rt_if_ref_fn)(struct ifnet *, int); /* interface ref func */ -#endif /* IFNET_ROUTE_REFCNT */ + + uint64_t rt_expire; /* expiration time in uptime seconds */ + uint64_t base_calendartime; /* calendar time upon entry creation */ + uint64_t base_uptime;/* uptime upon entry creation */ }; + +extern void rt_setexpire(struct rtentry *, uint64_t); #endif /* KERNEL_PRIVATE */ #ifdef KERNEL_PRIVATE @@ -251,6 +268,27 @@ struct rt_msghdr2 { struct rt_metrics rtm_rmx; /* metrics themselves */ }; +#ifdef PRIVATE +/* + * Extended routing message header (private). + */ +struct rt_msghdr_ext { + u_short rtm_msglen; /* to skip over non-understood messages */ + u_char rtm_version; /* future binary compatibility */ + u_char rtm_type; /* message type */ + u_int32_t rtm_index; /* index for associated ifp */ + u_int32_t rtm_flags; /* flags, incl. kern & message, e.g. DONE */ + u_int32_t rtm_reserved; /* for future use */ + u_int32_t rtm_addrs; /* bitmask identifying sockaddrs in msg */ + pid_t rtm_pid; /* identify sender */ + int rtm_seq; /* for sender to identify action */ + int rtm_errno; /* why failed */ + u_int32_t rtm_use; /* from rtentry */ + u_int32_t rtm_inits; /* which metrics we are initializing */ + struct rt_metrics rtm_rmx; /* metrics themselves */ + struct rt_reach_info rtm_ri; /* route reachability info */ +}; +#endif /* PRIVATE */ #define RTM_VERSION 5 /* Up the ante and ignore older versions */ @@ -279,6 +317,9 @@ struct rt_msghdr2 { #define RTM_IFINFO2 0x12 /* */ #define RTM_NEWMADDR2 0x13 /* */ #define RTM_GET2 0x14 /* */ +#ifdef PRIVATE +#define RTM_GET_EXT 0x15 +#endif /* PRIVATE */ /* * Bitmask values for rtm_inits and rmx_locks. 
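 *
 * (On the private rt_msghdr_ext above: it mirrors rt_msghdr but appends a
 * struct rt_reach_info, so a privileged client receiving an RTM_GET_EXT
 * reply could, as a sketch, check transmit-direction reachability with
 * something like:
 *
 *	struct rt_msghdr_ext *ertm = ...;	// received message
 *	u_int64_t now = ...;			// calendar time, seconds
 *
 *	if (ertm->rtm_ri.ri_snd_expire != 0 &&
 *	    now < ertm->rtm_ri.ri_snd_expire)
 *		;	// recently confirmed reachable on transmit
 *
 * the interpretation of the expiration fields here is an assumption based
 * on the field comments in rt_reach_info.)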
@@ -445,18 +486,16 @@ extern void rt_missmsg(int, struct rt_addrinfo *, int, int); extern void rt_newaddrmsg(int, struct ifaddr *, int, struct rtentry *); extern void rt_newmaddrmsg(int, struct ifmultiaddr *); extern int rt_setgate(struct rtentry *, struct sockaddr *, struct sockaddr *); -extern void set_primary_ifscope(unsigned int); -extern unsigned int get_primary_ifscope(void); -extern boolean_t rt_inet_default(struct rtentry *, struct sockaddr *); +extern void set_primary_ifscope(int, unsigned int); +extern unsigned int get_primary_ifscope(int); +extern boolean_t rt_primary_default(struct rtentry *, struct sockaddr *); extern struct rtentry *rt_lookup(boolean_t, struct sockaddr *, struct sockaddr *, struct radix_node_head *, unsigned int); extern void rtalloc(struct route *); +extern void rtalloc_scoped(struct route *, unsigned int); extern void rtalloc_ign(struct route *, uint32_t); -extern void rtalloc_ign_locked(struct route *, uint32_t); extern void rtalloc_scoped_ign(struct route *, uint32_t, unsigned int); -extern void rtalloc_scoped_ign_locked(struct route *, uint32_t, unsigned int); extern struct rtentry *rtalloc1(struct sockaddr *, int, uint32_t); -extern struct rtentry *rtalloc1_locked(struct sockaddr *, int, uint32_t); extern struct rtentry *rtalloc1_scoped(struct sockaddr *, int, uint32_t, unsigned int); extern struct rtentry *rtalloc1_scoped_locked(struct sockaddr *, int, @@ -478,19 +517,30 @@ extern void rtredirect(struct ifnet *, struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct sockaddr *, struct rtentry **); extern int rtrequest(int, struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct rtentry **); +extern int rtrequest_scoped(int, struct sockaddr *, struct sockaddr *, + struct sockaddr *, int, struct rtentry **, unsigned int); extern int rtrequest_locked(int, struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct rtentry **); extern int rtrequest_scoped_locked(int, struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct rtentry **, unsigned int); -extern unsigned int sa_get_ifscope(struct sockaddr *); +extern void sin_set_ifscope(struct sockaddr *, unsigned int); +extern unsigned int sin_get_ifscope(struct sockaddr *); +extern unsigned int sin6_get_ifscope(struct sockaddr *); extern void rt_lock(struct rtentry *, boolean_t); extern void rt_unlock(struct rtentry *); -extern struct sockaddr *rtm_scrub_ifscope(int, struct sockaddr *, +extern struct sockaddr *rtm_scrub_ifscope(int, int, struct sockaddr *, struct sockaddr *, struct sockaddr_storage *); extern u_int64_t rt_expiry(struct rtentry *, u_int64_t, u_int32_t); -#if IFNET_ROUTE_REFCNT +extern void rt_set_idleref(struct rtentry *); +extern void rt_clear_idleref(struct rtentry *); extern void rt_aggdrain(int); -#endif /* IFNET_ROUTE_REFCNT */ +extern boolean_t rt_validate(struct rtentry *); + +#ifdef XNU_KERNEL_PRIVATE +extern void route_copyin(struct route *src, struct route *dst, size_t length); +extern void route_copyout(struct route *dst, const struct route *src, size_t length); +#endif /* XNU_KERNEL_PRIVATE */ + #endif /* KERNEL_PRIVATE */ #endif diff --git a/bsd/net/rtsock.c b/bsd/net/rtsock.c index 819f8349c..42b20064a 100644 --- a/bsd/net/rtsock.c +++ b/bsd/net/rtsock.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -73,6 +73,7 @@ #include <sys/domain.h> #include <sys/protosw.h> #include <sys/syslog.h> +#include <sys/mcache.h> #include <kern/lock.h> #include <net/if.h> @@ -95,7 +96,6 @@ MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables"); static struct sockaddr route_dst = { 2, PF_ROUTE, { 0, } }; static struct sockaddr route_src = { 2, PF_ROUTE, { 0, } }; static struct sockaddr sa_zero = { sizeof(sa_zero), AF_INET, { 0, } }; -static struct sockproto route_proto = { PF_ROUTE, 0 }; struct walkarg { int w_tmemsize; @@ -108,30 +108,21 @@ static struct mbuf *rt_msg1(int, struct rt_addrinfo *); static int rt_msg2(int, struct rt_addrinfo *, caddr_t, struct walkarg *); static int rt_xaddrs(caddr_t, caddr_t, struct rt_addrinfo *); static int sysctl_dumpentry(struct radix_node *rn, void *vw); +static int sysctl_dumpentry_ext(struct radix_node *rn, void *vw); static int sysctl_iflist(int af, struct walkarg *w); static int sysctl_iflist2(int af, struct walkarg *w); -static int route_output(struct mbuf *, struct socket *); -static void rt_setmetrics(u_int32_t, struct rt_metrics *, struct rt_metrics *); +static int route_output(struct mbuf *, struct socket *); +static void rt_setmetrics(u_int32_t, struct rt_metrics *, struct rtentry *); +static void rt_getmetrics(struct rtentry *, struct rt_metrics *); static void rt_setif(struct rtentry *, struct sockaddr *, struct sockaddr *, struct sockaddr *, unsigned int); -#if IFNET_ROUTE_REFCNT static void rt_drainall(void); -#endif /* IFNET_ROUTE_REFCNT */ #define SIN(sa) ((struct sockaddr_in *)(size_t)(sa)) -/* Sleazy use of local variables throughout file, warning!!!! */ -#define dst info.rti_info[RTAX_DST] -#define gate info.rti_info[RTAX_GATEWAY] -#define netmask info.rti_info[RTAX_NETMASK] -#define genmask info.rti_info[RTAX_GENMASK] -#define ifpaddr info.rti_info[RTAX_IFP] -#define ifaaddr info.rti_info[RTAX_IFA] -#define brdaddr info.rti_info[RTAX_BRD] SYSCTL_NODE(_net, OID_AUTO, idle, CTLFLAG_RW, 0, "idle network monitoring"); -#if IFNET_ROUTE_REFCNT static struct timeval last_ts; SYSCTL_NODE(_net_idle, OID_AUTO, route, CTLFLAG_RW, 0, "idle route monitoring"); @@ -140,7 +131,15 @@ static int rt_if_idle_drain_interval = RT_IF_IDLE_DRAIN_INTERVAL; SYSCTL_INT(_net_idle_route, OID_AUTO, drain_interval, CTLFLAG_RW, &rt_if_idle_drain_interval, 0, "Default interval for draining " "routes when doing interface idle reference counting."); -#endif /* IFNET_ROUTE_REFCNT */ + +/* + * This macro calculates skew in wall clock, just in case the user changes the + * system time. This skew adjustment is required because we now keep the route + * expiration times in uptime terms in the kernel, but the userland still + * expects expiration times in terms of calendar times. 
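+ *
+ * Worked example (illustrative numbers): suppose a route was created at
+ * calendar time ic = 1000000000 with uptime iu = 500, and later the clock
+ * reads cc.tv_sec = 1000003700 while uptime is cu = 600.  Uptime says only
+ * 100s actually elapsed, so 3600s of the calendar delta is skew:
+ *
+ *	(cc.tv_sec - ic) - (cu - iu) == 3700 - 100 == 3600
+ *
+ * and the caller adds that 3600 to base_calendartime so that the
+ * calendar-based rmx_expire values keep lining up with rt_expire, which is
+ * kept in uptime terms.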
+ */ +#define CALCULATE_CLOCKSKEW(cc, ic, cu, iu)\ + ((cc.tv_sec - ic) - (cu - iu)) /* * It really doesn't make any sense at all for this code to share much @@ -322,6 +321,7 @@ route_output(struct mbuf *m, struct socket *so) struct radix_node_head *rnh; struct rt_addrinfo info; int len, error = 0; + sa_family_t dst_sa_family = 0; struct ifnet *ifp = NULL; #ifndef __APPLE__ struct proc *curproc = current_proc(); @@ -344,17 +344,17 @@ route_output(struct mbuf *m, struct socket *so) len = m->m_pkthdr.len; if (len < sizeof(*rtm) || len != mtod(m, struct rt_msghdr *)->rtm_msglen) { - dst = NULL; + info.rti_info[RTAX_DST] = NULL; senderr(EINVAL); } R_Malloc(rtm, struct rt_msghdr *, len); if (rtm == NULL) { - dst = NULL; + info.rti_info[RTAX_DST] = NULL; senderr(ENOBUFS); } m_copydata(m, 0, len, (caddr_t)rtm); if (rtm->rtm_version != RTM_VERSION) { - dst = NULL; + info.rti_info[RTAX_DST] = NULL; senderr(EPROTONOSUPPORT); } @@ -374,51 +374,52 @@ route_output(struct mbuf *m, struct socket *so) * may perform operations other than RTM_GET */ if (rtm->rtm_type != RTM_GET && (so->so_state & SS_PRIV) == 0) { - dst = NULL; + info.rti_info[RTAX_DST] = NULL; senderr(EPERM); } rtm->rtm_pid = proc_selfpid(); info.rti_addrs = rtm->rtm_addrs; if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info)) { - dst = NULL; + info.rti_info[RTAX_DST] = NULL; senderr(EINVAL); } - if (dst == NULL || (dst->sa_family >= AF_MAX) || - (gate != NULL && (gate->sa_family >= AF_MAX))) { + if (info.rti_info[RTAX_DST] == NULL || (info.rti_info[RTAX_DST]->sa_family >= AF_MAX) || + (info.rti_info[RTAX_GATEWAY] != NULL && (info.rti_info[RTAX_GATEWAY]->sa_family >= AF_MAX))) { senderr(EINVAL); } - if (dst->sa_family == AF_INET && dst->sa_len != sizeof (dst_in)) { + if (info.rti_info[RTAX_DST]->sa_family == AF_INET && info.rti_info[RTAX_DST]->sa_len != sizeof (dst_in)) { /* At minimum, we need up to sin_addr */ - if (dst->sa_len < offsetof(struct sockaddr_in, sin_zero)) + if (info.rti_info[RTAX_DST]->sa_len < offsetof(struct sockaddr_in, sin_zero)) senderr(EINVAL); bzero(&dst_in, sizeof (dst_in)); dst_in.sin_len = sizeof (dst_in); dst_in.sin_family = AF_INET; - dst_in.sin_port = SIN(dst)->sin_port; - dst_in.sin_addr = SIN(dst)->sin_addr; - dst = (struct sockaddr *)&dst_in; + dst_in.sin_port = SIN(info.rti_info[RTAX_DST])->sin_port; + dst_in.sin_addr = SIN(info.rti_info[RTAX_DST])->sin_addr; + info.rti_info[RTAX_DST] = (struct sockaddr *)&dst_in; + dst_sa_family = info.rti_info[RTAX_DST]->sa_family; } - if (gate != NULL && - gate->sa_family == AF_INET && gate->sa_len != sizeof (gate_in)) { + if (info.rti_info[RTAX_GATEWAY] != NULL && + info.rti_info[RTAX_GATEWAY]->sa_family == AF_INET && info.rti_info[RTAX_GATEWAY]->sa_len != sizeof (gate_in)) { /* At minimum, we need up to sin_addr */ - if (gate->sa_len < offsetof(struct sockaddr_in, sin_zero)) + if (info.rti_info[RTAX_GATEWAY]->sa_len < offsetof(struct sockaddr_in, sin_zero)) senderr(EINVAL); bzero(&gate_in, sizeof (gate_in)); gate_in.sin_len = sizeof (gate_in); gate_in.sin_family = AF_INET; - gate_in.sin_port = SIN(gate)->sin_port; - gate_in.sin_addr = SIN(gate)->sin_addr; - gate = (struct sockaddr *)&gate_in; + gate_in.sin_port = SIN(info.rti_info[RTAX_GATEWAY])->sin_port; + gate_in.sin_addr = SIN(info.rti_info[RTAX_GATEWAY])->sin_addr; + info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gate_in; } - if (genmask) { + if (info.rti_info[RTAX_GENMASK]) { struct radix_node *t; - t = rn_addmask((caddr_t)genmask, 0, 1); - if (t && Bcmp(genmask, t->rn_key, *(u_char *)genmask) == 0) - 
genmask = (struct sockaddr *)(t->rn_key); + t = rn_addmask((caddr_t)info.rti_info[RTAX_GENMASK], 0, 1); + if (t && Bcmp(info.rti_info[RTAX_GENMASK], t->rn_key, *(u_char *)info.rti_info[RTAX_GENMASK]) == 0) + info.rti_info[RTAX_GENMASK] = (struct sockaddr *)(t->rn_key); else senderr(ENOBUFS); } @@ -427,16 +428,27 @@ route_output(struct mbuf *m, struct socket *so) * If RTF_IFSCOPE flag is set, then rtm_index specifies the scope. */ if (rtm->rtm_flags & RTF_IFSCOPE) { - /* Scoped routing is for AF_INET only */ - if (dst->sa_family != AF_INET) + if (info.rti_info[RTAX_DST]->sa_family != AF_INET && info.rti_info[RTAX_DST]->sa_family != AF_INET6) senderr(EINVAL); ifscope = rtm->rtm_index; } + /* + * For AF_INET, always zero out the embedded scope ID. If this is + * a scoped request, it must be done explicitly by setting RTF_IFSCOPE + * flag and the corresponding rtm_index value. This is to prevent + * false interpretation of the scope ID because it's using the sin_zero + * field, which might not be properly cleared by the requestor. + */ + if (info.rti_info[RTAX_DST]->sa_family == AF_INET) + sin_set_ifscope(info.rti_info[RTAX_DST], IFSCOPE_NONE); + if (info.rti_info[RTAX_GATEWAY] != NULL && info.rti_info[RTAX_GATEWAY]->sa_family == AF_INET) + sin_set_ifscope(info.rti_info[RTAX_GATEWAY], IFSCOPE_NONE); + switch (rtm->rtm_type) { case RTM_ADD: - if (gate == NULL) + if (info.rti_info[RTAX_GATEWAY] == NULL) senderr(EINVAL); #ifdef __APPLE__ @@ -454,21 +466,21 @@ route_output(struct mbuf *m, struct socket *so) { #define satosinaddr(sa) (((struct sockaddr_in *)sa)->sin_addr.s_addr) - if (check_routeselfref && (dst && dst->sa_family == AF_INET) && - (netmask && satosinaddr(netmask) == INADDR_BROADCAST) && - (gate && satosinaddr(dst) == satosinaddr(gate))) { + if (check_routeselfref && (info.rti_info[RTAX_DST] && info.rti_info[RTAX_DST]->sa_family == AF_INET) && + (info.rti_info[RTAX_NETMASK] && satosinaddr(info.rti_info[RTAX_NETMASK]) == INADDR_BROADCAST) && + (info.rti_info[RTAX_GATEWAY] && satosinaddr(info.rti_info[RTAX_DST]) == satosinaddr(info.rti_info[RTAX_GATEWAY]))) { log(LOG_WARNING, "route_output: circular route %ld.%ld.%ld.%ld/32 ignored\n", - (ntohl(satosinaddr(gate)>>24))&0xff, - (ntohl(satosinaddr(gate)>>16))&0xff, - (ntohl(satosinaddr(gate)>>8))&0xff, - (ntohl(satosinaddr(gate)))&0xff); + (ntohl(satosinaddr(info.rti_info[RTAX_GATEWAY])>>24))&0xff, + (ntohl(satosinaddr(info.rti_info[RTAX_GATEWAY])>>16))&0xff, + (ntohl(satosinaddr(info.rti_info[RTAX_GATEWAY])>>8))&0xff, + (ntohl(satosinaddr(info.rti_info[RTAX_GATEWAY])))&0xff); senderr(EINVAL); } } #endif - error = rtrequest_scoped_locked(RTM_ADD, dst, gate, - netmask, rtm->rtm_flags, &saved_nrt, ifscope); + error = rtrequest_scoped_locked(RTM_ADD, info.rti_info[RTAX_DST], info.rti_info[RTAX_GATEWAY], + info.rti_info[RTAX_NETMASK], rtm->rtm_flags, &saved_nrt, ifscope); if (error == 0 && saved_nrt) { RT_LOCK(saved_nrt); #ifdef __APPLE__ @@ -499,24 +511,24 @@ route_output(struct mbuf *m, struct socket *so) * rarely encountered. 
* dwiggins@bbn.com */ - - rt_setif(saved_nrt, ifpaddr, ifaaddr, gate, + + rt_setif(saved_nrt, info.rti_info[RTAX_IFP], info.rti_info[RTAX_IFA], info.rti_info[RTAX_GATEWAY], ifscope); #endif rt_setmetrics(rtm->rtm_inits, - &rtm->rtm_rmx, &saved_nrt->rt_rmx); + &rtm->rtm_rmx, saved_nrt); saved_nrt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits); saved_nrt->rt_rmx.rmx_locks |= - (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks); - saved_nrt->rt_genmask = genmask; + (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks); + saved_nrt->rt_genmask = info.rti_info[RTAX_GENMASK]; RT_REMREF_LOCKED(saved_nrt); RT_UNLOCK(saved_nrt); } break; case RTM_DELETE: - error = rtrequest_scoped_locked(RTM_DELETE, dst, - gate, netmask, rtm->rtm_flags, &saved_nrt, ifscope); + error = rtrequest_scoped_locked(RTM_DELETE, info.rti_info[RTAX_DST], + info.rti_info[RTAX_GATEWAY], info.rti_info[RTAX_NETMASK], rtm->rtm_flags, &saved_nrt, ifscope); if (error == 0) { rt = saved_nrt; RT_LOCK(rt); @@ -527,18 +539,23 @@ route_output(struct mbuf *m, struct socket *so) case RTM_GET: case RTM_CHANGE: case RTM_LOCK: - if ((rnh = rt_tables[dst->sa_family]) == NULL) + if ((rnh = rt_tables[info.rti_info[RTAX_DST]->sa_family]) == NULL) senderr(EAFNOSUPPORT); /* * Lookup the best match based on the key-mask pair; * callee adds a reference and checks for root node. */ - rt = rt_lookup(TRUE, dst, netmask, rnh, ifscope); + rt = rt_lookup(TRUE, info.rti_info[RTAX_DST], info.rti_info[RTAX_NETMASK], rnh, ifscope); if (rt == NULL) senderr(ESRCH); RT_LOCK(rt); + /* + * Holding rnh_lock here prevents the possibility of + * ifa from changing (e.g. in_ifinit), so it is safe + * to access its ifa_addr (down below) without locking. + */ switch(rtm->rtm_type) { case RTM_GET: { @@ -546,52 +563,63 @@ route_output(struct mbuf *m, struct socket *so) report: ifa2 = NULL; RT_LOCK_ASSERT_HELD(rt); - dst = rt_key(rt); - gate = rt->rt_gateway; - netmask = rt_mask(rt); - genmask = rt->rt_genmask; + info.rti_info[RTAX_DST] = rt_key(rt); + dst_sa_family = info.rti_info[RTAX_DST]->sa_family; + info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; + info.rti_info[RTAX_NETMASK] = rt_mask(rt); + info.rti_info[RTAX_GENMASK] = rt->rt_genmask; if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) { ifp = rt->rt_ifp; if (ifp) { ifnet_lock_shared(ifp); - ifa2 = ifp->if_addrhead.tqh_first; - ifpaddr = ifa2->ifa_addr; - ifaref(ifa2); + ifa2 = ifp->if_lladdr; + info.rti_info[RTAX_IFP] = ifa2->ifa_addr; + IFA_ADDREF(ifa2); ifnet_lock_done(ifp); - ifaaddr = rt->rt_ifa->ifa_addr; + info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; rtm->rtm_index = ifp->if_index; } else { - ifpaddr = 0; - ifaaddr = 0; + info.rti_info[RTAX_IFP] = NULL; + info.rti_info[RTAX_IFA] = NULL; } + } else if ((ifp = rt->rt_ifp) != NULL) { + rtm->rtm_index = ifp->if_index; } + if (ifa2 != NULL) + IFA_LOCK(ifa2); len = rt_msg2(rtm->rtm_type, &info, (caddr_t)0, (struct walkarg *)0); + if (ifa2 != NULL) + IFA_UNLOCK(ifa2); if (len > rtm->rtm_msglen) { struct rt_msghdr *new_rtm; R_Malloc(new_rtm, struct rt_msghdr *, len); if (new_rtm == 0) { RT_UNLOCK(rt); if (ifa2 != NULL) - ifafree(ifa2); + IFA_REMREF(ifa2); senderr(ENOBUFS); } Bcopy(rtm, new_rtm, rtm->rtm_msglen); R_Free(rtm); rtm = new_rtm; } + if (ifa2 != NULL) + IFA_LOCK(ifa2); (void)rt_msg2(rtm->rtm_type, &info, (caddr_t)rtm, (struct walkarg *)0); + if (ifa2 != NULL) + IFA_UNLOCK(ifa2); rtm->rtm_flags = rt->rt_flags; - rtm->rtm_rmx = rt->rt_rmx; + rt_getmetrics(rt, &rtm->rtm_rmx); rtm->rtm_addrs = info.rti_addrs; if (ifa2 != NULL) - ifafree(ifa2); + IFA_REMREF(ifa2); } break; case RTM_CHANGE: - if 
(gate && (error = rt_setgate(rt, - rt_key(rt), gate))) { + if (info.rti_info[RTAX_GATEWAY] && (error = rt_setgate(rt, + rt_key(rt), info.rti_info[RTAX_GATEWAY]))) { RT_UNLOCK(rt); senderr(error); } @@ -602,8 +630,8 @@ route_output(struct mbuf *m, struct socket *so) * flags on the default route without changing the * default gateway. Changing flags still doesn't work. */ - if ((rt->rt_flags & RTF_GATEWAY) && !gate) - gate = rt->rt_gateway; + if ((rt->rt_flags & RTF_GATEWAY) && !info.rti_info[RTAX_GATEWAY]) + info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; #ifdef __APPLE__ /* @@ -611,19 +639,19 @@ route_output(struct mbuf *m, struct socket *so) * equivalent to the code found at this very spot * in BSD. */ - rt_setif(rt, ifpaddr, ifaaddr, gate, + rt_setif(rt, info.rti_info[RTAX_IFP], info.rti_info[RTAX_IFA], info.rti_info[RTAX_GATEWAY], ifscope); #endif rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx, - &rt->rt_rmx); + rt); #ifndef __APPLE__ /* rt_setif, called above does this for us on darwin */ if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest) - rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, gate); + rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, info.rti_info[RTAX_GATEWAY]); #endif - if (genmask) - rt->rt_genmask = genmask; + if (info.rti_info[RTAX_GENMASK]) + rt->rt_genmask = info.rti_info[RTAX_GENMASK]; /* * Fall into */ @@ -684,10 +712,11 @@ flush: if (error) return error; } else { + struct sockproto route_proto = {PF_ROUTE, 0}; if (rp) rp->rcb_proto.sp_family = 0; /* Avoid us */ - if (dst) - route_proto.sp_protocol = dst->sa_family; + if (dst_sa_family != 0) + route_proto.sp_protocol = dst_sa_family; if (m) { socket_unlock(so, 0); raw_input(m, &route_proto, &route_src, &route_dst); @@ -700,10 +729,28 @@ flush: return (error); } +void +rt_setexpire(struct rtentry *rt, uint64_t expiry) +{ + /* set both rt_expire and rmx_expire */ + rt->rt_expire = expiry; + if (expiry) { + rt->rt_rmx.rmx_expire = expiry + rt->base_calendartime - + rt->base_uptime; + } else + rt->rt_rmx.rmx_expire = 0; +} + static void -rt_setmetrics(u_int32_t which, struct rt_metrics *in, struct rt_metrics *out) +rt_setmetrics(u_int32_t which, struct rt_metrics *in, struct rtentry *out) { -#define metric(f, e) if (which & (f)) out->e = in->e; + struct timeval curr_calendar_time; + uint64_t curr_uptime; + + getmicrotime(&curr_calendar_time); + curr_uptime = net_uptime(); + +#define metric(f, e) if (which & (f)) out->rt_rmx.e = in->e; metric(RTV_RPIPE, rmx_recvpipe); metric(RTV_SPIPE, rmx_sendpipe); metric(RTV_SSTHRESH, rmx_ssthresh); @@ -713,17 +760,65 @@ rt_setmetrics(u_int32_t which, struct rt_metrics *in, struct rt_metrics *out) metric(RTV_MTU, rmx_mtu); metric(RTV_EXPIRE, rmx_expire); #undef metric + + if (out->rt_rmx.rmx_expire > 0) { + /* account for system time change */ + curr_uptime = net_uptime(); + getmicrotime(&curr_calendar_time); + out->base_calendartime += + CALCULATE_CLOCKSKEW(curr_calendar_time, + out->base_calendartime, + curr_uptime, out->base_uptime); + rt_setexpire(out, + out->rt_rmx.rmx_expire - + out->base_calendartime + + out->base_uptime); + } else { + rt_setexpire(out, 0); + } + + VERIFY(out->rt_expire == 0 || out->rt_rmx.rmx_expire != 0); + VERIFY(out->rt_expire != 0 || out->rt_rmx.rmx_expire == 0); +} + +static void +rt_getmetrics(struct rtentry *in, struct rt_metrics *out) +{ + struct timeval curr_calendar_time; + uint64_t curr_uptime; + + VERIFY(in->rt_expire == 0 || in->rt_rmx.rmx_expire != 0); + VERIFY(in->rt_expire != 0 || in->rt_rmx.rmx_expire == 0); + + *out = in->rt_rmx; + + if (in->rt_expire) { + /* account for 
system time change */ + getmicrotime(&curr_calendar_time); + curr_uptime = net_uptime(); + + in->base_calendartime += + CALCULATE_CLOCKSKEW(curr_calendar_time, + in->base_calendartime, + curr_uptime, in->base_uptime); + + out->rmx_expire = in->base_calendartime + + in->rt_expire - in->base_uptime; + } else + out->rmx_expire = 0; } /* - * Set route's interface given ifpaddr, ifaaddr, and gateway. + * Set route's interface given info.rti_info[RTAX_IFP], info.rti_info[RTAX_IFA], and gateway. */ static void rt_setif(struct rtentry *rt, struct sockaddr *Ifpaddr, struct sockaddr *Ifaaddr, struct sockaddr *Gate, unsigned int ifscope) { - struct ifaddr *ifa = 0; - struct ifnet *ifp = 0; + struct ifaddr *ifa = NULL; + struct ifnet *ifp = NULL; + void (*ifa_rtrequest) + (int, struct rtentry *, struct sockaddr *); lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); @@ -740,6 +835,9 @@ rt_setif(struct rtentry *rt, struct sockaddr *Ifpaddr, struct sockaddr *Ifaaddr, /* Add an extra ref for ourselves */ RT_ADDREF_LOCKED(rt); + /* Become a regular mutex, just in case */ + RT_CONVERT_LOCK(rt); + /* * New gateway could require new ifaddr, ifp; flags may also * be different; ifp may be specified by ll sockaddr when @@ -747,11 +845,11 @@ rt_setif(struct rtentry *rt, struct sockaddr *Ifpaddr, struct sockaddr *Ifaaddr, */ if (Ifpaddr && (ifa = ifa_ifwithnet_scoped(Ifpaddr, ifscope)) && (ifp = ifa->ifa_ifp) && (Ifaaddr || Gate)) { - ifafree(ifa); + IFA_REMREF(ifa); ifa = ifaof_ifpforaddr(Ifaaddr ? Ifaaddr : Gate, ifp); } else { if (ifa) { - ifafree(ifa); + IFA_REMREF(ifa); ifa = 0; } if (Ifpaddr && (ifp = if_withname(Ifpaddr)) ) { @@ -761,7 +859,7 @@ rt_setif(struct rtentry *rt, struct sockaddr *Ifpaddr, struct sockaddr *Ifaaddr, ifnet_lock_shared(ifp); ifa = TAILQ_FIRST(&ifp->if_addrhead); if (ifa != NULL) - ifaref(ifa); + IFA_ADDREF(ifa); ifnet_lock_done(ifp); } } else if (Ifaaddr && @@ -783,7 +881,7 @@ rt_setif(struct rtentry *rt, struct sockaddr *Ifpaddr, struct sockaddr *Ifaaddr, /* Don't update a defunct route */ if (rt->rt_flags & RTF_CONDEMNED) { if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); /* Release extra ref */ RT_REMREF_LOCKED(rt); return; @@ -793,39 +891,63 @@ rt_setif(struct rtentry *rt, struct sockaddr *Ifpaddr, struct sockaddr *Ifaaddr, if (ifa) { struct ifaddr *oifa = rt->rt_ifa; if (oifa != ifa) { - if (oifa && oifa->ifa_rtrequest) - oifa->ifa_rtrequest(RTM_DELETE, rt, Gate); + if (oifa != NULL) { + IFA_LOCK_SPIN(oifa); + ifa_rtrequest = oifa->ifa_rtrequest; + IFA_UNLOCK(oifa); + if (ifa_rtrequest != NULL) + ifa_rtrequest(RTM_DELETE, rt, Gate); + } rtsetifa(rt, ifa); -#if IFNET_ROUTE_REFCNT - /* - * Adjust route ref count for the interfaces. - */ - if (rt->rt_if_ref_fn != NULL && rt->rt_ifp != ifp) { - rt->rt_if_ref_fn(ifp, 1); - rt->rt_if_ref_fn(rt->rt_ifp, -1); + + if (rt->rt_ifp != ifp) { + /* + * Purge any link-layer info caching. + */ + if (rt->rt_llinfo_purge != NULL) + rt->rt_llinfo_purge(rt); + + /* + * Adjust route ref count for the interfaces. + */ + if (rt->rt_if_ref_fn != NULL) { + rt->rt_if_ref_fn(ifp, 1); + rt->rt_if_ref_fn(rt->rt_ifp, -1); + } } -#endif /* IFNET_ROUTE_REFCNT */ rt->rt_ifp = ifp; /* * If this is the (non-scoped) default route, record * the interface index used for the primary ifscope. 
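+			 * (The primary scope is recorded per address
+			 * family; hence the sa_family argument passed to
+			 * set_primary_ifscope() below.)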
*/ - if (rt_inet_default(rt, rt_key(rt))) - set_primary_ifscope(rt->rt_ifp->if_index); + if (rt_primary_default(rt, rt_key(rt))) { + set_primary_ifscope(rt_key(rt)->sa_family, + rt->rt_ifp->if_index); + } rt->rt_rmx.rmx_mtu = ifp->if_mtu; - if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest) - rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, Gate); - ifafree(ifa); + if (rt->rt_ifa != NULL) { + IFA_LOCK_SPIN(rt->rt_ifa); + ifa_rtrequest = rt->rt_ifa->ifa_rtrequest; + IFA_UNLOCK(rt->rt_ifa); + if (ifa_rtrequest != NULL) + ifa_rtrequest(RTM_ADD, rt, Gate); + } + IFA_REMREF(ifa); /* Release extra ref */ RT_REMREF_LOCKED(rt); return; } - ifafree(ifa); + IFA_REMREF(ifa); } /* XXX: to reset gateway to correct value, at RTM_CHANGE */ - if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest) - rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, Gate); + if (rt->rt_ifa != NULL) { + IFA_LOCK_SPIN(rt->rt_ifa); + ifa_rtrequest = rt->rt_ifa->ifa_rtrequest; + IFA_UNLOCK(rt->rt_ifa); + if (ifa_rtrequest != NULL) + ifa_rtrequest(RTM_ADD, rt, Gate); + } /* Release extra ref */ RT_REMREF_LOCKED(rt); @@ -935,7 +1057,7 @@ rt_msg1(int type, struct rt_addrinfo *rtinfo) hint = rtinfo->rti_info[RTAX_IFA]; /* Scrub away any trace of embedded interface scope */ - sa = rtm_scrub_ifscope(i, hint, sa, &ss); + sa = rtm_scrub_ifscope(type, i, hint, sa, &ss); break; default: @@ -990,6 +1112,10 @@ again: len = sizeof(struct ifma_msghdr2); break; + case RTM_GET_EXT: + len = sizeof (struct rt_msghdr_ext); + break; + case RTM_GET2: len = sizeof(struct rt_msghdr2); break; @@ -1014,7 +1140,7 @@ again: hint = rtinfo->rti_info[RTAX_IFA]; /* Scrub away any trace of embedded interface scope */ - sa = rtm_scrub_ifscope(i, hint, sa, &ss); + sa = rtm_scrub_ifscope(type, i, hint, sa, &ss); break; default: @@ -1070,6 +1196,7 @@ rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error) struct rt_msghdr *rtm; struct mbuf *m; struct sockaddr *sa = rtinfo->rti_info[RTAX_DST]; + struct sockproto route_proto = {PF_ROUTE, 0}; if (route_cb.any_count == 0) return; @@ -1080,7 +1207,7 @@ rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error) rtm->rtm_flags = RTF_DONE | flags; rtm->rtm_errno = error; rtm->rtm_addrs = rtinfo->rti_addrs; - route_proto.sp_protocol = sa ? sa->sa_family : 0; + route_proto.sp_family = sa ? sa->sa_family : 0; raw_input(m, &route_proto, &route_src, &route_dst); } @@ -1095,6 +1222,7 @@ rt_ifmsg( struct if_msghdr *ifm; struct mbuf *m; struct rt_addrinfo info; + struct sockproto route_proto = {PF_ROUTE, 0}; if (route_cb.any_count == 0) return; @@ -1107,7 +1235,6 @@ rt_ifmsg( ifm->ifm_flags = (u_short)ifp->if_flags; if_data_internal_to_if_data(ifp, &ifp->if_data, &ifm->ifm_data); ifm->ifm_addrs = 0; - route_proto.sp_protocol = 0; raw_input(m, &route_proto, &route_src, &route_dst); } @@ -1120,7 +1247,7 @@ rt_ifmsg( * copies of it. * * Since this is coming from the interface, it is expected that the - * interface will be locked. Caller must hold rt_lock. + * interface will be locked. Caller must hold rnh_lock and rt_lock. 
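+ * (Both locks are asserted on entry; holding rnh_lock in addition to
+ * rt_lock keeps the entry stable while rt_lock is converted to a
+ * regular mutex and the routing messages are constructed.)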
*/ void rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt) @@ -1130,11 +1257,16 @@ rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt) int pass; struct mbuf *m = 0; struct ifnet *ifp = ifa->ifa_ifp; + struct sockproto route_proto = {PF_ROUTE, 0}; + lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); RT_LOCK_ASSERT_HELD(rt); if (route_cb.any_count == 0) return; + + /* Become a regular mutex, just in case */ + RT_CONVERT_LOCK(rt); for (pass = 1; pass < 3; pass++) { bzero((caddr_t)&info, sizeof(info)); if ((cmd == RTM_ADD && pass == 1) || @@ -1142,21 +1274,32 @@ rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt) struct ifa_msghdr *ifam; int ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR; - /* Lock ifp for if_addrhead */ + /* Lock ifp for if_lladdr */ ifnet_lock_shared(ifp); - ifaaddr = sa = ifa->ifa_addr; - ifpaddr = ifp->if_addrhead.tqh_first->ifa_addr; - netmask = ifa->ifa_netmask; - brdaddr = ifa->ifa_dstaddr; + IFA_LOCK(ifa); + info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr; + /* + * Holding ifnet lock here prevents the link address + * from changing contents, so no need to hold its + * lock. The link address is always present; it's + * never freed. + */ + info.rti_info[RTAX_IFP] = ifp->if_lladdr->ifa_addr; + info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask; + info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; if ((m = rt_msg1(ncmd, &info)) == NULL) { + IFA_UNLOCK(ifa); ifnet_lock_done(ifp); continue; } + IFA_UNLOCK(ifa); ifnet_lock_done(ifp); ifam = mtod(m, struct ifa_msghdr *); ifam->ifam_index = ifp->if_index; + IFA_LOCK_SPIN(ifa); ifam->ifam_metric = ifa->ifa_metric; ifam->ifam_flags = ifa->ifa_flags; + IFA_UNLOCK(ifa); ifam->ifam_addrs = info.rti_addrs; } if ((cmd == RTM_ADD && pass == 2) || @@ -1165,9 +1308,9 @@ rt_newaddrmsg(int cmd, struct ifaddr *ifa, int error, struct rtentry *rt) if (rt == 0) continue; - netmask = rt_mask(rt); - dst = sa = rt_key(rt); - gate = rt->rt_gateway; + info.rti_info[RTAX_NETMASK] = rt_mask(rt); + info.rti_info[RTAX_DST] = sa = rt_key(rt); + info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; if ((m = rt_msg1(cmd, &info)) == NULL) continue; rtm = mtod(m, struct rt_msghdr *); @@ -1193,35 +1336,34 @@ rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma) struct mbuf *m = 0; struct ifnet *ifp = ifma->ifma_ifp; struct ifma_msghdr *ifmam; + struct sockproto route_proto = {PF_ROUTE, 0}; if (route_cb.any_count == 0) return; + /* Lock ifp for if_lladdr */ + ifnet_lock_shared(ifp); bzero((caddr_t)&info, sizeof(info)); - ifaaddr = ifma->ifma_addr; - /* Lock ifp for if_addrhead */ - if (ifp != NULL) - ifnet_lock_shared(ifp); - if (ifp && ifp->if_addrhead.tqh_first) - ifpaddr = ifp->if_addrhead.tqh_first->ifa_addr; - else - ifpaddr = NULL; + IFMA_LOCK(ifma); + info.rti_info[RTAX_IFA] = ifma->ifma_addr; + info.rti_info[RTAX_IFP] = ifp->if_lladdr->ifa_addr; /* lladdr doesn't need lock */ + /* * If a link-layer address is present, present it as a ``gateway'' * (similarly to how ARP entries, e.g., are presented). */ - gate = ifma->ifma_ll->ifma_addr; + info.rti_info[RTAX_GATEWAY] = (ifma->ifma_ll != NULL) ? ifma->ifma_ll->ifma_addr : NULL; if ((m = rt_msg1(cmd, &info)) == NULL) { - if (ifp != NULL) - ifnet_lock_done(ifp); + IFMA_UNLOCK(ifma); + ifnet_lock_done(ifp); return; } - if (ifp != NULL) - ifnet_lock_done(ifp); ifmam = mtod(m, struct ifma_msghdr *); - ifmam->ifmam_index = ifp ? 
ifp->if_index : 0; + ifmam->ifmam_index = ifp->if_index; ifmam->ifmam_addrs = info.rti_addrs; route_proto.sp_protocol = ifma->ifma_addr->sa_family; + IFMA_UNLOCK(ifma); + ifnet_lock_done(ifp); raw_input(m, &route_proto, &route_src, &route_dst); } @@ -1242,10 +1384,11 @@ sysctl_dumpentry(struct radix_node *rn, void *vw) return 0; } bzero((caddr_t)&info, sizeof(info)); - dst = rt_key(rt); - gate = rt->rt_gateway; - netmask = rt_mask(rt); - genmask = rt->rt_genmask; + info.rti_info[RTAX_DST] = rt_key(rt); + info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; + info.rti_info[RTAX_NETMASK] = rt_mask(rt); + info.rti_info[RTAX_GENMASK] = rt->rt_genmask; + if (w->w_op != NET_RT_DUMP2) { size = rt_msg2(RTM_GET, &info, 0, w); if (w->w_req && w->w_tmem) { @@ -1253,201 +1396,418 @@ sysctl_dumpentry(struct radix_node *rn, void *vw) rtm->rtm_flags = rt->rt_flags; rtm->rtm_use = rt->rt_use; - rtm->rtm_rmx = rt->rt_rmx; + rt_getmetrics(rt, &rtm->rtm_rmx); rtm->rtm_index = rt->rt_ifp->if_index; rtm->rtm_pid = 0; - rtm->rtm_seq = 0; - rtm->rtm_errno = 0; + rtm->rtm_seq = 0; + rtm->rtm_errno = 0; rtm->rtm_addrs = info.rti_addrs; error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); RT_UNLOCK(rt); return (error); } } else { - size = rt_msg2(RTM_GET2, &info, 0, w); - if (w->w_req && w->w_tmem) { - struct rt_msghdr2 *rtm = (struct rt_msghdr2 *)w->w_tmem; - - rtm->rtm_flags = rt->rt_flags; - rtm->rtm_use = rt->rt_use; - rtm->rtm_rmx = rt->rt_rmx; - rtm->rtm_index = rt->rt_ifp->if_index; - rtm->rtm_refcnt = rt->rt_refcnt; + size = rt_msg2(RTM_GET2, &info, 0, w); + if (w->w_req && w->w_tmem) { + struct rt_msghdr2 *rtm = (struct rt_msghdr2 *)w->w_tmem; + + rtm->rtm_flags = rt->rt_flags; + rtm->rtm_use = rt->rt_use; + rt_getmetrics(rt, &rtm->rtm_rmx); + rtm->rtm_index = rt->rt_ifp->if_index; + rtm->rtm_refcnt = rt->rt_refcnt; if (rt->rt_parent) rtm->rtm_parentflags = rt->rt_parent->rt_flags; else rtm->rtm_parentflags = 0; - rtm->rtm_reserved = 0; - rtm->rtm_addrs = info.rti_addrs; - error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); + rtm->rtm_reserved = 0; + rtm->rtm_addrs = info.rti_addrs; + error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size); RT_UNLOCK(rt); - return (error); - + return (error); } } RT_UNLOCK(rt); return (error); } +/* + * This is used for dumping extended information from route entries. 
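+ *
+ * A minimal userland sketch of retrieving these records (assuming the
+ * usual 6-element PF_ROUTE MIB layout, where name[3] is the address
+ * family, name[4] the operation and name[5] the flags argument used
+ * by NET_RT_DUMPX_FLAGS):
+ *
+ *	int mib[6] = { CTL_NET, PF_ROUTE, 0, AF_INET, NET_RT_DUMPX, 0 };
+ *	size_t len;
+ *	char *buf;
+ *
+ *	if (sysctl(mib, 6, NULL, &len, NULL, 0) == 0 &&
+ *	    (buf = malloc(len)) != NULL &&
+ *	    sysctl(mib, 6, buf, &len, NULL, 0) == 0) {
+ *		walk the rt_msghdr_ext records in buf, stepping by
+ *		each record's rtm_msglen;
+ *	}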
+ */
 int
-sysctl_iflist(
-	int af,
-	struct walkarg *w)
+sysctl_dumpentry_ext(struct radix_node *rn, void *vw)
+{
+	struct walkarg *w = vw;
+	struct rtentry *rt = (struct rtentry *)rn;
+	int error = 0, size;
+	struct rt_addrinfo info;
+
+	RT_LOCK(rt);
+	if (w->w_op == NET_RT_DUMPX_FLAGS && !(rt->rt_flags & w->w_arg)) {
+		RT_UNLOCK(rt);
+		return (0);
+	}
+	bzero(&info, sizeof (info));
+	info.rti_info[RTAX_DST] = rt_key(rt);
+	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
+	info.rti_info[RTAX_NETMASK] = rt_mask(rt);
+	info.rti_info[RTAX_GENMASK] = rt->rt_genmask;
+
+	size = rt_msg2(RTM_GET_EXT, &info, 0, w);
+	if (w->w_req && w->w_tmem) {
+		struct rt_msghdr_ext *ertm = (struct rt_msghdr_ext *)w->w_tmem;
+
+		ertm->rtm_flags = rt->rt_flags;
+		ertm->rtm_use = rt->rt_use;
+		rt_getmetrics(rt, &ertm->rtm_rmx);
+		ertm->rtm_index = rt->rt_ifp->if_index;
+		ertm->rtm_pid = 0;
+		ertm->rtm_seq = 0;
+		ertm->rtm_errno = 0;
+		ertm->rtm_addrs = info.rti_addrs;
+		if (rt->rt_llinfo_get_ri == NULL)
+			bzero(&ertm->rtm_ri, sizeof (ertm->rtm_ri));
+		else
+			rt->rt_llinfo_get_ri(rt, &ertm->rtm_ri);
+
+		error = SYSCTL_OUT(w->w_req, (caddr_t)ertm, size);
+		RT_UNLOCK(rt);
+		return (error);
+	}
+	RT_UNLOCK(rt);
+	return (error);
+}
+
+/*
+ * rdar://9307819
+ * To avoid calling copyout() while holding locks, which can cause problems
+ * in the paging path, sysctl_iflist() and sysctl_iflist2() construct
+ * the list in two passes. In the first pass we compute the total
+ * length of the data we are going to copyout, then we release
+ * all locks to allocate a temporary buffer that gets filled
+ * in the second pass.
+ *
+ * Note that we are verifying the assumption that _MALLOC returns a buffer
+ * that is at least 32 bits aligned and that the messages and addresses are
+ * 32 bits aligned.
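+ *
+ * In outline, both functions follow this pattern (a sketch of the
+ * approach, not the verbatim code below):
+ *
+ *	for (pass = 0; pass < 2; pass++) {
+ *		take ifnet/ifaddr locks;
+ *		for each message to be reported:
+ *			len = rt_msg2(...);	sizing or filling pass
+ *			pass == 0 ? total_len += len : current_len += len;
+ *		drop the locks;
+ *		if (pass == 0)
+ *			cp = total_buffer = _MALLOC(total_len + slack, ...);
+ *		else
+ *			SYSCTL_OUT(w->w_req, total_buffer, current_len);
+ *	}
+ *
+ * The extra headroom ("slack" above, total_len >> 3) presumably absorbs
+ * addresses that show up between the sizing pass and the filling pass,
+ * since all locks are dropped in between.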
+ */ + +int +sysctl_iflist(int af, struct walkarg *w) { struct ifnet *ifp; struct ifaddr *ifa; struct rt_addrinfo info; int len, error = 0; + int pass = 0; + int total_len = 0, current_len = 0; + char *total_buffer = NULL, *cp = NULL; bzero((caddr_t)&info, sizeof(info)); - ifnet_head_lock_shared(); - TAILQ_FOREACH(ifp, &ifnet_head, if_link) { - if (error) - break; - if (w->w_arg && w->w_arg != ifp->if_index) - continue; - ifnet_lock_shared(ifp); - ifa = ifp->if_addrhead.tqh_first; - ifpaddr = ifa->ifa_addr; - len = rt_msg2(RTM_IFINFO, &info, (caddr_t)0, w); - ifpaddr = 0; - if (w->w_req && w->w_tmem) { - struct if_msghdr *ifm; - - ifm = (struct if_msghdr *)w->w_tmem; - ifm->ifm_index = ifp->if_index; - ifm->ifm_flags = (u_short)ifp->if_flags; - if_data_internal_to_if_data(ifp, &ifp->if_data, &ifm->ifm_data); - ifm->ifm_addrs = info.rti_addrs; - error = SYSCTL_OUT(w->w_req,(caddr_t)ifm, len); - if (error) { - ifnet_lock_done(ifp); + + for (pass = 0; pass < 2; pass++) { + ifnet_head_lock_shared(); + + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + if (error) break; - } - } - while ((ifa = ifa->ifa_link.tqe_next) != 0) { - if (af && af != ifa->ifa_addr->sa_family) + if (w->w_arg && w->w_arg != ifp->if_index) continue; - ifaaddr = ifa->ifa_addr; - netmask = ifa->ifa_netmask; - brdaddr = ifa->ifa_dstaddr; - len = rt_msg2(RTM_NEWADDR, &info, 0, w); - if (w->w_req && w->w_tmem) { - struct ifa_msghdr *ifam; - - ifam = (struct ifa_msghdr *)w->w_tmem; - ifam->ifam_index = ifa->ifa_ifp->if_index; - ifam->ifam_flags = ifa->ifa_flags; - ifam->ifam_metric = ifa->ifa_metric; - ifam->ifam_addrs = info.rti_addrs; - error = SYSCTL_OUT(w->w_req, w->w_tmem, len); - if (error) + ifnet_lock_shared(ifp); + /* + * Holding ifnet lock here prevents the link address from + * changing contents, so no need to hold the ifa lock. + * The link address is always present; it's never freed. 
+ */ + ifa = ifp->if_lladdr; + info.rti_info[RTAX_IFP] = ifa->ifa_addr; + len = rt_msg2(RTM_IFINFO, &info, (caddr_t)0, NULL); + if (pass == 0) { + total_len += len; + } else { + struct if_msghdr *ifm; + + if (current_len + len > total_len) { + ifnet_lock_done(ifp); + printf("sysctl_iflist: current_len (%d) + len (%d) > total_len (%d)\n", + current_len, len, total_len); + error = ENOBUFS; break; + } + info.rti_info[RTAX_IFP] = ifa->ifa_addr; + len = rt_msg2(RTM_IFINFO, &info, (caddr_t)cp, NULL); + info.rti_info[RTAX_IFP] = NULL; + + ifm = (struct if_msghdr *)cp; + ifm->ifm_index = ifp->if_index; + ifm->ifm_flags = (u_short)ifp->if_flags; + if_data_internal_to_if_data(ifp, &ifp->if_data, + &ifm->ifm_data); + ifm->ifm_addrs = info.rti_addrs; + + cp += len; + VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t))); + current_len += len; } + while ((ifa = ifa->ifa_link.tqe_next) != 0) { + IFA_LOCK(ifa); + if (af && af != ifa->ifa_addr->sa_family) { + IFA_UNLOCK(ifa); + continue; + } + info.rti_info[RTAX_IFA] = ifa->ifa_addr; + info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask; + info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; + len = rt_msg2(RTM_NEWADDR, &info, 0, 0); + if (pass == 0) { + total_len += len; + } else { + struct ifa_msghdr *ifam; + + if (current_len + len > total_len) { + IFA_UNLOCK(ifa); + printf("sysctl_iflist: current_len (%d) + len (%d) > total_len (%d)\n", + current_len, len, total_len); + error = ENOBUFS; + break; + } + len = rt_msg2(RTM_NEWADDR, &info, (caddr_t)cp, NULL); + + ifam = (struct ifa_msghdr *)cp; + ifam->ifam_index = ifa->ifa_ifp->if_index; + ifam->ifam_flags = ifa->ifa_flags; + ifam->ifam_metric = ifa->ifa_metric; + ifam->ifam_addrs = info.rti_addrs; + + cp += len; + VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t))); + current_len += len; + } + IFA_UNLOCK(ifa); + } + ifnet_lock_done(ifp); + info.rti_info[RTAX_IFA] = info.rti_info[RTAX_NETMASK] = + info.rti_info[RTAX_BRD] = NULL; + } + + ifnet_head_done(); + + if (error) + break; + + if (pass == 0) { + /* Better to return zero length buffer than ENOBUFS */ + if (total_len == 0) + total_len = 1; + total_len += total_len >> 3; + total_buffer = _MALLOC(total_len, M_RTABLE, M_ZERO | M_WAITOK); + if (total_buffer == NULL) { + printf("sysctl_iflist: _MALLOC(%d) failed\n", total_len); + error = ENOBUFS; + break; + } + cp = total_buffer; + VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t))); + } else { + error = SYSCTL_OUT(w->w_req, total_buffer, current_len); + if (error) + break; } - ifnet_lock_done(ifp); - ifaaddr = netmask = brdaddr = 0; } - ifnet_head_done(); + + if (total_buffer != NULL) + _FREE(total_buffer, M_RTABLE); + return error; } int -sysctl_iflist2( - int af, - struct walkarg *w) +sysctl_iflist2(int af, struct walkarg *w) { struct ifnet *ifp; struct ifaddr *ifa; struct rt_addrinfo info; int len, error = 0; - + int pass = 0; + int total_len = 0, current_len = 0; + char *total_buffer = NULL, *cp = NULL; + bzero((caddr_t)&info, sizeof(info)); - ifnet_head_lock_shared(); - TAILQ_FOREACH(ifp, &ifnet_head, if_link) { - if (error) - break; - if (w->w_arg && w->w_arg != ifp->if_index) - continue; - ifnet_lock_shared(ifp); - ifa = ifp->if_addrhead.tqh_first; - ifpaddr = ifa->ifa_addr; - len = rt_msg2(RTM_IFINFO2, &info, (caddr_t)0, w); - ifpaddr = 0; - if (w->w_req && w->w_tmem) { - struct if_msghdr2 *ifm; - - ifm = (struct if_msghdr2 *)w->w_tmem; - ifm->ifm_addrs = info.rti_addrs; - ifm->ifm_flags = (u_short)ifp->if_flags; - ifm->ifm_index = ifp->if_index; - ifm->ifm_snd_len = ifp->if_snd.ifq_len; - ifm->ifm_snd_maxlen = ifp->if_snd.ifq_maxlen; - 
ifm->ifm_snd_drops = ifp->if_snd.ifq_drops; - ifm->ifm_timer = ifp->if_timer; - if_data_internal_to_if_data64(ifp, &ifp->if_data, &ifm->ifm_data); - error = SYSCTL_OUT(w->w_req, w->w_tmem, len); - if (error) { - ifnet_lock_done(ifp); + + for (pass = 0; pass < 2; pass++) { + ifnet_head_lock_shared(); + + TAILQ_FOREACH(ifp, &ifnet_head, if_link) { + if (error) break; - } - } - while ((ifa = ifa->ifa_link.tqe_next) != 0) { - if (af && af != ifa->ifa_addr->sa_family) + if (w->w_arg && w->w_arg != ifp->if_index) continue; - ifaaddr = ifa->ifa_addr; - netmask = ifa->ifa_netmask; - brdaddr = ifa->ifa_dstaddr; - len = rt_msg2(RTM_NEWADDR, &info, 0, w); - if (w->w_req && w->w_tmem) { - struct ifa_msghdr *ifam; - - ifam = (struct ifa_msghdr *)w->w_tmem; - ifam->ifam_index = ifa->ifa_ifp->if_index; - ifam->ifam_flags = ifa->ifa_flags; - ifam->ifam_metric = ifa->ifa_metric; - ifam->ifam_addrs = info.rti_addrs; - error = SYSCTL_OUT(w->w_req, w->w_tmem, len); - if (error) + ifnet_lock_shared(ifp); + /* + * Holding ifnet lock here prevents the link address from + * changing contents, so no need to hold the ifa lock. + * The link address is always present; it's never freed. + */ + ifa = ifp->if_lladdr; + info.rti_info[RTAX_IFP] = ifa->ifa_addr; + len = rt_msg2(RTM_IFINFO2, &info, (caddr_t)0, NULL); + if (pass == 0) { + total_len += len; + } else { + struct if_msghdr2 *ifm; + + if (current_len + len > total_len) { + ifnet_lock_done(ifp); + printf("sysctl_iflist2: current_len (%d) + len (%d) > total_len (%d)\n", + current_len, len, total_len); + error = ENOBUFS; break; + } + info.rti_info[RTAX_IFP] = ifa->ifa_addr; + len = rt_msg2(RTM_IFINFO2, &info, (caddr_t)cp, NULL); + info.rti_info[RTAX_IFP] = NULL; + + ifm = (struct if_msghdr2 *)cp; + ifm->ifm_addrs = info.rti_addrs; + ifm->ifm_flags = (u_short)ifp->if_flags; + ifm->ifm_index = ifp->if_index; + ifm->ifm_snd_len = ifp->if_snd.ifq_len; + ifm->ifm_snd_maxlen = ifp->if_snd.ifq_maxlen; + ifm->ifm_snd_drops = ifp->if_snd.ifq_drops; + ifm->ifm_timer = ifp->if_timer; + if_data_internal_to_if_data64(ifp, &ifp->if_data, + &ifm->ifm_data); + + cp += len; + VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t))); + current_len += len; } - } - if (error) { - ifnet_lock_done(ifp); - break; - } - { - struct ifmultiaddr *ifma; - - for (ifma = ifp->if_multiaddrs.lh_first; ifma; - ifma = ifma->ifma_link.le_next) { - if (af && af != ifma->ifma_addr->sa_family) + while ((ifa = ifa->ifa_link.tqe_next) != 0) { + IFA_LOCK(ifa); + if (af && af != ifa->ifa_addr->sa_family) { + IFA_UNLOCK(ifa); continue; - bzero((caddr_t)&info, sizeof(info)); - ifaaddr = ifma->ifma_addr; - if (ifp->if_addrhead.tqh_first) - ifpaddr = ifp->if_addrhead.tqh_first->ifa_addr; - if (ifma->ifma_ll) - gate = ifma->ifma_ll->ifma_addr; - len = rt_msg2(RTM_NEWMADDR2, &info, 0, w); - if (w->w_req && w->w_tmem) { - struct ifma_msghdr2 *ifmam; - - ifmam = (struct ifma_msghdr2 *)w->w_tmem; - ifmam->ifmam_addrs = info.rti_addrs; - ifmam->ifmam_flags = 0; - ifmam->ifmam_index = ifma->ifma_ifp->if_index; - ifmam->ifmam_refcount = ifma->ifma_refcount; - error = SYSCTL_OUT(w->w_req, w->w_tmem, len); - if (error) + } + info.rti_info[RTAX_IFA] = ifa->ifa_addr; + info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask; + info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr; + len = rt_msg2(RTM_NEWADDR, &info, 0, 0); + if (pass == 0) { + total_len += len; + } else { + struct ifa_msghdr *ifam; + + if (current_len + len > total_len) { + IFA_UNLOCK(ifa); + printf("sysctl_iflist2: current_len (%d) + len (%d) > total_len (%d)\n", + current_len, len, 
total_len); + error = ENOBUFS; break; + } + len = rt_msg2(RTM_NEWADDR, &info, (caddr_t)cp, 0); + + ifam = (struct ifa_msghdr *)cp; + ifam->ifam_index = ifa->ifa_ifp->if_index; + ifam->ifam_flags = ifa->ifa_flags; + ifam->ifam_metric = ifa->ifa_metric; + ifam->ifam_addrs = info.rti_addrs; + + cp += len; + VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t))); + current_len += len; + } + IFA_UNLOCK(ifa); + } + if (error) { + ifnet_lock_done(ifp); + break; + } + { + struct ifmultiaddr *ifma; + + for (ifma = LIST_FIRST(&ifp->if_multiaddrs); + ifma != NULL; ifma = LIST_NEXT(ifma, ifma_link)) { + struct ifaddr *ifa0; + + IFMA_LOCK(ifma); + if (af && af != ifma->ifma_addr->sa_family) { + IFMA_UNLOCK(ifma); + continue; + } + bzero((caddr_t)&info, sizeof(info)); + info.rti_info[RTAX_IFA] = ifma->ifma_addr; + /* + * Holding ifnet lock here prevents the link + * address from changing contents, so no need + * to hold the ifa0 lock. The link address is + * always present; it's never freed. + */ + ifa0 = ifp->if_lladdr; + info.rti_info[RTAX_IFP] = ifa0->ifa_addr; + if (ifma->ifma_ll != NULL) + info.rti_info[RTAX_GATEWAY] = ifma->ifma_ll->ifma_addr; + len = rt_msg2(RTM_NEWMADDR2, &info, 0, 0); + if (pass == 0) { + total_len += len; + } else { + struct ifma_msghdr2 *ifmam; + + if (current_len + len > total_len) { + IFMA_UNLOCK(ifma); + printf("sysctl_iflist2: current_len (%d) + len (%d) > total_len (%d)\n", + current_len, len, total_len); + error = ENOBUFS; + break; + } + len = rt_msg2(RTM_NEWMADDR2, &info, (caddr_t)cp, 0); + + ifmam = (struct ifma_msghdr2 *)cp; + ifmam->ifmam_addrs = info.rti_addrs; + ifmam->ifmam_flags = 0; + ifmam->ifmam_index = + ifma->ifma_ifp->if_index; + ifmam->ifmam_refcount = + ifma->ifma_reqcnt; + + cp += len; + VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t))); + current_len += len; + } + IFMA_UNLOCK(ifma); } } + ifnet_lock_done(ifp); + info.rti_info[RTAX_IFA] = info.rti_info[RTAX_NETMASK] = + info.rti_info[RTAX_BRD] = NULL; + } + ifnet_head_done(); + + if (error) + break; + + if (pass == 0) { + /* Better to return zero length buffer than ENOBUFS */ + if (total_len == 0) + total_len = 1; + total_len += total_len >> 3; + total_buffer = _MALLOC(total_len, M_RTABLE, M_ZERO | M_WAITOK); + if (total_buffer == NULL) { + printf("sysctl_iflist2: _MALLOC(%d) failed\n", total_len); + error = ENOBUFS; + break; + } + cp = total_buffer; + VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t))); + } else { + error = SYSCTL_OUT(w->w_req, total_buffer, current_len); + if (error) + break; } - ifnet_lock_done(ifp); - ifaaddr = netmask = brdaddr = 0; } - ifnet_head_done(); + + if (total_buffer != NULL) + _FREE(total_buffer, M_RTABLE); + return error; } @@ -1467,16 +1827,15 @@ sysctl_rtstat(struct sysctl_req *req) static int sysctl_rttrash(struct sysctl_req *req) { - int error; - - error = SYSCTL_OUT(req, &rttrash, sizeof(rttrash)); - if (error) - return (error); + int error; + + error = SYSCTL_OUT(req, &rttrash, sizeof(rttrash)); + if (error) + return (error); - return 0; + return 0; } -#if IFNET_ROUTE_REFCNT /* * Called from pfslowtimo(), protected by domain_proto_mtx */ @@ -1503,9 +1862,11 @@ rt_drainall(void) timerclear(&last_ts); in_rtqdrain(); /* protocol cloned routes: INET */ - in6_rtqdrain(); /* protocol cloned routes: INET6 */ in_arpdrain(NULL); /* cloned routes: ARP */ +#if INET6 + in6_rtqdrain(); /* protocol cloned routes: INET6 */ nd6_drain(NULL); /* cloned routes: ND6 */ +#endif /* INET6 */ last_ts.tv_sec = current_ts.tv_sec; last_ts.tv_usec = current_ts.tv_usec; @@ -1522,7 +1883,6 @@ rt_aggdrain(int on) else 
routedomain.dom_protosw->pr_flags &= ~PR_AGGDRAIN; } -#endif /* IFNET_ROUTE_REFCNT */ static int sysctl_rtsock SYSCTL_HANDLER_ARGS @@ -1556,7 +1916,17 @@ sysctl_rtsock SYSCTL_HANDLER_ARGS for (i = 1; i <= AF_MAX; i++) if ((rnh = rt_tables[i]) && (af == 0 || af == i) && (error = rnh->rnh_walktree(rnh, - sysctl_dumpentry, &w))) + sysctl_dumpentry, &w))) + break; + lck_mtx_unlock(rnh_lock); + break; + case NET_RT_DUMPX: + case NET_RT_DUMPX_FLAGS: + lck_mtx_lock(rnh_lock); + for (i = 1; i <= AF_MAX; i++) + if ((rnh = rt_tables[i]) && (af == 0 || af == i) && + (error = rnh->rnh_walktree(rnh, + sysctl_dumpentry_ext, &w))) break; lck_mtx_unlock(rnh_lock); break; @@ -1578,7 +1948,7 @@ sysctl_rtsock SYSCTL_HANDLER_ARGS return (error); } -SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD, sysctl_rtsock, ""); +SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_rtsock, ""); /* * Definitions of protocols supported in the ROUTE domain. @@ -1587,12 +1957,7 @@ static struct protosw routesw[] = { { SOCK_RAW, &routedomain, 0, PR_ATOMIC|PR_ADDR, 0, route_output, raw_ctlinput, 0, 0, - raw_init, 0, 0, -#if IFNET_ROUTE_REFCNT - rt_drainall, -#else - 0, -#endif /* IFNET_ROUTE_REFCNT */ + raw_init, 0, 0, rt_drainall, 0, &route_usrreqs, 0, 0, 0, diff --git a/bsd/net/rtsock_mip.c b/bsd/net/rtsock_mip.c deleted file mode 100644 index 2acd2585d..000000000 --- a/bsd/net/rtsock_mip.c +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* $KAME$ */ - -/* to be included from net/rtsock.c - ugly but necessary for portability */ -/* - * Mobile IPv6 addition. - * Send a routing message to all routing socket listeners. - */ -void -rt_mip6msg(cmd, ifp, rt) - int cmd; - struct ifnet *ifp; - register struct rtentry *rt; -{ - struct rt_addrinfo info; - struct sockaddr *sa = 0; - struct mbuf *m = 0; - register struct rt_msghdr *rtm; - -#ifdef MIP6_DEBUG - printf("route_cb.any_count = %d\n", route_cb.any_count); -#endif - bzero((caddr_t)&info, sizeof(info)); - - if (rt == 0 || ifp == 0) - return; - netmask = rt_mask(rt); - dst = sa = rt_key(rt); - gate = rt->rt_gateway; - genmask = rt->rt_genmask; - if ((m = rt_msg1(cmd, &info)) == NULL) { -#ifdef MIP6_DEBUG - printf("failure... 
\n"); -#endif - return; - } - rtm = mtod(m, struct rt_msghdr *); - rtm->rtm_index = ifp->if_index; - rtm->rtm_flags |= rt->rt_flags; - rtm->rtm_rmx = rt->rt_rmx; - rtm->rtm_addrs = info.rti_addrs; - rtm->rtm_flags |= RTF_DONE; - - route_proto.sp_protocol = sa ? sa->sa_family : 0; -#ifdef __bsdi__ - raw_input(m, NULL, &route_proto, &route_src, &route_dst); -#else - raw_input(m, &route_proto, &route_src, &route_dst); -#endif -} diff --git a/bsd/netat/Makefile b/bsd/netat/Makefile index 9282cd445..3f307255c 100644 --- a/bsd/netat/Makefile +++ b/bsd/netat/Makefile @@ -10,14 +10,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ SETUP_SUBDIRS = \ diff --git a/bsd/netat/asp_proto.c b/bsd/netat/asp_proto.c index bd471ff5a..8f1621b54 100644 --- a/bsd/netat/asp_proto.c +++ b/bsd/netat/asp_proto.c @@ -319,7 +319,7 @@ return ( static char mbuf_str[100]; char *mbuf_totals() { - sprintf(mbuf_str, + snprintf(mbuf_str, sizeof(mbuf_str), /* "dat = %d, prot = %d, ioc = %d, err = %d, hu = %d, ack = %d, nak = %d, ctl = %d", */ diff --git a/bsd/netat/at.c b/bsd/netat/at.c index 572b7f58b..ae6120798 100644 --- a/bsd/netat/at.c +++ b/bsd/netat/at.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -128,6 +128,21 @@ static int set_zones(zone_usage_t *ifz) return(0); } /* set_zones */ +static int +at_domifattach(struct ifnet *ifp, at_ifaddr_t *ifID) +{ + int error; + + if ((error = proto_plumb(PF_APPLETALK, ifp))) { + if (error != EEXIST) + log(LOG_ERR, "%s: proto_plumb returned %d if=%s%d\n", + __func__, error, ifp->if_name, ifp->if_unit); + } else if (ifID) + ifID->at_was_attached = 1; + + return (error); +} + /* * Generic internet control operations (ioctl's). * ifp is 0 if not an interface-specific ioctl. @@ -580,10 +595,10 @@ at_control(so, cmd, data, ifp) ifID->aa_ifp = ifp; ifa = &ifID->aa_ifa; - error = proto_plumb(PF_APPLETALK, ifp); + error = at_domifattach(ifp, ifID); if (error == EEXIST) { - ifID->at_was_attached = 1; - error = 0; + ifID->at_was_attached = 1; + error = 0; } if (error != 0) { break; @@ -592,27 +607,36 @@ at_control(so, cmd, data, ifp) ifID->cable_multicast_addr = etalk_multicast_addr; xpatcnt++; ifnet_lock_exclusive(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) - if ((sdl = (struct sockaddr_dl *)ifa->ifa_addr) && - (sdl->sdl_family == AF_LINK)) { - bcopy(LLADDR(sdl), ifID->xaddr, sizeof(ifID->xaddr)); + /* + * Holding ifnet lock here prevents the link address + * from changing contents, so no need to hold the ifa + * lock. The link address is always present; it's + * never freed. 
+ */ + sdl = (struct sockaddr_dl *)ifp->if_lladdr->ifa_addr; + bcopy(LLADDR(sdl), ifID->xaddr, sizeof(ifID->xaddr)); #ifdef APPLETALK_DEBUG - kprintf("SIOCSIFADDR: local enet address is %x.%x.%x.%x.%x.%x\n", - ifID->xaddr[0], ifID->xaddr[1], - ifID->xaddr[2], ifID->xaddr[3], - ifID->xaddr[4], ifID->xaddr[5]); + kprintf("SIOCSIFADDR: local enet address is " + "%x.%x.%x.%x.%x.%x\n", + ifID->xaddr[0], ifID->xaddr[1], + ifID->xaddr[2], ifID->xaddr[3], + ifID->xaddr[4], ifID->xaddr[5]); #endif - break; - } /* attach the AppleTalk address to the ifnet structure */ ifa = &ifID->aa_ifa; + ifa_lock_init(ifa); + VERIFY(!(ifa->ifa_debug & IFD_ALLOC)); ifa->ifa_addr = (struct sockaddr *)&ifID->ifNodeAddress; ifID->ifNodeAddress.sat_len = sizeof(struct sockaddr_at); ifID->ifNodeAddress.sat_family = AF_APPLETALK; /* the address itself will be filled in when ifThisNode is set */ + IFA_LOCK(ifa); if_attach_ifa(ifp, ifa); + /* add a reference for at_interfaces[] */ + IFA_ADDREF_LOCKED(ifa); + IFA_UNLOCK(ifa); ifnet_lock_done(ifp); } break; @@ -678,11 +702,7 @@ at_control(so, cmd, data, ifp) error = EACCES; break; } - error = proto_plumb(PF_APPLETALK, ifp); - if (ifID != NULL - && (error == 0 || error == EEXIST)) { - ifID->at_was_attached = 1; - } + error = at_domifattach(ifp, ifID); break; case SIOCPROTODETACH: @@ -713,6 +733,7 @@ void atalk_post_msg(struct ifnet *ifp, u_long event_code, struct at_addr *addres struct kev_atalk_data at_event_data; struct kev_msg ev_msg; + bzero(&ev_msg, sizeof(struct kev_msg)); ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_NETWORK_CLASS; ev_msg.kev_subclass = KEV_ATALK_SUBCLASS; @@ -739,3 +760,22 @@ void atalk_post_msg(struct ifnet *ifp, u_long event_code, struct at_addr *addres kev_post_msg(&ev_msg); } + + +/* + * This is untested; the code is here only for completeness. + */ +void +at_purgeaddrs(struct ifnet *ifp) +{ + at_ifaddr_t *ifID = NULL; + int pat_id; + + /* Find address for this interface, if it exists */ + for (pat_id = 0; pat_id < xpatcnt; pat_id++) { + if (at_interfaces[pat_id].aa_ifp == ifp) { + ifID = &at_interfaces[pat_id]; + elap_offline(ifID); + } + } +} diff --git a/bsd/netat/at_var.h b/bsd/netat/at_var.h index 64025eb6f..1513f9a82 100644 --- a/bsd/netat/at_var.h +++ b/bsd/netat/at_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -351,7 +351,7 @@ void init_ddp_handler(void); int elap_wput(gref_t *gref, gbuf_t *m); int at_ioctl(struct atpcb *, u_long, caddr_t, int ); - +extern void at_purgeaddrs(struct ifnet *); #endif /* KERNEL_PRIVATE */ #endif /* __APPLE_API_OBSOLETE */ diff --git a/bsd/netat/ddp.c b/bsd/netat/ddp.c index 31467fa6b..0e2ebea4b 100644 --- a/bsd/netat/ddp.c +++ b/bsd/netat/ddp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -384,12 +384,17 @@ void ddp_rem_if(ifID) struct ifaddr *ifa = &ifID->aa_ifa; /* un-do processing done in SIOCSIFADDR */ - if (ifa->ifa_addr) { - ifnet_lock_exclusive(ifID->aa_ifp); + ifnet_lock_exclusive(ifID->aa_ifp); + IFA_LOCK(ifa); + if (ifa->ifa_debug & IFD_ATTACHED) { if_detach_ifa(ifID->aa_ifp, ifa); ifa->ifa_addr = NULL; - ifnet_lock_done(ifID->aa_ifp); } + IFA_UNLOCK(ifa); + /* release reference held for at_interfaces[] */ + IFA_REMREF(ifa); + ifnet_lock_done(ifID->aa_ifp); + if (ifID->at_was_attached == 0 && ifID->aa_ifp != NULL) { (void)proto_unplumb(PF_APPLETALK, ifID->aa_ifp); } diff --git a/bsd/netat/ddp_lap.c b/bsd/netat/ddp_lap.c index c7e075c5a..42f81cdad 100644 --- a/bsd/netat/ddp_lap.c +++ b/bsd/netat/ddp_lap.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1602,7 +1602,7 @@ int at_reg_mcast(ifID, data) *(unsigned*)data, (*(unsigned *)(data+2))&0x0000ffff, (unsigned)ifID)); - if (if_addmulti(nddp, (struct sockaddr *)&sdl, 0)) + if (if_addmulti_anon(nddp, (struct sockaddr *)&sdl, NULL)) return -1; } return 0; @@ -1641,7 +1641,7 @@ int at_unreg_mcast(ifID, data) (unsigned)ifID)); bzero(data, sizeof(struct etalk_addr)); - if (if_delmulti(nddp, (struct sockaddr *)&sdl)) + if (if_delmulti_anon(nddp, (struct sockaddr *)&sdl)) return -1; } return 0; @@ -1687,7 +1687,7 @@ int at_reg_mcast(ifID, data) *(unsigned*)data, (*(unsigned *)(data+2))&0x0000ffff, (unsigned)ifID)); - if (if_addmulti(nddp, (struct sockaddr *)&sdl, 0)) + if (if_addmulti_anon(nddp, (struct sockaddr *)&sdl, NULL)) return -1; } @@ -1724,7 +1724,7 @@ int at_unreg_mcast(ifID, data) (unsigned)ifID)); bzero(data, ETHERNET_ADDR_LEN); - if (if_delmulti(nddp, (struct sockaddr *)&sdl)) + if (if_delmulti_anon(nddp, (struct sockaddr *)&sdl)) return(-1); } diff --git a/bsd/netat/sys_glue.c b/bsd/netat/sys_glue.c index dd22563be..acb307fbc 100644 --- a/bsd/netat/sys_glue.c +++ b/bsd/netat/sys_glue.c @@ -97,10 +97,10 @@ int RouterMix = RT_MIX_DEFAULT; /* default for nbr of ppsec */ SYSCTL_INT(_net_appletalk, OID_AUTO, routermix, CTLFLAG_WR, &RouterMix, 0, "Appletalk RouterMix"); at_ddp_stats_t at_ddp_stats; /* DDP statistics */ -SYSCTL_STRUCT(_net_appletalk, OID_AUTO, ddpstats, CTLFLAG_RD, +SYSCTL_STRUCT(_net_appletalk, OID_AUTO, ddpstats, CTLFLAG_RD | CTLFLAG_LOCKED, &at_ddp_stats, at_ddp_stats, "AppleTalk DDP Stats"); extern int atp_resp_seqno2big; -SYSCTL_INT(_net_appletalk, OID_AUTO, atp_resp_seqno2big, CTLFLAG_RD, +SYSCTL_INT(_net_appletalk, OID_AUTO, atp_resp_seqno2big, CTLFLAG_RD | CTLFLAG_LOCKED, &atp_resp_seqno2big, 0, "Appletalk ATP seqno too big count"); static void ioccmd_t_32_to_64( ioccmd_t *from_p, user_ioccmd_t *to_p ); diff --git a/bsd/netinet/Makefile b/bsd/netinet/Makefile index de3d2890a..91973125c 100644 --- a/bsd/netinet/Makefile +++ b/bsd/netinet/Makefile @@ -9,14 +9,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ DATAFILES = \ @@ -32,12 +28,12 @@ KERNELFILES = \ kpi_ipfilter.h in_arp.h PRIVATE_DATAFILES = \ - if_fddi.h if_atm.h ip_dummynet.h \ + ip_dummynet.h \ tcp_debug.h \ in_gif.h ip_compat.h PRIVATE_KERNELFILES = ${KERNELFILES} \ - ip_ecn.h ip_encap.h ip_flow.h + ip_ecn.h ip_encap.h INSTALL_MI_LIST = ${DATAFILES} @@ -48,7 +44,6 @@ EXPORT_MI_LIST = ${DATAFILES} ${KERNELFILES} 
EXPORT_MI_DIR = ${INSTALL_MI_DIR}
 
 INSTALL_MI_LCL_LIST = ${DATAFILES} ${PRIVATE_DATAFILES}
-
 INSTALL_KF_MI_LCL_LIST = ${INSTALL_MI_LCL_LIST} ${PRIVATE_KERNELFILES}
 
 include $(MakeInc_rule)
diff --git a/bsd/netinet/icmp6.h b/bsd/netinet/icmp6.h
index e3c559b11..aab7c4ffd 100644
--- a/bsd/netinet/icmp6.h
+++ b/bsd/netinet/icmp6.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000,2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2000,2008-2010 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -95,6 +95,10 @@
 #define _NETINET_ICMP6_H_
 #include <sys/appleapiopts.h>
 
+#ifdef XNU_KERNEL_PRIVATE
+#include <sys/mcache.h>
+#endif
+
 #define ICMPV6_PLD_MAXLEN	1232	/* IPV6_MMTU - sizeof(struct ip6_hdr)
 					   - sizeof(struct icmp6_hdr) */
@@ -107,7 +111,7 @@ struct icmp6_hdr {
 		u_int16_t	icmp6_un_data16[2]; /* type-specific field */
 		u_int8_t	icmp6_un_data8[4];  /* type-specific field */
 	} icmp6_dataun;
-};
+} __attribute__((__packed__));
 
 #define icmp6_data32	icmp6_dataun.icmp6_un_data32
 #define icmp6_data16	icmp6_dataun.icmp6_un_data16
@@ -125,23 +129,27 @@ struct icmp6_hdr {
 #define ICMP6_ECHO_REQUEST		128	/* echo service */
 #define ICMP6_ECHO_REPLY		129	/* echo reply */
+#define MLD_LISTENER_QUERY		130	/* multicast listener query */
+#define MLD_LISTENER_REPORT		131	/* multicast listener report */
+#define MLD_LISTENER_DONE		132	/* multicast listener done */
+#define MLD_LISTENER_REDUCTION	MLD_LISTENER_DONE /* RFC3542 definition */
+
+/* RFC2292 decls */
 #define ICMP6_MEMBERSHIP_QUERY		130	/* group membership query */
-#define MLD6_LISTENER_QUERY		130	/* multicast listener query */
 #define ICMP6_MEMBERSHIP_REPORT		131	/* group membership report */
-#define MLD6_LISTENER_REPORT		131	/* multicast listener report */
 #define ICMP6_MEMBERSHIP_REDUCTION	132	/* group membership termination */
-#define MLD6_LISTENER_DONE		132	/* multicast listener done */
 
 #ifndef KERNEL
-#define MLD_LISTENER_QUERY	MLD6_LISTENER_QUERY
-#define MLD_LISTENER_REPORT	MLD6_LISTENER_REPORT
-#define MLD_LISTENER_DONE	MLD6_LISTENER_DONE
-#endif /* !KERNEL */
+/* the following are for backward compatibility with old KAME apps. */
+#define MLD6_LISTENER_QUERY	MLD_LISTENER_QUERY
+#define MLD6_LISTENER_REPORT	MLD_LISTENER_REPORT
+#define MLD6_LISTENER_DONE	MLD_LISTENER_DONE
+#endif
 
 #define ND_ROUTER_SOLICIT		133	/* router solicitation */
-#define ND_ROUTER_ADVERT		134	/* router advertisment */
+#define ND_ROUTER_ADVERT		134	/* router advertisement */
 #define ND_NEIGHBOR_SOLICIT		135	/* neighbor solicitation */
-#define ND_NEIGHBOR_ADVERT		136	/* neighbor advertisment */
+#define ND_NEIGHBOR_ADVERT		136	/* neighbor advertisement */
 #define ND_REDIRECT			137	/* redirect */
 
 #define ICMP6_ROUTER_RENUMBERING	138	/* router renumbering */
@@ -152,20 +160,18 @@ struct icmp6_hdr {
 #define ICMP6_FQDN_REPLY		140	/* FQDN reply */
 #define ICMP6_NI_QUERY			139	/* node information request */
 #define ICMP6_NI_REPLY			140	/* node information reply */
+#define MLDV2_LISTENER_REPORT		143	/* RFC3810 listener report */
 
 /* The definitions below are experimental.
TBA */ -#define MLD6_MTRACE_RESP 200 /* mtrace response(to sender) */ -#define MLD6_MTRACE 201 /* mtrace messages */ +#define MLD_MTRACE_RESP 200 /* mtrace resp (to sender) */ +#define MLD_MTRACE 201 /* mtrace messages */ #ifndef KERNEL -#define MLD_MTRACE_RESP MLD6_MTRACE_RESP -#define MLD_MTRACE MLD6_MTRACE -#endif /* !KERNEL */ +#define MLD6_MTRACE_RESP MLD_MTRACE_RESP +#define MLD6_MTRACE MLD_MTRACE +#endif -#define ICMP6_HADISCOV_REQUEST 202 /* XXX To be defined */ -#define ICMP6_HADISCOV_REPLY 203 /* XXX To be defined */ - -#define ICMP6_MAXTYPE 203 +#define ICMP6_MAXTYPE 201 #define ICMP6_DST_UNREACH_NOROUTE 0 /* no route to destination */ #define ICMP6_DST_UNREACH_ADMIN 1 /* administratively prohibited */ @@ -202,16 +208,30 @@ struct icmp6_hdr { /* * Multicast Listener Discovery */ -struct mld6_hdr { - struct icmp6_hdr mld6_hdr; - struct in6_addr mld6_addr; /* multicast address */ -}; +struct mld_hdr { + struct icmp6_hdr mld_icmp6_hdr; + struct in6_addr mld_addr; /* multicast address */ +} __attribute__((__packed__)); + +/* definitions to provide backward compatibility to old KAME applications */ +#ifndef KERNEL +#define mld6_hdr mld_hdr +#define mld6_type mld_type +#define mld6_code mld_code +#define mld6_cksum mld_cksum +#define mld6_maxdelay mld_maxdelay +#define mld6_reserved mld_reserved +#define mld6_addr mld_addr +#endif -#define mld6_type mld6_hdr.icmp6_type -#define mld6_code mld6_hdr.icmp6_code -#define mld6_cksum mld6_hdr.icmp6_cksum -#define mld6_maxdelay mld6_hdr.icmp6_data16[0] -#define mld6_reserved mld6_hdr.icmp6_data16[1] +/* shortcut macro definitions */ +#define mld_type mld_icmp6_hdr.icmp6_type +#define mld_code mld_icmp6_hdr.icmp6_code +#define mld_cksum mld_icmp6_hdr.icmp6_cksum +#define mld_maxdelay mld_icmp6_hdr.icmp6_data16[0] +#define mld_reserved mld_icmp6_hdr.icmp6_data16[1] +#define mld_v2_reserved mld_icmp6_hdr.icmp6_data16[0] +#define mld_v2_numrecs mld_icmp6_hdr.icmp6_data16[1] /* * Neighbor Discovery @@ -220,7 +240,7 @@ struct mld6_hdr { struct nd_router_solicit { /* router solicitation */ struct icmp6_hdr nd_rs_hdr; /* could be followed by options */ -}; +}__attribute__((__packed__)); #define nd_rs_type nd_rs_hdr.icmp6_type #define nd_rs_code nd_rs_hdr.icmp6_code @@ -232,7 +252,7 @@ struct nd_router_advert { /* router advertisement */ u_int32_t nd_ra_reachable; /* reachable time */ u_int32_t nd_ra_retransmit; /* retransmit timer */ /* could be followed by options */ -}; +} __attribute__((__packed__)); #define nd_ra_type nd_ra_hdr.icmp6_type #define nd_ra_code nd_ra_hdr.icmp6_code @@ -260,7 +280,7 @@ struct nd_neighbor_solicit { /* neighbor solicitation */ struct icmp6_hdr nd_ns_hdr; struct in6_addr nd_ns_target; /*target address */ /* could be followed by options */ -}; +}__attribute__((__packed__)); #define nd_ns_type nd_ns_hdr.icmp6_type #define nd_ns_code nd_ns_hdr.icmp6_code @@ -271,7 +291,7 @@ struct nd_neighbor_advert { /* neighbor advertisement */ struct icmp6_hdr nd_na_hdr; struct in6_addr nd_na_target; /* target address */ /* could be followed by options */ -}; +}__attribute__((__packed__)); #define nd_na_type nd_na_hdr.icmp6_type #define nd_na_code nd_na_hdr.icmp6_code @@ -294,7 +314,7 @@ struct nd_redirect { /* redirect */ struct in6_addr nd_rd_target; /* target address */ struct in6_addr nd_rd_dst; /* destination address */ /* could be followed by options */ -}; +}__attribute__((__packed__)); #define nd_rd_type nd_rd_hdr.icmp6_type #define nd_rd_code nd_rd_hdr.icmp6_code @@ -305,13 +325,14 @@ struct nd_opt_hdr { /* Neighbor discovery 
option header */ u_int8_t nd_opt_type; u_int8_t nd_opt_len; /* followed by option specific data*/ -}; +}__attribute__((__packed__)); #define ND_OPT_SOURCE_LINKADDR 1 #define ND_OPT_TARGET_LINKADDR 2 #define ND_OPT_PREFIX_INFORMATION 3 #define ND_OPT_REDIRECTED_HEADER 4 #define ND_OPT_MTU 5 +#define ND_OPT_RDNSS 25 /* RFC 5006 */ #define ND_OPT_ROUTE_INFO 200 /* draft-ietf-ipngwg-router-preference, not officially assigned yet */ @@ -324,7 +345,7 @@ struct nd_opt_prefix_info { /* prefix information */ u_int32_t nd_opt_pi_preferred_time; u_int32_t nd_opt_pi_reserved2; struct in6_addr nd_opt_pi_prefix; -}; +}__attribute__((__packed__)); #define ND_OPT_PI_FLAG_ONLINK 0x80 #define ND_OPT_PI_FLAG_AUTO 0x40 @@ -335,14 +356,14 @@ struct nd_opt_rd_hdr { /* redirected header */ u_int16_t nd_opt_rh_reserved1; u_int32_t nd_opt_rh_reserved2; /* followed by IP header and data */ -}; +} __attribute__((__packed__)); struct nd_opt_mtu { /* MTU option */ u_int8_t nd_opt_mtu_type; u_int8_t nd_opt_mtu_len; u_int16_t nd_opt_mtu_reserved; u_int32_t nd_opt_mtu_mtu; -}; +}__attribute__((__packed__)); struct nd_opt_route_info { /* route info */ u_int8_t nd_opt_rti_type; @@ -350,8 +371,16 @@ struct nd_opt_route_info { /* route info */ u_int8_t nd_opt_rti_prefixlen; u_int8_t nd_opt_rti_flags; u_int32_t nd_opt_rti_lifetime; - /* followed by prefix */ -}; + /* prefix follows */ +}__attribute__((__packed__)); + +struct nd_opt_rdnss { /* recursive domain name system servers */ + u_int8_t nd_opt_rdnss_type; + u_int8_t nd_opt_rdnss_len; + u_int16_t nd_opt_rdnss_reserved; + u_int32_t nd_opt_rdnss_lifetime; + struct in6_addr nd_opt_rdnss_addr[1]; +} __attribute__((__packed__)); /* * icmp6 namelookup @@ -366,7 +395,7 @@ struct icmp6_namelookup { u_int8_t icmp6_nl_name[3]; #endif /* could be followed by options */ -}; +}__attribute__((__packed__)); /* * icmp6 node information @@ -375,7 +404,7 @@ struct icmp6_nodeinfo { struct icmp6_hdr icmp6_ni_hdr; u_int8_t icmp6_ni_nonce[8]; /* could be followed by reply data */ -}; +}__attribute__((__packed__)); #define ni_type icmp6_ni_hdr.icmp6_type #define ni_code icmp6_ni_hdr.icmp6_code @@ -438,7 +467,7 @@ struct ni_reply_fqdn { u_int32_t ni_fqdn_ttl; /* TTL */ u_int8_t ni_fqdn_namelen; /* length in octets of the FQDN */ u_int8_t ni_fqdn_name[3]; /* XXX: alignment */ -}; +}__attribute__((__packed__)); /* * Router Renumbering. 
as router-renum-08.txt @@ -449,7 +478,7 @@ struct icmp6_router_renum { /* router renumbering header */ u_int8_t rr_flags; u_int16_t rr_maxdelay; u_int32_t rr_reserved; -}; +} __attribute__((__packed__)); #define ICMP6_RR_FLAGS_TEST 0x80 #define ICMP6_RR_FLAGS_REQRESULT 0x40 @@ -471,7 +500,7 @@ struct rr_pco_match { /* match prefix part */ u_int8_t rpm_maxlen; u_int16_t rpm_reserved; struct in6_addr rpm_prefix; -}; +} __attribute__((__packed__)); #define RPM_PCO_ADD 1 #define RPM_PCO_CHANGE 2 @@ -487,7 +516,7 @@ struct rr_pco_use { /* use prefix part */ u_int32_t rpu_pltime; u_int32_t rpu_flags; struct in6_addr rpu_prefix; -}; +} __attribute__((__packed__)); #define ICMP6_RR_PCOUSE_RAFLAGS_ONLINK 0x80 #define ICMP6_RR_PCOUSE_RAFLAGS_AUTO 0x40 @@ -505,7 +534,7 @@ struct rr_result { /* router renumbering result message */ u_int8_t rrr_matchedlen; u_int32_t rrr_ifid; struct in6_addr rrr_prefix; -}; +} __attribute__((__packed__)); #if BYTE_ORDER == BIG_ENDIAN #define ICMP6_RR_RESULT_FLAGS_OOB 0x0002 #define ICMP6_RR_RESULT_FLAGS_FORBIDDEN 0x0001 @@ -613,24 +642,32 @@ struct icmp6stat { /* * Names for ICMP sysctl objects */ -#define ICMPV6CTL_STATS 1 -#define ICMPV6CTL_REDIRACCEPT 2 /* accept/process redirects */ -#define ICMPV6CTL_REDIRTIMEOUT 3 /* redirect cache time */ -#define ICMPV6CTL_ND6_PRUNE 6 -#define ICMPV6CTL_ND6_DELAY 8 -#define ICMPV6CTL_ND6_UMAXTRIES 9 +#define ICMPV6CTL_STATS 1 +#define ICMPV6CTL_REDIRACCEPT 2 /* accept/process redirects */ +#define ICMPV6CTL_REDIRTIMEOUT 3 /* redirect cache time */ +#if 0 /*obsoleted*/ +#define ICMPV6CTL_ERRRATELIMIT 5 /* ICMPv6 error rate limitation */ +#endif +#define ICMPV6CTL_ND6_PRUNE 6 +#define ICMPV6CTL_ND6_DELAY 8 +#define ICMPV6CTL_ND6_UMAXTRIES 9 #define ICMPV6CTL_ND6_MMAXTRIES 10 #define ICMPV6CTL_ND6_USELOOPBACK 11 /*#define ICMPV6CTL_ND6_PROXYALL 12 obsoleted, do not reuse here */ -#define ICMPV6CTL_NODEINFO 13 -#define ICMPV6CTL_ERRPPSLIMIT 14 /* ICMPv6 error pps limitation */ +#define ICMPV6CTL_NODEINFO 13 +#define ICMPV6CTL_ERRPPSLIMIT 14 /* ICMPv6 error pps limitation */ #define ICMPV6CTL_ND6_MAXNUDHINT 15 -#define ICMPV6CTL_MTUDISC_HIWAT 16 -#define ICMPV6CTL_MTUDISC_LOWAT 17 -#define ICMPV6CTL_ND6_DEBUG 18 -#define ICMPV6CTL_ND6_DRLIST 19 -#define ICMPV6CTL_ND6_PRLIST 20 -#define ICMPV6CTL_MAXID 21 +#define ICMPV6CTL_MTUDISC_HIWAT 16 +#define ICMPV6CTL_MTUDISC_LOWAT 17 +#define ICMPV6CTL_ND6_DEBUG 18 +#define ICMPV6CTL_ND6_DRLIST 19 +#define ICMPV6CTL_ND6_PRLIST 20 +#define ICMPV6CTL_MLD_MAXSRCFILTER 21 +#define ICMPV6CTL_MLD_SOMAXSRC 22 +#define ICMPV6CTL_MLD_VERSION 23 +#define ICMPV6CTL_ND6_MAXQLEN 24 +#define ICMPV6CTL_ND6_ACCEPT_6TO4 25 +#define ICMPV6CTL_MAXID 26 #ifdef KERNEL_PRIVATE #define ICMPV6CTL_NAMES { \ @@ -655,6 +692,11 @@ struct icmp6stat { { "nd6_debug", CTLTYPE_INT }, \ { 0, 0 }, \ { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { 0, 0 }, \ + { "nd6_accept_6to4", CTLTYPE_INT }, \ } #define RTF_PROBEMTU RTF_PROTO1 @@ -667,8 +709,8 @@ struct in6_multi; void icmp6_init(void); void icmp6_paramerror(struct mbuf *, int); void icmp6_error(struct mbuf *, int, int, int); -int icmp6_input(struct mbuf **, int *); -void icmp6_fasttimo(void); +void icmp6_error2(struct mbuf *, int, int, int, struct ifnet *); +int icmp6_input(struct mbuf **, int *, int); void icmp6_reflect(struct mbuf *, size_t); void icmp6_prepare(struct mbuf *); void icmp6_redirect_input(struct mbuf *, int); @@ -677,14 +719,17 @@ void icmp6_redirect_output(struct mbuf *, struct rtentry *); struct ip6ctlparam; void icmp6_mtudisc_update(struct 
ip6ctlparam *, int); +extern lck_rw_t icmp6_ifs_rwlock; /* XXX: is this the right place for these macros? */ #define icmp6_ifstat_inc(ifp, tag) \ -do { \ +do { \ + lck_rw_lock_shared(&icmp6_ifs_rwlock); \ if ((ifp) && (ifp)->if_index <= if_index \ - && (ifp)->if_index < icmp6_ifstatmax \ - && icmp6_ifstat && icmp6_ifstat[(ifp)->if_index]) { \ - icmp6_ifstat[(ifp)->if_index]->tag++; \ - } \ + && (ifp)->if_index < icmp6_ifstatmax \ + && icmp6_ifstat && icmp6_ifstat[(ifp)->if_index]) { \ + atomic_add_64(&icmp6_ifstat[(ifp)->if_index]->tag, 1); \ + } \ + lck_rw_done(&icmp6_ifs_rwlock); \ } while (0) #define icmp6_ifoutstat_inc(ifp, type, code) \ @@ -692,7 +737,7 @@ do { \ icmp6_ifstat_inc(ifp, ifs6_out_msg); \ if (type < ICMP6_INFOMSG_MASK) \ icmp6_ifstat_inc(ifp, ifs6_out_error); \ - switch(type) { \ + switch (type) { \ case ICMP6_DST_UNREACH: \ icmp6_ifstat_inc(ifp, ifs6_out_dstunreach); \ if (code == ICMP6_DST_UNREACH_ADMIN) \ @@ -713,13 +758,13 @@ do { \ case ICMP6_ECHO_REPLY: \ icmp6_ifstat_inc(ifp, ifs6_out_echoreply); \ break; \ - case MLD6_LISTENER_QUERY: \ + case MLD_LISTENER_QUERY: \ icmp6_ifstat_inc(ifp, ifs6_out_mldquery); \ break; \ - case MLD6_LISTENER_REPORT: \ + case MLD_LISTENER_REPORT: \ icmp6_ifstat_inc(ifp, ifs6_out_mldreport); \ break; \ - case MLD6_LISTENER_DONE: \ + case MLD_LISTENER_DONE: \ icmp6_ifstat_inc(ifp, ifs6_out_mlddone); \ break; \ case ND_ROUTER_SOLICIT: \ @@ -742,6 +787,12 @@ do { \ extern int icmp6_rediraccept; /* accept/process redirects */ extern int icmp6_redirtimeout; /* cache time for redirect routes */ + +#define ICMP6_NODEINFO_FQDNOK 0x1 +#define ICMP6_NODEINFO_NODEADDROK 0x2 +#define ICMP6_NODEINFO_TMPADDROK 0x4 +#define ICMP6_NODEINFO_GLOBALOK 0x8 + #endif /* KERNEL_PRIVATE */ #endif /* !_NETINET_ICMP6_H_ */ diff --git a/bsd/netinet/if_atm.c b/bsd/netinet/if_atm.c deleted file mode 100644 index 0fa54144f..000000000 --- a/bsd/netinet/if_atm.c +++ /dev/null @@ -1,303 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* $NetBSD: if_atm.c,v 1.6 1996/10/13 02:03:01 christos Exp $ */ - -/* - * - * Copyright (c) 1996 Charles D. Cranor and Washington University. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by Charles D. Cranor and - * Washington University. - * 4. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD: src/sys/netinet/if_atm.c,v 1.8 1999/12/07 17:39:06 shin Exp $ - */ - -/* - * IP <=> ATM address resolution. - */ - -#if defined(INET) || defined(INET6) - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/queue.h> -#include <sys/mbuf.h> -#include <sys/socket.h> -#include <sys/sockio.h> -#include <sys/syslog.h> - -#include <net/if.h> -#include <net/if_dl.h> -#include <net/route.h> -#include <net/if_atm.h> - -#include <netinet/in.h> -#include <netinet/if_atm.h> -#include <net/dlil.h> - - -#if NATM -#include <netnatm/natm.h> -#endif - - -#define SDL(s) ((struct sockaddr_dl *)s) - -/* - * atm_rtrequest: handle ATM rt request (in support of generic code) - * inputs: "req" = request code - * "rt" = route entry - * "sa" = sockaddr - */ - -void -atm_rtrequest(req, rt, sa) - int req; - register struct rtentry *rt; - struct sockaddr *sa; -{ - register struct sockaddr *gate = rt->rt_gateway; - struct atm_pseudoioctl api; -#if NATM - struct sockaddr_in *sin; - struct natmpcb *npcb = NULL; - struct atm_pseudohdr *aph; -#endif - static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK}; - - if (rt->rt_flags & RTF_GATEWAY) /* link level requests only */ - return; - - switch (req) { - - case RTM_RESOLVE: /* resolve: only happens when cloning */ - printf("atm_rtrequest: RTM_RESOLVE request detected?\n"); - break; - - case RTM_ADD: - - /* - * route added by a command (e.g. ifconfig, route, arp...). - * - * first check to see if this is not a host route, in which - * case we are being called via "ifconfig" to set the address. 
- */ - - if ((rt->rt_flags & RTF_HOST) == 0) { - rt_setgate(rt,rt_key(rt),(struct sockaddr *)&null_sdl); - gate = rt->rt_gateway; - SDL(gate)->sdl_type = rt->rt_ifp->if_type; - SDL(gate)->sdl_index = rt->rt_ifp->if_index; - break; - } - - if ((rt->rt_flags & RTF_CLONING) != 0) { - printf("atm_rtrequest: cloning route detected?\n"); - break; - } - if (gate->sa_family != AF_LINK || - gate->sa_len < sizeof(null_sdl)) { - log(LOG_DEBUG, "atm_rtrequest: bad gateway value"); - break; - } - -#if DIAGNOSTIC - if (rt->rt_ifp->if_ioctl == NULL) panic("atm null ioctl"); -#endif - -#if NATM - /* - * let native ATM know we are using this VCI/VPI - * (i.e. reserve it) - */ - sin = (struct sockaddr_in *) rt_key(rt); - if (sin->sin_family != AF_INET) - goto failed; - aph = (struct atm_pseudohdr *) LLADDR(SDL(gate)); - npcb = npcb_add(NULL, rt->rt_ifp, ATM_PH_VCI(aph), - ATM_PH_VPI(aph)); - if (npcb == NULL) - goto failed; - npcb->npcb_flags |= NPCB_IP; - npcb->ipaddr.s_addr = sin->sin_addr.s_addr; - /* XXX: move npcb to llinfo when ATM ARP is ready */ - rt->rt_llinfo = (caddr_t) npcb; - rt->rt_flags |= RTF_LLINFO; -#endif - /* - * let the lower level know this circuit is active - */ - bcopy(LLADDR(SDL(gate)), &api.aph, sizeof(api.aph)); - api.rxhand = NULL; - if (ifnet_ioctl(rt->rt_ifp, 0, SIOCATMENA, &api) != 0) { - printf("atm: couldn't add VC\n"); - goto failed; - } - - SDL(gate)->sdl_type = rt->rt_ifp->if_type; - SDL(gate)->sdl_index = rt->rt_ifp->if_index; - - break; - -failed: -#if NATM - if (npcb) { - npcb_free(npcb, NPCB_DESTROY); - rt->rt_llinfo = NULL; - rt->rt_flags &= ~RTF_LLINFO; - } -#endif - rtrequest(RTM_DELETE, rt_key(rt), (struct sockaddr *)0, - rt_mask(rt), 0, (struct rtentry **) 0); - break; - - case RTM_DELETE: - -#if NATM - /* - * tell native ATM we are done with this VC - */ - - if (rt->rt_flags & RTF_LLINFO) { - npcb_free((struct natmpcb *)rt->rt_llinfo, - NPCB_DESTROY); - rt->rt_llinfo = NULL; - rt->rt_flags &= ~RTF_LLINFO; - } -#endif - /* - * tell the lower layer to disable this circuit - */ - - bcopy(LLADDR(SDL(gate)), &api.aph, sizeof(api.aph)); - api.rxhand = NULL; - ifnet_ioctl(rt->rt_ifp, 0, SIOCATMDIS, &api); - - break; - } -} - -/* - * atmresolve: - * inputs: - * [1] "rt" = the link level route to use (or null if need to look one up) - * [2] "m" = mbuf containing the data to be sent - * [3] "dst" = sockaddr_in (IP) address of dest. - * output: - * [4] "desten" = ATM pseudo header which we will fill in VPI/VCI info - * return: - * 0 == resolve FAILED; note that "m" gets m_freem'd in this case - * 1 == resolve OK; desten contains result - * - * XXX: will need more work if we wish to support ATMARP in the kernel, - * but this is enough for PVCs entered via the "route" command. - */ - -int -atmresolve(rt, m, dst, desten) - -register struct rtentry *rt; -struct mbuf *m; -register struct sockaddr *dst; -register struct atm_pseudohdr *desten; /* OUT */ - -{ - struct sockaddr_dl *sdl; - - if (m->m_flags & (M_BCAST|M_MCAST)) { - log(LOG_INFO, "atmresolve: BCAST/MCAST packet detected/dumped"); - goto bad; - } - - if (rt == NULL) { - rt = RTALLOC1(dst, 0); - if (rt == NULL) goto bad; /* failed */ - rtunref(rt); /* don't keep LL references */ - if ((rt->rt_flags & RTF_GATEWAY) != 0 || - (rt->rt_flags & RTF_LLINFO) == 0 || - /* XXX: are we using LLINFO? */ - rt->rt_gateway->sa_family != AF_LINK) { - goto bad; - } - } - - /* - * note that rt_gateway is a sockaddr_dl which contains the - * atm_pseudohdr data structure for this route.
we currently - * don't need any rt_llinfo info (but will if we want to support - * ATM ARP [c.f. if_ether.c]). - */ - - sdl = SDL(rt->rt_gateway); - - /* - * Check the address family and length is valid, the address - * is resolved; otherwise, try to resolve. - */ - - - if (sdl->sdl_family == AF_LINK && sdl->sdl_alen == sizeof(*desten)) { - bcopy(LLADDR(sdl), desten, sdl->sdl_alen); - return(1); /* ok, go for it! */ - } - - /* - * we got an entry, but it doesn't have valid link address - * info in it (it is prob. the interface route, which has - * sdl_alen == 0). dump packet. (fall through to "bad"). - */ - -bad: - m_freem(m); - return(0); -} -#endif /* INET */ diff --git a/bsd/netinet/if_atm.h b/bsd/netinet/if_atm.h deleted file mode 100644 index 989fa974d..000000000 --- a/bsd/netinet/if_atm.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* $FreeBSD: src/sys/netinet/if_atm.h,v 1.2.6.1 2000/08/03 01:07:02 peter Exp $ */ -/* $NetBSD: if_atm.h,v 1.2 1996/07/03 17:17:17 chuck Exp $ */ - -/* - * - * Copyright (c) 1996 Charles D. Cranor and Washington University. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by Charles D. Cranor and - * Washington University. - * 4. The name of the author may not be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * if_atm.h - */ -#include <sys/appleapiopts.h> - -#ifdef KERNEL_PRIVATE -struct atm_pseudohdr; -struct mbuf; -struct rtentry; -struct sockaddr; - -void atm_rtrequest(int, struct rtentry *, struct sockaddr *); -int atmresolve(struct rtentry *, struct mbuf *, struct sockaddr *, - struct atm_pseudohdr *); -#endif /* KERNEL_PRIVATE */ diff --git a/bsd/netinet/if_fddi.h b/bsd/netinet/if_fddi.h deleted file mode 100644 index fb9f81f10..000000000 --- a/bsd/netinet/if_fddi.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1982, 1986, 1993 - * The Regents of the University of California. All rights reserved. - * Copyright (c) 1995 Matt Thomas (thomas@lkg.dec.com) - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)if_fddi.h 8.1 (Berkeley) 6/10/93 - * $FreeBSD: src/sys/netinet/if_fddi.h,v 1.8 1999/12/29 04:40:58 peter Exp $ - */ - -#ifndef _NETINET_IF_FDDI_H_ -#define _NETINET_IF_FDDI_H_ -#include <sys/appleapiopts.h> - -/* - * Structure of an 100Mb/s FDDI header. - */ -struct fddi_header { - u_char fddi_fc; - u_char fddi_dhost[6]; - u_char fddi_shost[6]; -}; - -#define FDDIIPMTU 4352 -#define FDDIMTU 4470 -#define FDDIMIN 3 - -#define FDDIFC_C 0x80 /* 0b10000000 */ -#define FDDIFC_L 0x40 /* 0b01000000 */ -#define FDDIFC_F 0x30 /* 0b00110000 */ -#define FDDIFC_Z 0x0F /* 0b00001111 */ - -#define FDDIFC_LLC_ASYNC 0x50 -#define FDDIFC_LLC_PRIO0 0 -#define FDDIFC_LLC_PRIO1 1 -#define FDDIFC_LLC_PRIO2 2 -#define FDDIFC_LLC_PRIO3 3 -#define FDDIFC_LLC_PRIO4 4 -#define FDDIFC_LLC_PRIO5 5 -#define FDDIFC_LLC_PRIO6 6 -#define FDDIFC_LLC_PRIO7 7 -#define FDDIFC_LLC_SYNC 0xd0 -#define FDDIFC_SMT 0x40 - -#ifdef KERNEL_PRIVATE -#define fddibroadcastaddr etherbroadcastaddr -#define fddi_ipmulticast_min ether_ipmulticast_min -#define fddi_ipmulticast_max ether_ipmulticast_max -#define fddi_addmulti ether_addmulti -#define fddi_delmulti ether_delmulti -#define fddi_sprintf ether_sprintf - -void fddi_ifattach(struct ifnet *); -void fddi_input(struct ifnet *, struct fddi_header *, struct mbuf *); -int fddi_output(struct ifnet *, - struct mbuf *, struct sockaddr *, struct rtentry *); -#endif /* KERNEL_PRIVATE */ - -#endif /* _NETINET_IF_FDDI_H_ */ diff --git a/bsd/netinet/igmp.c b/bsd/netinet/igmp.c index bc2b80d68..d10484ddf 100644 --- a/bsd/netinet/igmp.c +++ b/bsd/netinet/igmp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,7 +25,8 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* +/*- + * Copyright (c) 2007-2009 Bruce Simpson. * Copyright (c) 1988 Stephen Deering. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. @@ -72,15 +73,19 @@ /* * Internet Group Management Protocol (IGMP) routines. + * [RFC1112, RFC2236, RFC3376] * * Written by Steve Deering, Stanford, May 1988. * Modified by Rosen Sharma, Stanford, Aug 1994. * Modified by Bill Fenner, Xerox PARC, Feb 1995. 
* Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995. + * Significantly rewritten for IGMPv3, VIMAGE, and SMP by Bruce Simpson. * * MULTICAST Revision: 3.5.1.4 */ +#include <sys/cdefs.h> + #include <sys/param.h> #include <sys/systm.h> #include <sys/malloc.h> @@ -89,6 +94,10 @@ #include <sys/protosw.h> #include <sys/kernel.h> #include <sys/sysctl.h> +#include <sys/mcache.h> + +#include <libkern/libkern.h> +#include <kern/zalloc.h> #include <net/if.h> #include <net/route.h> @@ -100,449 +109,3659 @@ #include <netinet/ip_var.h> #include <netinet/igmp.h> #include <netinet/igmp_var.h> +#include <netinet/kpi_ipfilter_var.h> + +#ifdef IGMP_DEBUG +__inline__ char * +inet_ntoa(struct in_addr ina) +{ + static char buf[4*sizeof "123"]; + unsigned char *ucp = (unsigned char *)&ina; -#if CONFIG_MACF_NET -#include <security/mac_framework.h> + snprintf(buf, sizeof(buf), "%d.%d.%d.%d", + ucp[0] & 0xff, + ucp[1] & 0xff, + ucp[2] & 0xff, + ucp[3] & 0xff); + return buf; +} #endif -#ifndef __APPLE__ -static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state"); +static void igi_initvar(struct igmp_ifinfo *, struct ifnet *, int); +static struct igmp_ifinfo *igi_alloc(int); +static void igi_free(struct igmp_ifinfo *); +static void igi_delete(const struct ifnet *); +static void igmp_dispatch_queue(struct igmp_ifinfo *, struct ifqueue *, + int, const int, struct ifnet *); +static void igmp_final_leave(struct in_multi *, struct igmp_ifinfo *); +static int igmp_handle_state_change(struct in_multi *, + struct igmp_ifinfo *); +static int igmp_initial_join(struct in_multi *, struct igmp_ifinfo *); +static int igmp_input_v1_query(struct ifnet *, const struct ip *, + const struct igmp *); +static int igmp_input_v2_query(struct ifnet *, const struct ip *, + const struct igmp *); +static int igmp_input_v3_query(struct ifnet *, const struct ip *, + /*const*/ struct igmpv3 *); +static int igmp_input_v3_group_query(struct in_multi *, + int, /*const*/ struct igmpv3 *); +static int igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *, + /*const*/ struct igmp *); +static int igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *, + /*const*/ struct igmp *); +void igmp_sendpkt(struct mbuf *, struct ifnet *); +static __inline__ int igmp_isgroupreported(const struct in_addr); +static struct mbuf * + igmp_ra_alloc(void); +#ifdef IGMP_DEBUG +static const char * igmp_rec_type_to_str(const int); #endif +static void igmp_set_version(struct igmp_ifinfo *, const int); +static void igmp_flush_relq(struct igmp_ifinfo *); +static int igmp_v1v2_queue_report(struct in_multi *, const int); +static void igmp_v1v2_process_group_timer(struct in_multi *, const int); +static void igmp_v1v2_process_querier_timers(struct igmp_ifinfo *); +static void igmp_v2_update_group(struct in_multi *, const int); +static void igmp_v3_cancel_link_timers(struct igmp_ifinfo *); +static void igmp_v3_dispatch_general_query(struct igmp_ifinfo *); +static struct mbuf * + igmp_v3_encap_report(struct ifnet *, struct mbuf *); +static int igmp_v3_enqueue_group_record(struct ifqueue *, + struct in_multi *, const int, const int, const int); +static int igmp_v3_enqueue_filter_change(struct ifqueue *, + struct in_multi *); +static void igmp_v3_process_group_timers(struct igmp_ifinfo *, + struct ifqueue *, struct ifqueue *, struct in_multi *, + const int); +static int igmp_v3_merge_state_changes(struct in_multi *, + struct ifqueue *); +static void igmp_v3_suppress_group_record(struct in_multi *); +static int sysctl_igmp_ifinfo SYSCTL_HANDLER_ARGS; +static int 
sysctl_igmp_gsr SYSCTL_HANDLER_ARGS; +static int sysctl_igmp_default_version SYSCTL_HANDLER_ARGS; -static struct router_info * - find_rti(struct ifnet *ifp, int wait); +struct mbuf *m_raopt; /* Router Alert option */ -static struct igmpstat igmpstat; +static int interface_timers_running; /* IGMPv3 general + * query response */ +static int state_change_timers_running; /* IGMPv3 state-change + * retransmit */ +static int current_state_timers_running; /* IGMPv1/v2 host + * report; IGMPv3 g/sg + * query response */ -SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RD, - &igmpstat, igmpstat, ""); +static LIST_HEAD(, igmp_ifinfo) igi_head; +static struct igmpstat_v3 igmpstat_v3 = { + .igps_version = IGPS_VERSION_3, + .igps_len = sizeof(struct igmpstat_v3), +}; +static struct igmpstat igmpstat; /* old IGMPv2 stats structure */ +static struct timeval igmp_gsrdelay = {10, 0}; -static int igmp_timers_are_running; -static uint32_t igmp_all_hosts_group; -static uint32_t igmp_all_rtrs_group; -static struct mbuf *router_alert; -static struct router_info *Head; +static int igmp_recvifkludge = 1; +static int igmp_sendra = 1; +static int igmp_sendlocal = 1; +static int igmp_v1enable = 1; +static int igmp_v2enable = 1; +static int igmp_legacysupp = 0; +static int igmp_default_version = IGMP_VERSION_3; -static void igmp_sendpkt(struct in_multi *, int, uint32_t); +SYSCTL_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RD | CTLFLAG_LOCKED, + &igmpstat, igmpstat, ""); +SYSCTL_STRUCT(_net_inet_igmp, OID_AUTO, v3stats, + CTLFLAG_RD | CTLFLAG_LOCKED, &igmpstat_v3, igmpstat_v3, ""); +SYSCTL_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_RW | CTLFLAG_LOCKED, + &igmp_recvifkludge, 0, + "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address"); +SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_RW | CTLFLAG_LOCKED, + &igmp_sendra, 0, + "Send IP Router Alert option in IGMPv2/v3 messages"); +SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_RW | CTLFLAG_LOCKED, + &igmp_sendlocal, 0, + "Send IGMP membership reports for 224.0.0.0/24 groups"); +SYSCTL_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_RW | CTLFLAG_LOCKED, + &igmp_v1enable, 0, + "Enable backwards compatibility with IGMPv1"); +SYSCTL_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_RW | CTLFLAG_LOCKED, + &igmp_v2enable, 0, + "Enable backwards compatibility with IGMPv2"); +SYSCTL_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_RW | CTLFLAG_LOCKED, + &igmp_legacysupp, 0, + "Allow v1/v2 reports to suppress v3 group responses"); +SYSCTL_PROC(_net_inet_igmp, OID_AUTO, default_version, + CTLTYPE_INT | CTLFLAG_RW, + &igmp_default_version, 0, sysctl_igmp_default_version, "I", + "Default version of IGMP to run on each interface"); +SYSCTL_PROC(_net_inet_igmp, OID_AUTO, gsrdelay, + CTLTYPE_INT | CTLFLAG_RW, + &igmp_gsrdelay.tv_sec, 0, sysctl_igmp_gsr, "I", + "Rate limit for IGMPv3 Group-and-Source queries in seconds"); +#ifdef IGMP_DEBUG +int igmp_debug = 0; +SYSCTL_INT(_net_inet_igmp, OID_AUTO, + debug, CTLFLAG_RW | CTLFLAG_LOCKED, &igmp_debug, 0, ""); +#endif -void -igmp_init(void) -{ - struct ipoption *ra; +SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_LOCKED, + sysctl_igmp_ifinfo, "Per-interface IGMPv3 state"); - /* - * To avoid byte-swapping the same value over and over again. 
- */ - igmp_all_hosts_group = htonl(INADDR_ALLHOSTS_GROUP); - igmp_all_rtrs_group = htonl(INADDR_ALLRTRS_GROUP); +/* Lock group and attribute for igmp_mtx */ +static lck_attr_t *igmp_mtx_attr; +static lck_grp_t *igmp_mtx_grp; +static lck_grp_attr_t *igmp_mtx_grp_attr; - igmp_timers_are_running = 0; +/* + * Locking and reference counting: + * + * igmp_mtx mainly protects igi_head. In cases where both igmp_mtx and + * in_multihead_lock must be held, the former must be acquired first in order + * to maintain lock ordering. It is not a requirement that igmp_mtx be + * acquired first before in_multihead_lock, but in case both must be acquired + * in succession, the correct lock ordering must be followed. + * + * Instead of walking the if_multiaddrs list at the interface and returning + * the ifma_protospec value of a matching entry, we search the global list + * of in_multi records and find it that way; this is done with in_multihead + * lock held. Doing so avoids the race condition issues that many other BSDs + * suffer from (therefore in our implementation, ifma_protospec will never be + * NULL for as long as the in_multi is valid.) + * + * The above creates a requirement for the in_multi to stay in in_multihead + * list even after the final IGMP leave (in IGMPv3 mode) until it no longer + * needs to be retransmitted (this is not required for IGMPv1/v2.) In order + * to handle this, the request and reference counts of the in_multi are + * bumped up when the state changes to IGMP_LEAVING_MEMBER, and later dropped + * in the timeout handler. Each in_multi holds a reference to the underlying + * igmp_ifinfo. + * + * Thus, the permitted lock order is: + * + * igmp_mtx, in_multihead_lock, inm_lock, igi_lock + * + * Any may be taken independently, but if any are held at the same time, + * the above lock order must be followed. + */ +static decl_lck_mtx_data(, igmp_mtx); +static int igmp_timers_are_running; - /* - * Construct a Router Alert option to use in outgoing packets - */ - MGET(router_alert, M_WAIT, MT_DATA); - ra = mtod(router_alert, struct ipoption *); - ra->ipopt_dst.s_addr = 0; - ra->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */ - ra->ipopt_list[1] = 0x04; /* 4 bytes long */ - ra->ipopt_list[2] = 0x00; - ra->ipopt_list[3] = 0x00; - router_alert->m_len = sizeof(ra->ipopt_dst) + ra->ipopt_list[1]; +#define IGI_ZONE_MAX 64 /* maximum elements in zone */ +#define IGI_ZONE_NAME "igmp_ifinfo" /* zone name */ - Head = (struct router_info *) 0; -} +static unsigned int igi_size; /* size of zone element */ +static struct zone *igi_zone; /* zone for igmp_ifinfo */ -static struct router_info * -find_rti( - struct ifnet *ifp, int wait) +#ifdef IGMP_DEBUG +static __inline char * +inet_ntoa_haddr(in_addr_t haddr) { - struct router_info *rti = Head; - - -#if IGMP_DEBUG - printf("[igmp.c, _find_rti] --> entering \n"); -#endif - while (rti) { - if (rti->rti_ifp == ifp) { -#if IGMP_DEBUG - printf("[igmp.c, _find_rti] --> found old entry \n"); -#endif - return rti; - } - rti = rti->rti_next; - } - - MALLOC(rti, struct router_info *, sizeof *rti, M_IGMP, wait); - if (rti != NULL) - { - rti->rti_ifp = ifp; - rti->rti_type = IGMP_V2_ROUTER; - rti->rti_time = 0; - rti->rti_next = Head; - Head = rti; - } -#if IGMP_DEBUG - if (rti) printf("[igmp.c, _find_rti] --> created an entry \n"); + struct in_addr ia; + + ia.s_addr = htonl(haddr); + return (inet_ntoa(ia)); +} #endif - return rti; +/* + * Retrieve or set default IGMP version. 
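+ *
+ * Illustrative usage (hypothetical command, not part of this change):
+ * "sysctl -w net.inet.igmp.default_version=2" would select IGMPv2 for
+ * interfaces attached afterwards; the handler below rejects values
+ * outside [IGMP_VERSION_1, IGMP_VERSION_3] with EINVAL.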
+ */ +static int +sysctl_igmp_default_version SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg2) + int error; + int new; + + lck_mtx_lock(&igmp_mtx); + + error = SYSCTL_OUT(req, arg1, sizeof(int)); + if (error || !req->newptr) + goto out_locked; + + new = igmp_default_version; + + error = SYSCTL_IN(req, &new, sizeof(int)); + if (error) + goto out_locked; + + if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) { + error = EINVAL; + goto out_locked; + } + + IGMP_PRINTF(("change igmp_default_version from %d to %d\n", + igmp_default_version, new)); + + igmp_default_version = new; + +out_locked: + lck_mtx_unlock(&igmp_mtx); + return (error); } -void -igmp_input( - struct mbuf *m, - int iphlen) +/* + * Retrieve or set threshold between group-source queries in seconds. + * + */ +static int +sysctl_igmp_gsr SYSCTL_HANDLER_ARGS { - struct igmp *igmp; - struct ip *ip; - int igmplen; - struct ifnet *ifp = m->m_pkthdr.rcvif; - int minlen; - struct in_multi *inm; - struct in_ifaddr *ia; - struct in_multistep step; - struct router_info *rti; - - int timer; /** timer value in the igmp query header **/ +#pragma unused(arg1, arg2) + int error; + int i; - ++igmpstat.igps_rcv_total; + lck_mtx_lock(&igmp_mtx); - ip = mtod(m, struct ip *); - igmplen = ip->ip_len; + i = igmp_gsrdelay.tv_sec; - /* - * Validate lengths - */ - if (igmplen < IGMP_MINLEN) { - ++igmpstat.igps_rcv_tooshort; - m_freem(m); - return; - } - minlen = iphlen + IGMP_MINLEN; - if ((m->m_flags & M_EXT || m->m_len < minlen) && - (m = m_pullup(m, minlen)) == 0) { - ++igmpstat.igps_rcv_tooshort; - return; - } + error = sysctl_handle_int(oidp, &i, 0, req); + if (error || !req->newptr) + goto out_locked; - /* - * Validate checksum - */ - m->m_data += iphlen; - m->m_len -= iphlen; - igmp = mtod(m, struct igmp *); - if (in_cksum(m, igmplen)) { - ++igmpstat.igps_rcv_badsum; - m_freem(m); - return; + if (i < -1 || i >= 60) { + error = EINVAL; + goto out_locked; } - m->m_data -= iphlen; - m->m_len += iphlen; - ip = mtod(m, struct ip *); - timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE; - if (timer == 0) - timer = 1; - rti = find_rti(ifp, M_NOWAIT); - if (rti == NULL) { - m_freem(m); - return; - } + igmp_gsrdelay.tv_sec = i; - /* - * In the IGMPv2 specification, there are 3 states and a flag. - * - * In Non-Member state, we simply don't have a membership record. - * In Delaying Member state, our timer is running (inm->inm_timer) - * In Idle Member state, our timer is not running (inm->inm_timer==0) - * - * The flag is inm->inm_state, it is set to IGMP_OTHERMEMBER if - * we have heard a report from another member, or IGMP_IREPORTEDLAST - * if I sent the last report. - */ - switch (igmp->igmp_type) { +out_locked: + lck_mtx_unlock(&igmp_mtx); + return (error); +} - case IGMP_MEMBERSHIP_QUERY: - ++igmpstat.igps_rcv_queries; +/* + * Expose struct igmp_ifinfo to userland, keyed by ifindex. + * For use by ifmcstat(8). + * + */ +static int +sysctl_igmp_ifinfo SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp) + int *name; + int error; + u_int namelen; + struct ifnet *ifp; + struct igmp_ifinfo *igi; + struct igmp_ifinfo_u igi_u; - if (ifp->if_flags & IFF_LOOPBACK) - break; + name = (int *)arg1; + namelen = arg2; - if (igmp->igmp_code == 0) { - /* - * Old router. Remember that the querier on this - * interface is old, and set the timer to the - * value in RFC 1112. 
- */ + if (req->newptr != USER_ADDR_NULL) + return (EPERM); - rti->rti_type = IGMP_V1_ROUTER; - rti->rti_time = 0; + if (namelen != 1) + return (EINVAL); - timer = IGMP_MAX_HOST_REPORT_DELAY * PR_FASTHZ; + lck_mtx_lock(&igmp_mtx); - if (ip->ip_dst.s_addr != igmp_all_hosts_group || - igmp->igmp_group.s_addr != 0) { - ++igmpstat.igps_rcv_badqueries; - m_freem(m); - return; - } - } else { - /* - * New router. Simply do the new validity check. - */ - - if (igmp->igmp_group.s_addr != 0 && - !IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) { - ++igmpstat.igps_rcv_badqueries; - m_freem(m); - return; - } - } + if (name[0] <= 0 || name[0] > (u_int)if_index) { + error = ENOENT; + goto out_locked; + } - /* - * - Start the timers in all of our membership records - * that the query applies to for the interface on - * which the query arrived excl. those that belong - * to the "all-hosts" group (224.0.0.1). - * - Restart any timer that is already running but has - * a value longer than the requested timeout. - * - Use the value specified in the query message as - * the maximum timeout. - */ - lck_mtx_lock(rnh_lock); - IN_FIRST_MULTI(step, inm); - while (inm != NULL) { - if (inm->inm_ifp == ifp && - inm->inm_addr.s_addr != igmp_all_hosts_group && - (igmp->igmp_group.s_addr == 0 || - igmp->igmp_group.s_addr == inm->inm_addr.s_addr)) { - if (inm->inm_timer == 0 || - inm->inm_timer > timer) { - inm->inm_timer = - IGMP_RANDOM_DELAY(timer); - igmp_timers_are_running = 1; - } - } - IN_NEXT_MULTI(step, inm); + error = ENOENT; + + ifnet_head_lock_shared(); + ifp = ifindex2ifnet[name[0]]; + ifnet_head_done(); + if (ifp == NULL) + goto out_locked; + + bzero(&igi_u, sizeof (igi_u)); + + LIST_FOREACH(igi, &igi_head, igi_link) { + IGI_LOCK(igi); + if (ifp != igi->igi_ifp) { + IGI_UNLOCK(igi); + continue; } - lck_mtx_unlock(rnh_lock); + igi_u.igi_ifindex = igi->igi_ifp->if_index; + igi_u.igi_version = igi->igi_version; + igi_u.igi_v1_timer = igi->igi_v1_timer; + igi_u.igi_v2_timer = igi->igi_v2_timer; + igi_u.igi_v3_timer = igi->igi_v3_timer; + igi_u.igi_flags = igi->igi_flags; + igi_u.igi_rv = igi->igi_rv; + igi_u.igi_qi = igi->igi_qi; + igi_u.igi_qri = igi->igi_qri; + igi_u.igi_uri = igi->igi_uri; + IGI_UNLOCK(igi); + error = SYSCTL_OUT(req, &igi_u, sizeof (igi_u)); break; + } - case IGMP_V1_MEMBERSHIP_REPORT: - case IGMP_V2_MEMBERSHIP_REPORT: - /* - * For fast leave to work, we have to know that we are the - * last person to send a report for this group. Reports - * can potentially get looped back if we are a multicast - * router, so discard reports sourced by me. - */ - IFP_TO_IA(ifp, ia); - if (ia && ip->ip_src.s_addr == IA_SIN(ia)->sin_addr.s_addr) { - ifafree(&ia->ia_ifa); - break; - } +out_locked: + lck_mtx_unlock(&igmp_mtx); + return (error); +} - ++igmpstat.igps_rcv_reports; +/* + * Dispatch an entire queue of pending packet chains + * + * Must not be called with inm_lock held. 
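+ *
+ * (Why: as the lock-order comment above notes, inm_lock precedes
+ * igi_lock; this routine drops and retakes the igi lock around each
+ * igmp_sendpkt() call, so a caller still holding inm_lock across it
+ * could otherwise invert that ordering.)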
+ */ +static void +igmp_dispatch_queue(struct igmp_ifinfo *igi, struct ifqueue *ifq, int limit, + const int loop, struct ifnet *ifp) +{ + struct mbuf *m; + struct ip *ip; - if (ifp->if_flags & IFF_LOOPBACK) { - if (ia != NULL) - ifafree(&ia->ia_ifa); + if (igi != NULL) + IGI_LOCK_ASSERT_HELD(igi); + + for (;;) { + IF_DEQUEUE(ifq, m); + if (m == NULL) break; - } + IGMP_PRINTF(("%s: dispatch %p from %p\n", __func__, ifq, m)); + ip = mtod(m, struct ip *); + if (loop) + m->m_flags |= M_IGMP_LOOP; + if (igi != NULL) + IGI_UNLOCK(igi); + igmp_sendpkt(m, ifp); + if (igi != NULL) + IGI_LOCK(igi); + if (--limit == 0) + break; + } - if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr))) { - ++igmpstat.igps_rcv_badreports; - m_freem(m); - if (ia != NULL) - ifafree(&ia->ia_ifa); - return; - } + if (igi != NULL) + IGI_LOCK_ASSERT_HELD(igi); +} - /* - * KLUDGE: if the IP source address of the report has an - * unspecified (i.e., zero) subnet number, as is allowed for - * a booting host, replace it with the correct subnet number - * so that a process-level multicast routing demon can - * determine which subnet it arrived from. This is necessary - * to compensate for the lack of any way for a process to - * determine the arrival interface of an incoming packet. - */ - if ((ntohl(ip->ip_src.s_addr) & IN_CLASSA_NET) == 0) - if (ia) ip->ip_src.s_addr = htonl(ia->ia_subnet); +/* + * Filter outgoing IGMP report state by group. + * + * Reports are ALWAYS suppressed for ALL-HOSTS (224.0.0.1). + * If the net.inet.igmp.sendlocal sysctl is 0, then IGMP reports are + * disabled for all groups in the 224.0.0.0/24 link-local scope. However, + * this may break certain IGMP snooping switches which rely on the old + * report behaviour. + * + * Return zero if the given group is one for which IGMP reports + * should be suppressed, or non-zero if reports should be issued. + */ - /* - * If we belong to the group being reported, stop - * our timer for that group. - */ - ifnet_lock_shared(ifp); - IN_LOOKUP_MULTI(igmp->igmp_group, ifp, inm); - ifnet_lock_done(ifp); +static __inline__ +int igmp_isgroupreported(const struct in_addr addr) +{ - if (inm != NULL) { - inm->inm_timer = 0; - ++igmpstat.igps_rcv_ourreports; + if (in_allhosts(addr) || + ((!igmp_sendlocal && IN_LOCAL_GROUP(ntohl(addr.s_addr))))) + return (0); - inm->inm_state = IGMP_OTHERMEMBER; - } + return (1); +} - if (ia != NULL) - ifafree(&ia->ia_ifa); - break; - } +/* + * Construct a Router Alert option to use in outgoing packets. + */ +static struct mbuf * +igmp_ra_alloc(void) +{ + struct mbuf *m; + struct ipoption *p; - /* - * Pass all valid IGMP packets up to any process(es) listening - * on a raw IGMP socket. - */ - rip_input(m, iphlen); + MGET(m, M_WAITOK, MT_DATA); + p = mtod(m, struct ipoption *); + p->ipopt_dst.s_addr = INADDR_ANY; + p->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */ + p->ipopt_list[1] = 0x04; /* 4 bytes long */ + p->ipopt_list[2] = IPOPT_EOL; /* End of IP option list */ + p->ipopt_list[3] = 0x00; /* pad byte */ + m->m_len = sizeof(p->ipopt_dst) + p->ipopt_list[1]; + + return (m); } -int -igmp_joingroup(struct in_multi *inm) +/* + * Attach IGMP when PF_INET is attached to an interface. 
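+ *
+ * Two references are taken on the new igmp_ifinfo: one held by the
+ * global igi_head list and one returned to the caller (see the paired
+ * IGI_ADDREF_LOCKED() calls below).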
+ */ +struct igmp_ifinfo * +igmp_domifattach(struct ifnet *ifp, int how) { + struct igmp_ifinfo *igi; - if (inm->inm_addr.s_addr == igmp_all_hosts_group - || inm->inm_ifp->if_flags & IFF_LOOPBACK) { - inm->inm_timer = 0; - inm->inm_state = IGMP_OTHERMEMBER; - } else { - inm->inm_rti = find_rti(inm->inm_ifp, M_WAITOK); - if (inm->inm_rti == NULL) return ENOMEM; - igmp_sendpkt(inm, inm->inm_rti->rti_type, 0); - inm->inm_timer = IGMP_RANDOM_DELAY( - IGMP_MAX_HOST_REPORT_DELAY*PR_FASTHZ); - inm->inm_state = IGMP_IREPORTEDLAST; - igmp_timers_are_running = 1; - } - return 0; + IGMP_PRINTF(("%s: called for ifp %p(%s)\n", + __func__, ifp, ifp->if_name)); + + igi = igi_alloc(how); + if (igi == NULL) + return (NULL); + + lck_mtx_lock(&igmp_mtx); + + IGI_LOCK(igi); + igi_initvar(igi, ifp, 0); + igi->igi_debug |= IFD_ATTACHED; + IGI_ADDREF_LOCKED(igi); /* hold a reference for igi_head */ + IGI_ADDREF_LOCKED(igi); /* hold a reference for caller */ + IGI_UNLOCK(igi); + + LIST_INSERT_HEAD(&igi_head, igi, igi_link); + + lck_mtx_unlock(&igmp_mtx); + + IGMP_PRINTF(("allocate igmp_ifinfo for ifp %p(%s)\n", + ifp, ifp->if_name)); + + return (igi); } +/* + * Attach IGMP when PF_INET is reattached to an interface. Caller is + * expected to have an outstanding reference to the igi. + */ void -igmp_leavegroup(struct in_multi *inm) +igmp_domifreattach(struct igmp_ifinfo *igi) { - if (inm->inm_state == IGMP_IREPORTEDLAST && - inm->inm_addr.s_addr != igmp_all_hosts_group && - !(inm->inm_ifp->if_flags & IFF_LOOPBACK) && - inm->inm_rti->rti_type != IGMP_V1_ROUTER) - igmp_sendpkt(inm, IGMP_V2_LEAVE_GROUP, igmp_all_rtrs_group); + struct ifnet *ifp; + + lck_mtx_lock(&igmp_mtx); + + IGI_LOCK(igi); + VERIFY(!(igi->igi_debug & IFD_ATTACHED)); + ifp = igi->igi_ifp; + VERIFY(ifp != NULL); + igi_initvar(igi, ifp, 1); + igi->igi_debug |= IFD_ATTACHED; + IGI_ADDREF_LOCKED(igi); /* hold a reference for igi_head */ + IGI_UNLOCK(igi); + + LIST_INSERT_HEAD(&igi_head, igi, igi_link); + + lck_mtx_unlock(&igmp_mtx); + + IGMP_PRINTF(("reattached igmp_ifinfo for ifp %p(%s)\n", + ifp, ifp->if_name)); } +/* + * Hook for domifdetach. + */ void -igmp_fasttimo(void) +igmp_domifdetach(struct ifnet *ifp) { - struct in_multi *inm; - struct in_multistep step; + IGMP_PRINTF(("%s: called for ifp %p(%s%d)\n", + __func__, ifp, ifp->if_name, ifp->if_unit)); + + lck_mtx_lock(&igmp_mtx); + igi_delete(ifp); + lck_mtx_unlock(&igmp_mtx); +} + +/* + * Called at interface detach time. Note that we only flush all deferred + * responses and record releases; all remaining inm records and their source + * entries related to this interface are left intact, in order to handle + * the reattach case. + */ +static void +igi_delete(const struct ifnet *ifp) +{ + struct igmp_ifinfo *igi, *tigi; + + lck_mtx_assert(&igmp_mtx, LCK_MTX_ASSERT_OWNED); + + LIST_FOREACH_SAFE(igi, &igi_head, igi_link, tigi) { + IGI_LOCK(igi); + if (igi->igi_ifp == ifp) { + /* + * Free deferred General Query responses. 
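+ * Pending IGMPv2 reports queued on igi_v2q are drained as well, and
+ * release-pending in_multi records are let go via igmp_flush_relq().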
+ */ + IF_DRAIN(&igi->igi_gq); + IF_DRAIN(&igi->igi_v2q); + igmp_flush_relq(igi); + VERIFY(SLIST_EMPTY(&igi->igi_relinmhead)); + igi->igi_debug &= ~IFD_ATTACHED; + IGI_UNLOCK(igi); + + LIST_REMOVE(igi, igi_link); + IGI_REMREF(igi); /* release igi_head reference */ + return; + } + IGI_UNLOCK(igi); + } + panic("%s: igmp_ifinfo not found for ifp %p\n", __func__, ifp); +} + +static void +igi_initvar(struct igmp_ifinfo *igi, struct ifnet *ifp, int reattach) +{ + IGI_LOCK_ASSERT_HELD(igi); + + igi->igi_ifp = ifp; + igi->igi_version = igmp_default_version; + igi->igi_flags = 0; + igi->igi_rv = IGMP_RV_INIT; + igi->igi_qi = IGMP_QI_INIT; + igi->igi_qri = IGMP_QRI_INIT; + igi->igi_uri = IGMP_URI_INIT; + + /* ifnet is not yet attached; no need to hold ifnet lock */ + if (!(ifp->if_flags & IFF_MULTICAST)) + igi->igi_flags |= IGIF_SILENT; + + if (!reattach) + SLIST_INIT(&igi->igi_relinmhead); /* - * Quick check to see if any work needs to be done, in order - * to minimize the overhead of fasttimo processing. + * Responses to general queries are subject to bounds. */ + igi->igi_gq.ifq_maxlen = IGMP_MAX_RESPONSE_PACKETS; + igi->igi_v2q.ifq_maxlen = IGMP_MAX_RESPONSE_PACKETS; +} - if (!igmp_timers_are_running) - return; +static struct igmp_ifinfo * +igi_alloc(int how) +{ + struct igmp_ifinfo *igi; - igmp_timers_are_running = 0; - IN_FIRST_MULTI(step, inm); - while (inm != NULL) { - if (inm->inm_timer == 0) { - /* do nothing */ - } else if ((--inm->inm_timer == 0) && (inm->inm_rti != NULL)) { - igmp_sendpkt(inm, inm->inm_rti->rti_type, 0); - inm->inm_state = IGMP_IREPORTEDLAST; - } else { - igmp_timers_are_running = 1; - } - IN_NEXT_MULTI(step, inm); + igi = (how == M_WAITOK) ? zalloc(igi_zone) : zalloc_noblock(igi_zone); + if (igi != NULL) { + bzero(igi, igi_size); + lck_mtx_init(&igi->igi_lock, igmp_mtx_grp, igmp_mtx_attr); + igi->igi_debug |= IFD_ALLOC; + } + return (igi); +} + +static void +igi_free(struct igmp_ifinfo *igi) +{ + IGI_LOCK(igi); + if (igi->igi_debug & IFD_ATTACHED) { + panic("%s: attached igi=%p is being freed", __func__, igi); + /* NOTREACHED */ + } else if (igi->igi_ifp != NULL) { + panic("%s: ifp not NULL for igi=%p", __func__, igi); + /* NOTREACHED */ + } else if (!(igi->igi_debug & IFD_ALLOC)) { + panic("%s: igi %p cannot be freed", __func__, igi); + /* NOTREACHED */ + } else if (igi->igi_refcnt != 0) { + panic("%s: non-zero refcnt igi=%p", __func__, igi); + /* NOTREACHED */ } + igi->igi_debug &= ~IFD_ALLOC; + IGI_UNLOCK(igi); + + lck_mtx_destroy(&igi->igi_lock, igmp_mtx_grp); + zfree(igi_zone, igi); } void -igmp_slowtimo(void) +igi_addref(struct igmp_ifinfo *igi, int locked) { - struct router_info *rti = Head; + if (!locked) + IGI_LOCK_SPIN(igi); + else + IGI_LOCK_ASSERT_HELD(igi); -#if IGMP_DEBUG - printf("[igmp.c,_slowtimo] -- > entering \n"); -#endif - while (rti) { - if (rti->rti_type == IGMP_V1_ROUTER) { - rti->rti_time++; - if (rti->rti_time >= IGMP_AGE_THRESHOLD) { - rti->rti_type = IGMP_V2_ROUTER; - } - } - rti = rti->rti_next; + if (++igi->igi_refcnt == 0) { + panic("%s: igi=%p wraparound refcnt", __func__, igi); + /* NOTREACHED */ } -#if IGMP_DEBUG - printf("[igmp.c,_slowtimo] -- > exiting \n"); -#endif + if (!locked) + IGI_UNLOCK(igi); } -static void -igmp_sendpkt(struct in_multi *inm, int type, uint32_t addr) +void +igi_remref(struct igmp_ifinfo *igi) { - struct mbuf *m; - struct igmp *igmp; - struct ip *ip; - struct ip_moptions imo; - struct route ro; + struct ifnet *ifp; - MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */ - if (m == NULL) - return; + 
IGI_LOCK_SPIN(igi); - m->m_pkthdr.rcvif = lo_ifp; -#if CONFIG_MACF_NET - mac_mbuf_label_associate_linklayer(inm->inm_ifp, m); -#endif - m->m_pkthdr.len = sizeof(struct ip) + IGMP_MINLEN; - MH_ALIGN(m, IGMP_MINLEN + sizeof(struct ip)); - m->m_data += sizeof(struct ip); - m->m_len = IGMP_MINLEN; - m->m_pkthdr.csum_flags = 0; - m->m_pkthdr.csum_data = 0; - igmp = mtod(m, struct igmp *); - igmp->igmp_type = type; - igmp->igmp_code = 0; - igmp->igmp_group = inm->inm_addr; - igmp->igmp_cksum = 0; - igmp->igmp_cksum = in_cksum(m, IGMP_MINLEN); - - m->m_data -= sizeof(struct ip); - m->m_len += sizeof(struct ip); - ip = mtod(m, struct ip *); - ip->ip_tos = 0; - ip->ip_len = sizeof(struct ip) + IGMP_MINLEN; - ip->ip_off = 0; - ip->ip_p = IPPROTO_IGMP; - ip->ip_src.s_addr = INADDR_ANY; - ip->ip_dst.s_addr = addr ? addr : igmp->igmp_group.s_addr; - - imo.imo_multicast_ifp = inm->inm_ifp; - imo.imo_multicast_ttl = 1; - imo.imo_multicast_vif = -1; -#if MROUTING - /* - * Request loopback of the report if we are acting as a multicast - * router, so that the process-level routing demon can hear it. - */ - imo.imo_multicast_loop = (ip_mrouter != NULL); -#else - imo.imo_multicast_loop = 0; -#endif + if (igi->igi_refcnt == 0) { + panic("%s: igi=%p negative refcnt", __func__, igi); + /* NOTREACHED */ + } + + --igi->igi_refcnt; + if (igi->igi_refcnt > 0) { + IGI_UNLOCK(igi); + return; + } + + ifp = igi->igi_ifp; + igi->igi_ifp = NULL; + IF_DRAIN(&igi->igi_gq); + IF_DRAIN(&igi->igi_v2q); + igmp_flush_relq(igi); + VERIFY(SLIST_EMPTY(&igi->igi_relinmhead)); + IGI_UNLOCK(igi); + + IGMP_PRINTF(("%s: freeing igmp_ifinfo for ifp %p(%s%d)\n", + __func__, ifp, ifp->if_name, ifp->if_unit)); + + igi_free(igi); +} + +/* + * Process a received IGMPv1 query. + * Return non-zero if the message should be dropped. + */ +static int +igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip, + const struct igmp *igmp) +{ + struct igmp_ifinfo *igi; + struct in_multi *inm; + struct in_multistep step; /* - * XXX - * Do we have to worry about reentrancy here? Don't think so. + * IGMPv1 Host Membership Queries SHOULD always be addressed to + * 224.0.0.1. They are always treated as General Queries. + * igmp_group is always ignored. Do not drop it as a userland + * daemon may wish to see it. */ - bzero(&ro, sizeof (ro)); - (void) ip_output(m, router_alert, &ro, 0, &imo, NULL); - if (ro.ro_rt != NULL) { - rtfree(ro.ro_rt); - ro.ro_rt = NULL; + if (!in_allhosts(ip->ip_dst) || !in_nullhost(igmp->igmp_group)) { + IGMPSTAT_INC(igps_rcv_badqueries); + OIGMPSTAT_INC(igps_rcv_badqueries); + return (0); } + IGMPSTAT_INC(igps_rcv_gen_queries); - ++igmpstat.igps_snd_reports; -} + igi = IGMP_IFINFO(ifp); + VERIFY(igi != NULL); + + IGI_LOCK(igi); + if (igi->igi_flags & IGIF_LOOPBACK) { + IGMP_PRINTF(("ignore v1 query on IGIF_LOOPBACK ifp %p(%s%d)\n", + ifp, ifp->if_name, ifp->if_unit)); + IGI_UNLOCK(igi); + return (0); + } + /* + * Switch to IGMPv1 host compatibility mode. + */ + igmp_set_version(igi, IGMP_VERSION_1); + IGI_UNLOCK(igi); + IGMP_PRINTF(("process v1 query on ifp %p(%s%d)\n", ifp, ifp->if_name, + ifp->if_unit)); + + /* + * Start the timers in all of our group records + * for the interface on which the query arrived, + * except those which are already running. 
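+ * (Illustrative note: each timer armed below is IGMP_RANDOM_DELAY(
+ * IGMP_V1V2_MAX_RI * PR_SLOWHZ), i.e. a pseudo-random delay of up to
+ * IGMP_V1V2_MAX_RI seconds assuming PR_SLOWHZ ticks per second, which
+ * provides the report jitter required of v1 hosts.)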
+ */ + in_multihead_lock_shared(); + IN_FIRST_MULTI(step, inm); + while (inm != NULL) { + INM_LOCK(inm); + if (inm->inm_ifp != ifp) + goto next; + if (inm->inm_timer != 0) + goto next; + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + break; + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + case IGMP_REPORTING_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_SLEEPING_MEMBER: + case IGMP_AWAKENING_MEMBER: + inm->inm_state = IGMP_REPORTING_MEMBER; + inm->inm_timer = IGMP_RANDOM_DELAY( + IGMP_V1V2_MAX_RI * PR_SLOWHZ); + current_state_timers_running = 1; + break; + case IGMP_LEAVING_MEMBER: + break; + } +next: + INM_UNLOCK(inm); + IN_NEXT_MULTI(step, inm); + } + in_multihead_lock_done(); + + return (0); +} + +/* + * Process a received IGMPv2 general or group-specific query. + */ +static int +igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip, + const struct igmp *igmp) +{ + struct igmp_ifinfo *igi; + struct in_multi *inm; + int is_general_query; + uint16_t timer; + + is_general_query = 0; + + /* + * Validate address fields upfront. + */ + if (in_nullhost(igmp->igmp_group)) { + /* + * IGMPv2 General Query. + * If this was not sent to the all-hosts group, ignore it. + */ + if (!in_allhosts(ip->ip_dst)) + return (0); + IGMPSTAT_INC(igps_rcv_gen_queries); + is_general_query = 1; + } else { + /* IGMPv2 Group-Specific Query. */ + IGMPSTAT_INC(igps_rcv_group_queries); + } + + igi = IGMP_IFINFO(ifp); + VERIFY(igi != NULL); + + IGI_LOCK(igi); + if (igi->igi_flags & IGIF_LOOPBACK) { + IGMP_PRINTF(("ignore v2 query on IGIF_LOOPBACK ifp %p(%s%d)\n", + ifp, ifp->if_name, ifp->if_unit)); + IGI_UNLOCK(igi); + return(0); + } + /* + * Ignore v2 query if in v1 Compatibility Mode. + */ + if (igi->igi_version == IGMP_VERSION_1) { + IGI_UNLOCK(igi); + return (0); + } + igmp_set_version(igi, IGMP_VERSION_2); + IGI_UNLOCK(igi); + + timer = igmp->igmp_code * PR_SLOWHZ / IGMP_TIMER_SCALE; + if (timer == 0) + timer = 1; + + if (is_general_query) { + struct in_multistep step; + + IGMP_PRINTF(("process v2 general query on ifp %p(%s%d)\n", + ifp, ifp->if_name, ifp->if_unit)); + /* + * For each reporting group joined on this + * interface, kick the report timer. + */ + in_multihead_lock_shared(); + IN_FIRST_MULTI(step, inm); + while (inm != NULL) { + INM_LOCK(inm); + if (inm->inm_ifp == ifp) + igmp_v2_update_group(inm, timer); + INM_UNLOCK(inm); + IN_NEXT_MULTI(step, inm); + } + in_multihead_lock_done(); + } else { + /* + * Group-specific IGMPv2 query, we need only + * look up the single group to process it. + */ + in_multihead_lock_shared(); + IN_LOOKUP_MULTI(&igmp->igmp_group, ifp, inm); + in_multihead_lock_done(); + if (inm != NULL) { + INM_LOCK(inm); + IGMP_PRINTF(("process v2 query %s on ifp %p(%s%d)\n", + inet_ntoa(igmp->igmp_group), ifp, ifp->if_name, + ifp->if_unit)); + igmp_v2_update_group(inm, timer); + INM_UNLOCK(inm); + INM_REMREF(inm); /* from IN_LOOKUP_MULTI */ + } + } + + return (0); +} + +/* + * Update the report timer on a group in response to an IGMPv2 query. + * + * If we are becoming the reporting member for this group, start the timer. + * If we already are the reporting member for this group, and timer is + * below the threshold, reset it. + * + * We may be updating the group for the first time since we switched + * to IGMPv3. If we are, then we must clear any recorded source lists, + * and transition to REPORTING state; the group timer is overloaded + * for group and group-source query responses. 
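+ *
+ * Worked example (illustrative; assumes PR_SLOWHZ == 2 and
+ * IGMP_TIMER_SCALE == 10): a v2 query carrying igmp_code 100, i.e. a
+ * Max Response Time of 10 seconds, is converted by the caller to
+ * timer = 100 * PR_SLOWHZ / IGMP_TIMER_SCALE = 20 slow-timeout ticks.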
+ * + * Unlike IGMPv3, the delay per group should be jittered + * to avoid bursts of IGMPv2 reports. + */ +static void +igmp_v2_update_group(struct in_multi *inm, const int timer) +{ + + IGMP_PRINTF(("%s: %s/%s%d timer=%d\n", __func__, + inet_ntoa(inm->inm_addr), inm->inm_ifp->if_name, + inm->inm_ifp->if_unit, timer)); + + INM_LOCK_ASSERT_HELD(inm); + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + break; + case IGMP_REPORTING_MEMBER: + if (inm->inm_timer != 0 && + inm->inm_timer <= timer) { + IGMP_PRINTF(("%s: REPORTING and timer running, " + "skipping.\n", __func__)); + break; + } + /* FALLTHROUGH */ + case IGMP_SG_QUERY_PENDING_MEMBER: + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_AWAKENING_MEMBER: + IGMP_PRINTF(("%s: ->REPORTING\n", __func__)); + inm->inm_state = IGMP_REPORTING_MEMBER; + inm->inm_timer = IGMP_RANDOM_DELAY(timer); + current_state_timers_running = 1; + break; + case IGMP_SLEEPING_MEMBER: + IGMP_PRINTF(("%s: ->AWAKENING\n", __func__)); + inm->inm_state = IGMP_AWAKENING_MEMBER; + break; + case IGMP_LEAVING_MEMBER: + break; + } +} + +/* + * Process a received IGMPv3 general, group-specific or + * group-and-source-specific query. + * Assumes m has already been pulled up to the full IGMP message length. + * Return 0 if successful, otherwise an appropriate error code is returned. + */ +static int +igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip, + /*const*/ struct igmpv3 *igmpv3) +{ + struct igmp_ifinfo *igi; + struct in_multi *inm; + int is_general_query; + uint32_t maxresp, nsrc, qqi; + uint16_t timer; + uint8_t qrv; + + is_general_query = 0; + + IGMP_PRINTF(("process v3 query on ifp %p(%s%d)\n", ifp, ifp->if_name, + ifp->if_unit)); + + maxresp = igmpv3->igmp_code; /* in 1/10ths of a second */ + if (maxresp >= 128) { + maxresp = IGMP_MANT(igmpv3->igmp_code) << + (IGMP_EXP(igmpv3->igmp_code) + 3); + } + + /* + * Robustness must never be less than 2 for on-wire IGMPv3. + * FUTURE: Check if ifp has IGIF_LOOPBACK set, as we will make + * an exception for interfaces whose IGMPv3 state changes + * are redirected to loopback (e.g. MANET). + */ + qrv = IGMP_QRV(igmpv3->igmp_misc); + if (qrv < 2) { + IGMP_PRINTF(("%s: clamping qrv %d to %d\n", __func__, + qrv, IGMP_RV_INIT)); + qrv = IGMP_RV_INIT; + } + + qqi = igmpv3->igmp_qqi; + if (qqi >= 128) { + qqi = IGMP_MANT(igmpv3->igmp_qqi) << + (IGMP_EXP(igmpv3->igmp_qqi) + 3); + } + + timer = maxresp * PR_SLOWHZ / IGMP_TIMER_SCALE; + if (timer == 0) + timer = 1; + + nsrc = ntohs(igmpv3->igmp_numsrc); + + /* + * Validate address fields and versions upfront before + * accepting v3 query. + */ + if (in_nullhost(igmpv3->igmp_group)) { + /* + * IGMPv3 General Query. + * + * General Queries SHOULD be directed to 224.0.0.1. + * A general query with a source list has undefined + * behaviour; discard it. + */ + IGMPSTAT_INC(igps_rcv_gen_queries); + if (!in_allhosts(ip->ip_dst) || nsrc > 0) { + IGMPSTAT_INC(igps_rcv_badqueries); + OIGMPSTAT_INC(igps_rcv_badqueries); + return (0); + } + is_general_query = 1; + } else { + /* Group or group-source specific query. 
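+ * An empty source list (nsrc == 0) marks the former; a non-zero
+ * count marks the latter, and the two are counted separately below.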
*/ + if (nsrc == 0) + IGMPSTAT_INC(igps_rcv_group_queries); + else + IGMPSTAT_INC(igps_rcv_gsr_queries); + } + + igi = IGMP_IFINFO(ifp); + VERIFY(igi != NULL); + + IGI_LOCK(igi); + if (igi->igi_flags & IGIF_LOOPBACK) { + IGMP_PRINTF(("ignore v3 query on IGIF_LOOPBACK ifp %p(%s%d)\n", + ifp, ifp->if_name, ifp->if_unit)); + IGI_UNLOCK(igi); + return (0); + } + + /* + * Discard the v3 query if we're in Compatibility Mode. + * The RFC is not obviously worded that hosts need to stay in + * compatibility mode until the Old Version Querier Present + * timer expires. + */ + if (igi->igi_version != IGMP_VERSION_3) { + IGMP_PRINTF(("ignore v3 query in v%d mode on ifp %p(%s%d)\n", + igi->igi_version, ifp, ifp->if_name, ifp->if_unit)); + IGI_UNLOCK(igi); + return (0); + } + + igmp_set_version(igi, IGMP_VERSION_3); + igi->igi_rv = qrv; + igi->igi_qi = qqi; + igi->igi_qri = maxresp; + + + IGMP_PRINTF(("%s: qrv %d qi %d qri %d\n", __func__, qrv, qqi, + maxresp)); + + if (is_general_query) { + /* + * Schedule a current-state report on this ifp for + * all groups, possibly containing source lists. + * If there is a pending General Query response + * scheduled earlier than the selected delay, do + * not schedule any other reports. + * Otherwise, reset the interface timer. + */ + IGMP_PRINTF(("process v3 general query on ifp %p(%s%d)\n", + ifp, ifp->if_name, ifp->if_unit)); + if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) { + igi->igi_v3_timer = IGMP_RANDOM_DELAY(timer); + interface_timers_running = 1; + } + IGI_UNLOCK(igi); + } else { + IGI_UNLOCK(igi); + /* + * Group-source-specific queries are throttled on + * a per-group basis to defeat denial-of-service attempts. + * Queries for groups we are not a member of on this + * link are simply ignored. + */ + in_multihead_lock_shared(); + IN_LOOKUP_MULTI(&igmpv3->igmp_group, ifp, inm); + in_multihead_lock_done(); + if (inm == NULL) + return (0); + + INM_LOCK(inm); +#ifndef __APPLE__ + /* TODO: need ratecheck equivalent */ + if (nsrc > 0) { + if (!ratecheck(&inm->inm_lastgsrtv, + &igmp_gsrdelay)) { + IGMP_PRINTF(("%s: GS query throttled.\n", + __func__)); + IGMPSTAT_INC(igps_drop_gsr_queries); + INM_UNLOCK(inm); + INM_REMREF(inm); /* from IN_LOOKUP_MULTI */ + return (0); + } + } +#endif + IGMP_PRINTF(("process v3 %s query on ifp %p(%s%d)\n", + inet_ntoa(igmpv3->igmp_group), ifp, ifp->if_name, + ifp->if_unit)); + /* + * If there is a pending General Query response + * scheduled sooner than the selected delay, no + * further report need be scheduled. + * Otherwise, prepare to respond to the + * group-specific or group-and-source query. + */ + IGI_LOCK(igi); + if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) { + IGI_UNLOCK(igi); + igmp_input_v3_group_query(inm, timer, igmpv3); + } else { + IGI_UNLOCK(igi); + } + INM_UNLOCK(inm); + INM_REMREF(inm); /* from IN_LOOKUP_MULTI */ + } + + return (0); +} + +/* + * Process a received IGMPv3 group-specific or group-and-source-specific + * query. + * Return <0 if any error occurred. Currently this is ignored. 
+ */ +static int +igmp_input_v3_group_query(struct in_multi *inm, + int timer, /*const*/ struct igmpv3 *igmpv3) +{ + int retval; + uint16_t nsrc; + + INM_LOCK_ASSERT_HELD(inm); + + retval = 0; + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + case IGMP_SLEEPING_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_AWAKENING_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_LEAVING_MEMBER: + return (retval); + case IGMP_REPORTING_MEMBER: + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + break; + } + + nsrc = ntohs(igmpv3->igmp_numsrc); + + /* + * Deal with group-specific queries upfront. + * If any group query is already pending, purge any recorded + * source-list state if it exists, and schedule a query response + * for this group-specific query. + */ + if (nsrc == 0) { + if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || + inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) { + inm_clear_recorded(inm); + timer = min(inm->inm_timer, timer); + } + inm->inm_state = IGMP_G_QUERY_PENDING_MEMBER; + inm->inm_timer = IGMP_RANDOM_DELAY(timer); + current_state_timers_running = 1; + return (retval); + } + + /* + * Deal with the case where a group-and-source-specific query has + * been received but a group-specific query is already pending. + */ + if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER) { + timer = min(inm->inm_timer, timer); + inm->inm_timer = IGMP_RANDOM_DELAY(timer); + current_state_timers_running = 1; + return (retval); + } + + /* + * Finally, deal with the case where a group-and-source-specific + * query has been received, where a response to a previous g-s-r + * query exists, or none exists. + * In this case, we need to parse the source-list which the Querier + * has provided us with and check if we have any source list filter + * entries at T1 for these sources. If we do not, there is no need + * schedule a report and the query may be dropped. + * If we do, we must record them and schedule a current-state + * report for those sources. + * FIXME: Handling source lists larger than 1 mbuf requires that + * we pass the mbuf chain pointer down to this function, and use + * m_getptr() to walk the chain. + */ + if (inm->inm_nsrc > 0) { + const struct in_addr *ap; + int i, nrecorded; + + ap = (const struct in_addr *)(igmpv3 + 1); + nrecorded = 0; + for (i = 0; i < nsrc; i++, ap++) { + retval = inm_record_source(inm, ap->s_addr); + if (retval < 0) + break; + nrecorded += retval; + } + if (nrecorded > 0) { + IGMP_PRINTF(("%s: schedule response to SG query\n", + __func__)); + inm->inm_state = IGMP_SG_QUERY_PENDING_MEMBER; + inm->inm_timer = IGMP_RANDOM_DELAY(timer); + current_state_timers_running = 1; + } + } + + return (retval); +} + +/* + * Process a received IGMPv1 host membership report. + * + * NOTE: 0.0.0.0 workaround breaks const correctness. + */ +static int +igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip, + /*const*/ struct igmp *igmp) +{ + struct in_ifaddr *ia; + struct in_multi *inm; + + IGMPSTAT_INC(igps_rcv_reports); + OIGMPSTAT_INC(igps_rcv_reports); + + if (ifp->if_flags & IFF_LOOPBACK) + return (0); + + if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr) || + !in_hosteq(igmp->igmp_group, ip->ip_dst))) { + IGMPSTAT_INC(igps_rcv_badreports); + OIGMPSTAT_INC(igps_rcv_badreports); + return (EINVAL); + } + + /* + * RFC 3376, Section 4.2.13, 9.2, 9.3: + * Booting clients may use the source address 0.0.0.0. Some + * IGMP daemons may not know how to use IP_RECVIF to determine + * the interface upon which this message was received. 
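+	 * Such a report cannot be matched to its arrival interface by
+	 * source address alone, hence the kludge that follows.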
+ * Replace 0.0.0.0 with the subnet address if told to do so. + */ + if (igmp_recvifkludge && in_nullhost(ip->ip_src)) { + IFP_TO_IA(ifp, ia); + if (ia != NULL) { + IFA_LOCK(&ia->ia_ifa); + ip->ip_src.s_addr = htonl(ia->ia_subnet); + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); + } + } + + IGMP_PRINTF(("process v1 report %s on ifp %p(%s%d)\n", + inet_ntoa(igmp->igmp_group), ifp, ifp->if_name, ifp->if_unit)); + + /* + * IGMPv1 report suppression. + * If we are a member of this group, and our membership should be + * reported, stop our group timer and transition to the 'lazy' state. + */ + in_multihead_lock_shared(); + IN_LOOKUP_MULTI(&igmp->igmp_group, ifp, inm); + in_multihead_lock_done(); + if (inm != NULL) { + struct igmp_ifinfo *igi; + + INM_LOCK(inm); + + igi = inm->inm_igi; + VERIFY(igi != NULL); + + IGMPSTAT_INC(igps_rcv_ourreports); + OIGMPSTAT_INC(igps_rcv_ourreports); + + /* + * If we are in IGMPv3 host mode, do not allow the + * other host's IGMPv1 report to suppress our reports + * unless explicitly configured to do so. + */ + IGI_LOCK(igi); + if (igi->igi_version == IGMP_VERSION_3) { + if (igmp_legacysupp) + igmp_v3_suppress_group_record(inm); + IGI_UNLOCK(igi); + INM_UNLOCK(inm); + INM_REMREF(inm); /* from IN_LOOKUP_MULTI */ + return (0); + } + + INM_LOCK_ASSERT_HELD(inm); + inm->inm_timer = 0; + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + break; + case IGMP_IDLE_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_AWAKENING_MEMBER: + IGMP_PRINTF(("report suppressed for %s on ifp %p(%s%d)\n", + inet_ntoa(igmp->igmp_group), ifp, ifp->if_name, + ifp->if_unit)); + case IGMP_SLEEPING_MEMBER: + inm->inm_state = IGMP_SLEEPING_MEMBER; + break; + case IGMP_REPORTING_MEMBER: + IGMP_PRINTF(("report suppressed for %s on ifp %p(%s%d)\n", + inet_ntoa(igmp->igmp_group), ifp, ifp->if_name, + ifp->if_unit)); + if (igi->igi_version == IGMP_VERSION_1) + inm->inm_state = IGMP_LAZY_MEMBER; + else if (igi->igi_version == IGMP_VERSION_2) + inm->inm_state = IGMP_SLEEPING_MEMBER; + break; + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + case IGMP_LEAVING_MEMBER: + break; + } + IGI_UNLOCK(igi); + INM_UNLOCK(inm); + INM_REMREF(inm); /* from IN_LOOKUP_MULTI */ + } + + return (0); +} + +/* + * Process a received IGMPv2 host membership report. + * + * NOTE: 0.0.0.0 workaround breaks const correctness. + */ +static int +igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip, + /*const*/ struct igmp *igmp) +{ + struct in_ifaddr *ia; + struct in_multi *inm; + + /* + * Make sure we don't hear our own membership report. Fast + * leave requires knowing that we are the only member of a + * group. + */ + IFP_TO_IA(ifp, ia); + if (ia != NULL) { + IFA_LOCK(&ia->ia_ifa); + if (in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) { + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); + return (0); + } + IFA_UNLOCK(&ia->ia_ifa); + } + + IGMPSTAT_INC(igps_rcv_reports); + OIGMPSTAT_INC(igps_rcv_reports); + + if (ifp->if_flags & IFF_LOOPBACK) { + if (ia != NULL) + IFA_REMREF(&ia->ia_ifa); + return (0); + } + + if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) || + !in_hosteq(igmp->igmp_group, ip->ip_dst)) { + if (ia != NULL) + IFA_REMREF(&ia->ia_ifa); + IGMPSTAT_INC(igps_rcv_badreports); + OIGMPSTAT_INC(igps_rcv_badreports); + return (EINVAL); + } + + /* + * RFC 3376, Section 4.2.13, 9.2, 9.3: + * Booting clients may use the source address 0.0.0.0. 
+	 * Some IGMP daemons may not know how to use IP_RECVIF to determine
+	 * the interface upon which this message was received.
+	 * Replace 0.0.0.0 with the subnet address if told to do so.
+	 */
+	if (igmp_recvifkludge && in_nullhost(ip->ip_src)) {
+		if (ia != NULL) {
+			IFA_LOCK(&ia->ia_ifa);
+			ip->ip_src.s_addr = htonl(ia->ia_subnet);
+			IFA_UNLOCK(&ia->ia_ifa);
+		}
+	}
+	if (ia != NULL)
+		IFA_REMREF(&ia->ia_ifa);
+
+	IGMP_PRINTF(("process v2 report %s on ifp %p(%s%d)\n",
+	    inet_ntoa(igmp->igmp_group), ifp, ifp->if_name, ifp->if_unit));
+
+	/*
+	 * IGMPv2 report suppression.
+	 * If we are a member of this group, and our membership should be
+	 * reported, and our group timer is pending or about to be reset,
+	 * stop our group timer by transitioning to the 'lazy' state.
+	 */
+	in_multihead_lock_shared();
+	IN_LOOKUP_MULTI(&igmp->igmp_group, ifp, inm);
+	in_multihead_lock_done();
+	if (inm != NULL) {
+		struct igmp_ifinfo *igi;
+
+		INM_LOCK(inm);
+		igi = inm->inm_igi;
+		VERIFY(igi != NULL);
+
+		IGMPSTAT_INC(igps_rcv_ourreports);
+		OIGMPSTAT_INC(igps_rcv_ourreports);
+
+		/*
+		 * If we are in IGMPv3 host mode, do not allow the
+		 * other host's IGMPv2 report to suppress our reports
+		 * unless explicitly configured to do so.
+		 */
+		IGI_LOCK(igi);
+		if (igi->igi_version == IGMP_VERSION_3) {
+			if (igmp_legacysupp)
+				igmp_v3_suppress_group_record(inm);
+			IGI_UNLOCK(igi);
+			INM_UNLOCK(inm);
+			INM_REMREF(inm);
+			return (0);
+		}
+
+		inm->inm_timer = 0;
+
+		switch (inm->inm_state) {
+		case IGMP_NOT_MEMBER:
+		case IGMP_SILENT_MEMBER:
+		case IGMP_SLEEPING_MEMBER:
+			break;
+		case IGMP_REPORTING_MEMBER:
+		case IGMP_IDLE_MEMBER:
+		case IGMP_AWAKENING_MEMBER:
+			IGMP_PRINTF(("report suppressed for %s on ifp %p(%s%d)\n",
+			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_name,
+			    ifp->if_unit));
+			/* FALLTHROUGH */
+		case IGMP_LAZY_MEMBER:
+			inm->inm_state = IGMP_LAZY_MEMBER;
+			break;
+		case IGMP_G_QUERY_PENDING_MEMBER:
+		case IGMP_SG_QUERY_PENDING_MEMBER:
+		case IGMP_LEAVING_MEMBER:
+			break;
+		}
+		IGI_UNLOCK(igi);
+		INM_UNLOCK(inm);
+		INM_REMREF(inm);
+	}
+
+	return (0);
+}
+
+void
+igmp_input(struct mbuf *m, int off)
+{
+	int iphlen;
+	struct ifnet *ifp;
+	struct igmp *igmp;
+	struct ip *ip;
+	int igmplen;
+	int minlen;
+	int queryver;
+
+	IGMP_PRINTF(("%s: called w/mbuf (%p,%d)\n", __func__, m, off));
+
+	ifp = m->m_pkthdr.rcvif;
+
+	IGMPSTAT_INC(igps_rcv_total);
+	OIGMPSTAT_INC(igps_rcv_total);
+
+	ip = mtod(m, struct ip *);
+	iphlen = off;
+
+	/* By now, ip_len no longer contains the length of the IP header */
+	igmplen = ip->ip_len;
+
+	/*
+	 * Validate lengths.
+	 */
+	if (igmplen < IGMP_MINLEN) {
+		IGMPSTAT_INC(igps_rcv_tooshort);
+		OIGMPSTAT_INC(igps_rcv_tooshort);
+		m_freem(m);
+		return;
+	}
+
+	/*
+	 * Always pullup to the minimum size for v1/v2 or v3
+	 * to amortize calls to m_pulldown().
+	 */
+	if (igmplen >= IGMP_V3_QUERY_MINLEN)
+		minlen = IGMP_V3_QUERY_MINLEN;
+	else
+		minlen = IGMP_MINLEN;
+
+	M_STRUCT_GET(igmp, struct igmp *, m, off, minlen);
+	if (igmp == NULL) {
+		IGMPSTAT_INC(igps_rcv_tooshort);
+		OIGMPSTAT_INC(igps_rcv_tooshort);
+		return;
+	}
+
+	/*
+	 * Validate checksum.
+	 */
+	m->m_data += iphlen;
+	m->m_len -= iphlen;
+	if (in_cksum(m, igmplen)) {
+		IGMPSTAT_INC(igps_rcv_badsum);
+		OIGMPSTAT_INC(igps_rcv_badsum);
+		m_freem(m);
+		return;
+	}
+	m->m_data -= iphlen;
+	m->m_len += iphlen;
+
+	/*
+	 * IGMP control traffic is link-scope, and must have a TTL of 1.
+	 * DVMRP traffic (e.g. mrinfo, mtrace) is an exception;
+	 * probe packets may come from beyond the LAN.
+	 */
+	if (igmp->igmp_type != IGMP_DVMRP && ip->ip_ttl != 1) {
+		IGMPSTAT_INC(igps_rcv_badttl);
+		m_freem(m);
+		return;
+	}
+
+	switch (igmp->igmp_type) {
+	case IGMP_HOST_MEMBERSHIP_QUERY:
+		if (igmplen == IGMP_MINLEN) {
+			if (igmp->igmp_code == 0)
+				queryver = IGMP_VERSION_1;
+			else
+				queryver = IGMP_VERSION_2;
+		} else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
+			queryver = IGMP_VERSION_3;
+		} else {
+			IGMPSTAT_INC(igps_rcv_tooshort);
+			OIGMPSTAT_INC(igps_rcv_tooshort);
+			m_freem(m);
+			return;
+		}
+
+		OIGMPSTAT_INC(igps_rcv_queries);
+
+		switch (queryver) {
+		case IGMP_VERSION_1:
+			IGMPSTAT_INC(igps_rcv_v1v2_queries);
+			if (!igmp_v1enable)
+				break;
+			if (igmp_input_v1_query(ifp, ip, igmp) != 0) {
+				m_freem(m);
+				return;
+			}
+			break;
+
+		case IGMP_VERSION_2:
+			IGMPSTAT_INC(igps_rcv_v1v2_queries);
+			if (!igmp_v2enable)
+				break;
+			if (igmp_input_v2_query(ifp, ip, igmp) != 0) {
+				m_freem(m);
+				return;
+			}
+			break;
+
+		case IGMP_VERSION_3: {
+			struct igmpv3 *igmpv3;
+			uint16_t igmpv3len;
+			uint16_t srclen;
+			int nsrc;
+
+			IGMPSTAT_INC(igps_rcv_v3_queries);
+			igmpv3 = (struct igmpv3 *)igmp;
+			/*
+			 * Validate length based on source count.
+			 */
+			nsrc = ntohs(igmpv3->igmp_numsrc);
+			srclen = sizeof(struct in_addr) * nsrc;
+			if (igmplen < (IGMP_V3_QUERY_MINLEN + srclen)) {
+				IGMPSTAT_INC(igps_rcv_tooshort);
+				OIGMPSTAT_INC(igps_rcv_tooshort);
+				m_freem(m);
+				return;
+			}
+			igmpv3len = IGMP_V3_QUERY_MINLEN + srclen;
+			M_STRUCT_GET(igmpv3, struct igmpv3 *, m,
+			    off, igmpv3len);
+			if (igmpv3 == NULL) {
+				IGMPSTAT_INC(igps_rcv_tooshort);
+				OIGMPSTAT_INC(igps_rcv_tooshort);
+				return;
+			}
+			if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) {
+				m_freem(m);
+				return;
+			}
+		}
+			break;
+		}
+		break;
+
+	case IGMP_v1_HOST_MEMBERSHIP_REPORT:
+		if (!igmp_v1enable)
+			break;
+		if (igmp_input_v1_report(ifp, ip, igmp) != 0) {
+			m_freem(m);
+			return;
+		}
+		break;
+
+	case IGMP_v2_HOST_MEMBERSHIP_REPORT:
+		if (!igmp_v2enable)
+			break;
+#ifndef __APPLE__
+		if (!ip_checkrouteralert(m))
+			IGMPSTAT_INC(igps_rcv_nora);
+#endif
+		if (igmp_input_v2_report(ifp, ip, igmp) != 0) {
+			m_freem(m);
+			return;
+		}
+		break;
+
+	case IGMP_v3_HOST_MEMBERSHIP_REPORT:
+		/*
+		 * Hosts do not need to process IGMPv3 membership reports,
+		 * as report suppression is no longer required.
+		 */
+#ifndef __APPLE__
+		if (!ip_checkrouteralert(m))
+			IGMPSTAT_INC(igps_rcv_nora);
+#endif
+		break;
+
+	default:
+		break;
+	}
+
+	lck_mtx_assert(&igmp_mtx, LCK_MTX_ASSERT_NOTOWNED);
+	/*
+	 * Pass all valid IGMP packets up to any process(es) listening on a
+	 * raw IGMP socket.
+	 */
+	rip_input(m, off);
+}
+
+/*
+ * IGMP slowtimo handler.
+ * Combines both the slow and fast timers into one. We lose some
+ * responsiveness, but it allows the system to avoid having a
+ * pr_fasttimo, which in turn allows for power savings.
+ */
+void
+igmp_slowtimo(void)
+{
+	struct ifqueue scq;	/* State-change packets */
+	struct ifqueue qrq;	/* Query response packets */
+	struct ifnet *ifp;
+	struct igmp_ifinfo *igi;
+	struct in_multi *inm;
+	int loop = 0, uri_fasthz = 0;
+
+	lck_mtx_lock(&igmp_mtx);
+
+	LIST_FOREACH(igi, &igi_head, igi_link) {
+		IGI_LOCK(igi);
+		igmp_v1v2_process_querier_timers(igi);
+		IGI_UNLOCK(igi);
+	}
+
+	/*
+	 * NOTE: previously handled by fasttimo
+	 *
+	 * Quick check to see if any work needs to be done, in order to
+	 * minimize the overhead of fasttimo processing.
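+	 * (With PR_SLOWHZ == 2 this callout runs roughly every 500ms;
+	 * the three flags below let an idle pass bail out cheaply.)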
+ */ + if (!current_state_timers_running && + !interface_timers_running && + !state_change_timers_running) { + lck_mtx_unlock(&igmp_mtx); + return; + } + + /* + * IGMPv3 General Query response timer processing. + */ + if (interface_timers_running) { + interface_timers_running = 0; + LIST_FOREACH(igi, &igi_head, igi_link) { + IGI_LOCK(igi); + if (igi->igi_v3_timer == 0) { + /* Do nothing. */ + } else if (--igi->igi_v3_timer == 0) { + igmp_v3_dispatch_general_query(igi); + } else { + interface_timers_running = 1; + } + IGI_UNLOCK(igi); + } + } + + if (!current_state_timers_running && + !state_change_timers_running) + goto out_locked; + + current_state_timers_running = 0; + state_change_timers_running = 0; + + memset(&qrq, 0, sizeof(struct ifqueue)); + qrq.ifq_maxlen = IGMP_MAX_G_GS_PACKETS; + + memset(&scq, 0, sizeof(struct ifqueue)); + scq.ifq_maxlen = IGMP_MAX_STATE_CHANGE_PACKETS; + + /* + * IGMPv1/v2/v3 host report and state-change timer processing. + * Note: Processing a v3 group timer may remove a node. + */ + LIST_FOREACH(igi, &igi_head, igi_link) { + struct in_multistep step; + + IGI_LOCK(igi); + ifp = igi->igi_ifp; + loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0; + uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri * PR_SLOWHZ); + IGI_UNLOCK(igi); + + in_multihead_lock_shared(); + IN_FIRST_MULTI(step, inm); + while (inm != NULL) { + INM_LOCK(inm); + if (inm->inm_ifp != ifp) + goto next; + + IGI_LOCK(igi); + switch (igi->igi_version) { + case IGMP_VERSION_1: + case IGMP_VERSION_2: + igmp_v1v2_process_group_timer(inm, + igi->igi_version); + break; + case IGMP_VERSION_3: + igmp_v3_process_group_timers(igi, &qrq, + &scq, inm, uri_fasthz); + break; + } + IGI_UNLOCK(igi); +next: + INM_UNLOCK(inm); + IN_NEXT_MULTI(step, inm); + } + in_multihead_lock_done(); + + IGI_LOCK(igi); + if (igi->igi_version == IGMP_VERSION_1 || + igi->igi_version == IGMP_VERSION_2) { + igmp_dispatch_queue(igi, &igi->igi_v2q, 0, loop, ifp); + } else if (igi->igi_version == IGMP_VERSION_3) { + IGI_UNLOCK(igi); + igmp_dispatch_queue(NULL, &qrq, 0, loop, ifp); + igmp_dispatch_queue(NULL, &scq, 0, loop, ifp); + VERIFY(qrq.ifq_len == 0); + VERIFY(scq.ifq_len == 0); + IGI_LOCK(igi); + } + /* + * In case there are still any pending membership reports + * which didn't get drained at version change time. + */ + IF_DRAIN(&igi->igi_v2q); + /* + * Release all deferred inm records, and drain any locally + * enqueued packets; do it even if the current IGMP version + * for the link is no longer IGMPv3, in order to handle the + * version change case. + */ + igmp_flush_relq(igi); + VERIFY(SLIST_EMPTY(&igi->igi_relinmhead)); + IGI_UNLOCK(igi); + + IF_DRAIN(&qrq); + IF_DRAIN(&scq); + } + +out_locked: + lck_mtx_unlock(&igmp_mtx); +} + +/* + * Free the in_multi reference(s) for this IGMP lifecycle. + * + * Caller must be holding igi_lock. 
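+ *
+ * The list is drained one entry at a time because igi_lock must be
+ * dropped around in_multi_detach(); the head is re-read after
+ * relocking rather than using SLIST_FOREACH().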
+ */ +static void +igmp_flush_relq(struct igmp_ifinfo *igi) +{ + struct in_multi *inm; + +again: + IGI_LOCK_ASSERT_HELD(igi); + inm = SLIST_FIRST(&igi->igi_relinmhead); + if (inm != NULL) { + int lastref; + + SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele); + IGI_UNLOCK(igi); + + in_multihead_lock_exclusive(); + INM_LOCK(inm); + VERIFY(inm->inm_nrelecnt != 0); + inm->inm_nrelecnt--; + lastref = in_multi_detach(inm); + VERIFY(!lastref || (!(inm->inm_debug & IFD_ATTACHED) && + inm->inm_reqcnt == 0)); + INM_UNLOCK(inm); + in_multihead_lock_done(); + /* from igi_relinmhead */ + INM_REMREF(inm); + /* from in_multihead list */ + if (lastref) + INM_REMREF(inm); + + IGI_LOCK(igi); + goto again; + } +} + +/* + * Update host report group timer for IGMPv1/v2. + * Will update the global pending timer flags. + */ +static void +igmp_v1v2_process_group_timer(struct in_multi *inm, const int igmp_version) +{ + int report_timer_expired; + + INM_LOCK_ASSERT_HELD(inm); + IGI_LOCK_ASSERT_HELD(inm->inm_igi); + + if (inm->inm_timer == 0) { + report_timer_expired = 0; + } else if (--inm->inm_timer == 0) { + report_timer_expired = 1; + } else { + current_state_timers_running = 1; + return; + } + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_SLEEPING_MEMBER: + case IGMP_AWAKENING_MEMBER: + break; + case IGMP_REPORTING_MEMBER: + if (report_timer_expired) { + inm->inm_state = IGMP_IDLE_MEMBER; + (void) igmp_v1v2_queue_report(inm, + (igmp_version == IGMP_VERSION_2) ? + IGMP_v2_HOST_MEMBERSHIP_REPORT : + IGMP_v1_HOST_MEMBERSHIP_REPORT); + INM_LOCK_ASSERT_HELD(inm); + IGI_LOCK_ASSERT_HELD(inm->inm_igi); + } + break; + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + case IGMP_LEAVING_MEMBER: + break; + } +} + +/* + * Update a group's timers for IGMPv3. + * Will update the global pending timer flags. + * Note: Unlocked read from igi. + */ +static void +igmp_v3_process_group_timers(struct igmp_ifinfo *igi, + struct ifqueue *qrq, struct ifqueue *scq, + struct in_multi *inm, const int uri_fasthz) +{ + int query_response_timer_expired; + int state_change_retransmit_timer_expired; + + INM_LOCK_ASSERT_HELD(inm); + IGI_LOCK_ASSERT_HELD(igi); + VERIFY(igi == inm->inm_igi); + + query_response_timer_expired = 0; + state_change_retransmit_timer_expired = 0; + + /* + * During a transition from v1/v2 compatibility mode back to v3, + * a group record in REPORTING state may still have its group + * timer active. This is a no-op in this function; it is easier + * to deal with it here than to complicate the slow-timeout path. + */ + if (inm->inm_timer == 0) { + query_response_timer_expired = 0; + } else if (--inm->inm_timer == 0) { + query_response_timer_expired = 1; + } else { + current_state_timers_running = 1; + } + + if (inm->inm_sctimer == 0) { + state_change_retransmit_timer_expired = 0; + } else if (--inm->inm_sctimer == 0) { + state_change_retransmit_timer_expired = 1; + } else { + state_change_timers_running = 1; + } + + /* We are in fasttimo, so be quick about it. 
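+	 * (In this kernel the fast timer work is folded into
+	 * igmp_slowtimo(), as described above.)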
+	 */
+	if (!state_change_retransmit_timer_expired &&
+	    !query_response_timer_expired)
+		return;
+
+	switch (inm->inm_state) {
+	case IGMP_NOT_MEMBER:
+	case IGMP_SILENT_MEMBER:
+	case IGMP_SLEEPING_MEMBER:
+	case IGMP_LAZY_MEMBER:
+	case IGMP_AWAKENING_MEMBER:
+	case IGMP_IDLE_MEMBER:
+		break;
+	case IGMP_G_QUERY_PENDING_MEMBER:
+	case IGMP_SG_QUERY_PENDING_MEMBER:
+		/*
+		 * Respond to a previously pending Group-Specific
+		 * or Group-and-Source-Specific query by enqueueing
+		 * the appropriate Current-State report for
+		 * immediate transmission.
+		 */
+		if (query_response_timer_expired) {
+			int retval;
+
+			retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1,
+			    (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER));
+			IGMP_PRINTF(("%s: enqueue record = %d\n",
+			    __func__, retval));
+			inm->inm_state = IGMP_REPORTING_MEMBER;
+			/* XXX Clear recorded sources for next time. */
+			inm_clear_recorded(inm);
+		}
+		/* FALLTHROUGH */
+	case IGMP_REPORTING_MEMBER:
+	case IGMP_LEAVING_MEMBER:
+		if (state_change_retransmit_timer_expired) {
+			/*
+			 * State-change retransmission timer fired.
+			 * If there are any further pending retransmissions,
+			 * set the global pending state-change flag, and
+			 * reset the timer.
+			 */
+			if (--inm->inm_scrv > 0) {
+				inm->inm_sctimer = uri_fasthz;
+				state_change_timers_running = 1;
+			}
+			/*
+			 * Retransmit the previously computed state-change
+			 * report. If there are no further pending
+			 * retransmissions, the mbuf queue will be consumed.
+			 * Update T0 state to T1 as we have now sent
+			 * a state-change.
+			 */
+			(void) igmp_v3_merge_state_changes(inm, scq);
+
+			inm_commit(inm);
+			IGMP_PRINTF(("%s: T1 -> T0 for %s/%s%d\n", __func__,
+			    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_name,
+			    inm->inm_ifp->if_unit));
+
+			/*
+			 * If we are leaving the group for good, make sure
+			 * we release IGMP's reference to it.
+			 * This release must be deferred using a SLIST,
+			 * as we are called from a loop which traverses
+			 * the in_multihead list.
+			 */
+			if (inm->inm_state == IGMP_LEAVING_MEMBER &&
+			    inm->inm_scrv == 0) {
+				inm->inm_state = IGMP_NOT_MEMBER;
+				/*
+				 * A reference has already been held in
+				 * igmp_final_leave() for this inm, so
+				 * no need to hold another one. We also
+				 * bumped up its request count then, so
+				 * that it stays in in_multihead. Both
+				 * of them will be released when it is
+				 * dequeued later on.
+				 */
+				VERIFY(inm->inm_nrelecnt != 0);
+				SLIST_INSERT_HEAD(&igi->igi_relinmhead,
+				    inm, inm_nrele);
+			}
+		}
+		break;
+	}
+}
+
+/*
+ * Suppress a group's pending response to a group or source/group query.
+ *
+ * Do NOT suppress state changes. This leads to IGMPv3 inconsistency.
+ * Do NOT update ST1/ST0 as this operation merely suppresses
+ * the currently pending group record.
+ * Do NOT suppress the response to a general query. It is possible but
+ * it would require adding another state or flag.
+ */
+static void
+igmp_v3_suppress_group_record(struct in_multi *inm)
+{
+
+	INM_LOCK_ASSERT_HELD(inm);
+	IGI_LOCK_ASSERT_HELD(inm->inm_igi);
+
+	VERIFY(inm->inm_igi->igi_version == IGMP_VERSION_3);
+
+	if (inm->inm_state != IGMP_G_QUERY_PENDING_MEMBER &&
+	    inm->inm_state != IGMP_SG_QUERY_PENDING_MEMBER)
+		return;
+
+	if (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)
+		inm_clear_recorded(inm);
+
+	inm->inm_timer = 0;
+	inm->inm_state = IGMP_REPORTING_MEMBER;
+}
+
+/*
+ * Switch to a different IGMP version on the given interface,
+ * as per Section 7.2.1.
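+ * (Per RFC 3376 Section 8.12, with the default RV=2, QI=125s and
+ * QRI=10s, the Older Version Querier Present timeout computed below
+ * works out to 2 * 125 + 10 = 260 seconds before reverting to v3.)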
+ */ +static void +igmp_set_version(struct igmp_ifinfo *igi, const int igmp_version) +{ + int old_version_timer; + + IGI_LOCK_ASSERT_HELD(igi); + + IGMP_PRINTF(("%s: switching to v%d on ifp %p(%s%d)\n", __func__, + igmp_version, igi->igi_ifp, igi->igi_ifp->if_name, + igi->igi_ifp->if_unit)); + + if (igmp_version == IGMP_VERSION_1 || igmp_version == IGMP_VERSION_2) { + /* + * Compute the "Older Version Querier Present" timer as per + * Section 8.12. + */ + old_version_timer = igi->igi_rv * igi->igi_qi + igi->igi_qri; + old_version_timer *= PR_SLOWHZ; + + if (igmp_version == IGMP_VERSION_1) { + igi->igi_v1_timer = old_version_timer; + igi->igi_v2_timer = 0; + } else if (igmp_version == IGMP_VERSION_2) { + igi->igi_v1_timer = 0; + igi->igi_v2_timer = old_version_timer; + } + } + + if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { + if (igi->igi_version != IGMP_VERSION_2) { + igi->igi_version = IGMP_VERSION_2; + igmp_v3_cancel_link_timers(igi); + } + } else if (igi->igi_v1_timer > 0) { + if (igi->igi_version != IGMP_VERSION_1) { + igi->igi_version = IGMP_VERSION_1; + igmp_v3_cancel_link_timers(igi); + } + } + + IGI_LOCK_ASSERT_HELD(igi); +} + +/* + * Cancel pending IGMPv3 timers for the given link and all groups + * joined on it; state-change, general-query, and group-query timers. + * + * Only ever called on a transition from v3 to Compatibility mode. Kill + * the timers stone dead (this may be expensive for large N groups), they + * will be restarted if Compatibility Mode deems that they must be due to + * query processing. + */ +static void +igmp_v3_cancel_link_timers(struct igmp_ifinfo *igi) +{ + struct ifnet *ifp; + struct in_multi *inm; + struct in_multistep step; + + IGI_LOCK_ASSERT_HELD(igi); + + IGMP_PRINTF(("%s: cancel v3 timers on ifp %p(%s%d)\n", __func__, + igi->igi_ifp, igi->igi_ifp->if_name, igi->igi_ifp->if_unit)); + + /* + * Stop the v3 General Query Response on this link stone dead. + * If fasttimo is woken up due to interface_timers_running, + * the flag will be cleared if there are no pending link timers. + */ + igi->igi_v3_timer = 0; + + /* + * Now clear the current-state and state-change report timers + * for all memberships scoped to this link. + */ + ifp = igi->igi_ifp; + IGI_UNLOCK(igi); + + in_multihead_lock_shared(); + IN_FIRST_MULTI(step, inm); + while (inm != NULL) { + INM_LOCK(inm); + if (inm->inm_ifp != ifp) + goto next; + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_LAZY_MEMBER: + case IGMP_SLEEPING_MEMBER: + case IGMP_AWAKENING_MEMBER: + /* + * These states are either not relevant in v3 mode, + * or are unreported. Do nothing. + */ + break; + case IGMP_LEAVING_MEMBER: + /* + * If we are leaving the group and switching to + * compatibility mode, we need to release the final + * reference held for issuing the INCLUDE {}, and + * transition to REPORTING to ensure the host leave + * message is sent upstream to the old querier -- + * transition to NOT would lose the leave and race. + * During igmp_final_leave(), we bumped up both the + * request and reference counts. Since we cannot + * call in_multi_detach() here, defer this task to + * the timer routine. 
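+			 * The deferred release is performed by
+			 * igmp_flush_relq() on a later slowtimo pass.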
+ */ + VERIFY(inm->inm_nrelecnt != 0); + IGI_LOCK(igi); + SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele); + IGI_UNLOCK(igi); + /* FALLTHROUGH */ + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + inm_clear_recorded(inm); + /* FALLTHROUGH */ + case IGMP_REPORTING_MEMBER: + inm->inm_state = IGMP_REPORTING_MEMBER; + break; + } + /* + * Always clear state-change and group report timers. + * Free any pending IGMPv3 state-change records. + */ + inm->inm_sctimer = 0; + inm->inm_timer = 0; + IF_DRAIN(&inm->inm_scq); +next: + INM_UNLOCK(inm); + IN_NEXT_MULTI(step, inm); + } + in_multihead_lock_done(); + + IGI_LOCK(igi); +} + +/* + * Update the Older Version Querier Present timers for a link. + * See Section 7.2.1 of RFC 3376. + */ +static void +igmp_v1v2_process_querier_timers(struct igmp_ifinfo *igi) +{ + IGI_LOCK_ASSERT_HELD(igi); + + if (igi->igi_v1_timer == 0 && igi->igi_v2_timer == 0) { + /* + * IGMPv1 and IGMPv2 Querier Present timers expired. + * + * Revert to IGMPv3. + */ + if (igi->igi_version != IGMP_VERSION_3) { + IGMP_PRINTF(("%s: transition from v%d -> v%d on %p(%s%d)\n", + __func__, igi->igi_version, IGMP_VERSION_3, + igi->igi_ifp, igi->igi_ifp->if_name, + igi->igi_ifp->if_unit)); + igi->igi_version = IGMP_VERSION_3; + IF_DRAIN(&igi->igi_v2q); + } + } else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) { + /* + * IGMPv1 Querier Present timer expired, + * IGMPv2 Querier Present timer running. + * If IGMPv2 was disabled since last timeout, + * revert to IGMPv3. + * If IGMPv2 is enabled, revert to IGMPv2. + */ + if (!igmp_v2enable) { + IGMP_PRINTF(("%s: transition from v%d -> v%d on %p(%s%d)\n", + __func__, igi->igi_version, IGMP_VERSION_3, + igi->igi_ifp, igi->igi_ifp->if_name, + igi->igi_ifp->if_unit)); + igi->igi_v2_timer = 0; + igi->igi_version = IGMP_VERSION_3; + IF_DRAIN(&igi->igi_v2q); + } else { + --igi->igi_v2_timer; + if (igi->igi_version != IGMP_VERSION_2) { + IGMP_PRINTF(("%s: transition from v%d -> v%d on %p(%s%d)\n", + __func__, igi->igi_version, IGMP_VERSION_2, + igi->igi_ifp, igi->igi_ifp->if_name, + igi->igi_ifp->if_unit)); + igi->igi_version = IGMP_VERSION_2; + IF_DRAIN(&igi->igi_gq); + } + } + } else if (igi->igi_v1_timer > 0) { + /* + * IGMPv1 Querier Present timer running. + * Stop IGMPv2 timer if running. + * + * If IGMPv1 was disabled since last timeout, + * revert to IGMPv3. + * If IGMPv1 is enabled, reset IGMPv2 timer if running. + */ + if (!igmp_v1enable) { + IGMP_PRINTF(("%s: transition from v%d -> v%d on %p(%s%d)\n", + __func__, igi->igi_version, IGMP_VERSION_3, + igi->igi_ifp, igi->igi_ifp->if_name, + igi->igi_ifp->if_unit)); + igi->igi_v1_timer = 0; + igi->igi_version = IGMP_VERSION_3; + IF_DRAIN(&igi->igi_v2q); + } else { + --igi->igi_v1_timer; + } + if (igi->igi_v2_timer > 0) { + IGMP_PRINTF(("%s: cancel v2 timer on %p(%s%d)\n", + __func__, igi->igi_ifp, igi->igi_ifp->if_name, + igi->igi_ifp->if_unit)); + igi->igi_v2_timer = 0; + } + } +} + +/* + * Dispatch an IGMPv1/v2 host report or leave message. + * These are always small enough to fit inside a single mbuf. 
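+ *
+ * (A v1/v2 message is a fixed 8-byte IGMP header behind a 20-byte IP
+ * header, comfortably within a single mbuf's data area.)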
+ */ +static int +igmp_v1v2_queue_report(struct in_multi *inm, const int type) +{ + struct ifnet *ifp; + struct igmp *igmp; + struct ip *ip; + struct mbuf *m; + int error = 0; + + INM_LOCK_ASSERT_HELD(inm); + IGI_LOCK_ASSERT_HELD(inm->inm_igi); + + ifp = inm->inm_ifp; + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) + return (ENOMEM); + MH_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp)); + + m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp); + + m->m_data += sizeof(struct ip); + m->m_len = sizeof(struct igmp); + + igmp = mtod(m, struct igmp *); + igmp->igmp_type = type; + igmp->igmp_code = 0; + igmp->igmp_group = inm->inm_addr; + igmp->igmp_cksum = 0; + igmp->igmp_cksum = in_cksum(m, sizeof(struct igmp)); + + m->m_data -= sizeof(struct ip); + m->m_len += sizeof(struct ip); + + ip = mtod(m, struct ip *); + ip->ip_tos = 0; + ip->ip_len = sizeof(struct ip) + sizeof(struct igmp); + ip->ip_off = 0; + ip->ip_p = IPPROTO_IGMP; + ip->ip_src.s_addr = INADDR_ANY; + + if (type == IGMP_HOST_LEAVE_MESSAGE) + ip->ip_dst.s_addr = htonl(INADDR_ALLRTRS_GROUP); + else + ip->ip_dst = inm->inm_addr; + + m->m_flags |= M_IGMPV2; + if (inm->inm_igi->igi_flags & IGIF_LOOPBACK) + m->m_flags |= M_IGMP_LOOP; + + /* + * Due to the fact that at this point we are possibly holding + * in_multihead_lock in shared or exclusive mode, we can't call + * igmp_sendpkt() here since that will eventually call ip_output(), + * which will try to lock in_multihead_lock and cause a deadlock. + * Instead we defer the work to the igmp_slowtimo() thread, thus + * avoiding unlocking in_multihead_lock here. + */ + if (IF_QFULL(&inm->inm_igi->igi_v2q)) { + IGMP_PRINTF(("%s: v1/v2 outbound queue full\n", __func__)); + error = ENOMEM; + m_freem(m); + } else + IF_ENQUEUE(&inm->inm_igi->igi_v2q, m); + + return (error); +} + +/* + * Process a state change from the upper layer for the given IPv4 group. + * + * Each socket holds a reference on the in_multi in its own ip_moptions. + * The socket layer will have made the necessary updates to the group + * state, it is now up to IGMP to issue a state change report if there + * has been any change between T0 (when the last state-change was issued) + * and T1 (now). + * + * We use the IGMPv3 state machine at group level. The IGMP module + * however makes the decision as to which IGMP protocol version to speak. + * A state change *from* INCLUDE {} always means an initial join. + * A state change *to* INCLUDE {} always means a final leave. + * + * FUTURE: If IGIF_V3LITE is enabled for this interface, then we can + * save ourselves a bunch of work; any exclusive mode groups need not + * compute source filter lists. + */ +int +igmp_change_state(struct in_multi *inm) +{ + struct igmp_ifinfo *igi; + struct ifnet *ifp; + int error = 0; + + INM_LOCK_ASSERT_HELD(inm); + VERIFY(inm->inm_igi != NULL); + IGI_LOCK_ASSERT_NOTHELD(inm->inm_igi); + + /* + * Try to detect if the upper layer just asked us to change state + * for an interface which has now gone away. + */ + VERIFY(inm->inm_ifma != NULL); + ifp = inm->inm_ifma->ifma_ifp; + /* + * Sanity check that netinet's notion of ifp is the same as net's. + */ + VERIFY(inm->inm_ifp == ifp); + + igi = IGMP_IFINFO(ifp); + VERIFY(igi != NULL); + + /* + * If we detect a state transition to or from MCAST_UNDEFINED + * for this group, then we are starting or finishing an IGMP + * life cycle for this group. 
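+	 * (For example, a socket's first IP_ADD_MEMBERSHIP moves the
+	 * group's filter mode from MCAST_UNDEFINED to MCAST_EXCLUDE,
+	 * which is treated below as an initial join.)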
+ */ + if (inm->inm_st[1].iss_fmode != inm->inm_st[0].iss_fmode) { + IGMP_PRINTF(("%s: inm transition %d -> %d\n", __func__, + inm->inm_st[0].iss_fmode, inm->inm_st[1].iss_fmode)); + if (inm->inm_st[0].iss_fmode == MCAST_UNDEFINED) { + IGMP_PRINTF(("%s: initial join\n", __func__)); + error = igmp_initial_join(inm, igi); + goto out; + } else if (inm->inm_st[1].iss_fmode == MCAST_UNDEFINED) { + IGMP_PRINTF(("%s: final leave\n", __func__)); + igmp_final_leave(inm, igi); + goto out; + } + } else { + IGMP_PRINTF(("%s: filter set change\n", __func__)); + } + + error = igmp_handle_state_change(inm, igi); +out: + return (error); +} + +/* + * Perform the initial join for an IGMP group. + * + * When joining a group: + * If the group should have its IGMP traffic suppressed, do nothing. + * IGMPv1 starts sending IGMPv1 host membership reports. + * IGMPv2 starts sending IGMPv2 host membership reports. + * IGMPv3 will schedule an IGMPv3 state-change report containing the + * initial state of the membership. + */ +static int +igmp_initial_join(struct in_multi *inm, struct igmp_ifinfo *igi) +{ + struct ifnet *ifp; + struct ifqueue *ifq; + int error, retval, syncstates; + + INM_LOCK_ASSERT_HELD(inm); + IGI_LOCK_ASSERT_NOTHELD(igi); + + IGMP_PRINTF(("%s: initial join %s on ifp %p(%s%d)\n", + __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, + inm->inm_ifp->if_name, inm->inm_ifp->if_unit)); + + error = 0; + syncstates = 1; + + ifp = inm->inm_ifp; + + IGI_LOCK(igi); + VERIFY(igi->igi_ifp == ifp); + + /* + * Groups joined on loopback or marked as 'not reported', + * e.g. 224.0.0.1, enter the IGMP_SILENT_MEMBER state and + * are never reported in any IGMP protocol exchanges. + * All other groups enter the appropriate IGMP state machine + * for the version in use on this link. + * A link marked as IGIF_SILENT causes IGMP to be completely + * disabled for the link. + */ + if ((ifp->if_flags & IFF_LOOPBACK) || + (igi->igi_flags & IGIF_SILENT) || + !igmp_isgroupreported(inm->inm_addr)) { + IGMP_PRINTF(("%s: not kicking state machine for silent group\n", + __func__)); + inm->inm_state = IGMP_SILENT_MEMBER; + inm->inm_timer = 0; + } else { + /* + * Deal with overlapping in_multi lifecycle. + * If this group was LEAVING, then make sure + * we drop the reference we picked up to keep the + * group around for the final INCLUDE {} enqueue. + * Since we cannot call in_multi_detach() here, + * defer this task to the timer routine. + */ + if (igi->igi_version == IGMP_VERSION_3 && + inm->inm_state == IGMP_LEAVING_MEMBER) { + VERIFY(inm->inm_nrelecnt != 0); + SLIST_INSERT_HEAD(&igi->igi_relinmhead, inm, inm_nrele); + } + + inm->inm_state = IGMP_REPORTING_MEMBER; + + switch (igi->igi_version) { + case IGMP_VERSION_1: + case IGMP_VERSION_2: + inm->inm_state = IGMP_IDLE_MEMBER; + error = igmp_v1v2_queue_report(inm, + (igi->igi_version == IGMP_VERSION_2) ? + IGMP_v2_HOST_MEMBERSHIP_REPORT : + IGMP_v1_HOST_MEMBERSHIP_REPORT); + + INM_LOCK_ASSERT_HELD(inm); + IGI_LOCK_ASSERT_HELD(igi); + + if (error == 0) { + inm->inm_timer = IGMP_RANDOM_DELAY( + IGMP_V1V2_MAX_RI * PR_SLOWHZ); + current_state_timers_running = 1; + } + break; + + case IGMP_VERSION_3: + /* + * Defer update of T0 to T1, until the first copy + * of the state change has been transmitted. + */ + syncstates = 0; + + /* + * Immediately enqueue a State-Change Report for + * this interface, freeing any previous reports. + * Don't kick the timers if there is nothing to do, + * or if an error occurred. 
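+			 * (igmp_v3_enqueue_group_record() returns the
+			 * number of bytes queued, zero if there was
+			 * nothing to do, or a negative errno.)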
+ */ + ifq = &inm->inm_scq; + IF_DRAIN(ifq); + retval = igmp_v3_enqueue_group_record(ifq, inm, 1, + 0, 0); + IGMP_PRINTF(("%s: enqueue record = %d\n", + __func__, retval)); + if (retval <= 0) { + error = retval * -1; + break; + } + + /* + * Schedule transmission of pending state-change + * report up to RV times for this link. The timer + * will fire at the next igmp_fasttimo (~200ms), + * giving us an opportunity to merge the reports. + */ + if (igi->igi_flags & IGIF_LOOPBACK) { + inm->inm_scrv = 1; + } else { + VERIFY(igi->igi_rv > 1); + inm->inm_scrv = igi->igi_rv; + } + inm->inm_sctimer = 1; + state_change_timers_running = 1; + + error = 0; + break; + } + } + IGI_UNLOCK(igi); + + /* + * Only update the T0 state if state change is atomic, + * i.e. we don't need to wait for a timer to fire before we + * can consider the state change to have been communicated. + */ + if (syncstates) { + inm_commit(inm); + IGMP_PRINTF(("%s: T1 -> T0 for %s/%s%d\n", __func__, + inet_ntoa(inm->inm_addr), inm->inm_ifp->if_name, + inm->inm_ifp->if_unit)); + } + + return (error); +} + +/* + * Issue an intermediate state change during the IGMP life-cycle. + */ +static int +igmp_handle_state_change(struct in_multi *inm, struct igmp_ifinfo *igi) +{ + struct ifnet *ifp; + int retval; + + INM_LOCK_ASSERT_HELD(inm); + IGI_LOCK_ASSERT_NOTHELD(igi); + + IGMP_PRINTF(("%s: state change for %s on ifp %p(%s%d)\n", + __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, + inm->inm_ifp->if_name, inm->inm_ifp->if_unit)); + + ifp = inm->inm_ifp; + + IGI_LOCK(igi); + VERIFY(igi->igi_ifp == ifp); + + if ((ifp->if_flags & IFF_LOOPBACK) || + (igi->igi_flags & IGIF_SILENT) || + !igmp_isgroupreported(inm->inm_addr) || + (igi->igi_version != IGMP_VERSION_3)) { + IGI_UNLOCK(igi); + if (!igmp_isgroupreported(inm->inm_addr)) { + IGMP_PRINTF(("%s: not kicking state " + "machine for silent group\n", __func__)); + } + IGMP_PRINTF(("%s: nothing to do\n", __func__)); + inm_commit(inm); + IGMP_PRINTF(("%s: T1 -> T0 for %s/%s\n", __func__, + inet_ntoa(inm->inm_addr), inm->inm_ifp->if_name)); + return (0); + } + + IF_DRAIN(&inm->inm_scq); + + retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0); + IGMP_PRINTF(("%s: enqueue record = %d\n", __func__, retval)); + if (retval <= 0) { + IGI_UNLOCK(igi); + return (-retval); + } + /* + * If record(s) were enqueued, start the state-change + * report timer for this group. + */ + inm->inm_scrv = ((igi->igi_flags & IGIF_LOOPBACK) ? 1 : igi->igi_rv); + inm->inm_sctimer = 1; + state_change_timers_running = 1; + IGI_UNLOCK(igi); + + return (0); +} + +/* + * Perform the final leave for an IGMP group. + * + * When leaving a group: + * IGMPv1 does nothing. + * IGMPv2 sends a host leave message, if and only if we are the reporter. + * IGMPv3 enqueues a state-change report containing a transition + * to INCLUDE {} for immediate transmission. + */ +static void +igmp_final_leave(struct in_multi *inm, struct igmp_ifinfo *igi) +{ + int syncstates = 1; + + INM_LOCK_ASSERT_HELD(inm); + IGI_LOCK_ASSERT_NOTHELD(igi); + + IGMP_PRINTF(("%s: final leave %s on ifp %p(%s%d)\n", + __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp, + inm->inm_ifp->if_name, inm->inm_ifp->if_unit)); + + switch (inm->inm_state) { + case IGMP_NOT_MEMBER: + case IGMP_SILENT_MEMBER: + case IGMP_LEAVING_MEMBER: + /* Already leaving or left; do nothing. 
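+		 * LEAVING_MEMBER lands here because a previous leave
+		 * may still be retransmitting its state-change report.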
*/ + IGMP_PRINTF(("%s: not kicking state machine for silent group\n", + __func__)); + break; + case IGMP_REPORTING_MEMBER: + case IGMP_IDLE_MEMBER: + case IGMP_G_QUERY_PENDING_MEMBER: + case IGMP_SG_QUERY_PENDING_MEMBER: + IGI_LOCK(igi); + if (igi->igi_version == IGMP_VERSION_2) { + if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER || + inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) { + panic("%s: IGMPv3 state reached, not IGMPv3 " + "mode\n", __func__); + /* NOTREACHED */ + } + igmp_v1v2_queue_report(inm, IGMP_HOST_LEAVE_MESSAGE); + + INM_LOCK_ASSERT_HELD(inm); + IGI_LOCK_ASSERT_HELD(igi); + + inm->inm_state = IGMP_NOT_MEMBER; + } else if (igi->igi_version == IGMP_VERSION_3) { + /* + * Stop group timer and all pending reports. + * Immediately enqueue a state-change report + * TO_IN {} to be sent on the next fast timeout, + * giving us an opportunity to merge reports. + */ + IF_DRAIN(&inm->inm_scq); + inm->inm_timer = 0; + if (igi->igi_flags & IGIF_LOOPBACK) { + inm->inm_scrv = 1; + } else { + inm->inm_scrv = igi->igi_rv; + } + IGMP_PRINTF(("%s: Leaving %s/%s%d with %d " + "pending retransmissions.\n", __func__, + inet_ntoa(inm->inm_addr), + inm->inm_ifp->if_name, inm->inm_ifp->if_unit, + inm->inm_scrv)); + if (inm->inm_scrv == 0) { + inm->inm_state = IGMP_NOT_MEMBER; + inm->inm_sctimer = 0; + } else { + int retval; + /* + * Stick around in the in_multihead list; + * the final detach will be issued by + * igmp_v3_process_group_timers() when + * the retransmit timer expires. + */ + INM_ADDREF_LOCKED(inm); + VERIFY(inm->inm_debug & IFD_ATTACHED); + inm->inm_reqcnt++; + VERIFY(inm->inm_reqcnt >= 1); + inm->inm_nrelecnt++; + VERIFY(inm->inm_nrelecnt != 0); + + retval = igmp_v3_enqueue_group_record( + &inm->inm_scq, inm, 1, 0, 0); + KASSERT(retval != 0, + ("%s: enqueue record = %d\n", __func__, + retval)); + + inm->inm_state = IGMP_LEAVING_MEMBER; + inm->inm_sctimer = 1; + state_change_timers_running = 1; + syncstates = 0; + } + } + IGI_UNLOCK(igi); + break; + case IGMP_LAZY_MEMBER: + case IGMP_SLEEPING_MEMBER: + case IGMP_AWAKENING_MEMBER: + /* Our reports are suppressed; do nothing. */ + break; + } + + if (syncstates) { + inm_commit(inm); + IGMP_PRINTF(("%s: T1 -> T0 for %s/%s%d\n", __func__, + inet_ntoa(inm->inm_addr), inm->inm_ifp->if_name, + inm->inm_ifp->if_unit)); + inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; + IGMP_PRINTF(("%s: T1 now MCAST_UNDEFINED for %s/%s%d\n", + __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_name, + inm->inm_ifp->if_unit)); + } +} + +/* + * Enqueue an IGMPv3 group record to the given output queue. + * + * XXX This function could do with having the allocation code + * split out, and the multiple-tree-walks coalesced into a single + * routine as has been done in igmp_v3_enqueue_filter_change(). + * + * If is_state_change is zero, a current-state record is appended. + * If is_state_change is non-zero, a state-change report is appended. + * + * If is_group_query is non-zero, an mbuf packet chain is allocated. + * If is_group_query is zero, and if there is a packet with free space + * at the tail of the queue, it will be appended to providing there + * is enough free space. + * Otherwise a new mbuf packet chain is allocated. + * + * If is_source_query is non-zero, each source is checked to see if + * it was recorded for a Group-Source query, and will be omitted if + * it is not both in-mode and recorded. + * + * The function will attempt to allocate leading space in the packet + * for the IP/IGMP header to be prepended without fragmenting the chain. 
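+ * (IGMP_LEADINGSPACE is assumed to cover the IP header, the
+ * router-alert option and the v3 report header that
+ * igmp_v3_encap_report() prepends at transmit time.)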
+ * + * If successful the size of all data appended to the queue is returned, + * otherwise an error code less than zero is returned, or zero if + * no record(s) were appended. + */ +static int +igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm, + const int is_state_change, const int is_group_query, + const int is_source_query) +{ + struct igmp_grouprec ig; + struct igmp_grouprec *pig; + struct ifnet *ifp; + struct ip_msource *ims, *nims; + struct mbuf *m0, *m, *md; + int error, is_filter_list_change; + int minrec0len, m0srcs, msrcs, nbytes, off; + int record_has_sources; + int now; + int type; + in_addr_t naddr; + uint8_t mode; + + INM_LOCK_ASSERT_HELD(inm); + IGI_LOCK_ASSERT_HELD(inm->inm_igi); + + error = 0; + ifp = inm->inm_ifp; + is_filter_list_change = 0; + m = NULL; + m0 = NULL; + m0srcs = 0; + msrcs = 0; + nbytes = 0; + nims = NULL; + record_has_sources = 1; + pig = NULL; + type = IGMP_DO_NOTHING; + mode = inm->inm_st[1].iss_fmode; + + /* + * If we did not transition out of ASM mode during t0->t1, + * and there are no source nodes to process, we can skip + * the generation of source records. + */ + if (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0 && + inm->inm_nsrc == 0) + record_has_sources = 0; + + if (is_state_change) { + /* + * Queue a state change record. + * If the mode did not change, and there are non-ASM + * listeners or source filters present, + * we potentially need to issue two records for the group. + * If we are transitioning to MCAST_UNDEFINED, we need + * not send any sources. + * If there are ASM listeners, and there was no filter + * mode transition of any kind, do nothing. + */ + if (mode != inm->inm_st[0].iss_fmode) { + if (mode == MCAST_EXCLUDE) { + IGMP_PRINTF(("%s: change to EXCLUDE\n", + __func__)); + type = IGMP_CHANGE_TO_EXCLUDE_MODE; + } else { + IGMP_PRINTF(("%s: change to INCLUDE\n", + __func__)); + type = IGMP_CHANGE_TO_INCLUDE_MODE; + if (mode == MCAST_UNDEFINED) + record_has_sources = 0; + } + } else { + if (record_has_sources) { + is_filter_list_change = 1; + } else { + type = IGMP_DO_NOTHING; + } + } + } else { + /* + * Queue a current state record. + */ + if (mode == MCAST_EXCLUDE) { + type = IGMP_MODE_IS_EXCLUDE; + } else if (mode == MCAST_INCLUDE) { + type = IGMP_MODE_IS_INCLUDE; + VERIFY(inm->inm_st[1].iss_asm == 0); + } + } + + /* + * Generate the filter list changes using a separate function. + */ + if (is_filter_list_change) + return (igmp_v3_enqueue_filter_change(ifq, inm)); + + if (type == IGMP_DO_NOTHING) { + IGMP_PRINTF(("%s: nothing to do for %s/%s%d\n", + __func__, inet_ntoa(inm->inm_addr), + inm->inm_ifp->if_name, inm->inm_ifp->if_unit)); + return (0); + } + + /* + * If any sources are present, we must be able to fit at least + * one in the trailing space of the tail packet's mbuf, + * ideally more. + */ + minrec0len = sizeof(struct igmp_grouprec); + if (record_has_sources) + minrec0len += sizeof(in_addr_t); + + IGMP_PRINTF(("%s: queueing %s for %s/%s%d\n", __func__, + igmp_rec_type_to_str(type), inet_ntoa(inm->inm_addr), + inm->inm_ifp->if_name, inm->inm_ifp->if_unit)); + + /* + * Check if we have a packet in the tail of the queue for this + * group into which the first group record for this group will fit. + * Otherwise allocate a new packet. + * Always allocate leading space for IP+RA_OPT+IGMP+REPORT. + * Note: Group records for G/GSR query responses MUST be sent + * in their own packet. 
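+	 * (That requirement is why is_group_query forces a fresh
+	 * allocation below instead of appending to the tail packet.)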
+ */ + m0 = ifq->ifq_tail; + if (!is_group_query && + m0 != NULL && + (m0->m_pkthdr.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) && + (m0->m_pkthdr.len + minrec0len) < + (ifp->if_mtu - IGMP_LEADINGSPACE)) { + m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - + sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); + m = m0; + IGMP_PRINTF(("%s: use existing packet\n", __func__)); + } else { + if (IF_QFULL(ifq)) { + IGMP_PRINTF(("%s: outbound queue full\n", __func__)); + return (-ENOMEM); + } + m = NULL; + m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - + sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); + if (!is_state_change && !is_group_query) { + m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + if (m) + m->m_data += IGMP_LEADINGSPACE; + } + if (m == NULL) { + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m) + MH_ALIGN(m, IGMP_LEADINGSPACE); + } + if (m == NULL) + return (-ENOMEM); + + IGMP_PRINTF(("%s: allocated first packet\n", __func__)); + } + + /* + * Append group record. + * If we have sources, we don't know how many yet. + */ + ig.ig_type = type; + ig.ig_datalen = 0; + ig.ig_numsrc = 0; + ig.ig_group = inm->inm_addr; + if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { + if (m != m0) + m_freem(m); + IGMP_PRINTF(("%s: m_append() failed.\n", __func__)); + return (-ENOMEM); + } + nbytes += sizeof(struct igmp_grouprec); + + /* + * Append as many sources as will fit in the first packet. + * If we are appending to a new packet, the chain allocation + * may potentially use clusters; use m_getptr() in this case. + * If we are appending to an existing packet, we need to obtain + * a pointer to the group record after m_append(), in case a new + * mbuf was allocated. + * Only append sources which are in-mode at t1. If we are + * transitioning to MCAST_UNDEFINED state on the group, do not + * include source entries. + * Only report recorded sources in our filter set when responding + * to a group-source query. + */ + if (record_has_sources) { + if (m == m0) { + md = m_last(m); + pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + + md->m_len - nbytes); + } else { + md = m_getptr(m, 0, &off); + pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + + off); + } + msrcs = 0; + RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) { + IGMP_PRINTF(("%s: visit node %s\n", __func__, + inet_ntoa_haddr(ims->ims_haddr))); + now = ims_get_mode(inm, ims, 1); + IGMP_PRINTF(("%s: node is %d\n", __func__, now)); + if ((now != mode) || + (now == mode && mode == MCAST_UNDEFINED)) { + IGMP_PRINTF(("%s: skip node\n", __func__)); + continue; + } + if (is_source_query && ims->ims_stp == 0) { + IGMP_PRINTF(("%s: skip unrecorded node\n", + __func__)); + continue; + } + IGMP_PRINTF(("%s: append node\n", __func__)); + naddr = htonl(ims->ims_haddr); + if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { + if (m != m0) + m_freem(m); + IGMP_PRINTF(("%s: m_append() failed.\n", + __func__)); + return (-ENOMEM); + } + nbytes += sizeof(in_addr_t); + ++msrcs; + if (msrcs == m0srcs) + break; + } + IGMP_PRINTF(("%s: msrcs is %d this packet\n", __func__, + msrcs)); + pig->ig_numsrc = htons(msrcs); + nbytes += (msrcs * sizeof(in_addr_t)); + } + + if (is_source_query && msrcs == 0) { + IGMP_PRINTF(("%s: no recorded sources to report\n", __func__)); + if (m != m0) + m_freem(m); + return (0); + } + + /* + * We are good to go with first packet. 
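+	 * vt_nrecs in the packet header tracks the record count so
+	 * that the queue respects IGMP_V3_REPORT_MAXRECS.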
+ */ + if (m != m0) { + IGMP_PRINTF(("%s: enqueueing first packet\n", __func__)); + m->m_pkthdr.vt_nrecs = 1; + m->m_pkthdr.rcvif = ifp; + IF_ENQUEUE(ifq, m); + } else { + m->m_pkthdr.vt_nrecs++; + } + /* + * No further work needed if no source list in packet(s). + */ + if (!record_has_sources) + return (nbytes); + + /* + * Whilst sources remain to be announced, we need to allocate + * a new packet and fill out as many sources as will fit. + * Always try for a cluster first. + */ + while (nims != NULL) { + if (IF_QFULL(ifq)) { + IGMP_PRINTF(("%s: outbound queue full\n", __func__)); + return (-ENOMEM); + } + m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + if (m) + m->m_data += IGMP_LEADINGSPACE; + if (m == NULL) { + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m) + MH_ALIGN(m, IGMP_LEADINGSPACE); + } + if (m == NULL) + return (-ENOMEM); + md = m_getptr(m, 0, &off); + pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off); + IGMP_PRINTF(("%s: allocated next packet\n", __func__)); + + if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) { + if (m != m0) + m_freem(m); + IGMP_PRINTF(("%s: m_append() failed.\n", __func__)); + return (-ENOMEM); + } + m->m_pkthdr.vt_nrecs = 1; + nbytes += sizeof(struct igmp_grouprec); + + m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - + sizeof(struct igmp_grouprec)) / sizeof(in_addr_t); + + msrcs = 0; + RB_FOREACH_FROM(ims, ip_msource_tree, nims) { + IGMP_PRINTF(("%s: visit node %s\n", __func__, + inet_ntoa_haddr(ims->ims_haddr))); + now = ims_get_mode(inm, ims, 1); + if ((now != mode) || + (now == mode && mode == MCAST_UNDEFINED)) { + IGMP_PRINTF(("%s: skip node\n", __func__)); + continue; + } + if (is_source_query && ims->ims_stp == 0) { + IGMP_PRINTF(("%s: skip unrecorded node\n", + __func__)); + continue; + } + IGMP_PRINTF(("%s: append node\n", __func__)); + naddr = htonl(ims->ims_haddr); + if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) { + if (m != m0) + m_freem(m); + IGMP_PRINTF(("%s: m_append() failed.\n", + __func__)); + return (-ENOMEM); + } + ++msrcs; + if (msrcs == m0srcs) + break; + } + pig->ig_numsrc = htons(msrcs); + nbytes += (msrcs * sizeof(in_addr_t)); + + IGMP_PRINTF(("%s: enqueueing next packet\n", __func__)); + m->m_pkthdr.rcvif = ifp; + IF_ENQUEUE(ifq, m); + } + + return (nbytes); +} + +/* + * Type used to mark record pass completion. + * We exploit the fact we can cast to this easily from the + * current filter modes on each ip_msource node. + */ +typedef enum { + REC_NONE = 0x00, /* MCAST_UNDEFINED */ + REC_ALLOW = 0x01, /* MCAST_INCLUDE */ + REC_BLOCK = 0x02, /* MCAST_EXCLUDE */ + REC_FULL = REC_ALLOW | REC_BLOCK +} rectype_t; + +/* + * Enqueue an IGMPv3 filter list change to the given output queue. + * + * Source list filter state is held in an RB-tree. When the filter list + * for a group is changed without changing its mode, we need to compute + * the deltas between T0 and T1 for each source in the filter set, + * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records. + * + * As we may potentially queue two record types, and the entire R-B tree + * needs to be walked at once, we break this out into its own function + * so we can generate a tightly packed queue of packets. + * + * XXX This could be written to only use one tree walk, although that makes + * serializing into the mbuf chains a bit harder. For now we do two walks + * which makes things easier on us, and it may or may not be harder on + * the L2 cache. 
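+ * (Each pass emits a single record type: the first delta encountered
+ * selects ALLOW_NEW or BLOCK_OLD via crt, and drt marks the record
+ * types already completed.)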
+ * + * If successful the size of all data appended to the queue is returned, + * otherwise an error code less than zero is returned, or zero if + * no record(s) were appended. + */ +static int +igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm) +{ + static const int MINRECLEN = + sizeof(struct igmp_grouprec) + sizeof(in_addr_t); + struct ifnet *ifp; + struct igmp_grouprec ig; + struct igmp_grouprec *pig; + struct ip_msource *ims, *nims; + struct mbuf *m, *m0, *md; + in_addr_t naddr; + int m0srcs, nbytes, npbytes, off, rsrcs, schanged; + int nallow, nblock; + uint8_t mode, now, then; + rectype_t crt, drt, nrt; + + INM_LOCK_ASSERT_HELD(inm); + + if (inm->inm_nsrc == 0 || + (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0)) + return (0); + + ifp = inm->inm_ifp; /* interface */ + mode = inm->inm_st[1].iss_fmode; /* filter mode at t1 */ + crt = REC_NONE; /* current group record type */ + drt = REC_NONE; /* mask of completed group record types */ + nrt = REC_NONE; /* record type for current node */ + m0srcs = 0; /* # source which will fit in current mbuf chain */ + nbytes = 0; /* # of bytes appended to group's state-change queue */ + npbytes = 0; /* # of bytes appended this packet */ + rsrcs = 0; /* # sources encoded in current record */ + schanged = 0; /* # nodes encoded in overall filter change */ + nallow = 0; /* # of source entries in ALLOW_NEW */ + nblock = 0; /* # of source entries in BLOCK_OLD */ + nims = NULL; /* next tree node pointer */ + + /* + * For each possible filter record mode. + * The first kind of source we encounter tells us which + * is the first kind of record we start appending. + * If a node transitioned to UNDEFINED at t1, its mode is treated + * as the inverse of the group's filter mode. + */ + while (drt != REC_FULL) { + do { + m0 = ifq->ifq_tail; + if (m0 != NULL && + (m0->m_pkthdr.vt_nrecs + 1 <= + IGMP_V3_REPORT_MAXRECS) && + (m0->m_pkthdr.len + MINRECLEN) < + (ifp->if_mtu - IGMP_LEADINGSPACE)) { + m = m0; + m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - + sizeof(struct igmp_grouprec)) / + sizeof(in_addr_t); + IGMP_PRINTF(("%s: use previous packet\n", + __func__)); + } else { + m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + if (m) + m->m_data += IGMP_LEADINGSPACE; + if (m == NULL) { + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m) + MH_ALIGN(m, IGMP_LEADINGSPACE); + } + if (m == NULL) { + IGMP_PRINTF(("%s: m_get*() failed\n", + __func__)); + return (-ENOMEM); + } + m->m_pkthdr.vt_nrecs = 0; + m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE - + sizeof(struct igmp_grouprec)) / + sizeof(in_addr_t); + npbytes = 0; + IGMP_PRINTF(("%s: allocated new packet\n", + __func__)); + } + /* + * Append the IGMP group record header to the + * current packet's data area. + * Recalculate pointer to free space for next + * group record, in case m_append() allocated + * a new mbuf or cluster. 
+ */ + memset(&ig, 0, sizeof(ig)); + ig.ig_group = inm->inm_addr; + if (!m_append(m, sizeof(ig), (void *)&ig)) { + if (m != m0) + m_freem(m); + IGMP_PRINTF(("%s: m_append() failed\n", + __func__)); + return (-ENOMEM); + } + npbytes += sizeof(struct igmp_grouprec); + if (m != m0) { + /* new packet; offset in c hain */ + md = m_getptr(m, npbytes - + sizeof(struct igmp_grouprec), &off); + pig = (struct igmp_grouprec *)(mtod(md, + uint8_t *) + off); + } else { + /* current packet; offset from last append */ + md = m_last(m); + pig = (struct igmp_grouprec *)(mtod(md, + uint8_t *) + md->m_len - + sizeof(struct igmp_grouprec)); + } + /* + * Begin walking the tree for this record type + * pass, or continue from where we left off + * previously if we had to allocate a new packet. + * Only report deltas in-mode at t1. + * We need not report included sources as allowed + * if we are in inclusive mode on the group, + * however the converse is not true. + */ + rsrcs = 0; + if (nims == NULL) + nims = RB_MIN(ip_msource_tree, &inm->inm_srcs); + RB_FOREACH_FROM(ims, ip_msource_tree, nims) { + IGMP_PRINTF(("%s: visit node %s\n", + __func__, inet_ntoa_haddr(ims->ims_haddr))); + now = ims_get_mode(inm, ims, 1); + then = ims_get_mode(inm, ims, 0); + IGMP_PRINTF(("%s: mode: t0 %d, t1 %d\n", + __func__, then, now)); + if (now == then) { + IGMP_PRINTF(("%s: skip unchanged\n", + __func__)); + continue; + } + if (mode == MCAST_EXCLUDE && + now == MCAST_INCLUDE) { + IGMP_PRINTF(("%s: skip IN src on EX " + "group\n", __func__)); + continue; + } + nrt = (rectype_t)now; + if (nrt == REC_NONE) + nrt = (rectype_t)(~mode & REC_FULL); + if (schanged++ == 0) { + crt = nrt; + } else if (crt != nrt) + continue; + naddr = htonl(ims->ims_haddr); + if (!m_append(m, sizeof(in_addr_t), + (void *)&naddr)) { + if (m != m0) + m_freem(m); + IGMP_PRINTF(("%s: m_append() failed\n", + __func__)); + return (-ENOMEM); + } + nallow += !!(crt == REC_ALLOW); + nblock += !!(crt == REC_BLOCK); + if (++rsrcs == m0srcs) + break; + } + /* + * If we did not append any tree nodes on this + * pass, back out of allocations. + */ + if (rsrcs == 0) { + npbytes -= sizeof(struct igmp_grouprec); + if (m != m0) { + IGMP_PRINTF(("%s: m_free(m)\n", + __func__)); + m_freem(m); + } else { + IGMP_PRINTF(("%s: m_adj(m, -ig)\n", + __func__)); + m_adj(m, -((int)sizeof( + struct igmp_grouprec))); + } + continue; + } + npbytes += (rsrcs * sizeof(in_addr_t)); + if (crt == REC_ALLOW) + pig->ig_type = IGMP_ALLOW_NEW_SOURCES; + else if (crt == REC_BLOCK) + pig->ig_type = IGMP_BLOCK_OLD_SOURCES; + pig->ig_numsrc = htons(rsrcs); + /* + * Count the new group record, and enqueue this + * packet if it wasn't already queued. + */ + m->m_pkthdr.vt_nrecs++; + m->m_pkthdr.rcvif = ifp; + if (m != m0) + IF_ENQUEUE(ifq, m); + nbytes += npbytes; + } while (nims != NULL); + drt |= crt; + crt = (~crt & REC_FULL); + } + + IGMP_PRINTF(("%s: queued %d ALLOW_NEW, %d BLOCK_OLD\n", __func__, + nallow, nblock)); + + return (nbytes); +} + +static int +igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq) +{ + struct ifqueue *gq; + struct mbuf *m; /* pending state-change */ + struct mbuf *m0; /* copy of pending state-change */ + struct mbuf *mt; /* last state-change in packet */ + struct mbuf *n; + int docopy, domerge; + u_int recslen; + + INM_LOCK_ASSERT_HELD(inm); + + docopy = 0; + domerge = 0; + recslen = 0; + + /* + * If there are further pending retransmissions, make a writable + * copy of each queued state-change message before merging. 
+	 */
+	if (inm->inm_scrv > 0)
+		docopy = 1;
+
+	gq = &inm->inm_scq;
+#ifdef IGMP_DEBUG
+	if (gq->ifq_head == NULL) {
+		IGMP_PRINTF(("%s: WARNING: queue for inm %p is empty\n",
+		    __func__, inm));
+	}
+#endif
+
+	/*
+	 * Use IF_REMQUEUE() instead of IF_DEQUEUE() below, since the
+	 * packet might not always be at the head of the ifqueue.
+	 */
+	m = gq->ifq_head;
+	while (m != NULL) {
+		/*
+		 * Only merge the report into the current packet if
+		 * there is sufficient space to do so; an IGMPv3 report
+		 * packet may only contain 65,535 group records.
+		 * Always use a simple mbuf chain concatenation to do this,
+		 * as large state changes for single groups may have
+		 * allocated clusters.
+		 */
+		domerge = 0;
+		mt = ifscq->ifq_tail;
+		if (mt != NULL) {
+			recslen = m_length(m);
+
+			if ((mt->m_pkthdr.vt_nrecs +
+			    m->m_pkthdr.vt_nrecs <=
+			    IGMP_V3_REPORT_MAXRECS) &&
+			    (mt->m_pkthdr.len + recslen <=
+			    (inm->inm_ifp->if_mtu - IGMP_LEADINGSPACE)))
+				domerge = 1;
+		}
+
+		if (!domerge && IF_QFULL(gq)) {
+			IGMP_PRINTF(("%s: outbound queue full, skipping whole "
+			    "packet %p\n", __func__, m));
+			n = m->m_nextpkt;
+			if (!docopy) {
+				IF_REMQUEUE(gq, m);
+				m_freem(m);
+			}
+			m = n;
+			continue;
+		}
+
+		if (!docopy) {
+			IGMP_PRINTF(("%s: dequeueing %p\n", __func__, m));
+			n = m->m_nextpkt;
+			IF_REMQUEUE(gq, m);
+			m0 = m;
+			m = n;
+		} else {
+			IGMP_PRINTF(("%s: copying %p\n", __func__, m));
+			m0 = m_dup(m, M_NOWAIT);
+			if (m0 == NULL)
+				return (ENOMEM);
+			m0->m_nextpkt = NULL;
+			m = m->m_nextpkt;
+		}
+
+		if (!domerge) {
+			IGMP_PRINTF(("%s: queueing %p to ifscq %p\n",
+			    __func__, m0, ifscq));
+			m0->m_pkthdr.rcvif = inm->inm_ifp;
+			IF_ENQUEUE(ifscq, m0);
+		} else {
+			struct mbuf *mtl;	/* last mbuf of packet mt */
+
+			IGMP_PRINTF(("%s: merging %p with ifscq tail %p\n",
+			    __func__, m0, mt));
+
+			mtl = m_last(mt);
+			m0->m_flags &= ~M_PKTHDR;
+			mt->m_pkthdr.len += recslen;
+			mt->m_pkthdr.vt_nrecs +=
+			    m0->m_pkthdr.vt_nrecs;
+
+			mtl->m_next = m0;
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Respond to a pending IGMPv3 General Query.
+ */
+static void
+igmp_v3_dispatch_general_query(struct igmp_ifinfo *igi)
+{
+	struct ifnet *ifp;
+	struct in_multi *inm;
+	struct in_multistep step;
+	int retval, loop;
+
+	IGI_LOCK_ASSERT_HELD(igi);
+
+	VERIFY(igi->igi_version == IGMP_VERSION_3);
+
+	ifp = igi->igi_ifp;
+	IGI_UNLOCK(igi);
+
+	in_multihead_lock_shared();
+	IN_FIRST_MULTI(step, inm);
+	while (inm != NULL) {
+		INM_LOCK(inm);
+		if (inm->inm_ifp != ifp)
+			goto next;
+
+		switch (inm->inm_state) {
+		case IGMP_NOT_MEMBER:
+		case IGMP_SILENT_MEMBER:
+			break;
+		case IGMP_REPORTING_MEMBER:
+		case IGMP_IDLE_MEMBER:
+		case IGMP_LAZY_MEMBER:
+		case IGMP_SLEEPING_MEMBER:
+		case IGMP_AWAKENING_MEMBER:
+			inm->inm_state = IGMP_REPORTING_MEMBER;
+			IGI_LOCK(igi);
+			retval = igmp_v3_enqueue_group_record(&igi->igi_gq,
+			    inm, 0, 0, 0);
+			IGI_UNLOCK(igi);
+			IGMP_PRINTF(("%s: enqueue record = %d\n",
+			    __func__, retval));
+			break;
+		case IGMP_G_QUERY_PENDING_MEMBER:
+		case IGMP_SG_QUERY_PENDING_MEMBER:
+		case IGMP_LEAVING_MEMBER:
+			break;
+		}
+next:
+		INM_UNLOCK(inm);
+		IN_NEXT_MULTI(step, inm);
+	}
+	in_multihead_lock_done();
+
+	IGI_LOCK(igi);
+	loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0;
+	igmp_dispatch_queue(igi, &igi->igi_gq, IGMP_MAX_RESPONSE_BURST,
+	    loop, ifp);
+	IGI_LOCK_ASSERT_HELD(igi);
+	/*
+	 * Slew transmission of bursts over 500ms intervals.
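+	 * (IGMP_RANDOM_DELAY(IGMP_RESPONSE_BURST_INTERVAL) yields a value
+	 * in 1..PR_SLOWHZ, so responses still queued after this burst go
+	 * out on a later timer pass rather than all at once.)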
+	 */
+	if (igi->igi_gq.ifq_head != NULL) {
+		igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY(
+		    IGMP_RESPONSE_BURST_INTERVAL);
+		interface_timers_running = 1;
+	}
+}
+
+/*
+ * Transmit the next pending IGMP message in the output queue.
+ *
+ * Must not be called with inm_lock or igi_lock held.
+ */
+void
+igmp_sendpkt(struct mbuf *m, struct ifnet *ifp)
+{
+	struct ip_moptions *imo;
+	struct mbuf *ipopts, *m0;
+	int error;
+	struct route ro;
+
+	IGMP_PRINTF(("%s: transmit %p\n", __func__, m));
+
+	/*
+	 * Check if the ifnet is still attached.
+	 */
+	if (ifp == NULL || !ifnet_is_attached(ifp, 0)) {
+		IGMP_PRINTF(("%s: dropped %p as ifp went away.\n",
+		    __func__, m));
+		m_freem(m);
+		OSAddAtomic(1, &ipstat.ips_noroute);
+		return;
+	}
+
+	ipopts = igmp_sendra ? m_raopt : NULL;
+
+	imo = ip_allocmoptions(M_WAITOK);
+	if (imo == NULL) {
+		m_freem(m);
+		return;
+	}
+
+	imo->imo_multicast_ttl = 1;
+	imo->imo_multicast_vif = -1;
+#if MROUTING
+	imo->imo_multicast_loop = (ip_mrouter != NULL);
+#else
+	imo->imo_multicast_loop = 0;
+#endif
+
+	/*
+	 * If the user requested that IGMP traffic be explicitly
+	 * redirected to the loopback interface (e.g. they are running a
+	 * MANET interface and the routing protocol needs to see the
+	 * updates), handle this now.
+	 */
+	if (m->m_flags & M_IGMP_LOOP)
+		imo->imo_multicast_ifp = lo_ifp;
+	else
+		imo->imo_multicast_ifp = ifp;
+
+	if (m->m_flags & M_IGMPV2) {
+		m0 = m;
+	} else {
+		m0 = igmp_v3_encap_report(ifp, m);
+		if (m0 == NULL) {
+			/*
+			 * If igmp_v3_encap_report() failed, then M_PREPEND()
+			 * already freed the original mbuf chain.
+			 * This means that we don't have to m_freem(m) here.
+			 */
+			IGMP_PRINTF(("%s: dropped %p\n", __func__, m));
+			IMO_REMREF(imo);
+			atomic_add_32(&ipstat.ips_odropped, 1);
+			return;
+		}
+	}
+
+	m->m_flags &= ~(M_PROTOFLAGS | M_IGMP_LOOP);
+	m0->m_pkthdr.rcvif = lo_ifp;
+#ifdef MAC
+	mac_netinet_igmp_send(ifp, m0);
+#endif
+	bzero(&ro, sizeof (ro));
+	error = ip_output(m0, ipopts, &ro, 0, imo, NULL);
+	if (ro.ro_rt != NULL) {
+		rtfree(ro.ro_rt);
+		ro.ro_rt = NULL;
+	}
+
+	IMO_REMREF(imo);
+
+	if (error) {
+		IGMP_PRINTF(("%s: ip_output(%p) = %d\n", __func__, m0, error));
+		return;
+	}
+
+	IGMPSTAT_INC(igps_snd_reports);
+	OIGMPSTAT_INC(igps_snd_reports);
+}
+
+/*
+ * Encapsulate an IGMPv3 report.
+ *
+ * The internal mbuf flag M_IGMPV3_HDR is used to indicate that the mbuf
+ * chain has already had its IP/IGMPv3 header prepended. In this case
+ * the function will not attempt to prepend; the lengths and checksums
+ * will however be re-computed.
+ *
+ * Returns a pointer to the new mbuf chain head, or NULL if the
+ * allocation failed.
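+ *
+ * (For reference, the chain handed back here is
+ * [20-byte struct ip][8-byte struct igmp_report][group records ...];
+ * ir_cksum covers the report header plus the records, while the IP
+ * header checksum, and the Router Alert option passed separately as
+ * ipopts, are left to ip_output().)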
+ */ +static struct mbuf * +igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m) +{ + struct igmp_report *igmp; + struct ip *ip; + int hdrlen, igmpreclen; + + VERIFY((m->m_flags & M_PKTHDR)); + + igmpreclen = m_length(m); + hdrlen = sizeof(struct ip) + sizeof(struct igmp_report); + + if (m->m_flags & M_IGMPV3_HDR) { + igmpreclen -= hdrlen; + } else { + M_PREPEND(m, hdrlen, M_DONTWAIT); + if (m == NULL) + return (NULL); + m->m_flags |= M_IGMPV3_HDR; + } + + IGMP_PRINTF(("%s: igmpreclen is %d\n", __func__, igmpreclen)); + + m->m_data += sizeof(struct ip); + m->m_len -= sizeof(struct ip); + + igmp = mtod(m, struct igmp_report *); + igmp->ir_type = IGMP_v3_HOST_MEMBERSHIP_REPORT; + igmp->ir_rsv1 = 0; + igmp->ir_rsv2 = 0; + igmp->ir_numgrps = htons(m->m_pkthdr.vt_nrecs); + igmp->ir_cksum = 0; + igmp->ir_cksum = in_cksum(m, sizeof(struct igmp_report) + igmpreclen); + m->m_pkthdr.vt_nrecs = 0; + + m->m_data -= sizeof(struct ip); + m->m_len += sizeof(struct ip); + + ip = mtod(m, struct ip *); + ip->ip_tos = IPTOS_PREC_INTERNETCONTROL; + ip->ip_len = hdrlen + igmpreclen; + ip->ip_off = IP_DF; + ip->ip_p = IPPROTO_IGMP; + ip->ip_sum = 0; + + ip->ip_src.s_addr = INADDR_ANY; + + if (m->m_flags & M_IGMP_LOOP) { + struct in_ifaddr *ia; + + IFP_TO_IA(ifp, ia); + if (ia != NULL) { + IFA_LOCK(&ia->ia_ifa); + ip->ip_src = ia->ia_addr.sin_addr; + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); + } + } + + ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP); + + return (m); +} + +#ifdef IGMP_DEBUG +static const char * +igmp_rec_type_to_str(const int type) +{ + switch (type) { + case IGMP_CHANGE_TO_EXCLUDE_MODE: + return "TO_EX"; + break; + case IGMP_CHANGE_TO_INCLUDE_MODE: + return "TO_IN"; + break; + case IGMP_MODE_IS_EXCLUDE: + return "MODE_EX"; + break; + case IGMP_MODE_IS_INCLUDE: + return "MODE_IN"; + break; + case IGMP_ALLOW_NEW_SOURCES: + return "ALLOW_NEW"; + break; + case IGMP_BLOCK_OLD_SOURCES: + return "BLOCK_OLD"; + break; + default: + break; + } + return "unknown"; +} +#endif + +void +igmp_init(void) +{ + + IGMP_PRINTF(("%s: initializing\n", __func__)); + + igmp_timers_are_running = 0; + + /* Setup lock group and attribute for igmp_mtx */ + igmp_mtx_grp_attr = lck_grp_attr_alloc_init(); + igmp_mtx_grp = lck_grp_alloc_init("igmp_mtx", igmp_mtx_grp_attr); + igmp_mtx_attr = lck_attr_alloc_init(); + lck_mtx_init(&igmp_mtx, igmp_mtx_grp, igmp_mtx_attr); + + LIST_INIT(&igi_head); + m_raopt = igmp_ra_alloc(); + + igi_size = sizeof (struct igmp_ifinfo); + igi_zone = zinit(igi_size, IGI_ZONE_MAX * igi_size, + 0, IGI_ZONE_NAME); + if (igi_zone == NULL) { + panic("%s: failed allocating %s", __func__, IGI_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(igi_zone, Z_EXPAND, TRUE); + zone_change(igi_zone, Z_CALLERACCT, FALSE); +} diff --git a/bsd/netinet/igmp.h b/bsd/netinet/igmp.h index 3774cd860..28352317d 100644 --- a/bsd/netinet/igmp.h +++ b/bsd/netinet/igmp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2010 Apple, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -77,8 +77,11 @@ * MULTICAST Revision: 3.5.1.2 */ +/* Minimum length of any IGMP protocol message. */ +#define IGMP_MINLEN 8 + /* - * IGMP packet format. + * IGMPv1/v2 query and host report format. */ struct igmp { u_char igmp_type; /* version & type of IGMP message */ @@ -87,38 +90,91 @@ struct igmp { struct in_addr igmp_group; /* group address being reported */ }; /* (zero for queries) */ -#define IGMP_MINLEN 8 +/* + * IGMP v3 query format. 
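+ *
+ * (Per RFC 3376, the Max Resp Code and QQIC fields use a fixed/float
+ * encoding: values below 128 are taken literally; otherwise the time
+ * is (IGMP_MANT(x) | 0x10) << (IGMP_EXP(x) + 3). For example, a Max
+ * Resp Code of 0x80 decodes to 0x10 << 3 = 128 tenths of a second,
+ * i.e. 12.8 s.)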
+ */ +struct igmpv3 { + u_char igmp_type; /* version & type of IGMP message */ + u_char igmp_code; /* subtype for routing msgs */ + u_short igmp_cksum; /* IP-style checksum */ + struct in_addr igmp_group; /* group address being reported */ + /* (zero for queries) */ + u_char igmp_misc; /* reserved/suppress/robustness */ + u_char igmp_qqi; /* querier's query interval */ + u_short igmp_numsrc; /* number of sources */ + /*struct in_addr igmp_sources[1];*/ /* source addresses */ +}; +#define IGMP_V3_QUERY_MINLEN 12 +#define IGMP_EXP(x) (((x) >> 4) & 0x07) +#define IGMP_MANT(x) ((x) & 0x0f) +#define IGMP_QRESV(x) (((x) >> 4) & 0x0f) +#define IGMP_SFLAG(x) (((x) >> 3) & 0x01) +#define IGMP_QRV(x) ((x) & 0x07) + +struct igmp_grouprec { + u_char ig_type; /* record type */ + u_char ig_datalen; /* length of auxiliary data */ + u_short ig_numsrc; /* number of sources */ + struct in_addr ig_group; /* group address being reported */ + /*struct in_addr ig_sources[1];*/ /* source addresses */ +}; +#define IGMP_GRPREC_HDRLEN 8 /* - * Message types, including version number. + * IGMPv3 host membership report header. */ -#define IGMP_MEMBERSHIP_QUERY 0x11 /* membership query */ -#define IGMP_V1_MEMBERSHIP_REPORT 0x12 /* Ver. 1 membership report */ -#define IGMP_V2_MEMBERSHIP_REPORT 0x16 /* Ver. 2 membership report */ -#define IGMP_V2_LEAVE_GROUP 0x17 /* Leave-group message */ +struct igmp_report { + u_char ir_type; /* IGMP_v3_HOST_MEMBERSHIP_REPORT */ + u_char ir_rsv1; /* must be zero */ + u_short ir_cksum; /* checksum */ + u_short ir_rsv2; /* must be zero */ + u_short ir_numgrps; /* number of group records */ + /*struct igmp_grouprec ir_groups[1];*/ /* group records */ +}; +#define IGMP_V3_REPORT_MINLEN 8 +#define IGMP_V3_REPORT_MAXRECS 65535 +/* + * Message types, including version number. + */ +#define IGMP_HOST_MEMBERSHIP_QUERY 0x11 /* membership query */ +#define IGMP_v1_HOST_MEMBERSHIP_REPORT 0x12 /* Ver. 1 membership report */ #define IGMP_DVMRP 0x13 /* DVMRP routing message */ -#define IGMP_PIM 0x14 /* PIM routing message */ - -#define IGMP_MTRACE_RESP 0x1e /* traceroute resp.(to sender)*/ -#define IGMP_MTRACE 0x1f /* mcast traceroute messages */ +#define IGMP_PIM 0x14 /* PIMv1 message (historic) */ +#define IGMP_v2_HOST_MEMBERSHIP_REPORT 0x16 /* Ver. 2 membership report */ +#define IGMP_HOST_LEAVE_MESSAGE 0x17 /* Leave-group message */ +#define IGMP_MTRACE_REPLY 0x1e /* mtrace(8) reply */ +#define IGMP_MTRACE_QUERY 0x1f /* mtrace(8) probe */ +#define IGMP_v3_HOST_MEMBERSHIP_REPORT 0x22 /* Ver. 3 membership report */ -#define IGMP_MAX_HOST_REPORT_DELAY 10 /* max delay for response to */ - /* query (in seconds) according */ - /* to RFC1112 */ +/* + * IGMPv3 report modes. + */ +#define IGMP_DO_NOTHING 0 /* don't send a record */ +#define IGMP_MODE_IS_INCLUDE 1 /* MODE_IN */ +#define IGMP_MODE_IS_EXCLUDE 2 /* MODE_EX */ +#define IGMP_CHANGE_TO_INCLUDE_MODE 3 /* TO_IN */ +#define IGMP_CHANGE_TO_EXCLUDE_MODE 4 /* TO_EX */ +#define IGMP_ALLOW_NEW_SOURCES 5 /* ALLOW_NEW */ +#define IGMP_BLOCK_OLD_SOURCES 6 /* BLOCK_OLD */ +/* + * IGMPv3 query types. + */ +#define IGMP_V3_GENERAL_QUERY 1 +#define IGMP_V3_GROUP_QUERY 2 +#define IGMP_V3_GROUP_SOURCE_QUERY 3 -#define IGMP_TIMER_SCALE 10 /* denotes that the igmp code field */ - /* specifies time in 10th of seconds*/ +/* + * Maximum report interval for IGMP v1/v2 host membership reports [RFC 1112] + */ +#define IGMP_V1V2_MAX_RI 10 +#define IGMP_MAX_HOST_REPORT_DELAY IGMP_V1V2_MAX_RI /* - * The following four defininitions are for backwards compatibility. 
- * They should be removed as soon as all applications are updated to - * use the new constant names. + * IGMP_TIMER_SCALE denotes that the igmp code field specifies + * time in tenths of a second. */ -#define IGMP_HOST_MEMBERSHIP_QUERY IGMP_MEMBERSHIP_QUERY -#define IGMP_HOST_MEMBERSHIP_REPORT IGMP_V1_MEMBERSHIP_REPORT -#define IGMP_HOST_NEW_MEMBERSHIP_REPORT IGMP_V2_MEMBERSHIP_REPORT -#define IGMP_HOST_LEAVE_MESSAGE IGMP_V2_LEAVE_GROUP +#define IGMP_TIMER_SCALE 10 #endif /* _NETINET_IGMP_H_ */ diff --git a/bsd/netinet/igmp_var.h b/bsd/netinet/igmp_var.h index 5e9f7e983..8fdaab868 100644 --- a/bsd/netinet/igmp_var.h +++ b/bsd/netinet/igmp_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -68,7 +68,6 @@ #define _NETINET_IGMP_VAR_H_ #include <sys/appleapiopts.h> - /* * Internet Group Management Protocol (IGMP), * implementation-specific definitions. @@ -78,6 +77,48 @@ * MULTICAST Revision: 3.5.1.3 */ +struct igmpstat_v3 { + /* + * Structure header (to insulate ABI changes). + */ + uint32_t igps_version; /* version of this structure */ + uint32_t igps_len; /* length of this structure */ + /* + * Message statistics. + */ + uint64_t igps_rcv_total; /* total IGMP messages received */ + uint64_t igps_rcv_tooshort; /* received with too few bytes */ + uint64_t igps_rcv_badttl; /* received with ttl other than 1 */ + uint64_t igps_rcv_badsum; /* received with bad checksum */ + /* + * Query statistics. + */ + uint64_t igps_rcv_v1v2_queries; /* received IGMPv1/IGMPv2 queries */ + uint64_t igps_rcv_v3_queries; /* received IGMPv3 queries */ + uint64_t igps_rcv_badqueries; /* received invalid queries */ + uint64_t igps_rcv_gen_queries; /* received general queries */ + uint64_t igps_rcv_group_queries;/* received group queries */ + uint64_t igps_rcv_gsr_queries; /* received group-source queries */ + uint64_t igps_drop_gsr_queries; /* dropped group-source queries */ + /* + * Report statistics. + */ + uint64_t igps_rcv_reports; /* received membership reports */ + uint64_t igps_rcv_badreports; /* received invalid reports */ + uint64_t igps_rcv_ourreports; /* received reports for our groups */ + uint64_t igps_rcv_nora; /* received w/o Router Alert option */ + uint64_t igps_snd_reports; /* sent membership reports */ + /* + * Padding for future additions. + */ + uint64_t __igps_pad[4]; +} __attribute__((aligned(8))); + +/* + * Old IGMPv2 stat structure for backward compatibility + * + */ + struct igmpstat { u_int igps_rcv_total; /* total IGMP messages received */ u_int igps_rcv_tooshort; /* received with too few bytes */ @@ -90,41 +131,189 @@ struct igmpstat { u_int igps_snd_reports; /* sent membership reports */ }; -#ifdef KERNEL_PRIVATE -#ifdef KERNEL +#define IGPS_VERSION_3 3 +#define IGPS_VERSION3_LEN 168 + +#ifdef PRIVATE +/* + * Per-interface IGMP router version information. 
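+ * (The igi_* fields mirror the timers and RFC 3376 protocol variables
+ * of the kernel's struct igmp_ifinfo, declared later in this header
+ * for XNU_KERNEL_PRIVATE consumers.)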
+ */ +#ifndef XNU_KERNEL_PRIVATE +struct igmp_ifinfo { +#else +struct igmp_ifinfo_u { +#endif /* XNU_KERNEL_PRIVATE */ + uint32_t igi_ifindex; /* interface this instance belongs to */ + uint32_t igi_version; /* IGMPv3 Host Compatibility Mode */ + uint32_t igi_v1_timer; /* IGMPv1 Querier Present timer (s) */ + uint32_t igi_v2_timer; /* IGMPv2 Querier Present timer (s) */ + uint32_t igi_v3_timer; /* IGMPv3 General Query (interface) timer (s)*/ + uint32_t igi_flags; /* IGMP per-interface flags */ + uint32_t igi_rv; /* IGMPv3 Robustness Variable */ + uint32_t igi_qi; /* IGMPv3 Query Interval (s) */ + uint32_t igi_qri; /* IGMPv3 Query Response Interval (s) */ + uint32_t igi_uri; /* IGMPv3 Unsolicited Report Interval (s) */ +}; + +#define IGIF_SILENT 0x00000001 /* Do not use IGMP on this ifp */ +#define IGIF_LOOPBACK 0x00000002 /* Send IGMP reports to loopback */ + +/* + * IGMP version tag. + */ +#define IGMP_VERSION_NONE 0 /* Invalid */ +#define IGMP_VERSION_1 1 +#define IGMP_VERSION_2 2 +#define IGMP_VERSION_3 3 /* Default */ +#endif /* PRIVATE */ + +#ifdef XNU_KERNEL_PRIVATE +#include <libkern/libkern.h> +#define IGMP_DEBUG 1 +#ifdef IGMP_DEBUG +extern char * inet_ntoa(struct in_addr); +extern int igmp_debug; + +#define IGMP_PRINTF(x) do { if (igmp_debug) printf x; } while (0) +#else +#define IGMP_PRINTF(x) +#endif + +#define OIGMPSTAT_ADD(name, val) atomic_add_32(&igmpstat.name , (val)) +#define OIGMPSTAT_INC(name) OIGMPSTAT_ADD(name, 1) + +#define IGMPSTAT_ADD(name, val) atomic_add_64(&igmpstat_v3.name , (val)) +#define IGMPSTAT_INC(name) IGMPSTAT_ADD(name, 1) + #define IGMP_RANDOM_DELAY(X) (random() % (X) + 1) +#define IGMP_MAX_STATE_CHANGES 24 /* Max pending changes per group */ + /* - * States for IGMPv2's leave processing + * IGMP per-group states. */ -#define IGMP_OTHERMEMBER 0 -#define IGMP_IREPORTEDLAST 1 +#define IGMP_NOT_MEMBER 0 /* Can garbage collect in_multi */ +#define IGMP_SILENT_MEMBER 1 /* Do not perform IGMP for group */ +#define IGMP_REPORTING_MEMBER 2 /* IGMPv1/2/3 we are reporter */ +#define IGMP_IDLE_MEMBER 3 /* IGMPv1/2 we reported last */ +#define IGMP_LAZY_MEMBER 4 /* IGMPv1/2 other member reporting */ +#define IGMP_SLEEPING_MEMBER 5 /* IGMPv1/2 start query response */ +#define IGMP_AWAKENING_MEMBER 6 /* IGMPv1/2 group timer will start */ +#define IGMP_G_QUERY_PENDING_MEMBER 7 /* IGMPv3 group query pending */ +#define IGMP_SG_QUERY_PENDING_MEMBER 8 /* IGMPv3 source query pending */ +#define IGMP_LEAVING_MEMBER 9 /* IGMPv3 dying gasp (pending last */ + /* retransmission of INCLUDE {}) */ +/* + * IGMPv3 protocol control variables. + */ +#define IGMP_RV_INIT 2 /* Robustness Variable */ +#define IGMP_RV_MIN 1 +#define IGMP_RV_MAX 7 + +#define IGMP_QI_INIT 125 /* Query Interval (s) */ +#define IGMP_QI_MIN 1 +#define IGMP_QI_MAX 255 + +#define IGMP_QRI_INIT 10 /* Query Response Interval (s) */ +#define IGMP_QRI_MIN 1 +#define IGMP_QRI_MAX 255 + +#define IGMP_URI_INIT 3 /* Unsolicited Report Interval (s) */ +#define IGMP_URI_MIN 0 +#define IGMP_URI_MAX 10 + +#define IGMP_MAX_G_GS_PACKETS 8 /* # of packets to answer G/GS */ +#define IGMP_MAX_STATE_CHANGE_PACKETS 8 /* # of packets per state change */ +#define IGMP_MAX_RESPONSE_PACKETS 16 /* # of packets for general query */ +#define IGMP_MAX_RESPONSE_BURST 4 /* # of responses to send at once */ +#define IGMP_RESPONSE_BURST_INTERVAL (PR_SLOWHZ) /* 500ms */ /* - * We must remember what version the subnet's querier is. - * We conveniently use the IGMP message type for the proper - * membership report to keep this state. 
+ * IGMP-specific mbuf flags. */ -#define IGMP_V1_ROUTER IGMP_V1_MEMBERSHIP_REPORT -#define IGMP_V2_ROUTER IGMP_V2_MEMBERSHIP_REPORT +#define M_IGMPV2 M_PROTO1 /* Packet is IGMPv2 */ +#define M_IGMPV3_HDR M_PROTO2 /* Packet has IGMPv3 headers */ +#define M_GROUPREC M_PROTO3 /* mbuf chain is a group record */ +#define M_IGMP_LOOP M_LOOP /* transmit on loif, not real ifp */ + +/* + * Default amount of leading space for IGMPv3 to allocate at the + * beginning of its mbuf packet chains, to avoid fragmentation and + * unnecessary allocation of leading mbufs. + */ +#define RAOPT_LEN 4 /* Length of IP Router Alert option */ +#define IGMP_LEADINGSPACE \ + (sizeof(struct ip) + RAOPT_LEN + sizeof(struct igmp_report)) + +struct igmp_ifinfo { + decl_lck_mtx_data(, igi_lock); + uint32_t igi_refcnt; /* reference count */ + uint32_t igi_debug; /* see ifa_debug flags */ + LIST_ENTRY(igmp_ifinfo) igi_link; + struct ifnet *igi_ifp; /* interface this instance belongs to */ + uint32_t igi_version; /* IGMPv3 Host Compatibility Mode */ + uint32_t igi_v1_timer; /* IGMPv1 Querier Present timer (s) */ + uint32_t igi_v2_timer; /* IGMPv2 Querier Present timer (s) */ + uint32_t igi_v3_timer; /* IGMPv3 General Query (interface) timer (s)*/ + uint32_t igi_flags; /* IGMP per-interface flags */ + uint32_t igi_rv; /* IGMPv3 Robustness Variable */ + uint32_t igi_qi; /* IGMPv3 Query Interval (s) */ + uint32_t igi_qri; /* IGMPv3 Query Response Interval (s) */ + uint32_t igi_uri; /* IGMPv3 Unsolicited Report Interval (s) */ + SLIST_HEAD(,in_multi) igi_relinmhead; /* released groups */ + struct ifqueue igi_gq; /* queue of general query responses */ + struct ifqueue igi_v2q; /* queue of v1/v2 packets */ +}; + +#define IGI_LOCK_ASSERT_HELD(_igi) \ + lck_mtx_assert(&(_igi)->igi_lock, LCK_MTX_ASSERT_OWNED) + +#define IGI_LOCK_ASSERT_NOTHELD(_igi) \ + lck_mtx_assert(&(_igi)->igi_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define IGI_LOCK(_igi) \ + lck_mtx_lock(&(_igi)->igi_lock) + +#define IGI_LOCK_SPIN(_igi) \ + lck_mtx_lock_spin(&(_igi)->igi_lock) + +#define IGI_CONVERT_LOCK(_igi) do { \ + IGI_LOCK_ASSERT_HELD(_igi); \ + lck_mtx_convert_spin(&(_igi)->igi_lock); \ +} while (0) + +#define IGI_UNLOCK(_igi) \ + lck_mtx_unlock(&(_igi)->igi_lock) + +#define IGI_ADDREF(_igi) \ + igi_addref(_igi, 0) + +#define IGI_ADDREF_LOCKED(_igi) \ + igi_addref(_igi, 1) + +#define IGI_REMREF(_igi) \ + igi_remref(_igi) /* - * Revert to new router if we haven't heard from an old router in - * this amount of time. + * Per-link IGMP context. 
*/ -#define IGMP_AGE_THRESHOLD 540 +#define IGMP_IFINFO(ifp) ((ifp)->if_igi) -void igmp_init(void) __attribute__((section("__TEXT, initcode"))); -void igmp_input(struct mbuf *, int); -int igmp_joingroup(struct in_multi *); -void igmp_leavegroup(struct in_multi *); -void igmp_fasttimo(void); -void igmp_slowtimo(void); +extern void igmp_init(void) __attribute__((section("__TEXT, initcode"))); +extern int igmp_change_state(struct in_multi *); +extern struct igmp_ifinfo *igmp_domifattach(struct ifnet *, int); +extern void igmp_domifreattach(struct igmp_ifinfo *); +extern void igmp_domifdetach(struct ifnet *); +extern void igmp_input(struct mbuf *, int); +extern int igmp_joingroup(struct in_multi *); +extern void igmp_leavegroup(struct in_multi *); +extern void igmp_slowtimo(void); +extern void igi_addref(struct igmp_ifinfo *, int); +extern void igi_remref(struct igmp_ifinfo *); SYSCTL_DECL(_net_inet_igmp); -#endif /* KERNEL */ -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ /* * Names for IGMP sysctl objects @@ -132,11 +321,11 @@ SYSCTL_DECL(_net_inet_igmp); #define IGMPCTL_STATS 1 /* statistics (read-only) */ #define IGMPCTL_MAXID 2 -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #define IGMPCTL_NAMES { \ { 0, 0 }, \ { "stats", CTLTYPE_STRUCT }, \ } -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #endif diff --git a/bsd/netinet/in.c b/bsd/netinet/in.c index 32b8c64f6..85b9d38af 100644 --- a/bsd/netinet/in.c +++ b/bsd/netinet/in.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -72,6 +72,7 @@ #include <sys/sysctl.h> #include <sys/kern_event.h> #include <sys/syslog.h> +#include <sys/mcache.h> #include <kern/zalloc.h> #include <pexpert/pexpert.h> @@ -122,42 +123,66 @@ static void in_iahash_remove(struct in_ifaddr *); static void in_iahash_insert(struct in_ifaddr *); static void in_iahash_insert_ptp(struct in_ifaddr *); static struct in_ifaddr *in_ifaddr_alloc(int); +static void in_ifaddr_attached(struct ifaddr *); +static void in_ifaddr_detached(struct ifaddr *); static void in_ifaddr_free(struct ifaddr *); static void in_ifaddr_trace(struct ifaddr *, int); static int subnetsarelocal = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, subnets_are_local, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, subnets_are_local, CTLFLAG_RW | CTLFLAG_LOCKED, &subnetsarelocal, 0, ""); -struct in_multihead in_multihead; /* XXX BSS initialization */ - /* Track whether or not the SIOCARPIPLL ioctl has been called */ __private_extern__ u_int32_t ipv4_ll_arp_aware = 0; +#define INIFA_TRACE_HIST_SIZE 32 /* size of trace history */ + +/* For gdb */ +__private_extern__ unsigned int inifa_trace_hist_size = INIFA_TRACE_HIST_SIZE; + struct in_ifaddr_dbg { struct in_ifaddr inifa; /* in_ifaddr */ struct in_ifaddr inifa_old; /* saved in_ifaddr */ - u_int16_t inifa_refhold_cnt; /* # of ifaref */ - u_int16_t inifa_refrele_cnt; /* # of ifafree */ + u_int16_t inifa_refhold_cnt; /* # of IFA_ADDREF */ + u_int16_t inifa_refrele_cnt; /* # of IFA_REMREF */ /* * Alloc and free callers. */ ctrace_t inifa_alloc; ctrace_t inifa_free; /* - * Circular lists of ifaref and ifafree callers. + * Circular lists of IFA_ADDREF and IFA_REMREF callers. 
*/ - ctrace_t inifa_refhold[CTRACE_HIST_SIZE]; - ctrace_t inifa_refrele[CTRACE_HIST_SIZE]; + ctrace_t inifa_refhold[INIFA_TRACE_HIST_SIZE]; + ctrace_t inifa_refrele[INIFA_TRACE_HIST_SIZE]; + /* + * Trash list linkage + */ + TAILQ_ENTRY(in_ifaddr_dbg) inifa_trash_link; }; -static unsigned int inifa_debug; /* debug flags */ +/* List of trash in_ifaddr entries protected by inifa_trash_lock */ +static TAILQ_HEAD(, in_ifaddr_dbg) inifa_trash_head; +static decl_lck_mtx_data(, inifa_trash_lock); + +#if DEBUG +static unsigned int inifa_debug = 1; /* debugging (enabled) */ +#else +static unsigned int inifa_debug; /* debugging (disabled) */ +#endif /* !DEBUG */ static unsigned int inifa_size; /* size of zone element */ static struct zone *inifa_zone; /* zone for in_ifaddr */ #define INIFA_ZONE_MAX 64 /* maximum elements in zone */ #define INIFA_ZONE_NAME "in_ifaddr" /* zone name */ +/* + * Return 1 if the address is + * - loopback + * - unicast or multicast link local + * - routed via a link level gateway + * - belongs to a directly connected (sub)net + */ int inaddr_local(struct in_addr in) { @@ -165,20 +190,27 @@ inaddr_local(struct in_addr in) struct sockaddr_in sin; int local = 0; - sin.sin_family = AF_INET; - sin.sin_len = sizeof (sin); - sin.sin_addr = in; - rt = rtalloc1((struct sockaddr *)&sin, 0, 0); - - if (rt != NULL) { - RT_LOCK_SPIN(rt); - if (rt->rt_gateway->sa_family == AF_LINK || - (rt->rt_ifp->if_flags & IFF_LOOPBACK)) + if (ntohl(in.s_addr) == INADDR_LOOPBACK || IN_LINKLOCAL(ntohl(in.s_addr))) { + local = 1; + } else if (ntohl(in.s_addr) >= INADDR_UNSPEC_GROUP && + ntohl(in.s_addr) <= INADDR_MAX_LOCAL_GROUP) { local = 1; - RT_UNLOCK(rt); - rtfree(rt); } else { - local = in_localaddr(in); + sin.sin_family = AF_INET; + sin.sin_len = sizeof (sin); + sin.sin_addr = in; + rt = rtalloc1((struct sockaddr *)&sin, 0, 0); + + if (rt != NULL) { + RT_LOCK_SPIN(rt); + if (rt->rt_gateway->sa_family == AF_LINK || + (rt->rt_ifp->if_flags & IFF_LOOPBACK)) + local = 1; + RT_UNLOCK(rt); + rtfree(rt); + } else { + local = in_localaddr(in); + } } return (local); } @@ -198,20 +230,28 @@ in_localaddr(struct in_addr in) if (subnetsarelocal) { lck_rw_lock_shared(in_ifaddr_rwlock); for (ia = in_ifaddrhead.tqh_first; ia; - ia = ia->ia_link.tqe_next) + ia = ia->ia_link.tqe_next) { + IFA_LOCK(&ia->ia_ifa); if ((i & ia->ia_netmask) == ia->ia_net) { + IFA_UNLOCK(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); return (1); } + IFA_UNLOCK(&ia->ia_ifa); + } lck_rw_done(in_ifaddr_rwlock); } else { lck_rw_lock_shared(in_ifaddr_rwlock); for (ia = in_ifaddrhead.tqh_first; ia; - ia = ia->ia_link.tqe_next) + ia = ia->ia_link.tqe_next) { + IFA_LOCK(&ia->ia_ifa); if ((i & ia->ia_subnetmask) == ia->ia_subnet) { + IFA_UNLOCK(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); return (1); } + IFA_UNLOCK(&ia->ia_ifa); + } lck_rw_done(in_ifaddr_rwlock); } return (0); @@ -292,6 +332,18 @@ in_len2mask(struct in_addr *mask, int len) static int in_interfaces; /* number of external internet interfaces */ +static int +in_domifattach(struct ifnet *ifp) +{ + int error; + + if ((error = proto_plumb(PF_INET, ifp)) && error != EEXIST) + log(LOG_ERR, "%s: proto_plumb returned %d if=%s%d\n", + __func__, error, ifp->if_name, ifp->if_unit); + + return (error); +} + /* * Generic internet control operations (ioctl's). * Ifp is 0 if not an interface-specific ioctl. 
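(Editor's aside: the reworked inaddr_local() above now short-circuits the common
cases before falling back to a route lookup. A minimal standalone sketch of that
fast path, assuming only the <netinet/in.h> constants this patch touches;
illustrative, not part of the change:)

	#include <sys/types.h>
	#include <arpa/inet.h>		/* ntohl() in userland */
	#include <netinet/in.h>

	static int
	inaddr_local_fastpath(struct in_addr in)
	{
		u_int32_t h = ntohl(in.s_addr);	/* work in host byte order */

		if (h == INADDR_LOOPBACK || IN_LINKLOCAL(h))
			return (1);	/* 127.0.0.1, or 169.254/16 link-local */
		if (h >= INADDR_UNSPEC_GROUP && h <= INADDR_MAX_LOCAL_GROUP)
			return (1);	/* 224.0.0.0-224.0.0.255 local multicast */
		return (0);		/* caller falls back to a route lookup */
	}

Addresses that fail these tests still take the original rtalloc1()/in_localaddr()
path unchanged.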
@@ -331,6 +383,8 @@ in_control( struct kev_msg ev_msg; struct kev_in_data in_event_data; + bzero(&in_event_data, sizeof(struct kev_in_data)); + bzero(&ev_msg, sizeof(struct kev_msg)); switch (cmd) { case SIOCALIFADDR: case SIOCDLIFADDR: @@ -354,19 +408,24 @@ in_control( for (iap = in_ifaddrhead.tqh_first; iap; iap = iap->ia_link.tqe_next) if (iap->ia_ifp == ifp) { + IFA_LOCK(&iap->ia_ifa); if (((struct sockaddr_in *)&ifr->ifr_addr)->sin_addr.s_addr == iap->ia_addr.sin_addr.s_addr) { ia = iap; + IFA_UNLOCK(&iap->ia_ifa); break; } else if (ia == NULL) { ia = iap; - if (ifr->ifr_addr.sa_family != AF_INET) + if (ifr->ifr_addr.sa_family != AF_INET) { + IFA_UNLOCK(&iap->ia_ifa); break; + } } + IFA_UNLOCK(&iap->ia_ifa); } /* take a reference on ia before releasing lock */ if (ia != NULL) { - ifaref(&ia->ia_ifa); + IFA_ADDREF(&ia->ia_ifa); } lck_rw_done(in_ifaddr_rwlock); } @@ -393,19 +452,19 @@ in_control( lck_rw_lock_shared(in_ifaddr_rwlock); for (oia = ia; ia; ia = ia->ia_link.tqe_next) { + IFA_LOCK(&ia->ia_ifa); if (ia->ia_ifp == ifp && ia->ia_addr.sin_addr.s_addr == - ifra->ifra_addr.sin_addr.s_addr) + ifra->ifra_addr.sin_addr.s_addr) { + IFA_ADDREF_LOCKED(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); break; - } - /* take a reference on ia before releasing lock */ - if (ia != NULL && ia != oia) { - ifaref(&ia->ia_ifa); + } + IFA_UNLOCK(&ia->ia_ifa); } lck_rw_done(in_ifaddr_rwlock); - if (oia != NULL && oia != ia) { - ifafree(&oia->ia_ifa); - } + if (oia != NULL) + IFA_REMREF(&oia->ia_ifa); if ((ifp->if_flags & IFF_POINTOPOINT) && (cmd == SIOCAIFADDR) && (ifra->ifra_dstaddr.sin_addr.s_addr @@ -426,7 +485,13 @@ in_control( case SIOCSIFADDR: case SIOCSIFNETMASK: case SIOCSIFDSTADDR: - if ((so->so_state & SS_PRIV) == 0) { + /* socket is NULL if called from in_purgeaddrs() */ + if (so != NULL && (so->so_state & SS_PRIV) == 0) { + error = EPERM; + goto done; + } + /* in case it's NULL, make sure it came from the kernel */ + if (so == NULL && p != kernproc) { error = EPERM; goto done; } @@ -439,21 +504,22 @@ in_control( error = EINVAL; goto done; } - if (ia == (struct in_ifaddr *)0) { + if (ia == NULL) { ia = in_ifaddr_alloc(M_WAITOK); - if (ia == (struct in_ifaddr *)NULL) { + if (ia == NULL) { error = ENOBUFS; goto done; } - IA_HASH_INIT(ia); + ifnet_lock_exclusive(ifp); ifa = &ia->ia_ifa; + IFA_LOCK(ifa); /* Hold a reference for this routine */ - ifaref(ifa); + IFA_ADDREF_LOCKED(ifa); + IA_HASH_INIT(ia); ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr; ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask; ia->ia_sockmask.sin_len = 8; - ifnet_lock_exclusive(ifp); if (ifp->if_flags & IFF_BROADCAST) { ia->ia_broadaddr.sin_len = sizeof(ia->ia_addr); ia->ia_broadaddr.sin_family = AF_INET; @@ -463,22 +529,25 @@ in_control( in_interfaces++; /* if_attach_ifa() holds a reference for ifa_link */ if_attach_ifa(ifp, ifa); + /* + * If we have to go through in_ifinit(), make sure + * to avoid installing route(s) based on this address + * via PFC_IFUP event, before the link resolver (ARP) + * initializes it. 
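+			 * (in_ifinit() clears IFD_NOTREADY again once
+			 * ifnet_ioctl() has given the link resolver a
+			 * chance to initialize the address.)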
+ */ + if (cmd == SIOCAIFADDR || cmd == SIOCSIFADDR) + ifa->ifa_debug |= IFD_NOTREADY; + IFA_UNLOCK(ifa); ifnet_lock_done(ifp); lck_rw_lock_exclusive(in_ifaddr_rwlock); /* Hold a reference for ia_link */ - ifaref(ifa); + IFA_ADDREF(ifa); TAILQ_INSERT_TAIL(&in_ifaddrhead, ia, ia_link); lck_rw_done(in_ifaddr_rwlock); - - /* Generic protocol plumbing */ - - if ((error = proto_plumb(PF_INET, ifp))) { - if (error != EEXIST) { - kprintf("in.c: warning can't plumb proto if=%s%d type %d error=%d\n", - ifp->if_name, ifp->if_unit, ifp->if_type, error); - } - error = 0; /*discard error, can be cold with unsupported interfaces */ - } + error = in_domifattach(ifp); + /* discard error,can be cold with unsupported interfaces */ + if (error) + error = 0; } break; @@ -531,7 +600,9 @@ in_control( break; case SIOCGIFADDR: + IFA_LOCK(&ia->ia_ifa); *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_addr; + IFA_UNLOCK(&ia->ia_ifa); break; case SIOCGIFBRDADDR: @@ -539,7 +610,9 @@ in_control( error = EINVAL; break; } + IFA_LOCK(&ia->ia_ifa); *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_broadaddr; + IFA_UNLOCK(&ia->ia_ifa); break; case SIOCGIFDSTADDR: @@ -547,11 +620,15 @@ in_control( error = EINVAL; break; } + IFA_LOCK(&ia->ia_ifa); *((struct sockaddr_in *)&ifr->ifr_dstaddr) = ia->ia_dstaddr; + IFA_UNLOCK(&ia->ia_ifa); break; case SIOCGIFNETMASK: + IFA_LOCK(&ia->ia_ifa); *((struct sockaddr_in *)&ifr->ifr_addr) = ia->ia_sockmask; + IFA_UNLOCK(&ia->ia_ifa); break; case SIOCSIFDSTADDR: @@ -559,23 +636,28 @@ in_control( error = EINVAL; break; } + IFA_LOCK(&ia->ia_ifa); oldaddr = ia->ia_dstaddr; ia->ia_dstaddr = *(struct sockaddr_in *)&ifr->ifr_dstaddr; if (ia->ia_dstaddr.sin_family == AF_INET) ia->ia_dstaddr.sin_len = sizeof (struct sockaddr_in); + IFA_UNLOCK(&ia->ia_ifa); error = ifnet_ioctl(ifp, PF_INET, SIOCSIFDSTADDR, ia); + IFA_LOCK(&ia->ia_ifa); if (error == EOPNOTSUPP) { error = 0; } if (error) { ia->ia_dstaddr = oldaddr; + IFA_UNLOCK(&ia->ia_ifa); break; } + IFA_LOCK_ASSERT_HELD(&ia->ia_ifa); ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_NETWORK_CLASS; ev_msg.kev_subclass = KEV_INET_SUBCLASS; - + ev_msg.event_code = KEV_INET_SIFDSTADDR; if (ia->ia_ifa.ifa_dstaddr) @@ -590,6 +672,7 @@ in_control( in_event_data.ia_subnet = ia->ia_subnet; in_event_data.ia_subnetmask = ia->ia_subnetmask; in_event_data.ia_netbroadcast = ia->ia_netbroadcast; + IFA_UNLOCK(&ia->ia_ifa); strncpy(&in_event_data.link_data.if_name[0], ifp->if_name, IFNAMSIZ); in_event_data.link_data.if_family = ifp->if_family; in_event_data.link_data.if_unit = (u_int32_t) ifp->if_unit; @@ -600,14 +683,22 @@ in_control( kev_post_msg(&ev_msg); - + lck_mtx_lock(rnh_lock); + IFA_LOCK(&ia->ia_ifa); if (ia->ia_flags & IFA_ROUTE) { ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&oldaddr; - rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST); + IFA_UNLOCK(&ia->ia_ifa); + rtinit_locked(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST); + IFA_LOCK(&ia->ia_ifa); ia->ia_ifa.ifa_dstaddr = - (struct sockaddr *)&ia->ia_dstaddr; - rtinit(&(ia->ia_ifa), (int)RTM_ADD, RTF_HOST|RTF_UP); + (struct sockaddr *)&ia->ia_dstaddr; + IFA_UNLOCK(&ia->ia_ifa); + rtinit_locked(&(ia->ia_ifa), (int)RTM_ADD, + RTF_HOST|RTF_UP); + } else { + IFA_UNLOCK(&ia->ia_ifa); } + lck_mtx_unlock(rnh_lock); break; case SIOCSIFBRDADDR: @@ -615,12 +706,13 @@ in_control( error = EINVAL; break; } + IFA_LOCK(&ia->ia_ifa); ia->ia_broadaddr = *(struct sockaddr_in *)&ifr->ifr_broadaddr; ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_NETWORK_CLASS; ev_msg.kev_subclass = 
KEV_INET_SUBCLASS; - + ev_msg.event_code = KEV_INET_SIFBRDADDR; if (ia->ia_ifa.ifa_dstaddr) @@ -635,6 +727,7 @@ in_control( in_event_data.ia_subnet = ia->ia_subnet; in_event_data.ia_subnetmask = ia->ia_subnetmask; in_event_data.ia_netbroadcast = ia->ia_netbroadcast; + IFA_UNLOCK(&ia->ia_ifa); strncpy(&in_event_data.link_data.if_name[0], ifp->if_name, IFNAMSIZ); in_event_data.link_data.if_family = ifp->if_family; in_event_data.link_data.if_unit = (u_int32_t) ifp->if_unit; @@ -661,34 +754,41 @@ in_control( break; case SIOCPROTOATTACH: - error = proto_plumb(PF_INET, ifp); + error = in_domifattach(ifp); break; - + case SIOCPROTODETACH: - // if an ip address is still present, refuse to detach + /* + * If an IPv4 address is still present, refuse to detach. + */ ifnet_lock_shared(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) - if (ifa->ifa_addr->sa_family == AF_INET) + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family == AF_INET) { + IFA_UNLOCK(ifa); break; + } + IFA_UNLOCK(ifa); + } ifnet_lock_done(ifp); - if (ifa != 0) { + if (ifa != NULL) { error = EBUSY; break; } error = proto_unplumb(PF_INET, ifp); break; - case SIOCSIFNETMASK: { u_long i; - + i = ifra->ifra_addr.sin_addr.s_addr; + IFA_LOCK(&ia->ia_ifa); ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr = i); ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_NETWORK_CLASS; ev_msg.kev_subclass = KEV_INET_SUBCLASS; - + ev_msg.event_code = KEV_INET_SIFNETMASK; if (ia->ia_ifa.ifa_dstaddr) @@ -703,6 +803,7 @@ in_control( in_event_data.ia_subnet = ia->ia_subnet; in_event_data.ia_subnetmask = ia->ia_subnetmask; in_event_data.ia_netbroadcast = ia->ia_netbroadcast; + IFA_UNLOCK(&ia->ia_ifa); strncpy(&in_event_data.link_data.if_name[0], ifp->if_name, IFNAMSIZ); in_event_data.link_data.if_family = ifp->if_family; in_event_data.link_data.if_unit = (u_int32_t) ifp->if_unit; @@ -720,6 +821,7 @@ in_control( hostIsNew = 1; error = 0; + IFA_LOCK(&ia->ia_ifa); if (ia->ia_addr.sin_family == AF_INET) { if (ifra->ifra_addr.sin_len == 0) { ifra->ifra_addr = ia->ia_addr; @@ -729,7 +831,9 @@ in_control( hostIsNew = 0; } if (ifra->ifra_mask.sin_len) { + IFA_UNLOCK(&ia->ia_ifa); in_ifscrub(ifp, ia, 0); + IFA_LOCK(&ia->ia_ifa); ia->ia_sockmask = ifra->ifra_mask; ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr); @@ -737,19 +841,25 @@ in_control( } if ((ifp->if_flags & IFF_POINTOPOINT) && (ifra->ifra_dstaddr.sin_family == AF_INET)) { + IFA_UNLOCK(&ia->ia_ifa); in_ifscrub(ifp, ia, 0); + IFA_LOCK(&ia->ia_ifa); ia->ia_dstaddr = ifra->ifra_dstaddr; ia->ia_dstaddr.sin_len = sizeof (struct sockaddr_in); maskIsNew = 1; /* We lie; but the effect's the same */ } if (ifra->ifra_addr.sin_family == AF_INET && (hostIsNew || maskIsNew)) { + IFA_UNLOCK(&ia->ia_ifa); error = in_ifinit(ifp, ia, &ifra->ifra_addr, 0); + } else { + IFA_UNLOCK(&ia->ia_ifa); } #if PF if (!error) (void) pf_ifaddr_hook(ifp, cmd); #endif /* PF */ + IFA_LOCK(&ia->ia_ifa); if ((ifp->if_flags & IFF_BROADCAST) && (ifra->ifra_broadaddr.sin_family == AF_INET)) ia->ia_broadaddr = ifra->ifra_broadaddr; @@ -780,6 +890,7 @@ in_control( in_event_data.ia_subnet = ia->ia_subnet; in_event_data.ia_subnetmask = ia->ia_subnetmask; in_event_data.ia_netbroadcast = ia->ia_netbroadcast; + IFA_UNLOCK(&ia->ia_ifa); strncpy(&in_event_data.link_data.if_name[0], ifp->if_name, IFNAMSIZ); in_event_data.link_data.if_family = ifp->if_family; in_event_data.link_data.if_unit = (u_int32_t) ifp->if_unit; @@ -789,6 +900,8 @@ in_control( 
ev_msg.dv[1].data_length = 0; kev_post_msg(&ev_msg); + } else { + IFA_UNLOCK(&ia->ia_ifa); } break; @@ -804,9 +917,10 @@ in_control( ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_NETWORK_CLASS; ev_msg.kev_subclass = KEV_INET_SUBCLASS; - + ev_msg.event_code = KEV_INET_ADDR_DELETED; + IFA_LOCK(&ia->ia_ifa); if (ia->ia_ifa.ifa_dstaddr) in_event_data.ia_dstaddr = ((struct sockaddr_in *)ia->ia_ifa.ifa_dstaddr)->sin_addr; @@ -819,6 +933,7 @@ in_control( in_event_data.ia_subnet = ia->ia_subnet; in_event_data.ia_subnetmask = ia->ia_subnetmask; in_event_data.ia_netbroadcast = ia->ia_netbroadcast; + IFA_UNLOCK(&ia->ia_ifa); strncpy(&in_event_data.link_data.if_name[0], ifp->if_name, IFNAMSIZ); in_event_data.link_data.if_family = ifp->if_family; in_event_data.link_data.if_unit = (u_int32_t) ifp->if_unit; @@ -830,10 +945,12 @@ in_control( ifa = &ia->ia_ifa; lck_rw_lock_exclusive(in_ifaddr_rwlock); /* Release ia_link reference */ - ifafree(ifa); + IFA_REMREF(ifa); TAILQ_REMOVE(&in_ifaddrhead, ia, ia_link); + IFA_LOCK(ifa); if (IA_IS_HASHED(ia)) in_iahash_remove(ia); + IFA_UNLOCK(ifa); lck_rw_done(in_ifaddr_rwlock); /* @@ -841,31 +958,42 @@ in_control( */ in_ifscrub(ifp, ia, 0); ifnet_lock_exclusive(ifp); + IFA_LOCK(ifa); /* if_detach_ifa() releases ifa_link reference */ if_detach_ifa(ifp, ifa); -#ifdef __APPLE__ + /* Our reference to this address is dropped at the bottom */ + IFA_UNLOCK(ifa); + /* * If the interface supports multicast, and no address is left, * remove the "all hosts" multicast group from that interface. */ - if (ifp->if_flags & IFF_MULTICAST) { - struct in_addr addr; - struct in_multi *inm = NULL; + if ((ifp->if_flags & IFF_MULTICAST) != 0 || + ifp->if_allhostsinm != NULL ) { - TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) - if (ifa->ifa_addr->sa_family == AF_INET) + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family == AF_INET) { + IFA_UNLOCK(ifa); break; - - if (ifa == 0) { - addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); - IN_LOOKUP_MULTI(addr, ifp, inm); + } + IFA_UNLOCK(ifa); } ifnet_lock_done(ifp); - if (inm) - in_delmulti(&inm); - } else + + lck_mtx_lock(&ifp->if_addrconfig_lock); + if (ifa == NULL && ifp->if_allhostsinm != NULL) { + struct in_multi *inm = ifp->if_allhostsinm; + ifp->if_allhostsinm = NULL; + + in_delmulti(inm); + /* release the reference for allhostsinm pointer */ + INM_REMREF(inm); + } + lck_mtx_unlock(&ifp->if_addrconfig_lock); + } else { ifnet_lock_done(ifp); -#endif + } /* Post the kernel event */ kev_post_msg(&ev_msg); @@ -881,7 +1009,7 @@ in_control( error = 0; /* Release reference from ifa_ifpgetprimary() */ - ifafree(ifa); + IFA_REMREF(ifa); } #if PF (void) pf_ifaddr_hook(ifp, cmd); @@ -933,7 +1061,6 @@ in_control( /* Multicast options */ if (cloned_inp->inp_moptions != NULL) { - int i; struct ip_moptions *cloned_imo = cloned_inp->inp_moptions; struct ip_moptions *imo = inp->inp_moptions; @@ -942,35 +1069,15 @@ in_control( * No multicast option buffer attached to the pcb; * allocate one. 
*/ - imo = (struct ip_moptions*) - _MALLOC(sizeof(*imo), M_IPMOPTS, M_WAITOK); + imo = ip_allocmoptions(M_WAITOK); if (imo == NULL) { error2 = ENOBUFS; break; } inp->inp_moptions = imo; } - imo->imo_multicast_ifp = cloned_imo->imo_multicast_ifp; - imo->imo_multicast_vif = cloned_imo->imo_multicast_vif; - imo->imo_multicast_ttl = cloned_imo->imo_multicast_ttl; - imo->imo_multicast_loop = cloned_imo->imo_multicast_loop; - imo->imo_num_memberships = cloned_imo->imo_num_memberships; - for (i = 0; i < cloned_imo->imo_num_memberships; i++) { - imo->imo_membership[i] = - in_addmulti(&cloned_imo->imo_membership[i]->inm_addr, - cloned_imo->imo_membership[i]->inm_ifp); - if (imo->imo_membership[i] == NULL) { - error2 = ENOBUFS; - break; - } - } - if (i < cloned_imo->imo_num_memberships) { - /* Failed, perform cleanup */ - for (i--; i >= 0; i--) - in_delmulti(&imo->imo_membership[i]); - imo->imo_num_memberships = 0; - break; - } + + error2 = imo_clone(cloned_imo, imo); } } break; @@ -982,7 +1089,7 @@ in_control( } done: if (ia != NULL) { - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); } return (error); } @@ -1017,7 +1124,7 @@ in_lifaddr_ioctl( /* sanity checks */ if (!data || !ifp) { panic("invalid argument to in_lifaddr_ioctl"); - /*NOTRECHED*/ + /*NOTREACHED*/ } switch (cmd) { @@ -1112,21 +1219,30 @@ in_lifaddr_ioctl( ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; - if (!cmp) + } + if (!cmp) { + IFA_UNLOCK(ifa); break; + } candidate.s_addr = ((struct sockaddr_in *)&ifa->ifa_addr)->sin_addr.s_addr; candidate.s_addr &= mask.s_addr; + IFA_UNLOCK(ifa); if (candidate.s_addr == match.s_addr) break; } + if (ifa != NULL) + IFA_ADDREF(ifa); ifnet_lock_done(ifp); if (!ifa) return EADDRNOTAVAIL; ia = (struct in_ifaddr *)ifa; if (cmd == SIOCGLIFADDR) { + IFA_LOCK(ifa); /* fill in the if_laddrreq structure */ bcopy(&ia->ia_addr, &iflr->addr, ia->ia_addr.sin_len); @@ -1141,6 +1257,8 @@ in_lifaddr_ioctl( iflr->flags = 0; /*XXX*/ + IFA_UNLOCK(ifa); + IFA_REMREF(ifa); return 0; } else { struct in_aliasreq ifra; @@ -1150,6 +1268,7 @@ in_lifaddr_ioctl( bcopy(iflr->iflr_name, ifra.ifra_name, sizeof(ifra.ifra_name)); + IFA_LOCK(ifa); bcopy(&ia->ia_addr, &ifra.ifra_addr, ia->ia_addr.sin_len); if ((ifp->if_flags & IFF_POINTOPOINT) != 0) { @@ -1158,7 +1277,8 @@ in_lifaddr_ioctl( } bcopy(&ia->ia_sockmask, &ifra.ifra_dstaddr, ia->ia_sockmask.sin_len); - + IFA_UNLOCK(ifa); + IFA_REMREF(ifa); return in_control(so, SIOCDIFADDR, (caddr_t)&ifra, ifp, p); } @@ -1172,21 +1292,23 @@ in_lifaddr_ioctl( * Delete any existing route for an interface. 
*/ void -in_ifscrub( - struct ifnet *ifp, - struct in_ifaddr *ia, - int locked) +in_ifscrub(struct ifnet *ifp, struct in_ifaddr *ia, int locked) { - - if ((ia->ia_flags & IFA_ROUTE) == 0) + IFA_LOCK(&ia->ia_ifa); + if ((ia->ia_flags & IFA_ROUTE) == 0) { + IFA_UNLOCK(&ia->ia_ifa); return; + } + IFA_UNLOCK(&ia->ia_ifa); if (!locked) lck_mtx_lock(rnh_lock); if (ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)) rtinit_locked(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST); else rtinit_locked(&(ia->ia_ifa), (int)RTM_DELETE, 0); + IFA_LOCK(&ia->ia_ifa); ia->ia_flags &= ~IFA_ROUTE; + IFA_UNLOCK(&ia->ia_ifa); if (!locked) lck_mtx_unlock(rnh_lock); } @@ -1197,12 +1319,20 @@ in_ifscrub( static void in_iahash_remove(struct in_ifaddr *ia) { - if (!IA_IS_HASHED(ia)) - panic("attempt to remove wrong ia %p from hash table\n", ia); + lck_rw_assert(in_ifaddr_rwlock, LCK_RW_ASSERT_EXCLUSIVE); + IFA_LOCK_ASSERT_HELD(&ia->ia_ifa); + if (!IA_IS_HASHED(ia)) { + panic("attempt to remove wrong ia %p from hash table\n", ia); + /* NOTREACHED */ + } TAILQ_REMOVE(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); IA_HASH_INIT(ia); - ifafree(&ia->ia_ifa); + if (IFA_REMREF_LOCKED(&ia->ia_ifa) == NULL) { + panic("%s: unexpected (missing) refcnt ifa=%p", __func__, + &ia->ia_ifa); + /* NOTREACHED */ + } } /* @@ -1211,13 +1341,18 @@ in_iahash_remove(struct in_ifaddr *ia) static void in_iahash_insert(struct in_ifaddr *ia) { - if (ia->ia_addr.sin_family != AF_INET) + lck_rw_assert(in_ifaddr_rwlock, LCK_RW_ASSERT_EXCLUSIVE); + IFA_LOCK_ASSERT_HELD(&ia->ia_ifa); + + if (ia->ia_addr.sin_family != AF_INET) { panic("attempt to insert wrong ia %p into hash table\n", ia); - else if (IA_IS_HASHED(ia)) + /* NOTREACHED */ + } else if (IA_IS_HASHED(ia)) { panic("attempt to double-insert ia %p into hash table\n", ia); - + /* NOTREACHED */ + } TAILQ_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); - ifaref(&ia->ia_ifa); + IFA_ADDREF_LOCKED(&ia->ia_ifa); } /* @@ -1236,22 +1371,39 @@ in_iahash_insert_ptp(struct in_ifaddr *ia) struct in_ifaddr *tmp_ifa; struct ifnet *tmp_ifp; - if (ia->ia_addr.sin_family != AF_INET) + lck_rw_assert(in_ifaddr_rwlock, LCK_RW_ASSERT_EXCLUSIVE); + IFA_LOCK_ASSERT_HELD(&ia->ia_ifa); + + if (ia->ia_addr.sin_family != AF_INET) { panic("attempt to insert wrong ia %p into hash table\n", ia); - else if (IA_IS_HASHED(ia)) + /* NOTREACHED */ + } else if (IA_IS_HASHED(ia)) { panic("attempt to double-insert ia %p into hash table\n", ia); - - TAILQ_FOREACH(tmp_ifa, INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia_hash) - if (IA_SIN(tmp_ifa)->sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr) + /* NOTREACHED */ + } + IFA_UNLOCK(&ia->ia_ifa); + TAILQ_FOREACH(tmp_ifa, INADDR_HASH(ia->ia_addr.sin_addr.s_addr), + ia_hash) { + IFA_LOCK(&tmp_ifa->ia_ifa); + /* ia->ia_addr won't change, so check without lock */ + if (IA_SIN(tmp_ifa)->sin_addr.s_addr == + ia->ia_addr.sin_addr.s_addr) { + IFA_UNLOCK(&tmp_ifa->ia_ifa); break; + } + IFA_UNLOCK(&tmp_ifa->ia_ifa); + } tmp_ifp = (tmp_ifa == NULL) ? 
NULL : tmp_ifa->ia_ifp; - if (tmp_ifp == NULL) - TAILQ_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); - else - TAILQ_INSERT_TAIL(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia, ia_hash); - - ifaref(&ia->ia_ifa); + IFA_LOCK(&ia->ia_ifa); + if (tmp_ifp == NULL) { + TAILQ_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), + ia, ia_hash); + } else { + TAILQ_INSERT_TAIL(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), + ia, ia_hash); + } + IFA_ADDREF_LOCKED(&ia->ia_ifa); } /* @@ -1273,9 +1425,10 @@ in_ifinit( int oldremoved = 0; /* Take an extra reference for this routine */ - ifaref(&ia->ia_ifa); + IFA_ADDREF(&ia->ia_ifa); lck_rw_lock_exclusive(in_ifaddr_rwlock); + IFA_LOCK(&ia->ia_ifa); oldaddr = ia->ia_addr; if (IA_IS_HASHED(ia)) { oldremoved = 1; @@ -1285,8 +1438,9 @@ in_ifinit( ia->ia_addr.sin_len = sizeof (*sin); if ((ifp->if_flags & IFF_POINTOPOINT)) in_iahash_insert_ptp(ia); - else + else in_iahash_insert(ia); + IFA_UNLOCK(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); /* @@ -1315,10 +1469,11 @@ in_ifinit( } /* Release reference from ifa_ifpgetprimary() */ - ifafree(ifa0); + IFA_REMREF(ifa0); if (error) { lck_rw_lock_exclusive(in_ifaddr_rwlock); + IFA_LOCK(&ia->ia_ifa); if (IA_IS_HASHED(ia)) in_iahash_remove(ia); ia->ia_addr = oldaddr; @@ -1328,17 +1483,27 @@ in_ifinit( else in_iahash_insert(ia); } + IFA_UNLOCK(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); /* Release extra reference taken above */ - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); return (error); } lck_mtx_lock(rnh_lock); + IFA_LOCK(&ia->ia_ifa); + /* + * Address has been initialized by the link resolver (ARP) + * via ifnet_ioctl() above; it may now generate route(s). + */ + ia->ia_ifa.ifa_debug &= ~IFD_NOTREADY; if (scrub) { ia->ia_ifa.ifa_addr = (struct sockaddr *)&oldaddr; + IFA_UNLOCK(&ia->ia_ifa); in_ifscrub(ifp, ia, 1); + IFA_LOCK(&ia->ia_ifa); ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; } + IFA_LOCK_ASSERT_HELD(&ia->ia_ifa); if (IN_CLASSA(i)) ia->ia_netmask = IN_CLASSA_NET; else if (IN_CLASSB(i)) @@ -1372,16 +1537,21 @@ in_ifinit( flags |= RTF_HOST; } else if (ifp->if_flags & IFF_POINTOPOINT) { if (ia->ia_dstaddr.sin_family != AF_INET) { + IFA_UNLOCK(&ia->ia_ifa); lck_mtx_unlock(rnh_lock); /* Release extra reference taken above */ - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); return (0); } ia->ia_dstaddr.sin_len = sizeof (*sin); flags |= RTF_HOST; } - if ((error = rtinit_locked(&(ia->ia_ifa), (int)RTM_ADD, flags)) == 0) + IFA_UNLOCK(&ia->ia_ifa); + if ((error = rtinit_locked(&(ia->ia_ifa), (int)RTM_ADD, flags)) == 0) { + IFA_LOCK(&ia->ia_ifa); ia->ia_flags |= IFA_ROUTE; + IFA_UNLOCK(&ia->ia_ifa); + } lck_mtx_unlock(rnh_lock); /* XXX check if the subnet route points to the same interface */ @@ -1393,19 +1563,29 @@ in_ifinit( * multicast group on that interface. 
*/ if (ifp->if_flags & IFF_MULTICAST) { - struct in_multi *inm; struct in_addr addr; + lck_mtx_lock(&ifp->if_addrconfig_lock); addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP); - ifnet_lock_shared(ifp); - IN_LOOKUP_MULTI(addr, ifp, inm); - ifnet_lock_done(ifp); - if (inm == 0) - in_addmulti(&addr, ifp); + if (ifp->if_allhostsinm == NULL) { + struct in_multi *inm; + inm = in_addmulti(&addr, ifp); + + if (inm != NULL) { + /* keep the reference on inm added by + * in_addmulti above for storing the + * pointer in allhostsinm + */ + ifp->if_allhostsinm = inm; + } else { + printf("Failed to add membership to all-hosts multicast address on interface %s%d\n", ifp->if_name, ifp->if_unit); + } + } + lck_mtx_unlock(&ifp->if_addrconfig_lock); } /* Release extra reference taken above */ - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); return (error); } @@ -1414,18 +1594,15 @@ in_ifinit( * Return 1 if the address might be a local broadcast address. */ int -in_broadcast( - struct in_addr in, - struct ifnet *ifp) +in_broadcast(struct in_addr in, struct ifnet *ifp) { struct ifaddr *ifa; u_int32_t t; - if (in.s_addr == INADDR_BROADCAST || - in.s_addr == INADDR_ANY) - return 1; + if (in.s_addr == INADDR_BROADCAST || in.s_addr == INADDR_ANY) + return (1); if ((ifp->if_flags & IFF_BROADCAST) == 0) - return 0; + return (0); t = ntohl(in.s_addr); /* * Look through the list of addresses for a match @@ -1434,10 +1611,7 @@ in_broadcast( #define ia ((struct in_ifaddr *)ifa) ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr == NULL) { - ifnet_lock_done(ifp); - return (0); - } + IFA_LOCK(ifa); if (ifa->ifa_addr->sa_family == AF_INET && (in.s_addr == ia->ia_broadaddr.sin_addr.s_addr || in.s_addr == ia->ia_netbroadcast.s_addr || @@ -1451,140 +1625,76 @@ in_broadcast( * address. */ ia->ia_subnetmask != (u_int32_t)0xffffffff) { + IFA_UNLOCK(ifa); ifnet_lock_done(ifp); - return 1; + return (1); } + IFA_UNLOCK(ifa); } ifnet_lock_done(ifp); return (0); #undef ia } -static void -in_free_inm( - void* ifma_protospec) -{ - struct in_multi *inm = ifma_protospec; - - /* - * No remaining claims to this record; let IGMP know that - * we are leaving the multicast group. - */ - igmp_leavegroup(inm); - lck_mtx_lock(rnh_lock); - LIST_REMOVE(inm, inm_link); - lck_mtx_unlock(rnh_lock); - FREE(inm, M_IPMADDR); -} - -/* - * Add an address to the list of IP multicast addresses for a given interface. - */ -struct in_multi * -in_addmulti( - struct in_addr *ap, - struct ifnet *ifp) +void +in_purgeaddrs(struct ifnet *ifp) { - struct in_multi *inm; - int error; - struct sockaddr_in sin; - struct ifmultiaddr *ifma; + struct ifaddr **ifap; + int err, i; /* - * Call generic routine to add membership or increment - * refcount. It wants addresses in the form of a sockaddr, - * so we build one here (being careful to zero the unused bytes). + * Be nice, and try the civilized way first. If we can't get + * rid of them this way, then do it the rough way. We must + * only get here during detach time, after the ifnet has been + * removed from the global list and arrays. */ - bzero(&sin, sizeof sin); - sin.sin_family = AF_INET; - sin.sin_len = sizeof sin; - sin.sin_addr = *ap; - error = if_addmulti(ifp, (struct sockaddr *)&sin, &ifma); - if (error) { - return 0; - } - - /* - * If ifma->ifma_protospec is null, then if_addmulti() created - * a new record. Otherwise, we are done. 
- */ - if (ifma->ifma_protospec != 0) { - return ifma->ifma_protospec; - } - - inm = (struct in_multi *) _MALLOC(sizeof(*inm), M_IPMADDR, M_WAITOK); - if (inm == NULL) { - return (NULL); - } - - bzero(inm, sizeof *inm); - inm->inm_addr = *ap; - inm->inm_ifp = ifp; - inm->inm_ifma = ifma; - lck_mtx_lock(rnh_lock); - if (ifma->ifma_protospec == NULL) { - ifma->ifma_protospec = inm; - ifma->ifma_free = in_free_inm; - LIST_INSERT_HEAD(&in_multihead, inm, inm_link); - } - lck_mtx_unlock(rnh_lock); - - if (ifma->ifma_protospec != inm) { - _FREE(inm, M_IPMADDR); - return ifma->ifma_protospec; - } - - /* - * Let IGMP know that we have joined a new IP multicast group. - */ - error = igmp_joingroup(inm); - if (error) { - char addrbuf[16]; - - /* - * We can't free the inm because someone else may already be - * using it. Once we put it in to ifma->ifma_protospec, it - * must exist as long as the ifma does. Might be nice to flag - * the error so we can try igmp_joingroup the next time through. - */ - log(LOG_ERR, "igmp_joingroup error %d joining multicast %s on %s%d\n", - error, inet_ntop(AF_INET, &sin.sin_addr, addrbuf, sizeof(addrbuf)), - ifp->if_name, ifp->if_unit); - } - - return (inm); -} - -/* - * Delete a multicast address record. - */ -void -in_delmulti( - struct in_multi **inm) -{ - struct in_multi *inm2; - - lck_mtx_lock(rnh_lock); - LIST_FOREACH(inm2, &in_multihead, inm_link) { - if (inm2 == *inm) - break; - } - if (inm2 != *inm) { - lck_mtx_unlock(rnh_lock); - printf("in_delmulti - ignoring invalid inm (%p)\n", *inm); - return; - } - lck_mtx_unlock(rnh_lock); - - /* We intentionally do this a bit differently than BSD */ - if ((*inm)->inm_ifma) { - if_delmultiaddr((*inm)->inm_ifma, 0); - ifma_release((*inm)->inm_ifma); + err = ifnet_get_address_list_family_internal(ifp, &ifap, AF_INET, 1, + M_WAITOK); + if (err == 0 && ifap != NULL) { + for (i = 0; ifap[i] != NULL; i++) { + struct ifaliasreq ifr; + struct ifaddr *ifa; + + ifa = ifap[i]; + bzero(&ifr, sizeof (ifr)); + IFA_LOCK(ifa); + ifr.ifra_addr = *ifa->ifa_addr; + if (ifa->ifa_dstaddr != NULL) + ifr.ifra_broadaddr = *ifa->ifa_dstaddr; + IFA_UNLOCK(ifa); + err = in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp, + kernproc); + /* if we lost the race, ignore it */ + if (err == EADDRNOTAVAIL) + err = 0; + if (err != 0) { + char s_addr[MAX_IPv4_STR_LEN]; + char s_dstaddr[MAX_IPv4_STR_LEN]; + struct in_addr *s, *d; + + IFA_LOCK(ifa); + s = &((struct sockaddr_in *) + ifa->ifa_addr)->sin_addr; + d = &((struct sockaddr_in *) + ifa->ifa_dstaddr)->sin_addr; + (void) inet_ntop(AF_INET, &s->s_addr, s_addr, + sizeof (s_addr)); + (void) inet_ntop(AF_INET, &d->s_addr, s_dstaddr, + sizeof (s_dstaddr)); + IFA_UNLOCK(ifa); + + printf("%s: SIOCDIFADDR ifp=%p ifa_addr=%s " + "ifa_dstaddr=%s (err=%d)\n", __func__, ifp, + s_addr, s_dstaddr, err); + } + } + ifnet_free_address_list(ifap); + } else if (err != 0 && err != ENXIO) { + printf("%s: error retrieving list of AF_INET addresses for " + "ifp=%p (err=%d)\n", __func__, ifp, err); } - *inm = NULL; } -#if !NFSCLIENT int inet_aton(char *cp, struct in_addr *pin); int inet_aton(char * cp, struct in_addr * pin) @@ -1605,7 +1715,19 @@ inet_aton(char * cp, struct in_addr * pin) } return (TRUE); } -#endif + +int inet_ntoa2(struct in_addr * pin, char * cp, const int len); +int inet_ntoa2(struct in_addr * pin, char * cp, const int len) +{ + int ret; + + /* address is in network byte order */ + ret = snprintf(cp, len, "%u.%u.%u.%u", pin->s_addr & 0xFF, + (pin->s_addr >> 8) & 0xFF, (pin->s_addr >> 16) & 0xFF, + 
(pin->s_addr >> 24) & 0xFF); + + return ret < len ? TRUE : FALSE; +} /* * Called as part of ip_init @@ -1613,6 +1735,8 @@ inet_aton(char * cp, struct in_addr * pin) void in_ifaddr_init(void) { + in_multi_init(); + PE_parse_boot_argn("ifa_debug", &inifa_debug, sizeof (inifa_debug)); inifa_size = (inifa_debug == 0) ? sizeof (struct in_ifaddr) : @@ -1620,10 +1744,15 @@ in_ifaddr_init(void) inifa_zone = zinit(inifa_size, INIFA_ZONE_MAX * inifa_size, 0, INIFA_ZONE_NAME); - if (inifa_zone == NULL) + if (inifa_zone == NULL) { panic("%s: failed allocating %s", __func__, INIFA_ZONE_NAME); - + /* NOTREACHED */ + } zone_change(inifa_zone, Z_EXPAND, TRUE); + zone_change(inifa_zone, Z_CALLERACCT, FALSE); + + lck_mtx_init(&inifa_trash_lock, ifa_mtx_grp, ifa_mtx_attr); + TAILQ_INIT(&inifa_trash_head); } static struct in_ifaddr * @@ -1637,11 +1766,14 @@ in_ifaddr_alloc(int how) bzero(inifa, inifa_size); inifa->ia_ifa.ifa_free = in_ifaddr_free; inifa->ia_ifa.ifa_debug |= IFD_ALLOC; + ifa_lock_init(&inifa->ia_ifa); if (inifa_debug != 0) { struct in_ifaddr_dbg *inifa_dbg = (struct in_ifaddr_dbg *)inifa; inifa->ia_ifa.ifa_debug |= IFD_DEBUG; inifa->ia_ifa.ifa_trace = in_ifaddr_trace; + inifa->ia_ifa.ifa_attached = in_ifaddr_attached; + inifa->ia_ifa.ifa_detached = in_ifaddr_detached; ctrace_record(&inifa_dbg->inifa_alloc); } } @@ -1651,21 +1783,79 @@ in_ifaddr_alloc(int how) static void in_ifaddr_free(struct ifaddr *ifa) { - if (ifa->ifa_refcnt != 0) + IFA_LOCK_ASSERT_HELD(ifa); + + if (ifa->ifa_refcnt != 0) { panic("%s: ifa %p bad ref cnt", __func__, ifa); - if (!(ifa->ifa_debug & IFD_ALLOC)) + /* NOTREACHED */ + } if (!(ifa->ifa_debug & IFD_ALLOC)) { panic("%s: ifa %p cannot be freed", __func__, ifa); - + /* NOTREACHED */ + } if (ifa->ifa_debug & IFD_DEBUG) { struct in_ifaddr_dbg *inifa_dbg = (struct in_ifaddr_dbg *)ifa; ctrace_record(&inifa_dbg->inifa_free); bcopy(&inifa_dbg->inifa, &inifa_dbg->inifa_old, sizeof (struct in_ifaddr)); + if (ifa->ifa_debug & IFD_TRASHED) { + /* Become a regular mutex, just in case */ + IFA_CONVERT_LOCK(ifa); + lck_mtx_lock(&inifa_trash_lock); + TAILQ_REMOVE(&inifa_trash_head, inifa_dbg, + inifa_trash_link); + lck_mtx_unlock(&inifa_trash_lock); + ifa->ifa_debug &= ~IFD_TRASHED; + } } + IFA_UNLOCK(ifa); + ifa_lock_destroy(ifa); bzero(ifa, sizeof (struct in_ifaddr)); zfree(inifa_zone, ifa); } +static void +in_ifaddr_attached(struct ifaddr *ifa) +{ + struct in_ifaddr_dbg *inifa_dbg = (struct in_ifaddr_dbg *)ifa; + + IFA_LOCK_ASSERT_HELD(ifa); + + if (!(ifa->ifa_debug & IFD_DEBUG)) { + panic("%s: ifa %p has no debug structure", __func__, ifa); + /* NOTREACHED */ + } + if (ifa->ifa_debug & IFD_TRASHED) { + /* Become a regular mutex, just in case */ + IFA_CONVERT_LOCK(ifa); + lck_mtx_lock(&inifa_trash_lock); + TAILQ_REMOVE(&inifa_trash_head, inifa_dbg, inifa_trash_link); + lck_mtx_unlock(&inifa_trash_lock); + ifa->ifa_debug &= ~IFD_TRASHED; + } +} + +static void +in_ifaddr_detached(struct ifaddr *ifa) +{ + struct in_ifaddr_dbg *inifa_dbg = (struct in_ifaddr_dbg *)ifa; + + IFA_LOCK_ASSERT_HELD(ifa); + + if (!(ifa->ifa_debug & IFD_DEBUG)) { + panic("%s: ifa %p has no debug structure", __func__, ifa); + /* NOTREACHED */ + } else if (ifa->ifa_debug & IFD_TRASHED) { + panic("%s: ifa %p is already in trash list", __func__, ifa); + /* NOTREACHED */ + } + ifa->ifa_debug |= IFD_TRASHED; + /* Become a regular mutex, just in case */ + IFA_CONVERT_LOCK(ifa); + lck_mtx_lock(&inifa_trash_lock); + TAILQ_INSERT_TAIL(&inifa_trash_head, inifa_dbg, inifa_trash_link); + 
lck_mtx_unlock(&inifa_trash_lock); +} + static void in_ifaddr_trace(struct ifaddr *ifa, int refhold) { @@ -1674,9 +1864,10 @@ in_ifaddr_trace(struct ifaddr *ifa, int refhold) u_int32_t idx; u_int16_t *cnt; - if (!(ifa->ifa_debug & IFD_DEBUG)) + if (!(ifa->ifa_debug & IFD_DEBUG)) { panic("%s: ifa %p has no debug structure", __func__, ifa); - + /* NOTREACHED */ + } if (refhold) { cnt = &inifa_dbg->inifa_refhold_cnt; tr = inifa_dbg->inifa_refhold; @@ -1685,6 +1876,6 @@ in_ifaddr_trace(struct ifaddr *ifa, int refhold) tr = inifa_dbg->inifa_refrele; } - idx = OSAddAtomic16(1, (volatile SInt16 *)cnt) % CTRACE_HIST_SIZE; + idx = atomic_add_16_ov(cnt, 1) % INIFA_TRACE_HIST_SIZE; ctrace_record(&tr[idx]); } diff --git a/bsd/netinet/in.h b/bsd/netinet/in.h index fc38f8401..4e66c26c7 100644 --- a/bsd/netinet/in.h +++ b/bsd/netinet/in.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -67,6 +67,10 @@ #include <sys/_types.h> #include <stdint.h> /* uint(8|16|32)_t */ +#ifndef KERNEL +#include <Availability.h> +#endif + #ifndef _IN_ADDR_T #define _IN_ADDR_T typedef __uint32_t in_addr_t; /* base type for internet address */ @@ -207,10 +211,11 @@ typedef __uint16_t in_port_t; #define IPPROTO_ENCAP 98 /* encapsulation header */ #define IPPROTO_APES 99 /* any private encr. scheme */ #define IPPROTO_GMTP 100 /* GMTP*/ -#define IPPROTO_IPCOMP 108 /* payload compression (IPComp) */ /* 101-254: Partly Unassigned */ #define IPPROTO_PIM 103 /* Protocol Independent Mcast */ +#define IPPROTO_IPCOMP 108 /* payload compression (IPComp) */ #define IPPROTO_PGM 113 /* PGM */ +#define IPPROTO_SCTP 132 /* SCTP */ /* 255: Reserved */ /* BSD Private, local use, namespace incursion */ #define IPPROTO_DIVERT 254 /* divert pseudo-protocol */ @@ -341,6 +346,7 @@ struct in_addr { #define IN_BADCLASS(i) (((u_int32_t)(i) & 0xf0000000) == 0xf0000000) #define INADDR_LOOPBACK (u_int32_t)0x7f000001 + #ifndef KERNEL #define INADDR_NONE 0xffffffff /* -1 return */ #endif @@ -348,11 +354,25 @@ struct in_addr { #define INADDR_UNSPEC_GROUP (u_int32_t)0xe0000000 /* 224.0.0.0 */ #define INADDR_ALLHOSTS_GROUP (u_int32_t)0xe0000001 /* 224.0.0.1 */ #define INADDR_ALLRTRS_GROUP (u_int32_t)0xe0000002 /* 224.0.0.2 */ +#define INADDR_ALLRPTS_GROUP (u_int32_t)0xe0000016 /* 224.0.0.22, IGMPv3 */ +#define INADDR_CARP_GROUP (u_int32_t)0xe0000012 /* 224.0.0.18 */ +#define INADDR_PFSYNC_GROUP (u_int32_t)0xe00000f0 /* 224.0.0.240 */ +#define INADDR_ALLMDNS_GROUP (u_int32_t)0xe00000fb /* 224.0.0.251 */ #define INADDR_MAX_LOCAL_GROUP (u_int32_t)0xe00000ff /* 224.0.0.255 */ #ifdef __APPLE__ #define IN_LINKLOCALNETNUM (u_int32_t)0xA9FE0000 /* 169.254.0.0 */ #define IN_LINKLOCAL(i) (((u_int32_t)(i) & IN_CLASSB_NET) == IN_LINKLOCALNETNUM) +#define IN_LOOPBACK(i) (((u_int32_t)(i) & 0xff000000) == 0x7f000000) +#define IN_ZERONET(i) (((u_int32_t)(i) & 0xff000000) == 0) + +#define IN_PRIVATE(i) ((((u_int32_t)(i) & 0xff000000) == 0x0a000000) || \ + (((u_int32_t)(i) & 0xfff00000) == 0xac100000) || \ + (((u_int32_t)(i) & 0xffff0000) == 0xc0a80000)) + +#define IN_LOCAL_GROUP(i) (((u_int32_t)(i) & 0xffffff00) == 0xe0000000) + +#define IN_ANY_LOCAL(i) (IN_LINKLOCAL(i) || IN_LOCAL_GROUP(i)) #endif #define IN_LOOPBACKNET 127 /* official! 
*/ @@ -415,7 +435,9 @@ struct ip_opts { #define IP_STRIPHDR 23 /* bool: drop receive of raw IP header */ #endif #define IP_RECVTTL 24 /* bool; receive reception TTL w/dgram */ -#define IP_BOUND_IF 25 /* set/get bound interface */ +#define IP_BOUND_IF 25 /* int; set/get bound interface */ +#define IP_PKTINFO 26 /* get pktinfo on recv socket, set src on sent dgram */ +#define IP_RECVPKTINFO IP_PKTINFO /* receive pktinfo w/dgram */ #define IP_FW_ADD 40 /* add a firewall rule to chain */ @@ -440,24 +462,53 @@ struct ip_opts { #define IP_DUMMYNET_GET 64 /* get entire dummynet pipes */ #define IP_TRAFFIC_MGT_BACKGROUND 65 /* int*; get background IO flags; set background IO */ +#define IP_MULTICAST_IFINDEX 66 /* int*; set/get IP multicast i/f index */ + +/* IPv4 Source Filter Multicast API [RFC3678] */ +#define IP_ADD_SOURCE_MEMBERSHIP 70 /* join a source-specific group */ +#define IP_DROP_SOURCE_MEMBERSHIP 71 /* drop a single source */ +#define IP_BLOCK_SOURCE 72 /* block a source */ +#define IP_UNBLOCK_SOURCE 73 /* unblock a source */ + +/* The following option is private; do not use it from user applications. */ +#define IP_MSFILTER 74 /* set/get filter list */ + +/* Protocol Independent Multicast API [RFC3678] */ +#define MCAST_JOIN_GROUP 80 /* join an any-source group */ +#define MCAST_LEAVE_GROUP 81 /* leave all sources for group */ +#define MCAST_JOIN_SOURCE_GROUP 82 /* join a source-specific group */ +#define MCAST_LEAVE_SOURCE_GROUP 83 /* leave a single source */ +#define MCAST_BLOCK_SOURCE 84 /* block a source */ +#define MCAST_UNBLOCK_SOURCE 85 /* unblock a source */ #ifdef PRIVATE #define IP_FORCE_OUT_IFP 69 /* deprecated; use IP_BOUND_IF instead */ -#endif - -/* Background socket configuration flags */ -#ifdef __APPLE_API_UNSTABLE -#define TRAFFIC_MGT_SO_BACKGROUND 0x0001 /* background socket */ -#define TRAFFIC_MGT_SO_BG_SUPPRESSED 0x0002 /* currently throttled */ -#define TRAFFIC_MGT_SO_BG_REGULATE 0x0004 /* traffic is regulated */ -#endif /* __APPLE_API_UNSTABLE */ +#define IP_NO_IFT_CELLULAR 6969 /* for internal use only */ +#define IP_NO_IFT_PDP IP_NO_IFT_CELLULAR /* deprecated */ +#define IP_OUT_IF 9696 /* for internal use only */ +#endif /* PRIVATE */ /* * Defaults and limits for options */ #define IP_DEFAULT_MULTICAST_TTL 1 /* normally limit m'casts to 1 hop */ #define IP_DEFAULT_MULTICAST_LOOP 1 /* normally hear sends if a member */ -#define IP_MAX_MEMBERSHIPS 20 /* per socket */ + +/* + * The imo_membership vector for each socket is now dynamically allocated at + * run-time, bounded by USHRT_MAX, and is reallocated when needed, sized + * according to a power-of-two increment. + */ +#define IP_MIN_MEMBERSHIPS 31 +#define IP_MAX_MEMBERSHIPS 4095 + +/* + * Default resource limits for IPv4 multicast source filtering. + * These may be modified by sysctl. + */ +#define IP_MAX_GROUP_SRC_FILTER 512 /* sources per group */ +#define IP_MAX_SOCK_SRC_FILTER 128 /* sources per socket/group */ +#define IP_MAX_SOCK_MUTE_FILTER 128 /* XXX no longer used */ /* * Argument structure for IP_ADD_MEMBERSHIP and IP_DROP_MEMBERSHIP. @@ -467,6 +518,105 @@ struct ip_mreq { struct in_addr imr_interface; /* local IP address of interface */ }; +/* + * Modified argument structure for IP_MULTICAST_IF, obtained from Linux. + * This is used to specify an interface index for multicast sends, as + * the IPv4 legacy APIs do not support this (unless IP_SENDIF is available). 
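+ *
+ * A minimal, hypothetical usage sketch (user space; the socket "s" and
+ * interface name "en0" are illustrative assumptions, not part of this
+ * change):
+ *
+ *	struct ip_mreqn mreqn;
+ *
+ *	bzero(&mreqn, sizeof (mreqn));
+ *	mreqn.imr_ifindex = if_nametoindex("en0");
+ *	(void) setsockopt(s, IPPROTO_IP, IP_MULTICAST_IF,
+ *	    &mreqn, sizeof (mreqn));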
+ */ +struct ip_mreqn { + struct in_addr imr_multiaddr; /* IP multicast address of group */ + struct in_addr imr_address; /* local IP address of interface */ + int imr_ifindex; /* Interface index; cast to uint32_t */ +}; + +#pragma pack(4) +/* + * Argument structure for IPv4 Multicast Source Filter APIs. [RFC3678] + */ +struct ip_mreq_source { + struct in_addr imr_multiaddr; /* IP multicast address of group */ + struct in_addr imr_sourceaddr; /* IP address of source */ + struct in_addr imr_interface; /* local IP address of interface */ +}; + +/* + * Argument structures for Protocol-Independent Multicast Source + * Filter APIs. [RFC3678] + */ +struct group_req { + uint32_t gr_interface; /* interface index */ + struct sockaddr_storage gr_group; /* group address */ +}; + +struct group_source_req { + uint32_t gsr_interface; /* interface index */ + struct sockaddr_storage gsr_group; /* group address */ + struct sockaddr_storage gsr_source; /* source address */ +}; + +#ifndef __MSFILTERREQ_DEFINED +#define __MSFILTERREQ_DEFINED +/* + * The following structure is private; do not use it from user applications. + * It is used to communicate IP_MSFILTER/IPV6_MSFILTER information between + * the RFC 3678 libc functions and the kernel. + */ +struct __msfilterreq { + uint32_t msfr_ifindex; /* interface index */ + uint32_t msfr_fmode; /* filter mode for group */ + uint32_t msfr_nsrcs; /* # of sources in msfr_srcs */ + uint32_t __msfr_align; + struct sockaddr_storage msfr_group; /* group address */ + struct sockaddr_storage *msfr_srcs; +}; + +#ifdef XNU_KERNEL_PRIVATE +struct __msfilterreq32 { + uint32_t msfr_ifindex; /* interface index */ + uint32_t msfr_fmode; /* filter mode for group */ + uint32_t msfr_nsrcs; /* # of sources in msfr_srcs */ + uint32_t __msfr_align; + struct sockaddr_storage msfr_group; /* group address */ + user32_addr_t msfr_srcs; +}; + +struct __msfilterreq64 { + uint32_t msfr_ifindex; /* interface index */ + uint32_t msfr_fmode; /* filter mode for group */ + uint32_t msfr_nsrcs; /* # of sources in msfr_srcs */ + uint32_t __msfr_align; + struct sockaddr_storage msfr_group; /* group address */ + user64_addr_t msfr_srcs; +}; +#endif /* XNU_KERNEL_PRIVATE */ +#endif /* __MSFILTERREQ_DEFINED */ + +#pragma pack() +struct sockaddr; + +#ifndef KERNEL +/* + * Advanced (Full-state) APIs [RFC3678] + * The RFC specifies uint_t for the 6th argument to [sg]etsourcefilter(). + * We use uint32_t here to be consistent. + */ +int setipv4sourcefilter(int, struct in_addr, struct in_addr, uint32_t, + uint32_t, struct in_addr *) __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_NA); +int getipv4sourcefilter(int, struct in_addr, struct in_addr, uint32_t *, + uint32_t *, struct in_addr *) __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_NA); +int setsourcefilter(int, uint32_t, struct sockaddr *, socklen_t, + uint32_t, uint32_t, struct sockaddr_storage *) __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_NA); +int getsourcefilter(int, uint32_t, struct sockaddr *, socklen_t, + uint32_t *, uint32_t *, struct sockaddr_storage *) __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_NA); +#endif + +/* + * Filter modes; also used to represent per-socket filter mode internally. 
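+ *
+ * A hedged sketch of an include-mode filter built with setsourcefilter()
+ * (declared above); the socket "s", the ifindex, and the group/source
+ * addresses are placeholder assumptions:
+ *
+ *	struct sockaddr_in grp, src;
+ *	struct sockaddr_storage slist[1];
+ *
+ *	bzero(&grp, sizeof (grp));
+ *	grp.sin_family = AF_INET;
+ *	grp.sin_len = sizeof (grp);
+ *	grp.sin_addr.s_addr = htonl(0xe8000001);	/* 232.0.0.1 */
+ *	src = grp;
+ *	src.sin_addr.s_addr = htonl(0x0a000001);	/* 10.0.0.1 */
+ *	bcopy(&src, &slist[0], sizeof (src));
+ *	(void) setsourcefilter(s, ifindex, (struct sockaddr *)&grp,
+ *	    sizeof (grp), MCAST_INCLUDE, 1, slist);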
+ */
+#define MCAST_UNDEFINED 0 /* fmode: not yet defined */
+#define MCAST_INCLUDE 1 /* fmode: include these source(s) */
+#define MCAST_EXCLUDE 2 /* fmode: exclude these source(s) */
+
 /*
  * Argument for IP_PORTRANGE:
  * - which range to search when port is unspecified at bind() or connect()
@@ -476,6 +626,31 @@ struct ip_mreq {
 #define IP_PORTRANGE_LOW 2 /* "low" - vouchsafe security */
 
+/*
+ * IP_PKTINFO: Packet information (equivalent to RFC 2292 sec. 5, for IPv4).
+ * This structure is used for:
+ *
+ * 1) Receiving ancillary data about the datagram if the IP_PKTINFO sockopt
+ * is set on the socket. In this case ipi_ifindex will contain the
+ * interface index the datagram was received on, and ipi_addr the IP
+ * address the datagram was received to.
+ *
+ * 2) Sending a datagram using a specific interface or IP source address.
+ * If ipi_ifindex is set to non-zero when in_pktinfo is passed as
+ * ancillary data of type IP_PKTINFO, it will be used as the source
+ * interface to send the datagram from. If ipi_ifindex is zero,
+ * ipi_spec_dst will be used for the source address.
+ *
+ * Note: if IP_BOUND_IF is set on the socket, ipi_ifindex in the ancillary
+ * IP_PKTINFO option silently overrides the bound interface when it is
+ * specified at send time.
+ */
+struct in_pktinfo {
+ unsigned int ipi_ifindex; /* send/recv interface index */
+ struct in_addr ipi_spec_dst; /* Local address */
+ struct in_addr ipi_addr; /* IP Header dst address */
+};
+
 /*
  * Definitions for inet sysctl operations.
  *
@@ -616,6 +791,11 @@ extern int in_localaddr(struct in_addr);
 extern u_int32_t in_netof(struct in_addr);
 extern int inaddr_local(struct in_addr);
+
+#define in_hosteq(s, t) ((s).s_addr == (t).s_addr)
+#define in_nullhost(x) ((x).s_addr == INADDR_ANY)
+#define in_allhosts(x) ((x).s_addr == htonl(INADDR_ALLHOSTS_GROUP))
+
 #endif /* KERNEL_PRIVATE */
 #define MAX_IPv4_STR_LEN 16
 #define MAX_IPv6_STR_LEN 64
diff --git a/bsd/netinet/in_arp.c b/bsd/netinet/in_arp.c
index 886528306..8a4dfcd14 100644
--- a/bsd/netinet/in_arp.c
+++ b/bsd/netinet/in_arp.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2009 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2011 Apple Inc. All rights reserved.
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -70,11 +70,14 @@ #include <sys/kernel.h> #include <sys/mbuf.h> #include <sys/sysctl.h> +#include <sys/mcache.h> +#include <sys/protosw.h> #include <string.h> #include <net/if_arp.h> #include <net/if_dl.h> #include <net/dlil.h> #include <net/if_types.h> +#include <net/if_llreach.h> #include <net/route.h> #include <netinet/if_ether.h> #include <netinet/in_var.h> @@ -83,7 +86,6 @@ #define SA(p) ((struct sockaddr *)(p)) #define SIN(s) ((struct sockaddr_in *)s) #define CONST_LLADDR(s) ((const u_char*)((s)->sdl_data + (s)->sdl_nlen)) -#define rt_expire rt_rmx.rmx_expire #define equal(a1, a2) (bcmp((caddr_t)(a1), (caddr_t)(a2), (a1)->sa_len) == 0) static const size_t MAX_HW_LEN = 10; @@ -100,16 +102,26 @@ static int arpt_down = 20; /* once declared down, don't send for 20 sec */ int apple_hwcksum_tx = 1; int apple_hwcksum_rx = 1; -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, prune_intvl, CTLFLAG_RW, - &arpt_prune, 0, ""); -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_RW, - &arpt_keep, 0, ""); -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, host_down_time, CTLFLAG_RW, - &arpt_down, 0, ""); -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, apple_hwcksum_tx, CTLFLAG_RW, - &apple_hwcksum_tx, 0, ""); -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, apple_hwcksum_rx, CTLFLAG_RW, - &apple_hwcksum_rx, 0, ""); +static int arp_llreach_base = (LL_BASE_REACHABLE / 1000); /* seconds */ + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, prune_intvl, + CTLFLAG_RW | CTLFLAG_LOCKED, &arpt_prune, 0, ""); + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, + CTLFLAG_RW | CTLFLAG_LOCKED, &arpt_keep, 0, ""); + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, host_down_time, + CTLFLAG_RW | CTLFLAG_LOCKED, &arpt_down, 0, ""); + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, apple_hwcksum_tx, + CTLFLAG_RW | CTLFLAG_LOCKED, &apple_hwcksum_tx, 0, ""); + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, apple_hwcksum_rx, + CTLFLAG_RW | CTLFLAG_LOCKED, &apple_hwcksum_rx, 0, ""); + +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, arp_llreach_base, + CTLFLAG_RW | CTLFLAG_LOCKED, &arp_llreach_base, LL_BASE_REACHABLE, + "default ARP link-layer reachability max lifetime (in seconds)"); struct llinfo_arp { /* @@ -121,7 +133,10 @@ struct llinfo_arp { * The following are protected by rt_lock */ struct mbuf *la_hold; /* last packet until resolved/timeout */ - int32_t la_asked; /* last time we QUERIED for this addr */ + struct if_llreach *la_llreach; /* link-layer reachability record */ + u_int64_t la_lastused; /* last used timestamp */ + u_int32_t la_asked; /* # of requests sent */ + u_int32_t la_persist; /* expirable, but stays around */ }; /* @@ -140,7 +155,7 @@ struct llinfo_arp { * * - Routing lock (rnh_lock) * - * la_hold, la_asked + * la_hold, la_asked, la_llreach, la_lastused * * - Routing entry lock (rt_lock) * @@ -153,33 +168,36 @@ static LIST_HEAD(, llinfo_arp) llinfo_arp; static int arp_inuse, arp_allocated; -static int arp_maxtries = 5; +static u_int32_t arp_maxtries = 5; static int useloopback = 1; /* use loopback interface for local traffic */ static int arp_proxyall = 0; static int arp_sendllconflict = 0; -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW, +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_RW | CTLFLAG_LOCKED, &arp_maxtries, 0, ""); -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, useloopback, CTLFLAG_RW, +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, useloopback, CTLFLAG_RW | CTLFLAG_LOCKED, &useloopback, 0, ""); -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, 
proxyall, CTLFLAG_RW, +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_RW | CTLFLAG_LOCKED, &arp_proxyall, 0, ""); -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, sendllconflict, CTLFLAG_RW, +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, sendllconflict, CTLFLAG_RW | CTLFLAG_LOCKED, &arp_sendllconflict, 0, ""); -static int log_arp_warnings = 0; +static int log_arp_warnings = 0; /* Thread safe: no accumulated state */ -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_warnings, CTLFLAG_RW, +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_warnings, + CTLFLAG_RW | CTLFLAG_LOCKED, &log_arp_warnings, 0, "log arp warning messages"); -static int keep_announcements = 1; -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, keep_announcements, CTLFLAG_RW, +static int keep_announcements = 1; /* Thread safe: no aging of state */ +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, keep_announcements, + CTLFLAG_RW | CTLFLAG_LOCKED, &keep_announcements, 0, "keep arp announcements"); -static int send_conflicting_probes = 1; -SYSCTL_INT(_net_link_ether_inet, OID_AUTO, send_conflicting_probes, CTLFLAG_RW, +static int send_conflicting_probes = 1; /* Thread safe: no accumulated state */ +SYSCTL_INT(_net_link_ether_inet, OID_AUTO, send_conflicting_probes, + CTLFLAG_RW | CTLFLAG_LOCKED, &send_conflicting_probes, 0, "send conflicting link-local arp probes"); @@ -188,6 +206,13 @@ static errno_t arp_lookup_route(const struct in_addr *, int, static void arptimer(void *); static struct llinfo_arp *arp_llinfo_alloc(void); static void arp_llinfo_free(void *); +static void arp_llinfo_purge(struct rtentry *); +static void arp_llinfo_get_ri(struct rtentry *, struct rt_reach_info *); + +static __inline void arp_llreach_use(struct llinfo_arp *); +static __inline int arp_llreach_reachable(struct llinfo_arp *); +static void arp_llreach_alloc(struct rtentry *, struct ifnet *, void *, + unsigned int, boolean_t); extern u_int32_t ipv4_ll_arp_aware; @@ -214,6 +239,7 @@ arp_init(void) panic("%s: failed allocating llinfo_arp_zone", __func__); zone_change(llinfo_arp_zone, Z_EXPAND, TRUE); + zone_change(llinfo_arp_zone, Z_CALLERACCT, FALSE); arpinit_done = 1; @@ -243,9 +269,194 @@ arp_llinfo_free(void *arg) la->la_hold = NULL; } + /* Purge any link-layer info caching */ + VERIFY(la->la_rt->rt_llinfo == la); + if (la->la_rt->rt_llinfo_purge != NULL) + la->la_rt->rt_llinfo_purge(la->la_rt); + zfree(llinfo_arp_zone, la); } +static void +arp_llinfo_purge(struct rtentry *rt) +{ + struct llinfo_arp *la = rt->rt_llinfo; + + RT_LOCK_ASSERT_HELD(rt); + VERIFY(rt->rt_llinfo_purge == arp_llinfo_purge && la != NULL); + + if (la->la_llreach != NULL) { + RT_CONVERT_LOCK(rt); + ifnet_llreach_free(la->la_llreach); + la->la_llreach = NULL; + } + la->la_lastused = 0; +} + +static void +arp_llinfo_get_ri(struct rtentry *rt, struct rt_reach_info *ri) +{ + struct llinfo_arp *la = rt->rt_llinfo; + struct if_llreach *lr = la->la_llreach; + + if (lr == NULL) { + bzero(ri, sizeof (*ri)); + } else { + IFLR_LOCK(lr); + /* Export to rt_reach_info structure */ + ifnet_lr2ri(lr, ri); + /* Export ARP send expiration time */ + ri->ri_snd_expire = ifnet_llreach_up2cal(lr, la->la_lastused); + IFLR_UNLOCK(lr); + } +} + +void +arp_llreach_set_reachable(struct ifnet *ifp, void *addr, unsigned int alen) +{ + /* Nothing more to do if it's disabled */ + if (arp_llreach_base == 0) + return; + + ifnet_llreach_set_reachable(ifp, ETHERTYPE_IP, addr, alen); +} + +static __inline void +arp_llreach_use(struct llinfo_arp *la) +{ + if (la->la_llreach != NULL) + la->la_lastused = 
net_uptime(); +} + +static __inline int +arp_llreach_reachable(struct llinfo_arp *la) +{ + struct if_llreach *lr; + const char *why = NULL; + + /* Nothing more to do if it's disabled; pretend it's reachable */ + if (arp_llreach_base == 0) + return (1); + + if ((lr = la->la_llreach) == NULL) { + /* + * Link-layer reachability record isn't present for this + * ARP entry; pretend it's reachable and use it as is. + */ + return (1); + } else if (ifnet_llreach_reachable(lr)) { + /* + * Record is present, it's not shared with other ARP + * entries and a packet has recently been received + * from the remote host; consider it reachable. + */ + if (lr->lr_reqcnt == 1) + return (1); + + /* Prime it up, if this is the first time */ + if (la->la_lastused == 0) { + VERIFY(la->la_llreach != NULL); + arp_llreach_use(la); + } + + /* + * Record is present and shared with one or more ARP + * entries, and a packet has recently been received + * from the remote host. Since it's shared by more + * than one IP addresses, we can't rely on the link- + * layer reachability alone; consider it reachable if + * this ARP entry has been used "recently." + */ + if (ifnet_llreach_reachable_delta(lr, la->la_lastused)) + return (1); + + why = "has alias(es) and hasn't been used in a while"; + } else { + why = "haven't heard from it in a while"; + } + + if (log_arp_warnings) { + char tmp[MAX_IPv4_STR_LEN]; + u_int64_t now = net_uptime(); + + log(LOG_DEBUG, "%s%d: ARP probe(s) needed for %s; " + "%s [lastused %lld, lastrcvd %lld] secs ago\n", + lr->lr_ifp->if_name, lr->lr_ifp->if_unit, inet_ntop(AF_INET, + &SIN(rt_key(la->la_rt))->sin_addr, tmp, sizeof (tmp)), why, + (la->la_lastused ? (int64_t)(now - la->la_lastused) : -1), + (lr->lr_lastrcvd ? (int64_t)(now - lr->lr_lastrcvd) : -1)); + + } + return (0); +} + +/* + * Obtain a link-layer source cache entry for the sender. + * + * NOTE: This is currently only for ARP/Ethernet. + */ +static void +arp_llreach_alloc(struct rtentry *rt, struct ifnet *ifp, void *addr, + unsigned int alen, boolean_t solicited) +{ + VERIFY(rt->rt_expire == 0 || rt->rt_rmx.rmx_expire != 0); + VERIFY(rt->rt_expire != 0 || rt->rt_rmx.rmx_expire == 0); + if (arp_llreach_base != 0 && + rt->rt_expire != 0 && rt->rt_ifp != lo_ifp && + ifp->if_addrlen == IF_LLREACH_MAXLEN && /* Ethernet */ + alen == ifp->if_addrlen) { + struct llinfo_arp *la = rt->rt_llinfo; + struct if_llreach *lr; + const char *why = NULL, *type = ""; + + /* Become a regular mutex, just in case */ + RT_CONVERT_LOCK(rt); + + if ((lr = la->la_llreach) != NULL) { + type = (solicited ? "ARP reply" : "ARP announcement"); + /* + * If target has changed, create a new record; + * otherwise keep existing record. 
+ */ + IFLR_LOCK(lr); + if (bcmp(addr, lr->lr_key.addr, alen) != 0) { + IFLR_UNLOCK(lr); + /* Purge any link-layer info caching */ + VERIFY(rt->rt_llinfo_purge != NULL); + rt->rt_llinfo_purge(rt); + lr = NULL; + why = " for different target HW address; " + "using new llreach record"; + } else { + lr->lr_probes = 0; /* reset probe count */ + IFLR_UNLOCK(lr); + if (solicited) { + why = " for same target HW address; " + "keeping existing llreach record"; + } + } + } + + if (lr == NULL) { + lr = la->la_llreach = ifnet_llreach_alloc(ifp, + ETHERTYPE_IP, addr, alen, arp_llreach_base); + if (lr != NULL) { + lr->lr_probes = 0; /* reset probe count */ + if (why == NULL) + why = "creating new llreach record"; + } + } + + if (log_arp_warnings && lr != NULL && why != NULL) { + char tmp[MAX_IPv4_STR_LEN]; + + log(LOG_DEBUG, "%s%d: %s%s for %s\n", ifp->if_name, + ifp->if_unit, type, why, inet_ntop(AF_INET, + &SIN(rt_key(rt))->sin_addr, tmp, sizeof (tmp))); + } + } +} + /* * Free an arp entry. */ @@ -264,6 +475,16 @@ arptfree(struct llinfo_arp *la) la->la_asked = 0; rt->rt_flags &= ~RTF_REJECT; RT_UNLOCK(rt); + } else if (la->la_persist) { + /* + * Instead of issuing RTM_DELETE, stop this route entry + * from holding an interface idle reference count; if + * the route is later reused, arp_validate() will revert + * this action. + */ + if (rt->rt_refcnt == 0) + rt_clear_idleref(rt); + RT_UNLOCK(rt); } else { /* * Safe to drop rt_lock and use rt_key, since holding @@ -281,16 +502,18 @@ in_arpdrain(void *ignored_arg) { #pragma unused (ignored_arg) struct llinfo_arp *la, *ola; - struct timeval timenow; + uint64_t timenow; lck_mtx_lock(rnh_lock); la = llinfo_arp.lh_first; - getmicrotime(&timenow); + timenow = net_uptime(); while ((ola = la) != 0) { struct rtentry *rt = la->la_rt; la = la->la_le.le_next; RT_LOCK(rt); - if (rt->rt_expire && rt->rt_expire <= timenow.tv_sec) + VERIFY(rt->rt_expire == 0 || rt->rt_rmx.rmx_expire != 0); + VERIFY(rt->rt_expire != 0 || rt->rt_rmx.rmx_expire == 0); + if (rt->rt_expire && rt->rt_expire <= timenow) arptfree(ola); /* timer has expired, clear */ else RT_UNLOCK(rt); @@ -298,6 +521,20 @@ in_arpdrain(void *ignored_arg) lck_mtx_unlock(rnh_lock); } +void +arp_validate(struct rtentry *rt) +{ + struct llinfo_arp *la = rt->rt_llinfo; + + RT_LOCK_ASSERT_HELD(rt); + /* + * If this is a persistent ARP entry, make it count towards the + * interface idleness just like before arptfree() was called. + */ + if (la->la_persist) + rt_set_idleref(rt); +} + /* * Timeout routine. Age arp_tab entries periodically. */ @@ -322,7 +559,7 @@ arp_rtrequest( struct sockaddr *gate = rt->rt_gateway; struct llinfo_arp *la = rt->rt_llinfo; static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK, 0, 0, 0, 0, 0, {0}}; - struct timeval timenow; + uint64_t timenow; if (!arpinit_done) { panic("%s: ARP has not been initialized", __func__); @@ -333,7 +570,7 @@ arp_rtrequest( if (rt->rt_flags & RTF_GATEWAY) return; - getmicrotime(&timenow); + timenow = net_uptime(); switch (req) { case RTM_ADD: @@ -358,12 +595,14 @@ arp_rtrequest( * In case we're called before 1.0 sec. * has elapsed. */ - rt->rt_expire = MAX(timenow.tv_sec, 1); + rt_setexpire(rt, MAX(timenow, 1)); } break; } /* Announce a new entry if requested. 
*/ if (rt->rt_flags & RTF_ANNOUNCE) { + if (la != NULL) + arp_llreach_use(la); /* Mark use timestamp */ RT_UNLOCK(rt); dlil_send_arp(rt->rt_ifp, ARPOP_REQUEST, SDL(gate), rt_key(rt), NULL, rt_key(rt)); @@ -391,6 +630,8 @@ arp_rtrequest( log(LOG_DEBUG, "%s: malloc failed\n", __func__); break; } + rt->rt_llinfo_get_ri = arp_llinfo_get_ri; + rt->rt_llinfo_purge = arp_llinfo_purge; rt->rt_llinfo_free = arp_llinfo_free; arp_inuse++, arp_allocated++; @@ -402,14 +643,16 @@ arp_rtrequest( /* * This keeps the multicast addresses from showing up * in `arp -a' listings as unresolved. It's not actually - * functional. Then the same for broadcast. + * functional. Then the same for broadcast. For IPv4 + * link-local address, keep the entry around even after + * it has expired. */ if (IN_MULTICAST(ntohl(SIN(rt_key(rt))->sin_addr.s_addr))) { RT_UNLOCK(rt); dlil_resolve_multi(rt->rt_ifp, rt_key(rt), gate, sizeof(struct sockaddr_dl)); RT_LOCK(rt); - rt->rt_expire = 0; + rt_setexpire(rt, 0); } else if (in_broadcast(SIN(rt_key(rt))->sin_addr, rt->rt_ifp)) { struct sockaddr_dl *gate_ll = SDL(gate); @@ -421,35 +664,60 @@ arp_rtrequest( gate_ll->sdl_family = AF_LINK; gate_ll->sdl_len = sizeof(struct sockaddr_dl); /* In case we're called before 1.0 sec. has elapsed */ - rt->rt_expire = MAX(timenow.tv_sec, 1); + rt_setexpire(rt, MAX(timenow, 1)); + } else if (IN_LINKLOCAL(ntohl(SIN(rt_key(rt))->sin_addr.s_addr))) { + /* + * The persistent bit implies that once the ARP + * entry has reached it expiration time, the idle + * reference count to the interface will be released, + * but the ARP entry itself stays in the routing table + * until it is explicitly removed. + */ + la->la_persist = 1; + rt->rt_flags |= RTF_STATIC; } + /* Become a regular mutex, just in case */ + RT_CONVERT_LOCK(rt); + IFA_LOCK_SPIN(rt->rt_ifa); if (SIN(rt_key(rt))->sin_addr.s_addr == (IA_SIN(rt->rt_ifa))->sin_addr.s_addr) { - /* - * This test used to be - * if (loif.if_flags & IFF_UP) - * It allowed local traffic to be forced - * through the hardware by configuring the loopback down. - * However, it causes problems during network configuration - * for boards that can't receive packets they send. - * It is now necessary to clear "useloopback" and remove - * the route to force traffic out to the hardware. - */ - rt->rt_expire = 0; - ifnet_lladdr_copy_bytes(rt->rt_ifp, LLADDR(SDL(gate)), SDL(gate)->sdl_alen = 6); + IFA_UNLOCK(rt->rt_ifa); + /* + * This test used to be + * if (loif.if_flags & IFF_UP) + * It allowed local traffic to be forced through the + * hardware by configuring the loopback down. However, + * it causes problems during network configuration + * for boards that can't receive packets they send. + * It is now necessary to clear "useloopback" and + * remove the route to force traffic out to the + * hardware. + */ + rt_setexpire(rt, 0); + ifnet_lladdr_copy_bytes(rt->rt_ifp, LLADDR(SDL(gate)), + SDL(gate)->sdl_alen = rt->rt_ifp->if_addrlen); if (useloopback) { -#if IFNET_ROUTE_REFCNT - /* Adjust route ref count for the interfaces */ - if (rt->rt_if_ref_fn != NULL && - rt->rt_ifp != lo_ifp) { - rt->rt_if_ref_fn(lo_ifp, 1); - rt->rt_if_ref_fn(rt->rt_ifp, -1); + if (rt->rt_ifp != lo_ifp) { + /* + * Purge any link-layer info caching. + */ + if (rt->rt_llinfo_purge != NULL) + rt->rt_llinfo_purge(rt); + + /* + * Adjust route ref count for the + * interfaces. 
+ */ + if (rt->rt_if_ref_fn != NULL) { + rt->rt_if_ref_fn(lo_ifp, 1); + rt->rt_if_ref_fn(rt->rt_ifp, -1); + } } -#endif /* IFNET_ROUTE_REFCNT */ rt->rt_ifp = lo_ifp; } - + } else { + IFA_UNLOCK(rt->rt_ifa); } break; @@ -466,10 +734,18 @@ arp_rtrequest( LIST_REMOVE(la, la_le); la->la_le.le_next = NULL; la->la_le.le_prev = NULL; + + /* + * Purge any link-layer info caching. + */ + if (rt->rt_llinfo_purge != NULL) + rt->rt_llinfo_purge(rt); + rt->rt_flags &= ~RTF_LLINFO; - if (la->la_hold != NULL) + if (la->la_hold != NULL) { m_freem(la->la_hold); - la->la_hold = NULL; + la->la_hold = NULL; + } } } @@ -518,6 +794,13 @@ arp_lookup_route(const struct in_addr *addr, int create, int proxy, sin.sin_addr.s_addr = addr->s_addr; sin.sin_other = proxy ? SIN_PROXY : 0; + /* + * If the destination is a link-local address, don't + * constrain the lookup (don't scope it). + */ + if (IN_LINKLOCAL(ntohl(addr->s_addr))) + ifscope = IFSCOPE_NONE; + rt = rtalloc1_scoped((struct sockaddr*)&sin, create, 0, ifscope); if (rt == NULL) return (ENETUNREACH); @@ -592,7 +875,7 @@ __private_extern__ errno_t arp_route_to_gateway_route(const struct sockaddr *net_dest, route_t hint0, route_t *out_route) { - struct timeval timenow; + uint64_t timenow; route_t rt = hint0, hint = hint0; errno_t error = 0; @@ -728,9 +1011,11 @@ lookup: } if (rt->rt_flags & RTF_REJECT) { - getmicrotime(&timenow); - if (rt->rt_rmx.rmx_expire == 0 || - timenow.tv_sec < rt->rt_rmx.rmx_expire) { + VERIFY(rt->rt_expire == 0 || rt->rt_rmx.rmx_expire != 0); + VERIFY(rt->rt_expire != 0 || rt->rt_rmx.rmx_expire == 0); + timenow = net_uptime(); + if (rt->rt_expire == 0 || + timenow < rt->rt_expire) { RT_UNLOCK(rt); senderr(rt == hint ? EHOSTDOWN : EHOSTUNREACH); } @@ -774,8 +1059,9 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, route_t route = NULL; /* output route */ errno_t result = 0; struct sockaddr_dl *gateway; - struct llinfo_arp *llinfo; - struct timeval timenow; + struct llinfo_arp *llinfo = NULL; + uint64_t timenow; + int unreachable = 0; if (net_dest->sin_family != AF_INET) return (EAFNOSUPPORT); @@ -849,7 +1135,7 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, RT_LOCK_ASSERT_HELD(route); } - if (result || route == NULL || route->rt_llinfo == NULL) { + if (result || route == NULL || (llinfo = route->rt_llinfo) == NULL) { char tmp[MAX_IPv4_STR_LEN]; /* In case result is 0 but no route, return an error */ @@ -868,13 +1154,22 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, * Now that we have the right route, is it filled in? */ gateway = SDL(route->rt_gateway); - getmicrotime(&timenow); - if ((route->rt_rmx.rmx_expire == 0 || - route->rt_rmx.rmx_expire > timenow.tv_sec) && gateway != NULL && - gateway->sdl_family == AF_LINK && gateway->sdl_alen != 0) { + timenow = net_uptime(); + VERIFY(route->rt_expire == 0 || route->rt_rmx.rmx_expire != 0); + VERIFY(route->rt_expire != 0 || route->rt_rmx.rmx_expire == 0); + if ((route->rt_expire == 0 || + route->rt_expire > timenow) && gateway != NULL && + gateway->sdl_family == AF_LINK && gateway->sdl_alen != 0 && + !(unreachable = !arp_llreach_reachable(llinfo))) { bcopy(gateway, ll_dest, MIN(gateway->sdl_len, ll_dest_len)); result = 0; + arp_llreach_use(llinfo); /* Mark use timestamp */ goto release; + } else if (unreachable) { + /* + * Discard existing answer in case we need to probe. 
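+		 * Zeroing sdl_alen below marks the cached link-layer
+		 * reply invalid, so the resolution path that follows
+		 * falls through to a fresh ARP request.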
+ */ + gateway->sdl_alen = 0; } if (ifp->if_flags & IFF_NOARP) { @@ -885,34 +1180,51 @@ arp_lookup_ip(ifnet_t ifp, const struct sockaddr_in *net_dest, /* * Route wasn't complete/valid. We need to arp. */ - llinfo = route->rt_llinfo; if (packet != NULL) { if (llinfo->la_hold != NULL) m_freem(llinfo->la_hold); llinfo->la_hold = packet; } - if (route->rt_rmx.rmx_expire) { + if (route->rt_expire) { route->rt_flags &= ~RTF_REJECT; if (llinfo->la_asked == 0 || - route->rt_rmx.rmx_expire != timenow.tv_sec) { - route->rt_rmx.rmx_expire = timenow.tv_sec; + route->rt_expire != timenow) { + rt_setexpire(route, timenow); if (llinfo->la_asked++ < arp_maxtries) { struct ifaddr *rt_ifa = route->rt_ifa; - ifaref(rt_ifa); + struct sockaddr *sa; + + /* Become a regular mutex, just in case */ + RT_CONVERT_LOCK(route); + /* Update probe count, if applicable */ + if (llinfo->la_llreach != NULL) { + IFLR_LOCK_SPIN(llinfo->la_llreach); + llinfo->la_llreach->lr_probes++; + IFLR_UNLOCK(llinfo->la_llreach); + } + IFA_LOCK_SPIN(rt_ifa); + IFA_ADDREF_LOCKED(rt_ifa); + sa = rt_ifa->ifa_addr; + IFA_UNLOCK(rt_ifa); + arp_llreach_use(llinfo); /* Mark use timestamp */ RT_UNLOCK(route); dlil_send_arp(ifp, ARPOP_REQUEST, NULL, - rt_ifa->ifa_addr, NULL, - (const struct sockaddr*)net_dest); - ifafree(rt_ifa); + sa, NULL, (const struct sockaddr*)net_dest); + IFA_REMREF(rt_ifa); RT_LOCK(route); result = EJUSTRETURN; goto release; } else { route->rt_flags |= RTF_REJECT; - route->rt_rmx.rmx_expire = rt_expiry(route, - route->rt_rmx.rmx_expire, arpt_down); + rt_setexpire(route, rt_expiry(route, + route->rt_expire, arpt_down)); llinfo->la_asked = 0; + /* + * Clear la_hold; don't free the packet since + * we're not returning EJUSTRETURN; the caller + * will handle the freeing. + */ llinfo->la_hold = NULL; result = EHOSTUNREACH; goto release; @@ -950,52 +1262,61 @@ arp_ip_handle_input( struct ifaddr *ifa; struct in_ifaddr *ia; struct in_ifaddr *best_ia = NULL; + struct sockaddr_in best_ia_sin; route_t route = NULL; char buf[3 * MAX_HW_LEN]; // enough for MAX_HW_LEN byte hw address struct llinfo_arp *llinfo; errno_t error; int created_announcement = 0; int bridged = 0, is_bridge = 0; - + /* Do not respond to requests for 0.0.0.0 */ if (target_ip->sin_addr.s_addr == 0 && arpop == ARPOP_REQUEST) goto done; - - if (ifp->if_bridge) + + if (ifp->if_bridge) bridged = 1; if (ifp->if_type == IFT_BRIDGE) is_bridge = 1; /* * Determine if this ARP is for us - * For a bridge, we want to check the address irrespective + * For a bridge, we want to check the address irrespective * of the receive interface. 
*/ lck_rw_lock_shared(in_ifaddr_rwlock); TAILQ_FOREACH(ia, INADDR_HASH(target_ip->sin_addr.s_addr), ia_hash) { + IFA_LOCK_SPIN(&ia->ia_ifa); if (((bridged && ia->ia_ifp->if_bridge != NULL) || - (ia->ia_ifp == ifp)) && + (ia->ia_ifp == ifp)) && ia->ia_addr.sin_addr.s_addr == target_ip->sin_addr.s_addr) { - best_ia = ia; - ifaref(&best_ia->ia_ifa); - lck_rw_done(in_ifaddr_rwlock); - goto match; + best_ia = ia; + best_ia_sin = best_ia->ia_addr; + IFA_ADDREF_LOCKED(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); + lck_rw_done(in_ifaddr_rwlock); + goto match; } + IFA_UNLOCK(&ia->ia_ifa); } TAILQ_FOREACH(ia, INADDR_HASH(sender_ip->sin_addr.s_addr), ia_hash) { + IFA_LOCK_SPIN(&ia->ia_ifa); if (((bridged && ia->ia_ifp->if_bridge != NULL) || - (ia->ia_ifp == ifp)) && + (ia->ia_ifp == ifp)) && ia->ia_addr.sin_addr.s_addr == sender_ip->sin_addr.s_addr) { - best_ia = ia; - ifaref(&best_ia->ia_ifa); - lck_rw_done(in_ifaddr_rwlock); - goto match; + best_ia = ia; + best_ia_sin = best_ia->ia_addr; + IFA_ADDREF_LOCKED(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); + lck_rw_done(in_ifaddr_rwlock); + goto match; } + IFA_UNLOCK(&ia->ia_ifa); } -#define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia) \ - (ia->ia_ifp->if_bridge == ifp->if_softc && \ +#define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia) \ + (ia->ia_ifp->if_bridge == ifp->if_softc && \ !bcmp(ifnet_lladdr(ia->ia_ifp), ifnet_lladdr(ifp), ifp->if_addrlen) && \ addr == ia->ia_addr.sin_addr.s_addr) /* @@ -1005,14 +1326,20 @@ arp_ip_handle_input( * meant to be destined to the bridge member. */ if (is_bridge) { - TAILQ_FOREACH(ia, INADDR_HASH(target_ip->sin_addr.s_addr), ia_hash) { - if (BDG_MEMBER_MATCHES_ARP(target_ip->sin_addr.s_addr, ifp, ia)) { + TAILQ_FOREACH(ia, INADDR_HASH(target_ip->sin_addr.s_addr), + ia_hash) { + IFA_LOCK_SPIN(&ia->ia_ifa); + if (BDG_MEMBER_MATCHES_ARP(target_ip->sin_addr.s_addr, + ifp, ia)) { ifp = ia->ia_ifp; best_ia = ia; - ifaref(&best_ia->ia_ifa); + best_ia_sin = best_ia->ia_addr; + IFA_ADDREF_LOCKED(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); goto match; } + IFA_UNLOCK(&ia->ia_ifa); } } lck_rw_done(in_ifaddr_rwlock); @@ -1024,10 +1351,15 @@ arp_ip_handle_input( */ ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr->sa_family != AF_INET) + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr->sa_family != AF_INET) { + IFA_UNLOCK(ifa); continue; + } best_ia = (struct in_ifaddr *)ifa; - ifaref(&best_ia->ia_ifa); + best_ia_sin = best_ia->ia_addr; + IFA_ADDREF_LOCKED(ifa); + IFA_UNLOCK(ifa); ifnet_lock_done(ifp); goto match; } @@ -1042,15 +1374,17 @@ arp_ip_handle_input( match: /* If the packet is from this interface, ignore the packet */ - if (!bcmp(CONST_LLADDR(sender_hw), ifnet_lladdr(ifp), sender_hw->sdl_len)) { + if (!bcmp(CONST_LLADDR(sender_hw), ifnet_lladdr(ifp), sender_hw->sdl_alen)) { goto done; } /* Check for a conflict */ - if (!bridged && sender_ip->sin_addr.s_addr == best_ia->ia_addr.sin_addr.s_addr) { + if (!bridged && sender_ip->sin_addr.s_addr == best_ia_sin.sin_addr.s_addr) { struct kev_msg ev_msg; struct kev_in_collision *in_collision; u_char storage[sizeof(struct kev_in_collision) + MAX_HW_LEN]; + bzero(&ev_msg, sizeof(struct kev_msg)); + bzero(storage, (sizeof(struct kev_in_collision) + MAX_HW_LEN)); in_collision = (struct kev_in_collision*)storage; log(LOG_ERR, "%s%d duplicate IP address %s sent from address %s\n", ifp->if_name, ifp->if_unit, @@ -1083,7 +1417,7 @@ match: * entry locked, upon success. 
*/ error = arp_lookup_route(&sender_ip->sin_addr, - (target_ip->sin_addr.s_addr == best_ia->ia_addr.sin_addr.s_addr && + (target_ip->sin_addr.s_addr == best_ia_sin.sin_addr.s_addr && sender_ip->sin_addr.s_addr != 0), 0, &route, ifp->if_index); if (error == 0) @@ -1142,6 +1476,9 @@ match: sdl_addr_to_hex(sender_hw, buf, sizeof(buf)), ifp->if_name, ifp->if_unit); } + /* Mark use timestamp */ + if (route->rt_llinfo != NULL) + arp_llreach_use(route->rt_llinfo); /* We're done with the route */ RT_REMREF_LOCKED(route); RT_UNLOCK(route); @@ -1152,21 +1489,19 @@ match: * This will not force the device to pick a new number if the device * has already assigned that number. * This will not imply to the device that we own that address. + * The link address is always present; it's never freed. */ ifnet_lock_shared(ifp); - ifa = TAILQ_FIRST(&ifp->if_addrhead); - if (ifa != NULL) - ifaref(ifa); + ifa = ifp->if_lladdr; + IFA_ADDREF(ifa); ifnet_lock_done(ifp); dlil_send_arp_internal(ifp, ARPOP_REQUEST, - ifa != NULL ? SDL(ifa->ifa_addr) : NULL, + SDL(ifa->ifa_addr), (const struct sockaddr*)sender_ip, sender_hw, (const struct sockaddr*)target_ip); - if (ifa != NULL) { - ifafree(ifa); - ifa = NULL; - } - } + IFA_REMREF(ifa); + ifa = NULL; + } } goto respond; } else if (keep_announcements != 0 @@ -1203,6 +1538,8 @@ match: } RT_LOCK_ASSERT_HELD(route); + VERIFY(route->rt_expire == 0 || route->rt_rmx.rmx_expire != 0); + VERIFY(route->rt_expire != 0 || route->rt_rmx.rmx_expire == 0); gateway = SDL(route->rt_gateway); if (!bridged && route->rt_ifp != ifp) { if (!IN_LINKLOCAL(ntohl(sender_ip->sin_addr.s_addr)) || (ifp->if_eflags & IFEF_ARPLL) == 0) { @@ -1218,7 +1555,7 @@ match: } else { /* Don't change a permanent address */ - if (route->rt_rmx.rmx_expire == 0) { + if (route->rt_expire == 0) { goto respond; } @@ -1249,14 +1586,19 @@ match: lck_mtx_unlock(rnh_lock); goto respond; } -#if IFNET_ROUTE_REFCNT - /* Adjust route ref count for the interfaces */ - if (route->rt_if_ref_fn != NULL && - route->rt_ifp != ifp) { - route->rt_if_ref_fn(ifp, 1); - route->rt_if_ref_fn(route->rt_ifp, -1); + if (route->rt_ifp != ifp) { + /* + * Purge any link-layer info caching. 
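+			 * The route is moving to a different ifp, so any
+			 * cached if_llreach state refers to the old
+			 * interface and must be dropped here.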
+ */ + if (route->rt_llinfo_purge != NULL) + route->rt_llinfo_purge(route); + + /* Adjust route ref count for the interfaces */ + if (route->rt_if_ref_fn != NULL) { + route->rt_if_ref_fn(ifp, 1); + route->rt_if_ref_fn(route->rt_ifp, -1); + } } -#endif /* IFNET_ROUTE_REFCNT */ /* Change the interface when the existing route is on */ route->rt_ifp = ifp; rtsetifa(route, &best_ia->ia_ifa); @@ -1274,7 +1616,7 @@ match: } if (gateway->sdl_alen && bcmp(LLADDR(gateway), CONST_LLADDR(sender_hw), gateway->sdl_alen)) { - if (route->rt_rmx.rmx_expire && log_arp_warnings) { + if (route->rt_expire && log_arp_warnings) { char buf2[3 * MAX_HW_LEN]; log(LOG_INFO, "arp: %s moved from %s to %s on %s%d\n", inet_ntop(AF_INET, &sender_ip->sin_addr, ipv4str, @@ -1283,7 +1625,7 @@ match: sdl_addr_to_hex(sender_hw, buf2, sizeof(buf2)), ifp->if_name, ifp->if_unit); } - else if (route->rt_rmx.rmx_expire == 0) { + else if (route->rt_expire == 0) { if (log_arp_warnings) { log(LOG_ERR, "arp: %s attempts to modify " "permanent entry for %s on %s%d\n", @@ -1302,22 +1644,26 @@ match: bcopy(CONST_LLADDR(sender_hw), LLADDR(gateway), gateway->sdl_alen); /* Update the expire time for the route and clear the reject flag */ - if (route->rt_rmx.rmx_expire) { - struct timeval timenow; - - getmicrotime(&timenow); - route->rt_rmx.rmx_expire = - rt_expiry(route, timenow.tv_sec, arpt_keep); + if (route->rt_expire) { + uint64_t timenow; + + timenow = net_uptime(); + rt_setexpire(route, + rt_expiry(route, timenow, arpt_keep)); } route->rt_flags &= ~RTF_REJECT; + /* cache the gateway (sender HW) address */ + arp_llreach_alloc(route, ifp, LLADDR(gateway), gateway->sdl_alen, + (arpop == ARPOP_REPLY)); + /* update the llinfo, send a queued packet if there is one */ llinfo = route->rt_llinfo; llinfo->la_asked = 0; if (llinfo->la_hold) { struct mbuf *m0; m0 = llinfo->la_hold; - llinfo->la_hold = 0; + llinfo->la_hold = NULL; RT_UNLOCK(route); dlil_output(ifp, PF_INET, m0, (caddr_t)route, rt_key(route), 0); @@ -1327,6 +1673,9 @@ match: respond: if (route != NULL) { + /* Mark use timestamp if we're going to send a reply */ + if (arpop == ARPOP_REQUEST && route->rt_llinfo != NULL) + arp_llreach_use(route->rt_llinfo); RT_REMREF_LOCKED(route); RT_UNLOCK(route); route = NULL; @@ -1336,7 +1685,7 @@ respond: goto done; /* If we are not the target, check if we should proxy */ - if (target_ip->sin_addr.s_addr != best_ia->ia_addr.sin_addr.s_addr) { + if (target_ip->sin_addr.s_addr != best_ia_sin.sin_addr.s_addr) { /* * Find a proxy route; callee holds a reference on the * route and returns with the route entry locked, upon @@ -1390,6 +1739,9 @@ respond: goto done; } } + /* Mark use timestamp */ + if (route->rt_llinfo != NULL) + arp_llreach_use(route->rt_llinfo); RT_REMREF_LOCKED(route); RT_UNLOCK(route); } @@ -1400,16 +1752,19 @@ respond: done: if (best_ia != NULL) - ifafree(&best_ia->ia_ifa); + IFA_REMREF(&best_ia->ia_ifa); return 0; } void -arp_ifinit( - struct ifnet *ifp, - struct ifaddr *ifa) +arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa) { + struct sockaddr *sa; + + IFA_LOCK(ifa); ifa->ifa_rtrequest = arp_rtrequest; ifa->ifa_flags |= RTF_CLONING; - dlil_send_arp(ifp, ARPOP_REQUEST, NULL, ifa->ifa_addr, NULL, ifa->ifa_addr); + sa = ifa->ifa_addr; + IFA_UNLOCK(ifa); + dlil_send_arp(ifp, ARPOP_REQUEST, NULL, sa, NULL, sa); } diff --git a/bsd/netinet/in_arp.h b/bsd/netinet/in_arp.h index 9b1a740ac..99a106572 100644 --- a/bsd/netinet/in_arp.h +++ b/bsd/netinet/in_arp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 Apple Inc. All rights reserved. 
+ * Copyright (c) 2009-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -67,8 +67,11 @@ extern errno_t inet_arp_lookup(ifnet_t interface, size_t ll_dest_len, route_t hint, mbuf_t packet); #endif /* BSD_KERNEL_PRIVATE */ #ifdef KERNEL_PRIVATE +struct in_addr; extern void arp_init(void); extern void in_arpdrain(void *); +extern void arp_validate(struct rtentry *); +extern void arp_llreach_set_reachable(struct ifnet *, void *, unsigned int); /* arp_lookup_ip is obsolete, use inet_arp_lookup */ extern errno_t arp_lookup_ip(ifnet_t interface, const struct sockaddr_in *ip_dest, struct sockaddr_dl *ll_dest, diff --git a/bsd/netinet/in_cksum.c b/bsd/netinet/in_cksum.c index cf3e3dbca..1fcafd583 100644 --- a/bsd/netinet/in_cksum.c +++ b/bsd/netinet/in_cksum.c @@ -93,7 +93,7 @@ union q_util { u_int64_t q; }; -#define ADDCARRY(x) (x > 65535 ? x -= 65535 : x) +#define ADDCARRY(x) do { if (x > 65535) { x -= 65535; } } while (0) #define REDUCE32 \ { \ @@ -118,7 +118,7 @@ inet_cksum_simple(struct mbuf *m, int len) return (inet_cksum(m, 0, 0, len)); } -inline u_short +u_short in_addword(u_short a, u_short b) { union l_util l_util; @@ -128,7 +128,7 @@ in_addword(u_short a, u_short b) return (sum); } -inline u_short +u_short in_pseudo(u_int a, u_int b, u_int c) { u_int64_t sum; @@ -141,77 +141,7 @@ in_pseudo(u_int a, u_int b, u_int c) } -#if defined(__ppc__) - -extern u_short xsum_assym(u_short *p, int len, u_short xsum, int odd); - -u_int16_t -inet_cksum(struct mbuf *m, unsigned int nxt, unsigned int skip, - unsigned int len) -{ - u_short *w; - u_int32_t sum = 0; - int mlen = 0; - int starting_on_odd = 0; - - KERNEL_DEBUG(DBG_FNC_IN_CKSUM | DBG_FUNC_START, len,0,0,0,0); - - /* sanity check */ - if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.len < skip + len) { - panic("inet_cksum: mbuf len (%d) < off+len (%d+%d)\n", - m->m_pkthdr.len, skip, len); - } - - /* include pseudo header checksum? */ - if (nxt != 0) { - struct ip *iph; - - if (m->m_len < sizeof (struct ip)) - panic("inet_cksum: bad mbuf chain"); - - iph = mtod(m, struct ip *); - sum = in_pseudo(iph->ip_src.s_addr, iph->ip_dst.s_addr, - htonl(len + nxt)); - } - - if (skip != 0) { - for (; skip && m; m = m->m_next) { - if (m->m_len > skip) { - mlen = m->m_len - skip; - w = (u_short *)(m->m_data+skip); - goto skip_start; - } else { - skip -= m->m_len; - } - } - } - - for (;m && len; m = m->m_next) { - if (m->m_len == 0) - continue; - mlen = m->m_len; - w = mtod(m, u_short *); - -skip_start: - if (len < mlen) - mlen = len; - sum = xsum_assym(w, mlen, sum, starting_on_odd); - len -= mlen; - if (mlen & 0x1) - { - if (starting_on_odd) - starting_on_odd = 0; - else - starting_on_odd = 1; - } - } - - KERNEL_DEBUG(DBG_FNC_IN_CKSUM | DBG_FUNC_END, 0,0,0,0,0); - - return (~sum & 0xffff); -} - -#elif defined(__arm__) && __ARM_ARCH__ >= 6 +#if defined(__arm__) && __ARM_ARCH__ >= 6 extern int cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum); diff --git a/bsd/netinet/in_dhcp.c b/bsd/netinet/in_dhcp.c index c6fdffdd8..90fc06ae5 100644 --- a/bsd/netinet/in_dhcp.c +++ b/bsd/netinet/in_dhcp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1988-2007 Apple Inc. All rights reserved. + * Copyright (c) 1988-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -281,12 +281,6 @@ link_print(struct sockaddr_dl * dl_p) { int i; -#if 0 - printf("len %d index %d family %d type 0x%x nlen %d alen %d" - " slen %d addr ", dl_p->sdl_len, - dl_p->sdl_index, dl_p->sdl_family, dl_p->sdl_type, - dl_p->sdl_nlen, dl_p->sdl_alen, dl_p->sdl_slen); -#endif for (i = 0; i < dl_p->sdl_alen; i++) printf("%s%x", i ? ":" : "", (link_address(dl_p))[i]); @@ -297,19 +291,7 @@ link_print(struct sockaddr_dl * dl_p) static struct sockaddr_dl * link_from_ifnet(struct ifnet * ifp) { - struct ifaddr * addr; - - ifnet_lock_shared(ifp); - TAILQ_FOREACH(addr, &ifp->if_addrhead, ifa_link) { - if (addr->ifa_addr->sa_family == AF_LINK) { - struct sockaddr_dl * dl_p = (struct sockaddr_dl *)(addr->ifa_addr); - - ifnet_lock_done(ifp); - return (dl_p); - } - } - ifnet_lock_done(ifp); - return (NULL); + return ((struct sockaddr_dl *)ifp->if_lladdr->ifa_addr); } /* diff --git a/bsd/netinet/in_gif.c b/bsd/netinet/in_gif.c index 482aef5e4..9a6cb3db6 100644 --- a/bsd/netinet/in_gif.c +++ b/bsd/netinet/in_gif.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -94,7 +94,7 @@ #include <net/net_osdep.h> int ip_gif_ttl = GIF_TTL; -SYSCTL_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_gif_ttl, 0, ""); int @@ -111,7 +111,7 @@ in_gif_output( struct ip iphdr; /* capsule IP header, host byte ordered */ int proto, error; u_int8_t tos; - struct ip_out_args ipoa = { IFSCOPE_NONE }; + struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; if (sin_src == NULL || sin_dst == NULL || sin_src->sin_family != AF_INET || @@ -371,10 +371,13 @@ gif_encapcheck4( { if ((ifnet_flags(ia4->ia_ifa.ifa_ifp) & IFF_BROADCAST) == 0) continue; + IFA_LOCK(&ia4->ia_ifa); if (ip.ip_src.s_addr == ia4->ia_broadaddr.sin_addr.s_addr) { + IFA_UNLOCK(&ia4->ia_ifa); lck_rw_done(in_ifaddr_rwlock); return 0; } + IFA_UNLOCK(&ia4->ia_ifa); } lck_rw_done(in_ifaddr_rwlock); @@ -393,11 +396,6 @@ gif_encapcheck4( if (rt != NULL) RT_LOCK(rt); if (rt == NULL || rt->rt_ifp != m->m_pkthdr.rcvif) { -#if 0 - log(LOG_WARNING, "%s: packet from 0x%x dropped " - "due to ingress filter\n", if_name(&sc->gif_if), - (u_int32_t)ntohl(sin.sin_addr.s_addr)); -#endif if (rt != NULL) { RT_UNLOCK(rt); rtfree(rt); diff --git a/bsd/netinet/in_mcast.c b/bsd/netinet/in_mcast.c new file mode 100644 index 000000000..1854fd26e --- /dev/null +++ b/bsd/netinet/in_mcast.c @@ -0,0 +1,3641 @@ +/* + * Copyright (c) 2010-2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/*- + * Copyright (c) 2007-2009 Bruce Simpson. + * Copyright (c) 2005 Robert N. M. Watson. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * IPv4 multicast socket, group, and socket option processing module. + */ + +#include <sys/cdefs.h> + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/protosw.h> +#include <sys/sysctl.h> +#include <sys/tree.h> +#include <sys/mcache.h> + +#include <kern/zalloc.h> + +#include <pexpert/pexpert.h> + +#include <net/if.h> +#include <net/if_dl.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/in_pcb.h> +#include <netinet/in_var.h> +#include <netinet/ip_var.h> +#include <netinet/igmp_var.h> + +#ifndef __SOCKUNION_DECLARED +union sockunion { + struct sockaddr_storage ss; + struct sockaddr sa; + struct sockaddr_dl sdl; + struct sockaddr_in sin; +}; +typedef union sockunion sockunion_t; +#define __SOCKUNION_DECLARED +#endif /* __SOCKUNION_DECLARED */ + +/* + * Functions with non-static linkage defined in this file should be + * declared in in_var.h: + * imo_multi_filter() + * in_addmulti() + * in_delmulti() + * in_joingroup() + * in_leavegroup() + * and ip_var.h: + * inp_freemoptions() + * inp_getmoptions() + * inp_setmoptions() + * + * XXX: Both carp and pf need to use the legacy (*,G) KPIs in_addmulti() + * and in_delmulti(). 
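+ *
+ * A minimal sketch of that legacy any-source (*,G) pattern, assuming
+ * the FreeBSD-style signatures used by this file ("ifp" is a held
+ * interface; error handling elided):
+ *
+ *	struct in_addr grp;
+ *	struct in_multi *inm;
+ *
+ *	grp.s_addr = htonl(INADDR_ALLHOSTS_GROUP);
+ *	inm = in_addmulti(&grp, ifp);
+ *	if (inm != NULL)
+ *		in_delmulti(inm);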
+ */ +static void imf_commit(struct in_mfilter *); +static int imf_get_source(struct in_mfilter *imf, + const struct sockaddr_in *psin, + struct in_msource **); +static struct in_msource * + imf_graft(struct in_mfilter *, const uint8_t, + const struct sockaddr_in *); +static int imf_prune(struct in_mfilter *, const struct sockaddr_in *); +static void imf_rollback(struct in_mfilter *); +static void imf_reap(struct in_mfilter *); +static int imo_grow(struct ip_moptions *, size_t); +static size_t imo_match_group(const struct ip_moptions *, + const struct ifnet *, const struct sockaddr *); +static struct in_msource * + imo_match_source(const struct ip_moptions *, const size_t, + const struct sockaddr *); +static void ims_merge(struct ip_msource *ims, + const struct in_msource *lims, const int rollback); +static int in_getmulti(struct ifnet *, const struct in_addr *, + struct in_multi **); +static int in_joingroup(struct ifnet *, const struct in_addr *, + struct in_mfilter *, struct in_multi **); +static int inm_get_source(struct in_multi *inm, const in_addr_t haddr, + const int noalloc, struct ip_msource **pims); +static int inm_is_ifp_detached(const struct in_multi *); +static int inm_merge(struct in_multi *, /*const*/ struct in_mfilter *); +static void inm_reap(struct in_multi *); +static struct ip_moptions * + inp_findmoptions(struct inpcb *); +static int inp_get_source_filters(struct inpcb *, struct sockopt *); +static struct ifnet * + inp_lookup_mcast_ifp(const struct inpcb *, + const struct sockaddr_in *, const struct in_addr); +static int inp_block_unblock_source(struct inpcb *, struct sockopt *); +static int inp_set_multicast_if(struct inpcb *, struct sockopt *); +static int inp_set_source_filters(struct inpcb *, struct sockopt *); +static int sysctl_ip_mcast_filters SYSCTL_HANDLER_ARGS; +static struct ifnet * ip_multicast_if(struct in_addr *, unsigned int *); +static __inline__ int ip_msource_cmp(const struct ip_msource *, + const struct ip_msource *); + +SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "IPv4 multicast"); + +static u_long in_mcast_maxgrpsrc = IP_MAX_GROUP_SRC_FILTER; +SYSCTL_LONG(_net_inet_ip_mcast, OID_AUTO, maxgrpsrc, + CTLFLAG_RW | CTLFLAG_LOCKED, &in_mcast_maxgrpsrc, "Max source filters per group"); + +static u_long in_mcast_maxsocksrc = IP_MAX_SOCK_SRC_FILTER; +SYSCTL_LONG(_net_inet_ip_mcast, OID_AUTO, maxsocksrc, + CTLFLAG_RW | CTLFLAG_LOCKED, &in_mcast_maxsocksrc, + "Max source filters per socket"); + +int in_mcast_loop = IP_DEFAULT_MULTICAST_LOOP; +SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RW | CTLFLAG_LOCKED, + &in_mcast_loop, 0, "Loopback multicast datagrams by default"); + +SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters, + CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_ip_mcast_filters, + "Per-interface stack-wide source filters"); + +RB_GENERATE_PREV(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp); + +#define INM_TRACE_HIST_SIZE 32 /* size of trace history */ + +/* For gdb */ +__private_extern__ unsigned int inm_trace_hist_size = INM_TRACE_HIST_SIZE; + +struct in_multi_dbg { + struct in_multi inm; /* in_multi */ + u_int16_t inm_refhold_cnt; /* # of ref */ + u_int16_t inm_refrele_cnt; /* # of rele */ + /* + * Circular lists of inm_addref and inm_remref callers. 
+ */ + ctrace_t inm_refhold[INM_TRACE_HIST_SIZE]; + ctrace_t inm_refrele[INM_TRACE_HIST_SIZE]; + /* + * Trash list linkage + */ + TAILQ_ENTRY(in_multi_dbg) inm_trash_link; +}; + +/* List of trash in_multi entries protected by inm_trash_lock */ +static TAILQ_HEAD(, in_multi_dbg) inm_trash_head; +static decl_lck_mtx_data(, inm_trash_lock); + +#define INM_ZONE_MAX 64 /* maximum elements in zone */ +#define INM_ZONE_NAME "in_multi" /* zone name */ + +#if DEBUG +static unsigned int inm_debug = 1; /* debugging (enabled) */ +#else +static unsigned int inm_debug; /* debugging (disabled) */ +#endif /* !DEBUG */ +static unsigned int inm_size; /* size of zone element */ +static struct zone *inm_zone; /* zone for in_multi */ + +#define IPMS_ZONE_MAX 64 /* maximum elements in zone */ +#define IPMS_ZONE_NAME "ip_msource" /* zone name */ + +static unsigned int ipms_size; /* size of zone element */ +static struct zone *ipms_zone; /* zone for ip_msource */ + +#define INMS_ZONE_MAX 64 /* maximum elements in zone */ +#define INMS_ZONE_NAME "in_msource" /* zone name */ + +static unsigned int inms_size; /* size of zone element */ +static struct zone *inms_zone; /* zone for in_msource */ + +/* Lock group and attribute for in_multihead_lock lock */ +static lck_attr_t *in_multihead_lock_attr; +static lck_grp_t *in_multihead_lock_grp; +static lck_grp_attr_t *in_multihead_lock_grp_attr; + +static decl_lck_rw_data(, in_multihead_lock); +struct in_multihead in_multihead; + +static struct in_multi *in_multi_alloc(int); +static void in_multi_free(struct in_multi *); +static void in_multi_attach(struct in_multi *); +static void inm_trace(struct in_multi *, int); + +static struct ip_msource *ipms_alloc(int); +static void ipms_free(struct ip_msource *); +static struct in_msource *inms_alloc(int); +static void inms_free(struct in_msource *); + +#define IMO_CAST_TO_NONCONST(x) ((struct ip_moptions *)(void *)(uintptr_t)x) +#define INM_CAST_TO_NONCONST(x) ((struct in_multi *)(void *)(uintptr_t)x) + +static __inline int +ip_msource_cmp(const struct ip_msource *a, const struct ip_msource *b) +{ + + if (a->ims_haddr < b->ims_haddr) + return (-1); + if (a->ims_haddr == b->ims_haddr) + return (0); + return (1); +} + +/* + * Inline function which wraps assertions for a valid ifp. + */ +static __inline__ int +inm_is_ifp_detached(const struct in_multi *inm) +{ + VERIFY(inm->inm_ifma != NULL); + VERIFY(inm->inm_ifp == inm->inm_ifma->ifma_ifp); + + return (!ifnet_is_attached(inm->inm_ifp, 0)); +} + +/* + * Initialize an in_mfilter structure to a known state at t0, t1 + * with an empty source filter list. + */ +static __inline__ void +imf_init(struct in_mfilter *imf, const int st0, const int st1) +{ + memset(imf, 0, sizeof(struct in_mfilter)); + RB_INIT(&imf->imf_sources); + imf->imf_st[0] = st0; + imf->imf_st[1] = st1; +} + +/* + * Resize the ip_moptions vector to the next power-of-two minus 1. 
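+ *
+ * For instance, a vector of 15 slots is resized to
+ * ((15 + 1) * 2) - 1 = 31 slots, then 63, and so on, bounded by
+ * IP_MAX_MEMBERSHIPS.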
+ */ +static int +imo_grow(struct ip_moptions *imo, size_t newmax) +{ + struct in_multi **nmships; + struct in_multi **omships; + struct in_mfilter *nmfilters; + struct in_mfilter *omfilters; + size_t idx; + size_t oldmax; + + IMO_LOCK_ASSERT_HELD(imo); + + nmships = NULL; + nmfilters = NULL; + omships = imo->imo_membership; + omfilters = imo->imo_mfilters; + oldmax = imo->imo_max_memberships; + if (newmax == 0) + newmax = ((oldmax + 1) * 2) - 1; + + if (newmax > IP_MAX_MEMBERSHIPS) + return (ETOOMANYREFS); + + if ((nmships = (struct in_multi **)_REALLOC(omships, + sizeof (struct in_multi *) * newmax, M_IPMOPTS, + M_WAITOK | M_ZERO)) == NULL) + return (ENOMEM); + + imo->imo_membership = nmships; + + if ((nmfilters = (struct in_mfilter *)_REALLOC(omfilters, + sizeof (struct in_mfilter) * newmax, M_INMFILTER, + M_WAITOK | M_ZERO)) == NULL) + return (ENOMEM); + + imo->imo_mfilters = nmfilters; + + /* Initialize newly allocated source filter heads. */ + for (idx = oldmax; idx < newmax; idx++) + imf_init(&nmfilters[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); + + imo->imo_max_memberships = newmax; + + return (0); +} + +/* + * Find an IPv4 multicast group entry for this ip_moptions instance + * which matches the specified group, and optionally an interface. + * Return its index into the array, or -1 if not found. + */ +static size_t +imo_match_group(const struct ip_moptions *imo, const struct ifnet *ifp, + const struct sockaddr *group) +{ + const struct sockaddr_in *gsin; + struct in_multi *pinm; + int idx; + int nmships; + + IMO_LOCK_ASSERT_HELD(IMO_CAST_TO_NONCONST(imo)); + + gsin = (const struct sockaddr_in *)group; + + /* The imo_membership array may be lazy allocated. */ + if (imo->imo_membership == NULL || imo->imo_num_memberships == 0) + return (-1); + + nmships = imo->imo_num_memberships; + for (idx = 0; idx < nmships; idx++) { + pinm = imo->imo_membership[idx]; + if (pinm == NULL) + continue; + INM_LOCK(pinm); + if ((ifp == NULL || (pinm->inm_ifp == ifp)) && + in_hosteq(pinm->inm_addr, gsin->sin_addr)) { + INM_UNLOCK(pinm); + break; + } + INM_UNLOCK(pinm); + } + if (idx >= nmships) + idx = -1; + + return (idx); +} + +/* + * Find an IPv4 multicast source entry for this imo which matches + * the given group index for this socket, and source address. + * + * NOTE: This does not check if the entry is in-mode, merely if + * it exists, which may not be the desired behaviour. + */ +static struct in_msource * +imo_match_source(const struct ip_moptions *imo, const size_t gidx, + const struct sockaddr *src) +{ + struct ip_msource find; + struct in_mfilter *imf; + struct ip_msource *ims; + const sockunion_t *psa; + + IMO_LOCK_ASSERT_HELD(IMO_CAST_TO_NONCONST(imo)); + + VERIFY(src->sa_family == AF_INET); + VERIFY(gidx != (size_t)-1 && gidx < imo->imo_num_memberships); + + /* The imo_mfilters array may be lazy allocated. */ + if (imo->imo_mfilters == NULL) + return (NULL); + imf = &imo->imo_mfilters[gidx]; + + /* Source trees are keyed in host byte order. */ + psa = (const sockunion_t *)src; + find.ims_haddr = ntohl(psa->sin.sin_addr.s_addr); + ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); + + return ((struct in_msource *)ims); +} + +/* + * Perform filtering for multicast datagrams on a socket by group and source. + * + * Returns 0 if a datagram should be allowed through, or various error codes + * if the socket was not a member of the group, or the source was muted, etc. 
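+ *
+ * In tabular form, the filtering policy for a group the socket has
+ * joined is (st[0] being the committed socket-layer state of the
+ * matching source entry, if any):
+ *
+ *	group mode	source entry		result
+ *	EXCLUDE 	none			MCAST_PASS
+ *	EXCLUDE 	st[0] == EXCLUDE	MCAST_NOTSMEMBER
+ *	INCLUDE 	none			MCAST_NOTSMEMBER
+ *	INCLUDE 	st[0] == INCLUDE	MCAST_PASS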
+ */
+int
+imo_multi_filter(const struct ip_moptions *imo, const struct ifnet *ifp,
+    const struct sockaddr *group, const struct sockaddr *src)
+{
+	size_t gidx;
+	struct in_msource *ims;
+	int mode;
+
+	IMO_LOCK_ASSERT_HELD(IMO_CAST_TO_NONCONST(imo));
+	VERIFY(ifp != NULL);
+
+	gidx = imo_match_group(imo, ifp, group);
+	if (gidx == (size_t)-1)
+		return (MCAST_NOTGMEMBER);
+
+	/*
+	 * Check if the source was included in an (S,G) join.
+	 * Allow reception on exclusive memberships by default,
+	 * reject reception on inclusive memberships by default.
+	 * Exclude source only if an in-mode exclude filter exists.
+	 * Include source only if an in-mode include filter exists.
+	 * NOTE: We are comparing group state here at IGMP t1 (now)
+	 * with socket-layer t0 (since last downcall).
+	 */
+	mode = imo->imo_mfilters[gidx].imf_st[1];
+	ims = imo_match_source(imo, gidx, src);
+
+	if ((ims == NULL && mode == MCAST_INCLUDE) ||
+	    (ims != NULL && ims->imsl_st[0] == MCAST_EXCLUDE)) {
+		return (MCAST_NOTSMEMBER);
+	}
+
+	return (MCAST_PASS);
+}
+
+int
+imo_clone(struct ip_moptions *from, struct ip_moptions *to)
+{
+	int i, err = 0;
+
+	IMO_LOCK(from);
+	IMO_LOCK(to);
+
+	to->imo_multicast_ifp = from->imo_multicast_ifp;
+	to->imo_multicast_vif = from->imo_multicast_vif;
+	to->imo_multicast_ttl = from->imo_multicast_ttl;
+	to->imo_multicast_loop = from->imo_multicast_loop;
+
+	/*
+	 * We're cloning, so drop any existing memberships and source
+	 * filters on the destination ip_moptions.
+	 */
+	for (i = 0; i < to->imo_num_memberships; ++i) {
+		struct in_mfilter *imf;
+
+		imf = to->imo_mfilters ? &to->imo_mfilters[i] : NULL;
+		if (imf != NULL)
+			imf_leave(imf);
+
+		(void) in_leavegroup(to->imo_membership[i], imf);
+
+		if (imf != NULL)
+			imf_purge(imf);
+
+		INM_REMREF(to->imo_membership[i]);
+		to->imo_membership[i] = NULL;
+	}
+	to->imo_num_memberships = 0;
+
+	VERIFY(to->imo_max_memberships != 0 && from->imo_max_memberships != 0);
+	if (to->imo_max_memberships < from->imo_max_memberships) {
+		/*
+		 * Ensure source and destination ip_moptions memberships
+		 * and source filters arrays are at least equal in size.
+		 */
+		err = imo_grow(to, from->imo_max_memberships);
+		if (err != 0)
+			goto done;
+	}
+	VERIFY(to->imo_max_memberships >= from->imo_max_memberships);
+
+	/*
+	 * Source filtering doesn't apply to OpenTransport sockets,
+	 * so simply hold an additional reference count per membership.
+	 */
+	for (i = 0; i < from->imo_num_memberships; i++) {
+		to->imo_membership[i] = from->imo_membership[i];
+		INM_ADDREF(from->imo_membership[i]);
+		to->imo_num_memberships++;
+	}
+	VERIFY(to->imo_num_memberships == from->imo_num_memberships);
+
+done:
+	IMO_UNLOCK(to);
+	IMO_UNLOCK(from);
+
+	return (err);
+}
+
+/*
+ * Find and return a reference to an in_multi record for (ifp, group),
+ * and bump its reference count.
+ * If one does not exist, try to allocate it, and update link-layer multicast
+ * filters on ifp to listen for group.
+ * Return 0 if successful, otherwise return an appropriate error code.
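+ *
+ * The expected calling pattern, as in in_joingroup() below, is
+ * (sketch; error handling abbreviated):
+ *
+ *	struct in_multi *inm = NULL;
+ *	int error;
+ *
+ *	error = in_getmulti(ifp, &gina, &inm);
+ *	if (error != 0)
+ *		return (error);
+ *	...
+ *	INM_REMREF(inm);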
+ */ +static int +in_getmulti(struct ifnet *ifp, const struct in_addr *group, + struct in_multi **pinm) +{ + struct sockaddr_in gsin; + struct ifmultiaddr *ifma; + struct in_multi *inm; + int error; + + in_multihead_lock_shared(); + IN_LOOKUP_MULTI(group, ifp, inm); + if (inm != NULL) { + INM_LOCK(inm); + VERIFY(inm->inm_reqcnt >= 1); + inm->inm_reqcnt++; + VERIFY(inm->inm_reqcnt != 0); + *pinm = inm; + INM_UNLOCK(inm); + in_multihead_lock_done(); + /* + * We already joined this group; return the inm + * with a refcount held (via lookup) for caller. + */ + return (0); + } + in_multihead_lock_done(); + + bzero(&gsin, sizeof(gsin)); + gsin.sin_family = AF_INET; + gsin.sin_len = sizeof(struct sockaddr_in); + gsin.sin_addr = *group; + + /* + * Check if a link-layer group is already associated + * with this network-layer group on the given ifnet. + */ + error = if_addmulti(ifp, (struct sockaddr *)&gsin, &ifma); + if (error != 0) + return (error); + + /* + * See comments in inm_remref() for access to ifma_protospec. + */ + in_multihead_lock_exclusive(); + IFMA_LOCK(ifma); + if ((inm = ifma->ifma_protospec) != NULL) { + VERIFY(ifma->ifma_addr != NULL); + VERIFY(ifma->ifma_addr->sa_family == AF_INET); + INM_ADDREF(inm); /* for caller */ + IFMA_UNLOCK(ifma); + INM_LOCK(inm); + VERIFY(inm->inm_ifma == ifma); + VERIFY(inm->inm_ifp == ifp); + VERIFY(in_hosteq(inm->inm_addr, *group)); + if (inm->inm_debug & IFD_ATTACHED) { + VERIFY(inm->inm_reqcnt >= 1); + inm->inm_reqcnt++; + VERIFY(inm->inm_reqcnt != 0); + *pinm = inm; + INM_UNLOCK(inm); + in_multihead_lock_done(); + IFMA_REMREF(ifma); + /* + * We lost the race with another thread doing + * in_getmulti(); since this group has already + * been joined; return the inm with a refcount + * held for caller. + */ + return (0); + } + /* + * We lost the race with another thread doing in_delmulti(); + * the inm referring to the ifma has been detached, thus we + * reattach it back to the in_multihead list and return the + * inm with a refcount held for the caller. + */ + in_multi_attach(inm); + VERIFY((inm->inm_debug & + (IFD_ATTACHED | IFD_TRASHED)) == IFD_ATTACHED); + *pinm = inm; + INM_UNLOCK(inm); + in_multihead_lock_done(); + IFMA_REMREF(ifma); + return (0); + } + IFMA_UNLOCK(ifma); + + /* + * A new in_multi record is needed; allocate and initialize it. + * We DO NOT perform an IGMP join as the in_ layer may need to + * push an initial source list down to IGMP to support SSM. + * + * The initial source filter state is INCLUDE, {} as per the RFC. + */ + inm = in_multi_alloc(M_WAITOK); + if (inm == NULL) { + in_multihead_lock_done(); + IFMA_REMREF(ifma); + return (ENOMEM); + } + INM_LOCK(inm); + inm->inm_addr = *group; + inm->inm_ifp = ifp; + inm->inm_igi = IGMP_IFINFO(ifp); + VERIFY(inm->inm_igi != NULL); + IGI_ADDREF(inm->inm_igi); + inm->inm_ifma = ifma; /* keep refcount from if_addmulti() */ + inm->inm_state = IGMP_NOT_MEMBER; + /* + * Pending state-changes per group are subject to a bounds check. + */ + inm->inm_scq.ifq_maxlen = IGMP_MAX_STATE_CHANGES; + inm->inm_st[0].iss_fmode = MCAST_UNDEFINED; + inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; + RB_INIT(&inm->inm_srcs); + *pinm = inm; + in_multi_attach(inm); + VERIFY((inm->inm_debug & (IFD_ATTACHED | IFD_TRASHED)) == IFD_ATTACHED); + INM_ADDREF_LOCKED(inm); /* for caller */ + INM_UNLOCK(inm); + + IFMA_LOCK(ifma); + VERIFY(ifma->ifma_protospec == NULL); + ifma->ifma_protospec = inm; + IFMA_UNLOCK(ifma); + in_multihead_lock_done(); + + return (0); +} + +/* + * Clear recorded source entries for a group. 
+ * Used by the IGMP code.
+ * FIXME: Should reap.
+ */
+void
+inm_clear_recorded(struct in_multi *inm)
+{
+	struct ip_msource *ims;
+
+	INM_LOCK_ASSERT_HELD(inm);
+
+	RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) {
+		if (ims->ims_stp) {
+			ims->ims_stp = 0;
+			--inm->inm_st[1].iss_rec;
+		}
+	}
+	VERIFY(inm->inm_st[1].iss_rec == 0);
+}
+
+/*
+ * Record a source as pending for a Source-Group IGMPv3 query.
+ * This lives here as it modifies the shared tree.
+ *
+ * inm is the group descriptor.
+ * naddr is the address of the source to record, in network byte order.
+ *
+ * If the net.inet.igmp.sgalloc sysctl is non-zero, we will
+ * lazy-allocate a source node in response to an SG query.
+ * Otherwise, no allocation is performed. This saves some memory
+ * with the trade-off that the source will not be reported to the
+ * router if joined in the window between the query response and
+ * the group actually being joined on the local host.
+ *
+ * Return 0 if the source didn't exist or was already marked as recorded.
+ * Return 1 if the source was marked as recorded by this function.
+ * Return <0 if any error occurred (negated errno code).
+ */
+int
+inm_record_source(struct in_multi *inm, const in_addr_t naddr)
+{
+	struct ip_msource find;
+	struct ip_msource *ims, *nims;
+
+	INM_LOCK_ASSERT_HELD(inm);
+
+	find.ims_haddr = ntohl(naddr);
+	ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find);
+	if (ims && ims->ims_stp)
+		return (0);
+	if (ims == NULL) {
+		if (inm->inm_nsrc == in_mcast_maxgrpsrc)
+			return (-ENOSPC);
+		nims = ipms_alloc(M_WAITOK);
+		if (nims == NULL)
+			return (-ENOMEM);
+		nims->ims_haddr = find.ims_haddr;
+		RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims);
+		++inm->inm_nsrc;
+		ims = nims;
+	}
+
+	/*
+	 * Mark the source as recorded and update the recorded
+	 * source count.
+	 */
+	++ims->ims_stp;
+	++inm->inm_st[1].iss_rec;
+
+	return (1);
+}
+
+/*
+ * Look up the in_msource owned by an in_mfilter which matches the given
+ * source address, and return it via *plims.
+ * Lazy-allocate if needed. If this is a new entry its filter state is
+ * undefined at t0.
+ *
+ * imf is the filter set being modified.
+ * psin holds the source address, in network byte order.
+ *
+ * Caller is expected to be holding imo_lock.
+ */
+static int
+imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin,
+    struct in_msource **plims)
+{
+	struct ip_msource find;
+	struct ip_msource *ims;
+	struct in_msource *lims;
+	int error;
+
+	error = 0;
+	ims = NULL;
+	lims = NULL;
+
+	/* key is host byte order */
+	find.ims_haddr = ntohl(psin->sin_addr.s_addr);
+	ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find);
+	lims = (struct in_msource *)ims;
+	if (lims == NULL) {
+		if (imf->imf_nsrc == in_mcast_maxsocksrc)
+			return (ENOSPC);
+		lims = inms_alloc(M_WAITOK);
+		if (lims == NULL)
+			return (ENOMEM);
+		lims->ims_haddr = find.ims_haddr;
+		lims->imsl_st[0] = MCAST_UNDEFINED;
+		RB_INSERT(ip_msource_tree, &imf->imf_sources,
+		    (struct ip_msource *)lims);
+		++imf->imf_nsrc;
+	}
+
+	*plims = lims;
+
+	return (error);
+}
+
+/*
+ * Graft a source entry into an existing socket-layer filter set,
+ * maintaining any required invariants and checking allocations.
+ *
+ * The source is marked as being in the new filter mode at t1.
+ *
+ * Return the pointer to the new node, otherwise return NULL.
+ *
+ * Caller is expected to be holding imo_lock.
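+ *
+ * Together with imf_prune(), imf_rollback(), imf_commit() and
+ * imf_reap(), this forms the socket-layer half of the two-phase
+ * update used throughout this file; in sketch form:
+ *
+ *	ims = imf_graft(imf, fmode, &ssa->sin);	(stage delta at t1)
+ *	error = inm_merge(inm, imf);		(push delta to in_multi)
+ *	if (error == 0)
+ *		error = igmp_change_state(inm);	(IGMP downcall)
+ *	if (error)
+ *		imf_rollback(imf);		(revert t1 to t0)
+ *	else
+ *		imf_commit(imf);		(fold t1 into t0)
+ *	imf_reap(imf);				(drop unreferenced sources)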
+ */ +static struct in_msource * +imf_graft(struct in_mfilter *imf, const uint8_t st1, + const struct sockaddr_in *psin) +{ + struct in_msource *lims; + + lims = inms_alloc(M_WAITOK); + if (lims == NULL) + return (NULL); + lims->ims_haddr = ntohl(psin->sin_addr.s_addr); + lims->imsl_st[0] = MCAST_UNDEFINED; + lims->imsl_st[1] = st1; + RB_INSERT(ip_msource_tree, &imf->imf_sources, + (struct ip_msource *)lims); + ++imf->imf_nsrc; + + return (lims); +} + +/* + * Prune a source entry from an existing socket-layer filter set, + * maintaining any required invariants and checking allocations. + * + * The source is marked as being left at t1, it is not freed. + * + * Return 0 if no error occurred, otherwise return an errno value. + * + * Caller is expected to be holding imo_lock. + */ +static int +imf_prune(struct in_mfilter *imf, const struct sockaddr_in *psin) +{ + struct ip_msource find; + struct ip_msource *ims; + struct in_msource *lims; + + /* key is host byte order */ + find.ims_haddr = ntohl(psin->sin_addr.s_addr); + ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find); + if (ims == NULL) + return (ENOENT); + lims = (struct in_msource *)ims; + lims->imsl_st[1] = MCAST_UNDEFINED; + return (0); +} + +/* + * Revert socket-layer filter set deltas at t1 to t0 state. + * + * Caller is expected to be holding imo_lock. + */ +static void +imf_rollback(struct in_mfilter *imf) +{ + struct ip_msource *ims, *tims; + struct in_msource *lims; + + RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { + lims = (struct in_msource *)ims; + if (lims->imsl_st[0] == lims->imsl_st[1]) { + /* no change at t1 */ + continue; + } else if (lims->imsl_st[0] != MCAST_UNDEFINED) { + /* revert change to existing source at t1 */ + lims->imsl_st[1] = lims->imsl_st[0]; + } else { + /* revert source added t1 */ + IGMP_PRINTF(("%s: free inms %p\n", __func__, lims)); + RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); + inms_free(lims); + imf->imf_nsrc--; + } + } + imf->imf_st[1] = imf->imf_st[0]; +} + +/* + * Mark socket-layer filter set as INCLUDE {} at t1. + * + * Caller is expected to be holding imo_lock. + */ +void +imf_leave(struct in_mfilter *imf) +{ + struct ip_msource *ims; + struct in_msource *lims; + + RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { + lims = (struct in_msource *)ims; + lims->imsl_st[1] = MCAST_UNDEFINED; + } + imf->imf_st[1] = MCAST_INCLUDE; +} + +/* + * Mark socket-layer filter set deltas as committed. + * + * Caller is expected to be holding imo_lock. + */ +static void +imf_commit(struct in_mfilter *imf) +{ + struct ip_msource *ims; + struct in_msource *lims; + + RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { + lims = (struct in_msource *)ims; + lims->imsl_st[0] = lims->imsl_st[1]; + } + imf->imf_st[0] = imf->imf_st[1]; +} + +/* + * Reap unreferenced sources from socket-layer filter set. + * + * Caller is expected to be holding imo_lock. + */ +static void +imf_reap(struct in_mfilter *imf) +{ + struct ip_msource *ims, *tims; + struct in_msource *lims; + + RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { + lims = (struct in_msource *)ims; + if ((lims->imsl_st[0] == MCAST_UNDEFINED) && + (lims->imsl_st[1] == MCAST_UNDEFINED)) { + IGMP_PRINTF(("%s: free inms %p\n", __func__, lims)); + RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); + inms_free(lims); + imf->imf_nsrc--; + } + } +} + +/* + * Purge socket-layer filter set. + * + * Caller is expected to be holding imo_lock. 
+ */ +void +imf_purge(struct in_mfilter *imf) +{ + struct ip_msource *ims, *tims; + struct in_msource *lims; + + RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) { + lims = (struct in_msource *)ims; + IGMP_PRINTF(("%s: free inms %p\n", __func__, lims)); + RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims); + inms_free(lims); + imf->imf_nsrc--; + } + imf->imf_st[0] = imf->imf_st[1] = MCAST_UNDEFINED; + VERIFY(RB_EMPTY(&imf->imf_sources)); +} + +/* + * Look up a source filter entry for a multicast group. + * + * inm is the group descriptor to work with. + * haddr is the host-byte-order IPv4 address to look up. + * noalloc may be non-zero to suppress allocation of sources. + * *pims will be set to the address of the retrieved or allocated source. + * + * Return 0 if successful, otherwise return a non-zero error code. + */ +static int +inm_get_source(struct in_multi *inm, const in_addr_t haddr, + const int noalloc, struct ip_msource **pims) +{ + struct ip_msource find; + struct ip_msource *ims, *nims; +#ifdef IGMP_DEBUG + struct in_addr ia; +#endif + INM_LOCK_ASSERT_HELD(inm); + + find.ims_haddr = haddr; + ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find); + if (ims == NULL && !noalloc) { + if (inm->inm_nsrc == in_mcast_maxgrpsrc) + return (ENOSPC); + nims = ipms_alloc(M_WAITOK); + if (nims == NULL) + return (ENOMEM); + nims->ims_haddr = haddr; + RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims); + ++inm->inm_nsrc; + ims = nims; +#ifdef IGMP_DEBUG + ia.s_addr = htonl(haddr); + IGMP_PRINTF(("%s: allocated %s as %p\n", __func__, + inet_ntoa(ia), ims)); +#endif + } + + *pims = ims; + return (0); +} + +/* + * Helper function to derive the filter mode on a source entry + * from its internal counters. Predicates are: + * A source is only excluded if all listeners exclude it. + * A source is only included if no listeners exclude it, + * and at least one listener includes it. + * May be used by ifmcstat(8). + */ +uint8_t +ims_get_mode(const struct in_multi *inm, const struct ip_msource *ims, + uint8_t t) +{ + INM_LOCK_ASSERT_HELD(INM_CAST_TO_NONCONST(inm)); + + t = !!t; + if (inm->inm_st[t].iss_ex > 0 && + inm->inm_st[t].iss_ex == ims->ims_st[t].ex) + return (MCAST_EXCLUDE); + else if (ims->ims_st[t].in > 0 && ims->ims_st[t].ex == 0) + return (MCAST_INCLUDE); + return (MCAST_UNDEFINED); +} + +/* + * Merge socket-layer source into IGMP-layer source. + * If rollback is non-zero, perform the inverse of the merge. + */ +static void +ims_merge(struct ip_msource *ims, const struct in_msource *lims, + const int rollback) +{ + int n = rollback ? -1 : 1; +#ifdef IGMP_DEBUG + struct in_addr ia; + + ia.s_addr = htonl(ims->ims_haddr); +#endif + + if (lims->imsl_st[0] == MCAST_EXCLUDE) { + IGMP_PRINTF(("%s: t1 ex -= %d on %s\n", + __func__, n, inet_ntoa(ia))); + ims->ims_st[1].ex -= n; + } else if (lims->imsl_st[0] == MCAST_INCLUDE) { + IGMP_PRINTF(("%s: t1 in -= %d on %s\n", + __func__, n, inet_ntoa(ia))); + ims->ims_st[1].in -= n; + } + + if (lims->imsl_st[1] == MCAST_EXCLUDE) { + IGMP_PRINTF(("%s: t1 ex += %d on %s\n", + __func__, n, inet_ntoa(ia))); + ims->ims_st[1].ex += n; + } else if (lims->imsl_st[1] == MCAST_INCLUDE) { + IGMP_PRINTF(("%s: t1 in += %d on %s\n", + __func__, n, inet_ntoa(ia))); + ims->ims_st[1].in += n; + } +} + +/* + * Atomically update the global in_multi state, when a membership's + * filter list is being updated in any way. + * + * imf is the per-inpcb-membership group filter pointer. + * A fake imf may be passed for in-kernel consumers. 
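+ * For instance, in_joingroup() below fakes up an ASM join on behalf
+ * of kernel consumers with:
+ *
+ *	struct in_mfilter timf;
+ *
+ *	imf_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE);
+ *	error = inm_merge(inm, &timf);
+ *
+ * i.e. a transition from no filter at t0 to an exclusive-mode filter
+ * with an empty source list at t1.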
+ * + * XXX This is a candidate for a set-symmetric-difference style loop + * which would eliminate the repeated lookup from root of ims nodes, + * as they share the same key space. + * + * If any error occurred this function will back out of refcounts + * and return a non-zero value. + */ +static int +inm_merge(struct in_multi *inm, /*const*/ struct in_mfilter *imf) +{ + struct ip_msource *ims, *nims; + struct in_msource *lims; + int schanged, error; + int nsrc0, nsrc1; + + INM_LOCK_ASSERT_HELD(inm); + + schanged = 0; + error = 0; + nsrc1 = nsrc0 = 0; + + /* + * Update the source filters first, as this may fail. + * Maintain count of in-mode filters at t0, t1. These are + * used to work out if we transition into ASM mode or not. + * Maintain a count of source filters whose state was + * actually modified by this operation. + */ + RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) { + lims = (struct in_msource *)ims; + if (lims->imsl_st[0] == imf->imf_st[0]) nsrc0++; + if (lims->imsl_st[1] == imf->imf_st[1]) nsrc1++; + if (lims->imsl_st[0] == lims->imsl_st[1]) continue; + error = inm_get_source(inm, lims->ims_haddr, 0, &nims); + ++schanged; + if (error) + break; + ims_merge(nims, lims, 0); + } + if (error) { + struct ip_msource *bims; + + RB_FOREACH_REVERSE_FROM(ims, ip_msource_tree, nims) { + lims = (struct in_msource *)ims; + if (lims->imsl_st[0] == lims->imsl_st[1]) + continue; + (void) inm_get_source(inm, lims->ims_haddr, 1, &bims); + if (bims == NULL) + continue; + ims_merge(bims, lims, 1); + } + goto out_reap; + } + + IGMP_PRINTF(("%s: imf filters in-mode: %d at t0, %d at t1\n", + __func__, nsrc0, nsrc1)); + + /* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */ + if (imf->imf_st[0] == imf->imf_st[1] && + imf->imf_st[1] == MCAST_INCLUDE) { + if (nsrc1 == 0) { + IGMP_PRINTF(("%s: --in on inm at t1\n", __func__)); + --inm->inm_st[1].iss_in; + } + } + + /* Handle filter mode transition on socket. */ + if (imf->imf_st[0] != imf->imf_st[1]) { + IGMP_PRINTF(("%s: imf transition %d to %d\n", + __func__, imf->imf_st[0], imf->imf_st[1])); + + if (imf->imf_st[0] == MCAST_EXCLUDE) { + IGMP_PRINTF(("%s: --ex on inm at t1\n", __func__)); + --inm->inm_st[1].iss_ex; + } else if (imf->imf_st[0] == MCAST_INCLUDE) { + IGMP_PRINTF(("%s: --in on inm at t1\n", __func__)); + --inm->inm_st[1].iss_in; + } + + if (imf->imf_st[1] == MCAST_EXCLUDE) { + IGMP_PRINTF(("%s: ex++ on inm at t1\n", __func__)); + inm->inm_st[1].iss_ex++; + } else if (imf->imf_st[1] == MCAST_INCLUDE && nsrc1 > 0) { + IGMP_PRINTF(("%s: in++ on inm at t1\n", __func__)); + inm->inm_st[1].iss_in++; + } + } + + /* + * Track inm filter state in terms of listener counts. + * If there are any exclusive listeners, stack-wide + * membership is exclusive. + * Otherwise, if only inclusive listeners, stack-wide is inclusive. + * If no listeners remain, state is undefined at t1, + * and the IGMP lifecycle for this group should finish. + */ + if (inm->inm_st[1].iss_ex > 0) { + IGMP_PRINTF(("%s: transition to EX\n", __func__)); + inm->inm_st[1].iss_fmode = MCAST_EXCLUDE; + } else if (inm->inm_st[1].iss_in > 0) { + IGMP_PRINTF(("%s: transition to IN\n", __func__)); + inm->inm_st[1].iss_fmode = MCAST_INCLUDE; + } else { + IGMP_PRINTF(("%s: transition to UNDEF\n", __func__)); + inm->inm_st[1].iss_fmode = MCAST_UNDEFINED; + } + + /* Decrement ASM listener count on transition out of ASM mode. 
*/ + if (imf->imf_st[0] == MCAST_EXCLUDE && nsrc0 == 0) { + if ((imf->imf_st[1] != MCAST_EXCLUDE) || + (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) { + IGMP_PRINTF(("%s: --asm on inm at t1\n", __func__)); + --inm->inm_st[1].iss_asm; + } + } + + /* Increment ASM listener count on transition to ASM mode. */ + if (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 == 0) { + IGMP_PRINTF(("%s: asm++ on inm at t1\n", __func__)); + inm->inm_st[1].iss_asm++; + } + + IGMP_PRINTF(("%s: merged imf %p to inm %p\n", __func__, imf, inm)); + inm_print(inm); + +out_reap: + if (schanged > 0) { + IGMP_PRINTF(("%s: sources changed; reaping\n", __func__)); + inm_reap(inm); + } + return (error); +} + +/* + * Mark an in_multi's filter set deltas as committed. + * Called by IGMP after a state change has been enqueued. + */ +void +inm_commit(struct in_multi *inm) +{ + struct ip_msource *ims; + + INM_LOCK_ASSERT_HELD(inm); + + IGMP_PRINTF(("%s: commit inm %p\n", __func__, inm)); + IGMP_PRINTF(("%s: pre commit:\n", __func__)); + inm_print(inm); + + RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { + ims->ims_st[0] = ims->ims_st[1]; + } + inm->inm_st[0] = inm->inm_st[1]; +} + +/* + * Reap unreferenced nodes from an in_multi's filter set. + */ +static void +inm_reap(struct in_multi *inm) +{ + struct ip_msource *ims, *tims; + + INM_LOCK_ASSERT_HELD(inm); + + RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) { + if (ims->ims_st[0].ex > 0 || ims->ims_st[0].in > 0 || + ims->ims_st[1].ex > 0 || ims->ims_st[1].in > 0 || + ims->ims_stp != 0) + continue; + IGMP_PRINTF(("%s: free ims %p\n", __func__, ims)); + RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims); + ipms_free(ims); + inm->inm_nsrc--; + } +} + +/* + * Purge all source nodes from an in_multi's filter set. + */ +void +inm_purge(struct in_multi *inm) +{ + struct ip_msource *ims, *tims; + + INM_LOCK_ASSERT_HELD(inm); + + RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) { + IGMP_PRINTF(("%s: free ims %p\n", __func__, ims)); + RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims); + ipms_free(ims); + inm->inm_nsrc--; + } +} + +/* + * Join a multicast group; real entry point. + * + * Only preserves atomicity at inm level. + * NOTE: imf argument cannot be const due to sys/tree.h limitations. + * + * If the IGMP downcall fails, the group is not joined, and an error + * code is returned. + */ +static int +in_joingroup(struct ifnet *ifp, const struct in_addr *gina, + /*const*/ struct in_mfilter *imf, struct in_multi **pinm) +{ + struct in_mfilter timf; + struct in_multi *inm = NULL; + int error = 0; + + IGMP_PRINTF(("%s: join %s on %p(%s%d))\n", __func__, + inet_ntoa(*gina), ifp, ifp->if_name, ifp->if_unit)); + + *pinm = NULL; + + /* + * If no imf was specified (i.e. kernel consumer), + * fake one up and assume it is an ASM join. 
+ */ + if (imf == NULL) { + imf_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE); + imf = &timf; + } + + error = in_getmulti(ifp, gina, &inm); + if (error) { + IGMP_PRINTF(("%s: in_getmulti() failure\n", __func__)); + return (error); + } + + IGMP_PRINTF(("%s: merge inm state\n", __func__)); + + INM_LOCK(inm); + error = inm_merge(inm, imf); + if (error) { + IGMP_PRINTF(("%s: failed to merge inm state\n", __func__)); + goto out_inm_release; + } + + IGMP_PRINTF(("%s: doing igmp downcall\n", __func__)); + error = igmp_change_state(inm); + if (error) { + IGMP_PRINTF(("%s: failed to update source\n", __func__)); + goto out_inm_release; + } + +out_inm_release: + if (error) { + IGMP_PRINTF(("%s: dropping ref on %p\n", __func__, inm)); + INM_UNLOCK(inm); + INM_REMREF(inm); + } else { + INM_UNLOCK(inm); + *pinm = inm; /* keep refcount from in_getmulti() */ + } + + return (error); +} + +/* + * Leave a multicast group; real entry point. + * All source filters will be expunged. + * + * Only preserves atomicity at inm level. + * + * Note: This is not the same as inm_release(*) as this function also + * makes a state change downcall into IGMP. + */ +int +in_leavegroup(struct in_multi *inm, /*const*/ struct in_mfilter *imf) +{ + struct in_mfilter timf; + int error, lastref; + + error = 0; + + INM_LOCK_ASSERT_NOTHELD(inm); + + in_multihead_lock_exclusive(); + INM_LOCK(inm); + + IGMP_PRINTF(("%s: leave inm %p, %s/%s%d, imf %p\n", __func__, + inm, inet_ntoa(inm->inm_addr), + (inm_is_ifp_detached(inm) ? "null" : inm->inm_ifp->if_name), + inm->inm_ifp->if_unit, imf)); + + /* + * If no imf was specified (i.e. kernel consumer), + * fake one up and assume it is an ASM join. + */ + if (imf == NULL) { + imf_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED); + imf = &timf; + } + + /* + * Begin state merge transaction at IGMP layer. + * + * As this particular invocation should not cause any memory + * to be allocated, and there is no opportunity to roll back + * the transaction, it MUST NOT fail. + */ + IGMP_PRINTF(("%s: merge inm state\n", __func__)); + + error = inm_merge(inm, imf); + KASSERT(error == 0, ("%s: failed to merge inm state\n", __func__)); + + IGMP_PRINTF(("%s: doing igmp downcall\n", __func__)); + error = igmp_change_state(inm); +#if IGMP_DEBUG + if (error) + IGMP_PRINTF(("%s: failed igmp downcall\n", __func__)); +#endif + lastref = in_multi_detach(inm); + VERIFY(!lastref || (!(inm->inm_debug & IFD_ATTACHED) && + inm->inm_reqcnt == 0)); + INM_UNLOCK(inm); + in_multihead_lock_done(); + + if (lastref) + INM_REMREF(inm); /* for in_multihead list */ + + return (error); +} + +/* + * Join an IPv4 multicast group in (*,G) exclusive mode. + * The group must be a 224.0.0.0/24 link-scope group. + * This KPI is for legacy kernel consumers only. + */ +struct in_multi * +in_addmulti(struct in_addr *ap, struct ifnet *ifp) +{ + struct in_multi *pinm = NULL; + int error; + + KASSERT(IN_LOCAL_GROUP(ntohl(ap->s_addr)), + ("%s: %s not in 224.0.0.0/24\n", __func__, inet_ntoa(*ap))); + + error = in_joingroup(ifp, ap, NULL, &pinm); + VERIFY(pinm != NULL || error != 0); + + return (pinm); +} + +/* + * Leave an IPv4 multicast group, assumed to be in exclusive (*,G) mode. + * This KPI is for legacy kernel consumers only. + */ +void +in_delmulti(struct in_multi *inm) +{ + + (void) in_leavegroup(inm, NULL); +} + +/* + * Block or unblock an ASM multicast source on an inpcb. + * This implements the delta-based API described in RFC 3678. + * + * The delta-based API applies only to exclusive-mode memberships. 
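+ *
+ * A minimal userland sketch of this API (addresses, and the prior
+ * ASM join, are illustrative; error checking omitted):
+ *
+ *	struct ip_mreq mreq;
+ *	struct ip_mreq_source mreqs;
+ *
+ *	mreq.imr_multiaddr.s_addr = inet_addr("239.1.2.3");
+ *	mreq.imr_interface.s_addr = htonl(INADDR_ANY);
+ *	setsockopt(so, IPPROTO_IP, IP_ADD_MEMBERSHIP,
+ *	    &mreq, sizeof (mreq));
+ *
+ *	mreqs.imr_multiaddr.s_addr = inet_addr("239.1.2.3");
+ *	mreqs.imr_sourceaddr.s_addr = inet_addr("192.0.2.1");
+ *	mreqs.imr_interface.s_addr = htonl(INADDR_ANY);
+ *	setsockopt(so, IPPROTO_IP, IP_BLOCK_SOURCE,
+ *	    &mreqs, sizeof (mreqs));
+ *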
+ * An IGMP downcall will be performed. + * + * Return 0 if successful, otherwise return an appropriate error code. + */ +static int +inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) +{ + struct group_source_req gsr; + sockunion_t *gsa, *ssa; + struct ifnet *ifp; + struct in_mfilter *imf; + struct ip_moptions *imo; + struct in_msource *ims; + struct in_multi *inm; + size_t idx; + uint16_t fmode; + int error, doblock; + unsigned int ifindex = 0; + + ifp = NULL; + error = 0; + doblock = 0; + + memset(&gsr, 0, sizeof(struct group_source_req)); + gsa = (sockunion_t *)&gsr.gsr_group; + ssa = (sockunion_t *)&gsr.gsr_source; + + switch (sopt->sopt_name) { + case IP_BLOCK_SOURCE: + case IP_UNBLOCK_SOURCE: { + struct ip_mreq_source mreqs; + + error = sooptcopyin(sopt, &mreqs, + sizeof(struct ip_mreq_source), + sizeof(struct ip_mreq_source)); + if (error) + return (error); + + gsa->sin.sin_family = AF_INET; + gsa->sin.sin_len = sizeof(struct sockaddr_in); + gsa->sin.sin_addr = mreqs.imr_multiaddr; + + ssa->sin.sin_family = AF_INET; + ssa->sin.sin_len = sizeof(struct sockaddr_in); + ssa->sin.sin_addr = mreqs.imr_sourceaddr; + + if (!in_nullhost(mreqs.imr_interface)) + ifp = ip_multicast_if(&mreqs.imr_interface, &ifindex); + + if (sopt->sopt_name == IP_BLOCK_SOURCE) + doblock = 1; + + IGMP_PRINTF(("%s: imr_interface = %s, ifp = %p\n", + __func__, inet_ntoa(mreqs.imr_interface), ifp)); + break; + } + + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + error = sooptcopyin(sopt, &gsr, + sizeof(struct group_source_req), + sizeof(struct group_source_req)); + if (error) + return (error); + + if (gsa->sin.sin_family != AF_INET || + gsa->sin.sin_len != sizeof(struct sockaddr_in)) + return (EINVAL); + + if (ssa->sin.sin_family != AF_INET || + ssa->sin.sin_len != sizeof(struct sockaddr_in)) + return (EINVAL); + + ifnet_head_lock_shared(); + if (gsr.gsr_interface == 0 || + (u_int)if_index < gsr.gsr_interface) { + ifnet_head_done(); + return (EADDRNOTAVAIL); + } + + ifp = ifindex2ifnet[gsr.gsr_interface]; + ifnet_head_done(); + + if (ifp == NULL) + return (EADDRNOTAVAIL); + + if (sopt->sopt_name == MCAST_BLOCK_SOURCE) + doblock = 1; + break; + + default: + IGMP_PRINTF(("%s: unknown sopt_name %d\n", + __func__, sopt->sopt_name)); + return (EOPNOTSUPP); + break; + } + + if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) + return (EINVAL); + + /* + * Check if we are actually a member of this group. + */ + imo = inp_findmoptions(inp); + if (imo == NULL) + return (ENOMEM); + + IMO_LOCK(imo); + idx = imo_match_group(imo, ifp, &gsa->sa); + if (idx == (size_t)-1 || imo->imo_mfilters == NULL) { + error = EADDRNOTAVAIL; + goto out_imo_locked; + } + + VERIFY(imo->imo_mfilters != NULL); + imf = &imo->imo_mfilters[idx]; + inm = imo->imo_membership[idx]; + + /* + * Attempting to use the delta-based API on an + * non exclusive-mode membership is an error. + */ + fmode = imf->imf_st[0]; + if (fmode != MCAST_EXCLUDE) { + error = EINVAL; + goto out_imo_locked; + } + + /* + * Deal with error cases up-front: + * Asked to block, but already blocked; or + * Asked to unblock, but nothing to unblock. + * If adding a new block entry, allocate it. + */ + ims = imo_match_source(imo, idx, &ssa->sa); + if ((ims != NULL && doblock) || (ims == NULL && !doblock)) { + IGMP_PRINTF(("%s: source %s %spresent\n", __func__, + inet_ntoa(ssa->sin.sin_addr), doblock ? "" : "not ")); + error = EADDRNOTAVAIL; + goto out_imo_locked; + } + + /* + * Begin state merge transaction at socket layer. 
+ */ + if (doblock) { + IGMP_PRINTF(("%s: %s source\n", __func__, "block")); + ims = imf_graft(imf, fmode, &ssa->sin); + if (ims == NULL) + error = ENOMEM; + } else { + IGMP_PRINTF(("%s: %s source\n", __func__, "allow")); + error = imf_prune(imf, &ssa->sin); + } + + if (error) { + IGMP_PRINTF(("%s: merge imf state failed\n", __func__)); + goto out_imf_rollback; + } + + /* + * Begin state merge transaction at IGMP layer. + */ + INM_LOCK(inm); + IGMP_PRINTF(("%s: merge inm state\n", __func__)); + error = inm_merge(inm, imf); + if (error) { + IGMP_PRINTF(("%s: failed to merge inm state\n", __func__)); + INM_UNLOCK(inm); + goto out_imf_rollback; + } + + IGMP_PRINTF(("%s: doing igmp downcall\n", __func__)); + error = igmp_change_state(inm); + INM_UNLOCK(inm); +#if IGMP_DEBUG + if (error) + IGMP_PRINTF(("%s: failed igmp downcall\n", __func__)); +#endif + +out_imf_rollback: + if (error) + imf_rollback(imf); + else + imf_commit(imf); + + imf_reap(imf); + +out_imo_locked: + IMO_UNLOCK(imo); + IMO_REMREF(imo); /* from inp_findmoptions() */ + return (error); +} + +/* + * Given an inpcb, return its multicast options structure pointer. + * + * Caller is responsible for locking the inpcb, and releasing the + * extra reference held on the imo, upon a successful return. + */ +static struct ip_moptions * +inp_findmoptions(struct inpcb *inp) +{ + struct ip_moptions *imo; + struct in_multi **immp; + struct in_mfilter *imfp; + size_t idx; + + if ((imo = inp->inp_moptions) != NULL) { + IMO_ADDREF(imo); /* for caller */ + return (imo); + } + + imo = ip_allocmoptions(M_WAITOK); + if (imo == NULL) + return (NULL); + + immp = _MALLOC(sizeof (*immp) * IP_MIN_MEMBERSHIPS, M_IPMOPTS, + M_WAITOK | M_ZERO); + if (immp == NULL) { + IMO_REMREF(imo); + return (NULL); + } + + imfp = _MALLOC(sizeof (struct in_mfilter) * IP_MIN_MEMBERSHIPS, + M_INMFILTER, M_WAITOK | M_ZERO); + if (imfp == NULL) { + _FREE(immp, M_IPMOPTS); + IMO_REMREF(imo); + return (NULL); + } + + imo->imo_multicast_ifp = NULL; + imo->imo_multicast_addr.s_addr = INADDR_ANY; + imo->imo_multicast_vif = -1; + imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + imo->imo_multicast_loop = in_mcast_loop; + imo->imo_num_memberships = 0; + imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; + imo->imo_membership = immp; + + /* Initialize per-group source filters. */ + for (idx = 0; idx < IP_MIN_MEMBERSHIPS; idx++) + imf_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); + + imo->imo_mfilters = imfp; + inp->inp_moptions = imo; /* keep reference from ip_allocmoptions() */ + IMO_ADDREF(imo); /* for caller */ + + return (imo); +} +/* + * Atomically get source filters on a socket for an IPv4 multicast group. 
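+ *
+ * This is the kernel side of the RFC 3678 full-state getter; userland
+ * would typically reach it through the getsourcefilter(3) library
+ * routine, along the lines of (illustrative group, interface and
+ * buffer size; error checking omitted):
+ *
+ *	struct sockaddr_in grp;
+ *	struct sockaddr_storage slist[16];
+ *	uint32_t fmode, nsrcs = 16;
+ *
+ *	grp.sin_family = AF_INET;
+ *	grp.sin_len = sizeof (grp);
+ *	grp.sin_addr.s_addr = inet_addr("239.1.2.3");
+ *	getsourcefilter(so, if_nametoindex("en0"),
+ *	    (struct sockaddr *)&grp, sizeof (grp),
+ *	    &fmode, &nsrcs, slist);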
+ */ +static int +inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt) +{ + struct __msfilterreq64 msfr, msfr64; + struct __msfilterreq32 msfr32; + sockunion_t *gsa; + struct ifnet *ifp; + struct ip_moptions *imo; + struct in_mfilter *imf; + struct ip_msource *ims; + struct in_msource *lims; + struct sockaddr_in *psin; + struct sockaddr_storage *ptss; + struct sockaddr_storage *tss; + int error; + size_t idx, nsrcs, ncsrcs; + user_addr_t tmp_ptr; + + imo = inp->inp_moptions; + VERIFY(imo != NULL); + + if (IS_64BIT_PROCESS(current_proc())) { + error = sooptcopyin(sopt, &msfr64, + sizeof(struct __msfilterreq64), + sizeof(struct __msfilterreq64)); + if (error) + return (error); + /* we never use msfr.msfr_srcs; */ + memcpy(&msfr, &msfr64, sizeof(msfr)); + } else { + error = sooptcopyin(sopt, &msfr32, + sizeof(struct __msfilterreq32), + sizeof(struct __msfilterreq32)); + if (error) + return (error); + /* we never use msfr.msfr_srcs; */ + memcpy(&msfr, &msfr32, sizeof(msfr)); + } + + ifnet_head_lock_shared(); + if (msfr.msfr_ifindex == 0 || (u_int)if_index < msfr.msfr_ifindex) { + ifnet_head_done(); + return (EADDRNOTAVAIL); + } + + ifp = ifindex2ifnet[msfr.msfr_ifindex]; + ifnet_head_done(); + + if (ifp == NULL) + return (EADDRNOTAVAIL); + + if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) + msfr.msfr_nsrcs = in_mcast_maxsocksrc; + + IMO_LOCK(imo); + /* + * Lookup group on the socket. + */ + gsa = (sockunion_t *)&msfr.msfr_group; + idx = imo_match_group(imo, ifp, &gsa->sa); + if (idx == (size_t)-1 || imo->imo_mfilters == NULL) { + IMO_UNLOCK(imo); + return (EADDRNOTAVAIL); + } + imf = &imo->imo_mfilters[idx]; + + /* + * Ignore memberships which are in limbo. + */ + if (imf->imf_st[1] == MCAST_UNDEFINED) { + IMO_UNLOCK(imo); + return (EAGAIN); + } + msfr.msfr_fmode = imf->imf_st[1]; + + /* + * If the user specified a buffer, copy out the source filter + * entries to userland gracefully. + * We only copy out the number of entries which userland + * has asked for, but we always tell userland how big the + * buffer really needs to be. + */ + + if (IS_64BIT_PROCESS(current_proc())) + tmp_ptr = msfr64.msfr_srcs; + else + tmp_ptr = CAST_USER_ADDR_T(msfr32.msfr_srcs); + + tss = NULL; + if (tmp_ptr != USER_ADDR_NULL && msfr.msfr_nsrcs > 0) { + tss = _MALLOC(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, + M_TEMP, M_WAITOK | M_ZERO); + if (tss == NULL) { + IMO_UNLOCK(imo); + return (ENOBUFS); + } + } + + /* + * Count number of sources in-mode at t0. + * If buffer space exists and remains, copy out source entries. 
+	 */
+	nsrcs = msfr.msfr_nsrcs;
+	ncsrcs = 0;
+	ptss = tss;
+	RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
+		lims = (struct in_msource *)ims;
+		if (lims->imsl_st[0] == MCAST_UNDEFINED ||
+		    lims->imsl_st[0] != imf->imf_st[0])
+			continue;
+		if (tss != NULL && nsrcs > 0) {
+			psin = (struct sockaddr_in *)ptss;
+			psin->sin_family = AF_INET;
+			psin->sin_len = sizeof(struct sockaddr_in);
+			psin->sin_addr.s_addr = htonl(lims->ims_haddr);
+			psin->sin_port = 0;
+			++ptss;
+			--nsrcs;
+			++ncsrcs;
+		}
+	}
+
+	IMO_UNLOCK(imo);
+
+	if (tss != NULL) {
+		error = copyout(tss, tmp_ptr,
+		    sizeof(struct sockaddr_storage) * ncsrcs);
+		FREE(tss, M_TEMP);
+		if (error)
+			return (error);
+	}
+
+	msfr.msfr_nsrcs = ncsrcs;
+	if (IS_64BIT_PROCESS(current_proc())) {
+		msfr64.msfr_ifindex = msfr.msfr_ifindex;
+		msfr64.msfr_fmode = msfr.msfr_fmode;
+		msfr64.msfr_nsrcs = msfr.msfr_nsrcs;
+		memcpy(&msfr64.msfr_group, &msfr.msfr_group,
+		    sizeof(struct sockaddr_storage));
+		error = sooptcopyout(sopt, &msfr64,
+		    sizeof(struct __msfilterreq64));
+	} else {
+		msfr32.msfr_ifindex = msfr.msfr_ifindex;
+		msfr32.msfr_fmode = msfr.msfr_fmode;
+		msfr32.msfr_nsrcs = msfr.msfr_nsrcs;
+		memcpy(&msfr32.msfr_group, &msfr.msfr_group,
+		    sizeof(struct sockaddr_storage));
+		error = sooptcopyout(sopt, &msfr32,
+		    sizeof(struct __msfilterreq32));
+	}
+
+	return (error);
+}
+
+/*
+ * Return the IP multicast options in response to user getsockopt().
+ */
+int
+inp_getmoptions(struct inpcb *inp, struct sockopt *sopt)
+{
+	struct ip_mreqn mreqn;
+	struct ip_moptions *imo;
+	struct ifnet *ifp;
+	struct in_ifaddr *ia;
+	int error, optval;
+	unsigned int ifindex;
+	u_char coptval;
+
+	imo = inp->inp_moptions;
+	/*
+	 * If socket is neither of type SOCK_RAW nor SOCK_DGRAM,
+	 * or is a divert socket, reject it.
+ */ + if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || + (inp->inp_socket->so_proto->pr_type != SOCK_RAW && + inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) { + return (EOPNOTSUPP); + } + + error = 0; + switch (sopt->sopt_name) { +#ifdef MROUTING + case IP_MULTICAST_VIF: + if (imo != NULL) { + IMO_LOCK(imo); + optval = imo->imo_multicast_vif; + IMO_UNLOCK(imo); + } else + optval = -1; + error = sooptcopyout(sopt, &optval, sizeof(int)); + break; +#endif /* MROUTING */ + + case IP_MULTICAST_IF: + memset(&mreqn, 0, sizeof(struct ip_mreqn)); + if (imo != NULL) { + IMO_LOCK(imo); + ifp = imo->imo_multicast_ifp; + if (!in_nullhost(imo->imo_multicast_addr)) { + mreqn.imr_address = imo->imo_multicast_addr; + } else if (ifp != NULL) { + mreqn.imr_ifindex = ifp->if_index; + IFP_TO_IA(ifp, ia); + if (ia != NULL) { + IFA_LOCK_SPIN(&ia->ia_ifa); + mreqn.imr_address = + IA_SIN(ia)->sin_addr; + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); + } + } + IMO_UNLOCK(imo); + } + if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { + error = sooptcopyout(sopt, &mreqn, + sizeof(struct ip_mreqn)); + } else { + error = sooptcopyout(sopt, &mreqn.imr_address, + sizeof(struct in_addr)); + } + break; + + case IP_MULTICAST_IFINDEX: + if (imo != NULL) + IMO_LOCK(imo); + if (imo == NULL || imo->imo_multicast_ifp == NULL) { + ifindex = 0; + } else { + ifindex = imo->imo_multicast_ifp->if_index; + } + if (imo != NULL) + IMO_UNLOCK(imo); + error = sooptcopyout(sopt, &ifindex, sizeof (ifindex)); + break; + + case IP_MULTICAST_TTL: + if (imo == NULL) + optval = coptval = IP_DEFAULT_MULTICAST_TTL; + else { + IMO_LOCK(imo); + optval = coptval = imo->imo_multicast_ttl; + IMO_UNLOCK(imo); + } + if (sopt->sopt_valsize == sizeof(u_char)) + error = sooptcopyout(sopt, &coptval, sizeof(u_char)); + else + error = sooptcopyout(sopt, &optval, sizeof(int)); + break; + + case IP_MULTICAST_LOOP: + if (imo == 0) + optval = coptval = IP_DEFAULT_MULTICAST_LOOP; + else { + IMO_LOCK(imo); + optval = coptval = imo->imo_multicast_loop; + IMO_UNLOCK(imo); + } + if (sopt->sopt_valsize == sizeof(u_char)) + error = sooptcopyout(sopt, &coptval, sizeof(u_char)); + else + error = sooptcopyout(sopt, &optval, sizeof(int)); + break; + + case IP_MSFILTER: + if (imo == NULL) { + error = EADDRNOTAVAIL; + } else { + error = inp_get_source_filters(inp, sopt); + } + break; + + default: + error = ENOPROTOOPT; + break; + } + + return (error); +} + +/* + * Look up the ifnet to use for a multicast group membership, + * given the IPv4 address of an interface, and the IPv4 group address. + * + * This routine exists to support legacy multicast applications + * which do not understand that multicast memberships are scoped to + * specific physical links in the networking stack, or which need + * to join link-scope groups before IPv4 addresses are configured. + * + * If inp is non-NULL and is bound to an interface, use this socket's + * inp_boundif for any required routing table lookup. + * + * If the route lookup fails, attempt to use the first non-loopback + * interface with multicast capability in the system as a + * last resort. The legacy IPv4 ASM API requires that we do + * this in order to allow groups to be joined when the routing + * table has not yet been populated during boot. + * + * Returns NULL if no ifp could be found. 
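+ *
+ * E.g. a legacy ASM application which joins with
+ *
+ *	mreq.imr_interface.s_addr = htonl(INADDR_ANY);
+ *
+ * arrives here with in_nullhost(ina) true, and the interface is then
+ * chosen by a scoped route lookup on the group address itself.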
+ * + */ +static struct ifnet * +inp_lookup_mcast_ifp(const struct inpcb *inp, + const struct sockaddr_in *gsin, const struct in_addr ina) +{ + struct ifnet *ifp; + unsigned int ifindex = 0; + + VERIFY(gsin->sin_family == AF_INET); + VERIFY(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr))); + + ifp = NULL; + if (!in_nullhost(ina)) { + struct in_addr new_ina; + memcpy(&new_ina, &ina, sizeof(struct in_addr)); + ifp = ip_multicast_if(&new_ina, &ifindex); + } else { + struct route ro; + unsigned int ifscope = IFSCOPE_NONE; + + if (inp != NULL && (inp->inp_flags & INP_BOUND_IF)) + ifscope = inp->inp_boundif; + + bzero(&ro, sizeof (ro)); + memcpy(&ro.ro_dst, gsin, sizeof(struct sockaddr_in)); + rtalloc_scoped_ign(&ro, 0, ifscope); + if (ro.ro_rt != NULL) { + ifp = ro.ro_rt->rt_ifp; + VERIFY(ifp != NULL); + rtfree(ro.ro_rt); + } else { + struct in_ifaddr *ia; + struct ifnet *mifp; + + mifp = NULL; + lck_rw_lock_shared(in_ifaddr_rwlock); + TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { + IFA_LOCK_SPIN(&ia->ia_ifa); + mifp = ia->ia_ifp; + IFA_UNLOCK(&ia->ia_ifa); + if (!(mifp->if_flags & IFF_LOOPBACK) && + (mifp->if_flags & IFF_MULTICAST)) { + ifp = mifp; + break; + } + } + lck_rw_done(in_ifaddr_rwlock); + } + } + + return (ifp); +} + +/* + * Join an IPv4 multicast group, possibly with a source. + * + * NB: sopt->sopt_val might point to the kernel address space. This means that + * we were called by the IPv6 stack due to the presence of an IPv6 v4 mapped + * address. In this scenario, sopt_p points to kernproc and sooptcopyin() will + * just issue an in-kernel memcpy. + */ +int +inp_join_group(struct inpcb *inp, struct sockopt *sopt) +{ + struct group_source_req gsr; + sockunion_t *gsa, *ssa; + struct ifnet *ifp; + struct in_mfilter *imf; + struct ip_moptions *imo; + struct in_multi *inm = NULL; + struct in_msource *lims; + size_t idx; + int error, is_new; + + ifp = NULL; + imf = NULL; + error = 0; + is_new = 0; + + memset(&gsr, 0, sizeof(struct group_source_req)); + gsa = (sockunion_t *)&gsr.gsr_group; + gsa->ss.ss_family = AF_UNSPEC; + ssa = (sockunion_t *)&gsr.gsr_source; + ssa->ss.ss_family = AF_UNSPEC; + + switch (sopt->sopt_name) { + case IP_ADD_MEMBERSHIP: + case IP_ADD_SOURCE_MEMBERSHIP: { + struct ip_mreq_source mreqs; + + if (sopt->sopt_name == IP_ADD_MEMBERSHIP) { + error = sooptcopyin(sopt, &mreqs, + sizeof(struct ip_mreq), + sizeof(struct ip_mreq)); + /* + * Do argument switcharoo from ip_mreq into + * ip_mreq_source to avoid using two instances. 
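+		 * The two request structures share a common prefix:
+		 *
+		 *	struct ip_mreq        { imr_multiaddr;
+		 *	                        imr_interface; }
+		 *	struct ip_mreq_source { imr_multiaddr;
+		 *	                        imr_sourceaddr;
+		 *	                        imr_interface; }
+		 *
+		 * so after copying an ip_mreq into the front of mreqs,
+		 * the interface address sits in imr_sourceaddr and is
+		 * moved into place below.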
+ */ + mreqs.imr_interface = mreqs.imr_sourceaddr; + mreqs.imr_sourceaddr.s_addr = INADDR_ANY; + } else if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) { + error = sooptcopyin(sopt, &mreqs, + sizeof(struct ip_mreq_source), + sizeof(struct ip_mreq_source)); + } + if (error) { + IGMP_PRINTF(("%s: error copyin IP_ADD_MEMBERSHIP/" + "IP_ADD_SOURCE_MEMBERSHIP %d err=%d\n", + __func__, sopt->sopt_name, error)); + return (error); + } + + gsa->sin.sin_family = AF_INET; + gsa->sin.sin_len = sizeof(struct sockaddr_in); + gsa->sin.sin_addr = mreqs.imr_multiaddr; + + if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) { + ssa->sin.sin_family = AF_INET; + ssa->sin.sin_len = sizeof(struct sockaddr_in); + ssa->sin.sin_addr = mreqs.imr_sourceaddr; + } + + if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) + return (EINVAL); + + ifp = inp_lookup_mcast_ifp(inp, &gsa->sin, + mreqs.imr_interface); + IGMP_PRINTF(("%s: imr_interface = %s, ifp = %p\n", + __func__, inet_ntoa(mreqs.imr_interface), ifp)); + break; + } + + case MCAST_JOIN_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + if (sopt->sopt_name == MCAST_JOIN_GROUP) { + error = sooptcopyin(sopt, &gsr, + sizeof(struct group_req), + sizeof(struct group_req)); + } else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { + error = sooptcopyin(sopt, &gsr, + sizeof(struct group_source_req), + sizeof(struct group_source_req)); + } + if (error) + return (error); + + if (gsa->sin.sin_family != AF_INET || + gsa->sin.sin_len != sizeof(struct sockaddr_in)) + return (EINVAL); + + /* + * Overwrite the port field if present, as the sockaddr + * being copied in may be matched with a binary comparison. + */ + gsa->sin.sin_port = 0; + if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { + if (ssa->sin.sin_family != AF_INET || + ssa->sin.sin_len != sizeof(struct sockaddr_in)) + return (EINVAL); + ssa->sin.sin_port = 0; + } + + if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) + return (EINVAL); + + ifnet_head_lock_shared(); + if (gsr.gsr_interface == 0 || + (u_int)if_index < gsr.gsr_interface) { + ifnet_head_done(); + return (EADDRNOTAVAIL); + } + ifp = ifindex2ifnet[gsr.gsr_interface]; + ifnet_head_done(); + + break; + + default: + IGMP_PRINTF(("%s: unknown sopt_name %d\n", + __func__, sopt->sopt_name)); + return (EOPNOTSUPP); + break; + } + + if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) + return (EADDRNOTAVAIL); + + imo = inp_findmoptions(inp); + if (imo == NULL) + return (ENOMEM); + + IMO_LOCK(imo); + idx = imo_match_group(imo, ifp, &gsa->sa); + if (idx == (size_t)-1) { + is_new = 1; + } else { + inm = imo->imo_membership[idx]; + imf = &imo->imo_mfilters[idx]; + if (ssa->ss.ss_family != AF_UNSPEC) { + /* + * MCAST_JOIN_SOURCE_GROUP on an exclusive membership + * is an error. On an existing inclusive membership, + * it just adds the source to the filter list. + */ + if (imf->imf_st[1] != MCAST_INCLUDE) { + error = EINVAL; + goto out_imo_locked; + } + /* + * Throw out duplicates. + * + * XXX FIXME: This makes a naive assumption that + * even if entries exist for *ssa in this imf, + * they will be rejected as dupes, even if they + * are not valid in the current mode (in-mode). + * + * in_msource is transactioned just as for anything + * else in SSM -- but note naive use of inm_graft() + * below for allocating new filter entries. + * + * This is only an issue if someone mixes the + * full-state SSM API with the delta-based API, + * which is discouraged in the relevant RFCs. 
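+			 *
+			 * For reference, the full-state SSM join handled
+			 * here enters the kernel as, for example
+			 * (illustrative values; error checking omitted):
+			 *
+			 *	struct group_source_req gsr;
+			 *	struct sockaddr_in *g =
+			 *	    (struct sockaddr_in *)&gsr.gsr_group;
+			 *	struct sockaddr_in *s =
+			 *	    (struct sockaddr_in *)&gsr.gsr_source;
+			 *
+			 *	memset(&gsr, 0, sizeof (gsr));
+			 *	gsr.gsr_interface = if_nametoindex("en0");
+			 *	g->sin_family = s->sin_family = AF_INET;
+			 *	g->sin_len = s->sin_len = sizeof (*g);
+			 *	g->sin_addr.s_addr = inet_addr("232.1.1.1");
+			 *	s->sin_addr.s_addr = inet_addr("192.0.2.1");
+			 *	setsockopt(so, IPPROTO_IP,
+			 *	    MCAST_JOIN_SOURCE_GROUP, &gsr, sizeof (gsr));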
+ */ + lims = imo_match_source(imo, idx, &ssa->sa); + if (lims != NULL /*&& + lims->imsl_st[1] == MCAST_INCLUDE*/) { + error = EADDRNOTAVAIL; + goto out_imo_locked; + } + } else { + /* + * MCAST_JOIN_GROUP on an existing exclusive + * membership is an error; return EADDRINUSE + * to preserve 4.4BSD API idempotence, and + * avoid tedious detour to code below. + * NOTE: This is bending RFC 3678 a bit. + * + * On an existing inclusive membership, this is also + * an error; if you want to change filter mode, + * you must use the userland API setsourcefilter(). + * XXX We don't reject this for imf in UNDEFINED + * state at t1, because allocation of a filter + * is atomic with allocation of a membership. + */ + error = EINVAL; + /* See comments above for EADDRINUSE */ + if (imf->imf_st[1] == MCAST_EXCLUDE) + error = EADDRINUSE; + goto out_imo_locked; + } + } + + /* + * Begin state merge transaction at socket layer. + */ + + if (is_new) { + if (imo->imo_num_memberships == imo->imo_max_memberships) { + error = imo_grow(imo, 0); + if (error) + goto out_imo_locked; + } + /* + * Allocate the new slot upfront so we can deal with + * grafting the new source filter in same code path + * as for join-source on existing membership. + */ + idx = imo->imo_num_memberships; + imo->imo_membership[idx] = NULL; + imo->imo_num_memberships++; + VERIFY(imo->imo_mfilters != NULL); + imf = &imo->imo_mfilters[idx]; + VERIFY(RB_EMPTY(&imf->imf_sources)); + } + + /* + * Graft new source into filter list for this inpcb's + * membership of the group. The in_multi may not have + * been allocated yet if this is a new membership, however, + * the in_mfilter slot will be allocated and must be initialized. + */ + if (ssa->ss.ss_family != AF_UNSPEC) { + /* Membership starts in IN mode */ + if (is_new) { + IGMP_PRINTF(("%s: new join w/source\n", __func__)); + imf_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE); + } else { + IGMP_PRINTF(("%s: %s source\n", __func__, "allow")); + } + lims = imf_graft(imf, MCAST_INCLUDE, &ssa->sin); + if (lims == NULL) { + IGMP_PRINTF(("%s: merge imf state failed\n", + __func__)); + error = ENOMEM; + goto out_imo_free; + } + } else { + /* No address specified; Membership starts in EX mode */ + if (is_new) { + IGMP_PRINTF(("%s: new join w/o source\n", __func__)); + imf_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE); + } + } + + /* + * Begin state merge transaction at IGMP layer. 
+ */ + + if (is_new) { + VERIFY(inm == NULL); + error = in_joingroup(ifp, &gsa->sin.sin_addr, imf, &inm); + VERIFY(inm != NULL || error != 0); + if (error) + goto out_imo_free; + imo->imo_membership[idx] = inm; /* from in_joingroup() */ + } else { + IGMP_PRINTF(("%s: merge inm state\n", __func__)); + INM_LOCK(inm); + error = inm_merge(inm, imf); + if (error) { + IGMP_PRINTF(("%s: failed to merge inm state\n", + __func__)); + INM_UNLOCK(inm); + goto out_imf_rollback; + } + IGMP_PRINTF(("%s: doing igmp downcall\n", __func__)); + error = igmp_change_state(inm); + INM_UNLOCK(inm); + if (error) { + IGMP_PRINTF(("%s: failed igmp downcall\n", + __func__)); + goto out_imf_rollback; + } + } + +out_imf_rollback: + if (error) { + imf_rollback(imf); + if (is_new) + imf_purge(imf); + else + imf_reap(imf); + } else { + imf_commit(imf); + } + +out_imo_free: + if (error && is_new) { + VERIFY(inm == NULL); + imo->imo_membership[idx] = NULL; + --imo->imo_num_memberships; + } + +out_imo_locked: + IMO_UNLOCK(imo); + IMO_REMREF(imo); /* from inp_findmoptions() */ + return (error); +} + +/* + * Leave an IPv4 multicast group on an inpcb, possibly with a source. + * + * NB: sopt->sopt_val might point to the kernel address space. Refer to the + * block comment on top of inp_join_group() for more information. + */ +int +inp_leave_group(struct inpcb *inp, struct sockopt *sopt) +{ + struct group_source_req gsr; + struct ip_mreq_source mreqs; + sockunion_t *gsa, *ssa; + struct ifnet *ifp; + struct in_mfilter *imf; + struct ip_moptions *imo; + struct in_msource *ims; + struct in_multi *inm = NULL; + size_t idx; + int error, is_final; + unsigned int ifindex = 0; + + ifp = NULL; + error = 0; + is_final = 1; + + memset(&gsr, 0, sizeof(struct group_source_req)); + gsa = (sockunion_t *)&gsr.gsr_group; + gsa->ss.ss_family = AF_UNSPEC; + ssa = (sockunion_t *)&gsr.gsr_source; + ssa->ss.ss_family = AF_UNSPEC; + + switch (sopt->sopt_name) { + case IP_DROP_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + if (sopt->sopt_name == IP_DROP_MEMBERSHIP) { + error = sooptcopyin(sopt, &mreqs, + sizeof(struct ip_mreq), + sizeof(struct ip_mreq)); + /* + * Swap interface and sourceaddr arguments, + * as ip_mreq and ip_mreq_source are laid + * out differently. + */ + mreqs.imr_interface = mreqs.imr_sourceaddr; + mreqs.imr_sourceaddr.s_addr = INADDR_ANY; + } else if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { + error = sooptcopyin(sopt, &mreqs, + sizeof(struct ip_mreq_source), + sizeof(struct ip_mreq_source)); + } + if (error) + return (error); + + gsa->sin.sin_family = AF_INET; + gsa->sin.sin_len = sizeof(struct sockaddr_in); + gsa->sin.sin_addr = mreqs.imr_multiaddr; + + if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) { + ssa->sin.sin_family = AF_INET; + ssa->sin.sin_len = sizeof(struct sockaddr_in); + ssa->sin.sin_addr = mreqs.imr_sourceaddr; + } + /* + * Attempt to look up hinted ifp from interface address. + * Fallthrough with null ifp iff lookup fails, to + * preserve 4.4BSD mcast API idempotence. + * XXX NOTE WELL: The RFC 3678 API is preferred because + * using an IPv4 address as a key is racy. 
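+ * (The address may be reassigned to a different interface
+ * while the lookup is in flight.)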
+ */ + if (!in_nullhost(mreqs.imr_interface)) + ifp = ip_multicast_if(&mreqs.imr_interface, &ifindex); + + IGMP_PRINTF(("%s: imr_interface = %s, ifp = %p\n", + __func__, inet_ntoa(mreqs.imr_interface), ifp)); + + break; + + case MCAST_LEAVE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + if (sopt->sopt_name == MCAST_LEAVE_GROUP) { + error = sooptcopyin(sopt, &gsr, + sizeof(struct group_req), + sizeof(struct group_req)); + } else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { + error = sooptcopyin(sopt, &gsr, + sizeof(struct group_source_req), + sizeof(struct group_source_req)); + } + if (error) + return (error); + + if (gsa->sin.sin_family != AF_INET || + gsa->sin.sin_len != sizeof(struct sockaddr_in)) + return (EINVAL); + + if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { + if (ssa->sin.sin_family != AF_INET || + ssa->sin.sin_len != sizeof(struct sockaddr_in)) + return (EINVAL); + } + + ifnet_head_lock_shared(); + if (gsr.gsr_interface == 0 || + (u_int)if_index < gsr.gsr_interface) { + ifnet_head_done(); + return (EADDRNOTAVAIL); + } + + ifp = ifindex2ifnet[gsr.gsr_interface]; + ifnet_head_done(); + break; + + default: + IGMP_PRINTF(("%s: unknown sopt_name %d\n", + __func__, sopt->sopt_name)); + return (EOPNOTSUPP); + break; + } + + if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) + return (EINVAL); + + /* + * Find the membership in the membership array. + */ + imo = inp_findmoptions(inp); + if (imo == NULL) + return (ENOMEM); + + IMO_LOCK(imo); + idx = imo_match_group(imo, ifp, &gsa->sa); + if (idx == (size_t)-1) { + error = EADDRNOTAVAIL; + goto out_locked; + } + inm = imo->imo_membership[idx]; + imf = &imo->imo_mfilters[idx]; + + if (ssa->ss.ss_family != AF_UNSPEC) { + IGMP_PRINTF(("%s: opt=%d is_final=0\n", __func__, + sopt->sopt_name)); + is_final = 0; + } + + /* + * Begin state merge transaction at socket layer. + */ + + /* + * If we were instructed only to leave a given source, do so. + * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships. + */ + if (is_final) { + imf_leave(imf); + } else { + if (imf->imf_st[0] == MCAST_EXCLUDE) { + error = EADDRNOTAVAIL; + goto out_locked; + } + ims = imo_match_source(imo, idx, &ssa->sa); + if (ims == NULL) { + IGMP_PRINTF(("%s: source %s %spresent\n", __func__, + inet_ntoa(ssa->sin.sin_addr), "not ")); + error = EADDRNOTAVAIL; + goto out_locked; + } + IGMP_PRINTF(("%s: %s source\n", __func__, "block")); + error = imf_prune(imf, &ssa->sin); + if (error) { + IGMP_PRINTF(("%s: merge imf state failed\n", + __func__)); + goto out_locked; + } + } + + /* + * Begin state merge transaction at IGMP layer. + */ + + if (is_final) { + /* + * Give up the multicast address record to which + * the membership points. Reference held in imo + * will be released below. + */ + (void) in_leavegroup(inm, imf); + } else { + IGMP_PRINTF(("%s: merge inm state\n", __func__)); + INM_LOCK(inm); + error = inm_merge(inm, imf); + if (error) { + IGMP_PRINTF(("%s: failed to merge inm state\n", + __func__)); + INM_UNLOCK(inm); + goto out_imf_rollback; + } + + IGMP_PRINTF(("%s: doing igmp downcall\n", __func__)); + error = igmp_change_state(inm); + if (error) { + IGMP_PRINTF(("%s: failed igmp downcall\n", __func__)); + } + INM_UNLOCK(inm); + } + +out_imf_rollback: + if (error) + imf_rollback(imf); + else + imf_commit(imf); + + imf_reap(imf); + + if (is_final) { + /* Remove the gap in the membership and filter array. 
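+ * The entries that follow are shifted down one slot so the
+ * arrays stay dense for imo_match_group()'s linear scan.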
*/ + VERIFY(inm == imo->imo_membership[idx]); + imo->imo_membership[idx] = NULL; + INM_REMREF(inm); + for (++idx; idx < imo->imo_num_memberships; ++idx) { + imo->imo_membership[idx-1] = imo->imo_membership[idx]; + imo->imo_mfilters[idx-1] = imo->imo_mfilters[idx]; + } + imo->imo_num_memberships--; + } + +out_locked: + IMO_UNLOCK(imo); + IMO_REMREF(imo); /* from inp_findmoptions() */ + return (error); +} + +/* + * Select the interface for transmitting IPv4 multicast datagrams. + * + * Either an instance of struct in_addr or an instance of struct ip_mreqn + * may be passed to this socket option. An address of INADDR_ANY or an + * interface index of 0 is used to remove a previous selection. + * When no interface is selected, one is chosen for every send. + */ +static int +inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) +{ + struct in_addr addr; + struct ip_mreqn mreqn; + struct ifnet *ifp; + struct ip_moptions *imo; + int error = 0 ; + unsigned int ifindex = 0; + + if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) { + /* + * An interface index was specified using the + * Linux-derived ip_mreqn structure. + */ + error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn), + sizeof(struct ip_mreqn)); + if (error) + return (error); + + ifnet_head_lock_shared(); + if (mreqn.imr_ifindex < 0 || if_index < mreqn.imr_ifindex) { + ifnet_head_done(); + return (EINVAL); + } + + if (mreqn.imr_ifindex == 0) { + ifp = NULL; + } else { + ifp = ifindex2ifnet[mreqn.imr_ifindex]; + if (ifp == NULL) { + ifnet_head_done(); + return (EADDRNOTAVAIL); + } + } + ifnet_head_done(); + } else { + /* + * An interface was specified by IPv4 address. + * This is the traditional BSD usage. + */ + error = sooptcopyin(sopt, &addr, sizeof(struct in_addr), + sizeof(struct in_addr)); + if (error) + return (error); + if (in_nullhost(addr)) { + ifp = NULL; + } else { + ifp = ip_multicast_if(&addr, &ifindex); + if (ifp == NULL) { + IGMP_PRINTF(("%s: can't find ifp for addr=%s\n", + __func__, inet_ntoa(addr))); + return (EADDRNOTAVAIL); + } + } +#ifdef IGMP_DEBUG0 + IGMP_PRINTF(("%s: ifp = %p, addr = %s\n", __func__, ifp, + inet_ntoa(addr))); +#endif + } + + /* Reject interfaces which do not support multicast. */ + if (ifp != NULL && (ifp->if_flags & IFF_MULTICAST) == 0) + return (EOPNOTSUPP); + + imo = inp_findmoptions(inp); + if (imo == NULL) + return (ENOMEM); + + IMO_LOCK(imo); + imo->imo_multicast_ifp = ifp; + if (ifindex) + imo->imo_multicast_addr = addr; + else + imo->imo_multicast_addr.s_addr = INADDR_ANY; + IMO_UNLOCK(imo); + IMO_REMREF(imo); /* from inp_findmoptions() */ + + return (0); +} + +/* + * Atomically set source filters on a socket for an IPv4 multicast group. 
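+ * This backs the full-state IP_MSFILTER option: the group's
+ * filter mode and entire source list are replaced in one call.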
+ */ +static int +inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt) +{ + struct __msfilterreq64 msfr, msfr64; + struct __msfilterreq32 msfr32; + sockunion_t *gsa; + struct ifnet *ifp; + struct in_mfilter *imf; + struct ip_moptions *imo; + struct in_multi *inm; + size_t idx; + int error; + user_addr_t tmp_ptr; + + if (IS_64BIT_PROCESS(current_proc())) { + error = sooptcopyin(sopt, &msfr64, + sizeof(struct __msfilterreq64), + sizeof(struct __msfilterreq64)); + if (error) + return (error); + /* we never use msfr.msfr_srcs; */ + memcpy(&msfr, &msfr64, sizeof(msfr)); + } else { + error = sooptcopyin(sopt, &msfr32, + sizeof(struct __msfilterreq32), + sizeof(struct __msfilterreq32)); + if (error) + return (error); + /* we never use msfr.msfr_srcs; */ + memcpy(&msfr, &msfr32, sizeof(msfr)); + } + + if (msfr.msfr_nsrcs > in_mcast_maxsocksrc) + return (ENOBUFS); + + if ((msfr.msfr_fmode != MCAST_EXCLUDE && + msfr.msfr_fmode != MCAST_INCLUDE)) + return (EINVAL); + + if (msfr.msfr_group.ss_family != AF_INET || + msfr.msfr_group.ss_len != sizeof(struct sockaddr_in)) + return (EINVAL); + + gsa = (sockunion_t *)&msfr.msfr_group; + if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr))) + return (EINVAL); + + gsa->sin.sin_port = 0; /* ignore port */ + + ifnet_head_lock_shared(); + if (msfr.msfr_ifindex == 0 || (u_int)if_index < msfr.msfr_ifindex) { + ifnet_head_done(); + return (EADDRNOTAVAIL); + } + + ifp = ifindex2ifnet[msfr.msfr_ifindex]; + ifnet_head_done(); + if (ifp == NULL) + return (EADDRNOTAVAIL); + + /* + * Check if this socket is a member of this group. + */ + imo = inp_findmoptions(inp); + if (imo == NULL) + return (ENOMEM); + + IMO_LOCK(imo); + idx = imo_match_group(imo, ifp, &gsa->sa); + if (idx == (size_t)-1 || imo->imo_mfilters == NULL) { + error = EADDRNOTAVAIL; + goto out_imo_locked; + } + inm = imo->imo_membership[idx]; + imf = &imo->imo_mfilters[idx]; + + /* + * Begin state merge transaction at socket layer. + */ + + imf->imf_st[1] = msfr.msfr_fmode; + + /* + * Apply any new source filters, if present. + * Make a copy of the user-space source vector so + * that we may copy them with a single copyin. This + * allows us to deal with page faults up-front. + */ + if (msfr.msfr_nsrcs > 0) { + struct in_msource *lims; + struct sockaddr_in *psin; + struct sockaddr_storage *kss, *pkss; + int i; + + if (IS_64BIT_PROCESS(current_proc())) + tmp_ptr = msfr64.msfr_srcs; + else + tmp_ptr = CAST_USER_ADDR_T(msfr32.msfr_srcs); + + IGMP_PRINTF(("%s: loading %lu source list entries\n", + __func__, (unsigned long)msfr.msfr_nsrcs)); + kss = _MALLOC(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, + M_TEMP, M_WAITOK); + if (kss == NULL) { + error = ENOMEM; + goto out_imo_locked; + } + error = copyin(tmp_ptr, kss, + sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs); + if (error) { + FREE(kss, M_TEMP); + goto out_imo_locked; + } + + /* + * Mark all source filters as UNDEFINED at t1. + * Restore new group filter mode, as imf_leave() + * will set it to INCLUDE. + */ + imf_leave(imf); + imf->imf_st[1] = msfr.msfr_fmode; + + /* + * Update socket layer filters at t1, lazy-allocating + * new entries. This saves a bunch of memory at the + * cost of one RB_FIND() per source entry; duplicate + * entries in the msfr_nsrcs vector are ignored. + * If we encounter an error, rollback transaction. + * + * XXX This too could be replaced with a set-symmetric + * difference like loop to avoid walking from root + * every time, as the key space is common. 
+ */
+ for (i = 0, pkss = kss; (u_int)i < msfr.msfr_nsrcs;
+ i++, pkss++) {
+ psin = (struct sockaddr_in *)pkss;
+ if (psin->sin_family != AF_INET) {
+ error = EAFNOSUPPORT;
+ break;
+ }
+ if (psin->sin_len != sizeof(struct sockaddr_in)) {
+ error = EINVAL;
+ break;
+ }
+ error = imf_get_source(imf, psin, &lims);
+ if (error)
+ break;
+ lims->imsl_st[1] = imf->imf_st[1];
+ }
+ FREE(kss, M_TEMP);
+ }
+
+ if (error)
+ goto out_imf_rollback;
+
+ /*
+ * Begin state merge transaction at IGMP layer.
+ */
+ INM_LOCK(inm);
+ IGMP_PRINTF(("%s: merge inm state\n", __func__));
+ error = inm_merge(inm, imf);
+ if (error) {
+ IGMP_PRINTF(("%s: failed to merge inm state\n", __func__));
+ INM_UNLOCK(inm);
+ goto out_imf_rollback;
+ }
+
+ IGMP_PRINTF(("%s: doing igmp downcall\n", __func__));
+ error = igmp_change_state(inm);
+ INM_UNLOCK(inm);
+#ifdef IGMP_DEBUG
+ if (error)
+ IGMP_PRINTF(("%s: failed igmp downcall\n", __func__));
+#endif
+
+out_imf_rollback:
+ if (error)
+ imf_rollback(imf);
+ else
+ imf_commit(imf);
+
+ imf_reap(imf);
+
+out_imo_locked:
+ IMO_UNLOCK(imo);
+ IMO_REMREF(imo); /* from inp_findmoptions() */
+
+ return (error);
+}
+
+/*
+ * Set the IP multicast options in response to user setsockopt().
+ *
+ * Many of the socket options handled in this function duplicate the
+ * functionality of socket options in the regular unicast API. However,
+ * it is not possible to merge the duplicate code, because the idempotence
+ * of the IPv4 multicast part of the BSD Sockets API must be preserved;
+ * the effects of these options must be treated as separate and distinct.
+ *
+ * FUTURE: The IP_MULTICAST_VIF option may be eliminated if MROUTING
+ * is refactored to no longer use vifs.
+ */
+int
+inp_setmoptions(struct inpcb *inp, struct sockopt *sopt)
+{
+ struct ip_moptions *imo;
+ int error;
+ unsigned int ifindex;
+ struct ifnet *ifp;
+
+ error = 0;
+
+ /*
+ * If the socket is neither of type SOCK_RAW nor SOCK_DGRAM,
+ * or is a divert socket, reject it.
+ */
+ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
+ (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
+ inp->inp_socket->so_proto->pr_type != SOCK_DGRAM))
+ return (EOPNOTSUPP);
+
+ switch (sopt->sopt_name) {
+#if MROUTING
+ case IP_MULTICAST_VIF: {
+ int vifi;
+ /*
+ * Select a multicast VIF for transmission.
+ * Only useful if multicast forwarding is active.
+ */
+ if (legal_vif_num == NULL) {
+ error = EOPNOTSUPP;
+ break;
+ }
+ error = sooptcopyin(sopt, &vifi, sizeof(int), sizeof(int));
+ if (error)
+ break;
+ if (!legal_vif_num(vifi) && (vifi != -1)) {
+ error = EINVAL;
+ break;
+ }
+ imo = inp_findmoptions(inp);
+ if (imo == NULL) {
+ error = ENOMEM;
+ break;
+ }
+ IMO_LOCK(imo);
+ imo->imo_multicast_vif = vifi;
+ IMO_UNLOCK(imo);
+ IMO_REMREF(imo); /* from inp_findmoptions() */
+ break;
+ }
+#endif
+ case IP_MULTICAST_IF:
+ error = inp_set_multicast_if(inp, sopt);
+ break;
+
+ case IP_MULTICAST_IFINDEX:
+ /*
+ * Select the interface for outgoing multicast packets.
+ */
+ error = sooptcopyin(sopt, &ifindex, sizeof (ifindex),
+ sizeof (ifindex));
+ if (error)
+ break;
+
+ imo = inp_findmoptions(inp);
+ if (imo == NULL) {
+ error = ENOMEM;
+ break;
+ }
+ /*
+ * Index 0 is used to remove a previous selection.
+ * When no interface is selected, a default one is
+ * chosen every time a multicast packet is sent.
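+ * An application would typically clear a previous
+ * selection with, e.g. (illustrative sketch only):
+ *
+ *	u_int idx = 0;
+ *	setsockopt(s, IPPROTO_IP, IP_MULTICAST_IFINDEX,
+ *	    &idx, sizeof (idx));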
+ */
+ if (ifindex == 0) {
+ IMO_LOCK(imo);
+ imo->imo_multicast_ifp = NULL;
+ IMO_UNLOCK(imo);
+ IMO_REMREF(imo); /* from inp_findmoptions() */
+ break;
+ }
+
+ ifnet_head_lock_shared();
+ /* Don't need to check if ifindex is < 0 since it's unsigned */
+ if ((unsigned int)if_index < ifindex) {
+ ifnet_head_done();
+ IMO_REMREF(imo); /* from inp_findmoptions() */
+ error = ENXIO; /* per IPV6_MULTICAST_IF */
+ break;
+ }
+ ifp = ifindex2ifnet[ifindex];
+ ifnet_head_done();
+
+ /* If it's detached or isn't a multicast interface, bail out */
+ if (ifp == NULL || !(ifp->if_flags & IFF_MULTICAST)) {
+ IMO_REMREF(imo); /* from inp_findmoptions() */
+ error = EADDRNOTAVAIL;
+ break;
+ }
+ IMO_LOCK(imo);
+ imo->imo_multicast_ifp = ifp;
+ /*
+ * Clear out any remnants of past IP_MULTICAST_IF. The addr
+ * isn't really used anywhere in the kernel; we could have
+ * iterated through the addresses of the interface and picked
+ * one here, but that is redundant since ip_getmoptions()
+ * already takes care of that for INADDR_ANY.
+ */
+ imo->imo_multicast_addr.s_addr = INADDR_ANY;
+ IMO_UNLOCK(imo);
+ IMO_REMREF(imo); /* from inp_findmoptions() */
+ break;
+
+ case IP_MULTICAST_TTL: {
+ u_char ttl;
+
+ /*
+ * Set the IP time-to-live for outgoing multicast packets.
+ * The original multicast API required a char argument,
+ * which is inconsistent with the rest of the socket API.
+ * We allow either a char or an int.
+ */
+ if (sopt->sopt_valsize == sizeof(u_char)) {
+ error = sooptcopyin(sopt, &ttl, sizeof(u_char),
+ sizeof(u_char));
+ if (error)
+ break;
+ } else {
+ u_int ittl;
+
+ error = sooptcopyin(sopt, &ittl, sizeof(u_int),
+ sizeof(u_int));
+ if (error)
+ break;
+ if (ittl > 255) {
+ error = EINVAL;
+ break;
+ }
+ ttl = (u_char)ittl;
+ }
+ imo = inp_findmoptions(inp);
+ if (imo == NULL) {
+ error = ENOMEM;
+ break;
+ }
+ IMO_LOCK(imo);
+ imo->imo_multicast_ttl = ttl;
+ IMO_UNLOCK(imo);
+ IMO_REMREF(imo); /* from inp_findmoptions() */
+ break;
+ }
+
+ case IP_MULTICAST_LOOP: {
+ u_char loop;
+
+ /*
+ * Set the loopback flag for outgoing multicast packets.
+ * Must be zero or one. The original multicast API required a
+ * char argument, which is inconsistent with the rest
+ * of the socket API. We allow either a char or an int.
+ */
+ if (sopt->sopt_valsize == sizeof(u_char)) {
+ error = sooptcopyin(sopt, &loop, sizeof(u_char),
+ sizeof(u_char));
+ if (error)
+ break;
+ } else {
+ u_int iloop;
+
+ error = sooptcopyin(sopt, &iloop, sizeof(u_int),
+ sizeof(u_int));
+ if (error)
+ break;
+ loop = (u_char)iloop;
+ }
+ imo = inp_findmoptions(inp);
+ if (imo == NULL) {
+ error = ENOMEM;
+ break;
+ }
+ IMO_LOCK(imo);
+ imo->imo_multicast_loop = !!loop;
+ IMO_UNLOCK(imo);
+ IMO_REMREF(imo); /* from inp_findmoptions() */
+ break;
+ }
+
+ case IP_ADD_MEMBERSHIP:
+ case IP_ADD_SOURCE_MEMBERSHIP:
+ case MCAST_JOIN_GROUP:
+ case MCAST_JOIN_SOURCE_GROUP:
+ error = inp_join_group(inp, sopt);
+ break;
+
+ case IP_DROP_MEMBERSHIP:
+ case IP_DROP_SOURCE_MEMBERSHIP:
+ case MCAST_LEAVE_GROUP:
+ case MCAST_LEAVE_SOURCE_GROUP:
+ error = inp_leave_group(inp, sopt);
+ break;
+
+ case IP_BLOCK_SOURCE:
+ case IP_UNBLOCK_SOURCE:
+ case MCAST_BLOCK_SOURCE:
+ case MCAST_UNBLOCK_SOURCE:
+ error = inp_block_unblock_source(inp, sopt);
+ break;
+
+ case IP_MSFILTER:
+ error = inp_set_source_filters(inp, sopt);
+ break;
+
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+
+ return (error);
+}
+
+/*
+ * Expose IGMP's multicast filter mode and source list(s) to userland,
+ * keyed by (ifindex, group).
+ * The filter mode is written out as a uint32_t, followed by + * 0..n of struct in_addr. + * For use by ifmcstat(8). + */ +static int +sysctl_ip_mcast_filters SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp) + + struct in_addr src, group; + struct ifnet *ifp; + struct in_multi *inm; + struct in_multistep step; + struct ip_msource *ims; + int *name; + int retval = 0; + u_int namelen; + uint32_t fmode, ifindex; + + name = (int *)arg1; + namelen = (u_int)arg2; + + if (req->newptr != USER_ADDR_NULL) + return (EPERM); + + if (namelen != 2) + return (EINVAL); + + ifindex = name[0]; + ifnet_head_lock_shared(); + if (ifindex <= 0 || ifindex > (u_int)if_index) { + IGMP_PRINTF(("%s: ifindex %u out of range\n", + __func__, ifindex)); + ifnet_head_done(); + return (ENOENT); + } + + group.s_addr = name[1]; + if (!IN_MULTICAST(ntohl(group.s_addr))) { + IGMP_PRINTF(("%s: group %s is not multicast\n", + __func__, inet_ntoa(group))); + ifnet_head_done(); + return (EINVAL); + } + + ifp = ifindex2ifnet[ifindex]; + ifnet_head_done(); + if (ifp == NULL) { + IGMP_PRINTF(("%s: no ifp for ifindex %u\n", __func__, ifindex)); + return (ENOENT); + } + + in_multihead_lock_shared(); + IN_FIRST_MULTI(step, inm); + while (inm != NULL) { + INM_LOCK(inm); + if (inm->inm_ifp != ifp) + goto next; + + if (!in_hosteq(inm->inm_addr, group)) + goto next; + + fmode = inm->inm_st[1].iss_fmode; + retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t)); + if (retval != 0) { + INM_UNLOCK(inm); + break; /* abort */ + } + RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) { +#ifdef IGMP_DEBUG + struct in_addr ina; + ina.s_addr = htonl(ims->ims_haddr); + IGMP_PRINTF(("%s: visit node %s\n", __func__, + inet_ntoa(ina))); +#endif + /* + * Only copy-out sources which are in-mode. + */ + if (fmode != ims_get_mode(inm, ims, 1)) { + IGMP_PRINTF(("%s: skip non-in-mode\n", + __func__)); + continue; /* process next source */ + } + src.s_addr = htonl(ims->ims_haddr); + retval = SYSCTL_OUT(req, &src, sizeof(struct in_addr)); + if (retval != 0) + break; /* process next inm */ + } +next: + INM_UNLOCK(inm); + IN_NEXT_MULTI(step, inm); + } + in_multihead_lock_done(); + + return (retval); +} + +/* + * XXX + * The whole multicast option thing needs to be re-thought. + * Several of these options are equally applicable to non-multicast + * transmission, and one (IP_MULTICAST_TTL) totally duplicates a + * standard option (IP_TTL). + */ +/* + * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. 
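+ * e.g. (illustrative) an address of 0.0.0.6 selects the
+ * interface with ifindex 6; the low 24 bits are decoded below.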
+ */
+static struct ifnet *
+ip_multicast_if(struct in_addr *a, unsigned int *ifindexp)
+{
+ unsigned int ifindex;
+ struct ifnet *ifp;
+
+ if (ifindexp != NULL)
+ *ifindexp = 0;
+ if (ntohl(a->s_addr) >> 24 == 0) {
+ ifindex = ntohl(a->s_addr) & 0xffffff;
+ ifnet_head_lock_shared();
+ /* Don't need to check if ifindex is < 0 since it's unsigned */
+ if ((unsigned int)if_index < ifindex) {
+ ifnet_head_done();
+ return (NULL);
+ }
+ ifp = ifindex2ifnet[ifindex];
+ ifnet_head_done();
+ if (ifp != NULL && ifindexp != NULL)
+ *ifindexp = ifindex;
+ } else {
+ INADDR_TO_IFP(*a, ifp);
+ }
+ return (ifp);
+}
+
+void
+in_multi_init(void)
+{
+ PE_parse_boot_argn("ifa_debug", &inm_debug, sizeof (inm_debug));
+
+ /* Setup lock group and attribute for in_multihead */
+ in_multihead_lock_grp_attr = lck_grp_attr_alloc_init();
+ in_multihead_lock_grp = lck_grp_alloc_init("in_multihead",
+ in_multihead_lock_grp_attr);
+ in_multihead_lock_attr = lck_attr_alloc_init();
+ lck_rw_init(&in_multihead_lock, in_multihead_lock_grp,
+ in_multihead_lock_attr);
+
+ lck_mtx_init(&inm_trash_lock, in_multihead_lock_grp,
+ in_multihead_lock_attr);
+ TAILQ_INIT(&inm_trash_head);
+
+ inm_size = (inm_debug == 0) ? sizeof (struct in_multi) :
+ sizeof (struct in_multi_dbg);
+ inm_zone = zinit(inm_size, INM_ZONE_MAX * inm_size,
+ 0, INM_ZONE_NAME);
+ if (inm_zone == NULL) {
+ panic("%s: failed allocating %s", __func__, INM_ZONE_NAME);
+ /* NOTREACHED */
+ }
+ zone_change(inm_zone, Z_EXPAND, TRUE);
+
+ ipms_size = sizeof (struct ip_msource);
+ ipms_zone = zinit(ipms_size, IPMS_ZONE_MAX * ipms_size,
+ 0, IPMS_ZONE_NAME);
+ if (ipms_zone == NULL) {
+ panic("%s: failed allocating %s", __func__, IPMS_ZONE_NAME);
+ /* NOTREACHED */
+ }
+ zone_change(ipms_zone, Z_EXPAND, TRUE);
+
+ inms_size = sizeof (struct in_msource);
+ inms_zone = zinit(inms_size, INMS_ZONE_MAX * inms_size,
+ 0, INMS_ZONE_NAME);
+ if (inms_zone == NULL) {
+ panic("%s: failed allocating %s", __func__, INMS_ZONE_NAME);
+ /* NOTREACHED */
+ }
+ zone_change(inms_zone, Z_EXPAND, TRUE);
+}
+
+static struct in_multi *
+in_multi_alloc(int how)
+{
+ struct in_multi *inm;
+
+ inm = (how == M_WAITOK) ?
zalloc(inm_zone) : zalloc_noblock(inm_zone); + if (inm != NULL) { + bzero(inm, inm_size); + lck_mtx_init(&inm->inm_lock, in_multihead_lock_grp, + in_multihead_lock_attr); + inm->inm_debug |= IFD_ALLOC; + if (inm_debug != 0) { + inm->inm_debug |= IFD_DEBUG; + inm->inm_trace = inm_trace; + } + } + return (inm); +} + +static void +in_multi_free(struct in_multi *inm) +{ + INM_LOCK(inm); + if (inm->inm_debug & IFD_ATTACHED) { + panic("%s: attached inm=%p is being freed", __func__, inm); + /* NOTREACHED */ + } else if (inm->inm_ifma != NULL) { + panic("%s: ifma not NULL for inm=%p", __func__, inm); + /* NOTREACHED */ + } else if (!(inm->inm_debug & IFD_ALLOC)) { + panic("%s: inm %p cannot be freed", __func__, inm); + /* NOTREACHED */ + } else if (inm->inm_refcount != 0) { + panic("%s: non-zero refcount inm=%p", __func__, inm); + /* NOTREACHED */ + } else if (inm->inm_reqcnt != 0) { + panic("%s: non-zero reqcnt inm=%p", __func__, inm); + /* NOTREACHED */ + } + + /* Free any pending IGMPv3 state-change records */ + IF_DRAIN(&inm->inm_scq); + + inm->inm_debug &= ~IFD_ALLOC; + if ((inm->inm_debug & (IFD_DEBUG | IFD_TRASHED)) == + (IFD_DEBUG | IFD_TRASHED)) { + lck_mtx_lock(&inm_trash_lock); + TAILQ_REMOVE(&inm_trash_head, (struct in_multi_dbg *)inm, + inm_trash_link); + lck_mtx_unlock(&inm_trash_lock); + inm->inm_debug &= ~IFD_TRASHED; + } + INM_UNLOCK(inm); + + lck_mtx_destroy(&inm->inm_lock, in_multihead_lock_grp); + zfree(inm_zone, inm); +} + +static void +in_multi_attach(struct in_multi *inm) +{ + in_multihead_lock_assert(LCK_RW_ASSERT_EXCLUSIVE); + INM_LOCK_ASSERT_HELD(inm); + + if (inm->inm_debug & IFD_ATTACHED) { + panic("%s: Attempt to attach an already attached inm=%p", + __func__, inm); + /* NOTREACHED */ + } else if (inm->inm_debug & IFD_TRASHED) { + panic("%s: Attempt to reattach a detached inm=%p", + __func__, inm); + /* NOTREACHED */ + } + + inm->inm_reqcnt++; + VERIFY(inm->inm_reqcnt == 1); + INM_ADDREF_LOCKED(inm); + inm->inm_debug |= IFD_ATTACHED; + /* + * Reattach case: If debugging is enabled, take it + * out of the trash list and clear IFD_TRASHED. 
+ */ + if ((inm->inm_debug & (IFD_DEBUG | IFD_TRASHED)) == + (IFD_DEBUG | IFD_TRASHED)) { + /* Become a regular mutex, just in case */ + INM_CONVERT_LOCK(inm); + lck_mtx_lock(&inm_trash_lock); + TAILQ_REMOVE(&inm_trash_head, (struct in_multi_dbg *)inm, + inm_trash_link); + lck_mtx_unlock(&inm_trash_lock); + inm->inm_debug &= ~IFD_TRASHED; + } + + LIST_INSERT_HEAD(&in_multihead, inm, inm_link); +} + +int +in_multi_detach(struct in_multi *inm) +{ + in_multihead_lock_assert(LCK_RW_ASSERT_EXCLUSIVE); + INM_LOCK_ASSERT_HELD(inm); + + if (inm->inm_reqcnt == 0) { + panic("%s: inm=%p negative reqcnt", __func__, inm); + /* NOTREACHED */ + } + + --inm->inm_reqcnt; + if (inm->inm_reqcnt > 0) + return (0); + + if (!(inm->inm_debug & IFD_ATTACHED)) { + panic("%s: Attempt to detach an unattached record inm=%p", + __func__, inm); + /* NOTREACHED */ + } else if (inm->inm_debug & IFD_TRASHED) { + panic("%s: inm %p is already in trash list", __func__, inm); + /* NOTREACHED */ + } + + /* + * NOTE: Caller calls IFMA_REMREF + */ + inm->inm_debug &= ~IFD_ATTACHED; + LIST_REMOVE(inm, inm_link); + + if (inm->inm_debug & IFD_DEBUG) { + /* Become a regular mutex, just in case */ + INM_CONVERT_LOCK(inm); + lck_mtx_lock(&inm_trash_lock); + TAILQ_INSERT_TAIL(&inm_trash_head, + (struct in_multi_dbg *)inm, inm_trash_link); + lck_mtx_unlock(&inm_trash_lock); + inm->inm_debug |= IFD_TRASHED; + } + + return (1); +} + +void +inm_addref(struct in_multi *inm, int locked) +{ + if (!locked) + INM_LOCK_SPIN(inm); + else + INM_LOCK_ASSERT_HELD(inm); + + if (++inm->inm_refcount == 0) { + panic("%s: inm=%p wraparound refcnt", __func__, inm); + /* NOTREACHED */ + } else if (inm->inm_trace != NULL) { + (*inm->inm_trace)(inm, TRUE); + } + if (!locked) + INM_UNLOCK(inm); +} + +void +inm_remref(struct in_multi *inm, int locked) +{ + struct ifmultiaddr *ifma; + struct igmp_ifinfo *igi; + + if (!locked) + INM_LOCK_SPIN(inm); + else + INM_LOCK_ASSERT_HELD(inm); + + if (inm->inm_refcount == 0 || (inm->inm_refcount == 1 && locked)) { + panic("%s: inm=%p negative/missing refcnt", __func__, inm); + /* NOTREACHED */ + } else if (inm->inm_trace != NULL) { + (*inm->inm_trace)(inm, FALSE); + } + + --inm->inm_refcount; + if (inm->inm_refcount > 0) { + if (!locked) + INM_UNLOCK(inm); + return; + } + + /* + * Synchronization with in_getmulti(). In the event the inm has been + * detached, the underlying ifma would still be in the if_multiaddrs + * list, and thus can be looked up via if_addmulti(). At that point, + * the only way to find this inm is via ifma_protospec. To avoid + * race conditions between the last inm_remref() of that inm and its + * use via ifma_protospec, in_multihead lock is used for serialization. + * In order to avoid violating the lock order, we must drop inm_lock + * before acquiring in_multihead lock. To prevent the inm from being + * freed prematurely, we hold an extra reference. 
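+ * The extra reference is dropped once the refcount has been
+ * re-evaluated under the in_multihead lock below; if someone
+ * else grabbed the inm in the meantime, we simply back out.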
+ */
+ ++inm->inm_refcount;
+ INM_UNLOCK(inm);
+ in_multihead_lock_shared();
+ INM_LOCK_SPIN(inm);
+ --inm->inm_refcount;
+ if (inm->inm_refcount > 0) {
+ /* We've lost the race, so abort since inm is still in use */
+ INM_UNLOCK(inm);
+ in_multihead_lock_done();
+ /* If it was locked, return it as such */
+ if (locked)
+ INM_LOCK(inm);
+ return;
+ }
+ inm_purge(inm);
+ ifma = inm->inm_ifma;
+ inm->inm_ifma = NULL;
+ inm->inm_ifp = NULL;
+ igi = inm->inm_igi;
+ inm->inm_igi = NULL;
+ INM_UNLOCK(inm);
+ IFMA_LOCK_SPIN(ifma);
+ ifma->ifma_protospec = NULL;
+ IFMA_UNLOCK(ifma);
+ in_multihead_lock_done();
+
+ in_multi_free(inm);
+ if_delmulti_ifma(ifma);
+ /* Release reference held to the underlying ifmultiaddr */
+ IFMA_REMREF(ifma);
+
+ if (igi != NULL)
+ IGI_REMREF(igi);
+}
+
+static void
+inm_trace(struct in_multi *inm, int refhold)
+{
+ struct in_multi_dbg *inm_dbg = (struct in_multi_dbg *)inm;
+ ctrace_t *tr;
+ u_int32_t idx;
+ u_int16_t *cnt;
+
+ if (!(inm->inm_debug & IFD_DEBUG)) {
+ panic("%s: inm %p has no debug structure", __func__, inm);
+ /* NOTREACHED */
+ }
+ if (refhold) {
+ cnt = &inm_dbg->inm_refhold_cnt;
+ tr = inm_dbg->inm_refhold;
+ } else {
+ cnt = &inm_dbg->inm_refrele_cnt;
+ tr = inm_dbg->inm_refrele;
+ }
+
+ idx = atomic_add_16_ov(cnt, 1) % INM_TRACE_HIST_SIZE;
+ ctrace_record(&tr[idx]);
+}
+
+void
+in_multihead_lock_exclusive(void)
+{
+ lck_rw_lock_exclusive(&in_multihead_lock);
+}
+
+void
+in_multihead_lock_shared(void)
+{
+ lck_rw_lock_shared(&in_multihead_lock);
+}
+
+void
+in_multihead_lock_assert(int what)
+{
+ lck_rw_assert(&in_multihead_lock, what);
+}
+
+void
+in_multihead_lock_done(void)
+{
+ lck_rw_done(&in_multihead_lock);
+}
+
+static struct ip_msource *
+ipms_alloc(int how)
+{
+ struct ip_msource *ims;
+
+ ims = (how == M_WAITOK) ? zalloc(ipms_zone) : zalloc_noblock(ipms_zone);
+ if (ims != NULL)
+ bzero(ims, ipms_size);
+
+ return (ims);
+}
+
+static void
+ipms_free(struct ip_msource *ims)
+{
+ zfree(ipms_zone, ims);
+}
+
+static struct in_msource *
+inms_alloc(int how)
+{
+ struct in_msource *inms;
+
+ inms = (how == M_WAITOK) ? zalloc(inms_zone) :
+ zalloc_noblock(inms_zone);
+ if (inms != NULL)
+ bzero(inms, inms_size);
+
+ return (inms);
+}
+
+static void
+inms_free(struct in_msource *inms)
+{
+ zfree(inms_zone, inms);
+}
+
+#ifdef IGMP_DEBUG
+
+static const char *inm_modestrs[] = { "un", "in", "ex" };
+
+static const char *
+inm_mode_str(const int mode)
+{
+ if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE)
+ return (inm_modestrs[mode]);
+ return ("??");
+}
+
+static const char *inm_statestrs[] = {
+ "not-member",
+ "silent",
+ "idle",
+ "lazy",
+ "sleeping",
+ "awakening",
+ "query-pending",
+ "sg-query-pending",
+ "leaving"
+};
+
+static const char *
+inm_state_str(const int state)
+{
+ if (state >= IGMP_NOT_MEMBER && state <= IGMP_LEAVING_MEMBER)
+ return (inm_statestrs[state]);
+ return ("??");
+}
+
+/*
+ * Dump an in_multi structure to the console.
+ */ +void +inm_print(const struct in_multi *inm) +{ + int t; + + INM_LOCK_ASSERT_HELD(INM_CAST_TO_NONCONST(inm)); + + if (igmp_debug == 0) + return; + + printf("%s: --- begin inm %p ---\n", __func__, inm); + printf("addr %s ifp %p(%s%d) ifma %p\n", + inet_ntoa(inm->inm_addr), + inm->inm_ifp, + inm->inm_ifp->if_name, + inm->inm_ifp->if_unit, + inm->inm_ifma); + printf("timer %u state %s refcount %u scq.len %u\n", + inm->inm_timer, + inm_state_str(inm->inm_state), + inm->inm_refcount, + inm->inm_scq.ifq_len); + printf("igi %p nsrc %lu sctimer %u scrv %u\n", + inm->inm_igi, + inm->inm_nsrc, + inm->inm_sctimer, + inm->inm_scrv); + for (t = 0; t < 2; t++) { + printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t, + inm_mode_str(inm->inm_st[t].iss_fmode), + inm->inm_st[t].iss_asm, + inm->inm_st[t].iss_ex, + inm->inm_st[t].iss_in, + inm->inm_st[t].iss_rec); + } + printf("%s: --- end inm %p ---\n", __func__, inm); +} + +#else + +void +inm_print(__unused const struct in_multi *inm) +{ + +} + +#endif diff --git a/bsd/netinet/in_pcb.c b/bsd/netinet/in_pcb.c index 696222176..51eeca0b3 100644 --- a/bsd/netinet/in_pcb.c +++ b/bsd/netinet/in_pcb.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -75,6 +75,9 @@ #endif #include <sys/kernel.h> #include <sys/sysctl.h> +#include <sys/mcache.h> +#include <sys/kauth.h> +#include <sys/priv.h> #include <libkern/OSAtomic.h> #include <machine/limits.h> @@ -156,17 +159,17 @@ sysctl_net_ipport_check SYSCTL_HANDLER_ARGS SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "IP Ports"); -SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW, +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED, &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", ""); -SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW, +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED, &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", ""); -SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW, +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED, &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", ""); -SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW, +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED, &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", ""); -SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW, +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED, &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", ""); -SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW, +SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED, &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", ""); extern int udp_use_randomport; @@ -233,14 +236,17 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo, __unused struct proc * } mac_inpcb_label_associate(so, inp); #endif + // make sure inp_stat is always 64bit aligned + inp->inp_stat = (struct inp_stat*)P2ROUNDUP(inp->inp_stat_store, sizeof(u_int64_t)); + if (((uintptr_t)inp->inp_stat - (uintptr_t)inp->inp_stat_store) + + sizeof(*inp->inp_stat) > sizeof(inp->inp_stat_store)) { + 
panic("insufficient space to align inp_stat"); + } + so->so_pcb = (caddr_t)inp; if (so->so_proto->pr_flags & PR_PCBLOCK) { - inp->inpcb_mtx = lck_mtx_alloc_init(pcbinfo->mtx_grp, pcbinfo->mtx_attr); - if (inp->inpcb_mtx == NULL) { - printf("in_pcballoc: can't alloc mutex! so=%p\n", so); - return(ENOMEM); - } + lck_mtx_init(&inp->inpcb_mtx, pcbinfo->mtx_grp, pcbinfo->mtx_attr); } #if IPSEC @@ -297,7 +303,7 @@ in_pcblookup_local_and_cleanup( if (inp && inp->inp_wantcnt == WNT_STOPUSING) { struct socket *so = inp->inp_socket; - lck_mtx_lock(inp->inpcb_mtx); + lck_mtx_lock(&inp->inpcb_mtx); if (so->so_usecount == 0) { if (inp->inp_state != INPCB_STATE_DEAD) @@ -306,7 +312,7 @@ in_pcblookup_local_and_cleanup( inp = NULL; } else { - lck_mtx_unlock(inp->inpcb_mtx); + lck_mtx_unlock(&inp->inpcb_mtx); } } @@ -324,6 +330,8 @@ in_pcb_conflict_post_msg(u_int16_t port) struct kev_msg ev_msg; struct kev_in_portinuse in_portinuse; + bzero(&in_portinuse, sizeof(struct kev_in_portinuse)); + bzero(&ev_msg, sizeof(struct kev_msg)); in_portinuse.port = ntohs(port); /* port in host order */ in_portinuse.req_pid = proc_selfpid(); ev_msg.vendor_code = KEV_VENDOR_APPLE; @@ -344,7 +352,7 @@ in_pcb_conflict_post_msg(u_int16_t port) * EACCES Permission denied * EADDRINUSE Address in use * EAGAIN Resource unavailable, try again - * proc_suser:EPERM Operation not permitted + * priv_check_cred:EPERM Operation not permitted */ int in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) @@ -356,6 +364,7 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) u_short lport = 0, rand_port = 0; int wild = 0, reuseport = (so->so_options & SO_REUSEPORT); int error, randomport, conflict = 0; + kauth_cred_t cred; if (TAILQ_EMPTY(&in_ifaddrhead)) /* XXX broken! 
*/ return (EADDRNOTAVAIL); @@ -366,6 +375,8 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) socket_unlock(so, 0); /* keep reference on socket */ lck_rw_lock_exclusive(pcbinfo->mtx); if (nam) { + unsigned int outif = 0; + sin = (struct sockaddr_in *)nam; if (nam->sa_len != sizeof (*sin)) { lck_rw_done(pcbinfo->mtx); @@ -403,7 +414,10 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) return (EADDRNOTAVAIL); } else { - ifafree(ifa); + IFA_LOCK(ifa); + outif = ifa->ifa_ifp->if_index; + IFA_UNLOCK(ifa); + IFA_REMREF(ifa); } } if (lport) { @@ -411,10 +425,15 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) /* GROSS */ #if !CONFIG_EMBEDDED - if (ntohs(lport) < IPPORT_RESERVED && proc_suser(p)) { - lck_rw_done(pcbinfo->mtx); - socket_lock(so, 0); - return (EACCES); + if (ntohs(lport) < IPPORT_RESERVED) { + cred = kauth_cred_proc_ref(p); + error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); + kauth_cred_unref(&cred); + if (error != 0) { + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); + return (EACCES); + } } #endif if (so->so_uid && @@ -487,6 +506,7 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) } } inp->inp_laddr = sin->sin_addr; + inp->inp_last_outif = outif; } if (lport == 0) { u_short first, last; @@ -502,7 +522,10 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) last = ipport_hilastauto; lastport = &pcbinfo->lasthi; } else if (inp->inp_flags & INP_LOWPORT) { - if ((error = proc_suser(p)) != 0) { + cred = kauth_cred_proc_ref(p); + error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); + kauth_cred_unref(&cred); + if (error != 0) { lck_rw_done(pcbinfo->mtx); socket_lock(so, 0); return error; @@ -541,6 +564,7 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) lck_rw_done(pcbinfo->mtx); socket_lock(so, 0); inp->inp_laddr.s_addr = INADDR_ANY; + inp->inp_last_outif = 0; return (EADDRNOTAVAIL); } --*lastport; @@ -564,6 +588,7 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) lck_rw_done(pcbinfo->mtx); socket_lock(so, 0); inp->inp_laddr.s_addr = INADDR_ANY; + inp->inp_last_outif = 0; return (EADDRNOTAVAIL); } ++*lastport; @@ -579,6 +604,7 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) if (in_pcbinshash(inp, 1) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; + inp->inp_last_outif = 0; lck_rw_done(pcbinfo->mtx); return (EAGAIN); } @@ -605,7 +631,7 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) */ int in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, - struct sockaddr_in **plocal_sin) + struct sockaddr_in *plocal_sin, unsigned int *out_ifscope) { struct in_ifaddr *ia; struct sockaddr_in *sin = (struct sockaddr_in *)nam; @@ -619,6 +645,7 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, lck_rw_lock_shared(in_ifaddr_rwlock); if (!TAILQ_EMPTY(&in_ifaddrhead)) { + ia = TAILQ_FIRST(&in_ifaddrhead); /* * If the destination address is INADDR_ANY, * use the primary local address. 
@@ -629,21 +656,34 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam,
 #define satosin(sa) ((struct sockaddr_in *)(sa))
 #define sintosa(sin) ((struct sockaddr *)(sin))
 #define ifatoia(ifa) ((struct in_ifaddr *)(ifa))
+ IFA_LOCK_SPIN(&ia->ia_ifa);
 if (sin->sin_addr.s_addr == INADDR_ANY)
- sin->sin_addr = IA_SIN(TAILQ_FIRST(&in_ifaddrhead))->sin_addr;
+ sin->sin_addr = IA_SIN(ia)->sin_addr;
 else if (sin->sin_addr.s_addr == (u_int32_t)INADDR_BROADCAST &&
- (TAILQ_FIRST(&in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST))
- sin->sin_addr = satosin(&TAILQ_FIRST(&in_ifaddrhead)->ia_broadaddr)->sin_addr;
+ (ia->ia_ifp->if_flags & IFF_BROADCAST))
+ sin->sin_addr = satosin(&ia->ia_broadaddr)->sin_addr;
+ IFA_UNLOCK(&ia->ia_ifa);
+ ia = NULL;
 }
 lck_rw_done(in_ifaddr_rwlock);
 if (inp->inp_laddr.s_addr == INADDR_ANY) {
 struct route *ro;
- unsigned int ifscope;
-
+ unsigned int ifscope = IFSCOPE_NONE;
+ unsigned int nocell;
+ /*
+ * If the socket is bound to a specific interface, the
+ * optional scope takes precedence over that if it
+ * is set by the caller.
+ */
 ia = (struct in_ifaddr *)0;
- ifscope = (inp->inp_flags & INP_BOUND_IF) ?
- inp->inp_boundif : IFSCOPE_NONE;
+
+ if (out_ifscope != NULL && *out_ifscope != IFSCOPE_NONE)
+ ifscope = *out_ifscope;
+ else if (inp->inp_flags & INP_BOUND_IF)
+ ifscope = inp->inp_boundif;
+
+ nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0;
 /*
 * If route is known or can be allocated now,
 * our src addr is taken from the i/f, else punt.
@@ -672,10 +712,23 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam,
 ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
 ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
 sin->sin_addr;
- rtalloc_scoped_ign(ro, 0, ifscope);
+ rtalloc_scoped(ro, ifscope);
 if (ro->ro_rt != NULL)
 RT_LOCK_SPIN(ro->ro_rt);
 }
+ /*
+ * If the route points to a cellular interface and the
+ * caller forbids our using interfaces of such type,
+ * pretend that there is no route.
+ */
+ if (nocell && ro->ro_rt != NULL) {
+ RT_LOCK_ASSERT_HELD(ro->ro_rt);
+ if (ro->ro_rt->rt_ifp->if_type == IFT_CELLULAR) {
+ RT_UNLOCK(ro->ro_rt);
+ rtfree(ro->ro_rt);
+ ro->ro_rt = NULL;
+ }
+ }
 /*
 * If we found a route, use the address
 * corresponding to the outgoing interface
@@ -683,11 +736,13 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam,
 * to our address on another net goes to loopback).
 */
 if (ro->ro_rt != NULL) {
- RT_LOCK_ASSERT_HELD(ro->ro_rt);
+ /* Become a regular mutex */
+ RT_CONVERT_LOCK(ro->ro_rt);
 if (!(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) {
 ia = ifatoia(ro->ro_rt->rt_ifa);
- if (ia)
- ifaref(&ia->ia_ifa);
+ if (ia) {
+ IFA_ADDREF(&ia->ia_ifa);
+ }
 }
 RT_UNLOCK(ro->ro_rt);
 }
@@ -705,9 +760,19 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam,
 lck_rw_lock_shared(in_ifaddr_rwlock);
 ia = TAILQ_FIRST(&in_ifaddrhead);
 if (ia)
- ifaref(&ia->ia_ifa);
+ IFA_ADDREF(&ia->ia_ifa);
 lck_rw_done(in_ifaddr_rwlock);
 }
+ /*
+ * If the source address belongs to a cellular interface
+ * and the socket forbids our using interfaces of such
+ * type, pretend that there is no source address.
+ */ + if (nocell && ia != NULL && + ia->ia_ifa.ifa_ifp->if_type == IFT_CELLULAR) { + IFA_REMREF(&ia->ia_ifa); + ia = NULL; + } if (ia == 0) return (EADDRNOTAVAIL); } @@ -722,29 +787,37 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, struct ifnet *ifp; imo = inp->inp_moptions; + IMO_LOCK(imo); if (imo->imo_multicast_ifp != NULL && (ia == NULL || ia->ia_ifp != imo->imo_multicast_ifp)) { ifp = imo->imo_multicast_ifp; if (ia) - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); lck_rw_lock_shared(in_ifaddr_rwlock); TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { if (ia->ia_ifp == ifp) break; } if (ia) - ifaref(&ia->ia_ifa); + IFA_ADDREF(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); - if (ia == 0) + if (ia == 0) { + IMO_UNLOCK(imo); return (EADDRNOTAVAIL); + } } + IMO_UNLOCK(imo); } /* * Don't do pcblookup call here; return interface in plocal_sin * and exit to caller, that will do the lookup. */ - *plocal_sin = &ia->ia_addr; - ifafree(&ia->ia_ifa); + IFA_LOCK_SPIN(&ia->ia_ifa); + *plocal_sin = ia->ia_addr; + if (out_ifscope != NULL) + *out_ifscope = ia->ia_ifp->if_index; + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); } return(0); } @@ -757,9 +830,9 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, * then pick one. */ int -in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p) +in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p, unsigned int *ifscope) { - struct sockaddr_in *ifaddr; + struct sockaddr_in ifaddr; struct sockaddr_in *sin = (struct sockaddr_in *)nam; struct inpcb *pcb; int error; @@ -767,14 +840,23 @@ in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p) /* * Call inner routine, to assign local interface address. */ - if ((error = in_pcbladdr(inp, nam, &ifaddr)) != 0) + if ((error = in_pcbladdr(inp, nam, &ifaddr, ifscope)) != 0) return(error); socket_unlock(inp->inp_socket, 0); pcb = in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port, - inp->inp_laddr.s_addr ? inp->inp_laddr : ifaddr->sin_addr, + inp->inp_laddr.s_addr ? inp->inp_laddr : ifaddr.sin_addr, inp->inp_lport, 0, NULL); socket_lock(inp->inp_socket, 0); + + /* Check if the socket is still in a valid state. When we unlock this + * embryonic socket, it can get aborted if another thread is closing + * the listener (radar 7947600). + */ + if ((inp->inp_socket->so_flags & SOF_ABORTED) != 0) { + return ECONNREFUSED; + } + if (pcb != NULL) { in_pcb_checkstate(pcb, WNT_RELEASE, pcb == inp ? 1 : 0); return (EADDRINUSE); @@ -791,7 +873,8 @@ in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p) lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx); socket_lock(inp->inp_socket, 0); } - inp->inp_laddr = ifaddr->sin_addr; + inp->inp_laddr = ifaddr.sin_addr; + inp->inp_last_outif = ifscope ? *ifscope : IFSCOPE_NONE; inp->inp_flags |= INP_INADDR_ANY; } else { @@ -858,6 +941,7 @@ in_pcbdetach(struct inpcb *inp) #endif if ((so->so_flags & SOF_PCBCLEARING) == 0) { struct rtentry *rt; + struct ip_moptions *imo; inp->inp_vflag = 0; if (inp->inp_options) @@ -866,8 +950,10 @@ in_pcbdetach(struct inpcb *inp) inp->inp_route.ro_rt = NULL; rtfree(rt); } - ip_freemoptions(inp->inp_moptions); + imo = inp->inp_moptions; inp->inp_moptions = NULL; + if (imo != NULL) + IMO_REMREF(imo); sofreelastref(so, 0); inp->inp_state = INPCB_STATE_DEAD; so->so_flags |= SOF_PCBCLEARING; /* makes sure we're not called twice from so_close */ @@ -886,9 +972,10 @@ in_pcbdispose(struct inpcb *inp) printf("in_pcbdispose: not dead yet? 
so=%p\n", so); } #endif - if (so && so->so_usecount != 0) - panic("in_pcbdispose: use count=%x so=%p\n", so->so_usecount, so); + panic("%s: so %p so_usecount %d so_lockhistory %s\n", + __func__, so, so->so_usecount, + (so != NULL) ? solockhistory_nr(so) : "--"); lck_rw_assert(ipi->mtx, LCK_RW_ASSERT_EXCLUSIVE); @@ -909,8 +996,8 @@ in_pcbdispose(struct inpcb *inp) } if (so->so_head != NULL) panic("in_pcbdispose, so=%p head still exist\n", so); - lck_mtx_unlock(inp->inpcb_mtx); - lck_mtx_free(inp->inpcb_mtx, ipi->mtx_grp); + lck_mtx_unlock(&inp->inpcb_mtx); + lck_mtx_destroy(&inp->inpcb_mtx, ipi->mtx_grp); } so->so_flags |= SOF_PCBCLEARING; /* makes sure we're not called twice from so_close */ so->so_saved_pcb = (caddr_t) inp; @@ -1075,7 +1162,7 @@ in_losing(struct inpcb *inp) if ((ia = ifa_foraddr(inp->inp_laddr.s_addr)) != NULL) { inp->inp_route.ro_rt = NULL; rtfree(rt); - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); } /* * A new route can be allocated @@ -1099,7 +1186,7 @@ in_rtchange(struct inpcb *inp, __unused int errno) if ((ia = ifa_foraddr(inp->inp_laddr.s_addr)) == NULL) { return; /* we can't remove the route now. not sure if still ok to use src */ } - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); rtfree(rt); inp->inp_route.ro_rt = NULL; /* @@ -1200,6 +1287,131 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, } } +/* + * Check if PCB exists in hash list. + */ +int +in_pcblookup_hash_exists( + struct inpcbinfo *pcbinfo, + struct in_addr faddr, + u_int fport_arg, + struct in_addr laddr, + u_int lport_arg, + int wildcard, + uid_t *uid, + gid_t *gid, + __unused struct ifnet *ifp) +{ + struct inpcbhead *head; + struct inpcb *inp; + u_short fport = fport_arg, lport = lport_arg; + int found; + + *uid = UID_MAX; + *gid = GID_MAX; + + /* + * We may have found the pcb in the last lookup - check this first. + */ + + lck_rw_lock_shared(pcbinfo->mtx); + + /* + * First look for an exact match. + */ + head = &pcbinfo->hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, + pcbinfo->hashmask)]; + LIST_FOREACH(inp, head, inp_hash) { +#if INET6 + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr == faddr.s_addr && + inp->inp_laddr.s_addr == laddr.s_addr && + inp->inp_fport == fport && + inp->inp_lport == lport) { + if ((found = (inp->inp_socket != NULL))) { + /* + * Found. 
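+ * Report the owning socket's credentials
+ * back to the caller.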
+ */ + *uid = inp->inp_socket->so_uid; + *gid = inp->inp_socket->so_gid; + } + lck_rw_done(pcbinfo->mtx); + return (found); + } + } + if (wildcard) { + struct inpcb *local_wild = NULL; +#if INET6 + struct inpcb *local_wild_mapped = NULL; +#endif + + head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, + pcbinfo->hashmask)]; + LIST_FOREACH(inp, head, inp_hash) { +#if INET6 + if ((inp->inp_vflag & INP_IPV4) == 0) + continue; +#endif + if (inp->inp_faddr.s_addr == INADDR_ANY && + inp->inp_lport == lport) { +#if defined(NFAITH) && NFAITH > 0 + if (ifp && ifp->if_type == IFT_FAITH && + (inp->inp_flags & INP_FAITH) == 0) + continue; +#endif + if (inp->inp_laddr.s_addr == laddr.s_addr) { + if ((found = (inp->inp_socket != NULL))) { + *uid = inp->inp_socket->so_uid; + *gid = inp->inp_socket->so_gid; + } + lck_rw_done(pcbinfo->mtx); + return (found); + } + else if (inp->inp_laddr.s_addr == INADDR_ANY) { +#if INET6 + if (inp->inp_socket && + INP_CHECK_SOCKAF(inp->inp_socket, + AF_INET6)) + local_wild_mapped = inp; + else +#endif /* INET6 */ + local_wild = inp; + } + } + } + if (local_wild == NULL) { +#if INET6 + if (local_wild_mapped != NULL) { + if ((found = (local_wild_mapped->inp_socket != NULL))) { + *uid = local_wild_mapped->inp_socket->so_uid; + *gid = local_wild_mapped->inp_socket->so_gid; + } + lck_rw_done(pcbinfo->mtx); + return (found); + } +#endif /* INET6 */ + lck_rw_done(pcbinfo->mtx); + return (0); + } + if (local_wild != NULL) { + if ((found = (local_wild->inp_socket != NULL))) { + *uid = local_wild->inp_socket->so_uid; + *gid = local_wild->inp_socket->so_gid; + } + lck_rw_done(pcbinfo->mtx); + return (found); + } + } + + /* + * Not found. + */ + lck_rw_done(pcbinfo->mtx); + return (0); +} + /* * Lookup PCB in hash list. */ @@ -1336,10 +1548,15 @@ in_pcbinshash(struct inpcb *inp, int locked) if (!locked) { if (!lck_rw_try_lock_exclusive(pcbinfo->mtx)) { - /*lock inversion issue, mostly with udp multicast packets */ + /*lock inversion issue, mostly with udp multicast packets */ socket_unlock(inp->inp_socket, 0); lck_rw_lock_exclusive(pcbinfo->mtx); socket_lock(inp->inp_socket, 0); + if (inp->inp_state == INPCB_STATE_DEAD) { + /* The socket got dropped when it was unlocked */ + lck_rw_done(pcbinfo->mtx); + return(ECONNABORTED); + } } } @@ -1458,6 +1675,7 @@ in_pcb_checkstate(struct inpcb *pcb, int mode, int locked) if (locked == 0) socket_lock(pcb->inp_socket, 1); pcb->inp_state = INPCB_STATE_DEAD; + stopusing: if (pcb->inp_socket->so_usecount < 0) panic("in_pcb_checkstate STOP pcb=%p so=%p usecount is negative\n", pcb, pcb->inp_socket); @@ -1569,25 +1787,26 @@ inpcb_to_xinpcb64( struct inpcb *inp, struct xinpcb64 *xinp) { - xinp->inp_fport = inp->inp_fport; - xinp->inp_lport = inp->inp_lport; - xinp->inp_gencnt = inp->inp_gencnt; - xinp->inp_flags = inp->inp_flags; - xinp->inp_flow = inp->inp_flow; - xinp->inp_vflag = inp->inp_vflag; - xinp->inp_ip_ttl = inp->inp_ip_ttl; - xinp->inp_ip_p = inp->inp_ip_p; - xinp->inp_dependfaddr.inp6_foreign = inp->inp_dependfaddr.inp6_foreign; - xinp->inp_dependladdr.inp6_local = inp->inp_dependladdr.inp6_local; - xinp->inp_depend4.inp4_ip_tos = inp->inp_depend4.inp4_ip_tos; - xinp->inp_depend6.inp6_hlim = inp->inp_depend6.inp6_hlim; - xinp->inp_depend6.inp6_cksum = inp->inp_depend6.inp6_cksum; + xinp->inp_fport = inp->inp_fport; + xinp->inp_lport = inp->inp_lport; + xinp->inp_gencnt = inp->inp_gencnt; + xinp->inp_flags = inp->inp_flags; + xinp->inp_flow = inp->inp_flow; + xinp->inp_vflag = inp->inp_vflag; + xinp->inp_ip_ttl = inp->inp_ip_ttl; + 
xinp->inp_ip_p = inp->inp_ip_p; + xinp->inp_dependfaddr.inp6_foreign = inp->inp_dependfaddr.inp6_foreign; + xinp->inp_dependladdr.inp6_local = inp->inp_dependladdr.inp6_local; + xinp->inp_depend4.inp4_ip_tos = inp->inp_depend4.inp4_ip_tos; + xinp->inp_depend6.inp6_hlim = inp->inp_depend6.inp6_hlim; + xinp->inp_depend6.inp6_cksum = inp->inp_depend6.inp6_cksum; xinp->inp_depend6.inp6_ifindex = inp->inp_depend6.inp6_ifindex; - xinp->inp_depend6.inp6_hops = inp->inp_depend6.inp6_hops; + xinp->inp_depend6.inp6_hops = inp->inp_depend6.inp6_hops; } #endif /* !CONFIG_EMBEDDED */ + /* * The following routines implement this scheme: * @@ -1619,7 +1838,7 @@ inp_route_copyout(struct inpcb *inp, struct route *dst) { struct route *src = &inp->inp_route; - lck_mtx_assert(inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED); /* * If the route in the PCB is not for IPv4, blow it away; @@ -1629,13 +1848,8 @@ inp_route_copyout(struct inpcb *inp, struct route *dst) rtfree(src->ro_rt); src->ro_rt = NULL; } - - /* Copy everything (rt, dst, flags) from PCB */ - bcopy(src, dst, sizeof (*dst)); - - /* Hold one reference for the local copy of struct route */ - if (dst->ro_rt != NULL) - RT_ADDREF(dst->ro_rt); + + route_copyout(dst, src, sizeof(*dst)); } void @@ -1643,33 +1857,61 @@ inp_route_copyin(struct inpcb *inp, struct route *src) { struct route *dst = &inp->inp_route; - lck_mtx_assert(inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED); /* Minor sanity check */ if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) panic("%s: wrong or corrupted route: %p", __func__, src); - /* No cached route in the PCB? */ - if (dst->ro_rt == NULL) { - /* - * Copy everything (rt, dst, flags) from ip_output(); - * the reference to the route was held at the time - * it was allocated and is kept intact. - */ - bcopy(src, dst, sizeof (*dst)); - } else if (src->ro_rt != NULL) { - /* - * If the same, update just the ro_flags and ditch the one - * in the local copy. Else ditch the one that is currently - * cached, and cache what we got back from ip_output(). - */ - if (dst->ro_rt == src->ro_rt) { - dst->ro_flags = src->ro_flags; - rtfree(src->ro_rt); - src->ro_rt = NULL; - } else { - rtfree(dst->ro_rt); - bcopy(src, dst, sizeof (*dst)); - } + route_copyin(src, dst, sizeof(*src)); +} + +/* + * Handler for setting IP_FORCE_OUT_IFP/IP_BOUND_IF/IPV6_BOUND_IF socket option. + */ +void +inp_bindif(struct inpcb *inp, unsigned int ifscope) +{ + /* + * A zero interface scope value indicates an "unbind". + * Otherwise, take in whatever value the app desires; + * the app may already know the scope (or force itself + * to such a scope) ahead of time before the interface + * gets attached. It doesn't matter either way; any + * route lookup from this point on will require an + * exact match for the embedded interface scope. + */ + inp->inp_boundif = ifscope; + if (inp->inp_boundif == IFSCOPE_NONE) + inp->inp_flags &= ~INP_BOUND_IF; + else + inp->inp_flags |= INP_BOUND_IF; + + /* Blow away any cached route in the PCB */ + if (inp->inp_route.ro_rt != NULL) { + rtfree(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = NULL; + } +} + +/* + * Handler for setting IP_NO_IFT_CELLULAR/IPV6_NO_IFT_CELLULAR socket option. 
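+ * The flag is one-way: once set on a socket it cannot be
+ * cleared again (note the EINVAL case below).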
+ */ +int +inp_nocellular(struct inpcb *inp, unsigned int val) +{ + if (val) { + inp->inp_flags |= INP_NO_IFT_CELLULAR; + } else if (inp->inp_flags & INP_NO_IFT_CELLULAR) { + /* once set, it cannot be unset */ + return (EINVAL); } + + /* Blow away any cached route in the PCB */ + if (inp->inp_route.ro_rt != NULL) { + rtfree(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = NULL; + } + + return (0); } diff --git a/bsd/netinet/in_pcb.h b/bsd/netinet/in_pcb.h index a793f3a12..728b93e33 100644 --- a/bsd/netinet/in_pcb.h +++ b/bsd/netinet/in_pcb.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -100,8 +100,8 @@ typedef u_quad_t inp_gen_t; /* * PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet. - * So, AF_INET6 null laddr is also used as AF_INET null laddr, - * by utilize following structure. (At last, same as INRIA) + * So, AF_INET6 null laddr is also used as AF_INET null laddr, by utilizing + * the following structure. */ struct in_addr_4in6 { u_int32_t ia46_pad32[3]; @@ -120,6 +120,14 @@ struct icmp6_filter; struct label; #endif +struct inp_stat +{ + u_int64_t rxpackets; + u_int64_t rxbytes; + u_int64_t txpackets; + u_int64_t txbytes; +}; + struct inpcb { LIST_ENTRY(inpcb) inp_hash; /* hash list */ int inp_wantcnt; /* pcb wanted count. protected by pcb list lock */ @@ -127,7 +135,7 @@ struct inpcb { u_short inp_fport; /* foreign port */ u_short inp_lport; /* local port */ LIST_ENTRY(inpcb) inp_list; /* list for all PCBs of this proto */ - caddr_t inp_ppcb; /* pointer to per-protocol pcb */ + void *inp_ppcb; /* pointer to per-protocol pcb */ struct inpcbinfo *inp_pcbinfo; /* PCB list info */ struct socket *inp_socket; /* back pointer to socket */ u_char nat_owner; /* Used to NAT TCP/UDP traffic */ @@ -187,16 +195,15 @@ struct inpcb { int hash_element; /* Array index of pcb's hash list */ caddr_t inp_saved_ppcb; /* place to save pointer while cached */ struct inpcbpolicy *inp_sp; -#ifdef _KERN_LOCKS_H_ - lck_mtx_t *inpcb_mtx; /* inpcb per-socket mutex */ -#else - void *inpcb_mtx; -#endif + decl_lck_mtx_data( ,inpcb_mtx); /* inpcb per-socket mutex */ unsigned int inp_boundif; /* interface scope for INP_BOUND_IF */ - u_int32_t inp_reserved[3]; /* reserved for future use */ + unsigned int inp_last_outif; /* last known outgoing interface */ + u_int32_t inp_reserved[2]; /* reserved for future use */ #if CONFIG_MACF_NET struct label *inp_label; /* MAC label */ #endif + struct inp_stat *inp_stat; + u_int8_t inp_stat_store[sizeof(struct inp_stat) + sizeof(u_int64_t)]; }; #endif /* KERNEL_PRIVATE */ @@ -355,29 +362,70 @@ struct xinpcb64 { u_char inp_vflag; u_char inp_ip_ttl; /* time to live */ u_char inp_ip_p; /* protocol */ - union { /* foreign host table entry */ - struct in_addr_4in6 inp46_foreign; - struct in6_addr inp6_foreign; - } inp_dependfaddr; - union { /* local host table entry */ - struct in_addr_4in6 inp46_local; - struct in6_addr inp6_local; - } inp_dependladdr; - struct { - u_char inp4_ip_tos; /* type of service */ - } inp_depend4; - struct { - u_int8_t inp6_hlim; - int inp6_cksum; - u_short inp6_ifindex; - short inp6_hops; - } inp_depend6; - struct xsocket64 xi_socket; + union { /* foreign host table entry */ + struct in_addr_4in6 inp46_foreign; + struct in6_addr inp6_foreign; + } inp_dependfaddr; + union { /* local host table entry */ + struct in_addr_4in6 inp46_local; + struct in6_addr inp6_local; + } inp_dependladdr; + struct { 
+ u_char inp4_ip_tos; /* type of service */ + } inp_depend4; + struct { + u_int8_t inp6_hlim; + int inp6_cksum; + u_short inp6_ifindex; + short inp6_hops; + } inp_depend6; + struct xsocket64 xi_socket; u_quad_t xi_alignment_hack; }; #endif /* !CONFIG_EMBEDDED */ +#ifdef PRIVATE + +struct xinpcb_list_entry { + u_int64_t le_next; + u_int64_t le_prev; +}; + +struct xinpcb_n { + u_int32_t xi_len; /* length of this structure */ + u_int32_t xi_kind; /* XSO_INPCB */ + u_int64_t xi_inpp; + u_short inp_fport; /* foreign port */ + u_short inp_lport; /* local port */ + u_int64_t inp_ppcb; /* pointer to per-protocol pcb */ + inp_gen_t inp_gencnt; /* generation count of this instance */ + int inp_flags; /* generic IP/datagram flags */ + u_int32_t inp_flow; + u_char inp_vflag; + u_char inp_ip_ttl; /* time to live */ + u_char inp_ip_p; /* protocol */ + union { /* foreign host table entry */ + struct in_addr_4in6 inp46_foreign; + struct in6_addr inp6_foreign; + } inp_dependfaddr; + union { /* local host table entry */ + struct in_addr_4in6 inp46_local; + struct in6_addr inp6_local; + } inp_dependladdr; + struct { + u_char inp4_ip_tos; /* type of service */ + } inp_depend4; + struct { + u_int8_t inp6_hlim; + int inp6_cksum; + u_short inp6_ifindex; + short inp6_hops; + } inp_depend6; +}; + +#endif /* PRIVATE */ + struct xinpgen { u_int32_t xig_len; /* length of this structure */ u_int xig_count; /* number of PCBs at this time */ @@ -419,6 +467,7 @@ struct xinpgen { #define in6p_ppcb inp_ppcb /* for KAME src sync over BSD*'s */ #define in6p_state inp_state #define in6p_wantcnt inp_wantcnt +#define in6p_last_outif inp_last_outif #ifdef KERNEL_PRIVATE struct inpcbport { @@ -477,31 +526,36 @@ struct inpcbinfo { /* XXX documentation, prefixes */ #ifdef __APPLE__ #define INP_STRIPHDR 0x200 /* Strip headers in raw_ip, for OT support */ #endif -#define INP_FAITH 0x400 /* accept FAITH'ed connections */ +#define INP_FAITH 0x400 /* accept FAITH'ed connections */ #define INP_INADDR_ANY 0x800 /* local address wasn't specified */ #define INP_RECVTTL 0x1000 #define INP_UDP_NOCKSUM 0x2000 /* Turn off outbound UDP checksum */ #define INP_BOUND_IF 0x4000 /* bind socket to an ifindex */ -#define IN6P_IPV6_V6ONLY 0x008000 /* restrict AF_INET6 socket for v6 */ - -#define IN6P_PKTINFO 0x010000 /* receive IP6 dst and I/F */ -#define IN6P_HOPLIMIT 0x020000 /* receive hoplimit */ -#define IN6P_HOPOPTS 0x040000 /* receive hop-by-hop options */ -#define IN6P_DSTOPTS 0x080000 /* receive dst options after rthdr */ -#define IN6P_RTHDR 0x100000 /* receive routing header */ +#define IN6P_IPV6_V6ONLY 0x8000 /* restrict AF_INET6 socket for v6 */ +#define IN6P_PKTINFO 0x10000 /* receive IP6 dst and I/F */ +#define IN6P_HOPLIMIT 0x20000 /* receive hoplimit */ +#define IN6P_HOPOPTS 0x40000 /* receive hop-by-hop options */ +#define IN6P_DSTOPTS 0x80000 /* receive dst options after rthdr */ +#define IN6P_RTHDR 0x100000 /* receive routing header */ #define IN6P_RTHDRDSTOPTS 0x200000 /* receive dstoptions before rthdr */ -#define IN6P_TCLASS 0x400000 /* receive traffic class value */ +#define IN6P_TCLASS 0x400000 /* receive traffic class value */ #define IN6P_AUTOFLOWLABEL 0x800000 /* attach flowlabel automatically */ -#define IN6P_BINDV6ONLY 0x10000000 /* do not grab IPv4 traffic */ +#define IN6P_BINDV6ONLY 0x1000000 /* do not grab IPv4 traffic */ +#define IN6P_RFC2292 0x2000000 /* used RFC2292 API on the socket */ +#define IN6P_MTU 0x4000000 /* receive path MTU */ +#define INP_PKTINFO 0x8000000 /* receive and send PKTINFO for IPv4 */ + +#define 
INP_NO_IFT_CELLULAR 0x20000000 /* do not use IFT_CELLULAR route */ #ifdef KERNEL_PRIVATE #define INP_CONTROLOPTS (INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\ - INP_RECVIF|INP_RECVTTL|\ + INP_RECVIF|INP_RECVTTL|INP_PKTINFO|\ IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\ IN6P_DSTOPTS|IN6P_RTHDR|IN6P_RTHDRDSTOPTS|\ - IN6P_TCLASS|IN6P_AUTOFLOWLABEL) + IN6P_TCLASS|IN6P_RFC2292|IN6P_MTU) + #define INP_UNMAPPABLEOPTS (IN6P_HOPOPTS|IN6P_DSTOPTS|IN6P_RTHDR|\ IN6P_TCLASS|IN6P_AUTOFLOWLABEL) @@ -513,6 +567,7 @@ struct inpcbinfo { /* XXX documentation, prefixes */ #define IN6P_MTUDISC INP_MTUDISC #define IN6P_FAITH INP_FAITH #define IN6P_CONTROLOPTS INP_CONTROLOPTS +#define IN6P_NO_IFT_CELLULAR INP_NO_IFT_CELLULAR /* * socket AF version is {newer than,or include} * actual datagram AF version @@ -530,6 +585,7 @@ struct inpcbinfo { /* XXX documentation, prefixes */ #define sotoin6pcb(so) sotoinpcb(so) /* for KAME src sync over BSD*'s */ #define INP_SOCKAF(so) so->so_proto->pr_domain->dom_family +#define INP_SOCKTYPE(so) so->so_proto->pr_type #define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af) @@ -541,6 +597,8 @@ extern int ipport_lastauto; extern int ipport_hifirstauto; extern int ipport_hilastauto; +struct sysctl_req; + #define INPCB_STATE_INUSE 0x1 /* freshly allocated PCB, it's in use */ #define INPCB_STATE_CACHED 0x2 /* this pcb is sitting in a a cache */ #define INPCB_STATE_DEAD 0x3 /* should treat as gone, will be garbage collected and freed */ @@ -553,19 +611,21 @@ extern void in_losing(struct inpcb *); extern void in_rtchange(struct inpcb *, int); extern int in_pcballoc(struct socket *, struct inpcbinfo *, struct proc *); extern int in_pcbbind(struct inpcb *, struct sockaddr *, struct proc *); -extern int in_pcbconnect(struct inpcb *, struct sockaddr *, struct proc *); +extern int in_pcbconnect(struct inpcb *, struct sockaddr *, struct proc *, unsigned int *); extern void in_pcbdetach(struct inpcb *); extern void in_pcbdispose (struct inpcb *); extern void in_pcbdisconnect(struct inpcb *); extern int in_pcbinshash(struct inpcb *, int); extern int in_pcbladdr(struct inpcb *, struct sockaddr *, - struct sockaddr_in **); + struct sockaddr_in *, unsigned int *); extern struct inpcb *in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_int, int); extern struct inpcb *in_pcblookup_local_and_cleanup(struct inpcbinfo *, struct in_addr, u_int, int); extern struct inpcb *in_pcblookup_hash(struct inpcbinfo *, struct in_addr, u_int, struct in_addr, u_int, int, struct ifnet *); +extern int in_pcblookup_hash_exists(struct inpcbinfo *, struct in_addr, + u_int, struct in_addr, u_int, int, uid_t *, gid_t *, struct ifnet *); extern void in_pcbnotifyall(struct inpcbinfo *, struct in_addr, int, void (*)(struct inpcb *, int)); extern void in_pcbrehash(struct inpcb *); @@ -580,8 +640,11 @@ extern void inpcb_to_compat(struct inpcb *inp, extern void inpcb_to_xinpcb64(struct inpcb *inp, struct xinpcb64 *xinp); #endif +extern int get_pcblist_n(short , struct sysctl_req *, struct inpcbinfo *); extern void inp_route_copyout(struct inpcb *, struct route *); extern void inp_route_copyin(struct inpcb *, struct route *); +extern void inp_bindif(struct inpcb *, unsigned int); +extern int inp_nocellular(struct inpcb *, unsigned int); #endif /* KERNEL */ #endif /* KERNEL_PRIVATE */ diff --git a/bsd/netinet/in_pcblist.c b/bsd/netinet/in_pcblist.c new file mode 100644 index 000000000..9ff8839b5 --- /dev/null +++ b/bsd/netinet/in_pcblist.c @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. 
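Note the one-way semantics of INP_NO_IFT_CELLULAR: inp_nocellular(), in the in_pcb.c hunk earlier in this patch, latches the flag when val is nonzero, returns EINVAL on any attempt to clear it, and frees the PCB's cached route so the next output lookup honors the restriction. A minimal sketch of how a per-socket handler might drive it; the wrapper name and option plumbing are hypothetical, only inp_nocellular() and the flag come from the patch.

    /*
     * Hedged sketch: apply the one-way "no cellular" restriction.
     * so_restrict_cellular() is a hypothetical wrapper, not in the patch.
     */
    static int
    so_restrict_cellular(struct socket *so, int optval)
    {
            struct inpcb *inp = sotoinpcb(so);

            if (inp == NULL)
                    return (EINVAL);
            /*
             * Nonzero latches INP_NO_IFT_CELLULAR and flushes the cached
             * route; zero after the flag is set fails with EINVAL because
             * the restriction cannot be lifted.
             */
            return (inp_nocellular(inp, (unsigned int)optval));
    }
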
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Copyright (c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
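A remark on the in_pcb.h hunk above: inp_stat_store is sized sizeof(struct inp_stat) + sizeof(u_int64_t) so that a 64-bit-aligned struct inp_stat fits inside the byte array no matter where the enclosing inpcb lands; the spare eight bytes absorb the worst-case misalignment. The patch does not show the initializer, so the helper below is an assumption illustrating the idiom:

    /* Sketch: point inp_stat at the first 8-byte boundary in the store. */
    static struct inp_stat *
    inp_stat_in_store(struct inpcb *inp)
    {
            uintptr_t p = (uintptr_t)inp->inp_stat_store;

            p = (p + sizeof (u_int64_t) - 1) &
                ~((uintptr_t)sizeof (u_int64_t) - 1);
            /* Always in bounds: the store is oversized by 8 bytes. */
            return ((struct inp_stat *)p);
    }
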
+ */ + +#include <sys/types.h> +#include <sys/malloc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/protosw.h> +#include <sys/domain.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/dtrace.h> + +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/ip_var.h> + +#include <netinet/udp.h> +#include <netinet/udp_var.h> + +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> + +#ifndef ROUNDUP64 +#define ROUNDUP64(x) P2ROUNDUP((x), sizeof(u_int64_t)) +#endif + +#ifndef ADVANCE64 +#define ADVANCE64(p, n) (void*)((char *)(p) + ROUNDUP64(n)) +#endif + + +void sotoxsocket_n(struct socket *, struct xsocket_n *); +void sbtoxsockbuf_n(struct sockbuf *, struct xsockbuf_n *); +void sbtoxsockstat_n(struct socket *, struct xsockstat_n *); +void inpcb_to_xinpcb_n(struct inpcb *, struct xinpcb_n *); +void tcpcb_to_xtcpcb_n(struct tcpcb *, struct xtcpcb_n *); + +__private_extern__ void +sotoxsocket_n(struct socket *so, struct xsocket_n *xso) +{ + xso->xso_len = sizeof(struct xsocket_n); + xso->xso_kind = XSO_SOCKET; + + if (so != NULL) { + xso->xso_so = (u_int64_t)(uintptr_t)so; + xso->so_type = so->so_type; + xso->so_options = so->so_options; + xso->so_linger = so->so_linger; + xso->so_state = so->so_state; + xso->so_pcb = (u_int64_t)(uintptr_t)so->so_pcb; + if (so->so_proto) { + xso->xso_protocol = so->so_proto->pr_protocol; + xso->xso_family = so->so_proto->pr_domain->dom_family; + } else { + xso->xso_protocol = xso->xso_family = 0; + } + xso->so_qlen = so->so_qlen; + xso->so_incqlen = so->so_incqlen; + xso->so_qlimit = so->so_qlimit; + xso->so_timeo = so->so_timeo; + xso->so_error = so->so_error; + xso->so_pgid = so->so_pgid; + xso->so_oobmark = so->so_oobmark; + xso->so_uid = so->so_uid; + } +} + +__private_extern__ void +sbtoxsockbuf_n(struct sockbuf *sb, struct xsockbuf_n *xsb) +{ + xsb->xsb_len = sizeof(struct xsockbuf_n); + xsb->xsb_kind = (sb->sb_flags & SB_RECV) ? 
XSO_RCVBUF : XSO_SNDBUF; + + if (sb != NULL) { + xsb->sb_cc = sb->sb_cc; + xsb->sb_hiwat = sb->sb_hiwat; + xsb->sb_mbcnt = sb->sb_mbcnt; + xsb->sb_mbmax = sb->sb_mbmax; + xsb->sb_lowat = sb->sb_lowat; + xsb->sb_flags = sb->sb_flags; + xsb->sb_timeo = (short) + (sb->sb_timeo.tv_sec * hz) + sb->sb_timeo.tv_usec / tick; + if (xsb->sb_timeo == 0 && sb->sb_timeo.tv_usec != 0) + xsb->sb_timeo = 1; + } +} + +__private_extern__ void +sbtoxsockstat_n(struct socket *so, struct xsockstat_n *xst) +{ + int i; + + xst->xst_len = sizeof(struct xsockstat_n); + xst->xst_kind = XSO_STATS; + + for (i = 0; i < SO_TC_STATS_MAX; i++) { + xst->xst_tc_stats[i].rxpackets = so->so_tc_stats[i].rxpackets; + xst->xst_tc_stats[i].rxbytes = so->so_tc_stats[i].rxbytes; + xst->xst_tc_stats[i].txpackets = so->so_tc_stats[i].txpackets; + xst->xst_tc_stats[i].txbytes = so->so_tc_stats[i].txbytes; + } +} + +__private_extern__ void +inpcb_to_xinpcb_n(struct inpcb *inp, struct xinpcb_n *xinp) +{ + xinp->xi_len = sizeof(struct xinpcb_n); + xinp->xi_kind = XSO_INPCB; + xinp->xi_inpp = (u_int64_t)(uintptr_t)inp; + xinp->inp_fport = inp->inp_fport; + xinp->inp_lport = inp->inp_lport; + xinp->inp_ppcb = (u_int64_t)(uintptr_t)inp->inp_ppcb; + xinp->inp_gencnt = inp->inp_gencnt; + xinp->inp_flags = inp->inp_flags; + xinp->inp_flow = inp->inp_flow; + xinp->inp_vflag = inp->inp_vflag; + xinp->inp_ip_ttl = inp->inp_ip_ttl; + xinp->inp_ip_p = inp->inp_ip_p; + xinp->inp_dependfaddr.inp6_foreign = inp->inp_dependfaddr.inp6_foreign; + xinp->inp_dependladdr.inp6_local = inp->inp_dependladdr.inp6_local; + xinp->inp_depend4.inp4_ip_tos = inp->inp_depend4.inp4_ip_tos; + xinp->inp_depend6.inp6_hlim = inp->inp_depend6.inp6_hlim; + xinp->inp_depend6.inp6_cksum = inp->inp_depend6.inp6_cksum; + xinp->inp_depend6.inp6_ifindex = inp->inp_depend6.inp6_ifindex; + xinp->inp_depend6.inp6_hops = inp->inp_depend6.inp6_hops; +} + +__private_extern__ void +tcpcb_to_xtcpcb_n(struct tcpcb *tp, struct xtcpcb_n *xt) +{ + int i; + + xt->xt_len = sizeof(struct xtcpcb_n); + xt->xt_kind = XSO_TCPCB; + + xt->t_segq = (u_int32_t)(uintptr_t)tp->t_segq.lh_first; + xt->t_dupacks = tp->t_dupacks; + for (i = 0; i < TCPT_NTIMERS_EXT; i++) + xt->t_timer[i] = tp->t_timer[i]; + xt->t_state = tp->t_state; + xt->t_flags = tp->t_flags; + xt->t_force = tp->t_force; + xt->snd_una = tp->snd_una; + xt->snd_max = tp->snd_max; + xt->snd_nxt = tp->snd_nxt; + xt->snd_up = tp->snd_up; + xt->snd_wl1 = tp->snd_wl1; + xt->snd_wl2 = tp->snd_wl2; + xt->iss = tp->iss; + xt->irs = tp->irs; + xt->rcv_nxt = tp->rcv_nxt; + xt->rcv_adv = tp->rcv_adv; + xt->rcv_wnd = tp->rcv_wnd; + xt->rcv_up = tp->rcv_up; + xt->snd_wnd = tp->snd_wnd; + xt->snd_cwnd = tp->snd_cwnd; + xt->snd_ssthresh = tp->snd_ssthresh; + xt->t_maxopd = tp->t_maxopd; + xt->t_rcvtime = tp->t_rcvtime; + xt->t_starttime = tp->t_starttime; + xt->t_rtttime = tp->t_rtttime; + xt->t_rtseq = tp->t_rtseq; + xt->t_rxtcur = tp->t_rxtcur; + xt->t_maxseg = tp->t_maxseg; + xt->t_srtt = tp->t_srtt; + xt->t_rttvar = tp->t_rttvar; + xt->t_rxtshift = tp->t_rxtshift; + xt->t_rttmin = tp->t_rttmin; + xt->t_rttupdated = tp->t_rttupdated; + xt->max_sndwnd = tp->max_sndwnd; + xt->t_softerror = tp->t_softerror; + xt->t_oobflags = tp->t_oobflags; + xt->t_iobc = tp->t_iobc; + xt->snd_scale = tp->snd_scale; + xt->rcv_scale = tp->rcv_scale; + xt->request_r_scale = tp->request_r_scale; + xt->requested_s_scale = tp->requested_s_scale; + xt->ts_recent = tp->ts_recent; + xt->ts_recent_age = tp->ts_recent_age; + xt->last_ack_sent = tp->last_ack_sent; + xt->cc_send = 
tp->cc_send; + xt->cc_recv = tp->cc_recv; + xt->snd_recover = tp->snd_recover; + xt->snd_cwnd_prev = tp->snd_cwnd_prev; + xt->snd_ssthresh_prev = tp->snd_ssthresh_prev; + xt->t_badrxtwin = tp->t_badrxtwin; +} + +__private_extern__ int +get_pcblist_n(short proto, struct sysctl_req *req, struct inpcbinfo *pcbinfo) +{ + int error = 0; + int i, n; + struct inpcb *inp, **inp_list = NULL; + inp_gen_t gencnt; + struct xinpgen xig; + void *buf = NULL; + size_t item_size = ROUNDUP64(sizeof(struct xinpcb_n)) + + ROUNDUP64(sizeof(struct xsocket_n)) + + 2 * ROUNDUP64(sizeof(struct xsockbuf_n)) + + ROUNDUP64(sizeof(struct xsockstat_n)); + + if (proto == IPPROTO_TCP) + item_size += ROUNDUP64(sizeof(struct xtcpcb_n)); + + /* + * The process of preparing the PCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + lck_rw_lock_exclusive(pcbinfo->mtx); + if (req->oldptr == USER_ADDR_NULL) { + n = pcbinfo->ipi_count; + req->oldidx = 2 * (sizeof xig) + + (n + n/8) * item_size; + goto done; + } + + if (req->newptr != USER_ADDR_NULL) { + error = EPERM; + goto done; + } + + /* + * OK, now we're committed to doing something. + */ + gencnt = pcbinfo->ipi_gencnt; + n = pcbinfo->ipi_count; + + bzero(&xig, sizeof(xig)); + xig.xig_len = sizeof xig; + xig.xig_count = n; + xig.xig_gen = gencnt; + xig.xig_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xig, sizeof xig); + if (error) { + goto done; + } + /* + * We are done if there is no pcb + */ + if (n == 0) { + goto done; + } + + buf = _MALLOC(item_size, M_TEMP, M_WAITOK); + if (buf == 0) { + error = ENOMEM; + goto done; + } + + inp_list = _MALLOC(n * sizeof *inp_list, M_TEMP, M_WAITOK); + if (inp_list == 0) { + error = ENOMEM; + goto done; + } + + for (inp = pcbinfo->listhead->lh_first, i = 0; inp && i < n; + inp = inp->inp_list.le_next) { + if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) + inp_list[i++] = inp; + } + n = i; + + error = 0; + for (i = 0; i < n; i++) { + inp = inp_list[i]; + if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) { + struct xinpcb_n *xi = (struct xinpcb_n *)buf; + struct xsocket_n *xso = (struct xsocket_n *)ADVANCE64(xi, sizeof(*xi)); + struct xsockbuf_n *xsbrcv = (struct xsockbuf_n *)ADVANCE64(xso, sizeof(*xso)); + struct xsockbuf_n *xsbsnd = (struct xsockbuf_n *)ADVANCE64(xsbrcv, sizeof(*xsbrcv)); + struct xsockstat_n *xsostats = (struct xsockstat_n *)ADVANCE64(xsbsnd, sizeof(*xsbsnd)); + + bzero(buf, item_size); + + inpcb_to_xinpcb_n(inp, xi); + sotoxsocket_n(inp->inp_socket, xso); + sbtoxsockbuf_n(inp->inp_socket ? &inp->inp_socket->so_rcv : NULL, xsbrcv); + sbtoxsockbuf_n(inp->inp_socket ? &inp->inp_socket->so_snd : NULL, xsbsnd); + sbtoxsockstat_n(inp->inp_socket, xsostats); + if (proto == IPPROTO_TCP) { + struct xtcpcb_n *xt = (struct xtcpcb_n *)ADVANCE64(xsostats, sizeof(*xsostats)); + + /* + * inp->inp_ppcb, can only be NULL on + * an initialization race window. + * No need to lock. + */ + if (inp->inp_ppcb == NULL) + continue; + + tcpcb_to_xtcpcb_n((struct tcpcb *)inp->inp_ppcb, xt); + } + error = SYSCTL_OUT(req, buf, item_size); + } + } + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. 
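One hazard worth flagging in the new in_pcblist.c: sbtoxsockbuf_n() reads sb->sb_flags to compute xsb_kind before its if (sb != NULL) guard, yet get_pcblist_n() passes NULL when inp_socket is NULL, so the guard cannot prevent the dereference. A hedged sketch of a safer ordering (not what this patch ships):

    /* Sketch: NULL-safe field ordering for sbtoxsockbuf_n(). */
    static void
    sbtoxsockbuf_n_safe(struct sockbuf *sb, struct xsockbuf_n *xsb)
    {
            xsb->xsb_len = sizeof (struct xsockbuf_n);
            if (sb == NULL)
                    return; /* kind/fields stay zero from the caller's bzero */
            xsb->xsb_kind = (sb->sb_flags & SB_RECV) ? XSO_RCVBUF : XSO_SNDBUF;
            xsb->sb_cc = sb->sb_cc;
            /* ... copy the remaining fields exactly as the original does ... */
    }
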
+ */ + bzero(&xig, sizeof(xig)); + xig.xig_len = sizeof xig; + xig.xig_gen = pcbinfo->ipi_gencnt; + xig.xig_sogen = so_gencnt; + xig.xig_count = pcbinfo->ipi_count; + error = SYSCTL_OUT(req, &xig, sizeof xig); + } +done: + lck_rw_done(pcbinfo->mtx); + if (inp_list) + FREE(inp_list, M_TEMP); + if (buf) + FREE(buf, M_TEMP); + return error; +} + diff --git a/bsd/netinet/in_proto.c b/bsd/netinet/in_proto.c index f08af184f..7c979683a 100644 --- a/bsd/netinet/in_proto.c +++ b/bsd/netinet/in_proto.c @@ -164,7 +164,7 @@ struct protosw inetsw[] = { { SOCK_RAW, &inetdomain, IPPROTO_IGMP, PR_ATOMIC|PR_ADDR|PR_LASTHDR, igmp_input, 0, 0, rip_ctloutput, 0, - igmp_init, igmp_fasttimo, igmp_slowtimo, 0, + igmp_init, 0, igmp_slowtimo, 0, 0, &rip_usrreqs, 0, rip_unlock, 0, { 0, 0 }, 0, { 0 } diff --git a/bsd/netinet/in_rmx.c b/bsd/netinet/in_rmx.c index 37ca3d250..2d0c2735d 100644 --- a/bsd/netinet/in_rmx.c +++ b/bsd/netinet/in_rmx.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -75,13 +75,16 @@ #include <sys/sysctl.h> #include <sys/socket.h> #include <sys/mbuf.h> +#include <sys/protosw.h> #include <sys/syslog.h> +#include <sys/mcache.h> #include <kern/lock.h> #include <net/if.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/in_var.h> +#include <netinet/in_arp.h> extern int tvtohz(struct timeval *); extern int in_inithead(void **head, int off); @@ -139,11 +142,15 @@ in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, if (in_broadcast(sin->sin_addr, rt->rt_ifp)) { rt->rt_flags |= RTF_BROADCAST; } else { + /* Become a regular mutex */ + RT_CONVERT_LOCK(rt); + IFA_LOCK_SPIN(rt->rt_ifa); #define satosin(sa) ((struct sockaddr_in *)sa) if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr == sin->sin_addr.s_addr) rt->rt_flags |= RTF_LOCAL; #undef satosin + IFA_UNLOCK(rt->rt_ifa); } } @@ -160,7 +167,7 @@ in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, * ARP entry and delete it if so. */ rt2 = rtalloc1_scoped_locked(rt_key(rt), 0, - RTF_CLONING | RTF_PRCLONING, sa_get_ifscope(rt_key(rt))); + RTF_CLONING | RTF_PRCLONING, sin_get_ifscope(rt_key(rt))); if (rt2) { RT_LOCK(rt2); if ((rt2->rt_flags & RTF_LLINFO) && @@ -199,9 +206,17 @@ in_validate(struct radix_node *rn) RT_LOCK_ASSERT_HELD(rt); /* This is first reference? 
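get_pcblist_n() above emits a leading struct xinpgen, then one record per live PCB (xinpcb_n, xsocket_n, two xsockbuf_n, xsockstat_n, plus xtcpcb_n for TCP, each padded to an 8-byte boundary), then a trailing xinpgen whose generation count lets the reader detect concurrent mutation and retry. A hedged userland sketch of walking that layout; the MIB name and the visibility of these PRIVATE structures to user code are assumptions, only the record format comes from this hunk:

    #include <stdio.h>
    #include <stdlib.h>
    #include <arpa/inet.h>
    #include <sys/sysctl.h>
    #include <netinet/in_pcb.h>     /* PRIVATE structures; assumed visible */

    #define MY_ROUNDUP64(x) \
            (((x) + sizeof (u_int64_t) - 1) & ~(sizeof (u_int64_t) - 1))

    /* Sketch: dump local/foreign ports from a pcblist_n-style buffer. */
    static void
    dump_pcblist_n(const char *mib)  /* e.g. "net.inet.tcp.pcblist_n" (assumed) */
    {
            size_t len = 0;
            char *buf, *p, *end;

            if (sysctlbyname(mib, NULL, &len, NULL, 0) == -1 ||
                (buf = malloc(len)) == NULL)
                    return;
            if (sysctlbyname(mib, buf, &len, NULL, 0) == -1) {
                    free(buf);
                    return;
            }
            end = buf + len;
            p = buf + MY_ROUNDUP64(((struct xinpgen *)buf)->xig_len);
            while (p + 2 * sizeof (u_int32_t) <= end) {
                    struct xinpcb_n *xi = (struct xinpcb_n *)p;

                    /* Simplified terminator check; a robust reader also
                     * compares generation counts and tracks record kinds. */
                    if (xi->xi_len == sizeof (struct xinpgen))
                            break;
                    if (xi->xi_kind == XSO_INPCB)
                            printf("lport %u fport %u\n",
                                ntohs(xi->inp_lport), ntohs(xi->inp_fport));
                    p += MY_ROUNDUP64(xi->xi_len);  /* every record is length-prefixed */
            }
            free(buf);
    }
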
*/ - if (rt->rt_refcnt == 0 && (rt->rt_flags & RTPRF_OURS)) { - rt->rt_flags &= ~RTPRF_OURS; - rt->rt_rmx.rmx_expire = 0; + if (rt->rt_refcnt == 0) { + if (rt->rt_flags & RTPRF_OURS) { + /* It's one of ours; unexpire it */ + rt->rt_flags &= ~RTPRF_OURS; + rt_setexpire(rt, 0); + } else if ((rt->rt_flags & RTF_LLINFO) && + (rt->rt_flags & RTF_HOST) && rt->rt_gateway != NULL && + rt->rt_gateway->sa_family == AF_LINK) { + /* It's ARP; let it be handled there */ + arp_validate(rt); + } } return (rn); } @@ -236,19 +251,19 @@ in_matroute_args(void *v_arg, struct radix_node_head *head, static int rtq_reallyold = 60*60; /* one hour is ``really old'' */ -SYSCTL_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW | CTLFLAG_LOCKED, &rtq_reallyold , 0, "Default expiration time on dynamically learned routes"); static int rtq_minreallyold = 10; /* never automatically crank down to less */ -SYSCTL_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW | CTLFLAG_LOCKED, &rtq_minreallyold , 0, "Minimum time to attempt to hold onto dynamically learned routes"); static int rtq_toomany = 128; /* 128 cached routes is ``too many'' */ -SYSCTL_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW | CTLFLAG_LOCKED, &rtq_toomany , 0, "Upper limit on dynamically learned routes"); #ifdef __APPLE__ @@ -265,12 +280,12 @@ SYSCTL_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW, * If for some reason a circular route is needed, turn this sysctl (net.inet.ip.check_route_selfref) to zero. */ int check_routeselfref = 1; -SYSCTL_INT(_net_inet_ip, OID_AUTO, check_route_selfref, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, check_route_selfref, CTLFLAG_RW | CTLFLAG_LOCKED, &check_routeselfref , 0, ""); #endif int use_routegenid = 1; -SYSCTL_INT(_net_inet_ip, OID_AUTO, use_route_genid, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, use_route_genid, CTLFLAG_RW | CTLFLAG_LOCKED, &use_routegenid , 0, ""); /* @@ -319,12 +334,12 @@ in_clsroute(struct radix_node *rn, __unused struct radix_node_head *head) RT_LOCK(rt); } } else { - struct timeval timenow; + uint64_t timenow; - getmicrotime(&timenow); + timenow = net_uptime(); rt->rt_flags |= RTPRF_OURS; - rt->rt_rmx.rmx_expire = - rt_expiry(rt, timenow.tv_sec, rtq_reallyold); + rt_setexpire(rt, + rt_expiry(rt, timenow, rtq_reallyold)); } } @@ -334,7 +349,7 @@ struct rtqk_arg { int killed; int found; int updating; - time_t nextstop; + uint64_t nextstop; }; /* @@ -348,16 +363,18 @@ in_rtqkill(struct radix_node *rn, void *rock) struct rtqk_arg *ap = rock; struct rtentry *rt = (struct rtentry *)rn; int err; - struct timeval timenow; + uint64_t timenow; - getmicrotime(&timenow); + timenow = net_uptime(); lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); RT_LOCK(rt); if (rt->rt_flags & RTPRF_OURS) { ap->found++; - if (ap->draining || rt->rt_rmx.rmx_expire <= timenow.tv_sec) { + VERIFY(rt->rt_expire == 0 || rt->rt_rmx.rmx_expire != 0); + VERIFY(rt->rt_expire != 0 || rt->rt_rmx.rmx_expire == 0); + if (ap->draining || rt->rt_expire <= timenow) { if (rt->rt_refcnt > 0) panic("rtqkill route really not free"); @@ -380,13 +397,13 @@ in_rtqkill(struct radix_node *rn, void *rock) } } else { if (ap->updating && - (unsigned)(rt->rt_rmx.rmx_expire - timenow.tv_sec) > + (rt->rt_expire - timenow) > rt_expiry(rt, 0, rtq_reallyold)) { - rt->rt_rmx.rmx_expire = rt_expiry(rt, - 
timenow.tv_sec, rtq_reallyold); + rt_setexpire(rt, rt_expiry(rt, + timenow, rtq_reallyold)); } ap->nextstop = lmin(ap->nextstop, - rt->rt_rmx.rmx_expire); + rt->rt_expire); RT_UNLOCK(rt); } } else { @@ -411,16 +428,16 @@ in_rtqtimo(void *rock) struct radix_node_head *rnh = rock; struct rtqk_arg arg; struct timeval atv; - static time_t last_adjusted_timeout = 0; - struct timeval timenow; + static uint64_t last_adjusted_timeout = 0; + uint64_t timenow; lck_mtx_lock(rnh_lock); /* Get the timestamp after we acquire the lock for better accuracy */ - getmicrotime(&timenow); + timenow = net_uptime(); arg.found = arg.killed = 0; arg.rnh = rnh; - arg.nextstop = timenow.tv_sec + rtq_timeout; + arg.nextstop = timenow + rtq_timeout; arg.draining = arg.updating = 0; rnh->rnh_walktree(rnh, in_rtqkill, &arg); @@ -433,14 +450,14 @@ in_rtqtimo(void *rock) * hard. */ if((arg.found - arg.killed > rtq_toomany) - && (timenow.tv_sec - last_adjusted_timeout >= rtq_timeout) + && ((timenow - last_adjusted_timeout) >= (uint64_t)rtq_timeout) && rtq_reallyold > rtq_minreallyold) { rtq_reallyold = 2*rtq_reallyold / 3; if(rtq_reallyold < rtq_minreallyold) { rtq_reallyold = rtq_minreallyold; } - last_adjusted_timeout = timenow.tv_sec; + last_adjusted_timeout = timenow; #if DIAGNOSTIC log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n", rtq_reallyold); @@ -451,7 +468,7 @@ in_rtqtimo(void *rock) } atv.tv_usec = 0; - atv.tv_sec = arg.nextstop - timenow.tv_sec; + atv.tv_sec = arg.nextstop - timenow; lck_mtx_unlock(rnh_lock); timeout(in_rtqtimo_funnel, rock, tvtohz(&atv)); } @@ -557,8 +574,13 @@ in_ifadown(struct ifaddr *ifa, int delete) lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); + /* + * Holding rnh_lock here prevents the possibility of + * ifa from changing (e.g. in_ifinit), so it is safe + * to access its ifa_addr without locking. + */ if (ifa->ifa_addr->sa_family != AF_INET) - return 1; + return (1); /* trigger route cache reevaluation */ if (use_routegenid) @@ -568,6 +590,8 @@ in_ifadown(struct ifaddr *ifa, int delete) arg.ifa = ifa; arg.del = delete; rnh->rnh_walktree(rnh, in_ifadownkill, &arg); + IFA_LOCK_SPIN(ifa); ifa->ifa_flags &= ~IFA_ROUTE; - return 0; + IFA_UNLOCK(ifa); + return (0); } diff --git a/bsd/netinet/in_tclass.c b/bsd/netinet/in_tclass.c new file mode 100644 index 000000000..54b5fcc1d --- /dev/null +++ b/bsd/netinet/in_tclass.c @@ -0,0 +1,850 @@ +/* + * Copyright (c) 2009-2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
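The in_rmx.c changes above switch cached-route expiry from wall-clock getmicrotime() to the monotonic net_uptime(), so clock steps can no longer mass-expire or immortalize routes, and they funnel writes through rt_setexpire(), which the VERIFY()s in in_rtqkill() check stays in sync with rmx_expire. The adaptive part of in_rtqtimo() is unchanged in spirit: when a sweep leaves more than rtq_toomany dynamic routes alive, retention time drops to two thirds, floored at rtq_minreallyold. Condensed for clarity (variable names from the patch, framing mine):

    /* Sketch: the retention backoff applied after each in_rtqtimo() sweep. */
    static void
    rtq_backoff(int surviving, uint64_t now, uint64_t *last_adjusted)
    {
            if (surviving > rtq_toomany &&
                (now - *last_adjusted) >= (uint64_t)rtq_timeout &&
                rtq_reallyold > rtq_minreallyold) {
                    rtq_reallyold = 2 * rtq_reallyold / 3;
                    if (rtq_reallyold < rtq_minreallyold)
                            rtq_reallyold = rtq_minreallyold;
                    *last_adjusted = now;
            }
    }
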
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/types.h> +#include <sys/filedesc.h> +#include <sys/file_internal.h> +#include <sys/proc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/errno.h> +#include <sys/protosw.h> +#include <sys/domain.h> +#include <sys/mbuf.h> +#include <sys/queue.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_var.h> +#include <netinet/in_pcb.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/ip6.h> +#include <netinet6/ip6_var.h> +#include <netinet/udp.h> +#include <netinet/udp_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_var.h> +#include <netinet/tcp_cc.h> + +extern char *proc_name_address(void *p); + +static int tfp_count = 0; + +static TAILQ_HEAD(, tclass_for_proc) tfp_head = TAILQ_HEAD_INITIALIZER(tfp_head); + +struct tclass_for_proc { + TAILQ_ENTRY(tclass_for_proc) tfp_link; + int tfp_class; + pid_t tfp_pid; + char tfp_pname[MAXCOMLEN + 1]; +}; + +extern void tcp_set_background_cc(struct socket *); +extern void tcp_set_foreground_cc(struct socket *); + +int dscp_code_from_mbuf_tclass(int ); + +static int get_pid_tclass(pid_t , int *); +static int get_pname_tclass(const char * , int *); +static int set_pid_tclass(pid_t , int ); +static int set_pname_tclass(const char * , int ); +static int purge_tclass_for_proc(void); +static int flush_tclass_for_proc(void); + + +static lck_grp_attr_t *tclass_lck_grp_attr = NULL; /* mutex group attributes */ +static lck_grp_t *tclass_lck_grp = NULL; /* mutex group definition */ +static lck_attr_t *tclass_lck_attr = NULL; /* mutex attributes */ +static lck_mtx_t *tclass_lock = NULL; + +/* + * Must be called with tclass_lock held + */ +static struct tclass_for_proc * +find_tfp_by_pid(pid_t pid) +{ + struct tclass_for_proc *tfp; + + TAILQ_FOREACH(tfp, &tfp_head, tfp_link) { + if (tfp->tfp_pid == pid) + break; + } + return tfp; +} + +/* + * Must be called with tclass_lock held + */ +static struct tclass_for_proc * +find_tfp_by_pname(const char *pname) +{ + struct tclass_for_proc *tfp; + + TAILQ_FOREACH(tfp, &tfp_head, tfp_link) { + if (strncmp(pname, tfp->tfp_pname, sizeof(tfp->tfp_pname)) == 0) + break; + } + return tfp; +} + +static int +get_tclass_for_curr_proc(void) +{ + struct tclass_for_proc *tfp; + int sotc = SO_TC_BE; + proc_t p = current_proc(); /* Not ref counted */ + pid_t pid = proc_pid(p); + char *pname = proc_name_address(p); + + lck_mtx_lock(tclass_lock); + + TAILQ_FOREACH(tfp, &tfp_head, tfp_link) { + if ((tfp->tfp_pid == pid) || + (tfp->tfp_pid == -1 && strncmp(pname, tfp->tfp_pname, sizeof(tfp->tfp_pname)) == 0)) { + sotc = tfp->tfp_class; + break; + } + } + + lck_mtx_unlock(tclass_lock); + + return sotc; +} + +/* + * Purge entries with PIDs of exited processes + */ +int +purge_tclass_for_proc(void) +{ + int error = 0; + struct tclass_for_proc *tfp, *tvar; + + lck_mtx_lock(tclass_lock); + + TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) { + proc_t p; + + if 
(tfp->tfp_pid == -1) + continue; + if ((p = proc_find(tfp->tfp_pid)) == NULL) { + tfp_count--; + TAILQ_REMOVE(&tfp_head, tfp, tfp_link); + + _FREE(tfp, M_TEMP); + } else { + proc_rele(p); + } + } + + lck_mtx_unlock(tclass_lock); + + return error; +} + +/* + * Remove one entry + * Must be called with tclass_lock held + */ +static void +free_tclass_for_proc(struct tclass_for_proc *tfp) +{ + if (tfp == NULL) + return; + tfp_count--; + TAILQ_REMOVE(&tfp_head, tfp, tfp_link); + _FREE(tfp, M_TEMP); +} + +/* + * Remove all entries + */ +int +flush_tclass_for_proc(void) +{ + int error = 0; + struct tclass_for_proc *tfp, *tvar; + + lck_mtx_lock(tclass_lock); + + TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) { + free_tclass_for_proc(tfp); + } + + lck_mtx_unlock(tclass_lock); + + return error; + +} + +/* + * Must be called with tclass_lock held + */ +static struct tclass_for_proc * +alloc_tclass_for_proc(pid_t pid, const char *pname, int tclass) +{ + struct tclass_for_proc *tfp; + + if (pid == -1 && pname == NULL) + return NULL; + + tfp = _MALLOC(sizeof(struct tclass_for_proc), M_TEMP, M_NOWAIT | M_ZERO); + if (tfp == NULL) + return NULL; + + tfp->tfp_pid = pid; + tfp->tfp_class = tclass; + /* + * Add per pid entries before per proc name so we can find + * a specific instance of a process before the general name base entry. + */ + if (pid != -1) { + TAILQ_INSERT_HEAD(&tfp_head, tfp, tfp_link); + } else { + strlcpy(tfp->tfp_pname, pname, sizeof(tfp->tfp_pname)); + TAILQ_INSERT_TAIL(&tfp_head, tfp, tfp_link); + } + + tfp_count++; + + return tfp; +} + +/* + * -1 for tclass means to remove the entry + */ +int +set_pid_tclass(pid_t pid, int tclass) +{ + int error = EINVAL; + proc_t p = NULL; + struct filedesc *fdp; + struct fileproc *fp; + struct tclass_for_proc *tfp; + int i; + + p = proc_find(pid); + if (p == NULL) { + printf("set_pid_tclass proc_find(%d) \n", pid); + goto done; + } + + /* Need a tfp */ + lck_mtx_lock(tclass_lock); + + tfp = find_tfp_by_pid(pid); + if (tclass == -1) { + if (tfp != NULL) { + free_tclass_for_proc(tfp); + error = 0; + } + lck_mtx_unlock(tclass_lock); + goto done; + } else { + if (tfp == NULL) { + tfp = alloc_tclass_for_proc(pid, NULL, tclass); + if (tfp == NULL) { + lck_mtx_unlock(tclass_lock); + error = ENOBUFS; + goto done; + } + } else { + tfp->tfp_class = tclass; + } + } + lck_mtx_unlock(tclass_lock); + + if (tfp != NULL) { + proc_fdlock(p); + + fdp = p->p_fd; + for (i = 0; i < fdp->fd_nfiles; i++) { + struct socket *so; + + fp = fdp->fd_ofiles[i]; + if (fp == NULL || (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 || + fp->f_fglob->fg_type != DTYPE_SOCKET) + continue; + + so = (struct socket *)fp->f_fglob->fg_data; + if (so->so_proto->pr_domain->dom_family != AF_INET && + so->so_proto->pr_domain->dom_family != AF_INET6) + continue; + socket_lock(so, 1); + error = so_set_traffic_class(so, tclass != -1 ? 
tclass : SO_TC_BE); + socket_unlock(so, 1); + if (error != 0) { + printf("set_pid_tclass so_set_traffic_class(%p, %d) failed %d\n", so, tclass, error); + error = 0; + } + } + + proc_fdunlock(p); + } + + error = 0; +done: + if (p != NULL) + proc_rele(p); + + return error; +} + +int +set_pname_tclass(const char *pname, int tclass) +{ + int error = EINVAL; + struct tclass_for_proc *tfp; + + lck_mtx_lock(tclass_lock); + + tfp = find_tfp_by_pname(pname); + if (tclass == -1) { + if (tfp != NULL) + free_tclass_for_proc(tfp); + } else { + if (tfp == NULL) { + tfp = alloc_tclass_for_proc(-1, pname, tclass); + if (tfp == NULL) { + lck_mtx_unlock(tclass_lock); + error = ENOBUFS; + goto done; + } + } else { + tfp->tfp_class = tclass; + } + } + lck_mtx_unlock(tclass_lock); + + error = 0; +done: + + return error; +} + +int +get_pid_tclass(pid_t pid, int *tclass) +{ + int error = EINVAL; + proc_t p = NULL; + struct tclass_for_proc *tfp; + + *tclass = -1; /* Means not set */ + + p = proc_find(pid); + if (p == NULL) { + printf("get_pid_tclass proc_find(%d) \n", pid); + goto done; + } + + /* Need a tfp */ + lck_mtx_lock(tclass_lock); + + tfp = find_tfp_by_pid(pid); + if (tfp != NULL) { + *tclass = tfp->tfp_class ; + error = 0; + } + lck_mtx_unlock(tclass_lock); +done: + if (p != NULL) + proc_rele(p); + + return error; +} + +int +get_pname_tclass(const char *pname, int *tclass) +{ + int error = EINVAL; + struct tclass_for_proc *tfp; + + *tclass = -1; /* Means not set */ + + /* Need a tfp */ + lck_mtx_lock(tclass_lock); + + tfp = find_tfp_by_pname(pname); + if (tfp != NULL) { + *tclass = tfp->tfp_class ; + error = 0; + } + lck_mtx_unlock(tclass_lock); + + return error; +} + + + +/* + * Setting options requires privileges + */ +__private_extern__ int +so_set_tcdbg(struct socket *so, struct so_tcdbg *so_tcdbg) +{ + int error = 0; + + if ((so->so_state & SS_PRIV) == 0) + return EPERM; + + socket_unlock(so, 0); + + switch (so_tcdbg->so_tcdbg_cmd) { + case SO_TCDBG_PID: + error = set_pid_tclass(so_tcdbg->so_tcdbg_pid, so_tcdbg->so_tcdbg_tclass); + break; + + case SO_TCDBG_PNAME: + error = set_pname_tclass(so_tcdbg->so_tcdbg_pname, so_tcdbg->so_tcdbg_tclass); + break; + + case SO_TCDBG_PURGE: + error = purge_tclass_for_proc(); + break; + + case SO_TCDBG_FLUSH: + error = flush_tclass_for_proc(); + break; + + default: + error = EINVAL; + break; + + } + + socket_lock(so, 0); + + return error; +} + +/* + * Not required to be privileged to get + */ +__private_extern__ int +sogetopt_tcdbg(struct socket *so, struct sockopt *sopt) +{ + int error = 0; + struct so_tcdbg so_tcdbg; + void *buf = NULL; + size_t len = sopt->sopt_valsize; + + error = sooptcopyin(sopt, &so_tcdbg, sizeof(struct so_tcdbg), sizeof(struct so_tcdbg)); + if (error != 0) + return error; + + sopt->sopt_valsize = len; + + socket_unlock(so, 0); + + switch (so_tcdbg.so_tcdbg_cmd) { + case SO_TCDBG_PID: + error = get_pid_tclass(so_tcdbg.so_tcdbg_pid, &so_tcdbg.so_tcdbg_tclass); + break; + + case SO_TCDBG_PNAME: + error = get_pname_tclass(so_tcdbg.so_tcdbg_pname, &so_tcdbg.so_tcdbg_tclass); + break; + + case SO_TCDBG_COUNT: + lck_mtx_lock(tclass_lock); + so_tcdbg.so_tcdbg_count = tfp_count; + lck_mtx_unlock(tclass_lock); + break; + + case SO_TCDBG_LIST: { + struct tclass_for_proc *tfp; + int n, alloc_count; + struct so_tcdbg *ptr; + + lck_mtx_lock(tclass_lock); + if ((alloc_count = tfp_count) == 0) { + lck_mtx_unlock(tclass_lock); + error = EINVAL; + break; + } + len = alloc_count * sizeof(struct so_tcdbg); + lck_mtx_unlock(tclass_lock); + + buf = _MALLOC(len, 
M_TEMP, M_WAITOK | M_ZERO); + if (buf == NULL) { + error = ENOBUFS; + break; + } + + lck_mtx_lock(tclass_lock); + n = 0; + ptr = (struct so_tcdbg *)buf; + TAILQ_FOREACH(tfp, &tfp_head, tfp_link) { + if (++n > alloc_count) + break; + if (tfp->tfp_pid != -1) { + ptr->so_tcdbg_cmd = SO_TCDBG_PID; + ptr->so_tcdbg_pid = tfp->tfp_pid; + } else { + ptr->so_tcdbg_cmd = SO_TCDBG_PNAME; + ptr->so_tcdbg_pid = -1; + strlcpy(ptr->so_tcdbg_pname, tfp->tfp_pname, sizeof(ptr->so_tcdbg_pname)); + } + ptr->so_tcdbg_tclass = tfp->tfp_class; + ptr++; + } + + lck_mtx_unlock(tclass_lock); + } + break; + + default: + error = EINVAL; + break; + + } + + socket_lock(so, 0); + + if (error == 0) { + if (buf == NULL) { + error = sooptcopyout(sopt, &so_tcdbg, sizeof(struct so_tcdbg)); + } else { + error = sooptcopyout(sopt, buf, len); + _FREE(buf, M_TEMP); + } + } + return error; +} + + +__private_extern__ int +so_set_traffic_class(struct socket *so, int optval) +{ + int error = 0; + + if (optval < SO_TC_BE || optval > SO_TC_VO) { + error = EINVAL; + } else { + so->so_traffic_class = optval; + + if ((INP_SOCKAF(so) == AF_INET || INP_SOCKAF(so) == AF_INET6) && + INP_SOCKTYPE(so) == SOCK_STREAM) { + set_tcp_stream_priority(so); + } + } + return error; +} + +__private_extern__ void +so_set_default_traffic_class(struct socket *so) +{ + int sotc = SO_TC_BE; + + if (tfp_count > 0 && (INP_SOCKAF(so) == AF_INET || INP_SOCKAF(so) == AF_INET6)) { + sotc = get_tclass_for_curr_proc(); + } + + so->so_traffic_class = sotc; + + return; +} + + +__private_extern__ int +mbuf_traffic_class_from_control(struct mbuf *control) +{ + struct cmsghdr *cm; + + for (cm = M_FIRST_CMSGHDR(control); + cm != NULL; + cm = M_NXT_CMSGHDR(control, cm)) { + int tc; + + if (cm->cmsg_len < sizeof(struct cmsghdr)) + break; + + if (cm->cmsg_level != SOL_SOCKET || + cm->cmsg_type != SO_TRAFFIC_CLASS) + continue; + if (cm->cmsg_len != CMSG_LEN(sizeof(int))) + continue; + + tc = *(int *)CMSG_DATA(cm); + + switch (tc) { + case SO_TC_BE: + return MBUF_TC_BE; + case SO_TC_BK: + return MBUF_TC_BK; + case SO_TC_VI: + return MBUF_TC_VI; + case SO_TC_VO: + return MBUF_TC_VO; + default: + break; + } + } + + return MBUF_TC_UNSPEC; +} + +__private_extern__ int +dscp_code_from_mbuf_tclass(int mtc) +{ + int dscp_code; + + switch (mtc) { + default: + case MBUF_TC_BE: + dscp_code = 0; + break; + case MBUF_TC_BK: + dscp_code = 0x08; + break; + case MBUF_TC_VI: + dscp_code = 0x20; + break; + case MBUF_TC_VO: + dscp_code = 0x30; + break; + } + + return dscp_code; +} + +__private_extern__ void +so_recv_data_stat(struct socket *so, struct mbuf *m, size_t off) +{ + uint32_t sotc = m->m_pkthdr.prio; + + if (sotc >= SO_TC_STATS_MAX) + sotc = SO_TC_BE; + + so->so_tc_stats[sotc].rxpackets += 1; + so->so_tc_stats[sotc].rxbytes += ((m->m_flags & M_PKTHDR) ? m->m_pkthdr.len : 0) + off; + + return; +} + +__private_extern__ void +set_tcp_stream_priority(struct socket *so) +{ + struct tcpcb *tp = intotcpcb(sotoinpcb(so)); + + /* If the socket was marked as a background socket or if the + * traffic class is set to background with traffic class socket + * option then make both send and recv side of the stream to be + * background. The variable sotcdb which can be set with sysctl + * is used to disable these settings for testing. 
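so_set_traffic_class() above accepts only SO_TC_BE through SO_TC_VO and, on TCP sockets, immediately re-evaluates the stream's priority. From user space the class travels in the SO_TRAFFIC_CLASS socket option at SOL_SOCKET level; this usage sketch assumes the option constant is exported to user headers, which this hunk does not show:

    #include <sys/socket.h>

    /* Hedged sketch: classify a socket as background (SO_TC_BK). */
    static int
    mark_background(int s)
    {
            int tc = SO_TC_BK;

            return (setsockopt(s, SOL_SOCKET, SO_TRAFFIC_CLASS,
                &tc, sizeof (tc)));
    }

The same value can also ride per-datagram as SOL_SOCKET/SO_TRAFFIC_CLASS ancillary data, which mbuf_traffic_class_from_control() above decodes.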
+ */ + if (soisbackground(so) || so->so_traffic_class == SO_TC_BK) { + if ((sotcdb & SOTCDB_NO_SENDTCPBG) != 0) { + if (tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX) + tcp_set_foreground_cc(so); + } else { + if (tp->tcp_cc_index != TCP_CC_ALGO_BACKGROUND_INDEX) + tcp_set_background_cc(so); + } + + /* Set receive side background flags */ + if ((sotcdb & SOTCDB_NO_RECVTCPBG) != 0) { + so->so_traffic_mgt_flags &= ~(TRAFFIC_MGT_TCP_RECVBG); + } else { + so->so_traffic_mgt_flags |= TRAFFIC_MGT_TCP_RECVBG; + } + } else { + so->so_traffic_mgt_flags &= ~(TRAFFIC_MGT_TCP_RECVBG); + if (tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX) + tcp_set_foreground_cc(so); + } + return; +} + +/* + * Set traffic class to an IPv4 or IPv6 packet + * - mark the mbuf + * - set the DSCP code following the WMM mapping + */ +__private_extern__ void +set_packet_tclass(struct mbuf *m, struct socket *so, int in_mtc, int isipv6) +{ + int mtc = MBUF_TC_BE; /* Best effort by default */ + struct inpcb *inp = sotoinpcb(so); /* in6pcb and inpcb are the same */ + struct ip *ip = mtod(m, struct ip *); +#if INET6 + struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); +#endif /* INET6 */ + + if (!(m->m_flags & M_PKTHDR)) + return; + + /* + * Here is the precedence: + * 1) TRAFFIC_MGT_SO_BACKGROUND trumps all + * 2) Traffic class passed via ancillary data to sendmsdg(2) + * 3) Traffic class socket option last + */ + if (soisbackground(so)) { + mtc = MBUF_TC_BK; + } else if (in_mtc != MBUF_TC_UNSPEC) { + if (in_mtc >= MBUF_TC_BE && in_mtc <= MBUF_TC_VO) + mtc = in_mtc; + } else { + switch (so->so_traffic_class) { + case SO_TC_BE: + mtc = MBUF_TC_BE; + break; + case SO_TC_BK: + mtc = MBUF_TC_BK; + break; + case SO_TC_VI: + mtc = MBUF_TC_VI; + break; + case SO_TC_VO: + mtc = MBUF_TC_VO; + break; + default: + break; + } + } + + /* + * Set the traffic class in the mbuf packet header prio field + */ + if ((sotcdb & SOTCDB_NO_MTC)) + goto no_mbtc; + m->m_pkthdr.prio = mtc; + +no_mbtc: + /* + * Quick exit when best effort + */ + if (mtc == MBUF_TC_BE) + goto no_dscp; + /* + * Now let set the DSCP code in IPv4 or IPv6 header + * By default do this only for local traffic if a code is not already set + */ + if ((sotcdb & SOTCDB_NO_DSCP)) + goto no_dscp; + + /* + * Test if a IP TOS or IPV6 TCLASS has already been set on the socket or the raw packet + */ + if ((sotcdb & SOTCDB_NO_DSCPTST) == 0) { +#if INET6 + if (isipv6) + { + if ((so->so_type == SOCK_RAW && (ip6->ip6_flow & htonl(0xff << 20)) != 0) || + (inp->in6p_outputopts && inp->in6p_outputopts->ip6po_tclass != -1)) + goto no_dscp; + } + else +#endif /* INET6 */ + { + if ((so->so_type == SOCK_RAW && (inp->inp_flags & INP_HDRINCL)) || + inp->inp_ip_tos != 0) + goto no_dscp; + } + } + + /* + * Test if destination is local + */ + if ((sotcdb & SOTCDB_NO_LCLTST) == 0) { + int islocal = 0; + struct route *ro = &inp->inp_route; + + if (so->so_type == SOCK_STREAM) { + struct tcpcb *tp = intotcpcb(inp); + + if ((tp->t_flags & TF_LOCAL)) + islocal = 1; + } + else +#if INET6 + if (isipv6) + { + if ((ro != NULL && ro->ro_rt != NULL && + (ro->ro_rt->rt_gateway->sa_family == AF_LINK || + (ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK))) || + in6addr_local(&ip6->ip6_dst)) + islocal = 1; + } + else +#endif /* INET6 */ + { + if ((ro != NULL && ro->ro_rt != NULL && + (ro->ro_rt->rt_gateway->sa_family == AF_LINK || + (ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK))) || + inaddr_local(ip->ip_dst)) + islocal = 1; + } + if (islocal == 0) + goto no_dscp; + } + +#if INET6 + if (isipv6) + ip6->ip6_flow |= + 
htonl(dscp_code_from_mbuf_tclass(m->m_pkthdr.prio) << 20); + else +#endif /* INET6 */ + ip->ip_tos |= dscp_code_from_mbuf_tclass(m->m_pkthdr.prio) << 2; + +no_dscp: + /* + * For TCP with background traffic class switch CC algo based on sysctl + */ + if (so->so_type == SOCK_STREAM) { + set_tcp_stream_priority(so); + } + + /* + * Assume socket and mbuf traffic class values are the same + * Also assume the socket lock is held + */ + so->so_tc_stats[mtc].txpackets += 1; + so->so_tc_stats[mtc].txbytes += m->m_pkthdr.len; + + return; +} + +__private_extern__ void +socket_tclass_init(void) +{ + tclass_lck_grp_attr = lck_grp_attr_alloc_init(); + tclass_lck_grp = lck_grp_alloc_init("tclass", tclass_lck_grp_attr); + tclass_lck_attr = lck_attr_alloc_init(); + if ((tclass_lock = lck_mtx_alloc_init(tclass_lck_grp, tclass_lck_attr)) == NULL) { + panic("failed to allocate memory for tclass\n"); + } +} + + diff --git a/bsd/netinet/in_var.h b/bsd/netinet/in_var.h index df7a968af..0b5d373de 100644 --- a/bsd/netinet/in_var.h +++ b/bsd/netinet/in_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -70,7 +70,7 @@ #include <sys/kern_event.h> #endif -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #include <net/route.h> /* @@ -96,7 +96,7 @@ struct in_ifaddr { struct sockaddr_in ia_sockmask; /* reserve space for general netmask */ TAILQ_ENTRY(in_ifaddr) ia_hash; /* hash bucket entry */ }; -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ struct in_aliasreq { char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */ @@ -155,9 +155,10 @@ struct kev_in_portinuse { #define KEV_INET_PORTINUSE 8 /* use ken_in_portinuse */ #endif -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #include <net/if_var.h> #include <kern/locks.h> +#include <sys/tree.h> /* * Given a pointer to an in_ifaddr (ifaddr), * return a pointer to the addr as a sockaddr_in. @@ -195,9 +196,14 @@ extern int apple_hwcksum_rx; struct in_ifaddr *ia; \ \ lck_rw_lock_shared(in_ifaddr_rwlock); \ - TAILQ_FOREACH(ia, INADDR_HASH((addr).s_addr), ia_hash) \ - if (IA_SIN(ia)->sin_addr.s_addr == (addr).s_addr) \ + TAILQ_FOREACH(ia, INADDR_HASH((addr).s_addr), ia_hash) { \ + IFA_LOCK_SPIN(&ia->ia_ifa); \ + if (IA_SIN(ia)->sin_addr.s_addr == (addr).s_addr) { \ + IFA_UNLOCK(&ia->ia_ifa); \ break; \ + } \ + IFA_UNLOCK(&ia->ia_ifa); \ + } \ (ifp) = (ia == NULL) ? NULL : ia->ia_ifp; \ lck_rw_done(in_ifaddr_rwlock); \ } @@ -217,7 +223,7 @@ extern int apple_hwcksum_rx; (ia) = TAILQ_NEXT((ia), ia_link)) \ continue; \ if ((ia) != NULL) \ - ifaref(&(ia)->ia_ifa); \ + IFA_ADDREF(&(ia)->ia_ifa); \ lck_rw_done(in_ifaddr_rwlock); \ } @@ -226,31 +232,153 @@ extern int apple_hwcksum_rx; * to change that - as it might break a number of things */ +/* + * Legacy IPv4 IGMP per-link structure. + */ struct router_info { struct ifnet *rti_ifp; int rti_type; /* type of router which is querier on this interface */ int rti_time; /* # of slow timeouts since last old query */ - struct router_info *rti_next; + SLIST_ENTRY(router_info) rti_list; +}; + +/* + * IPv4 multicast IGMP-layer source entry. 
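set_packet_tclass() above stamps the DSCP value produced by dscp_code_from_mbuf_tclass() (BE 0x00, BK 0x08, VI 0x20, VO 0x30) into whichever header the packet carries. The two placements used by the patch, isolated for clarity:

    #include <stdint.h>
    #include <arpa/inet.h>

    /* IPv4: DSCP occupies the top six bits of the TOS byte. */
    static uint8_t
    tos_from_dscp(int dscp)
    {
            return ((uint8_t)(dscp << 2));          /* BK: 0x08 -> TOS 0x20 */
    }

    /*
     * IPv6: the patch ORs the code into ip6_flow at bit 20, the base of
     * the 8-bit traffic-class field (bits 20-27 of the first word);
     * shown exactly as the patch does it.
     */
    static uint32_t
    flow_bits_from_dscp(int dscp)
    {
            return (htonl((uint32_t)dscp << 20));
    }
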
+ */ +struct ip_msource { + RB_ENTRY(ip_msource) ims_link; /* RB tree links */ + in_addr_t ims_haddr; /* host byte order */ + struct ims_st { + uint16_t ex; /* # of exclusive members */ + uint16_t in; /* # of inclusive members */ + } ims_st[2]; /* state at t0, t1 */ + uint8_t ims_stp; /* pending query */ +}; + +/* + * IPv4 multicast PCB-layer source entry. + */ +struct in_msource { + RB_ENTRY(ip_msource) ims_link; /* RB tree links */ + in_addr_t ims_haddr; /* host byte order */ + uint8_t imsl_st[2]; /* state before/at commit */ }; +RB_HEAD(ip_msource_tree, ip_msource); /* define struct ip_msource_tree */ + +RB_PROTOTYPE_SC_PREV(__private_extern__, ip_msource_tree, ip_msource, + ims_link, ip_msource_cmp); + +/* + * IPv4 multicast PCB-layer group filter descriptor. + */ +struct in_mfilter { + struct ip_msource_tree imf_sources; /* source list for (S,G) */ + u_long imf_nsrc; /* # of source entries */ + uint8_t imf_st[2]; /* state before/at commit */ +}; + +struct igmp_ifinfo; + /* - * Internet multicast address structure. There is one of these for each IP - * multicast group to which this host belongs on a given network interface. - * For every entry on the interface's if_multiaddrs list which represents - * an IP multicast group, there is one of these structures. They are also - * kept on a system-wide list to make it easier to keep our legacy IGMP code - * compatible with the rest of the world (see IN_FIRST_MULTI et al, below). + * IPv4 group descriptor. + * + * For every entry on an ifnet's if_multiaddrs list which represents + * an IP multicast group, there is one of these structures. + * + * If any source filters are present, then a node will exist in the RB-tree + * to permit fast lookup by source whenever an operation takes place. + * This permits pre-order traversal when we issue reports. + * Source filter trees are kept separately from the socket layer to + * greatly simplify locking. + * + * When IGMPv3 is active, inm_timer is the response to group query timer. + * The state-change timer inm_sctimer is separate; whenever state changes + * for the group the state change record is generated and transmitted, + * and kept if retransmissions are necessary. + * + * FUTURE: inm_link is now only used when groups are being purged + * on a detaching ifnet. It could be demoted to a SLIST_ENTRY, but + * because it is at the very start of the struct, we can't do this + * w/o breaking the ABI for ifmcstat. */ struct in_multi { + decl_lck_mtx_data(, inm_lock); + u_int32_t inm_refcount; /* reference count */ + u_int32_t inm_reqcnt; /* request count for this address */ + u_int32_t inm_debug; /* see ifa_debug flags */ LIST_ENTRY(in_multi) inm_link; /* queue macro glue */ struct in_addr inm_addr; /* IP multicast address, convenience */ struct ifnet *inm_ifp; /* back pointer to ifnet */ struct ifmultiaddr *inm_ifma; /* back pointer to ifmultiaddr */ - u_int inm_timer; /* IGMP membership report timer */ + u_int inm_timer; /* IGMPv1/v2 group / v3 query timer */ u_int inm_state; /* state of the membership */ - struct router_info *inm_rti; /* router info*/ + void *inm_rti; /* unused, legacy field */ + + /* New fields for IGMPv3 follow. 
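The ims_st[2] counters above snapshot each source's exclusive/inclusive reference counts at two epochs, t0 (when the last state-change report's retransmit timer expired) and t1 (now); IGMPv3 state-change records are computed by diffing the epochs. A deliberately simplified illustration of that diff step (the real decision logic lives in the IGMPv3 code elsewhere in this patch and follows RFC 3376; this toy only shows the two-epoch idea):

    #include <stdint.h>

    struct src_epoch {
            uint16_t ex;    /* # of exclusive-mode references */
            uint16_t in;    /* # of inclusive-mode references */
    };

    /* Toy: did the source's effective mode flip between t0 and t1? */
    static int
    source_mode_changed(const struct src_epoch st[2])
    {
            int was_ex = (st[0].ex > 0);    /* exclude dominates when present */
            int is_ex = (st[1].ex > 0);

            return (was_ex != is_ex);
    }
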
*/ + struct igmp_ifinfo *inm_igi; /* IGMP info */ + SLIST_ENTRY(in_multi) inm_nrele; /* to-be-released by IGMP */ + u_int32_t inm_nrelecnt; /* deferred release count */ + struct ip_msource_tree inm_srcs; /* tree of sources */ + u_long inm_nsrc; /* # of tree entries */ + + struct ifqueue inm_scq; /* queue of pending + * state-change packets */ + struct timeval inm_lastgsrtv; /* Time of last G-S-R query */ + uint16_t inm_sctimer; /* state-change timer */ + uint16_t inm_scrv; /* state-change rexmit count */ + + /* + * SSM state counters which track state at T0 (the time the last + * state-change report's RV timer went to zero) and T1 + * (time of pending report, i.e. now). + * Used for computing IGMPv3 state-change reports. Several refcounts + * are maintained here to optimize for common use-cases. + */ + struct inm_st { + uint16_t iss_fmode; /* IGMP filter mode */ + uint16_t iss_asm; /* # of ASM listeners */ + uint16_t iss_ex; /* # of exclusive members */ + uint16_t iss_in; /* # of inclusive members */ + uint16_t iss_rec; /* # of recorded sources */ + } inm_st[2]; /* state at t0, t1 */ + + void (*inm_trace) /* callback fn for tracing refs */ + (struct in_multi *, int); }; +#define INM_LOCK_ASSERT_HELD(_inm) \ + lck_mtx_assert(&(_inm)->inm_lock, LCK_MTX_ASSERT_OWNED) + +#define INM_LOCK_ASSERT_NOTHELD(_inm) \ + lck_mtx_assert(&(_inm)->inm_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define INM_LOCK(_inm) \ + lck_mtx_lock(&(_inm)->inm_lock) + +#define INM_LOCK_SPIN(_inm) \ + lck_mtx_lock_spin(&(_inm)->inm_lock) + +#define INM_CONVERT_LOCK(_inm) do { \ + INM_LOCK_ASSERT_HELD(_inm); \ + lck_mtx_convert_spin(&(_inm)->inm_lock); \ +} while (0) + +#define INM_UNLOCK(_inm) \ + lck_mtx_unlock(&(_inm)->inm_lock) + +#define INM_ADDREF(_inm) \ + inm_addref(_inm, 0) + +#define INM_ADDREF_LOCKED(_inm) \ + inm_addref(_inm, 1) + +#define INM_REMREF(_inm) \ + inm_remref(_inm, 0) + +#define INM_REMREF_LOCKED(_inm) \ + inm_remref(_inm, 1) + #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_ip); SYSCTL_DECL(_net_inet_raw); @@ -269,22 +397,36 @@ struct in_multistep { /* * Macro for looking up the in_multi record for a given IP multicast address * on a given interface. If no matching record is found, "inm" is set null. + * + * We do this differently compared other BSD implementations; instead of + * walking the if_multiaddrs list at the interface and returning the + * ifma_protospec value of a matching entry, we search the global list + * of in_multi records and find it that way. Otherwise either the two + * structures (in_multi, ifmultiaddr) need to be ref counted both ways, + * which will make things too complicated, or they need to reside in the + * same protected domain, which they aren't. + * + * Must be called with in_multihead_lock held. */ -#define IN_LOOKUP_MULTI(addr, ifp, inm) \ - /* struct in_addr addr; */ \ - /* struct ifnet *ifp; */ \ - /* struct in_multi *inm; */ \ -do { \ - struct ifmultiaddr *ifma; \ -\ - LIST_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) { \ - if (ifma->ifma_addr->sa_family == AF_INET \ - && ((struct sockaddr_in *)ifma->ifma_addr)->sin_addr.s_addr == \ - (addr).s_addr) \ - break; \ - } \ - (inm) = ifma ? 
ifma->ifma_protospec : NULL; \ -} while(0) +#define IN_LOOKUP_MULTI(addr, ifp, inm) \ + /* struct in_addr *addr; */ \ + /* struct ifnet *ifp; */ \ + /* struct in_multi *inm; */ \ +do { \ + struct in_multistep _step; \ + IN_FIRST_MULTI(_step, inm); \ + while ((inm) != NULL) { \ + INM_LOCK_SPIN(inm); \ + if ((inm)->inm_ifp == (ifp) && \ + (inm)->inm_addr.s_addr == (addr)->s_addr) { \ + INM_ADDREF_LOCKED(inm); \ + INM_UNLOCK(inm); \ + break; \ + } \ + INM_UNLOCK(inm); \ + IN_NEXT_MULTI(_step, inm); \ + } \ +} while (0) /* * Macro to step through all of the in_multi records, one at a time. @@ -292,28 +434,57 @@ do { \ * provide. IN_FIRST_MULTI(), below, must be called to initialize "step" * and get the first record. Both macros return a NULL "inm" when there * are no remaining records. + * + * Must be called with in_multihead_lock held. */ -#define IN_NEXT_MULTI(step, inm) \ - /* struct in_multistep step; */ \ - /* struct in_multi *inm; */ \ -do { \ - if (((inm) = (step).i_inm) != NULL) \ - (step).i_inm = LIST_NEXT((step).i_inm, inm_link); \ -} while(0) - -#define IN_FIRST_MULTI(step, inm) \ - /* struct in_multistep step; */ \ - /* struct in_multi *inm; */ \ -do { \ - (step).i_inm = LIST_FIRST(&in_multihead); \ - IN_NEXT_MULTI((step), (inm)); \ -} while(0) +#define IN_NEXT_MULTI(step, inm) \ + /* struct in_multistep step; */ \ + /* struct in_multi *inm; */ \ +do { \ + in_multihead_lock_assert(LCK_RW_ASSERT_HELD); \ + if (((inm) = (step).i_inm) != NULL) \ + (step).i_inm = LIST_NEXT((step).i_inm, inm_link); \ +} while (0) + +#define IN_FIRST_MULTI(step, inm) \ + /* struct in_multistep step; */ \ + /* struct in_multi *inm; */ \ +do { \ + in_multihead_lock_assert(LCK_RW_ASSERT_HELD); \ + (step).i_inm = LIST_FIRST(&in_multihead); \ + IN_NEXT_MULTI((step), (inm)); \ +} while (0) struct route; +struct ip_moptions; + +/* + * Return values for imo_multi_filter(). 
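Unlike the if_multiaddrs walk it replaces, the IN_LOOKUP_MULTI above scans the global in_multi list, must run under in_multihead_lock, and hands back an entry with a reference already taken (INM_ADDREF_LOCKED before INM_UNLOCK). The expected caller discipline, sketched:

    /* Sketch: caller-side pattern for the rewritten IN_LOOKUP_MULTI. */
    static struct in_multi *
    lookup_group(struct ifnet *ifp, struct in_addr *group)
    {
            struct in_multi *inm;

            in_multihead_lock_shared();
            IN_LOOKUP_MULTI(group, ifp, inm);  /* NULL or a referenced entry */
            in_multihead_lock_done();
            return (inm);   /* caller must INM_REMREF() when finished */
    }
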
+ */ +#define MCAST_PASS 0 /* Pass */ +#define MCAST_NOTGMEMBER 1 /* This host not a member of group */ +#define MCAST_NOTSMEMBER 2 /* This host excluded source */ +#define MCAST_MUTED 3 /* [deprecated] */ extern void in_ifaddr_init(void); +extern int imo_multi_filter(const struct ip_moptions *, const struct ifnet *, + const struct sockaddr *, const struct sockaddr *); +extern int imo_clone(struct ip_moptions *, struct ip_moptions *); +extern void inm_commit(struct in_multi *); +extern void inm_clear_recorded(struct in_multi *); +extern void inm_print(const struct in_multi *); +extern int inm_record_source(struct in_multi *inm, const in_addr_t); +extern void inm_release(struct in_multi *); +extern void in_multi_init(void); extern struct in_multi *in_addmulti(struct in_addr *, struct ifnet *); -extern void in_delmulti(struct in_multi **); +extern void in_delmulti(struct in_multi *); +extern int in_leavegroup(struct in_multi *, /*const*/ struct in_mfilter *); +extern int in_multi_detach(struct in_multi *); +extern void inm_addref(struct in_multi *, int); +extern void inm_remref(struct in_multi *, int); +extern void inm_purge(struct in_multi *); +extern uint8_t ims_get_mode(const struct in_multi *, + const struct ip_msource *, uint8_t); extern int in_control(struct socket *, u_long, caddr_t, struct ifnet *, struct proc *); extern void in_rtqdrain(void); @@ -321,14 +492,20 @@ extern struct radix_node *in_validate(struct radix_node *); extern void ip_input(struct mbuf *); extern int in_ifadown(struct ifaddr *ifa, int); extern void in_ifscrub(struct ifnet *, struct in_ifaddr *, int); -extern int ipflow_fastforward(struct mbuf *); -#if IPFLOW -extern void ipflow_create(const struct route *, struct mbuf *); -extern void ipflow_slowtimo(void); -#endif /* IPFLOW */ extern u_int32_t inaddr_hashval(u_int32_t); - -#endif /* KERNEL_PRIVATE */ +extern void in_purgeaddrs(struct ifnet *); +extern void imf_leave(struct in_mfilter *); +extern void imf_purge(struct in_mfilter *); + +struct inpcb; + +__private_extern__ int inp_join_group(struct inpcb *, struct sockopt *); +__private_extern__ int inp_leave_group(struct inpcb *, struct sockopt *); +__private_extern__ void in_multihead_lock_exclusive(void); +__private_extern__ void in_multihead_lock_shared(void); +__private_extern__ void in_multihead_lock_assert(int); +__private_extern__ void in_multihead_lock_done(void); +#endif /* XNU_KERNEL_PRIVATE */ /* INET6 stuff */ #include <netinet6/in6_var.h> diff --git a/bsd/netinet/ip6.h b/bsd/netinet/ip6.h index 203e86a64..a740ddc49 100644 --- a/bsd/netinet/ip6.h +++ b/bsd/netinet/ip6.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
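The MCAST_* values above are imo_multi_filter()'s verdicts when the input path checks a received (source, group) pair against a socket's per-group filter state: anything other than MCAST_PASS means the datagram must not be delivered to that socket. Sketched against the prototype declared above; the surrounding delivery loop is assumed:

    /* Sketch: source-filter gate on the multicast receive path. */
    static int
    deliver_ok(const struct ip_moptions *imo, const struct ifnet *ifp,
        const struct sockaddr *group, const struct sockaddr *src)
    {
            /* MCAST_NOTGMEMBER / MCAST_NOTSMEMBER / MCAST_MUTED all drop */
            return (imo_multi_filter(imo, ifp, group, src) == MCAST_PASS);
    }
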
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -170,7 +170,10 @@ struct ip6_dest { #define IP6OPT_JUMBO 0xC2 /* 11 0 00010 = 194 */ #define IP6OPT_NSAP_ADDR 0xC3 /* 11 0 00011 */ #define IP6OPT_TUNNEL_LIMIT 0x04 /* 00 0 00100 */ +#ifndef KERNEL_PRIVATE #define IP6OPT_RTALERT 0x05 /* 00 0 00101 (KAME definition) */ +#endif +#define IP6OPT_ROUTER_ALERT 0x05 /* 00 0 00101 (RFC3542, recommended) */ #define IP6OPT_RTALERT_LEN 4 #define IP6OPT_RTALERT_MLD 0 /* Datagram contains an MLD message */ @@ -178,10 +181,6 @@ struct ip6_dest { #define IP6OPT_RTALERT_ACTNET 2 /* contains an Active Networks msg */ #define IP6OPT_MINLEN 2 -#define IP6OPT_BINDING_UPDATE 0xc6 /* 11 0 00110 */ -#define IP6OPT_BINDING_ACK 0x07 /* 00 0 00111 */ -#define IP6OPT_BINDING_REQ 0x08 /* 00 0 01000 */ -#define IP6OPT_HOME_ADDRESS 0xc9 /* 11 0 01001 */ #define IP6OPT_EID 0x8a /* 10 0 01010 */ #define IP6OPT_TYPE(o) ((o) & 0xC0) @@ -192,8 +191,56 @@ struct ip6_dest { #define IP6OPT_MUTABLE 0x20 +/* IPv6 options: common part */ +struct ip6_opt { + u_int8_t ip6o_type; + u_int8_t ip6o_len; +} __attribute__((__packed__)); + +/* Jumbo Payload Option */ +struct ip6_opt_jumbo { + u_int8_t ip6oj_type; + u_int8_t ip6oj_len; + u_int8_t ip6oj_jumbo_len[4]; +} __attribute__((__packed__)); #define IP6OPT_JUMBO_LEN 6 +/* NSAP Address Option */ +struct ip6_opt_nsap { + u_int8_t ip6on_type; + u_int8_t ip6on_len; + u_int8_t ip6on_src_nsap_len; + u_int8_t ip6on_dst_nsap_len; + /* followed by source NSAP */ + /* followed by destination NSAP */ +}__attribute__((__packed__)); + +/* Tunnel Limit Option */ +struct ip6_opt_tunnel { + u_int8_t ip6ot_type; + u_int8_t ip6ot_len; + u_int8_t ip6ot_encap_limit; +}__attribute__((__packed__)); + +/* Router Alert Option */ +struct ip6_opt_router { + u_int8_t ip6or_type; + u_int8_t ip6or_len; + u_int8_t ip6or_value[2]; +}__attribute__((__packed__)); +/* Router alert values (in network byte order) */ +#if BYTE_ORDER == BIG_ENDIAN +#define IP6_ALERT_MLD 0x0000 +#define IP6_ALERT_RSVP 0x0001 +#define IP6_ALERT_AN 0x0002 +#else +#if BYTE_ORDER == LITTLE_ENDIAN +#define IP6_ALERT_MLD 0x0000 +#define IP6_ALERT_RSVP 0x0100 +#define IP6_ALERT_AN 0x0200 +#endif /* LITTLE_ENDIAN */ +#endif + /* Routing header */ struct ip6_rthdr { u_int8_t ip6r_nxt; /* next header */ @@ -235,13 +282,14 @@ struct ip6_frag { /* * Internet implementation parameters. */ -#define IPV6_MAXHLIM 255 /* maximun hoplimit */ +#define IPV6_MAXHLIM 255 /* maximum hoplimit */ #define IPV6_DEFHLIM 64 /* default hlim */ #define IPV6_FRAGTTL 120 /* ttl for fragment packets, in slowtimo tick */ -#define IPV6_HLIMDEC 1 /* subtracted when forwaeding */ +#define IPV6_HLIMDEC 1 /* subtracted when forwarding */ #define IPV6_MMTU 1280 /* minimal MTU and reassembly. 1024 + 256 */ #define IPV6_MAXPACKET 65535 /* ip6 max packet size without Jumbo payload*/ +#define IPV6_MAXOPTHDR 2048 /* max option header size, 256 64-bit words */ #ifdef KERNEL_PRIVATE /* @@ -291,45 +339,12 @@ do { \ * with type "typ". * IP6_EXTHDR_GET0 does the same, except that it aligns the structure at the * very top of mbuf. GET0 is more likely to make a memory copy than GET.
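Either form keeps the long-standing contract: on success `val` points at `len` contiguous bytes at offset `off` within the (possibly rearranged) mbuf chain; on failure the chain has already been freed and both `val` and `m` come back NULL, so the caller must not touch `m` again. A typical caller, sketched on the assumption that the new M_STRUCT_GET wrappers below preserve this contract (`ip6e` and `off` are illustrative locals):

	struct ip6_ext *ip6e;

	IP6_EXTHDR_GET(ip6e, struct ip6_ext *, m, off, sizeof (*ip6e));
	if (ip6e == NULL)
		return (IPPROTO_DONE);	/* m was freed by the macro */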
- * - * XXX we're now testing this, needs m_pulldown() */ -#define IP6_EXTHDR_GET(val, typ, m, off, len) \ -do { \ - struct mbuf *t; \ - int tmp; \ - if ((m)->m_len >= (off) + (len)) \ - (val) = (typ)(mtod((m), caddr_t) + (off)); \ - else { \ - t = m_pulldown((m), (off), (len), &tmp); \ - if (t) { \ - if (t->m_len < tmp + (len)) \ - panic("m_pulldown malfunction"); \ - (val) = (typ)(mtod(t, caddr_t) + tmp); \ - } else { \ - (val) = (typ)NULL; \ - (m) = NULL; \ - } \ - } \ -} while (0) +#define IP6_EXTHDR_GET(val, typ, m, off, len) \ + M_STRUCT_GET(val, typ, m, off, len) -#define IP6_EXTHDR_GET0(val, typ, m, off, len) \ -do { \ - struct mbuf *t; \ - if ((off) == 0) \ - (val) = (typ)mtod(m, caddr_t); \ - else { \ - t = m_pulldown((m), (off), (len), NULL); \ - if (t) { \ - if (t->m_len < (len)) \ - panic("m_pulldown malfunction"); \ - (val) = (typ)mtod(t, caddr_t); \ - } else { \ - (val) = (typ)NULL; \ - (m) = NULL; \ - } \ - } \ -} while (0) +#define IP6_EXTHDR_GET0(val, typ, m, off, len) \ + M_STRUCT_GET0(val, typ, m, off, len) #endif /* KERNEL_PRIVATE */ #endif /* !_NETINET_IP6_H_ */ diff --git a/bsd/netinet/ip_divert.c b/bsd/netinet/ip_divert.c index e3c771e6d..600a796d1 100644 --- a/bsd/netinet/ip_divert.c +++ b/bsd/netinet/ip_divert.c @@ -236,12 +236,14 @@ divert_packet(struct mbuf *m, int incoming, int port, int rule) /* Find IP address for receive interface */ ifnet_lock_shared(m->m_pkthdr.rcvif); TAILQ_FOREACH(ifa, &m->m_pkthdr.rcvif->if_addrhead, ifa_link) { - if (ifa->ifa_addr == NULL) - continue; - if (ifa->ifa_addr->sa_family != AF_INET) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET) { + IFA_UNLOCK(ifa); continue; + } divsrc.sin_addr = ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; + IFA_UNLOCK(ifa); break; } ifnet_lock_done(m->m_pkthdr.rcvif); @@ -314,14 +316,10 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr *addr, struct ip *const ip = mtod(m, struct ip *); struct sockaddr_in *sin = (struct sockaddr_in *)addr; int error = 0; -#if PKT_PRIORITY - mbuf_traffic_class_t mtc = MBUF_TC_NONE; -#endif /* PKT_PRIORITY */ + mbuf_traffic_class_t mtc = MBUF_TC_UNSPEC; if (control != NULL) { -#if PKT_PRIORITY mtc = mbuf_traffic_class_from_control(control); -#endif /* PKT_PRIORITY */ m_freem(control); /* XXX */ } @@ -332,8 +330,8 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr *addr, int len = 0; char *c = sin->sin_zero; - mtag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DIVERT, - sizeof(struct divert_tag), M_NOWAIT); + mtag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DIVERT, + sizeof(struct divert_tag), M_NOWAIT, m); if (mtag == NULL) { error = ENOBUFS; goto cantsend; @@ -359,8 +357,9 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr *addr, /* Reinject packet into the system as incoming or outgoing */ if (!sin || sin->sin_addr.s_addr == 0) { - struct ip_out_args ipoa = { IFSCOPE_NONE }; + struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; struct route ro; + struct ip_moptions *imo; /* * Don't allow both user specified and setsockopt options, @@ -382,10 +381,11 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr *addr, /* Copy the cached route and take an extra reference */ inp_route_copyout(inp, &ro); -#if PKT_PRIORITY - set_traffic_class(m, so, mtc); -#endif /* PKT_PRIORITY */ + set_packet_tclass(m, so, mtc, 0); + imo = inp->inp_moptions; + if (imo != NULL) + IMO_ADDREF(imo); socket_unlock(so, 0); #if CONFIG_MACF_NET mac_mbuf_label_associate_inpcb(inp, m); @@ -394,9 +394,11 @@ div_output(struct 
socket *so, struct mbuf *m, struct sockaddr *addr, error = ip_output(m, inp->inp_options, &ro, (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST | IP_RAWOUTPUT | IP_OUTARGS, - inp->inp_moptions, &ipoa); + imo, &ipoa); socket_lock(so, 0); + if (imo != NULL) + IMO_REMREF(imo); /* Synchronize cached PCB route */ inp_route_copyin(inp, &ro); } else { @@ -417,7 +419,7 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr *addr, goto cantsend; } m->m_pkthdr.rcvif = ifa->ifa_ifp; - ifafree(ifa); + IFA_REMREF(ifa); } #if CONFIG_MACF_NET mac_mbuf_label_associate_socket(so, m); @@ -462,7 +464,7 @@ div_attach(struct socket *so, int proto, struct proc *p) #ifdef MORE_DICVLOCK_DEBUG printf("div_attach: so=%p sopcb=%p lock=%x ref=%x\n", - so, so->so_pcb, ((struct inpcb *)so->so_pcb)->inpcb_mtx, so->so_usecount); + so, so->so_pcb, &(((struct inpcb *)so->so_pcb)->inpcb_mtx), so->so_usecount); #endif return 0; } @@ -474,7 +476,7 @@ div_detach(struct socket *so) #ifdef MORE_DICVLOCK_DEBUG printf("div_detach: so=%p sopcb=%p lock=%x ref=%x\n", - so, so->so_pcb, ((struct inpcb *)so->so_pcb)->inpcb_mtx, so->so_usecount); + so, so->so_pcb, &(((struct inpcb *)so->so_pcb)->inpcb_mtx), so->so_usecount); #endif inp = sotoinpcb(so); if (inp == 0) @@ -656,11 +658,11 @@ div_lock(struct socket *so, int refcount, void *lr) #ifdef MORE_DICVLOCK_DEBUG printf("div_lock: so=%p sopcb=%p lock=%p ref=%x lr=%p\n", so, so->so_pcb, so->so_pcb ? - ((struct inpcb *)so->so_pcb)->inpcb_mtx : NULL, + &(((struct inpcb *)so->so_pcb)->inpcb_mtx) : NULL, so->so_usecount, lr_saved); #endif if (so->so_pcb) { - lck_mtx_lock(((struct inpcb *)so->so_pcb)->inpcb_mtx); + lck_mtx_lock(&((struct inpcb *)so->so_pcb)->inpcb_mtx); } else { panic("div_lock: so=%p NO PCB! lr=%p lrh= lrh= %s\n", so, lr_saved, solockhistory_nr(so)); @@ -697,7 +699,7 @@ div_unlock(struct socket *so, int refcount, void *lr) #ifdef MORE_DICVLOCK_DEBUG printf("div_unlock: so=%p sopcb=%p lock=%p ref=%x lr=%p\n", so, so->so_pcb, so->so_pcb ? - ((struct inpcb *)so->so_pcb)->inpcb_mtx : NULL, + &(((struct inpcb *)so->so_pcb)->inpcb_mtx) : NULL, so->so_usecount, lr_saved); #endif if (refcount) @@ -713,7 +715,7 @@ div_unlock(struct socket *so, int refcount, void *lr) so, so->so_usecount, lr_saved, solockhistory_nr(so)); /* NOTREACHED */ } - mutex_held = ((struct inpcb *)so->so_pcb)->inpcb_mtx; + mutex_held = &((struct inpcb *)so->so_pcb)->inpcb_mtx; if (so->so_usecount == 0 && (inp->inp_wantcnt == WNT_STOPUSING)) { lck_rw_lock_exclusive(divcbinfo.mtx); @@ -739,7 +741,7 @@ div_getlock(struct socket *so, __unused int locktype) if (so->so_usecount < 0) panic("div_getlock: so=%p usecount=%x lrh= %s\n", so, so->so_usecount, solockhistory_nr(so)); - return(inpcb->inpcb_mtx); + return(&inpcb->inpcb_mtx); } else { panic("div_getlock: so=%p NULL NO PCB lrh= %s\n", so, solockhistory_nr(so)); diff --git a/bsd/netinet/ip_dummynet.c b/bsd/netinet/ip_dummynet.c index 54fceaef4..048cff004 100644 --- a/bsd/netinet/ip_dummynet.c +++ b/bsd/netinet/ip_dummynet.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
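The recurring `&(...)->inpcb_mtx` rewrites in the divert locking functions above are not a locking change but a layout change: the per-PCB mutex is evidently now embedded in struct inpcb rather than separately allocated, so every lock, unlock, and getlock site takes its address. A rough before/after sketch (field shapes inferred from the call sites, not quoted from the header):

	struct inpcb {
		/* ... */
		lck_mtx_t *inpcb_mtx;	/* before: pointer to its own allocation */
	};

	struct inpcb {
		/* ... */
		lck_mtx_t inpcb_mtx;	/* after: embedded, one fewer allocation */
	};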
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -89,6 +89,7 @@ #include <sys/socketvar.h> #include <sys/time.h> #include <sys/sysctl.h> +//#include <sys/mcache.h> #include <net/if.h> #include <net/route.h> #include <net/kpi_protocol.h> @@ -121,6 +122,8 @@ static int red_lookup_depth = 256; /* RED - default lookup table depth */ static int red_avg_pkt_size = 512; /* RED - default medium packet size */ static int red_max_pkt_size = 1500; /* RED - default max packet size */ +static int serialize = 0; + /* * Three heaps contain queues and pipes that the scheduler handles: * @@ -152,9 +155,6 @@ static void ready_event_wfq(struct dn_pipe *p, struct mbuf **head, */ static void dummynet_send(struct mbuf *m); -/* Flag to signify the existance of a dequeued packet chain */ -static int serialize = 0; - #define HASHSIZE 16 #define HASH(num) ((((num) >> 8) ^ ((num) >> 4) ^ (num)) & 0x0f) static struct dn_pipe_head pipehash[HASHSIZE]; /* all pipes */ @@ -163,36 +163,36 @@ static struct dn_flow_set_head flowsethash[HASHSIZE]; /* all flowsets */ #ifdef SYSCTL_NODE SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, - CTLFLAG_RW, 0, "Dummynet"); + CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Dummynet"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size, - CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size"); + CTLFLAG_RW | CTLFLAG_LOCKED, &dn_hash_size, 0, "Default hash table size"); SYSCTL_QUAD(_net_inet_ip_dummynet, OID_AUTO, curr_time, - CTLFLAG_RD, &curr_time, "Current tick"); + CTLFLAG_RD | CTLFLAG_LOCKED, &curr_time, "Current tick"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap, - CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap"); + CTLFLAG_RD | CTLFLAG_LOCKED, &ready_heap.size, 0, "Size of ready heap"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap, - CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap"); + CTLFLAG_RD | CTLFLAG_LOCKED, &extract_heap.size, 0, "Size of extract heap"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches, - CTLFLAG_RD, &searches, 0, "Number of queue searches"); + CTLFLAG_RD | CTLFLAG_LOCKED, &searches, 0, "Number of queue searches"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps, - CTLFLAG_RD, &search_steps, 0, "Number of queue search steps"); + CTLFLAG_RD | CTLFLAG_LOCKED, &search_steps, 0, "Number of queue search steps"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire, - CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty"); + CTLFLAG_RW | CTLFLAG_LOCKED, &pipe_expire, 0, "Expire queue if empty"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len, - CTLFLAG_RW, &dn_max_ratio, 0, + CTLFLAG_RW | CTLFLAG_LOCKED, &dn_max_ratio, 0, "Max ratio between dynamic queues and buckets"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, - CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table"); + CTLFLAG_RD | CTLFLAG_LOCKED, &red_lookup_depth, 0, "Depth of RED lookup table"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size, - CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size"); + CTLFLAG_RD | CTLFLAG_LOCKED, &red_avg_pkt_size, 0, "RED Medium packet size"); SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size, - CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size"); + CTLFLAG_RD | CTLFLAG_LOCKED, &red_max_pkt_size, 0, "RED Max packet size"); #endif #ifdef DUMMYNET_DEBUG int dummynet_debug = 0; #ifdef SYSCTL_NODE -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug, +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED, 
&dummynet_debug, 0, "control debugging printfs"); #endif #define DPRINTF(X) if (dummynet_debug) printf X @@ -457,6 +457,7 @@ char *cp_pipe_to_32_user(struct dn_pipe *p, struct dn_pipe_32 *pipe_bp) pipe_bp->pipe_nr = p->pipe_nr; pipe_bp->bandwidth = p->bandwidth; + pipe_bp->delay = p->delay; bcopy( &(p->scheduler_heap), &(pipe_bp->scheduler_heap), sizeof(struct dn_heap_32)); pipe_bp->scheduler_heap.p = CAST_DOWN_EXPLICIT(user32_addr_t, pipe_bp->scheduler_heap.p); bcopy( &(p->not_eligible_heap), &(pipe_bp->not_eligible_heap), sizeof(struct dn_heap_32)); @@ -497,6 +498,7 @@ char *cp_pipe_to_64_user(struct dn_pipe *p, struct dn_pipe_64 *pipe_bp) pipe_bp->pipe_nr = p->pipe_nr; pipe_bp->bandwidth = p->bandwidth; + pipe_bp->delay = p->delay; bcopy( &(p->scheduler_heap), &(pipe_bp->scheduler_heap), sizeof(struct dn_heap_64)); pipe_bp->scheduler_heap.p = CAST_DOWN(user64_addr_t, pipe_bp->scheduler_heap.p); bcopy( &(p->not_eligible_heap), &(pipe_bp->not_eligible_heap), sizeof(struct dn_heap_64)); @@ -648,47 +650,6 @@ heap_extract(struct dn_heap *h, void *obj) } } -#if 0 -/* - * change object position and update references - * XXX this one is never used! - */ -static void -heap_move(struct dn_heap *h, dn_key new_key, void *object) -{ - int temp; - int i ; - int maxelt = h->elements-1 ; - struct dn_heap_entry buf ; - - if (h->offset <= 0) - panic("cannot move items on this heap"); - - i = *((int *)((char *)object + h->offset)); - if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */ - h->p[i].key = new_key ; - for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ; - i = temp ) { /* bubble up */ - HEAP_SWAP(h->p[i], h->p[temp], buf) ; - SET_OFFSET(h, i); - } - } else { /* must move down */ - h->p[i].key = new_key ; - while ( (temp = HEAP_LEFT(i)) <= maxelt ) { /* found left child */ - if ((temp != maxelt) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key)) - temp++ ; /* select child with min key */ - if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */ - HEAP_SWAP(h->p[i], h->p[temp], buf) ; - SET_OFFSET(h, i); - } else - break ; - i = temp ; - } - } - SET_OFFSET(h, i); -} -#endif /* heap_move, unused */ - /* * heapify() will reorganize data inside an array to maintain the * heap property. It is needed when we delete a bunch of entries. @@ -757,10 +718,10 @@ transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail) { struct mbuf *m ; struct dn_pkt_tag *pkt ; + u_int64_t schedule_time; lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED); - - /* Extract packets only if no pending chain is being currently processed */ + ASSERT(serialize >= 0); if (serialize == 0) { while ((m = pipe->head) != NULL) { pkt = dn_tag_get(m); @@ -774,9 +735,13 @@ transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail) *head = m; *tail = m; } + if (*tail != NULL) (*tail)->m_nextpkt = NULL; - } + } + + schedule_time = DN_KEY_LEQ(pkt->output_time, curr_time) ? 
+ curr_time+1 : pkt->output_time; /* if there are leftover packets, put the pipe into the heap for next ready event */ if ((m = pipe->head) != NULL) { @@ -784,7 +749,7 @@ transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail) /* XXX should check errors on heap_insert, by draining the * whole pipe p and hoping in the future we are more successful */ - heap_insert(&extract_heap, pkt->output_time, pipe); + heap_insert(&extract_heap, schedule_time, pipe); } } @@ -1105,21 +1070,17 @@ dummynet(__unused void * unused) break; } } - - /* - * If a packet chain has been dequeued, set serialize=1 so that new - * packets don't get dispatched out of turn - */ + if (head != NULL) - serialize = 1; - - lck_mtx_unlock(dn_mutex); + serialize++; + + lck_mtx_unlock(dn_mutex); /* Send out the de-queued list of ready-to-send packets */ if (head != NULL) { dummynet_send(head); lck_mtx_lock(dn_mutex); - serialize = 0; + serialize--; lck_mtx_unlock(dn_mutex); } } @@ -1193,13 +1154,19 @@ if_tx_rdy(struct ifnet *ifp) p->numbytes = 0 ; /* mark ready for I/O */ ready_event_wfq(p, &head, &tail); } + + if (head != NULL) { + serialize++; + } + lck_mtx_unlock(dn_mutex); /* Send out the de-queued list of ready-to-send packets */ - if (head != NULL) + if (head != NULL) { dummynet_send(head); - + serialize--; + } return 0; } @@ -1214,6 +1181,7 @@ expire_queues(struct dn_flow_set *fs) int i, initial_elements = fs->rq_elements ; struct timeval timenow; + /* reviewed for getmicrotime usage */ getmicrotime(&timenow); if (fs->last_expired == timenow.tv_sec) @@ -1564,8 +1532,8 @@ dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa) goto dropit ; /* XXX expensive to zero, see if we can remove it*/ - mtag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET, - sizeof(struct dn_pkt_tag), M_NOWAIT); + mtag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET, + sizeof(struct dn_pkt_tag), M_NOWAIT, m); if ( mtag == NULL ) goto dropit ; /* cannot allocate packet header */ m_tag_prepend(m, mtag); /* attach to mbuf chain */ @@ -1591,7 +1559,7 @@ dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa) if (fwa->dst == (struct sockaddr_in *)&fwa->ro->ro_dst) /* dst points into ro */ fwa->dst = (struct sockaddr_in *)&(pkt->ro.ro_dst) ; - pkt->dn_dst = fwa->dst; + bcopy (fwa->dst, &pkt->dn_dst, sizeof(pkt->dn_dst)); pkt->flags = fwa->flags; if (fwa->ipoa != NULL) pkt->ipoa = *(fwa->ipoa); @@ -1619,7 +1587,7 @@ dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa) if (pipe->bandwidth) t = SET_TICKS(m, q, pipe); q->sched_time = curr_time ; - if (t == 0) /* must process it now */ + if (t == 0) /* must process it now */ ready_event( q , &head, &tail ); else heap_insert(&ready_heap, curr_time + t , q ); @@ -1682,9 +1650,10 @@ done: ts.tv_nsec = 1 * 1000000; // 1ms timer_enabled = 1; bsd_timeout(dummynet, NULL, &ts); - } + } lck_mtx_unlock(dn_mutex); + if (head != NULL) dummynet_send(head); @@ -1964,9 +1933,9 @@ set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src) x->qsize = 1024*1024 ; } else { if (x->qsize == 0) - x->qsize = 50 ; + x->qsize = 50; if (x->qsize > 100) - x->qsize = 50 ; + x->qsize = 50; } /* configuring RED */ if ( x->flags_fs & DN_IS_RED ) diff --git a/bsd/netinet/ip_dummynet.h b/bsd/netinet/ip_dummynet.h index 83f38d24e..e5dd1f337 100644 --- a/bsd/netinet/ip_dummynet.h +++ b/bsd/netinet/ip_dummynet.h @@ -157,7 +157,7 @@ struct dn_pkt_tag { dn_key output_time; /* when the pkt is due for delivery */ struct ifnet *ifp; /* 
interface, for ip_output */ - struct sockaddr_in *dn_dst ; + struct sockaddr_in dn_dst ; struct route ro; /* route, for ip_output. MUST COPY */ int flags ; /* flags, for ip_output (IPv6 ?) */ struct ip_out_args ipoa; /* output args, for ip_output. MUST COPY */ diff --git a/bsd/netinet/ip_encap.c b/bsd/netinet/ip_encap.c index 6dd02fe9e..0d487326a 100644 --- a/bsd/netinet/ip_encap.c +++ b/bsd/netinet/ip_encap.c @@ -259,9 +259,7 @@ encap4_input(m, off) #if INET6 int -encap6_input(mp, offp) - struct mbuf **mp; - int *offp; +encap6_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ip6_hdr *ip6; @@ -269,10 +267,8 @@ encap6_input(mp, offp) const struct ip6protosw *psw; struct encaptab *ep, *match; int prio, matchprio; - int proto; ip6 = mtod(m, struct ip6_hdr *); - proto = ip6->ip6_nxt; bzero(&s, sizeof(s)); s.sin6_family = AF_INET6; @@ -315,7 +311,7 @@ encap6_input(mp, offp) psw = (const struct ip6protosw *)match->psw; if (psw && psw->pr_input) { encap_fillarg(m, match); - return (*psw->pr_input)(mp, offp); + return (*psw->pr_input)(mp, offp, proto); } else { m_freem(m); return IPPROTO_DONE; @@ -323,7 +319,7 @@ encap6_input(mp, offp) } /* last resort: inject to raw socket */ - return rip6_input(mp, offp); + return rip6_input(mp, offp, proto); } #endif @@ -532,8 +528,8 @@ encap_fillarg( struct m_tag *tag; struct encaptabtag *et; - tag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_ENCAP, - sizeof(struct encaptabtag), M_WAITOK); + tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_ENCAP, + sizeof(struct encaptabtag), M_WAITOK, m); if (tag != NULL) { et = (struct encaptabtag*)(tag + 1); diff --git a/bsd/netinet/ip_encap.h b/bsd/netinet/ip_encap.h index 66dfb2588..1c65ea956 100644 --- a/bsd/netinet/ip_encap.h +++ b/bsd/netinet/ip_encap.h @@ -77,7 +77,7 @@ struct encaptab { void encap_init(void) __attribute__((section("__TEXT, initcode"))); void encap4_input(struct mbuf *, int); -int encap6_input(struct mbuf **, int *); +int encap6_input(struct mbuf **, int *, int); const struct encaptab *encap_attach(int, int, const struct sockaddr *, const struct sockaddr *, const struct sockaddr *, const struct sockaddr *, const struct protosw *, void *); diff --git a/bsd/netinet/ip_flow.c b/bsd/netinet/ip_flow.c deleted file mode 100644 index be5aa9495..000000000 --- a/bsd/netinet/ip_flow.c +++ /dev/null @@ -1,380 +0,0 @@ -/* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/*- - * Copyright (c) 1998 The NetBSD Foundation, Inc. - * All rights reserved. - * - * This code is derived from software contributed to The NetBSD Foundation - * by the 3am Software Foundry ("3am"). It was developed by Matt Thomas. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the NetBSD - * Foundation, Inc. and its contributors. - * 4. Neither the name of The NetBSD Foundation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- * - * $FreeBSD: src/sys/netinet/ip_flow.c,v 1.9.2.1 2001/08/08 08:20:35 ru Exp $ - */ - -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/malloc.h> -#include <sys/mbuf.h> -#include <sys/protosw.h> -#include <sys/socket.h> -#include <sys/kernel.h> - -#include <sys/sysctl.h> -#include <libkern/OSAtomic.h> - -#include <net/if.h> -#include <net/route.h> - -#include <netinet/in.h> -#include <netinet/in_systm.h> -#include <netinet/ip.h> -#include <netinet/in_var.h> -#include <netinet/ip_var.h> -#include <netinet/ip_flow.h> -#include <net/dlil.h> - -#if IPFLOW - -#define IPFLOW_TIMER (5 * PR_SLOWHZ) -#define IPFLOW_HASHBITS 6 /* should not be a multiple of 8 */ -#define IPFLOW_HASHSIZE (1 << IPFLOW_HASHBITS) -static LIST_HEAD(ipflowhead, ipflow) ipflows[IPFLOW_HASHSIZE]; -static int ipflow_inuse; -#define IPFLOW_MAX 256 - -#ifdef __APPLE__ -#define M_IPFLOW M_TEMP -#endif - -static int ipflow_active = 0; -SYSCTL_INT(_net_inet_ip, IPCTL_FASTFORWARDING, fastforwarding, CTLFLAG_RW, - &ipflow_active, 0, "Enable flow-based IP forwarding"); - -#ifndef __APPLE__ -static MALLOC_DEFINE(M_IPFLOW, "ip_flow", "IP flow"); -#endif - -static unsigned -ipflow_hash( - struct in_addr dst, - struct in_addr src, - unsigned tos) -{ - unsigned hash = tos; - int idx; - for (idx = 0; idx < 32; idx += IPFLOW_HASHBITS) - hash += (dst.s_addr >> (32 - idx)) + (src.s_addr >> idx); - return hash & (IPFLOW_HASHSIZE-1); -} - -static struct ipflow * -ipflow_lookup( - const struct ip *ip) -{ - unsigned hash; - struct ipflow *ipf; - - hash = ipflow_hash(ip->ip_dst, ip->ip_src, ip->ip_tos); - - ipf = LIST_FIRST(&ipflows[hash]); - while (ipf != NULL) { - if (ip->ip_dst.s_addr == ipf->ipf_dst.s_addr - && ip->ip_src.s_addr == ipf->ipf_src.s_addr - && ip->ip_tos == ipf->ipf_tos) - break; - ipf = LIST_NEXT(ipf, ipf_next); - } - return ipf; -} - -int -ipflow_fastforward( - struct mbuf *m) -{ - struct ip *ip; - struct ipflow *ipf; - struct rtentry *rt; - struct sockaddr *dst; - int error; - - /* - * Are we forwarding packets? Big enough for an IP packet? - */ - if (!ipforwarding || !ipflow_active || m->m_len < sizeof(struct ip)) - return 0; - /* - * IP header with no option and valid version and length - */ - ip = mtod(m, struct ip *); - if (ip->ip_v != IPVERSION || ip->ip_hl != (sizeof(struct ip) >> 2) - || ntohs(ip->ip_len) > m->m_pkthdr.len) - return 0; - /* - * Find a flow. - */ - if ((ipf = ipflow_lookup(ip)) == NULL) - return 0; - - /* - * Route and interface still up? - */ - rt = ipf->ipf_ro.ro_rt; - if ((rt->rt_flags & RTF_UP) == 0 || (rt->rt_ifp->if_flags & IFF_UP) == 0) - return 0; - - /* - * Packet size OK? TTL? - */ - if (m->m_pkthdr.len > rt->rt_ifp->if_mtu || ip->ip_ttl <= IPTTLDEC) - return 0; - - /* - * Everything checks out and so we can forward this packet. - * Modify the TTL and incrementally change the checksum. - */ - ip->ip_ttl -= IPTTLDEC; - if (ip->ip_sum >= htons(0xffff - (IPTTLDEC << 8))) { - ip->ip_sum += htons(IPTTLDEC << 8) + 1; - } else { - ip->ip_sum += htons(IPTTLDEC << 8); - } - - /* - * Send the packet on its way. All we can get back is ENOBUFS - */ - ipf->ipf_uses++; - ipf->ipf_timer = IPFLOW_TIMER; - - if (rt->rt_flags & RTF_GATEWAY) - dst = rt->rt_gateway; - else - dst = &ipf->ipf_ro.ro_dst; -#ifdef __APPLE__ - /* Not sure the rt_dlt is valid here !! 
XXX */ - if ((error = dlil_output(rt->rt_ifp, PF_INET, m, (caddr_t) rt, dst, 0)) != 0) { - -#else - if ((error = (*rt->rt_ifp->if_output)(rt->rt_ifp, m, dst, rt)) != 0) { -#endif - if (error == ENOBUFS) - ipf->ipf_dropped++; - else - ipf->ipf_errors++; - } - return 1; -} - -static void -ipflow_addstats( - struct ipflow *ipf) -{ - ipf->ipf_ro.ro_rt->rt_use += ipf->ipf_uses; - OSAddAtomic(ipf->ipf_errors + ipf->ipf_dropped, &ipstat.ips_cantforward); - OSAddAtomic(ipf->ipf_uses, &ipstat.ips_forward); - OSAddAtomic(ipf->ipf_uses, &ipstat.ips_fastforward); -} - -static void -ipflow_free( - struct ipflow *ipf) -{ - /* - * Remove the flow from the hash table (at elevated IPL). - * Once it's off the list, we can deal with it at normal - * network IPL. - */ - LIST_REMOVE(ipf, ipf_next); - ipflow_addstats(ipf); - rtfree(ipf->ipf_ro.ro_rt); - ipflow_inuse--; - FREE(ipf, M_IPFLOW); -} - -static struct ipflow * -ipflow_reap( - void) -{ - struct ipflow *ipf, *maybe_ipf = NULL; - int idx; - - for (idx = 0; idx < IPFLOW_HASHSIZE; idx++) { - ipf = LIST_FIRST(&ipflows[idx]); - while (ipf != NULL) { - /* - * If this no longer points to a valid route - * reclaim it. - */ - if ((ipf->ipf_ro.ro_rt->rt_flags & RTF_UP) == 0) - goto done; - /* - * choose the one that's been least recently used - * or has had the least uses in the last 1.5 - * intervals. - */ - if (maybe_ipf == NULL - || ipf->ipf_timer < maybe_ipf->ipf_timer - || (ipf->ipf_timer == maybe_ipf->ipf_timer - && ipf->ipf_last_uses + ipf->ipf_uses < - maybe_ipf->ipf_last_uses + - maybe_ipf->ipf_uses)) - maybe_ipf = ipf; - ipf = LIST_NEXT(ipf, ipf_next); - } - } - ipf = maybe_ipf; - done: - /* - * Remove the entry from the flow table. - */ - LIST_REMOVE(ipf, ipf_next); - ipflow_addstats(ipf); - rtfree(ipf->ipf_ro.ro_rt); - return ipf; -} -/* note: called under the ip_mutex lock */ -void -ipflow_slowtimo( - void) -{ - struct ipflow *ipf; - int idx; - - for (idx = 0; idx < IPFLOW_HASHSIZE; idx++) { - ipf = LIST_FIRST(&ipflows[idx]); - while (ipf != NULL) { - struct ipflow *next_ipf = LIST_NEXT(ipf, ipf_next); - if (--ipf->ipf_timer == 0) { - ipflow_free(ipf); - } else { - ipf->ipf_last_uses = ipf->ipf_uses; - ipf->ipf_ro.ro_rt->rt_use += ipf->ipf_uses; - OSAddAtomic(ipf->ipf_uses, &ipstat.ips_forward); - OSAddAtomic(ipf->ipf_uses, &ipstat.ips_fastforward); - ipstat.ips_forward += ipf->ipf_uses; - ipstat.ips_fastforward += ipf->ipf_uses; - ipf->ipf_uses = 0; - } - ipf = next_ipf; - } - } -} - -void -ipflow_create( - const struct route *ro, - struct mbuf *m) -{ - const struct ip *const ip = mtod(m, struct ip *); - struct ipflow *ipf; - unsigned hash; - - /* - * Don't create cache entries for ICMP messages. - */ - if (!ipflow_active || ip->ip_p == IPPROTO_ICMP) - return; - /* - * See if an existing flow struct exists. If so remove it from it's - * list and free the old route. If not, try to malloc a new one - * (if we aren't at our limit). - */ - ipf = ipflow_lookup(ip); - if (ipf == NULL) { - if (ipflow_inuse == IPFLOW_MAX) { - ipf = ipflow_reap(); - } else { - ipf = (struct ipflow *) _MALLOC(sizeof(*ipf), M_IPFLOW, - M_NOWAIT); - if (ipf == NULL) - return; - ipflow_inuse++; - } - bzero((caddr_t) ipf, sizeof(*ipf)); - } else { - LIST_REMOVE(ipf, ipf_next); - ipflow_addstats(ipf); - rtfree(ipf->ipf_ro.ro_rt); - ipf->ipf_uses = ipf->ipf_last_uses = 0; - ipf->ipf_errors = ipf->ipf_dropped = 0; - } - - /* - * Fill in the updated information. 
- */ - ipf->ipf_ro = *ro; - RT_ADDREF(ro->ro_rt); - ipf->ipf_dst = ip->ip_dst; - ipf->ipf_src = ip->ip_src; - ipf->ipf_tos = ip->ip_tos; - ipf->ipf_timer = IPFLOW_TIMER; - /* - * Insert into the approriate bucket of the flow table. - */ - hash = ipflow_hash(ip->ip_dst, ip->ip_src, ip->ip_tos); - LIST_INSERT_HEAD(&ipflows[hash], ipf, ipf_next); -} -#else /* !IPFLOW */ -int -ipflow_fastforward(struct mbuf *m) -{ -#pragma unused(m) - /* - * Since this symbol is exported (albeit unsupported), just return - * false to keep things (e.g. PPP) happy, in case ipflow is not - * compiled in. - */ - return (0); -} -#endif /* !IPFLOW */ diff --git a/bsd/netinet/ip_flow.h b/bsd/netinet/ip_flow.h deleted file mode 100644 index 972d96351..000000000 --- a/bsd/netinet/ip_flow.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/*- - * Copyright (c) 1998 The NetBSD Foundation, Inc. - * All rights reserved. - * - * This code is derived from software contributed to The NetBSD Foundation - * by the 3am Software Foundry ("3am"). It was developed by Matt Thomas. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the NetBSD - * Foundation, Inc. and its contributors. - * 4. Neither the name of The NetBSD Foundation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD: src/sys/netinet/ip_flow.h,v 1.2 1999/08/28 00:49:22 peter Exp $ - */ - -#ifndef _NETINET_IP_FLOW_H -#define _NETINET_IP_FLOW_H -#include <sys/appleapiopts.h> - -#ifdef KERNEL_PRIVATE -struct ipflow { - LIST_ENTRY(ipflow) ipf_next; /* next ipflow in bucket */ - struct in_addr ipf_dst; /* destination address */ - struct in_addr ipf_src; /* source address */ - - u_int8_t ipf_tos; /* type-of-service */ - struct route ipf_ro; /* associated route entry */ - u_int32_t ipf_uses; /* number of uses in this period */ - - int ipf_timer; /* remaining lifetime of this entry */ - u_int32_t ipf_dropped; /* ENOBUFS returned by if_output */ - u_int32_t ipf_errors; /* other errors returned by if_output */ - u_int32_t ipf_last_uses; /* number of uses in last period */ -}; -#endif /* KERNEL_PRIVATE */ - -#endif diff --git a/bsd/netinet/ip_fw.h b/bsd/netinet/ip_fw.h index 6755fab56..53ead3fa0 100644 --- a/bsd/netinet/ip_fw.h +++ b/bsd/netinet/ip_fw.h @@ -42,6 +42,7 @@ #ifndef _IP_FW_H #define _IP_FW_H +#ifdef __APPLE_API_OBSOLETE #include <sys/appleapiopts.h> @@ -324,4 +325,5 @@ extern struct ipfw_flow_id last_pkt ; #endif /* KERNEL_PRIVATE */ #endif /* !IPFW2 */ +#endif /* __APPLE_API_OBSOLETE */ #endif /* _IP_FW_H */ diff --git a/bsd/netinet/ip_fw2.c b/bsd/netinet/ip_fw2.c index 9be482912..bb66be5d7 100644 --- a/bsd/netinet/ip_fw2.c +++ b/bsd/netinet/ip_fw2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2004-2010 Apple Inc. All rights reserved. 
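The sysctl churn in this file, like that in ip_dummynet.c above and ip_icmp.c below, is one mechanical change: every node gains CTLFLAG_LOCKED, the flag that declares the node handles its own locking so the sysctl layer does not have to serialize it behind the legacy global protection. The pattern, with a hypothetical knob for illustration:

	static int example_knob = 1;	/* hypothetical variable */
	SYSCTL_INT(_net_inet_ip, OID_AUTO, example_knob,
	    CTLFLAG_RW | CTLFLAG_LOCKED, &example_knob, 0,
	    "Tunable whose handler does its own locking");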
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -165,21 +165,21 @@ static int ipfw_sysctl SYSCTL_HANDLER_ARGS; SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Firewall"); SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, enable, - CTLTYPE_INT | CTLFLAG_RW, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &fw_enable, 0, ipfw_sysctl, "I", "Enable ipfw"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_RW | CTLFLAG_LOCKED, &autoinc_step, 0, "Rule number autoincrement step"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass, - CTLFLAG_RW, + CTLFLAG_RW | CTLFLAG_LOCKED, &fw_one_pass, 0, "Only do a single pass through ipfw when using dummynet(4)"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, debug, - CTLFLAG_RW, + CTLFLAG_RW | CTLFLAG_LOCKED, &fw_debug, 0, "Enable printing of debug ip_fw statements"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose, - CTLFLAG_RW, + CTLFLAG_RW | CTLFLAG_LOCKED, &fw_verbose, 0, "Log matches to ipfw rules"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &verbose_limit, 0, "Set upper limit of matches of ipfw rules logged"); /* @@ -251,29 +251,29 @@ static u_int32_t static_len_64; /* size in bytes of static rules for 64 bit clie static u_int32_t dyn_count; /* # of dynamic rules */ static u_int32_t dyn_max = 4096; /* max # of dynamic rules */ -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW | CTLFLAG_LOCKED, &dyn_buckets, 0, "Number of dyn. buckets"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_RD, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_RD | CTLFLAG_LOCKED, &curr_dyn_buckets, 0, "Current Number of dyn. buckets"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_RD, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_RD | CTLFLAG_LOCKED, &dyn_count, 0, "Number of dyn. rules"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_RW | CTLFLAG_LOCKED, &dyn_max, 0, "Max number of dyn. rules"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD | CTLFLAG_LOCKED, &static_count, 0, "Number of static rules"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW | CTLFLAG_LOCKED, &dyn_ack_lifetime, 0, "Lifetime of dyn. rules for acks"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW | CTLFLAG_LOCKED, &dyn_syn_lifetime, 0, "Lifetime of dyn. rules for syn"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW | CTLFLAG_LOCKED, &dyn_fin_lifetime, 0, "Lifetime of dyn. rules for fin"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW | CTLFLAG_LOCKED, &dyn_rst_lifetime, 0, "Lifetime of dyn. rules for rst"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW | CTLFLAG_LOCKED, &dyn_udp_lifetime, 0, "Lifetime of dyn.
rules for UDP"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW | CTLFLAG_LOCKED, &dyn_short_lifetime, 0, "Lifetime of dyn. rules for other situations"); -SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW | CTLFLAG_LOCKED, &dyn_keepalive, 0, "Enable keepalives for dyn. rules"); @@ -338,6 +338,8 @@ void ipfwsyslog( int level, const char *format,...) unsigned char pri; int loglen; + bzero(msgBuf, msgsize); + bzero(&ev_msg, sizeof(struct kev_msg)); va_start( ap, format ); loglen = vsnprintf(msgBuf, msgsize, format, ap); va_end( ap ); @@ -965,15 +967,18 @@ iface_match(struct ifnet *ifp, ipfw_insn_if *cmd) ifnet_lock_shared(ifp); TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { - if (ia->ifa_addr == NULL) - continue; - if (ia->ifa_addr->sa_family != AF_INET) + IFA_LOCK(ia); + if (ia->ifa_addr->sa_family != AF_INET) { + IFA_UNLOCK(ia); continue; + } if (cmd->p.ip.s_addr == ((struct sockaddr_in *) (ia->ifa_addr))->sin_addr.s_addr) { + IFA_UNLOCK(ia); ifnet_lock_done(ifp); return(1); /* match */ } + IFA_UNLOCK(ia); } ifnet_lock_done(ifp); } diff --git a/bsd/netinet/ip_fw2.h b/bsd/netinet/ip_fw2.h index 0485bcbc2..5e093b170 100644 --- a/bsd/netinet/ip_fw2.h +++ b/bsd/netinet/ip_fw2.h @@ -55,6 +55,7 @@ #ifndef _IPFW2_H #define _IPFW2_H +#ifdef __APPLE_API_OBSOLETE /* * Define IP Firewall event subclass, and associated events. @@ -634,4 +635,5 @@ extern int fw_enable; #endif /* IPFIREWALL */ #endif /* KERNEL */ +#endif /* __APPLE_API_OBSOLETE */ #endif /* _IPFW2_H */ diff --git a/bsd/netinet/ip_fw2_compat.c b/bsd/netinet/ip_fw2_compat.c index 766fa8fc8..712f49241 100644 --- a/bsd/netinet/ip_fw2_compat.c +++ b/bsd/netinet/ip_fw2_compat.c @@ -1592,7 +1592,7 @@ ipfw_version_latest_to_zero(struct ip_fw *curr_rule, struct ip_old_fw *rule_vers rule_vers0->pipe_ptr = CAST_DOWN_EXPLICIT(void*, rule_vers1.pipe_ptr); rule_vers0->next_rule_ptr = CAST_DOWN_EXPLICIT(void*, rule_vers1.next_rule_ptr); - if (rule_vers1.fw_ipflg && IP_FW_IF_TCPEST_COMPAT) rule_vers0->fw_tcpf |= IP_OLD_FW_TCPF_ESTAB; + if (rule_vers1.fw_ipflg & IP_FW_IF_TCPEST_COMPAT) rule_vers0->fw_tcpf |= IP_OLD_FW_TCPF_ESTAB; } else { struct ip_fw_compat_32 rule_vers1; @@ -1620,7 +1620,7 @@ ipfw_version_latest_to_zero(struct ip_fw *curr_rule, struct ip_old_fw *rule_vers rule_vers0->pipe_ptr = CAST_DOWN_EXPLICIT(void*, rule_vers1.pipe_ptr); rule_vers0->next_rule_ptr = CAST_DOWN_EXPLICIT(void*, rule_vers1.next_rule_ptr); - if (rule_vers1.fw_ipflg && IP_FW_IF_TCPEST_COMPAT) rule_vers0->fw_tcpf |= IP_OLD_FW_TCPF_ESTAB; + if (rule_vers1.fw_ipflg & IP_FW_IF_TCPEST_COMPAT) rule_vers0->fw_tcpf |= IP_OLD_FW_TCPF_ESTAB; } } diff --git a/bsd/netinet/ip_icmp.c b/bsd/netinet/ip_icmp.c index 7f55b2a5f..48ea2f0f5 100644 --- a/bsd/netinet/ip_icmp.c +++ b/bsd/netinet/ip_icmp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
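The two one-character ip_fw2_compat.c fixes above are genuine bug fixes, not cleanup: with `&&`, the test was true whenever fw_ipflg had any bit set at all, so IP_OLD_FW_TCPF_ESTAB was applied to rules that never requested it; `&` isolates the intended bit. A small illustration (the 0x02 value is hypothetical):

	int fw_ipflg = 0x01;		/* some unrelated flag set */
	int buggy = (fw_ipflg && 0x02);	/* == 1: spurious match */
	int fixed = (fw_ipflg & 0x02);	/* == 0: correct, bit not set */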
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -124,23 +124,23 @@ */ struct icmpstat icmpstat; -SYSCTL_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RD, +SYSCTL_STRUCT(_net_inet_icmp, ICMPCTL_STATS, stats, CTLFLAG_RD | CTLFLAG_LOCKED, &icmpstat, icmpstat, ""); static int icmpmaskrepl = 0; -SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_RW | CTLFLAG_LOCKED, &icmpmaskrepl, 0, ""); static int icmptimestamp = 0; -SYSCTL_INT(_net_inet_icmp, ICMPCTL_TIMESTAMP, timestamp, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, ICMPCTL_TIMESTAMP, timestamp, CTLFLAG_RW | CTLFLAG_LOCKED, &icmptimestamp, 0, ""); static int drop_redirect = 0; -SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_RW | CTLFLAG_LOCKED, &drop_redirect, 0, ""); static int log_redirect = 0; -SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW | CTLFLAG_LOCKED, &log_redirect, 0, ""); #if ICMP_BANDLIM @@ -151,12 +151,12 @@ SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_RW, */ static int icmplim = 250; -SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RW | CTLFLAG_LOCKED, &icmplim, 0, ""); #else static int icmplim = -1; -SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RD, +SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RD | CTLFLAG_LOCKED, &icmplim, 0, ""); #endif @@ -166,7 +166,7 @@ SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_RD, */ static int icmpbmcastecho = 1; -SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_RW, +SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_RW | CTLFLAG_LOCKED, &icmpbmcastecho, 0, ""); @@ -537,8 +537,10 @@ icmp_input(struct mbuf *m, int hlen) (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif); if (ia == 0) break; + IFA_LOCK(&ia->ia_ifa); if (ia->ia_ifp == 0) { - ifafree(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); ia = NULL; break; } @@ -550,7 +552,8 @@ icmp_input(struct mbuf *m, int hlen) else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT) ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr; } - ifafree(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); reflect: ip->ip_len += hlen; /* since ip_input deducts this */ icmpstat.icps_reflect++; @@ -662,8 +665,13 @@ icmp_reflect(struct mbuf *m) */ lck_rw_lock_shared(in_ifaddr_rwlock); TAILQ_FOREACH(ia, INADDR_HASH(t.s_addr), ia_hash) { - if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) + IFA_LOCK(&ia->ia_ifa); + if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) { + IFA_ADDREF_LOCKED(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); goto match; + } + IFA_UNLOCK(&ia->ia_ifa); } /* * Slow path; check for broadcast addresses. Find a source @@ -671,13 +679,16 @@ icmp_reflect(struct mbuf *m) * let IP handle the source interface selection work. 
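These icmp_reflect() loops also show the per-address locking discipline introduced throughout the patch: each ifaddr now carries its own lock, fields are only read under IFA_LOCK, and the reference is taken before the lock is dropped via IFA_ADDREF_LOCKED, with a matching IFA_REMREF once the address is no longer needed. Distilled from the hunk above:

	TAILQ_FOREACH(ia, INADDR_HASH(t.s_addr), ia_hash) {
		IFA_LOCK(&ia->ia_ifa);
		if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) {
			IFA_ADDREF_LOCKED(&ia->ia_ifa);	/* ref while locked */
			IFA_UNLOCK(&ia->ia_ifa);
			break;		/* use ia, then IFA_REMREF() */
		}
		IFA_UNLOCK(&ia->ia_ifa);
	}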
*/ for (ia = in_ifaddrhead.tqh_first; ia; ia = ia->ia_link.tqe_next) { + IFA_LOCK(&ia->ia_ifa); if (ia->ia_ifp && (ia->ia_ifp->if_flags & IFF_BROADCAST) && - t.s_addr == satosin(&ia->ia_broadaddr)->sin_addr.s_addr) + t.s_addr == satosin(&ia->ia_broadaddr)->sin_addr.s_addr) { + IFA_ADDREF_LOCKED(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); break; + } + IFA_UNLOCK(&ia->ia_ifa); } match: - if (ia) - ifaref(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); icmpdst.sin_addr = t; if ((ia == (struct in_ifaddr *)0) && m->m_pkthdr.rcvif) @@ -695,16 +706,18 @@ match: m_freem(m); goto done; } - ifaref(&ia->ia_ifa); + IFA_ADDREF(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); } #if CONFIG_MACF_NET mac_netinet_icmp_reply(m); #endif + IFA_LOCK_SPIN(&ia->ia_ifa); t = IA_SIN(ia)->sin_addr; + IFA_UNLOCK(&ia->ia_ifa); ip->ip_src = t; ip->ip_ttl = ip_defttl; - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); ia = NULL; if (optlen > 0) { @@ -797,10 +810,10 @@ icmp_send(struct mbuf *m, struct mbuf *opts) int hlen; struct icmp *icp; struct route ro; - struct ip_out_args ipoa = { IFSCOPE_NONE }; + struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL) - ipoa.ipoa_ifscope = m->m_pkthdr.rcvif->if_index; + ipoa.ipoa_boundif = m->m_pkthdr.rcvif->if_index; hlen = IP_VHL_HL(ip->ip_vhl) << 2; m->m_data += hlen; @@ -1037,6 +1050,7 @@ icmp_dgram_ctloutput(struct socket *so, struct sockopt *sopt) case IP_RECVDSTADDR: case IP_RETOPTS: case IP_MULTICAST_IF: + case IP_MULTICAST_IFINDEX: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: @@ -1054,6 +1068,7 @@ icmp_dgram_ctloutput(struct socket *so, struct sockopt *sopt) #if CONFIG_FORCE_OUT_IFP case IP_FORCE_OUT_IFP: #endif + case IP_NO_IFT_CELLULAR: error = rip_ctloutput(so, sopt); break; @@ -1109,12 +1124,15 @@ icmp_dgram_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *n } TAILQ_FOREACH(ia, INADDR_HASH(ip->ip_src.s_addr), ia_hash) { + IFA_LOCK(&ia->ia_ifa); if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_src.s_addr) { + IFA_UNLOCK(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); socket_lock(so, 0); goto ours; } + IFA_UNLOCK(&ia->ia_ifa); } lck_rw_done(in_ifaddr_rwlock); socket_lock(so, 0); diff --git a/bsd/netinet/ip_id.c b/bsd/netinet/ip_id.c index 7a6fef876..46c7fecd9 100644 --- a/bsd/netinet/ip_id.c +++ b/bsd/netinet/ip_id.c @@ -137,6 +137,7 @@ ip_initid(void) struct timeval timenow; getmicrotime(&timenow); + read_random((void *) &tmp, sizeof(tmp)); ru_x = (tmp & 0xFFFF) % RU_M; /* 15 bits of random seed */ diff --git a/bsd/netinet/ip_input.c b/bsd/netinet/ip_input.c index 10156a869..761b4b40c 100644 --- a/bsd/netinet/ip_input.c +++ b/bsd/netinet/ip_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
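The one-line ip_id.c change above deserves a note: ip_initid() now overwrites its seed temporary with read_random() after sampling the clock, so the randomized IP-ID sequence no longer depends on a boot-time clock value an attacker might guess. The seeding step becomes:

	getmicrotime(&timenow);
	read_random((void *) &tmp, sizeof (tmp));
	ru_x = (tmp & 0xFFFF) % RU_M;	/* 15 bits of random seed */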
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -80,6 +80,8 @@ #include <sys/kernel.h> #include <sys/syslog.h> #include <sys/sysctl.h> +#include <sys/mcache.h> +#include <mach/mach_time.h> #include <machine/endian.h> @@ -93,6 +95,7 @@ #include <net/if_dl.h> #include <net/route.h> #include <net/kpi_protocol.h> +#include <net/ntstat.h> #include <netinet/in.h> #include <netinet/in_systm.h> @@ -113,6 +116,7 @@ #include <netinet/udp.h> #include <netinet/udp_var.h> #include <netinet/bootp.h> +#include <mach/sdt.h> #if CONFIG_MACF_NET #include <security/mac_framework.h> @@ -167,46 +171,46 @@ SYSCTL_PROC(_net_inet_ip, IPCTL_FORWARDING, forwarding, sysctl_ipforwarding, "I", "Enable IP forwarding between interfaces"); static int ipsendredirects = 1; /* XXX */ -SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_RW | CTLFLAG_LOCKED, &ipsendredirects, 0, "Enable sending IP redirects"); int ip_defttl = IPDEFTTL; -SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_defttl, 0, "Maximum TTL on IP packets"); static int ip_dosourceroute = 0; -SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_dosourceroute, 0, "Enable forwarding source routed IP packets"); static int ip_acceptsourceroute = 0; SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute, - CTLFLAG_RW, &ip_acceptsourceroute, 0, + CTLFLAG_RW | CTLFLAG_LOCKED, &ip_acceptsourceroute, 0, "Enable accepting source routed IP packets"); static int ip_keepfaith = 0; -SYSCTL_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_KEEPFAITH, keepfaith, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_keepfaith, 0, "Enable packet capture for FAITH IPv4->IPv6 translater daemon"); static int nipq = 0; /* total # of reass queues */ static int maxnipq; -SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_RW | CTLFLAG_LOCKED, &maxnipq, 0, "Maximum number of IPv4 fragment reassembly queue entries"); static int maxfragsperpacket; -SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_RW | CTLFLAG_LOCKED, &maxfragsperpacket, 0, "Maximum number of IPv4 fragments allowed per packet"); static int maxfrags; -SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW | CTLFLAG_LOCKED, &maxfrags, 0, "Maximum number of IPv4 fragments allowed"); static int currentfrags = 0; int ip_doscopedroute = 1; -SYSCTL_INT(_net_inet_ip, OID_AUTO, scopedroute, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, scopedroute, CTLFLAG_RD | CTLFLAG_LOCKED, &ip_doscopedroute, 0, "Enable IPv4 scoped routing"); /* @@ -223,7 +227,7 @@ SYSCTL_INT(_net_inet_ip, OID_AUTO, scopedroute, CTLFLAG_RW, * packets for those addresses are received. 
*/ static int ip_checkinterface = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_checkinterface, 0, "Verify packet arrives on correct interface"); @@ -251,13 +255,13 @@ static u_int32_t inaddr_nhash; /* hash table size */ static u_int32_t inaddr_hashp; /* next largest prime */ struct ifqueue ipintrq; -SYSCTL_INT(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLFLAG_RW | CTLFLAG_LOCKED, &ipintrq.ifq_maxlen, 0, "Maximum size of the IP input queue"); -SYSCTL_INT(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLFLAG_RD, +SYSCTL_INT(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLFLAG_RD | CTLFLAG_LOCKED, &ipintrq.ifq_drops, 0, "Number of packets dropped from the IP input queue"); struct ipstat ipstat; -SYSCTL_STRUCT(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RD, +SYSCTL_STRUCT(_net_inet_ip, IPCTL_STATS, stats, CTLFLAG_RD | CTLFLAG_LOCKED, &ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); /* Packet reassembly stuff */ @@ -279,13 +283,13 @@ lck_mtx_t *inet_domain_mutex; extern lck_mtx_t *domain_proto_mtx; #if IPCTL_DEFMTU -SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_mtu, 0, "Default MTU"); #endif #if IPSTEALTH static int ipstealth = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW | CTLFLAG_LOCKED, &ipstealth, 0, ""); #endif @@ -304,17 +308,17 @@ ip_dn_io_t *ip_dn_io_ptr; int (*fr_checkp)(struct ip *, int, struct ifnet *, int, struct mbuf **) = NULL; #endif /* IPFIREWALL */ -SYSCTL_NODE(_net_inet_ip, OID_AUTO, linklocal, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "link local"); +SYSCTL_NODE(_net_inet_ip, OID_AUTO, linklocal, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "link local"); struct ip_linklocal_stat ip_linklocal_stat; -SYSCTL_STRUCT(_net_inet_ip_linklocal, OID_AUTO, stat, CTLFLAG_RD, +SYSCTL_STRUCT(_net_inet_ip_linklocal, OID_AUTO, stat, CTLFLAG_RD | CTLFLAG_LOCKED, &ip_linklocal_stat, ip_linklocal_stat, "Number of link local packets with TTL less than 255"); -SYSCTL_NODE(_net_inet_ip_linklocal, OID_AUTO, in, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "link local input"); +SYSCTL_NODE(_net_inet_ip_linklocal, OID_AUTO, in, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "link local input"); int ip_linklocal_in_allowbadttl = 1; -SYSCTL_INT(_net_inet_ip_linklocal_in, OID_AUTO, allowbadttl, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip_linklocal_in, OID_AUTO, allowbadttl, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_linklocal_in_allowbadttl, 0, "Allow incoming link local packets with TTL less than 255"); @@ -359,7 +363,7 @@ void in_dinit(void); extern u_short ip_id; int ip_use_randomid = 1; -SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, random_id, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_use_randomid, 0, "Randomize IP packets IDs"); #endif @@ -379,6 +383,9 @@ ip_init(void) if (!ip_initialized) { + PE_parse_boot_argn("net.inet.ip.scopedroute", + &ip_doscopedroute, sizeof (ip_doscopedroute)); + in_ifaddr_init(); in_ifaddr_rwlock_grp_attr = lck_grp_attr_alloc_init(); @@ -391,6 +398,8 @@ ip_init(void) TAILQ_INIT(&in_ifaddrhead); in_ifaddrhashtbl_init(); + ip_moptions_init(); + pr = pffindproto_locked(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == 0) panic("ip_init"); @@ -531,9 +540,7 @@ in_dinit(void) if (!inetdomain_initted) { -#if 0 - kprintf("Initing %d protosw 
entries\n", in_proto_count); -#endif + /* kprintf("Initing %d protosw entries\n", in_proto_count); */ dp = &inetdomain; dp->dom_flags = DOM_REENTRANT; @@ -637,6 +644,9 @@ ip_input(struct mbuf *m) #endif ipfilter_t inject_filter_ref = 0; + /* Check if the mbuf is still valid after interface filter processing */ + MBUF_INPUT_CHECK(m, m->m_pkthdr.rcvif); + #if IPFIREWALL args.eh = NULL; args.oif = NULL; @@ -707,6 +717,11 @@ ipfw_tags_done: if (inject_filter_ref != 0) { ip = mtod(m, struct ip *); hlen = IP_VHL_HL(ip->ip_vhl) << 2; + + DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL, + struct ip *, ip, struct ifnet *, m->m_pkthdr.rcvif, + struct ip *, ip, struct ip6_hdr *, NULL); + ip->ip_len = ntohs(ip->ip_len) - hlen; ip->ip_off = ntohs(ip->ip_off); ip_proto_dispatch_in(m, hlen, ip->ip_p, inject_filter_ref); @@ -801,6 +816,30 @@ ipfw_tags_done: goto bad; } + DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL, + struct ip *, ip, struct ifnet *, m->m_pkthdr.rcvif, + struct ip *, ip, struct ip6_hdr *, NULL); + + /* + * Naively assume we can attribute inbound data to the route we would + * use to send to this destination. Asymetric routing breaks this + * assumption, but it still allows us to account for traffic from + * a remote node in the routing table. + * this has a very significant performance impact so we bypass + * if nstat_collect is disabled. We may also bypass if the + * protocol is tcp in the future because tcp will have a route that + * we can use to attribute the data to. That does mean we would not + * account for forwarded tcp traffic. + */ + if (nstat_collect) { + struct rtentry *rt = + ifnet_cached_rtlookup_inet(m->m_pkthdr.rcvif, ip->ip_src); + if (rt != NULL) { + nstat_route_rx(rt, 1, m->m_pkthdr.len, 0); + rtfree(rt); + } + } + /* * Convert fields to host representation. */ @@ -839,36 +878,29 @@ tooshort: m_adj(m, ip->ip_len - m->m_pkthdr.len); } -#if IPSEC - if (ipsec_bypass == 0 && ipsec_gethist(m, NULL)) - goto pass; -#endif - - /* - * IpHack's section. - * Right now when no processing on packet has done - * and it is still fresh out of network we do our black - * deals with it. - * - Firewall: deny/allow/divert - * - Xlate: translate packet's addr/port (NAT). - * - Pipe: pass pkt through dummynet. - * - Wrap: fake packet's addr/port <unimpl.> - * - Encapsulate: put it in another IP and send out. <unimp.> - */ #if PF /* Invoke inbound packet filter */ - if (pf_af_hook(m->m_pkthdr.rcvif, NULL, &m, AF_INET, TRUE) != 0) { - if (m != NULL) { - panic("%s: unexpected packet %p\n", __func__, m); - /* NOTREACHED */ - } - /* Already freed by callee */ - return; + if (PF_IS_ENABLED) { + int error; + error = pf_af_hook(m->m_pkthdr.rcvif, NULL, &m, AF_INET, TRUE); + if (error != 0) { + if (m != NULL) { + panic("%s: unexpected packet %p\n", __func__, m); + /* NOTREACHED */ + } + /* Already freed by callee */ + return; + } + ip = mtod(m, struct ip *); + hlen = IP_VHL_HL(ip->ip_vhl) << 2; } - ip = mtod(m, struct ip *); - hlen = IP_VHL_HL(ip->ip_vhl) << 2; #endif /* PF */ +#if IPSEC + if (ipsec_bypass == 0 && ipsec_gethist(m, NULL)) + goto pass; +#endif + #if IPFIREWALL #if DUMMYNET iphack: @@ -1015,11 +1047,14 @@ pass: * arrived via the correct interface if checking is * enabled. 
*/ + IFA_LOCK_SPIN(&ia->ia_ifa); if (IA_SIN(ia)->sin_addr.s_addr == pkt_dst.s_addr && (!checkif || ia->ia_ifp == m->m_pkthdr.rcvif)) { + IFA_UNLOCK(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); goto ours; } + IFA_UNLOCK(&ia->ia_ifa); } lck_rw_done(in_ifaddr_rwlock); @@ -1037,15 +1072,20 @@ pass: ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { - if (ifa->ifa_addr->sa_family != AF_INET) + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr->sa_family != AF_INET) { + IFA_UNLOCK(ifa); continue; + } ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == pkt_dst.s_addr || ia->ia_netbroadcast.s_addr == pkt_dst.s_addr) { + IFA_UNLOCK(ifa); ifnet_lock_done(ifp); goto ours; } + IFA_UNLOCK(ifa); } ifnet_lock_done(ifp); } @@ -1085,14 +1125,15 @@ pass: * See if we belong to the destination multicast group on the * arrival interface. */ - ifnet_lock_shared(ifp); - IN_LOOKUP_MULTI(ip->ip_dst, ifp, inm); - ifnet_lock_done(ifp); + in_multihead_lock_shared(); + IN_LOOKUP_MULTI(&ip->ip_dst, ifp, inm); + in_multihead_lock_done(); if (inm == NULL) { OSAddAtomic(1, &ipstat.ips_notmember); m_freem(m); return; } + INM_REMREF(inm); goto ours; } if (ip->ip_dst.s_addr == (u_int32_t)INADDR_BROADCAST) @@ -1350,9 +1391,9 @@ found: struct m_tag *fwd_tag; struct ip_fwd_tag *ipfwd_tag; - fwd_tag = m_tag_alloc(KERNEL_MODULE_TAG_ID, + fwd_tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, sizeof (*ipfwd_tag), - M_NOWAIT); + M_NOWAIT, m); if (fwd_tag == NULL) { goto bad; } @@ -1731,9 +1772,6 @@ ip_slowtimo(void) } } } -#if IPFLOW - ipflow_slowtimo(); -#endif lck_mtx_unlock(ip_mutex); } @@ -1843,7 +1881,7 @@ ip_dooptions(struct mbuf *m, __unused int pass, struct sockaddr_in *next_hop) break; } else { - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); ia = NULL; } off--; /* 0 origin */ @@ -1903,9 +1941,11 @@ nosourcerouting: goto bad; } ip->ip_dst = ipaddr.sin_addr; + IFA_LOCK(&ia->ia_ifa); (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); - ifafree(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); ia = NULL; cp[IPOPT_OFFSET] += sizeof(struct in_addr); /* @@ -1942,9 +1982,11 @@ nosourcerouting: goto bad; } } + IFA_LOCK(&ia->ia_ifa); (void)memcpy(cp + off, &(IA_SIN(ia)->sin_addr), sizeof(struct in_addr)); - ifafree(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); ia = NULL; cp[IPOPT_OFFSET] += sizeof(struct in_addr); break; @@ -1987,10 +2029,12 @@ nosourcerouting: m->m_pkthdr.rcvif); if (ia == 0) continue; + IFA_LOCK(&ia->ia_ifa); (void)memcpy(sin, &IA_SIN(ia)->sin_addr, sizeof(struct in_addr)); + IFA_UNLOCK(&ia->ia_ifa); ipt->ipt_ptr += sizeof(struct in_addr); - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); ia = NULL; break; @@ -2005,7 +2049,7 @@ nosourcerouting: sizeof(struct in_addr)); if ((ia = (struct in_ifaddr*)ifa_ifwithaddr((SA)&ipaddr)) == 0) continue; - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); ia = NULL; ipt->ipt_ptr += sizeof(struct in_addr); break; @@ -2057,7 +2101,7 @@ ip_rtaddr(struct in_addr dst) RT_LOCK(ro.ro_rt); if ((rt_ifa = ro.ro_rt->rt_ifa) != NULL) - ifaref(rt_ifa); + IFA_ADDREF(rt_ifa); RT_UNLOCK(ro.ro_rt); rtfree(ro.ro_rt); @@ -2204,12 +2248,12 @@ sysctl_ipforwarding SYSCTL_HANDLER_ARGS for (i = 0; i <= if_index; i++) { struct ifnet *ifp = ifindex2ifnet[i]; if (ifp != NULL) { - lck_mtx_lock(ifp->if_fwd_route_lock); - if (ifp->if_fwd_route.ro_rt != NULL) { + lck_mtx_lock(&ifp->if_cached_route_lock); + if (ifp->if_fwd_route.ro_rt != NULL) rtfree(ifp->if_fwd_route.ro_rt); - 
ifp->if_fwd_route.ro_rt = NULL; - } - lck_mtx_unlock(ifp->if_fwd_route_lock); + bzero(&ifp->if_fwd_route, + sizeof (ifp->if_fwd_route)); + lck_mtx_unlock(&ifp->if_cached_route_lock); } } ifnet_head_done(); @@ -2228,20 +2272,16 @@ ip_fwd_route_copyout(struct ifnet *ifp, struct route *dst) { struct route *src = &ifp->if_fwd_route; - lck_mtx_lock(ifp->if_fwd_route_lock); + lck_mtx_lock_spin(&ifp->if_cached_route_lock); + lck_mtx_convert_spin(&ifp->if_cached_route_lock); /* Minor sanity check */ if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) panic("%s: wrong or corrupted route: %p", __func__, src); - /* Copy everything (rt, dst, flags) from ifnet */ - bcopy(src, dst, sizeof (*dst)); - - /* Hold one reference for the local copy of struct route */ - if (dst->ro_rt != NULL) - RT_ADDREF(dst->ro_rt); + route_copyout(dst, src, sizeof(*dst)); - lck_mtx_unlock(ifp->if_fwd_route_lock); + lck_mtx_unlock(&ifp->if_cached_route_lock); } static void @@ -2249,37 +2289,17 @@ ip_fwd_route_copyin(struct ifnet *ifp, struct route *src) { struct route *dst = &ifp->if_fwd_route; - lck_mtx_lock(ifp->if_fwd_route_lock); + lck_mtx_lock_spin(&ifp->if_cached_route_lock); + lck_mtx_convert_spin(&ifp->if_cached_route_lock); /* Minor sanity check */ if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) panic("%s: wrong or corrupted route: %p", __func__, src); - /* No cached route in the ifnet? */ - if (dst->ro_rt == NULL) { - /* - * Copy everything (rt, dst, flags) from ip_forward(); - * the reference to the route was held at the time - * it was allocated and is kept intact. - */ - bcopy(src, dst, sizeof (*dst)); - } else if (src->ro_rt != NULL) { - /* - * If the same, update just the ro_flags and ditch the one - * in the local copy. Else ditch the one that is currently - * cached, and cache what we got back from ip_output(). 
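ip_fwd_route_copyout()/copyin() now serialize on the ifnet's cached-route lock and defer to route_copyout()/route_copyin(), with copy-in gated by if_fwd_cacheok. A sketch of the same copy-out-with-reference and conditional copy-in shape, using a pthread mutex in place of the spin-to-full lck_mtx conversion; all names here are model stand-ins:

#include <pthread.h>
#include <string.h>
#include <stdio.h>

/* Toy stand-ins for struct route / rtentry reference counting. */
struct rt { int refcnt; };
struct route_model { struct rt *ro_rt; };

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static struct route_model if_fwd_route;    /* per-ifnet cached route */
static int if_fwd_cacheok = 1;             /* gate, as in copyin above */

/* Copy the cached route out; the copy holds its own rt reference. */
static void fwd_route_copyout(struct route_model *dst)
{
    pthread_mutex_lock(&cache_lock);
    memcpy(dst, &if_fwd_route, sizeof (*dst));
    if (dst->ro_rt != NULL)
        dst->ro_rt->refcnt++;       /* reference for the local copy */
    pthread_mutex_unlock(&cache_lock);
}

/* Copy a route back in, but only while caching is allowed. */
static void fwd_route_copyin(struct route_model *src)
{
    pthread_mutex_lock(&cache_lock);
    if (if_fwd_cacheok) {
        /* the cache takes over src's reference; src is emptied */
        if (if_fwd_route.ro_rt != NULL)
            if_fwd_route.ro_rt->refcnt--;
        if_fwd_route = *src;
        src->ro_rt = NULL;
    } else if (src->ro_rt != NULL) {
        src->ro_rt->refcnt--;       /* not cached: drop the reference */
    }
    pthread_mutex_unlock(&cache_lock);
}

int main(void)
{
    struct rt r = { 1 };
    struct route_model local = { &r }, out;
    fwd_route_copyin(&local);       /* cache now owns the reference */
    fwd_route_copyout(&out);        /* out holds a second reference */
    printf("refcnt %d\n", r.refcnt);
    return 0;
}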
- */ - if (dst->ro_rt == src->ro_rt) { - dst->ro_flags = src->ro_flags; - rtfree(src->ro_rt); - src->ro_rt = NULL; - } else { - rtfree(dst->ro_rt); - bcopy(src, dst, sizeof (*dst)); - } - } + if (ifp->if_fwd_cacheok) + route_copyin(src, dst, sizeof(*src)); - lck_mtx_unlock(ifp->if_fwd_route_lock); + lck_mtx_unlock(&ifp->if_cached_route_lock); } /* @@ -2311,7 +2331,7 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) n_long dest; struct in_addr pkt_dst; u_int32_t nextmtu = 0; - struct ip_out_args ipoa = { IFSCOPE_NONE }; + struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; struct ifnet *ifp = m->m_pkthdr.rcvif; #if PF struct pf_mtag *pf_mtag; @@ -2355,7 +2375,7 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) #if PF pf_mtag = pf_find_mtag(m); if (pf_mtag != NULL && pf_mtag->rtableid != IFSCOPE_NONE) - ipoa.ipoa_ifscope = pf_mtag->rtableid; + ipoa.ipoa_boundif = pf_mtag->rtableid; #endif /* PF */ ip_fwd_route_copyout(ifp, &fwd_rt); @@ -2372,7 +2392,7 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) sin->sin_len = sizeof (*sin); sin->sin_addr = pkt_dst; - rtalloc_scoped_ign(&fwd_rt, RTF_PRCLONING, ipoa.ipoa_ifscope); + rtalloc_scoped_ign(&fwd_rt, RTF_PRCLONING, ipoa.ipoa_boundif); if (fwd_rt.ro_rt == NULL) { icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, 0); goto done; @@ -2417,24 +2437,27 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) if (rt->rt_ifp == m->m_pkthdr.rcvif && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && satosin(rt_key(rt))->sin_addr.s_addr != 0 && - ipsendredirects && !srcrt) { -#define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) + ipsendredirects && !srcrt && rt->rt_ifa != NULL) { + struct in_ifaddr *ia = (struct in_ifaddr *)rt->rt_ifa; u_int32_t src = ntohl(ip->ip_src.s_addr); - if (RTA(rt) && - (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) { - if (rt->rt_flags & RTF_GATEWAY) - dest = satosin(rt->rt_gateway)->sin_addr.s_addr; - else - dest = pkt_dst.s_addr; - /* Router requirements says to only send host redirects */ - type = ICMP_REDIRECT; - code = ICMP_REDIRECT_HOST; + /* Become a regular mutex */ + RT_CONVERT_LOCK(rt); + IFA_LOCK_SPIN(&ia->ia_ifa); + if ((src & ia->ia_subnetmask) == ia->ia_subnet) { + if (rt->rt_flags & RTF_GATEWAY) + dest = satosin(rt->rt_gateway)->sin_addr.s_addr; + else + dest = pkt_dst.s_addr; + /* Router requirements says to only send host redirects */ + type = ICMP_REDIRECT; + code = ICMP_REDIRECT_HOST; #if DIAGNOSTIC - if (ipprintfs) - printf("redirect (%d) to %lx\n", code, (u_int32_t)dest); + if (ipprintfs) + printf("redirect (%d) to %lx\n", code, (u_int32_t)dest); #endif } + IFA_UNLOCK(&ia->ia_ifa); } RT_UNLOCK(rt); @@ -2444,9 +2467,9 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) struct m_tag *tag; struct ip_fwd_tag *ipfwd_tag; - tag = m_tag_alloc(KERNEL_MODULE_TAG_ID, + tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, - sizeof (*ipfwd_tag), M_NOWAIT); + sizeof (*ipfwd_tag), M_NOWAIT, m); if (tag == NULL) { error = ENOBUFS; m_freem(m); @@ -2473,9 +2496,6 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) OSAddAtomic(1, &ipstat.ips_redirectsent); else { if (mcopy) { -#if IPFLOW - ipflow_create(&fwd_rt, mcopy); -#endif /* * If we didn't have to go thru ipflow and * the packet was successfully consumed by @@ -2580,6 +2600,7 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) } sav = key_allocsa_policy(&saidx); if (sav != NULL) { + lck_mtx_lock(sadb_mutex); if 
(sav->sah != NULL) { ro = &sav->sah->sa_route; if (ro->ro_rt != NULL) { @@ -2591,7 +2612,8 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) RT_UNLOCK(ro->ro_rt); } } - key_freesav(sav, KEY_SADB_UNLOCKED); + key_freesav(sav, KEY_SADB_LOCKED); + lck_mtx_unlock(sadb_mutex); } } } @@ -2617,27 +2639,41 @@ done: ip_fwd_route_copyin(ifp, &fwd_rt); } -void +int ip_savecontrol( struct inpcb *inp, struct mbuf **mp, struct ip *ip, struct mbuf *m) { + *mp = NULL; if (inp->inp_socket->so_options & SO_TIMESTAMP) { struct timeval tv; microtime(&tv); - *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), - SCM_TIMESTAMP, SOL_SOCKET); - if (*mp) - mp = &(*mp)->m_next; + mp = sbcreatecontrol_mbuf((caddr_t) &tv, sizeof(tv), + SCM_TIMESTAMP, SOL_SOCKET, mp); + if (*mp == NULL) { + goto no_mbufs; + } } + if ((inp->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + uint64_t time; + + time = mach_absolute_time(); + mp = sbcreatecontrol_mbuf((caddr_t) &time, sizeof(time), + SCM_TIMESTAMP_MONOTONIC, SOL_SOCKET, mp); + + if (*mp == NULL) { + goto no_mbufs; + } + } if (inp->inp_flags & INP_RECVDSTADDR) { - *mp = sbcreatecontrol((caddr_t) &ip->ip_dst, - sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); - if (*mp) - mp = &(*mp)->m_next; + mp = sbcreatecontrol_mbuf((caddr_t) &ip->ip_dst, + sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP, mp); + if (*mp == NULL) { + goto no_mbufs; + } } #ifdef notyet /* XXX @@ -2646,17 +2682,19 @@ ip_savecontrol( */ /* options were tossed already */ if (inp->inp_flags & INP_RECVOPTS) { - *mp = sbcreatecontrol((caddr_t) opts_deleted_above, - sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); - if (*mp) - mp = &(*mp)->m_next; + mp = sbcreatecontrol_mbuf((caddr_t) opts_deleted_above, + sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP, mp); + if (*mp == NULL) { + goto no_mbufs; + } } /* ip_srcroute doesn't do what we want here, need to fix */ if (inp->inp_flags & INP_RECVRETOPTS) { - *mp = sbcreatecontrol((caddr_t) ip_srcroute(), - sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); - if (*mp) - mp = &(*mp)->m_next; + mp = sbcreatecontrol_mbuf((caddr_t) ip_srcroute(), + sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP, mp); + if (*mp == NULL) { + goto no_mbufs; + } } #endif if (inp->inp_flags & INP_RECVIF) { @@ -2669,24 +2707,27 @@ ip_savecontrol( struct sockaddr_dl *sdl2 = &sdlbuf.sdl; ifnet_head_lock_shared(); - if (((ifp = m->m_pkthdr.rcvif)) - && ( ifp->if_index && (ifp->if_index <= if_index))) { + if ((ifp = m->m_pkthdr.rcvif) != NULL && + ifp->if_index && (ifp->if_index <= if_index)) { struct ifaddr *ifa = ifnet_addrs[ifp->if_index - 1]; if (!ifa || !ifa->ifa_addr) goto makedummy; + IFA_LOCK_SPIN(ifa); sdp = (struct sockaddr_dl *)ifa->ifa_addr; /* * Change our mind and don't try copy. 
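ip_savecontrol() now returns an error and builds the control-mbuf chain through a tail pointer (sbcreatecontrol_mbuf()), bailing out to no_mbufs on the first allocation failure instead of silently producing a short chain. A generic model of that build-and-unwind shape; append_cmsg() and its list type are illustrative, and the exact sbcreatecontrol_mbuf() calling convention differs:

#include <stdlib.h>
#include <stdio.h>
#include <errno.h>

struct cmsg_node { struct cmsg_node *next; int type; };

/* Append one node at *tailp and advance the tail slot. */
static int append_cmsg(struct cmsg_node ***tailp, int type)
{
    struct cmsg_node *n = calloc(1, sizeof (*n));
    if (n == NULL)
        return ENOBUFS;
    n->type = type;
    **tailp = n;            /* link at the current tail */
    *tailp = &n->next;      /* next append goes after this node */
    return 0;
}

static int build_control(struct cmsg_node **head)
{
    struct cmsg_node **tail = head;
    *head = NULL;
    if (append_cmsg(&tail, 1) != 0)   /* e.g. a timestamp record */
        goto no_mbufs;
    if (append_cmsg(&tail, 2) != 0)   /* e.g. a dest-address record */
        goto no_mbufs;
    return 0;
no_mbufs:
    /* unwind the partial chain before reporting the failure */
    while (*head != NULL) {
        struct cmsg_node *n = *head;
        *head = n->next;
        free(n);
    }
    return ENOBUFS;
}

int main(void)
{
    struct cmsg_node *h;
    printf("%d\n", build_control(&h));  /* 0 on success */
    return 0;
}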
*/ - if ((sdp->sdl_family != AF_LINK) - || (sdp->sdl_len > sizeof(sdlbuf))) { + if ((sdp->sdl_family != AF_LINK) || + (sdp->sdl_len > sizeof(sdlbuf))) { + IFA_UNLOCK(ifa); goto makedummy; } bcopy(sdp, sdl2, sdp->sdl_len); + IFA_UNLOCK(ifa); } else { -makedummy: +makedummy: sdl2->sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]); sdl2->sdl_family = AF_LINK; @@ -2694,15 +2735,46 @@ makedummy: sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; } ifnet_head_done(); - *mp = sbcreatecontrol((caddr_t) sdl2, sdl2->sdl_len, - IP_RECVIF, IPPROTO_IP); - if (*mp) - mp = &(*mp)->m_next; + mp = sbcreatecontrol_mbuf((caddr_t) sdl2, sdl2->sdl_len, + IP_RECVIF, IPPROTO_IP, mp); + if (*mp == NULL) { + goto no_mbufs; + } } if (inp->inp_flags & INP_RECVTTL) { - *mp = sbcreatecontrol((caddr_t)&ip->ip_ttl, sizeof(ip->ip_ttl), IP_RECVTTL, IPPROTO_IP); - if (*mp) mp = &(*mp)->m_next; + mp = sbcreatecontrol_mbuf((caddr_t)&ip->ip_ttl, sizeof(ip->ip_ttl), + IP_RECVTTL, IPPROTO_IP, mp); + if (*mp == NULL) { + goto no_mbufs; + } + } + if ((inp->inp_socket->so_flags & SOF_RECV_TRAFFIC_CLASS) != 0) { + int tc = m->m_pkthdr.prio; + + mp = sbcreatecontrol_mbuf((caddr_t) &tc, sizeof(tc), + SO_TRAFFIC_CLASS, SOL_SOCKET, mp); + if (*mp == NULL) { + goto no_mbufs; + } + } + if (inp->inp_flags & INP_PKTINFO) { + struct in_pktinfo pi; + + bzero(&pi, sizeof(struct in_pktinfo)); + bcopy(&ip->ip_dst, &pi.ipi_addr, sizeof(struct in_addr)); + pi.ipi_ifindex = (m && m->m_pkthdr.rcvif) ? m->m_pkthdr.rcvif->if_index : 0; + + mp = sbcreatecontrol_mbuf((caddr_t)&pi, sizeof(struct in_pktinfo), + IP_RECVPKTINFO, IPPROTO_IP, mp); + if (*mp == NULL) { + goto no_mbufs; + } } + return 0; + +no_mbufs: + ipstat.ips_pktdropcntrl++; + return ENOBUFS; } int diff --git a/bsd/netinet/ip_mroute.c b/bsd/netinet/ip_mroute.c index e61d2ed64..f33537ef8 100644 --- a/bsd/netinet/ip_mroute.c +++ b/bsd/netinet/ip_mroute.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -131,12 +131,12 @@ int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *) = _ip_mforward; int -_mrt_ioctl(__unused int req, __unused caddr_t data, __unused struct proc *p) +_mrt_ioctl(__unused u_long req, __unused caddr_t data, __unused struct proc *p) { return EOPNOTSUPP; } -int (*mrt_ioctl)(int, caddr_t, struct proc *) = _mrt_ioctl; +int (*mrt_ioctl)(u_long, caddr_t, struct proc *) = _mrt_ioctl; void rsvp_input(struct mbuf *m, int iphlen) /* XXX must fixup manually */ @@ -293,7 +293,7 @@ static int X_ip_mrouter_done(void); static int X_ip_mrouter_get(struct socket *so, struct sockopt *m); static int X_ip_mrouter_set(struct socket *so, struct sockopt *m); static int X_legal_vif_num(int vif); -static int X_mrt_ioctl(int cmd, caddr_t data); +static int X_mrt_ioctl(u_long cmd, caddr_t data); static int get_sg_cnt(struct sioc_sg_req *); static int get_vif_cnt(struct sioc_vif_req *); @@ -493,7 +493,7 @@ int (*ip_mrouter_get)(struct socket *, struct sockopt *) = X_ip_mrouter_get; * Handle ioctl commands to obtain information from the cache */ static int -X_mrt_ioctl(int cmd, caddr_t data) +X_mrt_ioctl(u_long cmd, caddr_t data) { int error = 0; @@ -512,7 +512,7 @@ X_mrt_ioctl(int cmd, caddr_t data) } #if !defined(MROUTE_LKM) || !MROUTE_LKM -int (*mrt_ioctl)(int, caddr_t) = X_mrt_ioctl; +int (*mrt_ioctl)(u_long, caddr_t) = X_mrt_ioctl; #endif /* @@ -695,7 +695,7 @@ add_vif(struct vifctl *vifcp) ifa = ifa_ifwithaddr((struct sockaddr *)&sin); if (ifa == 0) return EADDRNOTAVAIL; ifp = ifa->ifa_ifp; - ifafree(ifa); + IFA_REMREF(ifa); ifa = NULL; if (vifcp->vifc_flags & VIFF_TUNNEL) { @@ -1099,7 +1099,10 @@ X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, return 1; } + if (imo != NULL) + IMO_LOCK(imo); if ((imo) && ((vifi = imo->imo_multicast_vif) < numvifs)) { + IMO_UNLOCK(imo); if (ip->ip_ttl < 255) ip->ip_ttl++; /* compensate for -1 in *_send routines */ if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) { @@ -1110,6 +1113,8 @@ X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, vifp->v_ifp->if_name, vifp->v_ifp->if_unit); } return (ip_mdq(m, ifp, NULL, vifi)); + } else if (imo != NULL) { + IMO_UNLOCK(imo); } if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) { printf("Warning: IPPROTO_RSVP from %x to %x without vif option\n", @@ -1807,7 +1812,6 @@ tbf_dq_sel(struct vif *vifp, struct ip *ip) static void tbf_send_packet(struct vif *vifp, struct mbuf *m) { - struct ip_moptions imo; int error; static struct route ro; @@ -1816,10 +1820,18 @@ tbf_send_packet(struct vif *vifp, struct mbuf *m) ip_output(m, (struct mbuf *)0, &vifp->v_route, IP_FORWARDING, (struct ip_moptions *)0, NULL); } else { - imo.imo_multicast_ifp = vifp->v_ifp; - imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; - imo.imo_multicast_loop = 1; - imo.imo_multicast_vif = -1; + struct ip_moptions *imo; + + imo = ip_allocmoptions(M_DONTWAIT); + if (imo == NULL) { + error = ENOMEM; + goto done; + } + + imo->imo_multicast_ifp = vifp->v_ifp; + imo->imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; + imo->imo_multicast_loop = 1; + imo->imo_multicast_vif = -1; /* * Re-entrancy should not be a problem here, because @@ -1828,8 +1840,10 @@ tbf_send_packet(struct vif *vifp, struct mbuf *m) * the loopback interface, thus preventing looping. 
*/ error = ip_output(m, (struct mbuf *)0, &ro, - IP_FORWARDING, &imo, NULL); + IP_FORWARDING, imo, NULL); + IMO_REMREF(imo); +done: if (mrtdebug & DEBUG_XMIT) log(LOG_DEBUG, "phyint_send on vif %d err %d\n", vifp - viftable, error); diff --git a/bsd/netinet/ip_mroute.h b/bsd/netinet/ip_mroute.h index 71c39440a..f234e20ab 100644 --- a/bsd/netinet/ip_mroute.h +++ b/bsd/netinet/ip_mroute.h @@ -298,9 +298,9 @@ extern int (*ip_mrouter_set)(struct socket *, struct sockopt *); extern int (*ip_mrouter_get)(struct socket *, struct sockopt *); extern int (*ip_mrouter_done)(void); #if MROUTING -extern int (*mrt_ioctl)(int, caddr_t); +extern int (*mrt_ioctl)(u_long, caddr_t); #else -extern int (*mrt_ioctl)(int, caddr_t, struct proc *); +extern int (*mrt_ioctl)(u_long, caddr_t, struct proc *); #endif #endif /* KERNEL_PRIVATE */ diff --git a/bsd/netinet/ip_output.c b/bsd/netinet/ip_output.c index 07d74f97f..57f522919 100644 --- a/bsd/netinet/ip_output.c +++ b/bsd/netinet/ip_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,12 +79,17 @@ #include <sys/socketvar.h> #include <kern/locks.h> #include <sys/sysctl.h> +#include <sys/mcache.h> #include <machine/endian.h> +#include <pexpert/pexpert.h> #include <net/if.h> #include <net/if_dl.h> +#include <net/if_types.h> #include <net/route.h> +#include <net/ntstat.h> +#include <net/net_osdep.h> #include <netinet/in.h> #include <netinet/in_systm.h> @@ -124,6 +129,7 @@ #include <netinet/ip_fw.h> #include <netinet/ip_divert.h> +#include <mach/sdt.h> #if DUMMYNET #include <netinet/ip_dummynet.h> @@ -144,20 +150,14 @@ u_short ip_id; static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *); -static struct ifnet *ip_multicast_if(struct in_addr *, int *); static void ip_mloopback(struct ifnet *, struct mbuf *, struct sockaddr_in *, int); -static int ip_getmoptions(struct sockopt *, struct ip_moptions *); static int ip_pcbopts(int, struct mbuf **, struct mbuf *); -static int ip_setmoptions(struct sockopt *, struct ip_moptions **); +static void imo_trace(struct ip_moptions *, int); static void ip_out_cksum_stats(int, u_int32_t); static struct ifaddr *in_selectsrcif(struct ip *, struct route *, unsigned int); -static void ip_bindif(struct inpcb *, unsigned int); -int ip_createmoptions(struct ip_moptions **imop); -int ip_addmembership(struct ip_moptions *imo, struct ip_mreq *mreq); -int ip_dropmembership(struct ip_moptions *imo, struct ip_mreq *mreq); int ip_optcopy(struct ip *, struct ip *); void in_delayed_cksum_offset(struct mbuf *, int ); void in_cksum_offset(struct mbuf* , size_t ); @@ -175,18 +175,50 @@ extern int ipsec_bypass; #endif static int ip_maxchainsent = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_maxchainsent, 0, "use dlil_output_list"); #if DEBUG static int forge_ce = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce, CTLFLAG_RW | CTLFLAG_LOCKED, &forge_ce, 0, "Forge ECN CE"); #endif /* DEBUG */ static int ip_select_srcif_debug = 0; -SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug, CTLFLAG_RW, +SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &ip_select_srcif_debug, 0, "log source interface selection debug info"); +#define IMO_TRACE_HIST_SIZE 32 /* size of trace history */ + +/* For gdb */ 
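tbf_send_packet() stops building struct ip_moptions on the stack: options objects are now shared and reference-counted, so it allocates one, fills it in, and drops its reference once ip_output() returns. A minimal lifecycle model; mo_alloc()/mo_remref() are stand-ins for ip_allocmoptions()/IMO_REMREF():

#include <stdlib.h>

/* Minimal refcounted options object; field names are illustrative. */
struct moptions { int refcnt; int ttl; int loop; };

static struct moptions *mo_alloc(void)
{
    struct moptions *mo = calloc(1, sizeof (*mo));
    if (mo != NULL)
        mo->refcnt = 1;          /* the caller's reference */
    return mo;
}

static void mo_remref(struct moptions *mo)
{
    if (--mo->refcnt == 0)
        free(mo);                /* last reference tears it down */
}

static int send_with_options(int ttl)
{
    struct moptions *mo = mo_alloc();
    if (mo == NULL)
        return -1;               /* ENOMEM path, as in tbf_send_packet */
    mo->ttl = ttl;
    mo->loop = 1;
    /* ... ip_output(m, ..., mo, ...) would run here ... */
    mo_remref(mo);               /* done with our reference */
    return 0;
}

int main(void) { return send_with_options(31); }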
+__private_extern__ unsigned int imo_trace_hist_size = IMO_TRACE_HIST_SIZE; + +struct ip_moptions_dbg { + struct ip_moptions imo; /* ip_moptions */ + u_int16_t imo_refhold_cnt; /* # of IMO_ADDREF */ + u_int16_t imo_refrele_cnt; /* # of IMO_REMREF */ + /* + * Alloc and free callers. + */ + ctrace_t imo_alloc; + ctrace_t imo_free; + /* + * Circular lists of IMO_ADDREF and IMO_REMREF callers. + */ + ctrace_t imo_refhold[IMO_TRACE_HIST_SIZE]; + ctrace_t imo_refrele[IMO_TRACE_HIST_SIZE]; +}; + +#if DEBUG +static unsigned int imo_debug = 1; /* debugging (enabled) */ +#else +static unsigned int imo_debug; /* debugging (disabled) */ +#endif /* !DEBUG */ +static unsigned int imo_size; /* size of zone element */ +static struct zone *imo_zone; /* zone for ip_moptions */ + +#define IMO_ZONE_MAX 64 /* maximum elements in zone */ +#define IMO_ZONE_NAME "ip_moptions" /* zone name */ + /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). @@ -244,11 +276,12 @@ ip_output_list( struct ifnet *ifp = NULL; struct mbuf *m = m0, **mppn = NULL; int hlen = sizeof (struct ip); - int len = 0, off, error = 0; + int len = 0, error = 0; struct sockaddr_in *dst = NULL; struct in_ifaddr *ia = NULL, *src_ia = NULL; int isbroadcast, sw_csum; struct in_addr pkt_dst; + struct ipf_pktopts *ippo = NULL, ipf_pktopts; #if IPSEC struct route iproute; struct socket *so = NULL; @@ -258,18 +291,24 @@ ip_output_list( int fwd_rewrite_src = 0; #endif #if IPFIREWALL + int off; struct ip_fw_args args; + struct m_tag *tag; + struct sockaddr_in *next_hop_from_ipfwd_tag = NULL; #endif int didfilter = 0; ipfilter_t inject_filter_ref = 0; - struct m_tag *tag; +#if DUMMYNET struct route saved_route; struct ip_out_args saved_ipoa; + struct sockaddr_in dst_buf; +#endif /* DUMMYNET */ struct mbuf * packetlist; int pktcnt = 0, tso = 0; + u_int32_t bytecnt = 0; unsigned int ifscope; + unsigned int nocell; boolean_t select_srcif; - KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); packetlist = m0; @@ -296,7 +335,8 @@ ip_output_list( ro = &saved_route; imo = NULL; - dst = dn_tag->dn_dst; + bcopy(&dn_tag->dn_dst, &dst_buf, sizeof(dst_buf)); + dst = &dst_buf; ifp = dn_tag->ifp; flags = dn_tag->flags; saved_ipoa = dn_tag->ipoa; @@ -323,8 +363,8 @@ ip_output_list( struct ip_fwd_tag *ipfwd_tag; ipfwd_tag = (struct ip_fwd_tag *)(tag+1); - args.next_hop = ipfwd_tag->next_hop; - + next_hop_from_ipfwd_tag = ipfwd_tag->next_hop; + m_tag_delete(m0, tag); } ipfw_tags_done: @@ -340,6 +380,9 @@ ipfw_tags_done: mtod(m, struct ip *)->ip_p); #endif + bzero(&ipf_pktopts, sizeof(struct ipf_pktopts)); + ippo = &ipf_pktopts; + /* * At present the IP_OUTARGS flag implies a request for IP to * perform source interface selection. 
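The debug variant above keeps circular histories of ADDREF/REMREF callers, indexed by an atomic counter taken modulo IMO_TRACE_HIST_SIZE so the most recent 32 events are always retained. The indexing trick in isolation; trace_rec stands in for ctrace_t and the GCC/Clang return-address builtin is just a convenient caller token:

#include <stdatomic.h>
#include <stdio.h>

#define TRACE_HIST_SIZE 32          /* matches IMO_TRACE_HIST_SIZE above */

struct trace_rec { void *caller; }; /* stand-in for ctrace_t */

static struct trace_rec refhold[TRACE_HIST_SIZE];
static atomic_ushort refhold_cnt;

/* Record a reference-hold event in the circular history. */
static void trace_refhold(void *caller)
{
    unsigned idx = atomic_fetch_add(&refhold_cnt, 1) % TRACE_HIST_SIZE;
    refhold[idx].caller = caller;   /* older entries are overwritten */
}

int main(void)
{
    for (int i = 0; i < 100; i++)
        trace_refhold(__builtin_return_address(0));
    printf("next slot: %u\n", (unsigned)(refhold_cnt % TRACE_HIST_SIZE));
    return 0;
}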
In the forwarding case, @@ -348,12 +391,22 @@ ipfw_tags_done: */ if (ip_doscopedroute && (flags & IP_OUTARGS)) { select_srcif = !(flags & IP_FORWARDING); - ifscope = ipoa->ipoa_ifscope; + ifscope = ipoa->ipoa_boundif; + ipf_pktopts.ippo_flags = IPPOF_BOUND_IF; + ipf_pktopts.ippo_flags |= (ifscope << IPPOF_SHIFT_IFSCOPE); } else { select_srcif = FALSE; ifscope = IFSCOPE_NONE; } + if (flags & IP_OUTARGS) { + nocell = ipoa->ipoa_nocell; + if (nocell) + ipf_pktopts.ippo_flags |= IPPOF_NO_IFT_CELLULAR; + } else { + nocell = 0; + } + #if IPFIREWALL if (args.rule != NULL) { /* dummynet already saw us */ ip = mtod(m, struct ip *); @@ -361,8 +414,11 @@ if (ro->ro_rt != NULL) { RT_LOCK_SPIN(ro->ro_rt); ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa; - if (ia) - ifaref(&ia->ia_ifa); + if (ia) { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro->ro_rt); + IFA_ADDREF(&ia->ia_ifa); + } RT_UNLOCK(ro->ro_rt); } #if IPSEC @@ -397,11 +453,29 @@ loopit: } ip = mtod(m, struct ip *); #if IPFIREWALL + /* + * rdar://8542331 + * + * When dealing with a packet chain, we need to reset "next_hop" because + * "dst" may have been changed to the gateway address below for the previous + * packet of the chain. This could cause the route to be inadvertently changed + * to the route to the gateway address (instead of the route to the destination). + */ + args.next_hop = next_hop_from_ipfwd_tag; pkt_dst = args.next_hop ? args.next_hop->sin_addr : ip->ip_dst; #else pkt_dst = ip->ip_dst; #endif + /* + * We must not send if the packet is destined to network zero. + * RFC1122 3.2.1.3 (a) and (b). + */ + if (IN_ZERONET(ntohl(pkt_dst.s_addr))) { + error = EHOSTUNREACH; + goto bad; + } + /* * Fill in IP header. */ @@ -450,7 +524,7 @@ loopit: error = EADDRNOTAVAIL; goto bad; } - ifafree(&src_ia->ia_ifa); + IFA_REMREF(&src_ia->ia_ifa); } /* * Test rt_flags without holding rt_lock for performance @@ -487,7 +561,7 @@ loopit: #define sintosa(sin) ((struct sockaddr *)(sin)) if (flags & IP_ROUTETOIF) { if (ia) - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0) { if ((ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) { OSAddAtomic(1, &ipstat.ips_noroute); @@ -499,15 +573,14 @@ loopit: ip->ip_ttl = 1; isbroadcast = in_broadcast(dst->sin_addr, ifp); } else if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) && - imo != NULL && imo->imo_multicast_ifp != NULL) { + imo != NULL && (ifp = imo->imo_multicast_ifp) != NULL) { /* * Bypass the normal routing lookup for multicast * packets if the interface is specified. */ - ifp = imo->imo_multicast_ifp; isbroadcast = 0; if (ia != NULL) - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); /* Macro takes reference on ia */ IFP_TO_IA(ifp, ia); @@ -530,6 +603,18 @@ loopit: /* Find the source interface */ ifa = in_selectsrcif(ip, ro, ifscope); + /* + * If the source address belongs to a cellular interface + * and the caller forbids our using interfaces of such + * type, pretend that there is no source address.
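Two independent fixes sit in the hunks above: args.next_hop is re-seeded from the saved tag for every packet of a chain, and destinations in network zero are now refused per RFC 1122 3.2.1.3 (a) and (b). The zero-network test in isolation; MODEL_ZERONET mirrors what the kernel's IN_ZERONET() checks:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>
#include <errno.h>

/* RFC 1122 3.2.1.3 (a)/(b): network number 0 is never a valid destination. */
#define MODEL_ZERONET(i) (((uint32_t)(i) & 0xff000000) == 0)

static int check_dst(const char *dotted)
{
    struct in_addr a;
    if (inet_aton(dotted, &a) == 0)
        return EINVAL;
    if (MODEL_ZERONET(ntohl(a.s_addr)))
        return EHOSTUNREACH;     /* same errno ip_output now returns */
    return 0;
}

int main(void)
{
    printf("0.1.2.3 -> %d\n", check_dst("0.1.2.3"));     /* EHOSTUNREACH */
    printf("10.0.0.1 -> %d\n", check_dst("10.0.0.1"));   /* 0 */
    return 0;
}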
+ */ + if (nocell && ifa != NULL && + ifa->ifa_ifp->if_type == IFT_CELLULAR) { + IFA_REMREF(ifa); + error = EADDRNOTAVAIL; + goto bad; + } + /* * If the source address is spoofed (in the case * of IP_RAWOUTPUT), or if this is destined for @@ -560,7 +645,7 @@ loopit: if (ifa != NULL) { if (ifscope == IFSCOPE_NONE) ifscope = ifa->ifa_ifp->if_index; - ifafree(ifa); + IFA_REMREF(ifa); cloneok = (!(flags & IP_RAWOUTPUT) && !(IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)))); } @@ -611,6 +696,23 @@ loopit: rtalloc_ign(ro, ign); else rtalloc_scoped_ign(ro, ign, ifscope); + + /* + * If the route points to a cellular interface and the + * caller forbids our using interfaces of such type, + * pretend that there is no route. + */ + if (nocell && ro->ro_rt != NULL) { + RT_LOCK_SPIN(ro->ro_rt); + if (ro->ro_rt->rt_ifp->if_type == + IFT_CELLULAR) { + RT_UNLOCK(ro->ro_rt); + rtfree(ro->ro_rt); + ro->ro_rt = NULL; + } else { + RT_UNLOCK(ro->ro_rt); + } + } } if (ro->ro_rt == NULL) { @@ -620,11 +722,14 @@ loopit: } if (ia) - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); RT_LOCK_SPIN(ro->ro_rt); ia = ifatoia(ro->ro_rt->rt_ifa); - if (ia) - ifaref(&ia->ia_ifa); + if (ia) { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro->ro_rt); + IFA_ADDREF(&ia->ia_ifa); + } ifp = ro->ro_rt->rt_ifp; ro->ro_rt->rt_use++; if (ro->ro_rt->rt_flags & RTF_GATEWAY) @@ -641,6 +746,9 @@ loopit: if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) { struct in_multi *inm; + u_int32_t vif; + u_int8_t ttl = IP_DEFAULT_MULTICAST_TTL; + u_int8_t loop = IP_DEFAULT_MULTICAST_LOOP; m->m_flags |= M_MCAST; /* @@ -653,22 +761,28 @@ loopit: * See if the caller provided any multicast options */ if (imo != NULL) { - if ((flags & IP_RAWOUTPUT) == 0) ip->ip_ttl = imo->imo_multicast_ttl; - if (imo->imo_multicast_ifp != NULL) { + IMO_LOCK(imo); + vif = imo->imo_multicast_vif; + ttl = imo->imo_multicast_ttl; + loop = imo->imo_multicast_loop; + if ((flags & IP_RAWOUTPUT) == 0) + ip->ip_ttl = ttl; + if (imo->imo_multicast_ifp != NULL) ifp = imo->imo_multicast_ifp; - } + IMO_UNLOCK(imo); #if MROUTING - if (imo->imo_multicast_vif != -1 && - ((flags & IP_RAWOUTPUT) == 0 || ip->ip_src.s_addr == INADDR_ANY)) - ip->ip_src.s_addr = - ip_mcast_src(imo->imo_multicast_vif); + if (vif != -1 && ((flags & IP_RAWOUTPUT) == 0 || + ip->ip_src.s_addr == INADDR_ANY)) + ip->ip_src.s_addr = ip_mcast_src(vif); #endif /* MROUTING */ - } else - if ((flags & IP_RAWOUTPUT) == 0) ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; + } else if ((flags & IP_RAWOUTPUT) == 0) { + vif = -1; + ip->ip_ttl = ttl; + } /* * Confirm that the outgoing interface supports multicast. 
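With the new nocell flag (set via IP_NO_IFT_CELLULAR, later in this diff), output pretends that no usable source address or route exists whenever the candidate sits on an IFT_CELLULAR interface. A compact model of the source-address half of that policy; the *_model types and enum values are stand-ins:

#include <stdio.h>
#include <errno.h>

enum if_type_model { IFT_ETHER_MODEL, IFT_CELLULAR_MODEL };

struct ifnet_model  { enum if_type_model if_type; };
struct ifaddr_model { struct ifnet_model *ifa_ifp; int refcnt; };

/* Reject a candidate source address on a forbidden interface type. */
static int select_src(struct ifaddr_model *ifa, int nocell)
{
    if (nocell && ifa != NULL &&
        ifa->ifa_ifp->if_type == IFT_CELLULAR_MODEL) {
        ifa->refcnt--;           /* drop the lookup's reference */
        return EADDRNOTAVAIL;    /* pretend no source address exists */
    }
    return 0;
}

int main(void)
{
    struct ifnet_model cell = { IFT_CELLULAR_MODEL };
    struct ifaddr_model ifa = { &cell, 1 };
    printf("%d\n", select_src(&ifa, 1));  /* EADDRNOTAVAIL */
    return 0;
}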
*/ - if ((imo == NULL) || (imo->imo_multicast_vif == -1)) { + if (imo == NULL || vif == -1) { if ((ifp->if_flags & IFF_MULTICAST) == 0) { OSAddAtomic(1, &ipstat.ips_noroute); error = ENETUNREACH; @@ -682,11 +796,15 @@ loopit: if (ip->ip_src.s_addr == INADDR_ANY) { struct in_ifaddr *ia1; lck_rw_lock_shared(in_ifaddr_rwlock); - TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) + TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) { + IFA_LOCK_SPIN(&ia1->ia_ifa); if (ia1->ia_ifp == ifp) { ip->ip_src = IA_SIN(ia1)->sin_addr; + IFA_UNLOCK(&ia1->ia_ifa); break; } + IFA_UNLOCK(&ia1->ia_ifa); + } lck_rw_done(in_ifaddr_rwlock); if (ip->ip_src.s_addr == INADDR_ANY) { error = ENETUNREACH; @@ -694,11 +812,10 @@ loopit: } } - ifnet_lock_shared(ifp); - IN_LOOKUP_MULTI(pkt_dst, ifp, inm); - ifnet_lock_done(ifp); - if (inm != NULL && - (imo == NULL || imo->imo_multicast_loop)) { + in_multihead_lock_shared(); + IN_LOOKUP_MULTI(&pkt_dst, ifp, inm); + in_multihead_lock_done(); + if (inm != NULL && (imo == NULL || loop)) { /* * If we belong to the destination multicast group * on the outgoing interface, and the caller did not @@ -707,17 +824,16 @@ loopit: if (!TAILQ_EMPTY(&ipv4_filters)) { struct ipfilter *filter; int seen = (inject_filter_ref == 0); - struct ipf_pktopts *ippo = 0, ipf_pktopts; - if (imo) { - ippo = &ipf_pktopts; - ipf_pktopts.ippo_mcast_ifnet = imo->imo_multicast_ifp; - ipf_pktopts.ippo_mcast_ttl = imo->imo_multicast_ttl; - ipf_pktopts.ippo_mcast_loop = imo->imo_multicast_loop; + if (imo != NULL) { + ipf_pktopts.ippo_flags |= IPPOF_MCAST_OPTS; + ipf_pktopts.ippo_mcast_ifnet = ifp; + ipf_pktopts.ippo_mcast_ttl = ttl; + ipf_pktopts.ippo_mcast_loop = loop; } - + ipf_ref(); - + /* 4135317 - always pass network byte order to filter */ #if BYTE_ORDER != BIG_ENDIAN @@ -734,15 +850,17 @@ loopit: result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo); if (result == EJUSTRETURN) { ipf_unref(); + INM_REMREF(inm); goto done; } if (result != 0) { ipf_unref(); + INM_REMREF(inm); goto bad; } } } - + /* set back to host byte order */ ip = mtod(m, struct ip *); @@ -778,15 +896,18 @@ loopit: * as prescribed by rsvpd. */ if (!rsvp_on) - imo = NULL; + imo = NULL; if (ip_mforward(ip, ifp, m, imo) != 0) { m_freem(m); + if (inm != NULL) + INM_REMREF(inm); goto done; } } } #endif /* MROUTING */ - + if (inm != NULL) + INM_REMREF(inm); /* * Multicasts with a time-to-live of zero may be looped- * back, above, but must not be transmitted on a network. @@ -808,7 +929,9 @@ loopit: * of outgoing interface. 
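Because the options object is now shared between threads, ip_output_list() snapshots imo_multicast_vif/ttl/loop into locals under IMO_LOCK and uses only those locals afterwards, keeping the lock hold time minimal. The pattern in miniature, with a pthread mutex standing in for IMO_LOCK and the defaults matching the TTL/loop values in the hunk above:

#include <pthread.h>
#include <stdio.h>
#include <stdint.h>

struct mcast_opts {
    pthread_mutex_t lock;
    uint32_t vif;
    uint8_t  ttl, loop;
};

/* Snapshot shared option fields under the lock, then use the locals. */
static void send_multicast(struct mcast_opts *imo)
{
    uint32_t vif = (uint32_t)-1;
    uint8_t ttl = 1, loop = 1;       /* protocol defaults */

    if (imo != NULL) {
        pthread_mutex_lock(&imo->lock);
        vif = imo->vif;
        ttl = imo->ttl;
        loop = imo->loop;
        pthread_mutex_unlock(&imo->lock);
    }
    /* ... the rest of output uses only the stable locals ... */
    printf("vif=%d ttl=%u loop=%u\n", (int)vif, ttl, loop);
}

int main(void)
{
    struct mcast_opts o = { PTHREAD_MUTEX_INITIALIZER, 2, 64, 0 };
    send_multicast(&o);
    return 0;
}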
*/ if (ip->ip_src.s_addr == INADDR_ANY) { + IFA_LOCK_SPIN(&ia->ia_ifa); ip->ip_src = IA_SIN(ia)->sin_addr; + IFA_UNLOCK(&ia->ia_ifa); #if IPFIREWALL_FORWARD /* Keep note that we did this - if the firewall changes * the next-hop, our interface may change, changing the @@ -847,26 +970,30 @@ loopit: sendit: #if PF /* Invoke outbound packet filter */ - if (pf_af_hook(ifp, mppn, &m, AF_INET, FALSE) != 0) { - if (packetlist == m0) { - packetlist = m; - mppn = NULL; - } - if (m != NULL) { - m0 = m; - /* Next packet in the chain */ - goto loopit; - } else if (packetlist != NULL) { - /* No more packet; send down the chain */ - goto sendchain; + if ( PF_IS_ENABLED) { + int rc; + rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE); + if (rc != 0) { + if (packetlist == m0) { + packetlist = m; + mppn = NULL; + } + if (m != NULL) { + m0 = m; + /* Next packet in the chain */ + goto loopit; + } else if (packetlist != NULL) { + /* No more packet; send down the chain */ + goto sendchain; + } + /* Nothing left; we're done */ + goto done; } - /* Nothing left; we're done */ - goto done; + m0 = m; + ip = mtod(m, struct ip *); + pkt_dst = ip->ip_dst; + hlen = IP_VHL_HL(ip->ip_vhl) << 2; } - m0 = m; - ip = mtod(m, struct ip *); - pkt_dst = ip->ip_dst; - hlen = IP_VHL_HL(ip->ip_vhl) << 2; #endif /* PF */ /* * Force IP TTL to 255 following draft-ietf-zeroconf-ipv4-linklocal.txt @@ -882,7 +1009,8 @@ sendit: if (!didfilter && !TAILQ_EMPTY(&ipv4_filters)) { struct ipfilter *filter; int seen = (inject_filter_ref == 0); - + ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS; + /* Check that a TSO frame isn't passed to a filter. * This could happen if a filter is inserted while * TCP is sending the TSO packet. @@ -907,7 +1035,7 @@ sendit: seen = 1; } else if (filter->ipf_filter.ipf_output) { errno_t result; - result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, 0); + result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo); if (result == EJUSTRETURN) { ipf_unref(); goto done; @@ -1011,6 +1139,10 @@ sendit: HTONS(ip->ip_off); #endif + DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL, + struct ip *, ip, struct ifnet *, ifp, + struct ip *, ip, struct ip6_hdr *, NULL); + error = ipsec4_output(&state, sp, flags); m0 = m = state.m; @@ -1071,7 +1203,7 @@ sendit: rtfree(ro->ro_rt); ro->ro_rt = NULL; if (src_ia != NULL) - ifafree(&src_ia->ia_ifa); + IFA_REMREF(&src_ia->ia_ifa); } if (ro->ro_rt == NULL) { @@ -1085,11 +1217,14 @@ sendit: } } else { if (ia) - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); RT_LOCK_SPIN(ro->ro_rt); ia = ifatoia(ro->ro_rt->rt_ifa); - if (ia) - ifaref(&ia->ia_ifa); + if (ia) { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro->ro_rt); + IFA_ADDREF(&ia->ia_ifa); + } ifp = ro->ro_rt->rt_ifp; RT_UNLOCK(ro->ro_rt); } @@ -1107,6 +1242,8 @@ sendit: if (!TAILQ_EMPTY(&ipv4_filters)) { struct ipfilter *filter; + ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS; + /* Check that a TSO frame isn't passed to a filter. * This could happen if a filter is inserted while * TCP is sending the TSO packet. @@ -1128,7 +1265,7 @@ sendit: TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) { if (filter->ipf_filter.ipf_output) { errno_t result; - result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, 0); + result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo); if (result == EJUSTRETURN) { ipf_unref(); goto done; @@ -1310,31 +1447,35 @@ skip_ipsec: * of ours, we pretend to * be the destination for this packet. 
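The PF output hook can consume the current packet, so the PF_IS_ENABLED block above repairs the chain through mppn/packetlist before carrying on. A loose userland analogue of that unlink-or-advance walk; here the filter simply frees what it rejects, and surviving packets of the chain are sent together at the end:

#include <stdio.h>
#include <stdlib.h>

struct pkt { struct pkt *next; int id; };

/* Filter model: consumes (frees) even-numbered packets. */
static struct pkt *filter(struct pkt *p)
{
    if (p->id % 2 == 0) { free(p); return NULL; }
    return p;
}

/* Walk a chain; survivors are relinked and "sent" together. */
static void output_chain(struct pkt *head)
{
    struct pkt **mppn = &head, *m = head;
    while (m != NULL) {
        struct pkt *next = m->next;   /* save before filter may free m */
        struct pkt *kept = filter(m);
        if (kept == NULL)
            *mppn = next;             /* unlink the consumed packet */
        else
            mppn = &kept->next;       /* advance past the survivor */
        m = next;
    }
    for (m = head; m != NULL; m = m->next)
        printf("send %d\n", m->id);   /* prints 1 and 3 */
}

int main(void)
{
    struct pkt *head = NULL, **tail = &head;
    for (int i = 1; i <= 4; i++) {
        struct pkt *p = calloc(1, sizeof (*p));
        p->id = i;
        *tail = p;
        tail = &p->next;
    }
    output_chain(head);
    /* leaking the survivors is fine for a demo */
    return 0;
}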
*/ + IFA_LOCK_SPIN(&ia_fw->ia_ifa); if (IA_SIN(ia_fw)->sin_addr.s_addr == - dst->sin_addr.s_addr) + dst->sin_addr.s_addr) { + IFA_UNLOCK(&ia_fw->ia_ifa); break; + } + IFA_UNLOCK(&ia_fw->ia_ifa); } lck_rw_done(in_ifaddr_rwlock); if (ia_fw) { /* tell ip_input "dont filter" */ struct m_tag *fwd_tag; struct ip_fwd_tag *ipfwd_tag; - - fwd_tag = m_tag_alloc(KERNEL_MODULE_TAG_ID, + + fwd_tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, - sizeof (*ipfwd_tag), M_NOWAIT); + sizeof (*ipfwd_tag), M_NOWAIT, m); if (fwd_tag == NULL) { error = ENOBUFS; goto bad; } - + ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1); ipfwd_tag->next_hop = args.next_hop; m_tag_prepend(m, fwd_tag); if (m->m_pkthdr.rcvif == NULL) - m->m_pkthdr.rcvif = ifunit("lo0"); + m->m_pkthdr.rcvif = lo_ifp; if ((~IF_HWASSIST_CSUM_FLAGS(m->m_pkthdr.rcvif->if_hwassist) & m->m_pkthdr.csum_flags) == 0) { if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { @@ -1387,8 +1528,11 @@ skip_ipsec: RT_LOCK_SPIN(ro_fwd->ro_rt); ia_fw = ifatoia(ro_fwd->ro_rt->rt_ifa); - if (ia_fw != NULL) - ifaref(&ia_fw->ia_ifa); + if (ia_fw != NULL) { + /* Become a regular mutex */ + RT_CONVERT_LOCK(ro_fwd->ro_rt); + IFA_ADDREF(&ia_fw->ia_ifa); + } ifp = ro_fwd->ro_rt->rt_ifp; ro_fwd->ro_rt->rt_use++; if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY) @@ -1412,9 +1556,12 @@ skip_ipsec: * interface, do it again, from the new one. */ if (ia_fw != NULL) { - if (fwd_rewrite_src) + if (fwd_rewrite_src) { + IFA_LOCK_SPIN(&ia_fw->ia_ifa); ip->ip_src = IA_SIN(ia_fw)->sin_addr; - ifafree(&ia_fw->ia_ifa); + IFA_UNLOCK(&ia_fw->ia_ifa); + } + IFA_REMREF(&ia_fw->ia_ifa); } goto pass ; } @@ -1427,9 +1574,9 @@ skip_ipsec: error = EACCES; /* not sure this is the right error msg */ goto done; } -#endif /* IPFIREWALL */ pass: +#endif /* IPFIREWALL */ #if __APPLE__ /* Do not allow loopback address to wind up on a wire */ if ((ifp->if_flags & IFF_LOOPBACK) == 0 && @@ -1526,11 +1673,14 @@ pass: ipsec_delaux(m); #endif if (packetchain == 0) { + if (ro->ro_rt && nstat_collect) + nstat_route_tx(ro->ro_rt, 1, m->m_pkthdr.len, 0); error = ifnet_output(ifp, PF_INET, m, ro->ro_rt, (struct sockaddr *)dst); goto done; } else { /* packet chaining allows us to reuse the route for all packets */ + bytecnt += m->m_pkthdr.len; mppn = &m->m_nextpkt; m = m->m_nextpkt; if (m == NULL) { @@ -1539,10 +1689,13 @@ sendchain: #endif /* PF */ if (pktcnt > ip_maxchainsent) ip_maxchainsent = pktcnt; + if (ro->ro_rt && nstat_collect) + nstat_route_tx(ro->ro_rt, pktcnt, bytecnt, 0); //send error = ifnet_output(ifp, PF_INET, packetlist, ro->ro_rt, (struct sockaddr *)dst); pktcnt = 0; + bytecnt = 0; goto done; } @@ -1556,23 +1709,28 @@ sendchain: * Must be able to put at least 8 bytes per fragment. */ - if (ip->ip_off & IP_DF || (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4)) { + if (ip->ip_off & IP_DF || (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) || + pktcnt > 0) { error = EMSGSIZE; /* * This case can happen if the user changed the MTU - * * of an interface after enabling IP on it. Because * most netifs don't keep track of routes pointing to * them, there is no way for one to update all its * routes when the MTU is changed. 
*/ - RT_LOCK_SPIN(ro->ro_rt); - if (ro->ro_rt && (ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) - && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) - && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { - ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; + if (ro->ro_rt) { + RT_LOCK_SPIN(ro->ro_rt); + if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) + && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) + && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { + ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; + } + RT_UNLOCK(ro->ro_rt); + } + if (pktcnt > 0) { + m0 = packetlist; } - RT_UNLOCK(ro->ro_rt); OSAddAtomic(1, &ipstat.ips_cantfrag); goto bad; } @@ -1604,6 +1762,8 @@ sendchain: #endif if ((packetchain != 0) && (pktcnt > 0)) panic("ip_output: mix of packet in packetlist is wrong=%p", packetlist); + if (ro->ro_rt && nstat_collect) + nstat_route_tx(ro->ro_rt, 1, m->m_pkthdr.len, 0); error = ifnet_output(ifp, PF_INET, m, ro->ro_rt, (struct sockaddr *)dst); } else @@ -1615,7 +1775,7 @@ sendchain: done: if (ia) { - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); ia = NULL; } #if IPSEC @@ -1781,8 +1941,11 @@ in_delayed_cksum_offset(struct mbuf *m0, int ip_offset) struct ip *ip; unsigned char buf[sizeof(struct ip)]; u_short csum, offset, ip_len; - struct mbuf *m = m0; - + + /* Save copy of first mbuf pointer and the ip_offset before modifying */ + struct mbuf *m = m0; + int ip_offset_copy = ip_offset; + while (ip_offset >= m->m_len) { ip_offset -= m->m_len; m = m->m_next; @@ -1823,12 +1986,12 @@ in_delayed_cksum_offset(struct mbuf *m0, int ip_offset) * is bogus and we give up. */ ip_len = ip->ip_len; - if (ip_len != (m0->m_pkthdr.len - ip_offset)) { + if (ip_len != (m0->m_pkthdr.len - ip_offset_copy)) { ip_len = SWAP16(ip_len); - if (ip_len != (m0->m_pkthdr.len - ip_offset)) { + if (ip_len != (m0->m_pkthdr.len - ip_offset_copy)) { printf("in_delayed_cksum_offset: ip_len %d (%d) " "doesn't match actual length %d\n", ip->ip_len, - ip_len, (m0->m_pkthdr.len - ip_offset)); + ip_len, (m0->m_pkthdr.len - ip_offset_copy)); return; } } @@ -1880,6 +2043,10 @@ in_cksum_offset(struct mbuf* m, size_t ip_offset) int hlen = 0; unsigned char buf[sizeof(struct ip)]; int swapped = 0; + + /* Save copy of first mbuf pointer and the ip_offset before modifying */ + struct mbuf* m0 = m; + size_t ip_offset_copy = ip_offset; while (ip_offset >= m->m_len) { ip_offset -= m->m_len; @@ -1927,15 +2094,15 @@ in_cksum_offset(struct mbuf* m, size_t ip_offset) * the length and check again. If it still fails, then the packet * is bogus and we give up. 
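Both checksum helpers below had the same latent bug: the loop that walks to the mbuf holding the IP header advances m and ip_offset, yet the later length sanity check reused those mutated cursors. The fix snapshots the chain head and the original offset first, as this sketch shows; mbuf_model is a stand-in:

#include <stdio.h>
#include <stddef.h>

struct mbuf_model { struct mbuf_model *next; int len; int pkthdr_len; };

/* Find the buffer containing 'off', keeping the originals for later. */
static int ip_len_matches(struct mbuf_model *m0, size_t off, int ip_len)
{
    struct mbuf_model *m = m0;
    size_t ip_offset_copy = off;      /* snapshot before the walk */

    while (off >= (size_t)m->len) {   /* this loop mutates m and off */
        off -= m->len;
        m = m->next;
    }
    /* the check must use the saved head and offset, not the cursors */
    return ip_len == m0->pkthdr_len - (int)ip_offset_copy;
}

int main(void)
{
    struct mbuf_model b = { NULL, 40, 0 };
    struct mbuf_model a = { &b, 14, 54 };   /* pkthdr_len covers both */
    printf("%d\n", ip_len_matches(&a, 14, 40));   /* 1: lengths agree */
    return 0;
}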
*/ - if (ntohs(ip->ip_len) != (m->m_pkthdr.len - ip_offset)) { + if (ntohs(ip->ip_len) != (m0->m_pkthdr.len - ip_offset_copy)) { ip->ip_len = SWAP16(ip->ip_len); swapped = 1; - if (ntohs(ip->ip_len) != (m->m_pkthdr.len - ip_offset)) { + if (ntohs(ip->ip_len) != (m0->m_pkthdr.len - ip_offset_copy)) { ip->ip_len = SWAP16(ip->ip_len); printf("in_cksum_offset: ip_len %d (%d) " "doesn't match actual length %lu\n", ip->ip_len, SWAP16(ip->ip_len), - (m->m_pkthdr.len - ip_offset)); + (m0->m_pkthdr.len - ip_offset_copy)); return; } } @@ -2120,6 +2287,7 @@ ip_ctloutput(so, sopt) #if defined(NFAITH) && NFAITH > 0 case IP_FAITH: #endif + case IP_RECVPKTINFO: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) @@ -2164,6 +2332,9 @@ ip_ctloutput(so, sopt) OPTSET(INP_FAITH); break; #endif + case IP_RECVPKTINFO: + OPTSET(INP_PKTINFO); + break; } break; #undef OPTSET @@ -2200,14 +2371,14 @@ ip_ctloutput(so, sopt) break; } - if (sopt->sopt_valsize == 0 || ifname[0] == NULL) { + if (sopt->sopt_valsize == 0 || ifname[0] == '\0') { /* Unbind this socket from any interface */ ifscope = IFSCOPE_NONE; } else { ifnet_t ifp; /* Verify name is NULL terminated */ - if (ifname[sopt->sopt_valsize - 1] != NULL) { + if (ifname[sopt->sopt_valsize - 1] != '\0') { error = EINVAL; break; } @@ -2227,17 +2398,33 @@ ip_ctloutput(so, sopt) */ ifnet_release(ifp); } - ip_bindif(inp, ifscope); + inp_bindif(inp, ifscope); } break; #endif + /* + * Multicast socket options are processed by the in_mcast + * module. + */ case IP_MULTICAST_IF: + case IP_MULTICAST_IFINDEX: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: case IP_ADD_MEMBERSHIP: case IP_DROP_MEMBERSHIP: - error = ip_setmoptions(sopt, &inp->inp_moptions); + case IP_ADD_SOURCE_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + case IP_BLOCK_SOURCE: + case IP_UNBLOCK_SOURCE: + case IP_MSFILTER: + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + error = inp_setmoptions(inp, sopt); break; case IP_PORTRANGE: @@ -2277,10 +2464,6 @@ ip_ctloutput(so, sopt) struct mbuf *m; int optname; - if (sopt->sopt_valsize > MCLBYTES) { - error = EMSGSIZE; - break; - } if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ @@ -2306,13 +2489,11 @@ ip_ctloutput(so, sopt) break; if (background) { - socket_set_traffic_mgt_flags(so, - TRAFFIC_MGT_SO_BACKGROUND | - TRAFFIC_MGT_SO_BG_REGULATE); + socket_set_traffic_mgt_flags_locked(so, + TRAFFIC_MGT_SO_BACKGROUND); } else { - socket_clear_traffic_mgt_flags(so, - TRAFFIC_MGT_SO_BACKGROUND | - TRAFFIC_MGT_SO_BG_REGULATE); + socket_clear_traffic_mgt_flags_locked(so, + TRAFFIC_MGT_SO_BACKGROUND); } break; @@ -2331,11 +2512,11 @@ ip_ctloutput(so, sopt) * on the destination address type (e.g. unicast, multicast, * or broadcast if applicable) or whether or not the host is * directly reachable. Note that in the multicast transmit - * case, IP_MULTICAST_IF takes precedence over IP_BOUND_IF, - * since the former practically bypasses the routing table; - * in this case, IP_BOUND_IF sets the default interface used - * for sending multicast packets in the absence of an explicit - * transmit interface set via IP_MULTICAST_IF. 
+ * case, IP_MULTICAST_{IF,IFINDEX} takes precedence over + * IP_BOUND_IF, since the former practically bypasses the + * routing table; in this case, IP_BOUND_IF sets the default + * interface used for sending multicast packets in the absence + * of an explicit multicast transmit interface. */ case IP_BOUND_IF: /* This option is settable only for IPv4 */ @@ -2350,7 +2531,28 @@ ip_ctloutput(so, sopt) if (error) break; - ip_bindif(inp, optval); + inp_bindif(inp, optval); + break; + + case IP_NO_IFT_CELLULAR: + /* This option is settable only for IPv4 */ + if (!(inp->inp_vflag & INP_IPV4)) { + error = EINVAL; + break; + } + + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + + if (error) + break; + + error = inp_nocellular(inp, optval); + break; + + case IP_OUT_IF: + /* This option is not settable */ + error = EINVAL; break; default: @@ -2383,6 +2585,7 @@ ip_ctloutput(so, sopt) #if defined(NFAITH) && NFAITH > 0 case IP_FAITH: #endif + case IP_RECVPKTINFO: switch (sopt->sopt_name) { case IP_TOS: @@ -2429,17 +2632,20 @@ ip_ctloutput(so, sopt) optval = OPTBIT(INP_FAITH); break; #endif + case IP_RECVPKTINFO: + optval = OPTBIT(INP_PKTINFO); + break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; case IP_MULTICAST_IF: + case IP_MULTICAST_IFINDEX: case IP_MULTICAST_VIF: case IP_MULTICAST_TTL: case IP_MULTICAST_LOOP: - case IP_ADD_MEMBERSHIP: - case IP_DROP_MEMBERSHIP: - error = ip_getmoptions(sopt, inp->inp_moptions); + case IP_MSFILTER: + error = inp_getmoptions(inp, sopt); break; #if IPSEC @@ -2465,7 +2671,7 @@ ip_ctloutput(so, sopt) #if TRAFFIC_MGT case IP_TRAFFIC_MGT_BACKGROUND: { - unsigned background = so->so_traffic_mgt_flags; + unsigned background = (so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BACKGROUND); return (sooptcopyout(sopt, &background, sizeof(background))); break; } @@ -2477,6 +2683,16 @@ ip_ctloutput(so, sopt) error = sooptcopyout(sopt, &optval, sizeof (optval)); break; + case IP_NO_IFT_CELLULAR: + optval = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; + error = sooptcopyout(sopt, &optval, sizeof (optval)); + break; + + case IP_OUT_IF: + optval = inp->inp_last_outif; + error = sooptcopyout(sopt, &optval, sizeof (optval)); + break; + default: error = ENOPROTOOPT; break; @@ -2591,471 +2807,138 @@ bad: return (EINVAL); } -/* - * XXX - * The whole multicast option thing needs to be re-thought. - * Several of these options are equally applicable to non-multicast - * transmission, and one (IP_MULTICAST_TTL) totally duplicates a - * standard option (IP_TTL). - */ - -/* - * following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index. - */ -static struct ifnet * -ip_multicast_if(a, ifindexp) - struct in_addr *a; - int *ifindexp; +void +ip_moptions_init(void) { - int ifindex; - struct ifnet *ifp; + PE_parse_boot_argn("ifa_debug", &imo_debug, sizeof (imo_debug)); - if (ifindexp) - *ifindexp = 0; - if (ntohl(a->s_addr) >> 24 == 0) { - ifindex = ntohl(a->s_addr) & 0xffffff; - ifnet_head_lock_shared(); - if (ifindex < 0 || if_index < ifindex) { - ifnet_head_done(); - return NULL; - } - ifp = ifindex2ifnet[ifindex]; - ifnet_head_done(); - if (ifindexp) - *ifindexp = ifindex; - } else { - INADDR_TO_IFP(*a, ifp); + imo_size = (imo_debug == 0) ? 
sizeof (struct ip_moptions) : + sizeof (struct ip_moptions_dbg); + + imo_zone = zinit(imo_size, IMO_ZONE_MAX * imo_size, 0, + IMO_ZONE_NAME); + if (imo_zone == NULL) { + panic("%s: failed allocating %s", __func__, IMO_ZONE_NAME); + /* NOTREACHED */ } - return ifp; + zone_change(imo_zone, Z_EXPAND, TRUE); } -/* - * Set the IP multicast options in response to user setsockopt(). - */ -static int -ip_setmoptions(sopt, imop) - struct sockopt *sopt; - struct ip_moptions **imop; +void +imo_addref(struct ip_moptions *imo, int locked) { - int error = 0; - struct in_addr addr; - struct ip_mreq mreq; - struct ifnet *ifp = NULL; - struct ip_moptions *imo = *imop; - int ifindex; - - if (imo == NULL) { - /* - * No multicast option buffer attached to the pcb; - * allocate one and initialize to default values. - */ - error = ip_createmoptions(imop); - if (error != 0) - return error; - imo = *imop; - } - - switch (sopt->sopt_name) { - /* store an index number for the vif you wanna use in the send */ -#if MROUTING - case IP_MULTICAST_VIF: - { - int i; - if (legal_vif_num == 0) { - error = EOPNOTSUPP; - break; - } - error = sooptcopyin(sopt, &i, sizeof i, sizeof i); - if (error) - break; - if (!legal_vif_num(i) && (i != -1)) { - error = EINVAL; - break; - } - imo->imo_multicast_vif = i; - break; - } -#endif /* MROUTING */ - - case IP_MULTICAST_IF: - /* - * Select the interface for outgoing multicast packets. - */ - error = sooptcopyin(sopt, &addr, sizeof addr, sizeof addr); - if (error) - break; - /* - * INADDR_ANY is used to remove a previous selection. - * When no interface is selected, a default one is - * chosen every time a multicast packet is sent. - */ - if (addr.s_addr == INADDR_ANY) { - imo->imo_multicast_ifp = NULL; - break; - } - /* - * The selected interface is identified by its local - * IP address. Find the interface and confirm that - * it supports multicasting. - */ - ifp = ip_multicast_if(&addr, &ifindex); - if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { - error = EADDRNOTAVAIL; - break; - } - imo->imo_multicast_ifp = ifp; - if (ifindex) - imo->imo_multicast_addr = addr; - else - imo->imo_multicast_addr.s_addr = INADDR_ANY; - break; - - case IP_MULTICAST_TTL: - /* - * Set the IP time-to-live for outgoing multicast packets. - * The original multicast API required a char argument, - * which is inconsistent with the rest of the socket API. - * We allow either a char or an int. - */ - if (sopt->sopt_valsize == 1) { - u_char ttl; - error = sooptcopyin(sopt, &ttl, 1, 1); - if (error) - break; - imo->imo_multicast_ttl = ttl; - } else { - u_int ttl; - error = sooptcopyin(sopt, &ttl, sizeof ttl, - sizeof ttl); - if (error) - break; - if (ttl > 255) - error = EINVAL; - else - imo->imo_multicast_ttl = ttl; - } - break; - - case IP_MULTICAST_LOOP: - /* - * Set the loopback flag for outgoing multicast packets. - * Must be zero or one. The original multicast API required a - * char argument, which is inconsistent with the rest - * of the socket API. We allow either a char or an int. - */ - if (sopt->sopt_valsize == 1) { - u_char loop; - error = sooptcopyin(sopt, &loop, 1, 1); - if (error) - break; - imo->imo_multicast_loop = !!loop; - } else { - u_int loop; - error = sooptcopyin(sopt, &loop, sizeof loop, - sizeof loop); - if (error) - break; - imo->imo_multicast_loop = !!loop; - } - break; - - case IP_ADD_MEMBERSHIP: - /* - * Add a multicast group membership. - * Group must be a valid IP multicast address. 
- */ - error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); - if (error) - break; - - error = ip_addmembership(imo, &mreq); - break; - - case IP_DROP_MEMBERSHIP: - /* - * Drop a multicast group membership. - * Group must be a valid IP multicast address. - */ - error = sooptcopyin(sopt, &mreq, sizeof mreq, sizeof mreq); - if (error) - break; - - error = ip_dropmembership(imo, &mreq); - break; - - default: - error = EOPNOTSUPP; - break; - } + if (!locked) + IMO_LOCK(imo); + else + IMO_LOCK_ASSERT_HELD(imo); - /* - * If all options have default values, no need to keep the mbuf. - */ - if (imo->imo_multicast_ifp == NULL && - imo->imo_multicast_vif == (u_int32_t)-1 && - imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL && - imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP && - imo->imo_num_memberships == 0) { - FREE(*imop, M_IPMOPTS); - *imop = NULL; + if (++imo->imo_refcnt == 0) { + panic("%s: imo %p wraparound refcnt\n", __func__, imo); + /* NOTREACHED */ + } else if (imo->imo_trace != NULL) { + (*imo->imo_trace)(imo, TRUE); } - return (error); + if (!locked) + IMO_UNLOCK(imo); } -/* - * Set the IP multicast options in response to user setsockopt(). - */ -__private_extern__ int -ip_createmoptions( - struct ip_moptions **imop) -{ - struct ip_moptions *imo; - imo = (struct ip_moptions*) _MALLOC(sizeof(*imo), M_IPMOPTS, - M_WAITOK); - - if (imo == NULL) - return (ENOBUFS); - *imop = imo; - imo->imo_multicast_ifp = NULL; - imo->imo_multicast_addr.s_addr = INADDR_ANY; - imo->imo_multicast_vif = -1; - imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; - imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; - imo->imo_num_memberships = 0; - - return 0; -} - -/* - * Add membership to an IPv4 multicast. - */ -__private_extern__ int -ip_addmembership( - struct ip_moptions *imo, - struct ip_mreq *mreq) +void +imo_remref(struct ip_moptions *imo) { - struct route ro; - struct sockaddr_in *dst; - struct ifnet *ifp = NULL; - int error = 0; int i; - bzero((caddr_t)&ro, sizeof(ro)); - - if (!IN_MULTICAST(ntohl(mreq->imr_multiaddr.s_addr))) { - error = EINVAL; - goto done; - } - /* - * If no interface address was provided, use the interface of - * the route to the given multicast address. - */ - if (mreq->imr_interface.s_addr == INADDR_ANY) { - dst = (struct sockaddr_in *)&ro.ro_dst; - dst->sin_len = sizeof(*dst); - dst->sin_family = AF_INET; - dst->sin_addr = mreq->imr_multiaddr; - rtalloc_ign(&ro, 0); - if (ro.ro_rt != NULL) { - ifp = ro.ro_rt->rt_ifp; - } else { - /* If there's no default route, try using loopback */ - mreq->imr_interface.s_addr = htonl(INADDR_LOOPBACK); - } + IMO_LOCK(imo); + if (imo->imo_refcnt == 0) { + panic("%s: imo %p negative refcnt", __func__, imo); + /* NOTREACHED */ + } else if (imo->imo_trace != NULL) { + (*imo->imo_trace)(imo, FALSE); } - if (ifp == NULL) { - ifp = ip_multicast_if(&mreq->imr_interface, NULL); + --imo->imo_refcnt; + if (imo->imo_refcnt > 0) { + IMO_UNLOCK(imo); + return; } - /* - * See if we found an interface, and confirm that it - * supports multicast. - */ - if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { - error = EADDRNOTAVAIL; - goto done; - } - /* - * See if the membership already exists or if all the - * membership slots are full. 
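The replacement for ip_freemoptions(), imo_remref() below, runs a strict teardown on the final release: leave every joined group, purge the per-group source filters, free both arrays, destroy the lock, and return the element to its zone. The ordering in a heap-based model, where in_leavegroup()/imf_purge()/zfree() are approximated by plain frees:

#include <stdlib.h>
#include <stdio.h>

struct membership { int joined; };

struct moptions {
    int refcnt;
    int nmemberships;
    struct membership **membership;   /* array of joined groups */
};

static void mo_remref(struct moptions *imo)
{
    if (--imo->refcnt > 0)
        return;                       /* others still hold it */

    /* Last reference: leave each group, then free the arrays. */
    for (int i = 0; i < imo->nmemberships; i++) {
        imo->membership[i]->joined = 0;   /* in_leavegroup() analogue */
        free(imo->membership[i]);
        imo->membership[i] = NULL;
    }
    imo->nmemberships = 0;
    free(imo->membership);
    free(imo);                        /* zfree() back to the zone */
}

int main(void)
{
    struct moptions *imo = calloc(1, sizeof (*imo));
    imo->refcnt = 1;
    imo->nmemberships = 1;
    imo->membership = calloc(1, sizeof (*imo->membership));
    imo->membership[0] = calloc(1, sizeof (struct membership));
    imo->membership[0]->joined = 1;
    mo_remref(imo);
    puts("torn down");
    return 0;
}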
- */ for (i = 0; i < imo->imo_num_memberships; ++i) { - if (imo->imo_membership[i]->inm_ifp == ifp && - imo->imo_membership[i]->inm_addr.s_addr - == mreq->imr_multiaddr.s_addr) - break; - } - if (i < imo->imo_num_memberships) { - error = EADDRINUSE; - goto done; - } - if (i == IP_MAX_MEMBERSHIPS) { - error = ETOOMANYREFS; - goto done; - } - /* - * Everything looks good; add a new record to the multicast - * address list for the given interface. - */ - if ((imo->imo_membership[i] = - in_addmulti(&mreq->imr_multiaddr, ifp)) == NULL) { - error = ENOBUFS; - goto done; - } - ++imo->imo_num_memberships; + struct in_mfilter *imf; -done: - if (ro.ro_rt != NULL) - rtfree(ro.ro_rt); + imf = imo->imo_mfilters ? &imo->imo_mfilters[i] : NULL; + if (imf != NULL) + imf_leave(imf); - return error; -} + (void) in_leavegroup(imo->imo_membership[i], imf); -/* - * Drop membership of an IPv4 multicast. - */ -__private_extern__ int -ip_dropmembership( - struct ip_moptions *imo, - struct ip_mreq *mreq) -{ - int error = 0; - struct ifnet* ifp = NULL; - int i; - - if (!IN_MULTICAST(ntohl(mreq->imr_multiaddr.s_addr))) { - error = EINVAL; - return error; - } + if (imf != NULL) + imf_purge(imf); - /* - * If an interface address was specified, get a pointer - * to its ifnet structure. - */ - if (mreq->imr_interface.s_addr == INADDR_ANY) - ifp = NULL; - else { - ifp = ip_multicast_if(&mreq->imr_interface, NULL); - if (ifp == NULL) { - error = EADDRNOTAVAIL; - return error; - } + INM_REMREF(imo->imo_membership[i]); + imo->imo_membership[i] = NULL; } - /* - * Find the membership in the membership array. - */ - for (i = 0; i < imo->imo_num_memberships; ++i) { - if ((ifp == NULL || - imo->imo_membership[i]->inm_ifp == ifp) && - imo->imo_membership[i]->inm_addr.s_addr == - mreq->imr_multiaddr.s_addr) - break; + imo->imo_num_memberships = 0; + if (imo->imo_mfilters != NULL) { + FREE(imo->imo_mfilters, M_INMFILTER); + imo->imo_mfilters = NULL; } - if (i == imo->imo_num_memberships) { - error = EADDRNOTAVAIL; - return error; + if (imo->imo_membership != NULL) { + FREE(imo->imo_membership, M_IPMOPTS); + imo->imo_membership = NULL; } - /* - * Give up the multicast address record to which the - * membership points. - */ - in_delmulti(&imo->imo_membership[i]); - /* - * Remove the gap in the membership array. - */ - for (++i; i < imo->imo_num_memberships; ++i) - imo->imo_membership[i-1] = imo->imo_membership[i]; - --imo->imo_num_memberships; - - return error; -} - -/* - * Return the IP multicast options in response to user getsockopt(). - */ -static int -ip_getmoptions(sopt, imo) - struct sockopt *sopt; - register struct ip_moptions *imo; -{ - struct in_addr addr; - struct in_ifaddr *ia; - int error, optval; - u_char coptval; - - error = 0; - switch (sopt->sopt_name) { -#if MROUTING - case IP_MULTICAST_VIF: - if (imo != NULL) - optval = imo->imo_multicast_vif; - else - optval = -1; - error = sooptcopyout(sopt, &optval, sizeof optval); - break; -#endif /* MROUTING */ + IMO_UNLOCK(imo); - case IP_MULTICAST_IF: - if (imo == NULL || imo->imo_multicast_ifp == NULL) - addr.s_addr = INADDR_ANY; - else if (imo->imo_multicast_addr.s_addr) { - /* return the value user has set */ - addr = imo->imo_multicast_addr; - } else { - IFP_TO_IA(imo->imo_multicast_ifp, ia); - addr.s_addr = (ia == NULL) ? 
INADDR_ANY - : IA_SIN(ia)->sin_addr.s_addr; - if (ia != NULL) - ifafree(&ia->ia_ifa); - } - error = sooptcopyout(sopt, &addr, sizeof addr); - break; - - case IP_MULTICAST_TTL: - if (imo == 0) - optval = coptval = IP_DEFAULT_MULTICAST_TTL; - else - optval = coptval = imo->imo_multicast_ttl; - if (sopt->sopt_valsize == 1) - error = sooptcopyout(sopt, &coptval, 1); - else - error = sooptcopyout(sopt, &optval, sizeof optval); - break; + lck_mtx_destroy(&imo->imo_lock, ifa_mtx_grp); - case IP_MULTICAST_LOOP: - if (imo == 0) - optval = coptval = IP_DEFAULT_MULTICAST_LOOP; - else - optval = coptval = imo->imo_multicast_loop; - if (sopt->sopt_valsize == 1) - error = sooptcopyout(sopt, &coptval, 1); - else - error = sooptcopyout(sopt, &optval, sizeof optval); - break; + if (!(imo->imo_debug & IFD_ALLOC)) { + panic("%s: imo %p cannot be freed", __func__, imo); + /* NOTREACHED */ + } + zfree(imo_zone, imo); +} - default: - error = ENOPROTOOPT; - break; +static void +imo_trace(struct ip_moptions *imo, int refhold) +{ + struct ip_moptions_dbg *imo_dbg = (struct ip_moptions_dbg *)imo; + ctrace_t *tr; + u_int32_t idx; + u_int16_t *cnt; + + if (!(imo->imo_debug & IFD_DEBUG)) { + panic("%s: imo %p has no debug structure", __func__, imo); + /* NOTREACHED */ + } + if (refhold) { + cnt = &imo_dbg->imo_refhold_cnt; + tr = imo_dbg->imo_refhold; + } else { + cnt = &imo_dbg->imo_refrele_cnt; + tr = imo_dbg->imo_refrele; } - return (error); + + idx = atomic_add_16_ov(cnt, 1) % IMO_TRACE_HIST_SIZE; + ctrace_record(&tr[idx]); } -/* - * Discard the IP multicast options. - */ -void -ip_freemoptions(imo) - register struct ip_moptions *imo; +struct ip_moptions * +ip_allocmoptions(int how) { - register int i; + struct ip_moptions *imo; + imo = (how == M_WAITOK) ? zalloc(imo_zone) : zalloc_noblock(imo_zone); if (imo != NULL) { - for (i = 0; i < imo->imo_num_memberships; ++i) - in_delmulti(&imo->imo_membership[i]); - FREE(imo, M_IPMOPTS); + bzero(imo, imo_size); + lck_mtx_init(&imo->imo_lock, ifa_mtx_grp, ifa_mtx_attr); + imo->imo_debug |= IFD_ALLOC; + if (imo_debug != 0) { + imo->imo_debug |= IFD_DEBUG; + imo->imo_trace = imo_trace; + } + IMO_ADDREF(imo); } + + return (imo); } /* @@ -3174,6 +3057,8 @@ ip_mloopback(ifp, m, dst, hlen) * without any locks based on the assumption that ip_output() is single- * threaded per-pcb, i.e. for any given pcb there can only be one thread * performing output at the IP layer. + * + * This routine is analogous to in6_selectroute() for IPv6. 
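/*
 * Illustrative sketch, not part of the patch: ip_moptions is now a
 * refcounted, lock-protected object. ip_allocmoptions() returns it with one
 * reference held; a caller takes an extra reference before handing the
 * pointer to another holder and drops its own when done, and imo_remref()
 * tears down the memberships and returns the block to imo_zone on the final
 * release. The consumer below is hypothetical.
 */
static int
example_use_moptions(void)
{
	struct ip_moptions *imo;

	imo = ip_allocmoptions(M_WAITOK);	/* returned with one reference */
	if (imo == NULL)
		return (ENOBUFS);

	IMO_LOCK(imo);
	imo->imo_multicast_ttl = 64;		/* mutate only under imo_lock */
	IMO_UNLOCK(imo);

	IMO_ADDREF(imo);			/* reference for the new holder */
	/* ... hand imo off; that holder calls IMO_REMREF() when done ... */

	IMO_REMREF(imo);			/* drop our own reference */
	return (0);
}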
*/ static struct ifaddr * in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) @@ -3215,9 +3100,9 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) */ if (scope == IFSCOPE_NONE) { scope = rt_ifp->if_index; - if (scope != get_primary_ifscope() && + if (scope != get_primary_ifscope(AF_INET) && ro->ro_rt->generation_id != route_generation) - scope = get_primary_ifscope(); + scope = get_primary_ifscope(AF_INET); } ifa = (struct ifaddr *)ifa_foraddr_scoped(src.s_addr, scope); @@ -3232,7 +3117,7 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) */ ifa = (struct ifaddr *)ifa_foraddr(src.s_addr); if (ifa != NULL) { - ifafree(ifa); + IFA_REMREF(ifa); ifa = NULL; ifscope = IFSCOPE_NONE; } @@ -3240,16 +3125,14 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) if (ip_select_srcif_debug && ifa != NULL) { if (ro->ro_rt != NULL) { - printf("%s->%s ifscope %d->%d ifa_if %s%d " - "ro_if %s%d\n", s_src, s_dst, ifscope, - scope, ifa->ifa_ifp->if_name, - ifa->ifa_ifp->if_unit, rt_ifp->if_name, - rt_ifp->if_unit); + printf("%s->%s ifscope %d->%d ifa_if %s " + "ro_if %s\n", s_src, s_dst, ifscope, + scope, if_name(ifa->ifa_ifp), + if_name(rt_ifp)); } else { - printf("%s->%s ifscope %d->%d ifa_if %s%d\n", + printf("%s->%s ifscope %d->%d ifa_if %s\n", s_src, s_dst, ifscope, scope, - ifa->ifa_ifp->if_name, - ifa->ifa_ifp->if_unit); + if_name(ifa->ifa_ifp)); } } } @@ -3296,7 +3179,7 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) if (ifa->ifa_ifp != rt->rt_ifp) { oifa = ifa; ifa = rt->rt_ifa; - ifaref(ifa); + IFA_ADDREF(ifa); RT_UNLOCK(rt); } else { RT_UNLOCK(rt); @@ -3322,8 +3205,8 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) * as well as the route interface * address, and use this instead. */ - ifafree(oifa); - ifafree(ifa); + IFA_REMREF(oifa); + IFA_REMREF(ifa); ifa = iifa; } else if (!ipforwarding || (rt->rt_flags & RTF_GATEWAY)) { @@ -3334,7 +3217,7 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) * original one, and let the caller * do a scoped route lookup. */ - ifafree(ifa); + IFA_REMREF(ifa); ifa = oifa; } else { /* @@ -3347,7 +3230,7 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) * the original one and use the route * interface address instead. */ - ifafree(oifa); + IFA_REMREF(oifa); } } } else if (ifa != NULL && ro->ro_rt != NULL && @@ -3359,15 +3242,14 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) * as the interface used by the known route; drop the * original one and use the route interface address. 
*/ - ifafree(ifa); + IFA_REMREF(ifa); ifa = ro->ro_rt->rt_ifa; - ifaref(ifa); + IFA_ADDREF(ifa); } if (ip_select_srcif_debug && ifa != NULL) { - printf("%s->%s ifscope %d ifa_if %s%d\n", - s_src, s_dst, ifscope, ifa->ifa_ifp->if_name, - ifa->ifa_ifp->if_unit); + printf("%s->%s ifscope %d ifa_if %s\n", + s_src, s_dst, ifscope, if_name(ifa->ifa_ifp)); } } @@ -3384,16 +3266,14 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) !(ro->ro_rt->rt_flags & RTF_UP))) { if (ip_select_srcif_debug) { if (ifa != NULL) { - printf("%s->%s ifscope %d ro_if %s%d != " - "ifa_if %s%d (cached route cleared)\n", - s_src, s_dst, ifscope, rt_ifp->if_name, - rt_ifp->if_unit, ifa->ifa_ifp->if_name, - ifa->ifa_ifp->if_unit); + printf("%s->%s ifscope %d ro_if %s != " + "ifa_if %s (cached route cleared)\n", + s_src, s_dst, ifscope, if_name(rt_ifp), + if_name(ifa->ifa_ifp)); } else { - printf("%s->%s ifscope %d ro_if %s%d " + printf("%s->%s ifscope %d ro_if %s " "(no ifa_if found)\n", - s_src, s_dst, ifscope, rt_ifp->if_name, - rt_ifp->if_unit); + s_src, s_dst, ifscope, if_name(rt_ifp)); } } @@ -3414,7 +3294,7 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) */ if (IN_LINKLOCAL(ntohl(dst.s_addr)) && !IN_LINKLOCAL(ntohl(src.s_addr)) && ifa != NULL) { - ifafree(ifa); + IFA_REMREF(ifa); ifa = NULL; } } @@ -3444,31 +3324,3 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) return (ifa); } - -/* - * Handler for setting IP_FORCE_OUT_IFP or IP_BOUND_IF socket option. - */ -static void -ip_bindif(struct inpcb *inp, unsigned int ifscope) -{ - /* - * A zero interface scope value indicates an "unbind". - * Otherwise, take in whatever value the app desires; - * the app may already know the scope (or force itself - * to such a scope) ahead of time before the interface - * gets attached. It doesn't matter either way; any - * route lookup from this point on will require an - * exact match for the embedded interface scope. - */ - inp->inp_boundif = ifscope; - if (inp->inp_boundif == IFSCOPE_NONE) - inp->inp_flags &= ~INP_BOUND_IF; - else - inp->inp_flags |= INP_BOUND_IF; - - /* Blow away any cached route in the PCB */ - if (inp->inp_route.ro_rt != NULL) { - rtfree(inp->inp_route.ro_rt); - inp->inp_route.ro_rt = NULL; - } -} diff --git a/bsd/netinet/ip_var.h b/bsd/netinet/ip_var.h index 9d4044d4c..971a88126 100644 --- a/bsd/netinet/ip_var.h +++ b/bsd/netinet/ip_var.h @@ -121,7 +121,7 @@ struct ipq { */ #endif /* KERNEL_PRIVATE */ #define MAX_IPOPTLEN 40 -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE struct ipoption { struct in_addr ipopt_dst; /* first-hop dst if source routed */ @@ -133,21 +133,57 @@ struct ipoption { * passed to ip_output when IP multicast options are in use. */ struct ip_moptions { + decl_lck_mtx_data(, imo_lock); + uint32_t imo_refcnt; /* ref count */ + uint32_t imo_debug; /* see ifa_debug flags */ struct ifnet *imo_multicast_ifp; /* ifp for outgoing multicasts */ u_char imo_multicast_ttl; /* TTL for outgoing multicasts */ u_char imo_multicast_loop; /* 1 => hear sends if a member */ u_short imo_num_memberships; /* no. 
memberships this socket */ - struct in_multi *imo_membership[IP_MAX_MEMBERSHIPS]; - u_int32_t imo_multicast_vif; /* vif num outgoing multicasts */ + u_short imo_max_memberships; /* max memberships this socket */ + struct in_multi **imo_membership; /* group memberships */ + struct in_mfilter *imo_mfilters; /* source filters */ + u_int32_t imo_multicast_vif; /* vif num outgoing multicasts */ struct in_addr imo_multicast_addr; /* ifindex/addr on MULTICAST_IF */ + void (*imo_trace) /* callback fn for tracing refs */ + (struct ip_moptions *, int); }; +#define IMO_LOCK_ASSERT_HELD(_imo) \ + lck_mtx_assert(&(_imo)->imo_lock, LCK_MTX_ASSERT_OWNED) + +#define IMO_LOCK_ASSERT_NOTHELD(_imo) \ + lck_mtx_assert(&(_imo)->imo_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define IMO_LOCK(_imo) \ + lck_mtx_lock(&(_imo)->imo_lock) + +#define IMO_LOCK_SPIN(_imo) \ + lck_mtx_lock_spin(&(_imo)->imo_lock) + +#define IMO_CONVERT_LOCK(_imo) do { \ + IMO_LOCK_ASSERT_HELD(_imo); \ + lck_mtx_convert_spin(&(_imo)->imo_lock); \ +} while (0) + +#define IMO_UNLOCK(_imo) \ + lck_mtx_unlock(&(_imo)->imo_lock) + +#define IMO_ADDREF(_imo) \ + imo_addref(_imo, 0) + +#define IMO_ADDREF_LOCKED(_imo) \ + imo_addref(_imo, 1) + +#define IMO_REMREF(_imo) \ + imo_remref(_imo) + /* mbuf tag for ip_forwarding info */ struct ip_fwd_tag { struct sockaddr_in *next_hop; /* next_hop */ }; -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ struct ipstat { u_int32_t ips_total; /* total packets received */ @@ -179,6 +215,9 @@ struct ipstat { u_int32_t ips_notmember; /* multicasts for unregistered grps */ u_int32_t ips_nogif; /* no match gif found */ u_int32_t ips_badaddr; /* invalid address on header */ +#ifdef PRIVATE + u_int32_t ips_pktdropcntrl; /* pkt dropped, no mbufs for control data */ +#endif /* PRIVATE */ }; struct ip_linklocal_stat { @@ -206,7 +245,8 @@ struct sockopt; * Extra information passed to ip_output when IP_OUTARGS is set. */ struct ip_out_args { - unsigned int ipoa_ifscope; /* interface scope */ + unsigned int ipoa_boundif; /* bound outgoing interface */ + unsigned int ipoa_nocell; /* don't use IFT_CELLULAR */ }; extern struct ipstat ipstat; @@ -224,9 +264,15 @@ extern int rsvp_on; extern struct pr_usrreqs rip_usrreqs; extern int ip_doscopedroute; +extern void ip_moptions_init(void); +extern struct ip_moptions *ip_allocmoptions(int); +extern int inp_getmoptions(struct inpcb *, struct sockopt *); +extern int inp_setmoptions(struct inpcb *, struct sockopt *); +extern void imo_addref(struct ip_moptions *, int); +extern void imo_remref(struct ip_moptions *); + int ip_ctloutput(struct socket *, struct sockopt *sopt); void ip_drain(void); -void ip_freemoptions(struct ip_moptions *); void ip_init(void) __attribute__((section("__TEXT, initcode"))); extern int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); @@ -235,7 +281,7 @@ extern int ip_output(struct mbuf *, struct mbuf *, struct route *, int, extern int ip_output_list(struct mbuf *, int, struct mbuf *, struct route *, int, struct ip_moptions *, struct ip_out_args *); struct in_ifaddr *ip_rtaddr(struct in_addr); -void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *, +int ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *, struct mbuf *); void ip_slowtimo(void); struct mbuf * diff --git a/bsd/netinet/kpi_ipfilter.c b/bsd/netinet/kpi_ipfilter.c index 6aea8ccf2..b03f56cd1 100644 --- a/bsd/netinet/kpi_ipfilter.c +++ b/bsd/netinet/kpi_ipfilter.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. 
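/*
 * Illustrative sketch, not part of the patch: with ipoa_boundif/ipoa_nocell
 * in place, a sender opts into them by passing IP_OUTARGS alongside a
 * filled-in ip_out_args, exactly as the kpi_ipfilter and raw IP changes
 * below do. The wrapper and its parameters are hypothetical.
 */
static int
example_scoped_output(struct mbuf *m, struct route *ro, unsigned int ifindex)
{
	struct ip_out_args ipoa = { IFSCOPE_NONE, 0 };

	ipoa.ipoa_boundif = ifindex;	/* pin the send to one interface */
	ipoa.ipoa_nocell = 1;		/* and keep it off cellular links */

	return (ip_output(m, NULL, ro, IP_RAWOUTPUT | IP_OUTARGS, NULL, &ipoa));
}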
+ * Copyright (c) 2004-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -235,7 +235,7 @@ ipf_inject_input( } if (filter_ref == 0 && m->m_pkthdr.rcvif == 0) { - m->m_pkthdr.rcvif = ifunit("lo0"); + m->m_pkthdr.rcvif = lo_ifp; m->m_pkthdr.csum_data = 0; m->m_pkthdr.csum_flags = 0; if (vers == 4) { @@ -245,8 +245,8 @@ ipf_inject_input( } } if (filter_ref != 0) { - mtag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFILT, - sizeof (ipfilter_t), M_NOWAIT); + mtag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFILT, + sizeof (ipfilter_t), M_NOWAIT, m); if (mtag == NULL) { error = ENOMEM; goto done; @@ -262,58 +262,54 @@ done: } static errno_t -ipf_injectv4_out( - mbuf_t data, - ipfilter_t filter_ref, - ipf_pktopts_t options) +ipf_injectv4_out(mbuf_t data, ipfilter_t filter_ref, ipf_pktopts_t options) { struct route ro; - struct sockaddr_in *sin = (struct sockaddr_in*)&ro.ro_dst; struct ip *ip; struct mbuf *m = (struct mbuf*)data; errno_t error = 0; - struct m_tag *mtag = 0; - struct ip_moptions *imo = 0, ip_moptions; - + struct m_tag *mtag = NULL; + struct ip_moptions *imo = NULL; + struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; + /* Make the IP header contiguous in the mbuf */ - if ((size_t)m->m_len < sizeof(struct ip)) { - m = m_pullup(m, sizeof(struct ip)); - if (m == NULL) return ENOMEM; + if ((size_t)m->m_len < sizeof (struct ip)) { + m = m_pullup(m, sizeof (struct ip)); + if (m == NULL) + return (ENOMEM); } - ip = (struct ip*)m_mtod(m); - + ip = (struct ip *)m_mtod(m); + if (filter_ref != 0) { - mtag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFILT, - sizeof (ipfilter_t), M_NOWAIT); + mtag = m_tag_create(KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_IPFILT, sizeof (ipfilter_t), M_NOWAIT, m); if (mtag == NULL) { m_freem(m); - return ENOMEM; + return (ENOMEM); } - *(ipfilter_t*)(mtag+1) = filter_ref; + *(ipfilter_t *)(mtag + 1) = filter_ref; m_tag_prepend(m, mtag); } - - if (options && (options->ippo_flags & IPPOF_MCAST_OPTS)) { - imo = &ip_moptions; - - bzero(imo, sizeof(struct ip6_moptions)); + + if (options != NULL && (options->ippo_flags & IPPOF_MCAST_OPTS) && + (imo = ip_allocmoptions(M_DONTWAIT)) != NULL) { imo->imo_multicast_ifp = options->ippo_mcast_ifnet; imo->imo_multicast_ttl = options->ippo_mcast_ttl; imo->imo_multicast_loop = options->ippo_mcast_loop; } - - /* Fill out a route structure and get a route */ - bzero(&ro, sizeof(struct route)); - sin->sin_len = sizeof(struct sockaddr_in); - sin->sin_family = AF_INET; - sin->sin_port = 0; - sin->sin_addr = ip->ip_dst; - rtalloc(&ro); - if (ro.ro_rt == NULL) { - m_freem(m); - return ENETUNREACH; + + if (options != NULL && + (options->ippo_flags & (IPPOF_BOUND_IF | IPPOF_NO_IFT_CELLULAR))) { + if (options->ippo_flags & IPPOF_BOUND_IF) { + ipoa.ipoa_boundif = options->ippo_flags >> + IPPOF_SHIFT_IFSCOPE; + } + if (options->ippo_flags & IPPOF_NO_IFT_CELLULAR) + ipoa.ipoa_nocell = 1; } - + + bzero(&ro, sizeof(struct route)); + /* Put ip_len and ip_off in host byte order, ip_output expects that */ #if BYTE_ORDER != BIG_ENDIAN @@ -321,88 +317,85 @@ ipf_injectv4_out( NTOHS(ip->ip_off); #endif - /* Send */ - error = ip_output(m, NULL, &ro, IP_ALLOWBROADCAST | IP_RAWOUTPUT, imo, NULL); - + /* Send; enforce source interface selection via IP_OUTARGS flag */ + error = ip_output(m, NULL, &ro, + IP_ALLOWBROADCAST | IP_RAWOUTPUT | IP_OUTARGS, imo, &ipoa); + /* Release the route */ if (ro.ro_rt) rtfree(ro.ro_rt); - - return error; + + if (imo != NULL) + IMO_REMREF(imo); + + return (error); } 
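/*
 * Illustrative sketch, not part of the patch: a kext ip filter drives the
 * decoding in ipf_injectv4_out() above by packing the interface scope into
 * the upper bits of ippo_flags, mirroring the IPPOF_SHIFT_IFSCOPE shift.
 * The filter function and its arguments are hypothetical.
 */
static errno_t
example_inject(mbuf_t pkt, ipfilter_t me, unsigned int ifindex)
{
	struct ipf_pktopts opts;

	bzero(&opts, sizeof (opts));
	opts.ippo_flags = IPPOF_BOUND_IF | (ifindex << IPPOF_SHIFT_IFSCOPE);
	opts.ippo_flags |= IPPOF_NO_IFT_CELLULAR;	/* optional */

	return (ipf_inject_output(pkt, me, &opts));
}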
#if INET6 static errno_t -ipf_injectv6_out( - mbuf_t data, - ipfilter_t filter_ref, - ipf_pktopts_t options) +ipf_injectv6_out(mbuf_t data, ipfilter_t filter_ref, ipf_pktopts_t options) { struct route_in6 ro; - struct sockaddr_in6 *sin6 = &ro.ro_dst; struct ip6_hdr *ip6; struct mbuf *m = (struct mbuf*)data; errno_t error = 0; - struct m_tag *mtag = 0; - struct ip6_moptions *im6o = 0, ip6_moptions; - + struct m_tag *mtag = NULL; + struct ip6_moptions *im6o = NULL; + struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 }; + /* Make the IP header contiguous in the mbuf */ if ((size_t)m->m_len < sizeof(struct ip6_hdr)) { m = m_pullup(m, sizeof(struct ip6_hdr)); - if (m == NULL) return ENOMEM; + if (m == NULL) + return (ENOMEM); } ip6 = (struct ip6_hdr*)m_mtod(m); if (filter_ref != 0) { - mtag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFILT, - sizeof (ipfilter_t), M_NOWAIT); + mtag = m_tag_create(KERNEL_MODULE_TAG_ID, + KERNEL_TAG_TYPE_IPFILT, sizeof (ipfilter_t), M_NOWAIT, m); if (mtag == NULL) { m_freem(m); - return ENOMEM; + return (ENOMEM); } - *(ipfilter_t*)(mtag+1) = filter_ref; + *(ipfilter_t *)(mtag + 1) = filter_ref; m_tag_prepend(m, mtag); } - - if (options && (options->ippo_flags & IPPOF_MCAST_OPTS)) { - im6o = &ip6_moptions; - - bzero(im6o, sizeof(struct ip6_moptions)); + + if (options != NULL && (options->ippo_flags & IPPOF_MCAST_OPTS) && + (im6o = ip6_allocmoptions(M_DONTWAIT)) != NULL) { im6o->im6o_multicast_ifp = options->ippo_mcast_ifnet; im6o->im6o_multicast_hlim = options->ippo_mcast_ttl; im6o->im6o_multicast_loop = options->ippo_mcast_loop; } - - - /* Fill out a route structure and get a route */ - bzero(&ro, sizeof(struct route_in6)); - sin6->sin6_len = sizeof(struct sockaddr_in6); - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = ip6->ip6_dst; -#if 0 - /* This is breaks loopback multicast! */ - /* The scope ID should already at s6_addr16[1] */ - if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) { - /* Hack, pull the scope_id out of the dest addr */ - sin6->sin6_scope_id = ntohs(ip6->ip6_dst.s6_addr16[1]); - ip6->ip6_dst.s6_addr16[1] = 0; - } else - sin6->sin6_scope_id = 0; -#endif - rtalloc((struct route*)&ro); - if (ro.ro_rt == NULL) { - m_freem(m); - return ENETUNREACH; + + if (options != NULL && + (options->ippo_flags & (IPPOF_BOUND_IF | IPPOF_NO_IFT_CELLULAR))) { + if (options->ippo_flags & IPPOF_BOUND_IF) { + ip6oa.ip6oa_boundif = options->ippo_flags >> + IPPOF_SHIFT_IFSCOPE; + } + if (options->ippo_flags & IPPOF_NO_IFT_CELLULAR) + ip6oa.ip6oa_nocell = 1; } - - /* Send */ - error = ip6_output(m, NULL, &ro, 0, im6o, NULL, 0); - + + bzero(&ro, sizeof(struct route_in6)); + + /* + * Send mbuf and ifscope information. Check for correctness + * of ifscope information is done while searching for a route in + * ip6_output. + */ + error = ip6_output(m, NULL, &ro, IPV6_OUTARGS, im6o, NULL, &ip6oa); + /* Release the route */ if (ro.ro_rt) rtfree(ro.ro_rt); - - return error; + + if (im6o != NULL) + IM6O_REMREF(im6o); + + return (error); } #endif /* INET6 */ diff --git a/bsd/netinet/kpi_ipfilter.h b/bsd/netinet/kpi_ipfilter.h index 3d2aaaac9..1f7fae6f0 100644 --- a/bsd/netinet/kpi_ipfilter.h +++ b/bsd/netinet/kpi_ipfilter.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2008-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -50,7 +50,12 @@ struct ipf_pktopts { int ippo_mcast_loop; u_int8_t ippo_mcast_ttl; }; -#define IPPOF_MCAST_OPTS 0x1 +#define IPPOF_MCAST_OPTS 0x1 +#ifdef PRIVATE +#define IPPOF_BOUND_IF 0x2 +#define IPPOF_NO_IFT_CELLULAR 0x4 +#define IPPOF_SHIFT_IFSCOPE 16 +#endif /* PRIVATE */ typedef struct ipf_pktopts *ipf_pktopts_t; diff --git a/bsd/netinet/raw_ip.c b/bsd/netinet/raw_ip.c index c03fde7d0..0b63a3c0d 100644 --- a/bsd/netinet/raw_ip.c +++ b/bsd/netinet/raw_ip.c @@ -200,7 +200,7 @@ rip_input(m, iphlen) register struct inpcb *inp; struct inpcb *last = 0; struct mbuf *opts = 0; - int skipit; + int skipit = 0, ret = 0; ripsrc.sin_addr = ip->ip_src; lck_rw_lock_shared(ripcbinfo.mtx); @@ -220,9 +220,9 @@ rip_input(m, iphlen) if (last) { struct mbuf *n = m_copy(m, 0, (int)M_COPYALL); + skipit = 0; #if IPSEC /* check AH/ESP integrity. */ - skipit = 0; if (ipsec_bypass == 0 && n) { if (ipsec4_in_reject_so(n, last->inp_socket)) { m_freem(n); @@ -235,27 +235,36 @@ rip_input(m, iphlen) #if CONFIG_MACF_NET if (n && skipit == 0) { if (mac_inpcb_check_deliver(last, n, AF_INET, - SOCK_RAW) != 0) + SOCK_RAW) != 0) { + m_freem(n); skipit = 1; + } } #endif if (n && skipit == 0) { int error = 0; - if (last->inp_flags & INP_CONTROLOPTS || - last->inp_socket->so_options & SO_TIMESTAMP) - ip_savecontrol(last, &opts, ip, n); + if ((last->inp_flags & INP_CONTROLOPTS) != 0 || + (last->inp_socket->so_options & SO_TIMESTAMP) != 0 || + (last->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + ret = ip_savecontrol(last, &opts, ip, n); + if (ret != 0) { + m_freem(n); + m_freem(opts); + last = inp; + continue; + } + } if (last->inp_flags & INP_STRIPHDR) { n->m_len -= iphlen; n->m_pkthdr.len -= iphlen; n->m_data += iphlen; } -// ###LOCK need to lock that socket? + so_recv_data_stat(last->inp_socket, m, 0); if (sbappendaddr(&last->inp_socket->so_rcv, (struct sockaddr *)&ripsrc, n, opts, &error) != 0) { sorwakeup(last->inp_socket); - } - else { + } else { if (error) { /* should notify about lost packet */ kprintf("rip_input can't append to socket\n"); @@ -266,10 +275,10 @@ rip_input(m, iphlen) } last = inp; } - lck_rw_done(ripcbinfo.mtx); + + skipit = 0; #if IPSEC /* check AH/ESP integrity. 
*/ - skipit = 0; if (ipsec_bypass == 0 && last) { if (ipsec4_in_reject_so(m, last->inp_socket)) { m_freem(m); @@ -282,20 +291,30 @@ rip_input(m, iphlen) #endif /*IPSEC*/ #if CONFIG_MACF_NET if (last && skipit == 0) { - if (mac_inpcb_check_deliver(last, m, AF_INET, SOCK_RAW) != 0) + if (mac_inpcb_check_deliver(last, m, AF_INET, SOCK_RAW) != 0) { skipit = 1; + m_freem(m); + } } #endif if (skipit == 0) { if (last) { - if (last->inp_flags & INP_CONTROLOPTS || - last->inp_socket->so_options & SO_TIMESTAMP) - ip_savecontrol(last, &opts, ip, m); + if ((last->inp_flags & INP_CONTROLOPTS) != 0 || + (last->inp_socket->so_options & SO_TIMESTAMP) != 0 || + (last->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + ret = ip_savecontrol(last, &opts, ip, m); + if (ret != 0) { + m_freem(m); + m_freem(opts); + goto unlock; + } + } if (last->inp_flags & INP_STRIPHDR) { m->m_len -= iphlen; m->m_pkthdr.len -= iphlen; m->m_data += iphlen; } + so_recv_data_stat(last->inp_socket, m, 0); if (sbappendaddr(&last->inp_socket->so_rcv, (struct sockaddr *)&ripsrc, m, opts, NULL) != 0) { sorwakeup(last->inp_socket); @@ -308,6 +327,12 @@ rip_input(m, iphlen) OSAddAtomic(-1, &ipstat.ips_delivered); } } +unlock: + /* + * Keep the list locked because socket filter may force the socket lock + * to be released when calling sbappendaddr() -- see rdar://7627704 + */ + lck_rw_done(ripcbinfo.mtx); } /* @@ -325,21 +350,19 @@ rip_output( register struct inpcb *inp = sotoinpcb(so); int flags = (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST; struct ip_out_args ipoa; + struct ip_moptions *imo; int error = 0; -#if PKT_PRIORITY - mbuf_traffic_class_t mtc = MBUF_TC_NONE; -#endif /* PKT_PRIORITY */ + mbuf_traffic_class_t mtc = MBUF_TC_UNSPEC; if (control != NULL) { -#if PKT_PRIORITY mtc = mbuf_traffic_class_from_control(control); -#endif /* PKT_PRIORITY */ m_freem(control); } /* If socket was bound to an ifindex, tell ip_output about it */ - ipoa.ipoa_ifscope = (inp->inp_flags & INP_BOUND_IF) ? + ipoa.ipoa_boundif = (inp->inp_flags & INP_BOUND_IF) ? inp->inp_boundif : IFSCOPE_NONE; + ipoa.ipoa_nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; flags |= IP_OUTARGS; /* @@ -401,35 +424,52 @@ rip_output( inp->inp_route.ro_rt = NULL; } -#if PKT_PRIORITY - set_traffic_class(m, so, mtc); -#endif /* PKT_PRIORITY */ + set_packet_tclass(m, so, mtc, 0); #if CONFIG_MACF_NET mac_mbuf_label_associate_inpcb(inp, m); #endif + imo = inp->inp_moptions; + if (imo != NULL) + IMO_ADDREF(imo); /* * The domain lock is held across ip_output, so it is okay * to pass the PCB cached route pointer directly to IP and * the modules beneath it. */ error = ip_output(m, inp->inp_options, &inp->inp_route, flags, - inp->inp_moptions, &ipoa); + imo, &ipoa); -#if IFNET_ROUTE_REFCNT - /* - * Always discard the cached route for unconnected socket - * or if it is a non-unicast route. - */ - if (inp->inp_route.ro_rt != NULL && - ((inp->inp_route.ro_rt->rt_flags & (RTF_MULTICAST|RTF_BROADCAST)) || - inp->inp_socket == NULL || - inp->inp_socket->so_state != SS_ISCONNECTED)) { - rtfree(inp->inp_route.ro_rt); - inp->inp_route.ro_rt = NULL; + if (imo != NULL) + IMO_REMREF(imo); + + if (inp->inp_route.ro_rt != NULL) { + struct rtentry *rt = inp->inp_route.ro_rt; + unsigned int outif; + + if ((rt->rt_flags & (RTF_MULTICAST|RTF_BROADCAST)) || + inp->inp_socket == NULL || + !(inp->inp_socket->so_state & SS_ISCONNECTED)) { + rt = NULL; /* unusable */ + } + /* + * Always discard the cached route for unconnected + * socket or if it is a multicast route. 
+ */ + if (rt == NULL) { + rtfree(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = NULL; + } + /* + * If this is a connected socket and the destination + * route is unicast, update outif with that of the route + * interface index used by IP. + */ + if (rt != NULL && + (outif = rt->rt_ifp->if_index) != inp->inp_last_outif) + inp->inp_last_outif = outif; } -#endif /* IFNET_ROUTE_REFCNT */ return (error); } @@ -642,10 +682,12 @@ rip_ctlinput( lck_rw_lock_shared(in_ifaddr_rwlock); for (ia = in_ifaddrhead.tqh_first; ia; ia = ia->ia_link.tqe_next) { - if (ia->ia_ifa.ifa_addr == sa - && (ia->ia_flags & IFA_ROUTE)) { + IFA_LOCK(&ia->ia_ifa); + if (ia->ia_ifa.ifa_addr == sa && + (ia->ia_flags & IFA_ROUTE)) { done = 1; - ifaref(&ia->ia_ifa); + IFA_ADDREF_LOCKED(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); lck_mtx_lock(rnh_lock); /* @@ -660,9 +702,10 @@ rip_ctlinput( */ in_ifadown(&ia->ia_ifa, 1); lck_mtx_unlock(rnh_lock); - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); break; } + IFA_UNLOCK(&ia->ia_ifa); } if (!done) lck_rw_done(in_ifaddr_rwlock); @@ -672,14 +715,22 @@ rip_ctlinput( lck_rw_lock_shared(in_ifaddr_rwlock); for (ia = in_ifaddrhead.tqh_first; ia; ia = ia->ia_link.tqe_next) { - if (ia->ia_ifa.ifa_addr == sa) + IFA_LOCK(&ia->ia_ifa); + if (ia->ia_ifa.ifa_addr == sa) { + /* keep it locked */ break; + } + IFA_UNLOCK(&ia->ia_ifa); } - if (ia == 0 || (ia->ia_flags & IFA_ROUTE)) { + if (ia == NULL || (ia->ia_flags & IFA_ROUTE) || + (ia->ia_ifa.ifa_debug & IFD_NOTREADY)) { + if (ia != NULL) + IFA_UNLOCK(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); return; } - ifaref(&ia->ia_ifa); + IFA_ADDREF_LOCKED(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); flags = RTF_UP; @@ -690,9 +741,12 @@ rip_ctlinput( flags |= RTF_HOST; err = rtinit(&ia->ia_ifa, RTM_ADD, flags); - if (err == 0) + if (err == 0) { + IFA_LOCK_SPIN(&ia->ia_ifa); ia->ia_flags |= IFA_ROUTE; - ifafree(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); + } + IFA_REMREF(&ia->ia_ifa); break; } } @@ -700,9 +754,9 @@ rip_ctlinput( u_int32_t rip_sendspace = RIPSNDQ; u_int32_t rip_recvspace = RIPRCVQ; -SYSCTL_INT(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW, +SYSCTL_INT(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW | CTLFLAG_LOCKED, &rip_sendspace, 0, "Maximum outgoing raw IP datagram size"); -SYSCTL_INT(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW, +SYSCTL_INT(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW | CTLFLAG_LOCKED, &rip_recvspace, 0, "Maximum incoming raw IP datagram size"); static int @@ -770,6 +824,7 @@ rip_bind(struct socket *so, struct sockaddr *nam, __unused struct proc *p) struct inpcb *inp = sotoinpcb(so); struct sockaddr_in *addr = (struct sockaddr_in *)nam; struct ifaddr *ifa = NULL; + unsigned int outif = 0; if (nam->sa_len != sizeof(*addr)) return EINVAL; @@ -781,10 +836,13 @@ rip_bind(struct socket *so, struct sockaddr *nam, __unused struct proc *p) return EADDRNOTAVAIL; } else if (ifa) { - ifafree(ifa); - ifa = NULL; + IFA_LOCK(ifa); + outif = ifa->ifa_ifp->if_index; + IFA_UNLOCK(ifa); + IFA_REMREF(ifa); } inp->inp_laddr = addr->sin_addr; + inp->inp_last_outif = outif; return 0; } @@ -815,7 +873,7 @@ rip_shutdown(struct socket *so) __private_extern__ int rip_send(struct socket *so, __unused int flags, struct mbuf *m, struct sockaddr *nam, - __unused struct mbuf *control, __unused struct proc *p) + struct mbuf *control, __unused struct proc *p) { struct inpcb *inp = sotoinpcb(so); register u_int32_t dst; @@ -979,7 +1037,7 @@ rip_pcblist SYSCTL_HANDLER_ARGS return error; } 
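/*
 * Illustrative sketch, not part of the patch: the ipoa_boundif plumbing in
 * rip_output() above is what a userland raw socket reaches through the
 * IP_BOUND_IF socket option (assumed available in this SDK); once set,
 * every send is scoped to that interface index.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <net/if.h>

static int
example_bind_to_interface(int s, const char *ifname)
{
	unsigned int idx = if_nametoindex(ifname);	/* e.g. "en0" */

	if (idx == 0)
		return (-1);
	return (setsockopt(s, IPPROTO_IP, IP_BOUND_IF, &idx, sizeof (idx)));
}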
-SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD, 0, 0, +SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, rip_pcblist, "S,xinpcb", "List of active raw IP sockets"); #if !CONFIG_EMBEDDED @@ -1082,11 +1140,26 @@ rip_pcblist64 SYSCTL_HANDLER_ARGS return error; } -SYSCTL_PROC(_net_inet_raw, OID_AUTO, pcblist64, CTLFLAG_RD, 0, 0, +SYSCTL_PROC(_net_inet_raw, OID_AUTO, pcblist64, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, rip_pcblist64, "S,xinpcb64", "List of active raw IP sockets"); #endif /* !CONFIG_EMBEDDED */ + +static int +rip_pcblist_n SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int error = 0; + + error = get_pcblist_n(IPPROTO_IP, req, &ripcbinfo); + + return error; +} + +SYSCTL_PROC(_net_inet_raw, OID_AUTO, pcblist_n, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, + rip_pcblist_n, "S,xinpcb_n", "List of active raw IP sockets"); + struct pr_usrreqs rip_usrreqs = { rip_abort, pru_accept_notsupp, rip_attach, rip_bind, rip_connect, pru_connect2_notsupp, in_control, rip_detach, rip_disconnect, diff --git a/bsd/netinet/tcp.h b/bsd/netinet/tcp.h index 3b4d8f92f..a3a183bfe 100644 --- a/bsd/netinet/tcp.h +++ b/bsd/netinet/tcp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -185,8 +185,6 @@ struct tcphdr { #define TCP_MAX_WINSHIFT 14 /* maximum window shift */ -#define TCP_MAXBURST 4 /* maximum segments in a burst */ - #define TCP_MAXHLEN (0xf<<2) /* max length of header in bytes */ #define TCP_MAXOLEN (TCP_MAXHLEN - sizeof(struct tcphdr)) /* max space left for options */ @@ -202,6 +200,90 @@ struct tcphdr { #define TCP_NOOPT 0x08 /* don't use TCP options */ #define TCP_KEEPALIVE 0x10 /* idle time used when SO_KEEPALIVE is enabled */ #define TCP_CONNECTIONTIMEOUT 0x20 /* connection timeout */ +#define PERSIST_TIMEOUT 0x40 /* time after which a connection in + * persist timeout will terminate. + * see draft-ananth-tcpm-persist-02.txt + */ +#define TCP_RXT_CONNDROPTIME 0x80 /* time after which tcp retransmissions will be + * stopped and the connection will be dropped + */ +#define TCP_RXT_FINDROP 0x100 /* when this option is set, drop a connection + * after retransmitting the FIN 3 times. It will + * prevent holding too many mbufs in socket + * buffer queues. + */ +#ifdef PRIVATE +#define TCP_INFO 0x200 /* retrieve tcp_info structure */ + +/* + * The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits + * the caller to query certain information about the state of a TCP + * connection. We provide an overlapping set of fields with the Linux + * implementation, but since this is a fixed size structure, room has been + * left for growth. In order to maximize potential future compatibility with + * the Linux API, the same variable names and order have been adopted, and + * padding left to make room for omitted fields in case they are added later. + * + * XXX: This is currently an unstable ABI/API, in that it is expected to + * change. + */ +#pragma pack(4) + +#define TCPI_OPT_TIMESTAMPS 0x01 +#define TCPI_OPT_SACK 0x02 +#define TCPI_OPT_WSCALE 0x04 +#define TCPI_OPT_ECN 0x08 + +struct tcp_info { + u_int8_t tcpi_state; /* TCP FSM state. */ + u_int8_t tcpi_options; /* Options enabled on conn. */ + u_int8_t tcpi_snd_wscale; /* RFC1323 send shift value. */ + u_int8_t tcpi_rcv_wscale; /* RFC1323 recv shift value. */ + + u_int32_t tcpi_snd_mss; /* Max segment size for send. 
*/ + u_int32_t tcpi_rcv_mss; /* Max segment size for receive. */ + + u_int32_t tcpi_snd_ssthresh; /* Slow start threshold. */ + u_int32_t tcpi_snd_cwnd; /* Send congestion window. */ + + u_int32_t tcpi_rcv_space; /* Advertised recv window. */ + + u_int32_t tcpi_snd_wnd; /* Advertised send window. */ + u_int32_t tcpi_snd_bwnd; /* Bandwidth send window. */ + u_int32_t tcpi_snd_nxt; /* Next egress seqno */ + u_int32_t tcpi_rcv_nxt; /* Next ingress seqno */ + + int32_t tcpi_last_outif; /* if_index of interface used to send last */ +}; + +/* + * Note that IPv6 link local addresses should have the appropriate scope ID + */ + +struct info_tuple { + u_int8_t itpl_proto; + union { + struct sockaddr _itpl_sa; + struct sockaddr_in _itpl_sin; + struct sockaddr_in6 _itpl_sin6; + } itpl_localaddr; + union { + struct sockaddr _itpl_sa; + struct sockaddr_in _itpl_sin; + struct sockaddr_in6 _itpl_sin6; + } itpl_remoteaddr; +}; + +#define itpl_local_sa itpl_localaddr._itpl_sa +#define itpl_local_sin itpl_localaddr._itpl_sin +#define itpl_local_sin6 itpl_localaddr._itpl_sin6 +#define itpl_remote_sa itpl_remoteaddr._itpl_sa +#define itpl_remote_sin itpl_remoteaddr._itpl_sin +#define itpl_remote_sin6 itpl_remoteaddr._itpl_sin6 + +#pragma pack() + +#endif /* PRIVATE */ #endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ #endif diff --git a/bsd/netinet/tcp_cc.h b/bsd/netinet/tcp_cc.h new file mode 100644 index 000000000..c78ba3531 --- /dev/null +++ b/bsd/netinet/tcp_cc.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/*- + * Copyright (c) 2008 Swinburne University of Technology, Melbourne, Australia + * All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University, by Lawrence Stewart and James Healy, + * made possible in part by a grant from the Cisco University Research Program + * Fund at Community Foundation Silicon Valley. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
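/*
 * Illustrative sketch, not part of the patch: as the tcp_info comment above
 * warns, TCP_INFO is a private, unstable interface, so this is only the
 * shape of a query against it (the option and struct come from this private
 * header, not the public SDK).
 */
#include <sys/socket.h>
#include <netinet/tcp.h>

static int
example_query_tcp_info(int s)
{
	struct tcp_info ti;
	socklen_t len = sizeof (ti);

	if (getsockopt(s, IPPROTO_TCP, TCP_INFO, &ti, &len) != 0)
		return (-1);
	return (ti.tcpi_state);		/* TCP FSM state of the connection */
}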
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET_CC_H_ +#define _NETINET_CC_H_ + +#ifdef KERNEL + +#include <netinet/tcp_var.h> + +#define TCP_CC_ALGO_NEWRENO_INDEX 0 /* default congestion control algorithm */ +#define TCP_CC_ALGO_BACKGROUND_INDEX 1 /* congestion control for background transport */ +#define TCP_CC_ALGO_COUNT 2 /* Count of CC algorithms defined */ + +#define TCP_CA_NAME_MAX 16 /* Maximum characters in the name of a CC algorithm */ + +/* + * Structure to hold definition various actions defined by a congestion control + * algorithm for TCP. This can be used to change the congestion control on a + * connection based on the user settings of priority of a connection. + */ +struct tcp_cc_algo { + char name[TCP_CA_NAME_MAX]; + uint32_t num_sockets; + uint32_t flags; + + /* init the congestion algorithm for the specified control block */ + int (*init) (struct tcpcb *tp); + + /* cleanup any state that is stored in the connection related to the algorithm */ + int (*cleanup) (struct tcpcb *tp); + + /* initialize cwnd at the start of a connection */ + void (*cwnd_init) (struct tcpcb *tp); + + /* called on the receipt of in-sequence ack during congestion avoidance phase */ + void (*inseq_ack_rcvd) (struct tcpcb *tp, struct tcphdr *th); + + /* called on the receipt of a valid ack */ + void (*ack_rcvd) (struct tcpcb *tp, struct tcphdr *th); + + /* called before entering FR */ + void (*pre_fr) (struct tcpcb *tp, struct tcphdr *th); + + /* after exiting FR */ + void (*post_fr) (struct tcpcb *tp, struct tcphdr *th); + + /* perform tasks when data transfer resumes after an idle period */ + void (*after_idle) (struct tcpcb *tp); + + /* perform tasks when the connection's retransmit timer expires */ + void (*after_timeout) (struct tcpcb *tp); + + /* Whether or not to delay the ack */ + int (*delay_ack)(struct tcpcb *tp, struct tcphdr *th); + + /* Switch a connection to this CC algorithm after sending some packets */ + void (*switch_to)(struct tcpcb *tp, uint16_t old_cc_index); + +} __attribute__((aligned(4))); + +extern struct tcp_cc_algo* tcp_cc_algo_list[TCP_CC_ALGO_COUNT]; + +#define CC_ALGO(tp) (tcp_cc_algo_list[tp->tcp_cc_index]) + +#endif /* KERNEL */ +#endif /* _NETINET_CC_H_ */ diff --git a/bsd/netinet/tcp_debug.c b/bsd/netinet/tcp_debug.c index 58b1141b2..8ba9eb6af 100644 --- a/bsd/netinet/tcp_debug.c +++ b/bsd/netinet/tcp_debug.c @@ -96,7 +96,7 @@ #if TCPDEBUG __private_extern__ int tcpconsdebug = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcpconsdebug, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcpconsdebug, 
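/*
 * Illustrative sketch, not part of the patch: a congestion-control module
 * fills in one tcp_cc_algo vector and is reached through CC_ALGO(tp). The
 * "example" algorithm below is hypothetical and only sketches the shape;
 * callers tolerate unset entries (cf. the NULL checks on pre_fr/delay_ack
 * in tcp_input.c below).
 */
static void
example_cwnd_init(struct tcpcb *tp)
{
	tp->snd_cwnd = tp->t_maxseg;	/* start slow-start at one segment */
}

static int
example_delay_ack(struct tcpcb *tp, struct tcphdr *th)
{
	/* delay unless we last advertised a zero window or saw a push */
	return ((tp->t_flags & TF_RXWIN0SENT) == 0 &&
	    (th->th_flags & TH_PUSH) == 0);
}

static struct tcp_cc_algo example_cc_algo = {
	.name = "example",
	.cwnd_init = example_cwnd_init,
	.delay_ack = example_delay_ack,
};
/* dispatch then looks like: if (CC_ALGO(tp)->delay_ack != NULL) ... */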
CTLFLAG_RW | CTLFLAG_LOCKED, &tcpconsdebug, 0, "Turn tcp debugging on or off"); #endif diff --git a/bsd/netinet/tcp_input.c b/bsd/netinet/tcp_input.c index b65e9d5c6..6f06b2b14 100644 --- a/bsd/netinet/tcp_input.c +++ b/bsd/netinet/tcp_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -86,6 +86,7 @@ #include <net/if.h> #include <net/if_types.h> #include <net/route.h> +#include <net/ntstat.h> #include <netinet/in.h> #include <netinet/in_systm.h> @@ -95,6 +96,7 @@ #include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ #include <netinet/in_pcb.h> #include <netinet/ip_var.h> +#include <mach/sdt.h> #if INET6 #include <netinet/ip6.h> #include <netinet/icmp6.h> @@ -107,6 +109,8 @@ #include <netinet/tcp_seq.h> #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> +#include <netinet/tcp_cc.h> +#include <kern/zalloc.h> #if INET6 #include <netinet6/tcp6_var.h> #endif @@ -131,10 +135,6 @@ struct tcphdr tcp_savetcp; #include <sys/kdebug.h> -#ifndef __APPLE__ -MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry"); -#endif - #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 0) #define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 2) #define DBG_FNC_TCP_INPUT NETDBG_CODE(DBG_NETTCP, (3 << 8)) @@ -150,26 +150,31 @@ extern int ipsec_bypass; struct tcpstat tcpstat; static int log_in_vain = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW | CTLFLAG_LOCKED, &log_in_vain, 0, "Log all incoming TCP connections"); static int blackhole = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW | CTLFLAG_LOCKED, &blackhole, 0, "Do not send RST when dropping refused connections"); int tcp_delack_enabled = 3; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_delack_enabled, 0, "Delay ACK to try and piggyback it onto a data packet"); int tcp_lq_overflow = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_lq_overflow, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_lq_overflow, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_lq_overflow, 0, "Listen Queue Overflow"); +int tcp_recv_bg = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbg, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_recv_bg, 0, + "Receive background"); + #if TCP_DROP_SYNFIN static int drop_synfin = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW | CTLFLAG_LOCKED, &drop_synfin, 0, "Drop TCP packets with SYN+FIN set"); #endif @@ -177,59 +182,85 @@ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "TCP Segment Reassembly Queue"); __private_extern__ int tcp_reass_maxseg = 0; -SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_reass_maxseg, 0, "Global maximum number of TCP Segments in Reassembly Queue"); __private_extern__ int tcp_reass_qsize = 0; -SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD, +SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_reass_qsize, 0, "Global number of TCP Segments currently in Reassembly Queue"); static int tcp_reass_overflows = 0; -SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD, 
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_reass_overflows, 0, "Global number of TCP Segment Reassembly Queue Overflows"); __private_extern__ int slowlink_wsize = 8192; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowlink_wsize, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowlink_wsize, CTLFLAG_RW | CTLFLAG_LOCKED, &slowlink_wsize, 0, "Maximum advertised window size for slowlink"); -static int maxseg_unacked = 8; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, maxseg_unacked, CTLFLAG_RW, +int maxseg_unacked = 8; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, maxseg_unacked, CTLFLAG_RW | CTLFLAG_LOCKED, &maxseg_unacked, 0, "Maximum number of outstanding segments left unacked"); -static int tcp_do_rfc3465 = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW, +int tcp_do_rfc3465 = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_rfc3465, 0, ""); -static int tcp_do_rfc3465_lim2 = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465_lim2, CTLFLAG_RW, +int tcp_do_rfc3465_lim2 = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3465_lim2, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_rfc3465_lim2, 0, "Appropriate bytes counting w/ L=2*SMSS"); +int rtt_samples_per_slot = 20; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rtt_samples_per_slot, CTLFLAG_RW | CTLFLAG_LOCKED, + &rtt_samples_per_slot, 0, "Number of RTT samples stored for rtt history"); + +int tcp_allowed_iaj = ALLOWED_IAJ; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, recv_allowed_iaj, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_allowed_iaj, 0, "Allowed inter-packet arrival jitter"); + +int tcp_acc_iaj_high_thresh = ACC_IAJ_HIGH_THRESH; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, acc_iaj_high_thresh, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_acc_iaj_high_thresh, 0, "Used in calculating maximum accumulated IAJ"); + #if CONFIG_IFEF_NOWINDOWSCALE int tcp_obey_ifef_nowindowscale = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, obey_ifef_nowindowscale, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, obey_ifef_nowindowscale, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_obey_ifef_nowindowscale, 0, ""); #endif extern int tcp_TCPTV_MIN; +extern int tcp_acc_iaj_high; +extern int tcp_acc_iaj_react_limit; +extern struct zone *tcp_reass_zone; + u_int32_t tcp_now; +struct timeval tcp_uptime; /* uptime when tcp_now was last updated */ +lck_spin_t *tcp_uptime_lock; /* Used to synchronize updates to tcp_now */ struct inpcbhead tcb; #define tcb6 tcb /* for KAME src sync over BSD*'s */ struct inpcbinfo tcbinfo; -static void tcp_dooptions(struct tcpcb *, - u_char *, int, struct tcphdr *, struct tcpopt *, unsigned int); +static void tcp_dooptions(struct tcpcb *, u_char *, int, struct tcphdr *, + struct tcpopt *, unsigned int); static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *); static void tcp_xmit_timer(struct tcpcb *, int); static inline unsigned int tcp_maxmtu(struct rtentry *); +static inline int tcp_stretch_ack_enable(struct tcpcb *tp); + +#if TRAFFIC_MGT +static inline void update_iaj_state(struct tcpcb *tp, uint32_t tlen, int reset_size); +void compute_iaj(struct tcpcb *tp); +static inline void clear_iaj_state(struct tcpcb *tp); +#endif /* TRAFFIC_MGT */ + #if INET6 static inline unsigned int tcp_maxmtu6(struct rtentry *); #endif @@ -247,9 +278,7 @@ do { \ #define ND6_HINT(tp) #endif -extern u_int32_t *delack_bitmask; - -extern void add_to_time_wait(struct tcpcb *); +extern void add_to_time_wait(struct tcpcb *, uint32_t delay); extern void
postevent(struct socket *, struct sockbuf *, int); extern void ipfwsyslog( int level, const char *format,...); @@ -269,37 +298,140 @@ __private_extern__ int tcp_win_scale; #define log_in_vain_log( a ) { log a; } #endif +int tcp_rcvunackwin = TCPTV_UNACKWIN; +int tcp_maxrcvidle = TCPTV_MAXRCVIDLE; +int tcp_rcvsspktcnt = TCP_RCV_SS_PKTCOUNT; -/* - * Indicate whether this ack should be delayed. - * We can delay the ack if: - * - delayed acks are enabled (set to 1) and - * - our last ack wasn't a 0-sized window. We never want to delay - * the ack that opens up a 0-sized window. - * - delayed acks are enabled (set to 2, "more compatible") and - * - our last ack wasn't a 0-sized window. - * - if the peer hasn't sent us a TH_PUSH data packet (this solves 3649245) - * - the peer hasn't sent us a TH_PUSH data packet, if he did, take this as a clue that we - * need to ACK with no delay. This helps higher level protocols who won't send - * us more data even if the window is open because their last "segment" hasn't been ACKed - * - delayed acks are enabled (set to 3, "streaming detection") and - * - if we receive more than "maxseg_unacked" full packets per second on this socket - * - if we don't have more than "maxseg_unacked" delayed so far - * - if those criteria aren't met, acts like "2". Allowing faster acking while browsing for example. - * - */ -#define DELAY_ACK(tp) \ - (((tcp_delack_enabled == 1) && ((tp->t_flags & TF_RXWIN0SENT) == 0)) || \ - (((tcp_delack_enabled == 2) && (tp->t_flags & TF_RXWIN0SENT) == 0) && \ - ((thflags & TH_PUSH) == 0) && ((tp->t_flags & TF_DELACK) == 0)) || \ - (((tcp_delack_enabled == 3) && (tp->t_flags & TF_RXWIN0SENT) == 0) && \ - (tp->t_rcvtime == 0) && ((thflags & TH_PUSH) == 0) && \ - (((tp->t_unacksegs == 0)) || \ - ((tp->rcv_byps > (maxseg_unacked * tp->t_maxseg)) && (tp->t_unacksegs < maxseg_unacked))))) +#define DELAY_ACK(tp, th) (CC_ALGO(tp)->delay_ack != NULL && CC_ALGO(tp)->delay_ack(tp, th)) static int tcp_dropdropablreq(struct socket *head); static void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th); +static void update_base_rtt(struct tcpcb *tp, uint32_t rtt); +uint32_t get_base_rtt(struct tcpcb *tp); +void tcp_set_background_cc(struct socket *so); +void tcp_set_foreground_cc(struct socket *so); +static void tcp_set_new_cc(struct socket *so, uint16_t cc_index); + +#if TRAFFIC_MGT +void +reset_acc_iaj(struct tcpcb *tp) +{ + tp->acc_iaj = 0; + tp->iaj_rwintop = 0; + clear_iaj_state(tp); +} + +static inline void +update_iaj_state(struct tcpcb *tp, uint32_t size, int rst_size) +{ + if (rst_size > 0) + tp->iaj_size = 0; + if (tp->iaj_size == 0 || size >= tp->iaj_size) { + tp->iaj_size = size; + tp->iaj_rcv_ts = tcp_now; + tp->iaj_small_pkt = 0; + } +} + +static inline void +clear_iaj_state(struct tcpcb *tp) +{ + tp->iaj_rcv_ts = 0; +} + +/* For every 32 bit unsigned integer(v), this function will find the + * largest integer n such that (n*n <= v). This takes at most 16 iterations + * irrespective of the value of v and does not involve multiplications. 
+ */ +static inline int +isqrt(unsigned int val) { + unsigned int sqrt_cache[11] = {0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100}; + unsigned int temp, g=0, b=0x8000, bshft=15; + if ( val <= 100) { + for (g = 0; g <= 10; ++g) { + if (sqrt_cache[g] > val) { + g--; + break; + } else if (sqrt_cache[g] == val) { + break; + } + } + } else { + do { + temp = (((g << 1) + b) << (bshft--)); + if (val >= temp) { + g += b; + val -= temp; + } + b >>= 1; + } while ( b > 0 && val > 0); + } + return(g); +} + +void +compute_iaj(struct tcpcb *tp) +{ + /* When accumulated IAJ reaches MAX_ACC_IAJ in milliseconds, throttle the + * receive window to a minimum of MIN_IAJ_WIN packets + */ +#define MAX_ACC_IAJ (tcp_acc_iaj_high_thresh + tcp_acc_iaj_react_limit) + + uint32_t allowed_iaj, acc_iaj = 0; + uint32_t cur_iaj = tcp_now - tp->iaj_rcv_ts; + + uint32_t mean, temp; + int32_t cur_iaj_dev; + cur_iaj_dev = (cur_iaj - tp->avg_iaj); + + /* Allow a jitter of "allowed_iaj" milliseconds. Some connections may have a + * constant jitter more than that. We detect this by using + * standard deviation. + */ + allowed_iaj = tp->avg_iaj + tp->std_dev_iaj; + if (allowed_iaj < tcp_allowed_iaj) + allowed_iaj = tcp_allowed_iaj; + + /* Initially when the connection starts, the senders congestion window + * is small. During this period we avoid throttling a connection because + * we do not have a good starting point for allowed_iaj. IAJ_IGNORE_PKTCNT + * is used to quietly gloss over the first few packets. + */ + if (tp->iaj_pktcnt > IAJ_IGNORE_PKTCNT) { + if ( cur_iaj <= allowed_iaj ) { + if (tp->acc_iaj >= 2) + acc_iaj = tp->acc_iaj - 2; + else + acc_iaj = 0; + } else { + acc_iaj = tp->acc_iaj + (cur_iaj - allowed_iaj); + } + + if (acc_iaj > MAX_ACC_IAJ) + acc_iaj = MAX_ACC_IAJ; + tp->acc_iaj = acc_iaj; + } + + /* Compute weighted average where the history has a weight of + * 15 out of 16 and the current value has a weight of 1 out of 16. + * This will make the short-term measurements have more weight. + */ + tp->avg_iaj = (((tp->avg_iaj << 4) - tp->avg_iaj) + cur_iaj) >> 4; + + /* Compute Root-mean-square of deviation where mean is a weighted + * average as described above + */ + temp = tp->std_dev_iaj * tp->std_dev_iaj; + mean = (((temp << 4) - temp) + (cur_iaj_dev * cur_iaj_dev)) >> 4; + + tp->std_dev_iaj = isqrt(mean); + + DTRACE_TCP3(iaj, struct tcpcb *, tp, uint32_t, cur_iaj, uint32_t, allowed_iaj); + + return; +} +#endif /* TRAFFIC_MGT */ static int tcp_reass(tp, th, tlenp, m) @@ -322,6 +454,27 @@ tcp_reass(tp, th, tlenp, m) */ if (th == NULL) goto present; + + /* If the reassembly queue already has entries or if we are going to add + * a new one, then the connection has reached a loss state. + * Reset the stretch-ack algorithm at this point. + */ + if ((tp->t_flags & TF_STRETCHACK) != 0) + tcp_reset_stretch_ack(tp); + + /* When the connection reaches a loss state, we need to send more acks + * for a period of time so that the sender's congestion window will + * open. Wait until we see some packets on the connection before + * stretching acks again. + */ + tp->t_flagsext |= TF_RCVUNACK_WAITSS; + tp->rcv_waitforss = 0; + + +#if TRAFFIC_MGT + if (tp->acc_iaj > 0) + reset_acc_iaj(tp); +#endif /* TRAFFIC_MGT */ /* * Limit the number of segments in the reassembly queue to prevent @@ -340,8 +493,7 @@ tcp_reass(tp, th, tlenp, m) } /* Allocate a new queue entry. If we can't, just drop the pkt. 
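/*
 * Illustrative sketch, not part of the patch: the fixed-point arithmetic in
 * compute_iaj() written out with concrete numbers. With the history
 * weighted 15/16, an average of 10ms absorbing a 26ms sample becomes
 * (15*10 + 26)/16 = 11ms; likewise isqrt() is floor(sqrt(v)), e.g.
 * isqrt(176) == 13 since 13*13 = 169 <= 176 < 196.
 */
static uint32_t
example_ewma_step(uint32_t avg, uint32_t cur)
{
	/* ((avg << 4) - avg + cur) >> 4  ==  (15 * avg + cur) / 16 */
	return ((((avg << 4) - avg) + cur) >> 4);	/* 10, 26 -> 11 */
}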
XXX */ - MALLOC(te, struct tseg_qent *, sizeof (struct tseg_qent), M_TSEGQ, - M_NOWAIT); + te = (struct tseg_qent *) zalloc_noblock(tcp_reass_zone); if (te == NULL) { tcpstat.tcps_rcvmemdrop++; m_freem(m); @@ -371,8 +523,14 @@ tcp_reass(tp, th, tlenp, m) if (i >= *tlenp) { tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += *tlenp; + if (nstat_collect) { + nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1, *tlenp, NSTAT_RX_FLAG_DUPLICATE); + locked_add_64(&tp->t_inpcb->inp_stat->rxpackets, 1); + locked_add_64(&tp->t_inpcb->inp_stat->rxbytes, *tlenp); + tp->t_stat.rxduplicatebytes += *tlenp; + } m_freem(m); - FREE(te, M_TSEGQ); + zfree(tcp_reass_zone, te); tcp_reass_qsize--; /* * Try to present any queued data @@ -389,6 +547,12 @@ tcp_reass(tp, th, tlenp, m) } tcpstat.tcps_rcvoopack++; tcpstat.tcps_rcvoobyte += *tlenp; + if (nstat_collect) { + nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1, *tlenp, NSTAT_RX_FLAG_OUT_OF_ORDER); + locked_add_64(&tp->t_inpcb->inp_stat->rxpackets, 1); + locked_add_64(&tp->t_inpcb->inp_stat->rxbytes, *tlenp); + tp->t_stat.rxoutoforderbytes += *tlenp; + } /* * While we overlap succeeding segments trim them or, @@ -408,7 +572,7 @@ tcp_reass(tp, th, tlenp, m) nq = LIST_NEXT(q, tqe_q); LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); - FREE(q, M_TSEGQ); + zfree(tcp_reass_zone, q); tcp_reass_qsize--; q = nq; } @@ -442,10 +606,11 @@ present: if (so->so_state & SS_CANTRCVMORE) m_freem(q->tqe_m); else { + so_recv_data_stat(so, q->tqe_m, 0); /* XXXX */ if (sbappendstream(&so->so_rcv, q->tqe_m)) dowakeup = 1; } - FREE(q, M_TSEGQ); + zfree(tcp_reass_zone, q); tcp_reass_qsize--; q = nq; } while (q && q->tqe_th->th_seq == tp->rcv_nxt); @@ -480,15 +645,15 @@ present: */ static void tcp_reduce_congestion_window( - struct tcpcb *tp) + struct tcpcb *tp, struct tcphdr *th) { - u_int win; - - win = min(tp->snd_wnd, tp->snd_cwnd) / - 2 / tp->t_maxseg; - if (win < 2) - win = 2; - tp->snd_ssthresh = win * tp->t_maxseg; + /* + * If the current tcp cc module has + * defined a hook for tasks to run + * before entering FR, call it + */ + if (CC_ALGO(tp)->pre_fr != NULL) + CC_ALGO(tp)->pre_fr(tp, th); ENTER_FASTRECOVERY(tp); tp->snd_recover = tp->snd_max; tp->t_timer[TCPT_REXMT] = 0; @@ -505,10 +670,9 @@ tcp_reduce_congestion_window( */ #if INET6 int -tcp6_input(mp, offp) - struct mbuf **mp; - int *offp; +tcp6_input(struct mbuf **mp, int *offp, int proto) { +#pragma unused(proto) register struct mbuf *m = *mp; struct in6_ifaddr *ia6; @@ -519,20 +683,71 @@ tcp6_input(mp, offp) * better place to put this in? */ ia6 = ip6_getdstifaddr(m); - if (ia6 && (ia6->ia6_flags & IN6_IFF_ANYCAST)) { - struct ip6_hdr *ip6; + if (ia6 != NULL) { + IFA_LOCK_SPIN(&ia6->ia_ifa); + if (ia6->ia6_flags & IN6_IFF_ANYCAST) { + struct ip6_hdr *ip6; - ip6 = mtod(m, struct ip6_hdr *); - icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); + ip6 = mtod(m, struct ip6_hdr *); + icmp6_error(m, ICMP6_DST_UNREACH, + ICMP6_DST_UNREACH_ADDR, (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); - return IPPROTO_DONE; + return (IPPROTO_DONE); + } + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); } tcp_input(m, *offp); - return IPPROTO_DONE; + return (IPPROTO_DONE); } #endif +/* A receiver will evaluate the flow of packets on a connection + * to see if it can reduce ack traffic. The receiver will start + * stretching acks if all of the following conditions are met: + * 1. tcp_delack_enabled is set to 3 + * 2. 
If the bytes received in the last 100ms is greater than a threshold + * defined by maxseg_unacked + * 3. If the connection has not been idle for tcp_maxrcvidle period. + * 4. If the connection has seen enough packets to let the slow-start + * finish after connection establishment or after some packet loss. + * + * The receiver will stop stretching acks if there is congestion/reordering + * as indicated by packets on reassembly queue or an ECN. If the delayed-ack + * timer fires while stretching acks, it means that the packet flow has gone + * below the threshold defined by maxseg_unacked and the receiver will stop + * stretching acks. The receiver gets no indication when slow-start is completed + * or when the connection reaches an idle state. That is why we use + * tcp_rcvsspktcnt to cover slow-start and tcp_maxrcvidle to identify idle + * state. + */ + static inline int + tcp_stretch_ack_enable(struct tcpcb *tp) { + if (tp->rcv_by_unackwin >= (maxseg_unacked * tp->t_maxseg) && + TSTMP_GT(tp->rcv_unackwin + tcp_maxrcvidle, tcp_now) && + (((tp->t_flagsext & TF_RCVUNACK_WAITSS) == 0) || + (tp->rcv_waitforss >= tcp_rcvsspktcnt))) { + return(1); + } + return(0); +} + +/* Reset the state related to stretch-ack algorithm. This will make + * the receiver generate an ack every other packet. The receiver + * will start re-evaluating the rate at which packets come to decide + * if it can benefit by lowering the ack traffic. + */ +void +tcp_reset_stretch_ack(struct tcpcb *tp) +{ + tp->t_flags &= ~(TF_STRETCHACK); + tp->rcv_by_unackwin = 0; + tp->rcv_unackwin = tcp_now + tcp_rcvunackwin; +} + void tcp_input(m, off0) struct mbuf *m; @@ -565,7 +780,8 @@ tcp_input(m, off0) #endif struct m_tag *fwd_tag; u_char ip_ecn = IPTOS_ECN_NOTECT; - unsigned int ifscope; + unsigned int ifscope, nocell = 0; + uint8_t isconnected, isdisconnected; /* * Record the interface where this segment arrived on; this does not @@ -579,6 +795,11 @@ tcp_input(m, off0) else ifscope = IFSCOPE_NONE; + /* Since this is an entry point for input processing of tcp packets, we + * can update the tcp clock here. + */ + calculate_tcp_clock(); + /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. 
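[Editor's note] The stretch-ack predicate in tcp_stretch_ack_enable() above boils down to three checks: enough bytes in the current window, not idle past tcp_maxrcvidle, and slow-start either finished or not being waited out. A stand-alone sketch of the same decision; the struct fields and threshold values below are illustrative stand-ins for the tcpcb fields and sysctls, not the kernel's definitions:

#include <stdint.h>
#include <stdio.h>

/* Wrap-safe timestamp comparison, in the spirit of the kernel's TSTMP_GT(). */
#define TSTMP_GT(a, b) ((int32_t)((a) - (b)) > 0)

struct conn {
	uint32_t rcv_by_unackwin;  /* bytes received in the current window */
	uint32_t rcv_unackwin;     /* start timestamp of the window */
	uint32_t rcv_waitforss;    /* packets seen while waiting out slow-start */
	int      wait_ss;          /* TF_RCVUNACK_WAITSS equivalent */
};

static int
stretch_ack_enable(const struct conn *c, uint32_t now, uint32_t maxseg,
    uint32_t maxseg_unacked, uint32_t maxrcvidle, uint32_t rcvsspktcnt)
{
	return (c->rcv_by_unackwin >= maxseg_unacked * maxseg &&
	    TSTMP_GT(c->rcv_unackwin + maxrcvidle, now) &&
	    (!c->wait_ss || c->rcv_waitforss >= rcvsspktcnt));
}

int
main(void)
{
	struct conn c = { 3 * 1448, 1000, 0, 0 };
	/* Enough bytes in the window, not idle, not waiting on slow-start. */
	printf("enable=%d\n", stretch_ack_enable(&c, 1100, 1448, 2, 250, 512));
	return 0;
}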
*/ if (!SLIST_EMPTY(&m->m_pkthdr.tags)) { fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, @@ -616,12 +837,29 @@ tcp_input(m, off0) /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */ ip6 = mtod(m, struct ip6_hdr *); tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; - if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { - tcpstat.tcps_rcvbadsum++; - goto dropnosock; - } th = (struct tcphdr *)((caddr_t)ip6 + off0); + if ((apple_hwcksum_rx != 0) && (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) { + if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) + th->th_sum = m->m_pkthdr.csum_data; + else + th->th_sum = in6_cksum_phdr(&ip6->ip6_src, + &ip6->ip6_dst, htonl(sizeof(struct tcphdr)), + htonl(IPPROTO_TCP)); + + th->th_sum ^= 0xffff; + if (th->th_sum) { + tcpstat.tcps_rcvbadsum++; + goto dropnosock; + } + } + else { + if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) { + tcpstat.tcps_rcvbadsum++; + goto dropnosock; + } + } + KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport), (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])), th->th_seq, th->th_ack, th->th_win); @@ -637,6 +875,11 @@ tcp_input(m, off0) /* XXX stat */ goto dropnosock; } + DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL, + struct ip6_hdr *, ip6, struct tcpcb *, NULL, + struct tcphdr *, th); + + ip_ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK; } else #endif /* INET6 */ { @@ -662,6 +905,9 @@ tcp_input(m, off0) th = (struct tcphdr *)((caddr_t)ip + off0); tlen = ip->ip_len; + DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL, + struct ip *, ip, struct tcpcb *, NULL, struct tcphdr *, th); + KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport), (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)), th->th_seq, th->th_ack, th->th_win); @@ -821,6 +1067,10 @@ tcp_input(m, off0) * Locate pcb for segment. */ findpcb: + + isconnected = FALSE; + isdisconnected = FALSE; + #if IPFIREWALL_FORWARD if (next_hop != NULL #if INET6 @@ -871,6 +1121,13 @@ findpcb: */ if (inp != NULL && (inp->inp_flags & INP_BOUND_IF)) ifscope = inp->inp_boundif; + /* + * If the PCB is present and the socket isn't allowed to use + * the cellular interface, indicate it as such for tcp_respond. + */ + if (inp != NULL && (inp->inp_flags & INP_NO_IFT_CELLULAR)) + nocell = 1; + #if IPSEC if (ipsec_bypass == 0) { #if INET6 @@ -981,7 +1238,7 @@ findpcb: goto dropnosock; } - tcp_lock(so, 1, (void *)2); + tcp_lock(so, 1, 0); if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { tcp_unlock(so, 1, (void *)2); inp = NULL; // pretend we didn't find it @@ -1034,10 +1291,13 @@ findpcb: struct inpcb *oinp = sotoinpcb(so); #endif /* INET6 */ unsigned int head_ifscope; + unsigned int head_nocell; /* Get listener's bound-to-interface, if any */ head_ifscope = (inp->inp_flags & INP_BOUND_IF) ? inp->inp_boundif : IFSCOPE_NONE; + /* Get listener's no-cellular information, if any */ + head_nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; /* * If the state is LISTEN then ignore segment if it contains an RST.
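[Editor's note] The checksum-offload hunk above relies on a one's-complement identity: for a valid segment, the one's-complement sum over the whole packet (payload plus the stored checksum field) folds to 0xffff, so XOR-ing the hardware-reported sum with 0xffff must leave zero. A simplified user-space illustration with a toy buffer and no pseudo-header (assumptions, not kernel code):

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

/* One's-complement sum over 16-bit words (even length only, for brevity). */
static uint16_t
ocsum(const uint16_t *p, size_t nwords)
{
	uint32_t sum = 0;
	while (nwords-- > 0)
		sum += *p++;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

int
main(void)
{
	/* Toy "segment": the last word is the checksum field. */
	uint16_t seg[4] = { 0x1234, 0xabcd, 0x0f0f, 0 };

	/* Sender side: checksum is the complement of the sum over the data. */
	seg[3] = (uint16_t)~ocsum(seg, 4);

	/* Receiver with offload: the hardware hands back the sum over the
	 * whole segment; for a valid packet that sum is 0xffff, so XOR-ing
	 * with 0xffff leaves zero, the same test as `th->th_sum ^= 0xffff`. */
	uint16_t hw_sum = ocsum(seg, 4);
	printf("valid=%d\n", (hw_sum ^ 0xffff) == 0);

	seg[1] ^= 0x0100;	/* corrupt one bit */
	hw_sum = ocsum(seg, 4);
	printf("valid=%d\n", (hw_sum ^ 0xffff) == 0);
	return 0;
}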
@@ -1130,11 +1390,18 @@ findpcb: if (isipv6 && !ip6_use_deprecated) { struct in6_ifaddr *ia6; - if ((ia6 = ip6_getdstifaddr(m)) && - (ia6->ia6_flags & IN6_IFF_DEPRECATED)) { - tp = NULL; - rstreason = BANDLIM_RST_OPENPORT; - goto dropwithreset; + ia6 = ip6_getdstifaddr(m); + if (ia6 != NULL) { + IFA_LOCK_SPIN(&ia6->ia_ifa); + if (ia6->ia6_flags & IN6_IFF_DEPRECATED) { + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); + tp = NULL; + rstreason = BANDLIM_RST_OPENPORT; + goto dropwithreset; + } + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); } } #endif @@ -1203,14 +1470,16 @@ findpcb: * can only be set to a non-zero value earlier if * the listener has such a flag set. */ -#if INET6 - if (head_ifscope != IFSCOPE_NONE && !isipv6) { -#else if (head_ifscope != IFSCOPE_NONE) { -#endif /* INET6 */ inp->inp_flags |= INP_BOUND_IF; inp->inp_boundif = head_ifscope; } + /* + * Inherit INP_NO_IFT_CELLULAR from listener. + */ + if (head_nocell) { + inp->inp_flags |= INP_NO_IFT_CELLULAR; + } #if INET6 if (isipv6) inp->in6p_laddr = ip6->ip6_dst; @@ -1277,8 +1546,11 @@ findpcb: } #endif /* inherit states from the listener */ + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_LISTEN); tp->t_state = TCPS_LISTEN; tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT|TF_NODELAY); + tp->t_flagsext |= (tp0->t_flagsext & TF_RXTFINDROP); tp->t_keepinit = tp0->t_keepinit; tp->t_inpcb->inp_ip_ttl = tp0->t_inpcb->inp_ip_ttl; @@ -1300,9 +1572,8 @@ findpcb: KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END,0,0,0,0,0); } } -#if 1 - lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); -#endif + lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); + /* * Radar 3529618 * This is the second part of the MSS DoS prevention code (after @@ -1329,13 +1600,9 @@ findpcb: * this check. * * Account for packet if payload packet, skip over ACK, etc. - * - * The packet per second count is done all the time and is also used - * by "DELAY_ACK" to detect streaming situations. - * */ if (tp->t_state == TCPS_ESTABLISHED && tlen > 0) { - if (tp->rcv_reset > tcp_now) { + if (TSTMP_GT(tp->rcv_reset, tcp_now)) { tp->rcv_pps++; tp->rcv_byps += tlen + off; if (tp->rcv_byps > tp->rcv_maxbyps) @@ -1369,25 +1636,30 @@ findpcb: tp->rcv_pps = 1; tp->rcv_byps = tlen + off; } + + /* Evaluate the rate of arrival of packets to see if the + * receiver can reduce the ack traffic. The algorithm to + * stretch acks will be enabled if the connection meets + * certain criteria defined in tcp_stretch_ack_enable function. 
+ */ + if ((tp->t_flagsext & TF_RCVUNACK_WAITSS) != 0) { + tp->rcv_waitforss++; + } + if (tcp_stretch_ack_enable(tp)) { + tp->t_flags |= TF_STRETCHACK; + tp->t_flagsext &= ~(TF_RCVUNACK_WAITSS); + tp->rcv_waitforss = 0; + } else { + tp->t_flags &= ~(TF_STRETCHACK); + } + if (TSTMP_GT(tp->rcv_unackwin, tcp_now)) { + tp->rcv_by_unackwin += (tlen + off); + } else { + tp->rcv_unackwin = tcp_now + tcp_rcvunackwin; + tp->rcv_by_unackwin = tlen + off; + } } -#if TRAFFIC_MGT - if (so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BG_REGULATE) { - tcpstat.tcps_bg_rcvtotal++; - - /* Take snapshots of pkts recv; - * tcpcb should have been initialized to 0 when allocated, - * so if 0 then this is the first time we're doing this - */ - if (!tp->tot_recv_snapshot) { - tp->tot_recv_snapshot = tcpstat.tcps_rcvtotal; - } - if (!tp->bg_recv_snapshot) { - tp->bg_recv_snapshot = tcpstat.tcps_bg_rcvtotal; - } - } -#endif /* TRAFFIC_MGT */ - /* Explicit Congestion Notification - Flag that we need to send ECT if + The IP Congestion experienced flag was set. @@ -1413,14 +1685,22 @@ findpcb: if ((thflags & TH_CWR) == TH_CWR) { tp->ecn_flags &= ~TE_SENDECE; } + + /* If we received an explicit notification of congestion in + * ip tos ecn bits or by the CWR bit in TCP header flags, reset + * the ack-stretching state. + */ + if (tp->t_state == TCPS_ESTABLISHED && (tp->t_flags & TF_STRETCHACK) != 0 && + ((ip_ecn == IPTOS_ECN_CE) || ((thflags & TH_CWR) == TH_CWR))) + tcp_reset_stretch_ack(tp); /* * Segment received on connection. * Reset idle time and keep-alive timer. */ - tp->t_rcvtime = 0; + tp->t_rcvtime = tcp_now; if (TCPS_HAVEESTABLISHED(tp->t_state)) - tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp); + tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_KEEPIDLE(tp)); /* * Process options if not in LISTEN state, @@ -1451,6 +1731,64 @@ findpcb: } } +#if TRAFFIC_MGT + /* Compute inter-packet arrival jitter. According to RFC 3550, inter-packet + * arrival jitter is defined as the difference in packet spacing at the + * receiver compared to the sender for a pair of packets. When two packets + * of maximum segment size come one after the other with consecutive + * sequence numbers, we consider them as packets sent together at the + * sender and use them as a pair to compute inter-packet arrival jitter. + * This metric indicates the delay induced by the network components due + * to queuing in edge/access routers. + */ + if (tp->t_state == TCPS_ESTABLISHED && + (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE|TH_PUSH)) == TH_ACK && + ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && + ((to.to_flags & TOF_TS) == 0 || + TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && + th->th_seq == tp->rcv_nxt && + LIST_EMPTY(&tp->t_segq)) { + if (tp->iaj_pktcnt <= IAJ_IGNORE_PKTCNT) { + tp->iaj_pktcnt++; + } + + if ( tp->iaj_size == 0 || tlen > tp->iaj_size || + (tlen == tp->iaj_size && tp->iaj_rcv_ts == 0)) { + /* State related to inter-arrival jitter is uninitialized + * or we are trying to find a good first packet to start + * computing the metric + */ + update_iaj_state(tp, tlen, 0); + } else { + if (tlen == tp->iaj_size) { + /* Compute inter-arrival jitter taking this packet + * as the second packet + */ + compute_iaj(tp); + } + if (tlen < tp->iaj_size) { + /* There is a smaller packet in the stream. + * Sometimes the maximum size supported on a path can + * change if there is a new link with smaller MTU. + * The receiver will not know about this change.
+ * If there are too many packets smaller than iaj_size, + * we try to learn the iaj_size again. + */ + tp->iaj_small_pkt++; + if (tp->iaj_small_pkt > RESET_IAJ_SIZE_THRESH) { + update_iaj_state(tp, tlen, 1); + } else { + clear_iaj_state(tp); + } + } else { + update_iaj_state(tp, tlen, 0); + } + } + } else { + clear_iaj_state(tp); + } +#endif /* TRAFFIC_MGT */ + /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has @@ -1498,11 +1836,10 @@ findpcb: if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_ssthresh && - ((!tcp_do_newreno && !tp->sack_enable && - tp->t_dupacks < tcprexmtthresh) || - ((tcp_do_newreno || tp->sack_enable) && - !IN_FASTRECOVERY(tp) && to.to_nsacks == 0 && - TAILQ_EMPTY(&tp->snd_holes)))) { + (!IN_FASTRECOVERY(tp) && + ((!tp->sack_enable && tp->t_dupacks < tcprexmtthresh) || + (tp->sack_enable && to.to_nsacks == 0 && + TAILQ_EMPTY(&tp->snd_holes))))) { /* * this is a pure ack for outstanding data. */ @@ -1511,7 +1848,7 @@ findpcb: * "bad retransmit" recovery */ if (tp->t_rxtshift == 1 && - tcp_now < tp->t_badrxtwin) { + TSTMP_LT(tcp_now, tp->t_badrxtwin)) { ++tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = @@ -1521,6 +1858,11 @@ findpcb: ENTER_FASTRECOVERY(tp); tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; + tp->t_rxtshift = 0; + tp->rxt_start = 0; + DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb, + struct tcpcb *, tp, struct tcphdr *, th, + int32_t, TCP_CC_BAD_REXMT_RECOVERY); } /* * Recalculate the transmit timer / rtt. @@ -1530,33 +1872,29 @@ findpcb: * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ - if (((to.to_flags & TOF_TS) != 0) && (to.to_tsecr != 0)) { /* Makes sure we already have a TS */ - if (!tp->t_rttlow || - tp->t_rttlow > tcp_now - to.to_tsecr) - tp->t_rttlow = tcp_now - to.to_tsecr; + if (((to.to_flags & TOF_TS) != 0) && (to.to_tsecr != 0) && + TSTMP_GEQ(tcp_now, to.to_tsecr)) { tcp_xmit_timer(tp, tcp_now - to.to_tsecr); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { - if (!tp->t_rttlow || - tp->t_rttlow > tcp_now - tp->t_rtttime) - tp->t_rttlow = tcp_now - tp->t_rtttime; - tcp_xmit_timer(tp, tp->t_rtttime); + tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); } acked = th->th_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; - /* - * Grow the congestion window, if the - * connection is cwnd bound. + + /* Handle an ack that is in sequence during congestion + * avoidance phase. The calculations in this function + * assume that snd_una is not updated yet. 
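[Editor's note] Several hunks above replace raw comparisons such as `tcp_now < tp->t_badrxtwin` with TSTMP_LT()/TSTMP_GEQ(). The point is wraparound safety: comparing via a signed difference stays correct when the 32-bit millisecond clock rolls over, while a plain `<` does not. A small demonstration, assuming the usual signed-difference definition of the macro:

#include <stdint.h>
#include <stdio.h>

/* Wrap-safe comparison in the spirit of TSTMP_LT()/SEQ_LT(): compare the
 * signed difference instead of the raw unsigned values. */
#define TSTMP_LT(a, b)  ((int32_t)((a) - (b)) < 0)

int
main(void)
{
	uint32_t now = 0xfffffff0u;	/* clock about to wrap */
	uint32_t badrxtwin = now + 100;	/* window end, wraps to 0x54 */

	/* The naive comparison is wrong across the wrap... */
	printf("naive: in-window=%d\n", now < badrxtwin);
	/* ...while the signed-difference form still reports that "now"
	 * is before the end of the bad-retransmit window. */
	printf("tstmp: in-window=%d\n", TSTMP_LT(now, badrxtwin));
	return 0;
}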
*/ - if (tp->snd_cwnd < tp->snd_wnd) { - tp->t_bytes_acked += acked; - if (tp->t_bytes_acked > tp->snd_cwnd) { - tp->t_bytes_acked -= tp->snd_cwnd; - tp->snd_cwnd += tp->t_maxseg; - } - } + if (CC_ALGO(tp)->inseq_ack_rcvd != NULL) + CC_ALGO(tp)->inseq_ack_rcvd(tp, th); + + DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, struct tcphdr *, th, + int32_t, TCP_CC_INSEQ_ACK_RCVD); + sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) @@ -1583,13 +1921,14 @@ findpcb: if (tp->snd_una == tp->snd_max) tp->t_timer[TCPT_REXMT] = 0; else if (tp->t_timer[TCPT_PERSIST] == 0) - tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); sowwakeup(so); /* has to be done with socket lock held */ if ((so->so_snd.sb_cc) || (tp->t_flags & TF_ACKNOW)) { - tp->t_unacksegs = 0; (void) tcp_output(tp); } + + tcp_check_timer_state(tp); tcp_unlock(so, 1, 0); KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0); return; @@ -1619,10 +1958,15 @@ findpcb: tp->rcv_up = tp->rcv_nxt; tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; + if (nstat_collect) { + locked_add_64(&inp->inp_stat->rxpackets, 1); + locked_add_64(&inp->inp_stat->rxbytes, tlen); + } ND6_HINT(tp); /* some progress has been done */ /* * Add data to socket buffer. */ + so_recv_data_stat(so, m, 0); m_adj(m, drop_hdrlen); /* delayed header drop */ if (sbappendstream(&so->so_rcv, m)) sorwakeup(so); @@ -1639,14 +1983,17 @@ findpcb: (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)), th->th_seq, th->th_ack, th->th_win); } - if (DELAY_ACK(tp)) { - tp->t_flags |= TF_DELACK; + if (DELAY_ACK(tp, th)) { + if ((tp->t_flags & TF_DELACK) == 0) { + tp->t_flags |= TF_DELACK; + tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack); + } tp->t_unacksegs++; } else { - tp->t_unacksegs = 0; tp->t_flags |= TF_ACKNOW; tcp_output(tp); } + tcp_check_timer_state(tp); tcp_unlock(so, 1, 0); KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0); return; @@ -1659,9 +2006,8 @@ findpcb: * Receive window is amount of space in rcv queue, * but not less than advertised window. 
*/ -#if 1 - lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); -#endif + lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); + { int win; win = tcp_sbspace(tp); @@ -1692,9 +2038,7 @@ findpcb: register struct sockaddr_in6 *sin6; #endif -#if 1 - lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); -#endif + lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); #if INET6 if (isipv6) { MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6, @@ -1719,9 +2063,7 @@ findpcb: } else #endif { -#if 0 - lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); -#endif + lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, M_NOWAIT); if (sin == NULL) @@ -1734,7 +2076,7 @@ findpcb: laddr = inp->inp_laddr; if (inp->inp_laddr.s_addr == INADDR_ANY) inp->inp_laddr = ip->ip_dst; - if (in_pcbconnect(inp, (struct sockaddr *)sin, proc0)) { + if (in_pcbconnect(inp, (struct sockaddr *)sin, proc0, NULL)) { inp->inp_laddr = laddr; FREE(sin, M_SONAME); goto drop; @@ -1768,9 +2110,15 @@ findpcb: tp->snd_wnd = tiwin; /* initial send-window */ tp->t_flags |= TF_ACKNOW; tp->t_unacksegs = 0; + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED); tp->t_state = TCPS_SYN_RECEIVED; - tp->t_timer[TCPT_KEEP] = tp->t_keepinit ? tp->t_keepinit : tcp_keepinit; + tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, + tp->t_keepinit ? tp->t_keepinit : tcp_keepinit); dropsocket = 0; /* committed to socket */ + + /* reset the incomp processing flag */ + so->so_flags &= ~(SOF_INCOMP_INPROGRESS); tcpstat.tcps_accepts++; if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE | TH_CWR)) { /* ECN-setup SYN */ @@ -1860,13 +2208,15 @@ findpcb: * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ - if (DELAY_ACK(tp) && tlen != 0) { - tp->t_flags |= TF_DELACK; + if (DELAY_ACK(tp, th) && tlen != 0) { + if ((tp->t_flags & TF_DELACK) == 0) { + tp->t_flags |= TF_DELACK; + tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack); + } tp->t_unacksegs++; } else { tp->t_flags |= TF_ACKNOW; - tp->t_unacksegs = 0; } /* * Received <SYN,ACK> in SYN_SENT[*] state. @@ -1874,30 +2224,35 @@ findpcb: * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ - tp->t_starttime = 0; + tp->t_starttime = tcp_now; if (tp->t_flags & TF_NEEDFIN) { + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1); tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_ESTABLISHED); tp->t_state = TCPS_ESTABLISHED; - tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp); + tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_KEEPIDLE(tp)); + if (nstat_collect) + nstat_route_connect_success(tp->t_inpcb->inp_route.ro_rt); } - /* soisconnected may lead to socket_unlock in case of upcalls, - * make sure this is done when everything is setup. - */ - soisconnected(so); + isconnected = TRUE; } else { - /* - * Received initial SYN in SYN-SENT[*] state => simul- - * taneous open. If segment contains CC option and there is - * a cached CC, apply TAO test; if it succeeds, connection is - * half-synchronized. 
Otherwise, do 3-way handshake: - * SYN-SENT -> SYN-RECEIVED - * SYN-SENT* -> SYN-RECEIVED* - */ + /* + * Received initial SYN in SYN-SENT[*] state => simul- + * taneous open. If segment contains CC option and there is + * a cached CC, apply TAO test; if it succeeds, connection is + * half-synchronized. Otherwise, do 3-way handshake: + * SYN-SENT -> SYN-RECEIVED + * SYN-SENT* -> SYN-RECEIVED* + */ tp->t_flags |= TF_ACKNOW; tp->t_timer[TCPT_REXMT] = 0; + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED); tp->t_state = TCPS_SYN_RECEIVED; } @@ -2044,6 +2399,8 @@ trimthenstep6: so->so_error = ECONNRESET; close: postevent(so, 0, EV_RESET); + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_CLOSED); tp->t_state = TCPS_CLOSED; tcpstat.tcps_drops++; tp = tcp_close(tp); @@ -2061,10 +2418,6 @@ trimthenstep6: goto drop; } -#if 0 - lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); -#endif - /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. @@ -2090,6 +2443,12 @@ trimthenstep6: tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += tlen; tcpstat.tcps_pawsdrop++; + if (nstat_collect) { + nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1, tlen, NSTAT_RX_FLAG_DUPLICATE); + locked_add_64(&inp->inp_stat->rxpackets, 1); + locked_add_64(&inp->inp_stat->rxbytes, tlen); + tp->t_stat.rxduplicatebytes += tlen; + } if (tlen) goto dropafterack; goto drop; @@ -2136,7 +2495,6 @@ trimthenstep6: * But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; - tp->t_unacksegs = 0; todrop = tlen; tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += todrop; @@ -2144,6 +2502,12 @@ trimthenstep6: tcpstat.tcps_rcvpartduppack++; tcpstat.tcps_rcvpartdupbyte += todrop; } + if (nstat_collect) { + nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1, todrop, NSTAT_RX_FLAG_DUPLICATE); + locked_add_64(&inp->inp_stat->rxpackets, 1); + locked_add_64(&inp->inp_stat->rxbytes, todrop); + tp->t_stat.rxduplicatebytes += todrop; + } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; @@ -2199,7 +2563,6 @@ trimthenstep6: */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; - tp->t_unacksegs = 0; tcpstat.tcps_rcvwinprobe++; } else goto dropafterack; @@ -2281,19 +2644,27 @@ trimthenstep6: (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->snd_scale = tp->requested_s_scale; tp->rcv_scale = tp->request_r_scale; + tp->snd_wnd = th->th_win << tp->snd_scale; + tiwin = tp->snd_wnd; } /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ - tp->t_starttime = 0; + tp->t_starttime = tcp_now; if (tp->t_flags & TF_NEEDFIN) { + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1); tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; } else { + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_ESTABLISHED); tp->t_state = TCPS_ESTABLISHED; - tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp); + tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_KEEPIDLE(tp)); + if (nstat_collect) + nstat_route_connect_success(tp->t_inpcb->inp_route.ro_rt); } /* * If segment contains data or ACK, will call tcp_reass() @@ -2306,10 +2677,7 @@ trimthenstep6: /* FALLTHROUGH */ - /* soisconnected may lead to socket_unlock in case of upcalls, - * make sure this is done when everything is setup. 
- */ - soisconnected(so); + isconnected = TRUE; /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range @@ -2364,8 +2732,7 @@ trimthenstep6: th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || - ((tcp_do_newreno || tp->sack_enable) && - IN_FASTRECOVERY(tp))) { + IN_FASTRECOVERY(tp)) { if (tp->sack_enable && IN_FASTRECOVERY(tp)) { int awnd; @@ -2384,12 +2751,15 @@ trimthenstep6: } } else tp->snd_cwnd += tp->t_maxseg; - tp->t_unacksegs = 0; + + DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, struct tcphdr *, th, + int32_t, TCP_CC_IN_FASTRECOVERY); + (void) tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; - u_int win; /* * If we're doing sack, check to @@ -2403,18 +2773,21 @@ trimthenstep6: tp->t_dupacks = 0; break; } - } else if (tcp_do_newreno) { + } else { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } } - win = min(tp->snd_wnd, tp->snd_cwnd) / - 2 / tp->t_maxseg; - if (win < 2) - win = 2; - tp->snd_ssthresh = win * tp->t_maxseg; + + /* + * If the current tcp cc module has + * defined a hook for tasks to run + * before entering FR, call it + */ + if (CC_ALGO(tp)->pre_fr != NULL) + CC_ALGO(tp)->pre_fr(tp, th); ENTER_FASTRECOVERY(tp); tp->snd_recover = tp->snd_max; tp->t_timer[TCPT_REXMT] = 0; @@ -2424,18 +2797,24 @@ trimthenstep6: tcpstat.tcps_sack_recovery_episode++; tp->sack_newdata = tp->snd_nxt; tp->snd_cwnd = tp->t_maxseg; - tp->t_unacksegs = 0; + + DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, struct tcphdr *, th, + int32_t, TCP_CC_ENTER_FASTRECOVERY); + (void) tcp_output(tp); goto drop; } tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; - tp->t_unacksegs = 0; (void) tcp_output(tp); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * tp->t_dupacks; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; + DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, struct tcphdr *, th, + int32_t, TCP_CC_ENTER_FASTRECOVERY); goto drop; } } else @@ -2446,69 +2825,33 @@ trimthenstep6: * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ - if (!IN_FASTRECOVERY(tp)) { + if (IN_FASTRECOVERY(tp)) { + if (SEQ_LT(th->th_ack, tp->snd_recover)) { + if (tp->sack_enable) + tcp_sack_partialack(tp, th); + else + tcp_newreno_partial_ack(tp, th); + + DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, struct tcphdr *, th, + int32_t, TCP_CC_PARTIAL_ACK); + } else { + EXIT_FASTRECOVERY(tp); + if (CC_ALGO(tp)->post_fr != NULL) + CC_ALGO(tp)->post_fr(tp, th); + tp->t_dupacks = 0; + + DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, struct tcphdr *, th, + int32_t, TCP_CC_EXIT_FASTRECOVERY); + } + } else { /* - * We were not in fast recovery. Reset the duplicate ack + * We were not in fast recovery. Reset the duplicate ack * counter. */ tp->t_dupacks = 0; } - /* - * If the congestion window was inflated to account - * for the other side's cached packets, retract it. - */ - else { - if (tcp_do_newreno || tp->sack_enable) { - if (SEQ_LT(th->th_ack, tp->snd_recover)) { - if (tp->sack_enable) - tcp_sack_partialack(tp, th); - else - tcp_newreno_partial_ack(tp, th); - } - else { - if (tcp_do_newreno) { - int32_t ss = tp->snd_max - th->th_ack; - - /* - * Complete ack. Inflate the congestion window to - * ssthresh and exit fast recovery. - * - * Window inflation should have left us with approx. - * snd_ssthresh outstanding data. 
But in case we - * would be inclined to send a burst, better to do - * it via the slow start mechanism. - */ - if (ss < tp->snd_ssthresh) - tp->snd_cwnd = ss + tp->t_maxseg; - else - tp->snd_cwnd = tp->snd_ssthresh; - } - else { - /* - * Clamp the congestion window to the crossover point - * and exit fast recovery. - */ - if (tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd = tp->snd_ssthresh; - } - - EXIT_FASTRECOVERY(tp); - tp->t_dupacks = 0; - tp->t_bytes_acked = 0; - } - } - else { - /* - * Clamp the congestion window to the crossover point - * and exit fast recovery in non-newreno and non-SACK case. - */ - if (tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd = tp->snd_ssthresh; - EXIT_FASTRECOVERY(tp); - tp->t_dupacks = 0; - tp->t_bytes_acked = 0; - } - } /* @@ -2545,7 +2888,8 @@ process_ACK: * original cwnd and ssthresh, and proceed to transmit where * we left off. */ - if (tp->t_rxtshift == 1 && tcp_now < tp->t_badrxtwin) { + if (tp->t_rxtshift == 1 && + TSTMP_LT(tcp_now, tp->t_badrxtwin)) { ++tcpstat.tcps_sndrexmitbad; tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; @@ -2554,6 +2898,12 @@ process_ACK: ENTER_FASTRECOVERY(tp); tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; /* XXX probably not required */ + tp->t_rxtshift = 0; + tp->rxt_start = 0; + + DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, struct tcphdr *, th, + int32_t, TCP_CC_BAD_REXMT_RECOVERY); } /* @@ -2571,14 +2921,11 @@ process_ACK: * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ - if (((to.to_flags & TOF_TS) != 0) && (to.to_tsecr != 0)) { - if (!tp->t_rttlow || tp->t_rttlow > tcp_now - to.to_tsecr) - tp->t_rttlow = tcp_now - to.to_tsecr; + if (((to.to_flags & TOF_TS) != 0) && (to.to_tsecr != 0) && + TSTMP_GEQ(tcp_now, to.to_tsecr)) { tcp_xmit_timer(tp, tcp_now - to.to_tsecr); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { - if (!tp->t_rttlow || tp->t_rttlow > tcp_now - tp->t_rtttime) - tp->t_rttlow = tcp_now - tp->t_rtttime; - tcp_xmit_timer(tp, tp->t_rtttime); + tcp_xmit_timer(tp, tcp_now - tp->t_rtttime); } /* @@ -2591,7 +2938,7 @@ process_ACK: tp->t_timer[TCPT_REXMT] = 0; needoutput = 1; } else if (tp->t_timer[TCPT_PERSIST] == 0) - tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); /* * If no data (only SYN) was ACK'd, @@ -2600,78 +2947,35 @@ process_ACK: if (acked == 0) goto step6; - /* - * When new data is acked, open the congestion window. - */ if ((thflags & TH_ECE) != 0 && (tp->ecn_flags & TE_SETUPSENT) != 0) { /* * Reduce the congestion window if we haven't done so. */ - if (!(tp->sack_enable && IN_FASTRECOVERY(tp)) && - !(tcp_do_newreno && SEQ_LEQ(th->th_ack, tp->snd_recover))) { - tcp_reduce_congestion_window(tp); + if (!tp->sack_enable && !IN_FASTRECOVERY(tp) && + SEQ_GEQ(th->th_ack, tp->snd_recover)) { + tcp_reduce_congestion_window(tp, th); + DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, struct tcphdr *, th, + int32_t, TCP_CC_ECN_RCVD); } - } else if ((!tcp_do_newreno && !tp->sack_enable) || - !IN_FASTRECOVERY(tp)) { - /* - * RFC 3465 - Appropriate Byte Counting. - * - * If the window is currently less than ssthresh, - * open the window by the number of bytes ACKed by - * the last ACK, however clamp the window increase - * to an upper limit "L". - * - * In congestion avoidance phase, open the window by - * one segment each time "bytes_acked" grows to be - * greater than or equal to the congestion window. 
- */ - - register u_int cw = tp->snd_cwnd; - register u_int incr = tp->t_maxseg; - - if (tcp_do_rfc3465) { - - if (cw >= tp->snd_ssthresh) { - tp->t_bytes_acked += acked; - if (tp->t_bytes_acked >= cw) { - /* Time to increase the window. */ - tp->t_bytes_acked -= cw; - } else { - /* No need to increase yet. */ - incr = 0; - } - } else { - /* - * If the user explicitly enables RFC3465 - * use 2*SMSS for the "L" param. Otherwise - * use the more conservative 1*SMSS. - * - * (See RFC 3465 2.3 Choosing the Limit) - */ - u_int abc_lim; - - abc_lim = (tcp_do_rfc3465_lim2 && - tp->snd_nxt == tp->snd_max) ? incr * 2 : incr; - - incr = lmin(acked, abc_lim); - } - } - else { - /* - * If the window gives us less than ssthresh packets - * in flight, open exponentially (segsz per packet). - * Otherwise open linearly: segsz per window - * (segsz^2 / cwnd per packet). - */ - - if (cw >= tp->snd_ssthresh) { - incr = max((incr * incr / cw), 1); - } - } - + } - tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale); + /* + * When new data is acked, open the congestion window. + * The specifics of how this is achieved are up to the + * congestion control algorithm in use for this connection. + * + * The calculations in this function assume that snd_una is + * not updated yet. + */ + if (!IN_FASTRECOVERY(tp)) { + if (CC_ALGO(tp)->ack_rcvd != NULL) + CC_ALGO(tp)->ack_rcvd(tp, th); + + DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, struct tcphdr *, th, + int32_t, TCP_CC_ACK_RCVD); } if (acked > so->so_snd.sb_cc) { tp->snd_wnd -= so->so_snd.sb_cc; @@ -2683,15 +2987,15 @@ process_ACK: ourfinisacked = 0; } /* detect una wraparound */ - if ((tcp_do_newreno || tp->sack_enable) && - !IN_FASTRECOVERY(tp) && + if ( !IN_FASTRECOVERY(tp) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; - if ((tcp_do_newreno || tp->sack_enable) && - IN_FASTRECOVERY(tp) && + + if (IN_FASTRECOVERY(tp) && SEQ_GEQ(th->th_ack, tp->snd_recover)) EXIT_FASTRECOVERY(tp); + tp->snd_una = th->th_ack; if (tp->sack_enable) { if (SEQ_GT(tp->snd_una, tp->snd_recover)) @@ -2723,10 +3027,12 @@ process_ACK: * we'll hang forever. */ if (so->so_state & SS_CANTRCVMORE) { - tp->t_timer[TCPT_2MSL] = tcp_maxidle; - add_to_time_wait(tp); - soisdisconnected(so); + add_to_time_wait(tp, tcp_maxidle); + isconnected = FALSE; + isdisconnected = TRUE; } + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_2); tp->t_state = TCPS_FIN_WAIT_2; /* fall through and make sure we also recognize data ACKed with the FIN */ } @@ -2741,17 +3047,18 @@ process_ACK: */ case TCPS_CLOSING: if (ourfinisacked) { + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_TIME_WAIT); tp->t_state = TCPS_TIME_WAIT; tcp_canceltimers(tp); /* Shorten TIME_WAIT [RFC-1644, p.28] */ if (tp->cc_recv != 0 && - tp->t_starttime < (u_int32_t)tcp_msl) - tp->t_timer[TCPT_2MSL] = - tp->t_rxtcur * TCPTV_TWTRUNC; + ((int)(tcp_now - tp->t_starttime)) < tcp_msl) + add_to_time_wait(tp, tp->t_rxtcur * TCPTV_TWTRUNC); else - tp->t_timer[TCPT_2MSL] = 2 * tcp_msl; - add_to_time_wait(tp); - soisdisconnected(so); + add_to_time_wait(tp, 2 * tcp_msl); + isconnected = FALSE; + isdisconnected = TRUE; } tp->t_flags |= TF_ACKNOW; break; @@ -2775,8 +3082,7 @@ process_ACK: * it and restart the finack timer. 
*/ case TCPS_TIME_WAIT: - tp->t_timer[TCPT_2MSL] = 2 * tcp_msl; - add_to_time_wait(tp); + add_to_time_wait(tp, 2 * tcp_msl); goto dropafterack; } } @@ -2856,7 +3162,7 @@ step6: ) tcp_pulloutofband(so, th, m, drop_hdrlen); /* hdr drop is delayed */ - } else + } else { /* * If no out of band data is expected, * pull receive urgent pointer along @@ -2864,8 +3170,27 @@ step6: */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; -dodata: /* XXX */ + } +dodata: + /* Set socket's connect or disconnect state correctly before doing data. + * The following might unlock the socket if there is an upcall or a socket + * filter. + */ + if (isconnected) { + soisconnected(so); + } else if (isdisconnected) { + soisdisconnected(so); + } + + /* Let's check the state of pcb just to make sure that it did not get closed + * when we unlocked above + */ + if (inp->inp_state == INPCB_STATE_DEAD) { + /* Just drop the packet that we are processing and return */ + goto drop; + } + /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. @@ -2894,25 +3219,31 @@ dodata: /* XXX */ if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq) && TCPS_HAVEESTABLISHED(tp->t_state)) { - if (DELAY_ACK(tp) && ((tp->t_flags & TF_ACKNOW) == 0)) { - tp->t_flags |= TF_DELACK; + if (DELAY_ACK(tp, th) && ((tp->t_flags & TF_ACKNOW) == 0)) { + if ((tp->t_flags & TF_DELACK) == 0) { + tp->t_flags |= TF_DELACK; + tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack); + } tp->t_unacksegs++; } else { - tp->t_unacksegs = 0; tp->t_flags |= TF_ACKNOW; } tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; tcpstat.tcps_rcvpack++; tcpstat.tcps_rcvbyte += tlen; + if (nstat_collect) { + locked_add_64(&inp->inp_stat->rxpackets, 1); + locked_add_64(&inp->inp_stat->rxbytes, tlen); + } ND6_HINT(tp); + so_recv_data_stat(so, m, drop_hdrlen); if (sbappendstream(&so->so_rcv, m)) sorwakeup(so); } else { thflags = tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; - tp->t_unacksegs = 0; } if (tlen > 0 && tp->sack_enable) @@ -2965,13 +3296,15 @@ dodata: /* XXX */ * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ - if (DELAY_ACK(tp) && (tp->t_flags & TF_NEEDSYN)) { - tp->t_flags |= TF_DELACK; + if (DELAY_ACK(tp, th) && (tp->t_flags & TF_NEEDSYN)) { + if ((tp->t_flags & TF_DELACK) == 0) { + tp->t_flags |= TF_DELACK; + tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack); + } tp->t_unacksegs++; } else { tp->t_flags |= TF_ACKNOW; - tp->t_unacksegs = 0; } tp->rcv_nxt++; } @@ -2982,8 +3315,10 @@ dodata: /* XXX */ * enter the CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: - tp->t_starttime = 0; + tp->t_starttime = tcp_now; case TCPS_ESTABLISHED: + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_CLOSE_WAIT); tp->t_state = TCPS_CLOSE_WAIT; break; @@ -2992,6 +3327,8 @@ dodata: /* XXX */ * enter the CLOSING state. */ case TCPS_FIN_WAIT_1: + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_CLOSING); tp->t_state = TCPS_CLOSING; break; @@ -3001,21 +3338,20 @@ dodata: /* XXX */ * standard timers.
*/ case TCPS_FIN_WAIT_2: + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_TIME_WAIT); tp->t_state = TCPS_TIME_WAIT; tcp_canceltimers(tp); /* Shorten TIME_WAIT [RFC-1644, p.28] */ if (tp->cc_recv != 0 && - tp->t_starttime < (u_int32_t)tcp_msl) { - tp->t_timer[TCPT_2MSL] = - tp->t_rxtcur * TCPTV_TWTRUNC; + ((int)(tcp_now - tp->t_starttime)) < tcp_msl) { + add_to_time_wait(tp, tp->t_rxtcur * TCPTV_TWTRUNC); /* For transaction client, force ACK now. */ tp->t_flags |= TF_ACKNOW; tp->t_unacksegs = 0; } else - tp->t_timer[TCPT_2MSL] = 2 * tcp_msl; - - add_to_time_wait(tp); + add_to_time_wait(tp, 2 * tcp_msl); soisdisconnected(so); break; @@ -3023,8 +3359,7 @@ dodata: /* XXX */ * In TIME_WAIT state restart the 2 MSL time_wait timer. */ case TCPS_TIME_WAIT: - tp->t_timer[TCPT_2MSL] = 2 * tcp_msl; - add_to_time_wait(tp); + add_to_time_wait(tp, 2 * tcp_msl); break; } } @@ -3038,9 +3373,12 @@ dodata: /* XXX */ * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) { - tp->t_unacksegs = 0; (void) tcp_output(tp); } + + tcp_check_timer_state(tp); + + tcp_unlock(so, 1, 0); KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0); return; @@ -3074,8 +3412,9 @@ dropafterack: #endif m_freem(m); tp->t_flags |= TF_ACKNOW; - tp->t_unacksegs = 0; (void) tcp_output(tp); + + /* Don't need to check timer state as we should have done it during tcp_output */ tcp_unlock(so, 1, 0); KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0); return; @@ -3119,22 +3458,22 @@ dropwithreset: if (thflags & TH_ACK) /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, - TH_RST, ifscope); + TH_RST, ifscope, nocell); else { if (thflags & TH_SYN) tlen++; /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, - (tcp_seq)0, TH_RST|TH_ACK, ifscope); + (tcp_seq)0, TH_RST|TH_ACK, ifscope, nocell); } /* destroy temporarily created socket */ if (dropsocket) { (void) soabort(so); tcp_unlock(so, 1, 0); } - else - if ((inp != NULL) && (nosock == 0)) - tcp_unlock(so, 1, 0); + else if ((inp != NULL) && (nosock == 0)) { + tcp_unlock(so, 1, 0); + } KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0); return; dropnosock: @@ -3154,9 +3493,9 @@ drop: (void) soabort(so); tcp_unlock(so, 1, 0); } - else - if (nosock == 0) - tcp_unlock(so, 1, 0); + else if (nosock == 0) { + tcp_unlock(so, 1, 0); + } KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END,0,0,0,0,0); return; } @@ -3301,6 +3640,36 @@ tcp_pulloutofband(so, th, m, off) panic("tcp_pulloutofband"); } +uint32_t +get_base_rtt(struct tcpcb *tp) +{ + uint32_t base_rtt = 0, i; + for (i = 0; i < N_RTT_BASE; ++i) { + if (tp->rtt_hist[i] != 0 && + (base_rtt == 0 || tp->rtt_hist[i] < base_rtt)) + base_rtt = tp->rtt_hist[i]; + } + return base_rtt; +} + +/* Each value of RTT base represents the minimum RTT seen in a minute. + * We keep up to N_RTT_BASE minutes worth of history. + */ +void +update_base_rtt(struct tcpcb *tp, uint32_t rtt) +{ + if (++tp->rtt_count >= rtt_samples_per_slot) { + int i=0; + for (i = (N_RTT_BASE-1); i > 0; --i) { + tp->rtt_hist[i] = tp->rtt_hist[i-1]; + } + tp->rtt_hist[0] = rtt; + tp->rtt_count = 0; + } else { + tp->rtt_hist[0] = min(tp->rtt_hist[0], rtt); + } +} + /* * Collect new round-trip time estimate * and update averages and current timeout.
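[Editor's note] get_base_rtt() and update_base_rtt() above maintain a sliding history of per-interval RTT minima. The user-space sketch below mirrors that logic; the history depth and samples-per-slot are assumed illustrative values (the kernel takes them from N_RTT_BASE and rtt_samples_per_slot), and the sketch additionally treats an empty slot specially so the first sample is recorded:

#include <stdint.h>
#include <stdio.h>

#define N_RTT_BASE 5		/* assumed history depth */
#define SAMPLES_PER_SLOT 20	/* stand-in for rtt_samples_per_slot */

static uint32_t rtt_hist[N_RTT_BASE];
static int rtt_count;

/* Slot 0 tracks the minimum RTT of the current interval; once the interval
 * fills up, the history shifts by one slot, as in update_base_rtt(). */
static void
base_rtt_update(uint32_t rtt)
{
	if (++rtt_count >= SAMPLES_PER_SLOT) {
		for (int i = N_RTT_BASE - 1; i > 0; --i)
			rtt_hist[i] = rtt_hist[i - 1];
		rtt_hist[0] = rtt;
		rtt_count = 0;
	} else if (rtt_hist[0] == 0 || rtt < rtt_hist[0]) {
		rtt_hist[0] = rtt;
	}
}

/* Smallest non-zero entry in the history, as in get_base_rtt(). */
static uint32_t
base_rtt(void)
{
	uint32_t best = 0;
	for (int i = 0; i < N_RTT_BASE; ++i)
		if (rtt_hist[i] != 0 && (best == 0 || rtt_hist[i] < best))
			best = rtt_hist[i];
	return best;
}

int
main(void)
{
	/* Feed a few intervals of hypothetical RTT samples (ms). */
	for (int i = 0; i < 3 * SAMPLES_PER_SLOT; ++i)
		base_rtt_update(40 + (i % 7));	/* minimum is 40 */
	printf("base rtt = %u ms\n", base_rtt());
	return 0;
}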
@@ -3314,15 +3683,26 @@ tcp_xmit_timer(tp, rtt) tcpstat.tcps_rttupdated++; tp->t_rttupdated++; + + if (rtt > 0) { + tp->t_rttcur = rtt; + update_base_rtt(tp, rtt); + } + if (tp->t_srtt != 0) { /* * srtt is stored as fixed point with 5 bits after the - * binary point (i.e., scaled by 8). The following magic + * binary point (i.e., scaled by 32). The following magic * is equivalent to the smoothing algorithm in rfc793 with * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed - * point). Adjust rtt to origin 0. + * point). + * + * FreeBSD adjusts rtt to origin 0 by subtracting 1 from the provided + * rtt value. This was required because of the way t_rtttime was + * initialized to 1 before. Since we changed t_rtttime to be based on + * tcp_now, this extra adjustment is not needed. */ - delta = ((rtt - 1) << TCP_DELTA_SHIFT) + delta = (rtt << TCP_DELTA_SHIFT) - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); if ((tp->t_srtt += delta) <= 0) @@ -3355,8 +3735,10 @@ tcp_xmit_timer(tp, rtt) tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } + nstat_route_rtt(tp->t_inpcb->inp_route.ro_rt, tp->t_srtt, tp->t_rttvar); tp->t_rtttime = 0; tp->t_rxtshift = 0; + tp->rxt_start = 0; /* * the retransmit should happen at rtt + 4 * rttvar. @@ -3370,7 +3752,8 @@ tcp_xmit_timer(tp, rtt) * the minimum feasible timer (which is 2 ticks). */ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), - max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); + max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX, + TCP_ADD_REXMTSLOP(tp)); /* * We received an ack for a packet that wasn't retransmitted; @@ -3471,12 +3854,14 @@ tcp_mss(tp, offer, input_ifscope) #if INET6 if (isipv6) { - rt = tcp_rtlookup6(inp); + rt = tcp_rtlookup6(inp, input_ifscope); if (rt != NULL && (IN6_IS_ADDR_LOOPBACK(&inp->in6p_faddr) || IN6_IS_ADDR_LINKLOCAL(&inp->in6p_faddr) || - rt->rt_gateway->sa_family == AF_LINK)) - isnetlocal = TRUE; + rt->rt_gateway->sa_family == AF_LINK || + in6_localaddr(&inp->in6p_faddr))) { + tp->t_flags |= TF_LOCAL; + } } else #endif /* INET6 */ @@ -3484,9 +3869,13 @@ tcp_mss(tp, offer, input_ifscope) rt = tcp_rtlookup(inp, input_ifscope); if (rt != NULL && (rt->rt_gateway->sa_family == AF_LINK || - rt->rt_ifp->if_flags & IFF_LOOPBACK) - isnetlocal = TRUE; + rt->rt_ifp->if_flags & IFF_LOOPBACK || + in_localaddr(inp->inp_faddr))) { + tp->t_flags |= TF_LOCAL; + } } + isnetlocal = (tp->t_flags & TF_LOCAL); + if (rt == NULL) { tp->t_maxopd = tp->t_maxseg = #if INET6 @@ -3554,7 +3943,7 @@ tcp_mss(tp, offer, input_ifscope) if (rt->rt_rmx.rmx_locks & RTV_RTT) tp->t_rttmin = rtt / (RTM_RTTUNIT / TCP_RETRANSHZ); else - tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCP_RETRANSHZ; + tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN; tp->t_srtt = rtt / (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE)); tcpstat.tcps_usedrtt++; if (rt->rt_rmx.rmx_rttvar) { @@ -3568,10 +3957,11 @@ tcp_mss(tp, offer, input_ifscope) } TCPT_RANGESET(tp->t_rxtcur, ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, - tp->t_rttmin, TCPTV_REXMTMAX); + tp->t_rttmin, TCPTV_REXMTMAX, + TCP_ADD_REXMTSLOP(tp)); } else - tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCP_RETRANSHZ; + tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN; #if INET6 mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt)); @@ -3651,14 +4041,7 @@ tcp_mss(tp, offer, input_ifscope) (void)sbreserve(&so->so_rcv, bufsize); } - /* - * Set the slow-start flight size depending on whether this - * is a local network or not.
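[Editor's note] tcp_xmit_timer() above keeps srtt and rttvar in fixed point. Assuming the classic BSD shift constants (TCP_RTT_SHIFT = 5, TCP_RTTVAR_SHIFT = 4, TCP_DELTA_SHIFT = 2; stated here as assumptions, not quoted from this patch), a stand-alone sketch of the smoothing step, including the dropped "rtt - 1" origin adjustment noted in the comment:

#include <stdio.h>

/* Classic BSD scaling: srtt keeps 5 fraction bits (x32), rttvar keeps 4
 * (x16); deltas carry 2 extra fraction bits. */
#define TCP_RTT_SHIFT     5
#define TCP_RTTVAR_SHIFT  4
#define TCP_DELTA_SHIFT   2

static int srtt, rttvar;	/* scaled, as in struct tcpcb */

/* One smoothing step, following the updated tcp_xmit_timer(): the patch
 * uses rtt directly instead of the historical rtt - 1. */
static void
xmit_timer(int rtt)
{
	if (srtt != 0) {
		int delta = (rtt << TCP_DELTA_SHIFT)
		    - (srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
		if ((srtt += delta) <= 0)
			srtt = 1;
		if (delta < 0)
			delta = -delta;
		delta -= (rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT));
		if ((rttvar += delta) <= 0)
			rttvar = 1;
	} else {
		/* First measurement seeds both estimators. */
		srtt = rtt << TCP_RTT_SHIFT;
		rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
	}
}

int
main(void)
{
	int samples[] = { 100, 100, 140, 90, 100 };	/* hypothetical ms */
	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		xmit_timer(samples[i]);
		printf("rtt=%d srtt=%d.%02d rttvar=%d.%02d\n", samples[i],
		    srtt >> TCP_RTT_SHIFT, (srtt & 31) * 100 / 32,
		    rttvar >> TCP_RTTVAR_SHIFT, (rttvar & 15) * 100 / 16);
	}
	return 0;
}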
- */ - if (isnetlocal) - tp->snd_cwnd = mss * ss_fltsz_local; - else - tp->snd_cwnd = mss * ss_fltsz; + set_tcp_stream_priority(so); if (rt->rt_rmx.rmx_ssthresh) { /* @@ -3673,6 +4056,17 @@ tcp_mss(tp, offer, input_ifscope) tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; } + + /* + * Set the slow-start flight size depending on whether this + * is a local network or not. + */ + if (CC_ALGO(tp)->cwnd_init != NULL) + CC_ALGO(tp)->cwnd_init(tp); + + DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb, struct tcpcb *, tp, + struct tcphdr *, NULL, int32_t, TCP_CC_CWND_INIT); + /* Route locked during lookup above */ RT_UNLOCK(rt); } @@ -3701,7 +4095,7 @@ tcp_mssopt(tp) #if INET6 if (isipv6) - rt = tcp_rtlookup6(tp->t_inpcb); + rt = tcp_rtlookup6(tp->t_inpcb, IFSCOPE_NONE); else #endif /* INET6 */ rt = tcp_rtlookup(tp->t_inpcb, IFSCOPE_NONE); @@ -3736,7 +4130,7 @@ tcp_mssopt(tp) /* * On a partial ack arrives, force the retransmission of the * next unacknowledged segment. Do not clear tp->t_dupacks. - * By setting snd_nxt to ti_ack, this forces retransmission timer to + * By setting snd_nxt to th_ack, this forces retransmission timer to * be started again. */ static void @@ -3756,7 +4150,6 @@ tcp_newreno_partial_ack(tp, th) */ tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una); tp->t_flags |= TF_ACKNOW; - tp->t_unacksegs = 0; (void) tcp_output(tp); tp->snd_cwnd = ocwnd; if (SEQ_GT(onxt, tp->snd_nxt)) @@ -3828,11 +4221,13 @@ tcp_dropdropablreq(struct socket *head) * and being dropped by another input thread. * If we can't get a hold on this mutex, then grab the next socket in line. */ - if (lck_mtx_try_lock(inp->inpcb_mtx)) { + if (lck_mtx_try_lock(&inp->inpcb_mtx)) { so->so_usecount++; - if ((so->so_usecount == 2) && so->so_state & SS_INCOMP) + if ((so->so_usecount == 2) && + (so->so_state & SS_INCOMP) != 0 && + (so->so_flags & SOF_INCOMP_INPROGRESS) == 0) break; - else {/* don't use if beeing accepted or used in any other way */ + else {/* don't use if being accepted or used in any other way */ in_pcb_checkstate(inp, WNT_RELEASE, 1); tcp_unlock(so, 1, 0); } @@ -3851,44 +4246,120 @@ tcp_dropdropablreq(struct socket *head) if (!so) return 0; - TAILQ_REMOVE(&head->so_incomp, so, so_list); - tcp_unlock(head, 0, 0); - /* Makes sure socket is still in the right state to be discarded */ if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { tcp_unlock(so, 1, 0); - tcp_lock(head, 0, 0); return 0; } if (so->so_usecount != 2 || !(so->so_state & SS_INCOMP)) { - /* do not discard: that socket is being accepted */ tcp_unlock(so, 1, 0); - tcp_lock(head, 0, 0); return 0; } - so->so_head = NULL; + TAILQ_REMOVE(&head->so_incomp, so, so_list); + tcp_unlock(head, 0, 0); - /* - * We do not want to lose track of the PCB right away in case we receive - * more segments from the peer - */ + lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED); tp = sototcpcb(so); so->so_flags |= SOF_OVERFLOW; - tp->t_state = TCPS_TIME_WAIT; - (void) tcp_close(tp); + so->so_head = NULL; + + tcp_close(tp); tp->t_unacksegs = 0; + + if (inp->inp_wantcnt > 0 && inp->inp_wantcnt != WNT_STOPUSING) { + /* Someone has a wantcnt on this pcb. Since WNT_ACQUIRE + * doesn't require a lock, it could have happened while + * we are holding the lock. This pcb will have to + * be garbage collected later. + * Release the reference held for so_incomp queue + */ + so->so_usecount--; + + tcp_unlock(so, 1, 0); + } else { + /* Unlock this socket and leave the reference on.
We need to + * acquire the pcbinfo lock in order to fully dispose of it + */ + tcp_unlock(so, 0, 0); + + lck_rw_lock_exclusive(tcbinfo.mtx); + + tcp_lock(so, 0, 0); + + /* Release the reference held for so_incomp queue */ + so->so_usecount--; + + if (so->so_usecount != 1 || + (inp->inp_wantcnt > 0 && inp->inp_wantcnt != WNT_STOPUSING)) { + /* There is an extra wantcount or usecount that must + * have been added when the socket was unlocked. This + * socket will have to be garbage collected later + */ + tcp_unlock(so, 1, 0); + } else { + + /* Drop the reference held for this function */ + so->so_usecount--; + + in_pcbdispose(inp); + } + lck_rw_done(tcbinfo.mtx); + } tcpstat.tcps_drops++; - tcp_canceltimers(tp); - add_to_time_wait(tp); - - tcp_unlock(so, 1, 0); + tcp_lock(head, 0, 0); head->so_incqlen--; head->so_qlen--; - return 1; + return(1); +} + +/* Set background congestion control on a socket */ +void +tcp_set_background_cc(struct socket *so) +{ + tcp_set_new_cc(so, TCP_CC_ALGO_BACKGROUND_INDEX); +} + +/* Set foreground congestion control on a socket */ +void +tcp_set_foreground_cc(struct socket *so) +{ + tcp_set_new_cc(so, TCP_CC_ALGO_NEWRENO_INDEX); +} + +static void +tcp_set_new_cc(struct socket *so, uint16_t cc_index) +{ + struct inpcb *inp = sotoinpcb(so); + struct tcpcb *tp = intotcpcb(inp); + uint16_t old_cc_index = 0; + if (tp->tcp_cc_index != cc_index) { + + old_cc_index = tp->tcp_cc_index; + + if (CC_ALGO(tp)->cleanup != NULL) + CC_ALGO(tp)->cleanup(tp); + tp->tcp_cc_index = cc_index; + + /* Decide if the connection is just starting or if + * we have sent some packets on it. + */ + if (tp->snd_nxt > tp->iss) { + /* Already sent some packets */ + if (CC_ALGO(tp)->switch_to != NULL) + CC_ALGO(tp)->switch_to(tp, old_cc_index); + } else { + if (CC_ALGO(tp)->init != NULL) + CC_ALGO(tp)->init(tp); + } + DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, struct tcphdr *, NULL, + int32_t, TCP_CC_CHANGE_ALGO); + } } static int @@ -3908,7 +4379,7 @@ tcp_getstat SYSCTL_HANDLER_ARGS } -SYSCTL_PROC(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RD, 0, 0, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, tcp_getstat, "S,tcpstat", "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); static int @@ -3936,5 +4407,5 @@ sysctl_rexmtthresh SYSCTL_HANDLER_ARGS return (0); } -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmt_thresh, CTLTYPE_INT|CTLFLAG_RW, +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmt_thresh, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED, &tcprexmtthresh, 0, &sysctl_rexmtthresh, "I", "Duplicate ACK Threshold for Fast Retransmit"); diff --git a/bsd/netinet/tcp_ledbat.c b/bsd/netinet/tcp_ledbat.c new file mode 100644 index 000000000..5baf28bea --- /dev/null +++ b/bsd/netinet/tcp_ledbat.c @@ -0,0 +1,434 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement.
+ * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/protosw.h> +#include <sys/mcache.h> +#include <sys/sysctl.h> + +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> + +#if INET6 +#include <netinet/ip6.h> +#endif +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> +#include <netinet/tcp_cc.h> + +#include <libkern/OSAtomic.h> + +/* This file implements an alternate TCP congestion control algorithm + * for background transport developed by the LEDBAT working group at the + * IETF and described in draft-ietf-ledbat-congestion-02 + */ + +int tcp_ledbat_init(struct tcpcb *tp); +int tcp_ledbat_cleanup(struct tcpcb *tp); +void tcp_ledbat_cwnd_init(struct tcpcb *tp); +void tcp_ledbat_inseq_ack_rcvd(struct tcpcb *tp, struct tcphdr *th); +void tcp_ledbat_ack_rcvd(struct tcpcb *tp, struct tcphdr *th); +void tcp_ledbat_pre_fr(struct tcpcb *tp, struct tcphdr *th); +void tcp_ledbat_post_fr(struct tcpcb *tp, struct tcphdr *th); +void tcp_ledbat_after_idle(struct tcpcb *tp); +void tcp_ledbat_after_timeout(struct tcpcb *tp); +int tcp_ledbat_delay_ack(struct tcpcb *tp, struct tcphdr *th); +void tcp_ledbat_switch_cc(struct tcpcb *tp, uint16_t old_cc_index); + +struct tcp_cc_algo tcp_cc_ledbat = { + .name = "ledbat", + .init = tcp_ledbat_init, + .cleanup = tcp_ledbat_cleanup, + .cwnd_init = tcp_ledbat_cwnd_init, + .inseq_ack_rcvd = tcp_ledbat_inseq_ack_rcvd, + .ack_rcvd = tcp_ledbat_ack_rcvd, + .pre_fr = tcp_ledbat_pre_fr, + .post_fr = tcp_ledbat_post_fr, + .after_idle = tcp_ledbat_after_idle, + .after_timeout = tcp_ledbat_after_timeout, + .delay_ack = tcp_ledbat_delay_ack, + .switch_to = tcp_ledbat_switch_cc +}; + +extern int tcp_do_rfc3465; +extern int tcp_do_rfc3465_lim2; +extern uint32_t get_base_rtt(struct tcpcb *tp); + +/* Target queuing delay in milliseconds. This includes the processing + * and scheduling delay on both of the end-hosts. A LEDBAT sender tries + * to keep queuing delay below this limit. When the queuing delay + * goes above this limit, a LEDBAT sender will start reducing the + * congestion window. + * + * The LEDBAT draft says that target queue delay MUST be 100 ms for + * inter-operability. + */ +int target_qdelay = 100; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, bg_target_qdelay, CTLFLAG_RW | CTLFLAG_LOCKED, + &target_qdelay , 100, "Target queuing delay"); + +/* Allowed increase and tether are used to place an upper bound on + * congestion window based on the amount of data that is outstanding. + * This will limit the congestion window when the amount of data in + * flight is small because the application is writing to the socket + * intermittently and is preventing the connection from becoming idle.
+ * + * max_allowed_cwnd = allowed_increase + (tether * flight_size) + * cwnd = min(cwnd, max_allowed_cwnd) + * + * 'Allowed_increase' parameter is set to 2. If the flight size is zero, then + * we want the congestion window to be at least 2 packets to reduce the + * delay induced by delayed ack. This helps when the receiver is acking every + * other packet. + * + * 'Tether' is also set to 2. We do not want this to limit the growth of cwnd + * during slow-start. + */ +int allowed_increase = 2; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, bg_allowed_increase, CTLFLAG_RW | CTLFLAG_LOCKED, + &allowed_increase, 1, "Additive constant used to calculate max allowed congestion window"); + +/* Left shift for cwnd to get tether value of 2 */ +int tether_shift = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, bg_tether_shift, CTLFLAG_RW | CTLFLAG_LOCKED, + &tether_shift, 1, "Tether shift for max allowed congestion window"); + +/* Start with an initial window of 2. This will help to get more accurate + * minimum RTT measurement in the beginning. It will help to probe + * the path slowly and will not add to the existing delay if the path is + * already congested. Using 2 packets will reduce the delay induced by delayed-ack. + */ +uint32_t bg_ss_fltsz = 2; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, bg_ss_fltsz, CTLFLAG_RW | CTLFLAG_LOCKED, + &bg_ss_fltsz, 2, "Initial congestion window for background transport"); + +extern int rtt_samples_per_slot; + +static void update_cwnd(struct tcpcb *tp, uint32_t incr) { + uint32_t max_allowed_cwnd = 0, flight_size = 0; + uint32_t qdelay, base_rtt; + int32_t off_target; + + base_rtt = get_base_rtt(tp); + + /* If we do not have a good RTT measurement yet, increment + * congestion window by the default value. + */ + if (base_rtt == 0 || tp->t_rttcur == 0) { + tp->snd_cwnd += incr; + goto check_max; + } + + qdelay = tp->t_rttcur - base_rtt; + off_target = (int32_t)(target_qdelay - qdelay); + + if (off_target >= 0) { + /* Delay decreased or remained the same, we can increase + * the congestion window according to RFC 3465. + * + * Move background slow-start threshold to current + * congestion window so that the next time (after some idle + * period), we can attempt to do slow-start till here if there + * is no increase in rtt + */ + if (tp->bg_ssthresh < tp->snd_cwnd) + tp->bg_ssthresh = tp->snd_cwnd; + tp->snd_cwnd += incr; + + } else { + /* In response to an increase in rtt, reduce the congestion + * window by one-eighth. This will help to yield immediately + * to a competing stream. + */ + uint32_t redwin; + + redwin = tp->snd_cwnd >> 3; + tp->snd_cwnd -= redwin; + if (tp->snd_cwnd < bg_ss_fltsz * tp->t_maxseg) + tp->snd_cwnd = bg_ss_fltsz * tp->t_maxseg; + + /* Lower background slow-start threshold so that the connection + * will go into congestion avoidance phase + */ + if (tp->bg_ssthresh > tp->snd_cwnd) + tp->bg_ssthresh = tp->snd_cwnd; + } +check_max: + /* Calculate the outstanding flight size and restrict the + * congestion window to a factor of flight size. 
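[Editor's note] update_cwnd() above (it concludes just below with the flight-size clamp) condenses to: grow while the measured queuing delay is under target, shed one-eighth of the window otherwise, then clamp to allowed_increase plus tether times the flight size. A user-space sketch of the whole update with illustrative constants mirroring the sysctls; the bg_ssthresh bookkeeping and the minimum-window floor are omitted for brevity:

#include <stdint.h>
#include <stdio.h>

#define TARGET_QDELAY    100	/* ms */
#define ALLOWED_INCREASE 2	/* packets */
#define TETHER_SHIFT     1	/* tether of 2 */

/* One LEDBAT-style window update in the shape of update_cwnd(). */
static uint32_t
ledbat_cwnd(uint32_t cwnd, uint32_t incr, uint32_t rtt_cur,
    uint32_t base_rtt, uint32_t flight, uint32_t maxseg)
{
	uint32_t qdelay = rtt_cur - base_rtt;
	int32_t off_target = (int32_t)(TARGET_QDELAY - qdelay);

	if (off_target >= 0)
		cwnd += incr;		/* delay below target: grow */
	else
		cwnd -= cwnd >> 3;	/* delay above target: back off */

	uint32_t max_allowed = (ALLOWED_INCREASE * maxseg)
	    + (flight << TETHER_SHIFT);
	return (cwnd < max_allowed ? cwnd : max_allowed);
}

int
main(void)
{
	uint32_t maxseg = 1448, cwnd = 10 * 1448;

	/* Queuing delay 40 ms, below the 100 ms target: the window grows. */
	printf("low delay:  cwnd=%u\n",
	    ledbat_cwnd(cwnd, maxseg, 90, 50, 8 * 1448, maxseg));
	/* Queuing delay 150 ms: the window is cut by one-eighth. */
	printf("high delay: cwnd=%u\n",
	    ledbat_cwnd(cwnd, maxseg, 200, 50, 8 * 1448, maxseg));
	return 0;
}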
+ */ + flight_size = tp->snd_max - tp->snd_una; + + max_allowed_cwnd = (allowed_increase * tp->t_maxseg) + + (flight_size << tether_shift); + tp->snd_cwnd = min(tp->snd_cwnd, max_allowed_cwnd); + return; +} + +int tcp_ledbat_init(struct tcpcb *tp) { +#pragma unused(tp) + OSIncrementAtomic((volatile SInt32 *)&tcp_cc_ledbat.num_sockets); + return 0; +} + +int tcp_ledbat_cleanup(struct tcpcb *tp) { +#pragma unused(tp) + OSDecrementAtomic((volatile SInt32 *)&tcp_cc_ledbat.num_sockets); + return 0; +} + +/* Initialize the congestion window for a connection + * + */ + +void +tcp_ledbat_cwnd_init(struct tcpcb *tp) { + tp->snd_cwnd = tp->t_maxseg * bg_ss_fltsz; + tp->bg_ssthresh = tp->snd_ssthresh; +} + +/* Function to handle an in-sequence ack which is fast-path processing + * of an in sequence ack in tcp_input function (called as header prediction). + * This gets called only during congestion avoidance phase. + */ +void +tcp_ledbat_inseq_ack_rcvd(struct tcpcb *tp, struct tcphdr *th) { + int acked = 0; + u_int32_t incr = 0; + + acked = th->th_ack - tp->snd_una; + tp->t_bytes_acked += acked; + if (tp->t_bytes_acked > tp->snd_cwnd) { + tp->t_bytes_acked -= tp->snd_cwnd; + incr = tp->t_maxseg; + } + + if (tp->snd_cwnd < tp->snd_wnd && incr > 0) { + update_cwnd(tp, incr); + } +} +/* Function to process an ack. + */ +void +tcp_ledbat_ack_rcvd(struct tcpcb *tp, struct tcphdr *th) { + /* + * RFC 3465 - Appropriate Byte Counting. + * + * If the window is currently less than ssthresh, + * open the window by the number of bytes ACKed by + * the last ACK, however clamp the window increase + * to an upper limit "L". + * + * In congestion avoidance phase, open the window by + * one segment each time "bytes_acked" grows to be + * greater than or equal to the congestion window. + */ + + register u_int cw = tp->snd_cwnd; + register u_int incr = tp->t_maxseg; + int acked = 0; + + acked = th->th_ack - tp->snd_una; + tp->t_bytes_acked += acked; + if (cw >= tp->bg_ssthresh) { + /* congestion-avoidance */ + if (tp->t_bytes_acked < cw) { + /* No need to increase yet. */ + incr = 0; + } + } else { + /* + * If the user explicitly enables RFC3465 + * use 2*SMSS for the "L" param. Otherwise + * use the more conservative 1*SMSS. + * + * (See RFC 3465 2.3 Choosing the Limit) + */ + u_int abc_lim; + + abc_lim = (tcp_do_rfc3465_lim2 && + tp->snd_nxt == tp->snd_max) ? incr * 2 : incr; + + incr = lmin(acked, abc_lim); + } + if (tp->t_bytes_acked >= cw) + tp->t_bytes_acked -= cw; + if (incr > 0) + update_cwnd(tp, incr); +} + +void +tcp_ledbat_pre_fr(struct tcpcb *tp, struct tcphdr *th) { +#pragma unused(th) + + uint32_t win; + + win = min(tp->snd_wnd, tp->snd_cwnd) / + 2 / tp->t_maxseg; + if ( win < 2 ) + win = 2; + tp->snd_ssthresh = win * tp->t_maxseg; + if (tp->bg_ssthresh > tp->snd_ssthresh) + tp->bg_ssthresh = tp->snd_ssthresh; +} + +void +tcp_ledbat_post_fr(struct tcpcb *tp, struct tcphdr *th) { + int32_t ss; + + ss = tp->snd_max - th->th_ack; + + /* + * Complete ack. Inflate the congestion window to + * ssthresh and exit fast recovery. + * + * Window inflation should have left us with approx. + * snd_ssthresh outstanding data. But in case we + * would be inclined to send a burst, better to do + * it via the slow start mechanism. + */ + if (ss < (int32_t)tp->snd_ssthresh) + tp->snd_cwnd = ss + tp->t_maxseg; + else + tp->snd_cwnd = tp->snd_ssthresh; + tp->t_bytes_acked = 0; +} + +/* + * Function to handle connections that have been idle for + * some time. Slow start to get ack "clock" running again. 
+ * Clear base history after idle time. + */ +void +tcp_ledbat_after_idle(struct tcpcb *tp) { + int32_t n = N_RTT_BASE, i = (N_RTT_BASE - 1); + + /* Decide how many base history entries have to be cleared + * based on how long the connection has been idle. + */ + + if (tp->t_rttcur > 0) { + int32_t nrtt, idle_time; + + idle_time = tcp_now - tp->t_rcvtime; + nrtt = idle_time / tp->t_rttcur; + n = nrtt / rtt_samples_per_slot; + if (n > N_RTT_BASE) + n = N_RTT_BASE; + } + for (i = (N_RTT_BASE - 1); n > 0; --i, --n) { + tp->rtt_hist[i] = 0; + } + for (n = (N_RTT_BASE - 1); i >= 0; --i, --n) { + tp->rtt_hist[n] = tp->rtt_hist[i]; + tp->rtt_hist[i] = 0; + } + + /* Reset the congestion window */ + tp->snd_cwnd = tp->t_maxseg * bg_ss_fltsz; +} + +/* Function to change the congestion window when the retransmit + * timer fires. The behavior is the same as that for best-effort + * TCP, reduce congestion window to one segment and start probing + * the link using "slow start". The slow start threshold is set + * to half of the current window. Lower the background slow start + * threshold also. + */ +void +tcp_ledbat_after_timeout(struct tcpcb *tp) { + if (tp->t_state >= TCPS_ESTABLISHED) { + u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; + if (win < 2) + win = 2; + tp->snd_cwnd = tp->t_maxseg; + tp->snd_ssthresh = win * tp->t_maxseg; + tp->t_bytes_acked = 0; + tp->t_dupacks = 0; + + if (tp->bg_ssthresh > tp->snd_ssthresh) + tp->bg_ssthresh = tp->snd_ssthresh; + } +} + +/* + * Indicate whether this ack should be delayed. + * We can delay the ack if: + * - our last ack wasn't a 0-sized window. + * - the peer hasn't sent us a TH_PUSH data packet: if he did, take this + * as a clue that we need to ACK without any delay. This helps higher + * level protocols who won't send us more data even if the window is + * open because their last "segment" hasn't been ACKed + * Otherwise the receiver will ack every other full-sized segment or when the + * delayed ack timer fires. This will help to generate better rtt estimates for + * the other end if it is a ledbat sender. + * + */ + +int +tcp_ledbat_delay_ack(struct tcpcb *tp, struct tcphdr *th) { + if ((tp->t_flags & TF_RXWIN0SENT) == 0 && + (th->th_flags & TH_PUSH) == 0 && + (tp->t_flags & TF_DELACK) == 0) + return(1); + return(0); +} + +/* Change a connection to use ledbat. First, lower bg_ssthresh value + * if it needs to be. + */ +void +tcp_ledbat_switch_cc(struct tcpcb *tp, uint16_t old_cc_index) { +#pragma unused(old_cc_index) + uint32_t cwnd; + + if (tp->bg_ssthresh == 0 || tp->bg_ssthresh > tp->snd_ssthresh) + tp->bg_ssthresh = tp->snd_ssthresh; + + cwnd = min(tp->snd_wnd, tp->snd_cwnd); + + if (tp->snd_cwnd > tp->bg_ssthresh) + cwnd = cwnd / tp->t_maxseg; + else + cwnd = cwnd / 2 / tp->t_maxseg; + + if (cwnd < bg_ss_fltsz) + cwnd = bg_ss_fltsz; + + tp->snd_cwnd = cwnd * tp->t_maxseg; + tp->t_bytes_acked = 0; + + OSIncrementAtomic((volatile SInt32 *)&tcp_cc_ledbat.num_sockets); +} diff --git a/bsd/netinet/tcp_newreno.c b/bsd/netinet/tcp_newreno.c new file mode 100644 index 000000000..5c9db2de9 --- /dev/null +++ b/bsd/netinet/tcp_newreno.c @@ -0,0 +1,344 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. 
The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/protosw.h>
+
+#include <net/route.h>
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+
+#if INET6
+#include <netinet/ip6.h>
+#endif
+#include <netinet/ip_var.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+#include <netinet/tcp_cc.h>
+#include <libkern/OSAtomic.h>
+
+int tcp_newreno_init(struct tcpcb *tp);
+int tcp_newreno_cleanup(struct tcpcb *tp);
+void tcp_newreno_cwnd_init_or_reset(struct tcpcb *tp);
+void tcp_newreno_inseq_ack_rcvd(struct tcpcb *tp, struct tcphdr *th);
+void tcp_newreno_ack_rcvd(struct tcpcb *tp, struct tcphdr *th);
+void tcp_newreno_pre_fr(struct tcpcb *tp, struct tcphdr *th);
+void tcp_newreno_post_fr(struct tcpcb *tp, struct tcphdr *th);
+void tcp_newreno_after_idle(struct tcpcb *tp);
+void tcp_newreno_after_timeout(struct tcpcb *tp);
+int tcp_newreno_delay_ack(struct tcpcb *tp, struct tcphdr *th);
+void tcp_newreno_switch_cc(struct tcpcb *tp, uint16_t old_index);
+
+struct tcp_cc_algo tcp_cc_newreno = {
+	.name = "newreno",
+	.init = tcp_newreno_init,
+	.cleanup = tcp_newreno_cleanup,
+	.cwnd_init = tcp_newreno_cwnd_init_or_reset,
+	.inseq_ack_rcvd = tcp_newreno_inseq_ack_rcvd,
+	.ack_rcvd = tcp_newreno_ack_rcvd,
+	.pre_fr = tcp_newreno_pre_fr,
+	.post_fr = tcp_newreno_post_fr,
+	.after_idle = tcp_newreno_cwnd_init_or_reset,
+	.after_timeout = tcp_newreno_after_timeout,
+	.delay_ack = tcp_newreno_delay_ack,
+	.switch_to = tcp_newreno_switch_cc
+};
+
+extern int tcp_do_rfc3465;
+extern int tcp_do_rfc3465_lim2;
+extern int maxseg_unacked;
+
+int tcp_newreno_init(struct tcpcb *tp) {
+#pragma unused(tp)
+	OSIncrementAtomic((volatile SInt32 *)&tcp_cc_newreno.num_sockets);
+	return 0;
+}
+
+int tcp_newreno_cleanup(struct tcpcb *tp) {
+#pragma unused(tp)
+	OSDecrementAtomic((volatile SInt32 *)&tcp_cc_newreno.num_sockets);
+	return 0;
+}
+
+/* Initialize the congestion window for a connection, or
+ * handle connections that have been idle for
+ * some time. In this state, no acks are
+ * expected to clock out any data we send --
+ * slow start to get ack "clock" running again.
+ *
+ * Set the slow-start flight size depending on whether
+ * this is a local network or not.
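+ *
+ * With tcp_do_rfc3390 set, the initial window below is
+ * min(4 * MSS, max(2 * MSS, 4380)) bytes; e.g. for a 1460-byte MSS
+ * that works out to 4380 bytes, i.e. three segments.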
+ */
+void
+tcp_newreno_cwnd_init_or_reset(struct tcpcb *tp) {
+	if ( tp->t_flags & TF_LOCAL )
+		tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local;
+	else {
+		/* Calculate initial cwnd according to RFC3390,
+		 * - On a standard link, this will result in a higher cwnd
+		 * and improve initial transfer rate.
+		 * - Keep the old ss_fltsz sysctl for ABI compatibility issues,
+		 * but it will be overridden if the tcp_do_rfc3390 sysctl is set.
+		 */
+
+		if (tcp_do_rfc3390)
+			tp->snd_cwnd = min(4 * tp->t_maxseg, max(2 * tp->t_maxseg, 4380));
+
+		else
+			tp->snd_cwnd = tp->t_maxseg * ss_fltsz;
+	}
+}
+
+
+/* Function to handle an in-sequence ack during the congestion avoidance phase.
+ * This will get called from header prediction code.
+ */
+void
+tcp_newreno_inseq_ack_rcvd(struct tcpcb *tp, struct tcphdr *th) {
+	int acked = 0;
+	acked = th->th_ack - tp->snd_una;
+	/*
+	 * Grow the congestion window, if the
+	 * connection is cwnd bound.
+	 */
+	if (tp->snd_cwnd < tp->snd_wnd) {
+		tp->t_bytes_acked += acked;
+		if (tp->t_bytes_acked > tp->snd_cwnd) {
+			tp->t_bytes_acked -= tp->snd_cwnd;
+			tp->snd_cwnd += tp->t_maxseg;
+		}
+	}
+}
+/* Function to process an ack.
+ */
+void
+tcp_newreno_ack_rcvd(struct tcpcb *tp, struct tcphdr *th) {
+	/*
+	 * RFC 3465 - Appropriate Byte Counting.
+	 *
+	 * If the window is currently less than ssthresh,
+	 * open the window by the number of bytes ACKed by
+	 * the last ACK, however clamp the window increase
+	 * to an upper limit "L".
+	 *
+	 * In congestion avoidance phase, open the window by
+	 * one segment each time "bytes_acked" grows to be
+	 * greater than or equal to the congestion window.
+	 */
+
+	register u_int cw = tp->snd_cwnd;
+	register u_int incr = tp->t_maxseg;
+	int acked = 0;
+
+	acked = th->th_ack - tp->snd_una;
+	if (tcp_do_rfc3465) {
+
+		if (cw >= tp->snd_ssthresh) {
+			tp->t_bytes_acked += acked;
+			if (tp->t_bytes_acked >= cw) {
+				/* Time to increase the window. */
+				tp->t_bytes_acked -= cw;
+			} else {
+				/* No need to increase yet. */
+				incr = 0;
+			}
+		} else {
+			/*
+			 * If the user explicitly enables RFC3465
+			 * use 2*SMSS for the "L" param. Otherwise
+			 * use the more conservative 1*SMSS.
+			 *
+			 * (See RFC 3465 2.3 Choosing the Limit)
+			 */
+			u_int abc_lim;
+
+			abc_lim = (tcp_do_rfc3465_lim2 &&
+				tp->snd_nxt == tp->snd_max) ? incr * 2 : incr;
+
+			incr = lmin(acked, abc_lim);
+		}
+	} else {
+		/*
+		 * If the window gives us less than ssthresh packets
+		 * in flight, open exponentially (segsz per packet).
+		 * Otherwise open linearly: segsz per window
+		 * (segsz^2 / cwnd per packet).
+		 */
+
+		if (cw >= tp->snd_ssthresh)
+			incr = max((incr * incr / cw), 1);
+	}
+	tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale);
+}
+
+void
+tcp_newreno_pre_fr(struct tcpcb *tp, struct tcphdr *th) {
+#pragma unused(th)
+
+	uint32_t win;
+
+	win = min(tp->snd_wnd, tp->snd_cwnd) /
+		2 / tp->t_maxseg;
+	if ( win < 2 )
+		win = 2;
+	tp->snd_ssthresh = win * tp->t_maxseg;
+}
+
+void
+tcp_newreno_post_fr(struct tcpcb *tp, struct tcphdr *th) {
+	int32_t ss;
+
+	ss = tp->snd_max - th->th_ack;
+
+	/*
+	 * Complete ack. Inflate the congestion window to
+	 * ssthresh and exit fast recovery.
+	 *
+	 * Window inflation should have left us with approx.
+	 * snd_ssthresh outstanding data. But in case we
+	 * would be inclined to send a burst, better to do
+	 * it via the slow start mechanism.
+	 */
+	if (ss < (int32_t)tp->snd_ssthresh)
+		tp->snd_cwnd = ss + tp->t_maxseg;
+	else
+		tp->snd_cwnd = tp->snd_ssthresh;
+	tp->t_bytes_acked = 0;
+}
+
+/* Function to change the congestion window when the retransmit
+ * timer fires.
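+ * The window collapses to a single segment and ssthresh is set to
+ * half the current flight; e.g. with 20 * MSS outstanding and a
+ * large receive window, the connection restarts slow start with
+ * ssthresh = 10 * MSS.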
+ */
+void
+tcp_newreno_after_timeout(struct tcpcb *tp) {
+	/*
+	 * Close the congestion window down to one segment
+	 * (we'll open it by one segment for each ack we get).
+	 * Since we probably have a window's worth of unacked
+	 * data accumulated, this "slow start" keeps us from
+	 * dumping all that data as back-to-back packets (which
+	 * might overwhelm an intermediate gateway).
+	 *
+	 * There are two phases to the opening: Initially we
+	 * open by one mss on each ack.  This makes the window
+	 * size increase exponentially with time.  If the
+	 * window is larger than the path can handle, this
+	 * exponential growth results in dropped packet(s)
+	 * almost immediately.  To get more time between
+	 * drops but still "push" the network to take advantage
+	 * of improving conditions, we switch from exponential
+	 * to linear window opening at some threshold size.
+	 * For a threshold, we use half the current window
+	 * size, truncated to a multiple of the mss.
+	 *
+	 * (the minimum cwnd that will give us exponential
+	 * growth is 2 mss.  We don't allow the threshold
+	 * to go below this.)
+	 */
+	if (tp->t_state >= TCPS_ESTABLISHED) {
+		u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
+		if (win < 2)
+			win = 2;
+		tp->snd_cwnd = tp->t_maxseg;
+		tp->snd_ssthresh = win * tp->t_maxseg;
+		tp->t_bytes_acked = 0;
+		tp->t_dupacks = 0;
+	}
+}
+
+/*
+ * Indicate whether this ack should be delayed.
+ * We can delay the ack if:
+ *	- delayed acks are enabled and set to 1, which behaves the same as
+ *	  when the value is set to 2. We kept this for binary compatibility.
+ *	- delayed acks are enabled and set to 2, will "ack every other packet"
+ *	- if our last ack wasn't a 0-sized window.
+ *	- if the peer hasn't sent us a TH_PUSH data packet (this solves 3649245).
+ *	  If TH_PUSH is set, take this as a clue that we need to ACK
+ *	  with no delay. This helps higher level protocols who won't send
+ *	  us more data even if the window is open because their
+ *	  last "segment" hasn't been ACKed.
+ *	- delayed acks are enabled and set to 3, will do "streaming detection"
+ *	  (see the comment in tcp_input.c) and
+ *	  - if we receive more than "maxseg_unacked" full packets in the last 100ms
+ *	  - if the connection is not in slow-start or idle or loss/recovery states
+ *	  - if those criteria aren't met, it will ack every other packet.
+ */
+
+int
+tcp_newreno_delay_ack(struct tcpcb *tp, struct tcphdr *th) {
+	switch (tcp_delack_enabled) {
+	case 1:
+	case 2:
+		if ((tp->t_flags & TF_RXWIN0SENT) == 0 &&
+			(th->th_flags & TH_PUSH) == 0 &&
+			(tp->t_flags & TF_DELACK) == 0)
+			return(1);
+		break;
+	case 3:
+		if ((tp->t_flags & TF_RXWIN0SENT) == 0 &&
+			(th->th_flags & TH_PUSH) == 0 &&
+			((tp->t_unacksegs == 0) ||
+			((tp->t_flags & TF_STRETCHACK) != 0 &&
+			tp->t_unacksegs < (maxseg_unacked - 1))))
+			return(1);
+		break;
+	}
+	return(0);
+}
+
+/* Switch to newreno from a different CC. If the connection is in
+ * congestion avoidance state, it can continue to use the current
+ * congestion window because it is going to be conservative. But
+ * if the connection is in slow-start, we will halve the congestion
+ * window and let newreno work from there.
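+ * e.g. a connection in slow-start with min(snd_wnd, snd_cwnd) of
+ * 10 * MSS resumes with snd_cwnd = 5 * MSS, while one in congestion
+ * avoidance keeps its window, rounded down to whole segments.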
+ */ +void +tcp_newreno_switch_cc(struct tcpcb *tp, uint16_t old_index) { +#pragma unused(old_index) + + uint32_t cwnd = min(tp->snd_wnd, tp->snd_cwnd); + if (tp->snd_cwnd >= tp->snd_ssthresh) { + cwnd = cwnd / tp->t_maxseg; + } else { + cwnd = cwnd / 2 / tp->t_maxseg; + } + if (cwnd < 1) + cwnd = 1; + tp->snd_cwnd = cwnd * tp->t_maxseg; + + /* Start counting bytes for RFC 3465 again */ + tp->t_bytes_acked = 0; + + OSIncrementAtomic((volatile SInt32 *)&tcp_cc_newreno.num_sockets); +} diff --git a/bsd/netinet/tcp_output.c b/bsd/netinet/tcp_output.c index 69a2c2aed..5c310770d 100644 --- a/bsd/netinet/tcp_output.c +++ b/bsd/netinet/tcp_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -81,6 +81,7 @@ #include <sys/socketvar.h> #include <net/route.h> +#include <net/ntstat.h> #include <net/if_var.h> #include <netinet/in.h> @@ -89,6 +90,7 @@ #include <netinet/ip.h> #include <netinet/in_pcb.h> #include <netinet/ip_var.h> +#include <mach/sdt.h> #if INET6 #include <netinet6/in6_pcb.h> #include <netinet/ip6.h> @@ -101,10 +103,12 @@ #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> #include <netinet/tcpip.h> +#include <netinet/tcp_cc.h> #if TCPDEBUG #include <netinet/tcp_debug.h> #endif #include <sys/kdebug.h> +#include <mach/sdt.h> #if IPSEC #include <netinet6/ipsec.h> @@ -118,48 +122,55 @@ #define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 3) #define DBG_FNC_TCP_OUTPUT NETDBG_CODE(DBG_NETTCP, (4 << 8) | 1) - #ifdef notyet extern struct mbuf *m_copypack(); #endif int path_mtu_discovery = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW | CTLFLAG_LOCKED, &path_mtu_discovery, 1, "Enable Path MTU Discovery"); int ss_fltsz = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW | CTLFLAG_LOCKED, &ss_fltsz, 1, "Slow start flight size"); int ss_fltsz_local = 8; /* starts with eight segments max */ -SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW | CTLFLAG_LOCKED, &ss_fltsz_local, 1, "Slow start flight size for local networks"); -int tcp_do_newreno = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno, - 0, "Enable NewReno Algorithms"); - int tcp_do_tso = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_tso, 0, "Enable TCP Segmentation Offload"); int tcp_ecn_outbound = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_initiate_out, CTLFLAG_RW, &tcp_ecn_outbound, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_initiate_out, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_outbound, 0, "Initiate ECN for outbound connections"); int tcp_ecn_inbound = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_negotiate_in, CTLFLAG_RW, &tcp_ecn_inbound, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, ecn_negotiate_in, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_ecn_inbound, 0, "Allow ECN negotiation for inbound connections"); int tcp_packet_chaining = 50; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, packetchain, CTLFLAG_RW, &tcp_packet_chaining, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, packetchain, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_packet_chaining, 0, "Enable TCP output packet chaining"); int tcp_output_unlocked = 1; -SYSCTL_INT(_net_inet_tcp, 
OID_AUTO, socket_unlocked_on_output, CTLFLAG_RW, &tcp_output_unlocked,
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, socket_unlocked_on_output, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_output_unlocked,
 	0, "Unlock TCP when sending packets down to IP");
 
+int tcp_do_rfc3390 = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW | CTLFLAG_LOCKED,
+	&tcp_do_rfc3390, 1, "Calculate initial slowstart cwnd depending on MSS");
+
+int tcp_min_iaj_win = MIN_IAJ_WIN;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, min_iaj_win, CTLFLAG_RW | CTLFLAG_LOCKED,
+	&tcp_min_iaj_win, 1, "Minimum recv win based on inter-packet arrival jitter");
+
+int tcp_acc_iaj_react_limit = ACC_IAJ_REACT_LIMIT;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, acc_iaj_react_limit, CTLFLAG_RW | CTLFLAG_LOCKED,
+	&tcp_acc_iaj_react_limit, 1, "Accumulated IAJ when receiver starts to react");
+
 static int32_t packchain_newlist = 0;
 static int32_t packchain_looped = 0;
 static int32_t packchain_sent = 0;
@@ -181,10 +192,13 @@ extern int ip_use_randomid;
 #endif /* RANDOM_IP_ID */
 extern u_int32_t dlil_filter_count;
 extern u_int32_t kipf_count;
+extern int tcp_recv_bg;
 
 static int tcp_ip_output(struct socket *, struct tcpcb *, struct mbuf *, int,
     struct mbuf *, int, int, int32_t);
 
+static inline int is_tcp_recv_bg(struct socket *so);
+
 static __inline__ u_int16_t
 get_socket_id(struct socket * s)
 {
@@ -200,6 +214,12 @@ get_socket_id(struct socket * s)
 	return (val);
 }
 
+static inline int
+is_tcp_recv_bg(struct socket *so)
+{
+	return (so->so_traffic_mgt_flags & TRAFFIC_MGT_TCP_RECVBG);
+}
+
 /*
  * Tcp output routine: figure out what should be sent and send it.
 *
@@ -242,10 +262,10 @@ tcp_output(struct tcpcb *tp)
 #ifdef IPSEC
 	unsigned ipsec_optlen = 0;
 #endif
-	int maxburst = TCP_MAXBURST;
 	int    last_off = 0;
 	int    m_off;
-	struct mbuf *m_last = NULL;
+	int    idle_time = 0;
+	struct mbuf *m_lastm = NULL;
 	struct mbuf *m_head = NULL;
 	struct mbuf *packetlist = NULL;
 	struct mbuf *tp_inp_options = tp->t_inpcb->inp_depend4.inp4_options;
@@ -265,28 +285,17 @@ tcp_output(struct tcpcb *tp)
 	 * to send, then transmit; otherwise, investigate further.
 	 */
 	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
-	if (idle && tp->t_rcvtime >= tp->t_rxtcur) {
-		/*
-		 * We have been idle for "a while" and no acks are
-		 * expected to clock out any data we send --
-		 * slow start to get ack "clock" running again.
-		 *
-		 * Set the slow-start flight size depending on whether
-		 * this is a local network or not.
- */
-	if (
-#if INET6
-	    (isipv6 && in6_localaddr(&tp->t_inpcb->in6p_faddr)) ||
-	    (!isipv6 &&
-#endif
-	     in_localaddr(tp->t_inpcb->inp_faddr)
-#if INET6
-	    )
-#endif
-	    )
-		tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local;
-	else
-		tp->snd_cwnd = tp->t_maxseg * ss_fltsz;
+
+	/* Since idle_time is a signed integer, the following subtraction
+	 * will take care of wrap-around of tcp_now
+	 */
+	idle_time = tcp_now - tp->t_rcvtime;
+	if (idle && idle_time >= tp->t_rxtcur) {
+		if (CC_ALGO(tp)->after_idle != NULL)
+			CC_ALGO(tp)->after_idle(tp);
+		DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb,
+			struct tcpcb *, tp, struct tcphdr *, NULL,
+			int32_t, TCP_CC_IDLE_TIMEOUT);
 	}
 	tp->t_flags &= ~TF_LASTIDLE;
 	if (idle) {
@@ -344,14 +353,16 @@ again:
 	/* set Retransmit timer if it wasn't set
	 * reset Persist timer and shift register as the
-	 * adversed peer window may not be valid anymore
+	 * advertised peer window may not be valid anymore
 	 */
	if (!tp->t_timer[TCPT_REXMT]) {
-		tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
+		tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
 		if (tp->t_timer[TCPT_PERSIST]) {
 			tp->t_timer[TCPT_PERSIST] = 0;
 			tp->t_rxtshift = 0;
+			tp->t_persist_stop = 0;
+			tp->rxt_start = 0;
 		}
 	}
@@ -364,10 +375,12 @@ again:
 			tcp_drop(tp, EADDRNOTAVAIL);
 			return(EADDRNOTAVAIL);
 		}
-		else
+		else {
+			tcp_check_timer_state(tp);
 			return(0); /* silently ignore, keep data in socket: address may be back */
+		}
 	}
-	ifafree(&ia->ia_ifa);
+	IFA_REMREF(&ia->ia_ifa);
 
 	/*
	 * Address is still valid; check for multipages capability
@@ -463,6 +476,12 @@ again:
 			tcpstat.tcps_sack_rexmits++;
 			tcpstat.tcps_sack_rexmit_bytes +=
 			    min(len, tp->t_maxseg);
+			if (nstat_collect) {
+				nstat_route_tx(tp->t_inpcb->inp_route.ro_rt, 1, min(len, tp->t_maxseg), NSTAT_TX_FLAG_RETRANSMIT);
+				locked_add_64(&tp->t_inpcb->inp_stat->txpackets, 1);
+				locked_add_64(&tp->t_inpcb->inp_stat->txbytes, min(len, tp->t_maxseg));
+				tp->t_stat.txretransmitbytes += min(len, tp->t_maxseg);
+			}
 		} else
 			len = 0;
 	}
@@ -507,6 +526,8 @@ after_sack_rexmit:
 		} else {
 			tp->t_timer[TCPT_PERSIST] = 0;
 			tp->t_rxtshift = 0;
+			tp->rxt_start = 0;
+			tp->t_persist_stop = 0;
 		}
 	}
@@ -587,6 +608,8 @@ after_sack_rexmit:
 		    (TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) {
 			tp->t_flags &= ~TF_CLOSING;
 			(void) tcp_close(tp);
+		} else {
+			tcp_check_timer_state(tp);
 		}
 		KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
@@ -604,7 +627,12 @@ after_sack_rexmit:
 		flags &= ~TH_FIN;
 	}
 
-	if (len < 0) {
+	/* The check here used to be (len < 0). Sometimes len is zero when
+	 * the congestion window is closed and we need to check if the persist
+	 * timer has to be set in that case. But don't set persist until the
+	 * connection is established.
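+	 * (Arming the persist timer on a zero-length send against a closed
+	 * window matters: if the peer's window update is lost, the zero
+	 * window probes are the only thing that will reopen the connection.)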
+ */ + if (len <= 0 && !(flags & TH_SYN)) { /* * If FIN has been sent but not acked, * but we haven't been called to retransmit, @@ -619,6 +647,7 @@ after_sack_rexmit: if (sendwin == 0) { tp->t_timer[TCPT_REXMT] = 0; tp->t_rxtshift = 0; + tp->rxt_start = 0; tp->snd_nxt = tp->snd_una; if (tp->t_timer[TCPT_PERSIST] == 0) tcp_setpersist(tp); @@ -782,7 +811,7 @@ after_sack_rexmit: if (tp->sack_enable && (tp->t_state >= TCPS_ESTABLISHED) && SEQ_GT(tp->snd_max, tp->snd_una) && tp->t_timer[TCPT_REXMT] == 0 && tp->t_timer[TCPT_PERSIST] == 0) { - tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); goto just_return; } /* @@ -810,6 +839,7 @@ after_sack_rexmit: if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 && tp->t_timer[TCPT_PERSIST] == 0) { tp->t_rxtshift = 0; + tp->rxt_start = 0; tcp_setpersist(tp); } just_return: @@ -833,6 +863,8 @@ just_return: if ((tp->t_flags & (TF_CLOSING|TF_SENDINPROG)) == TF_CLOSING) { tp->t_flags &= ~TF_CLOSING; (void) tcp_close(tp); + } else { + tcp_check_timer_state(tp); } KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return (0); @@ -1136,9 +1168,19 @@ send: else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { tcpstat.tcps_sndrexmitpack++; tcpstat.tcps_sndrexmitbyte += len; + if (nstat_collect) { + nstat_route_tx(tp->t_inpcb->inp_route.ro_rt, 1, len, NSTAT_TX_FLAG_RETRANSMIT); + locked_add_64(&tp->t_inpcb->inp_stat->txpackets, 1); + locked_add_64(&tp->t_inpcb->inp_stat->txbytes, len); + tp->t_stat.txretransmitbytes += len; + } } else { tcpstat.tcps_sndpack++; tcpstat.tcps_sndbyte += len; + if (nstat_collect) { + locked_add_64(&tp->t_inpcb->inp_stat->txpackets, 1); + locked_add_64(&tp->t_inpcb->inp_stat->txbytes, len); + } } #ifdef notyet if ((m = m_copypack(so->so_snd.sb_mb, off, @@ -1221,7 +1263,7 @@ send: * setting the mbuf pointer to NULL is sufficient to disable the hint mechanism. 
*/ if (m_head != so->so_snd.sb_mb || sack_rxmit || last_off != off) - m_last = NULL; + m_lastm = NULL; last_off = off + len; m_head = so->so_snd.sb_mb; @@ -1235,7 +1277,7 @@ send: * m_copym_with_hdrs will always return the last mbuf pointer and the offset into it that * it acted on to fullfill the current request, whether a valid 'hint' was passed in or not */ - if ((m = m_copym_with_hdrs(so->so_snd.sb_mb, off, len, M_DONTWAIT, &m_last, &m_off)) == NULL) { + if ((m = m_copym_with_hdrs(so->so_snd.sb_mb, off, len, M_DONTWAIT, &m_lastm, &m_off)) == NULL) { error = ENOBUFS; goto out; } @@ -1285,6 +1327,10 @@ send: ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); tcp_fillheaders(tp, ip6, th); + if ((tp->ecn_flags & TE_SENDIPECT) != 0 && len && + !SEQ_LT(tp->snd_nxt, tp->snd_max)) { + ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); + } } else #endif /* INET6 */ { @@ -1349,13 +1395,25 @@ send: if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) { if (recwin > (int32_t)slowlink_wsize) recwin = slowlink_wsize; - th->th_win = htons((u_short) (recwin>>tp->rcv_scale)); } - else { - if (recwin > (int32_t)(TCP_MAXWIN << tp->rcv_scale)) - recwin = (int32_t)(TCP_MAXWIN << tp->rcv_scale); - th->th_win = htons((u_short) (recwin>>tp->rcv_scale)); + +#if TRAFFIC_MGT + if (tcp_recv_bg == 1 || is_tcp_recv_bg(so)) { + if (tp->acc_iaj > tcp_acc_iaj_react_limit) { + uint32_t min_iaj_win = tcp_min_iaj_win * tp->t_maxseg; + if (tp->iaj_rwintop == 0 || + SEQ_LT(tp->iaj_rwintop, tp->rcv_adv)) + tp->iaj_rwintop = tp->rcv_adv; + if (SEQ_LT(tp->iaj_rwintop, tp->rcv_nxt + min_iaj_win)) + tp->iaj_rwintop = tp->rcv_nxt + min_iaj_win; + recwin = min(tp->iaj_rwintop - tp->rcv_nxt, recwin); + } } +#endif /* TRAFFIC_MGT */ + + if (recwin > (int32_t)(TCP_MAXWIN << tp->rcv_scale)) + recwin = (int32_t)(TCP_MAXWIN << tp->rcv_scale); + th->th_win = htons((u_short) (recwin>>tp->rcv_scale)); /* * Adjust the RXWIN0SENT flag - indicate that we have advertised @@ -1365,7 +1423,7 @@ send: * to read more data then can be buffered prior to transmitting on * the connection. */ - if (recwin == 0) + if (th->th_win == 0) tp->t_flags |= TF_RXWIN0SENT; else tp->t_flags &= ~TF_RXWIN0SENT; @@ -1387,13 +1445,17 @@ send: */ m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ #if INET6 - if (isipv6) + if (isipv6) { /* * ip6_plen is not need to be filled now, and will be filled * in ip6_output. */ - th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), - sizeof(struct tcphdr) + optlen + len); + m->m_pkthdr.csum_flags = CSUM_TCPIPV6; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + if (len + optlen) + th->th_sum = in_addword(th->th_sum, + htons((u_short)(optlen + len))); + } else #endif /* INET6 */ { @@ -1407,7 +1469,6 @@ send: /* * Enable TSO and specify the size of the segments. * The TCP pseudo header checksum is always provided. - * XXX: Fixme: This is currently not the case for IPv6. */ if (tso) { #if INET6 @@ -1450,7 +1511,7 @@ send: * not currently timing anything. 
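 	 * (Note: t_rtttime changes below from a "started" tick flag to the
 	 * tcp_now value at transmit time, so the RTT sample is presumably
 	 * computed directly as tcp_now - t_rtttime when the ACK returns.)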
*/ if (tp->t_rtttime == 0) { - tp->t_rtttime = 1; + tp->t_rtttime = tcp_now; tp->t_rtseq = startseq; tcpstat.tcps_segstimed++; } @@ -1471,8 +1532,10 @@ timer: if (tp->t_timer[TCPT_PERSIST]) { tp->t_timer[TCPT_PERSIST] = 0; tp->t_rxtshift = 0; + tp->rxt_start = 0; + tp->t_persist_stop = 0; } - tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); } } else { /* @@ -1510,6 +1573,15 @@ timer: */ #if INET6 if (isipv6) { + struct rtentry *rt6; + struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 }; + unsigned int outif; + + KERNEL_DEBUG(DBG_LAYER_BEG, + ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), + (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) | + (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)), + 0,0,0); /* * we separately set hoplimit for every segment, since the * user might want to change the value via setsockopt. @@ -1531,46 +1603,40 @@ timer: #endif /*IPSEC*/ m->m_pkthdr.socket_id = socket_id; -#if PKT_PRIORITY - set_traffic_class(m, so, MBUF_TC_NONE); -#endif /* PKT_PRIORITY */ - error = ip6_output(m, - inp6_pktopts, - &tp->t_inpcb->in6p_route, - (so_options & SO_DONTROUTE), NULL, NULL, 0); + rt6 = tp->t_inpcb->in6p_route.ro_rt; + if (rt6 != NULL && rt6->rt_ifp != NULL + && rt6->rt_ifp != lo_ifp) + set_packet_tclass(m, so, MBUF_TC_UNSPEC, 1); + + DTRACE_TCP5(send, struct mbuf *, m, struct inpcb *, tp->t_inpcb, struct ip6_hdr *, ip6, + struct tcpcb *, tp, struct tcphdr *, th); + + if (tp->t_inpcb->inp_flags & INP_BOUND_IF) + ip6oa.ip6oa_boundif = tp->t_inpcb->inp_boundif; + + ip6oa.ip6oa_nocell = (tp->t_inpcb->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; + + error = ip6_output(m, inp6_pktopts, &tp->t_inpcb->in6p_route, + (so_options & SO_DONTROUTE) | IPV6_OUTARGS, NULL, NULL, + &ip6oa); + + /* Refresh rt6 as we may have lost the route while in ip6_output() */ + if ((rt6 = tp->t_inpcb->in6p_route.ro_rt) != NULL && + (outif = rt6->rt_ifp->if_index) != tp->t_inpcb->in6p_last_outif) + tp->t_inpcb->in6p_last_outif = outif; } else #endif /* INET6 */ { ip->ip_len = m->m_pkthdr.len; -#if INET6 - if (isipv6) - ip->ip_ttl = in6_selecthlim(tp->t_inpcb, - tp->t_inpcb->in6p_route.ro_rt ? - tp->t_inpcb->in6p_route.ro_rt->rt_ifp - : NULL); - else -#endif /* INET6 */ ip->ip_ttl = tp->t_inpcb->inp_ip_ttl; /* XXX */ ip->ip_tos |= (tp->t_inpcb->inp_ip_tos & ~IPTOS_ECN_MASK); /* XXX */ -#if INET6 - if (isipv6) { - KERNEL_DEBUG(DBG_LAYER_BEG, - ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), - (((tp->t_inpcb->in6p_laddr.s6_addr16[0] & 0xffff) << 16) | - (tp->t_inpcb->in6p_faddr.s6_addr16[0] & 0xffff)), - 0,0,0); - } - else -#endif - { - KERNEL_DEBUG(DBG_LAYER_BEG, - ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), - (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) | - (tp->t_inpcb->inp_faddr.s_addr & 0xffff)), - 0,0,0); - } + KERNEL_DEBUG(DBG_LAYER_BEG, + ((tp->t_inpcb->inp_fport << 16) | tp->t_inpcb->inp_lport), + (((tp->t_inpcb->inp_laddr.s_addr & 0xffff) << 16) | + (tp->t_inpcb->inp_faddr.s_addr & 0xffff)), + 0,0,0); /* * See if we should do MTU discovery. 
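 	 *
 	 * (Note on the IPv6 branch above: the new ip6_out_args structure
 	 * carries the socket's bound interface scope and its cellular-deny
 	 * flag into ip6_output in a single argument, and the cached route
 	 * is re-read afterwards since it may have been dropped while in
 	 * ip6_output.)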
@@ -1596,11 +1662,18 @@ timer: lost = 0; m->m_pkthdr.socket_id = socket_id; m->m_nextpkt = NULL; -#if PKT_PRIORITY - set_traffic_class(m, so, MBUF_TC_NONE); -#endif /* PKT_PRIORITY */ + + if (tp->t_inpcb->inp_route.ro_rt != NULL && + tp->t_inpcb->inp_route.ro_rt->rt_ifp != NULL && + tp->t_inpcb->inp_route.ro_rt->rt_ifp != lo_ifp) + set_packet_tclass(m, so, MBUF_TC_UNSPEC, 0); + tp->t_pktlist_sentlen += len; tp->t_lastchain++; + + DTRACE_TCP5(send, struct mbuf *, m, struct inpcb *, tp->t_inpcb, + struct ip *, ip, struct tcpcb *, tp, struct tcphdr *, th); + if (tp->t_pktlist_head != NULL) { tp->t_pktlist_tail->m_nextpkt = m; tp->t_pktlist_tail = m; @@ -1685,12 +1758,17 @@ out: if (error == ENOBUFS) { if (!tp->t_timer[TCPT_REXMT] && !tp->t_timer[TCPT_PERSIST]) - tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); tp->snd_cwnd = tp->t_maxseg; tp->t_bytes_acked = 0; + tcp_check_timer_state(tp); KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); + + DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb, + struct tcpcb *, tp, struct tcphdr *, NULL, + int32_t, TCP_CC_OUTPUT_ERROR); return (0); } if (error == EMSGSIZE) { @@ -1710,21 +1788,26 @@ out: tp->t_flags &= ~TF_TSO; tcp_mtudisc(tp->t_inpcb, 0); + tcp_check_timer_state(tp); + KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return 0; } if ((error == EHOSTUNREACH || error == ENETDOWN) && TCPS_HAVERCVDSYN(tp->t_state)) { tp->t_softerror = error; + tcp_check_timer_state(tp); KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return (0); } + tcp_check_timer_state(tp); KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END, 0,0,0,0,0); return (error); } tcpstat.tcps_sndtotal++; +#if INET6 /* * Data sent (as far as we can tell). * If this advertises a larger window than any other segment, @@ -1733,18 +1816,21 @@ out: * we unlock the socket. * NOTE: for now, this is done in tcp_ip_output for IPv4 */ -#if INET6 if (isipv6) { if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + recwin; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); + tp->t_timer[TCPT_DELACK] = 0; + tp->t_unacksegs = 0; } #endif KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT | DBG_FUNC_END,0,0,0,0,0); - if (sendalot && (!tcp_do_newreno || --maxburst)) + if (sendalot) goto again; + + tcp_check_timer_state(tp); return (0); } @@ -1758,13 +1844,12 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, struct inpcb *inp = tp->t_inpcb; struct ip_out_args ipoa; struct route ro; -#if CONFIG_OUT_IF unsigned int outif; -#endif /* CONFIG_OUT_IF */ /* If socket was bound to an ifindex, tell ip_output about it */ - ipoa.ipoa_ifscope = (inp->inp_flags & INP_BOUND_IF) ? + ipoa.ipoa_boundif = (inp->inp_flags & INP_BOUND_IF) ? inp->inp_boundif : IFSCOPE_NONE; + ipoa.ipoa_nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; flags |= IP_OUTARGS; /* Copy the cached route and take an extra reference */ @@ -1781,14 +1866,21 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, tp->rcv_adv = tp->rcv_nxt + recwin; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); + tp->t_timer[TCPT_DELACK] = 0; + tp->t_unacksegs = 0; /* * If allowed, unlock TCP socket while in IP * but only if the connection is established and - * if we're not sending from an upcall. 
+ * in a normal mode where reentrancy on the tcpcb won't be + * an issue: + * - there is no SACK episode + * - we're not in Fast Recovery mode + * - if we're not sending from an upcall. */ if (tcp_output_unlocked && ((so->so_flags & SOF_UPCALLINUSE) == 0) && - (tp->t_state == TCPS_ESTABLISHED) && (sack_in_progress == 0)) { + (tp->t_state == TCPS_ESTABLISHED) && (sack_in_progress == 0) && + ((tp->t_flags & TF_FASTRECOVERY) == 0)) { unlocked = TRUE; socket_unlock(so, 0); } @@ -1828,7 +1920,6 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, */ cnt = 0; } - error = ip_output_list(pkt, cnt, opt, &ro, flags, 0, &ipoa); if (chain || error) { /* @@ -1846,6 +1937,10 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, if (unlocked) socket_lock(so, 0); + if (ro.ro_rt != NULL && + (outif = ro.ro_rt->rt_ifp->if_index) != inp->inp_last_outif) + inp->inp_last_outif = outif; + /* Synchronize cached PCB route */ inp_route_copyin(inp, &ro); @@ -1858,14 +1953,27 @@ tcp_setpersist(tp) { int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; - if (tp->t_timer[TCPT_REXMT]) - panic("tcp_setpersist: retransmit pending"); + /* If a PERSIST_TIMER option was set we will limit the + * time the persist timer will be active for that connection + * in order to avoid DOS by using zero window probes. + * see rdar://5805356 + */ + + if ((tp->t_persist_timeout != 0) && + (tp->t_timer[TCPT_PERSIST] == 0) && + (tp->t_persist_stop == 0)) { + tp->t_persist_stop = tcp_now + tp->t_persist_timeout; + } + /* * Start/restart persistance timer. */ TCPT_RANGESET(tp->t_timer[TCPT_PERSIST], t * tcp_backoff[tp->t_rxtshift], - TCPTV_PERSMIN, TCPTV_PERSMAX); + TCPTV_PERSMIN, TCPTV_PERSMAX, + TCP_ADD_REXMTSLOP(tp)); + tp->t_timer[TCPT_PERSIST] = OFFSET_FROM_START(tp, tp->t_timer[TCPT_PERSIST]); + if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; } diff --git a/bsd/netinet/tcp_sack.c b/bsd/netinet/tcp_sack.c index 5842ad2b8..69fb8a7d0 100644 --- a/bsd/netinet/tcp_sack.c +++ b/bsd/netinet/tcp_sack.c @@ -103,20 +103,20 @@ #endif /*IPSEC*/ int tcp_do_sack = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW, &tcp_do_sack, 0, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_sack, 0, "Enable/Disable TCP SACK support"); static int tcp_sack_maxholes = 128; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_maxholes, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_maxholes, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_sack_maxholes, 0, "Maximum number of TCP SACK holes allowed per connection"); static int tcp_sack_globalmaxholes = 65536; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_globalmaxholes, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_globalmaxholes, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_sack_globalmaxholes, 0, "Global maximum number of TCP SACK holes"); static int tcp_sack_globalholes = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_globalholes, CTLFLAG_RD, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_globalholes, CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_sack_globalholes, 0, "Global number of TCP SACK holes currently allocated"); @@ -203,6 +203,18 @@ tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end) /* Save the number of SACK blocks. */ tp->rcv_numsacks = num_head + num_saved; + + /* If we are requesting SACK recovery, reset the stretch-ack state + * so that connection will generate more acks after recovery and + * sender's cwnd will open. 
+ */ + if ((tp->t_flags & TF_STRETCHACK) != 0 && tp->rcv_numsacks > 0) + tcp_reset_stretch_ack(tp); + +#if TRAFFIC_MGT + if (tp->acc_iaj > 0 && tp->rcv_numsacks > 0) + reset_acc_iaj(tp); +#endif /* TRAFFIC_MGT */ } /* diff --git a/bsd/netinet/tcp_seq.h b/bsd/netinet/tcp_seq.h index 89a16ef79..df7bfa4e9 100644 --- a/bsd/netinet/tcp_seq.h +++ b/bsd/netinet/tcp_seq.h @@ -79,6 +79,8 @@ /* for modulo comparisons of timestamps */ #define TSTMP_LT(a,b) ((int)((a)-(b)) < 0) +#define TSTMP_GT(a,b) ((int)((a)-(b)) > 0) +#define TSTMP_LEQ(a,b) ((int)((a)-(b)) <= 0) #define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0) /* diff --git a/bsd/netinet/tcp_subr.c b/bsd/netinet/tcp_subr.c index f1b220bc2..8cf658482 100644 --- a/bsd/netinet/tcp_subr.c +++ b/bsd/netinet/tcp_subr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -82,6 +82,7 @@ #include <sys/protosw.h> #include <sys/random.h> #include <sys/syslog.h> +#include <sys/mcache.h> #include <kern/locks.h> #include <kern/zalloc.h> @@ -112,6 +113,9 @@ #include <netinet/tcp_seq.h> #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> +#include <netinet/tcp_cc.h> +#include <kern/thread_call.h> + #if INET6 #include <netinet6/tcp6_var.h> #endif @@ -136,6 +140,7 @@ #include <libkern/crypto/md5.h> #include <sys/kdebug.h> +#include <mach/sdt.h> #define DBG_FNC_TCP_CLOSE NETDBG_CODE(DBG_NETTCP, ((5 << 8) | 2)) @@ -147,13 +152,13 @@ extern int ipsec_bypass; #endif int tcp_mssdflt = TCP_MSS; -SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_mssdflt , 0, "Default TCP Maximum Segment Size"); #if INET6 int tcp_v6mssdflt = TCP6_MSS; SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, - CTLFLAG_RW, &tcp_v6mssdflt , 0, + CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_v6mssdflt , 0, "Default TCP Maximum Segment Size for IPv6"); #endif @@ -166,7 +171,7 @@ SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, * checking. This setting prevents us from sending too small packets. 
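 * (e.g. without this floor, a peer advertising a tiny MSS would make
 * us emit a packet for every few bytes of payload; the
 * tcp_minmssoverload sysctl below bounds how many sub-minimum
 * segments per second are tolerated.)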
*/ int tcp_minmss = TCP_MINMSS; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_minmss , 0, "Minmum TCP Maximum Segment Size"); /* @@ -182,51 +187,70 @@ __private_extern__ int tcp_minmssoverload = TCP_MINMSSOVERLOAD; #else __private_extern__ int tcp_minmssoverload = 0; #endif -SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmssoverload, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmssoverload, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_minmssoverload , 0, "Number of TCP Segments per Second allowed to" "be under the MINMSS Size"); static int tcp_do_rfc1323 = 1; -SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions"); +// Not used static int tcp_do_rfc1644 = 0; -SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions"); static int do_tcpdrain = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW | CTLFLAG_LOCKED, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); -SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED, &tcbinfo.ipi_count, 0, "Number of active PCBs"); static int icmp_may_rst = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW | CTLFLAG_LOCKED, &icmp_may_rst, 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); static int tcp_strict_rfc1948 = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, strict_rfc1948, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, strict_rfc1948, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_strict_rfc1948, 0, "Determines if RFC1948 is followed exactly"); static int tcp_isn_reseed_interval = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); static int tcp_background_io_enabled = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_io_enabled, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_io_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_background_io_enabled, 0, "Background IO Enabled"); -int tcp_TCPTV_MIN = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, rtt_min, CTLFLAG_RW, +int tcp_TCPTV_MIN = 100; /* 100ms minimum RTT */ +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rtt_min, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_TCPTV_MIN, 0, "min rtt value allowed"); +int tcp_rexmt_slop = TCPTV_REXMTSLOP; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmt_slop, CTLFLAG_RW, + &tcp_rexmt_slop, 0, "Slop added to retransmit timeout"); + __private_extern__ int tcp_use_randomport = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, randomize_ports, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, randomize_ports, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_use_randomport, 0, "Randomize TCP port numbers"); +extern struct tcp_cc_algo tcp_cc_newreno; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno_sockets, CTLFLAG_RD | CTLFLAG_LOCKED, + &tcp_cc_newreno.num_sockets, 0, "Number of sockets using newreno"); + +extern struct tcp_cc_algo tcp_cc_ledbat; 
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_sockets, CTLFLAG_RD | CTLFLAG_LOCKED, + &tcp_cc_ledbat.num_sockets, 0, "Number of sockets using background transport"); + static void tcp_cleartaocache(void); static void tcp_notify(struct inpcb *, int); +static void tcp_cc_init(void); + struct zone *sack_hole_zone; +struct zone *tcp_reass_zone; + +/* The array containing pointers to currently implemented TCP CC algorithms */ +struct tcp_cc_algo* tcp_cc_algo_list[TCP_CC_ALGO_COUNT]; extern unsigned int total_mb_cnt; extern unsigned int total_cl_cnt; @@ -247,7 +271,7 @@ extern int path_mtu_discovery; #endif __private_extern__ int tcp_tcbhashsize = TCBHASHSIZE; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); /* @@ -259,25 +283,25 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD, * to be changed, eventually, for greater efficiency). */ #define ALIGNMENT 32 -#define ALIGNM1 (ALIGNMENT - 1) struct inp_tp { - union { - struct inpcb inp; - char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1]; - } inp_tp_u; - struct tcpcb tcb; + struct inpcb inp; + struct tcpcb tcb __attribute__((aligned(ALIGNMENT))); }; #undef ALIGNMENT -#undef ALIGNM1 extern struct inpcbhead time_wait_slots[]; -extern u_int32_t *delack_bitmask; +extern struct tcptimerlist tcp_timer_list; int get_inpcb_str_size(void); int get_tcp_str_size(void); static void tcpcb_to_otcpcb(struct tcpcb *, struct otcpcb *); +static lck_attr_t *tcp_uptime_mtx_attr = NULL; /* mutex attributes */ +static lck_grp_t *tcp_uptime_mtx_grp = NULL; /* mutex group definition */ +static lck_grp_attr_t *tcp_uptime_mtx_grp_attr = NULL; /* mutex group attributes */ + + int get_inpcb_str_size(void) { return sizeof(struct inpcb); @@ -291,6 +315,17 @@ int get_tcp_str_size(void) int tcp_freeq(struct tcpcb *tp); +/* + * Initialize TCP congestion control algorithms. 
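+ *
+ * Each tcpcb carries an index (tp->tcp_cc_index) into this table and
+ * the CC_ALGO() macro resolves callbacks through it. As a rough,
+ * illustrative sketch (not part of this change), moving a connection
+ * onto the background algorithm would look like:
+ *
+ *	old = tp->tcp_cc_index;
+ *	if (CC_ALGO(tp)->cleanup != NULL)
+ *		CC_ALGO(tp)->cleanup(tp);
+ *	tp->tcp_cc_index = TCP_CC_ALGO_BACKGROUND_INDEX;
+ *	if (CC_ALGO(tp)->switch_to != NULL)
+ *		CC_ALGO(tp)->switch_to(tp, old);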
+ */ + +void +tcp_cc_init(void) +{ + bzero(&tcp_cc_algo_list, sizeof(tcp_cc_algo_list)); + tcp_cc_algo_list[TCP_CC_ALGO_NEWRENO_INDEX] = &tcp_cc_newreno; + tcp_cc_algo_list[TCP_CC_ALGO_BACKGROUND_INDEX] = &tcp_cc_ledbat; +} /* * Tcp initialization @@ -310,9 +345,10 @@ tcp_init() tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_msl = TCPTV_MSL; - read_random(&tcp_now, sizeof(tcp_now)); - tcp_now = tcp_now & 0x3fffffff; /* Starts tcp internal 100ms clock at a random value */ + microuptime(&tcp_uptime); + read_random(&tcp_now, sizeof(tcp_now)); + tcp_now = tcp_now & 0x3fffffff; /* Starts tcp internal clock at a random value */ LIST_INIT(&tcb); tcbinfo.listhead = &tcb; @@ -325,10 +361,26 @@ tcp_init() tcbinfo.hashbase = hashinit(tcp_tcbhashsize, M_PCB, &tcbinfo.hashmask); tcbinfo.porthashbase = hashinit(tcp_tcbhashsize, M_PCB, &tcbinfo.porthashmask); - str_size = (vm_size_t) sizeof(struct inp_tp); + str_size = P2ROUNDUP(sizeof(struct inp_tp), sizeof(u_int64_t)); tcbinfo.ipi_zone = (void *) zinit(str_size, 120000*str_size, 8192, "tcpcb"); + zone_change(tcbinfo.ipi_zone, Z_CALLERACCT, FALSE); + zone_change(tcbinfo.ipi_zone, Z_EXPAND, TRUE); + + str_size = P2ROUNDUP(sizeof(struct sackhole), sizeof(u_int64_t)); sack_hole_zone = zinit(str_size, 120000*str_size, 8192, "sack_hole zone"); + zone_change(sack_hole_zone, Z_CALLERACCT, FALSE); + zone_change(sack_hole_zone, Z_EXPAND, TRUE); + tcp_reass_maxseg = nmbclusters / 16; + str_size = P2ROUNDUP(sizeof(struct tseg_qent), sizeof(u_int64_t)); + tcp_reass_zone = zinit(str_size, (tcp_reass_maxseg + 1) * str_size, + 0, "tcp_reass_zone"); + if (tcp_reass_zone == NULL) { + panic("%s: failed allocating tcp_reass_zone", __func__); + /* NOTREACHED */ + } + zone_change(tcp_reass_zone, Z_CALLERACCT, FALSE); + zone_change(tcp_reass_zone, Z_EXPAND, TRUE); #if INET6 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)) @@ -341,10 +393,10 @@ tcp_init() panic("tcp_init"); #undef TCP_MINPROTOHDR - /* + /* * allocate lock group attribute and group for tcp pcb mutexes */ - pcbinfo->mtx_grp_attr = lck_grp_attr_alloc_init(); + pcbinfo->mtx_grp_attr = lck_grp_attr_alloc_init(); pcbinfo->mtx_grp = lck_grp_alloc_init("tcppcb", pcbinfo->mtx_grp_attr); /* @@ -357,18 +409,37 @@ tcp_init() return; /* pretty much dead if this fails... 
*/ } - delack_bitmask = _MALLOC((4 * tcp_tcbhashsize)/32, M_PCB, M_WAITOK); - if (delack_bitmask == 0) - panic("Delack Memory"); - - for (i=0; i < (tcbinfo.hashsize / 32); i++) - delack_bitmask[i] = 0; - for (i=0; i < N_TIME_WAIT_SLOTS; i++) { LIST_INIT(&time_wait_slots[i]); } - timeout(tcp_fasttimo, NULL, hz/TCP_RETRANSHZ); + bzero(&tcp_timer_list, sizeof(tcp_timer_list)); + LIST_INIT(&tcp_timer_list.lhead); + /* + * allocate lock group attribute, group and attribute for the tcp timer list + */ + tcp_timer_list.mtx_grp_attr = lck_grp_attr_alloc_init(); + tcp_timer_list.mtx_grp = lck_grp_alloc_init("tcptimerlist", tcp_timer_list.mtx_grp_attr); + tcp_timer_list.mtx_attr = lck_attr_alloc_init(); + if ((tcp_timer_list.mtx = lck_mtx_alloc_init(tcp_timer_list.mtx_grp, tcp_timer_list.mtx_attr)) == NULL) { + panic("failed to allocate memory for tcp_timer_list.mtx\n"); + }; + tcp_timer_list.fast_quantum = TCP_FASTTIMER_QUANTUM; + tcp_timer_list.slow_quantum = TCP_SLOWTIMER_QUANTUM; + if ((tcp_timer_list.call = thread_call_allocate(tcp_run_timerlist, NULL)) == NULL) { + panic("failed to allocate call entry 1 in tcp_init\n"); + } + + /* + * allocate lock group attribute, group and attribute for tcp_uptime_lock + */ + tcp_uptime_mtx_grp_attr = lck_grp_attr_alloc_init(); + tcp_uptime_mtx_grp = lck_grp_alloc_init("tcpuptime", tcp_uptime_mtx_grp_attr); + tcp_uptime_mtx_attr = lck_attr_alloc_init(); + tcp_uptime_lock = lck_spin_alloc_init(tcp_uptime_mtx_grp, tcp_uptime_mtx_attr); + + /* Initialize TCP congestion control algorithms list */ + tcp_cc_init(); } /* @@ -398,7 +469,9 @@ tcp_fillheaders(tp, ip_ptr, tcp_ptr) ip6->ip6_plen = sizeof(struct tcphdr); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; - tcp_hdr->th_sum = 0; + tcp_hdr->th_sum = in6_cksum_phdr(&inp->in6p_laddr, + &inp->in6p_faddr, htonl(sizeof(struct tcphdr)), + htonl(IPPROTO_TCP)); } else #endif { @@ -474,7 +547,8 @@ tcp_respond( tcp_seq ack, tcp_seq seq, int flags, - unsigned int ifscope + unsigned int ifscope, + unsigned int nocell ) { register int tlen; @@ -489,6 +563,7 @@ tcp_respond( struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ + unsigned int outif; #if INET6 isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6; @@ -614,9 +689,11 @@ tcp_respond( #if INET6 if (isipv6) { nth->th_sum = 0; - nth->th_sum = in6_cksum(m, IPPROTO_TCP, - sizeof(struct ip6_hdr), - tlen - sizeof(struct ip6_hdr)); + nth->th_sum = in6_cksum_phdr(&ip6->ip6_src, + &ip6->ip6_dst, htons((u_short)(tlen - sizeof(struct ip6_hdr))), + htonl(IPPROTO_TCP)); + m->m_pkthdr.csum_flags = CSUM_TCPIPV6; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, ro6 && ro6->ro_rt ? 
ro6->ro_rt->rt_ifp : @@ -639,21 +716,29 @@ tcp_respond( return; } #endif -#if PKT_PRIORITY - if (tp != NULL) - set_traffic_class(m, tp->t_inpcb->inp_socket, MBUF_TC_NONE); -#endif /* PKT_PRIORITY */ + + if (tp != NULL) + set_packet_tclass(m, tp->t_inpcb->inp_socket, MBUF_TC_UNSPEC, isipv6); + #if INET6 if (isipv6) { - (void)ip6_output(m, NULL, ro6, 0, NULL, NULL, 0); - if (ro6 == &sro6 && ro6->ro_rt) { - rtfree(ro6->ro_rt); - ro6->ro_rt = NULL; + struct ip6_out_args ip6oa = { ifscope, nocell }; + + (void) ip6_output(m, NULL, ro6, IPV6_OUTARGS, NULL, + NULL, &ip6oa); + if (ro6->ro_rt != NULL) { + if (ro6 == &sro6) { + rtfree(ro6->ro_rt); + ro6->ro_rt = NULL; + } else if ((outif = ro6->ro_rt->rt_ifp->if_index) != + tp->t_inpcb->in6p_last_outif) { + tp->t_inpcb->in6p_last_outif = outif; + } } } else #endif /* INET6 */ { - struct ip_out_args ipoa = { ifscope }; + struct ip_out_args ipoa = { ifscope, nocell }; if (ro != &sro) { /* Copy the cached route and take an extra reference */ @@ -665,6 +750,10 @@ tcp_respond( (void) ip_output(m, NULL, &sro, IP_OUTARGS, NULL, &ipoa); if (ro != &sro) { + if (sro.ro_rt != NULL && + (outif = sro.ro_rt->rt_ifp->if_index) != + tp->t_inpcb->inp_last_outif) + tp->t_inpcb->inp_last_outif = outif; /* Synchronize cached PCB route */ inp_route_copyin(tp->t_inpcb, &sro); } else if (sro.ro_rt != NULL) { @@ -690,13 +779,15 @@ tcp_newtcpcb(inp) int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ + calculate_tcp_clock(); + if (so->cached_in_sock_layer == 0) { it = (struct inp_tp *)inp; tp = &it->tcb; } else tp = (struct tcpcb *) inp->inp_saved_ppcb; - + bzero((char *) tp, sizeof(struct tcpcb)); LIST_INIT(&tp->t_segq); tp->t_maxseg = tp->t_maxopd = @@ -719,12 +810,25 @@ tcp_newtcpcb(inp) tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = tcp_TCPTV_MIN; tp->t_rxtcur = TCPTV_RTOBASE; + + /* Initialize congestion control algorithm for this connection + * to newreno by default + */ + tp->tcp_cc_index = TCP_CC_ALGO_NEWRENO_INDEX; + if (CC_ALGO(tp)->init != NULL) { + CC_ALGO(tp)->init(tp); + } + tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh_prev = TCP_MAXWIN << TCP_MAX_WINSHIFT; - tp->t_rcvtime = 0; + tp->t_rcvtime = tcp_now; tp->t_bw_rtttime = 0; + tp->tentry.timer_start = tcp_now; + tp->t_persist_timeout = tcp_max_persist_timeout; + tp->t_persist_stop = 0; + tp->t_flagsext |= TF_RCVUNACK_WAITSS; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, @@ -746,8 +850,13 @@ tcp_drop(tp, errno) int errno; { struct socket *so = tp->t_inpcb->inp_socket; - +#if CONFIG_DTRACE + struct inpcb *inp = tp->t_inpcb; +#endif /* CONFIG_DTRACE */ + if (TCPS_HAVERCVDSYN(tp->t_state)) { + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_CLOSED); tp->t_state = TCPS_CLOSED; (void) tcp_output(tp); tcpstat.tcps_drops++; @@ -778,26 +887,10 @@ tcp_close(tp) int dosavessthresh; if ( inp->inp_ppcb == NULL) /* tcp_close was called previously, bail */ - return NULL; - - /* Clear the timers before we delete the PCB. 
*/ - { - int i; - for (i = 0; i < TCPT_NTIMERS; i++) { - tp->t_timer[i] = 0; - } - } + return(NULL); + tcp_canceltimers(tp); KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_START, tp,0,0,0,0); - switch (tp->t_state) - { - case TCPS_ESTABLISHED: - case TCPS_FIN_WAIT_1: - case TCPS_CLOSING: - case TCPS_CLOSE_WAIT: - case TCPS_LAST_ACK: - break; - } /* * If another thread for this tcp is currently in ip (indicated by @@ -816,6 +909,10 @@ tcp_close(tp) return (NULL); } + if (CC_ALGO(tp)->cleanup != NULL) { + CC_ALGO(tp)->cleanup(tp); + } + #if INET6 rt = isipv6 ? inp->in6p_route.ro_rt : inp->inp_route.ro_rt; #else @@ -853,8 +950,11 @@ tcp_close(tp) if (rt == NULL || !(rt->rt_flags & RTF_UP) || ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr == INADDR_ANY || rt->generation_id != route_generation) { - if (tp->t_state >= TCPS_CLOSE_WAIT) + if (tp->t_state >= TCPS_CLOSE_WAIT) { + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_CLOSING); tp->t_state = TCPS_CLOSING; + } goto no_valid_rt; } @@ -961,17 +1061,23 @@ no_valid_rt: if (so->cached_in_sock_layer) inp->inp_saved_ppcb = (caddr_t) tp; #endif + /* Issue a wakeup before detach so that we don't miss + * a wakeup + */ + sodisconnectwakeup(so); - soisdisconnected(so); #if INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) in6_pcbdetach(inp); else #endif /* INET6 */ in_pcbdetach(inp); + + /* Call soisdisconnected after detach because it might unlock the socket */ + soisdisconnected(so); tcpstat.tcps_closed++; KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_END, tcpstat.tcps_closed,0,0,0,0); - return ((struct tcpcb *)0); + return(NULL); } int @@ -985,7 +1091,7 @@ tcp_freeq(tp) while((q = LIST_FIRST(&tp->t_segq)) != NULL) { LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); - FREE(q, M_TSEGQ); + zfree(tcp_reass_zone, q); tcp_reass_qsize--; rv = 1; } @@ -1019,7 +1125,7 @@ tcp_drain() != NULL) { LIST_REMOVE(te, tqe_q); m_freem(te->tqe_m); - FREE(te, M_TSEGQ); + zfree(tcp_reass_zone, te); tcp_reass_qsize--; } } @@ -1083,7 +1189,7 @@ tcpcb_to_otcpcb(struct tcpcb *tp, struct otcpcb *otp) otp->t_segq = (u_int32_t)(uintptr_t)tp->t_segq.lh_first; otp->t_dupacks = tp->t_dupacks; - for (i = 0; i < TCPT_NTIMERS; i++) + for (i = 0; i < TCPT_NTIMERS_EXT; i++) otp->t_timer[i] = tp->t_timer[i]; otp->t_inpcb = (_TCPCB_PTR(struct inpcb *))(uintptr_t)tp->t_inpcb; otp->t_state = tp->t_state; @@ -1258,7 +1364,7 @@ tcp_pcblist SYSCTL_HANDLER_ARGS return error; } -SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); #if !CONFIG_EMBEDDED @@ -1270,7 +1376,7 @@ tcpcb_to_xtcpcb64(struct tcpcb *tp, struct xtcpcb64 *otp) otp->t_segq = (u_int32_t)(uintptr_t)tp->t_segq.lh_first; otp->t_dupacks = tp->t_dupacks; - for (i = 0; i < TCPT_NTIMERS; i++) + for (i = 0; i < TCPT_NTIMERS_EXT; i++) otp->t_timer[i] = tp->t_timer[i]; otp->t_state = tp->t_state; otp->t_flags = tp->t_flags; @@ -1406,44 +1512,60 @@ tcp_pcblist64 SYSCTL_HANDLER_ARGS for (i = 0; i < n; i++) { inp = inp_list[i]; if (inp->inp_gencnt <= gencnt && inp->inp_state != INPCB_STATE_DEAD) { - struct xtcpcb64 xt; - - bzero(&xt, sizeof(xt)); - xt.xt_len = sizeof xt; - inpcb_to_xinpcb64(inp, &xt.xt_inpcb); - xt.xt_inpcb.inp_ppcb = (u_int64_t)(uintptr_t)inp->inp_ppcb; - if (inp->inp_ppcb != NULL) - tcpcb_to_xtcpcb64((struct tcpcb *)inp->inp_ppcb, &xt); - if (inp->inp_socket) - sotoxsocket64(inp->inp_socket, &xt.xt_inpcb.xi_socket); - error = 
SYSCTL_OUT(req, &xt, sizeof xt); + struct xtcpcb64 xt; + + bzero(&xt, sizeof(xt)); + xt.xt_len = sizeof xt; + inpcb_to_xinpcb64(inp, &xt.xt_inpcb); + xt.xt_inpcb.inp_ppcb = (u_int64_t)(uintptr_t)inp->inp_ppcb; + if (inp->inp_ppcb != NULL) + tcpcb_to_xtcpcb64((struct tcpcb *)inp->inp_ppcb, &xt); + if (inp->inp_socket) + sotoxsocket64(inp->inp_socket, &xt.xt_inpcb.xi_socket); + error = SYSCTL_OUT(req, &xt, sizeof xt); } } if (!error) { - /* - * Give the user an updated idea of our state. - * If the generation differs from what we told - * her before, she knows that something happened - * while we were processing this request, and it - * might be necessary to retry. - */ - bzero(&xig, sizeof(xig)); - xig.xig_len = sizeof xig; - xig.xig_gen = tcbinfo.ipi_gencnt; - xig.xig_sogen = so_gencnt; - xig.xig_count = tcbinfo.ipi_count; - error = SYSCTL_OUT(req, &xig, sizeof xig); + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. + */ + bzero(&xig, sizeof(xig)); + xig.xig_len = sizeof xig; + xig.xig_gen = tcbinfo.ipi_gencnt; + xig.xig_sogen = so_gencnt; + xig.xig_count = tcbinfo.ipi_count; + error = SYSCTL_OUT(req, &xig, sizeof xig); } FREE(inp_list, M_TEMP); lck_rw_done(tcbinfo.mtx); return error; } -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist64, CTLFLAG_RD, 0, 0, +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist64, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, tcp_pcblist64, "S,xtcpcb64", "List of active TCP connections"); #endif /* !CONFIG_EMBEDDED */ +static int +tcp_pcblist_n SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int error = 0; + + error = get_pcblist_n(IPPROTO_TCP, req, &tcbinfo); + + return error; +} + + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist_n, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, + tcp_pcblist_n, "S,xtcpcb_n", "List of active TCP connections"); + + void tcp_ctlinput(cmd, sa, vip) int cmd; @@ -1618,10 +1740,10 @@ tcp6_ctlinput(cmd, sa, d) in6_pcbnotify(&tcbinfo, sa, th.th_dport, (struct sockaddr *)ip6cp->ip6c_src, - th.th_sport, cmd, notify); + th.th_sport, cmd, NULL, notify); } else { in6_pcbnotify(&tcbinfo, sa, 0, - (struct sockaddr *)(size_t)sa6_src, 0, cmd, notify); + (struct sockaddr *)(size_t)sa6_src, 0, cmd, NULL, notify); } } #endif /* INET6 */ @@ -1773,7 +1895,7 @@ tcp_mtudisc( if (tp) { #if INET6 if (isipv6) - rt = tcp_rtlookup6(inp); + rt = tcp_rtlookup6(inp, IFSCOPE_NONE); else #endif /* INET6 */ rt = tcp_rtlookup(inp, IFSCOPE_NONE); @@ -1837,6 +1959,11 @@ tcp_mtudisc( tp->t_maxseg = mss; + + /* + * Reset the slow-start flight size as it may depend on the new MSS + */ + if (CC_ALGO(tp)->cwnd_init != NULL) + CC_ALGO(tp)->cwnd_init(tp); tcpstat.tcps_mturesent++; tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; @@ -1889,7 +2016,7 @@ tcp_rtlookup(inp, input_ifscope) if (rt != NULL) RT_UNLOCK(rt); - rtalloc_scoped_ign(ro, 0, ifscope); + rtalloc_scoped(ro, ifscope); if ((rt = ro->ro_rt) != NULL) RT_LOCK(rt); } @@ -1934,8 +2061,9 @@ tcp_rtlookup(inp, input_ifscope) #if INET6 struct rtentry * -tcp_rtlookup6(inp) +tcp_rtlookup6(inp, input_ifscope) struct inpcb *inp; + unsigned int input_ifscope; { struct route_in6 *ro6; struct rtentry *rt; @@ -1952,14 +2080,26 @@ tcp_rtlookup6(inp) /* No route yet, so try to acquire one */ if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { struct sockaddr_in6 *dst6; + unsigned int ifscope; dst6 = (struct sockaddr_in6 *)&ro6->ro_dst; dst6->sin6_family = AF_INET6; 
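A few hunks back, tcp_freeq() and tcp_drain() switched reassembly entries from FREE(q, M_TSEGQ) to zfree(tcp_reass_zone, q). Below is a kernel-style sketch of that dedicated-zone pattern; the signatures follow osfmk/kern/zalloc.h of this era, while the zone name, sizing, and the stand-in element type are illustrative assumptions.

/* Sketch of a fixed-size object zone, as now used for TCP reassembly
 * entries.  Elements come from (and return to) one dedicated zone
 * instead of the general malloc pool. */
#include <kern/zalloc.h>

struct reass_ent_sketch { int dummy; };     /* stand-in for struct tseg_qent */

static zone_t reass_zone_sketch;

static void
reass_zone_setup_sketch(void)
{
    reass_zone_sketch = zinit(sizeof (struct reass_ent_sketch),
        1024 * sizeof (struct reass_ent_sketch),    /* max backing memory (assumed) */
        sizeof (struct reass_ent_sketch),           /* allocation increment (assumed) */
        "sketch.tcpreass");
}

static struct reass_ent_sketch *
reass_ent_alloc_sketch(void)
{
    return (struct reass_ent_sketch *)zalloc(reass_zone_sketch);
}

static void
reass_ent_free_sketch(struct reass_ent_sketch *q)
{
    zfree(reass_zone_sketch, q);    /* element goes back to its own zone */
}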
dst6->sin6_len = sizeof(*dst6); dst6->sin6_addr = inp->in6p_faddr; + + /* + * If the socket was bound to an interface, then + * the bound-to-interface takes precedence over + * the inbound interface passed in by the caller + * (if we get here as part of the output path then + * input_ifscope is IFSCOPE_NONE). + */ + ifscope = (inp->inp_flags & INP_BOUND_IF) ? + inp->inp_boundif : input_ifscope; + if (rt != NULL) RT_UNLOCK(rt); - rtalloc_ign((struct route *)ro6, 0); + rtalloc_scoped((struct route *)ro6, ifscope); if ((rt = ro6->ro_rt) != NULL) RT_LOCK(rt); } @@ -2068,7 +2208,7 @@ tcp_gettaocache(inp) #if INET6 if ((inp->inp_vflag & INP_IPV6) != 0) - rt = tcp_rtlookup6(inp); + rt = tcp_rtlookup6(inp, IFSCOPE_NONE); else #endif /* INET6 */ rt = tcp_rtlookup(inp, IFSCOPE_NONE); @@ -2112,7 +2252,7 @@ tcp_lock(struct socket *so, int refcount, void *lr) lr_saved = lr; if (so->so_pcb != NULL) { - lck_mtx_lock(((struct inpcb *)so->so_pcb)->inpcb_mtx); + lck_mtx_lock(&((struct inpcb *)so->so_pcb)->inpcb_mtx); } else { panic("tcp_lock: so=%p NO PCB! lr=%p lrh= %s\n", so, lr_saved, solockhistory_nr(so)); @@ -2143,7 +2283,7 @@ tcp_unlock(struct socket *so, int refcount, void *lr) #ifdef MORE_TCPLOCK_DEBUG printf("tcp_unlock: so=%p sopcb=%p lock=%p ref=%x lr=%p\n", - so, so->so_pcb, ((struct inpcb *)so->so_pcb)->inpcb_mtx, + so, so->so_pcb, &((struct inpcb *)so->so_pcb)->inpcb_mtx, so->so_usecount, lr_saved); #endif if (refcount) @@ -2159,11 +2299,11 @@ tcp_unlock(struct socket *so, int refcount, void *lr) so, so->so_usecount, lr_saved, solockhistory_nr(so)); /* NOTREACHED */ } else { - lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, + lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); so->unlock_lr[so->next_unlock_lr] = lr_saved; so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX; - lck_mtx_unlock(((struct inpcb *)so->so_pcb)->inpcb_mtx); + lck_mtx_unlock(&((struct inpcb *)so->so_pcb)->inpcb_mtx); } return (0); } @@ -2179,7 +2319,7 @@ tcp_getlock( if (so->so_usecount < 0) panic("tcp_getlock: so=%p usecount=%x lrh= %s\n", so, so->so_usecount, solockhistory_nr(so)); - return(inp->inpcb_mtx); + return(&inp->inpcb_mtx); } else { panic("tcp_getlock: so=%p NULL so_pcb %s\n", @@ -2199,17 +2339,7 @@ tcp_sbspace(struct tcpcb *tp) if (space < 0) space = 0; -#if TRAFFIC_MGT - if (tp->t_inpcb->inp_socket->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BG_REGULATE) { - if (tcp_background_io_enabled && - tp->t_inpcb->inp_socket->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BG_SUPPRESSED) { - tp->t_flags |= TF_RXWIN0SENT; - return 0; /* Triggers TCP window closing by responding there is no space */ - } - } -#endif /* TRAFFIC_MGT */ - - /* Avoid inscreasing window size if the current window + /* Avoid increasing window size if the current window * is already very low, we could be in "persist" mode and * we could break some apps (see rdar://5409343) */ @@ -2252,11 +2382,6 @@ tcp_set_tso(tp, ifp) int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; if (isipv6) { - /* - * Radar 6921834: Disable TSO IPv6 because there is no support - * for TSO & HW checksum in ip6_output yet - */ -#if 0 if (ifp && ifp->if_hwassist & IFNET_TSO_IPV6) { tp->t_flags |= TF_TSO; if (ifp->if_tso_v6_mtu != 0) @@ -2266,7 +2391,6 @@ tcp_set_tso(tp, ifp) } else tp->t_flags &= ~TF_TSO; -#endif } else #endif /* INET6 */ @@ -2281,4 +2405,50 @@ tcp_set_tso(tp, ifp) tp->t_flags &= ~TF_TSO; } } + +#define TIMEVAL_TO_TCPHZ(_tv_) ((_tv_).tv_sec * TCP_RETRANSHZ + (_tv_).tv_usec / TCP_RETRANSHZ_TO_USEC) + +/* Function to calculate the 
tcp clock. The tcp clock will get updated + * at the boundaries of the tcp layer. This is done at 3 places: + * 1. Right before processing an input tcp packet + * 2. Whenever a connection wants to access the network using tcp_usrreqs + * 3. When a tcp timer fires or before tcp slow timeout + * + */ + +void +calculate_tcp_clock() +{ + struct timeval tv = tcp_uptime; + struct timeval interval = {0, TCP_RETRANSHZ_TO_USEC}; + struct timeval now, hold_now; + uint32_t incr = 0; + + timevaladd(&tv, &interval); + microuptime(&now); + if (timevalcmp(&now, &tv, >)) { + /* time to update the clock */ + lck_spin_lock(tcp_uptime_lock); + if (timevalcmp(&tcp_uptime, &now, >=)) { + /* clock got updated while we were waiting for the lock */ + lck_spin_unlock(tcp_uptime_lock); + return; + } + + microuptime(&now); + hold_now = now; + tv = tcp_uptime; + timevalsub(&now, &tv); + + incr = TIMEVAL_TO_TCPHZ(now); + if (incr > 0) { + tcp_uptime = hold_now; + tcp_now += incr; + } + + lck_spin_unlock(tcp_uptime_lock); + } + return; +} + /* DSEP Review Done pl-20051213-v02 @3253,@3391,@3400 */ diff --git a/bsd/netinet/tcp_timer.c b/bsd/netinet/tcp_timer.c index fd66419d0..706ec823c 100644 --- a/bsd/netinet/tcp_timer.c +++ b/bsd/netinet/tcp_timer.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -71,6 +71,8 @@ #include <sys/socketvar.h> #include <sys/protosw.h> #include <sys/domain.h> +#include <sys/mcache.h> +#include <sys/queue.h> #include <kern/locks.h> #include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */ @@ -89,6 +91,7 @@ #include <netinet/tcp_seq.h> #include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> +#include <netinet/tcp_cc.h> #if INET6 #include <netinet6/tcp6_var.h> #endif @@ -97,23 +100,31 @@ #include <netinet/tcp_debug.h> #endif #include <sys/kdebug.h> +#include <mach/sdt.h> extern void postevent(struct socket *, struct sockbuf *, int); #define DBG_FNC_TCP_FAST NETDBG_CODE(DBG_NETTCP, (5 << 8)) #define DBG_FNC_TCP_SLOW NETDBG_CODE(DBG_NETTCP, (5 << 8) | 1) +#define TIMERENTRY_TO_TP(te) ((struct tcpcb *)((uintptr_t)te - offsetof(struct tcpcb, tentry.le.le_next))) + +#define VERIFY_NEXT_LINK(elm,field) do { \ + if (LIST_NEXT((elm),field) != NULL && \ + LIST_NEXT((elm),field)->field.le_prev != \ + &((elm)->field.le_next)) \ + panic("Bad link elm %p next->prev != elm", (elm)); \ +} while(0) + +#define VERIFY_PREV_LINK(elm,field) do { \ + if (*(elm)->field.le_prev != (elm)) \ + panic("Bad link elm %p prev->next != elm", (elm)); \ +} while(0) + static int background_io_trigger = 5; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_io_trigger, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_io_trigger, CTLFLAG_RW | CTLFLAG_LOCKED, &background_io_trigger, 0, "Background IO Trigger Setting"); -/* - * NOTE - WARNING - * - * - * - * - */ static int sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS { @@ -136,25 +147,42 @@ sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS } int tcp_keepinit; -SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", ""); int tcp_keepidle; -SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", ""); int 
tcp_keepintvl; -SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", ""); int tcp_msl; -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); +/* + * Avoid DoS via TCP Robustness in Persist Condition (see http://www.ietf.org/id/draft-ananth-tcpm-persist-02.txt) + * by allowing a system wide maximum persistence timeout value when in Zero Window Probe mode. + * Expressed in milliseconds to be consistent with other timeout related values, the TCP socket option is in seconds. + */ +u_int32_t tcp_max_persist_timeout = 0; +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, max_persist_timeout, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_max_persist_timeout, 0, sysctl_msec_to_ticks, "I", "Maximum persistence timeout for ZWP"); + static int always_keepalive = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW | CTLFLAG_LOCKED, &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); +/* This parameter determines how long the timer list will stay in fast mode even + * though all connections are idle. In fast mode, the timer will fire more frequently + * anticipating new data. + */ +int timer_fastmode_idlemax = TCP_FASTMODE_IDLEGEN_MAX; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_fastmode_idlemax, CTLFLAG_RW | CTLFLAG_LOCKED, + &timer_fastmode_idlemax, 0, "Maximum idle generations in fast mode"); + /* * See tcp_syn_backoff[] for interval values between SYN retransmits; * the value set below defines the number of retransmits, before we @@ -163,16 +191,25 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, * two options. */ static int tcp_broken_peer_syn_rxmit_thres = 7; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rxmit_thres, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, broken_peer_syn_rxmit_thres, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_broken_peer_syn_rxmit_thres, 0, "Number of retransmitted SYNs before " "TCP disables rfc1323 and rfc1644 during the rest of attempts"); +static int tcp_timer_advanced = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_timer_advanced, CTLFLAG_RD | CTLFLAG_LOCKED, + &tcp_timer_advanced, 0, "Number of times one of the timers was advanced"); + +static int tcp_resched_timerlist = 0; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_resched_timerlist, CTLFLAG_RD | CTLFLAG_LOCKED, + &tcp_resched_timerlist, 0, + "Number of times timer list was rescheduled as part of processing a packet"); + int tcp_pmtud_black_hole_detect = 1 ; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_pmtud_black_hole_detect, 0, "Path MTU Discovery Black Hole Detection"); int tcp_pmtud_black_hole_mss = 1200 ; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_pmtud_black_hole_mss, 0, "Path MTU Discovery Black Hole Detection lowered MSS"); static int tcp_keepcnt = TCPTV_KEEPCNT; @@ -182,28 +219,68 @@ int tcp_maxpersistidle; /* max idle time in persist */ int tcp_maxidle; +/* TCP delack timer is set to 100 ms. 
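The comment continuing below sets the delayed-ack timer to TCP_RETRANSHZ / 10 and observes that it fires somewhere between 100 and 200 ms. Assuming TCP_RETRANSHZ == 1000, so that the tcp clock advances in milliseconds (consistent with TIMEVAL_TO_TCPHZ earlier in this patch), a small userland check of that firing window:

/* Worked example of the delayed-ack firing window described below.
 * TCP_RETRANSHZ == 1000 is an assumption for illustration. */
#include <stdio.h>

#define TCP_RETRANSHZ_SKETCH 1000
#define DELACK_TICKS (TCP_RETRANSHZ_SKETCH / 10)    /* 100 ms */

int main(void)
{
    unsigned fast_quantum = TCP_RETRANSHZ_SKETCH / 10;  /* fast-mode list period, 100 ms */
    /* The timer list only runs on quantum boundaries, so a delack armed
     * just after a run waits up to one extra quantum on top of itself. */
    unsigned best  = DELACK_TICKS;                  /* armed right before a list run */
    unsigned worst = DELACK_TICKS + fast_quantum;   /* armed right after a list run */
    printf("delayed ack fires between %u and %u ms\n", best, worst);
    return 0;
}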
Since the processing of timer list in fast + * mode will happen no faster than 100 ms, the delayed ack timer will fire somewhere + * between 100 and 200 ms. + */ +int tcp_delack = TCP_RETRANSHZ / 10; + struct inpcbhead time_wait_slots[N_TIME_WAIT_SLOTS]; int cur_tw_slot = 0; -u_int32_t *delack_bitmask; +/* tcp timer list */ +struct tcptimerlist tcp_timer_list; -void add_to_time_wait_locked(struct tcpcb *tp); -void add_to_time_wait(struct tcpcb *tp) ; +/* The frequency of running through the TCP timer list in + * fast and slow mode can be configured. + */ +SYSCTL_UINT(_net_inet_tcp, OID_AUTO, timer_fastquantum, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_timer_list.fast_quantum, TCP_FASTTIMER_QUANTUM, + "Frequency of running timer list in fast mode"); + +SYSCTL_UINT(_net_inet_tcp, OID_AUTO, timer_slowquantum, CTLFLAG_RW | CTLFLAG_LOCKED, + &tcp_timer_list.slow_quantum, TCP_SLOWTIMER_QUANTUM, + "Frequency of running timer list in slow mode"); + +static void tcp_remove_timer(struct tcpcb *tp); +static void tcp_sched_timerlist(uint32_t offset); +static uint32_t tcp_run_conn_timer(struct tcpcb *tp, uint16_t *next_index); +static void tcp_sched_timers(struct tcpcb *tp); +static inline void tcp_set_lotimer_index(struct tcpcb *); + +/* Macro to compare two timers. If there is a reset of the sign bit, it is + * safe to assume that the timer has wrapped around. By doing signed comparison, + * we take care of wrap around such that the value with the sign bit reset is + * actually ahead of the other. + */ + +static inline int32_t +timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2) { + return (int32_t)((t1 + toff1) - (t2 + toff2)); +}; + +/* Returns true if the timer is on the timer list */ +#define TIMER_IS_ON_LIST(tp) ((tp)->t_flags & TF_TIMER_ONLIST) + + +void add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay); +void add_to_time_wait(struct tcpcb *tp, uint32_t delay) ; static void tcp_garbage_collect(struct inpcb *, int); -void add_to_time_wait_locked(struct tcpcb *tp) +void add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay) { int tw_slot; - struct inpcbinfo *pcbinfo = &tcbinfo; + struct inpcbinfo *pcbinfo = &tcbinfo; + uint32_t timer; /* pcb list should be locked when we get here */ lck_rw_assert(pcbinfo->mtx, LCK_RW_ASSERT_EXCLUSIVE); LIST_REMOVE(tp->t_inpcb, inp_list); - if (tp->t_timer[TCPT_2MSL] <= 0) - tp->t_timer[TCPT_2MSL] = 1; + /* if (tp->t_timer[TCPT_2MSL] <= 0) + tp->t_timer[TCPT_2MSL] = 1; */ /* * Because we're pulling this pcb out of the main TCP pcb list, @@ -211,19 +288,19 @@ void add_to_time_wait_locked(struct tcpcb *tp) * higher timer granularity. 
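timer_diff() above compares deadlines by subtracting in uint32_t and casting the result to int32_t, which stays correct even when the tcp clock wraps past 2^32. A stand-alone demonstration of why the naive unsigned comparison fails and the signed difference does not:

/* Demonstration of the wraparound-safe comparison used by timer_diff(). */
#include <assert.h>
#include <stdint.h>

static int32_t
timer_diff_sketch(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2)
{
    return (int32_t)((t1 + toff1) - (t2 + toff2));
}

int main(void)
{
    uint32_t now = 0xfffffff0u;      /* clock about to wrap */
    uint32_t deadline = now + 0x40;  /* wraps around to 0x30 */

    assert(deadline < now);          /* naive unsigned compare gets it wrong */
    /* signed distance still says the deadline is 0x40 ticks ahead */
    assert(timer_diff_sketch(deadline, 0, now, 0) == 0x40);
    assert(timer_diff_sketch(now, 0, deadline, 0) < 0);
    return 0;
}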
*/ - tp->t_timer[TCPT_2MSL] = (tp->t_timer[TCPT_2MSL] / TCP_RETRANSHZ) * PR_SLOWHZ; + timer = (delay / TCP_RETRANSHZ) * PR_SLOWHZ; tp->t_rcvtime = (tp->t_rcvtime / TCP_RETRANSHZ) * PR_SLOWHZ; - tp->t_rcvtime += tp->t_timer[TCPT_2MSL] & (N_TIME_WAIT_SLOTS - 1); + tp->t_rcvtime += timer & (N_TIME_WAIT_SLOTS - 1); - tw_slot = (tp->t_timer[TCPT_2MSL] & (N_TIME_WAIT_SLOTS - 1)) + cur_tw_slot; + tw_slot = (timer & (N_TIME_WAIT_SLOTS - 1)) + cur_tw_slot; if (tw_slot >= N_TIME_WAIT_SLOTS) tw_slot -= N_TIME_WAIT_SLOTS; LIST_INSERT_HEAD(&time_wait_slots[tw_slot], tp->t_inpcb, inp_list); } -void add_to_time_wait(struct tcpcb *tp) +void add_to_time_wait(struct tcpcb *tp, uint32_t delay) { struct inpcbinfo *pcbinfo = &tcbinfo; @@ -232,97 +309,10 @@ void add_to_time_wait(struct tcpcb *tp) lck_rw_lock_exclusive(pcbinfo->mtx); tcp_lock(tp->t_inpcb->inp_socket, 0, 0); } - add_to_time_wait_locked(tp); + add_to_time_wait_locked(tp, delay); lck_rw_done(pcbinfo->mtx); } - - - -/* - * Fast timeout routine for processing delayed acks - */ -void -tcp_fasttimo(void *arg) -{ -#pragma unused(arg) - struct inpcb *inp; - register struct tcpcb *tp; - struct socket *so; -#if TCPDEBUG - int ostate; -#endif - - - struct inpcbinfo *pcbinfo = &tcbinfo; - - int delack_done = 0; - - KERNEL_DEBUG(DBG_FNC_TCP_FAST | DBG_FUNC_START, 0,0,0,0,0); - - - lck_rw_lock_shared(pcbinfo->mtx); - - /* Walk the list of valid tcpcbs and send ACKS on the ones with DELACK bit set */ - - LIST_FOREACH(inp, &tcb, inp_list) { - - so = inp->inp_socket; - - if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) - continue; - - tcp_lock(so, 1, 0); - - if ((in_pcb_checkstate(inp, WNT_RELEASE,1) == WNT_STOPUSING) && so->so_usecount == 1) { - tcp_unlock(so, 1, 0); - continue; - } - - tp = intotcpcb(inp); - - if (tp == 0 || tp->t_state == TCPS_LISTEN) { - tcp_unlock(so, 1, 0); - continue; - } - - - /* Only run the retransmit timer in that case */ - if (tp->t_timer[0] && --tp->t_timer[0] == 0) { - tp = tcp_timers(tp, 0); - if (tp == NULL) - goto tpgone; - } - - /* TCP pcb timers following the tcp_now clock rate */ - - tp->t_rcvtime++; - tp->t_starttime++; - if (tp->t_rtttime) - tp->t_rtttime++; - - /* - * Process delayed acks (if enabled) according to PR_FASTHZ, not the retrans timer - */ - - if (tcp_delack_enabled && (tcp_now % (TCP_RETRANSHZ/PR_FASTHZ)) && tp->t_flags & TF_DELACK) { - delack_done++; - tp->t_flags &= ~TF_DELACK; - tp->t_flags |= TF_ACKNOW; - tcpstat.tcps_delack++; - tp->t_unacksegs = 0; - (void) tcp_output(tp); - } -tpgone: - tcp_unlock(so, 1, 0); - } - KERNEL_DEBUG(DBG_FNC_TCP_FAST | DBG_FUNC_END, delack_done, 0, tcpstat.tcps_delack,0,0); - lck_rw_done(pcbinfo->mtx); - - tcp_now++; - timeout(tcp_fasttimo, 0, hz/TCP_RETRANSHZ); -} - static void tcp_garbage_collect(struct inpcb *inp, int istimewait) { @@ -339,12 +329,12 @@ tcp_garbage_collect(struct inpcb *inp, int istimewait) * overflow sockets that are eligible for garbage collection have * their usecounts set to 1. 
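add_to_time_wait_locked() above hashes each 2MSL delay into one of N_TIME_WAIT_SLOTS buckets offset from cur_tw_slot, after converting the delay from tcp clock ticks to slow-timer ticks. A sketch of that wheel arithmetic; the slot count (a power of two, so the & mask works), PR_SLOWHZ == 2, and TCP_RETRANSHZ == 1000 are assumptions for illustration.

/* Sketch of the TIME_WAIT wheel arithmetic above. */
#include <stdint.h>
#include <stdio.h>

#define N_TW_SLOTS_SKETCH 128   /* must be a power of two (assumed value) */
#define RETRANSHZ_SKETCH  1000  /* assumed: tcp clock ticks in ms */
#define SLOWHZ_SKETCH     2     /* assumed: slow timeout runs twice a second */

static int cur_tw_slot_sketch;

static int
tw_slot_for(uint32_t delay_ticks)
{
    /* tcp ticks -> slow ticks, then offset from the slot being drained */
    uint32_t timer = (delay_ticks / RETRANSHZ_SKETCH) * SLOWHZ_SKETCH;
    int tw_slot = (int)(timer & (N_TW_SLOTS_SKETCH - 1)) + cur_tw_slot_sketch;
    if (tw_slot >= N_TW_SLOTS_SKETCH)
        tw_slot -= N_TW_SLOTS_SKETCH;
    return tw_slot;
}

int main(void)
{
    cur_tw_slot_sketch = 100;
    /* a 30 s 2*MSL delay lands 60 slow ticks past the current slot,
     * wrapping around the wheel: (60 + 100) - 128 == 32 */
    printf("slot=%d\n", tw_slot_for(30 * RETRANSHZ_SKETCH));
    return 0;
}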
*/ - if (so->so_usecount > 1 || !lck_mtx_try_lock_spin(inp->inpcb_mtx)) + if (so->so_usecount > 1 || !lck_mtx_try_lock_spin(&inp->inpcb_mtx)) return; /* Check again under the lock */ if (so->so_usecount > 1) { - lck_mtx_unlock(inp->inpcb_mtx); + lck_mtx_unlock(&inp->inpcb_mtx); return; } @@ -365,7 +355,7 @@ tcp_garbage_collect(struct inpcb *inp, int istimewait) if (inp->inp_state != INPCB_STATE_DEAD) { /* Become a regular mutex */ - lck_mtx_convert_spin(inp->inpcb_mtx); + lck_mtx_convert_spin(&inp->inpcb_mtx); #if INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) in6_pcbdetach(inp); @@ -374,10 +364,10 @@ tcp_garbage_collect(struct inpcb *inp, int istimewait) in_pcbdetach(inp); } so->so_usecount--; - lck_mtx_unlock(inp->inpcb_mtx); + lck_mtx_unlock(&inp->inpcb_mtx); return; } else if (inp->inp_wantcnt != WNT_STOPUSING) { - lck_mtx_unlock(inp->inpcb_mtx); + lck_mtx_unlock(&inp->inpcb_mtx); return; } @@ -392,8 +382,10 @@ tcp_garbage_collect(struct inpcb *inp, int istimewait) * socket is dropped at the end of tcp_input(). */ if (so->so_usecount == 0) { + DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, + struct tcpcb *, tp, int32_t, TCPS_CLOSED); /* Become a regular mutex */ - lck_mtx_convert_spin(inp->inpcb_mtx); + lck_mtx_convert_spin(&inp->inpcb_mtx); if (inp->inp_state != INPCB_STATE_DEAD) { #if INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) @@ -404,20 +396,15 @@ tcp_garbage_collect(struct inpcb *inp, int istimewait) } in_pcbdispose(inp); } else { - lck_mtx_unlock(inp->inpcb_mtx); + lck_mtx_unlock(&inp->inpcb_mtx); } } -static int bg_cnt = 0; -#define BG_COUNTER_MAX 3 - void tcp_slowtimo(void) { struct inpcb *inp, *nxt; struct tcpcb *tp; - struct socket *so; - int i; #if TCPDEBUG int ostate; #endif @@ -432,114 +419,14 @@ tcp_slowtimo(void) tcp_maxidle = tcp_keepcnt * tcp_keepintvl; - lck_rw_lock_shared(pcbinfo->mtx); - - bg_cnt++; - - LIST_FOREACH(inp, &tcb, inp_list) { - - so = inp->inp_socket; + /* Update tcp_now here as it may get used while processing the slow timer */ + calculate_tcp_clock(); - if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) - continue; - - tcp_lock(so, 1, 0); - - if ((in_pcb_checkstate(inp, WNT_RELEASE,1) == WNT_STOPUSING) && so->so_usecount == 1) { - tcp_unlock(so, 1, 0); - continue; - } - tp = intotcpcb(inp); - if (tp == 0 || tp->t_state == TCPS_LISTEN) { - tcp_unlock(so, 1, 0); - continue; - } - - tp = intotcpcb(inp); - - if (tp == 0 || tp->t_state == TCPS_LISTEN) - goto tpgone; - -#if TRAFFIC_MGT - if (so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BG_REGULATE && - bg_cnt > BG_COUNTER_MAX) { - u_int32_t curr_recvtotal = tcpstat.tcps_rcvtotal; - u_int32_t curr_bg_recvtotal = tcpstat.tcps_bg_rcvtotal; - u_int32_t bg_recvdiff = curr_bg_recvtotal - tp->bg_recv_snapshot; - u_int32_t tot_recvdiff = curr_recvtotal - tp->tot_recv_snapshot; - u_int32_t fg_recv_change = tot_recvdiff - bg_recvdiff; - u_int32_t recv_change; - - if (!(so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BG_SUPPRESSED)) { - if (tot_recvdiff) - recv_change = (fg_recv_change * 100) / tot_recvdiff; - else - recv_change = 0; - - if (recv_change > background_io_trigger) { - socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BG_SUPPRESSED); - } - - tp->tot_recv_snapshot = curr_recvtotal; - tp->bg_recv_snapshot = curr_bg_recvtotal; - } - else { // SUPPRESSED - // this allows for bg traffic to subside before we start measuring total traffic change - if (tot_recvdiff) - recv_change = (bg_recvdiff * 100) / tot_recvdiff; - else - recv_change = 0; - - if (recv_change < background_io_trigger) { - // Draconian for 
now: if there is any change at all, keep suppressed - if (!tot_recvdiff) { - socket_clear_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BG_SUPPRESSED); - tp->t_unacksegs = 0; - (void) tcp_output(tp); // open window - } - } - - tp->tot_recv_snapshot = curr_recvtotal; - tp->bg_recv_snapshot = curr_bg_recvtotal; - } - } -#endif /* TRAFFIC_MGT */ - - for (i = 1; i < TCPT_NTIMERS; i++) { - if (tp->t_timer[i] != 0) { - tp->t_timer[i] -= TCP_RETRANSHZ/PR_SLOWHZ; - if (tp->t_timer[i] <= 0) { -#if TCPDEBUG - ostate = tp->t_state; -#endif - - tp->t_timer[i] = 0; /* account for granularity change between tcp_now and slowtimo */ - tp = tcp_timers(tp, i); - if (tp == NULL) - goto tpgone; -#if TCPDEBUG - if (tp->t_inpcb->inp_socket->so_options - & SO_DEBUG) - tcp_trace(TA_USER, ostate, tp, - (void *)0, - (struct tcphdr *)0, - PRU_SLOWTIMO); -#endif - } - } - } -tpgone: - tcp_unlock(so, 1, 0); - } - - if (bg_cnt > 3) - bg_cnt = 0; - - /* Second part of tcp_slowtimo: garbage collect socket/tcpcb - * We need to acquire the list lock exclusively to do this + /* Garbage collect socket/tcpcb: We need to acquire the list lock + * exclusively to do this */ - if (lck_rw_lock_shared_to_exclusive(pcbinfo->mtx) == FALSE) { + if (lck_rw_try_lock_exclusive(pcbinfo->mtx) == FALSE) { if (tcp_gc_done == TRUE) { /* don't sweat it this time. cleanup was done last time */ tcp_gc_done = FALSE; KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked, cur_tw_slot,0,0,0); @@ -617,8 +504,11 @@ tcp_canceltimers(tp) { register int i; + tcp_remove_timer(tp); for (i = 0; i < TCPT_NTIMERS; i++) tp->t_timer[i] = 0; + tp->tentry.timer_start = tcp_now; + tp->tentry.index = TCPT_NONE; } int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = @@ -641,6 +531,7 @@ tcp_timers(tp, timer) struct socket *so_tmp; struct tcptemp *t_template; int optlen = 0; + int idle_time = 0; #if TCPDEBUG int ostate; @@ -651,6 +542,7 @@ tcp_timers(tp, timer) #endif /* INET6 */ so_tmp = tp->t_inpcb->inp_socket; + idle_time = tcp_now - tp->t_rcvtime; switch (timer) { @@ -666,8 +558,8 @@ tcp_timers(tp, timer) tcp_free_sackholes(tp); if (tp->t_state != TCPS_TIME_WAIT && tp->t_state != TCPS_FIN_WAIT_2 && - tp->t_rcvtime < tcp_maxidle) { - tp->t_timer[TCPT_2MSL] = (u_int32_t)tcp_keepintvl; + ((idle_time > 0) && (idle_time < tcp_maxidle))) { + tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, (u_int32_t)tcp_keepintvl); } else { tp = tcp_close(tp); @@ -682,9 +574,26 @@ tcp_timers(tp, timer) */ case TCPT_REXMT: tcp_free_sackholes(tp); - if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { + /* Drop a connection in the retransmit timer + * 1. If we have retransmitted more than TCP_MAXRXTSHIFT times + * 2. If the time spent in this retransmission episode is more than + * the time limit set with TCP_RXT_CONNDROPTIME socket option + * 3. If TCP_RXT_FINDROP socket option was set and we have already + * retransmitted the FIN 3 times without receiving an ack + */ + if (++tp->t_rxtshift > TCP_MAXRXTSHIFT || + (tp->rxt_conndroptime > 0 && tp->rxt_start > 0 && + (tcp_now - tp->rxt_start) >= tp->rxt_conndroptime) || + ((tp->t_flagsext & TF_RXTFINDROP) != 0 && + (tp->t_flags & TF_SENTFIN) != 0 && + tp->t_rxtshift >= 4)) { + + if ((tp->t_flagsext & TF_RXTFINDROP) != 0) { + tcpstat.tcps_rxtfindrop++; + } else { + tcpstat.tcps_timeoutdrop++; + } tp->t_rxtshift = TCP_MAXRXTSHIFT; - tcpstat.tcps_timeoutdrop++; tp = tcp_drop(tp, tp->t_softerror ? 
tp->t_softerror : ETIMEDOUT); postevent(so_tmp, 0, EV_TIMEOUT); @@ -709,6 +618,11 @@ tcp_timers(tp, timer) else tp->t_flags &= ~TF_WASFRECOVERY; tp->t_badrxtwin = tcp_now + (tp->t_srtt >> (TCP_RTT_SHIFT)); + + /* Set the time at which retransmission on this + * connection started + */ + tp->rxt_start = tcp_now; } tcpstat.tcps_rexmttimeo++; if (tp->t_state == TCPS_SYN_SENT) @@ -716,8 +630,9 @@ tcp_timers(tp, timer) else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, - tp->t_rttmin, TCPTV_REXMTMAX); - tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + tp->t_rttmin, TCPTV_REXMTMAX, + TCP_ADD_REXMTSLOP(tp)); + tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); /* * Check for potential Path MTU Discovery Black Hole @@ -745,6 +660,12 @@ tcp_timers(tp, timer) tcp_mssdflt; } tp->t_maxseg = tp->t_maxopd - optlen; + + /* + * Reset the slow-start flight size as it may depend on the new MSS + */ + if (CC_ALGO(tp)->cwnd_init != NULL) + CC_ALGO(tp)->cwnd_init(tp); } /* * If further retransmissions are still unsuccessful with a lowered MTU, @@ -759,6 +680,11 @@ tcp_timers(tp, timer) optlen = tp->t_maxopd - tp->t_maxseg; tp->t_maxopd = tp->t_pmtud_saved_maxopd; tp->t_maxseg = tp->t_maxopd - optlen; + /* + * Reset the slow-start flight size as it may depend on the new MSS + */ + if (CC_ALGO(tp)->cwnd_init != NULL) + CC_ALGO(tp)->cwnd_init(tp); } } } @@ -806,41 +732,17 @@ tcp_timers(tp, timer) * If timing a segment in this window, stop the timer. */ tp->t_rtttime = 0; - /* - * Close the congestion window down to one segment - * (we'll open it by one segment for each ack we get). - * Since we probably have a window's worth of unacked - * data accumulated, this "slow start" keeps us from - * dumping all that data as back-to-back packets (which - * might overwhelm an intermediate gateway). - * - * There are two phases to the opening: Initially we - * open by one mss on each ack. This makes the window - * size increase exponentially with time. If the - * window is larger than the path can handle, this - * exponential growth results in dropped packet(s) - * almost immediately. To get more time between - * drops but still "push" the network to take advantage - * of improving conditions, we switch from exponential - * to linear window opening at some threshhold size. - * For a threshhold, we use half the current window - * size, truncated to a multiple of the mss. - * - * (the minimum cwnd that will give us exponential - * growth is 2 mss. We don't allow the threshhold - * to go below this.) - */ - if (tp->t_state >= TCPS_ESTABLISHED) { - u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; - if (win < 2) - win = 2; - tp->snd_cwnd = tp->t_maxseg; - tp->snd_ssthresh = win * tp->t_maxseg; - tp->t_bytes_acked = 0; - tp->t_dupacks = 0; - tp->t_unacksegs = 0; - } + + if (CC_ALGO(tp)->after_timeout != NULL) + CC_ALGO(tp)->after_timeout(tp); + + tp->t_dupacks = 0; EXIT_FASTRECOVERY(tp); + + DTRACE_TCP5(cc, void, NULL, struct inpcb *, tp->t_inpcb, + struct tcpcb *, tp, struct tcphdr *, NULL, + int32_t, TCP_CC_REXMT_TIMEOUT); + (void) tcp_output(tp); break; @@ -856,10 +758,15 @@ tcp_timers(tp, timer) * backoff, drop the connection if the idle time * (no responses to probes) reaches the maximum * backoff that we would use if retransmitting. + * + * Drop the connection if we reached the maximum allowed time for + * Zero Window Probes without a non-zero update from the peer. 
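The TCPT_REXMT case above now drops a connection for any of three reasons: too many consecutive retransmissions, a retransmission episode exceeding the TCP_RXT_CONNDROPTIME limit, or a FIN already retransmitted three times when TF_RXTFINDROP is set. A condensed sketch of those tests; the flag values and TCP_MAXRXTSHIFT == 12 are assumptions, and the struct is a stand-in for the relevant tcpcb fields.

/* Condensed sketch of the three drop tests in the TCPT_REXMT case. */
#include <stdbool.h>
#include <stdint.h>

#define MAXRXTSHIFT_SKETCH 12   /* assumed value of TCP_MAXRXTSHIFT */
#define F_RXTFINDROP 0x1        /* stand-in for TF_RXTFINDROP */
#define F_SENTFIN    0x2        /* stand-in for TF_SENTFIN */

struct rxt_state_sketch {
    int      t_rxtshift;        /* consecutive retransmissions so far */
    uint32_t rxt_start;         /* tcp_now when this episode began */
    uint32_t rxt_conndroptime;  /* TCP_RXT_CONNDROPTIME limit, 0 = disabled */
    uint32_t flagsext, flags;
};

static bool
should_drop_sketch(struct rxt_state_sketch *s, uint32_t tcp_now)
{
    if (++s->t_rxtshift > MAXRXTSHIFT_SKETCH)
        return true;                                /* out of retries */
    if (s->rxt_conndroptime > 0 && s->rxt_start > 0 &&
        (tcp_now - s->rxt_start) >= s->rxt_conndroptime)
        return true;                                /* episode ran too long */
    if ((s->flagsext & F_RXTFINDROP) && (s->flags & F_SENTFIN) &&
        s->t_rxtshift >= 4)
        return true;                                /* FIN retransmitted 3 times */
    return false;
}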
+ * See rdar://5805356 */ - if (tp->t_rxtshift == TCP_MAXRXTSHIFT && - (tp->t_rcvtime >= tcp_maxpersistidle || - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { + if ((tp->t_rxtshift == TCP_MAXRXTSHIFT && + (idle_time >= tcp_maxpersistidle || + idle_time >= TCP_REXMTVAL(tp) * tcp_totbackoff)) || + ((tp->t_persist_stop != 0) && (tp->t_persist_stop <= tcp_now))) { tcpstat.tcps_persistdrop++; so_tmp = tp->t_inpcb->inp_socket; tp = tcp_drop(tp, ETIMEDOUT); @@ -868,7 +775,6 @@ tcp_timers(tp, timer) } tcp_setpersist(tp); tp->t_force = 1; - tp->t_unacksegs = 0; (void) tcp_output(tp); tp->t_force = 0; break; @@ -884,7 +790,7 @@ tcp_timers(tp, timer) if ((always_keepalive || tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) && (tp->t_state <= TCPS_CLOSING || tp->t_state == TCPS_FIN_WAIT_2)) { - if (tp->t_rcvtime >= TCP_KEEPIDLE(tp) + (u_int32_t)tcp_maxidle) + if (idle_time >= TCP_KEEPIDLE(tp) + (u_int32_t)tcp_maxidle) goto dropit; /* * Send a packet designed to force a response @@ -901,21 +807,45 @@ tcp_timers(tp, timer) tcpstat.tcps_keepprobe++; t_template = tcp_maketemplate(tp); if (t_template) { - unsigned int ifscope; + unsigned int ifscope, nocell = 0; if (tp->t_inpcb->inp_flags & INP_BOUND_IF) ifscope = tp->t_inpcb->inp_boundif; else ifscope = IFSCOPE_NONE; + /* + * If the socket isn't allowed to use the + * cellular interface, indicate it as such. + */ + if (tp->t_inpcb->inp_flags & INP_NO_IFT_CELLULAR) + nocell = 1; + tcp_respond(tp, t_template->tt_ipgen, &t_template->tt_t, (struct mbuf *)NULL, - tp->rcv_nxt, tp->snd_una - 1, 0, ifscope); + tp->rcv_nxt, tp->snd_una - 1, 0, ifscope, + nocell); (void) m_free(dtom(t_template)); } - tp->t_timer[TCPT_KEEP] = tcp_keepintvl; + tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, tcp_keepintvl); } else - tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp); + tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, TCP_KEEPIDLE(tp)); + break; + case TCPT_DELACK: + if (tcp_delack_enabled && (tp->t_flags & TF_DELACK)) { + tp->t_flags &= ~TF_DELACK; + tp->t_timer[TCPT_DELACK] = 0; + tp->t_flags |= TF_ACKNOW; + + /* If delayed ack timer fired while we are stretching acks, + * go back to acking every other packet + */ + if ((tp->t_flags & TF_STRETCHACK) != 0) + tcp_reset_stretch_ack(tp); + + tcpstat.tcps_delack++; + (void) tcp_output(tp); + } break; #if TCPDEBUG @@ -931,3 +861,462 @@ tcp_timers(tp, timer) } return (tp); } + +/* Remove a timer entry from timer list */ +void +tcp_remove_timer(struct tcpcb *tp) +{ + struct tcptimerlist *listp = &tcp_timer_list; + + lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED); + if (!(TIMER_IS_ON_LIST(tp))) { + return; + } + lck_mtx_lock(listp->mtx); + + /* Check if pcb is on timer list again after acquiring the lock */ + if (!(TIMER_IS_ON_LIST(tp))) { + lck_mtx_unlock(listp->mtx); + return; + } + + if (listp->next_te != NULL && listp->next_te == &tp->tentry) + listp->next_te = LIST_NEXT(&tp->tentry, le); + + LIST_REMOVE(&tp->tentry, le); + tp->t_flags &= ~(TF_TIMER_ONLIST); + + listp->entries--; + lck_mtx_unlock(listp->mtx); + + tp->tentry.le.le_next = NULL; + tp->tentry.le.le_prev = NULL; +} + +/* Function to check if the timerlist needs to be rescheduled to run + * the timer entry correctly. Basically, this is to check if we can avoid + * taking the list lock. 
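tcp_remove_timer() above advances listp->next_te past any entry it is about to unlink, which is what allows the list walker (tcp_run_timerlist, further below) to drop the list lock around per-connection work and still resume safely. A locking-elided sketch of that parked-cursor protocol:

/* Sketch of the parked-cursor hand-off between the list walker and
 * concurrent removals.  Locking is elided; comments mark where the
 * list lock would be held. */
#include <stddef.h>

struct node_sketch { struct node_sketch *next; };
struct tlist_sketch {
    struct node_sketch *head;
    struct node_sketch *next_cursor;    /* successor parked across unlock */
};

static void
remove_node_sketch(struct tlist_sketch *l, struct node_sketch *n)
{
    /* list lock held here */
    if (l->next_cursor == n)
        l->next_cursor = n->next;       /* keep the walker's place valid */
    /* ... unlink n from l->head ... */
}

static void
walk_sketch(struct tlist_sketch *l, void (*work)(struct node_sketch *))
{
    struct node_sketch *n, *next;
    /* lock list */
    for (n = l->head; n != NULL; n = next) {
        l->next_cursor = n->next;
        /* unlock list: work() takes per-connection locks, and other
         * threads may call remove_node_sketch() meanwhile */
        work(n);
        /* lock list again, then reload the possibly-updated cursor */
        next = l->next_cursor;
        l->next_cursor = NULL;
    }
    /* unlock list */
}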
+ */ + +static boolean_t +need_to_resched_timerlist(uint32_t runtime, uint16_t index) { + struct tcptimerlist *listp = &tcp_timer_list; + int32_t diff; + boolean_t is_fast; + + if (runtime == 0 || index == TCPT_NONE) + return FALSE; + is_fast = !(IS_TIMER_SLOW(index)); + + /* If the list is being processed then the state of the list is in flux. + * In this case always acquire the lock and set the state correctly. + */ + if (listp->running) { + return TRUE; + } + + diff = timer_diff(listp->runtime, 0, runtime, 0); + if (diff <= 0) { + /* The list is going to run before this timer */ + return FALSE; + } else { + if (is_fast) { + if (diff <= listp->fast_quantum) + return FALSE; + } else { + if (diff <= listp->slow_quantum) + return FALSE; + } + } + return TRUE; +} + +void +tcp_sched_timerlist(uint32_t offset) +{ + + uint64_t deadline = 0; + struct tcptimerlist *listp = &tcp_timer_list; + + lck_mtx_assert(listp->mtx, LCK_MTX_ASSERT_OWNED); + + listp->runtime = tcp_now + offset; + + clock_interval_to_deadline(offset, NSEC_PER_SEC / TCP_RETRANSHZ, + &deadline); + + thread_call_enter_delayed(listp->call, deadline); +} + +/* Function to run the timers for a connection. + * + * Returns the offset of next timer to be run for this connection which + * can be used to reschedule the timerlist. + */ +uint32_t +tcp_run_conn_timer(struct tcpcb *tp, uint16_t *next_index) { + + struct socket *so; + uint16_t i = 0, index = TCPT_NONE, lo_index = TCPT_NONE; + uint32_t timer_val, offset = 0, lo_timer = 0; + int32_t diff; + boolean_t needtorun[TCPT_NTIMERS]; + int count = 0; + + VERIFY(tp != NULL); + bzero(needtorun, sizeof(needtorun)); + + tcp_lock(tp->t_inpcb->inp_socket, 1, 0); + + so = tp->t_inpcb->inp_socket; + /* Release the want count on inp */ + if (in_pcb_checkstate(tp->t_inpcb, WNT_RELEASE, 1) == WNT_STOPUSING) { + if (TIMER_IS_ON_LIST(tp)) { + tcp_remove_timer(tp); + } + + /* Looks like the TCP connection got closed while we + * were waiting for the lock.. Done + */ + goto done; + } + + /* Since the timer thread needs to wait for tcp lock, it may race + * with another thread that can cancel or reschedule the timer that is + * about to run. Check if we need to run anything. + */ + index = tp->tentry.index; + timer_val = tp->t_timer[index]; + + if (index == TCPT_NONE || tp->tentry.runtime == 0) + goto done; + + diff = timer_diff(tp->tentry.runtime, 0, tcp_now, 0); + if (diff > 0) { + if (tp->tentry.index != TCPT_NONE) { + offset = diff; + *(next_index) = tp->tentry.index; + } + goto done; + } + + tp->t_timer[index] = 0; + if (timer_val > 0) { + tp = tcp_timers(tp, index); + if (tp == NULL) + goto done; + } + + /* Check if there are any other timers that need to be run. While doing it, + * adjust the timer values wrt tcp_now. + */ + for (i = 0; i < TCPT_NTIMERS; ++i) { + if (tp->t_timer[i] != 0) { + diff = timer_diff(tp->tentry.timer_start, tp->t_timer[i], tcp_now, 0); + if (diff <= 0) { + tp->t_timer[i] = 0; + needtorun[i] = TRUE; + count++; + } else { + tp->t_timer[i] = diff; + needtorun[i] = FALSE; + if (lo_timer == 0 || diff < lo_timer) { + lo_timer = diff; + lo_index = i; + } + } + } + } + + tp->tentry.timer_start = tcp_now; + tp->tentry.index = lo_index; + if (lo_index != TCPT_NONE) { + tp->tentry.runtime = tp->tentry.timer_start + tp->t_timer[lo_index]; + } else { + tp->tentry.runtime = 0; + } + + if (count > 0) { + /* run any other timers that are also outstanding at this time. 
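tcp_sched_timerlist() above arms a single kernel callout by converting a relative tick offset into an absolute deadline. A sketch of that usage follows; the three calls mirror kern/thread_call.h and kern/clock.h, while the callback, its argument, and the 1 ms tick rate are illustrative assumptions.

/* Sketch of the thread_call pattern used by tcp_sched_timerlist(). */
#include <sys/types.h>
#include <mach/clock_types.h>
#include <kern/clock.h>
#include <kern/thread_call.h>

#define RETRANSHZ_SKETCH 1000   /* assumed tick rate: 1 tick == 1 ms */

static thread_call_t sketch_call;

static void
sketch_timer_fire(void *arg0, void *arg1)
{
    /* runs on a kernel thread once the deadline passes */
}

static void
sketch_timer_arm(uint32_t offset_ticks)
{
    uint64_t deadline;

    if (sketch_call == NULL)
        sketch_call = thread_call_allocate(sketch_timer_fire, NULL);

    /* the scale factor is the number of nanoseconds per interval unit */
    clock_interval_to_deadline(offset_ticks,
        NSEC_PER_SEC / RETRANSHZ_SKETCH, &deadline);

    /* re-entering an already-armed call simply moves its deadline */
    thread_call_enter_delayed(sketch_call, deadline);
}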
*/ + for (i = 0; i < TCPT_NTIMERS; ++i) { + if (needtorun[i]) { + tp->t_timer[i] = 0; + tp = tcp_timers(tp, i); + if (tp == NULL) + goto done; + } + } + tcp_set_lotimer_index(tp); + } + + if (tp->tentry.index < TCPT_NONE) { + offset = tp->t_timer[tp->tentry.index]; + *(next_index) = tp->tentry.index; + } + +done: + if (tp != NULL && tp->tentry.index == TCPT_NONE) { + tcp_remove_timer(tp); + } + tcp_unlock(so, 1, 0); + return offset; +} + +void +tcp_run_timerlist(void * arg1, void * arg2) { + +#pragma unused(arg1, arg2) + + struct tcptimerentry *te, *next_te; + struct tcptimerlist *listp = &tcp_timer_list; + struct tcpcb *tp; + uint32_t next_timer = 0; + uint16_t index = TCPT_NONE; + boolean_t need_fast = FALSE; + uint32_t active_count = 0; + uint32_t mode = TCP_TIMERLIST_FASTMODE; + + calculate_tcp_clock(); + + lck_mtx_lock(listp->mtx); + + listp->running = TRUE; + + LIST_FOREACH_SAFE(te, &listp->lhead, le, next_te) { + uint32_t offset = 0; + uint32_t runtime = te->runtime; + if (TSTMP_GT(runtime, tcp_now)) { + offset = timer_diff(runtime, 0, tcp_now, 0); + if (next_timer == 0 || offset < next_timer) { + next_timer = offset; + } + continue; + } + active_count++; + + tp = TIMERENTRY_TO_TP(te); + + /* Acquire an inp wantcnt on the inpcb so that the socket won't get + * detached even if tcp_close is called + */ + if (in_pcb_checkstate(tp->t_inpcb, WNT_ACQUIRE, 0) == WNT_STOPUSING) { + /* Somehow this pcb went into a dead state while on the timer list, + * just take it off the list. Since the timer list entry pointers + * are protected by the timer list lock, we can do it here + */ + if (TIMER_IS_ON_LIST(tp)) { + tp->t_flags &= ~(TF_TIMER_ONLIST); + LIST_REMOVE(&tp->tentry, le); + listp->entries--; + + tp->tentry.le.le_next = NULL; + tp->tentry.le.le_prev = NULL; + } + continue; + } + + /* Store the next timerentry pointer before releasing the list lock. + * If that entry has to be removed when we release the lock, this + * pointer will be updated to the element after that. + */ + listp->next_te = next_te; + + VERIFY_NEXT_LINK(&tp->tentry, le); + VERIFY_PREV_LINK(&tp->tentry, le); + + lck_mtx_unlock(listp->mtx); + + index = TCPT_NONE; + offset = tcp_run_conn_timer(tp, &index); + + lck_mtx_lock(listp->mtx); + + next_te = listp->next_te; + listp->next_te = NULL; + + if (offset > 0) { + if (index < TCPT_NONE) { + /* Check if this is a fast_timer. 
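TIMERENTRY_TO_TP(), used in tcp_run_timerlist() above and defined near the top of this file, recovers the owning tcpcb from its embedded tcptimerentry by subtracting offsetof(struct tcpcb, tentry.le.le_next); that works because le_next is the first member of the embedded entry, so it shares the entry's address. A stand-alone demonstration of the same offsetof() recovery trick, with simplified stand-in types:

/* Demonstration of the container-recovery trick behind TIMERENTRY_TO_TP(). */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct entry_sketch { struct entry_sketch *le_next, **le_prev; };
struct conn_sketch  { int id; struct entry_sketch tentry; };

#define ENTRY_TO_CONN_SKETCH(te) \
    ((struct conn_sketch *)((uintptr_t)(te) - \
        offsetof(struct conn_sketch, tentry)))

int main(void)
{
    struct conn_sketch c = { .id = 42 };
    struct entry_sketch *te = &c.tentry;    /* what the timer list holds */
    assert(ENTRY_TO_CONN_SKETCH(te) == &c);
    assert(ENTRY_TO_CONN_SKETCH(te)->id == 42);
    return 0;
}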
*/ + if (!need_fast && !(IS_TIMER_SLOW(index))) { + need_fast = TRUE; + } + + if (next_timer == 0 || offset < next_timer) { + next_timer = offset; + } + } + } + } + + if (!LIST_EMPTY(&listp->lhead)) { + if (listp->mode == TCP_TIMERLIST_FASTMODE) { + if (need_fast || active_count > 0 || + listp->pref_mode == TCP_TIMERLIST_FASTMODE) { + listp->idlegen = 0; + } else { + listp->idlegen++; + if (listp->idlegen > timer_fastmode_idlemax) { + mode = TCP_TIMERLIST_SLOWMODE; + listp->idlegen = 0; + } + } + } else { + if (!need_fast) { + mode = TCP_TIMERLIST_SLOWMODE; + } + } + + if (mode == TCP_TIMERLIST_FASTMODE || + listp->pref_mode == TCP_TIMERLIST_FASTMODE) { + next_timer = listp->fast_quantum; + } else { + if (listp->pref_offset != 0 && + listp->pref_offset < next_timer) + next_timer = listp->pref_offset; + if (next_timer < listp->slow_quantum) + next_timer = listp->slow_quantum; + } + + listp->mode = mode; + + tcp_sched_timerlist(next_timer); + } else { + /* No need to reschedule this timer */ + listp->runtime = 0; + } + + listp->running = FALSE; + listp->pref_mode = 0; + listp->pref_offset = 0; + + lck_mtx_unlock(listp->mtx); +} + +/* Function to verify if a change in timer state is required for a connection */ +void +tcp_sched_timers(struct tcpcb *tp) +{ + struct tcptimerentry *te = &tp->tentry; + uint16_t index = te->index; + struct tcptimerlist *listp = &tcp_timer_list; + uint32_t offset = 0; + boolean_t is_fast; + int list_locked = 0; + + if (tp->t_inpcb->inp_state == INPCB_STATE_DEAD) { + /* Just return without adding the dead pcb to the list */ + if (TIMER_IS_ON_LIST(tp)) { + tcp_remove_timer(tp); + } + return; + } + + if (index == TCPT_NONE) { + tcp_remove_timer(tp); + return; + } + + is_fast = !(IS_TIMER_SLOW(index)); + offset = te->runtime - tcp_now; + if (offset == 0) { + offset = 1; + tcp_timer_advanced++; + } + if (is_fast) + offset = listp->fast_quantum; + + if (!TIMER_IS_ON_LIST(tp)) { + if (!list_locked) { + lck_mtx_lock(listp->mtx); + list_locked = 1; + } + + LIST_INSERT_HEAD(&listp->lhead, te, le); + tp->t_flags |= TF_TIMER_ONLIST; + + listp->entries++; + if (listp->entries > listp->maxentries) + listp->maxentries = listp->entries; + + /* if the list is not scheduled, just schedule it */ + if (listp->runtime == 0) + goto schedule; + + } + + + /* timer entry is currently on the list */ + if (need_to_resched_timerlist(te->runtime, index)) { + tcp_resched_timerlist++; + + if (!list_locked) { + lck_mtx_lock(listp->mtx); + list_locked = 1; + } + + VERIFY_NEXT_LINK(te, le); + VERIFY_PREV_LINK(te, le); + + if (listp->running) { + if (is_fast) { + listp->pref_mode = TCP_TIMERLIST_FASTMODE; + } else if (listp->pref_offset == 0 || + ((int)offset) < listp->pref_offset) { + listp->pref_offset = offset; + } + } else { + int32_t diff; + diff = timer_diff(listp->runtime, 0, tcp_now, offset); + if (diff <= 0) { + /* The list is going to run before this timer */ + goto done; + } else { + goto schedule; + } + } + } + goto done; + +schedule: + if (is_fast) { + listp->mode = TCP_TIMERLIST_FASTMODE; + listp->idlegen = 0; + } + tcp_sched_timerlist(offset); + +done: + if (list_locked) + lck_mtx_unlock(listp->mtx); + + return; +} + +void +tcp_set_lotimer_index(struct tcpcb *tp) { + uint16_t i, lo_index = TCPT_NONE; + uint32_t lo_timer = 0; + for (i = 0; i < TCPT_NTIMERS; ++i) { + if (tp->t_timer[i] != 0 && + (lo_timer == 0 || tp->t_timer[i] < lo_timer)) { + lo_timer = tp->t_timer[i]; + lo_index = i; + } + } + tp->tentry.index = lo_index; + if (lo_index != TCPT_NONE) { + tp->tentry.runtime = 
tp->tentry.timer_start + tp->t_timer[lo_index]; + } else { + tp->tentry.runtime = 0; + } +} + +void +tcp_check_timer_state(struct tcpcb *tp) { + + lck_mtx_assert(&tp->t_inpcb->inpcb_mtx, LCK_MTX_ASSERT_OWNED); + + tcp_set_lotimer_index(tp); + + tcp_sched_timers(tp); + return; +} diff --git a/bsd/netinet/tcp_timer.h b/bsd/netinet/tcp_timer.h index c4ea59c6b..df1162053 100644 --- a/bsd/netinet/tcp_timer.h +++ b/bsd/netinet/tcp_timer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -65,16 +65,25 @@ #define _NETINET_TCP_TIMER_H_ #include <sys/appleapiopts.h> +#ifdef KERNEL +#include <kern/thread_call.h> +#endif /* KERNEL */ + /* - * Definitions of the TCP timers. These timers are counted - * down PR_SLOWHZ times a second. + * Definitions of the TCP timers. */ -#define TCPT_NTIMERS 4 +#define TCPT_NTIMERS 5 + +/* Keep the external definition the same for binary compatibility */ +#define TCPT_NTIMERS_EXT 4 #define TCPT_REXMT 0 /* retransmit */ #define TCPT_PERSIST 1 /* retransmit persistence */ #define TCPT_KEEP 2 /* keep alive */ #define TCPT_2MSL 3 /* 2*msl quiet time timer */ +#define TCPT_DELACK 4 /* delayed ack timer */ +#define TCPT_MAX 4 +#define TCPT_NONE (TCPT_MAX + 1) /* * The TCPT_REXMT timer is used to force retransmissions. @@ -119,7 +128,7 @@ */ #define TCPTV_MSL ( 15*TCP_RETRANSHZ) /* max seg lifetime (hah!) */ #define TCPTV_SRTTBASE 0 /* base roundtrip time; - if 0, no idea yet */ + if 0, no idea yet */ #define TCPTV_RTOBASE ( 1*TCP_RETRANSHZ) /* assumed RTO if no info */ #define TCPTV_SRTTDFLT ( 1*TCP_RETRANSHZ) /* assumed RTT if no info */ @@ -131,9 +140,18 @@ #define TCPTV_KEEPINTVL ( 75*TCP_RETRANSHZ) /* default probe interval */ #define TCPTV_KEEPCNT 8 /* max probes before drop */ -//#define TCPTV_MIN ( 3*TCP_RETRANSHZ) /* minimum allowable value */ -#define TCPTV_MIN (1) /* minimum allowable value */ -#define TCPTV_REXMTMAX ( 64*TCP_RETRANSHZ) /* max allowable REXMT value */ +#define TCPTV_REXMTMAX ( 64*TCP_RETRANSHZ ) /* max allowable REXMT value */ +#define TCPTV_REXMTMIN ( TCP_RETRANSHZ/33 ) /* min REXMT for non-local connections */ +#define TCPTV_UNACKWIN ( TCP_RETRANSHZ/10 ) /* Window for counting rcv bytes to see if + ack-stretching can start (default 100 ms) */ +#define TCPTV_MAXRCVIDLE (TCP_RETRANSHZ/5 ) /* Receiver idle time, avoid ack-stretching after that*/ + +/* No ack stretching during slow-start, until we see some packets. + * By the time the receiver gets 512 packets, the senders cwnd + * should open by a few hundred packets considering the progression + * during slow-start. + */ +#define TCP_RCV_SS_PKTCOUNT 512 #define TCPTV_TWTRUNC 8 /* RTO factor to truncate TW */ @@ -143,15 +161,81 @@ #ifdef TCPTIMERS static char *tcptimers[] = - { "REXMT", "PERSIST", "KEEP", "2MSL" }; + { "REXMT", "PERSIST", "KEEP", "2MSL" , "DELACK"}; #endif #ifdef KERNEL + +/* We consider persist, keep and 2msl as slow timers which can be coalesced + * at a higher granularity (500 ms). Rexmt and delayed ack are considered fast + * timers which fire in the order of 100ms. + * + * The following conditional is to check if a timer is one of the slow timers. This + * is fast and works well for now. If we add more slow timers for any reason, + * we may need to change this. 
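The comment continuing below classifies persist, keep, and 2MSL as slow timers; the IS_TIMER_SLOW() test it introduces is simply (ind & 0x3) != 0. A spot check against the indices defined later in this header (REXMT=0, PERSIST=1, KEEP=2, 2MSL=3, DELACK=4), with parentheses added around the macro argument for hygiene:

/* Spot check of the slow-timer classification described here. */
#include <assert.h>

#define IS_TIMER_SLOW_SKETCH(ind) (((ind) & 0x3) != 0)

int main(void)
{
    assert(!IS_TIMER_SLOW_SKETCH(0));   /* TCPT_REXMT   -> fast */
    assert( IS_TIMER_SLOW_SKETCH(1));   /* TCPT_PERSIST -> slow */
    assert( IS_TIMER_SLOW_SKETCH(2));   /* TCPT_KEEP    -> slow */
    assert( IS_TIMER_SLOW_SKETCH(3));   /* TCPT_2MSL    -> slow */
    assert(!IS_TIMER_SLOW_SKETCH(4));   /* TCPT_DELACK  -> fast (4 & 3 == 0) */
    return 0;
}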
+ */ +#define IS_TIMER_SLOW(ind) ((ind & 0x3) != 0) + +struct tcptimerlist; + +struct tcptimerentry { + LIST_ENTRY(tcptimerentry) le; /* links for timer list */ + uint32_t timer_start; /* tcp clock when the timer was started */ + uint16_t index; /* index of lowest timer that needs to run first */ + uint32_t runtime; /* deadline at which the first timer has to fire */ +}; + +LIST_HEAD(timerlisthead, tcptimerentry); + +struct tcptimerlist { + struct timerlisthead lhead; /* head of the list of timer entries */ + lck_mtx_t *mtx; /* lock to protect the list */ + lck_attr_t *mtx_attr; /* mutex attributes */ + lck_grp_t *mtx_grp; /* mutex group definition */ + lck_grp_attr_t *mtx_grp_attr; /* mutex group attributes */ + uint32_t fast_quantum; /* minimum time quantum to coalesce fast timers */ + uint32_t slow_quantum; /* minimum time quantum to coalesce slow timers */ + thread_call_t call; /* call entry */ + uint32_t runtime; /* time at which this list is going to run */ + uint32_t entries; /* Number of entries on the list */ + uint32_t maxentries; /* Max number of entries at any time */ + + /* Set desired mode when timer list running */ + boolean_t running; /* Set when timer list is being processed */ +#define TCP_TIMERLIST_FASTMODE 0x1 +#define TCP_TIMERLIST_SLOWMODE 0x2 + uint32_t mode; /* Current mode, fast or slow */ + uint32_t pref_mode; /* Preferred mode set by a connection, fast or slow */ + uint32_t pref_offset; /* Preferred offset set by a connection */ + uint32_t idlegen; /* Number of times the list has been idle in fast mode */ + struct tcptimerentry *next_te; /* Store the next timer entry pointer to process */ + +}; + +#define TCP_FASTMODE_IDLEGEN_MAX 20 /* Approximately 2 seconds */ + /* - * Force a time value to be in a certain range. + * Minimum retransmit timeout is set to 30ms. We add a slop of + * 200 ms to the retransmit value to account for processing + * variance and delayed ack. This extra 200ms will help to avoid + * spurious retransmits by taking into consideration the receivers + * that wait for delayed ack timer instead of generating an ack + * for every two packets. + * + * On a local link, the minimum retransmit timeout is 100ms and + * variance is set to 0. This will make the sender a little bit more + * aggressive on local link. When the connection is not established yet, + * there is no need to add an extra 200ms to retransmit timeout because + * the initial value is high (1s) and delayed ack is not a problem in + * that case. */ -#define TCPT_RANGESET(tv, value, tvmin, tvmax) do { \ - (tv) = (value); \ +#define TCPTV_REXMTSLOP ( TCP_RETRANSHZ/5 ) /* rexmt slop allowed (200 ms) */ + +/* macro to decide when retransmit slop (described above) should be added */ +#define TCP_ADD_REXMTSLOP(tp) ((tp->t_flags & TF_LOCAL) != 0 || tp->t_state >= TCPS_ESTABLISHED) + +#define TCPT_RANGESET(tv, value, tvmin, tvmax, addslop) do { \ + (tv) = ((addslop) ? 
tcp_rexmt_slop : 0) + (value); \ if ((uint32_t)(tv) < (uint32_t)(tvmin)) \ (tv) = (tvmin); \ else if ((uint32_t)(tv) > (uint32_t)(tvmax)) \ @@ -166,16 +250,15 @@ extern int tcp_keepinit; /* time to establish connection */ extern int tcp_keepidle; /* time before keepalive probes begin */ extern int tcp_keepintvl; /* time between keepalive probes */ extern int tcp_maxidle; /* time to drop after starting probes */ +extern int tcp_delack; /* delayed ack timer */ extern int tcp_maxpersistidle; extern int tcp_msl; extern int tcp_ttl; /* time to live for TCP segs */ extern int tcp_backoff[]; +extern int tcp_rexmt_slop; +extern u_int32_t tcp_max_persist_timeout; /* Maximum persistence for Zero Window Probes */ -void tcp_timer_2msl(void *xtp); -void tcp_timer_keep(void *xtp); -void tcp_timer_persist(void *xtp); -void tcp_timer_rexmt(void *xtp); -void tcp_timer_delack(void *xtp); +#define OFFSET_FROM_START(tp, off) ((tcp_now + (off)) - (tp)->tentry.timer_start) #endif /* KERNEL */ #endif /* PRIVATE */ diff --git a/bsd/netinet/tcp_usrreq.c b/bsd/netinet/tcp_usrreq.c index d477b60d5..d4fddb517 100644 --- a/bsd/netinet/tcp_usrreq.c +++ b/bsd/netinet/tcp_usrreq.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -76,6 +76,7 @@ #include <net/if.h> #include <net/route.h> +#include <net/ntstat.h> #include <netinet/in.h> #include <netinet/in_systm.h> @@ -105,6 +106,11 @@ #include <netinet6/ipsec.h> #endif /*IPSEC*/ +void tcp_fill_info(struct tcpcb *, struct tcp_info *); +errno_t tcp_fill_info_for_info_tuple(struct info_tuple *, struct tcp_info *); + +int tcp_sysctl_info(struct sysctl_oid *, void *, int , struct sysctl_req *); + /* * TCP protocol interface to socket abstraction. 
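TCPT_RANGESET() above now folds in the 200 ms TCPTV_REXMTSLOP whenever TCP_ADD_REXMTSLOP() allows it, and OFFSET_FROM_START() rebases an offset from tcp_now onto the connection's timer_start so that all of a connection's deadlines share one base. A worked example of both, assuming TCP_RETRANSHZ == 1000:

/* Worked example of the two timer macros above. */
#include <assert.h>
#include <stdint.h>

#define RETRANSHZ_SKETCH 1000
#define REXMTMIN_SKETCH  (RETRANSHZ_SKETCH / 33)    /* 30 ms floor */
#define REXMTMAX_SKETCH  (64 * RETRANSHZ_SKETCH)    /* 64 s ceiling */
#define REXMTSLOP_SKETCH (RETRANSHZ_SKETCH / 5)     /* 200 ms slop */

static uint32_t
rangeset_sketch(uint32_t value, int addslop)
{
    uint32_t tv = (addslop ? REXMTSLOP_SKETCH : 0) + value;
    if (tv < REXMTMIN_SKETCH)
        tv = REXMTMIN_SKETCH;
    else if (tv > REXMTMAX_SKETCH)
        tv = REXMTMAX_SKETCH;
    return tv;
}

int main(void)
{
    /* established, non-local connection: slop applies */
    assert(rangeset_sketch(500, 1) == 700);
    /* tiny RTO with no slop: clamped up to the 30 ms floor */
    assert(rangeset_sketch(10, 0) == REXMTMIN_SKETCH);

    /* OFFSET_FROM_START: a deadline "off ticks from now", stored
     * relative to the connection's timer_start, round-trips exactly */
    uint32_t tcp_now = 5000, timer_start = 4000, off = 300;
    uint32_t stored = (tcp_now + off) - timer_start;    /* 1300 */
    assert(timer_start + stored == tcp_now + off);
    return 0;
}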
*/ @@ -121,26 +127,26 @@ static struct tcpcb * tcp_usrclosed(struct tcpcb *); __private_extern__ int tcp_win_scale = 3; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, win_scale_factor, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, win_scale_factor, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_win_scale, 0, "Window scaling factor"); static u_int32_t tcps_in_sw_cksum; -SYSCTL_UINT(_net_inet_tcp, OID_AUTO, in_sw_cksum, CTLFLAG_RD, +SYSCTL_UINT(_net_inet_tcp, OID_AUTO, in_sw_cksum, CTLFLAG_RD | CTLFLAG_LOCKED, &tcps_in_sw_cksum, 0, "Number of received packets checksummed in software"); static u_int64_t tcps_in_sw_cksum_bytes; -SYSCTL_QUAD(_net_inet_tcp, OID_AUTO, in_sw_cksum_bytes, CTLFLAG_RD, +SYSCTL_QUAD(_net_inet_tcp, OID_AUTO, in_sw_cksum_bytes, CTLFLAG_RD | CTLFLAG_LOCKED, &tcps_in_sw_cksum_bytes, "Amount of received data checksummed in software"); static u_int32_t tcps_out_sw_cksum; -SYSCTL_UINT(_net_inet_tcp, OID_AUTO, out_sw_cksum, CTLFLAG_RD, +SYSCTL_UINT(_net_inet_tcp, OID_AUTO, out_sw_cksum, CTLFLAG_RD | CTLFLAG_LOCKED, &tcps_out_sw_cksum, 0, "Number of transmitted packets checksummed in software"); static u_int64_t tcps_out_sw_cksum_bytes; -SYSCTL_QUAD(_net_inet_tcp, OID_AUTO, out_sw_cksum_bytes, CTLFLAG_RD, +SYSCTL_QUAD(_net_inet_tcp, OID_AUTO, out_sw_cksum_bytes, CTLFLAG_RD | CTLFLAG_LOCKED, &tcps_out_sw_cksum_bytes, "Amount of transmitted data checksummed in software"); @@ -160,9 +166,13 @@ __private_extern__ unsigned int tcp_sockthreshold = 64; #else __private_extern__ unsigned int tcp_sockthreshold = 0; #endif -SYSCTL_INT(_net_inet_tcp, OID_AUTO, sockthreshold, CTLFLAG_RW, +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sockthreshold, CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_sockthreshold , 0, "TCP Socket size increased if less than threshold"); + +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, info, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, + 0 , 0, tcp_sysctl_info, "S", "TCP info per tuple"); + /* * TCP attaches to socket via pru_attach(), reserving space, * and an internet control block. 
@@ -186,7 +196,7 @@ tcp_usr_attach(struct socket *so, __unused int proto, struct proc *p) error = EISCONN; goto out; } - + error = tcp_attach(so, p); if (error) goto out; @@ -217,14 +227,15 @@ tcp_usr_detach(struct socket *so) if (inp == 0 || (inp->inp_state == INPCB_STATE_DEAD)) { return EINVAL; /* XXX */ } -#if 1 - lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); -#endif + lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); tp = intotcpcb(inp); /* In case we got disconnected from the peer */ if (tp == 0) goto out; TCPDEBUG1(); + + calculate_tcp_clock(); + tp = tcp_disconnect(tp); out: TCPDEBUG2(PRU_DETACH); @@ -238,6 +249,7 @@ out: } \ tp = intotcpcb(inp); \ TCPDEBUG1(); \ + calculate_tcp_clock(); \ } while(0) #define COMMON_END(req) out: TCPDEBUG2(req); return error; goto out @@ -415,6 +427,8 @@ tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct proc *p) tp = intotcpcb(inp); TCPDEBUG1(); + calculate_tcp_clock(); + if (nam->sa_family != 0 && nam->sa_family != AF_INET) { error = EAFNOSUPPORT; goto out; @@ -505,9 +519,7 @@ tcp_usr_disconnect(struct socket *so) struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp; -#if 1 - lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); -#endif + lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); COMMON_START(); /* In case we got disconnected from the peer */ if (tp == 0) @@ -529,6 +541,8 @@ tcp_usr_accept(struct socket *so, struct sockaddr **nam) struct tcpcb *tp = NULL; TCPDEBUG0; + in_setpeeraddr(so, nam); + if (so->so_state & SS_ISDISCONNECTED) { error = ECONNABORTED; goto out; @@ -538,7 +552,9 @@ tcp_usr_accept(struct socket *so, struct sockaddr **nam) } tp = intotcpcb(inp); TCPDEBUG1(); - in_setpeeraddr(so, nam); + + calculate_tcp_clock(); + COMMON_END(PRU_ACCEPT); } @@ -560,6 +576,9 @@ tcp6_usr_accept(struct socket *so, struct sockaddr **nam) } tp = intotcpcb(inp); TCPDEBUG1(); + + calculate_tcp_clock(); + in6_mapped_peeraddr(so, nam); COMMON_END(PRU_ACCEPT); } @@ -681,6 +700,9 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, #endif /* INET6 */ tp = intotcpcb(inp); TCPDEBUG1(); + + calculate_tcp_clock(); + if (control) { /* TCP doesn't do control messages (rights, creds, etc) */ if (control->m_len) { @@ -878,10 +900,11 @@ tcp_connect(tp, nam, p) struct socket *so = inp->inp_socket; struct tcpcb *otp; struct sockaddr_in *sin = (struct sockaddr_in *)nam; - struct sockaddr_in *ifaddr; + struct sockaddr_in ifaddr; struct rmxp_tao *taop; struct rmxp_tao tao_noncached; int error; + unsigned int outif = 0; if (inp->inp_lport == 0) { error = in_pcbbind(inp, (struct sockaddr *)0, p); @@ -894,7 +917,7 @@ tcp_connect(tp, nam, p) * earlier incarnation of this same connection still in * TIME_WAIT state, creating an ADDRINUSE error. */ - error = in_pcbladdr(inp, nam, &ifaddr); + error = in_pcbladdr(inp, nam, &ifaddr, &outif); if (error) return error; @@ -902,7 +925,7 @@ tcp_connect(tp, nam, p) oinp = in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port, inp->inp_laddr.s_addr != INADDR_ANY ? 
inp->inp_laddr - : ifaddr->sin_addr, + : ifaddr.sin_addr, inp->inp_lport, 0, NULL); tcp_lock(inp->inp_socket, 0, 0); @@ -917,7 +940,7 @@ tcp_connect(tp, nam, p) if (oinp != inp && (otp = intotcpcb(oinp)) != NULL && otp->t_state == TCPS_TIME_WAIT && - otp->t_starttime < (u_int32_t)tcp_msl && + ((int)(tcp_now - otp->t_starttime)) < tcp_msl && (otp->t_flags & TF_RCVD_CC)) otp = tcp_close(otp); else { @@ -930,7 +953,7 @@ tcp_connect(tp, nam, p) tcp_unlock(oinp->inp_socket, 1, 0); } skip_oinp: - if ((inp->inp_laddr.s_addr == INADDR_ANY ? ifaddr->sin_addr.s_addr : + if ((inp->inp_laddr.s_addr == INADDR_ANY ? ifaddr.sin_addr.s_addr : inp->inp_laddr.s_addr) == sin->sin_addr.s_addr && inp->inp_lport == sin->sin_port) return EINVAL; @@ -940,8 +963,10 @@ skip_oinp: lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx); socket_lock(inp->inp_socket, 0); } - if (inp->inp_laddr.s_addr == INADDR_ANY) - inp->inp_laddr = ifaddr->sin_addr; + if (inp->inp_laddr.s_addr == INADDR_ANY) { + inp->inp_laddr = ifaddr.sin_addr; + inp->inp_last_outif = outif; + } inp->inp_faddr = sin->sin_addr; inp->inp_fport = sin->sin_port; in_pcbrehash(inp); @@ -968,9 +993,12 @@ skip_oinp: soisconnecting(so); tcpstat.tcps_connattempt++; tp->t_state = TCPS_SYN_SENT; - tp->t_timer[TCPT_KEEP] = tp->t_keepinit ? tp->t_keepinit : tcp_keepinit; + tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, + tp->t_keepinit ? tp->t_keepinit : tcp_keepinit); tp->iss = tcp_new_isn(tp); tcp_sendseqinit(tp); + if (nstat_collect) + nstat_route_connect_attempt(inp->inp_route.ro_rt); /* * Generate a CC value for this connection and @@ -1008,6 +1036,7 @@ tcp6_connect(tp, nam, p) struct rmxp_tao *taop; struct rmxp_tao tao_noncached; int error; + unsigned int outif = 0; if (inp->inp_lport == 0) { error = in6_pcbbind(inp, (struct sockaddr *)0, p); @@ -1020,7 +1049,7 @@ tcp6_connect(tp, nam, p) * earlier incarnation of this same connection still in * TIME_WAIT state, creating an ADDRINUSE error. */ - error = in6_pcbladdr(inp, nam, &addr6); + error = in6_pcbladdr(inp, nam, &addr6, &outif); if (error) return error; tcp_unlock(inp->inp_socket, 0, 0); @@ -1034,7 +1063,7 @@ tcp6_connect(tp, nam, p) if (oinp) { if (oinp != inp && (otp = intotcpcb(oinp)) != NULL && otp->t_state == TCPS_TIME_WAIT && - otp->t_starttime < (u_int32_t)tcp_msl && + ((int)(tcp_now - otp->t_starttime)) < tcp_msl && (otp->t_flags & TF_RCVD_CC)) otp = tcp_close(otp); else @@ -1046,8 +1075,10 @@ tcp6_connect(tp, nam, p) lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx); socket_lock(inp->inp_socket, 0); } - if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) + if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { inp->in6p_laddr = addr6; + inp->in6p_last_outif = outif; + } inp->in6p_faddr = sin6->sin6_addr; inp->inp_fport = sin6->sin6_port; if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != 0) @@ -1063,9 +1094,12 @@ tcp6_connect(tp, nam, p) soisconnecting(so); tcpstat.tcps_connattempt++; tp->t_state = TCPS_SYN_SENT; - tp->t_timer[TCPT_KEEP] = tp->t_keepinit ? tp->t_keepinit : tcp_keepinit; + tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, + tp->t_keepinit ? 
tp->t_keepinit : tcp_keepinit); tp->iss = tcp_new_isn(tp); tcp_sendseqinit(tp); + if (nstat_collect) + nstat_route_connect_attempt(inp->inp_route.ro_rt); /* * Generate a CC value for this connection and @@ -1089,6 +1123,132 @@ tcp6_connect(tp, nam, p) } #endif /* INET6 */ +/* + * Export TCP internal state information via a struct tcp_info + */ +__private_extern__ void +tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) +{ + bzero(ti, sizeof(*ti)); + + ti->tcpi_state = tp->t_state; + + if (tp->t_state > TCPS_LISTEN) { + if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) + ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; + if (tp->t_flags & TF_SACK_PERMIT) + ti->tcpi_options |= TCPI_OPT_SACK; + if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { + ti->tcpi_options |= TCPI_OPT_WSCALE; + ti->tcpi_snd_wscale = tp->snd_scale; + ti->tcpi_rcv_wscale = tp->rcv_scale; + } + + ti->tcpi_snd_mss = tp->t_maxseg; + ti->tcpi_rcv_mss = tp->t_maxseg; + + ti->tcpi_snd_ssthresh = tp->snd_ssthresh; + ti->tcpi_snd_cwnd = tp->snd_cwnd; + + ti->tcpi_rcv_space = tp->rcv_wnd; + + ti->tcpi_snd_wnd = tp->snd_wnd; + ti->tcpi_snd_bwnd = tp->snd_bwnd; + ti->tcpi_snd_nxt = tp->snd_nxt; + ti->tcpi_rcv_nxt = tp->rcv_nxt; + + ti->tcpi_last_outif = tp->t_inpcb->inp_last_outif; + } +} + +__private_extern__ errno_t +tcp_fill_info_for_info_tuple(struct info_tuple *itpl, struct tcp_info *ti) +{ + struct inpcbinfo *pcbinfo = NULL; + struct inpcb *inp = NULL; + struct socket *so; + struct tcpcb *tp; + + if (itpl->itpl_proto == IPPROTO_TCP) + pcbinfo = &tcbinfo; + else + return EINVAL; + + if (itpl->itpl_local_sa.sa_family == AF_INET && + itpl->itpl_remote_sa.sa_family == AF_INET) { + inp = in_pcblookup_hash(pcbinfo, + itpl->itpl_remote_sin.sin_addr, + itpl->itpl_remote_sin.sin_port, + itpl->itpl_local_sin.sin_addr, + itpl->itpl_local_sin.sin_port, + 0, NULL); + } else if (itpl->itpl_local_sa.sa_family == AF_INET6 && + itpl->itpl_remote_sa.sa_family == AF_INET6) { + struct in6_addr ina6_local; + struct in6_addr ina6_remote; + + ina6_local = itpl->itpl_local_sin6.sin6_addr; + if (IN6_IS_SCOPE_LINKLOCAL(&ina6_local) && itpl->itpl_local_sin6.sin6_scope_id) + ina6_local.s6_addr16[1] = htons(itpl->itpl_local_sin6.sin6_scope_id); + + ina6_remote = itpl->itpl_remote_sin6.sin6_addr; + if (IN6_IS_SCOPE_LINKLOCAL(&ina6_remote) && itpl->itpl_remote_sin6.sin6_scope_id) + ina6_remote.s6_addr16[1] = htons(itpl->itpl_remote_sin6.sin6_scope_id); + + inp = in6_pcblookup_hash(pcbinfo, + &ina6_remote, + itpl->itpl_remote_sin6.sin6_port, + &ina6_local, + itpl->itpl_local_sin6.sin6_port, + 0, NULL); + } else + return EINVAL; + if (inp == NULL || (so = inp->inp_socket) == NULL) + return ENOENT; + + socket_lock(so, 0); + if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { + socket_unlock(so, 0); + return ENOENT; + } + tp = intotcpcb(inp); + + tcp_fill_info(tp, ti); + socket_unlock(so, 0); + + return 0; +} + + +__private_extern__ int +tcp_sysctl_info(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req) +{ + int error; + struct tcp_info ti; + struct info_tuple itpl; + + if (req->newptr == USER_ADDR_NULL) { + return EINVAL; + } + if (req->newlen < sizeof(struct info_tuple)) { + return EINVAL; + } + error = SYSCTL_IN(req, &itpl, sizeof(struct info_tuple)); + if (error != 0) { + return error; + } + error = tcp_fill_info_for_info_tuple(&itpl, &ti); + if (error != 0) { + return error; + } + error = SYSCTL_OUT(req, &ti, sizeof(struct tcp_info)); + if (error != 0) { + return error; 
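tcp_fill_info_for_info_tuple() above resolves a caller-supplied 4-tuple to a pcb and snapshots it into a struct tcp_info, which tcp_sysctl_info() then exposes through a sysctl handler. A hypothetical userland caller, assuming the handler is registered under "net.inet.tcp.info" (the registration is not part of this hunk) and that the private info_tuple/tcp_info definitions are in scope:

#include <errno.h>
#include <sys/sysctl.h>

/* Hypothetical: look up tcp_info for a connection by its 4-tuple. */
static int
fetch_tcp_info(struct info_tuple *itpl, struct tcp_info *ti)
{
    size_t len = sizeof(*ti);

    /* newp carries the tuple in; oldp receives the snapshot. */
    if (sysctlbyname("net.inet.tcp.info", ti, &len,
        itpl, sizeof(*itpl)) == -1)
        return errno;
    return 0;
}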
+ } + + return 0; +} + /* * The new sockopt interface makes it possible for us to block in the * copyin/out step (if we take a page fault). Taking a page fault at @@ -1124,6 +1284,8 @@ tcp_ctloutput(so, sopt) return (ECONNRESET); } + calculate_tcp_clock(); + switch (sopt->sopt_dir) { case SOPT_SET: switch (sopt->sopt_name) { @@ -1155,7 +1317,17 @@ tcp_ctloutput(so, sopt) else tp->t_flags &= ~opt; break; - + case TCP_RXT_FINDROP: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + opt = TF_RXTFINDROP; + if (optval) + tp->t_flagsext |= opt; + else + tp->t_flagsext &= ~opt; + break; case TCP_MAXSEG: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); @@ -1178,7 +1350,9 @@ tcp_ctloutput(so, sopt) error = EINVAL; else { tp->t_keepidle = optval * TCP_RETRANSHZ; - tp->t_timer[TCPT_KEEP] = TCP_KEEPIDLE(tp); /* reset the timer to new value */ + tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, + TCP_KEEPIDLE(tp)); /* reset the timer to new value */ + tcp_check_timer_state(tp); } break; @@ -1193,6 +1367,26 @@ tcp_ctloutput(so, sopt) tp->t_keepinit = optval * TCP_RETRANSHZ; break; + case PERSIST_TIMEOUT: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + break; + if (optval < 0) + error = EINVAL; + else + tp->t_persist_timeout = optval * TCP_RETRANSHZ; + break; + case TCP_RXT_CONNDROPTIME: + error = sooptcopyin(sopt, &optval, sizeof(optval), + sizeof(optval)); + if (error) + break; + if (optval < 0) + error = EINVAL; + else + tp->rxt_conndroptime = optval * TCP_RETRANSHZ; + break; default: error = ENOPROTOOPT; break; @@ -1219,6 +1413,22 @@ tcp_ctloutput(so, sopt) case TCP_CONNECTIONTIMEOUT: optval = tp->t_keepinit / TCP_RETRANSHZ; break; + case PERSIST_TIMEOUT: + optval = tp->t_persist_timeout / TCP_RETRANSHZ; + break; + case TCP_RXT_CONNDROPTIME: + optval = tp->rxt_conndroptime / TCP_RETRANSHZ; + break; + case TCP_RXT_FINDROP: + optval = tp->t_flagsext & TF_RXTFINDROP; + break; + case TCP_INFO: { + struct tcp_info ti; + + tcp_fill_info(tp, &ti); + error = sooptcopyout(sopt, &ti, sizeof(struct tcp_info)); + goto done; + } default: error = ENOPROTOOPT; break; @@ -1227,6 +1437,7 @@ tcp_ctloutput(so, sopt) error = sooptcopyout(sopt, &optval, sizeof optval); break; } +done: return (error); } @@ -1272,9 +1483,9 @@ sysctl_tcp_sospace(struct sysctl_oid *oidp, __unused void *arg1, return error; } -SYSCTL_PROC(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLTYPE_INT | CTLFLAG_RW, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_sendspace , 0, &sysctl_tcp_sospace, "IU", "Maximum outgoing TCP datagram size"); -SYSCTL_PROC(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLTYPE_INT | CTLFLAG_RW, +SYSCTL_PROC(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_recvspace , 0, &sysctl_tcp_sospace, "IU", "Maximum incoming TCP datagram size"); @@ -1353,6 +1564,9 @@ tcp_attach(so, p) so->so_state |= nofd; return (ENOBUFS); } + if (nstat_collect) { + nstat_tcp_new_pcb(inp); + } tp->t_state = TCPS_CLOSED; return (0); } @@ -1425,7 +1639,7 @@ tcp_usrclosed(tp) soisdisconnected(tp->t_inpcb->inp_socket); /* To prevent the connection hanging in FIN_WAIT_2 forever. 
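The tcp_ctloutput() hunks above add three per-connection knobs: TCP_RXT_FINDROP (drop the connection once the FIN has been retransmitted three times), PERSIST_TIMEOUT, and TCP_RXT_CONNDROPTIME, the latter two taken in seconds and scaled by TCP_RETRANSHZ inside the kernel. A hypothetical usage sketch, assuming these private option constants are visible to the caller (they are not in the public tcp.h):

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Hypothetical: cap zero-window probing at 30s, drop a stalled
 * retransmitting connection after 120s, and arm FIN-drop. */
static int
tune_private_tcp_options(int fd)
{
    int persist_secs = 30;
    int rxt_drop_secs = 120;
    int on = 1;

    if (setsockopt(fd, IPPROTO_TCP, PERSIST_TIMEOUT,
        &persist_secs, sizeof(persist_secs)) == -1)
        return -1;
    if (setsockopt(fd, IPPROTO_TCP, TCP_RXT_CONNDROPTIME,
        &rxt_drop_secs, sizeof(rxt_drop_secs)) == -1)
        return -1;
    return setsockopt(fd, IPPROTO_TCP, TCP_RXT_FINDROP,
        &on, sizeof(on));
}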
*/ if (tp->t_state == TCPS_FIN_WAIT_2) - tp->t_timer[TCPT_2MSL] = tcp_maxidle; + tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, tcp_maxidle); } return (tp); } diff --git a/bsd/netinet/tcp_var.h b/bsd/netinet/tcp_var.h index 0fa518d78..4066829cb 100644 --- a/bsd/netinet/tcp_var.h +++ b/bsd/netinet/tcp_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,11 +79,57 @@ struct name { \ #define _TCPCB_LIST_HEAD(name, type) LIST_HEAD(name, type) #endif -#define TCP_RETRANSHZ 10 /* tcp retrans timer (100ms) per hz */ +#define TCP_RETRANSHZ 1000 /* granularity of TCP timestamps, 1ms */ +#define TCP_TIMERHZ 100 /* frequency of TCP fast timer, 100 ms */ + +/* Minimum time quantum within which the timers are coalesced */ +#define TCP_FASTTIMER_QUANTUM TCP_TIMERHZ /* fast mode, once every 100ms */ +#define TCP_SLOWTIMER_QUANTUM TCP_RETRANSHZ / PR_SLOWHZ /* slow mode, once every 500ms */ + +#define TCP_RETRANSHZ_TO_USEC 1000 #ifdef KERNEL_PRIVATE #define N_TIME_WAIT_SLOTS 128 /* must be power of 2 */ +/* Base RTT is stored for N_MIN_RTT_HISTORY slots. This is used to + * estimate expected minimum RTT for delay based congestion control + * algorithms. + */ +#define N_RTT_BASE 5 + +/* Always allow at least 4 packets worth of recv window when adjusting + * recv window using inter-packet arrival jitter. + */ +#define MIN_IAJ_WIN 4 + +/* A variation in delay of this many milliseconds is tolerable. This limit has to + * be low but greater than zero. We also use standard deviation on jitter to adjust + * this limit for different link and connection types. + */ +#define ALLOWED_IAJ 5 + +/* Ignore the first few packets on a connection until the ACK clock gets going + */ +#define IAJ_IGNORE_PKTCNT 40 + +/* Let the accumulated IAJ value increase by this threshold at most. This limit + * will control how many ALLOWED_IAJ measurements a receiver will have to see + * before opening the receive window + */ +#define ACC_IAJ_HIGH_THRESH 100 + +/* When accumulated IAJ reaches this value, the receiver starts to react by + * closing the window + */ +#define ACC_IAJ_REACT_LIMIT 200 + +/* If the number of small packets (smaller than IAJ packet size) seen on a + * connection is more than this threshold, reset the size and learn it again. + * This is needed because the sender might send smaller segments after PMTU + * discovery and the receiver has to learn the new size. + */ +#define RESET_IAJ_SIZE_THRESH 20 + /* * Kernel variables for tcp. 
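The constants above parameterize a receive-side inter-arrival jitter (IAJ) estimator: deviations beyond ALLOWED_IAJ accumulate in acc_iaj, accumulation past ACC_IAJ_HIGH_THRESH gates further window opening, and past ACC_IAJ_REACT_LIMIT the receiver reacts by closing the window. A loose sketch of such an accumulator -- the kernel's actual update, including its standard-deviation adjustment of the allowed variation, differs:

#include <stdint.h>

#define EX_ALLOWED_IAJ   5     /* tolerable variation, ms */
#define EX_ACC_IAJ_REACT 200   /* react by closing the window */

struct ex_iaj {
    uint32_t avg_iaj;          /* smoothed inter-arrival jitter, ms */
    uint32_t acc_iaj;          /* accumulated excess jitter */
};

/* Feed one inter-arrival gap (ms); nonzero return means the
 * receiver should start closing its advertised window. */
static int
ex_iaj_update(struct ex_iaj *st, uint32_t gap_ms)
{
    int32_t d = (int32_t)gap_ms - (int32_t)st->avg_iaj;
    uint32_t dev = (uint32_t)(d < 0 ? -d : d);

    st->avg_iaj += d / 8;      /* EWMA, 1/8 gain (illustrative choice) */

    if (dev > EX_ALLOWED_IAJ)
        st->acc_iaj += dev - EX_ALLOWED_IAJ;   /* accumulate the excess */
    else if (st->acc_iaj > 0)
        st->acc_iaj--;                         /* decay when smooth */

    return st->acc_iaj >= EX_ACC_IAJ_REACT;
}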
*/ @@ -133,9 +179,8 @@ struct tcptemp { struct tcpcb { struct tsegqe_head t_segq; int t_dupacks; /* consecutive dup acks recd */ - struct tcptemp *unused; /* unused now: was t_template */ - - int t_timer[TCPT_NTIMERS]; /* tcp timers */ + uint32_t t_timer[TCPT_NTIMERS]; /* tcp timers */ + struct tcptimerentry tentry; /* entry in timer list */ struct inpcb *t_inpcb; /* back pointer to internet pcb */ int t_state; /* state of this connection */ @@ -157,11 +202,9 @@ struct tcpcb { #define TF_RCVD_CC 0x04000 /* a CC was received in SYN */ #define TF_SENDCCNEW 0x08000 /* send CCnew instead of CC in SYN */ #define TF_MORETOCOME 0x10000 /* More data to be appended to sock */ -#define TF_LQ_OVERFLOW 0x20000 /* UNUSED listen queue overflow */ +#define TF_LOCAL 0x20000 /* connection to a host on local link */ #define TF_RXWIN0SENT 0x40000 /* sent a receiver win 0 in response */ #define TF_SLOWLINK 0x80000 /* route is a on a modem speed link */ - - #define TF_LASTIDLE 0x100000 /* connection was previously idle */ #define TF_FASTRECOVERY 0x200000 /* in NewReno Fast Recovery */ #define TF_WASFRECOVERY 0x400000 /* was in NewReno Fast Recovery */ @@ -172,6 +215,8 @@ struct tcpcb { #define TF_CLOSING 0x8000000 /* pending tcp close */ #define TF_TSO 0x10000000 /* TCP Segment Offloading is enable on this connection */ #define TF_BLACKHOLE 0x20000000 /* Path MTU Discovery Black Hole detection */ +#define TF_TIMER_ONLIST 0x40000000 /* pcb is on tcp_timer_list */ +#define TF_STRETCHACK 0x80000000 /* receiver is going to delay acks */ int t_force; /* 1 if forcing out a byte */ @@ -199,14 +244,14 @@ struct tcpcb { * for slow start exponential to * linear switch */ - u_int32_t snd_bandwidth; /* calculated bandwidth or 0 */ + u_int32_t snd_bandwidth; /* calculated bandwidth or 0 */ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ u_int t_maxopd; /* mss plus options */ - u_int32_t t_rcvtime; /* inactivity time */ - u_int32_t t_starttime; /* time connection was established */ - int t_rtttime; /* round trip time */ + u_int32_t t_rcvtime; /* time at which a packet was received */ + u_int32_t t_starttime; /* time connection was established */ + int t_rtttime; /* tcp clock when rtt calculation was started */ tcp_seq t_rtseq; /* sequence number being timed */ int t_bw_rtttime; /* used for bandwidth calculation */ @@ -220,7 +265,10 @@ struct tcpcb { int t_rxtshift; /* log(2) of rexmt exp. 
backoff */ u_int t_rttmin; /* minimum rtt allowed */ u_int t_rttbest; /* best rtt we've seen */ + u_int t_rttcur; /* most recent value of rtt */ u_int32_t t_rttupdated; /* number of times rtt sampled */ + u_int32_t rxt_conndroptime; /* retxmt conn gets dropped after this time, when set */ + u_int32_t rxt_start; /* time at a connection starts retransmitting */ u_int32_t max_sndwnd; /* largest window peer has offered */ int t_softerror; /* possible error not yet reported */ @@ -234,6 +282,7 @@ struct tcpcb { u_char rcv_scale; /* window scaling for recv window */ u_char request_r_scale; /* pending window scaling */ u_char requested_s_scale; + u_int16_t tcp_cc_index; /* index of congestion control algorithm */ u_int32_t ts_recent; /* timestamp echo data */ u_int32_t ts_recent_age; /* when last updated */ @@ -251,24 +300,36 @@ struct tcpcb { int t_keepidle; /* keepalive idle timer (override global if > 0) */ int t_lastchain; /* amount of packets chained last time around */ int t_unacksegs; /* received but unacked segments: used for delaying acks */ + u_int32_t t_persist_timeout; /* ZWP persistence limit as set by PERSIST_TIMEOUT */ + u_int32_t t_persist_stop; /* persistence limit deadline if triggered by ZWP */ /* 3529618 MSS overload prevention */ u_int32_t rcv_reset; u_int32_t rcv_pps; u_int32_t rcv_byps; - u_int32_t rcv_maxbyps; + u_int32_t rcv_maxbyps; + +/* Receiver state for stretch-ack algorithm */ + u_int32_t rcv_unackwin; /* to measure win for stretching acks */ + u_int32_t rcv_by_unackwin; /* bytes seen during the last ack-stretching win */ + u_int16_t rcv_waitforss; /* wait for packets during slow-start */ + u_int16_t ecn_flags; +#define TE_SETUPSENT 0x01 /* Indicate we have sent ECN-SETUP SYN or SYN-ACK */ +#define TE_SETUPRECEIVED 0x02 /* Indicate we have received ECN-SETUP SYN or SYN-ACK */ +#define TE_SENDIPECT 0x04 /* Indicate we haven't sent or received non-ECN-setup SYN or SYN-ACK */ +#define TE_SENDCWR 0x08 /* Indicate that the next non-retransmit should have the TCP CWR flag set */ +#define TE_SENDECE 0x10 /* Indicate that the next packet should have the TCP ECE flag set */ tcp_seq snd_high; /* for use in NewReno Fast Recovery */ tcp_seq snd_high_prev; /* snd_high prior to retransmit */ - tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */ u_char snd_limited; /* segments limited transmitted */ /* anti DoS counters */ u_int32_t rcv_second; /* start of interval second */ + /* SACK related state */ int sack_enable; /* enable SACK for this connection */ int snd_numholes; /* number of holes seen by sender */ - TAILQ_HEAD(sackhole_head, sackhole) snd_holes; /* SACK scoreboard (sorted) */ tcp_seq snd_fack; /* last seq number(+1) sack'd by rcv'r*/ @@ -277,18 +338,7 @@ struct tcpcb { tcp_seq sack_newdata; /* New data xmitted in this recovery episode starts at this seq number */ struct sackhint sackhint; /* SACK scoreboard hint */ - int t_rttlow; /* smallest observerved RTT */ - u_long ecn_flags; -#define TE_SETUPSENT 0x01 /* Indicate we have sent ECN-SETUP SYN or SYN-ACK */ -#define TE_SETUPRECEIVED 0x02 /* Indicate we have received ECN-SETUP SYN or SYN-ACK */ -#define TE_SENDIPECT 0x04 /* Indicate we haven't sent or received non-ECN-setup SYN or SYN-ACK */ -#define TE_SENDCWR 0x08 /* Indicate that the next non-retransmit should have the TCP CWR flag set */ -#define TE_SENDECE 0x10 /* Indicate that the next packet should have the TCP ECE flag set */ -#if TRAFFIC_MGT - u_int32_t tot_recv_snapshot; /* snapshot of global total pkts received */ - u_int32_t bg_recv_snapshot; 
/* snapshot of global background pkts received */ -#endif /* TRAFFIC_MGT */ u_int32_t t_pktlist_sentlen; /* total bytes in transmit chain */ struct mbuf *t_pktlist_head; /* First packet in transmit chain */ struct mbuf *t_pktlist_tail; /* Last packet in transmit chain */ @@ -296,12 +346,57 @@ struct tcpcb { int t_keepinit; /* connection timeout, i.e. idle time in SYN_SENT or SYN_RECV state */ u_int32_t tso_max_segment_size; /* TCP Segment Offloading maximum segment unit for NIC */ u_int t_pmtud_saved_maxopd; /* MSS saved before performing PMTU-D BlackHole detection */ + + struct + { + u_int32_t rxduplicatebytes; + u_int32_t rxoutoforderbytes; + u_int32_t txretransmitbytes; + u_int32_t unused_pad_to_8; + } t_stat; + + /* Background congestion related state */ + uint32_t rtt_hist[N_RTT_BASE]; /* history of minimum RTT */ + uint32_t rtt_count; /* Number of RTT samples in recent base history */ + uint32_t bg_ssthresh; /* Slow start threshold until delay increases */ + uint32_t t_flagsext; /* Another field to accommodate more flags */ +#define TF_RXTFINDROP 0x1 /* Drop conn after retransmitting FIN 3 times */ +#define TF_RCVUNACK_WAITSS 0x2 /* set when the receiver should not stretch acks */ + +#if TRAFFIC_MGT + /* Inter-arrival jitter related state */ + uint32_t iaj_rcv_ts; /* tcp clock when the first packet was received */ + uint16_t iaj_size; /* Size of packet for iaj measurement */ + uint16_t iaj_small_pkt; /* Count of packets smaller than iaj_size */ + uint16_t iaj_pktcnt; /* packet count, to avoid throttling initially */ + uint16_t acc_iaj; /* Accumulated iaj */ + tcp_seq iaj_rwintop; /* recent max advertised window */ + uint32_t avg_iaj; /* Mean */ + uint32_t std_dev_iaj; /* Standard deviation */ +#endif /* TRAFFIC_MGT */ }; #define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY) #define ENTER_FASTRECOVERY(tp) tp->t_flags |= TF_FASTRECOVERY #define EXIT_FASTRECOVERY(tp) tp->t_flags &= ~TF_FASTRECOVERY +#if CONFIG_DTRACE +enum tcp_cc_event { + TCP_CC_CWND_INIT, + TCP_CC_INSEQ_ACK_RCVD, + TCP_CC_ACK_RCVD, + TCP_CC_ENTER_FASTRECOVERY, + TCP_CC_IN_FASTRECOVERY, + TCP_CC_EXIT_FASTRECOVERY, + TCP_CC_PARTIAL_ACK, + TCP_CC_IDLE_TIMEOUT, + TCP_CC_REXMT_TIMEOUT, + TCP_CC_ECN_RCVD, + TCP_CC_BAD_REXMT_RECOVERY, + TCP_CC_OUTPUT_ERROR, + TCP_CC_CHANGE_ALGO +}; +#endif /* CONFIG_DTRACE */ /* * Structure to hold TCP options that are only used during segment @@ -346,18 +441,19 @@ struct rmxp_tao { #define sototcpcb(so) (intotcpcb(sotoinpcb(so))) /* - * The smoothed round-trip time and estimated variance + * The rtt measured is in milliseconds as the timestamp granularity is + * a millisecond. The smoothed round-trip time and estimated variance * are stored as fixed point numbers scaled by the values below. * For convenience, these scales are also used in smoothing the average * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed). - * With these scales, srtt has 3 bits to the right of the binary point, - * and thus an "ALPHA" of 0.875. rttvar has 2 bits to the right of the + * With these scales, srtt has 5 bits to the right of the binary point, + * and thus an "ALPHA" of 0.875. rttvar has 4 bits to the right of the * binary point, and is smoothed with an ALPHA of 0.75. */ #define TCP_RTT_SCALE 32 /* multiplier for srtt; 3 bits frac. */ -#define TCP_RTT_SHIFT 5 /* shift for srtt; 3 bits frac. */ -#define TCP_RTTVAR_SCALE 16 /* multiplier for rttvar; 2 bits */ -#define TCP_RTTVAR_SHIFT 4 /* shift for rttvar; 2 bits */ +#define TCP_RTT_SHIFT 5 /* shift for srtt; 5 bits frac. 
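The comment above now describes millisecond RTT samples smoothed in fixed point: srtt carries 5 fractional bits (TCP_RTT_SHIFT) and rttvar carries 4 (TCP_RTTVAR_SHIFT). The classic smoothing step, written standalone with those scales -- adding the unscaled error to the scaled estimate is what yields the 2^-shift gain:

#include <stdint.h>

#define EX_TCP_RTT_SHIFT    5   /* srtt keeps 5 fractional bits */
#define EX_TCP_RTTVAR_SHIFT 4   /* rttvar keeps 4 fractional bits */

/* One smoothing step: rtt is a sample in ms, srtt/rttvar are
 * scaled fixed-point state (ms << shift). */
static void
ex_rtt_sample(int32_t *srtt, int32_t *rttvar, int32_t rtt)
{
    int32_t delta;

    if (*srtt != 0) {
        delta = rtt - (*srtt >> EX_TCP_RTT_SHIFT);
        if ((*srtt += delta) <= 0)      /* gain 2^-5 on the estimate */
            *srtt = 1;
        if (delta < 0)
            delta = -delta;
        delta -= *rttvar >> EX_TCP_RTTVAR_SHIFT;
        if ((*rttvar += delta) <= 0)    /* gain 2^-4 on the deviation */
            *rttvar = 1;
    } else {
        /* first sample: seed the estimate, deviation = rtt / 2 */
        *srtt = rtt << EX_TCP_RTT_SHIFT;
        *rttvar = rtt << (EX_TCP_RTTVAR_SHIFT - 1);
    }
}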
*/ +#define TCP_RTTVAR_SCALE 16 /* multiplier for rttvar; 4 bits */ +#define TCP_RTTVAR_SHIFT 4 /* shift for rttvar; 4 bits */ #define TCP_DELTA_SHIFT 2 /* see tcp_input.c */ /* @@ -399,7 +495,7 @@ struct tcpcb { int t_dupacks; /* consecutive dup acks recd */ u_int32_t unused; /* unused now: was t_template */ - int t_timer[TCPT_NTIMERS]; /* tcp timers */ + int t_timer[TCPT_NTIMERS_EXT]; /* tcp timers */ _TCPCB_PTR(struct inpcb *) t_inpcb; /* back pointer to internet pcb */ int t_state; /* state of this connection */ @@ -452,7 +548,7 @@ struct tcpcb { */ u_int t_maxopd; /* mss plus options */ - u_int32_t t_rcvtime; /* inactivity time */ + u_int32_t t_rcvtime; /* time at which a packet was received */ u_int32_t t_starttime; /* time connection was established */ int t_rtttime; /* round trip time */ tcp_seq t_rtseq; /* sequence number being timed */ @@ -595,9 +691,8 @@ struct tcpstat { u_int32_t tcps_sack_send_blocks; /* SACK blocks (options) sent */ u_int32_t tcps_sack_sboverflow; /* SACK sendblock overflow */ -#if TRAFFIC_MGT u_int32_t tcps_bg_rcvtotal; /* total background packets received */ -#endif /* TRAFFIC_MGT */ + u_int32_t tcps_rxtfindrop; /* drop conn after retransmitting FIN */ }; #pragma pack(4) @@ -633,7 +728,7 @@ struct xtcpcb64 { u_int64_t t_segq; int t_dupacks; /* consecutive dup acks recd */ - int t_timer[TCPT_NTIMERS]; /* tcp timers */ + int t_timer[TCPT_NTIMERS_EXT]; /* tcp timers */ int t_state; /* state of this connection */ u_int t_flags; @@ -665,7 +760,7 @@ struct xtcpcb64 { */ u_int t_maxopd; /* mss plus options */ - u_int32_t t_rcvtime; /* inactivity time */ + u_int32_t t_rcvtime; /* time at which a packet was received */ u_int32_t t_starttime; /* time connection was established */ int t_rtttime; /* round trip time */ tcp_seq t_rtseq; /* sequence number being timed */ @@ -707,6 +802,87 @@ struct xtcpcb64 { #endif /* !CONFIG_EMBEDDED */ +#ifdef PRIVATE + +struct xtcpcb_n { + u_int32_t xt_len; + u_int32_t xt_kind; /* XSO_TCPCB */ + + u_int64_t t_segq; + int t_dupacks; /* consecutive dup acks recd */ + + int t_timer[TCPT_NTIMERS_EXT]; /* tcp timers */ + + int t_state; /* state of this connection */ + u_int t_flags; + + int t_force; /* 1 if forcing out a byte */ + + tcp_seq snd_una; /* send unacknowledged */ + tcp_seq snd_max; /* highest sequence number sent; + * used to recognize retransmits + */ + tcp_seq snd_nxt; /* send next */ + tcp_seq snd_up; /* send urgent pointer */ + + tcp_seq snd_wl1; /* window update seg seq number */ + tcp_seq snd_wl2; /* window update seg ack number */ + tcp_seq iss; /* initial send sequence number */ + tcp_seq irs; /* initial receive sequence number */ + + tcp_seq rcv_nxt; /* receive next */ + tcp_seq rcv_adv; /* advertised window */ + u_int32_t rcv_wnd; /* receive window */ + tcp_seq rcv_up; /* receive urgent pointer */ + + u_int32_t snd_wnd; /* send window */ + u_int32_t snd_cwnd; /* congestion-controlled window */ + u_int32_t snd_ssthresh; /* snd_cwnd size threshold for + * for slow start exponential to + * linear switch + */ + u_int t_maxopd; /* mss plus options */ + + u_int32_t t_rcvtime; /* time at which a packet was received */ + u_int32_t t_starttime; /* time connection was established */ + int t_rtttime; /* round trip time */ + tcp_seq t_rtseq; /* sequence number being timed */ + + int t_rxtcur; /* current retransmit value (ticks) */ + u_int t_maxseg; /* maximum segment size */ + int t_srtt; /* smoothed round-trip time */ + int t_rttvar; /* variance in round-trip time */ + + int t_rxtshift; /* log(2) of rexmt exp. 
backoff */ + u_int t_rttmin; /* minimum rtt allowed */ + u_int32_t t_rttupdated; /* number of times rtt sampled */ + u_int32_t max_sndwnd; /* largest window peer has offered */ + + int t_softerror; /* possible error not yet reported */ + /* out-of-band data */ + char t_oobflags; /* have some */ + char t_iobc; /* input character */ + /* RFC 1323 variables */ + u_char snd_scale; /* window scaling for send window */ + u_char rcv_scale; /* window scaling for recv window */ + u_char request_r_scale; /* pending window scaling */ + u_char requested_s_scale; + u_int32_t ts_recent; /* timestamp echo data */ + + u_int32_t ts_recent_age; /* when last updated */ + tcp_seq last_ack_sent; + /* RFC 1644 variables */ + tcp_cc cc_send; /* send connection count */ + tcp_cc cc_recv; /* receive connection count */ + tcp_seq snd_recover; /* for use in fast recovery */ + /* experimental */ + u_int32_t snd_cwnd_prev; /* cwnd prior to retransmit */ + u_int32_t snd_ssthresh_prev; /* ssthresh prior to retransmit */ + u_int32_t t_badrxtwin; /* window for retransmit recovery */ +}; + +#endif /* PRIVATE */ + #pragma pack() /* @@ -760,11 +936,14 @@ extern struct tcpstat tcpstat; /* tcp statistics */ extern int tcp_mssdflt; /* XXX */ extern int tcp_minmss; extern int tcp_minmssoverload; -extern int tcp_do_newreno; extern int ss_fltsz; extern int ss_fltsz_local; +extern int tcp_do_rfc3390; /* Calculate ss_fltsz according to RFC 3390 */ #ifdef __APPLE__ extern u_int32_t tcp_now; /* for RFC 1323 timestamps */ +extern struct timeval tcp_uptime; +extern lck_spin_t *tcp_uptime_lock; + extern int tcp_delack_enabled; #endif /* __APPLE__ */ @@ -782,7 +961,6 @@ int tcp_ctloutput(struct socket *, struct sockopt *); struct tcpcb * tcp_drop(struct tcpcb *, int); void tcp_drain(void); -void tcp_fasttimo(void *); struct rmxp_tao * tcp_gettaocache(struct inpcb *); void tcp_init(void) __attribute__((section("__TEXT, initcode"))); @@ -796,10 +974,13 @@ struct tcpcb * int tcp_output(struct tcpcb *); void tcp_respond(struct tcpcb *, void *, struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int, - unsigned int); + unsigned int, unsigned int); struct rtentry *tcp_rtlookup(struct inpcb *, unsigned int); void tcp_setpersist(struct tcpcb *); void tcp_slowtimo(void); +void tcp_check_timer_state(struct tcpcb *tp); +void tcp_run_timerlist(void *arg1, void *arg2); + struct tcptemp * tcp_maketemplate(struct tcpcb *); void tcp_fillheaders(struct tcpcb *, void *, void *); @@ -816,10 +997,16 @@ void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); void tcp_free_sackholes(struct tcpcb *tp); int32_t tcp_sbspace(struct tcpcb *tp); void tcp_set_tso(struct tcpcb *tp, struct ifnet *ifp); +void tcp_reset_stretch_ack(struct tcpcb *tp); +#if TRAFFIC_MGT +void reset_acc_iaj(struct tcpcb *tp); +#endif /* TRAFFIC_MGT */ int tcp_lock (struct socket *, int, void *); int tcp_unlock (struct socket *, int, void *); +void calculate_tcp_clock(void); + #ifdef _KERN_LOCKS_H_ lck_mtx_t * tcp_getlock (struct socket *, int); #else diff --git a/bsd/netinet/udp_usrreq.c b/bsd/netinet/udp_usrreq.c index 500cdcc90..37cc4153c 100644 --- a/bsd/netinet/udp_usrreq.c +++ b/bsd/netinet/udp_usrreq.c @@ -72,6 +72,8 @@ #include <sys/socketvar.h> #include <sys/sysctl.h> #include <sys/syslog.h> +#include <sys/mcache.h> +#include <net/ntstat.h> #include <kern/zalloc.h> @@ -121,35 +123,35 @@ static int udpcksum = 1; #else static int udpcksum = 0; /* XXX */ #endif -SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_RW, +SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, 
CTLFLAG_RW | CTLFLAG_LOCKED, &udpcksum, 0, ""); static u_int32_t udps_in_sw_cksum; -SYSCTL_UINT(_net_inet_udp, OID_AUTO, in_sw_cksum, CTLFLAG_RD, +SYSCTL_UINT(_net_inet_udp, OID_AUTO, in_sw_cksum, CTLFLAG_RD | CTLFLAG_LOCKED, &udps_in_sw_cksum, 0, "Number of received packets checksummed in software"); static u_int64_t udps_in_sw_cksum_bytes; -SYSCTL_QUAD(_net_inet_udp, OID_AUTO, in_sw_cksum_bytes, CTLFLAG_RD, +SYSCTL_QUAD(_net_inet_udp, OID_AUTO, in_sw_cksum_bytes, CTLFLAG_RD | CTLFLAG_LOCKED, &udps_in_sw_cksum_bytes, "Amount of received data checksummed in software"); static u_int32_t udps_out_sw_cksum; -SYSCTL_UINT(_net_inet_udp, OID_AUTO, out_sw_cksum, CTLFLAG_RD, +SYSCTL_UINT(_net_inet_udp, OID_AUTO, out_sw_cksum, CTLFLAG_RD | CTLFLAG_LOCKED, &udps_out_sw_cksum, 0, "Number of transmitted packets checksummed in software"); static u_int64_t udps_out_sw_cksum_bytes; -SYSCTL_QUAD(_net_inet_udp, OID_AUTO, out_sw_cksum_bytes, CTLFLAG_RD, +SYSCTL_QUAD(_net_inet_udp, OID_AUTO, out_sw_cksum_bytes, CTLFLAG_RD | CTLFLAG_LOCKED, &udps_out_sw_cksum_bytes, "Amount of transmitted data checksummed in software"); int log_in_vain = 0; -SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW, +SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_RW | CTLFLAG_LOCKED, &log_in_vain, 0, "Log all incoming UDP packets"); static int blackhole = 0; -SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW, +SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_RW | CTLFLAG_LOCKED, &blackhole, 0, "Do not send port unreachables for refused connects"); struct inpcbhead udb; /* from udp_var.h */ @@ -179,13 +181,13 @@ static int udp_gc_done = FALSE; /* Garbage collection performed last slowtimo */ #endif struct udpstat udpstat; /* from udp_var.h */ -SYSCTL_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RD, +SYSCTL_STRUCT(_net_inet_udp, UDPCTL_STATS, stats, CTLFLAG_RD | CTLFLAG_LOCKED, &udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); -SYSCTL_INT(_net_inet_udp, OID_AUTO, pcbcount, CTLFLAG_RD, +SYSCTL_INT(_net_inet_udp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED, &udbinfo.ipi_count, 0, "Number of active PCBs"); __private_extern__ int udp_use_randomport = 1; -SYSCTL_INT(_net_inet_udp, OID_AUTO, randomize_ports, CTLFLAG_RW, +SYSCTL_INT(_net_inet_udp, OID_AUTO, randomize_ports, CTLFLAG_RW | CTLFLAG_LOCKED, &udp_use_randomport, 0, "Randomize UDP port numbers"); #if INET6 @@ -254,13 +256,15 @@ udp_input(m, iphlen) register struct udphdr *uh; register struct inpcb *inp; struct mbuf *opts = 0; - int len; + int len, isbroadcast; struct ip save_ip; struct sockaddr *append_sa; struct inpcbinfo *pcbinfo = &udbinfo; struct sockaddr_in udp_in = { sizeof (udp_in), AF_INET, 0, { 0 }, { 0, 0, 0, 0, 0, 0, 0, 0 } }; + struct ip_moptions *imo = NULL; + int foundmembership = 0, ret = 0; #if INET6 struct udp_in6 udp_in6 = { { sizeof (udp_in6.uin6_sin), AF_INET6, 0, 0, @@ -365,8 +369,9 @@ doudpcksum: udpstat.udps_nosum++; #endif - if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || - in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { + isbroadcast = in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif); + + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || isbroadcast) { int reuse_sock = 0, mcast_delivered = 0; @@ -409,6 +414,11 @@ doudpcksum: if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif + if ((inp->inp_moptions == NULL) && + (ntohl(ip->ip_dst.s_addr) != INADDR_ALLHOSTS_GROUP) && + (isbroadcast == 0) ) + continue; + if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) { continue; @@ -441,6 +451,35 @@ 
doudpcksum: } } + if (isbroadcast == 0 && (ntohl(ip->ip_dst.s_addr) != INADDR_ALLHOSTS_GROUP)) { + if((imo = inp->inp_moptions) == NULL) { + udp_unlock(inp->inp_socket, 1, 0); + continue; + } else { + struct sockaddr_in group; + int blocked; + + IMO_LOCK(imo); + + bzero(&group, sizeof(struct sockaddr_in)); + group.sin_len = sizeof(struct sockaddr_in); + group.sin_family = AF_INET; + group.sin_addr = ip->ip_dst; + + blocked = imo_multi_filter(imo, m->m_pkthdr.rcvif, + (struct sockaddr *)&group, + (struct sockaddr *)&udp_in); + if (blocked == MCAST_PASS) + foundmembership = 1; + + IMO_UNLOCK(imo); + if (!foundmembership) { + udp_unlock(inp->inp_socket, 1, 0); + continue; + } + foundmembership = 0; + } + } reuse_sock = inp->inp_socket->so_options& (SO_REUSEPORT|SO_REUSEADDR); { #if IPSEC @@ -537,21 +576,9 @@ doudpcksum: } else if (payload_len == 4 && *(u_int32_t*)((caddr_t)uh + sizeof(struct udphdr)) != 0) { /* UDP encapsulated IPSec packet to pass through NAT */ - size_t stripsiz; - - stripsiz = sizeof(struct udphdr); - - ip = mtod(m, struct ip *); - ovbcopy((caddr_t)ip, (caddr_t)(((u_char *)ip) + stripsiz), iphlen); - m->m_data += stripsiz; - m->m_len -= stripsiz; - m->m_pkthdr.len -= stripsiz; - ip = mtod(m, struct ip *); - ip->ip_len = ip->ip_len - stripsiz; - ip->ip_p = IPPROTO_ESP; - KERNEL_DEBUG(DBG_FNC_UDP_INPUT | DBG_FUNC_END, 0,0,0,0,0); - esp4_input(m, iphlen); + /* preserve the udp header */ + esp4_input(m, iphlen + sizeof(struct udphdr)); return; } } @@ -624,8 +651,9 @@ doudpcksum: */ udp_in.sin_port = uh->uh_sport; udp_in.sin_addr = ip->ip_src; - if (inp->inp_flags & INP_CONTROLOPTS - || inp->inp_socket->so_options & SO_TIMESTAMP) { + if ((inp->inp_flags & INP_CONTROLOPTS) != 0 + || (inp->inp_socket->so_options & SO_TIMESTAMP) != 0 + || (inp->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { #if INET6 if (inp->inp_vflag & INP_IPV6) { int savedflags; @@ -633,11 +661,17 @@ doudpcksum: ip_2_ip6_hdr(&udp_ip6.uip6_ip6, ip); savedflags = inp->inp_flags; inp->inp_flags &= ~INP_UNMAPPABLEOPTS; - ip6_savecontrol(inp, &opts, &udp_ip6.uip6_ip6, m); + ret = ip6_savecontrol(inp, m, &opts); inp->inp_flags = savedflags; } else #endif - ip_savecontrol(inp, &opts, ip, m); + { + ret = ip_savecontrol(inp, &opts, ip, m); + } + if (ret != 0) { + udp_unlock(inp->inp_socket, 1, 0); + goto bad; + } } m_adj(m, iphlen + sizeof(struct udphdr)); @@ -651,10 +685,14 @@ doudpcksum: } else #endif append_sa = (struct sockaddr *)&udp_in; + if (nstat_collect) { + locked_add_64(&inp->inp_stat->rxpackets, 1); + locked_add_64(&inp->inp_stat->rxbytes, m->m_pkthdr.len); + } + so_recv_data_stat(inp->inp_socket, m, 0); if (sbappendaddr(&inp->inp_socket->so_rcv, append_sa, m, opts, NULL) == 0) { udpstat.udps_fullsock++; - } - else { + } else { sorwakeup(inp->inp_socket); } udp_unlock(inp->inp_socket, 1, 0); @@ -702,6 +740,7 @@ udp_append(struct inpcb *last, struct ip *ip, struct mbuf *n, int off, { struct sockaddr *append_sa; struct mbuf *opts = 0; + int ret = 0; #if CONFIG_MACF_NET if (mac_inpcb_check_deliver(last, n, AF_INET, SOCK_DGRAM) != 0) { @@ -709,8 +748,9 @@ udp_append(struct inpcb *last, struct ip *ip, struct mbuf *n, int off, return; } #endif - if (last->inp_flags & INP_CONTROLOPTS || - last->inp_socket->so_options & SO_TIMESTAMP) { + if ((last->inp_flags & INP_CONTROLOPTS) != 0 || + (last->inp_socket->so_options & SO_TIMESTAMP) != 0 || + (last->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { #if INET6 if (last->inp_vflag & INP_IPV6) { int savedflags; @@ -721,11 +761,20 @@ udp_append(struct 
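The udp_input() hunk above delivers a multicast datagram only if imo_multi_filter() passes it against the socket's per-group source filters. The state it evaluates is installed from userland with the RFC 3678 API; a sketch, assuming struct ip_mreq_source and IP_ADD_SOURCE_MEMBERSHIP are exported in this release:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

/* Join 232.1.1.10 but accept traffic only from 198.51.100.7;
 * udp_input's filter check drops datagrams from other sources. */
static int
join_ssm_group(int fd)
{
    struct ip_mreq_source mreqs;

    mreqs.imr_multiaddr.s_addr  = inet_addr("232.1.1.10");
    mreqs.imr_sourceaddr.s_addr = inet_addr("198.51.100.7");
    mreqs.imr_interface.s_addr  = htonl(INADDR_ANY);

    return setsockopt(fd, IPPROTO_IP, IP_ADD_SOURCE_MEMBERSHIP,
        &mreqs, sizeof(mreqs));
}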
inpcb *last, struct ip *ip, struct mbuf *n, int off, } savedflags = last->inp_flags; last->inp_flags &= ~INP_UNMAPPABLEOPTS; - ip6_savecontrol(last, &opts, &pudp_ip6->uip6_ip6, n); + ret = ip6_savecontrol(last, n, &opts); + if (ret != 0) { + last->inp_flags = savedflags; + goto error; + } last->inp_flags = savedflags; } else #endif - ip_savecontrol(last, &opts, ip, n); + { + ret = ip_savecontrol(last, &opts, ip, n); + if (ret != 0) { + goto error; + } + } } #if INET6 if (last->inp_vflag & INP_IPV6) { @@ -737,11 +786,22 @@ udp_append(struct inpcb *last, struct ip *ip, struct mbuf *n, int off, } else #endif append_sa = (struct sockaddr *)pudp_in; + if (nstat_collect) { + locked_add_64(&last->inp_stat->rxpackets, 1); + locked_add_64(&last->inp_stat->rxbytes, n->m_pkthdr.len); + } + so_recv_data_stat(last->inp_socket, n, 0); m_adj(n, off); if (sbappendaddr(&last->inp_socket->so_rcv, append_sa, n, opts, NULL) == 0) { udpstat.udps_fullsock++; - } else + } else { sorwakeup(last->inp_socket); + } + return; +error: + m_freem(n); + m_freem(opts); + return; } /* @@ -952,7 +1012,7 @@ udp_pcblist SYSCTL_HANDLER_ARGS return error; } -SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0, +SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, udp_pcblist, "S,xinpcb", "List of active UDP sockets"); #if !CONFIG_EMBEDDED @@ -1055,11 +1115,27 @@ udp_pcblist64 SYSCTL_HANDLER_ARGS return error; } -SYSCTL_PROC(_net_inet_udp, OID_AUTO, pcblist64, CTLFLAG_RD, 0, 0, +SYSCTL_PROC(_net_inet_udp, OID_AUTO, pcblist64, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, udp_pcblist64, "S,xinpcb64", "List of active UDP sockets"); #endif /* !CONFIG_EMBEDDED */ +static int +udp_pcblist_n SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + int error = 0; + + error = get_pcblist_n(IPPROTO_UDP, req, &udbinfo); + + return error; +} + + +SYSCTL_PROC(_net_inet_udp, OID_AUTO, pcblist_n, CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, + udp_pcblist_n, "S,xinpcb_n", "List of active UDP sockets"); + + static __inline__ u_int16_t get_socket_id(struct socket * s) { @@ -1075,6 +1151,69 @@ get_socket_id(struct socket * s) return (val); } +static int +udp_check_pktinfo(struct mbuf *control, unsigned int *ifindex, struct in_addr *laddr) +{ + struct cmsghdr *cm = 0; + struct in_pktinfo *pktinfo; + struct ifnet *ifp; + + /* + * XXX: Currently, we assume all the optional information is stored + * in a single mbuf. 
+ */ + if (control->m_next) + return (EINVAL); + + if (control->m_len < CMSG_LEN(0)) + return (EINVAL); + + for (cm = M_FIRST_CMSGHDR(control); cm; cm = M_NXT_CMSGHDR(control, cm)) { + if (cm->cmsg_len < sizeof(struct cmsghdr) || cm->cmsg_len > control->m_len) + return (EINVAL); + + if (cm->cmsg_level != IPPROTO_IP || cm->cmsg_type != IP_PKTINFO) + continue; + + if (cm->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo))) + return (EINVAL); + + pktinfo = (struct in_pktinfo *)CMSG_DATA(cm); + + /* Check for a valid ifindex in pktinfo */ + ifnet_head_lock_shared(); + + if (pktinfo->ipi_ifindex > if_index) { + ifnet_head_done(); + return (ENXIO); + } + + /* If ipi_ifindex is specified it takes precedence over ipi_spec_dst */ + + if (pktinfo->ipi_ifindex) { + ifp = ifindex2ifnet[pktinfo->ipi_ifindex]; + if (ifp == NULL) { + ifnet_head_done(); + return (ENXIO); + } + + ifnet_head_done(); + + *ifindex = pktinfo->ipi_ifindex; + laddr->s_addr = INADDR_ANY; + break; + } + + ifnet_head_done(); + + /* Use the provided ipi_spec_dst address for temp source address */ + *ifindex = 0; + *laddr = pktinfo->ipi_spec_dst; + break; + } + return (0); +} + static int udp_output(inp, m, addr, control, p) register struct inpcb *inp; @@ -1086,28 +1225,34 @@ udp_output(inp, m, addr, control, p) register struct udpiphdr *ui; register int len = m->m_pkthdr.len; struct sockaddr_in *sin; - struct in_addr origladdr, laddr, faddr; + struct in_addr origladdr, laddr, faddr, pi_laddr; u_short lport, fport; - struct sockaddr_in *ifaddr; - int error = 0, udp_dodisconnect = 0; + struct sockaddr_in ifaddr; + int error = 0, udp_dodisconnect = 0, pktinfo = 0; struct socket *so = inp->inp_socket; int soopts = 0; struct mbuf *inpopts; struct ip_moptions *mopts; struct route ro; - struct ip_out_args ipoa; -#if PKT_PRIORITY - mbuf_traffic_class_t mtc = MBUF_TC_NONE; -#endif /* PKT_PRIORITY */ + struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; + mbuf_traffic_class_t mtc = MBUF_TC_UNSPEC; + unsigned int origoutif; + + pi_laddr.s_addr = INADDR_ANY; KERNEL_DEBUG(DBG_FNC_UDP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); if (control != NULL) { -#if PKT_PRIORITY mtc = mbuf_traffic_class_from_control(control); -#endif /* PKT_PRIORITY */ + + error = udp_check_pktinfo(control, &ipoa.ipoa_boundif, &pi_laddr); + m_freem(control); + if (error) + goto release; + pktinfo++; } + KERNEL_DEBUG(DBG_LAYER_OUT_BEG, inp->inp_fport, inp->inp_lport, inp->inp_laddr.s_addr, inp->inp_faddr.s_addr, (htons((u_short)len + sizeof (struct udphdr)))); @@ -1117,11 +1262,16 @@ udp_output(inp, m, addr, control, p) goto release; } - lck_mtx_assert(inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED); + lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED); - /* If socket was bound to an ifindex, tell ip_output about it */ - ipoa.ipoa_ifscope = (inp->inp_flags & INP_BOUND_IF) ? - inp->inp_boundif : IFSCOPE_NONE; + /* + * If socket was bound to an ifindex, tell ip_output about it. + * If the ancillary IP_PKTINFO option contains an interface index, + * it takes precedence over the one specified by IP_BOUND_IF. + */ + if (ipoa.ipoa_boundif == IFSCOPE_NONE && (inp->inp_flags & INP_BOUND_IF)) + ipoa.ipoa_boundif = inp->inp_boundif; + ipoa.ipoa_nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; soopts |= IP_OUTARGS; /* If there was a routing change, discard cached route and check @@ -1134,22 +1284,45 @@ udp_output(inp, m, addr, control, p) /* src address is gone? 
*/ if ((ia = ifa_foraddr(inp->inp_laddr.s_addr)) == NULL) { - if (inp->inp_flags & INP_INADDR_ANY) { - /* new src will be set later */ - inp->inp_laddr.s_addr = INADDR_ANY; - } else { + if (((inp->inp_flags & INP_INADDR_ANY) == 0) || (so->so_state & SS_ISCONNECTED)) { + /* Rdar://5448998 + * If the source address is gone, return an error if: + * - the source was specified + * - the socket was already connected + */ error = EADDRNOTAVAIL; goto release; + } else { + /* new src will be set later */ + inp->inp_laddr.s_addr = INADDR_ANY; + inp->inp_last_outif = 0; } } if (ia != NULL) - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); if (inp->inp_route.ro_rt != NULL) rtfree(inp->inp_route.ro_rt); inp->inp_route.ro_rt = NULL; } - origladdr= laddr = inp->inp_laddr; + origoutif = inp->inp_last_outif; + + /* IP_PKTINFO option check. + * If a temporary scope or src address is provided, use it for this packet only + * and make sure we forget it after sending this datagram. + */ + + if (pi_laddr.s_addr != INADDR_ANY || + (ipoa.ipoa_boundif != IFSCOPE_NONE && pktinfo)) { + laddr = pi_laddr; /* temp src address for this datagram only */ + origladdr.s_addr = INADDR_ANY; + udp_dodisconnect = 1; /* we don't want to keep the laddr or route */ + inp->inp_flags |= INP_INADDR_ANY; /* remember we don't care about src addr.*/ + } else { + origladdr = laddr = inp->inp_laddr; + } + + origoutif = inp->inp_last_outif; faddr = inp->inp_faddr; lport = inp->inp_lport; fport = inp->inp_fport; @@ -1165,8 +1338,11 @@ udp_output(inp, m, addr, control, p) * In case we don't have a local port set, go through the full connect. * We don't have a local port yet (ie, we can't be looked up), * so it's not an issue if the input runs at the same time we do this. - */ - error = in_pcbconnect(inp, addr, p); + */ + + if (pi_laddr.s_addr != INADDR_ANY) /* if we have a source address specified, use that */ + inp->inp_laddr = pi_laddr; + error = in_pcbconnect(inp, addr, p, &ipoa.ipoa_boundif); /* if a scope is specified, use it */ if (error) { goto release; } @@ -1176,19 +1352,21 @@ udp_output(inp, m, addr, control, p) fport = inp->inp_fport; udp_dodisconnect = 1; } - else { + else { /* Fast path case * we have a full address and a local port. * use those info to build the packet without changing the pcb * and interfering with the input path. See 3851370 + * Note: if we may have a scope from IP_PKTINFO but the + * priority is always given to the scope provided by INP_BOUND_IF. 
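udp_check_pktinfo() above validates an IP_PKTINFO control message, and udp_output() then uses ipi_ifindex (or ipi_spec_dst) as a one-shot scope and source address for that datagram only, restoring the pcb state afterwards. A userland sketch of attaching the ancillary data, assuming IP_PKTINFO and struct in_pktinfo are exported in this release:

#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/uio.h>

/* Send one datagram with a per-packet outbound interface chosen
 * via IP_PKTINFO ancillary data. */
static ssize_t
send_with_pktinfo(int fd, struct sockaddr_in *dst,
    void *buf, size_t len, unsigned int ifindex)
{
    char cbuf[CMSG_SPACE(sizeof(struct in_pktinfo))];
    struct iovec iov = { buf, len };
    struct msghdr msg;
    struct cmsghdr *cm;
    struct in_pktinfo pi;

    memset(&msg, 0, sizeof(msg));
    memset(cbuf, 0, sizeof(cbuf));
    memset(&pi, 0, sizeof(pi));
    pi.ipi_ifindex = ifindex;    /* wins over ipi_spec_dst if nonzero */

    msg.msg_name = dst;
    msg.msg_namelen = sizeof(*dst);
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;
    msg.msg_control = cbuf;
    msg.msg_controllen = sizeof(cbuf);

    cm = CMSG_FIRSTHDR(&msg);
    cm->cmsg_level = IPPROTO_IP;
    cm->cmsg_type = IP_PKTINFO;
    cm->cmsg_len = CMSG_LEN(sizeof(pi));
    memcpy(CMSG_DATA(cm), &pi, sizeof(pi));

    return sendmsg(fd, &msg, 0);
}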
*/ if (laddr.s_addr == INADDR_ANY) { - if ((error = in_pcbladdr(inp, addr, &ifaddr)) != 0) + if ((error = in_pcbladdr(inp, addr, &ifaddr, &ipoa.ipoa_boundif)) != 0) goto release; - laddr = ifaddr->sin_addr; + laddr = ifaddr.sin_addr; inp->inp_flags |= INP_INADDR_ANY; /* from pcbconnect: remember we don't care about src addr.*/ } - + faddr = sin->sin_addr; fport = sin->sin_port; } @@ -1256,68 +1434,63 @@ udp_output(inp, m, addr, control, p) inpopts = inp->inp_options; soopts |= (inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST)); mopts = inp->inp_moptions; + if (mopts != NULL) + IMO_ADDREF(mopts); /* Copy the cached route and take an extra reference */ inp_route_copyout(inp, &ro); -#if PKT_PRIORITY - set_traffic_class(m, so, mtc); -#endif /* PKT_PRIORITY */ + set_packet_tclass(m, so, mtc, 0); socket_unlock(so, 0); - /* XXX jgraessley please look at XXX */ error = ip_output_list(m, 0, inpopts, &ro, soopts, mopts, &ipoa); + m = NULL; socket_lock(so, 0); + if (mopts != NULL) + IMO_REMREF(mopts); + if (error == 0 && nstat_collect) { + locked_add_64(&inp->inp_stat->txpackets, 1); + locked_add_64(&inp->inp_stat->txbytes, len); + } /* Synchronize PCB cached route */ inp_route_copyin(inp, &ro); +abort: if (udp_dodisconnect) { -#if IFNET_ROUTE_REFCNT /* Always discard the cached route for unconnected socket */ if (inp->inp_route.ro_rt != NULL) { rtfree(inp->inp_route.ro_rt); inp->inp_route.ro_rt = NULL; } -#endif /* IFNET_ROUTE_REFCNT */ in_pcbdisconnect(inp); inp->inp_laddr = origladdr; /* XXX rehash? */ - } -#if IFNET_ROUTE_REFCNT - else if (inp->inp_route.ro_rt != NULL && - (inp->inp_route.ro_rt->rt_flags & (RTF_MULTICAST|RTF_BROADCAST))) { - /* Always discard non-unicast cached route */ - rtfree(inp->inp_route.ro_rt); - inp->inp_route.ro_rt = NULL; - } -#endif /* IFNET_ROUTE_REFCNT */ + inp->inp_last_outif = origoutif; + } else if (inp->inp_route.ro_rt != NULL) { + struct rtentry *rt = inp->inp_route.ro_rt; + unsigned int outif; - KERNEL_DEBUG(DBG_FNC_UDP_OUTPUT | DBG_FUNC_END, error, 0,0,0,0); - return (error); - -abort: - if (udp_dodisconnect) { -#if IFNET_ROUTE_REFCNT - /* Always discard the cached route for unconnected socket */ - if (inp->inp_route.ro_rt != NULL) { + if (rt->rt_flags & (RTF_MULTICAST|RTF_BROADCAST)) + rt = NULL; /* unusable */ + /* + * Always discard if it is a multicast or broadcast route. + */ + if (rt == NULL) { rtfree(inp->inp_route.ro_rt); inp->inp_route.ro_rt = NULL; } -#endif /* IFNET_ROUTE_REFCNT */ - in_pcbdisconnect(inp); - inp->inp_laddr = origladdr; /* XXX rehash? */ - } -#if IFNET_ROUTE_REFCNT - else if (inp->inp_route.ro_rt != NULL && - (inp->inp_route.ro_rt->rt_flags & (RTF_MULTICAST|RTF_BROADCAST))) { - /* Always discard non-unicast cached route */ - rtfree(inp->inp_route.ro_rt); - inp->inp_route.ro_rt = NULL; + /* + * If the destination route is unicast, update outif with + * that of the route interface index used by IP. 
+ */ + if (rt != NULL && + (outif = rt->rt_ifp->if_index) != inp->inp_last_outif) + inp->inp_last_outif = outif; } -#endif /* IFNET_ROUTE_REFCNT */ release: - m_freem(m); + if (m != NULL) + m_freem(m); KERNEL_DEBUG(DBG_FNC_UDP_OUTPUT | DBG_FUNC_END, error, 0,0,0,0); return (error); } @@ -1362,10 +1535,10 @@ sysctl_udp_sospace(struct sysctl_oid *oidp, __unused void *arg1, return error; } -SYSCTL_PROC(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLTYPE_INT | CTLFLAG_RW, +SYSCTL_PROC(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &udp_recvspace, 0, &sysctl_udp_sospace, "IU", "Maximum incoming UDP datagram size"); -SYSCTL_PROC(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLTYPE_INT | CTLFLAG_RW, +SYSCTL_PROC(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &udp_sendspace, 0, &sysctl_udp_sospace, "IU", "Maximum outgoing UDP datagram size"); static int @@ -1400,6 +1573,7 @@ udp_attach(struct socket *so, __unused int proto, struct proc *p) inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV4; inp->inp_ip_ttl = ip_defttl; + nstat_udp_new_pcb(inp); return 0; } @@ -1431,7 +1605,7 @@ udp_connect(struct socket *so, struct sockaddr *nam, struct proc *p) return EINVAL; if (inp->inp_faddr.s_addr != INADDR_ANY) return EISCONN; - error = in_pcbconnect(inp, nam, p); + error = in_pcbconnect(inp, nam, p, NULL); if (error == 0) soisconnected(so); return error; @@ -1464,6 +1638,7 @@ udp_disconnect(struct socket *so) in_pcbdisconnect(inp); inp->inp_laddr.s_addr = INADDR_ANY; so->so_state &= ~SS_ISCONNECTED; /* XXX */ + inp->inp_last_outif = 0; return 0; } @@ -1514,9 +1689,9 @@ udp_lock(struct socket *so, int refcount, void *debug) lr_saved = debug; if (so->so_pcb) { - lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, + lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_NOTOWNED); - lck_mtx_lock(((struct inpcb *)so->so_pcb)->inpcb_mtx); + lck_mtx_lock(&((struct inpcb *)so->so_pcb)->inpcb_mtx); } else { panic("udp_lock: so=%p NO PCB! lr=%p lrh= %s\n", so, lr_saved, solockhistory_nr(so)); @@ -1548,11 +1723,11 @@ udp_unlock(struct socket *so, int refcount, void *debug) so, lr_saved, solockhistory_nr(so)); /* NOTREACHED */ } else { - lck_mtx_assert(((struct inpcb *)so->so_pcb)->inpcb_mtx, + lck_mtx_assert(&((struct inpcb *)so->so_pcb)->inpcb_mtx, LCK_MTX_ASSERT_OWNED); so->unlock_lr[so->next_unlock_lr] = lr_saved; so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX; - lck_mtx_unlock(((struct inpcb *)so->so_pcb)->inpcb_mtx); + lck_mtx_unlock(&((struct inpcb *)so->so_pcb)->inpcb_mtx); } @@ -1566,7 +1741,7 @@ udp_getlock(struct socket *so, __unused int locktype) if (so->so_pcb) - return(inp->inpcb_mtx); + return(&inp->inpcb_mtx); else { panic("udp_getlock: so=%p NULL so_pcb lrh= %s\n", so, solockhistory_nr(so)); @@ -1598,7 +1773,7 @@ udp_slowtimo() continue; so = inp->inp_socket; - if (!lck_mtx_try_lock(inp->inpcb_mtx)) /* skip if busy, no hurry for cleanup... */ + if (!lck_mtx_try_lock(&inp->inpcb_mtx)) /* skip if busy, no hurry for cleanup... 
*/ continue; if (so->so_usecount == 0) { @@ -1612,7 +1787,7 @@ udp_slowtimo() } in_pcbdispose(inp); } else { - lck_mtx_unlock(inp->inpcb_mtx); + lck_mtx_unlock(&inp->inpcb_mtx); } } lck_rw_done(pcbinfo->mtx); diff --git a/bsd/netinet6/Makefile b/bsd/netinet6/Makefile index fc12c8bef..f765bace4 100644 --- a/bsd/netinet6/Makefile +++ b/bsd/netinet6/Makefile @@ -9,14 +9,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ DATAFILES = \ @@ -25,12 +21,12 @@ DATAFILES = \ in6_var.h ip6_mroute.h nd6.h ip6_fw.h PRIVATE_DATAFILES = \ - in6_pcb.h ip6_var.h pim6_var.h + in6_pcb.h ip6_var.h pim6_var.h mld6_var.h PRIVATE_KERNELFILES = \ ah6.h esp6.h esp_rijndael.h in6_gif.h in6_ifattach.h \ in6_prefix.h ip6_ecn.h ip6_fw.h \ - ip6protosw.h ipcomp6.h ipsec6.h mld6_var.h \ + ip6protosw.h ipcomp6.h ipsec6.h \ raw_ip6.h scope6_var.h tcp6_var.h udp6_var.h INSTALL_MI_LIST = ${DATAFILES} diff --git a/bsd/netinet6/ah.h b/bsd/netinet6/ah.h index f77826a0d..cf9ddad2a 100644 --- a/bsd/netinet6/ah.h +++ b/bsd/netinet6/ah.h @@ -71,7 +71,7 @@ struct ah_algorithm { const char *name; int (*init)(struct ah_algorithm_state *, struct secasvar *); void (*update)(struct ah_algorithm_state *, caddr_t, size_t); - void (*result)(struct ah_algorithm_state *, caddr_t); + void (*result)(struct ah_algorithm_state *, caddr_t, size_t); }; #define AH_MAXSUMSIZE 64 // sha2-512's output size diff --git a/bsd/netinet6/ah6.h b/bsd/netinet6/ah6.h index 22cda6f12..688f946d5 100644 --- a/bsd/netinet6/ah6.h +++ b/bsd/netinet6/ah6.h @@ -41,7 +41,7 @@ #ifdef KERNEL_PRIVATE struct secasvar; -extern int ah6_input(struct mbuf **, int *); +extern int ah6_input(struct mbuf **, int *, int); extern int ah6_output(struct mbuf *, u_char *, struct mbuf *, struct secasvar *); extern int ah6_calccksum(struct mbuf *, caddr_t, size_t, diff --git a/bsd/netinet6/ah_core.c b/bsd/netinet6/ah_core.c index 042550b78..27098a76f 100644 --- a/bsd/netinet6/ah_core.c +++ b/bsd/netinet6/ah_core.c @@ -119,42 +119,42 @@ static int ah_sumsiz_zero(struct secasvar *); static int ah_none_mature(struct secasvar *); static int ah_none_init(struct ah_algorithm_state *, struct secasvar *); static void ah_none_loop(struct ah_algorithm_state *, caddr_t, size_t); -static void ah_none_result(struct ah_algorithm_state *, caddr_t); +static void ah_none_result(struct ah_algorithm_state *, caddr_t, size_t); static int ah_keyed_md5_mature(struct secasvar *); static int ah_keyed_md5_init(struct ah_algorithm_state *, struct secasvar *); static void ah_keyed_md5_loop(struct ah_algorithm_state *, caddr_t, size_t); -static void ah_keyed_md5_result(struct ah_algorithm_state *, caddr_t); +static void ah_keyed_md5_result(struct ah_algorithm_state *, caddr_t, size_t); static int ah_keyed_sha1_mature(struct secasvar *); static int ah_keyed_sha1_init(struct ah_algorithm_state *, struct secasvar *); static void ah_keyed_sha1_loop(struct ah_algorithm_state *, caddr_t, size_t); -static void ah_keyed_sha1_result(struct ah_algorithm_state *, caddr_t); +static void ah_keyed_sha1_result(struct ah_algorithm_state *, caddr_t, size_t); static int ah_hmac_md5_mature(struct secasvar *); static int ah_hmac_md5_init(struct ah_algorithm_state *, struct secasvar *); static void ah_hmac_md5_loop(struct ah_algorithm_state *, caddr_t, size_t); -static void ah_hmac_md5_result(struct ah_algorithm_state *, caddr_t); +static void ah_hmac_md5_result(struct ah_algorithm_state *, caddr_t, size_t); static int 
ah_hmac_sha1_mature(struct secasvar *); static int ah_hmac_sha1_init(struct ah_algorithm_state *, struct secasvar *); static void ah_hmac_sha1_loop(struct ah_algorithm_state *, caddr_t, size_t); -static void ah_hmac_sha1_result(struct ah_algorithm_state *, caddr_t); +static void ah_hmac_sha1_result(struct ah_algorithm_state *, caddr_t, size_t); #if ALLCRYPTO static int ah_sumsiz_sha2_256(struct secasvar *); static int ah_hmac_sha2_256_mature(struct secasvar *); static int ah_hmac_sha2_256_init(struct ah_algorithm_state *, struct secasvar *); static void ah_hmac_sha2_256_loop(struct ah_algorithm_state *, caddr_t, size_t); -static void ah_hmac_sha2_256_result(struct ah_algorithm_state *, caddr_t); +static void ah_hmac_sha2_256_result(struct ah_algorithm_state *, caddr_t, size_t); static int ah_sumsiz_sha2_384(struct secasvar *); static int ah_hmac_sha2_384_mature(struct secasvar *); static int ah_hmac_sha2_384_init(struct ah_algorithm_state *, struct secasvar *); static void ah_hmac_sha2_384_loop(struct ah_algorithm_state *, caddr_t, size_t); -static void ah_hmac_sha2_384_result(struct ah_algorithm_state *, caddr_t); +static void ah_hmac_sha2_384_result(struct ah_algorithm_state *, caddr_t, size_t); static int ah_sumsiz_sha2_512(struct secasvar *); static int ah_hmac_sha2_512_mature(struct secasvar *); static int ah_hmac_sha2_512_init(struct ah_algorithm_state *, struct secasvar *); static void ah_hmac_sha2_512_loop(struct ah_algorithm_state *, caddr_t, size_t); -static void ah_hmac_sha2_512_result(struct ah_algorithm_state *, caddr_t); +static void ah_hmac_sha2_512_result(struct ah_algorithm_state *, caddr_t, size_t); #endif /* ALLCRYPTO */ static void ah_update_mbuf(struct mbuf *, int, int, @@ -280,7 +280,8 @@ ah_none_loop( static void ah_none_result( __unused struct ah_algorithm_state *state, - __unused caddr_t addr) + __unused caddr_t addr, + __unused size_t l) { } @@ -363,9 +364,10 @@ ah_keyed_md5_loop(state, addr, len) } static void -ah_keyed_md5_result(state, addr) +ah_keyed_md5_result(state, addr, l) struct ah_algorithm_state *state; caddr_t addr; + size_t l; { u_char digest[16]; @@ -379,7 +381,7 @@ ah_keyed_md5_result(state, addr) } MD5Final(&digest[0], (MD5_CTX *)state->foo); FREE(state->foo, M_TEMP); - bcopy(&digest[0], (void *)addr, sizeof(digest)); + bcopy(&digest[0], (void *)addr, sizeof(digest) > l ? l : sizeof(digest)); } static int @@ -484,9 +486,10 @@ ah_keyed_sha1_loop(state, addr, len) } static void -ah_keyed_sha1_result(state, addr) +ah_keyed_sha1_result(state, addr, l) struct ah_algorithm_state *state; caddr_t addr; + size_t l; { u_char digest[SHA1_RESULTLEN]; /* SHA-1 generates 160 bits */ SHA1_CTX *ctxt; @@ -500,7 +503,7 @@ ah_keyed_sha1_result(state, addr) (u_int)_KEYLEN(state->sav->key_auth)); } SHA1Final((caddr_t)&digest[0], ctxt); - bcopy(&digest[0], (void *)addr, HMACSIZE); + bcopy(&digest[0], (void *)addr, sizeof(digest) > l ? l : sizeof(digest)); FREE(state->foo, M_TEMP); } @@ -601,9 +604,10 @@ ah_hmac_md5_loop(state, addr, len) } static void -ah_hmac_md5_result(state, addr) +ah_hmac_md5_result(state, addr, l) struct ah_algorithm_state *state; caddr_t addr; + size_t l; { u_char digest[16]; u_char *ipad; @@ -624,7 +628,7 @@ ah_hmac_md5_result(state, addr) MD5Update(ctxt, &digest[0], sizeof(digest)); MD5Final(&digest[0], ctxt); - bcopy(&digest[0], (void *)addr, HMACSIZE); + bcopy(&digest[0], (void *)addr, sizeof(digest) > l ? 
l : sizeof(digest)); FREE(state->foo, M_TEMP); } @@ -727,9 +731,10 @@ ah_hmac_sha1_loop(state, addr, len) } static void -ah_hmac_sha1_result(state, addr) +ah_hmac_sha1_result(state, addr, l) struct ah_algorithm_state *state; caddr_t addr; + size_t l; { u_char digest[SHA1_RESULTLEN]; /* SHA-1 generates 160 bits */ u_char *ipad; @@ -750,7 +755,7 @@ ah_hmac_sha1_result(state, addr) SHA1Update(ctxt, (caddr_t)&digest[0], sizeof(digest)); SHA1Final((caddr_t)&digest[0], ctxt); - bcopy(&digest[0], (void *)addr, HMACSIZE); + bcopy(&digest[0], (void *)addr, sizeof(digest) > l ? l : sizeof(digest)); FREE(state->foo, M_TEMP); } @@ -869,10 +874,12 @@ ah_hmac_sha2_256_loop(state, addr, len) } static void -ah_hmac_sha2_256_result(state, addr) +ah_hmac_sha2_256_result(state, addr, l) struct ah_algorithm_state *state; caddr_t addr; + size_t l; { + u_char digest[SHA256_DIGEST_LENGTH]; u_char *ipad; u_char *opad; SHA256_CTX *ctxt; @@ -884,13 +891,14 @@ ah_hmac_sha2_256_result(state, addr) opad = (u_char *)(ipad + 64); ctxt = (SHA256_CTX *)(opad + 64); - SHA256_Final((u_int8_t *)addr, ctxt); + SHA256_Final((u_int8_t *)digest, ctxt); - bzero(ctxt, sizeof(*ctxt)); SHA256_Init(ctxt); SHA256_Update(ctxt, opad, 64); - SHA256_Update(ctxt, (const u_int8_t *)addr, SHA256_DIGEST_LENGTH); - SHA256_Final((u_int8_t *)addr, ctxt); + SHA256_Update(ctxt, (const u_int8_t *)digest, sizeof(digest)); + SHA256_Final((u_int8_t *)digest, ctxt); + + bcopy(&digest[0], (void *)addr, sizeof(digest) > l ? l : sizeof(digest)); FREE(state->foo, M_TEMP); } @@ -1009,10 +1017,12 @@ ah_hmac_sha2_384_loop(state, addr, len) } static void -ah_hmac_sha2_384_result(state, addr) +ah_hmac_sha2_384_result(state, addr, l) struct ah_algorithm_state *state; caddr_t addr; + size_t l; { + u_char digest[SHA384_DIGEST_LENGTH]; u_char *ipad; u_char *opad; SHA384_CTX *ctxt; @@ -1024,13 +1034,14 @@ ah_hmac_sha2_384_result(state, addr) opad = (u_char *)(ipad + 128); ctxt = (SHA384_CTX *)(opad + 128); - SHA384_Final((u_int8_t *)addr, ctxt); + SHA384_Final((u_int8_t *)digest, ctxt); - bzero(ctxt, sizeof(*ctxt)); SHA384_Init(ctxt); SHA384_Update(ctxt, opad, 128); - SHA384_Update(ctxt, (const u_int8_t *)addr, SHA384_DIGEST_LENGTH); - SHA384_Final((u_int8_t *)addr, ctxt); + SHA384_Update(ctxt, (const u_int8_t *)digest, sizeof(digest)); + SHA384_Final((u_int8_t *)digest, ctxt); + + bcopy(&digest[0], (void *)addr, sizeof(digest) > l ? l : sizeof(digest)); FREE(state->foo, M_TEMP); } @@ -1149,10 +1160,12 @@ ah_hmac_sha2_512_loop(state, addr, len) } static void -ah_hmac_sha2_512_result(state, addr) +ah_hmac_sha2_512_result(state, addr, l) struct ah_algorithm_state *state; caddr_t addr; + size_t l; { + u_char digest[SHA512_DIGEST_LENGTH]; u_char *ipad; u_char *opad; SHA512_CTX *ctxt; @@ -1164,13 +1177,14 @@ ah_hmac_sha2_512_result(state, addr) opad = (u_char *)(ipad + 128); ctxt = (SHA512_CTX *)(opad + 128); - SHA512_Final((u_int8_t *)addr, ctxt); + SHA512_Final((u_int8_t *)digest, ctxt); - bzero(ctxt, sizeof(*ctxt)); SHA512_Init(ctxt); SHA512_Update(ctxt, opad, 128); - SHA512_Update(ctxt, (const u_int8_t *)addr, SHA512_DIGEST_LENGTH); - SHA512_Final((u_int8_t *)addr, ctxt); + SHA512_Update(ctxt, (const u_int8_t *)digest, sizeof(digest)); + SHA512_Final((u_int8_t *)digest, ctxt); + + bcopy(&digest[0], (void *)addr, sizeof(digest) > l ? 
l : sizeof(digest)); FREE(state->foo, M_TEMP); } @@ -1453,7 +1467,7 @@ again: goto fail; } - (algo->result)(&algos, (caddr_t) &sumbuf[0]); + (algo->result)(&algos, (caddr_t) &sumbuf[0], sizeof(sumbuf)); bcopy(&sumbuf[0], ahdat, (*algo->sumsiz)(sav)); if (n) @@ -1680,7 +1694,7 @@ ah6_calccksum(m, ahdat, len, algo, sav) goto fail; } - (algo->result)(&algos, (caddr_t) &sumbuf[0]); + (algo->result)(&algos, (caddr_t) &sumbuf[0], sizeof(sumbuf)); bcopy(&sumbuf[0], ahdat, (*algo->sumsiz)(sav)); /* just in case */ diff --git a/bsd/netinet6/ah_input.c b/bsd/netinet6/ah_input.c index fcffc1ded..a448295b7 100644 --- a/bsd/netinet6/ah_input.c +++ b/bsd/netinet6/ah_input.c @@ -116,6 +116,7 @@ #include <net/kpi_protocol.h> #include <netinet/kpi_ipfilter_var.h> +#include <mach/sdt.h> #include <net/net_osdep.h> @@ -416,6 +417,9 @@ ah4_input(struct mbuf *m, int off) stripsiz = sizeof(struct newah) + siz1; } if (ipsec4_tunnel_validate(m, off + stripsiz, nxt, sav, &ifamily)) { + ifaddr_t ifa; + struct sockaddr_storage addr; + /* * strip off all the headers that precedes AH. * IP xx AH IP' payload -> IP' payload @@ -481,7 +485,25 @@ ah4_input(struct mbuf *m, int off) IPSEC_STAT_INCREMENT(ipsecstat.in_nomem); goto fail; } - proto_input(PF_INET, m); + + if (ip_doscopedroute) { + struct sockaddr_in *ipaddr; + + bzero(&addr, sizeof(addr)); + ipaddr = (__typeof__(ipaddr))&addr; + ipaddr->sin_family = AF_INET; + ipaddr->sin_len = sizeof(*ipaddr); + ipaddr->sin_addr = ip->ip_dst; + + // update the receiving interface address based on the inner address + ifa = ifa_ifwithaddr((struct sockaddr *)&addr); + if (ifa) { + m->m_pkthdr.rcvif = ifa->ifa_ifp; + IFA_REMREF(ifa); + } + } + if (proto_input(PF_INET, m) != 0) + goto fail; nxt = IPPROTO_DONE; } else { /* @@ -549,6 +571,10 @@ ah4_input(struct mbuf *m, int off) goto fail; } + DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL, + struct ip *, ip, struct ifnet *, m->m_pkthdr.rcvif, + struct ip *, ip, struct ip6_hdr *, NULL); + if (nxt != IPPROTO_DONE) { if ((ip_protox[nxt]->pr_flags & PR_LASTHDR) != 0 && ipsec4_in_reject(m, NULL)) { @@ -583,10 +609,9 @@ fail: #if INET6 int -ah6_input(mp, offp) - struct mbuf **mp; - int *offp; +ah6_input(struct mbuf **mp, int *offp, int proto) { +#pragma unused(proto) struct mbuf *m = *mp; int off = *offp; struct ip6_hdr *ip6; @@ -825,6 +850,9 @@ ah6_input(mp, offp) stripsiz = sizeof(struct newah) + siz1; } if (ipsec6_tunnel_validate(m, off + stripsiz, nxt, sav)) { + ifaddr_t ifa; + struct sockaddr_storage addr; + /* * strip off all the headers that precedes AH. * IP6 xx AH IP6' payload -> IP6' payload @@ -875,7 +903,26 @@ ah6_input(mp, offp) IPSEC_STAT_INCREMENT(ipsec6stat.in_nomem); goto fail; } - proto_input(PF_INET6, m); + + if (ip6_doscopedroute) { + struct sockaddr_in6 *ip6addr; + + bzero(&addr, sizeof(addr)); + ip6addr = (__typeof__(ip6addr))&addr; + ip6addr->sin6_family = AF_INET6; + ip6addr->sin6_len = sizeof(*ip6addr); + ip6addr->sin6_addr = ip6->ip6_dst; + + // update the receiving interface address based on the inner address + ifa = ifa_ifwithaddr((struct sockaddr *)&addr); + if (ifa) { + m->m_pkthdr.rcvif = ifa->ifa_ifp; + IFA_REMREF(ifa); + } + } + + if (proto_input(PF_INET6, m) != 0) + goto fail; nxt = IPPROTO_DONE; } else { /* diff --git a/bsd/netinet6/dest6.c b/bsd/netinet6/dest6.c index ae7a18b8a..993ee1a91 100644 --- a/bsd/netinet6/dest6.c +++ b/bsd/netinet6/dest6.c @@ -54,17 +54,13 @@ * Destination options header processing. 
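The ah_core.c changes above convert every AH result callback to take the destination buffer length and clamp the digest copy to it, so a larger SHA-2 digest can no longer overrun a caller's smaller sumbuf. A minimal user-space sketch of that bounded-copy idiom follows; result_copy and the buffer sizes are illustrative, not part of the patch.

/*
 * Sketch of the bounded-copy idiom adopted by the ah_*_result()
 * callbacks: the caller passes the destination size, and the copy
 * is clamped to min(digest size, destination size).
 */
#include <stdio.h>
#include <string.h>

#define DIGEST_LEN 20			/* e.g. SHA-1 */

static void
result_copy(const unsigned char digest[DIGEST_LEN], void *addr, size_t l)
{
	/* mirrors: bcopy(&digest[0], addr, sizeof(digest) > l ? l : sizeof(digest)) */
	memcpy(addr, digest, DIGEST_LEN > l ? l : DIGEST_LEN);
}

int
main(void)
{
	unsigned char digest[DIGEST_LEN] = { 0 };
	unsigned char small[12];	/* destination smaller than the digest */

	result_copy(digest, small, sizeof(small));	/* writes 12 bytes, not 20 */
	printf("copied at most %zu bytes\n", sizeof(small));
	return 0;
}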
*/ int -dest6_input(mp, offp) - struct mbuf **mp; - int *offp; +dest6_input(struct mbuf **mp, int *offp, int proto) { +#pragma unused(proto) struct mbuf *m = *mp; int off = *offp, dstoptlen, optlen; struct ip6_dest *dstopts; u_int8_t *opt; - struct ip6_hdr *ip6; - - ip6 = mtod(m, struct ip6_hdr *); /* validation of the length of the header */ #ifndef PULLDOWN_TEST @@ -107,7 +103,7 @@ dest6_input(mp, offp) default: /* unknown option */ optlen = ip6_unknown_opt(opt, m, - opt - mtod(m, u_int8_t *), 0); + opt - mtod(m, u_int8_t *)); if (optlen == -1) return (IPPROTO_DONE); optlen += 2; diff --git a/bsd/netinet6/esp6.h b/bsd/netinet6/esp6.h index e0c40b37f..7b054cd50 100644 --- a/bsd/netinet6/esp6.h +++ b/bsd/netinet6/esp6.h @@ -69,7 +69,7 @@ #ifdef KERNEL_PRIVATE extern int esp6_output(struct mbuf *, u_char *, struct mbuf *, struct secasvar *); -extern int esp6_input(struct mbuf **, int *); +extern int esp6_input(struct mbuf **, int *, int); extern void esp6_ctlinput(int, struct sockaddr *, void *); #endif /* KERNEL_PRIVATE */ diff --git a/bsd/netinet6/esp_core.c b/bsd/netinet6/esp_core.c index 3bae5bd18..905de9ba2 100644 --- a/bsd/netinet6/esp_core.c +++ b/bsd/netinet6/esp_core.c @@ -1203,7 +1203,7 @@ esp_auth(m0, skip, length, sav, sum) break; } } - (*algo->result)(&s, (caddr_t) sumbuf); + (*algo->result)(&s, (caddr_t) sumbuf, sizeof(sumbuf)); bcopy(sumbuf, sum, siz); /*XXX*/ KERNEL_DEBUG(DBG_FNC_ESPAUTH | DBG_FUNC_END, 6,0,0,0,0); return 0; diff --git a/bsd/netinet6/esp_input.c b/bsd/netinet6/esp_input.c index b228fb035..c64150319 100644 --- a/bsd/netinet6/esp_input.c +++ b/bsd/netinet6/esp_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -119,6 +119,7 @@ #include <netinet/kpi_ipfilter_var.h> #include <net/net_osdep.h> +#include <mach/sdt.h> #include <sys/kdebug.h> #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIPSEC, 1) @@ -136,16 +137,37 @@ extern struct protosw inetsw[]; (sizeof(struct esp) < sizeof(struct newesp) \ ? 
sizeof(struct newesp) : sizeof(struct esp)) +static struct ip * +esp4_input_strip_UDP_encap (struct mbuf *m, int iphlen) +{ + // strip the udp header that's encapsulating ESP + struct ip *ip; + size_t stripsiz = sizeof(struct udphdr); + + ip = mtod(m, __typeof__(ip)); + ovbcopy((caddr_t)ip, (caddr_t)(((u_char *)ip) + stripsiz), iphlen); + m->m_data += stripsiz; + m->m_len -= stripsiz; + m->m_pkthdr.len -= stripsiz; + ip = mtod(m, __typeof__(ip)); + ip->ip_len = ip->ip_len - stripsiz; + ip->ip_p = IPPROTO_ESP; + return ip; +} + void esp4_input(m, off) struct mbuf *m; int off; { struct ip *ip; +#if INET6 struct ip6_hdr *ip6; +#endif /* INET6 */ struct esp *esp; struct esptail esptail; u_int32_t spi; + u_int32_t seq; struct secasvar *sav = NULL; size_t taillen; u_int16_t nxt; @@ -175,6 +197,14 @@ esp4_input(m, off) } ip = mtod(m, struct ip *); + // expect udp-encap and esp packets only + if (ip->ip_p != IPPROTO_ESP && + !(ip->ip_p == IPPROTO_UDP && off >= sizeof(struct udphdr))) { + ipseclog((LOG_DEBUG, + "IPv4 ESP input: invalid protocol type\n")); + IPSEC_STAT_INCREMENT(ipsecstat.in_inval); + goto bad; + } esp = (struct esp *)(((u_int8_t *)ip) + off); #ifdef _IP_VHL hlen = IP_VHL_HL(ip->ip_vhl) << 2; @@ -222,6 +252,7 @@ esp4_input(m, off) goto bad; } + seq = ntohl(((struct newesp *)esp)->esp_seq); if (!((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay && (sav->alg_auth && sav->key_auth))) goto noreplaycheck; @@ -233,7 +264,7 @@ esp4_input(m, off) /* * check for sequence number. */ - if (ipsec_chkreplay(ntohl(((struct newesp *)esp)->esp_seq), sav)) + if (ipsec_chkreplay(seq, sav)) ; /*okey*/ else { IPSEC_STAT_INCREMENT(ipsecstat.in_espreplay); @@ -298,7 +329,7 @@ esp4_input(m, off) * update sequence number. */ if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay) { - if (ipsec_updatereplay(ntohl(((struct newesp *)esp)->esp_seq), sav)) { + if (ipsec_updatereplay(seq, sav)) { IPSEC_STAT_INCREMENT(ipsecstat.in_espreplay); goto bad; } @@ -388,9 +419,40 @@ noreplaycheck: #else ip->ip_len = htons(ntohs(ip->ip_len) - taillen); #endif + if (ip->ip_p == IPPROTO_UDP) { + // offset includes the outer ip and udp header lengths. + if (m->m_len < off) { + m = m_pullup(m, off); + if (!m) { + ipseclog((LOG_DEBUG, + "IPv4 ESP input: invalid udp encapsulated ESP packet length \n")); + IPSEC_STAT_INCREMENT(ipsecstat.in_inval); + goto bad; + } + } + + // check the UDP encap header to detect changes in the source port, and then strip the header + off -= sizeof(struct udphdr); // off no longer includes the udphdr's size + // if peer is behind nat and this is the latest esp packet + if ((sav->flags & SADB_X_EXT_NATT_DETECTED_PEER) != 0 && + (sav->flags & SADB_X_EXT_OLD) == 0 && + seq && sav->replay && + seq >= sav->replay->lastseq) { + struct udphdr *encap_uh = (__typeof__(encap_uh))((caddr_t)ip + off); + if (encap_uh->uh_sport && + encap_uh->uh_sport != sav->remote_ike_port) { + sav->remote_ike_port = encap_uh->uh_sport; + } + } + ip = esp4_input_strip_UDP_encap(m, off); + esp = (struct esp *)(((u_int8_t *)ip) + off); + } /* was it transmitted over the IPsec tunnel SA? */ if (ipsec4_tunnel_validate(m, off + esplen + ivlen, nxt, sav, &ifamily)) { + ifaddr_t ifa; + struct sockaddr_storage addr; + /* * strip off all the headers that precedes ESP header. 
* IP4 xx ESP IP4' payload -> IP4' payload @@ -403,6 +465,8 @@ noreplaycheck: tos = ip->ip_tos; m_adj(m, off + esplen + ivlen); if (ifamily == AF_INET) { + struct sockaddr_in *ipaddr; + if (m->m_len < sizeof(*ip)) { m = m_pullup(m, sizeof(*ip)); if (!m) { @@ -421,8 +485,18 @@ noreplaycheck: IPSEC_STAT_INCREMENT(ipsecstat.in_inval); goto bad; } + + if (ip_doscopedroute) { + bzero(&addr, sizeof(addr)); + ipaddr = (__typeof__(ipaddr))&addr; + ipaddr->sin_family = AF_INET; + ipaddr->sin_len = sizeof(*ipaddr); + ipaddr->sin_addr = ip->ip_dst; + } #if INET6 } else if (ifamily == AF_INET6) { + struct sockaddr_in6 *ip6addr; + #ifndef PULLDOWN_TEST /* * m_pullup is prohibited in KAME IPv6 input processing @@ -452,7 +526,15 @@ noreplaycheck: ipsec6_logpacketstr(ip6, spi), ipsec_logsastr(sav))); IPSEC_STAT_INCREMENT(ipsecstat.in_inval); goto bad; - } + } + + if (ip6_doscopedroute) { + bzero(&addr, sizeof(addr)); + ip6addr = (__typeof__(ip6addr))&addr; + ip6addr->sin6_family = AF_INET6; + ip6addr->sin6_len = sizeof(*ip6addr); + ip6addr->sin6_addr = ip6->ip6_dst; + } #endif /* INET6 */ } else { ipseclog((LOG_ERR, "ipsec tunnel unsupported address family " @@ -466,10 +548,21 @@ noreplaycheck: IPSEC_STAT_INCREMENT(ipsecstat.in_nomem); goto bad; } - + + if (ip_doscopedroute || ip6_doscopedroute) { + // update the receiving interface address based on the inner address + ifa = ifa_ifwithaddr((struct sockaddr *)&addr); + if (ifa) { + m->m_pkthdr.rcvif = ifa->ifa_ifp; + IFA_REMREF(ifa); + } + } + /* Clear the csum flags, they can't be valid for the inner headers */ m->m_pkthdr.csum_flags = 0; - proto_input(ifamily == AF_INET ? PF_INET : PF_INET6, m); + if (proto_input(ifamily == AF_INET ? PF_INET : PF_INET6, m) != 0) + goto bad; + nxt = IPPROTO_DONE; KERNEL_DEBUG(DBG_FNC_ESPIN | DBG_FUNC_END, 2,0,0,0,0); } else { @@ -554,6 +647,11 @@ noreplaycheck: udp->uh_sport = htons(sav->remote_ike_port); udp->uh_sum = 0; } + + DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL, + struct ip *, ip, struct ifnet *, m->m_pkthdr.rcvif, + struct ip *, ip, struct ip6_hdr *, NULL); + ip_proto_dispatch_in(m, off, nxt, 0); } else m_freem(m); @@ -583,16 +681,16 @@ bad: #if INET6 int -esp6_input(mp, offp) - struct mbuf **mp; - int *offp; +esp6_input(struct mbuf **mp, int *offp, int proto) { +#pragma unused(proto) struct mbuf *m = *mp; int off = *offp; struct ip6_hdr *ip6; struct esp *esp; struct esptail esptail; u_int32_t spi; + u_int32_t seq; struct secasvar *sav = NULL; size_t taillen; u_int16_t nxt; @@ -667,6 +765,8 @@ esp6_input(mp, offp) goto bad; } + seq = ntohl(((struct newesp *)esp)->esp_seq); + if (!((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay && (sav->alg_auth && sav->key_auth))) goto noreplaycheck; @@ -678,7 +778,7 @@ esp6_input(mp, offp) /* * check for sequence number. */ - if (ipsec_chkreplay(ntohl(((struct newesp *)esp)->esp_seq), sav)) + if (ipsec_chkreplay(seq, sav)) ; /*okey*/ else { IPSEC_STAT_INCREMENT(ipsec6stat.in_espreplay); @@ -740,7 +840,7 @@ esp6_input(mp, offp) * update sequence number. */ if ((sav->flags & SADB_X_EXT_OLD) == 0 && sav->replay) { - if (ipsec_updatereplay(ntohl(((struct newesp *)esp)->esp_seq), sav)) { + if (ipsec_updatereplay(seq, sav)) { IPSEC_STAT_INCREMENT(ipsec6stat.in_espreplay); goto bad; } @@ -828,6 +928,9 @@ noreplaycheck: /* was it transmitted over the IPsec tunnel SA? */ if (ipsec6_tunnel_validate(m, off + esplen + ivlen, nxt, sav)) { + ifaddr_t ifa; + struct sockaddr_storage addr; + /* * strip off all the headers that precedes ESP header. 
* IP6 xx ESP IP6' payload -> IP6' payload @@ -872,7 +975,26 @@ noreplaycheck: IPSEC_STAT_INCREMENT(ipsec6stat.in_nomem); goto bad; } - proto_input(PF_INET6, m); + + if (ip6_doscopedroute) { + struct sockaddr_in6 *ip6addr; + + bzero(&addr, sizeof(addr)); + ip6addr = (__typeof__(ip6addr))&addr; + ip6addr->sin6_family = AF_INET6; + ip6addr->sin6_len = sizeof(*ip6addr); + ip6addr->sin6_addr = ip6->ip6_dst; + + // update the receiving interface address based on the inner address + ifa = ifa_ifwithaddr((struct sockaddr *)&addr); + if (ifa) { + m->m_pkthdr.rcvif = ifa->ifa_ifp; + IFA_REMREF(ifa); + } + } + + if (proto_input(PF_INET6, m) != 0) + goto bad; nxt = IPPROTO_DONE; } else { /* diff --git a/bsd/netinet6/frag6.c b/bsd/netinet6/frag6.c index ea75e5acb..b6b68b920 100644 --- a/bsd/netinet6/frag6.c +++ b/bsd/netinet6/frag6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -77,6 +77,7 @@ #include <netinet/in.h> #include <netinet/in_var.h> +#include <netinet/ip.h> #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #include <netinet/icmp6.h> @@ -102,9 +103,6 @@ u_int frag6_nfragpackets; static u_int frag6_nfrags; struct ip6q ip6q; /* ip6 reassemble queue */ -#ifndef __APPLE__ -MALLOC_DEFINE(M_FTABLE, "fragment", "fragment reassembly header"); -#endif extern lck_mtx_t *inet6_domain_mutex; /* @@ -162,10 +160,9 @@ frag6_init() * inet6_domain_mutex is protecting he frag6 queue manipulation. */ int -frag6_input(mp, offp) - struct mbuf **mp; - int *offp; +frag6_input(struct mbuf **mp, int *offp, int proto) { +#pragma unused(proto) struct mbuf *m = *mp, *t; struct ip6_hdr *ip6; struct ip6_frag *ip6f; @@ -176,6 +173,8 @@ frag6_input(mp, offp) int fragoff, frgpartlen; /* must be larger than u_int16_t */ struct ifnet *dstifp; struct ifaddr *ifa = NULL; + u_int8_t ecn, ecn0; + #ifdef IN6_IFSTAT_STRICT struct route_in6 ro; struct sockaddr_in6 *dst; @@ -204,7 +203,7 @@ frag6_input(mp, offp) if (ro.ro_rt != NULL) { RT_LOCK(ro.ro_rt); if ((ifa = ro.ro_rt->rt_ifa) != NULL) { - ifaref(ifa); + IFA_ADDREF(ifa); dstifp = ((struct in6_ifaddr *)ro.ro_rt->rt_ifa)->ia_ifp; } RT_UNLOCK(ro.ro_rt); @@ -222,7 +221,7 @@ frag6_input(mp, offp) icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offset); in6_ifstat_inc(dstifp, ifs6_reass_fail); if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); return IPPROTO_DONE; } @@ -239,7 +238,7 @@ frag6_input(mp, offp) offsetof(struct ip6_hdr, ip6_plen)); in6_ifstat_inc(dstifp, ifs6_reass_fail); if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); return IPPROTO_DONE; } @@ -298,10 +297,11 @@ frag6_input(mp, offp) q6->ip6q_nxtp = (u_char *)nxtp; #endif q6->ip6q_ident = ip6f->ip6f_ident; - q6->ip6q_arrive = 0; /* Is it used anywhere? */ q6->ip6q_ttl = IPV6_FRAGTTL; q6->ip6q_src = ip6->ip6_src; q6->ip6q_dst = ip6->ip6_dst; + q6->ip6q_ecn = + (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK; q6->ip6q_unfrglen = -1; /* The 1st fragment has not arrived. 
*/ q6->ip6q_nfrag = 0; @@ -332,7 +332,7 @@ frag6_input(mp, offp) offsetof(struct ip6_frag, ip6f_offlg)); frag6_doing_reass = 0; if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); return(IPPROTO_DONE); } } @@ -342,7 +342,7 @@ frag6_input(mp, offp) offsetof(struct ip6_frag, ip6f_offlg)); frag6_doing_reass = 0; if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); return(IPPROTO_DONE); } /* @@ -387,10 +387,6 @@ frag6_input(mp, offp) if (ip6af == NULL) goto dropfrag; bzero(ip6af, sizeof(*ip6af)); - ip6af->ip6af_head = ip6->ip6_flow; - ip6af->ip6af_len = ip6->ip6_plen; - ip6af->ip6af_nxt = ip6->ip6_nxt; - ip6af->ip6af_hlim = ip6->ip6_hlim; ip6af->ip6af_mff = ip6f->ip6f_offlg & IP6F_MORE_FRAG; ip6af->ip6af_off = fragoff; ip6af->ip6af_frglen = frgpartlen; @@ -402,6 +398,26 @@ frag6_input(mp, offp) goto insert; } + /* + * Handle ECN by comparing this segment with the first one; + * if CE is set, do not lose CE. + * drop if CE and not-ECT are mixed for the same packet. + */ + ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK; + ecn0 = q6->ip6q_ecn; + if (ecn == IPTOS_ECN_CE) { + if (ecn0 == IPTOS_ECN_NOTECT) { + FREE(ip6af, M_FTABLE); + goto dropfrag; + } + if (ecn0 != IPTOS_ECN_CE) + q6->ip6q_ecn = IPTOS_ECN_CE; + } + if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) { + FREE(ip6af, M_FTABLE); + goto dropfrag; + } + /* * Find a segment which begins after this one does. */ @@ -450,6 +466,11 @@ frag6_input(mp, offp) * If the incoming framgent overlaps some existing fragments in * the reassembly queue, drop it, since it is dangerous to override * existing fragments from a security point of view. + * We don't know which fragment is the bad guy - here we trust + * fragment that came in earlier, with no real reason. + * + * Note: due to changes after disabling this part, mbuf passed to + * m_adj() below now does not meet the requirement. */ if (af6->ip6af_up != (struct ip6asfrag *)q6) { i = af6->ip6af_up->ip6af_off + af6->ip6af_up->ip6af_frglen @@ -501,7 +522,7 @@ insert: if (af6->ip6af_off != next) { frag6_doing_reass = 0; if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); return IPPROTO_DONE; } next += af6->ip6af_frglen; @@ -509,7 +530,7 @@ insert: if (af6->ip6af_up->ip6af_mff) { frag6_doing_reass = 0; if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); return IPPROTO_DONE; } @@ -538,15 +559,17 @@ insert: ip6->ip6_plen = htons((u_short)next + offset - sizeof(struct ip6_hdr)); ip6->ip6_src = q6->ip6q_src; ip6->ip6_dst = q6->ip6q_dst; + if (q6->ip6q_ecn == IPTOS_ECN_CE) + ip6->ip6_flow |= htonl(IPTOS_ECN_CE << 20); + nxt = q6->ip6q_nxt; #if notyet *q6->ip6q_nxtp = (u_char)(nxt & 0xff); #endif - /* - * Delete frag6 header with as a few cost as possible. 
- */ - if (offset < m->m_len) { + /* Delete frag6 header */ + if (m->m_len >= offset + sizeof(struct ip6_frag)) { + /* This is the only possible case with !PULLDOWN_TEST */ ovbcopy((caddr_t)ip6, (caddr_t)ip6 + sizeof(struct ip6_frag), offset); m->m_data += sizeof(struct ip6_frag); @@ -596,7 +619,7 @@ insert: frag6_doing_reass = 0; if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); return nxt; dropfrag: @@ -605,7 +628,7 @@ insert: m_freem(m); frag6_doing_reass = 0; if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); return IPPROTO_DONE; } @@ -636,7 +659,7 @@ frag6_freef(q6) /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); - /* restoure source and destination addresses */ + /* restore source and destination addresses */ ip6->ip6_src = q6->ip6q_src; ip6->ip6_dst = q6->ip6q_dst; icmp6_error(m, ICMP6_TIME_EXCEEDED, diff --git a/bsd/netinet6/icmp6.c b/bsd/netinet6/icmp6.c index 02d19734f..43a61a6d2 100644 --- a/bsd/netinet6/icmp6.c +++ b/bsd/netinet6/icmp6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -123,6 +123,7 @@ #include <netinet6/nd6.h> #include <netinet6/in6_ifattach.h> #include <netinet6/ip6protosw.h> +#include <netinet6/scope6_var.h> #if IPSEC #include <netinet6/ipsec.h> @@ -148,8 +149,6 @@ static int icmp6errpps_count = 0; static struct timeval icmp6errppslim_last; extern int icmp6_nodeinfo; extern struct inpcbinfo ripcbinfo; -extern lck_mtx_t *ip6_mutex; -extern lck_mtx_t *nd6_mutex; extern lck_mtx_t *inet6_domain_mutex; static void icmp6_errcount(struct icmp6errstat *, int, int); @@ -169,19 +168,12 @@ static int ni6_store_addrs(struct icmp6_nodeinfo *, struct icmp6_nodeinfo *, struct ifnet *, int); static int icmp6_notify_error(struct mbuf *, int, int, int); -#ifdef COMPAT_RFC1885 -/* - * XXX: Compiled out for now, but if enabled we must use a lock for accesses, - * or easier, define it locally inside icmp6_reflect() and don't cache. - */ -static struct route_in6 icmp6_reflect_rt; -#endif void icmp6_init() { - mld6_init(); + mld_init(); } static void @@ -242,13 +234,44 @@ icmp6_errcount(stat, type, code) stat->icp6errs_unknown++; } +/* + * A wrapper function for icmp6_error() necessary when the erroneous packet + * may not contain enough scope zone information. + */ +void +icmp6_error2(struct mbuf *m, int type, int code, int param, + struct ifnet *ifp) +{ + struct ip6_hdr *ip6; + + if (ifp == NULL) + return; + +#ifndef PULLDOWN_TEST + IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr),return ); +#else + if (m->m_len < sizeof(struct ip6_hdr)) { + m = m_pullup(m, sizeof(struct ip6_hdr)); + if (m == NULL) + return; + } +#endif + + ip6 = mtod(m, struct ip6_hdr *); + + if (in6_setscope(&ip6->ip6_src, ifp, NULL) != 0) + return; + if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0) + return; + + icmp6_error(m, type, code, param); +} + /* * Generate an error packet of type error in response to bad IP6 packet. */ void -icmp6_error(m, type, code, param) - struct mbuf *m; - int type, code, param; +icmp6_error(struct mbuf *m, int type, int code, int param) { struct ip6_hdr *oip6, *nip6; struct icmp6_hdr *icmp6; @@ -258,7 +281,6 @@ icmp6_error(m, type, code, param) icmp6stat.icp6s_error++; - lck_mtx_assert(ip6_mutex, LCK_MTX_ASSERT_NOTOWNED); /* count per-type-code statistics */ icmp6_errcount(&icmp6stat.icp6s_outerrhist, type, code); @@ -281,9 +303,15 @@ icmp6_error(m, type, code, param) oip6 = mtod(m, struct ip6_hdr *); /* - * Multicast destination check. 
For unrecognized option errors, - * this check has already done in ip6_unknown_opt(), so we can - * check only for other errors. + * If the destination address of the erroneous packet is a multicast + * address, or the packet was sent using link-layer multicast, + * we should basically suppress sending an error (RFC 2463, Section + * 2.4). + * We have two exceptions (the item e.2 in that section): + * - the Pakcet Too Big message can be sent for path MTU discovery. + * - the Parameter Problem Message that can be allowed an icmp6 error + * in the option type field. This check has been done in + * ip6_unknown_opt(), so we can just check the type and code. */ if ((m->m_flags & (M_BCAST|M_MCAST) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_dst)) && @@ -292,7 +320,10 @@ icmp6_error(m, type, code, param) code != ICMP6_PARAMPROB_OPTION))) goto freeit; - /* Source address check. XXX: the case of anycast source? */ + /* + * RFC 2463, 2.4 (e.5): source address check. + * XXX: the case of anycast source? + */ if (IN6_IS_ADDR_UNSPECIFIED(&oip6->ip6_src) || IN6_IS_ADDR_MULTICAST(&oip6->ip6_src)) goto freeit; @@ -361,10 +392,8 @@ icmp6_error(m, type, code, param) nip6->ip6_src = oip6->ip6_src; nip6->ip6_dst = oip6->ip6_dst; - if (IN6_IS_SCOPE_LINKLOCAL(&oip6->ip6_src)) - oip6->ip6_src.s6_addr16[1] = 0; - if (IN6_IS_SCOPE_LINKLOCAL(&oip6->ip6_dst)) - oip6->ip6_dst.s6_addr16[1] = 0; + in6_clearscope(&oip6->ip6_src); + in6_clearscope(&oip6->ip6_dst); icmp6 = (struct icmp6_hdr *)(nip6 + 1); icmp6->icmp6_type = type; @@ -373,7 +402,7 @@ icmp6_error(m, type, code, param) /* * icmp6_reflect() is designed to be in the input path. - * icmp6_error() can be called from both input and outut path, + * icmp6_error() can be called from both input and output path, * and if we are in output path rcvif could contain bogus value. * clear m->m_pkthdr.rcvif for safety, we should have enough scope * information in ip header (nip6). @@ -387,7 +416,7 @@ icmp6_error(m, type, code, param) freeit: /* - * If we can't tell wheter or not we can generate ICMP6, free it. + * If we can't tell whether or not we can generate ICMP6, free it. */ m_freem(m); } @@ -396,17 +425,19 @@ icmp6_error(m, type, code, param) * Process a received ICMP6 message. */ int -icmp6_input(mp, offp) - struct mbuf **mp; - int *offp; +icmp6_input(struct mbuf **mp, int *offp, int proto) { +#pragma unused(proto) struct mbuf *m = *mp, *n; + struct ifnet *ifp; struct ip6_hdr *ip6, *nip6; struct icmp6_hdr *icmp6, *nicmp6; int off = *offp; int icmp6len = m->m_pkthdr.len - *offp; int code, sum, noff; + ifp = m->m_pkthdr.rcvif; + #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(struct icmp6_hdr), return IPPROTO_DONE); /* m might change if M_LOOP. So, call mtod after this */ @@ -423,6 +454,26 @@ icmp6_input(mp, offp) goto freeit; } + /* + * Check multicast group membership. + * Note: SSM filters are not applied for ICMPv6 traffic. + */ + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { + struct in6_multi *inm; + + in6_multihead_lock_shared(); + IN6_LOOKUP_MULTI(&ip6->ip6_dst, ifp, inm); + in6_multihead_lock_done(); + + if (inm == NULL) { + ip6stat.ip6s_notmember++; + in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); + goto freeit; + } else { + IN6M_REMREF(inm); + } + } + /* * calculate the checksum */ @@ -449,7 +500,7 @@ icmp6_input(mp, offp) if (faithprefix(&ip6->ip6_dst)) { /* * Deliver very specific ICMP6 type only. - * This is important to deilver TOOBIG. Otherwise PMTUD + * This is important to deliver TOOBIG. Otherwise PMTUD * will not work. 
*/ switch (icmp6->icmp6_type) { @@ -468,7 +519,6 @@ icmp6_input(mp, offp) if (icmp6->icmp6_type < ICMP6_INFOMSG_MASK) icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_error); - switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_dstunreach); @@ -483,30 +533,21 @@ icmp6_input(mp, offp) case ICMP6_DST_UNREACH_ADDR: code = PRC_HOSTDEAD; break; -#ifdef COMPAT_RFC1885 - case ICMP6_DST_UNREACH_NOTNEIGHBOR: - code = PRC_UNREACH_SRCFAIL; - break; -#else case ICMP6_DST_UNREACH_BEYONDSCOPE: /* I mean "source address was incorrect." */ code = PRC_PARAMPROB; break; -#endif case ICMP6_DST_UNREACH_NOPORT: code = PRC_UNREACH_PORT; break; default: goto badcode; } - goto deliver; break; case ICMP6_PACKET_TOO_BIG: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_pkttoobig); - if (code != 0) - goto badcode; code = PRC_MSGSIZE; @@ -521,8 +562,10 @@ icmp6_input(mp, offp) icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_timeexceed); switch (code) { case ICMP6_TIME_EXCEED_TRANSIT: + code = PRC_TIMXCEED_INTRANS; + break; case ICMP6_TIME_EXCEED_REASSEMBLY: - code += PRC_TIMXCEED_INTRANS; + code = PRC_TIMXCEED_REASS; break; default: goto badcode; @@ -631,11 +674,12 @@ icmp6_input(mp, offp) goto badcode; break; - case MLD6_LISTENER_QUERY: - case MLD6_LISTENER_REPORT: - if (icmp6len < sizeof(struct mld6_hdr)) + case MLD_LISTENER_QUERY: + case MLD_LISTENER_REPORT: + + if (icmp6len < sizeof(struct mld_hdr)) goto badlen; - if (icmp6->icmp6_type == MLD6_LISTENER_QUERY) /* XXX: ugly... */ + if (icmp6->icmp6_type == MLD_LISTENER_QUERY) /* XXX: ugly... */ icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mldquery); else icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mldreport); @@ -647,31 +691,32 @@ icmp6_input(mp, offp) if ((n = m_copym(m, 0, M_COPYALL, M_DONTWAIT)) == NULL) { /* give up local */ - mld6_input(m, off); - m = NULL; + if (mld_input(m, off, icmp6len) == IPPROTO_DONE) + m = NULL; goto freeit; } - mld6_input(n, off); + if (mld_input(n, off, icmp6len) != IPPROTO_DONE) + m_freem(n); /* m stays. */ goto rate_limit_checked; break; - case MLD6_LISTENER_DONE: + case MLD_LISTENER_DONE: icmp6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_mlddone); - if (icmp6len < sizeof(struct mld6_hdr)) /* necessary? */ + if (icmp6len < sizeof(struct mld_hdr)) /* necessary? */ goto badlen; break; /* nothing to be done in kernel */ - case MLD6_MTRACE_RESP: - case MLD6_MTRACE: - /* XXX: these two are experimental. not officially defind. */ + case MLD_MTRACE_RESP: + case MLD_MTRACE: + /* XXX: these two are experimental. not officially defined. */ /* XXX: per-interface statistics? */ break; /* just pass it to applications */ case ICMP6_NI_QUERY: if (!icmp6_nodeinfo) break; - +//### LD 10/20 Check fbsd differences here. Not sure we're more advanced or not. 
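The time-exceeded hunk above replaces "code += PRC_TIMXCEED_INTRANS", which silently depends on the ICMPv6 codes and the PRC_* constants staying in lock-step, with an explicit per-case mapping. A standalone sketch of the same idea follows; the constant values are written out as assumptions (they match historical BSD protosw.h, but verify against your headers), and map_timxceed is a hypothetical helper.

#include <stdio.h>

/* assumed values, for illustration only */
enum { ICMP6_TIME_EXCEED_TRANSIT = 0, ICMP6_TIME_EXCEED_REASSEMBLY = 1 };
enum { PRC_TIMXCEED_INTRANS = 18, PRC_TIMXCEED_REASS = 19 };

/*
 * Explicit mapping cannot drift if either numbering changes, and
 * unknown codes are rejected instead of being blindly offset.
 */
static int
map_timxceed(int icmp6code)
{
	switch (icmp6code) {
	case ICMP6_TIME_EXCEED_TRANSIT:
		return PRC_TIMXCEED_INTRANS;
	case ICMP6_TIME_EXCEED_REASSEMBLY:
		return PRC_TIMXCEED_REASS;
	default:
		return -1;	/* badcode */
	}
}

int
main(void)
{
	printf("%d %d %d\n", map_timxceed(0), map_timxceed(1), map_timxceed(7));
	return 0;
}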
/* By RFC 4620 refuse to answer queries from global scope addresses */ if ((icmp6_nodeinfo & 8) != 8 && in6_addrscope(&ip6->ip6_src) == IPV6_ADDR_SCOPE_GLOBAL) break; @@ -948,7 +993,7 @@ icmp6_notify_error(m, off, icmp6len, code) return(-1); } #endif - + if (nxt == IPPROTO_AH) eoff += (eh->ip6e_len + 2) << 2; else @@ -1023,7 +1068,7 @@ icmp6_notify_error(m, off, icmp6len, code) eoff, sizeof(*fh)); if (fh == NULL) { icmp6stat.icp6s_tooshort++; - return(-1); + return (-1); } #endif /* @@ -1055,14 +1100,23 @@ icmp6_notify_error(m, off, icmp6len, code) icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(icmp6, struct icmp6_hdr *, m, off, - sizeof(*icmp6) + sizeof(struct ip6_hdr)); + sizeof(*icmp6) + sizeof(struct ip6_hdr)); if (icmp6 == NULL) { icmp6stat.icp6s_tooshort++; - return(-1); + return (-1); } #endif + /* + * retrieve parameters from the inner IPv6 header, and convert + * them into sockaddr structures. + * XXX: there is no guarantee that the source or destination + * addresses of the inner packet are in the same scope as + * the addresses of the icmp packet. But there is no other + * way to determine the zone. + */ eip6 = (struct ip6_hdr *)(icmp6 + 1); + bzero(&icmp6dst, sizeof(icmp6dst)); icmp6dst.sin6_len = sizeof(struct sockaddr_in6); icmp6dst.sin6_family = AF_INET6; @@ -1070,39 +1124,16 @@ icmp6_notify_error(m, off, icmp6len, code) icmp6dst.sin6_addr = eip6->ip6_dst; else icmp6dst.sin6_addr = *finaldst; - icmp6dst.sin6_scope_id = in6_addr2scopeid(m->m_pkthdr.rcvif, - &icmp6dst.sin6_addr); -#ifndef SCOPEDROUTING - if (in6_embedscope(&icmp6dst.sin6_addr, &icmp6dst, - NULL, NULL)) { - /* should be impossbile */ - nd6log((LOG_DEBUG, - "icmp6_notify_error: in6_embedscope failed\n")); + if (in6_setscope(&icmp6dst.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; - } -#endif - - /* - * retrieve parameters from the inner IPv6 header, and convert - * them into sockaddr structures. - */ bzero(&icmp6src, sizeof(icmp6src)); icmp6src.sin6_len = sizeof(struct sockaddr_in6); icmp6src.sin6_family = AF_INET6; icmp6src.sin6_addr = eip6->ip6_src; - icmp6src.sin6_scope_id = in6_addr2scopeid(m->m_pkthdr.rcvif, - &icmp6src.sin6_addr); -#ifndef SCOPEDROUTING - if (in6_embedscope(&icmp6src.sin6_addr, &icmp6src, - NULL, NULL)) { - /* should be impossbile */ - nd6log((LOG_DEBUG, - "icmp6_notify_error: in6_embedscope failed\n")); + if (in6_setscope(&icmp6src.sin6_addr, m->m_pkthdr.rcvif, NULL)) goto freeit; - } -#endif icmp6src.sin6_flowinfo = - (eip6->ip6_flow & IPV6_FLOWLABEL_MASK); + (eip6->ip6_flow & IPV6_FLOWLABEL_MASK); if (finaldst == NULL) finaldst = &eip6->ip6_dst; @@ -1145,9 +1176,16 @@ icmp6_mtudisc_update(ip6cp, validated) u_int mtu = ntohl(icmp6->icmp6_mtu); struct rtentry *rt = NULL; struct sockaddr_in6 sin6; + /* + * we reject ICMPv6 too big with abnormally small value. + * XXX what is the good definition of "abnormally small"? 
+ */ + if (mtu < sizeof(struct ip6_hdr) + sizeof(struct ip6_frag) + 8) + return; if (!validated) return; + /* * In case the suggested mtu is less than IPV6_MMTU, we * only need to remember that it was for above mentioned @@ -1167,19 +1205,16 @@ icmp6_mtudisc_update(ip6cp, validated) htons(m->m_pkthdr.rcvif->if_index); } /* sin6.sin6_scope_id = XXX: should be set if DST is a scoped addr */ - rt = rtalloc1((struct sockaddr *)&sin6, 0, RTF_CLONING | RTF_PRCLONING); + rt = rtalloc1_scoped((struct sockaddr *)&sin6, 0, + RTF_CLONING | RTF_PRCLONING, m->m_pkthdr.rcvif->if_index); if (rt != NULL) { RT_LOCK(rt); if ((rt->rt_flags & RTF_HOST) && - !(rt->rt_rmx.rmx_locks & RTV_MTU)) { - if (mtu < IPV6_MMTU) { - /* xxx */ - rt->rt_rmx.rmx_locks |= RTV_MTU; - } else if (mtu < rt->rt_ifp->if_mtu && - rt->rt_rmx.rmx_mtu > mtu) { - icmp6stat.icp6s_pmtuchg++; - rt->rt_rmx.rmx_mtu = mtu; - } + !(rt->rt_rmx.rmx_locks & RTV_MTU) && + mtu < IN6_LINKMTU(rt->rt_ifp) && + rt->rt_rmx.rmx_mtu > mtu) { + icmp6stat.icp6s_pmtuchg++; + rt->rt_rmx.rmx_mtu = mtu; } RT_UNLOCK(rt); rtfree(rt); @@ -1189,7 +1224,7 @@ icmp6_mtudisc_update(ip6cp, validated) /* * Process a Node Information Query packet, based on * draft-ietf-ipngwg-icmp-name-lookups-07. - * + * * Spec incompatibilities: * - IPv6 Subject address handling * - IPv4 Subject address handling support missing @@ -1216,7 +1251,6 @@ ni6_input(m, off) struct ip6_hdr *ip6; int oldfqdn = 0; /* if 1, return pascal string (03 draft) */ char *subj = NULL; - struct in6_ifaddr *ia6 = NULL; ip6 = mtod(m, struct ip6_hdr *); #ifndef PULLDOWN_TEST @@ -1225,40 +1259,59 @@ ni6_input(m, off) IP6_EXTHDR_GET(ni6, struct icmp6_nodeinfo *, m, off, sizeof(*ni6)); if (ni6 == NULL) { /* m is already reclaimed */ - return NULL; + return (NULL); } #endif + /* + * Validate IPv6 source address. + * The default configuration MUST be to refuse answering queries from + * global-scope addresses according to RFC4602. + * Notes: + * - it's not very clear what "refuse" means; this implementation + * simply drops it. + * - it's not very easy to identify global-scope (unicast) addresses + * since there are many prefixes for them. It should be safer + * and in practice sufficient to check "all" but loopback and + * link-local (note that site-local unicast was deprecated and + * ULA is defined as global scope-wise) + */ + if ((icmp6_nodeinfo & ICMP6_NODEINFO_GLOBALOK) == 0 && + !IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) && + !IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) + goto bad; + /* * Validate IPv6 destination address. * * The Responder must discard the Query without further processing * unless it is one of the Responder's unicast or anycast addresses, or * a link-local scope multicast address which the Responder has joined. - * [icmp-name-lookups-07, Section 4.] + * [RFC4602, Section 5.] 
*/ - bzero(&sin6, sizeof(sin6)); - sin6.sin6_family = AF_INET6; - sin6.sin6_len = sizeof(struct sockaddr_in6); - bcopy(&ip6->ip6_dst, &sin6.sin6_addr, sizeof(sin6.sin6_addr)); - /* XXX scopeid */ - if ((ia6 = (struct in6_ifaddr *)ifa_ifwithaddr((struct sockaddr *)&sin6)) != NULL) { - /* unicast/anycast, fine */ - if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && - (icmp6_nodeinfo & 4) == 0) { - ifafree(&ia6->ia_ifa); - ia6 = NULL; + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { + if (!IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst)) + goto bad; + /* else it's a link-local multicast, fine */ + } else { /* unicast or anycast */ + struct in6_ifaddr *ia6; + + if ((ia6 = ip6_getdstifaddr(m)) == NULL) + goto bad; /* XXX impossible */ + + IFA_LOCK(&ia6->ia_ifa); + if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) && + !(icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK)) { nd6log((LOG_DEBUG, "ni6_input: ignore node info to " "a temporary address in %s:%d", __FILE__, __LINE__)); + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); goto bad; } - ifafree(&ia6->ia_ifa); - ia6 = NULL; - } else if (IN6_IS_ADDR_MC_LINKLOCAL(&sin6.sin6_addr)) - ; /* link-local multicast, fine */ - else - goto bad; + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); + } /* validate query Subject field. */ qtype = ntohs(ni6->ni_qtype); @@ -1272,6 +1325,7 @@ ni6_input(m, off) /* FALLTHROUGH */ case NI_QTYPE_FQDN: case NI_QTYPE_NODEADDR: + case NI_QTYPE_IPV4ADDR: switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: #if ICMP6_NI_SUBJ_IPV6 != 0 @@ -1291,7 +1345,7 @@ ni6_input(m, off) goto bad; #endif - if (subjlen != sizeof(sin6.sin6_addr)) + if (subjlen != sizeof(struct in6_addr)) goto bad; /* @@ -1313,18 +1367,16 @@ ni6_input(m, off) subjlen, (caddr_t)&sin6.sin6_addr); sin6.sin6_scope_id = in6_addr2scopeid(m->m_pkthdr.rcvif, &sin6.sin6_addr); -#ifndef SCOPEDROUTING - in6_embedscope(&sin6.sin6_addr, &sin6, NULL, NULL); -#endif + in6_embedscope(&sin6.sin6_addr, &sin6, NULL, NULL, + NULL); bzero(&sin6_d, sizeof(sin6_d)); sin6_d.sin6_family = AF_INET6; /* not used, actually */ sin6_d.sin6_len = sizeof(sin6_d); /* ditto */ sin6_d.sin6_addr = ip6->ip6_dst; sin6_d.sin6_scope_id = in6_addr2scopeid(m->m_pkthdr.rcvif, &ip6->ip6_dst); -#ifndef SCOPEDROUTING - in6_embedscope(&sin6_d.sin6_addr, &sin6_d, NULL, NULL); -#endif + in6_embedscope(&sin6_d.sin6_addr, &sin6_d, NULL, NULL, + NULL); subj = (char *)&sin6; if (SA6_ARE_ADDR_EQUAL(&sin6, &sin6_d)) break; @@ -1333,7 +1385,8 @@ ni6_input(m, off) * XXX if we are to allow other cases, we should really * be careful about scope here. * basically, we should disallow queries toward IPv6 - * destination X with subject Y, if scope(X) > scope(Y). + * destination X with subject Y, + * if scope(X) > scope(Y). * if we allow scope(X) > scope(Y), it will result in * information leakage across scope boundary. */ @@ -1376,11 +1429,12 @@ ni6_input(m, off) /* refuse based on configuration. XXX ICMP6_NI_REFUSED? 
*/ switch (qtype) { case NI_QTYPE_FQDN: - if ((icmp6_nodeinfo & 1) == 0) + if ((icmp6_nodeinfo & ICMP6_NODEINFO_FQDNOK) == 0) goto bad; break; case NI_QTYPE_NODEADDR: - if ((icmp6_nodeinfo & 2) == 0) + case NI_QTYPE_IPV4ADDR: + if ((icmp6_nodeinfo & ICMP6_NODEINFO_NODEADDROK) == 0) goto bad; break; } @@ -1399,13 +1453,16 @@ ni6_input(m, off) case NI_QTYPE_NODEADDR: addrs = ni6_addrs(ni6, &ifp, subj); if ((replylen += addrs * (sizeof(struct in6_addr) + - sizeof(u_int32_t))) > MCLBYTES) + sizeof(u_int32_t))) > MCLBYTES) replylen = MCLBYTES; /* XXX: will truncate pkt later */ break; + case NI_QTYPE_IPV4ADDR: + /* unsupported - should respond with unknown Qtype? */ + break; default: /* * XXX: We must return a reply with the ICMP6 code - * `unknown Qtype' in this case. However we regard the case + * `unknown Qtype' in this case. However we regard the case * as an FQDN query for backward compatibility. * Older versions set a random value to this field, * so it rarely varies in the defined qtypes. @@ -1423,7 +1480,9 @@ ni6_input(m, off) MGETHDR(n, M_DONTWAIT, m->m_type); /* MAC-OK */ if (n == NULL) { m_freem(m); - return(NULL); + if (ifp != NULL) + ifnet_release(ifp); + return (NULL); } M_COPY_PKTHDR(n, m); /* just for recvif */ if (replylen > MHLEN) { @@ -1500,13 +1559,17 @@ ni6_input(m, off) nni6->ni_type = ICMP6_NI_REPLY; m_freem(m); - return(n); + if (ifp != NULL) + ifnet_release(ifp); + return (n); - bad: +bad: m_freem(m); if (n) m_freem(n); - return(NULL); + if (ifp != NULL) + ifnet_release(ifp); + return (NULL); } #undef hostnamelen @@ -1693,6 +1756,9 @@ ni6_addrs(ni6, ifpp, subj) int addrs = 0, addrsofif, iffound = 0; int niflags = ni6->ni_flags; + if (ifpp != NULL) + *ifpp = NULL; + if ((niflags & NI_NODEADDR_FLAG_ALL) == 0) { switch (ni6->ni_code) { case ICMP6_NI_SUBJ_IPV6: @@ -1705,7 +1771,7 @@ ni6_addrs(ni6, ifpp, subj) * XXX: we only support IPv6 subject address for * this Qtype. */ - return(0); + return (0); } } @@ -1715,8 +1781,11 @@ ni6_addrs(ni6, ifpp, subj) ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; + } ifa6 = (struct in6_ifaddr *)ifa; if ((niflags & NI_NODEADDR_FLAG_ALL) == 0 && @@ -1737,18 +1806,25 @@ ni6_addrs(ni6, ifpp, subj) /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: - if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) + if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) { + IFA_UNLOCK(ifa); continue; + } break; case IPV6_ADDR_SCOPE_SITELOCAL: - if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) + if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) { + IFA_UNLOCK(ifa); continue; + } break; case IPV6_ADDR_SCOPE_GLOBAL: - if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) + if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) { + IFA_UNLOCK(ifa); continue; + } break; default: + IFA_UNLOCK(ifa); continue; } @@ -1757,17 +1833,24 @@ ni6_addrs(ni6, ifpp, subj) * XXX: just experimental. not in the spec. 
*/ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && - (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) + (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) { + IFA_UNLOCK(ifa); continue; /* we need only unicast addresses */ + } if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && - (icmp6_nodeinfo & 4) == 0) { + (icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { + IFA_UNLOCK(ifa); continue; } addrsofif++; /* count the address */ + IFA_UNLOCK(ifa); } ifnet_lock_done(ifp); if (iffound) { - *ifpp = ifp; + if (ifpp != NULL) { + *ifpp = ifp; + ifnet_reference(ifp); + } ifnet_head_done(); return(addrsofif); } @@ -1776,7 +1859,7 @@ ni6_addrs(ni6, ifpp, subj) } ifnet_head_done(); - return(addrs); + return (addrs); } static int @@ -1798,20 +1881,23 @@ ni6_store_addrs(ni6, nni6, ifp0, resid) getmicrotime(&timenow); if (ifp0 == NULL && !(niflags & NI_NODEADDR_FLAG_ALL)) - return(0); /* needless to copy */ + return (0); /* needless to copy */ again: ifnet_head_lock_shared(); - if (ifp == NULL) ifp = TAILQ_FIRST(&ifnet_head); - + if (ifp == NULL) + ifp = TAILQ_FIRST(&ifnet_head); + for (; ifp; ifp = TAILQ_NEXT(ifp, if_list)) { ifnet_lock_shared(ifp); for (ifa = ifp->if_addrlist.tqh_first; ifa; - ifa = ifa->ifa_list.tqe_next) - { - if (ifa->ifa_addr->sa_family != AF_INET6) + ifa = ifa->ifa_list.tqe_next) { + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; + } ifa6 = (struct in6_ifaddr *)ifa; if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) != 0 && @@ -1825,45 +1911,57 @@ ni6_store_addrs(ni6, nni6, ifp0, resid) if (ifp_dep == NULL) ifp_dep = ifp; + IFA_UNLOCK(ifa); continue; - } - else if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) == 0 && - allow_deprecated != 0) + } else if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) == 0 && + allow_deprecated != 0) { + IFA_UNLOCK(ifa); continue; /* we now collect deprecated addrs */ - + } /* What do we have to do about ::1? */ switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) { case IPV6_ADDR_SCOPE_LINKLOCAL: - if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) + if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0) { + IFA_UNLOCK(ifa); continue; + } break; case IPV6_ADDR_SCOPE_SITELOCAL: - if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) + if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0) { + IFA_UNLOCK(ifa); continue; + } break; case IPV6_ADDR_SCOPE_GLOBAL: - if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) + if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0) { + IFA_UNLOCK(ifa); continue; + } break; default: + IFA_UNLOCK(ifa); continue; } /* * check if anycast is okay. - * XXX: just experimental. not in the spec. + * XXX: just experimental. not in the spec. */ if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 && - (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) + (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0) { + IFA_UNLOCK(ifa); continue; + } if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && - (icmp6_nodeinfo & 4) == 0) { + (icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) { + IFA_UNLOCK(ifa); continue; } /* now we can copy the address */ if (resid < sizeof(struct in6_addr) + sizeof(u_int32_t)) { + IFA_UNLOCK(ifa); /* * We give up much more copy. * Set the truncate flag and return. @@ -1890,7 +1988,8 @@ ni6_store_addrs(ni6, nni6, ifp0, resid) * address configuration by DHCPv6, so the former * case can't happen. 
*/ - if (ifa6->ia6_lifetime.ia6t_expire == 0) + if (ifa6->ia6_lifetime.ia6t_expire == 0 && + (ifa6->ia6_flags & IN6_IFF_TEMPORARY) == 0) ltime = ND6_INFINITE_LIFETIME; else { if (ifa6->ia6_lifetime.ia6t_expire > @@ -1899,7 +1998,7 @@ ni6_store_addrs(ni6, nni6, ifp0, resid) else ltime = 0; } - + bcopy(<ime, cp, sizeof(u_int32_t)); cp += sizeof(u_int32_t); @@ -1910,10 +2009,11 @@ ni6_store_addrs(ni6, nni6, ifp0, resid) if (IN6_IS_ADDR_LINKLOCAL(&ifa6->ia_addr.sin6_addr)) ((struct in6_addr *)cp)->s6_addr16[1] = 0; cp += sizeof(struct in6_addr); - + resid -= (sizeof(struct in6_addr) + sizeof(u_int32_t)); copied += (sizeof(struct in6_addr) + sizeof(u_int32_t)); + IFA_UNLOCK(ifa); } ifnet_lock_done(ifp); if (ifp0) /* we need search only on the specified IF */ @@ -1946,6 +2046,7 @@ icmp6_rip6_input(mp, off) struct sockaddr_in6 rip6src; struct icmp6_hdr *icmp6; struct mbuf *opts = NULL; + int ret = 0; #ifndef PULLDOWN_TEST /* this is assumed to be safe. */ @@ -1958,21 +2059,22 @@ icmp6_rip6_input(mp, off) } #endif + /* + * XXX: the address may have embedded scope zone ID, which should be + * hidden from applications. + */ bzero(&rip6src, sizeof(rip6src)); - rip6src.sin6_len = sizeof(struct sockaddr_in6); rip6src.sin6_family = AF_INET6; - /* KAME hack: recover scopeid */ - (void)in6_recoverscope(&rip6src, &ip6->ip6_src, m->m_pkthdr.rcvif); - + rip6src.sin6_len = sizeof(struct sockaddr_in6); + rip6src.sin6_addr = ip6->ip6_src; + if (sa6_recoverscope(&rip6src)) + return (IPPROTO_DONE); + lck_rw_lock_shared(ripcbinfo.mtx); LIST_FOREACH(in6p, &ripcb, inp_list) { if ((in6p->inp_vflag & INP_IPV6) == 0) continue; -#if HAVE_NRL_INPCB - if (!(in6p->in6p_flags & INP_IPV6)) - continue; -#endif if (in6p->in6p_ip6_nxt != IPPROTO_ICMPV6) continue; if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) && @@ -1988,10 +2090,20 @@ icmp6_rip6_input(mp, off) if (last) { struct mbuf *n; if ((n = m_copy(m, 0, (int)M_COPYALL)) != NULL) { - if (last->in6p_flags & IN6P_CONTROLOPTS) - ip6_savecontrol(last, &opts, ip6, n); + if ((last->in6p_flags & IN6P_CONTROLOPTS) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + ret = ip6_savecontrol(last, n, &opts); + if (ret != 0) { + m_freem(n); + m_freem(opts); + last = in6p; + continue; + } + } /* strip intermediate headers */ m_adj(n, off); + so_recv_data_stat(last->in6p_socket, m, 0); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&rip6src, n, opts, NULL) != 0) { @@ -2002,21 +2114,35 @@ icmp6_rip6_input(mp, off) } last = in6p; } - lck_rw_done(ripcbinfo.mtx); if (last) { - if (last->in6p_flags & IN6P_CONTROLOPTS) - ip6_savecontrol(last, &opts, ip6, m); + if ((last->in6p_flags & INP_CONTROLOPTS) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + ret = ip6_savecontrol(last, m, &opts); + if (ret != 0) { + goto error; + } + } /* strip intermediate headers */ m_adj(m, off); + so_recv_data_stat(last->in6p_socket, m, 0); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&rip6src, m, opts, NULL) != 0) { sorwakeup(last->in6p_socket); } } else { - m_freem(m); - ip6stat.ip6s_delivered--; + goto error; } + lck_rw_done(ripcbinfo.mtx); return IPPROTO_DONE; + +error: + lck_rw_done(ripcbinfo.mtx); + m_freem(m); + m_freem(opts); + ip6stat.ip6s_delivered--; + return IPPROTO_DONE; + } /* @@ -2036,11 +2162,11 @@ icmp6_reflect(m, off) int type, code; struct ifnet *outif = NULL; struct sockaddr_in6 sa6_src, sa6_dst; 
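The ni6_store_addrs() change just above stops reporting an infinite lifetime for temporary addresses whose ia6t_expire happens to be 0; only non-temporary addresses with a zero expiry are treated as infinite. A simplified sketch of that computation follows; remaining_lifetime is a hypothetical name and the 32-bit times stand in for the kernel's time_t values.

#include <stdint.h>
#include <stdio.h>

#define ND6_INFINITE_LIFETIME 0xffffffff	/* sentinel used by the KAME stack */

static uint32_t
remaining_lifetime(uint32_t expire, uint32_t now, int is_temporary)
{
	/* zero expiry means "never expires", but only for
	 * non-temporary addresses */
	if (expire == 0 && !is_temporary)
		return ND6_INFINITE_LIFETIME;
	/* otherwise clamp the remaining time at zero */
	return (expire > now) ? expire - now : 0;
}

int
main(void)
{
	printf("%u\n", (unsigned)remaining_lifetime(0, 100, 0));	/* infinite */
	printf("%u\n", (unsigned)remaining_lifetime(150, 100, 1));	/* 50s left */
	printf("%u\n", (unsigned)remaining_lifetime(90, 100, 1));	/* expired -> 0 */
	return 0;
}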
-#ifdef COMPAT_RFC1885 - int mtu = IPV6_MMTU; - struct sockaddr_in6 *sin6 = &icmp6_reflect_rt.ro_dst; -#endif u_int32_t oflow; + struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 }; + + if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL) + ip6oa.ip6oa_boundif = m->m_pkthdr.rcvif->if_index; /* too short to reflect */ if (off < sizeof(struct ip6_hdr)) { @@ -2098,74 +2224,42 @@ icmp6_reflect(m, off) * XXX: make sure to embed scope zone information, using * already embedded IDs or the received interface (if any). * Note that rcvif may be NULL. - * TODO: scoped routing case (XXX). */ bzero(&sa6_src, sizeof(sa6_src)); sa6_src.sin6_family = AF_INET6; sa6_src.sin6_len = sizeof(sa6_src); sa6_src.sin6_addr = ip6->ip6_dst; in6_recoverscope(&sa6_src, &ip6->ip6_dst, m->m_pkthdr.rcvif); - in6_embedscope(&ip6->ip6_dst, &sa6_src, NULL, NULL); + in6_embedscope(&ip6->ip6_dst, &sa6_src, NULL, NULL, NULL); bzero(&sa6_dst, sizeof(sa6_dst)); sa6_dst.sin6_family = AF_INET6; sa6_dst.sin6_len = sizeof(sa6_dst); sa6_dst.sin6_addr = t; in6_recoverscope(&sa6_dst, &t, m->m_pkthdr.rcvif); - in6_embedscope(&t, &sa6_dst, NULL, NULL); + in6_embedscope(&t, &sa6_dst, NULL, NULL, NULL); -#ifdef COMPAT_RFC1885 - /* - * xxx guess MTU - * RFC 1885 requires that echo reply should be truncated if it - * does not fit in with (return) path MTU, but the description was - * removed in the new spec. - */ - if (icmp6_reflect_rt.ro_rt == NULL || - !(icmp6_reflect_rt.ro_rt->rt_flags & RTF_UP) || - icmp6_reflect_rt.ro_rt->generation_id != route_generation || - ! (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, &ip6->ip6_dst))) { - if (icmp6_reflect_rt.ro_rt) { - rtfree(icmp6_reflect_rt.ro_rt); - icmp6_reflect_rt.ro_rt = 0; - } - bzero(sin6, sizeof(*sin6)); - sin6->sin6_family = PF_INET6; - sin6->sin6_len = sizeof(struct sockaddr_in6); - sin6->sin6_addr = ip6->ip6_dst; - - rtalloc_ign((struct route *)&icmp6_reflect_rt.ro_rt, - RTF_PRCLONING); - } - - if (icmp6_reflect_rt.ro_rt == 0) - goto bad; - - RT_LOCK(icmp6_reflect_rt.ro_rt); - if ((icmp6_reflect_rt.ro_rt->rt_flags & RTF_HOST) - && mtu < icmp6_reflect_rt.ro_rt->rt_ifp->if_mtu) - mtu = icmp6_reflect_rt.ro_rt->rt_rmx.rmx_mtu; - RT_UNLOCK(icmp6_reflect_rt.ro_rt); - - if (mtu < m->m_pkthdr.len) { - plen -= (m->m_pkthdr.len - mtu); - m_adj(m, mtu - m->m_pkthdr.len); - } -#endif /* * If the incoming packet was addressed directly to us(i.e. unicast), * use dst as the src for the reply. - * The IN6_IFF_NOTREADY case would be VERY rare, but is possible + * The IN6_IFF_NOTREADY case should be VERY rare, but is possible * (for example) when we encounter an error while forwarding procedure * destined to a duplicated address of ours. + * Note that ip6_getdstifaddr() may fail if we are in an error handling + * procedure of an outgoing packet of our own, in which case we need + * to search in the ifaddr list. 
*/ - lck_mtx_lock(nd6_mutex); - for (ia = in6_ifaddrs; ia; ia = ia->ia_next) + lck_rw_lock_shared(&in6_ifaddr_rwlock); + for (ia = in6_ifaddrs; ia; ia = ia->ia_next) { + IFA_LOCK(&ia->ia_ifa); if (IN6_ARE_ADDR_EQUAL(&t, &ia->ia_addr.sin6_addr) && (ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY)) == 0) { + IFA_UNLOCK(&ia->ia_ifa); src = &t; break; } - lck_mtx_unlock(nd6_mutex); + IFA_UNLOCK(&ia->ia_ifa); + } + lck_rw_done(&in6_ifaddr_rwlock); if (ia == NULL && IN6_IS_ADDR_LINKLOCAL(&t) && (m->m_flags & M_LOOP)) { /* * This is the case if the dst is our link-local address @@ -2174,8 +2268,9 @@ icmp6_reflect(m, off) src = &t; } - if (src == 0) { + if (src == NULL) { int e; + struct sockaddr_in6 sin6; struct route_in6 ro; /* @@ -2183,8 +2278,14 @@ icmp6_reflect(m, off) * that we do not own. Select a source address based on the * source address of the erroneous packet. */ + bzero(&sin6, sizeof(sin6)); + sin6.sin6_family = AF_INET6; + sin6.sin6_len = sizeof(sin6); + sin6.sin6_addr = ip6->ip6_dst; /* zone ID should be embedded */ + bzero(&ro, sizeof(ro)); - src = in6_selectsrc(&sa6_src, NULL, NULL, &ro, NULL, &src_storage, &e); + src = in6_selectsrc(&sin6, NULL, NULL, &ro, &outif, + &src_storage, ip6oa.ip6oa_boundif, &e); if (ro.ro_rt) rtfree(ro.ro_rt); /* XXX: we could use this */ if (src == NULL) { @@ -2195,10 +2296,8 @@ icmp6_reflect(m, off) goto bad; } } - - ip6->ip6_src = *src; - oflow = ip6->ip6_flow; /* Save for later */ + ip6->ip6_src = *src; ip6->ip6_flow = 0; ip6->ip6_vfc &= ~IPV6_VERSION_MASK; ip6->ip6_vfc |= IPV6_VERSION; @@ -2206,14 +2305,16 @@ icmp6_reflect(m, off) ip6->ip6_flow |= (oflow & htonl(0x0ff00000)); } ip6->ip6_nxt = IPPROTO_ICMPV6; + lck_rw_lock_shared(nd_if_rwlock); + if (outif) + ip6->ip6_hlim = ND_IFINFO(outif)->chlim; if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_index < nd_ifinfo_indexlim) { /* XXX: This may not be the outgoing interface */ - lck_rw_lock_shared(nd_if_rwlock); ip6->ip6_hlim = nd_ifinfo[m->m_pkthdr.rcvif->if_index].chlim; - lck_rw_done(nd_if_rwlock); } else { ip6->ip6_hlim = ip6_defhlim; } + lck_rw_done(nd_if_rwlock); /* Use the same traffic class as in the request to match IPv4 */ icmp6->icmp6_cksum = 0; icmp6->icmp6_cksum = in6_cksum(m, IPPROTO_ICMPV6, @@ -2230,28 +2331,24 @@ icmp6_reflect(m, off) (void)ipsec_setsocket(m, NULL); #endif /*IPSEC*/ -#ifdef COMPAT_RFC1885 - ip6_output(m, NULL, &icmp6_reflect_rt, 0, NULL, &outif, 0); -#else - ip6_output(m, NULL, NULL, 0, NULL, &outif, 0); -#endif - if (outif) + if (outif != NULL) { + ifnet_release(outif); + outif = NULL; + } + ip6_output(m, NULL, NULL, IPV6_OUTARGS, NULL, &outif, &ip6oa); + if (outif != NULL) { icmp6_ifoutstat_inc(outif, type, code); - + ifnet_release(outif); + } return; - bad: +bad: m_freem(m); + if (outif != NULL) + ifnet_release(outif); return; } -void -icmp6_fasttimo() -{ - - mld6_fasttimeo(); -} - static const char * icmp6_redirect_diag(src6, dst6, tgt6) struct in6_addr *src6; @@ -2307,10 +2404,10 @@ icmp6_redirect_input(m, off) redtgt6 = nd_rd->nd_rd_target; reddst6 = nd_rd->nd_rd_dst; - if (IN6_IS_ADDR_LINKLOCAL(&redtgt6)) - redtgt6.s6_addr16[1] = htons(ifp->if_index); - if (IN6_IS_ADDR_LINKLOCAL(&reddst6)) - reddst6.s6_addr16[1] = htons(ifp->if_index); + if (in6_setscope(&redtgt6, m->m_pkthdr.rcvif, NULL) || + in6_setscope(&reddst6, m->m_pkthdr.rcvif, NULL)) { + goto freeit; + } /* validation */ if (!IN6_IS_ADDR_LINKLOCAL(&src6)) { @@ -2335,7 +2432,7 @@ icmp6_redirect_input(m, off) sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); bcopy(&reddst6, 
&sin6.sin6_addr, sizeof(reddst6)); - rt = rtalloc1((struct sockaddr *)&sin6, 0, 0); + rt = rtalloc1_scoped((struct sockaddr *)&sin6, 0, 0, ifp->if_index); if (rt) { RT_LOCK(rt); if (rt->rt_gateway == NULL || @@ -2494,6 +2591,7 @@ icmp6_redirect_output(m0, rt) u_char *p; struct ifnet *outif = NULL; struct sockaddr_in6 src_sa; + struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 }; icmp6_errcount(&icmp6stat.icp6s_outerrhist, ND_REDIRECT, 0); @@ -2565,8 +2663,10 @@ icmp6_redirect_output(m0, rt) IN6_IFF_NOTREADY| IN6_IFF_ANYCAST)) == NULL) goto fail; + IFA_LOCK(&ia->ia_ifa); ifp_ll6 = ia->ia_addr.sin6_addr; - ifafree(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); } /* get ip6 linklocal address for the router. */ @@ -2622,42 +2722,44 @@ icmp6_redirect_output(m0, rt) if (!router_ll6) goto nolladdropt; - { - /* target lladdr option */ - struct rtentry *rt_router = NULL; - int len; - struct sockaddr_dl *sdl; - struct nd_opt_hdr *nd_opt; - char *lladdr; - - /* Callee returns a locked route upon success */ - rt_router = nd6_lookup(router_ll6, 0, ifp, 0); - if (!rt_router) - goto nolladdropt; - RT_LOCK_ASSERT_HELD(rt_router); - len = sizeof(*nd_opt) + ifp->if_addrlen; - len = (len + 7) & ~7; /* round by 8 */ - /* safety check */ - if (len + (p - (u_char *)ip6) > maxlen) { + { + /* target lladdr option */ + struct rtentry *rt_router = NULL; + int len; + struct sockaddr_dl *sdl; + struct nd_opt_hdr *nd_opt; + char *lladdr; + + /* Callee returns a locked route upon success */ + rt_router = nd6_lookup(router_ll6, 0, ifp, 0); + if (!rt_router) + goto nolladdropt; + RT_LOCK_ASSERT_HELD(rt_router); + len = sizeof(*nd_opt) + ifp->if_addrlen; + len = (len + 7) & ~7; /* round by 8 */ + /* safety check */ + if (len + (p - (u_char *)ip6) > maxlen) { + RT_REMREF_LOCKED(rt_router); + RT_UNLOCK(rt_router); + goto nolladdropt; + } + + if (!(rt_router->rt_flags & RTF_GATEWAY) && + (rt_router->rt_flags & RTF_LLINFO) && + (rt_router->rt_gateway->sa_family == AF_LINK) && + (sdl = (struct sockaddr_dl *)rt_router->rt_gateway) && + sdl->sdl_alen) { + nd_opt = (struct nd_opt_hdr *)p; + nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; + nd_opt->nd_opt_len = len >> 3; + lladdr = (char *)(nd_opt + 1); + bcopy(LLADDR(sdl), lladdr, ifp->if_addrlen); + p += len; + } RT_REMREF_LOCKED(rt_router); RT_UNLOCK(rt_router); - goto nolladdropt; - } - if (!(rt_router->rt_flags & RTF_GATEWAY) && - (rt_router->rt_flags & RTF_LLINFO) && - (rt_router->rt_gateway->sa_family == AF_LINK) && - (sdl = (struct sockaddr_dl *)rt_router->rt_gateway) && - sdl->sdl_alen) { - nd_opt = (struct nd_opt_hdr *)p; - nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; - nd_opt->nd_opt_len = len >> 3; - lladdr = (char *)(nd_opt + 1); - bcopy(LLADDR(sdl), lladdr, ifp->if_addrlen); - p += len; - } - RT_REMREF_LOCKED(rt_router); - RT_UNLOCK(rt_router); - } + } + nolladdropt:; m->m_pkthdr.len = m->m_len = p - (u_char *)ip6; @@ -2741,20 +2843,11 @@ nolladdropt:; } noredhdropt:; - if (IN6_IS_ADDR_LINKLOCAL(&sip6->ip6_src)) - sip6->ip6_src.s6_addr16[1] = 0; - if (IN6_IS_ADDR_LINKLOCAL(&sip6->ip6_dst)) - sip6->ip6_dst.s6_addr16[1] = 0; -#if 0 - if (IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) - ip6->ip6_src.s6_addr16[1] = 0; - if (IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_dst)) - ip6->ip6_dst.s6_addr16[1] = 0; -#endif - if (IN6_IS_ADDR_LINKLOCAL(&nd_rd->nd_rd_target)) - nd_rd->nd_rd_target.s6_addr16[1] = 0; - if (IN6_IS_ADDR_LINKLOCAL(&nd_rd->nd_rd_dst)) - nd_rd->nd_rd_dst.s6_addr16[1] = 0; + /* XXX: clear embedded link IDs in the inner header */ + in6_clearscope(&sip6->ip6_src); 
+ in6_clearscope(&sip6->ip6_dst); + in6_clearscope(&nd_rd->nd_rd_target); + in6_clearscope(&nd_rd->nd_rd_dst); ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr)); @@ -2768,10 +2861,14 @@ noredhdropt:; if (ipsec_bypass == 0) (void)ipsec_setsocket(m, NULL); #endif /*IPSEC*/ - ip6_output(m, NULL, NULL, 0, NULL, &outif, 0); + + ip6oa.ip6oa_boundif = ifp->if_index; + + ip6_output(m, NULL, NULL, IPV6_OUTARGS, NULL, &outif, &ip6oa); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_redirect); + ifnet_release(outif); } icmp6stat.icp6s_outhist[ND_REDIRECT]++; @@ -2786,11 +2883,6 @@ fail: m_freem(m0); } -#if HAVE_NRL_INPCB -#define sotoin6pcb sotoinpcb -#define in6pcb inpcb -#define in6p_icmp6filt inp_icmp6filt -#endif /* * ICMPv6 socket option processing. */ @@ -2823,7 +2915,7 @@ icmp6_ctloutput(so, sopt) { struct icmp6_filter *p; - if (optlen != sizeof(*p)) { + if (optlen != 0 && optlen != sizeof(*p)) { error = EMSGSIZE; break; } @@ -2831,8 +2923,17 @@ icmp6_ctloutput(so, sopt) error = EINVAL; break; } - error = sooptcopyin(sopt, inp->in6p_icmp6filt, optlen, - optlen); + + if (optlen == 0) { + /* According to RFC 3542, an installed filter can be + * cleared by issuing a setsockopt for ICMP6_FILTER + * with a zero length. + */ + ICMP6_FILTER_SETPASSALL(inp->in6p_icmp6filt); + } else { + error = sooptcopyin(sopt, inp->in6p_icmp6filt, optlen, + optlen); + } break; } @@ -2851,7 +2952,7 @@ icmp6_ctloutput(so, sopt) break; } error = sooptcopyout(sopt, inp->in6p_icmp6filt, - sizeof(struct icmp6_filter)); + min(sizeof(struct icmp6_filter), optlen)); break; } @@ -2864,11 +2965,6 @@ icmp6_ctloutput(so, sopt) return(error); } -#if HAVE_NRL_INPCB -#undef sotoin6pcb -#undef in6pcb -#undef in6p_icmp6filt -#endif /* * ICMPv6 socket datagram option processing. 
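The icmp6_ctloutput() changes above accept a zero-length ICMP6_FILTER setsockopt and treat it as "clear the installed filter", as RFC 3542 requires, and the getsockopt path now bounds the copy-out by the caller's option length. From user space the interface looks like the sketch below; the calls are the standard RFC 3542 API, though open_echo_only_socket is a made-up name and error handling is elided.

#include <netinet/icmp6.h>
#include <netinet/in.h>
#include <sys/socket.h>

int
open_echo_only_socket(void)
{
	struct icmp6_filter filt;
	int fd = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6);

	ICMP6_FILTER_SETBLOCKALL(&filt);		/* drop everything... */
	ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filt);	/* ...except echo replies */
	(void)setsockopt(fd, IPPROTO_ICMPV6, ICMP6_FILTER,
	    &filt, sizeof(filt));

	/*
	 * Per RFC 3542 (and the change above), a zero-length setsockopt
	 * clears the filter back to pass-all:
	 *
	 *	setsockopt(fd, IPPROTO_ICMPV6, ICMP6_FILTER, NULL, 0);
	 */
	return fd;
}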
@@ -2892,16 +2988,19 @@ icmp6_dgram_ctloutput(struct socket *so, struct sockopt *sopt) return EINVAL; switch (sopt->sopt_name) { - case IPV6_PKTOPTIONS: case IPV6_UNICAST_HOPS: case IPV6_CHECKSUM: case IPV6_FAITH: case IPV6_V6ONLY: + case IPV6_USE_MIN_MTU: + case IPV6_RECVRTHDR: + case IPV6_RECVPKTINFO: + case IPV6_RECVHOPLIMIT: + case IPV6_PATHMTU: case IPV6_PKTINFO: case IPV6_HOPLIMIT: case IPV6_HOPOPTS: case IPV6_DSTOPTS: - case IPV6_RTHDR: case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: @@ -2911,6 +3010,15 @@ icmp6_dgram_ctloutput(struct socket *so, struct sockopt *sopt) case IPV6_IPSEC_POLICY: case IPV6_RECVTCLASS: case IPV6_TCLASS: + case IPV6_2292PKTOPTIONS: + case IPV6_2292PKTINFO: + case IPV6_2292HOPLIMIT: + case IPV6_2292HOPOPTS: + case IPV6_2292DSTOPTS: + case IPV6_2292RTHDR: + case IPV6_BOUND_IF: + case IPV6_NO_IFT_CELLULAR: + return ip6_ctloutput(so, sopt); default: @@ -2921,23 +3029,24 @@ icmp6_dgram_ctloutput(struct socket *so, struct sockopt *sopt) } __private_extern__ int -icmp6_dgram_send(struct socket *so, __unused int flags, struct mbuf *m, struct sockaddr *nam, - struct mbuf *control, __unused struct proc *p) +icmp6_dgram_send(struct socket *so, int flags, struct mbuf *m, + struct sockaddr *nam, struct mbuf *control, struct proc *p) { +#pragma unused(flags, p) int error = 0; struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 tmp; - struct sockaddr_in6 *dst; + struct sockaddr_in6 *dst = (struct sockaddr_in6 *)nam; struct icmp6_hdr *icmp6; if (so->so_uid == 0) - return rip6_output(m, so, (struct sockaddr_in6 *) nam, control); + return rip6_output(m, so, (struct sockaddr_in6 *) nam, control, 0); /* always copy sockaddr to avoid overwrites */ if (so->so_state & SS_ISCONNECTED) { if (nam) { - m_freem(m); - return EISCONN; + m_freem(m); + return EISCONN; } /* XXX */ bzero(&tmp, sizeof(tmp)); @@ -2948,8 +3057,8 @@ icmp6_dgram_send(struct socket *so, __unused int flags, struct mbuf *m, struct s dst = &tmp; } else { if (nam == NULL) { - m_freem(m); - return ENOTCONN; + m_freem(m); + return ENOTCONN; } tmp = *(struct sockaddr_in6 *)nam; dst = &tmp; @@ -2988,7 +3097,7 @@ icmp6_dgram_send(struct socket *so, __unused int flags, struct mbuf *m, struct s } #endif - return rip6_output(m, so, (struct sockaddr_in6 *) nam, control); + return rip6_output(m, so, (struct sockaddr_in6 *) nam, control, 0); bad: m_freem(m); return error; @@ -3124,4 +3233,3 @@ icmp6_ratelimit( return ret; } - diff --git a/bsd/netinet6/in6.c b/bsd/netinet6/in6.c index a9fd82b98..f11a99041 100644 --- a/bsd/netinet6/in6.c +++ b/bsd/netinet6/in6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2009 Apple Inc. All rights reserved. + * Copyright (c) 2003-2011 Apple Inc. All rights reserved. 
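/*
 * [Editor's note, not part of the patch] icmp6_dgram_send(), revised
 * in the hunks above, backs unprivileged (SOCK_DGRAM) ICMPv6
 * sockets: a non-root sender must supply a well-formed ICMPv6 header
 * and only certain message types (echo requests, for instance) are
 * let through, while root is routed straight to rip6_output().  A
 * minimal userland echo request over such a socket (illustrative;
 * error handling trimmed, the kernel computes the checksum):
 */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/icmp6.h>

static void
dgram_ping6(const struct sockaddr_in6 *dst)
{
        struct icmp6_hdr hdr;
        int s = socket(AF_INET6, SOCK_DGRAM, IPPROTO_ICMPV6);

        if (s == -1)
                return;
        memset(&hdr, 0, sizeof (hdr));
        hdr.icmp6_type = ICMP6_ECHO_REQUEST;
        hdr.icmp6_code = 0;
        (void) sendto(s, &hdr, sizeof (hdr), 0,
            (const struct sockaddr *)dst, sizeof (*dst));
        close(s);
}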
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -106,6 +106,8 @@ #include <sys/kernel.h> #include <sys/syslog.h> #include <sys/kern_event.h> +#include <sys/mcache.h> +#include <sys/protosw.h> #include <kern/locks.h> #include <kern/zalloc.h> @@ -122,11 +124,10 @@ #include <netinet/in.h> #include <netinet/in_var.h> #include <netinet/if_ether.h> -#ifndef SCOPEDROUTING #include <netinet/in_systm.h> #include <netinet/ip.h> #include <netinet/in_pcb.h> -#endif +#include <netinet/icmp6.h> #include <netinet6/nd6.h> #include <netinet/ip6.h> @@ -135,9 +136,8 @@ #include <netinet6/ip6_mroute.h> #include <netinet6/in6_ifattach.h> #include <netinet6/scope6_var.h> -#ifndef SCOPEDROUTING +#include <netinet6/in6_var.h> #include <netinet6/in6_pcb.h> -#endif #include <net/net_osdep.h> @@ -145,9 +145,6 @@ #include <net/pfvar.h> #endif /* PF */ -#ifndef __APPLE__ -MALLOC_DEFINE(M_IPMADDR, "in6_multi", "internet multicast address"); -#endif /* * Definitions of some costant IP6 addresses. */ @@ -159,8 +156,12 @@ const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; +const struct in6_addr in6addr_linklocal_allv2routers = + IN6ADDR_LINKLOCAL_ALLV2ROUTERS_INIT; const struct in6_addr in6mask0 = IN6MASK0; +const struct in6_addr in6mask7 = IN6MASK7; +const struct in6_addr in6mask16 = IN6MASK16; const struct in6_addr in6mask32 = IN6MASK32; const struct in6_addr in6mask64 = IN6MASK64; const struct in6_addr in6mask96 = IN6MASK96; @@ -173,36 +174,53 @@ static int in6_lifaddr_ioctl(struct socket *, u_long, caddr_t, struct ifnet *, struct proc *); static int in6_ifinit(struct ifnet *, struct in6_ifaddr *, struct sockaddr_in6 *, int); -static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *, int); +static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *); static struct in6_ifaddr *in6_ifaddr_alloc(int); +static void in6_ifaddr_attached(struct ifaddr *); +static void in6_ifaddr_detached(struct ifaddr *); static void in6_ifaddr_free(struct ifaddr *); static void in6_ifaddr_trace(struct ifaddr *, int); static struct in6_aliasreq *in6_aliasreq_to_native(void *, int, struct in6_aliasreq *); -struct in6_multihead in6_multihead; /* XXX BSS initialization */ extern lck_mtx_t *nd6_mutex; -extern lck_mtx_t *ip6_mutex; extern int in6_init2done; +#define IN6IFA_TRACE_HIST_SIZE 32 /* size of trace history */ + +/* For gdb */ +__private_extern__ unsigned int in6ifa_trace_hist_size = IN6IFA_TRACE_HIST_SIZE; + struct in6_ifaddr_dbg { struct in6_ifaddr in6ifa; /* in6_ifaddr */ struct in6_ifaddr in6ifa_old; /* saved in6_ifaddr */ - u_int16_t in6ifa_refhold_cnt; /* # of ifaref */ - u_int16_t in6ifa_refrele_cnt; /* # of ifafree */ + u_int16_t in6ifa_refhold_cnt; /* # of IFA_ADDREF */ + u_int16_t in6ifa_refrele_cnt; /* # of IFA_REMREF */ /* * Alloc and free callers. */ ctrace_t in6ifa_alloc; ctrace_t in6ifa_free; /* - * Circular lists of ifaref and ifafree callers. + * Circular lists of IFA_ADDREF and IFA_REMREF callers. 
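/*
 * [Editor's note, not part of the patch] The debug structure above
 * keeps circular histories of IFA_ADDREF/IFA_REMREF callers plus a
 * trash-list linkage for freed entries.  Recording one event amounts
 * to bumping the per-direction counter and capturing a stack trace
 * into the corresponding slot; a sketch along these lines (the real
 * in6_ifaddr_trace() is declared above and may differ, e.g. by using
 * atomic counter updates):
 */
static void
in6_ifaddr_trace_sketch(struct ifaddr *ifa, int refhold)
{
        struct in6_ifaddr_dbg *dbg = (struct in6_ifaddr_dbg *)ifa;
        ctrace_t *tr;
        u_int16_t idx;

        if (refhold) {
                idx = dbg->in6ifa_refhold_cnt++ % IN6IFA_TRACE_HIST_SIZE;
                tr = &dbg->in6ifa_refhold[idx];
        } else {
                idx = dbg->in6ifa_refrele_cnt++ % IN6IFA_TRACE_HIST_SIZE;
                tr = &dbg->in6ifa_refrele[idx];
        }
        ctrace_record(tr);      /* ctrace_record() from <sys/mcache.h> */
}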
+ */ + ctrace_t in6ifa_refhold[IN6IFA_TRACE_HIST_SIZE]; + ctrace_t in6ifa_refrele[IN6IFA_TRACE_HIST_SIZE]; + /* + * Trash list linkage */ - ctrace_t in6ifa_refhold[CTRACE_HIST_SIZE]; - ctrace_t in6ifa_refrele[CTRACE_HIST_SIZE]; + TAILQ_ENTRY(in6_ifaddr_dbg) in6ifa_trash_link; }; -static unsigned int in6ifa_debug; /* debug flags */ +/* List of trash in6_ifaddr entries protected by in6ifa_trash_lock */ +static TAILQ_HEAD(, in6_ifaddr_dbg) in6ifa_trash_head; +static decl_lck_mtx_data(, in6ifa_trash_lock); + +#if DEBUG +static unsigned int in6ifa_debug = 1; /* debugging (enabled) */ +#else +static unsigned int in6ifa_debug; /* debugging (disabled) */ +#endif /* !DEBUG */ static unsigned int in6ifa_size; /* size of zone element */ static struct zone *in6ifa_zone; /* zone for in6_ifaddr */ @@ -232,6 +250,7 @@ in6_ifloop_request(int cmd, struct ifaddr *ifa) * would be happy. Note that we assume the caller of the function * (probably implicitly) set nd6_rtrequest() to ifa->ifa_rtrequest, * which changes the outgoing interface to the loopback interface. + * ifa_addr for INET6 is set once during init; no need to hold lock. */ lck_mtx_lock(rnh_lock); e = rtrequest_locked(cmd, ifa->ifa_addr, ifa->ifa_addr, @@ -290,7 +309,10 @@ in6_ifaddloop(struct ifaddr *ifa) { struct rtentry *rt; - /* If there is no loopback entry, allocate one. */ + /* + * If there is no loopback entry, allocate one. ifa_addr for + * INET6 is set once during init; no need to hold lock. + */ rt = rtalloc1(ifa->ifa_addr, 0, 0); if (rt != NULL) RT_LOCK(rt); @@ -312,7 +334,7 @@ in6_ifaddloop(struct ifaddr *ifa) * if it exists. */ static void -in6_ifremloop(struct ifaddr *ifa, int locked) +in6_ifremloop(struct ifaddr *ifa) { struct in6_ifaddr *ia; struct rtentry *rt; @@ -334,26 +356,29 @@ in6_ifremloop(struct ifaddr *ifa, int locked) * (probably p2p) interfaces. * XXX: we should avoid such a configuration in IPv6... */ - if (!locked) - lck_mtx_lock(nd6_mutex); + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); for (ia = in6_ifaddrs; ia; ia = ia->ia_next) { + IFA_LOCK(&ia->ia_ifa); if (IN6_ARE_ADDR_EQUAL(IFA_IN6(ifa), &ia->ia_addr.sin6_addr)) { ia_count++; - if (ia_count > 1) + if (ia_count > 1) { + IFA_UNLOCK(&ia->ia_ifa); break; + } } + IFA_UNLOCK(&ia->ia_ifa); } - if (!locked) - lck_mtx_unlock(nd6_mutex); + lck_rw_done(&in6_ifaddr_rwlock); if (ia_count == 1) { /* * Before deleting, check if a corresponding loopbacked host * route surely exists. With this check, we can avoid to * delete an interface direct route whose destination is same - * as the address being removed. This can happen when remofing + * as the address being removed. This can happen when removing * a subnet-router anycast address on an interface attahced - * to a shared medium. + * to a shared medium. ifa_addr for INET6 is set once during + * init; no need to hold lock. 
*/ rt = rtalloc1(ifa->ifa_addr, 0, 0); if (rt != NULL) { @@ -370,43 +395,6 @@ in6_ifremloop(struct ifaddr *ifa, int locked) } } -#if 0 -/* Not used */ -int -in6_ifindex2scopeid(idx) - int idx; -{ - struct ifnet *ifp; - struct ifaddr *ifa; - struct sockaddr_in6 *sin6; - - ifnet_head_lock_shared(); - if (idx <= 0 || if_index < idx) { - ifnet_head_done(); - return -1; - } - - ifp = ifindex2ifnet[idx]; - ifnet_head_done(); - - ifnet_lock_shared(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) - { - if (ifa->ifa_addr->sa_family != AF_INET6) - continue; - sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; - if (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr)) { - int scopeid = sin6->sin6_scope_id & 0xffff; - ifnet_lock_done(ifp); - return scopeid; - } - } - ifnet_lock_done(ifp); - - return -1; -} -#endif - int in6_mask2len(mask, lim0) @@ -416,8 +404,8 @@ in6_mask2len(mask, lim0) int x = 0, y; u_char *lim = lim0, *p; - if (lim0 == NULL || - lim0 - (u_char *)mask > sizeof(*mask)) /* ignore the scope_id part */ + /* ignore the scope_id part */ + if (lim0 == NULL || lim0 - (u_char *)mask > sizeof(*mask)) lim = (u_char *)mask + sizeof(*mask); for (p = (u_char *)mask; p < lim; x++, p++) { if (*p != 0xff) @@ -437,12 +425,12 @@ in6_mask2len(mask, lim0) */ if (p < lim) { if (y != 0 && (*p & (0x00ff >> y)) != 0) - return(-1); + return (-1); for (p = p + 1; p < lim; p++) if (*p != 0) - return(-1); + return (-1); } - + return x * 8 + y; } @@ -536,6 +524,25 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, return (mrt6_ioctl(cmd, data)); } #endif + + switch(cmd) { + case SIOCAADDRCTL_POLICY: + case SIOCDADDRCTL_POLICY: + if (!privileged) + return (EPERM); + return (in6_src_ioctl(cmd, data)); + } + + switch (cmd) { + case SIOCDRADD_IN6_32: + case SIOCDRADD_IN6_64: + case SIOCDRDEL_IN6_32: + case SIOCDRDEL_IN6_64: + if (!privileged) + return (EPERM); + return (defrtrlist_ioctl(cmd, data)); + } + if (ifp == NULL) return (EOPNOTSUPP); @@ -612,7 +619,16 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, } /* - * Point ifra and sa6 to the right places depending on the command. + * Find address for this interface, if it exists. + * + * In netinet code, we have checked ifra_addr in SIOCSIF*ADDR operation + * only, and used the first interface address as the target of other + * operations (without checking ifra_addr). This was because netinet + * code/API assumed at most 1 interface address per interface. + * Since IPv6 allows a node to assign multiple addresses + * on a single interface, we almost always look and check the + * presence of ifra_addr, and reject invalid ones here. + * It also decreases duplicated code among SIOC*_IN6 operations. */ switch (cmd) { case SIOCLL_START_32: @@ -643,6 +659,9 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: case SIOCGIFAFLAG_IN6: + case SIOCSNDFLUSH_IN6: + case SIOCSPFXFLUSH_IN6: + case SIOCSRTRFLUSH_IN6: case SIOCGIFALIFETIME_IN6: case SIOCSIFALIFETIME_IN6: case SIOCGIFSTAT_IN6: @@ -665,25 +684,39 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, /* NOTREACHED */ case SIOCAUTOCONF_STOP: { - struct in6_ifaddr *nia = NULL; - ifnet_lock_exclusive(ifp); ifp->if_eflags &= ~IFEF_ACCEPT_RTADVD; ifnet_lock_done(ifp); - /* nuke prefix list. 
this may try to remove some ifaddrs as well */ - in6_purgeprefix(ifp); - - /* removed autoconfigured address from interface */ - lck_mtx_lock(nd6_mutex); - for (ia = in6_ifaddrs; ia != NULL; ia = nia) { - nia = ia->ia_next; - if (ia->ia_ifa.ifa_ifp != ifp) + /* Remove autoconfigured address from interface */ + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + ia = in6_ifaddrs; + while (ia != NULL) { + if (ia->ia_ifa.ifa_ifp != ifp) { + ia = ia->ia_next; continue; - if (ia->ia6_flags & IN6_IFF_AUTOCONF) - in6_purgeaddr(&ia->ia_ifa, 1); + } + IFA_LOCK(&ia->ia_ifa); + if (ia->ia6_flags & IN6_IFF_AUTOCONF) { + IFA_ADDREF_LOCKED(&ia->ia_ifa); /* for us */ + IFA_UNLOCK(&ia->ia_ifa); + lck_rw_done(&in6_ifaddr_rwlock); + in6_purgeaddr(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); /* for us */ + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + /* + * Purging the address caused in6_ifaddr_rwlock + * to be dropped and reacquired; + * therefore search again from the beginning + * of in6_ifaddrs list. + */ + ia = in6_ifaddrs; + continue; + } + IFA_UNLOCK(&ia->ia_ifa); + ia = ia->ia_next; } - lck_mtx_unlock(nd6_mutex); + lck_rw_done(&in6_ifaddr_rwlock); return (0); } @@ -694,7 +727,7 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, * be done here. They are currently done in in6_ifattach() * for the interfaces that need it. */ - if (((ifp->if_type == IFT_PPP) || ((ifp->if_eflags & IFEF_NOAUTOIPV6LL) != 0)) && + if ((ifp->if_eflags & IFEF_NOAUTOIPV6LL) != 0 && ifra->ifra_addr.sin6_family == AF_INET6 && ifra->ifra_dstaddr.sin6_family == AF_INET6) { /* some interfaces may provide LinkLocal addresses */ @@ -706,28 +739,41 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, /* NOTREACHED */ case SIOCLL_STOP: { - struct in6_ifaddr *nia = NULL; - - /* removed link local addresses from interface */ - - lck_mtx_lock(nd6_mutex); - for (ia = in6_ifaddrs; ia != NULL; ia = nia) { - nia = ia->ia_next; - if (ia->ia_ifa.ifa_ifp != ifp) + /* Remove link local addresses from interface */ + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + ia = in6_ifaddrs; + while (ia != NULL) { + if (ia->ia_ifa.ifa_ifp != ifp) { + ia = ia->ia_next; + continue; + } + IFA_LOCK(&ia->ia_ifa); + if (IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) { + IFA_ADDREF_LOCKED(&ia->ia_ifa); /* for us */ + IFA_UNLOCK(&ia->ia_ifa); + lck_rw_done(&in6_ifaddr_rwlock); + in6_purgeaddr(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); /* for us */ + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + /* + * Purging the address caused in6_ifaddr_rwlock + * to be dropped and reacquired; + * therefore search again from the beginning + * of in6_ifaddrs list. + */ + ia = in6_ifaddrs; continue; - if (IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) - in6_purgeaddr(&ia->ia_ifa, 1); + } + IFA_UNLOCK(&ia->ia_ifa); + ia = ia->ia_next; } - lck_mtx_unlock(nd6_mutex); + lck_rw_done(&in6_ifaddr_rwlock); return (0); } case SIOCPROTOATTACH_IN6_32: case SIOCPROTOATTACH_IN6_64: - if ((error = proto_plumb(PF_INET6, ifp))) - printf("SIOCPROTOATTACH_IN6: %s " - "error=%d\n", if_name(ifp), error); - return (error); + return (in6_domifattach(ifp)); /* NOTREACHED */ case SIOCPROTODETACH_IN6: @@ -772,8 +818,7 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, case SIOCSIFNETMASK_IN6: /* * Since IPv6 allows a node to assign multiple addresses - * on a single interface, SIOCSIFxxx ioctls are not suitable - * and should be unused. + * on a single interface, SIOCSIFxxx ioctls are deprecated. 
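/*
 * [Editor's note, not part of the patch] SIOCAUTOCONF_STOP and
 * SIOCLL_STOP above share one idiom: take a reference on the entry,
 * drop in6_ifaddr_rwlock across in6_purgeaddr() (which takes other
 * locks and reacquires this one), then restart the scan from the head
 * because the list may have changed meanwhile.  In schematic form,
 * with placeholder names (elem, wanted, retain, purge, release are
 * not kernel APIs):
 */
        lck_rw_lock_exclusive(&list_lock);
        elem = list_head;
        while (elem != NULL) {
                if (!wanted(elem)) {
                        elem = elem->next;
                        continue;
                }
                retain(elem);                   /* keep elem alive */
                lck_rw_done(&list_lock);
                purge(elem);                    /* may block, may relock */
                release(elem);
                lck_rw_lock_exclusive(&list_lock);
                elem = list_head;               /* rescan from the start */
        }
        lck_rw_done(&list_lock);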
*/ /* we decided to obsolete this command (20000704) */ error = EINVAL; @@ -782,10 +827,10 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, case SIOCDIFADDR_IN6: /* * for IPv4, we look for existing in_ifaddr here to allow - * "ifconfig if0 delete" to remove first IPv4 address on the - * interface. For IPv6, as the spec allow multiple interface - * address from the day one, we consider "remove the first one" - * semantics to be not preferable. + * "ifconfig if0 delete" to remove the first IPv4 address on + * the interface. For IPv6, as the spec allows multiple + * interface address from the day one, we consider "remove the + * first one" semantics to be not preferable. */ if (ia == NULL) { error = EADDRNOTAVAIL; @@ -840,13 +885,17 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, lt = (struct in6_addrlifetime_64 *) &ifr->ifr_ifru.ifru_lifetime; - if (lt->ia6t_vltime != ND6_INFINITE_LIFETIME - && lt->ia6t_vltime + timenow.tv_sec < timenow.tv_sec) { + if (((ia->ia6_flags & IN6_IFF_TEMPORARY) != 0 + || lt->ia6t_vltime != ND6_INFINITE_LIFETIME) + && lt->ia6t_vltime + timenow.tv_sec < + timenow.tv_sec) { error = EINVAL; goto ioctl_cleanup; } - if (lt->ia6t_pltime != ND6_INFINITE_LIFETIME - && lt->ia6t_pltime + timenow.tv_sec < timenow.tv_sec) { + if (((ia->ia6_flags & IN6_IFF_TEMPORARY) != 0 + || lt->ia6t_pltime != ND6_INFINITE_LIFETIME) + && lt->ia6t_pltime + timenow.tv_sec < + timenow.tv_sec) { error = EINVAL; goto ioctl_cleanup; } @@ -855,13 +904,17 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, lt = (struct in6_addrlifetime_32 *) &ifr->ifr_ifru.ifru_lifetime; - if (lt->ia6t_vltime != ND6_INFINITE_LIFETIME - && lt->ia6t_vltime + timenow.tv_sec < timenow.tv_sec) { + if (((ia->ia6_flags & IN6_IFF_TEMPORARY) != 0 + || lt->ia6t_vltime != ND6_INFINITE_LIFETIME) + && lt->ia6t_vltime + timenow.tv_sec < + timenow.tv_sec) { error = EINVAL; goto ioctl_cleanup; } - if (lt->ia6t_pltime != ND6_INFINITE_LIFETIME - && lt->ia6t_pltime + timenow.tv_sec < timenow.tv_sec) { + if (((ia->ia6_flags & IN6_IFF_TEMPORARY) != 0 + || lt->ia6t_pltime != ND6_INFINITE_LIFETIME) + && lt->ia6t_pltime + timenow.tv_sec < + timenow.tv_sec) { error = EINVAL; goto ioctl_cleanup; } @@ -870,8 +923,15 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, } switch (cmd) { + case SIOCGIFADDR_IN6: + IFA_LOCK(&ia->ia_ifa); ifr->ifr_addr = ia->ia_addr; + IFA_UNLOCK(&ia->ia_ifa); + if ((error = sa6_recoverscope(&ifr->ifr_addr)) != 0) { + IFA_REMREF(&ia->ia_ifa); + return (error); + } break; case SIOCGIFDSTADDR_IN6: @@ -883,15 +943,25 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, * XXX: should we check if ifa_dstaddr is NULL and return * an error? 
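/*
 * [Editor's note, not part of the patch] The SIOCGIFADDR_IN6 path
 * above now runs sa6_recoverscope() before copying the address out,
 * so userland sees the zone in sin6_scope_id rather than embedded in
 * the address bytes.  A minimal sketch of that recovery step, the
 * inverse of the in6_clearscope() convention noted earlier (the real
 * helper also validates the zone against if_index):
 */
static int
sa6_recoverscope_sketch(struct sockaddr_in6 *sin6)
{
        if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) &&
            sin6->sin6_addr.s6_addr16[1] != 0) {
                sin6->sin6_scope_id =
                    ntohs(sin6->sin6_addr.s6_addr16[1]);
                sin6->sin6_addr.s6_addr16[1] = 0;
        }
        return (0);
}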
*/ + IFA_LOCK(&ia->ia_ifa); ifr->ifr_dstaddr = ia->ia_dstaddr; + IFA_UNLOCK(&ia->ia_ifa); + if ((error = sa6_recoverscope(&ifr->ifr_dstaddr)) != 0) { + IFA_REMREF(&ia->ia_ifa); + return (error); + } break; case SIOCGIFNETMASK_IN6: + IFA_LOCK(&ia->ia_ifa); ifr->ifr_addr = ia->ia_prefixmask; + IFA_UNLOCK(&ia->ia_ifa); break; case SIOCGIFAFLAG_IN6: + IFA_LOCK(&ia->ia_ifa); ifr->ifr_ifru.ifru_flags6 = ia->ia6_flags; + IFA_UNLOCK(&ia->ia_ifa); break; case SIOCGIFSTAT_IN6: @@ -900,7 +970,7 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, goto ioctl_cleanup; } index = ifp->if_index; - lck_mtx_lock(ip6_mutex); + lck_rw_lock_shared(&in6_ifs_rwlock); if (in6_ifstat == NULL || index >= in6_ifstatmax || in6_ifstat[index] == NULL) { /* return EAFNOSUPPORT? */ @@ -909,7 +979,7 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, } else { ifr->ifr_ifru.ifru_stat = *in6_ifstat[index]; } - lck_mtx_unlock(ip6_mutex); + lck_rw_done(&in6_ifs_rwlock); break; case SIOCGIFSTAT_ICMP6: @@ -918,7 +988,7 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, goto ioctl_cleanup; } index = ifp->if_index; - lck_mtx_lock(ip6_mutex); + lck_rw_lock_shared(&icmp6_ifs_rwlock); if (icmp6_ifstat == NULL || index >= icmp6_ifstatmax || icmp6_ifstat[index] == NULL) { /* return EAFNOSUPPORT? */ @@ -927,10 +997,11 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, } else { ifr->ifr_ifru.ifru_icmp6stat = *icmp6_ifstat[index]; } - lck_mtx_unlock(ip6_mutex); + lck_rw_done(&icmp6_ifs_rwlock); break; case SIOCGIFALIFETIME_IN6: + IFA_LOCK(&ia->ia_ifa); if (p64) { struct in6_addrlifetime_64 *lt; @@ -954,9 +1025,11 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, lt->ia6t_pltime = (uint32_t)ia->ia6_lifetime.ia6t_pltime; } + IFA_UNLOCK(&ia->ia_ifa); break; case SIOCSIFALIFETIME_IN6: + IFA_LOCK(&ia->ia_ifa); if (p64) { struct in6_addrlifetime_64 *lt; @@ -979,16 +1052,19 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, ia->ia6_lifetime.ia6t_pltime = lt->ia6t_pltime; } /* for sanity */ - if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { + if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME || + (ia->ia6_flags & IN6_IFF_TEMPORARY) != 0) { ia->ia6_lifetime.ia6t_expire = timenow.tv_sec + ia->ia6_lifetime.ia6t_vltime; } else ia->ia6_lifetime.ia6t_expire = 0; - if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { + if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME || + (ia->ia6_flags & IN6_IFF_TEMPORARY) != 0) { ia->ia6_lifetime.ia6t_preferred = timenow.tv_sec + ia->ia6_lifetime.ia6t_pltime; } else ia->ia6_lifetime.ia6t_preferred = 0; + IFA_UNLOCK(&ia->ia_ifa); break; case SIOCAIFADDR_IN6_32: @@ -996,17 +1072,13 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, int i; struct nd_prefix pr0, *pr; - /* Attempt to attache the protocol, in case it isn't attached */ - error = proto_plumb(PF_INET6, ifp); + /* Attempt to attach the protocol, in case it isn't attached */ + error = in6_domifattach(ifp); if (error) { - if (error != EEXIST) { - printf("SIOCAIFADDR_IN6: %s can't plumb " - "protocol error=%d\n", if_name(ifp), error); + if (error == EEXIST) + error = 0; + else goto ioctl_cleanup; - } - - /* Ignore, EEXIST */ - error = 0; } else { /* PF_INET6 wasn't previously attached */ if ((error = in6_if_up(ifp, NULL)) != 0) @@ -1017,7 +1089,7 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, * first, 
make or update the interface address structure, * and link it to the list. */ - if ((error = in6_update_ifa(ifp, ifra, ia, M_WAITOK)) != 0) + if ((error = in6_update_ifa(ifp, ifra, ia, 0, M_WAITOK)) != 0) goto ioctl_cleanup; /* @@ -1056,6 +1128,7 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, ((ifra->ifra_flags & IN6_IFF_AUTOCONF) != 0); pr0.ndpr_vltime = ifra->ifra_lifetime.ia6t_vltime; pr0.ndpr_pltime = ifra->ifra_lifetime.ia6t_pltime; + pr0.ndpr_stateflags |= NDPRF_STATIC; /* add the prefix if there's one. */ if ((pr = nd6_prefix_lookup(&pr0)) == NULL) { @@ -1063,7 +1136,7 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, * nd6_prelist_add will install the corresponding * interface route. */ - if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0) + if ((error = nd6_prelist_add(&pr0, NULL, &pr, FALSE)) != 0) goto ioctl_cleanup; if (pr == NULL) { log(LOG_ERR, "nd6_prelist_add succedded but " @@ -1073,19 +1146,21 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, } } if (ia != NULL) - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); if ((ia = in6ifa_ifpwithaddr(ifp, &ifra->ifra_addr.sin6_addr)) == NULL) { /* XXX: this should not happen! */ log(LOG_ERR, "in6_control: addition succeeded, but" " no ifaddr\n"); } else { + IFA_LOCK(&ia->ia_ifa); if ((ia->ia6_flags & IN6_IFF_AUTOCONF) != 0 && ia->ia6_ndpr == NULL) { /* new autoconfed addr */ - lck_mtx_lock(nd6_mutex); - pr->ndpr_refcnt++; - lck_mtx_unlock(nd6_mutex); + NDPR_LOCK(pr); + pr->ndpr_addrcnt++; + VERIFY(pr->ndpr_addrcnt != 0); ia->ia6_ndpr = pr; + NDPR_ADDREF_LOCKED(pr); /* for addr reference */ /* * If this is the first autoconf address from @@ -1093,8 +1168,12 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, * as well (when specified). */ if (ip6_use_tempaddr && - pr->ndpr_refcnt == 1) { + pr->ndpr_addrcnt == 1) { int e; + + NDPR_UNLOCK(pr); + IFA_UNLOCK(&ia->ia_ifa); + if ((e = in6_tmpifadd(ia, 1, M_WAITOK)) != 0) { log(LOG_NOTICE, "in6_control: " @@ -1103,19 +1182,25 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, "errno=%d\n", e); } + } else { + NDPR_UNLOCK(pr); + IFA_UNLOCK(&ia->ia_ifa); } + } else { + IFA_UNLOCK(&ia->ia_ifa); } - /* * this might affect the status of autoconfigured * addresses, that is, this address might make * other addresses detached. */ - pfxlist_onlink_check(0); + lck_mtx_lock(nd6_mutex); + pfxlist_onlink_check(); + lck_mtx_unlock(nd6_mutex); } /* Drop use count held above during lookup/add */ - ndpr_rele(pr, FALSE); + NDPR_REMREF(pr); #if PF pf_ifaddr_hook(ifp, cmd); #endif /* PF */ @@ -1129,24 +1214,29 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, /* * If the address being deleted is the only one that owns * the corresponding prefix, expire the prefix as well. - * XXX: theoretically, we don't have to warry about such + * XXX: theoretically, we don't have to worry about such * relationship, since we separate the address management * and the prefix management. We do this, however, to provide * as much backward compatibility as possible in terms of * the ioctl operation. + * Note that in6_purgeaddr() will decrement ndpr_addrcnt. 
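/*
 * [Editor's note, not part of the patch] The rename ndpr_refcnt ->
 * ndpr_addrcnt separates "how many addresses were derived from this
 * prefix" from the prefix object's own reference count, now managed
 * explicitly via NDPR_ADDREF/NDPR_REMREF; an autoconfigured address
 * holds one of each.  The teardown counterpart of the attach shown
 * above (this is what in6_unlink_ifa() does later in this patch):
 */
        NDPR_LOCK(pr);
        VERIFY(pr->ndpr_addrcnt != 0);
        pr->ndpr_addrcnt--;             /* one fewer derived address */
        NDPR_UNLOCK(pr);
        NDPR_REMREF(pr);                /* drop the address's object ref */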
*/ + IFA_LOCK(&ia->ia_ifa); bzero(&pr0, sizeof(pr0)); pr0.ndpr_ifp = ifp; pr0.ndpr_plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); - if (pr0.ndpr_plen == 128) + if (pr0.ndpr_plen == 128) { + IFA_UNLOCK(&ia->ia_ifa); goto purgeaddr; + } pr0.ndpr_prefix = ia->ia_addr; pr0.ndpr_mask = ia->ia_prefixmask.sin6_addr; for (i = 0; i < 4; i++) { pr0.ndpr_prefix.sin6_addr.s6_addr32[i] &= ia->ia_prefixmask.sin6_addr.s6_addr32[i]; } + IFA_UNLOCK(&ia->ia_ifa); /* * The logic of the following condition is a bit complicated. * We expire the prefix when @@ -1155,20 +1245,24 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, * 2. the address does not obey autoconf and there is no * other owner of the prefix. */ - if ((pr = nd6_prefix_lookup(&pr0)) != NULL && - (((ia->ia6_flags & IN6_IFF_AUTOCONF) != 0 && - pr->ndpr_refcnt == 1) || - ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0 && - pr->ndpr_refcnt == 0))) { - pr->ndpr_expire = 1; /* XXX: just for expiration */ - } + if ((pr = nd6_prefix_lookup(&pr0)) != NULL) { + IFA_LOCK(&ia->ia_ifa); + NDPR_LOCK(pr); + if (((ia->ia6_flags & IN6_IFF_AUTOCONF) != 0 && + pr->ndpr_addrcnt == 1) || + ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0 && + pr->ndpr_addrcnt == 0)) { + pr->ndpr_expire = 1; /* XXX: just for expiration */ + } + NDPR_UNLOCK(pr); + IFA_UNLOCK(&ia->ia_ifa); - /* Drop use count held above during lookup */ - if (pr != NULL) - ndpr_rele(pr, FALSE); + /* Drop use count held above during lookup */ + NDPR_REMREF(pr); + } purgeaddr: - in6_purgeaddr(&ia->ia_ifa, 0); + in6_purgeaddr(&ia->ia_ifa); #if PF pf_ifaddr_hook(ifp, cmd); #endif /* PF */ @@ -1181,7 +1275,7 @@ purgeaddr: } ioctl_cleanup: if (ia != NULL) - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); return (error); } @@ -1189,23 +1283,23 @@ ioctl_cleanup: * Update parameters of an IPv6 interface address. * If necessary, a new entry is created and linked into address chains. * This function is separated from in6_control(). - * XXX: should this be performed under splnet()? */ int -in6_update_ifa(ifp, ifra, ia, how) - struct ifnet *ifp; - struct in6_aliasreq *ifra; - struct in6_ifaddr *ia; - int how; +in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, + struct in6_ifaddr *ia, int flags, int how) { int error = 0, hostIsNew = 0, plen = -1; struct in6_ifaddr *oia; struct sockaddr_in6 dst6; struct in6_addrlifetime *lt; + struct in6_multi *in6m_sol = NULL; + struct in6_multi_mship *imm; struct timeval timenow; + struct rtentry *rt; + struct ifaddr *ifa = NULL; + int delay; - lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); /* Validate parameters */ if (ifp == NULL || ifra == NULL) /* this maybe redundant */ return(EINVAL); @@ -1245,14 +1339,15 @@ in6_update_ifa(ifp, ifra, ia, how) (u_char *)&ifra->ifra_prefixmask + ifra->ifra_prefixmask.sin6_len); if (plen <= 0) - return(EINVAL); - } - else { + return (EINVAL); + } else { /* * In this case, ia must not be NULL. We just use its prefix * length. */ + IFA_LOCK(&ia->ia_ifa); plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); + IFA_UNLOCK(&ia->ia_ifa); } /* * If the destination address on a p2p interface is specified, @@ -1260,27 +1355,25 @@ in6_update_ifa(ifp, ifra, ia, how) * zone identifier. 
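/*
 * [Editor's note, not part of the patch] in6_update_ifa() now takes a
 * flags argument in addition to `how'; IN6_IFAUPDATE_DADDELAY (used
 * further below) requests a randomized delay before DAD and the
 * matching MLD report.  Plain callers, as in the ioctl paths above,
 * simply pass 0:
 */
        error = in6_update_ifa(ifp, ifra, ia, 0, M_WAITOK);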
*/ dst6 = ifra->ifra_dstaddr; - if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) && + if (((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) != 0 ) && (dst6.sin6_family == AF_INET6)) { int scopeid; -#ifndef SCOPEDROUTING if ((error = in6_recoverscope(&dst6, &ifra->ifra_dstaddr.sin6_addr, ifp)) != 0) return(error); -#endif + scopeid = in6_addr2scopeid(ifp, &dst6.sin6_addr); if (dst6.sin6_scope_id == 0) /* user omit to specify the ID. */ dst6.sin6_scope_id = scopeid; else if (dst6.sin6_scope_id != scopeid) return(EINVAL); /* scope ID mismatch. */ -#ifndef SCOPEDROUTING - if ((error = in6_embedscope(&dst6.sin6_addr, &dst6, NULL, NULL)) - != 0) + + if ((error = in6_embedscope(&dst6.sin6_addr, &dst6, NULL, NULL, + NULL)) != 0) return(error); dst6.sin6_scope_id = 0; /* XXX */ -#endif } /* * The destination address can be specified only for a p2p or a @@ -1308,7 +1401,8 @@ in6_update_ifa(ifp, ifra, ia, how) getmicrotime(&timenow); lt = &ifra->ifra_lifetime; - if (lt->ia6t_vltime != ND6_INFINITE_LIFETIME + if ((lt->ia6t_vltime != ND6_INFINITE_LIFETIME + || (ifra->ifra_flags & IN6_IFF_TEMPORARY) != 0) && lt->ia6t_vltime + timenow.tv_sec < timenow.tv_sec) { return EINVAL; } @@ -1321,7 +1415,8 @@ in6_update_ifa(ifp, ifra, ia, how) "in6_update_ifa: valid lifetime is 0 for %s\n", ip6_sprintf(&ifra->ifra_addr.sin6_addr)); } - if (lt->ia6t_pltime != ND6_INFINITE_LIFETIME + if ((lt->ia6t_pltime != ND6_INFINITE_LIFETIME + || (ifra->ifra_flags & IN6_IFF_TEMPORARY) != 0) && lt->ia6t_pltime + timenow.tv_sec < timenow.tv_sec) { return EINVAL; } @@ -1340,11 +1435,15 @@ in6_update_ifa(ifp, ifra, ia, how) */ ia = in6_ifaddr_alloc(how); if (ia == NULL) - return ENOBUFS; - /* Initialize the address and masks */ + return (ENOBUFS); + ifnet_lock_exclusive(ifp); + IFA_LOCK(&ia->ia_ifa); + LIST_INIT(&ia->ia6_memberships); + /* Initialize the address and masks, and put time stamp */ ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; ia->ia_addr.sin6_family = AF_INET6; ia->ia_addr.sin6_len = sizeof(ia->ia_addr); + ia->ia6_createtime = timenow.tv_sec; if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) != 0) { /* * XXX: some functions expect that ifa_dstaddr is not @@ -1359,21 +1458,34 @@ in6_update_ifa(ifp, ifra, ia, how) = (struct sockaddr *)&ia->ia_prefixmask; ia->ia_ifp = ifp; - ifaref(&ia->ia_ifa); - lck_mtx_lock(nd6_mutex); + /* if_attach_ifa() holds a reference for ifa_link */ + if_attach_ifa(ifp, &ia->ia_ifa); + /* hold a reference for this routine */ + IFA_ADDREF_LOCKED(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); + ifnet_lock_done(ifp); + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + /* Hold a reference for in6_ifaddrs link */ + IFA_ADDREF(&ia->ia_ifa); if ((oia = in6_ifaddrs) != NULL) { for ( ; oia->ia_next; oia = oia->ia_next) continue; oia->ia_next = ia; - } else + } else { in6_ifaddrs = ia; - lck_mtx_unlock(nd6_mutex); - - ifnet_lock_exclusive(ifp); - if_attach_ifa(ifp, &ia->ia_ifa); - ifnet_lock_done(ifp); + } + lck_rw_done(&in6_ifaddr_rwlock); + } else { + /* hold a reference for this routine */ + IFA_ADDREF(&ia->ia_ifa); } + ifa = &ia->ia_ifa; + IFA_LOCK(ifa); + + /* update timestamp */ + ia->ia6_updatetime = timenow.tv_sec; + /* set prefix mask */ if (ifra->ifra_prefixmask.sin6_len) { /* @@ -1388,6 +1500,7 @@ in6_update_ifa(ifp, ifra, ia, how) " existing (%s) address should not be changed\n", ip6_sprintf(&ia->ia_addr.sin6_addr)); error = EINVAL; + IFA_UNLOCK(ifa); goto unlink; } ia->ia_prefixmask = ifra->ifra_prefixmask; @@ -1396,82 +1509,145 @@ in6_update_ifa(ifp, ifra, ia, how) /* * If a new 
destination address is specified, scrub the old one and * install the new destination. Note that the interface must be - * p2p or loopback (see the check above.) + * p2p or loopback (see the check above.) */ if (dst6.sin6_family == AF_INET6 && - !IN6_ARE_ADDR_EQUAL(&dst6.sin6_addr, - &ia->ia_dstaddr.sin6_addr)) { - int e; - - if ((ia->ia_flags & IFA_ROUTE) != 0 && - (e = rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST)) - != 0) { - log(LOG_ERR, "in6_update_ifa: failed to remove " - "a route to the old destination: %s\n", - ip6_sprintf(&ia->ia_addr.sin6_addr)); - /* proceed anyway... */ - } - else + !IN6_ARE_ADDR_EQUAL(&dst6.sin6_addr, &ia->ia_dstaddr.sin6_addr)) { + if ((ia->ia_flags & IFA_ROUTE)) { + int e; + + IFA_UNLOCK(ifa); + if ((e = rtinit(&(ia->ia_ifa), (int)RTM_DELETE, + RTF_HOST)) != 0) { + log(LOG_ERR, "in6_update_ifa: failed to remove " + "a route to the old destination: %s\n", + ip6_sprintf(&ia->ia_addr.sin6_addr)); + /* proceed anyway... */ + } + IFA_LOCK(ifa); + } else { ia->ia_flags &= ~IFA_ROUTE; + } + IFA_LOCK_ASSERT_HELD(ifa); ia->ia_dstaddr = dst6; } + /* + * Set lifetimes. We do not refer to ia6t_expire and ia6t_preferred + * to see if the address is deprecated or invalidated, but initialize + * these members for applications. + */ + ia->ia6_lifetime = ifra->ifra_lifetime; + if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME + || (ifra->ifra_flags & IN6_IFF_TEMPORARY) != 0) { + ia->ia6_lifetime.ia6t_expire = + timenow.tv_sec + ia->ia6_lifetime.ia6t_vltime; + } else + ia->ia6_lifetime.ia6t_expire = 0; + if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME + || (ifra->ifra_flags & IN6_IFF_TEMPORARY) != 0) { + ia->ia6_lifetime.ia6t_preferred = + timenow.tv_sec + ia->ia6_lifetime.ia6t_pltime; + } else + ia->ia6_lifetime.ia6t_preferred = 0; + + IFA_UNLOCK(ifa); /* reset the interface and routing table appropriately. */ if ((error = in6_ifinit(ifp, ia, &ifra->ifra_addr, hostIsNew)) != 0) goto unlink; + IFA_LOCK(ifa); /* - * Beyond this point, we should call in6_purgeaddr upon an error, - * not just go to unlink. + * configure address flags. */ - -#if 0 /* disable this mechanism for now */ - /* update prefix list */ - if (hostIsNew && - (ifra->ifra_flags & IN6_IFF_NOPFX) == 0) { /* XXX */ - int iilen; - - iilen = (sizeof(ia->ia_prefixmask.sin6_addr) << 3) - plen; - if ((error = in6_prefix_add_ifid(iilen, ia)) != 0) { - in6_purgeaddr((struct ifaddr *)ia, 0); - return(error); - } + ia->ia6_flags = ifra->ifra_flags; + /* + * backward compatibility - if IN6_IFF_DEPRECATED is set from the + * userland, make it deprecated. + */ + if ((ifra->ifra_flags & IN6_IFF_DEPRECATED) != 0) { + ia->ia6_lifetime.ia6t_pltime = 0; + ia->ia6_lifetime.ia6t_preferred = timenow.tv_sec; } -#endif + /* + * Make the address tentative before joining multicast addresses, + * so that corresponding MLD responses would not have a tentative + * source address. + */ + ia->ia6_flags &= ~IN6_IFF_DUPLICATED; /* safety */ + if (hostIsNew && in6if_do_dad(ifp)) + ia->ia6_flags |= IN6_IFF_TENTATIVE; + /* + * We are done if we have simply modified an existing address. + */ + if (!hostIsNew) { + IFA_UNLOCK(ifa); + /* release reference held for this routine */ + IFA_REMREF(ifa); + return (error); + } + /* + * Beyond this point, we should call in6_purgeaddr upon an error, + * not just go to unlink. 
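/*
 * [Editor's note, not part of the patch] "Deprecated" is derived
 * state: an address is deprecated once its preferred lifetime has
 * elapsed.  Forcing ia6t_pltime = 0 and ia6t_preferred = now, as the
 * compatibility hunk above does, therefore deprecates the address
 * immediately.  Schematically (assuming the usual KAME-style
 * predicate):
 */
        deprecated = (ia->ia6_lifetime.ia6t_preferred != 0 &&
            ia->ia6_lifetime.ia6t_preferred < timenow.tv_sec);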
+ */ + IFA_LOCK_ASSERT_HELD(ifa); + /* Join necessary multicast groups */ if ((ifp->if_flags & IFF_MULTICAST) != 0) { struct sockaddr_in6 mltaddr, mltmask; - struct in6_multi *in6m; + struct in6_addr llsol; - if (hostIsNew) { + IFA_UNLOCK(ifa); + /* join solicited multicast addr for new host id */ + bzero(&llsol, sizeof(struct in6_addr)); + llsol.s6_addr32[0] = IPV6_ADDR_INT32_MLL; + llsol.s6_addr32[1] = 0; + llsol.s6_addr32[2] = htonl(1); + llsol.s6_addr32[3] = ifra->ifra_addr.sin6_addr.s6_addr32[3]; + llsol.s6_addr8[12] = 0xff; + if ((error = in6_setscope(&llsol, ifp, NULL)) != 0) { + /* XXX: should not happen */ + log(LOG_ERR, "in6_update_ifa: " + "in6_setscope failed\n"); + goto cleanup; + } + delay = 0; + if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* - * join solicited multicast addr for new host id + * We need a random delay for DAD on the address + * being configured. It also means delaying + * transmission of the corresponding MLD report to + * avoid report collision. + * [draft-ietf-ipv6-rfc2462bis-02.txt] */ - struct in6_addr llsol; - bzero(&llsol, sizeof(struct in6_addr)); - llsol.s6_addr16[0] = htons(0xff02); - llsol.s6_addr16[1] = htons(ifp->if_index); - llsol.s6_addr32[1] = 0; - llsol.s6_addr32[2] = htonl(1); - llsol.s6_addr32[3] = - ifra->ifra_addr.sin6_addr.s6_addr32[3]; - llsol.s6_addr8[12] = 0xff; - (void)in6_addmulti(&llsol, ifp, &error, 0); - if (error != 0) { - log(LOG_WARNING, - "in6_update_ifa: addmulti failed for " - "%s on %s (errno=%d)\n", - ip6_sprintf(&llsol), if_name(ifp), - error); - in6_purgeaddr((struct ifaddr *)ia, 0); - return(error); - } + delay = random() % + (MAX_RTR_SOLICITATION_DELAY * PR_SLOWHZ); + } + imm = in6_joingroup(ifp, &llsol, &error, delay); + if (imm == NULL) { + nd6log((LOG_WARNING, + "in6_update_ifa: addmulti failed for " + "%s on %s (errno=%d)\n", + ip6_sprintf(&llsol), if_name(ifp), + error)); + in6_purgeaddr((struct ifaddr *)ia); + /* release reference held for this routine */ + IFA_REMREF(ifa); + return (error); } + in6m_sol = imm->i6mm_maddr; + /* take a refcount for this routine */ + IN6M_ADDREF(in6m_sol); + + IFA_LOCK_SPIN(ifa); + LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); + IFA_UNLOCK(ifa); bzero(&mltmask, sizeof(mltmask)); mltmask.sin6_len = sizeof(struct sockaddr_in6); mltmask.sin6_family = AF_INET6; mltmask.sin6_addr = in6mask32; +#define MLTMASK_LEN 4 /* mltmask's masklen (=32bit=4octet) */ /* * join link-local all-nodes address @@ -1480,111 +1656,134 @@ in6_update_ifa(ifp, ifra, ia, how) mltaddr.sin6_len = sizeof(struct sockaddr_in6); mltaddr.sin6_family = AF_INET6; mltaddr.sin6_addr = in6addr_linklocal_allnodes; - mltaddr.sin6_addr.s6_addr16[1] = htons(ifp->if_index); + if ((error = in6_setscope(&mltaddr.sin6_addr, ifp, NULL)) != + 0) + goto cleanup; /* XXX: should not fail */ - ifnet_lock_shared(ifp); - IN6_LOOKUP_MULTI(mltaddr.sin6_addr, ifp, in6m); - ifnet_lock_done(ifp); - if (in6m == NULL) { - rtrequest(RTM_ADD, - (struct sockaddr *)&mltaddr, - (struct sockaddr *)&ia->ia_addr, - (struct sockaddr *)&mltmask, - RTF_UP|RTF_CLONING, /* xxx */ - (struct rtentry **)0); - (void)in6_addmulti(&mltaddr.sin6_addr, ifp, &error, 0); - if (error != 0) { - log(LOG_WARNING, - "in6_update_ifa: addmulti failed for " - "%s on %s (errno=%d)\n", - ip6_sprintf(&mltaddr.sin6_addr), - if_name(ifp), error); + /* + * XXX: do we really need this automatic routes? + * We should probably reconsider this stuff. Most applications + * actually do not need the routes, since they usually specify + * the outgoing interface. 
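/*
 * [Editor's note, not part of the patch] The llsol value joined above
 * is the solicited-node multicast group ff02::1:ffxx:xxxx, formed
 * from the low 24 bits of the unicast address (RFC 4291, sec.
 * 2.7.1).  The derivation in stand-alone form, before in6_setscope()
 * stamps the zone:
 */
static void
solicited_node_addr(struct in6_addr *llsol, const struct in6_addr *addr)
{
        bzero(llsol, sizeof (*llsol));
        llsol->s6_addr32[0] = IPV6_ADDR_INT32_MLL;      /* ff02:: */
        llsol->s6_addr32[2] = htonl(1);
        llsol->s6_addr32[3] = addr->s6_addr32[3];
        llsol->s6_addr8[12] = 0xff;     /* => ff02::1:ffxx:xxxx */
}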
+ */ + rt = rtalloc1_scoped((struct sockaddr *)&mltaddr, 0, 0UL, + ia->ia_ifp->if_index); + if (rt) { + if (memcmp(&mltaddr.sin6_addr, + &((struct sockaddr_in6 *)rt_key(rt))->sin6_addr, + MLTMASK_LEN)) { + rtfree(rt); + rt = NULL; } } + if (!rt) { + error = rtrequest_scoped(RTM_ADD, + (struct sockaddr *)&mltaddr, + (struct sockaddr *)&ia->ia_addr, + (struct sockaddr *)&mltmask, RTF_UP | RTF_CLONING, + NULL, ia->ia_ifp->if_index); + if (error) + goto cleanup; + } else { + rtfree(rt); + } + + imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error, 0); + if (!imm) { + nd6log((LOG_WARNING, + "in6_update_ifa: addmulti failed for " + "%s on %s (errno=%d)\n", + ip6_sprintf(&mltaddr.sin6_addr), + if_name(ifp), error)); + goto cleanup; + } + IFA_LOCK_SPIN(ifa); + LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); + IFA_UNLOCK(ifa); /* * join node information group address */ #define hostnamelen strlen(hostname) + delay = 0; + if ((flags & IN6_IFAUPDATE_DADDELAY)) { + /* + * The spec doesn't say anything about delay for this + * group, but the same logic should apply. + */ + delay = random() % + (MAX_RTR_SOLICITATION_DELAY * PR_SLOWHZ); + } if (in6_nigroup(ifp, hostname, hostnamelen, &mltaddr.sin6_addr) == 0) { - ifnet_lock_shared(ifp); - IN6_LOOKUP_MULTI(mltaddr.sin6_addr, ifp, in6m); - ifnet_lock_done(ifp); - if (in6m == NULL && ia != NULL) { - (void)in6_addmulti(&mltaddr.sin6_addr, - ifp, &error, 0); - if (error != 0) { - log(LOG_WARNING, "in6_update_ifa: " - "addmulti failed for " - "%s on %s (errno=%d)\n", - ip6_sprintf(&mltaddr.sin6_addr), - if_name(ifp), error); - } + imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error, + delay); /* XXX jinmei */ + if (!imm) { + nd6log((LOG_WARNING, "in6_update_ifa: " + "addmulti failed for %s on %s " + "(errno=%d)\n", + ip6_sprintf(&mltaddr.sin6_addr), + if_name(ifp), error)); + /* XXX not very fatal, go on... */ + } else { + IFA_LOCK_SPIN(ifa); + LIST_INSERT_HEAD(&ia->ia6_memberships, + imm, i6mm_chain); + IFA_UNLOCK(ifa); } } #undef hostnamelen /* - * join node-local all-nodes address, on loopback. - * XXX: since "node-local" is obsoleted by interface-local, - * we have to join the group on every interface with - * some interface-boundary restriction. + * join interface-local all-nodes address. + * (ff01::1%ifN, and ff01::%ifN/32) */ - if (ifp->if_flags & IFF_LOOPBACK) { - struct in6_ifaddr *ia_loop; - - struct in6_addr loop6 = in6addr_loopback; - ia_loop = in6ifa_ifpwithaddr(ifp, &loop6); - - mltaddr.sin6_addr = in6addr_nodelocal_allnodes; - - ifnet_lock_shared(ifp); - IN6_LOOKUP_MULTI(mltaddr.sin6_addr, ifp, in6m); - ifnet_lock_done(ifp); - if (in6m == NULL && ia_loop != NULL) { - rtrequest(RTM_ADD, - (struct sockaddr *)&mltaddr, - (struct sockaddr *)&ia_loop->ia_addr, - (struct sockaddr *)&mltmask, - RTF_UP, - (struct rtentry **)0); - (void)in6_addmulti(&mltaddr.sin6_addr, ifp, - &error, 0); - if (error != 0) { - log(LOG_WARNING, "in6_update_ifa: " - "addmulti failed for %s on %s " - "(errno=%d)\n", - ip6_sprintf(&mltaddr.sin6_addr), - if_name(ifp), error); - } + mltaddr.sin6_addr = in6addr_nodelocal_allnodes; + if ((error = in6_setscope(&mltaddr.sin6_addr, ifp, NULL)) + != 0) + goto cleanup; /* XXX: should not fail */ + /* XXX: again, do we really need the route? 
*/ + rt = rtalloc1_scoped((struct sockaddr *)&mltaddr, 0, 0UL, + ia->ia_ifp->if_index); + if (rt) { + if (memcmp(&mltaddr.sin6_addr, + &((struct sockaddr_in6 *)rt_key(rt))->sin6_addr, + MLTMASK_LEN)) { + rtfree(rt); + rt = NULL; } - if (ia_loop != NULL) - ifafree(&ia_loop->ia_ifa); } + if (!rt) { + error = rtrequest_scoped(RTM_ADD, + (struct sockaddr *)&mltaddr, + (struct sockaddr *)&ia->ia_addr, + (struct sockaddr *)&mltmask, RTF_UP | RTF_CLONING, + NULL, ia->ia_ifp->if_index); + if (error) + goto cleanup; + } else + rtfree(rt); + + imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error, 0); + if (!imm) { + nd6log((LOG_WARNING, "in6_update_ifa: " + "addmulti failed for %s on %s " + "(errno=%d)\n", + ip6_sprintf(&mltaddr.sin6_addr), + if_name(ifp), error)); + goto cleanup; + } + IFA_LOCK(ifa); + LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); + /* keep it locked */ +#undef MLTMASK_LEN } - - ia->ia6_flags = ifra->ifra_flags; - ia->ia6_flags &= ~IN6_IFF_DUPLICATED; /*safety*/ - ia->ia6_flags &= ~IN6_IFF_NODAD; /* Mobile IPv6 */ - - ia->ia6_lifetime = ifra->ifra_lifetime; - /* for sanity */ - if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { - ia->ia6_lifetime.ia6t_expire = - timenow.tv_sec + ia->ia6_lifetime.ia6t_vltime; - } else - ia->ia6_lifetime.ia6t_expire = 0; - if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { - ia->ia6_lifetime.ia6t_preferred = - timenow.tv_sec + ia->ia6_lifetime.ia6t_pltime; - } else - ia->ia6_lifetime.ia6t_preferred = 0; - + IFA_LOCK_ASSERT_HELD(ifa); /* - * make sure to initialize ND6 information. this is to workaround + * Make sure to initialize ND6 information. this is to workaround * issues with interfaces with IPv6 addresses, which have never brought * up. We are assuming that it is safe to nd6_ifattach multiple times. + * NOTE: this is how stf0 gets initialized */ if ((error = nd6_ifattach(ifp)) != 0) return error; @@ -1594,29 +1793,74 @@ in6_update_ifa(ifp, ifra, ia, how) * XXX It may be of use, if we can administratively * disable DAD. */ - if (in6if_do_dad(ifp) && (ifra->ifra_flags & IN6_IFF_NODAD) == 0) { - ia->ia6_flags |= IN6_IFF_TENTATIVE; - nd6_dad_start((struct ifaddr *)ia, NULL); - } + if (hostIsNew && in6if_do_dad(ifp) && + ((ifra->ifra_flags & IN6_IFF_NODAD) == 0) && + (ia->ia6_flags & IN6_IFF_TENTATIVE)) + { + int mindelay, maxdelay; - return(error); + IFA_UNLOCK(ifa); + delay = 0; + if ((flags & IN6_IFAUPDATE_DADDELAY)) { + /* + * We need to impose a delay before sending an NS + * for DAD. Check if we also needed a delay for the + * corresponding MLD message. If we did, the delay + * should be larger than the MLD delay (this could be + * relaxed a bit, but this simple logic is at least + * safe). + */ + mindelay = 0; + if (in6m_sol != NULL) { + IN6M_LOCK(in6m_sol); + if (in6m_sol->in6m_state == MLD_REPORTING_MEMBER) + mindelay = in6m_sol->in6m_timer; + IN6M_UNLOCK(in6m_sol); + } + maxdelay = MAX_RTR_SOLICITATION_DELAY * hz; + if (maxdelay - mindelay == 0) + delay = 0; + else { + delay = + (random() % (maxdelay - mindelay)) + + mindelay; + } + } + nd6_dad_start((struct ifaddr *)ia, &delay); + } else { + IFA_UNLOCK(ifa); + } +done: + /* release reference held for this routine */ + if (ifa != NULL) + IFA_REMREF(ifa); + if (in6m_sol != NULL) + IN6M_REMREF(in6m_sol); + return (error); - unlink: +unlink: /* * XXX: if a change of an existing address failed, keep the entry * anyway. 
*/ - if (hostIsNew) - in6_unlink_ifa(ia, ifp, 0); - return(error); + if (hostIsNew) { + in6_unlink_ifa(ia, ifp); + } + goto done; + +cleanup: + in6_purgeaddr(&ia->ia_ifa); + goto done; } void -in6_purgeaddr( - struct ifaddr *ifa, int nd6_locked) +in6_purgeaddr(struct ifaddr *ifa) { struct ifnet *ifp = ifa->ifa_ifp; struct in6_ifaddr *ia = (struct in6_ifaddr *) ifa; + struct in6_multi_mship *imm; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); /* stop DAD processing */ nd6_dad_stop(ifa); @@ -1625,9 +1869,11 @@ in6_purgeaddr( * delete route to the destination of the address being purged. * The interface must be p2p or loopback in this case. */ + IFA_LOCK(ifa); if ((ia->ia_flags & IFA_ROUTE) != 0 && ia->ia_dstaddr.sin6_len != 0) { int e; + IFA_UNLOCK(ifa); if ((e = rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST)) != 0) { log(LOG_ERR, "in6_purgeaddr: failed to remove " @@ -1636,73 +1882,71 @@ in6_purgeaddr( ip6_sprintf(&ia->ia_addr.sin6_addr), if_name(ifp), e); /* proceed anyway... */ - } - else + } else { + IFA_LOCK_SPIN(ifa); ia->ia_flags &= ~IFA_ROUTE; + IFA_UNLOCK(ifa); + } + } else { + IFA_UNLOCK(ifa); } + IFA_LOCK_ASSERT_NOTHELD(ifa); /* Remove ownaddr's loopback rtentry, if it exists. */ - in6_ifremloop(&(ia->ia_ifa), nd6_locked); - - if (ifp->if_flags & IFF_MULTICAST) { - /* - * delete solicited multicast addr for deleting host id - */ - struct in6_multi *in6m; - struct in6_addr llsol; - bzero(&llsol, sizeof(struct in6_addr)); - llsol.s6_addr16[0] = htons(0xff02); - llsol.s6_addr16[1] = htons(ifp->if_index); - llsol.s6_addr32[1] = 0; - llsol.s6_addr32[2] = htonl(1); - llsol.s6_addr32[3] = - ia->ia_addr.sin6_addr.s6_addr32[3]; - llsol.s6_addr8[12] = 0xff; + in6_ifremloop(&(ia->ia_ifa)); - ifnet_lock_shared(ifp); - IN6_LOOKUP_MULTI(llsol, ifp, in6m); - ifnet_lock_done(ifp); - if (in6m) - in6_delmulti(in6m, nd6_locked); + /* + * leave from multicast groups we have joined for the interface + */ + IFA_LOCK(ifa); + while ((imm = ia->ia6_memberships.lh_first) != NULL) { + LIST_REMOVE(imm, i6mm_chain); + IFA_UNLOCK(ifa); + in6_leavegroup(imm); + IFA_LOCK(ifa); } + IFA_UNLOCK(ifa); - in6_unlink_ifa(ia, ifp, nd6_locked); + /* in6_unlink_ifa() will need exclusive access */ + in6_unlink_ifa(ia, ifp); in6_post_msg(ifp, KEV_INET6_ADDR_DELETED, ia); } static void -in6_unlink_ifa(ia, ifp, nd6_locked) - struct in6_ifaddr *ia; - struct ifnet *ifp; - int nd6_locked; +in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp) { - int plen, iilen; struct in6_ifaddr *oia; + struct ifaddr *ifa; + int unlinked; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); + + ifa = &ia->ia_ifa; + IFA_ADDREF(ifa); ifnet_lock_exclusive(ifp); - if_detach_ifa(ifp, &ia->ia_ifa); + IFA_LOCK(ifa); + if (ifa->ifa_debug & IFD_ATTACHED) + if_detach_ifa(ifp, ifa); + IFA_UNLOCK(ifa); ifnet_lock_done(ifp); - if (!nd6_locked) - lck_mtx_lock(nd6_mutex); + unlinked = 1; + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); oia = ia; - if (oia == (ia = in6_ifaddrs)) + if (oia == (ia = in6_ifaddrs)) { in6_ifaddrs = ia->ia_next; - else { + } else { while (ia->ia_next && (ia->ia_next != oia)) ia = ia->ia_next; - if (ia->ia_next) + if (ia->ia_next) { ia->ia_next = oia->ia_next; - else { + } else { /* search failed */ printf("Couldn't unlink in6_ifaddr from in6_ifaddr\n"); + unlinked = 0; } } - if (oia->ia6_ifpr) { /* check for safety */ - plen = in6_mask2len(&oia->ia_prefixmask.sin6_addr, NULL); - iilen = (sizeof(oia->ia_prefixmask.sin6_addr) << 3) - plen; - in6_prefix_remove_ifid(iilen, oia); - } /* * When an autoconfigured address is 
being removed, release the @@ -1710,48 +1954,76 @@ in6_unlink_ifa(ia, ifp, nd6_locked) * affect the status of other (detached) addresses, call * pfxlist_onlink_check(). */ + ifa = &oia->ia_ifa; + IFA_LOCK(ifa); if ((oia->ia6_flags & IN6_IFF_AUTOCONF) != 0) { if (oia->ia6_ndpr == NULL) { log(LOG_NOTICE, "in6_unlink_ifa: autoconf'ed address " "%p has no prefix\n", oia); } else { - oia->ia6_ndpr->ndpr_refcnt--; + struct nd_prefix *pr = oia->ia6_ndpr; + oia->ia6_flags &= ~IN6_IFF_AUTOCONF; oia->ia6_ndpr = NULL; + NDPR_LOCK(pr); + VERIFY(pr->ndpr_addrcnt != 0); + pr->ndpr_addrcnt--; + NDPR_UNLOCK(pr); + NDPR_REMREF(pr); /* release addr reference */ } - - pfxlist_onlink_check(1); - } - if (!nd6_locked) + IFA_UNLOCK(ifa); + lck_rw_done(&in6_ifaddr_rwlock); + lck_mtx_lock(nd6_mutex); + pfxlist_onlink_check(); lck_mtx_unlock(nd6_mutex); - + } else { + IFA_UNLOCK(ifa); + lck_rw_done(&in6_ifaddr_rwlock); + } /* * release another refcnt for the link from in6_ifaddrs. - * Note that we should decrement the refcnt at least once for all *BSD. + * Do this only if it's not already unlinked in the event that we lost + * the race, since in6_ifaddr_rwlock was momentarily dropped above. */ - ifafree(&oia->ia_ifa); + if (unlinked) + IFA_REMREF(ifa); + /* release reference held for this routine */ + IFA_REMREF(ifa); } void -in6_purgeif(ifp) - struct ifnet *ifp; +in6_purgeif(struct ifnet *ifp) { - struct in6_ifaddr *ia, *nia = NULL; + struct in6_ifaddr *ia; - if (ifp == NULL || &ifp->if_addrlist == NULL) + if (ifp == NULL) return; - - lck_mtx_lock(nd6_mutex); - for (ia = in6_ifaddrs; ia != NULL; ia = nia) - { - nia = ia->ia_next; - if (ia->ia_ifa.ifa_ifp != ifp) + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); + + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + ia = in6_ifaddrs; + while (ia != NULL) { + if (ia->ia_ifa.ifa_ifp != ifp) { + ia = ia->ia_next; continue; - in6_purgeaddr(&ia->ia_ifa, 1); + } + IFA_ADDREF(&ia->ia_ifa); /* for us */ + lck_rw_done(&in6_ifaddr_rwlock); + in6_purgeaddr(&ia->ia_ifa); + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + IFA_REMREF(&ia->ia_ifa); /* for us */ + /* + * Purging the address would have caused + * in6_ifaddr_rwlock to be dropped and reacquired; + * therefore search again from the beginning + * of in6_ifaddrs list. + */ + ia = in6_ifaddrs; } - lck_mtx_unlock(nd6_mutex); + lck_rw_done(&in6_ifaddr_rwlock); in6_ifdetach(ifp); } @@ -1791,7 +2063,7 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, /* sanity checks */ if (!data || !ifp) { panic("invalid argument to in6_lifaddr_ioctl"); - /*NOTRECHED*/ + /*NOTREACHED*/ } switch (cmd) { @@ -1845,9 +2117,11 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, 0); if (!ifa) return EADDRNOTAVAIL; + IFA_LOCK_SPIN(ifa); hostaddr = *IFA_IN6(ifa); + IFA_UNLOCK(ifa); hostid_found = 1; - ifafree(ifa); + IFA_REMREF(ifa); ifa = NULL; /* prefixlen must be <= 64. */ @@ -1855,10 +2129,10 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, return EINVAL; prefixlen = iflr->prefixlen; - /* hostaddr part must be zero. */ + /* hostid part must be zero. 
*/ sin6 = (struct sockaddr_in6 *)&iflr->addr; - if (sin6->sin6_addr.s6_addr32[2] != 0 - || sin6->sin6_addr.s6_addr32[3] != 0) { + if (sin6->sin6_addr.s6_addr32[2] != 0 || + sin6->sin6_addr.s6_addr32[3] != 0) { return EINVAL; } } else @@ -1890,7 +2164,7 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, } ifra.ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); - in6_len2mask(&ifra.ifra_prefixmask.sin6_addr, prefixlen); + in6_prefixlen2mask(&ifra.ifra_prefixmask.sin6_addr, prefixlen); ifra.ifra_flags = iflr->flags & ~IFLR_PREFIX; if (!p64) { @@ -1935,7 +2209,7 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, bzero(&mask, sizeof(mask)); if (iflr->flags & IFLR_PREFIX) { /* lookup a prefix rather than address. */ - in6_len2mask(&mask, iflr->prefixlen); + in6_prefixlen2mask(&mask, iflr->prefixlen); sin6 = (struct sockaddr_in6 *)&iflr->addr; bcopy(&sin6->sin6_addr, &match, sizeof(match)); @@ -1955,7 +2229,7 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, cmp = 0; /* XXX */ } else { /* on deleting an address, do exact match */ - in6_len2mask(&mask, 128); + in6_prefixlen2mask(&mask, 128); sin6 = (struct sockaddr_in6 *)&iflr->addr; bcopy(&sin6->sin6_addr, &match, sizeof(match)); @@ -1966,13 +2240,18 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; - if (!cmp) + } + if (!cmp) { + IFA_UNLOCK(ifa); break; + } bcopy(IFA_IN6(ifa), &candidate, sizeof(candidate)); -#ifndef SCOPEDROUTING + IFA_UNLOCK(ifa); /* * XXX: this is adhoc, but is necessary to allow * a user to specify fe80::/64 (not /10) for a @@ -1980,7 +2259,6 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, */ if (IN6_IS_ADDR_LINKLOCAL(&candidate)) candidate.s6_addr16[1] = 0; -#endif candidate.s6_addr32[0] &= mask.s6_addr32[0]; candidate.s6_addr32[1] &= mask.s6_addr32[1]; candidate.s6_addr32[2] &= mask.s6_addr32[2]; @@ -1988,30 +2266,28 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, if (IN6_ARE_ADDR_EQUAL(&candidate, &match)) break; } + if (ifa != NULL) + IFA_ADDREF(ifa); ifnet_lock_done(ifp); if (!ifa) return EADDRNOTAVAIL; ia = ifa2ia6(ifa); if (cmd == SIOCGLIFADDR) { -#ifndef SCOPEDROUTING struct sockaddr_in6 *s6; -#endif + IFA_LOCK(ifa); /* fill in the if_laddrreq structure */ bcopy(&ia->ia_addr, &iflr->addr, ia->ia_addr.sin6_len); -#ifndef SCOPEDROUTING /* XXX see above */ s6 = (struct sockaddr_in6 *)&iflr->addr; if (IN6_IS_ADDR_LINKLOCAL(&s6->sin6_addr)) { s6->sin6_addr.s6_addr16[1] = 0; s6->sin6_scope_id = in6_addr2scopeid(ifp, &s6->sin6_addr); } -#endif if ((ifp->if_flags & IFF_POINTOPOINT) != 0) { bcopy(&ia->ia_dstaddr, &iflr->dstaddr, ia->ia_dstaddr.sin6_len); -#ifndef SCOPEDROUTING /* XXX see above */ s6 = (struct sockaddr_in6 *)&iflr->dstaddr; if (IN6_IS_ADDR_LINKLOCAL(&s6->sin6_addr)) { s6->sin6_addr.s6_addr16[1] = 0; @@ -2019,7 +2295,6 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, in6_addr2scopeid(ifp, &s6->sin6_addr); } -#endif } else bzero(&iflr->dstaddr, sizeof(iflr->dstaddr)); @@ -2028,7 +2303,8 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, NULL); iflr->flags = ia->ia6_flags; /* XXX */ - + IFA_UNLOCK(ifa); + IFA_REMREF(ifa); return 0; } else { struct in6_aliasreq ifra; @@ -2038,6 +2314,7 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, bcopy(iflr->iflr_name, 
ifra.ifra_name, sizeof(ifra.ifra_name)); + IFA_LOCK(ifa); bcopy(&ia->ia_addr, &ifra.ifra_addr, ia->ia_addr.sin6_len); if ((ifp->if_flags & IFF_POINTOPOINT) != 0) { @@ -2051,6 +2328,8 @@ in6_lifaddr_ioctl(struct socket *so, u_long cmd, caddr_t data, ia->ia_prefixmask.sin6_len); ifra.ifra_flags = ia->ia6_flags; + IFA_UNLOCK(ifa); + IFA_REMREF(ifa); if (!p64) { #if defined(__LP64__) struct in6_aliasreq_32 ifra_32; @@ -2120,24 +2399,30 @@ in6_ifinit(ifp, ia, sin6, newhost) ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { - if (ifa->ifa_addr == NULL) - continue; /* just for safety */ - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; + } ifacount++; + IFA_UNLOCK(ifa); } ifnet_lock_done(ifp); + ifa = &ia->ia_ifa; + IFA_LOCK_SPIN(ifa); ia->ia_addr = *sin6; - + IFA_UNLOCK(ifa); if (ifacount <= 1 && (error = ifnet_ioctl(ifp, PF_INET6, SIOCSIFADDR, ia))) { - if (error) { + if (error == EOPNOTSUPP) + error = 0; + else if (error) return(error); - } } + IFA_LOCK(ifa); ia->ia_ifa.ifa_metric = ifp->if_metric; /* we could do in(6)_socktrim here, but just omit it at this moment. */ @@ -2150,11 +2435,14 @@ in6_ifinit(ifp, ia, sin6, newhost) */ plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */ if (plen == 128 && ia->ia_dstaddr.sin6_family == AF_INET6) { + IFA_UNLOCK(ifa); if ((error = rtinit(&(ia->ia_ifa), (int)RTM_ADD, - RTF_UP | RTF_HOST)) != 0) + RTF_UP | RTF_HOST)) != 0) return(error); + IFA_LOCK(ifa); ia->ia_flags |= IFA_ROUTE; } + IFA_LOCK_ASSERT_HELD(ifa); if (plen < 128) { /* * The RTF_CLONING flag is necessary for in6_is_ifloop_auto(). @@ -2166,104 +2454,19 @@ in6_ifinit(ifp, ia, sin6, newhost) if (newhost) { /* set the rtrequest function to create llinfo */ ia->ia_ifa.ifa_rtrequest = nd6_rtrequest; + IFA_UNLOCK(ifa); in6_ifaddloop(&(ia->ia_ifa)); + } else { + IFA_UNLOCK(ifa); } return(error); } -/* - * Add an address to the list of IP6 multicast addresses for a - * given interface. - */ -struct in6_multi * -in6_addmulti(maddr6, ifp, errorp, nd6_locked) - struct in6_addr *maddr6; - struct ifnet *ifp; - int *errorp; - int nd6_locked; -{ - struct in6_multi *in6m; - struct sockaddr_in6 sin6; - struct ifmultiaddr *ifma; - - *errorp = 0; - - /* - * Call generic routine to add membership or increment - * refcount. It wants addresses in the form of a sockaddr, - * so we build one here (being careful to zero the unused bytes). - */ - bzero(&sin6, sizeof sin6); - sin6.sin6_family = AF_INET6; - sin6.sin6_len = sizeof sin6; - sin6.sin6_addr = *maddr6; - *errorp = if_addmulti(ifp, (struct sockaddr *)&sin6, &ifma); - if (*errorp) { - return 0; - } - - /* - * If ifma->ifma_protospec is null, then if_addmulti() created - * a new record. Otherwise, we are done. - */ - if (ifma->ifma_protospec != 0) - return ifma->ifma_protospec; - - /* XXX - if_addmulti uses M_WAITOK. Can this really be called - at interrupt time? If so, need to fix if_addmulti. XXX */ - in6m = (struct in6_multi *)_MALLOC(sizeof(*in6m), M_IPMADDR, M_NOWAIT); - if (in6m == NULL) { - return (NULL); - } - - bzero(in6m, sizeof *in6m); - in6m->in6m_addr = *maddr6; - in6m->in6m_ifp = ifp; - in6m->in6m_ifma = ifma; - ifma->ifma_protospec = in6m; - if (nd6_locked == 0) - lck_mtx_lock(nd6_mutex); - LIST_INSERT_HEAD(&in6_multihead, in6m, in6m_entry); - if (nd6_locked == 0) - lck_mtx_unlock(nd6_mutex); - - /* - * Let MLD6 know that we have joined a new IP6 multicast - * group. 
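/*
 * [Editor's note, not part of the patch] The in6_addmulti()/
 * in6_delmulti() pair removed here is superseded by in6_joingroup()/
 * in6_leavegroup(): each join hands back an in6_multi_mship that the
 * address records on its ia6_memberships list, so teardown no longer
 * needs ad-hoc group lookups.  Leaving becomes uniform, as in the
 * in6_purgeaddr() hunk earlier (IFA lock handling omitted here):
 */
        while ((imm = ia->ia6_memberships.lh_first) != NULL) {
                LIST_REMOVE(imm, i6mm_chain);
                in6_leavegroup(imm);
        }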
- */ - mld6_start_listening(in6m); - return(in6m); -} - -/* - * Delete a multicast address record. - */ void -in6_delmulti( - struct in6_multi *in6m, int nd6locked) +in6_purgeaddrs(struct ifnet *ifp) { - struct ifmultiaddr *ifma = in6m->in6m_ifma; - - if (ifma && ifma->ifma_usecount == 1) { - /* - * No remaining claims to this record; let MLD6 know - * that we are leaving the multicast group. - */ - mld6_stop_listening(in6m); - ifma->ifma_protospec = 0; - if (nd6locked == 0) - lck_mtx_lock(nd6_mutex); - LIST_REMOVE(in6m, in6m_entry); - if (nd6locked == 0) - lck_mtx_unlock(nd6_mutex); - FREE(in6m, M_IPMADDR); - } - /* XXX - should be separate API for when we have an ifma? */ - if (ifma) { - if_delmultiaddr(ifma, 0); - ifma_release(ifma); - } + in6_purgeif(ifp); } /* @@ -2279,19 +2482,23 @@ in6ifa_ifpforlinklocal(ifp, ignoreflags) ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { - if (ifa->ifa_addr == NULL) - continue; /* just for safety */ - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; + } if (IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa))) { if ((((struct in6_ifaddr *)ifa)->ia6_flags & - ignoreflags) != 0) + ignoreflags) != 0) { + IFA_UNLOCK(ifa); continue; + } + IFA_ADDREF_LOCKED(ifa); /* for caller */ + IFA_UNLOCK(ifa); break; } + IFA_UNLOCK(ifa); } - if (ifa != NULL) - ifaref(ifa); ifnet_lock_done(ifp); return((struct in6_ifaddr *)ifa); @@ -2310,15 +2517,18 @@ in6ifa_ifpwithaddr(ifp, addr) ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { - if (ifa->ifa_addr == NULL) - continue; /* just for safety */ - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; - if (IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa))) + } + if (IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa))) { + IFA_ADDREF_LOCKED(ifa); /* for caller */ + IFA_UNLOCK(ifa); break; + } + IFA_UNLOCK(ifa); } - if (ifa != NULL) - ifaref(ifa); ifnet_lock_done(ifp); return((struct in6_ifaddr *)ifa); @@ -2385,7 +2595,7 @@ in6addr_local(struct in6_addr *in6) struct sockaddr_in6 sin6; int local = 0; - if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6)) + if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_SCOPE_LINKLOCAL(in6)) return (1); sin6.sin6_family = AF_INET6; @@ -2406,48 +2616,48 @@ in6addr_local(struct in6_addr *in6) } int -in6_localaddr(in6) - struct in6_addr *in6; +in6_localaddr(struct in6_addr *in6) { struct in6_ifaddr *ia; if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6)) - return 1; + return (1); - lck_mtx_lock(nd6_mutex); - for (ia = in6_ifaddrs; ia; ia = ia->ia_next) + lck_rw_lock_shared(&in6_ifaddr_rwlock); + for (ia = in6_ifaddrs; ia; ia = ia->ia_next) { + IFA_LOCK_SPIN(&ia->ia_ifa); if (IN6_ARE_MASKED_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr, - &ia->ia_prefixmask.sin6_addr)) { - lck_mtx_unlock(nd6_mutex); - return 1; + &ia->ia_prefixmask.sin6_addr)) { + IFA_UNLOCK(&ia->ia_ifa); + lck_rw_done(&in6_ifaddr_rwlock); + return (1); } - - lck_mtx_unlock(nd6_mutex); + IFA_UNLOCK(&ia->ia_ifa); + } + lck_rw_done(&in6_ifaddr_rwlock); return (0); } int -in6_is_addr_deprecated(sa6) - struct sockaddr_in6 *sa6; +in6_is_addr_deprecated(struct sockaddr_in6 *sa6) { struct in6_ifaddr *ia; - lck_mtx_lock(nd6_mutex); + lck_rw_lock_shared(&in6_ifaddr_rwlock); for (ia = in6_ifaddrs; ia; ia = ia->ia_next) { + IFA_LOCK_SPIN(&ia->ia_ifa); if (IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, - &sa6->sin6_addr) && -#if SCOPEDROUTING - 
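The rewritten lookup loops here illustrate the locking discipline this patch adopts throughout: take IFA_LOCK on each entry, test it, take the caller's reference with IFA_ADDREF_LOCKED while still locked, and only then unlock, so the address cannot be freed between the match and the reference. A schematic of the pattern in the same xnu idiom (not standalone code; the types and macros are kernel-internal, but all appear in this patch):

/* Return the first AF_INET6 address on ifp, referenced for the caller. */
static struct ifaddr *
first_inet6_ifa(struct ifnet *ifp)
{
    struct ifaddr *ifa;

    ifnet_lock_shared(ifp);
    TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) {
        IFA_LOCK(ifa);
        if (ifa->ifa_addr->sa_family != AF_INET6) {
            IFA_UNLOCK(ifa);
            continue;
        }
        IFA_ADDREF_LOCKED(ifa);  /* ref taken before dropping the lock */
        IFA_UNLOCK(ifa);
        break;                   /* ifa remains valid after unlock */
    }
    ifnet_lock_done(ifp);
    return (ifa);   /* NULL at list end; caller must IFA_REMREF */
}

Compare the old code, which called ifaref() only after leaving the loop, a window in which the entry could in principle go away.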
ia->ia_addr.sin6_scope_id == sa6->sin6_scope_id && -#endif + &sa6->sin6_addr) && (ia->ia6_flags & IN6_IFF_DEPRECATED) != 0) { - lck_mtx_unlock(nd6_mutex); + IFA_UNLOCK(&ia->ia_ifa); + lck_rw_done(&in6_ifaddr_rwlock); return(1); /* true */ } - /* XXX: do we still have to go thru the rest of the list? */ + IFA_UNLOCK(&ia->ia_ifa); } - lck_mtx_unlock(nd6_mutex); + lck_rw_done(&in6_ifaddr_rwlock); return(0); /* false */ } @@ -2542,9 +2752,7 @@ in6_ifawithscope( struct in6_ifaddr *ifa_best = NULL; if (oifp == NULL) { -#if 0 - printf("in6_ifawithscope: output interface is not specified\n"); -#endif + /* output interface is not specified */ return(NULL); } @@ -2567,9 +2775,11 @@ in6_ifawithscope( { int tlen = -1, dscopecmp, bscopecmp, matchcmp; - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; - + } src_scope = in6_addrscope(IFA_IN6(ifa)); /* @@ -2577,18 +2787,21 @@ in6_ifawithscope( * nor a duplicated address. */ if (((struct in6_ifaddr *)ifa)->ia6_flags & - IN6_IFF_NOTREADY) + IN6_IFF_NOTREADY) { + IFA_UNLOCK(ifa); continue; - + } /* XXX: is there any case to allow anycasts? */ if (((struct in6_ifaddr *)ifa)->ia6_flags & - IN6_IFF_ANYCAST) + IN6_IFF_ANYCAST) { + IFA_UNLOCK(ifa); continue; - + } if (((struct in6_ifaddr *)ifa)->ia6_flags & - IN6_IFF_DETACHED) + IN6_IFF_DETACHED) { + IFA_UNLOCK(ifa); continue; - + } /* * If this is the first address we find, * keep it anyway. @@ -2620,9 +2833,10 @@ in6_ifawithscope( IN6_ARE_SCOPE_CMP(src_scope, dst_scope) >= 0) goto replace; /* (A) */ if (IN6_ARE_SCOPE_CMP(src_scope, dst_scope) < 0 && - IN6_ARE_SCOPE_CMP(best_scope, dst_scope) >= 0) + IN6_ARE_SCOPE_CMP(best_scope, dst_scope) >= 0) { + IFA_UNLOCK(ifa); continue; /* (B) */ - + } /* * A deprecated address SHOULD NOT be used in new * communications if an alternate (non-deprecated) @@ -2635,16 +2849,19 @@ in6_ifawithscope( * Ignore any deprecated addresses if * specified by configuration. */ - if (!ip6_use_deprecated) + if (!ip6_use_deprecated) { + IFA_UNLOCK(ifa); continue; - + } /* * If we have already found a non-deprecated * candidate, just ignore deprecated addresses. */ if ((ifa_best->ia6_flags & IN6_IFF_DEPRECATED) - == 0) + == 0) { + IFA_UNLOCK(ifa); continue; + } } /* @@ -2660,7 +2877,7 @@ in6_ifawithscope( /* * When we use temporary addresses described in - * RFC 3041, we prefer temporary addresses to + * RFC 4941, we prefer temporary addresses to * public autoconf addresses. Again, note the * invariants from (A) and (B). 
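The in6_localaddr() hunk above decides "local" by comparing the destination against each configured prefix with IN6_ARE_MASKED_ADDR_EQUAL, now under the new in6_ifaddr_rwlock instead of nd6_mutex. That macro boils down to a word-wise comparison under the mask; a self-contained equivalent (s6_addr32 is the KAME-style accessor this file already uses, though it may be kernel-only in shipped headers):

#include <netinet/in.h>

/* True when a and b agree on every bit covered by mask. */
static int
masked_addr_equal(const struct in6_addr *a, const struct in6_addr *b,
    const struct in6_addr *mask)
{
    int i;

    for (i = 0; i < 4; i++) {
        if ((a->s6_addr32[i] & mask->s6_addr32[i]) !=
            (b->s6_addr32[i] & mask->s6_addr32[i]))
            return (0);
    }
    return (1);
}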
Also note that we * don't have any preference between static addresses @@ -2685,6 +2902,7 @@ in6_ifawithscope( (ifat->ia6_flags & (IN6_IFF_AUTOCONF|IN6_IFF_TEMPORARY)) == IN6_IFF_AUTOCONF) { + IFA_UNLOCK(ifa); continue; } } @@ -2745,8 +2963,10 @@ in6_ifawithscope( if (bscopecmp == 0) { struct ifnet *bifp = ifa_best->ia_ifp; - if (bifp == oifp && ifp != oifp) /* (1) */ + if (bifp == oifp && ifp != oifp) { /* (1) */ + IFA_UNLOCK(ifa); continue; + } if (bifp != oifp && ifp == oifp) /* (2) */ goto replace; @@ -2761,16 +2981,20 @@ in6_ifawithscope( matchcmp = tlen - blen; if (matchcmp > 0) /* (3) */ goto replace; + IFA_UNLOCK(ifa); continue; /* (4) */ } if (dscopecmp > 0) { - if (bscopecmp > 0) /* (5) */ + if (bscopecmp > 0) { /* (5) */ + IFA_UNLOCK(ifa); continue; + } goto replace; /* (6) */ } if (dscopecmp < 0) { if (bscopecmp > 0) /* (7) */ goto replace; + IFA_UNLOCK(ifa); continue; /* (8) */ } @@ -2778,14 +3002,15 @@ in6_ifawithscope( if (bscopecmp < 0) goto replace; /* (9) */ - replace: - ifaref(ifa); - if (ifa_best) - ifafree(&ifa_best->ia_ifa); - ifa_best = (struct in6_ifaddr *)ifa; +replace: + IFA_ADDREF_LOCKED(ifa); /* for ifa_best */ blen = tlen >= 0 ? tlen : in6_matchlen(IFA_IN6(ifa), dst); - best_scope = in6_addrscope(&ifa_best->ia_addr.sin6_addr); + best_scope = in6_addrscope(&ifa2ia6(ifa)->ia_addr.sin6_addr); + IFA_UNLOCK(ifa); + if (ifa_best) + IFA_REMREF(&ifa_best->ia_ifa); + ifa_best = (struct in6_ifaddr *)ifa; } ifnet_lock_done(ifp); } @@ -2795,6 +3020,7 @@ in6_ifawithscope( if (ifa_best == NULL) ip6stat.ip6s_sources_none++; else { + IFA_LOCK_SPIN(&ifa_best->ia_ifa); if (oifp == ifa_best->ia_ifp) ip6stat.ip6s_sources_sameif[best_scope]++; else @@ -2807,6 +3033,7 @@ in6_ifawithscope( if ((ifa_best->ia6_flags & IN6_IFF_DEPRECATED) != 0) ip6stat.ip6s_sources_deprecated[best_scope]++; + IFA_UNLOCK(&ifa_best->ia_ifa); } return(ifa_best); @@ -2823,7 +3050,7 @@ in6_ifawithifp( { int dst_scope = in6_addrscope(dst), blen = -1, tlen; struct ifaddr *ifa; - struct in6_ifaddr *besta = 0; + struct in6_ifaddr *besta = NULL; struct in6_ifaddr *dep[2]; /* last-resort: deprecated */ dep[0] = dep[1] = NULL; @@ -2837,20 +3064,32 @@ in6_ifawithifp( ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; - if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) + } + if (ifa2ia6(ifa)->ia6_flags & IN6_IFF_ANYCAST) { + IFA_UNLOCK(ifa); continue; /* XXX: is there any case to allow anycast? 
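Rules (A) and (B) above lean on IN6_ARE_SCOPE_CMP, which in KAME is plain integer subtraction of the scope constants (node-local < link-local < site-local < org-local < global), so a non-negative comparison means "at least as wide". A toy predicate capturing just those two invariants (a simplification; the real function also weighs deprecation, temporariness, and prefix match length):

/* KAME-style scope compare: negative means narrower than. */
#define SCOPE_CMP(a, b)    ((a) - (b))

/*
 * Decide whether a candidate scope should replace the current best
 * for a given destination scope, per invariants (A) and (B) only.
 */
static int
scope_prefer(int cand, int best, int dst)
{
    if (SCOPE_CMP(cand, best) < 0 && SCOPE_CMP(cand, dst) >= 0)
        return (1);     /* (A): narrower, yet still covers dst */
    if (SCOPE_CMP(cand, dst) < 0 && SCOPE_CMP(best, dst) >= 0)
        return (0);     /* (B): never trade coverage away */
    return (-1);        /* undecided: the later rules apply */
}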
*/ - if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) + } + if (ifa2ia6(ifa)->ia6_flags & IN6_IFF_NOTREADY) { + IFA_UNLOCK(ifa); continue; /* don't use this interface */ - if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) + } + if (ifa2ia6(ifa)->ia6_flags & IN6_IFF_DETACHED) { + IFA_UNLOCK(ifa); continue; - if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { + } + if (ifa2ia6(ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (ip6_use_deprecated) { + IFA_ADDREF_LOCKED(ifa); /* for dep[0] */ + IFA_UNLOCK(ifa); if (dep[0] != NULL) - ifafree(&dep[0]->ia_ifa); + IFA_REMREF(&dep[0]->ia_ifa); dep[0] = (struct in6_ifaddr *)ifa; - ifaref(ifa); + } else { + IFA_UNLOCK(ifa); } continue; } @@ -2860,51 +3099,77 @@ in6_ifawithifp( * call in6_matchlen() as few as possible */ if (besta) { - if (blen == -1) + if (blen == -1) { + IFA_UNLOCK(ifa); + IFA_LOCK(&besta->ia_ifa); blen = in6_matchlen(&besta->ia_addr.sin6_addr, dst); + IFA_UNLOCK(&besta->ia_ifa); + IFA_LOCK(ifa); + } tlen = in6_matchlen(IFA_IN6(ifa), dst); if (tlen > blen) { blen = tlen; + IFA_ADDREF_LOCKED(ifa); /* for besta */ + IFA_UNLOCK(ifa); + IFA_REMREF(&besta->ia_ifa); besta = (struct in6_ifaddr *)ifa; + } else { + IFA_UNLOCK(ifa); } - } else + } else { besta = (struct in6_ifaddr *)ifa; + IFA_ADDREF_LOCKED(ifa); /* for besta */ + IFA_UNLOCK(ifa); + } + } else { + IFA_UNLOCK(ifa); } } if (besta) { - ifaref(&besta->ia_ifa); ifnet_lock_done(ifp); if (dep[0] != NULL) - ifafree(&dep[0]->ia_ifa); + IFA_REMREF(&dep[0]->ia_ifa); return(besta); } TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; - if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) + } + if (ifa2ia6(ifa)->ia6_flags & IN6_IFF_ANYCAST) { + IFA_UNLOCK(ifa); continue; /* XXX: is there any case to allow anycast? 
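Both selection passes in in6_ifawithifp() rank candidates with in6_matchlen(), the number of leading bits an address shares with the destination; blen caches the best length so the function can, as its comment says, call in6_matchlen() as few times as possible. A standalone equivalent of that measurement:

#include <netinet/in.h>

/* Length in bits of the common prefix of two IPv6 addresses (0..128). */
static int
common_prefix_len(const struct in6_addr *a, const struct in6_addr *b)
{
    int bits = 0, i;
    unsigned char diff;

    for (i = 0; i < 16; i++) {
        diff = a->s6_addr[i] ^ b->s6_addr[i];
        if (diff == 0) {
            bits += 8;
            continue;
        }
        while ((diff & 0x80) == 0) {    /* count leading equal bits */
            bits++;
            diff <<= 1;
        }
        break;
    }
    return (bits);
}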
*/ - if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) + } + if (ifa2ia6(ifa)->ia6_flags & IN6_IFF_NOTREADY) { + IFA_UNLOCK(ifa); continue; /* don't use this interface */ - if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) + } + if (ifa2ia6(ifa)->ia6_flags & IN6_IFF_DETACHED) { + IFA_UNLOCK(ifa); continue; - if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { + } + if (ifa2ia6(ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (ip6_use_deprecated) { + IFA_ADDREF_LOCKED(ifa); /* for dep[1] */ + IFA_UNLOCK(ifa); if (dep[1] != NULL) - ifafree(&dep[1]->ia_ifa); + IFA_REMREF(&dep[1]->ia_ifa); dep[1] = (struct in6_ifaddr *)ifa; - ifaref(ifa); + } else { + IFA_UNLOCK(ifa); } continue; } - if (ifa != NULL) - ifaref(ifa); + IFA_ADDREF_LOCKED(ifa); /* for caller */ + IFA_UNLOCK(ifa); ifnet_lock_done(ifp); if (dep[0] != NULL) - ifafree(&dep[0]->ia_ifa); + IFA_REMREF(&dep[0]->ia_ifa); if (dep[1] != NULL) - ifafree(&dep[1]->ia_ifa); + IFA_REMREF(&dep[1]->ia_ifa); return (struct in6_ifaddr *)ifa; } ifnet_lock_done(ifp); @@ -2912,7 +3177,7 @@ in6_ifawithifp( /* use the last-resort values, that are, deprecated addresses */ if (dep[0]) { if (dep[1] != NULL) - ifafree(&dep[1]->ia_ifa); + IFA_REMREF(&dep[1]->ia_ifa); return dep[0]; } if (dep[1]) @@ -2945,14 +3210,22 @@ in6_if_up( return error; dad_delay = 0; + ifnet_lock_exclusive(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK_SPIN(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; + } ia = (struct in6_ifaddr *)ifa; - if (ia->ia6_flags & IN6_IFF_TENTATIVE) + if (ia->ia6_flags & IN6_IFF_TENTATIVE) { + IFA_UNLOCK(ifa); nd6_dad_start(ifa, &dad_delay); + } else { + IFA_UNLOCK(ifa); + } } ifnet_lock_done(ifp); @@ -2966,6 +3239,15 @@ in6if_do_dad( if ((ifp->if_flags & IFF_LOOPBACK) != 0) return(0); + /* + * Skip DAD on service triggered interfaces, for now, + * until we have support for Opportunistic Duplicate + * Address Detection [RFC 4429] and we can then back + * this out. + */ + if (ifp->if_eflags & IFEF_SERVICE_TRIGGERED) + return (0); + switch (ifp->if_type) { #if IFT_DUMMY case IFT_DUMMY: @@ -3018,7 +3300,69 @@ in6_setmaxmtu() if (maxmtu) /* update only when maxmtu is positive */ in6_maxmtu = maxmtu; } - +/* + * Provide the length of interface identifiers to be used for the link attached + * to the given interface. The length should be defined in "IPv6 over + * xxx-link" document. Note that address architecture might also define + * the length for a particular set of address prefixes, regardless of the + * link type. As clarified in rfc2462bis, those two definitions should be + * consistent, and those really are as of August 2004. + */ +int +in6_if2idlen(struct ifnet *ifp) +{ + switch (ifp->if_type) { + case IFT_ETHER: /* RFC2464 */ + case IFT_IEEE8023ADLAG: /* IEEE802.3ad Link Aggregate */ +#ifdef IFT_PROPVIRTUAL + case IFT_PROPVIRTUAL: /* XXX: no RFC. 
treat it as ether */ +#endif +#ifdef IFT_L2VLAN + case IFT_L2VLAN: /* ditto */ +#endif +#ifdef IFT_IEEE80211 + case IFT_IEEE80211: /* ditto */ +#endif +#ifdef IFT_MIP + case IFT_MIP: /* ditto */ +#endif + return (64); + case IFT_FDDI: /* RFC2467 */ + return (64); + case IFT_ISO88025: /* RFC2470 (IPv6 over Token Ring) */ + return (64); + case IFT_PPP: /* RFC2472 */ + return (64); + case IFT_ARCNET: /* RFC2497 */ + return (64); + case IFT_FRELAY: /* RFC2590 */ + return (64); + case IFT_IEEE1394: /* RFC3146 */ + return (64); + case IFT_GIF: + return (64); /* draft-ietf-v6ops-mech-v2-07 */ + case IFT_LOOP: + return (64); /* XXX: is this really correct? */ + case IFT_OTHER: + return (64); /* for utun interfaces */ + case IFT_CELLULAR: + return (64); /* Packet Data over Cellular */ + default: + /* + * Unknown link type: + * It might be controversial to use the today's common constant + * of 64 for these cases unconditionally. For full compliance, + * we should return an error in this case. On the other hand, + * if we simply miss the standard for the link type or a new + * standard is defined for a new link type, the IFID length + * is very likely to be the common constant. As a compromise, + * we always use the constant, but make an explicit notice + * indicating the "unknown" case. + */ + printf("in6_if2idlen: unknown link type (%d)\n", ifp->if_type); + return (64); + } +} /* * Convert sockaddr_in6 to sockaddr_in. Original sockaddr_in6 must be * v4 mapped addr or v4 compat addr @@ -3030,7 +3374,7 @@ in6_sin6_2_sin(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) sin->sin_len = sizeof(struct sockaddr_in); sin->sin_family = AF_INET; sin->sin_port = sin6->sin6_port; - sin->sin_addr.s_addr = sin6->sin6_addr.s6_addr32[3]; + sin->sin_addr.s_addr = sin6->sin6_addr.s6_addr32[3]; } /* Convert sockaddr_in to sockaddr_in6 in v4 mapped addr format. */ @@ -3096,11 +3440,14 @@ in6_post_msg(struct ifnet *ifp, u_int32_t event_code, struct in6_ifaddr *ifa) struct kev_msg ev_msg; struct kev_in6_data in6_event_data; + bzero(&in6_event_data, sizeof(struct kev_in6_data)); + bzero(&ev_msg, sizeof(struct kev_msg)); ev_msg.vendor_code = KEV_VENDOR_APPLE; ev_msg.kev_class = KEV_NETWORK_CLASS; ev_msg.kev_subclass = KEV_INET6_SUBCLASS; ev_msg.event_code = event_code; + IFA_LOCK(&ifa->ia_ifa); in6_event_data.ia_addr = ifa->ia_addr; in6_event_data.ia_net = ifa->ia_net; in6_event_data.ia_dstaddr = ifa->ia_dstaddr; @@ -3116,6 +3463,7 @@ in6_post_msg(struct ifnet *ifp, u_int32_t event_code, struct in6_ifaddr *ifa) ifa->ia6_lifetime.ia6t_vltime; in6_event_data.ia_lifetime.ia6t_pltime = ifa->ia6_lifetime.ia6t_pltime; + IFA_UNLOCK(&ifa->ia_ifa); if (ifp != NULL) { strncpy(&in6_event_data.link_data.if_name[0], @@ -3137,6 +3485,8 @@ in6_post_msg(struct ifnet *ifp, u_int32_t event_code, struct in6_ifaddr *ifa) void in6_ifaddr_init(void) { + in6_multi_init(); + PE_parse_boot_argn("ifa_debug", &in6ifa_debug, sizeof (in6ifa_debug)); in6ifa_size = (in6ifa_debug == 0) ? 
sizeof (struct in6_ifaddr) : @@ -3144,10 +3494,15 @@ in6_ifaddr_init(void) in6ifa_zone = zinit(in6ifa_size, IN6IFA_ZONE_MAX * in6ifa_size, 0, IN6IFA_ZONE_NAME); - if (in6ifa_zone == NULL) + if (in6ifa_zone == NULL) { panic("%s: failed allocating %s", __func__, IN6IFA_ZONE_NAME); - + /* NOTREACHED */ + } zone_change(in6ifa_zone, Z_EXPAND, TRUE); + zone_change(in6ifa_zone, Z_CALLERACCT, FALSE); + + lck_mtx_init(&in6ifa_trash_lock, ifa_mtx_grp, ifa_mtx_attr); + TAILQ_INIT(&in6ifa_trash_head); } static struct in6_ifaddr * @@ -3161,11 +3516,14 @@ in6_ifaddr_alloc(int how) bzero(in6ifa, in6ifa_size); in6ifa->ia_ifa.ifa_free = in6_ifaddr_free; in6ifa->ia_ifa.ifa_debug |= IFD_ALLOC; + ifa_lock_init(&in6ifa->ia_ifa); if (in6ifa_debug != 0) { struct in6_ifaddr_dbg *in6ifa_dbg = (struct in6_ifaddr_dbg *)in6ifa; in6ifa->ia_ifa.ifa_debug |= IFD_DEBUG; in6ifa->ia_ifa.ifa_trace = in6_ifaddr_trace; + in6ifa->ia_ifa.ifa_attached = in6_ifaddr_attached; + in6ifa->ia_ifa.ifa_detached = in6_ifaddr_detached; ctrace_record(&in6ifa_dbg->in6ifa_alloc); } } @@ -3175,22 +3533,80 @@ in6_ifaddr_alloc(int how) static void in6_ifaddr_free(struct ifaddr *ifa) { - if (ifa->ifa_refcnt != 0) + IFA_LOCK_ASSERT_HELD(ifa); + + if (ifa->ifa_refcnt != 0) { panic("%s: ifa %p bad ref cnt", __func__, ifa); - if (!(ifa->ifa_debug & IFD_ALLOC)) + /* NOTREACHED */ + } else if (!(ifa->ifa_debug & IFD_ALLOC)) { panic("%s: ifa %p cannot be freed", __func__, ifa); - + /* NOTREACHED */ + } if (ifa->ifa_debug & IFD_DEBUG) { struct in6_ifaddr_dbg *in6ifa_dbg = (struct in6_ifaddr_dbg *)ifa; ctrace_record(&in6ifa_dbg->in6ifa_free); bcopy(&in6ifa_dbg->in6ifa, &in6ifa_dbg->in6ifa_old, sizeof (struct in6_ifaddr)); + if (ifa->ifa_debug & IFD_TRASHED) { + /* Become a regular mutex, just in case */ + IFA_CONVERT_LOCK(ifa); + lck_mtx_lock(&in6ifa_trash_lock); + TAILQ_REMOVE(&in6ifa_trash_head, in6ifa_dbg, + in6ifa_trash_link); + lck_mtx_unlock(&in6ifa_trash_lock); + ifa->ifa_debug &= ~IFD_TRASHED; + } } + IFA_UNLOCK(ifa); + ifa_lock_destroy(ifa); bzero(ifa, sizeof (struct in6_ifaddr)); zfree(in6ifa_zone, ifa); } +static void +in6_ifaddr_attached(struct ifaddr *ifa) +{ + struct in6_ifaddr_dbg *in6ifa_dbg = (struct in6_ifaddr_dbg *)ifa; + + IFA_LOCK_ASSERT_HELD(ifa); + + if (!(ifa->ifa_debug & IFD_DEBUG)) { + panic("%s: ifa %p has no debug structure", __func__, ifa); + /* NOTREACHED */ + } + if (ifa->ifa_debug & IFD_TRASHED) { + /* Become a regular mutex, just in case */ + IFA_CONVERT_LOCK(ifa); + lck_mtx_lock(&in6ifa_trash_lock); + TAILQ_REMOVE(&in6ifa_trash_head, in6ifa_dbg, in6ifa_trash_link); + lck_mtx_unlock(&in6ifa_trash_lock); + ifa->ifa_debug &= ~IFD_TRASHED; + } +} + +static void +in6_ifaddr_detached(struct ifaddr *ifa) +{ + struct in6_ifaddr_dbg *in6ifa_dbg = (struct in6_ifaddr_dbg *)ifa; + + IFA_LOCK_ASSERT_HELD(ifa); + + if (!(ifa->ifa_debug & IFD_DEBUG)) { + panic("%s: ifa %p has no debug structure", __func__, ifa); + /* NOTREACHED */ + } else if (ifa->ifa_debug & IFD_TRASHED) { + panic("%s: ifa %p is already in trash list", __func__, ifa); + /* NOTREACHED */ + } + ifa->ifa_debug |= IFD_TRASHED; + /* Become a regular mutex, just in case */ + IFA_CONVERT_LOCK(ifa); + lck_mtx_lock(&in6ifa_trash_lock); + TAILQ_INSERT_TAIL(&in6ifa_trash_head, in6ifa_dbg, in6ifa_trash_link); + lck_mtx_unlock(&in6ifa_trash_lock); +} + static void in6_ifaddr_trace(struct ifaddr *ifa, int refhold) { @@ -3199,9 +3615,10 @@ in6_ifaddr_trace(struct ifaddr *ifa, int refhold) u_int32_t idx; u_int16_t *cnt; - if (!(ifa->ifa_debug & IFD_DEBUG)) + if (!(ifa->ifa_debug & 
IFD_DEBUG)) { panic("%s: ifa %p has no debug structure", __func__, ifa); - + /* NOTREACHED */ + } if (refhold) { cnt = &in6ifa_dbg->in6ifa_refhold_cnt; tr = in6ifa_dbg->in6ifa_refhold; @@ -3210,6 +3627,6 @@ in6_ifaddr_trace(struct ifaddr *ifa, int refhold) tr = in6ifa_dbg->in6ifa_refrele; } - idx = OSAddAtomic16(1, (volatile SInt16 *)cnt) % CTRACE_HIST_SIZE; + idx = atomic_add_16_ov(cnt, 1) % IN6IFA_TRACE_HIST_SIZE; ctrace_record(&tr[idx]); } diff --git a/bsd/netinet6/in6.h b/bsd/netinet6/in6.h index fb0479fde..c0838ec43 100644 --- a/bsd/netinet6/in6.h +++ b/bsd/netinet6/in6.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -114,7 +114,7 @@ typedef __uint8_t sa_family_t; * has the table of implementation/integration differences. */ #define __KAME__ -#define __KAME_VERSION "20010528/apple-darwin" +#define __KAME_VERSION "2009/apple-darwin" #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) /* @@ -192,6 +192,10 @@ struct sockaddr_in6 { * Local definition for masks */ #define IN6MASK0 {{{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }}} +#define IN6MASK7 {{{ 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }}} +#define IN6MASK16 {{{ 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }}} #define IN6MASK32 {{{ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, \ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }}} #define IN6MASK64 {{{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, \ @@ -206,6 +210,8 @@ struct sockaddr_in6 { extern const struct sockaddr_in6 sa6_any; extern const struct in6_addr in6mask0; +extern const struct in6_addr in6mask7; +extern const struct in6_addr in6mask16; extern const struct in6_addr in6mask32; extern const struct in6_addr in6mask64; extern const struct in6_addr in6mask96; @@ -250,12 +256,21 @@ extern const struct in6_addr in6mask128; #define IN6ADDR_NODELOCAL_ALLNODES_INIT \ {{{ 0xff, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 }}} +#define IN6ADDR_INTFACELOCAL_ALLNODES_INIT \ + {{{ 0xff, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 }}} #define IN6ADDR_LINKLOCAL_ALLNODES_INIT \ {{{ 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 }}} #define IN6ADDR_LINKLOCAL_ALLROUTERS_INIT \ {{{ 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02 }}} +#define IN6ADDR_LINKLOCAL_ALLV2ROUTERS_INIT \ + {{{ 0xff, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x16 }}} +#define IN6ADDR_V4MAPPED_INIT \ + {{{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ + 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }}} #endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ extern const struct in6_addr in6addr_any; @@ -264,6 +279,7 @@ extern const struct in6_addr in6addr_loopback; extern const struct in6_addr in6addr_nodelocal_allnodes; extern const struct in6_addr in6addr_linklocal_allnodes; extern const struct in6_addr in6addr_linklocal_allrouters; +extern const struct in6_addr in6addr_linklocal_allv2routers; #endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ /* @@ -326,6 +342,11 @@ extern const struct in6_addr in6addr_linklocal_allrouters; (*(const __uint32_t *)(const void *)(&(a)->s6_addr[4]) == 0) && \ (*(const __uint32_t *)(const 
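The updated tracer records each refhold/refrele into a fixed-size ring indexed by an atomically post-incremented counter, so concurrent callers never serialize on a lock and the newest IN6IFA_TRACE_HIST_SIZE events are always retained. The same idiom in portable C11 (record type and sizes are stand-ins, not the xnu ones):

#include <stdatomic.h>
#include <stdint.h>

#define TRACE_HIST_SIZE 32      /* power of two: wraps cleanly at 2^16 */

struct trace_rec { void *pc[8]; };

static _Atomic uint16_t trace_cnt;
static struct trace_rec trace_hist[TRACE_HIST_SIZE];

static void
trace_event(const struct trace_rec *rec)
{
    /* fetch-and-add returns the old value; modulo picks the slot */
    uint16_t idx = atomic_fetch_add(&trace_cnt, 1) % TRACE_HIST_SIZE;

    trace_hist[idx] = *rec;
}

Because the counter is 16 bits and the ring size divides 65536, the slot sequence stays continuous across counter wraparound.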
void *)(&(a)->s6_addr[8]) == ntohl(0x0000ffff))) +/* + * 6to4 + */ +#define IN6_IS_ADDR_6TO4(x) (ntohs((x)->s6_addr16[0]) == 0x2002) + /* * KAME Scope Values */ @@ -339,6 +360,7 @@ extern const struct in6_addr in6addr_linklocal_allrouters; #define IPV6_ADDR_SCOPE_GLOBAL 0x0e #else #define __IPV6_ADDR_SCOPE_NODELOCAL 0x01 +#define __IPV6_ADDR_SCOPE_INTFACELOCAL 0x01 #define __IPV6_ADDR_SCOPE_LINKLOCAL 0x02 #define __IPV6_ADDR_SCOPE_SITELOCAL 0x05 #define __IPV6_ADDR_SCOPE_ORGLOCAL 0x08 /* just used in this file */ @@ -359,6 +381,11 @@ */ #define IN6_IS_ADDR_MULTICAST(a) ((a)->s6_addr[0] == 0xff) +/* + * Unique Local IPv6 Unicast Addresses (per RFC 4193) + */ +#define IN6_IS_ADDR_UNIQUE_LOCAL(a) (((a)->s6_addr[0] == 0xfc) || ((a)->s6_addr[0] == 0xfd)) + #ifdef KERNEL /*XXX nonstandard*/ #define IPV6_ADDR_MC_SCOPE(a) ((a)->s6_addr[1] & 0x0f) #else @@ -450,6 +477,35 @@ struct route_in6 { */ /* no hdrincl */ #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +/* + * RFC 3542 defines the following socket options in a manner incompatible + * with RFC 2292: + * IPV6_PKTINFO + * IPV6_HOPLIMIT + * IPV6_NEXTHOP + * IPV6_HOPOPTS + * IPV6_DSTOPTS + * IPV6_RTHDR + * + * To use the new IPv6 socket options introduced by RFC 3542 + * the constant __APPLE_USE_RFC_3542 must be defined before + * including <netinet/in.h> + * + * To use the old IPv6 socket options from RFC 2292 + * the constant __APPLE_USE_RFC_2292 must be defined before + * including <netinet/in.h> + * + * Note that eventually RFC 3542 is going to be the + * default and RFC 2292 will be obsolete. + */ +#ifdef XNU_KERNEL_PRIVATE +#define __APPLE_USE_RFC_3542 1 +#endif /* XNU_KERNEL_PRIVATE */ + +#if defined(__APPLE_USE_RFC_3542) && defined(__APPLE_USE_RFC_2292) +#error "__APPLE_USE_RFC_3542 and __APPLE_USE_RFC_2292 cannot be both defined" +#endif + #if 0 /* the followings are relic in IPv4 and hence are disabled */ #define IPV6_OPTIONS 1 /* buf/ip6_opts; set/get IP6 options */ #define IPV6_RECVOPTS 5 /* bool; receive all IP6 opts w/dgram */ @@ -469,14 +525,24 @@ struct route_in6 { #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) #define IPV6_PORTRANGE 14 /* int; range to choose for unspec port */ #define ICMP6_FILTER 18 /* icmp6_filter; icmp6 filter */ -/* RFC2292 options */ -#define IPV6_PKTINFO 19 /* bool; send/recv if, src/dst addr */ -#define IPV6_HOPLIMIT 20 /* bool; hop limit */ -#define IPV6_NEXTHOP 21 /* bool; next hop addr */ -#define IPV6_HOPOPTS 22 /* bool; hop-by-hop option */ -#define IPV6_DSTOPTS 23 /* bool; destination option */ -#define IPV6_RTHDR 24 /* bool; routing header */ -#define IPV6_PKTOPTIONS 25 /* buf/cmsghdr; set/get IPv6 options */ +#define IPV6_2292PKTINFO 19 /* bool; send/recv if, src/dst addr */ +#define IPV6_2292HOPLIMIT 20 /* bool; hop limit */ +#define IPV6_2292NEXTHOP 21 /* bool; next hop addr */ +#define IPV6_2292HOPOPTS 22 /* bool; hop-by-hop option */ +#define IPV6_2292DSTOPTS 23 /* bool; destination option */ +#define IPV6_2292RTHDR 24 /* ip6_rthdr: routing header */ +#define IPV6_2292PKTOPTIONS 25 /* buf/cmsghdr; set/get IPv6 options */ + /* obsoleted by RFC3542 */ + +#ifdef __APPLE_USE_RFC_2292 +#define IPV6_PKTINFO IPV6_2292PKTINFO +#define IPV6_HOPLIMIT IPV6_2292HOPLIMIT +#define IPV6_NEXTHOP IPV6_2292NEXTHOP +#define IPV6_HOPOPTS IPV6_2292HOPOPTS +#define IPV6_DSTOPTS IPV6_2292DSTOPTS +#define IPV6_RTHDR IPV6_2292RTHDR +#define IPV6_PKTOPTIONS IPV6_2292PKTOPTIONS +#endif /* __APPLE_USE_RFC_2292 */ #define IPV6_CHECKSUM 26 /*
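The new IN6_IS_ADDR_6TO4 and IN6_IS_ADDR_UNIQUE_LOCAL macros above classify 2002::/16 (RFC 3056) and fc00::/7 (RFC 4193) addresses by inspecting the leading bytes. A small userland check, assuming these non-POSIX macros are visible after including the header (they sit outside the _POSIX_C_SOURCE guard):

#include <stdio.h>
#include <arpa/inet.h>
#include <netinet/in.h>

int
main(void)
{
    struct in6_addr a;

    inet_pton(AF_INET6, "fd00::1", &a);
    printf("unique local: %d\n", IN6_IS_ADDR_UNIQUE_LOCAL(&a));

    inet_pton(AF_INET6, "2002:c000:201::1", &a); /* 6to4 of 192.0.2.1 */
    printf("6to4: %d\n", IN6_IS_ADDR_6TO4(&a));
    return (0);
}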
int; checksum offset for raw socket */ #endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ @@ -500,8 +566,80 @@ struct route_in6 { #define IPV6_FW_GET 34 /* get entire firewall rule chain */ #endif /* 1 */ -#define IPV6_RECVTCLASS 35 /* bool; recv traffic class values */ -#define IPV6_TCLASS 36 /* int; send traffic class value */ +/* APPLE: NOTE the value of those 2 options is kept unchanged from + * previous versions of darwin/OS X for binary compatibility reasons + * and differs from FreeBSD (values 57 and 61). See below. + */ +#define IPV6_RECVTCLASS 35 /* bool; recv traffic class values */ +#define IPV6_TCLASS 36 /* int; send traffic class value */ + +#ifdef __APPLE_USE_RFC_3542 +/* new socket options introduced in RFC3542 */ +#define IPV6_RTHDRDSTOPTS 57 /* ip6_dest; send dst option before rthdr + * APPLE: Value purposely different than FreeBSD (35) to avoid + * collision with definition of IPV6_RECVTCLASS in previous + * darwin implementations */ + +#define IPV6_RECVPKTINFO 61 /* bool; recv if, dst addr + * APPLE: Value purposely different than FreeBSD (36) to avoid + * collision with definition of IPV6_TCLASS in previous + * darwin implementations */ + +#define IPV6_RECVHOPLIMIT 37 /* bool; recv hop limit */ +#define IPV6_RECVRTHDR 38 /* bool; recv routing header */ +#define IPV6_RECVHOPOPTS 39 /* bool; recv hop-by-hop option */ +#define IPV6_RECVDSTOPTS 40 /* bool; recv dst option after rthdr */ +#ifdef KERNEL +#define IPV6_RECVRTHDRDSTOPTS 41 /* bool; recv dst option before rthdr */ +#endif + +#define IPV6_USE_MIN_MTU 42 /* bool; send packets at the minimum MTU */ +#define IPV6_RECVPATHMTU 43 /* bool; notify an according MTU */ + +#define IPV6_PATHMTU 44 /* mtuinfo; get the current path MTU (sopt), + 4 bytes int; MTU notification (cmsg) */ +#if 0 /*obsoleted during 2292bis -> 3542*/ +#define IPV6_REACHCONF 45 /* no data; ND reachability confirm + (cmsg only/not in RFC3542) */ +#endif +/* more new socket options introduced in RFC3542 */ +#define IPV6_3542PKTINFO 46 /* in6_pktinfo; send if, src addr */ +#define IPV6_3542HOPLIMIT 47 /* int; send hop limit */ +#define IPV6_3542NEXTHOP 48 /* sockaddr; next hop addr */ +#define IPV6_3542HOPOPTS 49 /* ip6_hbh; send hop-by-hop option */ +#define IPV6_3542DSTOPTS 50 /* ip6_dest; send dst option before rthdr */ +#define IPV6_3542RTHDR 51 /* ip6_rthdr; send routing header */ + +#define IPV6_PKTINFO IPV6_3542PKTINFO +#define IPV6_HOPLIMIT IPV6_3542HOPLIMIT +#define IPV6_NEXTHOP IPV6_3542NEXTHOP +#define IPV6_HOPOPTS IPV6_3542HOPOPTS +#define IPV6_DSTOPTS IPV6_3542DSTOPTS +#define IPV6_RTHDR IPV6_3542RTHDR + +#define IPV6_AUTOFLOWLABEL 59 /* bool; attach flowlabel automagically */ + +#define IPV6_DONTFRAG 62 /* bool; disable IPv6 fragmentation */ + +#define IPV6_PREFER_TEMPADDR 63 /* int; prefer temporary addresses as + * the source address. + */ + +/* + * The following option is private; do not use it from user applications. + * It is deliberately defined to the same value as IP_MSFILTER. + */ +#define IPV6_MSFILTER 74 /* struct __msfilterreq; + * set/get multicast source filter list.
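With the RFC 3542 block above in place, a userland consumer opts in at compile time: defining __APPLE_USE_RFC_3542 before any header maps IPV6_PKTINFO and friends to the 3542 values and exposes the IPV6_RECV* options. A minimal illustration:

#define __APPLE_USE_RFC_3542    /* must precede the includes */
#include <sys/socket.h>
#include <netinet/in.h>

/* Ask for in6_pktinfo ancillary data on every received datagram. */
static int
enable_pktinfo(int s)
{
    int on = 1;

    return (setsockopt(s, IPPROTO_IPV6, IPV6_RECVPKTINFO,
        &on, sizeof (on)));
}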
+ */ +#endif /* __APPLE_USE_RFC_3542 */ + +#define IPV6_BOUND_IF 125 /* int; set/get bound interface */ + +#ifdef PRIVATE +#define IPV6_NO_IFT_CELLULAR 6969 /* for internal use only */ +#define IPV6_OUT_IF 9696 /* for internal use only */ +#endif /* PRIVATE */ /* to define items, should talk with KAME guys first, for *BSD compatibility */ @@ -515,6 +653,21 @@ struct route_in6 { #define IPV6_DEFAULT_MULTICAST_HOPS 1 /* normally limit m'casts to 1 hop */ #define IPV6_DEFAULT_MULTICAST_LOOP 1 /* normally hear sends if a member */ +/* + * The im6o_membership vector for each socket is now dynamically allocated at + * run-time, bounded by USHRT_MAX, and is reallocated when needed, sized + * according to a power-of-two increment. + */ +#define IPV6_MIN_MEMBERSHIPS 31 +#define IPV6_MAX_MEMBERSHIPS 4095 + +/* + * Default resource limits for IPv6 multicast source filtering. + * These may be modified by sysctl. + */ +#define IPV6_MAX_GROUP_SRC_FILTER 512 /* sources per group */ +#define IPV6_MAX_SOCK_SRC_FILTER 128 /* sources per socket/group */ + /* * Argument structure for IPV6_JOIN_GROUP and IPV6_LEAVE_GROUP. */ @@ -524,13 +677,21 @@ struct ipv6_mreq { }; /* - * IPV6_PKTINFO: Packet information(RFC2292 sec 5) + * IPV6_2292PKTINFO: Packet information(RFC2292 sec 5) */ struct in6_pktinfo { struct in6_addr ipi6_addr; /* src/dst IPv6 address */ unsigned int ipi6_ifindex; /* send/recv interface index */ }; +/* + * Control structure for IPV6_RECVPATHMTU socket option. + */ +struct ip6_mtuinfo { + struct sockaddr_in6 ip6m_addr; /* or sockaddr_storage? */ + uint32_t ip6m_mtu; +}; + /* * Argument for IPV6_PORTRANGE: * - which range to search when port is unspecified at bind() or connect() @@ -582,22 +743,27 @@ struct in6_pktinfo { #define IPV6CTL_RTMINEXPIRE 26 /* min value for expiration time */ #define IPV6CTL_RTMAXCACHE 27 /* trigger level for dynamic expire */ -#define IPV6CTL_USETEMPADDR 32 /* use temporary addresses (RFC3041) */ +#define IPV6CTL_USETEMPADDR 32 /* use temporary addresses [RFC 4941] */ #define IPV6CTL_TEMPPLTIME 33 /* preferred lifetime for tmpaddrs */ #define IPV6CTL_TEMPVLTIME 34 /* valid lifetime for tmpaddrs */ #define IPV6CTL_AUTO_LINKLOCAL 35 /* automatic link-local addr assign */ #define IPV6CTL_RIP6STATS 36 /* raw_ip6 stats */ +#define IPV6CTL_PREFER_TEMPADDR 37 /* prefer temporary addr as src */ +#define IPV6CTL_ADDRCTLPOLICY 38 /* get/set address selection policy */ +#define IPV6CTL_USE_DEFAULTZONE 39 /* use default scope zone */ -#define IPV6CTL_MAXFRAGS 41 /* max fragments */ +#define IPV6CTL_MAXFRAGS 41 /* max fragments */ +#define IPV6CTL_MCAST_PMTU 44 /* enable pMTU discovery for multicast? */ #define IPV6CTL_NEIGHBORGCTHRESH 46 #define IPV6CTL_MAXIFPREFIXES 47 #define IPV6CTL_MAXIFDEFROUTERS 48 #define IPV6CTL_MAXDYNROUTES 49 +#define ICMPV6CTL_ND6_ONLINKNSRFC4861 50 /* New entries should be added here from current IPV6CTL_MAXID value. 
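struct in6_pktinfo above is what arrives as IPV6_PKTINFO ancillary data once IPV6_RECVPKTINFO is enabled. Walking the control buffer for it looks like this (assumes a msghdr filled in by recvmsg() with adequate msg_control space):

#define __APPLE_USE_RFC_3542
#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>

/* Return the arrival interface index carried in IPV6_PKTINFO, or 0. */
static unsigned int
recv_ifindex(struct msghdr *msg)
{
    struct cmsghdr *cm;
    struct in6_pktinfo pi;

    for (cm = CMSG_FIRSTHDR(msg); cm != NULL;
        cm = CMSG_NXTHDR(msg, cm)) {
        if (cm->cmsg_level == IPPROTO_IPV6 &&
            cm->cmsg_type == IPV6_PKTINFO &&
            cm->cmsg_len >= CMSG_LEN(sizeof (pi))) {
            memcpy(&pi, CMSG_DATA(cm), sizeof (pi));
            return (pi.ipi6_ifindex);
        }
    }
    return (0);
}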
*/ /* to define items, should talk with KAME guys first, for *BSD compatibility */ -#define IPV6CTL_MAXID 50 +#define IPV6CTL_MAXID 51 #ifdef KERNEL_PRIVATE #define CTL_IPV6PROTO_NAMES { \ @@ -651,7 +817,6 @@ struct in6_pktinfo { */ #define M_AUTHIPHDR M_PROTO2 #define M_DECRYPTED M_PROTO3 -#define M_LOOP M_PROTO4 #define M_AUTHIPDGM M_PROTO5 struct cmsghdr; @@ -676,12 +841,65 @@ extern void in6_sin_2_v4mapsin6(struct sockaddr_in *sin, struct sockaddr_in6 *sin6); extern void in6_sin6_2_sin_in_sock(struct sockaddr *nam); extern int in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam); +extern void in6_delayed_cksum(struct mbuf *, u_int16_t); #define satosin6(sa) ((struct sockaddr_in6 *)(sa)) #define sin6tosa(sin6) ((struct sockaddr *)(sin6)) #define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) extern int in6addr_local(struct in6_addr *); + +#define DEBUG_HWCKSUM 1 /* IPv6 Hardware checksum on/off */ +/* + * in6_cksum_phdr: + * + * Compute significant parts of the IPv6 checksum pseudo-header + * for use in a delayed TCP/UDP checksum calculation. + * + * Args: + * + * src Source IPv6 address + * dst Destination IPv6 address + * len htonl(proto-hdr-len) + * nxt htonl(next-proto-number) + * + * NOTE: We expect the src and dst addresses to be 16-bit + * aligned! + */ +static __inline u_int16_t __unused +in6_cksum_phdr(const struct in6_addr *src, const struct in6_addr *dst, + u_int32_t len, u_int32_t nxt) +{ + u_int32_t sum = 0; + const u_int16_t *w; + + /*LINTED*/ + w = (const u_int16_t *) src; + sum += w[0]; + if (!IN6_IS_SCOPE_LINKLOCAL(src)) + sum += w[1]; + sum += w[2]; sum += w[3]; sum += w[4]; sum += w[5]; + sum += w[6]; sum += w[7]; + + /*LINTED*/ + w = (const u_int16_t *) dst; + sum += w[0]; + if (!IN6_IS_SCOPE_LINKLOCAL(dst)) + sum += w[1]; + sum += w[2]; sum += w[3]; sum += w[4]; sum += w[5]; + sum += w[6]; sum += w[7]; + + sum += (u_int16_t)(len >> 16) + (u_int16_t)(len /*& 0xffff*/); + + sum += (u_int16_t)(nxt >> 16) + (u_int16_t)(nxt /*& 0xffff*/); + + sum = (u_int16_t)(sum >> 16) + (u_int16_t)(sum /*& 0xffff*/); + + if (sum > 0xffff) + sum -= 0xffff; + + return (sum); +} #endif /* KERNEL_PRIVATE */ #ifndef KERNEL @@ -708,23 +926,24 @@ extern int inet6_rthdr_segments(const struct cmsghdr *); extern struct in6_addr *inet6_rthdr_getaddr(struct cmsghdr *, int); extern int inet6_rthdr_getflags(const struct cmsghdr *, int); -extern int inet6_opt_init(void *, size_t); -extern int inet6_opt_append(void *, size_t, int, __uint8_t, - size_t, __uint8_t, void **); -extern int inet6_opt_finish(void *, size_t, int); -extern int inet6_opt_set_val(void *, size_t, void *, int); - -extern int inet6_opt_next(void *, size_t, int, __uint8_t *, - size_t *, void **); -extern int inet6_opt_find(void *, size_t, int, __uint8_t, - size_t *, void **); -extern int inet6_opt_get_val(void *, size_t, void *, int); -extern size_t inet6_rth_space(int, int); -extern void *inet6_rth_init(void *, int, int, int); +extern int inet6_opt_init(void *, socklen_t); +extern int inet6_opt_append(void *, socklen_t, int, __uint8_t, + socklen_t, __uint8_t, void **); +extern int inet6_opt_finish(void *, socklen_t, int); +extern int inet6_opt_set_val(void *, int, void *, socklen_t); + +extern int inet6_opt_next(void *, socklen_t, int, __uint8_t *, + socklen_t *, void **); +extern int inet6_opt_find(void *, socklen_t, int, __uint8_t, + socklen_t *, void **); +extern int inet6_opt_get_val(void *, int, void *, socklen_t); +extern socklen_t inet6_rth_space(int, int); +extern void *inet6_rth_init(void *, socklen_t, int, int); extern int 
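in6_cksum_phdr() above accumulates the pseudo-header halfwords in a 32-bit register, skipping the embedded scope word for link-local addresses, and folds once at the end. The general sum-and-fold that a delayed-checksum consumer would pair with it (standalone sketch; an even-length, 16-bit-aligned buffer is assumed for brevity):

#include <stdint.h>
#include <stddef.h>

/* Add a buffer's 16-bit words into a ones-complement accumulator. */
static uint32_t
sum16(const void *buf, size_t len, uint32_t sum)
{
    const uint16_t *w = buf;

    while (len >= 2) {
        sum += *w++;
        len -= 2;
    }
    return (sum);
}

/* Fold with end-around carry (cf. ADDCARRY/REDUCE below), then complement. */
static uint16_t
cksum_fold(uint32_t sum)
{
    while (sum >> 16)
        sum = (sum >> 16) + (sum & 0xffff);
    return ((uint16_t)~sum);
}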
inet6_rth_add(void *, const struct in6_addr *); extern int inet6_rth_reverse(const void *, void *); extern int inet6_rth_segments(const void *); extern struct in6_addr *inet6_rth_getaddr(const void *, int); +extern void addrsel_policy_init(void); __END_DECLS #endif /* !KERNEL */ #endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ diff --git a/bsd/netinet6/in6_cksum.c b/bsd/netinet6/in6_cksum.c index d964493dc..f0352eb72 100644 --- a/bsd/netinet6/in6_cksum.c +++ b/bsd/netinet6/in6_cksum.c @@ -224,7 +224,7 @@ inet6_cksum(struct mbuf *m, unsigned int nxt, unsigned int off, * code and should be modified for each CPU to be as fast as possible. */ -#define ADDCARRY(x) (x > 65535 ? x -= 65535 : x) +#define ADDCARRY(x) do { if (x > 65535) { x -= 65535; } } while (0) #define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);} /* diff --git a/bsd/netinet6/in6_gif.c b/bsd/netinet6/in6_gif.c index 332271e88..d620db95e 100644 --- a/bsd/netinet6/in6_gif.c +++ b/bsd/netinet6/in6_gif.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2009-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -192,11 +192,9 @@ in6_gif_output( m_freem(m); return ENETUNREACH; } - if (ifp->if_flags & IFF_LINK1) - ip_ecn_ingress(ECN_ALLOWED, &otos, &itos); - else - ip_ecn_ingress(ECN_NOCARE, &otos, &itos); - ip6->ip6_flow &= ~ntohl(0xff00000); + ip_ecn_ingress((ifp->if_flags & IFF_LINK1) ? ECN_ALLOWED : ECN_NOCARE, + &otos, &itos); + ip6->ip6_flow &= ~htonl(0xff << 20); ip6->ip6_flow |= htonl((u_int32_t)otos << 20); if (dst->sin6_family != sin6_dst->sin6_family || @@ -244,22 +242,19 @@ in6_gif_output( * it is too painful to ask for resend of inner packet, to achieve * path MTU discovery for encapsulated packets. */ - return(ip6_output(m, 0, &sc->gif_ro6, IPV6_MINMTU, 0, NULL, 0)); + return(ip6_output(m, 0, &sc->gif_ro6, IPV6_MINMTU, 0, NULL, NULL)); #else - return(ip6_output(m, 0, &sc->gif_ro6, 0, 0, NULL, 0)); + return(ip6_output(m, 0, &sc->gif_ro6, 0, 0, NULL, NULL)); #endif } -int in6_gif_input(mp, offp) - struct mbuf **mp; - int *offp; +int in6_gif_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m = *mp; struct ifnet *gifp = NULL; struct ip6_hdr *ip6; int af = 0; u_int32_t otos; - u_int8_t proto; ip6 = mtod(m, struct ip6_hdr *); @@ -271,7 +266,6 @@ int in6_gif_input(mp, offp) return IPPROTO_DONE; } - proto = ip6->ip6_nxt; otos = ip6->ip6_flow; m_adj(m, *offp); @@ -360,9 +354,6 @@ gif_validate6( sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_addr = ip6->ip6_src; -#ifndef SCOPEDROUTING - sin6.sin6_scope_id = 0; /* XXX */ -#endif rt = rtalloc1((struct sockaddr *)&sin6, 0, 0); if (rt != NULL) diff --git a/bsd/netinet6/in6_gif.h b/bsd/netinet6/in6_gif.h index 8383c6b4e..8baafdd43 100644 --- a/bsd/netinet6/in6_gif.h +++ b/bsd/netinet6/in6_gif.h @@ -37,7 +37,7 @@ #ifdef KERNEL_PRIVATE #define GIF_HLIM 30 -int in6_gif_input(struct mbuf **, int *); +int in6_gif_input(struct mbuf **, int *, int); int in6_gif_output(struct ifnet *, int, struct mbuf *, struct rtentry *); int gif_encapcheck6(const struct mbuf *, int, int, void *); #endif /* KERNEL_PRIVATE */ diff --git a/bsd/netinet6/in6_ifattach.c b/bsd/netinet6/in6_ifattach.c index 5995b212d..10bf295f4 100644 --- a/bsd/netinet6/in6_ifattach.c +++ b/bsd/netinet6/in6_ifattach.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2008 Apple Inc. All rights reserved. + * Copyright (c) 2003-2011 Apple Inc. All rights reserved. 
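The inet6_opt_* and inet6_rth_* prototypes retyped to socklen_t above belong to the RFC 3542 extension-header API. Their intended call sequence, sketched for a type-0 routing header (RH0 was later deprecated by RFC 5095; this only illustrates the space/init/add contract):

#include <stdlib.h>
#include <sys/socket.h>
#include <netinet/in.h>

/* Allocate and populate a routing header with one intermediate hop. */
static void *
make_rthdr(const struct in6_addr *hop)
{
    socklen_t len = inet6_rth_space(IPV6_RTHDR_TYPE_0, 1);
    void *buf;

    if (len == 0 || (buf = malloc(len)) == NULL)
        return (NULL);          /* unsupported type or no memory */
    if (inet6_rth_init(buf, len, IPV6_RTHDR_TYPE_0, 1) == NULL ||
        inet6_rth_add(buf, hop) == -1) {
        free(buf);
        return (NULL);
    }
    return (buf);   /* hand to the stack via IPV6_RTHDR */
}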
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,6 +79,7 @@ #include <netinet/in_var.h> #include <netinet/if_ether.h> #include <netinet/in_pcb.h> +#include <netinet/icmp6.h> #include <netinet/ip6.h> #include <netinet6/ip6_var.h> @@ -97,7 +98,6 @@ size_t in6_ifstatmax = 0; size_t icmp6_ifstatmax = 0; u_int32_t in6_maxmtu = 0; extern lck_mtx_t *nd6_mutex; -extern lck_mtx_t *inet6_domain_mutex; #if IP6_AUTO_LINKLOCAL int ip6_auto_linklocal = IP6_AUTO_LINKLOCAL; @@ -105,13 +105,14 @@ int ip6_auto_linklocal = IP6_AUTO_LINKLOCAL; int ip6_auto_linklocal = 1; /* enable by default */ #endif +int loopattach6_done = 0; + extern struct inpcbinfo udbinfo; extern struct inpcbinfo ripcbinfo; -extern lck_mtx_t *ip6_mutex; static int get_rand_ifid(struct ifnet *, struct in6_addr *); static int generate_tmp_ifid(u_int8_t *, const u_int8_t *, u_int8_t *); -static int get_hw_ifid(struct ifnet *, struct in6_addr *); +int in6_get_hw_ifid(struct ifnet *, struct in6_addr *); static int get_ifid(struct ifnet *, struct ifnet *, struct in6_addr *); static int in6_ifattach_linklocal(struct ifnet *, struct ifnet *, struct in6_aliasreq *); static int in6_ifattach_loopback(struct ifnet *); @@ -133,6 +134,8 @@ static int in6_ifattach_loopback(struct ifnet *); * The goal here is to get an interface identifier that is * (1) random enough and (2) does not change across reboot. * We currently use MD5(hostname) for it. + * + * in6 - upper 64bits are preserved */ static int get_rand_ifid( @@ -141,7 +144,7 @@ get_rand_ifid( { MD5_CTX ctxt; u_int8_t digest[16]; - int len = strlen(hostname); + int hostnlen = strlen(hostname); #if 0 /* we need at least several letters as seed for ifid */ @@ -152,7 +155,7 @@ get_rand_ifid( /* generate 8 bytes of pseudo-random value. */ bzero(&ctxt, sizeof(ctxt)); MD5Init(&ctxt); - MD5Update(&ctxt, hostname, len); + MD5Update(&ctxt, hostname, hostnlen); MD5Final(digest, &ctxt); /* assumes sizeof(digest) > sizeof(ifid) */ @@ -179,7 +182,7 @@ generate_tmp_ifid( u_int32_t val32; struct timeval tv; - /* If there's no hisotry, start with a random seed. */ + /* If there's no history, start with a random seed. */ bzero(nullbuf, sizeof(nullbuf)); if (bcmp(nullbuf, seed0, sizeof(nullbuf)) == 0) { int i; @@ -213,7 +216,7 @@ generate_tmp_ifid( MD5Final(digest, &ctxt); /* - * RFC 3041 3.2.1. (3) + * RFC 4941 3.2.1. (3) * Take the left-most 64-bits of the MD5 digest and set bit 6 (the * left-most bit is numbered 0) to zero. */ @@ -226,8 +229,8 @@ generate_tmp_ifid( * use a random non-zero value as the last resort. */ if (bcmp(nullbuf, ret, sizeof(nullbuf)) == 0) { - log(LOG_INFO, - "generate_tmp_ifid: computed MD5 value is zero.\n"); + nd6log((LOG_INFO, + "generate_tmp_ifid: computed MD5 value is zero.\n")); microtime(&tv); val32 = random() ^ tv.tv_usec; @@ -235,10 +238,10 @@ generate_tmp_ifid( } /* - * RFC 3041 3.2.1. (4) + * RFC 4941 3.2.1. (4) * Take the rightmost 64-bits of the MD5 digest and save them in * stable storage as the history value to be used in the next - * iteration of the algorithm. + * iteration of the algorithm. */ bcopy(&digest[8], seed0, 8); @@ -257,42 +260,35 @@ generate_tmp_ifid( /* * Get interface identifier for the specified interface. 
* XXX assumes single sockaddr_dl (AF_LINK address) per an interface + * + * in6 - upper 64bits are preserved */ -static int -get_hw_ifid( +int +in6_get_hw_ifid( struct ifnet *ifp, struct in6_addr *in6) /* upper 64bits are preserved */ { - struct ifaddr *ifa; + struct ifaddr *ifa = NULL; struct sockaddr_dl *sdl; u_int8_t *addr; size_t addrlen; static u_int8_t allzero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; static u_int8_t allone[8] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + int err = -1; /* Why doesn't this code use ifnet_addrs? */ ifnet_lock_shared(ifp); - for (ifa = ifp->if_addrlist.tqh_first; - ifa; - ifa = ifa->ifa_list.tqe_next) - { - if (ifa->ifa_addr->sa_family != AF_LINK) - continue; - sdl = (struct sockaddr_dl *)ifa->ifa_addr; - if (sdl == NULL) - continue; - if (sdl->sdl_alen == 0) - continue; - - goto found; + ifa = ifp->if_lladdr; + sdl = (struct sockaddr_dl *)ifa->ifa_addr; + if (sdl->sdl_alen == 0) { + ifnet_lock_done(ifp); + return (-1); } + IFA_ADDREF(ifa); /* for this routine */ ifnet_lock_done(ifp); - return -1; - -found: - ifnet_lock_done(ifp); + IFA_LOCK(ifa); addr = (u_int8_t *) LLADDR(sdl); addrlen = sdl->sdl_alen; @@ -300,6 +296,7 @@ found: switch (ifp->if_type) { case IFT_ETHER: case IFT_FDDI: + case IFT_ISO88025: case IFT_ATM: case IFT_IEEE1394: case IFT_L2VLAN: @@ -315,7 +312,7 @@ found: /* look at IEEE802/EUI64 only */ if (addrlen != 8 && addrlen != 6) - return -1; + goto done; /* * check for invalid MAC address - on bsdi, we see it a lot @@ -323,9 +320,9 @@ found: * card insertion. */ if (bcmp(addr, allzero, addrlen) == 0) - return -1; + goto done; if (bcmp(addr, allone, addrlen) == 0) - return -1; + goto done; /* make EUI64 address */ if (addrlen == 8) @@ -344,9 +341,9 @@ found: case IFT_ARCNET: if (addrlen != 1) - return -1; + goto done; if (!addr[0]) - return -1; + goto done; bzero(&in6->s6_addr[8], 8); in6->s6_addr[15] = addr[0]; @@ -368,15 +365,18 @@ found: * identifier source (can be renumbered). * we don't do this. */ - return -1; + goto done; + + case IFT_CELLULAR: + goto done; default: - return -1; + goto done; } /* sanity check: g bit must not indicate "group" */ if (EUI64_GROUP(in6)) - return -1; + goto done; /* convert EUI64 into IPv6 interface identifier */ EUI64_TO_IFID(in6); @@ -387,16 +387,27 @@ found: */ if ((in6->s6_addr[8] & ~(EUI64_GBIT | EUI64_UBIT)) == 0x00 && bcmp(&in6->s6_addr[9], allzero, 7) == 0) { - return -1; + goto done; } - return 0; + err = 0; /* found */ + +done: + /* This must not be the last reference to the lladdr */ + if (IFA_REMREF_LOCKED(ifa) == NULL) { + panic("%s: unexpected (missing) refcnt ifa=%p", __func__, ifa); + /* NOTREACHED */ + } + IFA_UNLOCK(ifa); + return (err); } /* * Get interface identifier for the specified interface. If it is not * available on ifp0, borrow interface identifier from other information * sources. + * + * altifp - secondary EUI64 source */ static int get_ifid( @@ -407,14 +418,14 @@ get_ifid( struct ifnet *ifp; /* first, try to get it from the interface itself */ - if (get_hw_ifid(ifp0, in6) == 0) { + if (in6_get_hw_ifid(ifp0, in6) == 0) { nd6log((LOG_DEBUG, "%s: got interface identifier from itself\n", if_name(ifp0))); goto success; } /* try secondary EUI64 source. 
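in6_get_hw_ifid() above derives the interface identifier from the link-layer address in the standard modified-EUI-64 way after rejecting all-zero, all-one, and group addresses: a 6-byte MAC is split around the 0xff 0xfe filler and the universal/local bit is inverted (the EUI64_TO_IFID step). The core transform in isolation (RFC 4291, Appendix A):

#include <stdint.h>

/* MAC-48 to modified EUI-64 interface identifier. */
static void
mac_to_ifid(const uint8_t mac[6], uint8_t ifid[8])
{
    ifid[0] = mac[0] ^ 0x02;    /* flip the universal/local bit */
    ifid[1] = mac[1];
    ifid[2] = mac[2];
    ifid[3] = 0xff;             /* EUI-64 filler bytes */
    ifid[4] = 0xfe;
    ifid[5] = mac[3];
    ifid[6] = mac[4];
    ifid[7] = mac[5];
}

For example, 00:11:22:33:44:55 becomes the identifier 0211:22ff:fe33:4455.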
this basically is for ATM PVC */ - if (altifp && get_hw_ifid(altifp, in6) == 0) { + if (altifp && in6_get_hw_ifid(altifp, in6) == 0) { nd6log((LOG_DEBUG, "%s: got interface identifier from %s\n", if_name(ifp0), if_name(altifp))); goto success; @@ -425,7 +436,7 @@ get_ifid( TAILQ_FOREACH(ifp, &ifnet_head, if_list) { if (ifp == ifp0) continue; - if (get_hw_ifid(ifp, in6) != 0) + if (in6_get_hw_ifid(ifp, in6) != 0) continue; /* @@ -488,18 +499,14 @@ in6_ifattach_linklocal( */ strncpy(ifra.ifra_name, if_name(ifp), sizeof(ifra.ifra_name)); - if (((ifp->if_type == IFT_PPP) || ((ifp->if_eflags & IFEF_NOAUTOIPV6LL) != 0)) && - ifra_passed != NULL) /* PPP provided both addresses for us */ + if ((ifp->if_eflags & IFEF_NOAUTOIPV6LL) != 0 && + ifra_passed != NULL) /* interface provided both addresses for us */ bcopy(&ifra_passed->ifra_addr, &(ifra.ifra_addr), sizeof(struct sockaddr_in6)); else { ifra.ifra_addr.sin6_family = AF_INET6; ifra.ifra_addr.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_addr.sin6_addr.s6_addr16[0] = htons(0xfe80); -#if SCOPEDROUTING - ifra.ifra_addr.sin6_addr.s6_addr16[1] = 0 -#else - ifra.ifra_addr.sin6_addr.s6_addr16[1] = htons(ifp->if_index); /* XXX */ -#endif + ifra.ifra_addr.sin6_addr.s6_addr16[1] = htons(ifp->if_index); ifra.ifra_addr.sin6_addr.s6_addr32[1] = 0; if ((ifp->if_flags & IFF_LOOPBACK) != 0) { ifra.ifra_addr.sin6_addr.s6_addr32[2] = 0; @@ -508,58 +515,42 @@ in6_ifattach_linklocal( if (get_ifid(ifp, altifp, &ifra.ifra_addr.sin6_addr) != 0) { nd6log((LOG_ERR, " %s: no ifid available\n", if_name(ifp))); - return -1; + return EADDRNOTAVAIL; } } -#if SCOPEDROUTING - ifra.ifra_addr.sin6_scope_id = - in6_addr2scopeid(ifp, &ifra.ifra_addr.sin6_addr); -#endif } + if (in6_setscope(&ifra.ifra_addr.sin6_addr, ifp, NULL)) + return (EADDRNOTAVAIL); + ifra.ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_prefixmask.sin6_family = AF_INET6; ifra.ifra_prefixmask.sin6_addr = in6mask64; -#if SCOPEDROUTING - /* take into accound the sin6_scope_id field for routing */ - ifra.ifra_prefixmask.sin6_scope_id = 0xffffffff; -#endif /* link-local addresses should NEVER expire. */ ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME; ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME; - /* - * Do not let in6_update_ifa() do DAD, since we need a random delay - * before sending an NS at the first time the interface becomes up. - * Instead, in6_if_up() will start DAD with a proper random delay. - */ - ifra.ifra_flags |= IN6_IFF_NODAD; - /* * Now call in6_update_ifa() to do a bunch of procedures to configure - * a link-local address. We can set NULL to the 3rd argument, because + * a link-local address. We can set the 3rd argument to NULL, because * we know there's no other link-local address on the interface * and therefore we are adding one (instead of updating one). */ - if ((error = in6_update_ifa(ifp, &ifra, NULL, M_WAITOK)) != 0) { + if ((error = in6_update_ifa(ifp, &ifra, NULL, + IN6_IFAUPDATE_DADDELAY, M_WAITOK)) != 0) { /* * XXX: When the interface does not support IPv6, this call * would fail in the SIOCSIFADDR ioctl. I believe the * notification is rather confusing in this case, so just - * supress it. (jinmei@kame.net 20010130) + * suppress it. 
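The template the hunk above assembles is fe80::/64 with the interface index parked in the second 16-bit word, the KAME convention that in6_setscope() now owns. Schematically (s6_addr16 is the KAME accessor used throughout this file and may be kernel-only in shipped headers; a real kernel address also carries flags and lifetimes):

#include <string.h>
#include <stdint.h>
#include <arpa/inet.h>
#include <netinet/in.h>

/* Compose a KAME-style link-local address from an interface id. */
static void
make_linklocal(struct in6_addr *a, const uint8_t ifid[8], uint16_t ifindex)
{
    memset(a, 0, sizeof (*a));
    a->s6_addr[0] = 0xfe;
    a->s6_addr[1] = 0x80;
    a->s6_addr16[1] = htons(ifindex);   /* embedded scope zone */
    memcpy(&a->s6_addr[8], ifid, 8);    /* lower 64 bits */
}

The embedded word is cleared again before the address is ever shown to userland, as the SIOCGLIFADDR hunks earlier in this patch do.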
(jinmei@kame.net 20010130) */ if (error != EAFNOSUPPORT) - log(LOG_NOTICE, "in6_ifattach_linklocal: failed to " + nd6log((LOG_NOTICE, "in6_ifattach_linklocal: failed to " "configure a link-local address on %s " "(errno=%d)\n", - if_name(ifp), error); - return(-1); + if_name(ifp), error)); + return (EADDRNOTAVAIL); } - /* - * Adjust ia6_flags so that in6_if_up will perform DAD. - * XXX: Some P2P interfaces seem not to send packets just after - * becoming up, so we skip p2p interfaces for safety. - */ ia = in6ifa_ifpforlinklocal(ifp, 0); /* ia must not be NULL */ #if DIAGNOSTIC if (!ia) { @@ -567,19 +558,15 @@ in6_ifattach_linklocal( /*NOTREACHED*/ } #endif - if (in6if_do_dad(ifp) && (ifp->if_flags & IFF_POINTOPOINT) == 0) { - ia->ia6_flags &= ~IN6_IFF_NODAD; - ia->ia6_flags |= IN6_IFF_TENTATIVE; - } - /* - * Make the link-local prefix (fe80::/64%link) as on-link. + * Make the link-local prefix (fe80::%link/64) as on-link. * Since we'd like to manage prefixes separately from addresses, * we make an ND6 prefix structure for the link-local prefix, * and add it to the prefix list as a never-expire prefix. * XXX: this change might affect some existing code base... */ bzero(&pr0, sizeof(pr0)); + lck_mtx_init(&pr0.ndpr_lock, ifa_mtx_grp, ifa_mtx_attr); pr0.ndpr_ifp = ifp; /* this should be 64 at this moment. */ pr0.ndpr_plen = in6_mask2len(&ifra.ifra_prefixmask.sin6_addr, NULL); @@ -598,6 +585,7 @@ in6_ifattach_linklocal( pr0.ndpr_raf_auto = 1; /* probably meaningless */ pr0.ndpr_vltime = ND6_INFINITE_LIFETIME; pr0.ndpr_pltime = ND6_INFINITE_LIFETIME; + pr0.ndpr_stateflags |= NDPRF_STATIC; /* * Since there is no other link-local addresses, nd6_prefix_lookup() * probably returns NULL. However, we cannot always expect the result. @@ -606,21 +594,23 @@ in6_ifattach_linklocal( * valid with referring to the old link-local address. */ if ((pr = nd6_prefix_lookup(&pr0)) == NULL) { - if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0) { - printf("in6_ifattach_linklocal: nd6_prelist_add failed %d\n", error); - ifafree(&ia->ia_ifa); + if ((error = nd6_prelist_add(&pr0, NULL, &pr, TRUE)) != 0) { + IFA_REMREF(&ia->ia_ifa); + lck_mtx_destroy(&pr0.ndpr_lock, ifa_mtx_grp); return(error); } } if (ia != NULL) { in6_post_msg(ifp, KEV_INET6_NEW_LL_ADDR, ia); - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); } /* Drop use count held above during lookup/add */ if (pr != NULL) - ndpr_rele(pr, FALSE); + NDPR_REMREF(pr); + + lck_mtx_destroy(&pr0.ndpr_lock, ifa_mtx_grp); return 0; } @@ -670,11 +660,11 @@ in6_ifattach_loopback( * We are sure that this is a newly assigned address, so we can set * NULL to the 3rd arg. 
*/ - if ((error = in6_update_ifa(ifp, &ifra, NULL, M_WAITOK)) != 0) { - log(LOG_ERR, "in6_ifattach_loopback: failed to configure " + if ((error = in6_update_ifa(ifp, &ifra, NULL, 0, M_WAITOK)) != 0) { + nd6log((LOG_ERR, "in6_ifattach_loopback: failed to configure " "the loopback address on %s (errno=%d)\n", - if_name(ifp), error); - return(-1); + if_name(ifp), error)); + return (EADDRNOTAVAIL); } return 0; @@ -724,76 +714,33 @@ in6_nigroup( MD5Final(digest, &ctxt); bzero(in6, sizeof(*in6)); - in6->s6_addr16[0] = htons(0xff02); - if (ifp) - in6->s6_addr16[1] = htons(ifp->if_index); + in6->s6_addr16[0] = IPV6_ADDR_INT16_MLL; in6->s6_addr8[11] = 2; bcopy(digest, &in6->s6_addr32[3], sizeof(in6->s6_addr32[3])); + if (in6_setscope(in6, ifp, NULL)) + return (-1); /* XXX: should not fail */ return 0; } -void -in6_nigroup_attach( - const char *name, - int namelen) +int +in6_domifattach(struct ifnet *ifp) { - struct ifnet *ifp; - struct sockaddr_in6 mltaddr; - struct in6_multi *in6m; - int error; - - bzero(&mltaddr, sizeof(mltaddr)); - mltaddr.sin6_family = AF_INET6; - mltaddr.sin6_len = sizeof(struct sockaddr_in6); - if (in6_nigroup(NULL, name, namelen, &mltaddr.sin6_addr) != 0) - return; + int error = 0; - ifnet_head_lock_shared(); - TAILQ_FOREACH(ifp, &ifnet_head, if_list) { - mltaddr.sin6_addr.s6_addr16[1] = htons(ifp->if_index); - ifnet_lock_shared(ifp); - IN6_LOOKUP_MULTI(mltaddr.sin6_addr, ifp, in6m); - ifnet_lock_done(ifp); - if (!in6m) { - if (!in6_addmulti(&mltaddr.sin6_addr, ifp, &error, 0)) { - nd6log((LOG_ERR, "%s: failed to join %s " - "(errno=%d)\n", if_name(ifp), - ip6_sprintf(&mltaddr.sin6_addr), - error)); - } - } + if ((error = proto_plumb(PF_INET6, ifp))) { + if (error != EEXIST) + log(LOG_ERR, "%s: proto_plumb returned %d if=%s%d\n", + __func__, error, ifp->if_name, ifp->if_unit); + } else { + nd6_ifattach(ifp); + scope6_ifattach(ifp); } - ifnet_head_done(); -} - -void -in6_nigroup_detach( - const char *name, - int namelen) -{ - struct ifnet *ifp; - struct sockaddr_in6 mltaddr; - struct in6_multi *in6m; - - bzero(&mltaddr, sizeof(mltaddr)); - mltaddr.sin6_family = AF_INET6; - mltaddr.sin6_len = sizeof(struct sockaddr_in6); - if (in6_nigroup(NULL, name, namelen, &mltaddr.sin6_addr) != 0) - return; - ifnet_head_lock_shared(); - TAILQ_FOREACH(ifp, &ifnet_head, if_list) { - mltaddr.sin6_addr.s6_addr16[1] = htons(ifp->if_index); - ifnet_lock_shared(ifp); - IN6_LOOKUP_MULTI(mltaddr.sin6_addr, ifp, in6m); - ifnet_lock_done(ifp); - if (in6m) - in6_delmulti(in6m, 0); - } - ifnet_head_done(); + return (error); } + /* * XXX multiple loopback interface needs more care. for instance, * nodelocal address needs to be configured onto only one of them. @@ -807,8 +754,10 @@ in6_ifattach( { static size_t if_indexlim = 8; struct in6_ifaddr *ia; + struct in6_addr in6; int error; + lck_rw_lock_exclusive(&in6_ifs_rwlock); /* * We have some arrays that should be indexed by if_index. * since if_index will grow dynamically, they should grow too. 
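in6_ifattach() sizes several per-interface arrays by if_indexlim, doubling it until it exceeds the current if_index and then reallocating the tables under the new rwlocks. The growth idiom, reduced to its essentials (a userland stand-in; the kernel uses _MALLOC/bzero/bcopy and returns ENOBUFS on failure):

#include <stdlib.h>
#include <string.h>

/* Ensure a pointer table has strictly more than `need` slots. */
static int
grow_table(void ***tbl, size_t *lim, size_t need)
{
    size_t newlim = (*lim != 0) ? *lim : 8;
    void **q;

    while (newlim <= need)
        newlim <<= 1;
    if (newlim == *lim)
        return (0);             /* already big enough */
    if ((q = calloc(newlim, sizeof (*q))) == NULL)
        return (-1);
    if (*tbl != NULL) {
        memcpy(q, *tbl, *lim * sizeof (*q));
        free(*tbl);
    }
    *tbl = q;
    *lim = newlim;
    return (0);
}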
@@ -821,7 +770,6 @@ in6_ifattach( if_indexlim <<= 1; } - lck_mtx_lock(ip6_mutex); /* grow in6_ifstat */ if (in6_ifstatmax < if_indexlim) { size_t n; @@ -830,7 +778,7 @@ in6_ifattach( n = if_indexlim * sizeof(struct in6_ifstat *); q = (caddr_t)_MALLOC(n, M_IFADDR, M_WAITOK); if (q == NULL) { - lck_mtx_unlock(ip6_mutex); + lck_rw_done(&in6_ifs_rwlock); return ENOBUFS; } bzero(q, n); @@ -847,17 +795,14 @@ in6_ifattach( in6_ifstat[ifp->if_index] = (struct in6_ifstat *) _MALLOC(sizeof(struct in6_ifstat), M_IFADDR, M_WAITOK); if (in6_ifstat[ifp->if_index] == NULL) { - lck_mtx_unlock(ip6_mutex); + lck_rw_done(&in6_ifs_rwlock); return ENOBUFS; } bzero(in6_ifstat[ifp->if_index], sizeof(struct in6_ifstat)); } - lck_mtx_unlock(ip6_mutex); + lck_rw_done(&in6_ifs_rwlock); - /* grow icmp6_ifstat, use inet6_domain_mutex as that is used in - * icmp6 routines - */ - lck_mtx_lock(inet6_domain_mutex); + lck_rw_lock_exclusive(&icmp6_ifs_rwlock); if (icmp6_ifstatmax < if_indexlim) { size_t n; caddr_t q; @@ -865,7 +810,7 @@ in6_ifattach( n = if_indexlim * sizeof(struct icmp6_ifstat *); q = (caddr_t)_MALLOC(n, M_IFADDR, M_WAITOK); if (q == NULL) { - lck_mtx_unlock(inet6_domain_mutex); + lck_rw_done(&icmp6_ifs_rwlock); return ENOBUFS; } bzero(q, n); @@ -882,12 +827,12 @@ in6_ifattach( icmp6_ifstat[ifp->if_index] = (struct icmp6_ifstat *) _MALLOC(sizeof(struct icmp6_ifstat), M_IFADDR, M_WAITOK); if (icmp6_ifstat[ifp->if_index] == NULL) { - lck_mtx_unlock(inet6_domain_mutex); + lck_rw_done(&icmp6_ifs_rwlock); return ENOBUFS; } bzero(icmp6_ifstat[ifp->if_index], sizeof(struct icmp6_ifstat)); } - lck_mtx_unlock(inet6_domain_mutex); + lck_rw_done(&icmp6_ifs_rwlock); /* initialize NDP variables */ if ((error = nd6_ifattach(ifp)) != 0) @@ -919,9 +864,9 @@ in6_ifattach( * usually, we require multicast capability to the interface */ if ((ifp->if_flags & IFF_MULTICAST) == 0) { - log(LOG_INFO, "in6_ifattach: " - "%s is not multicast capable, IPv6 not enabled\n", - if_name(ifp)); + nd6log((LOG_INFO, "in6_ifattach: ", + "%s is not multicast capable, IPv6 not enabled\n", + if_name(ifp))); return EINVAL; } @@ -930,12 +875,23 @@ in6_ifattach( * XXX multiple loopback interface case. */ if ((ifp->if_flags & IFF_LOOPBACK) != 0) { - if (in6_ifattach_loopback(ifp) != 0) - printf("in6_ifattach: in6_ifattach_loopback failed\n"); + struct in6_ifaddr *ia6 = NULL; + if (!OSCompareAndSwap(0, 1, (UInt32 *)&loopattach6_done)) { + in6 = in6addr_loopback; + if ((ia6 = in6ifa_ifpwithaddr(ifp, &in6)) == NULL) { + if (in6_ifattach_loopback(ifp) != 0) { + OSCompareAndSwap(1, 0, (UInt32 *)&loopattach6_done); + return EINVAL; + } + } + else { + IFA_REMREF(&ia6->ia_ifa); + } + } } /* - * assign a link-local address, if there's none. + * assign a link-local address, if there's none. */ if (ip6_auto_linklocal) { ia = in6ifa_ifpforlinklocal(ifp, 0); @@ -943,13 +899,13 @@ in6_ifattach( if (in6_ifattach_linklocal(ifp, altifp, ifra) == 0) { /* linklocal address assigned */ } else { - log(LOG_INFO, "in6_ifattach: %s failed to " + nd6log((LOG_INFO, "in6_ifattach: %s failed to " "attach a linklocal address.\n", - if_name(ifp)); + if_name(ifp))); /* failed to assign linklocal address. bark? */ } } else { - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); } } @@ -966,83 +922,147 @@ statinit: /* * NOTE: in6_ifdetach() does not support loopback if at this moment. - * We don't need this function in bsdi, because interfaces are never removed - * from the ifnet list in bsdi. 
*/ void -in6_ifdetach( - struct ifnet *ifp) +in6_ifdetach(struct ifnet *ifp) { - struct in6_ifaddr *ia, *oia, *nia; - struct ifaddr *ifa, *next; + struct in6_ifaddr *ia, *oia; + struct ifaddr *ifa; struct rtentry *rt; struct sockaddr_in6 sin6; + struct in6_multi_mship *imm; + int unlinked; - /* nuke prefix list. this may try to remove some of ifaddrs as well */ - in6_purgeprefix(ifp); + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); /* remove neighbor management table */ nd6_purge(ifp); /* nuke any of IPv6 addresses we have */ - - lck_mtx_lock(nd6_mutex); - for (ia = in6_ifaddrs; ia != NULL; ia = nia) { - nia = ia->ia_next; - if (ia->ia_ifa.ifa_ifp != ifp) + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + ia = in6_ifaddrs; + while (ia != NULL) { + if (ia->ia_ifa.ifa_ifp != ifp) { + ia = ia->ia_next; continue; - in6_purgeaddr(&ia->ia_ifa, 1); + } + IFA_ADDREF(&ia->ia_ifa); /* for us */ + lck_rw_done(&in6_ifaddr_rwlock); + in6_purgeaddr(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); /* for us */ + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + /* + * Purging the address caused in6_ifaddr_rwlock + * to be dropped and reacquired; + * therefore search again from the beginning + * of in6_ifaddrs list. + */ + ia = in6_ifaddrs; } - lck_mtx_unlock(nd6_mutex); + lck_rw_done(&in6_ifaddr_rwlock); ifnet_lock_exclusive(ifp); /* undo everything done by in6_ifattach(), just in case */ - for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = next) - { - next = ifa->ifa_list.tqe_next; - - - if (ifa->ifa_addr->sa_family != AF_INET6 - || !IN6_IS_ADDR_LINKLOCAL(&satosin6(&ifa->ifa_addr)->sin6_addr)) { + ifa = TAILQ_FIRST(&ifp->if_addrlist); + while (ifa != NULL) { + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6 || + !IN6_IS_ADDR_LINKLOCAL(&satosin6(&ifa->ifa_addr)-> + sin6_addr)) { + IFA_UNLOCK(ifa); + ifa = TAILQ_NEXT(ifa, ifa_list); continue; } ia = (struct in6_ifaddr *)ifa; - /* remove from the routing table */ - if ((ia->ia_flags & IFA_ROUTE) && - (rt = rtalloc1((struct sockaddr *)&ia->ia_addr, 0, 0))) { - (void) rtrequest(RTM_DELETE, - (struct sockaddr *)&ia->ia_addr, - (struct sockaddr *)&ia->ia_addr, - (struct sockaddr *)&ia->ia_prefixmask, - rt->rt_flags, (struct rtentry **)0); - rtfree(rt); + /* hold a reference for this routine */ + IFA_ADDREF_LOCKED(ifa); + /* remove from the linked list */ + if_detach_ifa(ifp, ifa); + IFA_UNLOCK(ifa); + + /* + * Leaving the multicast group(s) may involve freeing the + * link address multicast structure(s) for the interface, + * which is protected by ifnet lock. To avoid violating + * lock ordering, we must drop ifnet lock before doing so. + * The ifa won't go away since we held a refcnt above. + */ + ifnet_lock_done(ifp); + + /* + * We have to do this work manually here instead of calling + * in6_purgeaddr() since in6_purgeaddr() uses the RTM_HOST flag. 
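The detach loop above keeps restarting from the head of in6_ifaddrs because in6_ifaddr_rwlock has to be dropped around in6_purgeaddr(), so the list may mutate in the window. A compressed sketch of that scan-restart idiom, with hypothetical types and a pthreads rwlock standing in for lck_rw (the per-entry refcount that actually pins the node is only hinted at in a comment):

#include <pthread.h>

struct node { struct node *next; int dead; };
extern struct node *head;
extern pthread_rwlock_t list_lock;
extern void purge(struct node *);	/* may sleep; needs the lock dropped */

static void
purge_all_dead(void)
{
	struct node *n;

	pthread_rwlock_wrlock(&list_lock);
	n = head;
	while (n != NULL) {
		if (!n->dead) {
			n = n->next;
			continue;
		}
		/* In the kernel version, a refcount pins *n past this point. */
		pthread_rwlock_unlock(&list_lock);
		purge(n);
		pthread_rwlock_wrlock(&list_lock);
		n = head;	/* list may have changed; restart from the top */
	}
	pthread_rwlock_unlock(&list_lock);
}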
+ */ + + /* + * leave from multicast groups we have joined for the interface + */ + IFA_LOCK(ifa); + while ((imm = ia->ia6_memberships.lh_first) != NULL) { + LIST_REMOVE(imm, i6mm_chain); + IFA_UNLOCK(ifa); + in6_leavegroup(imm); + IFA_LOCK(ifa); } - /* remove from the linked list */ - if_detach_ifa(ifp, &ia->ia_ifa); + /* remove from the routing table */ + if (ia->ia_flags & IFA_ROUTE) { + IFA_UNLOCK(ifa); + rt = rtalloc1((struct sockaddr *)&ia->ia_addr, 0, 0); + if (rt != NULL) { + (void) rtrequest(RTM_DELETE, + (struct sockaddr *)&ia->ia_addr, + (struct sockaddr *)&ia->ia_addr, + (struct sockaddr *)&ia->ia_prefixmask, + rt->rt_flags, (struct rtentry **)0); + rtfree(rt); + } + } else { + IFA_UNLOCK(ifa); + } /* also remove from the IPv6 address chain(itojun&jinmei) */ + unlinked = 1; oia = ia; - lck_mtx_lock(nd6_mutex); - if (oia == (ia = in6_ifaddrs)) + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + if (oia == (ia = in6_ifaddrs)) { in6_ifaddrs = ia->ia_next; - else { + } else { while (ia->ia_next && (ia->ia_next != oia)) ia = ia->ia_next; - if (ia->ia_next) + if (ia->ia_next) { ia->ia_next = oia->ia_next; - else { - nd6log((LOG_ERR, + } else { + nd6log((LOG_ERR, "%s: didn't unlink in6ifaddr from " "list\n", if_name(ifp))); + unlinked = 0; } } - lck_mtx_unlock(nd6_mutex); + lck_rw_done(&in6_ifaddr_rwlock); + + ifa = &oia->ia_ifa; + /* + * release another refcnt for the link from in6_ifaddrs. + * Do this only if it's not already unlinked in the event + * that we lost the race, since in6_ifaddr_rwlock was momentarily + * dropped above. + */ + if (unlinked) + IFA_REMREF(ifa); + /* release reference held for this routine */ + IFA_REMREF(ifa); - ifafree(&oia->ia_ifa); + /* + * This is suboptimal, but since we dropped ifnet lock above + * the list might have changed. Repeat the search from the + * beginning until we find the first eligible IPv6 address. + */ + ifnet_lock_exclusive(ifp); + ifa = TAILQ_FIRST(&ifp->if_addrlist); } ifnet_lock_done(ifp); @@ -1128,7 +1148,7 @@ in6_tmpaddrtimer( bzero(nullbuf, sizeof(nullbuf)); for (i = 1; i < nd_ifinfo_indexlim + 1; i++) { ndi = &nd_ifinfo[i]; - if (ndi->flags != ND6_IFF_PERFORMNUD) + if ((ndi->flags | ND6_IFF_PERFORMNUD) != ND6_IFF_PERFORMNUD) continue; if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) != 0) { /* diff --git a/bsd/netinet6/in6_ifattach.h b/bsd/netinet6/in6_ifattach.h index 7fa627f2d..40ffa0379 100644 --- a/bsd/netinet6/in6_ifattach.h +++ b/bsd/netinet6/in6_ifattach.h @@ -1,3 +1,30 @@ +/* + * Copyright (c) 2003-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ /* $KAME: in6_ifattach.h,v 1.4 2000/02/22 14:04:18 itojun Exp $ */ /* @@ -34,13 +61,12 @@ #include <sys/appleapiopts.h> #ifdef KERNEL_PRIVATE -void in6_nigroup_attach(const char *, int); -void in6_nigroup_detach(const char *, int); -int in6_ifattach(struct ifnet *, struct ifnet *, struct in6_aliasreq *); -void in6_ifdetach(struct ifnet *); -void in6_get_tmpifid(struct ifnet *, u_int8_t *, const u_int8_t *, int); -void in6_tmpaddrtimer(void *); -int in6_nigroup(struct ifnet *, const char *, int, struct in6_addr *); +extern int in6_domifattach(struct ifnet *); +extern int in6_ifattach(struct ifnet *, struct ifnet *, struct in6_aliasreq *); +extern void in6_ifdetach(struct ifnet *); +extern void in6_get_tmpifid(struct ifnet *, u_int8_t *, const u_int8_t *, int); +extern void in6_tmpaddrtimer(void *); +extern int in6_nigroup(struct ifnet *, const char *, int, struct in6_addr *); #endif /* KERNEL_PRIVATE */ #endif /* _NETINET6_IN6_IFATTACH_H_ */ diff --git a/bsd/netinet6/in6_mcast.c b/bsd/netinet6/in6_mcast.c new file mode 100644 index 000000000..05670d211 --- /dev/null +++ b/bsd/netinet6/in6_mcast.c @@ -0,0 +1,3490 @@ +/* + * Copyright (c) 2010-2011 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Copyright (c) 2009 Bruce Simpson. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * IPv6 multicast socket, group, and socket option processing module. + * Normative references: RFC 2292, RFC 3492, RFC 3542, RFC 3678, RFC 3810. + */ + +#include <sys/cdefs.h> + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/protosw.h> +#include <sys/sysctl.h> +#include <sys/tree.h> +#include <sys/mcache.h> + +#include <kern/zalloc.h> + +#include <pexpert/pexpert.h> + +#include <net/if.h> +#include <net/if_dl.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_var.h> +#include <netinet6/in6_var.h> +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet6/ip6_var.h> +#include <netinet/in_pcb.h> +#include <netinet/tcp.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_var.h> +#include <netinet6/nd6.h> +#include <netinet6/mld6_var.h> +#include <netinet6/scope6_var.h> + +#ifndef __SOCKUNION_DECLARED +union sockunion { + struct sockaddr_storage ss; + struct sockaddr sa; + struct sockaddr_dl sdl; + struct sockaddr_in6 sin6; +}; +typedef union sockunion sockunion_t; +#define __SOCKUNION_DECLARED +#endif /* __SOCKUNION_DECLARED */ + +static void im6f_commit(struct in6_mfilter *); +static int im6f_get_source(struct in6_mfilter *imf, + const struct sockaddr_in6 *psin, + struct in6_msource **); +static struct in6_msource * + im6f_graft(struct in6_mfilter *, const uint8_t, + const struct sockaddr_in6 *); +static int im6f_prune(struct in6_mfilter *, const struct sockaddr_in6 *); +static void im6f_rollback(struct in6_mfilter *); +static void im6f_reap(struct in6_mfilter *); +static int im6o_grow(struct ip6_moptions *, size_t); +static size_t im6o_match_group(const struct ip6_moptions *, + const struct ifnet *, const struct sockaddr *); +static struct in6_msource * + im6o_match_source(const struct ip6_moptions *, const size_t, + const struct sockaddr *); +static void im6s_merge(struct ip6_msource *ims, + const struct in6_msource *lims, const int rollback); +static int in6_mc_get(struct ifnet *, const struct in6_addr *, + struct in6_multi **); +static int in6m_get_source(struct in6_multi *inm, + const struct in6_addr *addr, const int noalloc, + struct ip6_msource **pims); +static int in6m_is_ifp_detached(const struct in6_multi *); +static int 
in6m_merge(struct in6_multi *, /*const*/ struct in6_mfilter *); +static void in6m_reap(struct in6_multi *); +static struct ip6_moptions * + in6p_findmoptions(struct inpcb *); +static int in6p_get_source_filters(struct inpcb *, struct sockopt *); +static int in6p_lookup_v4addr(struct ipv6_mreq *, struct ip_mreq *); +static int in6p_join_group(struct inpcb *, struct sockopt *); +static int in6p_leave_group(struct inpcb *, struct sockopt *); +static struct ifnet * + in6p_lookup_mcast_ifp(const struct inpcb *, + const struct sockaddr_in6 *); +static int in6p_block_unblock_source(struct inpcb *, struct sockopt *); +static int in6p_set_multicast_if(struct inpcb *, struct sockopt *); +static int in6p_set_source_filters(struct inpcb *, struct sockopt *); +static int sysctl_ip6_mcast_filters SYSCTL_HANDLER_ARGS; +static __inline__ int ip6_msource_cmp(const struct ip6_msource *, + const struct ip6_msource *); + +SYSCTL_DECL(_net_inet6_ip6); /* XXX Not in any common header. */ + +SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, mcast, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "IPv6 multicast"); + +static unsigned long in6_mcast_maxgrpsrc = IPV6_MAX_GROUP_SRC_FILTER; +SYSCTL_LONG(_net_inet6_ip6_mcast, OID_AUTO, maxgrpsrc, + CTLFLAG_RW | CTLFLAG_LOCKED, &in6_mcast_maxgrpsrc, + "Max source filters per group"); + +static unsigned long in6_mcast_maxsocksrc = IPV6_MAX_SOCK_SRC_FILTER; +SYSCTL_LONG(_net_inet6_ip6_mcast, OID_AUTO, maxsocksrc, + CTLFLAG_RW | CTLFLAG_LOCKED, &in6_mcast_maxsocksrc, + "Max source filters per socket"); + +int in6_mcast_loop = IPV6_DEFAULT_MULTICAST_LOOP; +SYSCTL_INT(_net_inet6_ip6_mcast, OID_AUTO, loop, CTLFLAG_RW | CTLFLAG_LOCKED, + &in6_mcast_loop, 0, "Loopback multicast datagrams by default"); + +SYSCTL_NODE(_net_inet6_ip6_mcast, OID_AUTO, filters, + CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_ip6_mcast_filters, + "Per-interface stack-wide source filters"); + +RB_GENERATE_PREV(ip6_msource_tree, ip6_msource, im6s_link, ip6_msource_cmp); + +#define IN6M_TRACE_HIST_SIZE 32 /* size of trace history */ + +/* For gdb */ +__private_extern__ unsigned int in6m_trace_hist_size = IN6M_TRACE_HIST_SIZE; + +struct in6_multi_dbg { + struct in6_multi in6m; /* in6_multi */ + u_int16_t in6m_refhold_cnt; /* # of ref */ + u_int16_t in6m_refrele_cnt; /* # of rele */ + /* + * Circular lists of in6m_addref and in6m_remref callers. 
+ */ + ctrace_t in6m_refhold[IN6M_TRACE_HIST_SIZE]; + ctrace_t in6m_refrele[IN6M_TRACE_HIST_SIZE]; + /* + * Trash list linkage + */ + TAILQ_ENTRY(in6_multi_dbg) in6m_trash_link; +}; + +/* List of trash in6_multi entries protected by in6m_trash_lock */ +static TAILQ_HEAD(, in6_multi_dbg) in6m_trash_head; +static decl_lck_mtx_data(, in6m_trash_lock); + +#if DEBUG +static unsigned int in6m_debug = 1; /* debugging (enabled) */ +#else +static unsigned int in6m_debug; /* debugging (disabled) */ +#endif /* !DEBUG */ +static unsigned int in6m_size; /* size of zone element */ +static struct zone *in6m_zone; /* zone for in6_multi */ + +#define IN6M_ZONE_MAX 64 /* maximum elements in zone */ +#define IN6M_ZONE_NAME "in6_multi" /* zone name */ + +static unsigned int imm_size; /* size of zone element */ +static struct zone *imm_zone; /* zone for in6_multi_mship */ + +#define IMM_ZONE_MAX 64 /* maximum elements in zone */ +#define IMM_ZONE_NAME "in6_multi_mship" /* zone name */ + +#define IP6MS_ZONE_MAX 64 /* maximum elements in zone */ +#define IP6MS_ZONE_NAME "ip6_msource" /* zone name */ + +static unsigned int ip6ms_size; /* size of zone element */ +static struct zone *ip6ms_zone; /* zone for ip6_msource */ + +#define IN6MS_ZONE_MAX 64 /* maximum elements in zone */ +#define IN6MS_ZONE_NAME "in6_msource" /* zone name */ + +static unsigned int in6ms_size; /* size of zone element */ +static struct zone *in6ms_zone; /* zone for in6_msource */ + +/* Lock group and attribute for in6_multihead_lock lock */ +static lck_attr_t *in6_multihead_lock_attr; +static lck_grp_t *in6_multihead_lock_grp; +static lck_grp_attr_t *in6_multihead_lock_grp_attr; + +static decl_lck_rw_data(, in6_multihead_lock); +struct in6_multihead in6_multihead; + +static struct in6_multi *in6_multi_alloc(int); +static void in6_multi_free(struct in6_multi *); +static void in6_multi_attach(struct in6_multi *); +static struct in6_multi_mship *in6_multi_mship_alloc(int); +static void in6_multi_mship_free(struct in6_multi_mship *); +static void in6m_trace(struct in6_multi *, int); + +static struct ip6_msource *ip6ms_alloc(int); +static void ip6ms_free(struct ip6_msource *); +static struct in6_msource *in6ms_alloc(int); +static void in6ms_free(struct in6_msource *); + +#define IM6O_CAST_TO_NONCONST(x) ((struct ip6_moptions *)(void *)(uintptr_t)x) +#define IN6M_CAST_TO_NONCONST(x) ((struct in6_multi *)(void *)(uintptr_t)x) + +/* + * IPv6 source tree comparison function. + * + * An ordered predicate is necessary; bcmp() is not documented to return + * an indication of order, memcmp() is, and is an ISO C99 requirement. + */ +static __inline int +ip6_msource_cmp(const struct ip6_msource *a, const struct ip6_msource *b) +{ + return (memcmp(&a->im6s_addr, &b->im6s_addr, sizeof(struct in6_addr))); +} + +/* + * Inline function which wraps assertions for a valid ifp. + */ +static __inline__ int +in6m_is_ifp_detached(const struct in6_multi *inm) +{ + VERIFY(inm->in6m_ifma != NULL); + VERIFY(inm->in6m_ifp == inm->in6m_ifma->ifma_ifp); + + return (!ifnet_is_attached(inm->in6m_ifp, 0)); +} + +/* + * Initialize an in6_mfilter structure to a known state at t0, t1 + * with an empty source filter list. + */ +static __inline__ void +im6f_init(struct in6_mfilter *imf, const int st0, const int st1) +{ + memset(imf, 0, sizeof(struct in6_mfilter)); + RB_INIT(&imf->im6f_sources); + imf->im6f_st[0] = st0; + imf->im6f_st[1] = st1; +} + +/* + * Resize the ip6_moptions vector to the next power-of-two minus 1. 
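A quick worked check of the growth rule described just below: assuming the conventional 2^k - 1 bounds (31 minimum and 4095 maximum memberships; the exact constants live in the headers), ((oldmax + 1) * 2) - 1 always lands on another power-of-two-minus-one.

#include <stdio.h>

int
main(void)
{
	size_t max = 31;			/* IPV6_MIN_MEMBERSHIPS, typically */

	while (max < 4095) {			/* IPV6_MAX_MEMBERSHIPS, typically */
		max = ((max + 1) * 2) - 1;	/* 63, 127, 255, ..., 4095 */
		printf("%zu\n", max);
	}
	return (0);
}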
+ */ +static int +im6o_grow(struct ip6_moptions *imo, size_t newmax) +{ + struct in6_multi **nmships; + struct in6_multi **omships; + struct in6_mfilter *nmfilters; + struct in6_mfilter *omfilters; + size_t idx; + size_t oldmax; + + IM6O_LOCK_ASSERT_HELD(imo); + + nmships = NULL; + nmfilters = NULL; + omships = imo->im6o_membership; + omfilters = imo->im6o_mfilters; + oldmax = imo->im6o_max_memberships; + if (newmax == 0) + newmax = ((oldmax + 1) * 2) - 1; + + if (newmax > IPV6_MAX_MEMBERSHIPS) + return (ETOOMANYREFS); + + if ((nmships = (struct in6_multi **)_REALLOC(omships, + sizeof (struct in6_multi *) * newmax, M_IP6MOPTS, + M_WAITOK | M_ZERO)) == NULL) + return (ENOMEM); + + imo->im6o_membership = nmships; + + if ((nmfilters = (struct in6_mfilter *)_REALLOC(omfilters, + sizeof (struct in6_mfilter) * newmax, M_IN6MFILTER, + M_WAITOK | M_ZERO)) == NULL) + return (ENOMEM); + + imo->im6o_mfilters = nmfilters; + + /* Initialize newly allocated source filter heads. */ + for (idx = oldmax; idx < newmax; idx++) + im6f_init(&nmfilters[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); + + imo->im6o_max_memberships = newmax; + + return (0); +} + +/* + * Find an IPv6 multicast group entry for this ip6_moptions instance + * which matches the specified group, and optionally an interface. + * Return its index into the array, or -1 if not found. + */ +static size_t +im6o_match_group(const struct ip6_moptions *imo, const struct ifnet *ifp, + const struct sockaddr *group) +{ + const struct sockaddr_in6 *gsin6; + struct in6_multi *pinm; + int idx; + int nmships; + + IM6O_LOCK_ASSERT_HELD(IM6O_CAST_TO_NONCONST(imo)); + + gsin6 = (const struct sockaddr_in6 *)group; + + /* The im6o_membership array may be lazy allocated. */ + if (imo->im6o_membership == NULL || imo->im6o_num_memberships == 0) + return (-1); + + nmships = imo->im6o_num_memberships; + for (idx = 0; idx < nmships; idx++) { + pinm = imo->im6o_membership[idx]; + if (pinm == NULL) + continue; + IN6M_LOCK(pinm); + if ((ifp == NULL || (pinm->in6m_ifp == ifp)) && + IN6_ARE_ADDR_EQUAL(&pinm->in6m_addr, + &gsin6->sin6_addr)) { + IN6M_UNLOCK(pinm); + break; + } + IN6M_UNLOCK(pinm); + } + if (idx >= nmships) + idx = -1; + + return (idx); +} + +/* + * Find an IPv6 multicast source entry for this imo which matches + * the given group index for this socket, and source address. + * + * XXX TODO: The scope ID, if present in src, is stripped before + * any comparison. We SHOULD enforce scope/zone checks where the source + * filter entry has a link scope. + * + * NOTE: This does not check if the entry is in-mode, merely if + * it exists, which may not be the desired behaviour. + */ +static struct in6_msource * +im6o_match_source(const struct ip6_moptions *imo, const size_t gidx, + const struct sockaddr *src) +{ + struct ip6_msource find; + struct in6_mfilter *imf; + struct ip6_msource *ims; + const sockunion_t *psa; + + IM6O_LOCK_ASSERT_HELD(IM6O_CAST_TO_NONCONST(imo)); + + VERIFY(src->sa_family == AF_INET6); + VERIFY(gidx != (size_t)-1 && gidx < imo->im6o_num_memberships); + + /* The im6o_mfilters array may be lazy allocated. */ + if (imo->im6o_mfilters == NULL) + return (NULL); + imf = &imo->im6o_mfilters[gidx]; + + psa = (const sockunion_t *)src; + find.im6s_addr = psa->sin6.sin6_addr; + in6_clearscope(&find.im6s_addr); /* XXX */ + ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); + + return ((struct in6_msource *)ims); +} + +/* + * Perform filtering for multicast datagrams on a socket by group and source. 
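Before the filtering entry point below, a note on the red-black-tree lookups used throughout this file (im6o_match_source() above, and the im6f_get_source()/in6m_get_source() helpers further down): a search key is built in a stack node and handed to RB_FIND. A self-contained sketch of that idiom with illustrative types; <sys/tree.h> ships with the BSDs, and elsewhere the same header is available via libbsd.

#include <sys/tree.h>
#include <stdint.h>

struct src {
	RB_ENTRY(src)	link;
	uint32_t	key;
};

static int
src_cmp(const struct src *a, const struct src *b)
{
	return (a->key < b->key ? -1 : a->key > b->key);
}

RB_HEAD(src_tree, src);
RB_GENERATE(src_tree, src, link, src_cmp);

static struct src *
src_lookup(struct src_tree *t, uint32_t key)
{
	struct src find;

	find.key = key;		/* only the key field needs to be valid */
	return (RB_FIND(src_tree, t, &find));
}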
+ * + * Returns 0 if a datagram should be allowed through, or various error codes + * if the socket was not a member of the group, or the source was muted, etc. + */ +int +im6o_mc_filter(const struct ip6_moptions *imo, const struct ifnet *ifp, + const struct sockaddr *group, const struct sockaddr *src) +{ + size_t gidx; + struct in6_msource *ims; + int mode; + + IM6O_LOCK_ASSERT_HELD(IM6O_CAST_TO_NONCONST(imo)); + VERIFY(ifp != NULL); + + gidx = im6o_match_group(imo, ifp, group); + if (gidx == (size_t)-1) + return (MCAST_NOTGMEMBER); + + /* + * Check if the source was included in an (S,G) join. + * Allow reception on exclusive memberships by default, + * reject reception on inclusive memberships by default. + * Exclude source only if an in-mode exclude filter exists. + * Include source only if an in-mode include filter exists. + * NOTE: We are comparing group state here at MLD t1 (now) + * with socket-layer t0 (since last downcall). + */ + mode = imo->im6o_mfilters[gidx].im6f_st[1]; + ims = im6o_match_source(imo, gidx, src); + + if ((ims == NULL && mode == MCAST_INCLUDE) || + (ims != NULL && ims->im6sl_st[0] != mode)) + return (MCAST_NOTSMEMBER); + + return (MCAST_PASS); +} + +/* + * Find and return a reference to an in6_multi record for (ifp, group), + * and bump its reference count. + * If one does not exist, try to allocate it, and update link-layer multicast + * filters on ifp to listen for group. + * Assumes the IN6_MULTI lock is held across the call. + * Return 0 if successful, otherwise return an appropriate error code. + */ +static int +in6_mc_get(struct ifnet *ifp, const struct in6_addr *group, + struct in6_multi **pinm) +{ + struct sockaddr_in6 gsin6; + struct ifmultiaddr *ifma; + struct in6_multi *inm; + int error; + + *pinm = NULL; + + in6_multihead_lock_shared(); + IN6_LOOKUP_MULTI(group, ifp, inm); + if (inm != NULL) { + IN6M_LOCK(inm); + VERIFY(inm->in6m_reqcnt >= 1); + inm->in6m_reqcnt++; + VERIFY(inm->in6m_reqcnt != 0); + *pinm = inm; + IN6M_UNLOCK(inm); + in6_multihead_lock_done(); + /* + * We already joined this group; return the in6m + * with a refcount held (via lookup) for caller. + */ + return (0); + } + in6_multihead_lock_done(); + + memset(&gsin6, 0, sizeof(gsin6)); + gsin6.sin6_family = AF_INET6; + gsin6.sin6_len = sizeof(struct sockaddr_in6); + gsin6.sin6_addr = *group; + + /* + * Check if a link-layer group is already associated + * with this network-layer group on the given ifnet. + */ + error = if_addmulti(ifp, (struct sockaddr *)&gsin6, &ifma); + if (error != 0) + return (error); + + /* + * See comments in in6m_remref() for access to ifma_protospec. + */ + in6_multihead_lock_exclusive(); + IFMA_LOCK(ifma); + if ((inm = ifma->ifma_protospec) != NULL) { + VERIFY(ifma->ifma_addr != NULL); + VERIFY(ifma->ifma_addr->sa_family == AF_INET6); + IN6M_ADDREF(inm); /* for caller */ + IFMA_UNLOCK(ifma); + IN6M_LOCK(inm); + VERIFY(inm->in6m_ifma == ifma); + VERIFY(inm->in6m_ifp == ifp); + VERIFY(IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, group)); + if (inm->in6m_debug & IFD_ATTACHED) { + VERIFY(inm->in6m_reqcnt >= 1); + inm->in6m_reqcnt++; + VERIFY(inm->in6m_reqcnt != 0); + *pinm = inm; + IN6M_UNLOCK(inm); + in6_multihead_lock_done(); + IFMA_REMREF(ifma); + /* + * We lost the race with another thread doing + * in6_mc_get(); since this group has already + * been joined; return the inm with a refcount + * held for caller. 
+ */ + return (0); + } + /* + * We lost the race with another thread doing in6_delmulti(); + * the inm referring to the ifma has been detached, thus we + * reattach it back to the in6_multihead list, and return the + * inm with a refcount held for the caller. + */ + in6_multi_attach(inm); + VERIFY((inm->in6m_debug & + (IFD_ATTACHED | IFD_TRASHED)) == IFD_ATTACHED); + *pinm = inm; + IN6M_UNLOCK(inm); + in6_multihead_lock_done(); + IFMA_REMREF(ifma); + return (0); + } + IFMA_UNLOCK(ifma); + + /* + * A new in6_multi record is needed; allocate and initialize it. + * We DO NOT perform an MLD join as the in6_ layer may need to + * push an initial source list down to MLD to support SSM. + * + * The initial source filter state is INCLUDE, {} as per the RFC. + * Pending state-changes per group are subject to a bounds check. + */ + inm = in6_multi_alloc(M_WAITOK); + if (inm == NULL) { + in6_multihead_lock_done(); + IFMA_REMREF(ifma); + return (ENOMEM); + } + IN6M_LOCK(inm); + inm->in6m_addr = *group; + inm->in6m_ifp = ifp; + inm->in6m_mli = MLD_IFINFO(ifp); + VERIFY(inm->in6m_mli != NULL); + MLI_ADDREF(inm->in6m_mli); + inm->in6m_ifma = ifma; /* keep refcount from if_addmulti() */ + inm->in6m_state = MLD_NOT_MEMBER; + /* + * Pending state-changes per group are subject to a bounds check. + */ + inm->in6m_scq.ifq_maxlen = MLD_MAX_STATE_CHANGES; + inm->in6m_st[0].iss_fmode = MCAST_UNDEFINED; + inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED; + RB_INIT(&inm->in6m_srcs); + *pinm = inm; + in6_multi_attach(inm); + VERIFY((inm->in6m_debug & + (IFD_ATTACHED | IFD_TRASHED)) == IFD_ATTACHED); + IN6M_ADDREF_LOCKED(inm); /* for caller */ + IN6M_UNLOCK(inm); + + IFMA_LOCK(ifma); + VERIFY(ifma->ifma_protospec == NULL); + ifma->ifma_protospec = inm; + IFMA_UNLOCK(ifma); + in6_multihead_lock_done(); + + return (0); +} + +/* + * Clear recorded source entries for a group. + * Used by the MLD code. Caller must hold the IN6_MULTI lock. + * FIXME: Should reap. + */ +void +in6m_clear_recorded(struct in6_multi *inm) +{ + struct ip6_msource *ims; + + IN6M_LOCK_ASSERT_HELD(inm); + + RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) { + if (ims->im6s_stp) { + ims->im6s_stp = 0; + --inm->in6m_st[1].iss_rec; + } + } + VERIFY(inm->in6m_st[1].iss_rec == 0); +} + +/* + * Record a source as pending for a Source-Group MLDv2 query. + * This lives here as it modifies the shared tree. + * + * inm is the group descriptor. + * naddr is the address of the source to record in network-byte order. + * + * If the net.inet6.mld.sgalloc sysctl is non-zero, we will + * lazy-allocate a source node in response to an SG query. + * Otherwise, no allocation is performed. This saves some memory + * with the trade-off that the source will not be reported to the + * router if joined in the window between the query response and + * the group actually being joined on the local host. + * + * VIMAGE: XXX: Currently the mld_sgalloc feature has been removed. + * This turns off the allocation of a recorded source entry if + * the group has not been joined. + * + * Return 0 if the source didn't exist or was already marked as recorded. + * Return 1 if the source was marked as recorded by this function. + * Return <0 if any error occured (negated errno code). 
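Stepping back from in6_mc_get() just above: stripped of the MLD and ifma specifics, it is a get-or-create with a re-check after reacquiring the lock, since another thread may have inserted the entry while no lock (or only the shared lock) was held. A sketch under those assumptions, with hypothetical types and a pthreads rwlock; the kernel protects refcnt with a per-entry lock, which is elided here.

#include <pthread.h>
#include <stdlib.h>

struct entry { int key; int refcnt; struct entry *next; };
extern struct entry *table;
extern pthread_rwlock_t table_lock;
extern struct entry *entry_lookup(int key);	/* walks 'table' */

static struct entry *
entry_get(int key)
{
	struct entry *e, *n;

	pthread_rwlock_rdlock(&table_lock);
	if ((e = entry_lookup(key)) != NULL)
		e->refcnt++;	/* per-entry lock in the kernel version */
	pthread_rwlock_unlock(&table_lock);
	if (e != NULL)
		return (e);

	if ((n = calloc(1, sizeof (*n))) == NULL)
		return (NULL);
	n->key = key;
	n->refcnt = 1;

	pthread_rwlock_wrlock(&table_lock);
	if ((e = entry_lookup(key)) != NULL) {
		/* Lost the race; free ours, take a reference on theirs. */
		e->refcnt++;
		pthread_rwlock_unlock(&table_lock);
		free(n);
		return (e);
	}
	n->next = table;
	table = n;
	pthread_rwlock_unlock(&table_lock);
	return (n);
}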
+ */ +int +in6m_record_source(struct in6_multi *inm, const struct in6_addr *addr) +{ + struct ip6_msource find; + struct ip6_msource *ims, *nims; + + IN6M_LOCK_ASSERT_HELD(inm); + + find.im6s_addr = *addr; + ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find); + if (ims && ims->im6s_stp) + return (0); + if (ims == NULL) { + if (inm->in6m_nsrc == in6_mcast_maxgrpsrc) + return (-ENOSPC); + nims = ip6ms_alloc(M_WAITOK); + if (nims == NULL) + return (-ENOMEM); + nims->im6s_addr = find.im6s_addr; + RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims); + ++inm->in6m_nsrc; + ims = nims; + } + + /* + * Mark the source as recorded and update the recorded + * source count. + */ + ++ims->im6s_stp; + ++inm->in6m_st[1].iss_rec; + + return (1); +} + +/* + * Return a pointer to an in6_msource owned by an in6_mfilter, + * given its source address. + * Lazy-allocate if needed. If this is a new entry its filter state is + * undefined at t0. + * + * imf is the filter set being modified. + * addr is the source address. + * + * Caller is expected to be holding im6o_lock. + */ +static int +im6f_get_source(struct in6_mfilter *imf, const struct sockaddr_in6 *psin, + struct in6_msource **plims) +{ + struct ip6_msource find; + struct ip6_msource *ims; + struct in6_msource *lims; + int error; + + error = 0; + ims = NULL; + lims = NULL; + + find.im6s_addr = psin->sin6_addr; + ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); + lims = (struct in6_msource *)ims; + if (lims == NULL) { + if (imf->im6f_nsrc == in6_mcast_maxsocksrc) + return (ENOSPC); + lims = in6ms_alloc(M_WAITOK); + if (lims == NULL) + return (ENOMEM); + lims->im6s_addr = find.im6s_addr; + lims->im6sl_st[0] = MCAST_UNDEFINED; + RB_INSERT(ip6_msource_tree, &imf->im6f_sources, + (struct ip6_msource *)lims); + ++imf->im6f_nsrc; + } + + *plims = lims; + + return (error); +} + +/* + * Graft a source entry into an existing socket-layer filter set, + * maintaining any required invariants and checking allocations. + * + * The source is marked as being in the new filter mode at t1. + * + * Return the pointer to the new node, otherwise return NULL. + * + * Caller is expected to be holding im6o_lock. + */ +static struct in6_msource * +im6f_graft(struct in6_mfilter *imf, const uint8_t st1, + const struct sockaddr_in6 *psin) +{ + struct in6_msource *lims; + + lims = in6ms_alloc(M_WAITOK); + if (lims == NULL) + return (NULL); + lims->im6s_addr = psin->sin6_addr; + lims->im6sl_st[0] = MCAST_UNDEFINED; + lims->im6sl_st[1] = st1; + RB_INSERT(ip6_msource_tree, &imf->im6f_sources, + (struct ip6_msource *)lims); + ++imf->im6f_nsrc; + + return (lims); +} + +/* + * Prune a source entry from an existing socket-layer filter set, + * maintaining any required invariants and checking allocations. + * + * The source is marked as being left at t1, it is not freed. + * + * Return 0 if no error occurred, otherwise return an errno value. + * + * Caller is expected to be holding im6o_lock. + */ +static int +im6f_prune(struct in6_mfilter *imf, const struct sockaddr_in6 *psin) +{ + struct ip6_msource find; + struct ip6_msource *ims; + struct in6_msource *lims; + + find.im6s_addr = psin->sin6_addr; + ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find); + if (ims == NULL) + return (ENOENT); + lims = (struct in6_msource *)ims; + lims->im6sl_st[1] = MCAST_UNDEFINED; + return (0); +} + +/* + * Revert socket-layer filter set deltas at t1 to t0 state. + * + * Caller is expected to be holding im6o_lock. 
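The rollback/commit/reap trio that starts here is easiest to read as a two-slot transaction: index 0 holds the committed state (t0) and index 1 the pending state (t1). A minimal sketch with a hypothetical two-slot struct:

struct twostate { int st[2]; };	/* st[0] = committed, st[1] = pending */

static void
commit(struct twostate *s)
{
	s->st[0] = s->st[1];	/* pending becomes committed */
}

static void
rollback(struct twostate *s)
{
	s->st[1] = s->st[0];	/* discard the pending delta */
}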
+ */ +static void +im6f_rollback(struct in6_mfilter *imf) +{ + struct ip6_msource *ims, *tims; + struct in6_msource *lims; + + RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) { + lims = (struct in6_msource *)ims; + if (lims->im6sl_st[0] == lims->im6sl_st[1]) { + /* no change at t1 */ + continue; + } else if (lims->im6sl_st[0] != MCAST_UNDEFINED) { + /* revert change to existing source at t1 */ + lims->im6sl_st[1] = lims->im6sl_st[0]; + } else { + /* revert source added t1 */ + MLD_PRINTF(("%s: free in6ms %p\n", __func__, lims)); + RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims); + in6ms_free(lims); + imf->im6f_nsrc--; + } + } + imf->im6f_st[1] = imf->im6f_st[0]; +} + +/* + * Mark socket-layer filter set as INCLUDE {} at t1. + * + * Caller is expected to be holding im6o_lock. + */ +void +im6f_leave(struct in6_mfilter *imf) +{ + struct ip6_msource *ims; + struct in6_msource *lims; + + RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { + lims = (struct in6_msource *)ims; + lims->im6sl_st[1] = MCAST_UNDEFINED; + } + imf->im6f_st[1] = MCAST_INCLUDE; +} + +/* + * Mark socket-layer filter set deltas as committed. + * + * Caller is expected to be holding im6o_lock. + */ +static void +im6f_commit(struct in6_mfilter *imf) +{ + struct ip6_msource *ims; + struct in6_msource *lims; + + RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { + lims = (struct in6_msource *)ims; + lims->im6sl_st[0] = lims->im6sl_st[1]; + } + imf->im6f_st[0] = imf->im6f_st[1]; +} + +/* + * Reap unreferenced sources from socket-layer filter set. + * + * Caller is expected to be holding im6o_lock. + */ +static void +im6f_reap(struct in6_mfilter *imf) +{ + struct ip6_msource *ims, *tims; + struct in6_msource *lims; + + RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) { + lims = (struct in6_msource *)ims; + if ((lims->im6sl_st[0] == MCAST_UNDEFINED) && + (lims->im6sl_st[1] == MCAST_UNDEFINED)) { + MLD_PRINTF(("%s: free in6ms %p\n", __func__, lims)); + RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims); + in6ms_free(lims); + imf->im6f_nsrc--; + } + } +} + +/* + * Purge socket-layer filter set. + * + * Caller is expected to be holding im6o_lock. + */ +void +im6f_purge(struct in6_mfilter *imf) +{ + struct ip6_msource *ims, *tims; + struct in6_msource *lims; + + RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) { + lims = (struct in6_msource *)ims; + MLD_PRINTF(("%s: free in6ms %p\n", __func__, lims)); + RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims); + in6ms_free(lims); + imf->im6f_nsrc--; + } + imf->im6f_st[0] = imf->im6f_st[1] = MCAST_UNDEFINED; + VERIFY(RB_EMPTY(&imf->im6f_sources)); +} + +/* + * Look up a source filter entry for a multicast group. + * + * inm is the group descriptor to work with. + * addr is the IPv6 address to look up. + * noalloc may be non-zero to suppress allocation of sources. + * *pims will be set to the address of the retrieved or allocated source. + * + * Return 0 if successful, otherwise return a non-zero error code. 
+ */ +static int +in6m_get_source(struct in6_multi *inm, const struct in6_addr *addr, + const int noalloc, struct ip6_msource **pims) +{ + struct ip6_msource find; + struct ip6_msource *ims, *nims; + + IN6M_LOCK_ASSERT_HELD(inm); + + find.im6s_addr = *addr; + ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find); + if (ims == NULL && !noalloc) { + if (inm->in6m_nsrc == in6_mcast_maxgrpsrc) + return (ENOSPC); + nims = ip6ms_alloc(M_WAITOK); + if (nims == NULL) + return (ENOMEM); + nims->im6s_addr = *addr; + RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims); + ++inm->in6m_nsrc; + ims = nims; + MLD_PRINTF(("%s: allocated %s as %p\n", __func__, + ip6_sprintf(addr), ims)); + } + + *pims = ims; + return (0); +} + +/* + * Helper function to derive the filter mode on a source entry + * from its internal counters. Predicates are: + * A source is only excluded if all listeners exclude it. + * A source is only included if no listeners exclude it, + * and at least one listener includes it. + * May be used by ifmcstat(8). + */ +uint8_t +im6s_get_mode(const struct in6_multi *inm, const struct ip6_msource *ims, + uint8_t t) +{ + IN6M_LOCK_ASSERT_HELD(IN6M_CAST_TO_NONCONST(inm)); + + t = !!t; + if (inm->in6m_st[t].iss_ex > 0 && + inm->in6m_st[t].iss_ex == ims->im6s_st[t].ex) + return (MCAST_EXCLUDE); + else if (ims->im6s_st[t].in > 0 && ims->im6s_st[t].ex == 0) + return (MCAST_INCLUDE); + return (MCAST_UNDEFINED); +} + +/* + * Merge socket-layer source into MLD-layer source. + * If rollback is non-zero, perform the inverse of the merge. + */ +static void +im6s_merge(struct ip6_msource *ims, const struct in6_msource *lims, + const int rollback) +{ + int n = rollback ? -1 : 1; + + if (lims->im6sl_st[0] == MCAST_EXCLUDE) { + MLD_PRINTF(("%s: t1 ex -= %d on %s\n", __func__, n, + ip6_sprintf(&lims->im6s_addr))); + ims->im6s_st[1].ex -= n; + } else if (lims->im6sl_st[0] == MCAST_INCLUDE) { + MLD_PRINTF(("%s: t1 in -= %d on %s\n", __func__, n, + ip6_sprintf(&lims->im6s_addr))); + ims->im6s_st[1].in -= n; + } + + if (lims->im6sl_st[1] == MCAST_EXCLUDE) { + MLD_PRINTF(("%s: t1 ex += %d on %s\n", __func__, n, + ip6_sprintf(&lims->im6s_addr))); + ims->im6s_st[1].ex += n; + } else if (lims->im6sl_st[1] == MCAST_INCLUDE) { + MLD_PRINTF(("%s: t1 in += %d on %s\n", __func__, n, + ip6_sprintf(&lims->im6s_addr))); + ims->im6s_st[1].in += n; + } +} + +/* + * Atomically update the global in6_multi state, when a membership's + * filter list is being updated in any way. + * + * imf is the per-inpcb-membership group filter pointer. + * A fake imf may be passed for in-kernel consumers. + * + * XXX This is a candidate for a set-symmetric-difference style loop + * which would eliminate the repeated lookup from root of ims nodes, + * as they share the same key space. + * + * If any error occurred this function will back out of refcounts + * and return a non-zero value. + */ +static int +in6m_merge(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) +{ + struct ip6_msource *ims, *nims; + struct in6_msource *lims; + int schanged, error; + int nsrc0, nsrc1; + + IN6M_LOCK_ASSERT_HELD(inm); + + schanged = 0; + error = 0; + nsrc1 = nsrc0 = 0; + + /* + * Update the source filters first, as this may fail. + * Maintain count of in-mode filters at t0, t1. These are + * used to work out if we transition into ASM mode or not. + * Maintain a count of source filters whose state was + * actually modified by this operation. 
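To make the im6s_get_mode() predicates above concrete, with made-up counters: suppose a group has two exclusive listeners at t1 (iss_ex == 2). A source whose im6s_st[1].ex is also 2 is excluded by every exclusive listener, so its mode is MCAST_EXCLUDE; a source with .ex == 1 is still wanted by one of them, so it stays MCAST_UNDEFINED; and a source with .ex == 0 and .in > 0 is MCAST_INCLUDE.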
+ */ + RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) { + lims = (struct in6_msource *)ims; + if (lims->im6sl_st[0] == imf->im6f_st[0]) nsrc0++; + if (lims->im6sl_st[1] == imf->im6f_st[1]) nsrc1++; + if (lims->im6sl_st[0] == lims->im6sl_st[1]) continue; + error = in6m_get_source(inm, &lims->im6s_addr, 0, &nims); + ++schanged; + if (error) + break; + im6s_merge(nims, lims, 0); + } + if (error) { + struct ip6_msource *bims; + + RB_FOREACH_REVERSE_FROM(ims, ip6_msource_tree, nims) { + lims = (struct in6_msource *)ims; + if (lims->im6sl_st[0] == lims->im6sl_st[1]) + continue; + (void) in6m_get_source(inm, &lims->im6s_addr, 1, &bims); + if (bims == NULL) + continue; + im6s_merge(bims, lims, 1); + } + goto out_reap; + } + + MLD_PRINTF(("%s: imf filters in-mode: %d at t0, %d at t1\n", + __func__, nsrc0, nsrc1)); + + /* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */ + if (imf->im6f_st[0] == imf->im6f_st[1] && + imf->im6f_st[1] == MCAST_INCLUDE) { + if (nsrc1 == 0) { + MLD_PRINTF(("%s: --in on inm at t1\n", __func__)); + --inm->in6m_st[1].iss_in; + } + } + + /* Handle filter mode transition on socket. */ + if (imf->im6f_st[0] != imf->im6f_st[1]) { + MLD_PRINTF(("%s: imf transition %d to %d\n", + __func__, imf->im6f_st[0], imf->im6f_st[1])); + + if (imf->im6f_st[0] == MCAST_EXCLUDE) { + MLD_PRINTF(("%s: --ex on inm at t1\n", __func__)); + --inm->in6m_st[1].iss_ex; + } else if (imf->im6f_st[0] == MCAST_INCLUDE) { + MLD_PRINTF(("%s: --in on inm at t1\n", __func__)); + --inm->in6m_st[1].iss_in; + } + + if (imf->im6f_st[1] == MCAST_EXCLUDE) { + MLD_PRINTF(("%s: ex++ on inm at t1\n", __func__)); + inm->in6m_st[1].iss_ex++; + } else if (imf->im6f_st[1] == MCAST_INCLUDE && nsrc1 > 0) { + MLD_PRINTF(("%s: in++ on inm at t1\n", __func__)); + inm->in6m_st[1].iss_in++; + } + } + + /* + * Track inm filter state in terms of listener counts. + * If there are any exclusive listeners, stack-wide + * membership is exclusive. + * Otherwise, if only inclusive listeners, stack-wide is inclusive. + * If no listeners remain, state is undefined at t1, + * and the MLD lifecycle for this group should finish. + */ + if (inm->in6m_st[1].iss_ex > 0) { + MLD_PRINTF(("%s: transition to EX\n", __func__)); + inm->in6m_st[1].iss_fmode = MCAST_EXCLUDE; + } else if (inm->in6m_st[1].iss_in > 0) { + MLD_PRINTF(("%s: transition to IN\n", __func__)); + inm->in6m_st[1].iss_fmode = MCAST_INCLUDE; + } else { + MLD_PRINTF(("%s: transition to UNDEF\n", __func__)); + inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED; + } + + /* Decrement ASM listener count on transition out of ASM mode. */ + if (imf->im6f_st[0] == MCAST_EXCLUDE && nsrc0 == 0) { + if ((imf->im6f_st[1] != MCAST_EXCLUDE) || + (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) { + MLD_PRINTF(("%s: --asm on inm at t1\n", __func__)); + --inm->in6m_st[1].iss_asm; + } + } + + /* Increment ASM listener count on transition to ASM mode. */ + if (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 == 0) { + MLD_PRINTF(("%s: asm++ on inm at t1\n", __func__)); + inm->in6m_st[1].iss_asm++; + } + + MLD_PRINTF(("%s: merged imf %p to inm %p\n", __func__, imf, inm)); + in6m_print(inm); + +out_reap: + if (schanged > 0) { + MLD_PRINTF(("%s: sources changed; reaping\n", __func__)); + in6m_reap(inm); + } + return (error); +} + +/* + * Mark an in6_multi's filter set deltas as committed. + * Called by MLD after a state change has been enqueued. 
+ */ +void +in6m_commit(struct in6_multi *inm) +{ + struct ip6_msource *ims; + + IN6M_LOCK_ASSERT_HELD(inm); + + MLD_PRINTF(("%s: commit inm %p\n", __func__, inm)); + MLD_PRINTF(("%s: pre commit:\n", __func__)); + in6m_print(inm); + + RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) { + ims->im6s_st[0] = ims->im6s_st[1]; + } + inm->in6m_st[0] = inm->in6m_st[1]; +} + +/* + * Reap unreferenced nodes from an in6_multi's filter set. + */ +static void +in6m_reap(struct in6_multi *inm) +{ + struct ip6_msource *ims, *tims; + + IN6M_LOCK_ASSERT_HELD(inm); + + RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) { + if (ims->im6s_st[0].ex > 0 || ims->im6s_st[0].in > 0 || + ims->im6s_st[1].ex > 0 || ims->im6s_st[1].in > 0 || + ims->im6s_stp != 0) + continue; + MLD_PRINTF(("%s: free ims %p\n", __func__, ims)); + RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims); + ip6ms_free(ims); + inm->in6m_nsrc--; + } +} + +/* + * Purge all source nodes from an in6_multi's filter set. + */ +void +in6m_purge(struct in6_multi *inm) +{ + struct ip6_msource *ims, *tims; + + IN6M_LOCK_ASSERT_HELD(inm); + + RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, tims) { + MLD_PRINTF(("%s: free ims %p\n", __func__, ims)); + RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, ims); + ip6ms_free(ims); + inm->in6m_nsrc--; + } +} + +/* + * Join a multicast address w/o sources. + * KAME compatibility entry point. + * + */ +struct in6_multi_mship * +in6_joingroup(struct ifnet *ifp, struct in6_addr *mcaddr, + int *errorp, int delay) +{ + struct in6_multi_mship *imm; + int error; + + *errorp = 0; + + imm = in6_multi_mship_alloc(M_WAITOK); + if (imm == NULL) { + *errorp = ENOBUFS; + return (NULL); + } + + delay = (delay * PR_SLOWHZ) / hz; + + error = in6_mc_join(ifp, mcaddr, NULL, &imm->i6mm_maddr, delay); + if (error) { + *errorp = error; + in6_multi_mship_free(imm); + return (NULL); + } + + return (imm); +} + +/* + * Leave a multicast address w/o sources. + * KAME compatibility entry point. + */ +int +in6_leavegroup(struct in6_multi_mship *imm) +{ + if (imm->i6mm_maddr != NULL) { + in6_mc_leave(imm->i6mm_maddr, NULL); + IN6M_REMREF(imm->i6mm_maddr); + imm->i6mm_maddr = NULL; + } + in6_multi_mship_free(imm); + return 0; +} + +/* + * Join a multicast group; real entry point. + * + * Only preserves atomicity at inm level. + * NOTE: imf argument cannot be const due to sys/tree.h limitations. + * + * If the MLD downcall fails, the group is not joined, and an error + * code is returned. + */ +int +in6_mc_join(struct ifnet *ifp, const struct in6_addr *mcaddr, + /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm, + const int delay) +{ + struct in6_mfilter timf; + struct in6_multi *inm = NULL; + int error = 0; + + /* + * Sanity: Check scope zone ID was set for ifp, if and + * only if group is scoped to an interface. + */ + VERIFY(IN6_IS_ADDR_MULTICAST(mcaddr)); + if (IN6_IS_ADDR_MC_LINKLOCAL(mcaddr) || + IN6_IS_ADDR_MC_INTFACELOCAL(mcaddr)) { + VERIFY(mcaddr->s6_addr16[1] != 0); + } + + MLD_PRINTF(("%s: join %s on %p(%s%d))\n", __func__, + ip6_sprintf(mcaddr), ifp, ifp->if_name, ifp->if_unit)); + + *pinm = NULL; + + /* + * If no imf was specified (i.e. kernel consumer), + * fake one up and assume it is an ASM join. 
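A note on the fake filter built just below: an any-source (ASM) membership is modelled throughout this file as mode MCAST_EXCLUDE with an empty source list, i.e. "exclude nothing". That is why kernel consumers that pass no imf get im6f_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE) on join, and the mirror image (MCAST_EXCLUDE, MCAST_UNDEFINED) on leave.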
+ */ + if (imf == NULL) { + im6f_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE); + imf = &timf; + } + + error = in6_mc_get(ifp, mcaddr, &inm); + if (error) { + MLD_PRINTF(("%s: in6_mc_get() failure\n", __func__)); + return (error); + } + + MLD_PRINTF(("%s: merge inm state\n", __func__)); + + IN6M_LOCK(inm); + error = in6m_merge(inm, imf); + if (error) { + MLD_PRINTF(("%s: failed to merge inm state\n", __func__)); + goto out_in6m_release; + } + + MLD_PRINTF(("%s: doing mld downcall\n", __func__)); + error = mld_change_state(inm, delay); + if (error) { + MLD_PRINTF(("%s: failed to update source\n", __func__)); + goto out_in6m_release; + } + +out_in6m_release: + if (error) { + MLD_PRINTF(("%s: dropping ref on %p\n", __func__, inm)); + IN6M_UNLOCK(inm); + IN6M_REMREF(inm); + } else { + IN6M_UNLOCK(inm); + *pinm = inm; /* keep refcount from in6_mc_get() */ + } + + return (error); +} + +/* + * Leave a multicast group; real entry point. + * All source filters will be expunged. + * + * Only preserves atomicity at inm level. + * + * Holding the write lock for the INP which contains imf + * is highly advisable. We can't assert for it as imf does not + * contain a back-pointer to the owning inp. + * + * Note: This is not the same as in6m_release(*) as this function also + * makes a state change downcall into MLD. + */ +int +in6_mc_leave(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf) +{ + struct in6_mfilter timf; + int error, lastref; + + error = 0; + + IN6M_LOCK_ASSERT_NOTHELD(inm); + + in6_multihead_lock_exclusive(); + IN6M_LOCK(inm); + + MLD_PRINTF(("%s: leave inm %p, %s/%s%d, imf %p\n", __func__, + inm, ip6_sprintf(&inm->in6m_addr), + (in6m_is_ifp_detached(inm) ? "null" : inm->in6m_ifp->if_name), + inm->in6m_ifp->if_unit, imf)); + + /* + * If no imf was specified (i.e. kernel consumer), + * fake one up and assume it is an ASM join. + */ + if (imf == NULL) { + im6f_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED); + imf = &timf; + } + + /* + * Begin state merge transaction at MLD layer. + * + * As this particular invocation should not cause any memory + * to be allocated, and there is no opportunity to roll back + * the transaction, it MUST NOT fail. + */ + MLD_PRINTF(("%s: merge inm state\n", __func__)); + + error = in6m_merge(inm, imf); + KASSERT(error == 0, ("%s: failed to merge inm state\n", __func__)); + + MLD_PRINTF(("%s: doing mld downcall\n", __func__)); + error = mld_change_state(inm, 0); +#if MLD_DEBUG + if (error) + MLD_PRINTF(("%s: failed mld downcall\n", __func__)); +#endif + lastref = in6_multi_detach(inm); + VERIFY(!lastref || (!(inm->in6m_debug & IFD_ATTACHED) && + inm->in6m_reqcnt == 0)); + IN6M_UNLOCK(inm); + in6_multihead_lock_done(); + + if (lastref) + IN6M_REMREF(inm); /* for in6_multihead list */ + + return (error); +} + +/* + * Block or unblock an ASM multicast source on an inpcb. + * This implements the delta-based API described in RFC 3678. + * + * The delta-based API applies only to exclusive-mode memberships. + * An MLD downcall will be performed. + * + * Return 0 if successful, otherwise return an appropriate error code. 
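For orientation, the userland request that reaches in6p_block_unblock_source() below: blocking one source on an existing any-source membership via the protocol-independent RFC 3678 delta API. A hedged sketch; error handling and address setup are elided.

#include <sys/socket.h>
#include <netinet/in.h>
#include <stdint.h>
#include <string.h>

static int
block_source(int s, uint32_t ifindex,
    const struct sockaddr_in6 *group, const struct sockaddr_in6 *src)
{
	struct group_source_req gsr;

	memset(&gsr, 0, sizeof (gsr));
	gsr.gsr_interface = ifindex;
	memcpy(&gsr.gsr_group, group, sizeof (*group));
	memcpy(&gsr.gsr_source, src, sizeof (*src));
	return (setsockopt(s, IPPROTO_IPV6, MCAST_BLOCK_SOURCE,
	    &gsr, sizeof (gsr)));
}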
+ */ +static int +in6p_block_unblock_source(struct inpcb *inp, struct sockopt *sopt) +{ + struct group_source_req gsr; + sockunion_t *gsa, *ssa; + struct ifnet *ifp; + struct in6_mfilter *imf; + struct ip6_moptions *imo; + struct in6_msource *ims; + struct in6_multi *inm; + size_t idx; + uint16_t fmode; + int error, doblock; + + ifp = NULL; + error = 0; + doblock = 0; + + memset(&gsr, 0, sizeof(struct group_source_req)); + gsa = (sockunion_t *)&gsr.gsr_group; + ssa = (sockunion_t *)&gsr.gsr_source; + + switch (sopt->sopt_name) { + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + error = sooptcopyin(sopt, &gsr, + sizeof(struct group_source_req), + sizeof(struct group_source_req)); + if (error) + return (error); + + if (gsa->sin6.sin6_family != AF_INET6 || + gsa->sin6.sin6_len != sizeof(struct sockaddr_in6)) + return (EINVAL); + + if (ssa->sin6.sin6_family != AF_INET6 || + ssa->sin6.sin6_len != sizeof(struct sockaddr_in6)) + return (EINVAL); + + ifnet_head_lock_shared(); + if (gsr.gsr_interface == 0 || + (u_int)if_index < gsr.gsr_interface) { + ifnet_head_done(); + return (EADDRNOTAVAIL); + } + + ifp = ifindex2ifnet[gsr.gsr_interface]; + ifnet_head_done(); + + if (ifp == NULL) + return (EADDRNOTAVAIL); + + if (sopt->sopt_name == MCAST_BLOCK_SOURCE) + doblock = 1; + break; + + default: + MLD_PRINTF(("%s: unknown sopt_name %d\n", + __func__, sopt->sopt_name)); + return (EOPNOTSUPP); + break; + } + + if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) + return (EINVAL); + + (void) in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); + + /* + * Check if we are actually a member of this group. + */ + imo = in6p_findmoptions(inp); + if (imo == NULL) + return (ENOMEM); + + IM6O_LOCK(imo); + idx = im6o_match_group(imo, ifp, &gsa->sa); + if (idx == (size_t)-1 || imo->im6o_mfilters == NULL) { + error = EADDRNOTAVAIL; + goto out_imo_locked; + } + + VERIFY(imo->im6o_mfilters != NULL); + imf = &imo->im6o_mfilters[idx]; + inm = imo->im6o_membership[idx]; + + /* + * Attempting to use the delta-based API on an + * non exclusive-mode membership is an error. + */ + fmode = imf->im6f_st[0]; + if (fmode != MCAST_EXCLUDE) { + error = EINVAL; + goto out_imo_locked; + } + + /* + * Deal with error cases up-front: + * Asked to block, but already blocked; or + * Asked to unblock, but nothing to unblock. + * If adding a new block entry, allocate it. + */ + ims = im6o_match_source(imo, idx, &ssa->sa); + if ((ims != NULL && doblock) || (ims == NULL && !doblock)) { + MLD_PRINTF(("%s: source %s %spresent\n", __func__, + ip6_sprintf(&ssa->sin6.sin6_addr), + doblock ? "" : "not ")); + error = EADDRNOTAVAIL; + goto out_imo_locked; + } + + /* + * Begin state merge transaction at socket layer. + */ + if (doblock) { + MLD_PRINTF(("%s: %s source\n", __func__, "block")); + ims = im6f_graft(imf, fmode, &ssa->sin6); + if (ims == NULL) + error = ENOMEM; + } else { + MLD_PRINTF(("%s: %s source\n", __func__, "allow")); + error = im6f_prune(imf, &ssa->sin6); + } + + if (error) { + MLD_PRINTF(("%s: merge imf state failed\n", __func__)); + goto out_im6f_rollback; + } + + /* + * Begin state merge transaction at MLD layer. 
+ */ + IN6M_LOCK(inm); + MLD_PRINTF(("%s: merge inm state\n", __func__)); + error = in6m_merge(inm, imf); + if (error) { + MLD_PRINTF(("%s: failed to merge inm state\n", __func__)); + IN6M_UNLOCK(inm); + goto out_im6f_rollback; + } + + MLD_PRINTF(("%s: doing mld downcall\n", __func__)); + error = mld_change_state(inm, 0); + IN6M_UNLOCK(inm); +#if MLD_DEBUG + if (error) + MLD_PRINTF(("%s: failed mld downcall\n", __func__)); +#endif + +out_im6f_rollback: + if (error) + im6f_rollback(imf); + else + im6f_commit(imf); + + im6f_reap(imf); + +out_imo_locked: + IM6O_UNLOCK(imo); + IM6O_REMREF(imo); /* from in6p_findmoptions() */ + return (error); +} + +/* + * Given an inpcb, return its multicast options structure pointer. Accepts + * an unlocked inpcb pointer, but will return it locked. May sleep. + * + */ +static struct ip6_moptions * +in6p_findmoptions(struct inpcb *inp) +{ + struct ip6_moptions *imo; + struct in6_multi **immp; + struct in6_mfilter *imfp; + size_t idx; + + if ((imo = inp->in6p_moptions) != NULL) { + IM6O_ADDREF(imo); /* for caller */ + return (imo); + } + + imo = ip6_allocmoptions(M_WAITOK); + if (imo == NULL) + return (NULL); + + immp = _MALLOC(sizeof (*immp) * IPV6_MIN_MEMBERSHIPS, M_IP6MOPTS, + M_WAITOK | M_ZERO); + if (immp == NULL) { + IM6O_REMREF(imo); + return (NULL); + } + + imfp = _MALLOC(sizeof (struct in6_mfilter) * IPV6_MIN_MEMBERSHIPS, + M_IN6MFILTER, M_WAITOK | M_ZERO); + if (imfp == NULL) { + _FREE(immp, M_IP6MOPTS); + IM6O_REMREF(imo); + return (NULL); + } + + imo->im6o_multicast_ifp = NULL; + imo->im6o_multicast_hlim = ip6_defmcasthlim; + imo->im6o_multicast_loop = in6_mcast_loop; + imo->im6o_num_memberships = 0; + imo->im6o_max_memberships = IPV6_MIN_MEMBERSHIPS; + imo->im6o_membership = immp; + + /* Initialize per-group source filters. */ + for (idx = 0; idx < IPV6_MIN_MEMBERSHIPS; idx++) + im6f_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE); + + imo->im6o_mfilters = imfp; + inp->in6p_moptions = imo; /* keep reference from ip6_allocmoptions() */ + IM6O_ADDREF(imo); /* for caller */ + + return (imo); +} + +/* + * Atomically get source filters on a socket for an IPv6 multicast group. + * Called with INP lock held; returns with lock released. 
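The function that follows begins with a 32-/64-bit copyin dance: both ABI layouts of the filter request are normalized into the wider form, and only the user-space pointer field (msfr_srcs) differs in width. Reduced to a sketch with hypothetical structs; the real code goes through sooptcopyin() and never dereferences msfr_srcs directly.

#include <stdint.h>

struct req64 { uint32_t nsrcs; uint64_t srcs; };	/* 64-bit ABI layout */
struct req32 { uint32_t nsrcs; uint32_t srcs; };	/* 32-bit ABI layout */
struct req   { uint32_t nsrcs; uint64_t srcs; };	/* unified internal form */

static void
req_from_user(struct req *r, const void *u, int is64)
{
	if (is64) {
		const struct req64 *r64 = u;

		r->nsrcs = r64->nsrcs;
		r->srcs = r64->srcs;
	} else {
		const struct req32 *r32 = u;

		r->nsrcs = r32->nsrcs;
		r->srcs = r32->srcs;	/* zero-extends the 32-bit pointer */
	}
}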
+ */ +static int +in6p_get_source_filters(struct inpcb *inp, struct sockopt *sopt) +{ + struct __msfilterreq64 msfr, msfr64; + struct __msfilterreq32 msfr32; + sockunion_t *gsa; + struct ifnet *ifp; + struct ip6_moptions *imo; + struct in6_mfilter *imf; + struct ip6_msource *ims; + struct in6_msource *lims; + struct sockaddr_in6 *psin; + struct sockaddr_storage *ptss; + struct sockaddr_storage *tss; + int error; + size_t idx, nsrcs, ncsrcs; + user_addr_t tmp_ptr; + + imo = inp->in6p_moptions; + VERIFY(imo != NULL); + + if (IS_64BIT_PROCESS(current_proc())) { + error = sooptcopyin(sopt, &msfr64, + sizeof(struct __msfilterreq64), + sizeof(struct __msfilterreq64)); + if (error) + return (error); + /* we never use msfr.msfr_srcs; */ + memcpy(&msfr, &msfr64, sizeof(msfr)); + } else { + error = sooptcopyin(sopt, &msfr32, + sizeof(struct __msfilterreq32), + sizeof(struct __msfilterreq32)); + if (error) + return (error); + /* we never use msfr.msfr_srcs; */ + memcpy(&msfr, &msfr32, sizeof(msfr)); + } + + if (msfr.msfr_group.ss_family != AF_INET6 || + msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6)) + return (EINVAL); + + gsa = (sockunion_t *)&msfr.msfr_group; + if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) + return (EINVAL); + + ifnet_head_lock_shared(); + if (msfr.msfr_ifindex == 0 || (u_int)if_index < msfr.msfr_ifindex) { + ifnet_head_done(); + return (EADDRNOTAVAIL); + } + ifp = ifindex2ifnet[msfr.msfr_ifindex]; + ifnet_head_done(); + + if (ifp == NULL) + return (EADDRNOTAVAIL); + + if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc) + msfr.msfr_nsrcs = in6_mcast_maxsocksrc; + + (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); + + IM6O_LOCK(imo); + /* + * Lookup group on the socket. + */ + idx = im6o_match_group(imo, ifp, &gsa->sa); + if (idx == (size_t)-1 || imo->im6o_mfilters == NULL) { + IM6O_UNLOCK(imo); + return (EADDRNOTAVAIL); + } + imf = &imo->im6o_mfilters[idx]; + + /* + * Ignore memberships which are in limbo. + */ + if (imf->im6f_st[1] == MCAST_UNDEFINED) { + IM6O_UNLOCK(imo); + return (EAGAIN); + } + msfr.msfr_fmode = imf->im6f_st[1]; + + /* + * If the user specified a buffer, copy out the source filter + * entries to userland gracefully. + * We only copy out the number of entries which userland + * has asked for, but we always tell userland how big the + * buffer really needs to be. + */ + tss = NULL; + + if (IS_64BIT_PROCESS(current_proc())) + tmp_ptr = msfr64.msfr_srcs; + else + tmp_ptr = CAST_USER_ADDR_T(msfr32.msfr_srcs); + + if (tmp_ptr != USER_ADDR_NULL && msfr.msfr_nsrcs > 0) { + tss = _MALLOC(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs, + M_TEMP, M_WAITOK | M_ZERO); + if (tss == NULL) { + IM6O_UNLOCK(imo); + return (ENOBUFS); + } + } + + /* + * Count number of sources in-mode at t0. + * If buffer space exists and remains, copy out source entries. 
+ */
+ nsrcs = msfr.msfr_nsrcs;
+ ncsrcs = 0;
+ ptss = tss;
+ RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
+ lims = (struct in6_msource *)ims;
+ if (lims->im6sl_st[0] == MCAST_UNDEFINED ||
+ lims->im6sl_st[0] != imf->im6f_st[0])
+ continue;
+ if (tss != NULL && nsrcs > 0) {
+ psin = (struct sockaddr_in6 *)ptss;
+ psin->sin6_family = AF_INET6;
+ psin->sin6_len = sizeof(struct sockaddr_in6);
+ psin->sin6_addr = lims->im6s_addr;
+ psin->sin6_port = 0;
+ --nsrcs;
+ ++ptss;
+ ++ncsrcs;
+ }
+ }
+
+ IM6O_UNLOCK(imo);
+
+ if (tss != NULL) {
+ error = copyout(tss, tmp_ptr,
+ sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
+ FREE(tss, M_TEMP);
+ if (error)
+ return (error);
+ }
+
+ msfr.msfr_nsrcs = ncsrcs;
+ if (IS_64BIT_PROCESS(current_proc())) {
+ msfr64.msfr_ifindex = msfr.msfr_ifindex;
+ msfr64.msfr_fmode = msfr.msfr_fmode;
+ msfr64.msfr_nsrcs = msfr.msfr_nsrcs;
+ memcpy(&msfr64.msfr_group, &msfr.msfr_group,
+ sizeof(struct sockaddr_storage));
+ error = sooptcopyout(sopt, &msfr64,
+ sizeof(struct __msfilterreq64));
+ } else {
+ msfr32.msfr_ifindex = msfr.msfr_ifindex;
+ msfr32.msfr_fmode = msfr.msfr_fmode;
+ msfr32.msfr_nsrcs = msfr.msfr_nsrcs;
+ memcpy(&msfr32.msfr_group, &msfr.msfr_group,
+ sizeof(struct sockaddr_storage));
+ error = sooptcopyout(sopt, &msfr32,
+ sizeof(struct __msfilterreq32));
+ }
+
+ return (error);
+}
+
+/*
+ * Return the IP multicast options in response to user getsockopt().
+ */
+int
+ip6_getmoptions(struct inpcb *inp, struct sockopt *sopt)
+{
+ struct ip6_moptions *im6o;
+ int error;
+ u_int optval;
+
+ im6o = inp->in6p_moptions;
+ /*
+ * If socket is neither of type SOCK_RAW nor SOCK_DGRAM,
+ * or is a divert socket, reject it.
+ */
+ if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
+ (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
+ inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) {
+ return (EOPNOTSUPP);
+ }
+
+ error = 0;
+ switch (sopt->sopt_name) {
+ case IPV6_MULTICAST_IF:
+ if (im6o != NULL)
+ IM6O_LOCK(im6o);
+ if (im6o == NULL || im6o->im6o_multicast_ifp == NULL) {
+ optval = 0;
+ } else {
+ optval = im6o->im6o_multicast_ifp->if_index;
+ }
+ if (im6o != NULL)
+ IM6O_UNLOCK(im6o);
+ error = sooptcopyout(sopt, &optval, sizeof(u_int));
+ break;
+
+ case IPV6_MULTICAST_HOPS:
+ if (im6o == NULL) {
+ optval = ip6_defmcasthlim;
+ } else {
+ IM6O_LOCK(im6o);
+ optval = im6o->im6o_multicast_hlim;
+ IM6O_UNLOCK(im6o);
+ }
+ error = sooptcopyout(sopt, &optval, sizeof(u_int));
+ break;
+
+ case IPV6_MULTICAST_LOOP:
+ if (im6o == NULL) {
+ optval = in6_mcast_loop; /* XXX VIMAGE */
+ } else {
+ IM6O_LOCK(im6o);
+ optval = im6o->im6o_multicast_loop;
+ IM6O_UNLOCK(im6o);
+ }
+ error = sooptcopyout(sopt, &optval, sizeof(u_int));
+ break;
+
+ case IPV6_MSFILTER:
+ if (im6o == NULL) {
+ error = EADDRNOTAVAIL;
+ } else {
+ error = in6p_get_source_filters(inp, sopt);
+ }
+ break;
+
+ default:
+ error = ENOPROTOOPT;
+ break;
+ }
+
+ return (error);
+}
+
+/*
+ * Look up the ifnet to use for a multicast group membership,
+ * given the address of an IPv6 group.
+ *
+ * This routine exists to support legacy IPv6 multicast applications.
+ *
+ * If inp is non-NULL and is bound to an interface, use this socket's
+ * inp_boundif for any required routing table lookup.
+ *
+ * If the route lookup fails, return NULL.
+ *
+ * FUTURE: Support multiple forwarding tables for IPv6.
+ *
+ * Returns NULL if no ifp could be found.
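+ *
+ * For illustration only, not part of this change: this is the path taken
+ * by a legacy join that names no interface, with the group address being
+ * a placeholder:
+ *
+ *	struct ipv6_mreq mreq;
+ *	memset(&mreq, 0, sizeof (mreq));
+ *	inet_pton(AF_INET6, "ff0e::101", &mreq.ipv6mr_multiaddr);
+ *	mreq.ipv6mr_interface = 0;
+ *	error = setsockopt(s, IPPROTO_IPV6, IPV6_JOIN_GROUP,
+ *	    &mreq, sizeof (mreq));
+ *
+ * With ipv6mr_interface set to 0, in6p_join_group() calls this routine
+ * and the interface is chosen by the route lookup below.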
+ */ +static struct ifnet * +in6p_lookup_mcast_ifp(const struct inpcb *in6p, + const struct sockaddr_in6 *gsin6) +{ + struct route_in6 ro6; + struct ifnet *ifp; + unsigned int ifscope = IFSCOPE_NONE; + + VERIFY(in6p == NULL || (in6p->inp_vflag & INP_IPV6)); + VERIFY(gsin6->sin6_family == AF_INET6); + if (IN6_IS_ADDR_MULTICAST(&gsin6->sin6_addr) == 0) + return NULL; + + if (in6p != NULL && (in6p->inp_flags & INP_BOUND_IF)) + ifscope = in6p->inp_boundif; + + ifp = NULL; + memset(&ro6, 0, sizeof(struct route_in6)); + memcpy(&ro6.ro_dst, gsin6, sizeof(struct sockaddr_in6)); + rtalloc_scoped_ign((struct route *)&ro6, 0, ifscope); + if (ro6.ro_rt != NULL) { + ifp = ro6.ro_rt->rt_ifp; + VERIFY(ifp != NULL); + rtfree(ro6.ro_rt); + } + + return (ifp); +} + +/* + * Since ipv6_mreq contains an ifindex and ip_mreq contains an AF_INET + * address, we need to lookup the AF_INET address when translating an + * ipv6_mreq structure into an ipmreq structure. + * This is used when userland performs multicast setsockopt() on AF_INET6 + * sockets with AF_INET multicast addresses (IPv6 v4 mapped addresses). + */ +static int +in6p_lookup_v4addr(struct ipv6_mreq *mreq, struct ip_mreq *v4mreq) +{ + struct ifnet *ifp; + struct ifaddr *ifa; + struct sockaddr_in *sin; + + ifnet_head_lock_shared(); + if (mreq->ipv6mr_interface > (unsigned int)if_index) { + ifnet_head_done(); + return (EADDRNOTAVAIL); + } else + ifp = ifindex2ifnet[mreq->ipv6mr_interface]; + ifnet_head_done(); + if (ifp == NULL) + return (EADDRNOTAVAIL); + ifa = ifa_ifpgetprimary(ifp, AF_INET); + if (ifa == NULL) + return (EADDRNOTAVAIL); + sin = (struct sockaddr_in *)ifa->ifa_addr; + v4mreq->imr_interface.s_addr = sin->sin_addr.s_addr; + IFA_REMREF(ifa); + + return (0); +} + +/* + * Join an IPv6 multicast group, possibly with a source. + * + * FIXME: The KAME use of the unspecified address (::) + * to join *all* multicast groups is currently unsupported. + */ +static int +in6p_join_group(struct inpcb *inp, struct sockopt *sopt) +{ + struct group_source_req gsr; + sockunion_t *gsa, *ssa; + struct ifnet *ifp; + struct in6_mfilter *imf; + struct ip6_moptions *imo; + struct in6_multi *inm = NULL; + struct in6_msource *lims = NULL; + size_t idx; + int error, is_new; + uint32_t scopeid = 0; + + ifp = NULL; + imf = NULL; + error = 0; + is_new = 0; + + memset(&gsr, 0, sizeof(struct group_source_req)); + gsa = (sockunion_t *)&gsr.gsr_group; + gsa->ss.ss_family = AF_UNSPEC; + ssa = (sockunion_t *)&gsr.gsr_source; + ssa->ss.ss_family = AF_UNSPEC; + + /* + * Chew everything into struct group_source_req. + * Overwrite the port field if present, as the sockaddr + * being copied in may be matched with a binary comparison. + * Ignore passed-in scope ID. 
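+ * (For illustration only, not part of this change: the RFC 3678 delta
+ * request being chewed here might be built in userland as follows, with
+ * "en0" and both addresses as placeholders:
+ *
+ *	struct group_source_req gsr;
+ *	struct sockaddr_in6 *grp = (struct sockaddr_in6 *)&gsr.gsr_group;
+ *	struct sockaddr_in6 *src = (struct sockaddr_in6 *)&gsr.gsr_source;
+ *	memset(&gsr, 0, sizeof (gsr));
+ *	gsr.gsr_interface = if_nametoindex("en0");
+ *	grp->sin6_family = AF_INET6;
+ *	grp->sin6_len = sizeof (*grp);
+ *	inet_pton(AF_INET6, "ff3e::8000:1", &grp->sin6_addr);
+ *	src->sin6_family = AF_INET6;
+ *	src->sin6_len = sizeof (*src);
+ *	inet_pton(AF_INET6, "2001:db8::1", &src->sin6_addr);
+ *	error = setsockopt(s, IPPROTO_IPV6, MCAST_JOIN_SOURCE_GROUP,
+ *	    &gsr, sizeof (gsr));
+ *
+ * Any port or scope ID set in gsr_group is overwritten below.)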
+ */ + switch (sopt->sopt_name) { + case IPV6_JOIN_GROUP: { + struct ipv6_mreq mreq; + struct sockaddr_in6 *gsin6; + + error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq), + sizeof(struct ipv6_mreq)); + if (error) + return (error); + if (IN6_IS_ADDR_V4MAPPED(&mreq.ipv6mr_multiaddr)) { + struct ip_mreq v4mreq; + struct sockopt v4sopt; + + v4mreq.imr_multiaddr.s_addr = + mreq.ipv6mr_multiaddr.s6_addr32[3]; + if (mreq.ipv6mr_interface == 0) + v4mreq.imr_interface.s_addr = INADDR_ANY; + else + error = in6p_lookup_v4addr(&mreq, &v4mreq); + if (error) + return (error); + v4sopt.sopt_dir = SOPT_SET; + v4sopt.sopt_level = sopt->sopt_level; + v4sopt.sopt_name = IP_ADD_MEMBERSHIP; + v4sopt.sopt_val = CAST_USER_ADDR_T(&v4mreq); + v4sopt.sopt_valsize = sizeof(v4mreq); + v4sopt.sopt_p = kernproc; + + return (inp_join_group(inp, &v4sopt)); + } + gsa->sin6.sin6_family = AF_INET6; + gsa->sin6.sin6_len = sizeof(struct sockaddr_in6); + gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr; + + gsin6 = &gsa->sin6; + + /* Only allow IPv6 multicast addresses */ + if (IN6_IS_ADDR_MULTICAST(&gsin6->sin6_addr) == 0) { + return (EINVAL); + } + + if (mreq.ipv6mr_interface == 0) { + ifp = in6p_lookup_mcast_ifp(inp, gsin6); + } else { + ifnet_head_lock_shared(); + if ((u_int)if_index < mreq.ipv6mr_interface) { + ifnet_head_done(); + return (EADDRNOTAVAIL); + } + ifp = ifindex2ifnet[mreq.ipv6mr_interface]; + ifnet_head_done(); + } + MLD_PRINTF(("%s: ipv6mr_interface = %d, ifp = %p\n", + __func__, mreq.ipv6mr_interface, ifp)); + break; + } + + case MCAST_JOIN_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + if (sopt->sopt_name == MCAST_JOIN_GROUP) { + error = sooptcopyin(sopt, &gsr, + sizeof(struct group_req), + sizeof(struct group_req)); + } else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { + error = sooptcopyin(sopt, &gsr, + sizeof(struct group_source_req), + sizeof(struct group_source_req)); + } + if (error) + return (error); + + if (gsa->sin6.sin6_family != AF_INET6 || + gsa->sin6.sin6_len != sizeof(struct sockaddr_in6)) + return (EINVAL); + + if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) { + if (ssa->sin6.sin6_family != AF_INET6 || + ssa->sin6.sin6_len != sizeof(struct sockaddr_in6)) + return (EINVAL); + if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr)) + return (EINVAL); + /* + * TODO: Validate embedded scope ID in source + * list entry against passed-in ifp, if and only + * if source list filter entry is iface or node local. + */ + in6_clearscope(&ssa->sin6.sin6_addr); + ssa->sin6.sin6_port = 0; + ssa->sin6.sin6_scope_id = 0; + } + + ifnet_head_lock_shared(); + if (gsr.gsr_interface == 0 || + (u_int)if_index < gsr.gsr_interface) { + ifnet_head_done(); + return (EADDRNOTAVAIL); + } + ifp = ifindex2ifnet[gsr.gsr_interface]; + ifnet_head_done(); + break; + + default: + MLD_PRINTF(("%s: unknown sopt_name %d\n", + __func__, sopt->sopt_name)); + return (EOPNOTSUPP); + break; + } + + if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) + return (EINVAL); + + if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) + return (EADDRNOTAVAIL); + + gsa->sin6.sin6_port = 0; + gsa->sin6.sin6_scope_id = 0; + + /* + * Always set the scope zone ID on memberships created from userland. + * Use the passed-in ifp to do this. + */ + (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, &scopeid); + /* + * Some addresses are not valid without an embedded scopeid. + * This check must be present because otherwise we will later hit + * a VERIFY() in in6_mc_join(). 
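+ * (For example, a link-scope group such as ff02::1:3 is meaningful only
+ * within a zone, so a join of it must resolve to an interface, either an
+ * explicit index from userland or the route lookup above; when no zone
+ * can be determined, scopeid remains 0 and the join fails with EINVAL.)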
+ */ + if ((IN6_IS_ADDR_MC_LINKLOCAL(&gsa->sin6.sin6_addr) || + IN6_IS_ADDR_MC_INTFACELOCAL(&gsa->sin6.sin6_addr)) && scopeid == 0) + return (EINVAL); + + imo = in6p_findmoptions(inp); + if (imo == NULL) + return (ENOMEM); + + IM6O_LOCK(imo); + idx = im6o_match_group(imo, ifp, &gsa->sa); + if (idx == (size_t)-1) { + is_new = 1; + } else { + inm = imo->im6o_membership[idx]; + imf = &imo->im6o_mfilters[idx]; + if (ssa->ss.ss_family != AF_UNSPEC) { + /* + * MCAST_JOIN_SOURCE_GROUP on an exclusive membership + * is an error. On an existing inclusive membership, + * it just adds the source to the filter list. + */ + if (imf->im6f_st[1] != MCAST_INCLUDE) { + error = EINVAL; + goto out_imo_locked; + } + /* + * Throw out duplicates. + * + * XXX FIXME: This makes a naive assumption that + * even if entries exist for *ssa in this imf, + * they will be rejected as dupes, even if they + * are not valid in the current mode (in-mode). + * + * in6_msource is transactioned just as for anything + * else in SSM -- but note naive use of in6m_graft() + * below for allocating new filter entries. + * + * This is only an issue if someone mixes the + * full-state SSM API with the delta-based API, + * which is discouraged in the relevant RFCs. + */ + lims = im6o_match_source(imo, idx, &ssa->sa); + if (lims != NULL /*&& + lims->im6sl_st[1] == MCAST_INCLUDE*/) { + error = EADDRNOTAVAIL; + goto out_imo_locked; + } + } else { + /* + * MCAST_JOIN_GROUP on an existing exclusive + * membership is an error; return EADDRINUSE + * to preserve 4.4BSD API idempotence, and + * avoid tedious detour to code below. + * NOTE: This is bending RFC 3678 a bit. + * + * On an existing inclusive membership, this is also + * an error; if you want to change filter mode, + * you must use the userland API setsourcefilter(). + * XXX We don't reject this for imf in UNDEFINED + * state at t1, because allocation of a filter + * is atomic with allocation of a membership. + */ + error = EINVAL; + /* See comments above for EADDRINUSE */ + if (imf->im6f_st[1] == MCAST_EXCLUDE) + error = EADDRINUSE; + goto out_imo_locked; + } + } + + /* + * Begin state merge transaction at socket layer. + */ + + if (is_new) { + if (imo->im6o_num_memberships == imo->im6o_max_memberships) { + error = im6o_grow(imo, 0); + if (error) + goto out_imo_locked; + } + /* + * Allocate the new slot upfront so we can deal with + * grafting the new source filter in same code path + * as for join-source on existing membership. + */ + idx = imo->im6o_num_memberships; + imo->im6o_membership[idx] = NULL; + imo->im6o_num_memberships++; + VERIFY(imo->im6o_mfilters != NULL); + imf = &imo->im6o_mfilters[idx]; + VERIFY(RB_EMPTY(&imf->im6f_sources)); + } + + /* + * Graft new source into filter list for this inpcb's + * membership of the group. The in6_multi may not have + * been allocated yet if this is a new membership, however, + * the in_mfilter slot will be allocated and must be initialized. + * + * Note: Grafting of exclusive mode filters doesn't happen + * in this path. + * XXX: Should check for non-NULL lims (node exists but may + * not be in-mode) for interop with full-state API. 
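+ * (Consequence for callers of the delta API: repeating
+ * MCAST_JOIN_SOURCE_GROUP on an existing INCLUDE-mode membership grafts
+ * one more source per call, while the same request against an
+ * EXCLUDE-mode membership is rejected with EINVAL by the match logic
+ * above.)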
+ */
+ if (ssa->ss.ss_family != AF_UNSPEC) {
+ /* Membership starts in IN mode */
+ if (is_new) {
+ MLD_PRINTF(("%s: new join w/source\n", __func__));
+ im6f_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE);
+ } else {
+ MLD_PRINTF(("%s: %s source\n", __func__, "allow"));
+ }
+ lims = im6f_graft(imf, MCAST_INCLUDE, &ssa->sin6);
+ if (lims == NULL) {
+ MLD_PRINTF(("%s: merge imf state failed\n",
+ __func__));
+ error = ENOMEM;
+ goto out_im6o_free;
+ }
+ } else {
+ /* No address specified; Membership starts in EX mode */
+ if (is_new) {
+ MLD_PRINTF(("%s: new join w/o source\n", __func__));
+ im6f_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE);
+ }
+ }
+
+ /*
+ * Begin state merge transaction at MLD layer.
+ */
+
+ if (is_new) {
+ VERIFY(inm == NULL);
+ error = in6_mc_join(ifp, &gsa->sin6.sin6_addr, imf, &inm, 0);
+ VERIFY(inm != NULL || error != 0);
+ if (error)
+ goto out_im6o_free;
+ imo->im6o_membership[idx] = inm; /* from in6_mc_join() */
+ } else {
+ MLD_PRINTF(("%s: merge inm state\n", __func__));
+ IN6M_LOCK(inm);
+ error = in6m_merge(inm, imf);
+ if (error) {
+ MLD_PRINTF(("%s: failed to merge inm state\n",
+ __func__));
+ IN6M_UNLOCK(inm);
+ goto out_im6f_rollback;
+ }
+ MLD_PRINTF(("%s: doing mld downcall\n", __func__));
+ error = mld_change_state(inm, 0);
+ IN6M_UNLOCK(inm);
+ if (error) {
+ MLD_PRINTF(("%s: failed mld downcall\n",
+ __func__));
+ goto out_im6f_rollback;
+ }
+ }
+
+out_im6f_rollback:
+ if (error) {
+ im6f_rollback(imf);
+ if (is_new)
+ im6f_purge(imf);
+ else
+ im6f_reap(imf);
+ } else {
+ im6f_commit(imf);
+ }
+
+out_im6o_free:
+ if (error && is_new) {
+ VERIFY(inm == NULL);
+ imo->im6o_membership[idx] = NULL;
+ --imo->im6o_num_memberships;
+ }
+
+out_imo_locked:
+ IM6O_UNLOCK(imo);
+ IM6O_REMREF(imo); /* from in6p_findmoptions() */
+ return (error);
+}
+
+/*
+ * Leave an IPv6 multicast group on an inpcb, possibly with a source.
+ */
+static int
+in6p_leave_group(struct inpcb *inp, struct sockopt *sopt)
+{
+ struct ipv6_mreq mreq;
+ struct group_source_req gsr;
+ sockunion_t *gsa, *ssa;
+ struct ifnet *ifp;
+ struct in6_mfilter *imf;
+ struct ip6_moptions *imo;
+ struct in6_msource *ims;
+ struct in6_multi *inm = NULL;
+ uint32_t ifindex = 0;
+ size_t idx;
+ int error, is_final;
+
+ ifp = NULL;
+ error = 0;
+ is_final = 1;
+
+ memset(&gsr, 0, sizeof(struct group_source_req));
+ gsa = (sockunion_t *)&gsr.gsr_group;
+ gsa->ss.ss_family = AF_UNSPEC;
+ ssa = (sockunion_t *)&gsr.gsr_source;
+ ssa->ss.ss_family = AF_UNSPEC;
+
+ /*
+ * Chew everything passed in up into a struct group_source_req
+ * as that is easier to process.
+ * Note: Any embedded scope ID in the multicast group passed
+ * in by userland is ignored; the interface index is the recommended
+ * mechanism to specify an interface; see below.
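+ * (For illustration only, not part of this change, the mirror image of
+ * a join, with "en0" as a placeholder:
+ *
+ *	struct group_req gr;
+ *	memset(&gr, 0, sizeof (gr));
+ *	gr.gr_interface = if_nametoindex("en0");
+ *	memcpy(&gr.gr_group, &grp, sizeof (grp));
+ *	error = setsockopt(s, IPPROTO_IPV6, MCAST_LEAVE_GROUP,
+ *	    &gr, sizeof (gr));
+ *
+ * where grp is the sockaddr_in6 used at join time.)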
+ */ + switch (sopt->sopt_name) { + case IPV6_LEAVE_GROUP: { + struct sockaddr_in6 *gsin6; + + error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq), + sizeof(struct ipv6_mreq)); + if (error) + return (error); + if (IN6_IS_ADDR_V4MAPPED(&mreq.ipv6mr_multiaddr)) { + struct ip_mreq v4mreq; + struct sockopt v4sopt; + + v4mreq.imr_multiaddr.s_addr = + mreq.ipv6mr_multiaddr.s6_addr32[3]; + if (mreq.ipv6mr_interface == 0) + v4mreq.imr_interface.s_addr = INADDR_ANY; + else + error = in6p_lookup_v4addr(&mreq, &v4mreq); + if (error) + return (error); + v4sopt.sopt_dir = SOPT_SET; + v4sopt.sopt_level = sopt->sopt_level; + v4sopt.sopt_name = IP_DROP_MEMBERSHIP; + v4sopt.sopt_val = CAST_USER_ADDR_T(&v4mreq); + v4sopt.sopt_valsize = sizeof(v4mreq); + v4sopt.sopt_p = kernproc; + + return (inp_leave_group(inp, &v4sopt)); + } + gsa->sin6.sin6_family = AF_INET6; + gsa->sin6.sin6_len = sizeof(struct sockaddr_in6); + gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr; + gsa->sin6.sin6_port = 0; + gsa->sin6.sin6_scope_id = 0; + ifindex = mreq.ipv6mr_interface; + gsin6 = &gsa->sin6; + /* Only allow IPv6 multicast addresses */ + if (IN6_IS_ADDR_MULTICAST(&gsin6->sin6_addr) == 0) { + return (EINVAL); + } + break; + } + + case MCAST_LEAVE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + if (sopt->sopt_name == MCAST_LEAVE_GROUP) { + error = sooptcopyin(sopt, &gsr, + sizeof(struct group_req), + sizeof(struct group_req)); + } else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { + error = sooptcopyin(sopt, &gsr, + sizeof(struct group_source_req), + sizeof(struct group_source_req)); + } + if (error) + return (error); + + if (gsa->sin6.sin6_family != AF_INET6 || + gsa->sin6.sin6_len != sizeof(struct sockaddr_in6)) + return (EINVAL); + if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) { + if (ssa->sin6.sin6_family != AF_INET6 || + ssa->sin6.sin6_len != sizeof(struct sockaddr_in6)) + return (EINVAL); + if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr)) + return (EINVAL); + /* + * TODO: Validate embedded scope ID in source + * list entry against passed-in ifp, if and only + * if source list filter entry is iface or node local. + */ + in6_clearscope(&ssa->sin6.sin6_addr); + } + gsa->sin6.sin6_port = 0; + gsa->sin6.sin6_scope_id = 0; + ifindex = gsr.gsr_interface; + break; + + default: + MLD_PRINTF(("%s: unknown sopt_name %d\n", + __func__, sopt->sopt_name)); + return (EOPNOTSUPP); + break; + } + + if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) + return (EINVAL); + + /* + * Validate interface index if provided. If no interface index + * was provided separately, attempt to look the membership up + * from the default scope as a last resort to disambiguate + * the membership we are being asked to leave. + * XXX SCOPE6 lock potentially taken here. + */ + if (ifindex != 0) { + ifnet_head_lock_shared(); + if ((u_int)if_index < ifindex) { + ifnet_head_done(); + return (EADDRNOTAVAIL); + } + ifp = ifindex2ifnet[ifindex]; + ifnet_head_done(); + if (ifp == NULL) + return (EADDRNOTAVAIL); + (void) in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); + } else { + error = sa6_embedscope(&gsa->sin6, ip6_use_defzone); + if (error) + return (EADDRNOTAVAIL); + /* + * Some badly behaved applications don't pass an ifindex + * or a scope ID, which is an API violation. In this case, + * perform a lookup as per a v6 join. + * + * XXX For now, stomp on zone ID for the corner case. + * This is not the 'KAME way', but we need to see the ifp + * directly until such time as this implementation is + * refactored, assuming the scope IDs are the way to go. 
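+ * (The recovery below relies on the KAME convention of embedding the
+ * zone in the second 16-bit word of a scoped address: the
+ * kernel-internal form ff02:4::1 denotes ff02::1 in zone 4, so
+ * ntohs(s6_addr16[1]) yields ifindex 4.)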
+ */
+ ifindex = ntohs(gsa->sin6.sin6_addr.s6_addr16[1]);
+ if (ifindex == 0) {
+ MLD_PRINTF(("%s: warning: no ifindex, looking up "
+ "ifp for group %s.\n", __func__,
+ ip6_sprintf(&gsa->sin6.sin6_addr)));
+ ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6);
+ } else {
+ ifnet_head_lock_shared();
+ ifp = ifindex2ifnet[ifindex];
+ ifnet_head_done();
+ }
+ if (ifp == NULL)
+ return (EADDRNOTAVAIL);
+ }
+
+ VERIFY(ifp != NULL);
+ MLD_PRINTF(("%s: ifp = %p\n", __func__, ifp));
+
+ /*
+ * Find the membership in the membership array.
+ */
+ imo = in6p_findmoptions(inp);
+ if (imo == NULL)
+ return (ENOMEM);
+
+ IM6O_LOCK(imo);
+ idx = im6o_match_group(imo, ifp, &gsa->sa);
+ if (idx == (size_t)-1) {
+ error = EADDRNOTAVAIL;
+ goto out_locked;
+ }
+ inm = imo->im6o_membership[idx];
+ imf = &imo->im6o_mfilters[idx];
+
+ if (ssa->ss.ss_family != AF_UNSPEC)
+ is_final = 0;
+
+ /*
+ * Begin state merge transaction at socket layer.
+ */
+
+ /*
+ * If we were instructed only to leave a given source, do so.
+ * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships.
+ */
+ if (is_final) {
+ im6f_leave(imf);
+ } else {
+ if (imf->im6f_st[0] == MCAST_EXCLUDE) {
+ error = EADDRNOTAVAIL;
+ goto out_locked;
+ }
+ ims = im6o_match_source(imo, idx, &ssa->sa);
+ if (ims == NULL) {
+ MLD_PRINTF(("%s: source %s %spresent\n", __func__,
+ ip6_sprintf(&ssa->sin6.sin6_addr),
+ "not "));
+ error = EADDRNOTAVAIL;
+ goto out_locked;
+ }
+ MLD_PRINTF(("%s: %s source\n", __func__, "block"));
+ error = im6f_prune(imf, &ssa->sin6);
+ if (error) {
+ MLD_PRINTF(("%s: merge imf state failed\n",
+ __func__));
+ goto out_locked;
+ }
+ }
+
+ /*
+ * Begin state merge transaction at MLD layer.
+ */
+
+ if (is_final) {
+ /*
+ * Give up the multicast address record to which
+ * the membership points. Reference held in im6o
+ * will be released below.
+ */
+ (void) in6_mc_leave(inm, imf);
+ } else {
+ MLD_PRINTF(("%s: merge inm state\n", __func__));
+ IN6M_LOCK(inm);
+ error = in6m_merge(inm, imf);
+ if (error) {
+ MLD_PRINTF(("%s: failed to merge inm state\n",
+ __func__));
+ IN6M_UNLOCK(inm);
+ goto out_im6f_rollback;
+ }
+
+ MLD_PRINTF(("%s: doing mld downcall\n", __func__));
+ error = mld_change_state(inm, 0);
+ if (error) {
+ MLD_PRINTF(("%s: failed mld downcall\n", __func__));
+ }
+ IN6M_UNLOCK(inm);
+ }
+
+out_im6f_rollback:
+ if (error)
+ im6f_rollback(imf);
+ else
+ im6f_commit(imf);
+
+ im6f_reap(imf);
+
+ if (is_final) {
+ /* Remove the gap in the membership array. */
+ VERIFY(inm == imo->im6o_membership[idx]);
+ imo->im6o_membership[idx] = NULL;
+ IN6M_REMREF(inm);
+ for (++idx; idx < imo->im6o_num_memberships; ++idx) {
+ imo->im6o_membership[idx-1] = imo->im6o_membership[idx];
+ imo->im6o_mfilters[idx-1] = imo->im6o_mfilters[idx];
+ }
+ imo->im6o_num_memberships--;
+ }
+
+out_locked:
+ IM6O_UNLOCK(imo);
+ IM6O_REMREF(imo); /* from in6p_findmoptions() */
+ return (error);
+}
+
+/*
+ * Select the interface for transmitting IPv6 multicast datagrams.
+ *
+ * The interface is passed to this socket option as a u_int interface
+ * index; it must refer to an existing, multicast-capable ifnet.
+ * When no interface is selected, one is chosen for every send.
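+ *
+ * For illustration only, not part of this change, with "en0" as a
+ * placeholder:
+ *
+ *	u_int idx = if_nametoindex("en0");
+ *	error = setsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_IF,
+ *	    &idx, sizeof (idx));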
+ */ +static int +in6p_set_multicast_if(struct inpcb *inp, struct sockopt *sopt) +{ + struct ifnet *ifp; + struct ip6_moptions *imo; + u_int ifindex; + int error; + + if (sopt->sopt_valsize != sizeof(u_int)) + return (EINVAL); + + error = sooptcopyin(sopt, &ifindex, sizeof(u_int), sizeof(u_int)); + if (error) + return (error); + + ifnet_head_lock_shared(); + if ((u_int)if_index < ifindex) { + ifnet_head_done(); + return (EINVAL); + } + + ifp = ifindex2ifnet[ifindex]; + ifnet_head_done(); + if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) + return (EADDRNOTAVAIL); + + imo = in6p_findmoptions(inp); + if (imo == NULL) + return (ENOMEM); + + IM6O_LOCK(imo); + imo->im6o_multicast_ifp = ifp; + IM6O_UNLOCK(imo); + IM6O_REMREF(imo); /* from in6p_findmoptions() */ + + return (0); +} + +/* + * Atomically set source filters on a socket for an IPv6 multicast group. + * + */ +static int +in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt) +{ + struct __msfilterreq64 msfr, msfr64; + struct __msfilterreq32 msfr32; + sockunion_t *gsa; + struct ifnet *ifp; + struct in6_mfilter *imf; + struct ip6_moptions *imo; + struct in6_multi *inm; + size_t idx; + int error; + user_addr_t tmp_ptr; + + if (IS_64BIT_PROCESS(current_proc())) { + error = sooptcopyin(sopt, &msfr64, + sizeof(struct __msfilterreq64), + sizeof(struct __msfilterreq64)); + if (error) + return (error); + /* we never use msfr.msfr_srcs; */ + memcpy(&msfr, &msfr64, sizeof(msfr)); + } else { + error = sooptcopyin(sopt, &msfr32, + sizeof(struct __msfilterreq32), + sizeof(struct __msfilterreq32)); + if (error) + return (error); + /* we never use msfr.msfr_srcs; */ + memcpy(&msfr, &msfr32, sizeof(msfr)); + } + + if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc) + return (ENOBUFS); + + if (msfr.msfr_fmode != MCAST_EXCLUDE && + msfr.msfr_fmode != MCAST_INCLUDE) + return (EINVAL); + + if (msfr.msfr_group.ss_family != AF_INET6 || + msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6)) + return (EINVAL); + + gsa = (sockunion_t *)&msfr.msfr_group; + if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr)) + return (EINVAL); + + gsa->sin6.sin6_port = 0; /* ignore port */ + + ifnet_head_lock_shared(); + if (msfr.msfr_ifindex == 0 || (u_int)if_index < msfr.msfr_ifindex) { + ifnet_head_done(); + return (EADDRNOTAVAIL); + } + ifp = ifindex2ifnet[msfr.msfr_ifindex]; + ifnet_head_done(); + if (ifp == NULL) + return (EADDRNOTAVAIL); + + (void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL); + + /* + * Take the INP write lock. + * Check if this socket is a member of this group. + */ + imo = in6p_findmoptions(inp); + if (imo == NULL) + return (ENOMEM); + + IM6O_LOCK(imo); + idx = im6o_match_group(imo, ifp, &gsa->sa); + if (idx == (size_t)-1 || imo->im6o_mfilters == NULL) { + error = EADDRNOTAVAIL; + goto out_imo_locked; + } + inm = imo->im6o_membership[idx]; + imf = &imo->im6o_mfilters[idx]; + + /* + * Begin state merge transaction at socket layer. + */ + + imf->im6f_st[1] = msfr.msfr_fmode; + + /* + * Apply any new source filters, if present. + * Make a copy of the user-space source vector so + * that we may copy them with a single copyin. This + * allows us to deal with page faults up-front. 
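+ * (The vector arrives as msfr_srcs, a userland array of msfr_nsrcs
+ * struct sockaddr_storage entries; the matching IPV6_MSFILTER
+ * setsockopt(2) call mirrors the getsockopt(2) sketch in
+ * in6p_get_source_filters() above, with msfr_fmode set to MCAST_INCLUDE
+ * or MCAST_EXCLUDE and msfr_srcs pointing at the caller's array.)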
+ */
+ if (msfr.msfr_nsrcs > 0) {
+ struct in6_msource *lims;
+ struct sockaddr_in6 *psin;
+ struct sockaddr_storage *kss, *pkss;
+ unsigned int i;
+
+ if (IS_64BIT_PROCESS(current_proc()))
+ tmp_ptr = msfr64.msfr_srcs;
+ else
+ tmp_ptr = CAST_USER_ADDR_T(msfr32.msfr_srcs);
+
+ MLD_PRINTF(("%s: loading %lu source list entries\n",
+ __func__, (unsigned long)msfr.msfr_nsrcs));
+ kss = _MALLOC(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
+ M_TEMP, M_WAITOK);
+ if (kss == NULL) {
+ error = ENOMEM;
+ goto out_imo_locked;
+ }
+
+ error = copyin(tmp_ptr, kss,
+ sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
+ if (error) {
+ FREE(kss, M_TEMP);
+ goto out_imo_locked;
+ }
+
+ /*
+ * Mark all source filters as UNDEFINED at t1.
+ * Restore new group filter mode, as im6f_leave()
+ * will set it to INCLUDE.
+ */
+ im6f_leave(imf);
+ imf->im6f_st[1] = msfr.msfr_fmode;
+
+ /*
+ * Update socket layer filters at t1, lazy-allocating
+ * new entries. This saves a bunch of memory at the
+ * cost of one RB_FIND() per source entry; duplicate
+ * entries in the msfr_nsrcs vector are ignored.
+ * If we encounter an error, rollback transaction.
+ *
+ * XXX This too could be replaced with a set-symmetric
+ * difference like loop to avoid walking from root
+ * every time, as the key space is common.
+ */
+ for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) {
+ psin = (struct sockaddr_in6 *)pkss;
+ if (psin->sin6_family != AF_INET6) {
+ error = EAFNOSUPPORT;
+ break;
+ }
+ if (psin->sin6_len != sizeof(struct sockaddr_in6)) {
+ error = EINVAL;
+ break;
+ }
+ if (IN6_IS_ADDR_MULTICAST(&psin->sin6_addr)) {
+ error = EINVAL;
+ break;
+ }
+ /*
+ * TODO: Validate embedded scope ID in source
+ * list entry against passed-in ifp, if and only
+ * if source list filter entry is iface or node local.
+ */
+ in6_clearscope(&psin->sin6_addr);
+ error = im6f_get_source(imf, psin, &lims);
+ if (error)
+ break;
+ lims->im6sl_st[1] = imf->im6f_st[1];
+ }
+ FREE(kss, M_TEMP);
+ }
+
+ if (error)
+ goto out_im6f_rollback;
+
+ /*
+ * Begin state merge transaction at MLD layer.
+ */
+ IN6M_LOCK(inm);
+ MLD_PRINTF(("%s: merge inm state\n", __func__));
+ error = in6m_merge(inm, imf);
+ if (error) {
+ MLD_PRINTF(("%s: failed to merge inm state\n", __func__));
+ IN6M_UNLOCK(inm);
+ goto out_im6f_rollback;
+ }
+
+ MLD_PRINTF(("%s: doing mld downcall\n", __func__));
+ error = mld_change_state(inm, 0);
+ IN6M_UNLOCK(inm);
+#if MLD_DEBUG
+ if (error)
+ MLD_PRINTF(("%s: failed mld downcall\n", __func__));
+#endif
+
+out_im6f_rollback:
+ if (error)
+ im6f_rollback(imf);
+ else
+ im6f_commit(imf);
+
+ im6f_reap(imf);
+
+out_imo_locked:
+ IM6O_UNLOCK(imo);
+ IM6O_REMREF(imo); /* from in6p_findmoptions() */
+
+ return (error);
+}
+
+/*
+ * Set the IP multicast options in response to user setsockopt().
+ *
+ * Many of the socket options handled in this function duplicate the
+ * functionality of socket options in the regular unicast API. However,
+ * it is not possible to merge the duplicate code, because the idempotence
+ * of the IPv6 multicast part of the BSD Sockets API must be preserved;
+ * the effects of these options must be treated as separate and distinct.
+ */
+int
+ip6_setmoptions(struct inpcb *inp, struct sockopt *sopt)
+{
+ struct ip6_moptions *im6o;
+ int error;
+
+ error = 0;
+
+ /*
+ * If socket is neither of type SOCK_RAW nor SOCK_DGRAM,
+ * or is a divert socket, reject it.
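+ *
+ * For illustration only, not part of this change, two of the scalar
+ * options handled below; note the size expectations enforced by the
+ * handler:
+ *
+ *	int hlim = 32;
+ *	u_int loop = 0;
+ *	error = setsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_HOPS,
+ *	    &hlim, sizeof (hlim));
+ *	error = setsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_LOOP,
+ *	    &loop, sizeof (loop));
+ *
+ * IPV6_MULTICAST_HOPS takes an int in the range -1..255 (-1 restores
+ * the default) and IPV6_MULTICAST_LOOP takes a u_int of 0 or 1.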
+ */ + if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT || + (inp->inp_socket->so_proto->pr_type != SOCK_RAW && + inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) + return (EOPNOTSUPP); + + switch (sopt->sopt_name) { + case IPV6_MULTICAST_IF: + error = in6p_set_multicast_if(inp, sopt); + break; + + case IPV6_MULTICAST_HOPS: { + int hlim; + + if (sopt->sopt_valsize != sizeof(int)) { + error = EINVAL; + break; + } + error = sooptcopyin(sopt, &hlim, sizeof(hlim), sizeof(int)); + if (error) + break; + if (hlim < -1 || hlim > 255) { + error = EINVAL; + break; + } else if (hlim == -1) { + hlim = ip6_defmcasthlim; + } + im6o = in6p_findmoptions(inp); + if (im6o == NULL) { + error = ENOMEM; + break; + } + IM6O_LOCK(im6o); + im6o->im6o_multicast_hlim = hlim; + IM6O_UNLOCK(im6o); + IM6O_REMREF(im6o); /* from in6p_findmoptions() */ + break; + } + + case IPV6_MULTICAST_LOOP: { + u_int loop; + + /* + * Set the loopback flag for outgoing multicast packets. + * Must be zero or one. + */ + if (sopt->sopt_valsize != sizeof(u_int)) { + error = EINVAL; + break; + } + error = sooptcopyin(sopt, &loop, sizeof(u_int), sizeof(u_int)); + if (error) + break; + if (loop > 1) { + error = EINVAL; + break; + } + im6o = in6p_findmoptions(inp); + if (im6o == NULL) { + error = ENOMEM; + break; + } + IM6O_LOCK(im6o); + im6o->im6o_multicast_loop = loop; + IM6O_UNLOCK(im6o); + IM6O_REMREF(im6o); /* from in6p_findmoptions() */ + break; + } + + case IPV6_JOIN_GROUP: + case MCAST_JOIN_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + error = in6p_join_group(inp, sopt); + break; + + case IPV6_LEAVE_GROUP: + case MCAST_LEAVE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + error = in6p_leave_group(inp, sopt); + break; + + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + error = in6p_block_unblock_source(inp, sopt); + break; + + case IPV6_MSFILTER: + error = in6p_set_source_filters(inp, sopt); + break; + + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} +/* + * Expose MLD's multicast filter mode and source list(s) to userland, + * keyed by (ifindex, group). + * The filter mode is written out as a uint32_t, followed by + * 0..n of struct in6_addr. + * For use by ifmcstat(8). + */ +static int +sysctl_ip6_mcast_filters SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp) + + struct in6_addr mcaddr; + struct in6_addr src; + struct ifnet *ifp; + struct in6_multi *inm; + struct in6_multistep step; + struct ip6_msource *ims; + int *name; + int retval = 0; + u_int namelen; + uint32_t fmode, ifindex; + + name = (int *)arg1; + namelen = arg2; + + if (req->newptr != USER_ADDR_NULL) + return (EPERM); + + /* int: ifindex + 4 * 32 bits of IPv6 address */ + if (namelen != 5) + return (EINVAL); + + ifindex = name[0]; + ifnet_head_lock_shared(); + if (ifindex <= 0 || ifindex > (u_int)if_index) { + MLD_PRINTF(("%s: ifindex %u out of range\n", + __func__, ifindex)); + ifnet_head_done(); + return (ENOENT); + } + + memcpy(&mcaddr, &name[1], sizeof(struct in6_addr)); + if (!IN6_IS_ADDR_MULTICAST(&mcaddr)) { + MLD_PRINTF(("%s: group %s is not multicast\n", + __func__, ip6_sprintf(&mcaddr))); + ifnet_head_done(); + return (EINVAL); + } + + ifp = ifindex2ifnet[ifindex]; + ifnet_head_done(); + if (ifp == NULL) { + MLD_PRINTF(("%s: no ifp for ifindex %u\n", __func__, ifindex)); + return (ENOENT); + } + /* + * Internal MLD lookups require that scope/zone ID is set. 
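+ * The in6_multihead entries store group addresses with the zone
+ * embedded, so embed it here as well before comparing below. (The MIB
+ * key validated above is five ints: the ifindex followed by the group
+ * address as four 32-bit words; ifmcstat(8) is the intended consumer.)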
+ */ + (void)in6_setscope(&mcaddr, ifp, NULL); + + in6_multihead_lock_shared(); + IN6_FIRST_MULTI(step, inm); + while (inm != NULL) { + IN6M_LOCK(inm); + if (inm->in6m_ifp != ifp) + goto next; + + if (!IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, &mcaddr)) + goto next; + + fmode = inm->in6m_st[1].iss_fmode; + retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t)); + if (retval != 0) { + IN6M_UNLOCK(inm); + break; /* abort */ + } + RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) { + MLD_PRINTF(("%s: visit node %p\n", __func__, ims)); + /* + * Only copy-out sources which are in-mode. + */ + if (fmode != im6s_get_mode(inm, ims, 1)) { + MLD_PRINTF(("%s: skip non-in-mode\n", + __func__)); + continue; /* process next source */ + } + src = ims->im6s_addr; + retval = SYSCTL_OUT(req, &src, sizeof(struct in6_addr)); + if (retval != 0) + break; /* process next inm */ + } +next: + IN6M_UNLOCK(inm); + IN6_NEXT_MULTI(step, inm); + } + in6_multihead_lock_done(); + + return (retval); +} + +void +in6_multi_init(void) +{ + PE_parse_boot_argn("ifa_debug", &in6m_debug, sizeof (in6m_debug)); + + /* Setup lock group and attribute for in6_multihead */ + in6_multihead_lock_grp_attr = lck_grp_attr_alloc_init(); + in6_multihead_lock_grp = lck_grp_alloc_init("in6_multihead", + in6_multihead_lock_grp_attr); + in6_multihead_lock_attr = lck_attr_alloc_init(); + lck_rw_init(&in6_multihead_lock, in6_multihead_lock_grp, + in6_multihead_lock_attr); + + lck_mtx_init(&in6m_trash_lock, in6_multihead_lock_grp, + in6_multihead_lock_attr); + TAILQ_INIT(&in6m_trash_head); + + in6m_size = (in6m_debug == 0) ? sizeof (struct in6_multi) : + sizeof (struct in6_multi_dbg); + in6m_zone = zinit(in6m_size, IN6M_ZONE_MAX * in6m_size, + 0, IN6M_ZONE_NAME); + if (in6m_zone == NULL) { + panic("%s: failed allocating %s", __func__, IN6M_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(in6m_zone, Z_EXPAND, TRUE); + + imm_size = sizeof (struct in6_multi_mship); + imm_zone = zinit(imm_size, IMM_ZONE_MAX * imm_size, 0, IMM_ZONE_NAME); + if (imm_zone == NULL) { + panic("%s: failed allocating %s", __func__, IMM_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(imm_zone, Z_EXPAND, TRUE); + + ip6ms_size = sizeof (struct ip6_msource); + ip6ms_zone = zinit(ip6ms_size, IP6MS_ZONE_MAX * ip6ms_size, + 0, IP6MS_ZONE_NAME); + if (ip6ms_zone == NULL) { + panic("%s: failed allocating %s", __func__, IP6MS_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(ip6ms_zone, Z_EXPAND, TRUE); + + in6ms_size = sizeof (struct in6_msource); + in6ms_zone = zinit(in6ms_size, IN6MS_ZONE_MAX * in6ms_size, + 0, IN6MS_ZONE_NAME); + if (in6ms_zone == NULL) { + panic("%s: failed allocating %s", __func__, IN6MS_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(in6ms_zone, Z_EXPAND, TRUE); +} + +static struct in6_multi * +in6_multi_alloc(int how) +{ + struct in6_multi *in6m; + + in6m = (how == M_WAITOK) ? 
zalloc(in6m_zone) : + zalloc_noblock(in6m_zone); + if (in6m != NULL) { + bzero(in6m, in6m_size); + lck_mtx_init(&in6m->in6m_lock, in6_multihead_lock_grp, + in6_multihead_lock_attr); + in6m->in6m_debug |= IFD_ALLOC; + if (in6m_debug != 0) { + in6m->in6m_debug |= IFD_DEBUG; + in6m->in6m_trace = in6m_trace; + } + } + return (in6m); +} + +static void +in6_multi_free(struct in6_multi *in6m) +{ + IN6M_LOCK(in6m); + if (in6m->in6m_debug & IFD_ATTACHED) { + panic("%s: attached in6m=%p is being freed", __func__, in6m); + /* NOTREACHED */ + } else if (in6m->in6m_ifma != NULL) { + panic("%s: ifma not NULL for in6m=%p", __func__, in6m); + /* NOTREACHED */ + } else if (!(in6m->in6m_debug & IFD_ALLOC)) { + panic("%s: in6m %p cannot be freed", __func__, in6m); + /* NOTREACHED */ + } else if (in6m->in6m_refcount != 0) { + panic("%s: non-zero refcount in6m=%p", __func__, in6m); + /* NOTREACHED */ + } else if (in6m->in6m_reqcnt != 0) { + panic("%s: non-zero reqcnt in6m=%p", __func__, in6m); + /* NOTREACHED */ + } + + /* Free any pending MLDv2 state-change records */ + IF_DRAIN(&in6m->in6m_scq); + + in6m->in6m_debug &= ~IFD_ALLOC; + if ((in6m->in6m_debug & (IFD_DEBUG | IFD_TRASHED)) == + (IFD_DEBUG | IFD_TRASHED)) { + lck_mtx_lock(&in6m_trash_lock); + TAILQ_REMOVE(&in6m_trash_head, (struct in6_multi_dbg *)in6m, + in6m_trash_link); + lck_mtx_unlock(&in6m_trash_lock); + in6m->in6m_debug &= ~IFD_TRASHED; + } + IN6M_UNLOCK(in6m); + + lck_mtx_destroy(&in6m->in6m_lock, in6_multihead_lock_grp); + zfree(in6m_zone, in6m); +} + +static void +in6_multi_attach(struct in6_multi *in6m) +{ + in6_multihead_lock_assert(LCK_RW_ASSERT_EXCLUSIVE); + IN6M_LOCK_ASSERT_HELD(in6m); + + if (in6m->in6m_debug & IFD_ATTACHED) { + panic("%s: Attempt to attach an already attached in6m=%p", + __func__, in6m); + /* NOTREACHED */ + } else if (in6m->in6m_debug & IFD_TRASHED) { + panic("%s: Attempt to reattach a detached in6m=%p", + __func__, in6m); + /* NOTREACHED */ + } + + in6m->in6m_reqcnt++; + VERIFY(in6m->in6m_reqcnt == 1); + IN6M_ADDREF_LOCKED(in6m); + in6m->in6m_debug |= IFD_ATTACHED; + /* + * Reattach case: If debugging is enabled, take it + * out of the trash list and clear IFD_TRASHED. 
+ */ + if ((in6m->in6m_debug & (IFD_DEBUG | IFD_TRASHED)) == + (IFD_DEBUG | IFD_TRASHED)) { + /* Become a regular mutex, just in case */ + IN6M_CONVERT_LOCK(in6m); + lck_mtx_lock(&in6m_trash_lock); + TAILQ_REMOVE(&in6m_trash_head, (struct in6_multi_dbg *)in6m, + in6m_trash_link); + lck_mtx_unlock(&in6m_trash_lock); + in6m->in6m_debug &= ~IFD_TRASHED; + } + + LIST_INSERT_HEAD(&in6_multihead, in6m, in6m_entry); +} + +int +in6_multi_detach(struct in6_multi *in6m) +{ + in6_multihead_lock_assert(LCK_RW_ASSERT_EXCLUSIVE); + IN6M_LOCK_ASSERT_HELD(in6m); + + if (in6m->in6m_reqcnt == 0) { + panic("%s: in6m=%p negative reqcnt", __func__, in6m); + /* NOTREACHED */ + } + + --in6m->in6m_reqcnt; + if (in6m->in6m_reqcnt > 0) + return (0); + + if (!(in6m->in6m_debug & IFD_ATTACHED)) { + panic("%s: Attempt to detach an unattached record in6m=%p", + __func__, in6m); + /* NOTREACHED */ + } else if (in6m->in6m_debug & IFD_TRASHED) { + panic("%s: in6m %p is already in trash list", __func__, in6m); + /* NOTREACHED */ + } + + /* + * NOTE: Caller calls IFMA_REMREF + */ + in6m->in6m_debug &= ~IFD_ATTACHED; + LIST_REMOVE(in6m, in6m_entry); + + if (in6m->in6m_debug & IFD_DEBUG) { + /* Become a regular mutex, just in case */ + IN6M_CONVERT_LOCK(in6m); + lck_mtx_lock(&in6m_trash_lock); + TAILQ_INSERT_TAIL(&in6m_trash_head, + (struct in6_multi_dbg *)in6m, in6m_trash_link); + lck_mtx_unlock(&in6m_trash_lock); + in6m->in6m_debug |= IFD_TRASHED; + } + + return (1); +} + +void +in6m_addref(struct in6_multi *in6m, int locked) +{ + if (!locked) + IN6M_LOCK_SPIN(in6m); + else + IN6M_LOCK_ASSERT_HELD(in6m); + + if (++in6m->in6m_refcount == 0) { + panic("%s: in6m=%p wraparound refcnt", __func__, in6m); + /* NOTREACHED */ + } else if (in6m->in6m_trace != NULL) { + (*in6m->in6m_trace)(in6m, TRUE); + } + if (!locked) + IN6M_UNLOCK(in6m); +} + +void +in6m_remref(struct in6_multi *in6m, int locked) +{ + struct ifmultiaddr *ifma; + struct mld_ifinfo *mli; + + if (!locked) + IN6M_LOCK_SPIN(in6m); + else + IN6M_LOCK_ASSERT_HELD(in6m); + + if (in6m->in6m_refcount == 0 || (in6m->in6m_refcount == 1 && locked)) { + panic("%s: in6m=%p negative refcnt", __func__, in6m); + /* NOTREACHED */ + } else if (in6m->in6m_trace != NULL) { + (*in6m->in6m_trace)(in6m, FALSE); + } + + --in6m->in6m_refcount; + if (in6m->in6m_refcount > 0) { + if (!locked) + IN6M_UNLOCK(in6m); + return; + } + + /* + * Synchronization with in6_mc_get(). In the event the in6m has been + * detached, the underlying ifma would still be in the if_multiaddrs + * list, and thus can be looked up via if_addmulti(). At that point, + * the only way to find this in6m is via ifma_protospec. To avoid + * race conditions between the last in6m_remref() of that in6m and its + * use via ifma_protospec, in6_multihead lock is used for serialization. + * In order to avoid violating the lock order, we must drop in6m_lock + * before acquiring in6_multihead lock. To prevent the in6m from being + * freed prematurely, we hold an extra reference. 
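+ * (Lock ordering: in6_multihead lock, then in6m_lock. That is why the
+ * code below takes the extra reference, drops in6m_lock, acquires
+ * in6_multihead lock shared, retakes in6m_lock and revalidates the
+ * refcount before tearing the record down.)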
+ */
+ ++in6m->in6m_refcount;
+ IN6M_UNLOCK(in6m);
+ in6_multihead_lock_shared();
+ IN6M_LOCK_SPIN(in6m);
+ --in6m->in6m_refcount;
+ if (in6m->in6m_refcount > 0) {
+ /* We've lost the race, so abort since in6m is still in use */
+ IN6M_UNLOCK(in6m);
+ in6_multihead_lock_done();
+ /* If it was locked, return it as such */
+ if (locked)
+ IN6M_LOCK(in6m);
+ return;
+ }
+ in6m_purge(in6m);
+ ifma = in6m->in6m_ifma;
+ in6m->in6m_ifma = NULL;
+ in6m->in6m_ifp = NULL;
+ mli = in6m->in6m_mli;
+ in6m->in6m_mli = NULL;
+ IN6M_UNLOCK(in6m);
+ IFMA_LOCK_SPIN(ifma);
+ ifma->ifma_protospec = NULL;
+ IFMA_UNLOCK(ifma);
+ in6_multihead_lock_done();
+
+ in6_multi_free(in6m);
+ if_delmulti_ifma(ifma);
+ /* Release reference held to the underlying ifmultiaddr */
+ IFMA_REMREF(ifma);
+
+ if (mli != NULL)
+ MLI_REMREF(mli);
+}
+
+static void
+in6m_trace(struct in6_multi *in6m, int refhold)
+{
+ struct in6_multi_dbg *in6m_dbg = (struct in6_multi_dbg *)in6m;
+ ctrace_t *tr;
+ u_int32_t idx;
+ u_int16_t *cnt;
+
+ if (!(in6m->in6m_debug & IFD_DEBUG)) {
+ panic("%s: in6m %p has no debug structure", __func__, in6m);
+ /* NOTREACHED */
+ }
+ if (refhold) {
+ cnt = &in6m_dbg->in6m_refhold_cnt;
+ tr = in6m_dbg->in6m_refhold;
+ } else {
+ cnt = &in6m_dbg->in6m_refrele_cnt;
+ tr = in6m_dbg->in6m_refrele;
+ }
+
+ idx = atomic_add_16_ov(cnt, 1) % IN6M_TRACE_HIST_SIZE;
+ ctrace_record(&tr[idx]);
+}
+
+static struct in6_multi_mship *
+in6_multi_mship_alloc(int how)
+{
+ struct in6_multi_mship *imm;
+
+ imm = (how == M_WAITOK) ? zalloc(imm_zone) : zalloc_noblock(imm_zone);
+ if (imm != NULL)
+ bzero(imm, imm_size);
+
+ return (imm);
+}
+
+static void
+in6_multi_mship_free(struct in6_multi_mship *imm)
+{
+ if (imm->i6mm_maddr != NULL) {
+ panic("%s: i6mm_maddr not NULL for imm=%p", __func__, imm);
+ /* NOTREACHED */
+ }
+ zfree(imm_zone, imm);
+}
+
+void
+in6_multihead_lock_exclusive(void)
+{
+ lck_rw_lock_exclusive(&in6_multihead_lock);
+}
+
+void
+in6_multihead_lock_shared(void)
+{
+ lck_rw_lock_shared(&in6_multihead_lock);
+}
+
+void
+in6_multihead_lock_assert(int what)
+{
+ lck_rw_assert(&in6_multihead_lock, what);
+}
+
+void
+in6_multihead_lock_done(void)
+{
+ lck_rw_done(&in6_multihead_lock);
+}
+
+static struct ip6_msource *
+ip6ms_alloc(int how)
+{
+ struct ip6_msource *i6ms;
+
+ i6ms = (how == M_WAITOK) ? zalloc(ip6ms_zone) :
+ zalloc_noblock(ip6ms_zone);
+ if (i6ms != NULL)
+ bzero(i6ms, ip6ms_size);
+
+ return (i6ms);
+}
+
+static void
+ip6ms_free(struct ip6_msource *i6ms)
+{
+ zfree(ip6ms_zone, i6ms);
+}
+
+static struct in6_msource *
+in6ms_alloc(int how)
+{
+ struct in6_msource *in6ms;
+
+ in6ms = (how == M_WAITOK) ? zalloc(in6ms_zone) :
+ zalloc_noblock(in6ms_zone);
+ if (in6ms != NULL)
+ bzero(in6ms, in6ms_size);
+
+ return (in6ms);
+}
+
+static void
+in6ms_free(struct in6_msource *in6ms)
+{
+ zfree(in6ms_zone, in6ms);
+}
+
+#ifdef MLD_DEBUG
+
+static const char *in6m_modestrs[] = { "un", "in", "ex" };
+
+static const char *
+in6m_mode_str(const int mode)
+{
+ if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE)
+ return (in6m_modestrs[mode]);
+ return ("??");
+}
+
+static const char *in6m_statestrs[] = {
+ "not-member",
+ "silent",
+ "idle",
+ "lazy",
+ "sleeping",
+ "awakening",
+ "query-pending",
+ "sg-query-pending",
+ "leaving"
+};
+
+static const char *
+in6m_state_str(const int state)
+{
+ if (state >= MLD_NOT_MEMBER && state <= MLD_LEAVING_MEMBER)
+ return (in6m_statestrs[state]);
+ return ("??");
+}
+
+/*
+ * Dump an in6_multi structure to the console.
+ */ +void +in6m_print(const struct in6_multi *inm) +{ + int t; + + IN6M_LOCK_ASSERT_HELD(IN6M_CAST_TO_NONCONST(inm)); + + if (mld_debug == 0) + return; + + printf("%s: --- begin in6m %p ---\n", __func__, inm); + printf("addr %s ifp %p(%s%d) ifma %p\n", + ip6_sprintf(&inm->in6m_addr), + inm->in6m_ifp, + inm->in6m_ifp->if_name, + inm->in6m_ifp->if_unit, + inm->in6m_ifma); + printf("timer %u state %s refcount %u scq.len %u\n", + inm->in6m_timer, + in6m_state_str(inm->in6m_state), + inm->in6m_refcount, + inm->in6m_scq.ifq_len); + printf("mli %p nsrc %lu sctimer %u scrv %u\n", + inm->in6m_mli, + inm->in6m_nsrc, + inm->in6m_sctimer, + inm->in6m_scrv); + for (t = 0; t < 2; t++) { + printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t, + in6m_mode_str(inm->in6m_st[t].iss_fmode), + inm->in6m_st[t].iss_asm, + inm->in6m_st[t].iss_ex, + inm->in6m_st[t].iss_in, + inm->in6m_st[t].iss_rec); + } + printf("%s: --- end in6m %p ---\n", __func__, inm); +} + +#else + +void +in6m_print(__unused const struct in6_multi *inm) +{ + +} + +#endif diff --git a/bsd/netinet6/in6_pcb.c b/bsd/netinet6/in6_pcb.c index 20f39a34d..2ea4d7a5d 100644 --- a/bsd/netinet6/in6_pcb.c +++ b/bsd/netinet6/in6_pcb.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2008 Apple Inc. All rights reserved. + * Copyright (c) 2003-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -102,6 +102,8 @@ #include <sys/errno.h> #include <sys/time.h> #include <sys/proc.h> +#include <sys/kauth.h> +#include <sys/priv.h> #include <net/if.h> #include <net/if_types.h> @@ -160,7 +162,7 @@ in6_pcblookup_local_and_cleanup( if (inp && inp->inp_wantcnt == WNT_STOPUSING) { struct socket *so = inp->inp_socket; - lck_mtx_lock(inp->inpcb_mtx); + lck_mtx_lock(&inp->inpcb_mtx); if (so->so_usecount == 0) { if (inp->inp_state != INPCB_STATE_DEAD) @@ -169,23 +171,23 @@ in6_pcblookup_local_and_cleanup( inp = NULL; } else { - lck_mtx_unlock(inp->inpcb_mtx); + lck_mtx_unlock(&inp->inpcb_mtx); } } return inp; } + int -in6_pcbbind( - struct inpcb *inp, - struct sockaddr *nam, - struct proc *p) +in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) { struct socket *so = inp->inp_socket; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)NULL; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; u_short lport = 0; int wild = 0, reuseport = (so->so_options & SO_REUSEPORT); + int error; + kauth_cred_t cred; if (!in6_ifaddrs) /* XXX broken! */ return (EADDRNOTAVAIL); @@ -196,6 +198,8 @@ in6_pcbbind( socket_unlock(so, 0); /* keep reference */ lck_rw_lock_exclusive(pcbinfo->mtx); if (nam) { + unsigned int outif = 0; + sin6 = (struct sockaddr_in6 *)nam; if (nam->sa_len != sizeof(*sin6)) { lck_rw_done(pcbinfo->mtx); @@ -212,7 +216,8 @@ in6_pcbbind( } /* KAME hack: embed scopeid */ - if (in6_embedscope(&sin6->sin6_addr, sin6, inp, NULL) != 0) { + if (in6_embedscope(&sin6->sin6_addr, sin6, inp, NULL, + NULL) != 0) { lck_rw_done(pcbinfo->mtx); socket_lock(so, 0); return EINVAL; @@ -232,10 +237,10 @@ in6_pcbbind( if (so->so_options & SO_REUSEADDR) reuseport = SO_REUSEADDR|SO_REUSEPORT; } else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { - struct ifaddr *ia = NULL; + struct ifaddr *ifa; sin6->sin6_port = 0; /* yech... */ - if ((ia = ifa_ifwithaddr((struct sockaddr *)sin6)) == 0) { + if ((ifa = ifa_ifwithaddr((struct sockaddr *)sin6)) == 0) { lck_rw_done(pcbinfo->mtx); socket_lock(so, 0); return(EADDRNOTAVAIL); @@ -247,26 +252,34 @@ in6_pcbbind( * We should allow to bind to a deprecated address, since * the application dare to use it. 
*/ - if (ia && - ((struct in6_ifaddr *)ia)->ia6_flags & - (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|IN6_IFF_DETACHED)) { - ifafree(ia); - lck_rw_done(pcbinfo->mtx); - socket_lock(so, 0); - return(EADDRNOTAVAIL); + if (ifa != NULL) { + IFA_LOCK_SPIN(ifa); + if (((struct in6_ifaddr *)ifa)->ia6_flags & + (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|IN6_IFF_DETACHED)) { + IFA_UNLOCK(ifa); + IFA_REMREF(ifa); + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); + return(EADDRNOTAVAIL); + } + outif = ifa->ifa_ifp->if_index; + IFA_UNLOCK(ifa); + IFA_REMREF(ifa); } - ifafree(ia); - ia = NULL; } if (lport) { struct inpcb *t; /* GROSS */ - if (ntohs(lport) < IPV6PORT_RESERVED && - ((so->so_state & SS_PRIV) == 0)) { - lck_rw_done(pcbinfo->mtx); - socket_lock(so, 0); - return(EACCES); + if (ntohs(lport) < IPV6PORT_RESERVED) { + cred = kauth_cred_proc_ref(p); + error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); + kauth_cred_unref(&cred); + if (error != 0) { + lck_rw_done(pcbinfo->mtx); + socket_lock(so, 0); + return(EACCES); + } } if (so->so_uid && @@ -335,6 +348,7 @@ in6_pcbbind( } } inp->in6p_laddr = sin6->sin6_addr; + inp->in6p_last_outif = outif; } socket_lock(so, 0); if (lport == 0) { @@ -349,10 +363,11 @@ in6_pcbbind( if (in_pcbinshash(inp, 1) != 0) { inp->in6p_laddr = in6addr_any; inp->inp_lport = 0; + inp->in6p_last_outif = 0; lck_rw_done(pcbinfo->mtx); return (EAGAIN); } - } + } lck_rw_done(pcbinfo->mtx); sflt_notify(so, sock_evt_bound, NULL); return(0); @@ -371,17 +386,14 @@ in6_pcbbind( */ int -in6_pcbladdr( - struct inpcb *inp, - struct sockaddr *nam, - struct in6_addr *plocal_addr6) +in6_pcbladdr(struct inpcb *inp, struct sockaddr *nam, + struct in6_addr *plocal_addr6, unsigned int *poutif) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; struct in6_addr *addr6 = NULL; struct in6_addr src_storage; - - struct ifnet *ifp = NULL; int error = 0; + unsigned int ifscope; if (nam->sa_len != sizeof (*sin6)) return (EINVAL); @@ -391,7 +403,7 @@ in6_pcbladdr( return (EADDRNOTAVAIL); /* KAME hack: embed scopeid */ - if (in6_embedscope(&sin6->sin6_addr, sin6, inp, &ifp) != 0) + if (in6_embedscope(&sin6->sin6_addr, sin6, inp, NULL, NULL) != 0) return EINVAL; if (in6_ifaddrs) { @@ -402,33 +414,37 @@ in6_pcbladdr( if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) sin6->sin6_addr = in6addr_loopback; } - { - /* - * XXX: in6_selectsrc might replace the bound local address - * with the address specified by setsockopt(IPV6_PKTINFO). - * Is it the intended behavior? - */ - addr6 = in6_selectsrc(sin6, inp->in6p_outputopts, - inp->in6p_moptions, - &inp->in6p_route, - &inp->in6p_laddr, &src_storage, &error); - if (addr6 == 0) { - if (error == 0) - error = EADDRNOTAVAIL; - return(error); - } - *plocal_addr6 = *addr6; - /* - * Don't do pcblookup call here; return interface in - * plocal_addr6 - * and exit to caller, that will do the lookup. - */ + + ifscope = (inp->inp_flags & INP_BOUND_IF) ? + inp->inp_boundif : IFSCOPE_NONE; + + /* + * XXX: in6_selectsrc might replace the bound local address + * with the address specified by setsockopt(IPV6_PKTINFO). + * Is it the intended behavior? + */ + addr6 = in6_selectsrc(sin6, inp->in6p_outputopts, inp, + &inp->in6p_route, NULL, &src_storage, ifscope, &error); + if (addr6 == 0) { + if (error == 0) + error = EADDRNOTAVAIL; + return(error); } - /* XXX: what is the point in doing this? 
*/ - if (inp->in6p_route.ro_rt) - ifp = inp->in6p_route.ro_rt->rt_ifp; + if (poutif != NULL) { + struct rtentry *rt; + if ((rt = inp->in6p_route.ro_rt) != NULL) + *poutif = rt->rt_ifp->if_index; + else + *poutif = 0; + } + *plocal_addr6 = *addr6; + /* + * Don't do pcblookup call here; return interface in + * plocal_addr6 + * and exit to caller, that will do the lookup. + */ return(0); } @@ -440,21 +456,22 @@ in6_pcbladdr( * then pick one. */ int -in6_pcbconnect(inp, nam, p) - struct inpcb *inp; - struct sockaddr *nam; - struct proc *p; +in6_pcbconnect( + struct inpcb *inp, + struct sockaddr *nam, + struct proc *p) { struct in6_addr addr6; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; struct inpcb *pcb; int error; + unsigned int outif = 0; /* * Call inner routine, to assign local interface address. * in6_pcbladdr() may automatically fill in sin6_scope_id. */ - if ((error = in6_pcbladdr(inp, nam, &addr6)) != 0) + if ((error = in6_pcbladdr(inp, nam, &addr6, &outif)) != 0) return(error); socket_unlock(inp->inp_socket, 0); pcb = in6_pcblookup_hash(inp->inp_pcbinfo, &sin6->sin6_addr, @@ -474,6 +491,7 @@ in6_pcbconnect(inp, nam, p) return (error); } inp->in6p_laddr = addr6; + inp->in6p_last_outif = outif; } if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) { /*lock inversion issue, mostly with udp multicast packets */ @@ -495,8 +513,8 @@ in6_pcbconnect(inp, nam, p) } void -in6_pcbdisconnect(inp) - struct inpcb *inp; +in6_pcbdisconnect( + struct inpcb *inp) { if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) { /*lock inversion issue, mostly with udp multicast packets */ @@ -515,8 +533,8 @@ in6_pcbdisconnect(inp) } void -in6_pcbdetach(inp) - struct inpcb *inp; +in6_pcbdetach( + struct inpcb *inp) { struct socket *so = inp->inp_socket; struct inpcbinfo *ipi = inp->inp_pcbinfo; @@ -533,13 +551,15 @@ in6_pcbdetach(inp) inp->inp_state = INPCB_STATE_DEAD; if ((so->so_flags & SOF_PCBCLEARING) == 0) { + struct ip_moptions *imo; + struct ip6_moptions *im6o; + inp->inp_vflag = 0; so->so_flags |= SOF_PCBCLEARING; inp->inp_gencnt = ++ipi->ipi_gencnt; if (inp->in6p_options) m_freem(inp->in6p_options); ip6_freepcbopts(inp->in6p_outputopts); - ip6_freemoptions(inp->in6p_moptions); if (inp->in6p_route.ro_rt) { rtfree(inp->in6p_route.ro_rt); inp->in6p_route.ro_rt = NULL; @@ -547,16 +567,23 @@ in6_pcbdetach(inp) /* Check and free IPv4 related resources in case of mapped addr */ if (inp->inp_options) (void)m_free(inp->inp_options); - ip_freemoptions(inp->inp_moptions); + + im6o = inp->in6p_moptions; + inp->in6p_moptions = NULL; + if (im6o != NULL) + IM6O_REMREF(im6o); + + imo = inp->inp_moptions; inp->inp_moptions = NULL; - + if (imo != NULL) + IMO_REMREF(imo); } } struct sockaddr * -in6_sockaddr(port, addr_p) - in_port_t port; - struct in6_addr *addr_p; +in6_sockaddr( + in_port_t port, + struct in6_addr *addr_p) { struct sockaddr_in6 *sin6; @@ -579,9 +606,9 @@ in6_sockaddr(port, addr_p) } struct sockaddr * -in6_v4mapsin6_sockaddr(port, addr_p) - in_port_t port; - struct in_addr *addr_p; +in6_v4mapsin6_sockaddr( + in_port_t port, + struct in_addr *addr_p) { struct sockaddr_in sin; struct sockaddr_in6 *sin6_p; @@ -612,9 +639,9 @@ in6_v4mapsin6_sockaddr(port, addr_p) * because there actually /is/ a programming error somewhere... 
XXX) */ int -in6_setsockaddr(so, nam) - struct socket *so; - struct sockaddr **nam; +in6_setsockaddr( + struct socket *so, + struct sockaddr **nam) { struct inpcb *inp; struct in6_addr addr; @@ -634,9 +661,9 @@ in6_setsockaddr(so, nam) } int -in6_setpeeraddr(so, nam) - struct socket *so; - struct sockaddr **nam; +in6_setpeeraddr( + struct socket *so, + struct sockaddr **nam) { struct inpcb *inp; struct in6_addr addr; @@ -701,17 +728,15 @@ in6_mapped_peeraddr(struct socket *so, struct sockaddr **nam) * cmds that are uninteresting (e.g., no error in the map). * Call the protocol specific routine (if any) to report * any errors for each matching socket. - * - * Must be called at splnet. */ void -in6_pcbnotify(pcbinfo, dst, fport_arg, src, lport_arg, cmd, notify) +in6_pcbnotify(pcbinfo, dst, fport_arg, src, lport_arg, cmd, cmdarg, notify) struct inpcbinfo *pcbinfo; struct sockaddr *dst; const struct sockaddr *src; u_int fport_arg, lport_arg; int cmd; -// struct inpcb *(*notify)(struct inpcb *, int); + void *cmdarg; void (*notify)(struct inpcb *, int); { struct inpcb *inp, *ninp; @@ -758,6 +783,22 @@ in6_pcbnotify(pcbinfo, dst, fport_arg, src, lport_arg, cmd, notify) if ((inp->inp_vflag & INP_IPV6) == 0) continue; + /* + * If the error designates a new path MTU for a destination + * and the application (associated with this socket) wanted to + * know the value, notify. Note that we notify for all + * disconnected sockets if the corresponding application + * wanted. This is because some UDP applications keep sending + * sockets disconnected. + * XXX: should we avoid to notify the value to TCP sockets? + */ + if (cmd == PRC_MSGSIZE && (inp->inp_flags & IN6P_MTU) != 0 && + (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) || + IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &sa6_dst->sin6_addr))) { + ip6_notify_pmtu(inp, (struct sockaddr_in6 *)dst, + (u_int32_t *)cmdarg); + } + /* * Detect if we should notify the error. If no source and * destination ports are specifed, but non-zero flowinfo and @@ -799,11 +840,11 @@ in6_pcbnotify(pcbinfo, dst, fport_arg, src, lport_arg, cmd, notify) * Lookup a PCB based on the local address and port. */ struct inpcb * -in6_pcblookup_local(pcbinfo, laddr, lport_arg, wild_okay) - struct inpcbinfo *pcbinfo; - struct in6_addr *laddr; - u_int lport_arg; - int wild_okay; +in6_pcblookup_local( + struct inpcbinfo *pcbinfo, + struct in6_addr *laddr, + u_int lport_arg, + int wild_okay) { struct inpcb *inp; int matchwild = 3, wildcard; @@ -883,47 +924,6 @@ in6_pcblookup_local(pcbinfo, laddr, lport_arg, wild_okay) return (match); } } -#ifndef APPLE -/* this is not used in Darwin */ -void -in6_pcbpurgeif0( - struct in6pcb *head, - struct ifnet *ifp) -{ - struct in6pcb *in6p; - struct ip6_moptions *im6o; - struct in6_multi_mship *imm, *nimm; - - for (in6p = head; in6p != NULL; in6p = LIST_NEXT(in6p, inp_list)) { - im6o = in6p->in6p_moptions; - if ((in6p->inp_vflag & INP_IPV6) && - im6o) { - /* - * Unselect the outgoing interface if it is being - * detached. - */ - if (im6o->im6o_multicast_ifp == ifp) - im6o->im6o_multicast_ifp = NULL; - - /* - * Drop multicast group membership if we joined - * through the interface being detached. - * XXX controversial - is it really legal for kernel - * to force this? 
- */ - for (imm = im6o->im6o_memberships.lh_first; - imm != NULL; imm = nimm) { - nimm = imm->i6mm_chain.le_next; - if (imm->i6mm_maddr->in6m_ifp == ifp) { - LIST_REMOVE(imm, i6mm_chain); - in6_delmulti(imm->i6mm_maddr); - FREE(imm, M_IPMADDR); - } - } - } - } -} -#endif /* * Check for alternatives when higher level complains @@ -932,8 +932,8 @@ in6_pcbpurgeif0( * (by a redirect), time to try a default gateway again. */ void -in6_losing(in6p) - struct inpcb *in6p; +in6_losing( + struct inpcb *in6p) { struct rtentry *rt; struct rt_addrinfo info; @@ -987,6 +987,104 @@ in6_rtchange( } } +/* + * Check if PCB exists hash list. Also returns uid and gid of socket + */ +int +in6_pcblookup_hash_exists( + struct inpcbinfo *pcbinfo, + struct in6_addr *faddr, + u_int fport_arg, + struct in6_addr *laddr, + u_int lport_arg, + int wildcard, + uid_t *uid, + gid_t *gid, + __unused struct ifnet *ifp) +{ + struct inpcbhead *head; + struct inpcb *inp; + u_short fport = fport_arg, lport = lport_arg; + int faith; + int found; + +#if defined(NFAITH) && NFAITH > 0 + faith = faithprefix(laddr); +#else + faith = 0; +#endif + + *uid = UID_MAX; + *gid = GID_MAX; + + lck_rw_lock_shared(pcbinfo->mtx); + + /* + * First look for an exact match. + */ + head = &pcbinfo->hashbase[INP_PCBHASH(faddr->s6_addr32[3] /* XXX */, + lport, fport, + pcbinfo->hashmask)]; + LIST_FOREACH(inp, head, inp_hash) { + if ((inp->inp_vflag & INP_IPV6) == 0) + continue; + if (IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, faddr) && + IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, laddr) && + inp->inp_fport == fport && + inp->inp_lport == lport) { + if ((found = (inp->inp_socket != NULL))) { + /* + * Found. Check if pcb is still valid + */ + *uid = inp->inp_socket->so_uid; + *gid = inp->inp_socket->so_gid; + } + lck_rw_done(pcbinfo->mtx); + return (found); + } + } + if (wildcard) { + struct inpcb *local_wild = NULL; + + head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, + pcbinfo->hashmask)]; + LIST_FOREACH(inp, head, inp_hash) { + if ((inp->inp_vflag & INP_IPV6) == 0) + continue; + if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && + inp->inp_lport == lport) { + if (faith && (inp->inp_flags & INP_FAITH) == 0) + continue; + if (IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, + laddr)) { + if ((found = (inp->inp_socket != NULL))) { + *uid = inp->inp_socket->so_uid; + *gid = inp->inp_socket->so_gid; + } + lck_rw_done(pcbinfo->mtx); + return (found); + } + else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) + local_wild = inp; + } + } + if (local_wild) { + if ((found = (local_wild->inp_socket != NULL))) { + *uid = local_wild->inp_socket->so_uid; + *gid = local_wild->inp_socket->so_gid; + } + lck_rw_done(pcbinfo->mtx); + return (found); + } + } + + /* + * Not found. + */ + lck_rw_done(pcbinfo->mtx); + return (0); +} + /* * Lookup PCB in hash list. */ diff --git a/bsd/netinet6/in6_pcb.h b/bsd/netinet6/in6_pcb.h index 58476cc5e..d83836bbc 100644 --- a/bsd/netinet6/in6_pcb.h +++ b/bsd/netinet6/in6_pcb.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2011 Apple Inc. All rights reserved. 
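in6_pcblookup_hash_exists(), added above, searches the exact four-tuple bucket first and only then falls back to wildcard listeners, preferring a listener bound to the local address over one bound to the unspecified address. A toy linear version of that two-pass order; hash buckets, locking and the uid/gid reporting are omitted, and every name here is illustrative:

#include <stdio.h>
#include <string.h>

struct pcb {
	const char *laddr, *faddr;	/* "" means unspecified */
	int lport, fport;
};

static const struct pcb *
lookup(const struct pcb *tab, int n, const char *faddr, int fport,
    const char *laddr, int lport)
{
	const struct pcb *wild = NULL;
	int i;

	for (i = 0; i < n; i++) {		/* pass 1: exact match */
		if (strcmp(tab[i].faddr, faddr) == 0 &&
		    strcmp(tab[i].laddr, laddr) == 0 &&
		    tab[i].fport == fport && tab[i].lport == lport)
			return (&tab[i]);
	}
	for (i = 0; i < n; i++) {		/* pass 2: listeners */
		if (tab[i].faddr[0] != '\0' || tab[i].lport != lport)
			continue;
		if (strcmp(tab[i].laddr, laddr) == 0)
			return (&tab[i]);	/* bound listener wins */
		if (tab[i].laddr[0] == '\0')
			wild = &tab[i];		/* remember wildcard */
	}
	return (wild);
}

int
main(void)
{
	struct pcb tab[] = {
		{ "", "", 80, 0 },			/* wildcard listener */
		{ "fe80::1", "2001::2", 80, 1234 },	/* connected */
	};
	const struct pcb *p = lookup(tab, 2, "2001::9", 9, "fe80::1", 80);

	printf("%s\n", p != NULL ? "found listener" : "none");
	return (0);
}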
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -104,13 +104,16 @@ extern int in6_pcbbind(struct inpcb *, struct sockaddr *, struct proc *); extern int in6_pcbconnect(struct inpcb *, struct sockaddr *, struct proc *); extern void in6_pcbdetach(struct inpcb *); extern void in6_pcbdisconnect(struct inpcb *); -extern int in6_pcbladdr(struct inpcb *, struct sockaddr *, struct in6_addr *); +extern int in6_pcbladdr(struct inpcb *, struct sockaddr *, + struct in6_addr *, unsigned int *); extern struct inpcb *in6_pcblookup_local(struct inpcbinfo *, struct in6_addr *, u_int, int); extern struct inpcb *in6_pcblookup_hash(struct inpcbinfo *, struct in6_addr *, u_int, struct in6_addr *, u_int, int, struct ifnet *); +extern int in6_pcblookup_hash_exists(struct inpcbinfo *, struct in6_addr *, + u_int, struct in6_addr *, u_int, int, uid_t *, gid_t *, struct ifnet *); extern void in6_pcbnotify(struct inpcbinfo *, struct sockaddr *, u_int, - const struct sockaddr *, u_int, int, void (*)(struct inpcb *, int)); + const struct sockaddr *, u_int, int, void *, void (*)(struct inpcb *, int)); extern void in6_rtchange(struct inpcb *, int); extern struct sockaddr *in6_sockaddr(in_port_t port, struct in6_addr *addr_p); extern struct sockaddr *in6_v4mapsin6_sockaddr(in_port_t port, @@ -119,9 +122,6 @@ extern int in6_setpeeraddr(struct socket *so, struct sockaddr **nam); extern int in6_setsockaddr(struct socket *so, struct sockaddr **nam); extern int in6_mapped_sockaddr(struct socket *so, struct sockaddr **nam); extern int in6_mapped_peeraddr(struct socket *so, struct sockaddr **nam); -extern struct in6_addr *in6_selectsrc(struct sockaddr_in6 *, - struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *, - struct in6_addr *, struct in6_addr *, int *); extern int in6_selecthlim(struct in6pcb *, struct ifnet *); extern int in6_pcbsetport(struct in6_addr *, struct inpcb *, struct proc *, int); diff --git a/bsd/netinet6/in6_prefix.c b/bsd/netinet6/in6_prefix.c index 891917965..da85f486d 100644 --- a/bsd/netinet6/in6_prefix.c +++ b/bsd/netinet6/in6_prefix.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -227,18 +227,25 @@ search_matched_prefix(struct ifnet *ifp, struct in6_prefixreq *ipr) ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; + } if (ipr->ipr_plen <= - in6_matchlen(&ipr->ipr_prefix.sin6_addr, IFA_IN6(ifa))) + in6_matchlen(&ipr->ipr_prefix.sin6_addr, IFA_IN6(ifa))) { + /* keep it locked */ break; + } + IFA_UNLOCK(ifa); } if (ifa == NULL) { ifnet_lock_done(ifp); return NULL; } - + IFA_LOCK_ASSERT_HELD(ifa); rpp = ifpr2rp(((struct in6_ifaddr *)ifa)->ia6_ifpr); + IFA_UNLOCK(ifa); if (rpp != 0) { ifnet_lock_done(ifp); return rpp; @@ -302,24 +309,31 @@ mark_matched_prefixes(u_int32_t cmd, struct ifnet *ifp, struct in6_rrenumreq *ir { struct rr_prefix *rpp; - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; + } matchlen = in6_matchlen(&irr->irr_matchprefix.sin6_addr, IFA_IN6(ifa)); if (irr->irr_m_minlen > matchlen || - irr->irr_m_maxlen < matchlen || irr->irr_m_len > matchlen) - continue; + irr->irr_m_maxlen < matchlen || irr->irr_m_len > matchlen) { + IFA_UNLOCK(ifa); + continue; + } rpp = ifpr2rp(((struct in6_ifaddr *)ifa)->ia6_ifpr); if (rpp != 0) { matched = 1; rpp->rp_statef_addmark = 1; if (cmd == SIOCCIFPREFIX_IN6) rpp->rp_statef_delmark = 1; - } else + } else { log(LOG_WARNING, "in6_prefix.c: mark_matched_prefixes:" "no back pointer to ifprefix for %s. " "ND autoconfigured addr?\n", ip6_sprintf(IFA_IN6(ifa))); + } + IFA_UNLOCK(ifa); } ifnet_lock_done(ifp); return matched; @@ -447,15 +461,17 @@ assign_ra_entry(struct rr_prefix *rpp, int iilen, struct in6_ifaddr *ia) return error; /* copy interface id part */ + IFA_LOCK(&ia->ia_ifa); bit_copy((caddr_t)&rap->ra_ifid, sizeof(rap->ra_ifid) << 3, - (caddr_t)IA6_IN6(ia), - sizeof(*IA6_IN6(ia)) << 3, rpp->rp_plen, iilen); + (caddr_t)IA6_IN6(ia), sizeof(*IA6_IN6(ia)) << 3, + rpp->rp_plen, iilen); /* link to ia, and put into list */ rap->ra_addr = ia; - ifaref(&rap->ra_addr->ia_ifa); + IFA_ADDREF_LOCKED(&rap->ra_addr->ia_ifa); #if 0 /* Can't do this now, because rpp may be on th stack. should fix it? */ ia->ia6_ifpr = rp2ifpr(rpp); #endif + IFA_UNLOCK(&ia->ia_ifa); lck_mtx_lock(prefix6_mutex); LIST_INSERT_HEAD(&rpp->rp_addrhead, rap, ra_entry); lck_mtx_unlock(prefix6_mutex); @@ -478,9 +494,11 @@ in6_prefix_add_llifid(__unused int iilen, struct in6_ifaddr *ia) if ((error = create_ra_entry(&rap)) != 0) return(error); /* copy interface id part */ + IFA_LOCK(&ia->ia_ifa); bit_copy((caddr_t)&rap->ra_ifid, sizeof(rap->ra_ifid) << 3, (caddr_t)IA6_IN6(ia), sizeof(*IA6_IN6(ia)) << 3, 64, (sizeof(rap->ra_ifid) << 3) - 64); + IFA_UNLOCK(&ia->ia_ifa); /* XXX: init dummy so */ bzero(&so, sizeof(so)); /* insert into list */ @@ -500,6 +518,7 @@ in6_prefix_add_llifid(__unused int iilen, struct in6_ifaddr *ia) return 0; } +#if 0 /* * add an address to an interface. if the interface id portion is new, * we will add new interface address (prefix database + new interface id). 
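The address-list walks patched above now take IFA_LOCK around each entry, drop it before continuing to the next, and deliberately return with the matching entry still locked ("keep it locked"), as the IFA_LOCK_ASSERT_HELD after the loop documents. A userland analogue of that walk, assuming a plain pthread mutex per entry in place of the IFA_LOCK/IFA_UNLOCK macros:

#include <pthread.h>
#include <stdio.h>

struct ifaddr {
	pthread_mutex_t lock;
	int family;		/* AF_INET6 stand-in: 6 */
	int matchlen;
};

static struct ifaddr *
find_match(struct ifaddr *list, int n, int want_plen)
{
	int i;

	for (i = 0; i < n; i++) {
		pthread_mutex_lock(&list[i].lock);
		if (list[i].family != 6) {
			pthread_mutex_unlock(&list[i].lock);
			continue;	/* drop the lock before moving on */
		}
		if (want_plen <= list[i].matchlen)
			return (&list[i]);	/* return still locked */
		pthread_mutex_unlock(&list[i].lock);
	}
	return (NULL);
}

int
main(void)
{
	struct ifaddr a[2] = {
		{ PTHREAD_MUTEX_INITIALIZER, 2, 0 },
		{ PTHREAD_MUTEX_INITIALIZER, 6, 64 },
	};
	struct ifaddr *m = find_match(a, 2, 48);

	if (m != NULL) {
		printf("matched, still holding its lock\n");
		pthread_mutex_unlock(&m->lock);
	}
	return (0);
}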
@@ -507,17 +526,24 @@ in6_prefix_add_llifid(__unused int iilen, struct in6_ifaddr *ia) int in6_prefix_add_ifid(int iilen, struct in6_ifaddr *ia) { - int plen = (sizeof(*IA6_IN6(ia)) << 3) - iilen; + struct in6_addr addr; + int plen; struct ifprefix *ifpr; struct rp_addr *rap; int error = 0; - if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) + IFA_LOCK_SPIN(&ia->ia_ifa); + addr = *IA6_IN6(ia); + plen = (sizeof(*IA6_IN6(ia)) << 3) - iilen; + IFA_UNLOCK(&ia->ia_ifa); + + if (IN6_IS_ADDR_LINKLOCAL(&addr)) return(in6_prefix_add_llifid(iilen, ia)); - ifpr = in6_prefixwithifp(ia->ia_ifp, plen, IA6_IN6(ia)); + ifpr = in6_prefixwithifp(ia->ia_ifp, plen, &addr); if (ifpr == NULL) { struct rr_prefix rp; struct socket so; + struct ifnet *ifp; int pplen = (plen == 128) ? 64 : plen; /* XXX hardcoded 64 is bad */ /* allocate a prefix for ia, with default properties */ @@ -525,14 +551,12 @@ in6_prefix_add_ifid(int iilen, struct in6_ifaddr *ia) /* init rp */ bzero(&rp, sizeof(rp)); rp.rp_type = IN6_PREFIX_RR; - rp.rp_ifp = ia->ia_ifp; + rp.rp_ifp = ifp = ia->ia_ifp; rp.rp_plen = pplen; rp.rp_prefix.sin6_len = sizeof(rp.rp_prefix); rp.rp_prefix.sin6_family = AF_INET6; bit_copy((char *)RP_IN6(&rp), sizeof(*RP_IN6(&rp)) << 3, - (char *)&ia->ia_addr.sin6_addr, - sizeof(ia->ia_addr.sin6_addr) << 3, - 0, pplen); + (char *)&addr, sizeof (addr) << 3, 0, pplen); rp.rp_vltime = rp.rp_pltime = RR_INFINITE_LIFETIME; rp.rp_raf_onlink = 1; rp.rp_raf_auto = 1; @@ -541,7 +565,9 @@ in6_prefix_add_ifid(int iilen, struct in6_ifaddr *ia) rp.rp_origin = PR_ORIG_RR; /* can be renumbered */ /* create ra_entry */ + ifnet_lock_shared(ifp); error = link_stray_ia6s(&rp); + ifnet_lock_done(ifp); if (error != 0) { free_rp_entries(&rp); return error; @@ -559,53 +585,69 @@ in6_prefix_add_ifid(int iilen, struct in6_ifaddr *ia) return error; /* search again */ - ifpr = in6_prefixwithifp(ia->ia_ifp, pplen, IA6_IN6(ia)); + ifpr = in6_prefixwithifp(ia->ia_ifp, pplen, &addr); if (ifpr == NULL) return 0; } - rap = search_ifidwithprefix(ifpr2rp(ifpr), IA6_IN6(ia)); + rap = search_ifidwithprefix(ifpr2rp(ifpr), &addr); if (rap != NULL) { if (rap->ra_addr == NULL) { rap->ra_addr = ia; - ifaref(&rap->ra_addr->ia_ifa); + IFA_ADDREF(&rap->ra_addr->ia_ifa); } else if (rap->ra_addr != ia) { /* There may be some inconsistencies between addrs. 
*/ log(LOG_ERR, "ip6_prefix.c: addr %s/%d matched prefix" " already has another ia %p(%s) on its ifid list\n", - ip6_sprintf(IA6_IN6(ia)), plen, - rap->ra_addr, + ip6_sprintf(&addr), plen, rap->ra_addr, ip6_sprintf(IA6_IN6(rap->ra_addr))); return EADDRINUSE /* XXX */; } + IFA_LOCK_SPIN(&ia->ia_ifa); ia->ia6_ifpr = ifpr; + IFA_UNLOCK(&ia->ia_ifa); return 0; } error = assign_ra_entry(ifpr2rp(ifpr), iilen, ia); - if (error == 0) + if (error == 0) { + IFA_LOCK_SPIN(&ia->ia_ifa); ia->ia6_ifpr = ifpr; + IFA_UNLOCK(&ia->ia_ifa); + } return (error); } +#endif +#if 0 void in6_prefix_remove_ifid(__unused int iilen, struct in6_ifaddr *ia) { struct rp_addr *rap; + struct in6_addr addr; + struct ifprefix *ifpr; - if (ia->ia6_ifpr == NULL) + IFA_LOCK_SPIN(&ia->ia_ifa); + if ((ifpr = ia->ia6_ifpr) == NULL) { + IFA_UNLOCK(&ia->ia_ifa); return; - rap = search_ifidwithprefix(ifpr2rp(ia->ia6_ifpr), IA6_IN6(ia)); + } + addr = *IA6_IN6(ia); + IFA_UNLOCK(&ia->ia_ifa); + rap = search_ifidwithprefix(ifpr2rp(ifpr), &addr); if (rap != NULL) { lck_mtx_lock(prefix6_mutex); LIST_REMOVE(rap, ra_entry); lck_mtx_unlock(prefix6_mutex); - if (rap->ra_addr) - ifafree(&rap->ra_addr->ia_ifa); + if (rap->ra_addr) { + IFA_REMREF(&rap->ra_addr->ia_ifa); + rap->ra_addr = NULL; + } FREE(rap, M_RR_ADDR); } - if (LIST_EMPTY(&ifpr2rp(ia->ia6_ifpr)->rp_addrhead)) - rp_remove(ifpr2rp(ia->ia6_ifpr)); + if (LIST_EMPTY(&ifpr2rp(ifpr)->rp_addrhead)) + rp_remove(ifpr2rp(ifpr)); } +#endif void in6_purgeprefix( @@ -665,20 +707,29 @@ add_each_addr(struct socket *so, struct rr_prefix *rpp, struct rp_addr *rap) ia6 = in6ifa_ifpwithaddr(rpp->rp_ifp, &ifra.ifra_addr.sin6_addr); if (ia6 != NULL) { + struct in6_ifaddr *ria6 = NULL; + + IFA_LOCK(&ia6->ia_ifa); if (ia6->ia6_ifpr == NULL) { /* link this addr and the prefix each other */ - if (rap->ra_addr) - ifafree(&rap->ra_addr->ia_ifa); + if (rap->ra_addr != NULL) + ria6 = rap->ra_addr; /* Reference held in in6ifa_ifpwithaddr() */ rap->ra_addr = ia6; ia6->ia6_ifpr = rp2ifpr(rpp); + IFA_UNLOCK(&ia6->ia_ifa); + if (ria6 != NULL) + IFA_REMREF(&ria6->ia_ifa); return; } if (ia6->ia6_ifpr == rp2ifpr(rpp)) { - if (rap->ra_addr) - ifafree(&rap->ra_addr->ia_ifa); + if (rap->ra_addr != NULL) + ria6 = rap->ra_addr; /* Reference held in in6ifa_ifpwithaddr() */ rap->ra_addr = ia6; + IFA_UNLOCK(&ia6->ia_ifa); + if (ria6 != NULL) + IFA_REMREF(&ria6->ia_ifa); return; } /* @@ -697,7 +748,8 @@ add_each_addr(struct socket *so, struct rr_prefix *rpp, struct rp_addr *rap) ip6_sprintf(&ifra.ifra_addr.sin6_addr), rpp->rp_plen, ip6_sprintf(IA6_IN6(ia6)), in6_mask2len(&ia6->ia_prefixmask.sin6_addr, NULL)); - ifafree(&ia6->ia_ifa); + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); return; } /* propagate ANYCAST flag if it is set for ancestor addr */ @@ -803,8 +855,10 @@ rrpr_update(struct socket *so, struct rr_prefix *new) LIST_REMOVE(rap, ra_entry); if (search_ifidwithprefix(rpp, &rap->ra_ifid) != NULL) { - if (rap->ra_addr) - ifafree(&rap->ra_addr->ia_ifa); + if (rap->ra_addr) { + IFA_REMREF(&rap->ra_addr->ia_ifa); + rap->ra_addr = NULL; + } FREE(rap, M_RR_ADDR); continue; } @@ -870,11 +924,14 @@ rrpr_update(struct socket *so, struct rr_prefix *new) * init the prefix pointer. 
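The add_each_addr() hunks above remember the displaced in6_ifaddr and defer its IFA_REMREF until after IFA_UNLOCK, so the final release never runs under the address lock. The same shape in miniature, with hypothetical obj_rele()/replace_ref() helpers standing in for the kernel primitives:

#include <pthread.h>
#include <stdlib.h>

struct obj { int refcnt; };

static void
obj_rele(struct obj *o)
{
	if (--o->refcnt == 0)
		free(o);
}

static void
replace_ref(pthread_mutex_t *lock, struct obj **slot, struct obj *newobj)
{
	struct obj *old;

	pthread_mutex_lock(lock);
	old = *slot;		/* remember the displaced reference */
	*slot = newobj;		/* caller's reference moves into the slot */
	pthread_mutex_unlock(lock);
	if (old != NULL)
		obj_rele(old);	/* drop it only after unlocking */
}

int
main(void)
{
	pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	struct obj *a = calloc(1, sizeof (*a)), *b = calloc(1, sizeof (*b));
	struct obj *slot = NULL;

	a->refcnt = b->refcnt = 1;
	replace_ref(&lock, &slot, a);
	replace_ref(&lock, &slot, b);	/* releases a */
	obj_rele(slot);			/* releases b */
	return (0);
}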
*/ lck_mtx_lock(prefix6_mutex); - LIST_FOREACH(rap, &rpp->rp_addrhead, ra_entry) - { - if (rap->ra_addr != NULL) { - if (rap->ra_addr->ia6_ifpr == NULL) - rap->ra_addr->ia6_ifpr = rp2ifpr(rpp); + LIST_FOREACH(rap, &rpp->rp_addrhead, ra_entry) { + struct in6_ifaddr *ia6; + + if ((ia6 = rap->ra_addr) != NULL) { + IFA_LOCK(&ia6->ia_ifa); + if (ia6->ia6_ifpr == NULL) + ia6->ia6_ifpr = rp2ifpr(rpp); + IFA_UNLOCK(&ia6->ia_ifa); continue; } add_each_addr(so, rpp, rap); @@ -967,13 +1024,20 @@ init_newprefix(struct in6_rrenumreq *irr, struct ifprefix *ifpr, { struct rp_addr *rap; int error = 0; + struct in6_ifaddr *ia6; if ((error = create_ra_entry(&rap)) != 0) return error; rap->ra_ifid = orap->ra_ifid; - rap->ra_flags.anycast = (orap->ra_addr != NULL && - (orap->ra_addr->ia6_flags & - IN6_IFF_ANYCAST) != 0) ? 1 : 0; + ia6 = orap->ra_addr->ia_ifa; + if (ia6 != NULL) { + IFA_LOCK(&ia6->ia_ifa); + rap->ra_flags.anycast = + ((ia6->ia6_flags & IN6_IFF_ANYCAST) != 0) ? 1 : 0; + IFA_UNLOCK(&ia6->ia_ifa); + } else { + rap->ra_flags.anycast = 0; + } LIST_INSERT_HEAD(&rpp->rp_addrhead, rap, ra_entry); } rpp->rp_vltime = irr->irr_vltime; @@ -1005,8 +1069,10 @@ free_rp_entries(struct rr_prefix *rpp) rap = LIST_FIRST(&rpp->rp_addrhead); LIST_REMOVE(rap, ra_entry); - if (rap->ra_addr) - ifafree(&rap->ra_addr->ia_ifa); + if (rap->ra_addr) { + IFA_REMREF(&rap->ra_addr->ia_ifa); + rap->ra_addr = NULL; + } FREE(rap, M_RR_ADDR); } lck_mtx_unlock(prefix6_mutex); @@ -1054,10 +1120,14 @@ unprefer_prefix(struct rr_prefix *rpp) lck_mtx_lock(prefix6_mutex); for (rap = rpp->rp_addrhead.lh_first; rap != NULL; rap = rap->ra_entry.le_next) { - if (rap->ra_addr == NULL) + struct in6_ifaddr *ia6; + + if ((ia6 = rap->ra_addr) == NULL) continue; - rap->ra_addr->ia6_lifetime.ia6t_preferred = timenow.tv_sec; - rap->ra_addr->ia6_lifetime.ia6t_pltime = 0; + IFA_LOCK(&ia6->ia_ifa); + ia6->ia6_lifetime.ia6t_preferred = timenow.tv_sec; + ia6->ia6_lifetime.ia6t_pltime = 0; + IFA_UNLOCK(&ia6->ia_ifa); } lck_mtx_unlock(prefix6_mutex); @@ -1074,20 +1144,24 @@ delete_each_prefix(struct rr_prefix *rpp, u_char origin) lck_mtx_lock(prefix6_mutex); while (rpp->rp_addrhead.lh_first != NULL) { struct rp_addr *rap; + struct in6_ifaddr *ia6; rap = LIST_FIRST(&rpp->rp_addrhead); if (rap == NULL) { break; } LIST_REMOVE(rap, ra_entry); - if (rap->ra_addr == NULL) { + if ((ia6 = rap->ra_addr) == NULL) { FREE(rap, M_RR_ADDR); continue; } - rap->ra_addr->ia6_ifpr = NULL; + rap->ra_addr = NULL; + IFA_LOCK(&ia6->ia_ifa); + ia6->ia6_ifpr = NULL; + IFA_UNLOCK(&ia6->ia_ifa); - in6_purgeaddr(&rap->ra_addr->ia_ifa, 0); - ifafree(&rap->ra_addr->ia_ifa); + in6_purgeaddr(&ia6->ia_ifa, 0); + IFA_REMREF(&ia6->ia_ifa); FREE(rap, M_RR_ADDR); } rp_remove(rpp); @@ -1122,6 +1196,8 @@ link_stray_ia6s(struct rr_prefix *rpp) { struct ifaddr *ifa; + ifnet_lock_assert(rpp->rp_ifp, IFNET_LCK_ASSERT_OWNED); + for (ifa = rpp->rp_ifp->if_addrlist.tqh_first; ifa; ifa = ifa->ifa_list.tqe_next) { @@ -1129,11 +1205,15 @@ link_stray_ia6s(struct rr_prefix *rpp) struct rr_prefix *orpp; int error = 0; - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; - if (rpp->rp_plen > in6_matchlen(RP_IN6(rpp), IFA_IN6(ifa))) + } + if (rpp->rp_plen > in6_matchlen(RP_IN6(rpp), IFA_IN6(ifa))) { + IFA_UNLOCK(ifa); continue; - + } orpp = ifpr2rp(((struct in6_ifaddr *)ifa)->ia6_ifpr); if (orpp != NULL) { if (!in6_are_prefix_equal(RP_IN6(orpp), RP_IN6(rpp), @@ -1144,8 +1224,10 @@ link_stray_ia6s(struct rr_prefix *rpp) 
ip6_sprintf(IFA_IN6(ifa)), orpp->rp_plen, ip6_sprintf(RP_IN6(rpp)), rpp->rp_plen); + IFA_UNLOCK(ifa); continue; } + IFA_UNLOCK(ifa); if ((error = assign_ra_entry(rpp, (sizeof(rap->ra_ifid) << 3) - rpp->rp_plen, @@ -1237,23 +1319,28 @@ in6_prefix_ioctl(struct socket *so, u_long cmd, caddr_t data, rp_tmp.rp_origin = ipr->ipr_origin; /* create rp_addr entries, usually at least for lladdr */ + ifnet_lock_shared(ifp); if ((error = link_stray_ia6s(&rp_tmp)) != 0) { + ifnet_lock_done(ifp); free_rp_entries(&rp_tmp); break; } - ifnet_lock_exclusive(ifp); for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = ifa->ifa_list.tqe_next) { - if (ifa->ifa_addr == NULL) - continue; /* just for safety */ - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; - if (IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa)) == 0) + } + if (IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa)) == 0) { + IFA_UNLOCK(ifa); continue; - + } if ((error = create_ra_entry(&rap)) != 0) { + IFA_UNLOCK(ifa); + ifnet_lock_done(ifp); free_rp_entries(&rp_tmp); goto bad; } @@ -1264,6 +1351,7 @@ in6_prefix_ioctl(struct socket *so, u_long cmd, caddr_t data, sizeof(*IFA_IN6(ifa)) << 3, rp_tmp.rp_plen, (sizeof(rap->ra_ifid) << 3) - rp_tmp.rp_plen); + IFA_UNLOCK(ifa); /* insert into list */ lck_mtx_lock(prefix6_mutex); LIST_INSERT_HEAD(&rp_tmp.rp_addrhead, rap, ra_entry); @@ -1292,30 +1380,3 @@ in6_prefix_ioctl(struct socket *so, u_long cmd, caddr_t data, } #endif -void -in6_rr_timer(__unused void *ignored_arg) -{ - struct rr_prefix *rpp; - struct timeval timenow; - - getmicrotime(&timenow); - - /* expire */ - lck_mtx_lock(prefix6_mutex); - rpp = LIST_FIRST(&rr_prefix); - while (rpp) { - if (rpp->rp_expire && rpp->rp_expire < timenow.tv_sec) { - struct rr_prefix *next_rpp; - - next_rpp = LIST_NEXT(rpp, rp_entry); - delete_each_prefix(rpp, PR_ORIG_KERNEL); - rpp = next_rpp; - continue; - } - if (rpp->rp_preferred && rpp->rp_preferred < timenow.tv_sec) - unprefer_prefix(rpp); - rpp = LIST_NEXT(rpp, rp_entry); - } - lck_mtx_unlock(prefix6_mutex); - timeout(in6_rr_timer, (caddr_t)0, ip6_rr_prune * hz); -} diff --git a/bsd/netinet6/in6_prefix.h b/bsd/netinet6/in6_prefix.h index f69562ae4..fa3567676 100644 --- a/bsd/netinet6/in6_prefix.h +++ b/bsd/netinet6/in6_prefix.h @@ -85,7 +85,6 @@ LIST_HEAD(rr_prhead, rr_prefix); extern struct rr_prhead rr_prefix; -void in6_rr_timer(void *); int delete_each_prefix (struct rr_prefix *rpp, u_char origin); #endif /* KERNEL_PRIVATE */ diff --git a/bsd/netinet6/in6_proto.c b/bsd/netinet6/in6_proto.c index b7dbae799..c0228feeb 100644 --- a/bsd/netinet6/in6_proto.c +++ b/bsd/netinet6/in6_proto.c @@ -129,6 +129,7 @@ #include <netinet6/pim6_var.h> #include <netinet6/nd6.h> #include <netinet6/in6_prefix.h> +#include <netinet6/mld6_var.h> #include <netinet6/ip6_mroute.h> @@ -217,7 +218,7 @@ struct ip6protosw inet6sw[] = { { SOCK_RAW, &inet6domain, IPPROTO_ICMPV6, PR_ATOMIC|PR_ADDR|PR_LASTHDR, icmp6_input, rip6_pr_output, rip6_ctlinput, rip6_ctloutput, 0, - icmp6_init, icmp6_fasttimo, 0, 0, + icmp6_init, 0, mld_slowtimo, 0, 0, &rip6_usrreqs, 0, rip_unlock, 0, @@ -226,7 +227,7 @@ struct ip6protosw inet6sw[] = { { SOCK_DGRAM, &inet6domain, IPPROTO_ICMPV6, PR_ATOMIC|PR_ADDR|PR_LASTHDR, icmp6_input, rip6_pr_output, rip6_ctlinput, icmp6_dgram_ctloutput, 0, - icmp6_init, icmp6_fasttimo, 0, 0, + icmp6_init, 0, mld_slowtimo, 0, 0, &icmp6_dgram_usrreqs, 0, rip_unlock, 0, @@ -398,7 +399,7 @@ int ip6_accept_rtadv = 0; /* "IPV6FORWARDING ? 
0 : 1" is dangerous */ int ip6_maxfragpackets; /* initialized in frag6.c:frag6_init() */ int ip6_maxfrags; int ip6_log_interval = 5; -int ip6_hdrnestlimit = 50; /* appropriate? */ +int ip6_hdrnestlimit = 15; /* How many header options will we process? */ int ip6_dad_count = 1; /* DupAddrDetectionTransmits */ u_int32_t ip6_flow_seq; int ip6_auto_flowlabel = 1; @@ -406,16 +407,19 @@ int ip6_gif_hlim = 0; int ip6_use_deprecated = 1; /* allow deprecated addr (RFC2462 5.5.4) */ int ip6_rr_prune = 5; /* router renumbering prefix * walk list every 5 sec. */ -int ip6_v6only = 0; /* Mapped addresses on by default - Radar 3347718 */ +int ip6_mcast_pmtu = 0; /* enable pMTU discovery for multicast? */ +int ip6_v6only = 0; /* Mapped addresses off by default - Radar 3347718 -- REVISITING FOR 10.7 -- TESTING WITH MAPPED@ OFF */ int ip6_neighborgcthresh = 1024; /* Threshold # of NDP entries for GC */ int ip6_maxifprefixes = 16; /* Max acceptable prefixes via RA per IF */ int ip6_maxifdefrouters = 16; /* Max acceptable def routers via RA */ int ip6_maxdynroutes = 1024; /* Max # of routes created via redirect */ +int ip6_only_allow_rfc4193_prefix = 0; /* Only allow RFC4193 style Unique Local IPv6 Unicast prefixes */ u_int32_t ip6_id = 0UL; int ip6_keepfaith = 0; time_t ip6_log_time = (time_t)0L; +int nd6_onlink_ns_rfc4861 = 0; /* allow 'on-link' nd6 NS (as in RFC 4861) */ /* icmp6 */ /* @@ -450,7 +454,7 @@ int udp6_recvspace = 40 * (1024 + sizeof(struct sockaddr_in6)); /* * sysctl related items. */ -SYSCTL_NODE(_net, PF_INET6, inet6, CTLFLAG_RW, 0, +SYSCTL_NODE(_net, PF_INET6, inet6, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Internet6 Family"); /* net.inet6 */ @@ -475,7 +479,8 @@ sysctl_ip6_temppltime SYSCTL_HANDLER_ARGS return (error); old = ip6_temp_preferred_lifetime; error = SYSCTL_IN(req, arg1, sizeof(int)); - if (ip6_temp_preferred_lifetime < + if (ip6_temp_preferred_lifetime > ND6_MAX_LIFETIME || + ip6_temp_preferred_lifetime < ip6_desync_factor + ip6_temp_regen_advance) { ip6_temp_preferred_lifetime = old; return(EINVAL); @@ -495,7 +500,8 @@ sysctl_ip6_tempvltime SYSCTL_HANDLER_ARGS return (error); old = ip6_temp_valid_lifetime; error = SYSCTL_IN(req, arg1, sizeof(int)); - if (ip6_temp_valid_lifetime < ip6_temp_preferred_lifetime) { + if (ip6_temp_valid_lifetime > ND6_MAX_LIFETIME || + ip6_temp_valid_lifetime < ip6_temp_preferred_lifetime) { ip6_temp_preferred_lifetime = old; return(EINVAL); } @@ -503,90 +509,103 @@ sysctl_ip6_tempvltime SYSCTL_HANDLER_ARGS } SYSCTL_INT(_net_inet6_ip6, IPV6CTL_FORWARDING, - forwarding, CTLFLAG_RW, &ip6_forwarding, 0, ""); + forwarding, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_forwarding, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_SENDREDIRECTS, - redirect, CTLFLAG_RW, &ip6_sendredirects, 0, ""); + redirect, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_sendredirects, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFHLIM, - hlim, CTLFLAG_RW, &ip6_defhlim, 0, ""); -SYSCTL_STRUCT(_net_inet6_ip6, IPV6CTL_STATS, stats, CTLFLAG_RD, + hlim, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_defhlim, 0, ""); +SYSCTL_STRUCT(_net_inet6_ip6, IPV6CTL_STATS, stats, CTLFLAG_RD | CTLFLAG_LOCKED, &ip6stat, ip6stat, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, - maxfragpackets, CTLFLAG_RW, &ip6_maxfragpackets, 0, ""); + maxfragpackets, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_maxfragpackets, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, - maxfrags, CTLFLAG_RW, &ip6_maxfrags, 0, ""); + maxfrags, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_maxfrags, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_ACCEPT_RTADV, - accept_rtadv, 
CTLFLAG_RW, &ip6_accept_rtadv, 0, ""); + accept_rtadv, CTLFLAG_RW | CTLFLAG_LOCKED, + &ip6_accept_rtadv, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_KEEPFAITH, - keepfaith, CTLFLAG_RW, &ip6_keepfaith, 0, ""); + keepfaith, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_keepfaith, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_LOG_INTERVAL, - log_interval, CTLFLAG_RW, &ip6_log_interval, 0, ""); + log_interval, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_log_interval, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_HDRNESTLIMIT, - hdrnestlimit, CTLFLAG_RW, &ip6_hdrnestlimit, 0, ""); + hdrnestlimit, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_hdrnestlimit, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DAD_COUNT, - dad_count, CTLFLAG_RW, &ip6_dad_count, 0, ""); + dad_count, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_dad_count, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_FLOWLABEL, - auto_flowlabel, CTLFLAG_RW, &ip6_auto_flowlabel, 0, ""); + auto_flowlabel, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_auto_flowlabel, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFMCASTHLIM, - defmcasthlim, CTLFLAG_RW, &ip6_defmcasthlim, 0, ""); + defmcasthlim, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_defmcasthlim, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_GIF_HLIM, - gifhlim, CTLFLAG_RW, &ip6_gif_hlim, 0, ""); + gifhlim, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_gif_hlim, 0, ""); SYSCTL_STRING(_net_inet6_ip6, IPV6CTL_KAME_VERSION, - kame_version, CTLFLAG_RD, (void *)((uintptr_t)(__KAME_VERSION)), 0, ""); + kame_version, CTLFLAG_RD | CTLFLAG_LOCKED, (void *)((uintptr_t)(__KAME_VERSION)), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEPRECATED, - use_deprecated, CTLFLAG_RW, &ip6_use_deprecated, 0, ""); + use_deprecated, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_use_deprecated, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RR_PRUNE, - rr_prune, CTLFLAG_RW, &ip6_rr_prune, 0, ""); + rr_prune, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_rr_prune, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USETEMPADDR, - use_tempaddr, CTLFLAG_RW, &ip6_use_tempaddr, 0, ""); + use_tempaddr, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_use_tempaddr, 0, ""); SYSCTL_OID(_net_inet6_ip6, IPV6CTL_TEMPPLTIME, temppltime, - CTLTYPE_INT|CTLFLAG_RW, &ip6_temp_preferred_lifetime, 0, + CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_temp_preferred_lifetime, 0, sysctl_ip6_temppltime, "I", ""); SYSCTL_OID(_net_inet6_ip6, IPV6CTL_TEMPVLTIME, tempvltime, - CTLTYPE_INT|CTLFLAG_RW, &ip6_temp_valid_lifetime, 0, + CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_temp_valid_lifetime, 0, sysctl_ip6_tempvltime, "I", ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_V6ONLY, - v6only, CTLFLAG_RW, &ip6_v6only, 0, ""); + v6only, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_v6only, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_LINKLOCAL, - auto_linklocal, CTLFLAG_RW, &ip6_auto_linklocal, 0, ""); -SYSCTL_STRUCT(_net_inet6_ip6, IPV6CTL_RIP6STATS, rip6stats, CTLFLAG_RD, + auto_linklocal, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_auto_linklocal, 0, ""); +SYSCTL_STRUCT(_net_inet6_ip6, IPV6CTL_RIP6STATS, rip6stats, CTLFLAG_RD | CTLFLAG_LOCKED, &rip6stat, rip6stat, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_PREFER_TEMPADDR, + prefer_tempaddr, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_prefer_tempaddr, 0, ""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEFAULTZONE, + use_defaultzone, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_use_defzone, 0,""); +SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MCAST_PMTU, + mcast_pmtu, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_mcast_pmtu, 0, ""); #if MROUTING -SYSCTL_STRUCT(_net_inet6_ip6, OID_AUTO, mrt6stat, CTLFLAG_RD, +SYSCTL_STRUCT(_net_inet6_ip6, OID_AUTO, mrt6stat, CTLFLAG_RD | CTLFLAG_LOCKED, &mrt6stat, 
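All of these OIDs gain CTLFLAG_LOCKED, which appears to mark each handler as doing its own synchronization; the user-visible names are unchanged. For instance, the header-nesting limit lowered to 15 above should still be readable from userland with sysctlbyname(3), along these lines:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int limit;
	size_t len = sizeof (limit);

	/* OID name composed from the SYSCTL_INT entry in this hunk */
	if (sysctlbyname("net.inet6.ip6.hdrnestlimit", &limit, &len,
	    NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("ip6_hdrnestlimit = %d\n", limit);
	return (0);
}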
mrt6stat, ""); #endif SYSCTL_INT(_net_inet6_ip6, IPV6CTL_NEIGHBORGCTHRESH, - neighborgcthresh, CTLFLAG_RW, &ip6_neighborgcthresh, 0, ""); + neighborgcthresh, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_neighborgcthresh, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXIFPREFIXES, - maxifprefixes, CTLFLAG_RW, &ip6_maxifprefixes, 0, ""); + maxifprefixes, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_maxifprefixes, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXIFDEFROUTERS, - maxifdefrouters, CTLFLAG_RW, &ip6_maxifdefrouters, 0, ""); + maxifdefrouters, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_maxifdefrouters, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXDYNROUTES, - maxdynroutes, CTLFLAG_RW, &ip6_maxdynroutes, 0, ""); - + maxdynroutes, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_maxdynroutes, 0, ""); +SYSCTL_INT(_net_inet6_ip6, OID_AUTO, + only_allow_rfc4193_prefixes, CTLFLAG_RW | CTLFLAG_LOCKED, + &ip6_only_allow_rfc4193_prefix, 0, ""); /* net.inet6.icmp6 */ SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT, - rediraccept, CTLFLAG_RW, &icmp6_rediraccept, 0, ""); + rediraccept, CTLFLAG_RW | CTLFLAG_LOCKED, &icmp6_rediraccept, 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRTIMEOUT, - redirtimeout, CTLFLAG_RW, &icmp6_redirtimeout, 0, ""); -SYSCTL_STRUCT(_net_inet6_icmp6, ICMPV6CTL_STATS, stats, CTLFLAG_RD, + redirtimeout, CTLFLAG_RW | CTLFLAG_LOCKED, &icmp6_redirtimeout, 0, ""); +SYSCTL_STRUCT(_net_inet6_icmp6, ICMPV6CTL_STATS, stats, CTLFLAG_RD | CTLFLAG_LOCKED, &icmp6stat, icmp6stat, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_PRUNE, - nd6_prune, CTLFLAG_RW, &nd6_prune, 0, ""); + nd6_prune, CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_prune, 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DELAY, - nd6_delay, CTLFLAG_RW, &nd6_delay, 0, ""); + nd6_delay, CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_delay, 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_UMAXTRIES, - nd6_umaxtries, CTLFLAG_RW, &nd6_umaxtries, 0, ""); + nd6_umaxtries, CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_umaxtries, 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MMAXTRIES, - nd6_mmaxtries, CTLFLAG_RW, &nd6_mmaxtries, 0, ""); + nd6_mmaxtries, CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_mmaxtries, 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_USELOOPBACK, - nd6_useloopback, CTLFLAG_RW, &nd6_useloopback, 0, ""); + nd6_useloopback, CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_useloopback, 0, ""); +SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_ACCEPT_6TO4, + nd6_accept_6to4, CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_accept_6to4, 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO, - nodeinfo, CTLFLAG_RW, &icmp6_nodeinfo, 0, ""); + nodeinfo, CTLFLAG_RW | CTLFLAG_LOCKED, &icmp6_nodeinfo, 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ERRPPSLIMIT, - errppslimit, CTLFLAG_RW, &icmp6errppslim, 0, ""); + errppslimit, CTLFLAG_RW | CTLFLAG_LOCKED, &icmp6errppslim, 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXNUDHINT, - nd6_maxnudhint, CTLFLAG_RW, &nd6_maxnudhint, 0, ""); + nd6_maxnudhint, CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_maxnudhint, 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DEBUG, - nd6_debug, CTLFLAG_RW, &nd6_debug, 0, ""); - + nd6_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_debug, 0, ""); +SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_ONLINKNSRFC4861, + nd6_onlink_ns_rfc4861, CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_onlink_ns_rfc4861, 0, + "Accept 'on-link' nd6 NS in compliance with RFC 4861."); diff --git a/bsd/netinet6/in6_rmx.c b/bsd/netinet6/in6_rmx.c index d0ad6f2b1..63a66121d 100644 --- a/bsd/netinet6/in6_rmx.c +++ b/bsd/netinet6/in6_rmx.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2009 Apple Inc. 
All rights reserved. + * Copyright (c) 2003-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -108,8 +108,10 @@ #include <kern/queue.h> #include <sys/socket.h> #include <sys/socketvar.h> +#include <sys/protosw.h> #include <sys/mbuf.h> #include <sys/syslog.h> +#include <sys/mcache.h> #include <kern/lock.h> #include <net/if.h> @@ -130,8 +132,8 @@ extern int in6_inithead(void **head, int off); static void in6_rtqtimo(void *rock); -static void in6_mtutimo(void *rock); -extern int tvtohz(struct timeval *); +static void in6_mtutimo(void *rock); +extern int tvtohz(struct timeval *); static struct radix_node *in6_matroute_args(void *, struct radix_node_head *, rn_matchf_t *, void *); @@ -195,11 +197,13 @@ in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, * should elaborate the code. */ if (rt->rt_flags & RTF_HOST) { + IFA_LOCK_SPIN(rt->rt_ifa); if (IN6_ARE_ADDR_EQUAL(&satosin6(rt->rt_ifa->ifa_addr) ->sin6_addr, &sin6->sin6_addr)) { rt->rt_flags |= RTF_LOCAL; } + IFA_UNLOCK(rt->rt_ifa); } if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU) @@ -214,8 +218,8 @@ in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, * Find out if it is because of an * ARP entry and delete it if so. */ - rt2 = rtalloc1_locked((struct sockaddr *)sin6, 0, - RTF_CLONING | RTF_PRCLONING); + rt2 = rtalloc1_scoped_locked((struct sockaddr *)sin6, 0, + RTF_CLONING | RTF_PRCLONING, sin6_get_ifscope(rt_key(rt))); if (rt2) { RT_LOCK(rt2); if ((rt2->rt_flags & RTF_LLINFO) && @@ -253,8 +257,8 @@ in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, * net route entry, 3ffe:0501:: -> if0. * This case should not raise an error. */ - rt2 = rtalloc1_locked((struct sockaddr *)sin6, 0, - RTF_CLONING | RTF_PRCLONING); + rt2 = rtalloc1_scoped_locked((struct sockaddr *)sin6, 0, + RTF_CLONING | RTF_PRCLONING, sin6_get_ifscope(rt_key(rt))); if (rt2) { RT_LOCK(rt2); if ((rt2->rt_flags & (RTF_CLONING|RTF_HOST|RTF_GATEWAY)) @@ -294,6 +298,24 @@ in6_deleteroute(void * v_arg, void *netmask_arg, struct radix_node_head *head) return (rn); } +/* + * Validate (unexpire) an expiring AF_INET6 route. + */ +struct radix_node * +in6_validate(struct radix_node *rn) +{ + struct rtentry *rt = (struct rtentry *)rn; + + RT_LOCK_ASSERT_HELD(rt); + + /* This is first reference? */ + if (rt->rt_refcnt == 0 && (rt->rt_flags & RTPRF_OURS)) { + rt->rt_flags &= ~RTPRF_OURS; + rt_setexpire(rt, 0); + } + return (rn); +} + /* * Similar to in6_matroute_args except without the leaf-matching parameters. */ @@ -313,16 +335,11 @@ in6_matroute_args(void *v_arg, struct radix_node_head *head, rn_matchf_t *f, void *w) { struct radix_node *rn = rn_match_args(v_arg, head, f, w); - struct rtentry *rt = (struct rtentry *)rn; - /* This is first reference? 
*/ - if (rt != NULL) { - RT_LOCK_SPIN(rt); - if (rt->rt_refcnt == 0 && (rt->rt_flags & RTPRF_OURS)) { - rt->rt_flags &= ~RTPRF_OURS; - rt->rt_rmx.rmx_expire = 0; - } - RT_UNLOCK(rt); + if (rn != NULL) { + RT_LOCK_SPIN((struct rtentry *)rn); + in6_validate(rn); + RT_UNLOCK((struct rtentry *)rn); } return (rn); } @@ -332,17 +349,17 @@ SYSCTL_DECL(_net_inet6_ip6); static int rtq_reallyold = 60*60; /* one hour is ``really old'' */ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTEXPIRE, rtexpire, - CTLFLAG_RW, &rtq_reallyold , 0, ""); + CTLFLAG_RW | CTLFLAG_LOCKED, &rtq_reallyold , 0, ""); static int rtq_minreallyold = 10; /* never automatically crank down to less */ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTMINEXPIRE, rtminexpire, - CTLFLAG_RW, &rtq_minreallyold , 0, ""); + CTLFLAG_RW | CTLFLAG_LOCKED, &rtq_minreallyold , 0, ""); static int rtq_toomany = 128; /* 128 cached routes is ``too many'' */ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTMAXCACHE, rtmaxcache, - CTLFLAG_RW, &rtq_toomany , 0, ""); + CTLFLAG_RW | CTLFLAG_LOCKED, &rtq_toomany , 0, ""); /* @@ -394,12 +411,12 @@ in6_clsroute(struct radix_node *rn, __unused struct radix_node_head *head) RT_LOCK(rt); } } else { - struct timeval timenow; + uint64_t timenow; - getmicrotime(&timenow); + timenow = net_uptime(); rt->rt_flags |= RTPRF_OURS; - rt->rt_rmx.rmx_expire = - rt_expiry(rt, timenow.tv_sec, rtq_reallyold); + rt_setexpire(rt, + rt_expiry(rt, timenow, rtq_reallyold)); } } @@ -410,7 +427,7 @@ struct rtqk_arg { int draining; int killed; int found; - time_t nextstop; + uint64_t nextstop; }; /* @@ -426,16 +443,17 @@ in6_rtqkill(struct radix_node *rn, void *rock) struct rtqk_arg *ap = rock; struct rtentry *rt = (struct rtentry *)rn; int err; - struct timeval timenow; + uint64_t timenow; - getmicrotime(&timenow); + timenow = net_uptime(); lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); RT_LOCK(rt); if (rt->rt_flags & RTPRF_OURS) { ap->found++; - - if (ap->draining || rt->rt_rmx.rmx_expire <= timenow.tv_sec || + VERIFY(rt->rt_expire == 0 || rt->rt_rmx.rmx_expire != 0); + VERIFY(rt->rt_expire != 0 || rt->rt_rmx.rmx_expire == 0); + if (ap->draining || rt->rt_expire <= timenow || ((rt->rt_flags & RTF_DYNAMIC) != 0 && ip6_maxdynroutes >= 0 && in6dynroutes > ip6_maxdynroutes / 2)) { @@ -461,13 +479,13 @@ in6_rtqkill(struct radix_node *rn, void *rock) } } else { if (ap->updating && - (unsigned)(rt->rt_rmx.rmx_expire - timenow.tv_sec) > + (rt->rt_expire - timenow) > rt_expiry(rt, 0, rtq_reallyold)) { - rt->rt_rmx.rmx_expire = rt_expiry(rt, - timenow.tv_sec, rtq_reallyold); + rt_setexpire(rt, rt_expiry(rt, + timenow, rtq_reallyold)); } ap->nextstop = lmin(ap->nextstop, - rt->rt_rmx.rmx_expire); + rt->rt_expire); RT_UNLOCK(rt); } } else { @@ -486,16 +504,16 @@ in6_rtqtimo(void *rock) struct radix_node_head *rnh = rock; struct rtqk_arg arg; struct timeval atv; - static time_t last_adjusted_timeout = 0; - struct timeval timenow; + static uint64_t last_adjusted_timeout = 0; + uint64_t timenow; lck_mtx_lock(rnh_lock); /* Get the timestamp after we acquire the lock for better accuracy */ - getmicrotime(&timenow); + timenow = net_uptime(); arg.found = arg.killed = 0; arg.rnh = rnh; - arg.nextstop = timenow.tv_sec + rtq_timeout; + arg.nextstop = timenow + rtq_timeout; arg.draining = arg.updating = 0; rnh->rnh_walktree(rnh, in6_rtqkill, &arg); @@ -508,14 +526,14 @@ in6_rtqtimo(void *rock) * hard. 
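These in6_rmx.c hunks move route aging from getmicrotime() wall time to net_uptime(), a monotonic seconds counter, so a step of the wall clock can no longer mass-expire cached routes or keep them alive past their intended lifetime. A userland sketch of the same idea, assuming POSIX clock_gettime(CLOCK_MONOTONIC) in place of net_uptime():

#include <stdio.h>
#include <stdint.h>
#include <time.h>

static uint64_t
uptime_sec(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((uint64_t)ts.tv_sec);
}

int
main(void)
{
	/* one hour, the rtq_reallyold default in the hunk above */
	uint64_t expire = uptime_sec() + 60 * 60;

	printf("expires %s\n", uptime_sec() >= expire ? "now" : "later");
	return (0);
}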
*/ if ((arg.found - arg.killed > rtq_toomany) - && (timenow.tv_sec - last_adjusted_timeout >= rtq_timeout) + && ((timenow - last_adjusted_timeout) >= (uint64_t)rtq_timeout) && rtq_reallyold > rtq_minreallyold) { rtq_reallyold = 2*rtq_reallyold / 3; if (rtq_reallyold < rtq_minreallyold) { rtq_reallyold = rtq_minreallyold; } - last_adjusted_timeout = timenow.tv_sec; + last_adjusted_timeout = timenow; #if DIAGNOSTIC log(LOG_DEBUG, "in6_rtqtimo: adjusted rtq_reallyold to %d", rtq_reallyold); @@ -526,7 +544,7 @@ in6_rtqtimo(void *rock) } atv.tv_usec = 0; - atv.tv_sec = arg.nextstop - timenow.tv_sec; + atv.tv_sec = arg.nextstop - timenow; lck_mtx_unlock(rnh_lock); timeout(in6_rtqtimo, rock, tvtohz(&atv)); } @@ -536,7 +554,7 @@ in6_rtqtimo(void *rock) */ struct mtuex_arg { struct radix_node_head *rnh; - time_t nextstop; + uint64_t nextstop; }; static int @@ -544,21 +562,23 @@ in6_mtuexpire(struct radix_node *rn, void *rock) { struct rtentry *rt = (struct rtentry *)rn; struct mtuex_arg *ap = rock; - struct timeval timenow; + uint64_t timenow; - getmicrotime(&timenow); + timenow = net_uptime(); /* sanity */ if (!rt) panic("rt == NULL in in6_mtuexpire"); RT_LOCK(rt); - if (rt->rt_rmx.rmx_expire && !(rt->rt_flags & RTF_PROBEMTU)) { - if (rt->rt_rmx.rmx_expire <= timenow.tv_sec) { + VERIFY(rt->rt_expire == 0 || rt->rt_rmx.rmx_expire != 0); + VERIFY(rt->rt_expire != 0 || rt->rt_rmx.rmx_expire == 0); + if (rt->rt_expire && !(rt->rt_flags & RTF_PROBEMTU)) { + if (rt->rt_expire <= timenow) { rt->rt_flags |= RTF_PROBEMTU; } else { ap->nextstop = lmin(ap->nextstop, - rt->rt_rmx.rmx_expire); + rt->rt_expire); } } RT_UNLOCK(rt); @@ -574,24 +594,24 @@ in6_mtutimo(void *rock) struct radix_node_head *rnh = rock; struct mtuex_arg arg; struct timeval atv; - struct timeval timenow; + uint64_t timenow, timo; - getmicrotime(&timenow); + timenow = net_uptime(); arg.rnh = rnh; - arg.nextstop = timenow.tv_sec + MTUTIMO_DEFAULT; + arg.nextstop = timenow + MTUTIMO_DEFAULT; lck_mtx_lock(rnh_lock); rnh->rnh_walktree(rnh, in6_mtuexpire, &arg); atv.tv_usec = 0; - atv.tv_sec = arg.nextstop; - if (atv.tv_sec < timenow.tv_sec) { + timo = arg.nextstop; + if (timo < timenow) { #if DIAGNOSTIC log(LOG_DEBUG, "IPv6: invalid mtu expiration time on routing table\n"); #endif - arg.nextstop = timenow.tv_sec + 30; /*last resort*/ + arg.nextstop = timenow + 30; /*last resort*/ } - atv.tv_sec -= timenow.tv_sec; + atv.tv_sec = timo - timenow; lck_mtx_unlock(rnh_lock); timeout(in6_mtutimo, rock, tvtohz(&atv)); } diff --git a/bsd/netinet6/in6_src.c b/bsd/netinet6/in6_src.c index 71441847b..1eb5cd60f 100644 --- a/bsd/netinet6/in6_src.c +++ b/bsd/netinet6/in6_src.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -104,9 +104,13 @@ #include <sys/errno.h> #include <sys/time.h> #include <sys/proc.h> +#include <sys/sysctl.h> +#include <sys/kauth.h> +#include <sys/priv.h> #include <kern/lock.h> #include <net/if.h> +#include <net/if_types.h> #include <net/route.h> #include <netinet/in.h> @@ -118,273 +122,950 @@ #include <netinet/ip6.h> #include <netinet6/in6_pcb.h> #include <netinet6/ip6_var.h> +#include <netinet6/scope6_var.h> #include <netinet6/nd6.h> -#if ENABLE_DEFAULT_SCOPE -#include <netinet6/scope6_var.h> -#endif #include <net/net_osdep.h> #include "loop.h" +SYSCTL_DECL(_net_inet6_ip6); + +static int ip6_select_srcif_debug = 0; +SYSCTL_INT(_net_inet6_ip6, OID_AUTO, select_srcif_debug, + CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_select_srcif_debug, 0, + "log source interface selection debug info"); + +#define ADDR_LABEL_NOTAPP (-1) +struct in6_addrpolicy defaultaddrpolicy; + +int ip6_prefer_tempaddr = 1; +#ifdef ENABLE_ADDRSEL +extern lck_mtx_t *addrsel_mutex; +#define ADDRSEL_LOCK() lck_mtx_lock(addrsel_mutex) +#define ADDRSEL_UNLOCK() lck_mtx_unlock(addrsel_mutex) +#else +#define ADDRSEL_LOCK() +#define ADDRSEL_UNLOCK() +#endif + +static int selectroute(struct sockaddr_in6 *, struct sockaddr_in6 *, + struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *, + struct ifnet **, struct rtentry **, int, int, unsigned int, + unsigned int); +static int in6_selectif(struct sockaddr_in6 *, struct ip6_pktopts *, + struct ip6_moptions *, struct route_in6 *ro, unsigned int, + unsigned int, struct ifnet **); +static void init_policy_queue(void); +static int add_addrsel_policyent(const struct in6_addrpolicy *); +#ifdef ENABLE_ADDRSEL +static int delete_addrsel_policyent(const struct in6_addrpolicy *); +#endif +static int walk_addrsel_policy(int (*)(const struct in6_addrpolicy *, void *), + void *); +static int dump_addrsel_policyent(const struct in6_addrpolicy *, void *); +static struct in6_addrpolicy *match_addrsel_policy(struct sockaddr_in6 *); +void addrsel_policy_init(void); + /* * Return an IPv6 address, which is the most appropriate for a given * destination and user specified options. * If necessary, this function lookups the routing table and returns * an entry to the caller for later use. 
*/ +#define REPLACE(r) do {\ + if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \ + sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ + ip6stat.ip6s_sources_rule[(r)]++; \ + goto replace; \ +} while(0) +#define NEXTSRC(r) do {\ + if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \ + sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ + ip6stat.ip6s_sources_rule[(r)]++; \ + goto next; /* XXX: we can't use 'continue' here */ \ +} while(0) +#define BREAK(r) do { \ + if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \ + sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \ + ip6stat.ip6s_sources_rule[(r)]++; \ + goto out; /* XXX: we can't use 'break' here */ \ +} while(0) + struct in6_addr * -in6_selectsrc( - struct sockaddr_in6 *dstsock, - struct ip6_pktopts *opts, - struct ip6_moptions *mopts, - struct route_in6 *ro, - struct in6_addr *laddr, - struct in6_addr *src_storage, - int *errorp) +in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, + struct inpcb *inp, struct route_in6 *ro, + struct ifnet **ifpp, struct in6_addr *src_storage, unsigned int ifscope, + int *errorp) { - struct in6_addr *dst; - struct in6_ifaddr *ia6 = 0; + struct in6_addr dst; + struct ifnet *ifp = NULL; + struct in6_ifaddr *ia = NULL, *ia_best = NULL; struct in6_pktinfo *pi = NULL; + int dst_scope = -1, best_scope = -1, best_matchlen = -1; + struct in6_addrpolicy *dst_policy = NULL, *best_policy = NULL; + u_int32_t odstzone; + int prefer_tempaddr; + struct ip6_moptions *mopts; + struct timeval timenow; + unsigned int nocell; + boolean_t islocal = FALSE; - dst = &dstsock->sin6_addr; + getmicrotime(&timenow); + + dst = dstsock->sin6_addr; /* make a copy for local operation */ *errorp = 0; + if (ifpp != NULL) + *ifpp = NULL; + + if (inp != NULL) { + mopts = inp->in6p_moptions; + nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; + } else { + mopts = NULL; + nocell = 0; + } /* * If the source address is explicitly specified by the caller, - * use it. + * check if the requested source address is indeed a unicast address + * assigned to the node, and can be used as the packet's source + * address. If everything is okay, use the address as source. */ if (opts && (pi = opts->ip6po_pktinfo) && - !IN6_IS_ADDR_UNSPECIFIED(&pi->ipi6_addr)) - return(&pi->ipi6_addr); + !IN6_IS_ADDR_UNSPECIFIED(&pi->ipi6_addr)) { + struct sockaddr_in6 srcsock; + struct in6_ifaddr *ia6; + + /* get the outgoing interface */ + if ((*errorp = in6_selectif(dstsock, opts, mopts, ro, ifscope, + nocell, &ifp)) != 0) { + return (NULL); + } + + /* + * determine the appropriate zone id of the source based on + * the zone of the destination and the outgoing interface. + * If the specified address is ambiguous wrt the scope zone, + * the interface must be specified; otherwise, ifa_ifwithaddr() + * will fail matching the address. 
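The REPLACE()/NEXTSRC()/BREAK() macros defined above pair a bounds-checked bump of ip6stat.ip6s_sources_rule[] with a goto to the candidate loop's replace/next/out labels. The counting half, extracted into a standalone form (the array name and size here are illustrative):

#include <stdio.h>

static unsigned int sources_rule[16];

#define RULE_STAT(r) do {						\
	if ((unsigned int)(r) <						\
	    sizeof (sources_rule) / sizeof (sources_rule[0]))		\
		sources_rule[(r)]++;					\
} while (0)

int
main(void)
{
	RULE_STAT(2);	/* "prefer appropriate scope" fired */
	RULE_STAT(14);	/* "longest matching prefix" fired */
	printf("rule2=%u rule14=%u\n", sources_rule[2], sources_rule[14]);
	return (0);
}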
+ */ + bzero(&srcsock, sizeof(srcsock)); + srcsock.sin6_family = AF_INET6; + srcsock.sin6_len = sizeof(srcsock); + srcsock.sin6_addr = pi->ipi6_addr; + if (ifp) { + *errorp = in6_setscope(&srcsock.sin6_addr, ifp, NULL); + if (*errorp != 0) { + ifnet_release(ifp); + return (NULL); + } + } + ia6 = (struct in6_ifaddr *)ifa_ifwithaddr((struct sockaddr *)(&srcsock)); + if (ia6 == NULL) { + *errorp = EADDRNOTAVAIL; + if (ifp != NULL) + ifnet_release(ifp); + return (NULL); + } + IFA_LOCK_SPIN(&ia6->ia_ifa); + if ((ia6->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY)) || + (nocell && (ia6->ia_ifa.ifa_ifp->if_type == IFT_CELLULAR))) { + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); + *errorp = EADDRNOTAVAIL; + if (ifp != NULL) + ifnet_release(ifp); + return (NULL); + } + + *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; + IFA_UNLOCK(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); + if (ifpp != NULL) { + /* if ifp is non-NULL, refcnt held in in6_selectif() */ + *ifpp = ifp; + } else if (ifp != NULL) { + ifnet_release(ifp); + } + return (src_storage); + } /* - * If the source address is not specified but the socket(if any) - * is already bound, use the bound address. + * Otherwise, if the socket has already bound the source, just use it. */ - if (laddr && !IN6_IS_ADDR_UNSPECIFIED(laddr)) - return(laddr); + if (inp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) + return (&inp->in6p_laddr); /* - * If the caller doesn't specify the source address but - * the outgoing interface, use an address associated with - * the interface. + * If the address is not specified, choose the best one based on + * the outgoing interface and the destination address. */ - if (pi && pi->ipi6_ifindex) { - ifnet_t out_ifp = NULL; - ifnet_head_lock_shared(); - if (pi->ipi6_ifindex > if_index) { - ifnet_head_done(); - *errorp = EADDRNOTAVAIL; - return(0); - } else { - out_ifp = ifindex2ifnet[pi->ipi6_ifindex]; + + /* get the outgoing interface */ + if ((*errorp = in6_selectif(dstsock, opts, mopts, ro, ifscope, nocell, + &ifp)) != 0) + return (NULL); + +#ifdef DIAGNOSTIC + if (ifp == NULL) /* this should not happen */ + panic("in6_selectsrc: NULL ifp"); +#endif + *errorp = in6_setscope(&dst, ifp, &odstzone); + if (*errorp != 0) { + if (ifp != NULL) + ifnet_release(ifp); + return (NULL); + } + lck_rw_lock_shared(&in6_ifaddr_rwlock); + + for (ia = in6_ifaddrs; ia; ia = ia->ia_next) { + int new_scope = -1, new_matchlen = -1; + struct in6_addrpolicy *new_policy = NULL; + u_int32_t srczone, osrczone, dstzone; + struct in6_addr src; + struct ifnet *ifp1 = ia->ia_ifp; + + IFA_LOCK(&ia->ia_ifa); + /* + * We'll never take an address that breaks the scope zone + * of the destination. We also skip an address if its zone + * does not contain the outgoing interface. + * XXX: we should probably use sin6_scope_id here. 
+ */ + if (in6_setscope(&dst, ifp1, &dstzone) || + odstzone != dstzone) + goto next; + + src = ia->ia_addr.sin6_addr; + if (in6_setscope(&src, ifp, &osrczone) || + in6_setscope(&src, ifp1, &srczone) || + osrczone != srczone) + goto next; + + /* avoid unusable addresses */ + if ((ia->ia6_flags & + (IN6_IFF_NOTREADY | IN6_IFF_ANYCAST | IN6_IFF_DETACHED))) + goto next; + + if (!ip6_use_deprecated && IFA6_IS_DEPRECATED(ia)) + goto next; + + /* Rule 1: Prefer same address */ + if (IN6_ARE_ADDR_EQUAL(&dst, &ia->ia_addr.sin6_addr)) + BREAK(1); /* there should be no better candidate */ + + if (ia_best == NULL) + REPLACE(0); + + /* Rule 2: Prefer appropriate scope */ + if (dst_scope < 0) + dst_scope = in6_addrscope(&dst); + new_scope = in6_addrscope(&ia->ia_addr.sin6_addr); + if (IN6_ARE_SCOPE_CMP(best_scope, new_scope) < 0) { + if (IN6_ARE_SCOPE_CMP(best_scope, dst_scope) < 0) + REPLACE(2); + NEXTSRC(2); + } else if (IN6_ARE_SCOPE_CMP(new_scope, best_scope) < 0) { + if (IN6_ARE_SCOPE_CMP(new_scope, dst_scope) < 0) + NEXTSRC(2); + REPLACE(2); } - ifnet_head_done(); - - /* XXX boundary check is assumed to be already done. */ - ia6 = in6_ifawithscope(out_ifp, dst); - if (ia6 == 0) { - *errorp = EADDRNOTAVAIL; - return(0); + + /* + * Rule 3: Avoid deprecated addresses. Note that the case of + * !ip6_use_deprecated is already rejected above. + */ + if (!IFA6_IS_DEPRECATED(ia_best) && IFA6_IS_DEPRECATED(ia)) + NEXTSRC(3); + if (IFA6_IS_DEPRECATED(ia_best) && !IFA6_IS_DEPRECATED(ia)) + REPLACE(3); + + /* Rule 4: Prefer home addresses */ + /* + * XXX: This is a TODO. We should probably merge the MIP6 + * case above. + */ + + /* Rule 5: Prefer outgoing interface */ + if (ia_best->ia_ifp == ifp && ia->ia_ifp != ifp) + NEXTSRC(5); + if (ia_best->ia_ifp != ifp && ia->ia_ifp == ifp) + REPLACE(5); + + /* + * Rule 6: Prefer matching label + * Note that best_policy should be non-NULL here. + */ + if (dst_policy == NULL) + dst_policy = in6_addrsel_lookup_policy(dstsock); + if (dst_policy->label != ADDR_LABEL_NOTAPP) { + new_policy = in6_addrsel_lookup_policy(&ia->ia_addr); + if (dst_policy->label == best_policy->label && + dst_policy->label != new_policy->label) + NEXTSRC(6); + if (dst_policy->label != best_policy->label && + dst_policy->label == new_policy->label) + REPLACE(6); } - *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; - ifafree(&ia6->ia_ifa); - return src_storage; + + /* + * Rule 7: Prefer public addresses. + * We allow users to reverse the logic by configuring + * a sysctl variable, so that privacy conscious users can + * always prefer temporary addresses. + * Don't use temporary addresses for local destinations or + * for multicast addresses unless we were passed in an option. + */ + if (IN6_IS_ADDR_MULTICAST(&dst) || + in6_matchlen(&ia_best->ia_addr.sin6_addr, &dst) >= + in6_mask2len(&ia_best->ia_prefixmask.sin6_addr, NULL)) + islocal = TRUE; + if (opts == NULL || + opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_SYSTEM) { + prefer_tempaddr = islocal ? 0 : ip6_prefer_tempaddr; + } else if (opts->ip6po_prefer_tempaddr == + IP6PO_TEMPADDR_NOTPREFER) { + prefer_tempaddr = 0; + } else + prefer_tempaddr = 1; + if (!(ia_best->ia6_flags & IN6_IFF_TEMPORARY) && + (ia->ia6_flags & IN6_IFF_TEMPORARY)) { + if (prefer_tempaddr) + REPLACE(7); + else + NEXTSRC(7); + } + if ((ia_best->ia6_flags & IN6_IFF_TEMPORARY) && + !(ia->ia6_flags & IN6_IFF_TEMPORARY)) { + if (prefer_tempaddr) + NEXTSRC(7); + else + REPLACE(7); + } + + /* + * Rule 8: prefer addresses on alive interfaces. + * This is a KAME specific rule. 
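The candidate loop above evaluates the source-selection rules strictly in order; the first rule that distinguishes the two addresses decides whether the new one replaces the current best. Below is a deliberately condensed model of that control flow only, not the real comparisons: scope_rank collapses rule 2's scope arithmetic into a single precomputed score, and rules 4 through 8 are omitted.

#include <stdio.h>

struct cand { int same_as_dst, scope_rank, deprecated, matchlen; };

static int	/* >0: replace best, <0: keep best, 0: fall through */
compare(const struct cand *best, const struct cand *new)
{
	if (new->same_as_dst)				/* rule 1 */
		return (1);
	if (new->scope_rank != best->scope_rank)	/* rule 2 */
		return (new->scope_rank > best->scope_rank ? 1 : -1);
	if (new->deprecated != best->deprecated)	/* rule 3 */
		return (best->deprecated ? 1 : -1);
	if (new->matchlen != best->matchlen)		/* rule 14 */
		return (new->matchlen > best->matchlen ? 1 : -1);
	return (0);					/* keep current */
}

int
main(void)
{
	struct cand best = { 0, 1, 1, 48 }, new = { 0, 1, 0, 40 };

	/* rule 3 fires: the current best is deprecated, replace it */
	printf("%s\n", compare(&best, &new) > 0 ? "replace" : "keep");
	return (0);
}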
+ */ + if ((ia_best->ia_ifp->if_flags & IFF_UP) && + !(ia->ia_ifp->if_flags & IFF_UP)) + NEXTSRC(8); + if (!(ia_best->ia_ifp->if_flags & IFF_UP) && + (ia->ia_ifp->if_flags & IFF_UP)) + REPLACE(8); + + /* + * Rule 14: Use longest matching prefix. + * Note: in the address selection draft, this rule is + * documented as "Rule 8". However, since it is also + * documented that this rule can be overridden, we assign + * a large number so that it is easy to assign smaller numbers + * to more preferred rules. + */ + new_matchlen = in6_matchlen(&ia->ia_addr.sin6_addr, &dst); + if (best_matchlen < new_matchlen) + REPLACE(14); + if (new_matchlen < best_matchlen) + NEXTSRC(14); + + /* Rule 15 is reserved. */ + + /* + * Last resort: just keep the current candidate. + * Or, do we need more rules? + */ + IFA_UNLOCK(&ia->ia_ifa); + continue; + +replace: + best_scope = (new_scope >= 0 ? new_scope : + in6_addrscope(&ia->ia_addr.sin6_addr)); + best_policy = (new_policy ? new_policy : + in6_addrsel_lookup_policy(&ia->ia_addr)); + best_matchlen = (new_matchlen >= 0 ? new_matchlen : + in6_matchlen(&ia->ia_addr.sin6_addr, &dst)); + IFA_ADDREF_LOCKED(&ia->ia_ifa); /* for ia_best */ + IFA_UNLOCK(&ia->ia_ifa); + if (ia_best != NULL) + IFA_REMREF(&ia_best->ia_ifa); + ia_best = ia; + continue; + +next: + IFA_UNLOCK(&ia->ia_ifa); + continue; + +out: + IFA_ADDREF_LOCKED(&ia->ia_ifa); /* for ia_best */ + IFA_UNLOCK(&ia->ia_ifa); + if (ia_best != NULL) + IFA_REMREF(&ia_best->ia_ifa); + ia_best = ia; + break; + } + + lck_rw_done(&in6_ifaddr_rwlock); + + if (nocell && ia_best != NULL && + (ia_best->ia_ifa.ifa_ifp->if_type == IFT_CELLULAR)) { + IFA_REMREF(&ia_best->ia_ifa); + ia_best = NULL; + } + + if ( (ia = ia_best) == NULL) { + *errorp = EADDRNOTAVAIL; + if (ifp != NULL) + ifnet_release(ifp); + return (NULL); } + IFA_LOCK_SPIN(&ia->ia_ifa); + *src_storage = satosin6(&ia->ia_addr)->sin6_addr; + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); + if (ifpp != NULL) { + /* if ifp is non-NULL, refcnt held in in6_selectif() */ + *ifpp = ifp; + } else if (ifp != NULL) { + ifnet_release(ifp); + } + return (src_storage); +} + +/* + * Given a source IPv6 address (and route, if available), determine the best + * interface to send the packet from. Checking for (and updating) the + * ROF_SRCIF_SELECTED flag in the pcb-supplied route placeholder is done + * without any locks, based on the assumption that in the event this is + * called from ip6_output(), the output operation is single-threaded per-pcb, + * i.e. for any given pcb there can only be one thread performing output at + * the IPv6 layer. + * + * This routine is analogous to in_selectsrcif() for IPv4. 
+ * + * clone - meaningful only for bsdi and freebsd + */ +static int +selectroute(struct sockaddr_in6 *srcsock, struct sockaddr_in6 *dstsock, + struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, + struct ifnet **retifp, struct rtentry **retrt, int clone, + int norouteok, unsigned int ifscope, unsigned int nocell) +{ + int error = 0; + struct ifnet *ifp = NULL; + struct route_in6 *route = NULL; + struct sockaddr_in6 *sin6_next; + struct in6_pktinfo *pi = NULL; + struct in6_addr *dst = &dstsock->sin6_addr; + struct ifaddr *ifa = NULL; + char s_src[MAX_IPv6_STR_LEN], s_dst[MAX_IPv6_STR_LEN]; + boolean_t select_srcif; + +#if 0 + char ip6buf[INET6_ADDRSTRLEN]; + + if (dstsock->sin6_addr.s6_addr32[0] == 0 && + dstsock->sin6_addr.s6_addr32[1] == 0 && + !IN6_IS_ADDR_LOOPBACK(&dstsock->sin6_addr)) { + printf("in6_selectroute: strange destination %s\n", + ip6_sprintf(ip6buf, &dstsock->sin6_addr)); + } else { + printf("in6_selectroute: destination = %s%%%d\n", + ip6_sprintf(ip6buf, &dstsock->sin6_addr), + dstsock->sin6_scope_id); /* for debug */ + } +#endif + + if (retifp != NULL) + *retifp = NULL; + + if (retrt != NULL) + *retrt = NULL; + + if (ip6_select_srcif_debug) { + struct in6_addr src; + src = (srcsock != NULL) ? srcsock->sin6_addr : in6addr_any; + (void) inet_ntop(AF_INET6, &src, s_src, sizeof (s_src)); + (void) inet_ntop(AF_INET6, dst, s_dst, sizeof (s_dst)); + } + + /* + * If the destination address is UNSPECIFIED addr, bail out. + */ + if (IN6_IS_ADDR_UNSPECIFIED(dst)) { + error = EHOSTUNREACH; + goto done; + } + + /* + * Perform source interface selection only if Scoped Routing + * is enabled and a source address that isn't unspecified. + */ + select_srcif = (ip6_doscopedroute && srcsock != NULL && + !IN6_IS_ADDR_UNSPECIFIED(&srcsock->sin6_addr)); + /* - * If the destination address is a link-local unicast address or - * a multicast address, and if the outgoing interface is specified - * by the sin6_scope_id filed, use an address associated with the - * interface. - * XXX: We're now trying to define more specific semantics of - * sin6_scope_id field, so this part will be rewritten in - * the near future. + * If Scoped Routing is disabled, ignore the given ifscope. + * Otherwise even if source selection won't be performed, + * we still obey IPV6_BOUND_IF. */ - if ((IN6_IS_ADDR_LINKLOCAL(dst) || IN6_IS_ADDR_MULTICAST(dst)) && - dstsock->sin6_scope_id) { + if (!ip6_doscopedroute && ifscope != IFSCOPE_NONE) + ifscope = IFSCOPE_NONE; + + /* If the caller specified the outgoing interface explicitly, use it */ + if (opts != NULL && (pi = opts->ip6po_pktinfo) != NULL && + pi->ipi6_ifindex != 0) { /* - * I'm not sure if boundary check for scope_id is done - * somewhere... - * - * Since sin6_scope_id is unsigned, we only need to check against if_index. + * If IPV6_PKTINFO takes precedence over IPV6_BOUND_IF. */ - ifnet_t out_ifp = NULL; + ifscope = pi->ipi6_ifindex; ifnet_head_lock_shared(); - if (if_index < dstsock->sin6_scope_id) { - *errorp = ENXIO; /* XXX: better error? */ - ifnet_head_done(); - return(0); + /* ifp may be NULL if detached or out of range */ + ifp = (ifscope <= if_index) ? ifindex2ifnet[ifscope] : NULL; + ifnet_head_done(); + if (norouteok || retrt == NULL || IN6_IS_ADDR_MULTICAST(dst)) { + /* + * We do not have to check or get the route for + * multicast. If the caller didn't ask/care for + * the route and we have no interface to use, + * it's an error. 
+ */ + if (ifp == NULL) + error = EHOSTUNREACH; + goto done; } else { - out_ifp = ifindex2ifnet[dstsock->sin6_scope_id]; + goto getsrcif; } - ifnet_head_done(); + } - ia6 = in6_ifawithscope(out_ifp, dst); - if (ia6 == 0) { - *errorp = EADDRNOTAVAIL; - return(0); + /* + * If the destination address is a multicast address and the outgoing + * interface for the address is specified by the caller, use it. + */ + if (IN6_IS_ADDR_MULTICAST(dst) && mopts != NULL) { + IM6O_LOCK(mopts); + if ((ifp = mopts->im6o_multicast_ifp) != NULL) { + IM6O_UNLOCK(mopts); + goto done; /* we do not need a route for multicast. */ } - *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; - ifafree(&ia6->ia_ifa); - return src_storage; + IM6O_UNLOCK(mopts); + } + +getsrcif: + /* + * If the outgoing interface was not set via IPV6_BOUND_IF or + * IPV6_PKTINFO, use the scope ID in the destination address. + */ + if (ip6_doscopedroute && ifscope == IFSCOPE_NONE) + ifscope = dstsock->sin6_scope_id; + + /* + * Perform source interface selection; the source IPv6 address + * must belong to one of the addresses of the interface used + * by the route. For performance reasons, do this only if + * there is no route, or if the routing table has changed, + * or if we haven't done source interface selection on this + * route (for this PCB instance) before. + */ + if (!select_srcif || (ro != NULL && ro->ro_rt != NULL && + (ro->ro_rt->rt_flags & RTF_UP) && + ro->ro_rt->generation_id == route_generation && + (ro->ro_flags & ROF_SRCIF_SELECTED))) { + if (ro != NULL && ro->ro_rt != NULL) { + ifa = ro->ro_rt->rt_ifa; + IFA_ADDREF(ifa); + } + goto getroute; } /* - * If the destination address is a multicast address and - * the outgoing interface for the address is specified - * by the caller, use an address associated with the interface. - * There is a sanity check here; if the destination has node-local - * scope, the outgoing interfacde should be a loopback address. - * Even if the outgoing interface is not specified, we also - * choose a loopback interface as the outgoing interface. + * Given the source IPv6 address, find a suitable source interface + * to use for transmission; if a scope ID has been specified, + * optimize the search by looking at the addresses only for that + * interface. This is still suboptimal, however, as we need to + * traverse the per-interface list. */ - if (IN6_IS_ADDR_MULTICAST(dst)) { - struct ifnet *ifp = mopts ? mopts->im6o_multicast_ifp : NULL; + if (ifscope != IFSCOPE_NONE || (ro != NULL && ro->ro_rt != NULL)) { + unsigned int scope = ifscope; + struct ifnet *rt_ifp; + + rt_ifp = (ro->ro_rt != NULL) ? ro->ro_rt->rt_ifp : NULL; - if (ifp == NULL && IN6_IS_ADDR_MC_NODELOCAL(dst)) { - ifp = lo_ifp; + /* + * If no scope is specified and the route is stale (pointing + * to a defunct interface) use the current primary interface; + * this happens when switching between interfaces configured + * with the same IPv6 address. Otherwise pick up the scope + * information from the route; the ULP may have looked up a + * correct route and we just need to verify it here and mark + * it with the ROF_SRCIF_SELECTED flag below. 
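+ * (Hypothetical example of the stale case: an address that moved to + * the primary interface can leave behind a cached route still bound + * to the old, now-defunct interface; with no explicit scope, the + * lookup below is redirected to the primary interface's scope.)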
+ */ + if (scope == IFSCOPE_NONE) { + scope = rt_ifp->if_index; + if (scope != get_primary_ifscope(AF_INET6) && + ro->ro_rt->generation_id != route_generation) + scope = get_primary_ifscope(AF_INET6); } - if (ifp) { - ia6 = in6_ifawithscope(ifp, dst); - if (ia6 == 0) { - *errorp = EADDRNOTAVAIL; - return(0); + ifa = (struct ifaddr *) + ifa_foraddr6_scoped(&srcsock->sin6_addr, scope); + + if (ip6_select_srcif_debug && ifa != NULL) { + if (ro->ro_rt != NULL) { + printf("%s->%s ifscope %d->%d ifa_if %s " + "ro_if %s\n", s_src, s_dst, ifscope, + scope, if_name(ifa->ifa_ifp), + if_name(rt_ifp)); + } else { + printf("%s->%s ifscope %d->%d ifa_if %s\n", + s_src, s_dst, ifscope, scope, + if_name(ifa->ifa_ifp)); } - *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; - ifafree(&ia6->ia_ifa); - return src_storage; } } /* - * If the next hop address for the packet is specified - * by caller, use an address associated with the route - * to the next hop. + * Slow path; search for an interface having the corresponding source + * IPv6 address if the scope was not specified by the caller, and: + * + * 1) There currently isn't any route, or, + * 2) The interface used by the route does not own that source + * IPv6 address; in this case, the route will get blown away + * and we'll do a more specific scoped search using the newly + * found interface. */ - { - struct sockaddr_in6 *sin6_next; - struct rtentry *rt; - - if (opts && opts->ip6po_nexthop) { - sin6_next = satosin6(opts->ip6po_nexthop); - rt = nd6_lookup(&sin6_next->sin6_addr, 1, NULL, 0); - if (rt != NULL) { - RT_LOCK_ASSERT_HELD(rt); - ia6 = in6_ifawithscope(rt->rt_ifp, dst); - if (ia6 == 0) { - ia6 = ifatoia6(rt->rt_ifa); - if (ia6 != NULL) - ifaref(&ia6->ia_ifa); - } + if (ifa == NULL && ifscope == IFSCOPE_NONE) { + ifa = (struct ifaddr *)ifa_foraddr6(&srcsock->sin6_addr); + + if (ip6_select_srcif_debug && ifa != NULL) { + printf("%s->%s ifscope %d ifa_if %s\n", + s_src, s_dst, ifscope, if_name(ifa->ifa_ifp)); + } + + } + +getroute: + if (ifa != NULL) + ifscope = ifa->ifa_ifp->if_index; + + /* + * If the next hop address for the packet is specified by the caller, + * use it as the gateway. + */ + if (opts != NULL && opts->ip6po_nexthop != NULL) { + struct route_in6 *ron; + + sin6_next = satosin6(opts->ip6po_nexthop); + + /* at this moment, we only support AF_INET6 next hops */ + if (sin6_next->sin6_family != AF_INET6) { + error = EAFNOSUPPORT; /* or should we proceed? */ + goto done; + } + + /* + * If the next hop is an IPv6 address, then the node identified + * by that address must be a neighbor of the sending host. 
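+ * (Hence the checks below: the cached nexthop route must be both + * RTF_UP and RTF_LLINFO, i.e. it must resolve to a neighbor cache + * entry.)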
+ */ + ron = &opts->ip6po_nextroute; + if (ron->ro_rt != NULL) + RT_LOCK(ron->ro_rt); + if ((ron->ro_rt != NULL && + ((ron->ro_rt->rt_flags & (RTF_UP | RTF_LLINFO)) != + (RTF_UP | RTF_LLINFO) || + ron->ro_rt->generation_id != route_generation || + (select_srcif && (ifa == NULL || + ifa->ifa_ifp != ron->ro_rt->rt_ifp)))) || + !IN6_ARE_ADDR_EQUAL(&satosin6(&ron->ro_dst)->sin6_addr, + &sin6_next->sin6_addr)) { + if (ron->ro_rt != NULL) { + RT_UNLOCK(ron->ro_rt); + rtfree(ron->ro_rt); + ron->ro_rt = NULL; } - if (ia6 == 0) { - *errorp = EADDRNOTAVAIL; - if (rt != NULL) { - RT_REMREF_LOCKED(rt); - RT_UNLOCK(rt); + *satosin6(&ron->ro_dst) = *sin6_next; + } + if (ron->ro_rt == NULL) { + rtalloc_scoped((struct route *)ron, ifscope); + if (ron->ro_rt != NULL) + RT_LOCK(ron->ro_rt); + if (ron->ro_rt == NULL || + !(ron->ro_rt->rt_flags & RTF_LLINFO) || + !IN6_ARE_ADDR_EQUAL(&satosin6(rt_key(ron->ro_rt))-> + sin6_addr, &sin6_next->sin6_addr)) { + if (ron->ro_rt != NULL) { + RT_UNLOCK(ron->ro_rt); + rtfree(ron->ro_rt); + ron->ro_rt = NULL; } - return(0); + error = EHOSTUNREACH; + goto done; + } + } + route = ron; + ifp = ron->ro_rt->rt_ifp; + + /* + * When cloning is required, try to allocate a route to the + * destination so that the caller can store path MTU + * information. + */ + if (!clone) { + if (select_srcif) { + /* Keep the route locked */ + goto validateroute; } - *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; - ifafree(&ia6->ia_ifa); - RT_REMREF_LOCKED(rt); - RT_UNLOCK(rt); - return src_storage; + RT_UNLOCK(ron->ro_rt); + goto done; } + RT_UNLOCK(ron->ro_rt); } /* - * If route is known or can be allocated now, - * our src addr is taken from the i/f, else punt. + * Use a cached route if it exists and is valid, else try to allocate + * a new one. Note that we should check the address family of the + * cached destination, in case of sharing the cache with IPv4. 
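+ * (For instance, a PCB route cache last used for an IPv4-mapped + * destination could still hold a sockaddr of a different family; + * the sin6_family test below catches that case.)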
*/ - if (ro) { + if (ro == NULL) + goto done; + if (ro->ro_rt != NULL) + RT_LOCK(ro->ro_rt); + if (ro->ro_rt != NULL && (!(ro->ro_rt->rt_flags & RTF_UP) || + satosin6(&ro->ro_dst)->sin6_family != AF_INET6 || + ro->ro_rt->generation_id != route_generation || + !IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr, dst) || + (select_srcif && (ifa == NULL || + ifa->ifa_ifp != ro->ro_rt->rt_ifp)))) { + RT_UNLOCK(ro->ro_rt); + rtfree(ro->ro_rt); + ro->ro_rt = NULL; + } + if (ro->ro_rt == NULL) { + struct sockaddr_in6 *sa6; + if (ro->ro_rt != NULL) - RT_LOCK(ro->ro_rt); - if (ro->ro_rt != NULL && - (!(ro->ro_rt->rt_flags & RTF_UP) || - satosin6(&ro->ro_dst)->sin6_family != AF_INET6 || - ro->ro_rt->generation_id != route_generation || - !IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr, - dst))) { RT_UNLOCK(ro->ro_rt); - rtfree(ro->ro_rt); - ro->ro_rt = NULL; - } - if (ro->ro_rt == NULL || ro->ro_rt->rt_ifp == NULL) { - struct sockaddr_in6 *sa6; - - if (ro->ro_rt != NULL) - RT_UNLOCK(ro->ro_rt); - /* No route yet, so try to acquire one */ - bzero(&ro->ro_dst, sizeof(struct sockaddr_in6)); - sa6 = (struct sockaddr_in6 *)&ro->ro_dst; - sa6->sin6_family = AF_INET6; - sa6->sin6_len = sizeof(struct sockaddr_in6); - sa6->sin6_addr = *dst; -#if SCOPEDROUTING - sa6->sin6_scope_id = dstsock->sin6_scope_id; -#endif - if (IN6_IS_ADDR_MULTICAST(dst)) { - ro->ro_rt = rtalloc1( - &((struct route *)ro)->ro_dst, 0, 0); - } else { - rtalloc_ign((struct route *)ro, 0); - } - if (ro->ro_rt != NULL) - RT_LOCK(ro->ro_rt); + /* No route yet, so try to acquire one */ + bzero(&ro->ro_dst, sizeof(struct sockaddr_in6)); + sa6 = (struct sockaddr_in6 *)&ro->ro_dst; + sa6->sin6_family = AF_INET6; + sa6->sin6_len = sizeof(struct sockaddr_in6); + sa6->sin6_addr = *dst; + if (IN6_IS_ADDR_MULTICAST(dst)) { + ro->ro_rt = rtalloc1_scoped( + &((struct route *)ro)->ro_dst, 0, 0, ifscope); + } else { + rtalloc_scoped((struct route *)ro, ifscope); } + if (ro->ro_rt != NULL) + RT_LOCK(ro->ro_rt); + } + /* + * Do not care about the result if we have the nexthop + * explicitly specified (in case we're asked to clone.) + */ + if (opts != NULL && opts->ip6po_nexthop != NULL) { + if (ro->ro_rt != NULL) + RT_UNLOCK(ro->ro_rt); + goto done; + } + + if (ro->ro_rt != NULL) { + RT_LOCK_ASSERT_HELD(ro->ro_rt); + ifp = ro->ro_rt->rt_ifp; + } else { + error = EHOSTUNREACH; + } + route = ro; + +validateroute: + if (select_srcif) { + boolean_t has_route = (route != NULL && route->ro_rt != NULL); + + if (has_route) + RT_LOCK_ASSERT_HELD(route->ro_rt); /* - * in_pcbconnect() checks out IFF_LOOPBACK to skip using - * the address. But we don't know why it does so. - * It is necessary to ensure the scope even for lo0 - * so doesn't check out IFF_LOOPBACK. + * If there is a non-loopback route with the wrong interface, + * or if there is no interface configured with such an address, + * blow it away. Except for local/loopback, we look for one + * with a matching interface scope/index. 
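+ * (lo_ifp is exempt below so that packets destined to one of our own + * addresses, which are looped back internally, are not rejected + * here.)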
*/ - if (ro->ro_rt != NULL) { - RT_LOCK_ASSERT_HELD(ro->ro_rt); - ia6 = in6_ifawithscope(ro->ro_rt->rt_ifa->ifa_ifp, dst); - if (ia6 == 0) { - ia6 = ifatoia6(ro->ro_rt->rt_ifa); - if (ia6) - ifaref(&ia6->ia_ifa); + if (has_route && (ifa == NULL || + (ifa->ifa_ifp != ifp && ifp != lo_ifp) || + !(route->ro_rt->rt_flags & RTF_UP))) { + if (ip6_select_srcif_debug) { + if (ifa != NULL) { + printf("%s->%s ifscope %d ro_if %s " + "!= ifa_if %s (cached route " + "cleared)\n", s_src, s_dst, + ifscope, if_name(ifp), + if_name(ifa->ifa_ifp)); + } else { + printf("%s->%s ifscope %d ro_if %s " + "(no ifa_if found)\n", s_src, + s_dst, ifscope, if_name(ifp)); + } + } + RT_UNLOCK(route->ro_rt); + rtfree(route->ro_rt); + route->ro_rt = NULL; + route->ro_flags &= ~ROF_SRCIF_SELECTED; + error = EHOSTUNREACH; + /* Undo the settings done above */ + route = NULL; + ifp = NULL; + } else if (has_route) { + route->ro_flags |= ROF_SRCIF_SELECTED; + route->ro_rt->generation_id = route_generation; + RT_UNLOCK(route->ro_rt); + } + } else { + if (ro->ro_rt != NULL) RT_UNLOCK(ro->ro_rt); + if (ifp != NULL && opts != NULL && + opts->ip6po_pktinfo != NULL && + opts->ip6po_pktinfo->ipi6_ifindex != 0) { + /* + * Check if the outgoing interface conflicts with the + * interface specified by ipi6_ifindex (if specified). + * Note that the loopback interface is always okay. + * (this may happen when we are sending a packet to + * one of our own addresses.) + */ + if (!(ifp->if_flags & IFF_LOOPBACK) && ifp->if_index != + opts->ip6po_pktinfo->ipi6_ifindex) { + error = EHOSTUNREACH; + goto done; + } } -#if 0 + } + +done: + if (nocell && error == 0) { + if ((ifp != NULL && ifp->if_type == IFT_CELLULAR) || + (route != NULL && route->ro_rt != NULL && + route->ro_rt->rt_ifp->if_type == IFT_CELLULAR)) { + if (route != NULL && route->ro_rt != NULL) { + rtfree(route->ro_rt); + route->ro_rt = NULL; + route->ro_flags &= ~ROF_SRCIF_SELECTED; + route = NULL; + } + ifp = NULL; + error = EHOSTUNREACH; + } + } + + if (ifp == NULL && (route == NULL || route->ro_rt == NULL)) { /* - * xxx The followings are necessary? (kazu) - * I don't think so. - * It's for SO_DONTROUTE option in IPv4.(jinmei) + * This can happen if the caller did not pass a cached route + * or any other hints. We treat this case as an error. */ - if (ia6 == 0) { - struct sockaddr_in6 sin6 = {sizeof(sin6), AF_INET6, 0}; - - sin6->sin6_addr = *dst; + error = EHOSTUNREACH; + } + if (error == EHOSTUNREACH) + ip6stat.ip6s_noroute++; - ia6 = ifatoia6(ifa_ifwithdstaddr(sin6tosa(&sin6))); - if (ia6 == 0) - ia6 = ifatoia6(ifa_ifwithnet(sin6tosa(&sin6))); - if (ia6 == 0) - return(0); - *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; - ifafree(&ia6->ia_ifa); - return src_storage; + if (error == 0) { + if (retifp != NULL) { + if (ifp != NULL) + ifnet_reference(ifp); /* for caller */ + *retifp = ifp; } -#endif /* 0 */ - if (ia6 == 0) { - *errorp = EHOSTUNREACH; /* no route */ - return(0); - } - *src_storage = satosin6(&ia6->ia_addr)->sin6_addr; - ifafree(&ia6->ia_ifa); - return src_storage; + if (retrt != NULL && route != NULL) + *retrt = route->ro_rt; /* ro_rt may be NULL */ + } else if (select_srcif && ip6_select_srcif_debug) { + printf("%s->%s ifscope %d ifa_if %s ro_if %s (error=%d)\n", + s_src, s_dst, ifscope, + (ifa != NULL) ? if_name(ifa->ifa_ifp) : "NONE", + (ifp != NULL) ?
if_name(ifp) : "NONE", error); } - *errorp = EADDRNOTAVAIL; - return(0); + if (ifa != NULL) + IFA_REMREF(ifa); + + return (error); +} + +static int +in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts, + struct ip6_moptions *mopts, struct route_in6 *ro, unsigned int ifscope, + unsigned int nocell, struct ifnet **retifp) +{ + int error; + struct route_in6 sro; + struct rtentry *rt = NULL; + + if (ro == NULL) { + bzero(&sro, sizeof(sro)); + ro = &sro; + } + + if ((error = selectroute(NULL, dstsock, opts, mopts, ro, retifp, + &rt, 0, 1, ifscope, nocell)) != 0) { + if (ro == &sro && rt && rt == sro.ro_rt) + rtfree(rt); + return (error); + } + + /* + * do not use a rejected or black hole route. + * XXX: this check should be done in the L2 output routine. + * However, if we skipped this check here, we'd see the following + * scenario: + * - install a rejected route for a scoped address prefix + * (like fe80::/10) + * - send a packet to a destination that matches the scoped prefix, + * with ambiguity about the scope zone. + * - pick the outgoing interface from the route, and disambiguate the + * scope zone with the interface. + * - ip6_output() would try to get another route with the "new" + * destination, which may be valid. + * - we'd see no error on output. + * Although this may not be very harmful, it should still be confusing. + * We thus reject the case here. + */ + if (rt && (rt->rt_flags & (RTF_REJECT | RTF_BLACKHOLE))) { + int flags = (rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); + + if (ro == &sro && rt && rt == sro.ro_rt) + rtfree(rt); + return (flags); + } + + /* + * Adjust the "outgoing" interface. If we're going to loop the packet + * back to ourselves, the ifp would be the loopback interface. + * However, we'd rather know the interface associated to the + * destination address (which should probably be one of our own + * addresses.) + */ + if (rt && rt->rt_ifa && rt->rt_ifa->ifa_ifp) { + if (*retifp != NULL) + ifnet_release(*retifp); + *retifp = rt->rt_ifa->ifa_ifp; + ifnet_reference(*retifp); + } + + if (ro == &sro && rt && rt == sro.ro_rt) + rtfree(rt); + return (0); +} + +/* + * clone - meaningful only for bsdi and freebsd + */ +int +in6_selectroute(struct sockaddr_in6 *srcsock, struct sockaddr_in6 *dstsock, + struct ip6_pktopts *opts, struct ip6_moptions *mopts, struct route_in6 *ro, + struct ifnet **retifp, struct rtentry **retrt, int clone, + unsigned int ifscope, unsigned int nocell) +{ + + return (selectroute(srcsock, dstsock, opts, mopts, ro, retifp, + retrt, clone, 0, ifscope, nocell)); } /* @@ -429,6 +1110,7 @@ in6_pcbsetport( u_int16_t lport = 0, first, last, *lastport; int count, error = 0, wild = 0; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; + kauth_cred_t cred; if (!locked) { /* Make sure we don't run into a deadlock: 4052373 */ if (!lck_rw_try_lock_exclusive(pcbinfo->mtx)) { socket_unlock(inp->inp_socket, 0); @@ -448,7 +1130,10 @@ in6_pcbsetport( last = ipport_hilastauto; lastport = &pcbinfo->lasthi; } else if (inp->inp_flags & INP_LOWPORT) { - if ((error = proc_suser(p)) != 0) { + cred = kauth_cred_proc_ref(p); + error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0); + kauth_cred_unref(&cred); + if (error != 0) { if (!locked) lck_rw_done(pcbinfo->mtx); return error; @@ -481,6 +1166,7 @@ in6_pcbsetport( * occurred above. */ inp->in6p_laddr = in6addr_any; + inp->in6p_last_outif = 0; if (!locked) lck_rw_done(pcbinfo->mtx); return (EAGAIN); @@ -504,6 +1190,7 @@ in6_pcbsetport( * occurred above. 
*/ inp->in6p_laddr = in6addr_any; + inp->in6p_last_outif = 0; if (!locked) lck_rw_done(pcbinfo->mtx); return (EAGAIN); @@ -520,6 +1207,7 @@ in6_pcbsetport( if (in_pcbinshash(inp, 1) != 0) { inp->in6p_laddr = in6addr_any; inp->inp_lport = 0; + inp->in6p_last_outif = 0; if (!locked) lck_rw_done(pcbinfo->mtx); return (EAGAIN); @@ -530,6 +1218,350 @@ in6_pcbsetport( return(0); } +/* + * The following is an implementation of the policy table using a + * simple tail queue. + * XXX such details should be hidden. + * XXX an implementation using a binary tree should be more efficient. + */ +struct addrsel_policyent { + TAILQ_ENTRY(addrsel_policyent) ape_entry; + struct in6_addrpolicy ape_policy; +}; + +TAILQ_HEAD(addrsel_policyhead, addrsel_policyent); + +struct addrsel_policyhead addrsel_policytab; + +static void +init_policy_queue(void) +{ + + TAILQ_INIT(&addrsel_policytab); +} + +void +addrsel_policy_init(void) +{ + /* + * Default address selection policy based on RFC 3484 and + * draft-arifumi-6man-rfc3484-revise-03. + */ + static const struct in6_addrpolicy defaddrsel[] = { + /* localhost */ + { .addr = { .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_LOOPBACK_INIT, + .sin6_len = sizeof(struct sockaddr_in6) }, + .addrmask = { .sin6_family = AF_INET6, + .sin6_addr = IN6MASK128, + .sin6_len = sizeof(struct sockaddr_in6) }, + .preced = 60, + .label = 0 }, + /* ULA */ + { .addr = { .sin6_family = AF_INET6, + .sin6_addr = {{{ 0xfc }}}, + .sin6_len = sizeof(struct sockaddr_in6) }, + .addrmask = { .sin6_family = AF_INET6, + .sin6_addr = IN6MASK7, + .sin6_len = sizeof(struct sockaddr_in6) }, + .preced = 50, + .label = 1 }, + /* any IPv6 src */ + { .addr = { .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ANY_INIT, + .sin6_len = sizeof(struct sockaddr_in6) }, + .addrmask = { .sin6_family = AF_INET6, + .sin6_addr = IN6MASK0, + .sin6_len = sizeof(struct sockaddr_in6) }, + .preced = 40, + .label = 2 }, + /* any IPv4 src */ + { .addr = { .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_V4MAPPED_INIT, + .sin6_len = sizeof(struct sockaddr_in6) }, + .addrmask = { .sin6_family = AF_INET6, + .sin6_addr = IN6MASK96, + .sin6_len = sizeof(struct sockaddr_in6) }, + .preced = 30, + .label = 3 }, + /* 6to4 */ + { .addr = { .sin6_family = AF_INET6, + .sin6_addr = {{{ 0x20, 0x02 }}}, + .sin6_len = sizeof(struct sockaddr_in6) }, + .addrmask = { .sin6_family = AF_INET6, + .sin6_addr = IN6MASK16, + .sin6_len = sizeof(struct sockaddr_in6) }, + .preced = 20, + .label = 4 }, + /* Teredo */ + { .addr = { .sin6_family = AF_INET6, + .sin6_addr = {{{ 0x20, 0x01 }}}, + .sin6_len = sizeof(struct sockaddr_in6) }, + .addrmask = { .sin6_family = AF_INET6, + .sin6_addr = IN6MASK32, + .sin6_len = sizeof(struct sockaddr_in6) }, + .preced = 10, + .label = 5 }, + /* v4 compat addresses */ + { .addr = { .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ANY_INIT, + .sin6_len = sizeof(struct sockaddr_in6) }, + .addrmask = { .sin6_family = AF_INET6, + .sin6_addr = IN6MASK96, + .sin6_len = sizeof(struct sockaddr_in6) }, + .preced = 1, + .label = 10 }, + /* site-local (deprecated) */ + { .addr = { .sin6_family = AF_INET6, + .sin6_addr = {{{ 0xfe, 0xc0 }}}, + .sin6_len = sizeof(struct sockaddr_in6) }, + .addrmask = { .sin6_family = AF_INET6, + .sin6_addr = IN6MASK16, + .sin6_len = sizeof(struct sockaddr_in6) }, + .preced = 1, + .label = 11 }, + /* 6bone (deprecated) */ + { .addr = { .sin6_family = AF_INET6, + .sin6_addr = {{{ 0x3f, 0xfe }}}, + .sin6_len = sizeof(struct sockaddr_in6) }, + .addrmask = { .sin6_family = AF_INET6, + .sin6_addr = 
IN6MASK16, + .sin6_len = sizeof(struct sockaddr_in6) }, + .preced = 1, + .label = 12 }, + }; + int i; + + init_policy_queue(); + + /* initialize the "last resort" policy */ + bzero(&defaultaddrpolicy, sizeof(defaultaddrpolicy)); + defaultaddrpolicy.label = ADDR_LABEL_NOTAPP; + + for (i = 0; i < sizeof(defaddrsel) / sizeof(defaddrsel[0]); i++) + add_addrsel_policyent(&defaddrsel[i]); + +} + +struct in6_addrpolicy * +in6_addrsel_lookup_policy(struct sockaddr_in6 *key) +{ + struct in6_addrpolicy *match = NULL; + + ADDRSEL_LOCK(); + match = match_addrsel_policy(key); + + if (match == NULL) + match = &defaultaddrpolicy; + else + match->use++; + ADDRSEL_UNLOCK(); + + return (match); +} + +static struct in6_addrpolicy * +match_addrsel_policy(struct sockaddr_in6 *key) +{ + struct addrsel_policyent *pent; + struct in6_addrpolicy *bestpol = NULL, *pol; + int matchlen, bestmatchlen = -1; + u_char *mp, *ep, *k, *p, m; + + TAILQ_FOREACH(pent, &addrsel_policytab, ape_entry) { + matchlen = 0; + + pol = &pent->ape_policy; + mp = (u_char *)&pol->addrmask.sin6_addr; + ep = mp + 16; /* XXX: scope field? */ + k = (u_char *)&key->sin6_addr; + p = (u_char *)&pol->addr.sin6_addr; + for (; mp < ep && *mp; mp++, k++, p++) { + m = *mp; + if ((*k & m) != *p) + goto next; /* not match */ + if (m == 0xff) /* short cut for a typical case */ + matchlen += 8; + else { + while (m >= 0x80) { + matchlen++; + m <<= 1; + } + } + } + + /* matched. check if this is better than the current best. */ + if (bestpol == NULL || + matchlen > bestmatchlen) { + bestpol = pol; + bestmatchlen = matchlen; + } + + next: + continue; + } + + return (bestpol); +} + +static int +add_addrsel_policyent(const struct in6_addrpolicy *newpolicy) +{ + struct addrsel_policyent *new, *pol; + + MALLOC(new, struct addrsel_policyent *, sizeof(*new), M_IFADDR, + M_WAITOK); + + ADDRSEL_LOCK(); + + /* duplication check */ + TAILQ_FOREACH(pol, &addrsel_policytab, ape_entry) { + if (IN6_ARE_ADDR_EQUAL(&newpolicy->addr.sin6_addr, + &pol->ape_policy.addr.sin6_addr) && + IN6_ARE_ADDR_EQUAL(&newpolicy->addrmask.sin6_addr, + &pol->ape_policy.addrmask.sin6_addr)) { + ADDRSEL_UNLOCK(); + FREE(new, M_IFADDR); + return (EEXIST); /* or override it? */ + } + } + + bzero(new, sizeof(*new)); + + /* XXX: should validate entry */ + new->ape_policy = *newpolicy; + + TAILQ_INSERT_TAIL(&addrsel_policytab, new, ape_entry); + ADDRSEL_UNLOCK(); + + return (0); +} +#ifdef ENABLE_ADDRSEL +static int +delete_addrsel_policyent(const struct in6_addrpolicy *key) +{ + struct addrsel_policyent *pol; + + + ADDRSEL_LOCK(); + + /* search for the entry in the table */ + TAILQ_FOREACH(pol, &addrsel_policytab, ape_entry) { + if (IN6_ARE_ADDR_EQUAL(&key->addr.sin6_addr, + &pol->ape_policy.addr.sin6_addr) && + IN6_ARE_ADDR_EQUAL(&key->addrmask.sin6_addr, + &pol->ape_policy.addrmask.sin6_addr)) { + break; + } + } + if (pol == NULL) { + ADDRSEL_UNLOCK(); + return (ESRCH); + } + + TAILQ_REMOVE(&addrsel_policytab, pol, ape_entry); + FREE(pol, M_IFADDR); + pol = NULL; + ADDRSEL_UNLOCK(); + + return (0); +} +#endif /* ENABLE_ADDRSEL */ + +int +walk_addrsel_policy(int (*callback)(const struct in6_addrpolicy *, void *), + void *w) +{ + struct addrsel_policyent *pol; + int error = 0; + + ADDRSEL_LOCK(); + TAILQ_FOREACH(pol, &addrsel_policytab, ape_entry) { + if ((error = (*callback)(&pol->ape_policy, w)) != 0) { + ADDRSEL_UNLOCK(); + return (error); + } + } + ADDRSEL_UNLOCK(); + return (error); +} +/* + * Subroutines to manage the address selection policy table via sysctl. 
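+ * + * A minimal userland sketch (assumed usage, not part of this change; + * the node name follows from the SYSCTL_NODE registration below and + * the buffer size is arbitrary): + * + *	struct in6_addrpolicy pol[64]; + *	size_t len = sizeof (pol); + * + *	if (sysctlbyname("net.inet6.ip6.addrctlpolicy", pol, &len, + *	    NULL, 0) == 0) + *		printf("%lu entries\n", + *		    (unsigned long)(len / sizeof (pol[0]))); + * + * The handler below is read-only; additions and deletions go through + * in6_src_ioctl() via SIOCAADDRCTL_POLICY/SIOCDADDRCTL_POLICY.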
+ */ +struct walkarg { + struct sysctl_req *w_req; +}; + + +static int +dump_addrsel_policyent(const struct in6_addrpolicy *pol, void *arg) +{ + int error = 0; + struct walkarg *w = arg; + + error = SYSCTL_OUT(w->w_req, pol, sizeof(*pol)); + + return (error); +} + +static int +in6_src_sysctl SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp, arg1, arg2) + struct walkarg w; + + if (req->newptr) + return EPERM; + bzero(&w, sizeof(w)); + w.w_req = req; + + return (walk_addrsel_policy(dump_addrsel_policyent, &w)); +} + + +SYSCTL_NODE(_net_inet6_ip6, IPV6CTL_ADDRCTLPOLICY, addrctlpolicy, + CTLFLAG_RD | CTLFLAG_LOCKED, in6_src_sysctl, ""); +int +in6_src_ioctl(u_long cmd, caddr_t data) +{ + int i; + struct in6_addrpolicy ent0; + + if (cmd != SIOCAADDRCTL_POLICY && cmd != SIOCDADDRCTL_POLICY) + return (EOPNOTSUPP); /* check for safety */ + + ent0 = *(struct in6_addrpolicy *)data; + + if (ent0.label == ADDR_LABEL_NOTAPP) + return (EINVAL); + /* check if the prefix mask is consecutive. */ + if (in6_mask2len(&ent0.addrmask.sin6_addr, NULL) < 0) + return (EINVAL); + /* clear trailing garbage (if any) from the prefix address. */ + for (i = 0; i < 4; i++) { + ent0.addr.sin6_addr.s6_addr32[i] &= + ent0.addrmask.sin6_addr.s6_addr32[i]; + } + ent0.use = 0; + + switch (cmd) { + case SIOCAADDRCTL_POLICY: +#ifdef ENABLE_ADDRSEL + return (add_addrsel_policyent(&ent0)); +#else + return (ENOTSUP); +#endif + case SIOCDADDRCTL_POLICY: +#ifdef ENABLE_ADDRSEL + return (delete_addrsel_policyent(&ent0)); +#else + return (ENOTSUP); +#endif + } + + return (0); /* XXX: compromise compilers */ +} + /* * generate kernel-internal form (scopeid embedded into s6_addr16[1]). * If the address scope of is link-local, embed the interface index in the @@ -549,21 +1581,17 @@ int in6_embedscope( struct in6_addr *in6, const struct sockaddr_in6 *sin6, -#ifdef HAVE_NRL_INPCB - struct inpcb *in6p, -#define in6p_outputopts inp_outputopts6 -#define in6p_moptions inp_moptions6 -#else struct in6pcb *in6p, -#endif - struct ifnet **ifpp) + struct ifnet **ifpp, + struct ip6_pktopts *opt) { struct ifnet *ifp = NULL; u_int32_t scopeid; + struct ip6_pktopts *optp = NULL; *in6 = sin6->sin6_addr; scopeid = sin6->sin6_scope_id; - if (ifpp) + if (ifpp != NULL) *ifpp = NULL; /* @@ -578,21 +1606,31 @@ in6_embedscope( if (IN6_IS_SCOPE_LINKLOCAL(in6)) { struct in6_pktinfo *pi; + struct ifnet *im6o_multicast_ifp = NULL; + + if (in6p != NULL && IN6_IS_ADDR_MULTICAST(in6) && + in6p->in6p_moptions != NULL) { + IM6O_LOCK(in6p->in6p_moptions); + im6o_multicast_ifp = + in6p->in6p_moptions->im6o_multicast_ifp; + IM6O_UNLOCK(in6p->in6p_moptions); + } + if (opt) + optp = opt; + else if (in6p) + optp = in6p->in6p_outputopts; /* * KAME assumption: link id == interface id */ - ifnet_head_lock_shared(); - if (in6p && in6p->in6p_outputopts && - (pi = in6p->in6p_outputopts->ip6po_pktinfo) && + if (in6p && optp && (pi = optp->ip6po_pktinfo) && pi->ipi6_ifindex) { ifp = ifindex2ifnet[pi->ipi6_ifindex]; in6->s6_addr16[1] = htons(pi->ipi6_ifindex); } else if (in6p && IN6_IS_ADDR_MULTICAST(in6) && - in6p->in6p_moptions && - in6p->in6p_moptions->im6o_multicast_ifp) { - ifp = in6p->in6p_moptions->im6o_multicast_ifp; + in6p->in6p_moptions != NULL && im6o_multicast_ifp != NULL) { + ifp = im6o_multicast_ifp; in6->s6_addr16[1] = htons(ifp->if_index); } else if (scopeid) { /* @@ -610,16 +1648,15 @@ in6_embedscope( } ifnet_head_done(); - if (ifpp) + if (ifpp != NULL) { + if (ifp != NULL) + ifnet_reference(ifp); /* for caller */ *ifpp = ifp; + } } return 0; } -#if HAVE_NRL_INPCB -#undef
in6p_outputopts -#undef in6p_moptions -#endif /* * generate standard sockaddr_in6 from embedded form. @@ -667,15 +1704,3 @@ in6_recoverscope( return 0; } - -/* - * just clear the embedded scope identifer. - * XXX: currently used for bsdi4 only as a supplement function. - */ -void -in6_clearscope(addr) - struct in6_addr *addr; -{ - if (IN6_IS_SCOPE_LINKLOCAL(addr)) - addr->s6_addr16[1] = 0; -} diff --git a/bsd/netinet6/in6_var.h b/bsd/netinet6/in6_var.h index e423627db..67f8fa429 100644 --- a/bsd/netinet6/in6_var.h +++ b/bsd/netinet6/in6_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -97,6 +97,11 @@ #define _NETINET6_IN6_VAR_H_ #include <sys/appleapiopts.h> +#ifdef XNU_KERNEL_PRIVATE +#include <sys/tree.h> +#include <sys/mcache.h> +#endif + #ifdef __APPLE__ #include <sys/kern_event.h> #endif @@ -116,7 +121,7 @@ struct in6_addrlifetime { u_int32_t ia6t_pltime; /* prefix lifetime */ }; -#if defined(KERNEL_PRIVATE) +#ifdef XNU_KERNEL_PRIVATE struct in6_addrlifetime_32 { u_int32_t ia6t_expire; u_int32_t ia6t_preferred; @@ -125,9 +130,9 @@ struct in6_addrlifetime_32 { }; struct in6_addrlifetime_64 { - time_t ia6t_expire; - time_t ia6t_preferred __attribute__((aligned(8))); - u_int32_t ia6t_vltime __attribute__((aligned(8))); + u_int64_t ia6t_expire; + u_int64_t ia6t_preferred; + u_int32_t ia6t_vltime; u_int32_t ia6t_pltime; }; @@ -150,13 +155,29 @@ struct in6_ifaddr { int ia6_flags; struct in6_addrlifetime ia6_lifetime; + time_t ia6_createtime; /* the creation time of this address, which is + * currently used for temporary addresses only. + */ + time_t ia6_updatetime; + struct ifprefix *ia6_ifpr; /* back pointer to ifprefix */ - struct nd_prefix *ia6_ndpr; /* back pointer to the ND prefix - * (for autoconfigured addresses only) - */ + /* back pointer to the ND prefix (for autoconfigured addresses only) */ + struct nd_prefix *ia6_ndpr; + + /* multicast addresses joined from the kernel */ + LIST_HEAD(, in6_multi_mship) ia6_memberships; +}; +#endif /* XNU_KERNEL_PRIVATE */ + +/* control structure to manage address selection policy */ +struct in6_addrpolicy { + struct sockaddr_in6 addr; /* prefix address */ + struct sockaddr_in6 addrmask; /* prefix mask */ + int preced; /* precedence */ + int label; /* matching label */ + u_quad_t use; /* statistics */ }; -#endif /* KERNEL_PRIVATE */ /* * IPv6 interface statistics, as defined in RFC2465 Ipv6IfStatsEntry (p12). 
@@ -282,7 +303,7 @@ struct in6_ifreq { union { struct sockaddr_in6 ifru_addr; struct sockaddr_in6 ifru_dstaddr; - short ifru_flags; + int ifru_flags; int ifru_flags6; int ifru_metric; caddr_t ifru_data; @@ -302,7 +323,7 @@ struct in6_aliasreq { struct in6_addrlifetime ifra_lifetime; }; -#if defined(KERNEL_PRIVATE) +#ifdef XNU_KERNEL_PRIVATE struct in6_aliasreq_32 { char ifra_name[IFNAMSIZ]; struct sockaddr_in6 ifra_addr; @@ -320,7 +341,7 @@ struct in6_aliasreq_64 { int ifra_flags; struct in6_addrlifetime_64 ifra_lifetime; }; -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ /* prefix type macro */ #define IN6_PREFIX_ND 1 @@ -404,7 +425,7 @@ struct in6_rrenumreq { #define irr_rrf_decrvalid irr_flags.prf_rr.decrvalid #define irr_rrf_decrprefd irr_flags.prf_rr.decrprefd -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE /* * Given a pointer to an in6_ifaddr (ifaddr), * return a pointer to the addr as a sockaddr_in6 @@ -418,7 +439,7 @@ struct in6_rrenumreq { #define IFA_DSTIN6(x) (&((struct sockaddr_in6 *)((x)->ifa_dstaddr))->sin6_addr) #define IFPR_IN6(x) (&((struct sockaddr_in6 *)((x)->ifpr_prefix))->sin6_addr) -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ /* * Event data, internet6 style. @@ -455,10 +476,10 @@ struct kev_in6_data { #define KEV_INET6_NEW_RTADV_ADDR 5 /* Autoconf router advertised address has appeared */ #define KEV_INET6_DEFROUTER 6 /* Default router dectected by kernel */ -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE /* Utility function used inside netinet6 kernel code for generating events */ void in6_post_msg(struct ifnet *, u_int32_t, struct in6_ifaddr *); -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #define IN6_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \ (((d)->s6_addr32[0] ^ (a)->s6_addr32[0]) & (m)->s6_addr32[0]) == 0 && \ @@ -481,37 +502,37 @@ void in6_post_msg(struct ifnet *, u_int32_t, struct in6_ifaddr *); #define SIOCDIFADDR_IN6 _IOW('i', 25, struct in6_ifreq) #define SIOCAIFADDR_IN6 _IOW('i', 26, struct in6_aliasreq) -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #define SIOCAIFADDR_IN6_32 _IOW('i', 26, struct in6_aliasreq_32) #define SIOCAIFADDR_IN6_64 _IOW('i', 26, struct in6_aliasreq_64) -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #define SIOCSIFPHYADDR_IN6 _IOW('i', 62, struct in6_aliasreq) -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #define SIOCSIFPHYADDR_IN6_32 _IOW('i', 62, struct in6_aliasreq_32) #define SIOCSIFPHYADDR_IN6_64 _IOW('i', 62, struct in6_aliasreq_64) -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #define SIOCGIFPSRCADDR_IN6 _IOWR('i', 63, struct in6_ifreq) #define SIOCGIFPDSTADDR_IN6 _IOWR('i', 64, struct in6_ifreq) #define SIOCGIFAFLAG_IN6 _IOWR('i', 73, struct in6_ifreq) #define SIOCGDRLST_IN6 _IOWR('i', 74, struct in6_drlist) -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #define SIOCGDRLST_IN6_32 _IOWR('i', 74, struct in6_drlist_32) #define SIOCGDRLST_IN6_64 _IOWR('i', 74, struct in6_drlist_64) -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #define SIOCGPRLST_IN6 _IOWR('i', 75, struct in6_prlist) -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #define SIOCGPRLST_IN6_32 _IOWR('i', 75, struct in6_prlist_32) #define SIOCGPRLST_IN6_64 _IOWR('i', 75, struct in6_prlist_64) -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #define OSIOCGIFINFO_IN6 _IOWR('i', 108, struct in6_ondireq) #define SIOCGIFINFO_IN6 _IOWR('i', 76, struct in6_ondireq) #define SIOCSNDFLUSH_IN6 _IOWR('i', 77, struct in6_ifreq) #define SIOCGNBRINFO_IN6 _IOWR('i', 
78, struct in6_nbrinfo) -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #define SIOCGNBRINFO_IN6_32 _IOWR('i', 78, struct in6_nbrinfo_32) #define SIOCGNBRINFO_IN6_64 _IOWR('i', 78, struct in6_nbrinfo_64) -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #define SIOCSPFXFLUSH_IN6 _IOWR('i', 79, struct in6_ifreq) #define SIOCSRTRFLUSH_IN6 _IOWR('i', 80, struct in6_ifreq) @@ -522,12 +543,12 @@ void in6_post_msg(struct ifnet *, u_int32_t, struct in6_ifaddr *); #define SIOCSDEFIFACE_IN6 _IOWR('i', 85, struct in6_ndifreq) #define SIOCGDEFIFACE_IN6 _IOWR('i', 86, struct in6_ndifreq) -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #define SIOCSDEFIFACE_IN6_32 _IOWR('i', 85, struct in6_ndifreq_32) #define SIOCSDEFIFACE_IN6_64 _IOWR('i', 85, struct in6_ndifreq_64) #define SIOCGDEFIFACE_IN6_32 _IOWR('i', 86, struct in6_ndifreq_32) #define SIOCGDEFIFACE_IN6_64 _IOWR('i', 86, struct in6_ndifreq_64) -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #define SIOCSIFINFO_FLAGS _IOWR('i', 87, struct in6_ndireq) /* XXX */ @@ -548,30 +569,44 @@ void in6_post_msg(struct ifnet *, u_int32_t, struct in6_ifaddr *); struct sioc_sg_req6) /* get s,g pkt cnt */ #define SIOCGETMIFCNT_IN6 _IOWR('u', 107, \ struct sioc_mif_req6) /* get pkt cnt per if */ -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #define SIOCGETMIFCNT_IN6_32 _IOWR('u', 107, struct sioc_mif_req6_32) #define SIOCGETMIFCNT_IN6_64 _IOWR('u', 107, struct sioc_mif_req6_64) -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ + +#define SIOCAADDRCTL_POLICY _IOW('u', 108, struct in6_addrpolicy) +#define SIOCDADDRCTL_POLICY _IOW('u', 109, struct in6_addrpolicy) #ifdef PRIVATE /* * temporary control calls to attach/detach IP to/from an ethernet interface */ #define SIOCPROTOATTACH_IN6 _IOWR('i', 110, struct in6_aliasreq) /* attach proto to interface */ -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #define SIOCPROTOATTACH_IN6_32 _IOWR('i', 110, struct in6_aliasreq_32) #define SIOCPROTOATTACH_IN6_64 _IOWR('i', 110, struct in6_aliasreq_64) -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #define SIOCPROTODETACH_IN6 _IOWR('i', 111, struct in6_ifreq) /* detach proto from interface */ #define SIOCLL_START _IOWR('i', 130, struct in6_aliasreq) /* start aquiring linklocal on interface */ -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #define SIOCLL_START_32 _IOWR('i', 130, struct in6_aliasreq_32) #define SIOCLL_START_64 _IOWR('i', 130, struct in6_aliasreq_64) -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #define SIOCLL_STOP _IOWR('i', 131, struct in6_ifreq) /* deconfigure linklocal from interface */ #define SIOCAUTOCONF_START _IOWR('i', 132, struct in6_ifreq) /* accept rtadvd on this interface */ #define SIOCAUTOCONF_STOP _IOWR('i', 133, struct in6_ifreq) /* stop accepting rtadv for this interface */ + +#define SIOCDRADD_IN6 _IOWR('u', 134, struct in6_defrouter) +#ifdef XNU_KERNEL_PRIVATE +#define SIOCDRADD_IN6_32 _IOWR('u', 134, struct in6_defrouter_32) +#define SIOCDRADD_IN6_64 _IOWR('u', 134, struct in6_defrouter_64) +#endif /* XNU_KERNEL_PRIVATE */ +#define SIOCDRDEL_IN6 _IOWR('u', 135, struct in6_defrouter) +#ifdef XNU_KERNEL_PRIVATE +#define SIOCDRDEL_IN6_32 _IOWR('u', 135, struct in6_defrouter_32) +#define SIOCDRDEL_IN6_64 _IOWR('u', 135, struct in6_defrouter_64) +#endif /* XNU_KERNEL_PRIVATE */ #endif /* PRIVATE */ #define IN6_IFF_ANYCAST 0x01 /* anycast address */ @@ -596,7 +631,7 @@ void in6_post_msg(struct ifnet *, u_int32_t, struct in6_ifaddr *); #define IN6_ARE_SCOPE_EQUAL(a,b) 
((a)==(b)) #endif /* KERNEL */ -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE extern struct in6_ifaddr *in6_ifaddrs; extern struct in6_ifstat **in6_ifstat; @@ -604,23 +639,25 @@ extern size_t in6_ifstatmax; extern struct icmp6stat icmp6stat; extern struct icmp6_ifstat **icmp6_ifstat; extern size_t icmp6_ifstatmax; +extern lck_rw_t in6_ifs_rwlock; #define in6_ifstat_inc(ifp, tag) \ -do { \ - int _z_index = ifp ? ifp->if_index : 0; \ - if ((_z_index) && _z_index <= if_index \ - && _z_index < (signed)in6_ifstatmax \ - && in6_ifstat && in6_ifstat[_z_index]) { \ - in6_ifstat[_z_index]->tag++; \ - } \ +do { \ + lck_rw_lock_shared(&in6_ifs_rwlock); \ + int _z_index = ifp ? ifp->if_index : 0; \ + if ((_z_index) && _z_index <= if_index \ + && _z_index < (signed)in6_ifstatmax \ + && in6_ifstat && in6_ifstat[_z_index]) { \ + atomic_add_64(&in6_ifstat[_z_index]->tag, 1); \ + } \ + lck_rw_done(&in6_ifs_rwlock); \ } while (0) +__private_extern__ lck_rw_t in6_ifaddr_rwlock; + extern struct ifqueue ip6intrq; /* IP6 packet input queue */ extern struct in6_addr zeroin6_addr; extern u_char inet6ctlerrmap[]; extern u_int32_t in6_maxmtu; -#ifdef MALLOC_DECLARE -MALLOC_DECLARE(M_IPMADDR); -#endif /* MALLOC_DECLARE */ /* * Macro for finding the internet address structure (in6_ifaddr) corresponding @@ -631,35 +668,156 @@ MALLOC_DECLARE(M_IPMADDR); /* struct ifnet *ifp; */ \ /* struct in6_ifaddr *ia; */ \ do { \ - struct ifaddr *ifa; \ - for (ifa = (ifp)->if_addrlist.tqh_first; ifa; ifa = ifa->ifa_list.tqe_next) { \ - if (!ifa->ifa_addr) \ - continue; \ - if (ifa->ifa_addr->sa_family == AF_INET6) \ + struct ifaddr *_ifa; \ + ifnet_lock_assert(ifp, LCK_RW_ASSERT_HELD); \ + for (_ifa = (ifp)->if_addrlist.tqh_first; _ifa != NULL; \ + _ifa = _ifa->ifa_list.tqe_next) { \ + IFA_LOCK(_ifa); \ + if (_ifa->ifa_addr->sa_family == AF_INET6) { \ + IFA_ADDREF_LOCKED(_ifa); \ + IFA_UNLOCK(_ifa); \ break; \ + } \ + IFA_UNLOCK(_ifa); \ } \ - (ia) = (struct in6_ifaddr *)ifa; \ + (ia) = (struct in6_ifaddr *)_ifa; \ } while (0) /* - * Multi-cast membership entry. One for each group/ifp that a PCB - * belongs to. + * IPv6 multicast MLD-layer source entry. + */ +struct ip6_msource { + RB_ENTRY(ip6_msource) im6s_link; /* RB tree links */ + struct in6_addr im6s_addr; + struct im6s_st { + uint16_t ex; /* # of exclusive members */ + uint16_t in; /* # of inclusive members */ + } im6s_st[2]; /* state at t0, t1 */ + uint8_t im6s_stp; /* pending query */ +}; + +RB_HEAD(ip6_msource_tree, ip6_msource); + +RB_PROTOTYPE_SC_PREV(__private_extern__, ip6_msource_tree, ip6_msource, + im6s_link, ip6_msource_cmp); + +/* + * IPv6 multicast PCB-layer source entry. + * + * NOTE: overlapping use of struct ip6_msource fields at start. + */ +struct in6_msource { + RB_ENTRY(ip6_msource) im6s_link; /* Common field */ + struct in6_addr im6s_addr; /* Common field */ + uint8_t im6sl_st[2]; /* state before/at commit */ +}; + +/* + * IPv6 multicast PCB-layer group filter descriptor. + */ +struct in6_mfilter { + struct ip6_msource_tree im6f_sources; /* source list for (S,G) */ + u_long im6f_nsrc; /* # of source entries */ + uint8_t im6f_st[2]; /* state before/at commit */ +}; + +/* + * Legacy KAME IPv6 multicast membership descriptor. */ struct in6_multi_mship { struct in6_multi *i6mm_maddr; /* Multicast address pointer */ LIST_ENTRY(in6_multi_mship) i6mm_chain; /* multicast options chain */ }; +struct mld_ifinfo; + +/* + * The request count here is a count of requests for this address, not a + * count of pointers to this structure. 
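+ * (in6m_refcount below counts the pointer references instead, and is + * manipulated via the IN6M_ADDREF/IN6M_REMREF macros.)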
+ */ struct in6_multi { + decl_lck_mtx_data(, in6m_lock); + u_int32_t in6m_refcount; /* reference count */ + u_int32_t in6m_reqcnt; /* request count for this address */ + u_int32_t in6m_debug; /* see ifa_debug flags */ LIST_ENTRY(in6_multi) in6m_entry; /* list glue */ struct in6_addr in6m_addr; /* IP6 multicast address */ struct ifnet *in6m_ifp; /* back pointer to ifnet */ struct ifmultiaddr *in6m_ifma; /* back pointer to ifmultiaddr */ - u_int in6m_refcount; /* # membership claims by sockets */ u_int in6m_state; /* state of the membership */ u_int in6m_timer; /* MLD6 listener report timer */ + /* New fields for MLDv2 follow. */ + struct mld_ifinfo *in6m_mli; /* MLD info */ + SLIST_ENTRY(in6_multi) in6m_nrele; /* to-be-released by MLD */ + u_int32_t in6m_nrelecnt; /* deferred release count */ + struct ip6_msource_tree in6m_srcs; /* tree of sources */ + u_long in6m_nsrc; /* # of tree entries */ + + struct ifqueue in6m_scq; /* queue of pending + * state-change packets */ + struct timeval in6m_lastgsrtv; /* last G-S-R query */ + uint16_t in6m_sctimer; /* state-change timer */ + uint16_t in6m_scrv; /* state-change rexmit count */ + /* + * SSM state counters which track state at T0 (the time the last + * state-change report's RV timer went to zero) and T1 + * (time of pending report, i.e. now). + * Used for computing MLDv2 state-change reports. Several refcounts + * are maintained here to optimize for common use-cases. + */ + struct in6m_st { + uint16_t iss_fmode; /* MLD filter mode */ + uint16_t iss_asm; /* # of ASM listeners */ + uint16_t iss_ex; /* # of exclusive members */ + uint16_t iss_in; /* # of inclusive members */ + uint16_t iss_rec; /* # of recorded sources */ + } in6m_st[2]; /* state at t0, t1 */ + + void (*in6m_trace) /* callback fn for tracing refs */ + (struct in6_multi *, int); }; +#define IN6M_LOCK_ASSERT_HELD(_in6m) \ + lck_mtx_assert(&(_in6m)->in6m_lock, LCK_MTX_ASSERT_OWNED) + +#define IN6M_LOCK_ASSERT_NOTHELD(_in6m) \ + lck_mtx_assert(&(_in6m)->in6m_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define IN6M_LOCK(_in6m) \ + lck_mtx_lock(&(_in6m)->in6m_lock) + +#define IN6M_LOCK_SPIN(_in6m) \ + lck_mtx_lock_spin(&(_in6m)->in6m_lock) + +#define IN6M_CONVERT_LOCK(_in6m) do { \ + IN6M_LOCK_ASSERT_HELD(_in6m); \ + lck_mtx_convert_spin(&(_in6m)->in6m_lock); \ +} while (0) + +#define IN6M_UNLOCK(_in6m) \ + lck_mtx_unlock(&(_in6m)->in6m_lock) + +#define IN6M_ADDREF(_in6m) \ + in6m_addref(_in6m, 0) + +#define IN6M_ADDREF_LOCKED(_in6m) \ + in6m_addref(_in6m, 1) + +#define IN6M_REMREF(_in6m) \ + in6m_remref(_in6m, 0) + +#define IN6M_REMREF_LOCKED(_in6m) \ + in6m_remref(_in6m, 1) + +#define IN6M_TIMER_UNDEF -1 + +/* flags to in6_update_ifa */ +#define IN6_IFAUPDATE_DADDELAY 0x1 /* first time to configure an address */ + +struct ip6_moptions; +struct sockopt; +struct inpcb; + extern LIST_HEAD(in6_multihead, in6_multi) in6_multihead; /* @@ -674,23 +832,36 @@ struct in6_multistep { /* * Macros for looking up the in6_multi record for a given IP6 multicast * address on a given interface. If no matching record is found, "in6m" - * returns NLL. + * returns NULL. + * + * We do this differently compared to other BSD implementations; instead of + * walking the if_multiaddrs list at the interface and returning the + * ifma_protospec value of a matching entry, we search the global list + * of in6_multi records and find it that way.
Otherwise either the two + * structures (in6_multi, ifmultiaddr) need to be ref counted both ways, + * which will make things too complicated, or they need to reside in the + * same protected domain, which they aren't. + * + * Must be called with in6_multihead_lock held. */ - -#define IN6_LOOKUP_MULTI(addr, ifp, in6m) \ -/* struct in6_addr addr; */ \ -/* struct ifnet *ifp; */ \ -/* struct in6_multi *in6m; */ \ -do { \ - struct ifmultiaddr *_ifma; \ - for (_ifma = (ifp)->if_multiaddrs.lh_first; _ifma; \ - _ifma = _ifma->ifma_link.le_next) { \ - if (_ifma->ifma_addr->sa_family == AF_INET6 \ - && IN6_ARE_ADDR_EQUAL(&((struct sockaddr_in6 *)_ifma->ifma_addr)->sin6_addr, \ - &(addr))) \ - break; \ - } \ - (in6m) = (struct in6_multi *)(_ifma ? _ifma->ifma_protospec : 0); \ +#define IN6_LOOKUP_MULTI(addr, ifp, in6m) \ + /* struct in6_addr *addr; */ \ + /* struct ifnet *ifp; */ \ + /* struct in6_multi *in6m; */ \ +do { \ + struct in6_multistep _step; \ + IN6_FIRST_MULTI(_step, in6m); \ + while ((in6m) != NULL) { \ + IN6M_LOCK_SPIN(in6m); \ + if ((in6m)->in6m_ifp == (ifp) && \ + IN6_ARE_ADDR_EQUAL(&(in6m)->in6m_addr, (addr))) { \ + IN6M_ADDREF_LOCKED(in6m); \ + IN6M_UNLOCK(in6m); \ + break; \ + } \ + IN6M_UNLOCK(in6m); \ + IN6_NEXT_MULTI(_step, in6m); \ + } \ } while(0) /* @@ -699,34 +870,58 @@ do { \ * provide. IN6_FIRST_MULTI(), below, must be called to initialize "step" * and get the first record. Both macros return a NULL "in6m" when there * are no remaining records. + * + * Must be called with in6_multihead_lock held. */ #define IN6_NEXT_MULTI(step, in6m) \ -/* struct in6_multistep step; */ \ -/* struct in6_multi *in6m; */ \ -do { \ - if (((in6m) = (step).i_in6m) != NULL) \ - (step).i_in6m = (step).i_in6m->in6m_entry.le_next; \ -} while(0) + /* struct in6_multistep step; */ \ + /* struct in6_multi *in6m; */ \ +do { \ + in6_multihead_lock_assert(LCK_RW_ASSERT_HELD); \ + if (((in6m) = (step).i_in6m) != NULL) \ + (step).i_in6m = (step).i_in6m->in6m_entry.le_next; \ +} while (0) -#define IN6_FIRST_MULTI(step, in6m) \ -/* struct in6_multistep step; */ \ -/* struct in6_multi *in6m */ \ -do { \ - (step).i_in6m = in6_multihead.lh_first; \ - IN6_NEXT_MULTI((step), (in6m)); \ -} while(0) +#define IN6_FIRST_MULTI(step, in6m) \ + /* struct in6_multistep step; */ \ + /* struct in6_multi *in6m */ \ +do { \ + in6_multihead_lock_assert(LCK_RW_ASSERT_HELD); \ + (step).i_in6m = in6_multihead.lh_first; \ + IN6_NEXT_MULTI((step), (in6m)); \ +} while (0) -extern struct in6_multi *in6_addmulti(struct in6_addr *, struct ifnet *, - int *, int); -extern void in6_delmulti(struct in6_multi *, int); +/* Multicast private KPIs. */ +extern int im6o_mc_filter(const struct ip6_moptions *, const struct ifnet *, + const struct sockaddr *, const struct sockaddr *); +extern int in6_mc_join(struct ifnet *, const struct in6_addr *, + struct in6_mfilter *, struct in6_multi **, int); +extern int in6_mc_leave(struct in6_multi *, struct in6_mfilter *); +extern void in6m_clear_recorded(struct in6_multi *); +extern void in6m_commit(struct in6_multi *); +extern void in6m_purge(struct in6_multi *); +extern void in6m_print(const struct in6_multi *); +extern int in6m_record_source(struct in6_multi *, const struct in6_addr *); +extern int ip6_getmoptions(struct inpcb *, struct sockopt *); +extern int ip6_setmoptions(struct inpcb *, struct sockopt *); + +/* Legacy KAME multicast private KPIs. 
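+ * + * Assumed usage sketch (the argument order mirrors the declarations + * that follow; the error handling is illustrative): + * + *	int error; + *	struct in6_multi_mship *imm; + * + *	imm = in6_joingroup(ifp, &mcaddr, &error, 0); + *	if (imm == NULL) + *		return (error); + *	... + *	(void) in6_leavegroup(imm);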
*/ +extern struct in6_multi_mship *in6_joingroup(struct ifnet *, + struct in6_addr *, int *, int); +extern int in6_leavegroup(struct in6_multi_mship *); + +extern void in6_multi_init(void); +extern void in6m_addref(struct in6_multi *, int); +extern void in6m_remref(struct in6_multi *, int); +extern int in6_multi_detach(struct in6_multi *); extern int in6_ifindex2scopeid(int); extern int in6_mask2len(struct in6_addr *, u_char *); extern void in6_len2mask(struct in6_addr *, int); extern int in6_control(struct socket *, u_long, caddr_t, struct ifnet *, struct proc *); extern int in6_update_ifa(struct ifnet *, struct in6_aliasreq *, - struct in6_ifaddr *, int); -extern void in6_purgeaddr(struct ifaddr *, int); + struct in6_ifaddr *, int, int); +extern void in6_purgeaddr(struct ifaddr *); extern int in6if_do_dad(struct ifnet *); extern void in6_purgeif(struct ifnet *); extern void in6_savemkludge(struct in6_ifaddr *); @@ -744,21 +939,35 @@ extern void in6_prefixlen2mask(struct in6_addr *maskp, int len); extern int in6_prefix_add_ifid(int iilen, struct in6_ifaddr *ia); extern void in6_prefix_remove_ifid(int iilen, struct in6_ifaddr *ia); extern void in6_purgeprefix(struct ifnet *); +extern void in6_purgeaddrs(struct ifnet *); extern int in6_is_addr_deprecated(struct sockaddr_in6 *); +extern uint8_t im6s_get_mode(const struct in6_multi *, + const struct ip6_msource *, uint8_t); + +extern void im6f_leave(struct in6_mfilter *); +extern void im6f_purge(struct in6_mfilter *); struct inpcb; +struct ip6_pktopts; extern int in6_embedscope(struct in6_addr *, const struct sockaddr_in6 *, - struct inpcb *, struct ifnet **); + struct inpcb *, struct ifnet **, struct ip6_pktopts *); extern int in6_recoverscope(struct sockaddr_in6 *, const struct in6_addr *, struct ifnet *); -extern void in6_clearscope(struct in6_addr *); extern void in6_aliasreq_64_to_32(struct in6_aliasreq_64 *, struct in6_aliasreq_32 *); extern void in6_aliasreq_32_to_64(struct in6_aliasreq_32 *, struct in6_aliasreq_64 *); extern void in6_ifaddr_init(void); extern void in6_rtqdrain(void); -#endif /* KERNEL_PRIVATE */ +extern struct radix_node *in6_validate(struct radix_node *); +extern int in6_if2idlen(struct ifnet *); +extern int in6_src_ioctl (u_long, caddr_t); + +__private_extern__ void in6_multihead_lock_exclusive(void); +__private_extern__ void in6_multihead_lock_shared(void); +__private_extern__ void in6_multihead_lock_assert(int); +__private_extern__ void in6_multihead_lock_done(void); +#endif /* XNU_KERNEL_PRIVATE */ #endif /* _NETINET6_IN6_VAR_H_ */ diff --git a/bsd/netinet6/ip6_forward.c b/bsd/netinet6/ip6_forward.c index 202d8ccb1..f2d6e3bd6 100644 --- a/bsd/netinet6/ip6_forward.c +++ b/bsd/netinet6/ip6_forward.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2009-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -84,6 +84,7 @@ #include <netinet6/ip6_var.h> #include <netinet/icmp6.h> #include <netinet6/nd6.h> +#include <netinet6/scope6_var.h> #include <netinet/in_pcb.h> @@ -95,7 +96,6 @@ #include <netkey/key.h> extern int ipsec_bypass; #endif /* IPSEC */ -extern lck_mtx_t *ip6_mutex; #include <netinet6/ip6_fw.h> @@ -120,7 +120,7 @@ extern lck_mtx_t *ip6_mutex; void ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, - int srcrt, int locked) + int srcrt) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct sockaddr_in6 *dst; @@ -128,14 +128,24 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, int error, type = 0, code = 0; struct mbuf *mcopy = NULL; struct ifnet *ifp, *origifp; /* maybe unnecessary */ + u_int32_t inzone, outzone; + struct in6_addr src_in6, dst_in6; #if IPSEC struct secpolicy *sp = NULL; #endif struct timeval timenow; int tunneledv4 = 0; + unsigned int ifscope = IFSCOPE_NONE; +#if PF + struct pf_mtag *pf_mtag; +#endif /* PF */ getmicrotime(&timenow); - +#if PF + pf_mtag = pf_find_mtag(m); + if (pf_mtag != NULL && pf_mtag->rtableid != IFSCOPE_NONE) + ifscope = pf_mtag->rtableid; +#endif /* PF */ #if IPSEC /* @@ -181,12 +191,8 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, if (ip6->ip6_hlim <= IPV6_HLIMDEC) { /* XXX in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard) */ - if (locked) - lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_TIME_EXCEEDED, ICMP6_TIME_EXCEED_TRANSIT, 0); - if (locked) - lck_mtx_lock(ip6_mutex); return; } ip6->ip6_hlim -= IPV6_HLIMDEC; @@ -293,11 +299,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, state.ro = NULL; /* update at ipsec6_output_tunnel() */ state.dst = NULL; /* update at ipsec6_output_tunnel() */ - if (locked) - lck_mtx_unlock(ip6_mutex); error = ipsec6_output_tunnel(&state, sp, 0, &tunneledv4); - if (locked) - lck_mtx_lock(ip6_mutex); key_freesp(sp, KEY_SADB_UNLOCKED); if (tunneledv4) return; /* packet is gone - sent over IPv4 */ @@ -334,15 +336,6 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, skip_ipsec: #endif /* IPSEC */ - /* - * If "locked", ip6forward_rt points to the globally defined - * struct route cache which requires ip6_mutex, e.g. when this - * is called from ip6_input(). Else the caller is responsible - * for the struct route and its serialization (if needed), e.g. - * when this is called from ip6_rthdr0(). 
- */ -if (locked) - lck_mtx_assert(ip6_mutex, LCK_MTX_ASSERT_OWNED); dst = (struct sockaddr_in6 *)&ip6forward_rt->ro_dst; if ((rt = ip6forward_rt->ro_rt) != NULL) { RT_LOCK(rt); @@ -364,8 +357,8 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, ip6forward_rt->ro_rt = NULL; } /* this probably fails but give it a try again */ - rtalloc_ign((struct route *)ip6forward_rt, - RTF_PRCLONING); + rtalloc_scoped_ign((struct route *)ip6forward_rt, + RTF_PRCLONING, ifscope); if ((rt = ip6forward_rt->ro_rt) != NULL) { RT_LOCK(rt); /* Take an extra ref for ourselves */ @@ -376,14 +369,9 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, if (rt == NULL) { ip6stat.ip6s_noroute++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_noroute); - if (mcopy) { - if (locked) - lck_mtx_unlock(ip6_mutex); + if (mcopy) icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE, 0); - if (locked) - lck_mtx_lock(ip6_mutex); - } m_freem(m); return; } @@ -403,18 +391,14 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, dst->sin6_family = AF_INET6; dst->sin6_addr = ip6->ip6_dst; - rtalloc_ign((struct route *)ip6forward_rt, RTF_PRCLONING); + rtalloc_scoped_ign((struct route *)ip6forward_rt, + RTF_PRCLONING, ifscope); if ((rt = ip6forward_rt->ro_rt) == NULL) { ip6stat.ip6s_noroute++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_noroute); - if (mcopy) { - if (locked) - lck_mtx_unlock(ip6_mutex); + if (mcopy) icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOROUTE, 0); - if (locked) - lck_mtx_lock(ip6_mutex); - } m_freem(m); return; } @@ -424,14 +408,29 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, } /* - * Scope check: if a packet can't be delivered to its destination - * for the reason that the destination is beyond the scope of the - * source address, discard the packet and return an icmp6 destination - * unreachable error with Code 2 (beyond scope of source address). - * [draft-ietf-ipngwg-icmp-v3-02.txt, Section 3.1] + * Source scope check: if a packet can't be delivered to its + * destination for the reason that the destination is beyond the scope + * of the source address, discard the packet and return an icmp6 + * destination unreachable error with Code 2 (beyond scope of source + * address). We use a local copy of ip6_src, since in6_setscope() + * will possibly modify its first argument. + * [draft-ietf-ipngwg-icmp-v3-04.txt, Section 3.1] */ - if (in6_addr2scopeid(m->m_pkthdr.rcvif, &ip6->ip6_src) != - in6_addr2scopeid(rt->rt_ifp, &ip6->ip6_src)) { + src_in6 = ip6->ip6_src; + if (in6_setscope(&src_in6, rt->rt_ifp, &outzone)) { + /* XXX: this should not happen */ + ip6stat.ip6s_cantforward++; + ip6stat.ip6s_badscope++; + m_freem(m); + return; + } + if (in6_setscope(&src_in6, m->m_pkthdr.rcvif, &inzone)) { + ip6stat.ip6s_cantforward++; + ip6stat.ip6s_badscope++; + m_freem(m); + return; + } + if (inzone != outzone) { ip6stat.ip6s_cantforward++; ip6stat.ip6s_badscope++; in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard); @@ -450,17 +449,30 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); if (mcopy) { - if (locked) - lck_mtx_unlock(ip6_mutex); icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_BEYONDSCOPE, 0); - if (locked) - lck_mtx_lock(ip6_mutex); } m_freem(m); return; } + /* + * Destination scope check: if a packet is going to break the scope + * zone of the packet's destination address, discard it.
This case should + * usually be prevented by an appropriately configured routing table, but + * we need an explicit check because we may mistakenly forward the + * packet to a different zone by (e.g.) a default route. + */ + dst_in6 = ip6->ip6_dst; + if (in6_setscope(&dst_in6, m->m_pkthdr.rcvif, &inzone) != 0 || + in6_setscope(&dst_in6, rt->rt_ifp, &outzone) != 0 || + inzone != outzone) { + ip6stat.ip6s_cantforward++; + ip6stat.ip6s_badscope++; + m_freem(m); + return; + } + if (m->m_pkthdr.len > rt->rt_ifp->if_mtu) { in6_ifstat_inc(rt->rt_ifp, ifs6_in_toobig); if (mcopy) { @@ -475,7 +487,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, #if IPSEC /* * When we do IPsec tunnel ingress, we need to play - * with if_mtu value (decrement IPsec header size + * with the link MTU value (decrement IPsec header size * from mtu value). The code is much simpler than v4 * case, as we have the outgoing interface for * encapsulated packet as "rt->rt_ifp". @@ -499,11 +511,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, /* Release extra ref */ RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); - if (locked) - lck_mtx_unlock(ip6_mutex); icmp6_error(mcopy, ICMP6_PACKET_TOO_BIG, 0, mtu); - if (locked) - lck_mtx_lock(ip6_mutex); } else { /* Release extra ref */ RT_REMREF_LOCKED(rt); @@ -525,7 +533,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, * Also, don't send redirect if forwarding using a route * modified by a redirect. */ - if (rt->rt_ifp == m->m_pkthdr.rcvif && !srcrt && + if (ip6_sendredirects && rt->rt_ifp == m->m_pkthdr.rcvif && !srcrt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0) { if ((rt->rt_ifp->if_flags & IFF_POINTOPOINT) != 0) { /* @@ -540,12 +548,8 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, */ RT_REMREF_LOCKED(rt); /* Release extra ref */ RT_UNLOCK(rt); - if (locked) - lck_mtx_unlock(ip6_mutex); icmp6_error(mcopy, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR, 0); - if (locked) - lck_mtx_lock(ip6_mutex); m_freem(m); return; } @@ -611,29 +615,21 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, } else origifp = rt->rt_ifp; -#ifndef SCOPEDROUTING /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); -#endif ifp = rt->rt_ifp; /* Drop the lock but retain the extra ref */ RT_UNLOCK(rt); #if PF - if (locked) - lck_mtx_unlock(ip6_mutex); - /* Invoke outbound packet filter */ error = pf_af_hook(ifp, NULL, &m, AF_INET6, FALSE); - if (locked) - lck_mtx_lock(ip6_mutex); - if (error) { if (m != NULL) { panic("%s: unexpected packet %p\n", __func__, m); @@ -645,7 +641,7 @@ ip6_forward(struct mbuf *m, struct route_in6 *ip6forward_rt, ip6 = mtod(m, struct ip6_hdr *); #endif /* PF */ - error = nd6_output(ifp, origifp, m, dst, rt, locked); + error = nd6_output(ifp, origifp, m, dst, rt); if (error) { in6_ifstat_inc(ifp, ifs6_out_discard); ip6stat.ip6s_cantforward++; @@ -697,11 +693,7 @@ senderr: code = ICMP6_DST_UNREACH_ADDR; break; } - if (locked) - lck_mtx_unlock(ip6_mutex); icmp6_error(mcopy, type, code, 0); - if (locked) - lck_mtx_lock(ip6_mutex); /* Release extra ref */ RT_REMREF(rt); return; diff --git a/bsd/netinet6/ip6_fw.c b/bsd/netinet6/ip6_fw.c index f1b9f0508..ae221caad 100644 --- a/bsd/netinet6/ip6_fw.c +++ b/bsd/netinet6/ip6_fw.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2008 Apple Inc. All rights reserved. + * Copyright (c) 2003-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -150,11 +150,11 @@ static int ip6fw_sysctl SYSCTL_HANDLER_ARGS; SYSCTL_DECL(_net_inet6_ip6); SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Firewall"); SYSCTL_PROC(_net_inet6_ip6_fw, OID_AUTO, enable, - CTLTYPE_INT | CTLFLAG_RW, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_fw_enable, 0, ip6fw_sysctl, "I", "Enable ip6fw"); -SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, debug, CTLFLAG_RW, &fw6_debug, 0, ""); -SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, verbose, CTLFLAG_RW, &fw6_verbose, 0, ""); -SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, verbose_limit, CTLFLAG_RW, &fw6_verbose_limit, 0, ""); +SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED, &fw6_debug, 0, ""); +SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_LOCKED, &fw6_verbose, 0, ""); +SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, verbose_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &fw6_verbose_limit, 0, ""); static int ip6fw_sysctl SYSCTL_HANDLER_ARGS @@ -202,7 +202,6 @@ static void cp_to_user_32( struct ip6_fw_32 *userrule_32, struct ip6_fw *rule); static void cp_from_user_32( struct ip6_fw_32 *userrule_32, struct ip6_fw *rule); static char err_prefix[] = "ip6_fw_ctl:"; -extern lck_mtx_t *ip6_mutex; /* * Returns 1 if the port is matched by the vector, 0 otherwise @@ -390,17 +389,21 @@ iface_match(struct ifnet *ifp, union ip6_fw_if *ifu, int byname) struct ifaddr *ia; ifnet_lock_shared(ifp); - for (ia = ifp->if_addrlist.tqh_first; ia; ia = ia->ifa_list.tqe_next) + for (ia = ifp->if_addrlist.tqh_first; ia; + ia = ia->ifa_list.tqe_next) { - - if (ia->ifa_addr == NULL) - continue; - if (ia->ifa_addr->sa_family != AF_INET6) + IFA_LOCK_SPIN(ia); + if (ia->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ia); continue; + } if (!IN6_ARE_ADDR_EQUAL(&ifu->fu_via_ip6, &(((struct sockaddr_in6 *) - (ia->ifa_addr))->sin6_addr))) + (ia->ifa_addr))->sin6_addr))) { + IFA_UNLOCK(ia); continue; + } + IFA_UNLOCK(ia); ifnet_lock_done(ifp); return(1); } @@ -558,7 +561,7 @@ ip6_fw_chk(struct ip6_hdr **pip6, struct ip6_fw_chain *chain; struct ip6_fw *rule = NULL; struct ip6_hdr *ip6 = *pip6; - struct ifnet *const rif = ((*m)->m_flags & M_LOOP) ? ifunit("lo0") : (*m)->m_pkthdr.rcvif; + struct ifnet *const rif = ((*m)->m_flags & M_LOOP) ? 
lo_ifp : (*m)->m_pkthdr.rcvif; u_short offset = 0; int off = sizeof(struct ip6_hdr), nxt = ip6->ip6_nxt; u_short src_port, dst_port; @@ -870,18 +873,15 @@ got_match: } bcopy(&ti, ip6, sizeof(ti)); tcp_respond(NULL, ip6, (struct tcphdr *)(ip6 + 1), - *m, ack, seq, flags, IFSCOPE_NONE); + *m, ack, seq, flags, IFSCOPE_NONE, 0); *m = NULL; break; } default: /* Send an ICMP unreachable using code */ if (oif) (*m)->m_pkthdr.rcvif = oif; - lck_mtx_assert(ip6_mutex, LCK_MTX_ASSERT_OWNED); - lck_mtx_unlock(ip6_mutex); icmp6_error(*m, ICMP6_DST_UNREACH, rule->fw_reject_code, 0); - lck_mtx_lock(ip6_mutex); *m = NULL; break; } @@ -962,6 +962,7 @@ add_entry6(struct ip6_fw_head *chainptr, struct ip6_fw *frwl) } } + bcopy(ftmp, frwl, sizeof(struct ip6_fw)); splx(s); return (0); } @@ -1400,6 +1401,17 @@ ip6_fw_ctl(struct sockopt *sopt) ip6fw_kev_post_msg(KEV_IP6FW_ADD); } else error = EINVAL; + + if (is64user){ + struct ip6_fw_64 userrule_64; + cp_to_user_64( &userrule_64, &rule); + error = sooptcopyout(sopt, &userrule_64, userrulesize); + } + else { + struct ip6_fw_32 userrule_32; + cp_to_user_32( &userrule_32, &rule); + error = sooptcopyout(sopt, &userrule_32, userrulesize); + } break; case IPV6_FW_DEL: diff --git a/bsd/netinet6/ip6_fw.h b/bsd/netinet6/ip6_fw.h index 32f4f280b..92f913f29 100644 --- a/bsd/netinet6/ip6_fw.h +++ b/bsd/netinet6/ip6_fw.h @@ -42,6 +42,7 @@ #ifndef _IP6_FW_H #define _IP6_FW_H +#ifdef __APPLE_API_OBSOLETE #include <sys/appleapiopts.h> @@ -343,4 +344,5 @@ extern int ip6_fw_enable; #endif /* KERNEL_PRIVATE */ +#endif /* __APPLE_API_OBSOLETE */ #endif /* _IP6_FW_H */ diff --git a/bsd/netinet6/ip6_id.c b/bsd/netinet6/ip6_id.c new file mode 100644 index 000000000..26fffd286 --- /dev/null +++ b/bsd/netinet6/ip6_id.c @@ -0,0 +1,304 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +/*- + * Copyright (C) 2003 WIDE Project. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the project nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $KAME: ip6_id.c,v 1.13 2003/09/16 09:11:19 itojun Exp $ + */ + +/*- + * Copyright 1998 Niels Provos <provos@citi.umich.edu> + * All rights reserved. + * + * Theo de Raadt <deraadt@openbsd.org> came up with the idea of using + * such a mathematical system to generate more random (yet non-repeating) + * ids to solve the resolver/named problem. But Niels designed the + * actual system based on the constraints. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Niels Provos. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $OpenBSD: ip_id.c,v 1.6 2002/03/15 18:19:52 millert Exp $ + */ + +#include <sys/cdefs.h> + +/* + * seed = random (bits - 1) bit + * n = prime, g0 = generator to n, + * j = random so that gcd(j,n-1) == 1 + * g = g0^j mod n will be a generator again. + * + * X[0] = random seed. 
+ * X[n] = a*X[n-1]+b mod m is a Linear Congruential Generator
+ * with a = 7^(even random) mod m,
+ *      b = random with gcd(b,m) == 1
+ *      m = constant and a maximal period of m-1.
+ *
+ * The transaction id is determined by:
+ * id[n] = seed xor (g^X[n] mod n)
+ *
+ * Effectively the id is restricted to the lower (bits - 1) bits, thus
+ * yielding two different cycles by toggling the msb on and off.
+ * This avoids reuse issues caused by reseeding.
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/kernel.h>
+#include <sys/random.h>
+#include <libkern/libkern.h>
+
+#include <net/if.h>
+#include <net/route.h>
+#include <netinet/in.h>
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
+
+#ifndef INT32_MAX
+#define INT32_MAX 0x7fffffffU
+#endif
+
+struct randomtab {
+        const int       ru_bits;        /* resulting bits */
+        const long      ru_out;         /* Time after which it will be reseeded */
+        const u_int32_t ru_max;         /* Unique cycle, avoid blackjack prediction */
+        const u_int32_t ru_gen;         /* Starting generator */
+        const u_int32_t ru_n;           /* ru_n: prime, ru_n - 1: product of pfacts[] */
+        const u_int32_t ru_agen;        /* determine ru_a as ru_agen^(2*rand) */
+        const u_int32_t ru_m;           /* ru_m = 2^x*3^y */
+        const u_int32_t pfacts[4];      /* factors of ru_n */
+
+        u_int32_t ru_counter;
+        u_int32_t ru_msb;
+
+        u_int32_t ru_x;
+        u_int32_t ru_seed, ru_seed2;
+        u_int32_t ru_a, ru_b;
+        u_int32_t ru_g;
+        long ru_reseed;
+};
+
+static struct randomtab randomtab_32 = {
+        32,                     /* resulting bits */
+        180,                    /* Time after which it will be reseeded */
+        1000000000,             /* Unique cycle, avoid blackjack prediction */
+        2,                      /* Starting generator */
+        2147483629,             /* RU_N-1 = 2^2*3^2*59652323 */
+        7,                      /* determine ru_a as RU_AGEN^(2*rand) */
+        1836660096,             /* RU_M = 2^7*3^15 - don't change */
+        { 2, 3, 59652323, 0 },  /* factors of ru_n */
+        0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+static struct randomtab randomtab_20 = {
+        20,                     /* resulting bits */
+        180,                    /* Time after which it will be reseeded */
+        200000,                 /* Unique cycle, avoid blackjack prediction */
+        2,                      /* Starting generator */
+        524269,                 /* RU_N-1 = 2^2*3^2*14563 */
+        7,                      /* determine ru_a as RU_AGEN^(2*rand) */
+        279936,                 /* RU_M = 2^7*3^7 - don't change */
+        { 2, 3, 14563, 0 },     /* factors of ru_n */
+        0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+static u_int32_t pmod(u_int32_t, u_int32_t, u_int32_t);
+static void initid(struct randomtab *);
+static u_int32_t randomid(struct randomtab *);
+
+/*
+ * Do a fast modular exponentiation; the returned value will be in
+ * the range of 0 - (mod-1)
+ */
+static u_int32_t
+pmod(u_int32_t gen, u_int32_t expo, u_int32_t mod)
+{
+        u_int64_t s, t, u;
+
+        s = 1;
+        t = gen;
+        u = expo;
+
+        while (u) {
+                if (u & 1)
+                        s = (s * t) % mod;
+                u >>= 1;
+                t = (t * t) % mod;
+        }
+        return (s);
+}
+
+/*
+ * Initializes the seed and chooses a suitable generator. Also toggles
+ * the msb flag. The msb flag is used to generate two distinct
+ * cycles of random numbers and thus avoids reuse of ids.
+ *
+ * This function is called from id_randomid() when needed; an
+ * application does not have to worry about it.
+ */ +static void +initid(struct randomtab *p) +{ + u_int32_t j, i; + int noprime = 1; + struct timeval timenow; + + getmicrotime(&timenow); + + p->ru_x = random() % p->ru_m; + + /* (bits - 1) bits of random seed */ + p->ru_seed = random() & (~0U >> (32 - p->ru_bits + 1)); + p->ru_seed2 = random() & (~0U >> (32 - p->ru_bits + 1)); + + /* Determine the LCG we use */ + p->ru_b = (random() & (~0U >> (32 - p->ru_bits))) | 1; + p->ru_a = pmod(p->ru_agen, + (random() & (~0U >> (32 - p->ru_bits))) & (~1U), p->ru_m); + while (p->ru_b % 3 == 0) + p->ru_b += 2; + + j = random() % p->ru_n; + + /* + * Do a fast gcd(j, RU_N - 1), so we can find a j with + * gcd(j, RU_N - 1) == 1, giving a new generator for + * RU_GEN^j mod RU_N + */ + while (noprime) { + for (i = 0; p->pfacts[i] > 0; i++) + if (j % p->pfacts[i] == 0) + break; + + if (p->pfacts[i] == 0) + noprime = 0; + else + j = (j + 1) % p->ru_n; + } + + p->ru_g = pmod(p->ru_gen, j, p->ru_n); + p->ru_counter = 0; + + p->ru_reseed = timenow.tv_sec + p->ru_out; + p->ru_msb = p->ru_msb ? 0 : (1U << (p->ru_bits - 1)); +} + +static u_int32_t +randomid(struct randomtab *p) +{ + int i, n; + u_int32_t tmp; + struct timeval timenow; + + getmicrotime(&timenow); + + if (p->ru_counter >= p->ru_max || timenow.tv_sec > p->ru_reseed) + initid(p); + + tmp = random(); + + /* Skip a random number of ids */ + n = tmp & 0x3; tmp = tmp >> 2; + if (p->ru_counter + n >= p->ru_max) + initid(p); + + for (i = 0; i <= n; i++) { + /* Linear Congruential Generator */ + p->ru_x = (u_int32_t)((u_int64_t)p->ru_a * p->ru_x + p->ru_b) % p->ru_m; + } + + p->ru_counter += i; + + return (p->ru_seed ^ pmod(p->ru_g, p->ru_seed2 ^ p->ru_x, p->ru_n)) | + p->ru_msb; +} + +u_int32_t +ip6_randomid(void) +{ + + return randomid(&randomtab_32); +} + +u_int32_t +ip6_randomflowlabel(void) +{ + + return randomid(&randomtab_20) & 0xfffff; +} diff --git a/bsd/netinet6/ip6_input.c b/bsd/netinet6/ip6_input.c index 0a8320298..ae8fecd68 100644 --- a/bsd/netinet6/ip6_input.c +++ b/bsd/netinet6/ip6_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2008 Apple Inc. All rights reserved. + * Copyright (c) 2003-2011 Apple Inc. All rights reserved. 
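[Editor's note on the ip6_id.c generator above: it computes id[n] = seed xor (g^X[n] mod n), stepping X[n] with a linear congruential generator and reseeding periodically. The stand-alone sketch below walks the same arithmetic with toy parameters; the modulus, generator, LCG coefficients, and seed here are illustrative, not the kernel's randomtab values, and the reseeding and msb-toggle machinery is omitted.]

    #include <stdint.h>
    #include <stdio.h>

    /* Square-and-multiply modular exponentiation, mirroring pmod() above. */
    static uint32_t
    pmod(uint32_t gen, uint32_t expo, uint32_t mod)
    {
        uint64_t s = 1, t = gen, u = expo;

        while (u) {
            if (u & 1)
                s = (s * t) % mod;
            u >>= 1;
            t = (t * t) % mod;
        }
        return ((uint32_t)s);
    }

    int
    main(void)
    {
        /* Toy parameters, not the kernel's randomtab_32/_20 values. */
        uint32_t n = 524269;            /* prime modulus for the exponentiation */
        uint32_t g = 2;                 /* generator */
        uint32_t m = 279936;            /* LCG modulus, 2^7 * 3^7 */
        uint32_t a = 7, b = 11;         /* LCG coefficients, gcd(b, m) == 1 */
        uint32_t x = 12345, seed = 0xbeef;
        int i;

        for (i = 0; i < 4; i++) {
            /* X[n] = a*X[n-1] + b mod m, computed in 64 bits to avoid overflow */
            x = (uint32_t)(((uint64_t)a * x + b) % m);
            printf("id[%d] = 0x%05x\n", i, (unsigned int)(seed ^ pmod(g, x, n)));
        }
        return (0);
    }

Because pmod() is square-and-multiply, each id costs only O(log n) multiplications, which is why the kernel can afford this per fragment or per flow label.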
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -105,8 +105,13 @@ #include <sys/time.h> #include <sys/kernel.h> #include <sys/syslog.h> +#include <sys/sysctl.h> #include <sys/proc.h> #include <sys/kauth.h> +#include <sys/mcache.h> +#include <mach/mach_time.h> + +#include <pexpert/pexpert.h> #include <net/if.h> #include <net/if_var.h> @@ -114,6 +119,7 @@ #include <net/if_dl.h> #include <net/route.h> #include <net/kpi_protocol.h> +#include <net/ntstat.h> #include <netinet/in.h> #include <netinet/in_systm.h> @@ -128,7 +134,8 @@ #include <netinet/icmp6.h> #include <netinet6/in6_ifattach.h> #include <netinet6/nd6.h> -#include <netinet6/in6_prefix.h> +#include <netinet6/scope6_var.h> +#include <mach/sdt.h> #if IPSEC #include <netinet6/ipsec.h> @@ -159,7 +166,14 @@ extern struct ip6protosw inet6sw[]; struct ip6protosw * ip6_protox[IPPROTO_MAX]; static int ip6qmaxlen = IFQ_MAXLEN; -struct in6_ifaddr *in6_ifaddrs; + +static lck_grp_attr_t *in6_ifaddr_rwlock_grp_attr; +static lck_grp_t *in6_ifaddr_rwlock_grp; +static lck_attr_t *in6_ifaddr_rwlock_attr; +decl_lck_rw_data(, in6_ifaddr_rwlock); + +/* Protected by in6_ifaddr_rwlock */ +struct in6_ifaddr *in6_ifaddrs = NULL; int ip6_forward_srcrt; /* XXX */ int ip6_sourcecheck; /* XXX */ @@ -168,7 +182,14 @@ const int int6intrq_present = 1; int ip6_ours_check_algorithm; int in6_init2done = 0; +int in6_init_done = 0; +#define _CASSERT(x) \ + switch (0) { case 0: case (x): ; } +#define IN6_IFSTAT_REQUIRE_ALIGNED_64(f) \ + _CASSERT(!(offsetof(struct in6_ifstat, f) % sizeof (uint64_t))) +#define ICMP6_IFSTAT_REQUIRE_ALIGNED_64(f) \ + _CASSERT(!(offsetof(struct icmp6_ifstat, f) % sizeof (uint64_t))) #if IPFW2 /* firewall hooks */ @@ -181,17 +202,23 @@ struct ip6stat ip6stat; #ifdef __APPLE__ struct ifqueue ip6intrq; -lck_mtx_t *ip6_mutex; +decl_lck_mtx_data(, ip6_init_mutex); lck_mtx_t *dad6_mutex; lck_mtx_t *nd6_mutex; lck_mtx_t *prefix6_mutex; lck_mtx_t *scope6_mutex; +#ifdef ENABLE_ADDRSEL +lck_mtx_t *addrsel_mutex; +#endif +decl_lck_rw_data(, in6_ifs_rwlock); +decl_lck_rw_data(, icmp6_ifs_rwlock); lck_attr_t *ip6_mutex_attr; lck_grp_t *ip6_mutex_grp; lck_grp_attr_t *ip6_mutex_grp_attr; extern lck_mtx_t *inet6_domain_mutex; #endif extern int loopattach_done; +extern void addrsel_policy_init(void); static void ip6_init2(void *); static struct ip6aux *ip6_setdstifaddr(struct mbuf *, struct in6_ifaddr *); @@ -209,6 +236,11 @@ void stfattach(void); extern lck_mtx_t *domain_proto_mtx; +SYSCTL_DECL(_net_inet6_ip6); + +int ip6_doscopedroute = 1; +SYSCTL_INT(_net_inet6_ip6, OID_AUTO, scopedroute, CTLFLAG_RD | CTLFLAG_LOCKED, + &ip6_doscopedroute, 0, "Enable IPv6 scoped routing"); static void ip6_proto_input( @@ -229,6 +261,9 @@ ip6_init() int i; struct timeval tv; + PE_parse_boot_argn("net.inet6.ip6.scopedroute", &ip6_doscopedroute, + sizeof (ip6_doscopedroute)); + #if DIAGNOSTIC if (sizeof(struct protosw) != sizeof(struct ip6protosw)) panic("sizeof(protosw) != sizeof(ip6protosw)"); @@ -251,9 +286,6 @@ ip6_init() ip6_mutex_grp = lck_grp_alloc_init("ip6", ip6_mutex_grp_attr); ip6_mutex_attr = lck_attr_alloc_init(); - if ((ip6_mutex = lck_mtx_alloc_init(ip6_mutex_grp, ip6_mutex_attr)) == NULL) { - panic("ip6_init: can't alloc ip6_mutex\n"); - } if ((dad6_mutex = lck_mtx_alloc_init(ip6_mutex_grp, ip6_mutex_attr)) == NULL) { panic("ip6_init: can't alloc dad6_mutex\n"); } @@ -269,14 +301,90 @@ ip6_init() panic("ip6_init: can't alloc scope6_mutex\n"); } +#ifdef ENABLE_ADDRSEL + if ((addrsel_mutex = lck_mtx_alloc_init(ip6_mutex_grp, ip6_mutex_attr)) == NULL) { + 
panic("ip6_init: can't alloc addrsel_mutex\n"); + } +#endif + + lck_rw_init(&in6_ifs_rwlock, ip6_mutex_grp, ip6_mutex_attr); + lck_rw_init(&icmp6_ifs_rwlock, ip6_mutex_grp, ip6_mutex_attr); + lck_mtx_init(&ip6_init_mutex, ip6_mutex_grp, ip6_mutex_attr); inet6domain.dom_flags = DOM_REENTRANT; ip6intrq.ifq_maxlen = ip6qmaxlen; + + in6_ifaddr_rwlock_grp_attr = lck_grp_attr_alloc_init(); + in6_ifaddr_rwlock_grp = lck_grp_alloc_init("in6_ifaddr_rwlock", + in6_ifaddr_rwlock_grp_attr); + in6_ifaddr_rwlock_attr = lck_attr_alloc_init(); + lck_rw_init(&in6_ifaddr_rwlock, in6_ifaddr_rwlock_grp, + in6_ifaddr_rwlock_attr); + + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_receive); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_hdrerr); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_toobig); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_noroute); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_addrerr); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_protounknown); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_truncated); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_discard); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_deliver); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_forward); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_request); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_discard); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_fragok); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_fragfail); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_fragcreat); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_reass_reqd); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_reass_ok); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_reass_fail); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_mcast); + IN6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_mcast); + + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_msg); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_error); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_dstunreach); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_adminprohib); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_timeexceed); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_paramprob); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_pkttoobig); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_echo); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_echoreply); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_routersolicit); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_routeradvert); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_neighborsolicit); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_neighboradvert); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_redirect); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_mldquery); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_mldreport); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_in_mlddone); + + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_msg); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_error); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_dstunreach); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_adminprohib); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_timeexceed); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_paramprob); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_pkttoobig); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_echo); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_echoreply); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_routersolicit); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_routeradvert); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_neighborsolicit); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_neighboradvert); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_redirect); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_mldquery); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_mldreport); + ICMP6_IFSTAT_REQUIRE_ALIGNED_64(ifs6_out_mlddone); + in6_ifaddr_init(); + ip6_moptions_init(); nd6_init(); 
frag6_init(); icmp6_init(); + addrsel_policy_init(); /* * in many cases, random() here does NOT return random number * as initialization during bootstrap time occur in fixed order. @@ -310,9 +418,6 @@ ip6_init2( /* nd6_timer_init */ timeout(nd6_timer, (caddr_t)0, hz); - /* router renumbering prefix list maintenance */ - timeout(in6_rr_timer, (caddr_t)0, hz); - /* timer for regeneranation of temporary addresses randomize ID */ timeout(in6_tmpaddrtimer, (caddr_t)0, (ip6_temp_preferred_lifetime - ip6_desync_factor - @@ -327,42 +432,27 @@ ip6_init2( #if NSTF stfattach(); #endif -#else - /* nd6_timer_init */ - - callout_init(&nd6_timer_ch); - callout_reset(&nd6_timer_ch, hz, nd6_timer, NULL); - - /* router renumbering prefix list maintenance */ - callout_init(&in6_rr_timer_ch); - callout_reset(&in6_rr_timer_ch, hz, in6_rr_timer, NULL); - - /* timer for regeneranation of temporary addresses randomize ID */ - callout_reset(&in6_tmpaddrtimer_ch, - (ip6_temp_preferred_lifetime - ip6_desync_factor - - ip6_temp_regen_advance) * hz, - in6_tmpaddrtimer, NULL); #endif - in6_init2done = 1; -} -#if __FreeBSD__ -/* cheat */ -/* This must be after route_init(), which is now SI_ORDER_THIRD */ -SYSINIT(netinet6init2, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ip6_init2, NULL); -#endif + lck_mtx_lock(&ip6_init_mutex); + in6_init_done = 1; + wakeup(&in6_init_done); + lck_mtx_unlock(&ip6_init_mutex); +} -/* - * ip6_forward_rt contains the route entry that was recently used during - * the forwarding of an IPv6 packet and thus acts as a route cache. Access - * to this variable is protected by the global lock ip6_mutex. - */ -static struct route_in6 ip6_forward_rt; +void +ip6_fin() +{ + lck_mtx_lock(&ip6_init_mutex); + while (in6_init_done == 0) { + (void) msleep(&in6_init_done, &ip6_init_mutex, 0, "ip6_fin()", NULL); + } + lck_mtx_unlock(&ip6_init_mutex); +} void -ip6_input(m) - struct mbuf *m; +ip6_input(struct mbuf *m) { struct ip6_hdr *ip6; int off = sizeof(struct ip6_hdr), nest; @@ -372,6 +462,16 @@ ip6_input(m) struct ifnet *deliverifp = NULL; ipfilter_t inject_ipfref = 0; int seen; + struct in6_ifaddr *ia6 = NULL; + struct route_in6 ip6_forward_rt; + struct sockaddr_in6 *dst6; + + bzero(&ip6_forward_rt, sizeof(ip6_forward_rt)); + + /* Check if the packet we received is valid after interface filter + * processing + */ + MBUF_INPUT_CHECK(m, m->m_pkthdr.rcvif); /* * No need to proccess packet twice if we've @@ -402,7 +502,6 @@ ip6_input(m) */ ip6_delaux(m); - lck_mtx_lock(ip6_mutex); /* * mbuf statistics */ @@ -425,6 +524,15 @@ ip6_input(m) #undef M2MMAX } + /* drop the packet if IPv6 operation is disabled on the IF */ + lck_rw_lock_shared(nd_if_rwlock); + if (m->m_pkthdr.rcvif->if_index < nd_ifinfo_indexlim && + (nd_ifinfo[m->m_pkthdr.rcvif->if_index].flags & ND6_IFF_IFDISABLED)) { + lck_rw_done(nd_if_rwlock); + goto bad; + } + lck_rw_done(nd_if_rwlock); + in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_receive); ip6stat.ip6s_total++; @@ -447,11 +555,8 @@ ip6_input(m) n = NULL; } } - if (n == NULL) { - m_freem(m); - lck_mtx_unlock(ip6_mutex); - return; /*ENOBUFS*/ - } + if (n == NULL) + goto bad; m_copydata(m, 0, m->m_pkthdr.len, mtod(n, caddr_t)); n->m_len = m->m_pkthdr.len; @@ -459,7 +564,7 @@ ip6_input(m) m = n; } IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), - {lck_mtx_unlock(ip6_mutex); return;}); + {goto done;}); #endif if (m->m_len < sizeof(struct ip6_hdr)) { @@ -468,8 +573,7 @@ ip6_input(m) if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == 0) { ip6stat.ip6s_toosmall++; in6_ifstat_inc(inifp, ifs6_in_hdrerr); - 
lck_mtx_unlock(ip6_mutex); - return; + goto done; } } @@ -495,10 +599,8 @@ ip6_input(m) m_freem(m); m = NULL; } - if (!m) { - lck_mtx_unlock(ip6_mutex); - return; - } + if (!m) + goto done; } #endif @@ -514,9 +616,14 @@ ip6_input(m) in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } - if ((IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) || - IN6_IS_ADDR_LOOPBACK(&ip6->ip6_dst)) && - (m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) == 0) { + if (IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) && + !(m->m_flags & M_LOOP)) { + /* + * In this case, the packet should come from the loopback + * interface. However, we cannot just check the if_flags, + * because ip6_mloopback() passes the "actual" interface + * as the outgoing/incoming interface. + */ ip6stat.ip6s_badscope++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; @@ -526,13 +633,13 @@ ip6_input(m) * The following check is not documented in specs. A malicious * party may be able to use IPv4 mapped addr to confuse tcp/udp stack * and bypass security checks (act as if it was from 127.0.0.1 by using - * IPv6 src ::ffff:127.0.0.1). Be cautious. + * IPv6 src ::ffff:127.0.0.1). Be cautious. * * This check chokes if we are in an SIIT cloud. As none of BSDs * support IPv4-less kernel compilation, we cannot support SIIT * environment at all. So, it makes more sense for us to reject any * malicious packets for non-SIIT environment, than try to do a - * partical support for SIIT environment. + * partial support for SIIT environment. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { @@ -556,19 +663,42 @@ ip6_input(m) } #endif + /* + * Naively assume we can attribute inbound data to the route we would + * use to send to this destination. Asymetric routing breaks this + * assumption, but it still allows us to account for traffic from + * a remote node in the routing table. + * this has a very significant performance impact so we bypass + * if nstat_collect is disabled. We may also bypass if the + * protocol is tcp in the future because tcp will have a route that + * we can use to attribute the data to. That does mean we would not + * account for forwarded tcp traffic. + */ + if (nstat_collect) { + struct rtentry *rte = + ifnet_cached_rtlookup_inet6(m->m_pkthdr.rcvif, + &ip6->ip6_src); + if (rte != NULL) { + nstat_route_rx(rte, 1, m->m_pkthdr.len, 0); + rtfree(rte); + } + } + #if PF /* Invoke inbound packet filter */ - lck_mtx_unlock(ip6_mutex); - if (pf_af_hook(m->m_pkthdr.rcvif, NULL, &m, AF_INET6, TRUE) != 0) { - if (m != NULL) { - panic("%s: unexpected packet %p\n", __func__, m); - /* NOTREACHED */ + if (PF_IS_ENABLED) { + int error; + error = pf_af_hook(m->m_pkthdr.rcvif, NULL, &m, AF_INET6, TRUE); + if (error != 0) { + if (m != NULL) { + panic("%s: unexpected packet %p\n", __func__, m); + /* NOTREACHED */ + } + /* Already freed by callee */ + goto done; } - /* Already freed by callee */ - return; + ip6 = mtod(m, struct ip6_hdr *); } - ip6 = mtod(m, struct ip6_hdr *); - lck_mtx_lock(ip6_mutex); #endif /* PF */ /* drop packets if interface ID portion is already filled */ @@ -592,39 +722,11 @@ ip6_input(m) ip6->ip6_dst.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index); -#if 0 /* this case seems to be unnecessary. (jinmei, 20010401) */ - /* - * We use rt->rt_ifp to determine if the address is ours or not. - * If rt_ifp is lo0, the address is ours. - * The problem here is, rt->rt_ifp for fe80::%lo0/64 is set to lo0, - * so any address under fe80::%lo0/64 will be mistakenly considered - * local. 
The special case is supplied to handle the case properly - * by actually looking at interface addresses - * (using in6ifa_ifpwithaddr). - */ - if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) != 0 && - IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_dst)) { - struct in6_ifaddr *ia6; - if (!(ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif, &ip6->ip6_dst))) { - lck_mtx_unlock(ip6_mutex); - icmp6_error(m, ICMP6_DST_UNREACH, - ICMP6_DST_UNREACH_ADDR, 0); - /* m is already freed */ - return; - } - ifafree(&ia6->ia_ifa); - - ours = 1; - deliverifp = m->m_pkthdr.rcvif; - goto hbhcheck; - } -#endif - /* * Multicast check */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { - struct in6_multi *in6m = 0; + struct in6_multi *in6m = NULL; struct ifnet *ifp = m->m_pkthdr.rcvif; in6_ifstat_inc(ifp, ifs6_in_mcast); @@ -632,16 +734,18 @@ ip6_input(m) * See if we belong to the destination multicast group on the * arrival interface. */ - ifnet_lock_shared(ifp); - IN6_LOOKUP_MULTI(ip6->ip6_dst, ifp, in6m); - ifnet_lock_done(ifp); - if (in6m) + in6_multihead_lock_shared(); + IN6_LOOKUP_MULTI(&ip6->ip6_dst, ifp, in6m); + in6_multihead_lock_done(); + if (in6m != NULL) { + IN6M_REMREF(in6m); ours = 1; + } + else #if MROUTING - else if (!ip6_mrouter) { -#else - else { + if (!ip6_mrouter) #endif + { ip6stat.ip6s_notmember++; ip6stat.ip6s_cantforward++; in6_ifstat_inc(ifp, ifs6_in_discard); @@ -651,42 +755,18 @@ ip6_input(m) goto hbhcheck; } - if (ip6_forward_rt.ro_rt != NULL) - RT_LOCK(ip6_forward_rt.ro_rt); /* * Unicast check */ - if (ip6_forward_rt.ro_rt != NULL && - (ip6_forward_rt.ro_rt->rt_flags & RTF_UP) && - IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, - &((struct sockaddr_in6 *)(&ip6_forward_rt.ro_dst))->sin6_addr) && - ip6_forward_rt.ro_rt->generation_id == route_generation) { - ip6stat.ip6s_forward_cachehit++; - } else { - struct sockaddr_in6 *dst6; - - if (ip6_forward_rt.ro_rt != NULL) { - /* route is down/stale or destination is different */ - ip6stat.ip6s_forward_cachemiss++; - RT_UNLOCK(ip6_forward_rt.ro_rt); - rtfree(ip6_forward_rt.ro_rt); - ip6_forward_rt.ro_rt = NULL; - } - - bzero(&ip6_forward_rt.ro_dst, sizeof(struct sockaddr_in6)); - dst6 = (struct sockaddr_in6 *)&ip6_forward_rt.ro_dst; - dst6->sin6_len = sizeof(struct sockaddr_in6); - dst6->sin6_family = AF_INET6; - dst6->sin6_addr = ip6->ip6_dst; -#if SCOPEDROUTING - ip6_forward_rt.ro_dst.sin6_scope_id = - in6_addr2scopeid(m->m_pkthdr.rcvif, &ip6->ip6_dst); -#endif + dst6 = (struct sockaddr_in6 *)&ip6_forward_rt.ro_dst; + dst6->sin6_len = sizeof(struct sockaddr_in6); + dst6->sin6_family = AF_INET6; + dst6->sin6_addr = ip6->ip6_dst; - rtalloc_ign((struct route *)&ip6_forward_rt, RTF_PRCLONING); - if (ip6_forward_rt.ro_rt != NULL) - RT_LOCK(ip6_forward_rt.ro_rt); - } + rtalloc_scoped_ign((struct route *)&ip6_forward_rt, + RTF_PRCLONING, IFSCOPE_NONE); + if (ip6_forward_rt.ro_rt != NULL) + RT_LOCK(ip6_forward_rt.ro_rt); #define rt6_key(r) ((struct sockaddr_in6 *)((r)->rt_nodes->rn_key)) @@ -726,8 +806,7 @@ ip6_input(m) &rt6_key(ip6_forward_rt.ro_rt)->sin6_addr) #endif ip6_forward_rt.ro_rt->rt_ifp->if_type == IFT_LOOP) { - struct in6_ifaddr *ia6 = - (struct in6_ifaddr *)ip6_forward_rt.ro_rt->rt_ifa; + ia6 = (struct in6_ifaddr *)ip6_forward_rt.ro_rt->rt_ifa; /* * record address information into m_aux. @@ -738,31 +817,32 @@ ip6_input(m) * packets to a tentative, duplicated, or somehow invalid * address must not be accepted. 
*/ + RT_CONVERT_LOCK(ip6_forward_rt.ro_rt); /* just in case */ + IFA_LOCK_SPIN(&ia6->ia_ifa); if (!(ia6->ia6_flags & IN6_IFF_NOTREADY)) { + IFA_UNLOCK(&ia6->ia_ifa); /* this address is ready */ ours = 1; deliverifp = ia6->ia_ifp; /* correct? */ /* Count the packet in the ip address stats */ -#ifndef __APPLE__ - ia6->ia_ifa.if_ipackets++; - ia6->ia_ifa.if_ibytes += m->m_pkthdr.len; -#endif RT_UNLOCK(ip6_forward_rt.ro_rt); + ia6 = NULL; goto hbhcheck; - } else { - RT_UNLOCK(ip6_forward_rt.ro_rt); - /* address is not ready, so discard the packet. */ - nd6log((LOG_INFO, - "ip6_input: packet to an unready address %s->%s\n", - ip6_sprintf(&ip6->ip6_src), - ip6_sprintf(&ip6->ip6_dst))); - goto bad; } + IFA_UNLOCK(&ia6->ia_ifa); + RT_UNLOCK(ip6_forward_rt.ro_rt); + /* address is not ready, so discard the packet. */ + nd6log((LOG_INFO, + "ip6_input: packet to an unready address %s->%s\n", + ip6_sprintf(&ip6->ip6_src), + ip6_sprintf(&ip6->ip6_dst))); + ia6 = NULL; + goto bad; } /* - * FAITH(Firewall Aided Internet Translator) + * FAITH (Firewall Aided Internet Translator) */ #if defined(NFAITH) && 0 < NFAITH if (ip6_keepfaith) { @@ -796,9 +876,7 @@ ip6_input(m) * as our interface address (e.g. multicast addresses, addresses * within FAITH prefixes and such). */ - if (deliverifp && !ip6_getdstifaddr(m)) { - struct in6_ifaddr *ia6; - + if (deliverifp && (ia6 = ip6_getdstifaddr(m)) == NULL) { ia6 = in6_ifawithifp(deliverifp, &ip6->ip6_dst); if (ia6) { if (!ip6_setdstifaddr(m, ia6)) { @@ -808,10 +886,16 @@ ip6_input(m) * to the upper layers. */ } - ifafree(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); + ia6 = NULL; } } + if (ia6 != NULL) { + IFA_REMREF(&ia6->ia_ifa); + ia6 = NULL; + } + /* * Process Hop-by-Hop options header if it's contained. * m may be modified in ip6_hopopts_input(). @@ -825,8 +909,7 @@ ip6_input(m) #if 0 /*touches NULL pointer*/ in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); #endif - lck_mtx_unlock(ip6_mutex); - return; /* m have already been freed */ + goto done; /* m have already been freed */ } /* adjust pointer */ @@ -840,17 +923,16 @@ ip6_input(m) if (ip6->ip6_plen == 0 && plen == 0) { /* * Note that if a valid jumbo payload option is - * contained, ip6_hoptops_input() must set a valid - * (non-zero) payload length to the variable plen. + * contained, ip6_hopopts_input() must set a valid + * (non-zero) payload length to the variable plen. */ ip6stat.ip6s_badoptions++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); - lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&ip6->ip6_plen - (caddr_t)ip6); - return; + goto done; } #ifndef PULLDOWN_TEST /* ip6_hopopts_input() ensures that mbuf is contiguous */ @@ -860,18 +942,31 @@ ip6_input(m) sizeof(struct ip6_hbh)); if (hbh == NULL) { ip6stat.ip6s_tooshort++; - lck_mtx_unlock(ip6_mutex); - return; + goto done; } #endif nxt = hbh->ip6h_nxt; /* - * accept the packet if a router alert option is included - * and we act as an IPv6 router. + * If we are acting as a router and the packet contains a + * router alert option, see if we know the option value. + * Currently, we only support the option value for MLD, in which + * case we should pass the packet to the multicast routing + * daemon. */ - if (rtalert != ~0 && ip6_forwarding) - ours = 1; + if (rtalert != ~0 && ip6_forwarding) { + switch (rtalert) { + case IP6OPT_RTALERT_MLD: + ours = 1; + break; + default: + /* + * RFC2711 requires unrecognized values must be + * silently ignored. 
+ */ + break; + } + } } else nxt = ip6->ip6_nxt; @@ -909,20 +1004,14 @@ ip6_input(m) #if MROUTING if (ip6_mrouter && ip6_mforward(ip6, m->m_pkthdr.rcvif, m)) { ip6stat.ip6s_cantforward++; - m_freem(m); - lck_mtx_unlock(ip6_mutex); - return; + goto bad; } #endif - if (!ours) { - m_freem(m); - lck_mtx_unlock(ip6_mutex); - return; - } + if (!ours) + goto bad; } else if (!ours) { - ip6_forward(m, &ip6_forward_rt, 0, 1); - lck_mtx_unlock(ip6_mutex); - return; + ip6_forward(m, &ip6_forward_rt, 0); + goto done; } ip6 = mtod(m, struct ip6_hdr *); @@ -949,17 +1038,16 @@ ip6_input(m) ip6stat.ip6s_delivered++; in6_ifstat_inc(deliverifp, ifs6_in_deliver); - lck_mtx_unlock(ip6_mutex); injectit: nest = 0; while (nxt != IPPROTO_DONE) { struct ipfilter *filter; - int (*pr_input)(struct mbuf **, int *); + int (*pr_input)(struct mbuf **, int *, int); if (ip6_hdrnestlimit && (++nest > ip6_hdrnestlimit)) { ip6stat.ip6s_toomanyhdr++; - goto badunlocked; + goto bad; } /* @@ -969,24 +1057,9 @@ injectit: if (m->m_pkthdr.len < off) { ip6stat.ip6s_tooshort++; in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); - goto badunlocked; - } - -#if 0 - /* - * do we need to do it for every header? yeah, other - * functions can play with it (like re-allocate and copy). - */ - mhist = ip6_addaux(m); - if (mhist && M_TRAILINGSPACE(mhist) >= sizeof(nxt)) { - hist = mtod(mhist, caddr_t) + mhist->m_len; - bcopy(&nxt, hist, sizeof(nxt)); - mhist->m_len += sizeof(nxt); - } else { - ip6stat.ip6s_toomanyhdr++; goto bad; } -#endif + #if IPSEC /* @@ -997,7 +1070,7 @@ injectit: if ((ipsec_bypass == 0) && (ip6_protox[nxt]->pr_flags & PR_LASTHDR) != 0) { if (ipsec6_in_reject(m, NULL)) { IPSEC_STAT_INCREMENT(ipsec6stat.in_polvio); - goto badunlocked; + goto bad; } } #endif @@ -1018,36 +1091,40 @@ injectit: filter->ipf_filter.cookie, (mbuf_t*)&m, off, nxt); if (result == EJUSTRETURN) { ipf_unref(); - return; + goto done; } if (result != 0) { ipf_unref(); - m_freem(m); - return; + goto bad; } } } ipf_unref(); } + DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL, + struct ip6_hdr *, ip6, struct ifnet *, m->m_pkthdr.rcvif, + struct ip *, NULL, struct ip6_hdr *, ip6); + if ((pr_input = ip6_protox[nxt]->pr_input) == NULL) { m_freem(m); m = NULL; nxt = IPPROTO_DONE; } else if (!(ip6_protox[nxt]->pr_flags & PR_PROTOLOCK)) { lck_mtx_lock(inet6_domain_mutex); - nxt = pr_input(&m, &off); + nxt = pr_input(&m, &off, nxt); lck_mtx_unlock(inet6_domain_mutex); } else { - nxt = pr_input(&m, &off); + nxt = pr_input(&m, &off, nxt); } } +done: + if (ip6_forward_rt.ro_rt != NULL) + rtfree(ip6_forward_rt.ro_rt); return; bad: - lck_mtx_unlock(ip6_mutex); - badunlocked: m_freem(m); - return; + goto done; } /* @@ -1060,8 +1137,13 @@ ip6_setdstifaddr(struct mbuf *m, struct in6_ifaddr *ia6) struct ip6aux *n; n = ip6_addaux(m); - if (n) + if (n != NULL) { + if (ia6 != NULL) + IFA_ADDREF(&ia6->ia_ifa); + if (n->ip6a_dstia6 != NULL) + IFA_REMREF(&n->ip6a_dstia6->ia_ifa); n->ip6a_dstia6 = ia6; + } return (struct ip6aux *)n; /* NULL if failed to set */ } @@ -1072,10 +1154,12 @@ ip6_getdstifaddr(m) struct ip6aux *n; n = ip6_findaux(m); - if (n) - return n->ip6a_dstia6; - else - return NULL; + if (n != NULL) { + if (n->ip6a_dstia6 != NULL) + IFA_ADDREF(&n->ip6a_dstia6->ia_ifa); + return (n->ip6a_dstia6); + } + return (NULL); } /* @@ -1083,11 +1167,8 @@ ip6_getdstifaddr(m) * included, the real payload length will be stored in plenp. 
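[Editor's note on the ip6_setdstifaddr()/ip6_getdstifaddr() changes above: the setter now takes its own IFA_ADDREF on the stored address, and the getter returns the address with an extra hold, so every caller must release it. A caller-side sketch only; demo_use_dstifaddr is illustrative and assumes kernel context:]

    /* Sketch of the new caller-side contract (kernel headers assumed). */
    static void
    demo_use_dstifaddr(struct mbuf *m)
    {
        struct in6_ifaddr *ia6;

        ia6 = ip6_getdstifaddr(m);      /* returns a held reference, or NULL */
        if (ia6 != NULL) {
            /* ... inspect the destination address entry ... */
            IFA_REMREF(&ia6->ia_ifa);   /* caller drops the extra hold */
        }
    }

This matches the pattern ip6_input() itself now follows, where every early exit clears ia6 after an IFA_REMREF.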
*/ static int -ip6_hopopts_input(plenp, rtalertp, mp, offp) - u_int32_t *plenp; - u_int32_t *rtalertp; /* XXX: should be stored more smart way */ - struct mbuf **mp; - int *offp; +ip6_hopopts_input(uint32_t *plenp, uint32_t *rtalertp, struct mbuf **mp, + int *offp) { struct mbuf *m = *mp; int off = *offp, hbhlen; @@ -1123,11 +1204,11 @@ ip6_hopopts_input(plenp, rtalertp, mp, offp) if (ip6_process_hopopts(m, (u_int8_t *)hbh + sizeof(struct ip6_hbh), hbhlen, rtalertp, plenp) < 0) - return(-1); + return (-1); *offp = off; *mp = m; - return(0); + return (0); } /* @@ -1167,7 +1248,7 @@ ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) } optlen = *(opt + 1) + 2; break; - case IP6OPT_RTALERT: + case IP6OPT_ROUTER_ALERT: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_RTALERT_LEN) { ip6stat.ip6s_toosmall++; @@ -1175,11 +1256,9 @@ ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) } if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) { /* XXX stat */ - lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); - lck_mtx_lock(ip6_mutex); return(-1); } optlen = IP6OPT_RTALERT_LEN; @@ -1194,11 +1273,9 @@ ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) } if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) { /* XXX stat */ - lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); - lck_mtx_lock(ip6_mutex); return(-1); } optlen = IP6OPT_JUMBO_LEN; @@ -1210,11 +1287,9 @@ ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_plen) { ip6stat.ip6s_badoptions++; - lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt - opthead); - lck_mtx_lock(ip6_mutex); return(-1); } @@ -1236,11 +1311,9 @@ ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) */ if (*plenp != 0) { ip6stat.ip6s_badoptions++; - lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); - lck_mtx_lock(ip6_mutex); return(-1); } #endif @@ -1250,11 +1323,9 @@ ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) */ if (jumboplen <= IPV6_MAXPACKET) { ip6stat.ip6s_badoptions++; - lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); - lck_mtx_lock(ip6_mutex); return(-1); } *plenp = jumboplen; @@ -1266,9 +1337,8 @@ ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) goto bad; } optlen = ip6_unknown_opt(opt, m, - erroff + opt - opthead, 1); + erroff + opt - opthead); if (optlen == -1) { - /* ip6_unknown opt unlocked ip6_mutex */ return(-1); } optlen += 2; @@ -1290,11 +1360,7 @@ ip6_process_hopopts(m, opthead, hbhlen, rtalertp, plenp) * is not continuous in order to return an ICMPv6 error. 
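[Editor's note: the ip6_unknown_opt() dispatch below keys off the two high-order bits of the option type, per RFC 2460 section 4.2: 00 skip, 01 discard silently, 10 discard and send an ICMPv6 parameter problem, 11 the same but only for non-multicast destinations. A self-contained sketch of that decoding; the 0x63 entry is a hypothetical option type:]

    #include <stdio.h>
    #include <stddef.h>
    #include <stdint.h>

    #define IP6OPT_TYPE(o)          ((o) & 0xc0)
    #define IP6OPT_TYPE_SKIP        0x00
    #define IP6OPT_TYPE_DISCARD     0x40
    #define IP6OPT_TYPE_FORCEICMP   0x80
    #define IP6OPT_TYPE_ICMP        0xc0

    int
    main(void)
    {
        /* PadN, router alert, jumbo payload, plus one hypothetical type. */
        uint8_t opts[] = { 0x01, 0x05, 0xc2, 0x63 };
        size_t i;

        for (i = 0; i < sizeof (opts); i++) {
            switch (IP6OPT_TYPE(opts[i])) {
            case IP6OPT_TYPE_SKIP:
                printf("0x%02x: skip over the option\n", opts[i]);
                break;
            case IP6OPT_TYPE_DISCARD:
                printf("0x%02x: discard the packet silently\n", opts[i]);
                break;
            case IP6OPT_TYPE_FORCEICMP:
                printf("0x%02x: discard, ICMP even if multicast\n", opts[i]);
                break;
            case IP6OPT_TYPE_ICMP:
                printf("0x%02x: discard, ICMP unless multicast\n", opts[i]);
                break;
            }
        }
        return (0);
    }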
*/ int -ip6_unknown_opt(optp, m, off, locked) - u_int8_t *optp; - struct mbuf *m; - int off; - int locked; +ip6_unknown_opt(uint8_t *optp, struct mbuf *m, int off) { struct ip6_hdr *ip6; @@ -1306,11 +1372,7 @@ ip6_unknown_opt(optp, m, off, locked) return(-1); case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */ ip6stat.ip6s_badoptions++; - if (locked) - lck_mtx_unlock(ip6_mutex); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); - if (locked) - lck_mtx_lock(ip6_mutex); return(-1); case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */ ip6stat.ip6s_badoptions++; @@ -1318,109 +1380,147 @@ ip6_unknown_opt(optp, m, off, locked) if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || (m->m_flags & (M_BCAST|M_MCAST))) m_freem(m); - else { - if (locked) - lck_mtx_unlock(ip6_mutex); + else icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); - if (locked) - lck_mtx_lock(ip6_mutex); - } return(-1); } m_freem(m); /* XXX: NOTREACHED */ - return(-1); + return (-1); } /* * Create the "control" list for this pcb. - * The function will not modify mbuf chain at all. + * These functions will not modify mbuf chain at all. * - * with KAME mbuf chain restriction: + * With KAME mbuf chain restriction: * The routine will be called from upper layer handlers like tcp6_input(). * Thus the routine assumes that the caller (tcp6_input) have already * called IP6_EXTHDR_CHECK() and all the extension headers are located in the * very first mbuf on the mbuf chain. + * + * ip6_savecontrol_v4 will handle those options that are possible to be + * set on a v4-mapped socket. + * ip6_savecontrol will directly call ip6_savecontrol_v4 to handle those + * options and handle the v6-only ones itself. */ -void -ip6_savecontrol(in6p, mp, ip6, m) - struct inpcb *in6p; - struct mbuf **mp; - struct ip6_hdr *ip6; - struct mbuf *m; +struct mbuf ** +ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp, + int *v4only) { - int rthdr_exist = 0; + struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); -#if SO_TIMESTAMP - if ((in6p->in6p_socket->so_options & SO_TIMESTAMP) != 0) { + if ((inp->inp_socket->so_options & SO_TIMESTAMP) != 0) { struct timeval tv; microtime(&tv); - *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), - SCM_TIMESTAMP, SOL_SOCKET); - if (*mp) { - mp = &(*mp)->m_next; - } + mp = sbcreatecontrol_mbuf((caddr_t) &tv, sizeof(tv), + SCM_TIMESTAMP, SOL_SOCKET, mp); + if (*mp == NULL) + return NULL; } -#endif + if ((inp->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + uint64_t time; - /* some OSes call this logic with IPv4 packet, for SO_TIMESTAMP */ - if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) - return; + time = mach_absolute_time(); + mp = sbcreatecontrol_mbuf((caddr_t) &time, sizeof(time), + SCM_TIMESTAMP_MONOTONIC, SOL_SOCKET, mp); + if (*mp == NULL) + return NULL; + } + if ((inp->inp_socket->so_flags & SOF_RECV_TRAFFIC_CLASS) != 0) { + int tc = m->m_pkthdr.prio; + + mp = sbcreatecontrol_mbuf((caddr_t) &tc, sizeof(tc), + SO_TRAFFIC_CLASS, SOL_SOCKET, mp); + if (*mp == NULL) + return NULL; + } + + if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { + if (v4only != NULL) + *v4only = 1; + return (mp); + } + +#define IS2292(inp, x, y) (((inp)->inp_flags & IN6P_RFC2292) ? (x) : (y)) /* RFC 2292 sec. 
5 */ - if ((in6p->in6p_flags & IN6P_PKTINFO) != 0) { + if ((inp->inp_flags & IN6P_PKTINFO) != 0) { struct in6_pktinfo pi6; + bcopy(&ip6->ip6_dst, &pi6.ipi6_addr, sizeof(struct in6_addr)); - if (IN6_IS_SCOPE_LINKLOCAL(&pi6.ipi6_addr)) - pi6.ipi6_addr.s6_addr16[1] = 0; - pi6.ipi6_ifindex = (m && m->m_pkthdr.rcvif) - ? m->m_pkthdr.rcvif->if_index - : 0; - *mp = sbcreatecontrol((caddr_t) &pi6, - sizeof(struct in6_pktinfo), IPV6_PKTINFO, - IPPROTO_IPV6); - if (*mp) - mp = &(*mp)->m_next; + in6_clearscope(&pi6.ipi6_addr); /* XXX */ + pi6.ipi6_ifindex = + (m && m->m_pkthdr.rcvif) ? m->m_pkthdr.rcvif->if_index : 0; + + mp = sbcreatecontrol_mbuf((caddr_t) &pi6, + sizeof(struct in6_pktinfo), + IS2292(inp, IPV6_2292PKTINFO, IPV6_PKTINFO), IPPROTO_IPV6, mp); + if (*mp == NULL) + return NULL; } - if ((in6p->in6p_flags & IN6P_HOPLIMIT) != 0) { + if ((inp->inp_flags & IN6P_HOPLIMIT) != 0) { int hlim = ip6->ip6_hlim & 0xff; - *mp = sbcreatecontrol((caddr_t) &hlim, - sizeof(int), IPV6_HOPLIMIT, IPPROTO_IPV6); - if (*mp) - mp = &(*mp)->m_next; + + mp = sbcreatecontrol_mbuf((caddr_t) &hlim, sizeof(int), + IS2292(inp, IPV6_2292HOPLIMIT, IPV6_HOPLIMIT), + IPPROTO_IPV6, mp); + if (*mp == NULL) + return NULL; } - if ((in6p->in6p_flags & IN6P_TCLASS) != 0) { - u_int32_t flowinfo; - int tclass; + if (v4only != NULL) + *v4only = 0; + return (mp); +} + +int +ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) +{ + struct mbuf **np; + struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); + int v4only = 0; + + *mp = NULL; + np = ip6_savecontrol_v4(in6p, m, mp, &v4only); + if (np == NULL) + goto no_mbufs; - flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK); - flowinfo >>= 20; + mp = np; + if (v4only) + return(0); - tclass = flowinfo & 0xff; - *mp = sbcreatecontrol((caddr_t) &tclass, sizeof(tclass), - IPV6_TCLASS, IPPROTO_IPV6); - if (*mp) - mp = &(*mp)->m_next; - } + if ((in6p->inp_flags & IN6P_TCLASS) != 0) { + u_int32_t flowinfo; + int tclass; + + flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK); + flowinfo >>= 20; + + tclass = flowinfo & 0xff; + mp = sbcreatecontrol_mbuf((caddr_t) &tclass, sizeof(tclass), + IPV6_TCLASS, IPPROTO_IPV6, mp); + if (*mp == NULL) + goto no_mbufs; + } /* * IPV6_HOPOPTS socket option. Recall that we required super-user * privilege for the option (see ip6_ctloutput), but it might be too * strict, since there might be some hop-by-hop options which can be * returned to normal user. - * See RFC 2292 section 6. + * See also RFC 2292 section 6 (or RFC 3542 section 8). */ - if ((in6p->in6p_flags & IN6P_HOPOPTS) != 0) { + if ((in6p->inp_flags & IN6P_HOPOPTS) != 0) { /* * Check if a hop-by-hop options header is contatined in the * received packet, and if so, store the options as ancillary * data. Note that a hop-by-hop options header must be - * just after the IPv6 header, which fact is assured through - * the IPv6 input processing. + * just after the IPv6 header, which is assured through the + * IPv6 input processing. */ ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { @@ -1438,67 +1538,38 @@ ip6_savecontrol(in6p, mp, ip6, m) ip6->ip6_nxt); if (ext == NULL) { ip6stat.ip6s_tooshort++; - return; + return(0); } hbh = mtod(ext, struct ip6_hbh *); hbhlen = (hbh->ip6h_len + 1) << 3; if (hbhlen != ext->m_len) { m_freem(ext); ip6stat.ip6s_tooshort++; - return; + return(0); } #endif /* - * XXX: We copy whole the header even if a jumbo - * payload option is included, which option is to - * be removed before returning in the RFC 2292. 
- * Note: this constraint is removed in 2292bis. + * XXX: We copy the whole header even if a + * jumbo payload option is included, the option which + * is to be removed before returning according to + * RFC2292. + * Note: this constraint is removed in RFC3542 */ - *mp = sbcreatecontrol((caddr_t)hbh, hbhlen, - IPV6_HOPOPTS, IPPROTO_IPV6); - if (*mp) - mp = &(*mp)->m_next; + mp = sbcreatecontrol_mbuf((caddr_t)hbh, hbhlen, + IS2292(in6p, IPV6_2292HOPOPTS, IPV6_HOPOPTS), + IPPROTO_IPV6, mp); + #if PULLDOWN_TEST m_freem(ext); #endif - } - } - - /* IPV6_DSTOPTS and IPV6_RTHDR socket options */ - if ((in6p->in6p_flags & (IN6P_DSTOPTS | IN6P_RTHDRDSTOPTS)) != 0) { - int proto, off, nxt; - - /* - * go through the header chain to see if a routing header is - * contained in the packet. We need this information to store - * destination options headers (if any) properly. - * XXX: performance issue. We should record this info when - * processing extension headers in incoming routine. - * (todo) use m_aux? - */ - proto = IPPROTO_IPV6; - off = 0; - nxt = -1; - while (1) { - int newoff; - - newoff = ip6_nexthdr(m, off, proto, &nxt); - if (newoff < 0) - break; - if (newoff < off) /* invalid, check for safety */ - break; - if ((proto = nxt) == IPPROTO_ROUTING) { - rthdr_exist = 1; - break; + if (*mp == NULL) { + goto no_mbufs; } - off = newoff; } } - if ((in6p->in6p_flags & - (IN6P_RTHDR | IN6P_DSTOPTS | IN6P_RTHDRDSTOPTS)) != 0) { - ip6 = mtod(m, struct ip6_hdr *); + if ((in6p->inp_flags & (IN6P_RTHDR | IN6P_DSTOPTS)) != 0) { int nxt = ip6->ip6_nxt, off = sizeof(struct ip6_hdr); /* @@ -1543,7 +1614,7 @@ ip6_savecontrol(in6p, mp, ip6, m) ext = ip6_pullexthdr(m, off, nxt); if (ext == NULL) { ip6stat.ip6s_tooshort++; - return; + return(0); } ip6e = mtod(ext, struct ip6_ext *); if (nxt == IPPROTO_AH) @@ -1553,30 +1624,39 @@ ip6_savecontrol(in6p, mp, ip6, m) if (elen != ext->m_len) { m_freem(ext); ip6stat.ip6s_tooshort++; - return; + return(0); } #endif switch (nxt) { case IPPROTO_DSTOPTS: - if ((in6p->in6p_flags & IN6P_DSTOPTS) == 0) + if (!(in6p->inp_flags & IN6P_DSTOPTS)) break; - *mp = sbcreatecontrol((caddr_t)ip6e, elen, - IPV6_DSTOPTS, - IPPROTO_IPV6); - if (*mp) - mp = &(*mp)->m_next; + mp = sbcreatecontrol_mbuf((caddr_t)ip6e, elen, + IS2292(in6p, + IPV6_2292DSTOPTS, IPV6_DSTOPTS), + IPPROTO_IPV6, mp); + if (*mp == NULL) { +#if PULLDOWN_TEST + m_freem(ext); +#endif + goto no_mbufs; + } break; case IPPROTO_ROUTING: - if (!in6p->in6p_flags & IN6P_RTHDR) + if (!in6p->inp_flags & IN6P_RTHDR) break; - *mp = sbcreatecontrol((caddr_t)ip6e, elen, - IPV6_RTHDR, - IPPROTO_IPV6); - if (*mp) - mp = &(*mp)->m_next; + mp = sbcreatecontrol_mbuf((caddr_t)ip6e, elen, + IS2292(in6p, IPV6_2292RTHDR, IPV6_RTHDR), + IPPROTO_IPV6, mp); + if (*mp == NULL) { +#if PULLDOWN_TEST + m_freem(ext); +#endif + goto no_mbufs; + } break; case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ @@ -1584,7 +1664,7 @@ ip6_savecontrol(in6p, mp, ip6, m) default: /* - * other cases have been filtered in the above. + * other cases have been filtered in the above. * none will visit this case. here we supply * the code just in case (nxt overwritten or * other cases). 
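[Editor's note: one caution about the IPPROTO_ROUTING branch above. The test "!in6p->inp_flags & IN6P_RTHDR" applies ! before &, so it evaluates (!inp_flags) & IN6P_RTHDR, which is 0 whenever any flag is set; a socket that asked only for IN6P_DSTOPTS therefore falls through and gets an IPV6_RTHDR object it never requested. The IPPROTO_DSTOPTS branch shows the intended parenthesization. A small stand-alone demonstration; the flag values are illustrative, not the real pcb bits:]

    #include <stdio.h>

    /* Illustrative flag values; the real IN6P_* bits live in the pcb headers. */
    #define IN6P_DSTOPTS    0x00008000
    #define IN6P_RTHDR      0x00010000

    int
    main(void)
    {
        unsigned int flags = IN6P_DSTOPTS;      /* IN6P_RTHDR deliberately unset */

        /* As written in the hunk above: '!' binds before '&'. */
        if (!flags & IN6P_RTHDR)
            printf("buggy test: skips the RTHDR object\n");
        else
            printf("buggy test: falls through, emits RTHDR anyway\n");

        /* Intended form, matching the IPPROTO_DSTOPTS branch. */
        if (!(flags & IN6P_RTHDR))
            printf("fixed test: correctly skips the RTHDR object\n");
        return (0);
    }

Modern compilers flag exactly this pattern with -Wlogical-not-parentheses.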
@@ -1608,7 +1688,49 @@ ip6_savecontrol(in6p, mp, ip6, m) loopend: ; } + return(0); +no_mbufs: + ip6stat.ip6s_pktdropcntrl++; + /* XXX increment a stat to show the failure */ + return(ENOBUFS); +} +#undef IS2292 + +void +ip6_notify_pmtu(struct inpcb *in6p, struct sockaddr_in6 *dst, u_int32_t *mtu) +{ + struct socket *so; + struct mbuf *m_mtu; + struct ip6_mtuinfo mtuctl; + + so = in6p->inp_socket; + if (mtu == NULL) + return; + +#ifdef DIAGNOSTIC + if (so == NULL) /* I believe this is impossible */ + panic("ip6_notify_pmtu: socket is NULL"); +#endif + + bzero(&mtuctl, sizeof(mtuctl)); /* zero-clear for safety */ + mtuctl.ip6m_mtu = *mtu; + mtuctl.ip6m_addr = *dst; + if (sa6_recoverscope(&mtuctl.ip6m_addr)) + return; + + if ((m_mtu = sbcreatecontrol((caddr_t)&mtuctl, sizeof(mtuctl), + IPV6_PATHMTU, IPPROTO_IPV6)) == NULL) + return; + + if (sbappendaddr(&so->so_rcv, (struct sockaddr *)dst, NULL, m_mtu, NULL) + == 0) { + m_freem(m_mtu); + /* XXX: should count statistics */ + } else + sorwakeup(so); + + return; } #if PULLDOWN_TEST @@ -1837,8 +1959,8 @@ ip6_addaux( tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_INET6, NULL); if (tag == NULL) { /* Allocate a tag */ - tag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_INET6, - sizeof (struct ip6aux), M_DONTWAIT); + tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_INET6, + sizeof (struct ip6aux), M_DONTWAIT, m); /* Attach it to the mbuf */ if (tag) { @@ -1855,7 +1977,7 @@ ip6_findaux( { struct m_tag *tag; - tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_ENCAP, NULL); + tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_INET6, NULL); return tag ? (struct ip6aux*)(tag + 1) : NULL; } @@ -1866,12 +1988,35 @@ ip6_delaux( { struct m_tag *tag; - tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_ENCAP, NULL); + tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_INET6, NULL); if (tag) { m_tag_delete(m, tag); } } +/* + * Called by m_tag_free(). + */ +void +ip6_destroyaux(struct ip6aux *n) +{ + if (n->ip6a_dstia6 != NULL) { + IFA_REMREF(&n->ip6a_dstia6->ia_ifa); + n->ip6a_dstia6 = NULL; + } +} + +/* + * Called by m_tag_copy() + */ +void +ip6_copyaux(struct ip6aux *src, struct ip6aux *dst) +{ + bcopy(src, dst, sizeof (*dst)); + if (dst->ip6a_dstia6 != NULL) + IFA_ADDREF(&dst->ip6a_dstia6->ia_ifa); +} + /* * System control for IP6 */ diff --git a/bsd/netinet6/ip6_mroute.c b/bsd/netinet6/ip6_mroute.c index da8c4fc96..39f146284 100644 --- a/bsd/netinet6/ip6_mroute.c +++ b/bsd/netinet6/ip6_mroute.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2008 Apple Inc. All rights reserved. + * Copyright (c) 2003-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -104,7 +104,9 @@ #include <netinet/ip6.h> #include <netinet6/ip6_var.h> +#include <netinet6/scope6_var.h> #include <netinet6/ip6_mroute.h> +#include <netinet/icmp6.h> #include <netinet6/pim6.h> #include <netinet6/pim6_var.h> @@ -127,7 +129,6 @@ static int socket_send(struct socket *, struct mbuf *, static int register_send(struct ip6_hdr *, struct mif6 *, struct mbuf *); -extern lck_mtx_t *ip6_mutex; /* * Globals. All but ip6_mrouter, ip6_mrtproto and mrt6stat could be static, * except for netstat or debugging purposes. @@ -258,9 +259,6 @@ static int del_m6if(mifi_t *); static int add_m6fc(struct mf6cctl *); static int del_m6fc(struct mf6cctl *); -#ifndef __APPLE__ -static struct callout expire_upcalls_ch; -#endif /* * Handle MRT setsockopt commands to modify the multicast routing tables. 
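[Editor's note: ip6_notify_pmtu() above queues a struct ip6_mtuinfo on the socket as IPV6_PATHMTU ancillary data. On the user side, a process that enabled IPV6_RECVPATHMTU could drain it roughly as follows; this is a sketch assuming the RFC 3542 definitions exported through <netinet/in.h>:]

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <string.h>
    #include <stdio.h>

    /* Sketch: pull a path-MTU notification out of recvmsg() ancillary data. */
    static void
    handle_pmtu(struct msghdr *mhdr)
    {
        struct cmsghdr *cm;
        struct ip6_mtuinfo mtuinfo;

        for (cm = CMSG_FIRSTHDR(mhdr); cm != NULL;
            cm = CMSG_NXTHDR(mhdr, cm)) {
            if (cm->cmsg_level != IPPROTO_IPV6 ||
                cm->cmsg_type != IPV6_PATHMTU ||
                cm->cmsg_len != CMSG_LEN(sizeof (mtuinfo)))
                continue;
            memcpy(&mtuinfo, CMSG_DATA(cm), sizeof (mtuinfo));
            printf("path MTU to peer is now %u\n",
                (unsigned int)mtuinfo.ip6m_mtu);
        }
    }

Note that the kernel side calls sa6_recoverscope() before delivery, so the address in ip6m_addr arrives with a usable sin6_scope_id rather than an embedded scope.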
*/ @@ -478,12 +476,7 @@ ip6_mrouter_init(so, v, cmd) pim6 = 0;/* used for stubbing out/in pim stuff */ -#ifndef __APPLE__ - callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, - expire_upcalls, NULL); -#else timeout(expire_upcalls, (caddr_t)NULL, EXPIRE_TIMEOUT); -#endif #if MRT6DEBUG if (mrt6debug) @@ -540,20 +533,12 @@ ip6_mrouter_done() } } } -#if notyet - bzero((caddr_t)qtable, sizeof(qtable)); - bzero((caddr_t)tbftable, sizeof(tbftable)); -#endif bzero((caddr_t)mif6table, sizeof(mif6table)); nummifs = 0; pim6 = 0; /* used to stub out/in pim specific code */ -#ifndef __APPLE__ - callout_stop(&expire_upcalls_ch); -#else untimeout(expire_upcalls, (caddr_t)NULL); -#endif /* * Free all multicast forwarding cache entries. @@ -617,7 +602,9 @@ add_m6if(mifcp) return EINVAL; mifp = mif6table + mifcp->mif6c_mifi; if (mifp->m6_ifp) - return EADDRINUSE; /* XXX: is it appropriate? */ + return (EADDRINUSE); /* XXX: is it appropriate? */ + if (mifcp->mif6c_pifi == 0 || mifcp->mif6c_pifi > if_index) + return (ENXIO); ifnet_head_lock_shared(); if (mifcp->mif6c_pifi == 0 || mifcp->mif6c_pifi > if_index) { @@ -653,10 +640,7 @@ add_m6if(mifcp) mifp->m6_flags = mifcp->mif6c_flags; mifp->m6_ifp = ifp; -#if notyet - /* scaling up here allows division by 1024 in critical code */ - mifp->m6_rate_limit = mifcp->mif6c_rate_limit * 1024 / 1000; -#endif + /* initialize per mif pkt counters */ mifp->m6_pkt_in = 0; mifp->m6_pkt_out = 0; @@ -705,10 +689,6 @@ del_m6if(mifip) if_allmulti(ifp, 0); } -#if notyet - bzero((caddr_t)qtable[*mifip], sizeof(qtable[*mifip])); - bzero((caddr_t)mifp->m6_tbf, sizeof(*(mifp->m6_tbf))); -#endif bzero((caddr_t)mifp, sizeof(*mifp)); /* Adjust nummifs down */ @@ -1285,12 +1265,7 @@ expire_upcalls( } } -#ifndef __APPLE__ - callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, - expire_upcalls, NULL); -#else timeout(expire_upcalls, (caddr_t)NULL, EXPIRE_TIMEOUT); -#endif } /* @@ -1306,6 +1281,9 @@ ip6_mdq(m, ifp, rt) mifi_t mifi, iif; struct mif6 *mifp; int plen = m->m_pkthdr.len; + struct in6_addr src0, dst0; /* copies for local work */ + u_int32_t iszone, idzone, oszone, odzone; + int error = 0; /* * Macro to send packet on mif. Since RSVP packets don't get counted on @@ -1437,7 +1415,14 @@ ip6_mdq(m, ifp, rt) * For each mif, forward a copy of the packet if there are group * members downstream on the interface. */ - for (mifp = mif6table, mifi = 0; mifi < nummifs; mifp++, mifi++) + src0 = ip6->ip6_src; + dst0 = ip6->ip6_dst; + if ((error = in6_setscope(&src0, ifp, &iszone)) != 0 || + (error = in6_setscope(&dst0, ifp, &idzone)) != 0) { + ip6stat.ip6s_badscope++; + return (error); + } + for (mifp = mif6table, mifi = 0; mifi < nummifs; mifp++, mifi++) { if (IF_ISSET(mifi, &rt->mf6c_ifset)) { /* * check if the outgoing packet is going to break @@ -1445,23 +1430,25 @@ ip6_mdq(m, ifp, rt) * XXX For packets through PIM register tunnel * interface, we believe a routing daemon. 
*/ - if ((mif6table[rt->mf6c_parent].m6_flags & - MIFF_REGISTER) == 0 && - (mif6table[mifi].m6_flags & MIFF_REGISTER) == 0 && - (in6_addr2scopeid(ifp, &ip6->ip6_dst) != - in6_addr2scopeid(mif6table[mifi].m6_ifp, - &ip6->ip6_dst) || - in6_addr2scopeid(ifp, &ip6->ip6_src) != - in6_addr2scopeid(mif6table[mifi].m6_ifp, - &ip6->ip6_src))) { - ip6stat.ip6s_badscope++; - continue; + if (!(mif6table[rt->mf6c_parent].m6_flags & + MIFF_REGISTER) && + !(mif6table[mifi].m6_flags & MIFF_REGISTER)) { + if (in6_setscope(&src0, mif6table[mifi].m6_ifp, + &oszone) || + in6_setscope(&dst0, mif6table[mifi].m6_ifp, + &odzone) || + iszone != oszone || + idzone != odzone) { + ip6stat.ip6s_badscope++; + continue; + } } mifp->m6_pkt_out++; mifp->m6_bytes_out += plen; MC6_SEND(ip6, mifp, m); } + } return 0; } @@ -1501,16 +1488,22 @@ phyint_send(ip6, mifp, m) * sending queue. */ if (m->m_pkthdr.rcvif == NULL) { - struct ip6_moptions im6o; + struct ip6_moptions *im6o; - im6o.im6o_multicast_ifp = ifp; - /* XXX: ip6_output will override ip6->ip6_hlim */ - im6o.im6o_multicast_hlim = ip6->ip6_hlim; - im6o.im6o_multicast_loop = 1; - error = ip6_output(mb_copy, NULL, &ro, - IPV6_FORWARDING, &im6o, NULL, 0); + im6o = ip6_allocmoptions(M_DONTWAIT); + if (im6o == NULL) { + m_freem(mb_copy); + return; + } + im6o->im6o_multicast_ifp = ifp; + /* XXX: ip6_output will override ip6->ip6_hlim */ + im6o->im6o_multicast_hlim = ip6->ip6_hlim; + im6o->im6o_multicast_loop = 1; + error = ip6_output(mb_copy, NULL, &ro, IPV6_FORWARDING, + im6o, NULL, NULL); + IM6O_REMREF(im6o); #if MRT6DEBUG if (mrt6debug & DEBUG_XMIT) log(LOG_DEBUG, "phyint_send on mif %d err %d\n", @@ -1524,10 +1517,11 @@ phyint_send(ip6, mifp, m) * on the outgoing interface, loop back a copy. */ dst6 = (struct sockaddr_in6 *)&ro.ro_dst; - ifnet_lock_shared(ifp); + in6_multihead_lock_shared(); IN6_LOOKUP_MULTI(ip6->ip6_dst, ifp, in6m); - ifnet_lock_done(ifp); + in6_multihead_lock_done(); if (in6m != NULL) { + IN6M_REMREF(in6m); dst6->sin6_len = sizeof(struct sockaddr_in6); dst6->sin6_family = AF_INET6; dst6->sin6_addr = ip6->ip6_dst; @@ -1552,10 +1546,8 @@ phyint_send(ip6, mifp, m) mb_copy->m_pkthdr.csum_data = 0; mb_copy->m_pkthdr.csum_flags = 0; - lck_mtx_unlock(ip6_mutex); error = dlil_output(ifp, PF_INET6, mb_copy, NULL, (struct sockaddr *)&ro.ro_dst, 0); - lck_mtx_lock(ip6_mutex); #else error = (*ifp->if_output)(ifp, mb_copy, (struct sockaddr *)&ro.ro_dst, @@ -1567,21 +1559,28 @@ phyint_send(ip6, mifp, m) mifp - mif6table, error); #endif } else { -#if MULTICAST_PMTUD - icmp6_error(mb_copy, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu); -#else + /* + * pMTU discovery is intentionally disabled by default, since + * various router may notify pMTU in multicast, which can be + * a DDoS to a router + */ + if (ip6_mcast_pmtu) + icmp6_error(mb_copy, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu); #if MRT6DEBUG - if (mrt6debug & DEBUG_XMIT) - log(LOG_DEBUG, - "phyint_send: packet too big on %s o %s g %s" - " size %d(discarded)\n", - if_name(ifp), - ip6_sprintf(&ip6->ip6_src), - ip6_sprintf(&ip6->ip6_dst), - mb_copy->m_pkthdr.len); + else { + if (mrt6debug & DEBUG_XMIT) { + log(LOG_DEBUG, + "phyint_send: packet too big on %s o %s " + "g %s size %d(discarded)\n", + if_name(ifp), + ip6_sprintf(&ip6->ip6_src), + ip6_sprintf(&ip6->ip6_dst), + mb_copy->m_pkthdr.len); + } + } #endif /* MRT6DEBUG */ m_freem(mb_copy); /* simply discard the packet */ -#endif + } } @@ -1666,9 +1665,7 @@ register_send(ip6, mif, m) * is stripped off, and the inner packet is passed to register_mforward. 
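The rewritten loop in ip6_mdq() above swaps the old per-address in6_addr2scopeid() comparisons for zone IDs computed with in6_setscope(). A condensed sketch of the invariant it enforces (zones_agree is a hypothetical helper; the real code caches the inbound zones once outside the mif loop and exempts PIM register mifs):

    static int
    zones_agree(struct in6_addr src, struct in6_addr dst,
        struct ifnet *inifp, struct ifnet *outifp)
    {
        u_int32_t isz, idz, osz, odz;

        /* the structs are passed by value, so in6_setscope() may
         * embed zones into these copies without touching the packet */
        if (in6_setscope(&src, inifp, &isz) != 0 ||
            in6_setscope(&dst, inifp, &idz) != 0)
            return (0);
        if (in6_setscope(&src, outifp, &osz) != 0 ||
            in6_setscope(&dst, outifp, &odz) != 0)
            return (0);
        return (isz == osz && idz == odz);
    }

A copy is forwarded on a mif only when this holds; otherwise ip6s_badscope is bumped and that mif is skipped.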
*/ int -pim6_input(mp, offp) - struct mbuf **mp; - int *offp; +pim6_input(struct mbuf **mp, int *offp, int proto) { struct pim *pim; /* pointer to a pim struct */ struct ip6_hdr *ip6; @@ -1676,13 +1673,11 @@ pim6_input(mp, offp) struct mbuf *m = *mp; int minlen; int off = *offp; - int proto; ++pim6stat.pim6s_rcv_total; ip6 = mtod(m, struct ip6_hdr *); pimlen = m->m_pkthdr.len - *offp; - proto = ip6->ip6_nxt; /* * Validate lengths @@ -1881,9 +1876,7 @@ pim6_input(mp, offp) #ifdef __APPLE__ if (lo_ifp) { - lck_mtx_unlock(ip6_mutex); dlil_output(lo_ifp, PF_INET6, m, 0, (struct sockaddr *)&dst, 0); - lck_mtx_lock(ip6_mutex); } else { printf("Warning: pim6_input call to dlil_find_dltag failed!\n"); diff --git a/bsd/netinet6/ip6_mroute.h b/bsd/netinet6/ip6_mroute.h index 5eef448db..193efea2a 100644 --- a/bsd/netinet6/ip6_mroute.h +++ b/bsd/netinet6/ip6_mroute.h @@ -79,9 +79,9 @@ /* * Multicast Routing set/getsockopt commands. */ -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #define MRT6_OINIT 100 /* initialize forwarder (omrt6msg) */ -#endif +#endif /* XNU_KERNEL_PRIVATE */ #define MRT6_DONE 101 /* shut down forwarder */ #define MRT6_ADD_MIF 102 /* add multicast interface */ #define MRT6_DEL_MIF 103 /* delete multicast interface */ @@ -164,7 +164,7 @@ struct mrt6stat { u_quad_t mrt6s_upq_sockfull; /* upcalls dropped - socket full */ }; -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #if MRT6_OINIT /* * Struct used to communicate from kernel to multicast router @@ -185,7 +185,7 @@ struct omrt6msg { struct in6_addr im6_src, im6_dst; }; #endif -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ /* * Structure used to communicate from kernel to multicast router. @@ -229,7 +229,7 @@ struct sioc_mif_req6 { u_quad_t obytes; /* Output byte count on mif */ }; -#if defined(KERNEL_PRIVATE) +#if defined(XNU_KERNEL_PRIVATE) struct sioc_mif_req6_32 { mifi_t mifi; u_quad_t icount; @@ -245,7 +245,7 @@ struct sioc_mif_req6_64 { u_quad_t ibytes; u_quad_t obytes; } __attribute__((aligned(8))); -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #ifdef PRIVATE /* @@ -314,14 +314,14 @@ struct rtdetq { /* XXX: rtdetq is also defined in ip_mroute.h */ #endif /* _NETINET_IP_MROUTE_H_ */ #if MROUTING -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE extern struct mrt6stat mrt6stat; extern int ip6_mrouter_set(struct socket *, struct sockopt *); extern int ip6_mrouter_get(struct socket *, struct sockopt *); extern int ip6_mrouter_done(void); extern int mrt6_ioctl(u_long, caddr_t); -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #endif /* PRIVATE */ #endif diff --git a/bsd/netinet6/ip6_output.c b/bsd/netinet6/ip6_output.c index 309686f7f..7abf54ca7 100644 --- a/bsd/netinet6/ip6_output.c +++ b/bsd/netinet6/ip6_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2011 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -110,6 +110,11 @@ #include <sys/kernel.h> #include <sys/proc.h> #include <sys/kauth.h> +#include <sys/mcache.h> +#include <sys/sysctl.h> +#include <kern/zalloc.h> + +#include <pexpert/pexpert.h> #include <net/if.h> #include <net/route.h> @@ -120,10 +125,13 @@ #include <netinet/ip_var.h> #include <netinet6/in6_var.h> #include <netinet/ip6.h> +#include <netinet6/ip6protosw.h> #include <netinet/icmp6.h> #include <netinet6/ip6_var.h> #include <netinet/in_pcb.h> #include <netinet6/nd6.h> +#include <netinet6/scope6_var.h> +#include <mach/sdt.h> #if IPSEC #include <netinet6/ipsec.h> @@ -133,7 +141,6 @@ #include <netkey/key.h> extern int ipsec_bypass; #endif /* IPSEC */ -extern lck_mtx_t *nd6_mutex; #if CONFIG_MACF_NET #include <security/mac.h> @@ -161,23 +168,54 @@ struct ip6_exthdrs { struct mbuf *ip6e_dest2; }; +int ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt); static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *, struct socket *, struct sockopt *sopt); -static int ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt); +static int ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt, int uproto); static int ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt); -static int ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt); -static int ip6_setmoptions(int, struct inpcb *, struct mbuf *); -static int ip6_getmoptions(int, struct ip6_moptions *, struct mbuf **); +static int ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, int sticky, int cmsg, int uproto); +static void im6o_trace(struct ip6_moptions *, int); static int ip6_copyexthdr(struct mbuf **, caddr_t, int); static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int, struct ip6_frag **); static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t); static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *); +static int ip6_getpmtu (struct route_in6 *, struct route_in6 *, + struct ifnet *, struct in6_addr *, u_int32_t *, int *); + +#define IM6O_TRACE_HIST_SIZE 32 /* size of trace history */ + +/* For gdb */ +__private_extern__ unsigned int im6o_trace_hist_size = IM6O_TRACE_HIST_SIZE; + +struct ip6_moptions_dbg { + struct ip6_moptions im6o; /* ip6_moptions */ + u_int16_t im6o_refhold_cnt; /* # of IM6O_ADDREF */ + u_int16_t im6o_refrele_cnt; /* # of IM6O_REMREF */ + /* + * Alloc and free callers. + */ + ctrace_t im6o_alloc; + ctrace_t im6o_free; + /* + * Circular lists of IM6O_ADDREF and IM6O_REMREF callers. + */ + ctrace_t im6o_refhold[IM6O_TRACE_HIST_SIZE]; + ctrace_t im6o_refrele[IM6O_TRACE_HIST_SIZE]; +}; + +#if DEBUG +static unsigned int im6o_debug = 1; /* debugging (enabled) */ +#else +static unsigned int im6o_debug; /* debugging (disabled) */ +#endif /* !DEBUG */ + +static unsigned int im6o_size; /* size of zone element */ +static struct zone *im6o_zone; /* zone for ip6_moptions */ + +#define IM6O_ZONE_MAX 64 /* maximum elements in zone */ +#define IM6O_ZONE_NAME "ip6_moptions" /* zone name */ -extern int ip_createmoptions(struct ip_moptions **imop); -extern int ip_addmembership(struct ip_moptions *imo, struct ip_mreq *mreq); -extern int ip_dropmembership(struct ip_moptions *imo, struct ip_mreq *mreq); -extern lck_mtx_t *ip6_mutex; /* * IP6 output. 
The packet in mbuf chain m contains a skeletal IP6 @@ -198,32 +236,39 @@ ip6_output( int flags, struct ip6_moptions *im6o, struct ifnet **ifpp, /* XXX: just for statistics */ - int locked) + struct ip6_out_args *ip6oa) { struct ip6_hdr *ip6, *mhip6; - struct ifnet *ifp, *origifp; + struct ifnet *ifp = NULL, *origifp = NULL; struct mbuf *m = m0; int hlen, tlen, len, off; struct route_in6 ip6route; - struct sockaddr_in6 *dst; + struct rtentry *rt = NULL; + struct sockaddr_in6 *dst, src_sa, dst_sa; int error = 0; struct in6_ifaddr *ia = NULL; u_int32_t mtu; + int alwaysfrag = 0, dontfrag = 0; u_int32_t optlen = 0, plen = 0, unfragpartlen = 0; struct ip6_exthdrs exthdrs; - struct in6_addr finaldst; + struct in6_addr finaldst, src0, dst0; + u_int32_t zone; struct route_in6 *ro_pmtu = NULL; int hdrsplit = 0; int needipsec = 0; ipfilter_t inject_filter_ref; - + int tso; + unsigned int ifscope; + unsigned int nocell; + boolean_t select_srcif; + struct ipf_pktopts *ippo = NULL, ipf_pktopts; + u_int32_t ifmtu; + #if IPSEC int needipsectun = 0; struct socket *so = NULL; struct secpolicy *sp = NULL; - if (!locked) - lck_mtx_lock(ip6_mutex); /* for AH processing. stupid to have "socket" variable in IP layer... */ if (ipsec_bypass == 0) { @@ -232,10 +277,32 @@ ip6_output( } #endif /* IPSEC */ + bzero(&ipf_pktopts, sizeof(struct ipf_pktopts)); + ippo = &ipf_pktopts; + ip6 = mtod(m, struct ip6_hdr *); inject_filter_ref = ipf_get_inject_filter(m); + finaldst = ip6->ip6_dst; + if (ip6_doscopedroute && (flags & IPV6_OUTARGS)) { + select_srcif = !(flags & (IPV6_FORWARDING | IPV6_UNSPECSRC | IPV6_FLAG_NOSRCIFSEL)); + ifscope = ip6oa->ip6oa_boundif; + ipf_pktopts.ippo_flags = IPPOF_BOUND_IF; + ipf_pktopts.ippo_flags |= (ifscope << IPPOF_SHIFT_IFSCOPE); + } else { + select_srcif = FALSE; + ifscope = IFSCOPE_NONE; + } + + if (flags & IPV6_OUTARGS) { + nocell = ip6oa->ip6oa_nocell; + if (nocell) + ipf_pktopts.ippo_flags |= IPPOF_NO_IFT_CELLULAR; + } else { + nocell = 0; + } + #define MAKE_EXTHDR(hp, mp) \ do { \ if (hp) { \ @@ -253,7 +320,19 @@ ip6_output( /* Hop-by-Hop options header */ MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh); /* Destination options header(1st part) */ - MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1); + if (opt->ip6po_rthdr) { + /* + * Destination options header(1st part) + * This only makes sense with a routing header. + * See Section 9.2 of RFC 3542. + * Disabling this part just for MIP6 convenience is + * a bad idea. We need to think carefully about a + * way to make the advanced API coexist with MIP6 + * options, which might automatically be inserted in + * the kernel. + */ + MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1); + } /* Routing header */ MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr); /* Destination options header(2nd part) */ @@ -314,12 +393,24 @@ ip6_output( * Keep the length of the unfragmentable part for fragmentation. */ optlen = 0; - if (exthdrs.ip6e_hbh) optlen += exthdrs.ip6e_hbh->m_len; - if (exthdrs.ip6e_dest1) optlen += exthdrs.ip6e_dest1->m_len; - if (exthdrs.ip6e_rthdr) optlen += exthdrs.ip6e_rthdr->m_len; + if (exthdrs.ip6e_hbh) + optlen += exthdrs.ip6e_hbh->m_len; + if (exthdrs.ip6e_dest1) + optlen += exthdrs.ip6e_dest1->m_len; + if (exthdrs.ip6e_rthdr) + optlen += exthdrs.ip6e_rthdr->m_len; unfragpartlen = optlen + sizeof(struct ip6_hdr); + /* NOTE: we don't add AH/ESP length here. do that later. 
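A worked example of the length accounting above, using hypothetical (8-byte-multiple) extension header sizes:

    /*
     * hbh    8 bytes  ->  optlen =  8
     * dest1 16 bytes  ->  optlen = 24
     * rthdr 24 bytes  ->  optlen = 48
     *
     * unfragpartlen = optlen + sizeof (struct ip6_hdr) = 48 + 40 = 88
     *
     * dest2 is added to optlen only after unfragpartlen is taken, so
     * it counts toward the payload length but rides in the
     * fragmentable part of the packet.
     */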
*/ - if (exthdrs.ip6e_dest2) optlen += exthdrs.ip6e_dest2->m_len; + if (exthdrs.ip6e_dest2) + optlen += exthdrs.ip6e_dest2->m_len; + + + if (needipsec && + (m->m_pkthdr.csum_flags & CSUM_DELAY_IPV6_DATA) != 0) { + in6_delayed_cksum(m, sizeof(struct ip6_hdr) + optlen); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IPV6_DATA; + } /* * If we need IPsec, or there is at least one extension header, @@ -419,14 +510,14 @@ ip6_output( struct ipfilter *filter; int seen = (inject_filter_ref == 0); int fixscope = 0; - struct ipf_pktopts *ippo = 0, ipf_pktopts; - + if (im6o != NULL && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { - ippo = &ipf_pktopts; - ippo->ippo_flags = IPPOF_MCAST_OPTS; + ippo->ippo_flags |= IPPOF_MCAST_OPTS; + IM6O_LOCK(im6o); ippo->ippo_mcast_ifnet = im6o->im6o_multicast_ifp; ippo->ippo_mcast_ttl = im6o->im6o_multicast_hlim; ippo->ippo_mcast_loop = im6o->im6o_multicast_loop; + IM6O_UNLOCK(im6o); } /* Hack: embed the scope_id in the destination */ @@ -436,7 +527,6 @@ ip6_output( ip6->ip6_dst.s6_addr16[1] = htons(ro->ro_dst.sin6_scope_id); } { - lck_mtx_unlock(ip6_mutex); ipf_ref(); TAILQ_FOREACH(filter, &ipv6_filters, ipf_link) { /* @@ -452,18 +542,15 @@ ip6_output( result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo); if (result == EJUSTRETURN) { ipf_unref(); - locked = 1; /* Don't want to take lock to unlock it right away */ goto done; } if (result != 0) { ipf_unref(); - locked = 1; /* Don't want to take lock to unlock it right away */ goto bad; } } } ipf_unref(); - lck_mtx_lock(ip6_mutex); } ip6 = mtod(m, struct ip6_hdr *); /* Hack: cleanup embedded scope_id if we put it there */ @@ -495,10 +582,8 @@ ip6_output( bzero(&state, sizeof(state)); state.m = m; - lck_mtx_unlock(ip6_mutex); error = ipsec6_output_trans(&state, nexthdrp, mprev, sp, flags, &needipsectun); - lck_mtx_lock(ip6_mutex); m = state.m; if (error) { /* mbuf is already reclaimed in ipsec6_output_trans. */ @@ -524,13 +609,13 @@ ip6_output( /* ah6_output doesn't modify mbuf chain */ rh->ip6r_segleft = segleft_org; } - } -skip_ipsec2:; -#endif + } } +skip_ipsec2: +#endif /* - * If there is a routing header, replace destination address field + * If there is a routing header, replace the destination address field * with the first hop of the routing header. */ if (exthdrs.ip6e_rthdr) { @@ -538,17 +623,38 @@ skip_ipsec2:; (struct ip6_rthdr *)(mtod(exthdrs.ip6e_rthdr, struct ip6_rthdr *)); struct ip6_rthdr0 *rh0; + struct in6_addr *addr; + struct sockaddr_in6 sa; - finaldst = ip6->ip6_dst; switch (rh->ip6r_type) { case IPV6_RTHDR_TYPE_0: rh0 = (struct ip6_rthdr0 *)rh; - ip6->ip6_dst = rh0->ip6r0_addr[0]; - bcopy((caddr_t)&rh0->ip6r0_addr[1], - (caddr_t)&rh0->ip6r0_addr[0], - sizeof(struct in6_addr)*(rh0->ip6r0_segleft - 1) - ); - rh0->ip6r0_addr[rh0->ip6r0_segleft - 1] = finaldst; + addr = (struct in6_addr *)(rh0 + 1); + + /* + * construct a sockaddr_in6 form of + * the first hop. + * + * XXX: we may not have enough + * information about its scope zone; + * there is no standard API to pass + * the information from the + * application. + */ + bzero(&sa, sizeof(sa)); + sa.sin6_family = AF_INET6; + sa.sin6_len = sizeof(sa); + sa.sin6_addr = addr[0]; + if ((error = sa6_embedscope(&sa, + ip6_use_defzone)) != 0) { + goto bad; + } + ip6->ip6_dst = sa.sin6_addr; + bcopy(&addr[1], &addr[0], sizeof(struct in6_addr) + * (rh0->ip6r0_segleft - 1)); + addr[rh0->ip6r0_segleft - 1] = finaldst; + /* XXX */ + in6_clearscope(addr + rh0->ip6r0_segleft - 1); break; default: /* is it possible? 
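The type 0 routing header rewrite above is easiest to see with a worked example, taking ip6r0_segleft == 3 with intermediate hops A, B, C and original final destination F:

    /*
     * before:  ip6_dst = F    addr[] = { A, B, C }
     * after:   ip6_dst = A    addr[] = { B, C, F }
     *
     * The first hop is promoted into the IPv6 destination (its scope
     * zone resolved via sa6_embedscope), the remaining hops shift
     * left one slot, and the saved finaldst is parked in the last
     * slot; in6_clearscope() then strips any embedded zone from that
     * stored address so a kernel-internal scope id is not sent on
     * the wire.
     */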
*/ error = EINVAL; @@ -558,7 +664,7 @@ skip_ipsec2:; /* Source address validation */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && - (flags & IPV6_DADOUTPUT) == 0) { + (flags & IPV6_UNSPECSRC) == 0) { error = EOPNOTSUPP; ip6stat.ip6s_badscope++; goto bad; @@ -582,6 +688,38 @@ skip_ipsec2:; if (opt && opt->ip6po_rthdr) ro = &opt->ip6po_route; dst = (struct sockaddr_in6 *)&ro->ro_dst; + + if (ro && ro->ro_rt) + RT_LOCK_ASSERT_NOTHELD(ro->ro_rt); + /* + * if specified, try to fill in the traffic class field. + * do not override if a non-zero value is already set. + * we check the diffserv field and the ecn field separately. + */ + if (opt && opt->ip6po_tclass >= 0) { + int mask = 0; + + if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0) + mask |= 0xfc; + if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0) + mask |= 0x03; + if (mask != 0) + ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20); + } + + /* fill in or override the hop limit field, if necessary. */ + if (opt && opt->ip6po_hlim != -1) + ip6->ip6_hlim = opt->ip6po_hlim & 0xff; + else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { + if (im6o != NULL) { + IM6O_LOCK(im6o); + ip6->ip6_hlim = im6o->im6o_multicast_hlim; + IM6O_UNLOCK(im6o); + } else { + ip6->ip6_hlim = ip6_defmcasthlim; + } + } + /* * If there is a cached route, check that it is to the same * destination and is still up. If not, free it and try again. @@ -602,17 +740,14 @@ skip_ipsec2:; dst->sin6_family = AF_INET6; dst->sin6_len = sizeof(struct sockaddr_in6); dst->sin6_addr = ip6->ip6_dst; -#if SCOPEDROUTING - /* XXX: sin6_scope_id should already be fixed at this point */ - if (IN6_IS_SCOPE_LINKLOCAL(&dst->sin6_addr)) - dst->sin6_scope_id = ntohs(dst->sin6_addr.s6_addr16[1]); -#endif } #if IPSEC if (needipsec && needipsectun) { struct ipsec_output_state state; int tunneledv4 = 0; - +#if CONFIG_DTRACE + struct ifnet *trace_ifp = (ifpp != NULL) ? (*ifpp) : NULL; +#endif /* CONFIG_DTRACE */ /* * All the extension headers will become inaccessible * (since they can be encrypted). @@ -628,9 +763,13 @@ skip_ipsec2:; state.m = m; state.ro = (struct route *)ro; state.dst = (struct sockaddr *)dst; - lck_mtx_unlock(ip6_mutex); + + /* Added a trace here so that we can see packets inside a tunnel */ + DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL, + struct ip6_hdr *, ip6, struct ifnet *, trace_ifp, + struct ip *, NULL, struct ip6_hdr *, ip6); + error = ipsec6_output_tunnel(&state, sp, flags, &tunneledv4); - lck_mtx_lock(ip6_mutex); if (tunneledv4) /* tunneled in IPv4 - packet is gone */ goto done; m = state.m; @@ -657,182 +796,147 @@ skip_ipsec2:; } goto bad; } - + /* + * The packet has been encapsulated so the ifscope is no longer valid + * since it does not apply to the outer address: ignore the ifscope. + */ + ifscope = IFSCOPE_NONE; + if (opt != NULL && opt->ip6po_pktinfo != NULL) { + if (opt->ip6po_pktinfo->ipi6_ifindex != IFSCOPE_NONE) + opt->ip6po_pktinfo->ipi6_ifindex = IFSCOPE_NONE; + } exthdrs.ip6e_ip6 = m; } #endif /* IPSEC */ - if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { - /* Unicast */ - -#define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa)) -#define sin6tosa(sin6) ((struct sockaddr *)(sin6)) - /* xxx - * interface selection comes here - * if an interface is specified from an upper layer, - * ifp must point it. - */ - if (ro->ro_rt == NULL) { - /* - * non-bsdi always clone routes, if parent is - * PRF_CLONING. 
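The traffic-class fill above writes into bits 20-27 of ip6_flow (the 4-bit version and 8-bit traffic class sit above the 20-bit flow label), and it treats the DSCP and ECN halves independently. A concrete case:

    /*
     * With opt->ip6po_tclass = 0x2e (DSCP EF, ECN 00) and both halves
     * of the flow word still zero:
     *
     *     mask = 0xfc | 0x03 = 0xff;
     *     ip6->ip6_flow |= htonl((0x2e & 0xff) << 20);   // 0x02e00000
     *
     * Had the ECN bits already been set by the transport, mask would
     * stay 0xfc and only the DSCP half would be written.
     */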
- */ - rtalloc_ign((struct route *)ro, 0); - } - if (ro->ro_rt == NULL) { - ip6stat.ip6s_noroute++; - error = EHOSTUNREACH; - /* XXX in6_ifstat_inc(ifp, ifs6_out_discard); */ - goto bad; - } - RT_LOCK_SPIN(ro->ro_rt); - ia = ifatoia6(ro->ro_rt->rt_ifa); - if (ia != NULL) - ifaref(&ia->ia_ifa); - ifp = ro->ro_rt->rt_ifp; - ro->ro_rt->rt_use++; - if (ro->ro_rt->rt_flags & RTF_GATEWAY) - dst = (struct sockaddr_in6 *)ro->ro_rt->rt_gateway; - RT_UNLOCK(ro->ro_rt); - m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */ + /* for safety */ + if (ifp != NULL) { + ifnet_release(ifp); + ifp = NULL; + } - in6_ifstat_inc(ifp, ifs6_out_request); + /* adjust pointer */ + ip6 = mtod(m, struct ip6_hdr *); - /* - * Check if the outgoing interface conflicts with - * the interface specified by ifi6_ifindex (if specified). - * Note that loopback interface is always okay. - * (this may happen when we are sending a packet to one of - * our own addresses.) - */ - if (opt && opt->ip6po_pktinfo - && opt->ip6po_pktinfo->ipi6_ifindex) { - if (!(ifp->if_flags & IFF_LOOPBACK) - && ifp->if_index != opt->ip6po_pktinfo->ipi6_ifindex) { - ip6stat.ip6s_noroute++; - in6_ifstat_inc(ifp, ifs6_out_discard); - error = EHOSTUNREACH; - goto bad; - } + if (select_srcif) { + bzero(&src_sa, sizeof(src_sa)); + src_sa.sin6_family = AF_INET6; + src_sa.sin6_len = sizeof(src_sa); + src_sa.sin6_addr = ip6->ip6_src; + } + bzero(&dst_sa, sizeof(dst_sa)); + dst_sa.sin6_family = AF_INET6; + dst_sa.sin6_len = sizeof(dst_sa); + dst_sa.sin6_addr = ip6->ip6_dst; + + if ((error = in6_selectroute(select_srcif ? &src_sa : NULL, + &dst_sa, opt, im6o, ro, &ifp, &rt, 0, ifscope, nocell)) != 0) { + switch (error) { + case EHOSTUNREACH: + ip6stat.ip6s_noroute++; + break; + case EADDRNOTAVAIL: + default: + break; /* XXX statistics? */ } - + if (ifp != NULL) + in6_ifstat_inc(ifp, ifs6_out_discard); + goto bad; + } + if (rt == NULL) { /* - * if specified, try to fill in the traffic class field. - * do not override if a non-zero value is already set. - * we check the diffserv field and the ecn field separately. + * If in6_selectroute() does not return a route entry, + * dst may not have been updated. */ - if (opt && opt->ip6po_tclass >= 0) { - int mask = 0; - - if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0) - mask |= 0xfc; - if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0) - mask |= 0x03; - if (mask != 0) - ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20); - } + *dst = dst_sa; /* XXX */ + } - if (opt && opt->ip6po_hlim != -1) - ip6->ip6_hlim = opt->ip6po_hlim & 0xff; - } else { - /* Multicast */ - struct in6_multi *in6m; + /* + * then rt (for unicast) and ifp must be non-NULL valid values. + */ + if ((flags & IPV6_FORWARDING) == 0) { + /* XXX: the FORWARDING flag can be set for mrouting. */ + in6_ifstat_inc(ifp, ifs6_out_request); + } + if (rt != NULL) { + RT_LOCK(rt); + ia = (struct in6_ifaddr *)(rt->rt_ifa); + if (ia != NULL) + IFA_ADDREF(&ia->ia_ifa); + rt->rt_use++; + RT_UNLOCK(rt); + } - m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST; + /* + * The outgoing interface must be in the zone of source and + * destination addresses. We should use ia_ifp to support the + * case of sending packets to an address of our own. 
+ */ + if (ia != NULL && ia->ia_ifp) { + ifnet_reference(ia->ia_ifp); + if (origifp != NULL) + ifnet_release(origifp); + origifp = ia->ia_ifp; + } else { + if (ifp != NULL) + ifnet_reference(ifp); + if (origifp != NULL) + ifnet_release(origifp); + origifp = ifp; + } + src0 = ip6->ip6_src; + if (in6_setscope(&src0, origifp, &zone)) + goto badscope; + bzero(&src_sa, sizeof(src_sa)); + src_sa.sin6_family = AF_INET6; + src_sa.sin6_len = sizeof(src_sa); + src_sa.sin6_addr = ip6->ip6_src; + if (sa6_recoverscope(&src_sa) || zone != src_sa.sin6_scope_id) + goto badscope; + + dst0 = ip6->ip6_dst; + if (in6_setscope(&dst0, origifp, &zone)) + goto badscope; + /* re-initialize to be sure */ + bzero(&dst_sa, sizeof(dst_sa)); + dst_sa.sin6_family = AF_INET6; + dst_sa.sin6_len = sizeof(dst_sa); + dst_sa.sin6_addr = ip6->ip6_dst; + if (sa6_recoverscope(&dst_sa) || zone != dst_sa.sin6_scope_id) { + goto badscope; + } - /* - * See if the caller provided any multicast options - */ - ifp = NULL; - if (im6o != NULL) { - ip6->ip6_hlim = im6o->im6o_multicast_hlim; - if (im6o->im6o_multicast_ifp != NULL) - ifp = im6o->im6o_multicast_ifp; - } else - ip6->ip6_hlim = ip6_defmcasthlim; + /* scope check is done. */ + goto routefound; - /* - * See if the caller provided the outgoing interface - * as an ancillary data. - * Boundary check for ifindex is assumed to be already done. - */ - if (opt && opt->ip6po_pktinfo && opt->ip6po_pktinfo->ipi6_ifindex) { - unsigned int index = opt->ip6po_pktinfo->ipi6_ifindex; - ifnet_head_lock_shared(); - if (index > 0 && index <= if_index) { - ifp = ifindex2ifnet[index]; - } - ifnet_head_done(); - } + badscope: + ip6stat.ip6s_badscope++; + in6_ifstat_inc(origifp, ifs6_out_discard); + if (error == 0) + error = EHOSTUNREACH; /* XXX */ + goto bad; - /* - * If the destination is a node-local scope multicast, - * the packet should be loop-backed only. - */ - if (IN6_IS_ADDR_MC_NODELOCAL(&ip6->ip6_dst)) { + routefound: + if (rt && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { + if (opt && opt->ip6po_nextroute.ro_rt) { /* - * If the outgoing interface is already specified, - * it should be a loopback interface. + * The nexthop is explicitly specified by the + * application. We assume the next hop is an IPv6 + * address. */ - if (ifp && (ifp->if_flags & IFF_LOOPBACK) == 0) { - ip6stat.ip6s_badscope++; - error = ENETUNREACH; /* XXX: better error? */ - /* XXX correct ifp? */ - in6_ifstat_inc(ifp, ifs6_out_discard); - goto bad; - } else { - ifp = lo_ifp; - } - } - - /* - * if specified, try to fill in the traffic class field. - * do not override if a non-zero value is already set. - * we check the diffserv field and the ecn field separately. - */ - if (opt && opt->ip6po_tclass >= 0) { - int mask = 0; - - if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0) - mask |= 0xfc; - if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0) - mask |= 0x03; - if (mask != 0) - ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20); + dst = (struct sockaddr_in6 *)opt->ip6po_nexthop; } + else if ((rt->rt_flags & RTF_GATEWAY)) + dst = (struct sockaddr_in6 *)rt->rt_gateway; + } - if (opt && opt->ip6po_hlim != -1) - ip6->ip6_hlim = opt->ip6po_hlim & 0xff; + if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { + m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */ + } else { + struct in6_multi *in6m; - /* - * If caller did not provide an interface lookup a - * default in the routing table. This is either a - * default for the speicfied group (i.e. a host - * route), or a multicast default (a route for the - * ``net'' ff00::/8). 
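Each scope check above asks one question per address: does the zone implied by the chosen outgoing interface match the zone the address already carries? A condensed sketch (addr_zone_ok is a hypothetical helper mirroring the inline code):

    static int
    addr_zone_ok(const struct in6_addr *addr, struct ifnet *origifp)
    {
        struct in6_addr tmp = *addr;
        struct sockaddr_in6 sa;
        u_int32_t zone;

        if (in6_setscope(&tmp, origifp, &zone) != 0)
            return (0);             /* no valid zone on this interface */

        bzero(&sa, sizeof (sa));
        sa.sin6_family = AF_INET6;
        sa.sin6_len = sizeof (sa);
        sa.sin6_addr = *addr;
        if (sa6_recoverscope(&sa) != 0)
            return (0);             /* malformed embedded scope */

        return (zone == sa.sin6_scope_id);
    }

When either address fails this test the packet is dropped via the badscope label, counting ip6s_badscope.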
- */ - if (ifp == NULL) { - if (ro->ro_rt == NULL) { - ro->ro_rt = rtalloc1( - (struct sockaddr *)&ro->ro_dst, 0, 0); - } - if (ro->ro_rt == NULL) { - ip6stat.ip6s_noroute++; - error = EHOSTUNREACH; - /* XXX in6_ifstat_inc(ifp, ifs6_out_discard) */ - goto bad; - } - RT_LOCK_SPIN(ro->ro_rt); - ia = ifatoia6(ro->ro_rt->rt_ifa); - if (ia != NULL) - ifaref(&ia->ia_ifa); - ifp = ro->ro_rt->rt_ifp; - ro->ro_rt->rt_use++; - RT_UNLOCK(ro->ro_rt); - } + m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST; - if ((flags & IPV6_FORWARDING) == 0) - in6_ifstat_inc(ifp, ifs6_out_request); in6_ifstat_inc(ifp, ifs6_out_mcast); /* @@ -844,11 +948,15 @@ skip_ipsec2:; error = ENETUNREACH; goto bad; } - ifnet_lock_shared(ifp); - IN6_LOOKUP_MULTI(ip6->ip6_dst, ifp, in6m); - ifnet_lock_done(ifp); + in6_multihead_lock_shared(); + IN6_LOOKUP_MULTI(&ip6->ip6_dst, ifp, in6m); + in6_multihead_lock_done(); + if (im6o != NULL) + IM6O_LOCK(im6o); if (in6m != NULL && (im6o == NULL || im6o->im6o_multicast_loop)) { + if (im6o != NULL) + IM6O_UNLOCK(im6o); /* * If we belong to the destination multicast group * on the outgoing interface, and the caller did not @@ -856,6 +964,8 @@ skip_ipsec2:; */ ip6_mloopback(ifp, m, dst); } else { + if (im6o != NULL) + IM6O_UNLOCK(im6o); /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just @@ -870,13 +980,25 @@ skip_ipsec2:; */ #if MROUTING if (ip6_mrouter && (flags & IPV6_FORWARDING) == 0) { + /* + * XXX: ip6_mforward expects that rcvif is NULL + * when it is called from the originating path. + * However, it is not always the case, since + * some versions of MGETHDR() does not + * initialize the field. + */ + m->m_pkthdr.rcvif = NULL; if (ip6_mforward(ip6, ifp, m) != 0) { m_freem(m); + if (in6m != NULL) + IN6M_REMREF(in6m); goto done; } } #endif } + if (in6m != NULL) + IN6M_REMREF(in6m); /* * Multicasts with a hoplimit of zero may be looped back, * above, but must not be transmitted on a network. @@ -885,7 +1007,8 @@ skip_ipsec2:; * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ - if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK)) { + if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) || + IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) { m_freem(m); goto done; } @@ -895,122 +1018,48 @@ skip_ipsec2:; * Fill the outgoing inteface to tell the upper layer * to increment per-interface statistics. */ - if (ifpp) + if (ifpp != NULL) { + ifnet_reference(ifp); /* for caller */ + if (*ifpp != NULL) + ifnet_release(*ifpp); *ifpp = ifp; - - /* - * Determine path MTU. - */ - if (ro_pmtu != ro) { - /* The first hop and the final destination may differ. 
*/ - struct sockaddr_in6 *sin6_fin = - (struct sockaddr_in6 *)&ro_pmtu->ro_dst; - if (ro_pmtu->ro_rt != NULL && - (!(ro_pmtu->ro_rt->rt_flags & RTF_UP) || - ro_pmtu->ro_rt->generation_id != route_generation || - !IN6_ARE_ADDR_EQUAL(&sin6_fin->sin6_addr, &finaldst))) { - rtfree(ro_pmtu->ro_rt); - ro_pmtu->ro_rt = NULL; - } - if (ro_pmtu->ro_rt == NULL) { - bzero(sin6_fin, sizeof(*sin6_fin)); - sin6_fin->sin6_family = AF_INET6; - sin6_fin->sin6_len = sizeof(struct sockaddr_in6); - sin6_fin->sin6_addr = finaldst; - - rtalloc((struct route *)ro_pmtu); - } } - if (ro_pmtu->ro_rt != NULL) { - u_int32_t ifmtu; - - lck_rw_lock_shared(nd_if_rwlock); - ifmtu = IN6_LINKMTU(ifp); - lck_rw_done(nd_if_rwlock); - RT_LOCK_SPIN(ro_pmtu->ro_rt); - mtu = ro_pmtu->ro_rt->rt_rmx.rmx_mtu; - if (mtu > ifmtu || mtu == 0) { - /* - * The MTU on the route is larger than the MTU on - * the interface! This shouldn't happen, unless the - * MTU of the interface has been changed after the - * interface was brought up. Change the MTU in the - * route to match the interface MTU (as long as the - * field isn't locked). - * - * if MTU on the route is 0, we need to fix the MTU. - * this case happens with path MTU discovery timeouts. - */ - mtu = ifmtu; - if ((ro_pmtu->ro_rt->rt_rmx.rmx_locks & RTV_MTU) == 0) - ro_pmtu->ro_rt->rt_rmx.rmx_mtu = mtu; /* XXX */ - } - RT_UNLOCK(ro_pmtu->ro_rt); - } else { - lck_rw_lock_shared(nd_if_rwlock); - mtu = IN6_LINKMTU(ifp); - lck_rw_done(nd_if_rwlock); - } + /* Determine path MTU. */ + if ((error = ip6_getpmtu(ro_pmtu, ro, ifp, &finaldst, &mtu, + &alwaysfrag)) != 0) + goto bad; /* - * advanced API (IPV6_USE_MIN_MTU) overrides mtu setting + * The caller of this function may specify to use the minimum MTU + * in some cases. + * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU + * setting. The logic is a bit complicated; by default, unicast + * packets will follow path MTU while multicast packets will be sent at + * the minimum MTU. If IP6PO_MINMTU_ALL is specified, all packets + * including unicast ones will be sent at the minimum MTU. Multicast + * packets will always be sent at the minimum MTU unless + * IP6PO_MINMTU_DISABLE is explicitly specified. + * See RFC 3542 for more details. */ - if ((flags & IPV6_MINMTU) != 0 && mtu > IPV6_MMTU) - mtu = IPV6_MMTU; - - /* Fake scoped addresses */ - if ((ifp->if_flags & IFF_LOOPBACK) != 0) { - /* - * If source or destination address is a scoped address, and - * the packet is going to be sent to a loopback interface, - * we should keep the original interface. - */ - - /* - * XXX: this is a very experimental and temporary solution. - * We eventually have sockaddr_in6 and use the sin6_scope_id - * field of the structure here. - * We rely on the consistency between two scope zone ids - * of source and destination, which should already be assured. - * Larger scopes than link will be supported in the future. 
- */ - u_short index = 0; - origifp = NULL; - if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) - index = ntohs(ip6->ip6_src.s6_addr16[1]); - else if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) - index = ntohs(ip6->ip6_dst.s6_addr16[1]); - ifnet_head_lock_shared(); - if (index > 0 && index <= if_index) { - origifp = ifindex2ifnet[index]; + if (mtu > IPV6_MMTU) { + if ((flags & IPV6_MINMTU)) + mtu = IPV6_MMTU; + else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL) + mtu = IPV6_MMTU; + else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && + (opt == NULL || + opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) { + mtu = IPV6_MMTU; } - ifnet_head_done(); - /* - * XXX: origifp can be NULL even in those two cases above. - * For example, if we remove the (only) link-local address - * from the loopback interface, and try to send a link-local - * address without link-id information. Then the source - * address is ::1, and the destination address is the - * link-local address with its s6_addr16[1] being zero. - * What is worse, if the packet goes to the loopback interface - * by a default rejected route, the null pointer would be - * passed to looutput, and the kernel would hang. - * The following last resort would prevent such disaster. - */ - if (origifp == NULL) - origifp = ifp; } - else - origifp = ifp; -#ifndef SCOPEDROUTING + /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); -#endif #if IPFW2 /* @@ -1038,8 +1087,7 @@ skip_ipsec2:; */ if (exthdrs.ip6e_hbh) { struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *); - u_int32_t dummy1; /* XXX unused */ - u_int32_t dummy2; /* XXX unused */ + u_int32_t dummy; /* XXX unused */ #if DIAGNOSTIC if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len) @@ -1053,11 +1101,9 @@ skip_ipsec2:; */ m->m_flags |= M_LOOP; m->m_pkthdr.rcvif = ifp; - if (ip6_process_hopopts(m, - (u_int8_t *)(hbh + 1), - ((hbh->ip6h_len + 1) << 3) - - sizeof(struct ip6_hbh), - &dummy1, &dummy2) < 0) { + if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1), + ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh), + &dummy, &plen) < 0) { /* m was already freed at this point */ error = EINVAL;/* better error? */ goto done; @@ -1067,75 +1113,127 @@ skip_ipsec2:; } #if PF - lck_mtx_unlock(ip6_mutex); - - /* Invoke outbound packet filter */ - error = pf_af_hook(ifp, NULL, &m, AF_INET6, FALSE); - - lck_mtx_lock(ip6_mutex); + if (PF_IS_ENABLED) { + /* Invoke outbound packet filter */ + error = pf_af_hook(ifp, NULL, &m, AF_INET6, FALSE); - if (error) { - if (m != NULL) { - panic("%s: unexpected packet %p\n", __func__, m); - /* NOTREACHED */ + if (error) { + if (m != NULL) { + panic("%s: unexpected packet %p\n", __func__, m); + /* NOTREACHED */ + } + /* Already freed by callee */ + goto done; } - /* Already freed by callee */ - goto done; + ip6 = mtod(m, struct ip6_hdr *); } - ip6 = mtod(m, struct ip6_hdr *); #endif /* PF */ /* * Send the packet to the outgoing interface. * If necessary, do IPv6 fragmentation before sending. 
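The minimum-MTU policy above (RFC 3542's IPV6_USE_MIN_MTU semantics) reduces to a short selection rule. A sketch with a hypothetical helper name:

    static u_int32_t
    select_send_mtu(u_int32_t path_mtu, int flags,
        const struct ip6_pktopts *opt, int is_multicast)
    {
        if (path_mtu <= IPV6_MMTU)          /* already at or below 1280 */
            return (path_mtu);
        if (flags & IPV6_MINMTU)            /* caller forces the minimum */
            return (IPV6_MMTU);
        if (opt != NULL && opt->ip6po_minmtu == IP6PO_MINMTU_ALL)
            return (IPV6_MMTU);             /* forced for unicast as well */
        if (is_multicast &&
            (opt == NULL || opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE))
            return (IPV6_MMTU);             /* multicast defaults to minimum */
        return (path_mtu);
    }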
+ * + * the logic here is rather complex: + * 1: normal case (dontfrag == 0, alwaysfrag == 0) + * 1-a: send as is if tlen <= path mtu + * 1-b: fragment if tlen > path mtu + * + * 2: if user asks us not to fragment (dontfrag == 1) + * 2-a: send as is if tlen <= interface mtu + * 2-b: error if tlen > interface mtu + * + * 3: if we always need to attach fragment header (alwaysfrag == 1) + * always fragment + * + * 4: if dontfrag == 1 && alwaysfrag == 1 + * error, as we cannot handle this conflicting request */ tlen = m->m_pkthdr.len; - if (tlen <= mtu -#if notyet - /* - * On any link that cannot convey a 1280-octet packet in one piece, - * link-specific fragmentation and reassembly must be provided at - * a layer below IPv6. [RFC 2460, sec.5] - * Thus if the interface has ability of link-level fragmentation, - * we can just send the packet even if the packet size is - * larger than the link's MTU. - * XXX: IFF_FRAGMENTABLE (or such) flag has not been defined yet... - */ - - || ifp->if_flags & IFF_FRAGMENTABLE -#endif - ) - { - /* Record statistics for this interface address. */ - if (ia && !(flags & IPV6_FORWARDING)) { -#ifndef __APPLE__ - ia->ia_ifa.if_opackets++; - ia->ia_ifa.if_obytes += m->m_pkthdr.len; -#endif - } + + if (opt && (opt->ip6po_flags & IP6PO_DONTFRAG)) + dontfrag = 1; + else + dontfrag = 0; + if (dontfrag && alwaysfrag) { /* case 4 */ + /* conflicting request - can't transmit */ + error = EMSGSIZE; + goto bad; + } + + lck_rw_lock_shared(nd_if_rwlock); + ifmtu = IN6_LINKMTU(ifp); + lck_rw_done(nd_if_rwlock); + + if (dontfrag && tlen > ifmtu) { /* case 2-b */ + /* + * Even if the DONTFRAG option is specified, we cannot send the + * packet when the data length is larger than the MTU of the + * outgoing interface. + * Notify the error by sending IPV6_PATHMTU ancillary data as + * well as returning an error code (the latter is not described + * in the API spec.) + */ + u_int32_t mtu32; + struct ip6ctlparam ip6cp; + + mtu32 = (u_int32_t)mtu; + bzero(&ip6cp, sizeof(ip6cp)); + ip6cp.ip6c_cmdarg = (void *)&mtu32; + pfctlinput2(PRC_MSGSIZE, (struct sockaddr *)&ro_pmtu->ro_dst, + (void *)&ip6cp); + + error = EMSGSIZE; + goto bad; + } + + /* + * transmit packet without fragmentation + */ + tso = (ifp->if_hwassist & IFNET_TSO_IPV6) && + (m->m_pkthdr.csum_flags & CSUM_TSO_IPV6); + if (dontfrag || (!alwaysfrag && /* case 1-a and 2-a */ + (tlen <= mtu || tso || (ifp->if_hwassist & CSUM_FRAGMENT_IPV6)))) { + int sw_csum; + + ip6 = mtod(m, struct ip6_hdr *); #ifdef IPSEC /* clean ipsec history once it goes out of the node */ ipsec_delaux(m); #endif - error = nd6_output(ifp, origifp, m, dst, ro->ro_rt, 1); + if (apple_hwcksum_tx == 0) /* Do not let HW handle cksum */ + sw_csum = m->m_pkthdr.csum_flags; + else + sw_csum = m->m_pkthdr.csum_flags & + ~IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist); + + if ((sw_csum & CSUM_DELAY_IPV6_DATA) != 0) { + in6_delayed_cksum(m, sizeof(struct ip6_hdr) + optlen); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IPV6_DATA; + } + if (ro->ro_rt) + RT_LOCK_ASSERT_NOTHELD(ro->ro_rt); + error = nd6_output(ifp, origifp, m, dst, ro->ro_rt); goto done; - } else if (mtu < IPV6_MMTU) { - /* - * note that path MTU is never less than IPV6_MMTU - * (see icmp6_input). - */ + } + + /* + * try to fragment the packet. 
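The four cases enumerated in the comment above collapse into a small decision table; a sketch that ignores the TSO and hardware-fragmentation shortcuts the real code also honors:

    /* Returns 0 to send as is, 1 to fragment, EMSGSIZE to refuse. */
    static int
    frag_decision(u_int32_t tlen, u_int32_t path_mtu, u_int32_t if_mtu,
        int dontfrag, int alwaysfrag)
    {
        if (dontfrag && alwaysfrag)             /* case 4: conflicting */
            return (EMSGSIZE);
        if (dontfrag)                           /* case 2 */
            return (tlen <= if_mtu ? 0 : EMSGSIZE);
        if (alwaysfrag)                         /* case 3 */
            return (1);
        return (tlen <= path_mtu ? 0 : 1);      /* case 1 */
    }

In the case 2-b refusal the kernel additionally pushes a PRC_MSGSIZE notification, so RFC 3542 sockets see an IPV6_PATHMTU update along with the EMSGSIZE.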
case 1-b and 3 + */ + if (mtu < IPV6_MMTU) { + /* path MTU cannot be less than IPV6_MMTU */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; - } else if (ip6->ip6_plen == 0) { /* jumbo payload cannot be fragmented */ + } else if (ip6->ip6_plen == 0) { + /* jumbo payload cannot be fragmented */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else { struct mbuf **mnext, *m_frgpart; - struct ip6_frag *ip6f = NULL; - u_int32_t id = htonl(ip6_id++); + struct ip6_frag *ip6f; + u_int32_t id = htonl(ip6_randomid()); u_char nextproto; /* @@ -1174,6 +1272,11 @@ skip_ipsec2:; ip6->ip6_nxt = IPPROTO_FRAGMENT; } + if ((m->m_pkthdr.csum_flags & CSUM_DELAY_IPV6_DATA) != 0) { + in6_delayed_cksum(m, sizeof(struct ip6_hdr) + optlen); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IPV6_DATA; + } + /* * Loop through length of segment after first fragment, * make new header and copy data of each part and link onto @@ -1254,7 +1357,7 @@ sendorfree: /* clean ipsec history once it goes out of the node */ ipsec_delaux(m); #endif - error = nd6_output(ifp, origifp, m, dst, ro->ro_rt, 1); + error = nd6_output(ifp, origifp, m, dst, ro->ro_rt); } else m_freem(m); @@ -1264,8 +1367,6 @@ sendorfree: ip6stat.ip6s_fragmented++; done: - if (!locked) - lck_mtx_unlock(ip6_mutex); if (ro == &ip6route && ro->ro_rt) { /* brace necessary for rtfree */ rtfree(ro->ro_rt); } else if (ro_pmtu == &ip6route && ro_pmtu->ro_rt) { @@ -1278,8 +1379,12 @@ done: #endif /* IPSEC */ if (ia != NULL) - ifafree(&ia->ia_ifa); - return(error); + IFA_REMREF(&ia->ia_ifa); + if (ifp != NULL) + ifnet_release(ifp); + if (origifp != NULL) + ifnet_release(origifp); + return (error); freehdrs: m_freem(exthdrs.ip6e_hbh); /* m_freem will check if mbuf is 0 */ @@ -1311,7 +1416,7 @@ ip6_copyexthdr(mp, hdr, hlen) MCLGET(m, M_DONTWAIT); if ((m->m_flags & M_EXT) == 0) { m_free(m); - return(ENOBUFS); + return (ENOBUFS); } } m->m_len = hlen; @@ -1319,9 +1424,29 @@ ip6_copyexthdr(mp, hdr, hlen) bcopy(hdr, mtod(m, caddr_t), hlen); *mp = m; - return(0); + return (0); } +/* + * Process a delayed payload checksum calculation. + */ +void +in6_delayed_cksum(struct mbuf *m, uint16_t offset) +{ + uint16_t csum; + + csum = in6_cksum(m, 0, offset, m->m_pkthdr.len - offset); + if (csum == 0 && (m->m_pkthdr.csum_flags & CSUM_UDPIPV6) != 0) { + csum = 0xffff; + } + + offset += (m->m_pkthdr.csum_data & 0xffff); + if ((offset + sizeof(csum)) > m->m_len) { + m_copyback(m, offset, sizeof(csum), &csum); + } else { + *(uint16_t *)(mtod(m, char *) + offset) = csum; + } +} /* * Insert jumbo payload option. */ @@ -1345,7 +1470,7 @@ ip6_insert_jumboopt(exthdrs, plen) if (exthdrs->ip6e_hbh == 0) { MGET(mopt, M_DONTWAIT, MT_DATA); if (mopt == 0) - return(ENOBUFS); + return (ENOBUFS); mopt->m_len = JUMBOOPTLEN; optbuf = mtod(mopt, u_char *); optbuf[1] = 0; /* = ((JUMBOOPTLEN) >> 3) - 1 */ @@ -1361,7 +1486,7 @@ ip6_insert_jumboopt(exthdrs, plen) * other than exthdrs. * - exthdrs->ip6e_hbh is not an mbuf chain. */ - int oldoptlen = mopt->m_len; + u_int32_t oldoptlen = mopt->m_len; struct mbuf *n; /* @@ -1369,7 +1494,7 @@ ip6_insert_jumboopt(exthdrs, plen) * not fit even in an mbuf cluster. 
*/ if (oldoptlen + JUMBOOPTLEN > MCLBYTES) - return(ENOBUFS); + return (ENOBUFS); /* * As a consequence, we must always prepare a cluster @@ -1384,11 +1509,11 @@ ip6_insert_jumboopt(exthdrs, plen) } } if (!n) - return(ENOBUFS); + return (ENOBUFS); n->m_len = oldoptlen + JUMBOOPTLEN; bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t), - oldoptlen); - optbuf = (u_char *) (mtod(n, caddr_t) + oldoptlen); + oldoptlen); + optbuf = mtod(n, u_char *) + oldoptlen; m_freem(mopt); mopt = exthdrs->ip6e_hbh = n; } else { @@ -1415,7 +1540,7 @@ ip6_insert_jumboopt(exthdrs, plen) /* finally, adjust the packet header length */ exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN; - return(0); + return (0); #undef JUMBOOPTLEN } @@ -1432,9 +1557,9 @@ ip6_insertfraghdr(m0, m, hlen, frghdrp) if (hlen > sizeof(struct ip6_hdr)) { n = m_copym(m0, sizeof(struct ip6_hdr), - hlen - sizeof(struct ip6_hdr), M_DONTWAIT); + hlen - sizeof(struct ip6_hdr), M_DONTWAIT); if (n == 0) - return(ENOBUFS); + return (ENOBUFS); m->m_next = n; } else n = m; @@ -1446,8 +1571,8 @@ ip6_insertfraghdr(m0, m, hlen, frghdrp) if ((mlast->m_flags & M_EXT) == 0 && M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) { /* use the trailing space of the last mbuf for the fragment hdr */ - *frghdrp = - (struct ip6_frag *)(mtod(mlast, caddr_t) + mlast->m_len); + *frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) + + mlast->m_len); mlast->m_len += sizeof(struct ip6_frag); m->m_pkthdr.len += sizeof(struct ip6_frag); } else { @@ -1456,18 +1581,102 @@ ip6_insertfraghdr(m0, m, hlen, frghdrp) MGET(mfrg, M_DONTWAIT, MT_DATA); if (mfrg == 0) - return(ENOBUFS); + return (ENOBUFS); mfrg->m_len = sizeof(struct ip6_frag); *frghdrp = mtod(mfrg, struct ip6_frag *); mlast->m_next = mfrg; } - return(0); + return (0); } extern int load_ipfw(void); +static int +ip6_getpmtu(struct route_in6 *ro_pmtu, struct route_in6 *ro, + struct ifnet *ifp, struct in6_addr *dst, u_int32_t *mtup, + int *alwaysfragp) +{ + u_int32_t mtu = 0; + int alwaysfrag = 0; + int error = 0; -/* + if (ro_pmtu != ro) { + /* The first hop and the final destination may differ. */ + struct sockaddr_in6 *sa6_dst = + (struct sockaddr_in6 *)&ro_pmtu->ro_dst; + if (ro_pmtu->ro_rt && + ((ro_pmtu->ro_rt->rt_flags & RTF_UP) == 0 || + ro_pmtu->ro_rt->generation_id != route_generation || + !IN6_ARE_ADDR_EQUAL(&sa6_dst->sin6_addr, dst))) { + rtfree(ro_pmtu->ro_rt); + ro_pmtu->ro_rt = (struct rtentry *)NULL; + } + if (ro_pmtu->ro_rt == NULL) { + bzero(sa6_dst, sizeof(*sa6_dst)); + sa6_dst->sin6_family = AF_INET6; + sa6_dst->sin6_len = sizeof(struct sockaddr_in6); + sa6_dst->sin6_addr = *dst; + + rtalloc_scoped((struct route *)ro_pmtu, + ifp != NULL ? ifp->if_index : IFSCOPE_NONE); + } + } + + + if (ro_pmtu->ro_rt != NULL) { + u_int32_t ifmtu; + + lck_rw_lock_shared(nd_if_rwlock); + ifmtu = IN6_LINKMTU(ifp); + lck_rw_done(nd_if_rwlock); + + RT_LOCK_SPIN(ro_pmtu->ro_rt); + mtu = ro_pmtu->ro_rt->rt_rmx.rmx_mtu; + if (mtu > ifmtu || mtu == 0) { + /* + * The MTU on the route is larger than the MTU on + * the interface! This shouldn't happen, unless the + * MTU of the interface has been changed after the + * interface was brought up. Change the MTU in the + * route to match the interface MTU (as long as the + * field isn't locked). + * + * if MTU on the route is 0, we need to fix the MTU. + * this case happens with path MTU discovery timeouts. 
+ */ + mtu = ifmtu; + if ((ro_pmtu->ro_rt->rt_rmx.rmx_locks & RTV_MTU) == 0) + ro_pmtu->ro_rt->rt_rmx.rmx_mtu = mtu; /* XXX */ + } + else if (mtu < IPV6_MMTU) { + /* + * RFC2460 section 5, last paragraph: + * if we record ICMPv6 too big message with + * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU + * or smaller, with framgent header attached. + * (fragment header is needed regardless from the + * packet size, for translators to identify packets) + */ + alwaysfrag = 1; + mtu = IPV6_MMTU; + } + RT_UNLOCK(ro_pmtu->ro_rt); + } else { + if (ifp) { + lck_rw_lock_shared(nd_if_rwlock); + mtu = IN6_LINKMTU(ifp); + lck_rw_done(nd_if_rwlock); + } else + error = EHOSTUNREACH; /* XXX */ + } + + *mtup = mtu; + if (alwaysfragp) + *alwaysfragp = alwaysfrag; + return (error); +} + +/* * IP6 socket option processing. */ int @@ -1475,6 +1684,8 @@ ip6_ctloutput(so, sopt) struct socket *so; struct sockopt *sopt; { + int optdatalen, uproto; + void *optdata; int privileged; struct inpcb *in6p = sotoinpcb(so); int error = 0, optval = 0; @@ -1491,6 +1702,7 @@ ip6_ctloutput(so, sopt) optname = sopt->sopt_name; optlen = sopt->sopt_valsize; p = sopt->sopt_p; + uproto = (int)so->so_proto->pr_protocol; privileged = (proc_suser(p) == 0); @@ -1499,14 +1711,10 @@ ip6_ctloutput(so, sopt) case SOPT_SET: switch (optname) { - case IPV6_PKTOPTIONS: + case IPV6_2292PKTOPTIONS: { struct mbuf *m; - if (sopt->sopt_valsize > MCLBYTES) { - error = EMSGSIZE; - break; - } error = soopt_getm(sopt, &m); /* XXX */ if (error != 0) break; @@ -1532,12 +1740,23 @@ ip6_ctloutput(so, sopt) * receiving ANY hbh/dst options in order to avoid * overhead of parsing options in the kernel. */ + case IPV6_RECVHOPOPTS: + case IPV6_RECVDSTOPTS: + case IPV6_RECVRTHDRDSTOPTS: + if (!privileged) + break; + /* FALLTHROUGH */ case IPV6_UNICAST_HOPS: - case IPV6_CHECKSUM: + case IPV6_HOPLIMIT: case IPV6_FAITH: + case IPV6_RECVPKTINFO: + case IPV6_RECVHOPLIMIT: + case IPV6_RECVRTHDR: + case IPV6_RECVPATHMTU: case IPV6_RECVTCLASS: case IPV6_V6ONLY: + case IPV6_AUTOFLOWLABEL: if (optlen != sizeof(int)) { error = EINVAL; break; @@ -1554,8 +1773,7 @@ ip6_ctloutput(so, sopt) else { /* -1 = kernel default */ in6p->in6p_hops = optval; - - if ((in6p->in6p_vflag & + if ((in6p->inp_vflag & INP_IPV4) != 0) in6p->inp_ip_ttl = optval; } @@ -1563,18 +1781,103 @@ ip6_ctloutput(so, sopt) #define OPTSET(bit) \ do { \ if (optval) \ - in6p->in6p_flags |= (bit); \ + in6p->inp_flags |= (bit); \ else \ - in6p->in6p_flags &= ~(bit); \ -} while (0) -#define OPTBIT(bit) (in6p->in6p_flags & (bit) ? 1 : 0) + in6p->inp_flags &= ~(bit); \ +} while (/*CONSTCOND*/ 0) +#define OPTSET2292(bit) \ +do { \ + in6p->inp_flags |= IN6P_RFC2292; \ + if (optval) \ + in6p->inp_flags |= (bit); \ + else \ + in6p->inp_flags &= ~(bit); \ +} while (/*CONSTCOND*/ 0) +#define OPTBIT(bit) (in6p->inp_flags & (bit) ? 
1 : 0) + + case IPV6_RECVPKTINFO: + /* cannot mix with RFC2292 */ + if (OPTBIT(IN6P_RFC2292)) { + error = EINVAL; + break; + } + OPTSET(IN6P_PKTINFO); + break; + + case IPV6_HOPLIMIT: + { + struct ip6_pktopts **optp; + + /* cannot mix with RFC2292 */ + if (OPTBIT(IN6P_RFC2292)) { + error = EINVAL; + break; + } + optp = &in6p->in6p_outputopts; + error = ip6_pcbopt(IPV6_HOPLIMIT, + (u_char *)&optval, sizeof(optval), + optp, uproto); + break; + } + + case IPV6_RECVHOPLIMIT: + /* cannot mix with RFC2292 */ + if (OPTBIT(IN6P_RFC2292)) { + error = EINVAL; + break; + } + OPTSET(IN6P_HOPLIMIT); + break; + + case IPV6_RECVHOPOPTS: + /* cannot mix with RFC2292 */ + if (OPTBIT(IN6P_RFC2292)) { + error = EINVAL; + break; + } + OPTSET(IN6P_HOPOPTS); + break; + + case IPV6_RECVDSTOPTS: + /* cannot mix with RFC2292 */ + if (OPTBIT(IN6P_RFC2292)) { + error = EINVAL; + break; + } + OPTSET(IN6P_DSTOPTS); + break; - case IPV6_CHECKSUM: - in6p->in6p_cksum = optval; + case IPV6_RECVRTHDRDSTOPTS: + /* cannot mix with RFC2292 */ + if (OPTBIT(IN6P_RFC2292)) { + error = EINVAL; + break; + } + OPTSET(IN6P_RTHDRDSTOPTS); + break; + + case IPV6_RECVRTHDR: + /* cannot mix with RFC2292 */ + if (OPTBIT(IN6P_RFC2292)) { + error = EINVAL; + break; + } + OPTSET(IN6P_RTHDR); break; case IPV6_FAITH: - OPTSET(IN6P_FAITH); + OPTSET(INP_FAITH); + break; + + case IPV6_RECVPATHMTU: + /* + * We ignore this option for TCP + * sockets. + * (RFC3542 leaves this case + * unspecified.) + */ + if (uproto != IPPROTO_TCP) + OPTSET(IN6P_MTU); break; case IPV6_V6ONLY: @@ -1583,30 +1886,54 @@ do { \ * available only prior to bind(2). * see ipng mailing list, Jun 22 2001. */ - if (in6p->in6p_lport || - !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) - { + if (in6p->inp_lport || + !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) { error = EINVAL; break; } OPTSET(IN6P_IPV6_V6ONLY); if (optval) - in6p->in6p_vflag &= ~INP_IPV4; + in6p->inp_vflag &= ~INP_IPV4; else - in6p->in6p_vflag |= INP_IPV4; + in6p->inp_vflag |= INP_IPV4; break; case IPV6_RECVTCLASS: - /* cannot mix with RFC2292 XXX */ + /* we can mix with RFC2292 */ OPTSET(IN6P_TCLASS); break; + case IPV6_AUTOFLOWLABEL: + OPTSET(IN6P_AUTOFLOWLABEL); + break; + } break; - case IPV6_PKTINFO: - case IPV6_HOPLIMIT: - case IPV6_HOPOPTS: - case IPV6_DSTOPTS: - case IPV6_RTHDR: + case IPV6_TCLASS: + case IPV6_DONTFRAG: + case IPV6_USE_MIN_MTU: + case IPV6_PREFER_TEMPADDR: + if (optlen != sizeof(optval)) { + error = EINVAL; + break; + } + error = sooptcopyin(sopt, &optval, + sizeof optval, sizeof optval); + if (error) + break; + { + struct ip6_pktopts **optp; + optp = &in6p->in6p_outputopts; + error = ip6_pcbopt(optname, + (u_char *)&optval, sizeof(optval), + optp, uproto); + break; + } + + case IPV6_2292PKTINFO: + case IPV6_2292HOPLIMIT: + case IPV6_2292HOPOPTS: + case IPV6_2292DSTOPTS: + case IPV6_2292RTHDR: /* RFC 2292 */ if (optlen != sizeof(int)) { error = EINVAL; @@ -1617,68 +1944,74 @@ do { \ if (error) break; switch (optname) { - case IPV6_PKTINFO: - OPTSET(IN6P_PKTINFO); + case IPV6_2292PKTINFO: + OPTSET2292(IN6P_PKTINFO); break; - case IPV6_HOPLIMIT: - OPTSET(IN6P_HOPLIMIT); + case IPV6_2292HOPLIMIT: + OPTSET2292(IN6P_HOPLIMIT); break; - case IPV6_HOPOPTS: + case IPV6_2292HOPOPTS: /* * Check super-user privilege. * See comments for IPV6_RECVHOPOPTS. 
*/ if (!privileged) return(EPERM); - OPTSET(IN6P_HOPOPTS); + OPTSET2292(IN6P_HOPOPTS); break; - case IPV6_DSTOPTS: + case IPV6_2292DSTOPTS: if (!privileged) return(EPERM); - OPTSET(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */ + OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */ break; - case IPV6_RTHDR: - OPTSET(IN6P_RTHDR); + case IPV6_2292RTHDR: + OPTSET2292(IN6P_RTHDR); break; } break; -#undef OPTSET + case IPV6_3542PKTINFO: + case IPV6_3542HOPOPTS: + case IPV6_3542RTHDR: + case IPV6_3542DSTOPTS: + case IPV6_RTHDRDSTOPTS: + case IPV6_3542NEXTHOP: + { + /* new advanced API (RFC3542) */ + struct mbuf *m; - case IPV6_TCLASS: - if (optlen != sizeof(optval)) { + /* cannot mix with RFC2292 */ + if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } - error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); - if (error) + error = soopt_getm(sopt, &m); + if (error != 0) + break; + error = soopt_mcopyin(sopt, m); + if (error) { + m_freem(m); break; - error = ip6_pcbopt(optname, (u_char *)&optval, sizeof(optval), &in6p->in6p_outputopts); + } + error = ip6_pcbopt(optname, mtod(m, u_char *), + m->m_len, &in6p->in6p_outputopts, uproto); + m_freem(m); break; + } +#undef OPTSET case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: - { - struct mbuf *m; - if (sopt->sopt_valsize > MLEN) { - error = EMSGSIZE; - break; - } - /* XXX */ - MGET(m, sopt->sopt_p != kernproc ? - M_WAIT : M_DONTWAIT, MT_HEADER); - if (m == 0) { - error = ENOBUFS; - break; - } - m->m_len = sopt->sopt_valsize; - error = sooptcopyin(sopt, mtod(m, char *), - m->m_len, m->m_len); - error = ip6_setmoptions(sopt->sopt_name, in6p, m); - (void)m_free(m); - } + case IPV6_MSFILTER: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + error = ip6_setmoptions(in6p, sopt); break; case IPV6_PORTRANGE: @@ -1689,18 +2022,18 @@ do { \ switch (optval) { case IPV6_PORTRANGE_DEFAULT: - in6p->in6p_flags &= ~(IN6P_LOWPORT); - in6p->in6p_flags &= ~(IN6P_HIGHPORT); + in6p->inp_flags &= ~(INP_LOWPORT); + in6p->inp_flags &= ~(INP_HIGHPORT); break; case IPV6_PORTRANGE_HIGH: - in6p->in6p_flags &= ~(IN6P_LOWPORT); - in6p->in6p_flags |= IN6P_HIGHPORT; + in6p->inp_flags &= ~(INP_LOWPORT); + in6p->inp_flags |= INP_HIGHPORT; break; case IPV6_PORTRANGE_LOW: - in6p->in6p_flags &= ~(IN6P_HIGHPORT); - in6p->in6p_flags |= IN6P_LOWPORT; + in6p->inp_flags &= ~(INP_HIGHPORT); + in6p->inp_flags |= INP_LOWPORT; break; default: @@ -1716,10 +2049,6 @@ do { \ size_t len = 0; struct mbuf *m; - if (sopt->sopt_valsize > MCLBYTES) { - error = EMSGSIZE; - break; - } if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ @@ -1751,6 +2080,47 @@ do { \ break; #endif /* IPFIREWALL */ + /* + * IPv6 variant of IP_BOUND_IF; for details see + * comments on IP_BOUND_IF in ip_ctloutput(). 
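A consequence of the OPTSET2292() macro above: the first 2292-style setsockopt latches IN6P_RFC2292 on the PCB, and most RFC 3542 style setters (IPV6_RECVTCLASS is an exception the code calls out) refuse to run once that bit is set. Sketched from userland, assuming the legacy IPV6_2292* constants are visible to the application:

    int on = 1;

    /* RFC 3542 style: fine on a fresh socket. */
    setsockopt(s, IPPROTO_IPV6, IPV6_RECVPKTINFO, &on, sizeof (on));

    /* RFC 2292 style: succeeds, but latches the PCB into 2292 mode. */
    setsockopt(s, IPPROTO_IPV6, IPV6_2292PKTINFO, &on, sizeof (on));

    /* Further RFC 3542 calls on this socket now fail with EINVAL. */
    setsockopt(s, IPPROTO_IPV6, IPV6_RECVHOPLIMIT, &on, sizeof (on));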
+	 */
+	case IPV6_BOUND_IF:
+		/* This option is settable only on IPv6 */
+		if (!(in6p->inp_vflag & INP_IPV6)) {
+			error = EINVAL;
+			break;
+		}
+
+		error = sooptcopyin(sopt, &optval,
+		    sizeof (optval), sizeof (optval));
+
+		if (error)
+			break;
+
+		inp_bindif(in6p, optval);
+		break;
+
+	case IPV6_NO_IFT_CELLULAR:
+		/* This option is settable only for IPv6 */
+		if (!(in6p->inp_vflag & INP_IPV6)) {
+			error = EINVAL;
+			break;
+		}
+
+		error = sooptcopyin(sopt, &optval,
+		    sizeof (optval), sizeof (optval));
+
+		if (error)
+			break;
+
+		error = inp_nocellular(in6p, optval);
+		break;
+
+	case IPV6_OUT_IF:
+		/* This option is not settable */
+		error = EINVAL;
+		break;
+
 	default:
 		error = ENOPROTOOPT;
 		break;
@@ -1760,41 +2130,69 @@ do { \
 	case SOPT_GET:
 		switch (optname) {
-		case IPV6_PKTOPTIONS:
-			if (in6p->in6p_options) {
-				struct mbuf *m;
-				m = m_copym(in6p->in6p_options,
-				    0, M_COPYALL, M_WAIT);
-				if (m == NULL) {
-					error = ENOBUFS;
-					break;
-				}
-				error = soopt_mcopyout(sopt, m);
-				if (error == 0)
-					m_freem(m);
-			} else
-				sopt->sopt_valsize = 0;
+		case IPV6_2292PKTOPTIONS:
+			/*
+			 * RFC3542 (effectively) deprecated the
+			 * semantics of the 2292-style pktoptions.
+			 * Since it was not reliable in nature (i.e.,
+			 * applications had to expect the lack of some
+			 * information after all), it would make sense
+			 * to simplify this part by always returning
+			 * empty data.
+			 */
+			sopt->sopt_valsize = 0;
 			break;
+		case IPV6_RECVHOPOPTS:
+		case IPV6_RECVDSTOPTS:
+		case IPV6_RECVRTHDRDSTOPTS:
 		case IPV6_UNICAST_HOPS:
-		case IPV6_CHECKSUM:
+		case IPV6_RECVPKTINFO:
+		case IPV6_RECVHOPLIMIT:
+		case IPV6_RECVRTHDR:
+		case IPV6_RECVPATHMTU:
 		case IPV6_FAITH:
 		case IPV6_V6ONLY:
 		case IPV6_PORTRANGE:
 		case IPV6_RECVTCLASS:
+		case IPV6_AUTOFLOWLABEL:
 			switch (optname) {
+			case IPV6_RECVHOPOPTS:
+				optval = OPTBIT(IN6P_HOPOPTS);
+				break;
+
+			case IPV6_RECVDSTOPTS:
+				optval = OPTBIT(IN6P_DSTOPTS);
+				break;
+
+			case IPV6_RECVRTHDRDSTOPTS:
+				optval = OPTBIT(IN6P_RTHDRDSTOPTS);
+				break;
+
 			case IPV6_UNICAST_HOPS:
 				optval = in6p->in6p_hops;
 				break;
-			case IPV6_CHECKSUM:
-				optval = in6p->in6p_cksum;
+			case IPV6_RECVPKTINFO:
+				optval = OPTBIT(IN6P_PKTINFO);
+				break;
+
+			case IPV6_RECVHOPLIMIT:
+				optval = OPTBIT(IN6P_HOPLIMIT);
+				break;
+
+			case IPV6_RECVRTHDR:
+				optval = OPTBIT(IN6P_RTHDR);
+				break;
+
+			case IPV6_RECVPATHMTU:
+				optval = OPTBIT(IN6P_MTU);
 				break;
 			case IPV6_FAITH:
-				optval = OPTBIT(IN6P_FAITH);
+				optval = OPTBIT(INP_FAITH);
 				break;
 			case IPV6_V6ONLY:
@@ -1804,10 +2202,10 @@ do { \
 			case IPV6_PORTRANGE:
			    {
				int flags;
-				flags = in6p->in6p_flags;
-				if (flags & IN6P_HIGHPORT)
+				flags = in6p->inp_flags;
+				if (flags & INP_HIGHPORT)
 					optval = IPV6_PORTRANGE_HIGH;
-				else if (flags & IN6P_LOWPORT)
+				else if (flags & INP_LOWPORT)
 					optval = IPV6_PORTRANGE_LOW;
 				else
 					optval = 0;
@@ -1817,64 +2215,93 @@ do { \
 				optval = OPTBIT(IN6P_TCLASS);
 				break;
+			case IPV6_AUTOFLOWLABEL:
+				optval = OPTBIT(IN6P_AUTOFLOWLABEL);
+				break;
 			}
+			if (error)
+				break;
 			error = sooptcopyout(sopt, &optval,
 			    sizeof optval);
 			break;
-		case IPV6_PKTINFO:
-		case IPV6_HOPLIMIT:
-		case IPV6_HOPOPTS:
-		case IPV6_RTHDR:
-		case IPV6_DSTOPTS:
-			if ((optname == IPV6_HOPOPTS ||
-			    optname == IPV6_DSTOPTS) &&
-			    !privileged)
-				return(EPERM);
+		case IPV6_PATHMTU:
+		{
+			u_int32_t pmtu = 0;
+			struct ip6_mtuinfo mtuinfo;
+			struct route_in6 sro;
+
+			bzero(&sro, sizeof(sro));
+
+			if (!(so->so_state & SS_ISCONNECTED))
+				return (ENOTCONN);
+			/*
+			 * XXX: we do not consider the case of source
+			 * routing, or optional information to specify
+			 * the outgoing interface.
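The IPV6_PATHMTU getter that begins above is the query half of RFC 3542 path-MTU reporting. From userland it looks like this (the socket must be connected or the call fails with ENOTCONN):

    struct ip6_mtuinfo mtuinfo;
    socklen_t len = sizeof (mtuinfo);

    if (getsockopt(s, IPPROTO_IPV6, IPV6_PATHMTU, &mtuinfo, &len) == 0) {
        /* mtuinfo.ip6m_mtu is the current path MTU, clamped by the
         * kernel to IPV6_MAXPACKET (65535) */
    }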
+ */ + error = ip6_getpmtu(&sro, NULL, NULL, + &in6p->in6p_faddr, &pmtu, NULL); + if (sro.ro_rt) + rtfree(sro.ro_rt); + if (error) + break; + if (pmtu > IPV6_MAXPACKET) + pmtu = IPV6_MAXPACKET; + + bzero(&mtuinfo, sizeof(mtuinfo)); + mtuinfo.ip6m_mtu = (u_int32_t)pmtu; + optdata = (void *)&mtuinfo; + optdatalen = sizeof(mtuinfo); + error = sooptcopyout(sopt, optdata, + optdatalen); + break; + } + + case IPV6_2292PKTINFO: + case IPV6_2292HOPLIMIT: + case IPV6_2292HOPOPTS: + case IPV6_2292RTHDR: + case IPV6_2292DSTOPTS: switch (optname) { - case IPV6_PKTINFO: + case IPV6_2292PKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; - case IPV6_HOPLIMIT: + case IPV6_2292HOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; - case IPV6_HOPOPTS: - if (!privileged) - return(EPERM); + case IPV6_2292HOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; - case IPV6_RTHDR: + case IPV6_2292RTHDR: optval = OPTBIT(IN6P_RTHDR); break; - case IPV6_DSTOPTS: - if (!privileged) - return(EPERM); + case IPV6_2292DSTOPTS: optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); break; } error = sooptcopyout(sopt, &optval, - sizeof optval); + sizeof optval); break; - + case IPV6_PKTINFO: + case IPV6_HOPOPTS: + case IPV6_RTHDR: + case IPV6_DSTOPTS: + case IPV6_RTHDRDSTOPTS: + case IPV6_NEXTHOP: case IPV6_TCLASS: - error = ip6_getpcbopt(in6p->in6p_outputopts, optname, sopt); + case IPV6_DONTFRAG: + case IPV6_USE_MIN_MTU: + case IPV6_PREFER_TEMPADDR: + error = ip6_getpcbopt(in6p->in6p_outputopts, + optname, sopt); break; case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: - case IPV6_JOIN_GROUP: - case IPV6_LEAVE_GROUP: - { - struct mbuf *m; - error = ip6_getmoptions(sopt->sopt_name, - in6p->in6p_moptions, &m); - if (error == 0) - error = sooptcopyout(sopt, - mtod(m, char *), m->m_len); - if (m != NULL) - m_freem(m); - } + case IPV6_MSFILTER: + error = ip6_getmoptions(in6p, sopt); break; #if IPSEC @@ -1885,10 +2312,6 @@ do { \ struct mbuf *m = NULL; struct mbuf **mp = &m; - if (sopt->sopt_valsize > MCLBYTES) { - error = EMSGSIZE; - break; - } error = soopt_getm(sopt, &m); /* XXX */ if (error != 0) break; @@ -1921,6 +2344,26 @@ do { \ break; #endif /* IPFIREWALL */ + case IPV6_BOUND_IF: + if (in6p->inp_flags & INP_BOUND_IF) + optval = in6p->inp_boundif; + error = sooptcopyout(sopt, &optval, + sizeof (optval)); + break; + + case IPV6_NO_IFT_CELLULAR: + optval = (in6p->inp_flags & INP_NO_IFT_CELLULAR) + ? 1 : 0; + error = sooptcopyout(sopt, &optval, + sizeof (optval)); + break; + + case IPV6_OUT_IF: + optval = in6p->in6p_last_outif; + error = sooptcopyout(sopt, &optval, + sizeof (optval)); + break; + default: error = ENOPROTOOPT; break; @@ -1933,35 +2376,105 @@ do { \ return(error); } -/* - * Set up IP6 options in pcb for insertion in output packets or - * specifying behavior of outgoing packets. - */ -static int -ip6_pcbopts( - struct ip6_pktopts **pktopt, - struct mbuf *m, - __unused struct socket *so, - struct sockopt *sopt) +int +ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt) { - struct ip6_pktopts *opt = *pktopt; - int error = 0, priv; - struct proc *p = sopt->sopt_p; + int error = 0, optval, optlen; + const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum); + struct inpcb *in6p = sotoinpcb(so); + int level, op, optname; - /* turn off any old options. 
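 *
 * As an aside, a hypothetical userland sketch for the
 * IPV6_CHECKSUM handling added above in ip6_raw_ctloutput():
 * the kernel fills in a checksum at the given even offset into
 * the application data of a raw (non-ICMPv6) socket.
 *
 *	int s = socket(AF_INET6, SOCK_RAW, 253);  /* 253: experimental */
 *	int off = 2;                              /* even offset */
 *
 *	(void) setsockopt(s, IPPROTO_IPV6, IPV6_CHECKSUM,
 *	    &off, sizeof (off));
 *
 * For ICMPv6 sockets the offset is pinned to
 * offsetof(struct icmp6_hdr, icmp6_cksum); attempts to change
 * it fail with EINVAL, as the code above shows.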
*/ - if (opt) { -#if DIAGNOSTIC - if (opt->ip6po_pktinfo || opt->ip6po_nexthop || - opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 || - opt->ip6po_rhinfo.ip6po_rhi_rthdr) - printf("ip6_pcbopts: all specified options are cleared.\n"); -#endif - ip6_clearpktopts(opt, 1, -1); - } else { - opt = _MALLOC(sizeof(*opt), M_IP6OPT, M_WAITOK); - if (opt == NULL) - return ENOBUFS; - } + level = sopt->sopt_level; + op = sopt->sopt_dir; + optname = sopt->sopt_name; + optlen = sopt->sopt_valsize; + + if (level != IPPROTO_IPV6) { + return (EINVAL); + } + + switch (optname) { + case IPV6_CHECKSUM: + /* + * For ICMPv6 sockets, no modification allowed for checksum + * offset, permit "no change" values to help existing apps. + * + * RFC3542 says: "An attempt to set IPV6_CHECKSUM + * for an ICMPv6 socket will fail." + * The current behavior does not meet RFC3542. + */ + switch (op) { + case SOPT_SET: + if (optlen != sizeof(int)) { + error = EINVAL; + break; + } + error = sooptcopyin(sopt, &optval, sizeof(optval), + sizeof(optval)); + if (error) + break; + if ((optval % 2) != 0) { + /* the API assumes even offset values */ + error = EINVAL; + } else if (so->so_proto->pr_protocol == + IPPROTO_ICMPV6) { + if (optval != icmp6off) + error = EINVAL; + } else + in6p->in6p_cksum = optval; + break; + + case SOPT_GET: + if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) + optval = icmp6off; + else + optval = in6p->in6p_cksum; + + error = sooptcopyout(sopt, &optval, sizeof(optval)); + break; + + default: + error = EINVAL; + break; + } + break; + + default: + error = ENOPROTOOPT; + break; + } + + return (error); +} + +/* + * Set up IP6 options in pcb for insertion in output packets or + * specifying behavior of outgoing packets. + */ +static int +ip6_pcbopts( + struct ip6_pktopts **pktopt, + struct mbuf *m, + __unused struct socket *so, + __unused struct sockopt *sopt) +{ + struct ip6_pktopts *opt = *pktopt; + int error = 0; + + /* turn off any old options. */ + if (opt) { +#if DIAGNOSTIC + if (opt->ip6po_pktinfo || opt->ip6po_nexthop || + opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 || + opt->ip6po_rhinfo.ip6po_rhi_rthdr) + printf("ip6_pcbopts: all specified options are cleared.\n"); +#endif + ip6_clearpktopts(opt, -1); + } else { + opt = _MALLOC(sizeof(*opt), M_IP6OPT, M_WAITOK); + if (opt == NULL) + return ENOBUFS; + } *pktopt = NULL; if (!m || m->m_len == 0) { @@ -1974,11 +2487,9 @@ ip6_pcbopts( return(0); } - priv = (proc_suser(p) == 0); - /* set options specified by user. */ - if ((error = ip6_setpktoptions(m, opt, priv, 1)) != 0) { - ip6_clearpktopts(opt, 1, -1); /* XXX: discard all options */ + if ((error = ip6_setpktopts(m, opt, NULL, so->so_proto->pr_protocol)) != 0) { + ip6_clearpktopts(opt, -1); /* XXX: discard all options */ FREE(opt, M_IP6OPT); return(error); } @@ -1986,19 +2497,36 @@ ip6_pcbopts( return(0); } +/* + * initialize ip6_pktopts. beware that there are non-zero default values in + * the struct. 
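+ *
+ * Because of those defaults, in-kernel callers must use this
+ * routine rather than bzero() alone, e.g.:
+ *
+ *	struct ip6_pktopts opts;
+ *
+ *	ip6_initpktopts(&opts);	/* hlim/tclass -1, nonzero policies */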
+ */ +void +ip6_initpktopts(struct ip6_pktopts *opt) +{ + + bzero(opt, sizeof(*opt)); + opt->ip6po_hlim = -1; /* -1 means default hop limit */ + opt->ip6po_tclass = -1; /* -1 means default traffic class */ + opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY; + opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM; +} + static int -ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt) +ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt, int uproto) { struct ip6_pktopts *opt; opt = *pktopt; if (opt == NULL) { opt = _MALLOC(sizeof(*opt), M_IP6OPT, M_WAITOK); + if (opt == NULL) + return(ENOBUFS); ip6_initpktopts(opt); *pktopt = opt; } - return (ip6_setpktopt(optname, buf, len, opt)); + return (ip6_setpktopt(optname, buf, len, opt, 1, 0, uproto)); } static int @@ -2006,15 +2534,85 @@ ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt) { void *optdata = NULL; int optdatalen = 0; - int deftclass = 0; + struct ip6_ext *ip6e; int error = 0; + struct in6_pktinfo null_pktinfo; + int deftclass = 0, on; + int defminmtu = IP6PO_MINMTU_MCASTONLY; + int defpreftemp = IP6PO_TEMPADDR_SYSTEM; switch (optname) { + case IPV6_PKTINFO: + if (pktopt && pktopt->ip6po_pktinfo) + optdata = (void *)pktopt->ip6po_pktinfo; + else { + /* XXX: we don't have to do this every time... */ + bzero(&null_pktinfo, sizeof(null_pktinfo)); + optdata = (void *)&null_pktinfo; + } + optdatalen = sizeof(struct in6_pktinfo); + break; case IPV6_TCLASS: if (pktopt && pktopt->ip6po_tclass >= 0) - optdata = &pktopt->ip6po_tclass; + optdata = (void *)&pktopt->ip6po_tclass; + else + optdata = (void *)&deftclass; + optdatalen = sizeof(int); + break; + case IPV6_HOPOPTS: + if (pktopt && pktopt->ip6po_hbh) { + optdata = (void *)pktopt->ip6po_hbh; + ip6e = (struct ip6_ext *)pktopt->ip6po_hbh; + optdatalen = (ip6e->ip6e_len + 1) << 3; + } + break; + case IPV6_RTHDR: + if (pktopt && pktopt->ip6po_rthdr) { + optdata = (void *)pktopt->ip6po_rthdr; + ip6e = (struct ip6_ext *)pktopt->ip6po_rthdr; + optdatalen = (ip6e->ip6e_len + 1) << 3; + } + break; + case IPV6_RTHDRDSTOPTS: + if (pktopt && pktopt->ip6po_dest1) { + optdata = (void *)pktopt->ip6po_dest1; + ip6e = (struct ip6_ext *)pktopt->ip6po_dest1; + optdatalen = (ip6e->ip6e_len + 1) << 3; + } + break; + case IPV6_DSTOPTS: + if (pktopt && pktopt->ip6po_dest2) { + optdata = (void *)pktopt->ip6po_dest2; + ip6e = (struct ip6_ext *)pktopt->ip6po_dest2; + optdatalen = (ip6e->ip6e_len + 1) << 3; + } + break; + case IPV6_NEXTHOP: + if (pktopt && pktopt->ip6po_nexthop) { + optdata = (void *)pktopt->ip6po_nexthop; + optdatalen = pktopt->ip6po_nexthop->sa_len; + } + break; + case IPV6_USE_MIN_MTU: + if (pktopt) + optdata = (void *)&pktopt->ip6po_minmtu; else - optdata = &deftclass; + optdata = (void *)&defminmtu; + optdatalen = sizeof(int); + break; + case IPV6_DONTFRAG: + if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG)) + on = 1; + else + on = 0; + optdata = (void *)&on; + optdatalen = sizeof(on); + break; + case IPV6_PREFER_TEMPADDR: + if (pktopt) + optdata = (void *)&pktopt->ip6po_prefer_tempaddr; + else + optdata = (void *)&defpreftemp; optdatalen = sizeof(int); break; default: /* should not happen */ @@ -2025,81 +2623,48 @@ ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt) } error = sooptcopyout(sopt, optdata, optdatalen); - return (error); -} - -static int -ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt) -{ - switch (optname) { - case IPV6_TCLASS: - { - int tclass; - if (len != 
sizeof(int)) - return (EINVAL); - tclass = *(int *)buf; - if (tclass < -1 || tclass > 255) - return (EINVAL); - - opt->ip6po_tclass = tclass; - break; - } - - default: - return (ENOPROTOOPT); - } /* end of switch */ - - return (0); -} - -/* - * initialize ip6_pktopts. beware that there are non-zero default values in - * the struct. - */ -void -ip6_initpktopts(opt) - struct ip6_pktopts *opt; -{ - bzero(opt, sizeof(*opt)); - opt->ip6po_hlim = -1; /* -1 means default hop limit */ - opt->ip6po_tclass = -1; /* -1 means default traffic class */ + return (error); } void -ip6_clearpktopts(pktopt, needfree, optname) +ip6_clearpktopts(pktopt, optname) struct ip6_pktopts *pktopt; - int needfree, optname; + int optname; { if (pktopt == NULL) return; - if (optname == -1) { - if (needfree && pktopt->ip6po_pktinfo) + if (optname == -1 || optname == IPV6_PKTINFO) { + if (pktopt->ip6po_pktinfo) FREE(pktopt->ip6po_pktinfo, M_IP6OPT); pktopt->ip6po_pktinfo = NULL; } - if (optname == -1) + if (optname == -1 || optname == IPV6_HOPLIMIT) pktopt->ip6po_hlim = -1; - if (optname == -1) + if (optname == -1 || optname == IPV6_TCLASS) pktopt->ip6po_tclass = -1; - if (optname == -1) { - if (needfree && pktopt->ip6po_nexthop) + if (optname == -1 || optname == IPV6_NEXTHOP) { + if (pktopt->ip6po_nextroute.ro_rt) { + rtfree(pktopt->ip6po_nextroute.ro_rt); + pktopt->ip6po_nextroute.ro_rt = NULL; + } + if (pktopt->ip6po_nexthop) FREE(pktopt->ip6po_nexthop, M_IP6OPT); pktopt->ip6po_nexthop = NULL; } - if (optname == -1) { - if (needfree && pktopt->ip6po_hbh) + if (optname == -1 || optname == IPV6_HOPOPTS) { + if (pktopt->ip6po_hbh) FREE(pktopt->ip6po_hbh, M_IP6OPT); pktopt->ip6po_hbh = NULL; } - if (optname == -1) { - if (needfree && pktopt->ip6po_dest1) + if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) { + if (pktopt->ip6po_dest1) FREE(pktopt->ip6po_dest1, M_IP6OPT); pktopt->ip6po_dest1 = NULL; } - if (optname == -1) { - if (needfree && pktopt->ip6po_rhinfo.ip6po_rhi_rthdr) + if (optname == -1 || optname == IPV6_RTHDR) { + if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr) FREE(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT); pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL; if (pktopt->ip6po_route.ro_rt) { @@ -2107,8 +2672,8 @@ ip6_clearpktopts(pktopt, needfree, optname) pktopt->ip6po_route.ro_rt = NULL; } } - if (optname == -1) { - if (needfree && pktopt->ip6po_dest2) + if (optname == -1 || optname == IPV6_DSTOPTS) { + if (pktopt->ip6po_dest2) FREE(pktopt->ip6po_dest2, M_IP6OPT); pktopt->ip6po_dest2 = NULL; } @@ -2126,25 +2691,17 @@ do {\ }\ } while (0) -struct ip6_pktopts * -ip6_copypktopts(src, canwait) - struct ip6_pktopts *src; - int canwait; +static int +copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait) { - struct ip6_pktopts *dst; - - if (src == NULL) { + if (dst == NULL || src == NULL) { printf("ip6_clearpktopts: invalid argument\n"); - return(NULL); + return (EINVAL); } - dst = _MALLOC(sizeof(*dst), M_IP6OPT, canwait); - if (dst == NULL && canwait == M_NOWAIT) - return (NULL); - bzero(dst, sizeof(*dst)); - dst->ip6po_hlim = src->ip6po_hlim; dst->ip6po_tclass = src->ip6po_tclass; + dst->ip6po_flags = src->ip6po_flags; if (src->ip6po_pktinfo) { dst->ip6po_pktinfo = _MALLOC(sizeof(*dst->ip6po_pktinfo), M_IP6OPT, canwait); @@ -2164,20 +2721,33 @@ ip6_copypktopts(src, canwait) PKTOPT_EXTHDRCPY(ip6po_dest1); PKTOPT_EXTHDRCPY(ip6po_dest2); PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached route */ - return(dst); + return (0); bad: - if (dst->ip6po_pktinfo) FREE(dst->ip6po_pktinfo, M_IP6OPT); - if 
(dst->ip6po_nexthop) FREE(dst->ip6po_nexthop, M_IP6OPT); - if (dst->ip6po_hbh) FREE(dst->ip6po_hbh, M_IP6OPT); - if (dst->ip6po_dest1) FREE(dst->ip6po_dest1, M_IP6OPT); - if (dst->ip6po_dest2) FREE(dst->ip6po_dest2, M_IP6OPT); - if (dst->ip6po_rthdr) FREE(dst->ip6po_rthdr, M_IP6OPT); - FREE(dst, M_IP6OPT); - return(NULL); + ip6_clearpktopts(dst, -1); + return (ENOBUFS); } #undef PKTOPT_EXTHDRCPY +struct ip6_pktopts * +ip6_copypktopts(struct ip6_pktopts *src, int canwait) +{ + int error; + struct ip6_pktopts *dst; + + dst = _MALLOC(sizeof(*dst), M_IP6OPT, canwait); + if (dst == NULL) + return (NULL); + ip6_initpktopts(dst); + + if ((error = copypktopts(dst, src, canwait)) != 0) { + FREE(dst, M_IP6OPT); + return (NULL); + } + + return (dst); +} + void ip6_freepcbopts(pktopt) struct ip6_pktopts *pktopt; @@ -2185,707 +2755,596 @@ ip6_freepcbopts(pktopt) if (pktopt == NULL) return; - ip6_clearpktopts(pktopt, 1, -1); + ip6_clearpktopts(pktopt, -1); FREE(pktopt, M_IP6OPT); } -/* - * Set the IP6 multicast options in response to user setsockopt(). - */ -static int -ip6_setmoptions( - int optname, - struct inpcb* in6p, - struct mbuf *m) +void +ip6_moptions_init(void) { - int error = 0; - u_int loop, ifindex; - struct ipv6_mreq *mreq; - struct ifnet *ifp; - struct ip6_moptions **im6op = &in6p->in6p_moptions; - struct ip6_moptions *im6o = *im6op; - struct ip_moptions *imo; - struct route_in6 ro; - struct sockaddr_in6 *dst; - struct in6_multi_mship *imm; - - if (im6o == NULL) { - /* - * No multicast option buffer attached to the pcb; - * allocate one and initialize to default values. - */ - im6o = (struct ip6_moptions *) - _MALLOC(sizeof(*im6o), M_IPMOPTS, M_WAITOK); - - if (im6o == NULL) - return(ENOBUFS); - *im6op = im6o; - im6o->im6o_multicast_ifp = NULL; - im6o->im6o_multicast_hlim = ip6_defmcasthlim; - im6o->im6o_multicast_loop = IPV6_DEFAULT_MULTICAST_LOOP; - LIST_INIT(&im6o->im6o_memberships); - } - - if (in6p->inp_moptions == NULL) { - /* - * No IPv4 multicast option buffer attached to the pcb; - * call ip_createmoptions to allocate one and initialize - * to default values. - */ - error = ip_createmoptions(&in6p->inp_moptions); - if (error != 0) - return error; - } - imo = in6p->inp_moptions; - - switch (optname) { - - case IPV6_MULTICAST_IF: - /* - * Select the interface for outgoing multicast packets. - */ - if (m == NULL || m->m_len != sizeof(u_int)) { - error = EINVAL; - break; - } - bcopy(mtod(m, u_int *), &ifindex, sizeof(ifindex)); - - ifnet_head_lock_shared(); - /* Don't need to check is ifindex is < 0 since it's unsigned */ - if (if_index < ifindex) { - error = ENXIO; /* XXX EINVAL? */ - ifnet_head_done(); - break; - } - ifp = ifindex2ifnet[ifindex]; - ifnet_head_done(); - if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { - error = EADDRNOTAVAIL; - break; - } - im6o->im6o_multicast_ifp = ifp; - imo->imo_multicast_ifp = ifp; - break; - - case IPV6_MULTICAST_HOPS: - { - /* - * Set the IP6 hoplimit for outgoing multicast packets. - */ - int optval; - if (m == NULL || m->m_len != sizeof(int)) { - error = EINVAL; - break; - } - bcopy(mtod(m, u_int *), &optval, sizeof(optval)); - if (optval < -1 || optval >= 256) - error = EINVAL; - else if (optval == -1) { - im6o->im6o_multicast_hlim = ip6_defmcasthlim; - imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; - } else { - im6o->im6o_multicast_hlim = optval; - imo->imo_multicast_ttl = optval; - } - break; - } - - case IPV6_MULTICAST_LOOP: - /* - * Set the loopback flag for outgoing multicast packets. - * Must be zero or one. 
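 *
 * Hypothetical userland sketch of the three basic multicast
 * knobs (types per RFC 3542: u_int for the interface index and
 * the loopback flag, int for the hop limit; "en0" is assumed):
 *
 *	u_int idx = if_nametoindex("en0");
 *	int hops = 64;
 *	u_int loop = 0;
 *
 *	(void) setsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_IF,
 *	    &idx, sizeof (idx));
 *	(void) setsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_HOPS,
 *	    &hops, sizeof (hops));
 *	(void) setsockopt(s, IPPROTO_IPV6, IPV6_MULTICAST_LOOP,
 *	    &loop, sizeof (loop));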
- */ - if (m == NULL || m->m_len != sizeof(u_int)) { - error = EINVAL; - break; - } - bcopy(mtod(m, u_int *), &loop, sizeof(loop)); - if (loop > 1) { - error = EINVAL; - break; - } - im6o->im6o_multicast_loop = loop; - imo->imo_multicast_loop = loop; - break; - - case IPV6_JOIN_GROUP: - /* - * Add a multicast group membership. - * Group must be a valid IP6 multicast address. - */ - if (m == NULL || m->m_len != sizeof(struct ipv6_mreq)) { - error = EINVAL; - break; - } - mreq = mtod(m, struct ipv6_mreq *); - /* - * If the interface is specified, validate it. - * - * Don't need to check if it's < 0, since it's unsigned - */ - ifnet_head_lock_shared(); - if (if_index < mreq->ipv6mr_interface) { - ifnet_head_done(); - error = ENXIO; /* XXX EINVAL? */ - break; - } - ifp = ifindex2ifnet[mreq->ipv6mr_interface]; - ifnet_head_done(); - - if (IN6_IS_ADDR_UNSPECIFIED(&mreq->ipv6mr_multiaddr)) { - /* - * We use the unspecified address to specify to accept - * all multicast addresses. Only super user is allowed - * to do this. - */ - if (suser(kauth_cred_get(), 0)) - { - error = EACCES; - break; - } - } else if (IN6_IS_ADDR_V4MAPPED(&mreq->ipv6mr_multiaddr)) { - struct ip_mreq v4req; - - v4req.imr_multiaddr.s_addr = mreq->ipv6mr_multiaddr.s6_addr32[3]; - v4req.imr_interface.s_addr = INADDR_ANY; - - /* Find an IPv4 address on the specified interface. */ - if (mreq->ipv6mr_interface != 0) { - struct in_ifaddr *ifa; - - lck_rw_lock_shared(in_ifaddr_rwlock); - TAILQ_FOREACH(ifa, &in_ifaddrhead, ia_link) { - if (ifa->ia_ifp == ifp) { - v4req.imr_interface = IA_SIN(ifa)->sin_addr; - break; - } - } - lck_rw_done(in_ifaddr_rwlock); - - if (v4req.imr_multiaddr.s_addr == 0) { - /* Interface has no IPv4 address. */ - error = EINVAL; - break; - } - } - - error = ip_addmembership(imo, &v4req); - break; - } else if (!IN6_IS_ADDR_MULTICAST(&mreq->ipv6mr_multiaddr)) { - error = EINVAL; - break; - } - /* - * If no interface was explicitly specified, choose an - * appropriate one according to the given multicast address. - */ - if (mreq->ipv6mr_interface == 0) { - /* - * If the multicast address is in node-local scope, - * the interface should be a loopback interface. - * Otherwise, look up the routing table for the - * address, and choose the outgoing interface. - * XXX: is it a good approach? - */ - if (IN6_IS_ADDR_MC_NODELOCAL(&mreq->ipv6mr_multiaddr)) { - ifp = lo_ifp; - } else { - ro.ro_rt = NULL; - dst = (struct sockaddr_in6 *)&ro.ro_dst; - bzero(dst, sizeof(*dst)); - dst->sin6_len = sizeof(struct sockaddr_in6); - dst->sin6_family = AF_INET6; - dst->sin6_addr = mreq->ipv6mr_multiaddr; - rtalloc((struct route *)&ro); - if (ro.ro_rt == NULL) { - error = EADDRNOTAVAIL; - break; - } - ifp = ro.ro_rt->rt_ifp; - rtfree(ro.ro_rt); - ro.ro_rt = NULL; - } - } + PE_parse_boot_argn("ifa_debug", &im6o_debug, sizeof (im6o_debug)); - /* - * See if we found an interface, and confirm that it - * supports multicast - */ - if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) { - error = EADDRNOTAVAIL; - break; - } - /* - * Put interface index into the multicast address, - * if the address has link-local scope. - */ - if (IN6_IS_ADDR_MC_LINKLOCAL(&mreq->ipv6mr_multiaddr)) { - mreq->ipv6mr_multiaddr.s6_addr16[1] - = htons(mreq->ipv6mr_interface); - } - /* - * See if the membership already exists. 
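 *
 * Hypothetical userland sketch of the protocol-independent join
 * (RFC 3678) that this change now routes through
 * ip6_setmoptions(); the group address and interface name are
 * assumptions:
 *
 *	struct group_req greq;
 *	struct sockaddr_in6 *gsa = (struct sockaddr_in6 *)&greq.gr_group;
 *
 *	bzero(&greq, sizeof (greq));
 *	greq.gr_interface = if_nametoindex("en0");
 *	gsa->sin6_family = AF_INET6;
 *	gsa->sin6_len = sizeof (*gsa);
 *	(void) inet_pton(AF_INET6, "ff02::114", &gsa->sin6_addr);
 *	(void) setsockopt(s, IPPROTO_IPV6, MCAST_JOIN_GROUP,
 *	    &greq, sizeof (greq));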
- */ - lck_mtx_lock(nd6_mutex); - for (imm = im6o->im6o_memberships.lh_first; - imm != NULL; imm = imm->i6mm_chain.le_next) - if (imm->i6mm_maddr->in6m_ifp == ifp && - IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr, - &mreq->ipv6mr_multiaddr)) - break; - if (imm != NULL) { - error = EADDRINUSE; - lck_mtx_unlock(nd6_mutex); - break; - } - /* - * Everything looks good; add a new record to the multicast - * address list for the given interface. - */ - imm = _MALLOC(sizeof(*imm), M_IPMADDR, M_WAITOK); - if (imm == NULL) { - error = ENOBUFS; - lck_mtx_unlock(nd6_mutex); - break; - } - if ((imm->i6mm_maddr = - in6_addmulti(&mreq->ipv6mr_multiaddr, ifp, &error, 1)) == NULL) { - FREE(imm, M_IPMADDR); - lck_mtx_unlock(nd6_mutex); - break; - } - LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain); - lck_mtx_unlock(nd6_mutex); - break; - - case IPV6_LEAVE_GROUP: - /* - * Drop a multicast group membership. - * Group must be a valid IP6 multicast address. - */ - if (m == NULL || m->m_len != sizeof(struct ipv6_mreq)) { - error = EINVAL; - break; - } - mreq = mtod(m, struct ipv6_mreq *); - /* - * If an interface address was specified, get a pointer - * to its ifnet structure. - * - * Don't need to check if it's < 0, since it's unsigned. - */ - ifnet_head_lock_shared(); - if (if_index < mreq->ipv6mr_interface) { - ifnet_head_done(); - error = ENXIO; /* XXX EINVAL? */ - break; - } - ifp = ifindex2ifnet[mreq->ipv6mr_interface]; - ifnet_head_done(); - - if (IN6_IS_ADDR_UNSPECIFIED(&mreq->ipv6mr_multiaddr)) { - if (suser(kauth_cred_get(), 0)) { - error = EACCES; - break; - } - } else if (IN6_IS_ADDR_V4MAPPED(&mreq->ipv6mr_multiaddr)) { - struct ip_mreq v4req; - - v4req.imr_multiaddr.s_addr = mreq->ipv6mr_multiaddr.s6_addr32[3]; - v4req.imr_interface.s_addr = INADDR_ANY; - - if (ifp != NULL) { - struct in_ifaddr *ifa; - - lck_rw_lock_shared(in_ifaddr_rwlock); - TAILQ_FOREACH(ifa, &in_ifaddrhead, ia_link) { - if (ifa->ia_ifp == ifp) { - v4req.imr_interface = IA_SIN(ifa)->sin_addr; - break; - } - } - lck_rw_done(in_ifaddr_rwlock); - } - - error = ip_dropmembership(imo, &v4req); - break; - } else if (!IN6_IS_ADDR_MULTICAST(&mreq->ipv6mr_multiaddr)) { - error = EINVAL; - break; - } - /* - * Put interface index into the multicast address, - * if the address has link-local scope. - */ - if (IN6_IS_ADDR_MC_LINKLOCAL(&mreq->ipv6mr_multiaddr)) { - mreq->ipv6mr_multiaddr.s6_addr16[1] - = htons(mreq->ipv6mr_interface); - } - /* - * Find the membership in the membership list. - */ - lck_mtx_lock(nd6_mutex); - for (imm = im6o->im6o_memberships.lh_first; - imm != NULL; imm = imm->i6mm_chain.le_next) { - if ((ifp == NULL || - imm->i6mm_maddr->in6m_ifp == ifp) && - IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr, - &mreq->ipv6mr_multiaddr)) - break; - } - if (imm == NULL) { - /* Unable to resolve interface */ - error = EADDRNOTAVAIL; - lck_mtx_unlock(nd6_mutex); - break; - } - /* - * Give up the multicast address record to which the - * membership points. - */ - LIST_REMOVE(imm, i6mm_chain); - in6_delmulti(imm->i6mm_maddr, 1); - lck_mtx_unlock(nd6_mutex); - FREE(imm, M_IPMADDR); - break; - - default: - error = EOPNOTSUPP; - break; + im6o_size = (im6o_debug == 0) ? 
sizeof (struct ip6_moptions) : + sizeof (struct ip6_moptions_dbg); + + im6o_zone = zinit(im6o_size, IM6O_ZONE_MAX * im6o_size, 0, + IM6O_ZONE_NAME); + if (im6o_zone == NULL) { + panic("%s: failed allocating %s", __func__, IM6O_ZONE_NAME); + /* NOTREACHED */ } + zone_change(im6o_zone, Z_EXPAND, TRUE); +} - /* - * If all options have default values, no need to keep the mbuf. - */ - lck_mtx_lock(nd6_mutex); - if (im6o->im6o_multicast_ifp == NULL && - im6o->im6o_multicast_hlim == ip6_defmcasthlim && - im6o->im6o_multicast_loop == IPV6_DEFAULT_MULTICAST_LOOP && - im6o->im6o_memberships.lh_first == NULL) { - FREE(*im6op, M_IPMOPTS); - *im6op = NULL; - } - if (imo->imo_multicast_ifp == NULL && - imo->imo_multicast_vif == -1 && - imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL && - imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP && - imo->imo_num_memberships == 0) { - ip_freemoptions(imo); - in6p->inp_moptions = 0; - } - lck_mtx_unlock(nd6_mutex); +void +im6o_addref(struct ip6_moptions *im6o, int locked) +{ + if (!locked) + IM6O_LOCK(im6o); + else + IM6O_LOCK_ASSERT_HELD(im6o); - return(error); + if (++im6o->im6o_refcnt == 0) { + panic("%s: im6o %p wraparound refcnt\n", __func__, im6o); + /* NOTREACHED */ + } else if (im6o->im6o_trace != NULL) { + (*im6o->im6o_trace)(im6o, TRUE); + } + + if (!locked) + IM6O_UNLOCK(im6o); } -/* - * Return the IP6 multicast options in response to user getsockopt(). - */ -static int -ip6_getmoptions(optname, im6o, mp) - int optname; - struct ip6_moptions *im6o; - struct mbuf **mp; +void +im6o_remref(struct ip6_moptions *im6o) { - u_int *hlim, *loop, *ifindex; + int i; - *mp = m_get(M_WAIT, MT_HEADER); /*XXX*/ - if (*mp == NULL) - return ENOBUFS; + IM6O_LOCK(im6o); + if (im6o->im6o_refcnt == 0) { + panic("%s: im6o %p negative refcnt", __func__, im6o); + /* NOTREACHED */ + } else if (im6o->im6o_trace != NULL) { + (*im6o->im6o_trace)(im6o, FALSE); + } - switch (optname) { + --im6o->im6o_refcnt; + if (im6o->im6o_refcnt > 0) { + IM6O_UNLOCK(im6o); + return; + } - case IPV6_MULTICAST_IF: - ifindex = mtod(*mp, u_int *); - (*mp)->m_len = sizeof(u_int); - if (im6o == NULL || im6o->im6o_multicast_ifp == NULL) - *ifindex = 0; - else - *ifindex = im6o->im6o_multicast_ifp->if_index; - return(0); + for (i = 0; i < im6o->im6o_num_memberships; ++i) { + struct in6_mfilter *imf; - case IPV6_MULTICAST_HOPS: - hlim = mtod(*mp, u_int *); - (*mp)->m_len = sizeof(u_int); - if (im6o == NULL) - *hlim = ip6_defmcasthlim; - else - *hlim = im6o->im6o_multicast_hlim; - return(0); + imf = im6o->im6o_mfilters ? 
&im6o->im6o_mfilters[i] : NULL; + if (imf != NULL) + im6f_leave(imf); - case IPV6_MULTICAST_LOOP: - loop = mtod(*mp, u_int *); - (*mp)->m_len = sizeof(u_int); - if (im6o == NULL) - *loop = ip6_defmcasthlim; - else - *loop = im6o->im6o_multicast_loop; - return(0); + (void) in6_mc_leave(im6o->im6o_membership[i], imf); - default: - return(EOPNOTSUPP); + if (imf != NULL) + im6f_purge(imf); + + IN6M_REMREF(im6o->im6o_membership[i]); + im6o->im6o_membership[i] = NULL; + } + im6o->im6o_num_memberships = 0; + if (im6o->im6o_mfilters != NULL) { + FREE(im6o->im6o_mfilters, M_IN6MFILTER); + im6o->im6o_mfilters = NULL; + } + if (im6o->im6o_membership != NULL) { + FREE(im6o->im6o_membership, M_IP6MOPTS); + im6o->im6o_membership = NULL; + } + IM6O_UNLOCK(im6o); + + lck_mtx_destroy(&im6o->im6o_lock, ifa_mtx_grp); + + if (!(im6o->im6o_debug & IFD_ALLOC)) { + panic("%s: im6o %p cannot be freed", __func__, im6o); + /* NOTREACHED */ } + zfree(im6o_zone, im6o); } -/* - * Discard the IP6 multicast options. - */ -void -ip6_freemoptions(im6o) - struct ip6_moptions *im6o; +static void +im6o_trace(struct ip6_moptions *im6o, int refhold) { - struct in6_multi_mship *imm; + struct ip6_moptions_dbg *im6o_dbg = (struct ip6_moptions_dbg *)im6o; + ctrace_t *tr; + u_int32_t idx; + u_int16_t *cnt; - if (im6o == NULL) - return; - - lck_mtx_lock(nd6_mutex); - while ((imm = im6o->im6o_memberships.lh_first) != NULL) { - LIST_REMOVE(imm, i6mm_chain); - if (imm->i6mm_maddr) - in6_delmulti(imm->i6mm_maddr, 1); - FREE(imm, M_IPMADDR); - } - lck_mtx_unlock(nd6_mutex); - FREE(im6o, M_IPMOPTS); + if (!(im6o->im6o_debug & IFD_DEBUG)) { + panic("%s: im6o %p has no debug structure", __func__, im6o); + /* NOTREACHED */ + } + if (refhold) { + cnt = &im6o_dbg->im6o_refhold_cnt; + tr = im6o_dbg->im6o_refhold; + } else { + cnt = &im6o_dbg->im6o_refrele_cnt; + tr = im6o_dbg->im6o_refrele; + } + + idx = atomic_add_16_ov(cnt, 1) % IM6O_TRACE_HIST_SIZE; + ctrace_record(&tr[idx]); +} + +struct ip6_moptions * +ip6_allocmoptions(int how) +{ + struct ip6_moptions *im6o; + + im6o = (how == M_WAITOK) ? + zalloc(im6o_zone) : zalloc_noblock(im6o_zone); + if (im6o != NULL) { + bzero(im6o, im6o_size); + lck_mtx_init(&im6o->im6o_lock, ifa_mtx_grp, ifa_mtx_attr); + im6o->im6o_debug |= IFD_ALLOC; + if (im6o_debug != 0) { + im6o->im6o_debug |= IFD_DEBUG; + im6o->im6o_trace = im6o_trace; + } + IM6O_ADDREF(im6o); + } + + return (im6o); } /* * Set IPv6 outgoing packet options based on advanced API. */ int -ip6_setpktoptions(control, opt, priv, needcopy) - struct mbuf *control; - struct ip6_pktopts *opt; - int priv, needcopy; +ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt, + struct ip6_pktopts *stickyopt, int uproto) { struct cmsghdr *cm = 0; - if (control == 0 || opt == 0) - return(EINVAL); + if (control == NULL || opt == NULL) + return (EINVAL); ip6_initpktopts(opt); + if (stickyopt) { + int error; + + /* + * If stickyopt is provided, make a local copy of the options + * for this particular packet, then override them by ancillary + * objects. + * XXX: copypktopts() does not copy the cached route to a next + * hop (if any). This is not very good in terms of efficiency, + * but we can allow this since this option should be rarely + * used. + */ + if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0) + return (error); + } /* * XXX: Currently, we assume all the optional information is stored * in a single mbuf. 
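 *
 * Hypothetical userland sketch of what such a control chain
 * looks like; since everything must fit in one mbuf, the
 * aggregate ancillary data should stay small:
 *
 *	struct in6_pktinfo pi;
 *	char cbuf[CMSG_SPACE(sizeof (pi))];
 *	struct msghdr mh;
 *	struct cmsghdr *cm;
 *
 *	bzero(&pi, sizeof (pi));
 *	pi.ipi6_ifindex = if_nametoindex("en0");	/* assumed name */
 *	bzero(&mh, sizeof (mh));
 *	mh.msg_control = cbuf;
 *	mh.msg_controllen = sizeof (cbuf);
 *	cm = CMSG_FIRSTHDR(&mh);
 *	cm->cmsg_level = IPPROTO_IPV6;
 *	cm->cmsg_type = IPV6_PKTINFO;
 *	cm->cmsg_len = CMSG_LEN(sizeof (pi));
 *	bcopy(&pi, CMSG_DATA(cm), sizeof (pi));
 *	/* set msg_name/msg_iov, then sendmsg(s, &mh, 0) */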
*/ if (control->m_next) - return(EINVAL); + return (EINVAL); - for (; control->m_len; control->m_data += CMSG_ALIGN(cm->cmsg_len), - control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { - cm = mtod(control, struct cmsghdr *); - if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) - return(EINVAL); + if (control->m_len < CMSG_LEN(0)) + return (EINVAL); + + for (cm = M_FIRST_CMSGHDR(control); cm; cm = M_NXT_CMSGHDR(control, cm)) { + int error; + + if (cm->cmsg_len < sizeof(struct cmsghdr) || cm->cmsg_len > control->m_len) + return (EINVAL); if (cm->cmsg_level != IPPROTO_IPV6) continue; + error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm), + cm->cmsg_len - CMSG_LEN(0), opt, 0, 1, uproto); + if (error) + return (error); + } + + return (0); +} +/* + * Set a particular packet option, as a sticky option or an ancillary data + * item. "len" can be 0 only when it's a sticky option. + * We have 4 cases of combination of "sticky" and "cmsg": + * "sticky=0, cmsg=0": impossible + * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data + * "sticky=1, cmsg=0": RFC3542 socket option + * "sticky=1, cmsg=1": RFC2292 socket option + */ +static int +ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, + int sticky, int cmsg, int uproto) +{ + int minmtupolicy, preftemp; + int error; + + if (!sticky && !cmsg) { +#ifdef DIAGNOSTIC + printf("ip6_setpktopt: impossible case\n"); +#endif + return (EINVAL); + } + + /* + * IPV6_2292xxx is for backward compatibility to RFC2292, and should + * not be specified in the context of RFC3542. Conversely, + * RFC3542 types should not be specified in the context of RFC2292. + */ + if (!cmsg) { + switch (optname) { + case IPV6_2292PKTINFO: + case IPV6_2292HOPLIMIT: + case IPV6_2292NEXTHOP: + case IPV6_2292HOPOPTS: + case IPV6_2292DSTOPTS: + case IPV6_2292RTHDR: + case IPV6_2292PKTOPTIONS: + return (ENOPROTOOPT); + } + } + if (sticky && cmsg) { + switch (optname) { + case IPV6_PKTINFO: + case IPV6_HOPLIMIT: + case IPV6_NEXTHOP: + case IPV6_HOPOPTS: + case IPV6_DSTOPTS: + case IPV6_RTHDRDSTOPTS: + case IPV6_RTHDR: + case IPV6_USE_MIN_MTU: + case IPV6_DONTFRAG: + case IPV6_TCLASS: + case IPV6_PREFER_TEMPADDR: /* XXX: not an RFC3542 option */ + return (ENOPROTOOPT); + } + } + + switch (optname) { + case IPV6_2292PKTINFO: + case IPV6_PKTINFO: + { + struct ifnet *ifp = NULL; + struct in6_pktinfo *pktinfo; + + if (len != sizeof(struct in6_pktinfo)) + return (EINVAL); + + pktinfo = (struct in6_pktinfo *)buf; + /* - * XXX should check if RFC2292 API is mixed with 2292bis API + * An application can clear any sticky IPV6_PKTINFO option by + * doing a "regular" setsockopt with ipi6_addr being + * in6addr_any and ipi6_ifindex being zero. + * [RFC 3542, Section 6] */ - switch (cm->cmsg_type) { - case IPV6_PKTINFO: - if (cm->cmsg_len != CMSG_LEN(sizeof(struct in6_pktinfo))) - return(EINVAL); - if (needcopy) { - /* XXX: Is it really WAITOK? 
*/ - opt->ip6po_pktinfo = - _MALLOC(sizeof(struct in6_pktinfo), - M_IP6OPT, M_WAITOK); - if (opt->ip6po_pktinfo == NULL) - return ENOBUFS; - bcopy(CMSG_DATA(cm), opt->ip6po_pktinfo, - sizeof(struct in6_pktinfo)); - } else - opt->ip6po_pktinfo = - (struct in6_pktinfo *)CMSG_DATA(cm); - if (opt->ip6po_pktinfo->ipi6_ifindex && - IN6_IS_ADDR_LINKLOCAL(&opt->ip6po_pktinfo->ipi6_addr)) - opt->ip6po_pktinfo->ipi6_addr.s6_addr16[1] = - htons(opt->ip6po_pktinfo->ipi6_ifindex); - - if (opt->ip6po_pktinfo->ipi6_ifindex > if_index) { - return(ENXIO); - } + if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo && + pktinfo->ipi6_ifindex == 0 && + IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { + ip6_clearpktopts(opt, optname); + break; + } - /* - * Check if the requested source address is indeed a - * unicast address assigned to the node, and can be - * used as the packet's source address. - */ - if (!IN6_IS_ADDR_UNSPECIFIED(&opt->ip6po_pktinfo->ipi6_addr)) { - struct in6_ifaddr *ia6; - struct sockaddr_in6 sin6; - - bzero(&sin6, sizeof(sin6)); - sin6.sin6_len = sizeof(sin6); - sin6.sin6_family = AF_INET6; - sin6.sin6_addr = - opt->ip6po_pktinfo->ipi6_addr; - ia6 = (struct in6_ifaddr *)ifa_ifwithaddr(sin6tosa(&sin6)); - if (ia6 == NULL || - (ia6->ia6_flags & (IN6_IFF_ANYCAST | - IN6_IFF_NOTREADY)) != 0) { - if (ia6) ifafree(&ia6->ia_ifa); - return(EADDRNOTAVAIL); - } - ifafree(&ia6->ia_ifa); - ia6 = NULL; + if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO && + sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { + return (EINVAL); + } + + /* validate the interface index if specified. */ + ifnet_head_lock_shared(); + + if (pktinfo->ipi6_ifindex > if_index) { + ifnet_head_done(); + return (ENXIO); + } + + if (pktinfo->ipi6_ifindex) { + ifp = ifindex2ifnet[pktinfo->ipi6_ifindex]; + if (ifp == NULL) { + ifnet_head_done(); + return (ENXIO); } - break; + } + + ifnet_head_done(); - case IPV6_HOPLIMIT: - if (cm->cmsg_len != CMSG_LEN(sizeof(int))) - return(EINVAL); + /* + * We store the address anyway, and let in6_selectsrc() + * validate the specified address. This is because ipi6_addr + * may not have enough information about its scope zone, and + * we may need additional information (such as outgoing + * interface or the scope zone of a destination address) to + * disambiguate the scope. + * XXX: the delay of the validation may confuse the + * application when it is used as a sticky option. + */ + if (opt->ip6po_pktinfo == NULL) { + opt->ip6po_pktinfo = _MALLOC(sizeof(*pktinfo), + M_IP6OPT, M_NOWAIT); + if (opt->ip6po_pktinfo == NULL) + return (ENOBUFS); + } + bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo)); + break; + } - opt->ip6po_hlim = *(int *)CMSG_DATA(cm); - if (opt->ip6po_hlim < -1 || opt->ip6po_hlim > 255) - return(EINVAL); - break; + case IPV6_2292HOPLIMIT: + case IPV6_HOPLIMIT: + { + int *hlimp; - case IPV6_TCLASS: - if (cm->cmsg_len != CMSG_LEN(sizeof(int))) - return(EINVAL); + /* + * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT + * to simplify the ordering among hoplimit options. 
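+ *
+ * A sticky hop limit is set with IPV6_UNICAST_HOPS instead;
+ * IPV6_HOPLIMIT stays valid as per-packet ancillary data.
+ * Hypothetical userland sketch:
+ *
+ *	int hlim = 32;
+ *
+ *	(void) setsockopt(s, IPPROTO_IPV6, IPV6_UNICAST_HOPS,
+ *	    &hlim, sizeof (hlim));	/* sticky */
+ *	/* per-packet: pass hlim in a cmsghdr of type
+ *	   IPV6_HOPLIMIT via sendmsg() */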
+ */ + if (optname == IPV6_HOPLIMIT && sticky) + return (ENOPROTOOPT); - opt->ip6po_tclass = *(int *)CMSG_DATA(cm); - if (opt->ip6po_tclass < -1 || opt->ip6po_tclass > 255) - return (EINVAL); - break; + if (len != sizeof(int)) + return (EINVAL); + hlimp = (int *)buf; + if (*hlimp < -1 || *hlimp > 255) + return (EINVAL); - case IPV6_NEXTHOP: - if (!priv) - return(EPERM); - - if (cm->cmsg_len < sizeof(u_char) || - /* check if cmsg_len is large enough for sa_len */ - cm->cmsg_len < CMSG_LEN(*CMSG_DATA(cm))) - return(EINVAL); - - if (needcopy) { - opt->ip6po_nexthop = - _MALLOC(*CMSG_DATA(cm), - M_IP6OPT, M_WAITOK); - if (opt->ip6po_nexthop == NULL) - return ENOBUFS; - bcopy(CMSG_DATA(cm), - opt->ip6po_nexthop, - *CMSG_DATA(cm)); - } else - opt->ip6po_nexthop = - (struct sockaddr *)CMSG_DATA(cm); + opt->ip6po_hlim = *hlimp; + break; + } + + case IPV6_TCLASS: + { + int tclass; + + if (len != sizeof(int)) + return (EINVAL); + tclass = *(int *)buf; + if (tclass < -1 || tclass > 255) + return (EINVAL); + + opt->ip6po_tclass = tclass; + break; + } + + case IPV6_2292NEXTHOP: + case IPV6_NEXTHOP: + error = suser(kauth_cred_get(), 0); + if (error) + return (EACCES); + + if (len == 0) { /* just remove the option */ + ip6_clearpktopts(opt, IPV6_NEXTHOP); break; + } - case IPV6_HOPOPTS: + /* check if cmsg_len is large enough for sa_len */ + if (len < sizeof(struct sockaddr) || len < *buf) + return (EINVAL); + + switch (((struct sockaddr *)buf)->sa_family) { + case AF_INET6: { - struct ip6_hbh *hbh; - int hbhlen; - - if (cm->cmsg_len < CMSG_LEN(sizeof(struct ip6_hbh))) - return(EINVAL); - hbh = (struct ip6_hbh *)CMSG_DATA(cm); - hbhlen = (hbh->ip6h_len + 1) << 3; - if (cm->cmsg_len != CMSG_LEN(hbhlen)) - return(EINVAL); - - if (needcopy) { - opt->ip6po_hbh = - _MALLOC(hbhlen, M_IP6OPT, M_WAITOK); - if (opt->ip6po_hbh == NULL) - return ENOBUFS; - bcopy(hbh, opt->ip6po_hbh, hbhlen); - } else - opt->ip6po_hbh = hbh; + struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf; + + if (sa6->sin6_len != sizeof(struct sockaddr_in6)) + return (EINVAL); + + if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) || + IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) { + return (EINVAL); + } + if ((error = sa6_embedscope(sa6, ip6_use_defzone)) + != 0) { + return (error); + } break; } + case AF_LINK: /* should eventually be supported */ + default: + return (EAFNOSUPPORT); + } - case IPV6_DSTOPTS: - { - struct ip6_dest *dest, **newdest; - int destlen; - - if (cm->cmsg_len < CMSG_LEN(sizeof(struct ip6_dest))) - return(EINVAL); - dest = (struct ip6_dest *)CMSG_DATA(cm); - destlen = (dest->ip6d_len + 1) << 3; - if (cm->cmsg_len != CMSG_LEN(destlen)) - return(EINVAL); - - /* - * The old advacned API is ambiguous on this - * point. Our approach is to determine the - * position based according to the existence - * of a routing header. Note, however, that - * this depends on the order of the extension - * headers in the ancillary data; the 1st part - * of the destination options header must - * appear before the routing header in the - * ancillary data, too. - * RFC2292bis solved the ambiguity by - * introducing separate cmsg types. + /* turn off the previous option, then set the new option. 
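 *
 * Hypothetical userland sketch for IPV6_NEXTHOP (privileged; a
 * zero-length value clears the option, as handled above; the
 * address and interface name are assumptions):
 *
 *	struct sockaddr_in6 nh;
 *
 *	bzero(&nh, sizeof (nh));
 *	nh.sin6_family = AF_INET6;
 *	nh.sin6_len = sizeof (nh);
 *	nh.sin6_scope_id = if_nametoindex("en0");
 *	(void) inet_pton(AF_INET6, "fe80::1", &nh.sin6_addr);
 *	(void) setsockopt(s, IPPROTO_IPV6, IPV6_NEXTHOP,
 *	    &nh, sizeof (nh));
 *
 * sa6_embedscope() above folds sin6_scope_id into a link-local
 * address before the sockaddr is stored.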
*/ + ip6_clearpktopts(opt, IPV6_NEXTHOP); + opt->ip6po_nexthop = _MALLOC(*buf, M_IP6OPT, M_NOWAIT); + if (opt->ip6po_nexthop == NULL) + return (ENOBUFS); + bcopy(buf, opt->ip6po_nexthop, *buf); + break; + + case IPV6_2292HOPOPTS: + case IPV6_HOPOPTS: + { + struct ip6_hbh *hbh; + int hbhlen; + + /* + * XXX: We don't allow a non-privileged user to set ANY HbH + * options, since per-option restriction has too much + * overhead. + */ + error = suser(kauth_cred_get(), 0); + if (error) + return (EACCES); + + if (len == 0) { + ip6_clearpktopts(opt, IPV6_HOPOPTS); + break; /* just remove the option */ + } + + /* message length validation */ + if (len < sizeof(struct ip6_hbh)) + return (EINVAL); + hbh = (struct ip6_hbh *)buf; + hbhlen = (hbh->ip6h_len + 1) << 3; + if (len != hbhlen) + return (EINVAL); + + /* turn off the previous option, then set the new option. */ + ip6_clearpktopts(opt, IPV6_HOPOPTS); + opt->ip6po_hbh = _MALLOC(hbhlen, M_IP6OPT, M_NOWAIT); + if (opt->ip6po_hbh == NULL) + return (ENOBUFS); + bcopy(hbh, opt->ip6po_hbh, hbhlen); + + break; + } + + case IPV6_2292DSTOPTS: + case IPV6_DSTOPTS: + case IPV6_RTHDRDSTOPTS: + { + struct ip6_dest *dest, **newdest = NULL; + int destlen; + + error = suser(kauth_cred_get(), 0); + if (error) + return (EACCES); + + if (len == 0) { + ip6_clearpktopts(opt, optname); + break; /* just remove the option */ + } + + /* message length validation */ + if (len < sizeof(struct ip6_dest)) + return (EINVAL); + dest = (struct ip6_dest *)buf; + destlen = (dest->ip6d_len + 1) << 3; + if (len != destlen) + return (EINVAL); + + /* + * Determine the position that the destination options header + * should be inserted; before or after the routing header. + */ + switch (optname) { + case IPV6_2292DSTOPTS: + /* + * The old advacned API is ambiguous on this point. + * Our approach is to determine the position based + * according to the existence of a routing header. + * Note, however, that this depends on the order of the + * extension headers in the ancillary data; the 1st + * part of the destination options header must appear + * before the routing header in the ancillary data, + * too. + * RFC3542 solved the ambiguity by introducing + * separate ancillary data or option types. */ if (opt->ip6po_rthdr == NULL) newdest = &opt->ip6po_dest1; else newdest = &opt->ip6po_dest2; - - if (needcopy) { - *newdest = _MALLOC(destlen, M_IP6OPT, M_WAITOK); - if (*newdest == NULL) - return ENOBUFS; - bcopy(dest, *newdest, destlen); - } else - *newdest = dest; - + break; + case IPV6_RTHDRDSTOPTS: + newdest = &opt->ip6po_dest1; + break; + case IPV6_DSTOPTS: + newdest = &opt->ip6po_dest2; break; } - case IPV6_RTHDR: - { - struct ip6_rthdr *rth; - int rthlen; - - if (cm->cmsg_len < CMSG_LEN(sizeof(struct ip6_rthdr))) - return(EINVAL); - rth = (struct ip6_rthdr *)CMSG_DATA(cm); - rthlen = (rth->ip6r_len + 1) << 3; - if (cm->cmsg_len != CMSG_LEN(rthlen)) - return(EINVAL); - - switch (rth->ip6r_type) { - case IPV6_RTHDR_TYPE_0: - /* must contain one addr */ - if (rth->ip6r_len == 0) - return(EINVAL); - /* length must be even */ - if (rth->ip6r_len % 2) - return(EINVAL); - if (rth->ip6r_len / 2 != rth->ip6r_segleft) - return(EINVAL); - break; - default: - return(EINVAL); /* not supported */ - } + /* turn off the previous option, then set the new option. 
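 *
 * Hypothetical userland sketch: the smallest destination
 * options header is 8 bytes ((ip6d_len + 1) << 3 with
 * ip6d_len == 0), i.e. the 2-byte fixed part plus a PadN
 * option covering the 6 spare bytes; setting it is privileged,
 * per the suser() check above:
 *
 *	uint8_t dbuf[8];
 *
 *	bzero(dbuf, sizeof (dbuf));
 *	((struct ip6_dest *)dbuf)->ip6d_len = 0;  /* 8 bytes total */
 *	dbuf[2] = IP6OPT_PADN;	/* option type */
 *	dbuf[3] = 4;		/* 4 bytes of option data */
 *	(void) setsockopt(s, IPPROTO_IPV6, IPV6_DSTOPTS,
 *	    dbuf, sizeof (dbuf));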
*/ + ip6_clearpktopts(opt, optname); + *newdest = _MALLOC(destlen, M_IP6OPT, M_NOWAIT); + if (*newdest == NULL) + return (ENOBUFS); + bcopy(dest, *newdest, destlen); - if (needcopy) { - opt->ip6po_rthdr = _MALLOC(rthlen, M_IP6OPT, - M_WAITOK); - if (opt->ip6po_rthdr == NULL) - return ENOBUFS; - bcopy(rth, opt->ip6po_rthdr, rthlen); - } else - opt->ip6po_rthdr = rth; + break; + } - break; + case IPV6_2292RTHDR: + case IPV6_RTHDR: + { + struct ip6_rthdr *rth; + int rthlen; + + if (len == 0) { + ip6_clearpktopts(opt, IPV6_RTHDR); + break; /* just remove the option */ } + /* message length validation */ + if (len < sizeof(struct ip6_rthdr)) + return (EINVAL); + rth = (struct ip6_rthdr *)buf; + rthlen = (rth->ip6r_len + 1) << 3; + if (len != rthlen) + return (EINVAL); + + switch (rth->ip6r_type) { + case IPV6_RTHDR_TYPE_0: + if (rth->ip6r_len == 0) /* must contain one addr */ + return (EINVAL); + if (rth->ip6r_len % 2) /* length must be even */ + return (EINVAL); + if (rth->ip6r_len / 2 != rth->ip6r_segleft) + return (EINVAL); + break; default: - return(ENOPROTOOPT); + return (EINVAL); /* not supported */ } + + /* turn off the previous option */ + ip6_clearpktopts(opt, IPV6_RTHDR); + opt->ip6po_rthdr = _MALLOC(rthlen, M_IP6OPT, M_NOWAIT); + if (opt->ip6po_rthdr == NULL) + return (ENOBUFS); + bcopy(rth, opt->ip6po_rthdr, rthlen); + + break; } - return(0); + case IPV6_USE_MIN_MTU: + if (len != sizeof(int)) + return (EINVAL); + minmtupolicy = *(int *)buf; + if (minmtupolicy != IP6PO_MINMTU_MCASTONLY && + minmtupolicy != IP6PO_MINMTU_DISABLE && + minmtupolicy != IP6PO_MINMTU_ALL) { + return (EINVAL); + } + opt->ip6po_minmtu = minmtupolicy; + break; + + case IPV6_DONTFRAG: + if (len != sizeof(int)) + return (EINVAL); + + if (uproto == IPPROTO_TCP || *(int *)buf == 0) { + /* + * we ignore this option for TCP sockets. + * (RFC3542 leaves this case unspecified.) + */ + opt->ip6po_flags &= ~IP6PO_DONTFRAG; + } else + opt->ip6po_flags |= IP6PO_DONTFRAG; + break; + + case IPV6_PREFER_TEMPADDR: + if (len != sizeof(int)) + return (EINVAL); + preftemp = *(int *)buf; + if (preftemp != IP6PO_TEMPADDR_SYSTEM && + preftemp != IP6PO_TEMPADDR_NOTPREFER && + preftemp != IP6PO_TEMPADDR_PREFER) { + return (EINVAL); + } + opt->ip6po_prefer_tempaddr = preftemp; + break; + + default: + return (ENOPROTOOPT); + } /* end of switch */ + + return (0); } /* @@ -2927,28 +3386,28 @@ ip6_mloopback( #endif ip6 = mtod(copym, struct ip6_hdr *); -#ifndef SCOPEDROUTING /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); -#endif #ifdef __APPLE__ /* Makes sure the HW checksum flags are cleaned before sending the packet */ + if ((copym->m_pkthdr.csum_flags & CSUM_DELAY_IPV6_DATA) != 0) { + in6_delayed_cksum(copym, sizeof(struct ip6_hdr)); + copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_IPV6_DATA; + } copym->m_pkthdr.rcvif = 0; copym->m_pkthdr.csum_data = 0; copym->m_pkthdr.csum_flags = 0; if (lo_ifp) { copym->m_pkthdr.rcvif = ifp; - lck_mtx_unlock(ip6_mutex); dlil_output(lo_ifp, PF_INET6, copym, 0, (struct sockaddr *)dst, 0); - lck_mtx_lock(ip6_mutex); } else m_free(copym); #else @@ -3002,7 +3461,7 @@ ip6_optlen(in6p) len = 0; #define elen(x) \ - (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0) + (((struct ip6_ext *)(x)) ? 
(((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0) len += elen(in6p->in6p_outputopts->ip6po_hbh); if (in6p->in6p_outputopts->ip6po_rthdr) @@ -3013,4 +3472,3 @@ ip6_optlen(in6p) return len; #undef elen } - diff --git a/bsd/netinet6/ip6_var.h b/bsd/netinet6/ip6_var.h index 9aa8e0e3f..acb9c3857 100644 --- a/bsd/netinet6/ip6_var.h +++ b/bsd/netinet6/ip6_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -97,37 +97,29 @@ #define _NETINET6_IP6_VAR_H_ #include <sys/appleapiopts.h> -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE /* * IP6 reassembly queue structure. Each fragment * being reassembled is attached to one of these structures. */ struct ip6q { - u_int32_t ip6q_head; - u_int16_t ip6q_len; - u_int8_t ip6q_nxt; /* ip6f_nxt in first fragment */ - u_int8_t ip6q_hlim; struct ip6asfrag *ip6q_down; struct ip6asfrag *ip6q_up; u_int32_t ip6q_ident; - u_int8_t ip6q_arrive; + u_int8_t ip6q_nxt; + u_int8_t ip6q_ecn; u_int8_t ip6q_ttl; - struct in6_addr ip6q_src, ip6q_dst; + struct in6_addr ip6q_src, ip6q_dst; struct ip6q *ip6q_next; struct ip6q *ip6q_prev; int ip6q_unfrglen; /* len of unfragmentable part */ #if notyet u_char *ip6q_nxtp; #endif - int ip6q_nfrag; /* number of fragments */ + int ip6q_nfrag; /* # of fragments */ }; struct ip6asfrag { - u_int32_t ip6af_head; - u_int16_t ip6af_len; - u_int8_t ip6af_nxt; - u_int8_t ip6af_hlim; - /* must not override the above members during reassembling */ struct ip6asfrag *ip6af_down; struct ip6asfrag *ip6af_up; struct mbuf *ip6af_m; @@ -140,12 +132,49 @@ struct ip6asfrag { #define IP6_REASS_MBUF(ip6af) (*(struct mbuf **)&((ip6af)->ip6af_m)) struct ip6_moptions { + decl_lck_mtx_data(, im6o_lock); + uint32_t im6o_refcnt; /* ref count */ + uint32_t im6o_debug; /* see ifa_debug flags */ struct ifnet *im6o_multicast_ifp; /* ifp for outgoing multicasts */ u_char im6o_multicast_hlim; /* hoplimit for outgoing multicasts */ u_char im6o_multicast_loop; /* 1 >= hear sends if a member */ - LIST_HEAD(, in6_multi_mship) im6o_memberships; + u_short im6o_num_memberships; /* no. 
memberships this socket */ + u_short im6o_max_memberships; /* max memberships this socket */ + struct in6_multi **im6o_membership; /* group memberships */ + struct in6_mfilter *im6o_mfilters; /* source filters */ + void (*im6o_trace) /* callback fn for tracing refs */ + (struct ip6_moptions *, int); }; +#define IM6O_LOCK_ASSERT_HELD(_im6o) \ + lck_mtx_assert(&(_im6o)->im6o_lock, LCK_MTX_ASSERT_OWNED) + +#define IM6O_LOCK_ASSERT_NOTHELD(_im6o) \ + lck_mtx_assert(&(_im6o)->im6o_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define IM6O_LOCK(_im6o) \ + lck_mtx_lock(&(_im6o)->im6o_lock) + +#define IM6O_LOCK_SPIN(_im6o) \ + lck_mtx_lock_spin(&(_im6o)->im6o_lock) + +#define IM6O_CONVERT_LOCK(_im6o) do { \ + IM6O_LOCK_ASSERT_HELD(_im6o); \ + lck_mtx_convert_spin(&(_im6o)->im6o_lock); \ +} while (0) + +#define IM6O_UNLOCK(_im6o) \ + lck_mtx_unlock(&(_im6o)->im6o_lock) + +#define IM6O_ADDREF(_im6o) \ + im6o_addref(_im6o, 0) + +#define IM6O_ADDREF_LOCKED(_im6o) \ + im6o_addref(_im6o, 1) + +#define IM6O_REMREF(_im6o) \ + im6o_remref(_im6o) + /* * Control options for outgoing packets */ @@ -158,6 +187,14 @@ struct ip6po_rhinfo { #define ip6po_rthdr ip6po_rhinfo.ip6po_rhi_rthdr #define ip6po_route ip6po_rhinfo.ip6po_rhi_route +/* Nexthop related info */ +struct ip6po_nhinfo { + struct sockaddr *ip6po_nhi_nexthop; + struct route_in6 ip6po_nhi_route; /* Route to the nexthop */ +}; +#define ip6po_nexthop ip6po_nhinfo.ip6po_nhi_nexthop +#define ip6po_nextroute ip6po_nhinfo.ip6po_nhi_route + struct ip6_pktopts { struct mbuf *ip6po_m; /* Pointer to mbuf storing the data */ int ip6po_hlim; /* Hoplimit for outgoing packets */ @@ -165,8 +202,9 @@ struct ip6_pktopts { /* Outgoing IF/address information */ struct in6_pktinfo *ip6po_pktinfo; - struct sockaddr *ip6po_nexthop; /* Next-hop address */ - + /* Next-hop address information */ + struct ip6po_nhinfo ip6po_nhinfo; + struct ip6_hbh *ip6po_hbh; /* Hop-by-Hop options header */ /* Destination options header (before a routing header) */ @@ -178,13 +216,32 @@ struct ip6_pktopts { /* Destination options header (after a routing header) */ struct ip6_dest *ip6po_dest2; - int ip6po_tclass; /* traffic class */ + int ip6po_tclass; /* traffic class */ + + int ip6po_minmtu; /* fragment vs PMTU discovery policy */ +#define IP6PO_MINMTU_MCASTONLY -1 /* default; send at min MTU for multicast*/ +#define IP6PO_MINMTU_DISABLE 0 /* always perform pmtu disc */ +#define IP6PO_MINMTU_ALL 1 /* always send at min MTU */ + + int ip6po_prefer_tempaddr; /* whether temporary addresses are + preferred as source address */ +#define IP6PO_TEMPADDR_SYSTEM -1 /* follow the system default */ +#define IP6PO_TEMPADDR_NOTPREFER 0 /* not prefer temporary address */ +#define IP6PO_TEMPADDR_PREFER 1 /* prefer temporary address */ + + int ip6po_flags; +#if 0 /* parameters in this block is obsolete. do not reuse the values. */ +#define IP6PO_REACHCONF 0x01 /* upper-layer reachability confirmation. */ +#define IP6PO_MINMTU 0x02 /* use minimum MTU (IPV6_USE_MIN_MTU) */ +#endif +#define IP6PO_DONTFRAG 0x04 /* disable fragmentation (IPV6_DONTFRAG) */ +#define IP6PO_USECOA 0x08 /* use care of address */ }; /* * Control options for incoming packets */ -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ struct ip6stat { u_quad_t ip6s_total; /* total packets received */ @@ -201,7 +258,7 @@ struct ip6stat { u_quad_t ip6s_localout; /* total ip packets generated here */ u_quad_t ip6s_odropped; /* lost packets due to nobufs, etc. 
*/ u_quad_t ip6s_reassembled; /* total packets reassembled ok */ - u_quad_t ip6s_fragmented; /* datagrams sucessfully fragmented */ + u_quad_t ip6s_fragmented; /* datagrams successfully fragmented */ u_quad_t ip6s_ofragments; /* output fragments created */ u_quad_t ip6s_cantfrag; /* don't fragment flag was set, etc. */ u_quad_t ip6s_badoptions; /* error in option processing */ @@ -240,11 +297,17 @@ struct ip6stat { * from the destination is chosen. */ u_quad_t ip6s_sources_otherscope[16]; - /* number of times that an deprecated address is chosen */ + /* number of times that a deprecated address is chosen */ u_quad_t ip6s_sources_deprecated[16]; u_quad_t ip6s_forward_cachehit; u_quad_t ip6s_forward_cachemiss; + + /* number of times that each rule of source selection is applied. */ + u_quad_t ip6s_sources_rule[16]; +#ifdef PRIVATE + u_quad_t ip6s_pktdropcntrl; /* pkt dropped, no mbufs for control data */ +#endif /* PRIVATE */ }; #ifdef KERNEL_PRIVATE @@ -279,12 +342,28 @@ struct ip6aux { }; /* flags passed to ip6_output as last parameter */ -#define IPV6_DADOUTPUT 0x01 /* DAD */ +#define IPV6_UNSPECSRC 0x01 /* allow :: as the source address */ #define IPV6_FORWARDING 0x02 /* most of IPv6 header exists */ #define IPV6_MINMTU 0x04 /* use minimum MTU (IPV6_USE_MIN_MTU) */ +#define IPV6_FLAG_NOSRCIFSEL 0x80 /* bypas source address selection */ +#define IPV6_OUTARGS 0x100 /* has ancillary output info */ + +#ifdef __NO_STRICT_ALIGNMENT +#define IP6_HDR_ALIGNED_P(ip) 1 +#else +#define IP6_HDR_ALIGNED_P(ip) ((((intptr_t) (ip)) & 3) == 0) +#endif + +/* + * Extra information passed to ip6_output when IP6_OUTARGS is set. + */ +struct ip6_out_args { + unsigned int ip6oa_boundif; /* bound outgoing interface */ + unsigned int ip6oa_nocell; /* don't use IFT_CELLULAR */ +}; extern struct ip6stat ip6stat; /* statistics */ -extern u_int32_t ip6_id; /* fragment identifier */ +extern u_int32_t ip6_id; /* fragment identifier */ extern int ip6_defhlim; /* default hop limit */ extern int ip6_defmcasthlim; /* default multicast hop limit */ extern int ip6_forwarding; /* act as router? */ @@ -293,7 +372,8 @@ extern int ip6_gif_hlim; /* Hop limit for gif encap packet */ extern int ip6_use_deprecated; /* allow deprecated addr as source */ extern int ip6_rr_prune; /* router renumbering prefix * walk list every 5 sec. */ -#define ip6_mapped_addr_on (!ip6_v6only) +extern int ip6_mcast_pmtu; /* enable pMTU discovery for multicast? */ +#define ip6_mapped_addr_on (!ip6_v6only) extern int ip6_v6only; extern int ip6_neighborgcthresh; /* Threshold # of NDP entries for GC */ @@ -304,8 +384,8 @@ extern int ip6_maxdynroutes; /* Max # of routes created via redirect */ extern struct socket *ip6_mrouter; /* multicast routing daemon */ #endif extern int ip6_sendredirects; /* send IP redirects when forwarding? 
*/ -extern int ip6_maxfragpackets; /* Maximum packets in reassembly queue */ -extern int ip6_maxfrags; /* Maximum fragments in reassembly queue */ +extern int ip6_maxfragpackets; /* Maximum packets in reassembly queue */ +extern int ip6_maxfrags; /* Maximum fragments in reassembly queue */ extern int ip6_sourcecheck; /* Verify source interface */ extern int ip6_sourcecheck_interval; /* Interval between log messages */ extern int ip6_accept_rtadv; /* Acts as a host not a router */ @@ -314,6 +394,7 @@ extern int ip6_log_interval; extern time_t ip6_log_time; extern int ip6_hdrnestlimit; /* upper limit of # of extension headers */ extern int ip6_dad_count; /* DupAddrDetectionTransmits */ +extern int ip6_only_allow_rfc4193_prefix; /* RFC4193 Unique Local Unicast Prefixes only */ extern u_int32_t ip6_flow_seq; extern int ip6_auto_flowlabel; @@ -325,9 +406,16 @@ extern int ip6_lowportmin; /* minimum reserved port */ extern int ip6_lowportmax; /* maximum reserved port */ extern int ip6_use_tempaddr; /* whether to use temporary addresses. */ +extern int ip6_prefer_tempaddr; /* whether to prefer temporary addresses + in the source address selection */ +extern int ip6_use_defzone; /* whether to use the default scope zone + when unspecified */ extern struct pr_usrreqs rip6_usrreqs; extern struct pr_usrreqs icmp6_dgram_usrreqs; + +extern int ip6_doscopedroute; + struct sockopt; struct inpcb; @@ -340,51 +428,71 @@ int icmp6_dgram_attach(struct socket *, int , struct proc *); struct in6_ifaddr; void ip6_init(void); +void ip6_fin(void); void ip6_input(struct mbuf *); struct in6_ifaddr *ip6_getdstifaddr(struct mbuf *); void ip6_freepcbopts(struct ip6_pktopts *); -void ip6_freemoptions(struct ip6_moptions *); -int ip6_unknown_opt(u_int8_t *, struct mbuf *, int, int); +int ip6_unknown_opt(u_int8_t *, struct mbuf *, int); char * ip6_get_prevhdr(struct mbuf *, int); int ip6_nexthdr(struct mbuf *, int, int, int *); int ip6_lasthdr(struct mbuf *, int, int, int *); +extern void ip6_moptions_init(void); +extern struct ip6_moptions *ip6_allocmoptions(int); +extern void im6o_addref(struct ip6_moptions *, int); +extern void im6o_remref(struct ip6_moptions *); + struct ip6aux *ip6_addaux(struct mbuf *); struct ip6aux *ip6_findaux(struct mbuf *); void ip6_delaux(struct mbuf *); +extern void ip6_destroyaux(struct ip6aux *); +extern void ip6_copyaux(struct ip6aux *, struct ip6aux *); int ip6_mforward(struct ip6_hdr *, struct ifnet *, struct mbuf *); int ip6_process_hopopts(struct mbuf *, u_int8_t *, int, u_int32_t *, u_int32_t *); -void ip6_savecontrol(struct inpcb *, struct mbuf **, struct ip6_hdr *, - struct mbuf *); -void ip6_forward(struct mbuf *, struct route_in6 *, int, int); - +struct mbuf **ip6_savecontrol_v4(struct inpcb *, struct mbuf *, + struct mbuf **, int *); +int ip6_savecontrol(struct inpcb *, struct mbuf *, struct mbuf **); +void ip6_forward(struct mbuf *, struct route_in6 *, int); +void ip6_notify_pmtu __P((struct inpcb *, struct sockaddr_in6 *, + u_int32_t *)); void ip6_mloopback(struct ifnet *, struct mbuf *, struct sockaddr_in6 *); -int ip6_output(struct mbuf *, struct ip6_pktopts *, - struct route_in6 *, - int, - struct ip6_moptions *, struct ifnet **, int locked); +int ip6_output(struct mbuf *, struct ip6_pktopts *, struct route_in6 *, + int, struct ip6_moptions *, struct ifnet **, + struct ip6_out_args *); int ip6_ctloutput(struct socket *, struct sockopt *sopt); void ip6_initpktopts(struct ip6_pktopts *); int ip6_setpktoptions(struct mbuf *, struct ip6_pktopts *, int, int); -void 
ip6_clearpktopts(struct ip6_pktopts *, int, int); +void ip6_clearpktopts(struct ip6_pktopts *, int); struct ip6_pktopts *ip6_copypktopts(struct ip6_pktopts *, int); int ip6_optlen(struct inpcb *); -int route6_input(struct mbuf **, int *); +int route6_input(struct mbuf **, int *, int); void frag6_init(void); -int frag6_input(struct mbuf **, int *); +int frag6_input(struct mbuf **, int *, int); void frag6_slowtimo(void); void frag6_drain(void); -int rip6_input(struct mbuf **mp, int *offset); +int rip6_input(struct mbuf **, int *, int); void rip6_ctlinput(int, struct sockaddr *, void *); int rip6_ctloutput(struct socket *so, struct sockopt *sopt); -int rip6_output(struct mbuf *, struct socket *, struct sockaddr_in6 *, struct mbuf *); +int rip6_output(struct mbuf *, struct socket *, struct sockaddr_in6 *, struct mbuf *, int); + +int dest6_input(struct mbuf **, int *, int); +extern struct in6_addr *in6_selectsrc(struct sockaddr_in6 *, + struct ip6_pktopts *, struct inpcb *, struct route_in6 *, + struct ifnet **, struct in6_addr *, unsigned int, int *); +extern struct in6_addrpolicy * + in6_addrsel_lookup_policy(struct sockaddr_in6 *); +int in6_selectroute(struct sockaddr_in6 *, struct sockaddr_in6 *, + struct ip6_pktopts *, struct ip6_moptions *, struct route_in6 *, + struct ifnet **, struct rtentry **, int, unsigned int, unsigned int); +int ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt, struct ip6_pktopts *stickyopt, int uproto); +u_int32_t ip6_randomid(void); +u_int32_t ip6_randomflowlabel(void); -int dest6_input(struct mbuf **, int *); #endif /* KERNEL */ #endif /* KERNEL_PRIVATE */ diff --git a/bsd/netinet6/ip6protosw.h b/bsd/netinet6/ip6protosw.h index 303f61964..dbadffc81 100644 --- a/bsd/netinet6/ip6protosw.h +++ b/bsd/netinet6/ip6protosw.h @@ -1,6 +1,6 @@ /* $FreeBSD: src/sys/netinet6/ip6protosw.h,v 1.2.2.3 2001/07/03 11:01:54 ume Exp $ */ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -122,7 +122,7 @@ struct pr_usrreqs; * argument type for the last arg of pr_ctlinput(). * should be consulted only with AF_INET6 family. 
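The ip6_out_args structure and the IPV6_OUTARGS flag introduced above replace the old trailing "locked" parameter of ip6_output() with extensible per-packet output metadata. As a rough sketch of the new calling convention (the wrapper function below is invented for illustration; only ip6_output(), IPV6_OUTARGS, and the ip6oa_* fields come from the declarations above), a caller that wants output pinned to one interface and kept off cellular links might write:

	static int
	example_ip6_send(struct mbuf *m, unsigned int ifscope)
	{
		struct ip6_out_args ip6oa;

		ip6oa.ip6oa_boundif = ifscope;	/* bind output to this ifindex */
		ip6oa.ip6oa_nocell = 1;		/* skip IFT_CELLULAR interfaces */

		/* IPV6_OUTARGS tells ip6_output() the last argument is valid */
		return (ip6_output(m, NULL, NULL, IPV6_OUTARGS, NULL, NULL,
		    &ip6oa));
	}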
* - * IPv6 ICMP IPv6 [exthdrs] finalhdr paylaod + * IPv6 ICMP IPv6 [exthdrs] finalhdr payload * ^ ^ ^ ^ * | | ip6c_ip6 ip6c_off * | ip6c_icmp6 @@ -157,7 +157,7 @@ struct ip6protosw { short pr_protocol; /* protocol number */ unsigned int pr_flags; /* see below */ /* protocol-protocol hooks */ - int (*pr_input)(struct mbuf **, int *); + int (*pr_input)(struct mbuf **, int *, int); /* input to protocol (from below) */ int (*pr_output)(struct mbuf *m, struct socket *so, struct sockaddr_in6 *, struct mbuf *); @@ -173,8 +173,12 @@ struct ip6protosw { /* utility hooks */ void (*pr_init)(void); /* initialization hook */ +#if __APPLE__ + void (*pr_unused)(void); /* placeholder - fasttimo is removed */ +#else void (*pr_fasttimo)(void); /* fast timeout (200ms) */ +#endif void (*pr_slowtimo)(void); /* slow timeout (500ms) */ void (*pr_drain)(void); diff --git a/bsd/netinet6/ipcomp6.h b/bsd/netinet6/ipcomp6.h index 8fd6fdba9..2bd7b6678 100644 --- a/bsd/netinet6/ipcomp6.h +++ b/bsd/netinet6/ipcomp6.h @@ -40,7 +40,7 @@ #include <netinet6/ipsec.h> #ifdef KERNEL_PRIVATE -extern int ipcomp6_input(struct mbuf **, int *); +extern int ipcomp6_input(struct mbuf **, int *, int); extern int ipcomp6_output(struct mbuf *, u_char *, struct mbuf *, struct secasvar *); #endif /* KERNEL_PRIVATE */ diff --git a/bsd/netinet6/ipcomp_input.c b/bsd/netinet6/ipcomp_input.c index c9473dd8b..3c6a9a43d 100644 --- a/bsd/netinet6/ipcomp_input.c +++ b/bsd/netinet6/ipcomp_input.c @@ -78,6 +78,7 @@ #include <netkey/keydb.h> #include <net/net_osdep.h> +#include <mach/sdt.h> #define IPLEN_FLIPPED @@ -214,6 +215,11 @@ ipcomp4_input(struct mbuf *m, int off) IPSEC_STAT_INCREMENT(ipsecstat.in_polvio); goto fail; } + + DTRACE_IP6(receive, struct mbuf *, m, struct inpcb *, NULL, + struct ip *, ip, struct ifnet *, m->m_pkthdr.rcvif, + struct ip *, ip, struct ip6_hdr *, NULL); + ip_proto_dispatch_in(m, off, nxt, 0); } else m_freem(m); @@ -233,10 +239,9 @@ fail: #if INET6 int -ipcomp6_input(mp, offp) - struct mbuf **mp; - int *offp; +ipcomp6_input(struct mbuf **mp, int *offp, int proto) { +#pragma unused(proto) struct mbuf *m, *md; int off; struct ip6_hdr *ip6; diff --git a/bsd/netinet6/ipsec.c b/bsd/netinet6/ipsec.c index 6a7da3d2b..91fd6db6d 100644 --- a/bsd/netinet6/ipsec.c +++ b/bsd/netinet6/ipsec.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2011 Apple Inc. All rights reserved. 
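Throughout this change, IPv6 input handlers grow a third parameter carrying the protocol number, matching the new pr_input vector above (rip6_input, frag6_input, route6_input, dest6_input, and ipcomp6_input are all converted this way). A minimal handler under the new convention might look like the following; the function itself is hypothetical, and the #pragma mirrors how ipcomp6_input ignores the argument:

	static int
	example6_input(struct mbuf **mp, int *offp, int proto)
	{
	#pragma unused(proto)
		struct mbuf *m = *mp;

		/* ... validate the header that begins at *offp ... */

		m_freem(m);		/* consumed; caller must not touch m */
		return (IPPROTO_DONE);
	}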
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -143,7 +143,6 @@ int ipsec_debug = 0; #define DBG_FNC_IPSEC_OUT NETDBG_CODE(DBG_NETIPSEC, (3 << 8)) extern lck_mtx_t *sadb_mutex; -extern lck_mtx_t *ip6_mutex; struct ipsecstat ipsecstat; int ip4_ah_cleartos = 1; @@ -169,33 +168,33 @@ SYSCTL_DECL(_net_inet6_ipsec6); #endif /* net.inet.ipsec */ SYSCTL_STRUCT(_net_inet_ipsec, IPSECCTL_STATS, - stats, CTLFLAG_RD, &ipsecstat, ipsecstat, ""); -SYSCTL_PROC(_net_inet_ipsec, IPSECCTL_DEF_POLICY, def_policy, CTLTYPE_INT|CTLFLAG_RW, + stats, CTLFLAG_RD | CTLFLAG_LOCKED, &ipsecstat, ipsecstat, ""); +SYSCTL_PROC(_net_inet_ipsec, IPSECCTL_DEF_POLICY, def_policy, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED, &ip4_def_policy.policy, 0, &sysctl_def_policy, "I", ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev, - CTLFLAG_RW, &ip4_esp_trans_deflev, 0, ""); + CTLFLAG_RW | CTLFLAG_LOCKED, &ip4_esp_trans_deflev, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev, - CTLFLAG_RW, &ip4_esp_net_deflev, 0, ""); + CTLFLAG_RW | CTLFLAG_LOCKED, &ip4_esp_net_deflev, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev, - CTLFLAG_RW, &ip4_ah_trans_deflev, 0, ""); + CTLFLAG_RW | CTLFLAG_LOCKED, &ip4_ah_trans_deflev, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev, - CTLFLAG_RW, &ip4_ah_net_deflev, 0, ""); + CTLFLAG_RW | CTLFLAG_LOCKED, &ip4_ah_net_deflev, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_AH_CLEARTOS, - ah_cleartos, CTLFLAG_RW, &ip4_ah_cleartos, 0, ""); + ah_cleartos, CTLFLAG_RW | CTLFLAG_LOCKED, &ip4_ah_cleartos, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_AH_OFFSETMASK, - ah_offsetmask, CTLFLAG_RW, &ip4_ah_offsetmask, 0, ""); + ah_offsetmask, CTLFLAG_RW | CTLFLAG_LOCKED, &ip4_ah_offsetmask, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DFBIT, - dfbit, CTLFLAG_RW, &ip4_ipsec_dfbit, 0, ""); + dfbit, CTLFLAG_RW | CTLFLAG_LOCKED, &ip4_ipsec_dfbit, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_ECN, - ecn, CTLFLAG_RW, &ip4_ipsec_ecn, 0, ""); + ecn, CTLFLAG_RW | CTLFLAG_LOCKED, &ip4_ipsec_ecn, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_DEBUG, - debug, CTLFLAG_RW, &ipsec_debug, 0, ""); + debug, CTLFLAG_RW | CTLFLAG_LOCKED, &ipsec_debug, 0, ""); SYSCTL_INT(_net_inet_ipsec, IPSECCTL_ESP_RANDPAD, - esp_randpad, CTLFLAG_RW, &ip4_esp_randpad, 0, ""); + esp_randpad, CTLFLAG_RW | CTLFLAG_LOCKED, &ip4_esp_randpad, 0, ""); /* for performance, we bypass ipsec until a security policy is set */ int ipsec_bypass = 1; -SYSCTL_INT(_net_inet_ipsec, OID_AUTO, bypass, CTLFLAG_RD, &ipsec_bypass,0, ""); +SYSCTL_INT(_net_inet_ipsec, OID_AUTO, bypass, CTLFLAG_RD | CTLFLAG_LOCKED, &ipsec_bypass,0, ""); /* * NAT Traversal requires a UDP port for encapsulation, @@ -204,7 +203,7 @@ SYSCTL_INT(_net_inet_ipsec, OID_AUTO, bypass, CTLFLAG_RD, &ipsec_bypass,0, ""); * for nat traversal. 
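The systematic change in this hunk is the addition of CTLFLAG_LOCKED to every sysctl declaration, marking the node as performing its own synchronization rather than relying on a global sysctl lock. A hypothetical new tunable would be declared the same way (the knob below is illustrative only):

	static int ipsec_example_knob = 0;	/* hypothetical tunable */
	SYSCTL_INT(_net_inet_ipsec, OID_AUTO, example_knob,
	    CTLFLAG_RW | CTLFLAG_LOCKED, &ipsec_example_knob, 0,
	    "illustrative knob; handler synchronizes itself");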
*/ SYSCTL_INT(_net_inet_ipsec, OID_AUTO, esp_port, - CTLFLAG_RW, &esp_udp_encap_port, 0, ""); + CTLFLAG_RW | CTLFLAG_LOCKED, &esp_udp_encap_port, 0, ""); #if INET6 struct ipsecstat ipsec6stat; @@ -218,23 +217,23 @@ int ip6_esp_randpad = -1; /* net.inet6.ipsec6 */ SYSCTL_STRUCT(_net_inet6_ipsec6, IPSECCTL_STATS, - stats, CTLFLAG_RD, &ipsec6stat, ipsecstat, ""); + stats, CTLFLAG_RD | CTLFLAG_LOCKED, &ipsec6stat, ipsecstat, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_POLICY, - def_policy, CTLFLAG_RW, &ip6_def_policy.policy, 0, ""); + def_policy, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_def_policy.policy, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_ESP_TRANSLEV, esp_trans_deflev, - CTLFLAG_RW, &ip6_esp_trans_deflev, 0, ""); + CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_esp_trans_deflev, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_ESP_NETLEV, esp_net_deflev, - CTLFLAG_RW, &ip6_esp_net_deflev, 0, ""); + CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_esp_net_deflev, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_AH_TRANSLEV, ah_trans_deflev, - CTLFLAG_RW, &ip6_ah_trans_deflev, 0, ""); + CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_ah_trans_deflev, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEF_AH_NETLEV, ah_net_deflev, - CTLFLAG_RW, &ip6_ah_net_deflev, 0, ""); + CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_ah_net_deflev, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_ECN, - ecn, CTLFLAG_RW, &ip6_ipsec_ecn, 0, ""); + ecn, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_ipsec_ecn, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_DEBUG, - debug, CTLFLAG_RW, &ipsec_debug, 0, ""); + debug, CTLFLAG_RW | CTLFLAG_LOCKED, &ipsec_debug, 0, ""); SYSCTL_INT(_net_inet6_ipsec6, IPSECCTL_ESP_RANDPAD, - esp_randpad, CTLFLAG_RW, &ip6_esp_randpad, 0, ""); + esp_randpad, CTLFLAG_RW | CTLFLAG_LOCKED, &ip6_esp_randpad, 0, ""); #endif /* INET6 */ static int ipsec_setspidx_mbuf(struct secpolicyindex *, u_int, u_int, @@ -1717,7 +1716,7 @@ ipsec_get_reqlevel(isr) ? (ipsec_debug \ ? 
log(LOG_INFO, "fixed system default level " #lev ":%d->%d\n",\ (lev), IPSEC_LEVEL_REQUIRE) \ - : 0), \ + : (void)0), \ (lev) = IPSEC_LEVEL_REQUIRE, \ (lev) \ : (lev)) @@ -2961,13 +2960,19 @@ ipsec4_output( } ip = mtod(state->m, struct ip *); + // grab sadb_mutex, before updating sah's route cache + lck_mtx_lock(sadb_mutex); state->ro = &sav->sah->sa_route; state->dst = (struct sockaddr *)&state->ro->ro_dst; dst4 = (struct sockaddr_in *)state->dst; + if (state->ro->ro_rt != NULL) { + RT_LOCK(state->ro->ro_rt); + } if (state->ro->ro_rt != NULL && (state->ro->ro_rt->generation_id != route_generation || !(state->ro->ro_rt->rt_flags & RTF_UP) || dst4->sin_addr.s_addr != ip->ip_dst.s_addr)) { + RT_UNLOCK(state->ro->ro_rt); rtfree(state->ro->ro_rt); state->ro->ro_rt = NULL; } @@ -2976,11 +2981,14 @@ ipsec4_output( dst4->sin_len = sizeof(*dst4); dst4->sin_addr = ip->ip_dst; rtalloc(state->ro); - } - if (state->ro->ro_rt == 0) { - OSAddAtomic(1, &ipstat.ips_noroute); - error = EHOSTUNREACH; - goto bad; + if (state->ro->ro_rt == 0) { + OSAddAtomic(1, &ipstat.ips_noroute); + error = EHOSTUNREACH; + // release sadb_mutex, after updating sah's route cache + lck_mtx_unlock(sadb_mutex); + goto bad; + } + RT_LOCK(state->ro->ro_rt); } /* @@ -2996,6 +3004,9 @@ ipsec4_output( state->dst = (struct sockaddr *)state->ro->ro_rt->rt_gateway; dst4 = (struct sockaddr_in *)state->dst; } + RT_UNLOCK(state->ro->ro_rt); + // release sadb_mutex, after updating sah's route cache + lck_mtx_unlock(sadb_mutex); } state->m = ipsec4_splithdr(state->m); @@ -3384,7 +3395,8 @@ ipsec6_output_tunnel( struct ip *ip; struct sockaddr_in* dst4; struct route *ro4 = NULL; - struct ip_out_args ipoa = { IFSCOPE_NONE }; + struct route ro4_copy; + struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; /* * must be last isr because encapsulated IPv6 packet @@ -3406,12 +3418,18 @@ ipsec6_output_tunnel( /* Now we have an IPv4 packet */ ip = mtod(state->m, struct ip *); + // grab sadb_mutex, to update sah's route cache and get a local copy of it + lck_mtx_lock(sadb_mutex); ro4 = &sav->sah->sa_route; dst4 = (struct sockaddr_in *)&ro4->ro_dst; + if (ro4->ro_rt) { + RT_LOCK(ro4->ro_rt); + } if (ro4->ro_rt != NULL && (ro4->ro_rt->generation_id != route_generation || !(ro4->ro_rt->rt_flags & RTF_UP) || dst4->sin_addr.s_addr != ip->ip_dst.s_addr)) { + RT_UNLOCK(ro4->ro_rt); rtfree(ro4->ro_rt); ro4->ro_rt = NULL; } @@ -3419,10 +3437,18 @@ ipsec6_output_tunnel( dst4->sin_family = AF_INET; dst4->sin_len = sizeof(*dst4); dst4->sin_addr = ip->ip_dst; + } else { + RT_UNLOCK(ro4->ro_rt); } + route_copyout(&ro4_copy, ro4, sizeof(ro4_copy)); + // release sadb_mutex, after updating sah's route cache and getting a local copy + lck_mtx_unlock(sadb_mutex); state->m = ipsec4_splithdr(state->m); if (!state->m) { error = ENOMEM; + if (ro4_copy.ro_rt != NULL) { + rtfree(ro4_copy.ro_rt); + } goto bad; } switch (isr->saidx.proto) { @@ -3430,6 +3456,9 @@ ipsec6_output_tunnel( #if IPSEC_ESP if ((error = esp4_output(state->m, sav)) != 0) { state->m = NULL; + if (ro4_copy.ro_rt != NULL) { + rtfree(ro4_copy.ro_rt); + } goto bad; } break; @@ -3438,17 +3467,26 @@ ipsec6_output_tunnel( m_freem(state->m); state->m = NULL; error = EINVAL; + if (ro4_copy.ro_rt != NULL) { + rtfree(ro4_copy.ro_rt); + } goto bad; #endif case IPPROTO_AH: if ((error = ah4_output(state->m, sav)) != 0) { state->m = NULL; + if (ro4_copy.ro_rt != NULL) { + rtfree(ro4_copy.ro_rt); + } goto bad; } break; case IPPROTO_IPCOMP: if ((error = ipcomp4_output(state->m, sav)) != 0) { state->m = NULL; + if (ro4_copy.ro_rt 
!= NULL) { + rtfree(ro4_copy.ro_rt); + } goto bad; } break; @@ -3459,17 +3497,27 @@ ipsec6_output_tunnel( m_freem(state->m); state->m = NULL; error = EINVAL; + if (ro4_copy.ro_rt != NULL) { + rtfree(ro4_copy.ro_rt); + } goto bad; } if (state->m == 0) { error = ENOMEM; + if (ro4_copy.ro_rt != NULL) { + rtfree(ro4_copy.ro_rt); + } goto bad; } ip = mtod(state->m, struct ip *); ip->ip_len = ntohs(ip->ip_len); /* flip len field before calling ip_output */ - error = ip_output(state->m, NULL, ro4, IP_OUTARGS, NULL, &ipoa); + error = ip_output(state->m, NULL, &ro4_copy, IP_OUTARGS, NULL, &ipoa); state->m = NULL; + // grab sadb_mutex, to synchronize the sah's route cache with the local copy + lck_mtx_lock(sadb_mutex); + route_copyin(&ro4_copy, ro4, sizeof(ro4_copy)); + lck_mtx_unlock(sadb_mutex); if (error != 0) goto bad; goto done; @@ -3481,14 +3529,20 @@ ipsec6_output_tunnel( error = EAFNOSUPPORT; goto bad; } - + + // grab sadb_mutex, before updating sah's route cache + lck_mtx_lock(sadb_mutex); state->ro = &sav->sah->sa_route; state->dst = (struct sockaddr *)&state->ro->ro_dst; dst6 = (struct sockaddr_in6 *)state->dst; + if (state->ro->ro_rt) { + RT_LOCK(state->ro->ro_rt); + } if (state->ro->ro_rt != NULL && (state->ro->ro_rt->generation_id != route_generation || !(state->ro->ro_rt->rt_flags & RTF_UP) || !IN6_ARE_ADDR_EQUAL(&dst6->sin6_addr, &ip6->ip6_dst))) { + RT_UNLOCK(state->ro->ro_rt); rtfree(state->ro->ro_rt); state->ro->ro_rt = NULL; } @@ -3498,11 +3552,16 @@ ipsec6_output_tunnel( dst6->sin6_len = sizeof(*dst6); dst6->sin6_addr = ip6->ip6_dst; rtalloc(state->ro); + if (state->ro->ro_rt) { + RT_LOCK(state->ro->ro_rt); + } } if (state->ro->ro_rt == 0) { ip6stat.ip6s_noroute++; IPSEC_STAT_INCREMENT(ipsec6stat.out_noroute); error = EHOSTUNREACH; + // release sadb_mutex, after updating sah's route cache + lck_mtx_unlock(sadb_mutex); goto bad; } @@ -3519,6 +3578,9 @@ ipsec6_output_tunnel( state->dst = (struct sockaddr *)state->ro->ro_rt->rt_gateway; dst6 = (struct sockaddr_in6 *)state->dst; } + RT_UNLOCK(state->ro->ro_rt); + // release sadb_mutex, after updating sah's route cache + lck_mtx_unlock(sadb_mutex); } state->m = ipsec6_splithdr(state->m); @@ -3982,8 +4044,8 @@ ipsec_addaux( struct ipsec_tag *itag; /* Allocate a tag */ - tag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPSEC, - IPSEC_TAG_SIZE, M_DONTWAIT); + tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPSEC, + IPSEC_TAG_SIZE, M_DONTWAIT, m); if (tag) { itag = (struct ipsec_tag*)(tag + 1); @@ -4128,7 +4190,8 @@ ipsec_send_natt_keepalive( struct udphdr *uh; struct ip *ip; int error; - struct ip_out_args ipoa = { IFSCOPE_NONE }; + struct ip_out_args ipoa = { IFSCOPE_NONE, 0 }; + struct route ro; lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_NOTOWNED); @@ -4168,8 +4231,23 @@ ipsec_send_natt_keepalive( uh->uh_ulen = htons(1 + sizeof(struct udphdr)); uh->uh_sum = 0; *(u_int8_t*)((char*)m_mtod(m) + sizeof(struct ip) + sizeof(struct udphdr)) = 0xFF; - - error = ip_output(m, NULL, &sav->sah->sa_route, IP_OUTARGS | IP_NOIPSEC, NULL, &ipoa); + + // grab sadb_mutex, to get a local copy of sah's route cache + lck_mtx_lock(sadb_mutex); + if (sav->sah->sa_route.ro_rt != NULL && + rt_key(sav->sah->sa_route.ro_rt)->sa_family != AF_INET) { + rtfree(sav->sah->sa_route.ro_rt); + sav->sah->sa_route.ro_rt = NULL; + } + route_copyout(&ro, &sav->sah->sa_route, sizeof(ro)); + lck_mtx_unlock(sadb_mutex); + + error = ip_output(m, NULL, &ro, IP_OUTARGS | IP_NOIPSEC, NULL, &ipoa); + + // grab sadb_mutex, to synchronize the sah's route cache with 
the local copy + lck_mtx_lock(sadb_mutex); + route_copyin(&ro, &sav->sah->sa_route, sizeof(ro)); + lck_mtx_unlock(sadb_mutex); if (error == 0) { sav->natt_last_activity = natt_now; return TRUE; diff --git a/bsd/netinet6/mld6.c b/bsd/netinet6/mld6.c index 7e9a882e8..90bcb94b4 100644 --- a/bsd/netinet6/mld6.c +++ b/bsd/netinet6/mld6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,13 +25,8 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ - -/* $FreeBSD: src/sys/netinet6/mld6.c,v 1.4.2.2 2001/07/03 11:01:54 ume Exp $ */ -/* $KAME: mld6.c,v 1.27 2001/04/04 05:17:30 itojun Exp $ */ - -/* - * Copyright (C) 1998 WIDE Project. - * All rights reserved. +/*- + * Copyright (c) 2009 Bruce Simpson. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -41,14 +36,14 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the project nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. * - * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) @@ -103,426 +98,3312 @@ * Version 2.0. */ +#include <sys/cdefs.h> + #include <sys/param.h> #include <sys/systm.h> #include <sys/mbuf.h> #include <sys/socket.h> #include <sys/protosw.h> -#include <sys/syslog.h> +#include <sys/sysctl.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mcache.h> + +#include <kern/zalloc.h> #include <net/if.h> +#include <net/route.h> #include <netinet/in.h> #include <netinet/in_var.h> +#include <netinet6/in6_var.h> #include <netinet/ip6.h> #include <netinet6/ip6_var.h> +#include <netinet6/scope6_var.h> #include <netinet/icmp6.h> +#include <netinet6/mld6.h> #include <netinet6/mld6_var.h> -#include <net/net_osdep.h> +/* Lock group and attribute for mld6_mtx */ +static lck_attr_t *mld_mtx_attr; +static lck_grp_t *mld_mtx_grp; +static lck_grp_attr_t *mld_mtx_grp_attr; + +/* + * Locking and reference counting: + * + * mld_mtx mainly protects mli_head. In cases where both mld_mtx and + * in6_multihead_lock must be held, the former must be acquired first in order + * to maintain lock ordering. It is not a requirement that mld_mtx be + * acquired first before in6_multihead_lock, but in case both must be acquired + * in succession, the correct lock ordering must be followed. 
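Concretely, a path that needed all four locks at once would take them in the documented order and drop them in reverse; any prefix or subset may be taken alone. A schematic sequence (the lock macros are the ones defined below and in the in6 multicast code; the surrounding context is invented):

	MLD_LOCK();			/* 1: subsystem state, mli_head */
	in6_multihead_lock_shared();	/* 2: global in6_multi list */
	IN6M_LOCK(inm);			/* 3: per-group state */
	MLI_LOCK(mli);			/* 4: per-interface MLD state */
	/* ... work requiring all four ... */
	MLI_UNLOCK(mli);
	IN6M_UNLOCK(inm);
	in6_multihead_lock_done();
	MLD_UNLOCK();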
+ *
+ * Instead of walking the if_multiaddrs list at the interface and returning
+ * the ifma_protospec value of a matching entry, we search the global list
+ * of in6_multi records and find it that way; this is done with in6_multihead
+ * lock held.  Doing so avoids the race condition issues that many other BSDs
+ * suffer from (therefore in our implementation, ifma_protospec will never be
+ * NULL for as long as the in6_multi is valid.)
+ *
+ * The above creates a requirement for the in6_multi to stay in the
+ * in6_multihead list even after the final MLD leave (in MLDv2 mode) until
+ * it no longer needs to be retransmitted (this is not required for MLDv1.)
+ * In order to handle this, the request and reference counts of the in6_multi
+ * are bumped up when the state changes to MLD_LEAVING_MEMBER, and later
+ * dropped in the timeout handler.  Each in6_multi holds a reference to the
+ * underlying mld_ifinfo.
+ *
+ * Thus, the permitted lock order is:
+ *
+ *	mld_mtx, in6_multihead_lock, inm6_lock, mli_lock
+ *
+ * Any may be taken independently, but if any are held at the same time,
+ * the above lock order must be followed.
+ */
+static decl_lck_mtx_data(, mld_mtx);
+
+static void	mli_initvar(struct mld_ifinfo *, struct ifnet *, int);
+static struct mld_ifinfo *mli_alloc(int);
+static void	mli_free(struct mld_ifinfo *);
+static void	mli_delete(const struct ifnet *);
+static void	mld_dispatch_packet(struct mbuf *);
+static void	mld_final_leave(struct in6_multi *, struct mld_ifinfo *);
+static int	mld_handle_state_change(struct in6_multi *,
+		    struct mld_ifinfo *);
+static int	mld_initial_join(struct in6_multi *, struct mld_ifinfo *,
+		    const int);
+#ifdef MLD_DEBUG
+static const char *	mld_rec_type_to_str(const int);
+#endif
+static void	mld_set_version(struct mld_ifinfo *, const int);
+static void	mld_flush_relq(struct mld_ifinfo *);
+static void	mld_dispatch_queue(struct mld_ifinfo *, struct ifqueue *, int);
+static int	mld_v1_input_query(struct ifnet *, const struct ip6_hdr *,
+		    /*const*/ struct mld_hdr *);
+static int	mld_v1_input_report(struct ifnet *, const struct ip6_hdr *,
+		    /*const*/ struct mld_hdr *);
+static void	mld_v1_process_group_timer(struct in6_multi *, const int);
+static void	mld_v1_process_querier_timers(struct mld_ifinfo *);
+static int	mld_v1_transmit_report(struct in6_multi *, const int);
+static void	mld_v1_update_group(struct in6_multi *, const int);
+static void	mld_v2_cancel_link_timers(struct mld_ifinfo *);
+static void	mld_v2_dispatch_general_query(struct mld_ifinfo *);
+static struct mbuf *
+		mld_v2_encap_report(struct ifnet *, struct mbuf *);
+static int	mld_v2_enqueue_filter_change(struct ifqueue *,
+		    struct in6_multi *);
+static int	mld_v2_enqueue_group_record(struct ifqueue *,
+		    struct in6_multi *, const int, const int, const int,
+		    const int);
+static int	mld_v2_input_query(struct ifnet *, const struct ip6_hdr *,
+		    struct mbuf *, const int, const int);
+static int	mld_v2_merge_state_changes(struct in6_multi *,
+		    struct ifqueue *);
+static void	mld_v2_process_group_timers(struct mld_ifinfo *,
+		    struct ifqueue *, struct ifqueue *,
+		    struct in6_multi *, const int);
+static int	mld_v2_process_group_query(struct in6_multi *,
+		    int, struct mbuf *, const int);
+static int	sysctl_mld_gsr SYSCTL_HANDLER_ARGS;
+static int	sysctl_mld_ifinfo SYSCTL_HANDLER_ARGS;
+
+/*
+ * Normative references: RFC 2710, RFC 3590, RFC 3810.
+ *
+ * XXX LOR PREVENTION
+ * A special case for IPv6 is the in6_setscope() routine.
ip6_output() + * will not accept an ifp; it wants an embedded scope ID, unlike + * ip_output(), which happily takes the ifp given to it. The embedded + * scope ID is only used by MLD to select the outgoing interface. + * + * As such, we exploit the fact that the scope ID is just the interface + * index, and embed it in the IPv6 destination address accordingly. + * This is potentially NOT VALID for MLDv1 reports, as they + * are always sent to the multicast group itself; as MLDv2 + * reports are always sent to ff02::16, this is not an issue + * when MLDv2 is in use. + */ + +#define MLD_EMBEDSCOPE(pin6, zoneid) \ + (pin6)->s6_addr16[1] = htons((zoneid) & 0xFFFF) + +static struct timeval mld_gsrdelay = {10, 0}; +static LIST_HEAD(, mld_ifinfo) mli_head; + +static int interface_timers_running6; +static int state_change_timers_running6; +static int current_state_timers_running6; + +static decl_lck_mtx_data(, mld6_mtx); + +#define MLD_LOCK() \ + lck_mtx_lock(&mld6_mtx) +#define MLD_LOCK_ASSERT_HELD() \ + lck_mtx_assert(&mld6_mtx, LCK_MTX_ASSERT_OWNED) +#define MLD_LOCK_ASSERT_NOTHELD() \ + lck_mtx_assert(&mld6_mtx, LCK_MTX_ASSERT_NOTOWNED) +#define MLD_UNLOCK() \ + lck_mtx_unlock(&mld6_mtx) + +#define MLI_ZONE_MAX 64 /* maximum elements in zone */ +#define MLI_ZONE_NAME "mld_ifinfo" /* zone name */ + +static unsigned int mli_size; /* size of zone element */ +static struct zone *mli_zone; /* zone for mld_ifinfo */ + +SYSCTL_DECL(_net_inet6); /* Note: Not in any common header. */ + +SYSCTL_NODE(_net_inet6, OID_AUTO, mld, CTLFLAG_RW | CTLFLAG_LOCKED, 0, + "IPv6 Multicast Listener Discovery"); +SYSCTL_PROC(_net_inet6_mld, OID_AUTO, gsrdelay, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, + &mld_gsrdelay.tv_sec, 0, sysctl_mld_gsr, "I", + "Rate limit for MLDv2 Group-and-Source queries in seconds"); + +SYSCTL_NODE(_net_inet6_mld, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_LOCKED, + sysctl_mld_ifinfo, "Per-interface MLDv2 state"); + +static int mld_v1enable = 1; +SYSCTL_INT(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_RW | CTLFLAG_LOCKED, + &mld_v1enable, 0, "Enable fallback to MLDv1"); + +static int mld_use_allow = 1; +SYSCTL_INT(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_RW | CTLFLAG_LOCKED, + &mld_use_allow, 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves"); + +#ifdef MLD_DEBUG +int mld_debug = 0; +SYSCTL_INT(_net_inet6_mld, OID_AUTO, + debug, CTLFLAG_RW | CTLFLAG_LOCKED, &mld_debug, 0, ""); +#endif +/* + * Packed Router Alert option structure declaration. + */ +struct mld_raopt { + struct ip6_hbh hbh; + struct ip6_opt pad; + struct ip6_opt_router ra; +} __packed; + +/* + * Router Alert hop-by-hop option header. + */ +static struct mld_raopt mld_ra = { + .hbh = { 0, 0 }, + .pad = { .ip6o_type = IP6OPT_PADN, 0 }, + .ra = { + .ip6or_type = (u_int8_t)IP6OPT_ROUTER_ALERT, + .ip6or_len = (u_int8_t)(IP6OPT_RTALERT_LEN - 2), + .ip6or_value = {((IP6OPT_RTALERT_MLD >> 8) & 0xFF), + (IP6OPT_RTALERT_MLD & 0xFF) } + } +}; +static struct ip6_pktopts mld_po; + +/* + * Retrieve or set threshold between group-source queries in seconds. + */ +static int +sysctl_mld_gsr SYSCTL_HANDLER_ARGS +{ +#pragma unused(arg1, arg2) + int error; + int i; + + MLD_LOCK(); + + i = mld_gsrdelay.tv_sec; + + error = sysctl_handle_int(oidp, &i, 0, req); + if (error || !req->newptr) + goto out_locked; + + if (i < -1 || i >= 60) { + error = EINVAL; + goto out_locked; + } + + mld_gsrdelay.tv_sec = i; + +out_locked: + MLD_UNLOCK(); + return (error); +} +/* + * Expose struct mld_ifinfo to userland, keyed by ifindex. + * For use by ifmcstat(8). 
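Both of these nodes are reachable from ordinary user code; the sketch below (userland, not part of the kernel change; it assumes the standard sysctl(3), sysctlnametomib(3), and if_nametoindex(3) interfaces and a userland-visible struct mld_ifinfo_u from the MLD headers) reads the rate limit and then one interface's MLD state, much as ifmcstat(8) would:

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <net/if.h>
	#include <stdio.h>

	int
	main(void)
	{
		int delay, mib[CTL_MAXNAME];
		size_t len, miblen;
		struct mld_ifinfo_u mli_u;

		/* scalar node declared by SYSCTL_PROC above */
		len = sizeof (delay);
		if (sysctlbyname("net.inet6.mld.gsrdelay", &delay, &len,
		    NULL, 0) == 0)
			printf("gsrdelay: %d s\n", delay);

		/* per-interface node: name[0] is an ifindex, per sysctl_mld_ifinfo */
		miblen = CTL_MAXNAME - 1;
		if (sysctlnametomib("net.inet6.mld.ifinfo", mib, &miblen) == -1)
			return (1);
		mib[miblen] = if_nametoindex("en0");
		len = sizeof (mli_u);
		if (sysctl(mib, (u_int)(miblen + 1), &mli_u, &len, NULL, 0) == 0)
			printf("en0 MLD version: %u\n", mli_u.mli_version);
		return (0);
	}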
+ * + */ +static int +sysctl_mld_ifinfo SYSCTL_HANDLER_ARGS +{ +#pragma unused(oidp) + int *name; + int error; + u_int namelen; + struct ifnet *ifp; + struct mld_ifinfo *mli; + struct mld_ifinfo_u mli_u; + + name = (int *)arg1; + namelen = arg2; + + if (req->newptr != USER_ADDR_NULL) + return (EPERM); + + if (namelen != 1) + return (EINVAL); + + MLD_LOCK(); + + if (name[0] <= 0 || name[0] > (u_int)if_index) { + error = ENOENT; + goto out_locked; + } + + error = ENOENT; + + ifnet_head_lock_shared(); + ifp = ifindex2ifnet[name[0]]; + ifnet_head_done(); + if (ifp == NULL) + goto out_locked; + + bzero(&mli_u, sizeof (mli_u)); + + LIST_FOREACH(mli, &mli_head, mli_link) { + MLI_LOCK(mli); + if (ifp != mli->mli_ifp) { + MLI_UNLOCK(mli); + continue; + } + + mli_u.mli_ifindex = mli->mli_ifp->if_index; + mli_u.mli_version = mli->mli_version; + mli_u.mli_v1_timer = mli->mli_v1_timer; + mli_u.mli_v2_timer = mli->mli_v2_timer; + mli_u.mli_flags = mli->mli_flags; + mli_u.mli_rv = mli->mli_rv; + mli_u.mli_qi = mli->mli_qi; + mli_u.mli_qri = mli->mli_qri; + mli_u.mli_uri = mli->mli_uri; + MLI_UNLOCK(mli); + + error = SYSCTL_OUT(req, &mli_u, sizeof (mli_u)); + break; + } + +out_locked: + MLD_UNLOCK(); + return (error); +} + +/* + * Dispatch an entire queue of pending packet chains. + * + * Must not be called with in6m_lock held. + */ +static void +mld_dispatch_queue(struct mld_ifinfo *mli, struct ifqueue *ifq, int limit) +{ + struct mbuf *m; + + if (mli != NULL) + MLI_LOCK_ASSERT_HELD(mli); + + for (;;) { + IF_DEQUEUE(ifq, m); + if (m == NULL) + break; + MLD_PRINTF(("%s: dispatch %p from %p\n", __func__, ifq, m)); + if (mli != NULL) + MLI_UNLOCK(mli); + mld_dispatch_packet(m); + if (mli != NULL) + MLI_LOCK(mli); + if (--limit == 0) + break; + } -#if CONFIG_MACF_NET -#include <security/mac.h> -#endif /* MAC_NET */ + if (mli != NULL) + MLI_LOCK_ASSERT_HELD(mli); +} /* - * Protocol constants + * Filter outgoing MLD report state by group. + * + * Reports are ALWAYS suppressed for ALL-HOSTS (ff02::1) + * and node-local addresses. However, kernel and socket consumers + * always embed the KAME scope ID in the address provided, so strip it + * when performing comparison. + * Note: This is not the same as the *multicast* scope. + * + * Return zero if the given group is one for which MLD reports + * should be suppressed, or non-zero if reports should be issued. */ +static __inline__ int +mld_is_addr_reported(const struct in6_addr *addr) +{ + + VERIFY(IN6_IS_ADDR_MULTICAST(addr)); + + if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_NODELOCAL) + return (0); + + if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_LINKLOCAL) { + struct in6_addr tmp = *addr; + in6_clearscope(&tmp); + if (IN6_ARE_ADDR_EQUAL(&tmp, &in6addr_linklocal_allnodes)) + return (0); + } + + return (1); +} -/* denotes that the MLD max response delay field specifies time in milliseconds */ -#define MLD6_TIMER_SCALE 1000 /* - * time between repetitions of a node's initial report of interest in a - * multicast address(in seconds) + * Attach MLD when PF_INET6 is attached to an interface. 
*/ -#define MLD6_UNSOLICITED_REPORT_INTERVAL 10 +struct mld_ifinfo * +mld_domifattach(struct ifnet *ifp, int how) +{ + struct mld_ifinfo *mli; + + MLD_PRINTF(("%s: called for ifp %p(%s%d)\n", + __func__, ifp, ifp->if_name, ifp->if_unit)); + + mli = mli_alloc(how); + if (mli == NULL) + return (NULL); + + MLD_LOCK(); + + MLI_LOCK(mli); + mli_initvar(mli, ifp, 0); + mli->mli_debug |= IFD_ATTACHED; + MLI_ADDREF_LOCKED(mli); /* hold a reference for mli_head */ + MLI_ADDREF_LOCKED(mli); /* hold a reference for caller */ + MLI_UNLOCK(mli); + + LIST_INSERT_HEAD(&mli_head, mli, mli_link); + + MLD_UNLOCK(); -extern lck_mtx_t *nd6_mutex; -static struct ip6_pktopts ip6_opts; -static int mld6_timers_are_running; -static int mld6_init_done = 0 ; -/* XXX: These are necessary for KAME's link-local hack */ -static struct in6_addr mld6_all_nodes_linklocal = IN6ADDR_LINKLOCAL_ALLNODES_INIT; -static struct in6_addr mld6_all_routers_linklocal = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; + MLD_PRINTF(("allocate mld_ifinfo for ifp %p(%s%d)\n", + ifp, ifp->if_name, ifp->if_unit)); -static void mld6_sendpkt(struct in6_multi *, int, const struct in6_addr *); + return (mli); +} +/* + * Attach MLD when PF_INET6 is reattached to an interface. Caller is + * expected to have an outstanding reference to the mli. + */ void -mld6_init() +mld_domifreattach(struct mld_ifinfo *mli) { - static u_int8_t hbh_buf[8]; - struct ip6_hbh *hbh = (struct ip6_hbh *)hbh_buf; - u_int16_t rtalert_code = htons((u_int16_t)IP6OPT_RTALERT_MLD); + struct ifnet *ifp; - if (mld6_init_done) - return; + MLD_LOCK(); - mld6_init_done = 1; - mld6_timers_are_running = 0; + MLI_LOCK(mli); + VERIFY(!(mli->mli_debug & IFD_ATTACHED)); + ifp = mli->mli_ifp; + VERIFY(ifp != NULL); + mli_initvar(mli, ifp, 1); + mli->mli_debug |= IFD_ATTACHED; + MLI_ADDREF_LOCKED(mli); /* hold a reference for mli_head */ + MLI_UNLOCK(mli); - /* ip6h_nxt will be fill in later */ - hbh->ip6h_len = 0; /* (8 >> 3) - 1 */ + LIST_INSERT_HEAD(&mli_head, mli, mli_link); - /* XXX: grotty hard coding... */ - hbh_buf[2] = IP6OPT_PADN; /* 2 byte padding */ - hbh_buf[3] = 0; - hbh_buf[4] = IP6OPT_RTALERT; - hbh_buf[5] = IP6OPT_RTALERT_LEN - 2; - bcopy((caddr_t)&rtalert_code, &hbh_buf[6], sizeof(u_int16_t)); + MLD_UNLOCK(); - ip6_initpktopts(&ip6_opts); - ip6_opts.ip6po_hbh = hbh; + MLD_PRINTF(("reattached mld_ifinfo for ifp %p(%s%d)\n", + ifp, ifp->if_name, ifp->if_unit)); } +/* + * Hook for domifdetach. + */ void -mld6_start_listening( - struct in6_multi *in6m) -{ - /* - * RFC2710 page 10: - * The node never sends a Report or Done for the link-scope all-nodes - * address. - * MLD messages are never sent for multicast addresses whose scope is 0 - * (reserved) or 1 (node-local). - */ - mld6_all_nodes_linklocal.s6_addr16[1] = - htons(in6m->in6m_ifp->if_index); /* XXX */ - if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &mld6_all_nodes_linklocal) || - IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) { - in6m->in6m_timer = 0; - in6m->in6m_state = MLD6_OTHERLISTENER; - } else { - mld6_sendpkt(in6m, MLD6_LISTENER_REPORT, NULL); - in6m->in6m_timer = MLD6_RANDOM_DELAY( - MLD6_UNSOLICITED_REPORT_INTERVAL * PR_FASTHZ); - in6m->in6m_state = MLD6_IREPORTEDLAST; - mld6_timers_are_running = 1; +mld_domifdetach(struct ifnet *ifp) +{ + + MLD_PRINTF(("%s: called for ifp %p(%s%d)\n", + __func__, ifp, ifp->if_name, ifp->if_unit)); + + MLD_LOCK(); + mli_delete(ifp); + MLD_UNLOCK(); +} + +/* + * Called at interface detach time. 
Note that we only flush all deferred + * responses and record releases; all remaining inm records and their source + * entries related to this interface are left intact, in order to handle + * the reattach case. + */ +static void +mli_delete(const struct ifnet *ifp) +{ + struct mld_ifinfo *mli, *tmli; + + MLD_LOCK_ASSERT_HELD(); + + LIST_FOREACH_SAFE(mli, &mli_head, mli_link, tmli) { + MLI_LOCK(mli); + if (mli->mli_ifp == ifp) { + /* + * Free deferred General Query responses. + */ + IF_DRAIN(&mli->mli_gq); + IF_DRAIN(&mli->mli_v1q); + mld_flush_relq(mli); + VERIFY(SLIST_EMPTY(&mli->mli_relinmhead)); + mli->mli_debug &= ~IFD_ATTACHED; + MLI_UNLOCK(mli); + + LIST_REMOVE(mli, mli_link); + MLI_REMREF(mli); /* release mli_head reference */ + return; + } + MLI_UNLOCK(mli); + } + panic("%s: mld_ifinfo not found for ifp %p\n", __func__, ifp); +} + +static void +mli_initvar(struct mld_ifinfo *mli, struct ifnet *ifp, int reattach) +{ + MLI_LOCK_ASSERT_HELD(mli); + + mli->mli_ifp = ifp; + mli->mli_version = MLD_VERSION_2; + mli->mli_flags = 0; + mli->mli_rv = MLD_RV_INIT; + mli->mli_qi = MLD_QI_INIT; + mli->mli_qri = MLD_QRI_INIT; + mli->mli_uri = MLD_URI_INIT; + + /* ifnet is not yet attached; no need to hold ifnet lock */ + if (!(ifp->if_flags & IFF_MULTICAST)) + mli->mli_flags |= MLIF_SILENT; + if (mld_use_allow) + mli->mli_flags |= MLIF_USEALLOW; + if (!reattach) + SLIST_INIT(&mli->mli_relinmhead); + + /* + * Responses to general queries are subject to bounds. + */ + mli->mli_gq.ifq_maxlen = MLD_MAX_RESPONSE_PACKETS; + mli->mli_v1q.ifq_maxlen = MLD_MAX_RESPONSE_PACKETS; +} + +static struct mld_ifinfo * +mli_alloc(int how) +{ + struct mld_ifinfo *mli; + + mli = (how == M_WAITOK) ? zalloc(mli_zone) : zalloc_noblock(mli_zone); + if (mli != NULL) { + bzero(mli, mli_size); + lck_mtx_init(&mli->mli_lock, mld_mtx_grp, mld_mtx_attr); + mli->mli_debug |= IFD_ALLOC; + } + return (mli); +} + +static void +mli_free(struct mld_ifinfo *mli) +{ + MLI_LOCK(mli); + if (mli->mli_debug & IFD_ATTACHED) { + panic("%s: attached mli=%p is being freed", __func__, mli); + /* NOTREACHED */ + } else if (mli->mli_ifp != NULL) { + panic("%s: ifp not NULL for mli=%p", __func__, mli); + /* NOTREACHED */ + } else if (!(mli->mli_debug & IFD_ALLOC)) { + panic("%s: mli %p cannot be freed", __func__, mli); + /* NOTREACHED */ + } else if (mli->mli_refcnt != 0) { + panic("%s: non-zero refcnt mli=%p", __func__, mli); + /* NOTREACHED */ } + mli->mli_debug &= ~IFD_ALLOC; + MLI_UNLOCK(mli); + + lck_mtx_destroy(&mli->mli_lock, mld_mtx_grp); + zfree(mli_zone, mli); } void -mld6_stop_listening( - struct in6_multi *in6m) +mli_addref(struct mld_ifinfo *mli, int locked) { - mld6_all_nodes_linklocal.s6_addr16[1] = - htons(in6m->in6m_ifp->if_index); /* XXX */ - mld6_all_routers_linklocal.s6_addr16[1] = - htons(in6m->in6m_ifp->if_index); /* XXX: necessary when mrouting */ + if (!locked) + MLI_LOCK_SPIN(mli); + else + MLI_LOCK_ASSERT_HELD(mli); - if (in6m->in6m_state == MLD6_IREPORTEDLAST && - (!IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &mld6_all_nodes_linklocal)) && - IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) > IPV6_ADDR_SCOPE_NODELOCAL) - mld6_sendpkt(in6m, MLD6_LISTENER_DONE, - &mld6_all_routers_linklocal); + if (++mli->mli_refcnt == 0) { + panic("%s: mli=%p wraparound refcnt", __func__, mli); + /* NOTREACHED */ + } + if (!locked) + MLI_UNLOCK(mli); } void -mld6_input( - struct mbuf *m, - int off) -{ - struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); - struct mld6_hdr *mldh; - struct ifnet *ifp = m->m_pkthdr.rcvif; - struct in6_multi *in6m; - struct 
in6_ifaddr *ia; - struct ifmultiaddr *ifma; - int timer; /* timer value in the MLD query header */ - -#ifndef PULLDOWN_TEST - IP6_EXTHDR_CHECK(m, off, sizeof(*mldh), return); - mldh = (struct mld6_hdr *)(mtod(m, caddr_t) + off); -#else - IP6_EXTHDR_GET(mldh, struct mld6_hdr *, m, off, sizeof(*mldh)); - if (mldh == NULL) { - icmp6stat.icp6s_tooshort++; - return; +mli_remref(struct mld_ifinfo *mli) +{ + struct ifnet *ifp; + + MLI_LOCK_SPIN(mli); + + if (mli->mli_refcnt == 0) { + panic("%s: mli=%p negative refcnt", __func__, mli); + /* NOTREACHED */ } -#endif - /* source address validation */ - ip6 = mtod(m, struct ip6_hdr *);/* in case mpullup */ - if (!IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) { - log(LOG_ERR, - "mld6_input: src %s is not link-local (grp=%s)\n", - ip6_sprintf(&ip6->ip6_src), - ip6_sprintf(&mldh->mld6_addr)); - /* - * spec (RFC2710) does not explicitly - * specify to discard the packet from a non link-local - * source address. But we believe it's expected to do so. - * XXX: do we have to allow :: as source? - */ - m_freem(m); + --mli->mli_refcnt; + if (mli->mli_refcnt > 0) { + MLI_UNLOCK(mli); return; } + ifp = mli->mli_ifp; + mli->mli_ifp = NULL; + IF_DRAIN(&mli->mli_gq); + IF_DRAIN(&mli->mli_v1q); + mld_flush_relq(mli); + VERIFY(SLIST_EMPTY(&mli->mli_relinmhead)); + MLI_UNLOCK(mli); + + MLD_PRINTF(("%s: freeing mld_ifinfo for ifp %p(%s%d)\n", + __func__, ifp, ifp->if_name, ifp->if_unit)); + + mli_free(mli); +} + +/* + * Process a received MLDv1 general or address-specific query. + * Assumes that the query header has been pulled up to sizeof(mld_hdr). + * + * NOTE: Can't be fully const correct as we temporarily embed scope ID in + * mld_addr. This is OK as we own the mbuf chain. + */ +static int +mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, + /*const*/ struct mld_hdr *mld) +{ + struct mld_ifinfo *mli; + struct in6_multi *inm; + int is_general_query; + uint16_t timer; + + is_general_query = 0; + + if (!mld_v1enable) { + MLD_PRINTF(("ignore v1 query %s on ifp %p(%s%d)\n", + ip6_sprintf(&mld->mld_addr), + ifp, ifp->if_name, ifp->if_unit)); + return (0); + } + /* - * In the MLD6 specification, there are 3 states and a flag. - * - * In Non-Listener state, we simply don't have a membership record. - * In Delaying Listener state, our timer is running (in6m->in6m_timer) - * In Idle Listener state, our timer is not running (in6m->in6m_timer==0) - * - * The flag is in6m->in6m_state, it is set to MLD6_OTHERLISTENER if - * we have heard a report from another member, or MLD6_IREPORTEDLAST - * if we sent the last report. + * RFC3810 Section 6.2: MLD queries must originate from + * a router's link-local address. */ - switch(mldh->mld6_type) { - case MLD6_LISTENER_QUERY: - if (ifp->if_flags & IFF_LOOPBACK) - break; - - if (!IN6_IS_ADDR_UNSPECIFIED(&mldh->mld6_addr) && - !IN6_IS_ADDR_MULTICAST(&mldh->mld6_addr)) - break; /* print error or log stat? */ - if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld6_addr)) - mldh->mld6_addr.s6_addr16[1] = - htons(ifp->if_index); /* XXX */ + if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { + MLD_PRINTF(("ignore v1 query src %s on ifp %p(%s%d)\n", + ip6_sprintf(&ip6->ip6_src), + ifp, ifp->if_name, ifp->if_unit)); + return (0); + } + /* + * Do address field validation upfront before we accept + * the query. + */ + if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) { /* - * - Start the timers in all of our membership records - * that the query applies to for the interface on - * which the query arrived excl. 
those that belong - * to the "all-nodes" group (ff02::1). - * - Restart any timer that is already running but has - * A value longer than the requested timeout. - * - Use the value specified in the query message as - * the maximum timeout. + * MLDv1 General Query. + * If this was not sent to the all-nodes group, ignore it. */ - ifnet_lock_exclusive(ifp); - IFP_TO_IA6(ifp, ia); - if (ia == NULL) - break; + struct in6_addr dst; + dst = ip6->ip6_dst; + in6_clearscope(&dst); + if (!IN6_ARE_ADDR_EQUAL(&dst, &in6addr_linklocal_allnodes)) + return (EINVAL); + is_general_query = 1; + } else { /* - * XXX: System timer resolution is too low to handle Max - * Response Delay, so set 1 to the internal timer even if - * the calculated value equals to zero when Max Response - * Delay is positive. + * Embed scope ID of receiving interface in MLD query for + * lookup whilst we don't hold other locks. */ - timer = ntohs(mldh->mld6_maxdelay)*PR_FASTHZ/MLD6_TIMER_SCALE; - if (timer == 0 && mldh->mld6_maxdelay) - timer = 1; - mld6_all_nodes_linklocal.s6_addr16[1] = - htons(ifp->if_index); /* XXX */ - - LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) - { - if (ifma->ifma_addr->sa_family != AF_INET6) - continue; - in6m = (struct in6_multi *)ifma->ifma_protospec; - if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, - &mld6_all_nodes_linklocal) || - IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) < - IPV6_ADDR_SCOPE_LINKLOCAL) - continue; + in6_setscope(&mld->mld_addr, ifp, NULL); + } - if (IN6_IS_ADDR_UNSPECIFIED(&mldh->mld6_addr) || - IN6_ARE_ADDR_EQUAL(&mldh->mld6_addr, - &in6m->in6m_addr)) - { - if (timer == 0) { - /* send a report immediately */ - mld6_sendpkt(in6m, MLD6_LISTENER_REPORT, - NULL); - in6m->in6m_timer = 0; /* reset timer */ - in6m->in6m_state = MLD6_IREPORTEDLAST; - } - else if (in6m->in6m_timer == 0 || /*idle state*/ - in6m->in6m_timer > timer) { - in6m->in6m_timer = - MLD6_RANDOM_DELAY(timer); - mld6_timers_are_running = 1; - } - } - } - ifnet_lock_done(ifp); + /* + * Switch to MLDv1 host compatibility mode. + */ + mli = MLD_IFINFO(ifp); + VERIFY(mli != NULL); - if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld6_addr)) - mldh->mld6_addr.s6_addr16[1] = 0; /* XXX */ - break; - case MLD6_LISTENER_REPORT: - /* - * For fast leave to work, we have to know that we are the - * last person to send a report for this group. Reports - * can potentially get looped back if we are a multicast - * router, so discard reports sourced by me. - * Note that it is impossible to check IFF_LOOPBACK flag of - * ifp for this purpose, since ip6_mloopback pass the physical - * interface to looutput. - */ - if (m->m_flags & M_LOOP) /* XXX: grotty flag, but efficient */ - break; + MLI_LOCK(mli); + mld_set_version(mli, MLD_VERSION_1); + MLI_UNLOCK(mli); - if (!IN6_IS_ADDR_MULTICAST(&mldh->mld6_addr)) - break; + timer = (ntohs(mld->mld_maxdelay) * PR_SLOWHZ) / MLD_TIMER_SCALE; + if (timer == 0) + timer = 1; + + if (is_general_query) { + struct in6_multistep step; - if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld6_addr)) - mldh->mld6_addr.s6_addr16[1] = - htons(ifp->if_index); /* XXX */ + MLD_PRINTF(("process v1 general query on ifp %p(%s%d)\n", + ifp, ifp->if_name, ifp->if_unit)); /* - * If we belong to the group being reported, stop - * our timer for that group. + * For each reporting group joined on this + * interface, kick the report timer. 
*/ - ifnet_lock_shared(ifp); - IN6_LOOKUP_MULTI(mldh->mld6_addr, ifp, in6m); - if (in6m) { - in6m->in6m_timer = 0; /* transit to idle state */ - in6m->in6m_state = MLD6_OTHERLISTENER; /* clear flag */ + in6_multihead_lock_shared(); + IN6_FIRST_MULTI(step, inm); + while (inm != NULL) { + IN6M_LOCK(inm); + if (inm->in6m_ifp == ifp) + mld_v1_update_group(inm, timer); + IN6M_UNLOCK(inm); + IN6_NEXT_MULTI(step, inm); } - ifnet_lock_done(ifp); + in6_multihead_lock_done(); + } else { + /* + * MLDv1 Group-Specific Query. + * If this is a group-specific MLDv1 query, we need only + * look up the single group to process it. + */ + in6_multihead_lock_shared(); + IN6_LOOKUP_MULTI(&mld->mld_addr, ifp, inm); + in6_multihead_lock_done(); - if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld6_addr)) - mldh->mld6_addr.s6_addr16[1] = 0; /* XXX */ - break; - default: /* this is impossible */ - log(LOG_ERR, "mld6_input: illegal type(%d)", mldh->mld6_type); - break; + if (inm != NULL) { + IN6M_LOCK(inm); + MLD_PRINTF(("process v1 query %s on ifp %p(%s%d)\n", + ip6_sprintf(&mld->mld_addr), + ifp, ifp->if_name, ifp->if_unit)); + mld_v1_update_group(inm, timer); + IN6M_UNLOCK(inm); + IN6M_REMREF(inm); /* from IN6_LOOKUP_MULTI */ + } + /* XXX Clear embedded scope ID as userland won't expect it. */ + in6_clearscope(&mld->mld_addr); } - m_freem(m); + return (0); } -void -mld6_fasttimeo() +/* + * Update the report timer on a group in response to an MLDv1 query. + * + * If we are becoming the reporting member for this group, start the timer. + * If we already are the reporting member for this group, and timer is + * below the threshold, reset it. + * + * We may be updating the group for the first time since we switched + * to MLDv2. If we are, then we must clear any recorded source lists, + * and transition to REPORTING state; the group timer is overloaded + * for group and group-source query responses. + * + * Unlike MLDv2, the delay per group should be jittered + * to avoid bursts of MLDv1 reports. + */ +static void +mld_v1_update_group(struct in6_multi *inm, const int timer) { - struct in6_multi *in6m; - struct in6_multistep step; + IN6M_LOCK_ASSERT_HELD(inm); - /* - * Quick check to see if any work needs to be done, in order - * to minimize the overhead of fasttimo processing. 
- */ - if (!mld6_timers_are_running) - return; + MLD_PRINTF(("%s: %s/%s%d timer=%d\n", __func__, + ip6_sprintf(&inm->in6m_addr), + inm->in6m_ifp->if_name, inm->in6m_ifp->if_unit, timer)); - lck_mtx_lock(nd6_mutex); - mld6_timers_are_running = 0; - IN6_FIRST_MULTI(step, in6m); - while (in6m != NULL) { - if (in6m->in6m_timer == 0) { - /* do nothing */ - } else if (--in6m->in6m_timer == 0) { - mld6_sendpkt(in6m, MLD6_LISTENER_REPORT, NULL); - in6m->in6m_state = MLD6_IREPORTEDLAST; - } else { - mld6_timers_are_running = 1; + switch (inm->in6m_state) { + case MLD_NOT_MEMBER: + case MLD_SILENT_MEMBER: + break; + case MLD_REPORTING_MEMBER: + if (inm->in6m_timer != 0 && + inm->in6m_timer <= timer) { + MLD_PRINTF(("%s: REPORTING and timer running, " + "skipping.\n", __func__)); + break; } - IN6_NEXT_MULTI(step, in6m); + /* FALLTHROUGH */ + case MLD_SG_QUERY_PENDING_MEMBER: + case MLD_G_QUERY_PENDING_MEMBER: + case MLD_IDLE_MEMBER: + case MLD_LAZY_MEMBER: + case MLD_AWAKENING_MEMBER: + MLD_PRINTF(("%s: ->REPORTING\n", __func__)); + inm->in6m_state = MLD_REPORTING_MEMBER; + inm->in6m_timer = MLD_RANDOM_DELAY(timer); + current_state_timers_running6 = 1; + break; + case MLD_SLEEPING_MEMBER: + MLD_PRINTF(("%s: ->AWAKENING\n", __func__)); + inm->in6m_state = MLD_AWAKENING_MEMBER; + break; + case MLD_LEAVING_MEMBER: + break; } - lck_mtx_unlock(nd6_mutex); } -static void -mld6_sendpkt( - struct in6_multi *in6m, - int type, - const struct in6_addr *dst) +/* + * Process a received MLDv2 general, group-specific or + * group-and-source-specific query. + * + * Assumes that the query header has been pulled up to sizeof(mldv2_query). + * + * Return 0 if successful, otherwise an appropriate error code is returned. + */ +static int +mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6, + struct mbuf *m, const int off, const int icmp6len) { - struct mbuf *mh, *md; - struct mld6_hdr *mldh; - struct ip6_hdr *ip6; - struct ip6_moptions im6o; - struct in6_ifaddr *ia; - struct ifnet *ifp = in6m->in6m_ifp; - struct ifnet *outif = NULL; + struct mld_ifinfo *mli; + struct mldv2_query *mld; + struct in6_multi *inm; + uint32_t maxdelay, nsrc, qqi; + int is_general_query; + uint16_t timer; + uint8_t qrv; - /* - * At first, find a link local address on the outgoing interface - * to use as the source address of the MLD packet. - */ - if ((ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST)) - == NULL) - return; + is_general_query = 0; /* - * Allocate mbufs to store ip6 header and MLD header. - * We allocate 2 mbufs and make chain in advance because - * it is more convenient when inserting the hop-by-hop option later. + * RFC3810 Section 6.2: MLD queries must originate from + * a router's link-local address. 
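For a sense of scale in the MLDv1 timer conversion above: the wire field is in milliseconds, PR_SLOWHZ is 2 ticks per second (the 500 ms slow timeout noted in ip6protosw.h earlier in this patch), and MLD_TIMER_SCALE plays the role of the old MLD6_TIMER_SCALE of 1000 removed above (its exact value is not shown in this hunk). Under those assumptions:

	/* mld_maxdelay = 10000 ms from an MLDv1 query: */
	timer = (10000 * PR_SLOWHZ) / MLD_TIMER_SCALE;	/* = 20 ticks = 10 s at 2 Hz */
	if (timer == 0)
		timer = 1;	/* sub-tick delays round up rather than vanish */

MLD_RANDOM_DELAY() then jitters the per-group timer within that bound, as the comment on mld_v1_update_group() requires.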
*/ - MGETHDR(mh, M_DONTWAIT, MT_HEADER); - if (mh == NULL) { - ifafree(&ia->ia_ifa); - return; - } - MGET(md, M_DONTWAIT, MT_DATA); - if (md == NULL) { - m_free(mh); - ifafree(&ia->ia_ifa); - return; + if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) { + MLD_PRINTF(("ignore v1 query src %s on ifp %p(%s%d)\n", + ip6_sprintf(&ip6->ip6_src), + ifp, ifp->if_name, ifp->if_unit)); + return (0); } - mh->m_next = md; - mh->m_pkthdr.rcvif = NULL; -#ifdef __darwin8_notyet -#if CONFIG_MACF_NET - mac_create_mbuf_linklayer(in6m->in6m_ifp, m); -#endif -#endif - mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld6_hdr); - mh->m_len = sizeof(struct ip6_hdr); - MH_ALIGN(mh, sizeof(struct ip6_hdr)); + MLD_PRINTF(("input v2 query on ifp %p(%s%d)\n", ifp, ifp->if_name, + ifp->if_unit)); - /* fill in the ip6 header */ - ip6 = mtod(mh, struct ip6_hdr *); - ip6->ip6_flow = 0; - ip6->ip6_vfc &= ~IPV6_VERSION_MASK; - ip6->ip6_vfc |= IPV6_VERSION; - /* ip6_plen will be set later */ - ip6->ip6_nxt = IPPROTO_ICMPV6; - /* ip6_hlim will be set by im6o.im6o_multicast_hlim */ - ip6->ip6_src = ia->ia_addr.sin6_addr; - ip6->ip6_dst = dst ? *dst : in6m->in6m_addr; - - /* fill in the MLD header */ - md->m_len = sizeof(struct mld6_hdr); - mldh = mtod(md, struct mld6_hdr *); - mldh->mld6_type = type; - mldh->mld6_code = 0; - mldh->mld6_cksum = 0; - /* XXX: we assume the function will not be called for query messages */ - mldh->mld6_maxdelay = 0; - mldh->mld6_reserved = 0; - mldh->mld6_addr = in6m->in6m_addr; - if (IN6_IS_ADDR_MC_LINKLOCAL(&mldh->mld6_addr)) - mldh->mld6_addr.s6_addr16[1] = 0; /* XXX */ - mldh->mld6_cksum = in6_cksum(mh, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), - sizeof(struct mld6_hdr)); - - /* construct multicast option */ - bzero(&im6o, sizeof(im6o)); - im6o.im6o_multicast_ifp = ifp; - im6o.im6o_multicast_hlim = 1; - - /* - * Request loopback of the report if we are acting as a multicast - * router, so that the process-level routing daemon can hear it. - */ -#if MROUTING - im6o.im6o_multicast_loop = (ip6_mrouter != NULL); -#else - im6o.im6o_multicast_loop = 0; -#endif + mld = (struct mldv2_query *)(mtod(m, uint8_t *) + off); - /* increment output statictics */ - icmp6stat.icp6s_outhist[type]++; + maxdelay = ntohs(mld->mld_maxdelay); /* in 1/10ths of a second */ + if (maxdelay >= 32678) { + maxdelay = (MLD_MRC_MANT(maxdelay) | 0x1000) << + (MLD_MRC_EXP(maxdelay) + 3); + } + timer = (maxdelay * PR_SLOWHZ) / MLD_TIMER_SCALE; + if (timer == 0) + timer = 1; - ip6_output(mh, &ip6_opts, NULL, 0, &im6o, &outif, 0); - if (outif) { - icmp6_ifstat_inc(outif, ifs6_out_msg); - switch (type) { - case MLD6_LISTENER_QUERY: - icmp6_ifstat_inc(outif, ifs6_out_mldquery); - break; - case MLD6_LISTENER_REPORT: - icmp6_ifstat_inc(outif, ifs6_out_mldreport); - break; - case MLD6_LISTENER_DONE: - icmp6_ifstat_inc(outif, ifs6_out_mlddone); - break; - } + qrv = MLD_QRV(mld->mld_misc); + if (qrv < 2) { + MLD_PRINTF(("%s: clamping qrv %d to %d\n", __func__, + qrv, MLD_RV_INIT)); + qrv = MLD_RV_INIT; } - ifafree(&ia->ia_ifa); -} + qqi = mld->mld_qqi; + if (qqi >= 128) { + qqi = MLD_QQIC_MANT(mld->mld_qqi) << + (MLD_QQIC_EXP(mld->mld_qqi) + 3); + } + + nsrc = ntohs(mld->mld_numsrc); + if (nsrc > MLD_MAX_GS_SOURCES) + return (EMSGSIZE); + if (icmp6len < sizeof(struct mldv2_query) + + (nsrc * sizeof(struct in6_addr))) + return (EMSGSIZE); + + /* + * Do further input validation upfront to avoid resetting timers + * should we need to discard this query. 
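The maxdelay decode earlier in mld_v2_input_query() implements the floating-point form of the Max Response Code from RFC 3810 Section 5.1.3: values with the high bit set (i.e. >= 32768, 0x8000) carry a 3-bit exponent in bits 14..12 and a 12-bit mantissa, decoded as (mant | 0x1000) << (exp + 3); MLD_MRC_MANT() and MLD_MRC_EXP() are assumed here to extract exactly those fields. A worked example:

	uint32_t mrc = 0x9400;	/* high bit set: exp = 1, mant = 0x400 */
	uint32_t delay = (MLD_MRC_MANT(mrc) | 0x1000) << (MLD_MRC_EXP(mrc) + 3);
	/* (0x0400 | 0x1000) << 4 = 0x14000 = 81920 time units */

At the boundary, an encoded 0x8000 decodes to 0x1000 << 3 = 32768, so the exponential range continues exactly where the linear range ends. The same mantissa/exponent scheme, via MLD_QQIC_MANT()/MLD_QQIC_EXP(), is applied to the QQI field when it is 128 or greater.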
+ */
+	if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) {
+		/*
+		 * General Queries SHOULD be directed to ff02::1.
+		 * A general query with a source list has undefined
+		 * behaviour; discard it.
+		 */
+		struct in6_addr dst;
+
+		dst = ip6->ip6_dst;
+		in6_clearscope(&dst);
+		if (!IN6_ARE_ADDR_EQUAL(&dst, &in6addr_linklocal_allnodes) ||
+		    nsrc > 0)
+			return (EINVAL);
+		is_general_query = 1;
+	} else {
+		/*
+		 * Embed scope ID of receiving interface in MLD query for
+		 * lookup whilst we don't hold other locks (due to KAME
+		 * locking lameness).  We own this mbuf chain just now.
+		 */
+		in6_setscope(&mld->mld_addr, ifp, NULL);
+	}
+
+	mli = MLD_IFINFO(ifp);
+	VERIFY(mli != NULL);
+
+	MLI_LOCK(mli);
+	/*
+	 * Discard the v2 query if we're in Compatibility Mode.
+	 * The RFC is pretty clear that hosts need to stay in MLDv1 mode
+	 * until the Old Version Querier Present timer expires.
+	 */
+	if (mli->mli_version != MLD_VERSION_2) {
+		MLI_UNLOCK(mli);
+		return (0);
+	}
+
+	mld_set_version(mli, MLD_VERSION_2);
+	mli->mli_rv = qrv;
+	mli->mli_qi = qqi;
+	mli->mli_qri = maxdelay;
+
+	MLD_PRINTF(("%s: qrv %d qi %d maxdelay %d\n", __func__, qrv, qqi,
+	    maxdelay));
+
+	if (is_general_query) {
+		/*
+		 * MLDv2 General Query.
+		 *
+		 * Schedule a current-state report on this ifp for
+		 * all groups, possibly containing source lists.
+		 *
+		 * If there is a pending General Query response
+		 * scheduled earlier than the selected delay, do
+		 * not schedule any other reports.
+		 * Otherwise, reset the interface timer.
+		 */
+		MLD_PRINTF(("process v2 general query on ifp %p(%s%d)\n",
+		    ifp, ifp->if_name, ifp->if_unit));
+		if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer) {
+			mli->mli_v2_timer = MLD_RANDOM_DELAY(timer);
+			interface_timers_running6 = 1;
+		}
+		MLI_UNLOCK(mli);
+	} else {
+		MLI_UNLOCK(mli);
+		/*
+		 * MLDv2 Group-specific or Group-and-source-specific Query.
+		 *
+		 * Group-source-specific queries are throttled on
+		 * a per-group basis to defeat denial-of-service attempts.
+		 * Queries for groups we are not a member of on this
+		 * link are simply ignored.
+		 */
+		in6_multihead_lock_shared();
+		IN6_LOOKUP_MULTI(&mld->mld_addr, ifp, inm);
+		in6_multihead_lock_done();
+		if (inm == NULL)
+			return (0);
+
+		IN6M_LOCK(inm);
+#ifndef __APPLE__
+		/* TODO: need ratecheck equivalent */
+		if (nsrc > 0) {
+			if (!ratecheck(&inm->in6m_lastgsrtv,
+			    &mld_gsrdelay)) {
+				MLD_PRINTF(("%s: GS query throttled.\n",
+				    __func__));
+				IN6M_UNLOCK(inm);
+				IN6M_REMREF(inm); /* from IN6_LOOKUP_MULTI */
+				return (0);
+			}
+		}
+#endif
+		MLD_PRINTF(("process v2 group query on ifp %p(%s%d)\n",
+		    ifp, ifp->if_name, ifp->if_unit));
+		/*
+		 * If there is a pending General Query response
+		 * scheduled sooner than the selected delay, no
+		 * further report need be scheduled.
+		 * Otherwise, prepare to respond to the
+		 * group-specific or group-and-source query.
+		 */
+		MLI_LOCK(mli);
+		if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer) {
+			MLI_UNLOCK(mli);
+			mld_v2_process_group_query(inm, timer, m, off);
+		} else {
+			MLI_UNLOCK(mli);
+		}
+		IN6M_UNLOCK(inm);
+		IN6M_REMREF(inm); /* from IN6_LOOKUP_MULTI */
+		/* XXX Clear embedded scope ID as userland won't expect it. */
+		in6_clearscope(&mld->mld_addr);
+	}
+
+	return (0);
+}
+
+/*
+ * Process a received MLDv2 group-specific or group-and-source-specific
+ * query.
+ * Return <0 if any error occurred.  Currently this is ignored.
+ */
+static int
+mld_v2_process_group_query(struct in6_multi *inm, int timer, struct mbuf *m0,
+    const int off)
+{
+	struct mldv2_query *mld;
+	int retval;
+	uint16_t nsrc;
+
+	IN6M_LOCK_ASSERT_HELD(inm);
+
+	retval = 0;
+	mld = (struct mldv2_query *)(mtod(m0, uint8_t *) + off);
+
+	switch (inm->in6m_state) {
+	case MLD_NOT_MEMBER:
+	case MLD_SILENT_MEMBER:
+	case MLD_SLEEPING_MEMBER:
+	case MLD_LAZY_MEMBER:
+	case MLD_AWAKENING_MEMBER:
+	case MLD_IDLE_MEMBER:
+	case MLD_LEAVING_MEMBER:
+		return (retval);
+		break;
+	case MLD_REPORTING_MEMBER:
+	case MLD_G_QUERY_PENDING_MEMBER:
+	case MLD_SG_QUERY_PENDING_MEMBER:
+		break;
+	}
+
+	nsrc = ntohs(mld->mld_numsrc);
+
+	/*
+	 * Deal with group-specific queries upfront.
+	 * If any group query is already pending, purge any recorded
+	 * source-list state if it exists, and schedule a query response
+	 * for this group-specific query.
+	 */
+	if (nsrc == 0) {
+		if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER ||
+		    inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER) {
+			in6m_clear_recorded(inm);
+			timer = min(inm->in6m_timer, timer);
+		}
+		inm->in6m_state = MLD_G_QUERY_PENDING_MEMBER;
+		inm->in6m_timer = MLD_RANDOM_DELAY(timer);
+		current_state_timers_running6 = 1;
+		return (retval);
+	}
+
+	/*
+	 * Deal with the case where a group-and-source-specific query has
+	 * been received but a group-specific query is already pending.
+	 */
+	if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER) {
+		timer = min(inm->in6m_timer, timer);
+		inm->in6m_timer = MLD_RANDOM_DELAY(timer);
+		current_state_timers_running6 = 1;
+		return (retval);
+	}
+
+	/*
+	 * Finally, deal with the case where a group-and-source-specific
+	 * query has been received, where a response to a previous g-s-r
+	 * query exists, or none exists.
+	 * In this case, we need to parse the source-list which the Querier
+	 * has provided us with and check if we have any source list filter
+	 * entries at T1 for these sources.  If we do not, there is no need
+	 * to schedule a report and the query may be dropped.
+	 * If we do, we must record them and schedule a current-state
+	 * report for those sources.
+	 */
+	if (inm->in6m_nsrc > 0) {
+		struct mbuf *m;
+		uint8_t *sp;
+		int i, nrecorded;
+		int soff;
+
+		m = m0;
+		soff = off + sizeof(struct mldv2_query);
+		nrecorded = 0;
+		for (i = 0; i < nsrc; i++) {
+			sp = mtod(m, uint8_t *) + soff;
+			retval = in6m_record_source(inm,
+			    (const struct in6_addr *)sp);
+			if (retval < 0)
+				break;
+			nrecorded += retval;
+			soff += sizeof(struct in6_addr);
+			if (soff >= m->m_len) {
+				soff = soff - m->m_len;
+				m = m->m_next;
+				if (m == NULL)
+					break;
+			}
+		}
+		if (nrecorded > 0) {
+			MLD_PRINTF(("%s: schedule response to SG query\n",
+			    __func__));
+			inm->in6m_state = MLD_SG_QUERY_PENDING_MEMBER;
+			inm->in6m_timer = MLD_RANDOM_DELAY(timer);
+			current_state_timers_running6 = 1;
+		}
+	}
+
+	return (retval);
+}
+
+/*
+ * Process a received MLDv1 host membership report.
+ * Assumes mld points to mld_hdr in pulled up mbuf chain.
+ *
+ * NOTE: Can't be fully const correct as we temporarily embed scope ID in
+ * mld_addr.  This is OK as we own the mbuf chain.
+ */ +static int +mld_v1_input_report(struct ifnet *ifp, const struct ip6_hdr *ip6, + /*const*/ struct mld_hdr *mld) +{ + struct in6_addr src, dst; + struct in6_ifaddr *ia; + struct in6_multi *inm; + + if (!mld_v1enable) { + MLD_PRINTF(("ignore v1 report %s on ifp %p(%s%d)\n", + ip6_sprintf(&mld->mld_addr), + ifp, ifp->if_name, ifp->if_unit)); + return (0); + } + + if (ifp->if_flags & IFF_LOOPBACK) + return (0); + + /* + * MLDv1 reports must originate from a host's link-local address, + * or the unspecified address (when booting). + */ + src = ip6->ip6_src; + in6_clearscope(&src); + if (!IN6_IS_SCOPE_LINKLOCAL(&src) && !IN6_IS_ADDR_UNSPECIFIED(&src)) { + MLD_PRINTF(("ignore v1 query src %s on ifp %p(%s%d)\n", + ip6_sprintf(&ip6->ip6_src), + ifp, ifp->if_name, ifp->if_unit)); + return (EINVAL); + } + + /* + * RFC2710 Section 4: MLDv1 reports must pertain to a multicast + * group, and must be directed to the group itself. + */ + dst = ip6->ip6_dst; + in6_clearscope(&dst); + if (!IN6_IS_ADDR_MULTICAST(&mld->mld_addr) || + !IN6_ARE_ADDR_EQUAL(&mld->mld_addr, &dst)) { + MLD_PRINTF(("ignore v1 query dst %s on ifp %p(%s%d)\n", + ip6_sprintf(&ip6->ip6_dst), + ifp, ifp->if_name, ifp->if_unit)); + return (EINVAL); + } + + /* + * Make sure we don't hear our own membership report, as fast + * leave requires knowing that we are the only member of a + * group. Assume we used the link-local address if available, + * otherwise look for ::. + * + * XXX Note that scope ID comparison is needed for the address + * returned by in6ifa_ifpforlinklocal(), but SHOULD NOT be + * performed for the on-wire address. + */ + ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); + if (ia != NULL) { + IFA_LOCK(&ia->ia_ifa); + if ((IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, IA6_IN6(ia)))){ + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); + return (0); + } + IFA_UNLOCK(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); + } else if (IN6_IS_ADDR_UNSPECIFIED(&src)) { + return (0); + } + + MLD_PRINTF(("process v1 report %s on ifp %p(%s%d)\n", + ip6_sprintf(&mld->mld_addr), ifp, ifp->if_name, ifp->if_unit)); + + /* + * Embed scope ID of receiving interface in MLD query for lookup + * whilst we don't hold other locks (due to KAME locking lameness). + */ + if (!IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) + in6_setscope(&mld->mld_addr, ifp, NULL); + + /* + * MLDv1 report suppression. + * If we are a member of this group, and our membership should be + * reported, and our group timer is pending or about to be reset, + * stop our group timer by transitioning to the 'lazy' state. + */ + in6_multihead_lock_shared(); + IN6_LOOKUP_MULTI(&mld->mld_addr, ifp, inm); + in6_multihead_lock_done(); + + if (inm != NULL) { + struct mld_ifinfo *mli; + + IN6M_LOCK(inm); + mli = inm->in6m_mli; + VERIFY(mli != NULL); + + MLI_LOCK(mli); + /* + * If we are in MLDv2 host mode, do not allow the + * other host's MLDv1 report to suppress our reports. 
+ */
+ if (mli->mli_version == MLD_VERSION_2) {
+ MLI_UNLOCK(mli);
+ IN6M_UNLOCK(inm);
+ IN6M_REMREF(inm); /* from IN6_LOOKUP_MULTI */
+ goto out;
+ }
+ MLI_UNLOCK(mli);
+
+ inm->in6m_timer = 0;
+
+ switch (inm->in6m_state) {
+ case MLD_NOT_MEMBER:
+ case MLD_SILENT_MEMBER:
+ case MLD_SLEEPING_MEMBER:
+ break;
+ case MLD_REPORTING_MEMBER:
+ case MLD_IDLE_MEMBER:
+ case MLD_AWAKENING_MEMBER:
+ MLD_PRINTF(("report suppressed for %s on ifp %p(%s%d)\n",
+ ip6_sprintf(&mld->mld_addr),
+ ifp, ifp->if_name, ifp->if_unit));
+ /* FALLTHROUGH */
+ case MLD_LAZY_MEMBER:
+ inm->in6m_state = MLD_LAZY_MEMBER;
+ break;
+ case MLD_G_QUERY_PENDING_MEMBER:
+ case MLD_SG_QUERY_PENDING_MEMBER:
+ case MLD_LEAVING_MEMBER:
+ break;
+ }
+ IN6M_UNLOCK(inm);
+ IN6M_REMREF(inm); /* from IN6_LOOKUP_MULTI */
+ }
+
+out:
+ /* XXX Clear embedded scope ID as userland won't expect it. */
+ in6_clearscope(&mld->mld_addr);
+
+ return (0);
+}
+
+/*
+ * MLD input path.
+ *
+ * Assume query messages which fit in a single ICMPv6 message header
+ * have been pulled up.
+ * Assume that userland will want to see the message, even if it
+ * otherwise fails kernel input validation; do not free it.
+ * Pullup may, however, free the mbuf chain m if it fails.
+ *
+ * Return IPPROTO_DONE if we freed m. Otherwise, return 0.
+ */
+int
+mld_input(struct mbuf *m, int off, int icmp6len)
+{
+ struct ifnet *ifp;
+ struct ip6_hdr *ip6;
+ struct mld_hdr *mld;
+ int mldlen;
+
+ MLD_PRINTF(("%s: called w/mbuf (%p,%d)\n", __func__, m, off));
+
+ ifp = m->m_pkthdr.rcvif;
+
+ ip6 = mtod(m, struct ip6_hdr *);
+
+ /* Pullup to appropriate size. */
+ mld = (struct mld_hdr *)(mtod(m, uint8_t *) + off);
+ if (mld->mld_type == MLD_LISTENER_QUERY &&
+ icmp6len >= sizeof(struct mldv2_query)) {
+ mldlen = sizeof(struct mldv2_query);
+ } else {
+ mldlen = sizeof(struct mld_hdr);
+ }
+ IP6_EXTHDR_GET(mld, struct mld_hdr *, m, off, mldlen);
+ if (mld == NULL) {
+ icmp6stat.icp6s_badlen++;
+ return (IPPROTO_DONE);
+ }
+
+ /*
+ * Userland needs to see all of this traffic for implementing
+ * the endpoint discovery portion of multicast routing.
+ */
+ switch (mld->mld_type) {
+ case MLD_LISTENER_QUERY:
+ icmp6_ifstat_inc(ifp, ifs6_in_mldquery);
+ if (icmp6len == sizeof(struct mld_hdr)) {
+ if (mld_v1_input_query(ifp, ip6, mld) != 0)
+ return (0);
+ } else if (icmp6len >= sizeof(struct mldv2_query)) {
+ if (mld_v2_input_query(ifp, ip6, m, off,
+ icmp6len) != 0)
+ return (0);
+ }
+ break;
+ case MLD_LISTENER_REPORT:
+ icmp6_ifstat_inc(ifp, ifs6_in_mldreport);
+ if (mld_v1_input_report(ifp, ip6, mld) != 0)
+ return (0);
+ break;
+ case MLDV2_LISTENER_REPORT:
+ icmp6_ifstat_inc(ifp, ifs6_in_mldreport);
+ break;
+ case MLD_LISTENER_DONE:
+ icmp6_ifstat_inc(ifp, ifs6_in_mlddone);
+ break;
+ default:
+ break;
+ }
+
+ return (0);
+}
+
+/*
+ * MLD6 slowtimo handler.
+ * Combines both the slow and fast timers into one. We lose some
+ * responsiveness, but this allows the system to avoid having a
+ * pr_fasttimo, thus allowing for power savings.
+ */
+void
+mld_slowtimo(void)
+{
+ struct ifqueue scq; /* State-change packets */
+ struct ifqueue qrq; /* Query response packets */
+ struct ifnet *ifp;
+ struct mld_ifinfo *mli;
+ struct in6_multi *inm;
+ int uri_fasthz = 0;
+
+ MLD_LOCK();
+
+ LIST_FOREACH(mli, &mli_head, mli_link) {
+ MLI_LOCK(mli);
+ mld_v1_process_querier_timers(mli);
+ MLI_UNLOCK(mli);
+ }
+
+ /*
+ * Quick check to see if any work needs to be done, in order to
+ * minimize the overhead of fasttimo processing.
+ */ + if (!current_state_timers_running6 && + !interface_timers_running6 && + !state_change_timers_running6) { + MLD_UNLOCK(); + return; + } + + /* + * MLDv2 General Query response timer processing. + */ + if (interface_timers_running6) { +#if 0 + MLD_PRINTF(("%s: interface timers running\n", __func__)); +#endif + interface_timers_running6 = 0; + LIST_FOREACH(mli, &mli_head, mli_link) { + MLI_LOCK(mli); + if (mli->mli_v2_timer == 0) { + /* Do nothing. */ + } else if (--mli->mli_v2_timer == 0) { + mld_v2_dispatch_general_query(mli); + } else { + interface_timers_running6 = 1; + } + MLI_UNLOCK(mli); + } + } + + if (!current_state_timers_running6 && + !state_change_timers_running6) + goto out_locked; + + current_state_timers_running6 = 0; + state_change_timers_running6 = 0; +#if 0 + MLD_PRINTF(("%s: state change timers running\n", __func__)); +#endif + + memset(&qrq, 0, sizeof(struct ifqueue)); + qrq.ifq_maxlen = MLD_MAX_G_GS_PACKETS; + + memset(&scq, 0, sizeof(struct ifqueue)); + scq.ifq_maxlen = MLD_MAX_STATE_CHANGE_PACKETS; + + /* + * MLD host report and state-change timer processing. + * Note: Processing a v2 group timer may remove a node. + */ + LIST_FOREACH(mli, &mli_head, mli_link) { + struct in6_multistep step; + + MLI_LOCK(mli); + ifp = mli->mli_ifp; + uri_fasthz = MLD_RANDOM_DELAY(mli->mli_uri * PR_SLOWHZ); + MLI_UNLOCK(mli); + + in6_multihead_lock_shared(); + IN6_FIRST_MULTI(step, inm); + while (inm != NULL) { + IN6M_LOCK(inm); + if (inm->in6m_ifp != ifp) + goto next; + + MLI_LOCK(mli); + switch (mli->mli_version) { + case MLD_VERSION_1: + mld_v1_process_group_timer(inm, + mli->mli_version); + break; + case MLD_VERSION_2: + mld_v2_process_group_timers(mli, &qrq, + &scq, inm, uri_fasthz); + break; + } + MLI_UNLOCK(mli); +next: + IN6M_UNLOCK(inm); + IN6_NEXT_MULTI(step, inm); + } + in6_multihead_lock_done(); + + MLI_LOCK(mli); + if (mli->mli_version == MLD_VERSION_1) { + mld_dispatch_queue(mli, &mli->mli_v1q, 0); + } else if (mli->mli_version == MLD_VERSION_2) { + MLI_UNLOCK(mli); + mld_dispatch_queue(NULL, &qrq, 0); + mld_dispatch_queue(NULL, &scq, 0); + VERIFY(qrq.ifq_len == 0); + VERIFY(scq.ifq_len == 0); + MLI_LOCK(mli); + } + /* + * In case there are still any pending membership reports + * which didn't get drained at version change time. + */ + IF_DRAIN(&mli->mli_v1q); + /* + * Release all deferred inm records, and drain any locally + * enqueued packets; do it even if the current MLD version + * for the link is no longer MLDv2, in order to handle the + * version change case. + */ + mld_flush_relq(mli); + VERIFY(SLIST_EMPTY(&mli->mli_relinmhead)); + MLI_UNLOCK(mli); + + IF_DRAIN(&qrq); + IF_DRAIN(&scq); + } + +out_locked: + MLD_UNLOCK(); +} + +/* + * Free the in6_multi reference(s) for this MLD lifecycle. + * + * Caller must be holding mli_lock. 
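+ *
+ * (Editorial aside: the sketch below is an illustration added for this
+ * document, not part of the original change; "node" and "drain" are
+ * hypothetical names. mld_flush_relq() below uses a common drain
+ * idiom: unlink one element under the lock, drop the lock for the
+ * heavyweight per-element work, then retake it and rescan from the
+ * head, since the list may have changed while unlocked. A minimal
+ * userland restatement with pthreads:)
+ */
+#if 0 /* illustrative sketch only; not compiled */
+#include <pthread.h>
+#include <stdlib.h>
+
+struct node { struct node *next; };
+
+static void
+drain(struct node **head, pthread_mutex_t *mtx)
+{
+        struct node *n;
+
+        pthread_mutex_lock(mtx);
+        while ((n = *head) != NULL) {
+                *head = n->next;                /* unlink under the lock */
+                pthread_mutex_unlock(mtx);      /* the work may block or */
+                free(n);                        /* relock, so do it unlocked */
+                pthread_mutex_lock(mtx);        /* retake; rescan from head */
+        }
+        pthread_mutex_unlock(mtx);
+}
+#endif
+/*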
+ */ +static void +mld_flush_relq(struct mld_ifinfo *mli) +{ + struct in6_multi *inm; + +again: + MLI_LOCK_ASSERT_HELD(mli); + inm = SLIST_FIRST(&mli->mli_relinmhead); + if (inm != NULL) { + int lastref; + + SLIST_REMOVE_HEAD(&mli->mli_relinmhead, in6m_nrele); + MLI_UNLOCK(mli); + + in6_multihead_lock_exclusive(); + IN6M_LOCK(inm); + VERIFY(inm->in6m_nrelecnt != 0); + inm->in6m_nrelecnt--; + lastref = in6_multi_detach(inm); + VERIFY(!lastref || (!(inm->in6m_debug & IFD_ATTACHED) && + inm->in6m_reqcnt == 0)); + IN6M_UNLOCK(inm); + in6_multihead_lock_done(); + /* from mli_relinmhead */ + IN6M_REMREF(inm); + /* from in6_multihead_list */ + if (lastref) + IN6M_REMREF(inm); + + MLI_LOCK(mli); + goto again; + } +} + +/* + * Update host report group timer. + * Will update the global pending timer flags. + */ +static void +mld_v1_process_group_timer(struct in6_multi *inm, const int mld_version) +{ +#pragma unused(mld_version) + int report_timer_expired; + + IN6M_LOCK_ASSERT_HELD(inm); + MLI_LOCK_ASSERT_HELD(inm->in6m_mli); + + if (inm->in6m_timer == 0) { + report_timer_expired = 0; + } else if (--inm->in6m_timer == 0) { + report_timer_expired = 1; + } else { + current_state_timers_running6 = 1; + return; + } + + switch (inm->in6m_state) { + case MLD_NOT_MEMBER: + case MLD_SILENT_MEMBER: + case MLD_IDLE_MEMBER: + case MLD_LAZY_MEMBER: + case MLD_SLEEPING_MEMBER: + case MLD_AWAKENING_MEMBER: + break; + case MLD_REPORTING_MEMBER: + if (report_timer_expired) { + inm->in6m_state = MLD_IDLE_MEMBER; + (void) mld_v1_transmit_report(inm, + MLD_LISTENER_REPORT); + IN6M_LOCK_ASSERT_HELD(inm); + MLI_LOCK_ASSERT_HELD(inm->in6m_mli); + } + break; + case MLD_G_QUERY_PENDING_MEMBER: + case MLD_SG_QUERY_PENDING_MEMBER: + case MLD_LEAVING_MEMBER: + break; + } +} + +/* + * Update a group's timers for MLDv2. + * Will update the global pending timer flags. + * Note: Unlocked read from mli. + */ +static void +mld_v2_process_group_timers(struct mld_ifinfo *mli, + struct ifqueue *qrq, struct ifqueue *scq, + struct in6_multi *inm, const int uri_fasthz) +{ + int query_response_timer_expired; + int state_change_retransmit_timer_expired; + + IN6M_LOCK_ASSERT_HELD(inm); + MLI_LOCK_ASSERT_HELD(mli); + VERIFY(mli == inm->in6m_mli); + + query_response_timer_expired = 0; + state_change_retransmit_timer_expired = 0; + + /* + * During a transition from compatibility mode back to MLDv2, + * a group record in REPORTING state may still have its group + * timer active. This is a no-op in this function; it is easier + * to deal with it here than to complicate the slow-timeout path. + */ + if (inm->in6m_timer == 0) { + query_response_timer_expired = 0; + } else if (--inm->in6m_timer == 0) { + query_response_timer_expired = 1; + } else { + current_state_timers_running6 = 1; + } + + if (inm->in6m_sctimer == 0) { + state_change_retransmit_timer_expired = 0; + } else if (--inm->in6m_sctimer == 0) { + state_change_retransmit_timer_expired = 1; + } else { + state_change_timers_running6 = 1; + } + + /* We are in fasttimo, so be quick about it. 
*/ + if (!state_change_retransmit_timer_expired && + !query_response_timer_expired) + return; + + switch (inm->in6m_state) { + case MLD_NOT_MEMBER: + case MLD_SILENT_MEMBER: + case MLD_SLEEPING_MEMBER: + case MLD_LAZY_MEMBER: + case MLD_AWAKENING_MEMBER: + case MLD_IDLE_MEMBER: + break; + case MLD_G_QUERY_PENDING_MEMBER: + case MLD_SG_QUERY_PENDING_MEMBER: + /* + * Respond to a previously pending Group-Specific + * or Group-and-Source-Specific query by enqueueing + * the appropriate Current-State report for + * immediate transmission. + */ + if (query_response_timer_expired) { + int retval; + + retval = mld_v2_enqueue_group_record(qrq, inm, 0, 1, + (inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER), + 0); + MLD_PRINTF(("%s: enqueue record = %d\n", + __func__, retval)); + inm->in6m_state = MLD_REPORTING_MEMBER; + in6m_clear_recorded(inm); + } + /* FALLTHROUGH */ + case MLD_REPORTING_MEMBER: + case MLD_LEAVING_MEMBER: + if (state_change_retransmit_timer_expired) { + /* + * State-change retransmission timer fired. + * If there are any further pending retransmissions, + * set the global pending state-change flag, and + * reset the timer. + */ + if (--inm->in6m_scrv > 0) { + inm->in6m_sctimer = uri_fasthz; + state_change_timers_running6 = 1; + } + /* + * Retransmit the previously computed state-change + * report. If there are no further pending + * retransmissions, the mbuf queue will be consumed. + * Update T0 state to T1 as we have now sent + * a state-change. + */ + (void) mld_v2_merge_state_changes(inm, scq); + + in6m_commit(inm); + MLD_PRINTF(("%s: T1 -> T0 for %s/%s%d\n", __func__, + ip6_sprintf(&inm->in6m_addr), + inm->in6m_ifp->if_name, inm->in6m_ifp->if_unit)); + + /* + * If we are leaving the group for good, make sure + * we release MLD's reference to it. + * This release must be deferred using a SLIST, + * as we are called from a loop which traverses + * the in_ifmultiaddr TAILQ. + */ + if (inm->in6m_state == MLD_LEAVING_MEMBER && + inm->in6m_scrv == 0) { + inm->in6m_state = MLD_NOT_MEMBER; + /* + * A reference has already been held in + * mld_final_leave() for this inm, so + * no need to hold another one. We also + * bumped up its request count then, so + * that it stays in in6_multihead. Both + * of them will be released when it is + * dequeued later on. + */ + VERIFY(inm->in6m_nrelecnt != 0); + SLIST_INSERT_HEAD(&mli->mli_relinmhead, + inm, in6m_nrele); + } + } + break; + } +} + +/* + * Switch to a different version on the given interface, + * as per Section 9.12. + */ +static void +mld_set_version(struct mld_ifinfo *mli, const int mld_version) +{ + int old_version_timer; + + MLI_LOCK_ASSERT_HELD(mli); + + MLD_PRINTF(("%s: switching to v%d on ifp %p(%s%d)\n", __func__, + mld_version, mli->mli_ifp, mli->mli_ifp->if_name, + mli->mli_ifp->if_unit)); + + if (mld_version == MLD_VERSION_1) { + /* + * Compute the "Older Version Querier Present" timer as per + * Section 9.12. + */ + old_version_timer = (mli->mli_rv * mli->mli_qi) + mli->mli_qri; + old_version_timer *= PR_SLOWHZ; + mli->mli_v1_timer = old_version_timer; + } + + if (mli->mli_v1_timer > 0 && mli->mli_version != MLD_VERSION_1) { + mli->mli_version = MLD_VERSION_1; + mld_v2_cancel_link_timers(mli); + } + + MLI_LOCK_ASSERT_HELD(mli); +} + +/* + * Cancel pending MLDv2 timers for the given link and all groups + * joined on it; state-change, general-query, and group-query timers. 
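+ *
+ * (Editorial aside: the sketch below is an illustration added for this
+ * document, not part of the original change, and
+ * "sketch_old_querier_timeout" is a hypothetical name. mld_set_version()
+ * above computes the RFC 3810 section 9.12 "Older Version Querier
+ * Present" timeout as Robustness Variable * Query Interval + Query
+ * Response Interval, scaled to slow-timeout ticks; this mirrors
+ * (mli_rv * mli_qi + mli_qri) * PR_SLOWHZ.)
+ */
+#if 0 /* illustrative sketch only; not compiled */
+static int
+sketch_old_querier_timeout(int rv, int qi, int qri, int slowhz)
+{
+        return ((rv * qi + qri) * slowhz);
+}
+#endif
+/*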
+ */ +static void +mld_v2_cancel_link_timers(struct mld_ifinfo *mli) +{ + struct ifnet *ifp; + struct in6_multi *inm; + struct in6_multistep step; + + MLI_LOCK_ASSERT_HELD(mli); + + MLD_PRINTF(("%s: cancel v2 timers on ifp %p(%s%d)\n", __func__, + mli->mli_ifp, mli->mli_ifp->if_name, mli->mli_ifp->if_unit)); + + /* + * Fast-track this potentially expensive operation + * by checking all the global 'timer pending' flags. + */ + if (!interface_timers_running6 && + !state_change_timers_running6 && + !current_state_timers_running6) + return; + + mli->mli_v2_timer = 0; + ifp = mli->mli_ifp; + MLI_UNLOCK(mli); + + in6_multihead_lock_shared(); + IN6_FIRST_MULTI(step, inm); + while (inm != NULL) { + IN6M_LOCK(inm); + if (inm->in6m_ifp != ifp) + goto next; + + switch (inm->in6m_state) { + case MLD_NOT_MEMBER: + case MLD_SILENT_MEMBER: + case MLD_IDLE_MEMBER: + case MLD_LAZY_MEMBER: + case MLD_SLEEPING_MEMBER: + case MLD_AWAKENING_MEMBER: + break; + case MLD_LEAVING_MEMBER: + /* + * If we are leaving the group and switching + * version, we need to release the final + * reference held for issuing the INCLUDE {}. + * During mld_final_leave(), we bumped up both the + * request and reference counts. Since we cannot + * call in6_multi_detach() here, defer this task to + * the timer routine. + */ + VERIFY(inm->in6m_nrelecnt != 0); + MLI_LOCK(mli); + SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm, + in6m_nrele); + MLI_UNLOCK(mli); + /* FALLTHROUGH */ + case MLD_G_QUERY_PENDING_MEMBER: + case MLD_SG_QUERY_PENDING_MEMBER: + in6m_clear_recorded(inm); + /* FALLTHROUGH */ + case MLD_REPORTING_MEMBER: + inm->in6m_sctimer = 0; + inm->in6m_timer = 0; + inm->in6m_state = MLD_REPORTING_MEMBER; + /* + * Free any pending MLDv2 state-change records. + */ + IF_DRAIN(&inm->in6m_scq); + break; + } +next: + IN6M_UNLOCK(inm); + IN6_NEXT_MULTI(step, inm); + } + in6_multihead_lock_done(); + + MLI_LOCK(mli); +} + +/* + * Update the Older Version Querier Present timers for a link. + * See Section 9.12 of RFC 3810. + */ +static void +mld_v1_process_querier_timers(struct mld_ifinfo *mli) +{ + MLI_LOCK_ASSERT_HELD(mli); + + if (mli->mli_version != MLD_VERSION_2 && --mli->mli_v1_timer == 0) { + /* + * MLDv1 Querier Present timer expired; revert to MLDv2. + */ + MLD_PRINTF(("%s: transition from v%d -> v%d on %p(%s%d)\n", + __func__, mli->mli_version, MLD_VERSION_2, + mli->mli_ifp, mli->mli_ifp->if_name, mli->mli_ifp->if_unit)); + mli->mli_version = MLD_VERSION_2; + } +} + +/* + * Transmit an MLDv1 report immediately. + */ +static int +mld_v1_transmit_report(struct in6_multi *in6m, const int type) +{ + struct ifnet *ifp; + struct in6_ifaddr *ia; + struct ip6_hdr *ip6; + struct mbuf *mh, *md; + struct mld_hdr *mld; + int error = 0; + + IN6M_LOCK_ASSERT_HELD(in6m); + MLI_LOCK_ASSERT_HELD(in6m->in6m_mli); + + ifp = in6m->in6m_ifp; + /* ia may be NULL if link-local address is tentative. */ + ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); + + MGETHDR(mh, M_DONTWAIT, MT_HEADER); + if (mh == NULL) { + if (ia != NULL) + IFA_REMREF(&ia->ia_ifa); + return (ENOMEM); + } + MGET(md, M_DONTWAIT, MT_DATA); + if (md == NULL) { + m_free(mh); + if (ia != NULL) + IFA_REMREF(&ia->ia_ifa); + return (ENOMEM); + } + mh->m_next = md; + + /* + * FUTURE: Consider increasing alignment by ETHER_HDR_LEN, so + * that ether_output() does not need to allocate another mbuf + * for the header in the most common case. 
+ */
+ MH_ALIGN(mh, sizeof(struct ip6_hdr));
+ mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld_hdr);
+ mh->m_len = sizeof(struct ip6_hdr);
+
+ ip6 = mtod(mh, struct ip6_hdr *);
+ ip6->ip6_flow = 0;
+ ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
+ ip6->ip6_vfc |= IPV6_VERSION;
+ ip6->ip6_nxt = IPPROTO_ICMPV6;
+ if (ia != NULL)
+ IFA_LOCK(&ia->ia_ifa);
+ ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any;
+ if (ia != NULL) {
+ IFA_UNLOCK(&ia->ia_ifa);
+ IFA_REMREF(&ia->ia_ifa);
+ ia = NULL;
+ }
+ ip6->ip6_dst = in6m->in6m_addr;
+
+ md->m_len = sizeof(struct mld_hdr);
+ mld = mtod(md, struct mld_hdr *);
+ mld->mld_type = type;
+ mld->mld_code = 0;
+ mld->mld_cksum = 0;
+ mld->mld_maxdelay = 0;
+ mld->mld_reserved = 0;
+ mld->mld_addr = in6m->in6m_addr;
+ in6_clearscope(&mld->mld_addr);
+ mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6,
+ sizeof(struct ip6_hdr), sizeof(struct mld_hdr));
+
+ mh->m_flags |= M_MLDV1;
+
+
+ /*
+ * Because at this point we are possibly holding
+ * in6_multihead_lock in shared or exclusive mode, we can't call
+ * mld_dispatch_packet() here since that will eventually call
+ * ip6_output(), which will try to lock in6_multihead_lock and cause
+ * a deadlock.
+ * Instead we defer the work to the mld_slowtimo() thread, thus
+ * avoiding unlocking in6_multihead_lock here.
+ */
+ if (IF_QFULL(&in6m->in6m_mli->mli_v1q)) {
+ MLD_PRINTF(("%s: v1 outbound queue full\n", __func__));
+ error = ENOMEM;
+ m_freem(mh);
+ } else
+ IF_ENQUEUE(&in6m->in6m_mli->mli_v1q, mh);
+
+ return (error);
+}
+
+/*
+ * Process a state change from the upper layer for the given IPv6 group.
+ *
+ * Each socket holds a reference on the in6_multi in its own ip6_moptions.
+ * The socket layer will have made the necessary updates to the group
+ * state; it is now up to MLD to issue a state change report if there
+ * has been any change between T0 (when the last state-change was issued)
+ * and T1 (now).
+ *
+ * We use the MLDv2 state machine at group level. The MLD module,
+ * however, makes the decision as to which MLD protocol version to speak.
+ * A state change *from* INCLUDE {} always means an initial join.
+ * A state change *to* INCLUDE {} always means a final leave.
+ *
+ * If delay is non-zero, and the state change is an initial multicast
+ * join, the state change report will be delayed by 'delay' ticks
+ * in units of PR_FASTHZ if MLDv1 is active on the link; otherwise
+ * the initial MLDv2 state change report will be delayed by whichever
+ * is sooner, a pending state-change timer or delay itself.
+ */
+int
+mld_change_state(struct in6_multi *inm, const int delay)
+{
+ struct mld_ifinfo *mli;
+ struct ifnet *ifp;
+ int error = 0;
+
+ IN6M_LOCK_ASSERT_HELD(inm);
+ VERIFY(inm->in6m_mli != NULL);
+ MLI_LOCK_ASSERT_NOTHELD(inm->in6m_mli);
+
+ /*
+ * Try to detect if the upper layer just asked us to change state
+ * for an interface which has now gone away.
+ */
+ VERIFY(inm->in6m_ifma != NULL);
+ ifp = inm->in6m_ifma->ifma_ifp;
+ /*
+ * Sanity check that netinet6's notion of ifp is the same as net's.
+ */
+ VERIFY(inm->in6m_ifp == ifp);
+
+ mli = MLD_IFINFO(ifp);
+ VERIFY(mli != NULL);
+
+ /*
+ * If we detect a state transition to or from MCAST_UNDEFINED
+ * for this group, then we are starting or finishing an MLD
+ * life cycle for this group.
+ */ + if (inm->in6m_st[1].iss_fmode != inm->in6m_st[0].iss_fmode) { + MLD_PRINTF(("%s: inm transition %d -> %d\n", __func__, + inm->in6m_st[0].iss_fmode, inm->in6m_st[1].iss_fmode)); + if (inm->in6m_st[0].iss_fmode == MCAST_UNDEFINED) { + MLD_PRINTF(("%s: initial join\n", __func__)); + error = mld_initial_join(inm, mli, delay); + goto out; + } else if (inm->in6m_st[1].iss_fmode == MCAST_UNDEFINED) { + MLD_PRINTF(("%s: final leave\n", __func__)); + mld_final_leave(inm, mli); + goto out; + } + } else { + MLD_PRINTF(("%s: filter set change\n", __func__)); + } + + error = mld_handle_state_change(inm, mli); + +out: + return (error); +} + +/* + * Perform the initial join for an MLD group. + * + * When joining a group: + * If the group should have its MLD traffic suppressed, do nothing. + * MLDv1 starts sending MLDv1 host membership reports. + * MLDv2 will schedule an MLDv2 state-change report containing the + * initial state of the membership. + * + * If the delay argument is non-zero, then we must delay sending the + * initial state change for delay ticks (in units of PR_FASTHZ). + */ +static int +mld_initial_join(struct in6_multi *inm, struct mld_ifinfo *mli, + const int delay) +{ + struct ifnet *ifp; + struct ifqueue *ifq; + int error, retval, syncstates; + int odelay; + + IN6M_LOCK_ASSERT_HELD(inm); + MLI_LOCK_ASSERT_NOTHELD(mli); + + MLD_PRINTF(("%s: initial join %s on ifp %p(%s%d)\n", + __func__, ip6_sprintf(&inm->in6m_addr), + inm->in6m_ifp, inm->in6m_ifp->if_name, inm->in6m_ifp->if_unit)); + + error = 0; + syncstates = 1; + + ifp = inm->in6m_ifp; + + MLI_LOCK(mli); + VERIFY(mli->mli_ifp == ifp); + + /* + * Groups joined on loopback or marked as 'not reported', + * enter the MLD_SILENT_MEMBER state and + * are never reported in any protocol exchanges. + * All other groups enter the appropriate state machine + * for the version in use on this link. + * A link marked as MLIF_SILENT causes MLD to be completely + * disabled for the link. + */ + if ((ifp->if_flags & IFF_LOOPBACK) || + (mli->mli_flags & MLIF_SILENT) || + !mld_is_addr_reported(&inm->in6m_addr)) { + MLD_PRINTF(("%s: not kicking state machine for silent group\n", + __func__)); + inm->in6m_state = MLD_SILENT_MEMBER; + inm->in6m_timer = 0; + } else { + /* + * Deal with overlapping in6_multi lifecycle. + * If this group was LEAVING, then make sure + * we drop the reference we picked up to keep the + * group around for the final INCLUDE {} enqueue. + * Since we cannot call in6_multi_detach() here, + * defer this task to the timer routine. + */ + if (mli->mli_version == MLD_VERSION_2 && + inm->in6m_state == MLD_LEAVING_MEMBER) { + VERIFY(inm->in6m_nrelecnt != 0); + SLIST_INSERT_HEAD(&mli->mli_relinmhead, inm, + in6m_nrele); + } + + inm->in6m_state = MLD_REPORTING_MEMBER; + + switch (mli->mli_version) { + case MLD_VERSION_1: + /* + * If a delay was provided, only use it if + * it is greater than the delay normally + * used for an MLDv1 state change report, + * and delay sending the initial MLDv1 report + * by not transitioning to the IDLE state. 
+ */ + odelay = MLD_RANDOM_DELAY(MLD_V1_MAX_RI * PR_SLOWHZ); + if (delay) { + inm->in6m_timer = max(delay, odelay); + current_state_timers_running6 = 1; + } else { + inm->in6m_state = MLD_IDLE_MEMBER; + error = mld_v1_transmit_report(inm, + MLD_LISTENER_REPORT); + + IN6M_LOCK_ASSERT_HELD(inm); + MLI_LOCK_ASSERT_HELD(mli); + + if (error == 0) { + inm->in6m_timer = odelay; + current_state_timers_running6 = 1; + } + } + break; + + case MLD_VERSION_2: + /* + * Defer update of T0 to T1, until the first copy + * of the state change has been transmitted. + */ + syncstates = 0; + + /* + * Immediately enqueue a State-Change Report for + * this interface, freeing any previous reports. + * Don't kick the timers if there is nothing to do, + * or if an error occurred. + */ + ifq = &inm->in6m_scq; + IF_DRAIN(ifq); + retval = mld_v2_enqueue_group_record(ifq, inm, 1, + 0, 0, (mli->mli_flags & MLIF_USEALLOW)); + MLD_PRINTF(("%s: enqueue record = %d\n", + __func__, retval)); + if (retval <= 0) { + error = retval * -1; + break; + } + + /* + * Schedule transmission of pending state-change + * report up to RV times for this link. The timer + * will fire at the next mld_fasttimo (~200ms), + * giving us an opportunity to merge the reports. + * + * If a delay was provided to this function, only + * use this delay if sooner than the existing one. + */ + VERIFY(mli->mli_rv > 1); + inm->in6m_scrv = mli->mli_rv; + if (delay) { + if (inm->in6m_sctimer > 1) { + inm->in6m_sctimer = + min(inm->in6m_sctimer, delay); + } else + inm->in6m_sctimer = delay; + } else + inm->in6m_sctimer = 1; + state_change_timers_running6 = 1; + + error = 0; + break; + } + } + MLI_UNLOCK(mli); + + /* + * Only update the T0 state if state change is atomic, + * i.e. we don't need to wait for a timer to fire before we + * can consider the state change to have been communicated. + */ + if (syncstates) { + in6m_commit(inm); + MLD_PRINTF(("%s: T1 -> T0 for %s/%s%d\n", __func__, + ip6_sprintf(&inm->in6m_addr), + inm->in6m_ifp->if_name, ifp->if_unit)); + } + + return (error); +} + +/* + * Issue an intermediate state change during the life-cycle. + */ +static int +mld_handle_state_change(struct in6_multi *inm, struct mld_ifinfo *mli) +{ + struct ifnet *ifp; + int retval; + + IN6M_LOCK_ASSERT_HELD(inm); + MLI_LOCK_ASSERT_NOTHELD(mli); + + MLD_PRINTF(("%s: state change for %s on ifp %p(%s%d)\n", + __func__, ip6_sprintf(&inm->in6m_addr), + inm->in6m_ifp, inm->in6m_ifp->if_name, inm->in6m_ifp->if_unit)); + + ifp = inm->in6m_ifp; + + MLI_LOCK(mli); + VERIFY(mli->mli_ifp == ifp); + + if ((ifp->if_flags & IFF_LOOPBACK) || + (mli->mli_flags & MLIF_SILENT) || + !mld_is_addr_reported(&inm->in6m_addr) || + (mli->mli_version != MLD_VERSION_2)) { + MLI_UNLOCK(mli); + if (!mld_is_addr_reported(&inm->in6m_addr)) { + MLD_PRINTF(("%s: not kicking state machine for silent " + "group\n", __func__)); + } + MLD_PRINTF(("%s: nothing to do\n", __func__)); + in6m_commit(inm); + MLD_PRINTF(("%s: T1 -> T0 for %s/%s%d\n", __func__, + ip6_sprintf(&inm->in6m_addr), + inm->in6m_ifp->if_name, inm->in6m_ifp->if_unit)); + return (0); + } + + IF_DRAIN(&inm->in6m_scq); + + retval = mld_v2_enqueue_group_record(&inm->in6m_scq, inm, 1, 0, 0, + (mli->mli_flags & MLIF_USEALLOW)); + MLD_PRINTF(("%s: enqueue record = %d\n", __func__, retval)); + if (retval <= 0) { + MLI_UNLOCK(mli); + return (-retval); + } + /* + * If record(s) were enqueued, start the state-change + * report timer for this group. 
+ */ + inm->in6m_scrv = mli->mli_rv; + inm->in6m_sctimer = 1; + state_change_timers_running6 = 1; + MLI_UNLOCK(mli); + + return (0); +} + +/* + * Perform the final leave for a multicast address. + * + * When leaving a group: + * MLDv1 sends a DONE message, if and only if we are the reporter. + * MLDv2 enqueues a state-change report containing a transition + * to INCLUDE {} for immediate transmission. + */ +static void +mld_final_leave(struct in6_multi *inm, struct mld_ifinfo *mli) +{ + int syncstates = 1; + + IN6M_LOCK_ASSERT_HELD(inm); + MLI_LOCK_ASSERT_NOTHELD(mli); + + MLD_PRINTF(("%s: final leave %s on ifp %p(%s%d)\n", + __func__, ip6_sprintf(&inm->in6m_addr), + inm->in6m_ifp, inm->in6m_ifp->if_name, inm->in6m_ifp->if_unit)); + + switch (inm->in6m_state) { + case MLD_NOT_MEMBER: + case MLD_SILENT_MEMBER: + case MLD_LEAVING_MEMBER: + /* Already leaving or left; do nothing. */ + MLD_PRINTF(("%s: not kicking state machine for silent group\n", + __func__)); + break; + case MLD_REPORTING_MEMBER: + case MLD_IDLE_MEMBER: + case MLD_G_QUERY_PENDING_MEMBER: + case MLD_SG_QUERY_PENDING_MEMBER: + MLI_LOCK(mli); + if (mli->mli_version == MLD_VERSION_1) { + if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER || + inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER) { + panic("%s: MLDv2 state reached, not MLDv2 " + "mode\n", __func__); + /* NOTREACHED */ + } + mld_v1_transmit_report(inm, MLD_LISTENER_DONE); + + IN6M_LOCK_ASSERT_HELD(inm); + MLI_LOCK_ASSERT_HELD(mli); + + inm->in6m_state = MLD_NOT_MEMBER; + } else if (mli->mli_version == MLD_VERSION_2) { + /* + * Stop group timer and all pending reports. + * Immediately enqueue a state-change report + * TO_IN {} to be sent on the next fast timeout, + * giving us an opportunity to merge reports. + */ + IF_DRAIN(&inm->in6m_scq); + inm->in6m_timer = 0; + inm->in6m_scrv = mli->mli_rv; + MLD_PRINTF(("%s: Leaving %s/%s%d with %d " + "pending retransmissions.\n", __func__, + ip6_sprintf(&inm->in6m_addr), + inm->in6m_ifp->if_name, inm->in6m_ifp->if_unit, + inm->in6m_scrv)); + if (inm->in6m_scrv == 0) { + inm->in6m_state = MLD_NOT_MEMBER; + inm->in6m_sctimer = 0; + } else { + int retval; + /* + * Stick around in the in6_multihead list; + * the final detach will be issued by + * mld_v2_process_group_timers() when + * the retransmit timer expires. + */ + IN6M_ADDREF_LOCKED(inm); + VERIFY(inm->in6m_debug & IFD_ATTACHED); + inm->in6m_reqcnt++; + VERIFY(inm->in6m_reqcnt >= 1); + inm->in6m_nrelecnt++; + VERIFY(inm->in6m_nrelecnt != 0); + + retval = mld_v2_enqueue_group_record( + &inm->in6m_scq, inm, 1, 0, 0, + (mli->mli_flags & MLIF_USEALLOW)); + KASSERT(retval != 0, + ("%s: enqueue record = %d\n", __func__, + retval)); + + inm->in6m_state = MLD_LEAVING_MEMBER; + inm->in6m_sctimer = 1; + state_change_timers_running6 = 1; + syncstates = 0; + } + } + MLI_UNLOCK(mli); + break; + case MLD_LAZY_MEMBER: + case MLD_SLEEPING_MEMBER: + case MLD_AWAKENING_MEMBER: + /* Our reports are suppressed; do nothing. */ + break; + } + + if (syncstates) { + in6m_commit(inm); + MLD_PRINTF(("%s: T1 -> T0 for %s/%s%d\n", __func__, + ip6_sprintf(&inm->in6m_addr), + inm->in6m_ifp->if_name, inm->in6m_ifp->if_unit)); + inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED; + MLD_PRINTF(("%s: T1 now MCAST_UNDEFINED for %p/%s%d\n", + __func__, &inm->in6m_addr, inm->in6m_ifp->if_name, + inm->in6m_ifp->if_unit)); + } +} + +/* + * Enqueue an MLDv2 group record to the given output queue. + * + * If is_state_change is zero, a current-state record is appended. 
+ * If is_state_change is non-zero, a state-change report is appended. + * + * If is_group_query is non-zero, an mbuf packet chain is allocated. + * If is_group_query is zero, and if there is a packet with free space + * at the tail of the queue, it will be appended to providing there + * is enough free space. + * Otherwise a new mbuf packet chain is allocated. + * + * If is_source_query is non-zero, each source is checked to see if + * it was recorded for a Group-Source query, and will be omitted if + * it is not both in-mode and recorded. + * + * If use_block_allow is non-zero, state change reports for initial join + * and final leave, on an inclusive mode group with a source list, will be + * rewritten to use the ALLOW_NEW and BLOCK_OLD record types, respectively. + * + * The function will attempt to allocate leading space in the packet + * for the IPv6+ICMP headers to be prepended without fragmenting the chain. + * + * If successful the size of all data appended to the queue is returned, + * otherwise an error code less than zero is returned, or zero if + * no record(s) were appended. + */ +static int +mld_v2_enqueue_group_record(struct ifqueue *ifq, struct in6_multi *inm, + const int is_state_change, const int is_group_query, + const int is_source_query, const int use_block_allow) +{ + struct mldv2_record mr; + struct mldv2_record *pmr; + struct ifnet *ifp; + struct ip6_msource *ims, *nims; + struct mbuf *m0, *m, *md; + int error, is_filter_list_change; + int minrec0len, m0srcs, msrcs, nbytes, off; + int record_has_sources; + int now; + int type; + uint8_t mode; + + IN6M_LOCK_ASSERT_HELD(inm); + MLI_LOCK_ASSERT_HELD(inm->in6m_mli); + + error = 0; + ifp = inm->in6m_ifp; + is_filter_list_change = 0; + m = NULL; + m0 = NULL; + m0srcs = 0; + msrcs = 0; + nbytes = 0; + nims = NULL; + record_has_sources = 1; + pmr = NULL; + type = MLD_DO_NOTHING; + mode = inm->in6m_st[1].iss_fmode; + + /* + * If we did not transition out of ASM mode during t0->t1, + * and there are no source nodes to process, we can skip + * the generation of source records. + */ + if (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0 && + inm->in6m_nsrc == 0) + record_has_sources = 0; + + if (is_state_change) { + /* + * Queue a state change record. + * If the mode did not change, and there are non-ASM + * listeners or source filters present, + * we potentially need to issue two records for the group. + * If there are ASM listeners, and there was no filter + * mode transition of any kind, do nothing. + * + * If we are transitioning to MCAST_UNDEFINED, we need + * not send any sources. A transition to/from this state is + * considered inclusive with some special treatment. + * + * If we are rewriting initial joins/leaves to use + * ALLOW/BLOCK, and the group's membership is inclusive, + * we need to send sources in all cases. + */ + if (mode != inm->in6m_st[0].iss_fmode) { + if (mode == MCAST_EXCLUDE) { + MLD_PRINTF(("%s: change to EXCLUDE\n", + __func__)); + type = MLD_CHANGE_TO_EXCLUDE_MODE; + } else { + MLD_PRINTF(("%s: change to INCLUDE\n", + __func__)); + if (use_block_allow) { + /* + * XXX + * Here we're interested in state + * edges either direction between + * MCAST_UNDEFINED and MCAST_INCLUDE. + * Perhaps we should just check + * the group state, rather than + * the filter mode. 
+ */ + if (mode == MCAST_UNDEFINED) { + type = MLD_BLOCK_OLD_SOURCES; + } else { + type = MLD_ALLOW_NEW_SOURCES; + } + } else { + type = MLD_CHANGE_TO_INCLUDE_MODE; + if (mode == MCAST_UNDEFINED) + record_has_sources = 0; + } + } + } else { + if (record_has_sources) { + is_filter_list_change = 1; + } else { + type = MLD_DO_NOTHING; + } + } + } else { + /* + * Queue a current state record. + */ + if (mode == MCAST_EXCLUDE) { + type = MLD_MODE_IS_EXCLUDE; + } else if (mode == MCAST_INCLUDE) { + type = MLD_MODE_IS_INCLUDE; + VERIFY(inm->in6m_st[1].iss_asm == 0); + } + } + + /* + * Generate the filter list changes using a separate function. + */ + if (is_filter_list_change) + return (mld_v2_enqueue_filter_change(ifq, inm)); + + if (type == MLD_DO_NOTHING) { + MLD_PRINTF(("%s: nothing to do for %s/%s%d\n", + __func__, ip6_sprintf(&inm->in6m_addr), + inm->in6m_ifp->if_name, inm->in6m_ifp->if_unit)); + return (0); + } + + /* + * If any sources are present, we must be able to fit at least + * one in the trailing space of the tail packet's mbuf, + * ideally more. + */ + minrec0len = sizeof(struct mldv2_record); + if (record_has_sources) + minrec0len += sizeof(struct in6_addr); + MLD_PRINTF(("%s: queueing %s for %s/%s%d\n", __func__, + mld_rec_type_to_str(type), + ip6_sprintf(&inm->in6m_addr), + inm->in6m_ifp->if_name, inm->in6m_ifp->if_unit)); + + /* + * Check if we have a packet in the tail of the queue for this + * group into which the first group record for this group will fit. + * Otherwise allocate a new packet. + * Always allocate leading space for IP6+RA+ICMPV6+REPORT. + * Note: Group records for G/GSR query responses MUST be sent + * in their own packet. + */ + m0 = ifq->ifq_tail; + if (!is_group_query && + m0 != NULL && + (m0->m_pkthdr.vt_nrecs + 1 <= MLD_V2_REPORT_MAXRECS) && + (m0->m_pkthdr.len + minrec0len) < + (ifp->if_mtu - MLD_MTUSPACE)) { + m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - + sizeof(struct mldv2_record)) / + sizeof(struct in6_addr); + m = m0; + MLD_PRINTF(("%s: use existing packet\n", __func__)); + } else { + if (IF_QFULL(ifq)) { + MLD_PRINTF(("%s: outbound queue full\n", __func__)); + return (-ENOMEM); + } + m = NULL; + m0srcs = (ifp->if_mtu - MLD_MTUSPACE - + sizeof(struct mldv2_record)) / sizeof(struct in6_addr); + if (!is_state_change && !is_group_query) + m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + if (m == NULL) + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m == NULL) + return (-ENOMEM); + + MLD_PRINTF(("%s: allocated first packet\n", __func__)); + } + + /* + * Append group record. + * If we have sources, we don't know how many yet. + */ + mr.mr_type = type; + mr.mr_datalen = 0; + mr.mr_numsrc = 0; + mr.mr_addr = inm->in6m_addr; + in6_clearscope(&mr.mr_addr); + if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) { + if (m != m0) + m_freem(m); + MLD_PRINTF(("%s: m_append() failed.\n", __func__)); + return (-ENOMEM); + } + nbytes += sizeof(struct mldv2_record); + + /* + * Append as many sources as will fit in the first packet. + * If we are appending to a new packet, the chain allocation + * may potentially use clusters; use m_getptr() in this case. + * If we are appending to an existing packet, we need to obtain + * a pointer to the group record after m_append(), in case a new + * mbuf was allocated. + * + * Only append sources which are in-mode at t1. If we are + * transitioning to MCAST_UNDEFINED state on the group, and + * use_block_allow is zero, do not include source entries. + * Otherwise, we need to include this source in the report. 
+ * + * Only report recorded sources in our filter set when responding + * to a group-source query. + */ + if (record_has_sources) { + if (m == m0) { + md = m_last(m); + pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + + md->m_len - nbytes); + } else { + md = m_getptr(m, 0, &off); + pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + + off); + } + msrcs = 0; + RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs, + nims) { + MLD_PRINTF(("%s: visit node %s\n", __func__, + ip6_sprintf(&ims->im6s_addr))); + now = im6s_get_mode(inm, ims, 1); + MLD_PRINTF(("%s: node is %d\n", __func__, now)); + if ((now != mode) || + (now == mode && + (!use_block_allow && mode == MCAST_UNDEFINED))) { + MLD_PRINTF(("%s: skip node\n", __func__)); + continue; + } + if (is_source_query && ims->im6s_stp == 0) { + MLD_PRINTF(("%s: skip unrecorded node\n", + __func__)); + continue; + } + MLD_PRINTF(("%s: append node\n", __func__)); + if (!m_append(m, sizeof(struct in6_addr), + (void *)&ims->im6s_addr)) { + if (m != m0) + m_freem(m); + MLD_PRINTF(("%s: m_append() failed.\n", + __func__)); + return (-ENOMEM); + } + nbytes += sizeof(struct in6_addr); + ++msrcs; + if (msrcs == m0srcs) + break; + } + MLD_PRINTF(("%s: msrcs is %d this packet\n", __func__, + msrcs)); + pmr->mr_numsrc = htons(msrcs); + nbytes += (msrcs * sizeof(struct in6_addr)); + } + + if (is_source_query && msrcs == 0) { + MLD_PRINTF(("%s: no recorded sources to report\n", __func__)); + if (m != m0) + m_freem(m); + return (0); + } + + /* + * We are good to go with first packet. + */ + if (m != m0) { + MLD_PRINTF(("%s: enqueueing first packet\n", __func__)); + m->m_pkthdr.vt_nrecs = 1; + m->m_pkthdr.rcvif = ifp; + IF_ENQUEUE(ifq, m); + } else { + m->m_pkthdr.vt_nrecs++; + } + /* + * No further work needed if no source list in packet(s). + */ + if (!record_has_sources) + return (nbytes); + + /* + * Whilst sources remain to be announced, we need to allocate + * a new packet and fill out as many sources as will fit. + * Always try for a cluster first. 
+ */ + while (nims != NULL) { + if (IF_QFULL(ifq)) { + MLD_PRINTF(("%s: outbound queue full\n", __func__)); + return (-ENOMEM); + } + m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + if (m == NULL) + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m == NULL) + return (-ENOMEM); + md = m_getptr(m, 0, &off); + pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off); + MLD_PRINTF(("%s: allocated next packet\n", __func__)); + + if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) { + if (m != m0) + m_freem(m); + MLD_PRINTF(("%s: m_append() failed.\n", __func__)); + return (-ENOMEM); + } + m->m_pkthdr.vt_nrecs = 1; + nbytes += sizeof(struct mldv2_record); + + m0srcs = (ifp->if_mtu - MLD_MTUSPACE - + sizeof(struct mldv2_record)) / sizeof(struct in6_addr); + + msrcs = 0; + RB_FOREACH_FROM(ims, ip6_msource_tree, nims) { + MLD_PRINTF(("%s: visit node %s\n", + __func__, ip6_sprintf(&ims->im6s_addr))); + now = im6s_get_mode(inm, ims, 1); + if ((now != mode) || + (now == mode && + (!use_block_allow && mode == MCAST_UNDEFINED))) { + MLD_PRINTF(("%s: skip node\n", __func__)); + continue; + } + if (is_source_query && ims->im6s_stp == 0) { + MLD_PRINTF(("%s: skip unrecorded node\n", + __func__)); + continue; + } + MLD_PRINTF(("%s: append node\n", __func__)); + if (!m_append(m, sizeof(struct in6_addr), + (void *)&ims->im6s_addr)) { + if (m != m0) + m_freem(m); + MLD_PRINTF(("%s: m_append() failed.\n", + __func__)); + return (-ENOMEM); + } + ++msrcs; + if (msrcs == m0srcs) + break; + } + pmr->mr_numsrc = htons(msrcs); + nbytes += (msrcs * sizeof(struct in6_addr)); + + MLD_PRINTF(("%s: enqueueing next packet\n", __func__)); + m->m_pkthdr.rcvif = ifp; + IF_ENQUEUE(ifq, m); + } + + return (nbytes); +} + +/* + * Type used to mark record pass completion. + * We exploit the fact we can cast to this easily from the + * current filter modes on each ip_msource node. + */ +typedef enum { + REC_NONE = 0x00, /* MCAST_UNDEFINED */ + REC_ALLOW = 0x01, /* MCAST_INCLUDE */ + REC_BLOCK = 0x02, /* MCAST_EXCLUDE */ + REC_FULL = REC_ALLOW | REC_BLOCK +} rectype_t; + +/* + * Enqueue an MLDv2 filter list change to the given output queue. + * + * Source list filter state is held in an RB-tree. When the filter list + * for a group is changed without changing its mode, we need to compute + * the deltas between T0 and T1 for each source in the filter set, + * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records. + * + * As we may potentially queue two record types, and the entire R-B tree + * needs to be walked at once, we break this out into its own function + * so we can generate a tightly packed queue of packets. + * + * XXX This could be written to only use one tree walk, although that makes + * serializing into the mbuf chains a bit harder. For now we do two walks + * which makes things easier on us, and it may or may not be harder on + * the L2 cache. + * + * If successful the size of all data appended to the queue is returned, + * otherwise an error code less than zero is returned, or zero if + * no record(s) were appended. 
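+ *
+ * (Editorial aside: the sketch below is an illustration added for this
+ * document, not part of the original change, and "sketch_node_rectype"
+ * is a hypothetical name. The per-node rule applied in the walk below:
+ * a source's mode at t1 selects the record type it is reported in, and
+ * a source that went to UNDEFINED at t1 is reported in the record type
+ * opposite to the group's t1 filter mode. With the rectype_t encoding
+ * above (NONE 0x00, ALLOW 0x01, BLOCK 0x02):)
+ */
+#if 0 /* illustrative sketch only; not compiled */
+static int
+sketch_node_rectype(int now, int group_mode)
+{
+        int nrt = now;                  /* REC_ALLOW or REC_BLOCK as-is */
+
+        if (nrt == 0x00)                /* REC_NONE: source gone at t1 */
+                nrt = ~group_mode & 0x03;       /* the opposite list */
+        return (nrt);
+}
+#endif
+/*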
+ */ +static int +mld_v2_enqueue_filter_change(struct ifqueue *ifq, struct in6_multi *inm) +{ + static const int MINRECLEN = + sizeof(struct mldv2_record) + sizeof(struct in6_addr); + struct ifnet *ifp; + struct mldv2_record mr; + struct mldv2_record *pmr; + struct ip6_msource *ims, *nims; + struct mbuf *m, *m0, *md; + int m0srcs, nbytes, npbytes, off, rsrcs, schanged; + int nallow, nblock; + uint8_t mode, now, then; + rectype_t crt, drt, nrt; + + IN6M_LOCK_ASSERT_HELD(inm); + + if (inm->in6m_nsrc == 0 || + (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0)) + return (0); + + ifp = inm->in6m_ifp; /* interface */ + mode = inm->in6m_st[1].iss_fmode; /* filter mode at t1 */ + crt = REC_NONE; /* current group record type */ + drt = REC_NONE; /* mask of completed group record types */ + nrt = REC_NONE; /* record type for current node */ + m0srcs = 0; /* # source which will fit in current mbuf chain */ + npbytes = 0; /* # of bytes appended this packet */ + nbytes = 0; /* # of bytes appended to group's state-change queue */ + rsrcs = 0; /* # sources encoded in current record */ + schanged = 0; /* # nodes encoded in overall filter change */ + nallow = 0; /* # of source entries in ALLOW_NEW */ + nblock = 0; /* # of source entries in BLOCK_OLD */ + nims = NULL; /* next tree node pointer */ + + /* + * For each possible filter record mode. + * The first kind of source we encounter tells us which + * is the first kind of record we start appending. + * If a node transitioned to UNDEFINED at t1, its mode is treated + * as the inverse of the group's filter mode. + */ + while (drt != REC_FULL) { + do { + m0 = ifq->ifq_tail; + if (m0 != NULL && + (m0->m_pkthdr.vt_nrecs + 1 <= + MLD_V2_REPORT_MAXRECS) && + (m0->m_pkthdr.len + MINRECLEN) < + (ifp->if_mtu - MLD_MTUSPACE)) { + m = m0; + m0srcs = (ifp->if_mtu - m0->m_pkthdr.len - + sizeof(struct mldv2_record)) / + sizeof(struct in6_addr); + MLD_PRINTF(("%s: use previous packet\n", + __func__)); + } else { + m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR); + if (m == NULL) + m = m_gethdr(M_DONTWAIT, MT_DATA); + if (m == NULL) { + MLD_PRINTF(("%s: m_get*() failed\n", + __func__)); + return (-ENOMEM); + } + m->m_pkthdr.vt_nrecs = 0; + m0srcs = (ifp->if_mtu - MLD_MTUSPACE - + sizeof(struct mldv2_record)) / + sizeof(struct in6_addr); + npbytes = 0; + MLD_PRINTF(("%s: allocated new packet\n", + __func__)); + } + /* + * Append the MLD group record header to the + * current packet's data area. + * Recalculate pointer to free space for next + * group record, in case m_append() allocated + * a new mbuf or cluster. + */ + memset(&mr, 0, sizeof(mr)); + mr.mr_addr = inm->in6m_addr; + in6_clearscope(&mr.mr_addr); + if (!m_append(m, sizeof(mr), (void *)&mr)) { + if (m != m0) + m_freem(m); + MLD_PRINTF(("%s: m_append() failed\n", + __func__)); + return (-ENOMEM); + } + npbytes += sizeof(struct mldv2_record); + if (m != m0) { + /* new packet; offset in chain */ + md = m_getptr(m, npbytes - + sizeof(struct mldv2_record), &off); + pmr = (struct mldv2_record *)(mtod(md, + uint8_t *) + off); + } else { + /* current packet; offset from last append */ + md = m_last(m); + pmr = (struct mldv2_record *)(mtod(md, + uint8_t *) + md->m_len - + sizeof(struct mldv2_record)); + } + /* + * Begin walking the tree for this record type + * pass, or continue from where we left off + * previously if we had to allocate a new packet. + * Only report deltas in-mode at t1. 
+ * We need not report included sources as allowed
+ * if we are in inclusive mode on the group,
+ * however the converse is not true.
+ */
+ rsrcs = 0;
+ if (nims == NULL) {
+ nims = RB_MIN(ip6_msource_tree,
+ &inm->in6m_srcs);
+ }
+ RB_FOREACH_FROM(ims, ip6_msource_tree, nims) {
+ MLD_PRINTF(("%s: visit node %s\n", __func__,
+ ip6_sprintf(&ims->im6s_addr)));
+ now = im6s_get_mode(inm, ims, 1);
+ then = im6s_get_mode(inm, ims, 0);
+ MLD_PRINTF(("%s: mode: t0 %d, t1 %d\n",
+ __func__, then, now));
+ if (now == then) {
+ MLD_PRINTF(("%s: skip unchanged\n",
+ __func__));
+ continue;
+ }
+ if (mode == MCAST_EXCLUDE &&
+ now == MCAST_INCLUDE) {
+ MLD_PRINTF(("%s: skip IN src on EX "
+ "group\n", __func__));
+ continue;
+ }
+ nrt = (rectype_t)now;
+ if (nrt == REC_NONE)
+ nrt = (rectype_t)(~mode & REC_FULL);
+ if (schanged++ == 0) {
+ crt = nrt;
+ } else if (crt != nrt)
+ continue;
+ if (!m_append(m, sizeof(struct in6_addr),
+ (void *)&ims->im6s_addr)) {
+ if (m != m0)
+ m_freem(m);
+ MLD_PRINTF(("%s: m_append() failed\n",
+ __func__));
+ return (-ENOMEM);
+ }
+ nallow += !!(crt == REC_ALLOW);
+ nblock += !!(crt == REC_BLOCK);
+ if (++rsrcs == m0srcs)
+ break;
+ }
+ /*
+ * If we did not append any tree nodes on this
+ * pass, back out of allocations.
+ */
+ if (rsrcs == 0) {
+ npbytes -= sizeof(struct mldv2_record);
+ if (m != m0) {
+ MLD_PRINTF(("%s: m_free(m)\n",
+ __func__));
+ m_freem(m);
+ } else {
+ MLD_PRINTF(("%s: m_adj(m, -mr)\n",
+ __func__));
+ m_adj(m, -((int)sizeof(
+ struct mldv2_record)));
+ }
+ continue;
+ }
+ npbytes += (rsrcs * sizeof(struct in6_addr));
+ if (crt == REC_ALLOW)
+ pmr->mr_type = MLD_ALLOW_NEW_SOURCES;
+ else if (crt == REC_BLOCK)
+ pmr->mr_type = MLD_BLOCK_OLD_SOURCES;
+ pmr->mr_numsrc = htons(rsrcs);
+ /*
+ * Count the new group record, and enqueue this
+ * packet if it wasn't already queued.
+ */
+ m->m_pkthdr.vt_nrecs++;
+ m->m_pkthdr.rcvif = ifp;
+ if (m != m0)
+ IF_ENQUEUE(ifq, m);
+ nbytes += npbytes;
+ } while (nims != NULL);
+ drt |= crt;
+ crt = (~crt & REC_FULL);
+ }
+
+ MLD_PRINTF(("%s: queued %d ALLOW_NEW, %d BLOCK_OLD\n", __func__,
+ nallow, nblock));
+
+ return (nbytes);
+}
+
+static int
+mld_v2_merge_state_changes(struct in6_multi *inm, struct ifqueue *ifscq)
+{
+ struct ifqueue *gq;
+ struct mbuf *m; /* pending state-change */
+ struct mbuf *m0; /* copy of pending state-change */
+ struct mbuf *mt; /* last state-change in packet */
+ struct mbuf *n;
+ int docopy, domerge;
+ u_int recslen;
+
+ IN6M_LOCK_ASSERT_HELD(inm);
+
+ docopy = 0;
+ domerge = 0;
+ recslen = 0;
+
+ /*
+ * If there are further pending retransmissions, make a writable
+ * copy of each queued state-change message before merging.
+ */
+ if (inm->in6m_scrv > 0)
+ docopy = 1;
+
+ gq = &inm->in6m_scq;
+#ifdef MLD_DEBUG
+ if (gq->ifq_head == NULL) {
+ MLD_PRINTF(("%s: WARNING: queue for inm %p is empty\n",
+ __func__, inm));
+ }
+#endif
+
+ /*
+ * Use IF_REMQUEUE() instead of IF_DEQUEUE() below, since the
+ * packet might not always be at the head of the ifqueue.
+ */
+ m = gq->ifq_head;
+ while (m != NULL) {
+ /*
+ * Only merge the report into the current packet if
+ * there is sufficient space to do so; an MLDv2 report
+ * packet may only contain 65,535 group records.
+ * Always use a simple mbuf chain concatenation to do this,
+ * as large state changes for single groups may have
+ * allocated clusters.
+ */
+ domerge = 0;
+ mt = ifscq->ifq_tail;
+ if (mt != NULL) {
+ recslen = m_length(m);
+
+ if ((mt->m_pkthdr.vt_nrecs +
+ m->m_pkthdr.vt_nrecs <=
+ MLD_V2_REPORT_MAXRECS) &&
+ (mt->m_pkthdr.len + recslen <=
+ (inm->in6m_ifp->if_mtu - MLD_MTUSPACE)))
+ domerge = 1;
+ }
+
+ if (!domerge && IF_QFULL(gq)) {
+ MLD_PRINTF(("%s: outbound queue full, skipping whole "
+ "packet %p\n", __func__, m));
+ n = m->m_nextpkt;
+ if (!docopy) {
+ IF_REMQUEUE(gq, m);
+ m_freem(m);
+ }
+ m = n;
+ continue;
+ }
+
+ if (!docopy) {
+ MLD_PRINTF(("%s: dequeueing %p\n", __func__, m));
+ n = m->m_nextpkt;
+ IF_REMQUEUE(gq, m);
+ m0 = m;
+ m = n;
+ } else {
+ MLD_PRINTF(("%s: copying %p\n", __func__, m));
+ m0 = m_dup(m, M_NOWAIT);
+ if (m0 == NULL)
+ return (ENOMEM);
+ m0->m_nextpkt = NULL;
+ m = m->m_nextpkt;
+ }
+
+ if (!domerge) {
+ MLD_PRINTF(("%s: queueing %p to ifscq %p\n",
+ __func__, m0, ifscq));
+ m0->m_pkthdr.rcvif = inm->in6m_ifp;
+ IF_ENQUEUE(ifscq, m0);
+ } else {
+ struct mbuf *mtl; /* last mbuf of packet mt */
+
+ MLD_PRINTF(("%s: merging %p with ifscq tail %p\n",
+ __func__, m0, mt));
+
+ mtl = m_last(mt);
+ m0->m_flags &= ~M_PKTHDR;
+ mt->m_pkthdr.len += recslen;
+ mt->m_pkthdr.vt_nrecs +=
+ m0->m_pkthdr.vt_nrecs;
+
+ mtl->m_next = m0;
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Respond to a pending MLDv2 General Query.
+ */
+static void
+mld_v2_dispatch_general_query(struct mld_ifinfo *mli)
+{
+ struct ifnet *ifp;
+ struct in6_multi *inm;
+ struct in6_multistep step;
+ int retval;
+
+ MLI_LOCK_ASSERT_HELD(mli);
+
+ VERIFY(mli->mli_version == MLD_VERSION_2);
+
+ ifp = mli->mli_ifp;
+ MLI_UNLOCK(mli);
+
+ in6_multihead_lock_shared();
+ IN6_FIRST_MULTI(step, inm);
+ while (inm != NULL) {
+ IN6M_LOCK(inm);
+ if (inm->in6m_ifp != ifp)
+ goto next;
+
+ switch (inm->in6m_state) {
+ case MLD_NOT_MEMBER:
+ case MLD_SILENT_MEMBER:
+ break;
+ case MLD_REPORTING_MEMBER:
+ case MLD_IDLE_MEMBER:
+ case MLD_LAZY_MEMBER:
+ case MLD_SLEEPING_MEMBER:
+ case MLD_AWAKENING_MEMBER:
+ inm->in6m_state = MLD_REPORTING_MEMBER;
+ MLI_LOCK(mli);
+ retval = mld_v2_enqueue_group_record(&mli->mli_gq,
+ inm, 0, 0, 0, 0);
+ MLI_UNLOCK(mli);
+ MLD_PRINTF(("%s: enqueue record = %d\n",
+ __func__, retval));
+ break;
+ case MLD_G_QUERY_PENDING_MEMBER:
+ case MLD_SG_QUERY_PENDING_MEMBER:
+ case MLD_LEAVING_MEMBER:
+ break;
+ }
+next:
+ IN6M_UNLOCK(inm);
+ IN6_NEXT_MULTI(step, inm);
+ }
+ in6_multihead_lock_done();
+
+ MLI_LOCK(mli);
+ mld_dispatch_queue(mli, &mli->mli_gq, MLD_MAX_RESPONSE_BURST);
+ MLI_LOCK_ASSERT_HELD(mli);
+
+ /*
+ * Slew transmission of bursts over 500ms intervals.
+ */
+ if (mli->mli_gq.ifq_head != NULL) {
+ mli->mli_v2_timer = 1 + MLD_RANDOM_DELAY(
+ MLD_RESPONSE_BURST_INTERVAL);
+ interface_timers_running6 = 1;
+ }
+}
+
+/*
+ * Transmit the next pending message in the output queue.
+ *
+ * Must not be called with in6m_lock or mli_lock held.
+ */
+static void
+mld_dispatch_packet(struct mbuf *m)
+{
+ struct ip6_moptions *im6o;
+ struct ifnet *ifp;
+ struct ifnet *oifp = NULL;
+ struct mbuf *m0;
+ struct mbuf *md;
+ struct ip6_hdr *ip6;
+ struct mld_hdr *mld;
+ int error;
+ int off;
+ int type;
+
+ MLD_PRINTF(("%s: transmit %p\n", __func__, m));
+
+ /*
+ * Check if the ifnet is still attached.
+ */ + ifp = m->m_pkthdr.rcvif; + if (ifp == NULL || !ifnet_is_attached(ifp, 0)) { + MLD_PRINTF(("%s: dropped %p as ifindex %u went away.\n", + __func__, m, (u_int)if_index)); + m_freem(m); + ip6stat.ip6s_noroute++; + return; + } + + im6o = ip6_allocmoptions(M_WAITOK); + if (im6o == NULL) { + m_freem(m); + return; + } + + im6o->im6o_multicast_hlim = 1; +#if MROUTING + im6o->im6o_multicast_loop = (ip6_mrouter != NULL); +#else + im6o->im6o_multicast_loop = 0; +#endif + im6o->im6o_multicast_ifp = ifp; + + if (m->m_flags & M_MLDV1) { + m0 = m; + } else { + m0 = mld_v2_encap_report(ifp, m); + if (m0 == NULL) { + MLD_PRINTF(("%s: dropped %p\n", __func__, m)); + /* + * mld_v2_encap_report() has already freed our mbuf. + */ + IM6O_REMREF(im6o); + ip6stat.ip6s_odropped++; + return; + } + } + + m->m_flags &= ~(M_PROTOFLAGS); + m0->m_pkthdr.rcvif = lo_ifp; + + ip6 = mtod(m0, struct ip6_hdr *); +#if 0 + (void) in6_setscope(&ip6->ip6_dst, ifp, NULL); /* XXX LOR */ +#else + /* + * XXX XXX Break some KPI rules to prevent an LOR which would + * occur if we called in6_setscope() at transmission. + * See comments at top of file. + */ + MLD_EMBEDSCOPE(&ip6->ip6_dst, ifp->if_index); +#endif + + /* + * Retrieve the ICMPv6 type before handoff to ip6_output(), + * so we can bump the stats. + */ + md = m_getptr(m0, sizeof(struct ip6_hdr), &off); + mld = (struct mld_hdr *)(mtod(md, uint8_t *) + off); + type = mld->mld_type; + + error = ip6_output(m0, &mld_po, NULL, IPV6_UNSPECSRC, im6o, + &oifp, NULL); + + IM6O_REMREF(im6o); + + if (error) { + MLD_PRINTF(("%s: ip6_output(%p) = %d\n", __func__, m0, error)); + if (oifp != NULL) + ifnet_release(oifp); + return; + } + + icmp6stat.icp6s_outhist[type]++; + if (oifp != NULL) { + icmp6_ifstat_inc(oifp, ifs6_out_msg); + switch (type) { + case MLD_LISTENER_REPORT: + case MLDV2_LISTENER_REPORT: + icmp6_ifstat_inc(oifp, ifs6_out_mldreport); + break; + case MLD_LISTENER_DONE: + icmp6_ifstat_inc(oifp, ifs6_out_mlddone); + break; + } + ifnet_release(oifp); + } +} + +/* + * Encapsulate an MLDv2 report. + * + * KAME IPv6 requires that hop-by-hop options be passed separately, + * and that the IPv6 header be prepended in a separate mbuf. + * + * Returns a pointer to the new mbuf chain head, or NULL if the + * allocation failed. + */ +static struct mbuf * +mld_v2_encap_report(struct ifnet *ifp, struct mbuf *m) +{ + struct mbuf *mh; + struct mldv2_report *mld; + struct ip6_hdr *ip6; + struct in6_ifaddr *ia; + int mldreclen; + + VERIFY(m->m_flags & M_PKTHDR); + + /* + * RFC3590: OK to send as :: or tentative during DAD. + */ + ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST); + if (ia == NULL) + MLD_PRINTF(("%s: warning: ia is NULL\n", __func__)); + + MGETHDR(mh, M_DONTWAIT, MT_HEADER); + if (mh == NULL) { + if (ia != NULL) + IFA_REMREF(&ia->ia_ifa); + m_freem(m); + return (NULL); + } + MH_ALIGN(mh, sizeof(struct ip6_hdr) + sizeof(struct mldv2_report)); + + mldreclen = m_length(m); + MLD_PRINTF(("%s: mldreclen is %d\n", __func__, mldreclen)); + + mh->m_len = sizeof(struct ip6_hdr) + sizeof(struct mldv2_report); + mh->m_pkthdr.len = sizeof(struct ip6_hdr) + + sizeof(struct mldv2_report) + mldreclen; + + ip6 = mtod(mh, struct ip6_hdr *); + ip6->ip6_flow = 0; + ip6->ip6_vfc &= ~IPV6_VERSION_MASK; + ip6->ip6_vfc |= IPV6_VERSION; + ip6->ip6_nxt = IPPROTO_ICMPV6; + if (ia != NULL) + IFA_LOCK(&ia->ia_ifa); + ip6->ip6_src = ia ? 
ia->ia_addr.sin6_addr : in6addr_any;
+ if (ia != NULL) {
+ IFA_UNLOCK(&ia->ia_ifa);
+ IFA_REMREF(&ia->ia_ifa);
+ ia = NULL;
+ }
+ ip6->ip6_dst = in6addr_linklocal_allv2routers;
+ /* scope ID will be set in netisr */
+
+ mld = (struct mldv2_report *)(ip6 + 1);
+ mld->mld_type = MLDV2_LISTENER_REPORT;
+ mld->mld_code = 0;
+ mld->mld_cksum = 0;
+ mld->mld_v2_reserved = 0;
+ mld->mld_v2_numrecs = htons(m->m_pkthdr.vt_nrecs);
+ m->m_pkthdr.vt_nrecs = 0;
+ m->m_flags &= ~M_PKTHDR;
+
+ mh->m_next = m;
+ mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6,
+ sizeof(struct ip6_hdr), sizeof(struct mldv2_report) + mldreclen);
+ return (mh);
+}
+
+#ifdef MLD_DEBUG
+static const char *
+mld_rec_type_to_str(const int type)
+{
+ switch (type) {
+ case MLD_CHANGE_TO_EXCLUDE_MODE:
+ return "TO_EX";
+ break;
+ case MLD_CHANGE_TO_INCLUDE_MODE:
+ return "TO_IN";
+ break;
+ case MLD_MODE_IS_EXCLUDE:
+ return "MODE_EX";
+ break;
+ case MLD_MODE_IS_INCLUDE:
+ return "MODE_IN";
+ break;
+ case MLD_ALLOW_NEW_SOURCES:
+ return "ALLOW_NEW";
+ break;
+ case MLD_BLOCK_OLD_SOURCES:
+ return "BLOCK_OLD";
+ break;
+ default:
+ break;
+ }
+ return "unknown";
+}
+#endif
+
+void
+mld_init(void)
+{
+
+ MLD_PRINTF(("%s: initializing\n", __func__));
+
+ /* Setup lock group and attribute for mld_mtx */
+ mld_mtx_grp_attr = lck_grp_attr_alloc_init();
+ mld_mtx_grp = lck_grp_alloc_init("mld_mtx", mld_mtx_grp_attr);
+ mld_mtx_attr = lck_attr_alloc_init();
+ lck_mtx_init(&mld_mtx, mld_mtx_grp, mld_mtx_attr);
+
+ ip6_initpktopts(&mld_po);
+ mld_po.ip6po_hlim = 1;
+ mld_po.ip6po_hbh = &mld_ra.hbh;
+ mld_po.ip6po_prefer_tempaddr = IP6PO_TEMPADDR_NOTPREFER;
+ mld_po.ip6po_flags = IP6PO_DONTFRAG;
+ LIST_INIT(&mli_head);
+
+ mli_size = sizeof (struct mld_ifinfo);
+ mli_zone = zinit(mli_size, MLI_ZONE_MAX * mli_size,
+ 0, MLI_ZONE_NAME);
+ if (mli_zone == NULL) {
+ panic("%s: failed allocating %s", __func__, MLI_ZONE_NAME);
+ /* NOTREACHED */
+ }
+ zone_change(mli_zone, Z_EXPAND, TRUE);
+ zone_change(mli_zone, Z_CALLERACCT, FALSE);
+}
diff --git a/bsd/netinet6/mld6.h b/bsd/netinet6/mld6.h
new file mode 100644
index 000000000..ceb41365c
--- /dev/null
+++ b/bsd/netinet6/mld6.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2010 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/*- + * Copyright (c) 2009 Bruce Simpson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET6_MLD6_H_ +#define _NETINET6_MLD6_H_ + +/* + * Multicast Listener Discovery (MLD) definitions. + */ + +/* Minimum length of any MLD protocol message. */ +#define MLD_MINLEN sizeof(struct icmp6_hdr) + +/* + * MLD v2 query format. + * See <netinet/icmp6.h> for struct mld_hdr + * (MLDv1 query and host report format). + */ +struct mldv2_query { + struct icmp6_hdr mld_icmp6_hdr; /* ICMPv6 header */ + struct in6_addr mld_addr; /* address being queried */ + uint8_t mld_misc; /* reserved/suppress/robustness */ + uint8_t mld_qqi; /* querier's query interval */ + uint16_t mld_numsrc; /* number of sources */ + /* followed by 1..numsrc source addresses */ +} __attribute__((__packed__)); +#define MLD_V2_QUERY_MINLEN sizeof(struct mldv2_query) +#define MLD_MRC_EXP(x) ((ntohs((x)) >> 12) & 0x0007) +#define MLD_MRC_MANT(x) (ntohs((x)) & 0x0fff) +#define MLD_QQIC_EXP(x) (((x) >> 4) & 0x07) +#define MLD_QQIC_MANT(x) ((x) & 0x0f) +#define MLD_QRESV(x) (((x) >> 4) & 0x0f) +#define MLD_SFLAG(x) (((x) >> 3) & 0x01) +#define MLD_QRV(x) ((x) & 0x07) + +/* + * MLDv2 host membership report header. + * mld_type: MLDV2_LISTENER_REPORT + */ +struct mldv2_report { + struct icmp6_hdr mld_icmp6_hdr; + /* followed by 1..numgrps records */ +} __attribute__((__packed__)); +/* overlaid on struct icmp6_hdr. */ +#define mld_numrecs mld_icmp6_hdr.icmp6_data16[1] + +struct mldv2_record { + uint8_t mr_type; /* record type */ + uint8_t mr_datalen; /* length of auxiliary data */ + uint16_t mr_numsrc; /* number of sources */ + struct in6_addr mr_addr; /* address being reported */ + /* followed by 1..numsrc source addresses */ +} __attribute__((__packed__)); +#define MLD_V2_REPORT_MAXRECS 65535 + +/* + * MLDv2 report modes. 
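+ *
+ * Apart from MLD_DO_NOTHING, which is internal to the stack, these
+ * values are the group record types carried in the mr_type field of
+ * struct mldv2_record above; mld_rec_type_to_str() in mld6.c maps
+ * them to the short names noted beside each definition.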
+ */ +#define MLD_DO_NOTHING 0 /* don't send a record */ +#define MLD_MODE_IS_INCLUDE 1 /* MODE_IN */ +#define MLD_MODE_IS_EXCLUDE 2 /* MODE_EX */ +#define MLD_CHANGE_TO_INCLUDE_MODE 3 /* TO_IN */ +#define MLD_CHANGE_TO_EXCLUDE_MODE 4 /* TO_EX */ +#define MLD_ALLOW_NEW_SOURCES 5 /* ALLOW_NEW */ +#define MLD_BLOCK_OLD_SOURCES 6 /* BLOCK_OLD */ + +/* + * MLDv2 query types. + */ +#define MLD_V2_GENERAL_QUERY 1 +#define MLD_V2_GROUP_QUERY 2 +#define MLD_V2_GROUP_SOURCE_QUERY 3 + +/* + * Maximum report interval for MLDv1 host membership reports. + */ +#define MLD_V1_MAX_RI 10 + +/* + * MLD_TIMER_SCALE denotes that the MLD code field specifies + * time in milliseconds. + */ +#define MLD_TIMER_SCALE 1000 + +#endif /* _NETINET6_MLD6_H_ */ diff --git a/bsd/netinet6/mld6_var.h b/bsd/netinet6/mld6_var.h index bbeda1ff9..7652cdca9 100644 --- a/bsd/netinet6/mld6_var.h +++ b/bsd/netinet6/mld6_var.h @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + /* $FreeBSD: src/sys/netinet6/mld6_var.h,v 1.1.2.1 2000/07/15 07:14:36 kris Exp $ */ /* $KAME: mld6_var.h,v 1.4 2000/03/25 07:23:54 sumikawa Exp $ */ @@ -34,21 +62,186 @@ #define _NETINET6_MLD6_VAR_H_ #include <sys/appleapiopts.h> -#ifdef KERNEL_PRIVATE +/* + * Multicast Listener Discovery (MLD) + * implementation-specific definitions. + */ + +#ifdef PRIVATE +/* + * Per-link MLD state. + */ +#ifndef XNU_KERNEL_PRIVATE +struct mld_ifinfo { +#else +struct mld_ifinfo_u { +#endif /* XNU_KERNEL_PRIVATE */ + uint32_t mli_ifindex; /* interface this instance belongs to */ + uint32_t mli_version; /* MLDv1 Host Compatibility Mode */ + uint32_t mli_v1_timer; /* MLDv1 Querier Present timer (s) */ + uint32_t mli_v2_timer; /* MLDv2 General Query (interface) timer (s)*/ + uint32_t mli_flags; /* MLD per-interface flags */ + uint32_t mli_rv; /* MLDv2 Robustness Variable */ + uint32_t mli_qi; /* MLDv2 Query Interval (s) */ + uint32_t mli_qri; /* MLDv2 Query Response Interval (s) */ + uint32_t mli_uri; /* MLDv2 Unsolicited Report Interval (s) */ + uint32_t _pad; +}; + +#define MLIF_SILENT 0x00000001 /* Do not use MLD on this ifp */ +#define MLIF_USEALLOW 0x00000002 /* Use ALLOW/BLOCK for joins/leaves */ + +/* + * MLD version tag. 
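+ *
+ * mli_version above holds one of these values; MLD_VERSION_2 is the
+ * default, and the presence of an MLDv1 querier on the link drops
+ * the interface into MLDv1 Host Compatibility Mode.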
+ */ +#define MLD_VERSION_NONE 0 /* Invalid */ +#define MLD_VERSION_1 1 +#define MLD_VERSION_2 2 /* Default */ +#endif /* PRIVATE */ + +#ifdef XNU_KERNEL_PRIVATE +#include <sys/syslog.h> + +#define MLD_DEBUG 1 +#ifdef MLD_DEBUG +extern int mld_debug; +#define MLD_PRINTF(x) do { if (mld_debug) printf x; } while (0) +#else +#define MLD_PRINTF(x) +#endif + +#define MLD_RANDOM_DELAY(X) (random() % (X) + 1) +#define MLD_MAX_STATE_CHANGES 24 /* Max pending changes per group */ + +/* + * MLD per-group states. + */ +#define MLD_NOT_MEMBER 0 /* Can garbage collect group */ +#define MLD_SILENT_MEMBER 1 /* Do not perform MLD for group */ +#define MLD_REPORTING_MEMBER 2 /* MLDv1 we are reporter */ +#define MLD_IDLE_MEMBER 3 /* MLDv1 we reported last */ +#define MLD_LAZY_MEMBER 4 /* MLDv1 other member reporting */ +#define MLD_SLEEPING_MEMBER 5 /* MLDv1 start query response */ +#define MLD_AWAKENING_MEMBER 6 /* MLDv1 group timer will start */ +#define MLD_G_QUERY_PENDING_MEMBER 7 /* MLDv2 group query pending */ +#define MLD_SG_QUERY_PENDING_MEMBER 8 /* MLDv2 source query pending */ +#define MLD_LEAVING_MEMBER 9 /* MLDv2 dying gasp (pending last */ + /* retransmission of INCLUDE {}) */ +/* + * MLDv2 protocol control variables. + */ +#define MLD_RV_INIT 2 /* Robustness Variable */ +#define MLD_RV_MIN 1 +#define MLD_RV_MAX 7 + +#define MLD_QI_INIT 125 /* Query Interval (s) */ +#define MLD_QI_MIN 1 +#define MLD_QI_MAX 255 -#define MLD6_RANDOM_DELAY(X) (random() % (X) + 1) +#define MLD_QRI_INIT 10 /* Query Response Interval (s) */ +#define MLD_QRI_MIN 1 +#define MLD_QRI_MAX 255 + +#define MLD_URI_INIT 3 /* Unsolicited Report Interval (s) */ +#define MLD_URI_MIN 0 +#define MLD_URI_MAX 10 + +#define MLD_MAX_GS_SOURCES 256 /* # of sources in rx GS query */ +#define MLD_MAX_G_GS_PACKETS 8 /* # of packets to answer G/GS */ +#define MLD_MAX_STATE_CHANGE_PACKETS 8 /* # of packets per state change */ +#define MLD_MAX_RESPONSE_PACKETS 16 /* # of packets for general query */ +#define MLD_MAX_RESPONSE_BURST 4 /* # of responses to send at once */ +#define MLD_RESPONSE_BURST_INTERVAL (PR_SLOWHZ) /* 500ms */ + +/* + * MLD-specific mbuf flags. + */ +#define M_MLDV1 M_PROTO1 /* Packet is MLDv1 */ +#define M_GROUPREC M_PROTO3 /* mbuf chain is a group record */ + +/* + * Leading space for MLDv2 reports inside MTU. + * + * NOTE: This differs from IGMPv3 significantly. KAME IPv6 requires + * that a fully formed mbuf chain *without* the Router Alert option + * is passed to ip6_output(), however we must account for it in the + * MTU if we need to split an MLDv2 report into several packets. + * + * We now put the MLDv2 report header in the initial mbuf containing + * the IPv6 header. 
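+ *
+ * As a rough worked example (for illustration only, assuming the
+ * padded Router Alert hop-by-hop option occupies a single 8-byte
+ * unit): on a standard 1500-byte Ethernet MTU this leading space is
+ * 40 (IPv6 header) + 8 (mld_raopt) + 8 (ICMPv6 header) = 56 bytes,
+ * leaving roughly 1444 bytes per packet for group records.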
+ */ +#define MLD_MTUSPACE (sizeof(struct ip6_hdr) + sizeof(struct mld_raopt) + \ + sizeof(struct icmp6_hdr)) + +struct mld_ifinfo { + decl_lck_mtx_data(, mli_lock); + uint32_t mli_refcnt; /* reference count */ + uint32_t mli_debug; /* see ifa_debug flags */ + LIST_ENTRY(mld_ifinfo) mli_link; + struct ifnet *mli_ifp; /* interface this instance belongs to */ + uint32_t mli_version; /* MLDv1 Host Compatibility Mode */ + uint32_t mli_v1_timer; /* MLDv1 Querier Present timer (s) */ + uint32_t mli_v2_timer; /* MLDv2 General Query (interface) timer (s)*/ + uint32_t mli_flags; /* MLD per-interface flags */ + uint32_t mli_rv; /* MLDv2 Robustness Variable */ + uint32_t mli_qi; /* MLDv2 Query Interval (s) */ + uint32_t mli_qri; /* MLDv2 Query Response Interval (s) */ + uint32_t mli_uri; /* MLDv2 Unsolicited Report Interval (s) */ + SLIST_HEAD(,in6_multi) mli_relinmhead; /* released groups */ + struct ifqueue mli_gq; /* queue of general query responses */ + struct ifqueue mli_v1q; /* MLDv1 message queue */ +}; + +#define MLI_LOCK_ASSERT_HELD(_mli) \ + lck_mtx_assert(&(_mli)->mli_lock, LCK_MTX_ASSERT_OWNED) + +#define MLI_LOCK_ASSERT_NOTHELD(_mli) \ + lck_mtx_assert(&(_mli)->mli_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define MLI_LOCK(_mli) \ + lck_mtx_lock(&(_mli)->mli_lock) + +#define MLI_LOCK_SPIN(_mli) \ + lck_mtx_lock_spin(&(_mli)->mli_lock) + +#define MLI_CONVERT_LOCK(_mli) do { \ + MLI_LOCK_ASSERT_HELD(_mli); \ + lck_mtx_convert_spin(&(_mli)->mli_lock); \ +} while (0) + +#define MLI_UNLOCK(_mli) \ + lck_mtx_unlock(&(_mli)->mli_lock) + +#define MLI_ADDREF(_mli) \ + mli_addref(_mli, 0) + +#define MLI_ADDREF_LOCKED(_mli) \ + mli_addref(_mli, 1) + +#define MLI_REMREF(_mli) \ + mli_remref(_mli) /* - * States for MLD stop-listening processing + * Per-link MLD context. */ -#define MLD6_OTHERLISTENER 0 -#define MLD6_IREPORTEDLAST 1 +#define MLD_IFINFO(ifp) ((ifp)->if_mli) + +extern int mld_change_state(struct in6_multi *, const int); +extern struct mld_ifinfo *mld_domifattach(struct ifnet *, int); +extern void mld_domifreattach(struct mld_ifinfo *); +extern void mld_domifdetach(struct ifnet *); +extern void mld_fasttimo(void); +extern void mld_ifdetach(struct ifnet *); +extern int mld_input(struct mbuf *, int, int); +extern void mld_slowtimo(void); +extern void mld_init(void); +extern void mli_addref(struct mld_ifinfo *, int); +extern void mli_remref(struct mld_ifinfo *); + +#ifdef SYSCTL_DECL +SYSCTL_DECL(_net_inet6_mld); +#endif -void mld6_init(void); -void mld6_input(struct mbuf *, int); -void mld6_start_listening(struct in6_multi *); -void mld6_stop_listening(struct in6_multi *); -void mld6_fasttimeo(void); -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #endif /* _NETINET6_MLD6_VAR_H_ */ diff --git a/bsd/netinet6/nd6.c b/bsd/netinet6/nd6.c index d71746042..77ab7630a 100644 --- a/bsd/netinet6/nd6.c +++ b/bsd/netinet6/nd6.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,6 +79,8 @@ #include <sys/syslog.h> #include <sys/protosw.h> #include <sys/proc.h> +#include <sys/mcache.h> + #include <kern/queue.h> #include <kern/zalloc.h> @@ -86,19 +88,19 @@ #include <net/if.h> #include <net/if_dl.h> #include <net/if_types.h> -#include <net/if_atm.h> +#include <net/if_llreach.h> #include <net/route.h> #include <net/dlil.h> +#include <net/ntstat.h> #include <netinet/in.h> #include <netinet/in_arp.h> #include <netinet/if_ether.h> -#include <netinet/if_fddi.h> #include <netinet6/in6_var.h> #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #include <netinet6/nd6.h> -#include <netinet6/in6_prefix.h> +#include <netinet6/scope6_var.h> #include <netinet/icmp6.h> #include "loop.h" @@ -125,6 +127,7 @@ int nd6_gctimer = (60 * 60 * 24); /* 1 day: garbage collection timer */ int nd6_maxndopt = 10; /* max # of ND options allowed */ int nd6_maxnudhint = 0; /* max # of subsequent upper layer hints */ +int nd6_maxqueuelen = 1; /* max # of packets cached in unresolved ND entries */ #if ND6_DEBUG int nd6_debug = 1; @@ -132,6 +135,8 @@ int nd6_debug = 1; int nd6_debug = 0; #endif +static int nd6_is_new_addr_neighbor (struct sockaddr_in6 *, struct ifnet *); + /* for debugging? */ static int nd6_inuse, nd6_allocated; @@ -151,7 +156,8 @@ static int nd6_inuse, nd6_allocated; * * - Routing lock (rnh_lock) * - * ln_hold, ln_asked, ln_expire, ln_state, ln_router, ln_byhint, ln_flags + * ln_hold, ln_asked, ln_expire, ln_state, ln_router, ln_byhint, ln_flags, + * ln_llreach, ln_lastused * * - Routing entry lock (rt_lock) * @@ -161,7 +167,7 @@ static int nd6_inuse, nd6_allocated; * freed until the route itself is freed. */ struct llinfo_nd6 llinfo_nd6 = { - &llinfo_nd6, &llinfo_nd6, NULL, NULL, 0, 0, 0, 0, 0, 0 + &llinfo_nd6, &llinfo_nd6, NULL, NULL, 0, 0, 0, 0, 0, 0, NULL, 0 }; /* Protected by nd_if_rwlock */ @@ -177,16 +183,22 @@ lck_rw_t *nd_if_rwlock; struct nd_drhead nd_defrouter; struct nd_prhead nd_prefix = { 0 }; +/* Serialization variables for nd6_drain() */ +static boolean_t nd6_drain_busy; +static void *nd6_drain_waitchan = &nd6_drain_busy; +static int nd6_drain_waiters = 0; + int nd6_recalc_reachtm_interval = ND6_RECALC_REACHTM_INTERVAL; static struct sockaddr_in6 all1_sa; static int regen_tmpaddr(struct in6_ifaddr *); -extern lck_mtx_t *ip6_mutex; extern lck_mtx_t *nd6_mutex; static void nd6_slowtimo(void *ignored_arg); static struct llinfo_nd6 *nd6_llinfo_alloc(void); static void nd6_llinfo_free(void *); +static void nd6_llinfo_purge(struct rtentry *); +static void nd6_llinfo_get_ri(struct rtentry *, struct rt_reach_info *); static void nd6_siocgdrlst(void *, int); static void nd6_siocgprlst(void *, int); @@ -249,6 +261,10 @@ nd6_init() panic("%s: failed allocating llinfo_nd6_zone", __func__); zone_change(llinfo_nd6_zone, Z_EXPAND, TRUE); + zone_change(llinfo_nd6_zone, Z_CALLERACCT, FALSE); + + nd6_nbr_init(); + nd6_rtr_init(); nd6_init_done = 1; @@ -278,9 +294,48 @@ nd6_llinfo_free(void *arg) ln->ln_hold = NULL; } + /* Purge any link-layer info caching */ + VERIFY(ln->ln_rt->rt_llinfo == ln); + if (ln->ln_rt->rt_llinfo_purge != NULL) + ln->ln_rt->rt_llinfo_purge(ln->ln_rt); + zfree(llinfo_nd6_zone, ln); } +static void +nd6_llinfo_purge(struct rtentry *rt) +{ + struct llinfo_nd6 *ln = rt->rt_llinfo; + + RT_LOCK_ASSERT_HELD(rt); + VERIFY(rt->rt_llinfo_purge == nd6_llinfo_purge && ln != NULL); + + if (ln->ln_llreach != NULL) { + RT_CONVERT_LOCK(rt); + ifnet_llreach_free(ln->ln_llreach); + ln->ln_llreach = NULL; + } + 
ln->ln_lastused = 0; +} + +static void +nd6_llinfo_get_ri(struct rtentry *rt, struct rt_reach_info *ri) +{ + struct llinfo_nd6 *ln = rt->rt_llinfo; + struct if_llreach *lr = ln->ln_llreach; + + if (lr == NULL) { + bzero(ri, sizeof (*ri)); + } else { + IFLR_LOCK(lr); + /* Export to rt_reach_info structure */ + ifnet_lr2ri(lr, ri); + /* Export ND6 send expiration time */ + ri->ri_snd_expire = ifnet_llreach_up2cal(lr, ln->ln_lastused); + IFLR_UNLOCK(lr); + } +} + int nd6_ifattach(struct ifnet *ifp) { @@ -338,7 +393,6 @@ nd6_ifattach(struct ifnet *ifp) ND.basereachable = REACHABLE_TIME; ND.reachable = ND_COMPUTE_RTIME(ND.basereachable); ND.retrans = RETRANS_TIMER; - ND.receivedra = 0; ND.flags = ND6_IFF_PERFORMNUD; lck_rw_done(nd_if_rwlock); nd6_setmtu(ifp); @@ -378,8 +432,8 @@ nd6_setmtu(struct ifnet *ifp) * the sanity checks related to the maximum MTU allowed for the * interface (a value that is known only by the interface layer), * by sending the request down via ifnet_ioctl(). The use of the - * ND level maxmtu and linkmtu (the latter obtained via RA) are done - * via IN6_LINKMTU() which does further checking against if_mtu. + * ND level maxmtu and linkmtu are done via IN6_LINKMTU() which + * does further checking against if_mtu. */ maxmtu = ndi->maxmtu = ifp->if_mtu; @@ -394,6 +448,7 @@ nd6_setmtu(struct ifnet *ifp) "new link MTU on %s%d (%u) is too small for IPv6\n", ifp->if_name, ifp->if_unit, (uint32_t)ndi->maxmtu); } + ndi->linkmtu = ifp->if_mtu; lck_rw_done(nd_if_rwlock); /* also adjust in6_maxmtu if necessary. */ @@ -480,16 +535,16 @@ nd6_options( struct nd_opt_hdr *nd_opt; int i = 0; - if (!ndopts) - panic("ndopts == NULL in nd6_options\n"); - if (!ndopts->nd_opts_last) - panic("uninitialized ndopts in nd6_options\n"); - if (!ndopts->nd_opts_search) + if (ndopts == NULL) + panic("ndopts == NULL in nd6_options"); + if (ndopts->nd_opts_last == NULL) + panic("uninitialized ndopts in nd6_options"); + if (ndopts->nd_opts_search == NULL) return 0; while (1) { nd_opt = nd6_option(ndopts); - if (!nd_opt && !ndopts->nd_opts_last) { + if (nd_opt == NULL && ndopts->nd_opts_last == NULL) { /* * Message validation requires that all included * options have a length that is greater than zero. 
@@ -499,7 +554,7 @@ nd6_options(
 return -1;
 }
 
- if (!nd_opt)
+ if (nd_opt == NULL)
 goto skip1;
 
 switch (nd_opt->nd_opt_type) {
@@ -525,6 +580,9 @@ nd6_options(
 ndopts->nd_opts_pi_end =
 (struct nd_opt_prefix_info *)nd_opt;
 break;
+ case ND_OPT_RDNSS:
+ /* ignore */
+ break;
 default:
 /*
 * Unknown options must be silently ignored,
@@ -581,7 +639,6 @@ again:
 struct rtentry *rt;
 struct sockaddr_in6 *dst;
 struct llinfo_nd6 *next;
- struct nd_ifinfo ndi;
 
 /* ln_next/prev/rt is protected by rnh_lock */
 next = ln->ln_next;
@@ -634,7 +691,6 @@ again:
 ln = next;
 continue;
 }
- ndi = nd_ifinfo[ifp->if_index];
 lck_rw_done(nd_if_rwlock);
 
 RT_LOCK_ASSERT_HELD(rt);
@@ -643,13 +699,15 @@ again:
 case ND6_LLINFO_INCOMPLETE:
 if (ln->ln_asked < nd6_mmaxtries) {
 ln->ln_asked++;
+ lck_rw_lock_shared(nd_if_rwlock);
 ln->ln_expire = timenow.tv_sec +
- ndi.retrans / 1000;
+ nd_ifinfo[ifp->if_index].retrans / 1000;
+ lck_rw_done(nd_if_rwlock);
 RT_ADDREF_LOCKED(rt);
 RT_UNLOCK(rt);
 lck_mtx_unlock(rnh_lock);
 nd6_ns_output(ifp, NULL, &dst->sin6_addr,
- ln, 0, 0);
+ ln, 0);
 RT_REMREF(rt);
 } else {
 struct mbuf *m = ln->ln_hold;
@@ -701,22 +759,26 @@ again:
 break;
 
 case ND6_LLINFO_DELAY:
- if ((ndi.flags & ND6_IFF_PERFORMNUD) != 0) {
+ lck_rw_lock_shared(nd_if_rwlock);
+ if ((nd_ifinfo[ifp->if_index].flags &
+ ND6_IFF_PERFORMNUD) != 0) {
 /* We need NUD */
 ln->ln_asked = 1;
 ln->ln_state = ND6_LLINFO_PROBE;
 ln->ln_expire = timenow.tv_sec +
- ndi.retrans / 1000;
+ nd_ifinfo[ifp->if_index].retrans / 1000;
+ lck_rw_done(nd_if_rwlock);
 RT_ADDREF_LOCKED(rt);
 RT_UNLOCK(rt);
 lck_mtx_unlock(rnh_lock);
 nd6_ns_output(ifp, &dst->sin6_addr,
- &dst->sin6_addr, ln, 0, 0);
+ &dst->sin6_addr, ln, 0);
 lck_mtx_assert(rnh_lock,
 LCK_MTX_ASSERT_NOTOWNED);
 RT_REMREF(rt);
 goto again;
 }
+ lck_rw_done(nd_if_rwlock);
 ln->ln_state = ND6_LLINFO_STALE; /* XXX */
 ln->ln_expire = rt_expiry(rt, timenow.tv_sec,
 nd6_gctimer);
@@ -726,13 +788,15 @@ again:
 case ND6_LLINFO_PROBE:
 if (ln->ln_asked < nd6_umaxtries) {
 ln->ln_asked++;
+ lck_rw_lock_shared(nd_if_rwlock);
 ln->ln_expire = timenow.tv_sec +
- ndi.retrans / 1000;
+ nd_ifinfo[ifp->if_index].retrans / 1000;
+ lck_rw_done(nd_if_rwlock);
 RT_ADDREF_LOCKED(rt);
 RT_UNLOCK(rt);
 lck_mtx_unlock(rnh_lock);
 nd6_ns_output(ifp, &dst->sin6_addr,
- &dst->sin6_addr, ln, 0, 0);
+ &dst->sin6_addr, ln, 0);
 RT_REMREF(rt);
 } else {
 RT_UNLOCK(rt);
@@ -771,12 +835,13 @@ again:
 if (dr->expire && dr->expire < timenow.tv_sec) {
 struct nd_defrouter *t;
 t = TAILQ_NEXT(dr, dr_entry);
- defrtrlist_del(dr, 1);
+ defrtrlist_del(dr);
 dr = t;
 } else {
 dr = TAILQ_NEXT(dr, dr_entry);
 }
 }
+ lck_mtx_unlock(nd6_mutex);
 
 /*
 * expire interface addresses.
 * However, from a stricter spec-conformance standpoint, we should
 * rather separate address lifetimes and prefix lifetimes.
 */
- addrloop:
+addrloop:
+ lck_rw_lock_exclusive(&in6_ifaddr_rwlock);
 for (ia6 = in6_ifaddrs; ia6; ia6 = nia6) {
 nia6 = ia6->ia_next;
+ IFA_LOCK(&ia6->ia_ifa);
+ /*
+ * Extra reference for ourselves; it's no-op if
+ * we don't have to regenerate temporary address,
+ * otherwise it protects the address from going
+ * away since we drop in6_ifaddr_rwlock below.
+ */
+ IFA_ADDREF_LOCKED(&ia6->ia_ifa);
 /* check address lifetime */
 lt6 = &ia6->ia6_lifetime;
 if (IFA6_IS_INVALID(ia6)) {
- int regen = 0;
-
- /*
- * Extra reference for ourselves; it's no-op if
- * we don't have to regenerate temporary address,
- * otherwise it protects the address from going
- * away since we drop nd6_mutex below.
- */ - ifaref(&ia6->ia_ifa); - /* * If the expiring address is temporary, try * regenerating a new one. This would be useful when @@ -818,20 +882,27 @@ again: * hang. This is safe because the goto addrloop * leads to a reevaluation of the in6_ifaddrs list */ - lck_mtx_unlock(nd6_mutex); - if (regen_tmpaddr(ia6) == 0) - regen = 1; - lck_mtx_lock(nd6_mutex); + IFA_UNLOCK(&ia6->ia_ifa); + lck_rw_done(&in6_ifaddr_rwlock); + (void) regen_tmpaddr(ia6); + } else { + IFA_UNLOCK(&ia6->ia_ifa); + lck_rw_done(&in6_ifaddr_rwlock); } - in6_purgeaddr(&ia6->ia_ifa, 1); + /* + * Purging the address would have caused + * in6_ifaddr_rwlock to be dropped and reacquired; + * therefore search again from the beginning + * of in6_ifaddrs list. + */ + in6_purgeaddr(&ia6->ia_ifa); /* Release extra reference taken above */ - ifafree(&ia6->ia_ifa); - - if (regen) - goto addrloop; /* XXX: see below */ + IFA_REMREF(&ia6->ia_ifa); + goto addrloop; } + IFA_LOCK_ASSERT_HELD(&ia6->ia_ifa); if (IFA6_IS_DEPRECATED(ia6)) { int oldflags = ia6->ia6_flags; @@ -846,7 +917,8 @@ again: (oldflags & IN6_IFF_DEPRECATED) == 0) { /* see NOTE above */ - lck_mtx_unlock(nd6_mutex); + IFA_UNLOCK(&ia6->ia_ifa); + lck_rw_done(&in6_ifaddr_rwlock); if (regen_tmpaddr(ia6) == 0) { /* * A new temporary address is @@ -860,10 +932,13 @@ again: * loop just for safety. Or does this * significantly reduce performance?? */ - lck_mtx_lock(nd6_mutex); + /* Release extra reference */ + IFA_REMREF(&ia6->ia_ifa); goto addrloop; } - lck_mtx_lock(nd6_mutex); + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + } else { + IFA_UNLOCK(&ia6->ia_ifa); } } else { /* @@ -871,8 +946,26 @@ again: * preferred. */ ia6->ia6_flags &= ~IN6_IFF_DEPRECATED; + IFA_UNLOCK(&ia6->ia_ifa); } + lck_rw_assert(&in6_ifaddr_rwlock, LCK_RW_ASSERT_EXCLUSIVE); + /* Release extra reference taken above */ + IFA_REMREF(&ia6->ia_ifa); } + lck_rw_done(&in6_ifaddr_rwlock); + + lck_mtx_lock(nd6_mutex); + /* + * Since we drop the nd6_mutex in prelist_remove, we want to run this + * section single threaded. + */ + while (nd6_drain_busy) { + nd6_drain_waiters++; + msleep(nd6_drain_waitchan, nd6_mutex, (PZERO-1), + __func__, NULL); + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + } + nd6_drain_busy = TRUE; /* expire prefix list */ pr = nd_prefix.lh_first; @@ -882,19 +975,38 @@ again: * since pltime is just for autoconf, pltime processing for * prefix is not necessary. */ + NDPR_LOCK(pr); + if (pr->ndpr_stateflags & NDPRF_PROCESSED) { + NDPR_UNLOCK(pr); + pr = pr->ndpr_next; + continue; + } if (pr->ndpr_expire && pr->ndpr_expire < timenow.tv_sec) { - struct nd_prefix *t; - t = pr->ndpr_next; - /* * address expiration and prefix expiration are * separate. NEVER perform in6_purgeaddr here. 
*/ - - prelist_remove(pr, 1); - pr = t; - } else + pr->ndpr_stateflags |= NDPRF_PROCESSED; + NDPR_ADDREF_LOCKED(pr); + prelist_remove(pr); + NDPR_UNLOCK(pr); + NDPR_REMREF(pr); + pr = nd_prefix.lh_first; + } else { + pr->ndpr_stateflags |= NDPRF_PROCESSED; + NDPR_UNLOCK(pr); pr = pr->ndpr_next; + } + } + LIST_FOREACH(pr, &nd_prefix, ndpr_entry) { + NDPR_LOCK(pr); + pr->ndpr_stateflags &= ~NDPRF_PROCESSED; + NDPR_UNLOCK(pr); + } + nd6_drain_busy = FALSE; + if (nd6_drain_waiters > 0) { + nd6_drain_waiters = 0; + wakeup(nd6_drain_waitchan); } lck_mtx_unlock(nd6_mutex); } @@ -921,25 +1033,29 @@ regen_tmpaddr( getmicrotime(&timenow); ifp = ia6->ia_ifa.ifa_ifp; - ifnet_lock_exclusive(ifp); + ifnet_lock_shared(ifp); for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = ifa->ifa_list.tqe_next) { struct in6_ifaddr *it6; - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; - + } it6 = (struct in6_ifaddr *)ifa; /* ignore no autoconf addresses. */ - if ((it6->ia6_flags & IN6_IFF_AUTOCONF) == 0) + if ((it6->ia6_flags & IN6_IFF_AUTOCONF) == 0) { + IFA_UNLOCK(ifa); continue; - + } /* ignore autoconf addresses with different prefixes. */ - if (it6->ia6_ndpr == NULL || it6->ia6_ndpr != ia6->ia6_ndpr) + if (it6->ia6_ndpr == NULL || it6->ia6_ndpr != ia6->ia6_ndpr) { + IFA_UNLOCK(ifa); continue; - + } /* * Now we are looking at an autoconf address with the same * prefix as ours. If the address is temporary and is still @@ -949,6 +1065,9 @@ regen_tmpaddr( */ if ((it6->ia6_flags & IN6_IFF_TEMPORARY) != 0 && !IFA6_IS_DEPRECATED(it6)) { + IFA_UNLOCK(ifa); + if (public_ifa6 != NULL) + IFA_REMREF(&public_ifa6->ia_ifa); public_ifa6 = NULL; break; } @@ -959,8 +1078,15 @@ regen_tmpaddr( * loop here, because there may be a still-preferred temporary * address with the prefix. */ - if (!IFA6_IS_DEPRECATED(it6)) - public_ifa6 = it6; + if (!IFA6_IS_DEPRECATED(it6)) { + IFA_ADDREF_LOCKED(ifa); /* for public_ifa6 */ + IFA_UNLOCK(ifa); + if (public_ifa6 != NULL) + IFA_REMREF(&public_ifa6->ia_ifa); + public_ifa6 = it6; + } else { + IFA_UNLOCK(ifa); + } } ifnet_lock_done(ifp); @@ -970,8 +1096,10 @@ regen_tmpaddr( if ((e = in6_tmpifadd(public_ifa6, 0, M_WAITOK)) != 0) { log(LOG_NOTICE, "regen_tmpaddr: failed to create a new" " tmp addr,errno=%d\n", e); + IFA_REMREF(&public_ifa6->ia_ifa); return(-1); } + IFA_REMREF(&public_ifa6->ia_ifa); return(0); } @@ -987,7 +1115,7 @@ nd6_purge( struct ifnet *ifp) { struct llinfo_nd6 *ln; - struct nd_defrouter *dr, *ndr, drany; + struct nd_defrouter *dr, *ndr; struct nd_prefix *pr, *npr; /* Nuke default router list entries toward ifp */ @@ -999,18 +1127,38 @@ nd6_purge( */ for (dr = TAILQ_NEXT(dr, dr_entry); dr; dr = ndr) { ndr = TAILQ_NEXT(dr, dr_entry); + if (dr->stateflags & NDDRF_INSTALLED) + continue; if (dr->ifp == ifp) - defrtrlist_del(dr, 1); + defrtrlist_del(dr); } dr = TAILQ_FIRST(&nd_defrouter); if (dr->ifp == ifp) - defrtrlist_del(dr, 1); + defrtrlist_del(dr); + } + + for (dr = TAILQ_FIRST(&nd_defrouter); dr; dr = ndr) { + ndr = TAILQ_NEXT(dr, dr_entry); + if (!(dr->stateflags & NDDRF_INSTALLED)) + continue; + + if (dr->ifp == ifp) + defrtrlist_del(dr); } /* Nuke prefix list entries toward ifp */ for (pr = nd_prefix.lh_first; pr; pr = npr) { npr = pr->ndpr_next; + NDPR_LOCK(pr); if (pr->ndpr_ifp == ifp) { + /* + * Because if_detach() does *not* release prefixes + * while purging addresses the reference count will + * still be above zero. 
We therefore reset it to + * make sure that the prefix really gets purged. + */ + pr->ndpr_addrcnt = 0; + /* * Previously, pr->ndpr_addr is removed as well, * but I strongly believe we don't have to do it. @@ -1019,27 +1167,28 @@ nd6_purge( * by itself. * (jinmei@kame.net 20010129) */ - prelist_remove(pr, 1); + NDPR_ADDREF_LOCKED(pr); + prelist_remove(pr); + NDPR_UNLOCK(pr); + NDPR_REMREF(pr); + } else { + NDPR_UNLOCK(pr); } } + lck_mtx_unlock(nd6_mutex); /* cancel default outgoing interface setting */ if (nd6_defifindex == ifp->if_index) { - /* Release nd6_mutex as it will be acquired - * during nd6_setdefaultiface again - */ - lck_mtx_unlock(nd6_mutex); nd6_setdefaultiface(0); - lck_mtx_lock(nd6_mutex); } if (!ip6_forwarding && (ip6_accept_rtadv || (ifp->if_eflags & IFEF_ACCEPT_RTADVD))) { + lck_mtx_lock(nd6_mutex); /* refresh default router list */ - bzero(&drany, sizeof(drany)); - defrouter_delreq(&drany, 0); - defrouter_select(); + defrouter_reset(); + defrouter_select(ifp); + lck_mtx_unlock(nd6_mutex); } - lck_mtx_unlock(nd6_mutex); /* * Nuke neighbor cache entries for the ifp. @@ -1098,28 +1247,31 @@ nd6_lookup( { struct rtentry *rt; struct sockaddr_in6 sin6; + unsigned int ifscope; bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = *addr6; -#if SCOPEDROUTING - sin6.sin6_scope_id = in6_addr2scopeid(ifp, addr6); -#endif - if (rt_locked) - lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); - rt = rt_locked ? rtalloc1_locked((struct sockaddr *)&sin6, create, 0) : - rtalloc1((struct sockaddr *)&sin6, create, 0); + ifscope = (ifp != NULL) ? ifp->if_index : IFSCOPE_NONE; + if (rt_locked) { + lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED); + rt = rtalloc1_scoped_locked((struct sockaddr *)&sin6, + create, 0, ifscope); + } else { + rt = rtalloc1_scoped((struct sockaddr *)&sin6, + create, 0, ifscope); + } if (rt != NULL) { RT_LOCK(rt); if ((rt->rt_flags & RTF_LLINFO) == 0) { /* - * This is the case for the default route. If we - * want to create a neighbor cache for the address, - * we should free the route for the destination and - * allocate an interface route. + * This is the case for the default route. + * If we want to create a neighbor cache for the + * address, we should free the route for the + * destination and allocate an interface route. */ if (create) { RT_UNLOCK(rt); @@ -1134,6 +1286,7 @@ nd6_lookup( if (rt == NULL) { if (create && ifp) { struct ifaddr *ifa; + u_int32_t ifa_flags; int e; /* @@ -1155,11 +1308,14 @@ nd6_lookup( */ if (!rt_locked) lck_mtx_lock(rnh_lock); - if ((e = rtrequest_locked(RTM_ADD, + IFA_LOCK_SPIN(ifa); + ifa_flags = ifa->ifa_flags; + IFA_UNLOCK(ifa); + if ((e = rtrequest_scoped_locked(RTM_ADD, (struct sockaddr *)&sin6, ifa->ifa_addr, (struct sockaddr *)&all1_sa, - (ifa->ifa_flags | RTF_HOST | RTF_LLINFO) & - ~RTF_CLONING, &rt)) != 0) { + (ifa_flags | RTF_HOST | RTF_LLINFO) & + ~RTF_CLONING, &rt, ifscope)) != 0) { if (e != EEXIST) log(LOG_ERR, "%s: failed to add route " "for a neighbor(%s), errno=%d\n", @@ -1167,7 +1323,7 @@ nd6_lookup( } if (!rt_locked) lck_mtx_unlock(rnh_lock); - ifafree(ifa); + IFA_REMREF(ifa); if (rt == NULL) return(NULL); @@ -1191,10 +1347,13 @@ nd6_lookup( * it might be the loopback interface if the entry is for our * own address on a non-loopback interface. Instead, we should * use rt->rt_ifa->ifa_ifp, which would specify the REAL - * interface. + * interface. 
+ * Note also that ifa_ifp and ifp may differ when we connect two + * interfaces to a same link, install a link prefix to an interface, + * and try to install a neighbor cache on an interface that does not + * have a route to the prefix. */ - if (ifp == NULL || (ifp->if_type == IFT_PPP) || - (ifp->if_eflags & IFEF_NOAUTOIPV6LL) || + if (ifp == NULL || (rt->rt_flags & RTF_GATEWAY) || (rt->rt_flags & RTF_LLINFO) == 0 || rt->rt_gateway->sa_family != AF_LINK || rt->rt_llinfo == NULL || (ifp && rt->rt_ifa->ifa_ifp != ifp)) { @@ -1215,73 +1374,132 @@ nd6_lookup( } /* - * Detect if a given IPv6 address identifies a neighbor on a given link. - * XXX: should take care of the destination of a p2p link? + * Test whether a given IPv6 address is a neighbor or not, ignoring + * the actual neighbor cache. The neighbor cache is ignored in order + * to not reenter the routing code from within itself. */ -int -nd6_is_addr_neighbor( +static int +nd6_is_new_addr_neighbor( struct sockaddr_in6 *addr, - struct ifnet *ifp, - int rt_locked) + struct ifnet *ifp) { - struct ifaddr *ifa; - struct rtentry *rt; - int i; + struct nd_prefix *pr; + struct ifaddr *dstaddr; -#define IFADDR6(a) ((((struct in6_ifaddr *)(a))->ia_addr).sin6_addr) -#define IFMASK6(a) ((((struct in6_ifaddr *)(a))->ia_prefixmask).sin6_addr) + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); /* * A link-local address is always a neighbor. - * XXX: we should use the sin6_scope_id field rather than the embedded - * interface index. + * XXX: a link does not necessarily specify a single interface. */ - if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr) && - ntohs(*(u_int16_t *)&addr->sin6_addr.s6_addr[2]) == ifp->if_index) - return(1); + if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr)) { + struct sockaddr_in6 sin6_copy; + u_int32_t zone; + + /* + * We need sin6_copy since sa6_recoverscope() may modify the + * content (XXX). + */ + sin6_copy = *addr; + if (sa6_recoverscope(&sin6_copy)) + return (0); /* XXX: should be impossible */ + if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone)) + return (0); + if (sin6_copy.sin6_scope_id == zone) + return (1); + else + return (0); + } /* * If the address matches one of our addresses, * it should be a neighbor. + * If the address matches one of our on-link prefixes, it should be a + * neighbor. */ - ifnet_lock_shared(ifp); - for (ifa = ifp->if_addrlist.tqh_first; - ifa; - ifa = ifa->ifa_list.tqe_next) - { - if (ifa->ifa_addr->sa_family != AF_INET6) + for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { + NDPR_LOCK(pr); + if (pr->ndpr_ifp != ifp) { + NDPR_UNLOCK(pr); + continue; + } + if (!(pr->ndpr_stateflags & NDPRF_ONLINK)) { + NDPR_UNLOCK(pr); continue; + } + if (IN6_ARE_MASKED_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr, + &addr->sin6_addr, &pr->ndpr_mask)) { + NDPR_UNLOCK(pr); + return (1); + } + NDPR_UNLOCK(pr); + } - for (i = 0; i < 4; i++) { - if ((IFADDR6(ifa).s6_addr32[i] ^ - addr->sin6_addr.s6_addr32[i]) & - IFMASK6(ifa).s6_addr32[i]) - continue; + /* + * If the address is assigned on the node of the other side of + * a p2p interface, the address should be a neighbor. + */ + dstaddr = ifa_ifwithdstaddr((struct sockaddr *)addr); + if (dstaddr != NULL) { + if (dstaddr->ifa_ifp == ifp) { + IFA_REMREF(dstaddr); + return (1); } - ifnet_lock_done(ifp); - return(1); + IFA_REMREF(dstaddr); + dstaddr = NULL; } - ifnet_lock_done(ifp); + + /* + * If the default router list is empty, all addresses are regarded + * as on-link, and thus, as a neighbor. 
+ * XXX: we restrict the condition to hosts, because routers usually do + * not have the "default router list". + */ + if (!ip6_forwarding && TAILQ_FIRST(&nd_defrouter) == NULL && + nd6_defifindex == ifp->if_index) { + return (1); + } + + return (0); +} + + +/* + * Detect if a given IPv6 address identifies a neighbor on a given link. + * XXX: should take care of the destination of a p2p link? + */ +int +nd6_is_addr_neighbor(struct sockaddr_in6 *addr, struct ifnet *ifp, int rt_locked) +{ + struct rtentry *rt; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); + lck_mtx_lock(nd6_mutex); + if (nd6_is_new_addr_neighbor(addr, ifp)) { + lck_mtx_unlock(nd6_mutex); + return (1); + } + lck_mtx_unlock(nd6_mutex); /* * Even if the address matches none of our addresses, it might be - * in the neighbor cache. Callee returns a locked route upon - * success. + * in the neighbor cache. */ if ((rt = nd6_lookup(&addr->sin6_addr, 0, ifp, rt_locked)) != NULL) { RT_LOCK_ASSERT_HELD(rt); RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); - return(1); + return (1); } - return(0); -#undef IFADDR6 -#undef IFMASK6 + return (0); } /* * Free an nd6 llinfo entry. + * Since the function would cause significant changes in the kernel, DO NOT + * make it global, unless you have a strong reason for the change, and are sure + * that the change is safe. */ void nd6_free( @@ -1324,12 +1542,15 @@ nd6_free( * See a corresponding comment in nd6_na_input(). */ RT_UNLOCK(rt); + lck_mtx_unlock(nd6_mutex); rt6_flush(&in6, rt->rt_ifp); + lck_mtx_lock(nd6_mutex); } else { RT_UNLOCK(rt); } if (dr) { + NDDR_REMREF(dr); /* * Unreachablity of a router might affect the default * router selection and on-link detection of advertised @@ -1353,21 +1574,12 @@ nd6_free( * the check now. */ RT_UNLOCK(rt); - pfxlist_onlink_check(1); + pfxlist_onlink_check(); - if (dr == TAILQ_FIRST(&nd_defrouter)) { - /* - * It is used as the current default router, - * so we have to move it to the end of the - * list and choose a new one. - * XXX: it is not very efficient if this is - * the only router. - */ - TAILQ_REMOVE(&nd_defrouter, dr, dr_entry); - TAILQ_INSERT_TAIL(&nd_defrouter, dr, dr_entry); - - defrouter_select(); - } + /* + * refresh default router list + */ + defrouter_select(rt->rt_ifp); } RT_LOCK_ASSERT_NOTHELD(rt); } else { @@ -1390,7 +1602,7 @@ nd6_free( /* * Upper-layer reachability hint for Neighbor Unreachability Detection. * - * XXX cost-effective metods? + * XXX cost-effective methods? */ void nd6_nud_hint( @@ -1444,8 +1656,8 @@ nd6_nud_hint( ln->ln_state = ND6_LLINFO_REACHABLE; if (ln->ln_expire) { lck_rw_lock_shared(nd_if_rwlock); - ln->ln_expire = rt_expiry(rt, timenow.tv_sec, - nd_ifinfo[rt->rt_ifp->if_index].reachable); + ln->ln_expire = timenow.tv_sec + + nd_ifinfo[rt->rt_ifp->if_index].reachable; lck_rw_done(nd_if_rwlock); } done: @@ -1490,17 +1702,17 @@ nd6_rtrequest( if (!nd6_need_cache(ifp)) { /* stf case */ no_nd_cache = 1; } else { + struct sockaddr_in6 sin6; + + rtkey_to_sa6(rt, &sin6); /* * nd6_is_addr_neighbor() may call nd6_lookup(), * therefore we drop rt_lock to avoid deadlock - * during the lookup. Using rt_key(rt) is still - * safe because it won't change while rnh_lock - * is held. + * during the lookup. 
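+ * (nd6_lookup() may take rnh_lock, which by lock ordering is
+ * acquired before any rt_lock; holding rt_lock across the call
+ * would invert that order.)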
*/ RT_ADDREF_LOCKED(rt); RT_UNLOCK(rt); - no_nd_cache = !nd6_is_addr_neighbor( - (struct sockaddr_in6 *)rt_key(rt), ifp, 1); + no_nd_cache = !nd6_is_addr_neighbor(&sin6, ifp, 1); RT_LOCK(rt); RT_REMREF_LOCKED(rt); } @@ -1535,12 +1747,13 @@ nd6_rtrequest( * SIN(rt_mask(rt))->sin_addr.s_addr != 0xffffffff) * rt->rt_flags |= RTF_CLONING; */ - if (rt->rt_flags & (RTF_CLONING | RTF_LLINFO)) { + if ((rt->rt_flags & RTF_CLONING) || + ((rt->rt_flags & RTF_LLINFO) && ln == NULL)) { /* - * Case 1: This route should come from - * a route to interface. RTF_LLINFO flag is set - * for a host route whose destination should be - * treated as on-link. + * Case 1: This route should come from a route to + * interface (RTF_CLONING case) or the route should be + * treated as on-link but is currently not + * (RTF_LLINFO && ln == NULL case). */ if (rt_setgate(rt, rt_key(rt), (struct sockaddr *)&null_sdl) == 0) { @@ -1575,15 +1788,6 @@ nd6_rtrequest( * (or should we allow proxy ND configuration only for * routers? there's no mention about proxy ND from hosts) */ -#if 0 - /* XXX it does not work */ - if (rt->rt_flags & RTF_ANNOUNCE) - nd6_na_output(ifp, - &SIN6(rt_key(rt))->sin6_addr, - &SIN6(rt_key(rt))->sin6_addr, - ip6_forwarding ? ND_NA_FLAG_ROUTER : 0, - 1, NULL); -#endif /* FALLTHROUGH */ case RTM_RESOLVE: if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) == 0) { @@ -1593,9 +1797,12 @@ nd6_rtrequest( */ if (gate->sa_family != AF_LINK || gate->sa_len < sizeof(null_sdl)) { - log(LOG_DEBUG, - "nd6_rtrequest: bad gateway value: %s\n", - if_name(ifp)); + /* Don't complain in case of RTM_ADD */ + if (req == RTM_RESOLVE) { + log(LOG_DEBUG, + "nd6_rtrequest: bad gateway " + "value: %s\n", if_name(ifp)); + } break; } SDL(gate)->sdl_type = ifp->if_type; @@ -1612,6 +1819,8 @@ nd6_rtrequest( log(LOG_DEBUG, "nd6_rtrequest: malloc failed\n"); break; } + rt->rt_llinfo_get_ri = nd6_llinfo_get_ri; + rt->rt_llinfo_purge = nd6_llinfo_purge; rt->rt_llinfo_free = nd6_llinfo_free; nd6_inuse++; @@ -1688,14 +1897,22 @@ nd6_rtrequest( SDL(gate)->sdl_alen = ifp->if_addrlen; } if (nd6_useloopback) { -#if IFNET_ROUTE_REFCNT - /* Adjust route ref count for the interfaces */ - if (rt->rt_if_ref_fn != NULL && - rt->rt_ifp != lo_ifp) { - rt->rt_if_ref_fn(lo_ifp, 1); - rt->rt_if_ref_fn(rt->rt_ifp, -1); + if (rt->rt_ifp != lo_ifp) { + /* + * Purge any link-layer info caching. + */ + if (rt->rt_llinfo_purge != NULL) + rt->rt_llinfo_purge(rt); + + /* + * Adjust route ref count for the + * interfaces. 
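+ * (lo_ifp gains a reference and the original rt_ifp
+ * loses one, matching the rt_ifp = lo_ifp
+ * reassignment below.)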
+ */ + if (rt->rt_if_ref_fn != NULL) { + rt->rt_if_ref_fn(lo_ifp, 1); + rt->rt_if_ref_fn(rt->rt_ifp, -1); + } } -#endif /* IFNET_ROUTE_REFCNT */ rt->rt_ifp = lo_ifp; /* XXX */ /* * Make sure rt_ifa be equal to the ifaddr @@ -1709,7 +1926,7 @@ nd6_rtrequest( rtsetifa(rt, ifa); } } - ifafree(ifa); + IFA_REMREF(ifa); } else if (rt->rt_flags & RTF_ANNOUNCE) { ln->ln_expire = 0; ln->ln_state = ND6_LLINFO_REACHABLE; @@ -1718,26 +1935,30 @@ nd6_rtrequest( /* join solicited node multicast for proxy ND */ if (ifp->if_flags & IFF_MULTICAST) { struct in6_addr llsol; + struct in6_multi *in6m; int error; llsol = SIN6(rt_key(rt))->sin6_addr; - llsol.s6_addr16[0] = htons(0xff02); - llsol.s6_addr16[1] = htons(ifp->if_index); + llsol.s6_addr32[0] = IPV6_ADDR_INT32_MLL; llsol.s6_addr32[1] = 0; llsol.s6_addr32[2] = htonl(1); llsol.s6_addr8[12] = 0xff; - - if (!in6_addmulti(&llsol, ifp, &error, 0)) { + if (in6_setscope(&llsol, ifp, NULL)) + break; + error = in6_mc_join(ifp, &llsol, NULL, &in6m, 0); + if (error) { nd6log((LOG_ERR, "%s: failed to join " "%s (errno=%d)\n", if_name(ifp), ip6_sprintf(&llsol), error)); + } else { + IN6M_REMREF(in6m); } } } break; case RTM_DELETE: - if (!ln) + if (ln == NULL) break; /* leave from solicited node multicast for proxy ND */ if ((rt->rt_flags & RTF_ANNOUNCE) != 0 && @@ -1746,17 +1967,19 @@ nd6_rtrequest( struct in6_multi *in6m; llsol = SIN6(rt_key(rt))->sin6_addr; - llsol.s6_addr16[0] = htons(0xff02); - llsol.s6_addr16[1] = htons(ifp->if_index); + llsol.s6_addr32[0] = IPV6_ADDR_INT32_MLL; llsol.s6_addr32[1] = 0; llsol.s6_addr32[2] = htonl(1); llsol.s6_addr8[12] = 0xff; - - ifnet_lock_shared(ifp); - IN6_LOOKUP_MULTI(llsol, ifp, in6m); - ifnet_lock_done(ifp); - if (in6m) - in6_delmulti(in6m, 0); + if (in6_setscope(&llsol, ifp, NULL) == 0) { + in6_multihead_lock_shared(); + IN6_LOOKUP_MULTI(&llsol, ifp, in6m); + in6_multihead_lock_done(); + if (in6m != NULL) { + in6_mc_leave(in6m, NULL); + IN6M_REMREF(in6m); + } + } } nd6_inuse--; /* @@ -1767,10 +1990,18 @@ nd6_rtrequest( */ if (ln->ln_flags & ND6_LNF_IN_USE) LN_DEQUEUE(ln); + + /* + * Purge any link-layer info caching. 
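+ * For ND entries this calls nd6_llinfo_purge(), which
+ * releases the cached if_llreach record and clears
+ * ln_lastused.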
+ */ + if (rt->rt_llinfo_purge != NULL) + rt->rt_llinfo_purge(rt); + rt->rt_flags &= ~RTF_LLINFO; - if (ln->ln_hold != NULL) + if (ln->ln_hold != NULL) { m_freem(ln->ln_hold); - ln->ln_hold = NULL; + ln->ln_hold = NULL; + } } } @@ -1835,7 +2066,6 @@ nd6_siocgprlst(void *data, int data_is_64) struct in6_prlist_64 *prl_64 = (struct in6_prlist_64 *)data; struct in6_prlist_32 *prl_32 = (struct in6_prlist_32 *)data; struct nd_prefix *pr; - struct rr_prefix *rpp; int i = 0; lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); @@ -1852,8 +2082,9 @@ nd6_siocgprlst(void *data, int data_is_64) struct nd_pfxrouter *pfr; int j; + NDPR_LOCK(pr); (void) in6_embedscope(&prl_64->prefix[i].prefix, - &pr->ndpr_prefix, NULL, NULL); + &pr->ndpr_prefix, NULL, NULL, NULL); prl_64->prefix[i].raflags = pr->ndpr_raf; prl_64->prefix[i].prefixlen = pr->ndpr_plen; prl_64->prefix[i].vltime = pr->ndpr_vltime; @@ -1884,27 +2115,12 @@ nd6_siocgprlst(void *data, int data_is_64) } prl_64->prefix[i].advrtrs = j; prl_64->prefix[i].origin = PR_ORIG_RA; + NDPR_UNLOCK(pr); i++; pr = pr->ndpr_next; } - for (rpp = LIST_FIRST(&rr_prefix); rpp; - rpp = LIST_NEXT(rpp, rp_entry)) { - if (i >= PRLSTSIZ) - break; - (void) in6_embedscope(&prl_64->prefix[i].prefix, - &pr->ndpr_prefix, NULL, NULL); - prl_64->prefix[i].raflags = rpp->rp_raf; - prl_64->prefix[i].prefixlen = rpp->rp_plen; - prl_64->prefix[i].vltime = rpp->rp_vltime; - prl_64->prefix[i].pltime = rpp->rp_pltime; - prl_64->prefix[i].if_index = rpp->rp_ifp->if_index; - prl_64->prefix[i].expire = rpp->rp_expire; - prl_64->prefix[i].advrtrs = 0; - prl_64->prefix[i].origin = rpp->rp_origin; - i++; - } return; } /* For 32-bit process */ @@ -1912,8 +2128,9 @@ nd6_siocgprlst(void *data, int data_is_64) struct nd_pfxrouter *pfr; int j; + NDPR_LOCK(pr); (void) in6_embedscope(&prl_32->prefix[i].prefix, - &pr->ndpr_prefix, NULL, NULL); + &pr->ndpr_prefix, NULL, NULL, NULL); prl_32->prefix[i].raflags = pr->ndpr_raf; prl_32->prefix[i].prefixlen = pr->ndpr_plen; prl_32->prefix[i].vltime = pr->ndpr_vltime; @@ -1944,27 +2161,11 @@ nd6_siocgprlst(void *data, int data_is_64) } prl_32->prefix[i].advrtrs = j; prl_32->prefix[i].origin = PR_ORIG_RA; + NDPR_UNLOCK(pr); i++; pr = pr->ndpr_next; } - - for (rpp = LIST_FIRST(&rr_prefix); rpp; - rpp = LIST_NEXT(rpp, rp_entry)) { - if (i >= PRLSTSIZ) - break; - (void) in6_embedscope(&prl_32->prefix[i].prefix, - &pr->ndpr_prefix, NULL, NULL); - prl_32->prefix[i].raflags = rpp->rp_raf; - prl_32->prefix[i].prefixlen = rpp->rp_plen; - prl_32->prefix[i].vltime = rpp->rp_vltime; - prl_32->prefix[i].pltime = rpp->rp_pltime; - prl_32->prefix[i].if_index = rpp->rp_ifp->if_index; - prl_32->prefix[i].expire = rpp->rp_expire; - prl_32->prefix[i].advrtrs = 0; - prl_32->prefix[i].origin = rpp->rp_origin; - i++; - } } int @@ -1972,7 +2173,7 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) { struct in6_ndireq *ndi = (struct in6_ndireq *)data; struct in6_ondireq *ondi = (struct in6_ondireq *)data; - struct nd_defrouter *dr, any; + struct nd_defrouter *dr; struct nd_prefix *pr; struct rtentry *rt; int i = ifp->if_index, error = 0; @@ -2018,7 +2219,6 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) ondi->ndi.flags = nd_ifinfo[i].flags; ondi->ndi.recalctm = nd_ifinfo[i].recalctm; ondi->ndi.chlim = nd_ifinfo[i].chlim; - ondi->ndi.receivedra = nd_ifinfo[i].receivedra; lck_rw_done(nd_if_rwlock); break; @@ -2040,10 +2240,9 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) * xxx sumikawa: should not delete route if default * route equals to the top of default 
router list */ - bzero(&any, sizeof(any)); lck_mtx_lock(nd6_mutex); - defrouter_delreq(&any, 1); - defrouter_select(); + defrouter_reset(); + defrouter_select(ifp); lck_mtx_unlock(nd6_mutex); /* xxx sumikawa: flush prefix list */ break; @@ -2051,28 +2250,75 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) case SIOCSPFXFLUSH_IN6: { /* flush all the prefix advertised by routers */ struct nd_prefix *next; - lck_mtx_lock(nd6_mutex); + lck_mtx_lock(nd6_mutex); for (pr = nd_prefix.lh_first; pr; pr = next) { - struct in6_ifaddr *ia, *ia_next; + struct in6_ifaddr *ia; next = pr->ndpr_next; - if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) + NDPR_LOCK(pr); + if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) { + NDPR_UNLOCK(pr); continue; /* XXX */ - + } + if (ifp != lo_ifp && pr->ndpr_ifp != ifp) { + NDPR_UNLOCK(pr); + continue; + } /* do we really have to remove addresses as well? */ - for (ia = in6_ifaddrs; ia; ia = ia_next) { - /* ia might be removed. keep the next ptr. */ - ia_next = ia->ia_next; + NDPR_ADDREF_LOCKED(pr); + NDPR_UNLOCK(pr); + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + ia = in6_ifaddrs; + while (ia != NULL) { + IFA_LOCK(&ia->ia_ifa); + if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0) { + IFA_UNLOCK(&ia->ia_ifa); + ia = ia->ia_next; + continue; + } - if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0) + if (ia->ia6_ndpr == pr) { + IFA_ADDREF_LOCKED(&ia->ia_ifa); + IFA_UNLOCK(&ia->ia_ifa); + lck_rw_done(&in6_ifaddr_rwlock); + lck_mtx_unlock(nd6_mutex); + in6_purgeaddr(&ia->ia_ifa); + lck_mtx_lock(nd6_mutex); + lck_rw_lock_exclusive(&in6_ifaddr_rwlock); + IFA_REMREF(&ia->ia_ifa); + /* + * Purging the address caused + * in6_ifaddr_rwlock to be + * dropped and + * reacquired; therefore search again + * from the beginning of in6_ifaddrs. + * The same applies for the prefix list. + */ + ia = in6_ifaddrs; + next = nd_prefix.lh_first; continue; - if (ia->ia6_ndpr == pr) - in6_purgeaddr(&ia->ia_ifa, 1); + } + IFA_UNLOCK(&ia->ia_ifa); + ia = ia->ia_next; } - prelist_remove(pr, 1); + lck_rw_done(&in6_ifaddr_rwlock); + NDPR_LOCK(pr); + prelist_remove(pr); + NDPR_UNLOCK(pr); + /* + * If we were trying to restart this loop + * above by changing the value of 'next', we might + * end up freeing the only element on the list + * when we call NDPR_REMREF(). + * When this happens, we also have get out of this + * loop because we have nothing else to do. + */ + if (pr == next) + next = NULL; + NDPR_REMREF(pr); } lck_mtx_unlock(nd6_mutex); break; @@ -2090,9 +2336,12 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) */ for (dr = TAILQ_NEXT(dr, dr_entry); dr; dr = next) { next = TAILQ_NEXT(dr, dr_entry); - defrtrlist_del(dr, 1); + if (ifp == lo_ifp || dr->ifp == ifp) + defrtrlist_del(dr); } - defrtrlist_del(TAILQ_FIRST(&nd_defrouter), 1); + if (ifp == lo_ifp || + TAILQ_FIRST(&nd_defrouter)->ifp == ifp) + defrtrlist_del(TAILQ_FIRST(&nd_defrouter)); } lck_mtx_unlock(nd6_mutex); break; @@ -2183,8 +2432,9 @@ nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp) struct in6_ndifreq_64 *ndif_64 = (struct in6_ndifreq_64 *)data; struct in6_ndifreq_32 *ndif_32 = (struct in6_ndifreq_32 *)data; - return (nd6_setdefaultiface(cmd == SIOCSDEFIFACE_IN6_64 ? - ndif_64->ifindex : ndif_32->ifindex)); + error = nd6_setdefaultiface(cmd == SIOCSDEFIFACE_IN6_64 ? 
+ ndif_64->ifindex : ndif_32->ifindex); + return (error); /* NOTREACHED */ } } @@ -2214,9 +2464,9 @@ nd6_cache_lladdr( int newstate = 0; struct timeval timenow; - if (!ifp) + if (ifp == NULL) panic("ifp == NULL in nd6_cache_lladdr"); - if (!from) + if (from == NULL) panic("from == NULL in nd6_cache_lladdr"); /* nothing must be updated for unspecified address */ @@ -2236,12 +2486,6 @@ nd6_cache_lladdr( rt = nd6_lookup(from, 0, ifp, 0); if (rt == NULL) { -#if 0 - /* nothing must be done if there's no lladdr */ - if (!lladdr || !lladdrlen) - return; -#endif - if ((rt = nd6_lookup(from, 1, ifp, 0)) == NULL) return; RT_LOCK_ASSERT_HELD(rt); @@ -2257,6 +2501,8 @@ nd6_cache_lladdr( is_newentry = 0; } + if (rt == NULL) + return; if ((rt->rt_flags & (RTF_GATEWAY | RTF_LLINFO)) != RTF_LLINFO) { fail: RT_UNLOCK(rt); @@ -2264,10 +2510,10 @@ fail: rtfree(rt); return; } - ln = rt->rt_llinfo; - if (!ln) + ln = (struct llinfo_nd6 *)rt->rt_llinfo; + if (ln == NULL) goto fail; - if (!rt->rt_gateway) + if (rt->rt_gateway == NULL) goto fail; if (rt->rt_gateway->sa_family != AF_LINK) goto fail; @@ -2300,18 +2546,21 @@ fail: */ sdl->sdl_alen = ifp->if_addrlen; bcopy(lladdr, LLADDR(sdl), ifp->if_addrlen); + + /* cache the gateway (sender HW) address */ + nd6_llreach_alloc(rt, ifp, LLADDR(sdl), sdl->sdl_alen, FALSE); } if (!is_newentry) { - if ((!olladdr && lladdr) /* (3) */ - || (olladdr && lladdr && llchange)) { /* (5) */ + if ((!olladdr && lladdr != NULL) || /* (3) */ + (olladdr && lladdr != NULL && llchange)) { /* (5) */ do_update = 1; newstate = ND6_LLINFO_STALE; } else /* (1-2,4) */ do_update = 0; } else { do_update = 1; - if (!lladdr) /* (6) */ + if (lladdr == NULL) /* (6) */ newstate = ND6_LLINFO_NOSTATE; else /* (7) */ newstate = ND6_LLINFO_STALE; @@ -2331,18 +2580,19 @@ fail: * we must set the timer now, although it is actually * meaningless. */ - ln->ln_expire = rt_expiry(rt, timenow.tv_sec, - nd6_gctimer); + ln->ln_expire = timenow.tv_sec + nd6_gctimer; ln->ln_hold = NULL; if (m != NULL) { + struct sockaddr_in6 sin6; + + rtkey_to_sa6(rt, &sin6); /* * we assume ifp is not a p2p here, so just * set the 2nd argument as the 1st one. */ RT_UNLOCK(rt); - nd6_output(ifp, ifp, m, - (struct sockaddr_in6 *)rt_key(rt), rt, 0); + nd6_output(ifp, ifp, m, &sin6, rt); RT_LOCK(rt); } } else if (ln->ln_state == ND6_LLINFO_INCOMPLETE) { @@ -2375,7 +2625,7 @@ fail: * 0 n y -- (3) c s s * 0 y y n (4) c s s * 0 y y y (5) c s s - * 1 -- n -- (6) c c c s + * 1 -- n -- (6) c c c s * 1 -- y -- (7) c c s c s * * (c=clear s=set) @@ -2391,8 +2641,8 @@ fail: case ND_REDIRECT: /* * If the icmp is a redirect to a better router, always set the - * is_router flag. Otherwise, if the entry is newly created, - * clear the flag. [RFC 2461, sec 8.3] + * is_router flag. Otherwise, if the entry is newly created, + * clear the flag. [RFC 2461, sec 8.3] */ if (code == ND_REDIRECT_ROUTER) ln->ln_router = 1; @@ -2409,8 +2659,8 @@ fail: /* * Mark an entry with lladdr as a router. 
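+ * Cases (2-5) in the table above are updates to an existing entry
+ * that carried or now carries a link-layer address; case (7) is a
+ * newly created entry learned together with an address.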
*/ - if ((!is_newentry && (olladdr || lladdr)) /* (2-5) */ - || (is_newentry && lladdr)) { /* (7) */ + if ((!is_newentry && (olladdr || lladdr)) || /* (2-5) */ + (is_newentry && lladdr)) { /* (7) */ ln->ln_router = 1; } break; @@ -2436,7 +2686,7 @@ fail: RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); lck_mtx_lock(nd6_mutex); - defrouter_select(); + defrouter_select(ifp); lck_mtx_unlock(nd6_mutex); } else { RT_REMREF_LOCKED(rt); @@ -2475,7 +2725,7 @@ nd6_slowtimo( #define senderr(e) { error = (e); goto bad;} int nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, - struct sockaddr_in6 *dst, struct rtentry *hint0, int locked) + struct sockaddr_in6 *dst, struct rtentry *hint0) { struct mbuf *m = m0; struct rtentry *rt = hint0, *hint = hint0; @@ -2520,14 +2770,14 @@ nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, if (!(rt->rt_flags & RTF_UP)) { RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); - if ((hint = rt = rtalloc1((struct sockaddr *)dst, - 1, 0)) != NULL) { + if ((hint = rt = rtalloc1_scoped((struct sockaddr *)dst, + 1, 0, ifp->if_index)) != NULL) { RT_LOCK_SPIN(rt); if (rt->rt_ifp != ifp) { /* XXX: loop care? */ RT_UNLOCK(rt); error = nd6_output(ifp, origifp, m0, - dst, rt, locked); + dst, rt); rtfree(rt); return (error); } @@ -2541,7 +2791,7 @@ nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, struct in6_ifaddr *ia6 = NULL; struct sockaddr_in6 gw6; - gw6 = *((struct sockaddr_in6 *)rt->rt_gateway); + rtgw_to_sa6(rt, &gw6); /* * Must drop rt_lock since nd6_is_addr_neighbor() * calls nd6_lookup() and acquires rnh_lock. @@ -2564,7 +2814,7 @@ nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, * XXX: we may need a more generic rule here. */ if (ia6 != NULL) - ifafree(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); if ((ifp->if_flags & IFF_POINTOPOINT) == 0) senderr(EHOSTUNREACH); goto sendpkt; @@ -2601,7 +2851,8 @@ nd6_output(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m0, RT_UNLOCK(rt); rtfree(gwrt); lookup: - gwrt = rtalloc1((struct sockaddr *)&gw6, 1, 0); + gwrt = rtalloc1_scoped((struct sockaddr *)&gw6, + 1, 0, ifp->if_index); RT_LOCK(rt); /* @@ -2680,6 +2931,12 @@ lookup: if (rt && (rt->rt_flags & RTF_LLINFO) != 0) { ln = rt->rt_llinfo; } else { + struct sockaddr_in6 sin6; + /* + * Clear out Scope ID field in case it is set. + */ + sin6 = *dst; + sin6.sin6_scope_id = 0; /* * Since nd6_is_addr_neighbor() internally calls nd6_lookup(), * the condition below is not very efficient. But we believe @@ -2689,7 +2946,7 @@ lookup: */ if (rt != NULL) RT_UNLOCK(rt); - if (nd6_is_addr_neighbor(dst, ifp, 0)) { + if (nd6_is_addr_neighbor(&sin6, ifp, 0)) { /* "rtrele" may have been used, so clean up "rt" now */ if (rt != NULL) { /* Don't free "hint0" */ @@ -2795,7 +3052,7 @@ lookup: lck_rw_done(nd_if_rwlock); RT_UNLOCK(rt); /* We still have a reference on rt (for ln) */ - nd6_ns_output(ifp, NULL, &dst->sin6_addr, ln, 0, locked); + nd6_ns_output(ifp, NULL, &dst->sin6_addr, ln, 0); } else { RT_UNLOCK(rt); } @@ -2828,19 +3085,20 @@ sendpkt: if (rt != NULL) RT_LOCK_ASSERT_NOTHELD(rt); - /* Clean up HW checksum flags before sending the packet */ - m->m_pkthdr.csum_data = 0; - m->m_pkthdr.csum_flags = 0; + /* discard the packet if IPv6 operation is disabled on the interface */ + lck_rw_lock_shared(nd_if_rwlock); + if ((nd_ifinfo[ifp->if_index].flags & ND6_IFF_IFDISABLED)) { + lck_rw_done(nd_if_rwlock); + error = ENETDOWN; /* better error? 
*/ + goto bad; + } + lck_rw_done(nd_if_rwlock); if ((ifp->if_flags & IFF_LOOPBACK) != 0) { /* forwarding rules require the original scope_id */ m->m_pkthdr.rcvif = origifp; - if (locked) - lck_mtx_unlock(ip6_mutex); error = dlil_output(origifp, PF_INET6, m, (caddr_t)rt, (struct sockaddr *)dst, 0); - if (locked) - lck_mtx_lock(ip6_mutex); goto release; } else { /* Do not allow loopback address to wind up on a wire */ @@ -2862,13 +3120,20 @@ sendpkt: } } + if (rt != NULL) { + RT_LOCK_SPIN(rt); + /* Mark use timestamp */ + if (rt->rt_llinfo != NULL) + nd6_llreach_use(rt->rt_llinfo); + RT_UNLOCK(rt); + } + + if (hint && nstat_collect) + nstat_route_tx(hint, 1, m->m_pkthdr.len, 0); + m->m_pkthdr.rcvif = NULL; - if (locked) - lck_mtx_unlock(ip6_mutex); error = dlil_output(ifp, PF_INET6, m, (caddr_t)rt, (struct sockaddr *)dst, 0); - if (locked) - lck_mtx_lock(ip6_mutex); goto release; bad: @@ -2923,8 +3188,13 @@ nd6_need_cache( #if IFT_IEEE80211 case IFT_IEEE80211: #endif - case IFT_BRIDGE: case IFT_GIF: /* XXX need more cases? */ + case IFT_PPP: +#if IFT_TUNNEL + case IFT_TUNNEL: +#endif + case IFT_BRIDGE: + case IFT_CELLULAR: return(1); default: return(0); @@ -3110,6 +3380,8 @@ nd6_sysctl_drlist SYSCTL_HANDLER_ARGS "default router list (%s)\n", ip6_sprintf(&dr->rtaddr)); d->flags = dr->flags; + d->stateflags = dr->stateflags; + d->stateflags &= ~NDDRF_PROCESSED; d->rtlifetime = dr->rtlifetime; d->expire = dr->expire; d->if_index = dr->ifp->if_index; @@ -3140,6 +3412,8 @@ nd6_sysctl_drlist SYSCTL_HANDLER_ARGS "default router list (%s)\n", ip6_sprintf(&dr->rtaddr)); d_32->flags = dr->flags; + d_32->stateflags = dr->stateflags; + d_32->stateflags &= ~NDDRF_PROCESSED; d_32->rtlifetime = dr->rtlifetime; d_32->expire = dr->expire; d_32->if_index = dr->ifp->if_index; @@ -3184,6 +3458,7 @@ nd6_sysctl_prlist SYSCTL_HANDLER_ARGS bzero(p, sizeof (*p)); sin6 = (struct sockaddr_in6 *)(p + 1); + NDPR_LOCK(pr); p->prefix = pr->ndpr_prefix; if (in6_recoverscope(&p->prefix, &p->prefix.sin6_addr, pr->ndpr_ifp) != 0) @@ -3196,7 +3471,7 @@ nd6_sysctl_prlist SYSCTL_HANDLER_ARGS p->pltime = pr->ndpr_pltime; p->if_index = pr->ndpr_ifp->if_index; p->expire = pr->ndpr_expire; - p->refcnt = pr->ndpr_refcnt; + p->refcnt = pr->ndpr_addrcnt; p->flags = pr->ndpr_stateflags; p->origin = PR_ORIG_RA; advrtrs = 0; @@ -3222,6 +3497,7 @@ nd6_sysctl_prlist SYSCTL_HANDLER_ARGS advrtrs++; } p->advrtrs = advrtrs; + NDPR_UNLOCK(pr); } else { panic("buffer too short"); } @@ -3246,6 +3522,7 @@ nd6_sysctl_prlist SYSCTL_HANDLER_ARGS bzero(p_32, sizeof (*p_32)); sin6 = (struct sockaddr_in6 *)(p_32 + 1); + NDPR_LOCK(pr); p_32->prefix = pr->ndpr_prefix; if (in6_recoverscope(&p_32->prefix, &p_32->prefix.sin6_addr, pr->ndpr_ifp) != 0) @@ -3258,7 +3535,7 @@ nd6_sysctl_prlist SYSCTL_HANDLER_ARGS p_32->pltime = pr->ndpr_pltime; p_32->if_index = pr->ndpr_ifp->if_index; p_32->expire = pr->ndpr_expire; - p_32->refcnt = pr->ndpr_refcnt; + p_32->refcnt = pr->ndpr_addrcnt; p_32->flags = pr->ndpr_stateflags; p_32->origin = PR_ORIG_RA; advrtrs = 0; @@ -3284,6 +3561,7 @@ nd6_sysctl_prlist SYSCTL_HANDLER_ARGS advrtrs++; } p_32->advrtrs = advrtrs; + NDPR_UNLOCK(pr); } else { panic("buffer too short"); } @@ -3297,7 +3575,7 @@ nd6_sysctl_prlist SYSCTL_HANDLER_ARGS return (error); } SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_DRLIST, nd6_drlist, - CTLFLAG_RD, 0, 0, nd6_sysctl_drlist, "S,in6_defrouter",""); + CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, nd6_sysctl_drlist, "S,in6_defrouter",""); SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_PRLIST, nd6_prlist, - CTLFLAG_RD, 
0, 0, nd6_sysctl_prlist, "S,in6_defrouter",""); + CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, nd6_sysctl_prlist, "S,in6_defrouter",""); diff --git a/bsd/netinet6/nd6.h b/bsd/netinet6/nd6.h index 26fca3c11..601e075aa 100644 --- a/bsd/netinet6/nd6.h +++ b/bsd/netinet6/nd6.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * * @@ -69,7 +69,7 @@ #include <sys/queue.h> -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #include <kern/locks.h> struct llinfo_nd6 { @@ -89,12 +89,14 @@ struct llinfo_nd6 { short ln_router; /* 2^0: ND6 router bit */ int ln_byhint; /* # of times we made it reachable by UL hint */ u_int32_t ln_flags; /* flags; see below */ + struct if_llreach *ln_llreach; /* link-layer reachability record */ + u_int64_t ln_lastused; /* last used timestamp */ }; /* Values for ln_flags */ #define ND6_LNF_TIMER_SKIP 0x1 /* modified by nd6_timer() */ #define ND6_LNF_IN_USE 0x2 /* currently in llinfo_nd6 list */ -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #define ND6_LLINFO_PURGE -3 #define ND6_LLINFO_NOSTATE -2 @@ -112,16 +114,27 @@ struct llinfo_nd6 { #define ND6_LLINFO_DELAY 3 #define ND6_LLINFO_PROBE 4 -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #define ND6_IS_LLINFO_PROBREACH(n) ((n)->ln_state > ND6_LLINFO_INCOMPLETE) -#endif /* KERNEL_PRIVATE */ - -#if !defined(KERNEL_PRIVATE) +#define ND6_LLINFO_PERMANENT(n) (((n)->ln_expire == 0) && ((n)->ln_state > ND6_LLINFO_INCOMPLETE)) +#define ND6_IFF_PERFORMNUD 0x1 +#define ND6_IFF_ACCEPT_RTADV 0x2 /* APPLE: not used. Interface-specific router advertisements are + * handled with a specific ifnet flag: IFEF_ACCEPT_RTADVD + */ +#define ND6_IFF_PREFER_SOURCE 0x4 /* APPLE: NOT USED; not related to ND. */ +#define ND6_IFF_IFDISABLED 0x8 /* IPv6 operation is disabled due to + * DAD failure.
(XXX: not ND-specific) + */ +#define ND6_IFF_DONT_SET_IFROUTE 0x10 /* NOT USED */ + +#endif /* XNU_KERNEL_PRIVATE */ + +#if !defined(XNU_KERNEL_PRIVATE) struct nd_ifinfo { #else /* For binary compatibility, this structure must not change */ struct nd_ifinfo_compat { -#endif /* !KERNEL_PRIVATE */ +#endif /* !XNU_KERNEL_PRIVATE */ u_int32_t linkmtu; /* LinkMTU */ u_int32_t maxmtu; /* Upper bound of LinkMTU */ u_int32_t basereachable; /* BaseReachableTime */ @@ -137,7 +150,7 @@ struct nd_ifinfo_compat { u_int8_t randomid[8]; /* current random ID */ }; -#if defined(KERNEL_PRIVATE) +#if defined(XNU_KERNEL_PRIVATE) struct nd_ifinfo { u_int32_t linkmtu; /* LinkMTU */ u_int32_t maxmtu; /* Upper bound of LinkMTU */ @@ -147,7 +160,7 @@ struct nd_ifinfo { u_int32_t flags; /* Flags */ int recalctm; /* BaseReacable re-calculation timer */ u_int8_t chlim; /* CurHopLimit */ - u_int8_t receivedra; + u_int8_t initialized; /* Flag to see the entry is initialized */ /* the following 3 members are for privacy extension for addrconf */ u_int8_t randomseed0[8]; /* upper 64 bits of MD5 digest */ u_int8_t randomseed1[8]; /* lower 64 bits (usually the EUI64 IFID) */ @@ -156,7 +169,7 @@ struct nd_ifinfo { int32_t nprefixes; int32_t ndefrouters; }; -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #define ND6_IFF_PERFORMNUD 0x1 @@ -169,7 +182,7 @@ struct in6_nbrinfo { int expire; /* lifetime for NDP state transition */ }; -#if defined(KERNEL_PRIVATE) +#if defined(XNU_KERNEL_PRIVATE) struct in6_nbrinfo_32 { char ifname[IFNAMSIZ]; struct in6_addr addr; @@ -187,7 +200,7 @@ struct in6_nbrinfo_64 { int state; int expire; } __attribute__((aligned(8))); -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #define DRLSTSIZ 10 #define PRLSTSIZ 10 @@ -203,7 +216,7 @@ struct in6_drlist { } defrouter[DRLSTSIZ]; }; -#if defined(KERNEL_PRIVATE) +#if defined(XNU_KERNEL_PRIVATE) struct in6_drlist_32 { char ifname[IFNAMSIZ]; struct { @@ -225,20 +238,30 @@ struct in6_drlist_64 { u_short if_index __attribute__((aligned(8))); } defrouter[DRLSTSIZ] __attribute__((aligned(8))); }; -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ + +/* valid values for stateflags */ +#define NDDRF_INSTALLED 0x1 /* installed in the routing table */ +#define NDDRF_IFSCOPE 0x2 /* installed as a scoped route */ +#define NDDRF_STATIC 0x4 /* for internal use only */ +#ifdef XNU_KERNEL_PRIVATE +#define NDDRF_PROCESSED 0x10 +#endif struct in6_defrouter { struct sockaddr_in6 rtaddr; u_char flags; + u_char stateflags; u_short rtlifetime; u_long expire; u_short if_index; }; -#if defined(KERNEL_PRIVATE) +#if defined(XNU_KERNEL_PRIVATE) struct in6_defrouter_32 { struct sockaddr_in6 rtaddr; u_char flags; + u_char stateflags; u_short rtlifetime; u_int32_t expire; u_short if_index; @@ -247,11 +270,12 @@ struct in6_defrouter_32 { struct in6_defrouter_64 { struct sockaddr_in6 rtaddr; u_char flags; + u_char stateflags; u_short rtlifetime; u_long expire __attribute__((aligned(8))); u_short if_index __attribute__((aligned(8))); } __attribute__((aligned(8))); -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ struct in6_prlist { char ifname[IFNAMSIZ]; @@ -269,7 +293,7 @@ struct in6_prlist { } prefix[PRLSTSIZ]; }; -#if defined(KERNEL_PRIVATE) +#if defined(XNU_KERNEL_PRIVATE) struct in6_prlist_32 { char ifname[IFNAMSIZ]; struct { @@ -302,7 +326,7 @@ struct in6_prlist_64 { struct in6_addr advrtr[DRLSTSIZ]; } prefix[PRLSTSIZ]; }; -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ struct in6_prefix { struct sockaddr_in6 
prefix; @@ -319,7 +343,7 @@ struct in6_prefix { /* struct sockaddr_in6 advrtr[] */ }; -#if defined(KERNEL_PRIVATE) +#if defined(XNU_KERNEL_PRIVATE) struct in6_prefix_32 { struct sockaddr_in6 prefix; struct prf_ra raflags; @@ -331,7 +355,7 @@ struct in6_prefix_32 { u_int32_t flags; int refcnt; u_short if_index; - u_short advrtrs; + u_short advrtrs; /* number of advertisement routers */ /* struct sockaddr_in6 advrtr[] */ }; @@ -349,7 +373,7 @@ struct in6_prefix_64 { u_short advrtrs; /* struct sockaddr_in6 advrtr[] */ }; -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ struct in6_ondireq { char ifname[IFNAMSIZ]; @@ -366,7 +390,7 @@ struct in6_ondireq { } ndi; }; -#if !defined(KERNEL_PRIVATE) +#if !defined(XNU_KERNEL_PRIVATE) struct in6_ndireq { char ifname[IFNAMSIZ]; struct nd_ifinfo ndi; @@ -376,14 +400,17 @@ struct in6_ndireq { char ifname[IFNAMSIZ]; struct nd_ifinfo_compat ndi; }; -#endif /* !KERNEL_PRIVATE */ +#endif /* !XNU_KERNEL_PRIVATE */ struct in6_ndifreq { char ifname[IFNAMSIZ]; u_long ifindex; }; -#if defined(KERNEL_PRIVATE) +#define MAX_RTR_SOLICITATION_DELAY 1 /* 1sec */ +#define RTR_SOLICITATION_INTERVAL 4 /* 4sec */ + +#if defined(XNU_KERNEL_PRIVATE) struct in6_ndifreq_32 { char ifname[IFNAMSIZ]; u_int32_t ifindex; @@ -393,11 +420,16 @@ struct in6_ndifreq_64 { char ifname[IFNAMSIZ]; u_long ifindex __attribute__((aligned(8))); }; -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ /* Prefix status */ #define NDPRF_ONLINK 0x1 #define NDPRF_DETACHED 0x2 +#define NDPRF_STATIC 0x100 +#define NDPRF_IFSCOPE 0x1000 +#ifdef XNU_KERNEL_PRIVATE +#define NDPRF_PROCESSED 0x08000 +#endif /* protocol constants */ #define MAX_RTR_SOLICITATION_DELAY 1 /*1sec*/ @@ -405,8 +437,9 @@ struct in6_ndifreq_64 { #define MAX_RTR_SOLICITATIONS 3 #define ND6_INFINITE_LIFETIME 0xffffffff +#define ND6_MAX_LIFETIME 0x7fffffff -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE /* * Protects nd_ifinfo[] */ @@ -448,18 +481,59 @@ __private_extern__ lck_rw_t *nd_if_rwlock; TAILQ_HEAD(nd_drhead, nd_defrouter); struct nd_defrouter { + decl_lck_mtx_data(, nddr_lock); + uint32_t nddr_refcount; + uint32_t nddr_debug; TAILQ_ENTRY(nd_defrouter) dr_entry; - struct in6_addr rtaddr; - u_char flags; /* flags on RA message */ - u_short rtlifetime; + struct in6_addr rtaddr; + u_char flags; /* flags on RA message */ + u_char stateflags; + u_short rtlifetime; u_int32_t expire; - u_int32_t advint; /* Mobile IPv6 addition (milliseconds) */ - u_int32_t advint_expire; /* Mobile IPv6 addition */ - int advints_lost; /* Mobile IPv6 addition */ - struct ifnet *ifp; + struct ifnet *ifp; + unsigned int genid; + int err; + void (*nddr_trace) /* callback fn for tracing refs */ + (struct nd_defrouter *, int); }; +#define NDDR_LOCK_ASSERT_HELD(_nddr) \ + lck_mtx_assert(&(_nddr)->nddr_lock, LCK_MTX_ASSERT_OWNED) + +#define NDDR_LOCK_ASSERT_NOTHELD(_nddr) \ + lck_mtx_assert(&(_nddr)->nddr_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define NDDR_LOCK(_nddr) \ + lck_mtx_lock(&(_nddr)->nddr_lock) + +#define NDDR_LOCK_SPIN(_nddr) \ + lck_mtx_lock_spin(&(_nddr)->nddr_lock) + +#define NDDR_CONVERT_LOCK(_nddr) do { \ + NDDR_LOCK_ASSERT_HELD(_nddr); \ + lck_mtx_convert_spin(&(_nddr)->nddr_lock); \ +} while (0) + +#define NDDR_UNLOCK(_nddr) \ + lck_mtx_unlock(&(_nddr)->nddr_lock) + +#define NDDR_ADDREF(_nddr) \ + nddr_addref(_nddr, 0) + +#define NDDR_ADDREF_LOCKED(_nddr) \ + nddr_addref(_nddr, 1) + +#define NDDR_REMREF(_nddr) do { \ + (void) nddr_remref(_nddr, 0); \ +} while (0) + +#define NDDR_REMREF_LOCKED(_nddr) \ + nddr_remref(_nddr, 1)
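The NDDR_* macros above (and the NDPR_* family defined below for struct nd_prefix) wrap a per-object mutex and reference count: ADDREF/REMREF pairs keep a default-router or prefix entry alive while some caller still uses it, and the final REMREF is what frees the object. What follows is a minimal userland sketch of that idiom, not kernel code: pthread_mutex_t stands in for lck_mtx, and the refobj names are purely illustrative.

#include <assert.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct refobj {
	pthread_mutex_t lock;		/* plays the role of nddr_lock */
	uint32_t refcnt;		/* plays the role of nddr_refcount */
};

static struct refobj *
refobj_alloc(void)
{
	struct refobj *o = calloc(1, sizeof (*o));

	assert(o != NULL);
	pthread_mutex_init(&o->lock, NULL);
	o->refcnt = 1;			/* caller's initial reference */
	return (o);
}

static void
refobj_addref(struct refobj *o)		/* cf. NDDR_ADDREF */
{
	pthread_mutex_lock(&o->lock);
	if (++o->refcnt == 0)		/* wraparound is a fatal bug */
		abort();
	pthread_mutex_unlock(&o->lock);
}

static void
refobj_remref(struct refobj *o)		/* cf. NDDR_REMREF */
{
	uint32_t left;

	pthread_mutex_lock(&o->lock);
	if (o->refcnt == 0)		/* over-release is a fatal bug */
		abort();
	left = --o->refcnt;
	pthread_mutex_unlock(&o->lock);
	if (left == 0) {		/* last reference frees the object */
		pthread_mutex_destroy(&o->lock);
		free(o);
	}
}

int
main(void)
{
	struct refobj *o = refobj_alloc();

	refobj_addref(o);	/* e.g. a global list now also holds a ref */
	refobj_remref(o);	/* the list drops its reference */
	refobj_remref(o);	/* caller drops the last one; object freed */
	printf("refcount idiom exercised\n");
	return (0);
}

The kernel's _LOCKED macro variants serve callers that already hold the object's mutex; this sketch always takes the lock itself.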
+ struct nd_prefix { + decl_lck_mtx_data(, ndpr_lock); + u_int32_t ndpr_refcount; /* reference count */ + u_int32_t ndpr_debug; /* see ifa_debug flags */ struct ifnet *ndpr_ifp; LIST_ENTRY(nd_prefix) ndpr_entry; struct sockaddr_in6 ndpr_prefix; /* prefix */ @@ -467,15 +541,17 @@ struct nd_prefix { struct in6_addr ndpr_addr; /* address that is derived from the prefix */ u_int32_t ndpr_vltime; /* advertised valid lifetime */ u_int32_t ndpr_pltime; /* advertised preferred lifetime */ - time_t ndpr_expire; /* expiration time of the prefix */ time_t ndpr_preferred; /* preferred time of the prefix */ + time_t ndpr_expire; /* expiration time of the prefix */ + time_t ndpr_lastupdate; /* reception time of last advertisement */ struct prf_ra ndpr_flags; u_int32_t ndpr_stateflags; /* actual state flags */ /* list of routers that advertise the prefix: */ LIST_HEAD(pr_rtrhead, nd_pfxrouter) ndpr_advrtrs; u_char ndpr_plen; - int ndpr_refcnt; /* reference counter from addresses */ - int ndpr_usecnt; /* actual use count; prevents free */ + int ndpr_addrcnt; /* reference counter from addresses */ + void (*ndpr_trace) /* callback fn for tracing refs */ + (struct nd_prefix *, int); }; #define ndpr_next ndpr_entry.le_next @@ -483,13 +559,46 @@ struct nd_prefix { #define ndpr_raf ndpr_flags #define ndpr_raf_onlink ndpr_flags.onlink #define ndpr_raf_auto ndpr_flags.autonomous - +#define ndpr_raf_router ndpr_flags.router /* * We keep expired prefix for certain amount of time, for validation purposes. * 1800s = MaxRtrAdvInterval */ #define NDPR_KEEP_EXPIRED (1800 * 2) +#define NDPR_LOCK_ASSERT_HELD(_ndpr) \ + lck_mtx_assert(&(_ndpr)->ndpr_lock, LCK_MTX_ASSERT_OWNED) + +#define NDPR_LOCK_ASSERT_NOTHELD(_ndpr) \ + lck_mtx_assert(&(_ndpr)->ndpr_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define NDPR_LOCK(_ndpr) \ + lck_mtx_lock(&(_ndpr)->ndpr_lock) + +#define NDPR_LOCK_SPIN(_ndpr) \ + lck_mtx_lock_spin(&(_ndpr)->ndpr_lock) + +#define NDPR_CONVERT_LOCK(_ndpr) do { \ + NDPR_LOCK_ASSERT_HELD(_ndpr); \ + lck_mtx_convert_spin(&(_ndpr)->ndpr_lock); \ +} while (0) + +#define NDPR_UNLOCK(_ndpr) \ + lck_mtx_unlock(&(_ndpr)->ndpr_lock) + +#define NDPR_ADDREF(_ndpr) \ + ndpr_addref(_ndpr, 0) + +#define NDPR_ADDREF_LOCKED(_ndpr) \ + ndpr_addref(_ndpr, 1) + +#define NDPR_REMREF(_ndpr) do { \ + (void) ndpr_remref(_ndpr, 0); \ +} while (0) + +#define NDPR_REMREF_LOCKED(_ndpr) \ + ndpr_remref(_ndpr, 1) + /* * Message format for use in obtaining information about prefixes * from inet6 sysctl function @@ -533,6 +642,7 @@ extern int nd6_delay; extern int nd6_umaxtries; extern int nd6_mmaxtries; extern int nd6_useloopback; +extern int nd6_accept_6to4; extern int nd6_maxnudhint; extern int nd6_gctimer; extern struct llinfo_nd6 llinfo_nd6; @@ -541,20 +651,21 @@ extern struct nd_drhead nd_defrouter; extern struct nd_prhead nd_prefix; extern int nd6_debug; extern size_t nd_ifinfo_indexlim; +extern int nd6_onlink_ns_rfc4861; -#define nd6log(x) do { if (nd6_debug) log x; } while (0) - -extern struct callout nd6_timer_ch; +#define nd6log(x) do { if (nd6_debug >= 1) log x; } while (0) +#define nd6log2(x) do { if (nd6_debug >= 2) log x; } while (0) /* nd6_rtr.c */ extern int nd6_defifindex; extern int ip6_desync_factor; /* seconds */ +/* ND6_INFINITE_LIFETIME does not apply to temporary addresses */ extern u_int32_t ip6_temp_preferred_lifetime; /* seconds */ extern u_int32_t ip6_temp_valid_lifetime; /* seconds */ extern int ip6_temp_regen_advance; /* seconds */ union nd_opts { - struct nd_opt_hdr *nd_opt_array[9]; /*max = home agent info*/ + struct 
nd_opt_hdr *nd_opt_array[8]; /* max = target address list */ struct { struct nd_opt_hdr *zero; struct nd_opt_hdr *src_lladdr; @@ -562,9 +673,6 @@ union nd_opts { struct nd_opt_prefix_info *pi_beg; /* multiple opts, start */ struct nd_opt_rd_hdr *rh; struct nd_opt_mtu *mtu; - struct nd_opt_hdr *six; - struct nd_opt_advint *adv; - struct nd_opt_hai *hai; struct nd_opt_hdr *search; /* multiple opts */ struct nd_opt_hdr *last; /* multiple opts */ int done; @@ -577,8 +685,6 @@ union nd_opts { #define nd_opts_pi_end nd_opt_each.pi_end #define nd_opts_rh nd_opt_each.rh #define nd_opts_mtu nd_opt_each.mtu -#define nd_opts_adv nd_opt_each.adv -#define nd_opts_hai nd_opt_each.hai #define nd_opts_search nd_opt_each.search #define nd_opts_last nd_opt_each.last #define nd_opts_done nd_opt_each.done @@ -604,49 +710,62 @@ extern int nd6_ioctl(u_long, caddr_t, struct ifnet *); extern void nd6_cache_lladdr(struct ifnet *, struct in6_addr *, char *, int, int, int); extern int nd6_output(struct ifnet *, struct ifnet *, struct mbuf *, - struct sockaddr_in6 *, struct rtentry *, int); + struct sockaddr_in6 *, struct rtentry *); extern int nd6_storelladdr(struct ifnet *, struct rtentry *, struct mbuf *, struct sockaddr *, u_char *); extern int nd6_need_cache(struct ifnet *); extern void nd6_drain(void *); /* nd6_nbr.c */ +extern void nd6_nbr_init(void); extern void nd6_na_input(struct mbuf *, int, int); extern void nd6_na_output(struct ifnet *, const struct in6_addr *, const struct in6_addr *, u_int32_t, int, struct sockaddr *); extern void nd6_ns_input(struct mbuf *, int, int); extern void nd6_ns_output(struct ifnet *, const struct in6_addr *, - const struct in6_addr *, struct llinfo_nd6 *, int, int); + const struct in6_addr *, struct llinfo_nd6 *, int); extern caddr_t nd6_ifptomac(struct ifnet *); extern void nd6_dad_start(struct ifaddr *, int *); extern void nd6_dad_stop(struct ifaddr *); -extern void nd6_dad_duplicated(struct ifaddr *); +extern void nd6_dad_duplicated(struct ifaddr *, boolean_t); +extern void nd6_llreach_alloc(struct rtentry *, struct ifnet *, void *, + unsigned int, boolean_t); +extern void nd6_llreach_set_reachable(struct ifnet *, void *, unsigned int); +extern void nd6_llreach_use(struct llinfo_nd6 *); /* nd6_rtr.c */ +extern void nd6_rtr_init(void); extern void nd6_rs_input(struct mbuf *, int, int); extern void nd6_ra_input(struct mbuf *, int, int); extern void prelist_del(struct nd_prefix *); -extern void defrouter_addreq(struct nd_defrouter *); -extern void defrouter_delreq(struct nd_defrouter *, int); -extern void defrouter_select(void); -extern void defrtrlist_del(struct nd_defrouter *, int); -extern void prelist_remove(struct nd_prefix *, int); +extern void defrouter_addreq(struct nd_defrouter *, boolean_t); +extern void defrouter_delreq(struct nd_defrouter *); +extern void defrouter_select(struct ifnet *); +extern void defrouter_reset(void); +extern int defrtrlist_ioctl(u_long, caddr_t); +extern void defrtrlist_del(struct nd_defrouter *); +extern int defrtrlist_add_static(struct nd_defrouter *); +extern int defrtrlist_del_static(struct nd_defrouter *); +extern void prelist_remove(struct nd_prefix *); extern int prelist_update(struct nd_prefix *, struct nd_defrouter *, - struct mbuf *); + struct mbuf *, int); extern int nd6_prelist_add(struct nd_prefix *, struct nd_defrouter *, - struct nd_prefix **); -extern int nd6_prefix_onlink(struct nd_prefix *, int, int); + struct nd_prefix **, boolean_t); +extern int nd6_prefix_onlink(struct nd_prefix *); +extern int 
nd6_prefix_onlink_scoped(struct nd_prefix *, unsigned int); extern int nd6_prefix_offlink(struct nd_prefix *); -extern void pfxlist_onlink_check(int); +extern void pfxlist_onlink_check(void); extern struct nd_defrouter *defrouter_lookup(struct in6_addr *, struct ifnet *); extern struct nd_prefix *nd6_prefix_lookup(struct nd_prefix *); extern int in6_init_prefix_ltimes(struct nd_prefix *ndpr); extern void rt6_flush(struct in6_addr *, struct ifnet *); extern int nd6_setdefaultiface(int); extern int in6_tmpifadd(const struct in6_ifaddr *, int, int); -extern void ndpr_hold(struct nd_prefix *, boolean_t); -extern void ndpr_rele(struct nd_prefix *, boolean_t); -#endif /* KERNEL_PRIVATE */ +extern void nddr_addref(struct nd_defrouter *, int); +extern struct nd_defrouter *nddr_remref(struct nd_defrouter *, int); +extern void ndpr_addref(struct nd_prefix *, int); +extern struct nd_prefix *ndpr_remref(struct nd_prefix *, int); +#endif /* XNU_KERNEL_PRIVATE */ #ifdef KERNEL diff --git a/bsd/netinet6/nd6_nbr.c b/bsd/netinet6/nd6_nbr.c index 5b0d744a6..b2abd8169 100644 --- a/bsd/netinet6/nd6_nbr.c +++ b/bsd/netinet6/nd6_nbr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -67,11 +67,19 @@ #include <sys/kernel.h> #include <sys/errno.h> #include <sys/syslog.h> +#include <sys/sysctl.h> +#include <sys/mcache.h> +#include <sys/protosw.h> #include <kern/queue.h> +#include <kern/locks.h> +#include <kern/zalloc.h> + #include <net/if.h> +#include <net/if_var.h> #include <net/if_types.h> #include <net/if_dl.h> +#include <net/if_llreach.h> #include <net/route.h> #include <netinet/in.h> @@ -80,6 +88,7 @@ #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #include <netinet6/nd6.h> +#include <netinet6/scope6_var.h> #include <netinet/icmp6.h> #if IPSEC @@ -96,27 +105,147 @@ extern int ipsec_bypass; struct dadq; static struct dadq *nd6_dad_find(struct ifaddr *); -#ifndef __APPLE__ -static void nd6_dad_starttimer(struct dadq *, int); -static void nd6_dad_stoptimer(struct dadq *); -#else void nd6_dad_stoptimer(struct ifaddr *); -#endif static void nd6_dad_timer(struct ifaddr *); static void nd6_dad_ns_output(struct dadq *, struct ifaddr *); static void nd6_dad_ns_input(struct ifaddr *); -static void nd6_dad_na_input(struct ifaddr *); +static void nd6_dad_na_input(struct ifaddr *, caddr_t, int); +static void dad_addref(struct dadq *, int); +static void dad_remref(struct dadq *); +static struct dadq *nd6_dad_attach(struct dadq *, struct ifaddr *); +static void nd6_dad_detach(struct dadq *, struct ifaddr *); static int dad_ignore_ns = 0; /* ignore NS in DAD - specwise incorrect*/ static int dad_maxtry = 15; /* max # of *tries* to transmit DAD packet */ +static unsigned int dad_size; /* size of zone element */ +static struct zone *dad_zone; /* zone for dadq */ + +#define DAD_ZONE_MAX 64 /* maximum elements in zone */ +#define DAD_ZONE_NAME "nd6_dad" /* zone name */ + +#define DAD_LOCK_ASSERT_HELD(_dp) \ + lck_mtx_assert(&(_dp)->dad_lock, LCK_MTX_ASSERT_OWNED) + +#define DAD_LOCK_ASSERT_NOTHELD(_dp) \ + lck_mtx_assert(&(_dp)->dad_lock, LCK_MTX_ASSERT_NOTOWNED) + +#define DAD_LOCK(_dp) \ + lck_mtx_lock(&(_dp)->dad_lock) + +#define DAD_LOCK_SPIN(_dp) \ + lck_mtx_lock_spin(&(_dp)->dad_lock) + +#define DAD_CONVERT_LOCK(_dp) do { \ + DAD_LOCK_ASSERT_HELD(_dp); \ + lck_mtx_convert_spin(&(_dp)->dad_lock); \ +} while (0) + +#define DAD_UNLOCK(_dp) \ + lck_mtx_unlock(&(_dp)->dad_lock) + +#define 
DAD_ADDREF(_dp) \ + dad_addref(_dp, 0) + +#define DAD_ADDREF_LOCKED(_dp) \ + dad_addref(_dp, 1) + +#define DAD_REMREF(_dp) \ + dad_remref(_dp) + extern lck_mtx_t *dad6_mutex; extern lck_mtx_t *nd6_mutex; +extern int in6_get_hw_ifid(struct ifnet *, struct in6_addr *); + +static int nd6_llreach_base = (LL_BASE_REACHABLE / 1000); /* seconds */ + +SYSCTL_DECL(_net_inet6_icmp6); + +SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, nd6_llreach_base, + CTLFLAG_RW | CTLFLAG_LOCKED, &nd6_llreach_base, LL_BASE_REACHABLE, + "default ND6 link-layer reachability max lifetime (in seconds)"); + +#define SIN6(s) ((struct sockaddr_in6 *)s) + /* - * Input an Neighbor Solicitation Message. + * Obtain a link-layer source cache entry for the sender. + * + * NOTE: This is currently only for ND6/Ethernet. + */ +void +nd6_llreach_alloc(struct rtentry *rt, struct ifnet *ifp, void *addr, + unsigned int alen, boolean_t solicited) +{ + struct llinfo_nd6 *ln = rt->rt_llinfo; + + if (nd6_llreach_base != 0 && + ln->ln_expire != 0 && rt->rt_ifp != lo_ifp && + ifp->if_addrlen == IF_LLREACH_MAXLEN && /* Ethernet */ + alen == ifp->if_addrlen) { + struct if_llreach *lr; + const char *why = NULL, *type = ""; + + /* Become a regular mutex, just in case */ + RT_CONVERT_LOCK(rt); + + if ((lr = ln->ln_llreach) != NULL) { + type = (solicited ? "ND6 advertisement" : + "ND6 unsolicited announcement"); + /* + * If target has changed, create a new record; + * otherwise keep existing record. + */ + IFLR_LOCK(lr); + if (bcmp(addr, lr->lr_key.addr, alen) != 0) { + IFLR_UNLOCK(lr); + /* Purge any link-layer info caching */ + VERIFY(rt->rt_llinfo_purge != NULL); + rt->rt_llinfo_purge(rt); + lr = NULL; + why = " for different target HW address; " + "using new llreach record"; + } else { + lr->lr_probes = 0; /* reset probe count */ + IFLR_UNLOCK(lr); + if (solicited) { + why = " for same target HW address; " + "keeping existing llreach record"; + } + } + } + + if (lr == NULL) { + lr = ln->ln_llreach = ifnet_llreach_alloc(ifp, + ETHERTYPE_IPV6, addr, alen, nd6_llreach_base); + if (lr != NULL) { + lr->lr_probes = 0; /* reset probe count */ + if (why == NULL) + why = "creating new llreach record"; + } + } + + if (nd6_debug && lr != NULL && why != NULL) { + char tmp[MAX_IPv6_STR_LEN]; + + nd6log((LOG_DEBUG, "%s%d: %s%s for %s\n", ifp->if_name, + ifp->if_unit, type, why, inet_ntop(AF_INET6, + &SIN6(rt_key(rt))->sin6_addr, tmp, sizeof (tmp)))); + } + } +} + +void +nd6_llreach_use(struct llinfo_nd6 *ln) +{ + if (ln->ln_llreach != NULL) + ln->ln_lastused = net_uptime(); +} + +/* + * Input a Neighbor Solicitation Message. * * Based on RFC 2461 - * Based on RFC 2462 (duplicated address detection) + * Based on RFC 2462 (duplicate address detection) */ void nd6_ns_input( @@ -151,6 +280,8 @@ nd6_ns_input( #endif ip6 = mtod(m, struct ip6_hdr *); /* adjust pointer for safety */ taddr6 = nd_ns->nd_ns_target; + if (in6_setscope(&taddr6, ifp, NULL) != 0) + goto bad; if (ip6->ip6_hlim != 255) { nd6log((LOG_ERR, @@ -161,18 +292,36 @@ nd6_ns_input( } if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) { - /* dst has to be solicited node multicast address. */ - if (daddr6.s6_addr16[0] == IPV6_ADDR_INT16_MLL + /* dst has to be a solicited node multicast address. 
*/ + if (daddr6.s6_addr16[0] == IPV6_ADDR_INT16_MLL && /* don't check ifindex portion */ - && daddr6.s6_addr32[1] == 0 - && daddr6.s6_addr32[2] == IPV6_ADDR_INT32_ONE - && daddr6.s6_addr8[12] == 0xff) { + daddr6.s6_addr32[1] == 0 && + daddr6.s6_addr32[2] == IPV6_ADDR_INT32_ONE && + daddr6.s6_addr8[12] == 0xff) { ; /* good */ } else { nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet " "(wrong ip6 dst)\n")); goto bad; } + } else if (!nd6_onlink_ns_rfc4861) { + struct sockaddr_in6 src_sa6; + + /* + * According to recent IETF discussions, it is not a good idea + * to accept a NS from an address which would not be deemed + * to be a neighbor otherwise. This point is expected to be + * clarified in future revisions of the specification. + */ + bzero(&src_sa6, sizeof(src_sa6)); + src_sa6.sin6_family = AF_INET6; + src_sa6.sin6_len = sizeof(src_sa6); + src_sa6.sin6_addr = saddr6; + if (!nd6_is_addr_neighbor(&src_sa6, ifp, 0)) { + nd6log((LOG_INFO, "nd6_ns_input: " + "NS packet from non-neighbor\n")); + goto bad; + } } if (IN6_IS_ADDR_MULTICAST(&taddr6)) { @@ -180,9 +329,6 @@ nd6_ns_input( goto bad; } - if (IN6_IS_SCOPE_LINKLOCAL(&taddr6)) - taddr6.s6_addr16[1] = htons(ifp->if_index); - icmp6len -= sizeof(*nd_ns); nd6_option_init(nd_ns + 1, icmp6len, &ndopts); if (nd6_options(&ndopts) < 0) { @@ -196,7 +342,7 @@ nd6_ns_input( lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1); lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3; } - + if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && lladdr) { nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet " "(link-layer address option)\n")); @@ -213,12 +359,6 @@ nd6_ns_input( * In implementation, we add target link-layer address by default. * We do not add one in MUST NOT cases. */ -#if 0 /* too much! */ - ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &daddr6); - if (ifa && (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST)) - tlladdr = 0; - else -#endif if (!IN6_IS_ADDR_MULTICAST(&daddr6)) tlladdr = 0; else @@ -234,16 +374,18 @@ nd6_ns_input( ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6); /* (2) check. */ - if (!ifa) { + if (ifa == NULL) { struct rtentry *rt; struct sockaddr_in6 tsin6; - bzero(&tsin6, sizeof tsin6); + bzero(&tsin6, sizeof tsin6); tsin6.sin6_len = sizeof(struct sockaddr_in6); tsin6.sin6_family = AF_INET6; tsin6.sin6_addr = taddr6; - rt = rtalloc1((struct sockaddr *)&tsin6, 0, 0); + rt = rtalloc1_scoped((struct sockaddr *)&tsin6, 0, 0, + ifp->if_index); + if (rt != NULL) { RT_LOCK(rt); if ((rt->rt_flags & RTF_ANNOUNCE) != 0 && @@ -262,7 +404,7 @@ nd6_ns_input( rtfree(rt); } } - if (!ifa) { + if (ifa == NULL) { /* * We've got an NS packet, and we don't have that address * assigned for us. We MUST silently ignore it. * See RFC 2461 7.2.3. */ goto freeit; } + IFA_LOCK(ifa); myaddr6 = *IFA_IN6(ifa); anycast = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST; tentative = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE; - if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DUPLICATED) + if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DUPLICATED) { + IFA_UNLOCK(ifa); goto freeit; + } + IFA_UNLOCK(ifa); if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) { nd6log((LOG_INFO, @@ -306,7 +452,7 @@ nd6_ns_input( if (tentative) { /* * If source address is unspecified address, it is for - * duplicated address detection. + * duplicate address detection. * * If not, the packet is for address resolution; * silently ignore it.
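The unspecified-source (DAD) branch above only accepts an NS whose destination is a solicited-node multicast group: ff02::1:ff00:0/104 with the low 24 bits copied from the target address, which is exactly what the s6_addr32[2] == IPV6_ADDR_INT32_ONE and s6_addr8[12] == 0xff tests spell out field by field. Below is a self-contained userland sketch of that derivation, assuming only the standard socket headers; the address literal is illustrative.

#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct in6_addr target, snm;
	char buf[INET6_ADDRSTRLEN];

	if (inet_pton(AF_INET6, "2001:db8::a11:22ff:fe33:4455", &target) != 1)
		return (1);

	/* ff02::1:ffXX:XXXX, where XX:XXXX are the target's low 24 bits */
	memset(&snm, 0, sizeof (snm));
	snm.s6_addr[0] = 0xff;		/* ff02::/16, link-local scope */
	snm.s6_addr[1] = 0x02;
	snm.s6_addr[11] = 0x01;		/* the ::1:ff00:0/104 prefix */
	snm.s6_addr[12] = 0xff;
	snm.s6_addr[13] = target.s6_addr[13];
	snm.s6_addr[14] = target.s6_addr[14];
	snm.s6_addr[15] = target.s6_addr[15];

	printf("solicited-node group: %s\n",
	    inet_ntop(AF_INET6, &snm, buf, sizeof (buf)));
	/* prints: solicited-node group: ff02::1:ff33:4455 */
	return (0);
}

Every node joins this group for each address it configures, so a DAD probe reaches exactly the hosts that could already own the target instead of flooding all nodes.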
@@ -327,7 +473,8 @@ nd6_ns_input( */ if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) { saddr6 = in6addr_linklocal_allnodes; - saddr6.s6_addr16[1] = htons(ifp->if_index); + if (in6_setscope(&saddr6, ifp, NULL) != 0) + goto bad; nd6_na_output(ifp, &saddr6, &taddr6, ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) @@ -346,7 +493,7 @@ nd6_ns_input( freeit: m_freem(m); if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); return; bad: @@ -356,17 +503,17 @@ nd6_ns_input( icmp6stat.icp6s_badns++; m_freem(m); if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); } /* - * Output an Neighbor Solicitation Message. Caller specifies: + * Output a Neighbor Solicitation Message. Caller specifies: * - ICMP6 header source IP6 address * - ND6 header target IP6 address * - ND6 header source datalink address * * Based on RFC 2461 - * Based on RFC 2462 (duplicated address detection) + * Based on RFC 2462 (duplicate address detection) * * Caller must bump up ln->ln_rt refcnt to make sure 'ln' doesn't go * away if there is a llinfo_nd6 passed in. @@ -377,22 +524,29 @@ nd6_ns_output( const struct in6_addr *daddr6, const struct in6_addr *taddr6, struct llinfo_nd6 *ln, /* for source address determination */ - int dad, /* duplicated address detection */ - int locked) + int dad) /* duplicated address detection */ { struct mbuf *m; struct ip6_hdr *ip6; struct nd_neighbor_solicit *nd_ns; struct in6_ifaddr *ia = NULL; - struct ip6_moptions im6o; + struct in6_addr *src, src_in, src_storage; + struct ip6_moptions *im6o = NULL; + struct ifnet *outif = NULL; int icmp6len; int maxlen; + int flags; caddr_t mac; - struct ifnet *outif = NULL; - + struct route_in6 ro; + struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 }; + + bzero(&ro, sizeof(ro)); + if (IN6_IS_ADDR_MULTICAST(taddr6)) return; + ip6oa.ip6oa_boundif = ifp->if_index; + /* estimate the size of message */ maxlen = sizeof(*ip6) + sizeof(*nd_ns); maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7; @@ -418,9 +572,16 @@ nd6_ns_output( if (daddr6 == NULL || IN6_IS_ADDR_MULTICAST(daddr6)) { m->m_flags |= M_MCAST; - im6o.im6o_multicast_ifp = ifp; - im6o.im6o_multicast_hlim = 255; - im6o.im6o_multicast_loop = 0; + + im6o = ip6_allocmoptions(M_DONTWAIT); + if (im6o == NULL) { + m_freem(m); + return; + } + + im6o->im6o_multicast_ifp = ifp; + im6o->im6o_multicast_hlim = 255; + im6o->im6o_multicast_loop = 0; } icmp6len = sizeof(*nd_ns); @@ -439,27 +600,15 @@ nd6_ns_output( ip6->ip6_dst = *daddr6; else { ip6->ip6_dst.s6_addr16[0] = IPV6_ADDR_INT16_MLL; - ip6->ip6_dst.s6_addr16[1] = htons(ifp->if_index); + ip6->ip6_dst.s6_addr16[1] = 0; ip6->ip6_dst.s6_addr32[1] = 0; ip6->ip6_dst.s6_addr32[2] = IPV6_ADDR_INT32_ONE; ip6->ip6_dst.s6_addr32[3] = taddr6->s6_addr32[3]; ip6->ip6_dst.s6_addr8[12] = 0xff; + if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0) + goto bad; } if (!dad) { -#if 0 /* KAME way, exact address scope match */ - /* - * Select a source whose scope is the same as that of the dest. - * Typically, the dest is link-local solicitation multicast - * (i.e. neighbor discovery) or link-local/global unicast - * (i.e. neighbor un-reachability detection). - */ - ia = in6_ifawithifp(ifp, &ip6->ip6_dst); - if (ia == NULL) { - m_freem(m); - return; - } - ip6->ip6_src = ia->ia_addr.sin6_addr; -#else /* spec-wise correct */ /* * RFC2461 7.2.2: * "If the source address of the packet prompting the @@ -473,72 +622,82 @@ nd6_ns_output( * (saddr6), if: * - saddr6 is given from the caller (by giving "ln"), and * - saddr6 belongs to the outgoing interface. 
- * Otherwise, we perform a scope-wise match. + * Otherwise, we perform the source address selection as usual. */ - struct ip6_hdr *hip6 = NULL; /* hold ip6 */ - struct in6_addr saddr6; + struct ip6_hdr *hip6; /* hold ip6 */ + struct in6_addr *hsrc = NULL; /* Caller holds ref on this route */ if (ln != NULL) { RT_LOCK(ln->ln_rt); + /* + * assuming every packet in ln_hold has the same IP + * header + */ if (ln->ln_hold != NULL) { hip6 = mtod(ln->ln_hold, struct ip6_hdr *); /* XXX pullup? */ if (sizeof (*hip6) < ln->ln_hold->m_len) - saddr6 = hip6->ip6_src; + hsrc = &hip6->ip6_src; else - hip6 = NULL; + hsrc = NULL; + } + /* Update probe count, if applicable */ + if (ln->ln_llreach != NULL) { + IFLR_LOCK_SPIN(ln->ln_llreach); + ln->ln_llreach->lr_probes++; + IFLR_UNLOCK(ln->ln_llreach); } - /* - * hip6 is used only to indicate whether or - * not there is a valid source address from - * the held packet in ln_hold. For obvious - * reasons we should not dereference it after - * releasing the lock though we can simply - * test if it's non-NULL. - */ RT_UNLOCK(ln->ln_rt); - } - if (ia != NULL) - ifafree(&ia->ia_ifa); - if (hip6 != NULL && (ia = in6ifa_ifpwithaddr(ifp, &saddr6))) { - bcopy(&saddr6, &ip6->ip6_src, sizeof (saddr6)); - } else { - ia = in6_ifawithifp(ifp, &ip6->ip6_dst); - if (ia == NULL) { - if (ln != NULL) { - RT_LOCK(ln->ln_rt); - if (ln->ln_hold != NULL) - m_freem(ln->ln_hold); - ln->ln_hold = NULL; - RT_UNLOCK(ln->ln_rt); - } - m_freem(m); - return; - } - ip6->ip6_src = ia->ia_addr.sin6_addr; } if (ia != NULL) { - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); ia = NULL; } -#endif + if (hsrc != NULL && (ia = in6ifa_ifpwithaddr(ifp, hsrc))) { + src = hsrc; + IFA_REMREF(&ia->ia_ifa); + ia = NULL; + } else { + int error; + struct sockaddr_in6 dst_sa; + + bzero(&dst_sa, sizeof(dst_sa)); + dst_sa.sin6_family = AF_INET6; + dst_sa.sin6_len = sizeof(dst_sa); + dst_sa.sin6_addr = ip6->ip6_dst; + + src = in6_selectsrc(&dst_sa, NULL, + NULL, &ro, NULL, &src_storage, ip6oa.ip6oa_boundif, + &error); + if (src == NULL) { + nd6log((LOG_DEBUG, + "nd6_ns_output: source can't be " + "determined: dst=%s, error=%d\n", + ip6_sprintf(&dst_sa.sin6_addr), + error)); + goto bad; + } + } } else { /* * Source address for DAD packet must always be IPv6 * unspecified address. (0::0) + * We actually don't have to 0-clear the address (we did it + * above), but we do so here explicitly to make the intention + * clearer. */ - bzero(&ip6->ip6_src, sizeof(ip6->ip6_src)); + bzero(&src_in, sizeof(src_in)); + src = &src_in; } + ip6->ip6_src = *src; nd_ns = (struct nd_neighbor_solicit *)(ip6 + 1); nd_ns->nd_ns_type = ND_NEIGHBOR_SOLICIT; nd_ns->nd_ns_code = 0; nd_ns->nd_ns_reserved = 0; nd_ns->nd_ns_target = *taddr6; - - if (IN6_IS_SCOPE_LINKLOCAL(&nd_ns->nd_ns_target)) - nd_ns->nd_ns_target.s6_addr16[1] = 0; + in6_clearscope(&nd_ns->nd_ns_target); /* XXX */ /* * Add source link-layer address option. @@ -577,19 +736,43 @@ nd6_ns_output( if (ipsec_bypass == 0) (void)ipsec_setsocket(m, NULL); #endif - ip6_output(m, NULL, NULL, dad ? IPV6_DADOUTPUT : 0, &im6o, &outif, locked); + flags = dad ? IPV6_UNSPECSRC : 0; + flags |= IPV6_OUTARGS; + + ip6_output(m, NULL, NULL, flags, im6o, &outif, &ip6oa); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_neighborsolicit); + ifnet_release(outif); } icmp6stat.icp6s_outhist[ND_NEIGHBOR_SOLICIT]++; + + if (im6o != NULL) + IM6O_REMREF(im6o); + if (ro.ro_rt) { /* we don't cache this route. 
*/ + rtfree(ro.ro_rt); + } + if (ia != NULL) + IFA_REMREF(&ia->ia_ifa); + return; + +bad: + if (im6o != NULL) + IM6O_REMREF(im6o); + if (ro.ro_rt) { + rtfree(ro.ro_rt); + } + m_freem(m); + if (ia != NULL) + IFA_REMREF(&ia->ia_ifa); + return; } /* * Neighbor advertisement input handling. * * Based on RFC 2461 - * Based on RFC 2462 (duplicated address detection) + * Based on RFC 2462 (duplicate address detection) * * the following items are not implemented yet: * - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD) @@ -604,9 +787,6 @@ nd6_na_input( struct ifnet *ifp = m->m_pkthdr.rcvif; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_neighbor_advert *nd_na; -#if 0 - struct in6_addr saddr6 = ip6->ip6_src; -#endif struct in6_addr daddr6 = ip6->ip6_dst; struct in6_addr taddr6; int flags; @@ -640,14 +820,15 @@ nd6_na_input( return; } #endif - taddr6 = nd_na->nd_na_target; + flags = nd_na->nd_na_flags_reserved; is_router = ((flags & ND_NA_FLAG_ROUTER) != 0); is_solicited = ((flags & ND_NA_FLAG_SOLICITED) != 0); is_override = ((flags & ND_NA_FLAG_OVERRIDE) != 0); - if (IN6_IS_SCOPE_LINKLOCAL(&taddr6)) - taddr6.s6_addr16[1] = htons(ifp->if_index); + taddr6 = nd_na->nd_na_target; + if (in6_setscope(&taddr6, ifp, NULL)) + goto bad; /* XXX: impossible */ if (IN6_IS_ADDR_MULTICAST(&taddr6)) { nd6log((LOG_ERR, @@ -687,10 +868,14 @@ nd6_na_input( * * Otherwise, process as defined in RFC 2461. */ - if (ifa - && (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE)) { - nd6_dad_na_input(ifa); - goto freeit; + if (ifa != NULL) { + IFA_LOCK(ifa); + if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE) { + IFA_UNLOCK(ifa); + nd6_dad_na_input(ifa, lladdr, lladdrlen); + goto freeit; + } + IFA_UNLOCK(ifa); } /* Just for safety, maybe unnecessary. */ @@ -710,8 +895,8 @@ nd6_na_input( } /* - * If no neighbor cache entry is found, NA SHOULD silently be discarded. - * Callee returns a locked route upon success. + * If no neighbor cache entry is found, NA SHOULD silently be + * discarded. */ if ((rt = nd6_lookup(&taddr6, 0, ifp, 0)) == NULL) goto freeit; @@ -762,7 +947,9 @@ nd6_na_input( * affect the status of associated prefixes.. */ RT_UNLOCK(rt); - pfxlist_onlink_check(0); + lck_mtx_lock(nd6_mutex); + pfxlist_onlink_check(); + lck_mtx_unlock(nd6_mutex); RT_LOCK(rt); } } else { @@ -802,7 +989,7 @@ nd6_na_input( * 1 1 y n (2a) L *->REACHABLE * 1 1 y y (2a) L *->REACHABLE */ - if (!is_override && (lladdr && llchange)) { /* (1) */ + if (!is_override && (lladdr != NULL && llchange)) { /* (1) */ /* * If state is REACHABLE, make it STALE. * no other updates should be done. @@ -866,13 +1053,14 @@ nd6_na_input( * Lock to protect the default router list. * XXX: this might be unnecessary, since this function * is only called under the network software interrupt - * context. However, we keep it just for safety. + * context. However, we keep it just for safety. 
*/ RT_UNLOCK(rt); lck_mtx_lock(nd6_mutex); dr = defrouter_lookup(in6, rt_ifp); if (dr) { - defrtrlist_del(dr, 1); + defrtrlist_del(dr); + NDDR_REMREF(dr); lck_mtx_unlock(nd6_mutex); } else { @@ -894,33 +1082,51 @@ nd6_na_input( } RT_LOCK_ASSERT_HELD(rt); rt->rt_flags &= ~RTF_REJECT; + + /* cache the gateway (sender HW) address */ + nd6_llreach_alloc(rt, ifp, LLADDR(sdl), sdl->sdl_alen, TRUE); + + /* update the llinfo, send a queued packet if there is one */ ln->ln_asked = 0; if (ln->ln_hold != NULL) { - struct mbuf *n = ln->ln_hold; - ln->ln_hold = NULL; + struct mbuf *m_hold, *m_hold_next; + struct sockaddr_in6 sin6; + + rtkey_to_sa6(rt, &sin6); /* - * we assume ifp is not a loopback here, so just set the 2nd - * argument as the 1st one. + * reset the ln_hold in advance, to explicitly + * prevent a ln_hold lookup in nd6_output() + * (wouldn't happen, though...) */ - RT_UNLOCK(rt); - nd6_output(ifp, ifp, n, (struct sockaddr_in6 *)rt_key(rt), - rt, 0); - RT_LOCK_SPIN(rt); + for (m_hold = ln->ln_hold; + m_hold; m_hold = m_hold_next) { + m_hold_next = m_hold->m_nextpkt; + m_hold->m_nextpkt = NULL; + /* + * we assume ifp is not a loopback here, so just set + * the 2nd argument as the 1st one. + */ + RT_UNLOCK(rt); + nd6_output(ifp, ifp, m_hold, &sin6, rt); + RT_LOCK_SPIN(rt); + } + ln->ln_hold = NULL; + } RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); - freeit: +freeit: m_freem(m); if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); return; - bad: +bad: icmp6stat.icp6s_badna++; m_freem(m); if (ifa != NULL) - ifafree(ifa); + IFA_REMREF(ifa); } /* @@ -931,11 +1137,14 @@ nd6_na_input( * the following items are not implemented yet: * - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD) * - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD) + * + * tlladdr - 1 if include target link-layer address + * sdl0 - sockaddr_dl (= proxy NA) or NULL */ void nd6_na_output( struct ifnet *ifp, - const struct in6_addr *daddr6, + const struct in6_addr *daddr6_0, const struct in6_addr *taddr6, uint32_t flags, int tlladdr, /* 1 if include target link-layer address */ @@ -944,12 +1153,20 @@ nd6_na_output( struct mbuf *m; struct ip6_hdr *ip6; struct nd_neighbor_advert *nd_na; - struct in6_ifaddr *ia = NULL; - struct ip6_moptions im6o; - int icmp6len; - int maxlen; + struct ip6_moptions *im6o = NULL; caddr_t mac = NULL; - struct ifnet *outif = NULL; + struct route_in6 ro; + struct in6_addr *src, src_storage, daddr6; + struct sockaddr_in6 dst_sa; + int icmp6len, maxlen, error; + struct ifnet *outif = NULL; + struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 }; + + bzero(&ro, sizeof(ro)); + + daddr6 = *daddr6_0; /* make a local copy for modification */ + + ip6oa.ip6oa_boundif = ifp->if_index; /* estimate the size of message */ maxlen = sizeof(*ip6) + sizeof(*nd_na); @@ -974,11 +1191,18 @@ nd6_na_output( return; m->m_pkthdr.rcvif = NULL; - if (IN6_IS_ADDR_MULTICAST(daddr6)) { + if (IN6_IS_ADDR_MULTICAST(&daddr6)) { m->m_flags |= M_MCAST; - im6o.im6o_multicast_ifp = ifp; - im6o.im6o_multicast_hlim = 255; - im6o.im6o_multicast_loop = 0; + + im6o = ip6_allocmoptions(M_DONTWAIT); + if (im6o == NULL) { + m_freem(m); + return; + } + + im6o->im6o_multicast_ifp = ifp; + im6o->im6o_multicast_hlim = 255; + im6o->im6o_multicast_loop = 0; } icmp6len = sizeof(*nd_na); @@ -992,35 +1216,44 @@ nd6_na_output( ip6->ip6_vfc |= IPV6_VERSION; ip6->ip6_nxt = IPPROTO_ICMPV6; ip6->ip6_hlim = 255; - if (IN6_IS_ADDR_UNSPECIFIED(daddr6)) { + if (IN6_IS_ADDR_UNSPECIFIED(&daddr6)) { /* reply to DAD */ - ip6->ip6_dst.s6_addr16[0] = 
IPV6_ADDR_INT16_MLL; - ip6->ip6_dst.s6_addr16[1] = htons(ifp->if_index); - ip6->ip6_dst.s6_addr32[1] = 0; - ip6->ip6_dst.s6_addr32[2] = 0; - ip6->ip6_dst.s6_addr32[3] = IPV6_ADDR_INT32_ONE; + daddr6.s6_addr16[0] = IPV6_ADDR_INT16_MLL; + daddr6.s6_addr16[1] = 0; + daddr6.s6_addr32[1] = 0; + daddr6.s6_addr32[2] = 0; + daddr6.s6_addr32[3] = IPV6_ADDR_INT32_ONE; + if (in6_setscope(&daddr6, ifp, NULL)) + goto bad; + flags &= ~ND_NA_FLAG_SOLICITED; } else - ip6->ip6_dst = *daddr6; + ip6->ip6_dst = daddr6; + + bzero(&dst_sa, sizeof(struct sockaddr_in6)); + dst_sa.sin6_family = AF_INET6; + dst_sa.sin6_len = sizeof(struct sockaddr_in6); + dst_sa.sin6_addr = daddr6; /* * Select a source whose scope is the same as that of the dest. */ - ia = in6_ifawithifp(ifp, &ip6->ip6_dst); - if (ia == NULL) { - m_freem(m); - return; + bcopy(&dst_sa, &ro.ro_dst, sizeof(dst_sa)); + src = in6_selectsrc(&dst_sa, NULL, NULL, &ro, NULL, &src_storage, + ip6oa.ip6oa_boundif, &error); + if (src == NULL) { + nd6log((LOG_DEBUG, "nd6_na_output: source can't be " + "determined: dst=%s, error=%d\n", + ip6_sprintf(&dst_sa.sin6_addr), error)); + goto bad; } - ip6->ip6_src = ia->ia_addr.sin6_addr; - ifafree(&ia->ia_ifa); - ia = NULL; + ip6->ip6_src = *src; nd_na = (struct nd_neighbor_advert *)(ip6 + 1); nd_na->nd_na_type = ND_NEIGHBOR_ADVERT; nd_na->nd_na_code = 0; nd_na->nd_na_target = *taddr6; - if (IN6_IS_SCOPE_LINKLOCAL(&nd_na->nd_na_target)) - nd_na->nd_na_target.s6_addr16[1] = 0; + in6_clearscope(&nd_na->nd_na_target); /* XXX */ /* * "tlladdr" indicates NS's condition for adding tlladdr or not. @@ -1046,7 +1279,7 @@ nd6_na_output( if (tlladdr && mac) { int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen; struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_na + 1); - + /* roundup to 8 bytes alignment! 
*/ optlen = (optlen + 7) & ~7; @@ -1071,23 +1304,63 @@ nd6_na_output( if (ipsec_bypass == 0) (void)ipsec_setsocket(m, NULL); #endif - ip6_output(m, NULL, NULL, 0, &im6o, &outif, 0); + ip6_output(m, NULL, NULL, IPV6_OUTARGS, im6o, &outif, &ip6oa); if (outif) { icmp6_ifstat_inc(outif, ifs6_out_msg); icmp6_ifstat_inc(outif, ifs6_out_neighboradvert); + ifnet_release(outif); } icmp6stat.icp6s_outhist[ND_NEIGHBOR_ADVERT]++; + + if (im6o != NULL) + IM6O_REMREF(im6o); + if (ro.ro_rt) { + rtfree(ro.ro_rt); + } + return; + +bad: + if (im6o != NULL) + IM6O_REMREF(im6o); + if (ro.ro_rt) { + rtfree(ro.ro_rt); + } + m_freem(m); + return; } caddr_t nd6_ifptomac( struct ifnet *ifp) { - return ((caddr_t)ifnet_lladdr(ifp)); + switch (ifp->if_type) { + case IFT_ARCNET: + case IFT_ETHER: + case IFT_IEEE8023ADLAG: + case IFT_FDDI: + case IFT_IEEE1394: +#ifdef IFT_L2VLAN + case IFT_L2VLAN: +#endif +#ifdef IFT_IEEE80211 + case IFT_IEEE80211: +#endif +#ifdef IFT_CARP + case IFT_CARP: +#endif + case IFT_BRIDGE: + case IFT_ISO88025: + return ((caddr_t)ifnet_lladdr(ifp)); + default: + return NULL; + } } TAILQ_HEAD(dadq_head, dadq); struct dadq { + decl_lck_mtx_data(, dad_lock); + u_int32_t dad_refcount; /* reference count */ + int dad_attached; TAILQ_ENTRY(dadq) dad_list; struct ifaddr *dad_ifa; int dad_count; /* max NS to send */ @@ -1095,28 +1368,46 @@ struct dadq { int dad_ns_ocount; /* NS sent so far */ int dad_ns_icount; int dad_na_icount; + int dad_na_ixcount; /* Count of IFDISABLED eligible NA rx'd */ }; static struct dadq_head dadq; -static int dad_init = 0; + +void +nd6_nbr_init(void) +{ + TAILQ_INIT(&dadq); + + dad_size = sizeof (struct dadq); + dad_zone = zinit(dad_size, DAD_ZONE_MAX * dad_size, 0, DAD_ZONE_NAME); + if (dad_zone == NULL) { + panic("%s: failed allocating %s", __func__, DAD_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(dad_zone, Z_EXPAND, TRUE); + zone_change(dad_zone, Z_CALLERACCT, FALSE); +} static struct dadq * -nd6_dad_find( - struct ifaddr *ifa) +nd6_dad_find(struct ifaddr *ifa) { struct dadq *dp; + lck_mtx_lock(dad6_mutex); for (dp = dadq.tqh_first; dp; dp = dp->dad_list.tqe_next) { + DAD_LOCK_SPIN(dp); if (dp->dad_ifa == ifa) { + DAD_ADDREF_LOCKED(dp); + DAD_UNLOCK(dp); lck_mtx_unlock(dad6_mutex); - return dp; + return (dp); } + DAD_UNLOCK(dp); } lck_mtx_unlock(dad6_mutex); - return NULL; + return (NULL); } -#ifdef __APPLE__ void nd6_dad_stoptimer( struct ifaddr *ifa) @@ -1124,28 +1415,9 @@ nd6_dad_stoptimer( untimeout((void (*)(void *))nd6_dad_timer, (void *)ifa); } -#else -static void -nd6_dad_starttimer( - struct dadq *dp, - int ticks) -{ - - callout_reset(&dp->dad_timer_ch, ticks, - (void (*)(void *))nd6_dad_timer, (void *)dp->dad_ifa); -} - -static void -nd6_dad_stoptimer( - struct dadq *dp) -{ - - callout_stop(&dp->dad_timer_ch); -} -#endif /* - * Start Duplicated Address Detection (DAD) for specified interface address. + * Start Duplicate Address Detection (DAD) for specified interface address. */ void nd6_dad_start( @@ -1155,43 +1427,45 @@ nd6_dad_start( struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; - if (!dad_init) { - TAILQ_INIT(&dadq); - dad_init++; - } - /* * If we don't need DAD, don't do it. * There are several cases: * - DAD is disabled (ip6_dad_count == 0) * - the interface address is anycast */ + IFA_LOCK(&ia->ia_ifa); if (!(ia->ia6_flags & IN6_IFF_TENTATIVE)) { log(LOG_DEBUG, "nd6_dad_start: called with non-tentative address " "%s(%s)\n", ip6_sprintf(&ia->ia_addr.sin6_addr), ifa->ifa_ifp ? 
if_name(ifa->ifa_ifp) : "???"); + IFA_UNLOCK(&ia->ia_ifa); return; } if (ia->ia6_flags & IN6_IFF_ANYCAST) { ia->ia6_flags &= ~IN6_IFF_TENTATIVE; + IFA_UNLOCK(&ia->ia_ifa); return; } if (!ip6_dad_count) { ia->ia6_flags &= ~IN6_IFF_TENTATIVE; + IFA_UNLOCK(&ia->ia_ifa); return; } - if (!ifa->ifa_ifp) + IFA_UNLOCK(&ia->ia_ifa); + if (ifa->ifa_ifp == NULL) panic("nd6_dad_start: ifa->ifa_ifp == NULL"); - if (!(ifa->ifa_ifp->if_flags & IFF_UP)) + if (!(ifa->ifa_ifp->if_flags & IFF_UP)) { return; - if (nd6_dad_find(ifa) != NULL) { + } + if ((dp = nd6_dad_find(ifa)) != NULL) { + DAD_REMREF(dp); /* DAD already in progress */ return; } - dp = _MALLOC(sizeof(*dp), M_IP6NDP, M_NOWAIT); + dp = zalloc(dad_zone); if (dp == NULL) { log(LOG_ERR, "nd6_dad_start: memory allocation failed for " "%s(%s)\n", @@ -1199,10 +1473,11 @@ nd6_dad_start( ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); return; } - bzero(dp, sizeof(*dp)); - lck_mtx_lock(dad6_mutex); - TAILQ_INSERT_TAIL(&dadq, (struct dadq *)dp, dad_list); - lck_mtx_unlock(dad6_mutex); + bzero(dp, dad_size); + lck_mtx_init(&dp->dad_lock, ifa_mtx_grp, ifa_mtx_attr); + + /* Callee adds one reference for us */ + dp = nd6_dad_attach(dp, ifa); nd6log((LOG_DEBUG, "%s: starting DAD for %s\n", if_name(ifa->ifa_ifp), ip6_sprintf(&ia->ia_addr.sin6_addr))); @@ -1213,11 +1488,6 @@ nd6_dad_start( * first packet to be sent from the interface after interface * (re)initialization. */ - dp->dad_ifa = ifa; - ifaref(ifa); /*just for safety*/ - dp->dad_count = ip6_dad_count; - dp->dad_ns_icount = dp->dad_na_icount = 0; - dp->dad_ns_ocount = dp->dad_ns_tcount = 0; if (tick_delay == NULL) { u_int32_t retrans; nd6_dad_ns_output(dp, ifa); @@ -1236,19 +1506,61 @@ nd6_dad_start( timeout((void (*)(void *))nd6_dad_timer, (void *)ifa, ntick); } + + DAD_REMREF(dp); /* drop our reference */ +} + +static struct dadq * +nd6_dad_attach(struct dadq *dp, struct ifaddr *ifa) +{ + lck_mtx_lock(dad6_mutex); + DAD_LOCK(dp); + dp->dad_ifa = ifa; + IFA_ADDREF(ifa); /* for dad_ifa */ + dp->dad_count = ip6_dad_count; + dp->dad_ns_icount = dp->dad_na_icount = 0; + dp->dad_ns_ocount = dp->dad_ns_tcount = 0; + dp->dad_na_ixcount = 0; + VERIFY(!dp->dad_attached); + dp->dad_attached = 1; + DAD_ADDREF_LOCKED(dp); /* for caller */ + DAD_ADDREF_LOCKED(dp); /* for dadq_head list */ + TAILQ_INSERT_TAIL(&dadq, (struct dadq *)dp, dad_list); + DAD_UNLOCK(dp); + lck_mtx_unlock(dad6_mutex); + + return (dp); +} + +static void +nd6_dad_detach(struct dadq *dp, struct ifaddr *ifa) +{ + int detached; + + lck_mtx_lock(dad6_mutex); + DAD_LOCK(dp); + if ((detached = dp->dad_attached)) { + VERIFY(dp->dad_ifa == ifa); + TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); + dp->dad_list.tqe_next = NULL; + dp->dad_list.tqe_prev = NULL; + dp->dad_attached = 0; + } + DAD_UNLOCK(dp); + lck_mtx_unlock(dad6_mutex); + if (detached) { + DAD_REMREF(dp); /* drop dadq_head reference */ + } } /* * terminate DAD unconditionally. used for address removals. 
*/ void -nd6_dad_stop( - struct ifaddr *ifa) +nd6_dad_stop(struct ifaddr *ifa) { struct dadq *dp; - if (!dad_init) - return; dp = nd6_dad_find(ifa); if (!dp) { /* DAD wasn't started yet */ @@ -1257,21 +1569,42 @@ nd6_dad_stop( untimeout((void (*)(void *))nd6_dad_timer, (void *)ifa); - lck_mtx_lock(dad6_mutex); - TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); - lck_mtx_unlock(dad6_mutex); - FREE(dp, M_IP6NDP); - dp = NULL; - ifafree(ifa); + nd6_dad_detach(dp, ifa); + DAD_REMREF(dp); /* drop our reference */ } static void -nd6_dad_timer( - struct ifaddr *ifa) +nd6_unsol_na_output(struct ifaddr *ifa) { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; - struct dadq *dp; + struct ifnet *ifp = ifa->ifa_ifp; + struct in6_addr saddr6, taddr6; + + if ((ifp->if_flags & IFF_UP) == 0 || + (ifp->if_flags & IFF_RUNNING) == 0) + return; + + IFA_LOCK_SPIN(&ia->ia_ifa); + taddr6 = ia->ia_addr.sin6_addr; + IFA_UNLOCK(&ia->ia_ifa); + if (in6_setscope(&taddr6, ifp, NULL) != 0) + return; + saddr6 = in6addr_linklocal_allnodes; + if (in6_setscope(&saddr6, ifp, NULL) != 0) + return; + + nd6log((LOG_INFO, "%s: sending unsolicited NA\n", + if_name(ifa->ifa_ifp))); + + nd6_na_output(ifp, &saddr6, &taddr6, ND_NA_FLAG_OVERRIDE, 1, NULL); +} + +static void +nd6_dad_timer(struct ifaddr *ifa) +{ + struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; + struct dadq *dp = NULL; /* Sanity check */ if (ia == NULL) { @@ -1283,11 +1616,13 @@ nd6_dad_timer( log(LOG_ERR, "nd6_dad_timer: DAD structure not found\n"); goto done; } + IFA_LOCK(&ia->ia_ifa); if (ia->ia6_flags & IN6_IFF_DUPLICATED) { log(LOG_ERR, "nd6_dad_timer: called with duplicated address " "%s(%s)\n", ip6_sprintf(&ia->ia_addr.sin6_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); + IFA_UNLOCK(&ia->ia_ifa); goto done; } if ((ia->ia6_flags & IN6_IFF_TENTATIVE) == 0) { @@ -1295,26 +1630,26 @@ nd6_dad_timer( "%s(%s)\n", ip6_sprintf(&ia->ia_addr.sin6_addr), ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???"); + IFA_UNLOCK(&ia->ia_ifa); goto done; } + IFA_UNLOCK(&ia->ia_ifa); /* timeouted with IFF_{RUNNING,UP} check */ + DAD_LOCK(dp); if (dp->dad_ns_tcount > dad_maxtry) { + DAD_UNLOCK(dp); nd6log((LOG_INFO, "%s: could not run DAD, driver problem?\n", if_name(ifa->ifa_ifp))); - lck_mtx_lock(dad6_mutex); - TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); - lck_mtx_unlock(dad6_mutex); - FREE(dp, M_IP6NDP); - dp = NULL; - ifafree(ifa); + nd6_dad_detach(dp, ifa); goto done; } /* Need more checks? */ if (dp->dad_ns_ocount < dp->dad_count) { u_int32_t retrans; + DAD_UNLOCK(dp); /* * We have more NS to go. Send NS packet for DAD. */ @@ -1341,71 +1676,50 @@ nd6_dad_timer( } if (dp->dad_ns_icount) { -#if 0 /* heuristics */ - /* - * if - * - we have sent many(?) DAD NS, and - * - the number of NS we sent equals to the - * number of NS we've got, and - * - we've got no NA - * we may have a faulty network card/driver which - * loops back multicasts to myself. - */ - if (3 < dp->dad_count - && dp->dad_ns_icount == dp->dad_count - && dp->dad_na_icount == 0) { - log(LOG_INFO, "DAD questionable for %s(%s): " - "network card loops back multicast?\n", - ip6_sprintf(&ia->ia_addr.sin6_addr), - if_name(ifa->ifa_ifp)); - /* XXX consider it a duplicate or not? */ - /* duplicate++; */ - } else { - /* We've seen NS, means DAD has failed. */ - duplicate++; - } -#else /* We've seen NS, means DAD has failed. 
*/ duplicate++; -#endif } + DAD_UNLOCK(dp); if (duplicate) { /* (*dp) will be freed in nd6_dad_duplicated() */ - dp = NULL; - nd6_dad_duplicated(ifa); + nd6_dad_duplicated(ifa, TRUE); } else { /* * We are done with DAD. No NA came, no NS came. - * duplicated address found. + * No duplicate address found. */ + IFA_LOCK_SPIN(&ia->ia_ifa); ia->ia6_flags &= ~IN6_IFF_TENTATIVE; + IFA_UNLOCK(&ia->ia_ifa); nd6log((LOG_DEBUG, "%s: DAD complete for %s - no duplicates found\n", if_name(ifa->ifa_ifp), ip6_sprintf(&ia->ia_addr.sin6_addr))); - - lck_mtx_lock(dad6_mutex); - TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); - lck_mtx_unlock(dad6_mutex); + /* + * Send an Unsolicited Neighbor Advertisement so that + * other machines on the network are aware of us + * (important when we are waking from sleep). + */ + nd6_unsol_na_output(ifa); in6_post_msg(ia->ia_ifp, KEV_INET6_NEW_USER_ADDR, ia); - FREE(dp, M_IP6NDP); - dp = NULL; - ifafree(ifa); + nd6_dad_detach(dp, ifa); } } done: - return; + if (dp != NULL) + DAD_REMREF(dp); /* drop our reference */ } void -nd6_dad_duplicated( - struct ifaddr *ifa) +nd6_dad_duplicated(struct ifaddr *ifa, boolean_t dontignhwdup) { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; + struct ifnet *ifp = ifa->ifa_ifp; + int hwdupposs; dp = nd6_dad_find(ifa); if (dp == NULL) { @@ -1413,80 +1727,99 @@ nd6_dad_duplicated( return; } + hwdupposs = 0; + IFA_LOCK(&ia->ia_ifa); + DAD_LOCK(dp); log(LOG_ERR, "%s: DAD detected duplicate IPv6 address %s: " - "NS in/out=%d/%d, NA in=%d\n", - if_name(ifa->ifa_ifp), ip6_sprintf(&ia->ia_addr.sin6_addr), - dp->dad_ns_icount, dp->dad_ns_ocount, dp->dad_na_icount); - + "NS in/out=%d/%d, NA in=%d inx=%d\n", + if_name(ifp), ip6_sprintf(&ia->ia_addr.sin6_addr), + dp->dad_ns_icount, dp->dad_ns_ocount, dp->dad_na_icount, + dp->dad_na_ixcount); + hwdupposs = dp->dad_na_ixcount; + DAD_UNLOCK(dp); ia->ia6_flags &= ~IN6_IFF_TENTATIVE; ia->ia6_flags |= IN6_IFF_DUPLICATED; + IFA_UNLOCK(&ia->ia_ifa); /* We are done with DAD, with duplicated address found. (failure) */ untimeout((void (*)(void *))nd6_dad_timer, (void *)ifa); - + IFA_LOCK(&ia->ia_ifa); log(LOG_ERR, "%s: DAD complete for %s - duplicate found\n", - if_name(ifa->ifa_ifp), ip6_sprintf(&ia->ia_addr.sin6_addr)); + if_name(ifp), ip6_sprintf(&ia->ia_addr.sin6_addr)); log(LOG_ERR, "%s: manual intervention required\n", - if_name(ifa->ifa_ifp)); - - lck_mtx_lock(dad6_mutex); - TAILQ_REMOVE(&dadq, (struct dadq *)dp, dad_list); - lck_mtx_unlock(dad6_mutex); - FREE(dp, M_IP6NDP); - dp = NULL; - ifafree(ifa); + if_name(ifp)); + IFA_UNLOCK(&ia->ia_ifa); + + if (hwdupposs || + (dontignhwdup && IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr))) { + log(LOG_ERR, "%s: possible hardware address duplication " + "detected, disable IPv6\n", if_name(ifp)); + + lck_rw_lock_shared(nd_if_rwlock); + nd_ifinfo[ifp->if_index].flags |= + ND6_IFF_IFDISABLED; + lck_rw_done(nd_if_rwlock); + } + + /* Send an event to the configuration agent so that the + * duplicate address will be notified to the user and will + * be removed. 
+ */ + in6_post_msg(ifp, KEV_INET6_NEW_USER_ADDR, ia); + nd6_dad_detach(dp, ifa); + DAD_REMREF(dp); /* drop our reference */ } static void -nd6_dad_ns_output( - struct dadq *dp, - struct ifaddr *ifa) +nd6_dad_ns_output(struct dadq *dp, struct ifaddr *ifa) { struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct ifnet *ifp = ifa->ifa_ifp; + struct in6_addr taddr6; + DAD_LOCK(dp); dp->dad_ns_tcount++; if ((ifp->if_flags & IFF_UP) == 0) { -#if 0 - printf("%s: interface down?\n", if_name(ifp)); -#endif + DAD_UNLOCK(dp); return; } if ((ifp->if_flags & IFF_RUNNING) == 0) { -#if 0 - printf("%s: interface not running?\n", if_name(ifp)); -#endif + DAD_UNLOCK(dp); return; } dp->dad_ns_ocount++; - nd6_ns_output(ifp, NULL, &ia->ia_addr.sin6_addr, NULL, 1, 0); + DAD_UNLOCK(dp); + IFA_LOCK_SPIN(&ia->ia_ifa); + taddr6 = ia->ia_addr.sin6_addr; + IFA_UNLOCK(&ia->ia_ifa); + nd6_ns_output(ifp, NULL, &taddr6, NULL, 1); } static void -nd6_dad_ns_input( - struct ifaddr *ifa) +nd6_dad_ns_input(struct ifaddr *ifa) { - struct in6_ifaddr *ia; - const struct in6_addr *taddr6; struct dadq *dp; int duplicate; + struct ifnet *ifp; - if (!ifa) + if (ifa == NULL) panic("ifa == NULL in nd6_dad_ns_input"); - ia = (struct in6_ifaddr *)ifa; - taddr6 = &ia->ia_addr.sin6_addr; + ifp = ifa->ifa_ifp; duplicate = 0; dp = nd6_dad_find(ifa); /* Quickhack - completely ignore DAD NS packets */ if (dad_ignore_ns) { + struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; + IFA_LOCK(&ia->ia_ifa); nd6log((LOG_INFO, "nd6_dad_ns_input: ignoring DAD NS packet for " - "address %s(%s)\n", ip6_sprintf(taddr6), + "address %s(%s)\n", ip6_sprintf(&ia->ia_addr.sin6_addr), if_name(ifa->ifa_ifp))); + IFA_UNLOCK(&ia->ia_ifa); return; } @@ -1494,37 +1827,184 @@ nd6_dad_ns_input( * if I'm yet to start DAD, someone else started using this address * first. I have a duplicate and you win. */ - if (!dp || dp->dad_ns_ocount == 0) + if (dp != NULL) + DAD_LOCK(dp); + if (dp == NULL || dp->dad_ns_ocount == 0) duplicate++; /* XXX more checks for loopback situation - see nd6_dad_timer too */ if (duplicate) { - dp = NULL; /* will be freed in nd6_dad_duplicated() */ - nd6_dad_duplicated(ifa); - } else { + if (dp != NULL) { + DAD_UNLOCK(dp); + DAD_REMREF(dp); + dp = NULL; + } + nd6_dad_duplicated(ifa, TRUE); + } else if (dp != NULL) { /* * not sure if I got a duplicate. * increment ns count and see what happens. */ - if (dp) - dp->dad_ns_icount++; + dp->dad_ns_icount++; + DAD_UNLOCK(dp); + DAD_REMREF(dp); } } static void -nd6_dad_na_input( - struct ifaddr *ifa) +nd6_dad_na_input(struct ifaddr *ifa, caddr_t lladdr, int lladdrlen) { + struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa; struct dadq *dp; + int hwdupposs; - if (!ifa) + if (ifa == NULL) panic("ifa == NULL in nd6_dad_na_input"); dp = nd6_dad_find(ifa); - if (dp) - dp->dad_na_icount++; - + if (dp == NULL) { + log(LOG_ERR, "nd6_dad_na_input: DAD structure not found\n"); + return; + } + + /* + * If the address is a link-local address formed from an interface + * identifier based on the hardware address which is supposed to be + * uniquely assigned (e.g., EUI-64 for an Ethernet interface), IP + * operation on the interface SHOULD be disabled according to RFC 4862, + * section 5.4.5, but here we decide not to disable if the target + * hardware address is not also ours, which is a transitory possibility + * in the presence of network-resident sleep proxies on the local link. 
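For illustration (an example supplied here for clarity, not text from the original patch): an Ethernet MAC address of 00:11:22:33:44:55 produces the modified EUI-64 interface identifier 0211:22ff:fe33:4455 (the two bytes ff:fe are inserted in the middle of the MAC and the universal/local bit of the first octet is inverted), hence the link-local address fe80::211:22ff:fe33:4455. The in6_get_hw_ifid() call below regenerates this identifier, letting the code check whether the tentative address is actually derived from the interface's own hardware address before deciding to disable IPv6.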
+ */ + hwdupposs = 0; + IFA_LOCK(ifa); + if (IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) { + struct ifnet *ifp; + struct in6_addr in6; + + IFA_UNLOCK(ifa); + ifp = ifa->ifa_ifp; + + /* + * To avoid over-reaction, we only apply this logic when we are + * very sure that hardware addresses are supposed to be unique. + */ + switch (ifp->if_type) { + case IFT_BRIDGE: + case IFT_ETHER: + case IFT_FDDI: + case IFT_ATM: + case IFT_IEEE1394: +#ifdef IFT_IEEE80211 + case IFT_IEEE80211: +#endif + /* Check if our hardware address matches the target */ + if (lladdr != NULL && lladdrlen > 0) { + struct ifaddr *llifa; + struct sockaddr_dl *sdl; + + llifa = ifp->if_lladdr; + IFA_LOCK(llifa); + sdl = (struct sockaddr_dl *)llifa->ifa_addr; + if (lladdrlen == sdl->sdl_alen || + bcmp(lladdr, LLADDR(sdl), lladdrlen) == 0) + hwdupposs = 1; + IFA_UNLOCK(llifa); + } + in6 = ia->ia_addr.sin6_addr; + if (in6_get_hw_ifid(ifp, &in6) != 0) + break; + /* + * Apply this logic only to the EUI-64 form of + * link-local interface identifiers. + */ + IFA_LOCK(ifa); + if (hwdupposs && + !IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &in6)) { + hwdupposs = 0; + } else if (lladdr == NULL && + IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &in6)) { + /* + * We received a NA with no target link-layer + * address option. This means that someone else + * has our address. Mark it as a hardware + * duplicate so we disable IPv6 later on. + */ + hwdupposs = 1; + } + IFA_UNLOCK(ifa); + break; + default: + break; + } + } else { + IFA_UNLOCK(ifa); + } + + DAD_LOCK_SPIN(dp); + dp->dad_na_icount++; + if (hwdupposs) + dp->dad_na_ixcount++; + DAD_UNLOCK(dp); + DAD_REMREF(dp); + /* remove the address. */ - nd6_dad_duplicated(ifa); + nd6_dad_duplicated(ifa, FALSE); +} + +static void +dad_addref(struct dadq *dp, int locked) +{ + if (!locked) + DAD_LOCK_SPIN(dp); + else + DAD_LOCK_ASSERT_HELD(dp); + + if (++dp->dad_refcount == 0) { + panic("%s: dad %p wraparound refcnt\n", __func__, dp); + /* NOTREACHED */ + } + if (!locked) + DAD_UNLOCK(dp); +} + +static void +dad_remref(struct dadq *dp) +{ + struct ifaddr *ifa; + + DAD_LOCK_SPIN(dp); + if (dp->dad_refcount == 0) + panic("%s: dad %p negative refcnt\n", __func__, dp); + --dp->dad_refcount; + if (dp->dad_refcount > 0) { + DAD_UNLOCK(dp); + return; + } + DAD_UNLOCK(dp); + + if (dp->dad_attached || + dp->dad_list.tqe_next != NULL || dp->dad_list.tqe_prev != NULL) { + panic("%s: attached dad=%p is being freed", __func__, dp); + /* NOTREACHED */ + } + + if ((ifa = dp->dad_ifa) != NULL) { + IFA_REMREF(ifa); /* drop dad_ifa reference */ + dp->dad_ifa = NULL; + } + + lck_mtx_destroy(&dp->dad_lock, ifa_mtx_grp); + zfree(dad_zone, dp); +} + +void +nd6_llreach_set_reachable(struct ifnet *ifp, void *addr, unsigned int alen) +{ + /* Nothing more to do if it's disabled */ + if (nd6_llreach_base == 0) + return; + + ifnet_llreach_set_reachable(ifp, ETHERTYPE_IPV6, addr, alen); } diff --git a/bsd/netinet6/nd6_rtr.c b/bsd/netinet6/nd6_rtr.c index 10e965185..2e5c5eae5 100644 --- a/bsd/netinet6/nd6_rtr.c +++ b/bsd/netinet6/nd6_rtr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2008 Apple Inc. All rights reserved. + * Copyright (c) 2003-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -70,7 +70,11 @@ #include <sys/errno.h> #include <sys/syslog.h> #include <sys/queue.h> +#include <sys/mcache.h> + #include <kern/lock.h> +#include <kern/zalloc.h> +#include <machine/machine_routines.h> #include <net/if.h> #include <net/if_types.h> @@ -91,41 +95,170 @@ #define SDL(s) ((struct sockaddr_dl *)s) +static struct nd_defrouter *defrtrlist_update_common(struct nd_defrouter *, + boolean_t); static struct nd_defrouter *defrtrlist_update(struct nd_defrouter *); -static struct in6_ifaddr *in6_ifadd(struct nd_prefix *, - struct in6_addr *); + +static struct in6_ifaddr *in6_ifadd(struct nd_prefix *, int); +static void defrtrlist_sync(struct ifnet *); + +static void defrouter_select_common(struct ifnet *, int); + static struct nd_pfxrouter *pfxrtr_lookup(struct nd_prefix *, struct nd_defrouter *); static void pfxrtr_add(struct nd_prefix *, struct nd_defrouter *); static void pfxrtr_del(struct nd_pfxrouter *); static struct nd_pfxrouter *find_pfxlist_reachable_router(struct nd_prefix *); -static void defrouter_addifreq(struct ifnet *); static void nd6_rtmsg(int, struct rtentry *); -static void in6_init_address_ltimes(struct nd_prefix *ndpr, - struct in6_addrlifetime *lt6); +static int nd6_prefix_onlink_common(struct nd_prefix *, boolean_t, + unsigned int); +static struct nd_prefix *nd6_prefix_equal_lookup(struct nd_prefix *, boolean_t); +static void nd6_prefix_sync(struct ifnet *); + +static void in6_init_address_ltimes(struct nd_prefix *, + struct in6_addrlifetime *, boolean_t); static int rt6_deleteroute(struct radix_node *, void *); +static struct nd_defrouter *nddr_alloc(int); +static void nddr_free(struct nd_defrouter *); +static void nddr_trace(struct nd_defrouter *, int); + +static struct nd_prefix *ndpr_alloc(int); +static void ndpr_free(struct nd_prefix *); +static void ndpr_trace(struct nd_prefix *, int); + extern int nd6_recalc_reachtm_interval; static struct ifnet *nd6_defifp; int nd6_defifindex; +static unsigned int nd6_defrouter_genid; + +int ip6_use_tempaddr = 1; /* use temp addr by default for testing now */ -int ip6_use_tempaddr = 0; +int nd6_accept_6to4 = 1; int ip6_desync_factor; u_int32_t ip6_temp_preferred_lifetime = DEF_TEMP_PREFERRED_LIFETIME; u_int32_t ip6_temp_valid_lifetime = DEF_TEMP_VALID_LIFETIME; /* * shorter lifetimes for debugging purposes. -int ip6_temp_preferred_lifetime = 800; -static int ip6_temp_valid_lifetime = 1800; +u_int32_t ip6_temp_preferred_lifetime = 800; +static u_int32_t ip6_temp_valid_lifetime = 1800; */ int ip6_temp_regen_advance = TEMPADDR_REGEN_ADVANCE; extern lck_mtx_t *nd6_mutex; +/* Serialization variables for single thread access to nd_prefix */ +static boolean_t nd_prefix_busy; +static void *nd_prefix_waitchan = &nd_prefix_busy; +static int nd_prefix_waiters = 0; + +/* Serialization variables for single thread access to nd_defrouter */ +static boolean_t nd_defrouter_busy; +static void *nd_defrouter_waitchan = &nd_defrouter_busy; +static int nd_defrouter_waiters = 0; + +/* RTPREF_MEDIUM has to be 0! 
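These constants decode the 2-bit router preference (Prf) field defined by RFC 4191: binary 01 is high (+1), 00 is medium (0), 11 is low (-1), and 10 is reserved. Keeping RTPREF_MEDIUM at 0 means that a zeroed nd_defrouter, or an RA that sets no preference bits, naturally defaults to medium preference without special-casing; rtpref() further down relies on this.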
*/ +#define RTPREF_HIGH 1 +#define RTPREF_MEDIUM 0 +#define RTPREF_LOW (-1) +#define RTPREF_RESERVED (-2) +#define RTPREF_INVALID (-3) /* internal */ + +#define NDPR_TRACE_HIST_SIZE 32 /* size of trace history */ + +/* For gdb */ +__private_extern__ unsigned int ndpr_trace_hist_size = NDPR_TRACE_HIST_SIZE; + +struct nd_prefix_dbg { + struct nd_prefix ndpr_pr; /* nd_prefix */ + u_int16_t ndpr_refhold_cnt; /* # of ref */ + u_int16_t ndpr_refrele_cnt; /* # of rele */ + /* + * Circular lists of ndpr_addref and ndpr_remref callers. + */ + ctrace_t ndpr_refhold[NDPR_TRACE_HIST_SIZE]; + ctrace_t ndpr_refrele[NDPR_TRACE_HIST_SIZE]; +}; + +static unsigned int ndpr_debug; /* debug flags */ +static unsigned int ndpr_size; /* size of zone element */ +static struct zone *ndpr_zone; /* zone for nd_prefix */ + +#define NDPR_ZONE_MAX 64 /* maximum elements in zone */ +#define NDPR_ZONE_NAME "nd6_prefix" /* zone name */ + +#define NDDR_TRACE_HIST_SIZE 32 /* size of trace history */ + +/* For gdb */ +__private_extern__ unsigned int nddr_trace_hist_size = NDDR_TRACE_HIST_SIZE; + +struct nd_defrouter_dbg { + struct nd_defrouter nddr_dr; /* nd_defrouter */ + uint16_t nddr_refhold_cnt; /* # of ref */ + uint16_t nddr_refrele_cnt; /* # of rele */ + /* + * Circular lists of nddr_addref and nddr_remref callers. + */ + ctrace_t nddr_refhold[NDDR_TRACE_HIST_SIZE]; + ctrace_t nddr_refrele[NDDR_TRACE_HIST_SIZE]; +}; + +static unsigned int nddr_debug; /* debug flags */ +static unsigned int nddr_size; /* size of zone element */ +static struct zone *nddr_zone; /* zone for nd_defrouter */ + +#define NDDR_ZONE_MAX 64 /* maximum elements in zone */ +#define NDDR_ZONE_NAME "nd6_defrouter" /* zone name */ + +static unsigned int ndprtr_size; /* size of zone element */ +static struct zone *ndprtr_zone; /* zone for nd_pfxrouter */ + +#define NDPRTR_ZONE_MAX 64 /* maximum elements in zone */ +#define NDPRTR_ZONE_NAME "nd6_pfxrouter" /* zone name */ + +void +nd6_rtr_init(void) +{ + PE_parse_boot_argn("ifa_debug", &ndpr_debug, sizeof (ndpr_debug)); + PE_parse_boot_argn("ifa_debug", &nddr_debug, sizeof (nddr_debug)); + + ndpr_size = (ndpr_debug == 0) ? sizeof (struct nd_prefix) : + sizeof (struct nd_prefix_dbg); + ndpr_zone = zinit(ndpr_size, NDPR_ZONE_MAX * ndpr_size, 0, + NDPR_ZONE_NAME); + if (ndpr_zone == NULL) { + panic("%s: failed allocating %s", __func__, NDPR_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(ndpr_zone, Z_EXPAND, TRUE); + zone_change(ndpr_zone, Z_CALLERACCT, FALSE); + + nddr_size = (nddr_debug == 0) ? sizeof (struct nd_defrouter) : + sizeof (struct nd_defrouter_dbg); + nddr_zone = zinit(nddr_size, NDDR_ZONE_MAX * nddr_size, 0, + NDDR_ZONE_NAME); + if (nddr_zone == NULL) { + panic("%s: failed allocating %s", __func__, NDDR_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(nddr_zone, Z_EXPAND, TRUE); + zone_change(nddr_zone, Z_CALLERACCT, FALSE); + + ndprtr_size = sizeof (struct nd_pfxrouter); + ndprtr_zone = zinit(ndprtr_size, NDPRTR_ZONE_MAX * ndprtr_size, 0, + NDPRTR_ZONE_NAME); + if (ndprtr_zone == NULL) { + panic("%s: failed allocating %s", __func__, NDPRTR_ZONE_NAME); + /* NOTREACHED */ + } + zone_change(ndprtr_zone, Z_EXPAND, TRUE); + zone_change(ndprtr_zone, Z_CALLERACCT, FALSE); +} + /* * Receive Router Solicitation Message - just for routers.
* Router solicitation/advertisement is mostly managed by userland program @@ -143,17 +276,8 @@ nd6_rs_input( struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_router_solicit *nd_rs; struct in6_addr saddr6 = ip6->ip6_src; -#if 0 - struct in6_addr daddr6 = ip6->ip6_dst; -#endif char *lladdr = NULL; int lladdrlen = 0; -#if 0 - struct sockaddr_dl *sdl = (struct sockaddr_dl *)NULL; - struct llinfo_nd6 *ln = (struct llinfo_nd6 *)NULL; - struct rtentry *rt = NULL; - int is_newentry; -#endif union nd_opts ndopts; /* If I'm not a router, ignore it. */ @@ -170,11 +294,25 @@ nd6_rs_input( } /* - * Don't update the neighbor cache, if src = ::. - * This indicates that the src has no IP address assigned yet. - */ - if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) + * Don't update the neighbor cache, if src = :: or a non-neighbor. + * The former case indicates that the src has no IP address assigned + * yet. See nd6_ns_input() for the latter case. + */ + if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) goto freeit; + else { + struct sockaddr_in6 src_sa6; + + bzero(&src_sa6, sizeof(src_sa6)); + src_sa6.sin6_family = AF_INET6; + src_sa6.sin6_len = sizeof(src_sa6); + src_sa6.sin6_addr = ip6->ip6_src; + if (!nd6_is_addr_neighbor(&src_sa6, ifp, 0)) { + nd6log((LOG_INFO, "nd6_rs_input: " + "RS packet from non-neighbor\n")); + goto freeit; + } + } #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, icmp6len, return); @@ -238,14 +376,9 @@ nd6_ra_input( struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct nd_router_advert *nd_ra; struct in6_addr saddr6 = ip6->ip6_src; -#if 0 - struct in6_addr daddr6 = ip6->ip6_dst; - int flags; /* = nd_ra->nd_ra_flags_reserved; */ - int is_managed = ((flags & ND_RA_FLAG_MANAGED) != 0); - int is_other = ((flags & ND_RA_FLAG_OTHER) != 0); -#endif + int mcast = 0; union nd_opts ndopts; - struct nd_defrouter *dr; + struct nd_defrouter *dr = NULL; struct timeval timenow; getmicrotime(&timenow); @@ -292,20 +425,22 @@ nd6_ra_input( struct nd_defrouter dr0; u_int32_t advreachable = nd_ra->nd_ra_reachable; + /* remember if this is a multicasted advertisement */ + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) + mcast = 1; + lck_rw_lock_shared(nd_if_rwlock); if (ifp->if_index >= nd_ifinfo_indexlim) { lck_rw_done(nd_if_rwlock); goto freeit; } ndi = &nd_ifinfo[ifp->if_index]; + bzero(&dr0, sizeof (dr0)); dr0.rtaddr = saddr6; dr0.flags = nd_ra->nd_ra_flags_reserved; dr0.rtlifetime = ntohs(nd_ra->nd_ra_router_lifetime); dr0.expire = timenow.tv_sec + dr0.rtlifetime; dr0.ifp = ifp; - dr0.advint = 0; /* Mobile IPv6 */ - dr0.advint_expire = 0; /* Mobile IPv6 */ - dr0.advints_lost = 0; /* Mobile IPv6 */ /* unspecified or not? 
(RFC 2461 6.3.4) */ if (advreachable) { advreachable = ntohl(advreachable); @@ -322,7 +457,9 @@ nd6_ra_input( ndi->chlim = nd_ra->nd_ra_curhoplimit; lck_rw_done(nd_if_rwlock); ndi = NULL; + lck_mtx_lock(nd6_mutex); dr = defrtrlist_update(&dr0); + lck_mtx_unlock(nd6_mutex); } /* @@ -366,18 +503,9 @@ nd6_ra_input( continue; } - /* aggregatable unicast address, rfc2374 */ - if ((pi->nd_opt_pi_prefix.s6_addr8[0] & 0xe0) == 0x20 - && pi->nd_opt_pi_prefix_len != 64) { - nd6log((LOG_INFO, - "nd6_ra_input: invalid prefixlen " - "%d for rfc2374 prefix %s, ignored\n", - pi->nd_opt_pi_prefix_len, - ip6_sprintf(&pi->nd_opt_pi_prefix))); - continue; - } - bzero(&pr, sizeof(pr)); + lck_mtx_init(&pr.ndpr_lock, ifa_mtx_grp, ifa_mtx_attr); + NDPR_LOCK(&pr); pr.ndpr_prefix.sin6_family = AF_INET6; pr.ndpr_prefix.sin6_len = sizeof(pr.ndpr_prefix); pr.ndpr_prefix.sin6_addr = pi->nd_opt_pi_prefix; @@ -392,10 +520,35 @@ nd6_ra_input( pr.ndpr_pltime = ntohl(pi->nd_opt_pi_preferred_time); - if (in6_init_prefix_ltimes(&pr)) - continue; /* prefix lifetime init failed */ + /* + * Exceptions to stateless autoconfiguration processing: + * + nd6_accept_6to4 == 0 && address has 6to4 prefix + * + ip6_only_allow_rfc4193_prefix != 0 && address not RFC 4193 + */ + if (ip6_only_allow_rfc4193_prefix && + !IN6_IS_ADDR_UNIQUE_LOCAL(&pi->nd_opt_pi_prefix)) { + nd6log((LOG_INFO, + "nd6_ra_input: no SLAAC on prefix %s [not RFC 4193]\n", + ip6_sprintf(&pi->nd_opt_pi_prefix))); + pr.ndpr_raf_auto = 0; + } + else if (!nd6_accept_6to4 && + IN6_IS_ADDR_6TO4(&pi->nd_opt_pi_prefix)) { + nd6log((LOG_INFO, + "nd6_ra_input: no SLAAC on prefix %s [6to4]\n", + ip6_sprintf(&pi->nd_opt_pi_prefix))); + pr.ndpr_raf_auto = 0; + } - (void)prelist_update(&pr, dr, m); + if (in6_init_prefix_ltimes(&pr)) { + NDPR_UNLOCK(&pr); + lck_mtx_destroy(&pr.ndpr_lock, ifa_mtx_grp); + continue; /* prefix lifetime init failed */ + } else { + NDPR_UNLOCK(&pr); + } + (void)prelist_update(&pr, dr, m, mcast); + lck_mtx_destroy(&pr.ndpr_lock, ifa_mtx_grp); } } @@ -475,16 +628,20 @@ nd6_ra_input( * router's neighbor cache, which might also affect our on-link * detection of advertised prefixes. */ - pfxlist_onlink_check(0); + lck_mtx_lock(nd6_mutex); + pfxlist_onlink_check(); + lck_mtx_unlock(nd6_mutex); } freeit: m_freem(m); + if (dr) + NDDR_REMREF(dr); return; bad: icmp6stat.icp6s_badra++; - m_freem(m); + goto freeit; } /* @@ -503,13 +660,16 @@ nd6_rtmsg(cmd, rt) RT_LOCK_ASSERT_HELD(rt); bzero((caddr_t)&info, sizeof(info)); - /* Lock ifp for if_addrlist */ + /* Lock ifp for if_lladdr */ ifnet_lock_shared(ifp); info.rti_info[RTAX_DST] = rt_key(rt); info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; info.rti_info[RTAX_NETMASK] = rt_mask(rt); - info.rti_info[RTAX_IFP] = - TAILQ_FIRST(&ifp->if_addrlist)->ifa_addr; + /* + * ifa_addr pointers for both should always be valid + * in this context; no need to hold locks.
+ */ + info.rti_info[RTAX_IFP] = ifp->if_lladdr->ifa_addr; info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr; rt_missmsg(cmd, &info, rt->rt_flags, 0); @@ -517,11 +677,21 @@ nd6_rtmsg(cmd, rt) } void -defrouter_addreq( - struct nd_defrouter *new) +defrouter_addreq(struct nd_defrouter *new, boolean_t scoped) { struct sockaddr_in6 def, mask, gate; struct rtentry *newrt = NULL; + unsigned int ifscope; + int err; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); + + if (new->stateflags & NDDRF_INSTALLED) + return; + + nd6log2((LOG_INFO, "%s: adding default router %s, scoped=%d, " + "static=%d\n", if_name(new->ifp), ip6_sprintf(&new->rtaddr), + scoped, (new->stateflags & NDDRF_STATIC) ? 1 : 0)); bzero(&def, sizeof(def)); bzero(&mask, sizeof(mask)); @@ -532,65 +702,28 @@ defrouter_addreq( def.sin6_family = mask.sin6_family = gate.sin6_family = AF_INET6; gate.sin6_addr = new->rtaddr; - (void) rtrequest(RTM_ADD, (struct sockaddr *)&def, + ifscope = scoped ? new->ifp->if_index : IFSCOPE_NONE; + + err = rtrequest_scoped(RTM_ADD, (struct sockaddr *)&def, (struct sockaddr *)&gate, (struct sockaddr *)&mask, - RTF_GATEWAY, &newrt); + RTF_GATEWAY, &newrt, ifscope); + if (newrt) { RT_LOCK(newrt); nd6_rtmsg(RTM_ADD, newrt); /* tell user process */ RT_REMREF_LOCKED(newrt); RT_UNLOCK(newrt); - } - return; -} - -/* Add a route to a given interface as default */ -void -defrouter_addifreq( - struct ifnet *ifp) -{ - struct sockaddr_in6 def, mask; - struct ifaddr *ifa = NULL; - struct rtentry *newrt = NULL; - int error; - u_int32_t flags; - - bzero(&def, sizeof(def)); - bzero(&mask, sizeof(mask)); - - def.sin6_len = mask.sin6_len = sizeof(struct sockaddr_in6); - def.sin6_family = mask.sin6_family = AF_INET6; - - /* - * Search for an ifaddr beloging to the specified interface. - * XXX: An IPv6 address are required to be assigned on the interface. - */ - if ((ifa = ifaof_ifpforaddr((struct sockaddr *)&def, ifp)) == NULL) { - nd6log((LOG_ERR, /* better error? */ - "defrouter_addifreq: failed to find an ifaddr " - "to install a route to interface %s\n", - if_name(ifp))); - return; - } - - flags = ifa->ifa_flags; - error = rtrequest(RTM_ADD, (struct sockaddr *)&def, ifa->ifa_addr, - (struct sockaddr *)&mask, flags, &newrt); - if (error != 0) { - nd6log((LOG_ERR, - "defrouter_addifreq: failed to install a route to " - "interface %s (errno = %d)\n", - if_name(ifp), error)); + new->stateflags |= NDDRF_INSTALLED; + if (ifscope != IFSCOPE_NONE) + new->stateflags |= NDDRF_IFSCOPE; + new->genid = nd6_defrouter_genid; } else { - if (newrt) { - RT_LOCK(newrt); - nd6_rtmsg(RTM_ADD, newrt); - RT_REMREF_LOCKED(newrt); - RT_UNLOCK(newrt); - } - in6_post_msg(ifp, KEV_INET6_DEFROUTER, (struct in6_ifaddr *)ifa); + nd6log((LOG_ERR, "%s: failed to add default router " + "%s on %s scoped %d (errno = %d)\n", __func__, + ip6_sprintf(&gate.sin6_addr), if_name(new->ifp), + (ifscope != IFSCOPE_NONE), err)); } - ifafree(ifa); + new->err = err; } struct nd_defrouter * @@ -600,25 +733,47 @@ defrouter_lookup( { struct nd_defrouter *dr; - lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); for (dr = TAILQ_FIRST(&nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { + NDDR_LOCK(dr); + if (dr->ifp == ifp && IN6_ARE_ADDR_EQUAL(addr, &dr->rtaddr)) { + NDDR_ADDREF_LOCKED(dr); + NDDR_UNLOCK(dr); return(dr); + } + NDDR_UNLOCK(dr); } - return(NULL); /* search failed */ + return (NULL); /* search failed */ } +/* + * Remove the default route for a given router.
+ * This is just a subroutine function for defrouter_select(), and should + * not be called from anywhere else. + */ void -defrouter_delreq( - struct nd_defrouter *dr, - int dofree) +defrouter_delreq(struct nd_defrouter *dr) { struct sockaddr_in6 def, mask, gate; struct rtentry *oldrt = NULL; + unsigned int ifscope; + int err; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); + + /* ifp would be NULL for the "drany" case */ + if (dr->ifp != NULL && !(dr->stateflags & NDDRF_INSTALLED)) + return; + + NDDR_LOCK_ASSERT_HELD(dr); + + nd6log2((LOG_INFO, "%s: removing default router %s, scoped=%d, " + "static=%d\n", dr->ifp != NULL ? if_name(dr->ifp) : "ANY", + ip6_sprintf(&dr->rtaddr), (dr->stateflags & NDDRF_IFSCOPE) ? 1 : 0, + (dr->stateflags & NDDRF_STATIC) ? 1 : 0)); bzero(&def, sizeof(def)); bzero(&mask, sizeof(mask)); @@ -629,28 +784,155 @@ defrouter_delreq( def.sin6_family = mask.sin6_family = gate.sin6_family = AF_INET6; gate.sin6_addr = dr->rtaddr; - (void) rtrequest(RTM_DELETE, (struct sockaddr *)&def, - (struct sockaddr *)&gate, (struct sockaddr *)&mask, - RTF_GATEWAY, &oldrt); + if (dr->ifp != NULL) { + ifscope = (dr->stateflags & NDDRF_IFSCOPE) ? + dr->ifp->if_index : IFSCOPE_NONE; + } else { + ifscope = IFSCOPE_NONE; + } + err = rtrequest_scoped(RTM_DELETE, + (struct sockaddr *)&def, (struct sockaddr *)&gate, + (struct sockaddr *)&mask, RTF_GATEWAY, &oldrt, ifscope); + if (oldrt) { RT_LOCK(oldrt); nd6_rtmsg(RTM_DELETE, oldrt); RT_UNLOCK(oldrt); rtfree(oldrt); + } else if (err != ESRCH) { + nd6log((LOG_ERR, "%s: failed to delete default router " + "%s on %s scoped %d (errno = %d)\n", __func__, + ip6_sprintf(&gate.sin6_addr), dr->ifp != NULL ? + if_name(dr->ifp) : "ANY", (ifscope != IFSCOPE_NONE), err)); + } + /* ESRCH means it's no longer in the routing table; ignore it */ + if (oldrt != NULL || err == ESRCH) { + dr->stateflags &= ~NDDRF_INSTALLED; + if (ifscope != IFSCOPE_NONE) + dr->stateflags &= ~NDDRF_IFSCOPE; + } + dr->err = 0; +} + + +/* + * remove all default routes from default router list + */ +void +defrouter_reset(void) +{ + struct nd_defrouter *dr, drany; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + + dr = TAILQ_FIRST(&nd_defrouter); + while (dr) { + NDDR_LOCK(dr); + if (dr->stateflags & NDDRF_INSTALLED) { + NDDR_ADDREF_LOCKED(dr); + NDDR_UNLOCK(dr); + lck_mtx_unlock(nd6_mutex); + NDDR_LOCK(dr); + defrouter_delreq(dr); + NDDR_UNLOCK(dr); + lck_mtx_lock(nd6_mutex); + NDDR_REMREF(dr); + dr = TAILQ_FIRST(&nd_defrouter); + } else { + NDDR_UNLOCK(dr); + dr = TAILQ_NEXT(dr, dr_entry); + } + } + + /* Nuke primary (non-scoped) default router */ + if (ip6_doscopedroute) { + bzero(&drany, sizeof (drany)); + lck_mtx_init(&drany.nddr_lock, ifa_mtx_grp, ifa_mtx_attr); + lck_mtx_unlock(nd6_mutex); + NDDR_LOCK(&drany); + defrouter_delreq(&drany); + NDDR_UNLOCK(&drany); + lck_mtx_destroy(&drany.nddr_lock, ifa_mtx_grp); + lck_mtx_lock(nd6_mutex); + } + +} + +int +defrtrlist_ioctl(u_long cmd, caddr_t data) +{ + struct in6_defrouter_32 *r_32 = (struct in6_defrouter_32 *)data; + struct in6_defrouter_64 *r_64 = (struct in6_defrouter_64 *)data; + struct nd_defrouter dr0; + unsigned int ifindex; + struct ifnet *dr_ifp; + int error = 0, add = 0; + + switch (cmd) { + case SIOCDRADD_IN6_32: + case SIOCDRADD_IN6_64: + ++add; + /* FALLTHRU */ + case SIOCDRDEL_IN6_32: + case SIOCDRDEL_IN6_64: + bzero(&dr0, sizeof (dr0)); + if (cmd == SIOCDRADD_IN6_64 || cmd == SIOCDRDEL_IN6_64) { + dr0.rtaddr = r_64->rtaddr.sin6_addr; + dr0.flags = r_64->flags; + ifindex = r_64->if_index; + } else {
+ dr0.rtaddr = r_32->rtaddr.sin6_addr; + dr0.flags = r_32->flags; + ifindex = r_32->if_index; + } + ifnet_head_lock_shared(); + /* Don't need to check if ifindex is < 0 since it's unsigned */ + if (if_index < ifindex || + (dr_ifp = ifindex2ifnet[ifindex]) == NULL) { + ifnet_head_done(); + error = EINVAL; + break; + } + dr0.ifp = dr_ifp; + ifnet_head_done(); + + if (IN6_IS_SCOPE_EMBED(&dr0.rtaddr)) { + uint16_t *scope = &dr0.rtaddr.s6_addr16[1]; + + if (*scope == 0) { + *scope = htons(dr_ifp->if_index); + } else if (*scope != htons(dr_ifp->if_index)) { + error = EINVAL; + break; + } + } + + if (add) + error = defrtrlist_add_static(&dr0); + if (!add || error != 0) { + int err = defrtrlist_del_static(&dr0); + if (!add) + error = err; + } + break; + + default: + error = EOPNOTSUPP; /* check for safety */ + break; } - if (dofree) /* XXX: necessary? */ - FREE(dr, M_IP6NDP); + return (error); } void -defrtrlist_del( - struct nd_defrouter *dr, int nd6locked) +defrtrlist_del(struct nd_defrouter *dr) { struct nd_defrouter *deldr = NULL; struct nd_prefix *pr; struct ifnet *ifp = dr->ifp; + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + /* * Flush all the routing table entries that use the router * as a next hop. @@ -658,38 +940,60 @@ defrtrlist_del( if (!ip6_forwarding && (ip6_accept_rtadv || (ifp->if_eflags & IFEF_ACCEPT_RTADVD))) { /* above is a good condition? */ + NDDR_ADDREF(dr); + lck_mtx_unlock(nd6_mutex); rt6_flush(&dr->rtaddr, ifp); + lck_mtx_lock(nd6_mutex); + NDDR_REMREF(dr); } - if (nd6locked == 0) - lck_mtx_lock(nd6_mutex); if (dr == TAILQ_FIRST(&nd_defrouter)) deldr = dr; /* The router is primary. */ TAILQ_REMOVE(&nd_defrouter, dr, dr_entry); + ++nd6_defrouter_genid; + + nd6log2((LOG_INFO, "%s: freeing defrouter %s\n", if_name(dr->ifp), + ip6_sprintf(&dr->rtaddr))); + + /* + * Delete it from the routing table. + */ + NDDR_ADDREF(dr); + lck_mtx_unlock(nd6_mutex); + NDDR_LOCK(dr); + defrouter_delreq(dr); + NDDR_UNLOCK(dr); + lck_mtx_lock(nd6_mutex); + NDDR_REMREF(dr); /* * Also delete all the pointers to the router in each prefix list. */ for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { struct nd_pfxrouter *pfxrtr; + + NDPR_LOCK(pr); if ((pfxrtr = pfxrtr_lookup(pr, dr)) != NULL) pfxrtr_del(pfxrtr); + NDPR_UNLOCK(pr); } - pfxlist_onlink_check(1); + + pfxlist_onlink_check(); /* - * If the router is the primary one, choose a new one. - * Note that defrouter_select() will remove the current gateway - * from the routing table. + * If the router is the primary one, choose a new one. If Scoped + * Routing is enabled, always try to pick another eligible router + * on this interface.
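To make the scoped/primary distinction concrete, the following is a minimal illustrative sketch (not part of the patch; the helper name is invented) of installing a default route either as the primary entry or as an interface-scoped entry, using the same rtrequest_scoped() call that defrouter_addreq() above uses; error handling is elided:

static void
example_add_default_route(struct nd_defrouter *dr, boolean_t scoped)
{
	struct sockaddr_in6 def, mask, gate;
	struct rtentry *rt = NULL;
	unsigned int ifscope;

	bzero(&def, sizeof (def));
	bzero(&mask, sizeof (mask));
	bzero(&gate, sizeof (gate));
	def.sin6_len = mask.sin6_len = gate.sin6_len =
	    sizeof (struct sockaddr_in6);
	def.sin6_family = mask.sin6_family = gate.sin6_family = AF_INET6;
	gate.sin6_addr = dr->rtaddr;

	/* IFSCOPE_NONE installs the primary (unscoped) default route */
	ifscope = scoped ? dr->ifp->if_index : IFSCOPE_NONE;
	(void) rtrequest_scoped(RTM_ADD, (struct sockaddr *)&def,
	    (struct sockaddr *)&gate, (struct sockaddr *)&mask,
	    RTF_GATEWAY, &rt, ifscope);
	if (rt != NULL) {
		RT_LOCK(rt);
		RT_REMREF_LOCKED(rt); /* drop ref returned by rtrequest_scoped */
		RT_UNLOCK(rt);
	}
}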
*/ - if (deldr) - defrouter_select(); + if ((deldr || ip6_doscopedroute) && !ip6_forwarding && + (ip6_accept_rtadv || (ifp->if_eflags & IFEF_ACCEPT_RTADVD))) + defrouter_select(ifp); lck_rw_lock_shared(nd_if_rwlock); if (ifp->if_index < nd_ifinfo_indexlim) { struct nd_ifinfo *ndi = &nd_ifinfo[ifp->if_index]; - ndi->ndefrouters--; + atomic_add_32(&ndi->ndefrouters, -1); if (ndi->ndefrouters < 0) { log(LOG_WARNING, "defrtrlist_del: negative " "count on %s\n", if_name(ifp)); @@ -697,177 +1001,822 @@ defrtrlist_del( } lck_rw_done(nd_if_rwlock); - if (nd6locked == 0) - lck_mtx_unlock(nd6_mutex); + NDDR_REMREF(dr); /* remove list reference */ +} + +int +defrtrlist_add_static(struct nd_defrouter *new) +{ + struct nd_defrouter *dr; + int err = 0; + + new->rtlifetime = -1; + new->stateflags |= NDDRF_STATIC; + + /* we only want the preference level */ + new->flags &= ND_RA_FLAG_RTPREF_MASK; + + lck_mtx_lock(nd6_mutex); + dr = defrouter_lookup(&new->rtaddr, new->ifp); + if (dr != NULL && !(dr->stateflags & NDDRF_STATIC)) { + err = EINVAL; + } else { + if (dr != NULL) + NDDR_REMREF(dr); + dr = defrtrlist_update(new); + if (dr != NULL) + err = dr->err; + else + err = ENOMEM; + } + if (dr != NULL) + NDDR_REMREF(dr); + lck_mtx_unlock(nd6_mutex); + + return (err); +} + +int +defrtrlist_del_static(struct nd_defrouter *new) +{ + struct nd_defrouter *dr; + + lck_mtx_lock(nd6_mutex); + dr = defrouter_lookup(&new->rtaddr, new->ifp); + if (dr == NULL || !(dr->stateflags & NDDRF_STATIC)) { + if (dr != NULL) + NDDR_REMREF(dr); + dr = NULL; + } else { + defrtrlist_del(dr); + NDDR_REMREF(dr); + } + lck_mtx_unlock(nd6_mutex); - FREE(dr, M_IP6NDP); + return (dr != NULL ? 0 : EINVAL); +} + +/* + * for default router selection + * regards router-preference field as a 2-bit signed integer + */ +static int +rtpref(struct nd_defrouter *dr) +{ + switch (dr->flags & ND_RA_FLAG_RTPREF_MASK) { + case ND_RA_FLAG_RTPREF_HIGH: + return (RTPREF_HIGH); + case ND_RA_FLAG_RTPREF_MEDIUM: + case ND_RA_FLAG_RTPREF_RSV: + return (RTPREF_MEDIUM); + case ND_RA_FLAG_RTPREF_LOW: + return (RTPREF_LOW); + default: + /* + * This case should never happen. If it did, it would mean a + * serious bug of kernel internal. We thus always bark here. + * Or, can we even panic? + */ + log(LOG_ERR, "rtpref: impossible RA flag %x\n", dr->flags); + return (RTPREF_INVALID); + } + /* NOTREACHED */ } /* - * Default Router Selection according to Section 6.3.6 of RFC 2461: - * 1) Routers that are reachable or probably reachable should be - * preferred. + * Default Router Selection according to Section 6.3.6 of RFC 2461 and + * draft-ietf-ipngwg-router-selection: + * + * 1) Routers that are reachable or probably reachable should be preferred. + * If we have more than one (probably) reachable router, prefer ones + * with the highest router preference. * 2) When no routers on the list are known to be reachable or * probably reachable, routers SHOULD be selected in a round-robin - * fashion. + * fashion, regardless of router preference values. * 3) If the Default Router List is empty, assume that all * destinations are on-link. + * + * When Scoped Routing is enabled, the selection logic is amended as follows: + * + * a) When a default interface is specified, the primary/non-scoped default + * router will be set to the reachable router on that link (if any) with + * the highest router preference. 
+ b) When there is more than one router on the same link, the one with + the highest router preference will be installed, either as a scoped or + non-scoped route entry. If they all share the same preference value, + the one installed will be the static one or the first encountered reachable + router, i.e. a static entry wins over a dynamic one. + c) When no routers on the list are known to be reachable, or probably + reachable, no round-robin selection will take place when the default + interface is set. + + We assume nd_defrouter is sorted by router preference value. + Since the code below covers both with and without router preference cases, + we do not need to classify the cases by ifdef. */ -void -defrouter_select() +static void +defrouter_select_common(struct ifnet *ifp, int ignore) { - struct nd_defrouter *dr, anydr; + struct nd_defrouter *dr, *selected_dr = NULL, *installed_dr = NULL; + struct nd_defrouter *installed_dr0 = NULL; struct rtentry *rt = NULL; struct llinfo_nd6 *ln = NULL; + int update = 0; + boolean_t found_installedrt = FALSE; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); /* - * Search for a (probably) reachable router from the list. + * This function should be called only when acting as an autoconfigured + * host. Although the remaining part of this function is not effective + * if the node is not an autoconfigured host, we explicitly exclude + * such cases here for safety. */ - lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + if (ip6_forwarding || (!ignore && !ip6_accept_rtadv && + !(ifp->if_eflags & IFEF_ACCEPT_RTADVD))) { + nd6log((LOG_WARNING, + "defrouter_select: called unexpectedly (forwarding=%d, " + "accept_rtadv=%d)\n", ip6_forwarding, ip6_accept_rtadv)); + return; + } + + /* + * Let's handle easy case (3) first: + * If default router list is empty, there's nothing to be done. + */ + if (!TAILQ_FIRST(&nd_defrouter)) + return; + + /* + * Due to the number of times we drop nd6_mutex, we need to + * serialize this function. + */ + while (nd_defrouter_busy) { + nd_defrouter_waiters++; + msleep(nd_defrouter_waitchan, nd6_mutex, (PZERO-1), + __func__, NULL); + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + } + nd_defrouter_busy = TRUE; + /* + * Search for a (probably) reachable router from the list. + * We just pick up the first reachable one (if any), relying on + * the ordering rule of the list described in defrtrlist_update(). + * + * For all intents and purposes of Scoped Routing: + * selected_dr = candidate for primary router + * installed_dr = currently installed primary router + */ for (dr = TAILQ_FIRST(&nd_defrouter); dr; dr = TAILQ_NEXT(dr, dr_entry)) { + boolean_t reachable; + /* Callee returns a locked route upon success */ + reachable = FALSE; + NDDR_ADDREF(dr); /* for this for loop */ + lck_mtx_unlock(nd6_mutex); if ((rt = nd6_lookup(&dr->rtaddr, 0, dr->ifp, 0)) != NULL) { RT_LOCK_ASSERT_HELD(rt); if ((ln = rt->rt_llinfo) != NULL && ND6_IS_LLINFO_PROBREACH(ln)) { - RT_REMREF_LOCKED(rt); - RT_UNLOCK(rt); - /* Got it, and move it to the head */ - TAILQ_REMOVE(&nd_defrouter, dr, dr_entry); - TAILQ_INSERT_HEAD(&nd_defrouter, dr, dr_entry); - break; + reachable = TRUE; + if (selected_dr == NULL && + (!ip6_doscopedroute || + dr->ifp == nd6_defifp)) { + selected_dr = dr; + NDDR_ADDREF(selected_dr); + } } RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); + rt = NULL; } + lck_mtx_lock(nd6_mutex); - if ((dr = TAILQ_FIRST(&nd_defrouter))) { - /* - * De-install the previous default gateway and install - * a new one.
- * Note that if there is no reachable router in the list, - * the head entry will be used anyway. - * XXX: do we have to check the current routing table entry? - */ - bzero(&anydr, sizeof(anydr)); - defrouter_delreq(&anydr, 0); - defrouter_addreq(dr); - } - else { - /* - * The Default Router List is empty, so install the default - * route to an inteface. - * XXX: The specification does not say this mechanism should - * be restricted to hosts, but this would be not useful - * (even harmful) for routers. - */ - if (!ip6_forwarding) { + /* Handle case (b) */ + if (ip6_doscopedroute && dr->ifp == nd6_defifp && + (selected_dr == NULL || rtpref(dr) > rtpref(selected_dr) || + (rtpref(dr) == rtpref(selected_dr) && + (dr->stateflags & NDDRF_STATIC) && + !(selected_dr->stateflags & NDDRF_STATIC)))) { + if (selected_dr) + NDDR_REMREF(selected_dr); + selected_dr = dr; + NDDR_ADDREF(selected_dr); + } + + if (!(dr->stateflags & NDDRF_INSTALLED)) { /* - * De-install the current default route - * in advance. + * If the router hasn't been installed and it is + * reachable, try to install it later on below. + * If it's static, try to install it anyway. */ - bzero(&anydr, sizeof(anydr)); - defrouter_delreq(&anydr, 0); - if (nd6_defifp) { - /* - * Install a route to the default interface - * as default route. - * XXX: we enable this for host only, because - * this may override a default route installed - * a user process (e.g. routing daemon) in a - * router case. - */ - defrouter_addifreq(nd6_defifp); + if (reachable || (dr->stateflags & NDDRF_STATIC)) { + dr->genid = -1; + ++update; + nd6log2((LOG_INFO, "%s: possible router %s, " + "scoped=%d, static=%d\n", if_name(dr->ifp), + ip6_sprintf(&dr->rtaddr), + (dr->stateflags & NDDRF_IFSCOPE) ? 1 : 0, + (dr->stateflags & NDDRF_STATIC) ? 1 : 0)); + } + NDDR_REMREF(dr); /* for this for loop */ + continue; + } + + /* Record the currently installed primary/non-scoped router */ + if (!ip6_doscopedroute || !(dr->stateflags & NDDRF_IFSCOPE)) { + if (installed_dr == NULL) { + installed_dr = dr; + NDDR_ADDREF(installed_dr); } else { - nd6log((LOG_INFO, "defrouter_select: " - "there's no default router and no default" - " interface\n")); + /* this should not happen; warn for diagnosis */ + log(LOG_ERR, "defrouter_select: more than one " + "%s default router is installed\n", + ip6_doscopedroute ? "non-scoped" : ""); } } + NDDR_REMREF(dr); /* for this for loop */ } - return; -} + /* If none was selected, use the currently installed one */ + if (ip6_doscopedroute && selected_dr == NULL && installed_dr != NULL) { + selected_dr = installed_dr; + NDDR_ADDREF(selected_dr); + } -static struct nd_defrouter * -defrtrlist_update( - struct nd_defrouter *new) -{ - struct nd_defrouter *dr, *n; - struct ifnet *ifp = new->ifp; - struct nd_ifinfo *ndi; + /* + * Install the unreachable one(s) if necessary.
+ */ + for (dr = TAILQ_FIRST(&nd_defrouter); dr; + dr = TAILQ_NEXT(dr, dr_entry)) { + struct nd_defrouter *_dr; - lck_mtx_lock(nd6_mutex); - if ((dr = defrouter_lookup(&new->rtaddr, ifp)) != NULL) { - /* entry exists */ - if (new->rtlifetime == 0) { - defrtrlist_del(dr, 1); - dr = NULL; - } else { - /* override */ - dr->flags = new->flags; /* xxx flag check */ + if (!ip6_doscopedroute) + break; + + NDDR_LOCK(dr); + + /* If already (or will be) installed, skip */ + if ((dr->stateflags & NDDRF_INSTALLED) || dr->genid == -1) { + NDDR_UNLOCK(dr); + continue; + } + + /* See if there is already a default router for the link */ + for (_dr = TAILQ_FIRST(&nd_defrouter); _dr; + _dr = TAILQ_NEXT(_dr, dr_entry)) { + if (_dr != dr) + NDDR_LOCK(_dr); + if (_dr == dr || _dr->ifp != dr->ifp) { + if (_dr != dr) + NDDR_UNLOCK(_dr); + continue; + } + + if ((_dr->stateflags & NDDRF_INSTALLED) || + _dr->genid == -1) { + if (_dr != dr) + NDDR_UNLOCK(_dr); + break; + } + if (_dr != dr) + NDDR_UNLOCK(_dr); + } + + /* If none so far, schedule it to be installed below */ + if (_dr == NULL) { + dr->genid = -1; + ++update; + nd6log2((LOG_INFO, "%s: possible router %s, " + "static=%d (unreachable)\n", if_name(dr->ifp), + ip6_sprintf(&dr->rtaddr), + (dr->stateflags & NDDRF_STATIC) ? 1 : 0)); + } + NDDR_UNLOCK(dr); + } + + dr = selected_dr; + if (dr != NULL) { + nd6log2((LOG_INFO, "%s: considering primary default router %s, " + "static=%d [round 1]\n", if_name(dr->ifp), + ip6_sprintf(&dr->rtaddr), + (dr->stateflags & NDDRF_STATIC) ? 1 : 0)); + } + + /* + * If none of the default routers was found to be reachable, + * round-robin the list regardless of preference, except when + * Scoped Routing is enabled per case (c). + * + * Otherwise, if we have an installed router, check if the selected + * (reachable) router should really be preferred to the installed one. + * We only prefer the new router when the old one is not reachable + * or when the new one has a really higher preference value. + */ + if (!ip6_doscopedroute && selected_dr == NULL) { + if (installed_dr == NULL || + !TAILQ_NEXT(installed_dr, dr_entry)) { + selected_dr = TAILQ_FIRST(&nd_defrouter); + if (selected_dr) + NDDR_ADDREF(selected_dr); + } else { + selected_dr = TAILQ_NEXT(installed_dr, dr_entry); + if (selected_dr) + NDDR_ADDREF(selected_dr); + } + } else if (selected_dr != NULL && installed_dr != NULL) { + lck_mtx_unlock(nd6_mutex); + rt = nd6_lookup(&installed_dr->rtaddr, 0, installed_dr->ifp, 0); + if (rt) { + RT_LOCK_ASSERT_HELD(rt); + if ((ln = (struct llinfo_nd6 *)rt->rt_llinfo) && + ND6_IS_LLINFO_PROBREACH(ln) && + (!ip6_doscopedroute || + installed_dr->ifp == nd6_defifp) && + rtpref(selected_dr) <= rtpref(installed_dr)) { + NDDR_REMREF(selected_dr); + selected_dr = installed_dr; + NDDR_ADDREF(selected_dr); + } + RT_REMREF_LOCKED(rt); + RT_UNLOCK(rt); + rt = NULL; + found_installedrt = TRUE; + } + lck_mtx_lock(nd6_mutex); + } + + if (ip6_doscopedroute) { + /* + * If the installed primary router is not on the current + * IPv6 default interface, demote it to a scoped entry. + */ + if (installed_dr != NULL && installed_dr->ifp != nd6_defifp && + !(installed_dr->stateflags & NDDRF_IFSCOPE)) { + if (selected_dr != NULL && + selected_dr->ifp != nd6_defifp) { + NDDR_REMREF(selected_dr); + selected_dr = NULL; + } + ++update; + } + + /* + * If the selected router is currently scoped, make sure + * we update (it needs to be promoted to primary.) 
+ */ + if (selected_dr != NULL && + (selected_dr->stateflags & NDDRF_IFSCOPE)) + ++update; + + /* + * If the installed router is no longer reachable, remove + * it and install the selected router instead. + */ + if (installed_dr != NULL && selected_dr != NULL && + installed_dr != selected_dr && found_installedrt == FALSE) { + installed_dr0 = installed_dr; /* skip it below */ + /* NB: we previously referenced installed_dr */ + installed_dr = NULL; + selected_dr->genid = -1; + ++update; + } + } + + /* + * If Scoped Routing is enabled and there's nothing to update, + * just return. Otherwise, if Scoped Routing is disabled and if + * the selected router is different than the installed one, + * remove the installed router and install the selected one. + */ + dr = selected_dr; + VERIFY(dr != NULL || ip6_doscopedroute); + if (!ip6_doscopedroute || !update) { + if (dr == NULL) + goto out; + + if (dr != installed_dr) { + nd6log2((LOG_INFO, "%s: no update, selected router %s, " + "installed router %s\n", if_name(dr->ifp), + ip6_sprintf(&dr->rtaddr), installed_dr != NULL ? + ip6_sprintf(&installed_dr->rtaddr) : "NONE")); + } else { + nd6log2((LOG_INFO, "%s: no update, router is %s\n", + if_name(dr->ifp), ip6_sprintf(&dr->rtaddr))); + } + if (!ip6_doscopedroute && installed_dr != dr) { + /* + * No need to ADDREF dr because at this point + * dr points to selected_dr, which already holds + * a reference. + */ + lck_mtx_unlock(nd6_mutex); + if (installed_dr) { + NDDR_LOCK(installed_dr); + defrouter_delreq(installed_dr); + NDDR_UNLOCK(installed_dr); + } + NDDR_LOCK(dr); + defrouter_addreq(dr, FALSE); + NDDR_UNLOCK(dr); + lck_mtx_lock(nd6_mutex); + } + goto out; + } + + /* + * Scoped Routing is enabled and we need to update. The selected + * router needs to be installed as primary/non-scoped entry. If + * there is any existing entry that is non-scoped, remove it from + * the routing table and reinstall it as scoped entry. + */ + if (dr != NULL) { + nd6log2((LOG_INFO, "%s: considering primary default router %s, " + "static=%d [round 2]\n", if_name(dr->ifp), + ip6_sprintf(&dr->rtaddr), + (dr->stateflags & NDDRF_STATIC) ? 1 : 0)); + } + + /* + * In the following while loops we use two flags: + * dr->genid + * NDDRF_PROCESSED + * + * genid is used to skip entries that are not to be added/removed in the + * second while loop. + * NDDRF_PROCESSED is used to skip entries that were already processed. + * This is necessary because we drop the nd6_mutex and start the while + * loop again. + */ + TAILQ_FOREACH(dr, &nd_defrouter, dr_entry) { + NDDR_LOCK(dr); + VERIFY((dr->stateflags & NDDRF_PROCESSED) == 0); + NDDR_UNLOCK(dr); + } + /* Remove conflicting entries */ + dr = TAILQ_FIRST(&nd_defrouter); + while (dr) { + NDDR_LOCK(dr); + if (!(dr->stateflags & NDDRF_INSTALLED) || + dr->stateflags & NDDRF_PROCESSED) { + NDDR_UNLOCK(dr); + dr = TAILQ_NEXT(dr, dr_entry); + continue; + } + dr->stateflags |= NDDRF_PROCESSED; + + /* A NULL selected_dr will remove primary default route */ + if ((dr == selected_dr && (dr->stateflags & NDDRF_IFSCOPE)) || + (dr != selected_dr && !(dr->stateflags & NDDRF_IFSCOPE))) { + NDDR_ADDREF_LOCKED(dr); + NDDR_UNLOCK(dr); + lck_mtx_unlock(nd6_mutex); + NDDR_LOCK(dr); + defrouter_delreq(dr); + NDDR_UNLOCK(dr); + lck_mtx_lock(nd6_mutex); + NDDR_LOCK(dr); + if (dr && dr != installed_dr0) + dr->genid = -1; + NDDR_UNLOCK(dr); + NDDR_REMREF(dr); + /* + * Since we lost nd6_mutex, we have to start over.
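The restart idiom noted above is worth spelling out. A minimal sketch (illustrative only; the real loop above carries additional state such as genid): because the route request can block, nd6_mutex must be dropped around it, which invalidates any saved TAILQ position, so the scan resumes from the head and relies on NDDRF_PROCESSED to skip entries already handled:

	dr = TAILQ_FIRST(&nd_defrouter);
	while (dr != NULL) {
		if (dr->stateflags & NDDRF_PROCESSED) {
			/* already handled on an earlier pass */
			dr = TAILQ_NEXT(dr, dr_entry);
			continue;
		}
		dr->stateflags |= NDDRF_PROCESSED;
		lck_mtx_unlock(nd6_mutex);	/* route code may block */
		/* ... routing-table work on dr ... */
		lck_mtx_lock(nd6_mutex);
		/* the list may have changed; start over from the head */
		dr = TAILQ_FIRST(&nd_defrouter);
	}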
+ */ + dr = TAILQ_FIRST(&nd_defrouter); + continue; + } + NDDR_UNLOCK(dr); + dr = TAILQ_NEXT(dr, dr_entry); + } + + /* -1 is a special number, make sure we don't use it for genid */ + if (++nd6_defrouter_genid == -1) + nd6_defrouter_genid = 1; + + TAILQ_FOREACH(dr, &nd_defrouter, dr_entry) { + NDDR_LOCK(dr); + dr->stateflags &= ~NDDRF_PROCESSED; + NDDR_UNLOCK(dr); + } + /* Add the entries back */ + dr = TAILQ_FIRST(&nd_defrouter); + while (dr) { + struct nd_defrouter *_dr; + + NDDR_LOCK(dr); + if (dr->stateflags & NDDRF_PROCESSED || + dr->genid != -1) { + NDDR_UNLOCK(dr); + dr = TAILQ_NEXT(dr, dr_entry); + continue; + } + dr->stateflags |= NDDRF_PROCESSED; + + /* Handle case (b) */ + for (_dr = TAILQ_FIRST(&nd_defrouter); _dr; + _dr = TAILQ_NEXT(_dr, dr_entry)) { + if (_dr == dr) + continue; + /* + * This is safe because we previously checked if + * _dr == dr. + */ + NDDR_LOCK(_dr); + if (_dr->ifp == dr->ifp && rtpref(_dr) >= rtpref(dr) && + (_dr->stateflags & NDDRF_INSTALLED)) { + NDDR_ADDREF_LOCKED(_dr); + NDDR_UNLOCK(_dr); + break; + } + NDDR_UNLOCK(_dr); + } + + /* If same preference and i/f, static entry takes precedence */ + if (_dr != NULL && rtpref(_dr) == rtpref(dr) && + !(_dr->stateflags & NDDRF_STATIC) && + (dr->stateflags & NDDRF_STATIC)) { + lck_mtx_unlock(nd6_mutex); + NDDR_LOCK(_dr); + defrouter_delreq(_dr); + NDDR_UNLOCK(_dr); + lck_mtx_lock(nd6_mutex); + NDDR_REMREF(_dr); + _dr = NULL; + } + + if (_dr == NULL && !(dr->stateflags & NDDRF_INSTALLED)) { + NDDR_ADDREF_LOCKED(dr); + NDDR_UNLOCK(dr); + lck_mtx_unlock(nd6_mutex); + NDDR_LOCK(dr); + defrouter_addreq(dr, (selected_dr == NULL || + dr->ifp != selected_dr->ifp)); + dr->genid = nd6_defrouter_genid; + NDDR_UNLOCK(dr); + lck_mtx_lock(nd6_mutex); + NDDR_REMREF(dr); + /* + * Since we lost nd6_mutex, we have to start over. + */ + dr = TAILQ_FIRST(&nd_defrouter); + continue; + } + NDDR_UNLOCK(dr); + dr = TAILQ_NEXT(dr, dr_entry); + } +out: + TAILQ_FOREACH(dr, &nd_defrouter, dr_entry) { + NDDR_LOCK(dr); + dr->stateflags &= ~NDDRF_PROCESSED; + NDDR_UNLOCK(dr); + } + if (selected_dr) + NDDR_REMREF(selected_dr); + if (installed_dr) + NDDR_REMREF(installed_dr); + if (installed_dr0) + NDDR_REMREF(installed_dr0); + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + VERIFY(nd_defrouter_busy); + nd_defrouter_busy = FALSE; + if (nd_defrouter_waiters > 0) { + nd_defrouter_waiters = 0; + wakeup(nd_defrouter_waitchan); + } +} + +void +defrouter_select(struct ifnet *ifp) +{ + return (defrouter_select_common(ifp, 0)); +} + +static struct nd_defrouter * +defrtrlist_update_common(struct nd_defrouter *new, boolean_t scoped) +{ + struct nd_defrouter *dr, *n; + struct ifnet *ifp = new->ifp; + struct nd_ifinfo *ndi; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + + if ((dr = defrouter_lookup(&new->rtaddr, ifp)) != NULL) { + /* entry exists */ + if (new->rtlifetime == 0) { + defrtrlist_del(dr); + NDDR_REMREF(dr); + dr = NULL; + } else { + int oldpref = rtpref(dr); + + /* override */ + dr->flags = new->flags; /* xxx flag check */ dr->rtlifetime = new->rtlifetime; dr->expire = new->expire; + + /* + * If the preference does not change, there's no need + * to sort the entries. If Scoped Routing is enabled, + * put the primary/non-scoped router at the top of the + * list of routers in the same preference band, unless + * it's already at that position. 
+ */ + if (ip6_doscopedroute) { + struct nd_defrouter *p = NULL; + + /* same preference and scoped; just return */ + if (rtpref(new) == oldpref && scoped) + return (dr); + + n = TAILQ_FIRST(&nd_defrouter); + while (n != NULL) { + /* preference changed; sort it */ + if (rtpref(new) != oldpref) + break; + + /* not at the top of band; sort it */ + if (n != dr && rtpref(n) == oldpref && + (!p || rtpref(p) > rtpref(n))) + break; + + p = n; + n = TAILQ_NEXT(n, dr_entry); + } + + /* nothing has changed, just return */ + if (n == NULL && (scoped || + !(dr->stateflags & NDDRF_IFSCOPE))) + return (dr); + } else if (rtpref(new) == oldpref) { + return (dr); + } + + /* + * preferred router may be changed, so relocate + * this router. + * XXX: calling TAILQ_REMOVE directly is a bad manner. + * However, since defrtrlist_del() has many side + * effects, we intentionally do so here. + * defrouter_select() below will handle routing + * changes later. + */ + TAILQ_REMOVE(&nd_defrouter, dr, dr_entry); + new->stateflags = dr->stateflags; + new->stateflags &= ~NDDRF_PROCESSED; + + lck_rw_lock_shared(nd_if_rwlock); + VERIFY(ifp->if_index < nd_ifinfo_indexlim); + ndi = &nd_ifinfo[ifp->if_index]; + lck_rw_done(nd_if_rwlock); + n = dr; + goto insert; } - lck_mtx_unlock(nd6_mutex); - return(dr); + return (dr); } + VERIFY(dr == NULL); + /* entry does not exist */ if (new->rtlifetime == 0) { - lck_mtx_unlock(nd6_mutex); return(NULL); } - n = (struct nd_defrouter *)_MALLOC(sizeof(*n), M_IP6NDP, M_NOWAIT); + n = nddr_alloc(M_WAITOK); if (n == NULL) { - lck_mtx_unlock(nd6_mutex); return(NULL); } lck_rw_lock_shared(nd_if_rwlock); + ndi = &nd_ifinfo[ifp->if_index]; if (ifp->if_index >= nd_ifinfo_indexlim) goto freeit; - ndi = &nd_ifinfo[ifp->if_index]; if (ip6_maxifdefrouters >= 0 && ndi->ndefrouters >= ip6_maxifdefrouters) { freeit: lck_rw_done(nd_if_rwlock); - lck_mtx_unlock(nd6_mutex); - FREE(n, M_IP6NDP); + nddr_free(n); return (NULL); } + + NDDR_ADDREF(n); /* for the nd_defrouter list */ + NDDR_ADDREF(n); /* for the caller */ + + ++nd6_defrouter_genid; + atomic_add_32(&ndi->ndefrouters, 1); lck_rw_done(nd_if_rwlock); - bzero(n, sizeof(*n)); - *n = *new; + nd6log2((LOG_INFO, "%s: allocating defrouter %s\n", if_name(ifp), + ip6_sprintf(&new->rtaddr))); + + NDDR_LOCK(n); + memcpy(&n->rtaddr, &new->rtaddr, sizeof(n->rtaddr)); + n->flags = new->flags; + n->stateflags = new->stateflags; + n->stateflags &= ~NDDRF_PROCESSED; + n->rtlifetime = new->rtlifetime; + n->expire = new->expire; + n->ifp = new->ifp; + n->genid = new->genid; + n->err = new->err; + NDDR_UNLOCK(n); +insert: /* - * Insert the new router at the end of the Default Router List. - * If there is no other router, install it anyway. Otherwise, - * just continue to use the current default router. + * Insert the new router in the Default Router List; + * The Default Router List should be in the descending order + * of router-preference. When Scoped Routing is disabled, routers + * with the same preference are sorted in order of arrival; + * otherwise, the first entry in the list of routers having the same + * preference is the primary default router, when the interface used + * by the entry is the default interface.
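A worked example of the resulting order (assumed for illustration, with Scoped Routing enabled and en0 as the default interface): routers A (high preference, en0), B (medium, en0), C (medium, en1), and D (low, en1) end up listed as A, B, C, D. B sorts ahead of C even though they share a preference band, because an entry for the default interface arrives with scoped == FALSE and is therefore inserted at the head of its band by the loop below.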
*/ - TAILQ_INSERT_TAIL(&nd_defrouter, n, dr_entry); - if (TAILQ_FIRST(&nd_defrouter) == n) - defrouter_select(); - lck_mtx_unlock(nd6_mutex); - return(n); + /* insert at the end of the group */ + for (dr = TAILQ_FIRST(&nd_defrouter); dr; + dr = TAILQ_NEXT(dr, dr_entry)) { + if (rtpref(n) > rtpref(dr) || + (ip6_doscopedroute && !scoped && rtpref(n) == rtpref(dr))) + break; + } + if (dr) + TAILQ_INSERT_BEFORE(dr, n, dr_entry); + else + TAILQ_INSERT_TAIL(&nd_defrouter, n, dr_entry); + + /* Ignore auto-configuration checks for static route entries */ + defrouter_select_common(ifp, (n->stateflags & NDDRF_STATIC)); + + return (n); +} + +static struct nd_defrouter * +defrtrlist_update(struct nd_defrouter *new) +{ + struct nd_defrouter *dr; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + dr = defrtrlist_update_common(new, + (nd6_defifp != NULL && new->ifp != nd6_defifp)); + + return (dr); +} + +static void +defrtrlist_sync(struct ifnet *ifp) +{ + struct nd_defrouter *dr, new; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + + if (!ip6_doscopedroute) { + defrouter_select(ifp); + return; + } + + for (dr = TAILQ_FIRST(&nd_defrouter); dr; + dr = TAILQ_NEXT(dr, dr_entry)) { + NDDR_LOCK(dr); + if (dr->ifp == ifp && (dr->stateflags & NDDRF_INSTALLED)) + break; + NDDR_UNLOCK(dr); + } + + if (dr == NULL) { + /* + * Set ignore flag; the chosen default interface might + * not be configured to accept RAs. + */ + defrouter_select_common(ifp, 1); + } else { + memcpy(&new.rtaddr, &dr->rtaddr, sizeof(new.rtaddr)); + new.flags = dr->flags; + new.stateflags = dr->stateflags; + new.stateflags &= ~NDDRF_PROCESSED; + new.rtlifetime = dr->rtlifetime; + new.expire = dr->expire; + new.ifp = dr->ifp; + new.genid = dr->genid; + new.err = dr->err; + NDDR_UNLOCK(dr); + dr = defrtrlist_update_common(&new, FALSE); + if (dr) + NDDR_REMREF(dr); + } } static struct nd_pfxrouter * -pfxrtr_lookup( - struct nd_prefix *pr, - struct nd_defrouter *dr) +pfxrtr_lookup(struct nd_prefix *pr, struct nd_defrouter *dr) { struct nd_pfxrouter *search; - + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); - for (search = pr->ndpr_advrtrs.lh_first; search; search = search->pfr_next) { + NDPR_LOCK_ASSERT_HELD(pr); + + for (search = pr->ndpr_advrtrs.lh_first; search; + search = search->pfr_next) { if (search->router == dr) break; } @@ -876,23 +1825,24 @@ pfxrtr_lookup( } static void -pfxrtr_add( - struct nd_prefix *pr, - struct nd_defrouter *dr) +pfxrtr_add(struct nd_prefix *pr, struct nd_defrouter *dr) { struct nd_pfxrouter *new; lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + NDPR_LOCK_ASSERT_NOTHELD(pr); - new = (struct nd_pfxrouter *)_MALLOC(sizeof(*new), M_IP6NDP, M_NOWAIT); + new = zalloc(ndprtr_zone); if (new == NULL) return; bzero(new, sizeof(*new)); new->router = dr; + NDPR_LOCK(pr); LIST_INSERT_HEAD(&pr->ndpr_advrtrs, new, pfr_entry); - - pfxlist_onlink_check(1); + NDPR_UNLOCK(pr); + + pfxlist_onlink_check(); } static void @@ -901,65 +1851,32 @@ pfxrtr_del( { lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); LIST_REMOVE(pfr, pfr_entry); - FREE(pfr, M_IP6NDP); + zfree(ndprtr_zone, pfr); } struct nd_prefix * -nd6_prefix_lookup( - struct nd_prefix *pr) +nd6_prefix_lookup(struct nd_prefix *pr) { struct nd_prefix *search; lck_mtx_lock(nd6_mutex); for (search = nd_prefix.lh_first; search; search = search->ndpr_next) { + NDPR_LOCK(search); if (pr->ndpr_ifp == search->ndpr_ifp && pr->ndpr_plen == search->ndpr_plen && in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr, - &search->ndpr_prefix.sin6_addr, - pr->ndpr_plen) - ) { + 
&search->ndpr_prefix.sin6_addr, pr->ndpr_plen)) { + NDPR_ADDREF_LOCKED(search); + NDPR_UNLOCK(search); break; } + NDPR_UNLOCK(search); } - if (search != NULL) - ndpr_hold(search, TRUE); lck_mtx_unlock(nd6_mutex); return(search); } -void -ndpr_hold(struct nd_prefix *pr, boolean_t locked) -{ - if (!locked) - lck_mtx_lock(nd6_mutex); - - if (pr->ndpr_usecnt < 0) - panic("%s: bad usecnt %d for pr %p\n", __func__, - pr->ndpr_usecnt, pr); - - pr->ndpr_usecnt++; - - if (!locked) - lck_mtx_unlock(nd6_mutex); -} - -void -ndpr_rele(struct nd_prefix *pr, boolean_t locked) -{ - if (!locked) - lck_mtx_lock(nd6_mutex); - - if (pr->ndpr_usecnt <= 0) - panic("%s: bad usecnt %d for pr %p\n", __func__, - pr->ndpr_usecnt, pr); - - pr->ndpr_usecnt--; - - if (!locked) - lck_mtx_unlock(nd6_mutex); -} - static void purge_detached(struct ifnet *ifp) { @@ -969,52 +1886,75 @@ purge_detached(struct ifnet *ifp) lck_mtx_lock(nd6_mutex); - for (pr = nd_prefix.lh_first; pr; pr = pr_next) { + pr = nd_prefix.lh_first; +repeat: + while (pr) { pr_next = pr->ndpr_next; + NDPR_LOCK(pr); if (pr->ndpr_ifp != ifp || IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr) || ((pr->ndpr_stateflags & NDPRF_DETACHED) == 0 && - !LIST_EMPTY(&pr->ndpr_advrtrs))) + !LIST_EMPTY(&pr->ndpr_advrtrs))) { + NDPR_UNLOCK(pr); + pr = pr_next; continue; -repeat: + } + NDPR_UNLOCK(pr); ifnet_lock_shared(ifp); for (ifa = ifp->if_addrlist.tqh_first; ifa; ifa = ifa_next) { ifa_next = ifa->ifa_list.tqe_next; - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; + } ia = (struct in6_ifaddr *)ifa; if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == IN6_IFF_AUTOCONF && ia->ia6_ndpr == pr) { - ifaref(ifa); + IFA_ADDREF_LOCKED(ifa); /* for us */ + IFA_UNLOCK(ifa); /* * Purging the address requires writer access * to the address list, so drop the ifnet lock * now and repeat from beginning. 
*/ ifnet_lock_done(ifp); - in6_purgeaddr(ifa, 1); - ifafree(ifa); + lck_mtx_unlock(nd6_mutex); + in6_purgeaddr(ifa); + lck_mtx_lock(nd6_mutex); + IFA_REMREF(ifa); /* drop ours */ + pr = nd_prefix.lh_first; goto repeat; } + IFA_UNLOCK(ifa); } ifnet_lock_done(ifp); - if (pr->ndpr_refcnt == 0) - prelist_remove(pr, 1); + NDPR_LOCK(pr); + if (pr->ndpr_addrcnt == 0) { + NDPR_ADDREF_LOCKED(pr); + prelist_remove(pr); + NDPR_UNLOCK(pr); + NDPR_REMREF(pr); + } else { + NDPR_UNLOCK(pr); + } + pr = pr_next; } lck_mtx_unlock(nd6_mutex); } int -nd6_prelist_add( - struct nd_prefix *pr, - struct nd_defrouter *dr, - struct nd_prefix **newp) +nd6_prelist_add(struct nd_prefix *pr, struct nd_defrouter *dr, + struct nd_prefix **newp, boolean_t force_scoped) { struct nd_prefix *new = NULL; struct ifnet *ifp = pr->ndpr_ifp; struct nd_ifinfo *ndi = NULL; - int i; + int i, error; + struct timeval timenow; + + getmicrotime(&timenow); if (ip6_maxifprefixes >= 0) { lck_rw_lock_shared(nd_if_rwlock); @@ -1041,15 +1981,32 @@ nd6_prelist_add( lck_rw_done(nd_if_rwlock); } - new = (struct nd_prefix *)_MALLOC(sizeof(*new), M_IP6NDP, M_NOWAIT); + new = ndpr_alloc(M_WAITOK); if (new == NULL) return ENOMEM; - bzero(new, sizeof(*new)); - *new = *pr; - if (newp != NULL) - *newp = new; - /* initilization */ + NDPR_LOCK(new); + NDPR_LOCK(pr); + new->ndpr_ifp = pr->ndpr_ifp; + new->ndpr_prefix = pr->ndpr_prefix; + new->ndpr_plen = pr->ndpr_plen; + new->ndpr_vltime = pr->ndpr_vltime; + new->ndpr_pltime = pr->ndpr_pltime; + new->ndpr_flags = pr->ndpr_flags; + if (pr->ndpr_stateflags & NDPRF_STATIC) + new->ndpr_stateflags |= NDPRF_STATIC; + NDPR_UNLOCK(pr); + if ((error = in6_init_prefix_ltimes(new)) != 0) { + NDPR_UNLOCK(new); + ndpr_free(new); + return(error); + } + new->ndpr_lastupdate = timenow.tv_sec; + if (newp != NULL) { + *newp = new; + NDPR_ADDREF_LOCKED(new); /* for caller */ + } + /* initialization */ LIST_INIT(&new->ndpr_advrtrs); in6_prefixlen2mask(&new->ndpr_mask, new->ndpr_plen); /* make prefix in the canonical form */ @@ -1057,22 +2014,25 @@ nd6_prelist_add( new->ndpr_prefix.sin6_addr.s6_addr32[i] &= new->ndpr_mask.s6_addr32[i]; - /* link ndpr_entry to nd_prefix list */ + NDPR_UNLOCK(new); + lck_mtx_lock(nd6_mutex); + /* link ndpr_entry to nd_prefix list */ LIST_INSERT_HEAD(&nd_prefix, new, ndpr_entry); - - new->ndpr_usecnt = 0; - ndpr_hold(new, TRUE); + new->ndpr_debug |= IFD_ATTACHED; + NDPR_ADDREF(new); /* for nd_prefix list */ /* ND_OPT_PI_FLAG_ONLINK processing */ if (new->ndpr_raf_onlink) { int e; - if ((e = nd6_prefix_onlink(new, 0, 1)) != 0) { + if ((e = nd6_prefix_onlink_common(new, force_scoped, + new->ndpr_ifp->if_index)) != 0) { nd6log((LOG_ERR, "nd6_prelist_add: failed to make " - "the prefix %s/%d on-link on %s (errno=%d)\n", - ip6_sprintf(&pr->ndpr_prefix.sin6_addr), - pr->ndpr_plen, if_name(ifp), e)); + "the prefix %s/%d on-link %s on %s (errno=%d)\n", + ip6_sprintf(&new->ndpr_prefix.sin6_addr), + new->ndpr_plen, force_scoped ? "scoped" : + "non-scoped", if_name(ifp), e)); /* proceed anyway. XXX: is it correct? */ } } @@ -1088,7 +2048,7 @@ nd6_prelist_add( * isn't necessary since the array never shrinks. */ ndi = &nd_ifinfo[ifp->if_index]; - ndi->nprefixes++; + atomic_add_32(&ndi->nprefixes, 1); lck_rw_done(nd_if_rwlock); lck_mtx_unlock(nd6_mutex); @@ -1096,54 +2056,63 @@ nd6_prelist_add( return 0; } +/* + * Caller must have held an extra reference on nd_prefix. 
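+ *
+ * The hold/drop protocol can be pictured in isolation.  A hedged
+ * user-space sketch (entry, entry_lookup and entry_rele are
+ * illustrative names, not kernel interfaces; needs <pthread.h> and
+ * <stdlib.h>):
+ *
+ *	struct entry {
+ *		pthread_mutex_t lock;
+ *		int refcnt;		// protected by lock
+ *		int key;
+ *		struct entry *next;
+ *	};
+ *
+ *	// Returns a referenced entry; caller must entry_rele() it.
+ *	static struct entry *
+ *	entry_lookup(struct entry *head, pthread_mutex_t *list_lock, int key)
+ *	{
+ *		struct entry *e;
+ *
+ *		pthread_mutex_lock(list_lock);
+ *		for (e = head; e != NULL; e = e->next) {
+ *			pthread_mutex_lock(&e->lock);
+ *			if (e->key == key) {
+ *				e->refcnt++;	// addref while still locked
+ *				pthread_mutex_unlock(&e->lock);
+ *				break;
+ *			}
+ *			pthread_mutex_unlock(&e->lock);
+ *		}
+ *		pthread_mutex_unlock(list_lock);
+ *		return (e);
+ *	}
+ *
+ *	static void
+ *	entry_rele(struct entry *e)
+ *	{
+ *		int last;
+ *
+ *		pthread_mutex_lock(&e->lock);
+ *		last = (--e->refcnt == 0);
+ *		pthread_mutex_unlock(&e->lock);
+ *		if (last)
+ *			free(e);	// last reference frees the object
+ *	}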
+ */ void -prelist_remove( - struct nd_prefix *pr, int nd6locked) +prelist_remove(struct nd_prefix *pr) { struct nd_pfxrouter *pfr, *next; struct ifnet *ifp = pr->ndpr_ifp; int e; + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + NDPR_LOCK_ASSERT_HELD(pr); + /* make sure to invalidate the prefix until it is really freed. */ pr->ndpr_vltime = 0; pr->ndpr_pltime = 0; -#if 0 + /* * Though these flags are now meaningless, we'd rather keep the value - * not to confuse users when executing "ndp -p". + * of pr->ndpr_raf_onlink and pr->ndpr_raf_auto so as not to confuse + * users when executing "ndp -p". */ - pr->ndpr_raf_onlink = 0; - pr->ndpr_raf_auto = 0; -#endif - if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0 && - (e = nd6_prefix_offlink(pr)) != 0) { - nd6log((LOG_ERR, "prelist_remove: failed to make %s/%d offlink " - "on %s, errno=%d\n", - ip6_sprintf(&pr->ndpr_prefix.sin6_addr), - pr->ndpr_plen, if_name(ifp), e)); - /* what should we do? */ - } - if (nd6locked == 0) + if ((pr->ndpr_stateflags & NDPRF_ONLINK)) { + NDPR_ADDREF_LOCKED(pr); + NDPR_UNLOCK(pr); + lck_mtx_unlock(nd6_mutex); + if ((e = nd6_prefix_offlink(pr)) != 0) { + nd6log((LOG_ERR, "prelist_remove: failed to make " + "%s/%d offlink on %s, errno=%d\n", + ip6_sprintf(&pr->ndpr_prefix.sin6_addr), + pr->ndpr_plen, if_name(ifp), e)); + /* what should we do? */ + } lck_mtx_lock(nd6_mutex); + NDPR_LOCK(pr); + if (NDPR_REMREF_LOCKED(pr) == NULL) + return; + } - if (pr->ndpr_usecnt > 0 || pr->ndpr_refcnt > 0) - goto done; /* notice here? */ + if (pr->ndpr_addrcnt > 0) + return; /* notice here? */ /* unlink ndpr_entry from nd_prefix list */ LIST_REMOVE(pr, ndpr_entry); + pr->ndpr_debug &= ~IFD_ATTACHED; /* free list of routers that advertised the prefix */ for (pfr = pr->ndpr_advrtrs.lh_first; pfr; pfr = next) { next = pfr->pfr_next; - - FREE(pfr, M_IP6NDP); + pfxrtr_del(pfr); } lck_rw_lock_shared(nd_if_rwlock); if (ifp->if_index < nd_ifinfo_indexlim) { struct nd_ifinfo *ndi = &nd_ifinfo[ifp->if_index]; - ndi->nprefixes--; + atomic_add_32(&ndi->nprefixes, -1); if (ndi->nprefixes < 0) { log(LOG_WARNING, "prelist_remove: negative " "count on %s\n", if_name(ifp)); @@ -1151,19 +2120,21 @@ prelist_remove( } lck_rw_done(nd_if_rwlock); - FREE(pr, M_IP6NDP); + /* This must not be the last reference to the nd_prefix */ + if (NDPR_REMREF_LOCKED(pr) == NULL) { + panic("%s: unexpected (missing) refcnt ndpr=%p", __func__, pr); + /* NOTREACHED */ + } - pfxlist_onlink_check(1); -done: - if (nd6locked == 0) - lck_mtx_unlock(nd6_mutex); + pfxlist_onlink_check(); } int prelist_update( struct nd_prefix *new, struct nd_defrouter *dr, /* may be NULL */ - struct mbuf *m) + struct mbuf *m, + int mcast) { struct in6_ifaddr *ia6 = NULL, *ia6_match = NULL; struct ifaddr *ifa; @@ -1175,6 +2146,9 @@ prelist_update( struct in6_addrlifetime lt6_tmp; struct timeval timenow; + /* no need to lock "new" here, as it is local to the caller */ + NDPR_LOCK_ASSERT_NOTHELD(new); + auth = 0; if (m) { /* @@ -1199,6 +2173,8 @@ prelist_update( * and the autonomous (A) bit should NOT be changed from 1 * to 0.
*/ + lck_mtx_lock(nd6_mutex); + NDPR_LOCK(pr); if (new->ndpr_raf_onlink == 1) pr->ndpr_raf_onlink = 1; if (new->ndpr_raf_auto == 1) @@ -1214,7 +2190,8 @@ prelist_update( (pr->ndpr_stateflags & NDPRF_ONLINK) == 0) { int e; - if ((e = nd6_prefix_onlink(pr, 0, 0)) != 0) { + NDPR_UNLOCK(pr); + if ((e = nd6_prefix_onlink(pr)) != 0) { nd6log((LOG_ERR, "prelist_update: failed to make " "the prefix %s/%d on-link on %s " @@ -1223,11 +2200,15 @@ prelist_update( pr->ndpr_plen, if_name(pr->ndpr_ifp), e)); /* proceed anyway. XXX: is it correct? */ } + NDPR_LOCK(pr); } - - lck_mtx_lock(nd6_mutex); - if (dr && pfxrtr_lookup(pr, dr) == NULL) + + if (dr && pfxrtr_lookup(pr, dr) == NULL) { + NDPR_UNLOCK(pr); pfxrtr_add(pr, dr); + } else { + NDPR_UNLOCK(pr); + } lck_mtx_unlock(nd6_mutex); } else { struct nd_prefix *newpr = NULL; @@ -1241,7 +2222,7 @@ prelist_update( bzero(&new->ndpr_addr, sizeof(struct in6_addr)); - error = nd6_prelist_add(new, dr, &newpr); + error = nd6_prelist_add(new, dr, &newpr, FALSE); if (error != 0 || newpr == NULL) { nd6log((LOG_NOTICE, "prelist_update: " "nd6_prelist_add failed for %s/%d on %s " @@ -1256,9 +2237,10 @@ prelist_update( * XXX: from the ND point of view, we can ignore a prefix * with the on-link bit being zero. However, we need a * prefix structure for references from autoconfigured - * addresses. Thus, we explicitly make suret that the prefix + * addresses. Thus, we explicitly make sure that the prefix * itself expires now. */ + NDPR_LOCK(newpr); if (newpr->ndpr_raf_onlink == 0) { newpr->ndpr_vltime = 0; newpr->ndpr_pltime = 0; @@ -1266,6 +2248,7 @@ prelist_update( } pr = newpr; + NDPR_UNLOCK(newpr); } /* @@ -1282,84 +2265,109 @@ prelist_update( * nd6_ra_input. */ + /* 5.5.3 (c). Consistency check on lifetimes: pltime <= vltime. */ + if (new->ndpr_pltime > new->ndpr_vltime) { + error = EINVAL; /* XXX: won't be used */ + goto end; + } + /* - * 5.5.3 (c). Consistency check on lifetimes: pltime <= vltime. - * This should have been done in nd6_ra_input. + * 5.5.3 (d). If the prefix advertised is not equal to the prefix of + * an address configured by stateless autoconfiguration already in the + * list of addresses associated with the interface, and the Valid + * Lifetime is not 0, form an address. We first check if we have + * a matching prefix. + * Note: we apply a clarification in rfc2462bis-02 here. We only + * consider autoconfigured addresses while RFC2462 simply said + * "address". */ - /* - * 5.5.3 (d). If the prefix advertised does not match the prefix of an - * address already in the list, and the Valid Lifetime is not 0, - * form an address. Note that even a manually configured address - * should reject autoconfiguration of a new address. - */ - getmicrotime(&timenow); + getmicrotime(&timenow); - ifnet_lock_exclusive(ifp); + ifnet_lock_shared(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) { struct in6_ifaddr *ifa6; - int ifa_plen; - u_int32_t storedlifetime; + u_int32_t remaininglifetime; - if (ifa->ifa_addr->sa_family != AF_INET6) + IFA_LOCK(ifa); + if (ifa->ifa_addr->sa_family != AF_INET6) { + IFA_UNLOCK(ifa); continue; - + } ifa6 = (struct in6_ifaddr *)ifa; + /* + * We only consider autoconfigured addresses as per rfc2462bis. + */ + if (!(ifa6->ia6_flags & IN6_IFF_AUTOCONF)) { + IFA_UNLOCK(ifa); + continue; + } /* * Spec is not clear here, but I believe we should concentrate * on unicast (i.e. not anycast) addresses. * XXX: other ia6_flags? detached or duplicated? 
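 *
 * The lifetime update a few lines below is RFC 4862 5.5.3 (e), the
 * "two-hour" rule.  Reduced to a pure function it reads as follows
 * (a sketch for reference, with TWOHOUR as defined below):
 *
 *	static uint32_t
 *	two_hour_rule(uint32_t remaining, uint32_t advertised,
 *	    int authenticated)
 *	{
 *		if (advertised > TWOHOUR || advertised > remaining)
 *			return (advertised);	// extending is always fine
 *		if (remaining <= TWOHOUR)	// already short-lived
 *			return (authenticated ? advertised : remaining);
 *		return (TWOHOUR);	// unauthenticated RAs cannot cut
 *					// a long lifetime below two hours
 *	}
 *
 * e.g. two_hour_rule(86400, 60, 0) == 7200: an unauthenticated RA
 * cannot invalidate an existing address sooner than two hours from now.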
*/ - if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0) + if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0) { + IFA_UNLOCK(ifa); continue; - - ifa_plen = in6_mask2len(&ifa6->ia_prefixmask.sin6_addr, NULL); - if (ifa_plen != new->ndpr_plen || - !in6_are_prefix_equal(&ifa6->ia_addr.sin6_addr, - &new->ndpr_prefix.sin6_addr, - ifa_plen)) + } + /* + * Ignore the address if it is not associated with a prefix + * or is associated with a prefix that is different from this + * one. (pr is never NULL here) + */ + if (ifa6->ia6_ndpr != pr) { + IFA_UNLOCK(ifa); continue; + } - if (ia6_match == NULL) /* remember the first one */ + if (ia6_match == NULL) { /* remember the first one */ ia6_match = ifa6; - - if ((ifa6->ia6_flags & IN6_IFF_AUTOCONF) == 0) - continue; + IFA_ADDREF_LOCKED(ifa); /* for ia6_match */ + } /* * An already autoconfigured address matched. Now that we * are sure there is at least one matched address, we can * proceed to 5.5.3. (e): update the lifetimes according to the * "two hours" rule and the privacy extension. + * We apply some clarifications in rfc2462bis: + * - use remaininglifetime instead of storedlifetime as a + * variable name + * - remove the dead code in the "two-hour" rule */ #define TWOHOUR (120*60) lt6_tmp = ifa6->ia6_lifetime; - storedlifetime = IFA6_IS_INVALID(ifa6) ? 0 : - (lt6_tmp.ia6t_expire - timenow.tv_sec); + if (lt6_tmp.ia6t_vltime == ND6_INFINITE_LIFETIME) + remaininglifetime = ND6_INFINITE_LIFETIME; + else if (timenow.tv_sec - ifa6->ia6_updatetime > + lt6_tmp.ia6t_vltime) { + /* + * The case of "invalid" address. We should usually + * not see this case. + */ + remaininglifetime = 0; + } else + remaininglifetime = lt6_tmp.ia6t_vltime - + (timenow.tv_sec - ifa6->ia6_updatetime); + + /* when not updating, keep the current stored lifetime. */ + lt6_tmp.ia6t_vltime = remaininglifetime; if (TWOHOUR < new->ndpr_vltime || - storedlifetime < new->ndpr_vltime) { + remaininglifetime < new->ndpr_vltime) { lt6_tmp.ia6t_vltime = new->ndpr_vltime; - } else if (storedlifetime <= TWOHOUR -#if 0 - /* - * This condition is logically redundant, so we just - * omit it. - * See IPng 6712, 6717, and 6721. - */ - && new->ndpr_vltime <= storedlifetime -#endif - ) { + } else if (remaininglifetime <= TWOHOUR) { if (auth) { lt6_tmp.ia6t_vltime = new->ndpr_vltime; } } else { /* * new->ndpr_vltime <= TWOHOUR && - * TWOHOUR < storedlifetime + * TWOHOUR < remaininglifetime */ lt6_tmp.ia6t_vltime = TWOHOUR; } @@ -1367,57 +2375,108 @@ prelist_update( /* The 2 hour rule is not imposed for preferred lifetime. */ lt6_tmp.ia6t_pltime = new->ndpr_pltime; - in6_init_address_ltimes(pr, <6_tmp); - - /* - * When adjusting the lifetimes of an existing temporary - * address, only lower the lifetimes. - * RFC 3041 3.3. (1). - * XXX: how should we modify ia6t_[pv]ltime? - */ + /* Special handling for lifetimes of temporary addresses. */ if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0) { - if (lt6_tmp.ia6t_expire == 0 || /* no expire */ - lt6_tmp.ia6t_expire > - ifa6->ia6_lifetime.ia6t_expire) { - lt6_tmp.ia6t_expire = - ifa6->ia6_lifetime.ia6t_expire; - } - if (lt6_tmp.ia6t_preferred == 0 || /* no expire */ - lt6_tmp.ia6t_preferred > - ifa6->ia6_lifetime.ia6t_preferred) { - lt6_tmp.ia6t_preferred = - ifa6->ia6_lifetime.ia6t_preferred; - } + u_int32_t maxvltime, maxpltime; + + /* Constrain lifetimes to system limits. 
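+	 * RFC 4941 3.3 (1) counts TEMP_VALID_LIFETIME and
+	 * TEMP_PREFERRED_LIFETIME from the address's creation time, so the
+	 * effective cap shrinks as the address ages.  As a pure-function
+	 * sketch of the computation done below, with
+	 * age = timenow - ia6_createtime:
+	 *
+	 *	static uint32_t
+	 *	temp_cap(uint32_t sys_max, uint32_t age, uint32_t desync)
+	 *	{
+	 *		if (sys_max > age + desync)
+	 *			return (sys_max - age - desync);
+	 *		return (0);	// window exhausted: expire now
+	 *	}
+	 *
+	 * and the lifetimes taken from the RA are then clamped to the caps.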
*/ + if (lt6_tmp.ia6t_vltime > ip6_temp_valid_lifetime) + lt6_tmp.ia6t_vltime = ip6_temp_valid_lifetime; + if (lt6_tmp.ia6t_pltime > ip6_temp_preferred_lifetime) + lt6_tmp.ia6t_pltime = + ip6_temp_preferred_lifetime - + ip6_desync_factor; + + /* + * According to RFC 4941, section 3.3 (1), we only + * update the lifetimes when they are in the maximum + * intervals. + */ + if (ip6_temp_valid_lifetime > + (u_int32_t)((timenow.tv_sec - ifa6->ia6_createtime) + + ip6_desync_factor)) { + maxvltime = ip6_temp_valid_lifetime - + (timenow.tv_sec - ifa6->ia6_createtime) - + ip6_desync_factor; + } else + maxvltime = 0; + if (ip6_temp_preferred_lifetime > + (u_int32_t)((timenow.tv_sec - ifa6->ia6_createtime) + + ip6_desync_factor)) { + maxpltime = ip6_temp_preferred_lifetime - + (timenow.tv_sec - ifa6->ia6_createtime) - + ip6_desync_factor; + } else + maxpltime = 0; + + if (lt6_tmp.ia6t_vltime > maxvltime) + lt6_tmp.ia6t_vltime = maxvltime; + if (lt6_tmp.ia6t_pltime > maxpltime) + lt6_tmp.ia6t_pltime = maxpltime; } + in6_init_address_ltimes(pr, <6_tmp, + !!(ifa6->ia6_flags & IN6_IFF_TEMPORARY)); + ifa6->ia6_lifetime = lt6_tmp; + ifa6->ia6_updatetime = timenow.tv_sec; + IFA_UNLOCK(ifa); } ifnet_lock_done(ifp); if (ia6_match == NULL && new->ndpr_vltime) { + int ifidlen; + /* + * 5.5.3 (d) (continued) * No address matched and the valid lifetime is non-zero. * Create a new address. */ - if ((ia6 = in6_ifadd(new, NULL)) != NULL) { + + /* + * Prefix Length check: + * If the sum of the prefix length and interface identifier + * length does not equal 128 bits, the Prefix Information + * option MUST be ignored. The length of the interface + * identifier is defined in a separate link-type specific + * document. + */ + ifidlen = in6_if2idlen(ifp); + if (ifidlen < 0) { + /* this should not happen, so we always log it. */ + log(LOG_ERR, "prelist_update: IFID undefined (%s)\n", + if_name(ifp)); + goto end; + } + NDPR_LOCK(pr); + if (ifidlen + pr->ndpr_plen != 128) { + nd6log((LOG_INFO, + "prelist_update: invalid prefixlen " + "%d for %s, ignored\n", + pr->ndpr_plen, if_name(ifp))); + NDPR_UNLOCK(pr); + goto end; + } + NDPR_UNLOCK(pr); + + if ((ia6 = in6_ifadd(new, mcast)) != NULL) { /* * note that we should use pr (not new) for reference. */ - lck_mtx_lock(nd6_mutex); - pr->ndpr_refcnt++; - lck_mtx_unlock(nd6_mutex); + IFA_LOCK(&ia6->ia_ifa); + NDPR_LOCK(pr); ia6->ia6_ndpr = pr; - -#if 0 - /* XXXYYY Don't do this, according to Jinmei. */ - pr->ndpr_addr = new->ndpr_addr; -#endif + NDPR_ADDREF_LOCKED(pr); /* for addr reference */ + pr->ndpr_addrcnt++; + VERIFY(pr->ndpr_addrcnt != 0); + NDPR_UNLOCK(pr); + IFA_UNLOCK(&ia6->ia_ifa); /* - * RFC 3041 3.3 (2). + * RFC 4941 3.3 (2). * When a new public address is created as described * in RFC2462, also create a new temporary address. * - * RFC 3041 3.5. + * RFC 4941 3.5. * When an interface connects to a new link, a new * randomized interface identifier should be generated * immediately together with a new set of temporary @@ -1426,35 +2485,264 @@ prelist_update( */ if (ip6_use_tempaddr) { int e; - if ((e = in6_tmpifadd(ia6, 1, M_NOWAIT)) != 0) { + if ((e = in6_tmpifadd(ia6, 1, M_WAITOK)) != 0) { nd6log((LOG_NOTICE, "prelist_update: " "failed to create a temporary " "address, errno=%d\n", e)); } } - ifafree(&ia6->ia_ifa); + IFA_REMREF(&ia6->ia_ifa); ia6 = NULL; - /* - * A newly added address might affect the status - * of other addresses, so we check and update it. - * XXX: what if address duplication happens? 
- */ - pfxlist_onlink_check(0); - } else { - /* just set an error. do not bark here. */ - error = EADDRNOTAVAIL; /* XXX: might be unused. */ - } + /* + * A newly added address might affect the status + * of other addresses, so we check and update it. + * XXX: what if address duplication happens? + */ + lck_mtx_lock(nd6_mutex); + pfxlist_onlink_check(); + lck_mtx_unlock(nd6_mutex); + } else { + /* just set an error. do not bark here. */ + error = EADDRNOTAVAIL; /* XXX: might be unused. */ + } + } + +afteraddrconf: + +end: + if (pr != NULL) + NDPR_REMREF(pr); + if (ia6_match != NULL) + IFA_REMREF(&ia6_match->ia_ifa); + return error; +} + +/* + * Neighbor Discovery Default Router structure reference counting routines. + */ +static struct nd_defrouter * +nddr_alloc(int how) +{ + struct nd_defrouter *dr; + + dr = (how == M_WAITOK) ? zalloc(nddr_zone) : zalloc_noblock(nddr_zone); + if (dr != NULL) { + bzero(dr, nddr_size); + lck_mtx_init(&dr->nddr_lock, ifa_mtx_grp, ifa_mtx_attr); + dr->nddr_debug |= IFD_ALLOC; + if (nddr_debug != 0) { + dr->nddr_debug |= IFD_DEBUG; + dr->nddr_trace = nddr_trace; + } + } + return (dr); +} + +static void +nddr_free(struct nd_defrouter *dr) +{ + NDDR_LOCK(dr); + if (dr->nddr_debug & IFD_ATTACHED) { + panic("%s: attached nddr %p is being freed", __func__, dr); + /* NOTREACHED */ + } else if (!(dr->nddr_debug & IFD_ALLOC)) { + panic("%s: nddr %p cannot be freed", __func__, dr); + /* NOTREACHED */ + } + dr->nddr_debug &= ~IFD_ALLOC; + NDDR_UNLOCK(dr); + + lck_mtx_destroy(&dr->nddr_lock, ifa_mtx_grp); + zfree(nddr_zone, dr); +} + +static void +nddr_trace(struct nd_defrouter *dr, int refhold) +{ + struct nd_defrouter_dbg *dr_dbg = (struct nd_defrouter_dbg *)dr; + ctrace_t *tr; + uint32_t idx; + uint16_t *cnt; + + if (!(dr->nddr_debug & IFD_DEBUG)) { + panic("%s: nddr %p has no debug structure", __func__, dr); + /* NOTREACHED */ + } + if (refhold) { + cnt = &dr_dbg->nddr_refhold_cnt; + tr = dr_dbg->nddr_refhold; + } else { + cnt = &dr_dbg->nddr_refrele_cnt; + tr = dr_dbg->nddr_refrele; + } + + idx = atomic_add_16_ov(cnt, 1) % NDDR_TRACE_HIST_SIZE; + ctrace_record(&tr[idx]); +} + +void +nddr_addref(struct nd_defrouter *nddr, int locked) +{ + + if (!locked) + NDDR_LOCK_SPIN(nddr); + else + NDDR_LOCK_ASSERT_HELD(nddr); + + if (++nddr->nddr_refcount == 0) { + panic("%s: nddr %p wraparound refcnt\n", __func__, nddr); + /* NOTREACHED */ + } else if (nddr->nddr_trace != NULL) { + (*nddr->nddr_trace)(nddr, TRUE); + } + + if (!locked) + NDDR_UNLOCK(nddr); +} + +struct nd_defrouter * +nddr_remref(struct nd_defrouter *nddr, int locked) +{ + + if (!locked) + NDDR_LOCK_SPIN(nddr); + else + NDDR_LOCK_ASSERT_HELD(nddr); + + if (nddr->nddr_refcount == 0) { + panic("%s: nddr %p negative refcnt\n", __func__, nddr); + /* NOTREACHED */ + } else if (nddr->nddr_trace != NULL) { + (*nddr->nddr_trace)(nddr, FALSE); + } + + if (--nddr->nddr_refcount == 0) { + NDDR_UNLOCK(nddr); + nddr_free(nddr); + nddr = NULL; + } + + if (!locked && nddr != NULL) + NDDR_UNLOCK(nddr); + + return (nddr); +} + +/* + * Neighbor Discovery Prefix structure reference counting routines. + */ +static struct nd_prefix * +ndpr_alloc(int how) +{ + struct nd_prefix *pr; + + pr = (how == M_WAITOK) ?
zalloc(ndpr_zone) : zalloc_noblock(ndpr_zone); + if (pr != NULL) { + bzero(pr, ndpr_size); + lck_mtx_init(&pr->ndpr_lock, ifa_mtx_grp, ifa_mtx_attr); + pr->ndpr_debug |= IFD_ALLOC; + if (ndpr_debug != 0) { + pr->ndpr_debug |= IFD_DEBUG; + pr->ndpr_trace = ndpr_trace; + } + } + return (pr); +} + +static void +ndpr_free(struct nd_prefix *pr) +{ + NDPR_LOCK(pr); + if (pr->ndpr_debug & IFD_ATTACHED) { + panic("%s: attached ndpr %p is being freed", __func__, pr); + /* NOTREACHED */ + } else if (!(pr->ndpr_debug & IFD_ALLOC)) { + panic("%s: ndpr %p cannot be freed", __func__, pr); + /* NOTREACHED */ + } + pr->ndpr_debug &= ~IFD_ALLOC; + NDPR_UNLOCK(pr); + + lck_mtx_destroy(&pr->ndpr_lock, ifa_mtx_grp); + zfree(ndpr_zone, pr); +} + +static void +ndpr_trace(struct nd_prefix *pr, int refhold) +{ + struct nd_prefix_dbg *pr_dbg = (struct nd_prefix_dbg *)pr; + ctrace_t *tr; + u_int32_t idx; + u_int16_t *cnt; + + if (!(pr->ndpr_debug & IFD_DEBUG)) { + panic("%s: ndpr %p has no debug structure", __func__, pr); + /* NOTREACHED */ + } + if (refhold) { + cnt = &pr_dbg->ndpr_refhold_cnt; + tr = pr_dbg->ndpr_refhold; + } else { + cnt = &pr_dbg->ndpr_refrele_cnt; + tr = pr_dbg->ndpr_refrele; + } + + idx = atomic_add_16_ov(cnt, 1) % NDPR_TRACE_HIST_SIZE; + ctrace_record(&tr[idx]); +} + +void +ndpr_addref(struct nd_prefix *ndpr, int locked) +{ + if (!locked) + NDPR_LOCK_SPIN(ndpr); + else + NDPR_LOCK_ASSERT_HELD(ndpr); + + if (++ndpr->ndpr_refcount == 0) { + panic("%s: ndpr %p wraparound refcnt\n", __func__, ndpr); + /* NOTREACHED */ + } else if (ndpr->ndpr_trace != NULL) { + (*ndpr->ndpr_trace)(ndpr, TRUE); + } + + if (!locked) + NDPR_UNLOCK(ndpr); +} + +struct nd_prefix * +ndpr_remref(struct nd_prefix *ndpr, int locked) +{ + if (!locked) + NDPR_LOCK_SPIN(ndpr); + else + NDPR_LOCK_ASSERT_HELD(ndpr); + + if (ndpr->ndpr_refcount == 0) { + panic("%s: ndpr %p negative refcnt\n", __func__, ndpr); + /* NOTREACHED */ + } else if (ndpr->ndpr_trace != NULL) { + (*ndpr->ndpr_trace)(ndpr, FALSE); } -afteraddrconf: + if (--ndpr->ndpr_refcount == 0) { + if (ndpr->ndpr_addrcnt != 0) { + panic("%s: freeing ndpr %p with outstanding address " + "reference (%d)", __func__, ndpr, + ndpr->ndpr_addrcnt); + /* NOTREACHED */ + } + NDPR_UNLOCK(ndpr); + ndpr_free(ndpr); + ndpr = NULL; + } -end: - if (pr != NULL) - ndpr_rele(pr, FALSE); + if (!locked && ndpr != NULL) + NDPR_UNLOCK(ndpr); - return error; + return (ndpr); } /* @@ -1463,17 +2751,19 @@ end: * XXX: lengthy function name... */ static struct nd_pfxrouter * -find_pfxlist_reachable_router( - struct nd_prefix *pr) +find_pfxlist_reachable_router(struct nd_prefix *pr) { struct nd_pfxrouter *pfxrtr; struct rtentry *rt; struct llinfo_nd6 *ln; lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + NDPR_LOCK_ASSERT_HELD(pr); for (pfxrtr = LIST_FIRST(&pr->ndpr_advrtrs); pfxrtr; pfxrtr = LIST_NEXT(pfxrtr, pfr_entry)) { + NDPR_UNLOCK(pr); + lck_mtx_unlock(nd6_mutex); /* Callee returns a locked route upon success */ if ((rt = nd6_lookup(&pfxrtr->router->rtaddr, 0, pfxrtr->router->ifp, 0)) != NULL) { @@ -1482,14 +2772,19 @@ find_pfxlist_reachable_router( ND6_IS_LLINFO_PROBREACH(ln)) { RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); + lck_mtx_lock(nd6_mutex); + NDPR_LOCK(pr); break; /* found */ } RT_REMREF_LOCKED(rt); RT_UNLOCK(rt); } + lck_mtx_lock(nd6_mutex); + NDPR_LOCK(pr); } + NDPR_LOCK_ASSERT_HELD(pr); - return(pfxrtr); + return (pfxrtr); } @@ -1507,61 +2802,150 @@ find_pfxlist_reachable_router( * is no router around us. 
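 *
 * Only one thread at a time may run the check below; the nd_prefix_busy
 * flag serializes entry and nd_prefix_waitchan queues the rest.  A
 * hedged pthread rendering of that gate (mtx, cv and busy are
 * illustrative names, not kernel interfaces):
 *
 *	static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
 *	static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
 *	static int busy;
 *
 *	static void
 *	gate_enter(void)		// like msleep() on nd6_mutex
 *	{
 *		pthread_mutex_lock(&mtx);
 *		while (busy)
 *			pthread_cond_wait(&cv, &mtx);	// drops, retakes mtx
 *		busy = 1;
 *		pthread_mutex_unlock(&mtx);
 *	}
 *
 *	static void
 *	gate_leave(void)
 *	{
 *		pthread_mutex_lock(&mtx);
 *		busy = 0;
 *		pthread_cond_broadcast(&cv);	// like wakeup()
 *		pthread_mutex_unlock(&mtx);
 *	}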
*/ void -pfxlist_onlink_check(int nd6locked) +pfxlist_onlink_check(void) { - struct nd_prefix *pr; + struct nd_prefix *pr, *prclear; struct in6_ifaddr *ifa; + struct nd_defrouter *dr; + struct nd_pfxrouter *pfxrtr = NULL; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + + while (nd_prefix_busy) { + nd_prefix_waiters++; + msleep(nd_prefix_waitchan, nd6_mutex, (PZERO-1), + __func__, NULL); + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + } + nd_prefix_busy = TRUE; /* * Check if there is a prefix that has a reachable advertising * router. */ - if (nd6locked == 0) - lck_mtx_lock(nd6_mutex); - lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); - for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { - if (pr->ndpr_raf_onlink && find_pfxlist_reachable_router(pr)) + pr = nd_prefix.lh_first; + while (pr) { + NDPR_LOCK(pr); + if (pr->ndpr_stateflags & NDPRF_PROCESSED) { + NDPR_UNLOCK(pr); + pr = pr->ndpr_next; + continue; + } + NDPR_ADDREF_LOCKED(pr); + if (pr->ndpr_raf_onlink && find_pfxlist_reachable_router(pr) && + (pr->ndpr_debug & IFD_ATTACHED)) { + NDPR_UNLOCK(pr); + NDPR_REMREF(pr); break; + } + pr->ndpr_stateflags |= NDPRF_PROCESSED; + NDPR_UNLOCK(pr); + NDPR_REMREF(pr); + /* + * Since find_pfxlist_reachable_router() drops the nd6_mutex, we + * have to start over, but the NDPRF_PROCESSED flag will stop + * us from checking the same prefix twice. + */ + pr = nd_prefix.lh_first; + } + LIST_FOREACH(prclear, &nd_prefix, ndpr_entry) { + NDPR_LOCK(prclear); + prclear->ndpr_stateflags &= ~NDPRF_PROCESSED; + NDPR_UNLOCK(prclear); } - if (pr) { + /* + * If we have no such prefix, check whether we still have a router + * that does not advertise any prefixes. + */ + if (pr == NULL) { + for (dr = TAILQ_FIRST(&nd_defrouter); dr; + dr = TAILQ_NEXT(dr, dr_entry)) { + struct nd_prefix *pr0; + + for (pr0 = nd_prefix.lh_first; pr0; + pr0 = pr0->ndpr_next) { + NDPR_LOCK(pr0); + if ((pfxrtr = pfxrtr_lookup(pr0, dr)) != NULL) { + NDPR_UNLOCK(pr0); + break; + } + NDPR_UNLOCK(pr0); + } + if (pfxrtr != NULL) + break; + } + } + if (pr != NULL || (TAILQ_FIRST(&nd_defrouter) && pfxrtr == NULL)) { /* - * There is at least one prefix that has a reachable router. + * There is at least one prefix that has a reachable router, + * or at least a router which probably does not advertise + * any prefixes. The latter would be the case when we move + * to a new link where we have a router that does not provide + * prefixes and we configure an address by hand. * Detach prefixes which have no reachable advertising * router, and attach other prefixes. */ - for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { - /* XXX: a link-local prefix should never be detached */ - if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) - continue; - + pr = nd_prefix.lh_first; + while (pr) { + NDPR_LOCK(pr); /* - * we aren't interested in prefixes without the L bit - * set. 
+ * We aren't interested in prefixes already processed, + * nor in prefixes without the L bit + * set, nor in static prefixes */ - if (pr->ndpr_raf_onlink == 0) + if (pr->ndpr_raf_onlink == 0 || + pr->ndpr_stateflags & NDPRF_PROCESSED || + pr->ndpr_stateflags & NDPRF_STATIC) { + NDPR_UNLOCK(pr); + pr = pr->ndpr_next; continue; - + } + NDPR_ADDREF_LOCKED(pr); if ((pr->ndpr_stateflags & NDPRF_DETACHED) == 0 && - find_pfxlist_reachable_router(pr) == NULL) + find_pfxlist_reachable_router(pr) == NULL && + (pr->ndpr_debug & IFD_ATTACHED)) pr->ndpr_stateflags |= NDPRF_DETACHED; if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0 && - find_pfxlist_reachable_router(pr) != 0) + find_pfxlist_reachable_router(pr) != NULL && + (pr->ndpr_debug & IFD_ATTACHED)) pr->ndpr_stateflags &= ~NDPRF_DETACHED; + pr->ndpr_stateflags |= NDPRF_PROCESSED; + NDPR_UNLOCK(pr); + NDPR_REMREF(pr); + /* + * Since find_pfxlist_reachable_router() drops the + * nd6_mutex, we have to start over, but the + * NDPRF_PROCESSED flag will stop us from checking + * the same prefix twice. + */ + pr = nd_prefix.lh_first; } } else { /* there is no prefix that has a reachable router */ for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { - if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) + NDPR_LOCK(pr); + if (pr->ndpr_raf_onlink == 0 || + pr->ndpr_stateflags & NDPRF_STATIC) { + NDPR_UNLOCK(pr); continue; - - if (pr->ndpr_raf_onlink == 0) - continue; - + } if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0) pr->ndpr_stateflags &= ~NDPRF_DETACHED; + NDPR_UNLOCK(pr); } } + LIST_FOREACH(prclear, &nd_prefix, ndpr_entry) { + NDPR_LOCK(prclear); + prclear->ndpr_stateflags &= ~NDPRF_PROCESSED; + NDPR_UNLOCK(prclear); + } + VERIFY(nd_prefix_busy); + nd_prefix_busy = FALSE; + if (nd_prefix_waiters > 0) { + nd_prefix_waiters = 0; + wakeup(nd_prefix_waitchan); + } /* * Remove each interface route associated with a (just) detached @@ -1571,17 +2955,21 @@ pfxlist_onlink_check(int nd6locked) * interfaces. Such cases will be handled in nd6_prefix_onlink, * so we don't have to care about them. */ - for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { + pr = nd_prefix.lh_first; + while (pr) { int e; - if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) - continue; - - if (pr->ndpr_raf_onlink == 0) + NDPR_LOCK(pr); + if (pr->ndpr_raf_onlink == 0 || + pr->ndpr_stateflags & NDPRF_STATIC) { + NDPR_UNLOCK(pr); + pr = pr->ndpr_next; continue; - + } if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0 && (pr->ndpr_stateflags & NDPRF_ONLINK) != 0) { + NDPR_UNLOCK(pr); + lck_mtx_unlock(nd6_mutex); if ((e = nd6_prefix_offlink(pr)) != 0) { nd6log((LOG_ERR, "pfxlist_onlink_check: failed to " "make %s/%d offlink, errno=%d\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, e)); } + lck_mtx_lock(nd6_mutex); + pr = nd_prefix.lh_first; + continue; } if ((pr->ndpr_stateflags & NDPRF_DETACHED) == 0 && (pr->ndpr_stateflags & NDPRF_ONLINK) == 0 && pr->ndpr_raf_onlink) { - if ((e = nd6_prefix_onlink(pr, 0, 1)) != 0) { + NDPR_UNLOCK(pr); + if ((e = nd6_prefix_onlink(pr)) != 0) { nd6log((LOG_ERR, "pfxlist_onlink_check: failed to " "make %s/%d onlink, errno=%d\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, e)); } + } else { + NDPR_UNLOCK(pr); } + pr = pr->ndpr_next; } /* @@ -1611,118 +3006,298 @@ pfxlist_onlink_check(int nd6locked) * always be attached. * The precise detection logic is the same as the one for prefixes.
*/ + lck_rw_lock_shared(&in6_ifaddr_rwlock); for (ifa = in6_ifaddrs; ifa; ifa = ifa->ia_next) { - if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) - continue; + struct nd_prefix *ndpr; - if (ifa->ia6_ndpr == NULL) { + IFA_LOCK(&ifa->ia_ifa); + if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) { + IFA_UNLOCK(&ifa->ia_ifa); + continue; + } + if ((ndpr = ifa->ia6_ndpr) == NULL) { /* * This can happen when we first configure the address * (i.e. the address exists, but the prefix does not). * XXX: complicated relationships... */ + IFA_UNLOCK(&ifa->ia_ifa); continue; } + NDPR_ADDREF(ndpr); + IFA_UNLOCK(&ifa->ia_ifa); - if (find_pfxlist_reachable_router(ifa->ia6_ndpr)) + NDPR_LOCK(ndpr); + if (find_pfxlist_reachable_router(ndpr)) { + NDPR_UNLOCK(ndpr); + NDPR_REMREF(ndpr); break; + } + NDPR_UNLOCK(ndpr); + NDPR_REMREF(ndpr); } if (ifa) { for (ifa = in6_ifaddrs; ifa; ifa = ifa->ia_next) { - if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) - continue; + struct nd_prefix *ndpr; - if (ifa->ia6_ndpr == NULL) /* XXX: see above. */ + IFA_LOCK(&ifa->ia_ifa); + if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) { + IFA_UNLOCK(&ifa->ia_ifa); continue; - - if (find_pfxlist_reachable_router(ifa->ia6_ndpr)) - ifa->ia6_flags &= ~IN6_IFF_DETACHED; - else + } + if ((ndpr = ifa->ia6_ndpr) == NULL) { + /* XXX: see above. */ + IFA_UNLOCK(&ifa->ia_ifa); + continue; + } + NDPR_ADDREF(ndpr); + IFA_UNLOCK(&ifa->ia_ifa); + NDPR_LOCK(ndpr); + if (find_pfxlist_reachable_router(ndpr)) { + NDPR_UNLOCK(ndpr); + IFA_LOCK(&ifa->ia_ifa); + if (ifa->ia6_flags & IN6_IFF_DETACHED) { + ifa->ia6_flags &= ~IN6_IFF_DETACHED; + ifa->ia6_flags |= IN6_IFF_TENTATIVE; + IFA_UNLOCK(&ifa->ia_ifa); + nd6_dad_start((struct ifaddr *)ifa, 0); + } else { + IFA_UNLOCK(&ifa->ia_ifa); + } + } else { + NDPR_UNLOCK(ndpr); + IFA_LOCK(&ifa->ia_ifa); ifa->ia6_flags |= IN6_IFF_DETACHED; + IFA_UNLOCK(&ifa->ia_ifa); + } + NDPR_REMREF(ndpr); } } else { for (ifa = in6_ifaddrs; ifa; ifa = ifa->ia_next) { - if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) + IFA_LOCK(&ifa->ia_ifa); + if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0) { + IFA_UNLOCK(&ifa->ia_ifa); continue; + } + if (ifa->ia6_flags & IN6_IFF_DETACHED) { + ifa->ia6_flags &= ~IN6_IFF_DETACHED; + ifa->ia6_flags |= IN6_IFF_TENTATIVE; + IFA_UNLOCK(&ifa->ia_ifa); + /* Do we need a delay in this case? */ + nd6_dad_start((struct ifaddr *)ifa, 0); + } else { + IFA_UNLOCK(&ifa->ia_ifa); + } + } + } + lck_rw_done(&in6_ifaddr_rwlock); +} + +static struct nd_prefix * +nd6_prefix_equal_lookup(struct nd_prefix *pr, boolean_t primary_only) +{ + struct nd_prefix *opr; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + + for (opr = nd_prefix.lh_first; opr; opr = opr->ndpr_next) { + if (opr == pr) + continue; + + NDPR_LOCK(opr); + if ((opr->ndpr_stateflags & NDPRF_ONLINK) == 0) { + NDPR_UNLOCK(opr); + continue; + } + if (opr->ndpr_plen == pr->ndpr_plen && + in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr, + &opr->ndpr_prefix.sin6_addr, pr->ndpr_plen) && + (!primary_only || + !(opr->ndpr_stateflags & NDPRF_IFSCOPE))) { + NDPR_ADDREF_LOCKED(opr); + NDPR_UNLOCK(opr); + return (opr); + } + NDPR_UNLOCK(opr); + } + return (NULL); +} + +/* + * Synchronize the interface routes of similar prefixes on different + * interfaces; the one using the default interface would be (re)installed + * as a primary/non-scoped entry, and the rest as scoped entries.
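+ *
+ * In outline, the promotion performed below is a remove-then-reinsert
+ * dance (a summary of the function that follows, not extra steps):
+ *
+ *	opr = nd6_prefix_equal_lookup(pr, TRUE);    // current primary
+ *	nd6_prefix_offlink(opr);                    // pull old primary
+ *	nd6_prefix_offlink(pr);                     // pull scoped winner
+ *	nd6_prefix_onlink_scoped(opr, opr->ndpr_ifp->if_index);
+ *	                                            // re-add old, scoped
+ *	nd6_prefix_onlink_scoped(pr, IFSCOPE_NONE); // install new primary
+ *
+ * with nd6_mutex dropped around each route operation.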
*/ +static void +nd6_prefix_sync(struct ifnet *ifp) +{ + struct nd_prefix *pr, *opr; + int err = 0; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + + if (!ip6_doscopedroute || ifp == NULL) + return; - ifa->ia6_flags &= ~IN6_IFF_DETACHED; + for (pr = nd_prefix.lh_first; pr; pr = pr->ndpr_next) { + NDPR_LOCK(pr); + if (!(pr->ndpr_stateflags & NDPRF_ONLINK)) { + NDPR_UNLOCK(pr); + continue; + } + if (pr->ndpr_ifp == ifp && + (pr->ndpr_stateflags & NDPRF_IFSCOPE) && + !IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) { + NDPR_UNLOCK(pr); + break; } + NDPR_UNLOCK(pr); } - if (nd6locked == 0) + + if (pr == NULL) + return; + + /* Remove conflicting entries */ + opr = nd6_prefix_equal_lookup(pr, TRUE); + if (opr != NULL) { lck_mtx_unlock(nd6_mutex); + err = nd6_prefix_offlink(opr); + lck_mtx_lock(nd6_mutex); + if (err != 0) { + nd6log((LOG_ERR, + "%s: failed to make %s/%d offlink on %s, " + "errno=%d\n", __func__, + ip6_sprintf(&opr->ndpr_prefix.sin6_addr), + opr->ndpr_plen, if_name(opr->ndpr_ifp), err)); + } + } else { + nd6log((LOG_ERR, + "%s: scoped %s/%d on %s has no matching unscoped prefix\n", + __func__, ip6_sprintf(&pr->ndpr_prefix.sin6_addr), + pr->ndpr_plen, if_name(pr->ndpr_ifp))); + } + + lck_mtx_unlock(nd6_mutex); + err = nd6_prefix_offlink(pr); + lck_mtx_lock(nd6_mutex); + if (err != 0) { + nd6log((LOG_ERR, + "%s: failed to make %s/%d offlink on %s, errno=%d\n", + __func__, ip6_sprintf(&pr->ndpr_prefix.sin6_addr), + pr->ndpr_plen, if_name(pr->ndpr_ifp), err)); + } + + /* Add the entries back */ + if (opr != NULL) { + err = nd6_prefix_onlink_scoped(opr, opr->ndpr_ifp->if_index); + if (err != 0) { + nd6log((LOG_ERR, + "%s: failed to make %s/%d scoped onlink on %s, " + "errno=%d\n", __func__, + ip6_sprintf(&opr->ndpr_prefix.sin6_addr), + opr->ndpr_plen, if_name(opr->ndpr_ifp), err)); + } + } + + err = nd6_prefix_onlink_scoped(pr, IFSCOPE_NONE); + if (err != 0) { + nd6log((LOG_ERR, + "%s: failed to make %s/%d onlink on %s, errno=%d\n", + __func__, ip6_sprintf(&pr->ndpr_prefix.sin6_addr), + pr->ndpr_plen, if_name(pr->ndpr_ifp), err)); + } + + if (err != 0) { + nd6log((LOG_ERR, + "%s: error promoting %s/%d to %s from %s\n", + __func__, ip6_sprintf(&pr->ndpr_prefix.sin6_addr), + pr->ndpr_plen, if_name(pr->ndpr_ifp), + (opr != NULL) ? if_name(opr->ndpr_ifp) : "NONE")); + } else { + nd6log2((LOG_INFO, + "%s: %s/%d promoted, previously on %s\n", + if_name(pr->ndpr_ifp), + ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, + (opr != NULL) ? if_name(opr->ndpr_ifp) : "NONE")); + } + + if (opr != NULL) + NDPR_REMREF(opr); } -int -nd6_prefix_onlink( - struct nd_prefix *pr, int rtlocked, int nd6locked) +static int +nd6_prefix_onlink_common(struct nd_prefix *pr, boolean_t force_scoped, + unsigned int ifscope) { struct ifaddr *ifa; struct ifnet *ifp = pr->ndpr_ifp; - struct sockaddr_in6 mask6; + struct sockaddr_in6 mask6, prefix; struct nd_prefix *opr; u_int32_t rtflags; int error = 0; struct rtentry *rt = NULL; + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); + /* sanity check */ + NDPR_LOCK(pr); if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) { nd6log((LOG_ERR, - "nd6_prefix_onlink: %s/%d is already on-link\n", - ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen)); - return(EEXIST); + "nd6_prefix_onlink: %s/%d on %s scoped=%d is already " + "on-link\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr), + pr->ndpr_plen, if_name(pr->ndpr_ifp), + (pr->ndpr_stateflags & NDPRF_IFSCOPE) ?
1 : 0)); + NDPR_UNLOCK(pr); + return (EEXIST); } + NDPR_UNLOCK(pr); /* * Add the interface route associated with the prefix. Before * installing the route, check if there's the same prefix on another * interface, and the prefix has already installed the interface route. - * Although such a configuration is expected to be rare, we explicitly - * allow it. */ - if (nd6locked == 0) - lck_mtx_lock(nd6_mutex); - else - lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); - for (opr = nd_prefix.lh_first; opr; opr = opr->ndpr_next) { - if (opr == pr) - continue; - - if ((opr->ndpr_stateflags & NDPRF_ONLINK) == 0) - continue; - - if (opr->ndpr_plen == pr->ndpr_plen && - in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr, - &opr->ndpr_prefix.sin6_addr, - pr->ndpr_plen)) { - if (nd6locked == 0) - lck_mtx_unlock(nd6_mutex); - return(0); - } + opr = nd6_prefix_equal_lookup(pr, FALSE); + if (opr != NULL) + NDPR_REMREF(opr); + + if (!ip6_doscopedroute) { + /* if an interface route already exists, just return */ + if (opr != NULL) + return (0); + ifscope = IFSCOPE_NONE; + } else if (!force_scoped) { + /* + * If a primary/non-scoped interface route already exists, + * install the new one as a scoped entry. If the existing + * interface route is scoped, install new as non-scoped. + */ + ifscope = (opr != NULL) ? ifp->if_index : IFSCOPE_NONE; + opr = nd6_prefix_equal_lookup(pr, TRUE); + if (opr != NULL) + NDPR_REMREF(opr); + else if (ifscope != IFSCOPE_NONE) + ifscope = IFSCOPE_NONE; } - if (nd6locked == 0) - lck_mtx_unlock(nd6_mutex); /* - * We prefer link-local addresses as the associated interface address. + * We prefer link-local addresses as the associated interface address. */ /* search for a link-local addr */ ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY| IN6_IFF_ANYCAST); if (ifa == NULL) { - /* XXX: freebsd does not have ifa_ifwithaf */ - ifnet_lock_exclusive(ifp); - TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list) - { - if (ifa->ifa_addr->sa_family == AF_INET6) - break; - } - if (ifa != NULL) - ifaref(ifa); + struct in6_ifaddr *ia6; + ifnet_lock_shared(ifp); + IFP_TO_IA6(ifp, ia6); ifnet_lock_done(ifp); + if (ia6 != NULL) + ifa = &ia6->ia_ifa; /* should we care about ia6_flags? */ } + NDPR_LOCK(pr); if (ifa == NULL) { /* * This can still happen, when, for example, we receive an RA @@ -1735,7 +3310,8 @@ nd6_prefix_onlink( " to add route for a prefix(%s/%d) on %s\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(ifp))); - return(0); + NDPR_UNLOCK(pr); + return (0); } /* @@ -1745,11 +3321,12 @@ nd6_prefix_onlink( bzero(&mask6, sizeof(mask6)); mask6.sin6_len = sizeof(mask6); mask6.sin6_addr = pr->ndpr_mask; + prefix = pr->ndpr_prefix; + NDPR_UNLOCK(pr); - if (rtlocked == 0) - lck_mtx_lock(rnh_lock); - + IFA_LOCK_SPIN(ifa); rtflags = ifa->ifa_flags | RTF_CLONING | RTF_UP; + IFA_UNLOCK(ifa); if (nd6_need_cache(ifp)) { /* explicitly set in case ifa_flags does not set the flag.
*/ rtflags |= RTF_CLONING; @@ -1759,54 +3336,81 @@ nd6_prefix_onlink( */ rtflags &= ~RTF_CLONING; } - error = rtrequest_locked(RTM_ADD, (struct sockaddr *)&pr->ndpr_prefix, - ifa->ifa_addr, (struct sockaddr *)&mask6, - rtflags, &rt); - if (error == 0) { - if (rt != NULL) { /* this should be non NULL, though */ - RT_LOCK(rt); - nd6_rtmsg(RTM_ADD, rt); - RT_UNLOCK(rt); - } - pr->ndpr_stateflags |= NDPRF_ONLINK; - } - else { + + lck_mtx_unlock(nd6_mutex); + + error = rtrequest_scoped(RTM_ADD, (struct sockaddr *)&prefix, + ifa->ifa_addr, (struct sockaddr *)&mask6, rtflags, &rt, + ifscope); + + if (rt != NULL) { + RT_LOCK(rt); + nd6_rtmsg(RTM_ADD, rt); + RT_UNLOCK(rt); + RT_REMREF(rt); + } else { + NDPR_LOCK(pr); nd6log((LOG_ERR, "nd6_prefix_onlink: failed to add route for a" - " prefix (%s/%d) on %s, gw=%s, mask=%s, flags=%lx " - "errno = %d\n", + " prefix (%s/%d) on %s, gw=%s, mask=%s, flags=%lx," + " scoped=%d, errno = %d\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen, if_name(ifp), ip6_sprintf(&((struct sockaddr_in6 *)ifa->ifa_addr)->sin6_addr), - ip6_sprintf(&mask6.sin6_addr), rtflags, error)); + ip6_sprintf(&mask6.sin6_addr), rtflags, + (ifscope != IFSCOPE_NONE), error)); + NDPR_UNLOCK(pr); } - if (rt != NULL) - RT_REMREF(rt); + lck_mtx_lock(nd6_mutex); + + NDPR_LOCK(pr); + pr->ndpr_stateflags &= ~NDPRF_IFSCOPE; + if (rt != NULL || error == EEXIST) { + pr->ndpr_stateflags |= NDPRF_ONLINK; + if (ifscope != IFSCOPE_NONE) + pr->ndpr_stateflags |= NDPRF_IFSCOPE; + } + NDPR_UNLOCK(pr); - if (rtlocked == 0) - lck_mtx_unlock(rnh_lock); + IFA_REMREF(ifa); - ifafree(ifa); + return (error); +} - return(error); +int +nd6_prefix_onlink(struct nd_prefix *pr) +{ + return (nd6_prefix_onlink_common(pr, FALSE, IFSCOPE_NONE)); } int -nd6_prefix_offlink( - struct nd_prefix *pr) +nd6_prefix_onlink_scoped(struct nd_prefix *pr, unsigned int ifscope) { - int error = 0; + return (nd6_prefix_onlink_common(pr, TRUE, ifscope)); +} + +int +nd6_prefix_offlink(struct nd_prefix *pr) +{ + int plen, error = 0; struct ifnet *ifp = pr->ndpr_ifp; struct nd_prefix *opr; - struct sockaddr_in6 sa6, mask6; + struct sockaddr_in6 sa6, mask6, prefix; struct rtentry *rt = NULL; + unsigned int ifscope; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); /* sanity check */ + NDPR_LOCK(pr); if ((pr->ndpr_stateflags & NDPRF_ONLINK) == 0) { nd6log((LOG_ERR, - "nd6_prefix_offlink: %s/%d is already off-link\n", - ip6_sprintf(&pr->ndpr_prefix.sin6_addr), pr->ndpr_plen)); - return(EEXIST); + "nd6_prefix_offlink: %s/%d on %s scoped=%d is already " + "off-link\n", ip6_sprintf(&pr->ndpr_prefix.sin6_addr), + pr->ndpr_plen, if_name(pr->ndpr_ifp), + (pr->ndpr_stateflags & NDPRF_IFSCOPE) ? 1 : 0)); + NDPR_UNLOCK(pr); + return (EEXIST); } bzero(&sa6, sizeof(sa6)); @@ -1818,48 +3422,66 @@ nd6_prefix_offlink( mask6.sin6_family = AF_INET6; mask6.sin6_len = sizeof(sa6); bcopy(&pr->ndpr_mask, &mask6.sin6_addr, sizeof(struct in6_addr)); - lck_mtx_lock(rnh_lock); - error = rtrequest_locked(RTM_DELETE, (struct sockaddr *)&sa6, NULL, - (struct sockaddr *)&mask6, 0, &rt); - if (error == 0) { - pr->ndpr_stateflags &= ~NDPRF_ONLINK; + prefix = pr->ndpr_prefix; + plen = pr->ndpr_plen; + NDPR_UNLOCK(pr); + ifscope = (pr->ndpr_stateflags & NDPRF_IFSCOPE) ? + ifp->if_index : IFSCOPE_NONE; + + error = rtrequest_scoped(RTM_DELETE, (struct sockaddr *)&sa6, + NULL, (struct sockaddr *)&mask6, 0, &rt, ifscope); + + if (rt != NULL) { /* report the route deletion to the routing socket. 
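 *
 * nd6_rtmsg() is what makes this deletion visible to routing-socket
 * listeners.  A minimal user-space observer of such events might look
 * like the following sketch (error handling omitted; it relies only on
 * the standard PF_ROUTE interface):
 *
 *	#include <net/route.h>
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	char buf[2048];
 *	ssize_t n;
 *	int s = socket(PF_ROUTE, SOCK_RAW, 0);
 *
 *	while ((n = read(s, buf, sizeof (buf))) > 0) {
 *		struct rt_msghdr *rtm = (struct rt_msghdr *)(void *)buf;
 *		if (rtm->rtm_type == RTM_DELETE)
 *			;	// a route (perhaps this prefix) went away
 *	}
 *
 * which is how tools in the "netstat -rn" family stay in sync.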
*/ - if (rt != NULL) { - RT_LOCK(rt); - nd6_rtmsg(RTM_DELETE, rt); - RT_UNLOCK(rt); - } + RT_LOCK(rt); + nd6_rtmsg(RTM_DELETE, rt); + RT_UNLOCK(rt); + rtfree(rt); /* - * There might be the same prefix on another interface, - * the prefix which could not be on-link just because we have - * the interface route (see comments in nd6_prefix_onlink). - * If there's one, try to make the prefix on-link on the - * interface. + * The following check takes place only when Scoped Routing + * is not enabled. There might be the same prefix on another + * interface, the prefix which could not be on-link just + * because we have the interface route (see comments in + * nd6_prefix_onlink). If there's one, try to make the prefix + * on-link on the interface. */ - lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_OWNED); - for (opr = nd_prefix.lh_first; opr; opr = opr->ndpr_next) { - if (opr == pr) - continue; + lck_mtx_lock(nd6_mutex); + opr = nd_prefix.lh_first; + while (opr) { + /* does not apply in the Scoped Routing case */ + if (ip6_doscopedroute) + break; - if ((opr->ndpr_stateflags & NDPRF_ONLINK) != 0) + if (opr == pr) { + opr = opr->ndpr_next; continue; + } + NDPR_LOCK(opr); + if ((opr->ndpr_stateflags & NDPRF_ONLINK) != 0) { + NDPR_UNLOCK(opr); + opr = opr->ndpr_next; + continue; + } /* * KAME specific: detached prefixes should not be * on-link. */ - if ((opr->ndpr_stateflags & NDPRF_DETACHED) != 0) + if ((opr->ndpr_stateflags & NDPRF_DETACHED) != 0) { + NDPR_UNLOCK(opr); + opr = opr->ndpr_next; continue; - - if (opr->ndpr_plen == pr->ndpr_plen && - in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr, - &opr->ndpr_prefix.sin6_addr, - pr->ndpr_plen)) { + } + if (opr->ndpr_plen == plen && + in6_are_prefix_equal(&prefix.sin6_addr, + &opr->ndpr_prefix.sin6_addr, plen)) { int e; - if ((e = nd6_prefix_onlink(opr, 1, 1)) != 0) { + NDPR_UNLOCK(opr); + lck_mtx_unlock(nd6_mutex); + if ((e = nd6_prefix_onlink(opr)) != 0) { nd6log((LOG_ERR, "nd6_prefix_offlink: failed to " "recover a prefix %s/%d from %s " @@ -1868,40 +3490,41 @@ nd6_prefix_offlink( opr->ndpr_plen, if_name(ifp), if_name(opr->ndpr_ifp), e)); } + lck_mtx_lock(nd6_mutex); + opr = nd_prefix.lh_first; + } else { + NDPR_UNLOCK(opr); + opr = opr->ndpr_next; } } - } - else { - /* XXX: can we still set the NDPRF_ONLINK flag? */ + lck_mtx_unlock(nd6_mutex); + } else { nd6log((LOG_ERR, "nd6_prefix_offlink: failed to delete route: " - "%s/%d on %s (errno = %d)\n", - ip6_sprintf(&sa6.sin6_addr), pr->ndpr_plen, if_name(ifp), - error)); + "%s/%d on %s, scoped %d, (errno = %d)\n", + ip6_sprintf(&sa6.sin6_addr), plen, if_name(ifp), + (ifscope != IFSCOPE_NONE), error)); } - if (rt != NULL) - rtfree_locked(rt); - - lck_mtx_unlock(rnh_lock); + NDPR_LOCK(pr); + pr->ndpr_stateflags &= ~(NDPRF_ONLINK | NDPRF_IFSCOPE); + NDPR_UNLOCK(pr); - return(error); + return (error); } static struct in6_ifaddr * in6_ifadd( struct nd_prefix *pr, - struct in6_addr *ifid) /* Mobile IPv6 addition */ + int mcast) { struct ifnet *ifp = pr->ndpr_ifp; - struct ifaddr *ifa; struct in6_aliasreq ifra; struct in6_ifaddr *ia, *ib; int error, plen0; + int updateflags; struct in6_addr mask; - int prefixlen = pr->ndpr_plen; - - in6_len2mask(&mask, prefixlen); + int prefixlen; /* * find a link-local address (will be interface ID). @@ -1915,41 +3538,32 @@ in6_ifadd( * (2) RFC2462 5.4 suggesting the use of the same interface identifier * for multiple addresses on a single interface, and possible shortcut * of DAD. we omitted DAD for this reason in the past. 
- * (3) a user can prevent autoconfiguration of global address + * (3) a user can prevent autoconfiguration of global address * by removing link-local address by hand (this is partly because we - * don't have other way to control the use of IPv6 on a interface. + * don't have other way to control the use of IPv6 on an interface. * this has been our design choice - cf. NRL's "ifconfig auto"). * (4) it is easier to manage when an interface has addresses * with the same interface identifier, than to have multiple addresses * with different interface identifiers. - * - * Mobile IPv6 addition: allow for caller to specify a wished interface - * ID. This is to not break connections when moving addresses between - * interfaces. */ - ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, 0);/* 0 is OK? */ - if (ifa) - ib = (struct in6_ifaddr *)ifa; - else - return NULL; - -#if 0 /* don't care link local addr state, and always do DAD */ - /* if link-local address is not eligible, do not autoconfigure. */ - if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) { - printf("in6_ifadd: link-local address not ready\n"); - ifafree(ifa); - return NULL; - } -#endif + ib = in6ifa_ifpforlinklocal(ifp, 0);/* 0 is OK? */ + if (ib == NULL) + return (NULL); - /* prefixlen + ifidlen must be equal to 128 */ + IFA_LOCK(&ib->ia_ifa); + NDPR_LOCK(pr); + prefixlen = pr->ndpr_plen; + in6_len2mask(&mask, prefixlen); plen0 = in6_mask2len(&ib->ia_prefixmask.sin6_addr, NULL); + /* prefixlen + ifidlen must be equal to 128 */ if (prefixlen != plen0) { nd6log((LOG_INFO, "in6_ifadd: wrong prefixlen for %s " "(prefix=%d ifid=%d)\n", if_name(ifp), prefixlen, 128 - plen0)); - ifafree(ifa); - return NULL; + NDPR_UNLOCK(pr); + IFA_UNLOCK(&ib->ia_ifa); + IFA_REMREF(&ib->ia_ifa); + return (NULL); } /* make ifaddr */ @@ -1971,53 +3585,57 @@ in6_ifadd( ifra.ifra_addr.sin6_addr.s6_addr32[3] &= mask.s6_addr32[3]; /* interface ID */ - if (ifid == NULL || IN6_IS_ADDR_UNSPECIFIED(ifid)) - ifid = &ib->ia_addr.sin6_addr; - ifra.ifra_addr.sin6_addr.s6_addr32[0] - |= (ifid->s6_addr32[0] & ~mask.s6_addr32[0]); - ifra.ifra_addr.sin6_addr.s6_addr32[1] - |= (ifid->s6_addr32[1] & ~mask.s6_addr32[1]); - ifra.ifra_addr.sin6_addr.s6_addr32[2] - |= (ifid->s6_addr32[2] & ~mask.s6_addr32[2]); - ifra.ifra_addr.sin6_addr.s6_addr32[3] - |= (ifid->s6_addr32[3] & ~mask.s6_addr32[3]); - + ifra.ifra_addr.sin6_addr.s6_addr32[0] |= + (ib->ia_addr.sin6_addr.s6_addr32[0] & ~mask.s6_addr32[0]); + ifra.ifra_addr.sin6_addr.s6_addr32[1] |= + (ib->ia_addr.sin6_addr.s6_addr32[1] & ~mask.s6_addr32[1]); + ifra.ifra_addr.sin6_addr.s6_addr32[2] |= + (ib->ia_addr.sin6_addr.s6_addr32[2] & ~mask.s6_addr32[2]); + ifra.ifra_addr.sin6_addr.s6_addr32[3] |= + (ib->ia_addr.sin6_addr.s6_addr32[3] & ~mask.s6_addr32[3]); + /* new prefix mask. */ ifra.ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); ifra.ifra_prefixmask.sin6_family = AF_INET6; bcopy(&mask, &ifra.ifra_prefixmask.sin6_addr, sizeof(ifra.ifra_prefixmask.sin6_addr)); - /* - * lifetime. - * XXX: in6_init_address_ltimes would override these values later. - * We should reconsider this logic. - */ + /* lifetimes. */ ifra.ifra_lifetime.ia6t_vltime = pr->ndpr_vltime; ifra.ifra_lifetime.ia6t_pltime = pr->ndpr_pltime; /* XXX: scope zone ID? */ ifra.ifra_flags |= IN6_IFF_AUTOCONF; /* obey autoconf */ + + NDPR_UNLOCK(pr); + IFA_UNLOCK(&ib->ia_ifa); + IFA_REMREF(&ib->ia_ifa); + /* - * temporarily set the nopfx flag to avoid conflict. - * XXX: we should reconsider the entire mechanism about prefix - * manipulation. 
+ * Make sure that we do not have this address already. This should + * usually not happen, but we can still see this case, e.g., if we + * have manually configured the exact address to be configured. */ - ifra.ifra_flags |= IN6_IFF_NOPFX; + if ((ib = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr)) != NULL) { + IFA_REMREF(&ib->ia_ifa); + /* this should be rare enough to make an explicit log */ + log(LOG_INFO, "in6_ifadd: %s is already configured\n", + ip6_sprintf(&ifra.ifra_addr.sin6_addr)); + return (NULL); + } /* - * keep the new address, regardless of the result of in6_update_ifa. - * XXX: this address is now meaningless. - * We should reconsider its role. + * Allocate ifaddr structure, link into chain, etc. + * If we are going to create a new address upon receiving a multicasted + * RA, we need to impose a random delay before starting DAD. + * [draft-ietf-ipv6-rfc2462bis-02.txt, Section 5.4.2] */ - pr->ndpr_addr = ifra.ifra_addr.sin6_addr; - - ifafree(ifa); - ifa = NULL; - - /* allocate ifaddr structure, link into chain, etc. */ - if ((error = in6_update_ifa(ifp, &ifra, NULL, M_NOWAIT)) != 0) { + updateflags = 0; + if (mcast) + updateflags |= IN6_IFAUPDATE_DADDELAY; + error = in6_update_ifa(ifp, &ifra, NULL, updateflags, M_WAITOK); + if (error != 0) { nd6log((LOG_ERR, "in6_ifadd: failed to make ifaddr %s on %s (errno=%d)\n", ip6_sprintf(&ifra.ifra_addr.sin6_addr), if_name(ifp), @@ -2032,6 +3650,8 @@ in6_ifadd( return(ia); /* this must NOT be NULL. */ } +#define IA6_NONCONST(i) ((struct in6_ifaddr *)(uintptr_t)(i)) + int in6_tmpifadd( const struct in6_ifaddr *ia0, /* corresponding public address */ @@ -2043,14 +3663,18 @@ in6_tmpifadd( struct in6_aliasreq ifra; int i, error; int trylimit = 3; /* XXX: adhoc value */ + int updateflags; u_int32_t randid[2]; time_t vltime0, pltime0; struct timeval timenow; + struct in6_addr addr; + struct nd_prefix *ndpr; getmicrotime(&timenow); bzero(&ifra, sizeof(ifra)); strncpy(ifra.ifra_name, if_name(ifp), sizeof(ifra.ifra_name)); + IFA_LOCK(&IA6_NONCONST(ia0)->ia_ifa); ifra.ifra_addr = ia0->ia_addr; /* copy prefix mask */ ifra.ifra_prefixmask = ia0->ia_prefixmask; @@ -2059,24 +3683,26 @@ in6_tmpifadd( ifra.ifra_addr.sin6_addr.s6_addr32[i] &= ifra.ifra_prefixmask.sin6_addr.s6_addr32[i]; } + addr = ia0->ia_addr.sin6_addr; + IFA_UNLOCK(&IA6_NONCONST(ia0)->ia_ifa); - again: +again: in6_get_tmpifid(ifp, (u_int8_t *)randid, - (const u_int8_t *)&ia0->ia_addr.sin6_addr.s6_addr[8], - forcegen); - ifra.ifra_addr.sin6_addr.s6_addr32[2] - |= (randid[0] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[2])); - ifra.ifra_addr.sin6_addr.s6_addr32[3] - |= (randid[1] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[3])); + (const u_int8_t *)&addr.s6_addr[8], forcegen); + + ifra.ifra_addr.sin6_addr.s6_addr32[2] |= + (randid[0] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[2])); + ifra.ifra_addr.sin6_addr.s6_addr32[3] |= + (randid[1] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[3])); /* - * If by chance the new temporary address is the same as an address - * already assigned to the interface, generate a new randomized - * interface identifier and repeat this step. - * RFC 3041 3.3 (4). + * in6_get_tmpifid() quite likely provided a unique interface ID. + * However, we may still have a chance to see collision, because + * there may be a time lag between generation of the ID and generation + * of the address. So, we'll do one more sanity check. 
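+	 * The retry policy amounts to the following sketch, where
+	 * generate_random_iid() and address_exists() stand in for
+	 * in6_get_tmpifid() and in6ifa_ifpwithaddr():
+	 *
+	 *	static int
+	 *	pick_unique_iid(struct ifnet *ifp, struct in6_addr *out)
+	 *	{
+	 *		int tries;
+	 *
+	 *		for (tries = 0; tries < 3; tries++) {	// ad hoc cap
+	 *			generate_random_iid(out);
+	 *			if (!address_exists(ifp, out))
+	 *				return (0);	// unique: keep it
+	 *		}
+	 *		return (EEXIST);	// caller gives up
+	 *	}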
*/ if ((ia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr)) != NULL) { - ifafree(&ia->ia_ifa); + IFA_REMREF(&ia->ia_ifa); if (trylimit-- == 0) { nd6log((LOG_NOTICE, "in6_tmpifadd: failed to find " "a unique random IFID\n")); @@ -2093,25 +3719,22 @@ in6_tmpifadd( * of the public address or TEMP_PREFERRED_LIFETIME - * DESYNC_FACTOR. */ - if (ia0->ia6_lifetime.ia6t_expire != 0) { - vltime0 = IFA6_IS_INVALID(ia0) ? 0 : - (ia0->ia6_lifetime.ia6t_expire - timenow.tv_sec); - if (vltime0 > ip6_temp_valid_lifetime) - vltime0 = ip6_temp_valid_lifetime; - } else + IFA_LOCK(&IA6_NONCONST(ia0)->ia_ifa); + vltime0 = IFA6_IS_INVALID(ia0) + ? 0 + : (ia0->ia6_lifetime.ia6t_vltime - + (timenow.tv_sec - ia0->ia6_updatetime)); + if (vltime0 > ip6_temp_valid_lifetime) vltime0 = ip6_temp_valid_lifetime; - if (ia0->ia6_lifetime.ia6t_preferred != 0) { - pltime0 = IFA6_IS_DEPRECATED(ia0) ? 0 : - (ia0->ia6_lifetime.ia6t_preferred - timenow.tv_sec); - if (pltime0 > ip6_temp_preferred_lifetime - ip6_desync_factor){ - pltime0 = ip6_temp_preferred_lifetime - - ip6_desync_factor; - } - } else + pltime0 = IFA6_IS_DEPRECATED(ia0) + ? 0 + : (ia0->ia6_lifetime.ia6t_pltime - + (timenow.tv_sec - ia0->ia6_updatetime)); + if (pltime0 > ip6_temp_preferred_lifetime - ip6_desync_factor) pltime0 = ip6_temp_preferred_lifetime - ip6_desync_factor; ifra.ifra_lifetime.ia6t_vltime = vltime0; ifra.ifra_lifetime.ia6t_pltime = pltime0; - + IFA_UNLOCK(&IA6_NONCONST(ia0)->ia_ifa); /* * A temporary address is created only if this calculated Preferred * Lifetime is greater than REGEN_ADVANCE time units. @@ -2124,8 +3747,13 @@ in6_tmpifadd( ifra.ifra_flags |= (IN6_IFF_AUTOCONF|IN6_IFF_TEMPORARY); /* allocate ifaddr structure, link into chain, etc. */ - if ((error = in6_update_ifa(ifp, &ifra, NULL, how)) != 0) - return(error); + updateflags = 0; + + if (how) + updateflags |= IN6_IFAUPDATE_DADDELAY; + + if ((error = in6_update_ifa(ifp, &ifra, NULL, updateflags, how)) != 0) + return (error); newia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr); if (newia == NULL) { /* XXX: can it happen? */ @@ -2134,9 +3762,37 @@ in6_tmpifadd( "no ifaddr\n")); return(EINVAL); /* XXX */ } - lck_mtx_lock(nd6_mutex); - newia->ia6_ndpr = ia0->ia6_ndpr; - newia->ia6_ndpr->ndpr_refcnt++; + IFA_LOCK(&IA6_NONCONST(ia0)->ia_ifa); + ndpr = ia0->ia6_ndpr; + if (ndpr == NULL) { + /* + * We lost the race with another thread that has purged + * ia0 address; in this case, purge the tmp addr as well. + */ + nd6log((LOG_ERR, "in6_tmpifadd: no public address\n")); + VERIFY(!(ia0->ia6_flags & IN6_IFF_AUTOCONF)); + IFA_UNLOCK(&IA6_NONCONST(ia0)->ia_ifa); + in6_purgeaddr(&newia->ia_ifa); + IFA_REMREF(&newia->ia_ifa); + return (EADDRNOTAVAIL); + } + NDPR_ADDREF(ndpr); /* for us */ + IFA_UNLOCK(&IA6_NONCONST(ia0)->ia_ifa); + IFA_LOCK(&newia->ia_ifa); + if (newia->ia6_ndpr != NULL) { + NDPR_LOCK(newia->ia6_ndpr); + VERIFY(newia->ia6_ndpr->ndpr_addrcnt != 0); + newia->ia6_ndpr->ndpr_addrcnt--; + NDPR_UNLOCK(newia->ia6_ndpr); + NDPR_REMREF(newia->ia6_ndpr); /* release addr reference */ + } + newia->ia6_ndpr = ndpr; + NDPR_LOCK(newia->ia6_ndpr); + newia->ia6_ndpr->ndpr_addrcnt++; + VERIFY(newia->ia6_ndpr->ndpr_addrcnt != 0); + NDPR_ADDREF_LOCKED(newia->ia6_ndpr); /* for addr reference */ + NDPR_UNLOCK(newia->ia6_ndpr); + IFA_UNLOCK(&newia->ia_ifa); /* * A newly added address might affect the status of other addresses. 
* XXX: when the temporary address is generated with a new public @@ -2145,18 +3801,25 @@ in6_tmpifadd( * and, in fact, we surely need the check when we create a new * temporary address due to deprecation of an old temporary address. */ - pfxlist_onlink_check(1); + lck_mtx_lock(nd6_mutex); + pfxlist_onlink_check(); lck_mtx_unlock(nd6_mutex); - ifafree(&newia->ia_ifa); + IFA_REMREF(&newia->ia_ifa); + + /* remove our reference */ + NDPR_REMREF(ndpr); return(0); -} +} +#undef IA6_NONCONST int in6_init_prefix_ltimes(struct nd_prefix *ndpr) { struct timeval timenow; + NDPR_LOCK_ASSERT_HELD(ndpr); + getmicrotime(&timenow); /* check if preferred lifetime > valid lifetime. RFC2462 5.5.3 (c) */ if (ndpr->ndpr_pltime > ndpr->ndpr_vltime) { @@ -2178,14 +3841,15 @@ in6_init_prefix_ltimes(struct nd_prefix *ndpr) } static void -in6_init_address_ltimes(__unused struct nd_prefix *new, struct in6_addrlifetime *lt6) +in6_init_address_ltimes(__unused struct nd_prefix *new, + struct in6_addrlifetime *lt6, boolean_t is_temporary) { struct timeval timenow; getmicrotime(&timenow); /* Valid lifetime must not be updated unless explicitly specified. */ /* init ia6t_expire */ - if (lt6->ia6t_vltime == ND6_INFINITE_LIFETIME) + if (!is_temporary && lt6->ia6t_vltime == ND6_INFINITE_LIFETIME) lt6->ia6t_expire = 0; else { lt6->ia6t_expire = timenow.tv_sec; @@ -2193,7 +3857,7 @@ in6_init_address_ltimes(__unused struct nd_prefix *new, struct in6_addrlifetime } /* init ia6t_preferred */ - if (lt6->ia6t_pltime == ND6_INFINITE_LIFETIME) + if (!is_temporary && lt6->ia6t_pltime == ND6_INFINITE_LIFETIME) lt6->ia6t_preferred = 0; else { lt6->ia6t_preferred = timenow.tv_sec; @@ -2281,6 +3945,8 @@ nd6_setdefaultiface( { int error = 0; ifnet_t def_ifp = NULL; + + lck_mtx_assert(nd6_mutex, LCK_MTX_ASSERT_NOTOWNED); ifnet_head_lock_shared(); if (ifindex < 0 || if_index < ifindex) { @@ -2292,12 +3958,21 @@ nd6_setdefaultiface( lck_mtx_lock(nd6_mutex); if (nd6_defifindex != ifindex) { + struct ifnet *odef_ifp = nd6_defifp; + nd6_defifindex = ifindex; if (nd6_defifindex > 0) nd6_defifp = def_ifp; else nd6_defifp = NULL; + if (nd6_defifp != NULL) + nd6log((LOG_INFO, "%s: is now the default " + "interface (was %s)\n", if_name(nd6_defifp), + odef_ifp != NULL ? if_name(odef_ifp) : "NONE")); + else + nd6log((LOG_INFO, "No default interface set\n")); + /* * If the Default Router List is empty, install a route * to the specified interface as default or remove the default @@ -2306,8 +3981,10 @@ nd6_setdefaultiface( * we do this here to avoid re-install the default route * if the list is NOT empty. 
*/ - if (TAILQ_FIRST(&nd_defrouter) == NULL) - defrouter_select(); + if (ip6_doscopedroute || TAILQ_FIRST(&nd_defrouter) == NULL) { + defrtrlist_sync(nd6_defifp); + nd6_prefix_sync(nd6_defifp); + } /* * Our current implementation assumes one-to-one maping between @@ -2316,7 +3993,7 @@ nd6_setdefaultiface( */ scope6_setdefault(nd6_defifp); } - lck_mtx_unlock(nd6_mutex); + return(error); } diff --git a/bsd/netinet6/raw_ip6.c b/bsd/netinet6/raw_ip6.c index 169b7992d..f5c48648e 100644 --- a/bsd/netinet6/raw_ip6.c +++ b/bsd/netinet6/raw_ip6.c @@ -138,6 +138,7 @@ extern struct inpcbhead ripcb; extern struct inpcbinfo ripcbinfo; extern u_int32_t rip_sendspace; extern u_int32_t rip_recvspace; +extern int ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt); struct rip6stat rip6stat; @@ -149,7 +150,8 @@ struct rip6stat rip6stat; int rip6_input( struct mbuf **mp, - int *offp) + int *offp, + int proto) { struct mbuf *m = *mp; register struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); @@ -157,7 +159,7 @@ rip6_input( struct inpcb *last = 0; struct mbuf *opts = NULL; struct sockaddr_in6 rip6src; - int proto = ip6->ip6_nxt; + int ret; rip6stat.rip6s_ipackets++; @@ -206,11 +208,20 @@ rip6_input( } else #endif /*IPSEC*/ if (n) { - if (last->in6p_flags & IN6P_CONTROLOPTS || - last->in6p_socket->so_options & SO_TIMESTAMP) - ip6_savecontrol(last, &opts, ip6, n); + if ((last->in6p_flags & IN6P_CONTROLOPTS) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + ret = ip6_savecontrol(last, n, &opts); + if (ret != 0) { + m_freem(n); + m_freem(opts); + last = in6p; + continue; + } + } /* strip intermediate headers */ m_adj(n, *offp); + so_recv_data_stat(last->in6p_socket, m, 0); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&rip6src, n, opts, NULL) == 0) { @@ -222,7 +233,7 @@ rip6_input( } last = in6p; } - lck_rw_done(ripcbinfo.mtx); + #if IPSEC /* * Check AH/ESP integrity. 
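The rip6_input() hunks above and below converge on one per-PCB delivery pattern: ancillary data is gathered only when the socket requested it (now including the new SO_TIMESTAMP_MONOTONIC option), the return value of ip6_savecontrol() is checked rather than ignored, and both the data and control chains are freed on failure before delivery moves on. A minimal stand-alone sketch of that pattern follows; deliver_to_pcb() is a hypothetical name, the usual XNU netinet headers are assumed, and the callers above (which only bump rip6s_fullsock) suggest sbappendaddr() disposes of the chains itself when it returns 0.

/*
 * Editor's sketch, not part of the patch: the per-PCB delivery step
 * that the surrounding rip6_input() hunks implement inline.
 */
static int
deliver_to_pcb(struct inpcb *last, struct mbuf *m, struct sockaddr_in6 *src,
    int off)
{
	struct mbuf *opts = NULL;

	/* Gather ancillary data only if the socket asked for it. */
	if ((last->in6p_flags & IN6P_CONTROLOPTS) != 0 ||
	    (last->in6p_socket->so_options &
	    (SO_TIMESTAMP | SO_TIMESTAMP_MONOTONIC)) != 0) {
		if (ip6_savecontrol(last, m, &opts) != 0) {
			/* On failure, free both chains and give up. */
			m_freem(m);
			m_freem(opts);	/* m_freem(NULL) is a no-op */
			return (ENOBUFS);
		}
	}
	m_adj(m, off);			/* strip intermediate headers */
	so_recv_data_stat(last->in6p_socket, m, 0);
	if (sbappendaddr(&last->in6p_socket->so_rcv,
	    (struct sockaddr *)src, m, opts, NULL) == 0) {
		/* socket buffer full; chains presumed freed by callee */
		return (ENOBUFS);
	}
	sorwakeup(last->in6p_socket);
	return (0);
}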
@@ -235,11 +246,21 @@ rip6_input( } else #endif /*IPSEC*/ if (last) { - if (last->in6p_flags & IN6P_CONTROLOPTS || - last->in6p_socket->so_options & SO_TIMESTAMP) - ip6_savecontrol(last, &opts, ip6, m); + if ((last->in6p_flags & IN6P_CONTROLOPTS) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + ret = ip6_savecontrol(last, m, &opts); + if (ret != 0) { + m_freem(m); + m_freem(opts); + ip6stat.ip6s_delivered--; + goto unlock; + } + + } /* strip intermediate headers */ m_adj(m, *offp); + so_recv_data_stat(last->in6p_socket, m, 0); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)&rip6src, m, opts, NULL) == 0) { rip6stat.rip6s_fullsock++; @@ -259,6 +280,10 @@ rip6_input( } ip6stat.ip6s_delivered--; } + +unlock: + lck_rw_done(ripcbinfo.mtx); + return IPPROTO_DONE; } @@ -270,6 +295,7 @@ rip6_ctlinput( { struct ip6_hdr *ip6; struct mbuf *m; + void *cmdarg = NULL; int off = 0; struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; @@ -294,6 +320,7 @@ rip6_ctlinput( m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; + cmdarg = ip6cp->ip6c_cmdarg; sa6_src = ip6cp->ip6c_src; } else { m = NULL; @@ -302,7 +329,7 @@ rip6_ctlinput( } (void) in6_pcbnotify(&ripcbinfo, sa, 0, (const struct sockaddr *)sa6_src, - 0, cmd, notify); + 0, cmd, cmdarg, notify); } /* @@ -314,7 +341,8 @@ rip6_output( register struct mbuf *m, struct socket *so, struct sockaddr_in6 *dstsock, - struct mbuf *control) + struct mbuf *control, + int israw) { struct in6_addr *dst; struct ip6_hdr *ip6; @@ -322,25 +350,29 @@ rip6_output( u_int plen = m->m_pkthdr.len; int error = 0; struct ip6_pktopts opt, *optp = 0; + struct ip6_moptions *im6o = NULL; struct ifnet *oifp = NULL; int type = 0, code = 0; /* for ICMPv6 output statistics only */ - int priv = 0; -#if PKT_PRIORITY - mbuf_traffic_class_t mtc = MBUF_TC_NONE; -#endif /* PKT_PRIORITY */ + mbuf_traffic_class_t mtc = MBUF_TC_UNSPEC; + struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 }; + int flags = IPV6_OUTARGS; + + if (dstsock && IN6_IS_ADDR_V4MAPPED(&dstsock->sin6_addr)) { + m_freem(m); + return (EINVAL); + } in6p = sotoin6pcb(so); - priv = 0; - if (so->so_uid == 0) - priv = 1; + ip6oa.ip6oa_boundif = (in6p->inp_flags & INP_BOUND_IF) ? + in6p->inp_boundif : IFSCOPE_NONE; + ip6oa.ip6oa_nocell = (in6p->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; + dst = &dstsock->sin6_addr; if (control) { -#if PKT_PRIORITY mtc = mbuf_traffic_class_from_control(control); -#endif /* PKT_PRIORITY */ - if ((error = ip6_setpktoptions(control, &opt, priv, 0)) != 0) + if ((error = ip6_setpktopts(control, &opt, NULL, so->so_proto->pr_protocol)) != 0) goto bad; optp = &opt; } else @@ -374,6 +406,8 @@ rip6_output( */ ip6->ip6_dst = *dst; + im6o = in6p->in6p_moptions; + /* * If the scope of the destination is link-local, embed the interface * index in the address. @@ -382,7 +416,13 @@ rip6_output( */ if (IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_dst)) { struct in6_pktinfo *pi; + struct ifnet *im6o_multicast_ifp = NULL; + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && im6o != NULL) { + IM6O_LOCK(im6o); + im6o_multicast_ifp = im6o->im6o_multicast_ifp; + IM6O_UNLOCK(im6o); + } /* * XXX Boundary check is assumed to be already done in * ip6_setpktoptions(). 
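The link-local handling below (and the scope6.c additions further down) relies on the KAME convention of embedding the scope zone ID in the address itself: for link-local and interface-local multicast destinations, the interface index is written into the second 16-bit word (s6_addr16[1]), which is always zero in the standard wire form. The following user-space sketch is not from the patch; portable byte accesses stand in for the kernel's non-portable s6_addr16[1], and the helper names are illustrative. It shows the embed/recover round trip that sa6_embedscope() and sa6_recoverscope() perform.

#include <stdint.h>
#include <netinet/in.h>

/* Write the zone ID into the kernel-internal ("embedded") form. */
static void
embed_zone(struct in6_addr *a, uint16_t zoneid)
{
	/* e.g. fe80::1 with zone (ifindex) 4 becomes fe80:4::1 internally */
	a->s6_addr[2] = (uint8_t)(zoneid >> 8);
	a->s6_addr[3] = (uint8_t)(zoneid & 0xff);
}

/* Recover the zone ID and restore the standard (wire) form. */
static uint16_t
recover_zone(struct in6_addr *a)
{
	uint16_t zoneid = (uint16_t)((a->s6_addr[2] << 8) | a->s6_addr[3]);

	a->s6_addr[2] = 0;
	a->s6_addr[3] = 0;
	return (zoneid);
}

This is also why the route6.c hunk further down replaces the open-coded "nextaddr->s6_addr16[1] = 0" with in6_clearscope(): any embedded-form address must have that word cleared before it goes back on the wire.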
@@ -391,10 +431,12 @@ rip6_output( if (optp && (pi = optp->ip6po_pktinfo) && pi->ipi6_ifindex) { ip6->ip6_dst.s6_addr16[1] = htons(pi->ipi6_ifindex); oifp = ifindex2ifnet[pi->ipi6_ifindex]; + if (oifp != NULL) + ifnet_reference(oifp); } else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && - in6p->in6p_moptions && - in6p->in6p_moptions->im6o_multicast_ifp) { - oifp = in6p->in6p_moptions->im6o_multicast_ifp; + im6o != NULL && im6o_multicast_ifp != NULL) { + oifp = im6o_multicast_ifp; + ifnet_reference(oifp); ip6->ip6_dst.s6_addr16[1] = htons(oifp->if_index); } else if (dstsock->sin6_scope_id) { /* @@ -421,11 +463,13 @@ rip6_output( struct in6_addr *in6a; struct in6_addr storage; u_short index = 0; - if ((in6a = in6_selectsrc(dstsock, optp, - in6p->in6p_moptions, - &in6p->in6p_route, - &in6p->in6p_laddr, - &storage, &error)) == 0) { + + if (israw != 0 && optp && optp->ip6po_pktinfo && !IN6_IS_ADDR_UNSPECIFIED(&optp->ip6po_pktinfo->ipi6_addr)) { + in6a = &optp->ip6po_pktinfo->ipi6_addr; + flags |= IPV6_FLAG_NOSRCIFSEL; + } else if ((in6a = in6_selectsrc(dstsock, optp, in6p, + &in6p->in6p_route, NULL, &storage, ip6oa.ip6oa_boundif, + &error)) == 0) { if (error == 0) error = EADDRNOTAVAIL; goto bad; @@ -436,11 +480,15 @@ rip6_output( if (in6p->in6p_route.ro_rt->rt_ifp != NULL) index = in6p->in6p_route.ro_rt->rt_ifp->if_index; RT_UNLOCK(in6p->in6p_route.ro_rt); + if (oifp != NULL) + ifnet_release(oifp); ifnet_head_lock_shared(); if (index == 0 || if_index < index) { panic("bad if_index on interface from route"); } oifp = ifindex2ifnet[index]; + if (oifp != NULL) + ifnet_reference(oifp); ifnet_head_done(); } } @@ -463,7 +511,7 @@ rip6_output( off = offsetof(struct icmp6_hdr, icmp6_cksum); else off = in6p->in6p_cksum; - if (plen < off + 1) { + if (plen < (unsigned int)(off + 1)) { error = EINVAL; goto bad; } @@ -494,26 +542,48 @@ rip6_output( in6p->in6p_route.ro_rt = NULL; } -#if PKT_PRIORITY - set_traffic_class(m, so, mtc); -#endif /* PKT_PRIORITY */ + if (oifp != NULL) { + ifnet_release(oifp); + oifp = NULL; + } - error = ip6_output(m, optp, &in6p->in6p_route, 0, - in6p->in6p_moptions, &oifp, 0); + set_packet_tclass(m, so, mtc, 1); + + if (im6o != NULL) + IM6O_ADDREF(im6o); -#if IFNET_ROUTE_REFCNT - /* - * Always discard the cached route for unconnected socket - * or if it is a multicast route. - */ - if (in6p->in6p_route.ro_rt != NULL && - ((in6p->in6p_route.ro_rt->rt_flags & RTF_MULTICAST) || - in6p->in6p_socket == NULL || - in6p->in6p_socket->so_state != SS_ISCONNECTED)) { - rtfree(in6p->in6p_route.ro_rt); - in6p->in6p_route.ro_rt = NULL; + error = ip6_output(m, optp, &in6p->in6p_route, flags, im6o, + &oifp, &ip6oa); + + if (im6o != NULL) + IM6O_REMREF(im6o); + + if (in6p->in6p_route.ro_rt != NULL) { + struct rtentry *rt = in6p->in6p_route.ro_rt; + unsigned int outif; + + if ((rt->rt_flags & RTF_MULTICAST) || + in6p->in6p_socket == NULL || + !(in6p->in6p_socket->so_state & SS_ISCONNECTED)) { + rt = NULL; /* unusable */ + } + /* + * Always discard the cached route for unconnected + * socket or if it is a multicast route. + */ + if (rt == NULL) { + rtfree(in6p->in6p_route.ro_rt); + in6p->in6p_route.ro_rt = NULL; + } + /* + * If this is a connected socket and the destination + * route is not multicast, update outif with that of + * the route interface index used by IP. 
+ */ + if (rt != NULL && + (outif = rt->rt_ifp->if_index) != in6p->in6p_last_outif) + in6p->in6p_last_outif = outif; } -#endif /* IFNET_ROUTE_REFCNT */ if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (oifp) @@ -535,9 +605,11 @@ rip6_output( } if (control) { if (optp == &opt) - ip6_clearpktopts(optp, 0, -1); + ip6_clearpktopts(optp, -1); m_freem(control); } + if (oifp != NULL) + ifnet_release(oifp); return(error); } @@ -594,8 +666,13 @@ rip6_ctloutput( case MRT6_PIM: #if MROUTING error = ip6_mrouter_get(so, sopt); +#else + error = ENOPROTOOPT; +#endif /* MROUTING */ + break; + case IPV6_CHECKSUM: + error = ip6_raw_ctloutput(so, sopt); break; -#endif default: error = ip6_ctloutput(so, sopt); break; @@ -627,8 +704,13 @@ rip6_ctloutput( case MRT6_PIM: #if MROUTING error = ip6_mrouter_set(so, sopt); - break; +#else + error = ENOPROTOOPT; #endif + break; + case IPV6_CHECKSUM: + error = ip6_raw_ctloutput(so, sopt); + break; default: error = ip6_ctloutput(so, sopt); break; @@ -714,7 +796,8 @@ rip6_bind(struct socket *so, struct sockaddr *nam, __unused struct proc *p) { struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam; - struct ifaddr *ia = NULL; + struct ifaddr *ifa = NULL; + unsigned int outif = 0; if (nam->sa_len != sizeof(*addr)) return EINVAL; @@ -727,18 +810,23 @@ rip6_bind(struct socket *so, struct sockaddr *nam, __unused struct proc *p) } #endif if (!IN6_IS_ADDR_UNSPECIFIED(&addr->sin6_addr) && - (ia = ifa_ifwithaddr((struct sockaddr *)addr)) == 0) + (ifa = ifa_ifwithaddr((struct sockaddr *)addr)) == 0) return EADDRNOTAVAIL; - if (ia && - ((struct in6_ifaddr *)ia)->ia6_flags & - (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY| - IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) { - if (ia) ifafree(ia); - return(EADDRNOTAVAIL); + if (ifa != NULL) { + IFA_LOCK(ifa); + if (((struct in6_ifaddr *)ifa)->ia6_flags & + (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY| + IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) { + IFA_UNLOCK(ifa); + IFA_REMREF(ifa); + return(EADDRNOTAVAIL); + } + outif = ifa->ifa_ifp->if_index; + IFA_UNLOCK(ifa); + IFA_REMREF(ifa); } - if (ia != NULL) - ifafree(ia); inp->in6p_laddr = addr->sin6_addr; + inp->in6p_last_outif = outif; return 0; } @@ -753,6 +841,7 @@ rip6_connect(struct socket *so, struct sockaddr *nam, __unused struct proc *p) #if ENABLE_DEFAULT_SCOPE struct sockaddr_in6 tmp; #endif + unsigned int outif = 0, ifscope; if (nam->sa_len != sizeof(*addr)) return EINVAL; @@ -768,14 +857,20 @@ rip6_connect(struct socket *so, struct sockaddr *nam, __unused struct proc *p) addr->sin6_scope_id = scope6_addr2default(&addr->sin6_addr); } #endif + + ifscope = (inp->inp_flags & INP_BOUND_IF) ? + inp->inp_boundif : IFSCOPE_NONE; + /* Source address selection. XXX: need pcblookup? */ - in6a = in6_selectsrc(addr, inp->in6p_outputopts, - inp->in6p_moptions, &inp->in6p_route, - &inp->in6p_laddr, &storage, &error); + in6a = in6_selectsrc(addr, inp->in6p_outputopts, inp, &inp->in6p_route, + NULL, &storage, ifscope, &error); if (in6a == NULL) return (error ? 
error : EADDRNOTAVAIL); inp->in6p_laddr = *in6a; inp->in6p_faddr = addr->sin6_addr; + if (inp->in6p_route.ro_rt != NULL) + outif = inp->in6p_route.ro_rt->rt_ifp->if_index; + inp->in6p_last_outif = outif; soisconnected(so); return 0; } @@ -788,12 +883,13 @@ rip6_shutdown(struct socket *so) } static int -rip6_send(struct socket *so, __unused int flags, struct mbuf *m, struct sockaddr *nam, - struct mbuf *control, __unused struct proc *p) +rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, + struct mbuf *control, struct proc *p) { +#pragma unused(flags, p) struct inpcb *inp = sotoinpcb(so); struct sockaddr_in6 tmp; - struct sockaddr_in6 *dst; + struct sockaddr_in6 *dst = (struct sockaddr_in6 *)nam; /* always copy sockaddr to avoid overwrites */ if (so->so_state & SS_ISCONNECTED) { @@ -821,7 +917,7 @@ rip6_send(struct socket *so, __unused int flags, struct mbuf *m, struct sockaddr dst->sin6_scope_id = scope6_addr2default(&dst->sin6_addr); } #endif - return rip6_output(m, so, dst, control); + return rip6_output(m, so, dst, control, 1); } struct pr_usrreqs rip6_usrreqs = { diff --git a/bsd/netinet6/route6.c b/bsd/netinet6/route6.c index 36617f2d0..a0dc6c6a6 100644 --- a/bsd/netinet6/route6.c +++ b/bsd/netinet6/route6.c @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + /* $FreeBSD: src/sys/netinet6/route6.c,v 1.1.2.3 2001/07/03 11:01:55 ume Exp $ */ /* $KAME: route6.c,v 1.24 2001/03/14 03:07:05 itojun Exp $ */ @@ -52,8 +80,9 @@ static int ip6_rthdr0(struct mbuf *, struct ip6_hdr *, #endif /* IP6_RTHDR0_ALLOWED */ int -route6_input(struct mbuf **mp, int *offp) +route6_input(struct mbuf **mp, int *offp, int proto) { +#pragma unused(proto) struct ip6_hdr *ip6; struct mbuf *m = *mp; struct ip6_rthdr *rh; @@ -143,7 +172,7 @@ ip6_rthdr0(m, ip6, rh0) struct ip6_rthdr0 *rh0; { int addrs, index; - struct in6_addr *nextaddr, tmpaddr; + struct in6_addr *nextaddr, tmpaddr, ia6 = NULL; struct route_in6 ip6forward_rt; if (rh0->ip6r0_segleft == 0) @@ -156,20 +185,20 @@ ip6_rthdr0(m, ip6, rh0) ) { /* * Type 0 routing header can't contain more than 23 addresses. 
- * RFC 2462: this limitation was removed since stict/loose + * RFC 2462: this limitation was removed since strict/loose * bitmap field was deleted. */ ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&rh0->ip6r0_len - (caddr_t)ip6); - return(-1); + return (-1); } if ((addrs = rh0->ip6r0_len / 2) < rh0->ip6r0_segleft) { ip6stat.ip6s_badoptions++; icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&rh0->ip6r0_segleft - (caddr_t)ip6); - return(-1); + return (-1); } index = addrs - rh0->ip6r0_segleft; @@ -188,7 +217,7 @@ ip6_rthdr0(m, ip6, rh0) IN6_IS_ADDR_V4COMPAT(nextaddr)) { ip6stat.ip6s_badoptions++; m_freem(m); - return(-1); + return (-1); } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst) || @@ -196,16 +225,31 @@ ip6_rthdr0(m, ip6, rh0) IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) { ip6stat.ip6s_badoptions++; m_freem(m); - return(-1); + return (-1); + } + + /* + * Determine the scope zone of the next hop, based on the interface + * of the current hop. [RFC4007, Section 9] + * Then disambiguate the scope zone for the next hop (if necessary). + */ + if ((ia6 = ip6_getdstifaddr(m)) == NULL) + goto bad; + if (in6_setscope(nextaddr, ia6->ia_ifp, NULL) != 0) { + ip6stat.ip6s_badscope++; + IFA_REMREF(&ia6->ia_ifa); + ia6 = NULL; + goto bad; } + IFA_REMREF(&ia6->ia_ifa); + ia6 = NULL; /* * Swap the IPv6 destination address and nextaddr. Forward the packet. */ tmpaddr = *nextaddr; *nextaddr = ip6->ip6_dst; - if (IN6_IS_ADDR_LINKLOCAL(nextaddr)) - nextaddr->s6_addr16[1] = 0; + in6_clearscope(nextaddr); /* XXX */ ip6->ip6_dst = tmpaddr; if (IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_dst)) ip6->ip6_dst.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index); diff --git a/bsd/netinet6/scope6.c b/bsd/netinet6/scope6.c index 70e90dfa9..2d4eedf76 100644 --- a/bsd/netinet6/scope6.c +++ b/bsd/netinet6/scope6.c @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2009-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + /* $FreeBSD: src/sys/netinet6/scope6.c,v 1.3 2002/03/25 10:12:51 ume Exp $ */ /* $KAME: scope6.c,v 1.10 2000/07/24 13:29:31 itojun Exp $ */ @@ -36,6 +64,8 @@ #include <sys/socket.h> #include <sys/systm.h> #include <sys/queue.h> +#include <sys/syslog.h> +#include <sys/mcache.h> #include <net/route.h> #include <net/if.h> @@ -47,13 +77,12 @@ extern lck_mtx_t *scope6_mutex; -struct scope6_id { - /* - * 16 is correspondent to 4bit multicast scope field. - * i.e. from node-local to global with some reserved/unassigned types. - */ - u_int32_t s6id_list[16]; -}; +#ifdef ENABLE_DEFAULT_SCOPE +int ip6_use_defzone = 1; +#else +int ip6_use_defzone = 0; +#endif + static size_t if_scope_indexlim = 8; struct scope6_id *scope6_ids = NULL; @@ -103,6 +132,7 @@ scope6_ifattach( * XXX: IPV6_ADDR_SCOPE_xxx macros are not standard. * Should we rather hardcode here? */ + SID.s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = ifp->if_index; SID.s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = ifp->if_index; #if MULTI_SCOPE /* by default, we don't care about scope boundary for these scopes. */ @@ -133,14 +163,14 @@ scope6_set( /* * TODO(XXX): after setting, we should reflect the changes to - * interface addresses, routing table entries, PCB entries... + * interface addresses, routing table entries, PCB entries... */ lck_mtx_lock(scope6_mutex); for (i = 0; i < 16; i++) { if (idlist[i] && idlist[i] != scope6_ids[ifp->if_index].s6id_list[i]) { - if (i == IPV6_ADDR_SCOPE_LINKLOCAL && + if (i == IPV6_ADDR_SCOPE_INTFACELOCAL && idlist[i] > if_index) { /* * XXX: theoretically, there should be no @@ -216,8 +246,8 @@ struct in6_addr *addr; * return scope doesn't work. */ switch (scope) { - case IPV6_ADDR_SCOPE_NODELOCAL: - return IPV6_ADDR_SCOPE_NODELOCAL; + case IPV6_ADDR_SCOPE_INTFACELOCAL: + return IPV6_ADDR_SCOPE_INTFACELOCAL; break; case IPV6_ADDR_SCOPE_LINKLOCAL: return IPV6_ADDR_SCOPE_LINKLOCAL; @@ -231,11 +261,15 @@ struct in6_addr *addr; } } + /* + * Regard loopback and unspecified addresses as global, since + * they have no ambiguity. + */ if (bcmp(&in6addr_loopback, addr, sizeof(*addr) - 1) == 0) { if (addr->s6_addr8[15] == 1) /* loopback */ - return IPV6_ADDR_SCOPE_NODELOCAL; - if (addr->s6_addr8[15] == 0) /* unspecified */ return IPV6_ADDR_SCOPE_LINKLOCAL; + if (addr->s6_addr8[15] == 0) /* unspecified */ + return IPV6_ADDR_SCOPE_GLOBAL; /* XXX: correct? */ } return IPV6_ADDR_SCOPE_GLOBAL; @@ -282,6 +316,106 @@ in6_addr2scopeid( return retid; } +/* + * Validate the specified scope zone ID in the sin6_scope_id field. If the ID + * is unspecified (=0), needs to be specified, and the default zone ID can be + * used, the default value will be used. + * This routine then generates the kernel-internal form: if the address scope + * of is interface-local or link-local, embed the interface index in the + * address. + */ +int +sa6_embedscope(struct sockaddr_in6 *sin6, int defaultok) +{ + struct ifnet *ifp; + u_int32_t zoneid; + + if ((zoneid = sin6->sin6_scope_id) == 0 && defaultok) + zoneid = scope6_addr2default(&sin6->sin6_addr); + + if (zoneid != 0 && + (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) || + IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr))) { + /* + * At this moment, we only check interface-local and + * link-local scope IDs, and use interface indices as the + * zone IDs assuming a one-to-one mapping between interfaces + * and links. 
+ */ + if (if_index < zoneid) + return (ENXIO); + ifnet_head_lock_shared(); + ifp = ifindex2ifnet[zoneid]; + if (ifp == NULL) {/* XXX: this can happen for some OS */ + ifnet_head_done(); + return (ENXIO); + } + ifnet_head_done(); + /* XXX assignment to 16bit from 32bit variable */ + sin6->sin6_addr.s6_addr16[1] = htons(zoneid & 0xffff); + + sin6->sin6_scope_id = 0; + } + + return 0; +} + +void +rtkey_to_sa6(struct rtentry *rt, struct sockaddr_in6 *sin6) +{ + VERIFY(rt_key(rt)->sa_family == AF_INET6); + + *sin6 = *((struct sockaddr_in6 *)rt_key(rt)); + sin6->sin6_scope_id = 0; +} + +void +rtgw_to_sa6(struct rtentry *rt, struct sockaddr_in6 *sin6) +{ + VERIFY(rt->rt_flags & RTF_GATEWAY); + + *sin6 = *((struct sockaddr_in6 *)rt->rt_gateway); + sin6->sin6_scope_id = 0; +} + +/* + * generate standard sockaddr_in6 from embedded form. + */ +int +sa6_recoverscope(struct sockaddr_in6 *sin6) +{ + u_int32_t zoneid; + + if (sin6->sin6_scope_id != 0) { + log(LOG_NOTICE, + "sa6_recoverscope: assumption failure (non 0 ID): %s%%%d\n", + ip6_sprintf(&sin6->sin6_addr), sin6->sin6_scope_id); + /* XXX: proceed anyway... */ + } + if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) || + IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr)) { + /* + * KAME assumption: link id == interface id + */ + zoneid = ntohs(sin6->sin6_addr.s6_addr16[1]); + if (zoneid) { + /* sanity check */ + if (if_index < zoneid) + return (ENXIO); + ifnet_head_lock_shared(); + if (ifindex2ifnet[zoneid] == NULL) { + ifnet_head_done(); + return (ENXIO); + } + ifnet_head_done(); + sin6->sin6_addr.s6_addr16[1] = 0; + sin6->sin6_scope_id = zoneid; + } + } + + return 0; +} + void scope6_setdefault( struct ifnet *ifp) /* note that this might be NULL */ @@ -294,11 +428,14 @@ scope6_setdefault( */ lck_mtx_lock(scope6_mutex); if (ifp) { + scope6_ids[0].s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = + ifp->if_index; scope6_ids[0].s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = ifp->if_index; - } - else + } else { + scope6_ids[0].s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = 0; scope6_ids[0].s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = 0; + } lck_mtx_unlock(scope6_mutex); } @@ -328,3 +465,93 @@ scope6_addr2default( lck_mtx_unlock(scope6_mutex); return (id); } + +/* + * Determine the appropriate scope zone ID for in6 and ifp. If ret_id is + * non NULL, it is set to the zone ID. If the zone ID needs to be embedded + * in the in6_addr structure, in6 will be modified. + * + * ret_id - unnecessary? + */ +int +in6_setscope(struct in6_addr *in6, struct ifnet *ifp, u_int32_t *ret_id) +{ + int scope; + u_int32_t zoneid = 0; + int index = ifp->if_index; + +#ifdef DIAGNOSTIC + if (scope6_ids == NULL) { /* should not happen */ + panic("in6_setscope: scope array is NULL"); + /* NOTREACHED */ + } +#endif + + /* + * special case: the loopback address can only belong to a loopback + * interface. 
+ */ + if (IN6_IS_ADDR_LOOPBACK(in6)) { + if (!(ifp->if_flags & IFF_LOOPBACK)) { + return (EINVAL); + } else { + if (ret_id != NULL) + *ret_id = 0; /* there's no ambiguity */ + return (0); + } + } + + scope = in6_addrscope(in6); + +#define SID scope6_ids[index] + lck_mtx_lock(scope6_mutex); + switch (scope) { + case IPV6_ADDR_SCOPE_INTFACELOCAL: /* should be interface index */ + zoneid = SID.s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL]; + break; + + case IPV6_ADDR_SCOPE_LINKLOCAL: + zoneid = SID.s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL]; + break; + + case IPV6_ADDR_SCOPE_SITELOCAL: + zoneid = SID.s6id_list[IPV6_ADDR_SCOPE_SITELOCAL]; + break; + + case IPV6_ADDR_SCOPE_ORGLOCAL: + zoneid = SID.s6id_list[IPV6_ADDR_SCOPE_ORGLOCAL]; + break; +#undef SID + default: + zoneid = 0; /* XXX: treat as global. */ + break; + } + lck_mtx_unlock(scope6_mutex); + + if (ret_id != NULL) + *ret_id = zoneid; + + if (IN6_IS_SCOPE_LINKLOCAL(in6) || IN6_IS_ADDR_MC_INTFACELOCAL(in6)) + in6->s6_addr16[1] = htons(zoneid & 0xffff); /* XXX */ + + return (0); +} + +/* + * Just clear the embedded scope identifier. Return 0 if the original address + * is intact; return non 0 if the address is modified. + */ +int +in6_clearscope(struct in6_addr *in6) +{ + int modified = 0; + + if (IN6_IS_SCOPE_LINKLOCAL(in6) || IN6_IS_ADDR_MC_INTFACELOCAL(in6)) { + if (in6->s6_addr16[1] != 0) + modified = 1; + in6->s6_addr16[1] = 0; + } + + return (modified); +} + diff --git a/bsd/netinet6/scope6_var.h b/bsd/netinet6/scope6_var.h index 2b3a9954a..d028aefb8 100644 --- a/bsd/netinet6/scope6_var.h +++ b/bsd/netinet6/scope6_var.h @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2009-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + /* $FreeBSD: src/sys/netinet6/scope6_var.h,v 1.1.2.1 2000/07/15 07:14:38 kris Exp $ */ /* $KAME: scope6_var.h,v 1.4 2000/05/18 15:03:27 jinmei Exp $ */ @@ -35,13 +63,31 @@ #include <sys/appleapiopts.h> #ifdef KERNEL_PRIVATE + +struct scope6_id { + /* + * 16 is correspondent to 4bit multicast scope field. + * i.e. from node-local to global with some reserved/unassigned types. 
+ */ + u_int32_t s6id_list[16]; +}; + +void scope6_init (void); int scope6_ifattach(struct ifnet *); +void scope6_ifdetach (struct scope6_id *); int scope6_set(struct ifnet *, u_int32_t *); int scope6_get(struct ifnet *, u_int32_t *); void scope6_setdefault(struct ifnet *); int scope6_get_default(u_int32_t *); u_int32_t scope6_in6_addrscope(struct in6_addr *); u_int32_t scope6_addr2default(struct in6_addr *); +int sa6_embedscope (struct sockaddr_in6 *, int); +int sa6_recoverscope (struct sockaddr_in6 *); +int in6_setscope (struct in6_addr *, struct ifnet *, u_int32_t *); +int in6_clearscope (struct in6_addr *); +extern void rtkey_to_sa6(struct rtentry *, struct sockaddr_in6 *); +extern void rtgw_to_sa6(struct rtentry *, struct sockaddr_in6 *); + #endif /* KERNEL_PRIVATE */ #endif /* _NETINET6_SCOPE6_VAR_H_ */ diff --git a/bsd/netinet6/tcp6_var.h b/bsd/netinet6/tcp6_var.h index 9d7c44968..5fded19e6 100644 --- a/bsd/netinet6/tcp6_var.h +++ b/bsd/netinet6/tcp6_var.h @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. @@ -79,8 +107,8 @@ extern int tcp_v6mssdflt; /* XXX */ struct ip6_hdr; void tcp6_ctlinput(int, struct sockaddr *, void *); void tcp6_init(void); -int tcp6_input(struct mbuf **, int *); -struct rtentry *tcp_rtlookup6(struct inpcb *); +int tcp6_input(struct mbuf **, int *, int); +struct rtentry *tcp_rtlookup6(struct inpcb *, unsigned int); extern struct pr_usrreqs tcp6_usrreqs; diff --git a/bsd/netinet6/udp6_output.c b/bsd/netinet6/udp6_output.c index e3d3198f4..0fb9a6993 100644 --- a/bsd/netinet6/udp6_output.c +++ b/bsd/netinet6/udp6_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -107,9 +107,12 @@ #include <sys/proc.h> #include <sys/syslog.h> +#include <machine/endian.h> + #include <net/if.h> #include <net/route.h> #include <net/if_types.h> +#include <net/ntstat.h> #include <netinet/in.h> #include <netinet/in_var.h> @@ -177,27 +180,28 @@ udp6_output(in6p, m, addr6, control, p) struct in6_addr *laddr, *faddr; u_short fport; int error = 0; - struct ip6_pktopts opt, *stickyopt = in6p->in6p_outputopts; - int priv; + struct ip6_pktopts opt, *optp = NULL; + struct ip6_moptions *im6o; int af = AF_INET6, hlen = sizeof(struct ip6_hdr); int flags; struct sockaddr_in6 tmp; struct in6_addr storage; -#if PKT_PRIORITY - mbuf_traffic_class_t mtc = MBUF_TC_NONE; -#endif /* PKT_PRIORITY */ + mbuf_traffic_class_t mtc = MBUF_TC_UNSPEC; + struct ip6_out_args ip6oa = { IFSCOPE_NONE, 0 }; + + if (in6p->inp_flags & INP_BOUND_IF) + ip6oa.ip6oa_boundif = in6p->inp_boundif; - priv = (proc_suser(p) == 0); + ip6oa.ip6oa_nocell = (in6p->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0; if (control) { -#if PKT_PRIORITY mtc = mbuf_traffic_class_from_control(control); -#endif /* PKT_PRIORITY */ - if ((error = ip6_setpktoptions(control, &opt, priv, 0)) != 0) + if ((error = ip6_setpktopts(control, &opt, NULL, IPPROTO_UDP)) != 0) goto release; - in6p->in6p_outputopts = &opt; - } + optp = &opt; + } else + optp = in6p->in6p_outputopts; if (addr6) { /* @@ -246,16 +250,16 @@ udp6_output(in6p, m, addr6, control, p) } /* KAME hack: embed scopeid */ - if (in6_embedscope(&sin6->sin6_addr, sin6, in6p, NULL) != 0) { + if (in6_embedscope(&sin6->sin6_addr, sin6, in6p, NULL, + optp) != 0) { error = EINVAL; goto release; } if (!IN6_IS_ADDR_V4MAPPED(faddr)) { - laddr = in6_selectsrc(sin6, in6p->in6p_outputopts, - in6p->in6p_moptions, - &in6p->in6p_route, - &in6p->in6p_laddr, &storage, &error); + laddr = in6_selectsrc(sin6, optp, + in6p, &in6p->in6p_route, NULL, &storage, + ip6oa.ip6oa_boundif, &error); } else laddr = &in6p->in6p_laddr; /* XXX */ if (laddr == NULL) { @@ -333,12 +337,12 @@ udp6_output(in6p, m, addr6, control, p) ip6->ip6_src = *laddr; ip6->ip6_dst = *faddr; - if ((udp6->uh_sum = in6_cksum(m, IPPROTO_UDP, - sizeof(struct ip6_hdr), plen)) == 0) { - udp6->uh_sum = 0xffff; - } + udp6->uh_sum = in6_cksum_phdr(laddr, faddr, + htonl(plen), htonl(IPPROTO_UDP)); + m->m_pkthdr.csum_flags = CSUM_UDPIPV6; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); - flags = 0; + flags = IPV6_OUTARGS; udp6stat.udp6s_opackets++; #ifdef IPSEC @@ -348,26 +352,50 @@ udp6_output(in6p, m, addr6, control, p) } #endif /*IPSEC*/ m->m_pkthdr.socket_id = get_socket_id(in6p->in6p_socket); + + set_packet_tclass(m, in6p->in6p_socket, mtc, 1); + + im6o = in6p->in6p_moptions; + if (im6o != NULL) + IM6O_ADDREF(im6o); + + error = ip6_output(m, optp, &in6p->in6p_route, + flags, im6o, NULL, &ip6oa); + + if (im6o != NULL) + IM6O_REMREF(im6o); -#if PKT_PRIORITY - set_traffic_class(m, in6p->in6p_socket, mtc); -#endif /* PKT_PRIORITY */ - error = ip6_output(m, in6p->in6p_outputopts, &in6p->in6p_route, - flags, in6p->in6p_moptions, NULL, 0); + if (error == 0 && nstat_collect) { + locked_add_64(&in6p->inp_stat->txpackets, 1); + locked_add_64(&in6p->inp_stat->txbytes, ulen); + } -#if IFNET_ROUTE_REFCNT - /* - * Always discard the cached route for unconnected socket - * or if it is a multicast route. 
- */ - if (in6p->in6p_route.ro_rt != NULL && - ((in6p->in6p_route.ro_rt->rt_flags & RTF_MULTICAST) || - in6p->in6p_socket == NULL || - in6p->in6p_socket->so_state != SS_ISCONNECTED)) { - rtfree(in6p->in6p_route.ro_rt); - in6p->in6p_route.ro_rt = NULL; + if (in6p->in6p_route.ro_rt != NULL) { + struct rtentry *rt = in6p->in6p_route.ro_rt; + unsigned int outif; + + if ((rt->rt_flags & RTF_MULTICAST) || + in6p->in6p_socket == NULL || + !(in6p->in6p_socket->so_state & SS_ISCONNECTED)) { + rt = NULL; /* unusable */ + } + /* + * Always discard the cached route for unconnected + * socket or if it is a multicast route. + */ + if (rt == NULL) { + rtfree(in6p->in6p_route.ro_rt); + in6p->in6p_route.ro_rt = NULL; + } + /* + * If this is a connected socket and the destination + * route is not multicast, update outif with that of + * the route interface index used by IP. + */ + if (rt != NULL && (outif = rt->rt_ifp->if_index) != + in6p->in6p_last_outif) + in6p->in6p_last_outif = outif; } -#endif /* IFNET_ROUTE_REFCNT */ break; case AF_INET: error = EAFNOSUPPORT; @@ -380,8 +408,8 @@ release: releaseopt: if (control) { - ip6_clearpktopts(in6p->in6p_outputopts, 0, -1); - in6p->in6p_outputopts = stickyopt; + if (optp == &opt) + ip6_clearpktopts(optp, -1); m_freem(control); } return(error); diff --git a/bsd/netinet6/udp6_usrreq.c b/bsd/netinet6/udp6_usrreq.c index fed294b90..c88c0d169 100644 --- a/bsd/netinet6/udp6_usrreq.c +++ b/bsd/netinet6/udp6_usrreq.c @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + /* $FreeBSD: src/sys/netinet6/udp6_usrreq.c,v 1.6.2.6 2001/07/29 19:32:40 ume Exp $ */ /* $KAME: udp6_usrreq.c,v 1.27 2001/05/21 05:45:10 jinmei Exp $ */ @@ -83,6 +111,7 @@ #include <net/if.h> #include <net/route.h> #include <net/if_types.h> +#include <net/ntstat.h> #include <netinet/in.h> #include <netinet/in_systm.h> @@ -104,7 +133,6 @@ #include <netinet6/ipsec6.h> extern int ipsec_bypass; #endif /*IPSEC*/ -extern lck_mtx_t *nd6_mutex; /* * UDP protocol inplementation. 
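The rip6_output() and udp6_output() hunks above now share one route-cache policy after ip6_output() returns: the cached route is discarded if it is multicast or the socket is unconnected, and otherwise the interface index actually used is recorded in in6p_last_outif. A condensed sketch follows (hypothetical helper name, locking elided, XNU netinet headers assumed).

/*
 * Editor's sketch, not part of the patch: the post-output route-cache
 * step that both rip6_output() and udp6_output() open-code above.
 */
static void
update_cached_route(struct inpcb *in6p)
{
	struct rtentry *rt = in6p->in6p_route.ro_rt;
	unsigned int outif;

	if (rt == NULL)
		return;

	if ((rt->rt_flags & RTF_MULTICAST) ||
	    in6p->in6p_socket == NULL ||
	    !(in6p->in6p_socket->so_state & SS_ISCONNECTED)) {
		/* Never cache multicast or unconnected-socket routes. */
		rtfree(rt);
		in6p->in6p_route.ro_rt = NULL;
		return;
	}

	/* Connected unicast: remember the interface IP actually used. */
	if ((outif = rt->rt_ifp->if_index) != in6p->in6p_last_outif)
		in6p->in6p_last_outif = outif;
}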
@@ -112,7 +140,6 @@ extern lck_mtx_t *nd6_mutex; */ extern struct protosw inetsw[]; -static int in6_mcmatch(struct inpcb *, struct in6_addr *, struct ifnet *); static int udp6_detach(struct socket *so); static void udp6_append(struct inpcb *, struct ip6_hdr *, struct sockaddr_in6 *, struct mbuf *, int); @@ -131,53 +158,37 @@ extern int fw_verbose; #define log_in_vain_log( a ) { log a; } #endif -static int -in6_mcmatch( - struct inpcb *in6p, - register struct in6_addr *ia6, - struct ifnet *ifp) -{ - struct ip6_moptions *im6o = in6p->in6p_moptions; - struct in6_multi_mship *imm; - - if (im6o == NULL) - return 0; - - lck_mtx_lock(nd6_mutex); - for (imm = im6o->im6o_memberships.lh_first; imm != NULL; - imm = imm->i6mm_chain.le_next) { - if ((ifp == NULL || - imm->i6mm_maddr->in6m_ifp == ifp) && - IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr, - ia6)) { - lck_mtx_unlock(nd6_mutex); - return 1; - } - } - lck_mtx_unlock(nd6_mutex); - return 0; -} - /* * subroutine of udp6_input(), mainly for source code readability. */ static void -udp6_append(struct inpcb *last, struct ip6_hdr *ip6, +udp6_append(struct inpcb *last, __unused struct ip6_hdr *ip6, struct sockaddr_in6 *udp_in6, struct mbuf *n, int off) { struct mbuf *opts = NULL; - + int ret = 0; #if CONFIG_MACF_NET if (mac_inpcb_check_deliver(last, n, AF_INET6, SOCK_DGRAM) != 0) { m_freem(n); return; } #endif - if (last->in6p_flags & IN6P_CONTROLOPTS || - last->in6p_socket->so_options & SO_TIMESTAMP) - ip6_savecontrol(last, &opts, ip6, n); - + if ((last->in6p_flags & IN6P_CONTROLOPTS) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP) != 0 || + (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + ret = ip6_savecontrol(last, n, &opts); + if (ret != 0) { + m_freem(n); + m_freem(opts); + return; + } + } m_adj(n, off); + if (nstat_collect) { + locked_add_64(&last->inp_stat->rxpackets, 1); + locked_add_64(&last->inp_stat->rxbytes, n->m_pkthdr.len); + } + so_recv_data_stat(last->in6p_socket, n, 0); if (sbappendaddr(&last->in6p_socket->so_rcv, (struct sockaddr *)udp_in6, n, opts, NULL) == 0) udpstat.udps_fullsock++; @@ -188,20 +199,25 @@ udp6_append(struct inpcb *last, struct ip6_hdr *ip6, int udp6_input( struct mbuf **mp, - int *offp) + int *offp, + int proto) { +#pragma unused(proto) struct mbuf *m = *mp; + struct ifnet *ifp; register struct ip6_hdr *ip6; register struct udphdr *uh; register struct inpcb *in6p; struct mbuf *opts = NULL; int off = *offp; - int plen, ulen; + int plen, ulen, ret = 0; struct sockaddr_in6 udp_in6; struct inpcbinfo *pcbinfo = &udbinfo; + struct sockaddr_in6 fromsa; IP6_EXTHDR_CHECK(m, off, sizeof(struct udphdr), return IPPROTO_DONE); + ifp = m->m_pkthdr.rcvif; ip6 = mtod(m, struct ip6_hdr *); #if defined(NFAITH) && 0 < NFAITH @@ -223,20 +239,40 @@ udp6_input( goto bad; } + /* destination port of 0 is illegal, based on RFC768. */ + if (uh->uh_dport == 0) + goto bad; + /* * Checksum extended UDP header and data. */ + if (uh->uh_sum) { + if ((apple_hwcksum_rx != 0) && (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) { + uh->uh_sum = m->m_pkthdr.csum_data; + uh->uh_sum ^= 0xffff; + } + else { + if (in6_cksum(m, IPPROTO_UDP, off, ulen) != 0) { + udpstat.udps_badsum++; + goto bad; + } + } + } #ifndef __APPLE__ - if (uh->uh_sum == 0) + else udpstat.udps_nosum++; #endif - else if (in6_cksum(m, IPPROTO_UDP, off, ulen) != 0) { - udpstat.udps_badsum++; - goto bad; - } + + /* + * Construct sockaddr format source address. 
+ */ + init_sin6(&fromsa, m); + fromsa.sin6_port = uh->uh_sport; + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { int reuse_sock = 0, mcast_delivered = 0; + struct ip6_moptions *imo; struct mbuf *n = NULL; /* @@ -299,11 +335,27 @@ udp6_input( udp_unlock(in6p->in6p_socket, 1, 0); continue; } - if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) { - if (!IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, - &ip6->ip6_dst) && - !in6_mcmatch(in6p, &ip6->ip6_dst, - m->m_pkthdr.rcvif)) { + + /* + * Handle socket delivery policy for any-source + * and source-specific multicast. [RFC3678] + */ + imo = in6p->in6p_moptions; + if (imo && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { + struct sockaddr_in6 mcaddr; + int blocked; + + IM6O_LOCK(imo); + bzero(&mcaddr, sizeof(struct sockaddr_in6)); + mcaddr.sin6_len = sizeof(struct sockaddr_in6); + mcaddr.sin6_family = AF_INET6; + mcaddr.sin6_addr = ip6->ip6_dst; + + blocked = im6o_mc_filter(imo, ifp, + (struct sockaddr *)&mcaddr, + (struct sockaddr *)&fromsa); + IM6O_UNLOCK(imo); + if (blocked != MCAST_PASS) { udp_unlock(in6p->in6p_socket, 1, 0); continue; } @@ -444,10 +496,21 @@ udp6_input( init_sin6(&udp_in6, m); /* general init */ udp_in6.sin6_port = uh->uh_sport; - if (in6p->in6p_flags & IN6P_CONTROLOPTS - || in6p->in6p_socket->so_options & SO_TIMESTAMP) - ip6_savecontrol(in6p, &opts, ip6, m); + if ((in6p->in6p_flags & IN6P_CONTROLOPTS) != 0 || + (in6p->in6p_socket->so_options & SO_TIMESTAMP) != 0 || + (in6p->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) { + ret = ip6_savecontrol(in6p, m, &opts); + if (ret != 0) { + udp_unlock(in6p->in6p_socket, 1, 0); + goto bad; + } + } m_adj(m, off + sizeof(struct udphdr)); + if (nstat_collect) { + locked_add_64(&in6p->inp_stat->rxpackets, 1); + locked_add_64(&in6p->inp_stat->rxbytes, m->m_pkthdr.len); + } + so_recv_data_stat(in6p->in6p_socket, m, 0); if (sbappendaddr(&in6p->in6p_socket->so_rcv, (struct sockaddr *)&udp_in6, m, opts, NULL) == 0) { @@ -527,10 +590,10 @@ udp6_ctlinput( (void) in6_pcbnotify(&udbinfo, sa, uh.uh_dport, (struct sockaddr*)ip6cp->ip6c_src, - uh.uh_sport, cmd, notify); + uh.uh_sport, cmd, NULL, notify); } else (void) in6_pcbnotify(&udbinfo, sa, 0, (struct sockaddr *)&sa6_src, - 0, cmd, notify); + 0, cmd, NULL, notify); } #ifndef __APPLE__ @@ -561,6 +624,12 @@ udp6_getcred SYSCTL_HANDLER_ARGS error = ENOENT; goto out; } + /* + * XXX This should not be copying out a credential!!!! This + * XXX is an opaque type, and is not intended to be introspected, + * XXX and the size of this structure *WILL* change as planned MACF + * XXX and kauth changes go forward. + */ error = SYSCTL_OUT(req, inp->inp_socket->so_cred->pc_ucred, sizeof(*(kauth_cred_t)0)); @@ -619,6 +688,7 @@ udp6_attach(struct socket *so, __unused int proto, struct proc *p) * which may match an IPv4-mapped IPv6 address. 
*/ inp->inp_ip_ttl = ip_defttl; + nstat_udp_new_pcb(inp); return 0; } @@ -676,7 +746,7 @@ udp6_connect(struct socket *so, struct sockaddr *nam, struct proc *p) if (inp->inp_faddr.s_addr != INADDR_ANY) return EISCONN; in6_sin6_2_sin(&sin, sin6_p); - error = in_pcbconnect(inp, (struct sockaddr *)&sin, p); + error = in_pcbconnect(inp, (struct sockaddr *)&sin, p, NULL); if (error == 0) { inp->inp_vflag |= INP_IPV4; inp->inp_vflag &= ~INP_IPV6; @@ -732,6 +802,7 @@ udp6_disconnect(struct socket *so) in6_pcbdisconnect(inp); inp->in6p_laddr = in6addr_any; + inp->in6p_last_outif = 0; so->so_state &= ~SS_ISCONNECTED; /* XXX */ return 0; } diff --git a/bsd/netinet6/udp6_var.h b/bsd/netinet6/udp6_var.h index 18274d10f..bd6916e4e 100644 --- a/bsd/netinet6/udp6_var.h +++ b/bsd/netinet6/udp6_var.h @@ -72,7 +72,7 @@ SYSCTL_DECL(_net_inet6_udp6); extern struct pr_usrreqs udp6_usrreqs; void udp6_ctlinput(int, struct sockaddr *, void *); -int udp6_input(struct mbuf **, int *); +int udp6_input(struct mbuf **, int *, int); int udp6_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct proc *p); diff --git a/bsd/netkey/Makefile b/bsd/netkey/Makefile index def3c0629..1a68c8a44 100644 --- a/bsd/netkey/Makefile +++ b/bsd/netkey/Makefile @@ -9,14 +9,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ DATAFILES = \ diff --git a/bsd/netkey/key.c b/bsd/netkey/key.c index 73a605869..457f772ec 100644 --- a/bsd/netkey/key.c +++ b/bsd/netkey/key.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Inc. All rights reserved. + * Copyright (c) 2008-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -78,6 +78,7 @@ #include <sys/proc.h> #include <sys/queue.h> #include <sys/syslog.h> +#include <sys/mcache.h> #include <kern/locks.h> @@ -152,9 +153,6 @@ lck_grp_attr_t *pfkey_stat_mutex_grp_attr; lck_attr_t *pfkey_stat_mutex_attr; lck_mtx_t *pfkey_stat_mutex; - -extern lck_mtx_t *nd6_mutex; - /* * Note on SA reference counting: * - SAs that are not in DEAD state will have (total external reference + 1) @@ -270,61 +268,61 @@ static int ipsec_esp_auth = 0; static int ipsec_ah_keymin = 128; SYSCTL_DECL(_net_key); - -SYSCTL_INT(_net_key, KEYCTL_DEBUG_LEVEL, debug, CTLFLAG_RW, \ +/* Thread safe: no accumulated state */ +SYSCTL_INT(_net_key, KEYCTL_DEBUG_LEVEL, debug, CTLFLAG_RW | CTLFLAG_LOCKED, \ &key_debug_level, 0, ""); /* max count of trial for the decision of spi value */ -SYSCTL_INT(_net_key, KEYCTL_SPI_TRY, spi_trycnt, CTLFLAG_RW, \ +SYSCTL_INT(_net_key, KEYCTL_SPI_TRY, spi_trycnt, CTLFLAG_RW | CTLFLAG_LOCKED, \ &key_spi_trycnt, 0, ""); /* minimum spi value to allocate automatically. */ -SYSCTL_INT(_net_key, KEYCTL_SPI_MIN_VALUE, spi_minval, CTLFLAG_RW, \ +SYSCTL_INT(_net_key, KEYCTL_SPI_MIN_VALUE, spi_minval, CTLFLAG_RW | CTLFLAG_LOCKED, \ &key_spi_minval, 0, ""); /* maximun spi value to allocate automatically. 
*/ -SYSCTL_INT(_net_key, KEYCTL_SPI_MAX_VALUE, spi_maxval, CTLFLAG_RW, \ +SYSCTL_INT(_net_key, KEYCTL_SPI_MAX_VALUE, spi_maxval, CTLFLAG_RW | CTLFLAG_LOCKED, \ &key_spi_maxval, 0, ""); /* interval to initialize randseed */ -SYSCTL_INT(_net_key, KEYCTL_RANDOM_INT, int_random, CTLFLAG_RW, \ +SYSCTL_INT(_net_key, KEYCTL_RANDOM_INT, int_random, CTLFLAG_RW | CTLFLAG_LOCKED, \ &key_int_random, 0, ""); -/* lifetime for larval SA */ -SYSCTL_INT(_net_key, KEYCTL_LARVAL_LIFETIME, larval_lifetime, CTLFLAG_RW, \ +/* lifetime for larval SA; thread safe due to > compare */ +SYSCTL_INT(_net_key, KEYCTL_LARVAL_LIFETIME, larval_lifetime, CTLFLAG_RW | CTLFLAG_LOCKED, \ &key_larval_lifetime, 0, ""); /* counter for blocking to send SADB_ACQUIRE to IKEd */ -SYSCTL_INT(_net_key, KEYCTL_BLOCKACQ_COUNT, blockacq_count, CTLFLAG_RW, \ +SYSCTL_INT(_net_key, KEYCTL_BLOCKACQ_COUNT, blockacq_count, CTLFLAG_RW | CTLFLAG_LOCKED, \ &key_blockacq_count, 0, ""); -/* lifetime for blocking to send SADB_ACQUIRE to IKEd */ -SYSCTL_INT(_net_key, KEYCTL_BLOCKACQ_LIFETIME, blockacq_lifetime, CTLFLAG_RW, \ +/* lifetime for blocking to send SADB_ACQUIRE to IKEd: Thread safe, > compare */ +SYSCTL_INT(_net_key, KEYCTL_BLOCKACQ_LIFETIME, blockacq_lifetime, CTLFLAG_RW | CTLFLAG_LOCKED, \ &key_blockacq_lifetime, 0, ""); /* ESP auth */ -SYSCTL_INT(_net_key, KEYCTL_ESP_AUTH, esp_auth, CTLFLAG_RW, \ +SYSCTL_INT(_net_key, KEYCTL_ESP_AUTH, esp_auth, CTLFLAG_RW | CTLFLAG_LOCKED, \ &ipsec_esp_auth, 0, ""); /* minimum ESP key length */ -SYSCTL_INT(_net_key, KEYCTL_ESP_KEYMIN, esp_keymin, CTLFLAG_RW, \ +SYSCTL_INT(_net_key, KEYCTL_ESP_KEYMIN, esp_keymin, CTLFLAG_RW | CTLFLAG_LOCKED, \ &ipsec_esp_keymin, 0, ""); /* minimum AH key length */ -SYSCTL_INT(_net_key, KEYCTL_AH_KEYMIN, ah_keymin, CTLFLAG_RW, \ +SYSCTL_INT(_net_key, KEYCTL_AH_KEYMIN, ah_keymin, CTLFLAG_RW | CTLFLAG_LOCKED, \ &ipsec_ah_keymin, 0, ""); /* perfered old SA rather than new SA */ -SYSCTL_INT(_net_key, KEYCTL_PREFERED_OLDSA, prefered_oldsa, CTLFLAG_RW,\ +SYSCTL_INT(_net_key, KEYCTL_PREFERED_OLDSA, prefered_oldsa, CTLFLAG_RW | CTLFLAG_LOCKED,\ &key_preferred_oldsa, 0, ""); /* time between NATT keepalives in seconds, 0 disabled */ -SYSCTL_INT(_net_key, KEYCTL_NATT_KEEPALIVE_INTERVAL, natt_keepalive_interval, CTLFLAG_RW,\ +SYSCTL_INT(_net_key, KEYCTL_NATT_KEEPALIVE_INTERVAL, natt_keepalive_interval, CTLFLAG_RW | CTLFLAG_LOCKED,\ &natt_keepalive_interval, 0, ""); /* PF_KEY statistics */ -SYSCTL_STRUCT(_net_key, KEYCTL_PFKEYSTAT, pfkeystat, CTLFLAG_RD,\ +SYSCTL_STRUCT(_net_key, KEYCTL_PFKEYSTAT, pfkeystat, CTLFLAG_RD | CTLFLAG_LOCKED,\ &pfkeystat, pfkeystat, ""); #ifndef LIST_FOREACH @@ -566,7 +564,7 @@ void key_init(void); /* * PF_KEY init - * setup locks and call raw_init() + * setup locks, call raw_init(), and then init timer and associated data * */ void @@ -597,7 +595,46 @@ key_init(void) LIST_INIT(&spihash[i]); raw_init(); + + bzero((caddr_t)&key_cb, sizeof(key_cb)); + for (i = 0; i < IPSEC_DIR_MAX; i++) { + LIST_INIT(&sptree[i]); + } + ipsec_policy_count = 0; + + LIST_INIT(&sahtree); + + for (i = 0; i <= SADB_SATYPE_MAX; i++) { + LIST_INIT(®tree[i]); + } + ipsec_sav_count = 0; + +#ifndef IPSEC_NONBLOCK_ACQUIRE + LIST_INIT(&acqtree); +#endif + LIST_INIT(&spacqtree); + + /* system default */ +#if INET + ip4_def_policy.policy = IPSEC_POLICY_NONE; + ip4_def_policy.refcnt++; /*never reclaim this*/ +#endif +#if INET6 + ip6_def_policy.policy = IPSEC_POLICY_NONE; + ip6_def_policy.refcnt++; /*never reclaim this*/ +#endif + +#ifndef IPSEC_DEBUG2 + timeout((void *)key_timehandler, (void 
*)0, hz); +#endif /*IPSEC_DEBUG2*/ + + /* initialize key statistics */ + keystat.getspi_count = 1; + +#ifndef __APPLE__ + printf("IPsec: Initialized Security Association Processing.\n"); +#endif } @@ -609,9 +646,9 @@ key_init(void) * others: found and return the pointer. */ struct secpolicy * -key_allocsp(spidx, dir) - struct secpolicyindex *spidx; - u_int dir; +key_allocsp( + struct secpolicyindex *spidx, + u_int dir) { struct secpolicy *sp; struct timeval tv; @@ -670,8 +707,11 @@ found: * XXX slow */ struct secpolicy * -key_gettunnel(osrc, odst, isrc, idst) - struct sockaddr *osrc, *odst, *isrc, *idst; +key_gettunnel( + struct sockaddr *osrc, + struct sockaddr *odst, + struct sockaddr *isrc, + struct sockaddr *idst) { struct secpolicy *sp; const int dir = IPSEC_DIR_INBOUND; @@ -744,10 +784,10 @@ found: * ENOENT: policy may be valid, but SA with REQUIRE is on acquiring. */ int -key_checkrequest(isr, saidx, sav) - struct ipsecrequest *isr; - struct secasindex *saidx; - struct secasvar **sav; +key_checkrequest( + struct ipsecrequest *isr, + struct secasindex *saidx, + struct secasvar **sav) { u_int level; int error; @@ -814,8 +854,8 @@ key_checkrequest(isr, saidx, sav) u_int32_t sah_search_calls = 0; u_int32_t sah_search_count = 0; struct secasvar * -key_allocsa_policy(saidx) - struct secasindex *saidx; +key_allocsa_policy( + struct secasindex *saidx) { struct secashead *sah; struct secasvar *sav; @@ -879,10 +919,10 @@ key_allocsa_policy(saidx) * others : found, pointer to a SA. */ static struct secasvar * -key_do_allocsa_policy(sah, state, dstport) - struct secashead *sah; - u_int state; - u_int16_t dstport; +key_do_allocsa_policy( + struct secashead *sah, + u_int state, + u_int16_t dstport) { struct secasvar *sav, *nextsav, *candidate, *natt_candidate, *no_natt_candidate, *d; @@ -1060,10 +1100,12 @@ key_do_allocsa_policy(sah, state, dstport) * keep source address in IPsec SA. We see a tricky situation here. */ struct secasvar * -key_allocsa(family, src, dst, proto, spi) - u_int family, proto; - caddr_t src, dst; - u_int32_t spi; +key_allocsa( + u_int family, + caddr_t src, + caddr_t dst, + u_int proto, + u_int32_t spi) { struct secasvar *sav, *match; u_int stateidx, state, tmpidx, matchidx; @@ -1214,8 +1256,8 @@ found: } u_int16_t -key_natt_get_translated_port(outsav) - struct secasvar *outsav; +key_natt_get_translated_port( + struct secasvar *outsav) { struct secasindex saidx; @@ -1271,10 +1313,10 @@ found: } static int -key_do_get_translated_port(sah, outsav, state) - struct secashead *sah; - struct secasvar *outsav; - u_int state; +key_do_get_translated_port( + struct secashead *sah, + struct secasvar *outsav, + u_int state) { struct secasvar *currsav, *nextsav, *candidate; @@ -1338,9 +1380,9 @@ key_do_get_translated_port(sah, outsav, state) * For both the packet without socket and key_freeso(). */ void -key_freesp(sp, locked) - struct secpolicy *sp; - int locked; +key_freesp( + struct secpolicy *sp, + int locked) { /* sanity check */ @@ -1371,8 +1413,8 @@ static void key_freesp_so(struct secpolicy **); * For the packet with socket. */ void -key_freeso(so) - struct socket *so; +key_freeso( + struct socket *so) { /* sanity check */ @@ -1429,8 +1471,8 @@ done: } static void -key_freesp_so(sp) - struct secpolicy **sp; +key_freesp_so( + struct secpolicy **sp) { /* sanity check */ @@ -1464,9 +1506,9 @@ key_freesp_so(sp) * for a policy. 
*/ void -key_freesav(sav, locked) - struct secasvar *sav; - int locked; +key_freesav( + struct secasvar *sav, + int locked) { /* sanity check */ @@ -1494,8 +1536,8 @@ key_freesav(sav, locked) * free security policy entry. */ static void -key_delsp(sp) - struct secpolicy *sp; +key_delsp( + struct secpolicy *sp) { /* sanity check */ @@ -1534,8 +1576,8 @@ key_delsp(sp) * others : found, pointer to a SP. */ static struct secpolicy * -key_getsp(spidx) - struct secpolicyindex *spidx; +key_getsp( + struct secpolicyindex *spidx) { struct secpolicy *sp; @@ -1563,8 +1605,8 @@ key_getsp(spidx) * others : found, pointer to a SP. */ static struct secpolicy * -key_getspbyid(id) - u_int32_t id; +key_getspbyid( + u_int32_t id) { struct secpolicy *sp; @@ -1592,7 +1634,7 @@ key_getspbyid(id) } struct secpolicy * -key_newsp() +key_newsp(void) { struct secpolicy *newsp = NULL; @@ -1613,10 +1655,10 @@ key_newsp() * so must be set properly later. */ struct secpolicy * -key_msg2sp(xpl0, len, error) - struct sadb_x_policy *xpl0; - size_t len; - int *error; +key_msg2sp( + struct sadb_x_policy *xpl0, + size_t len, + int *error) { struct secpolicy *newsp; @@ -1835,7 +1877,7 @@ key_msg2sp(xpl0, len, error) } static u_int32_t -key_newreqid() +key_newreqid(void) { lck_mtx_lock(sadb_mutex); static u_int32_t auto_reqid = IPSEC_MANUAL_REQID_MAX + 1; @@ -1853,8 +1895,8 @@ key_newreqid() * copy secpolicy struct to sadb_x_policy structure indicated. */ struct mbuf * -key_sp2msg(sp) - struct secpolicy *sp; +key_sp2msg( + struct secpolicy *sp) { struct sadb_x_policy *xpl; int tlen; @@ -2006,10 +2048,10 @@ fail: * m will always be freed. */ static int -key_spdadd(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_spdadd( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct sadb_address *src0, *dst0; struct sadb_x_policy *xpl0, *xpl; @@ -2266,7 +2308,7 @@ key_spdadd(so, m, mhp) * others: success. */ static u_int32_t -key_getnewspid() +key_getnewspid(void) { u_int32_t newid = 0; int count = key_spi_trycnt; /* XXX */ @@ -2304,10 +2346,10 @@ key_getnewspid() * m will always be freed. */ static int -key_spddelete(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_spddelete( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct sadb_address *src0, *dst0; struct sadb_x_policy *xpl0; @@ -2406,10 +2448,10 @@ key_spddelete(so, m, mhp) * m will always be freed. */ static int -key_spddelete2(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_spddelete2( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { u_int32_t id; struct secpolicy *sp; @@ -2507,10 +2549,10 @@ key_spddelete2(so, m, mhp) * m will always be freed. */ static int -key_spdget(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_spdget( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { u_int32_t id; struct secpolicy *sp; @@ -2562,8 +2604,8 @@ key_spdget(so, m, mhp) * others: error number */ int -key_spdacquire(sp) - struct secpolicy *sp; +key_spdacquire( + struct secpolicy *sp) { struct mbuf *result = NULL, *m; struct secspacq *newspacq; @@ -2637,10 +2679,10 @@ fail: * m will always be freed. 
*/ static int -key_spdflush(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_spdflush( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct sadb_msg *newmsg; struct secpolicy *sp; @@ -2690,10 +2732,10 @@ key_spdflush(so, m, mhp) */ static int -key_spddump(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_spddump( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct secpolicy *sp, **spbuf = NULL, **sp_ptr; int cnt = 0, bufcount; @@ -2762,10 +2804,11 @@ end: } static struct mbuf * -key_setdumpsp(sp, type, seq, pid) - struct secpolicy *sp; - u_int8_t type; - u_int32_t seq, pid; +key_setdumpsp( + struct secpolicy *sp, + u_int8_t type, + u_int32_t seq, + u_int32_t pid) { struct mbuf *result = NULL, *m; @@ -2820,8 +2863,8 @@ fail: * get PFKEY message length for security policy and request. */ static u_int -key_getspreqmsglen(sp) - struct secpolicy *sp; +key_getspreqmsglen( + struct secpolicy *sp) { u_int tlen; @@ -2858,12 +2901,12 @@ key_getspreqmsglen(sp) * others : error number */ static int -key_spdexpire(sp) - struct secpolicy *sp; +key_spdexpire( + struct secpolicy *sp) { struct mbuf *result = NULL, *m; int len; - int error = -1; + int error = EINVAL; struct sadb_lifetime *lt; lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_NOTOWNED); @@ -2969,9 +3012,9 @@ key_spdexpire(sp) * others : pointer to new SA head. */ static struct secashead * -key_newsah(saidx, dir) - struct secasindex *saidx; - u_int8_t dir; +key_newsah( + struct secasindex *saidx, + u_int8_t dir) { struct secashead *newsah; @@ -3019,8 +3062,8 @@ key_newsah(saidx, dir) * delete SA index and all SA registerd. */ static void -key_delsah(sah) - struct secashead *sah; +key_delsah( + struct secashead *sah) { struct secasvar *sav, *nextsav; u_int stateidx, state; @@ -3092,11 +3135,11 @@ key_delsah(sah) * does not modify mbuf. does not free mbuf on error. */ static struct secasvar * -key_newsav(m, mhp, sah, errp) - struct mbuf *m; - const struct sadb_msghdr *mhp; - struct secashead *sah; - int *errp; +key_newsav( + struct mbuf *m, + const struct sadb_msghdr *mhp, + struct secashead *sah, + int *errp) { struct secasvar *newsav; const struct sadb_sa *xsa; @@ -3187,8 +3230,8 @@ key_newsav(m, mhp, sah, errp) * free() SA variable entry. */ static void -key_delsav(sav) - struct secasvar *sav; +key_delsav( + struct secasvar *sav) { lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); @@ -3256,8 +3299,8 @@ key_delsav(sav) * others : found, pointer to a SA. */ static struct secashead * -key_getsah(saidx) - struct secasindex *saidx; +key_getsah( + struct secasindex *saidx) { struct secashead *sah; @@ -3281,9 +3324,9 @@ key_getsah(saidx) * others : found, pointer to a SA. */ static struct secasvar * -key_checkspidup(saidx, spi) - struct secasindex *saidx; - u_int32_t spi; +key_checkspidup( + struct secasindex *saidx, + u_int32_t spi) { struct secasvar *sav; u_int stateidx, state; @@ -3314,9 +3357,9 @@ key_checkspidup(saidx, spi) } static void -key_setspi(sav, spi) - struct secasvar *sav; - u_int32_t spi; +key_setspi( + struct secasvar *sav, + u_int32_t spi) { lck_mtx_assert(sadb_mutex, LCK_MTX_ASSERT_OWNED); sav->spi = spi; @@ -3333,9 +3376,9 @@ key_setspi(sav, spi) * others : found, pointer to a SA. 
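+ *
+ * (Inbound AH/ESP processing identifies the SA by the SPI carried in
+ *  the packet header, so this lookup scans the SA lists hanging off the
+ *  given SA head for an alive entry whose stored SPI matches.)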
*/ static struct secasvar * -key_getsavbyspi(sah, spi) - struct secashead *sah; - u_int32_t spi; +key_getsavbyspi( + struct secashead *sah, + u_int32_t spi) { struct secasvar *sav, *match; u_int stateidx, state, matchidx; @@ -3370,10 +3413,10 @@ key_getsavbyspi(sah, spi) * does not modify mbuf. does not free mbuf on error. */ static int -key_setsaval(sav, m, mhp) - struct secasvar *sav; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_setsaval( + struct secasvar *sav, + struct mbuf *m, + const struct sadb_msghdr *mhp) { #if IPSEC_ESP const struct esp_algorithm *algo; @@ -3688,8 +3731,8 @@ key_setsaval(sav, m, mhp) * other: errno */ static int -key_mature(sav) - struct secasvar *sav; +key_mature( + struct secasvar *sav) { int mature; int checkmask = 0; /* 2^0: ealg 2^1: aalg 2^2: calg */ @@ -3867,10 +3910,12 @@ key_mature(sav) * subroutine for SADB_GET and SADB_DUMP. */ static struct mbuf * -key_setdumpsa(sav, type, satype, seq, pid) - struct secasvar *sav; - u_int8_t type, satype; - u_int32_t seq, pid; +key_setdumpsa( + struct secasvar *sav, + u_int8_t type, + u_int8_t satype, + u_int32_t seq, + u_int32_t pid) { struct mbuf *result = NULL, *tres = NULL, *m; int l = 0; @@ -4016,12 +4061,13 @@ fail: * set data into sadb_msg. */ static struct mbuf * -key_setsadbmsg(type, tlen, satype, seq, pid, reserved) - u_int8_t type, satype; - u_int16_t tlen; - u_int32_t seq; - pid_t pid; - u_int16_t reserved; +key_setsadbmsg( + u_int8_t type, + u_int16_t tlen, + u_int8_t satype, + u_int32_t seq, + pid_t pid, + u_int16_t reserved) { struct mbuf *m; struct sadb_msg *p; @@ -4062,8 +4108,8 @@ key_setsadbmsg(type, tlen, satype, seq, pid, reserved) * copy secasvar data into sadb_address. */ static struct mbuf * -key_setsadbsa(sav) - struct secasvar *sav; +key_setsadbsa( + struct secasvar *sav) { struct mbuf *m; struct sadb_sa *p; @@ -4096,11 +4142,11 @@ key_setsadbsa(sav) * set data into sadb_address. */ static struct mbuf * -key_setsadbaddr(exttype, saddr, prefixlen, ul_proto) - u_int16_t exttype; - struct sockaddr *saddr; - u_int8_t prefixlen; - u_int16_t ul_proto; +key_setsadbaddr( + u_int16_t exttype, + struct sockaddr *saddr, + u_int8_t prefixlen, + u_int16_t ul_proto) { struct mbuf *m; struct sadb_address *p; @@ -4218,11 +4264,12 @@ key_setsadbsastat (u_int32_t dir, * set data into sadb_ident. */ static struct mbuf * -key_setsadbident(exttype, idtype, string, stringlen, id) - u_int16_t exttype, idtype; - caddr_t string; - int stringlen; - u_int64_t id; +key_setsadbident( + u_int16_t exttype, + u_int16_t idtype, + caddr_t string, + int stringlen, + u_int64_t id) { struct mbuf *m; struct sadb_ident *p; @@ -4257,9 +4304,10 @@ key_setsadbident(exttype, idtype, string, stringlen, id) * set data into sadb_x_sa2. */ static struct mbuf * -key_setsadbxsa2(mode, seq, reqid) - u_int8_t mode; - u_int32_t seq, reqid; +key_setsadbxsa2( + u_int8_t mode, + u_int32_t seq, + u_int32_t reqid) { struct mbuf *m; struct sadb_x_sa2 *p; @@ -4291,10 +4339,10 @@ key_setsadbxsa2(mode, seq, reqid) * set data into sadb_x_policy */ static struct mbuf * -key_setsadbxpolicy(type, dir, id) - u_int16_t type; - u_int8_t dir; - u_int32_t id; +key_setsadbxpolicy( + u_int16_t type, + u_int8_t dir, + u_int32_t id) { struct mbuf *m; struct sadb_x_policy *p; @@ -4325,9 +4373,9 @@ key_setsadbxpolicy(type, dir, id) * copy a buffer into the new buffer allocated. 
*/ static void * -key_newbuf(src, len) - const void *src; - u_int len; +key_newbuf( + const void *src, + u_int len) { caddr_t new; @@ -4352,8 +4400,8 @@ key_newbuf(src, len) * 0: false */ int -key_ismyaddr(sa) - struct sockaddr *sa; +key_ismyaddr( + struct sockaddr *sa) { #if INET struct sockaddr_in *sin; @@ -4370,15 +4418,17 @@ key_ismyaddr(sa) lck_rw_lock_shared(in_ifaddr_rwlock); sin = (struct sockaddr_in *)sa; for (ia = in_ifaddrhead.tqh_first; ia; - ia = ia->ia_link.tqe_next) - { + ia = ia->ia_link.tqe_next) { + IFA_LOCK_SPIN(&ia->ia_ifa); if (sin->sin_family == ia->ia_addr.sin_family && sin->sin_len == ia->ia_addr.sin_len && sin->sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr) { + IFA_UNLOCK(&ia->ia_ifa); lck_rw_done(in_ifaddr_rwlock); return 1; } + IFA_UNLOCK(&ia->ia_ifa); } lck_rw_done(in_ifaddr_rwlock); break; @@ -4402,19 +4452,22 @@ key_ismyaddr(sa) #include <netinet6/in6_var.h> static int -key_ismyaddr6(sin6) - struct sockaddr_in6 *sin6; +key_ismyaddr6( + struct sockaddr_in6 *sin6) { struct in6_ifaddr *ia; struct in6_multi *in6m; - lck_mtx_lock(nd6_mutex); + lck_rw_lock_shared(&in6_ifaddr_rwlock); for (ia = in6_ifaddrs; ia; ia = ia->ia_next) { + IFA_LOCK(&ia->ia_ifa); if (key_sockaddrcmp((struct sockaddr *)&sin6, (struct sockaddr *)&ia->ia_addr, 0) == 0) { - lck_mtx_unlock(nd6_mutex); + IFA_UNLOCK(&ia->ia_ifa); + lck_rw_done(&in6_ifaddr_rwlock); return 1; } + IFA_UNLOCK(&ia->ia_ifa); /* * XXX Multicast @@ -4423,15 +4476,16 @@ key_ismyaddr6(sin6) * XXX scope */ in6m = NULL; - ifnet_lock_shared(ia->ia_ifp); - IN6_LOOKUP_MULTI(sin6->sin6_addr, ia->ia_ifp, in6m); - ifnet_lock_done(ia->ia_ifp); - if (in6m) { - lck_mtx_unlock(nd6_mutex); + in6_multihead_lock_shared(); + IN6_LOOKUP_MULTI(&sin6->sin6_addr, ia->ia_ifp, in6m); + in6_multihead_lock_done(); + if (in6m != NULL) { + lck_rw_done(&in6_ifaddr_rwlock); + IN6M_REMREF(in6m); return 1; } } - lck_mtx_unlock(nd6_mutex); + lck_rw_done(&in6_ifaddr_rwlock); /* loopback, just for safety */ if (IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr)) @@ -4454,9 +4508,10 @@ key_ismyaddr6(sin6) * 0 : not equal */ static int -key_cmpsaidx(saidx0, saidx1, flag) - struct secasindex *saidx0, *saidx1; - int flag; +key_cmpsaidx( + struct secasindex *saidx0, + struct secasindex *saidx1, + int flag) { /* sanity */ if (saidx0 == NULL && saidx1 == NULL) @@ -4517,8 +4572,9 @@ key_cmpsaidx(saidx0, saidx1, flag) * 0 : not equal */ static int -key_cmpspidx_exactly(spidx0, spidx1) - struct secpolicyindex *spidx0, *spidx1; +key_cmpspidx_exactly( + struct secpolicyindex *spidx0, + struct secpolicyindex *spidx1) { /* sanity */ if (spidx0 == NULL && spidx1 == NULL) @@ -4554,8 +4610,9 @@ key_cmpspidx_exactly(spidx0, spidx1) * 0 : not equal */ static int -key_cmpspidx_withmask(spidx0, spidx1) - struct secpolicyindex *spidx0, *spidx1; +key_cmpspidx_withmask( + struct secpolicyindex *spidx0, + struct secpolicyindex *spidx1) { /* sanity */ if (spidx0 == NULL && spidx1 == NULL) @@ -4652,10 +4709,10 @@ key_cmpspidx_withmask(spidx0, spidx1) /* returns 0 on match */ static int -key_sockaddrcmp(sa1, sa2, port) - struct sockaddr *sa1; - struct sockaddr *sa2; - int port; +key_sockaddrcmp( + struct sockaddr *sa1, + struct sockaddr *sa2, + int port) { if (sa1->sa_family != sa2->sa_family || sa1->sa_len != sa2->sa_len) return 1; @@ -4707,9 +4764,10 @@ key_sockaddrcmp(sa1, sa2, port) * 0 : not equal */ static int -key_bbcmp(p1, p2, bits) - caddr_t p1, p2; - u_int bits; +key_bbcmp( + caddr_t p1, + caddr_t p2, + u_int bits) { u_int8_t mask; @@ -5154,7 +5212,7 @@ key_timehandler(void) * to 
initialize a seed for random() */ static void -key_srandom() +key_srandom(void) { #ifdef __APPLE__ /* Our PRNG is based on Yarrow and doesn't need to be seeded */ @@ -5171,7 +5229,7 @@ key_srandom() } u_int32_t -key_random() +key_random(void) { u_int32_t value; @@ -5180,9 +5238,9 @@ key_random() } void -key_randomfill(p, l) - void *p; - size_t l; +key_randomfill( + void *p, + size_t l) { #ifdef __APPLE__ @@ -5217,8 +5275,8 @@ key_randomfill(p, l) * 0: invalid satype. */ static u_int16_t -key_satype2proto(satype) - u_int8_t satype; +key_satype2proto( + u_int8_t satype) { switch (satype) { case SADB_SATYPE_UNSPEC: @@ -5242,8 +5300,8 @@ key_satype2proto(satype) * 0: invalid protocol type. */ static u_int8_t -key_proto2satype(proto) - u_int16_t proto; +key_proto2satype( + u_int16_t proto) { switch (proto) { case IPPROTO_AH: @@ -5273,10 +5331,10 @@ key_proto2satype(proto) * other if success, return pointer to the message to send. */ static int -key_getspi(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_getspi( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct sadb_address *src0, *dst0; struct secasindex saidx; @@ -5483,9 +5541,9 @@ key_getspi(so, m, mhp) * others: success. */ static u_int32_t -key_do_getnewspi(spirange, saidx) - struct sadb_spirange *spirange; - struct secasindex *saidx; +key_do_getnewspi( + struct sadb_spirange *spirange, + struct secasindex *saidx) { u_int32_t newspi; u_int32_t keymin, keymax; @@ -5567,10 +5625,10 @@ key_do_getnewspi(spirange, saidx) * m will always be freed. */ static int -key_update(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_update( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct sadb_sa *sa0; struct sadb_address *src0, *dst0; @@ -5743,9 +5801,9 @@ key_update(so, m, mhp) */ #if IPSEC_DOSEQCHECK static struct secasvar * -key_getsavbyseq(sah, seq) - struct secashead *sah; - u_int32_t seq; +key_getsavbyseq( + struct secashead *sah, + u_int32_t seq) { struct secasvar *sav; u_int state; @@ -5789,10 +5847,10 @@ key_getsavbyseq(sah, seq) * m will always be freed. */ static int -key_add(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_add( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct sadb_sa *sa0; struct sadb_address *src0, *dst0; @@ -5925,10 +5983,10 @@ key_add(so, m, mhp) /* m is retained */ static int -key_setident(sah, m, mhp) - struct secashead *sah; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_setident( + struct secashead *sah, + struct mbuf *m, + const struct sadb_msghdr *mhp) { const struct sadb_ident *idsrc, *iddst; int idsrclen, iddstlen; @@ -6009,9 +6067,9 @@ key_setident(sah, m, mhp) * it is caller's responsibility to free the result. */ static struct mbuf * -key_getmsgbuf_x1(m, mhp) - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_getmsgbuf_x1( + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct mbuf *n; int mbufItems[] = {SADB_EXT_RESERVED, SADB_EXT_SA, @@ -6056,10 +6114,10 @@ static int key_delete_all(struct socket *, struct mbuf *, * m will always be freed. */ static int -key_delete(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_delete( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct sadb_sa *sa0; struct sadb_address *src0, *dst0; @@ -6169,11 +6227,11 @@ key_delete(so, m, mhp) * delete all SAs for src/dst. 
Called from key_delete(). */ static int -key_delete_all(so, m, mhp, proto) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; - u_int16_t proto; +key_delete_all( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp, + u_int16_t proto) { struct sadb_address *src0, *dst0; struct secasindex saidx; @@ -6259,10 +6317,10 @@ key_delete_all(so, m, mhp, proto) * m will always be freed. */ static int -key_get(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_get( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct sadb_sa *sa0; struct sadb_address *src0, *dst0; @@ -6427,8 +6485,8 @@ key_getsastatbyspi (struct sastat *stat_arg, /* XXX make it sysctl-configurable? */ static void -key_getcomb_setlifetime(comb) - struct sadb_comb *comb; +key_getcomb_setlifetime( + struct sadb_comb *comb) { comb->sadb_comb_soft_allocations = 1; @@ -6447,7 +6505,7 @@ key_getcomb_setlifetime(comb) * XXX no idea if the user wants ESP authentication or not */ static struct mbuf * -key_getcomb_esp() +key_getcomb_esp(void) { struct sadb_comb *comb; const struct esp_algorithm *algo; @@ -6529,7 +6587,7 @@ key_getcomb_esp() * XXX reorder combinations by preference */ static struct mbuf * -key_getcomb_ah() +key_getcomb_ah(void) { struct sadb_comb *comb; const struct ah_algorithm *algo; @@ -6588,7 +6646,7 @@ key_getcomb_ah() * XXX reorder combinations by preference */ static struct mbuf * -key_getcomb_ipcomp() +key_getcomb_ipcomp(void) { struct sadb_comb *comb; const struct ipcomp_algorithm *algo; @@ -6634,8 +6692,8 @@ key_getcomb_ipcomp() * XXX sysctl interface to ipsec_{ah,esp}_keymin */ static struct mbuf * -key_getprop(saidx) - const struct secasindex *saidx; +key_getprop( + const struct secasindex *saidx) { struct sadb_prop *prop; struct mbuf *m, *n; @@ -6698,9 +6756,9 @@ key_getprop(saidx) * others: error number */ static int -key_acquire(saidx, sp) - struct secasindex *saidx; - struct secpolicy *sp; +key_acquire( + struct secasindex *saidx, + struct secpolicy *sp) { struct mbuf *result = NULL, *m; #ifndef IPSEC_NONBLOCK_ACQUIRE @@ -6883,8 +6941,8 @@ key_acquire(saidx, sp) #ifndef IPSEC_NONBLOCK_ACQUIRE static struct secacq * -key_newacq(saidx) - struct secasindex *saidx; +key_newacq( + struct secasindex *saidx) { struct secacq *newacq; struct timeval tv; @@ -6913,8 +6971,8 @@ key_newacq(saidx) } static struct secacq * -key_getacq(saidx) - struct secasindex *saidx; +key_getacq( + struct secasindex *saidx) { struct secacq *acq; @@ -6929,8 +6987,8 @@ key_getacq(saidx) } static struct secacq * -key_getacqbyseq(seq) - u_int32_t seq; +key_getacqbyseq( + u_int32_t seq) { struct secacq *acq; @@ -6946,8 +7004,8 @@ key_getacqbyseq(seq) #endif static struct secspacq * -key_newspacq(spidx) - struct secpolicyindex *spidx; +key_newspacq( + struct secpolicyindex *spidx) { struct secspacq *acq; struct timeval tv; @@ -6975,8 +7033,8 @@ key_newspacq(spidx) } static struct secspacq * -key_getspacq(spidx) - struct secpolicyindex *spidx; +key_getspacq( + struct secpolicyindex *spidx) { struct secspacq *acq; @@ -7005,10 +7063,10 @@ key_getspacq(spidx) * m will always be freed. */ static int -key_acquire2(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_acquire2( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { const struct sadb_address *src0, *dst0; struct secasindex saidx; @@ -7134,10 +7192,10 @@ key_acquire2(so, m, mhp) * m will always be freed. 
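+ *
+ * (PF_KEY background: a key management daemon such as racoon sends
+ *  SADB_REGISTER once per SA type it can negotiate; the kernel records
+ *  the socket in the corresponding regtree list so that later
+ *  SADB_ACQUIRE messages can be delivered to it, and the reply carries
+ *  the list of supported algorithms.)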
*/ static int -key_register(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_register( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct secreg *reg, *newreg = 0; @@ -7303,8 +7361,8 @@ key_register(so, m, mhp) * XXX: I want to do free a socket marked done SADB_RESIGER to socket. */ void -key_freereg(so) - struct socket *so; +key_freereg( + struct socket *so) { struct secreg *reg; int i; @@ -7344,8 +7402,8 @@ key_freereg(so) * others : error number */ static int -key_expire(sav) - struct secasvar *sav; +key_expire( + struct secasvar *sav) { int satype; struct mbuf *result = NULL, *m; @@ -7471,10 +7529,10 @@ key_expire(sav) * m will always be freed. */ static int -key_flush(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_flush( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct sadb_msg *newmsg; struct secashead *sah, *nextsah; @@ -7560,10 +7618,10 @@ struct sav_dump_elem { }; static int -key_dump(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_dump( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { struct secashead *sah; struct secasvar *sav; @@ -7679,10 +7737,10 @@ end: * m will always be freed. */ static int -key_promisc(so, m, mhp) - struct socket *so; - struct mbuf *m; - const struct sadb_msghdr *mhp; +key_promisc( + struct socket *so, + struct mbuf *m, + const struct sadb_msghdr *mhp) { int olen; @@ -7771,9 +7829,9 @@ static int (*key_typesw[])(struct socket *, struct mbuf *, * length for buffer to send to user process. */ int -key_parse(m, so) - struct mbuf *m; - struct socket *so; +key_parse( + struct mbuf *m, + struct socket *so) { struct sadb_msg *msg; struct sadb_msghdr mh; @@ -8026,10 +8084,10 @@ senderror: } static int -key_senderror(so, m, code) - struct socket *so; - struct mbuf *m; - int code; +key_senderror( + struct socket *so, + struct mbuf *m, + int code) { struct sadb_msg *msg; @@ -8049,9 +8107,9 @@ key_senderror(so, m, code) * XXX larger-than-MCLBYTES extension? 
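+ *
+ * (Background, PF_KEY v2 / RFC 2367: extension lengths are expressed in
+ *  8-byte units, so each sadb_ext begins on an 8-byte boundary and the
+ *  message can be walked extension by extension, roughly:
+ *
+ *      off = sizeof(struct sadb_msg);
+ *      while (off < end) {
+ *          ext = (struct sadb_ext *)(mtod(m, caddr_t) + off);
+ *          mhp->ext[ext->sadb_ext_type] = ext;     <- remember each one
+ *          off += PFKEY_UNUNIT64(ext->sadb_ext_len);
+ *      }
+ *
+ *  with duplicate or truncated extensions rejected as EINVAL.)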
*/ static int -key_align(m, mhp) - struct mbuf *m; - struct sadb_msghdr *mhp; +key_align( + struct mbuf *m, + struct sadb_msghdr *mhp) { struct mbuf *n; struct sadb_ext *ext; @@ -8156,9 +8214,9 @@ key_align(m, mhp) } static int -key_validate_ext(ext, len) - const struct sadb_ext *ext; - int len; +key_validate_ext( + const struct sadb_ext *ext, + int len) { struct sockaddr *sa; enum { NONE, ADDR } checktype = NONE; @@ -8216,50 +8274,8 @@ key_validate_ext(ext, len) } void -key_domain_init() +key_domain_init(void) { - int i; - - bzero((caddr_t)&key_cb, sizeof(key_cb)); - - for (i = 0; i < IPSEC_DIR_MAX; i++) { - LIST_INIT(&sptree[i]); - } - ipsec_policy_count = 0; - - LIST_INIT(&sahtree); - - for (i = 0; i <= SADB_SATYPE_MAX; i++) { - LIST_INIT(®tree[i]); - } - ipsec_sav_count = 0; - -#ifndef IPSEC_NONBLOCK_ACQUIRE - LIST_INIT(&acqtree); -#endif - LIST_INIT(&spacqtree); - - /* system default */ -#if INET - ip4_def_policy.policy = IPSEC_POLICY_NONE; - ip4_def_policy.refcnt++; /*never reclaim this*/ -#endif -#if INET6 - ip6_def_policy.policy = IPSEC_POLICY_NONE; - ip6_def_policy.refcnt++; /*never reclaim this*/ -#endif - -#ifndef IPSEC_DEBUG2 - timeout((void *)key_timehandler, (void *)0, hz); -#endif /*IPSEC_DEBUG2*/ - - /* initialize key statistics */ - keystat.getspi_count = 1; - -#ifndef __APPLE__ - printf("IPsec: Initialized Security Association Processing.\n"); -#endif - return; } @@ -8290,9 +8306,9 @@ key_checktunnelsanity( /* record data transfer on SA, and update timestamps */ void -key_sa_recordxfer(sav, m) - struct secasvar *sav; - struct mbuf *m; +key_sa_recordxfer( + struct secasvar *sav, + struct mbuf *m) { @@ -8343,8 +8359,8 @@ key_sa_recordxfer(sav, m) /* dumb version */ void -key_sa_routechange(dst) - struct sockaddr *dst; +key_sa_routechange( + struct sockaddr *dst) { struct secashead *sah; struct route *ro; @@ -8364,9 +8380,9 @@ key_sa_routechange(dst) } static void -key_sa_chgstate(sav, state) - struct secasvar *sav; - u_int8_t state; +key_sa_chgstate( + struct secasvar *sav, + u_int8_t state) { if (sav == NULL) @@ -8386,8 +8402,8 @@ key_sa_chgstate(sav, state) } void -key_sa_stir_iv(sav) - struct secasvar *sav; +key_sa_stir_iv( + struct secasvar *sav) { lck_mtx_lock(sadb_mutex); if (!sav->iv) @@ -8398,8 +8414,8 @@ key_sa_stir_iv(sav) /* XXX too much? */ static struct mbuf * -key_alloc_mbuf(l) - int l; +key_alloc_mbuf( + int l) { struct mbuf *m = NULL, *n; int len, t; diff --git a/bsd/nfs/Makefile b/bsd/nfs/Makefile index 10e246402..d4c4ce3cb 100644 --- a/bsd/nfs/Makefile +++ b/bsd/nfs/Makefile @@ -9,14 +9,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ DATAFILES = \ diff --git a/bsd/nfs/krpc.h b/bsd/nfs/krpc.h index 16fde5248..5f3b87677 100644 --- a/bsd/nfs/krpc.h +++ b/bsd/nfs/krpc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -44,7 +44,7 @@ int krpc_portmap(struct sockaddr_in *sin, /* - * RPC definitions for the portmapper + * RPC definitions for the portmapper (portmap and rpcbind) */ #define PMAPPORT 111 #define PMAPPROG 100000 @@ -56,6 +56,24 @@ int krpc_portmap(struct sockaddr_in *sin, #define PMAPPROC_DUMP 4 #define PMAPPROC_CALLIT 5 +#define RPCBPROG PMAPPROG +#define RPCBVERS3 3 +#define RPCBVERS4 4 +#define RPCBPROC_NULL 0 +#define RPCBPROC_SET 1 +#define RPCBPROC_UNSET 2 +#define RPCBPROC_GETADDR 3 +#define RPCBPROC_DUMP 4 +#define RPCBPROC_CALLIT 5 +#define RPCBPROC_BCAST RPCBPROC_CALLIT +#define RPCBPROC_GETTIME 6 +#define RPCBPROC_UADDR2TADDR 7 +#define RPCBPROC_TADDR2UADDR 8 +#define RPCBPROC_GETVERSADDR 9 +#define RPCBPROC_INDIRECT 10 +#define RPCBPROC_GETADDRLIST 11 +#define RPCBPROC_GETSTAT 12 + /* * RPC definitions for bootparamd diff --git a/bsd/nfs/nfs.h b/bsd/nfs/nfs.h index 821af8e5b..41b025389 100644 --- a/bsd/nfs/nfs.h +++ b/bsd/nfs/nfs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -146,14 +146,71 @@ __private_extern__ int nfs_ticks; * (These sizes are always a power of 2. If the kernel malloc() changes * to one that does not allocate space in powers of 2 size, then this all * becomes bunk!). - * Note that some of these structures come out of there own nfs zones. -*/ + * Note that some of these structures come out of their own nfs zones. + */ #define NFS_NODEALLOC 1024 #define NFS_MNTALLOC 1024 #define NFS_SVCALLOC 512 +#define NFS_ARGSVERSION_XDR 88 /* NFS mount args are in XDR format */ + +#define NFS_XDRARGS_VERSION_0 0 +#define NFS_MATTR_BITMAP_LEN 1 /* length of mount attributes bitmap */ +#define NFS_MFLAG_BITMAP_LEN 1 /* length of mount flags bitmap */ + +/* NFS mount attributes */ +#define NFS_MATTR_FLAGS 0 /* mount flags (NFS_MATTR_*) */ +#define NFS_MATTR_NFS_VERSION 1 /* NFS protocol version */ +#define NFS_MATTR_NFS_MINOR_VERSION 2 /* NFS protocol minor version */ +#define NFS_MATTR_READ_SIZE 3 /* READ RPC size */ +#define NFS_MATTR_WRITE_SIZE 4 /* WRITE RPC size */ +#define NFS_MATTR_READDIR_SIZE 5 /* READDIR RPC size */ +#define NFS_MATTR_READAHEAD 6 /* block readahead count */ +#define NFS_MATTR_ATTRCACHE_REG_MIN 7 /* minimum attribute cache time */ +#define NFS_MATTR_ATTRCACHE_REG_MAX 8 /* maximum attribute cache time */ +#define NFS_MATTR_ATTRCACHE_DIR_MIN 9 /* minimum attribute cache time for dirs */ +#define NFS_MATTR_ATTRCACHE_DIR_MAX 10 /* maximum attribute cache time for dirs */ +#define NFS_MATTR_LOCK_MODE 11 /* advisory file locking mode (NFS_LOCK_MODE_*) */ +#define NFS_MATTR_SECURITY 12 /* RPC security flavors to use */ +#define NFS_MATTR_MAX_GROUP_LIST 13 /* max # of RPC AUTH_SYS groups */ +#define NFS_MATTR_SOCKET_TYPE 14 /* socket transport type as a netid-like string */ +#define NFS_MATTR_NFS_PORT 15 /* port # to use for NFS protocol */ +#define NFS_MATTR_MOUNT_PORT 16 /* port # to use for MOUNT protocol */ +#define NFS_MATTR_REQUEST_TIMEOUT 17 /* initial RPC request timeout value */ +#define NFS_MATTR_SOFT_RETRY_COUNT 18 /* max RPC retransmissions for soft mounts */ +#define NFS_MATTR_DEAD_TIMEOUT 19 /* how long until unresponsive mount is considered dead */ +#define NFS_MATTR_FH 20 /* file handle for mount directory */ +#define NFS_MATTR_FS_LOCATIONS 21 /* list of locations for the file system */ +#define NFS_MATTR_MNTFLAGS 22 /* VFS mount flags (MNT_*) */ +#define 
NFS_MATTR_MNTFROM 23 /* fixed string to use for "f_mntfromname" */ + +/* NFS mount flags */ +#define NFS_MFLAG_SOFT 0 /* soft mount (requests fail if unresponsive) */ +#define NFS_MFLAG_INTR 1 /* allow operations to be interrupted */ +#define NFS_MFLAG_RESVPORT 2 /* use a reserved port */ +#define NFS_MFLAG_NOCONNECT 3 /* don't connect the socket (UDP) */ +#define NFS_MFLAG_DUMBTIMER 4 /* don't estimate RTT dynamically */ +#define NFS_MFLAG_CALLUMNT 5 /* call MOUNTPROC_UMNT on unmount */ +#define NFS_MFLAG_RDIRPLUS 6 /* request additional info when reading directories */ +#define NFS_MFLAG_NONEGNAMECACHE 7 /* don't do negative name caching */ +#define NFS_MFLAG_MUTEJUKEBOX 8 /* don't treat jukebox errors as unresponsive */ +#define NFS_MFLAG_EPHEMERAL 9 /* ephemeral (mirror) mount */ +#define NFS_MFLAG_NOCALLBACK 10 /* don't provide callback RPC service */ +#define NFS_MFLAG_NONAMEDATTR 11 /* don't use named attributes */ +#define NFS_MFLAG_NOACL 12 /* don't support ACLs */ +#define NFS_MFLAG_ACLONLY 13 /* only support ACLs - not mode */ +#define NFS_MFLAG_NFC 14 /* send NFC strings */ +#define NFS_MFLAG_NOQUOTA 15 /* don't support QUOTA requests */ +#define NFS_MFLAG_MNTUDP 16 /* MOUNT protocol should use UDP */ +#define NFS_MFLAG_MNTQUICK 17 /* use short timeouts while mounting */ + +/* NFS advisory file locking modes */ +#define NFS_LOCK_MODE_ENABLED 0 /* advisory file locking enabled */ +#define NFS_LOCK_MODE_DISABLED 1 /* do not support advisory file locking */ +#define NFS_LOCK_MODE_LOCAL 2 /* perform advisory file locking locally */ + /* - * Arguments to mount NFS + * Old-style arguments to mount NFS */ #define NFS_ARGSVERSION 6 /* change when nfs_args changes */ struct nfs_args { @@ -197,115 +254,11 @@ struct nfs_args { /* NFS_ARGSVERSION 5 ends here */ uint32_t deadtimeout; /* secs until unresponsive mount considered dead */ }; -struct nfs_args5 { - int version; /* args structure version number */ -#ifdef KERNEL - user32_addr_t addr; /* file server address */ -#else - struct sockaddr *addr; /* file server address */ -#endif - int addrlen; /* length of address */ - int sotype; /* Socket type */ - int proto; /* and Protocol */ -#ifdef KERNEL - user32_addr_t fh; /* File handle to be mounted */ -#else - u_char *fh; /* File handle to be mounted */ -#endif - int fhsize; /* Size, in bytes, of fh */ - int flags; /* flags */ - int wsize; /* write size in bytes */ - int rsize; /* read size in bytes */ - int readdirsize; /* readdir size in bytes */ - int timeo; /* initial timeout in .1 secs */ - int retrans; /* times to retry send */ - int maxgrouplist; /* Max. 
size of group list */ - int readahead; /* # of blocks to readahead */ - int leaseterm; /* obsolete: Term (sec) of lease */ - int deadthresh; /* obsolete: Retrans threshold */ -#ifdef KERNEL - user32_addr_t hostname; /* server's name */ -#else - char *hostname; /* server's name */ -#endif - /* NFS_ARGSVERSION 3 ends here */ - int acregmin; /* reg file min attr cache timeout */ - int acregmax; /* reg file max attr cache timeout */ - int acdirmin; /* dir min attr cache timeout */ - int acdirmax; /* dir max attr cache timeout */ - /* NFS_ARGSVERSION 4 ends here */ - uint32_t auth; /* security mechanism flavor */ -}; -struct nfs_args4 { - int version; /* args structure version number */ -#ifdef KERNEL - user32_addr_t addr; /* file server address */ -#else - struct sockaddr *addr; /* file server address */ -#endif - int addrlen; /* length of address */ - int sotype; /* Socket type */ - int proto; /* and Protocol */ -#ifdef KERNEL - user32_addr_t fh; /* File handle to be mounted */ -#else - u_char *fh; /* File handle to be mounted */ -#endif - int fhsize; /* Size, in bytes, of fh */ - int flags; /* flags */ - int wsize; /* write size in bytes */ - int rsize; /* read size in bytes */ - int readdirsize; /* readdir size in bytes */ - int timeo; /* initial timeout in .1 secs */ - int retrans; /* times to retry send */ - int maxgrouplist; /* Max. size of group list */ - int readahead; /* # of blocks to readahead */ - int leaseterm; /* obsolete: Term (sec) of lease */ - int deadthresh; /* obsolete: Retrans threshold */ -#ifdef KERNEL - user32_addr_t hostname; /* server's name */ -#else - char *hostname; /* server's name */ -#endif - /* NFS_ARGSVERSION 3 ends here */ - int acregmin; /* reg file min attr cache timeout */ - int acregmax; /* reg file max attr cache timeout */ - int acdirmin; /* dir min attr cache timeout */ - int acdirmax; /* dir max attr cache timeout */ -}; -struct nfs_args3 { - int version; /* args structure version number */ -#ifdef KERNEL - user32_addr_t addr; /* file server address */ -#else - struct sockaddr *addr; /* file server address */ -#endif - int addrlen; /* length of address */ - int sotype; /* Socket type */ - int proto; /* and Protocol */ -#ifdef KERNEL - user32_addr_t fh; /* File handle to be mounted */ -#else - u_char *fh; /* File handle to be mounted */ -#endif - int fhsize; /* Size, in bytes, of fh */ - int flags; /* flags */ - int wsize; /* write size in bytes */ - int rsize; /* read size in bytes */ - int readdirsize; /* readdir size in bytes */ - int timeo; /* initial timeout in .1 secs */ - int retrans; /* times to retry send */ - int maxgrouplist; /* Max. size of group list */ - int readahead; /* # of blocks to readahead */ - int leaseterm; /* obsolete: Term (sec) of lease */ - int deadthresh; /* obsolete: Retrans threshold */ -#ifdef KERNEL - user32_addr_t hostname; /* server's name */ -#else - char *hostname; /* server's name */ -#endif -}; +/* incremental size additions in each version of nfs_args */ +#define NFS_ARGSVERSION4_INCSIZE (4 * sizeof(int)) +#define NFS_ARGSVERSION5_INCSIZE (sizeof(uint32_t)) +#define NFS_ARGSVERSION6_INCSIZE (sizeof(uint32_t)) #ifdef KERNEL /* LP64 version of nfs_args. 
all pointers and longs @@ -341,83 +294,10 @@ struct user_nfs_args { /* NFS_ARGSVERSION 5 ends here */ uint32_t deadtimeout; /* secs until unresponsive mount considered dead */ }; -struct user_nfs_args5 { - int version; /* args structure version number */ - user_addr_t addr __attribute((aligned(8))); /* file server address */ - int addrlen; /* length of address */ - int sotype; /* Socket type */ - int proto; /* and Protocol */ - user_addr_t fh __attribute((aligned(8))); /* File handle to be mounted */ - int fhsize; /* Size, in bytes, of fh */ - int flags; /* flags */ - int wsize; /* write size in bytes */ - int rsize; /* read size in bytes */ - int readdirsize; /* readdir size in bytes */ - int timeo; /* initial timeout in .1 secs */ - int retrans; /* times to retry send */ - int maxgrouplist; /* Max. size of group list */ - int readahead; /* # of blocks to readahead */ - int leaseterm; /* obsolete: Term (sec) of lease */ - int deadthresh; /* obsolete: Retrans threshold */ - user_addr_t hostname __attribute((aligned(8))); /* server's name */ - /* NFS_ARGSVERSION 3 ends here */ - int acregmin; /* reg file min attr cache timeout */ - int acregmax; /* reg file max attr cache timeout */ - int acdirmin; /* dir min attr cache timeout */ - int acdirmax; /* dir max attr cache timeout */ - /* NFS_ARGSVERSION 4 ends here */ - uint32_t auth; /* security mechanism flavor */ -}; -struct user_nfs_args4 { - int version; /* args structure version number */ - user_addr_t addr __attribute((aligned(8))); /* file server address */ - int addrlen; /* length of address */ - int sotype; /* Socket type */ - int proto; /* and Protocol */ - user_addr_t fh __attribute((aligned(8))); /* File handle to be mounted */ - int fhsize; /* Size, in bytes, of fh */ - int flags; /* flags */ - int wsize; /* write size in bytes */ - int rsize; /* read size in bytes */ - int readdirsize; /* readdir size in bytes */ - int timeo; /* initial timeout in .1 secs */ - int retrans; /* times to retry send */ - int maxgrouplist; /* Max. size of group list */ - int readahead; /* # of blocks to readahead */ - int leaseterm; /* obsolete: Term (sec) of lease */ - int deadthresh; /* obsolete: Retrans threshold */ - user_addr_t hostname __attribute((aligned(8))); /* server's name */ - /* NFS_ARGSVERSION 3 ends here */ - int acregmin; /* reg file min attr cache timeout */ - int acregmax; /* reg file max attr cache timeout */ - int acdirmin; /* dir min attr cache timeout */ - int acdirmax; /* dir max attr cache timeout */ -}; -struct user_nfs_args3 { - int version; /* args structure version number */ - user_addr_t addr __attribute((aligned(8))); /* file server address */ - int addrlen; /* length of address */ - int sotype; /* Socket type */ - int proto; /* and Protocol */ - user_addr_t fh __attribute((aligned(8))); /* File handle to be mounted */ - int fhsize; /* Size, in bytes, of fh */ - int flags; /* flags */ - int wsize; /* write size in bytes */ - int rsize; /* read size in bytes */ - int readdirsize; /* readdir size in bytes */ - int timeo; /* initial timeout in .1 secs */ - int retrans; /* times to retry send */ - int maxgrouplist; /* Max. 
size of group list */ - int readahead; /* # of blocks to readahead */ - int leaseterm; /* obsolete: Term (sec) of lease */ - int deadthresh; /* obsolete: Retrans threshold */ - user_addr_t hostname __attribute((aligned(8))); /* server's name */ -}; - #endif // KERNEL /* - * NFS mount option flags + * Old-style NFS mount option flags */ #define NFSMNT_SOFT 0x00000001 /* soft mount (hard is default) */ #define NFSMNT_WSIZE 0x00000002 /* set write size */ @@ -446,6 +326,27 @@ struct user_nfs_args3 { #define NFSMNT_SECFLAVOR 0x01000000 /* Use security flavor */ #define NFSMNT_SECSYSOK 0x02000000 /* Server can support auth sys */ #define NFSMNT_MUTEJUKEBOX 0x04000000 /* don't treat jukebox errors as unresponsive */ +#define NFSMNT_NOQUOTA 0x08000000 /* don't support QUOTA requests */ + + +/* + * fs.nfs sysctl(3) NFS_MOUNTINFO defines + */ +#define NFS_MOUNT_INFO_VERSION 0 /* nfsstat mount information version */ +#define NFS_MIATTR_BITMAP_LEN 1 /* length of mount info attributes bitmap */ +#define NFS_MIFLAG_BITMAP_LEN 1 /* length of mount info flags bitmap */ + +/* NFS mount info attributes */ +#define NFS_MIATTR_FLAGS 0 /* mount info flags bitmap (MIFLAG_*) */ +#define NFS_MIATTR_ORIG_ARGS 1 /* original mount args passed into mount call */ +#define NFS_MIATTR_CUR_ARGS 2 /* current mount args values */ +#define NFS_MIATTR_CUR_LOC_INDEX 3 /* current fs location index */ + +/* NFS mount info flags */ +#define NFS_MIFLAG_DEAD 0 /* mount is dead */ +#define NFS_MIFLAG_NOTRESP 1 /* server is unresponsive */ +#define NFS_MIFLAG_RECOVERY 2 /* mount in recovery */ + /* * Structures for the nfssvc(2) syscall. Not that anyone but nfsd @@ -831,6 +732,7 @@ struct nfsstats { * Flags for nfsclnt() system call. */ #define NFSCLNT_LOCKDANS 0x200 +#define NFSCLNT_LOCKDNOTIFY 0x400 /* * fs.nfs sysctl(3) identifiers @@ -839,6 +741,7 @@ struct nfsstats { #define NFS_EXPORTSTATS 3 /* gets exported directory stats */ #define NFS_USERSTATS 4 /* gets exported directory active user stats */ #define NFS_USERCOUNT 5 /* gets current count of active nfs users */ +#define NFS_MOUNTINFO 6 /* gets information about an NFS mount */ #ifndef NFS_WDELAYHASHSIZ #define NFS_WDELAYHASHSIZ 16 /* and with this */ @@ -882,6 +785,11 @@ struct nfs_open_file; struct nfs_lock_owner; struct nfs_file_lock; struct nfsreq; +struct nfs_rpc_record_state; +struct nfs_fs_locations; +struct nfs_location_index; +struct nfs_socket; +struct nfs_socket_search; /* * The set of signals the interrupt an I/O in progress for NFSMNT_INT mounts. @@ -926,6 +834,28 @@ struct nfsreq_cbinfo { uint32_t rcb_args[3]; /* additional callback args */ }; +/* + * Arguments to use if a request needs to call SECINFO to handle a WRONGSEC error + * + * If only node is set, use the parent file handle and this node's name; otherwise, + * use any file handle and name provided. 
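+ *
+ * Illustrative use (with hypothetical locals "si" and "np"):
+ *
+ *     NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0);
+ *
+ * sets only the node, so a subsequent SECINFO would be done using
+ * np's parent file handle and np's own name.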
+ */ +struct nfsreq_secinfo_args { + nfsnode_t rsia_np; /* the node */ + const char *rsia_name; /* alternate name string */ + u_char *rsia_fh; /* alternate file handle */ + uint32_t rsia_namelen; /* length of string */ + uint32_t rsia_fhsize; /* length of fh */ +}; +#define NFSREQ_SECINFO_SET(SI, NP, FH, FHSIZE, NAME, NAMELEN) \ + do { \ + (SI)->rsia_np = (NP); \ + (SI)->rsia_fh = (FH); \ + (SI)->rsia_fhsize = (FHSIZE); \ + (SI)->rsia_name = (NAME); \ + (SI)->rsia_namelen = (NAMELEN); \ + } while (0) + /* * NFS outstanding request list element */ @@ -959,8 +889,11 @@ struct nfsreq { SLIST_HEAD(, gss_seq) r_gss_seqlist; /* RPCSEC_GSS sequence numbers */ uint32_t r_gss_argoff; /* RPCSEC_GSS offset to args */ uint32_t r_gss_arglen; /* RPCSEC_GSS arg length */ + uint32_t r_auth; /* security flavor request sent with */ + uint32_t *r_wrongsec; /* wrongsec: other flavors to try */ int r_error; /* request error */ struct nfsreq_cbinfo r_callback; /* callback info */ + struct nfsreq_secinfo_args r_secinfo; /* secinfo args */ }; /* @@ -992,9 +925,10 @@ __private_extern__ lck_grp_t *nfs_request_grp; #define R_RESENDQ 0x00004000 /* async request currently on resendq */ #define R_SENDING 0x00008000 /* request currently being sent */ +#define R_NOINTR 0x20000000 /* request should not be interrupted by a signal */ #define R_RECOVER 0x40000000 /* a state recovery RPC - during NFSSTA_RECOVER */ #define R_SETUP 0x80000000 /* a setup RPC - during (re)connection */ -#define R_OPTMASK 0xc0000000 /* mask of all RPC option flags */ +#define R_OPTMASK 0xe0000000 /* mask of all RPC option flags */ /* Flag values for r_lflags */ #define RL_BUSY 0x0001 /* Locked. */ @@ -1002,10 +936,21 @@ __private_extern__ lck_grp_t *nfs_request_grp; #define RL_QUEUED 0x0004 /* request is on the queue */ __private_extern__ u_int32_t nfs_xid, nfs_xidwrap; -__private_extern__ int nfs_iosize, nfs_access_cache_timeout, nfs_access_delete, nfs_allow_async, nfs_statfs_rate_limit; +__private_extern__ int nfs_iosize, nfs_allow_async, nfs_statfs_rate_limit; +__private_extern__ int nfs_access_cache_timeout, nfs_access_delete, nfs_access_dotzfs, nfs_access_for_getattr; __private_extern__ int nfs_lockd_mounts, nfs_lockd_request_sent, nfs_single_des; __private_extern__ int nfs_tprintf_initial_delay, nfs_tprintf_delay; __private_extern__ int nfsiod_thread_count, nfsiod_thread_max, nfs_max_async_writes; +__private_extern__ int nfs_idmap_ctrl, nfs_callback_port; + +/* bits for nfs_idmap_ctrl: */ +#define NFS_IDMAP_CTRL_USE_IDMAP_SERVICE 0x00000001 /* use the ID mapping service */ +#define NFS_IDMAP_CTRL_FALLBACK_NO_COMMON_IDS 0x00000002 /* fallback should NOT handle common IDs like "root" and "nobody" */ +#define NFS_IDMAP_CTRL_FALLBACK_NO_WELLKNOWN_IDS 0x00000004 /* fallback should NOT handle the well known "XXX@" IDs */ +#define NFS_IDMAP_CTRL_UNKNOWN_IS_99 0x00000008 /* for unknown IDs use uid/gid 99 instead of -2/nobody */ +#define NFS_IDMAP_CTRL_COMPARE_RESULTS 0x00000010 /* compare results of ID mapping service and fallback */ +#define NFS_IDMAP_CTRL_LOG_FAILED_MAPPINGS 0x00000020 /* log failed ID mapping attempts */ +#define NFS_IDMAP_CTRL_LOG_SUCCESSFUL_MAPPINGS 0x00000040 /* log successful ID mapping attempts */ #define NFSIOD_MAX (MIN(nfsiod_thread_max, NFS_MAXASYNCTHREAD)) @@ -1018,14 +963,6 @@ struct nfs_dulookup { char du_smallname[48]; /* buffer for small names */ }; -/* - * Network address hash list element - */ -union nethostaddr { - u_int32_t had_inetaddr; - mbuf_t had_nam; -}; - /* * One nfsrv_sock structure is maintained for each
socket the * server is servicing requests on. @@ -1071,7 +1008,7 @@ struct nfsrv_sock { #define SLPNOLIST ((struct nfsrv_sock *)0xdeadbeef) /* sentinel value for sockets not in the nfsrv_sockwg list */ -__private_extern__ struct nfsrv_sock *nfsrv_udpsock; +__private_extern__ struct nfsrv_sock *nfsrv_udpsock, *nfsrv_udp6sock; /* * global NFS server socket lists: @@ -1148,7 +1085,7 @@ __private_extern__ lck_mtx_t *nfs_global_mutex; /* NFSv4 callback globals */ __private_extern__ int nfs4_callback_timer_on; -__private_extern__ in_port_t nfs4_cb_port; +__private_extern__ in_port_t nfs4_cb_port, nfs4_cb_port6; /* nfs timer call structures */ __private_extern__ thread_call_t nfs_request_timer_call; @@ -1180,15 +1117,23 @@ void nfs4_mount_callback_shutdown(struct nfsmount *); void nfs4_cb_accept(socket_t, void *, int); void nfs4_cb_rcv(socket_t, void *, int); void nfs4_callback_timer(void *, void *); +int nfs4_secinfo_rpc(struct nfsmount *, struct nfsreq_secinfo_args *, kauth_cred_t, uint32_t *, int *); +int nfs4_get_fs_locations(struct nfsmount *, nfsnode_t, u_char *, int, const char *, vfs_context_t, struct nfs_fs_locations *); +void nfs_fs_locations_cleanup(struct nfs_fs_locations *); +void nfs4_default_attrs_for_referral_trigger(nfsnode_t, char *, int, struct nfs_vattr *, fhandle_t *); -int nfs_connect(struct nfsmount *, int); +int nfs_sockaddr_cmp(struct sockaddr *, struct sockaddr *); +int nfs_connect(struct nfsmount *, int, int); void nfs_disconnect(struct nfsmount *); void nfs_need_reconnect(struct nfsmount *); void nfs_mount_sock_thread_wake(struct nfsmount *); void nfs_mount_check_dead_timeout(struct nfsmount *); +void nfs_rpc_record_state_init(struct nfs_rpc_record_state *); +void nfs_rpc_record_state_cleanup(struct nfs_rpc_record_state *); +int nfs_rpc_record_read(socket_t, struct nfs_rpc_record_state *, int, int *, mbuf_t *); int nfs_getattr(nfsnode_t, struct nfs_vattr *, vfs_context_t, int); -int nfs_getattrcache(nfsnode_t, struct nfs_vattr *); +int nfs_getattrcache(nfsnode_t, struct nfs_vattr *, int); int nfs_loadattrcache(nfsnode_t, struct nfs_vattr *, u_int64_t *, int); int nfs_attrcachetimeout(nfsnode_t); @@ -1196,6 +1141,7 @@ int nfs_buf_page_inval(vnode_t vp, off_t offset); int nfs_vinvalbuf(vnode_t, int, vfs_context_t, int); int nfs_vinvalbuf2(vnode_t, int, thread_t, kauth_cred_t, int); int nfs_vinvalbuf_internal(nfsnode_t, int, thread_t, kauth_cred_t, int, int); +void nfs_wait_bufs(nfsnode_t); int nfs_request_create(nfsnode_t, mount_t, struct nfsm_chain *, int, thread_t, kauth_cred_t, struct nfsreq **); void nfs_request_destroy(struct nfsreq *); @@ -1205,14 +1151,14 @@ int nfs_request_add_header(struct nfsreq *); int nfs_request_send(struct nfsreq *, int); void nfs_request_wait(struct nfsreq *); int nfs_request_finish(struct nfsreq *, struct nfsm_chain *, int *); -int nfs_request(nfsnode_t, mount_t, struct nfsm_chain *, int, vfs_context_t, struct nfsm_chain *, u_int64_t *, int *); -int nfs_request2(nfsnode_t, mount_t, struct nfsm_chain *, int, thread_t, kauth_cred_t, int, struct nfsm_chain *, u_int64_t *, int *); +int nfs_request(nfsnode_t, mount_t, struct nfsm_chain *, int, vfs_context_t, struct nfsreq_secinfo_args *, struct nfsm_chain *, u_int64_t *, int *); +int nfs_request2(nfsnode_t, mount_t, struct nfsm_chain *, int, thread_t, kauth_cred_t, struct nfsreq_secinfo_args *, int, struct nfsm_chain *, u_int64_t *, int *); int nfs_request_gss(mount_t, struct nfsm_chain *, thread_t, kauth_cred_t, int, struct nfs_gss_clnt_ctx *, struct nfsm_chain *, int *); -int 
nfs_request_async(nfsnode_t, mount_t, struct nfsm_chain *, int, thread_t, kauth_cred_t, struct nfsreq_cbinfo *cb, struct nfsreq **); +int nfs_request_async(nfsnode_t, mount_t, struct nfsm_chain *, int, thread_t, kauth_cred_t, struct nfsreq_secinfo_args *, int, struct nfsreq_cbinfo *, struct nfsreq **); int nfs_request_async_finish(struct nfsreq *, struct nfsm_chain *, u_int64_t *, int *); void nfs_request_async_cancel(struct nfsreq *); void nfs_request_timer(void *, void *); -int nfs_aux_request(struct nfsmount *, thread_t, struct sockaddr_in *, mbuf_t, uint32_t, int, int, struct nfsm_chain *); +int nfs_request_using_gss(struct nfsreq *); void nfs_get_xid(uint64_t *); int nfs_sigintr(struct nfsmount *, struct nfsreq *, thread_t, int); int nfs_noremotehang(thread_t); @@ -1221,6 +1167,24 @@ int nfs_send(struct nfsreq *, int); int nfs_sndlock(struct nfsreq *); void nfs_sndunlock(struct nfsreq *); +int nfs_uaddr2sockaddr(const char *, struct sockaddr *); + +int nfs_aux_request(struct nfsmount *, thread_t, struct sockaddr *, socket_t, int, mbuf_t, uint32_t, int, int, struct nfsm_chain *); +int nfs_portmap_lookup(struct nfsmount *, vfs_context_t, struct sockaddr *, socket_t, uint32_t, uint32_t, uint32_t, int); + +void nfs_location_next(struct nfs_fs_locations *, struct nfs_location_index *); +int nfs_location_index_cmp(struct nfs_location_index *, struct nfs_location_index *); +void nfs_location_mntfromname(struct nfs_fs_locations *, struct nfs_location_index, char *, int, int); +int nfs_socket_create(struct nfsmount *, struct sockaddr *, int, in_port_t, uint32_t, uint32_t, int, struct nfs_socket **); +void nfs_socket_destroy(struct nfs_socket *); +void nfs_socket_options(struct nfsmount *, struct nfs_socket *); +void nfs_connect_upcall(socket_t, void *, int); +int nfs_connect_error_class(int); +int nfs_connect_search_loop(struct nfsmount *, struct nfs_socket_search *); +void nfs_socket_search_update_error(struct nfs_socket_search *, int); +void nfs_socket_search_cleanup(struct nfs_socket_search *); +void nfs_mount_connect_thread(void *, __unused wait_result_t); + int nfs_lookitup(nfsnode_t, char *, int, vfs_context_t, nfsnode_t *); void nfs_dulookup_init(struct nfs_dulookup *, nfsnode_t, const char *, int, vfs_context_t); void nfs_dulookup_start(struct nfs_dulookup *, nfsnode_t, vfs_context_t); @@ -1229,16 +1193,28 @@ int nfs_dir_buf_cache_lookup(nfsnode_t, nfsnode_t *, struct componentname *, vfs int nfs_dir_buf_search(struct nfsbuf *, struct componentname *, fhandle_t *, struct nfs_vattr *, uint64_t *, time_t *, daddr64_t *, int); void nfs_name_cache_purge(nfsnode_t, nfsnode_t, struct componentname *, vfs_context_t); +uint32_t nfs4_ace_nfstype_to_vfstype(uint32_t, int *); +uint32_t nfs4_ace_vfstype_to_nfstype(uint32_t, int *); +uint32_t nfs4_ace_nfsflags_to_vfsflags(uint32_t); +uint32_t nfs4_ace_vfsflags_to_nfsflags(uint32_t); +uint32_t nfs4_ace_nfsmask_to_vfsrights(uint32_t); +uint32_t nfs4_ace_vfsrights_to_nfsmask(uint32_t); +int nfs4_id2guid(char *, guid_t *, int); +int nfs4_guid2id(guid_t *, char *, int *, int); + int nfs_parsefattr(struct nfsm_chain *, int, struct nfs_vattr *); -int nfs4_parsefattr(struct nfsm_chain *, struct nfs_fsattr *, struct nfs_vattr *, fhandle_t *, struct dqblk *); +int nfs4_parsefattr(struct nfsm_chain *, struct nfs_fsattr *, struct nfs_vattr *, fhandle_t *, struct dqblk *, struct nfs_fs_locations *); void nfs_vattr_set_supported(uint32_t *, struct vnode_attr *); +void nfs_vattr_set_bitmap(struct nfsmount *, uint32_t *, struct vnode_attr *); void 
nfs3_pathconf_cache(struct nfsmount *, struct nfs_fsattr *); +int nfs3_mount_rpc(struct nfsmount *, struct sockaddr *, int, int, char *, vfs_context_t, int, fhandle_t *, struct nfs_sec *); void nfs3_umount_rpc(struct nfsmount *, vfs_context_t, int); -int nfs_node_mode_slot(nfsnode_t, uid_t, int); +int nfs_node_access_slot(nfsnode_t, uid_t, int); +void nfs_vnode_notify(nfsnode_t, uint32_t); void nfs_avoid_needless_id_setting_on_create(nfsnode_t, struct vnode_attr *, vfs_context_t); int nfs4_create_rpc(vfs_context_t, nfsnode_t, struct componentname *, struct vnode_attr *, int, char *, nfsnode_t *); -int nfs_open_state_set_busy(nfsnode_t, vfs_context_t); +int nfs_open_state_set_busy(nfsnode_t, thread_t); void nfs_open_state_clear_busy(nfsnode_t); struct nfs_open_owner *nfs_open_owner_find(struct nfsmount *, kauth_cred_t, int); void nfs_open_owner_destroy(struct nfs_open_owner *); @@ -1248,21 +1224,34 @@ int nfs_open_owner_set_busy(struct nfs_open_owner *, thread_t); void nfs_open_owner_clear_busy(struct nfs_open_owner *); void nfs_owner_seqid_increment(struct nfs_open_owner *, struct nfs_lock_owner *, int); int nfs_open_file_find(nfsnode_t, struct nfs_open_owner *, struct nfs_open_file **, uint32_t, uint32_t, int); +int nfs_open_file_find_internal(nfsnode_t, struct nfs_open_owner *, struct nfs_open_file **, uint32_t, uint32_t, int); void nfs_open_file_destroy(struct nfs_open_file *); int nfs_open_file_set_busy(struct nfs_open_file *, thread_t); void nfs_open_file_clear_busy(struct nfs_open_file *); +void nfs_open_file_add_open(struct nfs_open_file *, uint32_t, uint32_t, int); +void nfs_open_file_remove_open_find(struct nfs_open_file *, uint32_t, uint32_t, uint32_t *, uint32_t *, int*); +void nfs_open_file_remove_open(struct nfs_open_file *, uint32_t, uint32_t); void nfs_get_stateid(nfsnode_t, thread_t, kauth_cred_t, nfs_stateid *); int nfs4_open(nfsnode_t, struct nfs_open_file *, uint32_t, uint32_t, vfs_context_t); -int nfs4_close(nfsnode_t, struct nfs_open_file *, uint32_t, uint32_t, vfs_context_t); -int nfs4_check_for_locks(struct nfs_open_owner *, struct nfs_open_file *); -void nfs4_reopen(struct nfs_open_file *, thread_t); +int nfs4_open_delegated(nfsnode_t, struct nfs_open_file *, uint32_t, uint32_t, vfs_context_t); +int nfs_close(nfsnode_t, struct nfs_open_file *, uint32_t, uint32_t, vfs_context_t); +int nfs_check_for_locks(struct nfs_open_owner *, struct nfs_open_file *); +int nfs4_reopen(struct nfs_open_file *, thread_t); int nfs4_open_rpc(struct nfs_open_file *, vfs_context_t, struct componentname *, struct vnode_attr *, vnode_t, vnode_t *, int, int, int); int nfs4_open_rpc_internal(struct nfs_open_file *, vfs_context_t, thread_t, kauth_cred_t, struct componentname *, struct vnode_attr *, vnode_t, vnode_t *, int, int, int); +int nfs4_open_confirm_rpc(struct nfsmount *, nfsnode_t, u_char *, int, struct nfs_open_owner *, nfs_stateid *, thread_t, kauth_cred_t, struct nfs_vattr *, uint64_t *); int nfs4_open_reopen_rpc(struct nfs_open_file *, thread_t, kauth_cred_t, struct componentname *, vnode_t, vnode_t *, int, int); int nfs4_open_reclaim_rpc(struct nfs_open_file *, int, int); +int nfs4_claim_delegated_open_rpc(struct nfs_open_file *, int, int, int); +int nfs4_claim_delegated_state_for_open_file(struct nfs_open_file *, int); +int nfs4_claim_delegated_state_for_node(nfsnode_t, int); int nfs4_open_downgrade_rpc(nfsnode_t, struct nfs_open_file *, vfs_context_t); int nfs4_close_rpc(nfsnode_t, struct nfs_open_file *, thread_t, kauth_cred_t, int); -int nfs4_delegreturn_rpc(struct nfsmount *, 
u_char *, int, struct nfs_stateid *, thread_t, kauth_cred_t); +void nfs4_delegation_return_enqueue(nfsnode_t); +int nfs4_delegation_return(nfsnode_t, int, thread_t, kauth_cred_t); +int nfs4_delegreturn_rpc(struct nfsmount *, u_char *, int, struct nfs_stateid *, int, thread_t, kauth_cred_t); +void nfs_release_open_state_for_node(nfsnode_t, int); +void nfs_revoke_open_state_for_node(nfsnode_t); struct nfs_lock_owner *nfs_lock_owner_find(nfsnode_t, proc_t, int); void nfs_lock_owner_destroy(struct nfs_lock_owner *); void nfs_lock_owner_ref(struct nfs_lock_owner *); @@ -1273,37 +1262,52 @@ void nfs_lock_owner_insert_held_lock(struct nfs_lock_owner *, struct nfs_file_lo struct nfs_file_lock *nfs_file_lock_alloc(struct nfs_lock_owner *); void nfs_file_lock_destroy(struct nfs_file_lock *); int nfs_file_lock_conflict(struct nfs_file_lock *, struct nfs_file_lock *, int *); -int nfs4_lock_rpc(nfsnode_t, struct nfs_open_file *, struct nfs_file_lock *, int, thread_t, kauth_cred_t); -int nfs4_unlock_rpc(nfsnode_t, struct nfs_lock_owner *, int, uint64_t, uint64_t, vfs_context_t); -int nfs4_getlock(nfsnode_t, struct nfs_lock_owner *, struct flock *, uint64_t, uint64_t, vfs_context_t); -int nfs4_setlock(nfsnode_t, struct nfs_open_file *, struct nfs_lock_owner *, int, uint64_t, uint64_t, int, short, vfs_context_t); -int nfs4_unlock(nfsnode_t, struct nfs_open_file *, struct nfs_lock_owner *, uint64_t, uint64_t, int, vfs_context_t); +int nfs4_lock_rpc(nfsnode_t, struct nfs_open_file *, struct nfs_file_lock *, int, int, thread_t, kauth_cred_t); +int nfs_unlock_rpc(nfsnode_t, struct nfs_lock_owner *, int, uint64_t, uint64_t, thread_t, kauth_cred_t, int); +int nfs_advlock_getlock(nfsnode_t, struct nfs_lock_owner *, struct flock *, uint64_t, uint64_t, vfs_context_t); +int nfs_advlock_setlock(nfsnode_t, struct nfs_open_file *, struct nfs_lock_owner *, int, uint64_t, uint64_t, int, short, vfs_context_t); +int nfs_advlock_unlock(nfsnode_t, struct nfs_open_file *, struct nfs_lock_owner *, uint64_t, uint64_t, int, vfs_context_t); + +nfsnode_t nfs4_named_attr_dir_get(nfsnode_t, int, vfs_context_t); +int nfs4_named_attr_get(nfsnode_t, struct componentname *, uint32_t, int, vfs_context_t, nfsnode_t *, struct nfs_open_file **); +int nfs4_named_attr_remove(nfsnode_t, nfsnode_t, const char *, vfs_context_t); -int nfs_mount_state_in_use_start(struct nfsmount *); +int nfs_mount_state_in_use_start(struct nfsmount *, thread_t); int nfs_mount_state_in_use_end(struct nfsmount *, int); int nfs_mount_state_error_should_restart(int); +int nfs_mount_state_error_delegation_lost(int); uint nfs_mount_state_max_restarts(struct nfsmount *); int nfs_mount_state_wait_for_recovery(struct nfsmount *); -void nfs4_recover(struct nfsmount *); +void nfs_need_recover(struct nfsmount *nmp, int error); +void nfs_recover(struct nfsmount *); int nfs_vnop_access(struct vnop_access_args *); - -int nfs3_vnop_open(struct vnop_open_args *); -int nfs3_vnop_close(struct vnop_close_args *); +int nfs_vnop_remove(struct vnop_remove_args *); +int nfs_vnop_read(struct vnop_read_args *); +int nfs_vnop_write(struct vnop_write_args *); +int nfs_vnop_open(struct vnop_open_args *); +int nfs_vnop_close(struct vnop_close_args *); +int nfs_vnop_advlock(struct vnop_advlock_args *); +int nfs_vnop_mmap(struct vnop_mmap_args *); +int nfs_vnop_mnomap(struct vnop_mnomap_args *); int nfs4_vnop_create(struct vnop_create_args *); int nfs4_vnop_mknod(struct vnop_mknod_args *); -int nfs4_vnop_open(struct vnop_open_args *); int nfs4_vnop_close(struct vnop_close_args *); -int 
nfs4_vnop_mmap(struct vnop_mmap_args *); -int nfs4_vnop_mnomap(struct vnop_mnomap_args *); int nfs4_vnop_getattr(struct vnop_getattr_args *); -int nfs4_vnop_read(struct vnop_read_args *); int nfs4_vnop_link(struct vnop_link_args *); int nfs4_vnop_mkdir(struct vnop_mkdir_args *); int nfs4_vnop_rmdir(struct vnop_rmdir_args *); int nfs4_vnop_symlink(struct vnop_symlink_args *); -int nfs4_vnop_advlock(struct vnop_advlock_args *ap); +int nfs4_vnop_getxattr(struct vnop_getxattr_args *); +int nfs4_vnop_setxattr(struct vnop_setxattr_args *); +int nfs4_vnop_removexattr(struct vnop_removexattr_args *); +int nfs4_vnop_listxattr(struct vnop_listxattr_args *); +#if NAMEDSTREAMS +int nfs4_vnop_getnamedstream(struct vnop_getnamedstream_args *); +int nfs4_vnop_makenamedstream(struct vnop_makenamedstream_args *); +int nfs4_vnop_removenamedstream(struct vnop_removenamedstream_args *); +#endif int nfs_read_rpc(nfsnode_t, uio_t, vfs_context_t); int nfs_write_rpc(nfsnode_t, uio_t, vfs_context_t, int *, uint64_t *); @@ -1311,8 +1315,8 @@ int nfs_write_rpc2(nfsnode_t, uio_t, thread_t, kauth_cred_t, int *, uint64_t *); int nfs3_access_rpc(nfsnode_t, u_int32_t *, vfs_context_t); int nfs4_access_rpc(nfsnode_t, u_int32_t *, vfs_context_t); -int nfs3_getattr_rpc(nfsnode_t, mount_t, u_char *, size_t, vfs_context_t, struct nfs_vattr *, u_int64_t *); -int nfs4_getattr_rpc(nfsnode_t, mount_t, u_char *, size_t, vfs_context_t, struct nfs_vattr *, u_int64_t *); +int nfs3_getattr_rpc(nfsnode_t, mount_t, u_char *, size_t, int, vfs_context_t, struct nfs_vattr *, u_int64_t *); +int nfs4_getattr_rpc(nfsnode_t, mount_t, u_char *, size_t, int, vfs_context_t, struct nfs_vattr *, u_int64_t *); int nfs3_setattr_rpc(nfsnode_t, struct vnode_attr *, vfs_context_t); int nfs4_setattr_rpc(nfsnode_t, struct vnode_attr *, vfs_context_t); int nfs3_read_rpc_async(nfsnode_t, off_t, size_t, thread_t, kauth_cred_t, struct nfsreq_cbinfo *, struct nfsreq **); @@ -1327,18 +1331,24 @@ int nfs3_readdir_rpc(nfsnode_t, struct nfsbuf *, vfs_context_t); int nfs4_readdir_rpc(nfsnode_t, struct nfsbuf *, vfs_context_t); int nfs3_readlink_rpc(nfsnode_t, char *, uint32_t *, vfs_context_t); int nfs4_readlink_rpc(nfsnode_t, char *, uint32_t *, vfs_context_t); -int nfs3_commit_rpc(nfsnode_t, u_int64_t, u_int64_t, kauth_cred_t); -int nfs4_commit_rpc(nfsnode_t, u_int64_t, u_int64_t, kauth_cred_t); +int nfs3_commit_rpc(nfsnode_t, uint64_t, uint64_t, kauth_cred_t, uint64_t); +int nfs4_commit_rpc(nfsnode_t, uint64_t, uint64_t, kauth_cred_t, uint64_t); int nfs3_lookup_rpc_async(nfsnode_t, char *, int, vfs_context_t, struct nfsreq **); int nfs4_lookup_rpc_async(nfsnode_t, char *, int, vfs_context_t, struct nfsreq **); -int nfs3_lookup_rpc_async_finish(nfsnode_t, vfs_context_t, struct nfsreq *, u_int64_t *, fhandle_t *, struct nfs_vattr *); -int nfs4_lookup_rpc_async_finish(nfsnode_t, vfs_context_t, struct nfsreq *, u_int64_t *, fhandle_t *, struct nfs_vattr *); +int nfs3_lookup_rpc_async_finish(nfsnode_t, char *, int, vfs_context_t, struct nfsreq *, u_int64_t *, fhandle_t *, struct nfs_vattr *); +int nfs4_lookup_rpc_async_finish(nfsnode_t, char *, int, vfs_context_t, struct nfsreq *, u_int64_t *, fhandle_t *, struct nfs_vattr *); int nfs3_remove_rpc(nfsnode_t, char *, int, thread_t, kauth_cred_t); int nfs4_remove_rpc(nfsnode_t, char *, int, thread_t, kauth_cred_t); int nfs3_rename_rpc(nfsnode_t, char *, int, nfsnode_t, char *, int, vfs_context_t); int nfs4_rename_rpc(nfsnode_t, char *, int, nfsnode_t, char *, int, vfs_context_t); int nfs3_pathconf_rpc(nfsnode_t, struct 
nfs_fsattr *, vfs_context_t); int nfs4_pathconf_rpc(nfsnode_t, struct nfs_fsattr *, vfs_context_t); +int nfs3_setlock_rpc(nfsnode_t, struct nfs_open_file *, struct nfs_file_lock *, int, int, thread_t, kauth_cred_t); +int nfs4_setlock_rpc(nfsnode_t, struct nfs_open_file *, struct nfs_file_lock *, int, int, thread_t, kauth_cred_t); +int nfs3_unlock_rpc(nfsnode_t, struct nfs_lock_owner *, int, uint64_t, uint64_t, int, thread_t, kauth_cred_t); +int nfs4_unlock_rpc(nfsnode_t, struct nfs_lock_owner *, int, uint64_t, uint64_t, int, thread_t, kauth_cred_t); +int nfs3_getlock_rpc(nfsnode_t, struct nfs_lock_owner *, struct flock *, uint64_t, uint64_t, vfs_context_t); +int nfs4_getlock_rpc(nfsnode_t, struct nfs_lock_owner *, struct flock *, uint64_t, uint64_t, vfs_context_t); void nfsrv_active_user_list_reclaim(void); void nfsrv_cleancache(void); @@ -1363,7 +1373,7 @@ int nfsrv_is_initialized(void); int nfsrv_namei(struct nfsrv_descript *, vfs_context_t, struct nameidata *, struct nfs_filehandle *, vnode_t *, struct nfs_export **, struct nfs_export_options **); -void nfsrv_rcv(socket_t, caddr_t, int); +void nfsrv_rcv(socket_t, void *, int); void nfsrv_rcv_locked(socket_t, struct nfsrv_sock *, int); int nfsrv_rephead(struct nfsrv_descript *, struct nfsrv_sock *, struct nfsm_chain *, size_t); int nfsrv_send(struct nfsrv_sock *, mbuf_t, mbuf_t); @@ -1410,6 +1420,15 @@ struct nfs_diskless; int nfs_boot_init(struct nfs_diskless *); int nfs_boot_getfh(struct nfs_diskless *, int, int); +#if CONFIG_TRIGGERS +resolver_result_t nfs_mirror_mount_trigger_resolve(vnode_t, const struct componentname *, enum path_operation, int, void *, vfs_context_t); +resolver_result_t nfs_mirror_mount_trigger_unresolve(vnode_t, int, void *, vfs_context_t); +resolver_result_t nfs_mirror_mount_trigger_rearm(vnode_t, int, void *, vfs_context_t); +int nfs_mirror_mount_domount(vnode_t, vnode_t, vfs_context_t); +void nfs_ephemeral_mount_harvester_start(void); +void nfs_ephemeral_mount_harvester(__unused void *arg, __unused wait_result_t wr); +#endif + __END_DECLS #endif /* KERNEL */ diff --git a/bsd/nfs/nfs4_subs.c b/bsd/nfs/nfs4_subs.c index 6b1786cda..3d65f7985 100644 --- a/bsd/nfs/nfs4_subs.c +++ b/bsd/nfs/nfs4_subs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006-2009 Apple Inc. All rights reserved. + * Copyright (c) 2006-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -87,7 +87,11 @@ * * In an attempt to differentiate mounts we include the mntfromname and mntonname * strings to the client ID (as long as they fit). We also make sure that the - * value does not conflict with any existing values in use. + * value does not conflict with any existing values in use (changing the unique ID). + * + * Note that info such as the server's address may change over the lifetime of the + * mount. But the client ID will not be updated because we don't want it changing + * simply because we switched to a different server address. 
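For illustration only, a minimal userland sketch of the long-form client ID construction described above. The helper name and exact field order are assumptions for clarity; nfs4_init_clientid() below builds the real ID into nci_id (a 32-bit uniquifier per the conflict-avoidance note, the en0 MAC address, the server sockaddr, and the two mount name strings) and bounds it by NFS4_OPAQUE_LIMIT.

	#include <stdint.h>
	#include <string.h>

	/* Pack <32-bit uniquifier><en0 MAC><server sockaddr><mntfrom>\0<mnton>\0. */
	static size_t
	pack_clientid(uint8_t *buf, size_t max, uint32_t uniq, const uint8_t mac[6],
	    const uint8_t *saddr, size_t salen, const char *from, const char *on)
	{
		size_t need = sizeof(uniq) + 6 + salen + strlen(from) + 1 + strlen(on) + 1;
		uint8_t *p = buf;

		if (need > max)				/* cf. the NFS4_OPAQUE_LIMIT check */
			return (0);
		memcpy(p, &uniq, sizeof(uniq));		p += sizeof(uniq);
		memcpy(p, mac, 6);			p += 6;
		memcpy(p, saddr, salen);		p += salen;
		memcpy(p, from, strlen(from) + 1);	p += strlen(from) + 1;
		memcpy(p, on, strlen(on) + 1);		p += strlen(on) + 1;
		return ((size_t)(p - buf));
	}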
*/ int nfs4_init_clientid(struct nfsmount *nmp) @@ -120,7 +124,7 @@ nfs4_init_clientid(struct nfsmount *nmp) return (ENOMEM); vsfs = vfs_statfs(nmp->nm_mountp); - saddr = mbuf_data(nmp->nm_nam); + saddr = nmp->nm_saddr; ncip->nci_idlen = sizeof(uint32_t) + sizeof(en0addr) + saddr->sa_len + strlen(vsfs->f_mntfromname) + 1 + strlen(vsfs->f_mntonname) + 1; if (ncip->nci_idlen > NFS4_OPAQUE_LIMIT) @@ -199,10 +203,12 @@ nfs4_setclientid(struct nfsmount *nmp) thread_t thd; kauth_cred_t cred; struct nfsm_chain nmreq, nmrep; - struct sockaddr_in sin; - uint8_t *addr; - char raddr[32]; - int ralen = 0; + struct sockaddr_storage ss; + void *sinaddr = NULL; + char raddr[MAX_IPv6_STR_LEN]; + char uaddr[MAX_IPv6_STR_LEN+16]; + int ualen = 0; + in_port_t port; thd = current_thread(); cred = IS_VALID_CRED(nmp->nm_mcred) ? nmp->nm_mcred : vfs_context_ucred(vfs_context_kernel()); @@ -224,26 +230,35 @@ nfs4_setclientid(struct nfsmount *nmp) nfsm_chain_add_64(error, &nmreq, nmp->nm_mounttime); nfsm_chain_add_32(error, &nmreq, nmp->nm_longid->nci_idlen); nfsm_chain_add_opaque(error, &nmreq, nmp->nm_longid->nci_id, nmp->nm_longid->nci_idlen); + nfsmout_if(error); /* cb_client4 callback; */ - if (nmp->nm_cbid && nfs4_cb_port && - !(error = sock_getsockname(nmp->nm_so, (struct sockaddr*)&sin, sizeof(sin)))) { - /* assemble r_addr = h1.h2.h3.h4.p1.p2 */ - /* h = source address of nmp->nm_so */ - /* p = nfs4_cb_port */ - addr = (uint8_t*)&sin.sin_addr.s_addr; - ralen = snprintf(raddr, sizeof(raddr), "%d.%d.%d.%d.%d.%d", - addr[0], addr[1], addr[2], addr[3], - ((nfs4_cb_port >> 8) & 0xff), - (nfs4_cb_port & 0xff)); - /* make sure it fit, give up if it didn't */ - if (ralen >= (int)sizeof(raddr)) - ralen = 0; - } - if (ralen > 0) { + if (!NMFLAG(nmp, NOCALLBACK) && nmp->nm_cbid && nfs4_cb_port && + !sock_getsockname(nmp->nm_nso->nso_so, (struct sockaddr*)&ss, sizeof(ss))) { + if (ss.ss_family == AF_INET) { + sinaddr = &((struct sockaddr_in*)&ss)->sin_addr; + port = nfs4_cb_port; + } else if (ss.ss_family == AF_INET6) { + sinaddr = &((struct sockaddr_in6*)&ss)->sin6_addr; + port = nfs4_cb_port6; + } + if (sinaddr && port && (inet_ntop(ss.ss_family, sinaddr, raddr, sizeof(raddr)) == raddr)) { + /* assemble r_addr = universal address (nmp->nm_nso->nso_so source IP addr + port) */ + ualen = snprintf(uaddr, sizeof(uaddr), "%s.%d.%d", raddr, + ((port >> 8) & 0xff), + (port & 0xff)); + /* make sure it fit, give up if it didn't */ + if (ualen >= (int)sizeof(uaddr)) + ualen = 0; + } + } + if (ualen > 0) { /* add callback info */ nfsm_chain_add_32(error, &nmreq, NFS4_CALLBACK_PROG); /* callback program */ - nfsm_chain_add_string(error, &nmreq, "tcp", 3); /* callback r_netid */ - nfsm_chain_add_string(error, &nmreq, raddr, ralen); /* callback r_addr */ + if (ss.ss_family == AF_INET) + nfsm_chain_add_string(error, &nmreq, "tcp", 3); /* callback r_netid */ + else if (ss.ss_family == AF_INET6) + nfsm_chain_add_string(error, &nmreq, "tcp6", 4); /* callback r_netid */ + nfsm_chain_add_string(error, &nmreq, uaddr, ualen); /* callback r_addr */ nfsm_chain_add_32(error, &nmreq, nmp->nm_cbid); /* callback_ident */ } else { /* don't provide valid callback info */ @@ -255,9 +270,11 @@ nfs4_setclientid(struct nfsmount *nmp) nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request2(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, thd, cred, R_SETUP, &nmrep, &xid, &status); + error = nfs_request2(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, thd, cred, NULL, R_SETUP, 
&nmrep, &xid, &status); nfsm_chain_skip_tag(error, &nmrep); nfsm_chain_get_32(error, &nmrep, numops); + if (!error && (numops != 1) && status) + error = status; nfsm_chain_op_check(error, &nmrep, NFS_OP_SETCLIENTID); if (error == NFSERR_CLID_INUSE) printf("nfs4_setclientid: client ID in use?\n"); @@ -267,43 +284,57 @@ nfs4_setclientid(struct nfsmount *nmp) nfsm_chain_cleanup(&nmreq); nfsm_chain_cleanup(&nmrep); - // SETCLIENTID_CONFIRM, PUTFH, GETATTR(FS) - numops = nmp->nm_dnp ? 3 : 1; - nfsm_chain_build_alloc_init(error, &nmreq, 28 * NFSX_UNSIGNED); + // SETCLIENTID_CONFIRM + numops = 1; + nfsm_chain_build_alloc_init(error, &nmreq, 15 * NFSX_UNSIGNED); nfsm_chain_add_compound_header(error, &nmreq, "setclid_conf", numops); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_SETCLIENTID_CONFIRM); nfsm_chain_add_64(error, &nmreq, nmp->nm_clientid); nfsm_chain_add_64(error, &nmreq, verifier); - if (nmp->nm_dnp) { - /* refresh fs attributes too */ - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); - nfsm_chain_add_fh(error, &nmreq, nmp->nm_vers, nmp->nm_dnp->n_fhp, nmp->nm_dnp->n_fhsize); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - NFS_CLEAR_ATTRIBUTES(bitmap); - NFS4_PER_FS_ATTRIBUTES(bitmap); - nfsm_chain_add_bitmap(error, &nmreq, bitmap, NFS_ATTR_BITMAP_LEN); - } nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request2(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, thd, cred, R_SETUP, &nmrep, &xid, &status); + error = nfs_request2(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, thd, cred, NULL, R_SETUP, &nmrep, &xid, &status); nfsm_chain_skip_tag(error, &nmrep); nfsm_chain_get_32(error, &nmrep, numops); nfsm_chain_op_check(error, &nmrep, NFS_OP_SETCLIENTID_CONFIRM); if (error) printf("nfs4_setclientid: confirm error %d\n", error); - if (nmp->nm_dnp) { - nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); - nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsmout_if(error); - lck_mtx_lock(&nmp->nm_lock); - error = nfs4_parsefattr(&nmrep, &nmp->nm_fsattr, NULL, NULL, NULL); - lck_mtx_unlock(&nmp->nm_lock); - } + lck_mtx_lock(&nmp->nm_lock); + if (!error) + nmp->nm_state |= NFSSTA_CLIENTID; + lck_mtx_unlock(&nmp->nm_lock); + nfsmout_if(error || !nmp->nm_dnp); + + /* take the opportunity to refresh fs attributes too */ + // PUTFH, GETATTR(FS) + numops = 2; + nfsm_chain_build_alloc_init(error, &nmreq, 23 * NFSX_UNSIGNED); + nfsm_chain_add_compound_header(error, &nmreq, "setclid_attr", numops); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nmp->nm_vers, nmp->nm_dnp->n_fhp, nmp->nm_dnp->n_fhsize); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + NFS_CLEAR_ATTRIBUTES(bitmap); + NFS4_PER_FS_ATTRIBUTES(bitmap); + nfsm_chain_add_bitmap(error, &nmreq, bitmap, NFS_ATTR_BITMAP_LEN); + nfsm_chain_build_done(error, &nmreq); + nfsm_assert(error, (numops == 0), EPROTO); + nfsmout_if(error); + error = nfs_request2(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, thd, cred, NULL, R_SETUP, &nmrep, &xid, &status); + nfsm_chain_skip_tag(error, &nmrep); + nfsm_chain_get_32(error, &nmrep, numops); + lck_mtx_lock(&nmp->nm_lock); + nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + if (!error) + error = nfs4_parsefattr(&nmrep, &nmp->nm_fsattr, NULL, NULL, NULL, NULL); + lck_mtx_unlock(&nmp->nm_lock); + if (error) /* ignore any error from the getattr */ + error = 0; nfsmout: 
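The callback r_addr assembled above is a "universal address" (RFC 3530's clientaddr4): the presentation-format IP address with the port appended as two decimal octets. A self-contained userland sketch of the same formatting, using only the standard socket API; for example, 192.0.2.1 with callback port 1021 yields "192.0.2.1.3.253".

	#include <arpa/inet.h>
	#include <netinet/in.h>
	#include <stdio.h>
	#include <sys/socket.h>

	/* Format "ip.p1.p2" from a sockaddr and a port; returns length, or 0. */
	static int
	make_uaddr(const struct sockaddr *sa, in_port_t port, char *uaddr, size_t max)
	{
		char ip[INET6_ADDRSTRLEN];
		const void *a;
		int n;

		if (sa->sa_family == AF_INET)
			a = &((const struct sockaddr_in *)sa)->sin_addr;
		else if (sa->sa_family == AF_INET6)
			a = &((const struct sockaddr_in6 *)sa)->sin6_addr;
		else
			return (0);
		if (inet_ntop(sa->sa_family, a, ip, sizeof(ip)) != ip)
			return (0);
		n = snprintf(uaddr, max, "%s.%d.%d", ip, (port >> 8) & 0xff, port & 0xff);
		return (((n > 0) && (n < (int)max)) ? n : 0);	/* give up if it didn't fit */
	}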
nfsm_chain_cleanup(&nmreq); nfsm_chain_cleanup(&nmrep); @@ -341,7 +372,7 @@ nfs4_renew(struct nfsmount *nmp, int rpcflag) nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); error = nfs_request2(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, - current_thread(), cred, rpcflag, &nmrep, &xid, &status); + current_thread(), cred, NULL, rpcflag, &nmrep, &xid, &status); nfsm_chain_skip_tag(error, &nmrep); nfsm_chain_get_32(error, &nmrep, numops); nfsm_chain_op_check(error, &nmrep, NFS_OP_RENEW); @@ -381,8 +412,7 @@ out: if (error && (error != ETIMEDOUT) && (nmp->nm_clientid == clientid) && !(nmp->nm_state & NFSSTA_RECOVER)) { printf("nfs4_renew_timer: error %d, initiating recovery\n", error); - nmp->nm_state |= NFSSTA_RECOVER; - nfs_mount_sock_thread_wake(nmp); + nfs_need_recover(nmp, error); } interval = nmp->nm_fsattr.nfsa_lease / (error ? 4 : 2); @@ -392,6 +422,1034 @@ out: nfs_interval_timer_start(nmp->nm_renew_timer, interval * 1000); } +/* + * get the list of supported security flavors + * + * How we get them depends on what args we are given: + * + * FH? Name? Action + * ----- ----- ------ + * YES YES Use the fh and name provided + * YES NO 4.1-only just use the fh provided + * NO YES Use the node's (or root) fh and the name provided + * NO NO Use the node's parent and the node's name (4.1 will just use node's fh) + */ +int +nfs4_secinfo_rpc(struct nfsmount *nmp, struct nfsreq_secinfo_args *siap, kauth_cred_t cred, uint32_t *sec, int *seccountp) +{ + int error = 0, status, nfsvers, numops, namelen, fhsize; + vnode_t dvp = NULLVP; + nfsnode_t np, dnp; + u_char *fhp; + const char *vname = NULL, *name; + uint64_t xid; + struct nfsm_chain nmreq, nmrep; + + *seccountp = 0; + if (!nmp) + return (ENXIO); + nfsvers = nmp->nm_vers; + np = siap->rsia_np; + + nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); + + fhp = siap->rsia_fh; + fhsize = fhp ? siap->rsia_fhsize : 0; + name = siap->rsia_name; + namelen = name ? siap->rsia_namelen : 0; + if (name && !namelen) + namelen = strlen(name); + if (!fhp && name) { + if (!np) /* use PUTROOTFH */ + goto gotargs; + fhp = np->n_fhp; + fhsize = np->n_fhsize; + } + if (fhp && name) + goto gotargs; + + if (!np) + return (EIO); + nfs_node_lock_force(np); + if ((vnode_vtype(NFSTOV(np)) != VDIR) && np->n_sillyrename) { + /* + * The node's been sillyrenamed, so we need to use + * the sillyrename directory/name to do the open. + */ + struct nfs_sillyrename *nsp = np->n_sillyrename; + dnp = nsp->nsr_dnp; + dvp = NFSTOV(dnp); + if ((error = vnode_get(dvp))) { + nfs_node_unlock(np); + goto nfsmout; + } + fhp = dnp->n_fhp; + fhsize = dnp->n_fhsize; + name = nsp->nsr_name; + namelen = nsp->nsr_namlen; + } else { + /* + * [sigh] We can't trust VFS to get the parent right for named + * attribute nodes. (It likes to reparent the nodes after we've + * created them.) Luckily we can probably get the right parent + * from the n_parent we have stashed away. 
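The FH/Name table at the top of nfs4_secinfo_rpc() reduces to a four-way selector; restated as a sketch (the enum and helper are hypothetical, added only to recast the table as code):

	enum secinfo_src {
		SRC_PROVIDED,		/* YES/YES: use the fh and name provided */
		SRC_PROVIDED_FH,	/* YES/NO:  4.1-only, just the fh provided */
		SRC_NODE_FH_NAME,	/* NO/YES:  node's (or root) fh + given name */
		SRC_PARENT_NODENAME,	/* NO/NO:   parent's fh + the node's own name */
	};

	static enum secinfo_src
	secinfo_source(int have_fh, int have_name)
	{
		if (have_fh)
			return (have_name ? SRC_PROVIDED : SRC_PROVIDED_FH);
		return (have_name ? SRC_NODE_FH_NAME : SRC_PARENT_NODENAME);
	}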
+ */ + if ((np->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR) && + (((dvp = np->n_parent)) && (error = vnode_get(dvp)))) + dvp = NULL; + if (!dvp) + dvp = vnode_getparent(NFSTOV(np)); + vname = vnode_getname(NFSTOV(np)); + if (!dvp || !vname) { + if (!error) + error = EIO; + nfs_node_unlock(np); + goto nfsmout; + } + dnp = VTONFS(dvp); + fhp = dnp->n_fhp; + fhsize = dnp->n_fhsize; + name = vname; + namelen = strnlen(vname, MAXPATHLEN); + } + nfs_node_unlock(np); + +gotargs: + // PUT(ROOT)FH + SECINFO + numops = 2; + nfsm_chain_build_alloc_init(error, &nmreq, + 4 * NFSX_UNSIGNED + NFSX_FH(nfsvers) + nfsm_rndup(namelen)); + nfsm_chain_add_compound_header(error, &nmreq, "secinfo", numops); + numops--; + if (fhp) { + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nfsvers, fhp, fhsize); + } else { + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTROOTFH); + } + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_SECINFO); + nfsm_chain_add_name(error, &nmreq, name, namelen, nmp); + nfsm_chain_build_done(error, &nmreq); + nfsm_assert(error, (numops == 0), EPROTO); + nfsmout_if(error); + error = nfs_request2(np, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, + current_thread(), cred, NULL, 0, &nmrep, &xid, &status); + nfsm_chain_skip_tag(error, &nmrep); + nfsm_chain_get_32(error, &nmrep, numops); + nfsm_chain_op_check(error, &nmrep, fhp ? NFS_OP_PUTFH : NFS_OP_PUTROOTFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_SECINFO); + nfsmout_if(error); + error = nfsm_chain_get_secinfo(&nmrep, sec, seccountp); +nfsmout: + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); + if (vname) + vnode_putname(vname); + if (dvp != NULLVP) + vnode_put(dvp); + return (error); +} + +/* + * Parse an NFSv4 SECINFO array to an array of pseudo flavors. + * (Note: also works for MOUNTv3 security arrays.) + */ +int +nfsm_chain_get_secinfo(struct nfsm_chain *nmc, uint32_t *sec, int *seccountp) +{ + int error = 0, secmax, seccount, srvcount; + uint32_t flavor, val; + u_char oid[12]; + + seccount = srvcount = 0; + secmax = *seccountp; + *seccountp = 0; + + nfsm_chain_get_32(error, nmc, srvcount); + while (!error && (srvcount > 0) && (seccount < secmax)) { + nfsm_chain_get_32(error, nmc, flavor); + nfsmout_if(error); + switch (flavor) { + case RPCAUTH_NONE: + case RPCAUTH_SYS: + case RPCAUTH_KRB5: + case RPCAUTH_KRB5I: + case RPCAUTH_KRB5P: + sec[seccount++] = flavor; + break; + case RPCSEC_GSS: + /* we only recognize KRB5, KRB5I, KRB5P */ + nfsm_chain_get_32(error, nmc, val); /* OID length */ + nfsmout_if(error); + if (val != sizeof(krb5_mech)) { + nfsm_chain_adv(error, nmc, val); + nfsm_chain_adv(error, nmc, 2*NFSX_UNSIGNED); + break; + } + nfsm_chain_get_opaque(error, nmc, val, oid); /* OID bytes */ + nfsmout_if(error); + if (bcmp(oid, krb5_mech, sizeof(krb5_mech))) { + nfsm_chain_adv(error, nmc, 2*NFSX_UNSIGNED); + break; + } + nfsm_chain_get_32(error, nmc, val); /* QOP */ + nfsm_chain_get_32(error, nmc, val); /* SERVICE */ + nfsmout_if(error); + switch (val) { + case RPCSEC_GSS_SVC_NONE: + sec[seccount++] = RPCAUTH_KRB5; + break; + case RPCSEC_GSS_SVC_INTEGRITY: + sec[seccount++] = RPCAUTH_KRB5I; + break; + case RPCSEC_GSS_SVC_PRIVACY: + sec[seccount++] = RPCAUTH_KRB5P; + break; + } + break; + } + srvcount--; + } +nfsmout: + if (!error) + *seccountp = seccount; + return (error); +} + + +/* + * Fetch the FS_LOCATIONS attribute for the node found at directory/name. 
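The wire shape being fetched here is RFC 3530's fs_locations4: a root pathname plus an array of locations, each a list of server names and a root path. Sketched below as illustrative C structs (these are not the kernel's types); the fs_locations parsing in nfs4_parsefattr() further down walks the stream in exactly this order.

	#include <stdint.h>

	struct xdr_pathname {
		uint32_t ncomp;			/* then ncomp opaque<> components */
	};
	struct xdr_fs_location {
		uint32_t nservers;		/* then nservers utf8 server names */
		struct xdr_pathname rootpath;
	};
	struct xdr_fs_locations {
		struct xdr_pathname fs_root;
		uint32_t nlocations;		/* then nlocations fs_location entries */
	};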
+ */ +int +nfs4_get_fs_locations( + struct nfsmount *nmp, + nfsnode_t dnp, + u_char *fhp, + int fhsize, + const char *name, + vfs_context_t ctx, + struct nfs_fs_locations *nfslsp) +{ + int error = 0, numops, status; + uint32_t bitmap[NFS_ATTR_BITMAP_LEN]; + struct nfsreq rq, *req = &rq; + struct nfsreq_secinfo_args si; + struct nfsm_chain nmreq, nmrep; + uint64_t xid; + + if (!fhp && dnp) { + fhp = dnp->n_fhp; + fhsize = dnp->n_fhsize; + } + if (!fhp) + return (EINVAL); + + nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); + + NFSREQ_SECINFO_SET(&si, NULL, fhp, fhsize, name, 0); + numops = 3; + nfsm_chain_build_alloc_init(error, &nmreq, 18 * NFSX_UNSIGNED); + nfsm_chain_add_compound_header(error, &nmreq, "fs_locations", numops); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, NFS_VER4, fhp, fhsize); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_LOOKUP); + nfsm_chain_add_name(error, &nmreq, name, strlen(name), nmp); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + NFS_CLEAR_ATTRIBUTES(bitmap); + NFS_BITMAP_SET(bitmap, NFS_FATTR_FS_LOCATIONS); + nfsm_chain_add_bitmap(error, &nmreq, bitmap, NFS_ATTR_BITMAP_LEN); + nfsm_chain_build_done(error, &nmreq); + nfsm_assert(error, (numops == 0), EPROTO); + nfsmout_if(error); + error = nfs_request_async(dnp, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, + vfs_context_thread(ctx), vfs_context_ucred(ctx), &si, 0, NULL, &req); + if (!error) + error = nfs_request_async_finish(req, &nmrep, &xid, &status); + nfsm_chain_skip_tag(error, &nmrep); + nfsm_chain_get_32(error, &nmrep, numops); + nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_LOOKUP); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + nfsmout_if(error); + error = nfs4_parsefattr(&nmrep, NULL, NULL, NULL, NULL, nfslsp); +nfsmout: + nfsm_chain_cleanup(&nmrep); + nfsm_chain_cleanup(&nmreq); + return (error); +} + +/* + * Referral trigger nodes may not have many attributes provided by the + * server, so put some default values in place. 
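A hedged sketch of how a caller might drive nfs4_get_fs_locations() above; this is a kernel-context fragment with nmp, dnp, and ctx assumed to be in scope, and cleanup uses nfs_fs_locations_cleanup(), the same helper nfs4_parsefattr() uses on error.

	struct nfs_fs_locations nfsls;
	int error;

	/* Ask where the child "export" of directory dnp has been relocated to. */
	error = nfs4_get_fs_locations(nmp, dnp, NULL, 0, "export", ctx, &nfsls);
	if (!error) {
		/*
		 * nfsls.nl_root and nfsls.nl_locations[0..nl_numlocs-1] now
		 * describe the referral's root path and candidate servers.
		 */
		nfs_fs_locations_cleanup(&nfsls);
	}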
+ */ +void +nfs4_default_attrs_for_referral_trigger( + nfsnode_t dnp, + char *name, + int namelen, + struct nfs_vattr *nvap, + fhandle_t *fhp) +{ + struct timeval now; + int len; + + microtime(&now); + nvap->nva_flags = NFS_FFLAG_TRIGGER | NFS_FFLAG_TRIGGER_REFERRAL; + if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_TYPE)) { + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_TYPE); + nvap->nva_type = VDIR; + } + if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_FSID)) { + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_FSID); + nvap->nva_fsid.major = 0; + nvap->nva_fsid.minor = 0; + } + if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_OWNER) && dnp) { + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_OWNER); + nvap->nva_uid = dnp->n_vattr.nva_uid; + nvap->nva_uuuid = dnp->n_vattr.nva_uuuid; + } + if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_OWNER_GROUP) && dnp) { + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_OWNER_GROUP); + nvap->nva_gid = dnp->n_vattr.nva_gid; + nvap->nva_guuid = dnp->n_vattr.nva_guuid; + } + if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_MODE)) { + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_MODE); + nvap->nva_mode = 0777; + } + if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_SIZE)) { + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_SIZE); + nvap->nva_size = 0; + } + if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_SPACE_USED)) { + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_SPACE_USED); + nvap->nva_bytes = 0; + } + if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_NUMLINKS)) { + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_NUMLINKS); + nvap->nva_nlink = 2; + } + if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_TIME_ACCESS)) { + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_TIME_ACCESS); + nvap->nva_timesec[NFSTIME_ACCESS] = now.tv_sec; + nvap->nva_timensec[NFSTIME_ACCESS] = now.tv_usec * 1000; + } + if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_TIME_MODIFY)) { + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_TIME_MODIFY); + nvap->nva_timesec[NFSTIME_MODIFY] = now.tv_sec; + nvap->nva_timensec[NFSTIME_MODIFY] = now.tv_usec * 1000; + } + if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_TIME_METADATA)) { + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_TIME_METADATA); + nvap->nva_timesec[NFSTIME_CHANGE] = now.tv_sec; + nvap->nva_timensec[NFSTIME_CHANGE] = now.tv_usec * 1000; + } + if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_FILEID)) { + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_FILEID); + nvap->nva_fileid = 42; + } + if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_FILEHANDLE) && dnp && name && fhp) { + /* Build a fake filehandle made up of parent node pointer and name */ + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_FILEHANDLE); + bcopy(&dnp, &fhp->fh_data[0], sizeof(dnp)); + len = sizeof(fhp->fh_data) - sizeof(dnp); + bcopy(name, &fhp->fh_data[0] + sizeof(dnp), MIN(len, namelen)); + fhp->fh_len = sizeof(dnp) + namelen; + if (fhp->fh_len > (int)sizeof(fhp->fh_data)) + fhp->fh_len = sizeof(fhp->fh_data); + } +} + +/* + * Set NFS bitmap according to what's set in vnode_attr (and supported by the server).
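The fake filehandle built above packs the parent node pointer followed by as much of the name as fits. A standalone sketch of that layout (a hypothetical helper mirroring the bcopy logic, not the kernel code itself):

	#include <stdint.h>
	#include <string.h>

	/* Pack <parent pointer><name...> into fh_data, clamped to fh_max. */
	static int
	pack_fake_fh(uint8_t *fh_data, size_t fh_max, const void *parent,
	    const char *name, size_t namelen)
	{
		size_t room = fh_max - sizeof(parent);
		size_t len;

		memcpy(fh_data, &parent, sizeof(parent));	/* parent pointer first */
		memcpy(fh_data + sizeof(parent), name,
		    (namelen < room) ? namelen : room);		/* then the name, truncated */
		len = sizeof(parent) + namelen;
		return ((int)((len > fh_max) ? fh_max : len));	/* clamped fh_len */
	}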
+ */ +void +nfs_vattr_set_bitmap(struct nfsmount *nmp, uint32_t *bitmap, struct vnode_attr *vap) +{ + int i; + + NFS_CLEAR_ATTRIBUTES(bitmap); + if (VATTR_IS_ACTIVE(vap, va_data_size)) + NFS_BITMAP_SET(bitmap, NFS_FATTR_SIZE); + if (VATTR_IS_ACTIVE(vap, va_acl) && (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_ACL)) + NFS_BITMAP_SET(bitmap, NFS_FATTR_ACL); + if (VATTR_IS_ACTIVE(vap, va_flags)) { + NFS_BITMAP_SET(bitmap, NFS_FATTR_ARCHIVE); + NFS_BITMAP_SET(bitmap, NFS_FATTR_HIDDEN); + } + // NFS_BITMAP_SET(bitmap, NFS_FATTR_MIMETYPE) + if (VATTR_IS_ACTIVE(vap, va_mode) && !NMFLAG(nmp, ACLONLY)) + NFS_BITMAP_SET(bitmap, NFS_FATTR_MODE); + if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_uuuid)) + NFS_BITMAP_SET(bitmap, NFS_FATTR_OWNER); + if (VATTR_IS_ACTIVE(vap, va_gid) || VATTR_IS_ACTIVE(vap, va_guuid)) + NFS_BITMAP_SET(bitmap, NFS_FATTR_OWNER_GROUP); + // NFS_BITMAP_SET(bitmap, NFS_FATTR_SYSTEM) + if (vap->va_vaflags & VA_UTIMES_NULL) { + NFS_BITMAP_SET(bitmap, NFS_FATTR_TIME_ACCESS_SET); + NFS_BITMAP_SET(bitmap, NFS_FATTR_TIME_MODIFY_SET); + } else { + if (VATTR_IS_ACTIVE(vap, va_access_time)) + NFS_BITMAP_SET(bitmap, NFS_FATTR_TIME_ACCESS_SET); + if (VATTR_IS_ACTIVE(vap, va_modify_time)) + NFS_BITMAP_SET(bitmap, NFS_FATTR_TIME_MODIFY_SET); + } + if (VATTR_IS_ACTIVE(vap, va_backup_time)) + NFS_BITMAP_SET(bitmap, NFS_FATTR_TIME_BACKUP); + if (VATTR_IS_ACTIVE(vap, va_create_time)) + NFS_BITMAP_SET(bitmap, NFS_FATTR_TIME_CREATE); + /* and limit to what is supported by server */ + for (i=0; i < NFS_ATTR_BITMAP_LEN; i++) + bitmap[i] &= nmp->nm_fsattr.nfsa_supp_attr[i]; +} + +/* + * Convert between NFSv4 and VFS ACE types + */ +uint32_t +nfs4_ace_nfstype_to_vfstype(uint32_t nfsacetype, int *errorp) +{ + switch (nfsacetype) { + case NFS_ACE_ACCESS_ALLOWED_ACE_TYPE: + return KAUTH_ACE_PERMIT; + case NFS_ACE_ACCESS_DENIED_ACE_TYPE: + return KAUTH_ACE_DENY; + case NFS_ACE_SYSTEM_AUDIT_ACE_TYPE: + return KAUTH_ACE_AUDIT; + case NFS_ACE_SYSTEM_ALARM_ACE_TYPE: + return KAUTH_ACE_ALARM; + } + *errorp = EBADRPC; + return 0; +} + +uint32_t +nfs4_ace_vfstype_to_nfstype(uint32_t vfstype, int *errorp) +{ + switch (vfstype) { + case KAUTH_ACE_PERMIT: + return NFS_ACE_ACCESS_ALLOWED_ACE_TYPE; + case KAUTH_ACE_DENY: + return NFS_ACE_ACCESS_DENIED_ACE_TYPE; + case KAUTH_ACE_AUDIT: + return NFS_ACE_SYSTEM_AUDIT_ACE_TYPE; + case KAUTH_ACE_ALARM: + return NFS_ACE_SYSTEM_ALARM_ACE_TYPE; + } + *errorp = EINVAL; + return 0; +} + +/* + * Convert between NFSv4 and VFS ACE flags + */ +uint32_t +nfs4_ace_nfsflags_to_vfsflags(uint32_t nfsflags) +{ + uint32_t vfsflags = 0; + + if (nfsflags & NFS_ACE_FILE_INHERIT_ACE) + vfsflags |= KAUTH_ACE_FILE_INHERIT; + if (nfsflags & NFS_ACE_DIRECTORY_INHERIT_ACE) + vfsflags |= KAUTH_ACE_DIRECTORY_INHERIT; + if (nfsflags & NFS_ACE_NO_PROPAGATE_INHERIT_ACE) + vfsflags |= KAUTH_ACE_LIMIT_INHERIT; + if (nfsflags & NFS_ACE_INHERIT_ONLY_ACE) + vfsflags |= KAUTH_ACE_ONLY_INHERIT; + if (nfsflags & NFS_ACE_SUCCESSFUL_ACCESS_ACE_FLAG) + vfsflags |= KAUTH_ACE_SUCCESS; + if (nfsflags & NFS_ACE_FAILED_ACCESS_ACE_FLAG) + vfsflags |= KAUTH_ACE_FAILURE; + if (nfsflags & NFS_ACE_INHERITED_ACE) + vfsflags |= KAUTH_ACE_INHERITED; + + return (vfsflags); +} + +uint32_t +nfs4_ace_vfsflags_to_nfsflags(uint32_t vfsflags) +{ + uint32_t nfsflags = 0; + + if (vfsflags & KAUTH_ACE_FILE_INHERIT) + nfsflags |= NFS_ACE_FILE_INHERIT_ACE; + if (vfsflags & KAUTH_ACE_DIRECTORY_INHERIT) + nfsflags |= NFS_ACE_DIRECTORY_INHERIT_ACE; + if (vfsflags & KAUTH_ACE_LIMIT_INHERIT) + nfsflags |= 
NFS_ACE_NO_PROPAGATE_INHERIT_ACE; + if (vfsflags & KAUTH_ACE_ONLY_INHERIT) + nfsflags |= NFS_ACE_INHERIT_ONLY_ACE; + if (vfsflags & KAUTH_ACE_SUCCESS) + nfsflags |= NFS_ACE_SUCCESSFUL_ACCESS_ACE_FLAG; + if (vfsflags & KAUTH_ACE_FAILURE) + nfsflags |= NFS_ACE_FAILED_ACCESS_ACE_FLAG; + if (vfsflags & KAUTH_ACE_INHERITED) + nfsflags |= NFS_ACE_INHERITED_ACE; + + return (nfsflags); +} + +/* + * Convert between NFSv4 ACE access masks and VFS access rights + */ +uint32_t +nfs4_ace_nfsmask_to_vfsrights(uint32_t nfsmask) +{ + uint32_t vfsrights = 0; + + if (nfsmask & NFS_ACE_READ_DATA) + vfsrights |= KAUTH_VNODE_READ_DATA; + if (nfsmask & NFS_ACE_LIST_DIRECTORY) + vfsrights |= KAUTH_VNODE_LIST_DIRECTORY; + if (nfsmask & NFS_ACE_WRITE_DATA) + vfsrights |= KAUTH_VNODE_WRITE_DATA; + if (nfsmask & NFS_ACE_ADD_FILE) + vfsrights |= KAUTH_VNODE_ADD_FILE; + if (nfsmask & NFS_ACE_APPEND_DATA) + vfsrights |= KAUTH_VNODE_APPEND_DATA; + if (nfsmask & NFS_ACE_ADD_SUBDIRECTORY) + vfsrights |= KAUTH_VNODE_ADD_SUBDIRECTORY; + if (nfsmask & NFS_ACE_READ_NAMED_ATTRS) + vfsrights |= KAUTH_VNODE_READ_EXTATTRIBUTES; + if (nfsmask & NFS_ACE_WRITE_NAMED_ATTRS) + vfsrights |= KAUTH_VNODE_WRITE_EXTATTRIBUTES; + if (nfsmask & NFS_ACE_EXECUTE) + vfsrights |= KAUTH_VNODE_EXECUTE; + if (nfsmask & NFS_ACE_DELETE_CHILD) + vfsrights |= KAUTH_VNODE_DELETE_CHILD; + if (nfsmask & NFS_ACE_READ_ATTRIBUTES) + vfsrights |= KAUTH_VNODE_READ_ATTRIBUTES; + if (nfsmask & NFS_ACE_WRITE_ATTRIBUTES) + vfsrights |= KAUTH_VNODE_WRITE_ATTRIBUTES; + if (nfsmask & NFS_ACE_DELETE) + vfsrights |= KAUTH_VNODE_DELETE; + if (nfsmask & NFS_ACE_READ_ACL) + vfsrights |= KAUTH_VNODE_READ_SECURITY; + if (nfsmask & NFS_ACE_WRITE_ACL) + vfsrights |= KAUTH_VNODE_WRITE_SECURITY; + if (nfsmask & NFS_ACE_WRITE_OWNER) + vfsrights |= KAUTH_VNODE_CHANGE_OWNER; + if (nfsmask & NFS_ACE_SYNCHRONIZE) + vfsrights |= KAUTH_VNODE_SYNCHRONIZE; + if ((nfsmask & NFS_ACE_GENERIC_READ) == NFS_ACE_GENERIC_READ) + vfsrights |= KAUTH_ACE_GENERIC_READ; + if ((nfsmask & NFS_ACE_GENERIC_WRITE) == NFS_ACE_GENERIC_WRITE) + vfsrights |= KAUTH_ACE_GENERIC_WRITE; + if ((nfsmask & NFS_ACE_GENERIC_EXECUTE) == NFS_ACE_GENERIC_EXECUTE) + vfsrights |= KAUTH_ACE_GENERIC_EXECUTE; + + return (vfsrights); +} + +uint32_t +nfs4_ace_vfsrights_to_nfsmask(uint32_t vfsrights) +{ + uint32_t nfsmask = 0; + + if (vfsrights & KAUTH_VNODE_READ_DATA) + nfsmask |= NFS_ACE_READ_DATA; + if (vfsrights & KAUTH_VNODE_LIST_DIRECTORY) + nfsmask |= NFS_ACE_LIST_DIRECTORY; + if (vfsrights & KAUTH_VNODE_WRITE_DATA) + nfsmask |= NFS_ACE_WRITE_DATA; + if (vfsrights & KAUTH_VNODE_ADD_FILE) + nfsmask |= NFS_ACE_ADD_FILE; + if (vfsrights & KAUTH_VNODE_APPEND_DATA) + nfsmask |= NFS_ACE_APPEND_DATA; + if (vfsrights & KAUTH_VNODE_ADD_SUBDIRECTORY) + nfsmask |= NFS_ACE_ADD_SUBDIRECTORY; + if (vfsrights & KAUTH_VNODE_READ_EXTATTRIBUTES) + nfsmask |= NFS_ACE_READ_NAMED_ATTRS; + if (vfsrights & KAUTH_VNODE_WRITE_EXTATTRIBUTES) + nfsmask |= NFS_ACE_WRITE_NAMED_ATTRS; + if (vfsrights & KAUTH_VNODE_EXECUTE) + nfsmask |= NFS_ACE_EXECUTE; + if (vfsrights & KAUTH_VNODE_DELETE_CHILD) + nfsmask |= NFS_ACE_DELETE_CHILD; + if (vfsrights & KAUTH_VNODE_READ_ATTRIBUTES) + nfsmask |= NFS_ACE_READ_ATTRIBUTES; + if (vfsrights & KAUTH_VNODE_WRITE_ATTRIBUTES) + nfsmask |= NFS_ACE_WRITE_ATTRIBUTES; + if (vfsrights & KAUTH_VNODE_DELETE) + nfsmask |= NFS_ACE_DELETE; + if (vfsrights & KAUTH_VNODE_READ_SECURITY) + nfsmask |= NFS_ACE_READ_ACL; + if (vfsrights & KAUTH_VNODE_WRITE_SECURITY) + nfsmask |= NFS_ACE_WRITE_ACL; + if (vfsrights & 
KAUTH_VNODE_CHANGE_OWNER) + nfsmask |= NFS_ACE_WRITE_OWNER; + if (vfsrights & KAUTH_VNODE_SYNCHRONIZE) + nfsmask |= NFS_ACE_SYNCHRONIZE; + if (vfsrights & KAUTH_ACE_GENERIC_READ) + nfsmask |= NFS_ACE_GENERIC_READ; + if (vfsrights & KAUTH_ACE_GENERIC_WRITE) + nfsmask |= NFS_ACE_GENERIC_WRITE; + if (vfsrights & KAUTH_ACE_GENERIC_EXECUTE) + nfsmask |= NFS_ACE_GENERIC_EXECUTE; + if (vfsrights & KAUTH_ACE_GENERIC_ALL) + nfsmask |= (NFS_ACE_GENERIC_READ|NFS_ACE_GENERIC_WRITE|NFS_ACE_GENERIC_EXECUTE); + + return (nfsmask); +} + +/* + * Map an NFSv4 ID string to a VFS guid. + * + * Try to use the ID mapping service... but we may fallback to trying to do it ourselves. + */ +int +nfs4_id2guid(/*const*/ char *id, guid_t *guidp, int isgroup) +{ + int error1 = 0, error = 0, compare; + guid_t guid1, guid2, *gp; + ntsid_t sid; + long num, unknown; + const char *p, *at; + + *guidp = kauth_null_guid; + compare = ((nfs_idmap_ctrl & NFS_IDMAP_CTRL_USE_IDMAP_SERVICE) && + (nfs_idmap_ctrl & NFS_IDMAP_CTRL_COMPARE_RESULTS)); + unknown = (nfs_idmap_ctrl & NFS_IDMAP_CTRL_UNKNOWN_IS_99) ? 99 : -2; + + /* + * First check if it is just a simple numeric ID string or a special "XXX@" name. + * If it's a number, there's no need trying to ask the IDMAP service to map it. + * If it's a special "XXX@" name, we want to make sure to treat it as a group. + */ + num = 1; + at = NULL; + p = id; + while (*p) { + if ((*p < '0') || (*p > '9')) + num = 0; + if (*p == '@') + at = p; + p++; + } + if (at && !at[1] && !isgroup) + isgroup = 1; /* special "XXX@" names should always be treated as groups */ + if (num) { + /* must be numeric ID (or empty) */ + num = *id ? strtol(id, NULL, 10) : unknown; + gp = guidp; + goto gotnumid; + } + + if (nfs_idmap_ctrl & NFS_IDMAP_CTRL_USE_IDMAP_SERVICE) { + /* + * Ask the ID mapping service to map the ID string to a GUID. + * + * [sigh] this isn't a "pwnam/grnam" it's an NFS ID string! + */ + gp = compare ? &guid1 : guidp; + if (isgroup) + error = kauth_cred_grnam2guid(id, gp); + else + error = kauth_cred_pwnam2guid(id, gp); + if (error && (nfs_idmap_ctrl & NFS_IDMAP_CTRL_LOG_FAILED_MAPPINGS)) + printf("nfs4_id2guid: idmap failed for %s %s error %d\n", id, isgroup ? "G" : " ", error); + if (!error && (nfs_idmap_ctrl & NFS_IDMAP_CTRL_LOG_SUCCESSFUL_MAPPINGS)) + printf("nfs4_id2guid: idmap for %s %s got guid " + "%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x\n", + id, isgroup ? "G" : " ", + gp->g_guid[0], gp->g_guid[1], gp->g_guid[2], gp->g_guid[3], + gp->g_guid[4], gp->g_guid[5], gp->g_guid[6], gp->g_guid[7], + gp->g_guid[8], gp->g_guid[9], gp->g_guid[10], gp->g_guid[11], + gp->g_guid[12], gp->g_guid[13], gp->g_guid[14], gp->g_guid[15]); + error1 = error; + } + if (error || compare || !(nfs_idmap_ctrl & NFS_IDMAP_CTRL_USE_IDMAP_SERVICE)) { + /* + * fallback path... see if we can come up with an answer ourselves. + */ + gp = compare ?
&guid2 : guidp; + + if (!(nfs_idmap_ctrl & NFS_IDMAP_CTRL_FALLBACK_NO_WELLKNOWN_IDS) && at && !at[1]) { + /* must be a special ACE "who" ID */ + bzero(&sid, sizeof(sid)); + sid.sid_kind = 1; + sid.sid_authcount = 1; + if (!strcmp(id, "OWNER@")) { + // S-1-3-0 + sid.sid_authority[5] = 3; + sid.sid_authorities[0] = 0; + } else if (!strcmp(id, "GROUP@")) { + // S-1-3-1 + sid.sid_authority[5] = 3; + sid.sid_authorities[0] = 1; + } else if (!strcmp(id, "EVERYONE@")) { + // S-1-1-0 + sid.sid_authority[5] = 1; + sid.sid_authorities[0] = 0; + } else if (!strcmp(id, "INTERACTIVE@")) { + // S-1-5-4 + sid.sid_authority[5] = 5; + sid.sid_authorities[0] = 4; + } else if (!strcmp(id, "NETWORK@")) { + // S-1-5-2 + sid.sid_authority[5] = 5; + sid.sid_authorities[0] = 2; + } else if (!strcmp(id, "DIALUP@")) { + // S-1-5-1 + sid.sid_authority[5] = 5; + sid.sid_authorities[0] = 1; + } else if (!strcmp(id, "BATCH@")) { + // S-1-5-3 + sid.sid_authority[5] = 5; + sid.sid_authorities[0] = 3; + } else if (!strcmp(id, "ANONYMOUS@")) { + // S-1-5-7 + sid.sid_authority[5] = 5; + sid.sid_authorities[0] = 7; + } else if (!strcmp(id, "AUTHENTICATED@")) { + // S-1-5-11 + sid.sid_authority[5] = 5; + sid.sid_authorities[0] = 11; + } else if (!strcmp(id, "SERVICE@")) { + // S-1-5-6 + sid.sid_authority[5] = 5; + sid.sid_authorities[0] = 6; + } else { + // S-1-0-0 "NOBODY" + sid.sid_authority[5] = 0; + sid.sid_authorities[0] = 0; + } + error = kauth_cred_ntsid2guid(&sid, gp); + } else { + if (!(nfs_idmap_ctrl & NFS_IDMAP_CTRL_FALLBACK_NO_COMMON_IDS) && at) { + /* must be user@domain */ + /* try to identify some well-known IDs */ + if (!strncmp(id, "root@", 5)) + num = 0; + else if (!strncmp(id, "wheel@", 6)) + num = 0; + else if (!strncmp(id, "nobody@", 7)) + num = -2; + else if (!strncmp(id, "nfsnobody@", 10)) + num = -2; + else + num = unknown; + } else if (!(nfs_idmap_ctrl & NFS_IDMAP_CTRL_FALLBACK_NO_COMMON_IDS) && !strcmp(id, "nobody")) { + num = -2; + } else { + num = unknown; + } +gotnumid: + if (isgroup) + error = kauth_cred_gid2guid((gid_t)num, gp); + else + error = kauth_cred_uid2guid((uid_t)num, gp); + } + if (error && (nfs_idmap_ctrl & NFS_IDMAP_CTRL_LOG_FAILED_MAPPINGS)) + printf("nfs4_id2guid: fallback map failed for %s %s error %d\n", id, isgroup ? "G" : " ", error); + if (!error && (nfs_idmap_ctrl & NFS_IDMAP_CTRL_LOG_SUCCESSFUL_MAPPINGS)) + printf("nfs4_id2guid: fallback map for %s %s got guid " + "%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x\n", + id, isgroup ? "G" : " ", + gp->g_guid[0], gp->g_guid[1], gp->g_guid[2], gp->g_guid[3], + gp->g_guid[4], gp->g_guid[5], gp->g_guid[6], gp->g_guid[7], + gp->g_guid[8], gp->g_guid[9], gp->g_guid[10], gp->g_guid[11], + gp->g_guid[12], gp->g_guid[13], gp->g_guid[14], gp->g_guid[15]); + } + + if (compare) { + /* compare the results, log if different */ + if (!error1 && !error) { + if (!kauth_guid_equal(&guid1, &guid2)) + printf("nfs4_id2guid: idmap/fallback results differ for %s %s - " + "idmap %02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x " + "fallback %02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x\n", + id, isgroup ? 
"G" : " ", + guid1.g_guid[0], guid1.g_guid[1], guid1.g_guid[2], guid1.g_guid[3], + guid1.g_guid[4], guid1.g_guid[5], guid1.g_guid[6], guid1.g_guid[7], + guid1.g_guid[8], guid1.g_guid[9], guid1.g_guid[10], guid1.g_guid[11], + guid1.g_guid[12], guid1.g_guid[13], guid1.g_guid[14], guid1.g_guid[15], + guid2.g_guid[0], guid2.g_guid[1], guid2.g_guid[2], guid2.g_guid[3], + guid2.g_guid[4], guid2.g_guid[5], guid2.g_guid[6], guid2.g_guid[7], + guid2.g_guid[8], guid2.g_guid[9], guid2.g_guid[10], guid2.g_guid[11], + guid2.g_guid[12], guid2.g_guid[13], guid2.g_guid[14], guid2.g_guid[15]); + /* copy idmap result to output guid */ + *guidp = guid1; + } else if (error1 && !error) { + printf("nfs4_id2guid: idmap/fallback results differ for %s %s - " + "idmap error %d " + "fallback %02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x\n", + id, isgroup ? "G" : " ", + error1, + guid2.g_guid[0], guid2.g_guid[1], guid2.g_guid[2], guid2.g_guid[3], + guid2.g_guid[4], guid2.g_guid[5], guid2.g_guid[6], guid2.g_guid[7], + guid2.g_guid[8], guid2.g_guid[9], guid2.g_guid[10], guid2.g_guid[11], + guid2.g_guid[12], guid2.g_guid[13], guid2.g_guid[14], guid2.g_guid[15]); + /* copy fallback result to output guid */ + *guidp = guid2; + } else if (!error1 && error) { + printf("nfs4_id2guid: idmap/fallback results differ for %s %s - " + "idmap %02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x " + "fallback error %d\n", + id, isgroup ? "G" : " ", + guid1.g_guid[0], guid1.g_guid[1], guid1.g_guid[2], guid1.g_guid[3], + guid1.g_guid[4], guid1.g_guid[5], guid1.g_guid[6], guid1.g_guid[7], + guid1.g_guid[8], guid1.g_guid[9], guid1.g_guid[10], guid1.g_guid[11], + guid1.g_guid[12], guid1.g_guid[13], guid1.g_guid[14], guid1.g_guid[15], + error); + /* copy idmap result to output guid */ + *guidp = guid1; + error = 0; + } else { + if (error1 != error) + printf("nfs4_id2guid: idmap/fallback results differ for %s %s - " + "idmap error %d fallback error %d\n", + id, isgroup ? "G" : " ", error1, error); + } + } + + return (error); +} + +/* + * Map a VFS guid to an NFSv4 ID string. + * + * Try to use the ID mapping service... but we may fallback to trying to do it ourselves. + */ +int +nfs4_guid2id(guid_t *guidp, char *id, int *idlen, int isgroup) +{ + int error1 = 0, error = 0, compare; + int id1len, id2len, len; + char *id1buf, *id1; + char numbuf[32]; + const char *id2 = NULL; + + id1buf = id1 = NULL; + id1len = id2len = 0; + compare = ((nfs_idmap_ctrl & NFS_IDMAP_CTRL_USE_IDMAP_SERVICE) && + (nfs_idmap_ctrl & NFS_IDMAP_CTRL_COMPARE_RESULTS)); + + if (nfs_idmap_ctrl & NFS_IDMAP_CTRL_USE_IDMAP_SERVICE) { + /* + * Ask the ID mapping service to map the GUID to an ID string. + * + * [sigh] this isn't a "pwnam" it's an NFS id string! + */ + + /* + * Stupid kauth_cred_guid2pwnam() function requires that the buffer + * be at least MAXPATHLEN bytes long even though most if not all ID + * strings will be much much shorter than that. 
+ */ + if (compare || (*idlen < MAXPATHLEN)) { + MALLOC_ZONE(id1buf, char*, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (!id1buf) + return (ENOMEM); + id1 = id1buf; + id1len = MAXPATHLEN; + } else { + id1 = id; + id1len = *idlen; + } + + if (isgroup) + error = kauth_cred_guid2grnam(guidp, id1); + else + error = kauth_cred_guid2pwnam(guidp, id1); + if (error && (nfs_idmap_ctrl & NFS_IDMAP_CTRL_LOG_FAILED_MAPPINGS)) + printf("nfs4_guid2id: idmap failed for " + "%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x %s " + "error %d\n", + guidp->g_guid[0], guidp->g_guid[1], guidp->g_guid[2], guidp->g_guid[3], + guidp->g_guid[4], guidp->g_guid[5], guidp->g_guid[6], guidp->g_guid[7], + guidp->g_guid[8], guidp->g_guid[9], guidp->g_guid[10], guidp->g_guid[11], + guidp->g_guid[12], guidp->g_guid[13], guidp->g_guid[14], guidp->g_guid[15], + isgroup ? "G" : " ", error); + if (!error && (nfs_idmap_ctrl & NFS_IDMAP_CTRL_LOG_SUCCESSFUL_MAPPINGS)) + printf("nfs4_guid2id: idmap for " + "%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x %s " + "got ID %s\n", + guidp->g_guid[0], guidp->g_guid[1], guidp->g_guid[2], guidp->g_guid[3], + guidp->g_guid[4], guidp->g_guid[5], guidp->g_guid[6], guidp->g_guid[7], + guidp->g_guid[8], guidp->g_guid[9], guidp->g_guid[10], guidp->g_guid[11], + guidp->g_guid[12], guidp->g_guid[13], guidp->g_guid[14], guidp->g_guid[15], + isgroup ? "G" : " ", id1); + error1 = error; + if (!error) { + if (compare) { + id1len = strnlen(id1, id1len); + } else if (id1 == id1buf) { + /* copy idmap result to output buffer */ + len = strlcpy(id, id1, *idlen); + if (len >= *idlen) + error = ENOSPC; + else + *idlen = len; + } + } + } + if (error || compare || !(nfs_idmap_ctrl & NFS_IDMAP_CTRL_USE_IDMAP_SERVICE)) { + /* + * fallback path... see if we can come up with an answer ourselves. + */ + ntsid_t sid; + uid_t uid; + + if (!(nfs_idmap_ctrl & NFS_IDMAP_CTRL_FALLBACK_NO_WELLKNOWN_IDS)) { + error = kauth_cred_guid2ntsid(guidp, &sid); + if (!error && (sid.sid_kind == 1) && (sid.sid_authcount == 1)) { + /* check if it's one of our well-known ACE WHO names */ + if (sid.sid_authority[5] == 0) { + if (sid.sid_authorities[0] == 0) // S-1-0-0 + id2 = "nobody@localdomain"; + } else if (sid.sid_authority[5] == 1) { + if (sid.sid_authorities[0] == 0) // S-1-1-0 + id2 = "EVERYONE@"; + } else if (sid.sid_authority[5] == 3) { + if (sid.sid_authorities[0] == 0) // S-1-3-0 + id2 = "OWNER@"; + else if (sid.sid_authorities[0] == 1) // S-1-3-1 + id2 = "GROUP@"; + } else if (sid.sid_authority[5] == 5) { + if (sid.sid_authorities[0] == 1) // S-1-5-1 + id2 = "DIALUP@"; + else if (sid.sid_authorities[0] == 2) // S-1-5-2 + id2 = "NETWORK@"; + else if (sid.sid_authorities[0] == 3) // S-1-5-3 + id2 = "BATCH@"; + else if (sid.sid_authorities[0] == 4) // S-1-5-4 + id2 = "INTERACTIVE@"; + else if (sid.sid_authorities[0] == 6) // S-1-5-6 + id2 = "SERVICE@"; + else if (sid.sid_authorities[0] == 7) // S-1-5-7 + id2 = "ANONYMOUS@"; + else if (sid.sid_authorities[0] == 11) // S-1-5-11 + id2 = "AUTHENTICATED@"; + } + } + } + if (!id2) { + /* OK, let's just try mapping it to a UID/GID */ + if (isgroup) + error = kauth_cred_guid2gid(guidp, (gid_t*)&uid); + else + error = kauth_cred_guid2uid(guidp, &uid); + if (!error) { + if (!(nfs_idmap_ctrl & NFS_IDMAP_CTRL_FALLBACK_NO_COMMON_IDS)) { + /* map well known uid's to strings */ + if (uid == 0) + id2 = isgroup ?
"wheel@localdomain" : "root@localdomain"; + else if (uid == (uid_t)-2) + id2 = "nobody@localdomain"; + } + if (!id2) { + /* or just use a decimal number string. */ + snprintf(numbuf, sizeof(numbuf), "%d", uid); + id2 = numbuf; + } + } + } + if (error && (nfs_idmap_ctrl & NFS_IDMAP_CTRL_LOG_FAILED_MAPPINGS)) + printf("nfs4_guid2id: fallback map failed for " + "%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x %s " + "error %d\n", + guidp->g_guid[0], guidp->g_guid[1], guidp->g_guid[2], guidp->g_guid[3], + guidp->g_guid[4], guidp->g_guid[5], guidp->g_guid[6], guidp->g_guid[7], + guidp->g_guid[8], guidp->g_guid[9], guidp->g_guid[10], guidp->g_guid[11], + guidp->g_guid[12], guidp->g_guid[13], guidp->g_guid[14], guidp->g_guid[15], + isgroup ? "G" : " ", error); + if (!error && (nfs_idmap_ctrl & NFS_IDMAP_CTRL_LOG_SUCCESSFUL_MAPPINGS)) + printf("nfs4_guid2id: fallback map for " + "%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x %s " + "got ID %s\n", + guidp->g_guid[0], guidp->g_guid[1], guidp->g_guid[2], guidp->g_guid[3], + guidp->g_guid[4], guidp->g_guid[5], guidp->g_guid[6], guidp->g_guid[7], + guidp->g_guid[8], guidp->g_guid[9], guidp->g_guid[10], guidp->g_guid[11], + guidp->g_guid[12], guidp->g_guid[13], guidp->g_guid[14], guidp->g_guid[15], + isgroup ? "G" : " ", id2); + if (!error && id2) { + if (compare) { + id2len = strnlen(id2, MAXPATHLEN); + } else { + /* copy fallback result to output buffer */ + len = strlcpy(id, id2, *idlen); + if (len >= *idlen) + error = ENOSPC; + else + *idlen = len; + } + } + } + + if (compare) { + /* compare the results, log if different */ + if (!error1 && !error) { + if ((id1len != id2len) || strncmp(id1, id2, id1len)) + printf("nfs4_guid2id: idmap/fallback results differ for " + "%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x %s " + "idmap %s fallback %s\n", + guidp->g_guid[0], guidp->g_guid[1], guidp->g_guid[2], guidp->g_guid[3], + guidp->g_guid[4], guidp->g_guid[5], guidp->g_guid[6], guidp->g_guid[7], + guidp->g_guid[8], guidp->g_guid[9], guidp->g_guid[10], guidp->g_guid[11], + guidp->g_guid[12], guidp->g_guid[13], guidp->g_guid[14], guidp->g_guid[15], + isgroup ? "G" : " ", id1, id2); + if (id1 == id1buf) { + /* copy idmap result to output buffer */ + len = strlcpy(id, id1, *idlen); + if (len >= *idlen) + error = ENOSPC; + else + *idlen = len; + } + } else if (error1 && !error) { + printf("nfs4_guid2id: idmap/fallback results differ for " + "%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x %s " + "idmap error %d fallback %s\n", + guidp->g_guid[0], guidp->g_guid[1], guidp->g_guid[2], guidp->g_guid[3], + guidp->g_guid[4], guidp->g_guid[5], guidp->g_guid[6], guidp->g_guid[7], + guidp->g_guid[8], guidp->g_guid[9], guidp->g_guid[10], guidp->g_guid[11], + guidp->g_guid[12], guidp->g_guid[13], guidp->g_guid[14], guidp->g_guid[15], + isgroup ? 
"G" : " ", error1, id2); + /* copy fallback result to output buffer */ + len = strlcpy(id, id2, *idlen); + if (len >= *idlen) + error = ENOSPC; + else + *idlen = len; + } else if (!error1 && error) { + printf("nfs4_guid2id: idmap/fallback results differ for " + "%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x_%02x%02x%02x%02x %s " + "idmap %s fallback error %d\n", + guidp->g_guid[0], guidp->g_guid[1], guidp->g_guid[2], guidp->g_guid[3], + guidp->g_guid[4], guidp->g_guid[5], guidp->g_guid[6], guidp->g_guid[7], + guidp->g_guid[8], guidp->g_guid[9], guidp->g_guid[10], guidp->g_guid[11], + guidp->g_guid[12], guidp->g_guid[13], guidp->g_guid[14], guidp->g_guid[15], + isgroup ? "G" : " ", id1, error); + if (id1 == id1buf) { + /* copy idmap result to output buffer */ + len = strlcpy(id, id1, *idlen); + if (len >= *idlen) + error = ENOSPC; + else + *idlen = len; + } + error = 0; + } else { + if (error1 != error) + printf("nfs4_guid2id: idmap/fallback results differ for %s %s - " + "idmap error %d fallback error %d\n", + id, isgroup ? "G" : " ", error1, error); + } + } + if (id1buf) + FREE_ZONE(id1buf, MAXPATHLEN, M_NAMEI); + return (error); +} + + /* * Set a vnode attr's supported bits according to the given bitmap */ @@ -403,11 +1461,10 @@ nfs_vattr_set_supported(uint32_t *bitmap, struct vnode_attr *vap) // if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_CHANGE)) if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_SIZE)) VATTR_SET_SUPPORTED(vap, va_data_size); - // if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_NAMED_ATTR)) if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_FSID)) VATTR_SET_SUPPORTED(vap, va_fsid); -// if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_ACL)) -// VATTR_SET_SUPPORTED(vap, va_acl); + if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_ACL)) + VATTR_SET_SUPPORTED(vap, va_acl); if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_ARCHIVE)) VATTR_SET_SUPPORTED(vap, va_flags); if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_FILEID)) @@ -419,10 +1476,14 @@ nfs_vattr_set_supported(uint32_t *bitmap, struct vnode_attr *vap) VATTR_SET_SUPPORTED(vap, va_mode); if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_NUMLINKS)) VATTR_SET_SUPPORTED(vap, va_nlink); - if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_OWNER)) + if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_OWNER)) { VATTR_SET_SUPPORTED(vap, va_uid); - if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_OWNER_GROUP)) + VATTR_SET_SUPPORTED(vap, va_uuuid); + } + if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_OWNER_GROUP)) { VATTR_SET_SUPPORTED(vap, va_gid); + VATTR_SET_SUPPORTED(vap, va_guuid); + } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_RAWDEV)) VATTR_SET_SUPPORTED(vap, va_rdev); if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_SPACE_USED)) @@ -450,15 +1511,20 @@ nfs4_parsefattr( struct nfs_fsattr *nfsap, struct nfs_vattr *nvap, fhandle_t *fhp, - struct dqblk *dqbp) + struct dqblk *dqbp, + struct nfs_fs_locations *nfslsp) { - int error = 0, attrbytes; - uint32_t val, val2, val3, i, j; - uint32_t bitmap[NFS_ATTR_BITMAP_LEN], len; - char *s; + int error = 0, error2, rderror = 0, attrbytes; + uint32_t val, val2, val3, i; + uint32_t bitmap[NFS_ATTR_BITMAP_LEN], len, slen; + char sbuf[64], *s; struct nfs_fsattr nfsa_dummy; struct nfs_vattr nva_dummy; struct dqblk dqb_dummy; + kauth_acl_t acl = NULL; + uint32_t ace_type, ace_flags, ace_mask; + struct nfs_fs_locations nfsls_dummy; + struct sockaddr_storage ss; /* if not interested in some values... 
throw 'em into a local dummy variable */ if (!nfsap) @@ -467,8 +1533,14 @@ nfs4_parsefattr( nvap = &nva_dummy; if (!dqbp) dqbp = &dqb_dummy; + if (!nfslsp) + nfslsp = &nfsls_dummy; + bzero(nfslsp, sizeof(*nfslsp)); attrbytes = val = val2 = val3 = 0; + s = sbuf; + slen = sizeof(sbuf); + NVATTR_INIT(nvap); len = NFS_ATTR_BITMAP_LEN; nfsm_chain_get_bitmap(error, nmc, bitmap, len); @@ -489,17 +1561,19 @@ nfs4_parsefattr( if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_TYPE)) { nfsm_chain_get_32(error, nmc, val); nvap->nva_type = nfstov_type(val, NFS_VER4); + if ((val == NFATTRDIR) || (val == NFNAMEDATTR)) + nvap->nva_flags |= NFS_FFLAG_IS_ATTR; + else + nvap->nva_flags &= ~NFS_FFLAG_IS_ATTR; attrbytes -= NFSX_UNSIGNED; } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_FH_EXPIRE_TYPE)) { nfsm_chain_get_32(error, nmc, val); nfsmout_if(error); - if (val != NFS_FH_PERSISTENT) - printf("nfs: warning: non-persistent file handles!\n"); - if (val & ~0xff) - printf("nfs: warning unknown fh type: 0x%x\n", val); nfsap->nfsa_flags &= ~NFS_FSFLAG_FHTYPE_MASK; nfsap->nfsa_flags |= val << NFS_FSFLAG_FHTYPE_SHIFT; + if (val & ~0xff) + printf("nfs: warning unknown fh type: 0x%x\n", val); attrbytes -= NFSX_UNSIGNED; } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_CHANGE)) { @@ -529,9 +1603,9 @@ nfs4_parsefattr( if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_NAMED_ATTR)) { nfsm_chain_get_32(error, nmc, val); if (val) - nvap->nva_flags |= NFS_FFLAG_NAMED_ATTR; + nvap->nva_flags |= NFS_FFLAG_HAS_NAMED_ATTRS; else - nvap->nva_flags &= ~NFS_FFLAG_NAMED_ATTR; + nvap->nva_flags &= ~NFS_FFLAG_HAS_NAMED_ATTRS; attrbytes -= NFSX_UNSIGNED; } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_FSID)) { @@ -552,26 +1626,79 @@ nfs4_parsefattr( attrbytes -= NFSX_UNSIGNED; } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_RDATTR_ERROR)) { - nfsm_chain_get_32(error, nmc, error); + nfsm_chain_get_32(error, nmc, rderror); attrbytes -= NFSX_UNSIGNED; - nfsmout_if(error); + if (!rderror) { /* no error */ + NFS_BITMAP_CLR(bitmap, NFS_FATTR_RDATTR_ERROR); + NFS_BITMAP_CLR(nvap->nva_bitmap, NFS_FATTR_RDATTR_ERROR); + } } - if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_ACL)) { /* skip for now */ + if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_ACL)) { + error2 = 0; + ace_type = ace_flags = ace_mask = 0; nfsm_chain_get_32(error, nmc, val); /* ACE count */ + if (!error && (val > KAUTH_ACL_MAX_ENTRIES)) + error = EOVERFLOW; + if (!error && !((acl = kauth_acl_alloc(val)))) + error = ENOMEM; + if (!error && acl) { + acl->acl_entrycount = val; + acl->acl_flags = 0; + } + attrbytes -= NFSX_UNSIGNED; + nfsm_assert(error, (attrbytes >= 0), EBADRPC); for (i=0; !error && (i < val); i++) { - nfsm_chain_adv(error, nmc, 3 * NFSX_UNSIGNED); - nfsm_chain_get_32(error, nmc, val2); /* string length */ - nfsm_chain_adv(error, nmc, nfsm_rndup(val2)); - attrbytes -= 4*NFSX_UNSIGNED + nfsm_rndup(val2); + nfsm_chain_get_32(error, nmc, ace_type); + nfsm_chain_get_32(error, nmc, ace_flags); + nfsm_chain_get_32(error, nmc, ace_mask); + nfsm_chain_get_32(error, nmc, len); + acl->acl_ace[i].ace_flags = nfs4_ace_nfstype_to_vfstype(ace_type, &error); + acl->acl_ace[i].ace_flags |= nfs4_ace_nfsflags_to_vfsflags(ace_flags); + acl->acl_ace[i].ace_rights = nfs4_ace_nfsmask_to_vfsrights(ace_mask); + if (!error && !error2 && (len >= slen)) { + if (s != sbuf) { + FREE(s, M_TEMP); + s = sbuf; + slen = sizeof(sbuf); + } + MALLOC(s, char*, len+16, M_TEMP, M_WAITOK); + if (s) + slen = len+16; + else + error2 = ENOMEM; + } + if (error2) + nfsm_chain_adv(error, nmc, nfsm_rndup(len)); + else + nfsm_chain_get_opaque(error, nmc, len, s); + if 
(!error && !error2) { + s[len] = '\0'; + error2 = nfs4_id2guid(s, &acl->acl_ace[i].ace_applicable, + (ace_flags & NFS_ACE_IDENTIFIER_GROUP)); + if (error2 && (nfs_idmap_ctrl & NFS_IDMAP_CTRL_LOG_FAILED_MAPPINGS)) + printf("nfs4_parsefattr: ACE WHO %s is no one, no guid?, error %d\n", s, error2); + } + attrbytes -= 4*NFSX_UNSIGNED + nfsm_rndup(len); nfsm_assert(error, (attrbytes >= 0), EBADRPC); } + nfsmout_if(error); + if ((nvap != &nva_dummy) && !error2) { + nvap->nva_acl = acl; + acl = NULL; + } } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_ACLSUPPORT)) { + /* + * Support ACLs if: the server supports DENY/ALLOC ACEs and + * (just to be safe) FATTR_ACL is in the supported list too. + */ nfsm_chain_get_32(error, nmc, val); - if (val) + if ((val & (NFS_ACL_SUPPORT_ALLOW_ACL|NFS_ACL_SUPPORT_DENY_ACL)) && + NFS_BITMAP_ISSET(nfsap->nfsa_supp_attr, NFS_FATTR_ACL)) { nfsap->nfsa_flags |= NFS_FSFLAG_ACL; - else + } else { nfsap->nfsa_flags &= ~NFS_FSFLAG_ACL; + } attrbytes -= NFSX_UNSIGNED; } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_ARCHIVE)) { /* SF_ARCHIVED */ @@ -640,23 +1767,151 @@ nfs4_parsefattr( nfsm_chain_get_64(error, nmc, nfsap->nfsa_files_total); attrbytes -= 2 * NFSX_UNSIGNED; } - if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_FS_LOCATIONS)) { /* skip for now */ - nfsm_chain_get_32(error, nmc, val); /* root path length */ - nfsm_chain_adv(error, nmc, nfsm_rndup(val)); /* root path */ - attrbytes -= (2 * NFSX_UNSIGNED) + nfsm_rndup(val); - nfsm_chain_get_32(error, nmc, val); /* location count */ - for (i=0; !error && (i < val); i++) { - nfsm_chain_get_32(error, nmc, val2); /* server string length */ - nfsm_chain_adv(error, nmc, nfsm_rndup(val2)); /* server string */ - attrbytes -= (2 * NFSX_UNSIGNED) + nfsm_rndup(val2); - nfsm_chain_get_32(error, nmc, val2); /* pathname component count */ - for (j=0; !error && (j < val2); j++) { - nfsm_chain_get_32(error, nmc, val3); /* component length */ - nfsm_chain_adv(error, nmc, nfsm_rndup(val3)); /* component */ - attrbytes -= NFSX_UNSIGNED + nfsm_rndup(val3); - nfsm_assert(error, (attrbytes >= 0), EBADRPC); + if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_FS_LOCATIONS)) { + uint32_t loc, serv, comp; + struct nfs_fs_location *fsl; + struct nfs_fs_server *fss; + struct nfs_fs_path *fsp; + + /* get root pathname */ + fsp = &nfslsp->nl_root; + nfsm_chain_get_32(error, nmc, fsp->np_compcount); /* component count */ + attrbytes -= NFSX_UNSIGNED; + /* sanity check component count */ + if (!error && (fsp->np_compcount > MAXPATHLEN)) + error = EBADRPC; + nfsmout_if(error); + if (fsp->np_compcount) { + MALLOC(fsp->np_components, char **, fsp->np_compcount * sizeof(char*), M_TEMP, M_WAITOK|M_ZERO); + if (!fsp->np_components) + error = ENOMEM; + } + for (comp = 0; comp < fsp->np_compcount; comp++) { + nfsm_chain_get_32(error, nmc, val); /* component length */ + /* sanity check component length */ + if (!error && (val == 0)) { + /* + * Apparently some people think a path with zero components should + * be encoded with one zero-length component. So, just ignore any + * zero length components. 
+ */ + comp--; + fsp->np_compcount--; + if (fsp->np_compcount == 0) { + FREE(fsp->np_components, M_TEMP); + fsp->np_components = NULL; + } + attrbytes -= NFSX_UNSIGNED; + continue; + } + if (!error && ((val < 1) || (val > MAXPATHLEN))) + error = EBADRPC; + nfsmout_if(error); + MALLOC(fsp->np_components[comp], char *, val+1, M_TEMP, M_WAITOK|M_ZERO); + if (!fsp->np_components[comp]) + error = ENOMEM; + nfsmout_if(error); + nfsm_chain_get_opaque(error, nmc, val, fsp->np_components[comp]); /* component */ + attrbytes -= NFSX_UNSIGNED + nfsm_rndup(val); + } + nfsm_chain_get_32(error, nmc, nfslsp->nl_numlocs); /* fs location count */ + attrbytes -= NFSX_UNSIGNED; + /* sanity check location count */ + if (!error && (nfslsp->nl_numlocs > 256)) + error = EBADRPC; + nfsmout_if(error); + if (nfslsp->nl_numlocs > 0) { + MALLOC(nfslsp->nl_locations, struct nfs_fs_location **, nfslsp->nl_numlocs * sizeof(struct nfs_fs_location*), M_TEMP, M_WAITOK|M_ZERO); + if (!nfslsp->nl_locations) + error = ENOMEM; + } + nfsmout_if(error); + for (loc = 0; loc < nfslsp->nl_numlocs; loc++) { + nfsmout_if(error); + MALLOC(fsl, struct nfs_fs_location *, sizeof(struct nfs_fs_location), M_TEMP, M_WAITOK|M_ZERO); + if (!fsl) + error = ENOMEM; + nfslsp->nl_locations[loc] = fsl; + nfsm_chain_get_32(error, nmc, fsl->nl_servcount); /* server count */ + attrbytes -= NFSX_UNSIGNED; + /* sanity check server count */ + if (!error && ((fsl->nl_servcount < 1) || (fsl->nl_servcount > 256))) + error = EBADRPC; + nfsmout_if(error); + MALLOC(fsl->nl_servers, struct nfs_fs_server **, fsl->nl_servcount * sizeof(struct nfs_fs_server*), M_TEMP, M_WAITOK|M_ZERO); + if (!fsl->nl_servers) + error = ENOMEM; + for (serv = 0; serv < fsl->nl_servcount; serv++) { + nfsmout_if(error); + MALLOC(fss, struct nfs_fs_server *, sizeof(struct nfs_fs_server), M_TEMP, M_WAITOK|M_ZERO); + if (!fss) + error = ENOMEM; + fsl->nl_servers[serv] = fss; + nfsm_chain_get_32(error, nmc, val); /* server name length */ + /* sanity check server name length */ + if (!error && ((val < 1) || (val > MAXPATHLEN))) + error = EINVAL; + nfsmout_if(error); + MALLOC(fss->ns_name, char *, val+1, M_TEMP, M_WAITOK|M_ZERO); + if (!fss->ns_name) + error = ENOMEM; + nfsm_chain_get_opaque(error, nmc, val, fss->ns_name); /* server name */ + attrbytes -= NFSX_UNSIGNED + nfsm_rndup(val); + nfsmout_if(error); + /* copy name to address if it converts to a sockaddr */ + if (nfs_uaddr2sockaddr(fss->ns_name, (struct sockaddr*)&ss)) { + fss->ns_addrcount = 1; + MALLOC(fss->ns_addresses, char **, sizeof(char *), M_TEMP, M_WAITOK|M_ZERO); + if (!fss->ns_addresses) + error = ENOMEM; + nfsmout_if(error); + MALLOC(fss->ns_addresses[0], char *, val+1, M_TEMP, M_WAITOK|M_ZERO); + if (!fss->ns_addresses[0]) + error = ENOMEM; + nfsmout_if(error); + strlcpy(fss->ns_addresses[0], fss->ns_name, val+1); + } + } + /* get pathname */ + fsp = &fsl->nl_path; + nfsm_chain_get_32(error, nmc, fsp->np_compcount); /* component count */ + attrbytes -= NFSX_UNSIGNED; + /* sanity check component count */ + if (!error && (fsp->np_compcount > MAXPATHLEN)) + error = EINVAL; + nfsmout_if(error); + if (fsp->np_compcount) { + MALLOC(fsp->np_components, char **, fsp->np_compcount * sizeof(char*), M_TEMP, M_WAITOK|M_ZERO); + if (!fsp->np_components) + error = ENOMEM; + } + for (comp = 0; comp < fsp->np_compcount; comp++) { + nfsm_chain_get_32(error, nmc, val); /* component length */ + /* sanity check component length */ + if (!error && (val == 0)) { + /* + * Apparently some people think a path with zero components should + * be 
encoded with one zero-length component. So, just ignore any + * zero length components. + */ + comp--; + fsp->np_compcount--; + if (fsp->np_compcount == 0) { + FREE(fsp->np_components, M_TEMP); + fsp->np_components = NULL; + } + attrbytes -= NFSX_UNSIGNED; + continue; + } + if (!error && ((val < 1) || (val > MAXPATHLEN))) + error = EINVAL; + nfsmout_if(error); + MALLOC(fsp->np_components[comp], char *, val+1, M_TEMP, M_WAITOK|M_ZERO); + if (!fsp->np_components[comp]) + error = ENOMEM; + nfsm_chain_get_opaque(error, nmc, val, fsp->np_components[comp]); /* component */ + attrbytes -= NFSX_UNSIGNED + nfsm_rndup(val); } - nfsm_assert(error, (attrbytes >= 0), EBADRPC); } nfsm_assert(error, (attrbytes >= 0), EBADRPC); } @@ -724,34 +1979,68 @@ nfs4_parsefattr( attrbytes -= NFSX_UNSIGNED; } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_OWNER)) { - /* XXX Need ID mapping infrastructure - use ugly hack for now */ nfsm_chain_get_32(error, nmc, len); - nfsm_chain_get_opaque_pointer(error, nmc, len, s); + if (!error && (len >= slen)) { + if (s != sbuf) { + FREE(s, M_TEMP); + s = sbuf; + slen = sizeof(sbuf); + } + MALLOC(s, char*, len+16, M_TEMP, M_WAITOK); + if (s) + slen = len+16; + else + error = ENOMEM; + } + nfsm_chain_get_opaque(error, nmc, len, s); + if (!error) { + s[len] = '\0'; + error = nfs4_id2guid(s, &nvap->nva_uuuid, 0); + if (!error) + error = kauth_cred_guid2uid(&nvap->nva_uuuid, &nvap->nva_uid); + if (error) { + /* unable to get either GUID or UID, set to default */ + nvap->nva_uid = (uid_t)((nfs_idmap_ctrl & NFS_IDMAP_CTRL_UNKNOWN_IS_99) ? 99 : -2); + if (nfs_idmap_ctrl & NFS_IDMAP_CTRL_LOG_FAILED_MAPPINGS) + printf("nfs4_parsefattr: owner %s is no one, no %s?, error %d\n", s, + kauth_guid_equal(&nvap->nva_uuuid, &kauth_null_guid) ? "guid" : "uid", + error); + error = 0; + } + } attrbytes -= NFSX_UNSIGNED + nfsm_rndup(len); - nfsmout_if(error); - if ((*s >= '0') && (*s <= '9')) - nvap->nva_uid = strtol(s, NULL, 10); - else if (!strncmp(s, "nobody@", 7)) - nvap->nva_uid = -2; - else if (!strncmp(s, "root@", 5)) - nvap->nva_uid = 0; - else - nvap->nva_uid = 99; /* unknown */ } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_OWNER_GROUP)) { - /* XXX Need ID mapping infrastructure - use ugly hack for now */ nfsm_chain_get_32(error, nmc, len); - nfsm_chain_get_opaque_pointer(error, nmc, len, s); + if (!error && (len >= slen)) { + if (s != sbuf) { + FREE(s, M_TEMP); + s = sbuf; + slen = sizeof(sbuf); + } + MALLOC(s, char*, len+16, M_TEMP, M_WAITOK); + if (s) + slen = len+16; + else + error = ENOMEM; + } + nfsm_chain_get_opaque(error, nmc, len, s); + if (!error) { + s[len] = '\0'; + error = nfs4_id2guid(s, &nvap->nva_guuid, 1); + if (!error) + error = kauth_cred_guid2gid(&nvap->nva_guuid, &nvap->nva_gid); + if (error) { + /* unable to get either GUID or GID, set to default */ + nvap->nva_gid = (gid_t)((nfs_idmap_ctrl & NFS_IDMAP_CTRL_UNKNOWN_IS_99) ? 99 : -2); + if (nfs_idmap_ctrl & NFS_IDMAP_CTRL_LOG_FAILED_MAPPINGS) + printf("nfs4_parsefattr: group %s is no one, no %s?, error %d\n", s, + kauth_guid_equal(&nvap->nva_guuid, &kauth_null_guid) ? 
"guid" : "gid", + error); + error = 0; + } + } attrbytes -= NFSX_UNSIGNED + nfsm_rndup(len); - nfsmout_if(error); - if ((*s >= '0') && (*s <= '9')) - nvap->nva_gid = strtol(s, NULL, 10); - else if (!strncmp(s, "nobody@", 7)) - nvap->nva_gid = -2; - else if (!strncmp(s, "root@", 5)) - nvap->nva_uid = 0; - else - nvap->nva_gid = 99; /* unknown */ } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_QUOTA_AVAIL_HARD)) { nfsm_chain_get_64(error, nmc, dqbp->dqb_bhardlimit); @@ -828,14 +2117,32 @@ nfs4_parsefattr( nfsm_chain_adv(error, nmc, 4*NFSX_UNSIGNED); /* just skip it */ attrbytes -= 4 * NFSX_UNSIGNED; } - if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_MOUNTED_ON_FILEID)) { /* skip for now */ + if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_MOUNTED_ON_FILEID)) { +#if CONFIG_TRIGGERS + /* we prefer the mounted on file ID, so just replace the fileid */ + nfsm_chain_get_64(error, nmc, nvap->nva_fileid); +#else nfsm_chain_adv(error, nmc, 2*NFSX_UNSIGNED); +#endif attrbytes -= 2 * NFSX_UNSIGNED; } /* advance over any leftover attrbytes */ nfsm_assert(error, (attrbytes >= 0), EBADRPC); nfsm_chain_adv(error, nmc, nfsm_rndup(attrbytes)); nfsmout: + if (error) + nfs_fs_locations_cleanup(nfslsp); + if (!error && rderror) + error = rderror; + /* free up temporary resources */ + if (s && (s != sbuf)) + FREE(s, M_TEMP); + if (acl) + kauth_acl_free(acl); + if (error && nvap->nva_acl) { + kauth_acl_free(nvap->nva_acl); + nvap->nva_acl = NULL; + } return (error); } @@ -845,51 +2152,18 @@ nfsmout: int nfsm_chain_add_fattr4_f(struct nfsm_chain *nmc, struct vnode_attr *vap, struct nfsmount *nmp) { - int error = 0, attrbytes, slen, i; - uint32_t *pattrbytes; + int error = 0, attrbytes, slen, len, i, isgroup; + uint32_t *pattrbytes, val, acecount;; uint32_t bitmap[NFS_ATTR_BITMAP_LEN]; - char s[32]; + char sbuf[64], *s; + kauth_acl_t acl; + gid_t gid; - /* - * Do this in two passes. - * First calculate the bitmap, then pack - * everything together and set the size. - */ + s = sbuf; + slen = sizeof(sbuf); - NFS_CLEAR_ATTRIBUTES(bitmap); - if (VATTR_IS_ACTIVE(vap, va_data_size)) - NFS_BITMAP_SET(bitmap, NFS_FATTR_SIZE); - if (VATTR_IS_ACTIVE(vap, va_acl)) { - // NFS_BITMAP_SET(bitmap, NFS_FATTR_ACL) - } - if (VATTR_IS_ACTIVE(vap, va_flags)) { - NFS_BITMAP_SET(bitmap, NFS_FATTR_ARCHIVE); - NFS_BITMAP_SET(bitmap, NFS_FATTR_HIDDEN); - } - // NFS_BITMAP_SET(bitmap, NFS_FATTR_MIMETYPE) - if (VATTR_IS_ACTIVE(vap, va_mode)) - NFS_BITMAP_SET(bitmap, NFS_FATTR_MODE); - if (VATTR_IS_ACTIVE(vap, va_uid)) - NFS_BITMAP_SET(bitmap, NFS_FATTR_OWNER); - if (VATTR_IS_ACTIVE(vap, va_gid)) - NFS_BITMAP_SET(bitmap, NFS_FATTR_OWNER_GROUP); - // NFS_BITMAP_SET(bitmap, NFS_FATTR_SYSTEM) - if (vap->va_vaflags & VA_UTIMES_NULL) { - NFS_BITMAP_SET(bitmap, NFS_FATTR_TIME_ACCESS_SET); - NFS_BITMAP_SET(bitmap, NFS_FATTR_TIME_MODIFY_SET); - } else { - if (VATTR_IS_ACTIVE(vap, va_access_time)) - NFS_BITMAP_SET(bitmap, NFS_FATTR_TIME_ACCESS_SET); - if (VATTR_IS_ACTIVE(vap, va_modify_time)) - NFS_BITMAP_SET(bitmap, NFS_FATTR_TIME_MODIFY_SET); - } - if (VATTR_IS_ACTIVE(vap, va_backup_time)) - NFS_BITMAP_SET(bitmap, NFS_FATTR_TIME_BACKUP); - if (VATTR_IS_ACTIVE(vap, va_create_time)) - NFS_BITMAP_SET(bitmap, NFS_FATTR_TIME_CREATE); - /* and limit to what is supported by server */ - for (i=0; i < NFS_ATTR_BITMAP_LEN; i++) - bitmap[i] &= nmp->nm_fsattr.nfsa_supp_attr[i]; + /* First calculate the bitmap... 
*/ + nfs_vattr_set_bitmap(nmp, bitmap, vap); /* * Now pack it all together: @@ -905,7 +2179,43 @@ nfsm_chain_add_fattr4_f(struct nfsm_chain *nmc, struct vnode_attr *vap, struct n nfsm_chain_add_64(error, nmc, vap->va_data_size); attrbytes += 2*NFSX_UNSIGNED; } - // NFS_BITMAP_ISSET(bitmap, NFS_FATTR_ACL) + if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_ACL)) { + acl = vap->va_acl; + if (!acl || (acl->acl_entrycount == KAUTH_FILESEC_NOACL)) + acecount = 0; + else + acecount = acl->acl_entrycount; + nfsm_chain_add_32(error, nmc, acecount); + attrbytes += NFSX_UNSIGNED; + for (i=0; !error && (i < (int)acecount); i++) { + val = (acl->acl_ace[i].ace_flags & KAUTH_ACE_KINDMASK); + val = nfs4_ace_vfstype_to_nfstype(val, &error); + nfsm_chain_add_32(error, nmc, val); + val = nfs4_ace_vfsflags_to_nfsflags(acl->acl_ace[i].ace_flags); + nfsm_chain_add_32(error, nmc, val); + val = nfs4_ace_vfsrights_to_nfsmask(acl->acl_ace[i].ace_rights); + nfsm_chain_add_32(error, nmc, val); + len = slen; + isgroup = (kauth_cred_guid2gid(&acl->acl_ace[i].ace_applicable, &gid) == 0); + error = nfs4_guid2id(&acl->acl_ace[i].ace_applicable, s, &len, isgroup); + if (error == ENOSPC) { + if (s != sbuf) { + FREE(s, M_TEMP); + s = sbuf; + } + len += 8; + MALLOC(s, char*, len, M_TEMP, M_WAITOK); + if (s) { + slen = len; + error = nfs4_guid2id(&acl->acl_ace[i].ace_applicable, s, &len, isgroup); + } else { + error = ENOMEM; + } + } + nfsm_chain_add_name(error, nmc, s, len, nmp); + attrbytes += 4*NFSX_UNSIGNED + nfsm_rndup(len); + } + } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_ARCHIVE)) { nfsm_chain_add_32(error, nmc, (vap->va_flags & SF_ARCHIVED) ? 1 : 0); attrbytes += NFSX_UNSIGNED; @@ -920,26 +2230,56 @@ nfsm_chain_add_fattr4_f(struct nfsm_chain *nmc, struct vnode_attr *vap, struct n attrbytes += NFSX_UNSIGNED; } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_OWNER)) { - /* XXX Need ID mapping infrastructure - use ugly hack for now */ - if (vap->va_uid == 0) - slen = snprintf(s, sizeof(s), "root@localdomain"); - else if (vap->va_uid == (uid_t)-2) - slen = snprintf(s, sizeof(s), "nobody@localdomain"); - else - slen = snprintf(s, sizeof(s), "%d", vap->va_uid); - nfsm_chain_add_string(error, nmc, s, slen); - attrbytes += NFSX_UNSIGNED + nfsm_rndup(slen); + nfsmout_if(error); + /* if we have va_uuuid use it, otherwise use uid */ + if (!VATTR_IS_ACTIVE(vap, va_uuuid)) { + error = kauth_cred_uid2guid(vap->va_uid, &vap->va_uuuid); + nfsmout_if(error); + } + len = slen; + error = nfs4_guid2id(&vap->va_uuuid, s, &len, 0); + if (error == ENOSPC) { + if (s != sbuf) { + FREE(s, M_TEMP); + s = sbuf; + } + len += 8; + MALLOC(s, char*, len, M_TEMP, M_WAITOK); + if (s) { + slen = len; + error = nfs4_guid2id(&vap->va_uuuid, s, &len, 0); + } else { + error = ENOMEM; + } + } + nfsm_chain_add_name(error, nmc, s, len, nmp); + attrbytes += NFSX_UNSIGNED + nfsm_rndup(len); } if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_OWNER_GROUP)) { - /* XXX Need ID mapping infrastructure - use ugly hack for now */ - if (vap->va_gid == 0) - slen = snprintf(s, sizeof(s), "root@localdomain"); - else if (vap->va_gid == (gid_t)-2) - slen = snprintf(s, sizeof(s), "nobody@localdomain"); - else - slen = snprintf(s, sizeof(s), "%d", vap->va_gid); - nfsm_chain_add_string(error, nmc, s, slen); - attrbytes += NFSX_UNSIGNED + nfsm_rndup(slen); + nfsmout_if(error); + /* if we have va_guuid use it, otherwise use gid */ + if (!VATTR_IS_ACTIVE(vap, va_guuid)) { + error = kauth_cred_gid2guid(vap->va_gid, &vap->va_guuid); + nfsmout_if(error); + } + len = slen; + error = nfs4_guid2id(&vap->va_guuid, s, 
&len, 1); + if (error == ENOSPC) { + if (s != sbuf) { + FREE(s, M_TEMP); + s = sbuf; + } + len += 8; + MALLOC(s, char*, len, M_TEMP, M_WAITOK); + if (s) { + slen = len; + error = nfs4_guid2id(&vap->va_guuid, s, &len, 1); + } else { + error = ENOMEM; + } + } + nfsm_chain_add_name(error, nmc, s, len, nmp); + attrbytes += NFSX_UNSIGNED + nfsm_rndup(len); } // NFS_BITMAP_SET(bitmap, NFS_FATTR_SYSTEM) if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_TIME_ACCESS_SET)) { @@ -978,16 +2318,100 @@ nfsm_chain_add_fattr4_f(struct nfsm_chain *nmc, struct vnode_attr *vap, struct n /* Now, set the attribute data length */ *pattrbytes = txdr_unsigned(attrbytes); nfsmout: + if (s && (s != sbuf)) + FREE(s, M_TEMP); return (error); } +/* + * Got the given error and need to start recovery (if not already started). + * Note: nmp must be locked! + */ +void +nfs_need_recover(struct nfsmount *nmp, int error) +{ + int wake = !(nmp->nm_state & NFSSTA_RECOVER); + + nmp->nm_state |= NFSSTA_RECOVER; + if ((error == NFSERR_ADMIN_REVOKED) || + (error == NFSERR_EXPIRED) || + (error == NFSERR_STALE_CLIENTID)) + nmp->nm_state |= NFSSTA_RECOVER_EXPIRED; + if (wake) + nfs_mount_sock_thread_wake(nmp); +} + +/* + * After recovery due to state expiry, check each node and + * drop any lingering delegation we thought we had. + * + * If a node has an open that is not lost and is not marked + * for reopen, then we hold onto any delegation because it is + * likely newly-granted. + */ +static void +nfs4_expired_check_delegation(nfsnode_t np, vfs_context_t ctx) +{ + struct nfsmount *nmp = NFSTONMP(np); + struct nfs_open_file *nofp; + int drop = 1; + + if ((np->n_flag & NREVOKE) || !(np->n_openflags & N_DELEG_MASK)) + return; + + lck_mtx_lock(&np->n_openlock); + + TAILQ_FOREACH(nofp, &np->n_opens, nof_link) { + if (!nofp->nof_opencnt) + continue; + if (nofp->nof_flags & NFS_OPEN_FILE_LOST) + continue; + if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) + continue; + /* we have an open that is not lost and not marked for reopen */ + // XXX print out what's keeping this node from dropping the delegation. + NP(nofp->nof_np, "nfs4_expired_check_delegation: !drop: opencnt %d flags 0x%x access %d %d mmap %d %d", + nofp->nof_opencnt, nofp->nof_flags, + nofp->nof_access, nofp->nof_deny, + nofp->nof_mmap_access, nofp->nof_mmap_deny); + drop = 0; + break; + } + + if (drop) { + /* need to drop a delegation */ + if (np->n_dreturn.tqe_next != NFSNOLIST) { + /* remove this node from the delegation return list */ + lck_mtx_lock(&nmp->nm_lock); + if (np->n_dreturn.tqe_next != NFSNOLIST) { + TAILQ_REMOVE(&nmp->nm_dreturnq, np, n_dreturn); + np->n_dreturn.tqe_next = NFSNOLIST; + } + lck_mtx_unlock(&nmp->nm_lock); + } + if (np->n_openflags & N_DELEG_MASK) { + np->n_openflags &= ~N_DELEG_MASK; + lck_mtx_lock(&nmp->nm_lock); + if (np->n_dlink.tqe_next != NFSNOLIST) { + TAILQ_REMOVE(&nmp->nm_delegations, np, n_dlink); + np->n_dlink.tqe_next = NFSNOLIST; + } + lck_mtx_unlock(&nmp->nm_lock); + nfs4_delegreturn_rpc(nmp, np->n_fhp, np->n_fhsize, &np->n_dstateid, + 0, vfs_context_thread(ctx), vfs_context_ucred(ctx)); + } + } + + lck_mtx_unlock(&np->n_openlock); +} + /* * Recover state for an NFS mount. * * Iterates over all open files, reclaiming opens and lock state. 
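+ * In outline, matching the code below:
+ *   1. drain requests still using the old state, then bump the state
+ *      generation (restarts are throttled to one per second)
+ *   2. for each open file: reclaim its opens (NFSv4 only; v2/v3 skip
+ *      straight to lock reclaim), claim delegated state if that is all
+ *      we hold, then reclaim each held byte-range lock
+ *   3. return any delegations marked for return; revoke state that
+ *      could not be reclaimed
+ *   4. if state expired, drop any delegations that now look stale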
*/ void -nfs4_recover(struct nfsmount *nmp) +nfs_recover(struct nfsmount *nmp) { struct timespec ts = { 1, 0 }; int error, lost, reopen; @@ -996,6 +2420,8 @@ nfs4_recover(struct nfsmount *nmp) struct nfs_file_lock *nflp, *nextnflp; struct nfs_lock_owner *nlop; thread_t thd = current_thread(); + nfsnode_t np, nextnp; + struct timeval now; restart: error = 0; @@ -1020,25 +2446,36 @@ restart: } while (nmp->nm_stateinuse); if (error) { if (error == EPIPE) - printf("nfs recovery reconnecting\n"); + printf("nfs recovery reconnecting for %s, 0x%x\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname, nmp->nm_stategenid); else - printf("nfs recovery aborted\n"); + printf("nfs recovery aborted for %s, 0x%x\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname, nmp->nm_stategenid); lck_mtx_unlock(&nmp->nm_lock); return; } - printf("nfs recovery started\n"); + microuptime(&now); + if (now.tv_sec == nmp->nm_recover_start) { + printf("nfs recovery throttled for %s, 0x%x\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname, nmp->nm_stategenid); + lck_mtx_unlock(&nmp->nm_lock); + tsleep(&lbolt, (PZERO-1), "nfsrecoverrestart", hz); + goto restart; + } + nmp->nm_recover_start = now.tv_sec; if (++nmp->nm_stategenid == 0) ++nmp->nm_stategenid; + printf("nfs recovery started for %s, 0x%x\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname, nmp->nm_stategenid); lck_mtx_unlock(&nmp->nm_lock); /* for each open owner... */ TAILQ_FOREACH(noop, &nmp->nm_open_owners, noo_link) { /* for each of its opens... */ TAILQ_FOREACH(nofp, &noop->noo_opens, nof_oolink) { - if (!nofp->nof_access || (nofp->nof_flags & NFS_OPEN_FILE_LOST)) + if (!nofp->nof_access || (nofp->nof_flags & NFS_OPEN_FILE_LOST) || (nofp->nof_np->n_flag & NREVOKE)) continue; lost = reopen = 0; + /* for NFSv2/v3, just skip straight to lock reclaim */ + if (nmp->nm_vers < NFS_VER4) + goto reclaim_locks; if (nofp->nof_rw_drw) error = nfs4_open_reclaim_rpc(nofp, NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_BOTH); if (!error && nofp->nof_w_drw) @@ -1056,45 +2493,80 @@ restart: */ if (!error && nofp->nof_rw) { error = nfs4_open_reclaim_rpc(nofp, NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_NONE); - if ((error == NFSERR_ADMIN_REVOKED) || (error == NFSERR_EXPIRED) || (error == NFSERR_NO_GRACE)) - reopen = 1; + if ((error == NFSERR_ADMIN_REVOKED) || (error == NFSERR_EXPIRED) || (error == NFSERR_NO_GRACE)) { + reopen = error; + error = 0; + } } - if (!error && nofp->nof_w) { + if (!error && !reopen && nofp->nof_w) { error = nfs4_open_reclaim_rpc(nofp, NFS_OPEN_SHARE_ACCESS_WRITE, NFS_OPEN_SHARE_DENY_NONE); - if ((error == NFSERR_ADMIN_REVOKED) || (error == NFSERR_EXPIRED) || (error == NFSERR_NO_GRACE)) - reopen = 1; + if ((error == NFSERR_ADMIN_REVOKED) || (error == NFSERR_EXPIRED) || (error == NFSERR_NO_GRACE)) { + reopen = error; + error = 0; + } } - if (!error && nofp->nof_r) { + if (!error && !reopen && nofp->nof_r) { error = nfs4_open_reclaim_rpc(nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE); - if ((error == NFSERR_ADMIN_REVOKED) || (error == NFSERR_EXPIRED) || (error == NFSERR_NO_GRACE)) - reopen = 1; + if ((error == NFSERR_ADMIN_REVOKED) || (error == NFSERR_EXPIRED) || (error == NFSERR_NO_GRACE)) { + reopen = error; + error = 0; + } } - if (error) { + /* + * If we hold delegated state but we don't have any non-delegated opens, + * then we should attempt to claim that state now (but don't return the + * delegation unless asked to). 
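+	 * "No non-delegated opens" here means all of the nof_rw_drw ... nof_r
+	 * counts are zero while at least one nof_d_* count is nonzero; the
+	 * nof_* fields count opens per (access, deny) mode combination.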
+ */ + if ((nofp->nof_d_rw_drw || nofp->nof_d_w_drw || nofp->nof_d_r_drw || + nofp->nof_d_rw_dw || nofp->nof_d_w_dw || nofp->nof_d_r_dw || + nofp->nof_d_rw || nofp->nof_d_w || nofp->nof_d_r) && + (!nofp->nof_rw_drw && !nofp->nof_w_drw && !nofp->nof_r_drw && + !nofp->nof_rw_dw && !nofp->nof_w_dw && !nofp->nof_r_dw && + !nofp->nof_rw && !nofp->nof_w && !nofp->nof_r)) { + if (!error && !nfs_open_state_set_busy(nofp->nof_np, NULL)) { + error = nfs4_claim_delegated_state_for_node(nofp->nof_np, R_RECOVER); + if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) + reopen = EAGAIN; + nfs_open_state_clear_busy(nofp->nof_np); + /* if claim didn't go well, we may need to return delegation now */ + if (nofp->nof_np->n_openflags & N_DELEG_RETURN) { + nfs4_delegation_return(nofp->nof_np, R_RECOVER, thd, noop->noo_cred); + if (!(nmp->nm_sockflags & NMSOCK_READY)) + error = ETIMEDOUT; /* looks like we need a reconnect */ + } + } + } + + /* + * Handle any issue claiming open state. + * Potential reopens need to first confirm that there are no locks. + */ + if (error || reopen) { /* restart recovery? */ if ((error == ETIMEDOUT) || nfs_mount_state_error_should_restart(error)) { if (error == ETIMEDOUT) nfs_need_reconnect(nmp); tsleep(&lbolt, (PZERO-1), "nfsrecoverrestart", 0); - printf("nfs recovery restarting %d\n", error); + printf("nfs recovery restarting for %s, 0x%x, error %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nmp->nm_stategenid, error); goto restart; } - if (reopen && (nfs4_check_for_locks(noop, nofp) == 0)) { + if (reopen && (nfs_check_for_locks(noop, nofp) == 0)) { /* just reopen the file on next access */ - const char *vname = vnode_getname(NFSTOV(nofp->nof_np)); - printf("nfs4_recover: %d, need reopen for %s\n", error, vname ? vname : "???"); - vnode_putname(vname); + NP(nofp->nof_np, "nfs_recover: %d, need reopen for %d %p 0x%x", reopen, + kauth_cred_getuid(noop->noo_cred), nofp->nof_np, nofp->nof_np->n_flag); lck_mtx_lock(&nofp->nof_lock); nofp->nof_flags |= NFS_OPEN_FILE_REOPEN; lck_mtx_unlock(&nofp->nof_lock); - error = 0; } else { /* open file state lost */ + if (reopen) + NP(nofp->nof_np, "nfs_recover: %d, can't reopen because of locks %d %p", reopen, + kauth_cred_getuid(noop->noo_cred), nofp->nof_np); lost = 1; error = 0; - lck_mtx_lock(&nofp->nof_lock); - nofp->nof_flags &= ~NFS_OPEN_FILE_REOPEN; - lck_mtx_unlock(&nofp->nof_lock); + reopen = 0; } } else { /* no error, so make sure the reopen flag isn't set */ @@ -1102,83 +2574,97 @@ restart: nofp->nof_flags &= ~NFS_OPEN_FILE_REOPEN; lck_mtx_unlock(&nofp->nof_lock); } + /* * Scan this node's lock owner list for entries with this open owner, * then walk the lock owner's held lock list recovering each lock. */ -rescanlocks: +reclaim_locks: TAILQ_FOREACH(nlop, &nofp->nof_np->n_lock_owners, nlo_link) { + if (lost || reopen) + break; if (nlop->nlo_open_owner != noop) continue; TAILQ_FOREACH_SAFE(nflp, &nlop->nlo_locks, nfl_lolink, nextnflp) { + /* skip dead & blocked lock requests (shouldn't be any in the held lock list) */ if (nflp->nfl_flags & (NFS_FILE_LOCK_DEAD|NFS_FILE_LOCK_BLOCKED)) continue; - if (!lost) { - error = nfs4_lock_rpc(nofp->nof_np, nofp, nflp, 1, thd, noop->noo_cred); - if (!error) - continue; - /* restart recovery? 
*/ - if ((error == ETIMEDOUT) || nfs_mount_state_error_should_restart(error)) { - if (error == ETIMEDOUT) - nfs_need_reconnect(nmp); - tsleep(&lbolt, (PZERO-1), "nfsrecoverrestart", 0); - printf("nfs recovery restarting %d\n", error); - goto restart; - } - /* lock state lost - attempt to close file */ - lost = 1; - error = nfs4_close_rpc(nofp->nof_np, nofp, NULL, noop->noo_cred, R_RECOVER); - if ((error == ETIMEDOUT) || nfs_mount_state_error_should_restart(error)) { - if (error == ETIMEDOUT) - nfs_need_reconnect(nmp); - tsleep(&lbolt, (PZERO-1), "nfsrecoverrestart", 0); - printf("nfs recovery restarting %d\n", error); - goto restart; - } - error = 0; - /* rescan locks so we can drop them all */ - goto rescanlocks; - } - if (lost) { - /* kill/remove the lock */ - lck_mtx_lock(&nofp->nof_np->n_openlock); - nflp->nfl_flags |= NFS_FILE_LOCK_DEAD; - lck_mtx_lock(&nlop->nlo_lock); - nextnflp = TAILQ_NEXT(nflp, nfl_lolink); - TAILQ_REMOVE(&nlop->nlo_locks, nflp, nfl_lolink); - lck_mtx_unlock(&nlop->nlo_lock); - if (nflp->nfl_blockcnt) { - /* wake up anyone blocked on this lock */ - wakeup(nflp); - } else { - /* remove nflp from lock list and destroy */ - TAILQ_REMOVE(&nofp->nof_np->n_locks, nflp, nfl_link); - nfs_file_lock_destroy(nflp); - } - lck_mtx_unlock(&nofp->nof_np->n_openlock); + /* skip delegated locks */ + if (nflp->nfl_flags & NFS_FILE_LOCK_DELEGATED) + continue; + error = nmp->nm_funcs->nf_setlock_rpc(nofp->nof_np, nofp, nflp, 1, R_RECOVER, thd, noop->noo_cred); + if (error) + NP(nofp->nof_np, "nfs: lock reclaim (0x%llx, 0x%llx) %s %d", + nflp->nfl_start, nflp->nfl_end, + error ? "failed" : "succeeded", error); + if (!error) + continue; + /* restart recovery? */ + if ((error == ETIMEDOUT) || nfs_mount_state_error_should_restart(error)) { + if (error == ETIMEDOUT) + nfs_need_reconnect(nmp); + tsleep(&lbolt, (PZERO-1), "nfsrecoverrestart", 0); + printf("nfs recovery restarting for %s, 0x%x, error %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nmp->nm_stategenid, error); + goto restart; } + /* lock state lost - attempt to close file */ + lost = 1; + error = 0; + break; } } + + /* + * If we've determined that we need to reopen the file then we probably + * didn't receive any delegation we think we hold. We should attempt to + * return that delegation (and claim any delegated state). + * + * If we hold a delegation that is marked for return, then we should + * return it now. + */ + if ((nofp->nof_np->n_openflags & N_DELEG_RETURN) || + (reopen && (nofp->nof_np->n_openflags & N_DELEG_MASK))) { + nfs4_delegation_return(nofp->nof_np, R_RECOVER, thd, noop->noo_cred); + if (!(nmp->nm_sockflags & NMSOCK_READY)) { + /* looks like we need a reconnect */ + tsleep(&lbolt, (PZERO-1), "nfsrecoverrestart", 0); + printf("nfs recovery restarting for %s, 0x%x, error %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nmp->nm_stategenid, error); + goto restart; + } + } + if (lost) { /* revoke open file state */ - lck_mtx_lock(&nofp->nof_lock); - nofp->nof_flags |= NFS_OPEN_FILE_LOST; - lck_mtx_unlock(&nofp->nof_lock); - const char *vname = vnode_getname(NFSTOV(nofp->nof_np)); - printf("nfs4_recover: state lost for %s\n", vname ? 
vname : "???"); - vnode_putname(vname); + NP(nofp->nof_np, "nfs_recover: state lost for %d %p 0x%x", + kauth_cred_getuid(noop->noo_cred), nofp->nof_np, nofp->nof_np->n_flag); + nfs_revoke_open_state_for_node(nofp->nof_np); } } } if (!error) { + /* If state expired, make sure we're not holding onto any stale delegations */ lck_mtx_lock(&nmp->nm_lock); - nmp->nm_state &= ~NFSSTA_RECOVER; + if ((nmp->nm_vers >= NFS_VER4) && (nmp->nm_state & NFSSTA_RECOVER_EXPIRED)) { +recheckdeleg: + TAILQ_FOREACH_SAFE(np, &nmp->nm_delegations, n_dlink, nextnp) { + lck_mtx_unlock(&nmp->nm_lock); + nfs4_expired_check_delegation(np, vfs_context_kernel()); + lck_mtx_lock(&nmp->nm_lock); + if (nextnp == NFSNOLIST) + goto recheckdeleg; + } + } + nmp->nm_state &= ~(NFSSTA_RECOVER|NFSSTA_RECOVER_EXPIRED); wakeup(&nmp->nm_state); - printf("nfs recovery completed\n"); + printf("nfs recovery completed for %s, 0x%x\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nmp->nm_stategenid); lck_mtx_unlock(&nmp->nm_lock); } else { - printf("nfs recovery failed %d\n", error); + printf("nfs recovery failed for %s, 0x%x, error %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nmp->nm_stategenid, error); } } diff --git a/bsd/nfs/nfs4_vnops.c b/bsd/nfs/nfs4_vnops.c index ffd12d88f..ca874aa7c 100644 --- a/bsd/nfs/nfs4_vnops.c +++ b/bsd/nfs/nfs4_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006-2009 Apple Inc. All rights reserved. + * Copyright (c) 2006-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -46,7 +46,9 @@ #include <sys/ubc_internal.h> #include <sys/attr.h> #include <sys/signalvar.h> -#include <sys/uio.h> +#include <sys/uio_internal.h> +#include <sys/xattr.h> +#include <sys/paths.h> #include <vfs/vfs_support.h> @@ -78,17 +80,22 @@ #include <kern/sched_prim.h> int -nfs4_access_rpc(nfsnode_t np, u_int32_t *mode, vfs_context_t ctx) +nfs4_access_rpc(nfsnode_t np, u_int32_t *access, vfs_context_t ctx) { int error = 0, lockerror = ENOENT, status, numops, slot; u_int64_t xid; struct nfsm_chain nmreq, nmrep; struct timeval now; - uint32_t access = 0, supported = 0, missing; + uint32_t access_result = 0, supported = 0, missing; struct nfsmount *nmp = NFSTONMP(np); int nfsvers = nmp->nm_vers; uid_t uid; + struct nfsreq_secinfo_args si; + if (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (0); + + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -101,15 +108,14 @@ nfs4_access_rpc(nfsnode_t np, u_int32_t *mode, vfs_context_t ctx) nfsm_chain_add_fh(error, &nmreq, nfsvers, np->n_fhp, np->n_fhsize); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_ACCESS); - nfsm_chain_add_32(error, &nmreq, *mode); + nfsm_chain_add_32(error, &nmreq, *access); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, np); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &si, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; @@ -118,9 +124,9 @@ nfs4_access_rpc(nfsnode_t np, u_int32_t *mode, vfs_context_t ctx) nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); nfsm_chain_op_check(error, &nmrep, NFS_OP_ACCESS); nfsm_chain_get_32(error, 
&nmrep, supported); - nfsm_chain_get_32(error, &nmrep, access); + nfsm_chain_get_32(error, &nmrep, access_result); nfsmout_if(error); - if ((missing = (*mode & ~supported))) { + if ((missing = (*access & ~supported))) { /* missing support for something(s) we wanted */ if (missing & NFS_ACCESS_DELETE) { /* @@ -129,25 +135,35 @@ nfs4_access_rpc(nfsnode_t np, u_int32_t *mode, vfs_context_t ctx) * and just let any subsequent delete action fail * if it really isn't deletable. */ - access |= NFS_ACCESS_DELETE; + access_result |= NFS_ACCESS_DELETE; } } + /* ".zfs" subdirectories may erroneously give a denied answer for modify/delete */ + if (nfs_access_dotzfs) { + vnode_t dvp = NULLVP; + if (np->n_flag & NISDOTZFSCHILD) /* may be able to create/delete snapshot dirs */ + access_result |= (NFS_ACCESS_MODIFY|NFS_ACCESS_EXTEND|NFS_ACCESS_DELETE); + else if (((dvp = vnode_getparent(NFSTOV(np))) != NULLVP) && (VTONFS(dvp)->n_flag & NISDOTZFSCHILD)) + access_result |= NFS_ACCESS_DELETE; /* may be able to delete snapshot dirs */ + if (dvp != NULLVP) + vnode_put(dvp); + } /* Some servers report DELETE support but erroneously give a denied answer. */ - if ((*mode & NFS_ACCESS_DELETE) && nfs_access_delete && !(access & NFS_ACCESS_DELETE)) - access |= NFS_ACCESS_DELETE; + if (nfs_access_delete && (*access & NFS_ACCESS_DELETE) && !(access_result & NFS_ACCESS_DELETE)) + access_result |= NFS_ACCESS_DELETE; nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, np, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid); nfsmout_if(error); uid = kauth_cred_getuid(vfs_context_ucred(ctx)); - slot = nfs_node_mode_slot(np, uid, 1); - np->n_modeuid[slot] = uid; + slot = nfs_node_access_slot(np, uid, 1); + np->n_accessuid[slot] = uid; microuptime(&now); - np->n_modestamp[slot] = now.tv_sec; - np->n_mode[slot] = access; + np->n_accessstamp[slot] = now.tv_sec; + np->n_access[slot] = access_result; - /* pass back the mode returned with this request */ - *mode = np->n_mode[slot]; + /* pass back the access returned with this request */ + *access = np->n_access[slot]; nfsmout: if (!lockerror) nfs_node_unlock(np); @@ -162,18 +178,31 @@ nfs4_getattr_rpc( mount_t mp, u_char *fhp, size_t fhsize, + int flags, vfs_context_t ctx, struct nfs_vattr *nvap, u_int64_t *xidp) { struct nfsmount *nmp = mp ? 
VFSTONFS(mp) : NFSTONMP(np); - int error = 0, status, nfsvers, numops; + int error = 0, status, nfsvers, numops, rpcflags = 0, acls; + uint32_t bitmap[NFS_ATTR_BITMAP_LEN]; struct nfsm_chain nmreq, nmrep; + struct nfsreq_secinfo_args si; if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + acls = (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_ACL); + + if (np && (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL)) { + nfs4_default_attrs_for_referral_trigger(VTONFS(np->n_parent), NULL, 0, nvap, NULL); + return (0); + } + + if (flags & NGA_MONITOR) /* vnode monitor requests should be soft */ + rpcflags = R_RECOVER; + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -186,20 +215,29 @@ nfs4_getattr_rpc( nfsm_chain_add_fh(error, &nmreq, nfsvers, fhp, fhsize); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, bitmap); + if ((flags & NGA_ACL) && acls) + NFS_BITMAP_SET(bitmap, NFS_FATTR_ACL); + nfsm_chain_add_bitmap_supported(error, &nmreq, bitmap, nmp, np); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request(np, mp, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, xidp, &status); + error = nfs_request2(np, mp, &nmreq, NFSPROC4_COMPOUND, + vfs_context_thread(ctx), vfs_context_ucred(ctx), + NULL, rpcflags, &nmrep, xidp, &status); nfsm_chain_skip_tag(error, &nmrep); nfsm_chain_get_32(error, &nmrep, numops); nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); nfsmout_if(error); - NFS_CLEAR_ATTRIBUTES(nvap->nva_bitmap); - error = nfs4_parsefattr(&nmrep, NULL, nvap, NULL, NULL); + error = nfs4_parsefattr(&nmrep, NULL, nvap, NULL, NULL, NULL); + nfsmout_if(error); + if ((flags & NGA_ACL) && acls && !NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_ACL)) { + /* we asked for the ACL but didn't get one... 
assume there isn't one */ + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_ACL); + nvap->nva_acl = NULL; + } nfsmout: nfsm_chain_cleanup(&nmreq); nfsm_chain_cleanup(&nmrep); @@ -214,10 +252,14 @@ nfs4_readlink_rpc(nfsnode_t np, char *buf, uint32_t *buflenp, vfs_context_t ctx) uint32_t len = 0; u_int64_t xid; struct nfsm_chain nmreq, nmrep; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(np); if (!nmp) return (ENXIO); + if (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -230,14 +272,13 @@ nfs4_readlink_rpc(nfsnode_t np, char *buf, uint32_t *buflenp, vfs_context_t ctx) nfsm_chain_add_fh(error, &nmreq, NFS_VER4, np->n_fhp, np->n_fhsize); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, np); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_READLINK); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &si, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; @@ -245,7 +286,7 @@ nfs4_readlink_rpc(nfsnode_t np, char *buf, uint32_t *buflenp, vfs_context_t ctx) nfsm_chain_get_32(error, &nmrep, numops); nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, np, NFS_VER4, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, NFS_VER4, &xid); nfsm_chain_op_check(error, &nmrep, NFS_OP_READLINK); nfsm_chain_get_32(error, &nmrep, len); nfsmout_if(error); @@ -280,12 +321,16 @@ nfs4_read_rpc_async( int error = 0, nfsvers, numops; nfs_stateid stateid; struct nfsm_chain nmreq; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(np); if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + if (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); // PUTFH, READ, GETATTR @@ -303,12 +348,11 @@ nfs4_read_rpc_async( nfsm_chain_add_32(error, &nmreq, len); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, np); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request_async(np, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, cb, reqp); + error = nfs_request_async(np, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, &si, 0, cb, reqp); nfsmout: nfsm_chain_cleanup(&nmreq); return (error); @@ -354,7 +398,7 @@ nfs4_read_rpc_async_finish( error = nfsm_chain_get_uio(&nmrep, *lenp, uio); } nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, np, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid); if (!lockerror) nfs_node_unlock(np); if (eofp) { @@ -363,6 +407,8 @@ nfs4_read_rpc_async_finish( *eofp = eof; } nfsm_chain_cleanup(&nmrep); + if (np->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR) + microuptime(&np->n_lastio); return (error); } @@ -378,15 +424,25 @@ nfs4_write_rpc_async( struct nfsreq **reqp) { 
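+	/*
+	 * Builds PUTFH, WRITE, GETATTR.  Note the async-mount check below:
+	 * on a MNT_ASYNC mount a stable write is quietly downgraded to
+	 * UNSTABLE rather than making the server commit synchronously.
+	 */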
struct nfsmount *nmp; + mount_t mp; int error = 0, nfsvers, numops; nfs_stateid stateid; struct nfsm_chain nmreq; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(np); if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + if (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); + + /* for async mounts, don't bother sending sync write requests */ + if ((iomode != NFS_WRITE_UNSTABLE) && nfs_allow_async && + ((mp = NFSTOMP(np))) && (vfs_flags(mp) & MNT_ASYNC)) + iomode = NFS_WRITE_UNSTABLE; + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); // PUTFH, WRITE, GETATTR @@ -407,13 +463,12 @@ nfs4_write_rpc_async( error = nfsm_chain_add_uio(&nmreq, uio, len); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, np); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request_async(np, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, cb, reqp); + error = nfs_request_async(np, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, &si, 0, cb, reqp); nfsmout: nfsm_chain_cleanup(&nmreq); return (error); @@ -475,7 +530,7 @@ nfs4_write_rpc_async_finish( } lck_mtx_unlock(&nmp->nm_lock); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, np, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid); nfsmout: if (!lockerror) nfs_node_unlock(np); @@ -484,6 +539,8 @@ nfsmout: ((mp = NFSTOMP(np))) && (vfs_flags(mp) & MNT_ASYNC)) committed = NFS_WRITE_FILESYNC; *iomodep = committed; + if (np->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR) + microuptime(&np->n_lastio); return (error); } @@ -500,11 +557,15 @@ nfs4_remove_rpc( int nfsvers, numops; u_int64_t xid; struct nfsm_chain nmreq, nmrep; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(dnp); if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + if (dnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); + NFSREQ_SECINFO_SET(&si, dnp, NULL, 0, NULL, 0); restart: nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -518,16 +579,15 @@ restart: nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_REMOVE); - nfsm_chain_add_string(error, &nmreq, name, namelen); + nfsm_chain_add_name(error, &nmreq, name, namelen, nmp); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, dnp); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request2(dnp, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, 0, &nmrep, &xid, &status); + error = nfs_request2(dnp, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, &si, 0, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(dnp))) error = lockerror; @@ -538,7 +598,7 @@ restart: remove_error = error; nfsm_chain_check_change_info(error, &nmrep, dnp); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, dnp, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, dnp, nfsvers, &xid); if (error && !lockerror) NATTRINVALIDATE(dnp); nfsmout: @@ -571,12 +631,18 @@ nfs4_rename_rpc( struct nfsmount *nmp; u_int64_t xid, savedxid; struct nfsm_chain 
nmreq, nmrep; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(fdnp); if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + if (fdnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); + if (tdnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); + NFSREQ_SECINFO_SET(&si, fdnp, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -594,23 +660,21 @@ nfs4_rename_rpc( nfsm_chain_add_fh(error, &nmreq, nfsvers, tdnp->n_fhp, tdnp->n_fhsize); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_RENAME); - nfsm_chain_add_string(error, &nmreq, fnameptr, fnamelen); - nfsm_chain_add_string(error, &nmreq, tnameptr, tnamelen); + nfsm_chain_add_name(error, &nmreq, fnameptr, fnamelen, nmp); + nfsm_chain_add_name(error, &nmreq, tnameptr, tnamelen, nmp); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, tdnp); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_RESTOREFH); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, fdnp); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request(fdnp, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, &xid, &status); + error = nfs_request(fdnp, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &si, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock2(fdnp, tdnp))) error = lockerror; @@ -625,13 +689,13 @@ nfs4_rename_rpc( /* directory attributes: if we don't get them, make sure to invalidate */ nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); savedxid = xid; - nfsm_chain_loadattr(error, &nmrep, tdnp, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, tdnp, nfsvers, &xid); if (error && !lockerror) NATTRINVALIDATE(tdnp); nfsm_chain_op_check(error, &nmrep, NFS_OP_RESTOREFH); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); xid = savedxid; - nfsm_chain_loadattr(error, &nmrep, fdnp, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, fdnp, nfsvers, &xid); if (error && !lockerror) NATTRINVALIDATE(fdnp); nfsmout: @@ -642,9 +706,6 @@ nfsmout: tdnp->n_flag |= NMODIFIED; nfs_node_unlock2(fdnp, tdnp); } - /* Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. */ - if (error == EEXIST) - error = 0; return (error); } @@ -655,7 +716,7 @@ int nfs4_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx) { struct nfsmount *nmp; - int error = 0, lockerror, nfsvers, rdirplus, bigcookies, numops; + int error = 0, lockerror, nfsvers, namedattr, rdirplus, bigcookies, numops; int i, status, more_entries = 1, eof, bp_dropped = 0; uint32_t nmreaddirsize, nmrsize; uint32_t namlen, skiplen, fhlen, xlen, attrlen, reclen, space_free, space_needed; @@ -669,6 +730,7 @@ nfs4_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx) const char *tag; uint32_t entry_attrs[NFS_ATTR_BITMAP_LEN]; struct timeval now; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(dnp); if (!nmp) @@ -677,7 +739,11 @@ nfs4_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx) nmreaddirsize = nmp->nm_readdirsize; nmrsize = nmp->nm_rsize; bigcookies = nmp->nm_state & NFSSTA_BIGCOOKIES; - rdirplus = ((nfsvers > NFS_VER2) && (nmp->nm_flag & NFSMNT_RDIRPLUS)) ? 
1 : 0; + namedattr = (dnp->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR) ? 1 : 0; + rdirplus = (NMFLAG(nmp, RDIRPLUS) || namedattr) ? 1 : 0; + if (dnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); + NFSREQ_SECINFO_SET(&si, dnp, NULL, 0, NULL, 0); /* * Set up attribute request for entries. @@ -686,18 +752,15 @@ nfs4_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx) */ if (rdirplus) { tag = "readdirplus"; - for (i=0; i < NFS_ATTR_BITMAP_LEN; i++) - entry_attrs[i] = - nfs_getattr_bitmap[i] & - nmp->nm_fsattr.nfsa_supp_attr[i]; + NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, entry_attrs); NFS_BITMAP_SET(entry_attrs, NFS_FATTR_FILEHANDLE); } else { tag = "readdir"; NFS_CLEAR_ATTRIBUTES(entry_attrs); NFS_BITMAP_SET(entry_attrs, NFS_FATTR_TYPE); NFS_BITMAP_SET(entry_attrs, NFS_FATTR_FILEID); + NFS_BITMAP_SET(entry_attrs, NFS_FATTR_MOUNTED_ON_FILEID); } - /* XXX NFS_BITMAP_SET(entry_attrs, NFS_FATTR_MOUNTED_ON_FILEID); */ NFS_BITMAP_SET(entry_attrs, NFS_FATTR_RDATTR_ERROR); /* lock to protect access to cookie verifier */ @@ -722,8 +785,10 @@ nfs4_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx) /* * The NFS client is responsible for the "." and ".." entries in the * directory. So, we put them at the start of the first buffer. + * Don't bother for attribute directories. */ - if ((bp->nb_lblkno == 0) && (ndbhp->ndbh_count == 0)) { + if (((bp->nb_lblkno == 0) && (ndbhp->ndbh_count == 0)) && + !(dnp->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR)) { fh.fh_len = 0; fhlen = rdirplus ? fh.fh_len + 1 : 0; xlen = rdirplus ? (fhlen + sizeof(time_t)) : 0; @@ -790,20 +855,19 @@ nfs4_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx) nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, dnp); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_READDIR); nfsm_chain_add_64(error, &nmreq, (cookie <= 2) ? 0 : cookie); nfsm_chain_add_64(error, &nmreq, dnp->n_cookieverf); nfsm_chain_add_32(error, &nmreq, nmreaddirsize); nfsm_chain_add_32(error, &nmreq, nmrsize); - nfsm_chain_add_bitmap(error, &nmreq, entry_attrs, NFS_ATTR_BITMAP_LEN); + nfsm_chain_add_bitmap_supported(error, &nmreq, entry_attrs, nmp, dnp); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfs_node_unlock(dnp); nfsmout_if(error); - error = nfs_request(dnp, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, &xid, &status); + error = nfs_request(dnp, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &si, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(dnp))) error = lockerror; @@ -813,7 +877,7 @@ nfs4_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx) nfsm_chain_get_32(error, &nmrep, numops); nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, dnp, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, dnp, nfsvers, &xid); nfsm_chain_op_check(error, &nmrep, NFS_OP_READDIR); nfsm_chain_get_64(error, &nmrep, dnp->n_cookieverf); nfsm_chain_get_32(error, &nmrep, more_entries); @@ -898,14 +962,21 @@ nextbuffer: nfsm_rndup(namlen + skiplen) - nfsm_rndup(namlen)); nfsmout_if(error); nvattrp = rdirplus ? 
NFS_DIR_BUF_NVATTR(bp, ndbhp->ndbh_count) : &nvattr; - NFS_CLEAR_ATTRIBUTES(nvattrp->nva_bitmap); - error = nfs4_parsefattr(&nmrep, NULL, nvattrp, &fh, NULL); + error = nfs4_parsefattr(&nmrep, NULL, nvattrp, &fh, NULL, NULL); + if (!error && NFS_BITMAP_ISSET(nvattrp->nva_bitmap, NFS_FATTR_ACL)) { + /* we do NOT want ACLs returned to us here */ + NFS_BITMAP_CLR(nvattrp->nva_bitmap, NFS_FATTR_ACL); + if (nvattrp->nva_acl) { + kauth_acl_free(nvattrp->nva_acl); + nvattrp->nva_acl = NULL; + } + } if (error && NFS_BITMAP_ISSET(nvattrp->nva_bitmap, NFS_FATTR_RDATTR_ERROR)) { - /* OK, we didn't get attributes, whatever... */ - if (rdirplus) /* mark the attributes invalid */ - bzero(nvattrp, sizeof(struct nfs_vattr)); - else - NFS_CLEAR_ATTRIBUTES(nvattrp->nva_bitmap); + /* OK, we may not have gotten all of the attributes but we will use what we can. */ + if ((error == NFSERR_MOVED) || (error == NFSERR_INVAL)) { + /* set this up to look like a referral trigger */ + nfs4_default_attrs_for_referral_trigger(dnp, dp->d_name, namlen, nvattrp, &fh); + } error = 0; } /* check for more entries after this one */ @@ -913,7 +984,9 @@ nextbuffer: nfsmout_if(error); /* Skip any "." and ".." entries returned from server. */ - if ((dp->d_name[0] == '.') && ((namlen == 1) || ((namlen == 2) && (dp->d_name[1] == '.')))) { + /* Also skip any bothersome named attribute entries. */ + if (((dp->d_name[0] == '.') && ((namlen == 1) || ((namlen == 2) && (dp->d_name[1] == '.')))) || + (namedattr && (namlen == 11) && (!strcmp(dp->d_name, "SUNWattr_ro") || !strcmp(dp->d_name, "SUNWattr_rw")))) { lastcookie = cookie; continue; } @@ -1001,23 +1074,30 @@ nfs4_lookup_rpc_async( vfs_context_t ctx, struct nfsreq **reqp) { - int error = 0, isdotdot = 0, getattrs = 1, nfsvers, numops; + int error = 0, isdotdot = 0, nfsvers, numops; struct nfsm_chain nmreq; uint32_t bitmap[NFS_ATTR_BITMAP_LEN]; struct nfsmount *nmp; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(dnp); if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + if (dnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); - if ((name[0] == '.') && (name[1] == '.') && (namelen == 2)) + if ((name[0] == '.') && (name[1] == '.') && (namelen == 2)) { isdotdot = 1; + NFSREQ_SECINFO_SET(&si, dnp, NULL, 0, NULL, 0); + } else { + NFSREQ_SECINFO_SET(&si, dnp, dnp->n_fhp, dnp->n_fhsize, name, namelen); + } nfsm_chain_null(&nmreq); - // PUTFH, GETATTR, LOOKUP(P), GETATTR (FH) - numops = getattrs ? 
4 : 3; + // PUTFH, GETATTR, LOOKUP(P), GETFH, GETATTR (FH) + numops = 5; nfsm_chain_build_alloc_init(error, &nmreq, 20 * NFSX_UNSIGNED + namelen); nfsm_chain_add_compound_header(error, &nmreq, "lookup", numops); numops--; @@ -1025,50 +1105,59 @@ nfs4_lookup_rpc_async( nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, dnp); numops--; if (isdotdot) { nfsm_chain_add_32(error, &nmreq, NFS_OP_LOOKUPP); } else { nfsm_chain_add_32(error, &nmreq, NFS_OP_LOOKUP); - nfsm_chain_add_string(error, &nmreq, name, namelen); - } - if (getattrs) { - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, bitmap); - NFS_BITMAP_SET(bitmap, NFS_FATTR_FILEHANDLE); - nfsm_chain_add_bitmap_masked(error, &nmreq, bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_name(error, &nmreq, name, namelen, nmp); } + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETFH); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, bitmap); + /* some ".zfs" directories can't handle being asked for some attributes */ + if ((dnp->n_flag & NISDOTZFS) && !isdotdot) + NFS_BITMAP_CLR(bitmap, NFS_FATTR_NAMED_ATTR); + if ((dnp->n_flag & NISDOTZFSCHILD) && isdotdot) + NFS_BITMAP_CLR(bitmap, NFS_FATTR_NAMED_ATTR); + if (((namelen == 4) && (name[0] == '.') && (name[1] == 'z') && (name[2] == 'f') && (name[3] == 's'))) + NFS_BITMAP_CLR(bitmap, NFS_FATTR_NAMED_ATTR); + nfsm_chain_add_bitmap_supported(error, &nmreq, bitmap, nmp, NULL); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC4_COMPOUND, - vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, reqp); + vfs_context_thread(ctx), vfs_context_ucred(ctx), &si, 0, NULL, reqp); nfsmout: nfsm_chain_cleanup(&nmreq); return (error); } + int nfs4_lookup_rpc_async_finish( nfsnode_t dnp, - __unused vfs_context_t ctx, + char *name, + int namelen, + vfs_context_t ctx, struct nfsreq *req, u_int64_t *xidp, fhandle_t *fhp, struct nfs_vattr *nvap) { - int error = 0, lockerror = ENOENT, status, nfsvers, numops; - uint32_t val = 0; + int error = 0, lockerror = ENOENT, status, nfsvers, numops, isdotdot = 0; + uint32_t op = NFS_OP_LOOKUP; u_int64_t xid; struct nfsmount *nmp; struct nfsm_chain nmrep; nmp = NFSTONMP(dnp); nfsvers = nmp->nm_vers; + if ((name[0] == '.') && (name[1] == '.') && (namelen == 2)) + isdotdot = 1; nfsm_chain_null(&nmrep); @@ -1082,47 +1171,69 @@ nfs4_lookup_rpc_async_finish( nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); if (xidp) *xidp = xid; - nfsm_chain_loadattr(error, &nmrep, dnp, nfsvers, NULL, &xid); - - // nfsm_chain_op_check(error, &nmrep, (isdotdot ? NFS_OP_LOOKUPP : NFS_OP_LOOKUP)); - nfsm_chain_get_32(error, &nmrep, val); - nfsm_assert(error, (val == NFS_OP_LOOKUPP) || (val == NFS_OP_LOOKUP), EBADRPC); - nfsm_chain_get_32(error, &nmrep, val); - nfsm_assert(error, (val == NFS_OK), val); + nfsm_chain_loadattr(error, &nmrep, dnp, nfsvers, &xid); + nfsm_chain_op_check(error, &nmrep, (isdotdot ? 
NFS_OP_LOOKUPP : NFS_OP_LOOKUP)); nfsmout_if(error || !fhp || !nvap); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETFH); + nfsm_chain_get_32(error, &nmrep, fhp->fh_len); + nfsm_chain_get_opaque(error, &nmrep, fhp->fh_len, fhp->fh_data); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsmout_if(error); - NFS_CLEAR_ATTRIBUTES(nvap->nva_bitmap); - error = nfs4_parsefattr(&nmrep, NULL, nvap, fhp, NULL); - if (!NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_FILEHANDLE)) { - error = EBADRPC; - goto nfsmout; + if ((error == NFSERR_MOVED) || (error == NFSERR_INVAL)) { + /* set this up to look like a referral trigger */ + nfs4_default_attrs_for_referral_trigger(dnp, name, namelen, nvap, fhp); + error = 0; + } else { + nfsmout_if(error); + error = nfs4_parsefattr(&nmrep, NULL, nvap, NULL, NULL, NULL); } nfsmout: if (!lockerror) nfs_node_unlock(dnp); nfsm_chain_cleanup(&nmrep); + if (!error && (op == NFS_OP_LOOKUP) && (nmp->nm_state & NFSSTA_NEEDSECINFO)) { + /* We still need to get SECINFO to set default for mount. */ + /* Do so for the first LOOKUP that returns successfully. */ + struct nfs_sec sec; + + sec.count = NX_MAX_SEC_FLAVORS; + error = nfs4_secinfo_rpc(nmp, &req->r_secinfo, vfs_context_ucred(ctx), sec.flavors, &sec.count); + /* [sigh] some implementations return "illegal" error for unsupported ops */ + if (error == NFSERR_OP_ILLEGAL) + error = 0; + if (!error) { + /* set our default security flavor to the first in the list */ + lck_mtx_lock(&nmp->nm_lock); + if (sec.count) + nmp->nm_auth = sec.flavors[0]; + nmp->nm_state &= ~NFSSTA_NEEDSECINFO; + lck_mtx_unlock(&nmp->nm_lock); + } + } return (error); } int nfs4_commit_rpc( nfsnode_t np, - u_int64_t offset, - u_int64_t count, - kauth_cred_t cred) + uint64_t offset, + uint64_t count, + kauth_cred_t cred, + uint64_t wverf) { struct nfsmount *nmp; int error = 0, lockerror, status, nfsvers, numops; - u_int64_t xid, wverf; + u_int64_t xid, newwverf; uint32_t count32; struct nfsm_chain nmreq, nmrep; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(np); FSDBG(521, np, offset, count, nmp ? 
nmp->nm_state : 0); if (!nmp) return (ENXIO); + if (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); if (!(nmp->nm_state & NFSSTA_HASWRITEVERF)) return (0); nfsvers = nmp->nm_vers; @@ -1132,6 +1243,7 @@ nfs4_commit_rpc( else count32 = count; + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -1148,13 +1260,12 @@ nfs4_commit_rpc( nfsm_chain_add_32(error, &nmreq, count32); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, np); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); error = nfs_request2(np, NULL, &nmreq, NFSPROC4_COMPOUND, - current_thread(), cred, 0, &nmrep, &xid, &status); + current_thread(), cred, &si, 0, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; @@ -1162,17 +1273,17 @@ nfs4_commit_rpc( nfsm_chain_get_32(error, &nmrep, numops); nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); nfsm_chain_op_check(error, &nmrep, NFS_OP_COMMIT); - nfsm_chain_get_64(error, &nmrep, wverf); + nfsm_chain_get_64(error, &nmrep, newwverf); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, np, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid); if (!lockerror) nfs_node_unlock(np); nfsmout_if(error); lck_mtx_lock(&nmp->nm_lock); - if (nmp->nm_verf != wverf) { - nmp->nm_verf = wverf; + if (nmp->nm_verf != newwverf) + nmp->nm_verf = newwverf; + if (wverf != newwverf) error = NFSERR_STALEWRITEVERF; - } lck_mtx_unlock(&nmp->nm_lock); nfsmout: nfsm_chain_cleanup(&nmreq); @@ -1192,11 +1303,16 @@ nfs4_pathconf_rpc( struct nfsmount *nmp = NFSTONMP(np); uint32_t bitmap[NFS_ATTR_BITMAP_LEN]; struct nfs_vattr nvattr; + struct nfsreq_secinfo_args si; if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + if (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); + NVATTR_INIT(&nvattr); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -1217,20 +1333,18 @@ nfs4_pathconf_rpc( NFS_BITMAP_SET(bitmap, NFS_FATTR_CHOWN_RESTRICTED); NFS_BITMAP_SET(bitmap, NFS_FATTR_CASE_INSENSITIVE); NFS_BITMAP_SET(bitmap, NFS_FATTR_CASE_PRESERVING); - nfsm_chain_add_bitmap_masked(error, &nmreq, bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, bitmap, nmp, np); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &si, &nmrep, &xid, &status); nfsm_chain_skip_tag(error, &nmrep); nfsm_chain_get_32(error, &nmrep, numops); nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); nfsmout_if(error); - NFS_CLEAR_ATTRIBUTES(nvattr.nva_bitmap); - error = nfs4_parsefattr(&nmrep, nfsap, &nvattr, NULL, NULL); + error = nfs4_parsefattr(&nmrep, nfsap, &nvattr, NULL, NULL, NULL); nfsmout_if(error); if ((lockerror = nfs_node_lock(np))) error = lockerror; @@ -1239,6 +1353,7 @@ nfs4_pathconf_rpc( if (!lockerror) nfs_node_unlock(np); nfsmout: + NVATTR_CLEANUP(&nvattr); nfsm_chain_cleanup(&nmreq); nfsm_chain_cleanup(&nmrep); return (error); @@ -1254,79 +1369,102 @@ 
nfs4_vnop_getattr( } */ *ap) { struct vnode_attr *vap = ap->a_vap; + struct nfsmount *nmp; struct nfs_vattr nva; - int error; + int error, acls, ngaflags; + + if (!(nmp = VTONMP(ap->a_vp))) + return (ENXIO); + acls = (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_ACL); - error = nfs_getattr(VTONFS(ap->a_vp), &nva, ap->a_context, NGA_CACHED); + ngaflags = NGA_CACHED; + if (VATTR_IS_ACTIVE(vap, va_acl) && acls) + ngaflags |= NGA_ACL; + error = nfs_getattr(VTONFS(ap->a_vp), &nva, ap->a_context, ngaflags); if (error) return (error); /* copy what we have in nva to *a_vap */ - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_RAWDEV)) { + if (VATTR_IS_ACTIVE(vap, va_rdev) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_RAWDEV)) { dev_t rdev = makedev(nva.nva_rawdev.specdata1, nva.nva_rawdev.specdata2); VATTR_RETURN(vap, va_rdev, rdev); } - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_NUMLINKS)) + if (VATTR_IS_ACTIVE(vap, va_nlink) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_NUMLINKS)) VATTR_RETURN(vap, va_nlink, nva.nva_nlink); - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_SIZE)) + if (VATTR_IS_ACTIVE(vap, va_data_size) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_SIZE)) VATTR_RETURN(vap, va_data_size, nva.nva_size); // VATTR_RETURN(vap, va_data_alloc, ???); // VATTR_RETURN(vap, va_total_size, ???); - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_SPACE_USED)) + if (VATTR_IS_ACTIVE(vap, va_total_alloc) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_SPACE_USED)) VATTR_RETURN(vap, va_total_alloc, nva.nva_bytes); - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_OWNER)) + if (VATTR_IS_ACTIVE(vap, va_uid) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_OWNER)) VATTR_RETURN(vap, va_uid, nva.nva_uid); - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_OWNER_GROUP)) + if (VATTR_IS_ACTIVE(vap, va_uuuid) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_OWNER)) + VATTR_RETURN(vap, va_uuuid, nva.nva_uuuid); + if (VATTR_IS_ACTIVE(vap, va_gid) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_OWNER_GROUP)) VATTR_RETURN(vap, va_gid, nva.nva_gid); - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_MODE)) - VATTR_RETURN(vap, va_mode, nva.nva_mode); - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_ARCHIVE) || - NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_HIDDEN)) { + if (VATTR_IS_ACTIVE(vap, va_guuid) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_OWNER_GROUP)) + VATTR_RETURN(vap, va_guuid, nva.nva_guuid); + if (VATTR_IS_ACTIVE(vap, va_mode)) { + if (NMFLAG(nmp, ACLONLY) || !NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_MODE)) + VATTR_RETURN(vap, va_mode, 0777); + else + VATTR_RETURN(vap, va_mode, nva.nva_mode); + } + if (VATTR_IS_ACTIVE(vap, va_flags) && + (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_ARCHIVE) || + NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_HIDDEN) || + (nva.nva_flags & NFS_FFLAG_TRIGGER))) { uint32_t flags = 0; - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_ARCHIVE)) + if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_ARCHIVE) && + (nva.nva_flags & NFS_FFLAG_ARCHIVED)) flags |= SF_ARCHIVED; - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_HIDDEN)) + if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_HIDDEN) && + (nva.nva_flags & NFS_FFLAG_HIDDEN)) flags |= UF_HIDDEN; VATTR_RETURN(vap, va_flags, flags); } - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_TIME_CREATE)) { + if (VATTR_IS_ACTIVE(vap, va_create_time) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_TIME_CREATE)) { vap->va_create_time.tv_sec = nva.nva_timesec[NFSTIME_CREATE]; vap->va_create_time.tv_nsec = nva.nva_timensec[NFSTIME_CREATE]; VATTR_SET_SUPPORTED(vap, 
va_create_time); } - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_TIME_ACCESS)) { + if (VATTR_IS_ACTIVE(vap, va_access_time) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_TIME_ACCESS)) { vap->va_access_time.tv_sec = nva.nva_timesec[NFSTIME_ACCESS]; vap->va_access_time.tv_nsec = nva.nva_timensec[NFSTIME_ACCESS]; VATTR_SET_SUPPORTED(vap, va_access_time); } - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_TIME_MODIFY)) { + if (VATTR_IS_ACTIVE(vap, va_modify_time) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_TIME_MODIFY)) { vap->va_modify_time.tv_sec = nva.nva_timesec[NFSTIME_MODIFY]; vap->va_modify_time.tv_nsec = nva.nva_timensec[NFSTIME_MODIFY]; VATTR_SET_SUPPORTED(vap, va_modify_time); } - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_TIME_METADATA)) { + if (VATTR_IS_ACTIVE(vap, va_change_time) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_TIME_METADATA)) { vap->va_change_time.tv_sec = nva.nva_timesec[NFSTIME_CHANGE]; vap->va_change_time.tv_nsec = nva.nva_timensec[NFSTIME_CHANGE]; VATTR_SET_SUPPORTED(vap, va_change_time); } - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_TIME_BACKUP)) { + if (VATTR_IS_ACTIVE(vap, va_backup_time) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_TIME_BACKUP)) { vap->va_backup_time.tv_sec = nva.nva_timesec[NFSTIME_BACKUP]; vap->va_backup_time.tv_nsec = nva.nva_timensec[NFSTIME_BACKUP]; VATTR_SET_SUPPORTED(vap, va_backup_time); } - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_FILEID)) + if (VATTR_IS_ACTIVE(vap, va_fileid) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_FILEID)) VATTR_RETURN(vap, va_fileid, nva.nva_fileid); - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_TYPE)) + if (VATTR_IS_ACTIVE(vap, va_type) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_TYPE)) VATTR_RETURN(vap, va_type, nva.nva_type); - if (NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_CHANGE)) + if (VATTR_IS_ACTIVE(vap, va_filerev) && NFS_BITMAP_ISSET(nva.nva_bitmap, NFS_FATTR_CHANGE)) VATTR_RETURN(vap, va_filerev, nva.nva_change); + if (VATTR_IS_ACTIVE(vap, va_acl) && acls) { + VATTR_RETURN(vap, va_acl, nva.nva_acl); + nva.nva_acl = NULL; + } + // other attrs we might support someday: // VATTR_RETURN(vap, va_encoding, ??? /* potentially unnormalized UTF-8? */); - // struct kauth_acl *va_acl; /* access control list */ - // guid_t va_uuuid; /* file owner UUID */ - // guid_t va_guuid; /* file group UUID */ + NVATTR_CLEANUP(&nva); return (error); } @@ -1337,15 +1475,20 @@ nfs4_setattr_rpc( vfs_context_t ctx) { struct nfsmount *nmp = NFSTONMP(np); - int error = 0, lockerror = ENOENT, status, nfsvers, numops; + int error = 0, setattr_error = 0, lockerror = ENOENT, status, nfsvers, numops; u_int64_t xid, nextxid; struct nfsm_chain nmreq, nmrep; uint32_t bitmap[NFS_ATTR_BITMAP_LEN], bmlen; + uint32_t getbitmap[NFS_ATTR_BITMAP_LEN]; + uint32_t setbitmap[NFS_ATTR_BITMAP_LEN]; nfs_stateid stateid; + struct nfsreq_secinfo_args si; if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + if (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); if (VATTR_IS_ACTIVE(vap, va_flags) && (vap->va_flags & ~(SF_ARCHIVED|UF_HIDDEN))) { /* we don't support setting unsupported flags (duh!) 
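The getattr hunk above gates every copy-out on both VATTR_IS_ACTIVE() (the caller asked for the attribute) and NFS_BITMAP_ISSET() (the server actually returned it). A compilable sketch of that double gate, with a single placeholder bit standing in for the two-word NFS_FATTR_* bitmaps:

#include <stdint.h>

#define ATTR_NLINK	0x1	/* placeholder attribute bit */

struct vap_model { uint32_t active, supported, va_nlink; };	/* stand-in for struct vnode_attr */
struct nva_model { uint32_t bitmap, nva_nlink; };		/* stand-in for struct nfs_vattr */

static void
return_nlink(struct vap_model *vap, const struct nva_model *nva)
{
	/* copy only if requested AND returned, then mark it supported (what VATTR_RETURN does) */
	if ((vap->active & ATTR_NLINK) && (nva->bitmap & ATTR_NLINK)) {
		vap->va_nlink = nva->nva_nlink;
		vap->supported |= ATTR_NLINK;
	}
}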
*/
@@ -1355,9 +1498,39 @@ nfs4_setattr_rpc(
 		return (ENOTSUP);	/* return ENOTSUP for chflags(2) */
 	}
 
+	/* don't bother requesting some changes if they don't look like they are changing */
+	if (VATTR_IS_ACTIVE(vap, va_uid) && (vap->va_uid == np->n_vattr.nva_uid))
+		VATTR_CLEAR_ACTIVE(vap, va_uid);
+	if (VATTR_IS_ACTIVE(vap, va_gid) && (vap->va_gid == np->n_vattr.nva_gid))
+		VATTR_CLEAR_ACTIVE(vap, va_gid);
+	if (VATTR_IS_ACTIVE(vap, va_uuuid) && kauth_guid_equal(&vap->va_uuuid, &np->n_vattr.nva_uuuid))
+		VATTR_CLEAR_ACTIVE(vap, va_uuuid);
+	if (VATTR_IS_ACTIVE(vap, va_guuid) && kauth_guid_equal(&vap->va_guuid, &np->n_vattr.nva_guuid))
+		VATTR_CLEAR_ACTIVE(vap, va_guuid);
+
+tryagain:
+	/* do nothing if no attributes will be sent */
+	nfs_vattr_set_bitmap(nmp, bitmap, vap);
+	if (!bitmap[0] && !bitmap[1])
+		return (0);
+
+	NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0);
 	nfsm_chain_null(&nmreq);
 	nfsm_chain_null(&nmrep);
 
+	/*
+	 * Prepare GETATTR bitmap: if we are setting the ACL or mode, we
+	 * need to invalidate any cached ACL. And if we had an ACL cached,
+	 * we might as well also fetch the new value.
+	 */
+	NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, getbitmap);
+	if (NFS_BITMAP_ISSET(bitmap, NFS_FATTR_ACL) ||
+	    NFS_BITMAP_ISSET(bitmap, NFS_FATTR_MODE)) {
+		if (NACLVALID(np))
+			NFS_BITMAP_SET(getbitmap, NFS_FATTR_ACL);
+		NACLINVALIDATE(np);
+	}
+
 	// PUTFH, SETATTR, GETATTR
 	numops = 3;
 	nfsm_chain_build_alloc_init(error, &nmreq, 40 * NFSX_UNSIGNED);
@@ -1375,25 +1548,32 @@ nfs4_setattr_rpc(
 	nfsm_chain_add_fattr4(error, &nmreq, vap, nmp);
 	numops--;
 	nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR);
-	nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap,
-		NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr);
+	nfsm_chain_add_bitmap_supported(error, &nmreq, getbitmap, nmp, np);
 	nfsm_chain_build_done(error, &nmreq);
 	nfsm_assert(error, (numops == 0), EPROTO);
 	nfsmout_if(error);
-	error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, &xid, &status);
+	error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &si, &nmrep, &xid, &status);
 
 	if ((lockerror = nfs_node_lock(np)))
 		error = lockerror;
 	nfsm_chain_skip_tag(error, &nmrep);
 	nfsm_chain_get_32(error, &nmrep, numops);
 	nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH);
+	nfsmout_if(error);
 	nfsm_chain_op_check(error, &nmrep, NFS_OP_SETATTR);
+	nfsmout_if(error == EBADRPC);
+	setattr_error = error;
+	error = 0;
 	bmlen = NFS_ATTR_BITMAP_LEN;
-	nfsm_chain_get_bitmap(error, &nmrep, bitmap, bmlen);
-	nfsmout_if(error);
-	nfs_vattr_set_supported(bitmap, vap);
+	nfsm_chain_get_bitmap(error, &nmrep, setbitmap, bmlen);
+	if (!error) {
+		if (VATTR_IS_ACTIVE(vap, va_data_size) && (np->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR))
+			microuptime(&np->n_lastio);
+		nfs_vattr_set_supported(setbitmap, vap);
+		error = setattr_error;
+	}
 	nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR);
-	nfsm_chain_loadattr(error, &nmrep, np, nfsvers, NULL, &xid);
+	nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid);
 	if (error)
 		NATTRINVALIDATE(np);
 	/*
@@ -1416,6 +1596,20 @@ nfsmout:
 		nfs_node_unlock(np);
 	nfsm_chain_cleanup(&nmreq);
 	nfsm_chain_cleanup(&nmrep);
+	if ((setattr_error == EINVAL) && VATTR_IS_ACTIVE(vap, va_acl) && VATTR_IS_ACTIVE(vap, va_mode) && !NMFLAG(nmp, ACLONLY)) {
+		/*
+		 * Some servers may not like ACL/mode combos that get sent.
+		 * If it looks like that's what the server choked on, try setting
+		 * just the ACL and not the mode (unless it looks like everything
+		 * but mode was already successfully set).
+ */ + if (((bitmap[0] & setbitmap[0]) != bitmap[0]) || + ((bitmap[1] & (setbitmap[1]|NFS_FATTR_MODE)) != bitmap[1])) { + VATTR_CLEAR_ACTIVE(vap, va_mode); + error = 0; + goto tryagain; + } + } return (error); } @@ -1426,7 +1620,7 @@ int nfs_mount_state_wait_for_recovery(struct nfsmount *nmp) { struct timespec ts = { 1, 0 }; - int error = 0, slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0; + int error = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0; lck_mtx_lock(&nmp->nm_lock); while (nmp->nm_state & NFSSTA_RECOVER) { @@ -1434,6 +1628,7 @@ nfs_mount_state_wait_for_recovery(struct nfsmount *nmp) break; nfs_mount_sock_thread_wake(nmp); msleep(&nmp->nm_state, &nmp->nm_lock, slpflag|(PZERO-1), "nfsrecoverwait", &ts); + slpflag = 0; } lck_mtx_unlock(&nmp->nm_lock); @@ -1447,19 +1642,24 @@ nfs_mount_state_wait_for_recovery(struct nfsmount *nmp) * the recovery thread until we're done). */ int -nfs_mount_state_in_use_start(struct nfsmount *nmp) +nfs_mount_state_in_use_start(struct nfsmount *nmp, thread_t thd) { struct timespec ts = { 1, 0 }; - int error = 0, slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0; + int error = 0, slpflag = (NMFLAG(nmp, INTR) && thd) ? PCATCH : 0; if (!nmp) return (ENXIO); lck_mtx_lock(&nmp->nm_lock); + if (nmp->nm_state & (NFSSTA_FORCE|NFSSTA_DEAD)) { + lck_mtx_unlock(&nmp->nm_lock); + return (ENXIO); + } while (nmp->nm_state & NFSSTA_RECOVER) { - if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1))) + if ((error = nfs_sigintr(nmp, NULL, thd, 1))) break; nfs_mount_sock_thread_wake(nmp); msleep(&nmp->nm_state, &nmp->nm_lock, slpflag|(PZERO-1), "nfsrecoverwait", &ts); + slpflag = 0; } if (!error) nmp->nm_stateinuse++; @@ -1482,11 +1682,9 @@ nfs_mount_state_in_use_end(struct nfsmount *nmp, int error) return (restart); lck_mtx_lock(&nmp->nm_lock); if (restart && (error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE)) { - if (!(nmp->nm_state & NFSSTA_RECOVER)) { - printf("nfs_mount_state_in_use_end: error %d, initiating recovery\n", error); - nmp->nm_state |= NFSSTA_RECOVER; - nfs_mount_sock_thread_wake(nmp); - } + printf("nfs_mount_state_in_use_end: error %d, initiating recovery for %s, 0x%x\n", + error, vfs_statfs(nmp->nm_mountp)->f_mntfromname, nmp->nm_stategenid); + nfs_need_recover(nmp, error); } if (nmp->nm_stateinuse > 0) nmp->nm_stateinuse--; @@ -1531,22 +1729,39 @@ nfs_mount_state_max_restarts(struct nfsmount *nmp) return (MAX(nmp->nm_fsattr.nfsa_lease, 60)); } +/* + * Does the error mean we probably lost a delegation? + */ +int +nfs_mount_state_error_delegation_lost(int error) +{ + switch (error) { + case NFSERR_STALE_STATEID: + case NFSERR_ADMIN_REVOKED: + case NFSERR_EXPIRED: + case NFSERR_OLD_STATEID: + case NFSERR_BAD_STATEID: + case NFSERR_GRACE: /* ugh! (stupid) RFC 3530 specifically disallows CLAIM_DELEGATE_CUR during grace period? */ + return (1); + } + return (0); +} + /* * Mark an NFS node's open state as busy. */ int -nfs_open_state_set_busy(nfsnode_t np, vfs_context_t ctx) +nfs_open_state_set_busy(nfsnode_t np, thread_t thd) { struct nfsmount *nmp; - thread_t thd = vfs_context_thread(ctx); struct timespec ts = {2, 0}; int error = 0, slpflag; nmp = NFSTONMP(np); if (!nmp) return (ENXIO); - slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0; + slpflag = (NMFLAG(nmp, INTR) && thd) ? 
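The nfsmout path above retries a failed SETATTR without va_mode when the server returns EINVAL for an ACL+mode combination, unless the reply's set-bitmap shows everything except the mode already stuck. A self-contained sketch of that control flow; rpc(), WANT_ACL and WANT_MODE are assumptions of the sketch, not kernel interfaces:

#include <errno.h>
#include <stdint.h>

#define WANT_ACL	0x1	/* placeholder attribute bits */
#define WANT_MODE	0x2

/* rpc() stands in for the SETATTR compound; *setbits reports what the server set. */
static int
setattr_with_acl_fallback(int (*rpc)(uint32_t want, uint32_t *setbits))
{
	uint32_t want = WANT_ACL | WANT_MODE, setbits = 0;
	int error;

tryagain:
	error = rpc(want, &setbits);
	if ((error == EINVAL) && (want & WANT_ACL) && (want & WANT_MODE) &&
	    ((want & ~WANT_MODE & setbits) != (want & ~WANT_MODE))) {
		/* looks like the ACL/mode combo choked: drop the mode and retry */
		want &= ~WANT_MODE;
		goto tryagain;
	}
	return (error);
}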
PCATCH : 0; lck_mtx_lock(&np->n_openlock); while (np->n_openflags & N_OPENBUSY) { @@ -1554,6 +1769,7 @@ nfs_open_state_set_busy(nfsnode_t np, vfs_context_t ctx) break; np->n_openflags |= N_OPENWANT; msleep(&np->n_openflags, &np->n_openlock, slpflag, "nfs_open_state_set_busy", &ts); + slpflag = 0; } if (!error) np->n_openflags |= N_OPENBUSY; @@ -1688,7 +1904,7 @@ nfs_open_owner_set_busy(struct nfs_open_owner *noop, thread_t thd) nmp = noop->noo_mount; if (!nmp) return (ENXIO); - slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0; + slpflag = (NMFLAG(nmp, INTR) && thd) ? PCATCH : 0; lck_mtx_lock(&noop->noo_lock); while (noop->noo_flags & NFS_OPEN_OWNER_BUSY) { @@ -1696,6 +1912,7 @@ nfs_open_owner_set_busy(struct nfs_open_owner *noop, thread_t thd) break; noop->noo_flags |= NFS_OPEN_OWNER_WANT; msleep(noop, &noop->noo_lock, slpflag, "nfs_open_owner_set_busy", &ts); + slpflag = 0; } if (!error) noop->noo_flags |= NFS_OPEN_OWNER_BUSY; @@ -1761,6 +1978,24 @@ nfs_open_file_find( uint32_t accessMode, uint32_t denyMode, int alloc) +{ + *nofpp = NULL; + return nfs_open_file_find_internal(np, noop, nofpp, accessMode, denyMode, alloc); +} + +/* + * Internally, allow using a provisional nodeless nofp (passed in via *nofpp) + * if an existing one is not found. This is used in "create" scenarios to + * officially add the provisional nofp to the node once the node is created. + */ +int +nfs_open_file_find_internal( + nfsnode_t np, + struct nfs_open_owner *noop, + struct nfs_open_file **nofpp, + uint32_t accessMode, + uint32_t denyMode, + int alloc) { struct nfs_open_file *nofp = NULL, *nofp2, *newnofp = NULL; @@ -1777,7 +2012,6 @@ tryagain: if ((accessMode & nofp2->nof_deny) || (denyMode & nofp2->nof_access)) { /* This request conflicts with an existing open on this client. */ lck_mtx_unlock(&np->n_openlock); - *nofpp = NULL; return (EACCES); } } @@ -1786,14 +2020,12 @@ tryagain: * If this open owner doesn't have an open * file structure yet, we create one for it. */ - if (!nofp && !newnofp && alloc) { + if (!nofp && !*nofpp && !newnofp && alloc) { lck_mtx_unlock(&np->n_openlock); alloc: MALLOC(newnofp, struct nfs_open_file *, sizeof(struct nfs_open_file), M_TEMP, M_WAITOK); - if (!newnofp) { - *nofpp = NULL; + if (!newnofp) return (ENOMEM); - } bzero(newnofp, sizeof(*newnofp)); lck_mtx_init(&newnofp->nof_lock, nfs_open_grp, LCK_ATTR_NULL); newnofp->nof_owner = noop; @@ -1805,15 +2037,20 @@ alloc: if (np) goto tryagain; } - if (!nofp && newnofp) { - if (np) - TAILQ_INSERT_HEAD(&np->n_opens, newnofp, nof_link); - nofp = newnofp; + if (!nofp) { + if (*nofpp) { + (*nofpp)->nof_np = np; + nofp = *nofpp; + } else { + nofp = newnofp; + } + if (nofp && np) + TAILQ_INSERT_HEAD(&np->n_opens, nofp, nof_link); } if (np) lck_mtx_unlock(&np->n_openlock); - if (newnofp && (nofp != newnofp)) + if (alloc && newnofp && (nofp != newnofp)) nfs_open_file_destroy(newnofp); *nofpp = nofp; @@ -1848,7 +2085,7 @@ nfs_open_file_set_busy(struct nfs_open_file *nofp, thread_t thd) nmp = nofp->nof_owner->noo_mount; if (!nmp) return (ENXIO); - slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0; + slpflag = (NMFLAG(nmp, INTR) && thd) ? 
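A pattern repeated throughout these busy/recovery wait loops: slpflag starts as PCATCH only for interruptible mounts (NMFLAG(nmp, INTR)) with a thread to signal, and is cleared after the first msleep() so later sleeps are uninterruptible while nfs_sigintr() is still polled each iteration. A stubbed userland model; cond_sleep() and signal_pending() are stand-ins for msleep() and nfs_sigintr():

#define PCATCH_MODEL	0x100	/* stand-in for the kernel's PCATCH */

static int signal_pending(void) { return (0); }	/* stand-in for nfs_sigintr() */
static void cond_sleep(volatile int *chan, int flags)
{
	(void)flags;
	*chan = 0;	/* stub: pretend the busy holder woke us */
}

static int
wait_while_busy(volatile int *busy, int intr, int have_thread)
{
	int slpflag = (intr && have_thread) ? PCATCH_MODEL : 0;

	while (*busy) {
		if (signal_pending())
			return (1);	/* caller turns this into EINTR */
		cond_sleep(busy, slpflag);
		slpflag = 0;	/* only the first sleep is interruptible */
	}
	return (0);
}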
PCATCH : 0; lck_mtx_lock(&nofp->nof_lock); while (nofp->nof_flags & NFS_OPEN_FILE_BUSY) { @@ -1856,6 +2093,7 @@ nfs_open_file_set_busy(struct nfs_open_file *nofp, thread_t thd) break; nofp->nof_flags |= NFS_OPEN_FILE_WANT; msleep(nofp, &nofp->nof_lock, slpflag, "nfs_open_file_set_busy", &ts); + slpflag = 0; } if (!error) nofp->nof_flags |= NFS_OPEN_FILE_BUSY; @@ -1884,147 +2122,525 @@ nfs_open_file_clear_busy(struct nfs_open_file *nofp) } /* - * Get the current (delegation, lock, open, default) stateid for this node. - * If node has a delegation, use that stateid. - * If pid has a lock, use the lockowner's stateid. - * Or use the open file's stateid. - * If no open file, use a default stateid of all ones. + * Add the open state for the given access/deny modes to this open file. */ void -nfs_get_stateid(nfsnode_t np, thread_t thd, kauth_cred_t cred, nfs_stateid *sid) +nfs_open_file_add_open(struct nfs_open_file *nofp, uint32_t accessMode, uint32_t denyMode, int delegated) { - struct nfsmount *nmp = NFSTONMP(np); - proc_t p = thd ? get_bsdthreadtask_info(thd) : current_thread(); // XXX async I/O requests don't have a thread - struct nfs_open_owner *noop = NULL; - struct nfs_open_file *nofp = NULL; - struct nfs_lock_owner *nlop = NULL; - nfs_stateid *s = NULL; - - if (np->n_openflags & N_DELEG_MASK) - s = &np->n_dstateid; - else if (p) - nlop = nfs_lock_owner_find(np, p, 0); - if (nlop && !TAILQ_EMPTY(&nlop->nlo_locks)) { - /* we hold locks, use lock stateid */ - s = &nlop->nlo_stateid; - } else if (((noop = nfs_open_owner_find(nmp, cred, 0))) && - (nfs_open_file_find(np, noop, &nofp, 0, 0, 0) == 0) && - !(nofp->nof_flags & NFS_OPEN_FILE_LOST) && - nofp->nof_access) { - /* we (should) have the file open, use open stateid */ - if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) - nfs4_reopen(nofp, thd); - if (!(nofp->nof_flags & NFS_OPEN_FILE_LOST)) - s = &nofp->nof_stateid; - } + lck_mtx_lock(&nofp->nof_lock); + nofp->nof_access |= accessMode; + nofp->nof_deny |= denyMode; - if (s) { - sid->seqid = s->seqid; - sid->other[0] = s->other[0]; - sid->other[1] = s->other[1]; - sid->other[2] = s->other[2]; + if (delegated) { + if (denyMode == NFS_OPEN_SHARE_DENY_NONE) { + if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) + nofp->nof_d_r++; + else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) + nofp->nof_d_w++; + else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) + nofp->nof_d_rw++; + } else if (denyMode == NFS_OPEN_SHARE_DENY_WRITE) { + if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) + nofp->nof_d_r_dw++; + else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) + nofp->nof_d_w_dw++; + else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) + nofp->nof_d_rw_dw++; + } else { /* NFS_OPEN_SHARE_DENY_BOTH */ + if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) + nofp->nof_d_r_drw++; + else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) + nofp->nof_d_w_drw++; + else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) + nofp->nof_d_rw_drw++; + } } else { - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs_get_stateid: no stateid for %s\n", vname ? 
vname : "???"); - vnode_putname(vname); - sid->seqid = sid->other[0] = sid->other[1] = sid->other[2] = 0xffffffff; + if (denyMode == NFS_OPEN_SHARE_DENY_NONE) { + if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) + nofp->nof_r++; + else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) + nofp->nof_w++; + else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) + nofp->nof_rw++; + } else if (denyMode == NFS_OPEN_SHARE_DENY_WRITE) { + if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) + nofp->nof_r_dw++; + else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) + nofp->nof_w_dw++; + else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) + nofp->nof_rw_dw++; + } else { /* NFS_OPEN_SHARE_DENY_BOTH */ + if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) + nofp->nof_r_drw++; + else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) + nofp->nof_w_drw++; + else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) + nofp->nof_rw_drw++; + } } - if (nlop) - nfs_lock_owner_rele(nlop); - if (noop) - nfs_open_owner_rele(noop); + + nofp->nof_opencnt++; + lck_mtx_unlock(&nofp->nof_lock); } /* - * We always send the open RPC even if this open's mode is a subset of all - * the existing opens. This makes sure that we will always be able to do a - * downgrade to any of the open modes. - * - * Note: local conflicts should have already been checked. (nfs_open_file_find) + * Find which particular open combo will be closed and report what + * the new modes will be and whether the open was delegated. */ -int -nfs4_open( - nfsnode_t np, +void +nfs_open_file_remove_open_find( struct nfs_open_file *nofp, uint32_t accessMode, uint32_t denyMode, - vfs_context_t ctx) + uint32_t *newAccessMode, + uint32_t *newDenyMode, + int *delegated) { - vnode_t vp = NFSTOV(np); - vnode_t dvp = NULL; - struct componentname cn; - const char *vname = NULL; - size_t namelen; - char smallname[128]; - char *filename = NULL; - int error = 0, readtoo = 0; - - dvp = vnode_getparent(vp); - vname = vnode_getname(vp); - if (!dvp || !vname) { - error = EIO; - goto out; - } - filename = &smallname[0]; - namelen = snprintf(filename, sizeof(smallname), "%s", vname); - if (namelen >= sizeof(smallname)) { - namelen++; /* snprintf result doesn't include '\0' */ - MALLOC(filename, char *, namelen, M_TEMP, M_WAITOK); - if (!filename) { - error = ENOMEM; - goto out; - } - snprintf(filename, namelen, "%s", vname); - } - bzero(&cn, sizeof(cn)); - cn.cn_nameptr = filename; - cn.cn_namelen = namelen; - - if (!(accessMode & NFS_OPEN_SHARE_ACCESS_READ)) { - /* - * Try to open it for read access too, - * so the buffer cache can read data. 
-	 */
-		readtoo = 1;
-		accessMode |= NFS_OPEN_SHARE_ACCESS_READ;
-	}
-tryagain:
-	error = nfs4_open_rpc(nofp, ctx, &cn, NULL, dvp, &vp, NFS_OPEN_NOCREATE, accessMode, denyMode);
-	if (error) {
-		if (!nfs_mount_state_error_should_restart(error) && readtoo) {
-			/* try again without the extra read access */
-			accessMode &= ~NFS_OPEN_SHARE_ACCESS_READ;
-			readtoo = 0;
-			goto tryagain;
-		}
-		goto out;
-	}
-	nofp->nof_access |= accessMode;
-	nofp->nof_deny |= denyMode;
+	/*
+	 * Calculate new modes: a mode bit gets removed when there's only
+	 * one count in all the corresponding counts
+	 */
+	*newAccessMode = nofp->nof_access;
+	*newDenyMode = nofp->nof_deny;
+	if ((accessMode & NFS_OPEN_SHARE_ACCESS_READ) &&
+	    (nofp->nof_access & NFS_OPEN_SHARE_ACCESS_READ) &&
+	    ((nofp->nof_r + nofp->nof_d_r +
+	      nofp->nof_rw + nofp->nof_d_rw +
+	      nofp->nof_r_dw + nofp->nof_d_r_dw +
+	      nofp->nof_rw_dw + nofp->nof_d_rw_dw +
+	      nofp->nof_r_drw + nofp->nof_d_r_drw +
+	      nofp->nof_rw_drw + nofp->nof_d_rw_drw) == 1))
+		*newAccessMode &= ~NFS_OPEN_SHARE_ACCESS_READ;
+	if ((accessMode & NFS_OPEN_SHARE_ACCESS_WRITE) &&
+	    (nofp->nof_access & NFS_OPEN_SHARE_ACCESS_WRITE) &&
+	    ((nofp->nof_w + nofp->nof_d_w +
+	      nofp->nof_rw + nofp->nof_d_rw +
+	      nofp->nof_w_dw + nofp->nof_d_w_dw +
+	      nofp->nof_rw_dw + nofp->nof_d_rw_dw +
+	      nofp->nof_w_drw + nofp->nof_d_w_drw +
+	      nofp->nof_rw_drw + nofp->nof_d_rw_drw) == 1))
+		*newAccessMode &= ~NFS_OPEN_SHARE_ACCESS_WRITE;
+	if ((denyMode & NFS_OPEN_SHARE_DENY_READ) &&
+	    (nofp->nof_deny & NFS_OPEN_SHARE_DENY_READ) &&
+	    ((nofp->nof_r_drw + nofp->nof_d_r_drw +
+	      nofp->nof_w_drw + nofp->nof_d_w_drw +
+	      nofp->nof_rw_drw + nofp->nof_d_rw_drw) == 1))
+		*newDenyMode &= ~NFS_OPEN_SHARE_DENY_READ;
+	if ((denyMode & NFS_OPEN_SHARE_DENY_WRITE) &&
+	    (nofp->nof_deny & NFS_OPEN_SHARE_DENY_WRITE) &&
+	    ((nofp->nof_r_drw + nofp->nof_d_r_drw +
+	      nofp->nof_w_drw + nofp->nof_d_w_drw +
+	      nofp->nof_rw_drw + nofp->nof_d_rw_drw +
+	      nofp->nof_r_dw + nofp->nof_d_r_dw +
+	      nofp->nof_w_dw + nofp->nof_d_w_dw +
+	      nofp->nof_rw_dw + nofp->nof_d_rw_dw) == 1))
+		*newDenyMode &= ~NFS_OPEN_SHARE_DENY_WRITE;
+
+	/* Find the corresponding open access/deny mode counter. */
 	if (denyMode == NFS_OPEN_SHARE_DENY_NONE) {
 		if (accessMode == NFS_OPEN_SHARE_ACCESS_READ)
-			nofp->nof_r++;
+			*delegated = (nofp->nof_d_r != 0);
 		else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE)
-			nofp->nof_w++;
+			*delegated = (nofp->nof_d_w != 0);
 		else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH)
-			nofp->nof_rw++;
+			*delegated = (nofp->nof_d_rw != 0);
+		else
+			*delegated = 0;
 	} else if (denyMode == NFS_OPEN_SHARE_DENY_WRITE) {
 		if (accessMode == NFS_OPEN_SHARE_ACCESS_READ)
-			nofp->nof_r_dw++;
+			*delegated = (nofp->nof_d_r_dw != 0);
 		else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE)
-			nofp->nof_w_dw++;
+			*delegated = (nofp->nof_d_w_dw != 0);
 		else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH)
-			nofp->nof_rw_dw++;
+			*delegated = (nofp->nof_d_rw_dw != 0);
+		else
+			*delegated = 0;
 	} else { /* NFS_OPEN_SHARE_DENY_BOTH */
 		if (accessMode == NFS_OPEN_SHARE_ACCESS_READ)
-			nofp->nof_r_drw++;
+			*delegated = (nofp->nof_d_r_drw != 0);
 		else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE)
-			nofp->nof_w_drw++;
+			*delegated = (nofp->nof_d_w_drw != 0);
 		else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH)
-			nofp->nof_rw_drw++;
+			*delegated = (nofp->nof_d_rw_drw != 0);
+		else
+			*delegated = 0;
 	}
-	nofp->nof_opencnt++;
+}
+
+/*
+ * Remove the open state for the given access/deny modes from this open file.
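The hand-written 12-term sums in nfs_open_file_remove_open_find() above total every counter that implies a given mode bit; the bit can be dropped from nof_access/nof_deny only when the open being removed is the last such count. In the array model from the previous note this collapses to a loop (a sketch with stand-in field names):

#include <stdint.h>

struct of_model2 { uint32_t count[3][3], dcount[3][3]; };	/* [access R/W/RW][deny] */

/* Total of every open that grants READ: the ACC_R and ACC_RW rows, delegated or not. */
static uint32_t
read_open_total(const struct of_model2 *f)
{
	uint32_t total = 0;
	int d;

	for (d = 0; d < 3; d++)
		total += f->count[0][d] + f->dcount[0][d] +	/* ACC_R row */
		    f->count[2][d] + f->dcount[2][d];		/* ACC_RW row */
	return (total);		/* drop READ from the mode when this is exactly 1 */
}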
+ */ +void +nfs_open_file_remove_open(struct nfs_open_file *nofp, uint32_t accessMode, uint32_t denyMode) +{ + uint32_t newAccessMode, newDenyMode; + int delegated = 0; + + lck_mtx_lock(&nofp->nof_lock); + nfs_open_file_remove_open_find(nofp, accessMode, denyMode, &newAccessMode, &newDenyMode, &delegated); + + /* Decrement the corresponding open access/deny mode counter. */ + if (denyMode == NFS_OPEN_SHARE_DENY_NONE) { + if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) { + if (delegated) { + if (nofp->nof_d_r == 0) + NP(nofp->nof_np, "nfs: open(R) delegated count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_d_r--; + } else { + if (nofp->nof_r == 0) + NP(nofp->nof_np, "nfs: open(R) count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_r--; + } + } else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) { + if (delegated) { + if (nofp->nof_d_w == 0) + NP(nofp->nof_np, "nfs: open(W) delegated count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_d_w--; + } else { + if (nofp->nof_w == 0) + NP(nofp->nof_np, "nfs: open(W) count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_w--; + } + } else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) { + if (delegated) { + if (nofp->nof_d_rw == 0) + NP(nofp->nof_np, "nfs: open(RW) delegated count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_d_rw--; + } else { + if (nofp->nof_rw == 0) + NP(nofp->nof_np, "nfs: open(RW) count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_rw--; + } + } + } else if (denyMode == NFS_OPEN_SHARE_DENY_WRITE) { + if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) { + if (delegated) { + if (nofp->nof_d_r_dw == 0) + NP(nofp->nof_np, "nfs: open(R,DW) delegated count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_d_r_dw--; + } else { + if (nofp->nof_r_dw == 0) + NP(nofp->nof_np, "nfs: open(R,DW) count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_r_dw--; + } + } else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) { + if (delegated) { + if (nofp->nof_d_w_dw == 0) + NP(nofp->nof_np, "nfs: open(W,DW) delegated count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_d_w_dw--; + } else { + if (nofp->nof_w_dw == 0) + NP(nofp->nof_np, "nfs: open(W,DW) count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_w_dw--; + } + } else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) { + if (delegated) { + if (nofp->nof_d_rw_dw == 0) + NP(nofp->nof_np, "nfs: open(RW,DW) delegated count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_d_rw_dw--; + } else { + if (nofp->nof_rw_dw == 0) + NP(nofp->nof_np, "nfs: open(RW,DW) count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_rw_dw--; + } + } + } else { /* NFS_OPEN_SHARE_DENY_BOTH */ + if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) { + if (delegated) { + if (nofp->nof_d_r_drw == 0) + NP(nofp->nof_np, "nfs: open(R,DRW) delegated count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_d_r_drw--; + } else { + if (nofp->nof_r_drw == 0) + NP(nofp->nof_np, "nfs: open(R,DRW) count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_r_drw--; + } + } else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) { + if (delegated) { + if (nofp->nof_d_w_drw == 0) + NP(nofp->nof_np, "nfs: 
open(W,DRW) delegated count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_d_w_drw--; + } else { + if (nofp->nof_w_drw == 0) + NP(nofp->nof_np, "nfs: open(W,DRW) count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_w_drw--; + } + } else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) { + if (delegated) { + if (nofp->nof_d_rw_drw == 0) + NP(nofp->nof_np, "nfs: open(RW,DRW) delegated count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_d_rw_drw--; + } else { + if (nofp->nof_rw_drw == 0) + NP(nofp->nof_np, "nfs: open(RW,DRW) count underrun, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + else + nofp->nof_rw_drw--; + } + } + } + + /* update the modes */ + nofp->nof_access = newAccessMode; + nofp->nof_deny = newDenyMode; + nofp->nof_opencnt--; + lck_mtx_unlock(&nofp->nof_lock); +} + + +/* + * Get the current (delegation, lock, open, default) stateid for this node. + * If node has a delegation, use that stateid. + * If pid has a lock, use the lockowner's stateid. + * Or use the open file's stateid. + * If no open file, use a default stateid of all ones. + */ +void +nfs_get_stateid(nfsnode_t np, thread_t thd, kauth_cred_t cred, nfs_stateid *sid) +{ + struct nfsmount *nmp = NFSTONMP(np); + proc_t p = thd ? get_bsdthreadtask_info(thd) : current_proc(); // XXX async I/O requests don't have a thread + struct nfs_open_owner *noop = NULL; + struct nfs_open_file *nofp = NULL; + struct nfs_lock_owner *nlop = NULL; + nfs_stateid *s = NULL; + + if (np->n_openflags & N_DELEG_MASK) { + s = &np->n_dstateid; + } else { + if (p) + nlop = nfs_lock_owner_find(np, p, 0); + if (nlop && !TAILQ_EMPTY(&nlop->nlo_locks)) { + /* we hold locks, use lock stateid */ + s = &nlop->nlo_stateid; + } else if (((noop = nfs_open_owner_find(nmp, cred, 0))) && + (nfs_open_file_find(np, noop, &nofp, 0, 0, 0) == 0) && + !(nofp->nof_flags & NFS_OPEN_FILE_LOST) && + nofp->nof_access) { + /* we (should) have the file open, use open stateid */ + if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) + nfs4_reopen(nofp, thd); + if (!(nofp->nof_flags & NFS_OPEN_FILE_LOST)) + s = &nofp->nof_stateid; + } + } + + if (s) { + sid->seqid = s->seqid; + sid->other[0] = s->other[0]; + sid->other[1] = s->other[1]; + sid->other[2] = s->other[2]; + } else { + /* named attributes may not have a stateid for reads, so don't complain for them */ + if (!(np->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR)) + NP(np, "nfs_get_stateid: no stateid"); + sid->seqid = sid->other[0] = sid->other[1] = sid->other[2] = 0xffffffff; + } + if (nlop) + nfs_lock_owner_rele(nlop); + if (noop) + nfs_open_owner_rele(noop); +} + + +/* + * When we have a delegation, we may be able to perform the OPEN locally. + * Perform the OPEN by checking the delegation ACE and/or checking via ACCESS. + */ +int +nfs4_open_delegated( + nfsnode_t np, + struct nfs_open_file *nofp, + uint32_t accessMode, + uint32_t denyMode, + vfs_context_t ctx) +{ + int error = 0, ismember, readtoo = 0, authorized = 0; + uint32_t action; + struct kauth_acl_eval eval; + kauth_cred_t cred = vfs_context_ucred(ctx); + + if (!(accessMode & NFS_OPEN_SHARE_ACCESS_READ)) { + /* + * Try to open it for read access too, + * so the buffer cache can read data. 
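nfs_get_stateid() above picks the most specific state available: a delegation stateid first, then the lock owner's stateid if this process holds locks, then the open file's stateid, and finally the all-ones special stateid. Reduced to its shape; the three inputs are assumptions standing in for the node and owner lookups:

#include <stdint.h>
#include <string.h>

typedef struct { uint32_t seqid, other[3]; } stateid_model;

static void
pick_stateid(const stateid_model *deleg, const stateid_model *lockst,
    const stateid_model *openst, stateid_model *sid)
{
	const stateid_model *s = deleg ? deleg : (lockst ? lockst : openst);

	if (s)
		*sid = *s;
	else
		memset(sid, 0xff, sizeof(*sid));	/* all-ones default stateid */
}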
+ */ + readtoo = 1; + accessMode |= NFS_OPEN_SHARE_ACCESS_READ; + } + +tryagain: + action = 0; + if (accessMode & NFS_OPEN_SHARE_ACCESS_READ) + action |= KAUTH_VNODE_READ_DATA; + if (accessMode & NFS_OPEN_SHARE_ACCESS_WRITE) + action |= KAUTH_VNODE_WRITE_DATA; + + /* evaluate ACE (if we have one) */ + if (np->n_dace.ace_flags) { + eval.ae_requested = action; + eval.ae_acl = &np->n_dace; + eval.ae_count = 1; + eval.ae_options = 0; + if (np->n_vattr.nva_uid == kauth_cred_getuid(cred)) + eval.ae_options |= KAUTH_AEVAL_IS_OWNER; + error = kauth_cred_ismember_gid(cred, np->n_vattr.nva_gid, &ismember); + if (!error && ismember) + eval.ae_options |= KAUTH_AEVAL_IN_GROUP; + + eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS; + eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS; + eval.ae_exp_gwrite = KAUTH_VNODE_GENERIC_WRITE_BITS; + eval.ae_exp_gexec = KAUTH_VNODE_GENERIC_EXECUTE_BITS; + + error = kauth_acl_evaluate(cred, &eval); + + if (!error && (eval.ae_result == KAUTH_RESULT_ALLOW)) + authorized = 1; + } + + if (!authorized) { + /* need to ask the server via ACCESS */ + struct vnop_access_args naa; + naa.a_desc = &vnop_access_desc; + naa.a_vp = NFSTOV(np); + naa.a_action = action; + naa.a_context = ctx; + if (!(error = nfs_vnop_access(&naa))) + authorized = 1; + } + + if (!authorized) { + if (readtoo) { + /* try again without the extra read access */ + accessMode &= ~NFS_OPEN_SHARE_ACCESS_READ; + readtoo = 0; + goto tryagain; + } + return (error ? error : EACCES); + } + + nfs_open_file_add_open(nofp, accessMode, denyMode, 1); + + return (0); +} + + +/* + * Open a file with the given access/deny modes. + * + * If we have a delegation, we may be able to handle the open locally. + * Otherwise, we will always send the open RPC even if this open's mode is + * a subset of all the existing opens. This makes sure that we will always + * be able to do a downgrade to any of the open modes. + * + * Note: local conflicts should have already been checked in nfs_open_file_find(). + */ +int +nfs4_open( + nfsnode_t np, + struct nfs_open_file *nofp, + uint32_t accessMode, + uint32_t denyMode, + vfs_context_t ctx) +{ + vnode_t vp = NFSTOV(np); + vnode_t dvp = NULL; + struct componentname cn; + const char *vname = NULL; + size_t namelen; + char smallname[128]; + char *filename = NULL; + int error = 0, readtoo = 0; + + /* + * We can handle the OPEN ourselves if we have a delegation, + * unless it's a read delegation and the open is asking for + * either write access or deny read. We also don't bother to + * use the delegation if it's being returned. + */ + if (np->n_openflags & N_DELEG_MASK) { + if ((error = nfs_open_state_set_busy(np, vfs_context_thread(ctx)))) + return (error); + if ((np->n_openflags & N_DELEG_MASK) && !(np->n_openflags & N_DELEG_RETURN) && + (((np->n_openflags & N_DELEG_MASK) == N_DELEG_WRITE) || + (!(accessMode & NFS_OPEN_SHARE_ACCESS_WRITE) && !(denyMode & NFS_OPEN_SHARE_DENY_READ)))) { + error = nfs4_open_delegated(np, nofp, accessMode, denyMode, ctx); + nfs_open_state_clear_busy(np); + return (error); + } + nfs_open_state_clear_busy(np); + } + + /* + * [sigh] We can't trust VFS to get the parent right for named + * attribute nodes. (It likes to reparent the nodes after we've + * created them.) Luckily we can probably get the right parent + * from the n_parent we have stashed away. 
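nfs4_open_delegated() above authorizes an OPEN without talking to the server when possible: it first evaluates the ACE cached with the delegation, falls back to an ACCESS RPC, and if the opportunistic extra read access is what failed, retries without it. The control flow in miniature; ace_allows() and server_access() are stand-ins for kauth_acl_evaluate() and the nfs_vnop_access() fallback:

#include <errno.h>

#define WANT_READ	0x1
#define WANT_WRITE	0x2

static int ace_allows(int action) { (void)action; return (0); }		/* stand-in */
static int server_access(int action) { (void)action; return (1); }	/* stand-in */

static int
open_delegated_model(int action)
{
	int readtoo = 0;

	if (!(action & WANT_READ)) {
		readtoo = 1;		/* add read so the buffer cache can read */
		action |= WANT_READ;
	}
tryagain:
	if (ace_allows(action) || server_access(action))
		return (0);		/* OPEN satisfied locally under the delegation */
	if (readtoo) {
		action &= ~WANT_READ;	/* retry without the extra read access */
		readtoo = 0;
		goto tryagain;
	}
	return (EACCES);
}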
+ */ + if ((np->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR) && + (((dvp = np->n_parent)) && (error = vnode_get(dvp)))) + dvp = NULL; + if (!dvp) + dvp = vnode_getparent(vp); + vname = vnode_getname(vp); + if (!dvp || !vname) { + if (!error) + error = EIO; + goto out; + } + filename = &smallname[0]; + namelen = snprintf(filename, sizeof(smallname), "%s", vname); + if (namelen >= sizeof(smallname)) { + MALLOC(filename, char *, namelen+1, M_TEMP, M_WAITOK); + if (!filename) { + error = ENOMEM; + goto out; + } + snprintf(filename, namelen+1, "%s", vname); + } + bzero(&cn, sizeof(cn)); + cn.cn_nameptr = filename; + cn.cn_namelen = namelen; + + if (!(accessMode & NFS_OPEN_SHARE_ACCESS_READ)) { + /* + * Try to open it for read access too, + * so the buffer cache can read data. + */ + readtoo = 1; + accessMode |= NFS_OPEN_SHARE_ACCESS_READ; + } +tryagain: + error = nfs4_open_rpc(nofp, ctx, &cn, NULL, dvp, &vp, NFS_OPEN_NOCREATE, accessMode, denyMode); + if (error) { + if (!nfs_mount_state_error_should_restart(error) && + (error != EINTR) && (error != ERESTART) && readtoo) { + /* try again without the extra read access */ + accessMode &= ~NFS_OPEN_SHARE_ACCESS_READ; + readtoo = 0; + goto tryagain; + } + goto out; + } + nfs_open_file_add_open(nofp, accessMode, denyMode, 0); out: if (filename && (filename != &smallname[0])) FREE(filename, M_TEMP); @@ -2035,142 +2651,176 @@ out: return (error); } - int -nfs4_vnop_open( - struct vnop_open_args /* { +nfs_vnop_mmap( + struct vnop_mmap_args /* { struct vnodeop_desc *a_desc; vnode_t a_vp; - int a_mode; + int a_fflags; vfs_context_t a_context; } */ *ap) { vfs_context_t ctx = ap->a_context; vnode_t vp = ap->a_vp; nfsnode_t np = VTONFS(vp); + int error = 0, accessMode, denyMode, delegated; struct nfsmount *nmp; - int error, accessMode, denyMode, opened = 0; struct nfs_open_owner *noop = NULL; struct nfs_open_file *nofp = NULL; - if (!(ap->a_mode & (FREAD|FWRITE))) - return (EINVAL); - nmp = VTONMP(vp); if (!nmp) return (ENXIO); - /* First, call the common code */ - if ((error = nfs3_vnop_open(ap))) - return (error); - - if (!vnode_isreg(vp)) { - /* Just mark that it was opened */ - lck_mtx_lock(&np->n_openlock); - np->n_openrefcnt++; - lck_mtx_unlock(&np->n_openlock); - return (0); - } + if (!vnode_isreg(vp) || !(ap->a_fflags & (PROT_READ|PROT_WRITE))) + return (EINVAL); + if (np->n_flag & NREVOKE) + return (EIO); - /* mode contains some combination of: FREAD, FWRITE, O_SHLOCK, O_EXLOCK */ - accessMode = 0; - if (ap->a_mode & FREAD) - accessMode |= NFS_OPEN_SHARE_ACCESS_READ; - if (ap->a_mode & FWRITE) + /* + * fflags contains some combination of: PROT_READ, PROT_WRITE + * Since it's not possible to mmap() without having the file open for reading, + * read access is always there (regardless if PROT_READ is not set). 
+ */ + accessMode = NFS_OPEN_SHARE_ACCESS_READ; + if (ap->a_fflags & PROT_WRITE) accessMode |= NFS_OPEN_SHARE_ACCESS_WRITE; - if (ap->a_mode & O_EXLOCK) - denyMode = NFS_OPEN_SHARE_DENY_BOTH; - else if (ap->a_mode & O_SHLOCK) - denyMode = NFS_OPEN_SHARE_DENY_WRITE; - else - denyMode = NFS_OPEN_SHARE_DENY_NONE; + denyMode = NFS_OPEN_SHARE_DENY_NONE; noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 1); if (!noop) return (ENOMEM); restart: - error = nfs_mount_state_in_use_start(nmp); + error = nfs_mount_state_in_use_start(nmp, NULL); if (error) { nfs_open_owner_rele(noop); return (error); } - - error = nfs_open_file_find(np, noop, &nofp, accessMode, denyMode, 1); - if (!error && (nofp->nof_flags & NFS_OPEN_FILE_LOST)) { - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs_vnop_open: LOST %s\n", vname); - vnode_putname(vname); + if (np->n_flag & NREVOKE) { error = EIO; + nfs_mount_state_in_use_end(nmp, 0); + nfs_open_owner_rele(noop); + return (error); + } + + error = nfs_open_file_find(np, noop, &nofp, 0, 0, 1); + if (error || (!error && (nofp->nof_flags & NFS_OPEN_FILE_LOST))) { + NP(np, "nfs_vnop_mmap: no open file for owner, error %d, %d", error, kauth_cred_getuid(noop->noo_cred)); + error = EPERM; } if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { nfs_mount_state_in_use_end(nmp, 0); - nfs4_reopen(nofp, vfs_context_thread(ctx)); + error = nfs4_reopen(nofp, NULL); nofp = NULL; - goto restart; + if (!error) + goto restart; } if (!error) - error = nfs_open_file_set_busy(nofp, vfs_context_thread(ctx)); + error = nfs_open_file_set_busy(nofp, NULL); if (error) { nofp = NULL; goto out; } /* - * If we just created the file and the modes match, then we simply use - * the open performed in the create. Otherwise, send the request. + * The open reference for mmap must mirror an existing open because + * we may need to reclaim it after the file is closed. + * So grab another open count matching the accessMode passed in. + * If we already had an mmap open, prefer read/write without deny mode. + * This means we may have to drop the current mmap open first. */ - if ((nofp->nof_flags & NFS_OPEN_FILE_CREATE) && - (nofp->nof_creator == current_thread()) && - (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) && - (denyMode == NFS_OPEN_SHARE_DENY_NONE)) { - nofp->nof_flags &= ~NFS_OPEN_FILE_CREATE; - nofp->nof_creator = NULL; - } else { - if (!opened) - error = nfs4_open(np, nofp, accessMode, denyMode, ctx); - if ((error == EACCES) && (nofp->nof_flags & NFS_OPEN_FILE_CREATE) && - (nofp->nof_creator == current_thread())) { - /* - * Ugh. This can happen if we just created the file with read-only - * perms and we're trying to open it for real with different modes - * (e.g. write-only or with a deny mode) and the server decides to - * not allow the second open because of the read-only perms. - * The best we can do is to just use the create's open. - * We may have access we don't need or we may not have a requested - * deny mode. We may log complaints later, but we'll try to avoid it. 
- */ - if (denyMode != NFS_OPEN_SHARE_DENY_NONE) { - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs4_vnop_open: deny mode foregone on create, %s\n", vname); - vnode_putname(vname); - } - nofp->nof_creator = NULL; + + if (!nofp->nof_access) { + if (accessMode != NFS_OPEN_SHARE_ACCESS_READ) { + /* not asking for just read access -> fail */ + error = EPERM; + goto out; + } + /* we don't have the file open, so open it for read access */ + if (nmp->nm_vers < NFS_VER4) { + /* NFS v2/v3 opens are always allowed - so just add it. */ + nfs_open_file_add_open(nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, 0); error = 0; + } else { + error = nfs4_open(np, nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, ctx); } + if (!error) + nofp->nof_flags |= NFS_OPEN_FILE_NEEDCLOSE; if (error) goto out; - opened = 1; - /* - * If we had just created the file, we already had it open. - * If the actual open mode is less than what we grabbed at - * create time, then we'll downgrade the open here. - */ - if ((nofp->nof_flags & NFS_OPEN_FILE_CREATE) && - (nofp->nof_creator == current_thread())) { - error = nfs4_close(np, nofp, NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_NONE, ctx); - if (error) { - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs_vnop_open: create close error %d, %s\n", error, vname); - vnode_putname(vname); - } - if (!nfs_mount_state_error_should_restart(error)) { - error = 0; - nofp->nof_flags &= ~NFS_OPEN_FILE_CREATE; - } + } + + /* determine deny mode for open */ + if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) { + if (nofp->nof_d_rw || nofp->nof_d_rw_dw || nofp->nof_d_rw_drw) { + delegated = 1; + if (nofp->nof_d_rw) + denyMode = NFS_OPEN_SHARE_DENY_NONE; + else if (nofp->nof_d_rw_dw) + denyMode = NFS_OPEN_SHARE_DENY_WRITE; + else if (nofp->nof_d_rw_drw) + denyMode = NFS_OPEN_SHARE_DENY_BOTH; + } else if (nofp->nof_rw || nofp->nof_rw_dw || nofp->nof_rw_drw) { + delegated = 0; + if (nofp->nof_rw) + denyMode = NFS_OPEN_SHARE_DENY_NONE; + else if (nofp->nof_rw_dw) + denyMode = NFS_OPEN_SHARE_DENY_WRITE; + else if (nofp->nof_rw_drw) + denyMode = NFS_OPEN_SHARE_DENY_BOTH; + } else { + error = EPERM; + } + } else { /* NFS_OPEN_SHARE_ACCESS_READ */ + if (nofp->nof_d_r || nofp->nof_d_r_dw || nofp->nof_d_r_drw) { + delegated = 1; + if (nofp->nof_d_r) + denyMode = NFS_OPEN_SHARE_DENY_NONE; + else if (nofp->nof_d_r_dw) + denyMode = NFS_OPEN_SHARE_DENY_WRITE; + else if (nofp->nof_d_r_drw) + denyMode = NFS_OPEN_SHARE_DENY_BOTH; + } else if (nofp->nof_r || nofp->nof_r_dw || nofp->nof_r_drw) { + delegated = 0; + if (nofp->nof_r) + denyMode = NFS_OPEN_SHARE_DENY_NONE; + else if (nofp->nof_r_dw) + denyMode = NFS_OPEN_SHARE_DENY_WRITE; + else if (nofp->nof_r_drw) + denyMode = NFS_OPEN_SHARE_DENY_BOTH; + } else { + error = EPERM; + } + } + if (error) /* mmap mode without proper open mode */ + goto out; + + /* + * If the existing mmap access is more than the new access OR the + * existing access is the same and the existing deny mode is less, + * then we'll stick with the existing mmap open mode. 
+ */ + if ((nofp->nof_mmap_access > accessMode) || + ((nofp->nof_mmap_access == accessMode) && (nofp->nof_mmap_deny <= denyMode))) + goto out; + + /* update mmap open mode */ + if (nofp->nof_mmap_access) { + error = nfs_close(np, nofp, nofp->nof_mmap_access, nofp->nof_mmap_deny, ctx); + if (error) { + if (!nfs_mount_state_error_should_restart(error)) + NP(np, "nfs_vnop_mmap: close of previous mmap mode failed: %d, %d", error, kauth_cred_getuid(nofp->nof_owner->noo_cred)); + NP(np, "nfs_vnop_mmap: update, close error %d, %d", error, kauth_cred_getuid(nofp->nof_owner->noo_cred)); + goto out; } + nofp->nof_mmap_access = nofp->nof_mmap_deny = 0; } + nfs_open_file_add_open(nofp, accessMode, denyMode, delegated); + nofp->nof_mmap_access = accessMode; + nofp->nof_mmap_deny = denyMode; + out: if (nofp) nfs_open_file_clear_busy(nofp); @@ -2180,601 +2830,134 @@ out: } if (noop) nfs_open_owner_rele(noop); - if (error) { - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs_vnop_open: error %d, %s\n", error, vname); - vnode_putname(vname); - } return (error); } -int -nfs4_close( - nfsnode_t np, - struct nfs_open_file *nofp, - uint32_t accessMode, - uint32_t denyMode, - vfs_context_t ctx) -{ - struct nfs_lock_owner *nlop; - int error = 0, changed = 0, closed = 0; - uint32_t newAccessMode, newDenyMode; - - /* warn if modes don't match current state */ - if (((accessMode & nofp->nof_access) != accessMode) || ((denyMode & nofp->nof_deny) != denyMode)) { - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs4_close: mode mismatch %d %d, current %d %d, %s\n", - accessMode, denyMode, nofp->nof_access, nofp->nof_deny, vname); - vnode_putname(vname); - } - - /* - * If we're closing a write-only open, we may not have a write-only count - * if we also grabbed read access. So, check the read-write count. 
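The mmap path above must mirror an existing open (so it can be reclaimed after the descriptor is closed); it picks the least restrictive deny mode among the opens that already grant the required access, preferring delegated counts, and only upgrades, never downgrades, an existing mmap open. The deny-mode pick, sketched; the counters are stand-ins for nof_rw, nof_rw_dw and nof_rw_drw:

#include <errno.h>
#include <stdint.h>

enum deny_model { DM_NONE, DM_WRITE, DM_BOTH };

static int
mmap_pick_deny(uint32_t rw, uint32_t rw_dw, uint32_t rw_drw, enum deny_model *denyp)
{
	if (rw)
		*denyp = DM_NONE;	/* least restrictive existing open wins */
	else if (rw_dw)
		*denyp = DM_WRITE;
	else if (rw_drw)
		*denyp = DM_BOTH;
	else
		return (EPERM);		/* mmap mode without a matching open mode */
	return (0);
}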
- */ - if (denyMode == NFS_OPEN_SHARE_DENY_NONE) { - if ((accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) && - (nofp->nof_w == 0) && nofp->nof_rw) - accessMode = NFS_OPEN_SHARE_ACCESS_BOTH; - } else if (denyMode == NFS_OPEN_SHARE_DENY_WRITE) { - if ((accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) && - (nofp->nof_w_dw == 0) && nofp->nof_rw_dw) - accessMode = NFS_OPEN_SHARE_ACCESS_BOTH; - } else { /* NFS_OPEN_SHARE_DENY_BOTH */ - if ((accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) && - (nofp->nof_w_drw == 0) && nofp->nof_rw_drw) - accessMode = NFS_OPEN_SHARE_ACCESS_BOTH; - } - - /* - * Calculate new modes: a mode bit gets removed when there's only - * one count in all the corresponding counts - */ - newAccessMode = nofp->nof_access; - newDenyMode = nofp->nof_deny; - if ((accessMode & NFS_OPEN_SHARE_ACCESS_READ) && - (newAccessMode & NFS_OPEN_SHARE_ACCESS_READ) && - ((nofp->nof_r + nofp->nof_rw + nofp->nof_r_dw + - nofp->nof_rw_dw + nofp->nof_r_drw + nofp->nof_rw_dw) == 1)) { - newAccessMode &= ~NFS_OPEN_SHARE_ACCESS_READ; - changed = 1; - } - if ((accessMode & NFS_OPEN_SHARE_ACCESS_WRITE) && - (newAccessMode & NFS_OPEN_SHARE_ACCESS_WRITE) && - ((nofp->nof_w + nofp->nof_rw + nofp->nof_w_dw + - nofp->nof_rw_dw + nofp->nof_w_drw + nofp->nof_rw_dw) == 1)) { - newAccessMode &= ~NFS_OPEN_SHARE_ACCESS_WRITE; - changed = 1; - } - if ((denyMode & NFS_OPEN_SHARE_DENY_READ) && - (newDenyMode & NFS_OPEN_SHARE_DENY_READ) && - ((nofp->nof_r_drw + nofp->nof_w_drw + nofp->nof_rw_drw) == 1)) { - newDenyMode &= ~NFS_OPEN_SHARE_DENY_READ; - changed = 1; - } - if ((denyMode & NFS_OPEN_SHARE_DENY_WRITE) && - (newDenyMode & NFS_OPEN_SHARE_DENY_WRITE) && - ((nofp->nof_r_drw + nofp->nof_w_drw + nofp->nof_rw_drw + - nofp->nof_r_dw + nofp->nof_w_dw + nofp->nof_rw_dw) == 1)) { - newDenyMode &= ~NFS_OPEN_SHARE_DENY_WRITE; - changed = 1; - } - - - if ((newAccessMode == 0) || (nofp->nof_opencnt == 1)) { - /* - * No more access after this close, so clean up and close it. - */ - closed = 1; - if (!(nofp->nof_flags & NFS_OPEN_FILE_LOST)) - error = nfs4_close_rpc(np, nofp, vfs_context_thread(ctx), vfs_context_ucred(ctx), 0); - if (error == NFSERR_LOCKS_HELD) { - /* - * Hmm... the server says we have locks we need to release first - * Find the lock owner and try to unlock everything. - */ - nlop = nfs_lock_owner_find(np, vfs_context_proc(ctx), 0); - if (nlop) { - nfs4_unlock_rpc(np, nlop, F_WRLCK, 0, UINT64_MAX, ctx); - nfs_lock_owner_rele(nlop); - } - error = nfs4_close_rpc(np, nofp, vfs_context_thread(ctx), vfs_context_ucred(ctx), 0); - } - } else if (changed) { - /* - * File is still open but with less access, so downgrade the open. - */ - if (!(nofp->nof_flags & NFS_OPEN_FILE_LOST)) - error = nfs4_open_downgrade_rpc(np, nofp, ctx); - } - - if (error) { - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs4_close: error %d, %s\n", error, vname); - vnode_putname(vname); - return (error); - } - - /* Decrement the corresponding open access/deny mode counter. 
*/ - if (denyMode == NFS_OPEN_SHARE_DENY_NONE) { - if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) { - if (nofp->nof_r == 0) - printf("nfs4_close: open(R) count underrun\n"); - else - nofp->nof_r--; - } else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) { - if (nofp->nof_w == 0) - printf("nfs4_close: open(W) count underrun\n"); - else - nofp->nof_w--; - } else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) { - if (nofp->nof_rw == 0) - printf("nfs4_close: open(RW) count underrun\n"); - else - nofp->nof_rw--; - } - } else if (denyMode == NFS_OPEN_SHARE_DENY_WRITE) { - if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) { - if (nofp->nof_r_dw == 0) - printf("nfs4_close: open(R,DW) count underrun\n"); - else - nofp->nof_r_dw--; - } else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) { - if (nofp->nof_w_dw == 0) - printf("nfs4_close: open(W,DW) count underrun\n"); - else - nofp->nof_w_dw--; - } else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) { - if (nofp->nof_rw_dw == 0) - printf("nfs4_close: open(RW,DW) count underrun\n"); - else - nofp->nof_rw_dw--; - } - } else { /* NFS_OPEN_SHARE_DENY_BOTH */ - if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) { - if (nofp->nof_r_drw == 0) - printf("nfs4_close: open(R,DRW) count underrun\n"); - else - nofp->nof_r_drw--; - } else if (accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) { - if (nofp->nof_w_drw == 0) - printf("nfs4_close: open(W,DRW) count underrun\n"); - else - nofp->nof_w_drw--; - } else if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) { - if (nofp->nof_rw_drw == 0) - printf("nfs4_close: open(RW,DRW) count underrun\n"); - else - nofp->nof_rw_drw--; - } - } - /* update the modes */ - nofp->nof_access = newAccessMode; - nofp->nof_deny = newDenyMode; - if (closed) { - if (nofp->nof_r || nofp->nof_w || - (nofp->nof_rw && !((nofp->nof_flags & NFS_OPEN_FILE_CREATE) && !nofp->nof_creator && (nofp->nof_rw == 1))) || - nofp->nof_r_dw || nofp->nof_w_dw || nofp->nof_rw_dw || - nofp->nof_r_drw || nofp->nof_w_drw || nofp->nof_rw_drw) - printf("nfs4_close: unexpected count: %u %u %u dw %u %u %u drw %u %u %u flags 0x%x\n", - nofp->nof_r, nofp->nof_w, nofp->nof_rw, - nofp->nof_r_dw, nofp->nof_w_dw, nofp->nof_rw_dw, - nofp->nof_r_drw, nofp->nof_w_drw, nofp->nof_rw_drw, - nofp->nof_flags); - /* clear out all open info, just to be safe */ - nofp->nof_access = nofp->nof_deny = 0; - nofp->nof_mmap_access = nofp->nof_mmap_deny = 0; - nofp->nof_r = nofp->nof_w = nofp->nof_rw = 0; - nofp->nof_r_dw = nofp->nof_w_dw = nofp->nof_rw_dw = 0; - nofp->nof_r_drw = nofp->nof_w_drw = nofp->nof_rw_drw = 0; - nofp->nof_flags &= ~NFS_OPEN_FILE_CREATE; - /* XXX we may potentially want to clean up idle/unused open file structures */ - } - nofp->nof_opencnt--; - if (nofp->nof_flags & NFS_OPEN_FILE_LOST) { - error = EIO; - if (!nofp->nof_opencnt) - nofp->nof_flags &= ~NFS_OPEN_FILE_LOST; - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs_close: LOST%s, %s\n", !(nofp->nof_flags & NFS_OPEN_FILE_LOST) ? 
" (last)" : "", vname); - vnode_putname(vname); - } - return (error); -} int -nfs4_vnop_close( - struct vnop_close_args /* { +nfs_vnop_mnomap( + struct vnop_mnomap_args /* { struct vnodeop_desc *a_desc; vnode_t a_vp; - int a_fflag; vfs_context_t a_context; } */ *ap) { vfs_context_t ctx = ap->a_context; vnode_t vp = ap->a_vp; - int fflag = ap->a_fflag; - int error, common_error, accessMode, denyMode; nfsnode_t np = VTONFS(vp); struct nfsmount *nmp; - struct nfs_open_owner *noop = NULL; struct nfs_open_file *nofp = NULL; + off_t size; + int error; nmp = VTONMP(vp); if (!nmp) return (ENXIO); - /* First, call the common code */ - common_error = nfs3_vnop_close(ap); + /* flush buffers/ubc before we drop the open (in case it's our last open) */ + nfs_flush(np, MNT_WAIT, vfs_context_thread(ctx), V_IGNORE_WRITEERR); + if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) + ubc_msync(vp, 0, size, NULL, UBC_PUSHALL | UBC_SYNC); - if (!vnode_isreg(vp)) { - /* Just mark that it was closed */ - lck_mtx_lock(&np->n_openlock); - np->n_openrefcnt--; + /* walk all open files and close all mmap opens */ +loop: + error = nfs_mount_state_in_use_start(nmp, NULL); + if (error) + return (error); + lck_mtx_lock(&np->n_openlock); + TAILQ_FOREACH(nofp, &np->n_opens, nof_link) { + if (!nofp->nof_mmap_access) + continue; lck_mtx_unlock(&np->n_openlock); - return (common_error); + if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) { + nfs_mount_state_in_use_end(nmp, 0); + error = nfs4_reopen(nofp, NULL); + if (!error) + goto loop; + } + if (!error) + error = nfs_open_file_set_busy(nofp, NULL); + if (error) { + lck_mtx_lock(&np->n_openlock); + break; + } + if (nofp->nof_mmap_access) { + error = nfs_close(np, nofp, nofp->nof_mmap_access, nofp->nof_mmap_deny, ctx); + if (!nfs_mount_state_error_should_restart(error)) { + if (error) /* not a state-operation-restarting error, so just clear the access */ + NP(np, "nfs_vnop_mnomap: close of mmap mode failed: %d, %d", error, kauth_cred_getuid(nofp->nof_owner->noo_cred)); + nofp->nof_mmap_access = nofp->nof_mmap_deny = 0; + } + if (error) + NP(np, "nfs_vnop_mnomap: error %d, %d", error, kauth_cred_getuid(nofp->nof_owner->noo_cred)); + } + nfs_open_file_clear_busy(nofp); + nfs_mount_state_in_use_end(nmp, error); + goto loop; } + lck_mtx_unlock(&np->n_openlock); + nfs_mount_state_in_use_end(nmp, error); + return (error); +} - noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 0); - if (!noop) { - printf("nfs4_vnop_close: can't get open owner!\n"); - return (EIO); - } +/* + * Search a node's lock owner list for the owner for this process. + * If not found and "alloc" is set, then allocate a new one. + */ +struct nfs_lock_owner * +nfs_lock_owner_find(nfsnode_t np, proc_t p, int alloc) +{ + pid_t pid = proc_pid(p); + struct nfs_lock_owner *nlop, *newnlop = NULL; -restart: - error = nfs_mount_state_in_use_start(nmp); - if (error) { - nfs_open_owner_rele(noop); - return (error); +tryagain: + lck_mtx_lock(&np->n_openlock); + TAILQ_FOREACH(nlop, &np->n_lock_owners, nlo_link) { + if (nlop->nlo_pid != pid) + continue; + if (timevalcmp(&nlop->nlo_pid_start, &p->p_start, ==)) + break; + /* stale lock owner... 
reuse it if we can */ + if (nlop->nlo_refcnt) { + TAILQ_REMOVE(&np->n_lock_owners, nlop, nlo_link); + nlop->nlo_flags &= ~NFS_LOCK_OWNER_LINK; + lck_mtx_unlock(&np->n_openlock); + goto tryagain; + } + nlop->nlo_pid_start = p->p_start; + nlop->nlo_seqid = 0; + nlop->nlo_stategenid = 0; + break; } - error = nfs_open_file_find(np, noop, &nofp, 0, 0, 0); - if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { - nfs_mount_state_in_use_end(nmp, 0); - nfs4_reopen(nofp, vfs_context_thread(ctx)); - nofp = NULL; - goto restart; - } - if (error) { - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs4_vnop_close: no open file for owner %d, %s\n", error, vname); - vnode_putname(vname); - error = EBADF; - goto out; + if (!nlop && !newnlop && alloc) { + lck_mtx_unlock(&np->n_openlock); + MALLOC(newnlop, struct nfs_lock_owner *, sizeof(struct nfs_lock_owner), M_TEMP, M_WAITOK); + if (!newnlop) + return (NULL); + bzero(newnlop, sizeof(*newnlop)); + lck_mtx_init(&newnlop->nlo_lock, nfs_open_grp, LCK_ATTR_NULL); + newnlop->nlo_pid = pid; + newnlop->nlo_pid_start = p->p_start; + newnlop->nlo_name = OSAddAtomic(1, &nfs_lock_owner_seqnum); + TAILQ_INIT(&newnlop->nlo_locks); + goto tryagain; } - error = nfs_open_file_set_busy(nofp, vfs_context_thread(ctx)); - if (error) { - nofp = NULL; - goto out; + if (!nlop && newnlop) { + newnlop->nlo_flags |= NFS_LOCK_OWNER_LINK; + TAILQ_INSERT_HEAD(&np->n_lock_owners, newnlop, nlo_link); + nlop = newnlop; } + lck_mtx_unlock(&np->n_openlock); - /* fflag contains some combination of: FREAD, FWRITE, FHASLOCK */ - accessMode = 0; - if (fflag & FREAD) - accessMode |= NFS_OPEN_SHARE_ACCESS_READ; - if (fflag & FWRITE) - accessMode |= NFS_OPEN_SHARE_ACCESS_WRITE; -// XXX It would be nice if we still had the O_EXLOCK/O_SHLOCK flags that were on the open -// if (fflag & O_EXLOCK) -// denyMode = NFS_OPEN_SHARE_DENY_BOTH; -// else if (fflag & O_SHLOCK) -// denyMode = NFS_OPEN_SHARE_DENY_WRITE; -// else -// denyMode = NFS_OPEN_SHARE_DENY_NONE; - if (fflag & FHASLOCK) { - /* XXX assume FHASLOCK is for the deny mode and not flock */ - /* FHASLOCK flock will be unlocked in the close path, but the flag is not cleared. 
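nfs_lock_owner_find() above keys a lock owner on the pid plus the process start time, so a recycled pid cannot inherit a dead process's lock state; a stale entry with no references is reset and reused in place. The identity test, in miniature; the types are stand-ins for proc_t and struct nfs_lock_owner:

#include <stdint.h>

struct tv_model { int64_t tv_sec; int32_t tv_usec; };
struct nlop_model { int32_t nlo_pid; struct tv_model nlo_pid_start; };

static int
same_lock_owner(const struct nlop_model *nlop, int32_t pid, const struct tv_model *start)
{
	return ((nlop->nlo_pid == pid) &&
	    (nlop->nlo_pid_start.tv_sec == start->tv_sec) &&
	    (nlop->nlo_pid_start.tv_usec == start->tv_usec));
}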
*/ - if (nofp->nof_deny & NFS_OPEN_SHARE_DENY_READ) - denyMode = NFS_OPEN_SHARE_DENY_BOTH; - else if (nofp->nof_deny & NFS_OPEN_SHARE_DENY_WRITE) - denyMode = NFS_OPEN_SHARE_DENY_WRITE; - else - denyMode = NFS_OPEN_SHARE_DENY_NONE; - } else { - denyMode = NFS_OPEN_SHARE_DENY_NONE; - } + if (newnlop && (nlop != newnlop)) + nfs_lock_owner_destroy(newnlop); - if (!accessMode) { - error = EINVAL; - goto out; - } + if (nlop) + nfs_lock_owner_ref(nlop); - error = nfs4_close(np, nofp, accessMode, denyMode, ctx); - if (error) { - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs_vnop_close: close error %d, %s\n", error, vname); - vnode_putname(vname); - } - -out: - if (nofp) - nfs_open_file_clear_busy(nofp); - if (nfs_mount_state_in_use_end(nmp, error)) { - nofp = NULL; - goto restart; - } - if (noop) - nfs_open_owner_rele(noop); - if (error) { - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs_vnop_close: error %d, %s\n", error, vname); - vnode_putname(vname); - } - if (!error) - error = common_error; - return (error); -} - -int -nfs4_vnop_mmap( - struct vnop_mmap_args /* { - struct vnodeop_desc *a_desc; - vnode_t a_vp; - int a_fflags; - vfs_context_t a_context; - } */ *ap) -{ - vfs_context_t ctx = ap->a_context; - vnode_t vp = ap->a_vp; - nfsnode_t np = VTONFS(vp); - int error = 0, accessMode, denyMode; - struct nfsmount *nmp; - struct nfs_open_owner *noop = NULL; - struct nfs_open_file *nofp = NULL; - - nmp = VTONMP(vp); - if (!nmp) - return (ENXIO); - - if (!vnode_isreg(vp) || !(ap->a_fflags & (PROT_READ|PROT_WRITE))) - return (EINVAL); - - /* - * fflags contains some combination of: PROT_READ, PROT_WRITE - * Since it's not possible to mmap() without having the file open for reading, - * read access is always there (regardless if PROT_READ is not set). - */ - accessMode = NFS_OPEN_SHARE_ACCESS_READ; - if (ap->a_fflags & PROT_WRITE) - accessMode |= NFS_OPEN_SHARE_ACCESS_WRITE; - denyMode = NFS_OPEN_SHARE_DENY_NONE; - - noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 0); - if (!noop) { - printf("nfs4_vnop_mmap: no open owner\n"); - return (EPERM); - } - -restart: - error = nfs_mount_state_in_use_start(nmp); - if (error) { - nfs_open_owner_rele(noop); - return (error); - } - - error = nfs_open_file_find(np, noop, &nofp, 0, 0, 1); - if (error || (!error && (nofp->nof_flags & NFS_OPEN_FILE_LOST))) { - printf("nfs4_vnop_mmap: no open file for owner %d\n", error); - error = EPERM; - } - if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { - nfs_mount_state_in_use_end(nmp, 0); - nfs4_reopen(nofp, vfs_context_thread(ctx)); - nofp = NULL; - goto restart; - } - if (!error) - error = nfs_open_file_set_busy(nofp, vfs_context_thread(ctx)); - if (error) { - nofp = NULL; - goto out; - } - - /* - * The open reference for mmap must mirror an existing open because - * we may need to reclaim it after the file is closed. - * So grab another open count matching the accessMode passed in. - * If we already had an mmap open, prefer read/write without deny mode. - * This means we may have to drop the current mmap open first. 
- */ - - /* determine deny mode for open */ - if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) { - if (nofp->nof_rw) - denyMode = NFS_OPEN_SHARE_DENY_NONE; - else if (nofp->nof_rw_dw) - denyMode = NFS_OPEN_SHARE_DENY_WRITE; - else if (nofp->nof_rw_drw) - denyMode = NFS_OPEN_SHARE_DENY_BOTH; - else - error = EPERM; - } else { /* NFS_OPEN_SHARE_ACCESS_READ */ - if (nofp->nof_r) - denyMode = NFS_OPEN_SHARE_DENY_NONE; - else if (nofp->nof_r_dw) - denyMode = NFS_OPEN_SHARE_DENY_WRITE; - else if (nofp->nof_r_drw) - denyMode = NFS_OPEN_SHARE_DENY_BOTH; - else - error = EPERM; - } - if (error) /* mmap mode without proper open mode */ - goto out; - - /* - * If the existing mmap access is more than the new access OR the - * existing access is the same and the existing deny mode is less, - * then we'll stick with the existing mmap open mode. - */ - if ((nofp->nof_mmap_access > accessMode) || - ((nofp->nof_mmap_access == accessMode) && (nofp->nof_mmap_deny <= denyMode))) - goto out; - - /* update mmap open mode */ - if (nofp->nof_mmap_access) { - error = nfs4_close(np, nofp, nofp->nof_mmap_access, nofp->nof_mmap_deny, ctx); - if (error) { - if (!nfs_mount_state_error_should_restart(error)) - printf("nfs_vnop_mmap: close of previous mmap mode failed: %d\n", error); - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs_vnop_mmap: update, close error %d, %s\n", error, vname); - vnode_putname(vname); - goto out; - } - nofp->nof_mmap_access = nofp->nof_mmap_deny = 0; - } - - if (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) { - if (denyMode == NFS_OPEN_SHARE_DENY_NONE) - nofp->nof_rw++; - else if (denyMode == NFS_OPEN_SHARE_DENY_WRITE) - nofp->nof_rw_dw++; - else /* NFS_OPEN_SHARE_DENY_BOTH */ - nofp->nof_rw_drw++; - } else if (accessMode == NFS_OPEN_SHARE_ACCESS_READ) { - if (denyMode == NFS_OPEN_SHARE_DENY_NONE) - nofp->nof_r++; - else if (denyMode == NFS_OPEN_SHARE_DENY_WRITE) - nofp->nof_r_dw++; - else /* NFS_OPEN_SHARE_DENY_BOTH */ - nofp->nof_r_drw++; - } - nofp->nof_mmap_access = accessMode; - nofp->nof_mmap_deny = denyMode; - nofp->nof_opencnt++; - -out: - if (nofp) - nfs_open_file_clear_busy(nofp); - if (nfs_mount_state_in_use_end(nmp, error)) { - nofp = NULL; - goto restart; - } - if (noop) - nfs_open_owner_rele(noop); - return (error); -} - - -int -nfs4_vnop_mnomap( - struct vnop_mnomap_args /* { - struct vnodeop_desc *a_desc; - vnode_t a_vp; - vfs_context_t a_context; - } */ *ap) -{ - vfs_context_t ctx = ap->a_context; - vnode_t vp = ap->a_vp; - nfsnode_t np = VTONFS(vp); - struct nfsmount *nmp; - struct nfs_open_file *nofp = NULL; - int error; - - nmp = VTONMP(vp); - if (!nmp) - return (ENXIO); - - /* walk all open files and close all mmap opens */ -loop: - error = nfs_mount_state_in_use_start(nmp); - if (error) - return (error); - lck_mtx_lock(&np->n_openlock); - TAILQ_FOREACH(nofp, &np->n_opens, nof_link) { - if (!nofp->nof_mmap_access) - continue; - lck_mtx_unlock(&np->n_openlock); - if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) { - nfs_mount_state_in_use_end(nmp, 0); - nfs4_reopen(nofp, vfs_context_thread(ctx)); - goto loop; - } - error = nfs_open_file_set_busy(nofp, vfs_context_thread(ctx)); - if (error) { - lck_mtx_lock(&np->n_openlock); - break; - } - if (nofp->nof_mmap_access) { - error = nfs4_close(np, nofp, nofp->nof_mmap_access, nofp->nof_mmap_deny, ctx); - if (!nfs_mount_state_error_should_restart(error)) { - if (error) /* not a state-operation-restarting error, so just clear the access */ - printf("nfs_vnop_mnomap: close of mmap mode failed: %d\n", error); - nofp->nof_mmap_access 
= nofp->nof_mmap_deny = 0; - } - if (error) { - const char *vname = vnode_getname(NFSTOV(np)); - printf("nfs_vnop_mnomap: error %d, %s\n", error, vname); - vnode_putname(vname); - } - } - nfs_open_file_clear_busy(nofp); - nfs_mount_state_in_use_end(nmp, error); - goto loop; - } - lck_mtx_unlock(&np->n_openlock); - nfs_mount_state_in_use_end(nmp, error); - return (error); -} - -/* - * Search a node's lock owner list for the owner for this process. - * If not found and "alloc" is set, then allocate a new one. - */ -struct nfs_lock_owner * -nfs_lock_owner_find(nfsnode_t np, proc_t p, int alloc) -{ - pid_t pid = proc_pid(p); - struct nfs_lock_owner *nlop, *newnlop = NULL; - -tryagain: - lck_mtx_lock(&np->n_openlock); - TAILQ_FOREACH(nlop, &np->n_lock_owners, nlo_link) { - if (nlop->nlo_pid != pid) - continue; - if (timevalcmp(&nlop->nlo_pid_start, &p->p_start, ==)) - break; - /* stale lock owner... reuse it if we can */ - if (nlop->nlo_refcnt) { - TAILQ_REMOVE(&np->n_lock_owners, nlop, nlo_link); - nlop->nlo_flags &= ~NFS_LOCK_OWNER_LINK; - lck_mtx_unlock(&np->n_openlock); - goto tryagain; - } - nlop->nlo_pid_start = p->p_start; - nlop->nlo_seqid = 0; - nlop->nlo_stategenid = 0; - break; - } - - if (!nlop && !newnlop && alloc) { - lck_mtx_unlock(&np->n_openlock); - MALLOC(newnlop, struct nfs_lock_owner *, sizeof(struct nfs_lock_owner), M_TEMP, M_WAITOK); - if (!newnlop) - return (NULL); - bzero(newnlop, sizeof(*newnlop)); - lck_mtx_init(&newnlop->nlo_lock, nfs_open_grp, LCK_ATTR_NULL); - newnlop->nlo_pid = pid; - newnlop->nlo_pid_start = p->p_start; - newnlop->nlo_name = OSAddAtomic(1, &nfs_lock_owner_seqnum); - TAILQ_INIT(&newnlop->nlo_locks); - goto tryagain; - } - if (!nlop && newnlop) { - newnlop->nlo_flags |= NFS_LOCK_OWNER_LINK; - TAILQ_INSERT_HEAD(&np->n_lock_owners, newnlop, nlo_link); - nlop = newnlop; - } - lck_mtx_unlock(&np->n_openlock); - - if (newnlop && (nlop != newnlop)) - nfs_lock_owner_destroy(newnlop); - - if (nlop) - nfs_lock_owner_ref(nlop); - - return (nlop); -} + return (nlop); +} /* * destroy a lock owner that's no longer needed @@ -2838,7 +3021,7 @@ nfs_lock_owner_set_busy(struct nfs_lock_owner *nlop, thread_t thd) nmp = nlop->nlo_open_owner->noo_mount; if (!nmp) return (ENXIO); - slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0; + slpflag = (NMFLAG(nmp, INTR) && thd) ? PCATCH : 0; lck_mtx_lock(&nlop->nlo_lock); while (nlop->nlo_flags & NFS_LOCK_OWNER_BUSY) { @@ -2846,6 +3029,7 @@ nfs_lock_owner_set_busy(struct nfs_lock_owner *nlop, thread_t thd) break; nlop->nlo_flags |= NFS_LOCK_OWNER_WANT; msleep(nlop, &nlop->nlo_lock, slpflag, "nfs_lock_owner_set_busy", &ts); + slpflag = 0; } if (!error) nlop->nlo_flags |= NFS_LOCK_OWNER_BUSY; @@ -2977,11 +3161,12 @@ nfs_file_lock_conflict(struct nfs_file_lock *nflp1, struct nfs_file_lock *nflp2, * Send an NFSv4 LOCK RPC to the server. */ int -nfs4_lock_rpc( +nfs4_setlock_rpc( nfsnode_t np, struct nfs_open_file *nofp, struct nfs_file_lock *nflp, int reclaim, + int flags, thread_t thd, kauth_cred_t cred) { @@ -2991,10 +3176,13 @@ nfs4_lock_rpc( uint64_t xid; uint32_t locktype; int error = 0, lockerror = ENOENT, newlocker, numops, status; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(np); if (!nmp) return (ENXIO); + if (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); newlocker = (nlop->nlo_stategenid != nmp->nm_stategenid); locktype = (nflp->nfl_flags & NFS_FILE_LOCK_WAIT) ? 
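
The nfs_lock_owner_find() rework above is a drop-lock/allocate/rescan pattern: n_openlock cannot be held across a blocking MALLOC, so the allocation happens unlocked, the list is rescanned, and the new entry is discarded if another thread linked one first. A compilable user-space sketch of the same pattern, with pthreads standing in for the mutex and hypothetical types:

    #include <pthread.h>
    #include <stdlib.h>

    struct owner { int pid; struct owner *next; };

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct owner *owners;

    static struct owner *owner_find(int pid, int alloc)
    {
        struct owner *o, *newo = NULL;
    tryagain:
        pthread_mutex_lock(&list_lock);
        for (o = owners; o; o = o->next)
            if (o->pid == pid)
                break;
        if (!o && !newo && alloc) {
            pthread_mutex_unlock(&list_lock); /* never allocate under the lock */
            if (!(newo = calloc(1, sizeof(*newo))))
                return NULL;
            newo->pid = pid;
            goto tryagain;                    /* rescan: we may have raced */
        }
        if (!o && newo) {                     /* we won the race: link ours */
            newo->next = owners;
            owners = newo;
            o = newo;
        }
        pthread_mutex_unlock(&list_lock);
        if (newo && o != newo)
            free(newo);                       /* we lost: discard our copy */
        return o;
    }
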
@@ -3027,6 +3215,7 @@ nfs4_lock_rpc( return (error); } + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -3039,8 +3228,7 @@ nfs4_lock_rpc( nfsm_chain_add_fh(error, &nmreq, NFS_VER4, np->n_fhp, np->n_fhsize); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, np); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_LOCK); nfsm_chain_add_32(error, &nmreq, locktype); @@ -3061,7 +3249,7 @@ nfs4_lock_rpc( nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request2(np, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, (reclaim ? R_RECOVER : 0), &nmrep, &xid, &status); + error = nfs_request2(np, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, &si, flags|R_NOINTR, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; @@ -3070,7 +3258,7 @@ nfs4_lock_rpc( nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); nfsmout_if(error); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, np, NFS_VER4, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, NFS_VER4, &xid); nfsmout_if(error); nfsm_chain_op_check(error, &nmrep, NFS_OP_LOCK); nfs_owner_seqid_increment(newlocker ? nofp->nof_owner : NULL, nlop, error); @@ -3103,21 +3291,27 @@ nfs4_unlock_rpc( int type, uint64_t start, uint64_t end, - vfs_context_t ctx) + int flags, + thread_t thd, + kauth_cred_t cred) { struct nfsmount *nmp; struct nfsm_chain nmreq, nmrep; uint64_t xid; int error = 0, lockerror = ENOENT, numops, status; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(np); if (!nmp) return (ENXIO); + if (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); - error = nfs_lock_owner_set_busy(nlop, vfs_context_thread(ctx)); + error = nfs_lock_owner_set_busy(nlop, NULL); if (error) return (error); + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -3130,8 +3324,7 @@ nfs4_unlock_rpc( nfsm_chain_add_fh(error, &nmreq, NFS_VER4, np->n_fhp, np->n_fhsize); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, np); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_LOCKU); nfsm_chain_add_32(error, &nmreq, (type == F_WRLCK) ? NFS_LOCK_TYPE_WRITE : NFS_LOCK_TYPE_READ); @@ -3143,7 +3336,7 @@ nfs4_unlock_rpc( nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, &xid, &status); + error = nfs_request2(np, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, &si, flags|R_NOINTR, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; @@ -3152,7 +3345,7 @@ nfs4_unlock_rpc( nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); nfsmout_if(error); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, np, NFS_VER4, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, NFS_VER4, &xid); nfsmout_if(error); nfsm_chain_op_check(error, &nmrep, NFS_OP_LOCKU); nfs_owner_seqid_increment(NULL, nlop, error); @@ -3167,14 +3360,10 @@ nfsmout: } /* - * Check for any conflicts with the given lock. - * - * Checking for a lock doesn't require the file to be opened. 
- * So we skip all the open owner, open file, lock owner work - * and just check for a conflicting lock. + * Send an NFSv4 LOCKT RPC to the server. */ int -nfs4_getlock( +nfs4_getlock_rpc( nfsnode_t np, struct nfs_lock_owner *nlop, struct flock *fl, @@ -3183,39 +3372,20 @@ nfs4_getlock( vfs_context_t ctx) { struct nfsmount *nmp; - struct nfs_file_lock *nflp; struct nfsm_chain nmreq, nmrep; uint64_t xid, val64 = 0; uint32_t val = 0; - int error = 0, lockerror = ENOENT, numops, status; + int error = 0, lockerror, numops, status; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(np); if (!nmp) return (ENXIO); + if (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); - lck_mtx_lock(&np->n_openlock); - /* scan currently held locks for conflict */ - TAILQ_FOREACH(nflp, &np->n_locks, nfl_link) { - if (nflp->nfl_flags & NFS_FILE_LOCK_BLOCKED) - continue; - if ((start <= nflp->nfl_end) && (end >= nflp->nfl_start) && - ((fl->l_type == F_WRLCK) || (nflp->nfl_type == F_WRLCK))) - break; - } - if (nflp) { - /* found a conflicting lock */ - fl->l_type = nflp->nfl_type; - fl->l_pid = (nflp->nfl_flags & NFS_FILE_LOCK_STYLE_FLOCK) ? -1 : nflp->nfl_owner->nlo_pid; - fl->l_start = nflp->nfl_start; - fl->l_len = NFS_FLOCK_LENGTH(nflp->nfl_start, nflp->nfl_end); - fl->l_whence = SEEK_SET; - } - lck_mtx_unlock(&np->n_openlock); - if (nflp) - return (0); - - /* no conflict found locally, so ask the server */ - + lockerror = ENOENT; + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -3228,8 +3398,7 @@ nfs4_getlock( nfsm_chain_add_fh(error, &nmreq, NFS_VER4, np->n_fhp, np->n_fhsize); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, np); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_LOCKT); nfsm_chain_add_32(error, &nmreq, (fl->l_type == F_WRLCK) ? NFS_LOCK_TYPE_WRITE : NFS_LOCK_TYPE_READ); @@ -3240,7 +3409,7 @@ nfs4_getlock( nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &si, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; @@ -3249,7 +3418,7 @@ nfs4_getlock( nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); nfsmout_if(error); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, np, NFS_VER4, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, NFS_VER4, &xid); nfsmout_if(error); nfsm_chain_op_check(error, &nmrep, NFS_OP_LOCKT); if (error == NFSERR_DENIED) { @@ -3272,6 +3441,74 @@ nfsmout: return (error); } + +/* + * Check for any conflicts with the given lock. + * + * Checking for a lock doesn't require the file to be opened. + * So we skip all the open owner, open file, lock owner work + * and just check for a conflicting lock. 
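
The conflict test nfs_advlock_getlock applies locally, before falling back to a LOCKT on the wire, is the standard byte-range rule: two locks conflict only if their inclusive [start, end] ranges overlap and at least one of them is a write lock. A self-contained sketch of that predicate:

    #include <stdint.h>
    #include <stdbool.h>
    #include <fcntl.h>   /* F_WRLCK */

    static bool locks_conflict(uint64_t s1, uint64_t e1, int type1,
                               uint64_t s2, uint64_t e2, int type2)
    {
        return (s1 <= e2) && (s2 <= e1) &&                 /* ranges overlap */
               ((type1 == F_WRLCK) || (type2 == F_WRLCK)); /* not both readers */
    }
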
+ */ +int +nfs_advlock_getlock( + nfsnode_t np, + struct nfs_lock_owner *nlop, + struct flock *fl, + uint64_t start, + uint64_t end, + vfs_context_t ctx) +{ + struct nfsmount *nmp; + struct nfs_file_lock *nflp; + int error = 0, answered = 0; + + nmp = NFSTONMP(np); + if (!nmp) + return (ENXIO); + +restart: + if ((error = nfs_mount_state_in_use_start(nmp, vfs_context_thread(ctx)))) + return (error); + + lck_mtx_lock(&np->n_openlock); + /* scan currently held locks for conflict */ + TAILQ_FOREACH(nflp, &np->n_locks, nfl_link) { + if (nflp->nfl_flags & (NFS_FILE_LOCK_BLOCKED|NFS_FILE_LOCK_DEAD)) + continue; + if ((start <= nflp->nfl_end) && (end >= nflp->nfl_start) && + ((fl->l_type == F_WRLCK) || (nflp->nfl_type == F_WRLCK))) + break; + } + if (nflp) { + /* found a conflicting lock */ + fl->l_type = nflp->nfl_type; + fl->l_pid = (nflp->nfl_flags & NFS_FILE_LOCK_STYLE_FLOCK) ? -1 : nflp->nfl_owner->nlo_pid; + fl->l_start = nflp->nfl_start; + fl->l_len = NFS_FLOCK_LENGTH(nflp->nfl_start, nflp->nfl_end); + fl->l_whence = SEEK_SET; + answered = 1; + } else if ((np->n_openflags & N_DELEG_WRITE) && !(np->n_openflags & N_DELEG_RETURN)) { + /* + * If we have a write delegation, we know there can't be other + * locks on the server. So the answer is no conflicting lock found. + */ + fl->l_type = F_UNLCK; + answered = 1; + } + lck_mtx_unlock(&np->n_openlock); + if (answered) { + nfs_mount_state_in_use_end(nmp, 0); + return (0); + } + + /* no conflict found locally, so ask the server */ + error = nmp->nm_funcs->nf_getlock_rpc(np, nlop, fl, start, end, ctx); + + if (nfs_mount_state_in_use_end(nmp, error)) + goto restart; + return (error); +} + /* * Acquire a file lock for the given range. * @@ -3284,7 +3521,7 @@ nfsmout: * queue again to coalesce any locks adjacent to the new one. */ int -nfs4_setlock( +nfs_advlock_setlock( nfsnode_t np, struct nfs_open_file *nofp, struct nfs_lock_owner *nlop, @@ -3304,7 +3541,10 @@ nfs4_setlock( nmp = NFSTONMP(np); if (!nmp) return (ENXIO); - slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0; + slpflag = NMFLAG(nmp, INTR) ? 
PCATCH : 0; + + if ((type != F_RDLCK) && (type != F_WRLCK)) + return (EINVAL); /* allocate a new lock */ newnflp = nfs_file_lock_alloc(nlop); @@ -3335,14 +3575,22 @@ nfs4_setlock( restart: restart = 0; - error = nfs_mount_state_in_use_start(nmp); + error = nfs_mount_state_in_use_start(nmp, vfs_context_thread(ctx)); if (error) goto error_out; inuse = 1; + if (np->n_flag & NREVOKE) { + error = EIO; + nfs_mount_state_in_use_end(nmp, 0); + inuse = 0; + goto error_out; + } if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) { nfs_mount_state_in_use_end(nmp, 0); inuse = 0; - nfs4_reopen(nofp, vfs_context_thread(ctx)); + error = nfs4_reopen(nofp, vfs_context_thread(ctx)); + if (error) + goto error_out; goto restart; } @@ -3354,7 +3602,8 @@ restart: } /* scan current list of locks (held and pending) for conflicts */ - for (nflp = TAILQ_NEXT(newnflp, nfl_link); nflp; nflp = TAILQ_NEXT(nflp, nfl_link)) { + for (nflp = TAILQ_NEXT(newnflp, nfl_link); nflp; nflp = nextnflp) { + nextnflp = TAILQ_NEXT(nflp, nfl_link); if (!nfs_file_lock_conflict(newnflp, nflp, &willsplit)) continue; /* Conflict */ @@ -3374,10 +3623,10 @@ restart: lck_mtx_unlock(&np->n_openlock); nfs_mount_state_in_use_end(nmp, 0); inuse = 0; - error = nfs4_unlock(np, nofp, nlop, 0, UINT64_MAX, NFS_FILE_LOCK_STYLE_FLOCK, ctx); + error = nfs_advlock_unlock(np, nofp, nlop, 0, UINT64_MAX, NFS_FILE_LOCK_STYLE_FLOCK, ctx); flocknflp = NULL; if (!error) - error = nfs_mount_state_in_use_start(nmp); + error = nfs_mount_state_in_use_start(nmp, vfs_context_thread(ctx)); if (error) { lck_mtx_lock(&np->n_openlock); break; @@ -3388,7 +3637,8 @@ restart: if (!nfs_file_lock_conflict(newnflp, nflp, NULL)) break; } - msleep(nflp, &np->n_openlock, slpflag, "nfs4_setlock_blocked", &ts); + msleep(nflp, &np->n_openlock, slpflag, "nfs_advlock_setlock_blocked", &ts); + slpflag = 0; error = nfs_sigintr(NFSTONMP(np), NULL, vfs_context_thread(ctx), 0); if (!error && (nmp->nm_state & NFSSTA_RECOVER)) { /* looks like we have a recover pending... restart */ @@ -3399,6 +3649,8 @@ restart: lck_mtx_lock(&np->n_openlock); break; } + if (!error && (np->n_flag & NREVOKE)) + error = EIO; } while (!error && nfs_file_lock_conflict(newnflp, nflp, NULL)); nflp->nfl_blockcnt--; if ((nflp->nfl_flags & NFS_FILE_LOCK_DEAD) && !nflp->nfl_blockcnt) { @@ -3407,6 +3659,9 @@ restart: } if (error || restart) break; + /* We have released n_openlock and we can't trust that nextnflp is still valid. */ + /* So, start this lock-scanning loop over from where it started. */ + nextnflp = TAILQ_NEXT(newnflp, nfl_link); } lck_mtx_unlock(&np->n_openlock); if (restart) @@ -3428,16 +3683,50 @@ restart: } /* once scan for local conflicts is clear, send request to server */ - if ((error = nfs_open_state_set_busy(np, ctx))) + if ((error = nfs_open_state_set_busy(np, vfs_context_thread(ctx)))) goto error_out; busy = 1; delay = 0; do { - error = nfs4_lock_rpc(np, nofp, newnflp, 0, vfs_context_thread(ctx), vfs_context_ucred(ctx)); + /* do we have a delegation? (that we're not returning?) 
*/ + if ((np->n_openflags & N_DELEG_MASK) && !(np->n_openflags & N_DELEG_RETURN)) { + if (np->n_openflags & N_DELEG_WRITE) { + /* with a write delegation, just take the lock delegated */ + newnflp->nfl_flags |= NFS_FILE_LOCK_DELEGATED; + error = 0; + /* make sure the lock owner knows its open owner */ + if (!nlop->nlo_open_owner) { + nfs_open_owner_ref(nofp->nof_owner); + nlop->nlo_open_owner = nofp->nof_owner; + } + break; + } else { + /* + * If we don't have any non-delegated opens but we do have + * delegated opens, then we need to first claim the delegated + * opens so that the lock request on the server can be associated + * with an open it knows about. + */ + if ((!nofp->nof_rw_drw && !nofp->nof_w_drw && !nofp->nof_r_drw && + !nofp->nof_rw_dw && !nofp->nof_w_dw && !nofp->nof_r_dw && + !nofp->nof_rw && !nofp->nof_w && !nofp->nof_r) && + (nofp->nof_d_rw_drw || nofp->nof_d_w_drw || nofp->nof_d_r_drw || + nofp->nof_d_rw_dw || nofp->nof_d_w_dw || nofp->nof_d_r_dw || + nofp->nof_d_rw || nofp->nof_d_w || nofp->nof_d_r)) { + error = nfs4_claim_delegated_state_for_open_file(nofp, 0); + if (error) + break; + } + } + } + if (np->n_flag & NREVOKE) + error = EIO; + if (!error) + error = nmp->nm_funcs->nf_setlock_rpc(np, nofp, newnflp, 0, 0, vfs_context_thread(ctx), vfs_context_ucred(ctx)); if (!error || ((error != NFSERR_DENIED) && (error != NFSERR_GRACE))) break; /* request was denied due to either conflict or grace period */ - if ((error != NFSERR_GRACE) && !(newnflp->nfl_flags & NFS_FILE_LOCK_WAIT)) { + if ((error == NFSERR_DENIED) && !(newnflp->nfl_flags & NFS_FILE_LOCK_WAIT)) { error = EAGAIN; break; } @@ -3447,13 +3736,13 @@ restart: busy = 0; nfs_mount_state_in_use_end(nmp, 0); inuse = 0; - error2 = nfs4_unlock(np, nofp, nlop, 0, UINT64_MAX, NFS_FILE_LOCK_STYLE_FLOCK, ctx); + error2 = nfs_advlock_unlock(np, nofp, nlop, 0, UINT64_MAX, NFS_FILE_LOCK_STYLE_FLOCK, ctx); flocknflp = NULL; if (!error2) - error2 = nfs_mount_state_in_use_start(nmp); + error2 = nfs_mount_state_in_use_start(nmp, vfs_context_thread(ctx)); if (!error2) { inuse = 1; - error2 = nfs_open_state_set_busy(np, ctx); + error2 = nfs_open_state_set_busy(np, vfs_context_thread(ctx)); } if (error2) { error = error2; @@ -3461,12 +3750,18 @@ restart: } busy = 1; } - /* wait a little bit and send the request again */ - if (error == NFSERR_GRACE) - delay = 4; - if (delay < 4) - delay++; - tsleep(newnflp, slpflag, "nfs4_setlock_delay", delay * (hz/2)); + /* + * Wait a little bit and send the request again. + * Except for retries of blocked v2/v3 request where we've already waited a bit. + */ + if ((nmp->nm_vers >= NFS_VER4) || (error == NFSERR_GRACE)) { + if (error == NFSERR_GRACE) + delay = 4; + if (delay < 4) + delay++; + tsleep(newnflp, slpflag, "nfs_advlock_setlock_delay", delay * (hz/2)); + slpflag = 0; + } error = nfs_sigintr(NFSTONMP(np), NULL, vfs_context_thread(ctx), 0); if (!error && (nmp->nm_state & NFSSTA_RECOVER)) { /* looks like we have a recover pending... restart */ @@ -3476,6 +3771,8 @@ restart: inuse = 0; goto restart; } + if (!error && (np->n_flag & NREVOKE)) + error = EIO; } while (!error); error_out: @@ -3545,7 +3842,7 @@ error_out: /* We're replacing a range in the middle of a lock. */ /* The current lock will be split into two locks. */ /* Update locks and insert new lock after current lock. 
*/ - nflp2->nfl_flags |= (nflp->nfl_flags & NFS_FILE_LOCK_STYLE_MASK); + nflp2->nfl_flags |= (nflp->nfl_flags & (NFS_FILE_LOCK_STYLE_MASK|NFS_FILE_LOCK_DELEGATED)); nflp2->nfl_type = nflp->nfl_type; nflp2->nfl_start = newnflp->nfl_end + 1; nflp2->nfl_end = nflp->nfl_end; @@ -3635,8 +3932,11 @@ error_out: return (error); } +/* + * Release all (same style) locks within the given range. + */ int -nfs4_unlock( +nfs_advlock_unlock( nfsnode_t np, struct nfs_open_file *nofp, struct nfs_lock_owner *nlop, @@ -3654,14 +3954,16 @@ nfs4_unlock( return (ENXIO); restart: - if ((error = nfs_mount_state_in_use_start(nmp))) + if ((error = nfs_mount_state_in_use_start(nmp, NULL))) return (error); if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) { nfs_mount_state_in_use_end(nmp, 0); - nfs4_reopen(nofp, vfs_context_thread(ctx)); + error = nfs4_reopen(nofp, NULL); + if (error) + return (error); goto restart; } - if ((error = nfs_open_state_set_busy(np, ctx))) { + if ((error = nfs_open_state_set_busy(np, NULL))) { nfs_mount_state_in_use_end(nmp, error); return (error); } @@ -3725,11 +4027,13 @@ restart: ((nflp->nfl_flags & NFS_FILE_LOCK_STYLE_MASK) == NFS_FILE_LOCK_STYLE_POSIX)) { uint64_t s = 0; int type = TAILQ_FIRST(&nlop->nlo_locks)->nfl_type; - while (nflp) { + int delegated = (TAILQ_FIRST(&nlop->nlo_locks)->nfl_flags & NFS_FILE_LOCK_DELEGATED); + while (!delegated && nflp) { if ((nflp->nfl_flags & NFS_FILE_LOCK_STYLE_MASK) == NFS_FILE_LOCK_STYLE_POSIX) { /* unlock the range preceding this lock */ lck_mtx_unlock(&np->n_openlock); - error = nfs4_unlock_rpc(np, nlop, type, s, nflp->nfl_start-1, ctx); + error = nmp->nm_funcs->nf_unlock_rpc(np, nlop, type, s, nflp->nfl_start-1, 0, + vfs_context_thread(ctx), vfs_context_ucred(ctx)); if (nfs_mount_state_error_should_restart(error)) { nfs_open_state_clear_busy(np); nfs_mount_state_in_use_end(nmp, error); @@ -3742,16 +4046,19 @@ restart: } nflp = TAILQ_NEXT(nflp, nfl_lolink); } - lck_mtx_unlock(&np->n_openlock); - error = nfs4_unlock_rpc(np, nlop, type, s, end, ctx); - if (nfs_mount_state_error_should_restart(error)) { - nfs_open_state_clear_busy(np); - nfs_mount_state_in_use_end(nmp, error); - goto restart; + if (!delegated) { + lck_mtx_unlock(&np->n_openlock); + error = nmp->nm_funcs->nf_unlock_rpc(np, nlop, type, s, end, 0, + vfs_context_thread(ctx), vfs_context_ucred(ctx)); + if (nfs_mount_state_error_should_restart(error)) { + nfs_open_state_clear_busy(np); + nfs_mount_state_in_use_end(nmp, error); + goto restart; + } + lck_mtx_lock(&np->n_openlock); + if (error) + goto out; } - lck_mtx_lock(&np->n_openlock); - if (error) - goto out; send_unlock_rpcs = 0; } @@ -3767,9 +4074,10 @@ restart: /* here's one to unlock */ if ((start <= nflp->nfl_start) && (end >= nflp->nfl_end)) { /* The entire lock is being unlocked. */ - if (send_unlock_rpcs) { + if (send_unlock_rpcs && !(nflp->nfl_flags & NFS_FILE_LOCK_DELEGATED)) { lck_mtx_unlock(&np->n_openlock); - error = nfs4_unlock_rpc(np, nlop, nflp->nfl_type, nflp->nfl_start, nflp->nfl_end, ctx); + error = nmp->nm_funcs->nf_unlock_rpc(np, nlop, nflp->nfl_type, nflp->nfl_start, nflp->nfl_end, 0, + vfs_context_thread(ctx), vfs_context_ucred(ctx)); if (nfs_mount_state_error_should_restart(error)) { nfs_open_state_clear_busy(np); nfs_mount_state_in_use_end(nmp, error); @@ -3788,9 +4096,10 @@ restart: } else if ((start > nflp->nfl_start) && (end < nflp->nfl_end)) { /* We're unlocking a range in the middle of a lock. */ /* The current lock will be split into two locks. 
*/ - if (send_unlock_rpcs) { + if (send_unlock_rpcs && !(nflp->nfl_flags & NFS_FILE_LOCK_DELEGATED)) { lck_mtx_unlock(&np->n_openlock); - error = nfs4_unlock_rpc(np, nlop, nflp->nfl_type, start, end, ctx); + error = nmp->nm_funcs->nf_unlock_rpc(np, nlop, nflp->nfl_type, start, end, 0, + vfs_context_thread(ctx), vfs_context_ucred(ctx)); if (nfs_mount_state_error_should_restart(error)) { nfs_open_state_clear_busy(np); nfs_mount_state_in_use_end(nmp, error); @@ -3801,7 +4110,7 @@ restart: if (error) break; /* update locks and insert new lock after current lock */ - newnflp->nfl_flags |= (nflp->nfl_flags & NFS_FILE_LOCK_STYLE_MASK); + newnflp->nfl_flags |= (nflp->nfl_flags & (NFS_FILE_LOCK_STYLE_MASK|NFS_FILE_LOCK_DELEGATED)); newnflp->nfl_type = nflp->nfl_type; newnflp->nfl_start = end + 1; newnflp->nfl_end = nflp->nfl_end; @@ -3812,9 +4121,10 @@ restart: newnflp = NULL; } else if (start > nflp->nfl_start) { /* We're unlocking the end of a lock. */ - if (send_unlock_rpcs) { + if (send_unlock_rpcs && !(nflp->nfl_flags & NFS_FILE_LOCK_DELEGATED)) { lck_mtx_unlock(&np->n_openlock); - error = nfs4_unlock_rpc(np, nlop, nflp->nfl_type, start, nflp->nfl_end, ctx); + error = nmp->nm_funcs->nf_unlock_rpc(np, nlop, nflp->nfl_type, start, nflp->nfl_end, 0, + vfs_context_thread(ctx), vfs_context_ucred(ctx)); if (nfs_mount_state_error_should_restart(error)) { nfs_open_state_clear_busy(np); nfs_mount_state_in_use_end(nmp, error); @@ -3828,9 +4138,10 @@ restart: nflp->nfl_end = start - 1; } else if (end < nflp->nfl_end) { /* We're unlocking the start of a lock. */ - if (send_unlock_rpcs) { + if (send_unlock_rpcs && !(nflp->nfl_flags & NFS_FILE_LOCK_DELEGATED)) { lck_mtx_unlock(&np->n_openlock); - error = nfs4_unlock_rpc(np, nlop, nflp->nfl_type, nflp->nfl_start, end, ctx); + error = nmp->nm_funcs->nf_unlock_rpc(np, nlop, nflp->nfl_type, nflp->nfl_start, end, 0, + vfs_context_thread(ctx), vfs_context_ucred(ctx)); if (nfs_mount_state_error_should_restart(error)) { nfs_open_state_clear_busy(np); nfs_mount_state_in_use_end(nmp, error); @@ -3866,7 +4177,7 @@ out: * NFSv4 advisory file locking */ int -nfs4_vnop_advlock( +nfs_vnop_advlock( struct vnop_advlock_args /* { struct vnodeop_desc *a_desc; vnode_t a_vp; @@ -3884,19 +4195,34 @@ nfs4_vnop_advlock( int flags = ap->a_flags; vfs_context_t ctx = ap->a_context; struct nfsmount *nmp; - struct nfs_vattr nvattr; struct nfs_open_owner *noop = NULL; struct nfs_open_file *nofp = NULL; struct nfs_lock_owner *nlop = NULL; off_t lstart; uint64_t start, end; int error = 0, modified, style; + enum vtype vtype; #define OFF_MAX QUAD_MAX nmp = VTONMP(ap->a_vp); if (!nmp) return (ENXIO); + lck_mtx_lock(&nmp->nm_lock); + if ((nmp->nm_vers <= NFS_VER3) && (nmp->nm_lockmode == NFS_LOCK_MODE_DISABLED)) { + lck_mtx_unlock(&nmp->nm_lock); + return (ENOTSUP); + } + lck_mtx_unlock(&nmp->nm_lock); + if (np->n_flag & NREVOKE) + return (EIO); + vtype = vnode_vtype(ap->a_vp); + if (vtype == VDIR) /* ignore lock requests on directories */ + return (0); + if (vtype != VREG) /* anything other than regular files is invalid */ + return (EINVAL); + + /* Convert the flock structure into a start and end. 
*/ switch (fl->l_whence) { case SEEK_SET: case SEEK_CUR: @@ -3915,7 +4241,7 @@ nfs4_vnop_advlock( nfs_node_unlock(np); if (modified && ((error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1)))) return (error); - if ((error = nfs_getattr(np, &nvattr, ctx, NGA_UNCACHED))) + if ((error = nfs_getattr(np, NULL, ctx, NGA_UNCACHED))) return (error); nfs_data_lock(np, NFS_DATA_LOCK_SHARED); if ((np->n_size > OFF_MAX) || @@ -3944,8 +4270,8 @@ nfs4_vnop_advlock( end = start - 1; start += fl->l_len; } - if (error) - return (error); + if ((nmp->nm_vers == NFS_VER2) && ((start > INT32_MAX) || (fl->l_len && (end > INT32_MAX)))) + return (EINVAL); style = (flags & F_FLOCK) ? NFS_FILE_LOCK_STYLE_FLOCK : NFS_FILE_LOCK_STYLE_POSIX; if ((style == NFS_FILE_LOCK_STYLE_FLOCK) && ((start != 0) || (end != UINT64_MAX))) @@ -3956,17 +4282,17 @@ nfs4_vnop_advlock( if (!nlop) { error = (op == F_UNLCK) ? 0 : ENOMEM; if (error) - printf("nfs4_vnop_advlock: no lock owner %d\n", error); + NP(np, "nfs_vnop_advlock: no lock owner, error %d", error); goto out; } if (op == F_GETLK) { - error = nfs4_getlock(np, nlop, fl, start, end, ctx); + error = nfs_advlock_getlock(np, nlop, fl, start, end, ctx); } else { /* find the open owner */ noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 0); if (!noop) { - printf("nfs4_vnop_advlock: no open owner\n"); + NP(np, "nfs_vnop_advlock: no open owner %d", kauth_cred_getuid(vfs_context_ucred(ctx))); error = EPERM; goto out; } @@ -3976,24 +4302,25 @@ restart: if (error) error = EBADF; if (!error && (nofp->nof_flags & NFS_OPEN_FILE_LOST)) { - printf("nfs_vnop_advlock: LOST\n"); + NP(np, "nfs_vnop_advlock: LOST %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); error = EIO; } if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { - nfs4_reopen(nofp, vfs_context_thread(ctx)); + error = nfs4_reopen(nofp, ((op == F_UNLCK) ? NULL : vfs_context_thread(ctx))); nofp = NULL; - goto restart; + if (!error) + goto restart; } if (error) { - printf("nfs4_vnop_advlock: no open file %d\n", error); + NP(np, "nfs_vnop_advlock: no open file %d, %d", error, kauth_cred_getuid(noop->noo_cred)); goto out; } if (op == F_UNLCK) { - error = nfs4_unlock(np, nofp, nlop, start, end, style, ctx); + error = nfs_advlock_unlock(np, nofp, nlop, start, end, style, ctx); } else if ((op == F_SETLK) || (op == F_SETLKW)) { if ((op == F_SETLK) && (flags & F_WAIT)) op = F_SETLKW; - error = nfs4_setlock(np, nofp, nlop, op, start, end, style, fl->l_type, ctx); + error = nfs_advlock_setlock(np, nofp, nlop, op, start, end, style, fl->l_type, ctx); } else { /* not getlk, unlock or lock? */ error = EINVAL; @@ -4012,7 +4339,7 @@ out: * Check if an open owner holds any locks on a file. */ int -nfs4_check_for_locks(struct nfs_open_owner *noop, struct nfs_open_file *nofp) +nfs_check_for_locks(struct nfs_open_owner *noop, struct nfs_open_file *nofp) { struct nfs_lock_owner *nlop; @@ -4028,19 +4355,21 @@ nfs4_check_for_locks(struct nfs_open_owner *noop, struct nfs_open_file *nofp) /* * Reopen simple (no deny, no locks) open state that was lost. */ -void +int nfs4_reopen(struct nfs_open_file *nofp, thread_t thd) { struct nfs_open_owner *noop = nofp->nof_owner; struct nfsmount *nmp = NFSTONMP(nofp->nof_np); - vnode_t vp = NFSTOV(nofp->nof_np); + nfsnode_t np = nofp->nof_np; + vnode_t vp = NFSTOV(np); vnode_t dvp = NULL; struct componentname cn; const char *vname = NULL; + const char *name = NULL; size_t namelen; char smallname[128]; char *filename = NULL; - int error = 0, done = 0, slpflag = (nmp->nm_flag & NFSMNT_INT) ? 
PCATCH : 0; + int error = 0, done = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0; struct timespec ts = { 1, 0 }; lck_mtx_lock(&nofp->nof_lock); @@ -4048,38 +4377,67 @@ nfs4_reopen(struct nfs_open_file *nofp, thread_t thd) if ((error = nfs_sigintr(nmp, NULL, thd, 0))) break; msleep(&nofp->nof_flags, &nofp->nof_lock, slpflag|(PZERO-1), "nfsreopenwait", &ts); + slpflag = 0; } - if (!(nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { + if (error || !(nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { lck_mtx_unlock(&nofp->nof_lock); - return; + return (error); } nofp->nof_flags |= NFS_OPEN_FILE_REOPENING; lck_mtx_unlock(&nofp->nof_lock); - dvp = vnode_getparent(vp); - vname = vnode_getname(vp); - if (!dvp || !vname) { - error = EIO; - goto out; + nfs_node_lock_force(np); + if ((vnode_vtype(vp) != VDIR) && np->n_sillyrename) { + /* + * The node's been sillyrenamed, so we need to use + * the sillyrename directory/name to do the open. + */ + struct nfs_sillyrename *nsp = np->n_sillyrename; + dvp = NFSTOV(nsp->nsr_dnp); + if ((error = vnode_get(dvp))) { + nfs_node_unlock(np); + goto out; + } + name = nsp->nsr_name; + } else { + /* + * [sigh] We can't trust VFS to get the parent right for named + * attribute nodes. (It likes to reparent the nodes after we've + * created them.) Luckily we can probably get the right parent + * from the n_parent we have stashed away. + */ + if ((np->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR) && + (((dvp = np->n_parent)) && (error = vnode_get(dvp)))) + dvp = NULL; + if (!dvp) + dvp = vnode_getparent(vp); + vname = vnode_getname(vp); + if (!dvp || !vname) { + if (!error) + error = EIO; + nfs_node_unlock(np); + goto out; + } + name = vname; } filename = &smallname[0]; - namelen = snprintf(filename, sizeof(smallname), "%s", vname); + namelen = snprintf(filename, sizeof(smallname), "%s", name); if (namelen >= sizeof(smallname)) { - namelen++; /* snprintf result doesn't include '\0' */ - MALLOC(filename, char *, namelen, M_TEMP, M_WAITOK); + MALLOC(filename, char *, namelen+1, M_TEMP, M_WAITOK); if (!filename) { error = ENOMEM; goto out; } - snprintf(filename, namelen, "%s", vname); + snprintf(filename, namelen+1, "%s", name); } + nfs_node_unlock(np); bzero(&cn, sizeof(cn)); cn.cn_nameptr = filename; cn.cn_namelen = namelen; restart: done = 0; - if ((error = nfs_mount_state_in_use_start(nmp))) + if ((error = nfs_mount_state_in_use_start(nmp, thd))) goto out; if (nofp->nof_rw) @@ -4092,19 +4450,22 @@ restart: if (nfs_mount_state_in_use_end(nmp, error)) { if (error == NFSERR_GRACE) goto restart; + printf("nfs4_reopen: RPC failed, error %d, lost %d, %s\n", error, + (nofp->nof_flags & NFS_OPEN_FILE_LOST) ? 1 : 0, name ? name : "???"); error = 0; goto out; } done = 1; out: + if (error && (error != EINTR) && (error != ERESTART)) + nfs_revoke_open_state_for_node(np); lck_mtx_lock(&nofp->nof_lock); nofp->nof_flags &= ~NFS_OPEN_FILE_REOPENING; - if (error) - nofp->nof_flags |= NFS_OPEN_FILE_LOST; if (done) nofp->nof_flags &= ~NFS_OPEN_FILE_REOPEN; - else - printf("nfs4_reopen: failed, error %d, lost %d\n", error, (nofp->nof_flags & NFS_OPEN_FILE_LOST) ? 1 : 0); + else if (error) + printf("nfs4_reopen: failed, error %d, lost %d, %s\n", error, + (nofp->nof_flags & NFS_OPEN_FILE_LOST) ? 1 : 0, name ? 
name : "???"); lck_mtx_unlock(&nofp->nof_lock); if (filename && (filename != &smallname[0])) FREE(filename, M_TEMP); @@ -4112,6 +4473,7 @@ out: vnode_putname(vname); if (dvp != NULLVP) vnode_put(dvp); + return (error); } /* @@ -4147,13 +4509,73 @@ nfs4_open_reopen_rpc( int share_access, int share_deny) { - return (nfs4_open_rpc_internal(nofp, NULL, thd, cred, cnp, NULL, dvp, vpp, 0, share_access, share_deny)); + return (nfs4_open_rpc_internal(nofp, NULL, thd, cred, cnp, NULL, dvp, vpp, NFS_OPEN_NOCREATE, share_access, share_deny)); +} + +/* + * Send an OPEN_CONFIRM RPC to confirm an OPEN. + */ +int +nfs4_open_confirm_rpc( + struct nfsmount *nmp, + nfsnode_t dnp, + u_char *fhp, + int fhlen, + struct nfs_open_owner *noop, + nfs_stateid *sid, + thread_t thd, + kauth_cred_t cred, + struct nfs_vattr *nvap, + uint64_t *xidp) +{ + struct nfsm_chain nmreq, nmrep; + int error = 0, status, numops; + struct nfsreq_secinfo_args si; + + NFSREQ_SECINFO_SET(&si, dnp, NULL, 0, NULL, 0); + nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); + + // PUTFH, OPEN_CONFIRM, GETATTR + numops = 3; + nfsm_chain_build_alloc_init(error, &nmreq, 23 * NFSX_UNSIGNED); + nfsm_chain_add_compound_header(error, &nmreq, "open_confirm", numops); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nmp->nm_vers, fhp, fhlen); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_OPEN_CONFIRM); + nfsm_chain_add_stateid(error, &nmreq, sid); + nfsm_chain_add_32(error, &nmreq, noop->noo_seqid); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, dnp); + nfsm_chain_build_done(error, &nmreq); + nfsm_assert(error, (numops == 0), EPROTO); + nfsmout_if(error); + error = nfs_request2(dnp, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, &si, R_NOINTR, &nmrep, xidp, &status); + + nfsm_chain_skip_tag(error, &nmrep); + nfsm_chain_get_32(error, &nmrep, numops); + nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); + nfsmout_if(error); + nfsm_chain_op_check(error, &nmrep, NFS_OP_OPEN_CONFIRM); + nfs_owner_seqid_increment(noop, NULL, error); + nfsm_chain_get_stateid(error, &nmrep, sid); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + nfsmout_if(error); + error = nfs4_parsefattr(&nmrep, NULL, nvap, NULL, NULL, NULL); +nfsmout: + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); + return (error); } /* * common OPEN RPC code * * If create is set, ctx must be passed in. + * Returns a node on success if no node passed in. 
*/ int nfs4_open_rpc_internal( @@ -4171,20 +4593,24 @@ nfs4_open_rpc_internal( { struct nfsmount *nmp; struct nfs_open_owner *noop = nofp->nof_owner; - struct nfs_vattr nvattr, dnvattr; + struct nfs_vattr nvattr; int error = 0, open_error = EIO, lockerror = ENOENT, busyerror = ENOENT, status; - int nfsvers, numops, exclusive = 0, gotuid, gotgid; + int nfsvers, namedattrs, numops, exclusive = 0, gotuid, gotgid; u_int64_t xid, savedxid = 0; nfsnode_t dnp = VTONFS(dvp); nfsnode_t np, newnp = NULL; vnode_t newvp = NULL; struct nfsm_chain nmreq, nmrep; uint32_t bitmap[NFS_ATTR_BITMAP_LEN], bmlen; - uint32_t rflags, delegation = 0, recall = 0, val; + uint32_t rflags, delegation, recall; struct nfs_stateid stateid, dstateid, *sid; fhandle_t fh; - struct nfsreq *req = NULL; + struct nfsreq rq, *req = &rq; struct nfs_dulookup dul; + char sbuf[64], *s; + uint32_t ace_type, ace_flags, ace_mask, len, slen; + struct kauth_ace ace; + struct nfsreq_secinfo_args si; if (create && !ctx) return (EINVAL); @@ -4193,6 +4619,9 @@ nfs4_open_rpc_internal( if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + namedattrs = (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR); + if (dnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); np = *vpp ? VTONFS(*vpp) : NULL; if (create && vap) { @@ -4200,6 +4629,8 @@ nfs4_open_rpc_internal( nfs_avoid_needless_id_setting_on_create(dnp, vap, ctx); gotuid = VATTR_IS_ACTIVE(vap, va_uid); gotgid = VATTR_IS_ACTIVE(vap, va_gid); + if (exclusive && (!VATTR_IS_ACTIVE(vap, va_access_time) || !VATTR_IS_ACTIVE(vap, va_modify_time))) + vap->va_vaflags |= VA_UTIMES_NULL; } else { exclusive = gotuid = gotgid = 0; } @@ -4213,7 +4644,12 @@ nfs4_open_rpc_internal( if ((error = nfs_open_owner_set_busy(noop, thd))) return (error); again: - rflags = 0; + rflags = delegation = recall = 0; + ace.ace_flags = 0; + s = sbuf; + slen = sizeof(sbuf); + NVATTR_INIT(&nvattr); + NFSREQ_SECINFO_SET(&si, dnp, NULL, 0, cnp->cn_nameptr, cnp->cn_namelen); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -4232,13 +4668,9 @@ again: nfsm_chain_add_32(error, &nmreq, noop->noo_seqid); nfsm_chain_add_32(error, &nmreq, share_access); nfsm_chain_add_32(error, &nmreq, share_deny); - - // open owner: clientid + uid - nfsm_chain_add_64(error, &nmreq, nmp->nm_clientid); // open_owner4.clientid + nfsm_chain_add_64(error, &nmreq, nmp->nm_clientid); nfsm_chain_add_32(error, &nmreq, NFSX_UNSIGNED); - nfsm_chain_add_32(error, &nmreq, kauth_cred_getuid(noop->noo_cred)); // open_owner4.owner - - // openflag4 + nfsm_chain_add_32(error, &nmreq, kauth_cred_getuid(noop->noo_cred)); nfsm_chain_add_32(error, &nmreq, create); if (create) { if (exclusive) { @@ -4253,40 +4685,36 @@ again: nfsm_chain_add_fattr4(error, &nmreq, vap, nmp); } } - - // open_claim4 nfsm_chain_add_32(error, &nmreq, NFS_CLAIM_NULL); - nfsm_chain_add_string(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen); + nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, bitmap); NFS_BITMAP_SET(bitmap, NFS_FATTR_FILEHANDLE); - nfsm_chain_add_bitmap_masked(error, &nmreq, bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, bitmap, nmp, np); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_RESTOREFH); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); 
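
Throughout the patch, requests that used to mask the GETATTR bitmap by hand against nm_fsattr.nfsa_supp_attr now go through nfsm_chain_add_bitmap_supported(), which, judging by the calls it replaces, performs the equivalent masking against the server's advertised attribute set, parameterized by the mount and node. The masking step itself is a word-wise AND; a self-contained sketch, with a hypothetical bitmap length:

    #include <stdint.h>

    #define ATTR_BITMAP_LEN 2   /* hypothetical word count for the bitmap */

    static void attr_bitmap_mask(uint32_t *req, const uint32_t *supported)
    {
        int i;
        for (i = 0; i < ATTR_BITMAP_LEN; i++)
            req[i] &= supported[i]; /* never request unsupported attributes */
    }
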
+ nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, dnp); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); if (!error) error = busyerror = nfs_node_set_busy(dnp, thd); nfsmout_if(error); - if (create) + if (create && !namedattrs) nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); - error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, NULL, &req); + error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, &si, R_NOINTR, NULL, &req); if (!error) { - if (create) + if (create && !namedattrs) nfs_dulookup_start(&dul, dnp, ctx); error = nfs_request_async_finish(req, &nmrep, &xid, &status); savedxid = xid; } - if (create) + if (create && !namedattrs) nfs_dulookup_finish(&dul, dnp, ctx); if ((lockerror = nfs_node_lock(dnp))) @@ -4309,51 +4737,69 @@ again: case NFS_OPEN_DELEGATE_NONE: break; case NFS_OPEN_DELEGATE_READ: - nfsm_chain_get_stateid(error, &nmrep, &dstateid); - nfsm_chain_get_32(error, &nmrep, recall); - // ACE: (skip) XXX - nfsm_chain_adv(error, &nmrep, 3 * NFSX_UNSIGNED); - nfsm_chain_get_32(error, &nmrep, val); /* string length */ - nfsm_chain_adv(error, &nmrep, nfsm_rndup(val)); - break; case NFS_OPEN_DELEGATE_WRITE: nfsm_chain_get_stateid(error, &nmrep, &dstateid); nfsm_chain_get_32(error, &nmrep, recall); - // space (skip) XXX - nfsm_chain_adv(error, &nmrep, 3 * NFSX_UNSIGNED); - // ACE: (skip) XXX - nfsm_chain_adv(error, &nmrep, 3 * NFSX_UNSIGNED); - nfsm_chain_get_32(error, &nmrep, val); /* string length */ - nfsm_chain_adv(error, &nmrep, nfsm_rndup(val)); + if (delegation == NFS_OPEN_DELEGATE_WRITE) // space (skip) XXX + nfsm_chain_adv(error, &nmrep, 3 * NFSX_UNSIGNED); + /* if we have any trouble accepting the ACE, just invalidate it */ + ace_type = ace_flags = ace_mask = len = 0; + nfsm_chain_get_32(error, &nmrep, ace_type); + nfsm_chain_get_32(error, &nmrep, ace_flags); + nfsm_chain_get_32(error, &nmrep, ace_mask); + nfsm_chain_get_32(error, &nmrep, len); + ace.ace_flags = nfs4_ace_nfstype_to_vfstype(ace_type, &error); + ace.ace_flags |= nfs4_ace_nfsflags_to_vfsflags(ace_flags); + ace.ace_rights = nfs4_ace_nfsmask_to_vfsrights(ace_mask); + if (!error && (len >= slen)) { + MALLOC(s, char*, len+1, M_TEMP, M_WAITOK); + if (s) + slen = len+1; + else + ace.ace_flags = 0; + } + if (s) + nfsm_chain_get_opaque(error, &nmrep, len, s); + else + nfsm_chain_adv(error, &nmrep, nfsm_rndup(len)); + if (!error && s) { + s[len] = '\0'; + if (nfs4_id2guid(s, &ace.ace_applicable, (ace_flags & NFS_ACE_IDENTIFIER_GROUP))) + ace.ace_flags = 0; + } + if (error || !s) + ace.ace_flags = 0; + if (s && (s != sbuf)) + FREE(s, M_TEMP); break; default: error = EBADRPC; break; } /* At this point if we have no error, the object was created/opened. */ - /* if we don't get attributes, then we should lookitup. */ open_error = error; nfsmout_if(error); - if (create && !exclusive) + if (create && vap && !exclusive) nfs_vattr_set_supported(bitmap, vap); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); nfsmout_if(error); - NFS_CLEAR_ATTRIBUTES(nvattr.nva_bitmap); - error = nfs4_parsefattr(&nmrep, NULL, &nvattr, &fh, NULL); + error = nfs4_parsefattr(&nmrep, NULL, &nvattr, &fh, NULL, NULL); nfsmout_if(error); if (!NFS_BITMAP_ISSET(nvattr.nva_bitmap, NFS_FATTR_FILEHANDLE)) { - printf("nfs: open/create didn't return filehandle?\n"); + printf("nfs: open/create didn't return filehandle? 
%s\n", cnp->cn_nameptr); error = EBADRPC; goto nfsmout; } if (!create && np && !NFS_CMPFH(np, fh.fh_data, fh.fh_len)) { // XXX for the open case, what if fh doesn't match the vnode we think we're opening? - printf("nfs4_open_rpc: warning: file handle mismatch\n"); + // Solaris Named Attributes may do this due to a bug.... so don't warn for named attributes. + if (!(np->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR)) + NP(np, "nfs4_open_rpc: warning: file handle mismatch"); } /* directory attributes: if we don't get them, make sure to invalidate */ nfsm_chain_op_check(error, &nmrep, NFS_OP_RESTOREFH); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, dnp, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, dnp, nfsvers, &xid); if (error) NATTRINVALIDATE(dnp); nfsmout_if(error); @@ -4364,39 +4810,8 @@ again: if (rflags & NFS_OPEN_RESULT_CONFIRM) { nfs_node_unlock(dnp); lockerror = ENOENT; - nfsm_chain_cleanup(&nmreq); - nfsm_chain_cleanup(&nmrep); - // PUTFH, OPEN_CONFIRM, GETATTR - numops = 3; - nfsm_chain_build_alloc_init(error, &nmreq, 23 * NFSX_UNSIGNED); - nfsm_chain_add_compound_header(error, &nmreq, "open_confirm", numops); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); - nfsm_chain_add_fh(error, &nmreq, nfsvers, fh.fh_data, fh.fh_len); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_OPEN_CONFIRM); - nfsm_chain_add_stateid(error, &nmreq, sid); - nfsm_chain_add_32(error, &nmreq, noop->noo_seqid); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); - nfsm_chain_build_done(error, &nmreq); - nfsm_assert(error, (numops == 0), EPROTO); - nfsmout_if(error); - error = nfs_request2(dnp, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, 0, &nmrep, &xid, &status); - - nfsm_chain_skip_tag(error, &nmrep); - nfsm_chain_get_32(error, &nmrep, numops); - nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); - nfsmout_if(error); - nfsm_chain_op_check(error, &nmrep, NFS_OP_OPEN_CONFIRM); - nfs_owner_seqid_increment(noop, NULL, error); - nfsm_chain_get_stateid(error, &nmrep, sid); - nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsmout_if(error); - NFS_CLEAR_ATTRIBUTES(nvattr.nva_bitmap); - error = nfs4_parsefattr(&nmrep, NULL, &nvattr, NULL, NULL); + NVATTR_CLEANUP(&nvattr); + error = nfs4_open_confirm_rpc(nmp, dnp, fh.fh_data, fh.fh_len, noop, sid, thd, cred, &nvattr, &xid); nfsmout_if(error); savedxid = xid; if ((lockerror = nfs_node_lock(dnp))) @@ -4415,17 +4830,18 @@ nfsmout: dnp->n_flag |= NMODIFIED; nfs_node_unlock(dnp); lockerror = ENOENT; - nfs_getattr(dnp, &dnvattr, ctx, NGA_CACHED); + nfs_getattr(dnp, NULL, ctx, NGA_CACHED); } if (!lockerror) nfs_node_unlock(dnp); - if (!error && create && fh.fh_len) { + if (!error && !np && fh.fh_len) { /* create the vnode with the filehandle and attributes */ xid = savedxid; - error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, NG_MAKEENTRY, &newnp); + error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, NG_MAKEENTRY, &newnp); if (!error) newvp = NFSTOV(newnp); } + NVATTR_CLEANUP(&nvattr); if (!busyerror) nfs_node_clear_busy(dnp); if ((delegation == NFS_OPEN_DELEGATE_READ) || (delegation == NFS_OPEN_DELEGATE_WRITE)) { @@ -4437,15 +4853,39 @@ nfsmout: np->n_openflags &= ~N_DELEG_MASK; np->n_openflags |= ((delegation == NFS_OPEN_DELEGATE_READ) ? 
N_DELEG_READ : N_DELEG_WRITE); np->n_dstateid = dstateid; + np->n_dace = ace; + if (np->n_dlink.tqe_next == NFSNOLIST) { + lck_mtx_lock(&nmp->nm_lock); + if (np->n_dlink.tqe_next == NFSNOLIST) + TAILQ_INSERT_TAIL(&nmp->nm_delegations, np, n_dlink); + lck_mtx_unlock(&nmp->nm_lock); + } lck_mtx_unlock(&np->n_openlock); - } - if (recall) { - nfs4_delegreturn_rpc(nmp, fh.fh_data, fh.fh_len, &dstateid, thd, cred); + } else { + /* give the delegation back */ if (np) { - lck_mtx_lock(&np->n_openlock); - np->n_openflags &= ~N_DELEG_MASK; - lck_mtx_unlock(&np->n_openlock); + if (NFS_CMPFH(np, fh.fh_data, fh.fh_len)) { + /* update delegation state and return it */ + lck_mtx_lock(&np->n_openlock); + np->n_openflags &= ~N_DELEG_MASK; + np->n_openflags |= ((delegation == NFS_OPEN_DELEGATE_READ) ? N_DELEG_READ : N_DELEG_WRITE); + np->n_dstateid = dstateid; + np->n_dace = ace; + if (np->n_dlink.tqe_next == NFSNOLIST) { + lck_mtx_lock(&nmp->nm_lock); + if (np->n_dlink.tqe_next == NFSNOLIST) + TAILQ_INSERT_TAIL(&nmp->nm_delegations, np, n_dlink); + lck_mtx_unlock(&nmp->nm_lock); + } + lck_mtx_unlock(&np->n_openlock); + /* don't need to send a separate delegreturn for fh */ + fh.fh_len = 0; + } + /* return np's current delegation */ + nfs4_delegation_return(np, 0, thd, cred); } + if (fh.fh_len) /* return fh's delegation if it wasn't for np */ + nfs4_delegreturn_rpc(nmp, fh.fh_data, fh.fh_len, &dstateid, 0, thd, cred); } } if (error) { @@ -4478,6 +4918,266 @@ nfsmout: return (error); } + +/* + * Send an OPEN RPC to claim a delegated open for a file + */ +int +nfs4_claim_delegated_open_rpc( + struct nfs_open_file *nofp, + int share_access, + int share_deny, + int flags) +{ + struct nfsmount *nmp; + struct nfs_open_owner *noop = nofp->nof_owner; + struct nfs_vattr nvattr; + int error = 0, lockerror = ENOENT, status; + int nfsvers, numops; + u_int64_t xid; + nfsnode_t np = nofp->nof_np; + struct nfsm_chain nmreq, nmrep; + uint32_t bitmap[NFS_ATTR_BITMAP_LEN], bmlen; + uint32_t rflags = 0, delegation, recall = 0; + fhandle_t fh; + struct nfs_stateid dstateid; + char sbuf[64], *s = sbuf; + uint32_t ace_type, ace_flags, ace_mask, len, slen = sizeof(sbuf); + struct kauth_ace ace; + vnode_t dvp = NULL; + const char *vname = NULL; + const char *name = NULL; + size_t namelen; + char smallname[128]; + char *filename = NULL; + struct nfsreq_secinfo_args si; + + nmp = NFSTONMP(np); + if (!nmp) + return (ENXIO); + nfsvers = nmp->nm_vers; + + nfs_node_lock_force(np); + if ((vnode_vtype(NFSTOV(np)) != VDIR) && np->n_sillyrename) { + /* + * The node's been sillyrenamed, so we need to use + * the sillyrename directory/name to do the open. + */ + struct nfs_sillyrename *nsp = np->n_sillyrename; + dvp = NFSTOV(nsp->nsr_dnp); + if ((error = vnode_get(dvp))) { + nfs_node_unlock(np); + goto out; + } + name = nsp->nsr_name; + } else { + /* + * [sigh] We can't trust VFS to get the parent right for named + * attribute nodes. (It likes to reparent the nodes after we've + * created them.) Luckily we can probably get the right parent + * from the n_parent we have stashed away. 
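 *
 * So the claim picks its directory/name through a small ladder, roughly
 * (a sketch; the real code below also handles vnode_get() failures and
 * the node lock):
 *
 *     if (np->n_sillyrename)             use the sillyrename dir and name
 *     else if (named attr && n_parent)   use the stashed parent
 *     else                               vnode_getparent() / vnode_getname()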
+ */ + if ((np->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR) && + (((dvp = np->n_parent)) && (error = vnode_get(dvp)))) + dvp = NULL; + if (!dvp) + dvp = vnode_getparent(NFSTOV(np)); + vname = vnode_getname(NFSTOV(np)); + if (!dvp || !vname) { + if (!error) + error = EIO; + nfs_node_unlock(np); + goto out; + } + name = vname; + } + filename = &smallname[0]; + namelen = snprintf(filename, sizeof(smallname), "%s", name); + if (namelen >= sizeof(smallname)) { + MALLOC(filename, char *, namelen+1, M_TEMP, M_WAITOK); + if (!filename) { + error = ENOMEM; + goto out; + } + snprintf(filename, namelen+1, "%s", name); + } + nfs_node_unlock(np); + + if ((error = nfs_open_owner_set_busy(noop, NULL))) + return (error); + + NVATTR_INIT(&nvattr); + delegation = NFS_OPEN_DELEGATE_NONE; + dstateid = np->n_dstateid; + NFSREQ_SECINFO_SET(&si, VTONFS(dvp), NULL, 0, filename, namelen); + + nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); + + // PUTFH, OPEN, GETATTR(FH) + numops = 3; + nfsm_chain_build_alloc_init(error, &nmreq, 48 * NFSX_UNSIGNED); + nfsm_chain_add_compound_header(error, &nmreq, "open_claim_d", numops); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nfsvers, VTONFS(dvp)->n_fhp, VTONFS(dvp)->n_fhsize); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_OPEN); + nfsm_chain_add_32(error, &nmreq, noop->noo_seqid); + nfsm_chain_add_32(error, &nmreq, share_access); + nfsm_chain_add_32(error, &nmreq, share_deny); + // open owner: clientid + uid + nfsm_chain_add_64(error, &nmreq, nmp->nm_clientid); // open_owner4.clientid + nfsm_chain_add_32(error, &nmreq, NFSX_UNSIGNED); + nfsm_chain_add_32(error, &nmreq, kauth_cred_getuid(noop->noo_cred)); // open_owner4.owner + // openflag4 + nfsm_chain_add_32(error, &nmreq, NFS_OPEN_NOCREATE); + // open_claim4 + nfsm_chain_add_32(error, &nmreq, NFS_CLAIM_DELEGATE_CUR); + nfsm_chain_add_stateid(error, &nmreq, &np->n_dstateid); + nfsm_chain_add_name(error, &nmreq, filename, namelen, nmp); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, bitmap); + NFS_BITMAP_SET(bitmap, NFS_FATTR_FILEHANDLE); + nfsm_chain_add_bitmap_supported(error, &nmreq, bitmap, nmp, np); + nfsm_chain_build_done(error, &nmreq); + nfsm_assert(error, (numops == 0), EPROTO); + nfsmout_if(error); + + error = nfs_request2(np, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, current_thread(), + noop->noo_cred, &si, flags|R_NOINTR, &nmrep, &xid, &status); + + if ((lockerror = nfs_node_lock(np))) + error = lockerror; + nfsm_chain_skip_tag(error, &nmrep); + nfsm_chain_get_32(error, &nmrep, numops); + nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); + nfsmout_if(error); + nfsm_chain_op_check(error, &nmrep, NFS_OP_OPEN); + nfs_owner_seqid_increment(noop, NULL, error); + nfsm_chain_get_stateid(error, &nmrep, &nofp->nof_stateid); + nfsm_chain_check_change_info(error, &nmrep, np); + nfsm_chain_get_32(error, &nmrep, rflags); + bmlen = NFS_ATTR_BITMAP_LEN; + nfsm_chain_get_bitmap(error, &nmrep, bitmap, bmlen); + nfsm_chain_get_32(error, &nmrep, delegation); + if (!error) + switch (delegation) { + case NFS_OPEN_DELEGATE_NONE: + // if (!(np->n_openflags & N_DELEG_RETURN)) /* don't warn if delegation is being returned */ + // printf("nfs: open delegated claim didn't return a delegation %s\n", filename ? 
filename : "???"); + break; + case NFS_OPEN_DELEGATE_READ: + case NFS_OPEN_DELEGATE_WRITE: + if ((((np->n_openflags & N_DELEG_MASK) == N_DELEG_READ) && + (delegation == NFS_OPEN_DELEGATE_WRITE)) || + (((np->n_openflags & N_DELEG_MASK) == N_DELEG_WRITE) && + (delegation == NFS_OPEN_DELEGATE_READ))) + printf("nfs: open delegated claim returned a different delegation type! have %s got %s %s\n", + ((np->n_openflags & N_DELEG_MASK) == N_DELEG_WRITE) ? "W" : "R", + (delegation == NFS_OPEN_DELEGATE_WRITE) ? "W" : "R", filename ? filename : "???"); + nfsm_chain_get_stateid(error, &nmrep, &dstateid); + nfsm_chain_get_32(error, &nmrep, recall); + if (delegation == NFS_OPEN_DELEGATE_WRITE) // space (skip) XXX + nfsm_chain_adv(error, &nmrep, 3 * NFSX_UNSIGNED); + /* if we have any trouble accepting the ACE, just invalidate it */ + ace_type = ace_flags = ace_mask = len = 0; + nfsm_chain_get_32(error, &nmrep, ace_type); + nfsm_chain_get_32(error, &nmrep, ace_flags); + nfsm_chain_get_32(error, &nmrep, ace_mask); + nfsm_chain_get_32(error, &nmrep, len); + ace.ace_flags = nfs4_ace_nfstype_to_vfstype(ace_type, &error); + ace.ace_flags |= nfs4_ace_nfsflags_to_vfsflags(ace_flags); + ace.ace_rights = nfs4_ace_nfsmask_to_vfsrights(ace_mask); + if (!error && (len >= slen)) { + MALLOC(s, char*, len+1, M_TEMP, M_WAITOK); + if (s) + slen = len+1; + else + ace.ace_flags = 0; + } + if (s) + nfsm_chain_get_opaque(error, &nmrep, len, s); + else + nfsm_chain_adv(error, &nmrep, nfsm_rndup(len)); + if (!error && s) { + s[len] = '\0'; + if (nfs4_id2guid(s, &ace.ace_applicable, (ace_flags & NFS_ACE_IDENTIFIER_GROUP))) + ace.ace_flags = 0; + } + if (error || !s) + ace.ace_flags = 0; + if (s && (s != sbuf)) + FREE(s, M_TEMP); + if (!error) { + /* stuff the latest delegation state in the node */ + lck_mtx_lock(&np->n_openlock); + np->n_openflags &= ~N_DELEG_MASK; + np->n_openflags |= ((delegation == NFS_OPEN_DELEGATE_READ) ? N_DELEG_READ : N_DELEG_WRITE); + np->n_dstateid = dstateid; + np->n_dace = ace; + if (np->n_dlink.tqe_next == NFSNOLIST) { + lck_mtx_lock(&nmp->nm_lock); + if (np->n_dlink.tqe_next == NFSNOLIST) + TAILQ_INSERT_TAIL(&nmp->nm_delegations, np, n_dlink); + lck_mtx_unlock(&nmp->nm_lock); + } + lck_mtx_unlock(&np->n_openlock); + } + break; + default: + error = EBADRPC; + break; + } + nfsmout_if(error); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + error = nfs4_parsefattr(&nmrep, NULL, &nvattr, &fh, NULL, NULL); + nfsmout_if(error); + if (!NFS_BITMAP_ISSET(nvattr.nva_bitmap, NFS_FATTR_FILEHANDLE)) { + printf("nfs: open reclaim didn't return filehandle? %s\n", filename ? filename : "???"); + error = EBADRPC; + goto nfsmout; + } + if (!NFS_CMPFH(np, fh.fh_data, fh.fh_len)) { + // XXX what if fh doesn't match the vnode we think we're re-opening? + // Solaris Named Attributes may do this due to a bug.... so don't warn for named attributes. + if (!(np->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR)) + printf("nfs4_claim_delegated_open_rpc: warning: file handle mismatch %s\n", filename ? filename : "???"); + } + error = nfs_loadattrcache(np, &nvattr, &xid, 1); + nfsmout_if(error); + if (rflags & NFS_OPEN_RESULT_LOCKTYPE_POSIX) + nofp->nof_flags |= NFS_OPEN_FILE_POSIXLOCK; +nfsmout: + NVATTR_CLEANUP(&nvattr); + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); + if (!lockerror) + nfs_node_unlock(np); + nfs_open_owner_clear_busy(noop); + if ((delegation == NFS_OPEN_DELEGATE_READ) || (delegation == NFS_OPEN_DELEGATE_WRITE)) { + if (recall) { + /* + * We're making a delegated claim. 
+ * Don't return the delegation here in case we have more to claim. + * Just make sure it's queued up to be returned. + */ + nfs4_delegation_return_enqueue(np); + } + } +out: + // if (!error) + // printf("nfs: open claim delegated (%d, %d) succeeded for %s\n", share_access, share_deny, filename ? filename : "???"); + if (filename && (filename != &smallname[0])) + FREE(filename, M_TEMP); + if (vname) + vnode_putname(vname); + if (dvp != NULLVP) + vnode_put(dvp); + return (error); +} + /* * Send an OPEN RPC to reclaim an open file. */ @@ -4496,19 +5196,26 @@ nfs4_open_reclaim_rpc( nfsnode_t np = nofp->nof_np; struct nfsm_chain nmreq, nmrep; uint32_t bitmap[NFS_ATTR_BITMAP_LEN], bmlen; - uint32_t rflags = 0, delegation, recall = 0, val; + uint32_t rflags = 0, delegation, recall = 0; fhandle_t fh; struct nfs_stateid dstateid; + char sbuf[64], *s = sbuf; + uint32_t ace_type, ace_flags, ace_mask, len, slen = sizeof(sbuf); + struct kauth_ace ace; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(np); if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; - if ((error = nfs_open_owner_set_busy(noop, current_thread()))) + if ((error = nfs_open_owner_set_busy(noop, NULL))) return (error); + NVATTR_INIT(&nvattr); delegation = NFS_OPEN_DELEGATE_NONE; + dstateid = np->n_dstateid; + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -4542,13 +5249,13 @@ nfs4_open_reclaim_rpc( nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, bitmap); NFS_BITMAP_SET(bitmap, NFS_FATTR_FILEHANDLE); - nfsm_chain_add_bitmap_masked(error, &nmreq, bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, bitmap, nmp, np); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request2(np, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, current_thread(), noop->noo_cred, R_RECOVER, &nmrep, &xid, &status); + error = nfs_request2(np, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, current_thread(), + noop->noo_cred, &si, R_RECOVER|R_NOINTR, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; @@ -4567,38 +5274,66 @@ nfs4_open_reclaim_rpc( if (!error) switch (delegation) { case NFS_OPEN_DELEGATE_NONE: - break; - case NFS_OPEN_DELEGATE_READ: - nfsm_chain_get_stateid(error, &nmrep, &dstateid); - nfsm_chain_get_32(error, &nmrep, recall); - // ACE: (skip) XXX - nfsm_chain_adv(error, &nmrep, 3 * NFSX_UNSIGNED); - nfsm_chain_get_32(error, &nmrep, val); /* string length */ - nfsm_chain_adv(error, &nmrep, nfsm_rndup(val)); - if (!error) { - /* stuff the delegation state in the node */ - lck_mtx_lock(&np->n_openlock); - np->n_openflags &= ~N_DELEG_MASK; - np->n_openflags |= N_DELEG_READ; - np->n_dstateid = dstateid; - lck_mtx_unlock(&np->n_openlock); + if (np->n_openflags & N_DELEG_MASK) { + /* + * Hey! We were supposed to get our delegation back even + * if it was getting immediately recalled. Bad server! + * + * Just try to return the existing delegation. + */ + // NP(np, "nfs: open reclaim didn't return delegation?"); + delegation = (np->n_openflags & N_DELEG_WRITE) ? 
NFS_OPEN_DELEGATE_WRITE : NFS_OPEN_DELEGATE_READ; + recall = 1; } break; + case NFS_OPEN_DELEGATE_READ: case NFS_OPEN_DELEGATE_WRITE: nfsm_chain_get_stateid(error, &nmrep, &dstateid); nfsm_chain_get_32(error, &nmrep, recall); - // space (skip) XXX - nfsm_chain_adv(error, &nmrep, 3 * NFSX_UNSIGNED); - // ACE: (skip) XXX - nfsm_chain_adv(error, &nmrep, 3 * NFSX_UNSIGNED); - nfsm_chain_get_32(error, &nmrep, val); /* string length */ - nfsm_chain_adv(error, &nmrep, nfsm_rndup(val)); + if (delegation == NFS_OPEN_DELEGATE_WRITE) // space (skip) XXX + nfsm_chain_adv(error, &nmrep, 3 * NFSX_UNSIGNED); + /* if we have any trouble accepting the ACE, just invalidate it */ + ace_type = ace_flags = ace_mask = len = 0; + nfsm_chain_get_32(error, &nmrep, ace_type); + nfsm_chain_get_32(error, &nmrep, ace_flags); + nfsm_chain_get_32(error, &nmrep, ace_mask); + nfsm_chain_get_32(error, &nmrep, len); + ace.ace_flags = nfs4_ace_nfstype_to_vfstype(ace_type, &error); + ace.ace_flags |= nfs4_ace_nfsflags_to_vfsflags(ace_flags); + ace.ace_rights = nfs4_ace_nfsmask_to_vfsrights(ace_mask); + if (!error && (len >= slen)) { + MALLOC(s, char*, len+1, M_TEMP, M_WAITOK); + if (s) + slen = len+1; + else + ace.ace_flags = 0; + } + if (s) + nfsm_chain_get_opaque(error, &nmrep, len, s); + else + nfsm_chain_adv(error, &nmrep, nfsm_rndup(len)); + if (!error && s) { + s[len] = '\0'; + if (nfs4_id2guid(s, &ace.ace_applicable, (ace_flags & NFS_ACE_IDENTIFIER_GROUP))) + ace.ace_flags = 0; + } + if (error || !s) + ace.ace_flags = 0; + if (s && (s != sbuf)) + FREE(s, M_TEMP); if (!error) { /* stuff the delegation state in the node */ lck_mtx_lock(&np->n_openlock); np->n_openflags &= ~N_DELEG_MASK; - np->n_openflags |= N_DELEG_WRITE; + np->n_openflags |= ((delegation == NFS_OPEN_DELEGATE_READ) ? N_DELEG_READ : N_DELEG_WRITE); np->n_dstateid = dstateid; + np->n_dace = ace; + if (np->n_dlink.tqe_next == NFSNOLIST) { + lck_mtx_lock(&nmp->nm_lock); + if (np->n_dlink.tqe_next == NFSNOLIST) + TAILQ_INSERT_TAIL(&nmp->nm_delegations, np, n_dlink); + lck_mtx_unlock(&nmp->nm_lock); + } lck_mtx_unlock(&np->n_openlock); } break; @@ -4608,35 +5343,37 @@ nfs4_open_reclaim_rpc( } nfsmout_if(error); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - NFS_CLEAR_ATTRIBUTES(nvattr.nva_bitmap); - error = nfs4_parsefattr(&nmrep, NULL, &nvattr, &fh, NULL); + error = nfs4_parsefattr(&nmrep, NULL, &nvattr, &fh, NULL, NULL); nfsmout_if(error); if (!NFS_BITMAP_ISSET(nvattr.nva_bitmap, NFS_FATTR_FILEHANDLE)) { - printf("nfs: open reclaim didn't return filehandle?\n"); + NP(np, "nfs: open reclaim didn't return filehandle?"); error = EBADRPC; goto nfsmout; } if (!NFS_CMPFH(np, fh.fh_data, fh.fh_len)) { // XXX what if fh doesn't match the vnode we think we're re-opening? - printf("nfs4_open_reclaim_rpc: warning: file handle mismatch\n"); + // That should be pretty hard in this case, given that we are doing + // the open reclaim using the file handle (and not a dir/name pair). + // Solaris Named Attributes may do this due to a bug.... so don't warn for named attributes. 
+ if (!(np->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR)) + NP(np, "nfs4_open_reclaim_rpc: warning: file handle mismatch"); } error = nfs_loadattrcache(np, &nvattr, &xid, 1); nfsmout_if(error); if (rflags & NFS_OPEN_RESULT_LOCKTYPE_POSIX) nofp->nof_flags |= NFS_OPEN_FILE_POSIXLOCK; nfsmout: + // if (!error) + // NP(np, "nfs: open reclaim (%d, %d) succeeded", share_access, share_deny); + NVATTR_CLEANUP(&nvattr); nfsm_chain_cleanup(&nmreq); nfsm_chain_cleanup(&nmrep); if (!lockerror) nfs_node_unlock(np); nfs_open_owner_clear_busy(noop); if ((delegation == NFS_OPEN_DELEGATE_READ) || (delegation == NFS_OPEN_DELEGATE_WRITE)) { - if (recall) { - nfs4_delegreturn_rpc(nmp, fh.fh_data, fh.fh_len, &dstateid, current_thread(), noop->noo_cred); - lck_mtx_lock(&np->n_openlock); - np->n_openflags &= ~N_DELEG_MASK; - lck_mtx_unlock(&np->n_openlock); - } + if (recall) + nfs4_delegation_return_enqueue(np); } return (error); } @@ -4652,15 +5389,17 @@ nfs4_open_downgrade_rpc( int error, lockerror = ENOENT, status, nfsvers, numops; struct nfsm_chain nmreq, nmrep; u_int64_t xid; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(np); if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; - if ((error = nfs_open_owner_set_busy(noop, vfs_context_thread(ctx)))) + if ((error = nfs_open_owner_set_busy(noop, NULL))) return (error); + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -4679,12 +5418,13 @@ nfs4_open_downgrade_rpc( nfsm_chain_add_32(error, &nmreq, nofp->nof_deny); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, np); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, &xid, &status); + error = nfs_request2(np, NULL, &nmreq, NFSPROC4_COMPOUND, + vfs_context_thread(ctx), vfs_context_ucred(ctx), + &si, R_NOINTR, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; @@ -4696,7 +5436,7 @@ nfs4_open_downgrade_rpc( nfs_owner_seqid_increment(noop, NULL, error); nfsm_chain_get_stateid(error, &nmrep, &nofp->nof_stateid); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, np, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid); nfsmout: if (!lockerror) nfs_node_unlock(np); @@ -4712,26 +5452,28 @@ nfs4_close_rpc( struct nfs_open_file *nofp, thread_t thd, kauth_cred_t cred, - int flag) + int flags) { struct nfs_open_owner *noop = nofp->nof_owner; struct nfsmount *nmp; int error, lockerror = ENOENT, status, nfsvers, numops; struct nfsm_chain nmreq, nmrep; u_int64_t xid; + struct nfsreq_secinfo_args si; nmp = NFSTONMP(np); if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; - if ((error = nfs_open_owner_set_busy(noop, thd))) + if ((error = nfs_open_owner_set_busy(noop, NULL))) return (error); + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); - // PUTFH, CLOSE, GETFH + // PUTFH, CLOSE, GETATTR numops = 3; nfsm_chain_build_alloc_init(error, &nmreq, 23 * NFSX_UNSIGNED); nfsm_chain_add_compound_header(error, &nmreq, "close", numops); @@ -4744,12 +5486,11 @@ nfs4_close_rpc( nfsm_chain_add_stateid(error, &nmreq, &nofp->nof_stateid); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, 
&nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, np); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request2(np, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, flag, &nmrep, &xid, &status); + error = nfs_request2(np, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, &si, flags|R_NOINTR, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; @@ -4761,7 +5502,7 @@ nfs4_close_rpc( nfs_owner_seqid_increment(noop, NULL, error); nfsm_chain_get_stateid(error, &nmrep, &nofp->nof_stateid); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - nfsm_chain_loadattr(error, &nmrep, np, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid); nfsmout: if (!lockerror) nfs_node_unlock(np); @@ -4772,685 +5513,2630 @@ nfsmout: } -int -nfs4_delegreturn_rpc(struct nfsmount *nmp, u_char *fhp, int fhlen, struct nfs_stateid *sid, thread_t thd, kauth_cred_t cred) -{ - int error = 0, status, numops; - uint64_t xid; - struct nfsm_chain nmreq, nmrep; - - nfsm_chain_null(&nmreq); - nfsm_chain_null(&nmrep); - - // PUTFH, DELEGRETURN - numops = 2; - nfsm_chain_build_alloc_init(error, &nmreq, 16 * NFSX_UNSIGNED); - nfsm_chain_add_compound_header(error, &nmreq, "delegreturn", numops); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); - nfsm_chain_add_fh(error, &nmreq, nmp->nm_vers, fhp, fhlen); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_DELEGRETURN); - nfsm_chain_add_stateid(error, &nmreq, sid); - nfsm_chain_build_done(error, &nmreq); - nfsm_assert(error, (numops == 0), EPROTO); - nfsmout_if(error); - error = nfs_request2(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, thd, cred, R_RECOVER, &nmrep, &xid, &status); - nfsm_chain_skip_tag(error, &nmrep); - nfsm_chain_get_32(error, &nmrep, numops); - nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); - nfsm_chain_op_check(error, &nmrep, NFS_OP_DELEGRETURN); -nfsmout: - nfsm_chain_cleanup(&nmreq); - nfsm_chain_cleanup(&nmrep); - return (error); -} - - /* - * NFSv4 read call. - * Just call nfs_bioread() to do the work. - * - * Note: the exec code paths have a tendency to call VNOP_READ (and VNOP_MMAP) - * without first calling VNOP_OPEN, so we make sure the file is open here. + * Claim the delegated open combinations this open file holds. 
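+ *
+ * Opens performed while holding the delegation are counted in the
+ * nof_d_* fields by (access, deny) combination.  Each nonzero combination
+ * is reclaimed with an OPEN/CLAIM_DELEGATE_CUR and, on success, its count
+ * is moved to the corresponding non-delegated field.  The pattern repeated
+ * below for each combination is roughly (locking elided):
+ *
+ *	if (nofp->nof_d_rw_drw) {
+ *		error = nfs4_claim_delegated_open_rpc(nofp,
+ *			NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_BOTH, flags);
+ *		if (!error) {
+ *			nofp->nof_rw_drw += nofp->nof_d_rw_drw;
+ *			nofp->nof_d_rw_drw = 0;
+ *		}
+ *	}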
*/ int -nfs4_vnop_read( - struct vnop_read_args /* { - struct vnodeop_desc *a_desc; - vnode_t a_vp; - struct uio *a_uio; - int a_ioflag; - vfs_context_t a_context; - } */ *ap) +nfs4_claim_delegated_state_for_open_file(struct nfs_open_file *nofp, int flags) { - vnode_t vp = ap->a_vp; - vfs_context_t ctx = ap->a_context; - nfsnode_t np; + struct nfs_open_owner *noop = nofp->nof_owner; + struct nfs_lock_owner *nlop; + struct nfs_file_lock *nflp, *nextnflp; struct nfsmount *nmp; - struct nfs_open_owner *noop; - struct nfs_open_file *nofp; - int error; - - if (vnode_vtype(ap->a_vp) != VREG) - return (EPERM); - - np = VTONFS(vp); - nmp = NFSTONMP(np); - if (!nmp) - return (ENXIO); + int error = 0, reopen = 0; - noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 1); - if (!noop) - return (ENOMEM); -restart: - error = nfs_open_file_find(np, noop, &nofp, 0, 0, 1); - if (!error && (nofp->nof_flags & NFS_OPEN_FILE_LOST)) { - printf("nfs_vnop_read: LOST\n"); - error = EIO; + if (nofp->nof_d_rw_drw) { + error = nfs4_claim_delegated_open_rpc(nofp, NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_BOTH, flags); + if (!error) { + lck_mtx_lock(&nofp->nof_lock); + nofp->nof_rw_drw += nofp->nof_d_rw_drw; + nofp->nof_d_rw_drw = 0; + lck_mtx_unlock(&nofp->nof_lock); + } } - if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { - nfs4_reopen(nofp, vfs_context_thread(ctx)); - nofp = NULL; - goto restart; + if (!error && nofp->nof_d_w_drw) { + error = nfs4_claim_delegated_open_rpc(nofp, NFS_OPEN_SHARE_ACCESS_WRITE, NFS_OPEN_SHARE_DENY_BOTH, flags); + if (!error) { + lck_mtx_lock(&nofp->nof_lock); + nofp->nof_w_drw += nofp->nof_d_w_drw; + nofp->nof_d_w_drw = 0; + lck_mtx_unlock(&nofp->nof_lock); + } } - if (error) { - nfs_open_owner_rele(noop); + if (!error && nofp->nof_d_r_drw) { + error = nfs4_claim_delegated_open_rpc(nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_BOTH, flags); + if (!error) { + lck_mtx_lock(&nofp->nof_lock); + nofp->nof_r_drw += nofp->nof_d_r_drw; + nofp->nof_d_r_drw = 0; + lck_mtx_unlock(&nofp->nof_lock); + } + } + if (!error && nofp->nof_d_rw_dw) { + error = nfs4_claim_delegated_open_rpc(nofp, NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_WRITE, flags); + if (!error) { + lck_mtx_lock(&nofp->nof_lock); + nofp->nof_rw_dw += nofp->nof_d_rw_dw; + nofp->nof_d_rw_dw = 0; + lck_mtx_unlock(&nofp->nof_lock); + } + } + if (!error && nofp->nof_d_w_dw) { + error = nfs4_claim_delegated_open_rpc(nofp, NFS_OPEN_SHARE_ACCESS_WRITE, NFS_OPEN_SHARE_DENY_WRITE, flags); + if (!error) { + lck_mtx_lock(&nofp->nof_lock); + nofp->nof_w_dw += nofp->nof_d_w_dw; + nofp->nof_d_w_dw = 0; + lck_mtx_unlock(&nofp->nof_lock); + } + } + if (!error && nofp->nof_d_r_dw) { + error = nfs4_claim_delegated_open_rpc(nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_WRITE, flags); + if (!error) { + lck_mtx_lock(&nofp->nof_lock); + nofp->nof_r_dw += nofp->nof_d_r_dw; + nofp->nof_d_r_dw = 0; + lck_mtx_unlock(&nofp->nof_lock); + } + } + /* non-deny-mode opens may be reopened if no locks are held */ + if (!error && nofp->nof_d_rw) { + error = nfs4_claim_delegated_open_rpc(nofp, NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_NONE, flags); + /* for some errors, we should just try reopening the file */ + if (nfs_mount_state_error_delegation_lost(error)) + reopen = error; + if (!error || reopen) { + lck_mtx_lock(&nofp->nof_lock); + nofp->nof_rw += nofp->nof_d_rw; + nofp->nof_d_rw = 0; + lck_mtx_unlock(&nofp->nof_lock); + } + } + /* if we've already set reopen, we should move these other two opens from delegated to not 
delegated */ + if ((!error || reopen) && nofp->nof_d_w) { + if (!error) { + error = nfs4_claim_delegated_open_rpc(nofp, NFS_OPEN_SHARE_ACCESS_WRITE, NFS_OPEN_SHARE_DENY_NONE, flags); + /* for some errors, we should just try reopening the file */ + if (nfs_mount_state_error_delegation_lost(error)) + reopen = error; + } + if (!error || reopen) { + lck_mtx_lock(&nofp->nof_lock); + nofp->nof_w += nofp->nof_d_w; + nofp->nof_d_w = 0; + lck_mtx_unlock(&nofp->nof_lock); + } + } + if ((!error || reopen) && nofp->nof_d_r) { + if (!error) { + error = nfs4_claim_delegated_open_rpc(nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, flags); + /* for some errors, we should just try reopening the file */ + if (nfs_mount_state_error_delegation_lost(error)) + reopen = error; + } + if (!error || reopen) { + lck_mtx_lock(&nofp->nof_lock); + nofp->nof_r += nofp->nof_d_r; + nofp->nof_d_r = 0; + lck_mtx_unlock(&nofp->nof_lock); + } + } + + if (reopen) { + /* + * Any problems with the delegation probably indicates that we + * should review/return all of our current delegation state. + */ + if ((nmp = NFSTONMP(nofp->nof_np))) { + nfs4_delegation_return_enqueue(nofp->nof_np); + lck_mtx_lock(&nmp->nm_lock); + nfs_need_recover(nmp, NFSERR_EXPIRED); + lck_mtx_unlock(&nmp->nm_lock); + } + if (reopen && (nfs_check_for_locks(noop, nofp) == 0)) { + /* just reopen the file on next access */ + NP(nofp->nof_np, "nfs4_claim_delegated_state_for_open_file: %d, need reopen, %d", + reopen, kauth_cred_getuid(nofp->nof_owner->noo_cred)); + lck_mtx_lock(&nofp->nof_lock); + nofp->nof_flags |= NFS_OPEN_FILE_REOPEN; + lck_mtx_unlock(&nofp->nof_lock); + return (0); + } + if (reopen) + NP(nofp->nof_np, "nfs4_claim_delegated_state_for_open_file: %d, locks prevent reopen, %d", + reopen, kauth_cred_getuid(nofp->nof_owner->noo_cred)); + } + + if (!error && ((nmp = NFSTONMP(nofp->nof_np)))) { + /* claim delegated locks */ + TAILQ_FOREACH(nlop, &nofp->nof_np->n_lock_owners, nlo_link) { + if (nlop->nlo_open_owner != noop) + continue; + TAILQ_FOREACH_SAFE(nflp, &nlop->nlo_locks, nfl_lolink, nextnflp) { + /* skip dead & blocked lock requests (shouldn't be any in the held lock list) */ + if (nflp->nfl_flags & (NFS_FILE_LOCK_DEAD|NFS_FILE_LOCK_BLOCKED)) + continue; + /* skip non-delegated locks */ + if (!(nflp->nfl_flags & NFS_FILE_LOCK_DELEGATED)) + continue; + error = nmp->nm_funcs->nf_setlock_rpc(nofp->nof_np, nofp, nflp, 0, flags, current_thread(), noop->noo_cred); + if (error) { + NP(nofp->nof_np, "nfs: delegated lock claim (0x%llx, 0x%llx) failed %d, %d", + nflp->nfl_start, nflp->nfl_end, error, kauth_cred_getuid(nofp->nof_owner->noo_cred)); + break; + } + // else { + // NP(nofp->nof_np, "nfs: delegated lock claim (0x%llx, 0x%llx) succeeded, %d", + // nflp->nfl_start, nflp->nfl_end, kauth_cred_getuid(nofp->nof_owner->noo_cred)); + // } + } + if (error) + break; + } + } + + if (!error) /* all state claimed successfully! 
*/ + return (0); + + /* restart if it looks like a problem more than just losing the delegation */ + if (!nfs_mount_state_error_delegation_lost(error) && + ((error == ETIMEDOUT) || nfs_mount_state_error_should_restart(error))) { + NP(nofp->nof_np, "nfs delegated lock claim error %d, %d", error, kauth_cred_getuid(nofp->nof_owner->noo_cred)); + if ((error == ETIMEDOUT) && ((nmp = NFSTONMP(nofp->nof_np)))) + nfs_need_reconnect(nmp); return (error); } - if (!nofp->nof_access) { - /* we don't have the file open, so open it for read access */ - error = nfs_mount_state_in_use_start(nmp); - if (error) { - nfs_open_owner_rele(noop); - return (error); + + /* delegated state lost (once held but now not claimable) */ + NP(nofp->nof_np, "nfs delegated state claim error %d, state lost, %d", error, kauth_cred_getuid(nofp->nof_owner->noo_cred)); + + /* + * Any problems with the delegation probably indicates that we + * should review/return all of our current delegation state. + */ + if ((nmp = NFSTONMP(nofp->nof_np))) { + nfs4_delegation_return_enqueue(nofp->nof_np); + lck_mtx_lock(&nmp->nm_lock); + nfs_need_recover(nmp, NFSERR_EXPIRED); + lck_mtx_unlock(&nmp->nm_lock); + } + + /* revoke all open file state */ + nfs_revoke_open_state_for_node(nofp->nof_np); + + return (error); +} + +/* + * Release all open state for the given node. + */ +void +nfs_release_open_state_for_node(nfsnode_t np, int force) +{ + struct nfsmount *nmp = NFSTONMP(np); + struct nfs_open_file *nofp; + struct nfs_file_lock *nflp, *nextnflp; + + /* drop held locks */ + TAILQ_FOREACH_SAFE(nflp, &np->n_locks, nfl_link, nextnflp) { + /* skip dead & blocked lock requests */ + if (nflp->nfl_flags & (NFS_FILE_LOCK_DEAD|NFS_FILE_LOCK_BLOCKED)) + continue; + /* send an unlock if not a delegated lock */ + if (!force && nmp && !(nflp->nfl_flags & NFS_FILE_LOCK_DELEGATED)) + nmp->nm_funcs->nf_unlock_rpc(np, nflp->nfl_owner, F_WRLCK, nflp->nfl_start, nflp->nfl_end, R_RECOVER, + NULL, nflp->nfl_owner->nlo_open_owner->noo_cred); + /* kill/remove the lock */ + lck_mtx_lock(&np->n_openlock); + nflp->nfl_flags |= NFS_FILE_LOCK_DEAD; + lck_mtx_lock(&nflp->nfl_owner->nlo_lock); + TAILQ_REMOVE(&nflp->nfl_owner->nlo_locks, nflp, nfl_lolink); + lck_mtx_unlock(&nflp->nfl_owner->nlo_lock); + if (nflp->nfl_blockcnt) { + /* wake up anyone blocked on this lock */ + wakeup(nflp); + } else { + /* remove nflp from lock list and destroy */ + TAILQ_REMOVE(&np->n_locks, nflp, nfl_link); + nfs_file_lock_destroy(nflp); } - error = nfs_open_file_set_busy(nofp, vfs_context_thread(ctx)); + lck_mtx_unlock(&np->n_openlock); + } + + lck_mtx_lock(&np->n_openlock); + + /* drop all opens */ + TAILQ_FOREACH(nofp, &np->n_opens, nof_link) { + if (nofp->nof_flags & NFS_OPEN_FILE_LOST) + continue; + /* mark open state as lost */ + lck_mtx_lock(&nofp->nof_lock); + nofp->nof_flags &= ~NFS_OPEN_FILE_REOPEN; + nofp->nof_flags |= NFS_OPEN_FILE_LOST; + lck_mtx_unlock(&nofp->nof_lock); + if (!force && nmp && (nmp->nm_vers >= NFS_VER4)) + nfs4_close_rpc(np, nofp, NULL, nofp->nof_owner->noo_cred, R_RECOVER); + } + + lck_mtx_unlock(&np->n_openlock); +} + +/* + * State for a node has been lost, drop it, and revoke the node. + * Attempt to return any state if possible in case the server + * might somehow think we hold it. + */ +void +nfs_revoke_open_state_for_node(nfsnode_t np) +{ + struct nfsmount *nmp; + + /* mark node as needing to be revoked */ + nfs_node_lock_force(np); + if (np->n_flag & NREVOKE) /* already revoked? 
*/ + { + NP(np, "nfs_revoke_open_state_for_node(): already revoked"); + nfs_node_unlock(np); + return; + } + np->n_flag |= NREVOKE; + nfs_node_unlock(np); + + nfs_release_open_state_for_node(np, 0); + NP(np, "nfs: state lost for %p 0x%x", np, np->n_flag); + + /* mark mount as needing a revoke scan and have the socket thread do it. */ + if ((nmp = NFSTONMP(np))) { + lck_mtx_lock(&nmp->nm_lock); + nmp->nm_state |= NFSSTA_REVOKE; + nfs_mount_sock_thread_wake(nmp); + lck_mtx_unlock(&nmp->nm_lock); + } +} + +/* + * Claim the delegated open combinations that each of this node's open files hold. + */ +int +nfs4_claim_delegated_state_for_node(nfsnode_t np, int flags) +{ + struct nfs_open_file *nofp; + int error = 0; + + lck_mtx_lock(&np->n_openlock); + + /* walk the open file list looking for opens with delegated state to claim */ +restart: + TAILQ_FOREACH(nofp, &np->n_opens, nof_link) { + if (!nofp->nof_d_rw_drw && !nofp->nof_d_w_drw && !nofp->nof_d_r_drw && + !nofp->nof_d_rw_dw && !nofp->nof_d_w_dw && !nofp->nof_d_r_dw && + !nofp->nof_d_rw && !nofp->nof_d_w && !nofp->nof_d_r) + continue; + lck_mtx_unlock(&np->n_openlock); + error = nfs4_claim_delegated_state_for_open_file(nofp, flags); + lck_mtx_lock(&np->n_openlock); if (error) - nofp = NULL; - if (!error) - error = nfs4_open(np, nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, ctx); - if (!error) - nofp->nof_flags |= NFS_OPEN_FILE_NEEDCLOSE; - if (nofp) - nfs_open_file_clear_busy(nofp); - if (nfs_mount_state_in_use_end(nmp, error)) { - nofp = NULL; - goto restart; + break; + goto restart; + } + + lck_mtx_unlock(&np->n_openlock); + + return (error); +} + +/* + * Mark a node as needed to have its delegation returned. + * Queue it up on the delegation return queue. + * Make sure the thread is running. 
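+ *
+ * Callers that can't return the delegation synchronously just enqueue
+ * the node; the mount's socket thread notices N_DELEG_RETURN and does
+ * the actual return via nfs4_delegation_return().  Sketch:
+ *
+ *	nfs4_delegation_return_enqueue(np);	// marks node, wakes socket thread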
+ */ +void +nfs4_delegation_return_enqueue(nfsnode_t np) +{ + struct nfsmount *nmp; + + nmp = NFSTONMP(np); + if (!nmp) + return; + + lck_mtx_lock(&np->n_openlock); + np->n_openflags |= N_DELEG_RETURN; + lck_mtx_unlock(&np->n_openlock); + + lck_mtx_lock(&nmp->nm_lock); + if (np->n_dreturn.tqe_next == NFSNOLIST) + TAILQ_INSERT_TAIL(&nmp->nm_dreturnq, np, n_dreturn); + nfs_mount_sock_thread_wake(nmp); + lck_mtx_unlock(&nmp->nm_lock); +} + +/* + * return any delegation we may have for the given node + */ +int +nfs4_delegation_return(nfsnode_t np, int flags, thread_t thd, kauth_cred_t cred) +{ + struct nfsmount *nmp; + fhandle_t fh; + nfs_stateid dstateid; + int error; + + nmp = NFSTONMP(np); + if (!nmp) + return (ENXIO); + + /* first, make sure the node's marked for delegation return */ + lck_mtx_lock(&np->n_openlock); + np->n_openflags |= (N_DELEG_RETURN|N_DELEG_RETURNING); + lck_mtx_unlock(&np->n_openlock); + + /* make sure nobody else is using the delegation state */ + if ((error = nfs_open_state_set_busy(np, NULL))) + goto out; + + /* claim any delegated state */ + if ((error = nfs4_claim_delegated_state_for_node(np, flags))) + goto out; + + /* return the delegation */ + lck_mtx_lock(&np->n_openlock); + dstateid = np->n_dstateid; + fh.fh_len = np->n_fhsize; + bcopy(np->n_fhp, &fh.fh_data, fh.fh_len); + lck_mtx_unlock(&np->n_openlock); + error = nfs4_delegreturn_rpc(NFSTONMP(np), fh.fh_data, fh.fh_len, &dstateid, flags, thd, cred); + /* assume delegation is gone for all errors except ETIMEDOUT, NFSERR_*MOVED */ + if ((error != ETIMEDOUT) && (error != NFSERR_MOVED) && (error != NFSERR_LEASE_MOVED)) { + lck_mtx_lock(&np->n_openlock); + np->n_openflags &= ~N_DELEG_MASK; + lck_mtx_lock(&nmp->nm_lock); + if (np->n_dlink.tqe_next != NFSNOLIST) { + TAILQ_REMOVE(&nmp->nm_delegations, np, n_dlink); + np->n_dlink.tqe_next = NFSNOLIST; + } + lck_mtx_unlock(&nmp->nm_lock); + lck_mtx_unlock(&np->n_openlock); + } + +out: + /* make sure it's no longer on the return queue and clear the return flags */ + lck_mtx_lock(&nmp->nm_lock); + if (np->n_dreturn.tqe_next != NFSNOLIST) { + TAILQ_REMOVE(&nmp->nm_dreturnq, np, n_dreturn); + np->n_dreturn.tqe_next = NFSNOLIST; + } + lck_mtx_unlock(&nmp->nm_lock); + lck_mtx_lock(&np->n_openlock); + np->n_openflags &= ~(N_DELEG_RETURN|N_DELEG_RETURNING); + lck_mtx_unlock(&np->n_openlock); + + if (error) { + NP(np, "nfs4_delegation_return, error %d", error); + if (error == ETIMEDOUT) + nfs_need_reconnect(nmp); + if (nfs_mount_state_error_should_restart(error)) { + /* make sure recovery happens */ + lck_mtx_lock(&nmp->nm_lock); + nfs_need_recover(nmp, nfs_mount_state_error_delegation_lost(error) ? NFSERR_EXPIRED : 0); + lck_mtx_unlock(&nmp->nm_lock); } } - nfs_open_owner_rele(noop); - if (error) - return (error); - return (nfs_bioread(VTONFS(ap->a_vp), ap->a_uio, ap->a_ioflag, ap->a_context)); + + nfs_open_state_clear_busy(np); + + return (error); } /* - * Note: the NFSv4 CREATE RPC is for everything EXCEPT regular files. - * Files are created using the NFSv4 OPEN RPC. So we must open the - * file to create it and then close it. 
+ * RPC to return a delegation for a file handle + */ +int +nfs4_delegreturn_rpc(struct nfsmount *nmp, u_char *fhp, int fhlen, struct nfs_stateid *sid, int flags, thread_t thd, kauth_cred_t cred) +{ + int error = 0, status, numops; + uint64_t xid; + struct nfsm_chain nmreq, nmrep; + struct nfsreq_secinfo_args si; + + NFSREQ_SECINFO_SET(&si, NULL, fhp, fhlen, NULL, 0); + nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); + + // PUTFH, DELEGRETURN + numops = 2; + nfsm_chain_build_alloc_init(error, &nmreq, 16 * NFSX_UNSIGNED); + nfsm_chain_add_compound_header(error, &nmreq, "delegreturn", numops); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nmp->nm_vers, fhp, fhlen); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_DELEGRETURN); + nfsm_chain_add_stateid(error, &nmreq, sid); + nfsm_chain_build_done(error, &nmreq); + nfsm_assert(error, (numops == 0), EPROTO); + nfsmout_if(error); + error = nfs_request2(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, thd, cred, &si, flags, &nmrep, &xid, &status); + nfsm_chain_skip_tag(error, &nmrep); + nfsm_chain_get_32(error, &nmrep, numops); + nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_DELEGRETURN); +nfsmout: + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); + return (error); +} + + +/* + * NFS read call. + * Just call nfs_bioread() to do the work. + * + * Note: the exec code paths have a tendency to call VNOP_READ (and VNOP_MMAP) + * without first calling VNOP_OPEN, so we make sure the file is open here. + */ +int +nfs_vnop_read( + struct vnop_read_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + struct uio *a_uio; + int a_ioflag; + vfs_context_t a_context; + } */ *ap) +{ + vnode_t vp = ap->a_vp; + vfs_context_t ctx = ap->a_context; + nfsnode_t np; + struct nfsmount *nmp; + struct nfs_open_owner *noop; + struct nfs_open_file *nofp; + int error; + + if (vnode_vtype(ap->a_vp) != VREG) + return (EPERM); + + np = VTONFS(vp); + nmp = NFSTONMP(np); + if (!nmp) + return (ENXIO); + if (np->n_flag & NREVOKE) + return (EIO); + + noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 1); + if (!noop) + return (ENOMEM); +restart: + error = nfs_open_file_find(np, noop, &nofp, 0, 0, 1); + if (!error && (nofp->nof_flags & NFS_OPEN_FILE_LOST)) { + NP(np, "nfs_vnop_read: LOST %d", kauth_cred_getuid(noop->noo_cred)); + error = EIO; + } + if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { + error = nfs4_reopen(nofp, vfs_context_thread(ctx)); + nofp = NULL; + if (!error) + goto restart; + } + if (error) { + nfs_open_owner_rele(noop); + return (error); + } + if (!nofp->nof_access) { + /* we don't have the file open, so open it for read access */ + error = nfs_mount_state_in_use_start(nmp, vfs_context_thread(ctx)); + if (error) { + nfs_open_owner_rele(noop); + return (error); + } + if (np->n_flag & NREVOKE) { + error = EIO; + nfs_mount_state_in_use_end(nmp, 0); + nfs_open_owner_rele(noop); + return (error); + } + error = nfs_open_file_set_busy(nofp, vfs_context_thread(ctx)); + if (error) + nofp = NULL; + if (!error) { + if (nmp->nm_vers < NFS_VER4) { + /* NFS v2/v3 opens are always allowed - so just add it. 
*/ + nfs_open_file_add_open(nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, 0); + } else { + error = nfs4_open(np, nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, ctx); + } + } + if (!error) + nofp->nof_flags |= NFS_OPEN_FILE_NEEDCLOSE; + if (nofp) + nfs_open_file_clear_busy(nofp); + if (nfs_mount_state_in_use_end(nmp, error)) { + nofp = NULL; + goto restart; + } + } + nfs_open_owner_rele(noop); + if (error) + return (error); + return (nfs_bioread(VTONFS(ap->a_vp), ap->a_uio, ap->a_ioflag, ap->a_context)); +} + +/* + * Note: the NFSv4 CREATE RPC is for everything EXCEPT regular files. + * Files are created using the NFSv4 OPEN RPC. So we must open the + * file to create it and then close it. + */ +int +nfs4_vnop_create( + struct vnop_create_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t *a_vpp; + struct componentname *a_cnp; + struct vnode_attr *a_vap; + vfs_context_t a_context; + } */ *ap) +{ + vfs_context_t ctx = ap->a_context; + struct componentname *cnp = ap->a_cnp; + struct vnode_attr *vap = ap->a_vap; + vnode_t dvp = ap->a_dvp; + vnode_t *vpp = ap->a_vpp; + struct nfsmount *nmp; + nfsnode_t np; + int error = 0, busyerror = 0, accessMode, denyMode; + struct nfs_open_owner *noop = NULL; + struct nfs_open_file *newnofp = NULL, *nofp = NULL; + + nmp = VTONMP(dvp); + if (!nmp) + return (ENXIO); + + if (vap) + nfs_avoid_needless_id_setting_on_create(VTONFS(dvp), vap, ctx); + + noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 1); + if (!noop) + return (ENOMEM); + +restart: + error = nfs_mount_state_in_use_start(nmp, vfs_context_thread(ctx)); + if (error) { + nfs_open_owner_rele(noop); + return (error); + } + + /* grab a provisional, nodeless open file */ + error = nfs_open_file_find(NULL, noop, &newnofp, 0, 0, 1); + if (!error && (newnofp->nof_flags & NFS_OPEN_FILE_LOST)) { + printf("nfs_vnop_create: LOST\n"); + error = EIO; + } + if (!error && (newnofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { + /* This shouldn't happen given that this is a new, nodeless nofp */ + nfs_mount_state_in_use_end(nmp, 0); + error = nfs4_reopen(newnofp, vfs_context_thread(ctx)); + nfs_open_file_destroy(newnofp); + newnofp = NULL; + if (!error) + goto restart; + } + if (!error) + error = nfs_open_file_set_busy(newnofp, vfs_context_thread(ctx)); + if (error) { + if (newnofp) + nfs_open_file_destroy(newnofp); + newnofp = NULL; + goto out; + } + + /* + * We're just trying to create the file. + * We'll create/open it RW, and set NFS_OPEN_FILE_CREATE. + */ + accessMode = NFS_OPEN_SHARE_ACCESS_BOTH; + denyMode = NFS_OPEN_SHARE_DENY_NONE; + + /* Do the open/create */ + error = nfs4_open_rpc(newnofp, ctx, cnp, vap, dvp, vpp, NFS_OPEN_CREATE, accessMode, denyMode); + if ((error == EACCES) && vap && !(vap->va_vaflags & VA_EXCLUSIVE) && + VATTR_IS_ACTIVE(vap, va_mode) && !(vap->va_mode & S_IWUSR)) { + /* + * Hmm... it looks like we may have a situation where the request was + * retransmitted because we didn't get the first response which successfully + * created/opened the file and then the second time we were denied the open + * because the mode the file was created with doesn't allow write access. + * + * We'll try to work around this by temporarily updating the mode and + * retrying the open. 
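+ *
+ * The retry below is roughly:
+ *
+ *	VATTR_SET(&vattr, va_mode, vap->va_mode | S_IWUSR);	// loosen the mode
+ *	nfs4_setattr_rpc(np, &vattr, ctx);
+ *	nfs4_open_rpc(..., NFS_OPEN_NOCREATE, ...);		// retry the open
+ *	VATTR_SET(&vattr, va_mode, vap->va_mode);		// put the mode back
+ *	nfs4_setattr_rpc(np, &vattr, ctx);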
+ */
+ struct vnode_attr vattr;
+
+ /* first make sure it's there */
+ int error2 = nfs_lookitup(VTONFS(dvp), cnp->cn_nameptr, cnp->cn_namelen, ctx, &np);
+ if (!error2 && np) {
+ nfs_node_unlock(np);
+ *vpp = NFSTOV(np);
+ if (vnode_vtype(NFSTOV(np)) == VREG) {
+ VATTR_INIT(&vattr);
+ VATTR_SET(&vattr, va_mode, (vap->va_mode | S_IWUSR));
+ if (!nfs4_setattr_rpc(np, &vattr, ctx)) {
+ error2 = nfs4_open_rpc(newnofp, ctx, cnp, NULL, dvp, vpp, NFS_OPEN_NOCREATE, accessMode, denyMode);
+ VATTR_INIT(&vattr);
+ VATTR_SET(&vattr, va_mode, vap->va_mode);
+ nfs4_setattr_rpc(np, &vattr, ctx);
+ if (!error2)
+ error = 0;
+ }
+ }
+ if (error) {
+ vnode_put(*vpp);
+ *vpp = NULL;
+ }
+ }
+ }
+ if (!error && !*vpp) {
+ printf("nfs4_open_rpc returned without a node?\n");
+ /* Hmmm... with no node, we have no filehandle and can't close it */
+ error = EIO;
+ }
+ if (error) {
+ /* need to clean up our temporary nofp */
+ nfs_open_file_clear_busy(newnofp);
+ nfs_open_file_destroy(newnofp);
+ newnofp = NULL;
+ goto out;
+ }
+ /* After we have a node, add our open file struct to the node */
+ np = VTONFS(*vpp);
+ nfs_open_file_add_open(newnofp, accessMode, denyMode, 0);
+ nofp = newnofp;
+ error = nfs_open_file_find_internal(np, noop, &nofp, 0, 0, 0);
+ if (error) {
+ /* This shouldn't happen, because we passed in a new nofp to use. */
+ printf("nfs_open_file_find_internal failed! %d\n", error);
+ goto out;
+ } else if (nofp != newnofp) {
+ /*
+ * Hmm... an open file struct already exists.
+ * Mark the existing one busy and merge our open into it.
+ * Then destroy the one we created.
+ * Note: there's no chance of an open conflict because the
+ * open has already been granted.
+ */
+ busyerror = nfs_open_file_set_busy(nofp, NULL);
+ nfs_open_file_add_open(nofp, accessMode, denyMode, 0);
+ nofp->nof_stateid = newnofp->nof_stateid;
+ if (newnofp->nof_flags & NFS_OPEN_FILE_POSIXLOCK)
+ nofp->nof_flags |= NFS_OPEN_FILE_POSIXLOCK;
+ nfs_open_file_clear_busy(newnofp);
+ nfs_open_file_destroy(newnofp);
+ }
+ newnofp = NULL;
+ /* mark the node as holding a create-initiated open */
+ nofp->nof_flags |= NFS_OPEN_FILE_CREATE;
+ nofp->nof_creator = current_thread();
+out:
+ if (nofp && !busyerror)
+ nfs_open_file_clear_busy(nofp);
+ if (nfs_mount_state_in_use_end(nmp, error)) {
+ nofp = newnofp = NULL;
+ busyerror = 0;
+ goto restart;
+ }
+ if (noop)
+ nfs_open_owner_rele(noop);
+ return (error);
+}
+
+/*
+ * Note: the NFSv4 CREATE RPC is for everything EXCEPT regular files. 
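+ * This covers symlinks (NFLNK), device nodes (NFBLK/NFCHR), sockets and
+ * fifos (NFSOCK/NFFIFO), and directories (NFDIR); regular files are
+ * created via nfs4_open_rpc(..., NFS_OPEN_CREATE, ...) instead.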
+ */ +int +nfs4_create_rpc( + vfs_context_t ctx, + nfsnode_t dnp, + struct componentname *cnp, + struct vnode_attr *vap, + int type, + char *link, + nfsnode_t *npp) +{ + struct nfsmount *nmp; + struct nfs_vattr nvattr; + int error = 0, create_error = EIO, lockerror = ENOENT, busyerror = ENOENT, status; + int nfsvers, namedattrs, numops; + u_int64_t xid, savedxid = 0; + nfsnode_t np = NULL; + vnode_t newvp = NULL; + struct nfsm_chain nmreq, nmrep; + uint32_t bitmap[NFS_ATTR_BITMAP_LEN], bmlen; + const char *tag; + nfs_specdata sd; + fhandle_t fh; + struct nfsreq rq, *req = &rq; + struct nfs_dulookup dul; + struct nfsreq_secinfo_args si; + + nmp = NFSTONMP(dnp); + if (!nmp) + return (ENXIO); + nfsvers = nmp->nm_vers; + namedattrs = (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR); + if (dnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); + + sd.specdata1 = sd.specdata2 = 0; + + switch (type) { + case NFLNK: + tag = "symlink"; + break; + case NFBLK: + case NFCHR: + tag = "mknod"; + if (!VATTR_IS_ACTIVE(vap, va_rdev)) + return (EINVAL); + sd.specdata1 = major(vap->va_rdev); + sd.specdata2 = minor(vap->va_rdev); + break; + case NFSOCK: + case NFFIFO: + tag = "mknod"; + break; + case NFDIR: + tag = "mkdir"; + break; + default: + return (EINVAL); + } + + nfs_avoid_needless_id_setting_on_create(dnp, vap, ctx); + + error = busyerror = nfs_node_set_busy(dnp, vfs_context_thread(ctx)); + if (!namedattrs) + nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); + + NFSREQ_SECINFO_SET(&si, dnp, NULL, 0, NULL, 0); + NVATTR_INIT(&nvattr); + nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); + + // PUTFH, SAVEFH, CREATE, GETATTR(FH), RESTOREFH, GETATTR + numops = 6; + nfsm_chain_build_alloc_init(error, &nmreq, 66 * NFSX_UNSIGNED); + nfsm_chain_add_compound_header(error, &nmreq, tag, numops); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_SAVEFH); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_CREATE); + nfsm_chain_add_32(error, &nmreq, type); + if (type == NFLNK) { + nfsm_chain_add_name(error, &nmreq, link, strlen(link), nmp); + } else if ((type == NFBLK) || (type == NFCHR)) { + nfsm_chain_add_32(error, &nmreq, sd.specdata1); + nfsm_chain_add_32(error, &nmreq, sd.specdata2); + } + nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); + nfsm_chain_add_fattr4(error, &nmreq, vap, nmp); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, bitmap); + NFS_BITMAP_SET(bitmap, NFS_FATTR_FILEHANDLE); + nfsm_chain_add_bitmap_supported(error, &nmreq, bitmap, nmp, NULL); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_RESTOREFH); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, dnp); + nfsm_chain_build_done(error, &nmreq); + nfsm_assert(error, (numops == 0), EPROTO); + nfsmout_if(error); + + error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC4_COMPOUND, + vfs_context_thread(ctx), vfs_context_ucred(ctx), &si, 0, NULL, &req); + if (!error) { + if (!namedattrs) + nfs_dulookup_start(&dul, dnp, ctx); + error = nfs_request_async_finish(req, &nmrep, &xid, &status); + } + + if ((lockerror = nfs_node_lock(dnp))) + error = lockerror; + nfsm_chain_skip_tag(error, &nmrep); + nfsm_chain_get_32(error, &nmrep, numops); + nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); + 
nfsm_chain_op_check(error, &nmrep, NFS_OP_SAVEFH); + nfsmout_if(error); + nfsm_chain_op_check(error, &nmrep, NFS_OP_CREATE); + nfsm_chain_check_change_info(error, &nmrep, dnp); + bmlen = NFS_ATTR_BITMAP_LEN; + nfsm_chain_get_bitmap(error, &nmrep, bitmap, bmlen); + /* At this point if we have no error, the object was created. */ + /* if we don't get attributes, then we should lookitup. */ + create_error = error; + nfsmout_if(error); + nfs_vattr_set_supported(bitmap, vap); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + nfsmout_if(error); + error = nfs4_parsefattr(&nmrep, NULL, &nvattr, &fh, NULL, NULL); + nfsmout_if(error); + if (!NFS_BITMAP_ISSET(nvattr.nva_bitmap, NFS_FATTR_FILEHANDLE)) { + printf("nfs: create/%s didn't return filehandle? %s\n", tag, cnp->cn_nameptr); + error = EBADRPC; + goto nfsmout; + } + /* directory attributes: if we don't get them, make sure to invalidate */ + nfsm_chain_op_check(error, &nmrep, NFS_OP_RESTOREFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + savedxid = xid; + nfsm_chain_loadattr(error, &nmrep, dnp, nfsvers, &xid); + if (error) + NATTRINVALIDATE(dnp); + +nfsmout: + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); + + if (!lockerror) { + if (!create_error && (dnp->n_flag & NNEGNCENTRIES)) { + dnp->n_flag &= ~NNEGNCENTRIES; + cache_purge_negatives(NFSTOV(dnp)); + } + dnp->n_flag |= NMODIFIED; + nfs_node_unlock(dnp); + /* nfs_getattr() will check changed and purge caches */ + nfs_getattr(dnp, NULL, ctx, NGA_CACHED); + } + + if (!error && fh.fh_len) { + /* create the vnode with the filehandle and attributes */ + xid = savedxid; + error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, NG_MAKEENTRY, &np); + if (!error) + newvp = NFSTOV(np); + } + NVATTR_CLEANUP(&nvattr); + + if (!namedattrs) + nfs_dulookup_finish(&dul, dnp, ctx); + + /* + * Kludge: Map EEXIST => 0 assuming that you have a reply to a retry + * if we can succeed in looking up the object. 
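+ * (CREATE is not idempotent: if the first reply is lost, the
+ * retransmitted CREATE finds the object we just created and fails
+ * with EEXIST.  So when the follow-up lookup finds an object of the
+ * expected type, the EEXIST is treated as success.)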
+ */ + if ((create_error == EEXIST) || (!create_error && !newvp)) { + error = nfs_lookitup(dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx, &np); + if (!error) { + newvp = NFSTOV(np); + if (vnode_vtype(newvp) != nfstov_type(type, nfsvers)) + error = EEXIST; + } + } + if (!busyerror) + nfs_node_clear_busy(dnp); + if (error) { + if (newvp) { + nfs_node_unlock(np); + vnode_put(newvp); + } + } else { + nfs_node_unlock(np); + *npp = np; + } + return (error); +} + +int +nfs4_vnop_mknod( + struct vnop_mknod_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t *a_vpp; + struct componentname *a_cnp; + struct vnode_attr *a_vap; + vfs_context_t a_context; + } */ *ap) +{ + nfsnode_t np = NULL; + struct nfsmount *nmp; + int error; + + nmp = VTONMP(ap->a_dvp); + if (!nmp) + return (ENXIO); + + if (!VATTR_IS_ACTIVE(ap->a_vap, va_type)) + return (EINVAL); + switch (ap->a_vap->va_type) { + case VBLK: + case VCHR: + case VFIFO: + case VSOCK: + break; + default: + return (ENOTSUP); + } + + error = nfs4_create_rpc(ap->a_context, VTONFS(ap->a_dvp), ap->a_cnp, ap->a_vap, + vtonfs_type(ap->a_vap->va_type, nmp->nm_vers), NULL, &np); + if (!error) + *ap->a_vpp = NFSTOV(np); + return (error); +} + +int +nfs4_vnop_mkdir( + struct vnop_mkdir_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t *a_vpp; + struct componentname *a_cnp; + struct vnode_attr *a_vap; + vfs_context_t a_context; + } */ *ap) +{ + nfsnode_t np = NULL; + int error; + + error = nfs4_create_rpc(ap->a_context, VTONFS(ap->a_dvp), ap->a_cnp, ap->a_vap, + NFDIR, NULL, &np); + if (!error) + *ap->a_vpp = NFSTOV(np); + return (error); +} + +int +nfs4_vnop_symlink( + struct vnop_symlink_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t *a_vpp; + struct componentname *a_cnp; + struct vnode_attr *a_vap; + char *a_target; + vfs_context_t a_context; + } */ *ap) +{ + nfsnode_t np = NULL; + int error; + + error = nfs4_create_rpc(ap->a_context, VTONFS(ap->a_dvp), ap->a_cnp, ap->a_vap, + NFLNK, ap->a_target, &np); + if (!error) + *ap->a_vpp = NFSTOV(np); + return (error); +} + +int +nfs4_vnop_link( + struct vnop_link_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + vnode_t a_tdvp; + struct componentname *a_cnp; + vfs_context_t a_context; + } */ *ap) +{ + vfs_context_t ctx = ap->a_context; + vnode_t vp = ap->a_vp; + vnode_t tdvp = ap->a_tdvp; + struct componentname *cnp = ap->a_cnp; + int error = 0, lockerror = ENOENT, status; + struct nfsmount *nmp; + nfsnode_t np = VTONFS(vp); + nfsnode_t tdnp = VTONFS(tdvp); + int nfsvers, numops; + u_int64_t xid, savedxid; + struct nfsm_chain nmreq, nmrep; + struct nfsreq_secinfo_args si; + + if (vnode_mount(vp) != vnode_mount(tdvp)) + return (EXDEV); + + nmp = VTONMP(vp); + if (!nmp) + return (ENXIO); + nfsvers = nmp->nm_vers; + if (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); + if (tdnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (EINVAL); + + /* + * Push all writes to the server, so that the attribute cache + * doesn't get "out of sync" with the server. + * XXX There should be a better way! 
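+ *
+ * (The LINK compound below ends with GETATTRs that get loaded into the
+ * attribute cache; flushing first means those attributes already
+ * reflect any delayed writes instead of going stale as soon as the
+ * write-back reaches the server.)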
+ */ + nfs_flush(np, MNT_WAIT, vfs_context_thread(ctx), V_IGNORE_WRITEERR); + + if ((error = nfs_node_set_busy2(tdnp, np, vfs_context_thread(ctx)))) + return (error); + + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); + nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); + + // PUTFH(SOURCE), SAVEFH, PUTFH(DIR), LINK, GETATTR(DIR), RESTOREFH, GETATTR + numops = 7; + nfsm_chain_build_alloc_init(error, &nmreq, 29 * NFSX_UNSIGNED + cnp->cn_namelen); + nfsm_chain_add_compound_header(error, &nmreq, "link", numops); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nfsvers, np->n_fhp, np->n_fhsize); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_SAVEFH); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nfsvers, tdnp->n_fhp, tdnp->n_fhsize); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_LINK); + nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, tdnp); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_RESTOREFH); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + nfsm_chain_add_bitmap_supported(error, &nmreq, nfs_getattr_bitmap, nmp, np); + nfsm_chain_build_done(error, &nmreq); + nfsm_assert(error, (numops == 0), EPROTO); + nfsmout_if(error); + error = nfs_request(tdnp, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &si, &nmrep, &xid, &status); + + if ((lockerror = nfs_node_lock2(tdnp, np))) { + error = lockerror; + goto nfsmout; + } + nfsm_chain_skip_tag(error, &nmrep); + nfsm_chain_get_32(error, &nmrep, numops); + nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_SAVEFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_LINK); + nfsm_chain_check_change_info(error, &nmrep, tdnp); + /* directory attributes: if we don't get them, make sure to invalidate */ + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + savedxid = xid; + nfsm_chain_loadattr(error, &nmrep, tdnp, nfsvers, &xid); + if (error) + NATTRINVALIDATE(tdnp); + /* link attributes: if we don't get them, make sure to invalidate */ + nfsm_chain_op_check(error, &nmrep, NFS_OP_RESTOREFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + xid = savedxid; + nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid); + if (error) + NATTRINVALIDATE(np); +nfsmout: + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); + if (!lockerror) + tdnp->n_flag |= NMODIFIED; + /* Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. 
*/ + if (error == EEXIST) + error = 0; + if (!error && (tdnp->n_flag & NNEGNCENTRIES)) { + tdnp->n_flag &= ~NNEGNCENTRIES; + cache_purge_negatives(tdvp); + } + if (!lockerror) + nfs_node_unlock2(tdnp, np); + nfs_node_clear_busy2(tdnp, np); + return (error); +} + +int +nfs4_vnop_rmdir( + struct vnop_rmdir_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t a_vp; + struct componentname *a_cnp; + vfs_context_t a_context; + } */ *ap) +{ + vfs_context_t ctx = ap->a_context; + vnode_t vp = ap->a_vp; + vnode_t dvp = ap->a_dvp; + struct componentname *cnp = ap->a_cnp; + struct nfsmount *nmp; + int error = 0, namedattrs; + nfsnode_t np = VTONFS(vp); + nfsnode_t dnp = VTONFS(dvp); + struct nfs_dulookup dul; + + if (vnode_vtype(vp) != VDIR) + return (EINVAL); + + nmp = NFSTONMP(dnp); + if (!nmp) + return (ENXIO); + namedattrs = (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR); + + if ((error = nfs_node_set_busy2(dnp, np, vfs_context_thread(ctx)))) + return (error); + + if (!namedattrs) { + nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); + nfs_dulookup_start(&dul, dnp, ctx); + } + + error = nfs4_remove_rpc(dnp, cnp->cn_nameptr, cnp->cn_namelen, + vfs_context_thread(ctx), vfs_context_ucred(ctx)); + + nfs_name_cache_purge(dnp, np, cnp, ctx); + /* nfs_getattr() will check changed and purge caches */ + nfs_getattr(dnp, NULL, ctx, NGA_CACHED); + if (!namedattrs) + nfs_dulookup_finish(&dul, dnp, ctx); + nfs_node_clear_busy2(dnp, np); + + /* + * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry. + */ + if (error == ENOENT) + error = 0; + if (!error) { + /* + * remove nfsnode from hash now so we can't accidentally find it + * again if another object gets created with the same filehandle + * before this vnode gets reclaimed + */ + lck_mtx_lock(nfs_node_hash_mutex); + if (np->n_hflag & NHHASHED) { + LIST_REMOVE(np, n_hash); + np->n_hflag &= ~NHHASHED; + FSDBG(266, 0, np, np->n_flag, 0xb1eb1e); + } + lck_mtx_unlock(nfs_node_hash_mutex); + } + return (error); +} + +/* + * NFSv4 Named Attributes + * + * Both the extended attributes interface and the named streams interface + * are backed by NFSv4 named attributes. The implementations for both use + * a common set of routines in an attempt to reduce code duplication, to + * increase efficiency, to increase caching of both names and data, and to + * confine the complexity. + * + * Each NFS node caches its named attribute directory's file handle. + * The directory nodes for the named attribute directories are handled + * exactly like regular directories (with a couple minor exceptions). + * Named attribute nodes are also treated as much like regular files as + * possible. + * + * Most of the heavy lifting is done by nfs4_named_attr_get(). + */ + +/* + * Get the given node's attribute directory node. + * If !fetch, then only return a cached node. + * Otherwise, we will attempt to fetch the node from the server. + * (Note: the node should be marked busy.) 
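+ *
+ * Typical usage (pseudocode; "need_it" is illustrative):
+ *
+ *	adnp = nfs4_named_attr_dir_get(np, 0, ctx);	// cached only
+ *	if (!adnp && need_it)
+ *		adnp = nfs4_named_attr_dir_get(np, 1, ctx);	// OPENATTR fetch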
*/ -int -nfs4_vnop_create( - struct vnop_create_args /* { - struct vnodeop_desc *a_desc; - vnode_t a_dvp; - vnode_t *a_vpp; - struct componentname *a_cnp; - struct vnode_attr *a_vap; - vfs_context_t a_context; - } */ *ap) +nfsnode_t +nfs4_named_attr_dir_get(nfsnode_t np, int fetch, vfs_context_t ctx) { - vfs_context_t ctx = ap->a_context; - struct componentname *cnp = ap->a_cnp; - struct vnode_attr *vap = ap->a_vap; - vnode_t dvp = ap->a_dvp; - vnode_t *vpp = ap->a_vpp; + nfsnode_t adnp = NULL; struct nfsmount *nmp; - nfsnode_t np; - int error = 0; - struct nfs_open_owner *noop = NULL; - struct nfs_open_file *nofp = NULL; + int error = 0, status, numops; + struct nfsm_chain nmreq, nmrep; + u_int64_t xid; + uint32_t bitmap[NFS_ATTR_BITMAP_LEN]; + fhandle_t fh; + struct nfs_vattr nvattr; + struct componentname cn; + struct nfsreq rq, *req = &rq; + struct nfsreq_secinfo_args si; - nmp = VTONMP(dvp); + nmp = NFSTONMP(np); if (!nmp) - return (ENXIO); - - nfs_avoid_needless_id_setting_on_create(VTONFS(dvp), vap, ctx); + return (NULL); + if (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) + return (NULL); - noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 1); - if (!noop) - return (ENOMEM); + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); + NVATTR_INIT(&nvattr); + nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); -restart: - error = nfs_mount_state_in_use_start(nmp); - if (error) { - nfs_open_owner_rele(noop); - return (error); + bzero(&cn, sizeof(cn)); + cn.cn_nameptr = __CAST_AWAY_QUALIFIER(_PATH_FORKSPECIFIER, const, char *); /* "/..namedfork/" */ + cn.cn_namelen = strlen(_PATH_FORKSPECIFIER); + cn.cn_nameiop = LOOKUP; + + if (np->n_attrdirfh) { + // XXX can't set parent correctly (to np) yet + error = nfs_nget(nmp->nm_mountp, NULL, &cn, np->n_attrdirfh+1, *np->n_attrdirfh, + NULL, NULL, RPCAUTH_UNKNOWN, NG_NOCREATE, &adnp); + if (adnp) + goto nfsmout; + } + if (!fetch) { + error = ENOENT; + goto nfsmout; } - error = nfs_open_file_find(NULL, noop, &nofp, 0, 0, 1); - if (!error && (nofp->nof_flags & NFS_OPEN_FILE_LOST)) { - printf("nfs_vnop_create: LOST\n"); - error = EIO; - } - if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { - nfs_mount_state_in_use_end(nmp, 0); - nfs4_reopen(nofp, vfs_context_thread(ctx)); - nofp = NULL; - goto restart; - } + // PUTFH, OPENATTR, GETATTR + numops = 3; + nfsm_chain_build_alloc_init(error, &nmreq, 22 * NFSX_UNSIGNED); + nfsm_chain_add_compound_header(error, &nmreq, "openattr", numops); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nmp->nm_vers, np->n_fhp, np->n_fhsize); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_OPENATTR); + nfsm_chain_add_32(error, &nmreq, 0); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, bitmap); + NFS_BITMAP_SET(bitmap, NFS_FATTR_FILEHANDLE); + nfsm_chain_add_bitmap_masked(error, &nmreq, bitmap, + NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_build_done(error, &nmreq); + nfsm_assert(error, (numops == 0), EPROTO); + nfsmout_if(error); + error = nfs_request_async(np, NULL, &nmreq, NFSPROC4_COMPOUND, + vfs_context_thread(ctx), vfs_context_ucred(ctx), &si, 0, NULL, &req); if (!error) - error = nfs_open_file_set_busy(nofp, vfs_context_thread(ctx)); - if (error) { - nofp = NULL; - goto out; - } - - nofp->nof_opencnt++; - nofp->nof_access = NFS_OPEN_SHARE_ACCESS_BOTH; - nofp->nof_deny = NFS_OPEN_SHARE_DENY_NONE; - nofp->nof_rw++; + error = nfs_request_async_finish(req, &nmrep, &xid, 
&status); - error = nfs4_open_rpc(nofp, ctx, cnp, vap, dvp, vpp, NFS_OPEN_CREATE, - NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_NONE); - if (!error && !*vpp) { - printf("nfs4_open_rpc returned without a node?\n"); - /* Hmmm... with no node, we have no filehandle and can't close it */ - error = EIO; + nfsm_chain_skip_tag(error, &nmrep); + nfsm_chain_get_32(error, &nmrep, numops); + nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_OPENATTR); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + nfsmout_if(error); + error = nfs4_parsefattr(&nmrep, NULL, &nvattr, &fh, NULL, NULL); + nfsmout_if(error); + if (!NFS_BITMAP_ISSET(nvattr.nva_bitmap, NFS_FATTR_FILEHANDLE) || !fh.fh_len) { + error = ENOENT; + goto nfsmout; } - if (error) { - nofp->nof_rw--; - nofp->nof_access = 0; - nofp->nof_deny = 0; - nofp->nof_opencnt--; - } - if (*vpp) { - nofp->nof_np = np = VTONFS(*vpp); - /* insert nofp onto np's open list */ - TAILQ_INSERT_HEAD(&np->n_opens, nofp, nof_link); - if (!error) { - nofp->nof_flags |= NFS_OPEN_FILE_CREATE; - nofp->nof_creator = current_thread(); - } + if (!np->n_attrdirfh || (*np->n_attrdirfh != fh.fh_len)) { + /* (re)allocate attrdir fh buffer */ + if (np->n_attrdirfh) + FREE(np->n_attrdirfh, M_TEMP); + MALLOC(np->n_attrdirfh, u_char*, fh.fh_len+1, M_TEMP, M_WAITOK); } -out: - if (nofp) - nfs_open_file_clear_busy(nofp); - if (nfs_mount_state_in_use_end(nmp, error)) { - nofp = NULL; - goto restart; + if (!np->n_attrdirfh) { + error = ENOMEM; + goto nfsmout; } - if (noop) - nfs_open_owner_rele(noop); - return (error); -} + /* cache the attrdir fh in the node */ + *np->n_attrdirfh = fh.fh_len; + bcopy(fh.fh_data, np->n_attrdirfh+1, fh.fh_len); + /* create node for attrdir */ + // XXX can't set parent correctly (to np) yet + error = nfs_nget(NFSTOMP(np), NULL, &cn, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, 0, &adnp); +nfsmout: + NVATTR_CLEANUP(&nvattr); + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); -void -nfs_avoid_needless_id_setting_on_create(nfsnode_t dnp, struct vnode_attr *vap, vfs_context_t ctx) -{ - /* - * Don't bother setting UID if it's the same as the credential performing the create. - * Don't bother setting GID if it's the same as the directory or credential. - */ - if (VATTR_IS_ACTIVE(vap, va_uid)) { - if (kauth_cred_getuid(vfs_context_ucred(ctx)) == vap->va_uid) - VATTR_CLEAR_ACTIVE(vap, va_uid); - } - if (VATTR_IS_ACTIVE(vap, va_gid)) { - if ((vap->va_gid == dnp->n_vattr.nva_gid) || - (kauth_cred_getgid(vfs_context_ucred(ctx)) == vap->va_gid)) - VATTR_CLEAR_ACTIVE(vap, va_gid); + if (adnp) { + /* sanity check that this node is an attribute directory */ + if (adnp->n_vattr.nva_type != VDIR) + error = EINVAL; + if (!(adnp->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR)) + error = EINVAL; + nfs_node_unlock(adnp); + if (error) + vnode_put(NFSTOV(adnp)); } + return (error ? NULL : adnp); } /* - * Note: the NFSv4 CREATE RPC is for everything EXCEPT regular files. + * Get the given node's named attribute node for the name given. 
+ * + * In an effort to increase the performance of named attribute access, we try + * to reduce server requests by doing the following: + * + * - cache the node's named attribute directory file handle in the node + * - maintain a directory vnode for the attribute directory + * - use name cache entries (positive and negative) to speed up lookups + * - optionally open the named attribute (with the given accessMode) in the same RPC + * - combine attribute directory retrieval with the lookup/open RPC + * - optionally prefetch the named attribute's first block of data in the same RPC + * + * Also, in an attempt to reduce the number of copies/variations of this code, + * parts of the RPC building/processing code are conditionalized on what is + * needed for any particular request (openattr, lookup vs. open, read). + * + * Note that because we may not have the attribute directory node when we start + * the lookup/open, we lock both the node and the attribute directory node. */ + +#define NFS_GET_NAMED_ATTR_CREATE 0x1 +#define NFS_GET_NAMED_ATTR_CREATE_GUARDED 0x2 +#define NFS_GET_NAMED_ATTR_TRUNCATE 0x4 +#define NFS_GET_NAMED_ATTR_PREFETCH 0x8 + int -nfs4_create_rpc( - vfs_context_t ctx, - nfsnode_t dnp, +nfs4_named_attr_get( + nfsnode_t np, struct componentname *cnp, - struct vnode_attr *vap, - int type, - char *link, - nfsnode_t *npp) + uint32_t accessMode, + int flags, + vfs_context_t ctx, + nfsnode_t *anpp, + struct nfs_open_file **nofpp) { struct nfsmount *nmp; - struct nfs_vattr nvattr, dnvattr; - int error = 0, create_error = EIO, lockerror = ENOENT, busyerror = ENOENT, status; - int nfsvers, numops; + int error = 0, open_error = EIO; + int inuse = 0, adlockerror = ENOENT, busyerror = ENOENT, adbusyerror = ENOENT, nofpbusyerror = ENOENT; + int create, guarded, prefetch, truncate, noopbusy = 0; + int open, status, numops, hadattrdir, negnamecache; + struct nfs_vattr nvattr; + struct vnode_attr vattr; + nfsnode_t adnp = NULL, anp = NULL; + vnode_t avp = NULL; u_int64_t xid, savedxid = 0; - nfsnode_t np = NULL; - vnode_t newvp = NULL; struct nfsm_chain nmreq, nmrep; uint32_t bitmap[NFS_ATTR_BITMAP_LEN], bmlen; - const char *tag; - nfs_specdata sd; + uint32_t denyMode, rflags, delegation, recall, eof, rlen, retlen; + nfs_stateid stateid, dstateid; fhandle_t fh; - struct nfsreq *req = NULL; - struct nfs_dulookup dul; + struct nfs_open_owner *noop = NULL; + struct nfs_open_file *newnofp = NULL, *nofp = NULL; + struct vnop_access_args naa; + thread_t thd; + kauth_cred_t cred; + struct timeval now; + char sbuf[64], *s; + uint32_t ace_type, ace_flags, ace_mask, len, slen; + struct kauth_ace ace; + struct nfsreq rq, *req = &rq; + struct nfsreq_secinfo_args si; + + *anpp = NULL; + fh.fh_len = 0; + rflags = delegation = recall = eof = rlen = retlen = 0; + ace.ace_flags = 0; + s = sbuf; + slen = sizeof(sbuf); - nmp = NFSTONMP(dnp); + nmp = NFSTONMP(np); if (!nmp) return (ENXIO); - nfsvers = nmp->nm_vers; - - sd.specdata1 = sd.specdata2 = 0; + NVATTR_INIT(&nvattr); + negnamecache = !NMFLAG(nmp, NONEGNAMECACHE); + thd = vfs_context_thread(ctx); + cred = vfs_context_ucred(ctx); + create = (flags & NFS_GET_NAMED_ATTR_CREATE) ? NFS_OPEN_CREATE : NFS_OPEN_NOCREATE; + guarded = (flags & NFS_GET_NAMED_ATTR_CREATE_GUARDED) ? 
NFS_CREATE_GUARDED : NFS_CREATE_UNCHECKED; + truncate = (flags & NFS_GET_NAMED_ATTR_TRUNCATE); + prefetch = (flags & NFS_GET_NAMED_ATTR_PREFETCH); + + if (!create) { + error = nfs_getattr(np, &nvattr, ctx, NGA_CACHED); + if (error) + return (error); + if (NFS_BITMAP_ISSET(nvattr.nva_bitmap, NFS_FATTR_NAMED_ATTR) && + !(nvattr.nva_flags & NFS_FFLAG_HAS_NAMED_ATTRS)) + return (ENOATTR); + } else if (accessMode == NFS_OPEN_SHARE_ACCESS_NONE) { + /* shouldn't happen... but just be safe */ + printf("nfs4_named_attr_get: create with no access %s\n", cnp->cn_nameptr); + accessMode = NFS_OPEN_SHARE_ACCESS_READ; + } + open = (accessMode != NFS_OPEN_SHARE_ACCESS_NONE); + if (open) { + /* + * We're trying to open the file. + * We'll create/open it with the given access mode, + * and set NFS_OPEN_FILE_CREATE. + */ + denyMode = NFS_OPEN_SHARE_DENY_NONE; + if (prefetch && guarded) + prefetch = 0; /* no sense prefetching data that can't be there */ - switch (type) { - case NFLNK: - tag = "symlink"; - break; - case NFBLK: - case NFCHR: - tag = "mknod"; - if (!VATTR_IS_ACTIVE(vap, va_rdev)) - return (EINVAL); - sd.specdata1 = major(vap->va_rdev); - sd.specdata2 = minor(vap->va_rdev); - break; - case NFSOCK: - case NFFIFO: - tag = "mknod"; - break; - case NFDIR: - tag = "mkdir"; - break; - default: - return (EINVAL); + noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 1); + if (!noop) + return (ENOMEM); } - nfs_avoid_needless_id_setting_on_create(dnp, vap, ctx); - - error = busyerror = nfs_node_set_busy(dnp, vfs_context_thread(ctx)); - nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); + if ((error = busyerror = nfs_node_set_busy(np, vfs_context_thread(ctx)))) + return (error); + adnp = nfs4_named_attr_dir_get(np, 0, ctx); + hadattrdir = (adnp != NULL); + if (prefetch) { + microuptime(&now); + /* use the special state ID because we don't have a real one to send */ + stateid.seqid = stateid.other[0] = stateid.other[1] = stateid.other[2] = 0; + rlen = MIN(nmp->nm_rsize, nmp->nm_biosize); + } + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); - // PUTFH, SAVEFH, CREATE, GETATTR(FH), RESTOREFH, GETATTR - numops = 6; - nfsm_chain_build_alloc_init(error, &nmreq, 66 * NFSX_UNSIGNED); - nfsm_chain_add_compound_header(error, &nmreq, tag, numops); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); - nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_SAVEFH); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_CREATE); - nfsm_chain_add_32(error, &nmreq, type); - if (type == NFLNK) { - nfsm_chain_add_string(error, &nmreq, link, strlen(link)); - } else if ((type == NFBLK) || (type == NFCHR)) { - nfsm_chain_add_32(error, &nmreq, sd.specdata1); - nfsm_chain_add_32(error, &nmreq, sd.specdata2); + if (hadattrdir) { + if ((error = adbusyerror = nfs_node_set_busy(adnp, vfs_context_thread(ctx)))) + goto nfsmout; + /* nfs_getattr() will check changed and purge caches */ + error = nfs_getattr(adnp, NULL, ctx, NGA_CACHED); + nfsmout_if(error); + error = cache_lookup(NFSTOV(adnp), &avp, cnp); + switch (error) { + case ENOENT: + /* negative cache entry */ + goto nfsmout; + case 0: + /* cache miss */ + /* try dir buf cache lookup */ + error = nfs_dir_buf_cache_lookup(adnp, &anp, cnp, ctx, 0); + if (!error && anp) { + /* dir buf cache hit */ + *anpp = anp; + error = -1; + } + if (error != -1) /* cache miss */ + break; + /* FALLTHROUGH */ + case -1: + /* cache hit, not really an 
error */ + OSAddAtomic(1, &nfsstats.lookupcache_hits); + if (!anp && avp) + *anpp = anp = VTONFS(avp); + + nfs_node_clear_busy(adnp); + adbusyerror = ENOENT; + + /* check for directory access */ + naa.a_desc = &vnop_access_desc; + naa.a_vp = NFSTOV(adnp); + naa.a_action = KAUTH_VNODE_SEARCH; + naa.a_context = ctx; + + /* compute actual success/failure based on accessibility */ + error = nfs_vnop_access(&naa); + /* FALLTHROUGH */ + default: + /* we either found it, or hit an error */ + if (!error && guarded) { + /* found cached entry but told not to use it */ + error = EEXIST; + vnode_put(NFSTOV(anp)); + *anpp = anp = NULL; + } + /* we're done if error or we don't need to open */ + if (error || !open) + goto nfsmout; + /* no error and we need to open... */ + } + } + + if (open) { +restart: + error = nfs_mount_state_in_use_start(nmp, vfs_context_thread(ctx)); + if (error) { + nfs_open_owner_rele(noop); + noop = NULL; + goto nfsmout; + } + inuse = 1; + + /* grab an open file - possibly provisional/nodeless if cache_lookup() failed */ + error = nfs_open_file_find(anp, noop, &newnofp, 0, 0, 1); + if (!error && (newnofp->nof_flags & NFS_OPEN_FILE_LOST)) { + printf("nfs4_named_attr_get: LOST %d %s\n", kauth_cred_getuid(noop->noo_cred), cnp->cn_nameptr); + error = EIO; + } + if (!error && (newnofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { + nfs_mount_state_in_use_end(nmp, 0); + error = nfs4_reopen(newnofp, vfs_context_thread(ctx)); + nfs_open_file_destroy(newnofp); + newnofp = NULL; + if (!error) + goto restart; + } + if (!error) + error = nfs_open_file_set_busy(newnofp, vfs_context_thread(ctx)); + if (error) { + if (newnofp) + nfs_open_file_destroy(newnofp); + newnofp = NULL; + goto nfsmout; + } + if (anp) { + /* + * We already have the node. So we just need to open + * it - which we may be able to do with a delegation. + */ + open_error = error = nfs4_open(anp, newnofp, accessMode, denyMode, ctx); + if (!error) { + /* open succeeded, so our open file is no longer temporary */ + nofp = newnofp; + nofpbusyerror = 0; + newnofp = NULL; + if (nofpp) + *nofpp = nofp; + } + goto nfsmout; + } + } + + /* + * We either don't have the attrdir or we didn't find the attribute + * in the name cache, so we need to talk to the server. + * + * If we don't have the attrdir, we'll need to ask the server for that too. + * If the caller is requesting that the attribute be created, we need to + * make sure the attrdir is created. + * The caller may also request that the first block of an existing attribute + * be retrieved at the same time. + */ + + if (open) { + /* need to mark the open owner busy during the RPC */ + if ((error = nfs_open_owner_set_busy(noop, thd))) + goto nfsmout; + noopbusy = 1; + } + + /* + * We'd like to get updated post-open/lookup attributes for the + * directory and we may also want to prefetch some data via READ. + * We'd like the READ results to be last so that we can leave the + * data in the mbufs until the end. 
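/*
 * Illustrative sketch (not part of the patch): the COMPOUND assembled below
 * varies its operation count with the request -- five ops minimum (PUTFH,
 * LOOKUP/OPEN, GETATTR, PUTFH, GETATTR), plus three (OPENATTR, GETATTR,
 * OPENATTR) when the attribute directory filehandle isn't cached yet, plus
 * four (SAVEFH, RESTOREFH, NVERIFY, READ) when prefetching the first block
 * of data.  This standalone helper just mirrors that accounting.
 */
static int
named_attr_compound_numops(int hadattrdir, int prefetch)
{
	int numops = 5;		/* PUTFH, LOOKUP/OPEN, GETATTR, PUTFH, GETATTR */

	if (!hadattrdir)
		numops += 3;	/* OPENATTR, GETATTR, OPENATTR */
	if (prefetch)
		numops += 4;	/* SAVEFH, RESTOREFH, NVERIFY, READ */
	return (numops);
}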
+ * + * At a minimum we're sending: PUTFH, LOOKUP/OPEN, GETATTR, PUTFH, GETATTR + */ + numops = 5; + if (!hadattrdir) + numops += 3; // also sending: OPENATTR, GETATTR, OPENATTR + if (prefetch) + numops += 4; // also sending: SAVEFH, RESTOREFH, NVERIFY, READ + nfsm_chain_build_alloc_init(error, &nmreq, 64 * NFSX_UNSIGNED + cnp->cn_namelen); + nfsm_chain_add_compound_header(error, &nmreq, "getnamedattr", numops); + if (hadattrdir) { + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nmp->nm_vers, adnp->n_fhp, adnp->n_fhsize); + } else { + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nmp->nm_vers, np->n_fhp, np->n_fhsize); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_OPENATTR); + nfsm_chain_add_32(error, &nmreq, create ? 1 : 0); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, bitmap); + NFS_BITMAP_SET(bitmap, NFS_FATTR_FILEHANDLE); + nfsm_chain_add_bitmap_masked(error, &nmreq, bitmap, + NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + } + if (open) { + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_OPEN); + nfsm_chain_add_32(error, &nmreq, noop->noo_seqid); + nfsm_chain_add_32(error, &nmreq, accessMode); + nfsm_chain_add_32(error, &nmreq, denyMode); + nfsm_chain_add_64(error, &nmreq, nmp->nm_clientid); + nfsm_chain_add_32(error, &nmreq, NFSX_UNSIGNED); + nfsm_chain_add_32(error, &nmreq, kauth_cred_getuid(noop->noo_cred)); + nfsm_chain_add_32(error, &nmreq, create); + if (create) { + nfsm_chain_add_32(error, &nmreq, guarded); + VATTR_INIT(&vattr); + if (truncate) + VATTR_SET(&vattr, va_data_size, 0); + nfsm_chain_add_fattr4(error, &nmreq, &vattr, nmp); + } + nfsm_chain_add_32(error, &nmreq, NFS_CLAIM_NULL); + nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); + } else { + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_LOOKUP); + nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); } - nfsm_chain_add_string(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen); - nfsm_chain_add_fattr4(error, &nmreq, vap, nmp); numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, bitmap); NFS_BITMAP_SET(bitmap, NFS_FATTR_FILEHANDLE); nfsm_chain_add_bitmap_masked(error, &nmreq, bitmap, NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_RESTOREFH); + if (prefetch) { + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_SAVEFH); + } + if (hadattrdir) { + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nmp->nm_vers, adnp->n_fhp, adnp->n_fhsize); + } else { + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, nmp->nm_vers, np->n_fhp, np->n_fhsize); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_OPENATTR); + nfsm_chain_add_32(error, &nmreq, 0); + } numops--; nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + if (prefetch) { + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_RESTOREFH); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_NVERIFY); + VATTR_INIT(&vattr); + VATTR_SET(&vattr, va_data_size, 0); + nfsm_chain_add_fattr4(error, &nmreq, &vattr, nmp); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_READ); + nfsm_chain_add_stateid(error, &nmreq, &stateid); + 
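/*
 * Illustrative note (not part of the patch): the stateid added to the READ
 * op here is the all-zeros special stateid -- seqid and all three "other"
 * words were zeroed earlier because no real open stateid exists yet when
 * the prefetch READ is batched into the same COMPOUND (NFSv4 reserves the
 * all-zeros and all-ones stateids for exactly this kind of stateless I/O).
 * A standalone sketch of that value; the struct itself is illustrative:
 */
#include <stdint.h>

struct nfs4_stateid_sketch {
	uint32_t seqid;
	uint32_t other[3];
};

static const struct nfs4_stateid_sketch anonymous_stateid = {
	.seqid = 0,
	.other = { 0, 0, 0 },	/* all-zeros => "anonymous" special stateid */
};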
nfsm_chain_add_64(error, &nmreq, 0); + nfsm_chain_add_32(error, &nmreq, rlen); + } nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - - error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC4_COMPOUND, - vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, &req); - if (!error) { - nfs_dulookup_start(&dul, dnp, ctx); + error = nfs_request_async(hadattrdir ? adnp : np, NULL, &nmreq, NFSPROC4_COMPOUND, + vfs_context_thread(ctx), vfs_context_ucred(ctx), &si, open ? R_NOINTR: 0, NULL, &req); + if (!error) error = nfs_request_async_finish(req, &nmrep, &xid, &status); - } - if ((lockerror = nfs_node_lock(dnp))) - error = lockerror; + if (hadattrdir && ((adlockerror = nfs_node_lock(adnp)))) + error = adlockerror; + savedxid = xid; nfsm_chain_skip_tag(error, &nmrep); nfsm_chain_get_32(error, &nmrep, numops); nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); - nfsm_chain_op_check(error, &nmrep, NFS_OP_SAVEFH); - nfsmout_if(error); - nfsm_chain_op_check(error, &nmrep, NFS_OP_CREATE); - nfsm_chain_check_change_info(error, &nmrep, dnp); - bmlen = NFS_ATTR_BITMAP_LEN; - nfsm_chain_get_bitmap(error, &nmrep, bitmap, bmlen); - /* At this point if we have no error, the object was created. */ - /* if we don't get attributes, then we should lookitup. */ - create_error = error; - nfsmout_if(error); - nfs_vattr_set_supported(bitmap, vap); + if (!hadattrdir) { + nfsm_chain_op_check(error, &nmrep, NFS_OP_OPENATTR); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + nfsmout_if(error); + error = nfs4_parsefattr(&nmrep, NULL, &nvattr, &fh, NULL, NULL); + nfsmout_if(error); + if (NFS_BITMAP_ISSET(nvattr.nva_bitmap, NFS_FATTR_FILEHANDLE) && fh.fh_len) { + if (!np->n_attrdirfh || (*np->n_attrdirfh != fh.fh_len)) { + /* (re)allocate attrdir fh buffer */ + if (np->n_attrdirfh) + FREE(np->n_attrdirfh, M_TEMP); + MALLOC(np->n_attrdirfh, u_char*, fh.fh_len+1, M_TEMP, M_WAITOK); + } + if (np->n_attrdirfh) { + /* remember the attrdir fh in the node */ + *np->n_attrdirfh = fh.fh_len; + bcopy(fh.fh_data, np->n_attrdirfh+1, fh.fh_len); + /* create busied node for attrdir */ + struct componentname cn; + bzero(&cn, sizeof(cn)); + cn.cn_nameptr = __CAST_AWAY_QUALIFIER(_PATH_FORKSPECIFIER, const, char *); /* "/..namedfork/" */ + cn.cn_namelen = strlen(_PATH_FORKSPECIFIER); + cn.cn_nameiop = LOOKUP; + // XXX can't set parent correctly (to np) yet + error = nfs_nget(NFSTOMP(np), NULL, &cn, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, 0, &adnp); + if (!error) { + adlockerror = 0; + /* set the node busy */ + SET(adnp->n_flag, NBUSY); + adbusyerror = 0; + } + /* if no adnp, oh well... 
*/ + error = 0; + } + } + NVATTR_CLEANUP(&nvattr); + fh.fh_len = 0; + } + if (open) { + nfsm_chain_op_check(error, &nmrep, NFS_OP_OPEN); + nfs_owner_seqid_increment(noop, NULL, error); + nfsm_chain_get_stateid(error, &nmrep, &newnofp->nof_stateid); + nfsm_chain_check_change_info(error, &nmrep, adnp); + nfsm_chain_get_32(error, &nmrep, rflags); + bmlen = NFS_ATTR_BITMAP_LEN; + nfsm_chain_get_bitmap(error, &nmrep, bitmap, bmlen); + nfsm_chain_get_32(error, &nmrep, delegation); + if (!error) + switch (delegation) { + case NFS_OPEN_DELEGATE_NONE: + break; + case NFS_OPEN_DELEGATE_READ: + case NFS_OPEN_DELEGATE_WRITE: + nfsm_chain_get_stateid(error, &nmrep, &dstateid); + nfsm_chain_get_32(error, &nmrep, recall); + if (delegation == NFS_OPEN_DELEGATE_WRITE) // space (skip) XXX + nfsm_chain_adv(error, &nmrep, 3 * NFSX_UNSIGNED); + /* if we have any trouble accepting the ACE, just invalidate it */ + ace_type = ace_flags = ace_mask = len = 0; + nfsm_chain_get_32(error, &nmrep, ace_type); + nfsm_chain_get_32(error, &nmrep, ace_flags); + nfsm_chain_get_32(error, &nmrep, ace_mask); + nfsm_chain_get_32(error, &nmrep, len); + ace.ace_flags = nfs4_ace_nfstype_to_vfstype(ace_type, &error); + ace.ace_flags |= nfs4_ace_nfsflags_to_vfsflags(ace_flags); + ace.ace_rights = nfs4_ace_nfsmask_to_vfsrights(ace_mask); + if (!error && (len >= slen)) { + MALLOC(s, char*, len+1, M_TEMP, M_WAITOK); + if (s) + slen = len+1; + else + ace.ace_flags = 0; + } + if (s) + nfsm_chain_get_opaque(error, &nmrep, len, s); + else + nfsm_chain_adv(error, &nmrep, nfsm_rndup(len)); + if (!error && s) { + s[len] = '\0'; + if (nfs4_id2guid(s, &ace.ace_applicable, (ace_flags & NFS_ACE_IDENTIFIER_GROUP))) + ace.ace_flags = 0; + } + if (error || !s) + ace.ace_flags = 0; + if (s && (s != sbuf)) + FREE(s, M_TEMP); + break; + default: + error = EBADRPC; + break; + } + /* At this point if we have no error, the object was created/opened. 
*/ + open_error = error; + } else { + nfsm_chain_op_check(error, &nmrep, NFS_OP_LOOKUP); + } nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); nfsmout_if(error); - NFS_CLEAR_ATTRIBUTES(nvattr.nva_bitmap); - error = nfs4_parsefattr(&nmrep, NULL, &nvattr, &fh, NULL); + error = nfs4_parsefattr(&nmrep, NULL, &nvattr, &fh, NULL, NULL); nfsmout_if(error); - if (!NFS_BITMAP_ISSET(nvattr.nva_bitmap, NFS_FATTR_FILEHANDLE)) { - printf("nfs: create/%s didn't return filehandle?\n", tag); - error = EBADRPC; + if (!NFS_BITMAP_ISSET(nvattr.nva_bitmap, NFS_FATTR_FILEHANDLE) || !fh.fh_len) { + error = EIO; goto nfsmout; } - /* directory attributes: if we don't get them, make sure to invalidate */ - nfsm_chain_op_check(error, &nmrep, NFS_OP_RESTOREFH); + if (prefetch) + nfsm_chain_op_check(error, &nmrep, NFS_OP_SAVEFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); + if (!hadattrdir) + nfsm_chain_op_check(error, &nmrep, NFS_OP_OPENATTR); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - savedxid = xid; - nfsm_chain_loadattr(error, &nmrep, dnp, nfsvers, NULL, &xid); - if (error) - NATTRINVALIDATE(dnp); - -nfsmout: - nfsm_chain_cleanup(&nmreq); - nfsm_chain_cleanup(&nmrep); + nfsmout_if(error); + xid = savedxid; + nfsm_chain_loadattr(error, &nmrep, adnp, nmp->nm_vers, &xid); + nfsmout_if(error); - if (!lockerror) { - if (!create_error && (dnp->n_flag & NNEGNCENTRIES)) { - dnp->n_flag &= ~NNEGNCENTRIES; - cache_purge_negatives(NFSTOV(dnp)); + if (open) { + if (rflags & NFS_OPEN_RESULT_LOCKTYPE_POSIX) + newnofp->nof_flags |= NFS_OPEN_FILE_POSIXLOCK; + if (rflags & NFS_OPEN_RESULT_CONFIRM) { + if (adnp) { + nfs_node_unlock(adnp); + adlockerror = ENOENT; + } + NVATTR_CLEANUP(&nvattr); + error = nfs4_open_confirm_rpc(nmp, adnp ? adnp : np, fh.fh_data, fh.fh_len, noop, &newnofp->nof_stateid, thd, cred, &nvattr, &xid); + nfsmout_if(error); + savedxid = xid; + if ((adlockerror = nfs_node_lock(adnp))) + error = adlockerror; } - dnp->n_flag |= NMODIFIED; - nfs_node_unlock(dnp); - /* nfs_getattr() will check changed and purge caches */ - nfs_getattr(dnp, &dnvattr, ctx, NGA_CACHED); } - if (!error && fh.fh_len) { +nfsmout: + if (open && adnp && !adlockerror) { + if (!open_error && (adnp->n_flag & NNEGNCENTRIES)) { + adnp->n_flag &= ~NNEGNCENTRIES; + cache_purge_negatives(NFSTOV(adnp)); + } + adnp->n_flag |= NMODIFIED; + nfs_node_unlock(adnp); + adlockerror = ENOENT; + nfs_getattr(adnp, NULL, ctx, NGA_CACHED); + } + if (adnp && !adlockerror && (error == ENOENT) && + (cnp->cn_flags & MAKEENTRY) && (cnp->cn_nameiop != CREATE) && negnamecache) { + /* add a negative entry in the name cache */ + cache_enter(NFSTOV(adnp), NULL, cnp); + adnp->n_flag |= NNEGNCENTRIES; + } + if (adnp && !adlockerror) { + nfs_node_unlock(adnp); + adlockerror = ENOENT; + } + if (!error && !anp && fh.fh_len) { /* create the vnode with the filehandle and attributes */ xid = savedxid; - error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, NG_MAKEENTRY, &np); - if (!error) - newvp = NFSTOV(np); + error = nfs_nget(NFSTOMP(np), adnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, NG_MAKEENTRY, &anp); + if (!error) { + *anpp = anp; + nfs_node_unlock(anp); + } + if (!error && open) { + nfs_open_file_add_open(newnofp, accessMode, denyMode, 0); + /* After we have a node, add our open file struct to the node */ + nofp = newnofp; + error = nfs_open_file_find_internal(anp, noop, &nofp, 0, 0, 0); + if (error) { + /* This shouldn't happen, because we passed in a new nofp to use. 
*/ + printf("nfs_open_file_find_internal failed! %d\n", error); + nofp = NULL; + } else if (nofp != newnofp) { + /* + * Hmm... an open file struct already exists. + * Mark the existing one busy and merge our open into it. + * Then destroy the one we created. + * Note: there's no chance of an open confict because the + * open has already been granted. + */ + nofpbusyerror = nfs_open_file_set_busy(nofp, NULL); + nfs_open_file_add_open(nofp, accessMode, denyMode, 0); + nofp->nof_stateid = newnofp->nof_stateid; + if (newnofp->nof_flags & NFS_OPEN_FILE_POSIXLOCK) + nofp->nof_flags |= NFS_OPEN_FILE_POSIXLOCK; + nfs_open_file_clear_busy(newnofp); + nfs_open_file_destroy(newnofp); + newnofp = NULL; + } + if (!error) { + newnofp = NULL; + nofpbusyerror = 0; + /* mark the node as holding a create-initiated open */ + nofp->nof_flags |= NFS_OPEN_FILE_CREATE; + nofp->nof_creator = current_thread(); + if (nofpp) + *nofpp = nofp; + } + } } + NVATTR_CLEANUP(&nvattr); + if (open && ((delegation == NFS_OPEN_DELEGATE_READ) || (delegation == NFS_OPEN_DELEGATE_WRITE))) { + if (!error && anp && !recall) { + /* stuff the delegation state in the node */ + lck_mtx_lock(&anp->n_openlock); + anp->n_openflags &= ~N_DELEG_MASK; + anp->n_openflags |= ((delegation == NFS_OPEN_DELEGATE_READ) ? N_DELEG_READ : N_DELEG_WRITE); + anp->n_dstateid = dstateid; + anp->n_dace = ace; + if (anp->n_dlink.tqe_next == NFSNOLIST) { + lck_mtx_lock(&nmp->nm_lock); + if (anp->n_dlink.tqe_next == NFSNOLIST) + TAILQ_INSERT_TAIL(&nmp->nm_delegations, anp, n_dlink); + lck_mtx_unlock(&nmp->nm_lock); + } + lck_mtx_unlock(&anp->n_openlock); + } else { + /* give the delegation back */ + if (anp) { + if (NFS_CMPFH(anp, fh.fh_data, fh.fh_len)) { + /* update delegation state and return it */ + lck_mtx_lock(&anp->n_openlock); + anp->n_openflags &= ~N_DELEG_MASK; + anp->n_openflags |= ((delegation == NFS_OPEN_DELEGATE_READ) ? 
N_DELEG_READ : N_DELEG_WRITE); + anp->n_dstateid = dstateid; + anp->n_dace = ace; + if (anp->n_dlink.tqe_next == NFSNOLIST) { + lck_mtx_lock(&nmp->nm_lock); + if (anp->n_dlink.tqe_next == NFSNOLIST) + TAILQ_INSERT_TAIL(&nmp->nm_delegations, anp, n_dlink); + lck_mtx_unlock(&nmp->nm_lock); + } + lck_mtx_unlock(&anp->n_openlock); + /* don't need to send a separate delegreturn for fh */ + fh.fh_len = 0; + } + /* return anp's current delegation */ + nfs4_delegation_return(anp, 0, thd, cred); + } + if (fh.fh_len) /* return fh's delegation if it wasn't for anp */ + nfs4_delegreturn_rpc(nmp, fh.fh_data, fh.fh_len, &dstateid, 0, thd, cred); + } + } + if (open) { + if (newnofp) { + /* need to clean up our temporary nofp */ + nfs_open_file_clear_busy(newnofp); + nfs_open_file_destroy(newnofp); + newnofp = NULL; + } else if (nofp && !nofpbusyerror) { + nfs_open_file_clear_busy(nofp); + nofpbusyerror = ENOENT; + } + if (inuse && nfs_mount_state_in_use_end(nmp, error)) { + inuse = 0; + nofp = newnofp = NULL; + rflags = delegation = recall = eof = rlen = retlen = 0; + ace.ace_flags = 0; + s = sbuf; + slen = sizeof(sbuf); + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); + if (anp) { + vnode_put(NFSTOV(anp)); + *anpp = anp = NULL; + } + hadattrdir = (adnp != NULL); + if (noopbusy) { + nfs_open_owner_clear_busy(noop); + noopbusy = 0; + } + goto restart; + } + if (noop) { + if (noopbusy) { + nfs_open_owner_clear_busy(noop); + noopbusy = 0; + } + nfs_open_owner_rele(noop); + } + } + if (!error && prefetch && nmrep.nmc_mhead) { + nfsm_chain_op_check(error, &nmrep, NFS_OP_RESTOREFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_NVERIFY); + nfsm_chain_op_check(error, &nmrep, NFS_OP_READ); + nfsm_chain_get_32(error, &nmrep, eof); + nfsm_chain_get_32(error, &nmrep, retlen); + if (!error && anp) { + /* + * There can be one problem with doing the prefetch. + * Because we don't have the node before we start the RPC, we + * can't have the buffer busy while the READ is performed. + * So there is a chance that other I/O occurred on the same + * range of data while we were performing this RPC. If that + * happens, then it's possible the data we have in the READ + * response is no longer up to date. + * Once we have the node and the buffer, we need to make sure + * that there's no chance we could be putting stale data in + * the buffer. + * So, we check if the range read is dirty or if any I/O may + * have occurred on it while we were performing our RPC. + */ + struct nfsbuf *bp = NULL; + int lastpg; + uint32_t pagemask; + + retlen = MIN(retlen, rlen); + + /* check if node needs size update or invalidation */ + if (ISSET(anp->n_flag, NUPDATESIZE)) + nfs_data_update_size(anp, 0); + if (!(error = nfs_node_lock(anp))) { + if (anp->n_flag & NNEEDINVALIDATE) { + anp->n_flag &= ~NNEEDINVALIDATE; + nfs_node_unlock(anp); + error = nfs_vinvalbuf(NFSTOV(anp), V_SAVE|V_IGNORE_WRITEERR, ctx, 1); + if (!error) /* let's play it safe and just drop the data */ + error = EIO; + } else { + nfs_node_unlock(anp); + } + } - nfs_dulookup_finish(&dul, dnp, ctx); - - /* - * Kludge: Map EEXIST => 0 assuming that you have a reply to a retry - * if we can succeed in looking up the object.
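/*
 * Illustrative sketch (not part of the patch): before the prefetched READ
 * data is stashed below, a bitmask of the pages covered by the reply is
 * built and the data is dropped if any of those pages are already dirty in
 * the buffer (bp->nb_dirty & pagemask) or if other I/O touched the node
 * while the RPC was in flight (n_lastio vs. now).  A simplified user-space
 * version of the mask check -- the kernel uses trunc_page_32()/PAGE_SIZE;
 * PAGE_SIZE_SKETCH here is an assumption, and the mask fits 32 pages max.
 */
#include <stdint.h>

#define PAGE_SIZE_SKETCH	4096u

/* one bit per page covered by bytes [0, retlen); retlen must be > 0 */
static uint32_t
page_mask_for_length(uint32_t retlen)
{
	uint32_t lastpg = (retlen - 1) / PAGE_SIZE_SKETCH;

	return ((1u << (lastpg + 1)) - 1);
}

/* refuse the prefetched data if it overlaps any dirty page */
static int
prefetch_data_usable(uint32_t dirty_mask, uint32_t retlen)
{
	return ((retlen > 0) && !(dirty_mask & page_mask_for_length(retlen)));
}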
- */ - if ((create_error == EEXIST) || (!create_error && !newvp)) { - error = nfs_lookitup(dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx, &np); - if (!error) { - newvp = NFSTOV(np); - if (vnode_vtype(newvp) != VLNK) - error = EEXIST; + /* calculate page mask for the range of data read */ + lastpg = (trunc_page_32(retlen) - 1) / PAGE_SIZE; + pagemask = ((1 << (lastpg + 1)) - 1); + + if (!error) + error = nfs_buf_get(anp, 0, nmp->nm_biosize, thd, NBLK_READ|NBLK_NOWAIT, &bp); + /* don't save the data if dirty or potential I/O conflict */ + if (!error && bp && !bp->nb_dirtyoff && !(bp->nb_dirty & pagemask) && + timevalcmp(&anp->n_lastio, &now, <)) { + OSAddAtomic(1, &nfsstats.read_bios); + CLR(bp->nb_flags, (NB_DONE|NB_ASYNC)); + SET(bp->nb_flags, NB_READ); + NFS_BUF_MAP(bp); + nfsm_chain_get_opaque(error, &nmrep, retlen, bp->nb_data); + if (error) { + bp->nb_error = error; + SET(bp->nb_flags, NB_ERROR); + } else { + bp->nb_offio = 0; + bp->nb_endio = rlen; + if ((retlen > 0) && (bp->nb_endio < (int)retlen)) + bp->nb_endio = retlen; + if (eof || (retlen == 0)) { + /* zero out the remaining data (up to EOF) */ + off_t rpcrem, eofrem, rem; + rpcrem = (rlen - retlen); + eofrem = anp->n_size - (NBOFF(bp) + retlen); + rem = (rpcrem < eofrem) ? rpcrem : eofrem; + if (rem > 0) + bzero(bp->nb_data + retlen, rem); + } else if ((retlen < rlen) && !ISSET(bp->nb_flags, NB_ERROR)) { + /* ugh... short read ... just invalidate for now... */ + SET(bp->nb_flags, NB_INVAL); + } + } + nfs_buf_read_finish(bp); + microuptime(&anp->n_lastio); + } + if (bp) + nfs_buf_release(bp, 1); } + error = 0; /* ignore any transient error in processing the prefetch */ } - if (!busyerror) - nfs_node_clear_busy(dnp); - if (error) { - if (newvp) { - nfs_node_unlock(np); - vnode_put(newvp); + if (adnp && !adbusyerror) { + nfs_node_clear_busy(adnp); + adbusyerror = ENOENT; + } + if (!busyerror) { + nfs_node_clear_busy(np); + busyerror = ENOENT; + } + if (adnp) + vnode_put(NFSTOV(adnp)); + if (error && *anpp) { + vnode_put(NFSTOV(*anpp)); + *anpp = NULL; + } + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); + return (error); +} + +/* + * Remove a named attribute. 
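/*
 * Illustrative sketch (not part of the patch): nfs4_named_attr_remove()
 * below is what ultimately services removexattr(2) on an NFSv4 mount, and
 * the vnop wrappers later in this patch map a server-side ENOENT to
 * ENOATTR.  A minimal user-space exercise of that path (real Mac OS X API;
 * the file path and attribute name are only examples):
 */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/xattr.h>

int
main(void)
{
	if (removexattr("/Volumes/nfs/file", "com.example.test", 0) != 0)
		fprintf(stderr, "removexattr: %s\n", strerror(errno));
	return (0);
}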
+ */ +int +nfs4_named_attr_remove(nfsnode_t np, nfsnode_t anp, const char *name, vfs_context_t ctx) +{ + nfsnode_t adnp = NULL; + struct nfsmount *nmp; + struct componentname cn; + struct vnop_remove_args vra; + int error, putanp = 0; + + nmp = NFSTONMP(np); + if (!nmp) + return (ENXIO); + + bzero(&cn, sizeof(cn)); + cn.cn_nameptr = __CAST_AWAY_QUALIFIER(name, const, char *); + cn.cn_namelen = strlen(name); + cn.cn_nameiop = DELETE; + cn.cn_flags = 0; + + if (!anp) { + error = nfs4_named_attr_get(np, &cn, NFS_OPEN_SHARE_ACCESS_NONE, + 0, ctx, &anp, NULL); + if ((!error && !anp) || (error == ENOATTR)) + error = ENOENT; + if (error) { + if (anp) { + vnode_put(NFSTOV(anp)); + anp = NULL; + } + goto out; } - } else { - nfs_node_unlock(np); - *npp = np; + putanp = 1; + } + + if ((error = nfs_node_set_busy(np, vfs_context_thread(ctx)))) + goto out; + adnp = nfs4_named_attr_dir_get(np, 1, ctx); + nfs_node_clear_busy(np); + if (!adnp) { + error = ENOENT; + goto out; } + + vra.a_desc = &vnop_remove_desc; + vra.a_dvp = NFSTOV(adnp); + vra.a_vp = NFSTOV(anp); + vra.a_cnp = &cn; + vra.a_flags = 0; + vra.a_context = ctx; + error = nfs_vnop_remove(&vra); +out: + if (adnp) + vnode_put(NFSTOV(adnp)); + if (putanp) + vnode_put(NFSTOV(anp)); return (error); } int -nfs4_vnop_mknod( - struct vnop_mknod_args /* { +nfs4_vnop_getxattr( + struct vnop_getxattr_args /* { struct vnodeop_desc *a_desc; - vnode_t a_dvp; - vnode_t *a_vpp; - struct componentname *a_cnp; - struct vnode_attr *a_vap; + vnode_t a_vp; + const char * a_name; + uio_t a_uio; + size_t *a_size; + int a_options; vfs_context_t a_context; } */ *ap) { - nfsnode_t np = NULL; + vfs_context_t ctx = ap->a_context; struct nfsmount *nmp; - int error; + struct nfs_vattr nvattr; + struct componentname cn; + nfsnode_t anp; + int error = 0, isrsrcfork; - nmp = VTONMP(ap->a_dvp); + nmp = VTONMP(ap->a_vp); if (!nmp) return (ENXIO); - if (!VATTR_IS_ACTIVE(ap->a_vap, va_type)) - return (EINVAL); - switch (ap->a_vap->va_type) { - case VBLK: - case VCHR: - case VFIFO: - case VSOCK: - break; - default: + if (!(nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR)) return (ENOTSUP); + error = nfs_getattr(VTONFS(ap->a_vp), &nvattr, ctx, NGA_CACHED); + if (error) + return (error); + if (NFS_BITMAP_ISSET(nvattr.nva_bitmap, NFS_FATTR_NAMED_ATTR) && + !(nvattr.nva_flags & NFS_FFLAG_HAS_NAMED_ATTRS)) + return (ENOATTR); + + bzero(&cn, sizeof(cn)); + cn.cn_nameptr = __CAST_AWAY_QUALIFIER(ap->a_name, const, char *); + cn.cn_namelen = strlen(ap->a_name); + cn.cn_nameiop = LOOKUP; + cn.cn_flags = MAKEENTRY; + + /* we'll normally try to prefetch data for xattrs... the resource fork is really a stream */ + isrsrcfork = (bcmp(ap->a_name, XATTR_RESOURCEFORK_NAME, sizeof(XATTR_RESOURCEFORK_NAME)) == 0); + + error = nfs4_named_attr_get(VTONFS(ap->a_vp), &cn, NFS_OPEN_SHARE_ACCESS_NONE, + !isrsrcfork ? 
NFS_GET_NAMED_ATTR_PREFETCH : 0, ctx, &anp, NULL); + if ((!error && !anp) || (error == ENOENT)) + error = ENOATTR; + if (!error) { + if (ap->a_uio) + error = nfs_bioread(anp, ap->a_uio, 0, ctx); + else + *ap->a_size = anp->n_size; } + if (anp) + vnode_put(NFSTOV(anp)); + return (error); +} - error = nfs4_create_rpc(ap->a_context, VTONFS(ap->a_dvp), ap->a_cnp, ap->a_vap, - vtonfs_type(ap->a_vap->va_type, nmp->nm_vers), NULL, &np); +int +nfs4_vnop_setxattr( + struct vnop_setxattr_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + const char * a_name; + uio_t a_uio; + int a_options; + vfs_context_t a_context; + } */ *ap) +{ + vfs_context_t ctx = ap->a_context; + int options = ap->a_options; + uio_t uio = ap->a_uio; + const char *name = ap->a_name; + struct nfsmount *nmp; + struct componentname cn; + nfsnode_t anp = NULL; + int error = 0, closeerror = 0, flags, isrsrcfork, isfinderinfo, empty = 0, i; +#define FINDERINFOSIZE 32 + uint8_t finfo[FINDERINFOSIZE]; + uint32_t *finfop; + struct nfs_open_file *nofp = NULL; + char uio_buf [ UIO_SIZEOF(1) ]; + uio_t auio; + struct vnop_write_args vwa; + + nmp = VTONMP(ap->a_vp); + if (!nmp) + return (ENXIO); + + if (!(nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR)) + return (ENOTSUP); + + if ((options & XATTR_CREATE) && (options & XATTR_REPLACE)) + return (EINVAL); + + /* XXX limitation based on need to back up uio on short write */ + if (uio_iovcnt(uio) > 1) { + printf("nfs4_vnop_setxattr: iovcnt > 1\n"); + return (EINVAL); + } + + bzero(&cn, sizeof(cn)); + cn.cn_nameptr = __CAST_AWAY_QUALIFIER(name, const, char *); + cn.cn_namelen = strlen(name); + cn.cn_nameiop = CREATE; + cn.cn_flags = MAKEENTRY; + + isfinderinfo = (bcmp(name, XATTR_FINDERINFO_NAME, sizeof(XATTR_FINDERINFO_NAME)) == 0); + isrsrcfork = isfinderinfo ? 0 : (bcmp(name, XATTR_RESOURCEFORK_NAME, sizeof(XATTR_RESOURCEFORK_NAME)) == 0); + if (!isrsrcfork) + uio_setoffset(uio, 0); + if (isfinderinfo) { + if (uio_resid(uio) != sizeof(finfo)) + return (ERANGE); + error = uiomove((char*)&finfo, sizeof(finfo), uio); + if (error) + return (error); + /* setting a FinderInfo of all zeroes means remove the FinderInfo */ + empty = 1; + for (i=0, finfop=(uint32_t*)&finfo; i < (int)(sizeof(finfo)/sizeof(uint32_t)); i++) + if (finfop[i]) { + empty = 0; + break; + } + if (empty && !(options & (XATTR_CREATE|XATTR_REPLACE))) { + error = nfs4_named_attr_remove(VTONFS(ap->a_vp), anp, name, ctx); + if (error == ENOENT) + error = 0; + return (error); + } + /* first, let's see if we get a create/replace error */ + } + + /* + * create/open the xattr + * + * We need to make sure not to create it if XATTR_REPLACE. + * For all xattrs except the resource fork, we also want to + * truncate the xattr to remove any current data. We'll do + * that by setting the size to 0 on create/open. 
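/*
 * Illustrative sketch (not part of the patch): the flag computation just
 * below maps the setxattr(2) options onto this named-attr open -- create
 * unless XATTR_REPLACE was given, make the create guarded (exclusive, so
 * XATTR_CREATE fails with EEXIST on an existing attribute), and truncate
 * everything except the resource fork.  The SK_* values are stand-ins for
 * the NFS_GET_NAMED_ATTR_* flags defined earlier in this patch.
 */
#include <sys/xattr.h>

#define SK_CREATE		0x1
#define SK_CREATE_GUARDED	0x2
#define SK_TRUNCATE		0x4

static int
setxattr_open_flags(int options, int isrsrcfork)
{
	int flags = 0;

	if (!(options & XATTR_REPLACE))
		flags |= SK_CREATE;
	if (options & XATTR_CREATE)
		flags |= SK_CREATE_GUARDED;
	if (!isrsrcfork)
		flags |= SK_TRUNCATE;
	return (flags);
}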
+ */ + flags = 0; + if (!(options & XATTR_REPLACE)) + flags |= NFS_GET_NAMED_ATTR_CREATE; + if (options & XATTR_CREATE) + flags |= NFS_GET_NAMED_ATTR_CREATE_GUARDED; + if (!isrsrcfork) + flags |= NFS_GET_NAMED_ATTR_TRUNCATE; + + error = nfs4_named_attr_get(VTONFS(ap->a_vp), &cn, NFS_OPEN_SHARE_ACCESS_BOTH, + flags, ctx, &anp, &nofp); + if (!error && !anp) + error = ENOATTR; + if (error) + goto out; + /* grab the open state from the get/create/open */ + if (nofp && !(error = nfs_open_file_set_busy(nofp, NULL))) { + nofp->nof_flags &= ~NFS_OPEN_FILE_CREATE; + nofp->nof_creator = NULL; + nfs_open_file_clear_busy(nofp); + } + + /* Setting an empty FinderInfo really means remove it, skip to the close/remove */ + if (isfinderinfo && empty) + goto doclose; + + /* + * Write the data out and flush. + * + * For FinderInfo, we've already copied the data to finfo, so do I/O from there. + */ + vwa.a_desc = &vnop_write_desc; + vwa.a_vp = NFSTOV(anp); + vwa.a_uio = NULL; + vwa.a_ioflag = 0; + vwa.a_context = ctx; + if (isfinderinfo) { + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_WRITE, &uio_buf, sizeof(uio_buf)); + uio_addiov(auio, (uintptr_t)&finfo, sizeof(finfo)); + vwa.a_uio = auio; + } else if (uio_resid(uio) > 0) { + vwa.a_uio = uio; + } + if (vwa.a_uio) { + error = nfs_vnop_write(&vwa); + if (!error) + error = nfs_flush(anp, MNT_WAIT, vfs_context_thread(ctx), 0); + } +doclose: + /* Close the xattr. */ + if (nofp) { + int busyerror = nfs_open_file_set_busy(nofp, NULL); + closeerror = nfs_close(anp, nofp, NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_NONE, ctx); + if (!busyerror) + nfs_open_file_clear_busy(nofp); + } + if (!error && isfinderinfo && empty) { /* Setting an empty FinderInfo really means remove it */ + error = nfs4_named_attr_remove(VTONFS(ap->a_vp), anp, name, ctx); + if (error == ENOENT) + error = 0; + } if (!error) - *ap->a_vpp = NFSTOV(np); + error = closeerror; +out: + if (anp) + vnode_put(NFSTOV(anp)); + if (error == ENOENT) + error = ENOATTR; return (error); } int -nfs4_vnop_mkdir( - struct vnop_mkdir_args /* { +nfs4_vnop_removexattr( + struct vnop_removexattr_args /* { struct vnodeop_desc *a_desc; - vnode_t a_dvp; - vnode_t *a_vpp; - struct componentname *a_cnp; - struct vnode_attr *a_vap; + vnode_t a_vp; + const char * a_name; + int a_options; vfs_context_t a_context; } */ *ap) { - nfsnode_t np = NULL; + struct nfsmount *nmp = VTONMP(ap->a_vp); int error; - error = nfs4_create_rpc(ap->a_context, VTONFS(ap->a_dvp), ap->a_cnp, ap->a_vap, - NFDIR, NULL, &np); - if (!error) - *ap->a_vpp = NFSTOV(np); + if (!nmp) + return (ENXIO); + if (!(nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR)) + return (ENOTSUP); + + error = nfs4_named_attr_remove(VTONFS(ap->a_vp), NULL, ap->a_name, ap->a_context); + if (error == ENOENT) + error = ENOATTR; return (error); } int -nfs4_vnop_symlink( - struct vnop_symlink_args /* { +nfs4_vnop_listxattr( + struct vnop_listxattr_args /* { struct vnodeop_desc *a_desc; - vnode_t a_dvp; - vnode_t *a_vpp; - struct componentname *a_cnp; - struct vnode_attr *a_vap; - char *a_target; + vnode_t a_vp; + uio_t a_uio; + size_t *a_size; + int a_options; vfs_context_t a_context; } */ *ap) { - nfsnode_t np = NULL; - int error; + vfs_context_t ctx = ap->a_context; + nfsnode_t np = VTONFS(ap->a_vp); + uio_t uio = ap->a_uio; + nfsnode_t adnp = NULL; + struct nfsmount *nmp; + int error, done, i; + struct nfs_vattr nvattr; + uint64_t cookie, nextcookie, lbn = 0; + struct nfsbuf *bp = NULL; + struct nfs_dir_buf_header *ndbhp; + struct direntry *dp; - error = 
nfs4_create_rpc(ap->a_context, VTONFS(ap->a_dvp), ap->a_cnp, ap->a_vap, - NFLNK, ap->a_target, &np); - if (!error) - *ap->a_vpp = NFSTOV(np); + nmp = VTONMP(ap->a_vp); + if (!nmp) + return (ENXIO); + + if (!(nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR)) + return (ENOTSUP); + + error = nfs_getattr(np, &nvattr, ctx, NGA_CACHED); + if (error) + return (error); + if (NFS_BITMAP_ISSET(nvattr.nva_bitmap, NFS_FATTR_NAMED_ATTR) && + !(nvattr.nva_flags & NFS_FFLAG_HAS_NAMED_ATTRS)) + return (0); + + if ((error = nfs_node_set_busy(np, vfs_context_thread(ctx)))) + return (error); + adnp = nfs4_named_attr_dir_get(np, 1, ctx); + nfs_node_clear_busy(np); + if (!adnp) + goto out; + + if ((error = nfs_node_lock(adnp))) + goto out; + + if (adnp->n_flag & NNEEDINVALIDATE) { + adnp->n_flag &= ~NNEEDINVALIDATE; + nfs_invaldir(adnp); + nfs_node_unlock(adnp); + error = nfs_vinvalbuf(NFSTOV(adnp), 0, ctx, 1); + if (!error) + error = nfs_node_lock(adnp); + if (error) + goto out; + } + + /* + * check for need to invalidate when (re)starting at beginning + */ + if (adnp->n_flag & NMODIFIED) { + nfs_invaldir(adnp); + nfs_node_unlock(adnp); + if ((error = nfs_vinvalbuf(NFSTOV(adnp), 0, ctx, 1))) + goto out; + } else { + nfs_node_unlock(adnp); + } + /* nfs_getattr() will check changed and purge caches */ + if ((error = nfs_getattr(adnp, &nvattr, ctx, NGA_UNCACHED))) + goto out; + + if (uio && (uio_resid(uio) == 0)) + goto out; + + done = 0; + nextcookie = lbn = 0; + + while (!error && !done) { + OSAddAtomic(1, &nfsstats.biocache_readdirs); + cookie = nextcookie; +getbuffer: + error = nfs_buf_get(adnp, lbn, NFS_DIRBLKSIZ, vfs_context_thread(ctx), NBLK_READ, &bp); + if (error) + goto out; + ndbhp = (struct nfs_dir_buf_header*)bp->nb_data; + if (!ISSET(bp->nb_flags, NB_CACHE) || !ISSET(ndbhp->ndbh_flags, NDB_FULL)) { + if (!ISSET(bp->nb_flags, NB_CACHE)) { /* initialize the buffer */ + ndbhp->ndbh_flags = 0; + ndbhp->ndbh_count = 0; + ndbhp->ndbh_entry_end = sizeof(*ndbhp); + ndbhp->ndbh_ncgen = adnp->n_ncgen; + } + error = nfs_buf_readdir(bp, ctx); + if (error == NFSERR_DIRBUFDROPPED) + goto getbuffer; + if (error) + nfs_buf_release(bp, 1); + if (error && (error != ENXIO) && (error != ETIMEDOUT) && (error != EINTR) && (error != ERESTART)) { + if (!nfs_node_lock(adnp)) { + nfs_invaldir(adnp); + nfs_node_unlock(adnp); + } + nfs_vinvalbuf(NFSTOV(adnp), 0, ctx, 1); + if (error == NFSERR_BAD_COOKIE) + error = ENOENT; + } + if (error) + goto out; + } + + /* go through all the entries copying/counting */ + dp = NFS_DIR_BUF_FIRST_DIRENTRY(bp); + for (i=0; i < ndbhp->ndbh_count; i++) { + if (!xattr_protected(dp->d_name)) { + if (uio == NULL) { + *ap->a_size += dp->d_namlen + 1; + } else if (uio_resid(uio) < (dp->d_namlen + 1)) { + error = ERANGE; + } else { + error = uiomove(dp->d_name, dp->d_namlen+1, uio); + if (error && (error != EFAULT)) + error = ERANGE; + } + } + nextcookie = dp->d_seekoff; + dp = NFS_DIRENTRY_NEXT(dp); + } + + if (i == ndbhp->ndbh_count) { + /* hit end of buffer, move to next buffer */ + lbn = nextcookie; + /* if we also hit EOF, we're done */ + if (ISSET(ndbhp->ndbh_flags, NDB_EOF)) + done = 1; + } + if (!error && !done && (nextcookie == cookie)) { + printf("nfs readdir cookie didn't change 0x%llx, %d/%d\n", cookie, i, ndbhp->ndbh_count); + error = EIO; + } + nfs_buf_release(bp, 1); + } +out: + if (adnp) + vnode_put(NFSTOV(adnp)); return (error); } +#if NAMEDSTREAMS int -nfs4_vnop_link( - struct vnop_link_args /* { +nfs4_vnop_getnamedstream( + struct vnop_getnamedstream_args /* { struct 
vnodeop_desc *a_desc; vnode_t a_vp; - vnode_t a_tdvp; - struct componentname *a_cnp; + vnode_t *a_svpp; + const char *a_name; + enum nsoperation a_operation; + int a_flags; vfs_context_t a_context; } */ *ap) { vfs_context_t ctx = ap->a_context; - vnode_t vp = ap->a_vp; - vnode_t tdvp = ap->a_tdvp; - struct componentname *cnp = ap->a_cnp; - int error = 0, lockerror = ENOENT, status; struct nfsmount *nmp; - nfsnode_t np = VTONFS(vp); - nfsnode_t tdnp = VTONFS(tdvp); - int nfsvers, numops; - u_int64_t xid, savedxid; - struct nfsm_chain nmreq, nmrep; - - if (vnode_mount(vp) != vnode_mount(tdvp)) - return (EXDEV); + struct nfs_vattr nvattr; + struct componentname cn; + nfsnode_t anp; + int error = 0; - nmp = VTONMP(vp); + nmp = VTONMP(ap->a_vp); if (!nmp) return (ENXIO); - nfsvers = nmp->nm_vers; - - /* - * Push all writes to the server, so that the attribute cache - * doesn't get "out of sync" with the server. - * XXX There should be a better way! - */ - nfs_flush(np, MNT_WAIT, vfs_context_thread(ctx), V_IGNORE_WRITEERR); - if ((error = nfs_node_set_busy2(tdnp, np, vfs_context_thread(ctx)))) + if (!(nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR)) + return (ENOTSUP); + error = nfs_getattr(VTONFS(ap->a_vp), &nvattr, ctx, NGA_CACHED); + if (error) return (error); + if (NFS_BITMAP_ISSET(nvattr.nva_bitmap, NFS_FATTR_NAMED_ATTR) && + !(nvattr.nva_flags & NFS_FFLAG_HAS_NAMED_ATTRS)) + return (ENOATTR); - nfsm_chain_null(&nmreq); - nfsm_chain_null(&nmrep); - - // PUTFH(SOURCE), SAVEFH, PUTFH(DIR), LINK, GETATTR(DIR), RESTOREFH, GETATTR - numops = 7; - nfsm_chain_build_alloc_init(error, &nmreq, 29 * NFSX_UNSIGNED + cnp->cn_namelen); - nfsm_chain_add_compound_header(error, &nmreq, "link", numops); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); - nfsm_chain_add_fh(error, &nmreq, nfsvers, np->n_fhp, np->n_fhsize); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_SAVEFH); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); - nfsm_chain_add_fh(error, &nmreq, nfsvers, tdnp->n_fhp, tdnp->n_fhsize); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_LINK); - nfsm_chain_add_string(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_RESTOREFH); - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - nfsm_chain_add_bitmap_masked(error, &nmreq, nfs_getattr_bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); - nfsm_chain_build_done(error, &nmreq); - nfsm_assert(error, (numops == 0), EPROTO); - nfsmout_if(error); - error = nfs_request(tdnp, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, &xid, &status); - - if ((lockerror = nfs_node_lock2(tdnp, np))) { - error = lockerror; - goto nfsmout; - } - nfsm_chain_skip_tag(error, &nmrep); - nfsm_chain_get_32(error, &nmrep, numops); - nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); - nfsm_chain_op_check(error, &nmrep, NFS_OP_SAVEFH); - nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); - nfsm_chain_op_check(error, &nmrep, NFS_OP_LINK); - nfsm_chain_check_change_info(error, &nmrep, tdnp); - /* directory attributes: if we don't get them, make sure to invalidate */ - nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - savedxid = xid; - nfsm_chain_loadattr(error, &nmrep, tdnp, nfsvers, NULL, &xid); - if (error) - NATTRINVALIDATE(tdnp); - /* link attributes: if we don't get them, make sure 
to invalidate */ - nfsm_chain_op_check(error, &nmrep, NFS_OP_RESTOREFH); - nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); - xid = savedxid; - nfsm_chain_loadattr(error, &nmrep, np, nfsvers, NULL, &xid); - if (error) - NATTRINVALIDATE(np); -nfsmout: - nfsm_chain_cleanup(&nmreq); - nfsm_chain_cleanup(&nmrep); - if (!lockerror) - tdnp->n_flag |= NMODIFIED; - /* Kludge: Map EEXIST => 0 assuming that it is a reply to a retry. */ - if (error == EEXIST) - error = 0; - if (!error && (tdnp->n_flag & NNEGNCENTRIES)) { - tdnp->n_flag &= ~NNEGNCENTRIES; - cache_purge_negatives(tdvp); - } - if (!lockerror) - nfs_node_unlock2(tdnp, np); - nfs_node_clear_busy2(tdnp, np); + bzero(&cn, sizeof(cn)); + cn.cn_nameptr = __CAST_AWAY_QUALIFIER(ap->a_name, const, char *); + cn.cn_namelen = strlen(ap->a_name); + cn.cn_nameiop = LOOKUP; + cn.cn_flags = MAKEENTRY; + + error = nfs4_named_attr_get(VTONFS(ap->a_vp), &cn, NFS_OPEN_SHARE_ACCESS_NONE, + 0, ctx, &anp, NULL); + if ((!error && !anp) || (error == ENOENT)) + error = ENOATTR; + if (!error && anp) + *ap->a_svpp = NFSTOV(anp); + else if (anp) + vnode_put(NFSTOV(anp)); return (error); } int -nfs4_vnop_rmdir( - struct vnop_rmdir_args /* { +nfs4_vnop_makenamedstream( + struct vnop_makenamedstream_args /* { struct vnodeop_desc *a_desc; - vnode_t a_dvp; + vnode_t *a_svpp; vnode_t a_vp; - struct componentname *a_cnp; + const char *a_name; + int a_flags; vfs_context_t a_context; } */ *ap) { vfs_context_t ctx = ap->a_context; - vnode_t vp = ap->a_vp; - vnode_t dvp = ap->a_dvp; - struct componentname *cnp = ap->a_cnp; + struct nfsmount *nmp; + struct componentname cn; + nfsnode_t anp; int error = 0; - nfsnode_t np = VTONFS(vp); - nfsnode_t dnp = VTONFS(dvp); - struct nfs_vattr dnvattr; - struct nfs_dulookup dul; - if (vnode_vtype(vp) != VDIR) - return (EINVAL); + nmp = VTONMP(ap->a_vp); + if (!nmp) + return (ENXIO); - if ((error = nfs_node_set_busy2(dnp, np, vfs_context_thread(ctx)))) - return (error); + if (!(nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR)) + return (ENOTSUP); - nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); - nfs_dulookup_start(&dul, dnp, ctx); + bzero(&cn, sizeof(cn)); + cn.cn_nameptr = __CAST_AWAY_QUALIFIER(ap->a_name, const, char *); + cn.cn_namelen = strlen(ap->a_name); + cn.cn_nameiop = CREATE; + cn.cn_flags = MAKEENTRY; + + error = nfs4_named_attr_get(VTONFS(ap->a_vp), &cn, NFS_OPEN_SHARE_ACCESS_BOTH, + NFS_GET_NAMED_ATTR_CREATE, ctx, &anp, NULL); + if ((!error && !anp) || (error == ENOENT)) + error = ENOATTR; + if (!error && anp) + *ap->a_svpp = NFSTOV(anp); + else if (anp) + vnode_put(NFSTOV(anp)); + return (error); +} - error = nfs4_remove_rpc(dnp, cnp->cn_nameptr, cnp->cn_namelen, - vfs_context_thread(ctx), vfs_context_ucred(ctx)); +int +nfs4_vnop_removenamedstream( + struct vnop_removenamedstream_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + vnode_t a_svp; + const char *a_name; + int a_flags; + vfs_context_t a_context; + } */ *ap) +{ + struct nfsmount *nmp = VTONMP(ap->a_vp); + nfsnode_t np = ap->a_vp ? VTONFS(ap->a_vp) : NULL; + nfsnode_t anp = ap->a_svp ? VTONFS(ap->a_svp) : NULL; - nfs_name_cache_purge(dnp, np, cnp, ctx); - /* nfs_getattr() will check changed and purge caches */ - nfs_getattr(dnp, &dnvattr, ctx, NGA_CACHED); - nfs_dulookup_finish(&dul, dnp, ctx); - nfs_node_clear_busy2(dnp, np); + if (!nmp) + return (ENXIO); /* - * Kludge: Map ENOENT => 0 assuming that you have a reply to a retry. + * Given that a_svp is a named stream, checking for + * named attribute support is kinda pointless. 
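/*
 * Illustrative sketch (not part of the patch): named streams surface in
 * user space through the "..namedfork" pseudo-path (_PATH_FORKSPECIFIER
 * used earlier in this patch), so a file's resource fork can be read like
 * an ordinary file.  A minimal example -- the path is only an example and
 * the open succeeds only for a file that actually has a resource fork:
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	char buf[512];
	ssize_t n;
	int fd = open("/Volumes/nfs/file/..namedfork/rsrc", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return (1);
	}
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, (size_t)n, stdout);
	close(fd);
	return (0);
}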
*/ - if (error == ENOENT) - error = 0; - if (!error) { - /* - * remove nfsnode from hash now so we can't accidentally find it - * again if another object gets created with the same filehandle - * before this vnode gets reclaimed - */ - lck_mtx_lock(nfs_node_hash_mutex); - if (np->n_hflag & NHHASHED) { - LIST_REMOVE(np, n_hash); - np->n_hflag &= ~NHHASHED; - FSDBG(266, 0, np, np->n_flag, 0xb1eb1e); - } - lck_mtx_unlock(nfs_node_hash_mutex); - } - return (error); + if (!(nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR)) + return (ENOTSUP); + + return (nfs4_named_attr_remove(np, anp, ap->a_name, ap->a_context)); } +#endif diff --git a/bsd/nfs/nfs_bio.c b/bsd/nfs/nfs_bio.c index 1c1c19123..4bd1bff61 100644 --- a/bsd/nfs/nfs_bio.c +++ b/bsd/nfs/nfs_bio.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -77,6 +77,7 @@ #include <sys/kernel.h> #include <sys/ubc_internal.h> #include <sys/uio_internal.h> +#include <sys/kpi_mbuf.h> #include <sys/vm.h> #include <sys/vmparam.h> @@ -684,6 +685,21 @@ nfs_buf_get( loop: lck_mtx_lock(nfs_buf_mutex); + /* wait for any buffer invalidation/flushing to complete */ + while (np->n_bflag & NBINVALINPROG) { + np->n_bflag |= NBINVALWANT; + ts.tv_sec = 2; + ts.tv_nsec = 0; + msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_buf_get_invalwait", &ts); + if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { + lck_mtx_unlock(nfs_buf_mutex); + FSDBG_BOT(541, np, blkno, 0, error); + return (error); + } + if (np->n_bflag & NBINVALINPROG) + slpflag = 0; + } + /* check for existence of nfsbuf in cache */ if ((bp = nfs_buf_incore(np, blkno))) { /* if busy, set wanted and wait */ @@ -1041,8 +1057,8 @@ pagelist_cleanup_done: if (start < NBOFF(bp)) start = NBOFF(bp); if (end > start) { - if (!(rv = ubc_sync_range(vp, start, end, UBC_INVALIDATE))) - printf("nfs_buf_release(): ubc_sync_range failed!\n"); + if ((rv = ubc_msync(vp, start, end, NULL, UBC_INVALIDATE))) + printf("nfs_buf_release(): ubc_msync failed!, error %d\n", rv); } } CLR(bp->nb_flags, NB_PAGELIST); @@ -1508,7 +1524,7 @@ nfs_buf_read_finish(struct nfsbuf *bp) bp->nb_valid = (1 << (round_page_32(bp->nb_validend) / PAGE_SIZE)) - 1; if (bp->nb_validend & PAGE_MASK) { /* zero-fill remainder of last page */ - bzero(bp->nb_data + bp->nb_validend, bp->nb_bufsize - bp->nb_validend); + bzero(bp->nb_data + bp->nb_validend, PAGE_SIZE - (bp->nb_validend & PAGE_MASK)); } } nfs_buf_iodone(bp); @@ -1649,6 +1665,8 @@ finish: kauth_cred_ref(cred); cb = req->r_callback; bp = cb.rcb_bp; + if (cb.rcb_func) /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */ + nfs_request_ref(req, 0); nmp = NFSTONMP(np); if (!nmp) { @@ -1673,23 +1691,55 @@ finish: error = nmp->nm_funcs->nf_read_rpc_async_finish(np, req, auio, &rlen, &eof); if ((error == EINPROGRESS) && cb.rcb_func) { /* async request restarted */ + if (cb.rcb_func) + nfs_request_rele(req); if (IS_VALID_CRED(cred)) kauth_cred_unref(&cred); return; } if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) { lck_mtx_lock(&nmp->nm_lock); - if ((error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid) && !(nmp->nm_state & NFSSTA_RECOVER)) { - printf("nfs_buf_read_rpc_finish: error %d, initiating recovery\n", error); - nmp->nm_state |= NFSSTA_RECOVER; - nfs_mount_sock_thread_wake(nmp); + if ((error != NFSERR_OLD_STATEID) && (error != 
NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) { + NP(np, "nfs_buf_read_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery", + error, NBOFF(bp)+offset, cb.rcb_args[2], nmp->nm_stategenid); + nfs_need_recover(nmp, error); } lck_mtx_unlock(&nmp->nm_lock); - if (error == NFSERR_GRACE) - tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); - if (!(error = nfs_mount_state_wait_for_recovery(nmp))) { - rlen = 0; - goto readagain; + if (np->n_flag & NREVOKE) { + error = EIO; + } else { + if (error == NFSERR_GRACE) { + if (cb.rcb_func) { + /* + * For an async I/O request, handle a grace delay just like + * jukebox errors. Set the resend time and queue it up. + */ + struct timeval now; + if (req->r_nmrep.nmc_mhead) { + mbuf_freem(req->r_nmrep.nmc_mhead); + req->r_nmrep.nmc_mhead = NULL; + } + req->r_error = 0; + microuptime(&now); + lck_mtx_lock(&req->r_mtx); + req->r_resendtime = now.tv_sec + 2; + req->r_xid = 0; // get a new XID + req->r_flags |= R_RESTART; + req->r_start = 0; + nfs_asyncio_resend(req); + lck_mtx_unlock(&req->r_mtx); + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + /* Note: nfsreq reference taken will be dropped later when finished */ + return; + } + /* otherwise, just pause a couple seconds and retry */ + tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); + } + if (!(error = nfs_mount_state_wait_for_recovery(nmp))) { + rlen = 0; + goto readagain; + } } } if (error) { @@ -1734,6 +1784,7 @@ readagain: rreq = NULL; goto finish; } + nfs_request_rele(req); /* * We're done here. * Outstanding RPC count is unchanged. @@ -1746,6 +1797,8 @@ readagain: } out: + if (cb.rcb_func) + nfs_request_rele(req); if (IS_VALID_CRED(cred)) kauth_cred_unref(&cred); @@ -1786,7 +1839,8 @@ nfs_buf_readahead(nfsnode_t np, int ioflag, daddr64_t *rabnp, daddr64_t lastrabn { struct nfsmount *nmp = NFSTONMP(np); struct nfsbuf *bp; - int error = 0, nra; + int error = 0; + uint32_t nra; if (!nmp) return (ENXIO); @@ -1842,7 +1896,6 @@ nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx) { vnode_t vp = NFSTOV(np); struct nfsbuf *bp = NULL; - struct nfs_vattr nvattr; struct nfsmount *nmp = VTONMP(vp); daddr64_t lbn, rabn = 0, lastrabn, maxrabn = -1; off_t diff; @@ -1903,7 +1956,7 @@ nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx) modified = (np->n_flag & NMODIFIED); nfs_node_unlock(np); /* nfs_getattr() will check changed and purge caches */ - error = nfs_getattr(np, &nvattr, ctx, modified ? NGA_UNCACHED : NGA_CACHED); + error = nfs_getattr(np, NULL, ctx, modified ? NGA_UNCACHED : NGA_CACHED); if (error) { FSDBG_BOT(514, np, 0xd1e0004, 0, error); return (error); @@ -1986,6 +2039,12 @@ nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx) np->n_lastread = (uio_offset(uio) - 1) / biosize; nfs_node_unlock(np); + if ((uio_resid(uio) <= 0) || (uio_offset(uio) >= (off_t)np->n_size)) { + nfs_data_unlock(np); + FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), 0xaaaaaaaa); + return (0); + } + /* adjust readahead block number, if necessary */ if (rabn < lbn) rabn = lbn; @@ -2000,12 +2059,6 @@ nfs_bioread(nfsnode_t np, uio_t uio, int ioflag, vfs_context_t ctx) readaheads = 1; } - if ((uio_resid(uio) <= 0) || (uio_offset(uio) >= (off_t)np->n_size)) { - nfs_data_unlock(np); - FSDBG_BOT(514, np, uio_offset(uio), uio_resid(uio), 0xaaaaaaaa); - return (0); - } - OSAddAtomic(1, &nfsstats.biocache_reads); /* @@ -2182,7 +2235,7 @@ buffer_ready: int nfs_async_write_start(struct nfsmount *nmp) { - int error = 0, slpflag = (nmp->nm_flag & NFSMNT_INT) ? 
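/*
 * Illustrative sketch (not part of the patch): the async path below requeues
 * the request for resend about two seconds out (with a fresh XID), while the
 * synchronous fallback simply sleeps 2*hz and retries.  A simplified
 * user-space version of that pause-and-retry shape; ERR_GRACE and do_rpc
 * are hypothetical stand-ins:
 */
#include <unistd.h>

#define ERR_GRACE	(-1)	/* stand-in for NFSERR_GRACE */

static int
retry_through_grace(int (*do_rpc)(void), int max_tries)
{
	int error;

	while (((error = do_rpc()) == ERR_GRACE) && (--max_tries > 0))
		sleep(2);	/* server still in its reclaim grace period */
	return (error);
}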
PCATCH : 0; + int error = 0, slpflag = NMFLAG(nmp, INTR) ? PCATCH : 0; struct timespec ts = {1, 0}; if (nfs_max_async_writes <= 0) @@ -2301,7 +2354,7 @@ nfs_buf_write(struct nfsbuf *bp) } SET(bp->nb_flags, NB_WRITEINPROG); error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp) + bp->nb_dirtyoff, - bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred); + bp->nb_dirtyend - bp->nb_dirtyoff, bp->nb_wcred, bp->nb_verf); CLR(bp->nb_flags, NB_WRITEINPROG); if (error) { if (error != NFSERR_STALEWRITEVERF) { @@ -2610,7 +2663,7 @@ again: CLR(bp->nb_flags, NB_WRITEINPROG); if (!error && (commit != NFS_WRITE_FILESYNC)) { - error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred); + error = nmp->nm_funcs->nf_commit_rpc(np, NBOFF(bp), bp->nb_bufsize, cred, wverf); if (error == NFSERR_STALEWRITEVERF) { /* verifier changed, so we need to restart all the writes */ iomode = NFS_WRITE_FILESYNC; @@ -2731,6 +2784,9 @@ nfs_buf_write_rpc(struct nfsbuf *bp, int iomode, thread_t thd, kauth_cred_t cred } else { nfs_buf_write_finish(bp, thd, cred); } + /* It may have just been an interrupt... that's OK */ + if (!ISSET(bp->nb_flags, NB_ERROR)) + error = 0; } return (error); @@ -2765,6 +2821,8 @@ finish: kauth_cred_ref(cred); cb = req->r_callback; bp = cb.rcb_bp; + if (cb.rcb_func) /* take an extra reference on the nfsreq in case we want to resend it later due to grace error */ + nfs_request_ref(req, 0); nmp = NFSTONMP(np); if (!nmp) { @@ -2785,23 +2843,55 @@ finish: error = nmp->nm_funcs->nf_write_rpc_async_finish(np, req, &committed, &rlen, &wverf); if ((error == EINPROGRESS) && cb.rcb_func) { /* async request restarted */ + if (cb.rcb_func) + nfs_request_rele(req); if (IS_VALID_CRED(cred)) kauth_cred_unref(&cred); return; } if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && !ISSET(bp->nb_flags, NB_ERROR)) { lck_mtx_lock(&nmp->nm_lock); - if ((error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid) && !(nmp->nm_state & NFSSTA_RECOVER)) { - printf("nfs_buf_write_rpc_finish: error %d, initiating recovery\n", error); - nmp->nm_state |= NFSSTA_RECOVER; - nfs_mount_sock_thread_wake(nmp); + if ((error != NFSERR_OLD_STATEID) && (error != NFSERR_GRACE) && (cb.rcb_args[2] == nmp->nm_stategenid)) { + NP(np, "nfs_buf_write_rpc_finish: error %d @ 0x%llx, 0x%x 0x%x, initiating recovery", + error, NBOFF(bp)+offset, cb.rcb_args[2], nmp->nm_stategenid); + nfs_need_recover(nmp, error); } lck_mtx_unlock(&nmp->nm_lock); - if (error == NFSERR_GRACE) - tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); - if (!(error = nfs_mount_state_wait_for_recovery(nmp))) { - rlen = 0; - goto writeagain; + if (np->n_flag & NREVOKE) { + error = EIO; + } else { + if (error == NFSERR_GRACE) { + if (cb.rcb_func) { + /* + * For an async I/O request, handle a grace delay just like + * jukebox errors. Set the resend time and queue it up. 
+ */ + struct timeval now; + if (req->r_nmrep.nmc_mhead) { + mbuf_freem(req->r_nmrep.nmc_mhead); + req->r_nmrep.nmc_mhead = NULL; + } + req->r_error = 0; + microuptime(&now); + lck_mtx_lock(&req->r_mtx); + req->r_resendtime = now.tv_sec + 2; + req->r_xid = 0; // get a new XID + req->r_flags |= R_RESTART; + req->r_start = 0; + nfs_asyncio_resend(req); + lck_mtx_unlock(&req->r_mtx); + if (IS_VALID_CRED(cred)) + kauth_cred_unref(&cred); + /* Note: nfsreq reference taken will be dropped later when finished */ + return; + } + /* otherwise, just pause a couple seconds and retry */ + tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); + } + if (!(error = nfs_mount_state_wait_for_recovery(nmp))) { + rlen = 0; + goto writeagain; + } } } if (error) { @@ -2863,6 +2953,7 @@ writeagain: wreq = NULL; goto finish; } + nfs_request_rele(req); /* * We're done here. * Outstanding RPC count is unchanged. @@ -2875,8 +2966,10 @@ writeagain: } out: - if (cb.rcb_func) + if (cb.rcb_func) { nfs_async_write_done(nmp); + nfs_request_rele(req); + } /* * Decrement outstanding RPC count on buffer * and call nfs_buf_write_finish on last RPC. @@ -2918,6 +3011,7 @@ nfs_flushcommits(nfsnode_t np, int nowait) struct nfsbuflists blist, commitlist; int error = 0, retv, wcred_set, flags, dirty; u_quad_t off, endoff, toff; + uint64_t wverf; u_int32_t count; kauth_cred_t wcred = NULL; @@ -2956,6 +3050,7 @@ nfs_flushcommits(nfsnode_t np, int nowait) if (nowait) flags |= NBI_NOWAIT; lck_mtx_lock(nfs_buf_mutex); + wverf = nmp->nm_verf; if (!nfs_buf_iterprepare(np, &blist, flags)) { while ((bp = LIST_FIRST(&blist))) { LIST_REMOVE(bp, nb_vnbufs); @@ -2965,8 +3060,8 @@ nfs_flushcommits(nfsnode_t np, int nowait) continue; if (ISSET(bp->nb_flags, NB_NEEDCOMMIT)) nfs_buf_check_write_verifier(np, bp); - if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) - != (NB_DELWRI | NB_NEEDCOMMIT))) { + if (((bp->nb_flags & (NB_DELWRI | NB_NEEDCOMMIT)) != (NB_DELWRI | NB_NEEDCOMMIT)) || + (bp->nb_verf != wverf)) { nfs_buf_drop(bp); continue; } @@ -3066,13 +3161,13 @@ nfs_flushcommits(nfsnode_t np, int nowait) count = 0; else count = (endoff - off); - retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred); + retv = nmp->nm_funcs->nf_commit_rpc(np, off, count, wcred, wverf); } else { retv = 0; LIST_FOREACH(bp, &commitlist, nb_vnbufs) { toff = NBOFF(bp) + bp->nb_dirtyoff; count = bp->nb_dirtyend - bp->nb_dirtyoff; - retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred); + retv = nmp->nm_funcs->nf_commit_rpc(np, toff, count, bp->nb_wcred, wverf); if (retv) break; } @@ -3161,7 +3256,7 @@ nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr) goto out; } nfsvers = nmp->nm_vers; - if (nmp->nm_flag & NFSMNT_INT) + if (NMFLAG(nmp, INTR)) slpflag = PCATCH; if (!LIST_EMPTY(&np->n_dirtyblkhd)) { @@ -3173,8 +3268,9 @@ nfs_flush(nfsnode_t np, int waitfor, thread_t thd, int ignore_writeerr) lck_mtx_lock(nfs_buf_mutex); while (np->n_bflag & NBFLUSHINPROG) { np->n_bflag |= NBFLUSHWANT; - msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL); - if ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0))) { + error = msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_flush", NULL); + if ((error && (error != EWOULDBLOCK)) || + ((error = nfs_sigintr(NFSTONMP(np), NULL, thd, 0)))) { lck_mtx_unlock(nfs_buf_mutex); goto out; } @@ -3458,8 +3554,10 @@ nfs_vinvalbuf_internal( if (error) { FSDBG(554, bp, 0xd00dee, 0xbad, error); nfs_node_lock_force(np); - np->n_error = error; - np->n_flag |= NWRITEERR; + if ((error != EINTR) && (error != 
ERESTART)) { + np->n_error = error; + np->n_flag |= NWRITEERR; + } /* * There was a write error and we need to * invalidate attrs to sync with server. @@ -3468,7 +3566,7 @@ nfs_vinvalbuf_internal( */ NATTRINVALIDATE(np); nfs_node_unlock(np); - if (error == EINTR) { + if ((error == EINTR) || (error == ERESTART)) { /* * Abort on EINTR. If we don't, we could * be stuck in this loop forever because @@ -3521,12 +3619,13 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf { nfsnode_t np = VTONFS(vp); struct nfsmount *nmp = VTONMP(vp); - int error, rv, slpflag, slptimeo, nflags; + int error, slpflag, slptimeo, nflags, retry = 0; + struct timespec ts = { 2, 0 }; off_t size; FSDBG_TOP(554, np, flags, intrflg, 0); - if (nmp && !(nmp->nm_flag & NFSMNT_INT)) + if (nmp && !NMFLAG(nmp, INTR)) intrflg = 0; if (intrflg) { slpflag = PCATCH; @@ -3540,16 +3639,19 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf lck_mtx_lock(nfs_buf_mutex); while (np->n_bflag & NBINVALINPROG) { np->n_bflag |= NBINVALWANT; - msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", NULL); + msleep(&np->n_bflag, nfs_buf_mutex, slpflag, "nfs_vinvalbuf", &ts); if ((error = nfs_sigintr(VTONMP(vp), NULL, thd, 0))) { lck_mtx_unlock(nfs_buf_mutex); return (error); } + if (np->n_bflag & NBINVALINPROG) + slpflag = 0; } np->n_bflag |= NBINVALINPROG; lck_mtx_unlock(nfs_buf_mutex); /* Now, flush as required. */ +again: error = nfs_vinvalbuf_internal(np, flags, thd, cred, slpflag, 0); while (error) { FSDBG(554, np, 0, 0, error); @@ -3560,8 +3662,15 @@ nfs_vinvalbuf2(vnode_t vp, int flags, thread_t thd, kauth_cred_t cred, int intrf /* get the pages out of vm also */ if (UBCINFOEXISTS(vp) && (size = ubc_getsize(vp))) - if (!(rv = ubc_sync_range(vp, 0, size, UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE))) - panic("nfs_vinvalbuf(): ubc_sync_range failed!"); + if ((error = ubc_msync(vp, 0, size, NULL, UBC_PUSHALL | UBC_SYNC | UBC_INVALIDATE))) { + if (error == EINVAL) + panic("nfs_vinvalbuf(): ubc_msync failed!, error %d", error); + if (retry++ < 10) /* retry invalidating a few times */ + goto again; + /* give up */ + printf("nfs_vinvalbuf(): ubc_msync failed!, error %d", error); + + } done: lck_mtx_lock(nfs_buf_mutex); nflags = np->n_bflag; @@ -3574,6 +3683,57 @@ done: return (error); } +/* + * Wait for any busy buffers to complete. 
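The invalidation path above switches from ubc_sync_range() to ubc_msync() and, instead of panicking on any failure, panics only on EINVAL and otherwise retries the flush a bounded number of times before logging and giving up. A compact stand-alone sketch of that retry shape follows; invalidate_pages() is an invented stand-in for the real ubc_msync() call.

    #include <errno.h>
    #include <stdio.h>

    /* Pretend the page invalidation keeps failing with a transient error. */
    static int invalidate_pages(void)
    {
        return EBUSY;
    }

    static int invalidate_with_retry(void)
    {
        int error, retry = 0;
    again:
        error = invalidate_pages();
        if (error) {
            if (error == EINVAL)
                return error;      /* the kernel panics here instead */
            if (retry++ < 10)
                goto again;        /* retry invalidating a few times */
            fprintf(stderr, "invalidate failed, error %d\n", error);
        }
        return error;
    }

    int main(void)
    {
        return invalidate_with_retry() ? 1 : 0;
    }
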
+ */ +void +nfs_wait_bufs(nfsnode_t np) +{ + struct nfsbuf *bp; + struct nfsbuflists blist; + int error = 0; + + lck_mtx_lock(nfs_buf_mutex); + if (!nfs_buf_iterprepare(np, &blist, NBI_CLEAN)) { + while ((bp = LIST_FIRST(&blist))) { + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_cleanblkhd, bp, nb_vnbufs); + nfs_buf_refget(bp); + while ((error = nfs_buf_acquire(bp, 0, 0, 0))) { + if (error != EAGAIN) { + nfs_buf_refrele(bp); + nfs_buf_itercomplete(np, &blist, NBI_CLEAN); + lck_mtx_unlock(nfs_buf_mutex); + return; + } + } + nfs_buf_refrele(bp); + nfs_buf_drop(bp); + } + nfs_buf_itercomplete(np, &blist, NBI_CLEAN); + } + if (!nfs_buf_iterprepare(np, &blist, NBI_DIRTY)) { + while ((bp = LIST_FIRST(&blist))) { + LIST_REMOVE(bp, nb_vnbufs); + LIST_INSERT_HEAD(&np->n_dirtyblkhd, bp, nb_vnbufs); + nfs_buf_refget(bp); + while ((error = nfs_buf_acquire(bp, 0, 0, 0))) { + if (error != EAGAIN) { + nfs_buf_refrele(bp); + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); + lck_mtx_unlock(nfs_buf_mutex); + return; + } + } + nfs_buf_refrele(bp); + nfs_buf_drop(bp); + } + nfs_buf_itercomplete(np, &blist, NBI_DIRTY); + } + lck_mtx_unlock(nfs_buf_mutex); +} + + /* * Add an async I/O request to the mount's async I/O queue and make * sure that an nfsiod will service it. diff --git a/bsd/nfs/nfs_boot.c b/bsd/nfs/nfs_boot.c index 33bc25128..7fcd73bee 100644 --- a/bsd/nfs/nfs_boot.c +++ b/bsd/nfs/nfs_boot.c @@ -177,13 +177,7 @@ static int get_file_handle(struct nfs_dlmount *ndmntp); #define IP_CH(ip) ((u_char *)ip) #define IP_LIST(ip) IP_CH(ip)[0],IP_CH(ip)[1],IP_CH(ip)[2],IP_CH(ip)[3] -extern boolean_t -netboot_iaddr(struct in_addr * iaddr_p); - -extern boolean_t -netboot_rootpath(struct in_addr * server_ip, - char * name, int name_len, - char * path, int path_len); +#include <sys/netboot.h> /* * Called with an empty nfs_diskless struct to be filled in. diff --git a/bsd/nfs/nfs_gss.c b/bsd/nfs/nfs_gss.c index b8dbbb4a2..c848bfae6 100644 --- a/bsd/nfs/nfs_gss.c +++ b/bsd/nfs/nfs_gss.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2009 Apple Inc. All rights reserved. + * Copyright (c) 2007-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -147,8 +147,8 @@ int nfs_single_des; * These octet strings are used to encode/decode ASN.1 tokens * in the RPCSEC_GSS verifiers. 
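The krb5_mech bytes immediately below are the DER encoding of the Kerberos V5 GSS mechanism OID 1.2.840.113554.1.2.2: a 0x06 tag, a length of 9, then the base-128 arc encoding, with the first content octet packing the first two arcs. A quick stand-alone decoder, for illustration only:

    #include <stdio.h>

    static unsigned char krb5_mech[11] =
        { 0x06, 0x09, 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x12, 0x01, 0x02, 0x02 };

    int main(void)
    {
        unsigned char *p = krb5_mech + 2;      /* skip the tag and length */
        int i, len = krb5_mech[1];
        unsigned long arc = 0;

        printf("%d.%d", p[0] / 40, p[0] % 40); /* first octet packs two arcs */
        for (i = 1; i < len; i++) {
            arc = (arc << 7) | (p[i] & 0x7f);
            if (!(p[i] & 0x80)) {              /* high bit clear ends an arc */
                printf(".%lu", arc);
                arc = 0;
            }
        }
        printf("\n");                          /* 1.2.840.113554.1.2.2 */
        return 0;
    }
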
*/ -static u_char krb5_tokhead[] = { 0x60, 0x23 }; -static u_char krb5_mech[] = { 0x06, 0x09, 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x12, 0x01, 0x02, 0x02 }; +static u_char krb5_tokhead[] __attribute__((unused)) = { 0x60, 0x23 }; + u_char krb5_mech[11] = { 0x06, 0x09, 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x12, 0x01, 0x02, 0x02 }; static u_char krb5_mic[] = { 0x01, 0x01, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff }; static u_char krb5_mic3[] = { 0x01, 0x01, 0x04, 0x00, 0xff, 0xff, 0xff, 0xff }; static u_char krb5_wrap[] = { 0x02, 0x01, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff }; @@ -184,11 +184,11 @@ static u_char iv0[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; // DES static int nfs_gss_clnt_ctx_find(struct nfsreq *); static int nfs_gss_clnt_ctx_failover(struct nfsreq *); static int nfs_gss_clnt_ctx_init(struct nfsreq *, struct nfs_gss_clnt_ctx *); +static int nfs_gss_clnt_ctx_init_retry(struct nfsreq *, struct nfs_gss_clnt_ctx *); static int nfs_gss_clnt_ctx_callserver(struct nfsreq *, struct nfs_gss_clnt_ctx *); static char *nfs_gss_clnt_svcname(struct nfsmount *); static int nfs_gss_clnt_gssd_upcall(struct nfsreq *, struct nfs_gss_clnt_ctx *); static void nfs_gss_clnt_ctx_remove(struct nfsmount *, struct nfs_gss_clnt_ctx *); -static int nfs_gss_clnt_ctx_delay(struct nfsreq *, int *); #endif /* NFSCLIENT */ #if NFSSERVER @@ -253,6 +253,25 @@ nfs_gss_init(void) #if NFSCLIENT +/* + * Is it OK to fall back to using AUTH_SYS? + */ +static int +nfs_gss_sysok(struct nfsreq *req) +{ + struct nfsmount *nmp = req->r_nmp; + int i; + + if (req->r_wrongsec) /* Not OK if we're trying to handle a wrongsec error */ + return (0); + if (!nmp->nm_sec.count) /* assume it's OK if we don't have a set of flavors */ + return (1); + for (i=0; i < nmp->nm_sec.count; i++) + if (nmp->nm_sec.flavors[i] == RPCAUTH_SYS) + return (1); + return (0); +} + /* * Find the context for a particular user. * @@ -269,15 +288,14 @@ nfs_gss_clnt_ctx_find(struct nfsreq *req) struct nfs_gss_clnt_ctx *cp; uid_t uid = kauth_cred_getuid(req->r_cred); int error = 0; - int retrycnt = 0; lck_mtx_lock(&nmp->nm_lock); TAILQ_FOREACH(cp, &nmp->nm_gsscl, gss_clnt_entries) { if (cp->gss_clnt_uid == uid) { if (cp->gss_clnt_flags & GSS_CTX_INVAL) continue; - lck_mtx_unlock(&nmp->nm_lock); nfs_gss_clnt_ctx_ref(req, cp); + lck_mtx_unlock(&nmp->nm_lock); return (0); } } @@ -292,8 +310,8 @@ nfs_gss_clnt_ctx_find(struct nfsreq *req) */ TAILQ_FOREACH(cp, &nmp->nm_gsscl, gss_clnt_entries) { if (!(cp->gss_clnt_flags & GSS_CTX_INVAL)) { - lck_mtx_unlock(&nmp->nm_lock); nfs_gss_clnt_ctx_ref(req, cp); + lck_mtx_unlock(&nmp->nm_lock); return (0); } } @@ -310,7 +328,7 @@ nfs_gss_clnt_ctx_find(struct nfsreq *req) * to failover to sec=sys. 
*/ if (req->r_thread == NULL) { - if (nmp->nm_flag & NFSMNT_SECSYSOK) { + if (nfs_gss_sysok(req)) { error = nfs_gss_clnt_ctx_failover(req); } else { printf("nfs_gss_clnt_ctx_find: no context for async\n"); @@ -334,29 +352,7 @@ nfs_gss_clnt_ctx_find(struct nfsreq *req) TAILQ_INSERT_TAIL(&nmp->nm_gsscl, cp, gss_clnt_entries); lck_mtx_unlock(&nmp->nm_lock); -retry: - error = nfs_gss_clnt_ctx_init(req, cp); - if (error == ENEEDAUTH) { - error = nfs_gss_clnt_ctx_delay(req, &retrycnt); - if (!error) - goto retry; - - /* Giving up on this context */ - cp->gss_clnt_flags |= GSS_CTX_INVAL; - - /* - * Wake any threads waiting to use the context - */ - lck_mtx_lock(cp->gss_clnt_mtx); - cp->gss_clnt_thread = NULL; - if (cp->gss_clnt_flags & GSS_NEEDCTX) { - cp->gss_clnt_flags &= ~GSS_NEEDCTX; - wakeup(cp); - } - lck_mtx_unlock(cp->gss_clnt_mtx); - - } - + error = nfs_gss_clnt_ctx_init_retry(req, cp); // Initialize new context if (error) nfs_gss_clnt_ctx_unref(req); @@ -367,7 +363,7 @@ retry: * up a dummy context that allows this user to attempt * sec=sys calls. */ - if (error && (nmp->nm_flag & NFSMNT_SECSYSOK) && + if (error && nfs_gss_sysok(req) && (error != ENXIO) && (error != ETIMEDOUT)) { lck_mtx_lock(&nmp->nm_lock); error = nfs_gss_clnt_ctx_failover(req); @@ -433,7 +429,7 @@ nfs_gss_clnt_cred_put(struct nfsreq *req, struct nfsm_chain *nmc, mbuf_t args) slpflag = (PZERO-1); if (req->r_nmp) { - slpflag |= ((req->r_nmp->nm_flag & NFSMNT_INT) && req->r_thread) ? PCATCH : 0; + slpflag |= (NMFLAG(req->r_nmp, INTR) && req->r_thread && !(req->r_flags & R_NOINTR)) ? PCATCH : 0; recordmark = (req->r_nmp->nm_sotype == SOCK_STREAM); } retry: @@ -483,6 +479,7 @@ retry: if (cp->gss_clnt_thread && cp->gss_clnt_thread != current_thread()) { cp->gss_clnt_flags |= GSS_NEEDCTX; msleep(cp, cp->gss_clnt_mtx, slpflag | PDROP, "ctxwait", NULL); + slpflag &= ~PCATCH; if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0))) return (error); nfs_gss_clnt_ctx_unref(req); @@ -504,6 +501,7 @@ retry: ((cp->gss_clnt_seqnum - cp->gss_clnt_seqwin) + 1) % cp->gss_clnt_seqwin)) { cp->gss_clnt_flags |= GSS_NEEDSEQ; msleep(cp, cp->gss_clnt_mtx, slpflag, "seqwin", NULL); + slpflag &= ~PCATCH; if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0))) { lck_mtx_unlock(cp->gss_clnt_mtx); return (error); @@ -995,9 +993,9 @@ nfs_gss_clnt_ctx_init(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) cp->gss_clnt_proc = RPCSEC_GSS_INIT; cp->gss_clnt_service = - nmp->nm_auth == RPCAUTH_KRB5 ? RPCSEC_GSS_SVC_NONE : - nmp->nm_auth == RPCAUTH_KRB5I ? RPCSEC_GSS_SVC_INTEGRITY : - nmp->nm_auth == RPCAUTH_KRB5P ? RPCSEC_GSS_SVC_PRIVACY : 0; + req->r_auth == RPCAUTH_KRB5 ? RPCSEC_GSS_SVC_NONE : + req->r_auth == RPCAUTH_KRB5I ? RPCSEC_GSS_SVC_INTEGRITY : + req->r_auth == RPCAUTH_KRB5P ? RPCSEC_GSS_SVC_PRIVACY : 0; cp->gss_clnt_gssd_flags = (nfs_single_des ? GSSD_NFS_1DES : 0); /* @@ -1055,7 +1053,9 @@ retry: /* * The context is apparently established successfully */ + lck_mtx_lock(cp->gss_clnt_mtx); cp->gss_clnt_flags |= GSS_CTX_COMPLETE; + lck_mtx_unlock(cp->gss_clnt_mtx); cp->gss_clnt_proc = RPCSEC_GSS_DATA; microuptime(&now); cp->gss_clnt_ctime = now.tv_sec; // time stamp @@ -1110,13 +1110,13 @@ nfsmout: * It will be removed when the reference count * drops to zero. 
*/ + lck_mtx_lock(cp->gss_clnt_mtx); if (error) cp->gss_clnt_flags |= GSS_CTX_INVAL; /* * Wake any threads waiting to use the context */ - lck_mtx_lock(cp->gss_clnt_mtx); cp->gss_clnt_thread = NULL; if (cp->gss_clnt_flags & GSS_NEEDCTX) { cp->gss_clnt_flags &= ~GSS_NEEDCTX; @@ -1127,6 +1127,77 @@ nfsmout: return (error); } +/* + * This function calls nfs_gss_clnt_ctx_init() to set up a new context. + * But if there's a failure in trying to establish the context it keeps + * retrying at progressively longer intervals in case the failure is + * due to some transient condition. For instance, the server might be + * failing the context setup because directory services is not coming + * up in a timely fashion. + */ +static int +nfs_gss_clnt_ctx_init_retry(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) +{ + struct nfsmount *nmp = req->r_nmp; + struct timeval now; + time_t waituntil; + int error, slpflag; + int retries = 0; + int timeo = NFS_TRYLATERDEL; + + if (nmp == NULL) { + error = ENXIO; + goto bad; + } + + /* For an "intr" mount allow a signal to interrupt the retries */ + slpflag = (NMFLAG(nmp, INTR) && !(req->r_flags & R_NOINTR)) ? PCATCH : 0; + + while ((error = nfs_gss_clnt_ctx_init(req, cp)) == ENEEDAUTH) { + microuptime(&now); + waituntil = now.tv_sec + timeo; + while (now.tv_sec < waituntil) { + tsleep(&lbolt, PSOCK | slpflag, "nfs_gss_clnt_ctx_init_retry", 0); + slpflag = 0; + error = nfs_sigintr(req->r_nmp, req, current_thread(), 0); + if (error) + goto bad; + microuptime(&now); + } + + retries++; + /* If it's a soft mount just give up after a while */ + if (NMFLAG(nmp, SOFT) && (retries > nmp->nm_retry)) { + error = ETIMEDOUT; + goto bad; + } + timeo *= 2; + if (timeo > 60) + timeo = 60; + } + + if (error == 0) + return 0; // success +bad: + /* + * Give up on this context + */ + lck_mtx_lock(cp->gss_clnt_mtx); + cp->gss_clnt_flags |= GSS_CTX_INVAL; + + /* + * Wake any threads waiting to use the context + */ + cp->gss_clnt_thread = NULL; + if (cp->gss_clnt_flags & GSS_NEEDCTX) { + cp->gss_clnt_flags &= ~GSS_NEEDCTX; + wakeup(cp); + } + lck_mtx_unlock(cp->gss_clnt_mtx); + + return error; +} + /* * Call the NFS server using a null procedure for context setup. * Even though it's a null procedure and nominally has no arguments @@ -1260,11 +1331,11 @@ static int nfs_gss_clnt_gssd_upcall(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) { kern_return_t kr; - byte_buffer okey = NULL; + gssd_byte_buffer okey = NULL; uint32_t skeylen = 0; int retry_cnt = 0; vm_map_copy_t itoken = NULL; - byte_buffer otoken = NULL; + gssd_byte_buffer otoken = NULL; mach_msg_type_number_t otokenlen; int error = 0; char uprinc[1]; @@ -1279,7 +1350,7 @@ nfs_gss_clnt_gssd_upcall(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) * the kernel is being compiled with -Wwrite-strings. 
*/ uprinc[0] = '\0'; - if (cp->gss_clnt_mport == NULL) { + if (!IPC_PORT_VALID(cp->gss_clnt_mport)) { kr = task_get_gssd_port(get_threadtask(req->r_thread), &cp->gss_clnt_mport); if (kr != KERN_SUCCESS) { printf("nfs_gss_clnt_gssd_upcall: can't get gssd port, status %x (%d)\n", kr, kr); @@ -1298,8 +1369,8 @@ nfs_gss_clnt_gssd_upcall(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp) retry: kr = mach_gss_init_sec_context( cp->gss_clnt_mport, - KRB5_MECH, - (byte_buffer) itoken, (mach_msg_type_number_t) cp->gss_clnt_tokenlen, + GSSD_KRB5_MECH, + (gssd_byte_buffer) itoken, (mach_msg_type_number_t) cp->gss_clnt_tokenlen, cp->gss_clnt_uid, uprinc, cp->gss_clnt_svcname, @@ -1512,8 +1583,8 @@ nfs_gss_clnt_ctx_remove(struct nfsmount *nmp, struct nfs_gss_clnt_ctx *cp) if (nmp != NULL) TAILQ_REMOVE(&nmp->nm_gsscl, cp, gss_clnt_entries); - if (cp->gss_clnt_mport) - task_release_special_port(cp->gss_clnt_mport); + task_release_special_port(cp->gss_clnt_mport); + if (cp->gss_clnt_mtx) lck_mtx_destroy(cp->gss_clnt_mtx, nfs_gss_clnt_grp); if (cp->gss_clnt_handle) @@ -1541,7 +1612,6 @@ nfs_gss_clnt_ctx_renew(struct nfsreq *req) int error = 0; uid_t saved_uid; mach_port_t saved_mport; - int retrycnt = 0; if (cp == NULL) return (0); @@ -1590,13 +1660,7 @@ nfs_gss_clnt_ctx_renew(struct nfsreq *req) nfs_gss_clnt_ctx_unref(req); nfs_gss_clnt_ctx_ref(req, ncp); -retry: - error = nfs_gss_clnt_ctx_init(req, ncp); // Initialize new context - if (error == ENEEDAUTH) { - error = nfs_gss_clnt_ctx_delay(req, &retrycnt); - if (!error) - goto retry; - } + error = nfs_gss_clnt_ctx_init_retry(req, ncp); // Initialize new context out: task_release_special_port(saved_mport); if (error) @@ -1610,17 +1674,13 @@ out: * The contexts are also destroyed by the server. */ void -nfs_gss_clnt_ctx_unmount(struct nfsmount *nmp, int mntflags) +nfs_gss_clnt_ctx_unmount(struct nfsmount *nmp) { struct nfs_gss_clnt_ctx *cp; - struct ucred temp_cred; - kauth_cred_t cred; struct nfsm_chain nmreq, nmrep; int error, status; struct nfsreq req; - bzero((caddr_t) &temp_cred, sizeof(temp_cred)); - temp_cred.cr_ngroups = 1; req.r_nmp = nmp; for (;;) { @@ -1637,9 +1697,14 @@ nfs_gss_clnt_ctx_unmount(struct nfsmount *nmp, int mntflags) * But don't bother if it's a forced unmount * or if it's a dummy sec=sys context. */ - if (!(mntflags & MNT_FORCE) && cp->gss_clnt_service != RPCSEC_GSS_SVC_SYS) { - temp_cred.cr_uid = cp->gss_clnt_uid; - cred = kauth_cred_create(&temp_cred); + if (!(nmp->nm_state & NFSSTA_FORCE) && (cp->gss_clnt_service != RPCSEC_GSS_SVC_SYS)) { + kauth_cred_t cred; + struct posix_cred temp_pcred; + + bzero((caddr_t) &temp_pcred, sizeof(temp_pcred)); + temp_pcred.cr_ngroups = 1; + temp_pcred.cr_uid = cp->gss_clnt_uid; + cred = posix_cred_create(&temp_pcred); cp->gss_clnt_proc = RPCSEC_GSS_DESTROY; error = 0; @@ -1660,48 +1725,13 @@ nfs_gss_clnt_ctx_unmount(struct nfsmount *nmp, int mntflags) * the reference to remove it if its * refcount is zero. */ + lck_mtx_lock(cp->gss_clnt_mtx); cp->gss_clnt_flags |= GSS_CTX_INVAL; + lck_mtx_unlock(cp->gss_clnt_mtx); nfs_gss_clnt_ctx_unref(&req); } } -/* - * If we get a failure in trying to establish a context we need to wait a - * little while to see if the server is feeling better. In our case this is - * probably a failure in directory services not coming up in a timely fashion. - * This routine sort of mimics receiving a jukebox error. 
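The retry loop in nfs_gss_clnt_ctx_init_retry() above doubles the delay after each ENEEDAUTH failure, caps it at 60 seconds, and on a soft mount gives up after nm_retry attempts. Here is a user-space sketch of the same backoff shape; ENEEDAUTH's numeric value and the 4-second starting delay are assumptions here, and try_init() is an invented stand-in for nfs_gss_clnt_ctx_init().

    #include <unistd.h>

    #define ENEEDAUTH 81   /* assumed value, standing in for the kernel errno */

    /* Fail the first two attempts, then succeed. */
    static int try_init(void)
    {
        static int calls;
        return (++calls < 3) ? ENEEDAUTH : 0;
    }

    static int init_with_backoff(int soft, int max_retries)
    {
        int retries = 0;
        int timeo = 4;             /* NFS_TRYLATERDEL-style starting delay */

        while (try_init() == ENEEDAUTH) {
            sleep(timeo);          /* the kernel naps on lbolt, interruptibly */
            retries++;
            if (soft && retries > max_retries)
                return -1;         /* ETIMEDOUT in the kernel */
            timeo *= 2;            /* progressively longer intervals... */
            if (timeo > 60)
                timeo = 60;        /* ...capped at one minute */
        }
        return 0;
    }

    int main(void)
    {
        return init_with_backoff(1, 10);   /* sleeps ~12s before succeeding */
    }
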
- */ -static int -nfs_gss_clnt_ctx_delay(struct nfsreq *req, int *retry) -{ - int timeo = (1 << *retry) * NFS_TRYLATERDEL; - int error = 0; - struct nfsmount *nmp = req->r_nmp; - struct timeval now; - time_t waituntil; - - if (!nmp) - return (ENXIO); - if ((nmp->nm_flag & NFSMNT_SOFT) && *retry > nmp->nm_retry) - return (ETIMEDOUT); - if (timeo > 60) - timeo = 60; - - microuptime(&now); - waituntil = now.tv_sec + timeo; - while (now.tv_sec < waituntil) { - tsleep(&lbolt, PSOCK, "nfs_gss_clnt_ctx_delay", 0); - error = nfs_sigintr(req->r_nmp, req, current_thread(), 0); - if (error) - break; - microuptime(&now); - } - *retry += 1; - - return (error); -} - - #endif /* NFSCLIENT */ /************* @@ -1733,7 +1763,7 @@ nfs_gss_svc_ctx_find(uint32_t handle) lck_mtx_lock(nfs_gss_svc_ctx_mutex); - LIST_FOREACH(cp, head, gss_svc_entries) + LIST_FOREACH(cp, head, gss_svc_entries) { if (cp->gss_svc_handle == handle) { if (timenow > cp->gss_svc_incarnation + GSS_SVC_CTX_TTL) { /* @@ -1743,14 +1773,20 @@ nfs_gss_svc_ctx_find(uint32_t handle) */ cp->gss_svc_handle = 0; /* - * Make sure though that we stay around for GSS_CTC_PEND seconds + * Make sure though that we stay around for GSS_CTX_PEND seconds * for other threads that might be using the context. */ cp->gss_svc_incarnation = timenow; + cp = NULL; + break; } + lck_mtx_lock(cp->gss_svc_mtx); + cp->gss_svc_refcnt++; + lck_mtx_unlock(cp->gss_svc_mtx); break; } + } lck_mtx_unlock(nfs_gss_svc_ctx_mutex); @@ -1765,10 +1801,26 @@ static void nfs_gss_svc_ctx_insert(struct nfs_gss_svc_ctx *cp) { struct nfs_gss_svc_ctx_hashhead *head; + struct nfs_gss_svc_ctx *p; + lck_mtx_lock(nfs_gss_svc_ctx_mutex); + + /* + * Give the client a random handle so that if we reboot + * it's unlikely the client will get a bad context match. + * Make sure it's not zero or already assigned. + */ +retry: + cp->gss_svc_handle = random(); + if (cp->gss_svc_handle == 0) + goto retry; head = &nfs_gss_svc_ctx_hashtbl[SVC_CTX_HASH(cp->gss_svc_handle)]; + LIST_FOREACH(p, head, gss_svc_entries) + if (p->gss_svc_handle == cp->gss_svc_handle) + goto retry; - lck_mtx_lock(nfs_gss_svc_ctx_mutex); + clock_interval_to_deadline(GSS_CTX_PEND, NSEC_PER_SEC, + &cp->gss_svc_incarnation); LIST_INSERT_HEAD(head, cp, gss_svc_entries); nfs_gss_ctx_count++; @@ -1776,7 +1828,7 @@ nfs_gss_svc_ctx_insert(struct nfs_gss_svc_ctx *cp) nfs_gss_timer_on = 1; nfs_interval_timer_start(nfs_gss_svc_ctx_timer_call, - min(GSS_TIMER_PERIOD, max(GSS_CTX_TTL_MIN, GSS_SVC_CTX_TTL)) * MSECS_PER_SEC); + min(GSS_TIMER_PERIOD, max(GSS_CTX_TTL_MIN, nfsrv_gss_context_ttl)) * MSECS_PER_SEC); } lck_mtx_unlock(nfs_gss_svc_ctx_mutex); @@ -1790,7 +1842,6 @@ nfs_gss_svc_ctx_insert(struct nfs_gss_svc_ctx *cp) void nfs_gss_svc_ctx_timer(__unused void *param1, __unused void *param2) { - struct nfs_gss_svc_ctx_hashhead *head; struct nfs_gss_svc_ctx *cp, *next; uint64_t timenow; int contexts = 0; @@ -1801,19 +1852,17 @@ nfs_gss_svc_ctx_timer(__unused void *param1, __unused void *param2) /* * Scan all the hash chains - * Assume nfs_gss_svc_ctx_mutex is held */ for (i = 0; i < SVC_CTX_HASHSZ; i++) { /* * For each hash chain, look for entries * that haven't been used in a while. */ - head = &nfs_gss_svc_ctx_hashtbl[i]; - for (cp = LIST_FIRST(head); cp; cp = next) { + LIST_FOREACH_SAFE(cp, &nfs_gss_svc_ctx_hashtbl[i], gss_svc_entries, next) { contexts++; - next = LIST_NEXT(cp, gss_svc_entries); - if (timenow > cp->gss_svc_incarnation + - (cp->gss_svc_handle ? 
GSS_SVC_CTX_TTL : 0)) { + if (timenow > cp->gss_svc_incarnation + + (cp->gss_svc_handle ? GSS_SVC_CTX_TTL : 0) + && cp->gss_svc_refcnt == 0) { /* * A stale context - remove it */ @@ -1836,7 +1885,7 @@ nfs_gss_svc_ctx_timer(__unused void *param1, __unused void *param2) nfs_gss_timer_on = nfs_gss_ctx_count > 0; if (nfs_gss_timer_on) nfs_interval_timer_start(nfs_gss_svc_ctx_timer_call, - min(GSS_TIMER_PERIOD, max(GSS_CTX_TTL_MIN, GSS_SVC_CTX_TTL)) * MSECS_PER_SEC); + min(GSS_TIMER_PERIOD, max(GSS_CTX_TTL_MIN, nfsrv_gss_context_ttl)) * MSECS_PER_SEC); lck_mtx_unlock(nfs_gss_svc_ctx_mutex); } @@ -1921,6 +1970,8 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc) error = ENOMEM; goto nfsmout; } + cp->gss_svc_mtx = lck_mtx_alloc_init(nfs_gss_svc_grp, LCK_ATTR_NULL); + cp->gss_svc_refcnt = 1; } else { /* @@ -1944,7 +1995,7 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc) ki = &cp->gss_svc_kinfo; if (proc == RPCSEC_GSS_DATA || proc == RPCSEC_GSS_DESTROY) { - struct ucred temp_cred; + struct posix_cred temp_pcred; if (cp->gss_svc_seqwin == 0) { /* @@ -1975,6 +2026,8 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc) */ nfsm_chain_get_32(error, nmc, flavor); nfsm_chain_get_32(error, nmc, verflen); + if (error) + goto nfsmout; if (flavor != RPCSEC_GSS || verflen != KRB5_SZ_TOKEN(ki->hash_len)) error = NFSERR_AUTHERR | AUTH_BADVERF; nfsm_chain_get_opaque(error, nmc, verflen, tokbuf); @@ -1997,13 +2050,13 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc) /* * Set up the user's cred */ - bzero(&temp_cred, sizeof(temp_cred)); - temp_cred.cr_uid = cp->gss_svc_uid; - bcopy(cp->gss_svc_gids, temp_cred.cr_groups, + bzero(&temp_pcred, sizeof(temp_pcred)); + temp_pcred.cr_uid = cp->gss_svc_uid; + bcopy(cp->gss_svc_gids, temp_pcred.cr_groups, sizeof(gid_t) * cp->gss_svc_ngroups); - temp_cred.cr_ngroups = cp->gss_svc_ngroups; + temp_pcred.cr_ngroups = cp->gss_svc_ngroups; - nd->nd_cr = kauth_cred_create(&temp_cred); + nd->nd_cr = posix_cred_create(&temp_pcred); if (nd->nd_cr == NULL) { error = ENOMEM; goto nfsmout; @@ -2135,12 +2188,21 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc) nfsm_chain_get_32(error, nmc, verflen); if (error || flavor != RPCAUTH_NULL || verflen > 0) error = NFSERR_AUTHERR | RPCSEC_GSS_CREDPROBLEM; - if (error) + if (error) { + if (proc == RPCSEC_GSS_INIT) { + lck_mtx_destroy(cp->gss_svc_mtx, nfs_gss_svc_grp); + FREE(cp, M_TEMP); + cp = NULL; + } goto nfsmout; + } } nd->nd_gss_context = cp; + return 0; nfsmout: + if (cp) + nfs_gss_svc_ctx_deref(cp); return (error); } @@ -2341,7 +2403,6 @@ int nfs_gss_svc_ctx_init(struct nfsrv_descript *nd, struct nfsrv_sock *slp, mbuf_t *mrepp) { struct nfs_gss_svc_ctx *cp = NULL; - uint32_t handle = 0; int error = 0; int autherr = 0; struct nfsm_chain *nmreq, nmrep; @@ -2355,22 +2416,7 @@ nfs_gss_svc_ctx_init(struct nfsrv_descript *nd, struct nfsrv_sock *slp, mbuf_t * switch (cp->gss_svc_proc) { case RPCSEC_GSS_INIT: - /* - * Give the client a random handle so that - * if we reboot it's unlikely the client - * will get a bad context match. - * Make sure it's not zero, or already assigned. 
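With the handle allocation moved into nfs_gss_svc_ctx_insert() (see the hunk above), the server draws random handles until it finds one that is nonzero and not already present on a hash chain, so a rebooted server is unlikely to hand out a handle that matches a stale client context. A toy version of that uniqueness loop, with a flat array standing in for the real hash chains:

    #include <stdint.h>
    #include <stdlib.h>

    #define NCTX 64
    static uint32_t handles[NCTX];   /* stand-in for the context hash table */

    static int handle_in_use(uint32_t h)
    {
        int i;
        for (i = 0; i < NCTX; i++)
            if (handles[i] == h)
                return 1;
        return 0;
    }

    static uint32_t alloc_handle(void)
    {
        uint32_t h;
        do {
            h = (uint32_t)random();
        } while (h == 0 || handle_in_use(h));   /* not zero, not assigned */
        return h;
    }

    int main(void)
    {
        handles[0] = alloc_handle();
        return 0;
    }
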
- */ - do { - handle = random(); - } while (nfs_gss_svc_ctx_find(handle) != NULL || handle == 0); - cp->gss_svc_handle = handle; - cp->gss_svc_mtx = lck_mtx_alloc_init(nfs_gss_svc_grp, LCK_ATTR_NULL); - clock_interval_to_deadline(GSS_CTX_PEND, NSEC_PER_SEC, - &cp->gss_svc_incarnation); - nfs_gss_svc_ctx_insert(cp); - /* FALLTHRU */ case RPCSEC_GSS_CONTINUE_INIT: @@ -2502,11 +2548,11 @@ nfs_gss_svc_gssd_upcall(struct nfs_gss_svc_ctx *cp) kern_return_t kr; mach_port_t mp; int retry_cnt = 0; - byte_buffer okey = NULL; + gssd_byte_buffer okey = NULL; uint32_t skeylen = 0; uint32_t ret_flags; vm_map_copy_t itoken = NULL; - byte_buffer otoken = NULL; + gssd_byte_buffer otoken = NULL; mach_msg_type_number_t otokenlen; int error = 0; char svcname[] = "nfs"; @@ -2527,7 +2573,7 @@ nfs_gss_svc_gssd_upcall(struct nfs_gss_svc_ctx *cp) retry: kr = mach_gss_accept_sec_context( mp, - (byte_buffer) itoken, (mach_msg_type_number_t) cp->gss_svc_tokenlen, + (gssd_byte_buffer) itoken, (mach_msg_type_number_t) cp->gss_svc_tokenlen, svcname, 0, &cp->gss_svc_context, @@ -2661,6 +2707,24 @@ nfs_gss_svc_seqnum_valid(struct nfs_gss_svc_ctx *cp, uint32_t seq) return (1); } +/* + * Drop a reference to a context + * + * Note that it's OK for the context to exist + * with a refcount of zero. The refcount isn't + * checked until we're about to reap an expired one. + */ +void +nfs_gss_svc_ctx_deref(struct nfs_gss_svc_ctx *cp) +{ + lck_mtx_lock(cp->gss_svc_mtx); + if (cp->gss_svc_refcnt > 0) + cp->gss_svc_refcnt--; + else + printf("nfs_gss_ctx_deref: zero refcount\n"); + lck_mtx_unlock(cp->gss_svc_mtx); +} + /* * Called at NFS server shutdown - destroy all contexts */ @@ -2713,8 +2777,8 @@ extern ipc_port_t ipc_port_copy_send(ipc_port_t); static void task_release_special_port(mach_port_t mp) { - - ipc_port_release_send(mp); + if (IPC_PORT_VALID(mp)) + ipc_port_release_send(mp); } static mach_port_t diff --git a/bsd/nfs/nfs_gss.h b/bsd/nfs/nfs_gss.h index aa6d55e96..ad056e7f2 100644 --- a/bsd/nfs/nfs_gss.h +++ b/bsd/nfs/nfs_gss.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2008 Apple Inc. All rights reserved. + * Copyright (c) 2007-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -50,6 +50,9 @@ enum rpcsec_gss_service { RPCSEC_GSS_SVC_SYS = 4 // sec=sys (fallback) }; +/* encoded krb5 OID */ +extern u_char krb5_mech[11]; + /* * GSS-API things */ @@ -111,8 +114,8 @@ struct nfs_gss_clnt_ctx { mach_port_t gss_clnt_mport; // Mach port for gssd upcall u_char *gss_clnt_verf; // RPC verifier from server char *gss_clnt_svcname; // Service name e.g. 
"nfs/big.apple.com" - gss_cred gss_clnt_cred_handle; // Opaque cred handle from gssd - gss_ctx gss_clnt_context; // Opaque context handle from gssd + gssd_cred gss_clnt_cred_handle; // Opaque cred handle from gssd + gssd_ctx gss_clnt_context; // Opaque context handle from gssd u_char *gss_clnt_token; // GSS token exchanged via gssd & server uint32_t gss_clnt_tokenlen; // Length of token gss_key_info gss_clnt_kinfo; // GSS key info @@ -136,6 +139,7 @@ struct nfs_gss_svc_ctx { lck_mtx_t *gss_svc_mtx; LIST_ENTRY(nfs_gss_svc_ctx) gss_svc_entries; uint32_t gss_svc_handle; // Identifies server context to client + uint32_t gss_svc_refcnt; // Reference count uint32_t gss_svc_proc; // Current GSS proc from cred uid_t gss_svc_uid; // UID of this user gid_t gss_svc_gids[NGROUPS]; // GIDs of this user @@ -144,8 +148,8 @@ struct nfs_gss_svc_ctx { uint32_t gss_svc_seqmax; // Current max GSS sequence number uint32_t gss_svc_seqwin; // GSS sequence number window uint32_t *gss_svc_seqbits; // Bitmap to track seq numbers - gss_cred gss_svc_cred_handle; // Opaque cred handle from gssd - gss_ctx gss_svc_context; // Opaque context handle from gssd + gssd_cred gss_svc_cred_handle; // Opaque cred handle from gssd + gssd_ctx gss_svc_context; // Opaque context handle from gssd u_char *gss_svc_token; // GSS token exchanged via gssd & client uint32_t gss_svc_tokenlen; // Length of token gss_key_info gss_svc_kinfo; // Session key info @@ -184,12 +188,13 @@ int nfs_gss_clnt_args_restore(struct nfsreq *); int nfs_gss_clnt_ctx_renew(struct nfsreq *); void nfs_gss_clnt_ctx_ref(struct nfsreq *, struct nfs_gss_clnt_ctx *); void nfs_gss_clnt_ctx_unref(struct nfsreq *); -void nfs_gss_clnt_ctx_unmount(struct nfsmount *, int); +void nfs_gss_clnt_ctx_unmount(struct nfsmount *); int nfs_gss_svc_cred_get(struct nfsrv_descript *, struct nfsm_chain *); int nfs_gss_svc_verf_put(struct nfsrv_descript *, struct nfsm_chain *); int nfs_gss_svc_ctx_init(struct nfsrv_descript *, struct nfsrv_sock *, mbuf_t *); int nfs_gss_svc_prepare_reply(struct nfsrv_descript *, struct nfsm_chain *); int nfs_gss_svc_protect_reply(struct nfsrv_descript *, mbuf_t); +void nfs_gss_svc_ctx_deref(struct nfs_gss_svc_ctx *); void nfs_gss_svc_cleanup(void); __END_DECLS diff --git a/bsd/nfs/nfs_lock.c b/bsd/nfs/nfs_lock.c index 590a70619..f76a9b6d0 100644 --- a/bsd/nfs/nfs_lock.c +++ b/bsd/nfs/nfs_lock.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002-2008 Apple Inc. All rights reserved. + * Copyright (c) 2002-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -95,8 +95,6 @@ extern void ipc_port_release_send(ipc_port_t); -#define OFF_MAX QUAD_MAX - /* * pending lock request messages are kept in this queue which is * kept sorted by transaction ID (xid). @@ -104,28 +102,8 @@ extern void ipc_port_release_send(ipc_port_t); static uint64_t nfs_lockxid = 0; static LOCKD_MSG_QUEUE nfs_pendlockq; -/* - * This structure is used to identify processes which have acquired NFS locks. - * Knowing which processes have ever acquired locks allows us to short-circuit - * unlock requests for processes that have never had an NFS file lock. Thus - * avoiding a costly and unnecessary lockd request. - */ -struct nfs_lock_pid { - TAILQ_ENTRY(nfs_lock_pid) lp_lru; /* LRU list */ - LIST_ENTRY(nfs_lock_pid) lp_hash; /* hash chain */ - int lp_valid; /* valid entry? */ - int lp_time; /* last time seen valid */ - pid_t lp_pid; /* The process ID. 
*/ - struct timeval lp_pid_start; /* Start time of process id */ -}; - -#define NFS_LOCK_PID_HASH_SIZE 64 // XXX tune me -#define NFS_LOCK_PID_HASH(pid) \ - (&nfs_lock_pid_hash_tbl[(pid) & nfs_lock_pid_hash]) -static LIST_HEAD(, nfs_lock_pid) *nfs_lock_pid_hash_tbl; -static TAILQ_HEAD(, nfs_lock_pid) nfs_lock_pid_lru; -static u_long nfs_lock_pid_hash; -static uint32_t nfs_lock_pid_hash_trusted; +/* list of mounts that are (potentially) making lockd requests */ +TAILQ_HEAD(nfs_lockd_mount_list,nfsmount) nfs_lockd_mount_list; static lck_grp_t *nfs_lock_lck_grp; static lck_mtx_t *nfs_lock_mutex; @@ -136,7 +114,6 @@ int nfs_lockdmsg_compare_to_answer(LOCKD_MSG_REQUEST *, struct lockd_ans *); LOCKD_MSG_REQUEST *nfs_lockdmsg_find_by_answer(struct lockd_ans *); LOCKD_MSG_REQUEST *nfs_lockdmsg_find_by_xid(uint64_t); uint64_t nfs_lockxid_get(void); -int nfs_lock_pid_check(proc_t, int); int nfs_lockd_send_request(LOCKD_MSG *, int); /* @@ -146,31 +123,40 @@ void nfs_lockinit(void) { TAILQ_INIT(&nfs_pendlockq); - nfs_lock_pid_hash_trusted = 1; - nfs_lock_pid_hash_tbl = hashinit(NFS_LOCK_PID_HASH_SIZE, - M_TEMP, &nfs_lock_pid_hash); - TAILQ_INIT(&nfs_lock_pid_lru); + TAILQ_INIT(&nfs_lockd_mount_list); nfs_lock_lck_grp = lck_grp_alloc_init("nfs_lock", LCK_GRP_ATTR_NULL); nfs_lock_mutex = lck_mtx_alloc_init(nfs_lock_lck_grp, LCK_ATTR_NULL); } /* - * change the count of NFS mounts that may need to make lockd requests + * Register a mount as (potentially) making lockd requests. + */ +void +nfs_lockd_mount_register(struct nfsmount *nmp) +{ + lck_mtx_lock(nfs_lock_mutex); + TAILQ_INSERT_HEAD(&nfs_lockd_mount_list, nmp, nm_ldlink); + nfs_lockd_mounts++; + lck_mtx_unlock(nfs_lock_mutex); +} + +/* + * Unregister a mount as (potentially) making lockd requests. * - * If the mount count drops to zero, then send a shutdown request to + * When the lockd mount count drops to zero, then send a shutdown request to * lockd if we've sent any requests to it. 
*/ void -nfs_lockd_mount_change(int i) +nfs_lockd_mount_unregister(struct nfsmount *nmp) { + int send_shutdown; mach_port_t lockd_port = IPC_PORT_NULL; kern_return_t kr; - int send_shutdown; lck_mtx_lock(nfs_lock_mutex); - - nfs_lockd_mounts += i; + TAILQ_REMOVE(&nfs_lockd_mount_list, nmp, nm_ldlink); + nfs_lockd_mounts--; /* send a shutdown request if there are no more lockd mounts */ send_shutdown = ((nfs_lockd_mounts == 0) && nfs_lockd_request_sent); @@ -183,7 +169,7 @@ nfs_lockd_mount_change(int i) return; /* - * Let lockd know that it is no longer need for any NFS mounts + * Let lockd know that it is no longer needed for any NFS mounts */ kr = host_get_lockd_port(host_priv_self(), &lockd_port); if ((kr != KERN_SUCCESS) || !IPC_PORT_VALID(lockd_port)) { @@ -204,7 +190,7 @@ nfs_lockd_mount_change(int i) * insert a lock request message into the pending queue * (nfs_lock_mutex must be held) */ -inline void +void nfs_lockdmsg_enqueue(LOCKD_MSG_REQUEST *msgreq) { LOCKD_MSG_REQUEST *mr; @@ -230,7 +216,7 @@ nfs_lockdmsg_enqueue(LOCKD_MSG_REQUEST *msgreq) * remove a lock request message from the pending queue * (nfs_lock_mutex must be held) */ -inline void +void nfs_lockdmsg_dequeue(LOCKD_MSG_REQUEST *msgreq) { TAILQ_REMOVE(&nfs_pendlockq, msgreq, lmr_next); @@ -248,7 +234,7 @@ nfs_lockdmsg_dequeue(LOCKD_MSG_REQUEST *msgreq) * * (nfs_lock_mutex must be held) */ -inline LOCKD_MSG_REQUEST * +LOCKD_MSG_REQUEST * nfs_lockdmsg_find_by_xid(uint64_t lockxid) { LOCKD_MSG_REQUEST *mr; @@ -264,8 +250,8 @@ nfs_lockdmsg_find_by_xid(uint64_t lockxid) /* * Because we can't depend on nlm_granted messages containing the same - * cookie we sent with the original lock request, we need code test if - * an nlm_granted answer matches the lock request. We also need code + * cookie we sent with the original lock request, we need code to test + * if an nlm_granted answer matches the lock request. We also need code * that can find a lockd message based solely on the nlm_granted answer. */ @@ -274,7 +260,7 @@ nfs_lockdmsg_find_by_xid(uint64_t lockxid) * * returns 0 on equality and 1 if different */ -inline int +int nfs_lockdmsg_compare_to_answer(LOCKD_MSG_REQUEST *msgreq, struct lockd_ans *ansp) { if (!(ansp->la_flags & LOCKD_ANS_LOCK_INFO)) @@ -307,7 +293,7 @@ nfs_lockdmsg_compare_to_answer(LOCKD_MSG_REQUEST *msgreq, struct lockd_ans *ansp * * (nfs_lock_mutex must be held) */ -inline LOCKD_MSG_REQUEST * +LOCKD_MSG_REQUEST * nfs_lockdmsg_find_by_answer(struct lockd_ans *ansp) { LOCKD_MSG_REQUEST *mr; @@ -325,7 +311,7 @@ nfs_lockdmsg_find_by_answer(struct lockd_ans *ansp) * return the next unique lock request transaction ID * (nfs_lock_mutex must be held) */ -inline uint64_t +uint64_t nfs_lockxid_get(void) { LOCKD_MSG_REQUEST *mr; @@ -359,143 +345,6 @@ nfs_lockxid_get(void) return nfs_lockxid; } - -/* - * Check the nfs_lock_pid hash table for an entry and, if requested, - * add the entry if it is not found. - * - * (Also, if adding, try to clean up some stale entries.) - * (nfs_lock_mutex must be held) - */ -int -nfs_lock_pid_check(proc_t p, int addflag) -{ - struct nfs_lock_pid *lp, *lplru, *lplru_next, *mlp; - TAILQ_HEAD(, nfs_lock_pid) nfs_lock_pid_free; - proc_t plru = PROC_NULL; - pid_t pid; - int error = 0; - struct timeval now; - - TAILQ_INIT(&nfs_lock_pid_free); - mlp = NULL; - -loop: - /* Search hash chain */ - pid = proc_pid(p); - error = ENOENT; - lp = NFS_LOCK_PID_HASH(pid)->lh_first; - for (; lp != NULL; lp = lp->lp_hash.le_next) - if (lp->lp_pid == pid) { - /* found pid... 
*/ - if (timevalcmp(&lp->lp_pid_start, &p->p_start, ==)) { - /* ...and it's valid */ - /* move to tail of LRU */ - TAILQ_REMOVE(&nfs_lock_pid_lru, lp, lp_lru); - microuptime(&now); - lp->lp_time = now.tv_sec; - TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru); - error = 0; - break; - } - /* ...but it's no longer valid */ - /* remove from hash, invalidate, and move to lru head */ - LIST_REMOVE(lp, lp_hash); - lp->lp_valid = 0; - TAILQ_REMOVE(&nfs_lock_pid_lru, lp, lp_lru); - TAILQ_INSERT_HEAD(&nfs_lock_pid_lru, lp, lp_lru); - lp = NULL; - break; - } - - /* if we didn't find it (valid), use any newly allocated one */ - if (!lp) - lp = mlp; - - /* if we don't have an lp and we've been asked to add it */ - if ((error == ENOENT) && addflag && !lp) { - /* scan lru list for invalid, stale entries to reuse/free */ - int lrucnt = 0; - microuptime(&now); - for (lplru = TAILQ_FIRST(&nfs_lock_pid_lru); lplru; lplru = lplru_next) { - lplru_next = TAILQ_NEXT(lplru, lp_lru); - if (lplru->lp_valid && (lplru->lp_time >= (now.tv_sec - 2))) { - /* - * If the oldest LRU entry is relatively new, then don't - * bother scanning any further. - */ - break; - } - /* remove entry from LRU, and check if it's still in use */ - TAILQ_REMOVE(&nfs_lock_pid_lru, lplru, lp_lru); - if (!lplru->lp_valid || !(plru = proc_find(lplru->lp_pid)) || - timevalcmp(&lplru->lp_pid_start, &plru->p_start, !=)) { - if (plru != PROC_NULL) { - proc_rele(plru); - plru = PROC_NULL; - } - /* no longer in use */ - LIST_REMOVE(lplru, lp_hash); - if (!lp) { - /* we'll reuse this one */ - lp = lplru; - } else { - /* queue it up for freeing */ - TAILQ_INSERT_HEAD(&nfs_lock_pid_free, lplru, lp_lru); - } - } else { - /* still in use */ - if (plru != PROC_NULL) { - proc_rele(plru); - plru = PROC_NULL; - } - lplru->lp_time = now.tv_sec; - TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lplru, lp_lru); - } - /* don't check too many entries at once */ - if (++lrucnt > 8) - break; - } - if (!lp) { - /* we need to allocate a new one */ - lck_mtx_unlock(nfs_lock_mutex); - MALLOC(mlp, struct nfs_lock_pid *, sizeof(struct nfs_lock_pid), - M_TEMP, M_WAITOK | M_ZERO); - lck_mtx_lock(nfs_lock_mutex); - if (mlp) /* make sure somebody hasn't already added this guy */ - goto loop; - error = ENOMEM; - } - } - if ((error == ENOENT) && addflag && lp) { - /* (re)initialize nfs_lock_pid info */ - lp->lp_pid = pid; - lp->lp_pid_start = p->p_start; - /* insert pid in hash */ - LIST_INSERT_HEAD(NFS_LOCK_PID_HASH(lp->lp_pid), lp, lp_hash); - lp->lp_valid = 1; - lp->lp_time = now.tv_sec; - TAILQ_INSERT_TAIL(&nfs_lock_pid_lru, lp, lp_lru); - error = 0; - } - - if ((mlp && (lp != mlp)) || TAILQ_FIRST(&nfs_lock_pid_free)) { - lck_mtx_unlock(nfs_lock_mutex); - if (mlp && (lp != mlp)) { - /* we didn't need this one, so we can free it */ - FREE(mlp, M_TEMP); - } - /* free up any stale entries */ - while ((lp = TAILQ_FIRST(&nfs_lock_pid_free))) { - TAILQ_REMOVE(&nfs_lock_pid_free, lp, lp_lru); - FREE(lp, M_TEMP); - } - lck_mtx_lock(nfs_lock_mutex); - } - - return (error); -} - #define MACH_MAX_TRIES 3 int @@ -551,186 +400,49 @@ nfs_lockd_send_request(LOCKD_MSG *msg, int interruptable) * NFS advisory byte-level locks (client) */ int -nfs3_vnop_advlock( - struct vnop_advlock_args /* { - struct vnodeop_desc *a_desc; - vnode_t a_vp; - caddr_t a_id; - int a_op; - struct flock *a_fl; - int a_flags; - vfs_context_t a_context; - } */ *ap) +nfs3_lockd_request( + nfsnode_t np, + int type, + LOCKD_MSG_REQUEST *msgreq, + int flags, + thread_t thd) { - vfs_context_t ctx; - proc_t p; - LOCKD_MSG_REQUEST 
msgreq; - LOCKD_MSG *msg; - vnode_t vp; - nfsnode_t np; + LOCKD_MSG *msg = &msgreq->lmr_msg; int error, error2; - int interruptable, modified; - struct flock *fl; + int interruptable, slpflag; struct nfsmount *nmp; - struct nfs_vattr nvattr; - off_t start, end; struct timeval now; - int timeo, endtime, lastmsg, wentdown = 0; - int lockpidcheck, nfsvers; - struct sockaddr *saddr; + int timeo, starttime, endtime, lastmsg, wentdown = 0; struct timespec ts; + struct sockaddr *saddr; - ctx = ap->a_context; - p = vfs_context_proc(ctx); - vp = ap->a_vp; - fl = ap->a_fl; - np = VTONFS(vp); - - nmp = VTONMP(vp); - if (!nmp) - return (ENXIO); - lck_mtx_lock(&nmp->nm_lock); - if (nmp->nm_flag & NFSMNT_NOLOCKS) { - lck_mtx_unlock(&nmp->nm_lock); - return (ENOTSUP); - } - nfsvers = nmp->nm_vers; - lck_mtx_unlock(&nmp->nm_lock); - - /* - * The NLM protocol doesn't allow the server to return an error - * on ranges, so we do it. Pre LFS (Large File Summit) - * standards required EINVAL for the range errors. More recent - * standards use EOVERFLOW, but their EINVAL wording still - * encompasses these errors. - * Any code sensitive to this is either: - * 1) written pre-LFS and so can handle only EINVAL, or - * 2) written post-LFS and thus ought to be tolerant of pre-LFS - * implementations. - * Since returning EOVERFLOW certainly breaks 1), we return EINVAL. - */ - if (fl->l_whence != SEEK_END) { - if ((fl->l_whence != SEEK_CUR && fl->l_whence != SEEK_SET) || - fl->l_start < 0 || - (fl->l_len > 0 && fl->l_len - 1 > OFF_MAX - fl->l_start) || - (fl->l_len < 0 && fl->l_start + fl->l_len < 0)) - return (EINVAL); - } - - lck_mtx_lock(nfs_lock_mutex); - - /* - * Need to check if this process has successfully acquired an NFS lock before. - * If not, and this is an unlock request we can simply return success here. - */ - lockpidcheck = nfs_lock_pid_check(p, 0); - lck_mtx_unlock(nfs_lock_mutex); - if (lockpidcheck) { - if (lockpidcheck != ENOENT) - return (lockpidcheck); - if ((ap->a_op == F_UNLCK) && nfs_lock_pid_hash_trusted) - return (0); - } - - /* - * The NFS Lock Manager protocol doesn't directly handle - * negative lengths or SEEK_END, so we need to normalize - * things here where we have all the info. - * (Note: SEEK_CUR is already adjusted for at this point) - */ - /* Convert the flock structure into a start and end. */ - switch (fl->l_whence) { - case SEEK_SET: - case SEEK_CUR: - /* - * Caller is responsible for adding any necessary offset - * to fl->l_start when SEEK_CUR is used. - */ - start = fl->l_start; - break; - case SEEK_END: - /* need to flush, and refetch attributes to make */ - /* sure we have the correct end of file offset */ - if ((error = nfs_node_lock(np))) - return (error); - modified = (np->n_flag & NMODIFIED); - nfs_node_unlock(np); - if (modified && ((error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1)))) - return (error); - if ((error = nfs_getattr(np, &nvattr, ctx, NGA_UNCACHED))) - return (error); - nfs_data_lock(np, NFS_DATA_LOCK_SHARED); - start = np->n_size + fl->l_start; - nfs_data_unlock(np); - break; - default: - return (EINVAL); - } - if (fl->l_len == 0) - end = -1; - else if (fl->l_len > 0) - end = start + fl->l_len - 1; - else { /* l_len is negative */ - end = start - 1; - start += fl->l_len; - } - if (start < 0) - return (EINVAL); - - if ((nfsvers == NFS_VER2) && - ((start >= 0x80000000) || (end >= 0x80000000))) - return (EINVAL); - - /* - * Fill in the information structure. 
- * We set all values to zero with bzero to clear - * out any information in the sockaddr_storage - * and nfs_filehandle contained in msgreq so that - * we will not leak extraneous information out of - * the kernel when calling up to lockd via our mig - * generated routine. - */ - bzero(&msgreq, sizeof(msgreq)); - msg = &msgreq.lmr_msg; - msg->lm_version = LOCKD_MSG_VERSION; - msg->lm_flags = 0; - - msg->lm_fl = *fl; - msg->lm_fl.l_start = start; - if (end != -1) - msg->lm_fl.l_len = end - start + 1; - msg->lm_fl.l_pid = vfs_context_pid(ctx); - - if (ap->a_flags & F_WAIT) - msg->lm_flags |= LOCKD_MSG_BLOCK; - if (ap->a_op == F_GETLK) - msg->lm_flags |= LOCKD_MSG_TEST; - - nmp = VTONMP(vp); - if (!nmp) + nmp = NFSTONMP(np); + if (!nmp || !nmp->nm_saddr) return (ENXIO); lck_mtx_lock(&nmp->nm_lock); - saddr = mbuf_data(nmp->nm_nam); + saddr = nmp->nm_saddr; bcopy(saddr, &msg->lm_addr, min(sizeof msg->lm_addr, saddr->sa_len)); - msg->lm_fh_len = (nfsvers == NFS_VER2) ? NFSX_V2FH : np->n_fhsize; - bcopy(np->n_fhp, msg->lm_fh, msg->lm_fh_len); - if (nfsvers == NFS_VER3) + if (nmp->nm_vers == NFS_VER3) msg->lm_flags |= LOCKD_MSG_NFSV3; - cru2x(vfs_context_ucred(ctx), &msg->lm_cred); +#if 0 /* not yet */ + if (nmp->nm_sotype != SOCK_DGRAM) + msg->lm_flags |= LOCKD_MSG_TCP; +#endif microuptime(&now); + starttime = now.tv_sec; lastmsg = now.tv_sec - ((nmp->nm_tprintf_delay) - (nmp->nm_tprintf_initial_delay)); - interruptable = nmp->nm_flag & NFSMNT_INT; + interruptable = NMFLAG(nmp, INTR); lck_mtx_unlock(&nmp->nm_lock); lck_mtx_lock(nfs_lock_mutex); /* allocate unique xid */ msg->lm_xid = nfs_lockxid_get(); - nfs_lockdmsg_enqueue(&msgreq); + nfs_lockdmsg_enqueue(msgreq); - timeo = 2; + timeo = 4; for (;;) { nfs_lockd_request_sent = 1; @@ -751,7 +463,7 @@ nfs3_vnop_advlock( * Retry if it takes too long to get a response. * * The timeout numbers were picked out of thin air... they start - * at 2 and double each timeout with a max of 60 seconds. + * at 4 and double each timeout with a max of 30 seconds. * * In order to maintain responsiveness, we pass a small timeout * to msleep and calculate the timeouts ourselves. This allows @@ -759,15 +471,18 @@ nfs3_vnop_advlock( */ wait_for_granted: error = EWOULDBLOCK; + slpflag = (interruptable && (type != F_UNLCK)) ? PCATCH : 0; ts.tv_sec = 2; ts.tv_nsec = 0; microuptime(&now); endtime = now.tv_sec + timeo; while (now.tv_sec < endtime) { error = error2 = 0; - if (!msgreq.lmr_answered) - error = msleep(&msgreq, nfs_lock_mutex, PCATCH | PUSER, "lockd", &ts); - if (msgreq.lmr_answered) { + if (!msgreq->lmr_answered) { + error = msleep(msgreq, nfs_lock_mutex, slpflag | PUSER, "lockd", &ts); + slpflag = 0; + } + if (msgreq->lmr_answered) { /* * Note: it's possible to have a lock granted at * essentially the same time that we get interrupted. @@ -775,8 +490,8 @@ wait_for_granted: * error from this request or we might not unlock the * lock that's been granted. */ - nmp = VTONMP(vp); - if ((msgreq.lmr_errno == ENOTSUP) && nmp && + nmp = NFSTONMP(np); + if ((msgreq->lmr_errno == ENOTSUP) && nmp && (nmp->nm_state & NFSSTA_LOCKSWORK)) { /* * We have evidence that locks work, yet lockd @@ -797,58 +512,81 @@ wait_for_granted: break; /* check that we still have our mount... 
*/ /* ...and that we still support locks */ - nmp = VTONMP(vp); - if ((error2 = nfs_sigintr(nmp, NULL, vfs_context_thread(ctx), 0))) { + /* ...and that there isn't a recovery pending */ + nmp = NFSTONMP(np); + if ((error2 = nfs_sigintr(nmp, NULL, NULL, 0))) { error = error2; - if (fl->l_type == F_UNLCK) - printf("nfs_vnop_advlock: aborting unlock request, error %d\n", error); + if (type == F_UNLCK) + printf("nfs3_lockd_request: aborting unlock request, error %d\n", error); break; } lck_mtx_lock(&nmp->nm_lock); - if (nmp->nm_flag & NFSMNT_NOLOCKS) { + if (nmp->nm_lockmode == NFS_LOCK_MODE_DISABLED) { + lck_mtx_unlock(&nmp->nm_lock); + break; + } + if ((nmp->nm_state & NFSSTA_RECOVER) && !(flags & R_RECOVER)) { + /* recovery pending... return an error that'll get this operation restarted */ + error = NFSERR_GRACE; lck_mtx_unlock(&nmp->nm_lock); break; } - interruptable = nmp->nm_flag & NFSMNT_INT; + interruptable = NMFLAG(nmp, INTR); lck_mtx_unlock(&nmp->nm_lock); microuptime(&now); } if (error) { /* check that we still have our mount... */ - nmp = VTONMP(vp); - if ((error2 = nfs_sigintr(nmp, NULL, vfs_context_thread(ctx), 0))) { + nmp = NFSTONMP(np); + if ((error2 = nfs_sigintr(nmp, NULL, NULL, 0))) { error = error2; if (error2 != EINTR) { - if (fl->l_type == F_UNLCK) - printf("nfs_vnop_advlock: aborting unlock request, error %d\n", error); + if (type == F_UNLCK) + printf("nfs3_lockd_request: aborting unlock request, error %d\n", error); break; } } /* ...and that we still support locks */ lck_mtx_lock(&nmp->nm_lock); - if (nmp->nm_flag & NFSMNT_NOLOCKS) { + if (nmp->nm_lockmode == NFS_LOCK_MODE_DISABLED) { if (error == EWOULDBLOCK) error = ENOTSUP; lck_mtx_unlock(&nmp->nm_lock); break; } - interruptable = nmp->nm_flag & NFSMNT_INT; - if (error != EWOULDBLOCK) { + /* ...and that there isn't a recovery pending */ + if ((error == EWOULDBLOCK) && (nmp->nm_state & NFSSTA_RECOVER) && !(flags & R_RECOVER)) { + /* recovery pending... return to allow recovery to occur */ + error = NFSERR_DENIED; + lck_mtx_unlock(&nmp->nm_lock); + break; + } + interruptable = NMFLAG(nmp, INTR); + if ((error != EWOULDBLOCK) || + ((nmp->nm_state & NFSSTA_RECOVER) && !(flags & R_RECOVER)) || + ((flags & R_RECOVER) && ((now.tv_sec - starttime) > 30))) { + if ((error == EWOULDBLOCK) && (flags & R_RECOVER)) { + /* give up if this is for recovery and taking too long */ + error = ETIMEDOUT; + } else if ((nmp->nm_state & NFSSTA_RECOVER) && !(flags & R_RECOVER)) { + /* recovery pending... return an error that'll get this operation restarted */ + error = NFSERR_GRACE; + } lck_mtx_unlock(&nmp->nm_lock); /* * We're going to bail on this request. * If we were a blocked lock request, send a cancel. 
*/ - if ((msgreq.lmr_errno == EINPROGRESS) && + if ((msgreq->lmr_errno == EINPROGRESS) && !(msg->lm_flags & LOCKD_MSG_CANCEL)) { /* set this request up as a cancel */ msg->lm_flags |= LOCKD_MSG_CANCEL; - nfs_lockdmsg_dequeue(&msgreq); + nfs_lockdmsg_dequeue(msgreq); msg->lm_xid = nfs_lockxid_get(); - nfs_lockdmsg_enqueue(&msgreq); - msgreq.lmr_saved_errno = error; - msgreq.lmr_errno = 0; - msgreq.lmr_answered = 0; + nfs_lockdmsg_enqueue(msgreq); + msgreq->lmr_saved_errno = error; + msgreq->lmr_errno = 0; + msgreq->lmr_answered = 0; /* reset timeout */ timeo = 2; /* send cancel request */ @@ -859,18 +597,18 @@ wait_for_granted: /* warn if we're not getting any response */ microuptime(&now); - if ((msgreq.lmr_errno != EINPROGRESS) && + if ((msgreq->lmr_errno != EINPROGRESS) && !(msg->lm_flags & LOCKD_MSG_DENIED_GRACE) && (nmp->nm_tprintf_initial_delay != 0) && ((lastmsg + nmp->nm_tprintf_delay) < now.tv_sec)) { lck_mtx_unlock(&nmp->nm_lock); lastmsg = now.tv_sec; - nfs_down(nmp, vfs_context_thread(ctx), 0, NFSSTA_LOCKTIMEO, "lockd not responding"); + nfs_down(nmp, thd, 0, NFSSTA_LOCKTIMEO, "lockd not responding"); wentdown = 1; } else lck_mtx_unlock(&nmp->nm_lock); - if (msgreq.lmr_errno == EINPROGRESS) { + if (msgreq->lmr_errno == EINPROGRESS) { /* * We've got a blocked lock request that we are * going to retry. First, we'll want to try to @@ -883,95 +621,63 @@ wait_for_granted: * it is NLM_BLOCKED). */ msg->lm_flags |= LOCKD_MSG_CANCEL; - nfs_lockdmsg_dequeue(&msgreq); + nfs_lockdmsg_dequeue(msgreq); msg->lm_xid = nfs_lockxid_get(); - nfs_lockdmsg_enqueue(&msgreq); - msgreq.lmr_saved_errno = msgreq.lmr_errno; - msgreq.lmr_errno = 0; - msgreq.lmr_answered = 0; + nfs_lockdmsg_enqueue(msgreq); + msgreq->lmr_saved_errno = msgreq->lmr_errno; + msgreq->lmr_errno = 0; + msgreq->lmr_answered = 0; timeo = 2; /* send cancel then resend request */ continue; } - if (msg->lm_flags & LOCKD_MSG_DENIED_GRACE) { - /* - * Time to resend a request previously denied due to a grace period. - */ - msg->lm_flags &= ~LOCKD_MSG_DENIED_GRACE; - nfs_lockdmsg_dequeue(&msgreq); - msg->lm_xid = nfs_lockxid_get(); - nfs_lockdmsg_enqueue(&msgreq); - msgreq.lmr_saved_errno = 0; - msgreq.lmr_errno = 0; - msgreq.lmr_answered = 0; - timeo = 2; - /* resend request */ - continue; - } - /* * We timed out, so we will resend the request. */ - timeo *= 2; - if (timeo > 60) - timeo = 60; + if (!(flags & R_RECOVER)) + timeo *= 2; + if (timeo > 30) + timeo = 30; /* resend request */ continue; } /* we got a reponse, so the server's lockd is OK */ - nfs_up(VTONMP(vp), vfs_context_thread(ctx), NFSSTA_LOCKTIMEO, + nfs_up(NFSTONMP(np), thd, NFSSTA_LOCKTIMEO, wentdown ? "lockd alive again" : NULL); wentdown = 0; - if (msgreq.lmr_answered && (msg->lm_flags & LOCKD_MSG_DENIED_GRACE)) { + if (msgreq->lmr_answered && (msg->lm_flags & LOCKD_MSG_DENIED_GRACE)) { /* * The lock request was denied because the server lockd is * still in its grace period. So, we need to try the - * request again in a little bit. + * request again in a little bit. Return the GRACE error so + * the higher levels can perform the retry. 
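A recurring theme in this rewrite of the lockd path is that transient conditions are no longer retried inside nfs3_lockd_request(); they are mapped to NFS error codes so the caller can restart or resend the operation. A small sketch of that mapping; the enum is invented for illustration, and the numeric values shown are the standard NFSv4 ones:

    #include <stdio.h>

    #define NFSERR_GRACE  10013   /* server still in its grace period */
    #define NFSERR_DENIED 10010   /* blocked request cancelled; caller resends */

    enum lockd_outcome { LOCKD_OK, LOCKD_IN_GRACE, LOCKD_CANCELLED };

    static int lockd_result_to_error(enum lockd_outcome o)
    {
        switch (o) {
        case LOCKD_IN_GRACE:
            return NFSERR_GRACE;    /* higher level retries after a delay */
        case LOCKD_CANCELLED:
            return NFSERR_DENIED;   /* higher level resends the request */
        default:
            return 0;
        }
    }

    int main(void)
    {
        printf("grace maps to %d\n", lockd_result_to_error(LOCKD_IN_GRACE));
        return 0;
    }
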
*/ - timeo = 4; - msgreq.lmr_answered = 0; - goto wait_for_granted; + msgreq->lmr_saved_errno = msgreq->lmr_errno = error = NFSERR_GRACE; } - if (msgreq.lmr_errno == EINPROGRESS) { + if (msgreq->lmr_errno == EINPROGRESS) { /* got NLM_BLOCKED response */ /* need to wait for NLM_GRANTED */ - timeo = 60; - msgreq.lmr_answered = 0; + timeo = 30; + msgreq->lmr_answered = 0; goto wait_for_granted; } if ((msg->lm_flags & LOCKD_MSG_CANCEL) && - (msgreq.lmr_saved_errno == EINPROGRESS)) { + (msgreq->lmr_saved_errno == EINPROGRESS)) { /* * We just got a successful reply to the * cancel of the previous blocked lock request. - * Now, go ahead and resend the request. + * Now, go ahead and return a DENIED error so the + * higher levels can resend the request. */ msg->lm_flags &= ~LOCKD_MSG_CANCEL; - nfs_lockdmsg_dequeue(&msgreq); - msg->lm_xid = nfs_lockxid_get(); - nfs_lockdmsg_enqueue(&msgreq); - msgreq.lmr_saved_errno = 0; - msgreq.lmr_errno = 0; - msgreq.lmr_answered = 0; - timeo = 2; - /* resend request */ - continue; - } - - if ((msg->lm_flags & LOCKD_MSG_TEST) && msgreq.lmr_errno == 0) { - if (msg->lm_fl.l_type != F_UNLCK) { - fl->l_type = msg->lm_fl.l_type; - fl->l_pid = msg->lm_fl.l_pid; - fl->l_start = msg->lm_fl.l_start; - fl->l_len = msg->lm_fl.l_len; - fl->l_whence = SEEK_SET; - } else - fl->l_type = F_UNLCK; + nfs_lockdmsg_dequeue(msgreq); + error = NFSERR_DENIED; + break; } /* @@ -981,11 +687,12 @@ wait_for_granted: */ if (msg->lm_flags & LOCKD_MSG_CANCEL) { msg->lm_flags &= ~LOCKD_MSG_CANCEL; - error = msgreq.lmr_saved_errno; - } else - error = msgreq.lmr_errno; + error = msgreq->lmr_saved_errno; + } else { + error = msgreq->lmr_errno; + } - nmp = VTONMP(vp); + nmp = NFSTONMP(np); if ((error == ENOTSUP) && nmp && !(nmp->nm_state & NFSSTA_LOCKSWORK)) { /* * We have NO evidence that locks work and lockd @@ -993,12 +700,18 @@ wait_for_granted: * that locks aren't supported and disable them * for this mount. */ + nfs_lockdmsg_dequeue(msgreq); + lck_mtx_unlock(nfs_lock_mutex); lck_mtx_lock(&nmp->nm_lock); - nmp->nm_flag |= NFSMNT_NOLOCKS; + if (nmp->nm_lockmode == NFS_LOCK_MODE_ENABLED) { + nmp->nm_lockmode = NFS_LOCK_MODE_DISABLED; + nfs_lockd_mount_unregister(nmp); + } nmp->nm_state &= ~NFSSTA_LOCKTIMEO; lck_mtx_unlock(&nmp->nm_lock); printf("lockd returned ENOTSUP, disabling locks for nfs server: %s\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname); + return (error); } if (!error) { /* record that NFS file locking has worked on this mount */ @@ -1008,35 +721,162 @@ wait_for_granted: nmp->nm_state |= NFSSTA_LOCKSWORK; lck_mtx_unlock(&nmp->nm_lock); } - /* - * If we successfully acquired a lock, make sure this pid - * is in the nfs_lock_pid hash table so we know we can't - * short-circuit unlock requests. - */ - if ((lockpidcheck == ENOENT) && - ((ap->a_op == F_SETLK) || (ap->a_op == F_SETLKW))) { - error = nfs_lock_pid_check(p, 1); - if (error) { - /* - * We couldn't add the pid to the table, - * so we can no longer trust that a pid - * not in the table has no locks. 
- */ - nfs_lock_pid_hash_trusted = 0; - printf("nfs_vnop_advlock: pid add failed - no longer trusted\n"); - } - } } break; } - nfs_lockdmsg_dequeue(&msgreq); + nfs_lockdmsg_dequeue(msgreq); lck_mtx_unlock(nfs_lock_mutex); return (error); } +/* + * Send an NLM LOCK message to the server + */ +int +nfs3_setlock_rpc( + nfsnode_t np, + struct nfs_open_file *nofp, + struct nfs_file_lock *nflp, + int reclaim, + int flags, + thread_t thd, + kauth_cred_t cred) +{ + struct nfs_lock_owner *nlop = nflp->nfl_owner; + struct nfsmount *nmp; + int error; + LOCKD_MSG_REQUEST msgreq; + LOCKD_MSG *msg; + + nmp = NFSTONMP(np); + if (!nmp) + return (ENXIO); + + if (!nlop->nlo_open_owner) { + nfs_open_owner_ref(nofp->nof_owner); + nlop->nlo_open_owner = nofp->nof_owner; + } + if ((error = nfs_lock_owner_set_busy(nlop, thd))) + return (error); + + /* set up lock message request structure */ + bzero(&msgreq, sizeof(msgreq)); + msg = &msgreq.lmr_msg; + msg->lm_version = LOCKD_MSG_VERSION; + if ((nflp->nfl_flags & NFS_FILE_LOCK_WAIT) && !reclaim) + msg->lm_flags |= LOCKD_MSG_BLOCK; + if (reclaim) + msg->lm_flags |= LOCKD_MSG_RECLAIM; + msg->lm_fh_len = (nmp->nm_vers == NFS_VER2) ? NFSX_V2FH : np->n_fhsize; + bcopy(np->n_fhp, msg->lm_fh, msg->lm_fh_len); + cru2x(cred, &msg->lm_cred); + + msg->lm_fl.l_whence = SEEK_SET; + msg->lm_fl.l_start = nflp->nfl_start; + msg->lm_fl.l_len = NFS_FLOCK_LENGTH(nflp->nfl_start, nflp->nfl_end); + msg->lm_fl.l_type = nflp->nfl_type; + msg->lm_fl.l_pid = nlop->nlo_pid; + + error = nfs3_lockd_request(np, 0, &msgreq, flags, thd); + + nfs_lock_owner_clear_busy(nlop); + return (error); +} + +/* + * Send an NLM UNLOCK message to the server + */ +int +nfs3_unlock_rpc( + nfsnode_t np, + struct nfs_lock_owner *nlop, + __unused int type, + uint64_t start, + uint64_t end, + int flags, + thread_t thd, + kauth_cred_t cred) +{ + struct nfsmount *nmp; + LOCKD_MSG_REQUEST msgreq; + LOCKD_MSG *msg; + + nmp = NFSTONMP(np); + if (!nmp) + return (ENXIO); + + /* set up lock message request structure */ + bzero(&msgreq, sizeof(msgreq)); + msg = &msgreq.lmr_msg; + msg->lm_version = LOCKD_MSG_VERSION; + msg->lm_fh_len = (nmp->nm_vers == NFS_VER2) ? NFSX_V2FH : np->n_fhsize; + bcopy(np->n_fhp, msg->lm_fh, msg->lm_fh_len); + cru2x(cred, &msg->lm_cred); + + msg->lm_fl.l_whence = SEEK_SET; + msg->lm_fl.l_start = start; + msg->lm_fl.l_len = NFS_FLOCK_LENGTH(start, end); + msg->lm_fl.l_type = F_UNLCK; + msg->lm_fl.l_pid = nlop->nlo_pid; + + return (nfs3_lockd_request(np, F_UNLCK, &msgreq, flags, thd)); +} + +/* + * Send an NLM LOCK TEST message to the server + */ +int +nfs3_getlock_rpc( + nfsnode_t np, + struct nfs_lock_owner *nlop, + struct flock *fl, + uint64_t start, + uint64_t end, + vfs_context_t ctx) +{ + struct nfsmount *nmp; + int error; + LOCKD_MSG_REQUEST msgreq; + LOCKD_MSG *msg; + + nmp = NFSTONMP(np); + if (!nmp) + return (ENXIO); + + /* set up lock message request structure */ + bzero(&msgreq, sizeof(msgreq)); + msg = &msgreq.lmr_msg; + msg->lm_version = LOCKD_MSG_VERSION; + msg->lm_flags |= LOCKD_MSG_TEST; + msg->lm_fh_len = (nmp->nm_vers == NFS_VER2) ? 
NFSX_V2FH : np->n_fhsize; + bcopy(np->n_fhp, msg->lm_fh, msg->lm_fh_len); + cru2x(vfs_context_ucred(ctx), &msg->lm_cred); + + msg->lm_fl.l_whence = SEEK_SET; + msg->lm_fl.l_start = start; + msg->lm_fl.l_len = NFS_FLOCK_LENGTH(start, end); + msg->lm_fl.l_type = fl->l_type; + msg->lm_fl.l_pid = nlop->nlo_pid; + + error = nfs3_lockd_request(np, 0, &msgreq, 0, vfs_context_thread(ctx)); + + if (!error && (msg->lm_flags & LOCKD_MSG_TEST) && !msgreq.lmr_errno) { + if (msg->lm_fl.l_type != F_UNLCK) { + fl->l_type = msg->lm_fl.l_type; + fl->l_pid = msg->lm_fl.l_pid; + fl->l_start = msg->lm_fl.l_start; + fl->l_len = msg->lm_fl.l_len; + fl->l_whence = SEEK_SET; + } else + fl->l_type = F_UNLCK; + } + + return (error); +} + /* * nfslockdans -- * NFS advisory byte-level locks answer from the lock daemon. @@ -1105,3 +945,58 @@ nfslockdans(proc_t p, struct lockd_ans *ansp) return (0); } +/* + * nfslockdnotify -- + * NFS host restart notification from the lock daemon. + * + * Used to initiate reclaiming of held locks when a server we + * have mounted reboots. + */ +int +nfslockdnotify(proc_t p, user_addr_t argp) +{ + int error, i, headsize; + struct lockd_notify ln; + struct nfsmount *nmp; + struct sockaddr *saddr; + + /* Let root make this call. */ + error = proc_suser(p); + if (error) + return (error); + + headsize = (char*)&ln.ln_addr[0] - (char*)&ln.ln_version; + error = copyin(argp, &ln, headsize); + if (error) + return (error); + if (ln.ln_version != LOCKD_NOTIFY_VERSION) + return (EINVAL); + if ((ln.ln_addrcount < 1) || (ln.ln_addrcount > 128)) + return (EINVAL); + argp += headsize; + saddr = (struct sockaddr *)&ln.ln_addr[0]; + + lck_mtx_lock(nfs_lock_mutex); + + for (i=0; i < ln.ln_addrcount; i++) { + error = copyin(argp, &ln.ln_addr[0], sizeof(ln.ln_addr[0])); + if (error) + break; + argp += sizeof(ln.ln_addr[0]); + /* scan lockd mount list for match to this address */ + TAILQ_FOREACH(nmp, &nfs_lockd_mount_list, nm_ldlink) { + /* check if address matches this mount's server address */ + if (!nmp->nm_saddr || nfs_sockaddr_cmp(saddr, nmp->nm_saddr)) + continue; + /* We have a match! Mark it as needing recovery. */ + lck_mtx_lock(&nmp->nm_lock); + nfs_need_recover(nmp, 0); + lck_mtx_unlock(&nmp->nm_lock); + } + } + + lck_mtx_unlock(nfs_lock_mutex); + + return (error); +} + diff --git a/bsd/nfs/nfs_lock.h b/bsd/nfs/nfs_lock.h index 7bd4e91a8..5a5efe3e4 100644 --- a/bsd/nfs/nfs_lock.h +++ b/bsd/nfs/nfs_lock.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2002-2008 Apple Inc. All rights reserved. + * Copyright (c) 2002-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -91,6 +91,8 @@ typedef struct nfs_lock_msg { #define LOCKD_MSG_NFSV3 0x0004 /* NFSv3 request */ #define LOCKD_MSG_CANCEL 0x0008 /* cancelling blocked request */ #define LOCKD_MSG_DENIED_GRACE 0x0010 /* lock denied due to grace period */ +#define LOCKD_MSG_RECLAIM 0x0020 /* lock reclaim request */ +#define LOCKD_MSG_TCP 0x0040 /* (try to) use TCP for request */ /* The structure used to maintain the pending request queue */ typedef struct nfs_lock_msg_request { @@ -128,11 +130,26 @@ struct lockd_ans { #define LOCKD_ANS_DENIED_GRACE 0x0008 /* lock denied due to grace period */ +/* + * The structure that lockd hands the kernel for each notify. + */ +#define LOCKD_NOTIFY_VERSION 1 +struct lockd_notify { + int ln_version; /* lockd_notify version */ + int ln_flags; /* notify flags */ + int ln_pad; /* (for alignment) */ + int ln_addrcount; /* # of addresses */ + struct sockaddr_storage ln_addr[1]; /* List of addresses.
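/*
 * The header copyin in nfslockdnotify() above relies on lockd_notify being a
 * variable-length structure: the fixed part ends where ln_addr[] begins, so
 * its size is just the pointer difference (equivalently,
 * offsetof(struct lockd_notify, ln_addr), since ln_version is the first
 * member).  A sketch of how a user-space lockd might pack such a notification
 * (function name and buffer handling are illustrative assumptions):
 */
#include <stddef.h>
#include <string.h>

static size_t
pack_lockd_notify(char *buf, const struct sockaddr_storage *addrs, int count)
{
	struct lockd_notify ln;
	size_t headsize = offsetof(struct lockd_notify, ln_addr);

	memset(&ln, 0, sizeof(ln));
	ln.ln_version = LOCKD_NOTIFY_VERSION;
	ln.ln_addrcount = count;
	memcpy(buf, &ln, headsize);				/* fixed header */
	memcpy(buf + headsize, addrs, count * sizeof(addrs[0]));	/* address list */
	return (headsize + count * sizeof(addrs[0]));
}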
*/ +}; + + #ifdef KERNEL void nfs_lockinit(void); -void nfs_lockd_mount_change(int); -int nfs3_vnop_advlock(struct vnop_advlock_args *ap); +void nfs_lockd_mount_register(struct nfsmount *); +void nfs_lockd_mount_unregister(struct nfsmount *); +int nfs3_lockd_request(nfsnode_t, int, LOCKD_MSG_REQUEST *, int, thread_t); int nfslockdans(proc_t p, struct lockd_ans *ansp); +int nfslockdnotify(proc_t p, user_addr_t argp); #endif #endif /* __APPLE_API_PRIVATE */ diff --git a/bsd/nfs/nfs_node.c b/bsd/nfs/nfs_node.c index 7d1926787..b3f2a47b9 100644 --- a/bsd/nfs/nfs_node.c +++ b/bsd/nfs/nfs_node.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -75,6 +75,7 @@ #include <sys/vnode.h> #include <sys/ubc.h> #include <sys/malloc.h> +#include <sys/fcntl.h> #include <nfs/rpcv2.h> #include <nfs/nfsproto.h> @@ -145,6 +146,7 @@ nfs_nget( int fhsize, struct nfs_vattr *nvap, u_int64_t *xidp, + uint32_t auth, int flags, nfsnode_t *npp) { @@ -175,6 +177,21 @@ loop: if (mp != mp2 || np->n_fhsize != fhsize || bcmp(fhp, np->n_fhp, fhsize)) continue; + if (nvap && (nvap->nva_flags & NFS_FFLAG_TRIGGER_REFERRAL) && + cnp && (cnp->cn_namelen > (fhsize - (int)sizeof(dnp)))) { + /* The name was too long to fit in the file handle. Check it against the node's name. */ + int namecmp = 0; + const char *vname = vnode_getname(NFSTOV(np)); + if (vname) { + if (cnp->cn_namelen != (int)strlen(vname)) + namecmp = 1; + else + namecmp = strncmp(vname, cnp->cn_nameptr, cnp->cn_namelen); + vnode_putname(vname); + } + if (namecmp) /* full name didn't match */ + continue; + } FSDBG(263, dnp, np, np->n_flag, 0xcace0000); /* if the node is locked, sleep on it */ if ((np->n_hflag & NHLOCKED) && !(flags & NG_NOCREATE)) { @@ -246,10 +263,21 @@ loop: bzero(np, sizeof *np); np->n_hflag |= (NHINIT | NHLOCKED); np->n_mount = mp; + np->n_auth = auth; TAILQ_INIT(&np->n_opens); TAILQ_INIT(&np->n_lock_owners); TAILQ_INIT(&np->n_locks); np->n_dlink.tqe_next = NFSNOLIST; + np->n_dreturn.tqe_next = NFSNOLIST; + np->n_monlink.le_next = NFSNOLIST; + + /* ugh... 
need to keep track of ".zfs" directories to workaround server bugs */ + if ((nvap->nva_type == VDIR) && cnp && (cnp->cn_namelen == 4) && + (cnp->cn_nameptr[0] == '.') && (cnp->cn_nameptr[1] == 'z') && + (cnp->cn_nameptr[2] == 'f') && (cnp->cn_nameptr[3] == 's')) + np->n_flag |= NISDOTZFS; + if (dnp && (dnp->n_flag & NISDOTZFS)) + np->n_flag |= NISDOTZFSCHILD; if (dnp && cnp && ((cnp->cn_namelen != 2) || (cnp->cn_nameptr[0] != '.') || (cnp->cn_nameptr[1] != '.'))) { @@ -293,6 +321,8 @@ loop: lck_mtx_unlock(nfs_node_hash_mutex); /* do initial loading of attributes */ + NACLINVALIDATE(np); + NACCESSINVALIDATE(np); error = nfs_loadattrcache(np, nvap, xidp, 1); if (error) { FSDBG(266, 0, np, np->n_flag, 0xb1eb1e); @@ -325,7 +355,6 @@ loop: NFS_CHANGED_UPDATE(nfsvers, np, nvap); if (nvap->nva_type == VDIR) NFS_CHANGED_UPDATE_NC(nfsvers, np, nvap); - NMODEINVALIDATE(np); /* now, attempt to get a new vnode */ vfsp.vnfs_mp = mp; @@ -363,7 +392,21 @@ loop: if (!dnp || !cnp || !(flags & NG_MAKEENTRY)) vfsp.vnfs_flags |= VNFS_NOCACHE; - error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &np->n_vnode); +#if CONFIG_TRIGGERS + if ((nfsvers >= NFS_VER4) && (nvap->nva_type == VDIR) && (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER)) { + struct vnode_trigger_param vtp; + bzero(&vtp, sizeof(vtp)); + bcopy(&vfsp, &vtp.vnt_params, sizeof(vfsp)); + vtp.vnt_resolve_func = nfs_mirror_mount_trigger_resolve; + vtp.vnt_unresolve_func = nfs_mirror_mount_trigger_unresolve; + vtp.vnt_rearm_func = nfs_mirror_mount_trigger_rearm; + vtp.vnt_flags = VNT_AUTO_REARM; + error = vnode_create(VNCREATE_TRIGGER, VNCREATE_TRIGGER_SIZE, &vtp, &np->n_vnode); + } else +#endif + { + error = vnode_create(VNCREATE_FLAVOR, VCREATESIZE, &vfsp, &np->n_vnode); + } if (error) { FSDBG(266, 0, np, np->n_flag, 0xb1eb1e); nfs_node_unlock(np); @@ -425,57 +468,58 @@ nfs_vnop_inactive(ap) nfsnode_t np = VTONFS(ap->a_vp); struct nfs_sillyrename *nsp; struct nfs_vattr nvattr; - int unhash, attrerr, busyerror, error, inuse, busied; + int unhash, attrerr, busyerror, error, inuse, busied, force; struct nfs_open_file *nofp; - const char *vname = NULL; struct componentname cn; struct nfsmount *nmp = NFSTONMP(np); + mount_t mp = vnode_mount(vp); restart: + force = (!mp || (mp->mnt_kern_flag & MNTK_FRCUNMOUNT)); error = 0; - inuse = ((nmp->nm_vers >= NFS_VER4) && (nfs_mount_state_in_use_start(nmp) == 0)); + inuse = (nfs_mount_state_in_use_start(nmp, NULL) == 0); /* There shouldn't be any open or lock state at this point */ lck_mtx_lock(&np->n_openlock); - if (np->n_openrefcnt) { - vname = vnode_getname(vp); - printf("nfs_vnop_inactive: still open: %d %s\n", np->n_openrefcnt, vname ? vname : "//"); - } + if (np->n_openrefcnt && !force) + NP(np, "nfs_vnop_inactive: still open: %d", np->n_openrefcnt); TAILQ_FOREACH(nofp, &np->n_opens, nof_link) { lck_mtx_lock(&nofp->nof_lock); if (nofp->nof_flags & NFS_OPEN_FILE_BUSY) { - if (!vname) - vname = vnode_getname(vp); - printf("nfs_vnop_inactive: open file busy: %s\n", vname ? vname : "//"); + if (!force) + NP(np, "nfs_vnop_inactive: open file busy"); busied = 0; } else { nofp->nof_flags |= NFS_OPEN_FILE_BUSY; busied = 1; } lck_mtx_unlock(&nofp->nof_lock); + if ((np->n_flag & NREVOKE) || (nofp->nof_flags & NFS_OPEN_FILE_LOST)) { + if (busied) + nfs_open_file_clear_busy(nofp); + continue; + } /* * If we just created the file, we already had it open in * anticipation of getting a subsequent open call. If the * node has gone inactive without being open, we need to * clean up (close) the open done in the create. 
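/*
 * The unrolled ".zfs" byte test above is equivalent to a bounded string
 * compare; a more readable form of the same check (the helper name is an
 * illustrative assumption):
 */
static int
is_dot_zfs(struct componentname *cnp)
{
	return ((cnp != NULL) && (cnp->cn_namelen == 4) &&
	    (strncmp(cnp->cn_nameptr, ".zfs", 4) == 0));
}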
*/ - if ((nofp->nof_flags & NFS_OPEN_FILE_CREATE) && nofp->nof_creator) { + if ((nofp->nof_flags & NFS_OPEN_FILE_CREATE) && nofp->nof_creator && !force) { if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) { lck_mtx_unlock(&np->n_openlock); if (busied) nfs_open_file_clear_busy(nofp); if (inuse) nfs_mount_state_in_use_end(nmp, 0); - nfs4_reopen(nofp, vfs_context_thread(ctx)); - goto restart; + if (!nfs4_reopen(nofp, NULL)) + goto restart; } nofp->nof_flags &= ~NFS_OPEN_FILE_CREATE; lck_mtx_unlock(&np->n_openlock); - error = nfs4_close(np, nofp, NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_NONE, ctx); + error = nfs_close(np, nofp, NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_NONE, ctx); if (error) { - if (!vname) - vname = vnode_getname(vp); - printf("nfs_vnop_inactive: create close error: %d, %s\n", error, vname); + NP(np, "nfs_vnop_inactive: create close error: %d", error); nofp->nof_flags |= NFS_OPEN_FILE_CREATE; } if (busied) @@ -495,21 +539,19 @@ restart: nofp->nof_r--; nofp->nof_opencnt--; nofp->nof_access = 0; - } else { + } else if (!force) { lck_mtx_unlock(&np->n_openlock); if (nofp->nof_flags & NFS_OPEN_FILE_REOPEN) { if (busied) nfs_open_file_clear_busy(nofp); if (inuse) nfs_mount_state_in_use_end(nmp, 0); - nfs4_reopen(nofp, vfs_context_thread(ctx)); - goto restart; + if (!nfs4_reopen(nofp, NULL)) + goto restart; } - error = nfs4_close(np, nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, ctx); + error = nfs_close(np, nofp, NFS_OPEN_SHARE_ACCESS_READ, NFS_OPEN_SHARE_DENY_NONE, ctx); if (error) { - if (!vname) - vname = vnode_getname(vp); - printf("nfs_vnop_inactive: need close error: %d, %s\n", error, vname); + NP(np, "nfs_vnop_inactive: need close error: %d", error); nofp->nof_flags |= NFS_OPEN_FILE_NEEDCLOSE; } if (busied) @@ -519,32 +561,33 @@ restart: goto restart; } } - if (nofp->nof_opencnt) { - if (!vname) - vname = vnode_getname(vp); - printf("nfs_vnop_inactive: file still open: %d %s\n", nofp->nof_opencnt, vname ? vname : "//"); - } - if (nofp->nof_access || nofp->nof_deny || + if (nofp->nof_opencnt && !force) + NP(np, "nfs_vnop_inactive: file still open: %d", nofp->nof_opencnt); + if (!force && (nofp->nof_access || nofp->nof_deny || nofp->nof_mmap_access || nofp->nof_mmap_deny || nofp->nof_r || nofp->nof_w || nofp->nof_rw || nofp->nof_r_dw || nofp->nof_w_dw || nofp->nof_rw_dw || - nofp->nof_r_drw || nofp->nof_w_drw || nofp->nof_rw_drw) { - if (!vname) - vname = vnode_getname(vp); - printf("nfs_vnop_inactive: non-zero access: %d %d %d %d # %u %u %u dw %u %u %u drw %u %u %u %s\n", + nofp->nof_r_drw || nofp->nof_w_drw || nofp->nof_rw_drw || + nofp->nof_d_r || nofp->nof_d_w || nofp->nof_d_rw || + nofp->nof_d_r_dw || nofp->nof_d_w_dw || nofp->nof_d_rw_dw || + nofp->nof_d_r_drw || nofp->nof_d_w_drw || nofp->nof_d_rw_drw)) { + NP(np, "nfs_vnop_inactive: non-zero access: %d %d %d %d # %u.%u %u.%u %u.%u dw %u.%u %u.%u %u.%u drw %u.%u %u.%u %u.%u", nofp->nof_access, nofp->nof_deny, nofp->nof_mmap_access, nofp->nof_mmap_deny, - nofp->nof_r, nofp->nof_w, nofp->nof_rw, - nofp->nof_r_dw, nofp->nof_w_dw, nofp->nof_rw_dw, - nofp->nof_r_drw, nofp->nof_w_drw, nofp->nof_rw_drw, - vname ? 
vname : "//"); + nofp->nof_r, nofp->nof_d_r, + nofp->nof_w, nofp->nof_d_w, + nofp->nof_rw, nofp->nof_d_rw, + nofp->nof_r_dw, nofp->nof_d_r_dw, + nofp->nof_w_dw, nofp->nof_d_w_dw, + nofp->nof_rw_dw, nofp->nof_d_rw_dw, + nofp->nof_r_drw, nofp->nof_d_r_drw, + nofp->nof_w_drw, nofp->nof_d_w_drw, + nofp->nof_rw_drw, nofp->nof_d_rw_drw); } if (busied) nfs_open_file_clear_busy(nofp); } lck_mtx_unlock(&np->n_openlock); - if (vname) - vnode_putname(vname); if (inuse && nfs_mount_state_in_use_end(nmp, error)) goto restart; @@ -673,42 +716,59 @@ nfs_vnop_reclaim(ap) struct nfs_open_file *nofp, *nextnofp; struct nfs_file_lock *nflp, *nextnflp; struct nfs_lock_owner *nlop, *nextnlop; - const char *vname = NULL; struct nfsmount *nmp = np->n_mount ? VFSTONFS(np->n_mount) : NFSTONMP(np); + mount_t mp = vnode_mount(vp); + int force; FSDBG_TOP(265, vp, np, np->n_flag, 0); + force = (!mp || (mp->mnt_kern_flag & MNTK_FRCUNMOUNT)); /* There shouldn't be any open or lock state at this point */ lck_mtx_lock(&np->n_openlock); if (nmp && (nmp->nm_vers >= NFS_VER4)) { /* need to drop a delegation */ + if (np->n_dreturn.tqe_next != NFSNOLIST) { + /* remove this node from the delegation return list */ + lck_mtx_lock(&nmp->nm_lock); + if (np->n_dreturn.tqe_next != NFSNOLIST) { + TAILQ_REMOVE(&nmp->nm_dreturnq, np, n_dreturn); + np->n_dreturn.tqe_next = NFSNOLIST; + } + lck_mtx_unlock(&nmp->nm_lock); + } if (np->n_dlink.tqe_next != NFSNOLIST) { - /* remove this node from the recall list */ + /* remove this node from the delegation list */ lck_mtx_lock(&nmp->nm_lock); if (np->n_dlink.tqe_next != NFSNOLIST) { - TAILQ_REMOVE(&nmp->nm_recallq, np, n_dlink); + TAILQ_REMOVE(&nmp->nm_delegations, np, n_dlink); np->n_dlink.tqe_next = NFSNOLIST; } lck_mtx_unlock(&nmp->nm_lock); } - if (np->n_openflags & N_DELEG_MASK) { + if ((np->n_openflags & N_DELEG_MASK) && !force) { + /* try to return the delegation */ np->n_openflags &= ~N_DELEG_MASK; nfs4_delegreturn_rpc(nmp, np->n_fhp, np->n_fhsize, &np->n_dstateid, - vfs_context_thread(ctx), vfs_context_ucred(ctx)); + R_RECOVER, vfs_context_thread(ctx), vfs_context_ucred(ctx)); + } + if (np->n_attrdirfh) { + FREE(np->n_attrdirfh, M_TEMP); + np->n_attrdirfh = NULL; } } /* clean up file locks */ TAILQ_FOREACH_SAFE(nflp, &np->n_locks, nfl_link, nextnflp) { - if (!(nflp->nfl_flags & NFS_FILE_LOCK_DEAD)) { - if (!vname) - vname = vnode_getname(vp); - printf("nfs_vnop_reclaim: lock 0x%llx 0x%llx 0x%x (bc %d) %s\n", - nflp->nfl_start, nflp->nfl_end, nflp->nfl_flags, - nflp->nfl_blockcnt, vname ? 
vname : "//"); + if (!(nflp->nfl_flags & NFS_FILE_LOCK_DEAD) && !force) { + NP(np, "nfs_vnop_reclaim: lock 0x%llx 0x%llx 0x%x (bc %d)", + nflp->nfl_start, nflp->nfl_end, nflp->nfl_flags, nflp->nfl_blockcnt); } - if (!(nflp->nfl_flags & NFS_FILE_LOCK_BLOCKED)) { + if (!(nflp->nfl_flags & (NFS_FILE_LOCK_BLOCKED|NFS_FILE_LOCK_DEAD))) { + /* try sending an unlock RPC if it wasn't delegated */ + if (!(nflp->nfl_flags & NFS_FILE_LOCK_DELEGATED) && !force) + nmp->nm_funcs->nf_unlock_rpc(np, nflp->nfl_owner, F_WRLCK, nflp->nfl_start, nflp->nfl_end, R_RECOVER, + NULL, nflp->nfl_owner->nlo_open_owner->noo_cred); lck_mtx_lock(&nflp->nfl_owner->nlo_lock); TAILQ_REMOVE(&nflp->nfl_owner->nlo_locks, nflp, nfl_lolink); lck_mtx_unlock(&nflp->nfl_owner->nlo_lock); @@ -718,72 +778,79 @@ nfs_vnop_reclaim(ap) } /* clean up lock owners */ TAILQ_FOREACH_SAFE(nlop, &np->n_lock_owners, nlo_link, nextnlop) { - if (!TAILQ_EMPTY(&nlop->nlo_locks)) { - if (!vname) - vname = vnode_getname(vp); - printf("nfs_vnop_reclaim: lock owner with locks %s\n", - vname ? vname : "//"); - } + if (!TAILQ_EMPTY(&nlop->nlo_locks) && !force) + NP(np, "nfs_vnop_reclaim: lock owner with locks"); TAILQ_REMOVE(&np->n_lock_owners, nlop, nlo_link); nfs_lock_owner_destroy(nlop); } /* clean up open state */ - if (np->n_openrefcnt) { - if (!vname) - vname = vnode_getname(vp); - printf("nfs_vnop_reclaim: still open: %d %s\n", - np->n_openrefcnt, vname ? vname : "//"); - } + if (np->n_openrefcnt && !force) + NP(np, "nfs_vnop_reclaim: still open: %d", np->n_openrefcnt); TAILQ_FOREACH_SAFE(nofp, &np->n_opens, nof_link, nextnofp) { - if (nofp->nof_flags & NFS_OPEN_FILE_BUSY) { - if (!vname) - vname = vnode_getname(vp); - printf("nfs_vnop_reclaim: open file busy: %s\n", - vname ? vname : "//"); - } - if (nofp->nof_opencnt) { - if (!vname) - vname = vnode_getname(vp); - printf("nfs_vnop_reclaim: file still open: %d %s\n", - nofp->nof_opencnt, vname ? vname : "//"); - } - if (nofp->nof_access || nofp->nof_deny || - nofp->nof_mmap_access || nofp->nof_mmap_deny || - nofp->nof_r || nofp->nof_w || nofp->nof_rw || - nofp->nof_r_dw || nofp->nof_w_dw || nofp->nof_rw_dw || - nofp->nof_r_drw || nofp->nof_w_drw || nofp->nof_rw_drw) { - if (!vname) - vname = vnode_getname(vp); - printf("nfs_vnop_reclaim: non-zero access: %d %d %d %d # %u %u %u dw %u %u %u drw %u %u %u %s\n", - nofp->nof_access, nofp->nof_deny, - nofp->nof_mmap_access, nofp->nof_mmap_deny, - nofp->nof_r, nofp->nof_w, nofp->nof_rw, - nofp->nof_r_dw, nofp->nof_w_dw, nofp->nof_rw_dw, - nofp->nof_r_drw, nofp->nof_w_drw, nofp->nof_rw_drw, - vname ? 
vname : "//"); + if (nofp->nof_flags & NFS_OPEN_FILE_BUSY) + NP(np, "nfs_vnop_reclaim: open file busy"); + if (!(np->n_flag & NREVOKE) && !(nofp->nof_flags & NFS_OPEN_FILE_LOST)) { + if (nofp->nof_opencnt && !force) + NP(np, "nfs_vnop_reclaim: file still open: %d", nofp->nof_opencnt); + if (!force && (nofp->nof_access || nofp->nof_deny || + nofp->nof_mmap_access || nofp->nof_mmap_deny || + nofp->nof_r || nofp->nof_w || nofp->nof_rw || + nofp->nof_r_dw || nofp->nof_w_dw || nofp->nof_rw_dw || + nofp->nof_r_drw || nofp->nof_w_drw || nofp->nof_rw_drw || + nofp->nof_d_r || nofp->nof_d_w || nofp->nof_d_rw || + nofp->nof_d_r_dw || nofp->nof_d_w_dw || nofp->nof_d_rw_dw || + nofp->nof_d_r_drw || nofp->nof_d_w_drw || nofp->nof_d_rw_drw)) { + NP(np, "nfs_vnop_reclaim: non-zero access: %d %d %d %d # %u.%u %u.%u %u.%u dw %u.%u %u.%u %u.%u drw %u.%u %u.%u %u.%u", + nofp->nof_access, nofp->nof_deny, + nofp->nof_mmap_access, nofp->nof_mmap_deny, + nofp->nof_r, nofp->nof_d_r, + nofp->nof_w, nofp->nof_d_w, + nofp->nof_rw, nofp->nof_d_rw, + nofp->nof_r_dw, nofp->nof_d_r_dw, + nofp->nof_w_dw, nofp->nof_d_w_dw, + nofp->nof_rw_dw, nofp->nof_d_rw_dw, + nofp->nof_r_drw, nofp->nof_d_r_drw, + nofp->nof_w_drw, nofp->nof_d_w_drw, + nofp->nof_rw_drw, nofp->nof_d_rw_drw); + /* try sending a close RPC if it wasn't delegated */ + if (nofp->nof_r || nofp->nof_w || nofp->nof_rw || + nofp->nof_r_dw || nofp->nof_w_dw || nofp->nof_rw_dw || + nofp->nof_r_drw || nofp->nof_w_drw || nofp->nof_rw_drw) + nfs4_close_rpc(np, nofp, NULL, nofp->nof_owner->noo_cred, R_RECOVER); + } } TAILQ_REMOVE(&np->n_opens, nofp, nof_link); nfs_open_file_destroy(nofp); } lck_mtx_unlock(&np->n_openlock); - lck_mtx_lock(nfs_buf_mutex); - if (!LIST_EMPTY(&np->n_dirtyblkhd) || !LIST_EMPTY(&np->n_cleanblkhd)) { - if (!vname) - vname = vnode_getname(vp); - printf("nfs_reclaim: dropping %s buffers for file %s\n", - (!LIST_EMPTY(&np->n_dirtyblkhd) ? "dirty" : "clean"), - (vname ? vname : "//")); + if (np->n_monlink.le_next != NFSNOLIST) { + /* Wait for any in-progress getattr to complete, */ + /* then remove this node from the monitored node list. */ + lck_mtx_lock(&nmp->nm_lock); + while (np->n_mflag & NMMONSCANINPROG) { + struct timespec ts = { 1, 0 }; + np->n_mflag |= NMMONSCANWANT; + msleep(&np->n_mflag, &nmp->nm_lock, PZERO-1, "nfswaitmonscan", &ts); + } + if (np->n_monlink.le_next != NFSNOLIST) { + LIST_REMOVE(np, n_monlink); + np->n_monlink.le_next = NFSNOLIST; + } + lck_mtx_unlock(&nmp->nm_lock); } + + lck_mtx_lock(nfs_buf_mutex); + if (!force && (!LIST_EMPTY(&np->n_dirtyblkhd) || !LIST_EMPTY(&np->n_cleanblkhd))) + NP(np, "nfs_reclaim: dropping %s buffers", (!LIST_EMPTY(&np->n_dirtyblkhd) ? 
"dirty" : "clean")); lck_mtx_unlock(nfs_buf_mutex); - if (vname) - vnode_putname(vname); nfs_vinvalbuf(vp, V_IGNORE_WRITEERR, ap->a_context, 0); lck_mtx_lock(nfs_node_hash_mutex); if ((vnode_vtype(vp) != VDIR) && np->n_sillyrename) { - printf("nfs_reclaim: leaving unlinked file %s\n", np->n_sillyrename->nsr_name); + if (!force) + NP(np, "nfs_reclaim: leaving unlinked file %s", np->n_sillyrename->nsr_name); if (np->n_sillyrename->nsr_cred != NOCRED) kauth_cred_unref(&np->n_sillyrename->nsr_cred); vnode_rele(NFSTOV(np->n_sillyrename->nsr_dnp)); @@ -808,6 +875,8 @@ nfs_vnop_reclaim(ap) FREE_ZONE(np->n_cookiecache, sizeof(struct nfsdmap), M_NFSDIROFF); if (np->n_fhsize > NFS_SMALLFH) FREE_ZONE(np->n_fhp, np->n_fhsize, M_NFSBIGFH); + if (np->n_vattr.nva_acl) + kauth_acl_free(np->n_vattr.nva_acl); nfs_node_unlock(np); vnode_clearfsnode(vp); diff --git a/bsd/nfs/nfs_serv.c b/bsd/nfs/nfs_serv.c index e224a921d..956cc9285 100644 --- a/bsd/nfs/nfs_serv.c +++ b/bsd/nfs/nfs_serv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -89,6 +89,8 @@ #include <sys/vm.h> #include <sys/vmparam.h> +#include <netinet/in.h> + #include <nfs/nfsproto.h> #include <nfs/rpcv2.h> #include <nfs/nfs.h> @@ -114,6 +116,7 @@ lck_grp_t *nfsrv_slp_mutex_group; struct nfsrv_sockhead nfsrv_socklist, nfsrv_deadsocklist, nfsrv_sockwg, nfsrv_sockwait, nfsrv_sockwork; struct nfsrv_sock *nfsrv_udpsock = NULL; +struct nfsrv_sock *nfsrv_udp6sock = NULL; /* NFS exports */ struct nfsrv_expfs_list nfsrv_exports; @@ -232,6 +235,7 @@ nfsrv_init(void) TAILQ_INIT(&nfsd_head); TAILQ_INIT(&nfsd_queue); nfsrv_udpsock = NULL; + nfsrv_udp6sock = NULL; /* initialization complete */ nfsrv_initted = NFSRV_INITIALIZED; @@ -312,15 +316,7 @@ nfsrv_access( * obtain good performance in the optimistic mode. */ if (nfsmode & NFS_ACCESS_READ) { - if (vnode_isdir(vp)) { - testaction = - KAUTH_VNODE_LIST_DIRECTORY | - KAUTH_VNODE_READ_EXTATTRIBUTES; - } else { - testaction = - KAUTH_VNODE_READ_DATA | - KAUTH_VNODE_READ_EXTATTRIBUTES; - } + testaction = vnode_isdir(vp) ? KAUTH_VNODE_LIST_DIRECTORY : KAUTH_VNODE_READ_DATA; if (nfsrv_authorize(vp, NULL, testaction, ctx, nxo, 0)) nfsmode &= ~NFS_ACCESS_READ; } @@ -617,6 +613,9 @@ nfsrv_lookup( nfsmerr_if(error); ni.ni_cnd.cn_nameiop = LOOKUP; +#if CONFIG_TRIGGERS + ni.ni_op = OP_LOOKUP; +#endif ni.ni_cnd.cn_flags = LOCKLEAF; error = nfsm_chain_get_path_namei(nmreq, len, &ni); isdotdot = ((len == 2) && (ni.ni_cnd.cn_pnbuf[0] == '.') && (ni.ni_cnd.cn_pnbuf[1] == '.')); @@ -1052,10 +1051,12 @@ again: * entry and free it. */ LIST_FOREACH_SAFE(fp, &firehead, fm_link, nfp) { - if (nfsrv_fsevents_enabled) + if (nfsrv_fsevents_enabled) { + fp->fm_context.vc_thread = current_thread(); add_fsevent(FSE_CONTENT_MODIFIED, &fp->fm_context, FSE_ARG_VNODE, fp->fm_vp, FSE_ARG_DONE); + } vnode_put(fp->fm_vp); kauth_cred_unref(&fp->fm_context.vc_ucred); LIST_REMOVE(fp, fm_link); @@ -1829,10 +1830,6 @@ nfsrv_create( ni.ni_cnd.cn_nameiop = 0; rdev = 0; - /* - * Save the original credential UID in case they are - * mapped and we need to map the IDs in the attributes. 
- */ saved_uid = kauth_cred_getuid(nd->nd_cr); nfsm_chain_get_fh_ptr(error, nmreq, nd->nd_vers, nfh.nfh_fhp, nfh.nfh_len); @@ -1841,6 +1838,9 @@ nfsrv_create( nfsmerr_if(error); ni.ni_cnd.cn_nameiop = CREATE; +#if CONFIG_TRIGGERS + ni.ni_op = OP_LINK; +#endif ni.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; error = nfsm_chain_get_path_namei(nmreq, len, &ni); if (!error) { @@ -1923,17 +1923,6 @@ nfsrv_create( if (vp == NULL) { kauth_acl_t xacl = NULL; - /* - * If the credentials were mapped, we should - * map the same values in the attributes. - */ - if ((vap->va_uid == saved_uid) && (kauth_cred_getuid(nd->nd_cr) != saved_uid)) { - int ismember; - VATTR_SET(vap, va_uid, kauth_cred_getuid(nd->nd_cr)); - if (kauth_cred_ismember_gid(nd->nd_cr, vap->va_gid, &ismember) || !ismember) - VATTR_SET(vap, va_gid, kauth_cred_getgid(nd->nd_cr)); - } - /* authorize before creating */ error = nfsrv_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx, nxo, 0); @@ -1950,20 +1939,17 @@ } VATTR_CLEAR_ACTIVE(vap, va_data_size); VATTR_CLEAR_ACTIVE(vap, va_access_time); + /* + * Server policy is to always use the mapped rpc credential for + * file system object creation. This has the nice side effect of + * enforcing BSD creation semantics + */ + VATTR_CLEAR_ACTIVE(vap, va_uid); + VATTR_CLEAR_ACTIVE(vap, va_gid); /* validate new-file security information */ - if (!error) { + if (!error) error = vnode_authattr_new(dvp, vap, 0, ctx); - if (error && (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid))) { - /* - * Most NFS servers just ignore the UID/GID attributes, so we - * try ignoring them if that'll help the request succeed. - */ - VATTR_CLEAR_ACTIVE(vap, va_uid); - VATTR_CLEAR_ACTIVE(vap, va_gid); - error = vnode_authattr_new(dvp, vap, 0, ctx); - } - } if (vap->va_type == VREG || vap->va_type == VSOCK) { @@ -2024,6 +2010,9 @@ vp = NULL; } ni.ni_cnd.cn_nameiop = LOOKUP; +#if CONFIG_TRIGGERS + ni.ni_op = OP_LOOKUP; +#endif ni.ni_cnd.cn_flags &= ~LOCKPARENT; ni.ni_cnd.cn_context = ctx; ni.ni_startdir = dvp; @@ -2168,10 +2157,6 @@ nfsrv_mknod( vp = dvp = dirp = NULL; ni.ni_cnd.cn_nameiop = 0; - /* - * Save the original credential UID in case they are - * mapped and we need to map the IDs in the attributes. - */ saved_uid = kauth_cred_getuid(nd->nd_cr); nfsm_chain_get_fh_ptr(error, nmreq, NFS_VER3, nfh.nfh_fhp, nfh.nfh_len); @@ -2180,6 +2165,9 @@ nfsrv_mknod( nfsmerr_if(error); ni.ni_cnd.cn_nameiop = CREATE; +#if CONFIG_TRIGGERS + ni.ni_op = OP_LINK; +#endif ni.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; error = nfsm_chain_get_path_namei(nmreq, len, &ni); if (!error) { @@ -2231,17 +2219,6 @@ } VATTR_SET(vap, va_type, vtyp); - /* - * If the credentials were mapped, we should - * map the same values in the attributes. - */ - if ((vap->va_uid == saved_uid) && (kauth_cred_getuid(nd->nd_cr) != saved_uid)) { - int ismember; - VATTR_SET(vap, va_uid, kauth_cred_getuid(nd->nd_cr)); - if (kauth_cred_ismember_gid(nd->nd_cr, vap->va_gid, &ismember) || !ismember) - VATTR_SET(vap, va_gid, kauth_cred_getgid(nd->nd_cr)); - } - /* authorize before creating */ error = nfsrv_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx, nxo, 0); @@ -2258,20 +2235,18 @@ } VATTR_CLEAR_ACTIVE(vap, va_data_size); VATTR_CLEAR_ACTIVE(vap, va_access_time); + /* + * Server policy is to always use the mapped rpc credential for + * file system object creation.
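/*
 * The VATTR_CLEAR_ACTIVE() calls above work because a vnode_attr pairs each
 * value with an "active" bit; a cleared bit means "no opinion", so the file
 * system derives ownership from the (mapped) RPC credential instead of any
 * client-supplied value.  A reduced sketch of the active-bit pattern (the
 * names here are illustrative assumptions, not xnu's):
 */
struct attr_sketch {
	unsigned int	va_active;	/* which fields below are meaningful */
	uid_t		va_uid;
	gid_t		va_gid;
};
#define AS_UID	0x1
#define AS_GID	0x2
#define AS_SET_UID(a, u)	do { (a)->va_uid = (u); (a)->va_active |= AS_UID; } while (0)
#define AS_CLEAR(a, bits)	((a)->va_active &= ~(bits))
/* dropping any client-supplied identity before creation: */
/*	AS_CLEAR(&va, AS_UID | AS_GID);	*/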
This has the nice side effect of + * enforcing BSD creation semantics + */ + VATTR_CLEAR_ACTIVE(vap, va_uid); + VATTR_CLEAR_ACTIVE(vap, va_gid); /* validate new-file security information */ - if (!error) { + if (!error) error = vnode_authattr_new(dvp, vap, 0, ctx); - if (error && (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid))) { - /* - * Most NFS servers just ignore the UID/GID attributes, so we - * try ignoring them if that'll help the request succeed. - */ - VATTR_CLEAR_ACTIVE(vap, va_uid); - VATTR_CLEAR_ACTIVE(vap, va_gid); - error = vnode_authattr_new(dvp, vap, 0, ctx); - } - } + if (error) goto out1; @@ -2295,6 +2270,9 @@ nfsrv_mknod( vp = NULL; } ni.ni_cnd.cn_nameiop = LOOKUP; +#if CONFIG_TRIGGERS + ni.ni_op = OP_LOOKUP; +#endif ni.ni_cnd.cn_flags &= ~LOCKPARENT; ni.ni_cnd.cn_context = vfs_context_current(); ni.ni_startdir = dvp; @@ -2416,6 +2394,9 @@ nfsrv_remove( nfsmerr_if(error); ni.ni_cnd.cn_nameiop = DELETE; +#if CONFIG_TRIGGERS + ni.ni_op = OP_UNLINK; +#endif ni.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; error = nfsm_chain_get_path_namei(nmreq, len, &ni); if (!error) { @@ -2596,6 +2577,9 @@ nfsrv_rename( kauth_cred_ref(saved_cred); retry: fromni.ni_cnd.cn_nameiop = DELETE; +#if CONFIG_TRIGGERS + fromni.ni_op = OP_UNLINK; +#endif fromni.ni_cnd.cn_flags = WANTPARENT; fromni.ni_cnd.cn_pnbuf = frompath; @@ -2628,6 +2612,9 @@ retry: } toni.ni_cnd.cn_nameiop = RENAME; +#if CONFIG_TRIGGERS + toni.ni_op = OP_RENAME; +#endif toni.ni_cnd.cn_flags = WANTPARENT; toni.ni_cnd.cn_pnbuf = topath; @@ -3175,6 +3162,9 @@ nfsrv_link( goto out; ni.ni_cnd.cn_nameiop = CREATE; +#if CONFIG_TRIGGERS + ni.ni_op = OP_LINK; +#endif ni.ni_cnd.cn_flags = LOCKPARENT; error = nfsm_chain_get_path_namei(nmreq, len, &ni); if (!error) @@ -3307,10 +3297,6 @@ nfsrv_symlink( linkdata = NULL; dirp = NULL; - /* - * Save the original credential UID in case they are - * mapped and we need to map the IDs in the attributes. - */ saved_uid = kauth_cred_getuid(nd->nd_cr); ni.ni_cnd.cn_nameiop = 0; @@ -3322,6 +3308,9 @@ nfsrv_symlink( nfsmerr_if(error); ni.ni_cnd.cn_nameiop = CREATE; +#if CONFIG_TRIGGERS + ni.ni_op = OP_LINK; +#endif ni.ni_cnd.cn_flags = LOCKPARENT; error = nfsm_chain_get_path_namei(nmreq, len, &ni); if (!error) { @@ -3377,42 +3366,33 @@ nfsrv_symlink( goto out; } - /* - * If the credentials were mapped, we should - * map the same values in the attributes. - */ - if ((vap->va_uid == saved_uid) && (kauth_cred_getuid(nd->nd_cr) != saved_uid)) { - int ismember; - VATTR_SET(vap, va_uid, kauth_cred_getuid(nd->nd_cr)); - if (kauth_cred_ismember_gid(nd->nd_cr, vap->va_gid, &ismember) || !ismember) - VATTR_SET(vap, va_gid, kauth_cred_getgid(nd->nd_cr)); - } VATTR_SET(vap, va_type, VLNK); VATTR_CLEAR_ACTIVE(vap, va_data_size); VATTR_CLEAR_ACTIVE(vap, va_access_time); + /* + * Server policy is to always use the mapped rpc credential for + * file system object creation. This has the nice side effect of + * enforcing BSD creation semantics + */ + VATTR_CLEAR_ACTIVE(vap, va_uid); + VATTR_CLEAR_ACTIVE(vap, va_gid); /* authorize before creating */ error = nfsrv_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx, nxo, 0); /* validate given attributes */ - if (!error) { + if (!error) error = vnode_authattr_new(dvp, vap, 0, ctx); - if (error && (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid))) { - /* - * Most NFS servers just ignore the UID/GID attributes, so we - * try ignoring them if that'll help the request succeed.
- */ - VATTR_CLEAR_ACTIVE(vap, va_uid); - VATTR_CLEAR_ACTIVE(vap, va_gid); - error = vnode_authattr_new(dvp, vap, 0, ctx); - } - } + if (!error) error = VNOP_SYMLINK(dvp, &vp, &ni.ni_cnd, vap, linkdata, ctx); if (!error && (nd->nd_vers == NFS_VER3)) { if (vp == NULL) { ni.ni_cnd.cn_nameiop = LOOKUP; +#if CONFIG_TRIGGERS + ni.ni_op = OP_LOOKUP; +#endif ni.ni_cnd.cn_flags &= ~(LOCKPARENT | FOLLOW); ni.ni_cnd.cn_flags |= (NOFOLLOW | LOCKLEAF); ni.ni_cnd.cn_context = ctx; @@ -3508,6 +3488,7 @@ nfsmout: /* * nfs mkdir service */ + int nfsrv_mkdir( struct nfsrv_descript *nd, @@ -3533,10 +3514,6 @@ nfsrv_mkdir( nmreq = &nd->nd_nmreq; nfsm_chain_null(&nmrep); - /* - * Save the original credential UID in case they are - * mapped and we need to map the IDs in the attributes. - */ saved_uid = kauth_cred_getuid(nd->nd_cr); ni.ni_cnd.cn_nameiop = 0; @@ -3548,6 +3525,9 @@ nfsrv_mkdir( nfsmerr_if(error); ni.ni_cnd.cn_nameiop = CREATE; +#if CONFIG_TRIGGERS + ni.ni_op = OP_LINK; +#endif ni.ni_cnd.cn_flags = LOCKPARENT; error = nfsm_chain_get_path_namei(nmreq, len, &ni); if (!error) { @@ -3593,17 +3573,6 @@ nfsrv_mkdir( goto out; } - /* - * If the credentials were mapped, we should - * map the same values in the attributes. - */ - if ((vap->va_uid == saved_uid) && (kauth_cred_getuid(nd->nd_cr) != saved_uid)) { - int ismember; - VATTR_SET(vap, va_uid, kauth_cred_getuid(nd->nd_cr)); - if (kauth_cred_ismember_gid(nd->nd_cr, vap->va_gid, &ismember) || !ismember) - VATTR_SET(vap, va_gid, kauth_cred_getgid(nd->nd_cr)); - } - error = nfsrv_authorize(dvp, NULL, KAUTH_VNODE_ADD_SUBDIRECTORY, ctx, nxo, 0); /* construct ACL and handle inheritance */ @@ -3617,22 +3586,33 @@ if (!error && xacl != NULL) VATTR_SET(vap, va_acl, xacl); } + VATTR_CLEAR_ACTIVE(vap, va_data_size); VATTR_CLEAR_ACTIVE(vap, va_access_time); + /* + * We don't support the S_ISGID bit for directories. Solaris and other + * SVR4-derived systems might set this to get BSD semantics, which we enforce + * anyway. + */ + if (VATTR_IS_ACTIVE(vap, va_mode)) + vap->va_mode &= ~S_ISGID; + /* + * Server policy is to always use the mapped rpc credential for + * file system object creation. This has the nice side effect of + * enforcing BSD creation semantics + */ + VATTR_CLEAR_ACTIVE(vap, va_uid); + VATTR_CLEAR_ACTIVE(vap, va_gid); /* validate new-file security information */ - if (!error) { + if (!error) error = vnode_authattr_new(dvp, vap, 0, ctx); - if (error && (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid))) { - /* - * Most NFS servers just ignore the UID/GID attributes, so we - * try ignoring them if that'll help the request succeed. - */ - VATTR_CLEAR_ACTIVE(vap, va_uid); - VATTR_CLEAR_ACTIVE(vap, va_gid); - error = vnode_authattr_new(dvp, vap, 0, ctx); - } - } + /* + * vnode_authattr_new can return errors other than EPERM, but that's not going to + * sit well with our clients so we map all errors to EPERM. + */ + if (error) + error = EPERM; if (!error) error = VNOP_MKDIR(dvp, &vp, &ni.ni_cnd, vap, ctx); @@ -3755,6 +3735,9 @@ nfsrv_rmdir( nfsmerr_if(error); ni.ni_cnd.cn_nameiop = DELETE; +#if CONFIG_TRIGGERS + ni.ni_op = OP_UNLINK; +#endif ni.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; error = nfsm_chain_get_path_namei(nmreq, len, &ni); if (!error) { diff --git a/bsd/nfs/nfs_socket.c b/bsd/nfs/nfs_socket.c index 8d6747009..71b6e5c44 100644 --- a/bsd/nfs/nfs_socket.c +++ b/bsd/nfs/nfs_socket.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -72,6 +72,7 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> +#include <sys/signalvar.h> #include <sys/kauth.h> #include <sys/mount_internal.h> #include <sys/kernel.h> @@ -91,11 +92,13 @@ #include <kern/thread.h> #include <kern/thread_call.h> #include <sys/user.h> +#include <sys/acct.h> #include <netinet/in.h> #include <netinet/tcp.h> #include <nfs/rpcv2.h> +#include <nfs/krpc.h> #include <nfs/nfsproto.h> #include <nfs/nfs.h> #include <nfs/xdr_subs.h> @@ -117,6 +120,29 @@ int nfsrv_getreq(struct nfsrv_descript *); extern int nfsv3_procid[NFS_NPROCS]; #endif /* NFSSERVER */ +/* + * compare two sockaddr structures + */ +int +nfs_sockaddr_cmp(struct sockaddr *sa1, struct sockaddr *sa2) +{ + if (!sa1) + return (-1); + if (!sa2) + return (1); + if (sa1->sa_family != sa2->sa_family) + return ((sa1->sa_family < sa2->sa_family) ? -1 : 1); + if (sa1->sa_len != sa2->sa_len) + return ((sa1->sa_len < sa2->sa_len) ? -1 : 1); + if (sa1->sa_family == AF_INET) + return (bcmp(&((struct sockaddr_in*)sa1)->sin_addr, + &((struct sockaddr_in*)sa2)->sin_addr, sizeof(((struct sockaddr_in*)sa1)->sin_addr))); + if (sa1->sa_family == AF_INET6) + return (bcmp(&((struct sockaddr_in6*)sa1)->sin6_addr, + &((struct sockaddr_in6*)sa2)->sin6_addr, sizeof(((struct sockaddr_in6*)sa1)->sin6_addr))); + return (-1); +} + #if NFSCLIENT int nfs_reconnect(struct nfsmount *); @@ -188,214 +214,1270 @@ static int proct[NFS_NPROCS] = { static int nfs_backoff[8] = { 2, 4, 8, 16, 32, 64, 128, 256, }; /* - * Initialize socket state and perform setup for a new NFS connection. + * Increment location index to next address/server/location. */ -int -nfs_connect(struct nfsmount *nmp, int verbose) +void +nfs_location_next(struct nfs_fs_locations *nlp, struct nfs_location_index *nlip) { - socket_t so; - int error, on = 1, proto; - sock_upcall upcall; - struct sockaddr *saddr; - struct sockaddr_in sin; - struct timeval timeo; - - lck_mtx_lock(&nmp->nm_lock); - nmp->nm_sockflags |= NMSOCK_CONNECTING; - saddr = mbuf_data(nmp->nm_nam); - upcall = (nmp->nm_sotype == SOCK_STREAM) ? nfs_tcp_rcv : nfs_udp_rcv; - lck_mtx_unlock(&nmp->nm_lock); - error = sock_socket(saddr->sa_family, nmp->nm_sotype, - nmp->nm_soproto, upcall, nmp, &nmp->nm_so); - if (error) - goto bad; - lck_mtx_lock(&nmp->nm_lock); - so = nmp->nm_so; - - /* - * Some servers require that the client port be a reserved port number. 
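/*
 * Note that nfs_sockaddr_cmp() above orders by family, then sa_len, then the
 * raw address bytes; port numbers are never examined.  That is presumably what
 * lets a restart notification match a mount regardless of which port the
 * server side happens to use.  Illustrative use (the helper name is an
 * assumption):
 */
static int
same_nfs_server(struct sockaddr *sa1, struct sockaddr *sa2)
{
	return (nfs_sockaddr_cmp(sa1, sa2) == 0);	/* same address, any port */
}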
- */ - if (saddr->sa_family == AF_INET && (nmp->nm_flag & NFSMNT_RESVPORT)) { - int portrange = IP_PORTRANGE_LOW; - error = sock_setsockopt(so, IPPROTO_IP, IP_PORTRANGE, &portrange, sizeof(portrange)); - if (!error) { /* bind now to check for failure */ - sin.sin_len = sizeof (struct sockaddr_in); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = INADDR_ANY; - sin.sin_port = 0; - error = sock_bind(so, (struct sockaddr *) &sin); - } - if (error) { - lck_mtx_unlock(&nmp->nm_lock); - goto bad; + uint8_t loc = nlip->nli_loc; + uint8_t serv = nlip->nli_serv; + uint8_t addr = nlip->nli_addr; + + /* move to next address */ + addr++; + if (addr >= nlp->nl_locations[loc]->nl_servers[serv]->ns_addrcount) { + /* no more addresses on current server, go to first address of next server */ +next_server: + addr = 0; + serv++; + if (serv >= nlp->nl_locations[loc]->nl_servcount) { + /* no more servers on current location, go to first server of next location */ + serv = 0; + loc++; + if (loc >= nlp->nl_numlocs) + loc = 0; /* after last location, wrap back around to first location */ } } - /* - * Protocols that do not require connections may be optionally left - * unconnected for servers that reply from a different address/port. + * It's possible for this next server to not have any addresses. + * Check for that here and go to the next server. + * But bail out if we've managed to come back around to the original + * location that was passed in. (That would mean no servers had any + * addresses. And we don't want to spin here forever.) */ - if (nmp->nm_flag & NFSMNT_NOCONN) { - if (nmp->nm_sotype == SOCK_STREAM) { - error = ENOTCONN; - lck_mtx_unlock(&nmp->nm_lock); - goto bad; + if ((loc == nlip->nli_loc) && (serv == nlip->nli_serv) && (addr == nlip->nli_addr)) + return; + if (addr >= nlp->nl_locations[loc]->nl_servers[serv]->ns_addrcount) + goto next_server; + + nlip->nli_loc = loc; + nlip->nli_serv = serv; + nlip->nli_addr = addr; +} + +/* + * Compare two location indices. + */ +int +nfs_location_index_cmp(struct nfs_location_index *nlip1, struct nfs_location_index *nlip2) +{ + if (nlip1->nli_loc != nlip2->nli_loc) + return (nlip1->nli_loc - nlip2->nli_loc); + if (nlip1->nli_serv != nlip2->nli_serv) + return (nlip1->nli_serv - nlip2->nli_serv); + return (nlip1->nli_addr - nlip2->nli_addr); +} + +/* + * Get the mntfromname (or path portion only) for a given location. 
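/*
 * nfs_location_next() above advances an (address, server, location) index the
 * way an odometer rolls over: the address digit carries into the server
 * digit, which carries into the location digit, which wraps back to zero.
 * The same walk over plain counts, as a standalone sketch (no empty-server
 * handling; the names are illustrative):
 */
static void
index_next(int *loc, int *serv, int *addr,
    int nlocs, const int *nservs, const int *const *naddrs)
{
	if (++*addr < naddrs[*loc][*serv])
		return;			/* next address on the same server */
	*addr = 0;
	if (++*serv < nservs[*loc])
		return;			/* first address of the next server */
	*serv = 0;
	if (++*loc >= nlocs)
		*loc = 0;		/* wrap back to the first location */
}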
+ */ +void +nfs_location_mntfromname(struct nfs_fs_locations *locs, struct nfs_location_index idx, char *s, int size, int pathonly) +{ + struct nfs_fs_location *fsl = locs->nl_locations[idx.nli_loc]; + char *p; + int cnt, i; + + p = s; + if (!pathonly) { + cnt = snprintf(p, size, "%s:", fsl->nl_servers[idx.nli_serv]->ns_name); + p += cnt; + size -= cnt; + } + if (fsl->nl_path.np_compcount == 0) { + /* mounting root export on server */ + if (size > 0) { + *p++ = '/'; + *p++ = '\0'; } - } else { - int tocnt = 0, optlen = sizeof(error); - struct timespec ts = { 1, 0 }; + return; + } + /* append each server path component */ + for (i=0; (size > 0) && (i < (int)fsl->nl_path.np_compcount); i++) { + cnt = snprintf(p, size, "/%s", fsl->nl_path.np_components[i]); + p += cnt; + size -= cnt; + } +} - lck_mtx_unlock(&nmp->nm_lock); - error = sock_connect(so, mbuf_data(nmp->nm_nam), MSG_DONTWAIT); - if (error && (error != EINPROGRESS)) - goto bad; - lck_mtx_lock(&nmp->nm_lock); - while (!sock_isconnected(so)) { - nfs_mount_check_dead_timeout(nmp); - if ((tocnt++ == 30) && verbose) /* log a warning if connect is taking a while */ - log(LOG_INFO, "nfs_connect: socket connect taking a while for %s\n", - vfs_statfs(nmp->nm_mountp)->f_mntfromname); - /* check for error on socket */ - sock_getsockopt(so, SOL_SOCKET, SO_ERROR, &error, &optlen); - if (error) { - if (verbose) - log(LOG_INFO, "nfs_connect: socket error %d for %s\n", - error, vfs_statfs(nmp->nm_mountp)->f_mntfromname); - break; +/* + * NFS client connect socket upcall. + * (Used only during socket connect/search.) + */ +void +nfs_connect_upcall(socket_t so, void *arg, __unused int waitflag) +{ + struct nfs_socket *nso = arg; + size_t rcvlen; + mbuf_t m; + int error = 0, recv = 1; + + if (nso->nso_flags & NSO_CONNECTING) { + NFS_SOCK_DBG(("nfs connect - socket %p upcall - connecting\n", nso)); + wakeup(nso->nso_wake); + return; + } + + lck_mtx_lock(&nso->nso_lock); + if ((nso->nso_flags & (NSO_UPCALL|NSO_DISCONNECTING|NSO_DEAD)) || !(nso->nso_flags & NSO_PINGING)) { + NFS_SOCK_DBG(("nfs connect - socket %p upcall - nevermind\n", nso)); + lck_mtx_unlock(&nso->nso_lock); + return; + } + NFS_SOCK_DBG(("nfs connect - socket %p upcall\n", nso)); + nso->nso_flags |= NSO_UPCALL; + + /* loop while we make error-free progress */ + while (!error && recv) { + /* make sure we're still interested in this socket */ + if (nso->nso_flags & (NSO_DISCONNECTING|NSO_DEAD)) + break; + lck_mtx_unlock(&nso->nso_lock); + m = NULL; + if (nso->nso_sotype == SOCK_STREAM) { + error = nfs_rpc_record_read(so, &nso->nso_rrs, MSG_DONTWAIT, &recv, &m); + } else { + rcvlen = 1000000; + error = sock_receivembuf(so, NULL, &m, MSG_DONTWAIT, &rcvlen); + recv = m ? 1 : 0; + } + lck_mtx_lock(&nso->nso_lock); + if (m) { + /* match response with request */ + struct nfsm_chain nmrep; + uint32_t reply = 0, rxid = 0, verf_type, verf_len; + uint32_t reply_status, rejected_status, accepted_status; + + nfsm_chain_dissect_init(error, &nmrep, m); + nfsm_chain_get_32(error, &nmrep, rxid); + nfsm_chain_get_32(error, &nmrep, reply); + if (!error && ((reply != RPC_REPLY) || (rxid != nso->nso_pingxid))) + error = EBADRPC; + nfsm_chain_get_32(error, &nmrep, reply_status); + if (!error && (reply_status == RPC_MSGDENIED)) { + nfsm_chain_get_32(error, &nmrep, rejected_status); + if (!error) + error = (rejected_status == RPC_MISMATCH) ? 
ERPCMISMATCH : EACCES; } - /* abort if this is taking too long or we're unmounting */ - if ((tocnt > 120) || (nmp->nm_sockflags & NMSOCK_UNMOUNT)) { - error = ENOTCONN; - break; + nfsm_chain_get_32(error, &nmrep, verf_type); /* verifier flavor */ + nfsm_chain_get_32(error, &nmrep, verf_len); /* verifier length */ + nfsmout_if(error); + if (verf_len) + nfsm_chain_adv(error, &nmrep, nfsm_rndup(verf_len)); + nfsm_chain_get_32(error, &nmrep, accepted_status); + nfsmout_if(error); + if ((accepted_status == RPC_PROGMISMATCH) && !nso->nso_version) { + uint32_t minvers, maxvers; + nfsm_chain_get_32(error, &nmrep, minvers); + nfsm_chain_get_32(error, &nmrep, maxvers); + nfsmout_if(error); + if (nso->nso_protocol == PMAPPROG) { + if ((minvers > RPCBVERS4) || (maxvers < PMAPVERS)) + error = EPROGMISMATCH; + else if ((nso->nso_saddr->sa_family == AF_INET) && + (PMAPVERS >= minvers) && (PMAPVERS <= maxvers)) + nso->nso_version = PMAPVERS; + else if (nso->nso_saddr->sa_family == AF_INET6) { + if ((RPCBVERS4 >= minvers) && (RPCBVERS4 <= maxvers)) + nso->nso_version = RPCBVERS4; + else if ((RPCBVERS3 >= minvers) && (RPCBVERS3 <= maxvers)) + nso->nso_version = RPCBVERS3; + } + } else if (nso->nso_protocol == NFS_PROG) { + if ((minvers > NFS_VER4) || (maxvers < NFS_VER2)) + error = EPROGMISMATCH; + else if ((NFS_VER3 >= minvers) && (NFS_VER3 <= maxvers)) + nso->nso_version = NFS_VER3; + else if ((NFS_VER2 >= minvers) && (NFS_VER2 <= maxvers)) + nso->nso_version = NFS_VER2; + else if ((NFS_VER4 >= minvers) && (NFS_VER4 <= maxvers)) + nso->nso_version = NFS_VER4; + } + if (!error && nso->nso_version) + accepted_status = RPC_SUCCESS; } - if ((error = nfs_sigintr(nmp, NULL, current_thread(), 1))) - break; - msleep(&nmp->nm_so, &nmp->nm_lock, PSOCK, "nfs_socket_connect", &ts); + if (!error) { + switch (accepted_status) { + case RPC_SUCCESS: + error = 0; + break; + case RPC_PROGUNAVAIL: + error = EPROGUNAVAIL; + break; + case RPC_PROGMISMATCH: + error = EPROGMISMATCH; + break; + case RPC_PROCUNAVAIL: + error = EPROCUNAVAIL; + break; + case RPC_GARBAGE: + error = EBADRPC; + break; + case RPC_SYSTEM_ERR: + default: + error = EIO; + break; + } + } +nfsmout: + nso->nso_flags &= ~NSO_PINGING; + if (error) { + nso->nso_error = error; + nso->nso_flags |= NSO_DEAD; + } else { + nso->nso_flags |= NSO_VERIFIED; + } + mbuf_freem(m); + /* wake up search thread */ + wakeup(nso->nso_wake); + break; } - if ((tocnt > 30) && verbose) - log(LOG_INFO, "nfs_connect: socket connect %s for %s\n", - error ? "aborted" : "completed", - vfs_statfs(nmp->nm_mountp)->f_mntfromname); - if (error) { - lck_mtx_unlock(&nmp->nm_lock); - goto bad; + } + + nso->nso_flags &= ~NSO_UPCALL; + if ((error != EWOULDBLOCK) && (error || !recv)) { + /* problems with the socket... */ + nso->nso_error = error ? error : EPIPE; + nso->nso_flags |= NSO_DEAD; + wakeup(nso->nso_wake); + } + if (nso->nso_flags & NSO_DISCONNECTING) + wakeup(&nso->nso_flags); + lck_mtx_unlock(&nso->nso_lock); +} + +/* + * Create/initialize an nfs_socket structure. 
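/*
 * The RPC_PROGMISMATCH handling above negotiates a version out of the
 * server's advertised [minvers, maxvers] range, preferring NFSv3, then v2,
 * then v4.  The same preference order as a small helper (a sketch; the
 * version constants come from the NFS headers, the function is assumed):
 */
static uint32_t
pick_nfs_version(uint32_t minvers, uint32_t maxvers)
{
	if ((minvers > NFS_VER4) || (maxvers < NFS_VER2))
		return (0);			/* no usable overlap */
	if ((NFS_VER3 >= minvers) && (NFS_VER3 <= maxvers))
		return (NFS_VER3);		/* first choice */
	if ((NFS_VER2 >= minvers) && (NFS_VER2 <= maxvers))
		return (NFS_VER2);		/* fallback */
	return (NFS_VER4);			/* only v4 fits the range */
}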
+ */ +int +nfs_socket_create( + __unused struct nfsmount *nmp, + struct sockaddr *sa, + int sotype, + in_port_t port, + uint32_t protocol, + uint32_t vers, + int resvport, + struct nfs_socket **nsop) +{ + struct nfs_socket *nso; + struct timeval now; + int error; +#ifdef NFS_SOCKET_DEBUGGING + char naddr[MAX_IPv6_STR_LEN]; + void *sinaddr; + + if (sa->sa_family == AF_INET) + sinaddr = &((struct sockaddr_in*)sa)->sin_addr; + else + sinaddr = &((struct sockaddr_in6*)sa)->sin6_addr; + if (inet_ntop(sa->sa_family, sinaddr, naddr, sizeof(naddr)) != naddr) + strlcpy(naddr, "<unknown>", sizeof(naddr)); +#endif + + *nsop = NULL; + + /* Create the socket. */ + MALLOC(nso, struct nfs_socket *, sizeof(struct nfs_socket), M_TEMP, M_WAITOK|M_ZERO); + if (nso) + MALLOC(nso->nso_saddr, struct sockaddr *, sa->sa_len, M_SONAME, M_WAITOK|M_ZERO); + if (!nso || !nso->nso_saddr) { + if (nso) + FREE(nso, M_TEMP); + return (ENOMEM); + } + lck_mtx_init(&nso->nso_lock, nfs_request_grp, LCK_ATTR_NULL); + nso->nso_sotype = sotype; + if (nso->nso_sotype == SOCK_STREAM) + nfs_rpc_record_state_init(&nso->nso_rrs); + microuptime(&now); + nso->nso_timestamp = now.tv_sec; + bcopy(sa, nso->nso_saddr, sa->sa_len); + if (sa->sa_family == AF_INET) + ((struct sockaddr_in*)nso->nso_saddr)->sin_port = htons(port); + else if (sa->sa_family == AF_INET6) + ((struct sockaddr_in6*)nso->nso_saddr)->sin6_port = htons(port); + nso->nso_protocol = protocol; + nso->nso_version = vers; + + error = sock_socket(sa->sa_family, nso->nso_sotype, 0, NULL, NULL, &nso->nso_so); + + /* Some servers require that the client port be a reserved port number. */ + if (!error && resvport && ((sa->sa_family == AF_INET) || (sa->sa_family == AF_INET6))) { + struct sockaddr_storage ss; + int level = (sa->sa_family == AF_INET) ? IPPROTO_IP : IPPROTO_IPV6; + int optname = (sa->sa_family == AF_INET) ? IP_PORTRANGE : IPV6_PORTRANGE; + int portrange = IP_PORTRANGE_LOW; + + error = sock_setsockopt(nso->nso_so, level, optname, &portrange, sizeof(portrange)); + if (!error) { /* bind now to check for failure */ + ss.ss_len = sa->sa_len; + ss.ss_family = sa->sa_family; + if (ss.ss_family == AF_INET) { + ((struct sockaddr_in*)&ss)->sin_addr.s_addr = INADDR_ANY; + ((struct sockaddr_in*)&ss)->sin_port = htons(0); + } else if (ss.ss_family == AF_INET6) { + ((struct sockaddr_in6*)&ss)->sin6_addr = in6addr_any; + ((struct sockaddr_in6*)&ss)->sin6_port = htons(0); + } else { + error = EINVAL; + } + if (!error) + error = sock_bind(nso->nso_so, (struct sockaddr*)&ss); } } + if (error) { + NFS_SOCK_DBG(("nfs connect %s error %d creating socket %p %s type %d%s port %d prot %d %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, error, nso, naddr, sotype, + resvport ? "r" : "", port, protocol, vers)); + nfs_socket_destroy(nso); + } else { + NFS_SOCK_DBG(("nfs connect %s created socket %p %s type %d%s port %d prot %d %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, naddr, + sotype, resvport ? "r" : "", port, protocol, vers)); + *nsop = nso; + } + return (error); +} + +/* + * Destroy an nfs_socket structure. 
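/*
 * The reserved-port logic in nfs_socket_create() above does not pick a port
 * itself: it sets IP_PORTRANGE(_LOW) and binds to port 0, letting the stack
 * allocate a privileged port.  A user-space IPv4 equivalent (a sketch; needs
 * root for low ports):
 */
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>

static int
bind_reserved_port(int sock)
{
	struct sockaddr_in sin;
	int portrange = IP_PORTRANGE_LOW;	/* ports below IPPORT_RESERVED */

	if (setsockopt(sock, IPPROTO_IP, IP_PORTRANGE, &portrange, sizeof(portrange)) < 0)
		return (-1);
	memset(&sin, 0, sizeof(sin));
	sin.sin_len = sizeof(sin);
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = INADDR_ANY;
	sin.sin_port = htons(0);		/* stack chooses the port */
	return (bind(sock, (struct sockaddr *)&sin, sizeof(sin)));
}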
+ */ +void +nfs_socket_destroy(struct nfs_socket *nso) +{ + struct timespec ts = { 4, 0 }; + + lck_mtx_lock(&nso->nso_lock); + nso->nso_flags |= NSO_DISCONNECTING; + if (nso->nso_flags & NSO_UPCALL) /* give upcall a chance to complete */ + msleep(&nso->nso_flags, &nso->nso_lock, PZERO-1, "nfswaitupcall", &ts); + lck_mtx_unlock(&nso->nso_lock); + sock_shutdown(nso->nso_so, SHUT_RDWR); + sock_close(nso->nso_so); + if (nso->nso_sotype == SOCK_STREAM) + nfs_rpc_record_state_cleanup(&nso->nso_rrs); + lck_mtx_destroy(&nso->nso_lock, nfs_request_grp); + if (nso->nso_saddr) + FREE(nso->nso_saddr, M_SONAME); + if (nso->nso_saddr2) + FREE(nso->nso_saddr2, M_SONAME); + NFS_SOCK_DBG(("nfs connect - socket %p destroyed\n", nso)); + FREE(nso, M_TEMP); +} + +/* + * Set common socket options on an nfs_socket. + */ +void +nfs_socket_options(struct nfsmount *nmp, struct nfs_socket *nso) +{ /* * Set socket send/receive timeouts - * - Receive timeout shouldn't matter because all receives are performed + * - Receive timeout shouldn't matter because most receives are performed * in the socket upcall non-blocking. * - Send timeout should allow us to react to a blocked socket. * Soft mounts will want to abort sooner. */ - timeo.tv_usec = 0; - timeo.tv_sec = (nmp->nm_flag & NFSMNT_SOFT) ? 10 : 60; - error |= sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo)); - error |= sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo)); - if (error) { - log(LOG_INFO, "nfs_connect: socket timeout setting errors for %s\n", - vfs_statfs(nmp->nm_mountp)->f_mntfromname); - error = 0; - } + struct timeval timeo; + int on = 1, proto; - if (nmp->nm_sotype == SOCK_STREAM) { + timeo.tv_usec = 0; + timeo.tv_sec = NMFLAG(nmp, SOFT) ? 5 : 60; + sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo)); + sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo)); + if (nso->nso_sotype == SOCK_STREAM) { /* Assume that SOCK_STREAM always requires a connection */ - sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)); + sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)); /* set nodelay for TCP */ - sock_gettype(so, NULL, NULL, &proto); + sock_gettype(nso->nso_so, NULL, NULL, &proto); if (proto == IPPROTO_TCP) - sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)); + sock_setsockopt(nso->nso_so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)); } - - if (nmp->nm_sotype == SOCK_DGRAM) { /* set socket buffer sizes for UDP */ + if (nso->nso_sotype == SOCK_DGRAM) { /* set socket buffer sizes for UDP */ int reserve = NFS_UDPSOCKBUF; - error |= sock_setsockopt(so, SOL_SOCKET, SO_SNDBUF, &reserve, sizeof(reserve)); - error |= sock_setsockopt(so, SOL_SOCKET, SO_RCVBUF, &reserve, sizeof(reserve)); + sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_SNDBUF, &reserve, sizeof(reserve)); + sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_RCVBUF, &reserve, sizeof(reserve)); + } + /* set SO_NOADDRERR to detect network changes ASAP */ + sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_NOADDRERR, &on, sizeof(on)); + /* just playin' it safe with upcalls */ + sock_setsockopt(nso->nso_so, SOL_SOCKET, SO_UPCALLCLOSEWAIT, &on, sizeof(on)); + /* socket should be interruptible if the mount is */ + if (!NMFLAG(nmp, INTR)) + sock_nointerrupt(nso->nso_so, 1); +} + +/* + * Release resources held in an nfs_socket_search. 
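/*
 * Most of nfs_socket_options() above maps onto ordinary socket options; only
 * SO_NOADDRERR, SO_UPCALLCLOSEWAIT and sock_nointerrupt() are kernel-internal.
 * The portable subset as a user-space sketch (the helper name and parameters
 * are assumptions):
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <sys/time.h>

static void
nfs_style_socket_options(int sock, int is_tcp, int is_soft)
{
	struct timeval timeo = { is_soft ? 5 : 60, 0 };	/* shorter timeout for soft mounts */
	int on = 1;

	setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo));
	setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo));
	if (is_tcp) {
		setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));	/* detect dead peers */
		setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));	/* don't batch small RPCs */
	}
}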
+ */ +void +nfs_socket_search_cleanup(struct nfs_socket_search *nss) +{ + struct nfs_socket *nso, *nsonext; + + TAILQ_FOREACH_SAFE(nso, &nss->nss_socklist, nso_link, nsonext) { + TAILQ_REMOVE(&nss->nss_socklist, nso, nso_link); + nss->nss_sockcnt--; + nfs_socket_destroy(nso); + } + if (nss->nss_sock) { + nfs_socket_destroy(nss->nss_sock); + nss->nss_sock = NULL; + } +} + +/* + * Prefer returning certain errors over others. + * This function returns a ranking of the given error. + */ +int +nfs_connect_error_class(int error) +{ + switch (error) { + case 0: + return (0); + case ETIMEDOUT: + case EAGAIN: + return (1); + case EPIPE: + case EADDRNOTAVAIL: + case ENETDOWN: + case ENETUNREACH: + case ENETRESET: + case ECONNABORTED: + case ECONNRESET: + case EISCONN: + case ENOTCONN: + case ESHUTDOWN: + case ECONNREFUSED: + case EHOSTDOWN: + case EHOSTUNREACH: + return (2); + case ERPCMISMATCH: + case EPROCUNAVAIL: + case EPROGMISMATCH: + case EPROGUNAVAIL: + return (3); + case EBADRPC: + return (4); + default: + return (5); + } +} + +/* + * Make sure a socket search returns the best error. + */ +void +nfs_socket_search_update_error(struct nfs_socket_search *nss, int error) +{ + if (nfs_connect_error_class(error) >= nfs_connect_error_class(nss->nss_error)) + nss->nss_error = error; +} + +/* + * Continue the socket search until we have something to report. + */ +int +nfs_connect_search_loop(struct nfsmount *nmp, struct nfs_socket_search *nss) +{ + struct nfs_socket *nso, *nsonext; + struct timeval now; + struct nfs_fs_location *fsl; + struct nfs_fs_server *fss; + struct sockaddr_storage ss; + char *addrstr; + int error, nomore = 0; + +loop: + microuptime(&now); + NFS_SOCK_DBG(("nfs connect %s search %ld\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname, now.tv_sec)); + + /* Time to start another socket? */ + while ((nss->nss_last < 0) || (nss->nss_sockcnt == 0) || + ((nss->nss_sockcnt < 4) && (now.tv_sec >= (nss->nss_last + 2)))) { + if (nmp->nm_sockflags & NMSOCK_UNMOUNT) + return (EINTR); + /* Find the next address to try... */ + /* Have we run out of locations? */ + if (!nomore && (nss->nss_last != -1) && !nfs_location_index_cmp(&nss->nss_nextloc, &nss->nss_startloc)) + nomore = 1; + if (nomore) { + if (nss->nss_last < 0) + nss->nss_last = now.tv_sec; + break; + } + /* Can we convert the address to a sockaddr? */ + fsl = nmp->nm_locations.nl_locations[nss->nss_nextloc.nli_loc]; + fss = fsl->nl_servers[nss->nss_nextloc.nli_serv]; + addrstr = fss->ns_addresses[nss->nss_nextloc.nli_addr]; + if (!nfs_uaddr2sockaddr(addrstr, (struct sockaddr*)&ss)) { + nfs_location_next(&nmp->nm_locations, &nss->nss_nextloc); + nss->nss_last = -2; + continue; + } + /* Check that socket family is acceptable. */ + if (nmp->nm_sofamily && (ss.ss_family != nmp->nm_sofamily)) { + nfs_location_next(&nmp->nm_locations, &nss->nss_nextloc); + nss->nss_last = -2; + continue; + } + + /* Create the socket. 
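/*
 * The ranking above lets a socket search remember its most informative
 * failure: timeouts rank lowest, plain connection failures higher, RPC
 * program/version mismatches higher still.  A worked example (illustrative):
 *
 *	nss->nss_error = 0;
 *	nfs_socket_search_update_error(nss, ETIMEDOUT);     nss_error = ETIMEDOUT
 *	nfs_socket_search_update_error(nss, ECONNREFUSED);  nss_error = ECONNREFUSED
 *	nfs_socket_search_update_error(nss, ETIMEDOUT);     still ECONNREFUSED
 *
 * so the error eventually reported comes from the server that got furthest
 * through the handshake.  (Because the comparison is >=, a later error of
 * equal rank replaces an earlier one.)
 */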
*/ + error = nfs_socket_create(nmp, (struct sockaddr*)&ss, nss->nss_sotype, + nss->nss_port, nss->nss_protocol, nss->nss_version, + ((nss->nss_protocol == NFS_PROG) && NMFLAG(nmp, RESVPORT)), &nso); + if (error) + return (error); + + nso->nso_location = nss->nss_nextloc; + nso->nso_wake = nss; + error = sock_setupcall(nso->nso_so, nfs_connect_upcall, nso); if (error) { - log(LOG_INFO, "nfs_connect: socket buffer setting errors for %s\n", + lck_mtx_lock(&nso->nso_lock); + nso->nso_error = error; + nso->nso_flags |= NSO_DEAD; + lck_mtx_unlock(&nso->nso_lock); + } + + TAILQ_INSERT_TAIL(&nss->nss_socklist, nso, nso_link); + nss->nss_sockcnt++; + nfs_location_next(&nmp->nm_locations, &nss->nss_nextloc); + + nss->nss_last = now.tv_sec; + } + + /* check each active socket and try to push it along */ + TAILQ_FOREACH(nso, &nss->nss_socklist, nso_link) { + lck_mtx_lock(&nso->nso_lock); + if (!(nso->nso_flags & NSO_CONNECTED)) { + if ((nso->nso_sotype != SOCK_STREAM) && NMFLAG(nmp, NOCONNECT)) { + /* no connection needed, just say it's already connected */ + nso->nso_flags |= NSO_CONNECTED; + NFS_SOCK_DBG(("nfs connect %s UDP socket %p noconnect\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso)); + } else if (!(nso->nso_flags & NSO_CONNECTING)) { + /* initiate the connection */ + nso->nso_flags |= NSO_CONNECTING; + lck_mtx_unlock(&nso->nso_lock); + NFS_SOCK_DBG(("nfs connect %s connecting socket %p\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso)); + error = sock_connect(nso->nso_so, nso->nso_saddr, MSG_DONTWAIT); + lck_mtx_lock(&nso->nso_lock); + if (error && (error != EINPROGRESS)) { + nso->nso_error = error; + nso->nso_flags |= NSO_DEAD; + lck_mtx_unlock(&nso->nso_lock); + continue; + } + } + if (nso->nso_flags & NSO_CONNECTING) { + /* check the connection */ + if (sock_isconnected(nso->nso_so)) { + NFS_SOCK_DBG(("nfs connect %s socket %p is connected\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso)); + nso->nso_flags &= ~NSO_CONNECTING; + nso->nso_flags |= NSO_CONNECTED; + } else { + int optlen = sizeof(error); + error = 0; + sock_getsockopt(nso->nso_so, SOL_SOCKET, SO_ERROR, &error, &optlen); + if (error) { /* we got an error on the socket */ + NFS_SOCK_DBG(("nfs connect %s socket %p connection error %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, error)); + if (nss->nss_flags & NSS_VERBOSE) + log(LOG_INFO, "nfs_connect: socket error %d for %s\n", + error, vfs_statfs(nmp->nm_mountp)->f_mntfromname); + nso->nso_error = error; + nso->nso_flags |= NSO_DEAD; + lck_mtx_unlock(&nso->nso_lock); + continue; + } + } + } + if (nso->nso_flags & NSO_CONNECTED) + nfs_socket_options(nmp, nso); + } + if (!(nso->nso_flags & NSO_CONNECTED)) { + lck_mtx_unlock(&nso->nso_lock); + continue; + } + if (!(nso->nso_flags & (NSO_PINGING|NSO_VERIFIED)) || + ((nso->nso_sotype == SOCK_DGRAM) && (now.tv_sec >= nso->nso_reqtimestamp+2))) { + /* initiate a NULL RPC request */ + uint64_t xid = nso->nso_pingxid; + mbuf_t m, mreq = NULL; + struct msghdr msg; + size_t reqlen, sentlen; + uint32_t vers; + + if (!(vers = nso->nso_version)) { + if (nso->nso_protocol == PMAPPROG) + vers = (nso->nso_saddr->sa_family == AF_INET) ? 
PMAPVERS : RPCBVERS4; + else if (nso->nso_protocol == NFS_PROG) + vers = NFS_VER3; + } + lck_mtx_unlock(&nso->nso_lock); + error = nfsm_rpchead2(nmp, nso->nso_sotype, nso->nso_protocol, vers, 0, RPCAUTH_SYS, + vfs_context_ucred(vfs_context_kernel()), NULL, NULL, &xid, &mreq); + lck_mtx_lock(&nso->nso_lock); + if (!error) { + nso->nso_flags |= NSO_PINGING; + nso->nso_pingxid = R_XID32(xid); + nso->nso_reqtimestamp = now.tv_sec; + bzero(&msg, sizeof(msg)); + if ((nso->nso_sotype != SOCK_STREAM) && !sock_isconnected(nso->nso_so)) { + msg.msg_name = nso->nso_saddr; + msg.msg_namelen = nso->nso_saddr->sa_len; + } + for (reqlen=0, m=mreq; m; m = mbuf_next(m)) + reqlen += mbuf_len(m); + lck_mtx_unlock(&nso->nso_lock); + error = sock_sendmbuf(nso->nso_so, &msg, mreq, 0, &sentlen); + NFS_SOCK_DBG(("nfs connect %s verifying socket %p send rv %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, error)); + lck_mtx_lock(&nso->nso_lock); + if (!error && (sentlen != reqlen)) + error = ETIMEDOUT; + } + if (error) { + nso->nso_error = error; + nso->nso_flags |= NSO_DEAD; + lck_mtx_unlock(&nso->nso_lock); + continue; + } + } + if (nso->nso_flags & NSO_VERIFIED) { + /* WOOHOO!! This socket looks good! */ + NFS_SOCK_DBG(("nfs connect %s socket %p verified\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso)); + if (!nso->nso_version) { + /* If the version isn't set, the default must have worked. */ + if (nso->nso_protocol == PMAPPROG) + nso->nso_version = (nso->nso_saddr->sa_family == AF_INET) ? PMAPVERS : RPCBVERS4; + if (nso->nso_protocol == NFS_PROG) + nso->nso_version = NFS_VER3; + } + lck_mtx_unlock(&nso->nso_lock); + TAILQ_REMOVE(&nss->nss_socklist, nso, nso_link); + nss->nss_sockcnt--; + nss->nss_sock = nso; + break; + } + lck_mtx_unlock(&nso->nso_lock); + } + + TAILQ_FOREACH_SAFE(nso, &nss->nss_socklist, nso_link, nsonext) { + lck_mtx_lock(&nso->nso_lock); + if (now.tv_sec >= (nso->nso_timestamp + nss->nss_timeo)) { + /* took too long */ + NFS_SOCK_DBG(("nfs connect %s socket %p timed out\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso)); + nso->nso_error = ETIMEDOUT; + nso->nso_flags |= NSO_DEAD; + } + if (!(nso->nso_flags & NSO_DEAD)) { + lck_mtx_unlock(&nso->nso_lock); + continue; + } + lck_mtx_unlock(&nso->nso_lock); + NFS_SOCK_DBG(("nfs connect %s reaping socket %p %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, nso->nso_error)); + nfs_socket_search_update_error(nss, nso->nso_error); + TAILQ_REMOVE(&nss->nss_socklist, nso, nso_link); + nss->nss_sockcnt--; + nfs_socket_destroy(nso); + if (!nomore) + nss->nss_last = -2; + } + + /* + * Keep looping if we haven't found a socket yet and we have more + * sockets to (continue to) try. + */ + error = 0; + if (!nss->nss_sock && (!TAILQ_EMPTY(&nss->nss_socklist) || !nomore)) { + /* log a warning if connect is taking a while */ + if (((now.tv_sec - nss->nss_timestamp) >= 30) && ((nss->nss_flags & (NSS_VERBOSE|NSS_WARNED)) == NSS_VERBOSE)) { + log(LOG_INFO, "nfs_connect: socket connect taking a while for %s\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname); + nss->nss_flags |= NSS_WARNED; + } + if (nmp->nm_sockflags & NMSOCK_UNMOUNT) + return (EINTR); + if ((error = nfs_sigintr(nmp, NULL, current_thread(), 0))) + return (error); + if (nss->nss_last >= 0) + tsleep(nss, PSOCK, "nfs_connect_search_wait", hz); + goto loop; + } + + NFS_SOCK_DBG(("nfs connect %s returning %d\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname, error)); + return (error); +} + +/* + * Initialize a new NFS connection. 
+ * + * Search for a location to connect a socket to and initialize the connection. + * + * An NFS mount may have multiple locations/servers/addresses available. + * We attempt to connect to each one asynchronously and will start + * several sockets in parallel if other locations are slow to answer. + * We'll use the first NFS socket we can successfully set up. + * + * The search may involve contacting the portmapper service first. + * + * A mount's initial connection may require negotiating some parameters such + * as socket type and NFS version. + */ +int +nfs_connect(struct nfsmount *nmp, int verbose, int timeo) +{ + struct nfs_socket_search nss; + struct nfs_socket *nso, *nsonfs; + struct sockaddr_storage ss; + struct sockaddr *saddr, *oldsaddr; + sock_upcall upcall; + struct timeval now, start; + int error, savederror, nfsvers; + uint8_t sotype = nmp->nm_sotype ? nmp->nm_sotype : SOCK_STREAM; + fhandle_t *fh = NULL; + char *path = NULL; + in_port_t port; + + /* paranoia... check that we have at least one address in the locations */ + uint32_t loc, serv; + for (loc=0; loc < nmp->nm_locations.nl_numlocs; loc++) { + for (serv=0; serv < nmp->nm_locations.nl_locations[loc]->nl_servcount; serv++) { + if (nmp->nm_locations.nl_locations[loc]->nl_servers[serv]->ns_addrcount) + break; + NFS_SOCK_DBG(("nfs connect %s search, server %s has no addresses\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, + nmp->nm_locations.nl_locations[loc]->nl_servers[serv]->ns_name)); + } + if (serv < nmp->nm_locations.nl_locations[loc]->nl_servcount) + break; + } + if (loc >= nmp->nm_locations.nl_numlocs) { + NFS_SOCK_DBG(("nfs connect %s search failed, no addresses\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname)); + return (EINVAL); + } + + lck_mtx_lock(&nmp->nm_lock); + nmp->nm_sockflags |= NMSOCK_CONNECTING; + nmp->nm_nss = &nss; + lck_mtx_unlock(&nmp->nm_lock); + microuptime(&start); + savederror = error = 0; + +tryagain: + /* initialize socket search state */ + bzero(&nss, sizeof(nss)); + nss.nss_error = savederror; + TAILQ_INIT(&nss.nss_socklist); + nss.nss_sotype = sotype; + nss.nss_startloc = nmp->nm_locations.nl_current; + nss.nss_timestamp = start.tv_sec; + nss.nss_timeo = timeo; + if (verbose) + nss.nss_flags |= NSS_VERBOSE; + + /* First time connecting, we may need to negotiate some things */ + if (!(nmp->nm_sockflags & NMSOCK_HASCONNECTED)) { + if (!nmp->nm_vers) { + /* No NFS version specified... */ + if (!nmp->nm_nfsport || (!NM_OMATTR_GIVEN(nmp, FH) && !nmp->nm_mountport)) { + /* ...connect to portmapper first if we (may) need any ports. */ + nss.nss_port = PMAPPORT; + nss.nss_protocol = PMAPPROG; + nss.nss_version = 0; + } else { + /* ...connect to NFS port first. */ + nss.nss_port = nmp->nm_nfsport; + nss.nss_protocol = NFS_PROG; + nss.nss_version = 0; + } + } else if (nmp->nm_vers >= NFS_VER4) { + /* For NFSv4, we use the given (or default) port. */ + nss.nss_port = nmp->nm_nfsport ? nmp->nm_nfsport : NFS_PORT; + nss.nss_protocol = NFS_PROG; + nss.nss_version = 4; + } else { + /* For NFSv3/v2... */ + if (!nmp->nm_nfsport || (!NM_OMATTR_GIVEN(nmp, FH) && !nmp->nm_mountport)) { + /* ...connect to portmapper first if we need any ports. */ + nss.nss_port = PMAPPORT; + nss.nss_protocol = PMAPPROG; + nss.nss_version = 0; + } else { + /* ...connect to NFS port first. 
*/ + nss.nss_port = nmp->nm_nfsport; + nss.nss_protocol = NFS_PROG; + nss.nss_version = nmp->nm_vers; + } + } + NFS_SOCK_DBG(("nfs connect first %s, so type %d port %d prot %d %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nss.nss_sotype, nss.nss_port, + nss.nss_protocol, nss.nss_version)); + } else { + /* we've connected before, just connect to NFS port */ + if (!nmp->nm_nfsport) { + /* need to ask portmapper which port that would be */ + nss.nss_port = PMAPPORT; + nss.nss_protocol = PMAPPROG; + nss.nss_version = 0; + } else { + nss.nss_port = nmp->nm_nfsport; + nss.nss_protocol = NFS_PROG; + nss.nss_version = nmp->nm_vers; + } + NFS_SOCK_DBG(("nfs connect %s, so type %d port %d prot %d %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nss.nss_sotype, nss.nss_port, + nss.nss_protocol, nss.nss_version)); + } + + /* Set next location to first valid location. */ + /* If start location is invalid, find next location. */ + nss.nss_nextloc = nss.nss_startloc; + if ((nss.nss_nextloc.nli_serv >= nmp->nm_locations.nl_locations[nss.nss_nextloc.nli_loc]->nl_servcount) || + (nss.nss_nextloc.nli_addr >= nmp->nm_locations.nl_locations[nss.nss_nextloc.nli_loc]->nl_servers[nss.nss_nextloc.nli_serv]->ns_addrcount)) { + nfs_location_next(&nmp->nm_locations, &nss.nss_nextloc); + if (!nfs_location_index_cmp(&nss.nss_nextloc, &nss.nss_startloc)) { + NFS_SOCK_DBG(("nfs connect %s search failed, couldn't find a valid location index\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname)); + return (ENOENT); + } + } + nss.nss_last = -1; + +keepsearching: + + error = nfs_connect_search_loop(nmp, &nss); + if (error || !nss.nss_sock) { + /* search failed */ + nfs_socket_search_cleanup(&nss); + if (!error && (nss.nss_sotype == SOCK_STREAM) && !nmp->nm_sotype && (nmp->nm_vers < NFS_VER4)) { + /* Try using UDP */ + sotype = SOCK_DGRAM; + savederror = nss.nss_error; + NFS_SOCK_DBG(("nfs connect %s TCP failed %d %d, trying UDP\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, error, nss.nss_error)); + goto tryagain; + } + if (!error) + error = nss.nss_error ? nss.nss_error : ETIMEDOUT; + lck_mtx_lock(&nmp->nm_lock); + nmp->nm_sockflags &= ~NMSOCK_CONNECTING; + nmp->nm_nss = NULL; + lck_mtx_unlock(&nmp->nm_lock); + if (nss.nss_flags & NSS_WARNED) + log(LOG_INFO, "nfs_connect: socket connect aborted for %s\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname); + if (fh) + FREE(fh, M_TEMP); + if (path) + FREE_ZONE(path, MAXPATHLEN, M_NAMEI); + NFS_SOCK_DBG(("nfs connect %s search failed, returning %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, error)); + return (error); + } + + /* try to use nss_sock */ + nso = nss.nss_sock; + nss.nss_sock = NULL; + + /* We may be speaking to portmap first... to determine port(s). */ + if (nso->nso_saddr->sa_family == AF_INET) + port = ntohs(((struct sockaddr_in*)nso->nso_saddr)->sin_port); + else + port = ntohs(((struct sockaddr_in6*)nso->nso_saddr)->sin6_port); + if (port == PMAPPORT) { + /* Use this portmapper port to get the port #s we need. */ + NFS_SOCK_DBG(("nfs connect %s got portmapper socket %p\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso)); + + /* remove the connect upcall so nfs_portmap_lookup() can use this socket */ + sock_setupcall(nso->nso_so, NULL, NULL); + + /* Set up socket address and port for NFS socket. */ + bcopy(nso->nso_saddr, &ss, nso->nso_saddr->sa_len); + + /* If NFS version not set, try NFSv3 then NFSv2. */ + nfsvers = nmp->nm_vers ? 
nmp->nm_vers : NFS_VER3; + + if (!(port = nmp->nm_nfsport)) { + if (ss.ss_family == AF_INET) + ((struct sockaddr_in*)&ss)->sin_port = htons(0); + else if (ss.ss_family == AF_INET6) + ((struct sockaddr_in6*)&ss)->sin6_port = htons(0); + error = nfs_portmap_lookup(nmp, vfs_context_current(), (struct sockaddr*)&ss, + nso->nso_so, NFS_PROG, nfsvers, + (nso->nso_sotype == SOCK_DGRAM) ? IPPROTO_UDP : IPPROTO_TCP, timeo); + if (!error) { + if (ss.ss_family == AF_INET) + port = ntohs(((struct sockaddr_in*)&ss)->sin_port); + else if (ss.ss_family == AF_INET6) + port = ntohs(((struct sockaddr_in6*)&ss)->sin6_port); + if (!port) + error = EPROGUNAVAIL; + } + if (error && !nmp->nm_vers) { + nfsvers = NFS_VER2; + error = nfs_portmap_lookup(nmp, vfs_context_current(), (struct sockaddr*)&ss, + nso->nso_so, NFS_PROG, nfsvers, + (nso->nso_sotype == SOCK_DGRAM) ? IPPROTO_UDP : IPPROTO_TCP, timeo); + if (!error) { + if (ss.ss_family == AF_INET) + port = ntohs(((struct sockaddr_in*)&ss)->sin_port); + else if (ss.ss_family == AF_INET6) + port = ntohs(((struct sockaddr_in6*)&ss)->sin6_port); + if (!port) + error = EPROGUNAVAIL; + } + } + if (error) { + nfs_socket_search_update_error(&nss, error); + nfs_socket_destroy(nso); + goto keepsearching; + } + } + /* Create NFS protocol socket and add it to the list of sockets. */ + error = nfs_socket_create(nmp, (struct sockaddr*)&ss, nso->nso_sotype, port, + NFS_PROG, nfsvers, NMFLAG(nmp, RESVPORT), &nsonfs); + if (error) { + nfs_socket_search_update_error(&nss, error); + nfs_socket_destroy(nso); + goto keepsearching; + } + nsonfs->nso_location = nso->nso_location; + nsonfs->nso_wake = &nss; + error = sock_setupcall(nsonfs->nso_so, nfs_connect_upcall, nsonfs); + if (error) { + nfs_socket_search_update_error(&nss, error); + nfs_socket_destroy(nsonfs); + nfs_socket_destroy(nso); + goto keepsearching; + } + TAILQ_INSERT_TAIL(&nss.nss_socklist, nsonfs, nso_link); + nss.nss_sockcnt++; + if ((nfsvers < NFS_VER4) && !(nmp->nm_sockflags & NMSOCK_HASCONNECTED) && !NM_OMATTR_GIVEN(nmp, FH)) { + /* Set up socket address and port for MOUNT socket. */ error = 0; + bcopy(nso->nso_saddr, &ss, nso->nso_saddr->sa_len); + port = nmp->nm_mountport; + if (ss.ss_family == AF_INET) + ((struct sockaddr_in*)&ss)->sin_port = htons(port); + else if (ss.ss_family == AF_INET6) + ((struct sockaddr_in6*)&ss)->sin6_port = htons(port); + if (!port) { + /* Get port/sockaddr for MOUNT version corresponding to NFS version. */ + /* If NFS version is unknown, optimistically choose for NFSv3. */ + int mntvers = (nfsvers == NFS_VER2) ? RPCMNT_VER1 : RPCMNT_VER3; + int mntproto = (NM_OMFLAG(nmp, MNTUDP) || (nso->nso_sotype == SOCK_DGRAM)) ? 
IPPROTO_UDP : IPPROTO_TCP; + error = nfs_portmap_lookup(nmp, vfs_context_current(), (struct sockaddr*)&ss, + nso->nso_so, RPCPROG_MNT, mntvers, mntproto, timeo); + } + if (!error) { + if (ss.ss_family == AF_INET) + port = ntohs(((struct sockaddr_in*)&ss)->sin_port); + else if (ss.ss_family == AF_INET6) + port = ntohs(((struct sockaddr_in6*)&ss)->sin6_port); + if (!port) + error = EPROGUNAVAIL; + } + /* create sockaddr for MOUNT */ + if (!error) + MALLOC(nsonfs->nso_saddr2, struct sockaddr *, ss.ss_len, M_SONAME, M_WAITOK|M_ZERO); + if (!error && !nsonfs->nso_saddr2) + error = ENOMEM; + if (!error) + bcopy(&ss, nsonfs->nso_saddr2, ss.ss_len); + if (error) { + lck_mtx_lock(&nsonfs->nso_lock); + nsonfs->nso_error = error; + nsonfs->nso_flags |= NSO_DEAD; + lck_mtx_unlock(&nsonfs->nso_lock); + } } + nfs_socket_destroy(nso); + goto keepsearching; } - /* set SO_NOADDRERR to detect network changes ASAP */ - error = sock_setsockopt(so, SOL_SOCKET, SO_NOADDRERR, &on, sizeof(on)); + /* nso is an NFS socket */ + NFS_SOCK_DBG(("nfs connect %s got NFS socket %p\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso)); + + /* If NFS version wasn't specified, it was determined during the connect. */ + nfsvers = nmp->nm_vers ? nmp->nm_vers : (int)nso->nso_version; + + /* Perform MOUNT call for initial NFSv2/v3 connection/mount. */ + if ((nfsvers < NFS_VER4) && !(nmp->nm_sockflags & NMSOCK_HASCONNECTED) && !NM_OMATTR_GIVEN(nmp, FH)) { + error = 0; + saddr = nso->nso_saddr2; + if (!saddr) { + /* Need sockaddr for MOUNT port */ + bcopy(nso->nso_saddr, &ss, nso->nso_saddr->sa_len); + port = nmp->nm_mountport; + if (ss.ss_family == AF_INET) + ((struct sockaddr_in*)&ss)->sin_port = htons(port); + else if (ss.ss_family == AF_INET6) + ((struct sockaddr_in6*)&ss)->sin6_port = htons(port); + if (!port) { + /* Get port/sockaddr for MOUNT version corresponding to NFS version. */ + int mntvers = (nfsvers == NFS_VER2) ? RPCMNT_VER1 : RPCMNT_VER3; + int mntproto = (NM_OMFLAG(nmp, MNTUDP) || (nso->nso_sotype == SOCK_DGRAM)) ? IPPROTO_UDP : IPPROTO_TCP; + error = nfs_portmap_lookup(nmp, vfs_context_current(), (struct sockaddr*)&ss, + NULL, RPCPROG_MNT, mntvers, mntproto, timeo); + if (ss.ss_family == AF_INET) + port = ntohs(((struct sockaddr_in*)&ss)->sin_port); + else if (ss.ss_family == AF_INET6) + port = ntohs(((struct sockaddr_in6*)&ss)->sin6_port); + } + if (!error) { + if (port) + saddr = (struct sockaddr*)&ss; + else + error = EPROGUNAVAIL; + } + } + if (saddr) + MALLOC(fh, fhandle_t *, sizeof(fhandle_t), M_TEMP, M_WAITOK|M_ZERO); + if (saddr && fh) + MALLOC_ZONE(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (!saddr || !fh || !path) { + if (!error) + error = ENOMEM; + if (fh) + FREE(fh, M_TEMP); + if (path) + FREE_ZONE(path, MAXPATHLEN, M_NAMEI); + fh = NULL; + path = NULL; + nfs_socket_search_update_error(&nss, error); + nfs_socket_destroy(nso); + goto keepsearching; + } + nfs_location_mntfromname(&nmp->nm_locations, nso->nso_location, path, MAXPATHLEN, 1); + error = nfs3_mount_rpc(nmp, saddr, nso->nso_sotype, nfsvers, + path, vfs_context_current(), timeo, fh, &nmp->nm_servsec); + NFS_SOCK_DBG(("nfs connect %s socket %p mount %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, error)); + if (!error) { + /* Make sure we can agree on a security flavor. */ + int o, s; /* indices into mount option and server security flavor lists */ + int found = 0; + + if ((nfsvers == NFS_VER3) && !nmp->nm_servsec.count) { + /* Some servers return an empty list to indicate RPCAUTH_SYS? 
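+				 * Treat it as a one-entry list containing RPCAUTH_SYS
+				 * so the flavor matching below has something to match.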
+				 */
+				nmp->nm_servsec.count = 1;
+				nmp->nm_servsec.flavors[0] = RPCAUTH_SYS;
+			}
+			if (nmp->nm_sec.count) {
+				/* Choose the first flavor in our list that the server supports. */
+				if (!nmp->nm_servsec.count) {
+					/* we don't know what the server supports, just use our first choice */
+					nmp->nm_auth = nmp->nm_sec.flavors[0];
+					found = 1;
+				}
+				for (o=0; !found && (o < nmp->nm_sec.count); o++)
+					for (s=0; !found && (s < nmp->nm_servsec.count); s++)
+						if (nmp->nm_sec.flavors[o] == nmp->nm_servsec.flavors[s]) {
+							nmp->nm_auth = nmp->nm_sec.flavors[o];
+							found = 1;
+						}
+			} else {
+				/* Choose the first one we support from the server's list. */
+				if (!nmp->nm_servsec.count) {
+					nmp->nm_auth = RPCAUTH_SYS;
+					found = 1;
+				}
+				for (s=0; s < nmp->nm_servsec.count; s++)
+					switch (nmp->nm_servsec.flavors[s]) {
+					case RPCAUTH_SYS:
+						/* prefer RPCAUTH_SYS to RPCAUTH_NONE */
+						if (found && (nmp->nm_auth == RPCAUTH_NONE))
+							found = 0;
+						/* FALLTHROUGH */
+					case RPCAUTH_NONE:
+					case RPCAUTH_KRB5:
+					case RPCAUTH_KRB5I:
+					case RPCAUTH_KRB5P:
+						if (!found) {
+							nmp->nm_auth = nmp->nm_servsec.flavors[s];
+							found = 1;
+						}
+						break;
+					}
+			}
+			error = !found ? EAUTH : 0;
+		}
+		FREE_ZONE(path, MAXPATHLEN, M_NAMEI);
+		path = NULL;
+		if (error) {
+			nfs_socket_search_update_error(&nss, error);
+			FREE(fh, M_TEMP);
+			fh = NULL;
+			nfs_socket_destroy(nso);
+			goto keepsearching;
+		}
+		if (nmp->nm_fh)
+			FREE(nmp->nm_fh, M_TEMP);
+		nmp->nm_fh = fh;
+		fh = NULL;
+		NFS_BITMAP_SET(nmp->nm_flags, NFS_MFLAG_CALLUMNT);
+	}
+
+	/* put the real upcall in place */
+	upcall = (nso->nso_sotype == SOCK_STREAM) ? nfs_tcp_rcv : nfs_udp_rcv;
+	error = sock_setupcall(nso->nso_so, upcall, nmp);
 	if (error) {
-		lck_mtx_unlock(&nmp->nm_lock);
-		goto bad;
+		nfs_socket_search_update_error(&nss, error);
+		nfs_socket_destroy(nso);
+		goto keepsearching;
 	}
-	/* just playin' it safe */
-	sock_setsockopt(so, SOL_SOCKET, SO_UPCALLCLOSEWAIT, &on, sizeof(on));
-	if (!(nmp->nm_flag & NFSMNT_INT))
-		sock_nointerrupt(so, 1);
+	if (!(nmp->nm_sockflags & NMSOCK_HASCONNECTED)) {
+		/* set mntfromname to this location */
+		if (!NM_OMATTR_GIVEN(nmp, MNTFROM))
+			nfs_location_mntfromname(&nmp->nm_locations, nso->nso_location,
+				vfs_statfs(nmp->nm_mountp)->f_mntfromname,
+				sizeof(vfs_statfs(nmp->nm_mountp)->f_mntfromname), 0);
+		/* some negotiated values need to remain unchanged for the life of the mount */
+		if (!nmp->nm_sotype)
+			nmp->nm_sotype = nso->nso_sotype;
+		if (!nmp->nm_vers) {
+			nmp->nm_vers = nfsvers;
+			/* If we negotiated NFSv4, set nm_nfsport if we ended up on the standard NFS port */
+			if ((nfsvers >= NFS_VER4) && !NFS_BITMAP_ISSET(nmp->nm_mattrs, NFS_MATTR_NFS_PORT)) {
+				if (nso->nso_saddr->sa_family == AF_INET)
+					port = ntohs(((struct sockaddr_in*)nso->nso_saddr)->sin_port);
+				else if (nso->nso_saddr->sa_family == AF_INET6)
+					port = ntohs(((struct sockaddr_in6*)nso->nso_saddr)->sin6_port);
+				else
+					port = 0;
+				if (port == NFS_PORT)
+					nmp->nm_nfsport = NFS_PORT;
+			}
+		}
+		/* do some version-specific pre-mount set up */
+		if (nmp->nm_vers >= NFS_VER4) {
+			microtime(&now);
+			nmp->nm_mounttime = ((uint64_t)now.tv_sec << 32) | now.tv_usec;
+			if (!NMFLAG(nmp, NOCALLBACK))
+				nfs4_mount_callback_setup(nmp);
+		}
+	}
-	/* Initialize socket state variables */
+	/* Initialize NFS socket state variables */
+	lck_mtx_lock(&nmp->nm_lock);
 	nmp->nm_srtt[0] = nmp->nm_srtt[1] = nmp->nm_srtt[2] = nmp->nm_srtt[3] = (NFS_TIMEO << 3);
 	nmp->nm_sdrtt[0] = nmp->nm_sdrtt[1] = nmp->nm_sdrtt[2] = nmp->nm_sdrtt[3] = 0;
-	if (nmp->nm_sotype == SOCK_DGRAM) {
-		/* XXX do we really want to
reset this on each reconnect? */ + if (nso->nso_sotype == SOCK_DGRAM) { nmp->nm_cwnd = NFS_MAXCWND / 2; /* Initial send window */ nmp->nm_sent = 0; - } else if (nmp->nm_sotype == SOCK_STREAM) { - nmp->nm_markerleft = sizeof(nmp->nm_fragleft); - nmp->nm_fragleft = nmp->nm_reclen = 0; + } else if (nso->nso_sotype == SOCK_STREAM) { nmp->nm_timeouts = 0; } nmp->nm_sockflags &= ~NMSOCK_CONNECTING; nmp->nm_sockflags |= NMSOCK_SETUP; - FSDBG(529, nmp, nmp->nm_state, nmp->nm_flag, nmp->nm_cwnd); + /* move the socket to the mount structure */ + nmp->nm_nso = nso; + oldsaddr = nmp->nm_saddr; + nmp->nm_saddr = nso->nso_saddr; lck_mtx_unlock(&nmp->nm_lock); error = nfs_connect_setup(nmp); -bad: lck_mtx_lock(&nmp->nm_lock); - nmp->nm_sockflags &= ~(NMSOCK_CONNECTING|NMSOCK_SETUP); + nmp->nm_sockflags &= ~NMSOCK_SETUP; if (!error) { nmp->nm_sockflags |= NMSOCK_READY; wakeup(&nmp->nm_sockflags); } + if (error) { + NFS_SOCK_DBG(("nfs connect %s socket %p setup failed %d\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname, nso, error)); + nfs_socket_search_update_error(&nss, error); + nmp->nm_saddr = oldsaddr; + if (!(nmp->nm_sockflags & NMSOCK_HASCONNECTED)) { + /* undo settings made prior to setup */ + if (!NFS_BITMAP_ISSET(nmp->nm_mattrs, NFS_MATTR_SOCKET_TYPE)) + nmp->nm_sotype = 0; + if (!NFS_BITMAP_ISSET(nmp->nm_mattrs, NFS_MATTR_NFS_VERSION)) { + if (nmp->nm_vers >= NFS_VER4) { + if (!NFS_BITMAP_ISSET(nmp->nm_mattrs, NFS_MATTR_NFS_PORT)) + nmp->nm_nfsport = 0; + if (nmp->nm_cbid) + nfs4_mount_callback_shutdown(nmp); + if (IS_VALID_CRED(nmp->nm_mcred)) + kauth_cred_unref(&nmp->nm_mcred); + bzero(&nmp->nm_un, sizeof(nmp->nm_un)); + } + nmp->nm_vers = 0; + } + } + lck_mtx_unlock(&nmp->nm_lock); + nmp->nm_nso = NULL; + nfs_socket_destroy(nso); + goto keepsearching; + } + + /* update current location */ + if ((nmp->nm_locations.nl_current.nli_flags & NLI_VALID) && + (nmp->nm_locations.nl_current.nli_serv != nso->nso_location.nli_serv)) { + /* server has changed, we should initiate failover/recovery */ + // XXX + } + nmp->nm_locations.nl_current = nso->nso_location; + nmp->nm_locations.nl_current.nli_flags |= NLI_VALID; + + if (!(nmp->nm_sockflags & NMSOCK_HASCONNECTED)) { + /* We have now successfully connected... make a note of it. 
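+		 * NMSOCK_HASCONNECTED is what lets later reconnects skip the
+		 * protocol/version negotiation and go straight to the NFS port.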
*/ + nmp->nm_sockflags |= NMSOCK_HASCONNECTED; + } + lck_mtx_unlock(&nmp->nm_lock); - return (error); + if (oldsaddr) + FREE(oldsaddr, M_SONAME); + + if (nss.nss_flags & NSS_WARNED) + log(LOG_INFO, "nfs_connect: socket connect completed for %s\n", + vfs_statfs(nmp->nm_mountp)->f_mntfromname); + + nmp->nm_nss = NULL; + nfs_socket_search_cleanup(&nss); + if (fh) + FREE(fh, M_TEMP); + if (path) + FREE_ZONE(path, MAXPATHLEN, M_NAMEI); + NFS_SOCK_DBG(("nfs connect %s success\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname)); + return (0); } + /* setup & confirm socket connection is functional */ int nfs_connect_setup(struct nfsmount *nmp) { - struct nfsm_chain nmreq, nmrep; - int error = 0, status; - u_int64_t xid; + int error = 0; if (nmp->nm_vers >= NFS_VER4) { - error = nfs4_setclientid(nmp); - if (error) - return (error); - error = nfs4_renew(nmp, R_SETUP); - if ((error == NFSERR_ADMIN_REVOKED) || - (error == NFSERR_EXPIRED) || - (error == NFSERR_LEASE_MOVED) || - (error == NFSERR_STALE_CLIENTID)) { - lck_mtx_lock(&nmp->nm_lock); - nmp->nm_state |= NFSSTA_RECOVER; - lck_mtx_unlock(&nmp->nm_lock); + if (nmp->nm_state & NFSSTA_CLIENTID) { + /* first, try to renew our current state */ + error = nfs4_renew(nmp, R_SETUP); + if ((error == NFSERR_ADMIN_REVOKED) || + (error == NFSERR_CB_PATH_DOWN) || + (error == NFSERR_EXPIRED) || + (error == NFSERR_LEASE_MOVED) || + (error == NFSERR_STALE_CLIENTID)) { + lck_mtx_lock(&nmp->nm_lock); + nfs_need_recover(nmp, error); + lck_mtx_unlock(&nmp->nm_lock); + } } - } else { - /* verify connection's OK by sending a NULL request */ - nfsm_chain_null(&nmreq); - nfsm_chain_null(&nmrep); - nfsm_chain_build_alloc_init(error, &nmreq, 0); - nfsm_chain_build_done(error, &nmreq); - nfsmout_if(error); - error = nfs_request2(NULL, nmp->nm_mountp, &nmreq, NFSPROC_NULL, - current_thread(), NULL, R_SETUP, &nmrep, &xid, &status); - if (!error) - error = status; -nfsmout: - nfsm_chain_cleanup(&nmreq); - nfsm_chain_cleanup(&nmrep); + error = nfs4_setclientid(nmp); } return (error); } @@ -422,10 +1504,10 @@ nfs_reconnect(struct nfsmount *nmp) nfs_disconnect(nmp); - while ((error = nfs_connect(nmp, verbose))) { + while ((error = nfs_connect(nmp, verbose, 30))) { verbose = 0; nfs_disconnect(nmp); - if (error == EINTR || error == ERESTART) + if ((error == EINTR) || (error == ERESTART)) return (EINTR); if (error == EIO) return (EIO); @@ -485,19 +1567,32 @@ nfs_reconnect(struct nfsmount *nmp) void nfs_disconnect(struct nfsmount *nmp) { - socket_t so; + struct nfs_socket *nso; lck_mtx_lock(&nmp->nm_lock); - if ((nmp->nm_sotype == SOCK_STREAM) && nmp->nm_m) { - mbuf_freem(nmp->nm_m); - nmp->nm_m = nmp->nm_mlast = NULL; - } - if (nmp->nm_so) { - so = nmp->nm_so; - nmp->nm_so = NULL; +tryagain: + if (nmp->nm_nso) { + struct timespec ts = { 1, 0 }; + if (nmp->nm_state & NFSSTA_SENDING) { /* wait for sending to complete */ + nmp->nm_state |= NFSSTA_WANTSND; + msleep(&nmp->nm_state, &nmp->nm_lock, PZERO-1, "nfswaitsending", &ts); + goto tryagain; + } + if (nmp->nm_sockflags & NMSOCK_POKE) { /* wait for poking to complete */ + msleep(&nmp->nm_sockflags, &nmp->nm_lock, PZERO-1, "nfswaitpoke", &ts); + goto tryagain; + } + nmp->nm_sockflags |= NMSOCK_DISCONNECTING; + nmp->nm_sockflags &= ~NMSOCK_READY; + nso = nmp->nm_nso; + nmp->nm_nso = NULL; + if (nso->nso_saddr == nmp->nm_saddr) + nso->nso_saddr = NULL; + lck_mtx_unlock(&nmp->nm_lock); + nfs_socket_destroy(nso); + lck_mtx_lock(&nmp->nm_lock); + nmp->nm_sockflags &= ~NMSOCK_DISCONNECTING; lck_mtx_unlock(&nmp->nm_lock); - sock_shutdown(so, 
SHUT_RDWR); - sock_close(so); } else { lck_mtx_unlock(&nmp->nm_lock); } @@ -536,6 +1631,7 @@ nfs_need_reconnect(struct nfsmount *nmp) lck_mtx_unlock(nfs_request_mutex); } + /* * thread to handle miscellaneous async NFS socket work (reconnects/resends) */ @@ -547,24 +1643,22 @@ nfs_mount_sock_thread(void *arg, __unused wait_result_t wr) thread_t thd = current_thread(); struct nfsreq *req; struct timeval now; - int error, dofinish, force; + int error, dofinish; nfsnode_t np; - fhandle_t fh; - nfs_stateid dstateid; lck_mtx_lock(&nmp->nm_lock); while (!(nmp->nm_sockflags & NMSOCK_READY) || !TAILQ_EMPTY(&nmp->nm_resendq) || + !LIST_EMPTY(&nmp->nm_monlist) || nmp->nm_deadto_start || - ((nmp->nm_vers >= NFS_VER4) && - ((nmp->nm_state & NFSSTA_RECOVER) || !TAILQ_EMPTY(&nmp->nm_recallq)))) + (nmp->nm_state & NFSSTA_RECOVER) || + ((nmp->nm_vers >= NFS_VER4) && !TAILQ_EMPTY(&nmp->nm_dreturnq))) { if (nmp->nm_sockflags & NMSOCK_UNMOUNT) break; - force = (nmp->nm_state & NFSSTA_FORCE); /* do reconnect, if necessary */ - if (!(nmp->nm_sockflags & NMSOCK_READY) && !force) { + if (!(nmp->nm_sockflags & NMSOCK_READY) && !(nmp->nm_state & NFSSTA_FORCE)) { if (nmp->nm_reconnect_start <= 0) { microuptime(&now); nmp->nm_reconnect_start = now.tv_sec; @@ -577,38 +1671,27 @@ nfs_mount_sock_thread(void *arg, __unused wait_result_t wr) } if ((nmp->nm_sockflags & NMSOCK_READY) && (nmp->nm_state & NFSSTA_RECOVER) && - !(nmp->nm_sockflags & NMSOCK_UNMOUNT) && !force) { + !(nmp->nm_sockflags & NMSOCK_UNMOUNT) && + !(nmp->nm_state & NFSSTA_FORCE)) { /* perform state recovery */ lck_mtx_unlock(&nmp->nm_lock); - nfs4_recover(nmp); + nfs_recover(nmp); lck_mtx_lock(&nmp->nm_lock); } - /* handle NFSv4 delegation recalls */ - while ((nmp->nm_vers >= NFS_VER4) && !force && + /* handle NFSv4 delegation returns */ + while ((nmp->nm_vers >= NFS_VER4) && !(nmp->nm_state & NFSSTA_FORCE) && (nmp->nm_sockflags & NMSOCK_READY) && !(nmp->nm_state & NFSSTA_RECOVER) && - ((np = TAILQ_FIRST(&nmp->nm_recallq)))) { - TAILQ_REMOVE(&nmp->nm_recallq, np, n_dlink); - np->n_dlink.tqe_next = NFSNOLIST; + ((np = TAILQ_FIRST(&nmp->nm_dreturnq)))) { lck_mtx_unlock(&nmp->nm_lock); - lck_mtx_lock(&np->n_openlock); - dstateid = np->n_dstateid; - if (np->n_openflags & N_DELEG_MASK) { - fh.fh_len = np->n_fhsize; - bcopy(np->n_fhp, &fh.fh_data, fh.fh_len); - np->n_openflags &= ~N_DELEG_MASK; - lck_mtx_unlock(&np->n_openlock); - nfs4_delegreturn_rpc(nmp, fh.fh_data, fh.fh_len, &dstateid, thd, nmp->nm_mcred); - } else { - lck_mtx_unlock(&np->n_openlock); - } + nfs4_delegation_return(np, R_RECOVER, thd, nmp->nm_mcred); lck_mtx_lock(&nmp->nm_lock); } /* do resends, if necessary/possible */ - while ((((nmp->nm_sockflags & NMSOCK_READY) && !(nmp->nm_state & NFSSTA_RECOVER)) || force) && + while ((((nmp->nm_sockflags & NMSOCK_READY) && !(nmp->nm_state & NFSSTA_RECOVER)) || (nmp->nm_state & NFSSTA_FORCE)) && ((req = TAILQ_FIRST(&nmp->nm_resendq)))) { if (req->r_resendtime) microuptime(&now); - while (req && !force && req->r_resendtime && (now.tv_sec < req->r_resendtime)) + while (req && !(nmp->nm_state & NFSSTA_FORCE) && req->r_resendtime && (now.tv_sec < req->r_resendtime)) req = TAILQ_NEXT(req, r_rchain); if (!req) break; @@ -626,20 +1709,20 @@ nfs_mount_sock_thread(void *arg, __unused wait_result_t wr) lck_mtx_lock(&nmp->nm_lock); continue; } - if ((req->r_flags & R_RESTART) || req->r_gss_ctx) { + if ((req->r_flags & R_RESTART) || nfs_request_using_gss(req)) { req->r_flags &= ~R_RESTART; req->r_resendtime = 0; lck_mtx_unlock(&req->r_mtx); /* async RPCs on GSS 
mounts need to be rebuilt and resent. */ nfs_reqdequeue(req); - if (req->r_gss_ctx) { + if (nfs_request_using_gss(req)) { nfs_gss_clnt_rpcdone(req); error = nfs_gss_clnt_args_restore(req); if (error == ENEEDAUTH) req->r_xid = 0; } NFS_SOCK_DBG(("nfs async%s restart: p %d x 0x%llx f 0x%x rtt %d\n", - req->r_gss_ctx ? " gss" : "", req->r_procnum, req->r_xid, + nfs_request_using_gss(req) ? " gss" : "", req->r_procnum, req->r_xid, req->r_flags, req->r_rtt)); error = !req->r_nmp ? ENXIO : 0; /* unmounted? */ if (!error) @@ -693,20 +1776,45 @@ nfs_mount_sock_thread(void *arg, __unused wait_result_t wr) } if (nmp->nm_deadto_start) nfs_mount_check_dead_timeout(nmp); - if (force || (nmp->nm_state & NFSSTA_DEAD)) + if (nmp->nm_state & (NFSSTA_FORCE|NFSSTA_DEAD)) break; - if ((nmp->nm_sockflags & NMSOCK_READY) || (nmp->nm_state & NFSSTA_RECOVER)) { + /* check monitored nodes, if necessary/possible */ + if (!LIST_EMPTY(&nmp->nm_monlist)) { + nmp->nm_state |= NFSSTA_MONITOR_SCAN; + LIST_FOREACH(np, &nmp->nm_monlist, n_monlink) { + if (!(nmp->nm_sockflags & NMSOCK_READY) || (nmp->nm_state & (NFSSTA_RECOVER|NFSSTA_UNMOUNTING|NFSSTA_FORCE))) + break; + np->n_mflag |= NMMONSCANINPROG; + lck_mtx_unlock(&nmp->nm_lock); + error = nfs_getattr(np, NULL, vfs_context_kernel(), (NGA_UNCACHED|NGA_MONITOR)); + if (!error && ISSET(np->n_flag, NUPDATESIZE)) /* update quickly to avoid multiple events */ + nfs_data_update_size(np, 0); + lck_mtx_lock(&nmp->nm_lock); + np->n_mflag &= ~NMMONSCANINPROG; + if (np->n_mflag & NMMONSCANWANT) { + np->n_mflag &= ~NMMONSCANWANT; + wakeup(&np->n_mflag); + } + if (error || !(nmp->nm_sockflags & NMSOCK_READY) || (nmp->nm_state & (NFSSTA_RECOVER|NFSSTA_UNMOUNTING|NFSSTA_FORCE))) + break; + } + nmp->nm_state &= ~NFSSTA_MONITOR_SCAN; + if (nmp->nm_state & NFSSTA_UNMOUNTING) + wakeup(&nmp->nm_state); /* let unmounting thread know scan is done */ + } + if ((nmp->nm_sockflags & NMSOCK_READY) || (nmp->nm_state & (NFSSTA_RECOVER|NFSSTA_UNMOUNTING))) { if (nmp->nm_deadto_start || !TAILQ_EMPTY(&nmp->nm_resendq) || (nmp->nm_state & NFSSTA_RECOVER)) ts.tv_sec = 1; else - ts.tv_sec = 30; + ts.tv_sec = 5; msleep(&nmp->nm_sockthd, &nmp->nm_lock, PSOCK, "nfssockthread", &ts); } } /* If we're unmounting, send the unmount RPC, if requested/appropriate. 
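 	 * "Appropriate" means an NFSv2/v3 mount with CALLUMNT set that is not
 	 * being forced off and not already dead; NFSv4 has no separate MOUNT
 	 * protocol to notify.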
*/ - if ((nmp->nm_sockflags & NMSOCK_UNMOUNT) && (nmp->nm_flag & NFSMNT_CALLUMNT) && + if ((nmp->nm_sockflags & NMSOCK_UNMOUNT) && + (nmp->nm_state & NFSSTA_MOUNTED) && NMFLAG(nmp, CALLUMNT) && (nmp->nm_vers < NFS_VER4) && !(nmp->nm_state & (NFSSTA_FORCE|NFSSTA_DEAD))) { lck_mtx_unlock(&nmp->nm_lock); nfs3_umount_rpc(nmp, vfs_context_kernel(), @@ -741,7 +1849,7 @@ nfs_mount_check_dead_timeout(struct nfsmount *nmp) { struct timeval now; - if (!(nmp->nm_flag & NFSMNT_DEADTIMEOUT)) + if (nmp->nm_deadtimeout <= 0) return; if (nmp->nm_deadto_start == 0) return; @@ -755,20 +1863,6 @@ nfs_mount_check_dead_timeout(struct nfsmount *nmp) vfs_event_signal(&vfs_statfs(nmp->nm_mountp)->f_fsid, VQ_DEAD, 0); } -/* - * RPC record marker parsing state - */ -struct nfs_rpc_record_state -{ - uint16_t nrrs_lastfrag; /* last fragment of record */ - uint16_t nrrs_markerleft; /* marker bytes remaining */ - uint32_t nrrs_fragleft; /* fragment bytes remaining */ - uint32_t nrrs_reclen; /* length of RPC record */ - mbuf_t nrrs_m; /* mbufs for current record */ - mbuf_t nrrs_mlast; -}; -int nfs_rpc_record_read(socket_t, struct nfs_rpc_record_state *, int *, mbuf_t *); - /* * NFS callback channel socket state */ @@ -776,7 +1870,7 @@ struct nfs_callback_socket { TAILQ_ENTRY(nfs_callback_socket) ncbs_link; socket_t ncbs_so; /* the socket */ - struct sockaddr_in ncbs_sin; /* socket address */ + struct sockaddr_storage ncbs_saddr; /* socket address */ struct nfs_rpc_record_state ncbs_rrs; /* RPC record parsing state */ time_t ncbs_stamp; /* last accessed at */ uint32_t ncbs_flags; /* see below */ @@ -795,7 +1889,9 @@ struct nfs_callback_socket * the requests up with mounts. */ socket_t nfs4_cb_so = NULL; +socket_t nfs4_cb_so6 = NULL; in_port_t nfs4_cb_port = 0; +in_port_t nfs4_cb_port6 = 0; uint32_t nfs4_cb_id = 0; uint32_t nfs4_cb_so_usecount = 0; TAILQ_HEAD(nfs4_cb_sock_list,nfs_callback_socket) nfs4_cb_socks; @@ -813,9 +1909,12 @@ void nfs4_mount_callback_setup(struct nfsmount *nmp) { struct sockaddr_in sin; + struct sockaddr_in6 sin6; socket_t so = NULL; + socket_t so6 = NULL; struct timeval timeo; int error, on = 1; + in_port_t port; lck_mtx_lock(nfs_global_mutex); if (nfs4_cb_id == 0) { @@ -834,32 +1933,34 @@ nfs4_mount_callback_setup(struct nfsmount *nmp) return; } + /* IPv4 */ error = sock_socket(AF_INET, SOCK_STREAM, IPPROTO_TCP, nfs4_cb_accept, NULL, &nfs4_cb_so); if (error) { - log(LOG_INFO, "nfs callback setup: error %d creating listening socket\n", error); + log(LOG_INFO, "nfs callback setup: error %d creating listening IPv4 socket\n", error); goto fail; } so = nfs4_cb_so; + sock_setsockopt(so, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr.s_addr = htonl(INADDR_ANY); - sin.sin_port = 0; + sin.sin_port = htons(nfs_callback_port); /* try to use specified port */ error = sock_bind(so, (struct sockaddr *)&sin); if (error) { - log(LOG_INFO, "nfs callback setup: error %d binding listening socket\n", error); + log(LOG_INFO, "nfs callback setup: error %d binding listening IPv4 socket\n", error); goto fail; } error = sock_getsockname(so, (struct sockaddr *)&sin, sin.sin_len); if (error) { - log(LOG_INFO, "nfs callback setup: error %d getting listening socket port\n", error); + log(LOG_INFO, "nfs callback setup: error %d getting listening IPv4 socket port\n", error); goto fail; } nfs4_cb_port = ntohs(sin.sin_port); error = sock_listen(so, 32); if (error) { - log(LOG_INFO, "nfs callback setup: error %d on listen\n", error); + log(LOG_INFO, 
"nfs callback setup: error %d on IPv4 listen\n", error); goto fail; } @@ -868,23 +1969,81 @@ nfs4_mount_callback_setup(struct nfsmount *nmp) timeo.tv_sec = 60; error = sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo)); if (error) - log(LOG_INFO, "nfs callback setup: error %d setting socket rx timeout\n", error); + log(LOG_INFO, "nfs callback setup: error %d setting IPv4 socket rx timeout\n", error); error = sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo)); if (error) - log(LOG_INFO, "nfs callback setup: error %d setting socket tx timeout\n", error); + log(LOG_INFO, "nfs callback setup: error %d setting IPv4 socket tx timeout\n", error); sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)); sock_setsockopt(so, SOL_SOCKET, SO_NOADDRERR, &on, sizeof(on)); sock_setsockopt(so, SOL_SOCKET, SO_UPCALLCLOSEWAIT, &on, sizeof(on)); error = 0; + /* IPv6 */ + error = sock_socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP, nfs4_cb_accept, NULL, &nfs4_cb_so6); + if (error) { + log(LOG_INFO, "nfs callback setup: error %d creating listening IPv6 socket\n", error); + goto fail; + } + so6 = nfs4_cb_so6; + + sock_setsockopt(so6, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); + sock_setsockopt(so6, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on)); + /* try to use specified port or same port as IPv4 */ + port = nfs_callback_port ? nfs_callback_port : nfs4_cb_port; +ipv6_bind_again: + sin6.sin6_len = sizeof(struct sockaddr_in6); + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = in6addr_any; + sin6.sin6_port = htons(port); + error = sock_bind(so6, (struct sockaddr *)&sin6); + if (error) { + if (port != nfs_callback_port) { + /* if we simply tried to match the IPv4 port, then try any port */ + port = 0; + goto ipv6_bind_again; + } + log(LOG_INFO, "nfs callback setup: error %d binding listening IPv6 socket\n", error); + goto fail; + } + error = sock_getsockname(so6, (struct sockaddr *)&sin6, sin6.sin6_len); + if (error) { + log(LOG_INFO, "nfs callback setup: error %d getting listening IPv6 socket port\n", error); + goto fail; + } + nfs4_cb_port6 = ntohs(sin6.sin6_port); + + error = sock_listen(so6, 32); + if (error) { + log(LOG_INFO, "nfs callback setup: error %d on IPv6 listen\n", error); + goto fail; + } + + /* receive timeout shouldn't matter. 
If timeout on send, we'll want to drop the socket */ + timeo.tv_usec = 0; + timeo.tv_sec = 60; + error = sock_setsockopt(so6, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo)); + if (error) + log(LOG_INFO, "nfs callback setup: error %d setting IPv6 socket rx timeout\n", error); + error = sock_setsockopt(so6, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo)); + if (error) + log(LOG_INFO, "nfs callback setup: error %d setting IPv6 socket tx timeout\n", error); + sock_setsockopt(so6, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)); + sock_setsockopt(so6, SOL_SOCKET, SO_NOADDRERR, &on, sizeof(on)); + sock_setsockopt(so6, SOL_SOCKET, SO_UPCALLCLOSEWAIT, &on, sizeof(on)); + error = 0; + fail: if (error) { - nfs4_cb_so = NULL; + nfs4_cb_so = nfs4_cb_so6 = NULL; lck_mtx_unlock(nfs_global_mutex); if (so) { sock_shutdown(so, SHUT_RDWR); sock_close(so); } + if (so6) { + sock_shutdown(so6, SHUT_RDWR); + sock_close(so6); + } } else { lck_mtx_unlock(nfs_global_mutex); } @@ -901,7 +2060,7 @@ void nfs4_mount_callback_shutdown(struct nfsmount *nmp) { struct nfs_callback_socket *ncbsp; - socket_t so; + socket_t so, so6; struct nfs4_cb_sock_list cb_socks; struct timespec ts = {1,0}; @@ -910,12 +2069,14 @@ nfs4_mount_callback_shutdown(struct nfsmount *nmp) /* wait for any callbacks in progress to complete */ while (nmp->nm_cbrefs) msleep(&nmp->nm_cbrefs, nfs_global_mutex, PSOCK, "cbshutwait", &ts); + nmp->nm_cbid = 0; if (--nfs4_cb_so_usecount) { lck_mtx_unlock(nfs_global_mutex); return; } so = nfs4_cb_so; - nfs4_cb_so = NULL; + so6 = nfs4_cb_so6; + nfs4_cb_so = nfs4_cb_so6 = NULL; TAILQ_INIT(&cb_socks); TAILQ_CONCAT(&cb_socks, &nfs4_cb_socks, ncbs_link); lck_mtx_unlock(nfs_global_mutex); @@ -923,10 +2084,15 @@ nfs4_mount_callback_shutdown(struct nfsmount *nmp) sock_shutdown(so, SHUT_RDWR); sock_close(so); } + if (so6) { + sock_shutdown(so6, SHUT_RDWR); + sock_close(so6); + } while ((ncbsp = TAILQ_FIRST(&cb_socks))) { TAILQ_REMOVE(&cb_socks, ncbsp, ncbs_link); sock_shutdown(ncbsp->ncbs_so, SHUT_RDWR); sock_close(ncbsp->ncbs_so); + nfs_rpc_record_state_cleanup(&ncbsp->ncbs_rrs); FREE(ncbsp, M_TEMP); } } @@ -958,6 +2124,7 @@ loop: lck_mtx_unlock(nfs_global_mutex); sock_shutdown(ncbsp->ncbs_so, SHUT_RDWR); sock_close(ncbsp->ncbs_so); + nfs_rpc_record_state_cleanup(&ncbsp->ncbs_rrs); FREE(ncbsp, M_TEMP); goto loop; } @@ -977,10 +2144,13 @@ nfs4_cb_accept(socket_t so, __unused void *arg, __unused int waitflag) struct nfs_callback_socket *ncbsp; struct nfsmount *nmp; struct timeval timeo, now; - struct sockaddr_in *saddr; - int error, on = 1; + int error, on = 1, ip; - if (so != nfs4_cb_so) + if (so == nfs4_cb_so) + ip = 4; + else if (so == nfs4_cb_so6) + ip = 6; + else return; /* allocate/initialize a new nfs_callback_socket */ @@ -990,15 +2160,15 @@ nfs4_cb_accept(socket_t so, __unused void *arg, __unused int waitflag) return; } bzero(ncbsp, sizeof(*ncbsp)); - ncbsp->ncbs_sin.sin_len = sizeof(struct sockaddr_in); - ncbsp->ncbs_rrs.nrrs_markerleft = sizeof(ncbsp->ncbs_rrs.nrrs_fragleft); + ncbsp->ncbs_saddr.ss_len = (ip == 4) ? 
sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); + nfs_rpc_record_state_init(&ncbsp->ncbs_rrs); /* accept a new socket */ - error = sock_accept(so, (struct sockaddr*)&ncbsp->ncbs_sin, - ncbsp->ncbs_sin.sin_len, MSG_DONTWAIT, + error = sock_accept(so, (struct sockaddr*)&ncbsp->ncbs_saddr, + ncbsp->ncbs_saddr.ss_len, MSG_DONTWAIT, nfs4_cb_rcv, ncbsp, &newso); if (error) { - log(LOG_INFO, "nfs callback accept: error %d accepting socket\n", error); + log(LOG_INFO, "nfs callback accept: error %d accepting IPv%d socket\n", error, ip); FREE(ncbsp, M_TEMP); return; } @@ -1009,11 +2179,12 @@ nfs4_cb_accept(socket_t so, __unused void *arg, __unused int waitflag) timeo.tv_sec = 60; error = sock_setsockopt(newso, SOL_SOCKET, SO_RCVTIMEO, &timeo, sizeof(timeo)); if (error) - log(LOG_INFO, "nfs callback socket: error %d setting socket rx timeout\n", error); + log(LOG_INFO, "nfs callback socket: error %d setting IPv%d socket rx timeout\n", error, ip); error = sock_setsockopt(newso, SOL_SOCKET, SO_SNDTIMEO, &timeo, sizeof(timeo)); if (error) - log(LOG_INFO, "nfs callback socket: error %d setting socket tx timeout\n", error); + log(LOG_INFO, "nfs callback socket: error %d setting IPv%d socket tx timeout\n", error, ip); sock_setsockopt(newso, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on)); + sock_setsockopt(newso, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); sock_setsockopt(newso, SOL_SOCKET, SO_NOADDRERR, &on, sizeof(on)); sock_setsockopt(newso, SOL_SOCKET, SO_UPCALLCLOSEWAIT, &on, sizeof(on)); @@ -1028,11 +2199,10 @@ nfs4_cb_accept(socket_t so, __unused void *arg, __unused int waitflag) /* verify it's from a host we have mounted */ TAILQ_FOREACH(nmp, &nfs4_cb_mounts, nm_cblink) { - /* check socket's source address matches this mount's server address */ - saddr = mbuf_data(nmp->nm_nam); - if ((ncbsp->ncbs_sin.sin_len == saddr->sin_len) && - (ncbsp->ncbs_sin.sin_family == saddr->sin_family) && - (ncbsp->ncbs_sin.sin_addr.s_addr == saddr->sin_addr.s_addr)) + /* check if socket's source address matches this mount's server address */ + if (!nmp->nm_saddr) + continue; + if (nfs_sockaddr_cmp((struct sockaddr*)&ncbsp->ncbs_saddr, nmp->nm_saddr) == 0) break; } if (!nmp) /* we don't want this socket, mark it dead */ @@ -1077,7 +2247,7 @@ nfs4_cb_rcv(socket_t so, void *arg, __unused int waitflag) /* loop while we make error-free progress */ while (!error && recv) { - error = nfs_rpc_record_read(so, &ncbsp->ncbs_rrs, &recv, &m); + error = nfs_rpc_record_read(so, &ncbsp->ncbs_rrs, MSG_DONTWAIT, &recv, &m); if (m) /* handle the request */ error = nfs4_cb_handler(ncbsp, m); } @@ -1111,7 +2281,6 @@ nfs4_cb_handler(struct nfs_callback_socket *ncbsp, mbuf_t mreq) socket_t so = ncbsp->ncbs_so; struct nfsm_chain nmreq, nmrep; mbuf_t mhead = NULL, mrest = NULL, m; - struct sockaddr_in *saddr; struct msghdr msg; struct nfsmount *nmp; fhandle_t fh; @@ -1203,12 +2372,10 @@ nfs4_cb_handler(struct nfs_callback_socket *ncbsp, mbuf_t mreq) if (nmp->nm_cbid != cbid) continue; /* verify socket's source address matches this mount's server address */ - saddr = mbuf_data(nmp->nm_nam); - if ((ncbsp->ncbs_sin.sin_len != saddr->sin_len) || - (ncbsp->ncbs_sin.sin_family != saddr->sin_family) || - (ncbsp->ncbs_sin.sin_addr.s_addr != saddr->sin_addr.s_addr)) + if (!nmp->nm_saddr) continue; - break; + if (nfs_sockaddr_cmp((struct sockaddr*)&ncbsp->ncbs_saddr, nmp->nm_saddr) == 0) + break; } /* mark the NFS mount as busy */ if (nmp) @@ -1240,7 +2407,7 @@ nfs4_cb_handler(struct nfs_callback_socket *ncbsp, mbuf_t mreq) numops = 0; /* don't 
process any more ops */ } else { /* find the node for the file handle */ - error = nfs_nget(nmp->nm_mountp, NULL, NULL, fh.fh_data, fh.fh_len, NULL, NULL, NG_NOCREATE, &np); + error = nfs_nget(nmp->nm_mountp, NULL, NULL, fh.fh_data, fh.fh_len, NULL, NULL, RPCAUTH_UNKNOWN, NG_NOCREATE, &np); if (error || !np) { status = NFSERR_BADHANDLE; error = 0; @@ -1301,7 +2468,7 @@ nfs4_cb_handler(struct nfs_callback_socket *ncbsp, mbuf_t mreq) numops = 0; /* don't process any more ops */ } else { /* find the node for the file handle */ - error = nfs_nget(nmp->nm_mountp, NULL, NULL, fh.fh_data, fh.fh_len, NULL, NULL, NG_NOCREATE, &np); + error = nfs_nget(nmp->nm_mountp, NULL, NULL, fh.fh_data, fh.fh_len, NULL, NULL, RPCAUTH_UNKNOWN, NG_NOCREATE, &np); if (error || !np) { status = NFSERR_BADHANDLE; error = 0; @@ -1313,14 +2480,8 @@ nfs4_cb_handler(struct nfs_callback_socket *ncbsp, mbuf_t mreq) status = NFSERR_BAD_STATEID; numops = 0; /* don't process any more ops */ } - if (!status) { - /* add node to recall queue, and wake socket thread */ - lck_mtx_lock(&nmp->nm_lock); - if (np->n_dlink.tqe_next == NFSNOLIST) - TAILQ_INSERT_TAIL(&nmp->nm_recallq, np, n_dlink); - nfs_mount_sock_thread_wake(nmp); - lck_mtx_unlock(&nmp->nm_lock); - } + if (!status) /* add node to recall queue, and wake socket thread */ + nfs4_delegation_return_enqueue(np); if (np) { nfs_node_unlock(np); vnode_put(NFSTOV(np)); @@ -1456,6 +2617,28 @@ out: } +/* + * Initialize an nfs_rpc_record_state structure. + */ +void +nfs_rpc_record_state_init(struct nfs_rpc_record_state *nrrsp) +{ + bzero(nrrsp, sizeof(*nrrsp)); + nrrsp->nrrs_markerleft = sizeof(nrrsp->nrrs_fragleft); +} + +/* + * Clean up an nfs_rpc_record_state structure. + */ +void +nfs_rpc_record_state_cleanup(struct nfs_rpc_record_state *nrrsp) +{ + if (nrrsp->nrrs_m) { + mbuf_freem(nrrsp->nrrs_m); + nrrsp->nrrs_m = nrrsp->nrrs_mlast = NULL; + } +} + /* * Read the next (marked) RPC record from the socket. * @@ -1463,7 +2646,7 @@ out: * *mp returns the next complete RPC record */ int -nfs_rpc_record_read(socket_t so, struct nfs_rpc_record_state *nrrsp, int *recvp, mbuf_t *mp) +nfs_rpc_record_read(socket_t so, struct nfs_rpc_record_state *nrrsp, int flags, int *recvp, mbuf_t *mp) { struct iovec aio; struct msghdr msg; @@ -1482,7 +2665,7 @@ nfs_rpc_record_read(socket_t so, struct nfs_rpc_record_state *nrrsp, int *recvp, bzero(&msg, sizeof(msg)); msg.msg_iov = &aio; msg.msg_iovlen = 1; - error = sock_receive(so, &msg, MSG_DONTWAIT, &rcvlen); + error = sock_receive(so, &msg, flags, &rcvlen); if (error || !rcvlen) break; *recvp = 1; @@ -1497,10 +2680,7 @@ nfs_rpc_record_read(socket_t so, struct nfs_rpc_record_state *nrrsp, int *recvp, } nrrsp->nrrs_reclen += nrrsp->nrrs_fragleft; if (nrrsp->nrrs_reclen > NFS_MAXPACKET) { - /* - * This is SERIOUS! We are out of sync with the sender - * and forcing a disconnect/reconnect is all I can do. - */ + /* This is SERIOUS! We are out of sync with the sender. 
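+			 * Stream transports carry RPC messages with record
+			 * marking: each fragment is preceded by a 4-byte marker,
+			 *	last = marker & 0x80000000	-- last-fragment flag
+			 *	len  = marker & 0x7fffffff	-- fragment byte count
+			 * A record claiming more than NFS_MAXPACKET means we've
+			 * lost the framing, and dropping the connection is the
+			 * only recovery.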
*/ log(LOG_ERR, "impossible RPC record length (%d) on callback", nrrsp->nrrs_reclen); error = EFBIG; } @@ -1510,7 +2690,7 @@ nfs_rpc_record_read(socket_t so, struct nfs_rpc_record_state *nrrsp, int *recvp, while (!error && !nrrsp->nrrs_markerleft && nrrsp->nrrs_fragleft) { m = NULL; rcvlen = nrrsp->nrrs_fragleft; - error = sock_receivembuf(so, NULL, &m, MSG_DONTWAIT, &rcvlen); + error = sock_receivembuf(so, NULL, &m, flags, &rcvlen); if (error || !rcvlen || !m) break; *recvp = 1; @@ -1579,7 +2759,7 @@ int nfs_send(struct nfsreq *req, int wait) { struct nfsmount *nmp; - socket_t so; + struct nfs_socket *nso; int error, error2, sotype, rexmit, slpflag = 0, needrecon; struct msghdr msg; struct sockaddr *sendnam; @@ -1597,7 +2777,7 @@ again: return (error); } - error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0); + error = nfs_sigintr(req->r_nmp, req, NULL, 0); if (error) { nfs_sndunlock(req); lck_mtx_lock(&req->r_mtx); @@ -1629,7 +2809,7 @@ again: lck_mtx_lock(&nmp->nm_lock); if (!(nmp->nm_sockflags & NMSOCK_READY) && !((nmp->nm_sockflags & NMSOCK_SETUP) && (req->r_flags & R_SETUP))) { - if (nmp->nm_flag & NFSMNT_INT) + if (NMFLAG(nmp, INTR) && !(req->r_flags & R_NOINTR)) slpflag |= PCATCH; lck_mtx_unlock(&nmp->nm_lock); nfs_sndunlock(req); @@ -1653,7 +2833,7 @@ again: error = EIO; break; } - if ((nmp->nm_flag & NFSMNT_SOFT) && (nmp->nm_reconnect_start > 0)) { + if (NMFLAG(nmp, SOFT) && (nmp->nm_reconnect_start > 0)) { struct timeval now; microuptime(&now); if ((now.tv_sec - nmp->nm_reconnect_start) >= 8) { @@ -1681,9 +2861,11 @@ again: } goto again; } - so = nmp->nm_so; + nso = nmp->nm_nso; + /* note that we're using the mount's socket to do the send */ + nmp->nm_state |= NFSSTA_SENDING; /* will be cleared by nfs_sndunlock() */ lck_mtx_unlock(&nmp->nm_lock); - if (!so) { + if (!nso) { nfs_sndunlock(req); lck_mtx_lock(&req->r_mtx); req->r_flags &= ~R_SENDING; @@ -1700,7 +2882,7 @@ again: lck_mtx_lock(&nmp->nm_lock); if (!(req->r_flags & R_CWND) && (nmp->nm_sent >= nmp->nm_cwnd)) { /* if we can't send this out yet, wait on the cwnd queue */ - slpflag = ((nmp->nm_flag & NFSMNT_INT) && req->r_thread) ? PCATCH : 0; + slpflag = (NMFLAG(nmp, INTR) && req->r_thread) ? 
PCATCH : 0; lck_mtx_unlock(&nmp->nm_lock); nfs_sndunlock(req); req->r_flags &= ~R_SENDING; @@ -1764,13 +2946,11 @@ again: } bzero(&msg, sizeof(msg)); - if (nmp->nm_nam && (sotype != SOCK_STREAM) && !sock_isconnected(so)) { - if ((sendnam = mbuf_data(nmp->nm_nam))) { - msg.msg_name = (caddr_t)sendnam; - msg.msg_namelen = sendnam->sa_len; - } + if ((sotype != SOCK_STREAM) && !sock_isconnected(nso->nso_so) && ((sendnam = nmp->nm_saddr))) { + msg.msg_name = (caddr_t)sendnam; + msg.msg_namelen = sendnam->sa_len; } - error = sock_sendmbuf(so, &msg, mreqcopy, 0, &sentlen); + error = sock_sendmbuf(nso->nso_so, &msg, mreqcopy, 0, &sentlen); #ifdef NFS_SOCKET_DEBUGGING if (error || (sentlen != req->r_mreqlen)) NFS_SOCK_DBG(("nfs_send: 0x%llx sent %d/%d error %d\n", @@ -1820,9 +3000,9 @@ again: * For now, ignore them all */ if ((error != EINTR) && (error != ERESTART) && - (error != EWOULDBLOCK) && (error != EIO)) { + (error != EWOULDBLOCK) && (error != EIO) && (nso == nmp->nm_nso)) { int clearerror = 0, optlen = sizeof(clearerror); - sock_getsockopt(so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen); + sock_getsockopt(nso->nso_so, SOL_SOCKET, SO_ERROR, &clearerror, &optlen); #ifdef NFS_SOCKET_DEBUGGING if (clearerror) NFS_SOCK_DBG(("nfs_send: ignoring UDP socket error %d so %d\n", @@ -1852,7 +3032,7 @@ again: needrecon = 1; break; } - if (needrecon) { /* mark socket as needing reconnect */ + if (needrecon && (nso == nmp->nm_nso)) { /* mark socket as needing reconnect */ NFS_SOCK_DBG(("nfs_send: 0x%llx need reconnect %d\n", req->r_xid, error)); nfs_need_reconnect(nmp); } @@ -1902,20 +3082,19 @@ void nfs_udp_rcv(socket_t so, void *arg, __unused int waitflag) { struct nfsmount *nmp = arg; + struct nfs_socket *nso = nmp->nm_nso; size_t rcvlen; mbuf_t m; int error = 0; - if (nmp->nm_sockflags & NMSOCK_CONNECTING) { - wakeup(&nmp->nm_so); - return; - } - - /* make sure we're on the current socket */ - if (nmp->nm_so != so) + if (nmp->nm_sockflags & NMSOCK_CONNECTING) return; do { + /* make sure we're on the current socket */ + if (!nso || (nso->nso_so != so)) + return; + m = NULL; rcvlen = 1000000; error = sock_receivembuf(so, NULL, &m, MSG_DONTWAIT, &rcvlen); @@ -1935,123 +3114,54 @@ void nfs_tcp_rcv(socket_t so, void *arg, __unused int waitflag) { struct nfsmount *nmp = arg; - struct iovec aio; - struct msghdr msg; - size_t rcvlen; + struct nfs_socket *nso = nmp->nm_nso; + struct nfs_rpc_record_state nrrs; mbuf_t m; int error = 0; - int recv; + int recv = 1; - if (nmp->nm_sockflags & NMSOCK_CONNECTING) { - wakeup(&nmp->nm_so); + if (nmp->nm_sockflags & NMSOCK_CONNECTING) return; - } /* make sure we're on the current socket */ - if (nmp->nm_so != so) - return; - lck_mtx_lock(&nmp->nm_lock); - if (nmp->nm_sockflags & NMSOCK_UPCALL) { - /* upcall is already receiving data - just return */ + nso = nmp->nm_nso; + if (!nso || (nso->nso_so != so) || (nmp->nm_sockflags & (NMSOCK_DISCONNECTING))) { lck_mtx_unlock(&nmp->nm_lock); return; } - nmp->nm_sockflags |= NMSOCK_UPCALL; - -nextfrag: - recv = 0; - - /* read the TCP RPC record marker */ - while (!error && nmp->nm_markerleft) { - aio.iov_base = ((char*)&nmp->nm_fragleft + - sizeof(nmp->nm_fragleft) - nmp->nm_markerleft); - aio.iov_len = nmp->nm_markerleft; - bzero(&msg, sizeof(msg)); - msg.msg_iov = &aio; - msg.msg_iovlen = 1; - lck_mtx_unlock(&nmp->nm_lock); - error = sock_receive(so, &msg, MSG_DONTWAIT, &rcvlen); - lck_mtx_lock(&nmp->nm_lock); - if (error || !rcvlen) - break; - recv = 1; - nmp->nm_markerleft -= rcvlen; - if (nmp->nm_markerleft) - continue; - /* 
record marker complete */ - nmp->nm_fragleft = ntohl(nmp->nm_fragleft); - if (nmp->nm_fragleft & 0x80000000) { - nmp->nm_sockflags |= NMSOCK_LASTFRAG; - nmp->nm_fragleft &= ~0x80000000; - } - nmp->nm_reclen += nmp->nm_fragleft; - if (nmp->nm_reclen > NFS_MAXPACKET) { - /* - * This is SERIOUS! We are out of sync with the sender - * and forcing a disconnect/reconnect is all I can do. - */ - log(LOG_ERR, "%s (%d) from nfs server %s\n", - "impossible RPC record length", nmp->nm_reclen, - vfs_statfs(nmp->nm_mountp)->f_mntfromname); - error = EFBIG; - } - } + lck_mtx_unlock(&nmp->nm_lock); - /* read the TCP RPC record fragment */ - while (!error && !nmp->nm_markerleft && nmp->nm_fragleft) { - m = NULL; - rcvlen = nmp->nm_fragleft; - lck_mtx_unlock(&nmp->nm_lock); - error = sock_receivembuf(so, NULL, &m, MSG_DONTWAIT, &rcvlen); - lck_mtx_lock(&nmp->nm_lock); - if (error || !rcvlen || !m) - break; - recv = 1; - /* append mbufs to list */ - nmp->nm_fragleft -= rcvlen; - if (!nmp->nm_m) { - nmp->nm_m = m; - } else { - error = mbuf_setnext(nmp->nm_mlast, m); - if (error) { - printf("nfs_tcp_rcv: mbuf_setnext failed %d\n", error); - mbuf_freem(m); - break; - } - } - while (mbuf_next(m)) - m = mbuf_next(m); - nmp->nm_mlast = m; + /* make sure this upcall should be trying to do work */ + lck_mtx_lock(&nso->nso_lock); + if (nso->nso_flags & (NSO_UPCALL|NSO_DISCONNECTING|NSO_DEAD)) { + lck_mtx_unlock(&nso->nso_lock); + return; } + nso->nso_flags |= NSO_UPCALL; + nrrs = nso->nso_rrs; + lck_mtx_unlock(&nso->nso_lock); - /* done reading fragment? */ - m = NULL; - if (!error && !nmp->nm_markerleft && !nmp->nm_fragleft) { - /* reset socket fragment parsing state */ - nmp->nm_markerleft = sizeof(nmp->nm_fragleft); - if (nmp->nm_sockflags & NMSOCK_LASTFRAG) { - /* RPC record complete */ - m = nmp->nm_m; - /* reset socket record parsing state */ - nmp->nm_reclen = 0; - nmp->nm_m = nmp->nm_mlast = NULL; - nmp->nm_sockflags &= ~NMSOCK_LASTFRAG; - } + /* loop while we make error-free progress */ + while (!error && recv) { + error = nfs_rpc_record_read(so, &nrrs, MSG_DONTWAIT, &recv, &m); + if (m) /* match completed response with request */ + nfs_request_match_reply(nmp, m); } - if (m) { /* match completed response with request */ + lck_mtx_lock(&nmp->nm_lock); + if (nmp->nm_nso == nso) { + /* still the same socket, so update socket's RPC parsing state */ + lck_mtx_unlock(&nmp->nm_lock); + lck_mtx_lock(&nso->nso_lock); + nso->nso_rrs = nrrs; + nso->nso_flags &= ~NSO_UPCALL; + lck_mtx_unlock(&nso->nso_lock); + if (nmp->nm_sockflags & NMSOCK_DISCONNECTING) + wakeup(&nmp->nm_sockflags); + } else { lck_mtx_unlock(&nmp->nm_lock); - nfs_request_match_reply(nmp, m); - lck_mtx_lock(&nmp->nm_lock); } - - /* loop if we've been making error-free progress */ - if (!error && recv) - goto nextfrag; - - nmp->nm_sockflags &= ~NMSOCK_UPCALL; - lck_mtx_unlock(&nmp->nm_lock); #ifdef NFS_SOCKET_DEBUGGING if (!recv && (error != EWOULDBLOCK)) NFS_SOCK_DBG(("nfs_tcp_rcv: got nothing, error %d, got FIN?\n", error)); @@ -2077,7 +3187,8 @@ nfs_sock_poke(struct nfsmount *nmp) int dummy; lck_mtx_lock(&nmp->nm_lock); - if ((nmp->nm_sockflags & NMSOCK_UNMOUNT) || !nmp->nm_so) { + if ((nmp->nm_sockflags & NMSOCK_UNMOUNT) || + !(nmp->nm_sockflags & NMSOCK_READY) || !nmp->nm_nso || !nmp->nm_nso->nso_so) { lck_mtx_unlock(&nmp->nm_lock); return; } @@ -2088,7 +3199,7 @@ nfs_sock_poke(struct nfsmount *nmp) bzero(&msg, sizeof(msg)); msg.msg_iov = &aio; msg.msg_iovlen = 1; - error = sock_send(nmp->nm_so, &msg, MSG_DONTWAIT, &len); + error = 
sock_send(nmp->nm_nso->nso_so, &msg, MSG_DONTWAIT, &len); NFS_SOCK_DBG(("nfs_sock_poke: error %d\n", error)); } @@ -2183,7 +3294,7 @@ nfs_request_match_reply(struct nfsmount *nmp, mbuf_t mrep) /* signal anyone waiting on this request */ wakeup(req); asyncioq = (req->r_callback.rcb_func != NULL); - if (req->r_gss_ctx != NULL) + if (nfs_request_using_gss(req)) nfs_gss_clnt_rpcdone(req); lck_mtx_unlock(&req->r_mtx); lck_mtx_unlock(nfs_request_mutex); @@ -2209,16 +3320,16 @@ int nfs_wait_reply(struct nfsreq *req) { struct timespec ts = { 2, 0 }; - int error = 0, slpflag; + int error = 0, slpflag, first = 1; - if (req->r_nmp && (req->r_nmp->nm_flag & NFSMNT_INT) && req->r_thread) + if (req->r_nmp && NMFLAG(req->r_nmp, INTR) && req->r_thread && !(req->r_flags & R_NOINTR)) slpflag = PCATCH; else slpflag = 0; lck_mtx_lock(&req->r_mtx); while (!req->r_nmrep.nmc_mhead) { - if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0))) + if ((error = nfs_sigintr(req->r_nmp, req, first ? NULL : req->r_thread, 0))) break; if (((error = req->r_error)) || req->r_nmrep.nmc_mhead) break; @@ -2228,9 +3339,9 @@ nfs_wait_reply(struct nfsreq *req) req->r_procnum, req->r_xid, req->r_flags, req->r_rtt)); req->r_flags |= R_SENDING; lck_mtx_unlock(&req->r_mtx); - if (req->r_gss_ctx) { + if (nfs_request_using_gss(req)) { /* - * It's an RPCSEC_GSS mount. + * It's an RPCSEC_GSS request. * Can't just resend the original request * without bumping the cred sequence number. * Go back and re-build the request. @@ -2253,7 +3364,7 @@ nfs_wait_reply(struct nfsreq *req) if (nfs_noremotehang(req->r_thread)) ts.tv_sec = 1; msleep(req, &req->r_mtx, slpflag | (PZERO - 1), "nfswaitreply", &ts); - slpflag = 0; + first = slpflag = 0; } lck_mtx_unlock(&req->r_mtx); @@ -2340,6 +3451,8 @@ nfs_request_create( req->r_nmp = nmp; req->r_np = np; req->r_thread = thd; + if (!thd) + req->r_flags |= R_NOINTR; if (IS_VALID_CRED(cred)) { kauth_cred_ref(cred); req->r_cred = cred; @@ -2353,6 +3466,14 @@ nfs_request_create( req->r_rchain.tqe_next = NFSREQNOLIST; req->r_cchain.tqe_next = NFSREQNOLIST; + /* set auth flavor to use for request */ + if (!req->r_cred) + req->r_auth = RPCAUTH_NONE; + else if (req->r_np && (req->r_np->n_auth != RPCAUTH_INVALID)) + req->r_auth = req->r_np->n_auth; + else + req->r_auth = nmp->nm_auth; + lck_mtx_unlock(&nmp->nm_lock); /* move the request mbuf chain to the nfsreq */ @@ -2394,6 +3515,18 @@ nfs_request_destroy(struct nfsreq *req) lck_mtx_lock(&req->r_mtx); if (nmp) { lck_mtx_lock(&nmp->nm_lock); + if (req->r_flags & R_CWND) { + /* Decrement the outstanding request count. 
*/ + req->r_flags &= ~R_CWND; + nmp->nm_sent -= NFS_CWNDSCALE; + if ((nmp->nm_sent < nmp->nm_cwnd) && !TAILQ_EMPTY(&nmp->nm_cwndq)) { + /* congestion window is open, poke the cwnd queue */ + struct nfsreq *req2 = TAILQ_FIRST(&nmp->nm_cwndq); + TAILQ_REMOVE(&nmp->nm_cwndq, req2, r_cchain); + req2->r_cchain.tqe_next = NFSREQNOLIST; + wakeup(req2); + } + } if (req->r_rchain.tqe_next != NFSREQNOLIST) { TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain); req->r_rchain.tqe_next = NFSREQNOLIST; @@ -2424,12 +3557,14 @@ nfs_request_destroy(struct nfsreq *req) mbuf_freem(req->r_nmrep.nmc_mhead); if (IS_VALID_CRED(req->r_cred)) kauth_cred_unref(&req->r_cred); - if (req->r_gss_ctx) + if (nfs_request_using_gss(req)) nfs_gss_clnt_rpcdone(req); SLIST_FOREACH_SAFE(gsp, &req->r_gss_seqlist, gss_seqnext, ngsp) FREE(gsp, M_TEMP); if (req->r_gss_ctx) nfs_gss_clnt_ctx_unref(req); + if (req->r_wrongsec) + FREE(req->r_wrongsec, M_TEMP); lck_mtx_destroy(&req->r_mtx, nfs_request_grp); if (req->r_flags & R_ALLOCATED) @@ -2471,7 +3606,7 @@ int nfs_request_add_header(struct nfsreq *req) { struct nfsmount *nmp; - int error = 0, auth_len = 0; + int error = 0; mbuf_t m; /* free up any previous header */ @@ -2485,24 +3620,7 @@ nfs_request_add_header(struct nfsreq *req) if (!nmp) return (ENXIO); - if (!req->r_cred) /* RPCAUTH_NULL */ - auth_len = 0; - else switch (nmp->nm_auth) { - case RPCAUTH_UNIX: - if (req->r_cred->cr_ngroups < 1) - return (EINVAL); - auth_len = ((((req->r_cred->cr_ngroups - 1) > nmp->nm_numgrps) ? - nmp->nm_numgrps : (req->r_cred->cr_ngroups - 1)) << 2) + - 5 * NFSX_UNSIGNED; - break; - case RPCAUTH_KRB5: - case RPCAUTH_KRB5I: - case RPCAUTH_KRB5P: - auth_len = 5 * NFSX_UNSIGNED + 0; // zero context handle for now - break; - } - - error = nfsm_rpchead(req, auth_len, req->r_mrest, &req->r_xid, &req->r_mhead); + error = nfsm_rpchead(req, req->r_mrest, &req->r_xid, &req->r_mhead); if (error) return (error); @@ -2511,7 +3629,7 @@ nfs_request_add_header(struct nfsreq *req) if (!nmp) return (ENXIO); lck_mtx_lock(&nmp->nm_lock); - if (nmp->nm_flag & NFSMNT_SOFT) + if (NMFLAG(nmp, SOFT)) req->r_retry = nmp->nm_retry; else req->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */ @@ -2598,7 +3716,7 @@ nfs_request_finish( uint32_t auth_status = 0; uint32_t accepted_status = 0; struct nfsm_chain nmrep; - int error, auth, clearjbtimeo; + int error, clearjbtimeo; error = req->r_error; @@ -2612,10 +3730,10 @@ nfs_request_finish( nmp = req->r_np ? NFSTONMP(req->r_np) : req->r_nmp; - /* - * Decrement the outstanding request count. - */ if ((req->r_flags & R_CWND) && nmp) { + /* + * Decrement the outstanding request count. + */ req->r_flags &= ~R_CWND; lck_mtx_lock(&nmp->nm_lock); FSDBG(273, R_XID32(req->r_xid), req, nmp->nm_sent, nmp->nm_cwnd); @@ -2630,9 +3748,9 @@ nfs_request_finish( lck_mtx_unlock(&nmp->nm_lock); } - if (req->r_gss_ctx) { // Using gss cred ? + if (nfs_request_using_gss(req)) { /* - * If the request had an RPCSEC_GSS credential + * If the request used an RPCSEC_GSS credential * then reset its sequence number bit in the * request window. */ @@ -2665,7 +3783,7 @@ nfs_request_finish( */ if (!error) { if ((req->r_flags & R_TPRINTFMSG) || - (nmp && (nmp->nm_flag & NFSMNT_SOFT) && + (nmp && NMFLAG(nmp, SOFT) && ((nmp->nm_state & (NFSSTA_TIMEO|NFSSTA_FORCE)) == NFSSTA_TIMEO))) nfs_up(nmp, req->r_thread, NFSSTA_TIMEO, "is alive again"); else @@ -2725,11 +3843,10 @@ nfs_request_finish( nfsm_chain_get_32(error, &nmrep, verf_len); // verifier length nfsmout_if(error); - auth = !req->r_cred ? 
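The R_CWND handling added to nfs_request_destroy() above keeps the congestion-window accounting honest when a request is torn down: the in-flight count is decremented and, if the window has reopened, the first waiter on the cwnd queue is woken. A minimal model of that accounting, assuming the fixed-point scale the BSD lineage traditionally uses (NFS_CWNDSCALE of 256; the value here is an assumption, not quoted from this patch):

#include <stdint.h>

#define CWNDSCALE 256	/* fixed-point: one in-flight request */

struct cwnd {
	int sent;	/* scaled count of requests on the wire */
	int cwnd;	/* scaled congestion window */
};

/* Account for one finished request; returns nonzero when a queued
 * waiter may now be woken (the analog of the wakeup(req2) above). */
static int
cwnd_request_done(struct cwnd *cw, int queue_nonempty)
{
	cw->sent -= CWNDSCALE;			/* one request left the wire */
	return ((cw->sent < cw->cwnd) && queue_nonempty);
}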
RPCAUTH_NULL : nmp->nm_auth; - switch (auth) { - case RPCAUTH_NULL: - case RPCAUTH_UNIX: - /* Any AUTH_UNIX verifier is ignored */ + switch (req->r_auth) { + case RPCAUTH_NONE: + case RPCAUTH_SYS: + /* Any AUTH_SYS verifier is ignored */ if (verf_len > 0) nfsm_chain_adv(error, &nmrep, nfsm_rndup(verf_len)); nfsm_chain_get_32(error, &nmrep, accepted_status); @@ -2760,7 +3877,7 @@ nfs_request_finish( /* * It's a JUKEBOX error - delay and try again */ - int delay, slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0; + int delay, slpflag = (NMFLAG(nmp, INTR) && !(req->r_flags & R_NOINTR)) ? PCATCH : 0; mbuf_freem(mrep); req->r_nmrep.nmc_mhead = NULL; @@ -2785,7 +3902,7 @@ nfs_request_finish( nfs_down(req->r_nmp, req->r_thread, 0, NFSSTA_JUKEBOXTIMEO, "resource temporarily unavailable (jukebox)"); } - if ((nmp->nm_flag & NFSMNT_SOFT) && (req->r_delay == 30)) { + if (NMFLAG(nmp, SOFT) && (req->r_delay == 30) && !(req->r_flags & R_NOINTR)) { /* for soft mounts, just give up after a short while */ OSAddAtomic(1, &nfsstats.rpctimeouts); nfs_softterm(req); @@ -2802,6 +3919,7 @@ nfs_request_finish( if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0))) goto nfsmout; tsleep(&lbolt, PSOCK|slpflag, "nfs_jukebox_trylater", 0); + slpflag = 0; } while (--delay > 0); } req->r_xid = 0; // get a new XID @@ -2820,6 +3938,96 @@ nfs_request_finish( nfs_up(nmp, req->r_thread, clearjbtimeo, "resource available again"); } + if ((nmp->nm_vers >= NFS_VER4) && (*status == NFSERR_WRONGSEC)) { + /* + * Hmmm... we need to try a different security flavor. + * The first time a request hits this, we will allocate an array + * to track flavors to try. We fill the array with the mount's + * preferred flavors or the server's preferred flavors or just the + * flavors we support. + */ + uint32_t srvflavors[NX_MAX_SEC_FLAVORS]; + int srvcount, i, j; + + /* Call SECINFO to try to get list of flavors from server. */ + srvcount = NX_MAX_SEC_FLAVORS; + nfs4_secinfo_rpc(nmp, &req->r_secinfo, req->r_cred, srvflavors, &srvcount); + + if (!req->r_wrongsec) { + /* first time... set up flavor array */ + MALLOC(req->r_wrongsec, uint32_t*, NX_MAX_SEC_FLAVORS*sizeof(uint32_t), M_TEMP, M_WAITOK); + if (!req->r_wrongsec) { + error = EACCES; + goto nfsmout; + } + i=0; + if (nmp->nm_sec.count) { /* use the mount's preferred list of flavors */ + for(; i < nmp->nm_sec.count; i++) + req->r_wrongsec[i] = nmp->nm_sec.flavors[i]; + } else if (srvcount) { /* otherwise use the server's list of flavors */ + for(; i < srvcount; i++) + req->r_wrongsec[i] = srvflavors[i]; + } else { /* otherwise, just try the flavors we support. 
*/ + req->r_wrongsec[i++] = RPCAUTH_KRB5P; + req->r_wrongsec[i++] = RPCAUTH_KRB5I; + req->r_wrongsec[i++] = RPCAUTH_KRB5; + req->r_wrongsec[i++] = RPCAUTH_SYS; + req->r_wrongsec[i++] = RPCAUTH_NONE; + } + for(; i < NX_MAX_SEC_FLAVORS; i++) /* invalidate any remaining slots */ + req->r_wrongsec[i] = RPCAUTH_INVALID; + } + + /* clear the current flavor from the list */ + for(i=0; i < NX_MAX_SEC_FLAVORS; i++) + if (req->r_wrongsec[i] == req->r_auth) + req->r_wrongsec[i] = RPCAUTH_INVALID; + + /* find the next flavor to try */ + for(i=0; i < NX_MAX_SEC_FLAVORS; i++) + if (req->r_wrongsec[i] != RPCAUTH_INVALID) { + if (((req->r_wrongsec[i] == RPCAUTH_KRB5P) || + (req->r_wrongsec[i] == RPCAUTH_KRB5I) || + (req->r_wrongsec[i] == RPCAUTH_KRB5)) && (req->r_gss_ctx && + (req->r_gss_ctx->gss_clnt_service == RPCSEC_GSS_SVC_SYS))) { + /* don't bother trying Kerberos if we've already got a fallback context */ + req->r_wrongsec[i] = RPCAUTH_INVALID; + continue; + } + if (!srvcount) /* no server list, just try it */ + break; + /* check that it's in the server's list */ + for(j=0; j < srvcount; j++) + if (req->r_wrongsec[i] == srvflavors[j]) + break; + if (j < srvcount) /* found */ + break; + /* not found in server list */ + req->r_wrongsec[i] = RPCAUTH_INVALID; + } + if (i == NX_MAX_SEC_FLAVORS) { + /* nothing left to try! */ + error = EACCES; + goto nfsmout; + } + + /* retry with the next auth flavor */ + req->r_auth = req->r_wrongsec[i]; + req->r_xid = 0; // get a new XID + req->r_flags |= R_RESTART; + req->r_start = 0; + FSDBG(273, R_XID32(req->r_xid), nmp, req, NFSERR_WRONGSEC); + return (0); + } + if ((nmp->nm_vers >= NFS_VER4) && req->r_wrongsec) { + /* + * We renegotiated security for this request; so update the + * default security flavor for the associated node. + */ + if (req->r_np) + req->r_np->n_auth = req->r_auth; + } + if (*status == NFS_OK) { /* * Successful NFS request @@ -2834,8 +4042,12 @@ nfs_request_finish( * If the File Handle was stale, invalidate the * lookup cache, just in case. */ - if ((*status == ESTALE) && req->r_np) + if ((*status == ESTALE) && req->r_np) { cache_purge(NFSTOV(req->r_np)); + /* if monitored, also send delete event */ + if (vnode_ismonitored(NFSTOV(req->r_np))) + nfs_vnode_notify(req->r_np, (VNODE_EVENT_ATTRIB|VNODE_EVENT_DELETE)); + } if (nmp->nm_vers == NFS_VER2) mbuf_freem(mrep); else @@ -2875,6 +4087,22 @@ nfsmout: return (error); } +/* + * NFS request using a GSS/Kerberos security flavor? + */ +int +nfs_request_using_gss(struct nfsreq *req) +{ + if (!req->r_gss_ctx) + return (0); + switch (req->r_auth) { + case RPCAUTH_KRB5: + case RPCAUTH_KRB5I: + case RPCAUTH_KRB5P: + return (1); + } + return (0); +} /* * Perform an NFS request synchronously. 
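The NFSERR_WRONGSEC path above keeps a fixed-size array of candidate flavors, marks entries RPCAUTH_INVALID as they are ruled out, and intersects the remainder with the server's SECINFO list. The selection step reduces to "next untried flavor the server also offers"; a compact sketch with hypothetical names:

#include <stdint.h>
#include <stddef.h>

#define FLAVOR_INVALID 0xffffffffu	/* stands in for RPCAUTH_INVALID */

/* Pick the next still-valid candidate flavor; when a server list is
 * present it must also appear there. Returns FLAVOR_INVALID when the
 * list is exhausted (the caller then fails with EACCES). */
static uint32_t
next_sec_flavor(uint32_t *cand, size_t ncand,
    const uint32_t *srv, size_t nsrv)
{
	for (size_t i = 0; i < ncand; i++) {
		if (cand[i] == FLAVOR_INVALID)
			continue;
		if (nsrv == 0)
			return (cand[i]);	/* no server list: just try it */
		for (size_t j = 0; j < nsrv; j++)
			if (cand[i] == srv[j])
				return (cand[i]);
		cand[i] = FLAVOR_INVALID;	/* server doesn't offer it */
	}
	return (FLAVOR_INVALID);
}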
@@ -2887,13 +4115,14 @@ nfs_request( struct nfsm_chain *nmrest, int procnum, vfs_context_t ctx, + struct nfsreq_secinfo_args *si, struct nfsm_chain *nmrepp, u_int64_t *xidp, int *status) { return nfs_request2(np, mp, nmrest, procnum, vfs_context_thread(ctx), vfs_context_ucred(ctx), - 0, nmrepp, xidp, status); + si, 0, nmrepp, xidp, status); } int @@ -2904,6 +4133,7 @@ nfs_request2( int procnum, thread_t thd, kauth_cred_t cred, + struct nfsreq_secinfo_args *si, int flags, struct nfsm_chain *nmrepp, u_int64_t *xidp, @@ -2915,6 +4145,8 @@ nfs_request2( if ((error = nfs_request_create(np, mp, nmrest, procnum, thd, cred, &req))) return (error); req->r_flags |= (flags & R_OPTMASK); + if (si) + req->r_secinfo = *si; FSDBG_TOP(273, R_XID32(req->r_xid), np, procnum, 0); do { @@ -2998,10 +4230,13 @@ nfs_request_async( int procnum, thread_t thd, kauth_cred_t cred, + struct nfsreq_secinfo_args *si, + int flags, struct nfsreq_cbinfo *cb, struct nfsreq **reqp) { struct nfsreq *req; + struct nfsmount *nmp; int error, sent; error = nfs_request_create(np, mp, nmrest, procnum, thd, cred, reqp); @@ -3009,7 +4244,10 @@ nfs_request_async( FSDBG(274, (req ? R_XID32(req->r_xid) : 0), np, procnum, error); if (error) return (error); + req->r_flags |= (flags & R_OPTMASK); req->r_flags |= R_ASYNC; + if (si) + req->r_secinfo = *si; if (cb) req->r_callback = *cb; error = nfs_request_add_header(req); @@ -3021,9 +4259,32 @@ nfs_request_async( lck_mtx_lock(&req->r_mtx); if (!error && !(req->r_flags & R_SENT) && req->r_callback.rcb_func) { /* make sure to wait until this async I/O request gets sent */ - int slpflag = (req->r_nmp && (req->r_nmp->nm_flag & NFSMNT_INT) && req->r_thread) ? PCATCH : 0; + int slpflag = (req->r_nmp && NMFLAG(req->r_nmp, INTR) && req->r_thread && !(req->r_flags & R_NOINTR)) ? PCATCH : 0; struct timespec ts = { 2, 0 }; while (!(req->r_flags & R_SENT)) { + if ((req->r_flags & R_RESENDQ) && ((nmp = req->r_nmp))) { + lck_mtx_lock(&nmp->nm_lock); + if ((nmp->nm_state & NFSSTA_RECOVER) && (req->r_rchain.tqe_next != NFSREQNOLIST)) { + /* + * It's not going to get off the resend queue if we're in recovery. + * So, just take it off ourselves. We could be holding mount state + * busy and thus holding up the start of recovery. + */ + TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain); + req->r_rchain.tqe_next = NFSREQNOLIST; + if (req->r_flags & R_RESENDQ) + req->r_flags &= ~R_RESENDQ; + lck_mtx_unlock(&nmp->nm_lock); + req->r_flags |= R_SENDING; + lck_mtx_unlock(&req->r_mtx); + error = nfs_send(req, 1); + lck_mtx_lock(&req->r_mtx); + if (error) + break; + continue; + } + lck_mtx_unlock(&nmp->nm_lock); + } if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0))) break; msleep(req, &req->r_mtx, slpflag | (PZERO - 1), "nfswaitsent", &ts); @@ -3052,12 +4313,30 @@ nfs_request_async_finish( int *status) { int error = 0, asyncio = req->r_callback.rcb_func ? 1 : 0; + struct nfsmount *nmp; lck_mtx_lock(&req->r_mtx); if (!asyncio) req->r_flags |= R_ASYNCWAIT; while (req->r_flags & R_RESENDQ) { /* wait until the request is off the resend queue */ struct timespec ts = { 2, 0 }; + if ((nmp = req->r_nmp)) { + lck_mtx_lock(&nmp->nm_lock); + if ((nmp->nm_state & NFSSTA_RECOVER) && (req->r_rchain.tqe_next != NFSREQNOLIST)) { + /* + * It's not going to get off the resend queue if we're in recovery. + * So, just take it off ourselves. We could be holding mount state + * busy and thus holding up the start of recovery. 
+ */ + TAILQ_REMOVE(&nmp->nm_resendq, req, r_rchain); + req->r_rchain.tqe_next = NFSREQNOLIST; + if (req->r_flags & R_RESENDQ) + req->r_flags &= ~R_RESENDQ; + lck_mtx_unlock(&nmp->nm_lock); + break; + } + lck_mtx_unlock(&nmp->nm_lock); + } if ((error = nfs_sigintr(req->r_nmp, req, req->r_thread, 0))) break; msleep(req, &req->r_mtx, PZERO-1, "nfsresendqwait", &ts); @@ -3270,7 +4549,7 @@ nfs_request_timer(__unused void *param0, __unused void *param1) * Put a reasonable limit on the maximum timeout, * and reduce that limit when soft mounts get timeouts or are in reconnect. */ - if (!(nmp->nm_flag & NFSMNT_SOFT)) + if (!NMFLAG(nmp, SOFT)) maxtime = NFS_MAXTIMEO; else if ((req->r_flags & (R_SETUP|R_RECOVER)) || ((nmp->nm_reconnect_start <= 0) || ((now.tv_sec - nmp->nm_reconnect_start) < 8))) @@ -3290,7 +4569,7 @@ nfs_request_timer(__unused void *param0, __unused void *param1) } else { if (req->r_procnum == NFSPROC_NULL && req->r_gss_ctx != NULL) timeo = NFS_MINIDEMTIMEO; // gss context setup - else if (nmp->nm_flag & NFSMNT_DUMBTIMR) + else if (NMFLAG(nmp, DUMBTIMER)) timeo = nmp->nm_timeo; else timeo = NFS_RTO(nmp, proct[req->r_procnum]); @@ -3320,7 +4599,8 @@ nfs_request_timer(__unused void *param0, __unused void *param1) /* if it's been a few seconds, try poking the socket */ if ((nmp->nm_sotype == SOCK_STREAM) && ((now.tv_sec - req->r_start) >= 3) && - !(nmp->nm_sockflags & NMSOCK_POKE)) { + !(nmp->nm_sockflags & (NMSOCK_POKE|NMSOCK_UNMOUNT)) && + (nmp->nm_sockflags & NMSOCK_READY)) { nmp->nm_sockflags |= NMSOCK_POKE; TAILQ_INSERT_TAIL(&nfs_mount_poke_queue, nmp, nm_pokeq); } @@ -3328,7 +4608,7 @@ nfs_request_timer(__unused void *param0, __unused void *param1) } /* For soft mounts (& SETUPs/RECOVERs), check for too many retransmits/timeout. */ - if (((nmp->nm_flag & NFSMNT_SOFT) || (req->r_flags & (R_SETUP|R_RECOVER))) && + if ((NMFLAG(nmp, SOFT) || (req->r_flags & (R_SETUP|R_RECOVER))) && ((req->r_rexmit >= req->r_retry) || /* too many */ ((now.tv_sec - req->r_start)*NFS_HZ > maxtime))) { /* too long */ OSAddAtomic(1, &nfsstats.rpctimeouts); @@ -3344,6 +4624,11 @@ nfs_request_timer(__unused void *param0, __unused void *param1) } else { lck_mtx_unlock(&nmp->nm_lock); } + if (req->r_flags & R_NOINTR) { + /* don't terminate nointr requests on timeout */ + lck_mtx_unlock(&req->r_mtx); + continue; + } NFS_SOCK_DBG(("nfs timer TERMINATE: p %d x 0x%llx f 0x%x rtt %d t %ld\n", req->r_procnum, req->r_xid, req->r_flags, req->r_rtt, now.tv_sec - req->r_start)); @@ -3391,8 +4676,7 @@ nfs_request_timer(__unused void *param0, __unused void *param1) nfs_sock_poke(nmp); lck_mtx_lock(&nmp->nm_lock); nmp->nm_sockflags &= ~NMSOCK_POKE; - if (!(nmp->nm_state & NFSSTA_MOUNTED)) - wakeup(&nmp->nm_sockflags); + wakeup(&nmp->nm_sockflags); lck_mtx_unlock(&nmp->nm_lock); } @@ -3417,6 +4701,7 @@ nfs_noremotehang(thread_t thd) * and the mount is interruptable, or if we are a thread that is in the process * of cancellation (also SIGKILL posted). */ +extern int sigprop[NSIG+1]; int nfs_sigintr(struct nfsmount *nmp, struct nfsreq *req, thread_t thd, int nmplocked) { @@ -3428,19 +4713,17 @@ nfs_sigintr(struct nfsmount *nmp, struct nfsreq *req, thread_t thd, int nmplocke if (req && (req->r_flags & R_SOFTTERM)) return (ETIMEDOUT); /* request has been terminated. */ + if (req && (req->r_flags & R_NOINTR)) + thd = NULL; /* don't check for signal on R_NOINTR */ - /* - * If we're in the progress of a force unmount and there's - * been a timeout, we're dead and fail IO. 
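The nfs_request_timer() changes above tighten the retransmit ceiling for soft mounts and skip termination for R_NOINTR requests, but the core policy is the classic one: start from an RTT-derived timeout, double per retransmit, clamp to a ceiling. A sketch of that shape, with illustrative constants (the actual NFS_MAXTIMEO-based ceilings in the patch are not reproduced here):

/* Exponential retransmit backoff with a soft-mount-sensitive clamp.
 * rto_ticks stands in for NFS_RTO()'s smoothed estimate. */
static int
retransmit_timeout(int rto_ticks, int rexmit, int soft, int maxtimeo)
{
	int timeo = rto_ticks;
	int max = soft ? maxtimeo / 2 : maxtimeo;	/* tighter when soft */

	while (rexmit-- > 0 && timeo < max)
		timeo *= 2;				/* double per retry */
	return (timeo > max ? max : timeo);
}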
- */ if (!nmplocked) lck_mtx_lock(&nmp->nm_lock); - if ((nmp->nm_state & NFSSTA_FORCE) && - (nmp->nm_state & (NFSSTA_TIMEO|NFSSTA_JUKEBOXTIMEO|NFSSTA_LOCKTIMEO))) { + if (nmp->nm_state & NFSSTA_FORCE) { + /* If a force unmount is in progress then fail. */ error = EIO; } else if (nmp->nm_mountp->mnt_kern_flag & MNTK_FRCUNMOUNT) { /* Someone is unmounting us, go soft and mark it. */ - nmp->nm_flag |= NFSMNT_SOFT; + NFS_BITMAP_SET(nmp->nm_flags, NFS_MFLAG_SOFT); nmp->nm_state |= NFSSTA_FORCE; } @@ -3464,12 +4747,20 @@ nfs_sigintr(struct nfsmount *nmp, struct nfsreq *req, thread_t thd, int nmplocke if (thd == NULL) return (0); - /* If this thread belongs to kernel task; then abort check is not needed */ - if ((current_proc() != kernproc) && current_thread_aborted()) + /* + * Check if the process is aborted, but don't interrupt if we + * were killed by a signal and this is the exiting thread which + * is attempting to dump core. + */ + if (((p = current_proc()) != kernproc) && current_thread_aborted() && + (!(p->p_acflag & AXSIG) || (p->exit_thread != current_thread()) || + (p->p_sigacts == NULL) || + (p->p_sigacts->ps_sig < 1) || (p->p_sigacts->ps_sig > NSIG) || + !(sigprop[p->p_sigacts->ps_sig] & SA_CORE))) return (EINTR); /* mask off thread and process blocked signals. */ - if ((nmp->nm_flag & NFSMNT_INT) && ((p = get_bsdthreadtask_info(thd))) && + if (NMFLAG(nmp, INTR) && ((p = get_bsdthreadtask_info(thd))) && proc_pendingsignals(p, NFSINT_SIGMASK)) return (EINTR); return (0); @@ -3495,7 +4786,7 @@ nfs_sndlock(struct nfsreq *req) lck_mtx_lock(&nmp->nm_lock); statep = &nmp->nm_state; - if ((nmp->nm_flag & NFSMNT_INT) && req->r_thread) + if (NMFLAG(nmp, INTR) && req->r_thread && !(req->r_flags & R_NOINTR)) slpflag = PCATCH; while (*statep & NFSSTA_SNDLOCK) { if ((error = nfs_sigintr(nmp, req, req->r_thread, 1))) @@ -3530,7 +4821,7 @@ nfs_sndunlock(struct nfsreq *req) statep = &nmp->nm_state; if ((*statep & NFSSTA_SNDLOCK) == 0) panic("nfs sndunlock"); - *statep &= ~NFSSTA_SNDLOCK; + *statep &= ~(NFSSTA_SNDLOCK|NFSSTA_SENDING); if (*statep & NFSSTA_WANTSND) { *statep &= ~NFSSTA_WANTSND; wake = 1; @@ -3544,62 +4835,113 @@ int nfs_aux_request( struct nfsmount *nmp, thread_t thd, - struct sockaddr_in *saddr, + struct sockaddr *saddr, + socket_t so, + int sotype, mbuf_t mreq, uint32_t xid, int bindresv, int timeo, struct nfsm_chain *nmrep) { - int error = 0, on = 1, try, sendat = 2; - socket_t so = NULL; - struct sockaddr_in sin; - struct timeval tv = { 1, 0 }; + int error = 0, on = 1, try, sendat = 2, soproto, recv, optlen, restoreto = 0; + socket_t newso = NULL; + struct sockaddr_storage ss; + struct timeval orig_rcvto, orig_sndto, tv = { 1, 0 }; mbuf_t m, mrep = NULL; struct msghdr msg; uint32_t rxid = 0, reply = 0, reply_status, rejected_status; uint32_t verf_type, verf_len, accepted_status; - size_t readlen; + size_t readlen, sentlen; + struct nfs_rpc_record_state nrrs; - /* create socket and set options */ - if (((error = sock_socket(saddr->sin_family, SOCK_DGRAM, IPPROTO_UDP, NULL, NULL, &so))) || - ((error = sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)))) || - ((error = sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)))) || - ((error = sock_setsockopt(so, SOL_SOCKET, SO_NOADDRERR, &on, sizeof(on))))) - goto nfsmout; - if (bindresv) { - int portrange = IP_PORTRANGE_LOW; - error = sock_setsockopt(so, IPPROTO_IP, IP_PORTRANGE, &portrange, sizeof(portrange)); - nfsmout_if(error); - /* bind now to check for failure */ - sin.sin_len = sizeof (struct sockaddr_in); - 
sin.sin_family = AF_INET; - sin.sin_addr.s_addr = INADDR_ANY; - sin.sin_port = 0; - error = sock_bind(so, (struct sockaddr *) &sin); - nfsmout_if(error); + if (!so) { + /* create socket and set options */ + soproto = (sotype == SOCK_DGRAM) ? IPPROTO_UDP : IPPROTO_TCP; + if ((error = sock_socket(saddr->sa_family, sotype, soproto, NULL, NULL, &newso))) + goto nfsmout; + + if (bindresv) { + int level = (saddr->sa_family == AF_INET) ? IPPROTO_IP : IPPROTO_IPV6; + int optname = (saddr->sa_family == AF_INET) ? IP_PORTRANGE : IPV6_PORTRANGE; + int portrange = IP_PORTRANGE_LOW; + error = sock_setsockopt(newso, level, optname, &portrange, sizeof(portrange)); + nfsmout_if(error); + ss.ss_len = saddr->sa_len; + ss.ss_family = saddr->sa_family; + if (ss.ss_family == AF_INET) { + ((struct sockaddr_in*)&ss)->sin_addr.s_addr = INADDR_ANY; + ((struct sockaddr_in*)&ss)->sin_port = htons(0); + } else if (ss.ss_family == AF_INET6) { + ((struct sockaddr_in6*)&ss)->sin6_addr = in6addr_any; + ((struct sockaddr_in6*)&ss)->sin6_port = htons(0); + } else { + error = EINVAL; + } + if (!error) + error = sock_bind(newso, (struct sockaddr *)&ss); + nfsmout_if(error); + } + + if (sotype == SOCK_STREAM) { + on = 4; /* don't wait too long for the socket to connect */ + sock_setsockopt(newso, IPPROTO_TCP, TCP_CONNECTIONTIMEOUT, &on, sizeof(on)); + error = sock_connect(newso, saddr, 0); + nfsmout_if(error); + } + if (((error = sock_setsockopt(newso, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)))) || + ((error = sock_setsockopt(newso, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)))) || + ((error = sock_setsockopt(newso, SOL_SOCKET, SO_NOADDRERR, &on, sizeof(on))))) + goto nfsmout; + so = newso; + } else { + /* make sure socket is using a one second timeout in this function */ + optlen = sizeof(orig_rcvto); + error = sock_getsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &orig_rcvto, &optlen); + if (!error) { + optlen = sizeof(orig_sndto); + error = sock_getsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &orig_sndto, &optlen); + } + if (!error) { + sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); + sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)); + restoreto = 1; + } + } + + if (sotype == SOCK_STREAM) { + sendat = 0; /* we only resend the request for UDP */ + nfs_rpc_record_state_init(&nrrs); } for (try=0; try < timeo; try++) { - if ((error = nfs_sigintr(nmp, NULL, thd, 0))) + if ((error = nfs_sigintr(nmp, NULL, !try ? 
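The IP_PORTRANGE_LOW setup above asks the stack to allocate the auxiliary socket's ephemeral port from the low ("privileged") range, then binds to port 0 so the kernel picks the port. A user-space analog using the same BSD socket options (error handling elided; binding in the low range requires privilege):

#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>

static int
udp_socket_low_port(void)
{
	int s = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
	int range = IP_PORTRANGE_LOW;		/* allocate from the low range */
	struct sockaddr_in sin;

	setsockopt(s, IPPROTO_IP, IP_PORTRANGE, &range, sizeof(range));
	memset(&sin, 0, sizeof(sin));
	sin.sin_len = sizeof(sin);		/* BSD-style sockaddr */
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = INADDR_ANY;
	sin.sin_port = htons(0);		/* port 0: kernel chooses */
	bind(s, (struct sockaddr *)&sin, sizeof(sin));
	return (s);
}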
NULL : thd, 0))) break; if (!try || (try == sendat)) { - /* send the request (resending periodically) */ + /* send the request (resending periodically for UDP) */ if ((error = mbuf_copym(mreq, 0, MBUF_COPYALL, MBUF_WAITOK, &m))) goto nfsmout; bzero(&msg, sizeof(msg)); - msg.msg_name = saddr; - msg.msg_namelen = saddr->sin_len; - if ((error = sock_sendmbuf(so, &msg, m, 0, NULL))) + if ((sotype == SOCK_DGRAM) && !sock_isconnected(so)) { + msg.msg_name = saddr; + msg.msg_namelen = saddr->sa_len; + } + if ((error = sock_sendmbuf(so, &msg, m, 0, &sentlen))) goto nfsmout; sendat *= 2; if (sendat > 30) sendat = 30; } /* wait for the response */ - readlen = 1<<18; - bzero(&msg, sizeof(msg)); - error = sock_receivembuf(so, &msg, &mrep, 0, &readlen); + if (sotype == SOCK_STREAM) { + /* try to read (more of) record */ + error = nfs_rpc_record_read(so, &nrrs, 0, &recv, &mrep); + /* if we don't have the whole record yet, we'll keep trying */ + } else { + readlen = 1<<18; + bzero(&msg, sizeof(msg)); + error = sock_receivembuf(so, &msg, &mrep, 0, &readlen); + } if (error == EWOULDBLOCK) continue; nfsmout_if(error); @@ -3615,7 +4957,7 @@ nfs_aux_request( if (reply_status == RPC_MSGDENIED) { nfsm_chain_get_32(error, nmrep, rejected_status); nfsmout_if(error); - error = (rejected_status == RPC_MISMATCH) ? ENOTSUP : EACCES; + error = (rejected_status == RPC_MISMATCH) ? ERPCMISMATCH : EACCES; goto nfsmout; } nfsm_chain_get_32(error, nmrep, verf_type); /* verifier flavor */ @@ -3624,18 +4966,159 @@ nfs_aux_request( if (verf_len) nfsm_chain_adv(error, nmrep, nfsm_rndup(verf_len)); nfsm_chain_get_32(error, nmrep, accepted_status); - nfsm_assert(error, (accepted_status == RPC_SUCCESS), EIO); + nfsmout_if(error); + switch (accepted_status) { + case RPC_SUCCESS: + error = 0; + break; + case RPC_PROGUNAVAIL: + error = EPROGUNAVAIL; + break; + case RPC_PROGMISMATCH: + error = EPROGMISMATCH; + break; + case RPC_PROCUNAVAIL: + error = EPROCUNAVAIL; + break; + case RPC_GARBAGE: + error = EBADRPC; + break; + case RPC_SYSTEM_ERR: + default: + error = EIO; + break; + } break; } nfsmout: - if (so) { - sock_shutdown(so, SHUT_RDWR); - sock_close(so); + if (restoreto) { + sock_setsockopt(so, SOL_SOCKET, SO_RCVTIMEO, &orig_rcvto, sizeof(tv)); + sock_setsockopt(so, SOL_SOCKET, SO_SNDTIMEO, &orig_sndto, sizeof(tv)); + } + if (newso) { + sock_shutdown(newso, SHUT_RDWR); + sock_close(newso); } mbuf_freem(mreq); return (error); } +int +nfs_portmap_lookup( + struct nfsmount *nmp, + vfs_context_t ctx, + struct sockaddr *sa, + socket_t so, + uint32_t protocol, + uint32_t vers, + uint32_t ipproto, + int timeo) +{ + thread_t thd = vfs_context_thread(ctx); + kauth_cred_t cred = vfs_context_ucred(ctx); + struct sockaddr_storage ss; + struct sockaddr *saddr = (struct sockaddr*)&ss; + struct nfsm_chain nmreq, nmrep; + mbuf_t mreq; + int error = 0, ip, pmprog, pmvers, pmproc, ualen = 0; + uint32_t port; + uint64_t xid = 0; + char uaddr[MAX_IPv6_STR_LEN+16]; + + bcopy(sa, saddr, min(sizeof(ss), sa->sa_len)); + if (saddr->sa_family == AF_INET) { + ip = 4; + pmprog = PMAPPROG; + pmvers = PMAPVERS; + pmproc = PMAPPROC_GETPORT; + } else if (saddr->sa_family == AF_INET6) { + ip = 6; + pmprog = RPCBPROG; + pmvers = RPCBVERS4; + pmproc = RPCBPROC_GETVERSADDR; + } else { + return (EINVAL); + } + nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); + +tryagain: + /* send portmapper request to get port/uaddr */ + if (ip == 4) + ((struct sockaddr_in*)saddr)->sin_port = htons(PMAPPORT); + else + ((struct sockaddr_in6*)saddr)->sin6_port = htons(PMAPPORT); + 
nfsm_chain_build_alloc_init(error, &nmreq, 8*NFSX_UNSIGNED); + nfsm_chain_add_32(error, &nmreq, protocol); + nfsm_chain_add_32(error, &nmreq, vers); + if (ip == 4) { + nfsm_chain_add_32(error, &nmreq, ipproto); + nfsm_chain_add_32(error, &nmreq, 0); + } else { + if (ipproto == IPPROTO_TCP) + nfsm_chain_add_string(error, &nmreq, "tcp6", 4); + else + nfsm_chain_add_string(error, &nmreq, "udp6", 4); + nfsm_chain_add_string(error, &nmreq, "", 0); /* uaddr */ + nfsm_chain_add_string(error, &nmreq, "", 0); /* owner */ + } + nfsm_chain_build_done(error, &nmreq); + nfsmout_if(error); + error = nfsm_rpchead2(nmp, (ipproto == IPPROTO_UDP) ? SOCK_DGRAM : SOCK_STREAM, + pmprog, pmvers, pmproc, RPCAUTH_SYS, cred, NULL, nmreq.nmc_mhead, + &xid, &mreq); + nfsmout_if(error); + nmreq.nmc_mhead = NULL; + error = nfs_aux_request(nmp, thd, saddr, so, (ipproto == IPPROTO_UDP) ? SOCK_DGRAM : SOCK_STREAM, + mreq, R_XID32(xid), 0, timeo, &nmrep); + + /* grab port from portmap response */ + if (ip == 4) { + nfsm_chain_get_32(error, &nmrep, port); + if (!error) + ((struct sockaddr_in*)sa)->sin_port = htons(port); + } else { + /* get uaddr string and convert to sockaddr */ + nfsm_chain_get_32(error, &nmrep, ualen); + if (!error) { + if (ualen > ((int)sizeof(uaddr)-1)) + error = EIO; + if (ualen < 1) { + /* program is not available, just return a zero port */ + bcopy(sa, saddr, min(sizeof(ss), sa->sa_len)); + ((struct sockaddr_in6*)saddr)->sin6_port = htons(0); + } else { + nfsm_chain_get_opaque(error, &nmrep, ualen, uaddr); + if (!error) { + uaddr[ualen] = '\0'; + if (!nfs_uaddr2sockaddr(uaddr, saddr)) + error = EIO; + } + } + } + if ((error == EPROGMISMATCH) || (error == EPROCUNAVAIL) || (error == EIO) || (error == EBADRPC)) { + /* remote doesn't support rpcbind version or proc (or we couldn't parse uaddr) */ + if (pmvers == RPCBVERS4) { + /* fall back to v3 and GETADDR */ + pmvers = RPCBVERS3; + pmproc = RPCBPROC_GETADDR; + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); + bcopy(sa, saddr, min(sizeof(ss), sa->sa_len)); + xid = 0; + error = 0; + goto tryagain; + } + } + if (!error) + bcopy(saddr, sa, min(saddr->sa_len, sa->sa_len)); + } +nfsmout: + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); + return (error); +} + int nfs_msg(thread_t thd, const char *server, @@ -3670,12 +5153,12 @@ nfs_down(struct nfsmount *nmp, thread_t thd, int error, int flags, const char *m lck_mtx_lock(&nmp->nm_lock); timeoutmask = NFSSTA_TIMEO | NFSSTA_LOCKTIMEO | NFSSTA_JUKEBOXTIMEO; - if (nmp->nm_flag & NFSMNT_MUTEJUKEBOX) /* jukebox timeouts don't count as unresponsive if muted */ + if (NMFLAG(nmp, MUTEJUKEBOX)) /* jukebox timeouts don't count as unresponsive if muted */ timeoutmask &= ~NFSSTA_JUKEBOXTIMEO; wasunresponsive = (nmp->nm_state & timeoutmask); /* XXX don't allow users to know about/disconnect unresponsive, soft, nobrowse mounts */ - softnobrowse = ((nmp->nm_flag & NFSMNT_SOFT) && (vfs_flags(nmp->nm_mountp) & MNT_DONTBROWSE)); + softnobrowse = (NMFLAG(nmp, SOFT) && (vfs_flags(nmp->nm_mountp) & MNT_DONTBROWSE)); if ((flags & NFSSTA_TIMEO) && !(nmp->nm_state & NFSSTA_TIMEO)) nmp->nm_state |= NFSSTA_TIMEO; @@ -3686,7 +5169,7 @@ nfs_down(struct nfsmount *nmp, thread_t thd, int error, int flags, const char *m unresponsive = (nmp->nm_state & timeoutmask); - if (unresponsive && (nmp->nm_flag & NFSMNT_DEADTIMEOUT)) { + if (unresponsive && (nmp->nm_deadtimeout > 0)) { microuptime(&now); if (!wasunresponsive) { nmp->nm_deadto_start = now.tv_sec; @@ -3726,12 +5209,12 @@ nfs_up(struct nfsmount *nmp, thread_t thd, 
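The rpcbind path above returns a "universal address" string rather than a port number; the trailing two dot-separated decimal octets encode the port. A sketch of the inverse direction — rendering an IPv4 sockaddr as a uaddr — makes the encoding concrete (names are illustrative):

#include <netinet/in.h>
#include <stdint.h>
#include <stdio.h>

/* Render address/port as "d.d.d.d.p.p", the rpcbind uaddr form. */
static void
sockaddr_in_to_uaddr(const struct sockaddr_in *sin, char *buf, size_t len)
{
	const uint8_t *a = (const uint8_t *)&sin->sin_addr.s_addr;
	uint16_t port = ntohs(sin->sin_port);

	snprintf(buf, len, "%u.%u.%u.%u.%u.%u",
	    a[0], a[1], a[2], a[3], port >> 8, port & 0xff);
}
/* e.g. 10.0.0.2 port 2049 -> "10.0.0.2.8.1" (2049 == 8*256 + 1). */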
int flags, const char *msg) lck_mtx_lock(&nmp->nm_lock); timeoutmask = NFSSTA_TIMEO | NFSSTA_LOCKTIMEO | NFSSTA_JUKEBOXTIMEO; - if (nmp->nm_flag & NFSMNT_MUTEJUKEBOX) /* jukebox timeouts don't count as unresponsive if muted */ + if (NMFLAG(nmp, MUTEJUKEBOX)) /* jukebox timeouts don't count as unresponsive if muted */ timeoutmask &= ~NFSSTA_JUKEBOXTIMEO; wasunresponsive = (nmp->nm_state & timeoutmask); /* XXX don't allow users to know about/disconnect unresponsive, soft, nobrowse mounts */ - softnobrowse = ((nmp->nm_flag & NFSMNT_SOFT) && (vfs_flags(nmp->nm_mountp) & MNT_DONTBROWSE)); + softnobrowse = (NMFLAG(nmp, SOFT) && (vfs_flags(nmp->nm_mountp) & MNT_DONTBROWSE)); if ((flags & NFSSTA_TIMEO) && (nmp->nm_state & NFSSTA_TIMEO)) nmp->nm_state &= ~NFSSTA_TIMEO; @@ -3916,9 +5399,9 @@ nfsrv_send(struct nfsrv_sock *slp, mbuf_t nam, mbuf_t top) * be called with MBUF_WAITOK from an nfsd. */ void -nfsrv_rcv(socket_t so, caddr_t arg, int waitflag) +nfsrv_rcv(socket_t so, void *arg, int waitflag) { - struct nfsrv_sock *slp = (struct nfsrv_sock *)arg; + struct nfsrv_sock *slp = arg; if (!nfsd_thread_count || !(slp->ns_flag & SLP_VALID)) return; @@ -4250,6 +5733,8 @@ nfsrv_dorec( if (error) { if (nam) mbuf_freem(nam); + if (nd->nd_gss_context) + nfs_gss_svc_ctx_deref(nd->nd_gss_context); FREE_ZONE(nd, sizeof(*nd), M_NFSRVDESC); return (error); } @@ -4274,7 +5759,6 @@ nfsrv_getreq(struct nfsrv_descript *nd) uid_t user_id; gid_t group_id; int ngroups; - struct ucred temp_cred; uint32_t val; nd->nd_cr = NULL; @@ -4331,10 +5815,11 @@ nfsrv_getreq(struct nfsrv_descript *nd) nfsmout_if(error); /* Handle authentication */ - if (auth_type == RPCAUTH_UNIX) { + if (auth_type == RPCAUTH_SYS) { + struct posix_cred temp_pcred; if (nd->nd_procnum == NFSPROC_NULL) return (0); - nd->nd_sec = RPCAUTH_UNIX; + nd->nd_sec = RPCAUTH_SYS; nfsm_chain_adv(error, nmreq, NFSX_UNSIGNED); // skip stamp nfsm_chain_get_32(error, nmreq, len); // hostname length if (len < 0 || len > NFS_MAXNAMLEN) @@ -4343,23 +5828,23 @@ nfsrv_getreq(struct nfsrv_descript *nd) nfsmout_if(error); /* create a temporary credential using the bits from the wire */ - bzero(&temp_cred, sizeof(temp_cred)); + bzero(&temp_pcred, sizeof(temp_pcred)); nfsm_chain_get_32(error, nmreq, user_id); nfsm_chain_get_32(error, nmreq, group_id); - temp_cred.cr_groups[0] = group_id; + temp_pcred.cr_groups[0] = group_id; nfsm_chain_get_32(error, nmreq, len); // extra GID count if ((len < 0) || (len > RPCAUTH_UNIXGIDS)) error = EBADRPC; nfsmout_if(error); for (i = 1; i <= len; i++) if (i < NGROUPS) - nfsm_chain_get_32(error, nmreq, temp_cred.cr_groups[i]); + nfsm_chain_get_32(error, nmreq, temp_pcred.cr_groups[i]); else nfsm_chain_adv(error, nmreq, NFSX_UNSIGNED); nfsmout_if(error); ngroups = (len >= NGROUPS) ? 
NGROUPS : (len + 1); if (ngroups > 1) - nfsrv_group_sort(&temp_cred.cr_groups[0], ngroups); + nfsrv_group_sort(&temp_pcred.cr_groups[0], ngroups); nfsm_chain_adv(error, nmreq, NFSX_UNSIGNED); // verifier flavor (should be AUTH_NONE) nfsm_chain_get_32(error, nmreq, len); // verifier length if (len < 0 || len > RPCAUTH_MAXSIZ) @@ -4368,9 +5853,9 @@ nfsrv_getreq(struct nfsrv_descript *nd) nfsm_chain_adv(error, nmreq, nfsm_rndup(len)); /* request creation of a real credential */ - temp_cred.cr_uid = user_id; - temp_cred.cr_ngroups = ngroups; - nd->nd_cr = kauth_cred_create(&temp_cred); + temp_pcred.cr_uid = user_id; + temp_pcred.cr_ngroups = ngroups; + nd->nd_cr = posix_cred_create(&temp_pcred); if (nd->nd_cr == NULL) { nd->nd_repstat = ENOMEM; nd->nd_procnum = NFSPROC_NOOP; diff --git a/bsd/nfs/nfs_srvcache.c b/bsd/nfs/nfs_srvcache.c index db1c6e6a7..7fde3da6b 100644 --- a/bsd/nfs/nfs_srvcache.c +++ b/bsd/nfs/nfs_srvcache.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -174,6 +174,7 @@ nfsrv_initcache(void) * If there is any doubt, return FALSE. * The AF_INET family is handled as a special case so that address mbufs * don't need to be saved to store "struct in_addr", which is only 4 bytes. + * Ditto for AF_INET6 which is only 16 bytes. */ static int netaddr_match( @@ -182,17 +183,22 @@ netaddr_match( mbuf_t nam) { struct sockaddr_in *inetaddr; + struct sockaddr_in6 *inet6addr; switch (family) { case AF_INET: inetaddr = mbuf_data(nam); - if (inetaddr->sin_family == AF_INET && - inetaddr->sin_addr.s_addr == haddr->had_inetaddr) + if ((inetaddr->sin_family == AF_INET) && + (inetaddr->sin_addr.s_addr == haddr->had_inetaddr)) return (1); break; - default: + case AF_INET6: + inet6addr = mbuf_data(nam); + if ((inet6addr->sin6_family == AF_INET6) && + !bcmp(&inet6addr->sin6_addr, &haddr->had_inet6addr, sizeof(inet6addr->sin6_addr))) + return (1); break; - }; + } return (0); } @@ -218,7 +224,7 @@ nfsrv_getcache( { struct nfsrvcache *rp; struct nfsm_chain nmrep; - struct sockaddr_in *saddr; + struct sockaddr *saddr; int ret, error; /* @@ -232,7 +238,7 @@ loop: for (rp = NFSRCHASH(nd->nd_retxid)->lh_first; rp != 0; rp = rp->rc_hash.le_next) { if (nd->nd_retxid == rp->rc_xid && nd->nd_procnum == rp->rc_proc && - netaddr_match(AF_INET, &rp->rc_haddr, nd->nd_nam)) { + netaddr_match(rp->rc_family, &rp->rc_haddr, nd->nd_nam)) { if ((rp->rc_flag & RC_LOCKED) != 0) { rp->rc_flag |= RC_WANTED; msleep(rp, nfsrv_reqcache_mutex, PZERO-1, "nfsrc", NULL); @@ -323,10 +329,15 @@ loop: rp->rc_state = RC_INPROG; rp->rc_xid = nd->nd_retxid; saddr = mbuf_data(nd->nd_nam); - switch (saddr->sin_family) { + rp->rc_family = saddr->sa_family; + switch (saddr->sa_family) { case AF_INET: rp->rc_flag |= RC_INETADDR; - rp->rc_inetaddr = saddr->sin_addr.s_addr; + rp->rc_inetaddr = ((struct sockaddr_in*)saddr)->sin_addr.s_addr; + break; + case AF_INET6: + rp->rc_flag |= RC_INETADDR; + rp->rc_inet6addr = ((struct sockaddr_in6*)saddr)->sin6_addr; break; default: error = mbuf_copym(nd->nd_nam, 0, MBUF_COPYALL, MBUF_WAITOK, &rp->rc_nam); @@ -366,7 +377,7 @@ loop: for (rp = NFSRCHASH(nd->nd_retxid)->lh_first; rp != 0; rp = rp->rc_hash.le_next) { if (nd->nd_retxid == rp->rc_xid && nd->nd_procnum == rp->rc_proc && - netaddr_match(AF_INET, &rp->rc_haddr, nd->nd_nam)) { + netaddr_match(rp->rc_family, &rp->rc_haddr, nd->nd_nam)) { if ((rp->rc_flag & RC_LOCKED) != 0) { rp->rc_flag |= 
RC_WANTED; msleep(rp, nfsrv_reqcache_mutex, PZERO-1, "nfsrc", NULL); diff --git a/bsd/nfs/nfs_subs.c b/bsd/nfs/nfs_subs.c index 40b55e86e..dccead918 100644 --- a/bsd/nfs/nfs_subs.c +++ b/bsd/nfs/nfs_subs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -99,6 +99,9 @@ #include <nfs/nfsproto.h> #include <nfs/nfs.h> #include <nfs/nfsnode.h> +#if NFSCLIENT +#define _NFS_XDR_SUBS_FUNCS_ /* define this to get xdrbuf function definitions */ +#endif #include <nfs/xdr_subs.h> #include <nfs/nfsm_subs.h> #include <nfs/nfs_gss.h> @@ -110,6 +113,8 @@ #include <netinet/in.h> #include <net/kpi_interface.h> +#include <sys/utfconv.h> + /* * NFS globals */ @@ -793,6 +798,33 @@ nfsm_chain_get_uio(struct nfsm_chain *nmc, uint32_t len, uio_t uio) #if NFSCLIENT +int +nfsm_chain_add_string_nfc(struct nfsm_chain *nmc, const uint8_t *s, uint32_t slen) +{ + uint8_t smallbuf[64]; + uint8_t *nfcname = smallbuf; + size_t buflen = sizeof(smallbuf), nfclen; + int error; + + error = utf8_normalizestr(s, slen, nfcname, &nfclen, buflen, UTF_PRECOMPOSED|UTF_NO_NULL_TERM); + if (error == ENAMETOOLONG) { + buflen = MAXPATHLEN; + MALLOC_ZONE(nfcname, uint8_t *, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (nfcname) + error = utf8_normalizestr(s, slen, nfcname, &nfclen, buflen, UTF_PRECOMPOSED|UTF_NO_NULL_TERM); + } + + /* if we got an error, just use the original string */ + if (error) + nfsm_chain_add_string(error, nmc, s, slen); + else + nfsm_chain_add_string(error, nmc, nfcname, nfclen); + + if (nfcname && (nfcname != smallbuf)) + FREE_ZONE(nfcname, MAXPATHLEN, M_NAMEI); + return (error); +} + /* * Add an NFSv2 "sattr" structure to an mbuf chain */ @@ -909,7 +941,7 @@ nfsm_chain_get_fh_attr( error = nfs_parsefattr(nmc, nfsvers, nvap); } else if (gotfh) { /* we need valid attributes in order to call nfs_nget() */ - if (nfs3_getattr_rpc(NULL, NFSTOMP(dnp), fhp->fh_data, fhp->fh_len, ctx, nvap, xidp)) { + if (nfs3_getattr_rpc(NULL, NFSTOMP(dnp), fhp->fh_data, fhp->fh_len, 0, ctx, nvap, xidp)) { gotattr = 0; fhp->fh_len = 0; } @@ -985,7 +1017,6 @@ nfs_get_xid(uint64_t *xidp) int nfsm_rpchead( struct nfsreq *req, - int auth_len, mbuf_t mrest, u_int64_t *xidp, mbuf_t *mreqp) @@ -993,23 +1024,55 @@ nfsm_rpchead( struct nfsmount *nmp = req->r_nmp; int nfsvers = nmp->nm_vers; int proc = ((nfsvers == NFS_VER2) ? nfsv2_procid[req->r_procnum] : (int)req->r_procnum); - int auth_type = (!auth_len && !req->r_cred) ? 
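nfsm_chain_add_string_nfc() below uses a common allocation pattern: normalize into a small on-stack buffer first, and only when that overflows (ENAMETOOLONG) pay for a MAXPATHLEN heap buffer; on any error it simply sends the original string. The pattern in generic, self-contained form (the toy transform() merely copies and is a stand-in for utf8_normalizestr()):

#include <errno.h>
#include <string.h>

/* Stand-in for utf8_normalizestr(): copy, failing with ENAMETOOLONG
 * when the output buffer is too small (the real call produces NFC). */
static int
transform(const char *in, size_t inlen, char *out, size_t *outlen, size_t outmax)
{
	if (inlen > outmax)
		return (ENAMETOOLONG);
	memcpy(out, in, inlen);
	*outlen = inlen;
	return (0);
}

/* Small stack buffer first; retry into the big buffer only on overflow. */
static int
normalize_name(const char *in, size_t inlen, char *big, size_t bigmax, size_t *outlen)
{
	char small[64];
	int error = transform(in, inlen, small, outlen, sizeof(small));

	if (error == ENAMETOOLONG)		/* rare: long name */
		error = transform(in, inlen, big, outlen, bigmax);
	return (error);				/* caller falls back to the
						 * original string on error */
}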
RPCAUTH_NULL : nmp->nm_auth; - return nfsm_rpchead2(nmp->nm_sotype, NFS_PROG, nfsvers, proc, - auth_type, auth_len, req->r_cred, req, mrest, xidp, mreqp); + return nfsm_rpchead2(nmp, nmp->nm_sotype, NFS_PROG, nfsvers, proc, + req->r_auth, req->r_cred, req, mrest, xidp, mreqp); } int -nfsm_rpchead2(int sotype, int prog, int vers, int proc, int auth_type, int auth_len, +nfsm_rpchead2(struct nfsmount *nmp, int sotype, int prog, int vers, int proc, int auth_type, kauth_cred_t cred, struct nfsreq *req, mbuf_t mrest, u_int64_t *xidp, mbuf_t *mreqp) { mbuf_t mreq, mb; - int error, i, grpsiz, authsiz, reqlen; + int error, i, grpsiz, auth_len = 0, authsiz, reqlen; size_t headlen; struct nfsm_chain nmreq; - /* allocate the packet */ + /* calculate expected auth length */ + switch (auth_type) { + case RPCAUTH_NONE: + auth_len = 0; + break; + case RPCAUTH_SYS: + { + gid_t grouplist[NGROUPS]; + int groupcount = NGROUPS; + + if (!cred) + return (EINVAL); + + (void)kauth_cred_getgroups(cred, grouplist, &groupcount); + if (groupcount < 1) + return (EINVAL); + + auth_len = ((((groupcount - 1) > nmp->nm_numgrps) ? + nmp->nm_numgrps : (groupcount - 1)) << 2) + + 5 * NFSX_UNSIGNED; + break; + } + case RPCAUTH_KRB5: + case RPCAUTH_KRB5I: + case RPCAUTH_KRB5P: + if (!req || !cred) + return (EINVAL); + auth_len = 5 * NFSX_UNSIGNED + 0; // zero context handle for now + break; + default: + return (EINVAL); + } authsiz = nfsm_rndup(auth_len); + + /* allocate the packet */ headlen = authsiz + 10 * NFSX_UNSIGNED; if (sotype == SOCK_STREAM) /* also include room for any RPC Record Mark */ headlen += NFSX_UNSIGNED; @@ -1055,27 +1118,36 @@ nfsm_rpchead2(int sotype, int prog, int vers, int proc, int auth_type, int auth_ add_cred: switch (auth_type) { - case RPCAUTH_NULL: - nfsm_chain_add_32(error, &nmreq, RPCAUTH_NULL); /* auth */ + case RPCAUTH_NONE: + nfsm_chain_add_32(error, &nmreq, RPCAUTH_NONE); /* auth */ nfsm_chain_add_32(error, &nmreq, 0); /* length */ - nfsm_chain_add_32(error, &nmreq, RPCAUTH_NULL); /* verf */ + nfsm_chain_add_32(error, &nmreq, RPCAUTH_NONE); /* verf */ nfsm_chain_add_32(error, &nmreq, 0); /* length */ nfsm_chain_build_done(error, &nmreq); + /* Append the args mbufs */ + if (!error) + error = mbuf_setnext(nmreq.nmc_mcur, mrest); break; - case RPCAUTH_UNIX: - nfsm_chain_add_32(error, &nmreq, RPCAUTH_UNIX); + case RPCAUTH_SYS: { + gid_t grouplist[NGROUPS]; + int groupcount; + + nfsm_chain_add_32(error, &nmreq, RPCAUTH_SYS); nfsm_chain_add_32(error, &nmreq, authsiz); nfsm_chain_add_32(error, &nmreq, 0); /* stamp */ nfsm_chain_add_32(error, &nmreq, 0); /* zero-length hostname */ nfsm_chain_add_32(error, &nmreq, kauth_cred_getuid(cred)); /* UID */ - nfsm_chain_add_32(error, &nmreq, cred->cr_groups[0]); /* GID */ + nfsm_chain_add_32(error, &nmreq, kauth_cred_getgid(cred)); /* GID */ grpsiz = (auth_len >> 2) - 5; nfsm_chain_add_32(error, &nmreq, grpsiz);/* additional GIDs */ + memset(grouplist, 0, sizeof(grouplist)); + groupcount = grpsiz; + (void)kauth_cred_getgroups(cred, grouplist, &groupcount); for (i = 1; i <= grpsiz; i++) - nfsm_chain_add_32(error, &nmreq, cred->cr_groups[i]); + nfsm_chain_add_32(error, &nmreq, grouplist[i]); /* And the verifier... 
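The RPCAUTH_SYS length computed in nfsm_rpchead2() above is five fixed XDR words (stamp, zero-length machinename, uid, gid, gid count) plus one word per additional gid, capped at the mount's nm_numgrps. Worked out as a small function:

#include <stdint.h>

#define XDR_UNSIGNED 4	/* NFSX_UNSIGNED */

static uint32_t
authsys_len(int groupcount, int numgrps_cap)
{
	int extra = groupcount - 1;	/* groups[0] is the effective gid */

	if (extra > numgrps_cap)
		extra = numgrps_cap;
	return (5 * XDR_UNSIGNED + (extra << 2));
}
/* e.g. groupcount = 5, cap = 16: 5*4 + 4*4 = 36 bytes, already a
 * multiple of 4, so nfsm_rndup() leaves it unchanged. */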
*/ - nfsm_chain_add_32(error, &nmreq, RPCAUTH_NULL); /* flavor */ + nfsm_chain_add_32(error, &nmreq, RPCAUTH_NONE); /* flavor */ nfsm_chain_add_32(error, &nmreq, 0); /* length */ nfsm_chain_build_done(error, &nmreq); @@ -1083,16 +1155,24 @@ add_cred: if (!error) error = mbuf_setnext(nmreq.nmc_mcur, mrest); break; + } case RPCAUTH_KRB5: case RPCAUTH_KRB5I: case RPCAUTH_KRB5P: error = nfs_gss_clnt_cred_put(req, &nmreq, mrest); if (error == ENEEDAUTH) { + gid_t grouplist[NGROUPS]; + int groupcount = NGROUPS; /* * Use sec=sys for this user */ error = 0; - auth_type = RPCAUTH_UNIX; + req->r_auth = auth_type = RPCAUTH_SYS; + (void)kauth_cred_getgroups(cred, grouplist, &groupcount); + auth_len = ((((groupcount - 1) > nmp->nm_numgrps) ? + nmp->nm_numgrps : (groupcount - 1)) << 2) + + 5 * NFSX_UNSIGNED; + authsiz = nfsm_rndup(auth_len); goto add_cred; } break; @@ -1141,6 +1221,21 @@ nfs_parsefattr(struct nfsm_chain *nmc, int nfsvers, struct nfs_vattr *nvap) dev_t rdev; val = val2 = 0; + NVATTR_INIT(nvap); + + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_TYPE); + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_MODE); + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_NUMLINKS); + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_OWNER); + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_OWNER_GROUP); + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_SIZE); + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_SPACE_USED); + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_RAWDEV); + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_FSID); + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_FILEID); + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_TIME_ACCESS); + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_TIME_MODIFY); + NFS_BITMAP_SET(nvap->nva_bitmap, NFS_FATTR_TIME_METADATA); nfsm_chain_get_32(error, nmc, vtype); nfsm_chain_get_32(error, nmc, vmode); @@ -1241,6 +1336,12 @@ nfs_loadattrcache( vnode_t vp; struct timeval now; struct nfs_vattr *npnvap; + int xattr = np->n_vattr.nva_flags & NFS_FFLAG_IS_ATTR; + int referral = np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL; + int aclbit, monitored, error = 0; + kauth_acl_t acl; + struct nfsmount *nmp; + uint32_t events = np->n_events; if (np->n_hflag & NHINIT) { vp = NULL; @@ -1249,10 +1350,11 @@ nfs_loadattrcache( vp = NFSTOV(np); mp = vnode_mount(vp); } + monitored = vp ? vnode_ismonitored(vp) : 0; FSDBG_TOP(527, np, vp, *xidp >> 32, *xidp); - if (!VFSTONFS(mp)) { + if (!((nmp = VFSTONFS(mp)))) { FSDBG_BOT(527, ENXIO, 1, 0, *xidp); return (ENXIO); } @@ -1298,16 +1400,133 @@ nfs_loadattrcache( */ printf("nfs loadattrcache vnode changed type, was %d now %d\n", vnode_vtype(vp), nvap->nva_type); - FSDBG_BOT(527, ESTALE, 3, 0, *xidp); - return (ESTALE); + error = ESTALE; + if (monitored) + events |= VNODE_EVENT_DELETE; + goto out; } + npnvap = &np->n_vattr; + + /* + * The ACL cache needs special handling because it is not + * always updated. Save current ACL cache state so it can + * be restored after copying the new attributes into place. + */ + aclbit = NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_ACL); + acl = npnvap->nva_acl; + + if (monitored) { + /* + * For monitored nodes, check for attribute changes that should generate events. 
+ */ + if (NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_NUMLINKS) && + (nvap->nva_nlink != npnvap->nva_nlink)) + events |= VNODE_EVENT_ATTRIB | VNODE_EVENT_LINK; + if (events & VNODE_EVENT_PERMS) + /* no need to do all the checking if it's already set */; + else if (NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_MODE) && + (nvap->nva_mode != npnvap->nva_mode)) + events |= VNODE_EVENT_ATTRIB | VNODE_EVENT_PERMS; + else if (NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_OWNER) && + (nvap->nva_uid != npnvap->nva_uid)) + events |= VNODE_EVENT_ATTRIB | VNODE_EVENT_PERMS; + else if (NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_OWNER_GROUP) && + (nvap->nva_gid != npnvap->nva_gid)) + events |= VNODE_EVENT_ATTRIB | VNODE_EVENT_PERMS; + else if (nmp->nm_vers >= NFS_VER4) { + if (NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_OWNER) && + !kauth_guid_equal(&nvap->nva_uuuid, &npnvap->nva_uuuid)) + events |= VNODE_EVENT_ATTRIB | VNODE_EVENT_PERMS; + else if (NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_OWNER_GROUP) && + !kauth_guid_equal(&nvap->nva_guuid, &npnvap->nva_guuid)) + events |= VNODE_EVENT_ATTRIB | VNODE_EVENT_PERMS; + else if ((NFS_BITMAP_ISSET(nvap->nva_bitmap, NFS_FATTR_ACL) && + nvap->nva_acl && npnvap->nva_acl && + ((nvap->nva_acl->acl_entrycount != npnvap->nva_acl->acl_entrycount) || + bcmp(nvap->nva_acl, npnvap->nva_acl, KAUTH_ACL_COPYSIZE(nvap->nva_acl))))) + events |= VNODE_EVENT_ATTRIB | VNODE_EVENT_PERMS; + } + if (((nmp->nm_vers >= NFS_VER4) && (nvap->nva_change != npnvap->nva_change)) || + (NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_TIME_MODIFY) && + ((nvap->nva_timesec[NFSTIME_MODIFY] != npnvap->nva_timesec[NFSTIME_MODIFY]) || + (nvap->nva_timensec[NFSTIME_MODIFY] != npnvap->nva_timensec[NFSTIME_MODIFY])))) + events |= VNODE_EVENT_ATTRIB | VNODE_EVENT_WRITE; + if (!events && NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_RAWDEV) && + ((nvap->nva_rawdev.specdata1 != npnvap->nva_rawdev.specdata1) || + (nvap->nva_rawdev.specdata2 != npnvap->nva_rawdev.specdata2))) + events |= VNODE_EVENT_ATTRIB; + if (!events && NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_FILEID) && + (nvap->nva_fileid != npnvap->nva_fileid)) + events |= VNODE_EVENT_ATTRIB; + if (!events && NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_ARCHIVE) && + ((nvap->nva_flags & NFS_FFLAG_ARCHIVED) != (npnvap->nva_flags & NFS_FFLAG_ARCHIVED))) + events |= VNODE_EVENT_ATTRIB; + if (!events && NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_HIDDEN) && + ((nvap->nva_flags & NFS_FFLAG_HIDDEN) != (npnvap->nva_flags & NFS_FFLAG_HIDDEN))) + events |= VNODE_EVENT_ATTRIB; + if (!events && NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_TIME_CREATE) && + ((nvap->nva_timesec[NFSTIME_CREATE] != npnvap->nva_timesec[NFSTIME_CREATE]) || + (nvap->nva_timensec[NFSTIME_CREATE] != npnvap->nva_timensec[NFSTIME_CREATE]))) + events |= VNODE_EVENT_ATTRIB; + if (!events && NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_TIME_BACKUP) && + ((nvap->nva_timesec[NFSTIME_BACKUP] != npnvap->nva_timesec[NFSTIME_BACKUP]) || + (nvap->nva_timensec[NFSTIME_BACKUP] != npnvap->nva_timensec[NFSTIME_BACKUP]))) + events |= VNODE_EVENT_ATTRIB; + } + + /* Copy the attributes to the attribute cache */ + bcopy((caddr_t)nvap, (caddr_t)npnvap, sizeof(*nvap)); + microuptime(&now); np->n_attrstamp = now.tv_sec; np->n_xid = *xidp; + /* NFS_FFLAG_IS_ATTR and NFS_FFLAG_TRIGGER_REFERRAL need to be sticky... 
*/ + if (vp && xattr) + nvap->nva_flags |= xattr; + if (vp && referral) + nvap->nva_flags |= referral; + + if (NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_ACL)) { + /* we're updating the ACL */ + if (nvap->nva_acl) { + /* make a copy of the acl for the cache */ + npnvap->nva_acl = kauth_acl_alloc(nvap->nva_acl->acl_entrycount); + if (npnvap->nva_acl) { + bcopy(nvap->nva_acl, npnvap->nva_acl, KAUTH_ACL_COPYSIZE(nvap->nva_acl)); + } else { + /* can't make a copy to cache, invalidate ACL cache */ + NFS_BITMAP_CLR(npnvap->nva_bitmap, NFS_FATTR_ACL); + NACLINVALIDATE(np); + aclbit = 0; + } + } + if (acl) { + kauth_acl_free(acl); + acl = NULL; + } + } + if (NFS_BITMAP_ISSET(npnvap->nva_bitmap, NFS_FATTR_ACL)) { + /* update the ACL timestamp */ + np->n_aclstamp = now.tv_sec; + } else { + /* we aren't updating the ACL, so restore original values */ + if (aclbit) + NFS_BITMAP_SET(npnvap->nva_bitmap, NFS_FATTR_ACL); + npnvap->nva_acl = acl; + } - npnvap = &np->n_vattr; - bcopy((caddr_t)nvap, (caddr_t)npnvap, sizeof(*nvap)); +#if CONFIG_TRIGGERS + /* + * For NFSv4, if the fsid doesn't match the fsid for the mount, then + * this node is for a different file system on the server. So we mark + * this node as a trigger node that will trigger the mirror mount. + */ + if ((nmp->nm_vers >= NFS_VER4) && (nvap->nva_type == VDIR) && + ((np->n_vattr.nva_fsid.major != nmp->nm_fsid.major) || + (np->n_vattr.nva_fsid.minor != nmp->nm_fsid.minor))) + np->n_vattr.nva_flags |= NFS_FFLAG_TRIGGER; +#endif if (!vp || (nvap->nva_type != VREG)) { np->n_size = nvap->nva_size; @@ -1332,6 +1551,8 @@ nfs_loadattrcache( */ np->n_newsize = nvap->nva_size; SET(np->n_flag, NUPDATESIZE); + if (monitored) + events |= VNODE_EVENT_ATTRIB | VNODE_EVENT_EXTEND; } } @@ -1346,8 +1567,11 @@ nfs_loadattrcache( } } - FSDBG_BOT(527, 0, np, np->n_size, *xidp); - return (0); +out: + if (monitored && events) + nfs_vnode_notify(np, events); + FSDBG_BOT(527, error, np, np->n_size, *xidp); + return (error); } /* @@ -1359,16 +1583,22 @@ nfs_attrcachetimeout(nfsnode_t np) { struct nfsmount *nmp; struct timeval now; - int isdir, timeo; + int isdir; + uint32_t timeo; if (!(nmp = NFSTONMP(np))) return (0); isdir = vnode_isdir(NFSTOV(np)); - if ((np)->n_flag & NMODIFIED) + if ((nmp->nm_vers >= NFS_VER4) && (np->n_openflags & N_DELEG_MASK)) { + /* If we have a delegation, we always use the max timeout. */ + timeo = isdir ? nmp->nm_acdirmax : nmp->nm_acregmax; + } else if ((np)->n_flag & NMODIFIED) { + /* If we have modifications, we always use the min timeout. */ timeo = isdir ? nmp->nm_acdirmin : nmp->nm_acregmin; - else { + } else { + /* Otherwise, we base the timeout on how old the file seems. */ /* Note that if the client and server clocks are way out of sync, */ /* timeout will probably get clamped to a min or max value */ microtime(&now); @@ -1396,26 +1626,32 @@ nfs_attrcachetimeout(nfsnode_t np) * Must be called with the node locked. */ int -nfs_getattrcache(nfsnode_t np, struct nfs_vattr *nvaper) +nfs_getattrcache(nfsnode_t np, struct nfs_vattr *nvaper, int flags) { struct nfs_vattr *nvap; struct timeval nowup; int32_t timeo; - if (!NATTRVALID(np)) { + /* Check if the attributes are valid. */ + if (!NATTRVALID(np) || ((flags & NGA_ACL) && !NACLVALID(np))) { FSDBG(528, np, 0, 0xffffff01, ENOENT); OSAddAtomic(1, &nfsstats.attrcache_misses); return (ENOENT); } + /* Verify the cached attributes haven't timed out. 
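The age-based branch of nfs_attrcachetimeout() below follows the traditional BSD heuristic: the staler a file appears, the longer its attributes may be cached, clamped to the mount's min/max (the acreg* pair, or acdir* for directories). A sketch of that shape — the divisor of 10 is the traditional choice and is an assumption here, not quoted from this patch:

#include <stdint.h>

static uint32_t
attrcache_timeout(long now_sec, long mtime_sec, uint32_t acmin, uint32_t acmax)
{
	long age = now_sec - mtime_sec;	/* clock skew pushes this to a clamp */
	long timeo = age / 10;		/* cache longer for older files */

	if (timeo < (long)acmin)
		timeo = acmin;
	else if (timeo > (long)acmax)
		timeo = acmax;
	return ((uint32_t)timeo);
}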
*/ timeo = nfs_attrcachetimeout(np); - microuptime(&nowup); if ((nowup.tv_sec - np->n_attrstamp) >= timeo) { FSDBG(528, np, 0, 0xffffff02, ENOENT); OSAddAtomic(1, &nfsstats.attrcache_misses); return (ENOENT); } + if ((flags & NGA_ACL) && ((nowup.tv_sec - np->n_aclstamp) >= timeo)) { + FSDBG(528, np, 0, 0xffffff02, ENOENT); + OSAddAtomic(1, &nfsstats.attrcache_misses); + return (ENOENT); + } nvap = &np->n_vattr; FSDBG(528, np, nvap->nva_size, np->n_size, 0xcace); @@ -1451,9 +1687,257 @@ nfs_getattrcache(nfsnode_t np, struct nfs_vattr *nvaper) nvaper->nva_timensec[NFSTIME_MODIFY] = np->n_mtim.tv_nsec; } } + if (nvap->nva_acl) { + if (flags & NGA_ACL) { + nvaper->nva_acl = kauth_acl_alloc(nvap->nva_acl->acl_entrycount); + if (!nvaper->nva_acl) + return (ENOMEM); + bcopy(nvap->nva_acl, nvaper->nva_acl, KAUTH_ACL_COPYSIZE(nvap->nva_acl)); + } else { + nvaper->nva_acl = NULL; + } + } return (0); } +/* + * When creating file system objects: + * Don't bother setting UID if it's the same as the credential performing the create. + * Don't bother setting GID if it's the same as the directory or credential. + */ +void +nfs_avoid_needless_id_setting_on_create(nfsnode_t dnp, struct vnode_attr *vap, vfs_context_t ctx) +{ + if (VATTR_IS_ACTIVE(vap, va_uid)) { + if (kauth_cred_getuid(vfs_context_ucred(ctx)) == vap->va_uid) { + VATTR_CLEAR_ACTIVE(vap, va_uid); + VATTR_CLEAR_ACTIVE(vap, va_uuuid); + } + } + if (VATTR_IS_ACTIVE(vap, va_gid)) { + if ((vap->va_gid == dnp->n_vattr.nva_gid) || + (kauth_cred_getgid(vfs_context_ucred(ctx)) == vap->va_gid)) { + VATTR_CLEAR_ACTIVE(vap, va_gid); + VATTR_CLEAR_ACTIVE(vap, va_guuid); + } + } +} + +/* + * Convert a universal address string to a sockaddr structure. + * + * Universal addresses can be in the following formats: + * + * d = decimal (IPv4) + * x = hexadecimal (IPv6) + * p = port (decimal) + * + * d.d.d.d + * d.d.d.d.p.p + * x:x:x:x:x:x:x:x + * x:x:x:x:x:x:x:x.p.p + * x:x:x:x:x:x:d.d.d.d + * x:x:x:x:x:x:d.d.d.d.p.p + * + * IPv6 strings can also have a series of zeroes elided + * IPv6 strings can also have a %scope suffix at the end (after any port) + * + * rules & exceptions: + * - value before : is hex + * - value before . is dec + * - once . hit, all values are dec + * - hex+port case means value before first dot is actually hex + * - . is always preceded by digits except if last hex was double-colon + * + * scan, converting #s to bytes + * first time a . is encountered, scan the rest to count them. + * 2 dots = just port + * 3 dots = just IPv4 no port + * 5 dots = IPv4 and port + */ + +#define IS_DIGIT(C) \ + (((C) >= '0') && ((C) <= '9')) + +#define IS_XDIGIT(C) \ + (IS_DIGIT(C) || \ + (((C) >= 'A') && ((C) <= 'F')) || \ + (((C) >= 'a') && ((C) <= 'f'))) + +int +nfs_uaddr2sockaddr(const char *uaddr, struct sockaddr *addr) +{ + const char *p, *pd; /* pointers to current character in scan */ + const char *pnum; /* pointer to current number to decode */ + const char *pscope; /* pointer to IPv6 scope ID */ + uint8_t a[18]; /* octet array to store address bytes */ + int i; /* index of next octet to decode */ + int dci; /* index of octet to insert double-colon zeroes */ + int dcount, xdcount; /* count of digits in current number */ + int needmore; /* set when we know we need more input (e.g. 
after colon, period) */ + int dots; /* # of dots */ + int hex; /* contains hex values */ + unsigned long val; /* decoded value */ + int s; /* index used for sliding array to insert elided zeroes */ + +#define HEXVALUE 0 +#define DECIMALVALUE 1 +#define GET(TYPE) \ + do { \ + if ((dcount <= 0) || (dcount > (((TYPE) == DECIMALVALUE) ? 3 : 4))) \ + return (0); \ + if (((TYPE) == DECIMALVALUE) && xdcount) \ + return (0); \ + val = strtoul(pnum, NULL, ((TYPE) == DECIMALVALUE) ? 10 : 16); \ + if (((TYPE) == DECIMALVALUE) && (val >= 256)) \ + return (0); \ + /* check if there is room left in the array */ \ + if (i > (int)(sizeof(a) - (((TYPE) == HEXVALUE) ? 2 : 1) - ((dci != -1) ? 2 : 0))) \ + return (0); \ + if ((TYPE) == HEXVALUE) \ + a[i++] = ((val >> 8) & 0xff); \ + a[i++] = (val & 0xff); \ + } while (0) + + hex = 0; + dots = 0; + dci = -1; + i = dcount = xdcount = 0; + pnum = p = uaddr; + pscope = NULL; + needmore = 1; + if ((*p == ':') && (*++p != ':')) /* if it starts with colon, gotta be a double */ + return (0); + + while (*p) { + if (IS_XDIGIT(*p)) { + dcount++; + if (!IS_DIGIT(*p)) + xdcount++; + needmore = 0; + p++; + } else if (*p == '.') { + /* rest is decimal IPv4 dotted quad and/or port */ + if (!dots) { + /* this is the first, so count them */ + for (pd = p; *pd; pd++) { + if (*pd == '.') { + if (++dots > 5) + return (0); + } else if (hex && (*pd == '%')) { + break; + } else if ((*pd < '0') || (*pd > '9')) { + return (0); + } + } + if ((dots != 2) && (dots != 3) && (dots != 5)) + return (0); + if (hex && (dots == 2)) { /* hex+port */ + if (!dcount && needmore) + return (0); + if (dcount) /* last hex may be elided zero */ + GET(HEXVALUE); + } else { + GET(DECIMALVALUE); + } + } else { + GET(DECIMALVALUE); + } + dcount = xdcount = 0; + needmore = 1; + pnum = ++p; + } else if (*p == ':') { + hex = 1; + if (dots) + return (0); + if (!dcount) { /* missing number, probably double colon */ + if (dci >= 0) /* can only have one double colon */ + return (0); + dci = i; + needmore = 0; + } else { + GET(HEXVALUE); + dcount = xdcount = 0; + needmore = 1; + } + pnum = ++p; + } else if (*p == '%') { /* scope ID delimiter */ + if (!hex) + return (0); + p++; + pscope = p; + break; + } else { /* unexpected character */ + return (0); + } + } + if (needmore && !dcount) + return (0); + if (dcount) /* decode trailing number */ + GET(dots ? DECIMALVALUE : HEXVALUE); + if (dci >= 0) { /* got a double-colon at i, need to insert a range of zeroes */ + /* if we got a port, slide to end of array */ + /* otherwise, slide to end of address (non-port) values */ + int end = ((dots == 2) || (dots == 5)) ? sizeof(a) : (sizeof(a) - 2); + if (i % 2) /* length of zero range must be multiple of 2 */ + return (0); + if (i >= end) /* no room? 
*/ + return (0); + /* slide (i-dci) numbers up from index dci */ + for (s=0; s < (i - dci); s++) + a[end-1-s] = a[i-1-s]; + /* zero (end-i) numbers at index dci */ + for (s=0; s < (end - i); s++) + a[dci+s] = 0; + i = end; + } + + /* copy out resulting socket address */ + if (hex) { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6*)addr; + if ((((dots == 0) || (dots == 3)) && (i != (sizeof(a)-2)))) + return (0); + if ((((dots == 2) || (dots == 5)) && (i != sizeof(a)))) + return (0); + bzero(sin6, sizeof(struct sockaddr_in6)); + sin6->sin6_len = sizeof(struct sockaddr_in6); + sin6->sin6_family = AF_INET6; + bcopy(a, &sin6->sin6_addr.s6_addr, sizeof(struct in6_addr)); + if ((dots == 5) || (dots == 2)) + sin6->sin6_port = htons((a[16] << 8) | a[17]); + if (pscope) { + for (p=pscope; IS_DIGIT(*p); p++) + ; + if (*p && !IS_DIGIT(*p)) { /* name */ + ifnet_t interface = NULL; + if (ifnet_find_by_name(pscope, &interface) == 0) + sin6->sin6_scope_id = ifnet_index(interface); + if (interface) + ifnet_release(interface); + } else { /* decimal number */ + sin6->sin6_scope_id = strtoul(pscope, NULL, 10); + } + /* XXX should we also embed scope id for linklocal? */ + } + } else { + struct sockaddr_in *sin = (struct sockaddr_in*)addr; + if ((dots != 3) && (dots != 5)) + return (0); + if ((dots == 3) && (i != 4)) + return (0); + if ((dots == 5) && (i != 6)) + return (0); + bzero(sin, sizeof(struct sockaddr_in)); + sin->sin_len = sizeof(struct sockaddr_in); + sin->sin_family = AF_INET; + bcopy(a, &sin->sin_addr.s_addr, sizeof(struct in_addr)); + if (dots == 5) + sin->sin_port = htons((a[4] << 8) | a[5]); + } + return (1); +} + + #endif /* NFSCLIENT */ /* @@ -1478,8 +1962,7 @@ int nfsrv_free_netopt(struct radix_node *, void *); int nfsrv_free_addrlist(struct nfs_export *, struct user_nfs_export_args *); struct nfs_export_options *nfsrv_export_lookup(struct nfs_export *, mbuf_t); struct nfs_export *nfsrv_fhtoexport(struct nfs_filehandle *); -int nfsrv_cmp_sockaddr(struct sockaddr_storage *, struct sockaddr_storage *); -struct nfs_user_stat_node *nfsrv_get_user_stat_node(struct nfs_active_user_list *, struct sockaddr_storage *, uid_t); +struct nfs_user_stat_node *nfsrv_get_user_stat_node(struct nfs_active_user_list *, struct sockaddr *, uid_t); void nfsrv_init_user_list(struct nfs_active_user_list *); void nfsrv_free_user_list(struct nfs_active_user_list *); @@ -1939,7 +2422,6 @@ nfsrv_hang_addrlist(struct nfs_export *nx, struct user_nfs_export_args *unxa) unsigned int net; user_addr_t uaddr; kauth_cred_t cred; - struct ucred temp_cred; uaddr = unxa->nxa_nets; for (net = 0; net < unxa->nxa_netcount; net++, uaddr += sizeof(nxna)) { @@ -1948,12 +2430,13 @@ nfsrv_hang_addrlist(struct nfs_export *nx, struct user_nfs_export_args *unxa) return (error); if (nxna.nxna_flags & (NX_MAPROOT|NX_MAPALL)) { - bzero(&temp_cred, sizeof(temp_cred)); - temp_cred.cr_uid = nxna.nxna_cred.cr_uid; - temp_cred.cr_ngroups = nxna.nxna_cred.cr_ngroups; + struct posix_cred temp_pcred; + bzero(&temp_pcred, sizeof(temp_pcred)); + temp_pcred.cr_uid = nxna.nxna_cred.cr_uid; + temp_pcred.cr_ngroups = nxna.nxna_cred.cr_ngroups; for (i=0; i < nxna.nxna_cred.cr_ngroups && i < NGROUPS; i++) - temp_cred.cr_groups[i] = nxna.nxna_cred.cr_groups[i]; - cred = kauth_cred_create(&temp_cred); + temp_pcred.cr_groups[i] = nxna.nxna_cred.cr_groups[i]; + cred = posix_cred_create(&temp_pcred); if (!IS_VALID_CRED(cred)) return (ENOMEM); } else { @@ -2035,13 +2518,34 @@ nfsrv_hang_addrlist(struct nfs_export *nx, struct user_nfs_export_args *unxa) if (cred 
== cred2) { /* creds are same (or both NULL) */ matched = 1; - } else if (cred && cred2 && (cred->cr_uid == cred2->cr_uid) && - (cred->cr_ngroups == cred2->cr_ngroups)) { - for (i=0; i < cred2->cr_ngroups && i < NGROUPS; i++) - if (cred->cr_groups[i] != cred2->cr_groups[i]) - break; - if (i >= cred2->cr_ngroups || i >= NGROUPS) - matched = 1; + } else if (cred && cred2 && (kauth_cred_getuid(cred) == kauth_cred_getuid(cred2))) { + /* + * Now compare the effective and + * supplementary groups... + * + * Note: This comparison, as written, + * does not correctly indicate that + * the groups are equivalent, since + * other than the first supplementary + * group, which is also the effective + * group, order on the remaining groups + * doesn't matter, and this is an + * ordered compare. + */ + gid_t groups[NGROUPS]; + gid_t groups2[NGROUPS]; + int groupcount = NGROUPS; + int group2count = NGROUPS; + + if (!kauth_cred_getgroups(cred, groups, &groupcount) && + !kauth_cred_getgroups(cred2, groups2, &group2count) && + groupcount == group2count) { + for (i=0; i < group2count; i++) + if (groups[i] != groups2[i]) + break; + if (i >= group2count || i >= NGROUPS) + matched = 1; + } } } if (IS_VALID_CRED(cred)) @@ -2167,7 +2671,8 @@ void enablequotas(struct mount *mp, vfs_context_t ctx); // XXX int nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) { - int error = 0, pathlen; + int error = 0; + size_t pathlen; struct nfs_exportfs *nxfs, *nxfs2, *nxfs3; struct nfs_export *nx, *nx2, *nx3; struct nfs_filehandle nfh; @@ -2179,10 +2684,10 @@ nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) if (unxa->nxa_flags == NXA_CHECK) { /* just check if the path is an NFS-exportable file system */ - error = copyinstr(unxa->nxa_fspath, path, MAXPATHLEN, (size_t *)&pathlen); + error = copyinstr(unxa->nxa_fspath, path, MAXPATHLEN, &pathlen); if (error) return (error); - NDINIT(&mnd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, + NDINIT(&mnd, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx); error = namei(&mnd); if (error) @@ -2215,8 +2720,11 @@ nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) lck_rw_lock_exclusive(&nfsrv_export_rwlock); while ((nxfs = LIST_FIRST(&nfsrv_exports))) { mp = vfs_getvfs_by_mntonname(nxfs->nxfs_path); - if (mp) + if (mp) { vfs_clearflags(mp, MNT_EXPORTED); + mount_iterdrop(mp); + mp = NULL; + } /* delete all exports on this file system */ while ((nx = LIST_FIRST(&nxfs->nxfs_exports))) { LIST_REMOVE(nx, nx_next); @@ -2245,7 +2753,7 @@ nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) return (0); } - error = copyinstr(unxa->nxa_fspath, path, MAXPATHLEN, (size_t *)&pathlen); + error = copyinstr(unxa->nxa_fspath, path, MAXPATHLEN, &pathlen); if (error) return (error); @@ -2272,8 +2780,12 @@ nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) if ((unxa->nxa_flags & (NXA_ADD|NXA_OFFLINE)) == NXA_ADD) { /* if adding, verify that the mount is still what we expect */ mp = vfs_getvfs_by_mntonname(nxfs->nxfs_path); + if (mp) { + mount_ref(mp, 0); + mount_iterdrop(mp); + } /* find exported FS root vnode */ - NDINIT(&mnd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, + NDINIT(&mnd, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, UIO_SYSSPACE, CAST_USER_ADDR_T(nxfs->nxfs_path), ctx); error = namei(&mnd); if (error) @@ -2298,7 +2810,7 @@ nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) } /* find exported FS root vnode */ - NDINIT(&mnd, LOOKUP, FOLLOW | LOCKLEAF | 
AUDITVNPATH1, + NDINIT(&mnd, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx); error = namei(&mnd); if (error) { @@ -2318,6 +2830,7 @@ nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) mvp = NULL; } else { mp = vnode_mount(mvp); + mount_ref(mp, 0); /* make sure the file system is NFS-exportable */ nfh.nfh_len = NFSV3_MAX_FID_SIZE; @@ -2366,7 +2879,7 @@ nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) } if (unxa->nxa_exppath) { - error = copyinstr(unxa->nxa_exppath, path, MAXPATHLEN, (size_t *)&pathlen); + error = copyinstr(unxa->nxa_exppath, path, MAXPATHLEN, &pathlen); if (error) goto out; LIST_FOREACH(nx, &nxfs->nxfs_exports, nx_next) { @@ -2483,6 +2996,9 @@ nfsrv_export(struct user_nfs_export_args *unxa, vfs_context_t ctx) vnode_get(xvp); } else { xnd.ni_cnd.cn_nameiop = LOOKUP; +#if CONFIG_TRIGGERS + xnd.ni_op = OP_LOOKUP; +#endif xnd.ni_cnd.cn_flags = LOCKLEAF; xnd.ni_pathlen = pathlen - 1; xnd.ni_cnd.cn_nameptr = xnd.ni_cnd.cn_pnbuf = path; @@ -2600,6 +3116,8 @@ out: nameidone(&mnd); } unlock_out: + if (mp) + mount_drop(mp, 0); lck_rw_done(&nfsrv_export_rwlock); return (error); } @@ -2736,6 +3254,12 @@ nfsrv_fhtovp( /* find mount structure */ mp = vfs_getvfs_by_mntonname((*nxp)->nx_fs->nxfs_path); + if (mp) { + error = vfs_busy(mp, LK_NOWAIT); + mount_iterdrop(mp); + if (error) + mp = NULL; + } if (!mp) { /* * We have an export, but no mount? @@ -2746,6 +3270,7 @@ nfsrv_fhtovp( fidp = nfhp->nfh_fhp + sizeof(*nxh); error = VFS_FHTOVP(mp, nxh->nxh_fidlen, fidp, vpp, NULL); + vfs_unbusy(mp); if (error) return (error); /* vnode pointer should be good at this point or ... */ @@ -2863,46 +3388,6 @@ nfsrv_fhmatch(struct nfs_filehandle *fh1, struct nfs_filehandle *fh2) * Functions for dealing with active user lists */ -/* - * Compare address fields of two sockaddr_storage structures. - * Returns zero if they match. - */ -int -nfsrv_cmp_sockaddr(struct sockaddr_storage *sock1, struct sockaddr_storage *sock2) -{ - struct sockaddr_in *ipv4_sock1, *ipv4_sock2; - struct sockaddr_in6 *ipv6_sock1, *ipv6_sock2; - - /* check for valid parameters */ - if (sock1 == NULL || sock2 == NULL) - return 1; - - /* check address length */ - if (sock1->ss_len != sock2->ss_len) - return 1; - - /* Check address family */ - if (sock1->ss_family != sock2->ss_family) - return 1; - - if (sock1->ss_family == AF_INET) { - /* IPv4 */ - ipv4_sock1 = (struct sockaddr_in *)sock1; - ipv4_sock2 = (struct sockaddr_in *)sock2; - - if (!bcmp(&ipv4_sock1->sin_addr, &ipv4_sock2->sin_addr, sizeof(struct in_addr))) - return 0; - } else { - /* IPv6 */ - ipv6_sock1 = (struct sockaddr_in6 *)sock1; - ipv6_sock2 = (struct sockaddr_in6 *)sock2; - - if (!bcmp(&ipv6_sock1->sin6_addr, &ipv6_sock2->sin6_addr, sizeof(struct in6_addr))) - return 0; - } - return 1; -} - /* * Search the hash table for a user node with a matching IP address and uid field. * If found, the node's tm_last timestamp is updated and the node is returned. @@ -2913,7 +3398,7 @@ nfsrv_cmp_sockaddr(struct sockaddr_storage *sock1, struct sockaddr_storage *sock * The list's user_mutex lock MUST be held. 
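 * If no matching node is found, a new node is initialized below (uid plus
 * a copy of the caller's socket address) and added to the list, so a NULL
 * return indicates the node could not be allocated.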
 */
 struct nfs_user_stat_node *
-nfsrv_get_user_stat_node(struct nfs_active_user_list *list, struct sockaddr_storage *sock, uid_t uid)
+nfsrv_get_user_stat_node(struct nfs_active_user_list *list, struct sockaddr *saddr, uid_t uid)
 {
 	struct nfs_user_stat_node *unode;
 	struct timeval now;
@@ -2922,7 +3407,7 @@ nfsrv_get_user_stat_node(struct nfs_active_user_list *list, struct sockaddr_stor
 	/* search the hash table */
 	head = NFS_USER_STAT_HASH(list->user_hashtbl, uid);
 	LIST_FOREACH(unode, head, hash_link) {
-		if (uid == unode->uid && nfsrv_cmp_sockaddr(sock, &unode->sock) == 0) {
+		if ((uid == unode->uid) && (nfs_sockaddr_cmp(saddr, (struct sockaddr*)&unode->sock) == 0)) {
 			/* found matching node */
 			break;
 		}
@@ -2964,7 +3449,7 @@ nfsrv_get_user_stat_node(struct nfs_active_user_list *list, struct sockaddr_stor
 
 	/* Initialize the node */
 	unode->uid = uid;
-	bcopy(sock, &unode->sock, sock->ss_len);
+	bcopy(saddr, &unode->sock, saddr->sa_len);
 	microtime(&now);
 	unode->ops = 0;
 	unode->bytes_read = 0;
@@ -2984,15 +3469,15 @@ nfsrv_update_user_stat(struct nfs_export *nx, struct nfsrv_descript *nd, uid_t u
 {
 	struct nfs_user_stat_node *unode;
 	struct nfs_active_user_list *ulist;
-	struct sockaddr_storage *sock_stor;
+	struct sockaddr *saddr;
 
 	if ((!nfsrv_user_stat_enabled) || (!nx) || (!nd) || (!nd->nd_nam))
 		return;
 
-	sock_stor = (struct sockaddr_storage *)mbuf_data(nd->nd_nam);
+	saddr = (struct sockaddr *)mbuf_data(nd->nd_nam);
 
 	/* check address family before going any further */
-	if ((sock_stor->ss_family != AF_INET) && (sock_stor->ss_family != AF_INET6))
+	if ((saddr->sa_family != AF_INET) && (saddr->sa_family != AF_INET6))
 		return;
 
 	ulist = &nx->nx_user_list;
@@ -3001,7 +3486,7 @@ nfsrv_update_user_stat(struct nfs_export *nx, struct nfsrv_descript *nd, uid_t u
 	lck_mtx_lock(&ulist->user_mutex);
 
 	/* get the user node */
-	unode = nfsrv_get_user_stat_node(ulist, sock_stor, uid);
+	unode = nfsrv_get_user_stat_node(ulist, saddr, uid);
 
 	if (!unode) {
 		lck_mtx_unlock(&ulist->user_mutex);
diff --git a/bsd/nfs/nfs_syscalls.c b/bsd/nfs/nfs_syscalls.c
index c28eac76c..d6de219ba 100644
--- a/bsd/nfs/nfs_syscalls.c
+++ b/bsd/nfs/nfs_syscalls.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -150,36 +150,40 @@ SYSCTL_NODE(_vfs_generic, OID_AUTO, nfs, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "nfs hing #if NFSCLIENT SYSCTL_NODE(_vfs_generic_nfs, OID_AUTO, client, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "nfs client hinge"); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, initialdowndelay, CTLFLAG_RW, &nfs_tprintf_initial_delay, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nextdowndelay, CTLFLAG_RW, &nfs_tprintf_delay, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, iosize, CTLFLAG_RW, &nfs_iosize, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_cache_timeout, CTLFLAG_RW, &nfs_access_cache_timeout, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, allow_async, CTLFLAG_RW, &nfs_allow_async, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, statfs_rate_limit, CTLFLAG_RW, &nfs_statfs_rate_limit, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nfsiod_thread_max, CTLFLAG_RW, &nfsiod_thread_max, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nfsiod_thread_count, CTLFLAG_RD, &nfsiod_thread_count, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, lockd_mounts, CTLFLAG_RD, &nfs_lockd_mounts, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, max_async_writes, CTLFLAG_RW, &nfs_max_async_writes, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, single_des, CTLFLAG_RW, &nfs_single_des, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_delete, CTLFLAG_RW, &nfs_access_delete, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, initialdowndelay, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_tprintf_initial_delay, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nextdowndelay, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_tprintf_delay, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, iosize, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_iosize, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_cache_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_cache_timeout, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, allow_async, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_allow_async, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, statfs_rate_limit, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_statfs_rate_limit, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nfsiod_thread_max, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsiod_thread_max, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, nfsiod_thread_count, CTLFLAG_RD | CTLFLAG_LOCKED, &nfsiod_thread_count, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, lockd_mounts, CTLFLAG_RD | CTLFLAG_LOCKED, &nfs_lockd_mounts, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, max_async_writes, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_max_async_writes, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, single_des, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_single_des, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_delete, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_delete, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_dotzfs, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_dotzfs, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, access_for_getattr, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_access_for_getattr, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, idmap_ctrl, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_idmap_ctrl, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_client, OID_AUTO, callback_port, CTLFLAG_RW | CTLFLAG_LOCKED, &nfs_callback_port, 0, ""); #endif /* NFSCLIENT */ #if NFSSERVER SYSCTL_NODE(_vfs_generic_nfs, OID_AUTO, server, 
CTLFLAG_RW|CTLFLAG_LOCKED, 0, "nfs server hinge"); -SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, wg_delay, CTLFLAG_RW, &nfsrv_wg_delay, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, wg_delay_v3, CTLFLAG_RW, &nfsrv_wg_delay_v3, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, require_resv_port, CTLFLAG_RW, &nfsrv_require_resv_port, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, async, CTLFLAG_RW, &nfsrv_async, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, export_hash_size, CTLFLAG_RW, &nfsrv_export_hash_size, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, reqcache_size, CTLFLAG_RW, &nfsrv_reqcache_size, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, request_queue_length, CTLFLAG_RW, &nfsrv_sock_max_rec_queue_length, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, user_stats, CTLFLAG_RW, &nfsrv_user_stat_enabled, 0, ""); -SYSCTL_UINT(_vfs_generic_nfs_server, OID_AUTO, gss_context_ttl, CTLFLAG_RW, &nfsrv_gss_context_ttl, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, wg_delay, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_wg_delay, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, wg_delay_v3, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_wg_delay_v3, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, require_resv_port, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_require_resv_port, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, async, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_async, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, export_hash_size, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_export_hash_size, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, reqcache_size, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_reqcache_size, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, request_queue_length, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_sock_max_rec_queue_length, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, user_stats, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_user_stat_enabled, 0, ""); +SYSCTL_UINT(_vfs_generic_nfs_server, OID_AUTO, gss_context_ttl, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_gss_context_ttl, 0, ""); #if CONFIG_FSE -SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, fsevents, CTLFLAG_RW, &nfsrv_fsevents_enabled, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, fsevents, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsrv_fsevents_enabled, 0, ""); #endif -SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, nfsd_thread_max, CTLFLAG_RW, &nfsd_thread_max, 0, ""); -SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, nfsd_thread_count, CTLFLAG_RD, &nfsd_thread_count, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, nfsd_thread_max, CTLFLAG_RW | CTLFLAG_LOCKED, &nfsd_thread_max, 0, ""); +SYSCTL_INT(_vfs_generic_nfs_server, OID_AUTO, nfsd_thread_count, CTLFLAG_RD | CTLFLAG_LOCKED, &nfsd_thread_count, 0, ""); #endif /* NFSSERVER */ @@ -191,11 +195,19 @@ nfsclnt(proc_t p, struct nfsclnt_args *uap, __unused int *retval) struct lockd_ans la; int error; - if (uap->flag == NFSCLNT_LOCKDANS) { + switch (uap->flag) { + case NFSCLNT_LOCKDANS: error = copyin(uap->argp, &la, sizeof(la)); - return (error != 0 ? 
error : nfslockdans(p, &la)); + if (!error) + error = nfslockdans(p, &la); + break; + case NFSCLNT_LOCKDNOTIFY: + error = nfslockdnotify(p, uap->argp); + break; + default: + error = EINVAL; } - return EINVAL; + return (error); } /* @@ -389,10 +401,10 @@ getfh(proc_t p, struct getfh_args *uap, __unused int *retval) { vnode_t vp; struct nfs_filehandle nfh; - int error; + int error, fhlen, fidlen; struct nameidata nd; char path[MAXPATHLEN], *ptr; - u_int pathlen; + size_t pathlen; struct nfs_exportfs *nxfs; struct nfs_export *nx; @@ -403,14 +415,20 @@ getfh(proc_t p, struct getfh_args *uap, __unused int *retval) if (error) return (error); - error = copyinstr(uap->fname, path, MAXPATHLEN, (size_t *)&pathlen); + error = copyinstr(uap->fname, path, MAXPATHLEN, &pathlen); + if (!error) + error = copyin(uap->fhp, &fhlen, sizeof(fhlen)); if (error) return (error); + /* limit fh size to length specified (or v3 size by default) */ + if ((fhlen != NFSV2_MAX_FH_SIZE) && (fhlen != NFSV3_MAX_FH_SIZE)) + fhlen = NFSV3_MAX_FH_SIZE; + fidlen = fhlen - sizeof(struct nfs_exphandle); if (!nfsrv_is_initialized()) return (EINVAL); - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, UIO_SYSSPACE, CAST_USER_ADDR_T(path), vfs_context_current()); error = namei(&nd); if (error) @@ -452,9 +470,9 @@ getfh(proc_t p, struct getfh_args *uap, __unused int *retval) nfh.nfh_xh.nxh_expid = htonl(nx->nx_id); nfh.nfh_xh.nxh_flags = 0; nfh.nfh_xh.nxh_reserved = 0; - nfh.nfh_len = NFSV3_MAX_FID_SIZE; + nfh.nfh_len = fidlen; error = VFS_VPTOFH(vp, (int*)&nfh.nfh_len, &nfh.nfh_fid[0], NULL); - if (nfh.nfh_len > (int)NFSV3_MAX_FID_SIZE) + if (nfh.nfh_len > (uint32_t)fidlen) error = EOVERFLOW; nfh.nfh_xh.nxh_fidlen = nfh.nfh_len; nfh.nfh_len += sizeof(nfh.nfh_xh); @@ -465,7 +483,7 @@ out: vnode_put(vp); if (error) return (error); - error = copyout((caddr_t)&nfh, uap->fhp, sizeof(nfh)); + error = copyout((caddr_t)&nfh, uap->fhp, sizeof(fhandle_t)); return (error); } @@ -564,7 +582,7 @@ fhopen( proc_t p, if ((error = VNOP_OPEN(vp, fmode, ctx))) goto bad; - if ((error = vnode_ref_ext(vp, fmode))) + if ((error = vnode_ref_ext(vp, fmode, 0))) goto bad; /* @@ -714,8 +732,12 @@ nfssvc_addsock(socket_t so, mbuf_t mynam) sock_gettype(so, &sodomain, &sotype, &soprotocol); - /* There should be only one UDP socket */ - if ((soprotocol == IPPROTO_UDP) && nfsrv_udpsock) { + /* There should be only one UDP socket for each of IPv4 and IPv6 */ + if ((sodomain == AF_INET) && (soprotocol == IPPROTO_UDP) && nfsrv_udpsock) { + mbuf_freem(mynam); + return (EEXIST); + } + if ((sodomain == AF_INET6) && (soprotocol == IPPROTO_UDP) && nfsrv_udp6sock) { mbuf_freem(mynam); return (EEXIST); } @@ -763,14 +785,26 @@ nfssvc_addsock(socket_t so, mbuf_t mynam) lck_mtx_lock(nfsd_mutex); if (soprotocol == IPPROTO_UDP) { - /* There should be only one UDP socket */ - if (nfsrv_udpsock) { - lck_mtx_unlock(nfsd_mutex); - nfsrv_slpfree(slp); - mbuf_freem(mynam); - return (EEXIST); + if (sodomain == AF_INET) { + /* There should be only one UDP/IPv4 socket */ + if (nfsrv_udpsock) { + lck_mtx_unlock(nfsd_mutex); + nfsrv_slpfree(slp); + mbuf_freem(mynam); + return (EEXIST); + } + nfsrv_udpsock = slp; + } + if (sodomain == AF_INET6) { + /* There should be only one UDP/IPv6 socket */ + if (nfsrv_udp6sock) { + lck_mtx_unlock(nfsd_mutex); + nfsrv_slpfree(slp); + mbuf_freem(mynam); + return (EEXIST); + } + nfsrv_udp6sock = slp; } - nfsrv_udpsock = slp; } /* add the socket to the list */ @@ -782,11 +816,7 @@ 
nfssvc_addsock(socket_t so, mbuf_t mynam) slp->ns_nam = mynam; /* set up the socket upcall */ - socket_lock(so, 1); - so->so_upcallarg = (caddr_t)slp; - so->so_upcall = nfsrv_rcv; - so->so_rcv.sb_flags |= SB_UPCALL; - socket_unlock(so, 1); + sock_setupcall(so, nfsrv_rcv, slp); /* just playin' it safe */ sock_setsockopt(so, SOL_SOCKET, SO_UPCALLCLOSEWAIT, &on, sizeof(on)); @@ -978,6 +1008,8 @@ nfssvc_nfsd(void) mbuf_freem(nd->nd_nam2); if (IS_VALID_CRED(nd->nd_cr)) kauth_cred_unref(&nd->nd_cr); + if (nd->nd_gss_context) + nfs_gss_svc_ctx_deref(nd->nd_gss_context); FREE_ZONE(nd, sizeof(*nd), M_NFSRVDESC); nd = NULL; } @@ -1000,21 +1032,17 @@ nfssvc_nfsd(void) if (nfsrv_require_resv_port) { /* Check if source port is a reserved port */ - u_short port; - struct sockaddr *nam = mbuf_data(nd->nd_nam); - struct sockaddr_in *sin; - - sin = (struct sockaddr_in *)nam; - port = ntohs(sin->sin_port); - if (port >= IPPORT_RESERVED && - nd->nd_procnum != NFSPROC_NULL) { - char strbuf[MAX_IPv4_STR_LEN]; + in_port_t port = 0; + struct sockaddr *saddr = mbuf_data(nd->nd_nam); + + if (saddr->sa_family == AF_INET) + port = ntohs(((struct sockaddr_in*)saddr)->sin_port); + else if (saddr->sa_family == AF_INET6) + port = ntohs(((struct sockaddr_in6*)saddr)->sin6_port); + if ((port >= IPPORT_RESERVED) && (nd->nd_procnum != NFSPROC_NULL)) { nd->nd_procnum = NFSPROC_NOOP; nd->nd_repstat = (NFSERR_AUTHERR | AUTH_TOOWEAK); cacherep = RC_DOIT; - printf("NFS request from unprivileged port (%s:%d)\n", - inet_ntop(AF_INET, &sin->sin_addr, strbuf, sizeof(strbuf)), - port); } } @@ -1130,6 +1158,8 @@ nfssvc_nfsd(void) nfsm_chain_cleanup(&nd->nd_nmreq); if (IS_VALID_CRED(nd->nd_cr)) kauth_cred_unref(&nd->nd_cr); + if (nd->nd_gss_context) + nfs_gss_svc_ctx_deref(nd->nd_gss_context); FREE_ZONE(nd, sizeof(*nd), M_NFSRVDESC); nfsrv_slpderef(slp); lck_mtx_lock(nfsd_mutex); @@ -1148,6 +1178,8 @@ nfssvc_nfsd(void) mbuf_freem(nd->nd_nam2); if (IS_VALID_CRED(nd->nd_cr)) kauth_cred_unref(&nd->nd_cr); + if (nd->nd_gss_context) + nfs_gss_svc_ctx_deref(nd->nd_gss_context); FREE_ZONE(nd, sizeof(*nd), M_NFSRVDESC); nd = NULL; } @@ -1294,6 +1326,8 @@ nfsrv_slpfree(struct nfsrv_sock *slp) mbuf_freem(nwp->nd_nam2); if (IS_VALID_CRED(nwp->nd_cr)) kauth_cred_unref(&nwp->nd_cr); + if (nwp->nd_gss_context) + nfs_gss_svc_ctx_deref(nwp->nd_gss_context); FREE_ZONE(nwp, sizeof(*nwp), M_NFSRVDESC); } LIST_INIT(&slp->ns_tq); @@ -1455,10 +1489,12 @@ nfsrv_cleanup(void) * Fire off the content modified fsevent for each * entry, remove it from the list, and free it. */ - if (nfsrv_fsevents_enabled) + if (nfsrv_fsevents_enabled) { + fp->fm_context.vc_thread = current_thread(); add_fsevent(FSE_CONTENT_MODIFIED, &fp->fm_context, FSE_ARG_VNODE, fp->fm_vp, FSE_ARG_DONE); + } vnode_put(fp->fm_vp); kauth_cred_unref(&fp->fm_context.vc_ucred); nfp = LIST_NEXT(fp, fm_link); @@ -1475,6 +1511,7 @@ nfsrv_cleanup(void) nfsrv_cleancache(); /* And clear out server cache */ nfsrv_udpsock = NULL; + nfsrv_udp6sock = NULL; } #endif /* NFS_NOSERVER */ diff --git a/bsd/nfs/nfs_vfsops.c b/bsd/nfs/nfs_vfsops.c index e92c58cdf..7a0323fde 100644 --- a/bsd/nfs/nfs_vfsops.c +++ b/bsd/nfs/nfs_vfsops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -88,6 +88,7 @@ #include <sys/socketvar.h> #include <sys/fcntl.h> #include <sys/quota.h> +#include <sys/priv.h> #include <libkern/OSAtomic.h> #include <sys/vm.h> @@ -159,21 +160,29 @@ int nfs_max_async_writes = NFS_DEFMAXASYNCWRITES; int nfs_iosize = NFS_IOSIZE; int nfs_access_cache_timeout = NFS_MAXATTRTIMO; -int nfs_access_delete = 0; +int nfs_access_delete = 1; /* too many servers get this wrong - workaround on by default */ +int nfs_access_dotzfs = 1; +int nfs_access_for_getattr = 0; int nfs_allow_async = 0; int nfs_statfs_rate_limit = NFS_DEFSTATFSRATELIMIT; int nfs_lockd_mounts = 0; int nfs_lockd_request_sent = 0; +int nfs_idmap_ctrl = NFS_IDMAP_CTRL_USE_IDMAP_SERVICE; +int nfs_callback_port = 0; int nfs_tprintf_initial_delay = NFS_TPRINTF_INITIAL_DELAY; int nfs_tprintf_delay = NFS_TPRINTF_DELAY; -int mountnfs(struct user_nfs_args *,mount_t,mbuf_t,vfs_context_t,vnode_t *); +int mountnfs(char *, mount_t, vfs_context_t, vnode_t *); static int nfs_mount_diskless(struct nfs_dlmount *, const char *, int, vnode_t *, mount_t *, vfs_context_t); #if !defined(NO_MOUNT_PRIVATE) static int nfs_mount_diskless_private(struct nfs_dlmount *, const char *, int, vnode_t *, mount_t *, vfs_context_t); #endif /* NO_MOUNT_PRIVATE */ +int nfs_mount_connect(struct nfsmount *); +void nfs_mount_cleanup(struct nfsmount *); +int nfs_mountinfo_assemble(struct nfsmount *, struct xdrbuf *); +int nfs4_mount_update_path_with_symlink(struct nfsmount *, struct nfs_fs_path *, uint32_t, fhandle_t *, int *, fhandle_t *, vfs_context_t); /* * NFS VFS operations. @@ -218,8 +227,8 @@ struct vfsops nfs_vfsops = { /* * version-specific NFS functions */ -int nfs3_mount(struct nfsmount *, vfs_context_t, struct user_nfs_args *, nfsnode_t *); -int nfs4_mount(struct nfsmount *, vfs_context_t, struct user_nfs_args *, nfsnode_t *); +int nfs3_mount(struct nfsmount *, vfs_context_t, nfsnode_t *); +int nfs4_mount(struct nfsmount *, vfs_context_t, nfsnode_t *); int nfs3_fsinfo(struct nfsmount *, nfsnode_t, vfs_context_t); int nfs3_update_statfs(struct nfsmount *, vfs_context_t); int nfs4_update_statfs(struct nfsmount *, vfs_context_t); @@ -247,7 +256,10 @@ struct nfs_funcs nfs3_funcs = { nfs3_lookup_rpc_async, nfs3_lookup_rpc_async_finish, nfs3_remove_rpc, - nfs3_rename_rpc + nfs3_rename_rpc, + nfs3_setlock_rpc, + nfs3_unlock_rpc, + nfs3_getlock_rpc }; struct nfs_funcs nfs4_funcs = { nfs4_mount, @@ -265,7 +277,10 @@ struct nfs_funcs nfs4_funcs = { nfs4_lookup_rpc_async, nfs4_lookup_rpc_async_finish, nfs4_remove_rpc, - nfs4_rename_rpc + nfs4_rename_rpc, + nfs4_setlock_rpc, + nfs4_unlock_rpc, + nfs4_getlock_rpc }; /* @@ -358,8 +373,7 @@ nfs3_update_statfs(struct nfsmount *nmp, vfs_context_t ctx) nfsm_chain_add_fh(error, &nmreq, nfsvers, np->n_fhp, np->n_fhsize); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC_FSSTAT, ctx, - &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC_FSSTAT, ctx, NULL, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; if (nfsvers == NFS_VER3) @@ -418,6 +432,7 @@ nfs4_update_statfs(struct nfsmount *nmp, vfs_context_t ctx) struct nfsm_chain nmreq, nmrep; uint32_t bitmap[NFS_ATTR_BITMAP_LEN]; struct nfs_vattr nvattr; + struct nfsreq_secinfo_args si; nfsvers = nmp->nm_vers; np = nmp->nm_dnp; @@ -426,6 +441,8 @@ nfs4_update_statfs(struct nfsmount *nmp, vfs_context_t ctx) if ((error = vnode_get(NFSTOV(np)))) return (error); + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 
0); + NVATTR_INIT(&nvattr); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -440,12 +457,11 @@ nfs4_update_statfs(struct nfsmount *nmp, vfs_context_t ctx) nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); NFS_COPY_ATTRIBUTES(nfs_getattr_bitmap, bitmap); NFS4_STATFS_ATTRIBUTES(bitmap); - nfsm_chain_add_bitmap_masked(error, &nmreq, bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, bitmap, nmp, np); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC4_COMPOUND, ctx, &si, &nmrep, &xid, &status); nfsm_chain_skip_tag(error, &nmrep); nfsm_chain_get_32(error, &nmrep, numops); nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); @@ -453,8 +469,7 @@ nfs4_update_statfs(struct nfsmount *nmp, vfs_context_t ctx) nfsm_assert(error, NFSTONMP(np), ENXIO); nfsmout_if(error); lck_mtx_lock(&nmp->nm_lock); - NFS_CLEAR_ATTRIBUTES(nvattr.nva_bitmap); - error = nfs4_parsefattr(&nmrep, &nmp->nm_fsattr, &nvattr, NULL, NULL); + error = nfs4_parsefattr(&nmrep, &nmp->nm_fsattr, &nvattr, NULL, NULL, NULL); lck_mtx_unlock(&nmp->nm_lock); nfsmout_if(error); if ((lockerror = nfs_node_lock(np))) @@ -467,6 +482,7 @@ nfs4_update_statfs(struct nfsmount *nmp, vfs_context_t ctx) nfsmout_if(error); nmp->nm_fsattr.nfsa_bsize = NFS_FABLKSIZE; nfsmout: + NVATTR_CLEANUP(&nvattr); nfsm_chain_cleanup(&nmreq); nfsm_chain_cleanup(&nmrep); vnode_put(NFSTOV(np)); @@ -605,6 +621,8 @@ nfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t ctx) caps |= VOL_CAP_FMT_HIDDEN_FILES; valid |= VOL_CAP_FMT_HIDDEN_FILES; // VOL_CAP_FMT_OPENDENYMODES +// caps |= VOL_CAP_FMT_OPENDENYMODES; +// valid |= VOL_CAP_FMT_OPENDENYMODES; } fsap->f_capabilities.capabilities[VOL_CAPABILITIES_FORMAT] = // VOL_CAP_FMT_PERSISTENTOBJECTIDS | @@ -655,10 +673,18 @@ nfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t ctx) if (nfsvers >= NFS_VER4) { caps = VOL_CAP_INT_ADVLOCK | VOL_CAP_INT_FLOCK; valid = VOL_CAP_INT_ADVLOCK | VOL_CAP_INT_FLOCK; - // VOL_CAP_INT_EXTENDED_SECURITY - // VOL_CAP_INT_NAMEDSTREAMS - // VOL_CAP_INT_EXTENDED_ATTR - } else if ((nmp->nm_flag & NFSMNT_NOLOCKS)) { + if (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_ACL) + caps |= VOL_CAP_INT_EXTENDED_SECURITY; + valid |= VOL_CAP_INT_EXTENDED_SECURITY; + if (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR) + caps |= VOL_CAP_INT_EXTENDED_ATTR; + valid |= VOL_CAP_INT_EXTENDED_ATTR; +#if NAMEDSTREAMS + if (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR) + caps |= VOL_CAP_INT_NAMEDSTREAMS; + valid |= VOL_CAP_INT_NAMEDSTREAMS; +#endif + } else if (nmp->nm_lockmode == NFS_LOCK_MODE_DISABLED) { /* locks disabled on this mount, so they definitely won't work */ valid = VOL_CAP_INT_ADVLOCK | VOL_CAP_INT_FLOCK; } else if (nmp->nm_state & NFSSTA_LOCKSWORK) { @@ -681,6 +707,7 @@ nfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t ctx) // VOL_CAP_INT_MANLOCK | // VOL_CAP_INT_NAMEDSTREAMS | // VOL_CAP_INT_EXTENDED_ATTR | + VOL_CAP_INT_REMOTE_EVENT | caps; fsap->f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] = VOL_CAP_INT_SEARCHFS | @@ -698,6 +725,7 @@ nfs_vfs_getattr(mount_t mp, struct vfs_attr *fsap, vfs_context_t ctx) // VOL_CAP_INT_MANLOCK | // VOL_CAP_INT_NAMEDSTREAMS | // VOL_CAP_INT_EXTENDED_ATTR | + VOL_CAP_INT_REMOTE_EVENT | valid; fsap->f_capabilities.capabilities[VOL_CAPABILITIES_RESERVED1] = 0; @@ -749,8 +777,7 @@ 
nfs3_fsinfo(struct nfsmount *nmp, nfsnode_t np, vfs_context_t ctx) nfsm_chain_add_fh(error, &nmreq, nmp->nm_vers, np->n_fhp, np->n_fhsize); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC_FSINFO, ctx, - &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC_FSINFO, ctx, NULL, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; nfsm_chain_postop_attr_update(error, &nmrep, np, &xid); @@ -770,7 +797,7 @@ nfs3_fsinfo(struct nfsmount *nmp, nfsnode_t np, vfs_context_t ctx) if (prefsize < nmp->nm_rsize) nmp->nm_rsize = (prefsize + NFS_FABLKSIZE - 1) & ~(NFS_FABLKSIZE - 1); - if (maxsize < nmp->nm_rsize) { + if ((maxsize > 0) && (maxsize < nmp->nm_rsize)) { nmp->nm_rsize = maxsize & ~(NFS_FABLKSIZE - 1); if (nmp->nm_rsize == 0) nmp->nm_rsize = maxsize; @@ -784,7 +811,7 @@ nfs3_fsinfo(struct nfsmount *nmp, nfsnode_t np, vfs_context_t ctx) if (prefsize < nmp->nm_wsize) nmp->nm_wsize = (prefsize + NFS_FABLKSIZE - 1) & ~(NFS_FABLKSIZE - 1); - if (maxsize < nmp->nm_wsize) { + if ((maxsize > 0) && (maxsize < nmp->nm_wsize)) { nmp->nm_wsize = maxsize & ~(NFS_FABLKSIZE - 1); if (nmp->nm_wsize == 0) nmp->nm_wsize = maxsize; @@ -793,10 +820,11 @@ nfs3_fsinfo(struct nfsmount *nmp, nfsnode_t np, vfs_context_t ctx) nfsm_chain_get_32(error, &nmrep, prefsize); nfsmout_if(error); - if (prefsize < nmp->nm_readdirsize) + if ((prefsize > 0) && (prefsize < nmp->nm_readdirsize)) nmp->nm_readdirsize = prefsize; - if (maxsize < nmp->nm_readdirsize) - nmp->nm_readdirsize = maxsize; + if ((nmp->nm_fsattr.nfsa_maxread > 0) && + (nmp->nm_fsattr.nfsa_maxread < nmp->nm_readdirsize)) + nmp->nm_readdirsize = nmp->nm_fsattr.nfsa_maxread; nfsm_chain_get_64(error, &nmrep, nmp->nm_fsattr.nfsa_maxfilesize); @@ -846,7 +874,6 @@ int nfs_mountroot(void) { struct nfs_diskless nd; - struct nfs_vattr nvattr; mount_t mp = NULL; vnode_t vp = NULL; vfs_context_t ctx; @@ -864,9 +891,9 @@ nfs_mountroot(void) */ bzero((caddr_t) &nd, sizeof(nd)); error = nfs_boot_init(&nd); - if (error) { - panic("nfs_boot_init failed with %d\n", error); - } + if (error) + panic("nfs_boot_init: unable to initialize NFS root system information, " + "error %d, check configuration: %s\n", error, PE_boot_args()); /* * Try NFSv3 first, then fallback to NFSv2. 
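The retry blocks in the hunks that follow (the file-handle lookup and the actual root mount) walk the same fallback ladder: NFSv3 over TCP, then NFSv3 over UDP, then NFSv2 over TCP, then NFSv2 over UDP, panicking only after the last combination fails. A minimal sketch of that order, assuming a hypothetical try_mount() helper standing in for the real nfs_boot_getfh()/nfs_mount_diskless() sequence:

#include <stdio.h>
#include <sys/socket.h>

extern int try_mount(int v3, int sotype);	/* hypothetical stand-in, not a kernel function */

/* Sketch only: the (version, transport) fallback order used at boot. */
static int
try_all_mounts(void)
{
	static const struct { int v3; int sotype; } order[] = {
		{ 1, SOCK_STREAM },	/* NFSv3 over TCP */
		{ 1, SOCK_DGRAM },	/* NFSv3 over UDP */
		{ 0, SOCK_STREAM },	/* NFSv2 over TCP */
		{ 0, SOCK_DGRAM },	/* NFSv2 over UDP */
	};
	int i, error = 0;

	for (i = 0; i < 4; i++) {
		error = try_mount(order[i].v3, order[i].sotype);
		if (!error)
			return (0);	/* mounted */
		printf("NFS mount (v%d,%s) failed with error %d\n",
		    order[i].v3 ? 3 : 2,
		    (order[i].sotype == SOCK_STREAM) ? "TCP" : "UDP", error);
	}
	return (error);	/* caller panics with the last error */
}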
@@ -895,27 +922,29 @@ tryagain: } if (v3) { if (sotype == SOCK_STREAM) { - printf("nfs_boot_getfh(v3,TCP) failed with %d, trying UDP...\n", error); + printf("NFS mount (v3,TCP) failed with error %d, trying UDP...\n", error); sotype = SOCK_DGRAM; goto tryagain; } - printf("nfs_boot_getfh(v3,UDP) failed with %d, trying v2...\n", error); + printf("NFS mount (v3,UDP) failed with error %d, trying v2...\n", error); v3 = 0; sotype = SOCK_STREAM; goto tryagain; } else if (sotype == SOCK_STREAM) { - printf("nfs_boot_getfh(v2,TCP) failed with %d, trying UDP...\n", error); + printf("NFS mount (v2,TCP) failed with error %d, trying UDP...\n", error); sotype = SOCK_DGRAM; goto tryagain; + } else { + printf("NFS mount (v2,UDP) failed with error %d, giving up...\n", error); } switch(error) { case EPROGUNAVAIL: - panic("nfs_boot_getfh(v2,UDP) failed: NFS server mountd not responding - check server configuration: %s", PE_boot_args()); + panic("NFS mount failed: NFS server mountd not responding, check server configuration: %s", PE_boot_args()); case EACCES: case EPERM: - panic("nfs_boot_getfh(v2,UDP) failed: NFS server refused mount - check server configuration: %s", PE_boot_args()); + panic("NFS mount failed: NFS server refused mount, check server configuration: %s", PE_boot_args()); default: - panic("nfs_boot_getfh(v2,UDP) failed with %d: %s", error, PE_boot_args()); + panic("NFS mount failed with error %d, check configuration: %s", error, PE_boot_args()); } } @@ -943,20 +972,22 @@ tryagain: { if (v3) { if (sotype == SOCK_STREAM) { - printf("nfs_mount_diskless(v3,TCP) failed with %d, trying UDP...\n", error); + printf("NFS root mount (v3,TCP) failed with %d, trying UDP...\n", error); sotype = SOCK_DGRAM; goto tryagain; } - printf("nfs_mount_diskless(v3,UDP) failed with %d, trying v2...\n", error); + printf("NFS root mount (v3,UDP) failed with %d, trying v2...\n", error); v3 = 0; sotype = SOCK_STREAM; goto tryagain; } else if (sotype == SOCK_STREAM) { - printf("nfs_mount_diskless(v2,TCP) failed with %d, trying UDP...\n", error); + printf("NFS root mount (v2,TCP) failed with %d, trying UDP...\n", error); sotype = SOCK_DGRAM; goto tryagain; + } else { + printf("NFS root mount (v2,UDP) failed with error %d, giving up...\n", error); } - panic("nfs_mount_diskless(v2,UDP) root failed with %d: %s\n", error, PE_boot_args()); + panic("NFS root mount failed with error %d, check configuration: %s\n", error, PE_boot_args()); } } printf("root on %s\n", nd.nd_root.ndm_mntfrom); @@ -969,9 +1000,8 @@ tryagain: if (nd.nd_private.ndm_saddr.sin_addr.s_addr) { error = nfs_mount_diskless_private(&nd.nd_private, "/private", 0, &vppriv, &mppriv, ctx); - if (error) { - panic("nfs_mount_diskless private failed with %d\n", error); - } + if (error) + panic("NFS /private mount failed with error %d, check configuration: %s\n", error, PE_boot_args()); printf("private on %s\n", nd.nd_private.ndm_mntfrom); vfs_unbusy(mppriv); @@ -990,8 +1020,9 @@ tryagain: FREE_ZONE(nd.nd_private.ndm_path, MAXPATHLEN, M_NAMEI); /* Get root attributes (for the time). 
*/ - error = nfs_getattr(VTONFS(vp), &nvattr, ctx, NGA_UNCACHED); - if (error) panic("nfs_mountroot: getattr for root"); + error = nfs_getattr(VTONFS(vp), NULL, ctx, NGA_UNCACHED); + if (error) + panic("NFS mount: failed to get attributes for root directory, error %d, check server", error); return (0); } @@ -1007,13 +1038,18 @@ nfs_mount_diskless( mount_t *mpp, vfs_context_t ctx) { - struct user_nfs_args args; mount_t mp; - mbuf_t m; - int error; + int error, numcomps; + char *xdrbuf, *p, *cp, *frompath, *endserverp; + char uaddr[MAX_IPv4_STR_LEN]; + struct xdrbuf xb; + uint32_t mattrs[NFS_MATTR_BITMAP_LEN]; + uint32_t mflags_mask[NFS_MFLAG_BITMAP_LEN]; + uint32_t mflags[NFS_MFLAG_BITMAP_LEN]; + uint32_t argslength_offset, attrslength_offset, end_offset; if ((error = vfs_rootmountalloc("nfs", ndmntp->ndm_mntfrom, &mp))) { - printf("nfs_mount_diskless: NFS not configured"); + printf("nfs_mount_diskless: NFS not configured\n"); return (error); } @@ -1021,26 +1057,112 @@ nfs_mount_diskless( if (!(mntflag & MNT_RDONLY)) mp->mnt_flag &= ~MNT_RDONLY; - /* Initialize mount args. */ - bzero((caddr_t) &args, sizeof(args)); - args.addr = CAST_USER_ADDR_T(&ndmntp->ndm_saddr); - args.addrlen = ndmntp->ndm_saddr.sin_len; - args.sotype = ndmntp->ndm_sotype; - args.fh = CAST_USER_ADDR_T(&ndmntp->ndm_fh[0]); - args.fhsize = ndmntp->ndm_fhlen; - args.hostname = CAST_USER_ADDR_T(ndmntp->ndm_mntfrom); - args.flags = NFSMNT_RESVPORT; - if (ndmntp->ndm_nfsv3) - args.flags |= NFSMNT_NFSV3; - - error = mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &m); + /* find the server-side path being mounted */ + frompath = ndmntp->ndm_mntfrom; + if (*frompath == '[') { /* skip IPv6 literal address */ + while (*frompath && (*frompath != ']')) + frompath++; + if (*frompath == ']') + frompath++; + } + while (*frompath && (*frompath != ':')) + frompath++; + endserverp = frompath; + while (*frompath && (*frompath == ':')) + frompath++; + /* count fs location path components */ + p = frompath; + while (*p && (*p == '/')) + p++; + numcomps = 0; + while (*p) { + numcomps++; + while (*p && (*p != '/')) + p++; + while (*p && (*p == '/')) + p++; + } + + /* convert address to universal address string */ + if (inet_ntop(AF_INET, &ndmntp->ndm_saddr.sin_addr, uaddr, sizeof(uaddr)) != uaddr) { + printf("nfs_mount_diskless: bad address\n"); + return (EINVAL); + } + + /* prepare mount attributes */ + NFS_BITMAP_ZERO(mattrs, NFS_MATTR_BITMAP_LEN); + NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_VERSION); + NFS_BITMAP_SET(mattrs, NFS_MATTR_SOCKET_TYPE); + NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_PORT); + NFS_BITMAP_SET(mattrs, NFS_MATTR_FH); + NFS_BITMAP_SET(mattrs, NFS_MATTR_FS_LOCATIONS); + NFS_BITMAP_SET(mattrs, NFS_MATTR_MNTFLAGS); + + /* prepare mount flags */ + NFS_BITMAP_ZERO(mflags_mask, NFS_MFLAG_BITMAP_LEN); + NFS_BITMAP_ZERO(mflags, NFS_MFLAG_BITMAP_LEN); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_RESVPORT); + NFS_BITMAP_SET(mflags, NFS_MFLAG_RESVPORT); + + /* build xdr buffer */ + xb_init_buffer(&xb, NULL, 0); + xb_add_32(error, &xb, NFS_ARGSVERSION_XDR); + argslength_offset = xb_offset(&xb); + xb_add_32(error, &xb, 0); // args length + xb_add_32(error, &xb, NFS_XDRARGS_VERSION_0); + xb_add_bitmap(error, &xb, mattrs, NFS_MATTR_BITMAP_LEN); + attrslength_offset = xb_offset(&xb); + xb_add_32(error, &xb, 0); // attrs length + xb_add_32(error, &xb, ndmntp->ndm_nfsv3 ? 3 : 2); // NFS version + xb_add_string(error, &xb, ((ndmntp->ndm_sotype == SOCK_DGRAM) ? 
"udp" : "tcp"), 3); + xb_add_32(error, &xb, ntohs(ndmntp->ndm_saddr.sin_port)); // NFS port + xb_add_fh(error, &xb, &ndmntp->ndm_fh[0], ndmntp->ndm_fhlen); + /* fs location */ + xb_add_32(error, &xb, 1); /* fs location count */ + xb_add_32(error, &xb, 1); /* server count */ + xb_add_string(error, &xb, ndmntp->ndm_mntfrom, (endserverp - ndmntp->ndm_mntfrom)); /* server name */ + xb_add_32(error, &xb, 1); /* address count */ + xb_add_string(error, &xb, uaddr, strlen(uaddr)); /* address */ + xb_add_32(error, &xb, 0); /* empty server info */ + xb_add_32(error, &xb, numcomps); /* pathname component count */ + p = frompath; + while (*p && (*p == '/')) + p++; + while (*p) { + cp = p; + while (*p && (*p != '/')) + p++; + xb_add_string(error, &xb, cp, (p - cp)); /* component */ + if (error) + break; + while (*p && (*p == '/')) + p++; + } + xb_add_32(error, &xb, 0); /* empty fsl info */ + xb_add_32(error, &xb, mntflag); /* MNT flags */ + xb_build_done(error, &xb); + + /* update opaque counts */ + end_offset = xb_offset(&xb); + if (!error) { + error = xb_seek(&xb, argslength_offset); + xb_add_32(error, &xb, end_offset - argslength_offset + XDRWORD/*version*/); + } + if (!error) { + error = xb_seek(&xb, attrslength_offset); + xb_add_32(error, &xb, end_offset - attrslength_offset - XDRWORD/*don't include length field*/); + } if (error) { - printf("nfs_mount_diskless: mbuf_get(soname) failed"); + printf("nfs_mount_diskless: error %d assembling mount args\n", error); + xb_cleanup(&xb); return (error); } - mbuf_setlen(m, ndmntp->ndm_saddr.sin_len); - bcopy(&ndmntp->ndm_saddr, mbuf_data(m), ndmntp->ndm_saddr.sin_len); - if ((error = mountnfs(&args, mp, m, ctx, vpp))) { + /* grab the assembled buffer */ + xdrbuf = xb_buffer_base(&xb); + xb.xb_flags &= ~XB_CLEANUP; + + /* do the mount */ + if ((error = mountnfs(xdrbuf, mp, ctx, vpp))) { printf("nfs_mountroot: mount %s failed: %d\n", mntname, error); // XXX vfs_rootmountfailed(mp); mount_list_lock(); @@ -1052,10 +1174,11 @@ nfs_mount_diskless( mac_mount_label_destroy(mp); #endif FREE_ZONE(mp, sizeof(struct mount), M_MOUNT); - return (error); + } else { + *mpp = mp; } - *mpp = mp; - return (0); + xb_cleanup(&xb); + return (error); } #if !defined(NO_MOUNT_PRIVATE) @@ -1072,16 +1195,21 @@ nfs_mount_diskless_private( mount_t *mpp, vfs_context_t ctx) { - struct user_nfs_args args; mount_t mp; - mbuf_t m; - int error; + int error, numcomps; proc_t procp; struct vfstable *vfsp; struct nameidata nd; vnode_t vp; + char *xdrbuf = NULL, *p, *cp, *frompath, *endserverp; + char uaddr[MAX_IPv4_STR_LEN]; + struct xdrbuf xb; + uint32_t mattrs[NFS_MATTR_BITMAP_LEN]; + uint32_t mflags_mask[NFS_MFLAG_BITMAP_LEN], mflags[NFS_MFLAG_BITMAP_LEN]; + uint32_t argslength_offset, attrslength_offset, end_offset; procp = current_proc(); /* XXX */ + xb_init(&xb, 0); { /* @@ -1107,7 +1235,7 @@ nfs_mount_diskless_private( /* * Get vnode to be covered */ - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, + NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, CAST_USER_ADDR_T(mntname), ctx); if ((error = namei(&nd))) { printf("nfs_mountroot: private namei failed!\n"); @@ -1189,26 +1317,112 @@ nfs_mount_diskless_private( mac_mount_label_associate(ctx, mp); #endif - /* Initialize mount args. 
*/ - bzero((caddr_t) &args, sizeof(args)); - args.addr = CAST_USER_ADDR_T(&ndmntp->ndm_saddr); - args.addrlen = ndmntp->ndm_saddr.sin_len; - args.sotype = ndmntp->ndm_sotype; - args.fh = CAST_USER_ADDR_T(ndmntp->ndm_fh); - args.fhsize = ndmntp->ndm_fhlen; - args.hostname = CAST_USER_ADDR_T(ndmntp->ndm_mntfrom); - args.flags = NFSMNT_RESVPORT; - if (ndmntp->ndm_nfsv3) - args.flags |= NFSMNT_NFSV3; - - error = mbuf_get(MBUF_WAITOK, MBUF_TYPE_SONAME, &m); + /* find the server-side path being mounted */ + frompath = ndmntp->ndm_mntfrom; + if (*frompath == '[') { /* skip IPv6 literal address */ + while (*frompath && (*frompath != ']')) + frompath++; + if (*frompath == ']') + frompath++; + } + while (*frompath && (*frompath != ':')) + frompath++; + endserverp = frompath; + while (*frompath && (*frompath == ':')) + frompath++; + /* count fs location path components */ + p = frompath; + while (*p && (*p == '/')) + p++; + numcomps = 0; + while (*p) { + numcomps++; + while (*p && (*p != '/')) + p++; + while (*p && (*p == '/')) + p++; + } + + /* convert address to universal address string */ + if (inet_ntop(AF_INET, &ndmntp->ndm_saddr.sin_addr, uaddr, sizeof(uaddr)) != uaddr) { + printf("nfs_mountroot: bad address\n"); + error = EINVAL; + goto out; + } + + /* prepare mount attributes */ + NFS_BITMAP_ZERO(mattrs, NFS_MATTR_BITMAP_LEN); + NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_VERSION); + NFS_BITMAP_SET(mattrs, NFS_MATTR_SOCKET_TYPE); + NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_PORT); + NFS_BITMAP_SET(mattrs, NFS_MATTR_FH); + NFS_BITMAP_SET(mattrs, NFS_MATTR_FS_LOCATIONS); + NFS_BITMAP_SET(mattrs, NFS_MATTR_MNTFLAGS); + + /* prepare mount flags */ + NFS_BITMAP_ZERO(mflags_mask, NFS_MFLAG_BITMAP_LEN); + NFS_BITMAP_ZERO(mflags, NFS_MFLAG_BITMAP_LEN); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_RESVPORT); + NFS_BITMAP_SET(mflags, NFS_MFLAG_RESVPORT); + + /* build xdr buffer */ + xb_init_buffer(&xb, NULL, 0); + xb_add_32(error, &xb, NFS_ARGSVERSION_XDR); + argslength_offset = xb_offset(&xb); + xb_add_32(error, &xb, 0); // args length + xb_add_32(error, &xb, NFS_XDRARGS_VERSION_0); + xb_add_bitmap(error, &xb, mattrs, NFS_MATTR_BITMAP_LEN); + attrslength_offset = xb_offset(&xb); + xb_add_32(error, &xb, 0); // attrs length + xb_add_32(error, &xb, ndmntp->ndm_nfsv3 ? 3 : 2); // NFS version + xb_add_string(error, &xb, ((ndmntp->ndm_sotype == SOCK_DGRAM) ? 
"udp" : "tcp"), 3); + xb_add_32(error, &xb, ntohs(ndmntp->ndm_saddr.sin_port)); // NFS port + xb_add_fh(error, &xb, &ndmntp->ndm_fh[0], ndmntp->ndm_fhlen); + /* fs location */ + xb_add_32(error, &xb, 1); /* fs location count */ + xb_add_32(error, &xb, 1); /* server count */ + xb_add_string(error, &xb, ndmntp->ndm_mntfrom, (endserverp - ndmntp->ndm_mntfrom)); /* server name */ + xb_add_32(error, &xb, 1); /* address count */ + xb_add_string(error, &xb, uaddr, strlen(uaddr)); /* address */ + xb_add_32(error, &xb, 0); /* empty server info */ + xb_add_32(error, &xb, numcomps); /* pathname component count */ + p = frompath; + while (*p && (*p == '/')) + p++; + while (*p) { + cp = p; + while (*p && (*p != '/')) + p++; + xb_add_string(error, &xb, cp, (p - cp)); /* component */ + if (error) + break; + while (*p && (*p == '/')) + p++; + } + xb_add_32(error, &xb, 0); /* empty fsl info */ + xb_add_32(error, &xb, mntflag); /* MNT flags */ + xb_build_done(error, &xb); + + /* update opaque counts */ + end_offset = xb_offset(&xb); + if (!error) { + error = xb_seek(&xb, argslength_offset); + xb_add_32(error, &xb, end_offset - argslength_offset + XDRWORD/*version*/); + } + if (!error) { + error = xb_seek(&xb, attrslength_offset); + xb_add_32(error, &xb, end_offset - attrslength_offset - XDRWORD/*don't include length field*/); + } if (error) { - printf("nfs_mount_diskless_private: mbuf_get(soname) failed"); + printf("nfs_mountroot: error %d assembling mount args\n", error); goto out; } - mbuf_setlen(m, ndmntp->ndm_saddr.sin_len); - bcopy(&ndmntp->ndm_saddr, mbuf_data(m), ndmntp->ndm_saddr.sin_len); - if ((error = mountnfs(&args, mp, m, ctx, &vp))) { + /* grab the assembled buffer */ + xdrbuf = xb_buffer_base(&xb); + xb.xb_flags &= ~XB_CLEANUP; + + /* do the mount */ + if ((error = mountnfs(xdrbuf, mp, ctx, &vp))) { printf("nfs_mountroot: mount %s failed: %d\n", mntname, error); mount_list_lock(); vfsp->vfc_refcount--; @@ -1225,63 +1439,65 @@ nfs_mount_diskless_private( *mpp = mp; *vpp = vp; out: + xb_cleanup(&xb); return (error); } #endif /* NO_MOUNT_PRIVATE */ /* - * VFS Operations. - * - * mount system call + * Convert old style NFS mount args to XDR. */ -int -nfs_vfs_mount(mount_t mp, vnode_t vp, user_addr_t data, vfs_context_t ctx) +static int +nfs_convert_old_nfs_args(mount_t mp, user_addr_t data, vfs_context_t ctx, int argsversion, int inkernel, char **xdrbufp) { - int error, argsvers; + int error = 0, args64bit, argsize, numcomps; struct user_nfs_args args; struct nfs_args tempargs; - mbuf_t nam; + caddr_t argsp; size_t len; - u_char nfh[NFSX_V3FHMAX]; - char *mntfrom; - - error = copyin(data, (caddr_t)&argsvers, sizeof (argsvers)); - if (error) - return (error); - - switch (argsvers) { + u_char nfh[NFS4_FHSIZE]; + char *mntfrom, *endserverp, *frompath, *p, *cp; + struct sockaddr_storage ss; + void *sinaddr; + char uaddr[MAX_IPv6_STR_LEN]; + uint32_t mattrs[NFS_MATTR_BITMAP_LEN]; + uint32_t mflags_mask[NFS_MFLAG_BITMAP_LEN], mflags[NFS_MFLAG_BITMAP_LEN]; + uint32_t nfsvers, nfslockmode = 0, argslength_offset, attrslength_offset, end_offset; + struct xdrbuf xb; + + *xdrbufp = NULL; + + /* allocate a temporary buffer for mntfrom */ + MALLOC_ZONE(mntfrom, char*, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (!mntfrom) + return (ENOMEM); + + args64bit = (inkernel || vfs_context_is64bit(ctx)); + argsp = args64bit ? (void*)&args : (void*)&tempargs; + + argsize = args64bit ? 
sizeof(args) : sizeof(tempargs); + switch (argsversion) { case 3: - if (vfs_context_is64bit(ctx)) - error = copyin(data, (caddr_t)&args, sizeof (struct user_nfs_args3)); - else - error = copyin(data, (caddr_t)&tempargs, sizeof (struct nfs_args3)); - break; + argsize -= NFS_ARGSVERSION4_INCSIZE; case 4: - if (vfs_context_is64bit(ctx)) - error = copyin(data, (caddr_t)&args, sizeof (struct user_nfs_args4)); - else - error = copyin(data, (caddr_t)&tempargs, sizeof (struct nfs_args4)); - break; + argsize -= NFS_ARGSVERSION5_INCSIZE; case 5: - if (vfs_context_is64bit(ctx)) - error = copyin(data, (caddr_t)&args, sizeof (struct user_nfs_args5)); - else - error = copyin(data, (caddr_t)&tempargs, sizeof (struct nfs_args5)); - break; + argsize -= NFS_ARGSVERSION6_INCSIZE; case 6: - if (vfs_context_is64bit(ctx)) - error = copyin(data, (caddr_t)&args, sizeof (args)); - else - error = copyin(data, (caddr_t)&tempargs, sizeof (tempargs)); break; default: - return (EPROGMISMATCH); + error = EPROGMISMATCH; + goto nfsmout; } - if (error) - return (error); - if (!vfs_context_is64bit(ctx)) { - args.version = tempargs.version; + /* read in the structure */ + if (inkernel) + bcopy(CAST_DOWN(void *, data), argsp, argsize); + else + error = copyin(data, argsp, argsize); + nfsmout_if(error); + + if (!args64bit) { args.addrlen = tempargs.addrlen; args.sotype = tempargs.sotype; args.proto = tempargs.proto; @@ -1299,39 +1515,357 @@ nfs_vfs_mount(mount_t mp, vnode_t vp, user_addr_t data, vfs_context_t ctx) args.addr = CAST_USER_ADDR_T(tempargs.addr); args.fh = CAST_USER_ADDR_T(tempargs.fh); args.hostname = CAST_USER_ADDR_T(tempargs.hostname); - if (argsvers >= 4) { + if (args.version >= 4) { args.acregmin = tempargs.acregmin; args.acregmax = tempargs.acregmax; args.acdirmin = tempargs.acdirmin; args.acdirmax = tempargs.acdirmax; } - if (argsvers >= 5) + if (args.version >= 5) args.auth = tempargs.auth; - if (argsvers >= 6) + if (args.version >= 6) args.deadtimeout = tempargs.deadtimeout; } - if (args.fhsize < 0 || args.fhsize > NFSX_V3FHMAX) - return (EINVAL); + if ((args.fhsize < 0) || (args.fhsize > NFS4_FHSIZE)) { + error = EINVAL; + goto nfsmout; + } if (args.fhsize > 0) { - error = copyin(args.fh, (caddr_t)nfh, args.fhsize); - if (error) - return (error); + if (inkernel) + bcopy(CAST_DOWN(void *, args.fh), (caddr_t)nfh, args.fhsize); + else + error = copyin(args.fh, (caddr_t)nfh, args.fhsize); + nfsmout_if(error); } - mntfrom = &vfs_statfs(mp)->f_mntfromname[0]; - error = copyinstr(args.hostname, mntfrom, MAXPATHLEN-1, &len); - if (error) - return (error); + if (inkernel) + error = copystr(CAST_DOWN(void *, args.hostname), mntfrom, MAXPATHLEN-1, &len); + else + error = copyinstr(args.hostname, mntfrom, MAXPATHLEN-1, &len); + nfsmout_if(error); bzero(&mntfrom[len], MAXPATHLEN - len); - /* sockargs() call must be after above copyin() calls */ - error = sockargs(&nam, args.addr, args.addrlen, MBUF_TYPE_SONAME); - if (error) + /* find the server-side path being mounted */ + frompath = mntfrom; + if (*frompath == '[') { /* skip IPv6 literal address */ + while (*frompath && (*frompath != ']')) + frompath++; + if (*frompath == ']') + frompath++; + } + while (*frompath && (*frompath != ':')) + frompath++; + endserverp = frompath; + while (*frompath && (*frompath == ':')) + frompath++; + /* count fs location path components */ + p = frompath; + while (*p && (*p == '/')) + p++; + numcomps = 0; + while (*p) { + numcomps++; + while (*p && (*p != '/')) + p++; + while (*p && (*p == '/')) + p++; + } + + /* copy socket address */ 
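/*
 * Note: as with the args structure, file handle, and hostname copied in
 * above, "inkernel" callers pass kernel-space pointers (hence the
 * CAST_DOWN + bcopy path), while arguments from a user-space mount(2)
 * must come through copyin().
 */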
+ if (inkernel) + bcopy(CAST_DOWN(void *, args.addr), &ss, args.addrlen); + else + error = copyin(args.addr, &ss, args.addrlen); + nfsmout_if(error); + ss.ss_len = args.addrlen; + + /* convert address to universal address string */ + if (ss.ss_family == AF_INET) + sinaddr = &((struct sockaddr_in*)&ss)->sin_addr; + else if (ss.ss_family == AF_INET6) + sinaddr = &((struct sockaddr_in6*)&ss)->sin6_addr; + else + sinaddr = NULL; + if (!sinaddr || (inet_ntop(ss.ss_family, sinaddr, uaddr, sizeof(uaddr)) != uaddr)) { + error = EINVAL; + goto nfsmout; + } + + /* prepare mount flags */ + NFS_BITMAP_ZERO(mflags_mask, NFS_MFLAG_BITMAP_LEN); + NFS_BITMAP_ZERO(mflags, NFS_MFLAG_BITMAP_LEN); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_SOFT); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_INTR); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_RESVPORT); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NOCONNECT); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_DUMBTIMER); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_CALLUMNT); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_RDIRPLUS); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NONEGNAMECACHE); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_MUTEJUKEBOX); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NOQUOTA); + if (args.flags & NFSMNT_SOFT) + NFS_BITMAP_SET(mflags, NFS_MFLAG_SOFT); + if (args.flags & NFSMNT_INT) + NFS_BITMAP_SET(mflags, NFS_MFLAG_INTR); + if (args.flags & NFSMNT_RESVPORT) + NFS_BITMAP_SET(mflags, NFS_MFLAG_RESVPORT); + if (args.flags & NFSMNT_NOCONN) + NFS_BITMAP_SET(mflags, NFS_MFLAG_NOCONNECT); + if (args.flags & NFSMNT_DUMBTIMR) + NFS_BITMAP_SET(mflags, NFS_MFLAG_DUMBTIMER); + if (args.flags & NFSMNT_CALLUMNT) + NFS_BITMAP_SET(mflags, NFS_MFLAG_CALLUMNT); + if (args.flags & NFSMNT_RDIRPLUS) + NFS_BITMAP_SET(mflags, NFS_MFLAG_RDIRPLUS); + if (args.flags & NFSMNT_NONEGNAMECACHE) + NFS_BITMAP_SET(mflags, NFS_MFLAG_NONEGNAMECACHE); + if (args.flags & NFSMNT_MUTEJUKEBOX) + NFS_BITMAP_SET(mflags, NFS_MFLAG_MUTEJUKEBOX); + if (args.flags & NFSMNT_NOQUOTA) + NFS_BITMAP_SET(mflags, NFS_MFLAG_NOQUOTA); + + /* prepare mount attributes */ + NFS_BITMAP_ZERO(mattrs, NFS_MATTR_BITMAP_LEN); + NFS_BITMAP_SET(mattrs, NFS_MATTR_FLAGS); + NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_VERSION); + NFS_BITMAP_SET(mattrs, NFS_MATTR_SOCKET_TYPE); + NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_PORT); + NFS_BITMAP_SET(mattrs, NFS_MATTR_FH); + NFS_BITMAP_SET(mattrs, NFS_MATTR_FS_LOCATIONS); + NFS_BITMAP_SET(mattrs, NFS_MATTR_MNTFLAGS); + NFS_BITMAP_SET(mattrs, NFS_MATTR_MNTFROM); + if (args.flags & NFSMNT_NFSV4) + nfsvers = 4; + else if (args.flags & NFSMNT_NFSV3) + nfsvers = 3; + else + nfsvers = 2; + if ((args.flags & NFSMNT_RSIZE) && (args.rsize > 0)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_READ_SIZE); + if ((args.flags & NFSMNT_WSIZE) && (args.wsize > 0)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_WRITE_SIZE); + if ((args.flags & NFSMNT_TIMEO) && (args.timeo > 0)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_REQUEST_TIMEOUT); + if ((args.flags & NFSMNT_RETRANS) && (args.retrans > 0)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_SOFT_RETRY_COUNT); + if ((args.flags & NFSMNT_MAXGRPS) && (args.maxgrouplist > 0)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_MAX_GROUP_LIST); + if ((args.flags & NFSMNT_READAHEAD) && (args.readahead > 0)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_READAHEAD); + if ((args.flags & NFSMNT_READDIRSIZE) && (args.readdirsize > 0)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_READDIR_SIZE); + if ((args.flags & NFSMNT_NOLOCKS) || + (args.flags & NFSMNT_LOCALLOCKS)) { + NFS_BITMAP_SET(mattrs, NFS_MATTR_LOCK_MODE); + if (args.flags & NFSMNT_NOLOCKS) + nfslockmode = 
NFS_LOCK_MODE_DISABLED; + else if (args.flags & NFSMNT_LOCALLOCKS) + nfslockmode = NFS_LOCK_MODE_LOCAL; + else + nfslockmode = NFS_LOCK_MODE_ENABLED; + } + if (args.version >= 4) { + if ((args.flags & NFSMNT_ACREGMIN) && (args.acregmin > 0)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_ATTRCACHE_REG_MIN); + if ((args.flags & NFSMNT_ACREGMAX) && (args.acregmax > 0)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_ATTRCACHE_REG_MAX); + if ((args.flags & NFSMNT_ACDIRMIN) && (args.acdirmin > 0)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_ATTRCACHE_DIR_MIN); + if ((args.flags & NFSMNT_ACDIRMAX) && (args.acdirmax > 0)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_ATTRCACHE_DIR_MAX); + } + if (args.version >= 5) { + if ((args.flags & NFSMNT_SECFLAVOR) || (args.flags & NFSMNT_SECSYSOK)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_SECURITY); + } + if (args.version >= 6) { + if ((args.flags & NFSMNT_DEADTIMEOUT) && (args.deadtimeout > 0)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_DEAD_TIMEOUT); + } + + /* build xdr buffer */ + xb_init_buffer(&xb, NULL, 0); + xb_add_32(error, &xb, args.version); + argslength_offset = xb_offset(&xb); + xb_add_32(error, &xb, 0); // args length + xb_add_32(error, &xb, NFS_XDRARGS_VERSION_0); + xb_add_bitmap(error, &xb, mattrs, NFS_MATTR_BITMAP_LEN); + attrslength_offset = xb_offset(&xb); + xb_add_32(error, &xb, 0); // attrs length + xb_add_bitmap(error, &xb, mflags_mask, NFS_MFLAG_BITMAP_LEN); /* mask */ + xb_add_bitmap(error, &xb, mflags, NFS_MFLAG_BITMAP_LEN); /* value */ + xb_add_32(error, &xb, nfsvers); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_READ_SIZE)) + xb_add_32(error, &xb, args.rsize); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_WRITE_SIZE)) + xb_add_32(error, &xb, args.wsize); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_READDIR_SIZE)) + xb_add_32(error, &xb, args.readdirsize); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_READAHEAD)) + xb_add_32(error, &xb, args.readahead); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_ATTRCACHE_REG_MIN)) { + xb_add_32(error, &xb, args.acregmin); + xb_add_32(error, &xb, 0); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_ATTRCACHE_REG_MAX)) { + xb_add_32(error, &xb, args.acregmax); + xb_add_32(error, &xb, 0); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_ATTRCACHE_DIR_MIN)) { + xb_add_32(error, &xb, args.acdirmin); + xb_add_32(error, &xb, 0); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_ATTRCACHE_DIR_MAX)) { + xb_add_32(error, &xb, args.acdirmax); + xb_add_32(error, &xb, 0); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_LOCK_MODE)) + xb_add_32(error, &xb, nfslockmode); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SECURITY)) { + uint32_t flavors[2], i=0; + if (args.flags & NFSMNT_SECFLAVOR) + flavors[i++] = args.auth; + if ((args.flags & NFSMNT_SECSYSOK) && ((i == 0) || (flavors[0] != RPCAUTH_SYS))) + flavors[i++] = RPCAUTH_SYS; + xb_add_word_array(error, &xb, flavors, i); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MAX_GROUP_LIST)) + xb_add_32(error, &xb, args.maxgrouplist); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SOCKET_TYPE)) + xb_add_string(error, &xb, ((args.sotype == SOCK_DGRAM) ? "udp" : "tcp"), 3); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_NFS_PORT)) + xb_add_32(error, &xb, ((ss.ss_family == AF_INET) ? 
+ ntohs(((struct sockaddr_in*)&ss)->sin_port) : + ntohs(((struct sockaddr_in6*)&ss)->sin6_port))); + /* NFS_MATTR_MOUNT_PORT (not available in old args) */ + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_REQUEST_TIMEOUT)) { + /* convert from .1s increments to time */ + xb_add_32(error, &xb, args.timeo/10); + xb_add_32(error, &xb, (args.timeo%10)*100000000); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SOFT_RETRY_COUNT)) + xb_add_32(error, &xb, args.retrans); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_DEAD_TIMEOUT)) { + xb_add_32(error, &xb, args.deadtimeout); + xb_add_32(error, &xb, 0); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_FH)) + xb_add_fh(error, &xb, &nfh[0], args.fhsize); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_FS_LOCATIONS)) { + xb_add_32(error, &xb, 1); /* fs location count */ + xb_add_32(error, &xb, 1); /* server count */ + xb_add_string(error, &xb, mntfrom, (endserverp - mntfrom)); /* server name */ + xb_add_32(error, &xb, 1); /* address count */ + xb_add_string(error, &xb, uaddr, strlen(uaddr)); /* address */ + xb_add_32(error, &xb, 0); /* empty server info */ + xb_add_32(error, &xb, numcomps); /* pathname component count */ + nfsmout_if(error); + p = frompath; + while (*p && (*p == '/')) + p++; + while (*p) { + cp = p; + while (*p && (*p != '/')) + p++; + xb_add_string(error, &xb, cp, (p - cp)); /* component */ + nfsmout_if(error); + while (*p && (*p == '/')) + p++; + } + xb_add_32(error, &xb, 0); /* empty fsl info */ + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MNTFLAGS)) + xb_add_32(error, &xb, (vfs_flags(mp) & MNT_VISFLAGMASK)); /* VFS MNT_* flags */ + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MNTFROM)) + xb_add_string(error, &xb, mntfrom, strlen(mntfrom)); /* fixed f_mntfromname */ + xb_build_done(error, &xb); + + /* update opaque counts */ + end_offset = xb_offset(&xb); + error = xb_seek(&xb, argslength_offset); + xb_add_32(error, &xb, end_offset - argslength_offset + XDRWORD/*version*/); + nfsmout_if(error); + error = xb_seek(&xb, attrslength_offset); + xb_add_32(error, &xb, end_offset - attrslength_offset - XDRWORD/*don't include length field*/); + + if (!error) { + /* grab the assembled buffer */ + *xdrbufp = xb_buffer_base(&xb); + xb.xb_flags &= ~XB_CLEANUP; + } +nfsmout: + xb_cleanup(&xb); + FREE_ZONE(mntfrom, MAXPATHLEN, M_NAMEI); + return (error); +} + +/* + * VFS Operations. 
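+ *
+ * nfs_vfs_mount() below accepts either one of the old-style nfs_args
+ * structures (versions 3 through 6), which it converts to the new XDR
+ * args format with nfs_convert_old_nfs_args() above, or a ready-made
+ * XDR args buffer (NFS_ARGSVERSION_XDR), which it copies in directly.
+ * Either way, mountnfs() is ultimately handed an XDR args buffer.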
+ * + * mount system call + */ +int +nfs_vfs_mount(mount_t mp, vnode_t vp, user_addr_t data, vfs_context_t ctx) +{ + int error = 0, inkernel = vfs_iskernelmount(mp); + uint32_t argsversion, argslength; + char *xdrbuf = NULL; + + /* read in version */ + if (inkernel) + bcopy(CAST_DOWN(void *, data), &argsversion, sizeof(argsversion)); + else if ((error = copyin(data, &argsversion, sizeof(argsversion)))) return (error); - args.fh = CAST_USER_ADDR_T(&nfh[0]); - error = mountnfs(&args, mp, nam, ctx, &vp); + /* If we have XDR args, then all values in the buffer are in network order */ + if (argsversion == htonl(NFS_ARGSVERSION_XDR)) + argsversion = NFS_ARGSVERSION_XDR; + + switch (argsversion) { + case 3: + case 4: + case 5: + case 6: + /* convert old-style args to xdr */ + error = nfs_convert_old_nfs_args(mp, data, ctx, argsversion, inkernel, &xdrbuf); + break; + case NFS_ARGSVERSION_XDR: + /* copy in xdr buffer */ + if (inkernel) + bcopy(CAST_DOWN(void *, (data + XDRWORD)), &argslength, XDRWORD); + else + error = copyin((data + XDRWORD), &argslength, XDRWORD); + if (error) + break; + argslength = ntohl(argslength); + /* put a reasonable limit on the size of the XDR args */ + if (argslength > 16*1024) { + error = E2BIG; + break; + } + /* allocate xdr buffer */ + xdrbuf = xb_malloc(xdr_rndup(argslength)); + if (!xdrbuf) { + error = ENOMEM; + break; + } + if (inkernel) + bcopy(CAST_DOWN(void *, data), xdrbuf, argslength); + else + error = copyin(data, xdrbuf, argslength); + break; + default: + error = EPROGMISMATCH; + } + + if (error) { + if (xdrbuf) + xb_free(xdrbuf); + return (error); + } + error = mountnfs(xdrbuf, mp, ctx, &vp); return (error); } @@ -1339,32 +1873,33 @@ nfs_vfs_mount(mount_t mp, vnode_t vp, user_addr_t data, vfs_context_t ctx) * Common code for mount and mountroot */ +/* Set up an NFSv2/v3 mount */ int nfs3_mount( struct nfsmount *nmp, vfs_context_t ctx, - struct user_nfs_args *argp, nfsnode_t *npp) { int error = 0; struct nfs_vattr nvattr; u_int64_t xid; - u_char *fhp; *npp = NULL; + if (!nmp->nm_fh) + return (EINVAL); + /* * Get file attributes for the mountpoint. These are needed * in order to properly create the root vnode. */ - fhp = CAST_DOWN(u_char *, argp->fh); - error = nfs3_getattr_rpc(NULL, nmp->nm_mountp, fhp, argp->fhsize, + error = nfs3_getattr_rpc(NULL, nmp->nm_mountp, nmp->nm_fh->fh_data, nmp->nm_fh->fh_len, 0, ctx, &nvattr, &xid); if (error) goto out; - error = nfs_nget(nmp->nm_mountp, NULL, NULL, fhp, argp->fhsize, - &nvattr, &xid, NG_MARKROOT, npp); + error = nfs_nget(nmp->nm_mountp, NULL, NULL, nmp->nm_fh->fh_data, nmp->nm_fh->fh_len, + &nvattr, &xid, RPCAUTH_UNKNOWN, NG_MARKROOT, npp); if (*npp) nfs_node_unlock(*npp); if (error) @@ -1403,325 +1938,1150 @@ out: return (error); } +/* + * Update an NFSv4 mount path with the contents of the symlink. + * + * Read the link for the given file handle. + * Insert the link's components into the path. 
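+ *
+ * For example (illustrative values): if the path being walked is
+ * "a/b/c" and component "b" turns out to be a symlink to "x/y", the
+ * remaining path becomes "x/y/c", resolved relative to the directory
+ * that contained "b"; if the link is absolute ("/x/y"), the caller's
+ * directory file handle is reset so traversal restarts at the
+ * server's root.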
+ */ int -nfs4_mount( - struct nfsmount *nmp, - vfs_context_t ctx, - __unused struct user_nfs_args *argp, - nfsnode_t *npp) +nfs4_mount_update_path_with_symlink(struct nfsmount *nmp, struct nfs_fs_path *nfsp, uint32_t curcomp, fhandle_t *dirfhp, int *depthp, fhandle_t *fhp, vfs_context_t ctx) { - struct nfsm_chain nmreq, nmrep; - int error = 0, numops, status, interval; - char *path = &vfs_statfs(nmp->nm_mountp)->f_mntfromname[0]; - char *name, *nextname; - fhandle_t fh; - struct nfs_vattr nvattr; + int error = 0, status, numops; + uint32_t len = 0, comp, newcomp, linkcompcount; u_int64_t xid; + struct nfsm_chain nmreq, nmrep; + struct nfsreq rq, *req = &rq; + struct nfsreq_secinfo_args si; + char *link = NULL, *p, *q, ch; + struct nfs_fs_path nfsp2; + + bzero(&nfsp2, sizeof(nfsp2)); + if (dirfhp->fh_len) + NFSREQ_SECINFO_SET(&si, NULL, dirfhp->fh_data, dirfhp->fh_len, nfsp->np_components[curcomp], 0); + else + NFSREQ_SECINFO_SET(&si, NULL, NULL, 0, nfsp->np_components[curcomp], 0); + nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); - *npp = NULL; - fh.fh_len = 0; - TAILQ_INIT(&nmp->nm_open_owners); - TAILQ_INIT(&nmp->nm_recallq); - nmp->nm_stategenid = 1; + MALLOC_ZONE(link, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (!link) + error = ENOMEM; - /* look up path to get fh and attrs for mount point root */ - numops = 2; // PUTROOTFH + LOOKUP* + GETATTR - while (*path && (*path != '/')) - path++; - name = path; - while (*name) { - while (*name && (*name == '/')) - name++; - if (!*name) - break; - nextname = name; - while (*nextname && (*nextname != '/')) - nextname++; - numops++; - name = nextname; - } - nfsm_chain_build_alloc_init(error, &nmreq, 25 * NFSX_UNSIGNED); - nfsm_chain_add_compound_header(error, &nmreq, "mount", numops); + // PUTFH, READLINK + numops = 2; + nfsm_chain_build_alloc_init(error, &nmreq, 12 * NFSX_UNSIGNED); + nfsm_chain_add_compound_header(error, &nmreq, "readlink", numops); numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTROOTFH); - // (LOOKUP)* - name = path; - while (*name) { - while (*name && (*name == '/')) - name++; - if (!*name) - break; - nextname = name; - while (*nextname && (*nextname != '/')) - nextname++; - numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_LOOKUP); - nfsm_chain_add_string(error, &nmreq, name, nextname - name); - name = nextname; - } + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, NFS_VER4, fhp->fh_data, fhp->fh_len); numops--; - nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); - NFS4_DEFAULT_ATTRIBUTES(nmp->nm_fsattr.nfsa_supp_attr); - NFS_BITMAP_SET(nmp->nm_fsattr.nfsa_supp_attr, NFS_FATTR_FILEHANDLE); - nfsm_chain_add_bitmap(error, &nmreq, nmp->nm_fsattr.nfsa_supp_attr, NFS_ATTR_BITMAP_LEN); + nfsm_chain_add_32(error, &nmreq, NFS_OP_READLINK); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, ctx, &nmrep, &xid, &status); + + error = nfs_request_async(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, + vfs_context_thread(ctx), vfs_context_ucred(ctx), &si, 0, NULL, &req); + if (!error) + error = nfs_request_async_finish(req, &nmrep, &xid, &status); + nfsm_chain_skip_tag(error, &nmrep); nfsm_chain_get_32(error, &nmrep, numops); - nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTROOTFH); - name = path; - while (*name) { - while (*name && (*name == '/')) - name++; - if (!*name) - break; - nextname = name; - while (*nextname && (*nextname != '/')) - nextname++; - 
nfsm_chain_op_check(error, &nmrep, NFS_OP_LOOKUP);
- name = nextname;
- }
- nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR);
- nfsmout_if(error);
- NFS_CLEAR_ATTRIBUTES(nmp->nm_fsattr.nfsa_bitmap);
- NFS_CLEAR_ATTRIBUTES(&nvattr.nva_bitmap);
- error = nfs4_parsefattr(&nmrep, &nmp->nm_fsattr, &nvattr, &fh, NULL);
- if (!error && !NFS_BITMAP_ISSET(&nvattr.nva_bitmap, NFS_FATTR_FILEHANDLE)) {
- printf("nfs: mount didn't return filehandle?\n");
- error = EBADRPC;
- }
+ nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH);
+ nfsm_chain_op_check(error, &nmrep, NFS_OP_READLINK);
+ nfsm_chain_get_32(error, &nmrep, len);
 nfsmout_if(error);
-
- error = nfs_nget(nmp->nm_mountp, NULL, NULL, fh.fh_data, fh.fh_len, &nvattr, &xid, NG_MARKROOT, npp);
+ if (len == 0)
+ error = ENOENT;
+ else if (len >= MAXPATHLEN)
+ len = MAXPATHLEN - 1;
+ nfsm_chain_get_opaque(error, &nmrep, len, link);
 nfsmout_if(error);
+ /* make sure link string is terminated properly */
+ link[len] = '\0';
+
+ /* count the number of components in link */
+ p = link;
+ while (*p && (*p == '/'))
+ p++;
+ linkcompcount = 0;
+ while (*p) {
+ linkcompcount++;
+ while (*p && (*p != '/'))
+ p++;
+ while (*p && (*p == '/'))
+ p++;
+ }
- /* adjust I/O sizes to server limits */
- if (NFS_BITMAP_ISSET(nmp->nm_fsattr.nfsa_bitmap, NFS_FATTR_MAXREAD)) {
- if (nmp->nm_fsattr.nfsa_maxread < (uint64_t)nmp->nm_rsize) {
- nmp->nm_rsize = nmp->nm_fsattr.nfsa_maxread & ~(NFS_FABLKSIZE - 1);
- if (nmp->nm_rsize == 0)
- nmp->nm_rsize = nmp->nm_fsattr.nfsa_maxread;
+ /* free up used components */
+ for (comp=0; comp <= curcomp; comp++) {
+ if (nfsp->np_components[comp]) {
+ FREE(nfsp->np_components[comp], M_TEMP);
+ nfsp->np_components[comp] = NULL;
 }
 }
- if (NFS_BITMAP_ISSET(nmp->nm_fsattr.nfsa_bitmap, NFS_FATTR_MAXWRITE)) {
- if (nmp->nm_fsattr.nfsa_maxwrite < (uint64_t)nmp->nm_wsize) {
- nmp->nm_wsize = nmp->nm_fsattr.nfsa_maxwrite & ~(NFS_FABLKSIZE - 1);
- if (nmp->nm_wsize == 0)
- nmp->nm_wsize = nmp->nm_fsattr.nfsa_maxwrite;
+
+ /* set up new path */
+ nfsp2.np_compcount = nfsp->np_compcount - curcomp - 1 + linkcompcount;
+ MALLOC(nfsp2.np_components, char **, nfsp2.np_compcount*sizeof(char*), M_TEMP, M_WAITOK|M_ZERO);
+ if (!nfsp2.np_components) {
+ error = ENOMEM;
+ goto nfsmout;
+ }
+
+ /* add link components */
+ p = link;
+ while (*p && (*p == '/'))
+ p++;
+ for (newcomp=0; newcomp < linkcompcount; newcomp++) {
+ /* find end of component */
+ q = p;
+ while (*q && (*q != '/'))
+ q++;
+ MALLOC(nfsp2.np_components[newcomp], char *, q-p+1, M_TEMP, M_WAITOK|M_ZERO);
+ if (!nfsp2.np_components[newcomp]) {
+ error = ENOMEM;
+ break;
 }
+ ch = *q;
+ *q = '\0';
+ strlcpy(nfsp2.np_components[newcomp], p, q-p+1);
+ *q = ch;
+ p = q;
+ while (*p && (*p == '/'))
+ p++;
 }
+ nfsmout_if(error);
- /* set up lease renew timer */
- nmp->nm_renew_timer = thread_call_allocate(nfs4_renew_timer, nmp);
- interval = nmp->nm_fsattr.nfsa_lease / 2;
- if (interval < 1)
- interval = 1;
- nfs_interval_timer_start(nmp->nm_renew_timer, interval * 1000);
+ /* add remaining components */
+ for(comp = curcomp + 1; comp < nfsp->np_compcount; comp++,newcomp++) {
+ nfsp2.np_components[newcomp] = nfsp->np_components[comp];
+ nfsp->np_components[comp] = NULL;
+ }
+
+ /* move new path into place */
+ FREE(nfsp->np_components, M_TEMP);
+ nfsp->np_components = nfsp2.np_components;
+ nfsp->np_compcount = nfsp2.np_compcount;
+ nfsp2.np_components = NULL;
+ /* for absolute link, let the caller know that the next dirfh is root */
+ if (link[0] == '/') {
+ dirfhp->fh_len = 0;
+ *depthp = 0;
+ }
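+	/*
+	 * On return the caller resets its component index to zero and
+	 * resumes the LOOKUP loop over the rewritten path, counting this
+	 * redirection against MAXSYMLINKS to bound symlink loops.
+	 */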
nfsmout: - if (*npp) - nfs_node_unlock(*npp); + if (link) + FREE_ZONE(link, MAXPATHLEN, M_NAMEI); + if (nfsp2.np_components) { + for (comp=0; comp < nfsp2.np_compcount; comp++) + if (nfsp2.np_components[comp]) + FREE(nfsp2.np_components[comp], M_TEMP); + FREE(nfsp2.np_components, M_TEMP); + } + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); return (error); } +/* Set up an NFSv4 mount */ int -mountnfs( - struct user_nfs_args *argp, - mount_t mp, - mbuf_t nam, +nfs4_mount( + struct nfsmount *nmp, vfs_context_t ctx, - vnode_t *vpp) + nfsnode_t *npp) { - struct nfsmount *nmp; - nfsnode_t np; - int error; - uint32_t maxio, iosize; - struct vfsstatfs *sbp; - struct timespec ts = { 1, 0 }; + struct nfsm_chain nmreq, nmrep; + int error = 0, numops, status, interval, isdotdot, loopcnt = 0, depth = 0; + struct nfs_fs_path fspath, *nfsp, fspath2; + uint32_t bitmap[NFS_ATTR_BITMAP_LEN], comp, comp2; + fhandle_t fh, dirfh; + struct nfs_vattr nvattr; + u_int64_t xid; + struct nfsreq rq, *req = &rq; + struct nfsreq_secinfo_args si; + struct nfs_sec sec; + struct nfs_fs_locations nfsls; + + *npp = NULL; + fh.fh_len = dirfh.fh_len = 0; + TAILQ_INIT(&nmp->nm_open_owners); + TAILQ_INIT(&nmp->nm_delegations); + TAILQ_INIT(&nmp->nm_dreturnq); + nmp->nm_stategenid = 1; + NVATTR_INIT(&nvattr); + bzero(&nfsls, sizeof(nfsls)); + nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); /* - * Silently clear NFSMNT_NOCONN if it's a TCP mount, it makes - * no sense in that context. + * If no security flavors were specified we'll want to default to the server's + * preferred flavor. For NFSv4.0 we need a file handle and name to get that via + * SECINFO, so we'll do that on the last component of the server path we are + * mounting. If we are mounting the server's root, we'll need to defer the + * SECINFO call to the first successful LOOKUP request. 
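+ * When the SECINFO reply does arrive, the first flavor in the
+ * server's list becomes the mount's default security flavor; see the
+ * NFSSTA_NEEDSECINFO handling at the bottom of the component loop
+ * below.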
*/ - if (argp->sotype == SOCK_STREAM) - argp->flags &= ~NFSMNT_NOCONN; - - if (vfs_flags(mp) & MNT_UPDATE) { - nmp = VFSTONFS(mp); - /* update paths, file handles, etc, here XXX */ - mbuf_freem(nam); - return (0); - } else { - MALLOC_ZONE(nmp, struct nfsmount *, - sizeof (struct nfsmount), M_NFSMNT, M_WAITOK); - if (!nmp) { - mbuf_freem(nam); - return (ENOMEM); + if (!nmp->nm_sec.count) + nmp->nm_state |= NFSSTA_NEEDSECINFO; + + /* make a copy of the current location's path */ + nfsp = &nmp->nm_locations.nl_locations[nmp->nm_locations.nl_current.nli_loc]->nl_path; + bzero(&fspath, sizeof(fspath)); + fspath.np_compcount = nfsp->np_compcount; + if (fspath.np_compcount > 0) { + MALLOC(fspath.np_components, char **, fspath.np_compcount*sizeof(char*), M_TEMP, M_WAITOK|M_ZERO); + if (!fspath.np_components) { + error = ENOMEM; + goto nfsmout; } - bzero((caddr_t)nmp, sizeof (struct nfsmount)); - lck_mtx_init(&nmp->nm_lock, nfs_mount_grp, LCK_ATTR_NULL); - TAILQ_INIT(&nmp->nm_resendq); - TAILQ_INIT(&nmp->nm_iodq); - TAILQ_INIT(&nmp->nm_gsscl); - vfs_setfsprivate(mp, nmp); - - nfs_nhinit_finish(); + for (comp=0; comp < nfsp->np_compcount; comp++) { + int slen = strlen(nfsp->np_components[comp]); + MALLOC(fspath.np_components[comp], char *, slen+1, M_TEMP, M_WAITOK|M_ZERO); + if (!fspath.np_components[comp]) { + error = ENOMEM; + break; + } + strlcpy(fspath.np_components[comp], nfsp->np_components[comp], slen+1); + } + if (error) + goto nfsmout; } - lck_mtx_lock(&nmp->nm_lock); - /* setup defaults */ - nmp->nm_vers = NFS_VER2; - nmp->nm_timeo = NFS_TIMEO; - nmp->nm_retry = NFS_RETRANS; - if (argp->sotype == SOCK_DGRAM) { - nmp->nm_wsize = NFS_DGRAM_WSIZE; - nmp->nm_rsize = NFS_DGRAM_RSIZE; - } else { - nmp->nm_wsize = NFS_WSIZE; - nmp->nm_rsize = NFS_RSIZE; + /* for mirror mounts, we can just use the file handle passed in */ + if (nmp->nm_fh) { + dirfh.fh_len = nmp->nm_fh->fh_len; + bcopy(nmp->nm_fh->fh_data, dirfh.fh_data, dirfh.fh_len); + NFSREQ_SECINFO_SET(&si, NULL, dirfh.fh_data, dirfh.fh_len, NULL, 0); + goto gotfh; } - nmp->nm_readdirsize = NFS_READDIRSIZE; - nmp->nm_numgrps = NFS_MAXGRPS; - nmp->nm_readahead = NFS_DEFRAHEAD; - nmp->nm_tprintf_delay = nfs_tprintf_delay; - if (nmp->nm_tprintf_delay < 0) - nmp->nm_tprintf_delay = 0; - nmp->nm_tprintf_initial_delay = nfs_tprintf_initial_delay; - if (nmp->nm_tprintf_initial_delay < 0) - nmp->nm_tprintf_initial_delay = 0; - nmp->nm_acregmin = NFS_MINATTRTIMO; - nmp->nm_acregmax = NFS_MAXATTRTIMO; - nmp->nm_acdirmin = NFS_MINDIRATTRTIMO; - nmp->nm_acdirmax = NFS_MAXDIRATTRTIMO; - nmp->nm_auth = RPCAUTH_SYS; - nmp->nm_deadtimeout = 0; - - vfs_getnewfsid(mp); - nmp->nm_mountp = mp; - vfs_setauthopaque(mp); - nmp->nm_flag = argp->flags; - nmp->nm_nam = nam; - - if (argp->flags & NFSMNT_NFSV4) { - nmp->nm_vers = NFS_VER4; - /* NFSv4 is only allowed over TCP. 
*/ - if (argp->sotype != SOCK_STREAM) { - error = EINVAL; - goto bad; + + /* otherwise, we need to get the fh for the directory we are mounting */ + + /* if no components, just get root */ + if (fspath.np_compcount == 0) { +nocomponents: + // PUTROOTFH + GETATTR(FH) + NFSREQ_SECINFO_SET(&si, NULL, NULL, 0, NULL, 0); + numops = 2; + nfsm_chain_build_alloc_init(error, &nmreq, 9 * NFSX_UNSIGNED); + nfsm_chain_add_compound_header(error, &nmreq, "mount", numops); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTROOTFH); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + NFS_CLEAR_ATTRIBUTES(bitmap); + NFS4_DEFAULT_ATTRIBUTES(bitmap); + NFS_BITMAP_SET(bitmap, NFS_FATTR_FILEHANDLE); + nfsm_chain_add_bitmap(error, &nmreq, bitmap, NFS_ATTR_BITMAP_LEN); + nfsm_chain_build_done(error, &nmreq); + nfsm_assert(error, (numops == 0), EPROTO); + nfsmout_if(error); + error = nfs_request_async(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, + vfs_context_thread(ctx), vfs_context_ucred(ctx), &si, 0, NULL, &req); + if (!error) + error = nfs_request_async_finish(req, &nmrep, &xid, &status); + nfsm_chain_skip_tag(error, &nmrep); + nfsm_chain_get_32(error, &nmrep, numops); + nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTROOTFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + nfsmout_if(error); + NFS_CLEAR_ATTRIBUTES(nmp->nm_fsattr.nfsa_bitmap); + error = nfs4_parsefattr(&nmrep, &nmp->nm_fsattr, &nvattr, &dirfh, NULL, NULL); + if (!error && !NFS_BITMAP_ISSET(&nvattr.nva_bitmap, NFS_FATTR_FILEHANDLE)) { + printf("nfs: mount didn't return filehandle?\n"); + error = EBADRPC; } - } else if (argp->flags & NFSMNT_NFSV3) - nmp->nm_vers = NFS_VER3; + nfsmout_if(error); + nfsm_chain_cleanup(&nmrep); + nfsm_chain_null(&nmreq); + NVATTR_CLEANUP(&nvattr); + goto gotfh; + } - if (nmp->nm_vers == NFS_VER2) - nmp->nm_flag &= ~NFSMNT_RDIRPLUS; + /* look up each path component */ + for (comp=0; comp < fspath.np_compcount; ) { + isdotdot = 0; + if (fspath.np_components[comp][0] == '.') { + if (fspath.np_components[comp][1] == '\0') { + /* skip "." */ + comp++; + continue; + } + /* treat ".." specially */ + if ((fspath.np_components[comp][1] == '.') && + (fspath.np_components[comp][2] == '\0')) + isdotdot = 1; + if (isdotdot && (dirfh.fh_len == 0)) { + /* ".." in root directory is same as "." */ + comp++; + continue; + } + } + // PUT(ROOT)FH + LOOKUP(P) + GETFH + GETATTR + if (dirfh.fh_len == 0) + NFSREQ_SECINFO_SET(&si, NULL, NULL, 0, isdotdot ? NULL : fspath.np_components[comp], 0); + else + NFSREQ_SECINFO_SET(&si, NULL, dirfh.fh_data, dirfh.fh_len, isdotdot ? 
NULL : fspath.np_components[comp], 0); + numops = 4; + nfsm_chain_build_alloc_init(error, &nmreq, 18 * NFSX_UNSIGNED); + nfsm_chain_add_compound_header(error, &nmreq, "mount", numops); + numops--; + if (dirfh.fh_len) { + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, NFS_VER4, dirfh.fh_data, dirfh.fh_len); + } else { + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTROOTFH); + } + numops--; + if (isdotdot) { + nfsm_chain_add_32(error, &nmreq, NFS_OP_LOOKUPP); + } else { + nfsm_chain_add_32(error, &nmreq, NFS_OP_LOOKUP); + nfsm_chain_add_name(error, &nmreq, + fspath.np_components[comp], strlen(fspath.np_components[comp]), nmp); + } + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETFH); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + NFS_CLEAR_ATTRIBUTES(bitmap); + NFS4_DEFAULT_ATTRIBUTES(bitmap); + /* if no namedattr support or component is ".zfs", clear NFS_FATTR_NAMED_ATTR */ + if (NMFLAG(nmp, NONAMEDATTR) || !strcmp(fspath.np_components[comp], ".zfs")) + NFS_BITMAP_CLR(bitmap, NFS_FATTR_NAMED_ATTR); + nfsm_chain_add_bitmap(error, &nmreq, bitmap, NFS_ATTR_BITMAP_LEN); + nfsm_chain_build_done(error, &nmreq); + nfsm_assert(error, (numops == 0), EPROTO); + nfsmout_if(error); + error = nfs_request_async(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, + vfs_context_thread(ctx), vfs_context_ucred(ctx), &si, 0, NULL, &req); + if (!error) + error = nfs_request_async_finish(req, &nmrep, &xid, &status); + nfsm_chain_skip_tag(error, &nmrep); + nfsm_chain_get_32(error, &nmrep, numops); + nfsm_chain_op_check(error, &nmrep, dirfh.fh_len ? NFS_OP_PUTFH : NFS_OP_PUTROOTFH); + nfsm_chain_op_check(error, &nmrep, isdotdot ? NFS_OP_LOOKUPP : NFS_OP_LOOKUP); + nfsmout_if(error); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETFH); + nfsm_chain_get_32(error, &nmrep, fh.fh_len); + nfsm_chain_get_opaque(error, &nmrep, fh.fh_len, fh.fh_data); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + if (!error) { + NFS_CLEAR_ATTRIBUTES(nmp->nm_fsattr.nfsa_bitmap); + error = nfs4_parsefattr(&nmrep, &nmp->nm_fsattr, &nvattr, NULL, NULL, &nfsls); + } + nfsm_chain_cleanup(&nmrep); + nfsm_chain_null(&nmreq); + if (error) { + /* LOOKUP succeeded but GETATTR failed? This could be a referral. */ + /* Try the lookup again with a getattr for fs_locations. 
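+ * A referral means the server has relocated this subtree to another
+ * server: fetch its fs_locations, tear down the current connection,
+ * reconnect using the new locations, and splice the new server's
+ * remote path onto the front of the components not yet traversed.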
*/ + nfs_fs_locations_cleanup(&nfsls); + error = nfs4_get_fs_locations(nmp, NULL, dirfh.fh_data, dirfh.fh_len, fspath.np_components[comp], ctx, &nfsls); + if (!error && (nfsls.nl_numlocs < 1)) + error = ENOENT; + nfsmout_if(error); + if (++loopcnt > MAXSYMLINKS) { + /* too many symlink/referral redirections */ + error = ELOOP; + goto nfsmout; + } + /* tear down the current connection */ + nfs_disconnect(nmp); + /* replace fs locations */ + nfs_fs_locations_cleanup(&nmp->nm_locations); + nmp->nm_locations = nfsls; + bzero(&nfsls, sizeof(nfsls)); + /* initiate a connection using the new fs locations */ + error = nfs_mount_connect(nmp); + if (!error && !(nmp->nm_locations.nl_current.nli_flags & NLI_VALID)) + error = EIO; + nfsmout_if(error); + /* add new server's remote path to beginning of our path and continue */ + nfsp = &nmp->nm_locations.nl_locations[nmp->nm_locations.nl_current.nli_loc]->nl_path; + bzero(&fspath2, sizeof(fspath2)); + fspath2.np_compcount = (fspath.np_compcount - comp - 1) + nfsp->np_compcount; + if (fspath2.np_compcount > 0) { + MALLOC(fspath2.np_components, char **, fspath2.np_compcount*sizeof(char*), M_TEMP, M_WAITOK|M_ZERO); + if (!fspath2.np_components) { + error = ENOMEM; + goto nfsmout; + } + for (comp2=0; comp2 < nfsp->np_compcount; comp2++) { + int slen = strlen(nfsp->np_components[comp2]); + MALLOC(fspath2.np_components[comp2], char *, slen+1, M_TEMP, M_WAITOK|M_ZERO); + if (!fspath2.np_components[comp2]) { + /* clean up fspath2, then error out */ + while (comp2 > 0) { + comp2--; + FREE(fspath2.np_components[comp2], M_TEMP); + } + FREE(fspath2.np_components, M_TEMP); + error = ENOMEM; + goto nfsmout; + } + strlcpy(fspath2.np_components[comp2], nfsp->np_components[comp2], slen+1); + } + if ((fspath.np_compcount - comp - 1) > 0) + bcopy(&fspath.np_components[comp+1], &fspath2.np_components[nfsp->np_compcount], (fspath.np_compcount - comp - 1)*sizeof(char*)); + /* free up unused parts of old path (prior components and component array) */ + do { + FREE(fspath.np_components[comp], M_TEMP); + } while (comp-- > 0); + FREE(fspath.np_components, M_TEMP); + /* put new path in place */ + fspath = fspath2; + } + /* reset dirfh and component index */ + dirfh.fh_len = 0; + comp = 0; + NVATTR_CLEANUP(&nvattr); + if (fspath.np_compcount == 0) + goto nocomponents; + continue; + } + nfsmout_if(error); + /* if file handle is for a symlink, then update the path with the symlink contents */ + if (NFS_BITMAP_ISSET(&nvattr.nva_bitmap, NFS_FATTR_TYPE) && (nvattr.nva_type == VLNK)) { + if (++loopcnt > MAXSYMLINKS) + error = ELOOP; + else + error = nfs4_mount_update_path_with_symlink(nmp, &fspath, comp, &dirfh, &depth, &fh, ctx); + nfsmout_if(error); + /* directory file handle is either left the same or reset to root (if link was absolute) */ + /* path traversal starts at beginning of the path again */ + comp = 0; + NVATTR_CLEANUP(&nvattr); + nfs_fs_locations_cleanup(&nfsls); + continue; + } + NVATTR_CLEANUP(&nvattr); + nfs_fs_locations_cleanup(&nfsls); + /* not a symlink... */ + if ((nmp->nm_state & NFSSTA_NEEDSECINFO) && (comp == (fspath.np_compcount-1)) && !isdotdot) { + /* need to get SECINFO for the directory being mounted */ + if (dirfh.fh_len == 0) + NFSREQ_SECINFO_SET(&si, NULL, NULL, 0, isdotdot ? NULL : fspath.np_components[comp], 0); + else + NFSREQ_SECINFO_SET(&si, NULL, dirfh.fh_data, dirfh.fh_len, isdotdot ? 
NULL : fspath.np_components[comp], 0); + sec.count = NX_MAX_SEC_FLAVORS; + error = nfs4_secinfo_rpc(nmp, &si, vfs_context_ucred(ctx), sec.flavors, &sec.count); + /* [sigh] some implementations return "illegal" error for unsupported ops */ + if (error == NFSERR_OP_ILLEGAL) + error = 0; + nfsmout_if(error); + /* set our default security flavor to the first in the list */ + if (sec.count) + nmp->nm_auth = sec.flavors[0]; + nmp->nm_state &= ~NFSSTA_NEEDSECINFO; + } + /* advance directory file handle, component index, & update depth */ + dirfh = fh; + comp++; + if (!isdotdot) /* going down the hierarchy */ + depth++; + else if (--depth <= 0) /* going up the hierarchy */ + dirfh.fh_len = 0; /* clear dirfh when we hit root */ + } - if ((argp->flags & NFSMNT_TIMEO) && argp->timeo > 0) { - nmp->nm_timeo = (argp->timeo * NFS_HZ + 5) / 10; - if (nmp->nm_timeo < NFS_MINTIMEO) - nmp->nm_timeo = NFS_MINTIMEO; - else if (nmp->nm_timeo > NFS_MAXTIMEO) - nmp->nm_timeo = NFS_MAXTIMEO; +gotfh: + /* get attrs for mount point root */ + numops = NMFLAG(nmp, NONAMEDATTR) ? 2 : 3; // PUTFH + GETATTR + OPENATTR + nfsm_chain_build_alloc_init(error, &nmreq, 25 * NFSX_UNSIGNED); + nfsm_chain_add_compound_header(error, &nmreq, "mount", numops); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_PUTFH); + nfsm_chain_add_fh(error, &nmreq, NFS_VER4, dirfh.fh_data, dirfh.fh_len); + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_GETATTR); + NFS_CLEAR_ATTRIBUTES(bitmap); + NFS4_DEFAULT_ATTRIBUTES(bitmap); + /* if no namedattr support or last component is ".zfs", clear NFS_FATTR_NAMED_ATTR */ + if (NMFLAG(nmp, NONAMEDATTR) || ((fspath.np_compcount > 0) && !strcmp(fspath.np_components[fspath.np_compcount-1], ".zfs"))) + NFS_BITMAP_CLR(bitmap, NFS_FATTR_NAMED_ATTR); + nfsm_chain_add_bitmap(error, &nmreq, bitmap, NFS_ATTR_BITMAP_LEN); + if (!NMFLAG(nmp, NONAMEDATTR)) { + numops--; + nfsm_chain_add_32(error, &nmreq, NFS_OP_OPENATTR); + nfsm_chain_add_32(error, &nmreq, 0); + } + nfsm_chain_build_done(error, &nmreq); + nfsm_assert(error, (numops == 0), EPROTO); + nfsmout_if(error); + error = nfs_request_async(NULL, nmp->nm_mountp, &nmreq, NFSPROC4_COMPOUND, + vfs_context_thread(ctx), vfs_context_ucred(ctx), &si, 0, NULL, &req); + if (!error) + error = nfs_request_async_finish(req, &nmrep, &xid, &status); + nfsm_chain_skip_tag(error, &nmrep); + nfsm_chain_get_32(error, &nmrep, numops); + nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); + nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); + nfsmout_if(error); + NFS_CLEAR_ATTRIBUTES(nmp->nm_fsattr.nfsa_bitmap); + error = nfs4_parsefattr(&nmrep, &nmp->nm_fsattr, &nvattr, NULL, NULL, NULL); + nfsmout_if(error); + if (!NMFLAG(nmp, NONAMEDATTR)) { + nfsm_chain_op_check(error, &nmrep, NFS_OP_OPENATTR); + if (error == ENOENT) + error = 0; + /* [sigh] some implementations return "illegal" error for unsupported ops */ + if (error || !NFS_BITMAP_ISSET(nmp->nm_fsattr.nfsa_supp_attr, NFS_FATTR_NAMED_ATTR)) { + nmp->nm_fsattr.nfsa_flags &= ~NFS_FSFLAG_NAMED_ATTR; + } else { + nmp->nm_fsattr.nfsa_flags |= NFS_FSFLAG_NAMED_ATTR; + } + } else { + nmp->nm_fsattr.nfsa_flags &= ~NFS_FSFLAG_NAMED_ATTR; + } + if (NMFLAG(nmp, NOACL)) /* make sure ACL support is turned off */ + nmp->nm_fsattr.nfsa_flags &= ~NFS_FSFLAG_ACL; + if (NMFLAG(nmp, ACLONLY) && !(nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_ACL)) + NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_ACLONLY); + if (NFS_BITMAP_ISSET(nmp->nm_fsattr.nfsa_supp_attr, NFS_FATTR_FH_EXPIRE_TYPE)) { + uint32_t fhtype = ((nmp->nm_fsattr.nfsa_flags & 
NFS_FSFLAG_FHTYPE_MASK) >> NFS_FSFLAG_FHTYPE_SHIFT); + if (fhtype != NFS_FH_PERSISTENT) + printf("nfs: warning: non-persistent file handles! for %s\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname); } - if ((argp->flags & NFSMNT_RETRANS) && argp->retrans > 1) { - nmp->nm_retry = argp->retrans; - if (nmp->nm_retry > NFS_MAXREXMIT) - nmp->nm_retry = NFS_MAXREXMIT; + /* make sure it's a directory */ + if (!NFS_BITMAP_ISSET(&nvattr.nva_bitmap, NFS_FATTR_TYPE) || (nvattr.nva_type != VDIR)) { + error = ENOTDIR; + goto nfsmout; } - if (nmp->nm_vers != NFS_VER2) { - if (argp->sotype == SOCK_DGRAM) - maxio = NFS_MAXDGRAMDATA; - else - maxio = NFS_MAXDATA; - } else - maxio = NFS_V2MAXDATA; + /* save the NFS fsid */ + nmp->nm_fsid = nvattr.nva_fsid; + + /* create the root node */ + error = nfs_nget(nmp->nm_mountp, NULL, NULL, dirfh.fh_data, dirfh.fh_len, &nvattr, &xid, rq.r_auth, NG_MARKROOT, npp); + nfsmout_if(error); + + if (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_ACL) + vfs_setextendedsecurity(nmp->nm_mountp); - if ((argp->flags & NFSMNT_WSIZE) && argp->wsize > 0) { - nmp->nm_wsize = argp->wsize; - /* Round down to multiple of blocksize */ - nmp->nm_wsize &= ~(NFS_FABLKSIZE - 1); - if (nmp->nm_wsize <= 0) - nmp->nm_wsize = NFS_FABLKSIZE; + /* adjust I/O sizes to server limits */ + if (NFS_BITMAP_ISSET(nmp->nm_fsattr.nfsa_bitmap, NFS_FATTR_MAXREAD) && (nmp->nm_fsattr.nfsa_maxread > 0)) { + if (nmp->nm_fsattr.nfsa_maxread < (uint64_t)nmp->nm_rsize) { + nmp->nm_rsize = nmp->nm_fsattr.nfsa_maxread & ~(NFS_FABLKSIZE - 1); + if (nmp->nm_rsize == 0) + nmp->nm_rsize = nmp->nm_fsattr.nfsa_maxread; + } + } + if (NFS_BITMAP_ISSET(nmp->nm_fsattr.nfsa_bitmap, NFS_FATTR_MAXWRITE) && (nmp->nm_fsattr.nfsa_maxwrite > 0)) { + if (nmp->nm_fsattr.nfsa_maxwrite < (uint64_t)nmp->nm_wsize) { + nmp->nm_wsize = nmp->nm_fsattr.nfsa_maxwrite & ~(NFS_FABLKSIZE - 1); + if (nmp->nm_wsize == 0) + nmp->nm_wsize = nmp->nm_fsattr.nfsa_maxwrite; + } } - if (nmp->nm_wsize > maxio) - nmp->nm_wsize = maxio; - if (nmp->nm_wsize > NFS_MAXBSIZE) - nmp->nm_wsize = NFS_MAXBSIZE; - if ((argp->flags & NFSMNT_RSIZE) && argp->rsize > 0) { - nmp->nm_rsize = argp->rsize; - /* Round down to multiple of blocksize */ - nmp->nm_rsize &= ~(NFS_FABLKSIZE - 1); - if (nmp->nm_rsize <= 0) - nmp->nm_rsize = NFS_FABLKSIZE; + /* set up lease renew timer */ + nmp->nm_renew_timer = thread_call_allocate(nfs4_renew_timer, nmp); + interval = nmp->nm_fsattr.nfsa_lease / 2; + if (interval < 1) + interval = 1; + nfs_interval_timer_start(nmp->nm_renew_timer, interval * 1000); + +nfsmout: + if (fspath.np_components) { + for (comp=0; comp < fspath.np_compcount; comp++) + if (fspath.np_components[comp]) + FREE(fspath.np_components[comp], M_TEMP); + FREE(fspath.np_components, M_TEMP); } - if (nmp->nm_rsize > maxio) - nmp->nm_rsize = maxio; - if (nmp->nm_rsize > NFS_MAXBSIZE) - nmp->nm_rsize = NFS_MAXBSIZE; + NVATTR_CLEANUP(&nvattr); + nfs_fs_locations_cleanup(&nfsls); + if (*npp) + nfs_node_unlock(*npp); + nfsm_chain_cleanup(&nmreq); + nfsm_chain_cleanup(&nmrep); + return (error); +} + +/* + * Thread to handle initial NFS mount connection. + */ +void +nfs_mount_connect_thread(void *arg, __unused wait_result_t wr) +{ + struct nfsmount *nmp = arg; + int error = 0, savederror = 0, slpflag = (NMFLAG(nmp, INTR) ? 
PCATCH : 0); + int done = 0, timeo, tries, maxtries; - if ((argp->flags & NFSMNT_READDIRSIZE) && argp->readdirsize > 0) { - nmp->nm_readdirsize = argp->readdirsize; + if (NM_OMFLAG(nmp, MNTQUICK)) { + timeo = 8; + maxtries = 1; + } else { + timeo = 30; + maxtries = 2; } - if (nmp->nm_readdirsize > maxio) - nmp->nm_readdirsize = maxio; - if (nmp->nm_readdirsize > nmp->nm_rsize) - nmp->nm_readdirsize = nmp->nm_rsize; - if ((argp->flags & NFSMNT_MAXGRPS) && argp->maxgrouplist >= 0 && - argp->maxgrouplist <= NFS_MAXGRPS) - nmp->nm_numgrps = argp->maxgrouplist; - if ((argp->flags & NFSMNT_READAHEAD) && argp->readahead >= 0 && - argp->readahead <= NFS_MAXRAHEAD) - nmp->nm_readahead = argp->readahead; - if (argp->flags & NFSMNT_READAHEAD) - nmp->nm_readahead = argp->readahead; - if (nmp->nm_readahead < 0) - nmp->nm_readahead = 0; - else if (nmp->nm_readahead > NFS_MAXRAHEAD) - nmp->nm_readahead = NFS_MAXRAHEAD; + for (tries = 0; tries < maxtries; tries++) { + error = nfs_connect(nmp, 1, timeo); + switch (error) { + case ETIMEDOUT: + case EAGAIN: + case EPIPE: + case EADDRNOTAVAIL: + case ENETDOWN: + case ENETUNREACH: + case ENETRESET: + case ECONNABORTED: + case ECONNRESET: + case EISCONN: + case ENOTCONN: + case ESHUTDOWN: + case ECONNREFUSED: + case EHOSTDOWN: + case EHOSTUNREACH: + /* just keep retrying on any of these errors */ + break; + case 0: + default: + /* looks like we got an answer... */ + done = 1; + break; + } - if (argp->version >= 4) { - if ((argp->flags & NFSMNT_ACREGMIN) && argp->acregmin >= 0) - nmp->nm_acregmin = argp->acregmin; - if ((argp->flags & NFSMNT_ACREGMAX) && argp->acregmax >= 0) - nmp->nm_acregmax = argp->acregmax; - if ((argp->flags & NFSMNT_ACDIRMIN) && argp->acdirmin >= 0) - nmp->nm_acdirmin = argp->acdirmin; - if ((argp->flags & NFSMNT_ACDIRMAX) && argp->acdirmax >= 0) - nmp->nm_acdirmax = argp->acdirmax; - if (nmp->nm_acregmin > nmp->nm_acregmax) - nmp->nm_acregmin = nmp->nm_acregmax; - if (nmp->nm_acdirmin > nmp->nm_acdirmax) - nmp->nm_acdirmin = nmp->nm_acdirmax; - } - if (argp->version >= 5) { - if (argp->flags & NFSMNT_SECFLAVOR) { - /* - * Check for valid security flavor - */ - switch (argp->auth) { + /* save the best error */ + if (nfs_connect_error_class(error) >= nfs_connect_error_class(savederror)) + savederror = error; + if (done) { + error = savederror; + break; + } + + /* pause before next attempt */ + if ((error = nfs_sigintr(nmp, NULL, current_thread(), 0))) + break; + error = tsleep(nmp, PSOCK|slpflag, "nfs_mount_connect_retry", 2*hz); + if (error && (error != EWOULDBLOCK)) + break; + error = savederror; + } + + /* update status of mount connect */ + lck_mtx_lock(&nmp->nm_lock); + if (!nmp->nm_mounterror) + nmp->nm_mounterror = error; + nmp->nm_state &= ~NFSSTA_MOUNT_THREAD; + lck_mtx_unlock(&nmp->nm_lock); + wakeup(&nmp->nm_nss); +} + +int +nfs_mount_connect(struct nfsmount *nmp) +{ + int error = 0, slpflag; + thread_t thd; + struct timespec ts = { 2, 0 }; + + /* + * Set up the socket. Perform initial search for a location/server/address to + * connect to and negotiate any unspecified mount parameters. This work is + * done on a kernel thread to satisfy reserved port usage needs. + */ + slpflag = NMFLAG(nmp, INTR) ? 
PCATCH : 0; + lck_mtx_lock(&nmp->nm_lock); + /* set flag that the thread is running */ + nmp->nm_state |= NFSSTA_MOUNT_THREAD; + if (kernel_thread_start(nfs_mount_connect_thread, nmp, &thd) != KERN_SUCCESS) { + nmp->nm_state &= ~NFSSTA_MOUNT_THREAD; + nmp->nm_mounterror = EIO; + printf("nfs mount %s start socket connect thread failed\n", vfs_statfs(nmp->nm_mountp)->f_mntfromname); + } else { + thread_deallocate(thd); + } + + /* wait until mount connect thread is finished/gone */ + while (nmp->nm_state & NFSSTA_MOUNT_THREAD) { + error = msleep(&nmp->nm_nss, &nmp->nm_lock, slpflag|PSOCK, "nfsconnectthread", &ts); + if ((error && (error != EWOULDBLOCK)) || ((error = nfs_sigintr(nmp, NULL, current_thread(), 1)))) { + /* record error */ + if (!nmp->nm_mounterror) + nmp->nm_mounterror = error; + /* signal the thread that we are aborting */ + nmp->nm_sockflags |= NMSOCK_UNMOUNT; + if (nmp->nm_nss) + wakeup(nmp->nm_nss); + /* and continue waiting on it to finish */ + slpflag = 0; + } + } + lck_mtx_unlock(&nmp->nm_lock); + + /* grab mount connect status */ + error = nmp->nm_mounterror; + + return (error); +} + +/* + * Common code to mount an NFS file system. + */ +int +mountnfs( + char *xdrbuf, + mount_t mp, + vfs_context_t ctx, + vnode_t *vpp) +{ + struct nfsmount *nmp; + nfsnode_t np; + int error = 0; + struct vfsstatfs *sbp; + struct xdrbuf xb; + uint32_t i, val, vers = 0, minorvers, maxio, iosize, len; + uint32_t *mattrs; + uint32_t *mflags_mask; + uint32_t *mflags; + uint32_t argslength, attrslength; + struct nfs_location_index firstloc = { NLI_VALID, 0, 0, 0 }; + + /* make sure mbuf constants are set up */ + if (!nfs_mbuf_mhlen) + nfs_mbuf_init(); + + if (vfs_flags(mp) & MNT_UPDATE) { + nmp = VFSTONFS(mp); + /* update paths, file handles, etc, here XXX */ + xb_free(xdrbuf); + return (0); + } else { + /* allocate an NFS mount structure for this mount */ + MALLOC_ZONE(nmp, struct nfsmount *, + sizeof (struct nfsmount), M_NFSMNT, M_WAITOK); + if (!nmp) { + xb_free(xdrbuf); + return (ENOMEM); + } + bzero((caddr_t)nmp, sizeof (struct nfsmount)); + lck_mtx_init(&nmp->nm_lock, nfs_mount_grp, LCK_ATTR_NULL); + TAILQ_INIT(&nmp->nm_resendq); + TAILQ_INIT(&nmp->nm_iodq); + TAILQ_INIT(&nmp->nm_gsscl); + LIST_INIT(&nmp->nm_monlist); + vfs_setfsprivate(mp, nmp); + vfs_getnewfsid(mp); + nmp->nm_mountp = mp; + vfs_setauthopaque(mp); + + nfs_nhinit_finish(); + + nmp->nm_args = xdrbuf; + + /* set up defaults */ + nmp->nm_vers = 0; + nmp->nm_timeo = NFS_TIMEO; + nmp->nm_retry = NFS_RETRANS; + nmp->nm_sotype = 0; + nmp->nm_sofamily = 0; + nmp->nm_nfsport = 0; + nmp->nm_wsize = NFS_WSIZE; + nmp->nm_rsize = NFS_RSIZE; + nmp->nm_readdirsize = NFS_READDIRSIZE; + nmp->nm_numgrps = NFS_MAXGRPS; + nmp->nm_readahead = NFS_DEFRAHEAD; + nmp->nm_tprintf_delay = nfs_tprintf_delay; + if (nmp->nm_tprintf_delay < 0) + nmp->nm_tprintf_delay = 0; + nmp->nm_tprintf_initial_delay = nfs_tprintf_initial_delay; + if (nmp->nm_tprintf_initial_delay < 0) + nmp->nm_tprintf_initial_delay = 0; + nmp->nm_acregmin = NFS_MINATTRTIMO; + nmp->nm_acregmax = NFS_MAXATTRTIMO; + nmp->nm_acdirmin = NFS_MINDIRATTRTIMO; + nmp->nm_acdirmax = NFS_MAXDIRATTRTIMO; + nmp->nm_auth = RPCAUTH_SYS; + nmp->nm_deadtimeout = 0; + NFS_BITMAP_SET(nmp->nm_flags, NFS_MFLAG_NOACL); + } + + mattrs = nmp->nm_mattrs; + mflags = nmp->nm_mflags; + mflags_mask = nmp->nm_mflags_mask; + + /* set up NFS mount with args */ + xb_init_buffer(&xb, xdrbuf, 2*XDRWORD); + xb_get_32(error, &xb, val); /* version */ + xb_get_32(error, &xb, argslength); /* args length */ + 
nfsmerr_if(error); + xb_init_buffer(&xb, xdrbuf, argslength); /* restart parsing with actual buffer length */ + xb_get_32(error, &xb, val); /* version */ + xb_get_32(error, &xb, argslength); /* args length */ + xb_get_32(error, &xb, val); /* XDR args version */ + if (val != NFS_XDRARGS_VERSION_0) + error = EINVAL; + len = NFS_MATTR_BITMAP_LEN; + xb_get_bitmap(error, &xb, mattrs, len); /* mount attribute bitmap */ + attrslength = 0; + xb_get_32(error, &xb, attrslength); /* attrs length */ + if (!error && (attrslength > (argslength - ((4+NFS_MATTR_BITMAP_LEN+1)*XDRWORD)))) + error = EINVAL; + nfsmerr_if(error); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_FLAGS)) { + len = NFS_MFLAG_BITMAP_LEN; + xb_get_bitmap(error, &xb, mflags_mask, len); /* mount flag mask */ + len = NFS_MFLAG_BITMAP_LEN; + xb_get_bitmap(error, &xb, mflags, len); /* mount flag values */ + if (!error) { + /* clear all mask bits and OR in all the ones that are set */ + nmp->nm_flags[0] &= ~mflags_mask[0]; + nmp->nm_flags[0] |= (mflags_mask[0] & mflags[0]); + } + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_NFS_VERSION)) { + xb_get_32(error, &xb, vers); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_NFS_MINOR_VERSION)) + xb_get_32(error, &xb, minorvers); + else + minorvers = 0; + nfsmerr_if(error); + switch (vers) { + case 2: + nmp->nm_vers = NFS_VER2; + break; + case 3: + nmp->nm_vers = NFS_VER3; + break; + case 4: + switch (minorvers) { + case 0: + nmp->nm_vers = NFS_VER4; + break; + default: + error = EINVAL; + } + break; + default: + error = EINVAL; + } + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_NFS_MINOR_VERSION)) { + /* should have also gotten NFS version (and already gotten minorvers) */ + if (!NFS_BITMAP_ISSET(mattrs, NFS_MATTR_NFS_VERSION)) + error = EINVAL; + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_READ_SIZE)) + xb_get_32(error, &xb, nmp->nm_rsize); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_WRITE_SIZE)) + xb_get_32(error, &xb, nmp->nm_wsize); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_READDIR_SIZE)) + xb_get_32(error, &xb, nmp->nm_readdirsize); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_READAHEAD)) + xb_get_32(error, &xb, nmp->nm_readahead); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_ATTRCACHE_REG_MIN)) { + xb_get_32(error, &xb, nmp->nm_acregmin); + xb_skip(error, &xb, XDRWORD); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_ATTRCACHE_REG_MAX)) { + xb_get_32(error, &xb, nmp->nm_acregmax); + xb_skip(error, &xb, XDRWORD); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_ATTRCACHE_DIR_MIN)) { + xb_get_32(error, &xb, nmp->nm_acdirmin); + xb_skip(error, &xb, XDRWORD); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_ATTRCACHE_DIR_MAX)) { + xb_get_32(error, &xb, nmp->nm_acdirmax); + xb_skip(error, &xb, XDRWORD); + } + nfsmerr_if(error); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_LOCK_MODE)) { + xb_get_32(error, &xb, val); + switch (val) { + case NFS_LOCK_MODE_DISABLED: + case NFS_LOCK_MODE_LOCAL: + if (nmp->nm_vers >= NFS_VER4) { + /* disabled/local lock mode only allowed on v2/v3 */ + error = EINVAL; + break; + } + /* FALLTHROUGH */ + case NFS_LOCK_MODE_ENABLED: + nmp->nm_lockmode = val; + break; + default: + error = EINVAL; + } + } + nfsmerr_if(error); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SECURITY)) { + uint32_t seccnt; + xb_get_32(error, &xb, seccnt); + if (!error && ((seccnt < 1) || (seccnt > NX_MAX_SEC_FLAVORS))) + error = EINVAL; + nfsmerr_if(error); + nmp->nm_sec.count = seccnt; + for (i=0; i < seccnt; i++) { + xb_get_32(error, &xb, nmp->nm_sec.flavors[i]); + /* Check for valid security flavor */ + switch (nmp->nm_sec.flavors[i]) { + 
case RPCAUTH_NONE: case RPCAUTH_SYS: case RPCAUTH_KRB5: case RPCAUTH_KRB5I: case RPCAUTH_KRB5P: - nmp->nm_auth = argp->auth; break; default: error = EINVAL; - goto bad; } } + /* start with the first flavor */ + nmp->nm_auth = nmp->nm_sec.flavors[0]; } - if (argp->version >= 6) { - if (argp->flags & NFSMNT_DEADTIMEOUT) - nmp->nm_deadtimeout = argp->deadtimeout; + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MAX_GROUP_LIST)) + xb_get_32(error, &xb, nmp->nm_numgrps); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SOCKET_TYPE)) { + char sotype[6]; + + xb_get_32(error, &xb, val); + if (!error && ((val < 3) || (val > 5))) + error = EINVAL; + nfsmerr_if(error); + error = xb_get_bytes(&xb, sotype, val, 0); + nfsmerr_if(error); + sotype[val] = '\0'; + if (!strcmp(sotype, "tcp")) { + nmp->nm_sotype = SOCK_STREAM; + } else if (!strcmp(sotype, "udp")) { + nmp->nm_sotype = SOCK_DGRAM; + } else if (!strcmp(sotype, "tcp4")) { + nmp->nm_sotype = SOCK_STREAM; + nmp->nm_sofamily = AF_INET; + } else if (!strcmp(sotype, "udp4")) { + nmp->nm_sotype = SOCK_DGRAM; + nmp->nm_sofamily = AF_INET; + } else if (!strcmp(sotype, "tcp6")) { + nmp->nm_sotype = SOCK_STREAM; + nmp->nm_sofamily = AF_INET6; + } else if (!strcmp(sotype, "udp6")) { + nmp->nm_sotype = SOCK_DGRAM; + nmp->nm_sofamily = AF_INET6; + } else if (!strcmp(sotype, "inet4")) { + nmp->nm_sofamily = AF_INET; + } else if (!strcmp(sotype, "inet6")) { + nmp->nm_sofamily = AF_INET6; + } else if (!strcmp(sotype, "inet")) { + nmp->nm_sofamily = 0; /* ok */ + } else { + error = EINVAL; + } + if (!error && (nmp->nm_vers >= NFS_VER4) && nmp->nm_sotype && + (nmp->nm_sotype != SOCK_STREAM)) + error = EINVAL; /* NFSv4 is only allowed over TCP. */ + nfsmerr_if(error); } - if ((nmp->nm_flag & NFSMNT_DEADTIMEOUT) && (nmp->nm_deadtimeout <= 0)) - nmp->nm_flag &= ~NFSMNT_DEADTIMEOUT; + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_NFS_PORT)) + xb_get_32(error, &xb, nmp->nm_nfsport); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MOUNT_PORT)) + xb_get_32(error, &xb, nmp->nm_mountport); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_REQUEST_TIMEOUT)) { + /* convert from time to 0.1s units */ + xb_get_32(error, &xb, nmp->nm_timeo); + xb_get_32(error, &xb, val); + nfsmerr_if(error); + if (val >= 1000000000) + error = EINVAL; + nfsmerr_if(error); + nmp->nm_timeo *= 10; + nmp->nm_timeo += (val+100000000-1)/100000000; + /* now convert to ticks */ + nmp->nm_timeo = (nmp->nm_timeo * NFS_HZ + 5) / 10; + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SOFT_RETRY_COUNT)) { + xb_get_32(error, &xb, val); + if (!error && (val > 1)) + nmp->nm_retry = val; + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_DEAD_TIMEOUT)) { + xb_get_32(error, &xb, nmp->nm_deadtimeout); + xb_skip(error, &xb, XDRWORD); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_FH)) { + nfsmerr_if(error); + MALLOC(nmp->nm_fh, fhandle_t *, sizeof(fhandle_t), M_TEMP, M_WAITOK|M_ZERO); + if (!nmp->nm_fh) + error = ENOMEM; + xb_get_32(error, &xb, nmp->nm_fh->fh_len); + nfsmerr_if(error); + error = xb_get_bytes(&xb, (char*)&nmp->nm_fh->fh_data[0], nmp->nm_fh->fh_len, 0); + } + nfsmerr_if(error); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_FS_LOCATIONS)) { + uint32_t loc, serv, addr, comp; + struct nfs_fs_location *fsl; + struct nfs_fs_server *fss; + struct nfs_fs_path *fsp; + + xb_get_32(error, &xb, nmp->nm_locations.nl_numlocs); /* fs location count */ + /* sanity check location count */ + if (!error && ((nmp->nm_locations.nl_numlocs < 1) || (nmp->nm_locations.nl_numlocs > 256))) + error = EINVAL; + nfsmerr_if(error); + MALLOC(nmp->nm_locations.nl_locations, struct 
nfs_fs_location **, nmp->nm_locations.nl_numlocs * sizeof(struct nfs_fs_location*), M_TEMP, M_WAITOK|M_ZERO); + if (!nmp->nm_locations.nl_locations) + error = ENOMEM; + for (loc = 0; loc < nmp->nm_locations.nl_numlocs; loc++) { + nfsmerr_if(error); + MALLOC(fsl, struct nfs_fs_location *, sizeof(struct nfs_fs_location), M_TEMP, M_WAITOK|M_ZERO); + if (!fsl) + error = ENOMEM; + nmp->nm_locations.nl_locations[loc] = fsl; + xb_get_32(error, &xb, fsl->nl_servcount); /* server count */ + /* sanity check server count */ + if (!error && ((fsl->nl_servcount < 1) || (fsl->nl_servcount > 256))) + error = EINVAL; + nfsmerr_if(error); + MALLOC(fsl->nl_servers, struct nfs_fs_server **, fsl->nl_servcount * sizeof(struct nfs_fs_server*), M_TEMP, M_WAITOK|M_ZERO); + if (!fsl->nl_servers) + error = ENOMEM; + for (serv = 0; serv < fsl->nl_servcount; serv++) { + nfsmerr_if(error); + MALLOC(fss, struct nfs_fs_server *, sizeof(struct nfs_fs_server), M_TEMP, M_WAITOK|M_ZERO); + if (!fss) + error = ENOMEM; + fsl->nl_servers[serv] = fss; + xb_get_32(error, &xb, val); /* server name length */ + /* sanity check server name length */ + if (!error && ((val < 1) || (val > MAXPATHLEN))) + error = EINVAL; + nfsmerr_if(error); + MALLOC(fss->ns_name, char *, val+1, M_TEMP, M_WAITOK|M_ZERO); + if (!fss->ns_name) + error = ENOMEM; + nfsmerr_if(error); + error = xb_get_bytes(&xb, fss->ns_name, val, 0); /* server name */ + xb_get_32(error, &xb, fss->ns_addrcount); /* address count */ + /* sanity check address count (OK to be zero) */ + if (!error && (fss->ns_addrcount > 256)) + error = EINVAL; + nfsmerr_if(error); + if (fss->ns_addrcount > 0) { + MALLOC(fss->ns_addresses, char **, fss->ns_addrcount * sizeof(char *), M_TEMP, M_WAITOK|M_ZERO); + if (!fss->ns_addresses) + error = ENOMEM; + for (addr = 0; addr < fss->ns_addrcount; addr++) { + xb_get_32(error, &xb, val); /* address length */ + /* sanity check address length */ + if (!error && ((val < 1) || (val > 128))) + error = EINVAL; + nfsmerr_if(error); + MALLOC(fss->ns_addresses[addr], char *, val+1, M_TEMP, M_WAITOK|M_ZERO); + if (!fss->ns_addresses[addr]) + error = ENOMEM; + nfsmerr_if(error); + error = xb_get_bytes(&xb, fss->ns_addresses[addr], val, 0); /* address */ + } + } + xb_get_32(error, &xb, val); /* server info length */ + xb_skip(error, &xb, val); /* skip server info */ + } + /* get pathname */ + fsp = &fsl->nl_path; + xb_get_32(error, &xb, fsp->np_compcount); /* component count */ + /* sanity check component count */ + if (!error && (fsp->np_compcount > MAXPATHLEN)) + error = EINVAL; + nfsmerr_if(error); + if (fsp->np_compcount) { + MALLOC(fsp->np_components, char **, fsp->np_compcount * sizeof(char*), M_TEMP, M_WAITOK|M_ZERO); + if (!fsp->np_components) + error = ENOMEM; + } + for (comp = 0; comp < fsp->np_compcount; comp++) { + xb_get_32(error, &xb, val); /* component length */ + /* sanity check component length */ + if (!error && (val == 0)) { + /* + * Apparently some people think a path with zero components should + * be encoded with one zero-length component. So, just ignore any + * zero length components. 
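+ * (For instance, an args buffer that encodes a component count of 1
+ * followed by a single zero-length component is treated the same as
+ * a component count of 0, i.e. the server's root directory.)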
+ */ + comp--; + fsp->np_compcount--; + if (fsp->np_compcount == 0) { + FREE(fsp->np_components, M_TEMP); + fsp->np_components = NULL; + } + continue; + } + if (!error && ((val < 1) || (val > MAXPATHLEN))) + error = EINVAL; + nfsmerr_if(error); + MALLOC(fsp->np_components[comp], char *, val+1, M_TEMP, M_WAITOK|M_ZERO); + if (!fsp->np_components[comp]) + error = ENOMEM; + nfsmerr_if(error); + error = xb_get_bytes(&xb, fsp->np_components[comp], val, 0); /* component */ + } + xb_get_32(error, &xb, val); /* fs location info length */ + xb_skip(error, &xb, val); /* skip fs location info */ + } + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MNTFLAGS)) + xb_skip(error, &xb, XDRWORD); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MNTFROM)) { + xb_get_32(error, &xb, len); + nfsmerr_if(error); + val = len; + if (val >= sizeof(vfs_statfs(mp)->f_mntfromname)) + val = sizeof(vfs_statfs(mp)->f_mntfromname) - 1; + error = xb_get_bytes(&xb, vfs_statfs(mp)->f_mntfromname, val, 0); + if ((len - val) > 0) + xb_skip(error, &xb, len - val); + nfsmerr_if(error); + vfs_statfs(mp)->f_mntfromname[val] = '\0'; + } + nfsmerr_if(error); + + /* + * Sanity check/finalize settings. + */ + + if (nmp->nm_timeo < NFS_MINTIMEO) + nmp->nm_timeo = NFS_MINTIMEO; + else if (nmp->nm_timeo > NFS_MAXTIMEO) + nmp->nm_timeo = NFS_MAXTIMEO; + if (nmp->nm_retry > NFS_MAXREXMIT) + nmp->nm_retry = NFS_MAXREXMIT; + + if (nmp->nm_numgrps > NFS_MAXGRPS) + nmp->nm_numgrps = NFS_MAXGRPS; + if (nmp->nm_readahead > NFS_MAXRAHEAD) + nmp->nm_readahead = NFS_MAXRAHEAD; + if (nmp->nm_acregmin > nmp->nm_acregmax) + nmp->nm_acregmin = nmp->nm_acregmax; + if (nmp->nm_acdirmin > nmp->nm_acdirmax) + nmp->nm_acdirmin = nmp->nm_acdirmax; + + /* need at least one fs location */ + if (nmp->nm_locations.nl_numlocs < 1) + error = EINVAL; + nfsmerr_if(error); + + /* init mount's mntfromname to first location */ + if (!NM_OMATTR_GIVEN(nmp, MNTFROM)) + nfs_location_mntfromname(&nmp->nm_locations, firstloc, + vfs_statfs(mp)->f_mntfromname, sizeof(vfs_statfs(mp)->f_mntfromname), 0); + + /* Need to save the mounting credential for v4. */ + nmp->nm_mcred = vfs_context_ucred(ctx); + if (IS_VALID_CRED(nmp->nm_mcred)) + kauth_cred_ref(nmp->nm_mcred); + + /* + * If a reserved port is required, check for that privilege. + * (Note that mirror mounts are exempt because the privilege was + * already checked for the original mount.) 
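+ * Binding a reserved port (one below IPPORT_RESERVED, 1024) is a
+ * privileged operation, so a non-kernel mount that requests
+ * NFS_MFLAG_RESVPORT must hold PRIV_NETINET_RESERVEDPORT.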
+ */ + if (NMFLAG(nmp, RESVPORT) && !vfs_iskernelmount(mp)) + error = priv_check_cred(nmp->nm_mcred, PRIV_NETINET_RESERVEDPORT, 0); + nfsmerr_if(error); + + /* do mount's initial socket connection */ + error = nfs_mount_connect(nmp); + nfsmerr_if(error); /* set up the version-specific function tables */ if (nmp->nm_vers < NFS_VER4) @@ -1729,39 +3089,67 @@ mountnfs( else nmp->nm_funcs = &nfs4_funcs; - /* Set up the sockets and related info */ - nmp->nm_sotype = argp->sotype; - nmp->nm_soproto = argp->proto; - if (nmp->nm_sotype == SOCK_DGRAM) - TAILQ_INIT(&nmp->nm_cwndq); - - lck_mtx_unlock(&nmp->nm_lock); - - /* make sure mbuf constants are set up */ - if (!nfs_mbuf_mhlen) - nfs_mbuf_init(); - + /* sanity check settings now that version/connection is set */ + if (nmp->nm_vers == NFS_VER2) /* ignore RDIRPLUS on NFSv2 */ + NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_RDIRPLUS); if (nmp->nm_vers >= NFS_VER4) { - struct timeval now; - microtime(&now); - nmp->nm_mounttime = ((uint64_t)now.tv_sec << 32) | now.tv_usec; - nmp->nm_mcred = vfs_context_ucred(ctx); + if (NFS_BITMAP_ISSET(nmp->nm_flags, NFS_MFLAG_ACLONLY)) /* aclonly trumps noacl */ + NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_NOACL); + NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_CALLUMNT); + if (nmp->nm_lockmode != NFS_LOCK_MODE_ENABLED) + error = EINVAL; /* disabled/local lock mode only allowed on v2/v3 */ + } else { + /* ignore these if not v4 */ + NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_NOCALLBACK); + NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_NONAMEDATTR); + NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_NOACL); + NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_ACLONLY); if (IS_VALID_CRED(nmp->nm_mcred)) - kauth_cred_ref(nmp->nm_mcred); - nfs4_mount_callback_setup(nmp); + kauth_cred_unref(&nmp->nm_mcred); + } + nfsmerr_if(error); + + if (nmp->nm_sotype == SOCK_DGRAM) { + /* I/O size defaults for UDP are different */ + if (!NFS_BITMAP_ISSET(mattrs, NFS_MATTR_READ_SIZE)) + nmp->nm_rsize = NFS_DGRAM_RSIZE; + if (!NFS_BITMAP_ISSET(mattrs, NFS_MATTR_WRITE_SIZE)) + nmp->nm_wsize = NFS_DGRAM_WSIZE; } - /* set up the socket */ - if ((error = nfs_connect(nmp, 1))) - goto bad; + /* round down I/O sizes to multiple of NFS_FABLKSIZE */ + nmp->nm_rsize &= ~(NFS_FABLKSIZE - 1); + if (nmp->nm_rsize <= 0) + nmp->nm_rsize = NFS_FABLKSIZE; + nmp->nm_wsize &= ~(NFS_FABLKSIZE - 1); + if (nmp->nm_wsize <= 0) + nmp->nm_wsize = NFS_FABLKSIZE; + + /* and limit I/O sizes to maximum allowed */ + maxio = (nmp->nm_vers == NFS_VER2) ? NFS_V2MAXDATA : + (nmp->nm_sotype == SOCK_DGRAM) ? NFS_MAXDGRAMDATA : NFS_MAXDATA; + if (maxio > NFS_MAXBSIZE) + maxio = NFS_MAXBSIZE; + if (nmp->nm_rsize > maxio) + nmp->nm_rsize = maxio; + if (nmp->nm_wsize > maxio) + nmp->nm_wsize = maxio; + + if (nmp->nm_readdirsize > maxio) + nmp->nm_readdirsize = maxio; + if (nmp->nm_readdirsize > nmp->nm_rsize) + nmp->nm_readdirsize = nmp->nm_rsize; + + /* Set up the sockets and related info */ + if (nmp->nm_sotype == SOCK_DGRAM) + TAILQ_INIT(&nmp->nm_cwndq); /* * Get the root node/attributes from the NFS server and * do any basic, version-specific setup. 
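 * (nf_mount resolves to nfs3_mount() for NFS v2/v3, which does a
 * GETATTR on the file handle from the mount args, or to nfs4_mount()
 * for v4, which walks the server-side path component by component as
 * above.)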
*/ - error = nmp->nm_funcs->nf_mount(nmp, ctx, argp, &np); - if (error) - goto bad; + error = nmp->nm_funcs->nf_mount(nmp, ctx, &np); + nfsmerr_if(error); /* * A reference count is needed on the node representing the @@ -1776,7 +3164,7 @@ mountnfs( vnode_put(*vpp); if (error) { vnode_recycle(*vpp); - goto bad; + goto nfsmerr; } /* @@ -1788,151 +3176,877 @@ mountnfs( if (!error2) vnode_put(*vpp); vnode_recycle(*vpp); - goto bad; + goto nfsmerr; + } + sbp = vfs_statfs(mp); + sbp->f_bsize = nmp->nm_fsattr.nfsa_bsize; + sbp->f_blocks = nmp->nm_fsattr.nfsa_space_total / sbp->f_bsize; + sbp->f_bfree = nmp->nm_fsattr.nfsa_space_free / sbp->f_bsize; + sbp->f_bavail = nmp->nm_fsattr.nfsa_space_avail / sbp->f_bsize; + sbp->f_bused = (nmp->nm_fsattr.nfsa_space_total / sbp->f_bsize) - + (nmp->nm_fsattr.nfsa_space_free / sbp->f_bsize); + sbp->f_files = nmp->nm_fsattr.nfsa_files_total; + sbp->f_ffree = nmp->nm_fsattr.nfsa_files_free; + sbp->f_iosize = nfs_iosize; + + /* + * Calculate the size used for I/O buffers. Use the larger + * of the two sizes to minimise NFS requests but make sure + * that it is at least one VM page to avoid wasting buffer + * space and to allow easy mmapping of I/O buffers. + * The read/write RPC calls handle the splitting up of + * buffers into multiple requests if the buffer size is + * larger than the I/O size. + */ + iosize = max(nmp->nm_rsize, nmp->nm_wsize); + if (iosize < PAGE_SIZE) + iosize = PAGE_SIZE; + nmp->nm_biosize = trunc_page_32(iosize); + + /* For NFSv3 and greater, there is a (relatively) reliable ACCESS call. */ + if (nmp->nm_vers > NFS_VER2) + vfs_setauthopaqueaccess(mp); + + switch (nmp->nm_lockmode) { + case NFS_LOCK_MODE_DISABLED: + break; + case NFS_LOCK_MODE_LOCAL: + vfs_setlocklocal(nmp->nm_mountp); + break; + case NFS_LOCK_MODE_ENABLED: + default: + if (nmp->nm_vers <= NFS_VER3) + nfs_lockd_mount_register(nmp); + break; + } + + /* success! */ + lck_mtx_lock(&nmp->nm_lock); + nmp->nm_state |= NFSSTA_MOUNTED; + lck_mtx_unlock(&nmp->nm_lock); + return (0); +nfsmerr: + nfs_mount_cleanup(nmp); + return (error); +} + +#if CONFIG_TRIGGERS + +/* + * We've detected a file system boundary on the server and + * need to mount a new file system so that our file systems + * MIRROR the file systems on the server. + * + * Build the mount arguments for the new mount and call kernel_mount(). 
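[Editor's note: the buffer-size comment above describes picking the larger of the read and write RPC sizes, raising it to at least one VM page, and truncating to a page boundary for nm_biosize. A hedged sketch of that computation; PAGE_SZ is an assumed 4096-byte page, where the kernel uses PAGE_SIZE and trunc_page_32().]

#include <stdint.h>
#include <stdio.h>

#define PAGE_SZ 4096u   /* assumed page size */

/* Mirror of the nm_biosize computation: use the larger of the two RPC
 * sizes so fewer requests are needed, but keep the buffer page-aligned
 * and at least one page so it can be mmapped cheaply. */
static uint32_t calc_biosize(uint32_t rsize, uint32_t wsize)
{
    uint32_t iosize = (rsize > wsize) ? rsize : wsize;
    if (iosize < PAGE_SZ)
        iosize = PAGE_SZ;
    return iosize & ~(PAGE_SZ - 1);  /* trunc_page_32() equivalent */
}

int main(void)
{
    printf("%u\n", calc_biosize(8192, 16384)); /* -> 16384 */
    printf("%u\n", calc_biosize(512, 1024));   /* -> 4096 */
    return 0;
}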
+ */ +int +nfs_mirror_mount_domount(vnode_t dvp, vnode_t vp, vfs_context_t ctx) +{ + nfsnode_t np = VTONFS(vp); + nfsnode_t dnp = VTONFS(dvp); + struct nfsmount *nmp = NFSTONMP(np); + char fstype[MFSTYPENAMELEN], *mntfromname = NULL, *path = NULL, *relpath, *p, *cp; + int error = 0, pathbuflen = MAXPATHLEN, i, mntflags = 0, referral, skipcopy = 0; + size_t nlen; + struct xdrbuf xb, xbnew; + uint32_t mattrs[NFS_MATTR_BITMAP_LEN]; + uint32_t newmattrs[NFS_MATTR_BITMAP_LEN]; + uint32_t newmflags[NFS_MFLAG_BITMAP_LEN]; + uint32_t newmflags_mask[NFS_MFLAG_BITMAP_LEN]; + uint32_t argslength = 0, val, count, mlen, mlen2, rlen, relpathcomps; + uint32_t argslength_offset, attrslength_offset, end_offset; + uint32_t numlocs, loc, numserv, serv, numaddr, addr, numcomp, comp; + char buf[XDRWORD]; + struct nfs_fs_locations nfsls; + + referral = (np->n_vattr.nva_flags & NFS_FFLAG_TRIGGER_REFERRAL); + if (referral) + bzero(&nfsls, sizeof(nfsls)); + + xb_init(&xbnew, 0); + + if (!nmp || (nmp->nm_state & NFSSTA_FORCE)) + return (ENXIO); + + /* allocate a couple path buffers we need */ + MALLOC_ZONE(mntfromname, char *, pathbuflen, M_NAMEI, M_WAITOK); + if (!mntfromname) { + error = ENOMEM; + goto nfsmerr; + } + MALLOC_ZONE(path, char *, pathbuflen, M_NAMEI, M_WAITOK); + if (!path) { + error = ENOMEM; + goto nfsmerr; + } + + /* get the path for the directory being mounted on */ + error = vn_getpath(vp, path, &pathbuflen); + if (error) { + error = ENOMEM; + goto nfsmerr; + } + + /* + * Set up the mntfromname for the new mount based on the + * current mount's mntfromname and the directory's path + * relative to the current mount's mntonname. + * Set up relpath to point at the relative path on the current mount. + * Also, count the number of components in relpath. + * We'll be adding those to each fs location path in the new args. 
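[Editor's note: the comment above describes deriving the path relative to the current mount and counting its components, which the loop just below implements. A user-space sketch of that counting loop, with the same skip-slashes/scan-component structure, tolerant of leading and repeated slashes:]

#include <stdio.h>

/* Count '/'-separated components, tolerating repeated and leading
 * slashes, in the style of the relpathcomps loop below. */
static unsigned count_components(const char *p)
{
    unsigned n = 0;
    while (*p == '/')
        p++;
    while (*p) {
        n++;
        while (*p && *p != '/')
            p++;
        while (*p == '/')
            p++;
    }
    return n;
}

int main(void)
{
    printf("%u\n", count_components("/a//b/c/")); /* -> 3 */
    return 0;
}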
+ */ + nlen = strlcpy(mntfromname, vfs_statfs(nmp->nm_mountp)->f_mntfromname, MAXPATHLEN); + if ((nlen > 0) && (mntfromname[nlen-1] == '/')) { /* avoid double '/' in new name */ + mntfromname[nlen-1] = '\0'; + nlen--; + } + relpath = mntfromname + nlen; + nlen = strlcat(mntfromname, path + strlen(vfs_statfs(nmp->nm_mountp)->f_mntonname), MAXPATHLEN); + if (nlen >= MAXPATHLEN) { + error = ENAMETOOLONG; + goto nfsmerr; + } + /* count the number of components in relpath */ + p = relpath; + while (*p && (*p == '/')) + p++; + relpathcomps = 0; + while (*p) { + relpathcomps++; + while (*p && (*p != '/')) + p++; + while (*p && (*p == '/')) + p++; + } + + /* grab a copy of the file system type */ + vfs_name(vnode_mount(vp), fstype); + + /* for referrals, fetch the fs locations */ + if (referral) { + const char *vname = vnode_getname(NFSTOV(np)); + if (!vname) { + error = ENOENT; + } else { + error = nfs4_get_fs_locations(nmp, dnp, NULL, 0, vname, ctx, &nfsls); + vnode_putname(vname); + if (!error && (nfsls.nl_numlocs < 1)) + error = ENOENT; + } + nfsmerr_if(error); + } + + /* set up NFS mount args based on current mount args */ + +#define xb_copy_32(E, XBSRC, XBDST, V) \ + do { \ + if (E) break; \ + xb_get_32((E), (XBSRC), (V)); \ + if (skipcopy) break; \ + xb_add_32((E), (XBDST), (V)); \ + } while (0) +#define xb_copy_opaque(E, XBSRC, XBDST) \ + do { \ + uint32_t __count, __val; \ + xb_copy_32((E), (XBSRC), (XBDST), __count); \ + if (E) break; \ + __count = nfsm_rndup(__count); \ + __count /= XDRWORD; \ + while (__count-- > 0) \ + xb_copy_32((E), (XBSRC), (XBDST), __val); \ + } while (0) + + xb_init_buffer(&xb, nmp->nm_args, 2*XDRWORD); + xb_get_32(error, &xb, val); /* version */ + xb_get_32(error, &xb, argslength); /* args length */ + xb_init_buffer(&xb, nmp->nm_args, argslength); + + xb_init_buffer(&xbnew, NULL, 0); + xb_copy_32(error, &xb, &xbnew, val); /* version */ + argslength_offset = xb_offset(&xbnew); + xb_copy_32(error, &xb, &xbnew, val); /* args length */ + xb_copy_32(error, &xb, &xbnew, val); /* XDR args version */ + count = NFS_MATTR_BITMAP_LEN; + xb_get_bitmap(error, &xb, mattrs, count); /* mount attribute bitmap */ + nfsmerr_if(error); + for (i = 0; i < NFS_MATTR_BITMAP_LEN; i++) + newmattrs[i] = mattrs[i]; + if (referral) + NFS_BITMAP_SET(newmattrs, NFS_MATTR_FS_LOCATIONS); + else + NFS_BITMAP_SET(newmattrs, NFS_MATTR_FH); + NFS_BITMAP_SET(newmattrs, NFS_MATTR_FLAGS); + NFS_BITMAP_SET(newmattrs, NFS_MATTR_MNTFLAGS); + NFS_BITMAP_CLR(newmattrs, NFS_MATTR_MNTFROM); + xb_add_bitmap(error, &xbnew, newmattrs, NFS_MATTR_BITMAP_LEN); + attrslength_offset = xb_offset(&xbnew); + xb_copy_32(error, &xb, &xbnew, val); /* attrs length */ + NFS_BITMAP_ZERO(newmflags_mask, NFS_MFLAG_BITMAP_LEN); + NFS_BITMAP_ZERO(newmflags, NFS_MFLAG_BITMAP_LEN); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_FLAGS)) { + count = NFS_MFLAG_BITMAP_LEN; + xb_get_bitmap(error, &xb, newmflags_mask, count); /* mount flag mask bitmap */ + count = NFS_MFLAG_BITMAP_LEN; + xb_get_bitmap(error, &xb, newmflags, count); /* mount flag bitmap */ + } + NFS_BITMAP_SET(newmflags_mask, NFS_MFLAG_EPHEMERAL); + NFS_BITMAP_SET(newmflags, NFS_MFLAG_EPHEMERAL); + xb_add_bitmap(error, &xbnew, newmflags_mask, NFS_MFLAG_BITMAP_LEN); + xb_add_bitmap(error, &xbnew, newmflags, NFS_MFLAG_BITMAP_LEN); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_NFS_VERSION)) + xb_copy_32(error, &xb, &xbnew, val); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_NFS_MINOR_VERSION)) + xb_copy_32(error, &xb, &xbnew, val); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_READ_SIZE)) + 
xb_copy_32(error, &xb, &xbnew, val); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_WRITE_SIZE)) + xb_copy_32(error, &xb, &xbnew, val); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_READDIR_SIZE)) + xb_copy_32(error, &xb, &xbnew, val); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_READAHEAD)) + xb_copy_32(error, &xb, &xbnew, val); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_ATTRCACHE_REG_MIN)) { + xb_copy_32(error, &xb, &xbnew, val); + xb_copy_32(error, &xb, &xbnew, val); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_ATTRCACHE_REG_MAX)) { + xb_copy_32(error, &xb, &xbnew, val); + xb_copy_32(error, &xb, &xbnew, val); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_ATTRCACHE_DIR_MIN)) { + xb_copy_32(error, &xb, &xbnew, val); + xb_copy_32(error, &xb, &xbnew, val); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_ATTRCACHE_DIR_MAX)) { + xb_copy_32(error, &xb, &xbnew, val); + xb_copy_32(error, &xb, &xbnew, val); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_LOCK_MODE)) + xb_copy_32(error, &xb, &xbnew, val); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SECURITY)) { + xb_copy_32(error, &xb, &xbnew, count); + while (!error && (count-- > 0)) + xb_copy_32(error, &xb, &xbnew, val); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MAX_GROUP_LIST)) + xb_copy_32(error, &xb, &xbnew, val); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SOCKET_TYPE)) + xb_copy_opaque(error, &xb, &xbnew); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_NFS_PORT)) + xb_copy_32(error, &xb, &xbnew, val); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MOUNT_PORT)) + xb_copy_32(error, &xb, &xbnew, val); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_REQUEST_TIMEOUT)) { + xb_copy_32(error, &xb, &xbnew, val); + xb_copy_32(error, &xb, &xbnew, val); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_SOFT_RETRY_COUNT)) + xb_copy_32(error, &xb, &xbnew, val); + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_DEAD_TIMEOUT)) { + xb_copy_32(error, &xb, &xbnew, val); + xb_copy_32(error, &xb, &xbnew, val); + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_FH)) { + xb_get_32(error, &xb, count); + xb_skip(error, &xb, count); + } + if (!referral) { + /* set the initial file handle to the directory's file handle */ + xb_add_fh(error, &xbnew, np->n_fhp, np->n_fhsize); + } + /* copy/extend/skip fs locations */ + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_FS_LOCATIONS)) { + numlocs = numserv = numaddr = numcomp = 0; + if (referral) /* don't copy the fs locations for a referral */ + skipcopy = 1; + xb_copy_32(error, &xb, &xbnew, numlocs); /* location count */ + for (loc = 0; !error && (loc < numlocs); loc++) { + xb_copy_32(error, &xb, &xbnew, numserv); /* server count */ + for (serv = 0; !error && (serv < numserv); serv++) { + xb_copy_opaque(error, &xb, &xbnew); /* server name */ + xb_copy_32(error, &xb, &xbnew, numaddr); /* address count */ + for (addr = 0; !error && (addr < numaddr); addr++) + xb_copy_opaque(error, &xb, &xbnew); /* address */ + xb_copy_opaque(error, &xb, &xbnew); /* server info */ + } + /* pathname */ + xb_get_32(error, &xb, numcomp); /* component count */ + if (!skipcopy) + xb_add_32(error, &xbnew, numcomp+relpathcomps); /* new component count */ + for (comp = 0; !error && (comp < numcomp); comp++) + xb_copy_opaque(error, &xb, &xbnew); /* component */ + /* add additional components */ + for (comp = 0; !skipcopy && !error && (comp < relpathcomps); comp++) { + p = relpath; + while (*p && (*p == '/')) + p++; + while (*p && !error) { + cp = p; + while (*p && (*p != '/')) + p++; + xb_add_string(error, &xbnew, cp, (p - cp)); /* component */ + while (*p && (*p == '/')) + p++; + } + } + xb_copy_opaque(error, &xb, 
&xbnew); /* fs location info */ + } + if (referral) + skipcopy = 0; + } + if (referral) { + /* add referral's fs locations */ + xb_add_32(error, &xbnew, nfsls.nl_numlocs); /* FS_LOCATIONS */ + for (loc = 0; !error && (loc < nfsls.nl_numlocs); loc++) { + xb_add_32(error, &xbnew, nfsls.nl_locations[loc]->nl_servcount); + for (serv = 0; !error && (serv < nfsls.nl_locations[loc]->nl_servcount); serv++) { + xb_add_string(error, &xbnew, nfsls.nl_locations[loc]->nl_servers[serv]->ns_name, + strlen(nfsls.nl_locations[loc]->nl_servers[serv]->ns_name)); + xb_add_32(error, &xbnew, nfsls.nl_locations[loc]->nl_servers[serv]->ns_addrcount); + for (addr = 0; !error && (addr < nfsls.nl_locations[loc]->nl_servers[serv]->ns_addrcount); addr++) + xb_add_string(error, &xbnew, nfsls.nl_locations[loc]->nl_servers[serv]->ns_addresses[addr], + strlen(nfsls.nl_locations[loc]->nl_servers[serv]->ns_addresses[addr])); + xb_add_32(error, &xbnew, 0); /* empty server info */ + } + xb_add_32(error, &xbnew, nfsls.nl_locations[loc]->nl_path.np_compcount); + for (comp = 0; !error && (comp < nfsls.nl_locations[loc]->nl_path.np_compcount); comp++) + xb_add_string(error, &xbnew, nfsls.nl_locations[loc]->nl_path.np_components[comp], + strlen(nfsls.nl_locations[loc]->nl_path.np_components[comp])); + xb_add_32(error, &xbnew, 0); /* empty fs location info */ + } + } + if (NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MNTFLAGS)) + xb_get_32(error, &xb, mntflags); + /* + * We add the following mount flags to the ones for the mounted-on mount: + * MNT_DONTBROWSE - to keep the mount from showing up as a separate volume + * MNT_AUTOMOUNTED - to keep DiskArb from retriggering the mount after + * an unmount (looking for /.autodiskmounted) + */ + mntflags |= (MNT_AUTOMOUNTED | MNT_DONTBROWSE); + xb_add_32(error, &xbnew, mntflags); + if (!referral && NFS_BITMAP_ISSET(mattrs, NFS_MATTR_MNTFROM)) { + /* copy mntfrom string and add relpath */ + rlen = strlen(relpath); + xb_get_32(error, &xb, mlen); + nfsmerr_if(error); + mlen2 = mlen + ((relpath[0] != '/') ? 1 : 0) + rlen; + xb_add_32(error, &xbnew, mlen2); + count = mlen/XDRWORD; + /* copy the original string */ + while (count-- > 0) + xb_copy_32(error, &xb, &xbnew, val); + if (!error && (mlen % XDRWORD)) { + error = xb_get_bytes(&xb, buf, mlen%XDRWORD, 0); + if (!error) + error = xb_add_bytes(&xbnew, buf, mlen%XDRWORD, 1); + } + /* insert a '/' if the relative path doesn't start with one */ + if (!error && (relpath[0] != '/')) { + buf[0] = '/'; + error = xb_add_bytes(&xbnew, buf, 1, 1); + } + /* add the additional relative path */ + if (!error) + error = xb_add_bytes(&xbnew, relpath, rlen, 1); + /* make sure the resulting string has the right number of pad bytes */ + if (!error && (mlen2 != nfsm_rndup(mlen2))) { + bzero(buf, sizeof(buf)); + count = nfsm_rndup(mlen2) - mlen2; + error = xb_add_bytes(&xbnew, buf, count, 1); + } + } + xb_build_done(error, &xbnew); + + /* update opaque counts */ + end_offset = xb_offset(&xbnew); + if (!error) { + error = xb_seek(&xbnew, argslength_offset); + argslength = end_offset - argslength_offset + XDRWORD/*version*/; + xb_add_32(error, &xbnew, argslength); + } + if (!error) { + error = xb_seek(&xbnew, attrslength_offset); + xb_add_32(error, &xbnew, end_offset - attrslength_offset - XDRWORD/*don't include length field*/); + } + nfsmerr_if(error); + + /* + * For kernel_mount() call, use the existing mount flags (instead of the + * original flags) because flags like MNT_NOSUID and MNT_NODEV may have + * been silently enforced. 
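[Editor's note: both in this function and in nfs_mountinfo_assemble() further down, variable-length XDR aggregates are emitted with placeholder length words whose offsets are recorded, then back-patched once the end offset is known. A minimal sketch of that record-offset/seek-back pattern over a plain byte buffer; the xb_* names belong to the kernel, and this stand-alone version only illustrates the idea.]

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static unsigned char buf[64];
static size_t off;

static size_t add_32(uint32_t v)        /* append one XDR word */
{
    uint32_t be = htonl(v);
    size_t at = off;
    memcpy(buf + off, &be, 4);
    off += 4;
    return at;
}

int main(void)
{
    size_t len_at = add_32(0);           /* placeholder length word */
    add_32(7);                           /* ...payload words... */
    add_32(42);
    /* back-patch, like xb_seek() + xb_add_32(); the length field
     * itself is not counted, matching the "don't include length
     * field" adjustment above */
    uint32_t be = htonl((uint32_t)(off - len_at - 4));
    memcpy(buf + len_at, &be, 4);
    printf("payload length = %u bytes\n", ntohl(be));
    return 0;
}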
+ */ + mntflags = vnode_vfsvisflags(vp); + mntflags |= (MNT_AUTOMOUNTED | MNT_DONTBROWSE); + + /* do the mount */ + error = kernel_mount(fstype, dvp, vp, path, xb_buffer_base(&xbnew), argslength, + mntflags, KERNEL_MOUNT_PERMIT_UNMOUNT | KERNEL_MOUNT_NOAUTH, ctx); + +nfsmerr: + if (error) + printf("nfs: mirror mount of %s on %s failed (%d)\n", + mntfromname, path, error); + /* clean up */ + xb_cleanup(&xbnew); + if (referral) + nfs_fs_locations_cleanup(&nfsls); + if (path) + FREE_ZONE(path, MAXPATHLEN, M_NAMEI); + if (mntfromname) + FREE_ZONE(mntfromname, MAXPATHLEN, M_NAMEI); + if (!error) + nfs_ephemeral_mount_harvester_start(); + return (error); +} + +/* + * trigger vnode functions + */ + +resolver_result_t +nfs_mirror_mount_trigger_resolve( + vnode_t vp, + const struct componentname *cnp, + enum path_operation pop, + __unused int flags, + __unused void *data, + vfs_context_t ctx) +{ + nfsnode_t np = VTONFS(vp); + vnode_t pvp = NULLVP; + int error = 0; + resolver_result_t result; + + /* + * We have a trigger node that doesn't have anything mounted on it yet. + * We'll do the mount if either: + * (a) this isn't the last component of the path OR + * (b) this is an op that looks like it should trigger the mount. + */ + if (cnp->cn_flags & ISLASTCN) { + switch (pop) { + case OP_MOUNT: + case OP_UNMOUNT: + case OP_STATFS: + case OP_LINK: + case OP_UNLINK: + case OP_RENAME: + case OP_MKNOD: + case OP_MKFIFO: + case OP_SYMLINK: + case OP_ACCESS: + case OP_GETATTR: + case OP_MKDIR: + case OP_RMDIR: + case OP_REVOKE: + case OP_GETXATTR: + case OP_LISTXATTR: + /* don't perform the mount for these operations */ + result = vfs_resolver_result(np->n_trigseq, RESOLVER_NOCHANGE, 0); +#ifdef NFS_TRIGGER_DEBUG + NP(np, "nfs trigger RESOLVE: no change, last %d nameiop %d, seq %d", + (cnp->cn_flags & ISLASTCN) ? 1 : 0, cnp->cn_nameiop, np->n_trigseq); +#endif + return (result); + case OP_OPEN: + case OP_CHDIR: + case OP_CHROOT: + case OP_TRUNCATE: + case OP_COPYFILE: + case OP_PATHCONF: + case OP_READLINK: + case OP_SETATTR: + case OP_EXCHANGEDATA: + case OP_SEARCHFS: + case OP_FSCTL: + case OP_SETXATTR: + case OP_REMOVEXATTR: + default: + /* go ahead and do the mount */ + break; + } + } + + if (vnode_mountedhere(vp) != NULL) { + /* + * Um... there's already something mounted. + * Been there. Done that. Let's just say it succeeded. + */ + error = 0; + goto skipmount; + } + + if ((error = nfs_node_set_busy(np, vfs_context_thread(ctx)))) { + result = vfs_resolver_result(np->n_trigseq, RESOLVER_ERROR, error); +#ifdef NFS_TRIGGER_DEBUG + NP(np, "nfs trigger RESOLVE: busy error %d, last %d nameiop %d, seq %d", + error, (cnp->cn_flags & ISLASTCN) ? 1 : 0, cnp->cn_nameiop, np->n_trigseq); +#endif + return (result); + } + + pvp = vnode_getparent(vp); + if (pvp == NULLVP) + error = EINVAL; + if (!error) + error = nfs_mirror_mount_domount(pvp, vp, ctx); +skipmount: + if (!error) + np->n_trigseq++; + result = vfs_resolver_result(np->n_trigseq, error ? RESOLVER_ERROR : RESOLVER_RESOLVED, error); +#ifdef NFS_TRIGGER_DEBUG + NP(np, "nfs trigger RESOLVE: %s %d, last %d nameiop %d, seq %d", + error ? "error" : "resolved", error, + (cnp->cn_flags & ISLASTCN) ? 
1 : 0, cnp->cn_nameiop, np->n_trigseq); +#endif + + if (pvp != NULLVP) + vnode_put(pvp); + nfs_node_clear_busy(np); + return (result); +} + +resolver_result_t +nfs_mirror_mount_trigger_unresolve( + vnode_t vp, + int flags, + __unused void *data, + vfs_context_t ctx) +{ + nfsnode_t np = VTONFS(vp); + mount_t mp; + int error; + resolver_result_t result; + + if ((error = nfs_node_set_busy(np, vfs_context_thread(ctx)))) { + result = vfs_resolver_result(np->n_trigseq, RESOLVER_ERROR, error); +#ifdef NFS_TRIGGER_DEBUG + NP(np, "nfs trigger UNRESOLVE: busy error %d, seq %d", error, np->n_trigseq); +#endif + return (result); + } + + mp = vnode_mountedhere(vp); + if (!mp) + error = EINVAL; + if (!error) + error = vfs_unmountbyfsid(&(vfs_statfs(mp)->f_fsid), flags, ctx); + if (!error) + np->n_trigseq++; + result = vfs_resolver_result(np->n_trigseq, error ? RESOLVER_ERROR : RESOLVER_UNRESOLVED, error); +#ifdef NFS_TRIGGER_DEBUG + NP(np, "nfs trigger UNRESOLVE: %s %d, seq %d", + error ? "error" : "unresolved", error, np->n_trigseq); +#endif + nfs_node_clear_busy(np); + return (result); +} + +resolver_result_t +nfs_mirror_mount_trigger_rearm( + vnode_t vp, + __unused int flags, + __unused void *data, + vfs_context_t ctx) +{ + nfsnode_t np = VTONFS(vp); + int error; + resolver_result_t result; + + if ((error = nfs_node_set_busy(np, vfs_context_thread(ctx)))) { + result = vfs_resolver_result(np->n_trigseq, RESOLVER_ERROR, error); +#ifdef NFS_TRIGGER_DEBUG + NP(np, "nfs trigger REARM: busy error %d, seq %d", error, np->n_trigseq); +#endif + return (result); + } + + np->n_trigseq++; + result = vfs_resolver_result(np->n_trigseq, + vnode_mountedhere(vp) ? RESOLVER_RESOLVED : RESOLVER_UNRESOLVED, 0); +#ifdef NFS_TRIGGER_DEBUG + NP(np, "nfs trigger REARM: %s, seq %d", + vnode_mountedhere(vp) ? "resolved" : "unresolved", np->n_trigseq); +#endif + nfs_node_clear_busy(np); + return (result); +} + +/* + * Periodically attempt to unmount ephemeral (mirror) mounts in an attempt to limit + * the number of unused mounts. 
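[Editor's note: each trigger operation above bumps the node's n_trigseq and folds it into the resolver result, letting the VFS layer detect a stale resolve that raced with a newer unresolve. A hedged sketch of that sequence-stamped status pattern; the types are simplified mocks, and vfs_resolver_result() is the real API this imitates.]

#include <stdio.h>

/* Simplified stand-in for resolver_result_t: a status stamped with the
 * sequence number current at the time it was produced. */
struct result { unsigned seq; int status; };

static unsigned trigseq;   /* per-node sequence, like np->n_trigseq */

static struct result make_result(int status)
{
    struct result r = { trigseq, status };
    return r;
}

int main(void)
{
    struct result r1 = make_result(0);   /* resolve in flight */
    trigseq++;                           /* an unresolve happened since */
    struct result r2 = make_result(0);
    printf("r1 stale? %s\n", (r1.seq != trigseq) ? "yes" : "no");
    printf("r2 stale? %s\n", (r2.seq != trigseq) ? "yes" : "no");
    return 0;
}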
+ */ + +#define NFS_EPHEMERAL_MOUNT_HARVEST_INTERVAL 120 /* how often the harvester runs */ +struct nfs_ephemeral_mount_harvester_info { + fsid_t fsid; /* FSID that we need to try to unmount */ + uint32_t mountcount; /* count of ephemeral mounts seen in scan */ + }; +/* various globals for the harvester */ +static thread_call_t nfs_ephemeral_mount_harvester_timer = NULL; +static int nfs_ephemeral_mount_harvester_on = 0; + +kern_return_t thread_terminate(thread_t); + +static int +nfs_ephemeral_mount_harvester_callback(mount_t mp, void *arg) +{ + struct nfs_ephemeral_mount_harvester_info *hinfo = arg; + struct nfsmount *nmp; + struct timeval now; + + if (strcmp(mp->mnt_vfsstat.f_fstypename, "nfs")) + return (VFS_RETURNED); + nmp = VFSTONFS(mp); + if (!nmp || !NMFLAG(nmp, EPHEMERAL)) + return (VFS_RETURNED); + hinfo->mountcount++; + + /* avoid unmounting mounts that have been triggered within the last harvest interval */ + microtime(&now); + if ((nmp->nm_mounttime >> 32) > ((uint32_t)now.tv_sec - NFS_EPHEMERAL_MOUNT_HARVEST_INTERVAL)) + return (VFS_RETURNED); + + if (hinfo->fsid.val[0] || hinfo->fsid.val[1]) { + /* attempt to unmount previously-found ephemeral mount */ + vfs_unmountbyfsid(&hinfo->fsid, 0, vfs_context_kernel()); + hinfo->fsid.val[0] = hinfo->fsid.val[1] = 0; } - sbp = vfs_statfs(mp); - sbp->f_bsize = nmp->nm_fsattr.nfsa_bsize; - sbp->f_blocks = nmp->nm_fsattr.nfsa_space_total / sbp->f_bsize; - sbp->f_bfree = nmp->nm_fsattr.nfsa_space_free / sbp->f_bsize; - sbp->f_bavail = nmp->nm_fsattr.nfsa_space_avail / sbp->f_bsize; - sbp->f_bused = (nmp->nm_fsattr.nfsa_space_total / sbp->f_bsize) - - (nmp->nm_fsattr.nfsa_space_free / sbp->f_bsize); - sbp->f_files = nmp->nm_fsattr.nfsa_files_total; - sbp->f_ffree = nmp->nm_fsattr.nfsa_files_free; - sbp->f_iosize = nfs_iosize; /* - * Calculate the size used for I/O buffers. Use the larger - * of the two sizes to minimise NFS requests but make sure - * that it is at least one VM page to avoid wasting buffer - * space and to allow easy mmapping of I/O buffers. - * The read/write RPC calls handle the splitting up of - * buffers into multiple requests if the buffer size is - * larger than the I/O size. + * We can't call unmount here since we hold a mount iter ref + * on mp so save its fsid for the next call iteration to unmount. */ - iosize = max(nmp->nm_rsize, nmp->nm_wsize); - if (iosize < PAGE_SIZE) - iosize = PAGE_SIZE; - nmp->nm_biosize = trunc_page_32(iosize); + hinfo->fsid.val[0] = mp->mnt_vfsstat.f_fsid.val[0]; + hinfo->fsid.val[1] = mp->mnt_vfsstat.f_fsid.val[1]; - /* - * V3 mounts give us a (relatively) reliable remote access(2) - * call, so advertise the fact. - * - * XXX this may not be the best way to go, as the granularity - * offered isn't a good match to our needs. - */ - if (nmp->nm_vers != NFS_VER2) - vfs_setauthopaqueaccess(mp); + return (VFS_RETURNED); +} - if (nmp->nm_flag & NFSMNT_LOCALLOCKS) - vfs_setlocklocal(nmp->nm_mountp); - if (!(nmp->nm_flag & (NFSMNT_NOLOCKS|NFSMNT_LOCALLOCKS))) - nfs_lockd_mount_change(1); +/* + * Spawn a thread to do the ephemeral mount harvesting. 
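[Editor's note: the harvester callback above cannot unmount the mount it is currently visiting, since the iterator holds a reference on it, so it saves the fsid and unmounts that candidate on the next visit, with the final candidate handled after vfs_iterate() returns. A small sketch of that deferred-action iteration pattern, using plain ints as stand-ins for fsids:]

#include <stdio.h>

static int pending = -1;   /* like hinfo.fsid: candidate from previous visit */

static void unmount_id(int id) { printf("unmount %d\n", id); }

/* Visit one item; act on the *previous* candidate, then record this
 * one for later, since we can't act while "holding" the current item. */
static void visit(int id, int is_candidate)
{
    if (pending != -1) {
        unmount_id(pending);
        pending = -1;
    }
    if (is_candidate)
        pending = id;
}

int main(void)
{
    visit(1, 1);
    visit(2, 0);           /* unmounts 1 */
    visit(3, 1);
    if (pending != -1)     /* the post-iteration cleanup pass */
        unmount_id(pending);
    return 0;
}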
+ */ +static void +nfs_ephemeral_mount_harvester_timer_func(void) +{ + thread_t thd; - lck_mtx_lock(&nmp->nm_lock); - nmp->nm_state |= NFSSTA_MOUNTED; - lck_mtx_unlock(&nmp->nm_lock); - return (0); -bad: - /* mark the socket for termination */ - lck_mtx_lock(&nmp->nm_lock); - nmp->nm_sockflags |= NMSOCK_UNMOUNT; - /* wait for any socket poking to complete */ - while (nmp->nm_sockflags & NMSOCK_POKE) - msleep(&nmp->nm_sockflags, &nmp->nm_lock, PZERO-1, "nfswaitpoke", &ts); - /* wait for the socket thread to terminate */ - while (nmp->nm_sockthd) { - wakeup(&nmp->nm_sockthd); - msleep(&nmp->nm_sockthd, &nmp->nm_lock, PZERO-1, "nfswaitsockthd", &ts); + if (kernel_thread_start(nfs_ephemeral_mount_harvester, NULL, &thd) == KERN_SUCCESS) + thread_deallocate(thd); +} + +/* + * Iterate all mounts looking for NFS ephemeral mounts to try to unmount. + */ +void +nfs_ephemeral_mount_harvester(__unused void *arg, __unused wait_result_t wr) +{ + struct nfs_ephemeral_mount_harvester_info hinfo; + uint64_t deadline; + + hinfo.mountcount = 0; + hinfo.fsid.val[0] = hinfo.fsid.val[1] = 0; + vfs_iterate(VFS_ITERATE_TAIL_FIRST, nfs_ephemeral_mount_harvester_callback, &hinfo); + if (hinfo.fsid.val[0] || hinfo.fsid.val[1]) { + /* attempt to unmount last found ephemeral mount */ + vfs_unmountbyfsid(&hinfo.fsid, 0, vfs_context_kernel()); } - /* tear down the socket */ - lck_mtx_unlock(&nmp->nm_lock); - nfs_disconnect(nmp); - if (nmp->nm_vers >= NFS_VER4) { - if (nmp->nm_cbid) - nfs4_mount_callback_shutdown(nmp); - if (nmp->nm_renew_timer) { - thread_call_cancel(nmp->nm_renew_timer); - thread_call_free(nmp->nm_renew_timer); - } - if (nmp->nm_longid) { - /* remove/deallocate the client ID data */ - lck_mtx_lock(nfs_global_mutex); - TAILQ_REMOVE(&nfsclientids, nmp->nm_longid, nci_link); - if (nmp->nm_longid->nci_id) - FREE(nmp->nm_longid->nci_id, M_TEMP); - FREE(nmp->nm_longid, M_TEMP); - lck_mtx_unlock(nfs_global_mutex); - } - if (IS_VALID_CRED(nmp->nm_mcred)) - kauth_cred_unref(&nmp->nm_mcred); + + lck_mtx_lock(nfs_global_mutex); + if (!hinfo.mountcount) { + /* no more ephemeral mounts - don't need timer */ + nfs_ephemeral_mount_harvester_on = 0; + } else { + /* re-arm the timer */ + clock_interval_to_deadline(NFS_EPHEMERAL_MOUNT_HARVEST_INTERVAL, NSEC_PER_SEC, &deadline); + thread_call_enter_delayed(nfs_ephemeral_mount_harvester_timer, deadline); + nfs_ephemeral_mount_harvester_on = 1; } - lck_mtx_destroy(&nmp->nm_lock, nfs_mount_grp); - FREE_ZONE((caddr_t)nmp, sizeof (struct nfsmount), M_NFSMNT); - mbuf_freem(nam); - return (error); + lck_mtx_unlock(nfs_global_mutex); + + /* thread done */ + thread_terminate(current_thread()); } +/* + * Make sure the NFS ephemeral mount harvester timer is running. + */ void -nfs3_umount_rpc(struct nfsmount *nmp, vfs_context_t ctx, int timeo) +nfs_ephemeral_mount_harvester_start(void) +{ + uint64_t deadline; + + lck_mtx_lock(nfs_global_mutex); + if (nfs_ephemeral_mount_harvester_on) { + lck_mtx_unlock(nfs_global_mutex); + return; + } + if (nfs_ephemeral_mount_harvester_timer == NULL) + nfs_ephemeral_mount_harvester_timer = thread_call_allocate((thread_call_func_t)nfs_ephemeral_mount_harvester_timer_func, NULL); + clock_interval_to_deadline(NFS_EPHEMERAL_MOUNT_HARVEST_INTERVAL, NSEC_PER_SEC, &deadline); + thread_call_enter_delayed(nfs_ephemeral_mount_harvester_timer, deadline); + nfs_ephemeral_mount_harvester_on = 1; + lck_mtx_unlock(nfs_global_mutex); +} + +#endif + +/* + * Send a MOUNT protocol MOUNT request to the server to get the initial file handle (and security). 
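[Editor's note: nfs_ephemeral_mount_harvester_start() above is idempotent: a flag checked and set under nfs_global_mutex ensures only one delayed timer is ever armed, and the harvester re-arms it only while ephemeral mounts remain. A pthread-based sketch of that guarded one-shot arming; the names are assumptions, and the kernel uses thread_call_enter_delayed() rather than a printf.]

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int harvester_on;   /* like nfs_ephemeral_mount_harvester_on */

static void arm_timer(void) { printf("timer armed\n"); }

/* Arm the periodic job exactly once; later calls are no-ops until the
 * job itself decides not to re-arm and clears the flag. */
static void harvester_start(void)
{
    pthread_mutex_lock(&lock);
    if (!harvester_on) {
        arm_timer();
        harvester_on = 1;
    }
    pthread_mutex_unlock(&lock);
}

int main(void)
{
    harvester_start();   /* arms */
    harvester_start();   /* no-op */
    return 0;
}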
+ */ +int +nfs3_mount_rpc(struct nfsmount *nmp, struct sockaddr *sa, int sotype, int nfsvers, char *path, vfs_context_t ctx, int timeo, fhandle_t *fh, struct nfs_sec *sec) { - int error = 0, auth_len, slen; + int error = 0, slen, mntproto; thread_t thd = vfs_context_thread(ctx); kauth_cred_t cred = vfs_context_ucred(ctx); - char *path; uint64_t xid = 0; struct nfsm_chain nmreq, nmrep; mbuf_t mreq; - uint32_t mntport = 0; - struct sockaddr *nam = mbuf_data(nmp->nm_nam); - struct sockaddr_in saddr; - - bcopy(nam, &saddr, min(sizeof(saddr), nam->sa_len)); - auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ? - nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) + - 5 * NFSX_UNSIGNED; + uint32_t mntvers, mntport, val; + struct sockaddr_storage ss; + struct sockaddr *saddr = (struct sockaddr*)&ss; + nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); - /* send portmap request to get mountd port */ - saddr.sin_port = htons(PMAPPORT); - nfsm_chain_build_alloc_init(error, &nmreq, 4*NFSX_UNSIGNED); - nfsm_chain_add_32(error, &nmreq, RPCPROG_MNT); - nfsm_chain_add_32(error, &nmreq, RPCMNT_VER1); - nfsm_chain_add_32(error, &nmreq, IPPROTO_UDP); - nfsm_chain_add_32(error, &nmreq, 0); + mntvers = (nfsvers == NFS_VER2) ? RPCMNT_VER1 : RPCMNT_VER3; + mntproto = (NM_OMFLAG(nmp, MNTUDP) || (sotype == SOCK_DGRAM)) ? IPPROTO_UDP : IPPROTO_TCP; + sec->count = 0; + + bcopy(sa, saddr, min(sizeof(ss), sa->sa_len)); + if (saddr->sa_family == AF_INET) { + if (nmp->nm_mountport) + ((struct sockaddr_in*)saddr)->sin_port = htons(nmp->nm_mountport); + mntport = ntohs(((struct sockaddr_in*)saddr)->sin_port); + } else { + if (nmp->nm_mountport) + ((struct sockaddr_in6*)saddr)->sin6_port = htons(nmp->nm_mountport); + mntport = ntohs(((struct sockaddr_in6*)saddr)->sin6_port); + } + + while (!mntport) { + error = nfs_portmap_lookup(nmp, ctx, saddr, NULL, RPCPROG_MNT, mntvers, mntproto, timeo); + nfsmout_if(error); + if (saddr->sa_family == AF_INET) + mntport = ntohs(((struct sockaddr_in*)saddr)->sin_port); + else + mntport = ntohs(((struct sockaddr_in6*)saddr)->sin6_port); + if (!mntport) { + /* if not found and TCP, then retry with UDP */ + if (mntproto == IPPROTO_UDP) { + error = EPROGUNAVAIL; + break; + } + mntproto = IPPROTO_UDP; + bcopy(sa, saddr, min(sizeof(ss), sa->sa_len)); + } + } + nfsmout_if(error || !mntport); + + /* MOUNT protocol MOUNT request */ + slen = strlen(path); + nfsm_chain_build_alloc_init(error, &nmreq, NFSX_UNSIGNED + nfsm_rndup(slen)); + nfsm_chain_add_name(error, &nmreq, path, slen, nmp); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfsm_rpchead2(SOCK_DGRAM, PMAPPROG, PMAPVERS, PMAPPROC_GETPORT, - RPCAUTH_SYS, auth_len, cred, NULL, nmreq.nmc_mhead, &xid, &mreq); + error = nfsm_rpchead2(nmp, (mntproto == IPPROTO_UDP) ? SOCK_DGRAM : SOCK_STREAM, + RPCPROG_MNT, mntvers, RPCMNT_MOUNT, + RPCAUTH_SYS, cred, NULL, nmreq.nmc_mhead, &xid, &mreq); nfsmout_if(error); nmreq.nmc_mhead = NULL; - error = nfs_aux_request(nmp, thd, &saddr, mreq, R_XID32(xid), 0, timeo, &nmrep); - nfsmout_if(error); - - /* grab mountd port from portmap response */ - nfsm_chain_get_32(error, &nmrep, mntport); + error = nfs_aux_request(nmp, thd, saddr, NULL, + ((mntproto == IPPROTO_UDP) ? 
SOCK_DGRAM : SOCK_STREAM), + mreq, R_XID32(xid), 1, timeo, &nmrep); nfsmout_if(error); + nfsm_chain_get_32(error, &nmrep, val); + if (!error && val) + error = val; + nfsm_chain_get_fh(error, &nmrep, nfsvers, fh); + if (!error && (nfsvers > NFS_VER2)) { + sec->count = NX_MAX_SEC_FLAVORS; + error = nfsm_chain_get_secinfo(&nmrep, &sec->flavors[0], &sec->count); + } +nfsmout: nfsm_chain_cleanup(&nmreq); nfsm_chain_cleanup(&nmrep); - xid = 0; + return (error); +} + + +/* + * Send a MOUNT protocol UNMOUNT request to tell the server we've unmounted it. + */ +void +nfs3_umount_rpc(struct nfsmount *nmp, vfs_context_t ctx, int timeo) +{ + int error = 0, slen, mntproto; + thread_t thd = vfs_context_thread(ctx); + kauth_cred_t cred = vfs_context_ucred(ctx); + char *path; + uint64_t xid = 0; + struct nfsm_chain nmreq, nmrep; + mbuf_t mreq; + uint32_t mntvers, mntport; + struct sockaddr_storage ss; + struct sockaddr *saddr = (struct sockaddr*)&ss; + + if (!nmp->nm_saddr) + return; + + nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); + + mntvers = (nmp->nm_vers == NFS_VER2) ? RPCMNT_VER1 : RPCMNT_VER3; + mntproto = (NM_OMFLAG(nmp, MNTUDP) || (nmp->nm_sotype == SOCK_DGRAM)) ? IPPROTO_UDP : IPPROTO_TCP; + mntport = nmp->nm_mountport; + + bcopy(nmp->nm_saddr, saddr, min(sizeof(ss), nmp->nm_saddr->sa_len)); + if (saddr->sa_family == AF_INET) + ((struct sockaddr_in*)saddr)->sin_port = htons(mntport); + else + ((struct sockaddr_in6*)saddr)->sin6_port = htons(mntport); + + while (!mntport) { + error = nfs_portmap_lookup(nmp, ctx, saddr, NULL, RPCPROG_MNT, mntvers, mntproto, timeo); + nfsmout_if(error); + if (saddr->sa_family == AF_INET) + mntport = ntohs(((struct sockaddr_in*)saddr)->sin_port); + else + mntport = ntohs(((struct sockaddr_in6*)saddr)->sin6_port); + /* if not found and mntvers > VER1, then retry with VER1 */ + if (!mntport) { + if (mntvers > RPCMNT_VER1) { + mntvers = RPCMNT_VER1; + } else if (mntproto == IPPROTO_TCP) { + mntproto = IPPROTO_UDP; + mntvers = (nmp->nm_vers == NFS_VER2) ? RPCMNT_VER1 : RPCMNT_VER3; + } else { + break; + } + bcopy(nmp->nm_saddr, saddr, min(sizeof(ss), nmp->nm_saddr->sa_len)); + } + } + nfsmout_if(!mntport); /* MOUNT protocol UNMOUNT request */ - saddr.sin_port = htons(mntport); path = &vfs_statfs(nmp->nm_mountp)->f_mntfromname[0]; while (*path && (*path != '/')) path++; slen = strlen(path); nfsm_chain_build_alloc_init(error, &nmreq, NFSX_UNSIGNED + nfsm_rndup(slen)); - nfsm_chain_add_string(error, &nmreq, path, slen); + nfsm_chain_add_name(error, &nmreq, path, slen, nmp); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfsm_rpchead2(SOCK_DGRAM, RPCPROG_MNT, RPCMNT_VER1, RPCMNT_UMOUNT, - RPCAUTH_SYS, auth_len, cred, NULL, nmreq.nmc_mhead, &xid, &mreq); + error = nfsm_rpchead2(nmp, (mntproto == IPPROTO_UDP) ? SOCK_DGRAM : SOCK_STREAM, + RPCPROG_MNT, RPCMNT_VER1, RPCMNT_UMOUNT, + RPCAUTH_SYS, cred, NULL, nmreq.nmc_mhead, &xid, &mreq); nfsmout_if(error); nmreq.nmc_mhead = NULL; - error = nfs_aux_request(nmp, thd, &saddr, mreq, R_XID32(xid), 1, timeo, &nmrep); + error = nfs_aux_request(nmp, thd, saddr, NULL, + ((mntproto == IPPROTO_UDP) ? 
SOCK_DGRAM : SOCK_STREAM), + mreq, R_XID32(xid), 1, timeo, &nmrep); nfsmout: nfsm_chain_cleanup(&nmreq); nfsm_chain_cleanup(&nmrep); @@ -1949,15 +4063,15 @@ nfs_vfs_unmount( { struct nfsmount *nmp; vnode_t vp; - int error, flags = 0, docallback; - struct nfsreq *req, *treq; - struct nfs_reqqhead iodq; + int error, flags = 0; struct timespec ts = { 1, 0 }; - struct nfs_open_owner *noop, *nextnoop; - nfsnode_t np; nmp = VFSTONFS(mp); lck_mtx_lock(&nmp->nm_lock); + /* + * Set the flag indicating that an unmount attempt is in progress. + */ + nmp->nm_state |= NFSSTA_UNMOUNTING; /* * During a force unmount we want to... * Mark that we are doing a force unmount. @@ -1966,15 +4080,19 @@ nfs_vfs_unmount( if (mntflags & MNT_FORCE) { flags |= FORCECLOSE; nmp->nm_state |= NFSSTA_FORCE; - nmp->nm_flag |= NFSMNT_SOFT; + NFS_BITMAP_SET(nmp->nm_flags, NFS_MFLAG_SOFT); } + /* + * Wait for any in-progress monitored node scan to complete. + */ + while (nmp->nm_state & NFSSTA_MONITOR_SCAN) + msleep(&nmp->nm_state, &nmp->nm_lock, PZERO-1, "nfswaitmonscan", &ts); /* * Goes something like this.. * - Call vflush() to clear out vnodes for this file system, * except for the swap files. Deal with them in 2nd pass. * - Decrement reference on the vnode representing remote root. - * - Close the socket - * - Free up the data structures + * - Clean up the NFS mount structure. */ vp = NFSTOV(nmp->nm_dnp); lck_mtx_unlock(&nmp->nm_lock); @@ -1989,14 +4107,18 @@ nfs_vfs_unmount( error = vflush(mp, NULLVP, flags); /* locks vp in the process */ } else { if (vnode_isinuse(vp, 1)) - return (EBUSY); - error = vflush(mp, vp, flags); + error = EBUSY; + else + error = vflush(mp, vp, flags); } - if (error) + if (error) { + lck_mtx_lock(&nmp->nm_lock); + nmp->nm_state &= ~NFSSTA_UNMOUNTING; + lck_mtx_unlock(&nmp->nm_lock); return (error); + } lck_mtx_lock(&nmp->nm_lock); - nmp->nm_state &= ~NFSSTA_MOUNTED; nmp->nm_dnp = NULL; lck_mtx_unlock(&nmp->nm_lock); @@ -2010,26 +4132,86 @@ nfs_vfs_unmount( vflush(mp, NULLVP, FORCECLOSE); - /* - * Destroy any RPCSEC_GSS contexts - */ - if (!TAILQ_EMPTY(&nmp->nm_gsscl)) - nfs_gss_clnt_ctx_unmount(nmp, mntflags); + nfs_mount_cleanup(nmp); + return (0); +} - /* mark the socket for termination */ - lck_mtx_lock(&nmp->nm_lock); - nmp->nm_sockflags |= NMSOCK_UNMOUNT; +/* + * cleanup/destroy NFS fs locations structure + */ +void +nfs_fs_locations_cleanup(struct nfs_fs_locations *nfslsp) +{ + struct nfs_fs_location *fsl; + struct nfs_fs_server *fss; + struct nfs_fs_path *fsp; + uint32_t loc, serv, addr, comp; + + /* free up fs locations */ + if (!nfslsp->nl_numlocs || !nfslsp->nl_locations) + return; + + for (loc = 0; loc < nfslsp->nl_numlocs; loc++) { + fsl = nfslsp->nl_locations[loc]; + if (!fsl) + continue; + if ((fsl->nl_servcount > 0) && fsl->nl_servers) { + for (serv = 0; serv < fsl->nl_servcount; serv++) { + fss = fsl->nl_servers[serv]; + if (!fss) + continue; + if ((fss->ns_addrcount > 0) && fss->ns_addresses) { + for (addr = 0; addr < fss->ns_addrcount; addr++) + FREE(fss->ns_addresses[addr], M_TEMP); + FREE(fss->ns_addresses, M_TEMP); + } + FREE(fss->ns_name, M_TEMP); + FREE(fss, M_TEMP); + } + FREE(fsl->nl_servers, M_TEMP); + } + fsp = &fsl->nl_path; + if (fsp->np_compcount && fsp->np_components) { + for (comp = 0; comp < fsp->np_compcount; comp++) + if (fsp->np_components[comp]) + FREE(fsp->np_components[comp], M_TEMP); + FREE(fsp->np_components, M_TEMP); + } + FREE(fsl, M_TEMP); + } + FREE(nfslsp->nl_locations, M_TEMP); + nfslsp->nl_numlocs = 0; + nfslsp->nl_locations = NULL; +} + +/* + * 
cleanup/destroy an nfsmount + */ +void +nfs_mount_cleanup(struct nfsmount *nmp) +{ + struct nfsreq *req, *treq; + struct nfs_reqqhead iodq; + struct timespec ts = { 1, 0 }; + struct nfs_open_owner *noop, *nextnoop; + nfsnode_t np; + int docallback; /* stop callbacks */ - if ((nmp->nm_vers >= NFS_VER4) && nmp->nm_cbid) + if ((nmp->nm_vers >= NFS_VER4) && !NMFLAG(nmp, NOCALLBACK) && nmp->nm_cbid) nfs4_mount_callback_shutdown(nmp); - /* wait for any socket poking to complete */ - while (nmp->nm_sockflags & NMSOCK_POKE) - msleep(&nmp->nm_sockflags, &nmp->nm_lock, PZERO-1, "nfswaitpoke", &ts); + /* Destroy any RPCSEC_GSS contexts */ + if (!TAILQ_EMPTY(&nmp->nm_gsscl)) + nfs_gss_clnt_ctx_unmount(nmp); + + /* mark the socket for termination */ + lck_mtx_lock(&nmp->nm_lock); + nmp->nm_sockflags |= NMSOCK_UNMOUNT; /* Have the socket thread send the unmount RPC, if requested/appropriate. */ - if ((nmp->nm_vers < NFS_VER4) && !(mntflags & MNT_FORCE) && (nmp->nm_flag & NFSMNT_CALLUMNT)) + if ((nmp->nm_vers < NFS_VER4) && (nmp->nm_state & NFSSTA_MOUNTED) && + !(nmp->nm_state & NFSSTA_FORCE) && NMFLAG(nmp, CALLUMNT)) nfs_mount_sock_thread_wake(nmp); /* wait for the socket thread to terminate */ @@ -2043,15 +4225,16 @@ nfs_vfs_unmount( /* tear down the socket */ nfs_disconnect(nmp); - vfs_setfsprivate(mp, NULL); + if (nmp->nm_mountp) + vfs_setfsprivate(nmp->nm_mountp, NULL); lck_mtx_lock(&nmp->nm_lock); - if ((nmp->nm_vers >= NFS_VER4) && nmp->nm_cbid) { - /* clear out any pending recall requests */ - while ((np = TAILQ_FIRST(&nmp->nm_recallq))) { - TAILQ_REMOVE(&nmp->nm_recallq, np, n_dlink); - np->n_dlink.tqe_next = NFSNOLIST; + if ((nmp->nm_vers >= NFS_VER4) && !NMFLAG(nmp, NOCALLBACK) && nmp->nm_cbid) { + /* clear out any pending delegation return requests */ + while ((np = TAILQ_FIRST(&nmp->nm_dreturnq))) { + TAILQ_REMOVE(&nmp->nm_dreturnq, np, n_dreturn); + np->n_dreturn.tqe_next = NFSNOLIST; } } @@ -2061,11 +4244,23 @@ nfs_vfs_unmount( thread_call_free(nmp->nm_renew_timer); } - mbuf_freem(nmp->nm_nam); + if (nmp->nm_saddr) + FREE(nmp->nm_saddr, M_SONAME); + if ((nmp->nm_vers < NFS_VER4) && nmp->nm_rqsaddr) + FREE(nmp->nm_rqsaddr, M_SONAME); lck_mtx_unlock(&nmp->nm_lock); - if ((nmp->nm_vers < NFS_VER4) && !(nmp->nm_flag & (NFSMNT_NOLOCKS|NFSMNT_LOCALLOCKS))) - nfs_lockd_mount_change(-1); + if (nmp->nm_state & NFSSTA_MOUNTED) + switch (nmp->nm_lockmode) { + case NFS_LOCK_MODE_DISABLED: + case NFS_LOCK_MODE_LOCAL: + break; + case NFS_LOCK_MODE_ENABLED: + default: + if (nmp->nm_vers <= NFS_VER3) + nfs_lockd_mount_unregister(nmp); + break; + } if ((nmp->nm_vers >= NFS_VER4) && nmp->nm_longid) { /* remove/deallocate the client ID data */ @@ -2126,24 +4321,41 @@ nfs_vfs_unmount( req->r_callback.rcb_func(req); } - /* clean up open owner list */ + /* clean up common state */ + lck_mtx_lock(&nmp->nm_lock); + while ((np = LIST_FIRST(&nmp->nm_monlist))) { + LIST_REMOVE(np, n_monlink); + np->n_monlink.le_next = NFSNOLIST; + } + TAILQ_FOREACH_SAFE(noop, &nmp->nm_open_owners, noo_link, nextnoop) { + TAILQ_REMOVE(&nmp->nm_open_owners, noop, noo_link); + noop->noo_flags &= ~NFS_OPEN_OWNER_LINK; + if (noop->noo_refcnt) + continue; + nfs_open_owner_destroy(noop); + } + lck_mtx_unlock(&nmp->nm_lock); + + /* clean up NFSv4 state */ if (nmp->nm_vers >= NFS_VER4) { lck_mtx_lock(&nmp->nm_lock); - TAILQ_FOREACH_SAFE(noop, &nmp->nm_open_owners, noo_link, nextnoop) { - TAILQ_REMOVE(&nmp->nm_open_owners, noop, noo_link); - noop->noo_flags &= ~NFS_OPEN_OWNER_LINK; - if (noop->noo_refcnt) - continue; - 
nfs_open_owner_destroy(noop); + while ((np = TAILQ_FIRST(&nmp->nm_delegations))) { + TAILQ_REMOVE(&nmp->nm_delegations, np, n_dlink); + np->n_dlink.tqe_next = NFSNOLIST; } lck_mtx_unlock(&nmp->nm_lock); - if (IS_VALID_CRED(nmp->nm_mcred)) - kauth_cred_unref(&nmp->nm_mcred); } + if (IS_VALID_CRED(nmp->nm_mcred)) + kauth_cred_unref(&nmp->nm_mcred); + nfs_fs_locations_cleanup(&nmp->nm_locations); + + if (nmp->nm_args) + xb_free(nmp->nm_args); lck_mtx_destroy(&nmp->nm_lock, nfs_mount_grp); + if (nmp->nm_fh) + FREE(nmp->nm_fh, M_TEMP); FREE_ZONE((caddr_t)nmp, sizeof (struct nfsmount), M_NFSMNT); - return (0); } /* @@ -2192,8 +4404,8 @@ nfs_vfs_quotactl( int nfs3_getquota(struct nfsmount *nmp, vfs_context_t ctx, uid_t id, int type, struct dqblk *dqb) { - int error = 0, auth_len, slen, timeo; - int rqvers = (type == GRPQUOTA) ? RPCRQUOTA_EXT_VER : RPCRQUOTA_VER; + int error = 0, slen, timeo; + int rqport = 0, rqproto, rqvers = (type == GRPQUOTA) ? RPCRQUOTA_EXT_VER : RPCRQUOTA_VER; thread_t thd = vfs_context_thread(ctx); kauth_cred_t cred = vfs_context_ucred(ctx); char *path; @@ -2201,70 +4413,70 @@ nfs3_getquota(struct nfsmount *nmp, vfs_context_t ctx, uid_t id, int type, struc struct nfsm_chain nmreq, nmrep; mbuf_t mreq; uint32_t val = 0, bsize = 0; - struct sockaddr *nam = mbuf_data(nmp->nm_nam); - struct sockaddr_in saddr; + struct sockaddr *rqsaddr; struct timeval now; - bcopy(nam, &saddr, min(sizeof(saddr), nam->sa_len)); - auth_len = ((((cred->cr_ngroups - 1) > nmp->nm_numgrps) ? - nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) + - 5 * NFSX_UNSIGNED; - timeo = (nmp->nm_flag & NFSMNT_SOFT) ? 10 : 60; - nfsm_chain_null(&nmreq); - nfsm_chain_null(&nmrep); + if (!nmp->nm_saddr) + return (ENXIO); - /* check if we have a recently cached rquota port */ - if (nmp->nm_rqport) { - microuptime(&now); - if ((nmp->nm_rqportstamp + 60) >= (uint32_t)now.tv_sec) - goto got_rqport; - } + if (NMFLAG(nmp, NOQUOTA)) + return (ENOTSUP); - /* send portmap request to get rquota port */ - saddr.sin_port = htons(PMAPPORT); - nfsm_chain_build_alloc_init(error, &nmreq, 4*NFSX_UNSIGNED); - nfsm_chain_add_32(error, &nmreq, RPCPROG_RQUOTA); - nfsm_chain_add_32(error, &nmreq, rqvers); - nfsm_chain_add_32(error, &nmreq, IPPROTO_UDP); - nfsm_chain_add_32(error, &nmreq, 0); - nfsm_chain_build_done(error, &nmreq); - nfsmout_if(error); - error = nfsm_rpchead2(SOCK_DGRAM, PMAPPROG, PMAPVERS, PMAPPROC_GETPORT, - RPCAUTH_SYS, auth_len, cred, NULL, nmreq.nmc_mhead, &xid, &mreq); - nfsmout_if(error); - nmreq.nmc_mhead = NULL; - error = nfs_aux_request(nmp, thd, &saddr, mreq, R_XID32(xid), 0, timeo, &nmrep); - nfsmout_if(error); + if (!nmp->nm_rqsaddr) + MALLOC(nmp->nm_rqsaddr, struct sockaddr *, sizeof(struct sockaddr_storage), M_SONAME, M_WAITOK|M_ZERO); + if (!nmp->nm_rqsaddr) + return (ENOMEM); + rqsaddr = nmp->nm_rqsaddr; + if (rqsaddr->sa_family == AF_INET6) + rqport = ntohs(((struct sockaddr_in6*)rqsaddr)->sin6_port); + else if (rqsaddr->sa_family == AF_INET) + rqport = ntohs(((struct sockaddr_in*)rqsaddr)->sin_port); - /* grab rquota port from portmap response */ - nfsm_chain_get_32(error, &nmrep, val); - nfsmout_if(error); - nmp->nm_rqport = val; + timeo = NMFLAG(nmp, SOFT) ? 
10 : 60; + rqproto = IPPROTO_UDP; /* XXX should prefer TCP if mount is TCP */ + + /* check if we have a recently cached rquota port */ microuptime(&now); - nmp->nm_rqportstamp = now.tv_sec; - nfsm_chain_cleanup(&nmreq); - nfsm_chain_cleanup(&nmrep); - xid = 0; + if (!rqport || ((nmp->nm_rqsaddrstamp + 60) >= (uint32_t)now.tv_sec)) { + /* send portmap request to get rquota port */ + bcopy(nmp->nm_saddr, rqsaddr, min(sizeof(struct sockaddr_storage), nmp->nm_saddr->sa_len)); + error = nfs_portmap_lookup(nmp, ctx, rqsaddr, NULL, RPCPROG_RQUOTA, rqvers, rqproto, timeo); + if (error) + return (error); + if (rqsaddr->sa_family == AF_INET6) + rqport = ntohs(((struct sockaddr_in6*)rqsaddr)->sin6_port); + else if (rqsaddr->sa_family == AF_INET) + rqport = ntohs(((struct sockaddr_in*)rqsaddr)->sin_port); + else + return (EIO); + if (!rqport) + return (ENOTSUP); + microuptime(&now); + nmp->nm_rqsaddrstamp = now.tv_sec; + } -got_rqport: /* rquota request */ - saddr.sin_port = htons(nmp->nm_rqport); + nfsm_chain_null(&nmreq); + nfsm_chain_null(&nmrep); path = &vfs_statfs(nmp->nm_mountp)->f_mntfromname[0]; while (*path && (*path != '/')) path++; slen = strlen(path); nfsm_chain_build_alloc_init(error, &nmreq, 3 * NFSX_UNSIGNED + nfsm_rndup(slen)); - nfsm_chain_add_string(error, &nmreq, path, slen); + nfsm_chain_add_name(error, &nmreq, path, slen, nmp); if (type == GRPQUOTA) nfsm_chain_add_32(error, &nmreq, type); nfsm_chain_add_32(error, &nmreq, id); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfsm_rpchead2(SOCK_DGRAM, RPCPROG_RQUOTA, rqvers, RPCRQUOTA_GET, - RPCAUTH_SYS, auth_len, cred, NULL, nmreq.nmc_mhead, &xid, &mreq); + error = nfsm_rpchead2(nmp, (rqproto == IPPROTO_UDP) ? SOCK_DGRAM : SOCK_STREAM, + RPCPROG_RQUOTA, rqvers, RPCRQUOTA_GET, + RPCAUTH_SYS, cred, NULL, nmreq.nmc_mhead, &xid, &mreq); nfsmout_if(error); nmreq.nmc_mhead = NULL; - error = nfs_aux_request(nmp, thd, &saddr, mreq, R_XID32(xid), 0, timeo, &nmrep); + error = nfs_aux_request(nmp, thd, rqsaddr, NULL, + (rqproto == IPPROTO_UDP) ? SOCK_DGRAM : SOCK_STREAM, + mreq, R_XID32(xid), 0, timeo, &nmrep); nfsmout_if(error); /* parse rquota response */ @@ -2311,6 +4523,7 @@ nfs4_getquota(struct nfsmount *nmp, vfs_context_t ctx, uid_t id, int type, struc uint32_t bitmap[NFS_ATTR_BITMAP_LEN]; thread_t thd = vfs_context_thread(ctx); kauth_cred_t cred = vfs_context_ucred(ctx); + struct nfsreq_secinfo_args si; if (type != USRQUOTA) /* NFSv4 only supports user quotas */ return (ENOTSUP); @@ -2326,12 +4539,13 @@ nfs4_getquota(struct nfsmount *nmp, vfs_context_t ctx, uid_t id, int type, struc * an effective uid that matches the given uid. 
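[Editor's note: as the comment above says, the NFSv4 quota query must go out under a credential whose effective uid matches the uid being queried, so the code that follows clones the caller's credential with the uid swapped in. A purely structural sketch of that clone-and-substitute step; the plain structs here are illustrative, where the kernel uses posix_cred_get()/posix_cred_create() and reference-counted kauth credentials.]

#include <stdio.h>
#include <string.h>

#define NGROUPS 16

struct cred {
    unsigned uid;
    int ngroups;
    unsigned groups[NGROUPS];
};

/* Build a temporary credential: same group list as the caller, but
 * the uid replaced by the id whose quota is being queried. */
static struct cred cred_for_uid(const struct cred *caller, unsigned id)
{
    struct cred c;
    memset(&c, 0, sizeof(c));
    c.uid = id;
    c.ngroups = caller->ngroups;
    memcpy(c.groups, caller->groups, sizeof(c.groups));
    return c;
}

int main(void)
{
    struct cred me = { 501, 1, { 20 } };
    struct cred q = cred_for_uid(&me, 502);
    printf("query as uid %u, group %u\n", q.uid, q.groups[0]);
    return 0;
}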
*/ if (id != kauth_cred_getuid(cred)) { - struct ucred temp_cred; - bzero(&temp_cred, sizeof(temp_cred)); - temp_cred.cr_uid = id; - temp_cred.cr_ngroups = cred->cr_ngroups; - bcopy(cred->cr_groups, temp_cred.cr_groups, sizeof(temp_cred.cr_groups)); - cred = kauth_cred_create(&temp_cred); + struct posix_cred temp_pcred; + posix_cred_t pcred = posix_cred_get(cred); + bzero(&temp_pcred, sizeof(temp_pcred)); + temp_pcred.cr_uid = id; + temp_pcred.cr_ngroups = pcred->cr_ngroups; + bcopy(pcred->cr_groups, temp_pcred.cr_groups, sizeof(temp_pcred.cr_groups)); + cred = posix_cred_create(&temp_pcred); if (!IS_VALID_CRED(cred)) return (ENOMEM); } else { @@ -2347,6 +4561,7 @@ nfs4_getquota(struct nfsmount *nmp, vfs_context_t ctx, uid_t id, int type, struc return(error); } + NFSREQ_SECINFO_SET(&si, np, NULL, 0, NULL, 0); nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -2363,19 +4578,18 @@ nfs4_getquota(struct nfsmount *nmp, vfs_context_t ctx, uid_t id, int type, struc NFS_BITMAP_SET(bitmap, NFS_FATTR_QUOTA_AVAIL_HARD); NFS_BITMAP_SET(bitmap, NFS_FATTR_QUOTA_AVAIL_SOFT); NFS_BITMAP_SET(bitmap, NFS_FATTR_QUOTA_USED); - nfsm_chain_add_bitmap_masked(error, &nmreq, bitmap, - NFS_ATTR_BITMAP_LEN, nmp->nm_fsattr.nfsa_supp_attr); + nfsm_chain_add_bitmap_supported(error, &nmreq, bitmap, nmp, NULL); nfsm_chain_build_done(error, &nmreq); nfsm_assert(error, (numops == 0), EPROTO); nfsmout_if(error); - error = nfs_request2(np, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, 0, &nmrep, &xid, &status); + error = nfs_request2(np, NULL, &nmreq, NFSPROC4_COMPOUND, thd, cred, &si, 0, &nmrep, &xid, &status); nfsm_chain_skip_tag(error, &nmrep); nfsm_chain_get_32(error, &nmrep, numops); nfsm_chain_op_check(error, &nmrep, NFS_OP_PUTFH); nfsm_chain_op_check(error, &nmrep, NFS_OP_GETATTR); nfsm_assert(error, NFSTONMP(np), ENXIO); nfsmout_if(error); - error = nfs4_parsefattr(&nmrep, NULL, NULL, NULL, dqb); + error = nfs4_parsefattr(&nmrep, NULL, NULL, NULL, dqb, NULL); nfsmout_if(error); nfsm_assert(error, NFSTONMP(np), ENXIO); nfsmout: @@ -2391,7 +4605,7 @@ nfs_vfs_quotactl(mount_t mp, int cmds, uid_t uid, caddr_t datap, vfs_context_t c { struct nfsmount *nmp; int cmd, type, error, nfsvers; - uid_t ruid = vfs_context_ucred(ctx)->cr_ruid; + uid_t euid = kauth_cred_getuid(vfs_context_ucred(ctx)); struct dqblk *dqb = (struct dqblk*)datap; if (!(nmp = VFSTONFS(mp))) @@ -2399,7 +4613,7 @@ nfs_vfs_quotactl(mount_t mp, int cmds, uid_t uid, caddr_t datap, vfs_context_t c nfsvers = nmp->nm_vers; if (uid == ~0U) - uid = ruid; + uid = euid; /* we can only support Q_GETQUOTA */ cmd = cmds >> SUBCMDSHIFT; @@ -2420,7 +4634,7 @@ nfs_vfs_quotactl(mount_t mp, int cmds, uid_t uid, caddr_t datap, vfs_context_t c type = cmds & SUBCMDMASK; if ((u_int)type >= MAXQUOTAS) return (EINVAL); - if ((uid != ruid) && ((error = vfs_context_suser(ctx)))) + if ((uid != euid) && ((error = vfs_context_suser(ctx)))) return (error); if (vfs_busy(mp, LK_NOWAIT)) @@ -2438,7 +4652,7 @@ nfs_vfs_quotactl(mount_t mp, int cmds, uid_t uid, caddr_t datap, vfs_context_t c int nfs_sync_callout(vnode_t, void *); struct nfs_sync_cargs { - thread_t thd; + vfs_context_t ctx; int waitfor; int error; }; @@ -2447,16 +4661,22 @@ int nfs_sync_callout(vnode_t vp, void *arg) { struct nfs_sync_cargs *cargs = (struct nfs_sync_cargs*)arg; + nfsnode_t np = VTONFS(vp); int error; - if (LIST_EMPTY(&VTONFS(vp)->n_dirtyblkhd)) + if (np->n_flag & NREVOKE) { + vn_revoke(vp, REVOKEALL, cargs->ctx); + return (VNODE_RETURNED); + } + + if (LIST_EMPTY(&np->n_dirtyblkhd)) return (VNODE_RETURNED); - 
if (VTONFS(vp)->n_wrbusy > 0) + if (np->n_wrbusy > 0) return (VNODE_RETURNED); - if (VTONFS(vp)->n_bflag & (NBFLUSHINPROG|NBINVALINPROG)) + if (np->n_bflag & (NBFLUSHINPROG|NBINVALINPROG)) return (VNODE_RETURNED); - error = nfs_flush(VTONFS(vp), cargs->waitfor, cargs->thd, 0); + error = nfs_flush(np, cargs->waitfor, vfs_context_thread(cargs->ctx), 0); if (error) cargs->error = error; @@ -2469,7 +4689,7 @@ nfs_vfs_sync(mount_t mp, int waitfor, vfs_context_t ctx) struct nfs_sync_cargs cargs; cargs.waitfor = waitfor; - cargs.thd = vfs_context_thread(ctx); + cargs.ctx = ctx; cargs.error = 0; vnode_iterate(mp, 0, nfs_sync_callout, &cargs); @@ -2538,6 +4758,290 @@ nfs_vfs_start( return (0); } +/* + * Build the mount info buffer for NFS_MOUNTINFO. + */ +int +nfs_mountinfo_assemble(struct nfsmount *nmp, struct xdrbuf *xb) +{ + struct xdrbuf xbinfo, xborig; + char sotype[6]; + uint32_t origargsvers, origargslength; + uint32_t infolength_offset, curargsopaquelength_offset, curargslength_offset, attrslength_offset, curargs_end_offset, end_offset; + uint32_t miattrs[NFS_MIATTR_BITMAP_LEN]; + uint32_t miflags_mask[NFS_MIFLAG_BITMAP_LEN]; + uint32_t miflags[NFS_MIFLAG_BITMAP_LEN]; + uint32_t mattrs[NFS_MATTR_BITMAP_LEN]; + uint32_t mflags_mask[NFS_MFLAG_BITMAP_LEN]; + uint32_t mflags[NFS_MFLAG_BITMAP_LEN]; + uint32_t loc, serv, addr, comp; + int i, timeo, error = 0; + + /* set up mount info attr and flag bitmaps */ + NFS_BITMAP_ZERO(miattrs, NFS_MIATTR_BITMAP_LEN); + NFS_BITMAP_SET(miattrs, NFS_MIATTR_FLAGS); + NFS_BITMAP_SET(miattrs, NFS_MIATTR_ORIG_ARGS); + NFS_BITMAP_SET(miattrs, NFS_MIATTR_CUR_ARGS); + NFS_BITMAP_SET(miattrs, NFS_MIATTR_CUR_LOC_INDEX); + NFS_BITMAP_ZERO(miflags_mask, NFS_MIFLAG_BITMAP_LEN); + NFS_BITMAP_ZERO(miflags, NFS_MIFLAG_BITMAP_LEN); + NFS_BITMAP_SET(miflags_mask, NFS_MIFLAG_DEAD); + NFS_BITMAP_SET(miflags_mask, NFS_MIFLAG_NOTRESP); + NFS_BITMAP_SET(miflags_mask, NFS_MIFLAG_RECOVERY); + if (nmp->nm_state & NFSSTA_DEAD) + NFS_BITMAP_SET(miflags, NFS_MIFLAG_DEAD); + if ((nmp->nm_state & (NFSSTA_TIMEO|NFSSTA_JUKEBOXTIMEO)) || + ((nmp->nm_state & NFSSTA_LOCKTIMEO) && (nmp->nm_lockmode == NFS_LOCK_MODE_ENABLED))) + NFS_BITMAP_SET(miflags, NFS_MIFLAG_NOTRESP); + if (nmp->nm_state & NFSSTA_RECOVER) + NFS_BITMAP_SET(miflags, NFS_MIFLAG_RECOVERY); + + /* get original mount args length */ + xb_init_buffer(&xborig, nmp->nm_args, 2*XDRWORD); + xb_get_32(error, &xborig, origargsvers); /* version */ + xb_get_32(error, &xborig, origargslength); /* args length */ + nfsmerr_if(error); + + /* set up current mount attributes bitmap */ + NFS_BITMAP_ZERO(mattrs, NFS_MATTR_BITMAP_LEN); + NFS_BITMAP_SET(mattrs, NFS_MATTR_FLAGS); + NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_VERSION); + if (nmp->nm_vers >= NFS_VER4) + NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_MINOR_VERSION); + NFS_BITMAP_SET(mattrs, NFS_MATTR_READ_SIZE); + NFS_BITMAP_SET(mattrs, NFS_MATTR_WRITE_SIZE); + NFS_BITMAP_SET(mattrs, NFS_MATTR_READDIR_SIZE); + NFS_BITMAP_SET(mattrs, NFS_MATTR_READAHEAD); + NFS_BITMAP_SET(mattrs, NFS_MATTR_ATTRCACHE_REG_MIN); + NFS_BITMAP_SET(mattrs, NFS_MATTR_ATTRCACHE_REG_MAX); + NFS_BITMAP_SET(mattrs, NFS_MATTR_ATTRCACHE_DIR_MIN); + NFS_BITMAP_SET(mattrs, NFS_MATTR_ATTRCACHE_DIR_MAX); + NFS_BITMAP_SET(mattrs, NFS_MATTR_LOCK_MODE); + NFS_BITMAP_SET(mattrs, NFS_MATTR_SECURITY); + NFS_BITMAP_SET(mattrs, NFS_MATTR_MAX_GROUP_LIST); + NFS_BITMAP_SET(mattrs, NFS_MATTR_SOCKET_TYPE); + NFS_BITMAP_SET(mattrs, NFS_MATTR_NFS_PORT); + if ((nmp->nm_vers < NFS_VER4) && nmp->nm_mountport) + NFS_BITMAP_SET(mattrs, 
NFS_MATTR_MOUNT_PORT); + NFS_BITMAP_SET(mattrs, NFS_MATTR_REQUEST_TIMEOUT); + if (NMFLAG(nmp, SOFT)) + NFS_BITMAP_SET(mattrs, NFS_MATTR_SOFT_RETRY_COUNT); + if (nmp->nm_deadtimeout) + NFS_BITMAP_SET(mattrs, NFS_MATTR_DEAD_TIMEOUT); + if (nmp->nm_fh) + NFS_BITMAP_SET(mattrs, NFS_MATTR_FH); + NFS_BITMAP_SET(mattrs, NFS_MATTR_FS_LOCATIONS); + NFS_BITMAP_SET(mattrs, NFS_MATTR_MNTFLAGS); + if (origargsvers < NFS_ARGSVERSION_XDR) + NFS_BITMAP_SET(mattrs, NFS_MATTR_MNTFROM); + + /* set up current mount flags bitmap */ + /* first set the flags that we will be setting - either on OR off */ + NFS_BITMAP_ZERO(mflags_mask, NFS_MFLAG_BITMAP_LEN); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_SOFT); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_INTR); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_RESVPORT); + if (nmp->nm_sotype == SOCK_DGRAM) + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NOCONNECT); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_DUMBTIMER); + if (nmp->nm_vers < NFS_VER4) + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_CALLUMNT); + if (nmp->nm_vers >= NFS_VER3) + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_RDIRPLUS); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NONEGNAMECACHE); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_MUTEJUKEBOX); + if (nmp->nm_vers >= NFS_VER4) { + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_EPHEMERAL); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NOCALLBACK); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NONAMEDATTR); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NOACL); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_ACLONLY); + } + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NFC); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_NOQUOTA); + if (nmp->nm_vers < NFS_VER4) + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_MNTUDP); + NFS_BITMAP_SET(mflags_mask, NFS_MFLAG_MNTQUICK); + /* now set the flags that should be set */ + NFS_BITMAP_ZERO(mflags, NFS_MFLAG_BITMAP_LEN); + if (NMFLAG(nmp, SOFT)) + NFS_BITMAP_SET(mflags, NFS_MFLAG_SOFT); + if (NMFLAG(nmp, INTR)) + NFS_BITMAP_SET(mflags, NFS_MFLAG_INTR); + if (NMFLAG(nmp, RESVPORT)) + NFS_BITMAP_SET(mflags, NFS_MFLAG_RESVPORT); + if ((nmp->nm_sotype == SOCK_DGRAM) && NMFLAG(nmp, NOCONNECT)) + NFS_BITMAP_SET(mflags, NFS_MFLAG_NOCONNECT); + if (NMFLAG(nmp, DUMBTIMER)) + NFS_BITMAP_SET(mflags, NFS_MFLAG_DUMBTIMER); + if ((nmp->nm_vers < NFS_VER4) && NMFLAG(nmp, CALLUMNT)) + NFS_BITMAP_SET(mflags, NFS_MFLAG_CALLUMNT); + if ((nmp->nm_vers >= NFS_VER3) && NMFLAG(nmp, RDIRPLUS)) + NFS_BITMAP_SET(mflags, NFS_MFLAG_RDIRPLUS); + if (NMFLAG(nmp, NONEGNAMECACHE)) + NFS_BITMAP_SET(mflags, NFS_MFLAG_NONEGNAMECACHE); + if (NMFLAG(nmp, MUTEJUKEBOX)) + NFS_BITMAP_SET(mflags, NFS_MFLAG_MUTEJUKEBOX); + if (nmp->nm_vers >= NFS_VER4) { + if (NMFLAG(nmp, EPHEMERAL)) + NFS_BITMAP_SET(mflags, NFS_MFLAG_EPHEMERAL); + if (NMFLAG(nmp, NOCALLBACK)) + NFS_BITMAP_SET(mflags, NFS_MFLAG_NOCALLBACK); + if (NMFLAG(nmp, NONAMEDATTR)) + NFS_BITMAP_SET(mflags, NFS_MFLAG_NONAMEDATTR); + if (NMFLAG(nmp, NOACL)) + NFS_BITMAP_SET(mflags, NFS_MFLAG_NOACL); + if (NMFLAG(nmp, ACLONLY)) + NFS_BITMAP_SET(mflags, NFS_MFLAG_ACLONLY); + } + if (NMFLAG(nmp, NFC)) + NFS_BITMAP_SET(mflags, NFS_MFLAG_NFC); + if (NMFLAG(nmp, NOQUOTA) || ((nmp->nm_vers >= NFS_VER4) && + !NFS_BITMAP_ISSET(nmp->nm_fsattr.nfsa_supp_attr, NFS_FATTR_QUOTA_AVAIL_HARD) && + !NFS_BITMAP_ISSET(nmp->nm_fsattr.nfsa_supp_attr, NFS_FATTR_QUOTA_AVAIL_SOFT) && + !NFS_BITMAP_ISSET(nmp->nm_fsattr.nfsa_supp_attr, NFS_FATTR_QUOTA_USED))) + NFS_BITMAP_SET(mflags, NFS_MFLAG_NOQUOTA); + if ((nmp->nm_vers < NFS_VER4) && NMFLAG(nmp, MNTUDP)) + NFS_BITMAP_SET(mflags, NFS_MFLAG_MNTUDP); + if (NMFLAG(nmp, MNTQUICK)) + 
NFS_BITMAP_SET(mflags, NFS_MFLAG_MNTQUICK); + + /* assemble info buffer: */ + xb_init_buffer(&xbinfo, NULL, 0); + xb_add_32(error, &xbinfo, NFS_MOUNT_INFO_VERSION); + infolength_offset = xb_offset(&xbinfo); + xb_add_32(error, &xbinfo, 0); + xb_add_bitmap(error, &xbinfo, miattrs, NFS_MIATTR_BITMAP_LEN); + xb_add_bitmap(error, &xbinfo, miflags, NFS_MIFLAG_BITMAP_LEN); + xb_add_32(error, &xbinfo, origargslength); + if (!error) + error = xb_add_bytes(&xbinfo, nmp->nm_args, origargslength, 0); + + /* the opaque byte count for the current mount args values: */ + curargsopaquelength_offset = xb_offset(&xbinfo); + xb_add_32(error, &xbinfo, 0); + + /* Encode current mount args values */ + xb_add_32(error, &xbinfo, NFS_ARGSVERSION_XDR); + curargslength_offset = xb_offset(&xbinfo); + xb_add_32(error, &xbinfo, 0); + xb_add_32(error, &xbinfo, NFS_XDRARGS_VERSION_0); + xb_add_bitmap(error, &xbinfo, mattrs, NFS_MATTR_BITMAP_LEN); + attrslength_offset = xb_offset(&xbinfo); + xb_add_32(error, &xbinfo, 0); + xb_add_bitmap(error, &xbinfo, mflags_mask, NFS_MFLAG_BITMAP_LEN); + xb_add_bitmap(error, &xbinfo, mflags, NFS_MFLAG_BITMAP_LEN); + xb_add_32(error, &xbinfo, nmp->nm_vers); /* NFS_VERSION */ + if (nmp->nm_vers >= NFS_VER4) + xb_add_32(error, &xbinfo, 0); /* NFS_MINOR_VERSION */ + xb_add_32(error, &xbinfo, nmp->nm_rsize); /* READ_SIZE */ + xb_add_32(error, &xbinfo, nmp->nm_wsize); /* WRITE_SIZE */ + xb_add_32(error, &xbinfo, nmp->nm_readdirsize); /* READDIR_SIZE */ + xb_add_32(error, &xbinfo, nmp->nm_readahead); /* READAHEAD */ + xb_add_32(error, &xbinfo, nmp->nm_acregmin); /* ATTRCACHE_REG_MIN */ + xb_add_32(error, &xbinfo, 0); /* ATTRCACHE_REG_MIN */ + xb_add_32(error, &xbinfo, nmp->nm_acregmax); /* ATTRCACHE_REG_MAX */ + xb_add_32(error, &xbinfo, 0); /* ATTRCACHE_REG_MAX */ + xb_add_32(error, &xbinfo, nmp->nm_acdirmin); /* ATTRCACHE_DIR_MIN */ + xb_add_32(error, &xbinfo, 0); /* ATTRCACHE_DIR_MIN */ + xb_add_32(error, &xbinfo, nmp->nm_acdirmax); /* ATTRCACHE_DIR_MAX */ + xb_add_32(error, &xbinfo, 0); /* ATTRCACHE_DIR_MAX */ + xb_add_32(error, &xbinfo, nmp->nm_lockmode); /* LOCK_MODE */ + if (nmp->nm_sec.count) { + xb_add_32(error, &xbinfo, nmp->nm_sec.count); /* SECURITY */ + nfsmerr_if(error); + for (i=0; i < nmp->nm_sec.count; i++) + xb_add_32(error, &xbinfo, nmp->nm_sec.flavors[i]); + } else if (nmp->nm_servsec.count) { + xb_add_32(error, &xbinfo, nmp->nm_servsec.count); /* SECURITY */ + nfsmerr_if(error); + for (i=0; i < nmp->nm_servsec.count; i++) + xb_add_32(error, &xbinfo, nmp->nm_servsec.flavors[i]); + } else { + xb_add_32(error, &xbinfo, 1); /* SECURITY */ + xb_add_32(error, &xbinfo, nmp->nm_auth); + } + xb_add_32(error, &xbinfo, nmp->nm_numgrps); /* MAX_GROUP_LIST */ + nfsmerr_if(error); + snprintf(sotype, sizeof(sotype), "%s%s", (nmp->nm_sotype == SOCK_DGRAM) ? "udp" : "tcp", + nmp->nm_sofamily ? (nmp->nm_sofamily == AF_INET) ? 
"4" : "6" : ""); + xb_add_string(error, &xbinfo, sotype, strlen(sotype)); /* SOCKET_TYPE */ + xb_add_32(error, &xbinfo, ntohs(((struct sockaddr_in*)nmp->nm_saddr)->sin_port)); /* NFS_PORT */ + if ((nmp->nm_vers < NFS_VER4) && nmp->nm_mountport) + xb_add_32(error, &xbinfo, nmp->nm_mountport); /* MOUNT_PORT */ + timeo = (nmp->nm_timeo * 10) / NFS_HZ; + xb_add_32(error, &xbinfo, timeo/10); /* REQUEST_TIMEOUT */ + xb_add_32(error, &xbinfo, (timeo%10)*100000000); /* REQUEST_TIMEOUT */ + if (NMFLAG(nmp, SOFT)) + xb_add_32(error, &xbinfo, nmp->nm_retry); /* SOFT_RETRY_COUNT */ + if (nmp->nm_deadtimeout) { + xb_add_32(error, &xbinfo, nmp->nm_deadtimeout); /* DEAD_TIMEOUT */ + xb_add_32(error, &xbinfo, 0); /* DEAD_TIMEOUT */ + } + if (nmp->nm_fh) + xb_add_fh(error, &xbinfo, &nmp->nm_fh->fh_data[0], nmp->nm_fh->fh_len); /* FH */ + xb_add_32(error, &xbinfo, nmp->nm_locations.nl_numlocs); /* FS_LOCATIONS */ + for (loc = 0; !error && (loc < nmp->nm_locations.nl_numlocs); loc++) { + xb_add_32(error, &xbinfo, nmp->nm_locations.nl_locations[loc]->nl_servcount); + for (serv = 0; !error && (serv < nmp->nm_locations.nl_locations[loc]->nl_servcount); serv++) { + xb_add_string(error, &xbinfo, nmp->nm_locations.nl_locations[loc]->nl_servers[serv]->ns_name, + strlen(nmp->nm_locations.nl_locations[loc]->nl_servers[serv]->ns_name)); + xb_add_32(error, &xbinfo, nmp->nm_locations.nl_locations[loc]->nl_servers[serv]->ns_addrcount); + for (addr = 0; !error && (addr < nmp->nm_locations.nl_locations[loc]->nl_servers[serv]->ns_addrcount); addr++) + xb_add_string(error, &xbinfo, nmp->nm_locations.nl_locations[loc]->nl_servers[serv]->ns_addresses[addr], + strlen(nmp->nm_locations.nl_locations[loc]->nl_servers[serv]->ns_addresses[addr])); + xb_add_32(error, &xbinfo, 0); /* empty server info */ + } + xb_add_32(error, &xbinfo, nmp->nm_locations.nl_locations[loc]->nl_path.np_compcount); + for (comp = 0; !error && (comp < nmp->nm_locations.nl_locations[loc]->nl_path.np_compcount); comp++) + xb_add_string(error, &xbinfo, nmp->nm_locations.nl_locations[loc]->nl_path.np_components[comp], + strlen(nmp->nm_locations.nl_locations[loc]->nl_path.np_components[comp])); + xb_add_32(error, &xbinfo, 0); /* empty fs location info */ + } + xb_add_32(error, &xbinfo, vfs_flags(nmp->nm_mountp)); /* MNTFLAGS */ + if (origargsvers < NFS_ARGSVERSION_XDR) + xb_add_string(error, &xbinfo, vfs_statfs(nmp->nm_mountp)->f_mntfromname, + strlen(vfs_statfs(nmp->nm_mountp)->f_mntfromname)); /* MNTFROM */ + curargs_end_offset = xb_offset(&xbinfo); + + /* NFS_MIATTR_CUR_LOC_INDEX */ + xb_add_32(error, &xbinfo, nmp->nm_locations.nl_current.nli_flags); + xb_add_32(error, &xbinfo, nmp->nm_locations.nl_current.nli_loc); + xb_add_32(error, &xbinfo, nmp->nm_locations.nl_current.nli_serv); + xb_add_32(error, &xbinfo, nmp->nm_locations.nl_current.nli_addr); + + xb_build_done(error, &xbinfo); + + /* update opaque counts */ + end_offset = xb_offset(&xbinfo); + if (!error) { + error = xb_seek(&xbinfo, attrslength_offset); + xb_add_32(error, &xbinfo, curargs_end_offset - attrslength_offset - XDRWORD/*don't include length field*/); + } + if (!error) { + error = xb_seek(&xbinfo, curargslength_offset); + xb_add_32(error, &xbinfo, curargs_end_offset - curargslength_offset + XDRWORD/*version*/); + } + if (!error) { + error = xb_seek(&xbinfo, curargsopaquelength_offset); + xb_add_32(error, &xbinfo, curargs_end_offset - curargslength_offset + XDRWORD/*version*/); + } + if (!error) { + error = xb_seek(&xbinfo, infolength_offset); + xb_add_32(error, &xbinfo, end_offset - 
infolength_offset + XDRWORD/*version*/); + } + nfsmerr_if(error); + + /* copy result xdrbuf to caller */ + *xb = xbinfo; + + /* and mark the local copy as not needing cleanup */ + xbinfo.xb_flags &= ~XB_CLEANUP; +nfsmerr: + xb_cleanup(&xbinfo); + return (error); +} + /* * Do that sysctl thang... */ @@ -2552,6 +5056,8 @@ nfs_vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, struct nfsmount *nmp = NULL; struct vfsquery vq; boolean_t is_64_bit; + fsid_t fsid; + struct xdrbuf xb; #if NFSSERVER struct nfs_exportfs *nxfs; struct nfs_export *nx; @@ -2622,6 +5128,32 @@ nfs_vfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp, if (newp) return copyin(newp, &nfsstats, sizeof nfsstats); return (0); + case NFS_MOUNTINFO: + /* read in the fsid */ + if (*oldlenp < sizeof(fsid)) + return (EINVAL); + if ((error = copyin(oldp, &fsid, sizeof(fsid)))) + return (error); + /* swizzle it back to host order */ + fsid.val[0] = ntohl(fsid.val[0]); + fsid.val[1] = ntohl(fsid.val[1]); + /* find mount and make sure it's NFS */ + if (((mp = vfs_getvfs(&fsid))) == NULL) + return (ENOENT); + if (strcmp(mp->mnt_vfsstat.f_fstypename, "nfs")) + return (EINVAL); + if (((nmp = VFSTONFS(mp))) == NULL) + return (ENOENT); + xb_init(&xb, 0); + if ((error = nfs_mountinfo_assemble(nmp, &xb))) + return (error); + if (*oldlenp < xb.xb_u.xb_buffer.xbb_len) + error = ENOMEM; + else + error = copyout(xb_buffer_base(&xb), oldp, xb.xb_u.xb_buffer.xbb_len); + *oldlenp = xb.xb_u.xb_buffer.xbb_len; + xb_cleanup(&xb); + break; #if NFSSERVER case NFS_EXPORTSTATS: /* setup export stat descriptor */ @@ -2866,7 +5398,7 @@ ustat_skip: case VFS_CTL_NOLOCKS: if (req->oldptr != USER_ADDR_NULL) { lck_mtx_lock(&nmp->nm_lock); - val = (nmp->nm_flag & NFSMNT_NOLOCKS) ? 1 : 0; + val = (nmp->nm_lockmode == NFS_LOCK_MODE_DISABLED) ? 
1 : 0; lck_mtx_unlock(&nmp->nm_lock); error = SYSCTL_OUT(req, &val, sizeof(val)); if (error) @@ -2877,18 +5409,21 @@ ustat_skip: if (error) return (error); lck_mtx_lock(&nmp->nm_lock); - if (nmp->nm_flag & NFSMNT_LOCALLOCKS) { + if (nmp->nm_lockmode == NFS_LOCK_MODE_LOCAL) { /* can't toggle locks when using local locks */ error = EINVAL; + } else if ((nmp->nm_vers >= NFS_VER4) && val) { + /* can't disable locks for NFSv4 */ + error = EINVAL; } else if (val) { - if (!(nmp->nm_flag & NFSMNT_NOLOCKS)) - nfs_lockd_mount_change(-1); - nmp->nm_flag |= NFSMNT_NOLOCKS; + if ((nmp->nm_vers <= NFS_VER3) && (nmp->nm_lockmode == NFS_LOCK_MODE_ENABLED)) + nfs_lockd_mount_unregister(nmp); + nmp->nm_lockmode = NFS_LOCK_MODE_DISABLED; nmp->nm_state &= ~NFSSTA_LOCKTIMEO; } else { - if (nmp->nm_flag & NFSMNT_NOLOCKS) - nfs_lockd_mount_change(1); - nmp->nm_flag &= ~NFSMNT_NOLOCKS; + if ((nmp->nm_vers <= NFS_VER3) && (nmp->nm_lockmode == NFS_LOCK_MODE_DISABLED)) + nfs_lockd_mount_register(nmp); + nmp->nm_lockmode = NFS_LOCK_MODE_ENABLED; } lck_mtx_unlock(&nmp->nm_lock); } @@ -2896,14 +5431,13 @@ ustat_skip: case VFS_CTL_QUERY: lck_mtx_lock(&nmp->nm_lock); /* XXX don't allow users to know about/disconnect unresponsive, soft, nobrowse mounts */ - softnobrowse = ((nmp->nm_flag & NFSMNT_SOFT) && (vfs_flags(nmp->nm_mountp) & MNT_DONTBROWSE)); + softnobrowse = (NMFLAG(nmp, SOFT) && (vfs_flags(nmp->nm_mountp) & MNT_DONTBROWSE)); if (!softnobrowse && (nmp->nm_state & NFSSTA_TIMEO)) vq.vq_flags |= VQ_NOTRESP; - if (!softnobrowse && (nmp->nm_state & NFSSTA_JUKEBOXTIMEO) && - !(nmp->nm_flag & NFSMNT_MUTEJUKEBOX)) + if (!softnobrowse && (nmp->nm_state & NFSSTA_JUKEBOXTIMEO) && !NMFLAG(nmp, MUTEJUKEBOX)) vq.vq_flags |= VQ_NOTRESP; if (!softnobrowse && (nmp->nm_state & NFSSTA_LOCKTIMEO) && - !(nmp->nm_flag & (NFSMNT_NOLOCKS|NFSMNT_LOCALLOCKS))) + (nmp->nm_lockmode == NFS_LOCK_MODE_ENABLED)) vq.vq_flags |= VQ_NOTRESP; if (nmp->nm_state & NFSSTA_DEAD) vq.vq_flags |= VQ_DEAD; diff --git a/bsd/nfs/nfs_vnops.c b/bsd/nfs/nfs_vnops.c index 00199a6df..d1e130b88 100644 --- a/bsd/nfs/nfs_vnops.c +++ b/bsd/nfs/nfs_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
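The length fields patched via xb_seek() above follow a standard XDR assembly idiom: emit a zero placeholder, remember its offset, and overwrite it once the real end offset is known. A minimal standalone sketch of the pattern, using a toy flat buffer rather than the kernel's struct xdrbuf (the tb_* names are illustrative):

#include <string.h>
#include <stddef.h>
#include <stdint.h>
#include <arpa/inet.h>

struct toybuf { uint8_t data[1024]; size_t off; };

static void
tb_add_32(struct toybuf *b, uint32_t v)
{
	v = htonl(v);                  /* XDR words are big-endian */
	memcpy(b->data + b->off, &v, 4);
	b->off += 4;
}

static void
tb_patch_32(struct toybuf *b, size_t at, uint32_t v)
{
	v = htonl(v);
	memcpy(b->data + at, &v, 4);   /* backpatch without moving the cursor */
}

static void
build(struct toybuf *b)
{
	size_t len_off = b->off;
	tb_add_32(b, 0);               /* length placeholder */
	tb_add_32(b, 42);              /* ...opaque payload words... */
	tb_add_32(b, 7);
	/* as in the kernel code, don't count the length word itself */
	tb_patch_32(b, len_off, (uint32_t)(b->off - len_off - 4));
}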
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -134,11 +134,7 @@ int nfsfifo_vnop_close(struct vnop_close_args *); int nfs_vnop_ioctl(struct vnop_ioctl_args *); int nfs_vnop_select(struct vnop_select_args *); int nfs_vnop_setattr(struct vnop_setattr_args *); -int nfs_vnop_read(struct vnop_read_args *); -int nfs_vnop_write(struct vnop_write_args *); -int nfs_vnop_mmap(struct vnop_mmap_args *); int nfs_vnop_fsync(struct vnop_fsync_args *); -int nfs_vnop_remove(struct vnop_remove_args *); int nfs_vnop_rename(struct vnop_rename_args *); int nfs_vnop_readdir(struct vnop_readdir_args *); int nfs_vnop_readlink(struct vnop_readlink_args *); @@ -148,6 +144,7 @@ int nfs_vnop_pageout(struct vnop_pageout_args *); int nfs_vnop_blktooff(struct vnop_blktooff_args *); int nfs_vnop_offtoblk(struct vnop_offtoblk_args *); int nfs_vnop_blockmap(struct vnop_blockmap_args *); +int nfs_vnop_monitor(struct vnop_monitor_args *); int nfs3_vnop_create(struct vnop_create_args *); int nfs3_vnop_mknod(struct vnop_mknod_args *); @@ -163,8 +160,8 @@ static struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = { { &vnop_lookup_desc, (vnop_t *)nfs_vnop_lookup }, /* lookup */ { &vnop_create_desc, (vnop_t *)nfs3_vnop_create }, /* create */ { &vnop_mknod_desc, (vnop_t *)nfs3_vnop_mknod }, /* mknod */ - { &vnop_open_desc, (vnop_t *)nfs3_vnop_open }, /* open */ - { &vnop_close_desc, (vnop_t *)nfs3_vnop_close }, /* close */ + { &vnop_open_desc, (vnop_t *)nfs_vnop_open }, /* open */ + { &vnop_close_desc, (vnop_t *)nfs_vnop_close }, /* close */ { &vnop_access_desc, (vnop_t *)nfs_vnop_access }, /* access */ { &vnop_getattr_desc, (vnop_t *)nfs3_vnop_getattr }, /* getattr */ { &vnop_setattr_desc, (vnop_t *)nfs_vnop_setattr }, /* setattr */ @@ -174,6 +171,7 @@ static struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = { { &vnop_select_desc, (vnop_t *)nfs_vnop_select }, /* select */ { &vnop_revoke_desc, (vnop_t *)nfs_vnop_revoke }, /* revoke */ { &vnop_mmap_desc, (vnop_t *)nfs_vnop_mmap }, /* mmap */ + { &vnop_mnomap_desc, (vnop_t *)nfs_vnop_mnomap }, /* mnomap */ { &vnop_fsync_desc, (vnop_t *)nfs_vnop_fsync }, /* fsync */ { &vnop_remove_desc, (vnop_t *)nfs_vnop_remove }, /* remove */ { &vnop_link_desc, (vnop_t *)nfs3_vnop_link }, /* link */ @@ -187,7 +185,7 @@ static struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = { { &vnop_reclaim_desc, (vnop_t *)nfs_vnop_reclaim }, /* reclaim */ { &vnop_strategy_desc, (vnop_t *)err_strategy }, /* strategy */ { &vnop_pathconf_desc, (vnop_t *)nfs_vnop_pathconf }, /* pathconf */ - { &vnop_advlock_desc, (vnop_t *)nfs3_vnop_advlock }, /* advlock */ + { &vnop_advlock_desc, (vnop_t *)nfs_vnop_advlock }, /* advlock */ { &vnop_bwrite_desc, (vnop_t *)err_bwrite }, /* bwrite */ { &vnop_pagein_desc, (vnop_t *)nfs_vnop_pagein }, /* Pagein */ { &vnop_pageout_desc, (vnop_t *)nfs_vnop_pageout }, /* Pageout */ @@ -195,6 +193,7 @@ static struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = { { &vnop_blktooff_desc, (vnop_t *)nfs_vnop_blktooff }, /* blktooff */ { &vnop_offtoblk_desc, (vnop_t *)nfs_vnop_offtoblk }, /* offtoblk */ { &vnop_blockmap_desc, (vnop_t *)nfs_vnop_blockmap }, /* blockmap */ + { &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor }, /* monitor */ { NULL, NULL } }; struct vnodeopv_desc nfsv2_vnodeop_opv_desc = @@ -206,18 +205,18 @@ static struct vnodeopv_entry_desc nfsv4_vnodeop_entries[] = { { &vnop_lookup_desc, (vnop_t *)nfs_vnop_lookup }, /* lookup */ { &vnop_create_desc, (vnop_t *)nfs4_vnop_create }, /* create */ { &vnop_mknod_desc, (vnop_t *)nfs4_vnop_mknod }, /* mknod */ - { 
&vnop_open_desc, (vnop_t *)nfs4_vnop_open }, /* open */ - { &vnop_close_desc, (vnop_t *)nfs4_vnop_close }, /* close */ + { &vnop_open_desc, (vnop_t *)nfs_vnop_open }, /* open */ + { &vnop_close_desc, (vnop_t *)nfs_vnop_close }, /* close */ { &vnop_access_desc, (vnop_t *)nfs_vnop_access }, /* access */ { &vnop_getattr_desc, (vnop_t *)nfs4_vnop_getattr }, /* getattr */ { &vnop_setattr_desc, (vnop_t *)nfs_vnop_setattr }, /* setattr */ - { &vnop_read_desc, (vnop_t *)nfs4_vnop_read }, /* read */ + { &vnop_read_desc, (vnop_t *)nfs_vnop_read }, /* read */ { &vnop_write_desc, (vnop_t *)nfs_vnop_write }, /* write */ { &vnop_ioctl_desc, (vnop_t *)nfs_vnop_ioctl }, /* ioctl */ { &vnop_select_desc, (vnop_t *)nfs_vnop_select }, /* select */ { &vnop_revoke_desc, (vnop_t *)nfs_vnop_revoke }, /* revoke */ - { &vnop_mmap_desc, (vnop_t *)nfs4_vnop_mmap }, /* mmap */ - { &vnop_mnomap_desc, (vnop_t *)nfs4_vnop_mnomap }, /* mnomap */ + { &vnop_mmap_desc, (vnop_t *)nfs_vnop_mmap }, /* mmap */ + { &vnop_mnomap_desc, (vnop_t *)nfs_vnop_mnomap }, /* mnomap */ { &vnop_fsync_desc, (vnop_t *)nfs_vnop_fsync }, /* fsync */ { &vnop_remove_desc, (vnop_t *)nfs_vnop_remove }, /* remove */ { &vnop_link_desc, (vnop_t *)nfs4_vnop_link }, /* link */ @@ -231,7 +230,7 @@ static struct vnodeopv_entry_desc nfsv4_vnodeop_entries[] = { { &vnop_reclaim_desc, (vnop_t *)nfs_vnop_reclaim }, /* reclaim */ { &vnop_strategy_desc, (vnop_t *)err_strategy }, /* strategy */ { &vnop_pathconf_desc, (vnop_t *)nfs_vnop_pathconf }, /* pathconf */ - { &vnop_advlock_desc, (vnop_t *)nfs4_vnop_advlock }, /* advlock */ + { &vnop_advlock_desc, (vnop_t *)nfs_vnop_advlock }, /* advlock */ { &vnop_bwrite_desc, (vnop_t *)err_bwrite }, /* bwrite */ { &vnop_pagein_desc, (vnop_t *)nfs_vnop_pagein }, /* Pagein */ { &vnop_pageout_desc, (vnop_t *)nfs_vnop_pageout }, /* Pageout */ @@ -239,6 +238,16 @@ static struct vnodeopv_entry_desc nfsv4_vnodeop_entries[] = { { &vnop_blktooff_desc, (vnop_t *)nfs_vnop_blktooff }, /* blktooff */ { &vnop_offtoblk_desc, (vnop_t *)nfs_vnop_offtoblk }, /* offtoblk */ { &vnop_blockmap_desc, (vnop_t *)nfs_vnop_blockmap }, /* blockmap */ + { &vnop_getxattr_desc, (vnop_t *)nfs4_vnop_getxattr }, /* getxattr */ + { &vnop_setxattr_desc, (vnop_t *)nfs4_vnop_setxattr }, /* setxattr */ + { &vnop_removexattr_desc, (vnop_t *)nfs4_vnop_removexattr },/* removexattr */ + { &vnop_listxattr_desc, (vnop_t *)nfs4_vnop_listxattr },/* listxattr */ +#if NAMEDSTREAMS + { &vnop_getnamedstream_desc, (vnop_t *)nfs4_vnop_getnamedstream }, /* getnamedstream */ + { &vnop_makenamedstream_desc, (vnop_t *)nfs4_vnop_makenamedstream }, /* makenamedstream */ + { &vnop_removenamedstream_desc, (vnop_t *)nfs4_vnop_removenamedstream },/* removenamedstream */ +#endif + { &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor }, /* monitor */ { NULL, NULL } }; struct vnodeopv_desc nfsv4_vnodeop_opv_desc = @@ -283,6 +292,7 @@ static struct vnodeopv_entry_desc spec_nfsv2nodeop_entries[] = { { &vnop_blktooff_desc, (vnop_t *)nfs_vnop_blktooff }, /* blktooff */ { &vnop_offtoblk_desc, (vnop_t *)nfs_vnop_offtoblk }, /* offtoblk */ { &vnop_blockmap_desc, (vnop_t *)nfs_vnop_blockmap }, /* blockmap */ + { &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor }, /* monitor */ { NULL, NULL } }; struct vnodeopv_desc spec_nfsv2nodeop_opv_desc = @@ -323,6 +333,16 @@ static struct vnodeopv_entry_desc spec_nfsv4nodeop_entries[] = { { &vnop_blktooff_desc, (vnop_t *)nfs_vnop_blktooff }, /* blktooff */ { &vnop_offtoblk_desc, (vnop_t *)nfs_vnop_offtoblk }, /* offtoblk */ { &vnop_blockmap_desc, (vnop_t 
*)nfs_vnop_blockmap }, /* blockmap */ + { &vnop_getxattr_desc, (vnop_t *)nfs4_vnop_getxattr }, /* getxattr */ + { &vnop_setxattr_desc, (vnop_t *)nfs4_vnop_setxattr }, /* setxattr */ + { &vnop_removexattr_desc, (vnop_t *)nfs4_vnop_removexattr },/* removexattr */ + { &vnop_listxattr_desc, (vnop_t *)nfs4_vnop_listxattr },/* listxattr */ +#if NAMEDSTREAMS + { &vnop_getnamedstream_desc, (vnop_t *)nfs4_vnop_getnamedstream }, /* getnamedstream */ + { &vnop_makenamedstream_desc, (vnop_t *)nfs4_vnop_makenamedstream }, /* makenamedstream */ + { &vnop_removenamedstream_desc, (vnop_t *)nfs4_vnop_removenamedstream },/* removenamedstream */ +#endif + { &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor }, /* monitor */ { NULL, NULL } }; struct vnodeopv_desc spec_nfsv4nodeop_opv_desc = @@ -365,6 +385,7 @@ static struct vnodeopv_entry_desc fifo_nfsv2nodeop_entries[] = { { &vnop_blktooff_desc, (vnop_t *)nfs_vnop_blktooff }, /* blktooff */ { &vnop_offtoblk_desc, (vnop_t *)nfs_vnop_offtoblk }, /* offtoblk */ { &vnop_blockmap_desc, (vnop_t *)nfs_vnop_blockmap }, /* blockmap */ + { &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor }, /* monitor */ { NULL, NULL } }; struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc = @@ -406,6 +427,16 @@ static struct vnodeopv_entry_desc fifo_nfsv4nodeop_entries[] = { { &vnop_blktooff_desc, (vnop_t *)nfs_vnop_blktooff }, /* blktooff */ { &vnop_offtoblk_desc, (vnop_t *)nfs_vnop_offtoblk }, /* offtoblk */ { &vnop_blockmap_desc, (vnop_t *)nfs_vnop_blockmap }, /* blockmap */ + { &vnop_getxattr_desc, (vnop_t *)nfs4_vnop_getxattr }, /* getxattr */ + { &vnop_setxattr_desc, (vnop_t *)nfs4_vnop_setxattr }, /* setxattr */ + { &vnop_removexattr_desc, (vnop_t *)nfs4_vnop_removexattr },/* removexattr */ + { &vnop_listxattr_desc, (vnop_t *)nfs4_vnop_listxattr },/* listxattr */ +#if NAMEDSTREAMS + { &vnop_getnamedstream_desc, (vnop_t *)nfs4_vnop_getnamedstream }, /* getnamedstream */ + { &vnop_makenamedstream_desc, (vnop_t *)nfs4_vnop_makenamedstream }, /* makenamedstream */ + { &vnop_removenamedstream_desc, (vnop_t *)nfs4_vnop_removenamedstream },/* removenamedstream */ +#endif + { &vnop_monitor_desc, (vnop_t *)nfs_vnop_monitor }, /* monitor */ { NULL, NULL } }; struct vnodeopv_desc fifo_nfsv4nodeop_opv_desc = @@ -418,30 +449,30 @@ int nfs_sillyrename(nfsnode_t,nfsnode_t,struct componentname *,vfs_context_t); /* * Find the slot in the access cache for this UID. * If adding and no existing slot is found, reuse slots in FIFO order. - * The index of the next slot to use is kept in the last entry of the n_mode array. + * The index of the next slot to use is kept in the last entry of the n_access array. 
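The slot scheme described in that comment stores per-uid cache entries plus one extra array element holding the index of the next slot to recycle, which yields FIFO reuse without a separate cursor field. A self-contained sketch of the same trick (the size and names are illustrative, not the kernel's):

#include <sys/types.h>
#include <stdint.h>

#define CACHE_SLOTS 8

struct uid_cache {
	uid_t    uid[CACHE_SLOTS];
	/* element [CACHE_SLOTS] holds the index of the next slot to reuse */
	uint32_t rights[CACHE_SLOTS + 1];
};

/* find the slot for this uid; if absent and add is set, recycle FIFO-style */
static int
cache_slot(struct uid_cache *c, uid_t uid, int add)
{
	int slot;

	for (slot = 0; slot < CACHE_SLOTS; slot++)
		if (c->uid[slot] == uid)
			return (slot);
	if (!add)
		return (-1);
	slot = (int)c->rights[CACHE_SLOTS];
	c->rights[CACHE_SLOTS] = (uint32_t)((slot + 1) % CACHE_SLOTS);
	return (slot);
}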
*/ int -nfs_node_mode_slot(nfsnode_t np, uid_t uid, int add) +nfs_node_access_slot(nfsnode_t np, uid_t uid, int add) { int slot; for (slot=0; slot < NFS_ACCESS_CACHE_SIZE; slot++) - if (np->n_modeuid[slot] == uid) + if (np->n_accessuid[slot] == uid) break; if (slot == NFS_ACCESS_CACHE_SIZE) { if (!add) return (-1); - slot = np->n_mode[NFS_ACCESS_CACHE_SIZE]; - np->n_mode[NFS_ACCESS_CACHE_SIZE] = (slot + 1) % NFS_ACCESS_CACHE_SIZE; + slot = np->n_access[NFS_ACCESS_CACHE_SIZE]; + np->n_access[NFS_ACCESS_CACHE_SIZE] = (slot + 1) % NFS_ACCESS_CACHE_SIZE; } return (slot); } int -nfs3_access_rpc(nfsnode_t np, u_int32_t *mode, vfs_context_t ctx) +nfs3_access_rpc(nfsnode_t np, u_int32_t *access, vfs_context_t ctx) { int error = 0, lockerror = ENOENT, status, slot; - uint32_t access = 0; + uint32_t access_result = 0; u_int64_t xid; struct nfsm_chain nmreq, nmrep; struct timeval now; @@ -452,25 +483,24 @@ nfs3_access_rpc(nfsnode_t np, u_int32_t *mode, vfs_context_t ctx) nfsm_chain_build_alloc_init(error, &nmreq, NFSX_FH(NFS_VER3) + NFSX_UNSIGNED); nfsm_chain_add_fh(error, &nmreq, NFS_VER3, np->n_fhp, np->n_fhsize); - nfsm_chain_add_32(error, &nmreq, *mode); + nfsm_chain_add_32(error, &nmreq, *access); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC_ACCESS, ctx, - &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC_ACCESS, ctx, NULL, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; nfsm_chain_postop_attr_update(error, &nmrep, np, &xid); if (!error) error = status; - nfsm_chain_get_32(error, &nmrep, access); + nfsm_chain_get_32(error, &nmrep, access_result); nfsmout_if(error); uid = kauth_cred_getuid(vfs_context_ucred(ctx)); - slot = nfs_node_mode_slot(np, uid, 1); - np->n_modeuid[slot] = uid; + slot = nfs_node_access_slot(np, uid, 1); + np->n_accessuid[slot] = uid; microuptime(&now); - np->n_modestamp[slot] = now.tv_sec; - np->n_mode[slot] = access; + np->n_accessstamp[slot] = now.tv_sec; + np->n_access[slot] = access_result; /* * If we asked for DELETE but didn't get it, the server @@ -479,11 +509,14 @@ nfs3_access_rpc(nfsnode_t np, u_int32_t *mode, vfs_context_t ctx) * and just let any subsequent delete action fail if it * really isn't deletable. */ - if ((*mode & NFS_ACCESS_DELETE) && - !(np->n_mode[slot] & NFS_ACCESS_DELETE)) - np->n_mode[slot] |= NFS_ACCESS_DELETE; - /* pass back the mode returned with this request */ - *mode = np->n_mode[slot]; + if ((*access & NFS_ACCESS_DELETE) && + !(np->n_access[slot] & NFS_ACCESS_DELETE)) + np->n_access[slot] |= NFS_ACCESS_DELETE; + /* ".zfs" subdirectories may erroneously give a denied answer for add/remove */ + if (nfs_access_dotzfs && (np->n_flag & NISDOTZFSCHILD)) + np->n_access[slot] |= (NFS_ACCESS_MODIFY|NFS_ACCESS_EXTEND|NFS_ACCESS_DELETE); + /* pass back the access returned with this request */ + *access = np->n_access[slot]; nfsmout: if (!lockerror) nfs_node_unlock(np); @@ -495,8 +528,8 @@ nfsmout: /* * NFS access vnode op. * For NFS version 2, just return ok. File accesses may fail later. - * For NFS version 3+, use the access RPC to check accessibility. If file modes - * are changed on the server, accesses might still fail later. + * For NFS version 3+, use the access RPC to check accessibility. If file + * permissions are changed on the server, accesses might still fail later. 
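Two details of the access cache are easy to miss in the diff noise: entries expire after nfs_access_cache_timeout seconds, and a cached entry only answers a query if every requested right is present in it. A condensed sketch of that hit test, with stand-in field names, mirroring the nfs_vnop_access cache check:

#include <stdint.h>
#include <time.h>

struct access_entry {
	time_t   stamp;    /* when the ACCESS reply was cached */
	uint32_t granted;  /* rights the server granted to this uid */
};

/* returns 1 on a usable cache hit: entry is fresh AND covers all rights */
static int
access_cache_hit(const struct access_entry *e, time_t now,
    int timeout_secs, uint32_t wanted)
{
	if (now >= e->stamp + timeout_secs)
		return (0);                            /* stale */
	return ((e->granted & wanted) == wanted);  /* all rights present? */
}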
*/ int nfs_vnop_access( @@ -510,7 +543,7 @@ nfs_vnop_access( vfs_context_t ctx = ap->a_context; vnode_t vp = ap->a_vp; int error = 0, slot, dorpc; - u_int32_t mode, wmode; + u_int32_t access, waccess; nfsnode_t np = VTONFS(vp); struct nfsmount *nmp; int nfsvers; @@ -541,53 +574,53 @@ nfs_vnop_access( /* * Convert KAUTH primitives to NFS access rights. */ - mode = 0; + access = 0; if (vnode_isdir(vp)) { /* directory */ if (ap->a_action & (KAUTH_VNODE_LIST_DIRECTORY | KAUTH_VNODE_READ_EXTATTRIBUTES)) - mode |= NFS_ACCESS_READ; + access |= NFS_ACCESS_READ; if (ap->a_action & KAUTH_VNODE_SEARCH) - mode |= NFS_ACCESS_LOOKUP; + access |= NFS_ACCESS_LOOKUP; if (ap->a_action & (KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY)) - mode |= NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND; + access |= NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND; if (ap->a_action & KAUTH_VNODE_DELETE_CHILD) - mode |= NFS_ACCESS_MODIFY; + access |= NFS_ACCESS_MODIFY; } else { /* file */ if (ap->a_action & (KAUTH_VNODE_READ_DATA | KAUTH_VNODE_READ_EXTATTRIBUTES)) - mode |= NFS_ACCESS_READ; + access |= NFS_ACCESS_READ; if (ap->a_action & KAUTH_VNODE_WRITE_DATA) - mode |= NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND; + access |= NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND; if (ap->a_action & KAUTH_VNODE_APPEND_DATA) - mode |= NFS_ACCESS_EXTEND; + access |= NFS_ACCESS_EXTEND; if (ap->a_action & KAUTH_VNODE_EXECUTE) - mode |= NFS_ACCESS_EXECUTE; + access |= NFS_ACCESS_EXECUTE; } /* common */ if (ap->a_action & KAUTH_VNODE_DELETE) - mode |= NFS_ACCESS_DELETE; + access |= NFS_ACCESS_DELETE; if (ap->a_action & (KAUTH_VNODE_WRITE_ATTRIBUTES | KAUTH_VNODE_WRITE_EXTATTRIBUTES | KAUTH_VNODE_WRITE_SECURITY)) - mode |= NFS_ACCESS_MODIFY; + access |= NFS_ACCESS_MODIFY; /* XXX this is pretty dubious */ if (ap->a_action & KAUTH_VNODE_CHANGE_OWNER) - mode |= NFS_ACCESS_MODIFY; + access |= NFS_ACCESS_MODIFY; /* if caching, always ask for every right */ if (nfs_access_cache_timeout > 0) { - wmode = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | + waccess = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND | NFS_ACCESS_EXECUTE | NFS_ACCESS_DELETE | NFS_ACCESS_LOOKUP; } else { - wmode = mode; + waccess = access; } if ((error = nfs_node_lock(np))) @@ -598,39 +631,44 @@ nfs_vnop_access( * this request? */ uid = kauth_cred_getuid(vfs_context_ucred(ctx)); - slot = nfs_node_mode_slot(np, uid, 0); + slot = nfs_node_access_slot(np, uid, 0); dorpc = 1; - if (mode == 0) { + if (access == 0) { /* not asking for any rights understood by NFS, so don't bother doing an RPC */ /* OSAddAtomic(1, &nfsstats.accesscache_hits); */ dorpc = 0; - wmode = 0; - } else if (NMODEVALID(np, slot)) { + waccess = 0; + } else if (NACCESSVALID(np, slot)) { microuptime(&now); - if ((now.tv_sec < (np->n_modestamp[slot] + nfs_access_cache_timeout)) && - ((np->n_mode[slot] & mode) == mode)) { + if ((now.tv_sec < (np->n_accessstamp[slot] + nfs_access_cache_timeout)) && + ((np->n_access[slot] & access) == access)) { /* OSAddAtomic(1, &nfsstats.accesscache_hits); */ dorpc = 0; - wmode = np->n_mode[slot]; + waccess = np->n_access[slot]; } } nfs_node_unlock(np); if (dorpc) { /* Either a no, or a don't know. Go to the wire. 
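nfs_vnop_open below translates the BSD open mode into NFSv4 share-reservation terms: FREAD/FWRITE map to read/write share access, and O_EXLOCK/O_SHLOCK would map to deny-both/deny-write (before the XXX that forces deny modes off for now). A sketch of the mapping, with illustrative constant values rather than the real FREAD/FWRITE/O_*LOCK and NFS_OPEN_SHARE_* definitions:

#include <stdint.h>

enum { MODE_FREAD = 0x1, MODE_FWRITE = 0x2, MODE_EXLOCK = 0x4, MODE_SHLOCK = 0x8 };
enum { ACCESS_READ = 0x1, ACCESS_WRITE = 0x2 };
enum { DENY_NONE = 0x0, DENY_WRITE = 0x1, DENY_BOTH = 0x3 };

static void
open_mode_to_share(int mode, uint32_t *access, uint32_t *deny)
{
	*access = 0;
	if (mode & MODE_FREAD)
		*access |= ACCESS_READ;
	if (mode & MODE_FWRITE)
		*access |= ACCESS_WRITE;
	if (mode & MODE_EXLOCK)
		*deny = DENY_BOTH;        /* O_EXLOCK: no other reader or writer */
	else if (mode & MODE_SHLOCK)
		*deny = DENY_WRITE;       /* O_SHLOCK: no other writer */
	else
		*deny = DENY_NONE;
}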
*/ /* OSAddAtomic(1, &nfsstats.accesscache_misses); */ - error = nmp->nm_funcs->nf_access_rpc(np, &wmode, ctx); + error = nmp->nm_funcs->nf_access_rpc(np, &waccess, ctx); } - if (!error && ((wmode & mode) != mode)) + if (!error && ((waccess & access) != access)) error = EACCES; return (error); } + /* * NFS open vnode op + * + * Perform various update/invalidation checks and then add the + * open to the node. Regular files will have an open file structure + * on the node and, for NFSv4, perform an OPEN request on the server. */ int -nfs3_vnop_open( +nfs_vnop_open( struct vnop_open_args /* { struct vnodeop_desc *a_desc; vnode_t a_vp; @@ -642,17 +680,25 @@ nfs3_vnop_open( vnode_t vp = ap->a_vp; nfsnode_t np = VTONFS(vp); struct nfsmount *nmp; - struct nfs_vattr nvattr; + int error, accessMode, denyMode, opened = 0; + struct nfs_open_owner *noop = NULL; + struct nfs_open_file *nofp = NULL; enum vtype vtype; - int error; + + if (!(ap->a_mode & (FREAD|FWRITE))) + return (EINVAL); nmp = VTONMP(vp); if (!nmp) return (ENXIO); + if (np->n_flag & NREVOKE) + return (EIO); vtype = vnode_vtype(vp); if ((vtype != VREG) && (vtype != VDIR) && (vtype != VLNK)) return (EACCES); + + /* First, check if we need to update/invalidate */ if (ISSET(np->n_flag, NUPDATESIZE)) nfs_data_update_size(np, 0); if ((error = nfs_node_lock(np))) @@ -666,7 +712,7 @@ nfs3_vnop_open( if ((error = nfs_node_lock(np))) return (error); } - if (vnode_vtype(NFSTOV(np)) == VREG) + if (vtype == VREG) np->n_lastrahead = -1; if (np->n_flag & NMODIFIED) { if (vtype == VDIR) @@ -677,12 +723,145 @@ nfs3_vnop_open( } else { nfs_node_unlock(np); } + /* nfs_getattr() will check changed and purge caches */ - return (nfs_getattr(np, &nvattr, ctx, NGA_UNCACHED)); + if ((error = nfs_getattr(np, NULL, ctx, NGA_UNCACHED))) + return (error); + + if (vtype != VREG) { + /* Just mark that it was opened */ + lck_mtx_lock(&np->n_openlock); + np->n_openrefcnt++; + lck_mtx_unlock(&np->n_openlock); + return (0); + } + + /* mode contains some combination of: FREAD, FWRITE, O_SHLOCK, O_EXLOCK */ + accessMode = 0; + if (ap->a_mode & FREAD) + accessMode |= NFS_OPEN_SHARE_ACCESS_READ; + if (ap->a_mode & FWRITE) + accessMode |= NFS_OPEN_SHARE_ACCESS_WRITE; + if (ap->a_mode & O_EXLOCK) + denyMode = NFS_OPEN_SHARE_DENY_BOTH; + else if (ap->a_mode & O_SHLOCK) + denyMode = NFS_OPEN_SHARE_DENY_WRITE; + else + denyMode = NFS_OPEN_SHARE_DENY_NONE; + // XXX don't do deny modes just yet (and never do it for !v4) + denyMode = NFS_OPEN_SHARE_DENY_NONE; + + noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 1); + if (!noop) + return (ENOMEM); + +restart: + error = nfs_mount_state_in_use_start(nmp, vfs_context_thread(ctx)); + if (error) { + nfs_open_owner_rele(noop); + return (error); + } + if (np->n_flag & NREVOKE) { + error = EIO; + nfs_mount_state_in_use_end(nmp, 0); + nfs_open_owner_rele(noop); + return (error); + } + + error = nfs_open_file_find(np, noop, &nofp, accessMode, denyMode, 1); + if (!error && (nofp->nof_flags & NFS_OPEN_FILE_LOST)) { + NP(np, "nfs_vnop_open: LOST %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + error = EIO; + } + if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { + nfs_mount_state_in_use_end(nmp, 0); + error = nfs4_reopen(nofp, vfs_context_thread(ctx)); + nofp = NULL; + if (!error) + goto restart; + } + if (!error) + error = nfs_open_file_set_busy(nofp, vfs_context_thread(ctx)); + if (error) { + nofp = NULL; + goto out; + } + + if (nmp->nm_vers < NFS_VER4) { + /* + * NFS v2/v3 opens are always allowed - so just add it. 
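Every stateful path in this file (open, close, setattr) shares the same shape: enter the mount's state-in-use bracket, do the work, and restart from the top if ending the bracket reports that recovery ran underneath us. A skeletal sketch of the idiom; the helpers are stand-ins for the nfs_mount_state_in_use_start/_end calls:

struct mount_state;                       /* opaque stand-in for struct nfsmount */
int state_in_use_start(struct mount_state *);     /* blocks during recovery */
int state_in_use_end(struct mount_state *, int);  /* nonzero => retry needed */
int do_open_or_close_rpc(struct mount_state *);

static int
stateful_op(struct mount_state *ms)
{
	int error;
restart:
	if ((error = state_in_use_start(ms)))
		return (error);
	error = do_open_or_close_rpc(ms);
	if (state_in_use_end(ms, error))
		goto restart;    /* state was recovered: redo against new state */
	return (error);
}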
+ */ + nfs_open_file_add_open(nofp, accessMode, denyMode, 0); + goto out; + } + + /* + * If we just created the file and the modes match, then we simply use + * the open performed in the create. Otherwise, send the request. + */ + if ((nofp->nof_flags & NFS_OPEN_FILE_CREATE) && + (nofp->nof_creator == current_thread()) && + (accessMode == NFS_OPEN_SHARE_ACCESS_BOTH) && + (denyMode == NFS_OPEN_SHARE_DENY_NONE)) { + nofp->nof_flags &= ~NFS_OPEN_FILE_CREATE; + nofp->nof_creator = NULL; + } else { + if (!opened) + error = nfs4_open(np, nofp, accessMode, denyMode, ctx); + if ((error == EACCES) && (nofp->nof_flags & NFS_OPEN_FILE_CREATE) && + (nofp->nof_creator == current_thread())) { + /* + * Ugh. This can happen if we just created the file with read-only + * perms and we're trying to open it for real with different modes + * (e.g. write-only or with a deny mode) and the server decides to + * not allow the second open because of the read-only perms. + * The best we can do is to just use the create's open. + * We may have access we don't need or we may not have a requested + * deny mode. We may log complaints later, but we'll try to avoid it. + */ + if (denyMode != NFS_OPEN_SHARE_DENY_NONE) + NP(np, "nfs_vnop_open: deny mode foregone on create, %d", kauth_cred_getuid(nofp->nof_owner->noo_cred)); + nofp->nof_creator = NULL; + error = 0; + } + if (error) + goto out; + opened = 1; + /* + * If we had just created the file, we already had it open. + * If the actual open mode is less than what we grabbed at + * create time, then we'll downgrade the open here. + */ + if ((nofp->nof_flags & NFS_OPEN_FILE_CREATE) && + (nofp->nof_creator == current_thread())) { + error = nfs_close(np, nofp, NFS_OPEN_SHARE_ACCESS_BOTH, NFS_OPEN_SHARE_DENY_NONE, ctx); + if (error) + NP(np, "nfs_vnop_open: create close error %d, %d", error, kauth_cred_getuid(nofp->nof_owner->noo_cred)); + if (!nfs_mount_state_error_should_restart(error)) { + error = 0; + nofp->nof_flags &= ~NFS_OPEN_FILE_CREATE; + } + } + } + +out: + if (nofp) + nfs_open_file_clear_busy(nofp); + if (nfs_mount_state_in_use_end(nmp, error)) { + nofp = NULL; + goto restart; + } + if (error) + NP(np, "nfs_vnop_open: error %d, %d", error, kauth_cred_getuid(noop->noo_cred)); + if (noop) + nfs_open_owner_rele(noop); + return (error); } + /* * NFS close vnode op + * * What an NFS client should do upon close after writing is a debatable issue. * Most NFS clients push delayed writes to the server upon close, basically for * two reasons: @@ -700,11 +879,11 @@ nfs3_vnop_open( * * The current code does the following: * for NFS Version 2 - play it safe and flush/invalidate all dirty buffers - * for NFS Version 3 - flush dirty buffers to the server but don't invalidate - * them. + * for NFS Version 3 - flush dirty buffers to the server but don't invalidate them. 
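Reduced to its decision, the per-version policy in that comment looks like the sketch below (the helper shapes are assumed): NFSv2 writes and then discards the buffers, while the later versions push them to the server and keep them cached.

/* sketch: what to do with dirty data when a write-opened, modified
 * regular file is closed, per protocol version */
static int
close_dirty_data(int nfsvers,
    int (*flush)(void *), int (*save_and_invalidate)(void *), void *node)
{
	if (nfsvers == 2)
		return (save_and_invalidate(node)); /* v2: flush AND toss buffers */
	return (flush(node));                   /* v3+: flush, keep buffers */
}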
+ * for NFS Version 4 - basically the same as NFSv3 */ int -nfs3_vnop_close( +nfs_vnop_close( struct vnop_close_args /* { struct vnodeop_desc *a_desc; vnode_t a_vp; @@ -716,35 +895,36 @@ nfs3_vnop_close( vnode_t vp = ap->a_vp; nfsnode_t np = VTONFS(vp); struct nfsmount *nmp; - int nfsvers; - int error = 0; + int error = 0, error1, nfsvers; + int fflag = ap->a_fflag; + enum vtype vtype; + int accessMode, denyMode; + struct nfs_open_owner *noop = NULL; + struct nfs_open_file *nofp = NULL; - if (vnode_vtype(vp) != VREG) - return (0); nmp = VTONMP(vp); if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + vtype = vnode_vtype(vp); + /* First, check if we need to update/flush/invalidate */ if (ISSET(np->n_flag, NUPDATESIZE)) nfs_data_update_size(np, 0); - if ((error = nfs_node_lock(np))) - return (error); + nfs_node_lock_force(np); if (np->n_flag & NNEEDINVALIDATE) { np->n_flag &= ~NNEEDINVALIDATE; nfs_node_unlock(np); nfs_vinvalbuf(vp, V_SAVE|V_IGNORE_WRITEERR, ctx, 1); - if ((error = nfs_node_lock(np))) - return (error); + nfs_node_lock_force(np); } - if (np->n_flag & NMODIFIED) { + if ((vtype == VREG) && (np->n_flag & NMODIFIED) && (fflag & FWRITE)) { + /* we're closing an open for write and the file is modified, so flush it */ nfs_node_unlock(np); if (nfsvers != NFS_VER2) error = nfs_flush(np, MNT_WAIT, vfs_context_thread(ctx), 0); else error = nfs_vinvalbuf(vp, V_SAVE, ctx, 1); - if (error) - return (error); nfs_node_lock_force(np); NATTRINVALIDATE(np); } @@ -753,9 +933,266 @@ nfs3_vnop_close( error = np->n_error; } nfs_node_unlock(np); + + if (vtype != VREG) { + /* Just mark that it was closed */ + lck_mtx_lock(&np->n_openlock); + if (np->n_openrefcnt == 0) { + if (fflag & (FREAD|FWRITE)) { + NP(np, "nfs_vnop_close: open reference underrun"); + error = EINVAL; + } + } else if (fflag & (FREAD|FWRITE)) { + np->n_openrefcnt--; + } else { + /* No FREAD/FWRITE set - probably the final close */ + np->n_openrefcnt = 0; + } + lck_mtx_unlock(&np->n_openlock); + return (error); + } + error1 = error; + + /* fflag should contain some combination of: FREAD, FWRITE, FHASLOCK */ + accessMode = 0; + if (fflag & FREAD) + accessMode |= NFS_OPEN_SHARE_ACCESS_READ; + if (fflag & FWRITE) + accessMode |= NFS_OPEN_SHARE_ACCESS_WRITE; +// XXX It would be nice if we still had the O_EXLOCK/O_SHLOCK flags that were on the open +// if (fflag & O_EXLOCK) +// denyMode = NFS_OPEN_SHARE_DENY_BOTH; +// else if (fflag & O_SHLOCK) +// denyMode = NFS_OPEN_SHARE_DENY_WRITE; +// else +// denyMode = NFS_OPEN_SHARE_DENY_NONE; + if (fflag & FHASLOCK) { + /* XXX assume FHASLOCK is for the deny mode and not flock */ + /* FHASLOCK flock will be unlocked in the close path, but the flag is not cleared. */ + if (nofp->nof_deny & NFS_OPEN_SHARE_DENY_READ) + denyMode = NFS_OPEN_SHARE_DENY_BOTH; + else if (nofp->nof_deny & NFS_OPEN_SHARE_DENY_WRITE) + denyMode = NFS_OPEN_SHARE_DENY_WRITE; + else + denyMode = NFS_OPEN_SHARE_DENY_NONE; + } else { + denyMode = NFS_OPEN_SHARE_DENY_NONE; + } + // XXX don't do deny modes just yet (and never do it for !v4) + denyMode = NFS_OPEN_SHARE_DENY_NONE; + + if (!accessMode) { + /* + * No mode given to close? + * Guess this is the final close. + * We should unlock all locks and close all opens. 
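For non-regular files the open state shown above is only a counter guarded by n_openlock: open bumps n_openrefcnt, close drops it, and a close without FREAD or FWRITE is treated as the final close and zeroes it. A userland sketch of the same bookkeeping, with a pthread mutex standing in for the node lock:

#include <pthread.h>
#include <errno.h>

struct nfsnode_sketch {
	pthread_mutex_t openlock;
	int             openrefcnt;
};

static int
special_close(struct nfsnode_sketch *np, int rw_bits /* FREAD|FWRITE */)
{
	int error = 0;

	pthread_mutex_lock(&np->openlock);
	if (np->openrefcnt == 0) {
		if (rw_bits)
			error = EINVAL;      /* "open reference underrun" */
	} else if (rw_bits) {
		np->openrefcnt--;        /* one open reference released */
	} else {
		np->openrefcnt = 0;      /* no FREAD/FWRITE: assume final close */
	}
	pthread_mutex_unlock(&np->openlock);
	return (error);
}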
+ */ + mount_t mp = vnode_mount(vp); + int force = (!mp || (mp->mnt_kern_flag & MNTK_FRCUNMOUNT)); + nfs_release_open_state_for_node(np, force); + return (error); + } + + noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 0); + if (!noop) { + // printf("nfs_vnop_close: can't get open owner!\n"); + return (EIO); + } + +restart: + error = nfs_mount_state_in_use_start(nmp, NULL); + if (error) { + nfs_open_owner_rele(noop); + return (error); + } + + error = nfs_open_file_find(np, noop, &nofp, 0, 0, 0); + if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { + nfs_mount_state_in_use_end(nmp, 0); + error = nfs4_reopen(nofp, NULL); + nofp = NULL; + if (!error) + goto restart; + } + if (error) { + NP(np, "nfs_vnop_close: no open file for owner, error %d, %d", error, kauth_cred_getuid(noop->noo_cred)); + error = EBADF; + goto out; + } + error = nfs_open_file_set_busy(nofp, NULL); + if (error) { + nofp = NULL; + goto out; + } + + error = nfs_close(np, nofp, accessMode, denyMode, ctx); + if (error) + NP(np, "nfs_vnop_close: close error %d, %d", error, kauth_cred_getuid(noop->noo_cred)); + +out: + if (nofp) + nfs_open_file_clear_busy(nofp); + if (nfs_mount_state_in_use_end(nmp, error)) { + nofp = NULL; + goto restart; + } + if (!error) + error = error1; + if (error) + NP(np, "nfs_vnop_close: error %d, %d", error, kauth_cred_getuid(noop->noo_cred)); + if (noop) + nfs_open_owner_rele(noop); return (error); } +/* + * nfs_close(): common function that does all the heavy lifting of file closure + * + * Takes an open file structure and a set of access/deny modes and figures out how + * to update the open file structure (and the state on the server) appropriately. + */ +int +nfs_close( + nfsnode_t np, + struct nfs_open_file *nofp, + uint32_t accessMode, + uint32_t denyMode, + vfs_context_t ctx) +{ + struct nfs_lock_owner *nlop; + int error = 0, changed = 0, delegated = 0, closed = 0, downgrade = 0; + uint32_t newAccessMode, newDenyMode; + + /* warn if modes don't match current state */ + if (((accessMode & nofp->nof_access) != accessMode) || ((denyMode & nofp->nof_deny) != denyMode)) + NP(np, "nfs_close: mode mismatch %d %d, current %d %d, %d", + accessMode, denyMode, nofp->nof_access, nofp->nof_deny, + kauth_cred_getuid(nofp->nof_owner->noo_cred)); + + /* + * If we're closing a write-only open, we may not have a write-only count + * if we also grabbed read access. So, check the read-write count. + */ + if (denyMode == NFS_OPEN_SHARE_DENY_NONE) { + if ((accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) && + (nofp->nof_w == 0) && (nofp->nof_d_w == 0) && + (nofp->nof_rw || nofp->nof_d_rw)) + accessMode = NFS_OPEN_SHARE_ACCESS_BOTH; + } else if (denyMode == NFS_OPEN_SHARE_DENY_WRITE) { + if ((accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) && + (nofp->nof_w_dw == 0) && (nofp->nof_d_w_dw == 0) && + (nofp->nof_rw_dw || nofp->nof_d_rw_dw)) + accessMode = NFS_OPEN_SHARE_ACCESS_BOTH; + } else { /* NFS_OPEN_SHARE_DENY_BOTH */ + if ((accessMode == NFS_OPEN_SHARE_ACCESS_WRITE) && + (nofp->nof_w_drw == 0) && (nofp->nof_d_w_drw == 0) && + (nofp->nof_rw_drw || nofp->nof_d_rw_drw)) + accessMode = NFS_OPEN_SHARE_ACCESS_BOTH; + } + + nfs_open_file_remove_open_find(nofp, accessMode, denyMode, &newAccessMode, &newDenyMode, &delegated); + if ((newAccessMode != nofp->nof_access) || (newDenyMode != nofp->nof_deny)) + changed = 1; + else + changed = 0; + + if (NFSTONMP(np)->nm_vers < NFS_VER4) /* NFS v2/v3 closes simply need to remove the open. 
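One recovery detail in nfs_close just below deserves a callout: if the CLOSE RPC fails because the server still holds byte-range locks for this owner, the client unlocks the whole range 0..UINT64_MAX and retries the CLOSE once. A sketch of that step; the RPC helpers and the error constant are stand-ins, not the real nfs4_*_rpc signatures:

#include <stdint.h>

enum { ERR_LOCKS_HELD = 1 };   /* stand-in for NFSERR_LOCKS_HELD */

struct open_state;
int close_rpc(struct open_state *);
int unlock_rpc(struct open_state *, uint64_t start, uint64_t end);

static int
close_releasing_locks(struct open_state *os)
{
	int error = close_rpc(os);

	if (error == ERR_LOCKS_HELD) {
		/* server says locks are held: drop them all, then retry */
		unlock_rpc(os, 0, UINT64_MAX);
		error = close_rpc(os);
	}
	return (error);
}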
*/ + goto v3close; + + if ((newAccessMode == 0) || (nofp->nof_opencnt == 1)) { + /* + * No more access after this close, so clean up and close it. + * Don't send a close RPC if we're closing a delegated open. + */ + nfs_wait_bufs(np); + closed = 1; + if (!delegated && !(nofp->nof_flags & NFS_OPEN_FILE_LOST)) + error = nfs4_close_rpc(np, nofp, vfs_context_thread(ctx), vfs_context_ucred(ctx), 0); + if (error == NFSERR_LOCKS_HELD) { + /* + * Hmm... the server says we have locks we need to release first + * Find the lock owner and try to unlock everything. + */ + nlop = nfs_lock_owner_find(np, vfs_context_proc(ctx), 0); + if (nlop) { + nfs4_unlock_rpc(np, nlop, F_WRLCK, 0, UINT64_MAX, + 0, vfs_context_thread(ctx), vfs_context_ucred(ctx)); + nfs_lock_owner_rele(nlop); + } + error = nfs4_close_rpc(np, nofp, vfs_context_thread(ctx), vfs_context_ucred(ctx), 0); + } + } else if (changed) { + /* + * File is still open but with less access, so downgrade the open. + * Don't send a downgrade RPC if we're closing a delegated open. + */ + if (!delegated && !(nofp->nof_flags & NFS_OPEN_FILE_LOST)) { + downgrade = 1; + /* + * If we have delegated opens, we should probably claim them before sending + * the downgrade because the server may not know the open we are downgrading to. + */ + if (nofp->nof_d_rw_drw || nofp->nof_d_w_drw || nofp->nof_d_r_drw || + nofp->nof_d_rw_dw || nofp->nof_d_w_dw || nofp->nof_d_r_dw || + nofp->nof_d_rw || nofp->nof_d_w || nofp->nof_d_r) + nfs4_claim_delegated_state_for_open_file(nofp, 0); + /* need to remove the open before sending the downgrade */ + nfs_open_file_remove_open(nofp, accessMode, denyMode); + error = nfs4_open_downgrade_rpc(np, nofp, ctx); + if (error) /* Hmm.. that didn't work. Add the open back in. */ + nfs_open_file_add_open(nofp, accessMode, denyMode, delegated); + } + } + + if (error) { + NP(np, "nfs_close: error %d, %d", error, kauth_cred_getuid(nofp->nof_owner->noo_cred)); + return (error); + } + +v3close: + if (!downgrade) + nfs_open_file_remove_open(nofp, accessMode, denyMode); + + if (closed) { + lck_mtx_lock(&nofp->nof_lock); + if (nofp->nof_r || nofp->nof_d_r || nofp->nof_w || nofp->nof_d_w || nofp->nof_d_rw || + (nofp->nof_rw && !((nofp->nof_flags & NFS_OPEN_FILE_CREATE) && !nofp->nof_creator && (nofp->nof_rw == 1))) || + nofp->nof_r_dw || nofp->nof_d_r_dw || nofp->nof_w_dw || nofp->nof_d_w_dw || + nofp->nof_rw_dw || nofp->nof_d_rw_dw || nofp->nof_r_drw || nofp->nof_d_r_drw || + nofp->nof_w_drw || nofp->nof_d_w_drw || nofp->nof_rw_drw || nofp->nof_d_rw_drw) + NP(np, "nfs_close: unexpected count: %u.%u %u.%u %u.%u dw %u.%u %u.%u %u.%u drw %u.%u %u.%u %u.%u flags 0x%x, %d", + nofp->nof_r, nofp->nof_d_r, nofp->nof_w, nofp->nof_d_w, + nofp->nof_rw, nofp->nof_d_rw, nofp->nof_r_dw, nofp->nof_d_r_dw, + nofp->nof_w_dw, nofp->nof_d_w_dw, nofp->nof_rw_dw, nofp->nof_d_rw_dw, + nofp->nof_r_drw, nofp->nof_d_r_drw, nofp->nof_w_drw, nofp->nof_d_w_drw, + nofp->nof_rw_drw, nofp->nof_d_rw_drw, nofp->nof_flags, + kauth_cred_getuid(nofp->nof_owner->noo_cred)); + /* clear out all open info, just to be safe */ + nofp->nof_access = nofp->nof_deny = 0; + nofp->nof_mmap_access = nofp->nof_mmap_deny = 0; + nofp->nof_r = nofp->nof_d_r = 0; + nofp->nof_w = nofp->nof_d_w = 0; + nofp->nof_rw = nofp->nof_d_rw = 0; + nofp->nof_r_dw = nofp->nof_d_r_dw = 0; + nofp->nof_w_dw = nofp->nof_d_w_dw = 0; + nofp->nof_rw_dw = nofp->nof_d_rw_dw = 0; + nofp->nof_r_drw = nofp->nof_d_r_drw = 0; + nofp->nof_w_drw = nofp->nof_d_w_drw = 0; + nofp->nof_rw_drw = nofp->nof_d_rw_drw = 0; + nofp->nof_flags 
&= ~NFS_OPEN_FILE_CREATE; + lck_mtx_unlock(&nofp->nof_lock); + /* XXX we may potentially want to clean up idle/unused open file structures */ + } + if (nofp->nof_flags & NFS_OPEN_FILE_LOST) { + error = EIO; + NP(np, "nfs_close: LOST%s, %d", !nofp->nof_opencnt ? " (last)" : "", + kauth_cred_getuid(nofp->nof_owner->noo_cred)); + } + return (error); +} + + + int nfs3_getattr_rpc( @@ -763,18 +1200,22 @@ nfs3_getattr_rpc( mount_t mp, u_char *fhp, size_t fhsize, + int flags, vfs_context_t ctx, struct nfs_vattr *nvap, u_int64_t *xidp) { struct nfsmount *nmp = mp ? VFSTONFS(mp) : NFSTONMP(np); - int error = 0, status, nfsvers; + int error = 0, status, nfsvers, rpcflags = 0; struct nfsm_chain nmreq, nmrep; if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + if (flags & NGA_MONITOR) /* vnode monitor requests should be soft */ + rpcflags = R_RECOVER; + nfsm_chain_null(&nmreq); nfsm_chain_null(&nmrep); @@ -784,8 +1225,9 @@ nfs3_getattr_rpc( nfsm_chain_add_opaque(error, &nmreq, fhp, fhsize); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfs_request(np, mp, &nmreq, NFSPROC_GETATTR, ctx, - &nmrep, xidp, &status); + error = nfs_request2(np, mp, &nmreq, NFSPROC_GETATTR, + vfs_context_thread(ctx), vfs_context_ucred(ctx), + NULL, rpcflags, &nmrep, xidp, &status); if (!error) error = status; nfsmout_if(error); @@ -798,10 +1240,11 @@ nfsmout: int -nfs_getattr(nfsnode_t np, struct nfs_vattr *nvap, vfs_context_t ctx, int uncached) +nfs_getattr(nfsnode_t np, struct nfs_vattr *nvap, vfs_context_t ctx, int flags) { struct nfsmount *nmp; int error = 0, nfsvers, inprogset = 0, wanted = 0, avoidfloods; + struct nfs_vattr nvattr; struct timespec ts = { 2, 0 }; u_int64_t xid; @@ -811,6 +1254,10 @@ nfs_getattr(nfsnode_t np, struct nfs_vattr *nvap, vfs_context_t ctx, int uncache return (ENXIO); nfsvers = nmp->nm_vers; + if (!nvap) + nvap = &nvattr; + NVATTR_INIT(nvap); + /* Update local times for special files. */ if (np->n_flag & (NACC | NUPD)) { nfs_node_lock_force(np); @@ -823,15 +1270,27 @@ nfs_getattr(nfsnode_t np, struct nfs_vattr *nvap, vfs_context_t ctx, int uncache error = nfs_node_lock(np); nfsmout_if(error); - if (!uncached) { + if (!(flags & (NGA_UNCACHED|NGA_MONITOR)) || ((nfsvers >= NFS_VER4) && (np->n_openflags & N_DELEG_MASK))) { + /* + * Use the cache or wait for any getattr in progress if: + * - it's a cached request, or + * - we have a delegation + */ while (1) { - error = nfs_getattrcache(np, nvap); + error = nfs_getattrcache(np, nvap, flags); if (!error || (error != ENOENT)) { nfs_node_unlock(np); goto nfsmout; } + error = 0; if (!ISSET(np->n_flag, NGETATTRINPROG)) break; + if (flags & NGA_MONITOR) { + /* no need to wait if a request is pending */ + error = EINPROGRESS; + nfs_node_unlock(np); + goto nfsmout; + } SET(np->n_flag, NGETATTRWANT); msleep(np, &np->n_lock, PZERO-1, "nfsgetattrwant", &ts); if ((error = nfs_sigintr(NFSTONMP(np), NULL, vfs_context_thread(ctx), 0))) { @@ -844,30 +1303,33 @@ nfs_getattr(nfsnode_t np, struct nfs_vattr *nvap, vfs_context_t ctx, int uncache } else if (!ISSET(np->n_flag, NGETATTRINPROG)) { SET(np->n_flag, NGETATTRINPROG); inprogset = 1; + } else if (flags & NGA_MONITOR) { + /* no need to make a request if one is pending */ + error = EINPROGRESS; } nfs_node_unlock(np); nmp = NFSTONMP(np); - if (!nmp) { + if (!nmp) error = ENXIO; + if (error) goto nfsmout; - } /* - * Try to get both the attributes and access info by making an - * ACCESS call and seeing if it returns updated attributes. 
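The NGETATTRINPROG/NGETATTRWANT dance in nfs_getattr above serializes attribute fetches per node: one thread issues the RPC while the others sleep, and an NGA_MONITOR caller never waits, returning EINPROGRESS because a fresh fetch is already pending. A simplified pthread sketch of that gate (msleep/wakeup replaced by a condition variable; real waiters re-check the attribute cache after waking):

#include <pthread.h>
#include <errno.h>

struct attr_gate {
	pthread_mutex_t lock;
	pthread_cond_t  cv;
	int             inprog;   /* NGETATTRINPROG */
};

/* returns 0 when the caller should issue the RPC itself */
static int
gate_enter(struct attr_gate *g, int monitor)
{
	pthread_mutex_lock(&g->lock);
	while (g->inprog) {
		if (monitor) {
			/* monitor requests don't wait: one is already in flight */
			pthread_mutex_unlock(&g->lock);
			return (EINPROGRESS);
		}
		pthread_cond_wait(&g->cv, &g->lock);
	}
	g->inprog = 1;
	pthread_mutex_unlock(&g->lock);
	return (0);
}

static void
gate_exit(struct attr_gate *g)
{
	pthread_mutex_lock(&g->lock);
	g->inprog = 0;
	pthread_cond_broadcast(&g->cv);   /* wake all NGETATTRWANT-style waiters */
	pthread_mutex_unlock(&g->lock);
}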
+ * We might want to try to get both the attributes and access info by + * making an ACCESS call and seeing if it returns updated attributes. * But don't bother if we aren't caching access info or if the * attributes returned wouldn't be cached. */ - if ((nfsvers != NFS_VER2) && (nfs_access_cache_timeout > 0)) { + if (!(flags & NGA_ACL) && (nfsvers != NFS_VER2) && nfs_access_for_getattr && (nfs_access_cache_timeout > 0)) { if (nfs_attrcachetimeout(np) > 0) { /* OSAddAtomic(1, &nfsstats.accesscache_misses); */ - u_int32_t mode = NFS_ACCESS_ALL; - error = nmp->nm_funcs->nf_access_rpc(np, &mode, ctx); + u_int32_t access = NFS_ACCESS_ALL; + error = nmp->nm_funcs->nf_access_rpc(np, &access, ctx); if (error) goto nfsmout; nfs_node_lock_force(np); - error = nfs_getattrcache(np, nvap); + error = nfs_getattrcache(np, nvap, flags); nfs_node_unlock(np); if (!error || (error != ENOENT)) goto nfsmout; @@ -878,7 +1340,7 @@ nfs_getattr(nfsnode_t np, struct nfs_vattr *nvap, vfs_context_t ctx, int uncache avoidfloods = 0; tryagain: - error = nmp->nm_funcs->nf_getattr_rpc(np, NULL, np->n_fhp, np->n_fhsize, ctx, nvap, &xid); + error = nmp->nm_funcs->nf_getattr_rpc(np, NULL, np->n_fhp, np->n_fhsize, flags, ctx, nvap, &xid); if (!error) { nfs_node_lock_force(np); error = nfs_loadattrcache(np, nvap, &xid, 0); @@ -933,6 +1395,17 @@ nfsmout: if (wanted) wakeup(np); } + + if (nvap == &nvattr) { + NVATTR_CLEANUP(nvap); + } else if (!(flags & NGA_ACL)) { + /* make sure we don't return an ACL if it wasn't asked for */ + NFS_BITMAP_CLR(nvap->nva_bitmap, NFS_FATTR_ACL); + if (nvap->nva_acl) { + kauth_acl_free(nvap->nva_acl); + nvap->nva_acl = NULL; + } + } FSDBG_BOT(513, np->n_size, error, np->n_vattr.nva_size, np->n_flag); return (error); } @@ -1002,20 +1475,20 @@ nfs_vnop_setattr( struct nfsmount *nmp; struct vnode_attr *vap = ap->a_vap; int error = 0; - int biosize, nfsvers; - u_quad_t origsize; + int biosize, nfsvers, namedattrs; + u_quad_t origsize, vapsize; struct nfs_dulookup dul; nfsnode_t dnp = NULL; vnode_t dvp = NULL; const char *vname = NULL; struct nfs_open_owner *noop = NULL; struct nfs_open_file *nofp = NULL; - struct nfs_vattr nvattr; nmp = VTONMP(vp); if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + namedattrs = (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR); biosize = nmp->nm_biosize; /* Disallow write attempts if the filesystem is mounted read-only. */ @@ -1058,46 +1531,52 @@ nfs_vnop_setattr( /* flush everything */ error = nfs_vinvalbuf(vp, (vap->va_data_size ? 
V_SAVE : 0) , ctx, 1); if (error) { - printf("nfs_setattr: nfs_vinvalbuf %d\n", error); + NP(np, "nfs_setattr: nfs_vinvalbuf %d", error); FSDBG_BOT(512, np->n_size, vap->va_data_size, np->n_vattr.nva_size, -1); return (error); } if (nfsvers >= NFS_VER4) { /* setting file size requires having the file open for write access */ + if (np->n_flag & NREVOKE) + return (EIO); noop = nfs_open_owner_find(nmp, vfs_context_ucred(ctx), 1); if (!noop) return (ENOMEM); -retryopen: +restart: + error = nfs_mount_state_in_use_start(nmp, vfs_context_thread(ctx)); + if (error) + return (error); + if (np->n_flag & NREVOKE) { + nfs_mount_state_in_use_end(nmp, 0); + return (EIO); + } error = nfs_open_file_find(np, noop, &nofp, 0, 0, 1); if (!error && (nofp->nof_flags & NFS_OPEN_FILE_LOST)) error = EIO; if (!error && (nofp->nof_flags & NFS_OPEN_FILE_REOPEN)) { - nfs4_reopen(nofp, vfs_context_thread(ctx)); + nfs_mount_state_in_use_end(nmp, 0); + error = nfs4_reopen(nofp, vfs_context_thread(ctx)); nofp = NULL; - goto retryopen; + if (!error) + goto restart; } + if (!error) + error = nfs_open_file_set_busy(nofp, vfs_context_thread(ctx)); if (error) { nfs_open_owner_rele(noop); return (error); } if (!(nofp->nof_access & NFS_OPEN_SHARE_ACCESS_WRITE)) { /* we don't have the file open for write access, so open it */ - error = nfs_mount_state_in_use_start(nmp); - if (!error) - error = nfs_open_file_set_busy(nofp, vfs_context_thread(ctx)); - if (error) { - nfs_open_owner_rele(noop); - return (error); - } error = nfs4_open(np, nofp, NFS_OPEN_SHARE_ACCESS_WRITE, NFS_OPEN_SHARE_DENY_NONE, ctx); if (!error) nofp->nof_flags |= NFS_OPEN_FILE_SETATTR; if (nfs_mount_state_error_should_restart(error)) { nfs_open_file_clear_busy(nofp); nofp = NULL; + if (nfs_mount_state_in_use_end(nmp, error)) + goto restart; } - if (nfs_mount_state_in_use_end(nmp, error)) - goto retryopen; } } nfs_data_lock(np, NFS_DATA_LOCK_EXCLUSIVE); @@ -1198,61 +1677,52 @@ retryopen: nfs_node_unlock(np); } } - if (VATTR_IS_ACTIVE(vap, va_mode) || - VATTR_IS_ACTIVE(vap, va_uid) || - VATTR_IS_ACTIVE(vap, va_gid)) { - if ((error = nfs_node_lock(np))) { - if (VATTR_IS_ACTIVE(vap, va_data_size)) - nfs_data_unlock(np); - return (error); - } - NMODEINVALIDATE(np); + if ((VATTR_IS_ACTIVE(vap, va_mode) || VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid) || + VATTR_IS_ACTIVE(vap, va_acl) || VATTR_IS_ACTIVE(vap, va_uuuid) || VATTR_IS_ACTIVE(vap, va_guuid)) && + !(error = nfs_node_lock(np))) { + NACCESSINVALIDATE(np); nfs_node_unlock(np); - dvp = vnode_getparent(vp); - vname = vnode_getname(vp); - dnp = (dvp && vname) ? VTONFS(dvp) : NULL; - if (dnp) { - error = nfs_node_set_busy(dnp, vfs_context_thread(ctx)); - if (error) { - dnp = NULL; - error = 0; + if (!namedattrs) { + dvp = vnode_getparent(vp); + vname = vnode_getname(vp); + dnp = (dvp && vname) ? 
VTONFS(dvp) : NULL; + if (dnp) { + error = nfs_node_set_busy(dnp, vfs_context_thread(ctx)); + if (error) { + dnp = NULL; + error = 0; + } + } + if (dnp) { + nfs_dulookup_init(&dul, dnp, vname, strlen(vname), ctx); + nfs_dulookup_start(&dul, dnp, ctx); } - } - if (dnp) { - nfs_dulookup_init(&dul, dnp, vname, strlen(vname), ctx); - nfs_dulookup_start(&dul, dnp, ctx); } } -retrysetattr: - if (VATTR_IS_ACTIVE(vap, va_data_size) && (nfsvers >= NFS_VER4)) - error = nfs_mount_state_in_use_start(nmp); - - if (!error) { + if (!error) error = nmp->nm_funcs->nf_setattr_rpc(np, vap, ctx); - if (VATTR_IS_ACTIVE(vap, va_data_size) && (nfsvers >= NFS_VER4)) - if (nfs_mount_state_in_use_end(nmp, error)) - goto retrysetattr; - } - - if (VATTR_IS_ACTIVE(vap, va_mode) || - VATTR_IS_ACTIVE(vap, va_uid) || - VATTR_IS_ACTIVE(vap, va_gid)) { - if (dnp) { - nfs_dulookup_finish(&dul, dnp, ctx); - nfs_node_clear_busy(dnp); + if (VATTR_IS_ACTIVE(vap, va_mode) || VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid) || + VATTR_IS_ACTIVE(vap, va_acl) || VATTR_IS_ACTIVE(vap, va_uuuid) || VATTR_IS_ACTIVE(vap, va_guuid)) { + if (!namedattrs) { + if (dnp) { + nfs_dulookup_finish(&dul, dnp, ctx); + nfs_node_clear_busy(dnp); + } + if (dvp != NULLVP) + vnode_put(dvp); + if (vname != NULL) + vnode_putname(vname); } - if (dvp != NULLVP) - vnode_put(dvp); - if (vname != NULL) - vnode_putname(vname); } FSDBG_BOT(512, np->n_size, vap->va_data_size, np->n_vattr.nva_size, error); if (VATTR_IS_ACTIVE(vap, va_data_size)) { - if (error && (origsize != np->n_size)) { + if (error && (origsize != np->n_size) && + ((nfsvers < NFS_VER4) || !nfs_mount_state_error_should_restart(error))) { /* make every effort to resync file size w/ server... */ + /* (don't bother if we'll be restarting the operation) */ int err; /* preserve "error" for return */ np->n_size = np->n_vattr.nva_size = origsize; nfs_node_lock_force(np); @@ -1260,10 +1730,12 @@ retrysetattr: nfs_node_unlock(np); FSDBG(512, np, np->n_size, np->n_vattr.nva_size, 0xf00d0002); ubc_setsize(vp, (off_t)np->n_size); /* XXX check error */ + vapsize = vap->va_data_size; vap->va_data_size = origsize; err = nmp->nm_funcs->nf_setattr_rpc(np, vap, ctx); if (err) - printf("nfs_vnop_setattr: nfs%d_setattr_rpc %d %d\n", nfsvers, error, err); + NP(np, "nfs_vnop_setattr: nfs%d_setattr_rpc %d %d", nfsvers, error, err); + vap->va_data_size = vapsize; } nfs_node_lock_force(np); /* @@ -1276,22 +1748,26 @@ retrysetattr: CLR(np->n_flag, NUPDATESIZE); NATTRINVALIDATE(np); nfs_node_unlock(np); - nfs_getattr(np, &nvattr, ctx, NGA_UNCACHED); + nfs_getattr(np, NULL, ctx, NGA_UNCACHED); } else { nfs_node_unlock(np); } nfs_data_unlock(np); if (nfsvers >= NFS_VER4) { - if (nofp->nof_flags & NFS_OPEN_FILE_SETATTR) { - int err = nfs4_close(np, nofp, NFS_OPEN_SHARE_ACCESS_WRITE, NFS_OPEN_SHARE_DENY_NONE, ctx); - if (err) { - vname = vnode_getname(NFSTOV(np)); - printf("nfs_vnop_setattr: close error: %d, %s\n", err, vname); - vnode_putname(vname); + if (nofp) { + /* don't close our setattr open if we'll be restarting... 
*/ + if (!nfs_mount_state_error_should_restart(error) && + (nofp->nof_flags & NFS_OPEN_FILE_SETATTR)) { + int err = nfs_close(np, nofp, NFS_OPEN_SHARE_ACCESS_WRITE, NFS_OPEN_SHARE_DENY_NONE, ctx); + if (err) + NP(np, "nfs_vnop_setattr: close error: %d", err); + nofp->nof_flags &= ~NFS_OPEN_FILE_SETATTR; } - nofp->nof_flags &= ~NFS_OPEN_FILE_SETATTR; nfs_open_file_clear_busy(nofp); + nofp = NULL; } + if (nfs_mount_state_in_use_end(nmp, error)) + goto restart; nfs_open_owner_rele(noop); } } @@ -1414,8 +1890,7 @@ nfs3_setattr_rpc( } nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC_SETATTR, ctx, - &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC_SETATTR, ctx, NULL, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; if (nfsvers == NFS_VER3) { @@ -1435,7 +1910,7 @@ nfs3_setattr_rpc( } else { if (!error) error = status; - nfsm_chain_loadattr(error, &nmrep, np, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid); } /* * We just changed the attributes and we want to make sure that we @@ -1495,6 +1970,7 @@ nfs_vnop_lookup( *vpp = NULLVP; dnp = VTONFS(dvp); + NVATTR_INIT(&nvattr); mp = vnode_mount(dvp); nmp = VFSTONFS(mp); @@ -1503,12 +1979,12 @@ nfs_vnop_lookup( goto error_return; } nfsvers = nmp->nm_vers; - negnamecache = !(nmp->nm_flag & NFSMNT_NONEGNAMECACHE); + negnamecache = !NMFLAG(nmp, NONEGNAMECACHE); if ((error = busyerror = nfs_node_set_busy(dnp, vfs_context_thread(ctx)))) goto error_return; /* nfs_getattr() will check changed and purge caches */ - if ((error = nfs_getattr(dnp, &nvattr, ctx, NGA_CACHED))) + if ((error = nfs_getattr(dnp, NULL, ctx, NGA_CACHED))) goto error_return; error = cache_lookup(dvp, vpp, cnp); @@ -1518,7 +1994,7 @@ nfs_vnop_lookup( goto error_return; case 0: /* cache miss */ - if ((nfsvers > NFS_VER2) && (nmp->nm_flag & NFSMNT_RDIRPLUS)) { + if ((nfsvers > NFS_VER2) && NMFLAG(nmp, RDIRPLUS)) { /* if rdirplus, try dir buf cache lookup */ error = nfs_dir_buf_cache_lookup(dnp, &np, cnp, ctx, 0); if (!error && np) { @@ -1535,8 +2011,10 @@ nfs_vnop_lookup( OSAddAtomic(1, &nfsstats.lookupcache_hits); nfs_node_clear_busy(dnp); + busyerror = ENOENT; /* check for directory access */ + naa.a_desc = &vnop_access_desc; naa.a_vp = dvp; naa.a_action = KAUTH_VNODE_SEARCH; naa.a_context = ctx; @@ -1561,6 +2039,11 @@ nfs_vnop_lookup( fh.fh_len = 0; goto found; } + if ((nfsvers >= NFS_VER4) && (dnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER)) { + /* we should never be looking things up in a trigger directory, return nothing */ + error = ENOENT; + goto error_return; + } /* do we know this name is too long? */ nmp = VTONMP(dvp); @@ -1581,7 +2064,7 @@ nfs_vnop_lookup( error = nmp->nm_funcs->nf_lookup_rpc_async(dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx, &req); nfsmout_if(error); - error = nmp->nm_funcs->nf_lookup_rpc_async_finish(dnp, ctx, req, &xid, &fh, &nvattr); + error = nmp->nm_funcs->nf_lookup_rpc_async_finish(dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx, req, &xid, &fh, &nvattr); nfsmout_if(error); /* is the file handle the same as this directory's file handle? */ @@ -1620,7 +2103,7 @@ found: nfs_node_unlock(dnp); } else { ngflags = (cnp->cn_flags & MAKEENTRY) ? 
NG_MAKEENTRY : 0; - error = nfs_nget(mp, dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, ngflags, &np); + error = nfs_nget(mp, dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, ngflags, &np); if (error) goto error_return; newvp = NFSTOV(np); @@ -1647,6 +2130,7 @@ nfsmout: nfs_node_unlock(dnp); } error_return: + NVATTR_CLEANUP(&nvattr); if (!busyerror) nfs_node_clear_busy(dnp); if (error && *vpp) { @@ -1656,26 +2140,6 @@ error_return: return (error); } -/* - * NFS read call. - * Just call nfs_bioread() to do the work. - */ -int -nfs_vnop_read( - struct vnop_read_args /* { - struct vnodeop_desc *a_desc; - vnode_t a_vp; - struct uio *a_uio; - int a_ioflag; - vfs_context_t a_context; - } */ *ap) -{ - if (vnode_vtype(ap->a_vp) != VREG) - return (EPERM); - return (nfs_bioread(VTONFS(ap->a_vp), ap->a_uio, ap->a_ioflag, ap->a_context)); -} - - /* * NFS readlink call */ @@ -1694,7 +2158,6 @@ nfs_vnop_readlink( int error = 0, nfsvers; uint32_t buflen; uio_t uio = ap->a_uio; - struct nfs_vattr nvattr; struct nfsbuf *bp = NULL; if (vnode_vtype(ap->a_vp) != VLNK) @@ -1711,7 +2174,7 @@ nfs_vnop_readlink( nfsvers = nmp->nm_vers; /* nfs_getattr() will check changed and purge caches */ - if ((error = nfs_getattr(np, &nvattr, ctx, NGA_CACHED))) { + if ((error = nfs_getattr(np, NULL, ctx, NGA_CACHED))) { FSDBG(531, np, 0xd1e0001, 0, error); return (error); } @@ -1764,8 +2227,7 @@ nfs3_readlink_rpc(nfsnode_t np, char *buf, uint32_t *buflenp, vfs_context_t ctx) nfsm_chain_add_fh(error, &nmreq, nfsvers, np->n_fhp, np->n_fhsize); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC_READLINK, ctx, - &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC_READLINK, ctx, NULL, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; if (nfsvers == NFS_VER3) @@ -1827,6 +2289,10 @@ nfs_read_rpc(nfsnode_t np, uio_t uio, vfs_context_t ctx) while (tsiz > 0) { len = retlen = (tsiz > (user_ssize_t)nmrsize) ? 
nmrsize : (size_t)tsiz; FSDBG(536, np, txoffset, len, 0); + if (np->n_flag & NREVOKE) { + error = EIO; + break; + } if (nmp->nm_vers >= NFS_VER4) stategenid = nmp->nm_stategenid; error = nmp->nm_funcs->nf_read_rpc_async(np, txoffset, len, @@ -1836,16 +2302,19 @@ nfs_read_rpc(nfsnode_t np, uio_t uio, vfs_context_t ctx) if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && (++restart <= nfs_mount_state_max_restarts(nmp))) { /* guard against no progress */ lck_mtx_lock(&nmp->nm_lock); - if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid) && !(nmp->nm_state & NFSSTA_RECOVER)) { - printf("nfs_read_rpc: error %d, initiating recovery\n", error); - nmp->nm_state |= NFSSTA_RECOVER; - nfs_mount_sock_thread_wake(nmp); + if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid)) { + NP(np, "nfs_read_rpc: error %d, initiating recovery", error); + nfs_need_recover(nmp, error); } lck_mtx_unlock(&nmp->nm_lock); - if (error == NFSERR_GRACE) - tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); - if (!(error = nfs_mount_state_wait_for_recovery(nmp))) - continue; + if (np->n_flag & NREVOKE) { + error = EIO; + } else { + if (error == NFSERR_GRACE) + tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); + if (!(error = nfs_mount_state_wait_for_recovery(nmp))) + continue; + } } if (error) break; @@ -1894,7 +2363,7 @@ nfs3_read_rpc_async( } nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfs_request_async(np, NULL, &nmreq, NFSPROC_READ, thd, cred, cb, reqp); + error = nfs_request_async(np, NULL, &nmreq, NFSPROC_READ, thd, cred, NULL, 0, cb, reqp); nfsmout: nfsm_chain_cleanup(&nmreq); return (error); @@ -1937,7 +2406,7 @@ nfs3_read_rpc_async_finish( nfsm_chain_adv(error, &nmrep, NFSX_UNSIGNED); nfsm_chain_get_32(error, &nmrep, eof); } else { - nfsm_chain_loadattr(error, &nmrep, np, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid); } if (!lockerror) nfs_node_unlock(np); @@ -1980,7 +2449,6 @@ nfs_vnop_write( nfsnode_t np = VTONFS(vp); int ioflag = ap->a_ioflag; struct nfsbuf *bp; - struct nfs_vattr nvattr; struct nfsmount *nmp = VTONMP(vp); daddr64_t lbn; int biosize; @@ -2046,7 +2514,7 @@ nfs_vnop_write( if (ioflag & IO_APPEND) { nfs_data_unlock(np); /* nfs_getattr() will check changed and purge caches */ - error = nfs_getattr(np, &nvattr, ctx, NGA_UNCACHED); + error = nfs_getattr(np, NULL, ctx, NGA_UNCACHED); /* we'll be extending the file, so take the data lock exclusive */ nfs_data_lock(np, NFS_DATA_LOCK_EXCLUSIVE); if (error) { @@ -2272,12 +2740,14 @@ again: * * Notes: * We don't want to read anything we're just going to write over. + * We don't want to read anything we're just going drop when the + * I/O is complete (i.e. don't do reads for NOCACHE requests). * We don't want to issue multiple I/Os if we don't have to * (because they're synchronous rpcs). * We don't want to read anything we already have modified in the * page cache. 
*/ - if (!ISSET(bp->nb_flags, NB_NOCACHE) && !ISSET(bp->nb_flags, NB_CACHE) && (n < biosize)) { + if (!ISSET(bp->nb_flags, NB_CACHE) && (n < biosize)) { int firstpg, lastpg, dirtypg; int firstpgoff, lastpgoff; start = end = -1; @@ -2296,6 +2766,22 @@ again: start = (lastpg * PAGE_SIZE) + lastpgoff; end = (lastpg + 1) * PAGE_SIZE; } + if (ISSET(bp->nb_flags, NB_NOCACHE)) { + /* + * For nocache writes, if there is any partial page at the + * start or end of the write range, then we do the write + * synchronously to make sure that we can drop the data + * from the cache as soon as the WRITE finishes. Normally, + * we would do an unstable write and not drop the data until + * it was committed. But doing that here would risk allowing + * invalid data to be read from the cache between the WRITE + * and the COMMIT. + * (NB_STABLE indicates that data writes should be FILESYNC) + */ + if (end > start) + SET(bp->nb_flags, NB_STABLE); + goto skipread; + } if (end > start) { /* need to read the data in range: start...end-1 */ @@ -2327,8 +2813,11 @@ again: uio_reset(auio, boff + start, UIO_SYSSPACE, UIO_READ); uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + start), on - start); error = nfs_read_rpc(np, auio, ctx); - if (error) /* couldn't read the data, so treat buffer as NOCACHE */ + if (error) { + /* couldn't read the data, so treat buffer as synchronous NOCACHE */ SET(bp->nb_flags, (NB_NOCACHE|NB_STABLE)); + goto skipread; + } if (uio_resid(auio) > 0) { FSDBG(516, bp, (caddr_t)uio_curriovbase(auio) - bp->nb_data, uio_resid(auio), 0xd00dee01); bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio)); @@ -2370,13 +2859,16 @@ again: FSDBG(516, bp, start, end - start, 0xd00dee00); bzero(bp->nb_data + start, end - start); error = 0; - } else if (!ISSET(bp->nb_flags, NB_NOCACHE)) { + } else { /* now we'll read the (rest of the) data */ uio_reset(auio, boff + start, UIO_SYSSPACE, UIO_READ); uio_addiov(auio, CAST_USER_ADDR_T(bp->nb_data + start), end - start); error = nfs_read_rpc(np, auio, ctx); - if (error) /* couldn't read the data, so treat buffer as NOCACHE */ + if (error) { + /* couldn't read the data, so treat buffer as synchronous NOCACHE */ SET(bp->nb_flags, (NB_NOCACHE|NB_STABLE)); + goto skipread; + } if (uio_resid(auio) > 0) { FSDBG(516, bp, (caddr_t)uio_curriovbase(auio) - bp->nb_data, uio_resid(auio), 0xd00dee02); bzero(CAST_DOWN(caddr_t, uio_curriovbase(auio)), uio_resid(auio)); @@ -2400,6 +2892,7 @@ again: /* Note: pages being written to will be validated when written */ } } +skipread: if (ISSET(bp->nb_flags, NB_ERROR)) { error = bp->nb_error; @@ -2554,6 +3047,10 @@ nfs_write_rpc2( while (tsiz > 0) { len = (tsiz > nmwsize) ? 
nmwsize : tsiz; FSDBG(537, np, uio_offset(uio), len, 0); + if (np->n_flag & NREVOKE) { + error = EIO; + break; + } if (nmp->nm_vers >= NFS_VER4) stategenid = nmp->nm_stategenid; error = nmp->nm_funcs->nf_write_rpc_async(np, uio, len, thd, cred, *iomodep, NULL, &req); @@ -2565,16 +3062,19 @@ nfs_write_rpc2( if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error) && (++restart <= nfs_mount_state_max_restarts(nmp))) { /* guard against no progress */ lck_mtx_lock(&nmp->nm_lock); - if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid) && !(nmp->nm_state & NFSSTA_RECOVER)) { - printf("nfs_write_rpc: error %d, initiating recovery\n", error); - nmp->nm_state |= NFSSTA_RECOVER; - nfs_mount_sock_thread_wake(nmp); + if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid)) { + NP(np, "nfs_write_rpc: error %d, initiating recovery", error); + nfs_need_recover(nmp, error); } lck_mtx_unlock(&nmp->nm_lock); - if (error == NFSERR_GRACE) - tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); - if (!(error = nfs_mount_state_wait_for_recovery(nmp))) - continue; + if (np->n_flag & NREVOKE) { + error = EIO; + } else { + if (error == NFSERR_GRACE) + tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); + if (!(error = nfs_mount_state_wait_for_recovery(nmp))) + continue; + } } if (error) break; @@ -2635,6 +3135,7 @@ nfs3_write_rpc_async( struct nfsreq **reqp) { struct nfsmount *nmp; + mount_t mp; int error = 0, nfsvers; struct nfsm_chain nmreq; @@ -2643,6 +3144,11 @@ nfs3_write_rpc_async( return (ENXIO); nfsvers = nmp->nm_vers; + /* for async mounts, don't bother sending sync write requests */ + if ((iomode != NFS_WRITE_UNSTABLE) && nfs_allow_async && + ((mp = NFSTOMP(np))) && (vfs_flags(mp) & MNT_ASYNC)) + iomode = NFS_WRITE_UNSTABLE; + nfsm_chain_null(&nmreq); nfsm_chain_build_alloc_init(error, &nmreq, NFSX_FH(nfsvers) + 5 * NFSX_UNSIGNED + nfsm_rndup(len)); @@ -2661,7 +3167,7 @@ nfs3_write_rpc_async( error = nfsm_chain_add_uio(&nmreq, uio, len); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfs_request_async(np, NULL, &nmreq, NFSPROC_WRITE, thd, cred, cb, reqp); + error = nfs_request_async(np, NULL, &nmreq, NFSPROC_WRITE, thd, cred, NULL, 0, cb, reqp); nfsmout: nfsm_chain_cleanup(&nmreq); return (error); @@ -2727,7 +3233,7 @@ nfs3_write_rpc_async_finish( } else { if (!error) error = status; - nfsm_chain_loadattr(error, &nmrep, np, nfsvers, NULL, &xid); + nfsm_chain_loadattr(error, &nmrep, np, nfsvers, &xid); nfsmout_if(error); } if (updatemtime) @@ -2769,7 +3275,7 @@ nfs3_vnop_mknod( nfsnode_t np = NULL; struct nfsmount *nmp; nfsnode_t dnp = VTONFS(dvp); - struct nfs_vattr nvattr, dnvattr; + struct nfs_vattr nvattr; fhandle_t fh; int error = 0, lockerror = ENOENT, busyerror = ENOENT, status, wccpostattr = 0; struct timespec premtime = { 0, 0 }; @@ -2777,6 +3283,7 @@ nfs3_vnop_mknod( u_int64_t xid, dxid; int nfsvers, gotuid, gotgid; struct nfsm_chain nmreq, nmrep; + struct nfsreq rq, *req = &rq; nmp = VTONMP(dvp); if (!nmp) @@ -2797,6 +3304,8 @@ nfs3_vnop_mknod( if ((nfsvers == NFS_VER2) && (cnp->cn_namelen > NFS_MAXNAMLEN)) return (ENAMETOOLONG); + nfs_avoid_needless_id_setting_on_create(dnp, vap, ctx); + VATTR_SET_SUPPORTED(vap, va_mode); VATTR_SET_SUPPORTED(vap, va_uid); VATTR_SET_SUPPORTED(vap, va_gid); @@ -2813,7 +3322,7 @@ nfs3_vnop_mknod( NFSX_FH(nfsvers) + 4 * NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(nfsvers)); nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); - nfsm_chain_add_string(error, &nmreq, 
cnp->cn_nameptr, cnp->cn_namelen); + nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); if (nfsvers == NFS_VER3) { nfsm_chain_add_32(error, &nmreq, vtonfs_type(vap->va_type, nfsvers)); nfsm_chain_add_v3sattr(error, &nmreq, vap); @@ -2829,7 +3338,10 @@ nfs3_vnop_mknod( error = busyerror = nfs_node_set_busy(dnp, vfs_context_thread(ctx)); nfsmout_if(error); - error = nfs_request(dnp, NULL, &nmreq, NFSPROC_MKNOD, ctx, &nmrep, &xid, &status); + error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_MKNOD, + vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, &req); + if (!error) + error = nfs_request_async_finish(req, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(dnp))) error = lockerror; @@ -2857,11 +3369,11 @@ nfsmout: NFS_CHANGED_UPDATE_NC(nfsvers, dnp, &dnp->n_vattr); nfs_node_unlock(dnp); /* nfs_getattr() will check changed and purge caches */ - nfs_getattr(dnp, &dnvattr, ctx, wccpostattr ? NGA_CACHED : NGA_UNCACHED); + nfs_getattr(dnp, NULL, ctx, wccpostattr ? NGA_CACHED : NGA_UNCACHED); } if (!error && fh.fh_len) - error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, NG_MAKEENTRY, &np); + error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, NG_MAKEENTRY, &np); if (!error && !np) error = nfs_lookitup(dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx, &np); if (!error && np) @@ -2870,7 +3382,7 @@ nfsmout: nfs_node_clear_busy(dnp); if (!error && (gotuid || gotgid) && - (!newvp || nfs_getattrcache(np, &nvattr) || + (!newvp || nfs_getattrcache(np, &nvattr, 0) || (gotuid && (nvattr.nva_uid != vap->va_uid)) || (gotgid && (nvattr.nva_gid != vap->va_gid)))) { /* clear ID bits if server didn't use them (or we can't tell) */ @@ -2908,7 +3420,7 @@ nfs3_vnop_create( vnode_t dvp = ap->a_dvp; struct vnode_attr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; - struct nfs_vattr nvattr, dnvattr; + struct nfs_vattr nvattr; fhandle_t fh; nfsnode_t np = NULL; struct nfsmount *nmp; @@ -2920,7 +3432,7 @@ nfs3_vnop_create( u_int64_t xid, dxid; uint32_t val; struct nfsm_chain nmreq, nmrep; - struct nfsreq *req; + struct nfsreq rq, *req = &rq; struct nfs_dulookup dul; nmp = VTONMP(dvp); @@ -2931,6 +3443,8 @@ nfs3_vnop_create( if ((nfsvers == NFS_VER2) && (cnp->cn_namelen > NFS_MAXNAMLEN)) return (ENAMETOOLONG); + nfs_avoid_needless_id_setting_on_create(dnp, vap, ctx); + VATTR_SET_SUPPORTED(vap, va_mode); VATTR_SET_SUPPORTED(vap, va_uid); VATTR_SET_SUPPORTED(vap, va_gid); @@ -2940,11 +3454,13 @@ nfs3_vnop_create( gotuid = VATTR_IS_ACTIVE(vap, va_uid); gotgid = VATTR_IS_ACTIVE(vap, va_gid); - if (vap->va_vaflags & VA_EXCLUSIVE) + if (vap->va_vaflags & VA_EXCLUSIVE) { fmode |= O_EXCL; + if (!VATTR_IS_ACTIVE(vap, va_access_time) || !VATTR_IS_ACTIVE(vap, va_modify_time)) + vap->va_vaflags |= VA_UTIMES_NULL; + } again: - req = NULL; error = busyerror = nfs_node_set_busy(dnp, vfs_context_thread(ctx)); nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); @@ -2955,7 +3471,7 @@ again: NFSX_FH(nfsvers) + 2 * NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(nfsvers)); nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); - nfsm_chain_add_string(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen); + nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); if (nfsvers == NFS_VER3) { if (fmode & O_EXCL) { nfsm_chain_add_32(error, &nmreq, NFS_CREATE_EXCLUSIVE); @@ -2979,7 +3495,7 @@ again: nfsmout_if(error); error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_CREATE, 
- vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, &req); + vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, &req); if (!error) { nfs_dulookup_start(&dul, dnp, ctx); error = nfs_request_async_finish(req, &nmrep, &xid, &status); @@ -3010,11 +3526,11 @@ nfsmout: NFS_CHANGED_UPDATE_NC(nfsvers, dnp, &dnp->n_vattr); nfs_node_unlock(dnp); /* nfs_getattr() will check changed and purge caches */ - nfs_getattr(dnp, &dnvattr, ctx, wccpostattr ? NGA_CACHED : NGA_UNCACHED); + nfs_getattr(dnp, NULL, ctx, wccpostattr ? NGA_CACHED : NGA_UNCACHED); } if (!error && fh.fh_len) - error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, NG_MAKEENTRY, &np); + error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, NG_MAKEENTRY, &np); if (!error && !np) error = nfs_lookitup(dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx, &np); if (!error && np) @@ -3051,7 +3567,7 @@ nfsmout: if (!error) *ap->a_vpp = newvp; if (!error && (gotuid || gotgid) && - (!newvp || nfs_getattrcache(np, &nvattr) || + (!newvp || nfs_getattrcache(np, &nvattr, 0) || (gotuid && (nvattr.nva_uid != vap->va_uid)) || (gotgid && (nvattr.nva_gid != vap->va_gid)))) { /* clear ID bits if server didn't use them (or we can't tell) */ @@ -3091,7 +3607,7 @@ nfs_vnop_remove( struct componentname *cnp = ap->a_cnp; nfsnode_t dnp = VTONFS(dvp); nfsnode_t np = VTONFS(vp); - int error = 0, nfsvers, inuse, gotattr = 0, flushed = 0, setsize = 0; + int error = 0, nfsvers, namedattrs, inuse, gotattr = 0, flushed = 0, setsize = 0; struct nfs_vattr nvattr; struct nfsmount *nmp; struct nfs_dulookup dul; @@ -3102,6 +3618,7 @@ nfs_vnop_remove( if (!nmp) return (ENXIO); nfsvers = nmp->nm_vers; + namedattrs = (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR); again_relock: error = nfs_node_set_busy2(dnp, np, vfs_context_thread(ctx)); @@ -3117,7 +3634,8 @@ again_relock: np->n_hflag |= NHLOCKED; lck_mtx_unlock(nfs_node_hash_mutex); - nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); + if (!namedattrs) + nfs_dulookup_init(&dul, dnp, cnp->cn_nameptr, cnp->cn_namelen, ctx); again: inuse = vnode_isinuse(vp, 0); if ((ap->a_flags & VNODE_REMOVE_NODELETEBUSY) && inuse) { @@ -3152,16 +3670,13 @@ again: nfs_node_unlock(np); return (error); } + if (!namedattrs) + nfs_dulookup_finish(&dul, dnp, ctx); goto again_relock; } - if ((nmp->nm_vers >= NFS_VER4) && (np->n_openflags & N_DELEG_MASK)) { - lck_mtx_lock(&np->n_openlock); - np->n_openflags &= ~N_DELEG_MASK; - lck_mtx_unlock(&np->n_openlock); - nfs4_delegreturn_rpc(nmp, np->n_fhp, np->n_fhsize, &np->n_dstateid, - vfs_context_thread(ctx), vfs_context_ucred(ctx)); - } + if ((nmp->nm_vers >= NFS_VER4) && (np->n_openflags & N_DELEG_MASK)) + nfs4_delegation_return(np, 0, vfs_context_thread(ctx), vfs_context_ucred(ctx)); /* * Purge the name cache so that the chance of a lookup for @@ -3170,7 +3685,8 @@ again: */ nfs_name_cache_purge(dnp, np, cnp, ctx); - nfs_dulookup_start(&dul, dnp, ctx); + if (!namedattrs) + nfs_dulookup_start(&dul, dnp, ctx); /* Do the rpc */ error = nmp->nm_funcs->nf_remove_rpc(dnp, cnp->cn_nameptr, cnp->cn_namelen, @@ -3213,7 +3729,8 @@ again: nfs_node_unlock(np); } } else if (!np->n_sillyrename) { - nfs_dulookup_start(&dul, dnp, ctx); + if (!namedattrs) + nfs_dulookup_start(&dul, dnp, ctx); error = nfs_sillyrename(dnp, np, cnp, ctx); nfs_node_lock_force(np); NATTRINVALIDATE(np); @@ -3222,12 +3739,14 @@ again: nfs_node_lock_force(np); NATTRINVALIDATE(np); nfs_node_unlock(np); - nfs_dulookup_start(&dul, dnp, ctx); + if 
(!namedattrs) + nfs_dulookup_start(&dul, dnp, ctx); } /* nfs_getattr() will check changed and purge caches */ - nfs_getattr(dnp, &nvattr, ctx, NGA_CACHED); - nfs_dulookup_finish(&dul, dnp, ctx); + nfs_getattr(dnp, NULL, ctx, NGA_CACHED); + if (!namedattrs) + nfs_dulookup_finish(&dul, dnp, ctx); out: /* unlock the node */ lck_mtx_lock(nfs_node_hash_mutex); @@ -3286,11 +3805,11 @@ nfs3_remove_rpc( nfsm_chain_build_alloc_init(error, &nmreq, NFSX_FH(nfsvers) + NFSX_UNSIGNED + nfsm_rndup(namelen)); nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); - nfsm_chain_add_string(error, &nmreq, name, namelen); + nfsm_chain_add_name(error, &nmreq, name, namelen, nmp); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfs_request2(dnp, NULL, &nmreq, NFSPROC_REMOVE, thd, cred, 0, &nmrep, &xid, &status); + error = nfs_request2(dnp, NULL, &nmreq, NFSPROC_REMOVE, thd, cred, NULL, 0, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(dnp))) error = lockerror; @@ -3398,11 +3917,7 @@ nfs_vnop_rename( tvp = NULL; } } else if (tvp && (nmp->nm_vers >= NFS_VER4) && (tnp->n_openflags & N_DELEG_MASK)) { - lck_mtx_lock(&tnp->n_openlock); - tnp->n_openflags &= ~N_DELEG_MASK; - lck_mtx_unlock(&tnp->n_openlock); - nfs4_delegreturn_rpc(nmp, tnp->n_fhp, tnp->n_fhsize, &tnp->n_dstateid, - vfs_context_thread(ctx), vfs_context_ucred(ctx)); + nfs4_delegation_return(tnp, 0, vfs_context_thread(ctx), vfs_context_ucred(ctx)); } error = nmp->nm_funcs->nf_rename_rpc(fdnp, fcnp->cn_nameptr, fcnp->cn_namelen, @@ -3417,7 +3932,7 @@ nfs_vnop_rename( if (tvp && (tvp != fvp) && !tnp->n_sillyrename) { nfs_node_lock_force(tnp); tvprecycle = (!error && !vnode_isinuse(tvp, 0) && - (nfs_getattrcache(tnp, &nvattr) || (nvattr.nva_nlink == 1))); + (nfs_getattrcache(tnp, &nvattr, 0) || (nvattr.nva_nlink == 1))); nfs_node_unlock(tnp); lck_mtx_lock(nfs_node_hash_mutex); if (tvprecycle && (tnp->n_hflag & NHHASHED)) { @@ -3474,8 +3989,8 @@ nfs_vnop_rename( } out: /* nfs_getattr() will check changed and purge caches */ - nfs_getattr(fdnp, &nvattr, ctx, NGA_CACHED); - nfs_getattr(tdnp, &nvattr, ctx, NGA_CACHED); + nfs_getattr(fdnp, NULL, ctx, NGA_CACHED); + nfs_getattr(tdnp, NULL, ctx, NGA_CACHED); if (locked) { /* unlock node */ lck_mtx_lock(nfs_node_hash_mutex); @@ -3525,13 +4040,13 @@ nfs3_rename_rpc( (NFSX_FH(nfsvers) + NFSX_UNSIGNED) * 2 + nfsm_rndup(fnamelen) + nfsm_rndup(tnamelen)); nfsm_chain_add_fh(error, &nmreq, nfsvers, fdnp->n_fhp, fdnp->n_fhsize); - nfsm_chain_add_string(error, &nmreq, fnameptr, fnamelen); + nfsm_chain_add_name(error, &nmreq, fnameptr, fnamelen, nmp); nfsm_chain_add_fh(error, &nmreq, nfsvers, tdnp->n_fhp, tdnp->n_fhsize); - nfsm_chain_add_string(error, &nmreq, tnameptr, tnamelen); + nfsm_chain_add_name(error, &nmreq, tnameptr, tnamelen, nmp); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfs_request(fdnp, NULL, &nmreq, NFSPROC_RENAME, ctx, &nmrep, &xid, &status); + error = nfs_request(fdnp, NULL, &nmreq, NFSPROC_RENAME, ctx, NULL, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock2(fdnp, tdnp))) error = lockerror; @@ -3617,11 +4132,10 @@ nfs3_vnop_link( NFSX_FH(nfsvers)*2 + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen)); nfsm_chain_add_fh(error, &nmreq, nfsvers, np->n_fhp, np->n_fhsize); nfsm_chain_add_fh(error, &nmreq, nfsvers, tdnp->n_fhp, tdnp->n_fhsize); - nfsm_chain_add_string(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen); + nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); 
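/*
 * Aside on a pattern repeated throughout this patch: the request
 * primitives nfs_request()/nfs_request2()/nfs_request_async() each grow
 * extra arguments -- passed as NULL/0 at every v2/v3 call site here, and
 * presumably the per-request security-info hooks suggested by the new
 * nfsm_rpchead2() prototype further below.  The create-style ops
 * (mknod/create/symlink/mkdir/rmdir) also switch from a heap-allocated
 * "struct nfsreq *req" to a caller-owned one, so the auth flavor the
 * request was actually sent with can be handed to nfs_nget() afterwards.
 * A minimal sketch of that shape (names as used in this patch, error
 * handling elided):
 *
 *	struct nfsreq rq, *req = &rq;		// caller-owned request
 *	error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_CREATE,
 *	    vfs_context_thread(ctx), vfs_context_ucred(ctx),
 *	    NULL, 0, NULL, &req);		// new NULL/0 argument slots
 *	if (!error)
 *		error = nfs_request_async_finish(req, &nmrep, &xid, &status);
 *	...
 *	error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len,
 *	    &nvattr, &xid, rq.r_auth, NG_MAKEENTRY, &np);	// note rq.r_auth
 */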
- error = nfs_request(np, NULL, &nmreq, NFSPROC_LINK, ctx, - &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC_LINK, ctx, NULL, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock2(tdnp, np))) { error = lockerror; @@ -3680,7 +4194,7 @@ nfs3_vnop_symlink( vnode_t dvp = ap->a_dvp; struct vnode_attr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; - struct nfs_vattr nvattr, dnvattr; + struct nfs_vattr nvattr; fhandle_t fh; int slen, error = 0, lockerror = ENOENT, busyerror = ENOENT, status, wccpostattr = 0; struct timespec premtime = { 0, 0 }; @@ -3691,7 +4205,7 @@ nfs3_vnop_symlink( nfsnode_t dnp = VTONFS(dvp); struct nfsmount *nmp; struct nfsm_chain nmreq, nmrep; - struct nfsreq *req = NULL; + struct nfsreq rq, *req = &rq; struct nfs_dulookup dul; nmp = VTONMP(dvp); @@ -3704,6 +4218,8 @@ nfs3_vnop_symlink( ((cnp->cn_namelen > NFS_MAXNAMLEN) || (slen > NFS_MAXPATHLEN))) return (ENAMETOOLONG); + nfs_avoid_needless_id_setting_on_create(dnp, vap, ctx); + VATTR_SET_SUPPORTED(vap, va_mode); VATTR_SET_SUPPORTED(vap, va_uid); VATTR_SET_SUPPORTED(vap, va_gid); @@ -3723,17 +4239,17 @@ nfs3_vnop_symlink( NFSX_FH(nfsvers) + 2 * NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen) + nfsm_rndup(slen) + NFSX_SATTR(nfsvers)); nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); - nfsm_chain_add_string(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen); + nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); if (nfsvers == NFS_VER3) nfsm_chain_add_v3sattr(error, &nmreq, vap); - nfsm_chain_add_string(error, &nmreq, ap->a_target, slen); + nfsm_chain_add_name(error, &nmreq, ap->a_target, slen, nmp); if (nfsvers == NFS_VER2) nfsm_chain_add_v2sattr(error, &nmreq, vap, -1); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_SYMLINK, - vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, &req); + vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, &req); if (!error) { nfs_dulookup_start(&dul, dnp, ctx); error = nfs_request_async_finish(req, &nmrep, &xid, &status); @@ -3767,11 +4283,11 @@ nfsmout: NFS_CHANGED_UPDATE_NC(nfsvers, dnp, &dnp->n_vattr); nfs_node_unlock(dnp); /* nfs_getattr() will check changed and purge caches */ - nfs_getattr(dnp, &dnvattr, ctx, wccpostattr ? NGA_CACHED : NGA_UNCACHED); + nfs_getattr(dnp, NULL, ctx, wccpostattr ? 
NGA_CACHED : NGA_UNCACHED); } if (!error && fh.fh_len) - error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, NG_MAKEENTRY, &np); + error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, NG_MAKEENTRY, &np); if (!error && np) newvp = NFSTOV(np); @@ -3797,7 +4313,7 @@ nfsmout: if (!busyerror) nfs_node_clear_busy(dnp); if (!error && (gotuid || gotgid) && - (!newvp || nfs_getattrcache(np, &nvattr) || + (!newvp || nfs_getattrcache(np, &nvattr, 0) || (gotuid && (nvattr.nva_uid != vap->va_uid)) || (gotgid && (nvattr.nva_gid != vap->va_gid)))) { /* clear ID bits if server didn't use them (or we can't tell) */ @@ -3834,7 +4350,7 @@ nfs3_vnop_mkdir( vnode_t dvp = ap->a_dvp; struct vnode_attr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; - struct nfs_vattr nvattr, dnvattr; + struct nfs_vattr nvattr; nfsnode_t np = NULL; struct nfsmount *nmp; nfsnode_t dnp = VTONFS(dvp); @@ -3845,7 +4361,7 @@ nfs3_vnop_mkdir( u_int64_t xid, dxid; fhandle_t fh; struct nfsm_chain nmreq, nmrep; - struct nfsreq *req = NULL; + struct nfsreq rq, *req = &rq; struct nfs_dulookup dul; nmp = VTONMP(dvp); @@ -3855,6 +4371,8 @@ nfs3_vnop_mkdir( if ((nfsvers == NFS_VER2) && (cnp->cn_namelen > NFS_MAXNAMLEN)) return (ENAMETOOLONG); + nfs_avoid_needless_id_setting_on_create(dnp, vap, ctx); + VATTR_SET_SUPPORTED(vap, va_mode); VATTR_SET_SUPPORTED(vap, va_uid); VATTR_SET_SUPPORTED(vap, va_gid); @@ -3874,7 +4392,7 @@ nfs3_vnop_mkdir( NFSX_FH(nfsvers) + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen) + NFSX_SATTR(nfsvers)); nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); - nfsm_chain_add_string(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen); + nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); if (nfsvers == NFS_VER3) nfsm_chain_add_v3sattr(error, &nmreq, vap); else @@ -3883,7 +4401,7 @@ nfs3_vnop_mkdir( nfsmout_if(error); error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_MKDIR, - vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, &req); + vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, &req); if (!error) { nfs_dulookup_start(&dul, dnp, ctx); error = nfs_request_async_finish(req, &nmrep, &xid, &status); @@ -3914,11 +4432,11 @@ nfsmout: NFS_CHANGED_UPDATE_NC(nfsvers, dnp, &dnp->n_vattr); nfs_node_unlock(dnp); /* nfs_getattr() will check changed and purge caches */ - nfs_getattr(dnp, &dnvattr, ctx, wccpostattr ? NGA_CACHED : NGA_UNCACHED); + nfs_getattr(dnp, NULL, ctx, wccpostattr ? 
NGA_CACHED : NGA_UNCACHED); } if (!error && fh.fh_len) - error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, NG_MAKEENTRY, &np); + error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, &nvattr, &xid, rq.r_auth, NG_MAKEENTRY, &np); if (!error && np) newvp = NFSTOV(np); @@ -3944,7 +4462,7 @@ nfsmout: if (!busyerror) nfs_node_clear_busy(dnp); if (!error && (gotuid || gotgid) && - (!newvp || nfs_getattrcache(np, &nvattr) || + (!newvp || nfs_getattrcache(np, &nvattr, 0) || (gotuid && (nvattr.nva_uid != vap->va_uid)) || (gotgid && (nvattr.nva_gid != vap->va_gid)))) { /* clear ID bits if server didn't use them (or we can't tell) */ @@ -3985,11 +4503,10 @@ nfs3_vnop_rmdir( struct nfsmount *nmp; nfsnode_t np = VTONFS(vp); nfsnode_t dnp = VTONFS(dvp); - struct nfs_vattr dnvattr; int nfsvers; u_int64_t xid; struct nfsm_chain nmreq, nmrep; - struct nfsreq *req = NULL; + struct nfsreq rq, *req = &rq; struct nfs_dulookup dul; nmp = VTONMP(vp); @@ -4010,12 +4527,12 @@ nfs3_vnop_rmdir( nfsm_chain_build_alloc_init(error, &nmreq, NFSX_FH(nfsvers) + NFSX_UNSIGNED + nfsm_rndup(cnp->cn_namelen)); nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); - nfsm_chain_add_string(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen); + nfsm_chain_add_name(error, &nmreq, cnp->cn_nameptr, cnp->cn_namelen, nmp); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_RMDIR, - vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, &req); + vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, &req); if (!error) { nfs_dulookup_start(&dul, dnp, ctx); error = nfs_request_async_finish(req, &nmrep, &xid, &status); @@ -4039,7 +4556,7 @@ nfsmout: nfs_node_unlock(dnp); nfs_name_cache_purge(dnp, np, cnp, ctx); /* nfs_getattr() will check changed and purge caches */ - nfs_getattr(dnp, &dnvattr, ctx, wccpostattr ? NGA_CACHED : NGA_UNCACHED); + nfs_getattr(dnp, NULL, ctx, wccpostattr ? 
NGA_CACHED : NGA_UNCACHED); } nfs_dulookup_finish(&dul, dnp, ctx); nfs_node_clear_busy2(dnp, np); @@ -4106,7 +4623,6 @@ nfs_vnop_readdir( struct nfsmount *nmp; uio_t uio = ap->a_uio; int error, nfsvers, extended, numdirent, bigcookies, ptc, done; - struct nfs_vattr nvattr; uint16_t i, iptc, rlen, nlen; uint64_t cookie, nextcookie, lbn = 0; struct nfsbuf *bp = NULL; @@ -4132,6 +4648,11 @@ nfs_vnop_readdir( if (uio_resid(uio) == 0) return (0); + if ((nfsvers >= NFS_VER4) && (dnp->n_vattr.nva_flags & NFS_FFLAG_TRIGGER)) { + /* trigger directories should never be read, return nothing */ + return (0); + } + thd = vfs_context_thread(ctx); numdirent = done = 0; nextcookie = uio_offset(uio); @@ -4164,7 +4685,7 @@ nfs_vnop_readdir( nfs_node_unlock(dnp); } /* nfs_getattr() will check changed and purge caches */ - if ((error = nfs_getattr(dnp, &nvattr, ctx, NGA_UNCACHED))) + if ((error = nfs_getattr(dnp, NULL, ctx, NGA_UNCACHED))) goto out; } else { nfs_node_unlock(dnp); @@ -4412,7 +4933,8 @@ int nfs_dir_cookie_to_lbn(nfsnode_t dnp, uint64_t cookie, int *ptc, uint64_t *lbnp) { struct nfsdmap *ndcc = dnp->n_cookiecache; - int8_t i, eofptc, iptc, found; + int8_t eofptc, found; + int i, iptc; struct nfsmount *nmp; struct nfsbuf *bp, *lastbp; struct nfsbuflists blist; @@ -4586,7 +5108,7 @@ nfs_dir_buf_search( nvattrp = NFS_DIR_BUF_NVATTR(bp, i); if ((ndbhp->ndbh_ncgen != bp->nb_np->n_ncgen) || (fhp->fh_len == 0) || (nvattrp->nva_type == VNON) || (nvattrp->nva_fileid == 0)) { - /* entry is no longer valid */ + /* entry is not valid */ error = ENOENT; break; } @@ -4633,7 +5155,7 @@ nfs_dir_buf_cache_lookup(nfsnode_t dnp, nfsnode_t *npp, struct componentname *cn { nfsnode_t newnp; struct nfsmount *nmp; - int error = 0, slpflag, slptimeo, i, found = 0, count = 0; + int error = 0, i, found = 0, count = 0; u_int64_t xid; struct nfs_vattr nvattr; fhandle_t fh; @@ -4646,8 +5168,6 @@ nfs_dir_buf_cache_lookup(nfsnode_t dnp, nfsnode_t *npp, struct componentname *cn if (!(nmp = NFSTONMP(dnp))) return (ENXIO); - slpflag = (nmp->nm_flag & NFSMNT_INT) ? PCATCH : 0; - slptimeo = 0; if (!purge) *npp = NULL; @@ -4728,7 +5248,7 @@ done: if (!error && found && !purge) { error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, - &nvattr, &xid, NG_MAKEENTRY, &newnp); + &nvattr, &xid, dnp->n_auth, NG_MAKEENTRY, &newnp); if (error) return (error); newnp->n_attrstamp = attrstamp; @@ -4762,7 +5282,7 @@ nfs_name_cache_purge(nfsnode_t dnp, nfsnode_t np, struct componentname *cnp, vfs struct nfsmount *nmp = NFSTONMP(dnp); cache_purge(NFSTOV(np)); - if (nmp && (nmp->nm_vers > NFS_VER2) && (nmp->nm_flag & NFSMNT_RDIRPLUS)) + if (nmp && (nmp->nm_vers > NFS_VER2) && NMFLAG(nmp, RDIRPLUS)) nfs_dir_buf_cache_lookup(dnp, NULL, cnp, ctx, 1); } @@ -4794,7 +5314,7 @@ nfs3_readdir_rpc(nfsnode_t dnp, struct nfsbuf *bp, vfs_context_t ctx) nmrsize = nmp->nm_rsize; bigcookies = nmp->nm_state & NFSSTA_BIGCOOKIES; noplus: - rdirplus = ((nfsvers > NFS_VER2) && (nmp->nm_flag & NFSMNT_RDIRPLUS)) ? 1 : 0; + rdirplus = ((nfsvers > NFS_VER2) && NMFLAG(nmp, RDIRPLUS)) ? 1 : 0; if ((lockerror = nfs_node_lock(dnp))) return (lockerror); @@ -4843,7 +5363,7 @@ noplus: error = nfs_request(dnp, NULL, &nmreq, rdirplus ? NFSPROC_READDIRPLUS : NFSPROC_READDIR, - ctx, &nmrep, &xid, &status); + ctx, NULL, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(dnp))) error = lockerror; @@ -4864,7 +5384,7 @@ noplus: if (error == NFSERR_NOTSUPP) { /* oops... 
it doesn't look like readdirplus is supported */ lck_mtx_lock(&nmp->nm_lock); - nmp->nm_flag &= ~NFSMNT_RDIRPLUS; + NFS_BITMAP_CLR(nmp->nm_flags, NFS_MFLAG_RDIRPLUS); lck_mtx_unlock(&nmp->nm_lock); goto noplus; } @@ -5107,6 +5627,10 @@ nfs_sillyrename( /* now, do the rename */ error = nmp->nm_funcs->nf_rename_rpc(dnp, cnp->cn_nameptr, cnp->cn_namelen, dnp, nsp->nsr_name, nsp->nsr_namlen, ctx); + + /* Kludge: Map ENOENT => 0 assuming that it is a reply to a retry. */ + if (error == ENOENT) + error = 0; if (!error) { nfs_node_lock_force(dnp); if (dnp->n_flag & NNEGNCENTRIES) { @@ -5154,11 +5678,11 @@ nfs3_lookup_rpc_async( nfsm_chain_build_alloc_init(error, &nmreq, NFSX_FH(nfsvers) + NFSX_UNSIGNED + nfsm_rndup(namelen)); nfsm_chain_add_fh(error, &nmreq, nfsvers, dnp->n_fhp, dnp->n_fhsize); - nfsm_chain_add_string(error, &nmreq, name, namelen); + nfsm_chain_add_name(error, &nmreq, name, namelen, nmp); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); error = nfs_request_async(dnp, NULL, &nmreq, NFSPROC_LOOKUP, - vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, reqp); + vfs_context_thread(ctx), vfs_context_ucred(ctx), NULL, 0, NULL, reqp); nfsmout: nfsm_chain_cleanup(&nmreq); return (error); @@ -5167,6 +5691,8 @@ nfsmout: int nfs3_lookup_rpc_async_finish( nfsnode_t dnp, + __unused char *name, + __unused int namelen, vfs_context_t ctx, struct nfsreq *req, u_int64_t *xidp, @@ -5206,7 +5732,7 @@ nfs3_lookup_rpc_async_finish( nfsm_chain_postop_attr_get(error, &nmrep, attrflag, nvap); nfsm_chain_postop_attr_update(error, &nmrep, dnp, &xid); if (!error && !attrflag) - error = nfs3_getattr_rpc(NULL, NFSTOMP(dnp), fhp->fh_data, fhp->fh_len, ctx, nvap, xidp); + error = nfs3_getattr_rpc(NULL, NFSTOMP(dnp), fhp->fh_data, fhp->fh_len, 0, ctx, nvap, xidp); } else { error = nfs_parsefattr(&nmrep, nfsvers, nvap); } @@ -5249,6 +5775,8 @@ nfs_lookitup( (namelen > (int)nmp->nm_fsattr.nfsa_maxname)) return (ENAMETOOLONG); + NVATTR_INIT(&nvattr); + /* check for lookup of "." 
*/ if ((name[0] == '.') && (namelen == 1)) { /* skip lookup, we know who we are */ @@ -5259,7 +5787,7 @@ nfs_lookitup( error = nmp->nm_funcs->nf_lookup_rpc_async(dnp, name, namelen, ctx, &req); nfsmout_if(error); - error = nmp->nm_funcs->nf_lookup_rpc_async_finish(dnp, ctx, req, &xid, &fh, &nvattr); + error = nmp->nm_funcs->nf_lookup_rpc_async_finish(dnp, name, namelen, ctx, req, &xid, &fh, &nvattr); nfsmout_if(!npp || error); if (*npp) { @@ -5299,7 +5827,7 @@ nfs_lookitup( cnp->cn_nameptr = name; cnp->cn_namelen = namelen; error = nfs_nget(NFSTOMP(dnp), dnp, cnp, fh.fh_data, fh.fh_len, - &nvattr, &xid, NG_MAKEENTRY, &np); + &nvattr, &xid, rq.r_auth, NG_MAKEENTRY, &np); nfsmout_if(error); newnp = np; } @@ -5307,6 +5835,7 @@ nfs_lookitup( nfsmout: if (npp && !*npp && !error) *npp = newnp; + NVATTR_CLEANUP(&nvattr); return (error); } @@ -5319,11 +5848,14 @@ nfs_dulookup_init(struct nfs_dulookup *dulp, nfsnode_t dnp, const char *name, in { int error, du_namelen; vnode_t du_vp; + struct nfsmount *nmp = NFSTONMP(dnp); /* check for ._ file in name cache */ dulp->du_flags = 0; bzero(&dulp->du_cn, sizeof(dulp->du_cn)); du_namelen = namelen + 2; + if (!nmp || NMFLAG(nmp, NONEGNAMECACHE)) + return; if ((namelen >= 2) && (name[0] == '.') && (name[1] == '_')) return; if (du_namelen >= (int)sizeof(dulp->du_smallname)) @@ -5342,8 +5874,8 @@ nfs_dulookup_init(struct nfs_dulookup *dulp, nfsnode_t dnp, const char *name, in if (error == -1) { vnode_put(du_vp); } else if (!error) { - struct nfsmount *nmp = NFSTONMP(dnp); - if (nmp && (nmp->nm_vers > NFS_VER2) && (nmp->nm_flag & NFSMNT_RDIRPLUS)) { + nmp = NFSTONMP(dnp); + if (nmp && (nmp->nm_vers > NFS_VER2) && NMFLAG(nmp, RDIRPLUS)) { /* if rdirplus, try dir buf cache lookup */ nfsnode_t du_np = NULL; if (!nfs_dir_buf_cache_lookup(dnp, &du_np, &dulp->du_cn, ctx, 0) && du_np) { @@ -5367,7 +5899,7 @@ nfs_dulookup_start(struct nfs_dulookup *dulp, nfsnode_t dnp, vfs_context_t ctx) struct nfsmount *nmp = NFSTONMP(dnp); struct nfsreq *req = &dulp->du_req; - if (!nmp || !(dulp->du_flags & NFS_DULOOKUP_DOIT)) + if (!nmp || !(dulp->du_flags & NFS_DULOOKUP_DOIT) || (dulp->du_flags & NFS_DULOOKUP_INPROG)) return; if (!nmp->nm_funcs->nf_lookup_rpc_async(dnp, dulp->du_cn.cn_nameptr, dulp->du_cn.cn_namelen, ctx, &req)) @@ -5390,7 +5922,9 @@ nfs_dulookup_finish(struct nfs_dulookup *dulp, nfsnode_t dnp, vfs_context_t ctx) if (!nmp || !(dulp->du_flags & NFS_DULOOKUP_INPROG)) goto out; - error = nmp->nm_funcs->nf_lookup_rpc_async_finish(dnp, ctx, &dulp->du_req, &xid, &fh, &nvattr); + NVATTR_INIT(&nvattr); + error = nmp->nm_funcs->nf_lookup_rpc_async_finish(dnp, dulp->du_cn.cn_nameptr, + dulp->du_cn.cn_namelen, ctx, &dulp->du_req, &xid, &fh, &nvattr); dulp->du_flags &= ~NFS_DULOOKUP_INPROG; if (error == ENOENT) { /* add a negative entry in the name cache */ @@ -5400,12 +5934,13 @@ nfs_dulookup_finish(struct nfs_dulookup *dulp, nfsnode_t dnp, vfs_context_t ctx) nfs_node_unlock(dnp); } else if (!error) { error = nfs_nget(NFSTOMP(dnp), dnp, &dulp->du_cn, fh.fh_data, fh.fh_len, - &nvattr, &xid, NG_MAKEENTRY, &du_np); + &nvattr, &xid, dulp->du_req.r_auth, NG_MAKEENTRY, &du_np); if (!error) { nfs_node_unlock(du_np); vnode_put(NFSTOV(du_np)); } } + NVATTR_CLEANUP(&nvattr); out: if (dulp->du_flags & NFS_DULOOKUP_INPROG) nfs_request_async_cancel(&dulp->du_req); @@ -5420,14 +5955,15 @@ out: int nfs3_commit_rpc( nfsnode_t np, - u_int64_t offset, - u_int64_t count, - kauth_cred_t cred) + uint64_t offset, + uint64_t count, + kauth_cred_t cred, + uint64_t wverf) { struct nfsmount *nmp; int 
error = 0, lockerror, status, wccpostattr = 0, nfsvers; struct timespec premtime = { 0, 0 }; - u_int64_t xid, wverf; + u_int64_t xid, newwverf; uint32_t count32; struct nfsm_chain nmreq, nmrep; @@ -5454,7 +5990,7 @@ nfs3_commit_rpc( nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); error = nfs_request2(np, NULL, &nmreq, NFSPROC_COMMIT, - current_thread(), cred, 0, &nmrep, &xid, &status); + current_thread(), cred, NULL, 0, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; /* can we do anything useful with the wcc info? */ @@ -5463,13 +5999,13 @@ nfs3_commit_rpc( nfs_node_unlock(np); if (!error) error = status; - nfsm_chain_get_64(error, &nmrep, wverf); + nfsm_chain_get_64(error, &nmrep, newwverf); nfsmout_if(error); lck_mtx_lock(&nmp->nm_lock); - if (nmp->nm_verf != wverf) { - nmp->nm_verf = wverf; + if (nmp->nm_verf != newwverf) + nmp->nm_verf = newwverf; + if (wverf != newwverf) error = NFSERR_STALEWRITEVERF; - } lck_mtx_unlock(&nmp->nm_lock); nfsmout: nfsm_chain_cleanup(&nmreq); @@ -5494,23 +6030,6 @@ nfs_vnop_blockmap( return (ENOTSUP); } -/* - * Mmap a file - * - * NB Currently unsupported. - */ -/*ARGSUSED*/ -int -nfs_vnop_mmap( - __unused struct vnop_mmap_args /* { - struct vnodeop_desc *a_desc; - vnode_t a_vp; - int a_fflags; - vfs_context_t a_context; - } */ *ap) -{ - return (EINVAL); -} /* * fsync vnode op. Just call nfs_flush(). @@ -5556,8 +6075,7 @@ nfs3_pathconf_rpc( nfsm_chain_add_fh(error, &nmreq, nfsvers, np->n_fhp, np->n_fhsize); nfsm_chain_build_done(error, &nmreq); nfsmout_if(error); - error = nfs_request(np, NULL, &nmreq, NFSPROC_PATHCONF, ctx, - &nmrep, &xid, &status); + error = nfs_request(np, NULL, &nmreq, NFSPROC_PATHCONF, ctx, NULL, &nmrep, &xid, &status); if ((lockerror = nfs_node_lock(np))) error = lockerror; nfsm_chain_postop_attr_update(error, &nmrep, np, &xid); @@ -5653,6 +6171,12 @@ nfs_vnop_pathconf( return (0); } break; + case _PC_XATTR_SIZE_BITS: + /* Do we support xattrs natively? */ + if (nmp->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR) + break; /* Yes */ + /* No... so just return an error */ + /* FALLTHROUGH */ default: /* don't bother contacting the server if we know the answer */ return (EINVAL); @@ -5738,6 +6262,7 @@ nfs_vnop_pathconf( else error = EINVAL; break; + case _PC_XATTR_SIZE_BITS: /* same as file size bits if named attrs supported */ case _PC_FILESIZEBITS: if (!NFS_BITMAP_ISSET(nfsap->nfsa_bitmap, NFS_FATTR_MAXFILESIZE)) { *ap->a_retval = 64; @@ -6007,7 +6532,7 @@ nfsfifo_vnop_close( /*ARGSUSED*/ int nfs_vnop_ioctl( - __unused struct vnop_ioctl_args /* { + struct vnop_ioctl_args /* { struct vnodeop_desc *a_desc; vnode_t a_vp; u_int32_t a_command; @@ -6016,12 +6541,23 @@ nfs_vnop_ioctl( vfs_context_t a_context; } */ *ap) { + vfs_context_t ctx = ap->a_context; + vnode_t vp = ap->a_vp; + int error = ENOTTY; - /* - * XXX we were once bogusly enoictl() which returned this (ENOTTY). - * Probably we should return ENODEV. 
- */ - return (ENOTTY); + switch (ap->a_command) { + + case F_FULLFSYNC: + if (vnode_vfsisrdonly(vp)) + return (EROFS); + if (!VTONMP(vp)) + return (ENXIO); + error = nfs_flush(VTONFS(vp), MNT_WAIT, vfs_context_thread(ctx), 0); + break; + + } + + return (error); } /*ARGSUSED*/ @@ -6135,6 +6671,10 @@ tryagain: bzero(req, sizeof(req)); nextsend = nextwait = 0; do { + if (np->n_flag & NREVOKE) { + error = EIO; + break; + } /* send requests while we need to and have available slots */ while ((txsize > 0) && (req[nextsend] == NULL)) { iosize = MIN(nmrsize, txsize); @@ -6161,14 +6701,11 @@ tryagain: nextwait = (nextwait + 1) % MAXPAGINGREQS; if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error)) { lck_mtx_lock(&nmp->nm_lock); - if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid) && !(nmp->nm_state & NFSSTA_RECOVER)) { - printf("nfs_vnop_pagein: error %d, initiating recovery\n", error); - nmp->nm_state |= NFSSTA_RECOVER; - nfs_mount_sock_thread_wake(nmp); + if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid)) { + NP(np, "nfs_vnop_pagein: error %d, initiating recovery", error); + nfs_need_recover(nmp, error); } lck_mtx_unlock(&nmp->nm_lock); - if (error == NFSERR_GRACE) - tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); restart++; goto cancel; } @@ -6200,11 +6737,17 @@ cancel: req[nextwait] = NULL; nextwait = (nextwait + 1) % MAXPAGINGREQS; } - if (restart) { - if ((restart <= nfs_mount_state_max_restarts(nmp)) && /* guard against no progress */ - (!(error = nfs_mount_state_wait_for_recovery(nmp)))) - goto tryagain; - printf("nfs_pagein: too many restarts, aborting.\n"); + if (np->n_flag & NREVOKE) { + error = EIO; + } else if (restart) { + if (restart <= nfs_mount_state_max_restarts(nmp)) { /* guard against no progress */ + if (error == NFSERR_GRACE) + tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); + if (!(error = nfs_mount_state_wait_for_recovery(nmp))) + goto tryagain; + } else { + NP(np, "nfs_pagein: too many restarts, aborting"); + } } } @@ -6579,6 +7122,10 @@ tryagain: bzero(req, sizeof(req)); nextsend = nextwait = 0; do { + if (np->n_flag & NREVOKE) { + error = EIO; + break; + } /* send requests while we need to and have available slots */ while ((txsize > 0) && (req[nextsend] == NULL)) { iosize = MIN(nmwsize, txsize); @@ -6616,14 +7163,11 @@ tryagain: nfs_node_unlock(np); if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error)) { lck_mtx_lock(&nmp->nm_lock); - if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid) && !(nmp->nm_state & NFSSTA_RECOVER)) { - printf("nfs_vnop_pageout: error %d, initiating recovery\n", error); - nmp->nm_state |= NFSSTA_RECOVER; - nfs_mount_sock_thread_wake(nmp); + if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid)) { + NP(np, "nfs_vnop_pageout: error %d, initiating recovery", error); + nfs_need_recover(nmp, error); } lck_mtx_unlock(&nmp->nm_lock); - if (error == NFSERR_GRACE) - tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); restart = 1; goto cancel; } @@ -6654,16 +7198,13 @@ tryagain: iomode = NFS_WRITE_UNSTABLE; error = nfs_write_rpc2(np, auio, thd, cred, &iomode, &wverf2); if ((nmp->nm_vers >= NFS_VER4) && nfs_mount_state_error_should_restart(error)) { - printf("nfs_vnop_pageout: restart: error %d\n", error); + NP(np, "nfs_vnop_pageout: restart: error %d", error); lck_mtx_lock(&nmp->nm_lock); - if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid) && !(nmp->nm_state & NFSSTA_RECOVER)) { - printf("nfs_vnop_pageout: error %d, 
initiating recovery\n", error); - nmp->nm_state |= NFSSTA_RECOVER; - nfs_mount_sock_thread_wake(nmp); + if ((error != NFSERR_GRACE) && (stategenid == nmp->nm_stategenid)) { + NP(np, "nfs_vnop_pageout: error %d, initiating recovery", error); + nfs_need_recover(nmp, error); } lck_mtx_unlock(&nmp->nm_lock); - if (error == NFSERR_GRACE) - tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); restart = 1; goto cancel; } @@ -6690,7 +7231,7 @@ tryagain: vrestart = 0; if (!error && (commit != NFS_WRITE_FILESYNC)) { - error = nmp->nm_funcs->nf_commit_rpc(np, f_offset, xsize, cred); + error = nmp->nm_funcs->nf_commit_rpc(np, f_offset, xsize, cred, wverf); if (error == NFSERR_STALEWRITEVERF) { vrestart = 1; error = EIO; @@ -6709,18 +7250,26 @@ cancel: np->n_numoutput--; nfs_node_unlock(np); } - if (vrestart) { - if (++vrestarts <= 100) /* guard against no progress */ - goto tryagain; - printf("nfs_pageout: too many restarts, aborting.\n"); - FSDBG(323, f_offset, xsize, ERESTART, -1); - } - if (restart) { - if ((restarts <= nfs_mount_state_max_restarts(nmp)) && /* guard against no progress */ - (!(error = nfs_mount_state_wait_for_recovery(nmp)))) - goto tryagain; - printf("nfs_pageout: too many restarts, aborting.\n"); - FSDBG(323, f_offset, xsize, ERESTART, -1); + if (np->n_flag & NREVOKE) { + error = EIO; + } else { + if (vrestart) { + if (++vrestarts <= 100) /* guard against no progress */ + goto tryagain; + NP(np, "nfs_pageout: too many restarts, aborting"); + FSDBG(323, f_offset, xsize, ERESTART, -1); + } + if (restart) { + if (restarts <= nfs_mount_state_max_restarts(nmp)) { /* guard against no progress */ + if (error == NFSERR_GRACE) + tsleep(&nmp->nm_state, (PZERO-1), "nfsgrace", 2*hz); + if (!(error = nfs_mount_state_wait_for_recovery(nmp))) + goto tryagain; + } else { + NP(np, "nfs_pageout: too many restarts, aborting"); + FSDBG(323, f_offset, xsize, ERESTART, -1); + } + } } } @@ -6762,7 +7311,7 @@ cancel: abortflags = UPL_ABORT_DUMP_PAGES|UPL_ABORT_FREE_ON_EMPTY; if (error <= NFS_ELAST) { if ((errorcount[error] % 100) == 0) - printf("nfs_pageout: unexpected error %d. dumping vm page\n", error); + NP(np, "nfs_pageout: unexpected error %d. dumping vm page", error); errorcount[error]++; } break; @@ -6776,7 +7325,7 @@ cancel: break; case SEVER: /* not implemented */ default: - printf("nfs_pageout: action %d not expected\n", action); + NP(np, "nfs_pageout: action %d not expected", action); break; } @@ -6837,3 +7386,84 @@ nfs_vnop_offtoblk( return (0); } +/* + * vnode change monitoring + */ +int +nfs_vnop_monitor( + struct vnop_monitor_args /* { + struct vnodeop_desc *a_desc; + vnode_t a_vp; + uint32_t a_events; + uint32_t a_flags; + void *a_handle; + vfs_context_t a_context; + } */ *ap) +{ + nfsnode_t np = VTONFS(ap->a_vp); + struct nfsmount *nmp = VTONMP(ap->a_vp); + int error = 0; + + if (!nmp) + return (ENXIO); + + /* make sure that the vnode's monitoring status is up to date */ + lck_mtx_lock(&nmp->nm_lock); + if (vnode_ismonitored(ap->a_vp)) { + /* This vnode is currently being monitored, make sure we're tracking it. */ + if (np->n_monlink.le_next == NFSNOLIST) { + LIST_INSERT_HEAD(&nmp->nm_monlist, np, n_monlink); + nfs_mount_sock_thread_wake(nmp); + } + } else { + /* This vnode is no longer being monitored, make sure we're not tracking it. */ + /* Wait for any in-progress getattr to complete first. 
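+ * (The in-progress scan here is presumably the mount's socket thread
+ * walking nm_monlist and issuing getattrs to detect server-side changes;
+ * NMMONSCANINPROG marks that walk, and NMMONSCANWANT asks the scanner to
+ * wake anyone sleeping on n_mflag -- hence the msleep below.)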
*/ + while (np->n_mflag & NMMONSCANINPROG) { + struct timespec ts = { 1, 0 }; + np->n_mflag |= NMMONSCANWANT; + msleep(&np->n_mflag, &nmp->nm_lock, PZERO-1, "nfswaitmonscan", &ts); + } + if (np->n_monlink.le_next != NFSNOLIST) { + LIST_REMOVE(np, n_monlink); + np->n_monlink.le_next = NFSNOLIST; + } + } + lck_mtx_unlock(&nmp->nm_lock); + + return (error); +} + +/* + * Send a vnode notification for the given events. + */ +void +nfs_vnode_notify(nfsnode_t np, uint32_t events) +{ + struct nfsmount *nmp = NFSTONMP(np); + struct nfs_vattr nvattr; + struct vnode_attr vattr, *vap = NULL; + struct timeval now; + + microuptime(&now); + if ((np->n_evtstamp == now.tv_sec) || !nmp) { + /* delay sending this notify */ + np->n_events |= events; + return; + } + events |= np->n_events; + np->n_events = 0; + np->n_evtstamp = now.tv_sec; + + vfs_get_notify_attributes(&vattr); + if (!nfs_getattrcache(np, &nvattr, 0)) { + vap = &vattr; + VATTR_INIT(vap); + VATTR_RETURN(vap, va_fsid, vfs_statfs(nmp->nm_mountp)->f_fsid.val[0]); + VATTR_RETURN(vap, va_fileid, nvattr.nva_fileid); + VATTR_RETURN(vap, va_mode, nvattr.nva_mode); + VATTR_RETURN(vap, va_uid, nvattr.nva_uid); + VATTR_RETURN(vap, va_gid, nvattr.nva_gid); + VATTR_RETURN(vap, va_nlink, nvattr.nva_nlink); + } + vnode_notify(NFSTOV(np), events, vap); +} diff --git a/bsd/nfs/nfsm_subs.h b/bsd/nfs/nfsm_subs.h index 910636f85..434d4f57a 100644 --- a/bsd/nfs/nfsm_subs.h +++ b/bsd/nfs/nfsm_subs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -73,8 +73,8 @@ #ifdef __APPLE_API_PRIVATE -int nfsm_rpchead(struct nfsreq *, int, mbuf_t, u_int64_t *, mbuf_t *); -int nfsm_rpchead2(int, int, int, int, int, int, kauth_cred_t, struct nfsreq *, mbuf_t, u_int64_t *, mbuf_t *); +int nfsm_rpchead(struct nfsreq *, mbuf_t, u_int64_t *, mbuf_t *); +int nfsm_rpchead2(struct nfsmount *, int, int, int, int, int, kauth_cred_t, struct nfsreq *, mbuf_t, u_int64_t *, mbuf_t *); int nfsm_chain_new_mbuf(struct nfsm_chain *, size_t); int nfsm_chain_add_opaque_f(struct nfsm_chain *, const u_char *, uint32_t); @@ -83,6 +83,7 @@ int nfsm_chain_add_uio(struct nfsm_chain *, uio_t, uint32_t); int nfsm_chain_add_fattr4_f(struct nfsm_chain *, struct vnode_attr *, struct nfsmount *); int nfsm_chain_add_v2sattr_f(struct nfsm_chain *, struct vnode_attr *, uint32_t); int nfsm_chain_add_v3sattr_f(struct nfsm_chain *, struct vnode_attr *); +int nfsm_chain_add_string_nfc(struct nfsm_chain *, const uint8_t *, uint32_t); int nfsm_chain_advance(struct nfsm_chain *, uint32_t); int nfsm_chain_offset(struct nfsm_chain *); @@ -93,6 +94,7 @@ int nfsm_chain_get_uio(struct nfsm_chain *, uint32_t, uio_t); int nfsm_chain_get_fh_attr(struct nfsm_chain *, nfsnode_t, vfs_context_t, int, uint64_t *, fhandle_t *, struct nfs_vattr *); int nfsm_chain_get_wcc_data_f(struct nfsm_chain *, nfsnode_t, struct timespec *, int *, u_int64_t *); +int nfsm_chain_get_secinfo(struct nfsm_chain *, uint32_t *, int *); #if NFSSERVER void nfsm_adj(mbuf_t, int, int); @@ -339,6 +341,16 @@ int nfsm_chain_trim_data(struct nfsm_chain *, int, int *); nfsm_chain_add_opaque((E), (NMC), (STR), (LEN)); \ } while (0) +/* add a name to an mbuf chain */ +#define nfsm_chain_add_name(E, NMC, STR, LEN, NMP) \ + do { \ + if (E) break; \ + if (NMFLAG((NMP), NFC)) \ + (E) = nfsm_chain_add_string_nfc((NMC), (const uint8_t*)(STR), (LEN)); \ + else \ + nfsm_chain_add_string((E), (NMC), (STR), (LEN)); \ + } while (0) + /* 
add an NFSv2 time to an mbuf chain */ #define nfsm_chain_add_v2time(E, NMC, TVP) \ do { \ @@ -454,6 +466,36 @@ int nfsm_chain_trim_data(struct nfsm_chain *, int, int *); nfsm_chain_add_32((E), (NMC), ((B)[__i] & (MASK)[__i])); \ } while (0) +/* add NFSv4 attr bitmap masked with the supported attributes for this mount/node */ +#define nfsm_chain_add_bitmap_supported(E, NMC, B, NMP, NP) \ + do { \ + uint32_t __bitmap[NFS_ATTR_BITMAP_LEN], *__bmp = (B); \ + int __nonamedattr = 0, __noacl = 0, __nomode = 0; \ + if (!((NMP)->nm_fsattr.nfsa_flags & NFS_FSFLAG_NAMED_ATTR) || \ + ((NP) && (((nfsnode_t)(NP))->n_flag & (NISDOTZFS|NISDOTZFSCHILD)))) \ + __nonamedattr = 1; \ + if (!((NMP)->nm_fsattr.nfsa_flags & NFS_FSFLAG_ACL)) \ + __noacl = 1; \ + if (NMFLAG((NMP), ACLONLY)) \ + __nomode = 1; \ + if (__nonamedattr || __noacl || __nomode) { \ + /* don't ask for attrs we're not supporting */ \ + /* some ".zfs" directories can't handle being asked for some attributes */ \ + int __ii; \ + NFS_CLEAR_ATTRIBUTES(__bitmap); \ + for (__ii=0; __ii < NFS_ATTR_BITMAP_LEN; __ii++) \ + __bitmap[__ii] = (B)[__ii]; \ + if (__nonamedattr) \ + NFS_BITMAP_CLR(__bitmap, NFS_FATTR_NAMED_ATTR); \ + if (__noacl) \ + NFS_BITMAP_CLR(__bitmap, NFS_FATTR_ACL); \ + if (__nomode) \ + NFS_BITMAP_CLR(__bitmap, NFS_FATTR_MODE); \ + __bmp = __bitmap; \ + } \ + nfsm_chain_add_bitmap_masked((E), (NMC), __bmp, NFS_ATTR_BITMAP_LEN, (NMP)->nm_fsattr.nfsa_supp_attr); \ + } while (0) + /* Add an NFSv4 "stateid" structure to an mbuf chain */ #define nfsm_chain_add_stateid(E, NMC, SID) \ do { \ @@ -642,19 +684,18 @@ int nfsm_chain_trim_data(struct nfsm_chain *, int, int *); } while (0) /* update a node's attribute cache with attributes from an mbuf chain */ -#define nfsm_chain_loadattr(E, NMC, NP, VERS, A, X) \ +#define nfsm_chain_loadattr(E, NMC, NP, VERS, X) \ do { \ - struct nfs_vattr ttvattr, *ttnvap; \ + struct nfs_vattr ttvattr; \ if (E) break; \ - ttnvap = (A) ? (A) : &ttvattr; \ if ((VERS) == NFS_VER4) { \ - NFS_CLEAR_ATTRIBUTES(ttnvap->nva_bitmap); \ - (E) = nfs4_parsefattr((NMC), NULL, ttnvap, NULL, NULL); \ + (E) = nfs4_parsefattr((NMC), NULL, &ttvattr, NULL, NULL, NULL); \ } else { \ - (E) = nfs_parsefattr((NMC), (VERS), ttnvap); \ + (E) = nfs_parsefattr((NMC), (VERS), &ttvattr); \ } \ - if (E) break; \ - (E) = nfs_loadattrcache((NP), ttnvap, (X), 0); \ + if (!(E) && (NP)) \ + (E) = nfs_loadattrcache((NP), &ttvattr, (X), 0); \ + NVATTR_CLEANUP(&ttvattr); \ } while (0) /* get NFSv4 attr bitmap */ @@ -693,7 +734,8 @@ int nfsm_chain_trim_data(struct nfsm_chain *, int, int *); do { \ uint32_t __val = 0; \ nfsm_chain_get_32((E), (NMC), __val); \ - nfsm_assert((E), (__val == (OP)), EBADRPC); \ + /* [sigh] some implementations return the "illegal" op for unsupported ops */ \ + nfsm_assert((E), ((__val == (OP)) || (__val == NFS_OP_ILLEGAL)), EBADRPC); \ nfsm_chain_get_32((E), (NMC), __val); \ nfsm_assert((E), (__val == NFS_OK), __val); \ } while (0) @@ -705,7 +747,7 @@ int nfsm_chain_trim_data(struct nfsm_chain *, int, int *); nfsm_chain_get_32((E), (NMC), __ci_atomic); \ nfsm_chain_get_64((E), (NMC), __ci_before); \ nfsm_chain_get_64((E), (NMC), __ci_after); \ - if (E) break; \ + if ((E) || !(DNP)) break; \ if (__ci_atomic && (__ci_before == (DNP)->n_ncchange)) { \ (DNP)->n_ncchange = __ci_after; \ } else { \ diff --git a/bsd/nfs/nfsmount.h b/bsd/nfs/nfsmount.h index 742c166c5..97f955e2f 100644 --- a/bsd/nfs/nfsmount.h +++ b/bsd/nfs/nfsmount.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. 
+ * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -105,29 +105,130 @@ struct nfs_fsattr { #define NFS_FSFLAG_CHOWN_RESTRICTED 0x00000080 #define NFS_FSFLAG_HOMOGENEOUS 0x00000100 #define NFS_FSFLAG_NO_TRUNC 0x00000200 +#define NFS_FSFLAG_NAMED_ATTR 0x00000400 #define NFS_FSFLAG_FHTYPE_MASK 0xFF000000 #define NFS_FSFLAG_FHTYPE_SHIFT 24 +/* + * NFS file system location structures + */ +struct nfs_fs_server { + char * ns_name; /* name of server */ + char ** ns_addresses; /* array of addresses for server */ + uint32_t ns_addrcount; /* # of addresses */ +}; +struct nfs_fs_path { + char ** np_components; /* array of component pointers */ + uint32_t np_compcount; /* # components in path */ +}; +struct nfs_fs_location { + struct nfs_fs_server ** nl_servers; /* array of server pointers */ + struct nfs_fs_path nl_path; /* file system path */ + uint32_t nl_servcount; /* # of servers */ +}; + +struct nfs_location_index { + uint8_t nli_flags; /* misc flags */ + uint8_t nli_loc; /* location index */ + uint8_t nli_serv; /* server index */ + uint8_t nli_addr; /* address index */ +}; +#define NLI_VALID 0x01 /* index is valid */ + +struct nfs_fs_locations { + struct nfs_fs_path nl_root; /* current server's root file system path */ + uint32_t nl_numlocs; /* # of locations */ + struct nfs_location_index nl_current; /* index of current location/server/address */ + struct nfs_fs_location **nl_locations; /* array of fs locations */ +}; + +/* + * RPC record marker parsing state + */ +struct nfs_rpc_record_state { + mbuf_t nrrs_m; /* mbufs for current record */ + mbuf_t nrrs_mlast; + uint16_t nrrs_lastfrag; /* last fragment of record */ + uint16_t nrrs_markerleft; /* marker bytes remaining */ + uint32_t nrrs_fragleft; /* fragment bytes remaining */ + uint32_t nrrs_reclen; /* length of RPC record */ +}; + +/* + * NFS socket structures + */ +struct nfs_socket { + lck_mtx_t nso_lock; /* nfs socket lock */ + TAILQ_ENTRY(nfs_socket) nso_link; /* list of sockets */ + struct sockaddr * nso_saddr; /* socket address */ + struct sockaddr * nso_saddr2; /* additional socket address */ + void * nso_wake; /* address to wake up */ + time_t nso_timestamp; + time_t nso_reqtimestamp; /* last request sent */ + socket_t nso_so; /* socket */ + uint8_t nso_sotype; /* Type of socket */ + uint16_t nso_flags; /* NSO_* flags */ + struct nfs_location_index nso_location; /* location index */ + uint32_t nso_protocol; /* RPC protocol */ + uint32_t nso_version; /* RPC protocol version */ + uint32_t nso_pingxid; /* RPC XID of NULL ping request */ + int nso_error; /* saved error/status */ + struct nfs_rpc_record_state nso_rrs; /* RPC record parsing state (TCP) */ +}; +TAILQ_HEAD(nfssocketlist, nfs_socket); +/* nso_flags */ +#define NSO_UPCALL 0x0001 /* socket upcall in progress */ +#define NSO_DEAD 0x0002 /* socket is dead */ +#define NSO_CONNECTING 0x0004 /* socket is being connected */ +#define NSO_CONNECTED 0x0008 /* socket connection complete */ +#define NSO_PINGING 0x0010 /* socket is being tested */ +#define NSO_VERIFIED 0x0020 /* socket appears functional */ +#define NSO_DISCONNECTING 0x0040 /* socket is being disconnected */ + +/* NFS connect socket search state */ +struct nfs_socket_search { + struct nfs_location_index nss_startloc; /* starting location index */ + struct nfs_location_index nss_nextloc; /* next location index */ + struct nfssocketlist nss_socklist; /* list of active sockets */ + time_t nss_timestamp; /* search start time */ + time_t nss_last; /* timestamp of last 
socket */ + struct nfs_socket * nss_sock; /* found socket */ + uint8_t nss_sotype; /* TCP/UDP */ + uint8_t nss_sockcnt; /* # of active sockets */ + in_port_t nss_port; /* port # to connect to */ + uint32_t nss_protocol; /* RPC protocol */ + uint32_t nss_version; /* RPC protocol version */ + uint32_t nss_flags; /* (see below) */ + int nss_timeo; /* how long we are willing to wait */ + int nss_error; /* best error we've gotten so far */ +}; +/* nss_flags */ +#define NSS_VERBOSE 0x00000001 /* OK to log info about socket search */ +#define NSS_WARNED 0x00000002 /* logged warning about socket search taking a while */ + /* * function table for calling version-specific NFS functions */ struct nfs_funcs { - int (*nf_mount)(struct nfsmount *, vfs_context_t, struct user_nfs_args *, nfsnode_t *); + int (*nf_mount)(struct nfsmount *, vfs_context_t, nfsnode_t *); int (*nf_update_statfs)(struct nfsmount *, vfs_context_t); int (*nf_getquota)(struct nfsmount *, vfs_context_t, uid_t, int, struct dqblk *); int (*nf_access_rpc)(nfsnode_t, u_int32_t *, vfs_context_t); - int (*nf_getattr_rpc)(nfsnode_t, mount_t, u_char *, size_t, vfs_context_t, struct nfs_vattr *, u_int64_t *); + int (*nf_getattr_rpc)(nfsnode_t, mount_t, u_char *, size_t, int, vfs_context_t, struct nfs_vattr *, u_int64_t *); int (*nf_setattr_rpc)(nfsnode_t, struct vnode_attr *, vfs_context_t); int (*nf_read_rpc_async)(nfsnode_t, off_t, size_t, thread_t, kauth_cred_t, struct nfsreq_cbinfo *, struct nfsreq **); int (*nf_read_rpc_async_finish)(nfsnode_t, struct nfsreq *, uio_t, size_t *, int *); int (*nf_readlink_rpc)(nfsnode_t, char *, uint32_t *, vfs_context_t); int (*nf_write_rpc_async)(nfsnode_t, uio_t, size_t, thread_t, kauth_cred_t, int, struct nfsreq_cbinfo *, struct nfsreq **); int (*nf_write_rpc_async_finish)(nfsnode_t, struct nfsreq *, int *, size_t *, uint64_t *); - int (*nf_commit_rpc)(nfsnode_t, uint64_t, uint64_t, kauth_cred_t); + int (*nf_commit_rpc)(nfsnode_t, uint64_t, uint64_t, kauth_cred_t, uint64_t); int (*nf_lookup_rpc_async)(nfsnode_t, char *, int, vfs_context_t, struct nfsreq **); - int (*nf_lookup_rpc_async_finish)(nfsnode_t, vfs_context_t, struct nfsreq *, u_int64_t *, fhandle_t *, struct nfs_vattr *); + int (*nf_lookup_rpc_async_finish)(nfsnode_t, char *, int, vfs_context_t, struct nfsreq *, u_int64_t *, fhandle_t *, struct nfs_vattr *); int (*nf_remove_rpc)(nfsnode_t, char *, int, thread_t, kauth_cred_t); int (*nf_rename_rpc)(nfsnode_t, char *, int, nfsnode_t, char *, int, vfs_context_t); + int (*nf_setlock_rpc)(nfsnode_t, struct nfs_open_file *, struct nfs_file_lock *, int, int, thread_t, kauth_cred_t); + int (*nf_unlock_rpc)(nfsnode_t, struct nfs_lock_owner *, int, uint64_t, uint64_t, int, thread_t, kauth_cred_t); + int (*nf_getlock_rpc)(nfsnode_t, struct nfs_lock_owner *, struct flock *, uint64_t, uint64_t, vfs_context_t); }; /* @@ -148,12 +249,18 @@ __private_extern__ struct nfsclientidlist nfsclientids; */ struct nfsmount { lck_mtx_t nm_lock; /* nfs mount lock */ - int nm_flag; /* Flags for soft/hard... */ + char * nm_args; /* NFS mount args (XDR) */ + uint32_t nm_mattrs[NFS_MATTR_BITMAP_LEN]; /* mount attributes in mount args */ + uint32_t nm_mflags_mask[NFS_MFLAG_BITMAP_LEN]; /* mount flags mask in mount args */ + uint32_t nm_mflags[NFS_MFLAG_BITMAP_LEN]; /* mount flags in mount args */ + uint32_t nm_flags[NFS_MFLAG_BITMAP_LEN]; /* current mount flags (soft, intr, etc...) 
*/ int nm_state; /* Internal state flags */ int nm_vers; /* NFS version */ struct nfs_funcs *nm_funcs; /* version-specific functions */ + kauth_cred_t nm_mcred; /* credential used for the mount (v4) */ mount_t nm_mountp; /* VFS structure for this filesystem */ nfsnode_t nm_dnp; /* root directory nfsnode pointer */ + struct nfs_fs_locations nm_locations; /* file system locations */ int nm_numgrps; /* Max. size of groupslist */ TAILQ_HEAD(, nfs_gss_clnt_ctx) nm_gsscl; /* GSS user contexts */ int nm_timeo; /* Init timer for NFSMNT_DUMBTIMR */ @@ -162,36 +269,48 @@ struct nfsmount { uint32_t nm_wsize; /* Max size of write rpc */ uint32_t nm_biosize; /* buffer I/O size */ uint32_t nm_readdirsize; /* Size of a readdir rpc */ - int nm_readahead; /* Num. of blocks to readahead */ - int nm_acregmin; /* reg file min attr cache timeout */ - int nm_acregmax; /* reg file max attr cache timeout */ - int nm_acdirmin; /* dir min attr cache timeout */ - int nm_acdirmax; /* dir max attr cache timeout */ - uint32_t nm_auth; /* security mechanism flavor */ + uint32_t nm_readahead; /* Num. of blocks to readahead */ + uint32_t nm_acregmin; /* reg file min attr cache timeout */ + uint32_t nm_acregmax; /* reg file max attr cache timeout */ + uint32_t nm_acdirmin; /* dir min attr cache timeout */ + uint32_t nm_acdirmax; /* dir max attr cache timeout */ + uint32_t nm_auth; /* security mechanism flavor being used */ + struct nfs_sec nm_sec; /* acceptable security mechanism flavors */ + struct nfs_sec nm_servsec; /* server's acceptable security mechanism flavors */ + fhandle_t *nm_fh; /* initial file handle */ + uint8_t nm_lockmode; /* advisory file locking mode */ /* mount info */ uint32_t nm_fsattrstamp; /* timestamp for fs attrs */ struct nfs_fsattr nm_fsattr; /* file system attributes */ uint64_t nm_verf; /* v3/v4 write verifier */ union { struct { /* v2/v3 specific fields */ - u_short rqport; /* cached rquota port */ - uint32_t rqportstamp; /* timestamp of rquota port */ + TAILQ_ENTRY(nfsmount) ldlink; /* chain of mounts registered for lockd use */ + int udp_sent; /* UDP request send count */ + int udp_cwnd; /* UDP request congestion window */ + struct nfs_reqqhead udp_cwndq; /* requests waiting on cwnd */ + struct sockaddr *rqsaddr;/* cached rquota socket address */ + uint32_t rqsaddrstamp; /* timestamp of rquota socket address */ } v3; struct { /* v4 specific fields */ struct nfs_client_id *longid; /* client ID, long form */ uint64_t mounttime; /* used as client ID verifier */ uint64_t clientid; /* client ID, short form */ thread_call_t renew_timer; /* RENEW timer call */ - TAILQ_HEAD(, nfs_open_owner) open_owners; /* list of open owners */ - TAILQ_HEAD(, nfsnode) recallq; /* list of nodes with recalled delegations */ + nfs_fsid fsid; /* NFS file system id */ + TAILQ_HEAD(, nfsnode) delegations; /* list of nodes with delegations */ + TAILQ_HEAD(, nfsnode) dreturnq; /* list of nodes with delegations to return */ TAILQ_ENTRY(nfsmount) cblink; /* chain of mounts registered for callbacks */ - uint32_t stateinuse; /* state in use counter */ - uint32_t stategenid; /* state generation counter */ - kauth_cred_t mcred; /* credential used for the mount */ uint32_t cbid; /* callback channel identifier */ uint32_t cbrefs; /* # callbacks using this mount */ } v4; } nm_un; + /* common state */ + TAILQ_HEAD(, nfs_open_owner) nm_open_owners; /* list of open owners */ + uint32_t nm_stateinuse; /* state in use counter */ + uint32_t nm_stategenid; /* state generation counter */ + time_t nm_recover_start; /* recover start time */ 
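+	/*
+	 * Illustrative sketch, not part of the original change: nm_flags
+	 * above is a multi-word bitmap rather than a single int of flag
+	 * bits, so a flag test is a word-indexed bit test using the
+	 * NFS_BITMAP_* macros from nfsproto.h.  Assuming a flag index
+	 * named NFS_MFLAG_SOFT (defined alongside the other NFS_MFLAG_*
+	 * mount flag indices), a caller would write:
+	 *
+	 *	if (NFS_BITMAP_ISSET(nmp->nm_flags, NFS_MFLAG_SOFT))
+	 *		error = ETIMEDOUT;	soft mount: fail the request
+	 *
+	 * which the NMFLAG() convenience macro below shortens to
+	 * NMFLAG(nmp, SOFT).
+	 */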
+ LIST_HEAD(, nfsnode) nm_monlist; /* list of nodes being monitored */ /* async I/O queue */ struct nfs_reqqhead nm_resendq; /* async I/O resend queue */ struct nfs_reqqhead nm_iodq; /* async I/O request queue */ @@ -199,11 +318,14 @@ struct nfsmount { TAILQ_ENTRY(nfsmount) nm_iodlink; /* chain of mounts awaiting nfsiod */ int nm_asyncwrites; /* outstanding async I/O writes */ /* socket state */ - int nm_sotype; /* Type of socket */ - int nm_soproto; /* and protocol */ - mbuf_t nm_nam; /* Address of server */ + uint8_t nm_sofamily; /* (preferred) protocol family of socket */ + uint8_t nm_sotype; /* (preferred) type of socket */ + in_port_t nm_nfsport; /* NFS protocol port */ + in_port_t nm_mountport; /* MOUNT protocol port (v2/v3) */ + struct nfs_socket_search *nm_nss; /* current socket search structure */ + struct nfs_socket *nm_nso; /* current socket */ + struct sockaddr *nm_saddr; /* Address of server */ u_short nm_sockflags; /* socket state flags */ - socket_t nm_so; /* RPC socket */ time_t nm_deadto_start; /* dead timeout start time */ time_t nm_reconnect_start; /* reconnect start time */ int nm_tprintf_initial_delay; /* delay first "server down" */ @@ -213,27 +335,26 @@ struct nfsmount { int nm_sdrtt[4]; int nm_timeouts; /* Request timeouts */ int nm_jbreqs; /* # R_JBTPRINTFMSG requests */ - union { - struct { - int sent; /* Request send count */ - int cwnd; /* Request congestion window */ - struct nfs_reqqhead cwndq; /* requests waiting on cwnd */ - } udp; - struct { - u_int32_t mleft;/* marker bytes remaining */ - u_int32_t fleft;/* fragment bytes remaining */ - u_int32_t len; /* length of RPC record */ - mbuf_t m; /* mbufs for current record */ - mbuf_t mlast; - } tcp; - } nm_sockstate; + int nm_mounterror; /* status of mount connect */ TAILQ_ENTRY(nfsmount) nm_pokeq; /* mount poke queue chain */ thread_t nm_sockthd; /* socket thread for this mount */ }; +/* macro for checking current mount flags */ +#define NMFLAG(NMP, F) NFS_BITMAP_ISSET((NMP)->nm_flags, NFS_MFLAG_ ## F) +/* macros for checking (original) mount attributes/flags */ +#define NM_OMATTR_GIVEN(NMP, F) NFS_BITMAP_ISSET((NMP)->nm_mattrs, NFS_MATTR_ ## F) +#define NM_OMFLAG_GIVEN(NMP, F) NFS_BITMAP_ISSET((NMP)->nm_mflags_mask, NFS_MFLAG_ ## F) +#define NM_OMFLAG(NMP, F) NFS_BITMAP_ISSET((NMP)->nm_mflags, NFS_MFLAG_ ## F) + /* * NFS mount state flags (nm_state) */ +#define NFSSTA_MOUNT_THREAD 0x00000040 /* nfs_mount_connect_thread running */ +#define NFSSTA_MONITOR_SCAN 0x00000080 /* scan of monitored nodes in progress */ +#define NFSSTA_UNMOUNTING 0x00000100 /* an unmount attempt is in progress */ +#define NFSSTA_NEEDSECINFO 0x00000200 /* need to fetch security info */ +#define NFSSTA_CLIENTID 0x00000400 /* short client ID is valid */ #define NFSSTA_BIGCOOKIES 0x00000800 /* have seen >32bit dir cookies */ #define NFSSTA_JUKEBOXTIMEO 0x00001000 /* experienced a jukebox timeout */ #define NFSSTA_LOCKTIMEO 0x00002000 /* experienced a lock req timeout */ @@ -244,45 +365,41 @@ struct nfsmount { #define NFSSTA_HASWRITEVERF 0x00040000 /* Has write verifier for V3 */ #define NFSSTA_GOTPATHCONF 0x00080000 /* Got the V3 pathconf info */ #define NFSSTA_GOTFSINFO 0x00100000 /* Got the V3 fsinfo */ +#define NFSSTA_SENDING 0x00800000 /* Sending on socket */ #define NFSSTA_SNDLOCK 0x01000000 /* Send socket lock */ #define NFSSTA_WANTSND 0x02000000 /* Want above */ #define NFSSTA_DEAD 0x04000000 /* mount is dead */ #define NFSSTA_RECOVER 0x08000000 /* mount state needs to be recovered */ +#define NFSSTA_RECOVER_EXPIRED 0x10000000 /* 
mount state expired */ +#define NFSSTA_REVOKE 0x20000000 /* need to scan for revoked nodes */ /* flags for nm_sockflags */ #define NMSOCK_READY 0x0001 /* socket is ready for use */ #define NMSOCK_CONNECTING 0x0002 /* socket is being connect()ed */ #define NMSOCK_SETUP 0x0004 /* socket/connection is being set up */ #define NMSOCK_UNMOUNT 0x0008 /* unmounted, no more socket activity */ -#define NMSOCK_LASTFRAG 0x0010 /* on last fragment of RPC record */ +#define NMSOCK_HASCONNECTED 0x0010 /* socket has connected before */ #define NMSOCK_POKE 0x0020 /* socket needs to be poked */ -#define NMSOCK_UPCALL 0x0040 /* socket upcall in progress */ - -/* aliases for socket state variables */ -#define nm_sent nm_sockstate.udp.sent -#define nm_cwnd nm_sockstate.udp.cwnd -#define nm_cwndq nm_sockstate.udp.cwndq -#define nm_markerleft nm_sockstate.tcp.mleft -#define nm_fragleft nm_sockstate.tcp.fleft -#define nm_reclen nm_sockstate.tcp.len -#define nm_m nm_sockstate.tcp.m -#define nm_mlast nm_sockstate.tcp.mlast +#define NMSOCK_DISCONNECTING 0x0080 /* socket is being disconnected */ /* aliases for version-specific fields */ -#define nm_rqport nm_un.v3.rqport -#define nm_rqportstamp nm_un.v3.rqportstamp +#define nm_ldlink nm_un.v3.ldlink +#define nm_sent nm_un.v3.udp_sent +#define nm_cwnd nm_un.v3.udp_cwnd +#define nm_cwndq nm_un.v3.udp_cwndq +#define nm_rqproto nm_un.v3.rqproto +#define nm_rqsaddr nm_un.v3.rqsaddr +#define nm_rqsaddrstamp nm_un.v3.rqsaddrstamp #define nm_longid nm_un.v4.longid #define nm_clientid nm_un.v4.clientid #define nm_mounttime nm_un.v4.mounttime +#define nm_fsid nm_un.v4.fsid #define nm_renew_timer nm_un.v4.renew_timer -#define nm_open_owners nm_un.v4.open_owners -#define nm_stateinuse nm_un.v4.stateinuse -#define nm_stategenid nm_un.v4.stategenid -#define nm_mcred nm_un.v4.mcred #define nm_cbid nm_un.v4.cbid #define nm_cblink nm_un.v4.cblink #define nm_cbrefs nm_un.v4.cbrefs -#define nm_recallq nm_un.v4.recallq +#define nm_delegations nm_un.v4.delegations +#define nm_dreturnq nm_un.v4.dreturnq #if defined(KERNEL) /* diff --git a/bsd/nfs/nfsnode.h b/bsd/nfs/nfsnode.h index fa0d5bfc4..cce1399ca 100644 --- a/bsd/nfs/nfsnode.h +++ b/bsd/nfs/nfsnode.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -75,6 +75,7 @@ #ifndef _NFS_NFS_H_ #include <nfs/nfs.h> #endif +#include <sys/kauth.h> /* * Silly rename structure that hangs off the nfsnode until the name @@ -255,6 +256,7 @@ struct nfs_dir_buf_header { /* ndbh_flags */ #define NDB_FULL 0x0001 /* buffer has been filled */ #define NDB_EOF 0x0002 /* buffer contains EOF */ +#define NDB_PLUS 0x0004 /* buffer contains RDIRPLUS data */ #define NFS_DIR_BUF_FIRST_DIRENTRY(BP) \ ((struct direntry*)((char*)((BP)->nb_data) + sizeof(*ndbhp))) @@ -313,11 +315,14 @@ struct nfsdmap { struct nfs_vattr { enum vtype nva_type; /* vnode type (for create) */ - uint32_t nva_mode; /* files access mode (and type) */ + uint32_t nva_mode; /* file's access mode (and type) */ uid_t nva_uid; /* owner user id */ gid_t nva_gid; /* owner group id */ + guid_t nva_uuuid; /* owner user UUID */ + guid_t nva_guuid; /* owner group UUID */ + kauth_acl_t nva_acl; /* access control list */ nfs_specdata nva_rawdev; /* device the special file represents */ - uint32_t nva_flags; /* file flags */ + uint32_t nva_flags; /* file flags (see below) */ uint32_t nva_maxlink; /* maximum # of links (v4) */ uint64_t nva_nlink; /* number of references to file */ uint64_t nva_fileid; /* file id */ @@ -330,13 +335,35 @@ struct nfs_vattr { uint32_t nva_bitmap[NFS_ATTR_BITMAP_LEN]; /* attributes that are valid */ }; -#define NFS_FFLAG_ARCHIVED 0x0001 -#define NFS_FFLAG_HIDDEN 0x0002 -#define NFS_FFLAG_NAMED_ATTR 0x0004 /* file has named attributes */ +/* nva_flags */ +#define NFS_FFLAG_ARCHIVED 0x0001 +#define NFS_FFLAG_HIDDEN 0x0002 +#define NFS_FFLAG_HAS_NAMED_ATTRS 0x0004 /* file has named attributes */ +#define NFS_FFLAG_TRIGGER 0x0008 /* node is a trigger/mirror mount point */ +#define NFS_FFLAG_TRIGGER_REFERRAL 0x0010 /* trigger is a referral */ +#define NFS_FFLAG_IS_ATTR 0x8000 /* file is a named attribute file/directory */ /* flags for nfs_getattr() */ -#define NGA_CACHED 0 -#define NGA_UNCACHED 1 +#define NGA_CACHED 0x0001 /* use cached attributes (if still valid) */ +#define NGA_UNCACHED 0x0002 /* fetch new attributes */ +#define NGA_ACL 0x0004 /* fetch ACL */ +#define NGA_MONITOR 0x0008 /* vnode monitor attr update poll */ + +/* macros for initting/cleaning up nfs_vattr structures */ +#define NVATTR_INIT(NVAP) \ + do { \ + NFS_CLEAR_ATTRIBUTES((NVAP)->nva_bitmap); \ + (NVAP)->nva_flags = 0; \ + (NVAP)->nva_acl = NULL; \ + } while (0) +#define NVATTR_CLEANUP(NVAP) \ + do { \ + NFS_CLEAR_ATTRIBUTES((NVAP)->nva_bitmap); \ + if ((NVAP)->nva_acl) { \ + kauth_acl_free((NVAP)->nva_acl); \ + (NVAP)->nva_acl = NULL; \ + } \ + } while (0) /* * macros for detecting node changes @@ -416,17 +443,27 @@ struct nfs_open_file { uint32_t nof_rw; /* read/write opens (deny none) */ uint32_t nof_r_dw; /* read deny-write opens */ /* the rest of the counts have a max of 2 (1 for open + 1 for mmap) */ - uint32_t nof_w_dw:4; /* write deny-write opens (max 2) */ - uint32_t nof_rw_dw:4; /* read/write deny-write opens (max 2) */ - uint32_t nof_r_drw:4; /* read deny-read/write opens (max 2) */ - uint32_t nof_w_drw:4; /* write deny-read/write opens (max 2) */ - uint32_t nof_rw_drw:4; /* read/write deny-read/write opens (max 2) */ + uint32_t nof_w_dw:2; /* write deny-write opens (max 2) */ + uint32_t nof_rw_dw:2; /* read/write deny-write opens (max 2) */ + uint32_t nof_r_drw:2; /* read deny-read/write opens (max 2) */ + uint32_t nof_w_drw:2; /* write deny-read/write opens (max 2) */ + uint32_t nof_rw_drw:2; /* read/write deny-read/write opens (max 2) */ + /* counts 
of DELEGATED access/deny mode open combinations */ + uint32_t nof_d_w_dw:2; /* write deny-write opens (max 2) */ + uint32_t nof_d_rw_dw:2; /* read/write deny-write opens (max 2) */ + uint32_t nof_d_r_drw:2; /* read deny-read/write opens (max 2) */ + uint32_t nof_d_w_drw:2; /* write deny-read/write opens (max 2) */ + uint32_t nof_d_rw_drw:2; /* read/write deny-read/write opens (max 2) */ + uint32_t nof_d_r; /* read opens (deny none) */ + uint32_t nof_d_w; /* write opens (deny none) */ + uint32_t nof_d_rw; /* read/write opens (deny none) */ + uint32_t nof_d_r_dw; /* read deny-write opens */ }; /* nof_flags */ #define NFS_OPEN_FILE_BUSY 0x0001 /* open state-modifying operation in progress */ #define NFS_OPEN_FILE_WANT 0x0002 /* someone else wants to mark busy */ -#define NFS_OPEN_FILE_CREATE 0x0004 /* has an open(RW) from a VNOP_CREATE call */ -#define NFS_OPEN_FILE_NEEDCLOSE 0x0008 /* has an open(R) from an (unopen) VNOP_READ call */ +#define NFS_OPEN_FILE_CREATE 0x0004 /* has an open(RW) from a "CREATE" call */ +#define NFS_OPEN_FILE_NEEDCLOSE 0x0008 /* has an open(R) from an (unopen) VNOP_READ or VNOP_MMAP call */ #define NFS_OPEN_FILE_SETATTR 0x0020 /* has an open(W) to perform a SETATTR(size) */ #define NFS_OPEN_FILE_POSIXLOCK 0x0040 /* server supports POSIX locking semantics */ #define NFS_OPEN_FILE_LOST 0x0080 /* open state has been lost */ @@ -458,6 +495,7 @@ struct nfs_file_lock { #define NFS_FILE_LOCK_WAIT 0x08 /* may block on conflicting locks */ #define NFS_FILE_LOCK_BLOCKED 0x10 /* request is blocked */ #define NFS_FILE_LOCK_DEAD 0x20 /* lock (request) no longer exists */ +#define NFS_FILE_LOCK_DELEGATED 0x40 /* lock acquired via delegation */ TAILQ_HEAD(nfs_file_lock_queue, nfs_file_lock); @@ -514,14 +552,18 @@ struct nfsnode { lck_rw_t n_datalock; /* nfs node data lock */ void *n_datalockowner;/* nfs node data lock owner (exclusive) */ LIST_ENTRY(nfsnode) n_hash; /* Hash chain */ + LIST_ENTRY(nfsnode) n_monlink; /* list of monitored nodes */ u_quad_t n_size; /* Current size of file */ u_quad_t n_newsize; /* new size of file (pending update) */ u_int64_t n_xid; /* last xid to loadattr */ struct nfs_vattr n_vattr; /* Vnode attribute cache */ time_t n_attrstamp; /* Attr. cache timestamp */ - u_int8_t n_mode[NFS_ACCESS_CACHE_SIZE+1]; /* ACCESS mode cache */ - uid_t n_modeuid[NFS_ACCESS_CACHE_SIZE]; /* credentials having mode */ - time_t n_modestamp[NFS_ACCESS_CACHE_SIZE]; /* mode cache timestamp */ + time_t n_aclstamp; /* ACL cache timestamp */ + time_t n_evtstamp; /* last vnode event timestamp */ + uint32_t n_events; /* pending vnode events */ + u_int8_t n_access[NFS_ACCESS_CACHE_SIZE+1]; /* ACCESS cache */ + uid_t n_accessuid[NFS_ACCESS_CACHE_SIZE]; /* credentials having access */ + time_t n_accessstamp[NFS_ACCESS_CACHE_SIZE]; /* access cache timestamp */ union { struct { struct timespec n3_mtime; /* Prev modify time. 
*/ @@ -530,6 +572,8 @@ struct nfsnode { struct { uint64_t n4_change; /* prev change attribute */ uint64_t n4_ncchange; /* namecache change attribute */ + u_char *n4_attrdirfh; /* associated attr directory fh */ + struct timeval n4_lastio; /* time of most recent I/O on attr */ } v4; } n_un4; vnode_t n_parent; /* this node's parent */ @@ -555,7 +599,9 @@ struct nfsnode { u_short n_flag; /* node flags */ u_short n_hflag; /* node hash flags */ u_short n_bflag; /* node buffer flags */ + u_short n_mflag; /* node mount flags */ u_char n_fh[NFS_SMALLFH];/* Small File Handle */ + uint32_t n_auth; /* security flavor used for this node */ struct nfsbuflists n_cleanblkhd; /* clean blocklist head */ struct nfsbuflists n_dirtyblkhd; /* dirty blocklist head */ union { @@ -567,7 +613,10 @@ struct nfsnode { daddr64_t nd_lastdbl; /* last dir buf lookup block# */ } n_un6; int n_bufiterflags; /* buf iterator flags */ - int n_numoutput; /* I/O in progress */ + union { + int nf_numoutput; /* write I/Os in progress */ + int nd_trigseq; /* vnode trigger seq# */ + } n_un7; /* open state */ lck_mtx_t n_openlock; /* nfs node open lock */ uint32_t n_openflags; /* open state flags */ @@ -578,7 +627,9 @@ struct nfsnode { struct nfs_file_lock_queue n_locks; /* list of locks */ /* delegation state */ nfs_stateid n_dstateid; /* delegation stateid */ - TAILQ_ENTRY(nfsnode) n_dlink; /* delegation recall list link */ + TAILQ_ENTRY(nfsnode) n_dlink; /* delegation list link */ + TAILQ_ENTRY(nfsnode) n_dreturn; /* delegation return list link */ + struct kauth_ace n_dace; /* delegation ACE */ }; #define NFS_DATA_LOCK_SHARED 1 @@ -604,20 +655,25 @@ struct nfsnode { #define n_sillyrename n_un3.nf_silly #define n_wrbusy n_un5.nf_wrbusy #define n_needcommitcnt n_un6.nf_needcommitcnt +#define n_numoutput n_un7.nf_numoutput #define n_cookieverf n_un1.nd_cookieverf #define n_eofcookie n_un2.nd_eofcookie #define n_cookiecache n_un3.nd_cookiecache #define n_ncgen n_un5.nd_ncgen #define n_lastdbl n_un6.nd_lastdbl +#define n_trigseq n_un7.nd_trigseq #define n_mtime n_un4.v3.n3_mtime #define n_ncmtime n_un4.v3.n3_ncmtime #define n_change n_un4.v4.n4_change #define n_ncchange n_un4.v4.n4_ncchange +#define n_attrdirfh n_un4.v4.n4_attrdirfh +#define n_lastio n_un4.v4.n4_lastio /* * Flags for n_flag */ #define NUPDATESIZE 0x0001 /* size of file needs updating */ +#define NREVOKE 0x0002 /* node revoked */ #define NMODIFIED 0x0004 /* Might have a modified buffer in bio */ #define NWRITEERR 0x0008 /* Flag write errors so close will know */ #define NNEEDINVALIDATE 0x0010 /* need to call vinvalbuf() */ @@ -629,6 +685,9 @@ struct nfsnode { #define NNEGNCENTRIES 0x0800 /* directory has negative name cache entries */ #define NBUSY 0x1000 /* node is busy */ #define NBUSYWANT 0x2000 /* waiting on busy node */ +#define NISDOTZFS 0x4000 /* a ".zfs" directory */ +#define NISDOTZFSCHILD 0x8000 /* a child of a ".zfs" directory */ + /* * Flags for n_hflag @@ -648,6 +707,13 @@ struct nfsnode { #define NBINVALINPROG 0x0004 /* Avoid multiple calls to nfs_vinvalbuf() */ #define NBINVALWANT 0x0008 /* waiting for nfs_vinvalbuf() to complete */ +/* + * Flags for n_mflag + * Note: protected by nfsmount's nm_lock + */ +#define NMMONSCANINPROG 0x0001 /* monitored node is currently updating attributes */ +#define NMMONSCANWANT 0x0002 /* waiting for attribute update to complete */ + /* * n_openflags * Note: protected by n_openlock @@ -657,18 +723,22 @@ struct nfsnode { #define N_DELEG_READ 0x0004 /* we have a read delegation */ #define N_DELEG_WRITE 0x0008 /* we have a 
write delegation */ #define N_DELEG_MASK 0x000c /* delegation mask */ +#define N_DELEG_RETURN 0x0010 /* delegation queued for return */ +#define N_DELEG_RETURNING 0x0020 /* delegation being returned */ -/* attr/mode timestamp macros */ +/* attr/access/ACL cache timestamp macros */ #define NATTRVALID(np) ((np)->n_attrstamp != ~0) #define NATTRINVALIDATE(np) ((np)->n_attrstamp = ~0) -#define NMODEVALID(np, slot) (((slot) >= 0) && ((slot) < 3) && ((np)->n_modestamp[(slot)] != ~0)) -#define NMODEINVALIDATE(np) \ +#define NACCESSVALID(np, slot) (((slot) >= 0) && ((slot) < NFS_ACCESS_CACHE_SIZE) && ((np)->n_accessstamp[(slot)] != ~0)) +#define NACCESSINVALIDATE(np) \ do { \ - (np)->n_modestamp[0] = ~0; \ - (np)->n_modestamp[1] = ~0; \ - (np)->n_modestamp[2] = ~0; \ - (np)->n_mode[3] = 0; \ + int __i; \ + for (__i=0; __i < NFS_ACCESS_CACHE_SIZE; __i++) \ + (np)->n_accessstamp[__i] = ~0; \ + (np)->n_access[NFS_ACCESS_CACHE_SIZE] = 0; \ } while (0) +#define NACLVALID(np) ((np)->n_aclstamp != ~0) +#define NACLINVALIDATE(np) ((np)->n_aclstamp = ~0) /* * NFS-specific flags for nfs_vinvalbuf/nfs_flush @@ -691,6 +761,16 @@ struct nfsnode { /* nfsnode hash table mutex */ __private_extern__ lck_mtx_t *nfs_node_hash_mutex; +/* + * printf-like helper macro that also outputs node name. + */ +#define NP(NP, FMT, ...) \ + do { \ + const char *__vname = (NP) ? vnode_getname(NFSTOV(NP)) : NULL; \ + printf(FMT " %s\n", ##__VA_ARGS__, __vname ? __vname : "???"); \ + if (__vname) vnode_putname(__vname); \ + } while (0) + /* * nfsiod structures */ @@ -743,7 +823,7 @@ void nfs_data_update_size(nfsnode_t, int); /* other stuff */ int nfs_removeit(struct nfs_sillyrename *); -int nfs_nget(mount_t,nfsnode_t,struct componentname *,u_char *,int,struct nfs_vattr *,u_int64_t *,int,nfsnode_t*); +int nfs_nget(mount_t,nfsnode_t,struct componentname *,u_char *,int,struct nfs_vattr *,u_int64_t *,uint32_t,int,nfsnode_t*); void nfs_dir_cookie_cache(nfsnode_t, uint64_t, uint64_t); int nfs_dir_cookie_to_lbn(nfsnode_t, uint64_t, int *, uint64_t *); void nfs_invaldir(nfsnode_t); diff --git a/bsd/nfs/nfsproto.h b/bsd/nfs/nfsproto.h index 9823531f4..ec6bc9311 100644 --- a/bsd/nfs/nfsproto.h +++ b/bsd/nfs/nfsproto.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -348,21 +348,21 @@ typedef enum { NFNON=0, NFREG=1, NFDIR=2, NFBLK=3, NFCHR=4, NFLNK=5, * NFS attribute management stuff */ #define NFS_ATTR_BITMAP_LEN 2 -#define NFS_BITMAP_SET(A, I) (((uint32_t *)(A))[(I)/32] |= 1<<((I)%32)) -#define NFS_BITMAP_CLR(A, I) (((uint32_t *)(A))[(I)/32] &= ~(1<<((I)%32))) -#define NFS_BITMAP_ISSET(A, I) (((uint32_t *)(A))[(I)/32] & (1<<((I)%32))) +#define NFS_BITMAP_SET(B, I) (((uint32_t *)(B))[(I)/32] |= 1<<((I)%32)) +#define NFS_BITMAP_CLR(B, I) (((uint32_t *)(B))[(I)/32] &= ~(1<<((I)%32))) +#define NFS_BITMAP_ISSET(B, I) (((uint32_t *)(B))[(I)/32] & (1<<((I)%32))) +#define NFS_BITMAP_ZERO(B, L) \ + do { \ + int __i; \ + for (__i=0; __i < (L); __i++) \ + ((uint32_t*)(B))[__i] = 0; \ + } while (0) __private_extern__ uint32_t nfs_fs_attr_bitmap[NFS_ATTR_BITMAP_LEN]; __private_extern__ uint32_t nfs_object_attr_bitmap[NFS_ATTR_BITMAP_LEN]; __private_extern__ uint32_t nfs_getattr_bitmap[NFS_ATTR_BITMAP_LEN]; -#define NFS_CLEAR_ATTRIBUTES(A) \ - do { \ - int __i; \ - for (__i=0; __i < NFS_ATTR_BITMAP_LEN; __i++) \ - ((uint32_t*)(A))[__i] = 0; \ - } while (0) - +#define NFS_CLEAR_ATTRIBUTES(A) NFS_BITMAP_ZERO((A), NFS_ATTR_BITMAP_LEN) #define NFS_COPY_ATTRIBUTES(SRC, DST) \ do { \ int __i; \ @@ -571,7 +571,7 @@ __private_extern__ uint32_t nfs_getattr_bitmap[NFS_ATTR_BITMAP_LEN]; /* NFS_BITMAP_SET((A), NFS_FATTR_FILEHANDLE); */ \ /* optional: */ \ /* NFS_BITMAP_SET((A), NFS_FATTR_ACL); */ \ - /* NFS_BITMAP_SET((A), NFS_FATTR_ACLSUPPORT); */ \ + NFS_BITMAP_SET((A), NFS_FATTR_ACLSUPPORT); \ NFS_BITMAP_SET((A), NFS_FATTR_ARCHIVE); \ /* NFS_BITMAP_SET((A), NFS_FATTR_CANSETTIME); */ \ NFS_BITMAP_SET((A), NFS_FATTR_CASE_INSENSITIVE); \ @@ -612,7 +612,7 @@ __private_extern__ uint32_t nfs_getattr_bitmap[NFS_ATTR_BITMAP_LEN]; NFS_BITMAP_SET((A), NFS_FATTR_TIME_METADATA); \ NFS_BITMAP_SET((A), NFS_FATTR_TIME_MODIFY); \ /* NFS_BITMAP_SET((A), NFS_FATTR_TIME_MODIFY_SET); */ \ - /* NFS_BITMAP_SET((A), NFS_FATTR_MOUNTED_ON_FILEID); */ \ + NFS_BITMAP_SET((A), NFS_FATTR_MOUNTED_ON_FILEID); \ } while (0) /* attributes requested when we want to do a "statfs" */ @@ -637,6 +637,7 @@ __private_extern__ uint32_t nfs_getattr_bitmap[NFS_ATTR_BITMAP_LEN]; #define NFS_LIMIT_SIZE 1 #define NFS_LIMIT_BLOCKS 2 /* access/deny modes */ +#define NFS_OPEN_SHARE_ACCESS_NONE 0x00000000 #define NFS_OPEN_SHARE_ACCESS_READ 0x00000001 #define NFS_OPEN_SHARE_ACCESS_WRITE 0x00000002 #define NFS_OPEN_SHARE_ACCESS_BOTH 0x00000003 @@ -740,6 +741,7 @@ __private_extern__ uint32_t nfs_getattr_bitmap[NFS_ATTR_BITMAP_LEN]; #define NFS_ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x00000010 #define NFS_ACE_FAILED_ACCESS_ACE_FLAG 0x00000020 #define NFS_ACE_IDENTIFIER_GROUP 0x00000040 +#define NFS_ACE_INHERITED_ACE 0x00000080 /* ACE mask flags */ #define NFS_ACE_READ_DATA 0x00000001 #define NFS_ACE_LIST_DIRECTORY 0x00000001 diff --git a/bsd/nfs/nfsrvcache.h b/bsd/nfs/nfsrvcache.h index fa23f1877..06bc9baeb 100644 --- a/bsd/nfs/nfsrvcache.h +++ b/bsd/nfs/nfsrvcache.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -76,6 +76,13 @@ * Definitions for the server recent request cache */ +/* Network address hash list element */ +union nethostaddr { + in_addr_t had_inetaddr; + struct in6_addr had_inet6addr; + mbuf_t had_nam; +}; + #define NFSRVCACHESIZ 64 struct nfsrvcache { @@ -86,6 +93,7 @@ struct nfsrvcache { mbuf_t ru_repmb; /* Reply mbuf list OR */ int ru_repstat; /* Reply status */ } rc_un; + sa_family_t rc_family; /* address family */ union nethostaddr rc_haddr; /* Host address */ u_int32_t rc_proc; /* rpc proc number */ u_char rc_state; /* Current state of request */ @@ -95,6 +103,7 @@ struct nfsrvcache { #define rc_reply rc_un.ru_repmb #define rc_status rc_un.ru_repstat #define rc_inetaddr rc_haddr.had_inetaddr +#define rc_inet6addr rc_haddr.had_inet6addr #define rc_nam rc_haddr.had_nam /* Cache entry states */ diff --git a/bsd/nfs/rpcv2.h b/bsd/nfs/rpcv2.h index 510f5110b..3a288f203 100644 --- a/bsd/nfs/rpcv2.h +++ b/bsd/nfs/rpcv2.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -82,6 +82,7 @@ /* Authentication */ #define RPCAUTH_NULL 0 +#define RPCAUTH_NONE RPCAUTH_NULL #define RPCAUTH_UNIX 1 #define RPCAUTH_SYS RPCAUTH_UNIX #define RPCAUTH_SHORT 2 @@ -89,6 +90,8 @@ #define RPCAUTH_KRB5 390003 #define RPCAUTH_KRB5I 390004 #define RPCAUTH_KRB5P 390005 +#define RPCAUTH_INVALID ~0U +#define RPCAUTH_UNKNOWN RPCAUTH_INVALID #define RPCAUTH_MAXSIZ 400 #define RPCAUTH_UNIXGIDS 16 diff --git a/bsd/nfs/xdr_subs.h b/bsd/nfs/xdr_subs.h index 9a399db19..59356190a 100644 --- a/bsd/nfs/xdr_subs.h +++ b/bsd/nfs/xdr_subs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -64,8 +64,6 @@ * @(#)xdr_subs.h 8.3 (Berkeley) 3/30/95 * FreeBSD-Id: xdr_subs.h,v 1.9 1997/02/22 09:42:53 peter Exp $ */ - - #ifndef _NFS_XDR_SUBS_H_ #define _NFS_XDR_SUBS_H_ @@ -96,5 +94,418 @@ ((uint32_t *)(t))[1] = htonl(((uint32_t *)(f))[_QUAD_LOWWORD]); \ } + +/* + * xdrbuf + * + * generalized functionality for managing the building/dissecting of XDR data + */ +typedef enum xdrbuf_type { XDRBUF_BUFFER=1 } xdrbuf_type; + +struct xdrbuf { + union { + struct { + char * xbb_base; /* base address of buffer */ + uint32_t xbb_size; /* size of buffer */ + uint32_t xbb_len; /* length of data in buffer */ + } xb_buffer; + } xb_u; + char * xb_ptr; /* pointer to current position */ + size_t xb_left; /* bytes remaining in current buffer */ + size_t xb_growsize; /* bytes to allocate when growing */ + xdrbuf_type xb_type; /* type of xdr buffer */ + uint32_t xb_flags; /* XB_* (see below) */ +}; + +#define XB_CLEANUP 0x0001 /* needs cleanup */ + +#define XDRWORD 4 /* the basic XDR building block is a 4 byte (32 bit) word */ +#define xdr_rndup(a) (((a)+3)&(~0x3)) /* round up to XDRWORD size */ +#define xdr_pad(a) (xdr_rndup(a) - (a)) /* calculate round up padding */ + +void xb_init(struct xdrbuf *, xdrbuf_type); +void xb_init_buffer(struct xdrbuf *, char *, size_t); +void xb_cleanup(struct xdrbuf *); +void *xb_malloc(size_t); +void xb_free(void *); +int xb_grow(struct xdrbuf *); +void xb_set_cur_buf_len(struct xdrbuf *); +char *xb_buffer_base(struct xdrbuf *); +int xb_advance(struct xdrbuf *, uint32_t); +int xb_offset(struct xdrbuf *); +int xb_seek(struct xdrbuf *, uint32_t); +int xb_add_bytes(struct xdrbuf *, const char *, uint32_t, int); +int xb_get_bytes(struct xdrbuf *, char *, uint32_t, int); + +#ifdef _NFS_XDR_SUBS_FUNCS_ + +/* + * basic initialization of xdrbuf structure + */ +void +xb_init(struct xdrbuf *xbp, xdrbuf_type type) +{ + bzero(xbp, sizeof(*xbp)); + xbp->xb_type = type; + xbp->xb_flags |= XB_CLEANUP; +} + +/* + * initialize a single-buffer xdrbuf + */ +void +xb_init_buffer(struct xdrbuf *xbp, char *buf, size_t buflen) +{ + xb_init(xbp, XDRBUF_BUFFER); + xbp->xb_u.xb_buffer.xbb_base = buf; + xbp->xb_u.xb_buffer.xbb_size = buflen; + xbp->xb_u.xb_buffer.xbb_len = buflen; + xbp->xb_growsize = 512; + xbp->xb_ptr = buf; + xbp->xb_left = buflen; + if (buf) /* when using an existing buffer, xb code should skip cleanup */ + xbp->xb_flags &= ~XB_CLEANUP; +} + +/* + * get the pointer to the single-buffer xdrbuf's buffer + */ +char * +xb_buffer_base(struct xdrbuf *xbp) +{ + return (xbp->xb_u.xb_buffer.xbb_base); +} + +/* + * clean up any resources held by an xdrbuf + */ +void +xb_cleanup(struct xdrbuf *xbp) +{ + if (!(xbp->xb_flags & XB_CLEANUP)) + return; + switch (xbp->xb_type) { + case XDRBUF_BUFFER: + if (xbp->xb_u.xb_buffer.xbb_base) + xb_free(xbp->xb_u.xb_buffer.xbb_base); + break; + } + xbp->xb_flags &= ~XB_CLEANUP; +} + +/* + * set the length of valid data in the current buffer to + * be up to the current location within the buffer + */ +void +xb_set_cur_buf_len(struct xdrbuf *xbp) +{ + switch (xbp->xb_type) { + case XDRBUF_BUFFER: + xbp->xb_u.xb_buffer.xbb_len = xbp->xb_ptr - xbp->xb_u.xb_buffer.xbb_base; + break; + } +} + +/* + * advance forward through existing data in xdrbuf + */ +int +xb_advance(struct xdrbuf *xbp, uint32_t len) +{ + uint32_t tlen; + + while (len) { + if (xbp->xb_left <= 0) + return (EBADRPC); + tlen = MIN(xbp->xb_left, len); + if (tlen) { + xbp->xb_ptr += tlen; + xbp->xb_left -= tlen; + len -= tlen; + } + } + return 
(0); +} + +/* + * Calculate the current offset in the XDR buffer. + */ +int +xb_offset(struct xdrbuf *xbp) +{ + uint32_t offset = 0; + + switch (xbp->xb_type) { + case XDRBUF_BUFFER: + offset = xbp->xb_ptr - xbp->xb_u.xb_buffer.xbb_base; + break; + } + + return (offset); +} + +/* + * Seek to the given offset in the existing data in the XDR buffer. + */ +int +xb_seek(struct xdrbuf *xbp, uint32_t offset) +{ + + switch (xbp->xb_type) { + case XDRBUF_BUFFER: + xbp->xb_ptr = xbp->xb_u.xb_buffer.xbb_base + offset; + xbp->xb_left = xbp->xb_u.xb_buffer.xbb_len - offset; + break; + } + + return (0); +} + +/* + * allocate memory + */ +void * +xb_malloc(size_t size) +{ + void *buf = NULL; + +#ifdef KERNEL + MALLOC(buf, void *, size, M_TEMP, M_WAITOK); +#else + buf = malloc(size); +#endif + return (buf); +} +/* + * free a chunk of memory allocated with xb_malloc() + */ +void +xb_free(void *buf) +{ +#ifdef KERNEL + FREE(buf, M_TEMP); +#else + free(buf); +#endif +} + +/* + * Increase space available for new data in XDR buffer. + */ +int +xb_grow(struct xdrbuf *xbp) +{ + char *newbuf, *oldbuf; + size_t newsize, oldsize; + + switch (xbp->xb_type) { + case XDRBUF_BUFFER: + oldsize = xbp->xb_u.xb_buffer.xbb_size; + oldbuf = xbp->xb_u.xb_buffer.xbb_base; + newsize = oldsize + xbp->xb_growsize; + newbuf = xb_malloc(newsize); + if (newbuf == NULL) + return (ENOMEM); + if (oldbuf != NULL) { + bcopy(oldbuf, newbuf, oldsize); + xb_free(oldbuf); + } + xbp->xb_u.xb_buffer.xbb_base = newbuf; + xbp->xb_u.xb_buffer.xbb_size = newsize; + xbp->xb_ptr = newbuf + oldsize; + xbp->xb_left = xbp->xb_growsize; + break; + } + + return (0); +} + +/* + * xb_add_bytes() + * + * Add "count" bytes of opaque data pointed to by "buf" to the given XDR buffer. + */ +int +xb_add_bytes(struct xdrbuf *xbp, const char *buf, uint32_t count, int nopad) +{ + uint32_t len, tlen; + int error; + + len = nopad ? count : xdr_rndup(count); + + /* copy in "count" bytes and zero out any pad bytes */ + while (len) { + if (xbp->xb_left <= 0) { + /* need more space */ + if ((error = xb_grow(xbp))) + return (error); + if (xbp->xb_left <= 0) + return (ENOMEM); + } + tlen = MIN(xbp->xb_left, len); + if (tlen) { + if (count) { + if (tlen > count) + tlen = count; + bcopy(buf, xbp->xb_ptr, tlen); + } else { + bzero(xbp->xb_ptr, tlen); + } + xbp->xb_ptr += tlen; + xbp->xb_left -= tlen; + len -= tlen; + if (count) { + buf += tlen; + count -= tlen; + } + } + } + return (0); +} + +/* + * xb_get_bytes() + * + * Get "count" bytes of opaque data from the given XDR buffer. + */ +int +xb_get_bytes(struct xdrbuf *xbp, char *buf, uint32_t count, int nopad) +{ + uint32_t len, tlen; + + len = nopad ? 
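+/*
+ * Illustrative usage sketch, not part of the original header: building a
+ * small XDR stream with the functions above and the xb_add_* / xb_get_*
+ * macros that follow.  The buffer starts out empty (base NULL, size 0),
+ * so the first append grows it through xb_grow(); the value 3 and the
+ * "export" string are made up for the example.
+ *
+ *	struct xdrbuf xb;
+ *	int error = 0;
+ *
+ *	xb_init_buffer(&xb, NULL, 0);		empty, growable buffer
+ *	xb_add_32(error, &xb, 3);		append one 4-byte XDR word
+ *	xb_add_string(error, &xb, "export", 6);	length word plus padded bytes
+ *	xb_build_done(error, &xb);		record the final data length
+ *	if (!error)
+ *		...hand xb_buffer_base(&xb) off, length via xb_offset(&xb)...
+ *	xb_cleanup(&xb);			free the grown buffer
+ */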
count : xdr_rndup(count); + + /* copy in "count" bytes and zero out any pad bytes */ + while (len) { + if (xbp->xb_left <= 0) + return (ENOMEM); + tlen = MIN(xbp->xb_left, len); + if (tlen) { + if (count) { + if (tlen > count) + tlen = count; + bcopy(xbp->xb_ptr, buf, tlen); + } + xbp->xb_ptr += tlen; + xbp->xb_left -= tlen; + len -= tlen; + if (count) { + buf += tlen; + count -= tlen; + } + } + } + return (0); +} + +#endif /* _NFS_XDR_SUBS_FUNCS_ */ + + +/* + * macros for building XDR data + */ + +/* finalize the data that has been added to the buffer */ +#define xb_build_done(E, XB) \ + do { \ + if (E) break; \ + xb_set_cur_buf_len(XB); \ + } while (0) + +/* add a 32-bit value */ +#define xb_add_32(E, XB, VAL) \ + do { \ + uint32_t __tmp; \ + if (E) break; \ + __tmp = txdr_unsigned(VAL); \ + (E) = xb_add_bytes((XB), (void*)&__tmp, XDRWORD, 0); \ + } while (0) + +/* add a 64-bit value */ +#define xb_add_64(E, XB, VAL) \ + do { \ + uint64_t __tmp1, __tmp2; \ + if (E) break; \ + __tmp1 = (VAL); \ + txdr_hyper(&__tmp1, &__tmp2); \ + (E) = xb_add_bytes((XB), (char*)&__tmp2, 2 * XDRWORD, 0); \ + } while (0) + +/* add an array of XDR words */ +#define xb_add_word_array(E, XB, A, LEN) \ + do { \ + uint32_t __i; \ + xb_add_32((E), (XB), (LEN)); \ + for (__i=0; __i < (uint32_t)(LEN); __i++) \ + xb_add_32((E), (XB), (A)[__i]); \ + } while (0) +#define xb_add_bitmap(E, XB, B, LEN) xb_add_word_array((E), (XB), (B), (LEN)) + +/* add a file handle */ +#define xb_add_fh(E, XB, FHP, FHLEN) \ + do { \ + xb_add_32((E), (XB), (FHLEN)); \ + if (E) break; \ + (E) = xb_add_bytes((XB), (char*)(FHP), (FHLEN), 0); \ + } while (0) + +/* add a string */ +#define xb_add_string(E, XB, S, LEN) \ + do { \ + xb_add_32((E), (XB), (LEN)); \ + if (E) break; \ + (E) = xb_add_bytes((XB), (const char*)(S), (LEN), 0); \ + } while (0) + + +/* + * macros for decoding XDR data + */ + +/* skip past data in the buffer */ +#define xb_skip(E, XB, LEN) \ + do { \ + if (E) break; \ + (E) = xb_advance((XB), (LEN)); \ + } while (0) + +/* get a 32-bit value */ +#define xb_get_32(E, XB, LVAL) \ + do { \ + uint32_t __tmp; \ + if (E) break; \ + (E) = xb_get_bytes((XB), (char*)&__tmp, XDRWORD, 0); \ + if (E) break; \ + (LVAL) = fxdr_unsigned(uint32_t, __tmp); \ + } while (0) + +/* get a 64-bit value */ +#define xb_get_64(E, XB, LVAL) \ + do { \ + uint64_t __tmp; \ + if (E) break; \ + (E) = xb_get_bytes((XB), (char*)&__tmp, 2 * XDRWORD, 0); \ + if (E) break; \ + fxdr_hyper(&__tmp, &(LVAL)); \ + } while (0) + +/* get an array of XDR words (of a given expected/maximum length) */ +#define xb_get_word_array(E, XB, A, LEN) \ + do { \ + uint32_t __len = 0, __i; \ + xb_get_32((E), (XB), __len); \ + if (E) break; \ + for (__i=0; __i < MIN(__len, (uint32_t)(LEN)); __i++) \ + xb_get_32((E), (XB), (A)[__i]); \ + if (E) break; \ + for (; __i < __len; __i++) \ + xb_skip((E), (XB), XDRWORD); \ + for (; __i < (uint32_t)(LEN); __i++) \ + (A)[__i] = 0; \ + (LEN) = __len; \ + } while (0) +#define xb_get_bitmap(E, XB, B, LEN) xb_get_word_array((E), (XB), (B), (LEN)) + #endif /* __APPLE_API_PRIVATE */ #endif /* _NFS_XDR_SUBS_H_ */ diff --git a/bsd/ppc/Makefile b/bsd/ppc/Makefile deleted file mode 100644 index 21878d7f3..000000000 --- a/bsd/ppc/Makefile +++ /dev/null @@ -1,33 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - - -include $(MakeInc_cmd) -include $(MakeInc_def) - -DATAFILES = \ - 
endian.h fasttrap_isa.h param.h profile.h \ - setjmp.h signal.h limits.h _limits.h \ - types.h vmparam.h _structs.h _types.h _param.h - -KERNELFILES = \ - disklabel.h \ - endian.h param.h profile.h \ - signal.h limits.h _limits.h \ - types.h vmparam.h _structs.h _types.h _param.h - -INSTALL_MD_LIST = ${DATAFILES} -INSTALL_MD_LCL_LIST = ${DATAFILES} disklabel.h - -INSTALL_MD_DIR = ppc - -EXPORT_MD_LIST = ${KERNELFILES} - -EXPORT_MD_DIR = ppc - -include $(MakeInc_rule) -include $(MakeInc_dir) - - diff --git a/bsd/ppc/_limits.h b/bsd/ppc/_limits.h deleted file mode 100644 index d512ec411..000000000 --- a/bsd/ppc/_limits.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_LICENSE_HEADER_START@ - * - * The contents of this file constitute Original Code as defined in and - * are subject to the Apple Public Source License Version 1.1 (the - * "License"). You may not use this file except in compliance with the - * License. Please obtain a copy of the License at - * http://www.apple.com/publicsource and read it before using this file. - * - * This Original Code and all software distributed under the License are - * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the - * License for the specific language governing rights and limitations - * under the License. - * - * @APPLE_LICENSE_HEADER_END@ - */ -#ifndef _PPC__LIMITS_H_ -#define _PPC__LIMITS_H_ - -#define __DARWIN_CLK_TCK 100 /* ticks per second */ - -#endif /* _PPC__LIMITS_H_ */ diff --git a/bsd/ppc/_param.h b/bsd/ppc/_param.h deleted file mode 100644 index 938fc499f..000000000 --- a/bsd/ppc/_param.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _PPC__PARAM_H_ -#define _PPC__PARAM_H_ - -#include <ppc/_types.h> - -/* - * Round p (pointer or byte index) up to a correctly-aligned value for all - * data types (int, long, ...). The result is unsigned int and must be - * cast to any desired pointer type. 
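- *
- * Illustrative note, not in the original file: with an 8-byte
- * __darwin_size_t, __DARWIN_ALIGNBYTES below is 7, so for example
- * __DARWIN_ALIGN(0x1005) computes (0x1005 + 7) & ~7 = 0x1008.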
- */ -#define __DARWIN_ALIGNBYTES (sizeof(__darwin_size_t) - 1) -#define __DARWIN_ALIGN(p) ((__darwin_size_t)((char *)(__darwin_size_t)(p) + __DARWIN_ALIGNBYTES) &~ __DARWIN_ALIGNBYTES) - -#define __DARWIN_ALIGNBYTES32 (sizeof(__uint32_t) - 1) -#define __DARWIN_ALIGN32(p) ((__darwin_size_t)((char *)(__darwin_size_t)(p) + __DARWIN_ALIGNBYTES32) &~ __DARWIN_ALIGNBYTES32) - - -#endif /* _PPC__PARAM_H_ */ diff --git a/bsd/ppc/_structs.h b/bsd/ppc/_structs.h deleted file mode 100644 index c028f7efb..000000000 --- a/bsd/ppc/_structs.h +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#include <sys/cdefs.h> - -#ifdef __need_mcontext_t -#ifndef __need_struct_mcontext -#define __need_struct_mcontext -#endif /* __need_struct_mcontext */ -#endif /* __need_mcontext_t */ - -#ifdef __need_mcontext64_t -#ifndef __need_struct_mcontext64 -#define __need_struct_mcontext64 -#endif /* __need_struct_mcontext64 */ -#endif /* __need_mcontext64_t */ - -#if defined(__need_struct_mcontext) || defined(__need_struct_mcontext64) -#include <mach/ppc/_structs.h> -#endif /* __need_struct_mcontext || __need_struct_mcontext64 */ - -#ifdef __need_struct_mcontext -#undef __need_struct_mcontext -#ifndef _STRUCT_MCONTEXT -#if __DARWIN_UNIX03 -#define _STRUCT_MCONTEXT struct __darwin_mcontext -_STRUCT_MCONTEXT -{ - _STRUCT_PPC_EXCEPTION_STATE __es; - _STRUCT_PPC_THREAD_STATE __ss; - _STRUCT_PPC_FLOAT_STATE __fs; - _STRUCT_PPC_VECTOR_STATE __vs; -}; -#else /* !__DARWIN_UNIX03 */ -#define _STRUCT_MCONTEXT struct mcontext -_STRUCT_MCONTEXT -{ - _STRUCT_PPC_EXCEPTION_STATE es; - _STRUCT_PPC_THREAD_STATE ss; - _STRUCT_PPC_FLOAT_STATE fs; - _STRUCT_PPC_VECTOR_STATE vs; -}; -#endif /* __DARWIN_UNIX03 */ -#endif /* _STRUCT_MCONTEXT */ -#endif /* __need_struct_mcontext */ - -#ifdef __need_struct_mcontext64 -#undef __need_struct_mcontext64 -#ifndef _STRUCT_MCONTEXT64 -#if __DARWIN_UNIX03 -#define _STRUCT_MCONTEXT64 struct __darwin_mcontext64 -_STRUCT_MCONTEXT64 -{ - _STRUCT_PPC_EXCEPTION_STATE64 __es; - _STRUCT_PPC_THREAD_STATE64 __ss; - _STRUCT_PPC_FLOAT_STATE __fs; - _STRUCT_PPC_VECTOR_STATE __vs; -}; -#else /* !__DARWIN_UNIX03 */ -#define _STRUCT_MCONTEXT64 struct mcontext64 -_STRUCT_MCONTEXT64 -{ - _STRUCT_PPC_EXCEPTION_STATE64 es; - _STRUCT_PPC_THREAD_STATE64 ss; - _STRUCT_PPC_FLOAT_STATE fs; - _STRUCT_PPC_VECTOR_STATE vs; -}; -#endif /* __DARWIN_UNIX03 */ -#endif /* _STRUCT_MCONTEXT64 */ -#endif /* __need_struct_mcontext64 */ - -#ifdef __need_mcontext_t -#undef __need_mcontext_t -#ifndef _MCONTEXT_T -#define _MCONTEXT_T -typedef _STRUCT_MCONTEXT *mcontext_t; -#endif /* _MCONTEXT_T */ -#endif /* __need_mcontext_t */ - -#ifdef __need_mcontext64_t -#undef __need_mcontext64_t -#ifndef _MCONTEXT64_T -#define _MCONTEXT64_T -typedef _STRUCT_MCONTEXT64 *mcontext64_t; -#endif /* _MCONTEXT64_T */ -#endif /* __need_mcontext64_t */ - -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -#ifndef PPC_MCONTEXT_SIZE -#define PPC_MCONTEXT_SIZE (PPC_THREAD_STATE_COUNT + PPC_FLOAT_STATE_COUNT + PPC_EXCEPTION_STATE_COUNT + PPC_VECTOR_STATE_COUNT) * sizeof(int) -#endif /* PPC_MCONTEXT_SIZE */ -#ifndef PPC_MCONTEXT64_SIZE -#define PPC_MCONTEXT64_SIZE (PPC_THREAD_STATE64_COUNT + PPC_FLOAT_STATE_COUNT + PPC_EXCEPTION_STATE_COUNT + PPC_VECTOR_STATE_COUNT) * sizeof(int) -#endif /* PPC_MCONTEXT64_SIZE */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ - -/* - * For now, just duplicate the 32-bit context as the generic one. 
- */ -#ifdef __need_struct_sigcontext -#undef __need_struct_sigcontext -#ifndef _STRUCT_SIGCONTEXT -#if __DARWIN_UNIX03 /* signal.h needs struct sigcontext visible */ -#define _STRUCT_SIGCONTEXT struct __darwin_sigcontext -_STRUCT_SIGCONTEXT -{ - int __sc_onstack; /* sigstack state to restore */ - int __sc_mask; /* signal mask to restore */ - int __sc_ir; /* pc */ - int __sc_psw; /* processor status word */ - int __sc_sp; /* stack pointer if sc_regs == NULL */ - void *__sc_regs; /* (kernel private) saved state */ -}; -#else /* !__DARWIN_UNIX03 */ -#define _STRUCT_SIGCONTEXT struct sigcontext -_STRUCT_SIGCONTEXT -{ - int sc_onstack; /* sigstack state to restore */ - int sc_mask; /* signal mask to restore */ - int sc_ir; /* pc */ - int sc_psw; /* processor status word */ - int sc_sp; /* stack pointer if sc_regs == NULL */ - void *sc_regs; /* (kernel private) saved state */ -}; -#endif /* __DARWIN_UNIX03 */ -#endif /* _STRUCT_SIGCONTEXT */ -#endif /* __need_struct_sigcontext */ - -/* - * Information pushed on stack when a signal is delivered. - * This is used by the kernel to restore state following - * execution of the signal handler. It is also made available - * to the handler to allow it to properly restore state if - * a non-standard exit is performed. - */ -#ifdef __need_struct_sigcontext32 -#undef __need_struct_sigcontext32 -#ifndef _STRUCT_SIGCONTEXT32 -#if __DARWIN_UNIX03 -#define _STRUCT_SIGCONTEXT32 struct __darwin_sigcontext32 -_STRUCT_SIGCONTEXT32 -{ - int __sc_onstack; /* sigstack state to restore */ - int __sc_mask; /* signal mask to restore */ - int __sc_ir; /* pc */ - int __sc_psw; /* processor status word */ - int __sc_sp; /* stack pointer if sc_regs == NULL */ - void *__sc_regs; /* (kernel private) saved state */ -}; -#else /* !__DARWIN_UNIX03 */ -#define _STRUCT_SIGCONTEXT32 struct sigcontext32 -_STRUCT_SIGCONTEXT32 -{ - int sc_onstack; /* sigstack state to restore */ - int sc_mask; /* signal mask to restore */ - int sc_ir; /* pc */ - int sc_psw; /* processor status word */ - int sc_sp; /* stack pointer if sc_regs == NULL */ - void *sc_regs; /* (kernel private) saved state */ -}; -#endif /* __DARWIN_UNIX03 */ -#endif /* _STRUCT_SIGCONTEXT32 */ -#endif /* __need_struct_sigcontext32 */ - -#ifdef __need_struct_sigcontext64 -#undef __need_struct_sigcontext64 -#ifndef _STRUCT_SIGCONTEXT64 -#if __DARWIN_UNIX03 -#define _STRUCT_SIGCONTEXT64 struct __darwin_sigcontext64 -_STRUCT_SIGCONTEXT64 -{ - int __sc_onstack; /* sigstack state to restore */ - int __sc_mask; /* signal mask to restore */ - long long __sc_ir; /* pc */ - long long __sc_psw; /* processor status word */ - long long __sc_sp; /* stack pointer if sc_regs == NULL */ - void *__sc_regs; /* (kernel private) saved state */ -}; -#else /* !__DARWIN_UNIX03 */ -#define _STRUCT_SIGCONTEXT64 struct sigcontext64 -_STRUCT_SIGCONTEXT64 -{ - int sc_onstack; /* sigstack state to restore */ - int sc_mask; /* signal mask to restore */ - long long sc_ir; /* pc */ - long long sc_psw; /* processor status word */ - long long sc_sp; /* stack pointer if sc_regs == NULL */ - void *sc_regs; /* (kernel private) saved state */ -}; -#endif /* __DARWIN_UNIX03 */ -#endif /* _STRUCT_SIGCONTEXT64 */ -#endif /* __need_struct_sigcontext64 */ diff --git a/bsd/ppc/_types.h b/bsd/ppc/_types.h deleted file mode 100644 index 4b7855988..000000000 --- a/bsd/ppc/_types.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -#ifndef _BSD_PPC__TYPES_H_ -#define _BSD_PPC__TYPES_H_ - -/* - * This header file contains integer types. It's intended to also contain - * flotaing point and other arithmetic types, as needed, later. - */ - -#ifdef __GNUC__ -typedef __signed char __int8_t; -#else /* !__GNUC__ */ -typedef char __int8_t; -#endif /* !__GNUC__ */ -typedef unsigned char __uint8_t; -typedef short __int16_t; -typedef unsigned short __uint16_t; -typedef int __int32_t; -typedef unsigned int __uint32_t; -typedef long long __int64_t; -typedef unsigned long long __uint64_t; - -typedef long __darwin_intptr_t; -typedef unsigned int __darwin_natural_t; - -/* - * The rune type below is declared to be an ``int'' instead of the more natural - * ``unsigned long'' or ``long''. Two things are happening here. It is not - * unsigned so that EOF (-1) can be naturally assigned to it and used. Also, - * it looks like 10646 will be a 31 bit standard. This means that if your - * ints cannot hold 32 bits, you will be in trouble. The reason an int was - * chosen over a long is that the is*() and to*() routines take ints (says - * ANSI C), but they use __darwin_ct_rune_t instead of int. By changing it - * here, you lose a bit of ANSI conformance, but your programs will still - * work. - * - * NOTE: rune_t is not covered by ANSI nor other standards, and should not - * be instantiated outside of lib/libc/locale. Use wchar_t. wchar_t and - * rune_t must be the same type. Also wint_t must be no narrower than - * wchar_t, and should also be able to hold all members of the largest - * character set plus one extra value (WEOF). wint_t must be at least 16 bits. - */ - -typedef int __darwin_ct_rune_t; /* ct_rune_t */ - -/* - * mbstate_t is an opaque object to keep conversion state, during multibyte - * stream conversions. The content must not be referenced by user programs. 
- */ -typedef union { - char __mbstate8[128]; - long long _mbstateL; /* for alignment */ -} __mbstate_t; - -typedef __mbstate_t __darwin_mbstate_t; /* mbstate_t */ - -#if defined(__GNUC__) && defined(__PTRDIFF_TYPE__) -typedef __PTRDIFF_TYPE__ __darwin_ptrdiff_t; /* ptr1 - ptr2 */ -#else -typedef int __darwin_ptrdiff_t; /* ptr1 - ptr2 */ -#endif /* __GNUC__ */ - -#if defined(__GNUC__) && defined(__SIZE_TYPE__) -typedef __SIZE_TYPE__ __darwin_size_t; /* sizeof() */ -#else -typedef unsigned long __darwin_size_t; /* sizeof() */ -#endif - -#if (__GNUC__ > 2) -typedef __builtin_va_list __darwin_va_list; /* va_list */ -#else -typedef char * __darwin_va_list; /* va_list */ -#endif - -#if defined(__GNUC__) && defined(__WCHAR_TYPE__) -typedef __WCHAR_TYPE__ __darwin_wchar_t; /* wchar_t */ -#else -typedef __darwin_ct_rune_t __darwin_wchar_t; /* wchar_t */ -#endif - -typedef __darwin_wchar_t __darwin_rune_t; /* rune_t */ - -#if defined(__GNUC__) && defined(__WINT_TYPE__) -typedef __WINT_TYPE__ __darwin_wint_t; /* wint_t */ -#else -typedef __darwin_ct_rune_t __darwin_wint_t; /* wint_t */ -#endif - -typedef unsigned long __darwin_clock_t; /* clock() */ -typedef __uint32_t __darwin_socklen_t; /* socklen_t (duh) */ -typedef long __darwin_ssize_t; /* byte count or error */ -typedef long __darwin_time_t; /* time() */ - -#endif /* _BSD_PPC__TYPES_H_ */ diff --git a/bsd/ppc/decodePPC.h b/bsd/ppc/decodePPC.h deleted file mode 100644 index 8fb4756f6..000000000 --- a/bsd/ppc/decodePPC.h +++ /dev/null @@ -1,919 +0,0 @@ -/* - * Copyright (c) 2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -struct dcdtab { - - uint8_t dcdFlgs; /* Flags needed to decode */ -#define dcdStep 0x80 /* Step to next table entry on non-match */ -#define dcdJump 0x40 /* Jump to new entry in table. Index is in dcdMatch. */ -#define dcdMask 0x0F /* Index into mask table. 
0 matches everything */ - - uint8_t dcdType; /* Instruction type */ -#define diINV 0x00 -#define diTRP 0x01 -#define diSC 0x02 -#define diRFI 0x03 -#define diB 0x04 -#define diBC 0x05 -#define diBLR 0x06 -#define diBCTR 0x07 -#define diOR 0x08 -#define diSPR 0x09 -#define diCMN 0x0A -#define diPRV 0x0B - - uint16_t dcdMatch; /* Extended op code to match */ -}; - -typedef struct dcdtab dcdtab; - -static uint16_t masktab[] = {0x0000, 0x0003, 0x001C, 0x001E, 0x003E, /* Table of extended op masks */ - 0x003F, 0x03FE, 0x03FF, 0x07FC, 0x07FE, 0x07FF}; - -static dcdtab insts[] = { - { 0x40, 0, 64 }, // 0 Maj op = 0, jump to entry 64 - { 0x00, diINV, 0x0000 }, // 1 Maj op = 1, invalid - { 0x00, diTRP, 0x0000 }, // 2 Maj op = 2, tdi - { 0x00, diTRP, 0x0000 }, // 3 Maj op = 3, twi - { 0x40, 0, 65 }, // 4 Maj op = 4, jump to entry 65 - { 0x00, diINV, 0x0000 }, // 5 Maj op = 5, invalid - { 0x00, diINV, 0x0000 }, // 6 Maj op = 6, invalid - { 0x00, diCMN, 0x0000 }, // 7 Maj op = 7, mulli - { 0x00, diCMN, 0x0000 }, // 8 Maj op = 8, subfic - { 0x00, diINV, 0x0000 }, // 9 Maj op = 9, invalid - { 0x00, diCMN, 0x0000 }, // 10 Maj op = 10, cmpli - { 0x00, diCMN, 0x0000 }, // 11 Maj op = 11, cmpi - { 0x00, diCMN, 0x0000 }, // 12 Maj op = 12, addic - { 0x00, diCMN, 0x0000 }, // 13 Maj op = 13, addic. - { 0x00, diCMN, 0x0000 }, // 14 Maj op = 14, addi - { 0x00, diCMN, 0x0000 }, // 15 Maj op = 15, addis - { 0x00, diBC, 0x0000 }, // 16 Maj op = 16, bc - { 0x00, diSC, 0x0000 }, // 17 Maj op = 17, sc - { 0x00, diB, 0x0000 }, // 18 Maj op = 18, b - { 0x40, 0, 209 }, // 19 Maj op = 19, jump to entry 209 - { 0x00, diCMN, 0x0000 }, // 20 Maj op = 20, rlwimi - { 0x00, diCMN, 0x0000 }, // 21 Maj op = 21, rlwinm - { 0x00, diINV, 0x0000 }, // 22 Maj op = 22, invalid - { 0x00, diCMN, 0x0000 }, // 23 Maj op = 23, rlwnm - { 0x00, diOR, 0x0000 }, // 24 Maj op = 24, ori - { 0x00, diCMN, 0x0000 }, // 25 Maj op = 25, oris - { 0x00, diCMN, 0x0000 }, // 26 Maj op = 26, xori - { 0x00, diCMN, 0x0000 }, // 27 Maj op = 27, xoris - { 0x00, diCMN, 0x0000 }, // 28 Maj op = 28, andi. - { 0x00, diCMN, 0x0000 }, // 29 Maj op = 29, andis. 
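-/*
- * Illustrative sketch, not part of the original file, inferred from the
- * dcdStep/dcdJump/dcdMask definitions above: a lookup for one instruction
- * would walk the table roughly like this (extraction of the
- * extended-opcode field is elided):
- *
- *	uint16_t xop = ...extended-opcode bits of instr...;
- *	dcdtab *dp = &insts[instr >> 26];	index by major opcode
- *	if (dp->dcdFlgs & dcdJump)		follow a jump entry
- *		dp = &insts[dp->dcdMatch];
- *	while ((xop & masktab[dp->dcdFlgs & dcdMask]) != dp->dcdMatch) {
- *		if (!(dp->dcdFlgs & dcdStep)) {
- *			dp = NULL;		no match and no step entry
- *			break;
- *		}
- *		dp++;				try the next candidate
- *	}
- *	type = dp ? dp->dcdType : diINV;
- */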
- { 0x40, 0, 224 }, // 30 Maj op = 30, jump to entry 224 - { 0x40, 0, 230 }, // 31 Maj op = 31, jump to entry 230 - { 0x00, diCMN, 0x0000 }, // 32 Maj op = 32, lwz - { 0x00, diCMN, 0x0000 }, // 33 Maj op = 33, lwzu - { 0x00, diCMN, 0x0000 }, // 34 Maj op = 34, lbz - { 0x00, diCMN, 0x0000 }, // 35 Maj op = 35, lbzu - { 0x00, diCMN, 0x0000 }, // 36 Maj op = 36, stw - { 0x00, diCMN, 0x0000 }, // 37 Maj op = 37, stwu - { 0x00, diCMN, 0x0000 }, // 38 Maj op = 38, stb - { 0x00, diCMN, 0x0000 }, // 39 Maj op = 39, stbu - { 0x00, diCMN, 0x0000 }, // 40 Maj op = 40, lhz - { 0x00, diCMN, 0x0000 }, // 41 Maj op = 41, lhzu - { 0x00, diCMN, 0x0000 }, // 42 Maj op = 42, lha - { 0x00, diCMN, 0x0000 }, // 43 Maj op = 43, lhau - { 0x00, diCMN, 0x0000 }, // 44 Maj op = 44, sth - { 0x00, diCMN, 0x0000 }, // 45 Maj op = 45, sthu - { 0x00, diCMN, 0x0000 }, // 46 Maj op = 46, lmw - { 0x00, diCMN, 0x0000 }, // 47 Maj op = 47, stmw - { 0x00, diCMN, 0x0000 }, // 48 Maj op = 48, lfs - { 0x00, diCMN, 0x0000 }, // 49 Maj op = 49, lfsu - { 0x00, diCMN, 0x0000 }, // 50 Maj op = 50, lfd - { 0x00, diCMN, 0x0000 }, // 51 Maj op = 51, lfdu - { 0x00, diCMN, 0x0000 }, // 52 Maj op = 52, stfs - { 0x00, diCMN, 0x0000 }, // 53 Maj op = 53, stfsu - { 0x00, diCMN, 0x0000 }, // 54 Maj op = 54, stfd - { 0x00, diCMN, 0x0000 }, // 55 Maj op = 55, stfdu - { 0x00, diINV, 0x0000 }, // 56 Maj op = 56, invalid - { 0x00, diINV, 0x0000 }, // 57 Maj op = 57, invalid - { 0x40, 0, 365 }, // 58 Maj op = 58, jump to entry 365 - { 0x40, 0, 368 }, // 59 Maj op = 59, jump to entry 368 - { 0x00, diINV, 0x0000 }, // 60 Maj op = 60, invalid - { 0x00, diINV, 0x0000 }, // 61 Maj op = 61, invalid - { 0x40, 0, 378 }, // 62 Maj op = 62, jump to entry 378 - { 0x40, 0, 380 }, // 63 Maj op = 63, jump to entry 380 - { 0x09, diCMN, 0x0200 }, // 64 Maj op = 0, mask = 07FE, xop = 0x0200 ( 256) - attn - { 0x85, diCMN, 0x0020 }, // 65 Maj op = 4, mask = 003F, xop = 0x0020 ( 32) - vmhaddshs - { 0x85, diCMN, 0x0021 }, // 66 Maj op = 4, mask = 003F, xop = 0x0021 ( 33) - vmhraddshs - { 0x85, diCMN, 0x0022 }, // 67 Maj op = 4, mask = 003F, xop = 0x0022 ( 34) - vmladduhm - { 0x85, diCMN, 0x0024 }, // 68 Maj op = 4, mask = 003F, xop = 0x0024 ( 36) - vmsumubm - { 0x85, diCMN, 0x0025 }, // 69 Maj op = 4, mask = 003F, xop = 0x0025 ( 37) - vmsummbm - { 0x85, diCMN, 0x0026 }, // 70 Maj op = 4, mask = 003F, xop = 0x0026 ( 38) - vmsumuhm - { 0x85, diCMN, 0x0027 }, // 71 Maj op = 4, mask = 003F, xop = 0x0027 ( 39) - vmsumuhs - { 0x85, diCMN, 0x0028 }, // 72 Maj op = 4, mask = 003F, xop = 0x0028 ( 40) - vmsumshm - { 0x85, diCMN, 0x0029 }, // 73 Maj op = 4, mask = 003F, xop = 0x0029 ( 41) - vmsumshs - { 0x85, diCMN, 0x002A }, // 74 Maj op = 4, mask = 003F, xop = 0x002A ( 42) - vsel - { 0x85, diCMN, 0x002B }, // 75 Maj op = 4, mask = 003F, xop = 0x002B ( 43) - vperm - { 0x85, diCMN, 0x002C }, // 76 Maj op = 4, mask = 003F, xop = 0x002C ( 44) - vsldoi - { 0x85, diCMN, 0x002E }, // 77 Maj op = 4, mask = 003F, xop = 0x002E ( 46) - vmaddfp - { 0x85, diCMN, 0x002F }, // 78 Maj op = 4, mask = 003F, xop = 0x002F ( 47) - vnmsubfp - { 0x87, diCMN, 0x0006 }, // 79 Maj op = 4, mask = 03FF, xop = 0x0006 ( 6) - vcmpequb - { 0x87, diCMN, 0x0046 }, // 80 Maj op = 4, mask = 03FF, xop = 0x0046 ( 70) - vcmpequh - { 0x87, diCMN, 0x0086 }, // 81 Maj op = 4, mask = 03FF, xop = 0x0086 ( 134) - vcmpequw - { 0x87, diCMN, 0x00C6 }, // 82 Maj op = 4, mask = 03FF, xop = 0x00C6 ( 198) - vcmpeqfp - { 0x87, diCMN, 0x01C6 }, // 83 Maj op = 4, mask = 03FF, xop = 0x01C6 ( 454) - vcmpgefp - { 0x87, diCMN, 0x0206 }, 
// 84 Maj op = 4, mask = 03FF, xop = 0x0206 ( 518) - vcmpgtub - { 0x87, diCMN, 0x0246 }, // 85 Maj op = 4, mask = 03FF, xop = 0x0246 ( 582) - vcmpgtuh - { 0x87, diCMN, 0x0286 }, // 86 Maj op = 4, mask = 03FF, xop = 0x0286 ( 646) - vcmpgtuw - { 0x87, diCMN, 0x02C6 }, // 87 Maj op = 4, mask = 03FF, xop = 0x02C6 ( 710) - vcmpgtfp - { 0x87, diCMN, 0x0306 }, // 88 Maj op = 4, mask = 03FF, xop = 0x0306 ( 774) - vcmpgtsb - { 0x87, diCMN, 0x0346 }, // 89 Maj op = 4, mask = 03FF, xop = 0x0346 ( 838) - vcmpgtsh - { 0x87, diCMN, 0x0386 }, // 90 Maj op = 4, mask = 03FF, xop = 0x0386 ( 902) - vcmpgtsw - { 0x87, diCMN, 0x03C6 }, // 91 Maj op = 4, mask = 03FF, xop = 0x03C6 ( 966) - vcmpbfp - { 0x8A, diCMN, 0x0000 }, // 92 Maj op = 4, mask = 07FF, xop = 0x0000 ( 0) - vaddubm - { 0x8A, diCMN, 0x0002 }, // 93 Maj op = 4, mask = 07FF, xop = 0x0002 ( 2) - vmaxub - { 0x8A, diCMN, 0x0004 }, // 94 Maj op = 4, mask = 07FF, xop = 0x0004 ( 4) - vrlb - { 0x8A, diCMN, 0x0008 }, // 95 Maj op = 4, mask = 07FF, xop = 0x0008 ( 8) - vmuloub - { 0x8A, diCMN, 0x000A }, // 96 Maj op = 4, mask = 07FF, xop = 0x000A ( 10) - vaddfp - { 0x8A, diCMN, 0x000C }, // 97 Maj op = 4, mask = 07FF, xop = 0x000C ( 12) - vmrghb - { 0x8A, diCMN, 0x000E }, // 98 Maj op = 4, mask = 07FF, xop = 0x000E ( 14) - vpkuhum - { 0x8A, diCMN, 0x0040 }, // 99 Maj op = 4, mask = 07FF, xop = 0x0040 ( 64) - vadduhm - { 0x8A, diCMN, 0x0042 }, // 100 Maj op = 4, mask = 07FF, xop = 0x0042 ( 66) - vmaxuh - { 0x8A, diCMN, 0x0044 }, // 101 Maj op = 4, mask = 07FF, xop = 0x0044 ( 68) - vrlh - { 0x8A, diCMN, 0x0048 }, // 102 Maj op = 4, mask = 07FF, xop = 0x0048 ( 72) - vmulouh - { 0x8A, diCMN, 0x004A }, // 103 Maj op = 4, mask = 07FF, xop = 0x004A ( 74) - vsubfp - { 0x8A, diCMN, 0x004C }, // 104 Maj op = 4, mask = 07FF, xop = 0x004C ( 76) - vmrghh - { 0x8A, diCMN, 0x004E }, // 105 Maj op = 4, mask = 07FF, xop = 0x004E ( 78) - vpkuwum - { 0x8A, diCMN, 0x0080 }, // 106 Maj op = 4, mask = 07FF, xop = 0x0080 ( 128) - vadduwm - { 0x8A, diCMN, 0x0082 }, // 107 Maj op = 4, mask = 07FF, xop = 0x0082 ( 130) - vmaxuw - { 0x8A, diCMN, 0x0084 }, // 108 Maj op = 4, mask = 07FF, xop = 0x0084 ( 132) - vrlw - { 0x8A, diCMN, 0x008C }, // 109 Maj op = 4, mask = 07FF, xop = 0x008C ( 140) - vmrghw - { 0x8A, diCMN, 0x008E }, // 110 Maj op = 4, mask = 07FF, xop = 0x008E ( 142) - vpkuhus - { 0x8A, diCMN, 0x00CE }, // 111 Maj op = 4, mask = 07FF, xop = 0x00CE ( 206) - vpkuwus - { 0x8A, diCMN, 0x0102 }, // 112 Maj op = 4, mask = 07FF, xop = 0x0102 ( 258) - vmaxsb - { 0x8A, diCMN, 0x0104 }, // 113 Maj op = 4, mask = 07FF, xop = 0x0104 ( 260) - vslb - { 0x8A, diCMN, 0x0108 }, // 114 Maj op = 4, mask = 07FF, xop = 0x0108 ( 264) - vmulosb - { 0x8A, diCMN, 0x010A }, // 115 Maj op = 4, mask = 07FF, xop = 0x010A ( 266) - vrefp - { 0x8A, diCMN, 0x010C }, // 116 Maj op = 4, mask = 07FF, xop = 0x010C ( 268) - vmrglb - { 0x8A, diCMN, 0x010E }, // 117 Maj op = 4, mask = 07FF, xop = 0x010E ( 270) - vpkshus - { 0x8A, diCMN, 0x0142 }, // 118 Maj op = 4, mask = 07FF, xop = 0x0142 ( 322) - vmaxsh - { 0x8A, diCMN, 0x0144 }, // 119 Maj op = 4, mask = 07FF, xop = 0x0144 ( 324) - vslh - { 0x8A, diCMN, 0x0148 }, // 120 Maj op = 4, mask = 07FF, xop = 0x0148 ( 328) - vmulosh - { 0x8A, diCMN, 0x014A }, // 121 Maj op = 4, mask = 07FF, xop = 0x014A ( 330) - vrsqrtefp - { 0x8A, diCMN, 0x014C }, // 122 Maj op = 4, mask = 07FF, xop = 0x014C ( 332) - vmrglh - { 0x8A, diCMN, 0x014E }, // 123 Maj op = 4, mask = 07FF, xop = 0x014E ( 334) - vpkswus - { 0x8A, diCMN, 0x0180 }, // 124 Maj op = 4, mask = 07FF, xop = 0x0180 ( 
384) - vaddcuw - { 0x8A, diCMN, 0x0182 }, // 125 Maj op = 4, mask = 07FF, xop = 0x0182 ( 386) - vmaxsw - { 0x8A, diCMN, 0x0184 }, // 126 Maj op = 4, mask = 07FF, xop = 0x0184 ( 388) - vslw - { 0x8A, diCMN, 0x018A }, // 127 Maj op = 4, mask = 07FF, xop = 0x018A ( 394) - vexptefp - { 0x8A, diCMN, 0x018C }, // 128 Maj op = 4, mask = 07FF, xop = 0x018C ( 396) - vmrglw - { 0x8A, diCMN, 0x018E }, // 129 Maj op = 4, mask = 07FF, xop = 0x018E ( 398) - vpkshss - { 0x8A, diCMN, 0x01C4 }, // 130 Maj op = 4, mask = 07FF, xop = 0x01C4 ( 452) - vsl - { 0x8A, diCMN, 0x01CA }, // 131 Maj op = 4, mask = 07FF, xop = 0x01CA ( 458) - vlogefp - { 0x8A, diCMN, 0x01CE }, // 132 Maj op = 4, mask = 07FF, xop = 0x01CE ( 462) - vpkswss - { 0x8A, diCMN, 0x0200 }, // 133 Maj op = 4, mask = 07FF, xop = 0x0200 ( 512) - vaddubs - { 0x8A, diCMN, 0x0202 }, // 134 Maj op = 4, mask = 07FF, xop = 0x0202 ( 514) - vminub - { 0x8A, diCMN, 0x0204 }, // 135 Maj op = 4, mask = 07FF, xop = 0x0204 ( 516) - vsrb - { 0x8A, diCMN, 0x0208 }, // 136 Maj op = 4, mask = 07FF, xop = 0x0208 ( 520) - vmuleub - { 0x8A, diCMN, 0x020A }, // 137 Maj op = 4, mask = 07FF, xop = 0x020A ( 522) - vrfin - { 0x8A, diCMN, 0x020C }, // 138 Maj op = 4, mask = 07FF, xop = 0x020C ( 524) - vspltb - { 0x8A, diCMN, 0x020E }, // 139 Maj op = 4, mask = 07FF, xop = 0x020E ( 526) - vupkhsb - { 0x8A, diCMN, 0x0240 }, // 140 Maj op = 4, mask = 07FF, xop = 0x0240 ( 576) - vadduhs - { 0x8A, diCMN, 0x0242 }, // 141 Maj op = 4, mask = 07FF, xop = 0x0242 ( 578) - vminuh - { 0x8A, diCMN, 0x0244 }, // 142 Maj op = 4, mask = 07FF, xop = 0x0244 ( 580) - vsrh - { 0x8A, diCMN, 0x0248 }, // 143 Maj op = 4, mask = 07FF, xop = 0x0248 ( 584) - vmuleuh - { 0x8A, diCMN, 0x024A }, // 144 Maj op = 4, mask = 07FF, xop = 0x024A ( 586) - vrfiz - { 0x8A, diCMN, 0x024C }, // 145 Maj op = 4, mask = 07FF, xop = 0x024C ( 588) - vsplth - { 0x8A, diCMN, 0x024E }, // 146 Maj op = 4, mask = 07FF, xop = 0x024E ( 590) - vupkhsh - { 0x8A, diCMN, 0x0280 }, // 147 Maj op = 4, mask = 07FF, xop = 0x0280 ( 640) - vadduws - { 0x8A, diCMN, 0x0282 }, // 148 Maj op = 4, mask = 07FF, xop = 0x0282 ( 642) - vminuw - { 0x8A, diCMN, 0x0284 }, // 149 Maj op = 4, mask = 07FF, xop = 0x0284 ( 644) - vsrw - { 0x8A, diCMN, 0x028A }, // 150 Maj op = 4, mask = 07FF, xop = 0x028A ( 650) - vrfip - { 0x8A, diCMN, 0x028C }, // 151 Maj op = 4, mask = 07FF, xop = 0x028C ( 652) - vspltw - { 0x8A, diCMN, 0x028E }, // 152 Maj op = 4, mask = 07FF, xop = 0x028E ( 654) - vupklsb - { 0x8A, diCMN, 0x02C4 }, // 153 Maj op = 4, mask = 07FF, xop = 0x02C4 ( 708) - vsr - { 0x8A, diCMN, 0x02CA }, // 154 Maj op = 4, mask = 07FF, xop = 0x02CA ( 714) - vrfim - { 0x8A, diCMN, 0x02CE }, // 155 Maj op = 4, mask = 07FF, xop = 0x02CE ( 718) - vupklsh - { 0x8A, diCMN, 0x0300 }, // 156 Maj op = 4, mask = 07FF, xop = 0x0300 ( 768) - vaddsbs - { 0x8A, diCMN, 0x0302 }, // 157 Maj op = 4, mask = 07FF, xop = 0x0302 ( 770) - vminsb - { 0x8A, diCMN, 0x0304 }, // 158 Maj op = 4, mask = 07FF, xop = 0x0304 ( 772) - vsrab - { 0x8A, diCMN, 0x0308 }, // 159 Maj op = 4, mask = 07FF, xop = 0x0308 ( 776) - vmulesb - { 0x8A, diCMN, 0x030A }, // 160 Maj op = 4, mask = 07FF, xop = 0x030A ( 778) - vcfux - { 0x8A, diCMN, 0x030C }, // 161 Maj op = 4, mask = 07FF, xop = 0x030C ( 780) - vspltisb - { 0x8A, diCMN, 0x030E }, // 162 Maj op = 4, mask = 07FF, xop = 0x030E ( 782) - vpkpx - { 0x8A, diCMN, 0x0340 }, // 163 Maj op = 4, mask = 07FF, xop = 0x0340 ( 832) - vaddshs - { 0x8A, diCMN, 0x0342 }, // 164 Maj op = 4, mask = 07FF, xop = 0x0342 ( 834) - vminsh - { 0x8A, diCMN, 
0x0344 }, // 165 Maj op = 4, mask = 07FF, xop = 0x0344 ( 836) - vsrah - { 0x8A, diCMN, 0x0348 }, // 166 Maj op = 4, mask = 07FF, xop = 0x0348 ( 840) - vmulesh - { 0x8A, diCMN, 0x034A }, // 167 Maj op = 4, mask = 07FF, xop = 0x034A ( 842) - vcfsx - { 0x8A, diCMN, 0x034C }, // 168 Maj op = 4, mask = 07FF, xop = 0x034C ( 844) - vspltish - { 0x8A, diCMN, 0x034E }, // 169 Maj op = 4, mask = 07FF, xop = 0x034E ( 846) - vupkhpx - { 0x8A, diCMN, 0x0380 }, // 170 Maj op = 4, mask = 07FF, xop = 0x0380 ( 896) - vaddsws - { 0x8A, diCMN, 0x0382 }, // 171 Maj op = 4, mask = 07FF, xop = 0x0382 ( 898) - vminsw - { 0x8A, diCMN, 0x0384 }, // 172 Maj op = 4, mask = 07FF, xop = 0x0384 ( 900) - vsraw - { 0x8A, diCMN, 0x038A }, // 173 Maj op = 4, mask = 07FF, xop = 0x038A ( 906) - vctuxs - { 0x8A, diCMN, 0x038C }, // 174 Maj op = 4, mask = 07FF, xop = 0x038C ( 908) - vspltisw - { 0x8A, diCMN, 0x03CA }, // 175 Maj op = 4, mask = 07FF, xop = 0x03CA ( 970) - vctsxs - { 0x8A, diCMN, 0x03CE }, // 176 Maj op = 4, mask = 07FF, xop = 0x03CE ( 974) - vupklpx - { 0x8A, diCMN, 0x0400 }, // 177 Maj op = 4, mask = 07FF, xop = 0x0400 (1024) - vsububm - { 0x8A, diCMN, 0x0402 }, // 178 Maj op = 4, mask = 07FF, xop = 0x0402 (1026) - vavgub - { 0x8A, diCMN, 0x0404 }, // 179 Maj op = 4, mask = 07FF, xop = 0x0404 (1028) - vand - { 0x8A, diCMN, 0x040A }, // 180 Maj op = 4, mask = 07FF, xop = 0x040A (1034) - vmaxfp - { 0x8A, diCMN, 0x040C }, // 181 Maj op = 4, mask = 07FF, xop = 0x040C (1036) - vslo - { 0x8A, diCMN, 0x0440 }, // 182 Maj op = 4, mask = 07FF, xop = 0x0440 (1088) - vsubuhm - { 0x8A, diCMN, 0x0442 }, // 183 Maj op = 4, mask = 07FF, xop = 0x0442 (1090) - vavguh - { 0x8A, diCMN, 0x0444 }, // 184 Maj op = 4, mask = 07FF, xop = 0x0444 (1092) - vandc - { 0x8A, diCMN, 0x044A }, // 185 Maj op = 4, mask = 07FF, xop = 0x044A (1098) - vminfp - { 0x8A, diCMN, 0x044C }, // 186 Maj op = 4, mask = 07FF, xop = 0x044C (1100) - vsro - { 0x8A, diCMN, 0x0480 }, // 187 Maj op = 4, mask = 07FF, xop = 0x0480 (1152) - vsubuwm - { 0x8A, diCMN, 0x0482 }, // 188 Maj op = 4, mask = 07FF, xop = 0x0482 (1154) - vavguw - { 0x8A, diCMN, 0x0484 }, // 189 Maj op = 4, mask = 07FF, xop = 0x0484 (1156) - vor - { 0x8A, diCMN, 0x04C4 }, // 190 Maj op = 4, mask = 07FF, xop = 0x04C4 (1220) - vxor - { 0x8A, diCMN, 0x0502 }, // 191 Maj op = 4, mask = 07FF, xop = 0x0502 (1282) - vavgsb - { 0x8A, diCMN, 0x0504 }, // 192 Maj op = 4, mask = 07FF, xop = 0x0504 (1284) - vnor - { 0x8A, diCMN, 0x0542 }, // 193 Maj op = 4, mask = 07FF, xop = 0x0542 (1346) - vavgsh - { 0x8A, diCMN, 0x0580 }, // 194 Maj op = 4, mask = 07FF, xop = 0x0580 (1408) - vsubcuw - { 0x8A, diCMN, 0x0582 }, // 195 Maj op = 4, mask = 07FF, xop = 0x0582 (1410) - vavgsw - { 0x8A, diCMN, 0x0600 }, // 196 Maj op = 4, mask = 07FF, xop = 0x0600 (1536) - vsububs - { 0x8A, diCMN, 0x0604 }, // 197 Maj op = 4, mask = 07FF, xop = 0x0604 (1540) - mfvscr - { 0x8A, diCMN, 0x0608 }, // 198 Maj op = 4, mask = 07FF, xop = 0x0608 (1544) - vsum4ubs - { 0x8A, diCMN, 0x0640 }, // 199 Maj op = 4, mask = 07FF, xop = 0x0640 (1600) - vsubuhs - { 0x8A, diCMN, 0x0644 }, // 200 Maj op = 4, mask = 07FF, xop = 0x0644 (1604) - mtvscr - { 0x8A, diCMN, 0x0648 }, // 201 Maj op = 4, mask = 07FF, xop = 0x0648 (1608) - vsum4shs - { 0x8A, diCMN, 0x0680 }, // 202 Maj op = 4, mask = 07FF, xop = 0x0680 (1664) - vsubuws - { 0x8A, diCMN, 0x0688 }, // 203 Maj op = 4, mask = 07FF, xop = 0x0688 (1672) - vsum2sws - { 0x8A, diCMN, 0x0700 }, // 204 Maj op = 4, mask = 07FF, xop = 0x0700 (1792) - vsubsbs - { 0x8A, diCMN, 0x0708 }, // 205 Maj op = 4, 
mask = 07FF, xop = 0x0708 (1800) - vsum4sbs - { 0x8A, diCMN, 0x0740 }, // 206 Maj op = 4, mask = 07FF, xop = 0x0740 (1856) - vsubshs - { 0x8A, diCMN, 0x0780 }, // 207 Maj op = 4, mask = 07FF, xop = 0x0780 (1920) - vsubsws - { 0x0A, diCMN, 0x0788 }, // 208 Maj op = 4, mask = 07FF, xop = 0x0788 (1928) - vsumsws - { 0x89, diCMN, 0x0000 }, // 209 Maj op = 19, mask = 07FE, xop = 0x0000 ( 0) - mcrf - { 0x89, diBLR, 0x0020 }, // 210 Maj op = 19, mask = 07FE, xop = 0x0020 ( 16) - bclr - { 0x89, diPRV, 0x0024 }, // 211 Maj op = 19, mask = 07FE, xop = 0x0024 ( 18) - rfid - { 0x89, diCMN, 0x0042 }, // 212 Maj op = 19, mask = 07FE, xop = 0x0042 ( 33) - crnor - { 0x89, diPRV, 0x0064 }, // 213 Maj op = 19, mask = 07FE, xop = 0x0064 ( 50) - rfi - { 0x89, diCMN, 0x0102 }, // 214 Maj op = 19, mask = 07FE, xop = 0x0102 ( 129) - crandc - { 0x89, diCMN, 0x012C }, // 215 Maj op = 19, mask = 07FE, xop = 0x012C ( 150) - isync - { 0x89, diCMN, 0x0182 }, // 216 Maj op = 19, mask = 07FE, xop = 0x0182 ( 193) - crxor - { 0x89, diCMN, 0x01C2 }, // 217 Maj op = 19, mask = 07FE, xop = 0x01C2 ( 225) - crnand - { 0x89, diCMN, 0x0202 }, // 218 Maj op = 19, mask = 07FE, xop = 0x0202 ( 257) - crand - { 0x89, diPRV, 0x0224 }, // 219 Maj op = 19, mask = 07FE, xop = 0x0224 ( 274) - hrfid - { 0x89, diCMN, 0x0242 }, // 220 Maj op = 19, mask = 07FE, xop = 0x0242 ( 289) - creqv - { 0x89, diCMN, 0x0342 }, // 221 Maj op = 19, mask = 07FE, xop = 0x0342 ( 417) - crorc - { 0x89, diCMN, 0x0382 }, // 222 Maj op = 19, mask = 07FE, xop = 0x0382 ( 449) - cror - { 0x09, diBCTR, 0x0420 }, // 223 Maj op = 19, mask = 07FE, xop = 0x0420 ( 528) - bctr - { 0x82, diCMN, 0x0000 }, // 224 Maj op = 30, mask = 001C, xop = 0x0000 ( 0) - rldicl - { 0x82, diCMN, 0x0004 }, // 225 Maj op = 30, mask = 001C, xop = 0x0004 ( 1) - rldicr - { 0x82, diCMN, 0x0008 }, // 226 Maj op = 30, mask = 001C, xop = 0x0008 ( 2) - rldic - { 0x82, diCMN, 0x000C }, // 227 Maj op = 30, mask = 001C, xop = 0x000C ( 3) - rldimi - { 0x83, diCMN, 0x0010 }, // 228 Maj op = 30, mask = 001E, xop = 0x0010 ( 8) - rldcl - { 0x03, diCMN, 0x0012 }, // 229 Maj op = 30, mask = 001E, xop = 0x0012 ( 9) - rldcr - { 0x86, diCMN, 0x0010 }, // 230 Maj op = 31, mask = 03FE, xop = 0x0010 ( 8) - subfc - { 0x86, diCMN, 0x0012 }, // 231 Maj op = 31, mask = 03FE, xop = 0x0012 ( 9) - mulhdu - { 0x86, diCMN, 0x0014 }, // 232 Maj op = 31, mask = 03FE, xop = 0x0014 ( 10) - addc - { 0x86, diCMN, 0x0016 }, // 233 Maj op = 31, mask = 03FE, xop = 0x0016 ( 11) - mulhwu - { 0x86, diCMN, 0x0050 }, // 234 Maj op = 31, mask = 03FE, xop = 0x0050 ( 40) - subf - { 0x86, diCMN, 0x0092 }, // 235 Maj op = 31, mask = 03FE, xop = 0x0092 ( 73) - mulhd - { 0x86, diCMN, 0x0096 }, // 236 Maj op = 31, mask = 03FE, xop = 0x0096 ( 75) - mulhw - { 0x86, diCMN, 0x00D0 }, // 237 Maj op = 31, mask = 03FE, xop = 0x00D0 ( 104) - neg - { 0x86, diCMN, 0x0110 }, // 238 Maj op = 31, mask = 03FE, xop = 0x0110 ( 136) - subfe - { 0x86, diCMN, 0x0114 }, // 239 Maj op = 31, mask = 03FE, xop = 0x0114 ( 138) - adde - { 0x86, diCMN, 0x0190 }, // 240 Maj op = 31, mask = 03FE, xop = 0x0190 ( 200) - subfze - { 0x86, diCMN, 0x0194 }, // 241 Maj op = 31, mask = 03FE, xop = 0x0194 ( 202) - addze - { 0x86, diCMN, 0x01D0 }, // 242 Maj op = 31, mask = 03FE, xop = 0x01D0 ( 232) - subfme - { 0x86, diCMN, 0x01D2 }, // 243 Maj op = 31, mask = 03FE, xop = 0x01D2 ( 233) - mulld - { 0x86, diCMN, 0x01D4 }, // 244 Maj op = 31, mask = 03FE, xop = 0x01D4 ( 234) - addme - { 0x86, diCMN, 0x01D6 }, // 245 Maj op = 31, mask = 03FE, xop = 0x01D6 ( 235) - mullw - { 0x86, diCMN, 
0x0214 }, // 246 Maj op = 31, mask = 03FE, xop = 0x0214 ( 266) - add - { 0x86, diCMN, 0x0392 }, // 247 Maj op = 31, mask = 03FE, xop = 0x0392 ( 457) - divdu - { 0x86, diCMN, 0x0396 }, // 248 Maj op = 31, mask = 03FE, xop = 0x0396 ( 459) - divwu - { 0x86, diCMN, 0x03D2 }, // 249 Maj op = 31, mask = 03FE, xop = 0x03D2 ( 489) - divd - { 0x86, diCMN, 0x03D6 }, // 250 Maj op = 31, mask = 03FE, xop = 0x03D6 ( 491) - divw - { 0x88, diCMN, 0x0674 }, // 251 Maj op = 31, mask = 07FC, xop = 0x0674 ( 413) - sradi - { 0x89, diCMN, 0x0000 }, // 252 Maj op = 31, mask = 07FE, xop = 0x0000 ( 0) - cmp - { 0x89, diTRP, 0x0008 }, // 253 Maj op = 31, mask = 07FE, xop = 0x0008 ( 4) - tw - { 0x89, diCMN, 0x000C }, // 254 Maj op = 31, mask = 07FE, xop = 0x000C ( 6) - lvsl - { 0x89, diCMN, 0x000E }, // 255 Maj op = 31, mask = 07FE, xop = 0x000E ( 7) - lvebx - { 0x89, diCMN, 0x0026 }, // 256 Maj op = 31, mask = 07FE, xop = 0x0026 ( 19) - mfcr - { 0x89, diCMN, 0x0028 }, // 257 Maj op = 31, mask = 07FE, xop = 0x0028 ( 20) - lwarx - { 0x89, diCMN, 0x002A }, // 258 Maj op = 31, mask = 07FE, xop = 0x002A ( 21) - ldx - { 0x89, diCMN, 0x002E }, // 259 Maj op = 31, mask = 07FE, xop = 0x002E ( 23) - lwzx - { 0x89, diCMN, 0x0030 }, // 260 Maj op = 31, mask = 07FE, xop = 0x0030 ( 24) - slw - { 0x89, diCMN, 0x0034 }, // 261 Maj op = 31, mask = 07FE, xop = 0x0034 ( 26) - cntlzw - { 0x89, diCMN, 0x0036 }, // 262 Maj op = 31, mask = 07FE, xop = 0x0036 ( 27) - sld - { 0x89, diCMN, 0x0038 }, // 263 Maj op = 31, mask = 07FE, xop = 0x0038 ( 28) - and - { 0x89, diCMN, 0x0040 }, // 264 Maj op = 31, mask = 07FE, xop = 0x0040 ( 32) - cmpl - { 0x89, diCMN, 0x004C }, // 265 Maj op = 31, mask = 07FE, xop = 0x004C ( 38) - lvsr - { 0x89, diCMN, 0x004E }, // 266 Maj op = 31, mask = 07FE, xop = 0x004E ( 39) - lvehx - { 0x89, diCMN, 0x006A }, // 267 Maj op = 31, mask = 07FE, xop = 0x006A ( 53) - ldux - { 0x89, diCMN, 0x006C }, // 268 Maj op = 31, mask = 07FE, xop = 0x006C ( 54) - dcbst - { 0x89, diCMN, 0x006E }, // 269 Maj op = 31, mask = 07FE, xop = 0x006E ( 55) - lwzux - { 0x89, diCMN, 0x0074 }, // 270 Maj op = 31, mask = 07FE, xop = 0x0074 ( 58) - cntlzd - { 0x89, diCMN, 0x0078 }, // 271 Maj op = 31, mask = 07FE, xop = 0x0078 ( 60) - andc - { 0x89, diTRP, 0x0088 }, // 272 Maj op = 31, mask = 07FE, xop = 0x0088 ( 68) - td - { 0x89, diCMN, 0x008E }, // 273 Maj op = 31, mask = 07FE, xop = 0x008E ( 71) - lvewx - { 0x89, diPRV, 0x00A6 }, // 274 Maj op = 31, mask = 07FE, xop = 0x00A6 ( 83) - mfmsr - { 0x89, diCMN, 0x00A8 }, // 275 Maj op = 31, mask = 07FE, xop = 0x00A8 ( 84) - ldarx - { 0x89, diCMN, 0x00AC }, // 276 Maj op = 31, mask = 07FE, xop = 0x00AC ( 86) - dcbf - { 0x89, diCMN, 0x00AE }, // 277 Maj op = 31, mask = 07FE, xop = 0x00AE ( 87) - lbzx - { 0x89, diCMN, 0x00CE }, // 278 Maj op = 31, mask = 07FE, xop = 0x00CE ( 103) - lvx - { 0x89, diCMN, 0x00EE }, // 279 Maj op = 31, mask = 07FE, xop = 0x00EE ( 119) - lbzux - { 0x89, diCMN, 0x00F8 }, // 280 Maj op = 31, mask = 07FE, xop = 0x00F8 ( 124) - nor - { 0x89, diCMN, 0x010E }, // 281 Maj op = 31, mask = 07FE, xop = 0x010E ( 135) - stvebx - { 0x89, diCMN, 0x0120 }, // 282 Maj op = 31, mask = 07FE, xop = 0x0120 ( 144) - mtcrf - { 0x89, diPRV, 0x0124 }, // 283 Maj op = 31, mask = 07FE, xop = 0x0124 ( 146) - mtmsr - { 0x89, diCMN, 0x012A }, // 284 Maj op = 31, mask = 07FE, xop = 0x012A ( 149) - stdx - { 0x89, diCMN, 0x012C }, // 285 Maj op = 31, mask = 07FE, xop = 0x012C ( 150) - stwcx - { 0x89, diCMN, 0x012E }, // 286 Maj op = 31, mask = 07FE, xop = 0x012E ( 151) - stwx - { 0x89, diCMN, 0x014E 
}, // 287 Maj op = 31, mask = 07FE, xop = 0x014E ( 167) - stvehx - { 0x89, diPRV, 0x0164 }, // 288 Maj op = 31, mask = 07FE, xop = 0x0164 ( 178) - mtmsrd - { 0x89, diCMN, 0x016A }, // 289 Maj op = 31, mask = 07FE, xop = 0x016A ( 181) - stdux - { 0x89, diCMN, 0x016E }, // 290 Maj op = 31, mask = 07FE, xop = 0x016E ( 183) - stwux - { 0x89, diCMN, 0x018E }, // 291 Maj op = 31, mask = 07FE, xop = 0x018E ( 199) - stvewx - { 0x89, diCMN, 0x01A4 }, // 292 Maj op = 31, mask = 07FE, xop = 0x01A4 ( 210) - mtsr - { 0x89, diCMN, 0x01AC }, // 293 Maj op = 31, mask = 07FE, xop = 0x01AC ( 214) - stdcx. - { 0x89, diCMN, 0x01AE }, // 294 Maj op = 31, mask = 07FE, xop = 0x01AE ( 215) - stbx - { 0x89, diCMN, 0x01CE }, // 295 Maj op = 31, mask = 07FE, xop = 0x01CE ( 231) - stvx - { 0x89, diPRV, 0x01E4 }, // 296 Maj op = 31, mask = 07FE, xop = 0x01E4 ( 242) - mtsrin - { 0x89, diCMN, 0x01EC }, // 297 Maj op = 31, mask = 07FE, xop = 0x01EC ( 246) - dcbtst - { 0x89, diCMN, 0x01EE }, // 298 Maj op = 31, mask = 07FE, xop = 0x01EE ( 247) - stbux - { 0x89, diPRV, 0x0224 }, // 299 Maj op = 31, mask = 07FE, xop = 0x0224 ( 274) - tlbiel - { 0x89, diCMN, 0x022C }, // 300 Maj op = 31, mask = 07FE, xop = 0x022C ( 278) - dcbt - { 0x89, diCMN, 0x022E }, // 301 Maj op = 31, mask = 07FE, xop = 0x022E ( 279) - lhzx - { 0x89, diCMN, 0x0238 }, // 302 Maj op = 31, mask = 07FE, xop = 0x0238 ( 284) - eqv - { 0x89, diPRV, 0x0264 }, // 303 Maj op = 31, mask = 07FE, xop = 0x0264 ( 306) - tlbie - { 0x89, diPRV, 0x026C }, // 304 Maj op = 31, mask = 07FE, xop = 0x026C ( 310) - eciwx - { 0x89, diCMN, 0x026E }, // 305 Maj op = 31, mask = 07FE, xop = 0x026E ( 311) - lhzux - { 0x89, diCMN, 0x0278 }, // 306 Maj op = 31, mask = 07FE, xop = 0x0278 ( 316) - xor - { 0x89, diSPR, 0x02A6 }, // 307 Maj op = 31, mask = 07FE, xop = 0x02A6 ( 339) - mfspr - { 0x89, diCMN, 0x02AA }, // 308 Maj op = 31, mask = 07FE, xop = 0x02AA ( 341) - lwax - { 0x89, diCMN, 0x02AC }, // 309 Maj op = 31, mask = 07FE, xop = 0x02AC ( 342) - dst - { 0x89, diCMN, 0x02AE }, // 310 Maj op = 31, mask = 07FE, xop = 0x02AE ( 343) - lhax - { 0x89, diCMN, 0x02CE }, // 311 Maj op = 31, mask = 07FE, xop = 0x02CE ( 359) - lvxl - { 0x89, diPRV, 0x02E4 }, // 312 Maj op = 31, mask = 07FE, xop = 0x02E4 ( 370) - tlbia - { 0x89, diCMN, 0x02E6 }, // 313 Maj op = 31, mask = 07FE, xop = 0x02E6 ( 371) - mftb - { 0x89, diCMN, 0x02EA }, // 314 Maj op = 31, mask = 07FE, xop = 0x02EA ( 373) - lwaux - { 0x89, diCMN, 0x02EC }, // 315 Maj op = 31, mask = 07FE, xop = 0x02EC ( 374) - dstst - { 0x89, diCMN, 0x02EE }, // 316 Maj op = 31, mask = 07FE, xop = 0x02EE ( 375) - lhaux - { 0x89, diPRV, 0x0324 }, // 317 Maj op = 31, mask = 07FE, xop = 0x0324 ( 402) - slbmte - { 0x89, diCMN, 0x032E }, // 318 Maj op = 31, mask = 07FE, xop = 0x032E ( 407) - sthx - { 0x89, diCMN, 0x0338 }, // 319 Maj op = 31, mask = 07FE, xop = 0x0338 ( 412) - orc - { 0x89, diPRV, 0x0364 }, // 320 Maj op = 31, mask = 07FE, xop = 0x0364 ( 434) - slbie - { 0x89, diPRV, 0x036C }, // 321 Maj op = 31, mask = 07FE, xop = 0x036C ( 438) - ecowx - { 0x89, diCMN, 0x036E }, // 322 Maj op = 31, mask = 07FE, xop = 0x036E ( 439) - sthux - { 0x89, diOR, 0x0378 }, // 323 Maj op = 31, mask = 07FE, xop = 0x0378 ( 444) - or - { 0x89, diSPR, 0x03A6 }, // 324 Maj op = 31, mask = 07FE, xop = 0x03A6 ( 467) - mtspr - { 0x89, diCMN, 0x03B8 }, // 325 Maj op = 31, mask = 07FE, xop = 0x03B8 ( 476) - nand - { 0x89, diCMN, 0x03CE }, // 326 Maj op = 31, mask = 07FE, xop = 0x03CE ( 487) - stvxl - { 0x89, diPRV, 0x03E4 }, // 327 Maj op = 31, mask = 07FE, xop = 0x03E4 
( 498) - slbia - { 0x89, diCMN, 0x0400 }, // 328 Maj op = 31, mask = 07FE, xop = 0x0400 ( 512) - mcrxr - { 0x89, diCMN, 0x042A }, // 329 Maj op = 31, mask = 07FE, xop = 0x042A ( 533) - lswx - { 0x89, diCMN, 0x042C }, // 330 Maj op = 31, mask = 07FE, xop = 0x042C ( 534) - lwbrx - { 0x89, diCMN, 0x042E }, // 331 Maj op = 31, mask = 07FE, xop = 0x042E ( 535) - lfsx - { 0x89, diCMN, 0x0430 }, // 332 Maj op = 31, mask = 07FE, xop = 0x0430 ( 536) - srw - { 0x89, diCMN, 0x0436 }, // 333 Maj op = 31, mask = 07FE, xop = 0x0436 ( 539) - srd - { 0x89, diPRV, 0x046C }, // 334 Maj op = 31, mask = 07FE, xop = 0x046C ( 566) - tlbsync - { 0x89, diCMN, 0x046E }, // 335 Maj op = 31, mask = 07FE, xop = 0x046E ( 567) - lfsux - { 0x89, diPRV, 0x04A6 }, // 336 Maj op = 31, mask = 07FE, xop = 0x04A6 ( 595) - mfsr - { 0x89, diCMN, 0x04AA }, // 337 Maj op = 31, mask = 07FE, xop = 0x04AA ( 597) - lswi - { 0x89, diCMN, 0x04AC }, // 338 Maj op = 31, mask = 07FE, xop = 0x04AC ( 598) - sync - { 0x89, diCMN, 0x04AE }, // 339 Maj op = 31, mask = 07FE, xop = 0x04AE ( 599) - lfdx - { 0x89, diCMN, 0x04EE }, // 340 Maj op = 31, mask = 07FE, xop = 0x04EE ( 631) - lfdux - { 0x89, diPRV, 0x0526 }, // 341 Maj op = 31, mask = 07FE, xop = 0x0526 ( 659) - mfsrin - { 0x89, diCMN, 0x052A }, // 342 Maj op = 31, mask = 07FE, xop = 0x052A ( 661) - stswx - { 0x89, diCMN, 0x052C }, // 343 Maj op = 31, mask = 07FE, xop = 0x052C ( 662) - stwbrx - { 0x89, diCMN, 0x052E }, // 344 Maj op = 31, mask = 07FE, xop = 0x052E ( 663) - stfsx - { 0x89, diCMN, 0x056E }, // 345 Maj op = 31, mask = 07FE, xop = 0x056E ( 695) - stfsux - { 0x89, diCMN, 0x05AA }, // 346 Maj op = 31, mask = 07FE, xop = 0x05AA ( 725) - stswi - { 0x89, diCMN, 0x05AE }, // 347 Maj op = 31, mask = 07FE, xop = 0x05AE ( 727) - stfdx - { 0x89, diCMN, 0x05EC }, // 348 Maj op = 31, mask = 07FE, xop = 0x05EC ( 758) - dcba - { 0x89, diCMN, 0x05EE }, // 349 Maj op = 31, mask = 07FE, xop = 0x05EE ( 759) - stfdux - { 0x89, diCMN, 0x062C }, // 350 Maj op = 31, mask = 07FE, xop = 0x062C ( 790) - lhbrx - { 0x89, diCMN, 0x0630 }, // 351 Maj op = 31, mask = 07FE, xop = 0x0630 ( 792) - sraw - { 0x89, diCMN, 0x0634 }, // 352 Maj op = 31, mask = 07FE, xop = 0x0634 ( 794) - srad - { 0x89, diCMN, 0x066C }, // 353 Maj op = 31, mask = 07FE, xop = 0x066C ( 822) - dss - { 0x89, diCMN, 0x0670 }, // 354 Maj op = 31, mask = 07FE, xop = 0x0670 ( 824) - srawi - { 0x89, diPRV, 0x06A6 }, // 355 Maj op = 31, mask = 07FE, xop = 0x06A6 ( 851) - slbmfev - { 0x89, diCMN, 0x06AC }, // 356 Maj op = 31, mask = 07FE, xop = 0x06AC ( 854) - eieio - { 0x89, diPRV, 0x0726 }, // 357 Maj op = 31, mask = 07FE, xop = 0x0726 ( 915) - slbmfee - { 0x89, diCMN, 0x072C }, // 358 Maj op = 31, mask = 07FE, xop = 0x072C ( 918) - sthbrx - { 0x89, diCMN, 0x0734 }, // 359 Maj op = 31, mask = 07FE, xop = 0x0734 ( 922) - extsh - { 0x89, diCMN, 0x0774 }, // 360 Maj op = 31, mask = 07FE, xop = 0x0774 ( 954) - extsb - { 0x89, diCMN, 0x07AC }, // 361 Maj op = 31, mask = 07FE, xop = 0x07AC ( 982) - icbi - { 0x89, diCMN, 0x07AE }, // 362 Maj op = 31, mask = 07FE, xop = 0x07AE ( 983) - stfiwx - { 0x89, diCMN, 0x07B4 }, // 363 Maj op = 31, mask = 07FE, xop = 0x07B4 ( 986) - extsw - { 0x09, diCMN, 0x07EC }, // 364 Maj op = 31, mask = 07FE, xop = 0x07EC (1014) - dcbz - { 0x81, diCMN, 0x0000 }, // 365 Maj op = 58, mask = 0003, xop = 0x0000 ( 0) - ld - { 0x81, diCMN, 0x0001 }, // 366 Maj op = 58, mask = 0003, xop = 0x0001 ( 1) - ldu - { 0x01, diCMN, 0x0002 }, // 367 Maj op = 58, mask = 0003, xop = 0x0002 ( 2) - lwa - { 0x84, diCMN, 0x0024 }, // 368 Maj 
op = 59, mask = 003E, xop = 0x0024 ( 18) - fdivs - { 0x84, diCMN, 0x0028 }, // 369 Maj op = 59, mask = 003E, xop = 0x0028 ( 20) - fsubs - { 0x84, diCMN, 0x002A }, // 370 Maj op = 59, mask = 003E, xop = 0x002A ( 21) - fadds - { 0x84, diCMN, 0x002C }, // 371 Maj op = 59, mask = 003E, xop = 0x002C ( 22) - fsqrts - { 0x84, diCMN, 0x0030 }, // 372 Maj op = 59, mask = 003E, xop = 0x0030 ( 24) - fres - { 0x84, diCMN, 0x0032 }, // 373 Maj op = 59, mask = 003E, xop = 0x0032 ( 25) - fmuls - { 0x84, diCMN, 0x0038 }, // 374 Maj op = 59, mask = 003E, xop = 0x0038 ( 28) - fmsubs - { 0x84, diCMN, 0x003A }, // 375 Maj op = 59, mask = 003E, xop = 0x003A ( 29) - fmadds - { 0x84, diCMN, 0x003C }, // 376 Maj op = 59, mask = 003E, xop = 0x003C ( 30) - fnmsubs - { 0x04, diCMN, 0x003E }, // 377 Maj op = 59, mask = 003E, xop = 0x003E ( 31) - fnmadds - { 0x81, diCMN, 0x0000 }, // 378 Maj op = 62, mask = 0003, xop = 0x0000 ( 0) - std - { 0x01, diCMN, 0x0001 }, // 379 Maj op = 62, mask = 0003, xop = 0x0001 ( 1) - stdu - { 0x84, diCMN, 0x0024 }, // 380 Maj op = 63, mask = 003E, xop = 0x0024 ( 18) - fdiv - { 0x84, diCMN, 0x0028 }, // 381 Maj op = 63, mask = 003E, xop = 0x0028 ( 20) - fsub - { 0x84, diCMN, 0x002A }, // 382 Maj op = 63, mask = 003E, xop = 0x002A ( 21) - fadd - { 0x84, diCMN, 0x002C }, // 383 Maj op = 63, mask = 003E, xop = 0x002C ( 22) - fsqrt - { 0x84, diCMN, 0x002E }, // 384 Maj op = 63, mask = 003E, xop = 0x002E ( 23) - fsel - { 0x84, diCMN, 0x0032 }, // 385 Maj op = 63, mask = 003E, xop = 0x0032 ( 25) - fmul - { 0x84, diCMN, 0x0034 }, // 386 Maj op = 63, mask = 003E, xop = 0x0034 ( 26) - frsqrte - { 0x84, diCMN, 0x0038 }, // 387 Maj op = 63, mask = 003E, xop = 0x0038 ( 28) - fmsub - { 0x84, diCMN, 0x003A }, // 388 Maj op = 63, mask = 003E, xop = 0x003A ( 29) - fmadd - { 0x84, diCMN, 0x003C }, // 389 Maj op = 63, mask = 003E, xop = 0x003C ( 30) - fnmsub - { 0x84, diCMN, 0x003E }, // 390 Maj op = 63, mask = 003E, xop = 0x003E ( 31) - fnmadd - { 0x89, diCMN, 0x0000 }, // 391 Maj op = 63, mask = 07FE, xop = 0x0000 ( 0) - fcmpu - { 0x89, diCMN, 0x0018 }, // 392 Maj op = 63, mask = 07FE, xop = 0x0018 ( 12) - frsp - { 0x89, diCMN, 0x001C }, // 393 Maj op = 63, mask = 07FE, xop = 0x001C ( 14) - fctiw - { 0x89, diCMN, 0x001E }, // 394 Maj op = 63, mask = 07FE, xop = 0x001E ( 15) - fctiwz - { 0x89, diCMN, 0x0040 }, // 395 Maj op = 63, mask = 07FE, xop = 0x0040 ( 32) - fcmpo - { 0x89, diCMN, 0x004C }, // 396 Maj op = 63, mask = 07FE, xop = 0x004C ( 38) - mtfsb1 - { 0x89, diCMN, 0x0050 }, // 397 Maj op = 63, mask = 07FE, xop = 0x0050 ( 40) - fneg - { 0x89, diCMN, 0x0080 }, // 398 Maj op = 63, mask = 07FE, xop = 0x0080 ( 64) - mcrfs - { 0x89, diCMN, 0x008C }, // 399 Maj op = 63, mask = 07FE, xop = 0x008C ( 70) - mtfsb0 - { 0x89, diCMN, 0x0090 }, // 400 Maj op = 63, mask = 07FE, xop = 0x0090 ( 72) - fmr - { 0x89, diCMN, 0x010C }, // 401 Maj op = 63, mask = 07FE, xop = 0x010C ( 134) - mtfsfi - { 0x89, diCMN, 0x0110 }, // 402 Maj op = 63, mask = 07FE, xop = 0x0110 ( 136) - fnabs - { 0x89, diCMN, 0x0210 }, // 403 Maj op = 63, mask = 07FE, xop = 0x0210 ( 264) - fabs - { 0x89, diCMN, 0x048E }, // 404 Maj op = 63, mask = 07FE, xop = 0x048E ( 583) - mffs - { 0x89, diCMN, 0x058E }, // 405 Maj op = 63, mask = 07FE, xop = 0x058E ( 711) - mtfsf - { 0x89, diCMN, 0x065C }, // 406 Maj op = 63, mask = 07FE, xop = 0x065C ( 814) - fctid - { 0x89, diCMN, 0x065E }, // 407 Maj op = 63, mask = 07FE, xop = 0x065E ( 815) - fctidz - { 0x09, diCMN, 0x069C }, // 408 Maj op = 63, mask = 07FE, xop = 0x069C ( 846) - fcfid -}; - -#ifdef 
__decodePPC_debug__ -char *instname[] = { - "Jump entry...", - "Invalid", - "tdi", - "twi", - "Jump entry...", - "Invalid", - "Invalid", - "mulli", - "subfic", - "Invalid", - "cmpli", - "cmpi", - "addic", - "addic.", - "addi", - "addis", - "bc", - "sc", - "b", - "Jump entry...", - "rlwimi", - "rlwinm", - "Invalid", - "rlwnm", - "ori", - "oris", - "xori", - "xoris", - "andi.", - "andis.", - "Jump entry...", - "Jump entry...", - "lwz", - "lwzu", - "lbz", - "lbzu", - "stw", - "stwu", - "stb", - "stbu", - "lhz", - "lhzu", - "lha", - "lhau", - "sth", - "sthu", - "lmw", - "stmw", - "lfs", - "lfsu", - "lfd", - "lfdu", - "stfs", - "stfsu", - "stfd", - "stfdu", - "Invalid", - "Invalid", - "Jump entry...", - "Jump entry...", - "Invalid", - "Invalid", - "Jump entry...", - "Jump entry...", - "attn", - "vmhaddshs", - "vmhraddshs", - "vmladduhm", - "vmsumubm", - "vmsummbm", - "vmsumuhm", - "vmsumuhs", - "vmsumshm", - "vmsumshs", - "vsel", - "vperm", - "vsldoi", - "vmaddfp", - "vnmsubfp", - "vcmpequb", - "vcmpequh", - "vcmpequw", - "vcmpeqfp", - "vcmpgefp", - "vcmpgtub", - "vcmpgtuh", - "vcmpgtuw", - "vcmpgtfp", - "vcmpgtsb", - "vcmpgtsh", - "vcmpgtsw", - "vcmpbfp", - "vaddubm", - "vmaxub", - "vrlb", - "vmuloub", - "vaddfp", - "vmrghb", - "vpkuhum", - "vadduhm", - "vmaxuh", - "vrlh", - "vmulouh", - "vsubfp", - "vmrghh", - "vpkuwum", - "vadduwm", - "vmaxuw", - "vrlw", - "vmrghw", - "vpkuhus", - "vpkuwus", - "vmaxsb", - "vslb", - "vmulosb", - "vrefp", - "vmrglb", - "vpkshus", - "vmaxsh", - "vslh", - "vmulosh", - "vrsqrtefp", - "vmrglh", - "vpkswus", - "vaddcuw", - "vmaxsw", - "vslw", - "vexptefp", - "vmrglw", - "vpkshss", - "vsl", - "vlogefp", - "vpkswss", - "vaddubs", - "vminub", - "vsrb", - "vmuleub", - "vrfin", - "vspltb", - "vupkhsb", - "vadduhs", - "vminuh", - "vsrh", - "vmuleuh", - "vrfiz", - "vsplth", - "vupkhsh", - "vadduws", - "vminuw", - "vsrw", - "vrfip", - "vspltw", - "vupklsb", - "vsr", - "vrfim", - "vupklsh", - "vaddsbs", - "vminsb", - "vsrab", - "vmulesb", - "vcfux", - "vspltisb", - "vpkpx", - "vaddshs", - "vminsh", - "vsrah", - "vmulesh", - "vcfsx", - "vspltish", - "vupkhpx", - "vaddsws", - "vminsw", - "vsraw", - "vctuxs", - "vspltisw", - "vctsxs", - "vupklpx", - "vsububm", - "vavgub", - "vand", - "vmaxfp", - "vslo", - "vsubuhm", - "vavguh", - "vandc", - "vminfp", - "vsro", - "vsubuwm", - "vavguw", - "vor", - "vxor", - "vavgsb", - "vnor", - "vavgsh", - "vsubcuw", - "vavgsw", - "vsububs", - "mfvscr", - "vsum4ubs", - "vsubuhs", - "mtvscr", - "vsum4shs", - "vsubuws", - "vsum2sws", - "vsubsbs", - "vsum4sbs", - "vsubshs", - "vsubsws", - "vsumsws", - "mcrf", - "bclr", - "rfid", - "crnor", - "rfi", - "crandc", - "isync", - "crxor", - "crnand", - "crand", - "hrfid", - "creqv", - "crorc", - "cror", - "bctr", - "rldicl", - "rldicr", - "rldic", - "rldimi", - "rldcl", - "rldcr", - "subfc", - "mulhdu", - "addc", - "mulhwu", - "subf", - "mulhd", - "mulhw", - "neg", - "subfe", - "adde", - "subfze", - "addze", - "subfme", - "mulld", - "addme", - "mullw", - "add", - "divdu", - "divwu", - "divd", - "divw", - "sradi", - "cmp", - "tw", - "lvsl", - "lvebx", - "mfcr", - "lwarx", - "ldx", - "lwzx", - "slw", - "cntlzw", - "sld", - "and", - "cmpl", - "lvsr", - "lvehx", - "ldux", - "dcbst", - "lwzux", - "cntlzd", - "andc", - "td", - "lvewx", - "mfmsr", - "ldarx", - "dcbf", - "lbzx", - "lvx", - "lbzux", - "nor", - "stvebx", - "mtcrf", - "mtmsr", - "stdx", - "stwcx", - "stwx", - "stvehx", - "mtmsrd", - "stdux", - "stwux", - "stvewx", - "mtsr", - "stdcx.", - "stbx", - "stvx", - "mtsrin", - "dcbtst", - "stbux", - 
"tlbiel", - "dcbt", - "lhzx", - "eqv", - "tlbie", - "eciwx", - "lhzux", - "xor", - "mfspr", - "lwax", - "dst", - "lhax", - "lvxl", - "tlbia", - "mftb", - "lwaux", - "dstst", - "lhaux", - "slbmte", - "sthx", - "orc", - "slbie", - "ecowx", - "sthux", - "or", - "mtspr", - "nand", - "stvxl", - "slbia", - "mcrxr", - "lswx", - "lwbrx", - "lfsx", - "srw", - "srd", - "tlbsync", - "lfsux", - "mfsr", - "lswi", - "sync", - "lfdx", - "lfdux", - "mfsrin", - "stswx", - "stwbrx", - "stfsx", - "stfsux", - "stswi", - "stfdx", - "dcba", - "stfdux", - "lhbrx", - "sraw", - "srad", - "dss", - "srawi", - "slbmfev", - "eieio", - "slbmfee", - "sthbrx", - "extsh", - "extsb", - "icbi", - "stfiwx", - "extsw", - "dcbz", - "ld", - "ldu", - "lwa", - "fdivs", - "fsubs", - "fadds", - "fsqrts", - "fres", - "fmuls", - "fmsubs", - "fmadds", - "fnmsubs", - "fnmadds", - "std", - "stdu", - "fdiv", - "fsub", - "fadd", - "fsqrt", - "fsel", - "fmul", - "frsqrte", - "fmsub", - "fmadd", - "fnmsub", - "fnmadd", - "fcmpu", - "frsp", - "fctiw", - "fctiwz", - "fcmpo", - "mtfsb1", - "fneg", - "mcrfs", - "mtfsb0", - "fmr", - "mtfsfi", - "fnabs", - "fabs", - "mffs", - "mtfsf", - "fctid", - "fctidz", - "fcfid", -}; -#endif - -static dcdtab dcdfail = { 0x00, diINV, 0x0000 }; // Decode failed - -static uint32_t sprtbl[] = { - 0xCCC03274, // spr 0 to 31 - 0x00000000, // spr 32 to 63 - 0x00000000, // spr 64 to 95 - 0x00000000, // spr 96 to 127 - 0x00000080, // spr 128 to 159 - 0x00000000, // spr 160 to 191 - 0x00000000, // spr 192 to 223 - 0x00000000, // spr 224 to 255 - 0x9000FCAD, // spr 256 to 287 - 0x0000C3F3, // spr 288 to 319 - 0x00000000, // spr 320 to 351 - 0x00000000, // spr 352 to 383 - 0x00000000, // spr 384 to 415 - 0x00000000, // spr 416 to 447 - 0x00000000, // spr 448 to 479 - 0x00000000, // spr 480 to 511 - 0x0000FFFF, // spr 512 to 543 - 0x00000000, // spr 544 to 575 - 0x00000000, // spr 576 to 607 - 0x00000000, // spr 608 to 639 - 0x00000000, // spr 640 to 671 - 0x00000000, // spr 672 to 703 - 0x00000000, // spr 704 to 735 - 0x00000000, // spr 736 to 767 - 0x3FFF3FFF, // spr 768 to 799 - 0x00000000, // spr 800 to 831 - 0x00000000, // spr 832 to 863 - 0x00000000, // spr 864 to 895 - 0x00000000, // spr 896 to 927 - 0xE1FFE1FF, // spr 928 to 959 - 0x0000FE80, // spr 960 to 991 - 0x0000FFFF, // spr 992 to 1023 -}; diff --git a/bsd/ppc/endian.h b/bsd/ppc/endian.h deleted file mode 100644 index c6929f117..000000000 --- a/bsd/ppc/endian.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * Copyright (c) 1995 NeXT Computer, Inc. All rights reserved. - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1987, 1991, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)endian.h 8.1 (Berkeley) 6/10/93 - */ - -#ifndef _PPC_ENDIAN_H_ -#define _PPC_ENDIAN_H_ - -#include <sys/cdefs.h> - -/* - * Define the order of 32-bit words in 64-bit words. - */ -#define _QUAD_HIGHWORD 0 -#define _QUAD_LOWWORD 1 - -/* - * Definitions for byte order, according to byte significance from low - * address to high. - */ -#define __DARWIN_LITTLE_ENDIAN 1234 /* LSB first: i386, vax */ -#define __DARWIN_BIG_ENDIAN 4321 /* MSB first: 68000, ibm, net, ppc */ -#define __DARWIN_PDP_ENDIAN 3412 /* LSB first in word, MSW first in long */ - -#define __DARWIN_BYTE_ORDER __DARWIN_BIG_ENDIAN - -#if defined(KERNEL) || (!defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE)) - -#define LITTLE_ENDIAN __DARWIN_LITTLE_ENDIAN -#define BIG_ENDIAN __DARWIN_BIG_ENDIAN -#define PDP_ENDIAN __DARWIN_PDP_ENDIAN - -#define BYTE_ORDER __DARWIN_BYTE_ORDER - -#include <sys/_endian.h> - -#endif /* defined(KERNEL) || (!defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE)) */ -#endif /* !_PPC_ENDIAN_H_ */ diff --git a/bsd/ppc/exec.h b/bsd/ppc/exec.h deleted file mode 100644 index 471543a1d..000000000 --- a/bsd/ppc/exec.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1994, The University of Utah and - * the Center for Software Science at the University of Utah (CSS). - * All rights reserved. 
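The endian.h removed above pins PPC to big-endian: BYTE_ORDER is hard-wired to __DARWIN_BIG_ENDIAN, and _QUAD_HIGHWORD 0 records that the most significant 32-bit word of a 64-bit quantity comes first. The 1234/4321/3412 constants name the byte-significance order positionally, so they can be compared directly. A small host-side self-check using the same convention; the probe trick is illustrative, not from the header:

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Constants as in the deleted header. */
#define LITTLE_ENDIAN 1234   /* LSB first: i386, vax                 */
#define BIG_ENDIAN    4321   /* MSB first: 68000, ibm, net, ppc      */
#define PDP_ENDIAN    3412   /* LSB first in word, MSW first in long */

/* Report the host's order in the same positional convention: store a
 * known 32-bit pattern and look at which byte lands first in memory. */
static int host_byte_order(void)
{
    uint32_t probe = 0x01020304;
    uint8_t first;
    memcpy(&first, &probe, 1);
    return first == 0x01 ? BIG_ENDIAN
         : first == 0x04 ? LITTLE_ENDIAN
         : PDP_ENDIAN;                  /* 0x02: word-swapped (pdp11) */
}

int main(void)
{
    printf("host byte order: %d\n", host_byte_order());
    return 0;
}
```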
- * - * Permission to use, copy, modify and distribute this software and its - * documentation is hereby granted, provided that both the copyright - * notice and this permission notice appear in all copies of the - * software, derivative works or modified versions, and any portions - * thereof, and that both notices appear in supporting documentation. - * - * THE UNIVERSITY OF UTAH AND CSS ALLOW FREE USE OF THIS SOFTWARE IN ITS "AS - * IS" CONDITION. THE UNIVERSITY OF UTAH AND CSS DISCLAIM ANY LIABILITY OF - * ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * - * CSS requests users of this software to return to css-dist@cs.utah.edu any - * improvements that they make and grant CSS redistribution rights. - * - */ - -#ifndef _BSD_PPC_EXEC_H_ -#define _BSD_PPC_EXEC_H_ - - -#include <sys/appleapiopts.h> - -#ifdef BSD_KERNEL_PRIVATE -/* Size of a page in an object file. */ -#define __LDPGSZ 4096 - -/* Valid magic number check. */ -#define N_BADMAG(ex) \ - ((ex).a_magic != NMAGIC && (ex).a_magic != OMAGIC && \ - (ex).a_magic != ZMAGIC) - -/* Address of the bottom of the text segment. */ -#define N_TXTADDR(X) 0 - -/* Address of the bottom of the data segment. */ -#define N_DATADDR(ex) \ - (N_TXTADDR(ex) + ((ex).a_magic == OMAGIC ? (ex).a_text \ - : __LDPGSZ + ((ex).a_text - 1 & ~(__LDPGSZ - 1)))) - -/* Text segment offset. */ -#define N_TXTOFF(ex) \ - ((ex).a_magic == ZMAGIC ? __LDPGSZ : sizeof(struct exec)) - -/* Data segment offset. */ -#define N_DATOFF(ex) \ - (N_TXTOFF(ex) + ((ex).a_magic != ZMAGIC ? (ex).a_text : \ - __LDPGSZ + ((ex).a_text - 1 & ~(__LDPGSZ - 1)))) - -/* Symbol table offset. */ -#define N_SYMOFF(ex) \ - (N_TXTOFF(ex) + (ex).a_text + (ex).a_data + (ex).a_trsize + \ - (ex).a_drsize) - -/* String table offset. */ -#define N_STROFF(ex) (N_SYMOFF(ex) + (ex).a_syms) - -/* Description of the object file header (a.out format). */ -struct exec { -#define OMAGIC 0407 /* old impure format */ -#define NMAGIC 0410 /* read-only text */ -#define ZMAGIC 0413 /* demand load format */ -#define QMAGIC 0314 /* demand load format. Header in text. */ - unsigned int a_magic; /* magic number */ - - unsigned int a_text; /* text segment size */ - unsigned int a_data; /* initialized data size */ - unsigned int a_bss; /* uninitialized data size */ - unsigned int a_syms; /* symbol table size */ - unsigned int a_entry; /* entry point */ - unsigned int a_trsize; /* text relocation size */ - unsigned int a_drsize; /* data relocation size */ -}; - -#endif /* BSD_KERNEL_PRIVATE */ - -#endif /* _BSD_PPC_EXEC_H_ */ - diff --git a/bsd/ppc/fasttrap_isa.h b/bsd/ppc/fasttrap_isa.h deleted file mode 100644 index b4a2cb4c2..000000000 --- a/bsd/ppc/fasttrap_isa.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2007 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. 
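The deleted exec.h describes the classic a.out layout: for ZMAGIC (demand-load) images the header lives inside the first __LDPGSZ-byte text page, so N_TXTOFF is __LDPGSZ rather than sizeof(struct exec), and the data, relocation, symbol, and string tables follow contiguously. A sketch reproducing the N_SYMOFF arithmetic with the header's own struct and magic values; the sample numbers in main are made up:

```c
#include <stdio.h>

/* struct exec and magic values as in the deleted header. */
#define __LDPGSZ 4096
#define OMAGIC 0407   /* old impure format */
#define NMAGIC 0410   /* read-only text    */
#define ZMAGIC 0413   /* demand load       */

struct exec {
    unsigned int a_magic, a_text, a_data, a_bss,
                 a_syms, a_entry, a_trsize, a_drsize;
};

/* Symbol-table file offset, composed exactly as N_SYMOFF does:
 * text offset, then text, data, and both relocation tables. */
static unsigned int symoff(const struct exec *ex)
{
    unsigned int txtoff = ex->a_magic == ZMAGIC ? __LDPGSZ
                                                : sizeof(struct exec);
    return txtoff + ex->a_text + ex->a_data + ex->a_trsize + ex->a_drsize;
}

int main(void)
{
    struct exec ex = { ZMAGIC, 8192, 4096, 0, 512, 0, 0, 0 };
    printf("symbols at offset %u\n", symoff(&ex)); /* 4096+8192+4096 */
    return 0;
}
```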
- * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _FASTTRAP_ISA_H -#define _FASTTRAP_ISA_H - -/* #pragma ident "@(#)fasttrap_isa.h 1.4 05/06/08 SMI" */ - -#include <sys/types.h> -#if defined(__APPLE__) -#include <stdint.h> -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -typedef uint32_t fasttrap_instr_t; - -typedef struct fasttrap_machtp { - fasttrap_instr_t ftmt_instr; /* Original instruction */ - int32_t ftmt_trgt; /* Offset or absolute address */ - uint8_t ftmt_type; /* Emulation function type */ -#define ftmtNOP 0 -#define ftmtCommon 1 -#define ftmtB 2 -#define ftmtBC 3 -#define ftmtBLR 4 -#define ftmtBCTR 5 - uint8_t ftmt_bo; /* Branch options */ - uint8_t ftmt_bi; /* Condition bit */ - uint8_t ftmt_flgs; /* Flags */ -#define ftmtAbs 2 -#define ftmtLink 1 -} fasttrap_machtp_t; - -#define ftt_instr ftt_mtp.ftmt_instr -#define ftt_trgt ftt_mtp.ftmt_trgt -#define ftt_type ftt_mtp.ftmt_type -#define ftt_bo ftt_mtp.ftmt_bo -#define ftt_bi ftt_mtp.ftmt_bi -#define ftt_flgs ftt_mtp.ftmt_flgs - -#define FASTTRAP_INSTR 0x0FFFDDDD -#define T_DTRACE_RET (0x2E * 4) - -#define FASTTRAP_RETURN_AFRAMES 7 -#define FASTTRAP_ENTRY_AFRAMES 7 -#define FASTTRAP_OFFSET_AFRAMES 6 - -#ifdef __cplusplus -} -#endif - -#endif /* _FASTTRAP_ISA_H */ diff --git a/bsd/ppc/limits.h b/bsd/ppc/limits.h deleted file mode 100644 index 8f7decbec..000000000 --- a/bsd/ppc/limits.h +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 1988, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)limits.h 8.3 (Berkeley) 1/4/94 - */ - -#ifndef _PPC_LIMITS_H_ -#define _PPC_LIMITS_H_ - -#include <sys/cdefs.h> -#include <ppc/_limits.h> - -#define CHAR_BIT 8 /* number of bits in a char */ -#define MB_LEN_MAX 6 /* Allow 31 bit UTF2 */ - -#if !defined(_ANSI_SOURCE) && (!defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE)) -#define CLK_TCK __DARWIN_CLK_TCK /* ticks per second */ -#endif /* !_ANSI_SOURCE && (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ - -/* - * According to ANSI (section 2.2.4.2), the values below must be usable by - * #if preprocessing directives. Additionally, the expression must have the - * same type as would an expression that is an object of the corresponding - * type converted according to the integral promotions. The subtraction for - * INT_MIN and LONG_MIN is so the value is not unsigned; 2147483648 is an - * unsigned int for 32-bit two's complement ANSI compilers (section 3.1.3.2). - * These numbers work for pcc as well. The UINT_MAX and ULONG_MAX values - * are written as hex so that GCC will be quiet about large integer constants. 
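That comment is worth unpacking: 2147483648 does not fit in int, so the literal -2147483648 is unary minus applied to a value that already has a wider (or, under C90 rules, unsigned) type, which is why the header spells the minima as (-2147483647-1). A quick demonstration, independent of the header; the sizes shown assume an LP64 C99 compiler:

```c
#include <stdio.h>

int main(void)
{
    /* (-2147483647 - 1) is computed entirely in int.  In -2147483648
     * the literal itself is too wide for int, so it takes a larger
     * type (long/long long in C99; unsigned long under C90 rules) and
     * only then is negated. */
    printf("%zu\n", sizeof(-2147483647 - 1));  /* 4: an int             */
    printf("%zu\n", sizeof(-2147483648));      /* 8 with a C99 compiler */
    return 0;
}
```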
- */ -#define SCHAR_MAX 127 /* min value for a signed char */ -#define SCHAR_MIN (-128) /* max value for a signed char */ - -#define UCHAR_MAX 255 /* max value for an unsigned char */ -#define CHAR_MAX 127 /* max value for a char */ -#define CHAR_MIN (-128) /* min value for a char */ - -#define USHRT_MAX 65535 /* max value for an unsigned short */ -#define SHRT_MAX 32767 /* max value for a short */ -#define SHRT_MIN (-32768) /* min value for a short */ - -#define UINT_MAX 0xffffffff /* max value for an unsigned int */ -#define INT_MAX 2147483647 /* max value for an int */ -#define INT_MIN (-2147483647-1) /* min value for an int */ - -#ifdef __LP64__ -#define ULONG_MAX 0xffffffffffffffffUL /* max unsigned long */ -#define LONG_MAX 0x7fffffffffffffffL /* max signed long */ -#define LONG_MIN (-0x7fffffffffffffffL-1) /* min signed long */ -#else /* !__LP64__ */ -#define ULONG_MAX 0xffffffffUL /* max unsigned long */ -#define LONG_MAX 2147483647L /* max signed long */ -#define LONG_MIN (-2147483647L-1) /* min signed long */ -#endif /* __LP64__ */ - -#define ULLONG_MAX 0xffffffffffffffffULL /* max unsigned long long */ -#define LLONG_MAX 0x7fffffffffffffffLL /* max signed long long */ -#define LLONG_MIN (-0x7fffffffffffffffLL-1) /* min signed long long */ - -#if !defined(_ANSI_SOURCE) -#ifdef __LP64__ -#define LONG_BIT 64 -#else /* !__LP64__ */ -#define LONG_BIT 32 -#endif /* __LP64__ */ -#define SSIZE_MAX LONG_MAX /* max value for a ssize_t */ -#define WORD_BIT 32 - -#if (!defined(_POSIX_C_SOURCE) && !defined(_XOPEN_SOURCE)) || defined(_DARWIN_C_SOURCE) -#define SIZE_T_MAX ULONG_MAX /* max value for a size_t */ - -#define UQUAD_MAX ULLONG_MAX -#define QUAD_MAX LLONG_MAX -#define QUAD_MIN LLONG_MIN - -#endif /* (!_POSIX_C_SOURCE && !_XOPEN_SOURCE) || _DARWIN_C_SOURCE */ -#endif /* !_ANSI_SOURCE */ - -#endif /* _PPC_LIMITS_H_ */ diff --git a/bsd/ppc/param.h b/bsd/ppc/param.h deleted file mode 100644 index a434e3c4c..000000000 --- a/bsd/ppc/param.h +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1993,1995 NeXT Computer, Inc. 
All Rights Reserved */ - -#ifndef _PPC_PARAM_H_ -#define _PPC_PARAM_H_ - -#include <ppc/_param.h> - -/* - * Round p (pointer or byte index) up to a correctly-aligned value for all - * data types (int, long, ...). The result is unsigned int and must be - * cast to any desired pointer type. - */ -#define ALIGNBYTES __DARWIN_ALIGNBYTES -#define ALIGN(p) __DARWIN_ALIGN(p) - -#define NBPG 4096 /* bytes/page */ -#define PGOFSET (NBPG-1) /* byte offset into page */ -#define PGSHIFT 12 /* LOG2(NBPG) */ - -#define NBSEG 0x40000000 /* bytes/segment (quadrant) */ -#define SEGOFSET (NBSEG-1) /* byte offset into segment */ -#define SEGSHIFT 30 /* LOG2(NBSEG) */ - -#define DEV_BSIZE 512 -#define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ -#define BLKDEV_IOSIZE 2048 -#define MAXPHYS (128 * 1024) /* max raw I/O transfer size */ - -#define STACK_GROWTH_UP 0 /* stack grows to lower addresses */ - -#define CLSIZE 1 -#define CLSIZELOG2 0 - -#define STACKSIZE 4 /* pages in kernel stack */ -#define UPAGES 0 /* total pages in u-area */ - /* red zone is beyond this */ - -/* - * Constants related to network buffer management. - * MCLBYTES must be no larger than CLBYTES (the software page size), and, - * on machines that exchange pages of input or output buffers with mbuf - * clusters (MAPPED_MBUFS), MCLBYTES must also be an integral multiple - * of the hardware page size. - */ -#define MSIZE 256 /* size of an mbuf */ -#define MCLBYTES 2048 /* large enough for ether MTU */ -#define MCLSHIFT 11 -#define MCLOFSET (MCLBYTES - 1) -#ifndef NMBCLUSTERS -#if GATEWAY -#define NMBCLUSTERS ((1024 * 1024) / MCLBYTES) /* cl map size: 1MB */ -#else -#define NMBCLUSTERS ((1024 * 1024) / MCLBYTES) - /* cl map size was 0.5MB when MSIZE was 128, now it's 1MB*/ -#endif -#endif - -/* pages ("clicks") (NBPG bytes) to disk blocks */ -#define ctod(x) ((x)<<(PGSHIFT-DEV_BSHIFT)) -#define dtoc(x) ((x)>>(PGSHIFT-DEV_BSHIFT)) -#define dtob(x) ((x)<<DEV_BSHIFT) - -/* pages to bytes */ -#define ctob(x) ((x)<<PGSHIFT) - -/* bytes to pages */ -#define btoc(x) (((unsigned)(x)+(PGOFSET))>>PGSHIFT) -#ifdef __APPLE__ -#define btodb(bytes, devBlockSize) \ - ((unsigned)(bytes) / devBlockSize) -#define dbtob(db, devBlockSize) \ - ((unsigned)(db) * devBlockSize) -#else -#define btodb(bytes) /* calculates (bytes / DEV_BSIZE) */ \ - ((unsigned)(bytes) >> DEV_BSHIFT) -#define dbtob(db) /* calculates (db * DEV_BSIZE) */ \ - ((unsigned)(db) << DEV_BSHIFT) -#endif - -/* - * Map a ``block device block'' to a file system block. - * This should be device dependent, and should use the bsize - * field from the disk label. - * For now though just use DEV_BSIZE. - */ -#define bdbtofsb(bn) ((bn) / (BLKDEV_IOSIZE/DEV_BSIZE)) - -/* from machdep/ppc/proc_reg.h */ -#ifdef __BIG_ENDIAN__ -#define ENDIAN_MASK(val,size) (1 << (size-1 - val)) -#else -#error code not ported to little endian targets yet -#endif /* __BIG_ENDIAN__ */ - -#ifndef MASK -#define MASK(PART) ENDIAN_MASK(PART ## _BIT, 32) -#endif - -#define MSR_EE_BIT 16 -#define MSR_PR_BIT 17 -#define USERMODE(msr) (msr & MASK(MSR_PR) ? TRUE : FALSE) -#define BASEPRI(msr) (msr & MASK(MSR_EE) ? 
TRUE : FALSE) -/* end of from proc_reg.h */ - -#if defined(KERNEL) || defined(STANDALONE) -#define DELAY(n) delay(n) -#else -#define DELAY(n) { register int N = (n); while (--N > 0); } -#endif /* defined(KERNEL) || defined(STANDALONE) */ - -#define NPIDS 16 /* maximum number of PIDs per process */ -#define NIOPIDS 8 /* maximum number of IO space PIDs */ - -#endif /* _PPC_PARAM_H_ */ diff --git a/bsd/ppc/profile.h b/bsd/ppc/profile.h deleted file mode 100644 index 7be38b3a9..000000000 --- a/bsd/ppc/profile.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1997, Apple Computer, Inc. All rights reserved. - * - */ - -#ifndef _BSD_PPC_PROFILE_H_ -#define _BSD_PPC_PROFILE_H_ - -#include <sys/appleapiopts.h> - -#ifdef KERNEL -#ifdef __APPLE_API_UNSTABLE -/* - * Block interrupts during mcount so that those interrupts can also be - * counted (as soon as we get done with the current counting). On the - * PPC platfom, can't do splhigh/splx as those are C routines and can - * recursively invoke mcount. - */ -extern unsigned long disable_ee(void); -extern void restore_ee(unsigned long smsr); - -#define MCOUNT_INIT register unsigned long smsr; - -#define MCOUNT_ENTER smsr = disable_ee(); - -#define MCOUNT_EXIT restore_ee(smsr); - -#endif /* __APPLE_API_UNSTABLE */ -#endif /* KERNEL */ - -#endif /* _BSD_PPC_PROFILE_H_ */ diff --git a/bsd/ppc/reboot.h b/bsd/ppc/reboot.h deleted file mode 100644 index 75e3a7656..000000000 --- a/bsd/ppc/reboot.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. 
The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _BSD_PPC_REBOOT_H_ -#define _BSD_PPC_REBOOT_H_ - -#include <sys/appleapiopts.h> - -/* - * Empty file (publicly) - */ -#ifdef BSD_KERNEL_PRIVATE -/* - * Use most significant 16 bits to avoid collisions with - * machine independent flags. - */ -#define RB_POWERDOWN 0x00010000 /* power down on halt */ -#define RB_NOBOOTRC 0x00020000 /* don't run '/etc/rc.boot' */ -#define RB_DEBUG 0x00040000 /* drop into mini monitor on panic */ -#define RB_EJECT 0x00080000 /* eject disks on halt */ -#define RB_COMMAND 0x00100000 /* new boot command specified */ -#define RB_NOFP 0x00200000 /* don't use floating point */ -#define RB_BOOTNEXT 0x00400000 /* reboot into NeXT */ -#define RB_BOOTDOS 0x00800000 /* reboot into DOS */ - - -#endif /* BSD_KERNEL_PRIVATE */ - -#endif /* _BSD_PPC_REBOOT_H_ */ - diff --git a/bsd/ppc/setjmp.h b/bsd/ppc/setjmp.h deleted file mode 100644 index 27eb59ab0..000000000 --- a/bsd/ppc/setjmp.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* Copyright (c) 1998 Apple Computer, Inc. All rights reserved. - * - * File: ppc/setjmp.h - * - * Declaration of setjmp routines and data structures. 
- */ -#ifndef _BSD_PPC_SETJMP_H_ -#define _BSD_PPC_SETJMP_H_ - -#include <sys/cdefs.h> - -#define __need_struct_sigcontext -#if defined(KERNEL) -#define __need_struct_sigcontext32 -#define __need_struct_sigcontext64 -#endif /* KERNEL */ -#include <ppc/_structs.h> - -struct _jmp_buf { -#if __DARWIN_UNIX03 - _STRUCT_SIGCONTEXT __sigcontext; /* kernel state preserved by set/longjmp */ - unsigned int __vmask __attribute__((aligned(8))); /* vector mask register */ - unsigned int __vreg[32 * 4] __attribute__((aligned(16))); - /* 32 128-bit vector registers */ -#else /* !__DARWIN_UNIX03 */ - _STRUCT_SIGCONTEXT sigcontext; /* kernel state preserved by set/longjmp */ - unsigned int vmask __attribute__((aligned(8))); /* vector mask register */ - unsigned int vreg[32 * 4] __attribute__((aligned(16))); - /* 32 128-bit vector registers */ -#endif /* __DARWIN_UNIX03 */ -}; - -/* - * _JBLEN is number of ints required to save the following: - * r1, r2, r13-r31, lr, cr, ctr, xer, sig == 26 register_t sized - * fr14 - fr31 = 18 doubles - * vmask, 32 vector registers = 129 ints - * 2 ints to get all the elements aligned - * - * register_t is 2 ints for ppc64 threads - */ -#define _JBLEN64 (26*2 + 18*2 + 129 + 1) -#define _JBLEN32 (26 + 18*2 + 129 + 1) -#define _JBLEN_MAX _JBLEN64 - -/* - * Locally scoped sizes - */ -#if defined(__ppc64__) -#define _JBLEN _JBLEN64 -#else -#define _JBLEN _JBLEN32 -#endif - -#if defined(KERNEL) -typedef _STRUCT_SIGCONTEXT32 jmp_buf32[1]; -typedef struct __sigjmp_buf32 { - int __storage[_JBLEN32 + 1] __attribute__((aligned(8))); - } sigjmp_buf32[1]; - -typedef struct sigcontext64 jmp_buf64[1]; -typedef struct __sigjmp_buf64 { - int __storage[_JBLEN64 + 1] __attribute__((aligned(8))); - } sigjmp_buf64[1]; - -/* - * JMM - have to decide how the kernel will deal with this. - * For now, hard-code the 32-bit types. - */ -typedef _STRUCT_SIGCONTEXT32 jmp_buf[1]; -typedef struct __sigjmp_buf32 sigjmp_buf[1]; - -#else -typedef int jmp_buf[_JBLEN]; -typedef int sigjmp_buf[_JBLEN + 1]; -#endif - -__BEGIN_DECLS -int setjmp(jmp_buf); -void longjmp(jmp_buf, int); - -#ifndef _ANSI_SOURCE -int _setjmp(jmp_buf); -void _longjmp(jmp_buf, int); -int sigsetjmp(sigjmp_buf, int); -void siglongjmp(sigjmp_buf, int); -#endif /* _ANSI_SOURCE */ - -#if !defined(_ANSI_SOURCE) && (!defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE)) -void longjmperror(void); -#endif /* neither ANSI nor POSIX */ -__END_DECLS - -#endif /* !_BSD_PPC_SETJMP_H_ */ diff --git a/bsd/ppc/signal.h b/bsd/ppc/signal.h deleted file mode 100644 index 31af83a02..000000000 --- a/bsd/ppc/signal.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1992, 1993 NeXT Computer, Inc. - */ - -#ifndef _PPC_SIGNAL_H_ -#define _PPC_SIGNAL_H_ 1 - -#include <sys/cdefs.h> - -#ifndef _ANSI_SOURCE - -typedef int sig_atomic_t; - -#include <sys/appleapiopts.h> - -#ifdef __APPLE_API_OBSOLETE - -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) - -#define __need_struct_sigcontext -#define __need_struct_sigcontext32 -#define __need_struct_sigcontext64 -#include <ppc/_structs.h> - -/* - * Machine-dependant flags used in sigvec call. - */ -#define SV_SAVE_REGS 0x1000 /* Save all regs in sigcontext */ - -/* - * regs_saved_t -- Describes which registers beyond what the kernel cares - * about are saved to and restored from this sigcontext. - * - * The default is REGS_SAVED_CALLER, only the caller saved registers - * are saved. If the SV_SAVE_REGS flag was set when the signal - * handler was registered with sigvec() then all the registers will be - * saved in the sigcontext, and REGS_SAVED_ALL will be set. The C - * library uses REGS_SAVED_NONE in order to quickly restore kernel - * state during a longjmp(). - */ -typedef enum { - REGS_SAVED_NONE, /* Only kernel managed regs restored */ - REGS_SAVED_CALLER, /* "Caller saved" regs: rpc, a0-a7, - t0-t4, at, lk0-lk1, xt1-xt20, - xr0-xr1 */ - REGS_SAVED_ALL /* All registers */ -} regs_saved_t; - -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ - -#endif /* __APPLE_API_OBSOLETE */ - -#endif /* _ANSI_SOURCE */ - -#endif /* _PPC_SIGNAL_H_ */ - diff --git a/bsd/ppc/types.h b/bsd/ppc/types.h deleted file mode 100644 index 21265f8e0..000000000 --- a/bsd/ppc/types.h +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright 1995 NeXT Computer, Inc. 
All rights reserved. - */ -/* - * Copyright (c) 1990, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)types.h 8.3 (Berkeley) 1/5/94 - */ - -#ifndef _MACHTYPES_H_ -#define _MACHTYPES_H_ - -#ifndef __ASSEMBLER__ -#include <ppc/_types.h> -#include <sys/cdefs.h> -/* - * Basic integral types. Omit the typedef if - * not possible for a machine/compiler combination. - */ -#ifndef _INT8_T -#define _INT8_T -typedef __signed char int8_t; -#endif -typedef unsigned char u_int8_t; -#ifndef _INT16_T -#define _INT16_T -typedef short int16_t; -#endif -typedef unsigned short u_int16_t; -#ifndef _INT32_T -#define _INT32_T -typedef int int32_t; -#endif -typedef unsigned int u_int32_t; -#ifndef _INT64_T -#define _INT64_T -typedef long long int64_t; -#endif -typedef unsigned long long u_int64_t; - -#if __LP64__ -typedef int64_t register_t; -#else -typedef int32_t register_t; -#endif - -#ifndef _INTPTR_T -#define _INTPTR_T -typedef __darwin_intptr_t intptr_t; -#endif -#ifndef _UINTPTR_T -#define _UINTPTR_T -typedef unsigned long uintptr_t; -#endif - -#if !defined(_ANSI_SOURCE) && (!defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE)) -/* These types are used for reserving the largest possible size. */ -typedef u_int64_t user_addr_t; -typedef u_int64_t user_size_t; -typedef int64_t user_ssize_t; -typedef int64_t user_long_t; -typedef u_int64_t user_ulong_t; -typedef int64_t user_time_t; -typedef int64_t user_off_t; -#define USER_ADDR_NULL ((user_addr_t) 0) -#define CAST_USER_ADDR_T(a_ptr) ((user_addr_t)((uintptr_t)(a_ptr))) - -#ifdef KERNEL - -/* - * These types are used when you know the word size of the target - * user process. They can be used to create struct layouts independent - * of the types and alignment requirements of the current running - * kernel. 
- */ - -/* - * The default ABI for the 32-bit PowerPC userspace is called "Power" - * alignment, and aligns fundamental integral data types to their - * natural boundary, with a maximum alignment of 4, even for 8-byte - * quantites. Power alignment also pads a structure to 8-byte alignment - * if the first field is an 8-byte quantity, which is not handled by - * these typedefs. The default ABI for 64-bit PowerPC userspace is called - * "Natural" alignment, and aligns fundamental integral data types - * to their natural boundaries. - */ - -typedef __uint64_t user64_addr_t __attribute__((aligned(8))); -typedef __uint64_t user64_size_t __attribute__((aligned(8))); -typedef __int64_t user64_ssize_t __attribute__((aligned(8))); -typedef __int64_t user64_long_t __attribute__((aligned(8))); -typedef __uint64_t user64_ulong_t __attribute__((aligned(8))); -typedef __int64_t user64_time_t __attribute__((aligned(8))); -typedef __int64_t user64_off_t __attribute__((aligned(8))); - -typedef __uint32_t user32_addr_t; -typedef __uint32_t user32_size_t; -typedef __int32_t user32_ssize_t; -typedef __int32_t user32_long_t; -typedef __uint32_t user32_ulong_t; -typedef __int32_t user32_time_t; -typedef __int64_t user32_off_t __attribute__((aligned(4))); - -#endif /* KERNEL */ - -#endif /* !_ANSI_SOURCE && (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ - -/* This defines the size of syscall arguments after copying into the kernel: */ -typedef u_int64_t syscall_arg_t; - -#ifndef __offsetof -#define __offsetof(type, field) ((size_t)(&((type *)0)->field)) -#endif - -#endif /* __ASSEMBLER__ */ -#endif /* _MACHTYPES_H_ */ diff --git a/bsd/ppc/ucontext.h b/bsd/ppc/ucontext.h deleted file mode 100644 index 5c391c283..000000000 --- a/bsd/ppc/ucontext.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _PPC_UCONTEXT_H_ -#define _PPC_UCONTEXT_H_ - - -#include <mach/ppc/_types.h> - -#if !__DARWIN_UNIX03 -struct mcontext { - struct ppc_exception_state es; - struct ppc_thread_state ss; - struct ppc_float_state fs; - struct ppc_vector_state vs; -}; -#define PPC_MCONTEXT_SIZE (PPC_THREAD_STATE_COUNT + PPC_FLOAT_STATE_COUNT + PPC_EXCEPTION_STATE_COUNT + PPC_VECTOR_STATE_COUNT) * sizeof(int) -#else /* __DARWIN_UNIX03 */ -struct __darwin_mcontext { - struct __darwin_ppc_exception_state es; - struct __darwin_ppc_thread_state ss; - struct __darwin_ppc_float_state fs; - struct __darwin_ppc_vector_state vs; -}; -#endif /* __DARWIN_UNIX03 */ - -#ifndef _MCONTEXT_T -#define _MCONTEXT_T -typedef __darwin_mcontext_t mcontext_t; -#endif - -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -struct mcontext64 { - struct ppc_exception_state64 es; - struct ppc_thread_state64 ss; - struct ppc_float_state fs; - struct ppc_vector_state vs; -}; -#define PPC_MCONTEXT64_SIZE (PPC_THREAD_STATE64_COUNT + PPC_FLOAT_STATE_COUNT + PPC_EXCEPTION_STATE_COUNT + PPC_VECTOR_STATE_COUNT) * sizeof(int) - -#ifndef _MCONTEXT64_T -#define _MCONTEXT64_T -typedef struct mcontext64 * mcontext64_t; -#endif - -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ - -#endif /* _PPC_UCONTEXT_H_ */ diff --git a/bsd/ppc/vmparam.h b/bsd/ppc/vmparam.h deleted file mode 100644 index 8e682fcdf..000000000 --- a/bsd/ppc/vmparam.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _BSD_PPC_VMPARAM_H_ -#define _BSD_PPC_VMPARAM_H_ 1 - -#include <sys/resource.h> - -#define USRSTACK (0xc0000000) - -/* - * put the default 64-bit stack at the max address - * (minus one 32-bit address space for other incidentals) - */ -#define USRSTACK64 (0x00007FFF5FC00000ULL) - -/* - * Virtual memory related constants, all in bytes - */ -#ifndef DFLDSIZ -#define DFLDSIZ (RLIM_INFINITY) /* initial data size limit */ -// XXX Not enforced -//#define DFLDSIZ (6*1024*1024) /* initial data size limit */ -#endif -#ifndef MAXDSIZ -#define MAXDSIZ (RLIM_INFINITY) /* max data size */ -#endif -#ifndef DFLSSIZ -#define DFLSSIZ (8*1024*1024) /* initial stack size limit */ -#endif -#ifndef MAXSSIZ -#define MAXSSIZ (64*1024*1024) /* max stack size */ -#endif -#ifndef DFLCSIZ -#define DFLCSIZ (0) /* initial core size limit */ -#endif -#ifndef MAXCSIZ -#define MAXCSIZ (RLIM_INFINITY) /* max core size */ -#endif - -#endif /* _BSD_PPC_VMPARAM_H_ */ diff --git a/bsd/security/Makefile b/bsd/security/Makefile index b574d2956..92974f6e2 100644 --- a/bsd/security/Makefile +++ b/bsd/security/Makefile @@ -10,8 +10,6 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ audit -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ @@ -21,8 +19,6 @@ INSTINC_SUBDIRS_ARM = \ EXPINC_SUBDIRS = \ audit -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/security/audit/Makefile b/bsd/security/audit/Makefile index 660e7c155..ac552f60d 100644 --- a/bsd/security/audit/Makefile +++ b/bsd/security/audit/Makefile @@ -9,8 +9,6 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ INSTINC_SUBDIRS_X86_64 = \ @@ -19,8 +17,6 @@ INSTINC_SUBDIRS_ARM = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS_X86_64 = \ diff --git a/bsd/security/audit/audit.c b/bsd/security/audit/audit.c index c454867bf..1ee6c85cd 100644 --- a/bsd/security/audit/audit.c +++ b/bsd/security/audit/audit.c @@ -231,23 +231,25 @@ audit_record_ctor(proc_t p, struct kaudit_record *ar) ar->k_ar.ar_magic = AUDIT_RECORD_MAGIC; nanotime(&ar->k_ar.ar_starttime); - cred = kauth_cred_proc_ref(p); + if (PROC_NULL != p) { + cred = kauth_cred_proc_ref(p); - /* - * Export the subject credential. - */ - cru2x(cred, &ar->k_ar.ar_subj_cred); - ar->k_ar.ar_subj_ruid = cred->cr_ruid; - ar->k_ar.ar_subj_rgid = cred->cr_rgid; - ar->k_ar.ar_subj_egid = cred->cr_groups[0]; - ar->k_ar.ar_subj_pid = p->p_pid; - ar->k_ar.ar_subj_auid = cred->cr_audit.as_aia_p->ai_auid; - ar->k_ar.ar_subj_asid = cred->cr_audit.as_aia_p->ai_asid; - bcopy(&cred->cr_audit.as_mask, &ar->k_ar.ar_subj_amask, - sizeof(struct au_mask)); - bcopy(&cred->cr_audit.as_aia_p->ai_termid, &ar->k_ar.ar_subj_term_addr, - sizeof(struct au_tid_addr)); - kauth_cred_unref(&cred); + /* + * Export the subject credential. 
+		 */
+		cru2x(cred, &ar->k_ar.ar_subj_cred);
+		ar->k_ar.ar_subj_ruid = kauth_cred_getruid(cred);
+		ar->k_ar.ar_subj_rgid = kauth_cred_getrgid(cred);
+		ar->k_ar.ar_subj_egid = kauth_cred_getgid(cred);
+		ar->k_ar.ar_subj_pid = p->p_pid;
+		ar->k_ar.ar_subj_auid = cred->cr_audit.as_aia_p->ai_auid;
+		ar->k_ar.ar_subj_asid = cred->cr_audit.as_aia_p->ai_asid;
+		bcopy(&cred->cr_audit.as_mask, &ar->k_ar.ar_subj_amask,
+		    sizeof(struct au_mask));
+		bcopy(&cred->cr_audit.as_aia_p->ai_termid,
+		    &ar->k_ar.ar_subj_term_addr, sizeof(struct au_tid_addr));
+		kauth_cred_unref(&cred);
+	}
 }
 
 static void
@@ -311,6 +313,7 @@ audit_init(void)
 	audit_kinfo.ai_termid.at_type = AU_IPv4;
 	audit_kinfo.ai_termid.at_addr[0] = INADDR_ANY;
 
+	_audit_lck_grp_init();
 	mtx_init(&audit_mtx, "audit_mtx", NULL, MTX_DEF);
 	KINFO_LOCK_INIT();
 	cv_init(&audit_worker_cv, "audit_worker_cv");
@@ -353,7 +356,7 @@ audit_shutdown(void)
 /*
  * Return the current thread's audit record, if any.
  */
-__inline__ struct kaudit_record *
+struct kaudit_record *
 currecord(void)
 {
 
@@ -373,11 +376,24 @@ audit_new(int event, proc_t p, __unused struct uthread *uthread)
 {
 	struct kaudit_record *ar;
 	int no_record;
+	int audit_override;
 
+	/*
+	 * Session events are always audited, so override the
+	 * audit_suspended and audit_enabled flags for them.
+	 *
+	 * XXXss - This really needs to be a generalized call to a filter
+	 * interface so that other things that use the audit subsystem in
+	 * the future can simply be plugged in.
+	 */
+	audit_override = (AUE_SESSION_START == event ||
+	    AUE_SESSION_UPDATE == event || AUE_SESSION_END == event ||
+	    AUE_SESSION_CLOSE == event);
+
 	mtx_lock(&audit_mtx);
 	no_record = (audit_suspended || !audit_enabled);
 	mtx_unlock(&audit_mtx);
-	if (no_record)
+	if (!audit_override && no_record)
 		return (NULL);
 
 	/*
@@ -395,10 +411,13 @@ audit_new(int event, proc_t p, __unused struct uthread *uthread)
 	ar->k_ar.ar_event = event;
 
 #if CONFIG_MACF
-	if (audit_mac_new(p, ar) != 0) {
-		zfree(audit_record_zone, ar);
-		return (NULL);
-	}
+	if (PROC_NULL != p) {
+		if (audit_mac_new(p, ar) != 0) {
+			zfree(audit_record_zone, ar);
+			return (NULL);
+		}
+	} else
+		ar->k_ar.ar_mac_records = NULL;
 #endif
 
 	mtx_lock(&audit_mtx);
@@ -414,7 +433,8 @@ audit_free(struct kaudit_record *ar)
 	audit_record_dtor(ar);
 #if CONFIG_MACF
-	audit_mac_free(ar);
+	if (NULL != ar->k_ar.ar_mac_records)
+		audit_mac_free(ar);
 #endif
 	zfree(audit_record_zone, ar);
 }
@@ -427,6 +447,7 @@ audit_commit(struct kaudit_record *ar, int error, int retval)
 	au_id_t auid;
 	int sorf;
 	struct au_mask *aumask;
+	int audit_override;
 
 	if (ar == NULL)
 		return;
@@ -487,6 +508,17 @@ audit_commit(struct kaudit_record *ar, int error, int retval)
 	event = ar->k_ar.ar_event;
 	class = au_event_class(event);
 
+	/*
+	 * See if we need to override the audit_suspended and audit_enabled
+	 * flags.
+	 *
+	 * XXXss - This check needs to be generalized so new filters can
+	 * easily be added.
+ */ + audit_override = (AUE_SESSION_START == event || + AUE_SESSION_UPDATE == event || AUE_SESSION_END == event || + AUE_SESSION_CLOSE == event); + ar->k_ar_commit |= AR_COMMIT_KERNEL; if (au_preselect(event, class, aumask, sorf) != 0) ar->k_ar_commit |= AR_PRESELECT_TRAIL; @@ -494,7 +526,8 @@ audit_commit(struct kaudit_record *ar, int error, int retval) ar->k_ar_commit & AR_PRESELECT_TRAIL) != 0) ar->k_ar_commit |= AR_PRESELECT_PIPE; if ((ar->k_ar_commit & (AR_PRESELECT_TRAIL | AR_PRESELECT_PIPE | - AR_PRESELECT_USER_TRAIL | AR_PRESELECT_USER_PIPE)) == 0) { + AR_PRESELECT_USER_TRAIL | AR_PRESELECT_USER_PIPE | + AR_PRESELECT_FILTER)) == 0) { mtx_lock(&audit_mtx); audit_pre_q_len--; mtx_unlock(&audit_mtx); @@ -511,7 +544,7 @@ audit_commit(struct kaudit_record *ar, int error, int retval) * enabled should still be committed? */ mtx_lock(&audit_mtx); - if (audit_suspended || !audit_enabled) { + if (!audit_override && (audit_suspended || !audit_enabled)) { audit_pre_q_len--; mtx_unlock(&audit_mtx); audit_free(ar); diff --git a/bsd/security/audit/audit.h b/bsd/security/audit/audit.h index 5af1da795..d85139b2b 100644 --- a/bsd/security/audit/audit.h +++ b/bsd/security/audit/audit.h @@ -174,7 +174,7 @@ void audit_syscall_exit(int error, struct proc *proc, void audit_mach_syscall_enter(unsigned short audit_event); void audit_mach_syscall_exit(int retval, struct uthread *uthread); -extern struct auditinfo_addr audit_default_aia; +extern struct auditinfo_addr *audit_default_aia_p; /* * The remaining kernel functions are conditionally compiled in as they are @@ -262,20 +262,23 @@ typedef struct ucred *kauth_cred_t; void audit_session_ref(kauth_cred_t cred); void audit_session_unref(kauth_cred_t cred); -void audit_session_procnew(kauth_cred_t cred); -void audit_session_procexit(kauth_cred_t cred); +void audit_session_procnew(proc_t p); +void audit_session_procexit(proc_t p); int audit_session_spawnjoin(proc_t p, ipc_port_t port); +void audit_sdev_submit(au_id_t auid, au_asid_t asid, void *record, + u_int record_len); + /* * Audit session macros. 
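 *
 * A minimal usage sketch (illustrative only; aia_p naming follows the
 * kernel's convention for a credential's auditinfo_addr pointer):
 *
 *	auditinfo_addr_t *aia_p = cred->cr_audit.as_aia_p;
 *
 *	if (IS_VALID_SESSION(aia_p)) {
 *		AUDIT_SESSION_REF(cred);
 *		... use the session ...
 *		AUDIT_SESSION_UNREF(cred);
 *	}
 *
 * IS_VALID_SESSION() is false for both NULL and the default entry now
 * that audit_default_aia_p replaces &audit_default_aia.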
*/ -#define IS_VALID_SESSION(a) ((a) != NULL && (a) != &audit_default_aia) +#define IS_VALID_SESSION(a) ((a) != NULL && (a) != audit_default_aia_p) #define AUDIT_SESSION_REF(cred) audit_session_ref(cred) #define AUDIT_SESSION_UNREF(cred) audit_session_unref(cred) -#define AUDIT_SESSION_PROCNEW(cred) audit_session_procnew(cred) -#define AUDIT_SESSION_PROCEXIT(cred) audit_session_procexit(cred) +#define AUDIT_SESSION_PROCNEW(p) audit_session_procnew(p) +#define AUDIT_SESSION_PROCEXIT(p) audit_session_procexit(p) #if CONFIG_MACF /* @@ -292,8 +295,8 @@ extern au_event_t sys_au_event[]; #define AUDIT_RECORD() \ ((struct uthread*)get_bsdthread_info(current_thread()))->uu_ar -#ifndef AUDIT_USE_BUILDIN_EXPECT -#define AUDIT_USE_BUILDIN_EXPECT +#ifndef AUDIT_USE_BUILTIN_EXPECT +#define AUDIT_USE_BUILTIN_EXPECT #endif #ifdef AUDIT_USE_BUILTIN_EXPECT diff --git a/bsd/security/audit/audit_arg.c b/bsd/security/audit/audit_arg.c index 66792758f..eb6d5d434 100644 --- a/bsd/security/audit/audit_arg.c +++ b/bsd/security/audit/audit_arg.c @@ -308,10 +308,10 @@ audit_arg_process(struct kaudit_record *ar, proc_t p) ar->k_ar.ar_arg_asid = my_cred->cr_audit.as_aia_p->ai_asid; bcopy(&my_cred->cr_audit.as_aia_p->ai_termid, &ar->k_ar.ar_arg_termid_addr, sizeof(au_tid_addr_t)); - ar->k_ar.ar_arg_euid = my_cred->cr_uid; - ar->k_ar.ar_arg_egid = my_cred->cr_groups[0]; - ar->k_ar.ar_arg_ruid = my_cred->cr_ruid; - ar->k_ar.ar_arg_rgid = my_cred->cr_rgid; + ar->k_ar.ar_arg_euid = kauth_cred_getuid(my_cred); + ar->k_ar.ar_arg_egid = kauth_cred_getgid(my_cred); + ar->k_ar.ar_arg_ruid = kauth_cred_getruid(my_cred); + ar->k_ar.ar_arg_rgid = kauth_cred_getrgid(my_cred); kauth_cred_unref(&my_cred); ar->k_ar.ar_arg_pid = p->p_pid; ARG_SET_VALID(ar, ARG_AUID | ARG_EUID | ARG_EGID | ARG_RUID | diff --git a/bsd/security/audit/audit_bsd.c b/bsd/security/audit/audit_bsd.c index fdae0d79d..6f4d416c9 100644 --- a/bsd/security/audit/audit_bsd.c +++ b/bsd/security/audit/audit_bsd.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2008-2009 Apple Inc. + * Copyright (c) 2008-2010 Apple Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -59,6 +59,11 @@ struct mhdr { char mh_data[0]; }; +/* + * The lock group for the audit subsystem. 
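+ *
+ * Editor's note: lock groups exist for lock accounting and debugging,
+ * so one shared group suffices; this replaces the per-lock lck_grp_t
+ * previously embedded in each mtx/rwlock/slck/rlck and saves a
+ * lck_grp_alloc_init()/lck_grp_free() pair per lock. After
+ * _audit_lck_grp_init() runs, every allocation takes the form
+ * (illustrative):
+ *
+ *	mp->mtx_lock = lck_mtx_alloc_init(audit_lck_grp, LCK_ATTR_NULL);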
+ */ +static lck_grp_t *audit_lck_grp = NULL; + #define AUDIT_MHMAGIC 0x4D656C53 #if AUDIT_MALLOC_DEBUG @@ -174,28 +179,25 @@ _audit_malloc(size_t size, au_malloc_type_t *type, int flags, const char *fn) _audit_malloc(size_t size, au_malloc_type_t *type, int flags) #endif { - union { - struct mhdr hdr; - char mem[size + sizeof (struct mhdr)]; - } *mem; - size_t memsize = sizeof (*mem); + struct mhdr *hdr; + size_t memsize = sizeof (*hdr) + size; if (size == 0) return (NULL); if (flags & M_NOWAIT) { - mem = (void *)kalloc_noblock(memsize); + hdr = (void *)kalloc_noblock(memsize); } else { - mem = (void *)kalloc(memsize); - if (mem == NULL) + hdr = (void *)kalloc(memsize); + if (hdr == NULL) panic("_audit_malloc: kernel memory exhausted"); } - if (mem == NULL) + if (hdr == NULL) return (NULL); - mem->hdr.mh_size = memsize; - mem->hdr.mh_type = type; - mem->hdr.mh_magic = AUDIT_MHMAGIC; + hdr->mh_size = memsize; + hdr->mh_type = type; + hdr->mh_magic = AUDIT_MHMAGIC; if (flags & M_ZERO) - memset(mem->hdr.mh_data, 0, size); + memset(hdr->mh_data, 0, size); #if AUDIT_MALLOC_DEBUG if (type != NULL && type->mt_type < NUM_MALLOC_TYPES) { OSAddAtomic64(memsize, &type->mt_size); @@ -206,7 +208,7 @@ _audit_malloc(size_t size, au_malloc_type_t *type, int flags) audit_malloc_types[type->mt_type] = type; } #endif /* AUDIT_MALLOC_DEBUG */ - return (mem->hdr.mh_data); + return (hdr->mh_data); } /* @@ -316,15 +318,99 @@ _audit_cv_wait_sig(struct cv *cvp, lck_mtx_t *mp, const char *desc) } /* - * Simple recursive lock. + * BSD Mutexes. + */ +void +#if DIAGNOSTIC +_audit_mtx_init(struct mtx *mp, const char *lckname) +#else +_audit_mtx_init(struct mtx *mp, __unused const char *lckname) +#endif +{ + mp->mtx_lock = lck_mtx_alloc_init(audit_lck_grp, LCK_ATTR_NULL); + KASSERT(mp->mtx_lock != NULL, + ("_audit_mtx_init: Could not allocate a mutex.")); +#if DIAGNOSTIC + strlcpy(mp->mtx_name, lckname, AU_MAX_LCK_NAME); +#endif +} + +void +_audit_mtx_destroy(struct mtx *mp) +{ + + if (mp->mtx_lock) { + lck_mtx_free(mp->mtx_lock, audit_lck_grp); + mp->mtx_lock = NULL; + } +} + +/* + * BSD rw locks. */ void -_audit_rlck_init(struct rlck *lp, const char *grpname) +#if DIAGNOSTIC +_audit_rw_init(struct rwlock *lp, const char *lckname) +#else +_audit_rw_init(struct rwlock *lp, __unused const char *lckname) +#endif +{ + lp->rw_lock = lck_rw_alloc_init(audit_lck_grp, LCK_ATTR_NULL); + KASSERT(lp->rw_lock != NULL, + ("_audit_rw_init: Could not allocate a rw lock.")); +#if DIAGNOSTIC + strlcpy(lp->rw_name, lckname, AU_MAX_LCK_NAME); +#endif +} + +void +_audit_rw_destroy(struct rwlock *lp) +{ + + if (lp->rw_lock) { + lck_rw_free(lp->rw_lock, audit_lck_grp); + lp->rw_lock = NULL; + } +} +/* + * Wait on a condition variable in a continuation (i.e. yield kernel stack). + * A cv_signal or cv_broadcast on the same condition variable will cause + * the thread to be scheduled. + */ +int +_audit_cv_wait_continuation(struct cv *cvp, lck_mtx_t *mp, thread_continue_t function) { + int status = KERN_SUCCESS; + + cvp->cv_waiters++; + assert_wait(cvp, THREAD_UNINT); + lck_mtx_unlock(mp); + + status = thread_block(function); - lp->rl_grp = lck_grp_alloc_init(grpname, LCK_GRP_ATTR_NULL); - lp->rl_mtx = lck_mtx_alloc_init(lp->rl_grp, LCK_ATTR_NULL); + /* should not be reached, but just in case, re-lock */ + lck_mtx_lock(mp); + + return status; +} + +/* + * Simple recursive lock. 
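+ *
+ * Editor's sketch: the acquire and release paths are not in this hunk;
+ * a hypothetical acquire, assuming the usual owner-recursion pattern
+ * implied by the rl_thread and rl_recurse fields:
+ *
+ *	if (lp->rl_thread == current_thread())
+ *		lp->rl_recurse++;
+ *	else {
+ *		lck_mtx_lock(lp->rl_mtx);
+ *		lp->rl_thread = current_thread();
+ *		lp->rl_recurse = 1;
+ *	}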
+ */ +void +#if DIAGNOSTIC +_audit_rlck_init(struct rlck *lp, const char *lckname) +#else +_audit_rlck_init(struct rlck *lp, __unused const char *lckname) +#endif +{ + lp->rl_mtx = lck_mtx_alloc_init(audit_lck_grp, LCK_ATTR_NULL); + KASSERT(lp->rl_mtx != NULL, + ("_audit_rlck_init: Could not allocate a recursive lock.")); +#if DIAGNOSTIC + strlcpy(lp->rl_name, lckname, AU_MAX_LCK_NAME); +#endif lp->rl_thread = 0; lp->rl_recurse = 0; } @@ -368,12 +454,8 @@ _audit_rlck_destroy(struct rlck *lp) { if (lp->rl_mtx) { - lck_mtx_free(lp->rl_mtx, lp->rl_grp); - lp->rl_mtx = 0; - } - if (lp->rl_grp) { - lck_grp_free(lp->rl_grp); - lp->rl_grp = 0; + lck_mtx_free(lp->rl_mtx, audit_lck_grp); + lp->rl_mtx = NULL; } } @@ -397,12 +479,19 @@ _audit_rlck_assert(struct rlck *lp, u_int assert) * Simple sleep lock. */ void -_audit_slck_init(struct slck *lp, const char *grpname) +#if DIAGNOSTIC +_audit_slck_init(struct slck *lp, const char *lckname) +#else +_audit_slck_init(struct slck *lp, __unused const char *lckname) +#endif { - lp->sl_grp = lck_grp_alloc_init(grpname, LCK_GRP_ATTR_NULL); - lp->sl_mtx = lck_mtx_alloc_init(lp->sl_grp, LCK_ATTR_NULL); - + lp->sl_mtx = lck_mtx_alloc_init(audit_lck_grp, LCK_ATTR_NULL); + KASSERT(lp->sl_mtx != NULL, + ("_audit_slck_init: Could not allocate a sleep lock.")); +#if DIAGNOSTIC + strlcpy(lp->sl_name, lckname, AU_MAX_LCK_NAME); +#endif lp->sl_locked = 0; lp->sl_waiting = 0; } @@ -442,7 +531,7 @@ _audit_slck_unlock(struct slck *lp) lp->sl_waiting = 0; /* Wake up *all* sleeping threads. */ - thread_wakeup_prim((event_t) lp, /*1 thr*/ 0, THREAD_AWAKENED); + wakeup((event_t) lp); } lck_mtx_unlock(lp->sl_mtx); } @@ -482,12 +571,8 @@ _audit_slck_destroy(struct slck *lp) { if (lp->sl_mtx) { - lck_mtx_free(lp->sl_mtx, lp->sl_grp); - lp->sl_mtx = 0; - } - if (lp->sl_grp) { - lck_grp_free(lp->sl_grp); - lp->sl_grp = 0; + lck_mtx_free(lp->sl_mtx, audit_lck_grp); + lp->sl_mtx = NULL; } } @@ -545,6 +630,18 @@ _audit_ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps) return (rv); } +/* + * Initialize lock group for audit related locks/mutexes. + */ +void +_audit_lck_grp_init(void) +{ + audit_lck_grp = lck_grp_alloc_init("Audit", LCK_GRP_ATTR_NULL); + + KASSERT(audit_lck_grp != NULL, + ("audit_get_lck_grp: Could not allocate the audit lock group.")); +} + int audit_send_trigger(unsigned int trigger) { diff --git a/bsd/security/audit/audit_bsd.h b/bsd/security/audit/audit_bsd.h index 23b61a5df..72db99f35 100644 --- a/bsd/security/audit/audit_bsd.h +++ b/bsd/security/audit/audit_bsd.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2008, Apple Inc. + * Copyright (c) 2008-2009, Apple Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -55,6 +55,8 @@ #endif #endif /* DIAGNOSTIC */ +#define AU_MAX_LCK_NAME 32 + #if __DARWIN_BYTE_ORDER == __DARWIN_BIG_ENDIAN #define be16enc(p, d) *(p) = (d) #define be32enc(p, d) *(p) = (d) @@ -176,7 +178,9 @@ struct cv { */ struct mtx { lck_mtx_t *mtx_lock; - lck_grp_t *mtx_grp; +#if DIAGNOSTIC + char mtx_name[AU_MAX_LCK_NAME]; +#endif }; /* @@ -184,7 +188,9 @@ struct mtx { */ struct rwlock { lck_rw_t *rw_lock; - lck_grp_t *rw_grp; +#if DIAGNOSTIC + char rw_name[AU_MAX_LCK_NAME]; +#endif }; /* @@ -192,9 +198,11 @@ struct rwlock { */ struct slck { lck_mtx_t *sl_mtx; - lck_grp_t *sl_grp; int sl_locked; int sl_waiting; +#if DIAGNOSTIC + char sl_name[AU_MAX_LCK_NAME]; +#endif }; /* @@ -202,9 +210,11 @@ struct slck { */ struct rlck { lck_mtx_t *rl_mtx; - lck_grp_t *rl_grp; uint32_t rl_recurse; thread_t rl_thread; +#if DIAGNOSTIC + char rl_name[AU_MAX_LCK_NAME]; +#endif }; /* @@ -216,6 +226,8 @@ void _audit_cv_signal(struct cv *cvp); void _audit_cv_broadcast(struct cv *cvp); void _audit_cv_wait(struct cv *cvp, lck_mtx_t *mp, const char *desc); int _audit_cv_wait_sig(struct cv *cvp, lck_mtx_t *mp, const char *desc); +int _audit_cv_wait_continuation(struct cv *cvp, lck_mtx_t *mp, + thread_continue_t function); #define cv_init(cvp, desc) _audit_cv_init(cvp, desc) #define cv_destroy(cvp) _audit_cv_destroy(cvp) #define cv_signal(cvp) _audit_cv_signal(cvp) @@ -223,28 +235,20 @@ int _audit_cv_wait_sig(struct cv *cvp, lck_mtx_t *mp, const char *desc); #define cv_broadcastpri(cvp, pri) _audit_cv_broadcast(cvp) #define cv_wait(cvp, mp) _audit_cv_wait(cvp, (mp)->mtx_lock, #cvp) #define cv_wait_sig(cvp, mp) _audit_cv_wait_sig(cvp, (mp)->mtx_lock, #cvp) +#define cv_wait_continuation(cvp,mp,f) \ + _audit_cv_wait_continuation(cvp, (mp)->mtx_lock, f) /* * BSD Mutexes. */ -#define LOCK_MAX_NAME 64 -#define mtx_init(mp, name, type, opts) do { \ - (mp)->mtx_grp = lck_grp_alloc_init(name, LCK_GRP_ATTR_NULL); \ - (mp)->mtx_lock = lck_mtx_alloc_init((mp)->mtx_grp, \ - LCK_ATTR_NULL); \ -} while(0) -#define mtx_lock(mp) lck_mtx_lock((mp)->mtx_lock) -#define mtx_unlock(mp) lck_mtx_unlock((mp)->mtx_lock) -#define mtx_destroy(mp) do { \ - if ((mp)->mtx_lock) { \ - lck_mtx_free((mp)->mtx_lock, (mp)->mtx_grp); \ - (mp)->mtx_lock = 0; \ - } \ - if ((mp)->mtx_grp) { \ - lck_grp_free((mp)->mtx_grp); \ - (mp)->mtx_grp = 0; \ - } \ -} while (0) +void _audit_mtx_init(struct mtx *mp, const char *name); +void _audit_mtx_destroy(struct mtx *mp); +#define mtx_init(mp, name, type, opts) \ + _audit_mtx_init(mp, name) +#define mtx_lock(mp) lck_mtx_lock((mp)->mtx_lock) +#define mtx_unlock(mp) lck_mtx_unlock((mp)->mtx_lock) +#define mtx_destroy(mp) _audit_mtx_destroy(mp) +#define mtx_yield(mp) lck_mtx_yield((mp)->mtx_lock) /* * Sleep lock functions. @@ -277,25 +281,14 @@ void _audit_rlck_destroy(struct rlck *lp); /* * BSD rw locks. 
*/ -#define rw_init(lp, name) do { \ - (lp)->rw_grp = lck_grp_alloc_init(name, LCK_GRP_ATTR_NULL); \ - (lp)->rw_lock = lck_rw_alloc_init((lp)->rw_grp, \ - LCK_ATTR_NULL); \ -} while(0) +void _audit_rw_init(struct rwlock *lp, const char *name); +void _audit_rw_destroy(struct rwlock *lp); +#define rw_init(lp, name) _audit_rw_init(lp, name) #define rw_rlock(lp) lck_rw_lock_shared((lp)->rw_lock) #define rw_runlock(lp) lck_rw_unlock_shared((lp)->rw_lock) #define rw_wlock(lp) lck_rw_lock_exclusive((lp)->rw_lock) #define rw_wunlock(lp) lck_rw_unlock_exclusive((lp)->rw_lock) -#define rw_destroy(lp) do { \ - if ((lp)->rw_lock) { \ - lck_rw_free((lp)->rw_lock, (lp)->rw_grp); \ - (lp)->rw_lock = 0; \ - } \ - if ((lp)->rw_grp) { \ - lck_grp_free((lp)->rw_grp); \ - (lp)->rw_grp = 0; \ - } \ -} while (0) +#define rw_destroy(lp) _audit_rw_destroy(lp) #define MA_OWNED LCK_MTX_ASSERT_OWNED #define RA_LOCKED LCK_RW_ASSERT_HELD @@ -319,6 +312,11 @@ void _audit_rlck_destroy(struct rlck *lp); #define slck_assert(lp, wht) #endif /* DIAGNOSTIC */ +/* + * Synchronization initialization. + */ +void _audit_lck_grp_init(void); + /* * BSD (IPv6) event rate limiter. */ diff --git a/bsd/security/audit/audit_bsm.c b/bsd/security/audit/audit_bsm.c index 0ee35a074..6f665d890 100644 --- a/bsd/security/audit/audit_bsm.c +++ b/bsd/security/audit/audit_bsm.c @@ -1757,6 +1757,24 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau) } break; + case AUE_SESSION_START: + case AUE_SESSION_UPDATE: + case AUE_SESSION_END: + case AUE_SESSION_CLOSE: + if (ARG_IS_VALID(kar, ARG_VALUE64)) { + tok = au_to_arg64(1, "sflags", ar->ar_arg_value64); + kau_write(rec, tok); + } + if (ARG_IS_VALID(kar, ARG_AMASK)) { + tok = au_to_arg32(2, "am_success", + ar->ar_arg_amask.am_success); + kau_write(rec, tok); + tok = au_to_arg32(3, "am_failure", + ar->ar_arg_amask.am_failure); + kau_write(rec, tok); + } + break; + /************************ * Mach system calls * ************************/ @@ -1884,7 +1902,7 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau) } #if CONFIG_MACF - do { + if (NULL != ar->ar_mac_records) { /* Convert the audit data from the MAC policies */ struct mac_audit_record *mar; @@ -1913,7 +1931,7 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau) kau_write(rec, tok); } - } while (0); + } #endif kau_write(rec, subj_tok); diff --git a/bsd/security/audit/audit_ioctl.h b/bsd/security/audit/audit_ioctl.h index 806f8ae93..1059532b9 100644 --- a/bsd/security/audit/audit_ioctl.h +++ b/bsd/security/audit/audit_ioctl.h @@ -31,6 +31,7 @@ #define _SECURITY_AUDIT_AUDIT_IOCTL_H_ #define AUDITPIPE_IOBASE 'A' +#define AUDITSDEV_IOBASE 'S' /* * Data structures used for complex ioctl arguments. Do not change existing @@ -79,4 +80,28 @@ struct auditpipe_ioctl_preselect { #define AUDITPIPE_GET_DROPS _IOR(AUDITPIPE_IOBASE, 102, u_int64_t) #define AUDITPIPE_GET_TRUNCATES _IOR(AUDITPIPE_IOBASE, 103, u_int64_t) +/* + * Ioctls for the audit session device. + */ +#define AUDITSDEV_GET_QLEN _IOR(AUDITSDEV_IOBASE, 1, u_int) +#define AUDITSDEV_GET_QLIMIT _IOR(AUDITSDEV_IOBASE, 2, u_int) +#define AUDITSDEV_SET_QLIMIT _IOW(AUDITSDEV_IOBASE, 3, u_int) +#define AUDITSDEV_GET_QLIMIT_MIN _IOR(AUDITSDEV_IOBASE, 4, u_int) +#define AUDITSDEV_GET_QLIMIT_MAX _IOR(AUDITSDEV_IOBASE, 5, u_int) +#define AUDITSDEV_FLUSH _IO(AUDITSDEV_IOBASE, 6) +#define AUDITSDEV_GET_MAXDATA _IOR(AUDITSDEV_IOBASE, 7, u_int) + +/* + * Ioctls to retrieve and set the ALLSESSIONS flag in the audit session device. 
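+ *
+ * Editor's sketch of userland usage; the device path assumes the
+ * implementation publishes its clone device under the name
+ * "auditsessions", and error handling is elided:
+ *
+ *	u_int all = 1;
+ *	int fd = open("/dev/auditsessions", O_RDONLY);
+ *
+ *	if (ioctl(fd, AUDITSDEV_SET_ALLSESSIONS, &all) == 0) {
+ *		char buf[4096];
+ *		ssize_t n = read(fd, buf, sizeof(buf));
+ *		... each read() returns session BSM record data ...
+ *	}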
+ */ +#define AUDITSDEV_GET_ALLSESSIONS _IOR(AUDITSDEV_IOBASE, 100, u_int) +#define AUDITSDEV_SET_ALLSESSIONS _IOW(AUDITSDEV_IOBASE, 101, u_int) + +/* + * Ioctls to retrieve audit sessions device statistics. + */ +#define AUDITSDEV_GET_INSERTS _IOR(AUDITSDEV_IOBASE, 200, u_int64_t) +#define AUDITSDEV_GET_READS _IOR(AUDITSDEV_IOBASE, 201, u_int64_t) +#define AUDITSDEV_GET_DROPS _IOR(AUDITSDEV_IOBASE, 202, u_int64_t) + #endif /* _SECURITY_AUDIT_AUDIT_IOCTL_H_ */ diff --git a/bsd/security/audit/audit_private.h b/bsd/security/audit/audit_private.h index 803a2b936..aa26d7ede 100644 --- a/bsd/security/audit/audit_private.h +++ b/bsd/security/audit/audit_private.h @@ -113,6 +113,8 @@ extern au_class_t audit_kevent_mask; #define AR_PRESELECT_USER_TRAIL 0x00004000U #define AR_PRESELECT_USER_PIPE 0x00008000U +#define AR_PRESELECT_FILTER 0x00010000U + #define AR_DRAIN_QUEUE 0x80000000U /* @@ -171,6 +173,7 @@ union auditon_udata { int au_trigger; au_evclass_map_t au_evclass; au_mask_t au_mask; + au_asflgs_t au_flags; auditinfo_t au_auinfo; auditpinfo_t au_aupinfo; auditpinfo_addr_t au_aupinfo_addr; @@ -440,7 +443,7 @@ int audit_mac_syscall_exit(unsigned short code, struct uthread *uthread, * Audit Session. */ void audit_session_init(void); -int audit_session_setaia(proc_t p, auditinfo_addr_t *aia_p, int newprocess); +int audit_session_setaia(proc_t p, auditinfo_addr_t *aia_p); auditinfo_addr_t *audit_session_update(auditinfo_addr_t *new_aia); int audit_session_lookup(au_asid_t asid, auditinfo_addr_t *ret_aia); diff --git a/bsd/security/audit/audit_session.c b/bsd/security/audit/audit_session.c index 8e05f9dcd..4b63e0082 100644 --- a/bsd/security/audit/audit_session.c +++ b/bsd/security/audit/audit_session.c @@ -27,46 +27,40 @@ * POSSIBILITY OF SUCH DAMAGE. */ +#include <stdarg.h> + #include <sys/kernel.h> -#include <sys/event.h> +#include <sys/fcntl.h> #include <sys/kauth.h> +#include <sys/conf.h> +#include <sys/poll.h> #include <sys/queue.h> +#include <sys/signalvar.h> #include <sys/syscall.h> #include <sys/sysent.h> #include <sys/sysproto.h> #include <sys/systm.h> #include <sys/ucred.h> +#include <sys/user.h> + +#include <miscfs/devfs/devfs.h> #include <libkern/OSAtomic.h> #include <bsm/audit.h> +#include <bsm/audit_internal.h> +#include <bsm/audit_kevents.h> + #include <security/audit/audit.h> #include <security/audit/audit_bsd.h> +#include <security/audit/audit_ioctl.h> #include <security/audit/audit_private.h> #include <vm/vm_protos.h> +#include <mach/mach_port.h> #include <kern/audit_sessionport.h> -kern_return_t ipc_object_copyin(ipc_space_t, mach_port_name_t, - mach_msg_type_name_t, ipc_port_t *); -void ipc_port_release_send(ipc_port_t); - -/* - * The default auditinfo_addr entry for ucred. - */ -struct auditinfo_addr audit_default_aia = { - .ai_auid = AU_DEFAUDITID, - .ai_asid = AU_DEFAUDITSID, - .ai_termid = { .at_type = AU_IPv4, }, -}; - -#if CONFIG_AUDIT - -/* - * Currently the hash table is a fixed size. - */ -#define HASH_TABLE_SIZE 97 -#define HASH_ASID(asid) (audit_session_hash(asid) % HASH_TABLE_SIZE) +#include <libkern/OSDebug.h> /* * Audit Session Entry. This is treated as an object with public and private @@ -84,119 +78,397 @@ struct au_sentry { long se_refcnt; /* Reference count. */ long se_procnt; /* Processes in session. */ ipc_port_t se_port; /* Session port. 
*/ - struct klist se_klist; /* Knotes for session */ - struct mtx se_klist_mtx; /* se_klist mutex */ LIST_ENTRY(au_sentry) se_link; /* Hash bucket link list (1) */ }; typedef struct au_sentry au_sentry_t; #define AU_SENTRY_PTR(aia_p) ((au_sentry_t *)(aia_p)) +/* + * The default au_sentry/auditinfo_addr entry for ucred. + */ + +static au_sentry_t audit_default_se = { + .se_auinfo = { + .ai_auid = AU_DEFAUDITID, + .ai_asid = AU_DEFAUDITSID, + .ai_termid = { .at_type = AU_IPv4, }, + }, + .se_refcnt = 1, + .se_procnt = 1, +}; + +struct auditinfo_addr *audit_default_aia_p = &audit_default_se.se_auinfo; + +kern_return_t ipc_object_copyin(ipc_space_t, mach_port_name_t, + mach_msg_type_name_t, ipc_port_t *); +void ipc_port_release_send(ipc_port_t); + +#if CONFIG_AUDIT + + +/* + * Currently the hash table is a fixed size. + */ +#define HASH_TABLE_SIZE 97 +#define HASH_ASID(asid) (audit_session_hash(asid) % HASH_TABLE_SIZE) + static struct rwlock se_entry_lck; /* (1) lock for se_link above */ LIST_HEAD(au_sentry_head, au_sentry); static struct au_sentry_head *au_sentry_bucket = NULL; +#define AU_HISTORY_LOGGING 0 +#if AU_HISTORY_LOGGING +typedef enum au_history_event { + AU_HISTORY_EVENT_UNKNOWN = 0, + AU_HISTORY_EVENT_REF = 1, + AU_HISTORY_EVENT_UNREF = 2, + AU_HISTORY_EVENT_BIRTH = 3, + AU_HISTORY_EVENT_DEATH = 4, + AU_HISTORY_EVENT_FIND = 5 +} au_history_event_t; + +#define AU_HISTORY_MAX_STACK_DEPTH 8 + +struct au_history { + struct au_sentry *ptr; + struct au_sentry se; + void *stack[AU_HISTORY_MAX_STACK_DEPTH]; + unsigned int stack_depth; + au_history_event_t event; +}; + +static struct au_history *au_history; +static size_t au_history_size = 65536; +static unsigned int au_history_index; + +static inline unsigned int +au_history_entries(void) +{ + if (au_history_index >= au_history_size) + return au_history_size; + else + return au_history_index; +} + +static inline void +au_history_record(au_sentry_t *se, au_history_event_t event) +{ + struct au_history *p; + unsigned int i; + + i = OSAddAtomic(1, &au_history_index); + p = &au_history[i % au_history_size]; + + bzero(p, sizeof(*p)); + p->event = event; + bcopy(se, &p->se, sizeof(p->se)); + p->stack_depth = OSBacktrace(&p->stack[0], AU_HISTORY_MAX_STACK_DEPTH); + p->ptr = se; +} +#else +#define au_history_record(se, event) do {} while (0) +#endif + +MALLOC_DEFINE(M_AU_SESSION, "audit_session", "Audit session data"); + +static void audit_ref_session(au_sentry_t *se); +static void audit_unref_session(au_sentry_t *se); + +static void audit_session_event(int event, auditinfo_addr_t *aia_p); + +/* + * Audit session device. + */ + +static MALLOC_DEFINE(M_AUDIT_SDEV, "audit_sdev", "Audit sdevs"); +static MALLOC_DEFINE(M_AUDIT_SDEV_ENTRY, "audit_sdevent", + "Audit sdev entries and buffers"); + +/* + * Default audit sdev buffer parameters. + */ +#define AUDIT_SDEV_QLIMIT_DEFAULT 128 +#define AUDIT_SDEV_QLIMIT_MIN 1 +#define AUDIT_SDEV_QLIMIT_MAX 1024 + /* - * Audit Propagation Knote List is a list of kevent knotes that are assosiated - * with an any ASID knote. If the any ASID gets modified or deleted these are - * modified or deleted as well. + * Entry structure. 
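+ *
+ * Editor's sketch: one entry wraps one completed BSM record queued to
+ * a sdev; a hypothetical enqueue, using the queue bookkeeping fields
+ * declared in struct audit_sdev below:
+ *
+ *	if (asdev->asdev_qlen >= asdev->asdev_qlimit) {
+ *		asdev->asdev_drops++;	(record is dropped, not queued)
+ *		return;
+ *	}
+ *	TAILQ_INSERT_TAIL(&asdev->asdev_queue, ase, ase_queue);
+ *	asdev->asdev_qlen++;
+ *	asdev->asdev_qbyteslen += ase->ase_record_len;
+ *	cv_broadcast(&asdev->asdev_cv);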
  */
-struct au_plist {
-	struct knote	*pl_knote;	/* ptr to per-session knote */
-	LIST_ENTRY(au_plist)	pl_link;	/* list link (2) */
+struct audit_sdev_entry {
+	void		*ase_record;
+	u_int		 ase_record_len;
+	TAILQ_ENTRY(audit_sdev_entry)	ase_queue;
 };
-typedef struct au_plist	au_plist_t;
 
-struct au_plisthead {
-	struct rlck	ph_rlck;	/* (2) lock for pl_link list */
-	LIST_HEAD(au_plhead, au_plist)	ph_head;	/* list head */
+/*
+ * Per audit sdev structure.
+ */
+
+struct audit_sdev {
+	int		asdev_open;
+
+#define	AUDIT_SDEV_ASYNC	0x00000001
+#define	AUDIT_SDEV_NBIO		0x00000002
+
+#define	AUDIT_SDEV_ALLSESSIONS	0x00010000
+	u_int		asdev_flags;
+
+	struct selinfo	asdev_selinfo;
+	pid_t		asdev_sigio;
+
+	au_id_t		asdev_auid;
+	au_asid_t	asdev_asid;
+
+	/* Per-sdev mutex for most fields in this struct. */
+	struct mtx	asdev_mtx;
+
+	/*
+	 * Per-sdev sleep lock serializing user-generated reads and
+	 * flushes.  uiomove() is called to copy out the current head
+	 * record's data while the record remains in the queue, so we
+	 * prevent other threads from removing it using this lock.
+	 */
+	struct slck	asdev_sx;
+
+	/*
+	 * Condition variable to signal when data has been delivered to
+	 * a sdev.
+	 */
+	struct cv	asdev_cv;
+
+	/* Count and bound of records in the queue. */
+	u_int		asdev_qlen;
+	u_int		asdev_qlimit;
+
+	/* The number of bytes of data across all records. */
+	u_int		asdev_qbyteslen;
+
+	/*
+	 * The amount read so far of the first record in the queue.
+	 * (The number of bytes available for reading in the queue is
+	 * qbyteslen - qoffset.)
+	 */
+	u_int		asdev_qoffset;
+
+	/*
+	 * Per-sdev operation statistics.
+	 */
+	u_int64_t	asdev_inserts;	/* Records added. */
+	u_int64_t	asdev_reads;	/* Records read. */
+	u_int64_t	asdev_drops;	/* Records dropped. */
+
+	/*
+	 * Current pending record list.  This is protected by a
+	 * combination of asdev_mtx and asdev_sx.  Note that both
+	 * locks are required to remove a record from the head of the
+	 * queue, as an in-progress read may sleep while copying and,
+	 * therefore, cannot hold asdev_mtx.
+	 */
+	TAILQ_HEAD(, audit_sdev_entry)	asdev_queue;
+
+	/* Global sdev list. */
+	TAILQ_ENTRY(audit_sdev)	asdev_list;
 };
-typedef struct au_plisthead	au_plisthead_t;
 
-#define	EV_ANY_ASID	EV_FLAG0
+#define	AUDIT_SDEV_LOCK(asdev)		mtx_lock(&(asdev)->asdev_mtx)
+#define	AUDIT_SDEV_LOCK_ASSERT(asdev)	mtx_assert(&(asdev)->asdev_mtx, \
+					    MA_OWNED)
+#define	AUDIT_SDEV_LOCK_DESTROY(asdev)	mtx_destroy(&(asdev)->asdev_mtx)
+#define	AUDIT_SDEV_LOCK_INIT(asdev)	mtx_init(&(asdev)->asdev_mtx, \
+					    "audit_sdev_mtx", NULL, MTX_DEF)
+#define	AUDIT_SDEV_UNLOCK(asdev)	mtx_unlock(&(asdev)->asdev_mtx)
+#define	AUDIT_SDEV_MTX(asdev)		(&(asdev)->asdev_mtx)
+
+#define	AUDIT_SDEV_SX_LOCK_DESTROY(asd)	slck_destroy(&(asd)->asdev_sx)
+#define	AUDIT_SDEV_SX_LOCK_INIT(asd)	slck_init(&(asd)->asdev_sx, \
+					    "audit_sdev_sx")
+#define	AUDIT_SDEV_SX_XLOCK_ASSERT(asd)	slck_assert(&(asd)->asdev_sx, \
+					    SA_XLOCKED)
+#define	AUDIT_SDEV_SX_XLOCK_SIG(asd)	slck_lock_sig(&(asd)->asdev_sx)
+#define	AUDIT_SDEV_SX_XUNLOCK(asd)	slck_unlock(&(asd)->asdev_sx)
 
-MALLOC_DEFINE(M_AU_SESSION, "audit_session", "Audit session data");
-MALLOC_DEFINE(M_AU_EV_PLIST, "audit_ev_plist", "Audit session event plist");
+/*
+ * Cloning variables and constants.
+ */
+#define	AUDIT_SDEV_NAME		"auditsessions"
+#define	MAX_AUDIT_SDEVS		32
+
+static int audit_sdev_major;
+static void *devnode;
+
+/*
+ * Global list of audit sdevs.  The list is protected by a rw lock.
+ * Individual record queues are protected by per-sdev locks.
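+ *
+ * Editor's sketch: delivery is expected to walk the list under the
+ * read lock and take each sdev's mutex only around the enqueue; a
+ * hypothetical outline of audit_sdev_submit():
+ *
+ *	AUDIT_SDEV_LIST_RLOCK();
+ *	TAILQ_FOREACH(asdev, &audit_sdev_list, asdev_list) {
+ *		AUDIT_SDEV_LOCK(asdev);
+ *		if (the record's auid/asid match, or ALLSESSIONS is set)
+ *			... queue a copy of the record ...
+ *		AUDIT_SDEV_UNLOCK(asdev);
+ *	}
+ *	AUDIT_SDEV_LIST_RUNLOCK();
+ *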
+ * These locks synchronize additions and removals of sdevs with the
+ * threads that walk the list to deliver records to individual sdevs.
+ */
+static TAILQ_HEAD(, audit_sdev) audit_sdev_list;
+static struct rwlock audit_sdev_lock;
+
+#define	AUDIT_SDEV_LIST_LOCK_INIT()	rw_init(&audit_sdev_lock, \
+					    "audit_sdev_list_lock")
+#define	AUDIT_SDEV_LIST_RLOCK()		rw_rlock(&audit_sdev_lock)
+#define	AUDIT_SDEV_LIST_RUNLOCK()	rw_runlock(&audit_sdev_lock)
+#define	AUDIT_SDEV_LIST_WLOCK()		rw_wlock(&audit_sdev_lock)
+#define	AUDIT_SDEV_LIST_WLOCK_ASSERT()	rw_assert(&audit_sdev_lock, \
+					    RA_WLOCKED)
+#define	AUDIT_SDEV_LIST_WUNLOCK()	rw_wunlock(&audit_sdev_lock)
+
+/*
+ * dev_t doesn't have a pointer for "softc" data so we have to keep track of
+ * it with the following global array (indexed by the minor number).
+ *
+ * XXX We may want to dynamically grow this as needed.
+ */
+static struct audit_sdev *audit_sdev_dtab[MAX_AUDIT_SDEVS];
 
 /*
- * Kevent filters.
+ * Special device methods and definition.
  */
-static int audit_filt_sessionattach(struct knote *kn);
-static void audit_filt_sessiondetach(struct knote *kn);
-static void audit_filt_sessiontouch(struct knote *kn,
-    struct kevent64_s *kev, long type);
-static int audit_filt_session(struct knote *kn, long hint);
-
-static void audit_register_kevents(uint32_t asid, uint32_t auid);
-
-struct filterops audit_session_filtops = {
-	.f_attach	= audit_filt_sessionattach,
-	.f_detach	= audit_filt_sessiondetach,
-	.f_touch	= audit_filt_sessiontouch,
-	.f_event	= audit_filt_session,
+static open_close_fcn_t	audit_sdev_open;
+static open_close_fcn_t	audit_sdev_close;
+static read_write_fcn_t	audit_sdev_read;
+static ioctl_fcn_t	audit_sdev_ioctl;
+static select_fcn_t	audit_sdev_poll;
+
+static struct cdevsw audit_sdev_cdevsw = {
+	.d_open =	audit_sdev_open,
+	.d_close =	audit_sdev_close,
+	.d_read =	audit_sdev_read,
+	.d_write =	eno_rdwrt,
+	.d_ioctl =	audit_sdev_ioctl,
+	.d_stop =	eno_stop,
+	.d_reset =	eno_reset,
+	.d_ttys =	NULL,
+	.d_select =	audit_sdev_poll,
+	.d_mmap =	eno_mmap,
+	.d_strategy =	eno_strat,
+	.d_type =	0
 };
 
 /*
- * The klist for consumers that are interested in any session (ASID). This list
- * is not associated with any data structure but is used for registering
- * new kevents when sessions are created. This klist is lock by
- * anyas_klist_mtx.
- */
-static struct klist anyas_klist;
-struct mtx anyas_klist_mtx;
-
-#define	AUDIT_ANYAS_KLIST_LOCK_INIT()	mtx_init(&anyas_klist_mtx, \
-	    "audit anyas_klist_mtx", NULL, MTX_DEF)
-#define	AUDIT_ANYAS_KLIST_LOCK()	mtx_lock(&anyas_klist_mtx)
-#define	AUDIT_ANYAS_KLIST_UNLOCK()	mtx_unlock(&anyas_klist_mtx)
-#define	AUDIT_ANYAS_KLIST_LOCK_ASSERT()	mtx_assert(&anyas_klist_mtx, MA_OWNED)
+ * Global statistics on audit sdevs.
+ */
+static int		audit_sdev_count;	/* Current number of sdevs. */
+static u_int64_t	audit_sdev_ever;	/* Sdevs ever allocated. */
+static u_int64_t	audit_sdev_records;	/* Total records seen. */
+static u_int64_t	audit_sdev_drops;	/* Global record drop count. */
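+/*
+ * Editor's sketch: audit_sdev_init() is expected to register the cdevsw
+ * above and publish the cloning device node; a hypothetical shape, in
+ * which audit_sdev_clone is an assumed minor-number allocator:
+ *
+ *	audit_sdev_major = cdevsw_add(-1, &audit_sdev_cdevsw);
+ *	devnode = devfs_make_node_clone(makedev(audit_sdev_major, 0),
+ *	    DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0644,
+ *	    audit_sdev_clone, AUDIT_SDEV_NAME, 0);
+ */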
-MALLOC_DEFINE(M_AU_SESSION, "audit_session", "Audit session data");
-MALLOC_DEFINE(M_AU_EV_PLIST, "audit_ev_plist", "Audit session event plist");
+/*
+ * Cloning variables and constants.
+ */
+#define	AUDIT_SDEV_NAME		"auditsessions"
+#define	MAX_AUDIT_SDEVS		32
+
+static int audit_sdev_major;
+static void *devnode;
+
+/*
+ * Global list of audit sdevs.  The list is protected by a rw lock.
+ * Individual record queues are protected by per-sdev locks.  These
+ * locks synchronize between threads walking the list to deliver to
+ * individual sdevs and adds/removes of sdevs.
+ */
+static TAILQ_HEAD(, audit_sdev)	audit_sdev_list;
+static struct rwlock		audit_sdev_lock;
+
+#define	AUDIT_SDEV_LIST_LOCK_INIT()	rw_init(&audit_sdev_lock, \
+					    "audit_sdev_list_lock")
+#define	AUDIT_SDEV_LIST_RLOCK()		rw_rlock(&audit_sdev_lock)
+#define	AUDIT_SDEV_LIST_RUNLOCK()	rw_runlock(&audit_sdev_lock)
+#define	AUDIT_SDEV_LIST_WLOCK()		rw_wlock(&audit_sdev_lock)
+#define	AUDIT_SDEV_LIST_WLOCK_ASSERT()	rw_assert(&audit_sdev_lock, \
+					    RA_WLOCKED)
+#define	AUDIT_SDEV_LIST_WUNLOCK()	rw_wunlock(&audit_sdev_lock)
+
+/*
+ * dev_t doesn't have a pointer for "softc" data so we have to keep track of
+ * it with the following global array (indexed by the minor number).
+ *
+ * XXX We may want to dynamically grow this as needed.
+ */
+static struct audit_sdev	*audit_sdev_dtab[MAX_AUDIT_SDEVS];

 /*
- * Kevent filters.
+ * Special device methods and definition.
  */
-static int audit_filt_sessionattach(struct knote *kn);
-static void audit_filt_sessiondetach(struct knote *kn);
-static void audit_filt_sessiontouch(struct knote *kn,
-    struct kevent64_s *kev, long type);
-static int audit_filt_session(struct knote *kn, long hint);
-
-static void audit_register_kevents(uint32_t asid, uint32_t auid);
-
-struct filterops audit_session_filtops = {
-	.f_attach	=	audit_filt_sessionattach,
-	.f_detach	=	audit_filt_sessiondetach,
-	.f_touch	=	audit_filt_sessiontouch,
-	.f_event	=	audit_filt_session,
+static open_close_fcn_t	audit_sdev_open;
+static open_close_fcn_t	audit_sdev_close;
+static read_write_fcn_t	audit_sdev_read;
+static ioctl_fcn_t	audit_sdev_ioctl;
+static select_fcn_t	audit_sdev_poll;
+
+static struct cdevsw audit_sdev_cdevsw = {
+	.d_open =	audit_sdev_open,
+	.d_close =	audit_sdev_close,
+	.d_read =	audit_sdev_read,
+	.d_write =	eno_rdwrt,
+	.d_ioctl =	audit_sdev_ioctl,
+	.d_stop =	eno_stop,
+	.d_reset =	eno_reset,
+	.d_ttys =	NULL,
+	.d_select =	audit_sdev_poll,
+	.d_mmap =	eno_mmap,
+	.d_strategy =	eno_strat,
+	.d_type =	0
 };

 /*
- * The klist for consumers that are interested in any session (ASID). This list
- * is not associated with any data structure but is used for registering
- * new kevents when sessions are created. This klist is lock by
- * anyas_klist_mtx.
- */
-static struct klist anyas_klist;
-struct mtx anyas_klist_mtx;
-
-#define	AUDIT_ANYAS_KLIST_LOCK_INIT()	mtx_init(&anyas_klist_mtx, \
-					    "audit anyas_klist_mtx", NULL, MTX_DEF)
-#define	AUDIT_ANYAS_KLIST_LOCK()	mtx_lock(&anyas_klist_mtx)
-#define	AUDIT_ANYAS_KLIST_UNLOCK()	mtx_unlock(&anyas_klist_mtx)
-#define	AUDIT_ANYAS_KLIST_LOCK_ASSERT()	mtx_assert(&anyas_klist_mtx, MA_OWNED)
+ * Global statistics on audit sdevs.
+ */
+static int		audit_sdev_count;	/* Current number of sdevs. */
+static u_int64_t	audit_sdev_ever;	/* Sdevs ever allocated. */
+static u_int64_t	audit_sdev_records;	/* Total records seen. */
+static u_int64_t	audit_sdev_drops;	/* Global record drop count. */
+
+static int audit_sdev_init(void);

 #define	AUDIT_SENTRY_RWLOCK_INIT()	rw_init(&se_entry_lck, \
-					    "audit se_entry_lck")
+					    "se_entry_lck")
 #define	AUDIT_SENTRY_RLOCK()		rw_rlock(&se_entry_lck)
 #define	AUDIT_SENTRY_WLOCK()		rw_wlock(&se_entry_lck)
 #define	AUDIT_SENTRY_RWLOCK_ASSERT()	rw_assert(&se_entry_lck, RA_LOCKED)
 #define	AUDIT_SENTRY_RUNLOCK()		rw_runlock(&se_entry_lck)
 #define	AUDIT_SENTRY_WUNLOCK()		rw_wunlock(&se_entry_lck)

-#define	AUDIT_SE_KLIST_LOCK_INIT(se, n)	mtx_init(&(se)->se_klist_mtx, \
-					    n, NULL, MTX_DEF)
-#define	AUDIT_SE_KLIST_LOCK(se)		mtx_lock(&(se)->se_klist_mtx)
-#define	AUDIT_SE_KLIST_UNLOCK(se)	mtx_unlock(&(se)->se_klist_mtx)
-#define	AUDIT_SE_KLIST_LOCK_DESTROY(se)	mtx_destroy(&(se)->se_klist_mtx)
-#define	AUDIT_SE_KLIST_LOCK_ASSERT(se)	mtx_assert(&(se)->se_klist_mtx, \
-					    MA_OWNED)
-
-#define	AUDIT_PLIST_LOCK_INIT(pl)	rlck_init(&(pl)->ph_rlck, \
-					    "audit ph_rlck")
-#define	AUDIT_PLIST_LOCK(pl)		rlck_lock(&(pl)->ph_rlck)
-#define	AUDIT_PLIST_UNLOCK(pl)		rlck_unlock(&(pl)->ph_rlck)
-#define	AUDIT_PLIST_LOCK_DESTROY(pl)	rlck_destroy(&(pl)->ph_rlck)
-
+/* Access control on the auditinfo_addr.ai_flags member. */
+static uint64_t audit_session_superuser_set_sflags_mask;
+static uint64_t audit_session_superuser_clear_sflags_mask;
+static uint64_t audit_session_member_set_sflags_mask;
+static uint64_t audit_session_member_clear_sflags_mask;
+SYSCTL_NODE(, OID_AUTO, audit, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Audit controls");
+SYSCTL_NODE(_audit, OID_AUTO, session, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Audit sessions");
+SYSCTL_QUAD(_audit_session, OID_AUTO, superuser_set_sflags_mask, CTLFLAG_RW | CTLFLAG_LOCKED,
+    &audit_session_superuser_set_sflags_mask,
+    "Audit session flags settable by superuser");
+SYSCTL_QUAD(_audit_session, OID_AUTO, superuser_clear_sflags_mask, CTLFLAG_RW | CTLFLAG_LOCKED,
+    &audit_session_superuser_clear_sflags_mask,
+    "Audit session flags clearable by superuser");
+SYSCTL_QUAD(_audit_session, OID_AUTO, member_set_sflags_mask, CTLFLAG_RW | CTLFLAG_LOCKED,
+    &audit_session_member_set_sflags_mask,
+    "Audit session flags settable by a session member");
+SYSCTL_QUAD(_audit_session, OID_AUTO, member_clear_sflags_mask, CTLFLAG_RW | CTLFLAG_LOCKED,
+    &audit_session_member_clear_sflags_mask,
+    "Audit session flags clearable by a session member");
+
+#define	AUDIT_SESSION_DEBUG	0
 #if AUDIT_SESSION_DEBUG
+/*
+ * The following is debugging code that can be used to get a snapshot of the
+ * session state.  The audit session information is read out using sysctl:
+ *
+ * error = sysctlbyname("kern.audit_session_debug", buffer_ptr, &buffer_len,
+ *		NULL, 0);
+ */
 #include <kern/kalloc.h>

+/*
+ * The per session record structure for the snapshot data.
+ */
 struct au_sentry_debug {
 	auditinfo_addr_t	se_auinfo;
-	long			se_refcnt;
-	long			se_procnt;
+	int64_t			se_refcnt;	/* reference count */
+	int64_t			se_procnt;	/* process count */
+	int64_t			se_ptcnt;	/* process count from
+						   proc table */
 };
 typedef struct au_sentry_debug au_sentry_debug_t;

 static int audit_sysctl_session_debug(struct sysctl_oid *oidp, void *arg1,
     int arg2, struct sysctl_req *req);

-SYSCTL_PROC(_kern, OID_AUTO, audit_session_debug, CTLFLAG_RD, NULL, 0,
-    audit_sysctl_session_debug, "S,audit_session_debug",
+SYSCTL_PROC(_kern, OID_AUTO, audit_session_debug, CTLFLAG_RD | CTLFLAG_LOCKED,
+    NULL, 0, audit_sysctl_session_debug, "S,audit_session_debug",
     "Current session debug info for auditing.");

 /*
- * Copy out the session debug info via the sysctl interface.  The userland code
- * is something like the following:
+ * Callouts for proc_iterate() which is used to reconcile the audit session
+ * proc state information with the proc table.  We get everything we need
+ * in the filterfn while the proc_lock() is held so we really don't need the
+ * callout() function.
+ */
+static int
+audit_session_debug_callout(__unused proc_t p, __unused void *arg)
+{
+
+	return (PROC_RETURNED_DONE);
+}
+
+static int
+audit_session_debug_filterfn(proc_t p, void *st)
+{
+	kauth_cred_t cred = p->p_ucred;
+	auditinfo_addr_t *aia_p = cred->cr_audit.as_aia_p;
+	au_sentry_debug_t *sed_tab = (au_sentry_debug_t *) st;
+	au_sentry_debug_t *sdtp;
+	au_sentry_t *se;
+
+	if (IS_VALID_SESSION(aia_p)) {
+		sdtp = &sed_tab[0];
+		do {
+			if (aia_p->ai_asid == sdtp->se_asid) {
+				sdtp->se_ptcnt++;
+
+				/* Do some sanity checks. */
+				se = AU_SENTRY_PTR(aia_p);
+				if (se->se_refcnt != sdtp->se_refcnt) {
+					sdtp->se_refcnt =
+					    (int64_t)se->se_refcnt;
+				}
+				if (se->se_procnt != sdtp->se_procnt) {
+					sdtp->se_procnt =
+					    (int64_t)se->se_procnt;
+				}
+				break;
+			}
+			sdtp++;
+		} while (sdtp->se_asid != 0 && sdtp->se_auid != 0);
+	} else {
+		/* add it to the default session */
+		sed_tab->se_ptcnt++;
+	}
+
+	return (0);
+}
+
+/*
+ * Copy out the session debug info via the sysctl interface.
  *
- * error = sysctlbyname("kern.audit_session_debug", buffer_ptr, &buffer_len,
- *		NULL, 0);
  */
 static int
 audit_sysctl_session_debug(__unused struct sysctl_oid *oidp,
@@ -223,6 +495,7 @@ audit_sysctl_session_debug(__unused struct sysctl_oid *oidp,
 			if (se != NULL)
 				entry_cnt++;

+	entry_cnt++;	/* add one for the default entry */
 	/*
 	 * If just querying then return the space required.  There is an
 	 * obvious race condition here so we just fudge this by 3 in case
@@ -258,10 +531,18 @@ audit_sysctl_session_debug(__unused struct sysctl_oid *oidp,
 	 */
 	sz = 0;
 	next_sed = sed_tab;
+	/* add the first entry for processes not tracked in sessions. */
+	bcopy(audit_default_aia_p, &next_sed->se_auinfo, sizeof (au_sentry_t));
+	next_sed->se_refcnt = (int64_t)audit_default_se.se_refcnt;
+	next_sed->se_procnt = (int64_t)audit_default_se.se_procnt;
+	next_sed++;
+	sz += sizeof(au_sentry_debug_t);
 	for(i = 0; i < HASH_TABLE_SIZE; i++) {
 		LIST_FOREACH(se, &au_sentry_bucket[i], se_link) {
 			if (se != NULL) {
-				bcopy(se, next_sed, sizeof(next_sed));
+				next_sed->se_auinfo = se->se_auinfo;
+				next_sed->se_refcnt = (int64_t)se->se_refcnt;
+				next_sed->se_procnt = (int64_t)se->se_procnt;
 				next_sed++;
 				sz += sizeof(au_sentry_debug_t);
 			}
@@ -269,6 +550,12 @@ audit_sysctl_session_debug(__unused struct sysctl_oid *oidp,
 	}
 	AUDIT_SENTRY_RUNLOCK();

+	/* Reconcile with the process table. */
+	(void) proc_iterate(PROC_ALLPROCLIST | PROC_ZOMBPROCLIST,
+	    audit_session_debug_callout, NULL,
+	    audit_session_debug_filterfn, (void *)&sed_tab[0]);
+
+	req->oldlen = sz;
 	err = SYSCTL_OUT(req, sed_tab, sz);
 	kfree(sed_tab, entry_cnt * sizeof(au_sentry_debug_t));

@@ -278,6 +565,65 @@ audit_sysctl_session_debug(__unused struct sysctl_oid *oidp,

 #endif /* AUDIT_SESSION_DEBUG */
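The debug sysctl above is only compiled in when AUDIT_SESSION_DEBUG is set. A sketch of the userland side (editorial, not part of the patch), assuming a local mirror of the kernel's au_sentry_debug layout; the sizing call should be repeated if many sessions appear between the two calls, since the kernel only pads its estimate by a few entries:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <bsm/audit.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct au_sentry_debug {
	auditinfo_addr_t se_auinfo;
	int64_t se_refcnt;
	int64_t se_procnt;
	int64_t se_ptcnt;
};

int
main(void)
{
	struct au_sentry_debug *tab;
	size_t len = 0, i, n;

	/* First call sizes the buffer, second call fills it. */
	if (sysctlbyname("kern.audit_session_debug", NULL, &len, NULL, 0) != 0)
		return (1);
	if ((tab = malloc(len)) == NULL ||
	    sysctlbyname("kern.audit_session_debug", tab, &len, NULL, 0) != 0)
		return (1);
	n = len / sizeof(*tab);
	for (i = 0; i < n; i++)
		printf("asid %d auid %d refcnt %lld procnt %lld ptcnt %lld\n",
		    (int)tab[i].se_auinfo.ai_asid,
		    (int)tab[i].se_auinfo.ai_auid,
		    (long long)tab[i].se_refcnt,
		    (long long)tab[i].se_procnt,
		    (long long)tab[i].se_ptcnt);
	free(tab);
	return (0);
}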
+ */ +static void +audit_session_event(int event, auditinfo_addr_t *aia_p) +{ + struct kaudit_record *ar; + + KASSERT(AUE_SESSION_START == event || AUE_SESSION_UPDATE == event || + AUE_SESSION_END == event || AUE_SESSION_CLOSE == event, + ("audit_session_event: invalid event: %d", event)); + + if (NULL == aia_p) + return; + + /* + * Create a new audit record. The record will contain the subject + * ruid, rgid, egid, pid, auid, asid, amask, and term_addr + * (implicitly added by audit_new). + */ + ar = audit_new(event, PROC_NULL, /* Not used */ NULL); + if (NULL == ar) + return; + + /* + * Audit session events are always generated because they are used + * by some userland consumers so just set the preselect flag. + */ + ar->k_ar_commit |= AR_PRESELECT_FILTER; + + /* + * Populate the subject information. Note that the ruid, rgid, + * egid, and pid values are incorrect. We only need the auditinfo_addr + * information. + */ + ar->k_ar.ar_subj_ruid = 0; + ar->k_ar.ar_subj_rgid = 0; + ar->k_ar.ar_subj_egid = 0; + ar->k_ar.ar_subj_pid = 0; + ar->k_ar.ar_subj_auid = aia_p->ai_auid; + ar->k_ar.ar_subj_asid = aia_p->ai_asid; + bcopy(&aia_p->ai_termid, &ar->k_ar.ar_subj_term_addr, + sizeof(struct au_tid_addr)); + + /* Add the audit masks to the record. */ + ar->k_ar.ar_arg_amask.am_success = aia_p->ai_mask.am_success; + ar->k_ar.ar_arg_amask.am_failure = aia_p->ai_mask.am_failure; + ARG_SET_VALID(ar, ARG_AMASK); + + /* Add the audit session flags to the record. */ + ar->k_ar.ar_arg_value64 = aia_p->ai_flags; + ARG_SET_VALID(ar, ARG_VALUE64); + + + /* Commit the record to the queue. */ + audit_commit(ar, 0, 0); +} + /* * Hash the audit session ID using a simple 32-bit mix. */ @@ -296,7 +642,8 @@ audit_session_hash(au_asid_t asid) /* * Do an hash lookup and find the session entry for a given ASID. Return NULL - * if not found. + * if not found. If the session is found then audit_session_find takes a + * reference. */ static au_sentry_t * audit_session_find(au_asid_t asid) @@ -309,23 +656,14 @@ audit_session_find(au_asid_t asid) hkey = HASH_ASID(asid); LIST_FOREACH(found_se, &au_sentry_bucket[hkey], se_link) - if (found_se->se_asid == asid) + if (found_se->se_asid == asid) { + au_history_record(found_se, AU_HISTORY_EVENT_FIND); + audit_ref_session(found_se); return (found_se); + } return (NULL); } -/* - * Call kqueue knote while holding the session entry klist lock. - */ -static void -audit_session_knote(au_sentry_t *se, long hint) -{ - - AUDIT_SE_KLIST_LOCK(se); - KNOTE(&se->se_klist, hint); - AUDIT_SE_KLIST_UNLOCK(se); -} - /* * Remove the given audit_session entry from the hash table. */ @@ -335,20 +673,35 @@ audit_session_remove(au_sentry_t *se) uint32_t hkey; au_sentry_t *found_se, *tmp_se; + au_history_record(se, AU_HISTORY_EVENT_DEATH); KASSERT(se->se_refcnt == 0, ("audit_session_remove: ref count != 0")); + KASSERT(se != &audit_default_se, + ("audit_session_remove: removing default session")); hkey = HASH_ASID(se->se_asid); AUDIT_SENTRY_WLOCK(); + /* + * Check and see if someone got a reference before we got the lock. + */ + if (se->se_refcnt != 0) { + AUDIT_SENTRY_WUNLOCK(); + return; + } + + audit_session_portdestroy(&se->se_port); LIST_FOREACH_SAFE(found_se, &au_sentry_bucket[hkey], se_link, tmp_se) { if (found_se == se) { - audit_session_knote(found_se, NOTE_AS_CLOSE); + /* + * Generate an audit event to notify userland of the + * session close. 
+ */ + audit_session_event(AUE_SESSION_CLOSE, + &found_se->se_auinfo); LIST_REMOVE(found_se, se_link); AUDIT_SENTRY_WUNLOCK(); - AUDIT_SE_KLIST_LOCK_DESTROY(found_se); - found_se->se_refcnt = 0; free(found_se, M_AU_SESSION); return; @@ -365,6 +718,11 @@ audit_ref_session(au_sentry_t *se) { long old_val; + if (se == NULL || se == &audit_default_se) + return; + + au_history_record(se, AU_HISTORY_EVENT_REF); + old_val = OSAddAtomicLong(1, &se->se_refcnt); KASSERT(old_val < 100000, ("audit_ref_session: Too many references on session.")); @@ -378,6 +736,11 @@ audit_unref_session(au_sentry_t *se) { long old_val; + if (se == NULL || se == &audit_default_se) + return; + + au_history_record(se, AU_HISTORY_EVENT_UNREF); + old_val = OSAddAtomicLong(-1, &se->se_refcnt); if (old_val == 1) audit_session_remove(se); @@ -393,6 +756,9 @@ audit_inc_procount(au_sentry_t *se) { long old_val; + if (se == NULL || se == &audit_default_se) + return; + old_val = OSAddAtomicLong(1, &se->se_procnt); KASSERT(old_val <= PID_MAX, ("audit_inc_procount: proc count > PID_MAX")); @@ -407,9 +773,16 @@ audit_dec_procount(au_sentry_t *se) { long old_val; + if (se == NULL || se == &audit_default_se) + return; + old_val = OSAddAtomicLong(-1, &se->se_procnt); + /* + * If this was the last process generate an audit event to notify + * userland of the session ending. + */ if (old_val == 1) - audit_session_knote(se, NOTE_AS_END); + audit_session_event(AUE_SESSION_END, &se->se_auinfo); KASSERT(old_val >= 1, ("audit_dec_procount: proc count < 0")); } @@ -426,7 +799,7 @@ audit_update_sentry(au_sentry_t *se, auditinfo_addr_t *new_aia) auditinfo_addr_t *aia = &se->se_auinfo; int update; - KASSERT(new_aia != &audit_default_aia, + KASSERT(new_aia != audit_default_aia_p, ("audit_update_sentry: Trying to update the default aia.")); update = (aia->ai_auid != new_aia->ai_auid || @@ -464,64 +837,66 @@ audit_session_nextid(void) * reference to the entry that must be unref'ed. */ static auditinfo_addr_t * -audit_session_new(auditinfo_addr_t *new_aia, int newprocess) +audit_session_new(auditinfo_addr_t *new_aia_p, auditinfo_addr_t *old_aia_p) { - au_asid_t asid; + au_asid_t new_asid; au_sentry_t *se = NULL; + au_sentry_t *found_se = NULL; auditinfo_addr_t *aia = NULL; - char nm[LOCK_MAX_NAME]; - KASSERT(new_aia != NULL, ("audit_session_new: new_aia == NULL")); + KASSERT(new_aia_p != NULL, ("audit_session_new: new_aia_p == NULL")); - asid = new_aia->ai_asid; + new_asid = new_aia_p->ai_asid; -#if 0 /* XXX this assertion is currently broken by securityd/LoginWindow */ - KASSERT((asid != AU_ASSIGN_ASID && asid <= PID_MAX), - ("audit_session_new: illegal ASID value: %d", asid)); -#endif - /* * Alloc a new session entry now so we don't wait holding the lock. */ se = malloc(sizeof(au_sentry_t), M_AU_SESSION, M_WAITOK | M_ZERO); - snprintf(nm, sizeof(nm), "audit se_klist_mtx %d", asid); - AUDIT_SE_KLIST_LOCK_INIT(se, nm); - /* * Find an unique session ID, if desired. */ AUDIT_SENTRY_WLOCK(); - if (asid == AU_ASSIGN_ASID) { + if (new_asid == AU_ASSIGN_ASID) { do { - asid = (au_asid_t)audit_session_nextid(); - } while(audit_session_find(asid) != NULL); + + new_asid = (au_asid_t)audit_session_nextid(); + found_se = audit_session_find(new_asid); + + /* + * If the session ID is currently active then drop the + * reference and try again. + */ + if (found_se != NULL) + audit_unref_session(found_se); + else + break; + } while(1); } else { - au_sentry_t *found_se = NULL; /* * Check to see if the requested ASID is already in the * hash table. 
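The find/ref/unref changes above implement a classic drop-to-zero pattern: audit_session_find() now hands back a referenced entry, and audit_session_remove() revalidates the reference count after taking the table write lock, since a finder may have resurrected the entry between the final unref and the removal. The shape of that race and its fix, reduced to C11 atomics and a pthread rwlock (editorial sketch; the kernel uses OSAddAtomicLong() and se_entry_lck):

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct sess { atomic_long refcnt; /* ... hash linkage ... */ };
static pthread_rwlock_t table_lck = PTHREAD_RWLOCK_INITIALIZER;

static void
sess_remove(struct sess *se)
{
	pthread_rwlock_wrlock(&table_lck);
	/*
	 * A finder may have taken a new reference between our decrement
	 * to zero and this point; if so, leave the entry alone.
	 */
	if (atomic_load(&se->refcnt) != 0) {
		pthread_rwlock_unlock(&table_lck);
		return;
	}
	/* ...unlink from the hash chain here... */
	pthread_rwlock_unlock(&table_lck);
	free(se);
}

void
sess_unref(struct sess *se)
{
	/* fetch_sub returns the old value, like OSAddAtomicLong(-1, ...). */
	if (atomic_fetch_sub(&se->refcnt, 1) == 1)
		sess_remove(se);
}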
@@ -426,7 +799,7 @@ audit_update_sentry(au_sentry_t *se, auditinfo_addr_t *new_aia)
 	auditinfo_addr_t *aia = &se->se_auinfo;
 	int update;

-	KASSERT(new_aia != &audit_default_aia,
+	KASSERT(new_aia != audit_default_aia_p,
 	    ("audit_update_sentry: Trying to update the default aia."));

 	update = (aia->ai_auid != new_aia->ai_auid ||
@@ -464,64 +837,66 @@ audit_session_nextid(void)
  * reference to the entry that must be unref'ed.
  */
 static auditinfo_addr_t *
-audit_session_new(auditinfo_addr_t *new_aia, int newprocess)
+audit_session_new(auditinfo_addr_t *new_aia_p, auditinfo_addr_t *old_aia_p)
 {
-	au_asid_t asid;
+	au_asid_t new_asid;
 	au_sentry_t *se = NULL;
+	au_sentry_t *found_se = NULL;
 	auditinfo_addr_t *aia = NULL;
-	char nm[LOCK_MAX_NAME];

-	KASSERT(new_aia != NULL, ("audit_session_new: new_aia == NULL"));
+	KASSERT(new_aia_p != NULL, ("audit_session_new: new_aia_p == NULL"));

-	asid = new_aia->ai_asid;
+	new_asid = new_aia_p->ai_asid;

-#if 0 /* XXX this assertion is currently broken by securityd/LoginWindow */
-	KASSERT((asid != AU_ASSIGN_ASID && asid <= PID_MAX),
-	    ("audit_session_new: illegal ASID value: %d", asid));
-#endif
-
 	/*
 	 * Alloc a new session entry now so we don't wait holding the lock.
 	 */
 	se = malloc(sizeof(au_sentry_t), M_AU_SESSION, M_WAITOK | M_ZERO);

-	snprintf(nm, sizeof(nm), "audit se_klist_mtx %d", asid);
-	AUDIT_SE_KLIST_LOCK_INIT(se, nm);
-
 	/*
 	 * Find an unique session ID, if desired.
 	 */
 	AUDIT_SENTRY_WLOCK();
-	if (asid == AU_ASSIGN_ASID) {
+	if (new_asid == AU_ASSIGN_ASID) {
 		do {
-			asid = (au_asid_t)audit_session_nextid();
-		} while(audit_session_find(asid) != NULL);
+
+			new_asid = (au_asid_t)audit_session_nextid();
+			found_se = audit_session_find(new_asid);
+
+			/*
+			 * If the session ID is currently active then drop the
+			 * reference and try again.
+			 */
+			if (found_se != NULL)
+				audit_unref_session(found_se);
+			else
+				break;
+		} while(1);
 	} else {
-		au_sentry_t *found_se = NULL;
-
 		/*
 		 * Check to see if the requested ASID is already in the
 		 * hash table.  If so, update it with the new auditinfo.
 		 */
-		if ((found_se = audit_session_find(asid)) != NULL) {
+		if ((found_se = audit_session_find(new_asid)) != NULL) {
 			int updated;

-			updated = audit_update_sentry(found_se, new_aia);
-			audit_ref_session(found_se);
+			updated = audit_update_sentry(found_se, new_aia_p);

 			AUDIT_SENTRY_WUNLOCK();
-			AUDIT_SE_KLIST_LOCK_DESTROY(se);
 			free(se, M_AU_SESSION);

-			if (updated)
-				audit_session_knote(found_se, NOTE_AS_UPDATE);
+			/* If a different session then add this process in. */
+			if (new_aia_p != old_aia_p)
+				audit_inc_procount(found_se);

 			/*
-			 * If this is a new process joining this session then
-			 * we need to update the proc count.
+			 * If the session information was updated then
+			 * generate an audit event to notify userland.
 			 */
-			if (newprocess)
-				audit_inc_procount(found_se);
+			if (updated)
+				audit_session_event(AUE_SESSION_UPDATE,
+				    &found_se->se_auinfo);

 			return (&found_se->se_auinfo);
 		}
@@ -539,25 +914,23 @@
 	 */
 	se->se_port = IPC_PORT_NULL;
 	aia = &se->se_auinfo;
-	aia->ai_asid = asid;
-	aia->ai_auid = new_aia->ai_auid;
-	bzero(&new_aia->ai_mask, sizeof(new_aia->ai_mask));
-	bcopy(&new_aia->ai_termid, &aia->ai_termid, sizeof(aia->ai_termid));
-	aia->ai_flags = new_aia->ai_flags;
+	aia->ai_asid = new_asid;
+	aia->ai_auid = new_aia_p->ai_auid;
+	bzero(&new_aia_p->ai_mask, sizeof(new_aia_p->ai_mask));
+	bcopy(&new_aia_p->ai_termid, &aia->ai_termid, sizeof(aia->ai_termid));
+	aia->ai_flags = new_aia_p->ai_flags;

 	/*
 	 * Add it to the hash table.
 	 */
-	LIST_INSERT_HEAD(&au_sentry_bucket[HASH_ASID(asid)], se, se_link);
+	LIST_INSERT_HEAD(&au_sentry_bucket[HASH_ASID(new_asid)], se, se_link);
 	AUDIT_SENTRY_WUNLOCK();

 	/*
-	 * Register kevents for consumers wanting events for any ASID
-	 * and knote the event.
+	 * Generate an audit event to notify userland of the new session.
 	 */
-	audit_register_kevents(se->se_asid, se->se_auid);
-	audit_session_knote(se, NOTE_AS_START);
-
+	audit_session_event(AUE_SESSION_START, aia);
+	au_history_record(se, AU_HISTORY_EVENT_BIRTH);
 	return (aia);
 }

@@ -577,13 +950,22 @@ audit_session_lookup(au_asid_t asid, auditinfo_addr_t *ret_aia)
 		AUDIT_SENTRY_RUNLOCK();
 		return (1);
 	}
+	/* We have a reference on the session so it is safe to drop the lock. */
+	AUDIT_SENTRY_RUNLOCK();
 	if (ret_aia != NULL)
 		bcopy(&se->se_auinfo, ret_aia, sizeof(*ret_aia));
-	AUDIT_SENTRY_RUNLOCK();
+	audit_unref_session(se);

 	return (0);
 }

+void
+audit_session_aiaref(auditinfo_addr_t *aia_p)
+{
+
+	audit_ref_session(AU_SENTRY_PTR(aia_p));
+}
+
 /*
  * Add a reference to the session entry.
  */
@@ -596,9 +978,13 @@ audit_session_ref(kauth_cred_t cred)
 	    ("audit_session_ref: Invalid kauth_cred."));

 	aia_p = cred->cr_audit.as_aia_p;
+	audit_session_aiaref(aia_p);
+}
+
+void audit_session_aiaunref(auditinfo_addr_t *aia_p)
+{
-
-	if (IS_VALID_SESSION(aia_p))
-		audit_ref_session(AU_SENTRY_PTR(aia_p));
+	audit_unref_session(AU_SENTRY_PTR(aia_p));
 }

 /*
@@ -613,14 +999,17 @@ audit_session_unref(kauth_cred_t cred)
 	    ("audit_session_unref: Invalid kauth_cred."));

 	aia_p = cred->cr_audit.as_aia_p;
-
-	if (IS_VALID_SESSION(aia_p))
-		audit_unref_session(AU_SENTRY_PTR(aia_p));
+	audit_session_aiaunref(aia_p);
 }

+/*
+ * Increment the per audit session process count.  Assumes that the caller has
+ * a reference on the process' cred.
+ */
 void
-audit_session_procnew(kauth_cred_t cred)
+audit_session_procnew(proc_t p)
 {
+	kauth_cred_t cred = p->p_ucred;
 	auditinfo_addr_t *aia_p;

 	KASSERT(IS_VALID_CRED(cred),
@@ -628,13 +1017,17 @@

 	aia_p = cred->cr_audit.as_aia_p;

-	if (IS_VALID_SESSION(aia_p))
-		audit_inc_procount(AU_SENTRY_PTR(aia_p));
+	audit_inc_procount(AU_SENTRY_PTR(aia_p));
 }

+/*
+ * Decrement the per audit session process count.  Assumes that the caller has
+ * a reference on the cred.
+ */
 void
-audit_session_procexit(kauth_cred_t cred)
+audit_session_procexit(proc_t p)
 {
+	kauth_cred_t cred = p->p_ucred;
 	auditinfo_addr_t *aia_p;

 	KASSERT(IS_VALID_CRED(cred),
@@ -642,8 +1035,7 @@

 	aia_p = cred->cr_audit.as_aia_p;

-	if (IS_VALID_SESSION(aia_p))
-		audit_dec_procount(AU_SENTRY_PTR(aia_p));
+	audit_dec_procount(AU_SENTRY_PTR(aia_p));
 }

 /*
@@ -658,450 +1050,109 @@ audit_session_init(void)
 	    ("audit_session_init: ASSIGNED_ASID_MAX is not large enough."));

 	AUDIT_SENTRY_RWLOCK_INIT();
-	AUDIT_ANYAS_KLIST_LOCK_INIT();

 	au_sentry_bucket = malloc( sizeof(struct au_sentry) *
 	    HASH_TABLE_SIZE, M_AU_SESSION, M_WAITOK | M_ZERO);

 	for (i = 0; i < HASH_TABLE_SIZE; i++)
 		LIST_INIT(&au_sentry_bucket[i]);
-}
-
-/*
- * Allocate a new kevent propagation list (plist).
- */
-static caddr_t
-audit_new_plist(void)
-{
-	au_plisthead_t *plhead;
-
-	plhead = malloc(sizeof(au_plisthead_t), M_AU_EV_PLIST, M_WAITOK |
-	    M_ZERO);
-
-	LIST_INIT(&plhead->ph_head);
-	AUDIT_PLIST_LOCK_INIT(plhead);

-	return ((caddr_t) plhead);
+	(void)audit_sdev_init();
+#if AU_HISTORY_LOGGING
+	au_history = malloc(sizeof(struct au_history) * au_history_size,
+	    M_AU_SESSION, M_WAITOK|M_ZERO);
+#endif
 }

-/*
- * Destroy a kevent propagation list (plist).  The anyas_klist_mtx mutex must be
- * held by the caller.
- */
-static void
-audit_destroy_plist(struct knote *anyas_kn)
+static int
+audit_session_update_check(kauth_cred_t cred, auditinfo_addr_t *old,
+    auditinfo_addr_t *new)
 {
-	au_plisthead_t *plhead;
-	au_plist_t *plentry, *ple_tmp;
-	struct kevent64_s kev;
-
-	KASSERT(anyas_kn != NULL, ("audit_destroy_plist: anyas = NULL"));
-	plhead = (au_plisthead_t *)anyas_kn->kn_hook;
-	KASSERT(plhead != NULL, ("audit_destroy_plist: plhead = NULL"));
-
-	/*
-	 * Delete everything in the propagation list.
+	uint64_t n;
+
+	/* If the current audit ID is not the default then it is immutable. */
+	if (old->ai_auid != AU_DEFAUDITID && old->ai_auid != new->ai_auid)
+		return (EINVAL);
+
+	/* If the current termid is not the default then it is immutable. */
+	if ((old->ai_termid.at_type != AU_IPv4 ||
+	    old->ai_termid.at_port != 0 ||
+	    old->ai_termid.at_addr[0] != 0) &&
+	    (old->ai_termid.at_port != new->ai_termid.at_port ||
+	    old->ai_termid.at_type != new->ai_termid.at_type ||
+	    0 != bcmp(&old->ai_termid.at_addr, &new->ai_termid.at_addr,
+	    sizeof (old->ai_termid.at_addr))))
+		return (EINVAL);
+
+	/* The flags may be set only according to the
+	 * audit_session_*_set_sflags_masks.
 	 */
-	AUDIT_PLIST_LOCK(plhead);
-	LIST_FOREACH_SAFE(plentry, &plhead->ph_head, pl_link, ple_tmp) {
-		struct kqueue *kq = plentry->pl_knote->kn_kq;
-
-		kev.ident = plentry->pl_knote->kn_id;
-		kev.filter = EVFILT_SESSION;
-		kev.flags = EV_DELETE;
-
-		/*
-		 * The plist entry gets removed in rm_from_plist() which is
-		 * called indirectly by kevent_register().
-		 */
-		kevent_register(kq, &kev, NULL);
-	}
-	AUDIT_PLIST_UNLOCK(plhead);
-
-	/*
-	 * Remove the head.
+	n = ~old->ai_flags & new->ai_flags;
+	if (0 != n &&
+	    !((n == (audit_session_superuser_set_sflags_mask & n) &&
+	    kauth_cred_issuser(cred)) ||
+	    (n == (audit_session_member_set_sflags_mask & n) &&
+	    old->ai_asid == new->ai_asid)))
+		return (EINVAL);
+
+	/* The flags may be cleared only according to the
+	 * audit_session_*_clear_sflags_masks.
 	 */
-	AUDIT_PLIST_LOCK_DESTROY(plhead);
-	free(plhead, M_AU_EV_PLIST);
+	n = ~new->ai_flags & old->ai_flags;
+	if (0 != n &&
+	    !((n == (audit_session_superuser_clear_sflags_mask & n) &&
+	    kauth_cred_issuser(cred)) ||
+	    (n == (audit_session_member_clear_sflags_mask & n) &&
+	    old->ai_asid == new->ai_asid)))
+		return (EINVAL);
+
+	/* The audit masks are mutable. */
+	return (0);
 }
 /*
- * Add a knote pointer entry to the kevent propagation list.
+ * Safely update kauth cred of the given process with the given new audit info.
  */
-static void
-audit_add_to_plist(struct knote *anyas_kn, struct knote *kn)
+int
+audit_session_setaia(proc_t p, auditinfo_addr_t *new_aia_p)
 {
-	au_plisthead_t *plhead;
-	au_plist_t *plentry;
-
-	KASSERT(anyas_kn != NULL, ("audit_add_to_plist: anyas = NULL"));
-	plhead = (au_plisthead_t *)anyas_kn->kn_hook;
-	KASSERT(plhead != NULL, ("audit_add_to_plist: plhead = NULL"));
+	kauth_cred_t my_cred, my_new_cred;
+	struct au_session as;
+	struct au_session tmp_as;
+	auditinfo_addr_t caia, *old_aia_p;
+	int ret;

-	plentry = malloc(sizeof(au_plist_t), M_AU_EV_PLIST, M_WAITOK | M_ZERO);
+	/*
+	 * If this is going to modify an existing session then do some
+	 * immutable checks.
+	 */
+	if (audit_session_lookup(new_aia_p->ai_asid, &caia) == 0) {
+		my_cred = kauth_cred_proc_ref(p);
+		ret = audit_session_update_check(my_cred, &caia, new_aia_p);
+		kauth_cred_unref(&my_cred);
+		if (ret)
+			return (ret);
+	}

-	plentry->pl_knote = kn;
-	AUDIT_PLIST_LOCK(plhead);
-	LIST_INSERT_HEAD(&plhead->ph_head, plentry, pl_link);
-	AUDIT_PLIST_UNLOCK(plhead);
-}
+	my_cred = kauth_cred_proc_ref(p);
+	bcopy(&new_aia_p->ai_mask, &as.as_mask, sizeof(as.as_mask));
+	old_aia_p = my_cred->cr_audit.as_aia_p;
+	/* audit_session_new() adds a reference on the session */
+	as.as_aia_p = audit_session_new(new_aia_p, old_aia_p);

-/*
- * Remote a knote pointer entry from the kevent propagation list.  The lock
- * on the plist may already be head (by audit_destroy_plist() above) so we use
- * a recursive lock.
- */
-static void
-audit_rm_from_plist(struct knote *kn)
-{
-	struct knote *anyas_kn;
-	au_plisthead_t *plhd;
-	au_plist_t *plentry, *ple_tmp;
-
-	KASSERT(kn != NULL, ("audit_rm_from_plist: kn = NULL"));
-	anyas_kn = (struct knote *)kn->kn_hook;
-	KASSERT(anyas_kn != NULL, ("audit_rm_to_plist: anyas = NULL"));
-	plhd = (au_plisthead_t *)anyas_kn->kn_hook;
-
-	AUDIT_PLIST_LOCK(plhd);
-	LIST_FOREACH_SAFE(plentry, &plhd->ph_head, pl_link, ple_tmp) {
-		if (plentry->pl_knote == kn) {
-			LIST_REMOVE(plentry, pl_link);
-			free(plentry, M_AU_EV_PLIST);
-			AUDIT_PLIST_UNLOCK(plhd);
-			return;
-		}
-	}
-	AUDIT_PLIST_UNLOCK(plhd);
-}
+	/* If the process left a session then update the process count. */
+	if (old_aia_p != new_aia_p)
+		audit_dec_procount(AU_SENTRY_PTR(old_aia_p));

-/*
- * The attach filter for EVFILT_SESSION.
- */
-static int
-audit_filt_sessionattach(struct knote *kn)
-{
-	au_sentry_t *se = NULL;

 	/*
-	 * Check flags for the events we currently support.
+	 * We are modifying the audit info in a credential so we need a new
+	 * credential (or take another reference on an existing credential that
+	 * matches our new one.  We must do this because the audit info in the
+	 * credential is used as part of our hash key.  Get current credential
+	 * in the target process and take a reference while we muck with it.
 	 */
-	if ((kn->kn_sfflags & (NOTE_AS_START | NOTE_AS_END | NOTE_AS_CLOSE
-	    | NOTE_AS_UPDATE | NOTE_AS_ERR)) == 0)
-		return (ENOTSUP);
-
-	/*
-	 * If the interest is in any session then add to the any ASID knote
-	 * list.  Otherwise, add it to the knote list assosiated with the
-	 * given session.
-	 */
-	if (kn->kn_id == AS_ANY_ASID) {
-
-		kn->kn_flags |= EV_CLEAR;
-		kn->kn_ptr.p_se = NULL;
-
-		/*
-		 * Attach a kevent propagation list for any kevents that get
-		 * added.
-		 */
-		kn->kn_hook = audit_new_plist();
-
-		AUDIT_ANYAS_KLIST_LOCK();
-		KNOTE_ATTACH(&anyas_klist, kn);
-		AUDIT_ANYAS_KLIST_UNLOCK();
-
-		return (0);
-	} else {
-
-		/*
-		 * NOTE: The anyas klist lock will be held in this
-		 * part of the code when indirectly called from
-		 * audit_register_kevents() below.
-		 */
-
-		/*
-		 * Check to make sure it is a valid ASID.
-		 */
-		if (kn->kn_id > ASSIGNED_ASID_MAX)
-			return (EINVAL);
-
-		AUDIT_SENTRY_RLOCK();
-		se = audit_session_find(kn->kn_id);
-		AUDIT_SENTRY_RUNLOCK();
-		if (se == NULL)
-			return (EINVAL);
-
-		AUDIT_SE_KLIST_LOCK(se);
-		kn->kn_flags |= EV_CLEAR;
-		kn->kn_ptr.p_se = se;
-
-		/*
-		 * If this attach is the result of an "any ASID" (pseudo)
-		 * kevent then attach the any session knote ptr to this knote.
-		 * Also, add this knote to the its propagation list.
-		 */
-		if (kn->kn_flags & EV_ANY_ASID) {
-			struct knote *anyas_kn =
-			    (struct knote *)((uintptr_t)kn->kn_kevent.ext[0]);
-			kn->kn_hook = (caddr_t) anyas_kn;
-			kn->kn_flags &= ~EV_ANY_ASID;
-			audit_add_to_plist(anyas_kn, kn);
-		} else
-			kn->kn_hook = NULL;
-		KNOTE_ATTACH(&se->se_klist, kn);
-		AUDIT_SE_KLIST_UNLOCK(se);
-
-		return (0);
-	}
-}
-
-/*
- * The detach filter for EVFILT_SESSION.
- */
-static void
-audit_filt_sessiondetach(struct knote *kn)
-{
-	au_sentry_t *se = NULL;
-
-	if (kn->kn_id == AS_ANY_ASID) {
-
-		AUDIT_ANYAS_KLIST_LOCK();
-		audit_destroy_plist(kn);
-		KNOTE_DETACH(&anyas_klist, kn);
-		AUDIT_ANYAS_KLIST_UNLOCK();
-
-	} else {
-		/*
-		 * If this knote was created by any ASID kevent then remove
-		 * from kevent propagation list.
-		 */
-		if (kn->kn_hook != NULL) {
-			audit_rm_from_plist(kn);
-			kn->kn_hook = NULL;
-		}
-
-		/*
-		 * Check to see if already detached.
-		 */
-		se = kn->kn_ptr.p_se;
-		if (se != NULL) {
-			AUDIT_SE_KLIST_LOCK(se);
-			kn->kn_ptr.p_se = NULL;
-			KNOTE_DETACH(&se->se_klist, kn);
-			AUDIT_SE_KLIST_UNLOCK(se);
-		}
-	}
-}
-
-/*
- * The touch filter for EVFILT_SESSION.  Check for any ASID kevent updates and
- * propagate the change.
- */
-static void
-audit_filt_sessiontouch(struct knote *kn, struct kevent64_s *kev, long type)
-{
-	struct knote *ple_kn;
-	struct kqueue *kq;
-	au_sentry_t *se;
-	au_plisthead_t *plhead;
-	au_plist_t *plentry;
-	struct kevent64_s newkev;
-
-	switch (type) {
-	case EVENT_REGISTER:
-		kn->kn_sfflags = kev->fflags;
-		kn->kn_sdata = kev->data;
-		/*
-		 * If an any ASID kevent was updated then we may need to
-		 * propagate the update.
-		 */
-		if (kev->ident == AS_ANY_ASID && kn->kn_hook != NULL) {
-
-			/*
-			 * Propagate the change to each of the session kevents
-			 * that were created by this any ASID kevent.
-			 */
-			plhead = (au_plisthead_t *)kn->kn_hook;
-			AUDIT_PLIST_LOCK(plhead);
-			LIST_FOREACH(plentry, &plhead->ph_head, pl_link) {
-
-				if ((ple_kn = plentry->pl_knote) == NULL)
-					continue;
-				if ((se = ple_kn->kn_ptr.p_se) == NULL)
-					continue;
-				if ((kq = ple_kn->kn_kq) == NULL)
-					continue;
-
-				newkev.ident = plentry->pl_knote->kn_id;
-				newkev.filter = EVFILT_SESSION;
-				newkev.flags = kev->flags;
-				newkev.fflags = kev->fflags;
-				newkev.data = kev->data;
-				newkev.udata = kev->udata;
-				kevent_register(kq, &newkev, NULL);
-			}
-			AUDIT_PLIST_UNLOCK(plhead);
-		}
-		break;
-
-	case EVENT_PROCESS:
-		*kev = kn->kn_kevent;
-		if (kn->kn_flags & EV_CLEAR) {
-			kn->kn_data = 0;
-			kn->kn_fflags = 0;
-		}
-		break;
-
-	default:
-		KASSERT((type == EVENT_REGISTER || type == EVENT_PROCESS),
-		    ("filt_sessiontouch(): invalid type (%ld)", type));
-		break;
-	}
-}
-
-/*
- * Event filter for EVFILT_SESSION.  The AUDIT_SE_KLIST_LOCK should be held
- * by audit_session_knote().
- */
-static int
-audit_filt_session(struct knote *kn, long hint)
-{
-	int events = (int)hint;
-	au_sentry_t *se = kn->kn_ptr.p_se;
-
-	if (hint != 0 && se != NULL) {
-
-		if (kn->kn_sfflags & events) {
-			kn->kn_fflags |= events;
-			kn->kn_data = se->se_auid;
-		}
-
-		/*
-		 * If this is the last possible event for the knote,
-		 * detach the knote from the audit session before the
-		 * session goes away.
-		 */
-		if (events & NOTE_AS_CLOSE) {
-
-			/*
-			 * If created by any ASID kevent then remove from
-			 * propagation list.
-			 */
-			if (kn->kn_hook != NULL) {
-				audit_rm_from_plist(kn);
-				kn->kn_hook = NULL;
-			}
-			kn->kn_flags |= (EV_EOF | EV_ONESHOT);
-			kn->kn_ptr.p_se = NULL;
-			AUDIT_SE_KLIST_LOCK_ASSERT(se);
-			KNOTE_DETACH(&se->se_klist, kn);
-
-			return (1);
-		}
-	}
-	return (kn->kn_fflags != 0);
-}
-
-/*
- * For all the consumers wanting events for all sessions, register new
- * kevents associated with the session for the given ASID.  The actual
- * attachment is done by the EVFILT_SESSION attach filter above.
- */
-static void
-audit_register_kevents(uint32_t asid, uint32_t auid)
-{
-	struct knote *kn;
-
-	AUDIT_ANYAS_KLIST_LOCK();
-	SLIST_FOREACH(kn, &anyas_klist, kn_selnext) {
-		struct kqueue *kq = kn->kn_kq;
-		struct kevent64_s kev;
-		int err;
-
-		kev.ident = asid;
-		kev.filter = EVFILT_SESSION;
-		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ANY_ASID;
-		kev.fflags = kn->kn_sfflags;
-		kev.data = auid;
-		kev.udata = kn->kn_kevent.udata;
-
-		/*
-		 * Save the knote ptr for this "any ASID" knote for the attach
-		 * filter.
-		 */
-		kev.ext[0] = (uint64_t)((uintptr_t)kn);
-
-		/*
-		 * XXX kevent_register() may block here alloc'ing a new knote.
-		 * We may want to think about using a lockless linked list or
-		 * at least a sleep rwlock for the anyas_klist.
-		 */
-		err = kevent_register(kq, &kev, NULL);
-		if (err)
-			kn->kn_fflags |= NOTE_AS_ERR;
-	}
-	AUDIT_ANYAS_KLIST_UNLOCK();
-}
-
-/*
- * Safely update kauth cred of the given process with new the given audit info.
- * If the newprocess flag is set then we need to account for this process in
- * the proc count.
- */
-int
-audit_session_setaia(proc_t p, auditinfo_addr_t *aia_p, int newprocess)
-{
-	kauth_cred_t my_cred, my_new_cred;
-	struct au_session as;
-	struct au_session tmp_as;
-	auditinfo_addr_t caia;
-
-	/*
-	 * If this is going to modify an existing session then do some
-	 * immutable checks.
-	 */
-	if (audit_session_lookup(aia_p->ai_asid, &caia) == 0) {
-
-		/*
-		 * If the current audit ID is not the default then it is
-		 * immutable.
-		 */
-		if (caia.ai_auid != AU_DEFAUDITID &&
-		    caia.ai_auid != aia_p->ai_auid)
-			return (EINVAL);
-
-		/*
-		 * If the current termid is not the default then it is
-		 * immutable.
-		 */
-		if ((caia.ai_termid.at_type != AU_IPv4 ||
-		    caia.ai_termid.at_port != 0 ||
-		    caia.ai_termid.at_addr[0] != 0) &&
-		    (caia.ai_termid.at_port != aia_p->ai_termid.at_port ||
-		    caia.ai_termid.at_type != aia_p->ai_termid.at_type ||
-		    bcmp(&caia.ai_termid.at_addr, &aia_p->ai_termid.at_addr,
-			sizeof (caia.ai_termid.at_addr) )) )
-			return (EINVAL);
-
-		/* The audit flags are immutable. */
-		if (caia.ai_flags != aia_p->ai_flags)
-			return (EINVAL);
-
-		/* The audit masks are mutable. */
-	}
-
-	my_cred = kauth_cred_proc_ref(p);
-	bcopy(&aia_p->ai_mask, &as.as_mask, sizeof(as.as_mask));
-	as.as_aia_p = audit_session_new(aia_p, newprocess);
-
-	/*
-	 * We are modifying the audit info in a credential so we need a new
-	 * credential (or take another reference on an existing credential that
-	 * matches our new one.  We must do this because the audit info in the
-	 * credential is used as part of our hash key.  Get current credential
-	 * in the target process and take a reference while we muck with it.
-	 */
-	for (;;) {
+	for (;;) {
 		/*
 		 * Set the credential with new info.  If there is no change,
@@ -1129,6 +1180,8 @@ audit_session_setaia(proc_t p, auditinfo_addr_t *aia_p, int newprocess)
 				continue;
 			}
 			p->p_ucred = my_new_cred;
+			/* update cred on proc */
+			PROC_UPDATE_CREDS_ONPROC(p);
 			proc_unlock(p);
 		}
 		/*
@@ -1137,11 +1190,11 @@
 		kauth_cred_unref(&my_cred);
 		break;
 	}
-	audit_session_unref(my_new_cred);

-	/*
-	 * Propagate the change from the process to the Mach task.
-	 */
+	/* Drop the reference taken by audit_session_new() above. */
+	audit_unref_session(AU_SENTRY_PTR(as.as_aia_p));
+
+	/* Propagate the change from the process to the Mach task. */
 	set_security_token(p);

 	return (0);
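The for (;;) loop above is the kernel's standard pattern for republishing an immutable, shared credential: build a replacement, then swap it in only if p_ucred is still the credential the replacement was derived from, retrying otherwise. A simplified userspace rendering of the same loop (editorial sketch; a pthread mutex stands in for the proc lock, and the helper names are hypothetical):

#include <pthread.h>

struct cred;
extern struct cred *cred_with_new_auditinfo(struct cred *);	/* hypothetical */
extern void cred_unref(struct cred *);				/* hypothetical */

static pthread_mutex_t cred_mtx = PTHREAD_MUTEX_INITIALIZER;

void
publish_cred(struct cred **pcred)
{
	for (;;) {
		struct cred *old = *pcred;
		struct cred *new = cred_with_new_auditinfo(old);

		if (new == old)
			break;			/* nothing to change */
		pthread_mutex_lock(&cred_mtx);
		if (*pcred == old) {		/* still the cred we started from? */
			*pcred = new;		/* publish the replacement */
			pthread_mutex_unlock(&cred_mtx);
			cred_unref(old);	/* drop the displaced reference */
			break;
		}
		pthread_mutex_unlock(&cred_mtx);
		cred_unref(new);		/* lost the race; retry */
	}
}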
@@ -1180,6 +1233,7 @@ audit_session_self(proc_t p, __unused struct audit_session_self_args *uap,
 	aia_p = cred->cr_audit.as_aia_p;
 	if (!IS_VALID_SESSION(aia_p)) {
+		/* Can't join the default session. */
 		err = EINVAL;
 		goto done;
 	}
@@ -1194,91 +1248,190 @@ audit_session_self(proc_t p, __unused struct audit_session_self_args *uap,
 	bcopy(&cred->cr_audit.as_mask, &se->se_mask,
 	    sizeof(se->se_mask));

-	if ((sendport = audit_session_mksend(aia_p, &se->se_port)) == NULL) {
-		/* failed to alloc new port */
-		err = ENOMEM;
-		goto done;
-	}
-
 	/*
-	 * This reference on the session is unref'ed in
-	 * audit_session_port_destory().  This reference is needed so the
-	 * session doesn't get dropped until the session join is done.
+	 * Get a send right to the session's Mach port and insert it in the
+	 * process' mach port namespace.
 	 */
-	audit_ref_session(se);
-
+	sendport = audit_session_mksend(aia_p, &se->se_port);
+	*ret_port = ipc_port_copyout_send(sendport, get_task_ipcspace(p->task));
 done:
 	if (cred != NULL)
 		kauth_cred_unref(&cred);
-	if (err == 0)
-		*ret_port = ipc_port_copyout_send(sendport,
-		    get_task_ipcspace(p->task));
-	else
+	if (err != 0)
 		*ret_port = MACH_PORT_NULL;
 	return (err);
 }

-void
-audit_session_portaiadestroy(struct auditinfo_addr *port_aia_p)
+/*
+ * audit_session_port  (system call)
+ *
+ * Description: Obtain a Mach send right for the given session ID.
+ *
+ * Parameters:	p		Process calling audit_session_port().
+ *		uap->asid	The target audit session ID.  The special
+ *				value -1 can be used to target the process's
+ *				own session.
+ *		uap->portnamep	User address at which to place port name.
+ *
+ * Returns:	0		Success
+ *		EINVAL		The calling process' session has not been set.
+ *		EINVAL		The given session ID could not be found.
+ *		EINVAL		The Mach port right could not be copied out.
+ *		ESRCH		Bad process, can't get valid cred for process.
+ *		EPERM		Only the superuser can reference sessions other
+ *				than the process's own.
+ *		ENOMEM		Port allocation failed due to no free memory.
+ */
+int
+audit_session_port(proc_t p, struct audit_session_port_args *uap,
+    __unused int *retval)
 {
-	au_sentry_t *se;
+	ipc_port_t sendport = IPC_PORT_NULL;
+	mach_port_name_t portname = MACH_PORT_NULL;
+	kauth_cred_t cred = NULL;
+	auditinfo_addr_t *aia_p = NULL;
+	au_sentry_t *se = NULL;
+	int err = 0;
+
+	/* Note: Currently this test will never be true, because
+	 * ASSIGNED_ASID_MAX is effectively (uint32_t)-2.
+	 */
+	if (uap->asid != -1 && (uint32_t)uap->asid > ASSIGNED_ASID_MAX) {
+		err = EINVAL;
+		goto done;
+	}

-	KASSERT(port_aia_p != NULL,
-	    ("audit_session_infodestroy: port_aia_p = NULL"));
+	cred = kauth_cred_proc_ref(p);
+	if (!IS_VALID_CRED(cred)) {
+		err = ESRCH;
+		goto done;
+	}
+	aia_p = cred->cr_audit.as_aia_p;

-	se = AU_SENTRY_PTR(port_aia_p);
+	/* Find the session corresponding to the requested audit
+	 * session ID.  If found, take a reference on it so that
+	 * the session is not dropped until the join is later done.
+	 */
+	if (uap->asid == (au_asid_t)-1 ||
+	    uap->asid == aia_p->ai_asid) {

+		if (!IS_VALID_SESSION(aia_p)) {
+			/* Can't join the default session. */
+			err = EINVAL;
+			goto done;
+		}
+
+		/* No privilege is required to obtain a port for our
+		 * own session.
+		 */
+		se = AU_SENTRY_PTR(aia_p);
+		audit_ref_session(se);
+	} else if (kauth_cred_issuser(cred)) {
+		/* The superuser may obtain a port for any existing
+		 * session.
+		 */
+		AUDIT_SENTRY_RLOCK();
+		se = audit_session_find(uap->asid);
+		AUDIT_SENTRY_RUNLOCK();
+		if (NULL == se) {
+			err = EINVAL;
+			goto done;
+		}
+		aia_p = &se->se_auinfo;
+	} else {
+		err = EPERM;
+		goto done;
+	}

 	/*
-	 * Drop the reference added in audit_session_self().
+	 * Processes that join using this mach port will inherit this process'
+	 * pre-selection masks.
 	 */
-	if (se != NULL) {
-		se->se_port = IPC_PORT_NULL;
-		audit_unref_session(se);
+	if (se->se_port == IPC_PORT_NULL)
+		bcopy(&cred->cr_audit.as_mask, &se->se_mask,
+		    sizeof(se->se_mask));
+
+	/*
+	 * Use the session reference to create a mach port reference for the
+	 * session (at which point we are free to drop the session reference)
+	 * and then copy out the mach port to the process' mach port namespace.
+	 */
+	sendport = audit_session_mksend(aia_p, &se->se_port);
+	portname = ipc_port_copyout_send(sendport, get_task_ipcspace(p->task));
+	if (!MACH_PORT_VALID(portname)) {
+		err = EINVAL;
+		goto done;
 	}
+	err = copyout(&portname, uap->portnamep, sizeof(mach_port_name_t));
+done:
+	if (cred != NULL)
+		kauth_cred_unref(&cred);
+	if (NULL != se)
+		audit_unref_session(se);
+	if (MACH_PORT_VALID(portname) && 0 != err)
+		(void)mach_port_deallocate(get_task_ipcspace(p->task),
+		    portname);
+
+	return (err);
 }

 static int
 audit_session_join_internal(proc_t p, ipc_port_t port, au_asid_t *new_asid)
 {
-	auditinfo_addr_t *port_aia_p, *old_aia_p;
-	kauth_cred_t cred = NULL;
+	auditinfo_addr_t *new_aia_p, *old_aia_p;
+	kauth_cred_t my_cred = NULL;
 	au_asid_t old_asid;
 	int err = 0;

 	*new_asid = AU_DEFAUDITSID;

-	if ((port_aia_p = audit_session_porttoaia(port)) == NULL) {
+	if ((new_aia_p = audit_session_porttoaia(port)) == NULL) {
 		err = EINVAL;
 		goto done;
 	}
-	*new_asid = port_aia_p->ai_asid;

-	cred = kauth_cred_proc_ref(p);
-	if (!IS_VALID_CRED(cred)) {
-		kauth_cred_unref(&cred);
+	proc_lock(p);
+	kauth_cred_ref(p->p_ucred);
+	my_cred = p->p_ucred;
+	if (!IS_VALID_CRED(my_cred)) {
+		kauth_cred_unref(&my_cred);
+		proc_unlock(p);
 		err = ESRCH;
 		goto done;
 	}
-	old_aia_p = cred->cr_audit.as_aia_p;
+	old_aia_p = my_cred->cr_audit.as_aia_p;
 	old_asid = old_aia_p->ai_asid;
+	*new_asid = new_aia_p->ai_asid;

 	/*
 	 * Add process in if not already in the session.
 	 */
 	if (*new_asid != old_asid) {
-		audit_session_setaia(p, port_aia_p, 1);
-		/*
-		 * If this process was in a valid session before then we
-		 * need to decrement the process count of the session it
-		 * came from.
-		 */
-		if (IS_VALID_SESSION(old_aia_p))
-			audit_dec_procount(AU_SENTRY_PTR(old_aia_p));
+		kauth_cred_t my_new_cred;
+		struct au_session new_as;
+
+		bcopy(&new_aia_p->ai_mask, &new_as.as_mask,
+		    sizeof(new_as.as_mask));
+		new_as.as_aia_p = new_aia_p;
+
+		my_new_cred = kauth_cred_setauditinfo(my_cred, &new_as);
+		p->p_ucred = my_new_cred;
+		PROC_UPDATE_CREDS_ONPROC(p);
+
+		/* Increment the proc count of new session */
+		audit_inc_procount(AU_SENTRY_PTR(new_aia_p));
+
+		proc_unlock(p);
+
+		/* Propagate the change from the process to the Mach task. */
+		set_security_token(p);
+
+		/* Decrement the process count of the former session. */
+		audit_dec_procount(AU_SENTRY_PTR(old_aia_p));
+	} else {
+		proc_unlock(p);
 	}
-	kauth_cred_unref(&cred);
+	kauth_cred_unref(&my_cred);

 done:
 	if (port != IPC_PORT_NULL)
@@ -1312,8 +1465,10 @@ audit_session_spawnjoin(proc_t p, ipc_port_t port)
  * Parameters:	p		Process calling session join.
  *		uap->port	A Mach send right.
  *
- * Returns:	*ret_asid	Audit session ID of new session, which may
- *				be AU_DEFAUDITSID in the failure case.
+ * Returns:	*ret_asid	Audit session ID of new session.
+ *				In the failure case the return value will be -1
+ *				and 'errno' will be set to a non-zero value
+ *				described below.
  *
  * Errno:	0		Success
  *		EINVAL		Invalid Mach port name.
@@ -1338,6 +1493,540 @@ audit_session_join(proc_t p, struct audit_session_join_args *uap,
 	return (err);
 }
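Taken together, audit_session_port() and audit_session_join() let one process hand its audit session to another. A sketch of the userland flow (editorial, not part of the patch), assuming the wrapper declarations that ship in <bsm/audit_session.h>; error handling is elided:

#include <bsm/audit.h>
#include <bsm/audit_session.h>
#include <mach/mach.h>

void
adopt_session(au_asid_t asid)
{
	mach_port_name_t port;

	/* -1 targets the caller's own session; other ASIDs need root. */
	if (audit_session_port(asid, &port) != 0)
		return;

	/* Typically the port is sent to another process, which then joins: */
	(void)audit_session_join(port);

	(void)mach_port_deallocate(mach_task_self(), port);
}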
+/*
+ * Audit session device.
+ */
+
+/*
+ * Free an audit sdev entry.
+ */
+static void
+audit_sdev_entry_free(struct audit_sdev_entry *ase)
+{
+
+	free(ase->ase_record, M_AUDIT_SDEV_ENTRY);
+	free(ase, M_AUDIT_SDEV_ENTRY);
+}
+
+/*
+ * Append individual record to a queue.  Allocate queue-local buffer and
+ * add to the queue.  If the queue is full or we can't allocate memory,
+ * drop the newest record.
+ */
+static void
+audit_sdev_append(struct audit_sdev *asdev, void *record, u_int record_len)
+{
+	struct audit_sdev_entry *ase;
+
+	AUDIT_SDEV_LOCK_ASSERT(asdev);
+
+	if (asdev->asdev_qlen >= asdev->asdev_qlimit) {
+		asdev->asdev_drops++;
+		audit_sdev_drops++;
+		return;
+	}
+
+	ase = malloc(sizeof (*ase), M_AUDIT_SDEV_ENTRY, M_NOWAIT | M_ZERO);
+	if (NULL == ase) {
+		asdev->asdev_drops++;
+		audit_sdev_drops++;
+		return;
+	}
+
+	ase->ase_record = malloc(record_len, M_AUDIT_SDEV_ENTRY, M_NOWAIT);
+	if (NULL == ase->ase_record) {
+		free(ase, M_AUDIT_SDEV_ENTRY);
+		asdev->asdev_drops++;
+		audit_sdev_drops++;
+		return;
+	}
+
+	bcopy(record, ase->ase_record, record_len);
+	ase->ase_record_len = record_len;
+
+	TAILQ_INSERT_TAIL(&asdev->asdev_queue, ase, ase_queue);
+	asdev->asdev_inserts++;
+	asdev->asdev_qlen++;
+	asdev->asdev_qbyteslen += ase->ase_record_len;
+	selwakeup(&asdev->asdev_selinfo);
+	if (asdev->asdev_flags & AUDIT_SDEV_ASYNC)
+		pgsigio(asdev->asdev_sigio, SIGIO);
+
+	cv_broadcast(&asdev->asdev_cv);
+}
+
+/*
+ * Submit an audit record to be queued in the audit session device.
+ */
+void
+audit_sdev_submit(__unused au_id_t auid, __unused au_asid_t asid, void *record,
+    u_int record_len)
+{
+	struct audit_sdev *asdev;
+
+	/*
+	 * Lockless read to avoid lock overhead if session devices are not in
+	 * use.
+	 */
+	if (NULL == TAILQ_FIRST(&audit_sdev_list))
+		return;
+
+	AUDIT_SDEV_LIST_RLOCK();
+	TAILQ_FOREACH(asdev, &audit_sdev_list, asdev_list) {
+		AUDIT_SDEV_LOCK(asdev);
+
+		/*
+		 * Only append to the sdev queue if the AUID and ASID match that
+		 * of the process that opened this session device or if the
+		 * ALLSESSIONS flag is set.
+		 */
+		if ((/* XXXss auid == asdev->asdev_auid && */
+		    asid == asdev->asdev_asid) ||
+		    (asdev->asdev_flags & AUDIT_SDEV_ALLSESSIONS) != 0)
+			audit_sdev_append(asdev, record, record_len);
+		AUDIT_SDEV_UNLOCK(asdev);
+	}
+	AUDIT_SDEV_LIST_RUNLOCK();
+
+	/* Unlocked increment. */
+	audit_sdev_records++;
+}
+
+/*
+ * Allocate a new audit sdev.  Connects the sdev, on success, to the global
+ * list and updates statistics.
+ */
+static struct audit_sdev *
+audit_sdev_alloc(void)
+{
+	struct audit_sdev *asdev;
+
+	AUDIT_SDEV_LIST_WLOCK_ASSERT();
+
+	asdev = malloc(sizeof (*asdev), M_AUDIT_SDEV, M_NOWAIT | M_ZERO);
+	if (NULL == asdev)
+		return (NULL);
+
+	asdev->asdev_qlimit = AUDIT_SDEV_QLIMIT_DEFAULT;
+	TAILQ_INIT(&asdev->asdev_queue);
+	AUDIT_SDEV_LOCK_INIT(asdev);
+	AUDIT_SDEV_SX_LOCK_INIT(asdev);
+	cv_init(&asdev->asdev_cv, "audit_sdev_cv");
+
+	/*
+	 * Add to global list and update global statistics.
+	 */
+	TAILQ_INSERT_HEAD(&audit_sdev_list, asdev, asdev_list);
+	audit_sdev_count++;
+	audit_sdev_ever++;
+
+	return (asdev);
+}
+
+/*
+ * Flush all records currently present in an audit sdev.
+ */
+static void
+audit_sdev_flush(struct audit_sdev *asdev)
+{
+	struct audit_sdev_entry *ase;
+
+	AUDIT_SDEV_LOCK_ASSERT(asdev);
+
+	while ((ase = TAILQ_FIRST(&asdev->asdev_queue)) != NULL) {
+		TAILQ_REMOVE(&asdev->asdev_queue, ase, ase_queue);
+		asdev->asdev_qbyteslen -= ase->ase_record_len;
+		audit_sdev_entry_free(ase);
+		asdev->asdev_qlen--;
+	}
+	asdev->asdev_qoffset = 0;

+	KASSERT(0 == asdev->asdev_qlen, ("audit_sdev_flush: asdev_qlen"));
+	KASSERT(0 == asdev->asdev_qbyteslen,
+	    ("audit_sdev_flush: asdev_qbyteslen"));
+}
+
+/*
+ * Free an audit sdev.
+ */
+static void
+audit_sdev_free(struct audit_sdev *asdev)
+{
+
+	AUDIT_SDEV_LIST_WLOCK_ASSERT();
+	AUDIT_SDEV_LOCK_ASSERT(asdev);
+
+	/* XXXss - preselect hook here */
+	audit_sdev_flush(asdev);
+	cv_destroy(&asdev->asdev_cv);
+	AUDIT_SDEV_SX_LOCK_DESTROY(asdev);
+	AUDIT_SDEV_LOCK_DESTROY(asdev);
+
+	TAILQ_REMOVE(&audit_sdev_list, asdev, asdev_list);
+	free(asdev, M_AUDIT_SDEV);
+	audit_sdev_count--;
+}
+
+/*
+ * Get the auditinfo_addr of the proc and check to see if suser.  Will return
+ * non-zero if not suser.
+ */
+static int
+audit_sdev_get_aia(proc_t p, struct auditinfo_addr *aia_p)
+{
+	int error;
+	kauth_cred_t scred;
+
+	scred = kauth_cred_proc_ref(p);
+	error = suser(scred, &p->p_acflag);
+
+	if (NULL != aia_p)
+		bcopy(scred->cr_audit.as_aia_p, aia_p, sizeof (*aia_p));
+	kauth_cred_unref(&scred);
+
+	return (error);
+}
+
+/*
+ * Audit session dev open method.
+ */
+static int
+audit_sdev_open(dev_t dev, __unused int flags, __unused int devtype, proc_t p)
+{
+	struct audit_sdev *asdev;
+	struct auditinfo_addr aia;
+	int u;
+
+	u = minor(dev);
+	if (u < 0 || u >= MAX_AUDIT_SDEVS)
+		return (ENXIO);
+
+	(void) audit_sdev_get_aia(p, &aia);
+
+	AUDIT_SDEV_LIST_WLOCK();
+	asdev = audit_sdev_dtab[u];
+	if (NULL == asdev) {
+		asdev = audit_sdev_alloc();
+		if (NULL == asdev) {
+			AUDIT_SDEV_LIST_WUNLOCK();
+			return (ENOMEM);
+		}
+		audit_sdev_dtab[u] = asdev;
+	} else {
+		KASSERT(asdev->asdev_open, ("audit_sdev_open: Already open"));
+		AUDIT_SDEV_LIST_WUNLOCK();
+		return (EBUSY);
+	}
+	asdev->asdev_open = 1;
+	asdev->asdev_auid = aia.ai_auid;
+	asdev->asdev_asid = aia.ai_asid;
+	asdev->asdev_flags = 0;
+
+	AUDIT_SDEV_LIST_WUNLOCK();
+
+	return (0);
+}
+
+/*
+ * Audit session dev close method.
+ */
+static int
+audit_sdev_close(dev_t dev, __unused int flags, __unused int devtype,
+    __unused proc_t p)
+{
+	struct audit_sdev *asdev;
+	int u;
+
+	u = minor(dev);
+	asdev = audit_sdev_dtab[u];
+
+	KASSERT(asdev != NULL, ("audit_sdev_close: asdev == NULL"));
+	KASSERT(asdev->asdev_open, ("audit_sdev_close: !asdev_open"));
+
+	AUDIT_SDEV_LIST_WLOCK();
+	AUDIT_SDEV_LOCK(asdev);
+	asdev->asdev_open = 0;
+	audit_sdev_free(asdev); /* sdev lock unlocked in audit_sdev_free() */
+	audit_sdev_dtab[u] = NULL;
+	AUDIT_SDEV_LIST_WUNLOCK();
+
+	return (0);
+}
+
+/*
+ * Audit session dev ioctl method.
+ */
+static int
+audit_sdev_ioctl(dev_t dev, u_long cmd, caddr_t data,
+    __unused int flag, proc_t p)
+{
+	struct audit_sdev *asdev;
+	int error;
+
+	asdev = audit_sdev_dtab[minor(dev)];
+	KASSERT(asdev != NULL, ("audit_sdev_ioctl: asdev == NULL"));
+
+	error = 0;
+
+	switch (cmd) {
+	case FIONBIO:
+		AUDIT_SDEV_LOCK(asdev);
+		if (*(int *)data)
+			asdev->asdev_flags |= AUDIT_SDEV_NBIO;
+		else
+			asdev->asdev_flags &= ~AUDIT_SDEV_NBIO;
+		AUDIT_SDEV_UNLOCK(asdev);
+		break;
+
+	case FIONREAD:
+		AUDIT_SDEV_LOCK(asdev);
+		*(int *)data = asdev->asdev_qbyteslen - asdev->asdev_qoffset;
+		AUDIT_SDEV_UNLOCK(asdev);
+		break;
+
+	case AUDITSDEV_GET_QLEN:
+		*(u_int *)data = asdev->asdev_qlen;
+		break;
+
+	case AUDITSDEV_GET_QLIMIT:
+		*(u_int *)data = asdev->asdev_qlimit;
+		break;
+
+	case AUDITSDEV_SET_QLIMIT:
+		if (*(u_int *)data >= AUDIT_SDEV_QLIMIT_MIN &&
+		    *(u_int *)data <= AUDIT_SDEV_QLIMIT_MAX) {
+			asdev->asdev_qlimit = *(u_int *)data;
+		} else
+			error = EINVAL;
+		break;
+
+	case AUDITSDEV_GET_QLIMIT_MIN:
+		*(u_int *)data = AUDIT_SDEV_QLIMIT_MIN;
+		break;
+
+	case AUDITSDEV_GET_QLIMIT_MAX:
+		*(u_int *)data = AUDIT_SDEV_QLIMIT_MAX;
+		break;
+
+	case AUDITSDEV_FLUSH:
+		if (AUDIT_SDEV_SX_XLOCK_SIG(asdev) != 0)
+			return (EINTR);
+		AUDIT_SDEV_LOCK(asdev);
+		audit_sdev_flush(asdev);
+		AUDIT_SDEV_UNLOCK(asdev);
+		AUDIT_SDEV_SX_XUNLOCK(asdev);
+		break;
+
+	case AUDITSDEV_GET_MAXDATA:
+		*(u_int *)data = MAXAUDITDATA;
+		break;
+
+	/* XXXss these should be 64 bit, maybe. */
+	case AUDITSDEV_GET_INSERTS:
+		*(u_int *)data = asdev->asdev_inserts;
+		break;
+
+	case AUDITSDEV_GET_READS:
+		*(u_int *)data = asdev->asdev_reads;
+		break;
+
+	case AUDITSDEV_GET_DROPS:
+		*(u_int *)data = asdev->asdev_drops;
+		break;
+
+	case AUDITSDEV_GET_ALLSESSIONS:
+		error = audit_sdev_get_aia(p, NULL);
+		if (error)
+			break;
+		*(u_int *)data = (asdev->asdev_flags & AUDIT_SDEV_ALLSESSIONS) ?
+		    1 : 0;
+		break;
+
+	case AUDITSDEV_SET_ALLSESSIONS:
+		error = audit_sdev_get_aia(p, NULL);
+		if (error)
+			break;
+
+		AUDIT_SDEV_LOCK(asdev);
+		if (*(int *)data)
+			asdev->asdev_flags |= AUDIT_SDEV_ALLSESSIONS;
+		else
+			asdev->asdev_flags &= ~AUDIT_SDEV_ALLSESSIONS;
+		AUDIT_SDEV_UNLOCK(asdev);
+		break;
+
+	default:
+		error = ENOTTY;
+	}
+
+	return (error);
+}
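A sketch of driving these ioctls from userland (editorial, not part of the patch). The device name comes from AUDIT_SDEV_NAME above; the AUDITSDEV_* command macros are assumed to be provided by an exported audit ioctl header:

#include <sys/ioctl.h>
#include <fcntl.h>
#include <unistd.h>

int
open_session_dev(int all_sessions, unsigned int qlimit)
{
	int fd, on = all_sessions;

	fd = open("/dev/auditsessions", O_RDONLY);
	if (fd < 0)
		return (-1);
	/* Widen delivery to every session (privileged) and bound the queue. */
	if (ioctl(fd, AUDITSDEV_SET_ALLSESSIONS, &on) < 0 ||
	    ioctl(fd, AUDITSDEV_SET_QLIMIT, &qlimit) < 0) {
		close(fd);
		return (-1);
	}
	return (fd);
}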
+/*
+ * Audit session dev read method.
+ */
+static int
+audit_sdev_read(dev_t dev, struct uio *uio, __unused int flag)
+{
+	struct audit_sdev_entry *ase;
+	struct audit_sdev *asdev;
+	u_int toread;
+	int error;
+
+	asdev = audit_sdev_dtab[minor(dev)];
+	KASSERT(NULL != asdev, ("audit_sdev_read: asdev == NULL"));
+
+	/*
+	 * We hold a sleep lock over read and flush because we rely on the
+	 * stability of a record in the queue during uiomove.
+	 */
+	if (0 != AUDIT_SDEV_SX_XLOCK_SIG(asdev))
+		return (EINTR);
+	AUDIT_SDEV_LOCK(asdev);
+	while (TAILQ_EMPTY(&asdev->asdev_queue)) {
+		if (asdev->asdev_flags & AUDIT_SDEV_NBIO) {
+			AUDIT_SDEV_UNLOCK(asdev);
+			AUDIT_SDEV_SX_XUNLOCK(asdev);
+			return (EAGAIN);
+		}
+		error = cv_wait_sig(&asdev->asdev_cv, AUDIT_SDEV_MTX(asdev));
+		if (error) {
+			AUDIT_SDEV_UNLOCK(asdev);
+			AUDIT_SDEV_SX_XUNLOCK(asdev);
+			return (error);
+		}
+	}
+
+	/*
+	 * Copy as many remaining bytes from the current record to userspace
+	 * as we can.  Keep processing records until we run out of records in
+	 * the queue or until the user buffer runs out of space.
+	 *
+	 * We rely on the sleep lock to maintain ase's stability here.
+	 */
+	asdev->asdev_reads++;
+	while ((ase = TAILQ_FIRST(&asdev->asdev_queue)) != NULL &&
+	    uio_resid(uio) > 0) {
+		AUDIT_SDEV_LOCK_ASSERT(asdev);
+
+		KASSERT(ase->ase_record_len > asdev->asdev_qoffset,
+		    ("audit_sdev_read: record_len > qoffset (1)"));
+		toread = MIN(ase->ase_record_len - asdev->asdev_qoffset,
+		    uio_resid(uio));
+		AUDIT_SDEV_UNLOCK(asdev);
+		error = uiomove((char *) ase->ase_record + asdev->asdev_qoffset,
+		    toread, uio);
+		if (error) {
+			AUDIT_SDEV_SX_XUNLOCK(asdev);
+			return (error);
+		}
+
+		/*
+		 * If the copy succeeded then update book-keeping, and if no
+		 * bytes remain in the current record then free it.
+		 */
+		AUDIT_SDEV_LOCK(asdev);
+		KASSERT(TAILQ_FIRST(&asdev->asdev_queue) == ase,
+		    ("audit_sdev_read: queue out of sync after uiomove"));
+		asdev->asdev_qoffset += toread;
+		KASSERT(ase->ase_record_len >= asdev->asdev_qoffset,
+		    ("audit_sdev_read: record_len >= qoffset (2)"));
+		if (asdev->asdev_qoffset == ase->ase_record_len) {
+			TAILQ_REMOVE(&asdev->asdev_queue, ase, ase_queue);
+			asdev->asdev_qbyteslen -= ase->ase_record_len;
+			audit_sdev_entry_free(ase);
+			asdev->asdev_qlen--;
+			asdev->asdev_qoffset = 0;
+		}
+	}
+	AUDIT_SDEV_UNLOCK(asdev);
+	AUDIT_SDEV_SX_XUNLOCK(asdev);
+	return (0);
+}
+
+/*
+ * Audit session device poll method.
+ */
+static int
+audit_sdev_poll(dev_t dev, int events, void *wql, struct proc *p)
+{
+	struct audit_sdev *asdev;
+	int revents;
+
+	revents = 0;
+	asdev = audit_sdev_dtab[minor(dev)];
+	KASSERT(NULL != asdev, ("audit_sdev_poll: asdev == NULL"));
+
+	if (events & (POLLIN | POLLRDNORM)) {
+		AUDIT_SDEV_LOCK(asdev);
+		if (NULL != TAILQ_FIRST(&asdev->asdev_queue))
+			revents |= events & (POLLIN | POLLRDNORM);
+		else
+			selrecord(p, &asdev->asdev_selinfo, wql);
+		AUDIT_SDEV_UNLOCK(asdev);
+	}
+	return (revents);
+}
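A minimal blocking consumer for the read and poll methods above (editorial sketch, not part of the patch). Note that a single read() may return several complete records back to back, plus the tail of a record carried over from a previous short read, so callers must parse BSM record boundaries themselves:

#include <poll.h>
#include <unistd.h>

void
drain_records(int fd, void (*deliver)(const char *, ssize_t))
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	char buf[32 * 1024];	/* comfortably above MAXAUDITDATA */
	ssize_t n;

	for (;;) {
		if (poll(&pfd, 1, -1) < 0)
			break;			/* interrupted or error */
		if ((n = read(fd, buf, sizeof(buf))) <= 0)
			break;			/* device closed or error */
		deliver(buf, n);		/* may hold >1 BSM record */
	}
}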
+/*
+ * Audit sdev clone routine.  Provides a new minor number or returns -1.
+ * This is called with DEVFS_LOCK held.
+ */
+static int
+audit_sdev_clone(__unused dev_t dev, int action)
+{
+	int i;
+
+	if (DEVFS_CLONE_ALLOC == action) {
+		for(i = 0; i < MAX_AUDIT_SDEVS; i++)
+			if (NULL == audit_sdev_dtab[i])
+				return (i);
+
+		/*
+		 * This really should return -1 here but that seems to
+		 * hang things in devfs.  We instead return 0 and let
+		 * audit_sdev_open tell userland the bad news.
+		 */
+		return (0);
+	}
+
+	return (-1);
+}
+
+static int
+audit_sdev_init(void)
+{
+	dev_t dev;
+
+	TAILQ_INIT(&audit_sdev_list);
+	AUDIT_SDEV_LIST_LOCK_INIT();
+
+	audit_sdev_major = cdevsw_add(-1, &audit_sdev_cdevsw);
+	if (audit_sdev_major < 0)
+		return (KERN_FAILURE);
+
+	dev = makedev(audit_sdev_major, 0);
+	devnode = devfs_make_node_clone(dev, DEVFS_CHAR, UID_ROOT, GID_WHEEL,
+	    0644, audit_sdev_clone, AUDIT_SDEV_NAME, 0);
+
+	if (NULL == devnode)
+		return (KERN_FAILURE);
+
+	return (KERN_SUCCESS);
+}
+
+/* XXXss
+static int
+audit_sdev_shutdown(void)
+{
+
+	devfs_remove(devnode);
+	(void) cdevsw_remove(audit_sdev_major, &audit_sdev_cdevsw);
+
+	return (KERN_SUCCESS);
+}
+*/
+
 #else

 int
@@ -1358,4 +2047,12 @@ audit_session_join(proc_t p, struct audit_session_join_args *uap,
 	return (ENOSYS);
 }

+int
+audit_session_port(proc_t p, struct audit_session_port_args *uap, int *retval)
+{
+#pragma unused(p, uap, retval)
+
+	return (ENOSYS);
+}
+
 #endif /* CONFIG_AUDIT */
diff --git a/bsd/security/audit/audit_syscalls.c b/bsd/security/audit/audit_syscalls.c
index 0ad24367a..43d93bdda 100644
--- a/bsd/security/audit/audit_syscalls.c
+++ b/bsd/security/audit/audit_syscalls.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 1999-2009, Apple Inc.
+ * Copyright (c) 1999-2010, Apple Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -246,7 +246,7 @@ int
 auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval)
 {
 	kauth_cred_t scred;
-	int error;
+	int error = 0;
 	union auditon_udata udata;
 	proc_t tp = PROC_NULL;
 	struct auditinfo_addr aia;
@@ -288,6 +288,8 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval)
 	case A_GETPINFO_ADDR:
 	case A_SENDTRIGGER:
 	case A_GETSINFO_ADDR:
+	case A_GETSFLAGS:
+	case A_SETSFLAGS:
 		error = copyin(uap->data, (void *)&udata, uap->length);
 		if (error)
 			return (error);
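The two new commands plug session flags into the existing auditon() fan-out. A sketch of the userland call (editorial, not part of the patch), assuming the A_GETSFLAGS/A_SETSFLAGS and au_asflgs_t definitions exported via <bsm/audit.h>; which bits may actually change is governed by the audit.session.*_sflags_mask sysctls enforced through audit_session_setaia():

#include <bsm/audit.h>
#include <stdint.h>

int
toggle_session_flag(uint64_t bit, int on)
{
	au_asflgs_t flags;

	/* Read the caller's current session flags (no privilege needed). */
	if (auditon(A_GETSFLAGS, &flags, sizeof(flags)) != 0)
		return (-1);
	flags = on ? (flags | bit) : (flags & ~bit);
	/* Write back; rejected with EINVAL if the masks forbid the change. */
	return (auditon(A_SETSFLAGS, &flags, sizeof(flags)));
}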
*/ audit_qctrl.aq_delay = -1; + mtx_unlock(&audit_mtx); break; case A_GETCWD: - mtx_unlock(&audit_mtx); return (ENOSYS); - break; case A_GETCAR: - mtx_unlock(&audit_mtx); return (ENOSYS); - break; case A_GETSTAT: - mtx_unlock(&audit_mtx); return (ENOSYS); - break; case A_SETSTAT: - mtx_unlock(&audit_mtx); return (ENOSYS); - break; case A_SETUMASK: - mtx_unlock(&audit_mtx); return (ENOSYS); - break; case A_SETSMASK: - mtx_unlock(&audit_mtx); return (ENOSYS); - break; case A_OLDGETCOND: case A_GETCOND: if (sizeof(udata.au_cond64) == uap->length) { + mtx_lock(&audit_mtx); if (audit_enabled && !audit_suspended) udata.au_cond64 = AUC_AUDITING; else udata.au_cond64 = AUC_NOAUDIT; - + mtx_unlock(&audit_mtx); break; } - if (sizeof(udata.au_cond) != uap->length) { - mtx_unlock(&audit_mtx); + if (sizeof(udata.au_cond) != uap->length) return (EINVAL); - } + mtx_lock(&audit_mtx); if (audit_enabled && !audit_suspended) udata.au_cond = AUC_AUDITING; else udata.au_cond = AUC_NOAUDIT; + mtx_unlock(&audit_mtx); break; case A_OLDSETCOND: case A_SETCOND: if (sizeof(udata.au_cond64) == uap->length) { + mtx_lock(&audit_mtx); if (udata.au_cond64 == AUC_NOAUDIT) audit_suspended = 1; if (udata.au_cond64 == AUC_AUDITING) @@ -518,14 +523,15 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval) audit_suspended = 1; mtx_unlock(&audit_mtx); audit_shutdown(); - mtx_lock(&audit_mtx); + break; } + mtx_unlock(&audit_mtx); break; } if (sizeof(udata.au_cond) != uap->length) { - mtx_unlock(&audit_mtx); return (EINVAL); } + mtx_lock(&audit_mtx); if (udata.au_cond == AUC_NOAUDIT) audit_suspended = 1; if (udata.au_cond == AUC_AUDITING) @@ -534,40 +540,32 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval) audit_suspended = 1; mtx_unlock(&audit_mtx); audit_shutdown(); - mtx_lock(&audit_mtx); + break; } + mtx_unlock(&audit_mtx); break; case A_GETCLASS: - if (sizeof(udata.au_evclass) != uap->length) { - mtx_unlock(&audit_mtx); + if (sizeof(udata.au_evclass) != uap->length) return (EINVAL); - } udata.au_evclass.ec_class = au_event_class( udata.au_evclass.ec_number); break; case A_SETCLASS: - if (sizeof(udata.au_evclass) != uap->length) { - mtx_unlock(&audit_mtx); + if (sizeof(udata.au_evclass) != uap->length) return (EINVAL); - } au_evclassmap_insert(udata.au_evclass.ec_number, udata.au_evclass.ec_class); break; case A_GETPINFO: if ((sizeof(udata.au_aupinfo) != uap->length) || - IS_NOT_VALID_PID(udata.au_aupinfo.ap_pid)) { - mtx_unlock(&audit_mtx); + IS_NOT_VALID_PID(udata.au_aupinfo.ap_pid)) return (EINVAL); - } - if ((tp = proc_find(udata.au_aupinfo.ap_pid)) == NULL) { - mtx_unlock(&audit_mtx); + if ((tp = proc_find(udata.au_aupinfo.ap_pid)) == NULL) return (ESRCH); - } - mtx_unlock(&audit_mtx); scred = kauth_cred_proc_ref(tp); if (scred->cr_audit.as_aia_p->ai_termid.at_type == AU_IPv6) { kauth_cred_unref(&scred); @@ -590,19 +588,14 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval) kauth_cred_unref(&scred); proc_rele(tp); tp = PROC_NULL; - mtx_lock(&audit_mtx); break; case A_SETPMASK: if ((sizeof(udata.au_aupinfo) != uap->length) || - IS_NOT_VALID_PID(udata.au_aupinfo.ap_pid)) { - mtx_unlock(&audit_mtx); + IS_NOT_VALID_PID(udata.au_aupinfo.ap_pid)) return (EINVAL); - } - if ((tp = proc_find(udata.au_aupinfo.ap_pid)) == NULL) { - mtx_unlock(&audit_mtx); + if ((tp = proc_find(udata.au_aupinfo.ap_pid)) == NULL) return (ESRCH); - } scred = kauth_cred_proc_ref(tp); bcopy(scred->cr_audit.as_aia_p, &aia, sizeof(aia)); kauth_cred_unref(&scred); @@ -611,44 +604,38 @@ 
auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval) aia.ai_mask.am_failure = udata.au_aupinfo.ap_mask.am_failure; AUDIT_CHECK_IF_KEVENTS_MASK(aia.ai_mask); - error = audit_session_setaia(tp, &aia, 0); - mtx_unlock(&audit_mtx); + error = audit_session_setaia(tp, &aia); proc_rele(tp); tp = PROC_NULL; if (error) return (error); - mtx_lock(&audit_mtx); break; case A_SETFSIZE: if ((sizeof(udata.au_fstat) != uap->length) || ((udata.au_fstat.af_filesz != 0) && - (udata.au_fstat.af_filesz < MIN_AUDIT_FILE_SIZE))) { - mtx_unlock(&audit_mtx); + (udata.au_fstat.af_filesz < MIN_AUDIT_FILE_SIZE))) return (EINVAL); - } + mtx_lock(&audit_mtx); audit_fstat.af_filesz = udata.au_fstat.af_filesz; + mtx_unlock(&audit_mtx); break; case A_GETFSIZE: - if (sizeof(udata.au_fstat) != uap->length) { - mtx_unlock(&audit_mtx); + if (sizeof(udata.au_fstat) != uap->length) return (EINVAL); - } + mtx_lock(&audit_mtx); udata.au_fstat.af_filesz = audit_fstat.af_filesz; udata.au_fstat.af_currsz = audit_fstat.af_currsz; + mtx_unlock(&audit_mtx); break; case A_GETPINFO_ADDR: if ((sizeof(udata.au_aupinfo_addr) != uap->length) || - IS_NOT_VALID_PID(udata.au_aupinfo_addr.ap_pid)) { - mtx_unlock(&audit_mtx); + IS_NOT_VALID_PID(udata.au_aupinfo_addr.ap_pid)) return (EINVAL); - } - if ((tp = proc_find(udata.au_aupinfo.ap_pid)) == NULL) { - mtx_unlock(&audit_mtx); + if ((tp = proc_find(udata.au_aupinfo.ap_pid)) == NULL) return (ESRCH); - } WARN_IF_AINFO_ADDR_CHANGED(uap->length, sizeof(auditpinfo_addr_t), "auditon(A_GETPINFO_ADDR,...)", "auditpinfo_addr_t"); @@ -672,41 +659,48 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval) break; case A_GETKAUDIT: - mtx_unlock(&audit_mtx); if (sizeof(udata.au_kau_info) != uap->length) return (EINVAL); audit_get_kinfo(&udata.au_kau_info); - mtx_lock(&audit_mtx); break; case A_SETKAUDIT: if ((sizeof(udata.au_kau_info) != uap->length) || (udata.au_kau_info.ai_termid.at_type != AU_IPv4 && - udata.au_kau_info.ai_termid.at_type != AU_IPv6)) { - mtx_unlock(&audit_mtx); + udata.au_kau_info.ai_termid.at_type != AU_IPv6)) return (EINVAL); - } - mtx_unlock(&audit_mtx); audit_set_kinfo(&udata.au_kau_info); - mtx_lock(&audit_mtx); break; case A_SENDTRIGGER: if ((sizeof(udata.au_trigger) != uap->length) || (udata.au_trigger < AUDIT_TRIGGER_MIN) || - (udata.au_trigger > AUDIT_TRIGGER_MAX)) { - mtx_unlock(&audit_mtx); + (udata.au_trigger > AUDIT_TRIGGER_MAX)) return (EINVAL); - } - mtx_unlock(&audit_mtx); return (audit_send_trigger(udata.au_trigger)); case A_GETSINFO_ADDR: /* Handled above before switch(). 
*/ break; + case A_GETSFLAGS: + if (sizeof(udata.au_flags) != uap->length) + return (EINVAL); + bcopy(&(kauth_cred_get()->cr_audit.as_aia_p->ai_flags), + &udata.au_flags, sizeof(udata.au_flags)); + break; + + case A_SETSFLAGS: + if (sizeof(udata.au_flags) != uap->length) + return (EINVAL); + bcopy(kauth_cred_get()->cr_audit.as_aia_p, &aia, sizeof(aia)); + aia.ai_flags = udata.au_flags; + error = audit_session_setaia(p, &aia); + if (error) + return (error); + break; + default: - mtx_unlock(&audit_mtx); return (EINVAL); } @@ -730,15 +724,13 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval) case A_GETPINFO_ADDR: case A_GETKAUDIT: case A_GETSINFO_ADDR: + case A_GETSFLAGS: error = copyout((void *)&udata, uap->data, uap->length); - if (error) { - mtx_unlock(&audit_mtx); + if (error) return (ENOSYS); - } break; } - mtx_unlock(&audit_mtx); return (0); } @@ -803,7 +795,7 @@ setauid(proc_t p, struct setauid_args *uap, __unused int32_t *retval) bcopy(&scred->cr_audit.as_mask, &aia.ai_mask, sizeof(au_mask_t)); kauth_cred_unref(&scred); aia.ai_auid = id; - error = audit_session_setaia(p, &aia, 0); + error = audit_session_setaia(p, &aia); return (error); } @@ -917,7 +909,7 @@ setaudit(proc_t p, struct setaudit_args *uap, __unused int32_t *retval) newaia.ai_termid.at_port = ai.ai_termid.port; newaia.ai_termid.at_type = AU_IPv4; - error = audit_session_setaia(p, &newaia, 0); + error = audit_session_setaia(p, &newaia); if (error) return (error); @@ -1007,7 +999,7 @@ setaudit_addr(proc_t p, struct setaudit_addr_args *uap, if (aia.ai_asid == AU_DEFAUDITSID) aia.ai_asid = AU_ASSIGN_ASID; - error = audit_session_setaia(p, &aia, 0); + error = audit_session_setaia(p, &aia); if (error) return (error); @@ -1053,7 +1045,7 @@ auditctl(proc_t p, struct auditctl_args *uap, __unused int32_t *retval) if (uap->path == USER_ADDR_NULL) return (EINVAL); - NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | LOCKLEAF | AUDITVNPATH1, (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32), uap->path, vfs_context_current()); error = vn_open(&nd, AUDIT_OPEN_FLAGS, 0); diff --git a/bsd/security/audit/audit_worker.c b/bsd/security/audit/audit_worker.c index d307a7eb9..d9ef366a2 100644 --- a/bsd/security/audit/audit_worker.c +++ b/bsd/security/audit/audit_worker.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 1999-2008 Apple Inc. + * Copyright (c) 1999-2010 Apple Inc. * Copyright (c) 2006-2008 Robert N. M. Watson * All rights reserved. 
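A_GETSFLAGS and A_SETSFLAGS, implemented above, read and write the ai_flags word of the calling process's audit session; getting the flags needs no privilege, while setting them is policed inside audit_session_setaia(). A hedged user-space sketch of the read side, assuming the au_asflgs_t typedef that accompanies these commands in bsm/audit.h:

#include <bsm/audit.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	au_asflgs_t flags = 0;	/* assumed: 64-bit flags word from bsm/audit.h */

	/* Reading the current session's flags requires no privilege. */
	if (auditon(A_GETSFLAGS, &flags, sizeof(flags)) == -1) {
		perror("auditon(A_GETSFLAGS)");
		return (EXIT_FAILURE);
	}
	printf("audit session flags: 0x%llx\n", (unsigned long long)flags);
	return (EXIT_SUCCESS);
}
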
* @@ -203,16 +203,11 @@ audit_record_write(struct vnode *vp, struct vfs_context *ctx, void *data, */ if (audit_qctrl.aq_minfree != 0) { temp = mnt_stat->f_blocks / (100 / audit_qctrl.aq_minfree); - if (mnt_stat->f_bfree < temp) { - if (ppsratecheck(&last_lowspace_trigger, - &cur_lowspace_trigger, 1)) { + if (mnt_stat->f_bfree < temp && + ppsratecheck(&last_lowspace_trigger, + &cur_lowspace_trigger, 1)) (void)audit_send_trigger( AUDIT_TRIGGER_LOW_SPACE); - printf("Warning: audit space low (< %d%% free)" - "on audit log file-system\n", - audit_qctrl.aq_minfree); - } - } } /* @@ -358,7 +353,8 @@ audit_worker_process_record(struct kaudit_record *ar) if (!(ar->k_ar_commit & AR_COMMIT_KERNEL) || ((ar->k_ar_commit & AR_PRESELECT_PIPE) == 0 && - (ar->k_ar_commit & AR_PRESELECT_TRAIL) == 0)) + (ar->k_ar_commit & AR_PRESELECT_TRAIL) == 0 && + (ar->k_ar_commit & AR_PRESELECT_FILTER) == 0)) goto out; auid = ar->k_ar.ar_subj_auid; @@ -395,6 +391,16 @@ audit_worker_process_record(struct kaudit_record *ar) ar->k_ar_commit & AR_PRESELECT_TRAIL, bsm->data, bsm->len); + if (ar->k_ar_commit & AR_PRESELECT_FILTER) { + + /* + * XXXss - This needs to be generalized so new filters can + * be easily plugged in. + */ + audit_sdev_submit(auid, ar->k_ar.ar_subj_asid, bsm->data, + bsm->len); + } + kau_free(bsm); out: if (trail_locked) @@ -417,7 +423,9 @@ audit_worker(void) struct kaudit_record *ar; int lowater_signal; - audit_ctx.vc_thread = current_thread(); + if (audit_ctx.vc_thread == NULL) + audit_ctx.vc_thread = current_thread(); + TAILQ_INIT(&ar_worklist); mtx_lock(&audit_mtx); while (1) { @@ -427,7 +435,8 @@ audit_worker(void) * Wait for a record. */ while (TAILQ_EMPTY(&audit_q)) - cv_wait(&audit_worker_cv, &audit_mtx); + cv_wait_continuation(&audit_worker_cv, &audit_mtx, + (thread_continue_t)audit_worker); /* * If there are records in the global audit record queue, diff --git a/bsd/sys/Makefile b/bsd/sys/Makefile index f74bfe8d0..53f457741 100644 --- a/bsd/sys/Makefile +++ b/bsd/sys/Makefile @@ -7,25 +7,9 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir include $(MakeInc_cmd) include $(MakeInc_def) -ALLPRODUCTS = AppleTV iPhone MacOSX -PRODUCT = $(shell tconf --product) -EXTRAUNIFDEF = $(foreach x,$(ALLPRODUCTS),$(if $(findstring $(PRODUCT),$(x)),-DPRODUCT_$(x),-UPRODUCT_$(x))) -SINCFRAME_UNIFDEF += $(EXTRAUNIFDEF) -SPINCFRAME_UNIFDEF += $(EXTRAUNIFDEF) -KINCFRAME_UNIFDEF += $(EXTRAUNIFDEF) -KPINCFRAME_UNIFDEF += $(EXTRAUNIFDEF) +INSTINC_SUBDIRS = -INSTINC_SUBDIRS = \ - -INSTINC_SUBDIRS_PPC = \ - -INSTINC_SUBDIRS_I386 = \ - -EXPINC_SUBDIRS = \ - -EXPINC_SUBDIRS_PPC = \ - -EXPINC_SUBDIRS_I386 = \ +EXPINC_SUBDIRS = # Installs header file for user level - # $(DSTROOT)/System/Library/Frameworks/System.framework/Headers @@ -55,17 +39,22 @@ DATAFILES = \ # $(DSTROOT)/System/Library/Frameworks/System.framework/PrivateHeaders PRIVATE_DATAFILES = \ codesign.h \ + content_protection.h \ disklabel.h \ + fileport.h \ fsctl.h \ fsgetpath.h \ fslog.h \ + imgsrc.h \ ipcs.h \ shm_internal.h \ spawn_internal.h \ tree.h \ ux_exception.h \ proc_info.h \ - vnioctl.h + process_policy.h \ + vnioctl.h \ + priv.h # Installs header file for kernel extensions - # $(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers @@ -97,10 +86,13 @@ KERNELFILES = \ # Installs header file for Apple internal use for kernel extensions - # $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders PRIVATE_KERNELFILES = \ + codesign.h \ cprotect.h \ + content_protection.h \ decmpfs.h \ disktab.h \ fbt.h \ + fileport.h \ fsctl.h \ 
fslog.h \ mach_swapon.h \ @@ -115,19 +107,20 @@ PRIVATE_KERNELFILES = \ user.h \ vfs_context.h \ vmmeter.h \ - spawn_internal.h + spawn_internal.h \ + priv.h # /System/Library/Frameworks/System.framework/Headers and /usr/include INSTALL_MI_LIST = ${DATAFILES} -INSTALL_MI_GEN_LIST = syscall.h +INSTALL_MI_GEN_LIST = syscall.h _posix_availability.h _symbol_aliasing.h INSTALL_MI_DIR = sys -EXPORT_MI_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} linker_set.h bsdtask_info.h filedesc.h pipe.h resourcevar.h semaphore.h \ +EXPORT_MI_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} linker_set.h bsdtask_info.h pthread_internal.h filedesc.h pipe.h resourcevar.h semaphore.h \ vnode_internal.h proc_internal.h file_internal.h mount_internal.h \ - uio_internal.h + uio_internal.h tree.h EXPORT_MI_GEN_LIST = syscall.h sysproto.h @@ -156,6 +149,16 @@ sysproto.h: $(SRCROOT)/bsd/kern/syscalls.master $(MAKESYSCALLS) @echo "Generating bsd/sys/$@ from $<"; $(_v)$(MAKESYSCALLS) $< proto > /dev/null +MAKE_POSIX_AVAILABILITY = $(SRCROOT)/bsd/sys/make_posix_availability.sh +_posix_availability.h: $(MAKE_POSIX_AVAILABILITY) + @echo "Generating bsd/sys/$@" + $(_v)$(MAKE_POSIX_AVAILABILITY) $@ + +MAKE_SYMBOL_ALIASING = $(SRCROOT)/bsd/sys/make_symbol_aliasing.sh +_symbol_aliasing.h: $(MAKE_SYMBOL_ALIASING) + @echo "Generating bsd/sys/$@" + $(_v)$(MAKE_SYMBOL_ALIASING) $@ + include $(MakeInc_rule) include $(MakeInc_dir) diff --git a/bsd/sys/attr.h b/bsd/sys/attr.h index 5f85717d7..42a8b7673 100644 --- a/bsd/sys/attr.h +++ b/bsd/sys/attr.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -49,6 +49,12 @@ /* The following option only valid when requesting ATTR_CMN_RETURNED_ATTRS */ #define FSOPT_PACK_INVAL_ATTRS 0x00000008 +#ifdef PRIVATE +#define FSOPT_EXCHANGE_DATA_ONLY 0x0000010 +#endif + + + /* we currently aren't anywhere near this amount for a valid * fssearchblock.sizeofsearchparams1 or fssearchblock.sizeofsearchparams2 * but we put a sanity check in to avoid abuse of the value passed in from @@ -208,6 +214,12 @@ typedef struct vol_capabilities_attr { * * VOL_CAP_FMT_DECMPFS_COMPRESSION: When set, the volume supports transparent * decompression of compressed files using decmpfs. + * + * VOL_CAP_FMT_64BIT_OBJECT_IDS: When set, the volume uses object IDs that + * are 64-bit. This means that ATTR_CMN_FILEID and ATTR_CMN_PARENTID are the + * only legitimate attributes for obtaining object IDs from this volume and the + * 32-bit fid_objno fields of the fsobj_id_t returned by ATTR_CMN_OBJID, + * ATTR_CMN_OBJPERMID, and ATTR_CMN_PAROBJID are undefined. */ #define VOL_CAP_FMT_PERSISTENTOBJECTIDS 0x00000001 #define VOL_CAP_FMT_SYMBOLICLINKS 0x00000002 @@ -225,7 +237,8 @@ typedef struct vol_capabilities_attr { #define VOL_CAP_FMT_HIDDEN_FILES 0x00002000 #define VOL_CAP_FMT_PATH_FROM_ID 0x00004000 #define VOL_CAP_FMT_NO_VOLUME_SIZES 0x00008000 -#define VOL_CAP_FMT_DECMPFS_COMPRESSION 0x00010000 +#define VOL_CAP_FMT_DECMPFS_COMPRESSION 0x00010000 +#define VOL_CAP_FMT_64BIT_OBJECT_IDS 0x00020000 /* @@ -338,13 +351,15 @@ typedef struct vol_attributes_attr { #define ATTR_CMN_FILEID 0x02000000 #define ATTR_CMN_PARENTID 0x04000000 #define ATTR_CMN_FULLPATH 0x08000000 +#define ATTR_CMN_ADDEDTIME 0x10000000 + /* * ATTR_CMN_RETURNED_ATTRS is only valid with getattrlist(2). * It is always the first attribute in the return buffer. 
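Because ATTR_CMN_RETURNED_ATTRS is always packed first, a caller can combine it with FSOPT_PACK_INVAL_ATTRS and then trust the returned bitmap rather than the requested one. A sketch of that calling convention (standard getattrlist(2) usage, not code from this patch):

#include <sys/attr.h>
#include <string.h>
#include <unistd.h>

/* Attribute reply buffers are packed on 4-byte boundaries. */
struct reply {
	u_int32_t	length;		/* total length, always first */
	attribute_set_t	returned;	/* ATTR_CMN_RETURNED_ATTRS */
	u_int64_t	fileid;		/* ATTR_CMN_FILEID, if supplied */
} __attribute__((aligned(4), packed));

int
get_fileid(const char *path, u_int64_t *out)
{
	struct attrlist al;
	struct reply r;

	memset(&al, 0, sizeof(al));
	al.bitmapcount = ATTR_BIT_MAP_COUNT;
	al.commonattr = ATTR_CMN_RETURNED_ATTRS | ATTR_CMN_FILEID;
	if (getattrlist(path, &al, &r, sizeof(r), FSOPT_PACK_INVAL_ATTRS) == -1)
		return (-1);
	if (!(r.returned.commonattr & ATTR_CMN_FILEID))
		return (-1);	/* volume could not supply a file ID */
	*out = r.fileid;
	return (0);
}
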
*/ #define ATTR_CMN_RETURNED_ATTRS 0x80000000 -#define ATTR_CMN_VALIDMASK 0x8FE7FFFF +#define ATTR_CMN_VALIDMASK 0x9FE7FFFF #define ATTR_CMN_SETMASK 0x01C7FF00 #define ATTR_CMN_VOLSETMASK 0x00006700 @@ -378,7 +393,9 @@ typedef struct vol_attributes_attr { #define ATTR_DIR_LINKCOUNT 0x00000001 #define ATTR_DIR_ENTRYCOUNT 0x00000002 #define ATTR_DIR_MOUNTSTATUS 0x00000004 -#define DIR_MNTSTATUS_MNTPOINT 0x00000001 +/* ATTR_DIR_MOUNTSTATUS Flags: */ +#define DIR_MNTSTATUS_MNTPOINT 0x00000001 +#define DIR_MNTSTATUS_TRIGGER 0x00000002 #define ATTR_DIR_VALIDMASK 0x00000007 #define ATTR_DIR_SETMASK 0x00000000 @@ -394,11 +411,9 @@ typedef struct vol_attributes_attr { #define ATTR_FILE_DATAALLOCSIZE 0x00000400 #define ATTR_FILE_RSRCLENGTH 0x00001000 #define ATTR_FILE_RSRCALLOCSIZE 0x00002000 -/* Only used when CONFIG_PROTECT is ON */ -#define ATTR_FILE_PROTECTION_CLASS 0x00004000 -#define ATTR_FILE_VALIDMASK 0x000077FF -#define ATTR_FILE_SETMASK 0x00004020 +#define ATTR_FILE_VALIDMASK 0x000037FF +#define ATTR_FILE_SETMASK 0x00000020 #define ATTR_FORK_TOTALSIZE 0x00000001 #define ATTR_FORK_ALLOCSIZE 0x00000002 diff --git a/bsd/sys/buf.h b/bsd/sys/buf.h index 80bf3d384..f1d7f924b 100644 --- a/bsd/sys/buf.h +++ b/bsd/sys/buf.h @@ -90,6 +90,7 @@ #define B_PASSIVE 0x00000800 /* PASSIVE I/Os are ignored by THROTTLE I/O */ #define B_IOSTREAMING 0x00001000 /* sequential access pattern detected */ #define B_THROTTLED_IO 0x00002000 /* low priority I/O */ +#define B_ENCRYPTED_IO 0x00004000 /* Encrypted I/O */ /* * make sure to check when adding flags that * that the new flags don't overlap the definitions @@ -121,6 +122,8 @@ void buf_markinvalid(buf_t); */ void buf_markdelayed(buf_t); +void buf_markclean(buf_t); + /*! @function buf_markeintr @abstract Mark a buffer as having been interrupted during I/O. @@ -634,6 +637,32 @@ errno_t buf_setupl(buf_t, upl_t, uint32_t); */ buf_t buf_clone(buf_t, int, int, void (*)(buf_t, void *), void *); + +/*! + @function buf_create_shadow + @abstract Create a shadow buffer with optional private storage and an optional callback. + @param bp Buffer to shadow. + @param force_copy If TRUE, do not link the shadow to 'bp' and if 'external_storage' == NULL, + force a copy of the data associated with 'bp'. + @param external_storage If non-NULL, associate it with the new buffer as its storage instead of the + storage currently associated with 'bp'. + @param iodone Callback to be called from buf_biodone() when I/O completes, in the sense of buf_setcallback(). + @param arg Argument to pass to iodone() callback. + @return NULL if the buffer to be shadowed is not B_META or a primary buffer (i.e. not a shadow buffer); otherwise, the new buffer. +*/ + +buf_t buf_create_shadow(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg); + + +/*! + @function buf_shadow + @abstract returns true if 'bp' is a shadow of another buffer. + @param bp Buffer to query. + @return 1 if 'bp' is a shadow, 0 otherwise. +*/ +int buf_shadow(buf_t bp); + + /*! @function buf_alloc @abstract Allocate an uninitialized buffer. @@ -659,6 +688,7 @@ void buf_free(buf_t); */ #define BUF_WRITE_DATA 0x0001 /* write data blocks first */ #define BUF_SKIP_META 0x0002 /* skip over metadata blocks */ +#define BUF_INVALIDATE_LOCKED 0x0004 /* force B_LOCKED blocks to be invalidated */ /*! @function buf_invalidateblks @@ -966,8 +996,38 @@ buf_t buf_getblk(vnode_t, daddr64_t, int, int, int, int); @return Always returns a new buffer. */ buf_t buf_geteblk(int); + +/*!
+ @function buf_clear_redundancy_flags + @abstract Clear flags on a buffer. + @discussion: buffer_redundancy_flags &= ~flags + @param bp Buffer whose flags to clear. + @param flags Flags to remove from buffer's mask + @return void. + */ +void buf_clear_redundancy_flags(buf_t, uint32_t); + +/*! + @function buf_redundancyflags + @abstract Get redundancy flags set on a buffer. + @param bp Buffer whose redundancy flags to grab. + @return flags. + */ +uint32_t buf_redundancy_flags(buf_t); + +/*! + @function buf_setredundancyflags + @abstract Set redundancy flags on a buffer. + @discussion: buffer_redundancy_flags |= flags + @param bp Buffer whose flags to set. + @param flags Flags to add to buffer's redundancy flags + @return void. + */ +void buf_set_redundancy_flags(buf_t, uint32_t); + #ifdef KERNEL_PRIVATE -void buf_setfilter(buf_t, void (*)(buf_t, void *), void *, void **, void **); +void buf_setfilter(buf_t, void (*)(buf_t, void *), void *, void (**)(buf_t, void *), void **); + /*! @function buf_getcpaddr diff --git a/bsd/sys/buf_internal.h b/bsd/sys/buf_internal.h index 5718861f6..d80eb21c8 100644 --- a/bsd/sys/buf_internal.h +++ b/bsd/sys/buf_internal.h @@ -115,7 +115,16 @@ struct buf { int b_dirtyend; /* Offset of end of dirty region. */ int b_validoff; /* Offset in buffer of valid region. */ int b_validend; /* Offset of end of valid region. */ + + /* store extra information related to redundancy of data, such as + * which redundancy copy to use, etc + */ + uint32_t b_redundancy_flags; + proc_t b_proc; /* Associated proc; NULL if kernel. */ +#ifdef BUF_MAKE_PRIVATE + buf_t b_data_store; +#endif #if CONFIG_PROTECT struct cprotect *b_cpentry; /* address of cp_entry, to be passed further down */ #endif /* CONFIG_PROTECT */ @@ -131,6 +140,12 @@ struct buf { /* cluster_io definitions for use with io bufs */ #define b_uploffset b_bufsize +#define b_orig b_freelist.tqe_prev +#define b_shadow b_freelist.tqe_next +#define b_shadow_ref b_validoff +#ifdef BUF_MAKE_PRIVATE +#define b_data_ref b_validend +#endif #define b_trans_head b_freelist.tqe_prev #define b_trans_next b_freelist.tqe_next #define b_iostate b_rcred @@ -143,20 +158,25 @@ struct buf { #define BL_BUSY 0x00000001 /* I/O in progress. */ #define BL_WANTED 0x00000002 /* Process wants this buffer. */ #define BL_IOBUF 0x00000004 /* buffer allocated via 'buf_alloc' */ -#define BL_CALLDONE 0x00000008 /* callback routine on B_CALL bp has completed */ #define BL_WANTDEALLOC 0x00000010 /* buffer should be put on empty list when clean */ +#define BL_SHADOW 0x00000020 +#define BL_EXTERNAL 0x00000040 +#define BL_WAITSHADOW 0x00000080 +#define BL_IOBUF_ALLOC 0x00000100 /* * Parameters for buffer cache garbage collection */ #define BUF_STALE_THRESHHOLD 30 /* Collect if untouched in the last 30 seconds */ -#define BUF_MAX_GC_COUNT 1000 /* Generally 6-8 MB */ +#define BUF_MAX_GC_COUNT 1024 /* Generally 6-8 MB */ +#define BUF_MAX_GC_BATCH_SIZE 128 /* Under a single grab of the lock */ /* * mask used by buf_flags... these are the readable external flags */ #define BUF_X_RDFLAGS (B_PHYS | B_RAW | B_LOCKED | B_ASYNC | B_READ | B_WRITE | B_PAGEIO |\ - B_META | B_CLUSTER | B_DELWRI | B_FUA | B_PASSIVE | B_IOSTREAMING | B_THROTTLED_IO) + B_META | B_CLUSTER | B_DELWRI | B_FUA | B_PASSIVE | B_IOSTREAMING | B_THROTTLED_IO |\ + B_ENCRYPTED_IO) /* * mask used by buf_clearflags/buf_setflags... 
these are the writable external flags */ @@ -189,11 +209,10 @@ struct buf { /* * private flags used by the cluster layer */ -#define B_NEED_IODONE 0x20000000 /* need biodone on the real_bp associated with a cluster_io */ +#define B_TWANTED 0x20000000 /* buf_t that is part of a cluster level transaction is wanted */ #define B_COMMIT_UPL 0x40000000 /* commit/abort the UPL on I/O success/failure */ #define B_TDONE 0x80000000 /* buf_t that is part of a cluster level transaction has completed */ - /* Flags to low-level allocation routines. */ #define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */ #define B_SYNC 0x02 /* Do all allocations synchronously. */ @@ -222,6 +241,8 @@ extern struct buf *buf_headers; /* The buffer headers. */ __BEGIN_DECLS +buf_t buf_create_shadow_priv(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg); + buf_t alloc_io_buf(vnode_t, int); void free_io_buf(buf_t); @@ -239,8 +260,6 @@ void bufinit(void) __attribute__((section("__TEXT, initcode"))); void buf_list_lock(void); void buf_list_unlock(void); -void buf_biowait_callback(buf_t); - void cluster_init(void) __attribute__((section("__TEXT, initcode"))); void buf_drop(buf_t); errno_t buf_acquire(buf_t, int, int, int); @@ -248,6 +267,9 @@ errno_t buf_acquire(buf_t, int, int, int); int count_busy_buffers(void); int count_lock_queue(void); +#ifdef BUF_MAKE_PRIVATE +errno_t buf_make_private(buf_t bp); +#endif __END_DECLS diff --git a/bsd/sys/cdefs.h b/bsd/sys/cdefs.h index 59e922bea..7076ef572 100644 --- a/bsd/sys/cdefs.h +++ b/bsd/sys/cdefs.h @@ -164,6 +164,12 @@ #define __unused #endif +#if defined(__GNUC__) && __GNUC__ >= 4 +#define __used __attribute__((__used__)) +#else +#define __used +#endif + /* * GCC 2.95 provides `__restrict' as an extension to C90 to support the * C99-specific `restrict' type qualifier. We happen to use `__restrict' as @@ -196,7 +202,7 @@ #define __scanflike(fmtarg, firstvararg) #endif -#define __IDSTRING(name,string) static const char name[] __unused = string +#define __IDSTRING(name,string) static const char name[] __used = string #ifndef __COPYRIGHT #define __COPYRIGHT(s) __IDSTRING(copyright,s) @@ -215,7 +221,7 @@ #endif /* - * COMPILATION ENVIRONMENTS + * COMPILATION ENVIRONMENTS -- see compat(5) for additional detail * * DEFAULT By default newly compiled code will get POSIX APIs plus * Apple API extensions in scope.
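The switch of __IDSTRING from __unused to __used above is what keeps ident strings in shipping binaries: __attribute__((used)) forces the compiler to emit an otherwise unreferenced static, whereas __unused merely silences the warning and leaves the optimizer free to drop it. A small illustration (the tag name and contents are hypothetical):

#include <sys/cdefs.h>

/* The string below survives dead-code stripping because __IDSTRING now
 * expands to a __used static; with __unused it could be discarded. */
__IDSTRING(rcsid_example, "@(#)example.c	1.0 (hypothetical tag)");

int
main(void)
{
	return (0);
}
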
@@ -259,24 +265,24 @@ #define __DARWIN_SUF_DARWIN10 "_darwin10" #define __DARWIN10_ALIAS(sym) __asm("_" __STRING(sym) __DARWIN_SUF_DARWIN10) #else /* !KERNEL */ -#ifdef PRODUCT_AppleTV -/* Product: AppleTV */ +#ifdef PLATFORM_iPhoneOS +/* Platform: iPhoneOS */ #define __DARWIN_ONLY_64_BIT_INO_T 1 #define __DARWIN_ONLY_UNIX_CONFORMANCE 1 #define __DARWIN_ONLY_VERS_1050 1 -#endif /* PRODUCT_AppleTV */ -#ifdef PRODUCT_iPhone -/* Product: iPhone */ +#endif /* PLATFORM_iPhoneOS */ +#ifdef PLATFORM_iPhoneSimulator +/* Platform: iPhoneSimulator */ #define __DARWIN_ONLY_64_BIT_INO_T 1 #define __DARWIN_ONLY_UNIX_CONFORMANCE 1 #define __DARWIN_ONLY_VERS_1050 1 -#endif /* PRODUCT_iPhone */ -#ifdef PRODUCT_MacOSX -/* Product: MacOSX */ +#endif /* PLATFORM_iPhoneSimulator */ +#ifdef PLATFORM_MacOSX +/* Platform: MacOSX */ #define __DARWIN_ONLY_64_BIT_INO_T 0 /* #undef __DARWIN_ONLY_UNIX_CONFORMANCE (automatically set for 64-bit) */ #define __DARWIN_ONLY_VERS_1050 0 -#endif /* PRODUCT_MacOSX */ +#endif /* PLATFORM_MacOSX */ #endif /* KERNEL */ /* @@ -313,6 +319,8 @@ # error "Can't define _NONSTD_SOURCE when only UNIX conformance is available." # endif /* _NONSTD_SOURCE */ # define __DARWIN_UNIX03 1 +# elif defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && ((__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__-0) < 1040) +# define __DARWIN_UNIX03 0 # elif defined(_DARWIN_C_SOURCE) || defined(_XOPEN_SOURCE) || defined(_POSIX_C_SOURCE) # if defined(_NONSTD_SOURCE) # error "Can't define both _NONSTD_SOURCE and any of _DARWIN_C_SOURCE, _XOPEN_SOURCE or _POSIX_C_SOURCE." @@ -438,13 +446,19 @@ /* * symbol release macros */ -#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && ((__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__-0) < 1060) -#undef __DARWIN_10_6_AND_LATER -#define __DARWIN_10_6_AND_LATER_ALIAS(x) /* nothing */ -#else /* 10.6 and beyond */ -#define __DARWIN_10_6_AND_LATER -#define __DARWIN_10_6_AND_LATER_ALIAS(x) x +#ifdef KERNEL +#define __DARWIN_ALIAS_STARTING(_mac, _iphone, x) +#else +#include <sys/_symbol_aliasing.h> + +#if defined(__IPHONE_OS_VERSION_MIN_REQUIRED) +#define __DARWIN_ALIAS_STARTING(_mac, _iphone, x) __DARWIN_ALIAS_STARTING_IPHONE_##_iphone(x) +#elif defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) +#define __DARWIN_ALIAS_STARTING(_mac, _iphone, x) __DARWIN_ALIAS_STARTING_MAC_##_mac(x) +#else +#define __DARWIN_ALIAS_STARTING(_mac, _iphone, x) #endif +#endif /* KERNEL */ /* @@ -460,6 +474,7 @@ * _POSIX_C_SOURCE == 199506L 1003.1c-1995, 1003.1i-1995, * and the omnibus ISO/IEC 9945-1: 1996 * _POSIX_C_SOURCE == 200112L 1003.1-2001 + * _POSIX_C_SOURCE == 200809L 1003.1-2008 * * In addition, the X/Open Portability Guide, which is now the Single UNIX * Specification, defines a feature-test macro which indicates the version of @@ -480,10 +495,13 @@ /* Deal with various X/Open Portability Guides and Single UNIX Spec. 
*/ #ifdef _XOPEN_SOURCE -#if _XOPEN_SOURCE - 0L >= 600L +#if _XOPEN_SOURCE - 0L >= 700L && (!defined(_POSIX_C_SOURCE) || _POSIX_C_SOURCE - 0L < 200809L) +#undef _POSIX_C_SOURCE +#define _POSIX_C_SOURCE 200809L +#elif _XOPEN_SOURCE - 0L >= 600L && (!defined(_POSIX_C_SOURCE) || _POSIX_C_SOURCE - 0L < 200112L) #undef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 200112L -#elif _XOPEN_SOURCE - 0L >= 500L +#elif _XOPEN_SOURCE - 0L >= 500L && (!defined(_POSIX_C_SOURCE) || _POSIX_C_SOURCE - 0L < 199506L) #undef _POSIX_C_SOURCE #define _POSIX_C_SOURCE 199506L #endif @@ -497,6 +515,44 @@ #define _POSIX_C_SOURCE 198808L #endif +/* + * Deprecation macro + */ +#if defined(__GNUC__) && ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))) +#define __deprecated __attribute__((deprecated)) +#define __unavailable __attribute__((unavailable)) +#else +#define __deprecated /* nothing */ +#define __unavailable /* nothing */ +#endif + +/* POSIX C deprecation macros */ +#ifdef KERNEL +#define __POSIX_C_DEPRECATED(ver) +#else +#include <sys/_posix_availability.h> + +#define __POSIX_C_DEPRECATED(ver) ___POSIX_C_DEPRECATED_STARTING_##ver +#endif + +/* + * Set a single macro which will always be defined and can be used to determine + * the appropriate namespace. For POSIX, these values will correspond to + * _POSIX_C_SOURCE value. Currently there are two additional levels corresponding + * to ANSI (_ANSI_SOURCE) and Darwin extensions (_DARWIN_C_SOURCE) + */ +#define __DARWIN_C_ANSI 010000L +#define __DARWIN_C_FULL 900000L + +#if defined(_ANSI_SOURCE) +#define __DARWIN_C_LEVEL __DARWIN_C_ANSI +#elif defined(_POSIX_C_SOURCE) && !defined(_DARWIN_C_SOURCE) && !defined(_NONSTD_SOURCE) +#define __DARWIN_C_LEVEL _POSIX_C_SOURCE +#else +#define __DARWIN_C_LEVEL __DARWIN_C_FULL +#endif + + /* * long long is not supported in c89 (__STRICT_ANSI__), but g++ -ansi and * c99 still want long longs. While not perfect, we allow long longs for @@ -512,22 +568,7 @@ * long doubles. This applies only to ppc; i386 already has long double * support, while ppc64 doesn't have any backwards history. */ -#if defined(__ppc__) -# if defined(__LDBL_MANT_DIG__) && defined(__DBL_MANT_DIG__) && \ - __LDBL_MANT_DIG__ > __DBL_MANT_DIG__ -# if __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__-0 < 1040 -# define __DARWIN_LDBL_COMPAT(x) __asm("_" __STRING(x) "$LDBLStub") -# else -# define __DARWIN_LDBL_COMPAT(x) __asm("_" __STRING(x) "$LDBL128") -# endif -# define __DARWIN_LDBL_COMPAT2(x) __asm("_" __STRING(x) "$LDBL128") -# define __DARWIN_LONG_DOUBLE_IS_DOUBLE 0 -# else -# define __DARWIN_LDBL_COMPAT(x) /* nothing */ -# define __DARWIN_LDBL_COMPAT2(x) /* nothing */ -# define __DARWIN_LONG_DOUBLE_IS_DOUBLE 1 -# endif -#elif defined(__i386__) || defined(__ppc64__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) # define __DARWIN_LDBL_COMPAT(x) /* nothing */ # define __DARWIN_LDBL_COMPAT2(x) /* nothing */ # define __DARWIN_LONG_DOUBLE_IS_DOUBLE 0 @@ -535,15 +576,6 @@ # error Unknown architecture #endif -/* - * Deprecation macro - */ -#if __GNUC__ >= 3 -#define __deprecated __attribute__((deprecated)) -#else -#define __deprecated /* nothing */ -#endif - /***************************************** * Public darwin-specific feature macros *****************************************/ @@ -605,7 +637,7 @@ * catastrophic run-time failures. 
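__DARWIN_C_LEVEL, defined above, collapses _ANSI_SOURCE, _POSIX_C_SOURCE and _DARWIN_C_SOURCE into one ordered value that headers can compare against, which is exactly how the errno.h hunks later in this patch decide whether to expose ENOTBLK and friends. A toy consumer; compile it once plainly and once with -D_POSIX_C_SOURCE=200112L to see the level change:

#include <sys/cdefs.h>
#include <stdio.h>

int
main(void)
{
#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL
	printf("full Darwin namespace, level %ld\n", (long)__DARWIN_C_LEVEL);
#else
	printf("restricted namespace, level %ld\n", (long)__DARWIN_C_LEVEL);
#endif
	return (0);
}
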
*/ #ifndef __CAST_AWAY_QUALIFIER -#define __CAST_AWAY_QUALIFIER(variable, qualifier, type) (type) ((char *)0 + ((qualifier char *)(variable) - (qualifier char *)0) ) +#define __CAST_AWAY_QUALIFIER(variable, qualifier, type) (type) (long)(variable) #endif #endif /* !_CDEFS_H_ */ diff --git a/bsd/sys/codesign.h b/bsd/sys/codesign.h index 56ae21668..e72c25044 100644 --- a/bsd/sys/codesign.h +++ b/bsd/sys/codesign.h @@ -38,6 +38,7 @@ #define CS_EXEC_SET_HARD 0x1000 /* set CS_HARD on any exec'ed process */ #define CS_EXEC_SET_KILL 0x2000 /* set CS_KILL on any exec'ed process */ #define CS_KILLED 0x10000 /* was killed by kernel for invalidity */ +#define CS_RESTRICT 0x20000 /* tell dyld to treat restricted */ /* csops operations */ #define CS_OPS_STATUS 0 /* return status */ @@ -47,6 +48,8 @@ #define CS_OPS_PIDPATH 4 /* get executable's pathname */ #define CS_OPS_CDHASH 5 /* get code directory hash */ #define CS_OPS_PIDOFFSET 6 /* get offset of active Mach-o slice */ +#define CS_OPS_ENTITLEMENTS_BLOB 7 /* get entitlements blob */ +#define CS_OPS_MARKRESTRICT 8 /* set RESTRICT flag (sticky) */ #ifndef KERNEL diff --git a/bsd/sys/conf.h b/bsd/sys/conf.h index 4cf53a914..39e4fef37 100644 --- a/bsd/sys/conf.h +++ b/bsd/sys/conf.h @@ -71,6 +71,8 @@ #include <sys/appleapiopts.h> #include <sys/cdefs.h> +#include <sys/queue.h> +#include <stdint.h> /* * Definitions of device driver entry switches @@ -194,10 +196,24 @@ struct cdevsw { int d_type; }; +#ifdef BSD_KERNEL_PRIVATE +void devsw_init(void); + +extern uint64_t cdevsw_flags[]; +#define CDEVSW_SELECT_KQUEUE 0x01 +#define CDEVSW_USE_OFFSET 0x02 + +struct thread; + +typedef struct devsw_lock { + TAILQ_ENTRY(devsw_lock) dl_list; + struct thread *dl_thread; + dev_t dl_dev; + int dl_mode; +} *devsw_lock_t; + +#endif /* BSD_KERNEL_PRIVATE */ -#ifdef KERNEL_PRIVATE -extern struct cdevsw cdevsw[]; -#endif /* KERNEL_PRIVATE */ /* * Contents of empty cdevsw slot. @@ -276,6 +292,16 @@ extern struct swdevt swdevt[]; * else -1 */ __BEGIN_DECLS +#ifdef KERNEL_PRIVATE +extern struct cdevsw cdevsw[]; +extern int cdevsw_setkqueueok(int, struct cdevsw*, int); +#endif /* KERNEL_PRIVATE */ + +#ifdef BSD_KERNEL_PRIVATE +extern void devsw_lock(dev_t, int); +extern void devsw_unlock(dev_t, int); +#endif /* BSD_KERNEL_PRIVATE */ + int bdevsw_isfree(int); int bdevsw_add(int, struct bdevsw *); int bdevsw_remove(int, struct bdevsw *); diff --git a/osfmk/ppc/Performance.h b/bsd/sys/content_protection.h similarity index 71% rename from osfmk/ppc/Performance.h rename to bsd/sys/content_protection.h index 4442d603e..a4066e184 100644 --- a/osfmk/ppc/Performance.h +++ b/bsd/sys/content_protection.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,21 +25,23 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * Keep special performance related stuff in here - */ -#define PERF_HIST 0 -#define PMIHIST_SIZE 0x00400000 -#define perfClear 0 -#define perfStart 1 -#define perfStop 2 -#define perfMap 3 +#ifndef _SYS_CONTENT_PROTECTION_H_ +#define _SYS_CONTENT_PROTECTION_H_ -#ifndef ASSEMBLER +#ifdef PRIVATE + +/* + * Protection classes vary in their restrictions on read/writability. A is generally + * the strictest, and D is effectively no restriction. 
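Kernel code manipulates a file's protection class through the accessors this patch adds to cprotect.h (cp_vnode_getclass/cp_vnode_setclass, shown below). A sketch against those prototypes, kernel-only and with the cnode locking a real caller would need elided:

#include <sys/cprotect.h>

/* Sketch: demote a vnode from class A to class C. Real callers run with
 * the cnode appropriately locked; error handling is minimal here. */
static int
demote_if_class_a(vnode_t vp)
{
	int cls = 0;
	int error = cp_vnode_getclass(vp, &cls);

	if (error)
		return (error);
	if (cls == PROTECTION_CLASS_A)
		error = cp_vnode_setclass(vp, PROTECTION_CLASS_C);
	return (error);
}
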
+ */ +#define PROTECTION_CLASS_A 1 +#define PROTECTION_CLASS_B 2 +#define PROTECTION_CLASS_C 3 +#define PROTECTION_CLASS_D 4 +#define PROTECTION_CLASS_E 5 +#define PROTECTION_CLASS_F 6 -extern unsigned int PMIhist; -extern unsigned int PMIhistV; -extern unsigned int PerfCtl(unsigned int cmd, unsigned int parm0); +#endif /* PRIVATE */ -#endif /* !ASSEMBLER */ +#endif /* _SYS_CONTENT_PROTECTION_H_ */ diff --git a/bsd/sys/cprotect.h b/bsd/sys/cprotect.h index 2edf9aed2..bebe3bb43 100644 --- a/bsd/sys/cprotect.h +++ b/bsd/sys/cprotect.h @@ -36,77 +36,114 @@ extern "C" { #if KERNEL_PRIVATE #include <sys/cdefs.h> +#include <sys/content_protection.h> #include <sys/kernel_types.h> -#define PROTECTION_CLASS_A 1 -#define PROTECTION_CLASS_B 2 -#define PROTECTION_CLASS_C 3 -#define PROTECTION_CLASS_D 4 -#define PROTECTION_CLASS_E 5 +#define CP_KEYSIZE 32 /* 8x4 = 32, 32x8 = 256 */ +#define CP_WRAPPEDKEYSIZE 40 /* 2x4 = 8, 8x8 = 64 */ -#define KEYSIZE 8 /* 8x4 = 32, 32x8 = 256 */ -#define INTEGRITYSIZE 2 /* 2x4 = 8, 8x8 = 64 */ +/* lock events from AppleKeyStore */ +#define CP_LOCKED_STATE 0 /* Device is locked */ +#define CP_UNLOCKED_STATE 1 /* Device is unlocked */ -#define LOCKED_STATE 0 -#define UNLOCKED_STATE 1 +#define CP_LOCKED_KEYCHAIN 0 +#define CP_UNLOCKED_KEYCHAIN 1 -#define LOCKED_KEYCHAIN 0 -#define UNLOCKED_KEYCHAIN 1 +/* For struct cprotect: cp_flags */ +#define CP_NEEDS_KEYS 0x1 /* File needs persistent keys */ +#define CP_KEY_FLUSHED 0x2 /* File's unwrapped key has been purged from memory */ +#define CP_NO_XATTR 0x4 /* Key info has not been saved as EA to the FS */ -#define CONTENT_PROTECTION_XATTR_NAME "com.apple.system.cprotect" +/* Content Protection VNOP Operation flags */ +#define CP_READ_ACCESS 0x1 +#define CP_WRITE_ACCESS 0x2 -#define kEMBCKeyHandleSpecial ~1 +#define CONTENT_PROTECTION_XATTR_NAME "com.apple.system.cprotect" +#define CP_CURRENT_MAJOR_VERS 2 +#define CP_CURRENT_MINOR_VERS 0 -/* SLIST_HEAD(cp_list, cp_entry) cp_head = LIST_HEAD_INITIALIZER(cp_head); */ -/* struct cp_list *cprotect_list_headp; /\* List head *\/ */ typedef struct cprotect *cprotect_t; typedef struct cp_wrap_func *cp_wrap_func_t; typedef struct cp_global_state *cp_global_state_t; typedef struct cp_xattr *cp_xattr_t; +typedef struct cnode * cnode_ptr_t; +//forward declare the struct. +struct hfsmount; -typedef int wrapper_t(uint32_t properties, void *key_bytes, size_t key_length, void **wrapped_data, uint32_t *wrapped_length); -typedef int unwrapper_t(uint32_t properties, void *wrapped_data, size_t wrapped_data_length, void **key_bytes, uint32_t *key_length); +/* The wrappers are invoked by the AKS kext */ +typedef int wrapper_t(uint32_t properties, void *key_bytes, size_t key_length, void *wrapped_data, size_t *wrapped_length); +typedef int unwrapper_t(uint32_t properties, void *wrapped_data, size_t wrapped_data_length, void *key_bytes, size_t *key_length); +/* + * Runtime-only structure containing the content protection status + * for the given file. This is contained within the cnode + */ struct cprotect { - uint32_t cache_key[KEYSIZE]; - uint32_t special_data; - uint32_t pclass; - uint8_t cache_key_flushed; - uint8_t lock_state; /* lock_state: 0 means unlocked. 
1 means locked */ -}; - -struct cp_entry { - SLIST_ENTRY(cp_entry) cp_list; - struct cprotect *protected_entry; + uint8_t cp_cache_key[CP_KEYSIZE]; + uint8_t cp_persistent_key[CP_WRAPPEDKEYSIZE]; + uint32_t cp_flags; + uint32_t cp_pclass; }; struct cp_wrap_func { - wrapper_t *wrapper; - unwrapper_t *unwrapper; + wrapper_t *wrapper; + unwrapper_t *unwrapper; }; struct cp_global_state { + uint8_t wrap_functions_set; uint8_t lock_state; - uint8_t wrap_functions_set; }; +/* + * On-disk structure written as the per-file EA payload + * All on-disk multi-byte fields for the CP XATTR must be stored + * little-endian on-disk. This means they must be endian swapped to + * L.E on getxattr() and converted to LE on setxattr(). + */ struct cp_xattr { - uint32_t persistent_class; - uint8_t persistent_key[32]; - uint8_t persistent_integrity[8]; - uint8_t xattr_version; + u_int16_t xattr_major_version; + u_int16_t xattr_minor_version; + u_int32_t flags; + u_int32_t persistent_class; + u_int32_t key_size; + uint8_t persistent_key[CP_WRAPPEDKEYSIZE]; }; -int cp_create_init(vnode_t, vfs_context_t); +/* Same is true for the root EA, all fields must be written little endian. */ +struct cp_root_xattr { + u_int16_t major_version; + u_int16_t minor_version; + u_int64_t flags; + u_int32_t reserved1; + u_int32_t reserved2; + u_int32_t reserved3; + u_int32_t reserved4; +}; + + +/* + * Functions to check the status of a CP and to query + * the containing filesystem to see if it is supported. + */ +int cp_vnode_getclass(vnode_t, int *); +int cp_vnode_setclass(vnode_t, uint32_t); + int cp_key_store_action(int); int cp_register_wraps(cp_wrap_func_t); -struct cprotect *cp_vnode_entry_alloc(void); -void cp_vnode_entry_init(vnode_t); -int cp_vnode_entry_init_needed(vnode_t); -struct cp_xattr * cp_vn_getxattr(vnode_t, vfs_context_t); -int cp_vn_setxattr(vnode_t, uint32_t, vfs_context_t); + +int cp_entry_init(cnode_ptr_t, struct mount *); +int cp_entry_create_keys(cnode_ptr_t); +void cp_entry_destroy(cnode_ptr_t); + +cnode_ptr_t cp_get_protected_cnode(vnode_t); +int cp_handle_vnop(cnode_ptr_t, int); +int cp_fs_protected (mount_t); +int cp_getrootxattr (struct hfsmount *hfsmp, struct cp_root_xattr *outxattr); +int cp_setrootxattr (struct hfsmount *hfsmp, struct cp_root_xattr *newxattr); +int cp_handle_relocate (cnode_ptr_t cp); #endif /* KERNEL_PRIVATE */ diff --git a/bsd/sys/decmpfs.h b/bsd/sys/decmpfs.h index 72e99ee18..f8a61d288 100644 --- a/bsd/sys/decmpfs.h +++ b/bsd/sys/decmpfs.h @@ -84,7 +84,7 @@ typedef struct decmpfs_cnode { uint32_t cmp_type; uint32_t lockcount; void *lockowner; /* cnode's lock owner (if a thread is currently holding an exclusive lock) */ - uint64_t uncompressed_size; + uint64_t uncompressed_size __attribute__((aligned(8))); lck_rw_t compressed_data_lock; #if !DECMPFS_SUPPORTS_SWAP64 /* we need a lock since we can't atomically fetch/set 64 bits */ diff --git a/bsd/sys/disk.h b/bsd/sys/disk.h index 0232617ca..745aa6710 100644 --- a/bsd/sys/disk.h +++ b/bsd/sys/disk.h @@ -161,12 +161,27 @@ typedef struct #ifdef KERNEL #define DK_FEATURE_FORCE_UNIT_ACCESS 0x00000001 + +typedef struct +{ + uint64_t offset; + uint64_t length; + + uint8_t reserved0128[12]; /* reserved, clear to zero */ + + dev_t dev; +} dk_physical_extent_t; + #define DKIOCGETBLOCKCOUNT32 _IOR('d', 25, uint32_t) #define DKIOCSETBLOCKSIZE _IOW('d', 24, uint32_t) #define DKIOCGETBSDUNIT _IOR('d', 27, uint32_t) -#define DKIOCISSOLIDSTATE _IOR('d', 79, uint32_t) +#define DKIOCISSOLIDSTATE _IOR('d', 79, uint32_t) #define DKIOCISVIRTUAL 
_IOR('d', 72, uint32_t) #define DKIOCGETBASE _IOR('d', 73, uint64_t) +#define DKIOCGETTHROTTLEMASK _IOR('d', 80, uint64_t) +#define DKIOCLOCKPHYSICALEXTENTS _IO('d', 81) +#define DKIOCGETPHYSICALEXTENT _IOWR('d', 82, dk_physical_extent_t) +#define DKIOCUNLOCKPHYSICALEXTENTS _IO('d', 83) #endif /* KERNEL */ #endif /* _SYS_DISK_H_ */ diff --git a/bsd/sys/dtrace.h b/bsd/sys/dtrace.h index 6d9c6976a..d81a48a4f 100644 --- a/bsd/sys/dtrace.h +++ b/bsd/sys/dtrace.h @@ -119,6 +119,8 @@ typedef int64_t hrtime_t; typedef enum { B_FALSE = 0, B_TRUE = 1 } _dtrace_boolean; +typedef uint8_t UUID[16]; /* For modctl use in dtrace.h */ + struct modctl; /* In lieu of Solaris <sys/modctl.h> */ /* NOTHING */ /* In lieu of Solaris <sys/processor.h> */ #include <sys/ioctl.h> /* In lieu of Solaris <sys/systm.h> */ @@ -508,6 +510,15 @@ typedef struct dtrace_difv { #define DTRACEACT_RAISE (DTRACEACT_PROC_DESTRUCTIVE + 2) #define DTRACEACT_SYSTEM (DTRACEACT_PROC_DESTRUCTIVE + 3) #define DTRACEACT_FREOPEN (DTRACEACT_PROC_DESTRUCTIVE + 4) + +#if defined(__APPLE__) +/* + * Dtrace stop() will task_suspend the currently running process. + * Dtrace pidresume(pid) will task_resume it. + */ + +#define DTRACEACT_PIDRESUME (DTRACEACT_PROC_DESTRUCTIVE + 50) +#endif /* __APPLE__ */ #define DTRACEACT_PROC_CONTROL 0x0300 @@ -1340,6 +1351,34 @@ typedef struct dtrace_providerdesc { #define DTRACEIOC_FORMAT (DTRACEIOC | 16) /* get format str */ #define DTRACEIOC_DOFGET (DTRACEIOC | 17) /* get DOF */ #define DTRACEIOC_REPLICATE (DTRACEIOC | 18) /* replicate enab */ +#define DTRACEIOC_MODUUIDSLIST (DTRACEIOC | 30) /* APPLE ONLY, query for modules with missing symbols */ +#define DTRACEIOC_PROVMODSYMS (DTRACEIOC | 31) /* APPLE ONLY, provide missing symbols for a given module */ + +/* + * The following structs are used to provide symbol information to the kernel from userspace. + */ + +typedef struct dtrace_symbol { + uint64_t dtsym_addr; /* address of the symbol */ + uint64_t dtsym_size; /* size of the symbol, must be uint64_t to maintain alignment when called by 64b uproc in i386 kernel */ + char dtsym_name[DTRACE_FUNCNAMELEN]; /* symbol name */ +} dtrace_symbol_t; + +typedef struct dtrace_module_symbols { + UUID dtmodsyms_uuid; + uint64_t dtmodsyms_count; + dtrace_symbol_t dtmodsyms_symbols[1]; +} dtrace_module_symbols_t; + +#define DTRACE_MODULE_SYMBOLS_SIZE(count) (sizeof(dtrace_module_symbols_t) + ((count - 1) * sizeof(dtrace_symbol_t))) + +typedef struct dtrace_module_uuids_list { + uint64_t dtmul_count; + UUID dtmul_uuid[1]; +} dtrace_module_uuids_list_t; + +#define DTRACE_MODULE_UUIDS_LIST_SIZE(count) (sizeof(dtrace_module_uuids_list_t) + ((count - 1) * sizeof(UUID))) + #endif /* __APPLE__ */ /* @@ -1566,7 +1605,7 @@ typedef struct dof_ioctl_data { * dtps_provide_module(); see "Arguments and Notes" for dtrace_register(), * below. * - * 1.4 void dtps_enable(void *arg, dtrace_id_t id, void *parg) + * 1.4 int dtps_enable(void *arg, dtrace_id_t id, void *parg) * * 1.4.1 Overview * @@ -1587,7 +1626,8 @@ typedef struct dof_ioctl_data { * * 1.4.3 Return value * - * None. + * On success, dtps_enable() should return 0. On failure, -1 should be + * returned. 
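Under the revised contract described here, a provider's dtps_enable implementation can veto an enabling instead of failing silently. A skeletal op honoring the new int-returning signature (illustrative only; the refusal condition is hypothetical):

#include <sys/dtrace.h>

/* Skeletal provider op for the new int-returning contract. */
static int
example_enable(void *arg, dtrace_id_t id, void *parg)
{
#pragma unused(arg, id)
	if (parg == NULL)
		return (-1);	/* refuse: the probe's target is gone */
	return (0);
}
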
* * 1.4.4 Caller's context * @@ -2141,7 +2181,7 @@ typedef struct dof_ioctl_data { typedef struct dtrace_pops { void (*dtps_provide)(void *arg, const dtrace_probedesc_t *spec); void (*dtps_provide_module)(void *arg, struct modctl *mp); - void (*dtps_enable)(void *arg, dtrace_id_t id, void *parg); + int (*dtps_enable)(void *arg, dtrace_id_t id, void *parg); void (*dtps_disable)(void *arg, dtrace_id_t id, void *parg); void (*dtps_suspend)(void *arg, dtrace_id_t id, void *parg); void (*dtps_resume)(void *arg, dtrace_id_t id, void *parg); @@ -2357,10 +2397,7 @@ struct regs; extern int (*dtrace_pid_probe_ptr)(struct regs *); extern int (*dtrace_return_probe_ptr)(struct regs *); #else -#if defined (__ppc__) || defined (__ppc64__) -extern int (*dtrace_pid_probe_ptr)(ppc_saved_state_t *regs); -extern int (*dtrace_return_probe_ptr)(ppc_saved_state_t* regs); -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) extern int (*dtrace_pid_probe_ptr)(x86_saved_state_t *regs); extern int (*dtrace_return_probe_ptr)(x86_saved_state_t* regs); #else @@ -2382,8 +2419,13 @@ extern void dtrace_membar_producer(void); extern void dtrace_membar_consumer(void); extern void (*dtrace_cpu_init)(processorid_t); +#if !defined(__APPLE__) extern void (*dtrace_modload)(struct modctl *); extern void (*dtrace_modunload)(struct modctl *); +#else +extern int (*dtrace_modload)(struct kmod_info *); +extern int (*dtrace_modunload)(struct kmod_info *); +#endif /* __APPLE__ */ extern void (*dtrace_helpers_cleanup)(proc_t*); extern void (*dtrace_helpers_fork)(proc_t *parent, proc_t *child); extern void (*dtrace_cpustart_init)(void); @@ -2427,14 +2469,11 @@ extern int dtrace_instr_size(uchar_t *instr); extern int dtrace_instr_size_isa(uchar_t *, model_t, int *); extern void dtrace_invop_add(int (*)(uintptr_t, uintptr_t *, uintptr_t)); extern void dtrace_invop_remove(int (*)(uintptr_t, uintptr_t *, uintptr_t)); -extern void dtrace_invop_callsite(void); +extern void *dtrace_invop_callsite_pre; +extern void *dtrace_invop_callsite_post; #endif -#if defined (__ppc__) || defined (__ppc64__) -extern void dtrace_invop_add(int (*)(uintptr_t, uintptr_t *, uintptr_t)); -extern void dtrace_invop_remove(int (*)(uintptr_t, uintptr_t *, uintptr_t)); -#endif #undef proc_t #endif /* __APPLE__ */ @@ -2472,13 +2511,6 @@ extern void dtrace_invop_remove(int (*)(uintptr_t, uintptr_t *, uintptr_t)); #endif -#if defined (__ppc__) || defined (__ppc64__) -#define DTRACE_INVOP_NOP 4 -#define DTRACE_INVOP_RET 5 -#define DTRACE_INVOP_BCTR 6 -#define DTRACE_INVOP_TAILJUMP 7 -#endif - #endif /* __APPLE__ */ diff --git a/bsd/sys/dtrace_glue.h b/bsd/sys/dtrace_glue.h index 5612fe80c..6b3665b02 100644 --- a/bsd/sys/dtrace_glue.h +++ b/bsd/sys/dtrace_glue.h @@ -43,6 +43,10 @@ #include <mach/kmod.h> #include <libkern/OSAtomic.h> +#if defined(__i386__) || defined(__x86_64__) +#include <i386/mp.h> +#endif + /* * cmn_err */ @@ -100,17 +104,17 @@ extern lck_mtx_t mod_lock; /* * Per-CPU data. 
*/ -typedef struct cpu { +typedef struct dtrace_cpu { processorid_t cpu_id; /* CPU number */ - struct cpu *cpu_next; /* next existing CPU */ + struct dtrace_cpu *cpu_next; /* next existing CPU */ lck_rw_t cpu_ft_lock; /* DTrace: fasttrap lock */ uintptr_t cpu_dtrace_caller; /* DTrace: caller, if any */ hrtime_t cpu_dtrace_chillmark; /* DTrace: chill mark time */ hrtime_t cpu_dtrace_chilled; /* DTrace: total chill time */ boolean_t cpu_dtrace_invop_underway; /* DTrace guards against invalid op re-entrancy */ -} cpu_t; +} dtrace_cpu_t; -extern cpu_t *cpu_list; +extern dtrace_cpu_t *cpu_list; /* * The cpu_core structure consists of per-CPU state available in any context. @@ -130,7 +134,8 @@ typedef struct cpu_core { } cpu_core_t; extern cpu_core_t *cpu_core; -extern unsigned int real_ncpus; + + extern int cpu_number(void); /* From #include <kern/cpu_number.h>. Called from probe context, must blacklist. */ #define CPU (&(cpu_list[cpu_number()])) /* Pointer to current CPU */ @@ -187,6 +192,55 @@ extern void unregister_cpu_setup_func(cpu_setup_func_t *, void *); CPU_DTRACE_BADSTACK) #define CPU_DTRACE_ERROR (CPU_DTRACE_FAULT | CPU_DTRACE_DROP) +/* + * Loadable Modules + */ + +/* Keep the compiler happy */ +struct dtrace_module_symbols; + +/* Solaris' modctl structure, greatly simplified, shadowing parts of xnu kmod structure. */ +typedef struct modctl { + struct modctl *mod_next; + struct modctl *mod_stale; // stale module chain + uint32_t mod_id; // the kext unique identifier + char mod_modname[KMOD_MAX_NAME]; + int mod_loadcnt; + char mod_loaded; + char mod_flags; // See flags below + int mod_nenabled; // # of enabled DTrace probes in module + vm_address_t mod_address; // starting address (of Mach-o header blob) + vm_size_t mod_size; // total size (of blob) + UUID mod_uuid; + struct dtrace_module_symbols* mod_user_symbols; +} modctl_t; + +/* Definitions for mod_flags */ +#define MODCTL_IS_MACH_KERNEL 0x01 // This module represents /mach_kernel +#define MODCTL_HAS_KERNEL_SYMBOLS 0x02 // Kernel symbols (nlist) are available +#define MODCTL_FBT_PROBES_PROVIDED 0x04 // fbt probes have been provided +#define MODCTL_FBT_INVALID 0x08 // Module is invalid for fbt probes +#define MODCTL_SDT_PROBES_PROVIDED 0x10 // sdt probes have been provided +#define MODCTL_SDT_INVALID 0x20 // Module is invalid for sdt probes +#define MODCTL_HAS_UUID 0x40 // Module has UUID + +/* Simple/singular mod_flags accessors */ +#define MOD_IS_MACH_KERNEL(mod) (mod->mod_flags & MODCTL_IS_MACH_KERNEL) +#define MOD_HAS_KERNEL_SYMBOLS(mod) (mod->mod_flags & MODCTL_HAS_KERNEL_SYMBOLS) +#define MOD_HAS_USERSPACE_SYMBOLS(mod) (mod->mod_user_symbols) /* No point in duplicating state in the flags bits */ +#define MOD_FBT_PROBES_PROVIDED(mod) (mod->mod_flags & MODCTL_FBT_PROBES_PROVIDED) +#define MOD_FBT_INVALID(mod) (mod->mod_flags & MODCTL_FBT_INVALID) +#define MOD_SDT_PROBES_PROVIDED(mod) (mod->mod_flags & MODCTL_SDT_PROBES_PROVIDED) +#define MOD_SDT_INVALID(mod) (mod->mod_flags & MODCTL_SDT_INVALID) +#define MOD_HAS_UUID(mod) (mod->mod_flags & MODCTL_HAS_UUID) + +/* Compound accessors */ +#define MOD_FBT_DONE(mod) (MOD_FBT_PROBES_PROVIDED(mod) || MOD_FBT_INVALID(mod)) +#define MOD_SDT_DONE(mod) (MOD_SDT_PROBES_PROVIDED(mod) || MOD_SDT_INVALID(mod)) +#define MOD_SYMBOLS_DONE(mod) (MOD_FBT_DONE(mod) && MOD_SDT_DONE(mod)) + +extern modctl_t *dtrace_modctl_list; + /* * cred_t */ @@ -244,8 +298,8 @@ typedef struct cyc_handler { } cyc_handler_t; typedef struct cyc_omni_handler { - void (*cyo_online)(void *, cpu_t *, cyc_handler_t *,
cyc_time_t *); - void (*cyo_offline)(void *, cpu_t *, void *); + void (*cyo_online)(void *, dtrace_cpu_t *, cyc_handler_t *, cyc_time_t *); + void (*cyo_offline)(void *, dtrace_cpu_t *, void *); void *cyo_arg; } cyc_omni_handler_t; @@ -389,25 +443,6 @@ extern void kmem_cache_destroy(kmem_cache_t *); typedef struct _kthread kthread_t; /* For dtrace_vtime_switch(), dtrace_panicked and dtrace_errthread */ -/* - * Loadable Modules - */ - -#if 0 /* kmod_lock has been removed */ -decl_simple_lock_data(extern,kmod_lock) -#endif /* 0 */ - -/* Want to use Darwin's kmod_info in place of the Solaris modctl. - Can't typedef since the (many) usages in the code are "struct modctl *" */ -extern kmod_info_t *kmod; -#define modctl kmod_info - -#define mod_modname name -#define mod_loadcnt id -#define mod_next next -#define mod_loaded info_version /* XXX Is always > 0, hence TRUE */ -#define modules kmod - /* * proc */ @@ -472,15 +507,6 @@ static inline void atomic_add_64( uint64_t *theValue, int64_t theAmount ) { (void)OSAddAtomic64( theAmount, (SInt64 *)theValue ); } -#elif defined(__ppc__) -static inline void atomic_add_64( uint64_t *theValue, int64_t theAmount ) -{ - // FIXME - // atomic_add_64() is at present only called from fasttrap.c to increment - // or decrement a 64bit counter. Narrow to 32bits since ppc32 (G4) has - // no convenient 64bit atomic op. - (void)OSAddAtomic( (int32_t)theAmount, &(((SInt32 *)theValue)[1])); -} #endif /* diff --git a/bsd/sys/dtrace_impl.h b/bsd/sys/dtrace_impl.h index 4ef2ef655..7f42cff5e 100644 --- a/bsd/sys/dtrace_impl.h +++ b/bsd/sys/dtrace_impl.h @@ -1008,6 +1008,45 @@ typedef enum dtrace_activity { #define DTRACE_DOF_MODE_LAZY_ON 1 #define DTRACE_DOF_MODE_LAZY_OFF 2 #define DTRACE_DOF_MODE_NON_LAZY 3 + +/* + * dtrace kernel symbol modes are used to control when the kernel may dispose of + * symbol information used by the fbt/sdt provider. The kernel itself, as well as + * every kext, has symbol table/nlist info that has historically been preserved + * for dtrace's use. This allowed dtrace to be lazy about allocating fbt/sdt probes, + * at the expense of keeping the symbol info in the kernel permanently. + * + * Starting in 10.7+, fbt probes may be created from userspace, in the same + * fashion as pid probes. The kernel allows dtrace "first right of refusal" + * whenever symbol data becomes available (such as a kext load). If dtrace is + * active, it will immediately read/copy the needed data, and then the kernel + * may free it. If dtrace is not active, it returns immediately, having done + * no work or allocations, and the symbol data is freed. Should dtrace need + * this data later, it is expected that the userspace client will push the + * data into the kernel via ioctl calls. + * + * The kernel symbol modes are used to control what dtrace does with symbol data: + * + * DTRACE_KERNEL_SYMBOLS_NEVER Effectively disables fbt/sdt + * DTRACE_KERNEL_SYMBOLS_FROM_KERNEL Immediately read/copy symbol data + * DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE Wait for symbols from userspace + * DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL Immediately read/copy symbol data + * + * It is legal to transition between DTRACE_KERNEL_SYMBOLS_FROM_KERNEL and + * DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE. The DTRACE_KERNEL_SYMBOLS_NEVER and + * DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL are permanent modes, intended to + * disable fbt probes entirely, or prevent any symbols being loaded from + * userspace. 
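The accessor macros defined earlier make this lifecycle mechanical: once every provider has either published probes for a module or marked it invalid, the module's resident symbol data is no longer needed. A sketch of that check (the locking real code takes around the list walk is omitted):

#include <sys/dtrace_glue.h>

/* Sketch: find modules whose resident symbol data is no longer needed.
 * Real code serializes this walk against module load/unload. */
static void
scan_for_freeable_symbols(void)
{
	modctl_t *mod;

	for (mod = dtrace_modctl_list; mod != NULL; mod = mod->mod_next) {
		if (MOD_SYMBOLS_DONE(mod) && MOD_HAS_KERNEL_SYMBOLS(mod)) {
			/* fbt and sdt are both finished with this module;
			 * its nlist data could be released here. */
		}
	}
}
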
+* + * The kernel symbol mode is kept in dtrace_kernel_symbol_mode, which is protected + * by the dtrace_lock. + */ + +#define DTRACE_KERNEL_SYMBOLS_NEVER 0 +#define DTRACE_KERNEL_SYMBOLS_FROM_KERNEL 1 +#define DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE 2 +#define DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL 3 + #endif /* __APPLE__ */ /* diff --git a/bsd/sys/errno.h b/bsd/sys/errno.h index 54d5d0371..231c68ead 100644 --- a/bsd/sys/errno.h +++ b/bsd/sys/errno.h @@ -69,8 +69,9 @@ #ifndef _SYS_ERRNO_H_ #define _SYS_ERRNO_H_ -#if !defined(KERNEL) && !defined(KERNEL_PRIVATE) #include <sys/cdefs.h> + +#if !defined(KERNEL) && !defined(KERNEL_PRIVATE) __BEGIN_DECLS extern int * __error(void); #define errno (*__error()) @@ -96,7 +97,7 @@ __END_DECLS #define ENOMEM 12 /* Cannot allocate memory */ #define EACCES 13 /* Permission denied */ #define EFAULT 14 /* Bad address */ -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL #define ENOTBLK 15 /* Block device required */ #endif #define EBUSY 16 /* Device / Resource busy */ @@ -134,9 +135,9 @@ __END_DECLS #define EPROTOTYPE 41 /* Protocol wrong type for socket */ #define ENOPROTOOPT 42 /* Protocol not available */ #define EPROTONOSUPPORT 43 /* Protocol not supported */ -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL #define ESOCKTNOSUPPORT 44 /* Socket type not supported */ -#endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ +#endif #define ENOTSUP 45 /* Operation not supported */ #if !__DARWIN_UNIX03 && !defined(KERNEL) /* @@ -150,9 +151,9 @@ __END_DECLS #define EOPNOTSUPP ENOTSUP /* Operation not supported on socket */ #endif /* !__DARWIN_UNIX03 && !KERNEL */ -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL #define EPFNOSUPPORT 46 /* Protocol family not supported */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#endif #define EAFNOSUPPORT 47 /* Address family not supported by protocol family */ #define EADDRINUSE 48 /* Address already in use */ #define EADDRNOTAVAIL 49 /* Can't assign requested address */ @@ -166,10 +167,10 @@ __END_DECLS #define ENOBUFS 55 /* No buffer space available */ #define EISCONN 56 /* Socket is already connected */ #define ENOTCONN 57 /* Socket is not connected */ -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL #define ESHUTDOWN 58 /* Can't send after socket shutdown */ #define ETOOMANYREFS 59 /* Too many references: can't splice */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#endif #define ETIMEDOUT 60 /* Operation timed out */ #define ECONNREFUSED 61 /* Connection refused */ @@ -177,34 +178,34 @@ __END_DECLS #define ENAMETOOLONG 63 /* File name too long */ /* should be rearranged */ -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL #define EHOSTDOWN 64 /* Host is down */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#endif #define EHOSTUNREACH 65 /* No route to host */ #define ENOTEMPTY 66 /* Directory not empty */ /* quotas & mush */ -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL #define EPROCLIM 67 /* Too many processes */ #define EUSERS 68 /* Too many users */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#endif #define EDQUOT 69 /* Disc quota exceeded */ /* Network File System */ #define ESTALE 70 /* Stale NFS file handle */ -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) 
+#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL #define EREMOTE 71 /* Too many levels of remote in path */ #define EBADRPC 72 /* RPC struct is bad */ #define ERPCMISMATCH 73 /* RPC version wrong */ #define EPROGUNAVAIL 74 /* RPC prog. not avail */ #define EPROGMISMATCH 75 /* Program version wrong */ #define EPROCUNAVAIL 76 /* Bad procedure for program */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#endif #define ENOLCK 77 /* No locks available */ #define ENOSYS 78 /* Function not implemented */ -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL #define EFTYPE 79 /* Inappropriate file type or format */ #define EAUTH 80 /* Authentication error */ #define ENEEDAUTH 81 /* Need authenticator */ @@ -212,26 +213,26 @@ __END_DECLS /* Intelligent device errors */ #define EPWROFF 82 /* Device power is off */ #define EDEVERR 83 /* Device error, e.g. paper out */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#endif #define EOVERFLOW 84 /* Value too large to be stored in data type */ /* Program loading errors */ -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL #define EBADEXEC 85 /* Bad executable */ #define EBADARCH 86 /* Bad CPU type in executable */ #define ESHLIBVERS 87 /* Shared library version mismatch */ #define EBADMACHO 88 /* Malformed Macho file */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#endif #define ECANCELED 89 /* Operation canceled */ #define EIDRM 90 /* Identifier removed */ #define ENOMSG 91 /* No message of desired type */ #define EILSEQ 92 /* Illegal byte sequence */ -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL #define ENOATTR 93 /* Attribute not found */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#endif #define EBADMSG 94 /* Bad message */ #define EMULTIHOP 95 /* Reserved */ @@ -249,9 +250,14 @@ __END_DECLS #define ENOPOLICY 103 /* No such policy registered */ -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -#define ELAST 103 /* Must be equal largest errno */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#if __DARWIN_C_LEVEL >= 200809L +#define ENOTRECOVERABLE 104 /* State not recoverable */ +#define EOWNERDEAD 105 /* Previous owner died */ +#endif + +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL +#define ELAST 105 /* Must be equal largest errno */ +#endif #ifdef KERNEL /* pseudo-errors returned inside kernel to modify return to process */ @@ -261,6 +267,10 @@ __END_DECLS #ifdef BSD_KERNEL_PRIVATE #define ERECYCLE (-5) /* restart lookup under heavy vnode pressure/recycling */ #define EREDRIVEOPEN (-6) +#define EKEEPLOOKING (-7) +/* used for cvwait error returns to Libc */ +#define ECVCERORR 256 +#define ECVPERORR 512 #else /* BSD_KERNEL_PRIVATE */ /* -5 and -6 are reserved for kernel internal use */ #endif /* BSD_KERNEL_PRIVATE */ diff --git a/bsd/sys/event.h b/bsd/sys/event.h index abbd60045..05b31174a 100644 --- a/bsd/sys/event.h +++ b/bsd/sys/event.h @@ -70,9 +70,10 @@ #define EVFILT_MACHPORT (-8) /* Mach portsets */ #define EVFILT_FS (-9) /* Filesystem events */ #define EVFILT_USER (-10) /* User events */ -#define EVFILT_SESSION (-11) /* Audit session events */ + /* (-11) unused */ +#define EVFILT_VM (-12) /* Virtual memory events */ -#define EVFILT_SYSCOUNT 11 +#define EVFILT_SYSCOUNT 12 #define EVFILT_THREADMARKER EVFILT_SYSCOUNT /* Internal use only */ #pragma pack(4) @@ -191,7 +192,6 @@ struct kevent64_s { * On input, NOTE_TRIGGER causes the event to be triggered for 
output. */ #define NOTE_TRIGGER 0x01000000 -#define EV_TRIGGER 0x0100 /*deprecated--for backwards compatibility only*/ /* * On input, the top two bits of fflags specifies how the lower twenty four @@ -233,16 +233,26 @@ struct kevent64_s { * that hangs off the proc structure. They also both play games with the hint * passed to KNOTE(). If NOTE_SIGNAL is passed as a hint, then the lower bits * of the hint contain the signal. IF NOTE_FORK is passed, then the lower bits - * contain the PID of the child. + * contain the PID of the child. */ #define NOTE_EXIT 0x80000000 /* process exited */ #define NOTE_FORK 0x40000000 /* process forked */ #define NOTE_EXEC 0x20000000 /* process exec'd */ #define NOTE_REAP 0x10000000 /* process reaped */ #define NOTE_SIGNAL 0x08000000 /* shared with EVFILT_SIGNAL */ +#define NOTE_EXITSTATUS 0x04000000 /* exit status to be returned, valid for child process only */ +#define NOTE_RESOURCEEND 0x02000000 /* resource limit reached, resource type returned */ #define NOTE_PDATAMASK 0x000fffff /* mask for pid/signal */ #define NOTE_PCTRLMASK (~NOTE_PDATAMASK) +/* + * data/hint fflags for EVFILT_VM, shared with userspace. + */ +#define NOTE_VM_PRESSURE 0x80000000 /* will react on memory pressure */ +#define NOTE_VM_PRESSURE_TERMINATE 0x40000000 /* will quit on memory pressure, possibly after cleaning up dirty state */ +#define NOTE_VM_PRESSURE_SUDDEN_TERMINATE 0x20000000 /* will quit immediately on memory pressure */ +#define NOTE_VM_ERROR 0x10000000 /* there was an error */ + /* * data/hint fflags for EVFILT_TIMER, shared with userspace. * The default is a (repeating) interval timer with the data @@ -258,7 +268,7 @@ struct kevent64_s { /* * data/hint fflags for EVFILT_MACHPORT, shared with userspace. * - * Only portsets are support at this time. + * Only portsets are supported at this time. * * The fflags field can optionally contain the MACH_RCV_MSG, MACH_RCV_LARGE, * and related trailer receive options as defined in <mach/message.h>. @@ -275,29 +285,6 @@ struct kevent64_s { * contains the name of the actual port detected with a message waiting. */ -/* - * data/hint fflags for EVFILT_SESSION, shared with userspace. - * - * The kevent ident field should be set to AU_SESSION_ANY_ASID if interested - * in events for any session. - * - * NOTE_AS_UPDATE may be going away since struct auditinfo_addr may become - * immutable once initially set. - */ -#define NOTE_AS_START 0x00000001 /* start of new session */ -#define NOTE_AS_END 0x00000002 /* start of new session */ -#define NOTE_AS_ERR 0x00000004 /* error tracking new session */ -#define NOTE_AS_CLOSE 0x00000008 /* currently unsupported */ -#define NOTE_AS_UPDATE 0x00000010 /* session data updated */ - -/* - * Kevent ident value for any session. - */ -#define AS_ANY_ASID 0xFFFFFFFF - -struct au_sentry; /* Audit session entry */ - - /* * DEPRECATED!!!!!!!!! 
* NOTE_TRACK, NOTE_TRACKERR, and NOTE_CHILD are no longer supported as of 10.5 @@ -338,7 +325,6 @@ struct knote { struct fileproc *p_fp; /* file data pointer */ struct proc *p_proc; /* proc pointer */ struct ipc_pset *p_pset; /* pset pointer */ - struct au_sentry *p_se; /* Audit session ptr */ } kn_ptr; struct filterops *kn_fop; int kn_status; /* status bits */ @@ -378,7 +364,7 @@ struct filterops { /* Optional f_touch operation, called only if !f_isfd && non-NULL */ void (*f_touch)(struct knote *kn, struct kevent64_s *kev, long type); /* Optional f_peek operation, called only if KN_STAYQUEUED is set */ - int (*f_peek)(struct knote *kn); + unsigned (*f_peek)(struct knote *kn); }; struct proc; @@ -399,6 +385,7 @@ extern int knote_detach(struct klist *list, struct knote *kn); extern int knote_link_wait_queue(struct knote *kn, struct wait_queue *wq); extern void knote_unlink_wait_queue(struct knote *kn, struct wait_queue *wq); extern void knote_fdclose(struct proc *p, int fd); +extern void knote_markstayqueued(struct knote *kn); #endif /* !KERNEL_PRIVATE */ diff --git a/bsd/sys/fasttrap_impl.h b/bsd/sys/fasttrap_impl.h index 259841c70..a4017cc41 100644 --- a/bsd/sys/fasttrap_impl.h +++ b/bsd/sys/fasttrap_impl.h @@ -201,10 +201,7 @@ extern int fasttrap_tracepoint_init(proc_t *, fasttrap_tracepoint_t *, extern int fasttrap_tracepoint_install(proc_t *, fasttrap_tracepoint_t *); extern int fasttrap_tracepoint_remove(proc_t *, fasttrap_tracepoint_t *); -#if defined (__ppc__) || defined (__ppc64__) -extern int fasttrap_pid_probe(ppc_saved_state_t *regs); -extern int fasttrap_return_probe(ppc_saved_state_t* regs); -#elif defined (__i386__) || defined(__x86_64__) +#if defined (__i386__) || defined(__x86_64__) extern int fasttrap_pid_probe(x86_saved_state_t *regs); extern int fasttrap_return_probe(x86_saved_state_t* regs); #else diff --git a/bsd/sys/fbt.h b/bsd/sys/fbt.h index c72208c99..3796443ea 100644 --- a/bsd/sys/fbt.h +++ b/bsd/sys/fbt.h @@ -29,9 +29,7 @@ #ifndef _FBT_H #define _FBT_H -#if defined (__ppc__) || defined (__ppc64__) -typedef uint32_t machine_inst_t; -#elif defined(__i386__) || defined (__x86_64__) +#if defined(__i386__) || defined (__x86_64__) typedef uint8_t machine_inst_t; #else #error Unknown Architecture @@ -45,18 +43,25 @@ typedef struct fbt_probe { int8_t fbtp_rval; machine_inst_t fbtp_patchval; machine_inst_t fbtp_savedval; + machine_inst_t fbtp_currentval; uintptr_t fbtp_roffset; dtrace_id_t fbtp_id; + /* FIXME! + * This field appears to only be used in error messages. + * It puts this structure into the next size bucket in kmem_alloc + * wasting 32 bytes per probe. (in i386 only) + */ char fbtp_name[MAX_FBTP_NAME_CHARS]; struct modctl *fbtp_ctl; int fbtp_loadcnt; +#if !defined(__APPLE__) int fbtp_symndx; - int fbtp_primary; +#endif struct fbt_probe *fbtp_next; } fbt_probe_t; extern int dtrace_invop(uintptr_t, uintptr_t *, uintptr_t); extern int fbt_invop(uintptr_t, uintptr_t *, uintptr_t); extern void fbt_provide_module(void *, struct modctl *); - +extern int fbt_enable (void *arg, dtrace_id_t id, void *parg); #endif /* _FBT_H */ diff --git a/bsd/sys/fcntl.h b/bsd/sys/fcntl.h index e9302b184..f6cbe9d5a 100644 --- a/bsd/sys/fcntl.h +++ b/bsd/sys/fcntl.h @@ -77,6 +77,9 @@ */ #include <sys/_types.h> #include <sys/cdefs.h> +#ifndef KERNEL +#include <Availability.h> +#endif /* We should not be exporting size_t here. Temporary for gcc bootstrapping. 
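Stepping back to the sys/event.h hunk above: the new EVFILT_VM filter and its NOTE_VM_PRESSURE fflags can be exercised from userspace with an ordinary kqueue. A hedged sketch; the function name is invented and the flag choice is an assumption:

#include <sys/event.h>

/* Register interest in system memory-pressure notifications. */
int
watch_vm_pressure(void)
{
	int kq = kqueue();
	struct kevent64_s kev;

	if (kq < 0)
		return -1;
	EV_SET64(&kev, 0, EVFILT_VM, EV_ADD | EV_ENABLE,
	    NOTE_VM_PRESSURE, 0, 0, 0, 0);
	return kevent64(kq, &kev, 1, NULL, 0, 0, NULL);
}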
*/ #ifndef _SIZE_T @@ -168,6 +171,14 @@ typedef __darwin_pid_t pid_t; #define O_DSYNC 0x400000 /* synch I/O data integrity */ #endif +#ifdef KERNEL +#define FNODIRECT 0x800000 /* fcntl(F_NODIRECT, 1) */ +#endif + +#if __DARWIN_C_LEVEL >= 200809L +#define O_CLOEXEC 0x1000000 /* implicitly set FD_CLOEXEC */ +#endif + #ifdef KERNEL /* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */ #define FFLAGS(oflags) ((oflags) + 1) @@ -220,6 +231,7 @@ typedef __darwin_pid_t pid_t; #define F_SETLK 8 /* set record locking information */ #define F_SETLKW 9 /* F_SETLK; wait if blocked */ #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#define F_FLUSH_DATA 40 #define F_CHKCLEAN 41 /* Used for regression test */ #define F_PREALLOCATE 42 /* Preallocate storage */ #define F_SETSIZE 43 /* Truncate a file without zeroing space */ @@ -248,14 +260,35 @@ typedef __darwin_pid_t pid_t; #define F_ADDFILESIGS 61 /* add signature from same file (used by dyld for shared libs) */ -#define F_GETPROTECTIONCLASS 62 /* Get the protection class of a file from the EA, returns int */ -#define F_SETPROTECTIONCLASS 63 /* Set the protection class of a file for the EA, requires int */ +#define F_NODIRECT 62 /* used in conjunction with F_NOCACHE to indicate that DIRECT, synchonous writes */ + /* should not be used (i.e. its ok to temporaily create cached pages) */ + +#define F_GETPROTECTIONCLASS 63 /* Get the protection class of a file from the EA, returns int */ +#define F_SETPROTECTIONCLASS 64 /* Set the protection class of a file for the EA, requires int */ + +#define F_LOG2PHYS_EXT 65 /* file offset to device offset, extended */ + +#define F_GETLKPID 66 /* get record locking information, per-process */ + +#ifdef PRIVATE +#define F_MOVEDATAEXTENTS 69 /* Swap only the data associated with two files */ +#endif + +#define F_SETBACKINGSTORE 70 /* Mark the file as being the backing store for another filesystem */ +#define F_GETPATH_MTMINFO 71 /* return the full path of the FD, but error in specific mtmd circumstances */ + +#define F_SETNOSIGPIPE 73 /* No SIGPIPE generated on EPIPE */ +#define F_GETNOSIGPIPE 74 /* Status of SIGPIPE for this fd */ // FS-specific fcntl()'s numbers begin at 0x00010000 and go up #define FCNTL_FS_SPECIFIC_BASE 0x00010000 #endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#if __DARWIN_C_LEVEL >= 200809L +#define F_DUPFD_CLOEXEC 67 /* mark the dup with FD_CLOEXEC */ +#endif + /* file descriptor flags (F_GETFD, F_SETFD) */ #define FD_CLOEXEC 1 /* close-on-exec flag */ @@ -296,7 +329,7 @@ typedef __darwin_pid_t pid_t; #define S_IFLNK 0120000 /* [XSI] symbolic link */ #define S_IFSOCK 0140000 /* [XSI] socket */ #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -#define S_IFWHT 0160000 /* whiteout */ +#define S_IFWHT 0160000 /* OBSOLETE: whiteout */ #endif /* File mode */ @@ -464,13 +497,22 @@ typedef struct user_fbootstraptransfer { * For them the fcntl will nedd to switch from using BMAP to CMAP * and a per filesystem type flag will be needed to interpret the * contiguous bytes count result from CMAP. + * + * F_LOG2PHYS_EXT is a variant of F_LOG2PHYS that uses a passed in + * file offset and length instead of the current file offset. + * F_LOG2PHYS_EXT operates on the same structure as F_LOG2PHYS, but + * treats it as an in/out. 
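A hedged usage sketch of F_LOG2PHYS_EXT following the in/out convention just described; the helper name is invented and error handling is minimal:

#include <fcntl.h>
#include <stdio.h>

/* Ask where a given file range lives on the underlying device. */
int
print_phys_mapping(int fd, off_t file_offset, off_t length)
{
	struct log2phys l2p = {
		.l2p_contigbytes = length,	/* IN: number of bytes to query */
		.l2p_devoffset = file_offset,	/* IN: byte offset into the file */
	};

	if (fcntl(fd, F_LOG2PHYS_EXT, &l2p) == -1)
		return -1;
	printf("%lld contiguous bytes at device offset %lld\n",
	    (long long)l2p.l2p_contigbytes, (long long)l2p.l2p_devoffset);
	return 0;
}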
*/ #pragma pack(4) struct log2phys { - unsigned int l2p_flags; /* unused so far */ - off_t l2p_contigbytes; /* unused so far */ - off_t l2p_devoffset; /* bytes into device */ + unsigned int l2p_flags; /* unused so far */ + off_t l2p_contigbytes; /* F_LOG2PHYS: unused so far */ + /* F_LOG2PHYS_EXT: IN: number of bytes to be queried */ + /* OUT: number of contiguous bytes at this position */ + off_t l2p_devoffset; /* F_LOG2PHYS: OUT: bytes into device */ + /* F_LOG2PHYS_EXT: IN: bytes into file */ + /* OUT: bytes into device */ }; #pragma pack() @@ -544,6 +586,13 @@ int fcntl(int, int, ...) __DARWIN_ALIAS_C(fcntl); #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) #ifdef PRIVATE +/* + * These definitions are retained temporarily for compatibility. + * If you want to use fileports, please use + * #include <sys/fileport.h> + * or + * #include <System/sys/fileport.h> + */ #ifndef _FILEPORT_T #define _FILEPORT_T typedef __darwin_mach_port_t fileport_t; @@ -561,7 +610,7 @@ void filesec_free(filesec_t); int filesec_get_property(filesec_t, filesec_property_t, void *); int filesec_query_property(filesec_t, filesec_property_t, int *); int filesec_set_property(filesec_t, filesec_property_t, const void *); -int filesec_unset_property(filesec_t, filesec_property_t); +int filesec_unset_property(filesec_t, filesec_property_t) __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2); #define _FILESEC_UNSET_PROPERTY ((void *)0) #define _FILESEC_REMOVE_ACL ((void *)1) #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ diff --git a/bsd/sys/file.h b/bsd/sys/file.h index b236f0840..bd3629144 100644 --- a/bsd/sys/file.h +++ b/bsd/sys/file.h @@ -81,6 +81,8 @@ #define _KAUTH_CRED_T struct ucred; typedef struct ucred *kauth_cred_t; +struct posix_cred; +typedef struct posix_cred *posix_cred_t; #endif /* !_KAUTH_CRED_T */ #pragma pack(4) diff --git a/bsd/sys/file_internal.h b/bsd/sys/file_internal.h index 7592c745d..9fcb4d1f3 100644 --- a/bsd/sys/file_internal.h +++ b/bsd/sys/file_internal.h @@ -105,14 +105,17 @@ struct fileproc { #define FP_WAITCLOSE 0x0040 #define FP_AIOISSUED 0x0080 #define FP_WAITEVENT 0x0100 +#define FP_SELCONFLICT 0x0200 /* select conflict on an individual fp */ -#define FP_VALID_FLAGS (FP_INCREATE | FP_INCLOSE | FP_INSELECT | FP_INCHRREAD | FP_WRITTEN | FP_WRITTEN | FP_CLOSING | FP_WAITCLOSE | FP_AIOISSUED | FP_WAITEVENT) +#define FP_VALID_FLAGS (FP_INCREATE | FP_INCLOSE | FP_INSELECT | FP_INCHRREAD | FP_WRITTEN | FP_CLOSING | FP_WAITCLOSE | FP_AIOISSUED | FP_WAITEVENT | FP_SELCONFLICT) #ifndef _KAUTH_CRED_T #define _KAUTH_CRED_T struct ucred; typedef struct ucred *kauth_cred_t; +struct posix_cred; +typedef struct posix_cred *posix_cred_t; #endif /* !_KAUTH_CRED_T */ /* file types */ @@ -133,6 +136,7 @@ typedef enum { #define FG_RMMSGQ 0x08 /* the fileglob is being removed from msgqueue */ #define FG_WRMMSGQ 0x10 /* wait for the fileglob to be removed from msgqueue */ #define FG_PORTMADE 0x20 /* a port was at some point created for this fileglob */ +#define FG_NOSIGPIPE 0x40 /* don't deliver SIGPIPE with EPIPE return */ struct fileglob { LIST_ENTRY(fileglob) f_list;/* list of active files */ @@ -159,11 +163,9 @@ struct fileglob { int (*fo_drain) (struct fileproc *fp, vfs_context_t ctx); } *fg_ops; off_t fg_offset; - caddr_t fg_data; /* vnode or socket or SHM or semaphore */ + void *fg_data; /* vnode or socket or SHM or semaphore */ lck_mtx_t fg_lock; int32_t fg_lflags; /* file global flags */ - unsigned int fg_lockpc[4]; - unsigned int fg_unlockpc[4]; #if CONFIG_MACF struct label 
*fg_label; /* JMM - use the one in the cred? */ #endif diff --git a/bsd/sys/filedesc.h b/bsd/sys/filedesc.h index 7ea50f5a9..740e2d6f4 100644 --- a/bsd/sys/filedesc.h +++ b/bsd/sys/filedesc.h @@ -121,7 +121,9 @@ struct filedesc { #ifdef KERNEL #define UF_RESVWAIT 0x10 /* close in progress */ -#define UF_VALID_FLAGS (UF_EXCLOSE| UF_RESERVED | UF_CLOSING | UF_RESVWAIT) +#define UF_INHERIT 0x20 /* "inherit-on-exec" */ +#define UF_VALID_FLAGS \ + (UF_EXCLOSE | UF_RESERVED | UF_CLOSING | UF_RESVWAIT | UF_INHERIT) #endif /* KERNEL */ /* @@ -148,7 +150,7 @@ extern void ffree(struct file *fp); #ifdef __APPLE_API_PRIVATE extern struct filedesc *fdcopy(proc_t p, struct vnode *uth_cdir); extern void fdfree(proc_t p); -extern void fdexec(proc_t p); +extern void fdexec(proc_t p, short flags); #endif /* __APPLE_API_PRIVATE */ #endif /* KERNEL */ diff --git a/osfmk/ppc/machine_cpu.h b/bsd/sys/fileport.h similarity index 68% rename from osfmk/ppc/machine_cpu.h rename to bsd/sys/fileport.h index 88fe14def..779179baf 100644 --- a/osfmk/ppc/machine_cpu.h +++ b/bsd/sys/fileport.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,27 +25,32 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#ifndef _PPC_MACHINE_CPU_H_ -#define _PPC_MACHINE_CPU_H_ -#include <mach/mach_types.h> -#include <mach/boolean.h> -#include <kern/kern_types.h> -#include <pexpert/pexpert.h> +#ifndef _SYS_FILEPORT_H_ +#define _SYS_FILEPORT_H_ -extern void cpu_machine_init( - void); +#include <sys/_types.h> +#include <sys/cdefs.h> -extern void cpu_doshutdown( - void); +#ifndef KERNEL -extern void cpu_signal_handler( - void); +__BEGIN_DECLS -typedef void (*broadcastFunc) (uint32_t); +#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -int32_t cpu_broadcast(uint32_t *, broadcastFunc, uint32_t); +#ifndef _FILEPORT_T +#define _FILEPORT_T +typedef __darwin_mach_port_t fileport_t; +#define FILEPORT_NULL ((fileport_t)0) +#endif /* _FILEPORT_T */ -#define cpu_pause() /* Not for this architecture */ +int fileport_makeport(int, fileport_t *); +int fileport_makefd(fileport_t); -#endif /* _PPC_MACHINE_CPU_H_ */ +#endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ + +__END_DECLS + +#endif /* !KERNEL */ + +#endif /* !_SYS_FILEPORT_H_ */ diff --git a/bsd/sys/fsctl.h b/bsd/sys/fsctl.h index b70ba0651..40c6b10b1 100644 --- a/bsd/sys/fsctl.h +++ b/bsd/sys/fsctl.h @@ -70,26 +70,161 @@ #define _SYS_FSCTL_H_ #include <sys/ioccom.h> +#include <sys/mount.h> -#define FSIOC_SYNC_VOLUME _IOW('A', 1, uint32_t) -#define FSCTL_SYNC_VOLUME IOCBASECMD(FSIOC_SYNC_VOLUME) +#ifdef XNU_KERNEL_PRIVATE -#define FSCTL_SYNC_FULLSYNC (1<<0) /* Flush the data fully to disk, if supported by the filesystem */ -#define FSCTL_SYNC_WAIT (1<<1) /* Wait for the sync to complete */ +typedef struct user64_namespace_handler_info { + user64_addr_t token; + user64_addr_t flags; + user64_addr_t fdptr; +} user64_namespace_handler_info; + +typedef struct user32_namespace_handler_info { + user32_addr_t token; + user32_addr_t flags; + user32_addr_t fdptr; +} user32_namespace_handler_info; + +typedef struct namespace_handler_info { + user_addr_t token; + user_addr_t flags; + user_addr_t fdptr; +} namespace_handler_info; + +typedef struct user64_namespace_handler_info_ext { + user64_addr_t token; + user64_addr_t flags; + user64_addr_t fdptr; + user64_addr_t infoptr; +} user64_namespace_handler_info_ext; + +typedef struct 
user32_namespace_handler_info_ext { + user32_addr_t token; + user32_addr_t flags; + user32_addr_t fdptr; + user32_addr_t infoptr; +} user32_namespace_handler_info_ext; + +typedef struct namespace_handler_info_ext { + user_addr_t token; + user_addr_t flags; + user_addr_t fdptr; + user_addr_t infoptr; +} namespace_handler_info_ext; + +extern int resolve_nspace_item(struct vnode *vp, uint64_t op); +extern int resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg); +extern int get_nspace_item_status(struct vnode *vp, int32_t *status); + +#else + +typedef struct namespace_handler_info { + int32_t *token; + int64_t *flags; + int32_t *fdptr; +} namespace_handler_info; + +typedef struct namespace_handler_info_ext { + int32_t *token; + int64_t *flags; + int32_t *fdptr; + int64_t *infoptr; // for snapshot write events, the kernel puts an offset/length pair here +} namespace_handler_info_ext; + + +#endif /* XNU_KERNEL_PRIVATE */ + +#define NAMESPACE_HANDLER_READ_OP 0x0001 +#define NAMESPACE_HANDLER_WRITE_OP 0x0002 +#define NAMESPACE_HANDLER_DELETE_OP 0x0004 +#define NAMESPACE_HANDLER_TRUNCATE_OP 0x0008 +#define NAMESPACE_HANDLER_RENAME_OP 0x0010 +#define NAMESPACE_HANDLER_METADATA_WRITE_OP 0x0020 +#define NAMESPACE_HANDLER_METADATA_DELETE_OP 0x0040 +#define NAMESPACE_HANDLER_METADATA_MOD 0x0080 +#define NAMESPACE_HANDLER_LINK_CREATE 0x0200 + +#define NAMESPACE_HANDLER_NSPACE_EVENT 0x1000 +#define NAMESPACE_HANDLER_SNAPSHOT_EVENT 0x0100 +#define NAMESPACE_HANDLER_TRACK_EVENT 0x2000 + +#define NAMESPACE_HANDLER_EVENT_TYPE_MASK (NAMESPACE_HANDLER_NSPACE_EVENT | NAMESPACE_HANDLER_SNAPSHOT_EVENT | NAMESPACE_HANDLER_TRACK_EVENT) + +#define DATALESS_CMPFS_TYPE 0x80000001 +typedef int32_t nspace_handler_info[2]; +typedef char fstypename_t[MFSTYPENAMELEN]; + +#ifdef KERNEL + +typedef struct user64_package_ext_info { + user64_addr_t strings; + uint32_t num_entries; + uint32_t max_width; +} user64_package_ext_info; + +typedef struct user32_package_ext_info { + user32_addr_t strings; + uint32_t num_entries; + uint32_t max_width; +} user32_package_ext_info; + +#endif // KERNEL + typedef struct package_ext_info { const char *strings; uint32_t num_entries; uint32_t max_width; } package_ext_info; -#define FSIOC_SET_PACKAGE_EXTS _IOW('A', 2, struct package_ext_info) -#define FSCTL_SET_PACKAGE_EXTS IOCBASECMD(FSIOC_SET_PACKAGE_EXTS) +#define FSCTL_SYNC_FULLSYNC (1<<0) /* Flush the data fully to disk, if supported by the filesystem */ +#define FSCTL_SYNC_WAIT (1<<1) /* Wait for the sync to complete */ + + +#define FSIOC_SYNC_VOLUME _IOW('A', 1, uint32_t) +#define FSCTL_SYNC_VOLUME IOCBASECMD(FSIOC_SYNC_VOLUME) + +#define FSIOC_SET_PACKAGE_EXTS _IOW('A', 2, struct package_ext_info) +#define FSCTL_SET_PACKAGE_EXTS IOCBASECMD(FSIOC_SET_PACKAGE_EXTS) + +#define FSIOC_WAIT_FOR_SYNC _IOR('A', 3, int32_t) +#define FSCTL_WAIT_FOR_SYNC IOCBASECMD(FSIOC_WAIT_FOR_SYNC) + +#define FSIOC_NAMESPACE_HANDLER_GET _IOW('A', 4, struct namespace_handler_info) +#define FSCTL_NAMESPACE_HANDLER_GET IOCBASECMD(FSIOC_NAMESPACE_HANDLER_GET) + +#define FSIOC_NAMESPACE_HANDLER_UPDATE _IOW('A', 5, nspace_handler_info) +#define FSCTL_NAMESPACE_HANDLER_UPDATE IOCBASECMD(FSIOC_NAMESPACE_HANDLER_UPDATE) + +#define FSIOC_NAMESPACE_HANDLER_UNBLOCK _IOW('A', 6, nspace_handler_info) +#define FSCTL_NAMESPACE_HANDLER_UNBLOCK IOCBASECMD(FSIOC_NAMESPACE_HANDLER_UNBLOCK) -#define FSIOC_WAIT_FOR_SYNC _IOR('A', 3, int32_t) -#define FSCTL_WAIT_FOR_SYNC IOCBASECMD(FSIOC_WAIT_FOR_SYNC) +#define FSIOC_NAMESPACE_HANDLER_CANCEL _IOW('A', 7, 
nspace_handler_info) +#define FSCTL_NAMESPACE_HANDLER_CANCEL IOCBASECMD(FSIOC_NAMESPACE_HANDLER_CANCEL) +#define FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME _IOW('A', 8, int32_t) +#define FSCTL_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME IOCBASECMD(FSIOC_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME) + +#define FSIOC_OLD_SNAPSHOT_HANDLER_GET _IOW('A', 9, struct namespace_handler_info) +#define FSCTL_OLD_SNAPSHOT_HANDLER_GET IOCBASECMD(FSIOC_OLD_SNAPSHOT_HANDLER_GET) + +#define FSIOC_SET_FSTYPENAME_OVERRIDE _IOW('A', 10, fstypename_t) +#define FSCTL_SET_FSTYPENAME_OVERRIDE IOCBASECMD(FSIOC_SET_FSTYPENAME_OVERRIDE) + +#define FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS _IOW('A', 11, int32_t) +#define FSCTL_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS IOCBASECMD(FSIOC_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS) + +#define FSIOC_TRACKED_HANDLER_GET _IOW('A', 12, struct namespace_handler_info) +#define FSCTL_TRACKED_HANDLER_GET IOCBASECMD(FSIOC_TRACKED_HANDLER_GET) + +#define FSIOC_SNAPSHOT_HANDLER_GET_EXT _IOW('A', 13, struct namespace_handler_info_ext) +#define FSCTL_SNAPSHOT_HANDLER_GET_EXT IOCBASECMD(FSIOC_SNAPSHOT_HANDLER_GET_EXT) + +// +// IO commands 14, 15, 16, and 17 are currently unused +// // // Spotlight and fseventsd use these fsctl()'s to find out @@ -104,27 +239,10 @@ typedef struct package_ext_info { // or else it will break binary compatibility with mds // and fseventsd. // -#define SPOTLIGHT_IOC_GET_MOUNT_TIME _IOR('h', 18, u_int32_t) -#define SPOTLIGHT_FSCTL_GET_MOUNT_TIME IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME) -#define SPOTLIGHT_IOC_GET_LAST_MTIME _IOR('h', 19, u_int32_t) -#define SPOTLIGHT_FSCTL_GET_LAST_MTIME IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME) - - -#ifdef KERNEL - -typedef struct user64_package_ext_info { - user64_addr_t strings; - uint32_t num_entries; - uint32_t max_width; -} user64_package_ext_info; - -typedef struct user32_package_ext_info { - user32_addr_t strings; - uint32_t num_entries; - uint32_t max_width; -} user32_package_ext_info; - -#endif // KERNEL +#define SPOTLIGHT_IOC_GET_MOUNT_TIME _IOR('h', 18, u_int32_t) +#define SPOTLIGHT_FSCTL_GET_MOUNT_TIME IOCBASECMD(SPOTLIGHT_IOC_GET_MOUNT_TIME) +#define SPOTLIGHT_IOC_GET_LAST_MTIME _IOR('h', 19, u_int32_t) +#define SPOTLIGHT_FSCTL_GET_LAST_MTIME IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME) #ifndef KERNEL diff --git a/bsd/sys/fsevents.h b/bsd/sys/fsevents.h index e5cb3ec3c..82c16ac48 100644 --- a/bsd/sys/fsevents.h +++ b/bsd/sys/fsevents.h @@ -161,6 +161,7 @@ typedef struct fse_info { } fse_info; int get_fse_info(struct vnode *vp, fse_info *fse, vfs_context_t ctx); +int vnode_get_fse_info_from_vap(vnode_t vp, fse_info *fse, struct vnode_attr *vap); char *get_pathbuff(void); void release_pathbuff(char *path); diff --git a/bsd/sys/fslog.h b/bsd/sys/fslog.h index c1bee8c64..1266f3075 100644 --- a/bsd/sys/fslog.h +++ b/bsd/sys/fslog.h @@ -87,6 +87,14 @@ void fslog_fs_corrupt(struct mount *mnt); void fslog_io_error(const buf_t bp); #endif /* BSD_KERNEL_PRIVATE */ + +#ifdef XNU_KERNEL_PRIVATE + +/* Log information about external modification of a target process */ +void fslog_extmod_msgtracer(proc_t caller, proc_t target); + +#endif /* XNU_KERNEL_PRIVATE */ + #endif /* KERNEL */ /* Keys used by FSLog */ diff --git a/bsd/sys/imageboot.h b/bsd/sys/imageboot.h index 9ab02b5ab..a77c9cca8 100644 --- a/bsd/sys/imageboot.h +++ b/bsd/sys/imageboot.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2006-2010 Apple Inc. All rights reserved. 
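Returning to the fsctl.h hunk above: the FSIOC_*/FSCTL_* commands are issued through fsctl(2), declared for userspace later in that header. A hedged example of a full synchronous volume flush; the helper name is invented:

#include <stdint.h>
#include <sys/fsctl.h>

/* Flush a mounted volume fully to disk and wait for completion. */
int
sync_volume(const char *mount_point)
{
	uint32_t flags = FSCTL_SYNC_FULLSYNC | FSCTL_SYNC_WAIT;

	return fsctl(mount_point, FSIOC_SYNC_VOLUME, &flags, 0);
}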
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -28,7 +28,12 @@ #ifndef _IMAGEBOOT_H_ #define _IMAGEBOOT_H_ -int imageboot_needed(void); -int imageboot_setup(void); +int imageboot_needed(void); +void imageboot_setup(void); +int imageboot_format_is_valid(const char *root_path); +int imageboot_mount_image(const char *root_path, int height); + +#define IMAGEBOOT_CONTAINER_ARG "container-dmg" +#define IMAGEBOOT_ROOT_ARG "root-dmg" #endif diff --git a/bsd/sys/imgact.h b/bsd/sys/imgact.h index fa9be0460..0a194b779 100644 --- a/bsd/sys/imgact.h +++ b/bsd/sys/imgact.h @@ -85,15 +85,23 @@ struct image_params { char *ip_vdata; /* file data (up to one page) */ int ip_flags; /* image flags */ int ip_argc; /* argument count */ - char *ip_argv; /* argument vector beginning */ int ip_envc; /* environment count */ + int ip_applec; /* apple vector count */ + + char *ip_startargv; /* argument vector beginning */ + char *ip_endargv; /* end of argv/start of envv */ + char *ip_endenvv; /* end of envv/start of applev */ + char *ip_strings; /* base address for strings */ char *ip_strendp; /* current end pointer */ - char *ip_strendargvp; /* end of argv/start of envp */ - int ip_strspace; /* remaining space */ + + int ip_argspace; /* remaining space of NCARGS limit (argv+envv) */ + int ip_strspace; /* remaining total string space */ + user_size_t ip_arch_offset; /* subfile offset in ip_vp */ user_size_t ip_arch_size; /* subfile length in ip_vp */ - char ip_interp_name[IMG_SHSIZE]; /* interpreter name */ + char ip_interp_buffer[IMG_SHSIZE]; /* interpreter buffer space */ + int ip_interp_sugid_fd; /* fd for sugid script */ /* Next two fields are for support of architecture translation... */ char *ip_p_comm; /* optional alt p->p_comm */ @@ -112,14 +120,16 @@ struct image_params { /* * Image flags */ -#define IMGPF_NONE 0x00000000 /* No flags */ -#define IMGPF_INTERPRET 0x00000001 /* Interpreter invoked */ -#define IMGPF_POWERPC 0x00000002 /* ppc mode for x86 */ +#define IMGPF_NONE 0x00000000 /* No flags */ +#define IMGPF_INTERPRET 0x00000001 /* Interpreter invoked */ +#define IMGPF_POWERPC 0x00000002 /* ppc mode for x86 */ #if CONFIG_EMBEDDED #undef IMGPF_POWERPC #endif -#define IMGPF_WAS_64BIT 0x00000004 /* exec from a 64Bit binary */ -#define IMGPF_IS_64BIT 0x00000008 /* exec to a 64Bit binary */ -#define IMGPF_SPAWN 0x00000010 /* spawn (without setexec) */ +#define IMGPF_WAS_64BIT 0x00000004 /* exec from a 64Bit binary */ +#define IMGPF_IS_64BIT 0x00000008 /* exec to a 64Bit binary */ +#define IMGPF_SPAWN 0x00000010 /* spawn (without setexec) */ +#define IMGPF_DISABLE_ASLR 0x00000020 /* disable ASLR */ +#define IMGPF_ALLOW_DATA_EXEC 0x00000040 /* forcibly disallow data execution */ #endif /* !_SYS_IMGACT */ diff --git a/bsd/ppc/ptrace.h b/bsd/sys/imgsrc.h similarity index 74% rename from bsd/ppc/ptrace.h rename to bsd/sys/imgsrc.h index be9af6886..aac577176 100644 --- a/bsd/ppc/ptrace.h +++ b/bsd/sys/imgsrc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,8 +25,9 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ +/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ /* - * Copyright (c) 1992, 1993 + * Copyright (c) 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -57,11 +58,42 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)ptrace.h 8.1 (Berkeley) 6/11/93 */ -/* - * Machine dependent trace commands. - * - * None for the PowerPC at this time. +#ifndef _SYS_IMGSRC_H_ +#define _SYS_IMGSRC_H_ + +#include <stdint.h> +/* + * For mount(2), defined here for easy use with System.framework/PrivateHeaders. */ +#define MNT_IMGSRC_BY_INDEX 0x20000000 + +typedef struct imgsrc_info +{ + uint32_t ii_height; /* Nesting height: 0 is outermost */ + uint32_t ii_flags; /* Currently unused */ + dev_t ii_dev; /* dev_t for this volume */ + char ii_reserved[24];/* TBD */ +} *imgsrc_info_t; + +struct mnt_imgsrc_args { + uint32_t mi_height; /* As determined from an imgsrc_info structure */ + uint32_t mi_flags; /* TBD */ + const char* mi_devpath; /* Path to devnode */ +}; + +#ifdef BSD_KERNEL_PRIVATE +struct user64_mnt_imgsrc_args { + uint32_t mi_height; + uint32_t mi_flags; + user64_addr_t mi_devpath; +}; + +struct user32_mnt_imgsrc_args { + uint32_t mi_height; + uint32_t mi_flags; + user32_addr_t mi_devpath; +}; +#endif /* XNU_KERNEL_PRIVATE */ +#endif /* _SYS_IMGSRC_H_ */ diff --git a/bsd/sys/kauth.h b/bsd/sys/kauth.h index 33078a1f4..94f0b1e1e 100644 --- a/bsd/sys/kauth.h +++ b/bsd/sys/kauth.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2007 Apple Inc. All rights reserved. + * Copyright (c) 2004-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -106,11 +106,14 @@ struct kauth_identity_extlookup { #define KAUTH_EXTLOOKUP_WANT_MEMBERSHIP (1<<12) #define KAUTH_EXTLOOKUP_VALID_MEMBERSHIP (1<<13) #define KAUTH_EXTLOOKUP_ISMEMBER (1<<14) +#define KAUTH_EXTLOOKUP_VALID_PWNAM (1<<15) +#define KAUTH_EXTLOOKUP_WANT_PWNAM (1<<16) +#define KAUTH_EXTLOOKUP_VALID_GRNAM (1<<17) +#define KAUTH_EXTLOOKUP_WANT_GRNAM (1<<18) __darwin_pid_t el_info_pid; /* request on behalf of PID */ + u_int64_t el_extend; /* extension field */ u_int32_t el_info_reserved_1; /* reserved (APPLE) */ - u_int32_t el_info_reserved_2; /* reserved (APPLE) */ - u_int32_t el_info_reserved_3; /* reserved (APPLE) */ uid_t el_uid; /* user ID */ guid_t el_uguid; /* user GUID */ @@ -177,7 +180,6 @@ struct kauth_cred { int kc_nwhtgroups; /* whiteout group list */ gid_t *kc_whtgroups; - struct auditinfo cr_au; struct au_session cr_audit; /* user auditing data */ int kc_nsupplement; /* entry count in supplemental data pointer array */ @@ -192,6 +194,16 @@ struct kauth_cred { /* Kernel SPI for now */ __BEGIN_DECLS +/* + * Routines specific to credentials with POSIX credential labels attached + * + * XXX Should be in policy_posix.h, with struct posix_cred + */ +extern kauth_cred_t posix_cred_create(posix_cred_t pcred); +extern posix_cred_t posix_cred_get(kauth_cred_t cred); +extern void posix_cred_label(kauth_cred_t cred, posix_cred_t pcred); +extern int posix_cred_access(kauth_cred_t cred, id_t object_uid, id_t object_gid, mode_t object_mode, mode_t mode_req); + extern uid_t kauth_getuid(void); extern uid_t kauth_getruid(void); extern gid_t kauth_getgid(void); @@ -221,7 +233,15 @@ extern int kauth_proc_label_update(struct proc *p, void *label); extern kauth_cred_t kauth_cred_find(kauth_cred_t cred); extern uid_t kauth_cred_getuid(kauth_cred_t _cred); +extern uid_t kauth_cred_getruid(kauth_cred_t _cred); +extern uid_t kauth_cred_getsvuid(kauth_cred_t _cred); extern gid_t kauth_cred_getgid(kauth_cred_t _cred); +extern gid_t kauth_cred_getrgid(kauth_cred_t _cred); 
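The new real/saved-id accessors above complement the existing effective-id ones; a short, hedged kernel-side sketch (the predicate is invented):

#include <sys/kauth.h>

/* A credential in a setuid context shows differing effective/real uids. */
static int
cred_is_setuid(kauth_cred_t cred)
{
	return kauth_cred_getuid(cred) != kauth_cred_getruid(cred);
}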
+extern gid_t kauth_cred_getsvgid(kauth_cred_t _cred); +extern int kauth_cred_pwnam2guid(char *pwnam, guid_t *guidp); +extern int kauth_cred_grnam2guid(char *grnam, guid_t *guidp); +extern int kauth_cred_guid2pwnam(guid_t *guidp, char *pwnam); +extern int kauth_cred_guid2grnam(guid_t *guidp, char *grnam); extern int kauth_cred_guid2uid(guid_t *_guid, uid_t *_uidp); extern int kauth_cred_guid2gid(guid_t *_guid, gid_t *_gidp); extern int kauth_cred_ntsid2uid(ntsid_t *_sid, uid_t *_uidp); @@ -273,7 +293,7 @@ extern void kauth_cred_uthread_update(struct uthread *, proc_t); #ifdef CONFIG_MACF extern int kauth_proc_label_update_execve(struct proc *p, struct vfs_context *ctx, struct vnode *vp, struct label *scriptlabel, struct label *execlabel); #endif -extern int kauth_cred_getgroups(gid_t *_groups, int *_groupcount); +extern int kauth_cred_getgroups(kauth_cred_t _cred, gid_t *_groups, int *_groupcount); extern int kauth_cred_assume(uid_t _uid); extern int kauth_cred_gid_subset(kauth_cred_t _cred1, kauth_cred_t _cred2, int *_resultp); struct auditinfo_addr; @@ -468,6 +488,7 @@ struct kauth_acl_eval { int ae_options; #define KAUTH_AEVAL_IS_OWNER (1<<0) /* authorizing operation for owner */ #define KAUTH_AEVAL_IN_GROUP (1<<1) /* authorizing operation for groupmember */ +#define KAUTH_AEVAL_IN_GROUP_UNKNOWN (1<<2) /* authorizing operation for unknown group membership */ /* expansions for 'generic' rights bits */ kauth_ace_rights_t ae_exp_gall; kauth_ace_rights_t ae_exp_gread; diff --git a/bsd/sys/kdebug.h b/bsd/sys/kdebug.h index 9f7b789c9..393c413df 100644 --- a/bsd/sys/kdebug.h +++ b/bsd/sys/kdebug.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -47,6 +47,11 @@ __BEGIN_DECLS #include <kdebug.h> #endif /* KERNEL_BUILD */ +#ifdef XNU_KERNEL_PRIVATE +#include <stdint.h> +#include <mach/branch_predicates.h> +#endif + /* * types of faults that vm_fault handles * and creates trace entries for @@ -77,21 +82,22 @@ __BEGIN_DECLS /* The Kernel Debug Classes */ -#define DBG_MACH 1 -#define DBG_NETWORK 2 -#define DBG_FSYSTEM 3 -#define DBG_BSD 4 -#define DBG_IOKIT 5 -#define DBG_DRIVERS 6 -#define DBG_TRACE 7 +#define DBG_MACH 1 +#define DBG_NETWORK 2 +#define DBG_FSYSTEM 3 +#define DBG_BSD 4 +#define DBG_IOKIT 5 +#define DBG_DRIVERS 6 +#define DBG_TRACE 7 #define DBG_DLIL 8 #define DBG_SECURITY 9 -#define DBG_MISC 20 -#define DBG_DYLD 31 -#define DBG_QT 32 -#define DBG_APPS 33 -#define DBG_LAUNCHD 34 -#define DBG_MIG 255 +#define DBG_CORESTORAGE 10 +#define DBG_MISC 20 +#define DBG_DYLD 31 +#define DBG_QT 32 +#define DBG_APPS 33 +#define DBG_LAUNCHD 34 +#define DBG_MIG 255 /* **** The Kernel Debug Sub Classes for Mach (DBG_MACH) **** */ #define DBG_MACH_EXCP_KTRAP_x86 0x02 /* Kernel Traps on x86 */ @@ -114,6 +120,7 @@ __BEGIN_DECLS #define DBG_MACH_MSGID_INVALID 0x50 /* Messages - invalid */ #define DBG_MACH_LOCKS 0x60 /* new lock APIs */ #define DBG_MACH_PMAP 0x70 /* pmap */ +#define DBG_MACH_MP 0x90 /* MP related */ /* Codes for Scheduler (DBG_MACH_SCHED) */ #define MACH_SCHED 0x0 /* Scheduler */ @@ -127,6 +134,16 @@ __BEGIN_DECLS #define MACH_DEMOTE 0x8 /* promotion undone */ #define MACH_IDLE 0x9 /* processor idling */ #define MACH_STACK_DEPTH 0xa /* stack depth at switch */ +#define MACH_MOVED 0xb /* did not use original scheduling decision */ +#define MACH_FAIRSHARE_ENTER 0xc /* move to fairshare band */ +#define MACH_FAIRSHARE_EXIT 0xd /* exit fairshare band 
*/ +#define MACH_FAILSAFE 0xe /* tripped fixed-pri/RT failsafe */ +#define MACH_GET_URGENCY 0x14 /* Urgency queried by platform */ +#define MACH_URGENCY 0x15 /* Urgency (RT/BG/NORMAL) communicated + * to platform */ +#define MACH_REDISPATCH 0x16 /* "next thread" thread redispatched */ +#define MACH_REMOTE_AST 0x17 /* AST signal issued to remote processor */ +#define MACH_SCHED_LPA_BROKEN 0x18 /* last_processor affinity broken in choose_processor */ /* Codes for pmap (DBG_MACH_PMAP) */ #define PMAP__CREATE 0x0 @@ -176,31 +193,32 @@ __BEGIN_DECLS #define DBG_IOMCURS 5 /* Memory Cursor */ #define DBG_IOMDESC 6 /* Memory Descriptors */ #define DBG_IOPOWER 7 /* Power Managerment */ -#define DBG_IOSERVICE 8 /* Matching etc. */ +#define DBG_IOSERVICE 8 /* Matching etc. */ /* **** 9-32 reserved for internal IOKit usage **** */ #define DBG_IOSTORAGE 32 /* Storage layers */ #define DBG_IONETWORK 33 /* Network layers */ #define DBG_IOKEYBOARD 34 /* Keyboard */ -#define DBG_IOHID 35 /* HID Devices */ -#define DBG_IOAUDIO 36 /* Audio */ +#define DBG_IOHID 35 /* HID Devices */ +#define DBG_IOAUDIO 36 /* Audio */ #define DBG_IOSERIAL 37 /* Serial */ -#define DBG_IOTTY 38 /* TTY layers */ -#define DBG_IOSAM 39 /* SCSI Architecture Model layers */ -#define DBG_IOPARALLELATA 40 /* Parallel ATA */ +#define DBG_IOTTY 38 /* TTY layers */ +#define DBG_IOSAM 39 /* SCSI Architecture Model layers */ +#define DBG_IOPARALLELATA 40 /* Parallel ATA */ #define DBG_IOPARALLELSCSI 41 /* Parallel SCSI */ -#define DBG_IOSATA 42 /* Serial-ATA */ -#define DBG_IOSAS 43 /* SAS */ +#define DBG_IOSATA 42 /* Serial-ATA */ +#define DBG_IOSAS 43 /* SAS */ #define DBG_IOFIBRECHANNEL 44 /* FiberChannel */ -#define DBG_IOUSB 45 /* USB */ +#define DBG_IOUSB 45 /* USB */ #define DBG_IOBLUETOOTH 46 /* Bluetooth */ #define DBG_IOFIREWIRE 47 /* FireWire */ #define DBG_IOINFINIBAND 48 /* Infiniband */ -#define DBG_IOCPUPM 49 /* CPU Power Management */ +#define DBG_IOCPUPM 49 /* CPU Power Management */ #define DBG_IOGRAPHICS 50 /* Graphics */ #define DBG_HIBERNATE 51 /* hibernation related events */ + /* Backwards compatibility */ #define DBG_IOPOINTING DBG_IOHID /* OBSOLETE: Use DBG_IOHID instead */ #define DBG_IODISK DBG_IOSTORAGE /* OBSOLETE: Use DBG_IOSTORAGE instead */ @@ -223,7 +241,7 @@ __BEGIN_DECLS #define DBG_DRVFIREWIRE 16 /* FireWire */ #define DBG_DRVINFINIBAND 17 /* Infiniband */ #define DBG_DRVGRAPHICS 18 /* Graphics */ -#define DBG_DRVSD 19 /* Secure Digital */ +#define DBG_DRVSD 19 /* Secure Digital */ /* Backwards compatibility */ #define DBG_DRVPOINTING DBG_DRVHID /* OBSOLETE: Use DBG_DRVHID instead */ @@ -236,7 +254,7 @@ __BEGIN_DECLS #define DBG_DLIL_PR_FLT 4 /* DLIL Protocol Filter */ #define DBG_DLIL_IF_FLT 5 /* DLIL Interface FIlter */ -/* The Kernel Debug Sub Classes for File System */ +/* The Kernel Debug Sub Classes for File System (DBG_FSYSTEM) */ #define DBG_FSRW 1 /* reads and writes to the filesystem */ #define DBG_DKRW 2 /* reads and writes to the disk */ #define DBG_FSVN 3 /* vnode operations (inc. 
locking/unlocking) */ @@ -244,6 +262,7 @@ __BEGIN_DECLS #define DBG_JOURNAL 5 /* journaling operations */ #define DBG_IOCTL 6 /* ioctl to the disk */ #define DBG_BOOTCACHE 7 /* bootcache operations */ +#define DBG_HFS 8 /* HFS-specific events; see bsd/hfs/hfs_kdebug.h */ /* The Kernel Debug Sub Classes for BSD */ #define DBG_BSD_PROC 0x01 /* process/signals related */ @@ -256,11 +275,15 @@ __BEGIN_DECLS /* The Codes for BSD subcode class DBG_BSD_PROC */ #define BSD_PROC_EXIT 1 /* process exit */ #define BSD_PROC_FRCEXIT 2 /* Kernel force termination */ + /* The Kernel Debug Sub Classes for DBG_TRACE */ #define DBG_TRACE_DATA 0 #define DBG_TRACE_STRING 1 #define DBG_TRACE_INFO 2 +/* The Kernel Debug Sub Classes for DBG_CORESTORAGE */ +#define DBG_CS_IO 0 + /* The Kernel Debug Sub Classes for DBG_MISC */ #define DBG_EVENT 0x10 #define DBG_BUFFER 0x20 @@ -274,6 +297,8 @@ __BEGIN_DECLS #define DKIO_ASYNC 0x04 #define DKIO_META 0x08 #define DKIO_PAGING 0x10 +#define DKIO_THROTTLE 0x20 +#define DKIO_PASSIVE 0x40 /* Codes for Application Sub Classes */ #define DBG_APP_SAMBA 128 @@ -343,25 +368,38 @@ extern unsigned int kdebug_enable; #define KDEBUG_ENABLE_CHUD 0x4 #if (!defined(NO_KDEBUG)) - +#ifdef XNU_KERNEL_PRIVATE #define KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e) \ do { \ - if (kdebug_enable) \ + if (__improbable(kdebug_enable)) \ kernel_debug(x,(uintptr_t)a,(uintptr_t)b,(uintptr_t)c, \ (uintptr_t)d,(uintptr_t)e); \ } while(0) #define KERNEL_DEBUG_CONSTANT1(x,a,b,c,d,e) \ do { \ - if (kdebug_enable) \ + if (__improbable(kdebug_enable)) \ kernel_debug1(x,(uintptr_t)a,(uintptr_t)b,(uintptr_t)c, \ (uintptr_t)d,(uintptr_t)e); \ } while(0) +#else /* XNU_KERNEL_PRIVATE */ +#define KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e) \ +do { \ + if (kdebug_enable) \ + kernel_debug(x,(uintptr_t)a,(uintptr_t)b,(uintptr_t)c, \ + (uintptr_t)d,(uintptr_t)e); \ +} while(0) -#else - -#define KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e) -#define KERNEL_DEBUG_CONSTANT1(x,a,b,c,d,e) +#define KERNEL_DEBUG_CONSTANT1(x,a,b,c,d,e) \ +do { \ + if (kdebug_enable) \ + kernel_debug1(x,(uintptr_t)a,(uintptr_t)b,(uintptr_t)c, \ + (uintptr_t)d,(uintptr_t)e); \ +} while(0) +#endif /* XNU_KERNEL_PRIVATE */ +#else /*!NO_KDEBUG */ +#define KERNEL_DEBUG_CONSTANT(x,a,b,c,d,e) do { } while(0) +#define KERNEL_DEBUG_CONSTANT1(x,a,b,c,d,e) do { } while(0) #define __kdebug_constant_only __unused #endif @@ -384,23 +422,37 @@ extern void kernel_debug1( #if (KDEBUG && (!defined(NO_KDEBUG))) - +#ifdef XNU_KERNEL_PRIVATE #define KERNEL_DEBUG(x,a,b,c,d,e) \ do { \ - if (kdebug_enable) \ + if (__improbable(kdebug_enable)) \ kernel_debug((uint32_t)x, (uintptr_t)a, (uintptr_t)b, \ (uintptr_t)c, (uintptr_t)d, (uintptr_t)e); \ } while(0) #define KERNEL_DEBUG1(x,a,b,c,d,e) \ do { \ - if (kdebug_enable) \ + if (__improbable(kdebug_enable)) \ kernel_debug1((uint32_t)x, (uintptr_t)a, (uintptr_t)b, \ (uintptr_t)c, (uintptr_t)d, (uintptr_t)e); \ } while(0) #define __kdebug_only +#else /* !XNU_KERNEL_PRIVATE */ +#define KERNEL_DEBUG(x,a,b,c,d,e) \ +do { \ + if (kdebug_enable) \ + kernel_debug((uint32_t)x, (uintptr_t)a, (uintptr_t)b, \ + (uintptr_t)c, (uintptr_t)d, (uintptr_t)e); \ +} while(0) +#define KERNEL_DEBUG1(x,a,b,c,d,e) \ +do { \ + if (kdebug_enable) \ + kernel_debug1((uint32_t)x, (uintptr_t)a, (uintptr_t)b, \ + (uintptr_t)c, (uintptr_t)d, (uintptr_t)e); \ +} while(0) +#endif /* XNU_KERNEL_PRIVATE */ #else #define KERNEL_DEBUG(x,a,b,c,d,e) do {} while (0) @@ -410,6 +462,7 @@ do { \ #endif #ifdef KERNEL_PRIVATE +#include <mach/boolean.h> struct proc; extern void 
kdbg_trace_data(struct proc *proc, long *arg_pid); @@ -417,6 +470,19 @@ extern void kdbg_trace_string(struct proc *proc, long *arg1, long *arg2, long *a extern void kdbg_dump_trace_to_file(const char *); void start_kern_tracing(unsigned int); +struct task; +extern void kdbg_get_task_name(char*, int, struct task *task); +void disable_wrap(uint32_t *old_slowcheck, uint32_t *old_flags); +void enable_wrap(uint32_t old_slowcheck, boolean_t lostevents); +void release_storage_unit(int cpu, uint32_t storage_unit); +int allocate_storage_unit(int cpu); + +void trace_handler_map_ctrl_page(uintptr_t addr, unsigned long ctrl_page_size, unsigned long storage_size, unsigned long kds_ptr_size); +void trace_handler_map_bufinfo(uintptr_t addr, unsigned long size); +void trace_handler_unmap_bufinfo(void); +void trace_handler_map_buffer(int index, uintptr_t addr, unsigned long size); +void trace_handler_unmap_buffer(int index); +void trace_set_timebases(uint64_t tsc, uint64_t ns); #endif /* KERNEL_PRIVATE */ @@ -446,7 +512,7 @@ typedef struct { #if !defined(__LP64__) #define KDBG_TIMESTAMP_MASK 0x00ffffffffffffffULL -#define KDBG_CPU_MASK 0x0f00000000000000ULL +#define KDBG_CPU_MASK 0xff00000000000000ULL #define KDBG_CPU_SHIFT 56 static inline void kdbg_set_cpu(kd_buf *kp, int cpu) @@ -460,9 +526,9 @@ kdbg_get_cpu(kd_buf *kp) return (int) (((kp)->timestamp & KDBG_CPU_MASK) >> KDBG_CPU_SHIFT); } static inline void -kdbg_set_timestamp(kd_buf *kp, uint64_t time) +kdbg_set_timestamp(kd_buf *kp, uint64_t thetime) { - kp->timestamp = time & KDBG_TIMESTAMP_MASK; + kp->timestamp = thetime & KDBG_TIMESTAMP_MASK; } static inline uint64_t kdbg_get_timestamp(kd_buf *kp) @@ -470,9 +536,9 @@ kdbg_get_timestamp(kd_buf *kp) return kp->timestamp & KDBG_TIMESTAMP_MASK; } static inline void -kdbg_set_timestamp_and_cpu(kd_buf *kp, uint64_t time, int cpu) +kdbg_set_timestamp_and_cpu(kd_buf *kp, uint64_t thetime, int cpu) { - kp->timestamp = (time & KDBG_TIMESTAMP_MASK) | + kp->timestamp = (thetime & KDBG_TIMESTAMP_MASK) | (((uint64_t) cpu) << KDBG_CPU_SHIFT); } #else @@ -488,9 +554,9 @@ kdbg_get_cpu(kd_buf *kp) return kp->cpuid; } static inline void -kdbg_set_timestamp(kd_buf *kp, uint64_t time) +kdbg_set_timestamp(kd_buf *kp, uint64_t thetime) { - kp->timestamp = time; + kp->timestamp = thetime; } static inline uint64_t kdbg_get_timestamp(kd_buf *kp) @@ -498,9 +564,9 @@ kdbg_get_timestamp(kd_buf *kp) return kp->timestamp; } static inline void -kdbg_set_timestamp_and_cpu(kd_buf *kp, uint64_t time, int cpu) +kdbg_set_timestamp_and_cpu(kd_buf *kp, uint64_t thetime, int cpu) { - kdbg_set_timestamp(kp, time); + kdbg_set_timestamp(kp, thetime); kdbg_set_cpu(kp, cpu); } #endif @@ -541,6 +607,18 @@ typedef struct { char command[20]; } kd_threadmap; + +typedef struct { + int version_no; + int thread_count; + uint64_t TOD_secs; + uint32_t TOD_usecs; +} RAW_header; + +#define RAW_VERSION0 0x55aa0000 +#define RAW_VERSION1 0x55aa0101 + + #define KDBG_CLASSTYPE 0x10000 #define KDBG_SUBCLSTYPE 0x20000 #define KDBG_RANGETYPE 0x40000 diff --git a/bsd/sys/kern_control.h b/bsd/sys/kern_control.h index 0e83895b4..4a5a411d6 100644 --- a/bsd/sys/kern_control.h +++ b/bsd/sys/kern_control.h @@ -252,6 +252,7 @@ typedef errno_t (*ctl_disconnect_func)(kern_ctl_ref kctlref, u_int32_t unit, voi @param unitinfo The user-defined private data initialized by the ctl_connect_func callback. @param m The data sent by the client to the kernel control in an + mbuf chain. Your function is responsible for releasing the mbuf chain. 
@param flags The flags specified by the client when calling send/sendto/sendmsg (MSG_OOB/MSG_DONTROUTE). @@ -433,6 +434,11 @@ ctl_enqueuembuf(kern_ctl_ref kctlref, u_int32_t unit, mbuf_t m, u_int32_t flags) errno_t ctl_getenqueuespace(kern_ctl_ref kctlref, u_int32_t unit, size_t *space); +#ifdef KERNEL_PRIVATE +u_int32_t ctl_id_by_name(const char *name); +errno_t ctl_name_by_id(u_int32_t id, char *out_name, size_t maxsize); +#endif /* KERNEL_PRIVATE */ + __END_DECLS #endif /* KERNEL */ diff --git a/bsd/sys/kern_memorystatus.h b/bsd/sys/kern_memorystatus.h index b89337521..4a05a490f 100644 --- a/bsd/sys/kern_memorystatus.h +++ b/bsd/sys/kern_memorystatus.h @@ -52,7 +52,8 @@ enum { kMemoryStatusLevelNote = 1, - kMemoryStatusSnapshotNote = 2 + kMemoryStatusSnapshotNote = 2, + kMemoryStatusHibernationNote = 3 }; enum { @@ -109,19 +110,48 @@ typedef struct jetsam_snapshot { jetsam_snapshot_entry_t entries[1]; } jetsam_snapshot_t; +typedef struct jetsam_hibernation_entry { + uint32_t pid; + uint32_t flags; + uint32_t pages; +} jetsam_hibernation_entry_t; + +#endif /* !MACH_KERNEL_PRIVATE */ + enum { - kJetsamFlagsFrontmost = (1 << 0), - kJetsamFlagsKilled = (1 << 1), - kJetsamFlagsKilledHiwat = (1 << 2) + kJetsamFlagsFrontmost = (1 << 0), + kJetsamFlagsKilled = (1 << 1), + kJetsamFlagsKilledHiwat = (1 << 2), + kJetsamFlagsHibernated = (1 << 3), + kJetsamFlagsKilledVnodes = (1 << 4), + kJetsamFlagsKilledSwap = (1 << 5), + kJetsamFlagsThawed = (1 << 6), + kJetsamFlagsKilledVM = (1 << 7), + kJetsamFlagsSuspForDiagnosis = (1 << 8) }; -#endif /* !MACH_KERNEL_PRIVATE */ #ifdef KERNEL extern void kern_memorystatus_init(void) __attribute__((section("__TEXT, initcode"))); -extern int jetsam_kill_top_proc(void); +extern int jetsam_kill_top_proc(boolean_t any, uint32_t reason); extern int kern_memorystatus_wakeup; extern int kern_memorystatus_level; +extern unsigned int kern_memorystatus_delta; + +#ifdef CONFIG_FREEZE +extern void kern_hibernation_init(void) __attribute__((section("__TEXT, initcode"))); +extern int kern_hibernation_wakeup; + +void kern_hibernation_on_pid_suspend(int pid); +void kern_hibernation_on_pid_resume(int pid, task_t task); +void kern_hibernation_on_pid_hibernate(int pid); +#endif + +#if CONFIG_EMBEDDED +#define VM_CHECK_MEMORYSTATUS do { vm_check_memorystatus(); } while(0) +#else /*CONFIG_EMBEDDED*/ +#define VM_CHECK_MEMORYSTATUS do {} while(0) +#endif #endif /* KERNEL */ #endif /* SYS_KERN_MEMORYSTATUS_H */ diff --git a/bsd/sys/kpi_mbuf.h b/bsd/sys/kpi_mbuf.h index 00134b226..24239b9f4 100644 --- a/bsd/sys/kpi_mbuf.h +++ b/bsd/sys/kpi_mbuf.h @@ -55,6 +55,7 @@ @constant MBUF_EXT Indicates this mbuf has external data. @constant MBUF_PKTHDR Indicates this mbuf has a packet header. @constant MBUF_EOR Indicates this mbuf is the end of a record. + @constant MBUF_LOOP Indicates this packet is looped back. @constant MBUF_BCAST Indicates this packet will be sent or was received as a brodcast. 
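A hedged sketch against the mbuf flags documented in this block, including the new MBUF_LOOP bit; the classifier name is invented:

#include <sys/kpi_mbuf.h>

/* Classify a packet by its link-level mbuf flags. */
static const char *
packet_kind(mbuf_t m)
{
	mbuf_flags_t flags = mbuf_flags(m);

	if (flags & MBUF_LOOP)
		return "looped back";
	if (flags & MBUF_BCAST)
		return "broadcast";
	if (flags & MBUF_MCAST)
		return "multicast";
	return "unicast";
}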
@constant MBUF_MCAST Indicates this packet will be sent or was @@ -72,13 +73,15 @@ enum { MBUF_EXT = 0x0001, /* has associated external storage */ MBUF_PKTHDR = 0x0002, /* start of record */ MBUF_EOR = 0x0004, /* end of record */ + MBUF_LOOP = 0x0040, /* packet is looped back */ MBUF_BCAST = 0x0100, /* send/received as link-level broadcast */ MBUF_MCAST = 0x0200, /* send/received as link-level multicast */ MBUF_FRAG = 0x0400, /* packet is a fragment of a larger packet */ MBUF_FIRSTFRAG = 0x0800, /* packet is first fragment */ MBUF_LASTFRAG = 0x1000, /* packet is last fragment */ - MBUF_PROMISC = 0x2000 /* packet is promiscuous */ + MBUF_PROMISC = 0x2000, /* packet is promiscuous */ + MBUF_HASFCS = 0x4000 /* packet has FCS */ }; typedef u_int32_t mbuf_flags_t; @@ -145,6 +148,10 @@ typedef u_int32_t mbuf_type_t; calculated yet. @constant MBUF_CSUM_REQ_UDP Indicates the UDP checksum has not been calculated yet. + @constant MBUF_CSUM_REQ_TCPIPV6 Indicates the TCP checksum for IPv6 + has not been calculated yet. + @constant MBUF_CSUM_REQ_UDPIPV6 Indicates the UDP checksum for IPv6 + has not been calculated yet. */ enum { MBUF_TSO_IPV4 = 0x100000, @@ -158,7 +165,9 @@ enum { #endif /* KERNEL_PRIVATE */ MBUF_CSUM_REQ_IP = 0x0001, MBUF_CSUM_REQ_TCP = 0x0002, - MBUF_CSUM_REQ_UDP = 0x0004 + MBUF_CSUM_REQ_UDP = 0x0004, + MBUF_CSUM_REQ_TCPIPV6 = 0x0020, + MBUF_CSUM_REQ_UDPIPV6 = 0x0040 }; typedef u_int32_t mbuf_csum_request_flags_t; @@ -178,7 +187,7 @@ typedef u_int32_t mbuf_csum_request_flags_t; hardware should be passed as the second parameter of mbuf_set_csum_performed. The hardware calculated checksum value can be retrieved using the second parameter passed to - mbuf_get_csum_performed. + mbuf_get_csum_performed. This should be done for IPv4 or IPv6. @constant MBUF_CSUM_PSEUDO_HDR If set, this indicates that the checksum value for MBUF_CSUM_DID_DATA includes the pseudo header value. If this is not set, the stack will calculate the pseudo @@ -1183,6 +1192,15 @@ extern u_int32_t mbuf_get_mlen(void); */ extern u_int32_t mbuf_get_mhlen(void); +/*! + @function mbuf_get_minclsize + @discussion This routine returns the minimum number of data bytes + before an external cluster is used. This is equivalent to the + legacy MINCLSIZE macro. + @result The minimum number of bytes before a cluster will be used. + */ +extern u_int32_t mbuf_get_minclsize(void); + /*! @function mbuf_clear_csum_performed @discussion Clears the hardware checksum flags and values. @@ -1330,32 +1348,8 @@ extern void mbuf_tag_free(mbuf_t mbuf, mbuf_tag_id_t module_id, */ extern void mbuf_stats(struct mbuf_stat *stats); -#ifdef KERNEL_PRIVATE -/* - @enum mbuf_priority_t - @abstract Priority of a packet. - @discussion Some mbufs represent packets containing application data. - The priority of the application data is represented by the - mbuf priority, as determined by the system. - @constant MBUF_PRIORITY_NORMAL Indicates the packet contains - normal priority data. - @constant MBUF_PRIORITY_BACKGROUND Indicates the packet contains - background priority data. - */ -typedef enum { - MBUF_PRIORITY_NORMAL = 0, - MBUF_PRIORITY_BACKGROUND = 1 -} mbuf_priority_t; - -/* - @function mbuf_get_priority - @discussion Get the priority value of the packet. - @param mbuf The mbuf to obtain the priority value from. - @result The priority value of the packet. - */ -extern mbuf_priority_t mbuf_get_priority(mbuf_t mbuf); -/* +/*! 
@enum mbuf_traffic_class_t @abstract Traffic class of a packet @discussion Property that represent the category of traffic of a packet. @@ -1367,15 +1361,19 @@ extern mbuf_priority_t mbuf_get_priority(mbuf_t mbuf); */ typedef enum { #ifdef XNU_KERNEL_PRIVATE - MBUF_TC_NONE = -1, + MBUF_TC_UNSPEC = -1, /* Internal: not specified */ #endif MBUF_TC_BE = 0, MBUF_TC_BK = 1, MBUF_TC_VI = 2, MBUF_TC_VO = 3 +#ifdef XNU_KERNEL_PRIVATE + , + MBUF_TC_MAX = 4 /* Internal: traffic class count */ +#endif } mbuf_traffic_class_t; -/* +/*! @function mbuf_get_traffic_class @discussion Get the traffic class of an mbuf packet @param mbuf The mbuf to get the traffic class of. @@ -1383,7 +1381,7 @@ typedef enum { */ extern mbuf_traffic_class_t mbuf_get_traffic_class(mbuf_t mbuf); -/* +/*! @function mbuf_set_traffic_class @discussion Set the traffic class of an mbuf packet. @param mbuf The mbuf to set the traffic class on. @@ -1391,7 +1389,6 @@ extern mbuf_traffic_class_t mbuf_get_traffic_class(mbuf_t mbuf); @result 0 on success, EINVAL if bad paramater is passed */ extern errno_t mbuf_set_traffic_class(mbuf_t mbuf, mbuf_traffic_class_t tc); -#endif /* KERNEL_PRIVATE */ /* IF_QUEUE interaction */ diff --git a/bsd/sys/kpi_socket.h b/bsd/sys/kpi_socket.h index 5e380f5a7..5f2093369 100644 --- a/bsd/sys/kpi_socket.h +++ b/bsd/sys/kpi_socket.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2008-2011 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -56,7 +56,12 @@ struct timeval; Calls to your upcall function are not serialized and may be called concurrently from multiple threads in the kernel. - Your upcall function will be called when: + Your upcall function will be called: + when there is data more than the low water mark for reading, + or when there is space for a write, + or when there is a connection to accept, + or when a socket is connected, + or when a socket is closed or disconnected @param so A reference to the socket that's ready. @param cookie The cookie passed in when the socket was created. @@ -227,11 +232,16 @@ extern errno_t sock_settclassopt(socket_t so, const void* optval, size_t optlen) */ extern errno_t sock_gettclassopt(socket_t so, void* optval, size_t* optlen); +#ifdef XNU_KERNEL_PRIVATE +extern void socket_set_traffic_mgt_flags_locked(socket_t so, u_int32_t flags); +extern void socket_clear_traffic_mgt_flags_locked(socket_t so, u_int32_t flags); +#endif /* XNU_KERNEL_PRIVATE */ #ifdef BSD_KERNEL_PRIVATE extern void socket_set_traffic_mgt_flags(socket_t so, u_int32_t flags); extern void socket_clear_traffic_mgt_flags(socket_t so, u_int32_t flags); +extern errno_t socket_defunct(struct proc *, socket_t so, int); #endif /* BSD_KERNEL_PRIVATE */ -#endif +#endif /* KERNEL_PRIVATE */ /*! @function sock_listen @@ -473,6 +483,22 @@ extern errno_t sock_getaddr(socket_t so, struct sockaddr **psockname, @param sockname The socket name to be freed. */ extern void sock_freeaddr(struct sockaddr *sockname); + +/* + @function sock_setupcall + @discussion Set the notifier function to be called when an event + occurs on the socket. This may be set to NULL to disable + further notifications. Setting the function does not + affect currently notifications about to be sent or being sent. + Note: When this function is used on a socket passed from userspace + it is crucial to call sock_retain() on the socket otherwise a callback + could be dispatched on a closed socket and cause a crash. + @param sock The socket. 
+ +/* + @function sock_setupcall + @discussion Set the notifier function to be called when an event + occurs on the socket. This may be set to NULL to disable + further notifications. Setting the function does not + affect notifications that are pending or currently being sent. + Note: when this function is used on a socket passed from userspace, + it is crucial to call sock_retain() on the socket; otherwise a callback + could be dispatched on a closed socket and cause a crash. + @param sock The socket. + @param callback The notifier function. + @param context A cookie passed directly to the callback. +*/ +extern errno_t sock_setupcall(socket_t sock, sock_upcall callback, void* context); + #endif /* KERNEL_PRIVATE */ __END_DECLS diff --git a/bsd/sys/make_posix_availability.sh b/bsd/sys/make_posix_availability.sh new file mode 100755 index 000000000..5aa58b364 --- /dev/null +++ b/bsd/sys/make_posix_availability.sh @@ -0,0 +1,71 @@ +#! /bin/sh - +# +# Copyright (c) 2010 Apple Inc. All rights reserved. +# +# @APPLE_OSREFERENCE_LICENSE_HEADER_START@ +# +# This file contains Original Code and/or Modifications of Original Code +# as defined in and that are subject to the Apple Public Source License +# Version 2.0 (the 'License'). You may not use this file except in +# compliance with the License. Please obtain a copy of the License at +# http://www.opensource.apple.com/apsl/ and read it before using this +# file. +# +# The Original Code and all software distributed under the License are +# distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER +# EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +# INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. +# Please see the License for the specific language governing rights and +# limitations under the License. +# +# @APPLE_OSREFERENCE_LICENSE_HEADER_END@ +# + +POSIX_VALUES="198808L 199009L 199209L 199309L 199506L 200112L 200809L" + +{ +cat <<EOF +/* Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _CDEFS_H_ +# error "Never use <sys/_posix_availability.h> directly. Use <sys/cdefs.h> instead." +#endif + +EOF + +for value in ${POSIX_VALUES} ; do + echo "#if !defined(_DARWIN_C_SOURCE) && defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= ${value}" + echo "#define ___POSIX_C_DEPRECATED_STARTING_${value} __deprecated" + echo "#else" + echo "#define ___POSIX_C_DEPRECATED_STARTING_${value}" + echo "#endif" + echo +done +} > $1 + diff --git a/bsd/sys/make_symbol_aliasing.sh b/bsd/sys/make_symbol_aliasing.sh new file mode 100755 index 000000000..fa5f0e33c --- /dev/null +++ b/bsd/sys/make_symbol_aliasing.sh @@ -0,0 +1,86 @@ +#!
/bin/bash - +# +# Copyright (c) 2010 Apple Inc. All rights reserved. +# +# @APPLE_OSREFERENCE_LICENSE_HEADER_START@ +# +# This file contains Original Code and/or Modifications of Original Code +# as defined in and that are subject to the Apple Public Source License +# Version 2.0 (the 'License'). You may not use this file except in +# compliance with the License. Please obtain a copy of the License at +# http://www.opensource.apple.com/apsl/ and read it before using this +# file. +# +# The Original Code and all software distributed under the License are +# distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER +# EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, +# INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. +# Please see the License for the specific language governing rights and +# limitations under the License. +# +# @APPLE_OSREFERENCE_LICENSE_HEADER_END@ +# + +{ +cat <<EOF +/* Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _CDEFS_H_ +# error "Never use <sys/_symbol_aliasing.h> directly. Use <sys/cdefs.h> instead." 
+#endif + +EOF + +for ver in $(${SDKROOT}/usr/local/libexec/availability.pl --ios) ; do + ver_major=${ver%.*} + ver_minor=${ver#*.} + value=$(printf "%d%02d00" ${ver_major} ${ver_minor}) + str=$(printf "__IPHONE_%d_%d" ${ver_major} ${ver_minor}) + echo "#if defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ >= ${value}" + echo "#define __DARWIN_ALIAS_STARTING_IPHONE_${str}(x) x" + echo "#else" + echo "#define __DARWIN_ALIAS_STARTING_IPHONE_${str}(x)" + echo "#endif" + echo "" +done + +for ver in $(${SDKROOT}/usr/local/libexec/availability.pl --macosx) ; do + ver_major=${ver%.*} + ver_minor=${ver#*.} + value=$(printf "%d%d0" ${ver_major} ${ver_minor}) + str=$(printf "__MAC_%d_%d" ${ver_major} ${ver_minor}) + echo "#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ >= ${value}" + echo "#define __DARWIN_ALIAS_STARTING_MAC_${str}(x) x" + echo "#else" + echo "#define __DARWIN_ALIAS_STARTING_MAC_${str}(x)" + echo "#endif" + echo "" +done +} > $1 + diff --git a/bsd/sys/malloc.h b/bsd/sys/malloc.h index dcbaaded7..4e8688735 100644 --- a/bsd/sys/malloc.h +++ b/bsd/sys/malloc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -179,7 +179,7 @@ #define M_IP6NDP 86 /* IPv6 Neighbour Discovery*/ #define M_IP6OPT 87 /* IPv6 options management */ #define M_IP6MISC 88 /* IPv6 misc. memory */ -#define M_TSEGQ 89 /* TCP segment queue entry */ +#define M_TSEGQ 89 /* TCP segment queue entry, unused */ #define M_IGMP 90 #define M_JNL_JNL 91 /* Journaling: "struct journal" */ #define M_JNL_TR 92 /* Journaling: "struct transaction" */ @@ -204,8 +204,13 @@ #if HFS_COMPRESSION #define M_DECMPFS_CNODE 109 /* decmpfs cnode structures */ #endif /* HFS_COMPRESSION */ +#define M_INMFILTER 110 /* IPv4 multicast PCB-layer source filter */ +#define M_IPMSOURCE 111 /* IPv4 multicast IGMP-layer source filter */ +#define M_IN6MFILTER 112 /* IPv6 multicast PCB-layer source filter */ +#define M_IP6MOPTS 113 /* IPv6 multicast options */ +#define M_IP6MSOURCE 114 /* IPv6 multicast MLD-layer source filter */ -#define M_LAST 110 /* Must be last type + 1 */ +#define M_LAST 115 /* Must be last type + 1 */ #else /* BSD_KERNEL_PRIVATE */ @@ -253,6 +258,9 @@ extern struct kmemstats kmemstats[]; #define FREE(addr, type) \ _FREE((void *)addr, type) +#define REALLOC(space, cast, addr, size, type, flags) \ + (space) = (cast)_REALLOC(addr, size, type, flags) + #define MALLOC_ZONE(space, cast, size, type, flags) \ (space) = (cast)_MALLOC_ZONE(size, type, flags) @@ -268,6 +276,12 @@ extern void _FREE( void *addr, int type); +extern void *_REALLOC( + void *addr, + size_t size, + int type, + int flags); + extern void *_MALLOC_ZONE( size_t size, int type, diff --git a/bsd/sys/mbuf.h b/bsd/sys/mbuf.h index 247d7bb71..f0d45c565 100644 --- a/bsd/sys/mbuf.h +++ b/bsd/sys/mbuf.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 1999-2010 Apple Inc. All rights reserved. + * Copyright (c) 1999-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). 
You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,12 +22,12 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* Copyright (c) 1998, 1999 Apple Computer, Inc. All Rights Reserved */ /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* +/* * Mach Operating System * Copyright (c) 1987 Carnegie-Mellon University * All rights reserved. The CMU software License Agreement specifies @@ -68,11 +68,6 @@ * SUCH DAMAGE. * * @(#)mbuf.h 8.3 (Berkeley) 1/21/94 - ********************************************************************** - * HISTORY - * 20-May-95 Mac Gillon (mgillon) at NeXT - * New version based on 4.4 - * Purged old history */ /* * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce @@ -87,7 +82,7 @@ #include <sys/cdefs.h> #include <sys/appleapiopts.h> -#ifdef KERNEL_PRIVATE +#ifdef XNU_KERNEL_PRIVATE #include <sys/lock.h> #include <sys/queue.h> @@ -99,18 +94,10 @@ /* * Mbufs are of a single size, MSIZE (machine/param.h), which * includes overhead. An mbuf may add a single "mbuf cluster" of size - * MCLBYTES (also in machine/param.h), which has no additional overhead - * and is used instead of the internal data area; this is done when - * at least MINCLSIZE of data must be stored. - */ - -/* - * These macros are mapped to the appropriate KPIs, so that private code - * can be simply recompiled in order to be forward-compatible with future - * changes toward the struture sizes. + * MCLBYTES/MBIGCLBYTES/M16KCLBYTES (also in machine/param.h), which has + * no additional overhead and is used instead of the internal data area; + * this is done when at least MINCLSIZE of data must be stored. */ -#define MLEN mbuf_get_mlen() /* normal data len */ -#define MHLEN mbuf_get_mhlen() /* data len w/pkthdr */ /* * The following _MLEN and _MHLEN macros are private to xnu. 
Private code @@ -120,24 +107,31 @@ #define _MLEN (MSIZE - sizeof(struct m_hdr)) /* normal data len */ #define _MHLEN (_MLEN - sizeof(struct pkthdr)) /* data len w/pkthdr */ -#define MINCLSIZE (MHLEN + MLEN) /* smallest amount to put in cluster */ -#define M_MAXCOMPRESS (MHLEN / 2) /* max amount to copy for compression */ +#define NMBPBGSHIFT (MBIGCLSHIFT - MSIZESHIFT) +#define NMBPBG (1 << NMBPBGSHIFT) /* # of mbufs per big cl */ -#define NMBPCL (sizeof(union mcluster) / sizeof(struct mbuf)) +#define NCLPBGSHIFT (MBIGCLSHIFT - MCLSHIFT) +#define NCLPBG (1 << NCLPBGSHIFT) /* # of cl per big cl */ + +#define NMBPCLSHIFT (NMBPBGSHIFT - NCLPBGSHIFT) +#define NMBPCL (1 << NMBPCLSHIFT) /* # of mbufs per cl */ + +#define NCLPJCLSHIFT ((M16KCLSHIFT - MBIGCLSHIFT) + NCLPBGSHIFT) +#define NCLPJCL (1 << NCLPJCLSHIFT) /* # of cl per jumbo cl */ /* * Macros for type conversion * mtod(m,t) - convert mbuf pointer to data pointer of correct type * dtom(x) - convert data pointer within mbuf to mbuf pointer (XXX) */ -#define mtod(m,t) ((t)m_mtod(m)) -#define dtom(x) m_dtom(x) +#define mtod(m, t) ((t)m_mtod(m)) +#define dtom(x) m_dtom(x) /* header at beginning of each mbuf: */ struct m_hdr { struct mbuf *mh_next; /* next buffer in chain */ struct mbuf *mh_nextpkt; /* next chain in queue/record */ - int32_t mh_len; /* amount of data in this mbuf */ + int32_t mh_len; /* amount of data in this mbuf */ caddr_t mh_data; /* location of data */ short mh_type; /* type of data in this mbuf */ short mh_flags; /* flags; see below */ @@ -147,10 +141,29 @@ struct m_hdr { * Packet tag structure (see below for details). */ struct m_tag { + u_int64_t m_tag_cookie; /* Error checking */ +#ifndef __LP64__ + u_int32_t pad; /* For structure alignment */ +#endif /* !__LP64__ */ SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */ - u_int16_t m_tag_type; /* Module specific type */ - u_int16_t m_tag_len; /* Length of data */ - u_int32_t m_tag_id; /* Module ID */ + u_int16_t m_tag_type; /* Module specific type */ + u_int16_t m_tag_len; /* Length of data */ + u_int32_t m_tag_id; /* Module ID */ +}; + +#ifdef __LP64__ +#define M_TAG_ALIGN(len) \ + P2ROUNDUP(len, sizeof (u_int64_t)) + sizeof (struct m_tag) +#else +#define M_TAG_ALIGN(len) \ + P2ROUNDUP(len, sizeof (u_int32_t)) + sizeof (struct m_tag) +#endif /* !__LP64__ */ + +#define M_TAG_VALID_PATTERN 0xfeedfacefeedfaceULL +#define M_TAG_FREE_PATTERN 0xdeadbeefdeadbeefULL + +struct m_taghdr { + u_int64_t refcnt; /* Number of tags in this mbuf */ }; /* record/packet header in first mbuf of chain; valid if M_PKTHDR set */ @@ -160,14 +173,14 @@ struct pkthdr { /* variables for ip and tcp reassembly */ void *header; /* pointer to packet header */ - /* variables for hardware checksum */ - /* Note: csum_flags is used for hardware checksum and VLAN */ - int csum_flags; /* flags regarding checksum */ - int csum_data; /* data field used by csum routines */ + /* variables for hardware checksum */ + /* Note: csum_flags is used for hardware checksum and VLAN */ + int csum_flags; /* flags regarding checksum */ + int csum_data; /* data field used by csum routines */ u_int tso_segsz; /* TSO segment size (actual MSS) */ u_short vlan_tag; /* VLAN tag, host byte order */ u_short socket_id; /* socket id */ - SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */ + SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */ #if PF_PKTHDR /* * Be careful; {en,dis}abling PF_PKTHDR will require xnu recompile; @@ -176,16 +189,16 @@ struct pkthdr { */ struct pf_mtag pf_mtag; #endif /* 
PF_PKTHDR */ -#if PKT_PRIORITY u_int32_t prio; /* packet priority */ -#endif /* PKT_PRIORITY */ + u_short vt_nrecs; /* # of IGMPv3 records in this chain */ + u_short _pad; }; /* description of external storage mapped into mbuf, valid if M_EXT set */ struct m_ext { caddr_t ext_buf; /* start of buffer */ - void (*ext_free)(caddr_t , u_int, caddr_t); /* free routine if not the usual */ + void (*ext_free)(caddr_t, u_int, caddr_t); /* free routine if not the usual */ u_int ext_size; /* size of buffer, for ext_free */ caddr_t ext_arg; /* additional ext_free argument */ struct ext_refsq { /* references held */ @@ -226,58 +239,71 @@ struct mbuf { #define m_pktdat M_dat.MH.MH_dat.MH_databuf #define m_dat M_dat.M_databuf -/* mbuf flags */ +/* mbuf flags (private) */ #define M_EXT 0x0001 /* has associated external storage */ #define M_PKTHDR 0x0002 /* start of record */ #define M_EOR 0x0004 /* end of record */ #define M_PROTO1 0x0008 /* protocol-specific */ #define M_PROTO2 0x0010 /* protocol-specific */ #define M_PROTO3 0x0020 /* protocol-specific */ -#define M_PROTO4 0x0040 /* protocol-specific */ +#define M_LOOP 0x0040 /* packet is looped back */ #define M_PROTO5 0x0080 /* protocol-specific */ -/* mbuf pkthdr flags, also in m_flags */ +/* mbuf pkthdr flags, also in m_flags (private) */ #define M_BCAST 0x0100 /* send/received as link-level broadcast */ #define M_MCAST 0x0200 /* send/received as link-level multicast */ #define M_FRAG 0x0400 /* packet is a fragment of a larger packet */ #define M_FIRSTFRAG 0x0800 /* packet is first fragment */ #define M_LASTFRAG 0x1000 /* packet is last fragment */ #define M_PROMISC 0x2000 /* packet is promiscuous (shouldn't go to stack) */ +#define M_HASFCS 0x4000 /* packet has FCS */ +#define M_TAGHDR 0x8000 /* m_tag hdr structure at top of mbuf data */ + +/* + * Flags to purge when crossing layers. + */ +#define M_PROTOFLAGS \ + (M_PROTO1|M_PROTO2|M_PROTO3|M_PROTO5) /* flags copied when copying m_pkthdr */ -#define M_COPYFLAGS (M_PKTHDR|M_EOR|M_PROTO1|M_PROTO2|M_PROTO3 | \ - M_PROTO4|M_PROTO5|M_BCAST|M_MCAST|M_FRAG | \ - M_FIRSTFRAG|M_LASTFRAG|M_PROMISC) - -/* flags indicating hw checksum support and sw checksum requirements [freebsd4.1]*/ -#define CSUM_IP 0x0001 /* will csum IP */ -#define CSUM_TCP 0x0002 /* will csum TCP */ -#define CSUM_UDP 0x0004 /* will csum UDP */ -#define CSUM_IP_FRAGS 0x0008 /* will csum IP fragments */ -#define CSUM_FRAGMENT 0x0010 /* will do IP fragmentation */ - -#define CSUM_IP_CHECKED 0x0100 /* did csum IP */ -#define CSUM_IP_VALID 0x0200 /* ... the csum is valid */ -#define CSUM_DATA_VALID 0x0400 /* csum_data field is valid */ -#define CSUM_PSEUDO_HDR 0x0800 /* csum_data has pseudo hdr */ -#define CSUM_TCP_SUM16 0x1000 /* simple TCP Sum16 computation */ - -#define CSUM_DELAY_DATA (CSUM_TCP | CSUM_UDP) -#define CSUM_DELAY_IP (CSUM_IP) /* XXX add ipv6 here too? 
*/ +#define M_COPYFLAGS \ + (M_PKTHDR|M_EOR|M_PROTO1|M_PROTO2|M_PROTO3 | \ + M_LOOP|M_PROTO5|M_BCAST|M_MCAST|M_FRAG | \ + M_FIRSTFRAG|M_LASTFRAG|M_PROMISC|M_HASFCS) + +/* flags indicating hw checksum support and sw checksum requirements [freebsd4.1] */ +#define CSUM_IP 0x0001 /* will csum IP */ +#define CSUM_TCP 0x0002 /* will csum TCP */ +#define CSUM_UDP 0x0004 /* will csum UDP */ +#define CSUM_IP_FRAGS 0x0008 /* will csum IP fragments */ +#define CSUM_FRAGMENT 0x0010 /* will do IP fragmentation */ +#define CSUM_TCPIPV6 0x0020 /* will csum TCP for IPv6 */ +#define CSUM_UDPIPV6 0x0040 /* will csum UDP for IPv6 */ +#define CSUM_FRAGMENT_IPV6 0x0080 /* will do IPv6 fragmentation */ + +#define CSUM_IP_CHECKED 0x0100 /* did csum IP */ +#define CSUM_IP_VALID 0x0200 /* ... the csum is valid */ +#define CSUM_DATA_VALID 0x0400 /* csum_data field is valid */ +#define CSUM_PSEUDO_HDR 0x0800 /* csum_data has pseudo hdr */ +#define CSUM_TCP_SUM16 0x1000 /* simple TCP Sum16 computation */ + +#define CSUM_DELAY_DATA (CSUM_TCP | CSUM_UDP) +#define CSUM_DELAY_IP (CSUM_IP) /* IPv4 only: no IPv6 IP cksum */ +#define CSUM_DELAY_IPV6_DATA (CSUM_TCPIPV6 | CSUM_UDPIPV6) +#define CSUM_DATA_IPV6_VALID CSUM_DATA_VALID /* csum_data field is valid */ /* * Note: see also IF_HWASSIST_CSUM defined in <net/if_var.h> */ /* bottom 16 bits reserved for hardware checksum */ -#define CSUM_CHECKSUM_MASK 0xffff +#define CSUM_CHECKSUM_MASK 0xffff /* VLAN tag present */ -#define CSUM_VLAN_TAG_VALID 0x10000 /* vlan_tag field is valid */ +#define CSUM_VLAN_TAG_VALID 0x10000 /* vlan_tag field is valid */ /* TCP Segment Offloading requested on this mbuf */ -#define CSUM_TSO_IPV4 0x100000 /* This mbuf needs to be segmented by the NIC */ -#define CSUM_TSO_IPV6 0x200000 /* This mbuf needs to be segmented by the NIC */ -#endif /* KERNEL_PRIVATE */ - +#define CSUM_TSO_IPV4 0x100000 /* This mbuf needs to be segmented by the NIC */ +#define CSUM_TSO_IPV6 0x200000 /* This mbuf needs to be segmented by the NIC */ +#endif /* XNU_KERNEL_PRIVATE */ /* mbuf types */ #define MT_FREE 0 /* should be on free list */ @@ -293,20 +319,12 @@ struct mbuf { #define MT_FTABLE 11 /* fragment reassembly header */ #define MT_RIGHTS 12 /* access rights */ #define MT_IFADDR 13 /* interface address */ -#define MT_CONTROL 14 /* extra-data protocol message */ -#define MT_OOBDATA 15 /* expedited data */ -#define MT_TAG 16 /* volatile metadata associated to pkts */ -#define MT_MAX 32 /* enough? */ - -#ifdef KERNEL_PRIVATE - -/* flags to m_get/MGET */ -/* Need to include malloc.h to get right options for malloc */ -#include <sys/malloc.h> - -#define M_DONTWAIT M_NOWAIT -#define M_WAIT M_WAITOK +#define MT_CONTROL 14 /* extra-data protocol message */ +#define MT_OOBDATA 15 /* expedited data */ +#define MT_TAG 16 /* volatile metadata associated to pkts */ +#define MT_MAX 32 /* enough? 
*/ +#ifdef XNU_KERNEL_PRIVATE /* * mbuf allocation/deallocation macros: * @@ -319,9 +337,9 @@ struct mbuf { */ #if 1 -#define MCHECK(m) m_mcheck(m) +#define MCHECK(m) m_mcheck(m) #else -#define MCHECK(m) +#define MCHECK(m) #endif #define MGET(m, how, type) ((m) = m_get((how), (type))) @@ -347,27 +365,27 @@ union mcluster { #define MCLALLOC(p, how) ((p) = m_mclalloc(how)) -#define MCLFREE(p) m_mclfree(p) +#define MCLFREE(p) m_mclfree(p) -#define MCLGET(m, how) ((m) = m_mclget(m, how)) +#define MCLGET(m, how) ((m) = m_mclget(m, how)) /* * Mbuf big cluster */ - union mbigcluster { union mbigcluster *mbc_next; - char mbc_buf[NBPG]; + char mbc_buf[MBIGCLBYTES]; }; -#define M16KCLBYTES (16 * 1024) - +/* + * Mbuf jumbo cluster + */ union m16kcluster { union m16kcluster *m16kcl_next; char m16kcl_buf[M16KCLBYTES]; }; -#define MCLHASREFERENCE(m) m_mclhasreference(m) +#define MCLHASREFERENCE(m) m_mclhasreference(m) /* * MFREE(struct mbuf *m, struct mbuf *n) @@ -388,14 +406,19 @@ union m16kcluster { * Set the m_data pointer of a newly-allocated mbuf (m_get/MGET) to place * an object of the specified size at the end of the mbuf, longword aligned. */ -#define M_ALIGN(m, len) \ - { (m)->m_data += (MLEN - (len)) &~ (sizeof(long) - 1); } +#define M_ALIGN(m, len) \ +do { \ + (m)->m_data += (MLEN - (len)) &~ (sizeof (long) - 1); \ +} while (0) + /* * As above, for mbufs allocated with m_gethdr/MGETHDR * or initialized by M_COPY_PKTHDR. */ -#define MH_ALIGN(m, len) \ - { (m)->m_data += (MHLEN - (len)) &~ (sizeof(long) - 1); } +#define MH_ALIGN(m, len) \ +do { \ + (m)->m_data += (MHLEN - (len)) &~ (sizeof (long) - 1); \ +} while (0) /* * Compute the amount of space available @@ -417,21 +440,84 @@ union m16kcluster { * If how is M_DONTWAIT and allocation fails, the original mbuf chain * is freed and m is set to NULL. */ -#define M_PREPEND(m, plen, how) ((m) = m_prepend_2((m), (plen), (how))) +#define M_PREPEND(m, plen, how) ((m) = m_prepend_2((m), (plen), (how))) /* change mbuf to new type */ -#define MCHTYPE(m, t) m_mchtype(m, t) - -/* length to m_copy to copy all */ -#define M_COPYALL 1000000000 +#define MCHTYPE(m, t) m_mchtype(m, t) /* compatibility with 4.3 */ -#define m_copy(m, o, l) m_copym((m), (o), (l), M_DONTWAIT) +#define m_copy(m, o, l) m_copym((m), (o), (l), M_DONTWAIT) #define MBSHIFT 20 /* 1MB */ +#define MBSIZE (1 << MBSHIFT) #define GBSHIFT 30 /* 1GB */ +#define GBSIZE (1 << GBSHIFT) -#endif /* KERNEL_PRIVATE */ +/* + * M_STRUCT_GET ensures that an intermediate protocol header (from "off" to + * "len") is located in a single mbuf, in a contiguous memory region. + * The pointer to the region is returned in the pointer variable "val", + * with type "typ". + * + * M_STRUCT_GET0 does the same, except that it aligns the structure at the + * very top of the mbuf. GET0 is more likely to incur a memory copy than GET.
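+ *
+ * For example (a sketch; the header type and offset are illustrative),
+ * a protocol handler can fetch a contiguous IPv6 header with:
+ *
+ *     struct ip6_hdr *ip6;
+ *
+ *     M_STRUCT_GET(ip6, struct ip6_hdr *, m, off, sizeof (*ip6));
+ *     if (ip6 == NULL)
+ *             return (ENOBUFS);
+ *
+ * On failure both "val" and "m" are set to NULL, since m_pulldown
+ * frees the chain when it fails.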
+ */ +#define M_STRUCT_GET(val, typ, m, off, len) \ +do { \ + struct mbuf *t; \ + int tmp; \ + \ + if ((m)->m_len >= (off) + (len)) { \ + (val) = (typ)(mtod((m), caddr_t) + (off)); \ + } else { \ + t = m_pulldown((m), (off), (len), &tmp); \ + if (t != NULL) { \ + if (t->m_len < tmp + (len)) \ + panic("m_pulldown malfunction"); \ + (val) = (typ)(mtod(t, caddr_t) + tmp); \ + } else { \ + (val) = (typ)NULL; \ + (m) = NULL; \ + } \ + } \ +} while (0) + +#define M_STRUCT_GET0(val, typ, m, off, len) \ +do { \ + struct mbuf *t; \ + \ + if ((off) == 0) { \ + (val) = (typ)mtod(m, caddr_t); \ + } else { \ + t = m_pulldown((m), (off), (len), NULL); \ + if (t != NULL) { \ + if (t->m_len < (len)) \ + panic("m_pulldown malfunction"); \ + (val) = (typ)mtod(t, caddr_t); \ + } else { \ + (val) = (typ)NULL; \ + (m) = NULL; \ + } \ + } \ +} while (0) + +#define MBUF_INPUT_CHECK(m, rcvif) \ +do { \ + if (!(m->m_flags & MBUF_PKTHDR) || \ + m->m_len < 0 || \ + m->m_len > ((njcl > 0) ? njclbytes : MBIGCLBYTES) || \ + m->m_type == MT_FREE || \ + ((m->m_flags & M_EXT) != 0 && m->m_ext.ext_buf == NULL)) { \ + panic("Failed mbuf validity check: mbuf %p len %d " \ + "type %d flags 0x%x data %p rcvif %s%d ifflags 0x%x", \ + m, m->m_len, m->m_type, m->m_flags, \ + ((m->m_flags & M_EXT) ? m->m_ext.ext_buf : m->m_data), \ + rcvif->if_name, rcvif->if_unit, \ + (rcvif->if_flags & 0xffff)); \ + } \ +} while (0) + +#endif /* XNU_KERNEL_PRIVATE */ /* * Mbuf statistics (legacy). @@ -481,7 +567,7 @@ struct ombstat { */ #define MAX_MBUF_CNAME 15 -#if defined(KERNEL_PRIVATE) +#if defined(XNU_KERNEL_PRIVATE) /* For backwards compatibility with 32-bit userland process */ struct omb_class_stat { char mbcl_cname[MAX_MBUF_CNAME + 1]; /* class name */ @@ -506,7 +592,7 @@ struct omb_class_stat { u_int32_t mbcl_mc_nwretry_cnt; /* # of no-wait retry attempts */ u_int64_t mbcl_reserved[4]; /* for future use */ } __attribute__((__packed__)); -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ typedef struct mb_class_stat { char mbcl_cname[MAX_MBUF_CNAME + 1]; /* class name */ @@ -540,13 +626,13 @@ typedef struct mb_class_stat { #define MCS_PURGING 2 /* cache is being purged */ #define MCS_OFFLINE 3 /* cache is offline (resizing) */ -#if defined(KERNEL_PRIVATE) +#if defined(XNU_KERNEL_PRIVATE) /* For backwards compatibility with 32-bit userland process */ struct omb_stat { u_int32_t mbs_cnt; /* number of classes */ struct omb_class_stat mbs_class[1]; /* class array */ } __attribute__((__packed__)); -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ typedef struct mb_stat { u_int32_t mbs_cnt; /* number of classes */ @@ -556,120 +642,199 @@ typedef struct mb_stat { mb_class_stat_t mbs_class[1]; /* class array */ } mb_stat_t; +#ifdef PRIVATE +#define MLEAK_STACK_DEPTH 16 /* Max PC stack depth */ + +typedef struct mleak_trace_stat { + u_int64_t mltr_collisions; + u_int64_t mltr_hitcount; + u_int64_t mltr_allocs; + u_int64_t mltr_depth; + u_int64_t mltr_addr[MLEAK_STACK_DEPTH]; +} mleak_trace_stat_t; + +typedef struct mleak_stat { + u_int32_t ml_isaddr64; /* 64-bit KVA? 
*/ + u_int32_t ml_cnt; /* number of traces */ + mleak_trace_stat_t ml_trace[1]; /* trace array */ +} mleak_stat_t; + +struct mleak_table { + u_int32_t mleak_capture; /* sampling capture counter */ + u_int32_t mleak_sample_factor; /* sample factor */ + + /* Times two active records want to occupy the same spot */ + u_int64_t alloc_collisions; + u_int64_t trace_collisions; + + /* Times new record lands on spot previously occupied by freed alloc */ + u_int64_t alloc_overwrites; + u_int64_t trace_overwrites; + + /* Times a new alloc or trace is put into the hash table */ + u_int64_t alloc_recorded; + u_int64_t trace_recorded; + + /* Total number of outstanding allocs */ + u_int64_t outstanding_allocs; + + /* Times mleak_log returned false because couldn't acquire the lock */ + u_int64_t total_conflicts; +}; +#endif /* PRIVATE */ + #ifdef KERNEL_PRIVATE +__BEGIN_DECLS -#ifdef KERNEL -extern union mcluster *mbutl; /* virtual address of mclusters */ -extern union mcluster *embutl; /* ending virtual address of mclusters */ -extern struct mbstat mbstat; /* statistics */ -extern unsigned int nmbclusters; /* number of mapped clusters */ -extern int njcl; /* # of clusters for jumbo sizes */ -extern int njclbytes; /* size of a jumbo cluster */ -extern int max_linkhdr; /* largest link-level header */ -extern int max_protohdr; /* largest protocol header */ -extern int max_hdr; /* largest link+protocol header */ -extern int max_datalen; /* MHLEN - max_hdr */ +/* + * Exported (private) + */ + +extern struct mbstat mbstat; /* statistics */ + +__END_DECLS +#endif /* KERNEL_PRIVATE */ +#ifdef XNU_KERNEL_PRIVATE __BEGIN_DECLS -/* Not exported */ -__private_extern__ unsigned int mbuf_default_ncl(int, uint64_t); + +/* + * Not exported (xnu private) + */ + +/* flags to m_get/MGET */ /* Need to include malloc.h to get right options for malloc */ +#include <sys/malloc.h> + +struct mbuf; + +/* length to m_copy to copy all */ +#define M_COPYALL 1000000000 + +#define M_DONTWAIT M_NOWAIT +#define M_WAIT M_WAITOK + +/* + * These macros are mapped to the appropriate KPIs, so that private code + * can be simply recompiled in order to be forward-compatible with future + * changes to the structure sizes.
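+ *
+ * For example (sketch), an existing private-code check such as
+ *
+ *     if (m->m_pkthdr.len <= MHLEN)
+ *             m = m_pullup(m, m->m_pkthdr.len);
+ *
+ * now expands into a call to mbuf_get_mhlen() and keeps working even
+ * if the underlying structure sizes change.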
+ */ +#define MLEN mbuf_get_mlen() /* normal data len */ +#define MHLEN mbuf_get_mhlen() /* data len w/pkthdr */ + +#define MINCLSIZE mbuf_get_minclsize() /* cluster usage threshold */ + +extern void m_freem(struct mbuf *); +extern char *mcl_to_paddr(char *); +extern void m_adj(struct mbuf *, int); +extern void m_cat(struct mbuf *, struct mbuf *); +extern void m_copydata(struct mbuf *, int, int, void *); +extern struct mbuf *m_copym(struct mbuf *, int, int, int); +extern struct mbuf *m_get(int, int); +extern struct mbuf *m_gethdr(int, int); +extern struct mbuf *m_getpacket(void); +extern struct mbuf *m_getpackets(int, int, int); +extern struct mbuf *m_mclget(struct mbuf *, int); +extern void *m_mtod(struct mbuf *); +extern struct mbuf *m_prepend_2(struct mbuf *, int, int); +extern struct mbuf *m_pullup(struct mbuf *, int); +extern struct mbuf *m_split(struct mbuf *, int, int); +extern void m_mclfree(caddr_t p); + +__private_extern__ union mbigcluster *mbutl; /* start VA of mbuf pool */ +__private_extern__ union mbigcluster *embutl; /* end VA of mbuf pool */ +__private_extern__ unsigned int nmbclusters; /* number of mapped clusters */ +__private_extern__ int njcl; /* # of jumbo clusters */ +__private_extern__ int njclbytes; /* size of a jumbo cluster */ +__private_extern__ int max_linkhdr; /* largest link-level header */ +__private_extern__ int max_protohdr; /* largest protocol header */ +__private_extern__ int max_hdr; /* largest link+protocol header */ +__private_extern__ int max_datalen; /* MHLEN - max_hdr */ + +__private_extern__ unsigned int mbuf_default_ncl(int, u_int64_t); __private_extern__ void mbinit(void); __private_extern__ struct mbuf *m_clattach(struct mbuf *, int, caddr_t, - void (*)(caddr_t , u_int, caddr_t), u_int, caddr_t, int); + void (*)(caddr_t, u_int, caddr_t), u_int, caddr_t, int); __private_extern__ caddr_t m_bigalloc(int); __private_extern__ void m_bigfree(caddr_t, u_int, caddr_t); __private_extern__ struct mbuf *m_mbigget(struct mbuf *, int); __private_extern__ caddr_t m_16kalloc(int); __private_extern__ void m_16kfree(caddr_t, u_int, caddr_t); __private_extern__ struct mbuf *m_m16kget(struct mbuf *, int); -__private_extern__ void mbuf_growth_aggressive(void); -__private_extern__ void mbuf_growth_normal(void); - -/* Exported */ -struct mbuf *m_copym(struct mbuf *, int, int, int); -struct mbuf *m_split(struct mbuf *, int, int); -struct mbuf *m_free(struct mbuf *); -struct mbuf *m_get(int, int); -struct mbuf *m_getpacket(void); -struct mbuf *m_getclr(int, int); -struct mbuf *m_gethdr(int, int); -struct mbuf *m_prepend(struct mbuf *, int, int); -struct mbuf *m_prepend_2(struct mbuf *, int, int); -struct mbuf *m_pullup(struct mbuf *, int); -struct mbuf *m_retry(int, int); -struct mbuf *m_retryhdr(int, int); -void m_adj(struct mbuf *, int); -void m_freem(struct mbuf *); -int m_freem_list(struct mbuf *); -struct mbuf *m_devget(char *, int, int, struct ifnet *, void (*)(const void *, void *, size_t)); -char *mcl_to_paddr(char *); -struct mbuf *m_pulldown(struct mbuf*, int, int, int*); - -extern struct mbuf *m_getcl(int, int, int); -struct mbuf *m_mclget(struct mbuf *, int); -caddr_t m_mclalloc(int); -void m_mclfree(caddr_t p); -int m_mclhasreference(struct mbuf *); -void m_copy_pkthdr(struct mbuf *, struct mbuf*); - -int m_mclref(struct mbuf *); -int m_mclunref(struct mbuf *); - -void * m_mtod(struct mbuf *); -struct mbuf * m_dtom(void *); -int m_mtocl(void *); -union mcluster *m_cltom(int ); - -int m_trailingspace(struct mbuf *); -int m_leadingspace(struct mbuf *); - 
-struct mbuf *m_normalize(struct mbuf *m); -void m_mchtype(struct mbuf *m, int t); -void m_mcheck(struct mbuf*); - -extern void m_copyback(struct mbuf *, int , int , const void *); -extern struct mbuf *m_copyback_cow(struct mbuf *, int, int, const void *, int); -extern int m_makewritable(struct mbuf **, int, int, int); -void m_copydata(struct mbuf *, int , int , void *); -struct mbuf* m_dup(struct mbuf *m, int how); -void m_cat(struct mbuf *, struct mbuf *); -struct mbuf *m_copym_with_hdrs(struct mbuf*, int, int, int, struct mbuf**, int*); -struct mbuf *m_getpackets(int, int, int); -struct mbuf * m_getpackethdrs(int , int ); -struct mbuf* m_getpacket_how(int ); -struct mbuf * m_getpackets_internal(unsigned int *, int , int , int , size_t); -struct mbuf * m_allocpacket_internal(unsigned int * , size_t , unsigned int *, int , int , size_t ); -__END_DECLS +__private_extern__ struct mbuf *m_free(struct mbuf *); +__private_extern__ struct mbuf *m_getclr(int, int); +__private_extern__ struct mbuf *m_getptr(struct mbuf *, int, int *); +__private_extern__ unsigned int m_length(struct mbuf *); +__private_extern__ struct mbuf *m_prepend(struct mbuf *, int, int); +__private_extern__ struct mbuf *m_copyup(struct mbuf *, int, int); +__private_extern__ struct mbuf *m_retry(int, int); +__private_extern__ struct mbuf *m_retryhdr(int, int); +__private_extern__ int m_freem_list(struct mbuf *); +__private_extern__ int m_append(struct mbuf *, int, caddr_t); +__private_extern__ struct mbuf *m_last(struct mbuf *); +__private_extern__ struct mbuf *m_devget(char *, int, int, struct ifnet *, + void (*)(const void *, void *, size_t)); +__private_extern__ struct mbuf *m_pulldown(struct mbuf *, int, int, int *); + +__private_extern__ struct mbuf *m_getcl(int, int, int); +__private_extern__ caddr_t m_mclalloc(int); +__private_extern__ int m_mclhasreference(struct mbuf *); +__private_extern__ void m_copy_pkthdr(struct mbuf *, struct mbuf *); + +__private_extern__ struct mbuf *m_dtom(void *); +__private_extern__ int m_mtocl(void *); +__private_extern__ union mcluster *m_cltom(int); + +__private_extern__ int m_trailingspace(struct mbuf *); +__private_extern__ int m_leadingspace(struct mbuf *); + +__private_extern__ struct mbuf *m_normalize(struct mbuf *m); +__private_extern__ void m_mchtype(struct mbuf *m, int t); +__private_extern__ void m_mcheck(struct mbuf *); + +__private_extern__ void m_copyback(struct mbuf *, int, int, const void *); +__private_extern__ struct mbuf *m_copyback_cow(struct mbuf *, int, int, + const void *, int); +__private_extern__ int m_makewritable(struct mbuf **, int, int, int); +__private_extern__ struct mbuf *m_dup(struct mbuf *m, int how); +__private_extern__ struct mbuf *m_copym_with_hdrs(struct mbuf *, int, int, int, + struct mbuf **, int *); +__private_extern__ struct mbuf *m_getpackethdrs(int, int); +__private_extern__ struct mbuf *m_getpacket_how(int); +__private_extern__ struct mbuf *m_getpackets_internal(unsigned int *, int, + int, int, size_t); +__private_extern__ struct mbuf *m_allocpacket_internal(unsigned int *, size_t, + unsigned int *, int, int, size_t); /* - Packets may have annotations attached by affixing a list of "packet - tags" to the pkthdr structure. Packet tags are dynamically allocated - semi-opaque data structures that have a fixed header (struct m_tag) - that specifies the size of the memory block and an <id,type> pair that - identifies it. The id identifies the module and the type identifies the - type of data for that module. The id of zero is reserved for the kernel. 
- - Note that the packet tag returned by m_tag_allocate has the default - memory alignment implemented by malloc. To reference private data one - can use a construct like: - - struct m_tag *mtag = m_tag_allocate(...); - struct foo *p = (struct foo *)(mtag+1); - - if the alignment of struct m_tag is sufficient for referencing members - of struct foo. Otherwise it is necessary to embed struct m_tag within - the private data structure to insure proper alignment; e.g. - - struct foo { - struct m_tag tag; - ... - }; - struct foo *p = (struct foo *) m_tag_allocate(...); - struct m_tag *mtag = &p->tag; + * Packets may have annotations attached by affixing a list of "packet + * tags" to the pkthdr structure. Packet tags are dynamically allocated + * semi-opaque data structures that have a fixed header (struct m_tag) + * that specifies the size of the memory block and an <id,type> pair that + * identifies it. The id identifies the module and the type identifies the + * type of data for that module. The id of zero is reserved for the kernel. + * + * Note that the packet tag returned by m_tag_allocate has the default + * memory alignment implemented by malloc. To reference private data one + * can use a construct like: + * + * struct m_tag *mtag = m_tag_allocate(...); + * struct foo *p = (struct foo *)(mtag+1); + * + * if the alignment of struct m_tag is sufficient for referencing members + * of struct foo. Otherwise it is necessary to embed struct m_tag within + * the private data structure to insure proper alignment; e.g. + * + * struct foo { + * struct m_tag tag; + * ... + * }; + * struct foo *p = (struct foo *) m_tag_allocate(...); + * struct m_tag *mtag = &p->tag; */ -#define KERNEL_MODULE_TAG_ID 0 +#define KERNEL_MODULE_TAG_ID 0 enum { KERNEL_TAG_TYPE_NONE = 0, @@ -685,45 +850,27 @@ enum { KERNEL_TAG_TYPE_PF = 11 }; -/* - * As a temporary and low impact solution to replace the even uglier - * approach used so far in some parts of the network stack (which relies - * on global variables), packet tag-like annotations are stored in MT_TAG - * mbufs (or lookalikes) prepended to the actual mbuf chain. - * - * m_type = MT_TAG - * m_flags = m_tag_id - * m_next = next buffer in chain. - * - * BE VERY CAREFUL not to pass these blocks to the mbuf handling routines. 
- */ -#define _m_tag_id m_hdr.mh_flags - -__BEGIN_DECLS - /* Packet tag routines */ -struct m_tag *m_tag_alloc(u_int32_t id, u_int16_t type, int len, int wait); -void m_tag_free(struct m_tag *); -void m_tag_prepend(struct mbuf *, struct m_tag *); -void m_tag_unlink(struct mbuf *, struct m_tag *); -void m_tag_delete(struct mbuf *, struct m_tag *); -void m_tag_delete_chain(struct mbuf *, struct m_tag *); -struct m_tag *m_tag_locate(struct mbuf *,u_int32_t id, u_int16_t type, - struct m_tag *); -struct m_tag *m_tag_copy(struct m_tag *, int wait); -int m_tag_copy_chain(struct mbuf *to, struct mbuf *from, int wait); -void m_tag_init(struct mbuf *); -struct m_tag *m_tag_first(struct mbuf *); -struct m_tag *m_tag_next(struct mbuf *, struct m_tag *); - -extern void m_prio_init(struct mbuf *); -extern void m_prio_background(struct mbuf *); +__private_extern__ struct m_tag *m_tag_alloc(u_int32_t, u_int16_t, int, int); +__private_extern__ struct m_tag *m_tag_create(u_int32_t, u_int16_t, int, int, + struct mbuf *); +__private_extern__ void m_tag_free(struct m_tag *); +__private_extern__ void m_tag_prepend(struct mbuf *, struct m_tag *); +__private_extern__ void m_tag_unlink(struct mbuf *, struct m_tag *); +__private_extern__ void m_tag_delete(struct mbuf *, struct m_tag *); +__private_extern__ void m_tag_delete_chain(struct mbuf *, struct m_tag *); +__private_extern__ struct m_tag *m_tag_locate(struct mbuf *, u_int32_t, + u_int16_t, struct m_tag *); +__private_extern__ struct m_tag *m_tag_copy(struct m_tag *, int); +__private_extern__ int m_tag_copy_chain(struct mbuf *, struct mbuf *, int); +__private_extern__ void m_tag_init(struct mbuf *); +__private_extern__ struct m_tag *m_tag_first(struct mbuf *); +__private_extern__ struct m_tag *m_tag_next(struct mbuf *, struct m_tag *); + +__private_extern__ void m_prio_init(struct mbuf *); __END_DECLS - -#endif /* KERNEL */ - -#endif /* KERNEL_PRIVATE */ +#endif /* XNU_KERNEL_PRIVATE */ #ifdef KERNEL #include <sys/kpi_mbuf.h> #endif /* KERNEL */ diff --git a/bsd/sys/mcache.h b/bsd/sys/mcache.h index 21a169223..443e05b01 100644 --- a/bsd/sys/mcache.h +++ b/bsd/sys/mcache.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006-2007 Apple Inc. All rights reserved. + * Copyright (c) 2006-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -38,6 +38,7 @@ extern "C" { #include <sys/queue.h> #include <mach/boolean.h> #include <kern/locks.h> +#include <libkern/OSAtomic.h> #ifdef ASSERT #undef ASSERT @@ -57,11 +58,40 @@ extern "C" { #define ASSERT(EX) ((void)0) #endif -#if defined(__ppc__) -#define CPU_CACHE_SIZE 128 +#define atomic_add_16_ov(a, n) \ + ((u_int16_t) OSAddAtomic16(n, (volatile SInt16 *)a)) + +#define atomic_add_16(a, n) \ + ((void) atomic_add_16_ov(a, n)) + +#define atomic_add_32_ov(a, n) \ + ((u_int32_t) OSAddAtomic(n, (volatile SInt32 *)a)) + +#define atomic_add_32(a, n) \ + ((void) atomic_add_32_ov(a, n)) + +#define atomic_add_64_ov(a, n) \ + ((u_int64_t) OSAddAtomic64(n, (volatile SInt64 *)a)) + +#define atomic_add_64(a, n) \ + ((void) atomic_add_64_ov(a, n)) + +#define atomic_set_64(a, n) do { \ + while (!OSCompareAndSwap64(*a, n, (volatile UInt64 *)a)) \ + ; \ +} while (0) + +#if defined(__LP64__) +#define atomic_get_64(n, a) do { \ + (n) = *(a); \ +} while (0) #else +#define atomic_get_64(n, a) do { \ + (n) = atomic_add_64_ov(a, 0); \ +} while (0) +#endif /* __LP64__ */ + #define CPU_CACHE_SIZE 64 -#endif #ifndef IS_P2ALIGNED #define IS_P2ALIGNED(v, a) \ @@ -152,6 +182,7 @@ typedef unsigned int (*mcache_allocfn_t)(void *, mcache_obj_t ***, unsigned int, int); typedef void (*mcache_freefn_t)(void *, mcache_obj_t *, boolean_t); typedef void (*mcache_auditfn_t)(void *, mcache_obj_t *, boolean_t); +typedef void (*mcache_logfn_t)(u_int32_t, mcache_obj_t *, boolean_t); typedef void (*mcache_notifyfn_t)(void *, u_int32_t); typedef struct mcache { @@ -164,6 +195,7 @@ typedef struct mcache { mcache_allocfn_t mc_slab_alloc; /* slab layer allocate callback */ mcache_freefn_t mc_slab_free; /* slab layer free callback */ mcache_auditfn_t mc_slab_audit; /* slab layer audit callback */ + mcache_logfn_t mc_slab_log; /* slab layer log callback */ mcache_notifyfn_t mc_slab_notify; /* slab layer notify callback */ void *mc_private; /* opaque arg to callbacks */ size_t mc_bufsize; /* object size */ @@ -210,11 +242,12 @@ typedef struct mcache { /* Valid values for mc_flags */ #define MCF_VERIFY 0x00000001 /* enable verification */ -#define MCF_AUDIT 0x00000002 /* enable transaction auditing */ +#define MCF_TRACE 0x00000002 /* enable transaction auditing */ #define MCF_NOCPUCACHE 0x00000010 /* disable CPU layer caching */ +#define MCF_NOLEAKLOG 0x00000100 /* disable leak logging */ -#define MCF_DEBUG (MCF_VERIFY | MCF_AUDIT) -#define MCF_FLAGS_MASK (MCF_DEBUG | MCF_NOCPUCACHE) +#define MCF_DEBUG (MCF_VERIFY | MCF_TRACE) +#define MCF_FLAGS_MASK (MCF_DEBUG | MCF_NOCPUCACHE | MCF_NOLEAKLOG) /* Valid values for notify callback */ #define MCN_RETRYALLOC 0x00000001 /* Allocation should be retried */ @@ -245,8 +278,8 @@ __private_extern__ mcache_t *mcache_create(const char *, size_t, __private_extern__ void *mcache_alloc(mcache_t *, int); __private_extern__ void mcache_free(mcache_t *, void *); __private_extern__ mcache_t *mcache_create_ext(const char *, size_t, - mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_notifyfn_t, - void *, u_int32_t, int); + mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_logfn_t, + mcache_notifyfn_t, void *, u_int32_t, int); __private_extern__ void mcache_destroy(mcache_t *); __private_extern__ unsigned int mcache_alloc_ext(mcache_t *, mcache_obj_t **, unsigned int, int); diff --git a/bsd/sys/mman.h b/bsd/sys/mman.h index a82aec943..109c63634 100644 --- a/bsd/sys/mman.h +++ b/bsd/sys/mman.h @@ -130,6 +130,7 @@ typedef __darwin_size_t 
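The atomic_add_* and atomic_get_64 wrappers added to mcache.h above are thin veneers over the libkern OSAtomic primitives. A minimal sketch of their intended use (the counters are illustrative, not part of the patch):

    static volatile u_int32_t cache_hits;
    static volatile u_int64_t bytes_cached;

    /* lock-free statistics updates */
    atomic_add_32(&cache_hits, 1);
    atomic_add_64(&bytes_cached, (SInt64)len);

    /* consistent 64-bit snapshot, safe on 32-bit kernels as well */
    u_int64_t snapshot;
    atomic_get_64(snapshot, &bytes_cached);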
size_t; #define MAP_NOEXTEND 0x0100 /* for MAP_FILE, don't change file size */ #define MAP_HASSEMAPHORE 0x0200 /* region may contain semaphores */ #define MAP_NOCACHE 0x0400 /* don't cache pages for this mapping */ +#define MAP_JIT 0x0800 /* Allocate a region that will be used for JIT purposes */ #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ /* diff --git a/bsd/sys/mount.h b/bsd/sys/mount.h index 8633a1465..f6594c0ef 100644 --- a/bsd/sys/mount.h +++ b/bsd/sys/mount.h @@ -293,9 +293,6 @@ struct vfs_attr { * NFS export related mount flags. */ #define MNT_EXPORTED 0x00000100 /* file system is exported */ -#ifdef PRIVATE -#define MNT_IMGSRC 0x00000200 -#endif /* CONFIG_IMGSRC_ACCESS */ /* * MAC labeled / "quarantined" flag @@ -319,6 +316,9 @@ struct vfs_attr { #define MNT_DEFWRITE 0x02000000 /* filesystem should defer writes */ #define MNT_MULTILABEL 0x04000000 /* MAC support for individual labels */ #define MNT_NOATIME 0x10000000 /* disable update of file access time */ +#ifdef BSD_KERNEL_PRIVATE +/* #define MNT_IMGSRC_BY_INDEX 0x20000000 see sys/imgsrc.h */ +#endif /* BSD_KERNEL_PRIVATE */ /* backwards compatibility only */ #define MNT_UNKNOWNPERMISSIONS MNT_IGNORE_OWNERSHIP @@ -334,7 +334,8 @@ struct vfs_attr { MNT_LOCAL | MNT_QUOTA | \ MNT_ROOTFS | MNT_DOVOLFS | MNT_DONTBROWSE | \ MNT_IGNORE_OWNERSHIP | MNT_AUTOMOUNTED | MNT_JOURNALED | \ - MNT_NOUSERXATTR | MNT_DEFWRITE | MNT_MULTILABEL | MNT_NOATIME | MNT_CPROTECT ) + MNT_NOUSERXATTR | MNT_DEFWRITE | MNT_MULTILABEL | \ + MNT_NOATIME | MNT_CPROTECT) /* * External filesystem command modifier flags. * Unmount can use the MNT_FORCE flag. @@ -440,6 +441,7 @@ union union_vfsidctl { /* the fields vc_vers and vc_fsid are compatible */ #define VFS_CTL_NEWADDR 0x00010004 /* reconnect to new address */ #define VFS_CTL_TIMEO 0x00010005 /* set timeout for vfs notification */ #define VFS_CTL_NOLOCKS 0x00010006 /* disable file locking */ +#define VFS_CTL_SADDR 0x00010007 /* get server address */ struct vfsquery { u_int32_t vq_flags; @@ -684,6 +686,9 @@ struct vfsops { /* * flags passed into vfs_iterate */ +#ifdef PRIVATE +#define VFS_ITERATE_TAIL_FIRST (1 << 0) +#endif /* PRIVATE */ /* * return values from callback @@ -1164,14 +1169,88 @@ void vfs_event_signal(fsid_t *, u_int32_t, intptr_t); */ void vfs_event_init(void); /* XXX We should not export this */ #ifdef KERNEL_PRIVATE +int vfs_getbyid(fsid_t *fsid, ino64_t ino, vnode_t *vpp, vfs_context_t ctx); int vfs_getattr(mount_t mp, struct vfs_attr *vfa, vfs_context_t ctx); int vfs_setattr(mount_t mp, struct vfs_attr *vfa, vfs_context_t ctx); int vfs_extendedsecurity(mount_t); mount_t vfs_getvfs_by_mntonname(char *); void vfs_markdependency(mount_t); vnode_t vfs_vnodecovered(mount_t mp); /* Returns vnode with an iocount that must be released with vnode_put() */ -void * vfs_mntlabel(mount_t mp); /* Safe to cast to "struct label*"; returns "void*" to limit dependence of mount.h on security headers. */ +vnode_t vfs_devvp(mount_t mp); /* Please see block comment with implementation */ +void * vfs_mntlabel(mount_t mp); /* Safe to cast to "struct label*"; returns "void*" to limit dependence of mount.h on security headers. */ void vfs_setunmountpreflight(mount_t mp); +void vfs_setcompoundopen(mount_t mp); +uint64_t vfs_throttle_mask(mount_t mp); + +struct vnode_trigger_info; + +/*! + @function vfs_addtrigger + @abstract Create an "external" trigger vnode: look up a vnode and mark it as + a trigger. Can only safely be called in the context of a callback set by + vfs_settriggercallback(). 
May only be used on a file which is not already + marked as a trigger. + @param relpath Path relative to root of mountpoint at which to mark trigger. + @param vtip Information about trigger; analogous to "vnode_trigger_param" + argument to vnode_create. + @param ctx Authorization context. + */ +int vfs_addtrigger(mount_t mp, const char *relpath, struct vnode_trigger_info *vtip, vfs_context_t ctx); + + +/*! + @enum vfs_trigger_callback_op_t + @abstract Operation to perform after an attempted unmount (successful or otherwise). + @constant VTC_REPLACE Unmount failed: attempt to replace triggers. Only valid + VFS operation to perform in this context is vfs_addtrigger(). + @constant VTC_RELEASE Unmount succeeded: release external triggering context. + */ +typedef enum { + VTC_REPLACE, + VTC_RELEASE +} vfs_trigger_callback_op_t; + +/*! + @typedef vfs_trigger_callback_t + @abstract Callback to be passed to vfs_settriggercallback() and invoked from + unmount context. + @param mp Mountpoint on which unmount is occurring. + @param op Operation (see vfs_trigger_callback_op_t) + @param data Context passed to vfs_settriggercallback() + @param ctx Authorization context in which unmount is occurring. + */ +typedef void vfs_trigger_callback_t(mount_t mp, vfs_trigger_callback_op_t op, void *data, vfs_context_t ctx); + +/*! + @function vfs_settriggercallback + @abstract Install a callback to be called after unmount attempts on a volume, + to restore triggers for failed unmounts and release state for successful ones. + @discussion Installs a callback which will be called in two situations: a + failed unmount where vnodes may have been reclaimed and a successful unmount. + Gives an external trigger-marking entity an opportunity to replace triggers + which may have been reclaimed. The callback can only be installed (not + cleared), and only one callback can be installed. The callback will be called + with a read-write lock held on the mount point; in the VTC_REPLACE case, the + <em>only</em> valid VFS operation to perform in the context of the callback is + vfs_addtrigger() on the mountpoint in question. This rwlock is held in order + to attempt to provide some modicum of coverage from lookups which might find + missing trigger vnodes and receive spurious ENOENTs. Note that this + protection is incomplete--current working directories, or traversals up into a + volume via ".." may still find missing triggers. As of this writing, no + serialization mechanism exists to do better than this. + When the "op" is VTC_RELEASE, the mountpoint is going away, and the only valid + VFS operation is to free the private data pointer if needed. The callback + will be called immediately, with VTC_REPLACE, from vfs_settriggercallback(), + if installation is successful. + @param fsid FSID for filesystem in question. + @param vtc Callback pointer. + @param data Context pointer to be passed to callback. + @param flags Currently unused. + @param ctx Authorization context. + @return 0 for success. EBUSY if a trigger has already been installed. 
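+
+ A minimal sketch of the expected calling pattern (the callback, cookie,
+ and trigger-info names are illustrative, not part of this KPI):
+
+     static void
+     my_trigger_cb(mount_t mp, vfs_trigger_callback_op_t op, void *data,
+         vfs_context_t ctx)
+     {
+             if (op == VTC_REPLACE)
+                     (void) vfs_addtrigger(mp, "net", &my_vti, ctx);
+             else if (op == VTC_RELEASE)
+                     my_release(data);
+     }
+
+     error = vfs_settriggercallback(&fsid, my_trigger_cb, my_cookie, 0, ctx);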
+ */ +int vfs_settriggercallback(fsid_t *fsid, vfs_trigger_callback_t vtc, void *data, uint32_t flags, vfs_context_t ctx); + #endif /* KERNEL_PRIVATE */ __END_DECLS diff --git a/bsd/sys/mount_internal.h b/bsd/sys/mount_internal.h index b069b1a0f..141fb3eeb 100644 --- a/bsd/sys/mount_internal.h +++ b/bsd/sys/mount_internal.h @@ -115,6 +115,7 @@ struct mount { struct vnodelst mnt_newvnodes; /* list of vnodes this mount */ uint32_t mnt_flag; /* flags */ uint32_t mnt_kern_flag; /* kernel only flags */ + uint32_t mnt_compound_ops; /* Available compound operations */ uint32_t mnt_lflag; /* mount life cycle flags */ uint32_t mnt_maxsymlinklen; /* max size of short symlink */ struct vfsstatfs mnt_vfsstat; /* cache of filesystem stats */ @@ -131,17 +132,22 @@ struct mount { uint32_t mnt_ioqueue_depth; /* the maxiumum number of commands a device can accept */ uint32_t mnt_ioscale; /* scale the various throttles/limits imposed on the amount of I/O in flight */ uint32_t mnt_ioflags; /* flags for underlying device */ - pending_io_t mnt_pending_write_size; /* byte count of pending writes */ - pending_io_t mnt_pending_read_size; /* byte count of pending reads */ + pending_io_t mnt_pending_write_size __attribute__((aligned(sizeof(pending_io_t)))); /* byte count of pending writes */ + pending_io_t mnt_pending_read_size __attribute__((aligned(sizeof(pending_io_t)))); /* byte count of pending reads */ lck_rw_t mnt_rwlock; /* mutex readwrite lock */ lck_mtx_t mnt_renamelock; /* mutex that serializes renames that change shape of tree */ vnode_t mnt_devvp; /* the device mounted on for local file systems */ uint32_t mnt_devbsdunit; /* the BSD unit number of the device */ + uint64_t mnt_throttle_mask; /* the throttle mask of what devices will be affected by I/O from this mnt */ void *mnt_throttle_info; /* used by the throttle code */ int32_t mnt_crossref; /* refernces to cover lookups crossing into mp */ int32_t mnt_iterref; /* refernces to cover iterations; drained makes it -ve */ - +#if CONFIG_TRIGGERS + int32_t mnt_numtriggers; /* num of trigger vnodes for this mount */ + vfs_trigger_callback_t *mnt_triggercallback; + void *mnt_triggerdata; +#endif /* XXX 3762912 hack to support HFS filesystem 'owner' */ uid_t mnt_fsowner; gid_t mnt_fsgroup; @@ -190,6 +196,7 @@ struct mount { */ pid_t mnt_dependent_pid; void *mnt_dependent_process; + char fstypename_override[MFSTYPENAMELEN]; }; /* @@ -228,6 +235,12 @@ extern struct mount * dead_mountp; * because the bits here were broken out from the high bits * of the mount flags. */ +#define MNTK_DENY_READDIREXT 0x00000200 /* Deny Extended-style readdir's for this volume */ +#define MNTK_PERMIT_UNMOUNT 0x00000400 /* Allow (non-forced) unmounts by UIDs other than the one that mounted the volume */ +#ifdef NFSCLIENT +#define MNTK_TYPENAME_OVERRIDE 0x00000800 /* override the fstypename for statfs() */ +#endif /* NFSCLIENT */ +#define MNTK_KERNEL_MOUNT 0x00001000 /* mount came from kernel side */ #ifdef CONFIG_IMGSRC_ACCESS #define MNTK_HAS_MOVED 0x00002000 #define MNTK_BACKS_ROOT 0x00004000 @@ -392,13 +405,11 @@ struct user32_statfs { }; /* - * throttle I/Os are affected only by normal I/Os happening on the same bsd device node. For example, disk1s3 and - * disk1s5 are the same device node, while disk1s3 and disk2 are not (although disk2 might be a mounted disk image file - * and the disk image file resides on a partition in disk1). 
The following constant defines the maximum number of - different bsd device nodes the algorithm can consider, and larger numbers are rounded by this maximum. Since - throttled I/O is usually useful in non-server environment only, a small number 16 is enough in most cases + * throttle I/Os are affected only by normal I/Os happening on the same spindle. Currently we use a 64-bit integer to + * represent what devices are affected, so we can handle at most 64 different spindles. Since + * throttled I/O is usually useful in non-server environments only, this number is enough in most cases. */ -#define LOWPRI_MAX_NUM_DEV 16 +#define LOWPRI_MAX_NUM_DEV 64 __BEGIN_DECLS @@ -425,7 +436,7 @@ void vfs_unmountall(void); int safedounmount(struct mount *, int, vfs_context_t); int dounmount(struct mount *, int, int, vfs_context_t); -/* xnuy internal api */ +/* xnu internal api */ void mount_dropcrossref(mount_t, vnode_t, int); mount_t mount_lookupby_volfsid(int, int); mount_t mount_list_lookupby_fsid(fsid_t *, int, int); @@ -437,11 +448,31 @@ void mount_iterdrop(mount_t); void mount_iterdrain(mount_t); void mount_iterreset(mount_t); +/* tags a volume as not supporting extended readdir for NFS exports */ +#ifdef BSD_KERNEL_PRIVATE +void mount_set_noreaddirext (mount_t); +#endif + +/* Private NFS spi */ +#define KERNEL_MOUNT_NOAUTH 0x01 /* Don't check the UID of the directory we are mounting on */ +#define KERNEL_MOUNT_PERMIT_UNMOUNT 0x02 /* Allow (non-forced) unmounts by users other than the one who mounted the volume */ +#if NFSCLIENT +/* + * NOTE: kernel_mount() does not force MNT_NOSUID, MNT_NOEXEC, or MNT_NODEV for non-privileged + * mounting credentials, as the mount(2) system call does. + */ +int kernel_mount(char *, vnode_t, vnode_t, const char *, void *, size_t, int, uint32_t, vfs_context_t); +boolean_t vfs_iskernelmount(mount_t); +#endif + /* throttled I/O api */ int throttle_get_io_policy(struct uthread **ut); -extern void throttle_lowpri_io(boolean_t ok_to_sleep); int throttle_io_will_be_throttled(int lowpri_window_msecs, mount_t mp); +/* throttled I/O helper function */ +/* convert the lowest bit to a device index */ +extern int num_trailing_0(uint64_t n); + __END_DECLS #endif /* !_SYS_MOUNT_INTERNAL_H_ */ diff --git a/bsd/sys/msgbuf.h b/bsd/sys/msgbuf.h index e05b73e9e..5b8211cac 100644 --- a/bsd/sys/msgbuf.h +++ b/bsd/sys/msgbuf.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -65,22 +65,24 @@ #include <sys/cdefs.h> -#define MSG_BSIZE 4096 +#define MAX_MSG_BSIZE (1*1024*1024) struct msgbuf { #define MSG_MAGIC 0x063061 - long msg_magic; - long msg_size; - long msg_bufx; /* write pointer */ - long msg_bufr; /* read pointer */ - char *msg_bufc; /* buffer */ + int msg_magic; + int msg_size; + int msg_bufx; /* write pointer */ + int msg_bufr; /* read pointer */ + char *msg_bufc; /* buffer */ }; -#ifdef KERNEL + +#ifdef XNU_KERNEL_PRIVATE __BEGIN_DECLS extern struct msgbuf *msgbufp; extern void log_putc(char); extern void log_putc_locked(char); -extern void log_setsize(long size); +extern int log_setsize(int size); extern int log_dmesg(user_addr_t, uint32_t, int32_t *); __END_DECLS -#endif +#endif /* XNU_KERNEL_PRIVATE */ + #endif /* !_SYS_MSGBUF_H_ */ diff --git a/bsd/sys/namei.h b/bsd/sys/namei.h index 5aa2f701a..56d3ecf13 100644 --- a/bsd/sys/namei.h +++ b/bsd/sys/namei.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc.
All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -90,12 +90,15 @@ /* * Encapsulation of namei parameters. */ -struct nameidata { +struct nameidata { /* * Arguments to namei/lookup. */ user_addr_t ni_dirp; /* pathname pointer */ enum uio_seg ni_segflg; /* location of pathname */ +#if CONFIG_TRIGGERS + enum path_operation ni_op; /* intended operation, see enum path_operation in vnode.h */ +#endif /* CONFIG_TRIGGERS */ /* * Arguments to lookup. */ @@ -116,8 +119,25 @@ struct nameidata { u_long ni_loopcnt; /* count of symlinks encountered */ struct componentname ni_cnd; + int32_t ni_flag; + int ni_ncgeneration; /* For a batched vnop, grab generation beforehand */ }; +#define NAMEI_CONTLOOKUP 0x002 /* Continue processing a lookup which was partially processed in a compound VNOP */ +#define NAMEI_TRAILINGSLASH 0x004 /* There was at least one trailing slash after last component */ +#define NAMEI_UNFINISHED 0x008 /* We broke off a lookup to do a compound op */ +/* + * XXX Hack: we need to encode the intended VNOP in order to + * be able to include information about which operations a filesystem + * supports in the decision to break off a lookup early. + */ +#define NAMEI_COMPOUNDOPEN 0x010 +#define NAMEI_COMPOUNDREMOVE 0x020 +#define NAMEI_COMPOUNDMKDIR 0x040 +#define NAMEI_COMPOUNDRMDIR 0x080 +#define NAMEI_COMPOUNDRENAME 0x100 +#define NAMEI_COMPOUND_OP_MASK (NAMEI_COMPOUNDOPEN | NAMEI_COMPOUNDREMOVE | NAMEI_COMPOUNDMKDIR | NAMEI_COMPOUNDRMDIR | NAMEI_COMPOUNDRENAME) + #ifdef KERNEL /* * namei operational modifier flags, stored in ni_cnd.flags @@ -169,7 +189,27 @@ struct nameidata { /* * Initialization of an nameidata structure. */ -#define NDINIT(ndp, op, flags, segflg, namep, ctx) { \ + +#if CONFIG_TRIGGERS +/* Note: vnode triggers require more precise path operation (ni_op) */ + +#define NDINIT(ndp, op, pop, flags, segflg, namep, ctx) { \ + (ndp)->ni_cnd.cn_nameiop = op; \ + (ndp)->ni_op = pop; \ + (ndp)->ni_cnd.cn_flags = flags; \ + if ((segflg) == UIO_USERSPACE) { \ + (ndp)->ni_segflg = ((IS_64BIT_PROCESS(vfs_context_proc(ctx))) ? 
UIO_USERSPACE64 : UIO_USERSPACE32); \ + } \ + else { \ + (ndp)->ni_segflg = segflg; \ + } \ + (ndp)->ni_dirp = namep; \ + (ndp)->ni_cnd.cn_context = ctx; \ + (ndp)->ni_flag = 0; \ + (ndp)->ni_cnd.cn_ndp = (ndp); \ +} +#else +#define NDINIT(ndp, op, _unused_, flags, segflg, namep, ctx) { \ (ndp)->ni_cnd.cn_nameiop = op; \ (ndp)->ni_cnd.cn_flags = flags; \ if ((segflg) == UIO_USERSPACE) { \ @@ -180,7 +220,11 @@ struct nameidata { } \ (ndp)->ni_dirp = namep; \ (ndp)->ni_cnd.cn_context = ctx; \ + (ndp)->ni_flag = 0; \ + (ndp)->ni_cnd.cn_ndp = (ndp); \ } +#endif /* CONFIG_TRIGGERS */ + #endif /* KERNEL */ /* @@ -210,21 +254,25 @@ struct namecache { int namei(struct nameidata *ndp); void nameidone(struct nameidata *); +void namei_unlock_fsnode(struct nameidata *ndp); int lookup(struct nameidata *ndp); int relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp); +void lookup_compound_vnop_post_hook(int error, vnode_t dvp, vnode_t vp, struct nameidata *ndp, int did_create); /* * namecache function prototypes */ void cache_purgevfs(mount_t mp); int cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, - vfs_context_t context, int *trailing_slash, int *dp_authorized, vnode_t last_dp); + vfs_context_t context, int *dp_authorized, vnode_t last_dp); void vnode_cache_authorized_action(vnode_t vp, vfs_context_t context, kauth_action_t action); void vnode_uncache_authorized_action(vnode_t vp, kauth_action_t action); boolean_t vnode_cache_is_stale(vnode_t vp); boolean_t vnode_cache_is_authorized(vnode_t vp, vfs_context_t context, kauth_action_t action); +int lookup_validate_creation_path(struct nameidata *ndp); +int namei_compound_available(vnode_t dp, struct nameidata *ndp); #endif /* KERNEL */ diff --git a/bsd/dev/ppc/memmove.c b/bsd/sys/netboot.h similarity index 72% rename from bsd/dev/ppc/memmove.c rename to bsd/sys/netboot.h index e102c248a..717100d7f 100644 --- a/bsd/dev/ppc/memmove.c +++ b/bsd/sys/netboot.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,30 +25,26 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* Copyright (c) 1991,1993 NeXT Computer, Inc. All rights reserved. - * - */ -#include <sys/systm.h> +/* + * netboot.h + * - definitions for network booting/rooting + */ -void ovbcopy(const void *src, void *dst, size_t ulen) -{ - bcopy(src, dst, ulen); -} +#ifndef _SYS_NETBOOT_H +#define _SYS_NETBOOT_H -#if 0 -void *memcpy(void *dst, const void *src, unsigned int ulen) -{ - bcopy(src, dst, ulen); - return dst; -} +#include <mach/boolean.h> +#include <netinet/in.h> -void *memmove(void *dst, const void *src, unsigned int ulen) -{ - bcopy(src, dst, ulen); - return dst; -} +int netboot_setup(void); +int netboot_mountroot(void); +int netboot_root(void); -#endif /* 0 */ +boolean_t netboot_iaddr(struct in_addr * iaddr_p); +boolean_t netboot_rootpath(struct in_addr * server_ip, + char * name, int name_len, + char * path, int path_len); +#endif /* _SYS_NETBOOT_H */ diff --git a/bsd/sys/priv.h b/bsd/sys/priv.h new file mode 100644 index 000000000..1abb898bf --- /dev/null +++ b/bsd/sys/priv.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2010 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). 
You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/*- + * Copyright (c) 2006 nCircle Network Security, Inc. + * All rights reserved. + * + * This software was developed by Robert N. M. Watson for the TrustedBSD + * Project under contract to nCircle Network Security, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR, NCIRCLE NETWORK SECURITY, + * INC., OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD: src/sys/sys/priv.h,v 1.38.2.1.2.1 2009/10/25 01:10:29 kensmith Exp $ + */ + +/* + * Kernel privilege checking interface. + */ +#ifndef _SYS_PRIV_H_ +#define _SYS_PRIV_H_ + +/* + * Privilege list, sorted loosely by kernel subsystem. + * + * Think carefully before adding or reusing one of these privileges -- are + * there existing instances referring to the same privilege? Particular + * numeric privilege assignments are part of the kernel extension ABI. + */ + +/* + * The remaining privileges typically correspond to one or a small + * number of specific privilege checks, and have (relatively) precise + * meanings. They are loosely sorted into a set of base system + * privileges, such as the ability to reboot, and then loosely by + * subsystem, indicated by a subsystem name. + */ +#define PRIV_ADJTIME 1000 /* Set time adjustment. */ + +/* + * IPv4 and IPv6 privileges. 
+ */ +#define PRIV_NETINET_RESERVEDPORT 11000 /* Bind low port number. */ + +#ifdef KERNEL +/* + * Privilege check interface. No flags are currently defined for the API. + */ +#include <sys/kauth.h> +int priv_check_cred(kauth_cred_t cred, int priv, int flags); +#endif + +#endif /* !_SYS_PRIV_H_ */ diff --git a/bsd/sys/proc.h b/bsd/sys/proc.h index 92c86c0a1..8cec174a3 100644 --- a/bsd/sys/proc.h +++ b/bsd/sys/proc.h @@ -167,7 +167,7 @@ struct extern_proc { #define P_TIMEOUT 0x00000400 /* Timing out during sleep */ #define P_TRACED 0x00000800 /* Debugged process being traced */ -#define P_RESV3 0x00001000 /* (P_WAITED)Debugging prc has waited for child */ +#define P_DISABLE_ASLR 0x00001000 /* Disable address space layout randomization */ #define P_WEXIT 0x00002000 /* Working on exiting */ #define P_EXEC 0x00004000 /* Process called exec. */ @@ -252,7 +252,7 @@ extern int proc_pid(proc_t); extern int proc_ppid(proc_t); /* returns 1 if the process is marked for no remote hangs */ extern int proc_noremotehang(proc_t); -/* returns 1 is the process is marked for force quota */ +/* returns 1 if the process is marked for force quota */ extern int proc_forcequota(proc_t); /* this routine returns 1 if the process is running with 64bit address space, else 0 */ @@ -292,9 +292,41 @@ extern int msleep1(void *chan, lck_mtx_t *mtx, int pri, const char *wmesg, u_int extern int proc_pidversion(proc_t); extern int proc_getcdhash(proc_t, unsigned char *); #endif /* KERNEL_PRIVATE */ +#ifdef XNU_KERNEL_PRIVATE +/* + * This returns a unique 64-bit id of a given process. + * Caller needs to hold a proper reference on the + * passed-in process structure. + */ +extern uint64_t proc_uniqueid(proc_t); +extern uint64_t proc_selfuniqueid(void); +extern void proc_getexecutableuuid(proc_t, unsigned char *, unsigned long); +#endif /* XNU_KERNEL_PRIVATE */ __END_DECLS #endif /* KERNEL */ +#ifdef PRIVATE + +/* Values for pid_shutdown_sockets */ +#ifdef KERNEL +#define SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL 0x0 +#endif /* KERNEL */ +#define SHUTDOWN_SOCKET_LEVEL_DISCONNECT_SVC 0x1 +#define SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL 0x2 + +#ifndef KERNEL + +__BEGIN_DECLS + +int pid_suspend(int pid); +int pid_resume(int pid); + + +__END_DECLS + +#endif /* !KERNEL */ +#endif /* PRIVATE */ + #endif /* !_SYS_PROC_H_ */ diff --git a/bsd/sys/proc_info.h b/bsd/sys/proc_info.h index e22cd3ab4..67842664d 100644 --- a/bsd/sys/proc_info.h +++ b/bsd/sys/proc_info.h @@ -50,6 +50,7 @@ __BEGIN_DECLS #define PROC_TTY_ONLY 3 #define PROC_UID_ONLY 4 #define PROC_RUID_ONLY 5 +#define PROC_PPID_ONLY 6 struct proc_bsdinfo { uint32_t pbi_flags; /* 64bit; emulated etc */ @@ -77,25 +78,47 @@ }; +struct proc_bsdshortinfo { + uint32_t pbsi_pid; /* process id */ + uint32_t pbsi_ppid; /* process parent id */ + uint32_t pbsi_pgid; /* process group id */ + uint32_t pbsi_status; /* p_stat value, SZOMB, SRUN, etc */ + char pbsi_comm[MAXCOMLEN]; /* up to 16 characters of process name */ + uint32_t pbsi_flags; /* 64bit; emulated etc */ + uid_t pbsi_uid; /* current uid on process */ + gid_t pbsi_gid; /* current gid on process */ + uid_t pbsi_ruid; /* current ruid on process */ + gid_t pbsi_rgid; /* current rgid on process */ + uid_t pbsi_svuid; /* current svuid on process */ + gid_t pbsi_svgid; /* current svgid on process */ + uint32_t pbsi_rfu; /* reserved for future use */ +}; + /* pbi_flags values */ -#define PROC_FLAG_SYSTEM 1 -#define PROC_FLAG_TRACED 2 -#define PROC_FLAG_INEXIT 4 +#define PROC_FLAG_SYSTEM 1 /* System process */ +#define
PROC_FLAG_TRACED 2 /* process currently being traced, possibly by gdb */ +#define PROC_FLAG_INEXIT 4 /* process is working its way in exit() */ #define PROC_FLAG_PPWAIT 8 -#define PROC_FLAG_LP64 0x10 -#define PROC_FLAG_SLEADER 0x20 -#define PROC_FLAG_CTTY 0x40 -#define PROC_FLAG_CONTROLT 0x80 -#define PROC_FLAG_THCWD 0x100 +#define PROC_FLAG_LP64 0x10 /* 64bit process */ +#define PROC_FLAG_SLEADER 0x20 /* The process is the session leader */ +#define PROC_FLAG_CTTY 0x40 /* process has a control tty */ +#define PROC_FLAG_CONTROLT 0x80 /* Has a controlling terminal */ +#define PROC_FLAG_THCWD 0x100 /* process has a thread with cwd */ /* process control bits for resource starvation */ -#define PROC_FLAG_PC_THROTTLE 0x200 -#define PROC_FLAG_PC_SUSP 0x400 -#define PROC_FLAG_PC_KILL 0x600 +#define PROC_FLAG_PC_THROTTLE 0x200 /* In resource starvation situations, this process is to be throttled */ +#define PROC_FLAG_PC_SUSP 0x400 /* In resource starvation situations, this process is to be suspended */ +#define PROC_FLAG_PC_KILL 0x600 /* In resource starvation situations, this process is to be terminated */ #define PROC_FLAG_PC_MASK 0x600 /* process action bits for resource starvation */ -#define PROC_FLAG_PA_THROTTLE 0x800 -#define PROC_FLAG_PA_SUSP 0x1000 +#define PROC_FLAG_PA_THROTTLE 0x800 /* The process is currently throttled due to resource starvation */ +#define PROC_FLAG_PA_SUSP 0x1000 /* The process is currently suspended due to resource starvation */ +#define PROC_FLAG_PSUGID 0x2000 /* process has set privileges since last exec */ +#define PROC_FLAG_EXEC 0x4000 /* process has called exec */ +#ifdef PRIVATE +#define PROC_FLAG_DARWINBG 0x8000 /* process in darwin background */ +#define PROC_FLAG_EXT_DARWINBG 0x10000 /* process in darwin background - external enforcement */ +#endif struct proc_taskinfo { @@ -174,6 +197,7 @@ struct proc_regioninfo { #define SM_TRUESHARED 5 #define SM_PRIVATE_ALIASED 6 #define SM_SHARED_ALIASED 7 +#define SM_LARGE_PAGE 8 /* @@ -199,9 +223,16 @@ struct proc_workqueueinfo { uint32_t pwq_nthreads; /* total number of workqueue threads */ uint32_t pwq_runthreads; /* total number of running workqueue threads */ uint32_t pwq_blockedthreads; /* total number of blocked workqueue threads */ - uint32_t reserved[1]; /* reserved for future use */ + uint32_t pwq_state; }; +/* + * workqueue state (pwq_state field) + */ +#define WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT 0x1 +#define WQ_EXCEEDED_TOTAL_THREAD_LIMIT 0x2 + + struct proc_fileinfo { uint32_t fi_openflags; uint32_t fi_status; @@ -561,6 +592,11 @@ struct proc_fdinfo { uint32_t proc_fdtype; }; +struct proc_fileportinfo { + uint32_t proc_fileport; + uint32_t proc_fdtype; +}; + /* Flavors for proc_pidinfo() */ #define PROC_PIDLISTFDS 1 #define PROC_PIDLISTFD_SIZE (sizeof(struct proc_fdinfo)) @@ -600,6 +636,12 @@ struct proc_fdinfo { #define PROC_PIDWORKQUEUEINFO 12 #define PROC_PIDWORKQUEUEINFO_SIZE (sizeof(struct proc_workqueueinfo)) +#define PROC_PIDT_SHORTBSDINFO 13 +#define PROC_PIDT_SHORTBSDINFO_SIZE (sizeof(struct proc_bsdshortinfo)) + +#define PROC_PIDLISTFILEPORTS 14 +#define PROC_PIDLISTFILEPORTS_SIZE (sizeof(struct proc_fileportinfo)) + /* Flavors for proc_pidfdinfo */ #define PROC_PIDFDVNODEINFO 1 @@ -626,9 +668,29 @@ struct proc_fdinfo { #define PROC_PIDFDATALKINFO 8 #define PROC_PIDFDATALKINFO_SIZE (sizeof(struct appletalk_fdinfo)) +/* Flavors for proc_pidfileportinfo */ + +#define PROC_PIDFILEPORTVNODEPATHINFO 2 /* out: vnode_fdinfowithpath */ +#define PROC_PIDFILEPORTVNODEPATHINFO_SIZE \ + 
PROC_PIDFDVNODEPATHINFO_SIZE + +#define PROC_PIDFILEPORTSOCKETINFO 3 /* out: socket_fdinfo */ +#define PROC_PIDFILEPORTSOCKETINFO_SIZE PROC_PIDFDSOCKETINFO_SIZE + +#define PROC_PIDFILEPORTPSHMINFO 5 /* out: pshm_fdinfo */ +#define PROC_PIDFILEPORTPSHMINFO_SIZE PROC_PIDFDPSHMINFO_SIZE + +#define PROC_PIDFILEPORTPIPEINFO 6 /* out: pipe_fdinfo */ +#define PROC_PIDFILEPORTPIPEINFO_SIZE PROC_PIDFDPIPEINFO_SIZE + /* used for proc_setcontrol */ #define PROC_SELFSET_PCONTROL 1 +#define PROC_SELFSET_THREADNAME 2 +#define PROC_SELFSET_THREADNAME_SIZE (MAXTHREADNAMESIZE - 1) + +#define PROC_SELFSET_VMRSRCOWNER 3 + #ifdef XNU_KERNEL_PRIVATE #ifndef pshmnode struct pshmnode; diff --git a/bsd/sys/proc_internal.h b/bsd/sys/proc_internal.h index 52e4197dd..26b91b3cd 100644 --- a/bsd/sys/proc_internal.h +++ b/bsd/sys/proc_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -184,6 +184,14 @@ struct proc; #define PROC_NULL (struct proc *)0 +#define PROC_UPDATE_CREDS_ONPROC(p) { \ + p->p_uid = kauth_cred_getuid(p->p_ucred); \ + p->p_gid = kauth_cred_getgid(p->p_ucred); \ + p->p_ruid = kauth_cred_getruid(p->p_ucred); \ + p->p_rgid = kauth_cred_getrgid(p->p_ucred); \ + p->p_svuid = kauth_cred_getsvuid(p->p_ucred); \ + p->p_svgid = kauth_cred_getsvgid(p->p_ucred); \ + } /* * Description of a process. * @@ -203,6 +211,13 @@ struct proc { struct proc * p_pptr; /* Pointer to parent process.(LL) */ pid_t p_ppid; /* process's parent pid number */ pid_t p_pgrpid; /* process group id of the process (LL)*/ + uid_t p_uid; + gid_t p_gid; + uid_t p_ruid; + gid_t p_rgid; + uid_t p_svuid; + gid_t p_svgid; + uint64_t p_uniqueid; /* process unique ID */ lck_mtx_t p_mlock; /* mutex lock for proc */ @@ -281,6 +296,7 @@ struct proc { lck_mtx_t p_dtrace_sprlock; /* sun proc lock emulation */ int p_dtrace_probes; /* (PL) are there probes for this proc? */ u_int p_dtrace_count; /* (sprlock) number of DTrace tracepoints */ + uint8_t p_dtrace_stop; /* indicates a DTrace-desired stop */ struct dtrace_ptss_page* p_dtrace_ptss_pages; /* (sprlock) list of user ptss pages */ struct dtrace_ptss_page_entry* p_dtrace_ptss_free_list; /* (atomic) list of individual ptss entries */ struct dtrace_helpers* p_dtrace_helpers; /* (dtrace_lock) DTrace per-proc private */ @@ -314,7 +330,9 @@ struct proc { char p_name[(2*MAXCOMLEN)+1]; /* PL */ struct pgrp *p_pgrp; /* Pointer to process group. (LL) */ +#if CONFIG_EMBEDDED int p_iopol_disk; /* disk I/O policy (PL) */ +#endif /* CONFIG_EMBEDDED */ uint32_t p_csflags; /* flags for codesign (PL) */ uint32_t p_pcaction; /* action for process control on starvation */ uint8_t p_uuid[16]; /* from LC_UUID load command */ @@ -330,6 +348,7 @@ struct proc { struct klist p_klist; /* knote list (PL ?)*/ struct rusage *p_ru; /* Exit information.
(PL) */ + int p_sigwaitcnt; thread_t p_signalholder; thread_t p_transholder; @@ -408,10 +427,13 @@ struct proc { #define P_LLIMWAIT 0x00040000 #define P_LWAITED 0x00080000 #define P_LINSIGNAL 0x00100000 -#define P_LSIGNALWAIT 0x00200000 +#define P_UNUSED 0x00200000 /* Unused */ #define P_LRAGE_VNODES 0x00400000 #define P_LREGISTER 0x00800000 /* thread start fns registered */ +#if CONFIG_EMBEDDED #define P_LBACKGROUND 0x01000000 +#endif /* CONFIG_EMBEDDED */ +#define P_LVMRSRCOWNER 0x02000000 /* can handle the resource ownership of VM */ /* Process control state for resource starvation */ #define P_PCTHROTTLE 1 @@ -426,7 +448,7 @@ struct proc { #define PROC_SETACTION_STATE(p) (p->p_pcaction = (PROC_CONTROL_STATE(p) | (PROC_CONTROL_STATE(p) << 16))) #define PROC_RESETACTION_STATE(p) (p->p_pcaction = PROC_CONTROL_STATE(p)) -/* advisory flags in the proc */ +/* additional process flags */ #define P_LADVLOCK 0x01 /* defns for proc_iterate */ @@ -580,10 +602,10 @@ extern lck_mtx_t * proc_list_mlock; extern lck_mtx_t * proc_klist_mlock; #define BSD_SIMUL_EXECS 33 /* 32, allow for rounding */ -#define BSD_PAGABLE_MAP_SIZE (BSD_SIMUL_EXECS * (NCARGS + PAGE_SIZE)) -__private_extern__ int execargs_cache_size; -__private_extern__ int execargs_free_count; -__private_extern__ vm_offset_t * execargs_cache; +#define BSD_PAGEABLE_SIZE_PER_EXEC (NCARGS + PAGE_SIZE + PAGE_SIZE) /* page for apple vars, page for executable header */ +extern int execargs_cache_size; +extern int execargs_free_count; +extern vm_offset_t * execargs_cache; #define SESS_LEADER(p, sessp) ((sessp)->s_leader == (p)) @@ -611,9 +633,11 @@ extern LIST_HEAD(sesshashhead, session) *sesshashtbl; extern u_long sesshash; extern lck_grp_t * proc_lck_grp; +#if CONFIG_FINE_LOCK_GROUPS extern lck_grp_t * proc_mlock_grp; extern lck_grp_t * proc_fdmlock_grp; extern lck_grp_t * proc_slock_grp; +#endif extern lck_grp_attr_t * proc_lck_grp_attr; extern lck_attr_t * proc_lck_attr; @@ -638,6 +662,9 @@ __private_extern__ int proc_core_name(const char *name, uid_t uid, pid_t pid, char *cr_name, size_t cr_name_len); extern int isinferior(struct proc *, struct proc *); __private_extern__ struct proc *pzfind(pid_t); /* Find zombie by id. */ +__private_extern__ struct proc *proc_find_zombref(pid_t); /* Find zombie by id. */ +__private_extern__ void proc_drop_zombref(struct proc * p); /* Drop reference on a zombie. */ + extern struct lctx *lcfind(pid_t); /* Find a login context by id */ extern struct lctx *lccreate(void); /* Create a new login context */ @@ -699,6 +726,7 @@ void proc_transcommit(struct proc *, int locked); void proc_transend(struct proc *, int locked); int proc_transwait(struct proc *, int locked); void proc_rele_locked(struct proc * p); +struct proc *proc_ref_locked(struct proc * p); void proc_knote(struct proc * p, long hint); void proc_knote_drain(struct proc *p); void workqueue_init_lock(proc_t p); diff --git a/bsd/sys/process_policy.h b/bsd/sys/process_policy.h new file mode 100644 index 000000000..19f3c2617 --- /dev/null +++ b/bsd/sys/process_policy.h @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2010 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License.
The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _SYS_PROCESS_POLICY_H +#define _SYS_PROCESS_POLICY_H + +#include <sys/cdefs.h> +#include <sys/param.h> +#include <sys/types.h> +#include <stdint.h> + +__BEGIN_DECLS + +/* defns of scope */ +#define PROC_POLICY_SCOPE_PROCESS 1 /* the policy setting is for process-wide effect */ +#define PROC_POLICY_SCOPE_THREAD 2 /* the policy setting is for thread inside a proc */ + +/* defns of actions with no attributes */ +#define PROC_POLICY_ACTION_APPLY 1 /* enforce the set policy */ +#define PROC_POLICY_ACTION_RESTORE 2 /* revert the applied action back */ +#define PROC_POLICY_ACTION_DENYINHERIT 3 /* set for no inheritance of the specified policy */ +#define PROC_POLICY_ACTION_DENYSELFSET 4 /* deny the process from setting its own policy */ +#define PROC_POLICY_ACTION_ENABLE 5 /* enable policy and its actions */ +#define PROC_POLICY_ACTION_DISABLE 6 /* disable policy and its actions, also clears any actions that have already happened */ /* defns of actions with attributes */ +#define PROC_POLICY_ACTION_SET 10 /* set the policy attributes */ +#define PROC_POLICY_ACTION_GET 11 /* get the policy attributes */ +#define PROC_POLICY_ACTION_ADD 12 /* add a policy attribute */ +#define PROC_POLICY_ACTION_REMOVE 13 /* remove a policy attribute */ + +/* policies */ +#define PROC_POLICY_NONE 0 +#define PROC_POLICY_BACKGROUND 1 /* darwin background policy */ +#define PROC_POLICY_HARDWARE_ACCESS 2 /* access to various hardware */ +#define PROC_POLICY_RESOURCE_STARVATION 3 /* behavior on resource starvation */ +#define PROC_POLICY_RESOURCE_USAGE 4 /* behavior on resource consumption */ +#define PROC_POLICY_RESERVED 5 /* reserved */ +#define PROC_POLICY_APPTYPE 6 /* application type specific handling */ + +/* sub policies for background policy */ +#define PROC_POLICY_BG_NONE 0 /* none */ +#define PROC_POLICY_BG_LOWCPUPRI 1 /* Low cpu priority */ +#define PROC_POLICY_BG_DISKTHROTTLE 2 /* disk accesses throttled */ +#define PROC_POLICY_BG_NETTHROTTLE 4 /* network accesses throttled */ +#define PROC_POLICY_BG_GPUDENY 8 /* no access to GPU */ +#if CONFIG_EMBEDDED +#define PROC_POLICY_BG_ALL 0x0F +#else /* CONFIG_EMBEDDED */ +#define PROC_POLICY_BG_ALL 0x07 +#endif /* CONFIG_EMBEDDED */ +#define PROC_POLICY_BG_DEFAULT PROC_POLICY_BG_ALL + +/* sub policies for hardware */ +#define PROC_POLICY_HWACCESS_NONE 0 +#define PROC_POLICY_HWACCESS_DISK 1 /* disk access */ +#define PROC_POLICY_HWACCESS_GPU 2 /* GPU access */ +#define PROC_POLICY_HWACCESS_NETWORK 3 /* network access */ +#define PROC_POLICY_HWACCESS_CPU 4 /* cpu access */
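The scope, action, and policy constants above all feed one entry point, process_policy(), whose prototype is declared near the end of this header. A minimal userspace sketch, assuming this private SPI is callable from the process (the syscall wrapper, the include path, and passing a NULL attribute pointer for attribute-less actions are assumptions, not a stable interface):

#include <sys/types.h>
#include <sys/process_policy.h>	/* private header; install path is an assumption */

/* Apply the full Darwin background policy (low CPU priority plus disk and
 * network throttling) to one target process, process-wide. */
static int
apply_background_policy(pid_t pid)
{
	return process_policy(PROC_POLICY_SCOPE_PROCESS,	/* process-wide effect */
	    PROC_POLICY_ACTION_APPLY,	/* enforce the set policy */
	    PROC_POLICY_BACKGROUND,	/* darwin background policy */
	    PROC_POLICY_BG_ALL,		/* all background sub-policies */
	    NULL,			/* APPLY takes no attributes (assumption) */
	    pid, 0);			/* target_threadid unused at process scope */
}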
+ +/* attribute values for disk hardware access, a bit different, as it should reflect IOPOL_XXX */ +#define PROC_POLICY_DISKACC_NONE 0 +#define PROC_POLICY_DISKACC_NORMAL 1 /* normal access to the disk */ +#define PROC_POLICY_DISKACC_PASSIVE 2 /* treat the I/Os as passive */ +#define PROC_POLICY_DISKACC_THROTTLE 3 /* throttle the disk IOs */ +#define PROC_POLICY_DISKACC_DEFAULT 0 + +/* attribute values for GPU hardware access */ +#define PROC_POLICY_GPUACC_NONE 0 +#define PROC_POLICY_GPUACC_FULLACCESS 0 /* complete access to the GPU */ +#define PROC_POLICY_GPUACC_DENYACCESS 1 /* deny any access to the GPU */ +#define PROC_POLICY_GPUACC_DEFAULT 0 /* default is complete access */ + +/* attribute values for network hardware access */ +#define PROC_POLICY_NETACC_NONE 0 +#define PROC_POLICY_NETACC_NORMAL 0 /* complete access to the network */ +#define PROC_POLICY_NETACC_THROTTLE 1 /* throttle access to network */ +#define PROC_POLICY_NETACC_DEFAULT 0 /* default is complete access */ + +/* attribute values for cpu hardware access */ +#define PROC_POLICY_CPUACC_NONE 0 +#define PROC_POLICY_CPUACC_ALL 0 /* access to all available cpus */ +#define PROC_POLICY_CPUACC_ONE 1 /* access to only one available cpu */ +#define PROC_POLICY_CPUACC_LLCACHE 2 /* access to only one last level cache */ +#define PROC_POLICY_CPUACC_DEFAULT 0 /* default is access to all cpus */ + + +/* System Resource management (i.e. usage and starvation related) definitions */ + +/* sub policies for resource starvation */ +#define PROC_POLICY_RS_NONE 0 +#define PROC_POLICY_RS_VIRTUALMEM 1 /* virtual memory starvation */ + +/* sub policies for resource usage */ +#define PROC_POLICY_RUSAGE_NONE 0 +#define PROC_POLICY_RUSAGE_WIREDMEM 1 /* wired memory usages */ +#define PROC_POLICY_RUSAGE_VIRTMEM 2 /* virtual memory usage */ +#define PROC_POLICY_RUSAGE_CPU 3 /* amount of cpu usage */ +#define PROC_POLICY_RUSAGE_DISK 4 /* amount of disk usage */ +#define PROC_POLICY_RUSAGE_NETWORK 5 /* amount of network usage */ +#define PROC_POLICY_RUSAGE_POWER 6 /* amount of power/battery consumption */ + +/* attribute values for the resource usage and low resource */ +#define PROC_POLICY_RSRCACT_NONE 0 +#define PROC_POLICY_RSRCACT_THROTTLE 1 /* throttle on resource condition */ +#define PROC_POLICY_RSRCACT_SUSPEND 2 /* suspend on resource condition */ +#define PROC_POLICY_RSRCACT_TERMINATE 3 /* kill on resource condition */ +#define PROC_POLICY_RSRCACT_NOTIFY 4 /* send kqueue notification */ + + +/* type of resource for kqueue notification */ +#define PROC_POLICY_RSRTYPE_CPU 1 +#define PROC_POLICY_RSRTYPE_WIREDMEM 2 +#define PROC_POLICY_RSRTYPE_VIRTUALMEM 4 +#define PROC_POLICY_RSRTYPE_DISK 8 +#define PROC_POLICY_RSRTYPE_NETWORK 0x010 +#define PROC_POLICY_RSRTYPE_POWER 0x20 + + +typedef struct proc_policy_attribute { + uint32_t ppattr_attribute; /* the policy attribute to be modified or returned */ + uint32_t ppattr_resv; /* pad field */ + uint64_t ppattr_value1; /* 64bit policy specific attribute */ + uint64_t ppattr_value2; /* 64bit policy specific attribute */ + uint64_t ppattr_value3; /* 64bit policy specific attribute */ + uint64_t ppattr_resv1[4]; /* reserved for future use */ +} proc_policy_attribute_t; + +typedef struct proc_policy_cpuusage_attr { + uint32_t ppattr_cpu_attr; /* specified action as in PROC_POLICY_RSRCACT_xx */ + uint32_t ppattr_cpu_percentage; /* percentage of interval */ + uint64_t ppattr_cpu_attr_interval; /* 64bit interval in nsecs */ + uint64_t ppattr_cpu_attr_deadline; /* 64bit deadline in nsecs */ +}
proc_policy_cpuusage_attr_t; + + +/* sub policies for PROC_POLICY_APPTYPE */ +#define PROC_POLICY_OSX_APPTYPE_NONE 0 +#define PROC_POLICY_OSX_APPTYPE_TAL 1 /* TAL-based launch */ +#define PROC_POLICY_OSX_APPTYPE_WIDGET 2 /* for dashboard client */ +#define PROC_POLICY_OSX_APPTYPE_DASHCLIENT 2 /* rename to move away from widget */ +#define PROC_POLICY_IOS_APPTYPE 3 /* ios specific handling */ +#define PROC_POLICY_IOS_NONUITYPE 4 /* ios non graphics type */ + +#ifndef KERNEL +int process_policy(int scope, int action, int policy, int policy_subtype, proc_policy_attribute_t * attrp, pid_t target_pid, uint64_t target_threadid); +#endif /* !KERNEL */ + + +__END_DECLS + +#endif /* _SYS_PROCESS_POLICY_H */ diff --git a/bsd/sys/protosw.h b/bsd/sys/protosw.h index bdf36a317..75b6d6b28 100644 --- a/bsd/sys/protosw.h +++ b/bsd/sys/protosw.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2009 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -70,7 +70,12 @@ #include <sys/cdefs.h> #define PR_SLOWHZ 2 /* 2 slow timeouts per second */ +#ifndef __APPLE__ +/* + * See rdar://7617868: pr_fasttimo was removed; use your own timer or pr_slowtimo instead + */ #define PR_FASTHZ 5 /* 5 fast timeouts per second */ +#endif #ifdef PRIVATE @@ -105,7 +110,8 @@ struct socket_filter; * The userreq routine interfaces protocols to the system and is * described below. */ - + +#include <sys/socket.h> #include <sys/socketvar.h> #include <sys/queue.h> #ifdef KERNEL @@ -132,8 +138,12 @@ struct protosw { void *pr_ousrreq; /* utility hooks */ void (*pr_init)(void); /* initialization hook */ +#if __APPLE__ + void (*pr_unused)(void); /* placeholder - fasttimo is removed */ +#else void (*pr_fasttimo)(void); /* fast timeout (200ms) */ +#endif void (*pr_slowtimo)(void); /* slow timeout (500ms) */ void (*pr_drain)(void); @@ -408,6 +418,7 @@ char *prcorequests[] = { __BEGIN_DECLS void domaininit(void) __attribute__((section("__TEXT, initcode"))); +void domainfin(void) __attribute__((section("__TEXT, fincode"))); void pfctlinput(int, struct sockaddr *); void pfctlinput2(int, struct sockaddr *, void *); @@ -418,6 +429,7 @@ struct protosw *pffindtype(int family, int type); extern int net_add_proto(struct protosw *, struct domain *); extern int net_del_proto(int, int, struct domain *); +extern u_int64_t net_uptime(void); __END_DECLS /* Temp hack to link static domains together */ diff --git a/bsd/sys/pthread_internal.h b/bsd/sys/pthread_internal.h index 7d0cfae29..6cc80f5f3 100644 --- a/bsd/sys/pthread_internal.h +++ b/bsd/sys/pthread_internal.h @@ -29,12 +29,34 @@ #ifndef _SYS_PTHREAD_INTERNAL_H_ #define _SYS_PTHREAD_INTERNAL_H_ -#undef pthread_mutexattr_t; - +#include <sys/user.h> #include <kern/thread_call.h> +struct ksyn_waitq_element { + TAILQ_ENTRY(ksyn_waitq_element) kwe_list; /* link to other list members */ + void * kwe_kwqqueue; /* queue blocked on */ + uint32_t kwe_flags; /* flags */ + uint32_t kwe_lockseq; /* the sequence of the entry */ + uint32_t kwe_count; /* upper bound on number of matches still pending */ + uint32_t kwe_psynchretval; /* thread retval */ + void *kwe_uth; /* uthread */ +}; +typedef struct ksyn_waitq_element * ksyn_waitq_element_t; + +/* kwe_flags defns */ +#define KWE_THREAD_INWAIT 1 +#define KWE_THREAD_PREPOST 2 +#define KWE_THREAD_BROADCAST 4 + + #define WORKITEM_SIZE 64 -#define WORKQUEUE_NUMPRIOS 3 + +#define WORKQUEUE_HIGH_PRIOQUEUE 0 /* high priority queue */ +#define WORKQUEUE_DEFAULT_PRIOQUEUE 1 /* default priority
queue */ +#define WORKQUEUE_LOW_PRIOQUEUE 2 /* low priority queue */ +#define WORKQUEUE_BG_PRIOQUEUE 3 /* background priority queue */ + +#define WORKQUEUE_NUMPRIOS 4 #define WORKQUEUE_OVERCOMMIT 0x10000 @@ -57,6 +79,8 @@ struct threadlist { #define TH_LIST_SUSPENDED 0x08 #define TH_LIST_BUSY 0x10 #define TH_LIST_NEED_WAKEUP 0x20 +#define TH_LIST_CONSTRAINED 0x40 + struct workitem { TAILQ_ENTRY(workitem) wi_entry; @@ -83,6 +107,7 @@ struct workqueue { uint32_t wq_timer_interval; uint32_t wq_affinity_max; uint32_t wq_threads_scheduled; + uint32_t wq_constrained_threads_scheduled; uint32_t wq_nthreads; uint32_t wq_thidlecount; uint32_t wq_reqconc[WORKQUEUE_NUMPRIOS]; /* requested concurrency for each priority level */ @@ -100,6 +125,8 @@ struct workqueue { #define WQL_ATIMER_BUSY 0x01 #define WQL_ATIMER_WAITING 0x02 +#define WQL_EXCEEDED_CONSTRAINED_THREAD_LIMIT 0x04 +#define WQL_EXCEEDED_TOTAL_THREAD_LIMIT 0x08 #define WQ_VECT_SET_BIT(vector, bit) \ @@ -121,7 +148,7 @@ struct workqueue { /* workq_kernreturn commands */ #define WQOPS_QUEUE_ADD 1 -#define WQOPS_QUEUE_REMOVE 2 +#define WQOPS_QUEUE_REMOVE_OBSOLETE 2 #define WQOPS_THREAD_RETURN 4 #define WQOPS_THREAD_SETCONC 8 @@ -129,12 +156,12 @@ struct workqueue { #define PTH_DEFAULT_GUARDSIZE 4*1024 #define MAX_PTHREAD_SIZE 64*1024 -void workqueue_exit(struct proc *); - -void pthread_init(void); extern lck_grp_attr_t *pthread_lck_grp_attr; extern lck_grp_t *pthread_lck_grp; extern lck_attr_t *pthread_lck_attr; +void workqueue_exit(struct proc *); +void pthread_init(void); +void psynch_zoneinit(void); #endif /* _SYS_PTHREAD_INTERNAL_H_ */ diff --git a/bsd/sys/queue.h b/bsd/sys/queue.h index a8e96cd4c..9ccb63e74 100644 --- a/bsd/sys/queue.h +++ b/bsd/sys/queue.h @@ -133,8 +133,11 @@ * _INSERT_AFTER + + + + + * _INSERT_TAIL - - + + + * _CONCAT - - + + - + * _REMOVE_AFTER + - + - - * _REMOVE_HEAD + - + - - + * _REMOVE_HEAD_UNTIL - - + - - * _REMOVE + + + + + + * _SWAP - + + + - * */ #ifdef QUEUE_MACRO_DEBUG @@ -232,12 +235,16 @@ struct { \ struct type *curelm = SLIST_FIRST((head)); \ while (SLIST_NEXT(curelm, field) != (elm)) \ curelm = SLIST_NEXT(curelm, field); \ - SLIST_NEXT(curelm, field) = \ - SLIST_NEXT(SLIST_NEXT(curelm, field), field); \ + SLIST_REMOVE_AFTER(curelm, field); \ } \ TRASHIT((elm)->field.sle_next); \ } while (0) +#define SLIST_REMOVE_AFTER(elm, field) do { \ + SLIST_NEXT(elm, field) = \ + SLIST_NEXT(SLIST_NEXT(elm, field), field); \ +} while (0) + #define SLIST_REMOVE_HEAD(head, field) do { \ SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \ } while (0) @@ -324,9 +331,7 @@ struct { \ struct type *curelm = STAILQ_FIRST((head)); \ while (STAILQ_NEXT(curelm, field) != (elm)) \ curelm = STAILQ_NEXT(curelm, field); \ - if ((STAILQ_NEXT(curelm, field) = \ - STAILQ_NEXT(STAILQ_NEXT(curelm, field), field)) == NULL)\ - (head)->stqh_last = &STAILQ_NEXT((curelm), field);\ + STAILQ_REMOVE_AFTER(head, curelm, field); \ } \ TRASHIT((elm)->field.stqe_next); \ } while (0) @@ -337,11 +342,31 @@ struct { \ (head)->stqh_last = &STAILQ_FIRST((head)); \ } while (0) -#define STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do { \ - if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL) \ - (head)->stqh_last = &STAILQ_FIRST((head)); \ +#define STAILQ_REMOVE_HEAD_UNTIL(head, elm, field) do { \ + if ((STAILQ_FIRST((head)) = STAILQ_NEXT((elm), field)) == NULL) \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ } while (0) +#define STAILQ_REMOVE_AFTER(head, elm, field) do { \ + if ((STAILQ_NEXT(elm, field) = \ + 
STAILQ_NEXT(STAILQ_NEXT(elm, field), field)) == NULL) \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ +} while (0) + +#define STAILQ_SWAP(head1, head2, type) do { \ + struct type *swap_first = STAILQ_FIRST(head1); \ + struct type **swap_last = (head1)->stqh_last; \ + STAILQ_FIRST(head1) = STAILQ_FIRST(head2); \ + (head1)->stqh_last = (head2)->stqh_last; \ + STAILQ_FIRST(head2) = swap_first; \ + (head2)->stqh_last = swap_last; \ + if (STAILQ_EMPTY(head1)) \ + (head1)->stqh_last = &STAILQ_FIRST(head1); \ + if (STAILQ_EMPTY(head2)) \ + (head2)->stqh_last = &STAILQ_FIRST(head2); \ +} while (0) + + /* * List declarations. */ @@ -444,6 +469,16 @@ struct { \ TRASHIT((elm)->field.le_prev); \ } while (0) +#define LIST_SWAP(head1, head2, type, field) do { \ + struct type *swap_tmp = LIST_FIRST((head1)); \ + LIST_FIRST((head1)) = LIST_FIRST((head2)); \ + LIST_FIRST((head2)) = swap_tmp; \ + if ((swap_tmp = LIST_FIRST((head1))) != NULL) \ + swap_tmp->field.le_prev = &LIST_FIRST((head1)); \ + if ((swap_tmp = LIST_FIRST((head2))) != NULL) \ + swap_tmp->field.le_prev = &LIST_FIRST((head2)); \ +} while (0) + /* * Tail queue declarations. */ @@ -574,6 +609,23 @@ struct { \ QMD_TRACE_ELEM(&(elm)->field); \ } while (0) +#define TAILQ_SWAP(head1, head2, type, field) do { \ + struct type *swap_first = (head1)->tqh_first; \ + struct type **swap_last = (head1)->tqh_last; \ + (head1)->tqh_first = (head2)->tqh_first; \ + (head1)->tqh_last = (head2)->tqh_last; \ + (head2)->tqh_first = swap_first; \ + (head2)->tqh_last = swap_last; \ + if ((swap_first = (head1)->tqh_first) != NULL) \ + swap_first->field.tqe_prev = &(head1)->tqh_first; \ + else \ + (head1)->tqh_last = &(head1)->tqh_first; \ + if ((swap_first = (head2)->tqh_first) != NULL) \ + swap_first->field.tqe_prev = &(head2)->tqh_first; \ + else \ + (head2)->tqh_last = &(head2)->tqh_first; \ +} while (0) + /* * Circular queue definitions. */ diff --git a/bsd/sys/reboot.h b/bsd/sys/reboot.h index f79f2a9e2..6c64e53b8 100644 --- a/bsd/sys/reboot.h +++ b/bsd/sys/reboot.h @@ -135,7 +135,7 @@ #include <machine/reboot.h> __BEGIN_DECLS -void boot(int, int, char *); +int boot(int, int, char *); __END_DECLS #define PROC_SHUTDOWN_LOG "/var/log/kernel-shutdown.log" diff --git a/bsd/sys/resource.h b/bsd/sys/resource.h index 72c969c12..fbe8e6266 100644 --- a/bsd/sys/resource.h +++ b/bsd/sys/resource.h @@ -68,6 +68,9 @@ #include <sys/cdefs.h> #include <sys/_types.h> +#ifndef KERNEL +#include <Availability.h> +#endif /* [XSI] The timeval structure shall be defined as described in * <sys/time.h> @@ -121,6 +124,12 @@ typedef __uint64_t rlim_t; */ #define PRIO_DARWIN_BG 0x1000 +/* + * use PRIO_DARWIN_NONUI to restrict a process's ability to make calls to + * the GPU. 
+ */ +#define PRIO_DARWIN_NONUI 0x1001 + #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ @@ -305,13 +314,13 @@ struct _iopol_param_t { __BEGIN_DECLS int getpriority(int, id_t); #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -int getiopolicy_np(int, int); +int getiopolicy_np(int, int) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); #endif /* !_POSIX_C_SOURCE || _DARWIN_C_SOURCE */ int getrlimit(int, struct rlimit *) __DARWIN_ALIAS(getrlimit); int getrusage(int, struct rusage *); int setpriority(int, id_t, int); #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -int setiopolicy_np(int, int, int); +int setiopolicy_np(int, int, int) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); #endif /* !_POSIX_C_SOURCE || _DARWIN_C_SOURCE */ int setrlimit(int, const struct rlimit *) __DARWIN_ALIAS(setrlimit); __END_DECLS diff --git a/bsd/sys/sdt_impl.h b/bsd/sys/sdt_impl.h index cbd117b61..e9531067c 100644 --- a/bsd/sys/sdt_impl.h +++ b/bsd/sys/sdt_impl.h @@ -74,6 +74,9 @@ struct module { }; extern int sdt_invop(uintptr_t, uintptr_t *, uintptr_t); +#if defined (__APPLE__) +extern uint64_t sdt_getarg(void *, dtrace_id_t, void *, int, int); +#endif /* __APPLE__ */ void sdt_provide_module(void *, struct modctl *); void sdt_init(void); @@ -85,8 +88,6 @@ extern int sdt_probetab_mask; #if defined(__i386__) || defined(__x86_64__) typedef uint8_t sdt_instr_t; -#elif defined(__ppc__) || defined(__ppc64__) -typedef uint32_t sdt_instr_t; #else #error Unknown implementation #endif diff --git a/bsd/sys/signal.h b/bsd/sys/signal.h index faa6fcc1d..d2e40fd76 100644 --- a/bsd/sys/signal.h +++ b/bsd/sys/signal.h @@ -145,12 +145,6 @@ #define __need_mcontext_t #define __need_stack_t #define __need_ucontext_t -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -#if defined(__ppc__) || defined(__ppc64__) -#define __need_mcontext64_t -#define __need_ucontext64_t -#endif /* __ppc__ || __ppc64__ */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ #include <sys/_structs.h> #ifndef _PID_T diff --git a/bsd/sys/socket.h b/bsd/sys/socket.h index 6b37dfced..3fc35997c 100644 --- a/bsd/sys/socket.h +++ b/bsd/sys/socket.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -76,6 +76,14 @@ #include <sys/cdefs.h> #include <machine/_param.h> +#ifdef PRIVATE +#include <sys/param.h> +#endif /* PRIVATE */ + +#ifndef KERNEL +#include <Availability.h> +#endif + /* * Definitions related to sockets: types, address families, options. 
*/ @@ -130,6 +138,23 @@ struct iovec { size_t iov_len; /* [XSI] Size of region iov_base points to */ }; #endif + +#ifdef PRIVATE +#define SO_TCDBG_PID 0x01 /* Set/get traffic class for PID */ +#define SO_TCDBG_PNAME 0x02 /* Set/get traffic class for processes of that name */ +#define SO_TCDBG_PURGE 0x04 /* Purge entries for unused PIDs */ +#define SO_TCDBG_FLUSH 0x08 /* Flush all entries */ +#define SO_TCDBG_COUNT 0x10 /* Get count of entries */ +#define SO_TCDBG_LIST 0x20 /* List entries */ + +struct so_tcdbg { + u_int32_t so_tcdbg_cmd; + int32_t so_tcdbg_tclass; + u_int32_t so_tcdbg_count; + pid_t so_tcdbg_pid; + char so_tcdbg_pname[MAXCOMLEN + 1]; +}; +#endif /* PRIVATE */ /* * Types @@ -161,6 +186,7 @@ struct iovec { #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) #define SO_REUSEPORT 0x0200 /* allow local address & port reuse */ #define SO_TIMESTAMP 0x0400 /* timestamp received dgram traffic */ +#define SO_TIMESTAMP_MONOTONIC 0x0800 /* Monotonically increasing timestamp on rcvd dgram */ #ifndef __APPLE__ #define SO_ACCEPTFILTER 0x1000 /* there is an accept filter */ #else @@ -184,6 +210,8 @@ struct iovec { #define SO_TYPE 0x1008 /* get socket type */ #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) /*efine SO_PRIVSTATE 0x1009 get/deny privileged state */ +#define SO_LABEL 0x1010 /* socket's MAC label */ +#define SO_PEERLABEL 0x1011 /* socket's peer MAC label */ #ifdef __APPLE__ #define SO_NREAD 0x1020 /* APPLE: get 1st-packet byte count */ #define SO_NKE 0x1021 /* APPLE: Install socket-level NKE */ @@ -203,16 +231,26 @@ struct iovec { #define SO_RANDOMPORT 0x1082 /* APPLE: request local port randomization */ #define SO_NP_EXTENSIONS 0x1083 /* To turn off some POSIX behavior */ #endif + #ifdef PRIVATE #define SO_EXECPATH 0x1085 /* Application Firewall Socket option */ -#define SO_TRAFFIC_CLASS 0x1086 /* Traffic class */ +#define SO_TRAFFIC_CLASS 0x1086 /* Traffic class (int)*/ #define SO_TC_BE 0 /* Best effort, normal */ #define SO_TC_BK 1 /* Background, low priority or bulk traffic */ #define SO_TC_VI 2 /* Interactive video, constant bit rate, low latency */ #define SO_TC_VO 3 /* Interactive voice, constant bit rate, lowest latency */ -#endif -#define SO_LABEL 0x1010 /* socket's MAC label */ -#define SO_PEERLABEL 0x1011 /* socket's peer MAC label */ +#define SO_TC_MAX 4 /* Max traffic class value */ + +/* Background socket configuration flags */ +#define TRAFFIC_MGT_SO_BACKGROUND 0x0001 /* background socket */ +#define TRAFFIC_MGT_TCP_RECVBG 0x0002 /* Only TCP sockets, receiver throttling */ + +#define SO_RECV_TRAFFIC_CLASS 0x1087 /* Receive traffic class (bool)*/ +#define SO_TRAFFIC_CLASS_DBG 0x1088 /* Debug traffic class (struct so_tcdbg) */ +#define SO_TRAFFIC_CLASS_STATS 0x1089 /* Traffic class statistics */ +#define SO_DEFUNCTOK 0x1100 /* can be defunct'd */ +#define SO_ISDEFUNCT 0x1101 /* get defunct status */ +#endif /* PRIVATE */ #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ /* @@ -429,6 +467,9 @@ struct sockaddr_storage { */ #define PF_VLAN ((uint32_t)0x766c616e) /* 'vlan' */ #define PF_BOND ((uint32_t)0x626f6e64) /* 'bond' */ +#ifdef KERNEL_PRIVATE +#define PF_BRIDGE ((uint32_t)0x62726467) /* 'brdg' */ +#endif /* KERNEL_PRIVATE */ /* * Definitions for network related sysctl, CTL_NET. @@ -492,14 +533,18 @@ struct sockaddr_storage { * Fifth: type of info, defined below * Sixth: flag(s) to mask with for NET_RT_FLAGS */ -#define NET_RT_DUMP 1 /* dump; may limit to a.f. */ -#define NET_RT_FLAGS 2 /* by flags, e.g. 
RESOLVING */ -#define NET_RT_IFLIST 3 /* survey interface list */ -#define NET_RT_STAT 4 /* routing statistics */ -#define NET_RT_TRASH 5 /* routes not in table but not freed */ -#define NET_RT_IFLIST2 6 /* interface list with addresses */ -#define NET_RT_DUMP2 7 /* dump; may limit to a.f. */ -#define NET_RT_MAXID 8 +#define NET_RT_DUMP 1 /* dump; may limit to a.f. */ +#define NET_RT_FLAGS 2 /* by flags, e.g. RESOLVING */ +#define NET_RT_IFLIST 3 /* survey interface list */ +#define NET_RT_STAT 4 /* routing statistics */ +#define NET_RT_TRASH 5 /* routes not in table but not freed */ +#define NET_RT_IFLIST2 6 /* interface list with addresses */ +#define NET_RT_DUMP2 7 /* dump; may limit to a.f. */ +#ifdef PRIVATE +#define NET_RT_DUMPX 8 /* private */ +#define NET_RT_DUMPX_FLAGS 9 /* private */ +#endif /* PRIVATE */ +#define NET_RT_MAXID 10 #endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ #ifdef KERNEL_PRIVATE @@ -512,6 +557,8 @@ struct sockaddr_storage { { "trash", CTLTYPE_INT }, \ { "iflist2", CTLTYPE_STRUCT }, \ { "dump2", CTLTYPE_STRUCT }, \ + { "dumpx", CTLTYPE_STRUCT }, \ + { "dumpx_flags", CTLTYPE_STRUCT }, \ } #endif /* KERNEL_PRIVATE */ @@ -595,7 +642,13 @@ struct user32_msghdr { #define MSG_DONTWAIT 0x80 /* this message should be nonblocking */ #define MSG_EOF 0x100 /* data completes connection */ #ifdef __APPLE__ +#ifndef PRIVATE +#ifdef __APPLE_API_OBSOLETE +#define MSG_WAITSTREAM 0x200 /* wait up to full request.. may return partial */ +#endif +#else #define MSG_WAITSTREAM 0x200 /* wait up to full request.. may return partial */ +#endif #define MSG_FLUSH 0x400 /* Start of 'hold' seq; dump so_temp */ #define MSG_HOLD 0x800 /* Hold frag in so_temp */ #define MSG_SEND 0x1000 /* Send the packet in so_temp */ @@ -680,7 +733,7 @@ struct cmsgcred { ((unsigned char *)(mhdr)->msg_control + \ (mhdr)->msg_controllen)) ? 
\ (struct cmsghdr *)0L /* NULL */ : \ - (struct cmsghdr *)((unsigned char *)(cmsg) + \ + (struct cmsghdr *)(void *)((unsigned char *)(cmsg) + \ __DARWIN_ALIGN32((__uint32_t)(cmsg)->cmsg_len)))) #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) @@ -694,10 +747,11 @@ struct cmsgcred { #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ /* "Socket"-level control message types: */ -#define SCM_RIGHTS 0x01 /* access rights (array of int) */ +#define SCM_RIGHTS 0x01 /* access rights (array of int) */ #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -#define SCM_TIMESTAMP 0x02 /* timestamp (struct timeval) */ -#define SCM_CREDS 0x03 /* process creds (struct cmsgcred) */ +#define SCM_TIMESTAMP 0x02 /* timestamp (struct timeval) */ +#define SCM_CREDS 0x03 /* process creds (struct cmsgcred) */ +#define SCM_TIMESTAMP_MONOTONIC 0x04 /* timestamp (uint64_t) */ #ifdef KERNEL_PRIVATE /* @@ -792,7 +846,7 @@ ssize_t sendto(int, const void *, size_t, int, const struct sockaddr *, socklen_t) __DARWIN_ALIAS_C(sendto); int setsockopt(int, int, int, const void *, socklen_t); int shutdown(int, int); -int sockatmark(int); +int sockatmark(int) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); int socket(int, int, int); int socketpair(int, int, int, int *) __DARWIN_ALIAS(socketpair); @@ -804,7 +858,6 @@ int sendfile(int, int, off_t, off_t *, struct sf_hdtr *, int); void pfctlinput(int, struct sockaddr *); #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ __END_DECLS - #endif /* !KERNEL */ #ifdef KERNEL diff --git a/bsd/sys/socketvar.h b/bsd/sys/socketvar.h index 35560c65b..3c81716fe 100644 --- a/bsd/sys/socketvar.h +++ b/bsd/sys/socketvar.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2010 Apple Inc. All rights reserved. + * Copyright (c) 2000-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -108,6 +108,17 @@ extern char netio[], netcon[], netcls[]; typedef u_quad_t so_gen_t; +#ifdef PRIVATE +#define SO_TC_STATS_MAX 4 + +struct data_stats { + u_int64_t rxpackets; + u_int64_t rxbytes; + u_int64_t txpackets; + u_int64_t txbytes; +}; +#endif /* PRIVATE */ + #ifdef KERNEL_PRIVATE #ifndef __APPLE__ /* We don't support BSD style socket filters */ @@ -196,6 +207,7 @@ struct socket { void (*so_upcall)(struct socket *so, caddr_t arg, int waitf); caddr_t so_upcallarg; /* Arg for above */ uid_t so_uid; /* who opened the socket */ + gid_t so_gid; /* gid of whoever opened the socket */ /* NB: generation count must not be first; easiest to make it last. 
*/ so_gen_t so_gencnt; /* generation count */ #ifndef __APPLE__ @@ -220,7 +232,7 @@ struct socket { #define SOF_NOSIGPIPE 0x1 #define SOF_NOADDRAVAIL 0x2 /* EADDRNOTAVAIL if src addr is gone */ #define SOF_PCBCLEARING 0x4 /* pru_disconnect done; don't call pru_detach */ -#define SOF_DEFUNCT 0x8 /* accepted socket marked as inactive */ +#define SOF_DEFUNCT 0x8 /* socket marked as inactive */ #define SOF_CLOSEWAIT 0x10 /* blocked in close awaiting some events */ #define SOF_UPCALLINUSE 0x20 /* socket upcall is currently in progress */ #define SOF_REUSESHAREUID 0x40 /* Allows SO_REUSEADDR/SO_REUSEPORT for multiple so_uid */ @@ -233,6 +245,9 @@ struct socket { #define SOF_UPCALLCLOSEWAIT 0x800 /* block on close until an upcall returns */ #define SOF_BINDRANDOMPORT 0x1000 /* Request a randomized port number for the bind */ #define SOF_NPX_SETOPTSHUT 0x2000 /* Non POSIX extension to allow setsockopt(2) after shut down */ +#define SOF_RECV_TRAFFIC_CLASS 0x4000 /* Receive traffic class as ancillary data */ +#define SOF_NODEFUNCT 0x8000 /* socket cannot be defunct'd */ +#define SOF_INCOMP_INPROGRESS 0x10000 /* incomp socket still being processed */ int so_usecount; /* refcounting of socket use */; int so_retaincnt; u_int32_t so_filteruse; /* usecount for the socket filters */ @@ -252,10 +267,36 @@ struct socket { struct label *so_label; /* MAC label for socket */ struct label *so_peerlabel; /* cached MAC label for socket peer */ thread_t so_background_thread; /* thread that marked this socket background */ -#if PKT_PRIORITY int so_traffic_class; -#endif /* PKT_PRIORITY */ + + // last process to interact with this socket + u_int64_t last_upid; + pid_t last_pid; + + struct data_stats so_tc_stats[SO_TC_STATS_MAX]; }; + +/* Control message accessor in mbufs */ + +#define _MIN_NXT_CMSGHDR_PTR(cmsg) \ + ((char *)(cmsg) + \ + __DARWIN_ALIGN32((__uint32_t)(cmsg)->cmsg_len) + \ + __DARWIN_ALIGN32(sizeof(struct cmsghdr))) + +#define M_FIRST_CMSGHDR(m) \ + ((char *)(m) != (char *)0L && (size_t)(m)->m_len >= sizeof(struct cmsghdr) && \ + (socklen_t)(m)->m_len >= __DARWIN_ALIGN32(((struct cmsghdr *)(m)->m_data)->cmsg_len) ?\ + (struct cmsghdr *)(m)->m_data : \ + (struct cmsghdr *)0L) + +#define M_NXT_CMSGHDR(m, cmsg) \ + ((char *)(cmsg) == (char *)0L ? M_FIRST_CMSGHDR(m) : \ + _MIN_NXT_CMSGHDR_PTR(cmsg) > ((char *)(m)->m_data) + (m)->m_len || \ + _MIN_NXT_CMSGHDR_PTR(cmsg) < (char *)(m)->m_data ? 
\ + (struct cmsghdr *)0L /* NULL */ : \ + (struct cmsghdr *)((unsigned char *)(cmsg) + \ + __DARWIN_ALIGN32((__uint32_t)(cmsg)->cmsg_len))) + #endif /* KERNEL_PRIVATE */ /* @@ -278,6 +319,7 @@ struct socket { #define SS_ISDISCONNECTED 0x2000 /* socket disconnected from peer */ #define SS_DRAINING 0x4000 /* close waiting for blocked system calls to drain */ +#define SS_DEFUNCT 0x8000 /* has been fully defunct'd */ #if defined(__LP64__) #define _XSOCKET_PTR(x) u_int32_t @@ -288,13 +330,13 @@ struct socket { #pragma pack(4) struct xsockbuf { - u_int32_t sb_cc; - u_int32_t sb_hiwat; - u_int32_t sb_mbcnt; - u_int32_t sb_mbmax; - int32_t sb_lowat; - short sb_flags; - short sb_timeo; + u_int32_t sb_cc; + u_int32_t sb_hiwat; + u_int32_t sb_mbcnt; + u_int32_t sb_mbmax; + int32_t sb_lowat; + short sb_flags; + short sb_timeo; }; /* @@ -348,6 +390,56 @@ struct xsocket64 { #endif /* !CONFIG_EMBEDDED */ +#ifdef PRIVATE + +#define XSO_SOCKET 0x001 +#define XSO_RCVBUF 0x002 +#define XSO_SNDBUF 0x004 +#define XSO_STATS 0x008 +#define XSO_INPCB 0x010 +#define XSO_TCPCB 0x020 + +struct xsocket_n { + u_int32_t xso_len; /* length of this structure */ + u_int32_t xso_kind; /* XSO_SOCKET */ + u_int64_t xso_so; /* makes a convenient handle */ + short so_type; + short so_options; + short so_linger; + short so_state; + u_int64_t so_pcb; /* another convenient handle */ + int xso_protocol; + int xso_family; + short so_qlen; + short so_incqlen; + short so_qlimit; + short so_timeo; + u_short so_error; + pid_t so_pgid; + u_int32_t so_oobmark; + uid_t so_uid; /* XXX */ +}; + +struct xsockbuf_n { + u_int32_t xsb_len; /* length of this structure */ + u_int32_t xsb_kind; /* XSO_RCVBUF or XSO_SNDBUF */ + u_int32_t sb_cc; + u_int32_t sb_hiwat; + u_int32_t sb_mbcnt; + u_int32_t sb_mbmax; + int32_t sb_lowat; + short sb_flags; + short sb_timeo; +}; + +struct xsockstat_n { + u_int32_t xst_len; /* length of this structure */ + u_int32_t xst_kind; /* XSO_STATS */ + struct data_stats xst_tc_stats[SO_TC_STATS_MAX]; +}; + +#endif /* PRIVATE */ + #pragma pack() #ifdef KERNEL_PRIVATE @@ -434,6 +526,7 @@ extern so_gen_t so_gencnt; extern int socket_debug; extern int sosendjcl; extern int sosendjcl_ignore_capab; +extern int sodefunctlog; extern int somaxconn; struct file; @@ -444,6 +537,7 @@ struct stat; struct ucred; struct uio; struct knote; +struct so_tcdbg; #define SBLASTRECORDCHK(sb, s) \ if (socket_debug) sblastrecordchk(sb, s); @@ -458,6 +552,20 @@ struct knote; } \ } +#define SODEFUNCTLOG(x) do { if (sodefunctlog) printf x; } while (0) + +/* + * For debugging traffic class behaviors + */ +#define SOTCDB_NO_DSCP 0x01 /* Do not set DSCP code in IP header */ +#define SOTCDB_NO_MTC 0x02 /* Do not set the mbuf traffic class */ +#define SOTCDB_NO_SENDTCPBG 0x04 /* Do not use background TCP CC algorithm for sender */ +#define SOTCDB_NO_LCLTST 0x08 /* Do not test for local destination for setting DSCP */ +#define SOTCDB_NO_DSCPTST 0x10 /* Overwrite any existing DSCP code */ +#define SOTCDB_NO_RECVTCPBG 0x20 /* Do not use throttling on receiver-side of TCP */ + +extern u_int32_t sotcdb; + /* * From uipc_socket and friends */ @@ -481,6 +589,7 @@ extern void sbcheck(struct sockbuf *sb); extern void sblastmbufchk(struct sockbuf *, const char *); extern void sblastrecordchk(struct sockbuf *, const char *); extern struct mbuf *sbcreatecontrol(caddr_t p, int size, int type, int level); +extern struct mbuf **sbcreatecontrol_mbuf(caddr_t p, int size, int type, int level, struct mbuf** m); extern void sbdrop(struct sockbuf *sb, int len); extern
void sbdroprecord(struct sockbuf *sb); extern void sbflush(struct sockbuf *sb); @@ -512,11 +621,14 @@ extern void sofree(struct socket *so); extern void soreference(struct socket *so); extern void sodereference(struct socket *so); extern void somultipages(struct socket *, boolean_t); +extern int sosetdefunct(struct proc *, struct socket *, int level, boolean_t); +extern int sodefunct(struct proc *, struct socket *, int level); extern int sogetopt(struct socket *so, struct sockopt *sopt); extern void sohasoutofband(struct socket *so); extern void soisconnected(struct socket *so); extern void soisconnecting(struct socket *so); extern void soisdisconnected(struct socket *so); +extern void sodisconnectwakeup(struct socket *so); extern void soisdisconnecting(struct socket *so); extern int soisbackground(struct socket *so); extern int solisten(struct socket *so, int backlog); @@ -531,8 +643,15 @@ extern int socket_unlock(struct socket *so, int refcount); extern void sofreelastref(struct socket *, int); extern int sogetaddr_locked(struct socket *, struct sockaddr **, int); extern const char *solockhistory_nr(struct socket *); -extern void set_traffic_class(struct mbuf *, struct socket *, int); +extern void set_packet_tclass(struct mbuf *, struct socket *, int, int); extern int mbuf_traffic_class_from_control(struct mbuf *); +extern void set_tcp_stream_priority(struct socket *so); +extern int so_set_traffic_class(struct socket *, int); +extern void so_set_default_traffic_class(struct socket *); +extern void socket_tclass_init(void); +extern int so_set_tcdbg(struct socket *, struct so_tcdbg *); +extern int sogetopt_tcdbg(struct socket *, struct sockopt *); +extern void so_recv_data_stat(struct socket *, struct mbuf *, size_t); /* * XXX; prepare mbuf for (__FreeBSD__ < 3) routines. 
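The traffic-class helpers above (so_set_traffic_class(), set_tcp_stream_priority(), the sotcdb debug flags) implement the policy behind the private SO_TRAFFIC_CLASS socket option added to socket.h earlier in this patch. A minimal userspace sketch, assuming the PRIVATE definitions of SO_TRAFFIC_CLASS and SO_TC_BK are visible to the caller:

#include <sys/socket.h>

/* Classify all traffic on socket s as background/bulk; the kernel then
 * applies the background DSCP and TCP behavior controlled by sotcdb. */
static int
mark_socket_background(int s)
{
	int tc = SO_TC_BK;	/* background, low priority or bulk traffic */

	return setsockopt(s, SOL_SOCKET, SO_TRAFFIC_CLASS, &tc, sizeof (tc));
}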
@@ -557,6 +676,7 @@ extern void sotoxsocket(struct socket *so, struct xsocket *xso); #if !CONFIG_EMBEDDED extern void sotoxsocket64(struct socket *so, struct xsocket64 *xso); #endif +extern void sbwakeup(struct sockbuf *sb); extern void sowakeup(struct socket *so, struct sockbuf *sb); extern int soioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p); diff --git a/bsd/sys/sockio.h b/bsd/sys/sockio.h index 8d415e6e4..3a6b1371b 100644 --- a/bsd/sys/sockio.h +++ b/bsd/sys/sockio.h @@ -158,6 +158,18 @@ #define SIOCGIFALTMTU _IOWR('i', 72, struct ifreq) /* get if alternate mtu */ #define SIOCSIFBOND _IOW('i', 70, struct ifreq) /* set bond if config */ #define SIOCGIFBOND _IOWR('i', 71, struct ifreq) /* get bond if config */ + +#ifdef PRIVATE +/* + * temporary control calls to attach/detach IP to/from an ethernet interface + */ +#define SIOCPROTOATTACH _IOWR('i', 80, struct ifreq) /* attach proto to interface */ +#define SIOCPROTODETACH _IOWR('i', 81, struct ifreq) /* detach proto from interface */ +#endif /* PRIVATE */ + +#define SIOCSIFCAP _IOW('i', 90, struct ifreq) /* set IF features */ +#define SIOCGIFCAP _IOWR('i', 91, struct ifreq) /* get IF features */ + #define SIOCIFCREATE _IOWR('i', 120, struct ifreq) /* create clone if */ #define SIOCIFDESTROY _IOW('i', 121, struct ifreq) /* destroy clone if */ #define SIOCIFCREATE2 _IOWR('i', 122, struct ifreq) /* create clone if with data */ @@ -192,11 +204,6 @@ #define SIOCIFGCLONERS64 _IOWR('i', 129, struct if_clonereq64) /* get cloners */ #endif /* KERNEL */ -/* - * temporary control calls to attach/detach IP to/from an ethernet interface - */ -#define SIOCPROTOATTACH _IOWR('i', 80, struct ifreq) /* attach proto to interface */ -#define SIOCPROTODETACH _IOWR('i', 81, struct ifreq) /* detach proto from interface */ #endif /* PRIVATE */ #define SIOCGIFASYNCMAP _IOWR('i', 124, struct ifreq) /* get ppp asyncmap */ diff --git a/bsd/sys/spawn.h b/bsd/sys/spawn.h index f54fcc396..4947902dd 100644 --- a/bsd/sys/spawn.h +++ b/bsd/sys/spawn.h @@ -58,6 +58,15 @@ */ #define POSIX_SPAWN_SETEXEC 0x0040 #define POSIX_SPAWN_START_SUSPENDED 0x0080 +#ifdef PRIVATE +#define _POSIX_SPAWN_DISABLE_ASLR 0x0100 +#define _POSIX_SPAWN_ALLOW_DATA_EXEC 0x2000 +#define POSIX_SPAWN_OSX_TALAPP_START 0x0400 +#define POSIX_SPAWN_OSX_WIDGET_START 0x0800 +#define POSIX_SPAWN_OSX_DBCLIENT_START 0x0800 /* not a bug; same value as widget, just renamed */ +#define POSIX_SPAWN_IOS_APP_START 0x1000 +#endif /* PRIVATE */ +#define POSIX_SPAWN_CLOEXEC_DEFAULT 0x4000 /* * Possible values to be set for the process control actions on resource starvation. diff --git a/bsd/sys/spawn_internal.h b/bsd/sys/spawn_internal.h index 0e8943947..d29526095 100644 --- a/bsd/sys/spawn_internal.h +++ b/bsd/sys/spawn_internal.h @@ -30,7 +30,7 @@ /* * [SPN] Support for _POSIX_SPAWN * - * This file contains intern datastructures which are externally represented + * This file contains internal data structures which are externally represented * as opaque void pointers to prevent introspection.
This permits us to * change the underlying implementation of the code to maintain it or to * support new features, as needed, without the consumer needing to recompile @@ -110,7 +110,8 @@ typedef struct _posix_spawnattr { typedef enum { PSFA_OPEN = 0, PSFA_CLOSE = 1, - PSFA_DUP2 = 2 + PSFA_DUP2 = 2, + PSFA_INHERIT = 3 } psfa_t; diff --git a/bsd/sys/stat.h b/bsd/sys/stat.h index bcc8b79b4..d5daf6120 100644 --- a/bsd/sys/stat.h +++ b/bsd/sys/stat.h @@ -443,7 +443,7 @@ extern void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp); #define S_IFLNK 0120000 /* [XSI] symbolic link */ #define S_IFSOCK 0140000 /* [XSI] socket */ #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -#define S_IFWHT 0160000 /* whiteout */ +#define S_IFWHT 0160000 /* OBSOLETE: whiteout */ #endif /* File mode */ @@ -489,7 +489,7 @@ extern void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp); #define S_ISLNK(m) (((m) & S_IFMT) == S_IFLNK) /* symbolic link */ #define S_ISSOCK(m) (((m) & S_IFMT) == S_IFSOCK) /* socket */ #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -#define S_ISWHT(m) (((m) & S_IFMT) == S_IFWHT) /* whiteout */ +#define S_ISWHT(m) (((m) & S_IFMT) == S_IFWHT) /* OBSOLETE: whiteout */ #endif /* @@ -553,7 +553,8 @@ extern void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp); */ /* #define UF_NOUNLINK 0x00000010 */ /* file may not be removed or renamed */ #define UF_COMPRESSED 0x00000020 /* file is hfs-compressed */ -/* Bits 0x0040 through 0x4000 are currently undefined. */ +#define UF_TRACKED 0x00000040 /* file renames and deletes are tracked */ +/* Bits 0x0080 through 0x4000 are currently undefined. */ #define UF_HIDDEN 0x00008000 /* hint that this item should not be */ /* displayed in a GUI */ /* @@ -607,13 +608,13 @@ extern void munge_user32_stat64(struct stat64 *sbp, struct user32_stat64 *usbp); int chmodx_np(const char *, filesec_t); int fchflags(int, __uint32_t); int fchmodx_np(int, filesec_t); int fstatx_np(int, struct stat *, filesec_t) __DARWIN_INODE64(fstatx_np); -int lchflags(const char *, __uint32_t); -int lchmod(const char *, mode_t); +int lchflags(const char *, __uint32_t) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); +int lchmod(const char *, mode_t) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); int lstatx_np(const char *, struct stat *, filesec_t) __DARWIN_INODE64(lstatx_np); int mkdirx_np(const char *, filesec_t); int mkfifox_np(const char *, filesec_t); int statx_np(const char *, struct stat *, filesec_t) __DARWIN_INODE64(statx_np); -int umaskx_np(filesec_t); +int umaskx_np(filesec_t) __OSX_AVAILABLE_BUT_DEPRECATED(__MAC_10_4,__MAC_10_6,__IPHONE_NA,__IPHONE_NA); #if !__DARWIN_ONLY_64_BIT_INO_T /* The following deprecated routines are similar to stat and friends except provide struct stat64 instead of struct stat */ diff --git a/bsd/sys/sys_domain.h b/bsd/sys/sys_domain.h index 013959c8a..981d9f107 100644 --- a/bsd/sys/sys_domain.h +++ b/bsd/sys/sys_domain.h @@ -96,8 +96,9 @@ struct ctl_cb { lck_mtx_t *mtx; struct socket *so; /* controlling socket */ struct kctl *kctl; /* back pointer to controller */ - u_int32_t unit; void *userdata; + u_int32_t unit; + u_int32_t usecount; }; diff --git a/bsd/sys/sysctl.h b/bsd/sys/sysctl.h index 083432071..1da032f48 100644 --- a/bsd/sys/sysctl.h +++ b/bsd/sys/sysctl.h @@ -110,12 +110,34 @@ * type given below. Each sysctl level defines a set of name/type * pairs to be used by sysctl(1) in manipulating the subsystem.
* - * When declaring new sysctl names, please use the CTLFLAG_LOCKED - * flag in the type to indicate that all necessary locking will - * be handled within the sysctl. Any sysctl defined without - * CTLFLAG_LOCKED is considered legacy and will be protected by - * both the kernel funnel and the sysctl memlock. This is not - * optimal, so it is best to handle locking yourself. + * When declaring new sysctl names, unless your sysctl is callable + * from the paging path, please use the CTLFLAG_LOCKED flag in the + * type to indicate that all necessary locking will be handled + * within the sysctl. + * + * Any sysctl defined without CTLFLAG_LOCKED is considered legacy + * and will be protected by both wiring the user process pages and, + * if it is a 32 bit legacy KEXT, by the obsolete kernel funnel. + * + * Note: This is not optimal, so it is best to handle locking + * yourself, if you are able to do so. A simple design + * pattern for use in a single function known + * to potentially be in the paging path or doing a DMA + * to physical memory in a user space process is: + * + * lock + * perform operation vs. local buffer + * unlock + * SYSCTL_OUT(req, local buffer, length) + * + * ...this assumes you are not using a deep call graph + * or are unable to pass a local buffer address as a + * parameter into your deep call graph. + * + * Note that very large user buffers can fail the wire + * if doing so would require more physical pages than + * are available (the caller will get an ENOMEM error, + * see sysctl_mem_hold() for details). */ struct ctlname { char *ctl_name; /* subsystem name */ @@ -139,7 +161,8 @@ struct ctlname { #define CTLFLAG_MASKED 0x04000000 /* deprecated variable, do not display */ #define CTLFLAG_NOAUTO 0x02000000 /* do not auto-register */ #define CTLFLAG_KERN 0x01000000 /* valid inside the kernel */ -#define CTLFLAG_LOCKED 0x00800000 /* node will handle locking itself (highly encouraged) */ +#define CTLFLAG_LOCKED 0x00800000 /* node will handle locking itself */ +#define CTLFLAG_OID2 0x00400000 /* struct sysctl_oid has version info */ /* * USE THIS instead of a hardwired number from the categories below @@ -161,33 +184,6 @@ struct ctlname { #define SYSCTL_HANDLER_ARGS (struct sysctl_oid *oidp, void *arg1, int arg2, \ struct sysctl_req *req) -/* - * Locking and stats - */ -struct sysctl_lock { - int sl_lock; - int sl_want; - int sl_locked; -}; - -#define MEMLOCK_LOCK() \ - do { \ - while (memlock.sl_lock) { \ - memlock.sl_want = 1; \ - (void) tsleep((caddr_t)&memlock, PRIBIO+1, "sysctl", 0); \ - memlock.sl_locked++; \ - } \ - memlock.sl_lock = 1; \ - } while(0) - -#define MEMLOCK_UNLOCK() \ - do { \ - memlock.sl_lock = 0; \ - if (memlock.sl_want) { \ - memlock.sl_want = 0; \ - wakeup((caddr_t)&memlock); \ - } \ - }while(0) /* * This describes the access space for a sysctl request.
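 * [Editorial sketch, expanding the locking pattern documented above:
 * a CTLFLAG_LOCKED handler that copies under its own lock into a
 * local buffer and only then calls SYSCTL_OUT. The stats structure
 * and its mutex are hypothetical; only SYSCTL_HANDLER_ARGS and
 * SYSCTL_OUT come from this header.
 *
 *	static struct my_stats my_stats;	-- hypothetical data
 *	static lck_mtx_t *my_stats_mtx;		-- hypothetical lock
 *
 *	static int
 *	sysctl_my_stats SYSCTL_HANDLER_ARGS
 *	{
 *		struct my_stats snap;
 *
 *		lck_mtx_lock(my_stats_mtx);	-- lock
 *		snap = my_stats;		-- operation vs. local buffer
 *		lck_mtx_unlock(my_stats_mtx);	-- unlock
 *		return SYSCTL_OUT(req, &snap, sizeof(snap));
 *	}
 *
 * This keeps the copyout, and any page fault it takes, outside the
 * lock.]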
This is needed @@ -195,22 +191,55 @@ struct sysctl_lock { */ struct sysctl_req { struct proc *p; - int lock; - user_addr_t oldptr; - size_t oldlen; - size_t oldidx; - int (*oldfunc)(struct sysctl_req *, const void *, size_t); - user_addr_t newptr; - size_t newlen; - size_t newidx; - int (*newfunc)(struct sysctl_req *, void *, size_t); + int lock; + user_addr_t oldptr; /* pointer to user supplied buffer */ + size_t oldlen; /* user buffer length (also returned) */ + size_t oldidx; /* total data iteratively copied out */ + int (*oldfunc)(struct sysctl_req *, const void *, size_t); + user_addr_t newptr; /* buffer containing new value */ + size_t newlen; /* length of new value */ + size_t newidx; /* total data iteratively copied in */ + int (*newfunc)(struct sysctl_req *, void *, size_t); }; SLIST_HEAD(sysctl_oid_list, sysctl_oid); +#define SYSCTL_OID_VERSION 1 /* current OID structure version */ + /* * This describes one "oid" in the MIB tree. Potentially more nodes can * be hidden behind it, expanded by the handler. + * + * NOTES: We implement binary compatibility between CTLFLAG_OID2 and + * pre-CTLFLAG_OID2 structures in sysctl_register_oid() and in + * sysctl_unregister_oid() using the fact that the fields up + * to oid_fmt are unchanged, and that the field immediately + * following is on an alignment boundary following a pointer + * type and is also a pointer. This lets us get the previous + * size of the structure, and the copy-cut-off point, using + * the offsetof() language primitive, and these values are + * used in conjunction with the fact that earlier and future + * statically compiled sysctl_oid structures are declared via + * macros. This lets us overload the macros so that CTLFLAG_OID2 is + * added in newly compiled code containing sysctl node + * declarations, subsequently allowing us to avoid changing + * the KPI used for non-static (un)registration in KEXTs. + * + * This depends on the fact that people declare SYSCTLs, + * rather than declaring sysctl_oid structures. All new code + * should avoid declaring struct sysctl_oid's directly without + * the macros; the current risk for this is limited to losing + * your description field and ending up with a malloc'ed copy, + * as if it were a legacy binary static declaration via SYSCTL; + * in the future, we may deprecate access to a named structure + * type in third party code. Use the macros, or your code will + * end up with compile errors when that happens. + * + * Please try to include a long description of the field in any + * new sysctl declarations (all the macros support this). This + * field may be the only human readable documentation your users + * get for your sysctl. */ struct sysctl_oid { struct sysctl_oid_list *oid_parent; @@ -222,6 +251,9 @@ struct sysctl_oid { const char *oid_name; int (*oid_handler) SYSCTL_HANDLER_ARGS; const char *oid_fmt; + const char *oid_descr; /* offsetof() field / long description */ + int oid_version; + int oid_refcnt; }; #define SYSCTL_IN(r, p, l) (r->newfunc)(r, p, l) @@ -267,7 +299,7 @@ __END_DECLS #define SYSCTL_OID(parent, nbr, name, kind, a1, a2, handler, fmt, descr) \ struct sysctl_oid sysctl_##parent##_##name = { \ &sysctl_##parent##_children, { 0 }, \ - nbr, kind, a1, a2, #name, handler, fmt }; \ + nbr, kind|CTLFLAG_OID2, a1, a2, #name, handler, fmt, descr, SYSCTL_OID_VERSION, 0 }; \ SYSCTL_LINKER_SET_ENTRY(__sysctl_set, sysctl_##parent##_##name) /* This constructs a node from which other oids can hang.
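 * [Editorial sketch of the copy-cut-off computation the NOTES above
 * describe; the variable names are illustrative, not quoted from
 * sysctl_register_oid():
 *
 *	size_t legacy_size = offsetof(struct sysctl_oid, oid_descr);
 *	if (!(oidp->oid_kind & CTLFLAG_OID2))
 *		bcopy(oidp, clone, legacy_size);	-- old-layout OID
 *
 * Everything from oid_descr onward exists only for OIDs that were
 * declared with CTLFLAG_OID2 set.]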
*/ @@ -510,6 +542,9 @@ SYSCTL_DECL(_user); #define KERN_KDPIDEX 14 #define KERN_KDSETRTCDEC 15 #define KERN_KDGETENTROPY 16 +#define KERN_KDWRITETR 17 +#define KERN_KDWRITEMAP 18 + /* KERN_PANICINFO types */ #define KERN_PANICINFO_MAXSIZE 1 /* quad: panic UI image size limit */ diff --git a/bsd/sys/sysent.h b/bsd/sys/sysent.h index b83a67c44..df71d010f 100644 --- a/bsd/sys/sysent.h +++ b/bsd/sys/sysent.h @@ -31,9 +31,6 @@ #include <sys/appleapiopts.h> #include <sys/cdefs.h> -#ifdef __ppc__ -#include <sys/types.h> -#endif #ifdef KERNEL_PRIVATE #ifdef __APPLE_API_PRIVATE @@ -59,7 +56,7 @@ extern struct sysent sysent[]; #endif /* __INIT_SYSENT_C__ */ extern int nsysent; -#define NUM_SYSENT 434 /* Current number of defined syscalls */ +#define NUM_SYSENT 439 /* Current number of defined syscalls */ /* sy_funnel flags bits */ #define FUNNEL_MASK 0x07f diff --git a/bsd/sys/syslog.h b/bsd/sys/syslog.h index e85a4a817..71004cf2a 100644 --- a/bsd/sys/syslog.h +++ b/bsd/sys/syslog.h @@ -26,7 +26,7 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ -/* +/*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * @@ -38,10 +38,6 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. @@ -59,9 +55,10 @@ * SUCH DAMAGE. 
* * @(#)syslog.h 8.1 (Berkeley) 6/2/93 + * $FreeBSD: src/sys/sys/syslog.h,v 1.27.2.1.4.1 2010/06/14 02:09:06 kensmith Exp $ */ -#ifndef _SYS_SYSLOG_H_ +#ifndef _SYS_SYSLOG_H_ #define _SYS_SYSLOG_H_ #include <sys/appleapiopts.h> @@ -88,54 +85,61 @@ #define LOG_DEBUG 7 /* debug-level messages */ #define LOG_PRIMASK 0x07 /* mask to extract priority part (internal) */ - /* extract priority */ +/* extract priority */ #define LOG_PRI(p) ((p) & LOG_PRIMASK) -#define LOG_MAKEPRI(fac, pri) (((fac) << 3) | (pri)) +#define LOG_MAKEPRI(fac, pri) ((fac) | (pri)) #ifdef SYSLOG_NAMES #define INTERNAL_NOPRI 0x10 /* the "no priority" priority */ - /* mark "facility" */ -#define INTERNAL_MARK LOG_MAKEPRI(LOG_NFACILITIES, 0) +/* mark "facility" */ +#define INTERNAL_MARK LOG_MAKEPRI((LOG_NFACILITIES<<3), 0) typedef struct _code { - char *c_name; - int c_val; + const char *c_name; + int c_val; } CODE; CODE prioritynames[] = { - { "alert", LOG_ALERT }, - { "crit", LOG_CRIT }, - { "debug", LOG_DEBUG }, - { "emerg", LOG_EMERG }, - { "err", LOG_ERR }, - { "error", LOG_ERR }, /* DEPRECATED */ - { "info", LOG_INFO }, - { "none", INTERNAL_NOPRI }, /* INTERNAL */ - { "notice", LOG_NOTICE }, - { "panic", LOG_EMERG }, /* DEPRECATED */ - { "warn", LOG_WARNING }, /* DEPRECATED */ - { "warning", LOG_WARNING }, - { 0, -1 } + { "alert", LOG_ALERT, }, + { "crit", LOG_CRIT, }, + { "debug", LOG_DEBUG, }, + { "emerg", LOG_EMERG, }, + { "err", LOG_ERR, }, + { "error", LOG_ERR, }, /* DEPRECATED */ + { "info", LOG_INFO, }, + { "none", INTERNAL_NOPRI, }, /* INTERNAL */ + { "notice", LOG_NOTICE, }, + { "panic", LOG_EMERG, }, /* DEPRECATED */ + { "warn", LOG_WARNING, }, /* DEPRECATED */ + { "warning", LOG_WARNING, }, + { NULL, -1, } }; #endif /* facility codes */ -#define LOG_KERN (0<<3) /* kernel messages */ -#define LOG_USER (1<<3) /* random user-level messages */ -#define LOG_MAIL (2<<3) /* mail system */ -#define LOG_DAEMON (3<<3) /* system daemons */ -#define LOG_AUTH (4<<3) /* security/authorization messages */ -#define LOG_SYSLOG (5<<3) /* messages generated internally by syslogd */ -#define LOG_LPR (6<<3) /* line printer subsystem */ -#define LOG_NEWS (7<<3) /* network news subsystem */ -#define LOG_UUCP (8<<3) /* UUCP subsystem */ -#define LOG_CRON (9<<3) /* clock daemon */ -#define LOG_AUTHPRIV (10<<3) /* security/authorization messages (private) */ -#define LOG_FTP (11<<3) /* ftp daemon */ -#define LOG_NETINFO (12<<3) /* NetInfo */ +#define LOG_KERN (0<<3) /* kernel messages */ +#define LOG_USER (1<<3) /* random user-level messages */ +#define LOG_MAIL (2<<3) /* mail system */ +#define LOG_DAEMON (3<<3) /* system daemons */ +#define LOG_AUTH (4<<3) /* authorization messages */ +#define LOG_SYSLOG (5<<3) /* messages generated internally by syslogd */ +#define LOG_LPR (6<<3) /* line printer subsystem */ +#define LOG_NEWS (7<<3) /* network news subsystem */ +#define LOG_UUCP (8<<3) /* UUCP subsystem */ +#define LOG_CRON (9<<3) /* clock daemon */ +#define LOG_AUTHPRIV (10<<3) /* authorization messages (private) */ +/* Facility #10 clashes in DEC UNIX, where */ +/* it's defined as LOG_MEGASAFE for AdvFS */ +/* event logging. */ +#define LOG_FTP (11<<3) /* ftp daemon */ +//#define LOG_NTP (12<<3) /* NTP subsystem */ +//#define LOG_SECURITY (13<<3) /* security subsystems (firewalling, etc.) 
*/ +//#define LOG_CONSOLE (14<<3) /* /dev/console output */ +#define LOG_NETINFO (12<<3) /* NetInfo */ #define LOG_REMOTEAUTH (13<<3) /* remote authentication/authorization */ -#define LOG_INSTALL (14<<3) /* installer subsystem */ -#define LOG_RAS (15<<3) /* Remote Access Service (VPN / PPP) */ +#define LOG_INSTALL (14<<3) /* installer subsystem */ +#define LOG_RAS (15<<3) /* Remote Access Service (VPN / PPP) */ +/* other codes through 15 reserved for system use */ #define LOG_LOCAL0 (16<<3) /* reserved for local use */ #define LOG_LOCAL1 (17<<3) /* reserved for local use */ #define LOG_LOCAL2 (18<<3) /* reserved for local use */ @@ -145,43 +149,43 @@ CODE prioritynames[] = { #define LOG_LOCAL6 (22<<3) /* reserved for local use */ #define LOG_LOCAL7 (23<<3) /* reserved for local use */ -#define LOG_LAUNCHD (24<<3) /* launchd - general bootstrap daemon */ +#define LOG_LAUNCHD (24<<3) /* launchd - general bootstrap daemon */ #define LOG_NFACILITIES 25 /* current number of facilities */ #define LOG_FACMASK 0x03f8 /* mask to extract facility part */ - /* facility of pri */ +/* facility of pri */ #define LOG_FAC(p) (((p) & LOG_FACMASK) >> 3) #ifdef SYSLOG_NAMES CODE facilitynames[] = { - { "auth", LOG_AUTH }, - { "authpriv", LOG_AUTHPRIV }, - { "cron", LOG_CRON }, - { "daemon", LOG_DAEMON }, - { "ftp", LOG_FTP }, - { "install", LOG_INSTALL }, - { "kern", LOG_KERN }, - { "lpr", LOG_LPR }, - { "mail", LOG_MAIL }, - { "mark", INTERNAL_MARK }, /* INTERNAL */ - { "netinfo", LOG_NETINFO }, - { "ras", LOG_RAS }, - { "remoteauth", LOG_REMOTEAUTH }, - { "news", LOG_NEWS }, - { "security", LOG_AUTH }, /* DEPRECATED */ - { "syslog", LOG_SYSLOG }, - { "user", LOG_USER }, - { "uucp", LOG_UUCP }, - { "local0", LOG_LOCAL0 }, - { "local1", LOG_LOCAL1 }, - { "local2", LOG_LOCAL2 }, - { "local3", LOG_LOCAL3 }, - { "local4", LOG_LOCAL4 }, - { "local5", LOG_LOCAL5 }, - { "local6", LOG_LOCAL6 }, - { "local7", LOG_LOCAL7 }, - { "launchd", LOG_LAUNCHD }, - { 0, -1 } + { "auth", LOG_AUTH, }, + { "authpriv", LOG_AUTHPRIV, }, + { "cron", LOG_CRON, }, + { "daemon", LOG_DAEMON, }, + { "ftp", LOG_FTP, }, + { "install", LOG_INSTALL }, + { "kern", LOG_KERN, }, + { "lpr", LOG_LPR, }, + { "mail", LOG_MAIL, }, + { "mark", INTERNAL_MARK, }, /* INTERNAL */ + { "netinfo", LOG_NETINFO, }, + { "ras", LOG_RAS }, + { "remoteauth", LOG_REMOTEAUTH }, + { "news", LOG_NEWS, }, + { "security", LOG_AUTH }, /* DEPRECATED */ + { "syslog", LOG_SYSLOG, }, + { "user", LOG_USER, }, + { "uucp", LOG_UUCP, }, + { "local0", LOG_LOCAL0, }, + { "local1", LOG_LOCAL1, }, + { "local2", LOG_LOCAL2, }, + { "local3", LOG_LOCAL3, }, + { "local4", LOG_LOCAL4, }, + { "local5", LOG_LOCAL5, }, + { "local6", LOG_LOCAL6, }, + { "local7", LOG_LOCAL7, }, + { "launchd", LOG_LAUNCHD }, + { NULL, -1, } }; #endif @@ -211,18 +215,24 @@ CODE facilitynames[] = { #define LOG_PERROR 0x20 /* log to stderr as well */ #ifndef KERNEL -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -#include <sys/_types.h> /* for __darwin_va_list */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ + +/* + * Don't use va_list in the vsyslog() prototype. Va_list is typedef'd in two + * places (<machine/varargs.h> and <machine/stdarg.h>), so if we include one + * of them here we may collide with the utility's includes. It's unreasonable + * for utilities to have to include one of them to include syslog.h, so we get + * __va_list from <sys/_types.h> and use it. 
+ */ +#include <sys/_types.h> __BEGIN_DECLS void closelog(void); void openlog(const char *, int, int); int setlogmask(int); -void syslog(int, const char *, ...) __DARWIN_LDBL_COMPAT(syslog); -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -void vsyslog(int, const char *, __darwin_va_list) __DARWIN_LDBL_COMPAT(vsyslog); -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +void syslog(int, const char *, ...) __printflike(2, 3) __DARWIN_LDBL_COMPAT(syslog); +#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL +void vsyslog(int, const char *, __darwin_va_list) __printflike(2, 0) __DARWIN_LDBL_COMPAT(vsyslog); +#endif __END_DECLS #else /* !KERNEL */ diff --git a/bsd/sys/systm.h b/bsd/sys/systm.h index d5fdbe392..f08bc477c 100644 --- a/bsd/sys/systm.h +++ b/bsd/sys/systm.h @@ -223,10 +223,17 @@ void bsd_untimeout(void (*)(void *), void *arg); void set_fsblocksize(struct vnode *); uint64_t tvtoabstime(struct timeval *); void *throttle_info_create(void); -void throttle_info_mount_ref(mount_t mp, void * throttle_info); -void throttle_info_mount_rel(mount_t mp); +void throttle_info_mount_ref(mount_t mp, void * throttle_info); +void throttle_info_mount_rel(mount_t mp); void throttle_info_release(void *throttle_info); void throttle_info_update(void *throttle_info, int flags); +uint32_t throttle_lowpri_io(int sleep_amount); +void throttle_set_thread_io_policy(int policy); +typedef struct __throttle_info_handle *throttle_info_handle_t; +int throttle_info_ref_by_mask( + uint64_t throttle_mask, throttle_info_handle_t *throttle_info_handle); +void throttle_info_rel_by_mask(throttle_info_handle_t throttle_info_handle); +void throttle_info_update_by_mask(void *throttle_info_handle, int flags); __END_DECLS #endif /* !_SYS_SYSTM_H_ */ diff --git a/bsd/sys/time.h b/bsd/sys/time.h index 732d1ae76..a32ed62d6 100644 --- a/bsd/sys/time.h +++ b/bsd/sys/time.h @@ -68,6 +68,8 @@ #include <sys/_types.h> #ifdef KERNEL #include <machine/types.h> /* user_time_t */ +#else /* !KERNEL */ +#include <Availability.h> #endif /* KERNEL */ /* @@ -240,7 +242,7 @@ __BEGIN_DECLS #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) int adjtime(const struct timeval *, struct timeval *); int futimes(int, const struct timeval *); -int lutimes(const char *, const struct timeval *); +int lutimes(const char *, const struct timeval *) __OSX_AVAILABLE_STARTING(__MAC_10_5, __IPHONE_2_0); int settimeofday(const struct timeval *, const struct timezone *); #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ diff --git a/bsd/sys/tree.h b/bsd/sys/tree.h index f4bf40c73..42427ca31 100644 --- a/bsd/sys/tree.h +++ b/bsd/sys/tree.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -26,693 +26,4 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* $NetBSD: tree.h,v 1.13 2006/08/27 22:32:38 christos Exp $ */ -/* $OpenBSD: tree.h,v 1.7 2002/10/17 21:51:54 art Exp $ */ -/* - * Copyright 2002 Niels Provos <provos@citi.umich.edu> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. - * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _SYS_TREE_H_ -#define _SYS_TREE_H_ - -/* - * This file defines data structures for different types of trees: - * splay trees and red-black trees. - * - * A splay tree is a self-organizing data structure. Every operation - * on the tree causes a splay to happen. The splay moves the requested - * node to the root of the tree and partly rebalances it. - * - * This has the benefit that request locality causes faster lookups as - * the requested nodes move to the top of the tree. On the other hand, - * every lookup causes memory writes. - * - * The Balance Theorem bounds the total access time for m operations - * and n inserts on an initially empty tree as O((m + n)lg n). The - * amortized cost for a sequence of m accesses to a splay tree is O(lg n); - * - * A red-black tree is a binary search tree with the node color as an - * extra attribute. It fulfills a set of conditions: - * - every search path from the root to a leaf consists of the - * same number of black nodes, - * - each red node (except for the root) has a black parent, - * - each leaf node is black. - * - * Every operation on a red-black tree is bounded as O(lg n). - * The maximum height of a red-black tree is 2lg (n+1). 
- */ - -#define SPLAY_HEAD(name, type) \ -struct name { \ - struct type *sph_root; /* root of the tree */ \ -} - -#define SPLAY_INITIALIZER(root) \ - { NULL } - -#define SPLAY_INIT(root) do { \ - (root)->sph_root = NULL; \ -} while (/*CONSTCOND*/ 0) - -#define SPLAY_ENTRY(type) \ -struct { \ - struct type *spe_left; /* left element */ \ - struct type *spe_right; /* right element */ \ -} - -#define SPLAY_LEFT(elm, field) (elm)->field.spe_left -#define SPLAY_RIGHT(elm, field) (elm)->field.spe_right -#define SPLAY_ROOT(head) (head)->sph_root -#define SPLAY_EMPTY(head) (SPLAY_ROOT(head) == NULL) - -/* SPLAY_ROTATE_{LEFT,RIGHT} expect that tmp hold SPLAY_{RIGHT,LEFT} */ -#define SPLAY_ROTATE_RIGHT(head, tmp, field) do { \ - SPLAY_LEFT((head)->sph_root, field) = SPLAY_RIGHT(tmp, field); \ - SPLAY_RIGHT(tmp, field) = (head)->sph_root; \ - (head)->sph_root = tmp; \ -} while (/*CONSTCOND*/ 0) - -#define SPLAY_ROTATE_LEFT(head, tmp, field) do { \ - SPLAY_RIGHT((head)->sph_root, field) = SPLAY_LEFT(tmp, field); \ - SPLAY_LEFT(tmp, field) = (head)->sph_root; \ - (head)->sph_root = tmp; \ -} while (/*CONSTCOND*/ 0) - -#define SPLAY_LINKLEFT(head, tmp, field) do { \ - SPLAY_LEFT(tmp, field) = (head)->sph_root; \ - tmp = (head)->sph_root; \ - (head)->sph_root = SPLAY_LEFT((head)->sph_root, field); \ -} while (/*CONSTCOND*/ 0) - -#define SPLAY_LINKRIGHT(head, tmp, field) do { \ - SPLAY_RIGHT(tmp, field) = (head)->sph_root; \ - tmp = (head)->sph_root; \ - (head)->sph_root = SPLAY_RIGHT((head)->sph_root, field); \ -} while (/*CONSTCOND*/ 0) - -#define SPLAY_ASSEMBLE(head, node, left, right, field) do { \ - SPLAY_RIGHT(left, field) = SPLAY_LEFT((head)->sph_root, field); \ - SPLAY_LEFT(right, field) = SPLAY_RIGHT((head)->sph_root, field);\ - SPLAY_LEFT((head)->sph_root, field) = SPLAY_RIGHT(node, field); \ - SPLAY_RIGHT((head)->sph_root, field) = SPLAY_LEFT(node, field); \ -} while (/*CONSTCOND*/ 0) - -/* Generates prototypes and inline functions */ - -#define SPLAY_PROTOTYPE(name, type, field, cmp) \ -void name##_SPLAY(struct name *, struct type *); \ -void name##_SPLAY_MINMAX(struct name *, int); \ -struct type *name##_SPLAY_INSERT(struct name *, struct type *); \ -struct type *name##_SPLAY_REMOVE(struct name *, struct type *); \ - \ -/* Finds the node with the same key as elm */ \ -static __inline struct type * \ -name##_SPLAY_FIND(struct name *head, struct type *elm) \ -{ \ - if (SPLAY_EMPTY(head)) \ - return(NULL); \ - name##_SPLAY(head, elm); \ - if ((cmp)(elm, (head)->sph_root) == 0) \ - return (head->sph_root); \ - return (NULL); \ -} \ - \ -static __inline struct type * \ -name##_SPLAY_NEXT(struct name *head, struct type *elm) \ -{ \ - name##_SPLAY(head, elm); \ - if (SPLAY_RIGHT(elm, field) != NULL) { \ - elm = SPLAY_RIGHT(elm, field); \ - while (SPLAY_LEFT(elm, field) != NULL) { \ - elm = SPLAY_LEFT(elm, field); \ - } \ - } else \ - elm = NULL; \ - return (elm); \ -} \ - \ -static __inline struct type * \ -name##_SPLAY_MIN_MAX(struct name *head, int val) \ -{ \ - name##_SPLAY_MINMAX(head, val); \ - return (SPLAY_ROOT(head)); \ -} - -/* Main splay operation. 
- * Moves node close to the key of elm to top - */ -#define SPLAY_GENERATE(name, type, field, cmp) \ -struct type * \ -name##_SPLAY_INSERT(struct name *head, struct type *elm) \ -{ \ - if (SPLAY_EMPTY(head)) { \ - SPLAY_LEFT(elm, field) = SPLAY_RIGHT(elm, field) = NULL; \ - } else { \ - int __comp; \ - name##_SPLAY(head, elm); \ - __comp = (cmp)(elm, (head)->sph_root); \ - if(__comp < 0) { \ - SPLAY_LEFT(elm, field) = SPLAY_LEFT((head)->sph_root, field);\ - SPLAY_RIGHT(elm, field) = (head)->sph_root; \ - SPLAY_LEFT((head)->sph_root, field) = NULL; \ - } else if (__comp > 0) { \ - SPLAY_RIGHT(elm, field) = SPLAY_RIGHT((head)->sph_root, field);\ - SPLAY_LEFT(elm, field) = (head)->sph_root; \ - SPLAY_RIGHT((head)->sph_root, field) = NULL; \ - } else \ - return ((head)->sph_root); \ - } \ - (head)->sph_root = (elm); \ - return (NULL); \ -} \ - \ -struct type * \ -name##_SPLAY_REMOVE(struct name *head, struct type *elm) \ -{ \ - struct type *__tmp; \ - if (SPLAY_EMPTY(head)) \ - return (NULL); \ - name##_SPLAY(head, elm); \ - if ((cmp)(elm, (head)->sph_root) == 0) { \ - if (SPLAY_LEFT((head)->sph_root, field) == NULL) { \ - (head)->sph_root = SPLAY_RIGHT((head)->sph_root, field);\ - } else { \ - __tmp = SPLAY_RIGHT((head)->sph_root, field); \ - (head)->sph_root = SPLAY_LEFT((head)->sph_root, field);\ - name##_SPLAY(head, elm); \ - SPLAY_RIGHT((head)->sph_root, field) = __tmp; \ - } \ - return (elm); \ - } \ - return (NULL); \ -} \ - \ -void \ -name##_SPLAY(struct name *head, struct type *elm) \ -{ \ - struct type __node, *__left, *__right, *__tmp; \ - int __comp; \ -\ - SPLAY_LEFT(&__node, field) = SPLAY_RIGHT(&__node, field) = NULL;\ - __left = __right = &__node; \ -\ - while ((__comp = (cmp)(elm, (head)->sph_root)) != 0) { \ - if (__comp < 0) { \ - __tmp = SPLAY_LEFT((head)->sph_root, field); \ - if (__tmp == NULL) \ - break; \ - if ((cmp)(elm, __tmp) < 0){ \ - SPLAY_ROTATE_RIGHT(head, __tmp, field); \ - if (SPLAY_LEFT((head)->sph_root, field) == NULL)\ - break; \ - } \ - SPLAY_LINKLEFT(head, __right, field); \ - } else if (__comp > 0) { \ - __tmp = SPLAY_RIGHT((head)->sph_root, field); \ - if (__tmp == NULL) \ - break; \ - if ((cmp)(elm, __tmp) > 0){ \ - SPLAY_ROTATE_LEFT(head, __tmp, field); \ - if (SPLAY_RIGHT((head)->sph_root, field) == NULL)\ - break; \ - } \ - SPLAY_LINKRIGHT(head, __left, field); \ - } \ - } \ - SPLAY_ASSEMBLE(head, &__node, __left, __right, field); \ -} \ - \ -/* Splay with either the minimum or the maximum element \ - * Used to find minimum or maximum element in tree. 
\ - */ \ -void name##_SPLAY_MINMAX(struct name *head, int __comp) \ -{ \ - struct type __node, *__left, *__right, *__tmp; \ -\ - SPLAY_LEFT(&__node, field) = SPLAY_RIGHT(&__node, field) = NULL;\ - __left = __right = &__node; \ -\ - while (1) { \ - if (__comp < 0) { \ - __tmp = SPLAY_LEFT((head)->sph_root, field); \ - if (__tmp == NULL) \ - break; \ - if (__comp < 0){ \ - SPLAY_ROTATE_RIGHT(head, __tmp, field); \ - if (SPLAY_LEFT((head)->sph_root, field) == NULL)\ - break; \ - } \ - SPLAY_LINKLEFT(head, __right, field); \ - } else if (__comp > 0) { \ - __tmp = SPLAY_RIGHT((head)->sph_root, field); \ - if (__tmp == NULL) \ - break; \ - if (__comp > 0) { \ - SPLAY_ROTATE_LEFT(head, __tmp, field); \ - if (SPLAY_RIGHT((head)->sph_root, field) == NULL)\ - break; \ - } \ - SPLAY_LINKRIGHT(head, __left, field); \ - } \ - } \ - SPLAY_ASSEMBLE(head, &__node, __left, __right, field); \ -} - -#define SPLAY_NEGINF -1 -#define SPLAY_INF 1 - -#define SPLAY_INSERT(name, x, y) name##_SPLAY_INSERT(x, y) -#define SPLAY_REMOVE(name, x, y) name##_SPLAY_REMOVE(x, y) -#define SPLAY_FIND(name, x, y) name##_SPLAY_FIND(x, y) -#define SPLAY_NEXT(name, x, y) name##_SPLAY_NEXT(x, y) -#define SPLAY_MIN(name, x) (SPLAY_EMPTY(x) ? NULL \ - : name##_SPLAY_MIN_MAX(x, SPLAY_NEGINF)) -#define SPLAY_MAX(name, x) (SPLAY_EMPTY(x) ? NULL \ - : name##_SPLAY_MIN_MAX(x, SPLAY_INF)) - -#define SPLAY_FOREACH(x, name, head) \ - for ((x) = SPLAY_MIN(name, head); \ - (x) != NULL; \ - (x) = SPLAY_NEXT(name, head, x)) - -/* Macros that define a red-black tree */ -#define RB_HEAD(name, type) \ -struct name { \ - struct type *rbh_root; /* root of the tree */ \ -} - -#define RB_INITIALIZER(root) \ - { NULL } - -#define RB_INIT(root) do { \ - (root)->rbh_root = NULL; \ -} while (/*CONSTCOND*/ 0) - -#define RB_BLACK 0 -#define RB_RED 1 -#define RB_ENTRY(type) \ -struct { \ - struct type *rbe_left; /* left element */ \ - struct type *rbe_right; /* right element */ \ - struct type *rbe_parent; /* parent element */ \ - int rbe_color; /* node color */ \ -} - -#define RB_LEFT(elm, field) (elm)->field.rbe_left -#define RB_RIGHT(elm, field) (elm)->field.rbe_right -#define RB_PARENT(elm, field) (elm)->field.rbe_parent -#define RB_COLOR(elm, field) (elm)->field.rbe_color -#define RB_ROOT(head) (head)->rbh_root -#define RB_EMPTY(head) (RB_ROOT(head) == NULL) - -#define RB_SET(elm, parent, field) do { \ - RB_PARENT(elm, field) = parent; \ - RB_LEFT(elm, field) = RB_RIGHT(elm, field) = NULL; \ - RB_COLOR(elm, field) = RB_RED; \ -} while (/*CONSTCOND*/ 0) - -#define RB_SET_BLACKRED(black, red, field) do { \ - RB_COLOR(black, field) = RB_BLACK; \ - RB_COLOR(red, field) = RB_RED; \ -} while (/*CONSTCOND*/ 0) - -#ifndef RB_AUGMENT -#define RB_AUGMENT(x) (void)(x) -#endif - -#define RB_ROTATE_LEFT(head, elm, tmp, field) do { \ - (tmp) = RB_RIGHT(elm, field); \ - if ((RB_RIGHT(elm, field) = RB_LEFT(tmp, field)) != NULL) { \ - RB_PARENT(RB_LEFT(tmp, field), field) = (elm); \ - } \ - RB_AUGMENT(elm); \ - if ((RB_PARENT(tmp, field) = RB_PARENT(elm, field)) != NULL) { \ - if ((elm) == RB_LEFT(RB_PARENT(elm, field), field)) \ - RB_LEFT(RB_PARENT(elm, field), field) = (tmp); \ - else \ - RB_RIGHT(RB_PARENT(elm, field), field) = (tmp); \ - } else \ - (head)->rbh_root = (tmp); \ - RB_LEFT(tmp, field) = (elm); \ - RB_PARENT(elm, field) = (tmp); \ - RB_AUGMENT(tmp); \ - if ((RB_PARENT(tmp, field))) \ - RB_AUGMENT(RB_PARENT(tmp, field)); \ -} while (/*CONSTCOND*/ 0) - -#define RB_ROTATE_RIGHT(head, elm, tmp, field) do { \ - (tmp) = RB_LEFT(elm, field); \ - if 
((RB_LEFT(elm, field) = RB_RIGHT(tmp, field)) != NULL) { \ - RB_PARENT(RB_RIGHT(tmp, field), field) = (elm); \ - } \ - RB_AUGMENT(elm); \ - if ((RB_PARENT(tmp, field) = RB_PARENT(elm, field)) != NULL) { \ - if ((elm) == RB_LEFT(RB_PARENT(elm, field), field)) \ - RB_LEFT(RB_PARENT(elm, field), field) = (tmp); \ - else \ - RB_RIGHT(RB_PARENT(elm, field), field) = (tmp); \ - } else \ - (head)->rbh_root = (tmp); \ - RB_RIGHT(tmp, field) = (elm); \ - RB_PARENT(elm, field) = (tmp); \ - RB_AUGMENT(tmp); \ - if ((RB_PARENT(tmp, field))) \ - RB_AUGMENT(RB_PARENT(tmp, field)); \ -} while (/*CONSTCOND*/ 0) - -/* Generates prototypes and inline functions */ -#define RB_PROTOTYPE(name, type, field, cmp) \ -void name##_RB_INSERT_COLOR(struct name *, struct type *); \ -void name##_RB_REMOVE_COLOR(struct name *, struct type *, struct type *);\ -struct type *name##_RB_REMOVE(struct name *, struct type *); \ -struct type *name##_RB_INSERT(struct name *, struct type *); \ -struct type *name##_RB_FIND(struct name *, struct type *); \ -struct type *name##_RB_NEXT(struct type *); \ -struct type *name##_RB_MINMAX(struct name *, int); - -/* Generates prototypes (with storage class) and inline functions */ -#define RB_PROTOTYPE_SC(_sc_, name, type, field, cmp) \ -_sc_ void name##_RB_INSERT_COLOR(struct name *, struct type *); \ -_sc_ void name##_RB_REMOVE_COLOR(struct name *, struct type *, struct type *); \ -_sc_ struct type *name##_RB_REMOVE(struct name *, struct type *); \ -_sc_ struct type *name##_RB_INSERT(struct name *, struct type *); \ -_sc_ struct type *name##_RB_FIND(struct name *, struct type *); \ -_sc_ struct type *name##_RB_NEXT(struct type *); \ -_sc_ struct type *name##_RB_MINMAX(struct name *, int); - -/* Main rb operation. - * Moves node close to the key of elm to top - */ -#define RB_GENERATE(name, type, field, cmp) \ -void \ -name##_RB_INSERT_COLOR(struct name *head, struct type *elm) \ -{ \ - struct type *parent, *gparent, *tmp; \ - while ((parent = RB_PARENT(elm, field)) != NULL && \ - RB_COLOR(parent, field) == RB_RED) { \ - gparent = RB_PARENT(parent, field); \ - if (parent == RB_LEFT(gparent, field)) { \ - tmp = RB_RIGHT(gparent, field); \ - if (tmp && RB_COLOR(tmp, field) == RB_RED) { \ - RB_COLOR(tmp, field) = RB_BLACK; \ - RB_SET_BLACKRED(parent, gparent, field);\ - elm = gparent; \ - continue; \ - } \ - if (RB_RIGHT(parent, field) == elm) { \ - RB_ROTATE_LEFT(head, parent, tmp, field);\ - tmp = parent; \ - parent = elm; \ - elm = tmp; \ - } \ - RB_SET_BLACKRED(parent, gparent, field); \ - RB_ROTATE_RIGHT(head, gparent, tmp, field); \ - } else { \ - tmp = RB_LEFT(gparent, field); \ - if (tmp && RB_COLOR(tmp, field) == RB_RED) { \ - RB_COLOR(tmp, field) = RB_BLACK; \ - RB_SET_BLACKRED(parent, gparent, field);\ - elm = gparent; \ - continue; \ - } \ - if (RB_LEFT(parent, field) == elm) { \ - RB_ROTATE_RIGHT(head, parent, tmp, field);\ - tmp = parent; \ - parent = elm; \ - elm = tmp; \ - } \ - RB_SET_BLACKRED(parent, gparent, field); \ - RB_ROTATE_LEFT(head, gparent, tmp, field); \ - } \ - } \ - RB_COLOR(head->rbh_root, field) = RB_BLACK; \ -} \ - \ -void \ -name##_RB_REMOVE_COLOR(struct name *head, struct type *parent, struct type *elm) \ -{ \ - struct type *tmp; \ - while ((elm == NULL || RB_COLOR(elm, field) == RB_BLACK) && \ - elm != RB_ROOT(head)) { \ - if (RB_LEFT(parent, field) == elm) { \ - tmp = RB_RIGHT(parent, field); \ - if (RB_COLOR(tmp, field) == RB_RED) { \ - RB_SET_BLACKRED(tmp, parent, field); \ - RB_ROTATE_LEFT(head, parent, tmp, field);\ - tmp = RB_RIGHT(parent, field); 
\ - } \ - if ((RB_LEFT(tmp, field) == NULL || \ - RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) &&\ - (RB_RIGHT(tmp, field) == NULL || \ - RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK)) {\ - RB_COLOR(tmp, field) = RB_RED; \ - elm = parent; \ - parent = RB_PARENT(elm, field); \ - } else { \ - if (RB_RIGHT(tmp, field) == NULL || \ - RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK) {\ - struct type *oleft; \ - if ((oleft = RB_LEFT(tmp, field)) \ - != NULL) \ - RB_COLOR(oleft, field) = RB_BLACK;\ - RB_COLOR(tmp, field) = RB_RED; \ - RB_ROTATE_RIGHT(head, tmp, oleft, field);\ - tmp = RB_RIGHT(parent, field); \ - } \ - RB_COLOR(tmp, field) = RB_COLOR(parent, field);\ - RB_COLOR(parent, field) = RB_BLACK; \ - if (RB_RIGHT(tmp, field)) \ - RB_COLOR(RB_RIGHT(tmp, field), field) = RB_BLACK;\ - RB_ROTATE_LEFT(head, parent, tmp, field);\ - elm = RB_ROOT(head); \ - break; \ - } \ - } else { \ - tmp = RB_LEFT(parent, field); \ - if (RB_COLOR(tmp, field) == RB_RED) { \ - RB_SET_BLACKRED(tmp, parent, field); \ - RB_ROTATE_RIGHT(head, parent, tmp, field);\ - tmp = RB_LEFT(parent, field); \ - } \ - if ((RB_LEFT(tmp, field) == NULL || \ - RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) &&\ - (RB_RIGHT(tmp, field) == NULL || \ - RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK)) {\ - RB_COLOR(tmp, field) = RB_RED; \ - elm = parent; \ - parent = RB_PARENT(elm, field); \ - } else { \ - if (RB_LEFT(tmp, field) == NULL || \ - RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) {\ - struct type *oright; \ - if ((oright = RB_RIGHT(tmp, field)) \ - != NULL) \ - RB_COLOR(oright, field) = RB_BLACK;\ - RB_COLOR(tmp, field) = RB_RED; \ - RB_ROTATE_LEFT(head, tmp, oright, field);\ - tmp = RB_LEFT(parent, field); \ - } \ - RB_COLOR(tmp, field) = RB_COLOR(parent, field);\ - RB_COLOR(parent, field) = RB_BLACK; \ - if (RB_LEFT(tmp, field)) \ - RB_COLOR(RB_LEFT(tmp, field), field) = RB_BLACK;\ - RB_ROTATE_RIGHT(head, parent, tmp, field);\ - elm = RB_ROOT(head); \ - break; \ - } \ - } \ - } \ - if (elm) \ - RB_COLOR(elm, field) = RB_BLACK; \ -} \ - \ -struct type * \ -name##_RB_REMOVE(struct name *head, struct type *elm) \ -{ \ - struct type *child, *parent, *old = elm; \ - int color; \ - if (RB_LEFT(elm, field) == NULL) \ - child = RB_RIGHT(elm, field); \ - else if (RB_RIGHT(elm, field) == NULL) \ - child = RB_LEFT(elm, field); \ - else { \ - struct type *left; \ - elm = RB_RIGHT(elm, field); \ - while ((left = RB_LEFT(elm, field)) != NULL) \ - elm = left; \ - child = RB_RIGHT(elm, field); \ - parent = RB_PARENT(elm, field); \ - color = RB_COLOR(elm, field); \ - if (child) \ - RB_PARENT(child, field) = parent; \ - if (parent) { \ - if (RB_LEFT(parent, field) == elm) \ - RB_LEFT(parent, field) = child; \ - else \ - RB_RIGHT(parent, field) = child; \ - RB_AUGMENT(parent); \ - } else \ - RB_ROOT(head) = child; \ - if (RB_PARENT(elm, field) == old) \ - parent = elm; \ - (elm)->field = (old)->field; \ - if (RB_PARENT(old, field)) { \ - if (RB_LEFT(RB_PARENT(old, field), field) == old)\ - RB_LEFT(RB_PARENT(old, field), field) = elm;\ - else \ - RB_RIGHT(RB_PARENT(old, field), field) = elm;\ - RB_AUGMENT(RB_PARENT(old, field)); \ - } else \ - RB_ROOT(head) = elm; \ - RB_PARENT(RB_LEFT(old, field), field) = elm; \ - if (RB_RIGHT(old, field)) \ - RB_PARENT(RB_RIGHT(old, field), field) = elm; \ - if (parent) { \ - left = parent; \ - do { \ - RB_AUGMENT(left); \ - } while ((left = RB_PARENT(left, field)) != NULL); \ - } \ - goto color; \ - } \ - parent = RB_PARENT(elm, field); \ - color = RB_COLOR(elm, field); \ - if (child) 
\ - RB_PARENT(child, field) = parent; \ - if (parent) { \ - if (RB_LEFT(parent, field) == elm) \ - RB_LEFT(parent, field) = child; \ - else \ - RB_RIGHT(parent, field) = child; \ - RB_AUGMENT(parent); \ - } else \ - RB_ROOT(head) = child; \ -color: \ - if (color == RB_BLACK) \ - name##_RB_REMOVE_COLOR(head, parent, child); \ - return (old); \ -} \ - \ -/* Inserts a node into the RB tree */ \ -struct type * \ -name##_RB_INSERT(struct name *head, struct type *elm) \ -{ \ - struct type *tmp; \ - struct type *parent = NULL; \ - int comp = 0; \ - tmp = RB_ROOT(head); \ - while (tmp) { \ - parent = tmp; \ - comp = (cmp)(elm, parent); \ - if (comp < 0) \ - tmp = RB_LEFT(tmp, field); \ - else if (comp > 0) \ - tmp = RB_RIGHT(tmp, field); \ - else \ - return (tmp); \ - } \ - RB_SET(elm, parent, field); \ - if (parent != NULL) { \ - if (comp < 0) \ - RB_LEFT(parent, field) = elm; \ - else \ - RB_RIGHT(parent, field) = elm; \ - RB_AUGMENT(parent); \ - } else \ - RB_ROOT(head) = elm; \ - name##_RB_INSERT_COLOR(head, elm); \ - return (NULL); \ -} \ - \ -/* Finds the node with the same key as elm */ \ -struct type * \ -name##_RB_FIND(struct name *head, struct type *elm) \ -{ \ - struct type *tmp = RB_ROOT(head); \ - int comp; \ - while (tmp) { \ - comp = cmp(elm, tmp); \ - if (comp < 0) \ - tmp = RB_LEFT(tmp, field); \ - else if (comp > 0) \ - tmp = RB_RIGHT(tmp, field); \ - else \ - return (tmp); \ - } \ - return (NULL); \ -} \ - \ -/* ARGSUSED */ \ -struct type * \ -name##_RB_NEXT(struct type *elm) \ -{ \ - if (RB_RIGHT(elm, field)) { \ - elm = RB_RIGHT(elm, field); \ - while (RB_LEFT(elm, field)) \ - elm = RB_LEFT(elm, field); \ - } else { \ - if (RB_PARENT(elm, field) && \ - (elm == RB_LEFT(RB_PARENT(elm, field), field))) \ - elm = RB_PARENT(elm, field); \ - else { \ - while (RB_PARENT(elm, field) && \ - (elm == RB_RIGHT(RB_PARENT(elm, field), field)))\ - elm = RB_PARENT(elm, field); \ - elm = RB_PARENT(elm, field); \ - } \ - } \ - return (elm); \ -} \ - \ -struct type * \ -name##_RB_MINMAX(struct name *head, int val) \ -{ \ - struct type *tmp = RB_ROOT(head); \ - struct type *parent = NULL; \ - while (tmp) { \ - parent = tmp; \ - if (val < 0) \ - tmp = RB_LEFT(tmp, field); \ - else \ - tmp = RB_RIGHT(tmp, field); \ - } \ - return (parent); \ -} - -#define RB_NEGINF -1 -#define RB_INF 1 - -#define RB_INSERT(name, x, y) name##_RB_INSERT(x, y) -#define RB_REMOVE(name, x, y) name##_RB_REMOVE(x, y) -#define RB_FIND(name, x, y) name##_RB_FIND(x, y) -#define RB_NEXT(name, x, y) name##_RB_NEXT(y) -#define RB_MIN(name, x) name##_RB_MINMAX(x, RB_NEGINF) -#define RB_MAX(name, x) name##_RB_MINMAX(x, RB_INF) - -#define RB_FOREACH(x, name, head) \ - for ((x) = RB_MIN(name, head); \ - (x) != NULL; \ - (x) = name##_RB_NEXT(x)) - -#endif /* _SYS_TREE_H_ */ +#include <libkern/tree.h> diff --git a/bsd/sys/tty.h b/bsd/sys/tty.h index f0f546c48..ecfb234d5 100644 --- a/bsd/sys/tty.h +++ b/bsd/sys/tty.h @@ -220,6 +220,8 @@ struct clist; #define TS_DSR_OFLOW 0x800000 /* For CDSR_OFLOW. */ #endif +#define TS_IOCTL_NOT_OK 0x1000000 /* Workaround <rdar://....> */ + /* Character type information. 
*/ #define ORDINARY 0 diff --git a/bsd/sys/ubc.h b/bsd/sys/ubc.h index a26ba1caa..4ee9e86cf 100644 --- a/bsd/sys/ubc.h +++ b/bsd/sys/ubc.h @@ -70,6 +70,7 @@ int ubc_setcred(struct vnode *, struct proc *) __deprecated; /* code signing */ struct cs_blob; struct cs_blob *ubc_cs_blob_get(vnode_t, cpu_type_t, off_t); +int cs_entitlements_blob_get(proc_t p, void **, size_t *); #endif /* cluster IO routines */ diff --git a/bsd/sys/ubc_internal.h b/bsd/sys/ubc_internal.h index 775a8457b..d7197f089 100644 --- a/bsd/sys/ubc_internal.h +++ b/bsd/sys/ubc_internal.h @@ -87,6 +87,8 @@ struct cl_readahead { struct cl_writebehind { lck_mtx_t cl_lockw; void * cl_scmap; /* pointer to sparse cluster map */ + off_t cl_last_write; /* offset of the end of the last write */ + off_t cl_seq_written; /* sequentially written bytes */ int cl_sparse_pushes; /* number of pushes outside of the cl_lockw in progress */ int cl_sparse_wait; /* synchronous push is in progress */ int cl_number; /* number of packed write behind clusters currently valid */ @@ -124,6 +126,13 @@ struct ubc_info { struct cl_writebehind *cl_wbehind; /* cluster write behind context */ struct cs_blob *cs_blobs; /* for CODE SIGNING */ +#if CHECK_CS_VALIDATION_BITMAP + void *cs_valid_bitmap; /* right now: used only for signed files on the read-only root volume */ + uint64_t cs_valid_bitmap_size; /* Save original bitmap size in case the file size changes. + * In the future, we may want to reconsider changing the + * underlying bitmap to reflect the new file size changes. + */ +#endif /* CHECK_CS_VALIDATION_BITMAP */ }; /* Defines for ui_flags */ @@ -159,6 +168,7 @@ __private_extern__ uint32_t cluster_hard_throttle_limit(vnode_t, uint32_t *, uin #define UBC_FOR_PAGEOUT 0x0002 memory_object_control_t ubc_getobject(vnode_t, int); +boolean_t ubc_strict_uncached_IO(vnode_t); int ubc_info_init(vnode_t); int ubc_info_init_withsize(vnode_t, off_t); @@ -181,6 +191,8 @@ int ubc_cs_getcdhash(vnode_t, off_t, unsigned char *); kern_return_t ubc_cs_blob_allocate(vm_offset_t *, vm_size_t *); void ubc_cs_blob_deallocate(vm_offset_t, vm_size_t); +kern_return_t ubc_cs_validation_bitmap_allocate( vnode_t ); +void ubc_cs_validation_bitmap_deallocate( vnode_t ); __END_DECLS diff --git a/bsd/sys/ucontext.h b/bsd/sys/ucontext.h index b31d50ed3..249cf5e23 100644 --- a/bsd/sys/ucontext.h +++ b/bsd/sys/ucontext.h @@ -35,12 +35,6 @@ #define __need_mcontext_t #define __need_stack_t #define __need_ucontext_t -#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) -#if defined(__ppc__) || defined(__ppc64__) -#define __need_mcontext64_t -#define __need_ucontext64_t -#endif /* __ppc__|| __ppc64__ */ -#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ #include <sys/_structs.h> #ifndef _SIGSET_T diff --git a/bsd/sys/ucred.h b/bsd/sys/ucred.h index 0d8b0f2a4..6d914a4df 100644 --- a/bsd/sys/ucred.h +++ b/bsd/sys/ucred.h @@ -90,6 +90,7 @@ struct ucred { TAILQ_ENTRY(ucred) cr_link; /* never modify this without KAUTH_CRED_HASH_LOCK */ u_long cr_ref; /* reference count */ +struct posix_cred { /* * The credential hash depends on everything from this point on * (see kauth_cred_get_hashkey) @@ -102,15 +103,9 @@ struct ucred { gid_t cr_rgid; /* real group id */ gid_t cr_svgid; /* saved group id */ uid_t cr_gmuid; /* UID for group membership purposes */ - /* - * XXX - cr_au will be replaced with cr_audit below. - * cr_au is here to keep kexts from breaking. It seems to - * be currently used by the ucred hashing as well. - */ - struct auditinfo cr_au; /* XXX This needs to go away. 
*/ - struct label *cr_label; /* MAC label */ - int cr_flags; /* flags on credential */ +} cr_posix; + struct label *cr_label; /* MAC label */ /* * NOTE: If anything else (besides the flags) * added after the label, you must change @@ -121,6 +116,7 @@ struct ucred { #ifndef _KAUTH_CRED_T #define _KAUTH_CRED_T typedef struct ucred *kauth_cred_t; +typedef struct posix_cred *posix_cred_t; #endif /* !_KAUTH_CRED_T */ /* diff --git a/bsd/sys/un.h b/bsd/sys/un.h index 479058ff2..400620396 100644 --- a/bsd/sys/un.h +++ b/bsd/sys/un.h @@ -83,8 +83,13 @@ struct sockaddr_un { }; #if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) + +/* Level number of get/setsockopt for local domain sockets */ +#define SOL_LOCAL 0 + /* Socket options. */ #define LOCAL_PEERCRED 0x001 /* retrieve peer credentials */ + #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ diff --git a/bsd/sys/unistd.h b/bsd/sys/unistd.h index d80b3bbd3..c778c66f3 100644 --- a/bsd/sys/unistd.h +++ b/bsd/sys/unistd.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -174,6 +174,7 @@ struct accessx_descriptor { #define _PC_REC_XFER_ALIGN 23 /* Recommended buffer alignment */ #define _PC_SYMLINK_MAX 24 /* Max # of bytes in symlink name */ #define _PC_SYNC_IO 25 /* Sync I/O [SIO] supported? */ +#define _PC_XATTR_SIZE_BITS 26 /* # of bits to represent maximum xattr size */ /* configurable system strings */ #define _CS_PATH 1 diff --git a/bsd/sys/unpcb.h b/bsd/sys/unpcb.h index 2376c11f8..a50aebe36 100644 --- a/bsd/sys/unpcb.h +++ b/bsd/sys/unpcb.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2008-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -134,7 +134,7 @@ struct unpcb { unp_gen_t unp_gencnt; /* generation count of this instance */ int unp_flags; /* flags */ struct xucred unp_peercred; /* peer credentials, if applicable */ - lck_mtx_t *unp_mtx; /* per unpcb lock */ + decl_lck_mtx_data( ,unp_mtx); /* per unpcb lock */ int rw_thrcount; /* disconnect should wait for this count to become zero */ }; #endif /* KERNEL */ @@ -155,6 +155,7 @@ struct unpcb { #define UNP_HAVEPC 0x0001 #define UNP_HAVEPCCACHED 0x0002 #define UNP_DONTDISCONNECT 0x0004 +#define UNP_TRACE_MDNS 0x1000 #ifdef KERNEL struct unpcb_compat { diff --git a/bsd/sys/user.h b/bsd/sys/user.h index 66f355110..4a59aa866 100644 --- a/bsd/sys/user.h +++ b/bsd/sys/user.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -79,12 +79,15 @@ #endif #include <sys/vm.h> /* XXX */ #include <sys/sysctl.h> - + + #ifdef KERNEL +#ifdef BSD_KERNEL_PRIVATE +#include <sys/pthread_internal.h> /* for uu_kwe entry */ +#endif /* BSD_KERNEL_PRIVATE */ #ifdef __APPLE_API_PRIVATE #include <sys/eventvar.h> - #if !defined(__LP64__) || defined(XNU_KERNEL_PRIVATE) /* * VFS context structure (part of uthread) @@ -124,7 +127,7 @@ struct uthread { int poll; int error; int count; - int kfcount; + int _reserved1; // UNUSED: avoid changing size for now char * wql; } uu_select; /* saved state for select() */ /* to support kevent continuations */ @@ -156,7 +159,9 @@ struct uthread { caddr_t uu_wchan; /* sleeping thread wait channel */ const char *uu_wmesg; /* ...
wait message */ int uu_flag; +#if CONFIG_EMBEDDED int uu_iopol_disk; /* disk I/O policy */ +#endif /* CONFIG_EMBEDDED */ struct proc * uu_proc; void * uu_userstate; wait_queue_set_t uu_wqset; /* cached across select calls */ @@ -172,12 +177,12 @@ struct uthread { struct kaudit_record *uu_ar; /* audit record */ struct task* uu_aio_task; /* target task for async io */ - - /* network support for dlil layer locking */ - u_int32_t dlil_incremented_read; + lck_mtx_t *uu_mtx; int uu_lowpri_window; + boolean_t uu_throttle_isssd; + boolean_t uu_throttle_bc; void * uu_throttle_info; /* pointer to throttled I/Os info */ struct kern_sigaltstack uu_sigstk; @@ -191,12 +196,14 @@ struct uthread { int uu_iocount; int uu_vpindex; void * uu_vps[32]; + void * uu_pcs[32][10]; #endif #if CONFIG_DTRACE siginfo_t t_dtrace_siginfo; uint32_t t_dtrace_errno; /* Most recent errno */ - uint8_t t_dtrace_stop; /* indicates a DTrace-desired stop */ + uint8_t t_dtrace_stop; /* indicates a DTrace desired stop */ uint8_t t_dtrace_sig; /* signal sent via DTrace's raise() */ + uint64_t t_dtrace_resumepid; /* DTrace's pidresume() pid */ union __tdu { struct __tds { @@ -232,10 +239,7 @@ struct uthread { #endif /* CONFIG_DTRACE */ void * uu_threadlist; char * pth_name; - TAILQ_ENTRY(uthread) uu_mtxlist; /* psynch waiters list*/ - uint32_t uu_lockseq; /* seq on arrival */ - uint32_t uu_psynchretval; /* pmtx retval */ - void * uu_kwqqueue; /* queue blocked on */ + struct ksyn_waitq_element uu_kwe; /* used for pthread synch */ }; typedef struct uthread * uthread_t; @@ -252,7 +256,9 @@ typedef struct uthread * uthread_t; #define UT_PASSIVE_IO 0x00000100 /* this thread issues passive I/O */ #define UT_PROCEXIT 0x00000200 /* this thread completed the proc exit */ #define UT_RAGE_VNODES 0x00000400 /* rapid age any vnodes created by this thread */ +#if CONFIG_EMBEDDED #define UT_BACKGROUND 0x00000800 /* this thread is in background state */ +#endif /* CONFIG_EMBEDDED */ #define UT_BACKGROUND_TRAFFIC_MGT 0x00001000 /* background traffic is regulated */ #define UT_VFORK 0x02000000 /* thread has vfork children */ diff --git a/bsd/sys/vfs_context.h b/bsd/sys/vfs_context.h index 16453bb7a..fd31f99e3 100644 --- a/bsd/sys/vfs_context.h +++ b/bsd/sys/vfs_context.h @@ -5,7 +5,9 @@ #include <sys/types.h> #include <sys/kernel_types.h> #include <kern/thread.h> +#ifdef BSD_KERNEL_PRIVATE #include <sys/user.h> +#endif #include <stdint.h> /* diff --git a/bsd/sys/vnode.h b/bsd/sys/vnode.h index 65620f277..965518cb0 100644 --- a/bsd/sys/vnode.h +++ b/bsd/sys/vnode.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -145,6 +145,7 @@ enum vtagtype { #define IO_BACKGROUND IO_PASSIVE /* used for backward compatibility. to be removed after IO_BACKGROUND is no longer * used by DiskImages in-kernel mode */ #define IO_NOAUTH 0x8000 /* No authorization checks.
*/ +#define IO_NODIRECT 0x10000 /* don't use direct synchronous writes if IO_NOCACHE is specified */ /* @@ -159,15 +160,15 @@ struct componentname { uint32_t cn_flags; /* flags (see below) */ #ifdef BSD_KERNEL_PRIVATE vfs_context_t cn_context; - void * pad_obsolete2; + struct nameidata *cn_ndp; /* pointer back to nameidata */ /* XXX use of these defines are deprecated */ #define cn_proc (cn_context->vc_proc + 0) /* non-lvalue */ #define cn_cred (cn_context->vc_ucred + 0) /* non-lvalue */ #else - void * obsolete1; /* use vfs_context_t */ - void * obsolete2; /* use vfs_context_t */ + void * cn_reserved1; /* use vfs_context_t */ + void * cn_reserved2; /* use vfs_context_t */ #endif /* * Shared between lookup and commit routines. @@ -201,8 +202,8 @@ struct componentname { #define ISDOTDOT 0x00002000 /* current component name is .. */ #define MAKEENTRY 0x00004000 /* entry is to be added to name cache */ #define ISLASTCN 0x00008000 /* this is last component of pathname */ -#define ISWHITEOUT 0x00020000 /* found whiteout */ -#define DOWHITEOUT 0x00040000 /* do whiteouts */ +#define ISWHITEOUT 0x00020000 /* OBSOLETE: found whiteout */ +#define DOWHITEOUT 0x00040000 /* OBSOLETE: do whiteouts */ /* The following structure specifies a vnode for creation */ @@ -228,6 +229,234 @@ struct vnode_fsparam { #define VNCREATE_FLAVOR 0 #define VCREATESIZE sizeof(struct vnode_fsparam) + +#ifdef KERNEL_PRIVATE +/* + * Resolver callback SPI for trigger vnodes + * + * Only available from kernels built with CONFIG_TRIGGERS option + */ + +/*! + @enum Pathname Lookup Operations + @abstract Constants defining pathname operations (passed to resolver callbacks) + */ +enum path_operation { + OP_LOOKUP, + OP_MOUNT, + OP_UNMOUNT, + OP_STATFS, + OP_OPEN, + OP_LINK, + OP_UNLINK, + OP_RENAME, + OP_CHDIR, + OP_CHROOT, + OP_MKNOD, + OP_MKFIFO, + OP_SYMLINK, + OP_ACCESS, + OP_PATHCONF, + OP_READLINK, + OP_GETATTR, + OP_SETATTR, + OP_TRUNCATE, + OP_COPYFILE, + OP_MKDIR, + OP_RMDIR, + OP_REVOKE, + OP_EXCHANGEDATA, + OP_SEARCHFS, + OP_FSCTL, + OP_GETXATTR, + OP_SETXATTR, + OP_REMOVEXATTR, + OP_LISTXATTR, + OP_MAXOP /* anything beyond previous entry is invalid */ +}; + +/* + * is operation a traditional trigger (autofs)? + * 1 if trigger, 0 if no trigger + */ +extern int vfs_istraditionaltrigger(enum path_operation op, const struct componentname *cnp); + +/*! + @enum resolver status + @abstract Constants defining resolver status + @constant RESOLVER_RESOLVED the resolver has finished (typically means a successful mount) + @constant RESOLVER_NOCHANGE the resolver status didn't change + @constant RESOLVER_UNRESOLVED the resolver has finished (typically means a successful unmount) + @constant RESOLVER_ERROR the resolver encountered an error (errno passed in aux value) + @constant RESOLVER_STOP a request to destroy trigger XXX do we need this??? + */ +enum resolver_status { + RESOLVER_RESOLVED, + RESOLVER_NOCHANGE, + RESOLVER_UNRESOLVED, + RESOLVER_ERROR, + RESOLVER_STOP +}; + +typedef uint64_t resolver_result_t; + +/* + * Compound resolver result + * + * The trigger vnode callbacks use a compound result value. In addition + * to the resolver status, it contains a sequence number and an auxiliary + * value. + * + * The sequence value is used by VFS to sequence-stamp trigger vnode + * state transitions. It is expected to be incremented each time a + * resolver changes state (ie resolved or unresolved). A result + * containing a stale sequence (older than a trigger vnode's current + * value) will be ignored by VFS. 
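+ * [Editorial sketch: a resolve callback composing such results. The
+ * per-resolver state holding the sequence counter and the mount
+ * helper are hypothetical; the callback shape and
+ * vfs_resolver_result() are the SPI declared below.
+ *
+ *	static resolver_result_t
+ *	my_resolve(vnode_t vp, const struct componentname *cnp,
+ *	    enum path_operation pop, int flags, void *data,
+ *	    vfs_context_t ctx)
+ *	{
+ *		struct my_state *sp = data;		-- hypothetical state
+ *		int error = my_do_mount(vp, ctx);	-- hypothetical helper
+ *
+ *		if (error != 0)
+ *			return vfs_resolver_result(sp->seq,
+ *			    RESOLVER_ERROR, error);
+ *		return vfs_resolver_result(++sp->seq,
+ *		    RESOLVER_RESOLVED, 0);
+ *	}
+ *
+ * The sequence is bumped only when the resolver actually changes
+ * state; a failed resolve reports the current sequence so VFS can
+ * still detect staleness.]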
+ * + * The auxiliary value is currently only used to deliver the errno + * value for RESOLVER_ERROR status conditions. When a RESOLVER_ERROR + * occurs, VFS will propagate this error back to the syscall that + * encountered the trigger vnode. + */ +extern resolver_result_t vfs_resolver_result(uint32_t seq, enum resolver_status stat, int aux); + +/* + * Extract values from a compound resolver result + */ +extern enum resolver_status vfs_resolver_status(resolver_result_t); +extern uint32_t vfs_resolver_sequence(resolver_result_t); +extern int vfs_resolver_auxiliary(resolver_result_t); + + +/*! + @typedef trigger_vnode_resolve_callback_t + @abstract function prototype for a trigger vnode resolve callback + @discussion This function is associated with a trigger vnode during a vnode create. It is + typically called when a lookup operation occurs for a trigger vnode + @param vp The trigger vnode which needs resolving + @param cnp Various data about lookup, e.g. filename and state flags + @param pop The pathname operation that initiated the lookup (see enum path_operation). + @param flags + @param data Arbitrary data supplied by vnode trigger creator + @param ctx Context for authentication. + @return RESOLVER_RESOLVED, RESOLVER_NOCHANGE, RESOLVER_UNRESOLVED or RESOLVER_ERROR +*/ +typedef resolver_result_t (* trigger_vnode_resolve_callback_t)( + vnode_t vp, + const struct componentname * cnp, + enum path_operation pop, + int flags, + void * data, + vfs_context_t ctx); + +/*! + @typedef trigger_vnode_unresolve_callback_t + @abstract function prototype for a trigger vnode unresolve callback + @discussion This function is associated with a trigger vnode during a vnode create. It is + called to unresolve a trigger vnode (typically this means unmount). + @param vp The trigger vnode which needs unresolving + @param flags Unmount flags + @param data Arbitrary data supplied by vnode trigger creator + @param ctx Context for authentication. + @return RESOLVER_NOCHANGE, RESOLVER_UNRESOLVED or RESOLVER_ERROR +*/ +typedef resolver_result_t (* trigger_vnode_unresolve_callback_t)( + vnode_t vp, + int flags, + void * data, + vfs_context_t ctx); + +/*! + @typedef trigger_vnode_rearm_callback_t + @abstract function prototype for a trigger vnode rearm callback + @discussion This function is associated with a trigger vnode during a vnode create. It is + called to verify a rearm from VFS (i.e. should VFS rearm the trigger?). + @param vp The trigger vnode which needs rearming + @param flags + @param data Arbitrary data supplied by vnode trigger creator + @param ctx Context for authentication. + @return RESOLVER_NOCHANGE or RESOLVER_ERROR +*/ +typedef resolver_result_t (* trigger_vnode_rearm_callback_t)( + vnode_t vp, + int flags, + void * data, + vfs_context_t ctx); + +/*! + @typedef trigger_vnode_reclaim_callback_t + @abstract function prototype for a trigger vnode reclaim callback + @discussion This function is associated with a trigger vnode during a vnode create. It is + called to deallocate private callback argument data + @param vp The trigger vnode associated with the data + @param data The arbitrary data supplied by vnode trigger creator +*/ +typedef void (* trigger_vnode_reclaim_callback_t)( + vnode_t vp, + void * data); + +/*! + @function vnode_trigger_update + @abstract Update a trigger vnode's state. + @discussion This allows a resolver to notify VFS of a state change in a trigger vnode. + @param vp The trigger vnode whose information to update. 
+ @param result A compound resolver result value + @return EINVAL if result value is invalid or vp isn't a trigger vnode + */ +extern int vnode_trigger_update(vnode_t vp, resolver_result_t result); + +struct vnode_trigger_info { + trigger_vnode_resolve_callback_t vti_resolve_func; + trigger_vnode_unresolve_callback_t vti_unresolve_func; + trigger_vnode_rearm_callback_t vti_rearm_func; + trigger_vnode_reclaim_callback_t vti_reclaim_func; + void * vti_data; /* auxiliary data (optional) */ + uint32_t vti_flags; /* optional flags (see below) */ +}; + +/* + * SPI for creating a trigger vnode + * + * Uses the VNCREATE_TRIGGER flavor with existing vnode_create() KPI + * + * Only one resolver per vnode. + * + * ERRORS (in addition to vnode_create errors): + * EINVAL (invalid resolver info, like invalid flags) + * ENOTDIR (only directories can have a resolver) + * EPERM (vnode cannot be a trigger - eg root dir of a file system) + * ENOMEM + */ +struct vnode_trigger_param { + struct vnode_fsparam vnt_params; /* same as for VNCREATE_FLAVOR */ + trigger_vnode_resolve_callback_t vnt_resolve_func; + trigger_vnode_unresolve_callback_t vnt_unresolve_func; + trigger_vnode_rearm_callback_t vnt_rearm_func; + trigger_vnode_reclaim_callback_t vnt_reclaim_func; + void * vnt_data; /* auxiliary data (optional) */ + uint32_t vnt_flags; /* optional flags (see below) */ +}; + +#define VNCREATE_TRIGGER (('T' << 8) + ('V')) +#define VNCREATE_TRIGGER_SIZE sizeof(struct vnode_trigger_param) + +/* + * vnode trigger flags (vnt_flags) + * + * VNT_AUTO_REARM: + * On unmounts of a trigger mount, automatically re-arm the trigger. + * + * VNT_NO_DIRECT_MOUNT: + * A trigger vnode instance that doesn't directly trigger a mount, + * instead it triggers the mounting of sub-trigger nodes. + */ +#define VNT_AUTO_REARM (1 << 0) +#define VNT_NO_DIRECT_MOUNT (1 << 1) +#define VNT_VALID_MASK (VNT_AUTO_REARM | VNT_NO_DIRECT_MOUNT) + +#endif /* KERNEL_PRIVATE */ + + /* * Vnode attributes, new-style. * @@ -287,6 +516,7 @@ struct vnode_fsparam { #define VNODE_ATTR_va_guuid (1LL<<27) /* 08000000 */ #define VNODE_ATTR_va_nchildren (1LL<<28) /* 10000000 */ #define VNODE_ATTR_va_dirlinkcount (1LL<<29) /* 20000000 */ +#define VNODE_ATTR_va_addedtime (1LL<<30) /* 40000000 */ #define VNODE_ATTR_BIT(n) (VNODE_ATTR_ ## n) /* @@ -307,7 +537,8 @@ struct vnode_fsparam { VNODE_ATTR_BIT(va_name) | \ VNODE_ATTR_BIT(va_type) | \ VNODE_ATTR_BIT(va_nchildren) | \ - VNODE_ATTR_BIT(va_dirlinkcount)) + VNODE_ATTR_BIT(va_dirlinkcount)| \ + VNODE_ATTR_BIT(va_addedtime)) /* * Attributes that can be applied to a new file object. */ @@ -381,14 +612,23 @@ struct vnode_attr { uint64_t va_dirlinkcount; /* Real references to dir (i.e. excluding "." and ".." refs) */ /* add new fields here only */ +#ifdef BSD_KERNEL_PRIVATE + struct kauth_acl *va_base_acl; +#else + void * va_reserved1; +#endif /* BSD_KERNEL_PRIVATE */ + struct timespec va_addedtime; /* timestamp when item was added to parent directory */ + }; /* * Flags for va_vaflags. */ -#define VA_UTIMES_NULL 0x010000 /* utimes argument was NULL */ -#define VA_EXCLUSIVE 0x020000 /* exclusive create request */ +#define VA_UTIMES_NULL 0x010000 /* utimes argument was NULL */ +#define VA_EXCLUSIVE 0x020000 /* exclusive create request */ +#define VA_NOINHERIT 0x040000 /* Don't inherit ACLs from parent */ +#define VA_NOAUTH 0x080000 /* * Modes. Some values same as Ixxx entries from inode.h for now. @@ -761,6 +1001,14 @@ int vnode_isnocache(vnode_t); */ int vnode_israge(vnode_t); +/*! 
+ @function vnode_needssnapshots + @abstract Check if a vnode needs snapshots events (regardless of its ctime status) + @param vp The vnode to test. + @return Nonzero if vnode needs snapshot events, 0 otherwise + */ +int vnode_needssnapshots(vnode_t); + /*! @function vnode_setnocache @abstract Set a vnode to not have its data cached in memory (i.e. we write-through to disk and always read from disk). @@ -992,6 +1240,20 @@ int vfs_context_rele(vfs_context_t); vfs_context_t vfs_context_current(void); #ifdef KERNEL_PRIVATE int vfs_context_bind(vfs_context_t); + +/*! + @function vfs_ctx_skipatime + @abstract Check to see if this context should skip updating a vnode's access times. + @discussion This is currently tied to the vnode rapid aging process. If the process is marked for rapid aging, + then the kernel should not update vnodes it touches for access time purposes. This will check to see if the + specified process and/or thread is marked for rapid aging when it manipulates vnodes. + @param ctx The context being investigated. + @return 1 if we should skip access time updates. + @return 0 if we should NOT skip access time updates. + */ + +int vfs_ctx_skipatime(vfs_context_t ctx); + #endif /*! @@ -1048,6 +1310,10 @@ int vnode_get(vnode_t); */ int vnode_getwithvid(vnode_t, uint32_t); +#ifdef BSD_KERNEL_PRIVATE +int vnode_getwithvid_drainok(vnode_t, uint32_t); +#endif /* BSD_KERNEL_PRIVATE */ + /*! @function vnode_getwithref @abstract Increase the iocount on a vnode on which a usecount (persistent reference) is held. @@ -1172,6 +1438,17 @@ int vnode_notify(vnode_t, uint32_t, struct vnode_attr*); */ int vnode_ismonitored(vnode_t); + +/*! + @function vnode_isdyldsharedcache + @abstract Check whether a file is a dyld shared cache file. + @param vp Vnode to examine. + @discussion Will not reenter the filesystem. + @return nonzero if a dyld shared cache file, zero otherwise. + */ +int vnode_isdyldsharedcache(vnode_t); + + /*! @function vfs_get_notify_attributes @abstract Determine what attributes are required to send up a notification with vnode_notify(). @@ -1298,7 +1575,7 @@ int vn_getpath(struct vnode *vp, char *pathbuf, int *len); */ #define VNODE_LOOKUP_NOFOLLOW 0x01 #define VNODE_LOOKUP_NOCROSSMOUNT 0x02 -#define VNODE_LOOKUP_DOWHITEOUT 0x04 +#define VNODE_LOOKUP_DOWHITEOUT 0x04 /* OBSOLETE */ /*! @function vnode_lookup @abstract Convert a path into a vnode. @@ -1368,6 +1645,7 @@ int vnode_iterate(struct mount *, int, int (*)(struct vnode *, void *), void *); #define VNODE_ITERATE_INACTIVE 0x200 #ifdef BSD_KERNEL_PRIVATE #define VNODE_ALWAYS 0x400 +#define VNODE_DRAINO 0x800 #endif /* BSD_KERNEL_PRIVATE */ /* @@ -1545,6 +1823,20 @@ void vnode_putname(const char *name); */ vnode_t vnode_getparent(vnode_t vp); +#ifdef KERNEL_PRIVATE +/*! + @function vnode_lookup_continue_needed + @abstract Determine whether vnode needs additional processing in VFS before being opened. + @discussion If result is zero, filesystem can open this vnode. If result is nonzero, + additional processing is needed in VFS (e.g. symlink, mountpoint). Nonzero results should + be passed up to VFS. + @param vp Vnode to consider opening (found by filesystem). + @param cnp Componentname as passed to filesystem from VFS. + @result 0 to indicate that a vnode can be opened, or an error that should be passed up to VFS. 
+ */ +int vnode_lookup_continue_needed(vnode_t vp, struct componentname *cnp); +#endif /* KERNEL_PRIVATE */ + #ifdef BSD_KERNEL_PRIVATE /* Not in export list so can be private */ struct stat; diff --git a/bsd/sys/vnode_if.h b/bsd/sys/vnode_if.h index 66812b08d..33ae10047 100644 --- a/bsd/sys/vnode_if.h +++ b/bsd/sys/vnode_if.h @@ -92,6 +92,13 @@ extern struct vnodeop_desc vnop_default_desc; extern struct vnodeop_desc vnop_lookup_desc; +#ifdef KERNEL_PRIVATE +extern struct vnodeop_desc vnop_compound_open_desc; +extern struct vnodeop_desc vnop_compound_remove_desc; +extern struct vnodeop_desc vnop_compound_rename_desc; +extern struct vnodeop_desc vnop_compound_mkdir_desc; +extern struct vnodeop_desc vnop_compound_rmdir_desc; +#endif /* KERNEL_PRIVATE */ extern struct vnodeop_desc vnop_create_desc; extern struct vnodeop_desc vnop_whiteout_desc; extern struct vnodeop_desc vnop_mknod_desc; @@ -257,6 +264,44 @@ struct vnop_open_args { vfs_context_t a_context; }; +#ifdef KERNEL_PRIVATE +struct vnop_compound_open_args { + struct vnodeop_desc *a_desc; + + vnode_t a_dvp; /* Directory in which to open/create */ + vnode_t *a_vpp; /* Resulting vnode */ + int a_fmode; /* Open mode */ + struct componentname *a_cnp; /* Path to look up */ + struct vnode_attr *a_vap; /* Attributes with which to create, if appropriate */ + uint32_t a_flags; /* VNOP-control flags */ + uint32_t *a_status; /* Information about results */ + + vfs_context_t a_context; /* Authorization context */ + + int (*a_open_create_authorizer)( /* Authorizer for create case */ + vnode_t dvp, /* Directory in which to create */ + struct componentname *cnp, /* As passed to VNOP */ + struct vnode_attr *vap, /* As passed to VNOP */ + vfs_context_t ctx, /* Context */ + void *reserved); /* Who knows */ + + int (*a_open_existing_authorizer)( /* Authorizer for preexisting case */ + vnode_t vp, /* vp to open */ + struct componentname *cnp, /* Lookup state */ + int fmode, /* As passed to VNOP */ + vfs_context_t ctx, /* Context */ + void *reserved); /* Who knows */ + + void *a_reserved; +}; + +/* Control flags */ +#define VNOP_COMPOUND_OPEN_DO_CREATE 0x00000001 + +/* Results */ +#define COMPOUND_OPEN_STATUS_DID_CREATE 0x00000001 +#endif /* KERNEL_PRIVATE */ + /*! @function VNOP_OPEN @abstract Call down to a filesystem to open a file. @@ -272,6 +317,11 @@ struct vnop_open_args { extern errno_t VNOP_OPEN(vnode_t, int, vfs_context_t); #endif /* XNU_KERNEL_PRIVATE */ +#ifdef BSD_KERNEL_PRIVATE +struct nameidata; +extern int VNOP_COMPOUND_OPEN(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, int32_t flags, int32_t fmode, uint32_t *status, struct vnode_attr *vap, vfs_context_t ctx); +#endif + struct vnop_close_args { struct vnodeop_desc *a_desc; vnode_t a_vp; @@ -381,8 +431,7 @@ struct vnop_read_args { @discussion VNOP_READ() is where the hard work of of the read() system call happens. The filesystem may use the buffer cache, the cluster layer, or an alternative method to get its data; uio routines will be used to see that data is copied to the correct virtual address in the correct address space and will update its uio argument - to indicate how much data has been moved. Filesystems will not receive a read request on a file without having - first received a VNOP_OPEN(). + to indicate how much data has been moved. @param vp The vnode to read from. @param uio Description of request, including file offset, amount of data requested, destination address for data, and whether that destination is in kernel or user space. 
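For orientation, here is a sketch of how a filesystem might implement the compound-open entry point declared above. The argument block, VNOP_COMPOUND_OPEN_DO_CREATE and COMPOUND_OPEN_STATUS_DID_CREATE come from this header; myfs_lookup_locked(), myfs_create() and myfs_open_internal() are hypothetical helpers standing in for the filesystem's own lookup, create and open paths.

static int myfs_lookup_locked(vnode_t dvp, struct componentname *cnp, vnode_t *vpp);		/* hypothetical */
static int myfs_create(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vnode_t *vpp);	/* hypothetical */
static int myfs_open_internal(vnode_t vp, int fmode, vfs_context_t ctx);			/* hypothetical */

static int
myfs_vnop_compound_open(struct vnop_compound_open_args *ap)
{
	vnode_t vp = NULLVP;
	int error;

	error = myfs_lookup_locked(ap->a_dvp, ap->a_cnp, &vp);
	if (error == ENOENT && (ap->a_flags & VNOP_COMPOUND_OPEN_DO_CREATE)) {
		/* VFS supplies the create-time policy; ask before creating. */
		error = ap->a_open_create_authorizer(ap->a_dvp, ap->a_cnp,
		    ap->a_vap, ap->a_context, NULL);
		if (error == 0)
			error = myfs_create(ap->a_dvp, ap->a_cnp, ap->a_vap, &vp);
		if (error == 0)
			*ap->a_status |= COMPOUND_OPEN_STATUS_DID_CREATE;
	} else if (error == 0) {
		/* Preexisting file: authorize the open against the vnode we found. */
		error = ap->a_open_existing_authorizer(vp, ap->a_cnp,
		    ap->a_fmode, ap->a_context, NULL);
	}
	if (error == 0)
		error = myfs_open_internal(vp, ap->a_fmode, ap->a_context);
	if (error == 0) {
		*ap->a_vpp = vp;	/* hand back with the iocount from lookup */
	} else if (vp != NULLVP) {
		vnode_put(vp);
	}
	return error;
}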
@@ -406,8 +455,7 @@ struct vnop_write_args { @discussion VNOP_WRITE() is to write() as VNOP_READ() is to read(). The filesystem may use the buffer cache, the cluster layer, or an alternative method to write its data; uio routines will be used to see that data is copied to the correct virtual address in the correct address space and will update its uio argument - to indicate how much data has been moved. Filesystems will not receive a write request on a file without having - first received a VNOP_OPEN(). + to indicate how much data has been moved. @param vp The vnode to write to. @param uio Description of request, including file offset, amount of data to write, source address for data, and whether that destination is in kernel or user space. @@ -600,6 +648,28 @@ struct vnop_remove_args { extern errno_t VNOP_REMOVE(vnode_t, vnode_t, struct componentname *, int, vfs_context_t); #endif /* XNU_KERNEL_PRIVATE */ +#ifdef KERNEL_PRIVATE +struct vnop_compound_remove_args { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; /* Directory in which to lookup and remove */ + vnode_t *a_vpp; /* File to remove; may or may not point to NULL pointer */ + struct componentname *a_cnp; /* Name of file to remove */ + struct vnode_attr *a_vap; /* Destination for file attributes on successful delete */ + uint32_t a_flags; /* Control flags (unused) */ + vfs_context_t a_context; /* Authorization context */ + int (*a_remove_authorizer)( /* Authorizer callback */ + vnode_t dvp, /* Directory in which to delete */ + vnode_t vp, /* File to delete */ + struct componentname *cnp, /* As passed to VNOP */ + vfs_context_t ctx, /* As passed to VNOP */ + void *reserved); /* Always NULL */ + void *a_reserved; /* Unused */ +}; +#endif /* KERNEL_PRIVATE */ + +#ifdef BSD_KERNEL_PRIVATE +extern errno_t VNOP_COMPOUND_REMOVE(vnode_t, vnode_t*, struct nameidata *, int32_t flags, struct vnode_attr *vap, vfs_context_t); +#endif struct vnop_link_args { struct vnodeop_desc *a_desc; vnode_t a_vp; @@ -650,6 +720,43 @@ struct vnop_rename_args { extern errno_t VNOP_RENAME(vnode_t, vnode_t, struct componentname *, vnode_t, vnode_t, struct componentname *, vfs_context_t); #endif /* XNU_KERNEL_PRIVATE */ +#ifdef KERNEL_PRIVATE +struct vnop_compound_rename_args { + struct vnodeop_desc *a_desc; + + vnode_t a_fdvp; /* Directory from which to rename */ + vnode_t *a_fvpp; /* Vnode to rename (can point to a NULL pointer) */ + struct componentname *a_fcnp; /* Source name */ + struct vnode_attr *a_fvap; + + vnode_t a_tdvp; /* Directory to which to rename */ + vnode_t *a_tvpp; /* Vnode to rename over (can point to a NULL pointer) */ + struct componentname *a_tcnp; /* Destination name */ + struct vnode_attr *a_tvap; + + uint32_t a_flags; /* Control flags: currently unused */ + vfs_context_t a_context; /* Authorization context */ + int (*a_rename_authorizer)( /* Authorization callback */ + vnode_t fdvp, /* As passed to VNOP */ + vnode_t fvp, /* Vnode to rename */ + struct componentname *fcnp, /* As passed to VNOP */ + vnode_t tdvp, /* As passed to VNOP */ + vnode_t tvp, /* Vnode to rename over (can be NULL) */ + struct componentname *tcnp, /* As passed to VNOP */ + vfs_context_t ctx, /* As passed to VNOP */ + void *reserved); /* Always NULL */ + void *a_reserved; /* Currently unused */ +}; +#endif /* KERNEL_PRIVATE */ + +#ifdef XNU_KERNEL_PRIVATE +errno_t +VNOP_COMPOUND_RENAME( + struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, struct vnode_attr *fvap, + struct vnode *tdvp, struct vnode **tvpp, struct componentname *tcnp, struct vnode_attr 
*tvap, + uint32_t flags,vfs_context_t ctx); +#endif /* XNU_KERNEL_PRIVATE */ + struct vnop_mkdir_args { struct vnodeop_desc *a_desc; vnode_t a_dvp; @@ -674,6 +781,27 @@ struct vnop_mkdir_args { extern errno_t VNOP_MKDIR(vnode_t, vnode_t *, struct componentname *, struct vnode_attr *, vfs_context_t); #endif /* XNU_KERNEL_PRIVATE */ + +#ifdef KERNEL_PRIVATE +struct vnop_compound_mkdir_args { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; /* Directory in which to create */ + vnode_t *a_vpp; /* Destination for found or created vnode */ + struct componentname *a_cnp; /* Name of directory to create */ + struct vnode_attr *a_vap; /* Creation attributes */ + uint32_t a_flags; /* Control flags (unused) */ + vfs_context_t a_context; /* Authorization context */ +#if 0 + int (*a_mkdir_authorizer)(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved); +#endif /* 0 */ + void *a_reserved; /* Unused */ +}; +#endif /* KERNEL_PRIVATE */ + +#ifdef XNU_KERNEL_PRIVATE +extern errno_t VNOP_COMPOUND_MKDIR(vnode_t, vnode_t *, struct nameidata *, struct vnode_attr *, vfs_context_t); +#endif /* XNU_KERNEL_PRIVATE */ + struct vnop_rmdir_args { struct vnodeop_desc *a_desc; vnode_t a_dvp; @@ -695,6 +823,30 @@ struct vnop_rmdir_args { extern errno_t VNOP_RMDIR(vnode_t, vnode_t, struct componentname *, vfs_context_t); #endif /* XNU_KERNEL_PRIVATE */ +#ifdef KERNEL_PRIVATE +struct vnop_compound_rmdir_args { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; /* Directory in which to look up and delete */ + vnode_t *a_vpp; /* Destination for found vnode */ + struct componentname *a_cnp; /* Name to delete */ + struct vnode_attr *a_vap; /* Location in which to store attributes if delete succeeds (can be NULL) */ + uint32_t a_flags; /* Control flags (currently unused) */ + vfs_context_t a_context; /* Context for authorization */ + int (*a_rmdir_authorizer)( /* Authorization callback */ + vnode_t dvp, /* As passed to VNOP */ + vnode_t vp, /* Directory to delete */ + struct componentname *cnp, /* As passed to VNOP */ + vfs_context_t ctx, /* As passed to VNOP */ + void *reserved); /* Always NULL */ + void *a_reserved; /* Unused */ +}; +#endif /* KERNEL_PRIVATE */ + +#ifdef XNU_KERNEL_PRIVATE +extern errno_t VNOP_COMPOUND_RMDIR(vnode_t, vnode_t*, struct nameidata *, struct vnode_attr *vap, vfs_context_t); +#endif /* XNU_KERNEL_PRIVATE */ + + struct vnop_symlink_args { struct vnodeop_desc *a_desc; vnode_t a_dvp; @@ -723,7 +875,6 @@ struct vnop_symlink_args { extern errno_t VNOP_SYMLINK(vnode_t, vnode_t *, struct componentname *, struct vnode_attr *, char *, vfs_context_t); #endif /* XNU_KERNEL_PRIVATE */ - /* * * When VNOP_READDIR is called from the NFS Server, the nfs_data @@ -941,7 +1092,7 @@ struct vnop_allocate_args { a file. It can be used to either shrink or grow a file. If the file shrinks, its ubc size will be modified accordingly, but if it grows, then the ubc size is unchanged; space is set aside without being actively used by the file. VNOP_ALLOCATE() is currently only - called as part of the F_PREALLOCATE fcntl, and is supported only by AFP and HFS. + called as part of the F_PREALLOCATE fcntl. @param vp The vnode for which to preallocate space. @param length Desired preallocated file length. @param flags @@ -1009,12 +1160,20 @@ struct vnop_pageout_args { @abstract Write data from a mapped file back to disk. @discussion VNOP_PAGEOUT() is called when data from a mapped file needs to be flushed to disk, either because of an msync() call or due to memory pressure. 
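The compound mkdir and rmdir argument blocks above follow the same pattern as compound open: the filesystem performs its own lookup and must call the VFS-supplied authorizer before acting. A sketch for the rmdir case follows; myfs_lookup_locked() and myfs_rmdir_internal() are hypothetical, while vnode_getattr() is existing KPI.

static int myfs_lookup_locked(vnode_t dvp, struct componentname *cnp, vnode_t *vpp);	/* hypothetical */
static int myfs_rmdir_internal(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx);	/* hypothetical */

static int
myfs_vnop_compound_rmdir(struct vnop_compound_rmdir_args *ap)
{
	vnode_t vp = NULLVP;
	int error;

	error = myfs_lookup_locked(ap->a_dvp, ap->a_cnp, &vp);
	if (error != 0)
		return error;

	/* Authorization policy is owned by VFS; invoke its callback before deleting. */
	error = ap->a_rmdir_authorizer(ap->a_dvp, vp, ap->a_cnp, ap->a_context, NULL);
	if (error == 0 && ap->a_vap != NULL) {
		/* Caller asked for the attributes of the directory being removed. */
		(void)vnode_getattr(vp, ap->a_vap, ap->a_context);
	}
	if (error == 0)
		error = myfs_rmdir_internal(ap->a_dvp, vp, ap->a_cnp, ap->a_context);

	*ap->a_vpp = vp;	/* return the found vnode (and its iocount) to VFS */
	return error;
}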
Filesystems are for the most part expected to - just call cluster_pageout(). + just call cluster_pageout(). However, if they opt into the VFC_VFSVNOP_PAGEOUTV2 flag, then + they will be responsible for creating their own UPLs. @param vp The vnode for which to page out data. - @param pl UPL describing pages needing to be paged out. - @param pl_offset Offset in UPL from which to start paging out data. - @param f_offset Offset in file of data needing to be paged out. - @param size Amount of data to page out (in bytes). + @param pl UPL describing pages needing to be paged out. If UPL is NULL, then it means the filesystem + has opted into VFC_VFSVNOP_PAGEOUTV2 semantics, which means that it will create and operate on its own UPLs + as opposed to relying on the one passed down into the filesystem. This means that the filesystem must be + responsible for N cluster_pageout calls for N dirty ranges in the UPL. + @param pl_offset Offset in UPL from which to start paging out data. Under the new VFC_VFSVNOP_PAGEOUTV2 + semantics, this is the offset in the range specified that must be paged out if the associated page is dirty. + @param f_offset Offset in file of data needing to be paged out. Under the new VFC_VFSVNOP_PAGEOUTV2 + semantics, this represents the offset in the file where we should start looking for dirty pages. + @param size Amount of data to page out (in bytes). Under VFC_VFSVNOP_PAGEOUTV2, this represents + the size of the range to be considered. The filesystem is free to extend or shrink the specified range + to better fit its blocking model as long as the page at 'pl_offset' is included. @param flags UPL-style flags: UPL_IOSYNC, UPL_NOCOMMIT, UPL_NORDAHEAD, UPL_VNODE_PAGER, UPL_MSYNC. Filesystems should generally leave it to the cluster layer to handle these flags. See the memory_object_types.h header in the kernel framework if interested. @@ -1042,6 +1201,36 @@ struct vnop_searchfs_args { vfs_context_t a_context; }; +/* + @function VNOP_SEARCHFS + @abstract Search a filesystem quickly for files or directories that match the passed-in search criteria. + @discussion VNOP_SEARCHFS is a getattrlist-based system call which is implemented almost entirely inside + supported filesystems. Callers provide a set of criteria to match against, and the filesystem is responsible + for finding all files or directories that match the criteria. Once these files or directories are found, + the user-requested attributes of these files are provided as output. The set of searchable attributes is a + subset of the getattrlist attributes. For example, ATTR_CMN_UUID is not a valid searchable attribute as of + 10.6. A common usage scenario could be to request all files whose mod date is greater than time X, less than + time Y, and provide the inode ID and filename of the matching objects as output. + @param vp The vnode representing the mountpoint of the filesystem to be searched. + @param a_searchparams1 If one-argument search criteria are requested, the search criteria would go here. However, + some search criteria, like ATTR_CMN_MODTIME, can be bounded. The user could request files modified between time X + and time Y. In this case, the lower bound goes in a_searchparams1. + @param a_searchparams2 If two-argument search criteria are requested, the upper bound goes in here. + @param a_searchattrs Contains the getattrlist-style attribute bits which are requested by the current search. + @param a_maxmatches The maximum number of matches to return in a single system call.
+ @param a_timelimit The suggested maximum amount of time we can spend in the kernel to service this system call. + Filesystems should use this as a guide only, and set their own internal maximum time to avoid denial of service. + @param a_returnattrs The getattrlist-style attributes to return for items in the filesystem that match the search + criteria above. + @param a_scriptcode Currently ignored. + @param a_uio The uio in which to write out the search matches. + @param a_searchstate Sometimes searches cannot be completed in a single system call. In this case, we provide + an identifier back to the user which indicates where to resume a previously-started search. This is an opaque structure + used by the filesystem to identify where to resume said search. + @param a_context The context in which to perform the filesystem search. + @return 0 on success, EAGAIN for searches which could not be completed in 1 call, and other ERRNOS as needed. + */ + #ifdef XNU_KERNEL_PRIVATE extern errno_t VNOP_SEARCHFS(vnode_t, void *, void *, struct attrlist *, uint32_t, struct timeval *, struct attrlist *, uint32_t *, uint32_t, uint32_t, struct uio *, struct searchstate *, vfs_context_t); #endif /* XNU_KERNEL_PRIVATE */ diff --git a/bsd/sys/vnode_internal.h b/bsd/sys/vnode_internal.h index dbff3a50d..7d17be99e 100644 --- a/bsd/sys/vnode_internal.h +++ b/bsd/sys/vnode_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -100,6 +100,29 @@ struct unsafe_fsnode { void * fsnodeowner; }; +#if CONFIG_TRIGGERS +/* + * VFS Internal (private) trigger vnode resolver info. + */ +struct vnode_resolve { + lck_mtx_t vr_lock; /* protects vnode_resolve_t fields */ + trigger_vnode_resolve_callback_t vr_resolve_func; + trigger_vnode_unresolve_callback_t vr_unresolve_func; + trigger_vnode_rearm_callback_t vr_rearm_func; + trigger_vnode_reclaim_callback_t vr_reclaim_func; + void * vr_data; /* private data for resolver */ + uint32_t vr_flags; + uint32_t vr_lastseq; +}; +typedef struct vnode_resolve *vnode_resolve_t; + +/* private vr_flags */ +#define VNT_RESOLVED (1UL << 31) +#define VNT_VFS_UNMOUNTED (1UL << 30) +#define VNT_EXTERNAL (1UL << 29) + +#endif /* CONFIG_TRIGGERS */ + /* * Reading or writing any of these items requires holding the appropriate lock. 
* v_freelist is locked by the global vnode_list_lock @@ -166,6 +189,9 @@ struct vnode { #if CONFIG_MACF struct label *v_label; /* MAC security label */ #endif +#if CONFIG_TRIGGERS + vnode_resolve_t v_resolve; /* trigger vnode resolve info (VDIR only) */ +#endif /* CONFIG_TRIGGERS */ }; #define v_mountedhere v_un.vu_mountedhere @@ -199,7 +225,6 @@ struct vnode { #define VL_TERMWANT 0x0008 /* there's a waiter for recycle finish (vnode_getiocount)*/ #define VL_DEAD 0x0010 /* vnode is dead, cleaned of filesystem-specific info */ #define VL_MARKTERM 0x0020 /* vnode should be recycled when no longer referenced */ -#define VL_MOUNTDEAD 0x0040 /* v_moutnedhere is dead */ #define VL_NEEDINACTIVE 0x0080 /* delay VNOP_INACTIVE until iocount goes to 0 */ #define VL_LABEL 0x0100 /* vnode is marked for labeling */ @@ -224,7 +249,7 @@ struct vnode { #define VDEVFLUSH 0x000040 /* device vnode after vflush */ #define VMOUNT 0x000080 /* mount operation in progress */ #define VBWAIT 0x000100 /* waiting for output to complete */ - /* Free slot here after removing VALIASED for radar #5971707 */ +#define VSHARED_DYLD 0x000200 /* vnode is a dyld shared cache file */ #define VNOCACHE_DATA 0x000400 /* don't keep data cached once it's been consumed */ #define VSTANDARD 0x000800 /* vnode obtained from common pool */ #define VAGE 0x001000 /* Insert vnode at head of free list */ @@ -244,6 +269,7 @@ struct vnode { #define VISNAMEDSTREAM 0x400000 /* vnode is a named stream (eg HFS resource fork) */ #endif #define VOPENEVT 0x800000 /* if process is P_CHECKOPENEVT, then or in the O_EVTONLY flag on open */ +#define VNEEDSSNAPSHOT 0x1000000 /* * Global vnode data. @@ -251,7 +277,8 @@ struct vnode { extern struct vnode *rootvnode; /* root (i.e. "/") vnode */ #ifdef CONFIG_IMGSRC_ACCESS -extern struct vnode *imgsrc_rootvnode; +#define MAX_IMAGEBOOT_NESTING 2 +extern struct vnode *imgsrc_rootvnodes[]; #endif /* CONFIG_IMGSRC_ACCESS */ @@ -367,6 +394,10 @@ int vn_open(struct nameidata *ndp, int fmode, int cmode); int vn_open_modflags(struct nameidata *ndp, int *fmode, int cmode); int vn_open_auth(struct nameidata *ndp, int *fmode, struct vnode_attr *); int vn_close(vnode_t, int flags, vfs_context_t ctx); +errno_t vn_remove(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, int32_t flags, struct vnode_attr *vap, vfs_context_t ctx); +errno_t vn_rename(struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, struct vnode_attr *fvap, + struct vnode *tdvp, struct vnode **tvpp, struct componentname *tcnp, struct vnode_attr *tvap, + uint32_t flags, vfs_context_t ctx); void lock_vnode_and_post(vnode_t, int); @@ -377,14 +408,30 @@ void lock_vnode_and_post(vnode_t, int); } \ } while (0) - +/* Authorization subroutines */ +int vn_authorize_open_existing(vnode_t vp, struct componentname *cnp, int fmode, vfs_context_t ctx, void *reserved); +int vn_authorize_create(vnode_t, struct componentname *, struct vnode_attr *, vfs_context_t, void*); +int vn_attribute_prepare(vnode_t dvp, struct vnode_attr *vap, uint32_t *defaulted_fieldsp, vfs_context_t ctx); +void vn_attribute_cleanup(struct vnode_attr *vap, uint32_t defaulted_fields); +int vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved); +int vn_authorize_rename(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, + struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, + vfs_context_t ctx, void *reserved); +int vn_authorize_rmdir(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, 
void *reserved); + +typedef int (*vn_create_authorizer_t)(vnode_t, struct componentname *, struct vnode_attr *, vfs_context_t, void*); +int vn_authorize_mkdir(vnode_t, struct componentname *, struct vnode_attr *, vfs_context_t, void*); +int vn_authorize_null(vnode_t, struct componentname *, struct vnode_attr *, vfs_context_t, void*); +/* End of authorization subroutines */ #define VN_CREATE_NOAUTH (1<<0) #define VN_CREATE_NOINHERIT (1<<1) #define VN_CREATE_UNION (1<<2) #define VN_CREATE_NOLABEL (1<<3) -errno_t vn_create(vnode_t, vnode_t *, struct componentname *, struct vnode_attr *, int flags, vfs_context_t); - +#define VN_CREATE_DOOPEN (1<<4) /* Open file if a batched operation is available */ +errno_t vn_create(vnode_t, vnode_t *, struct nameidata *, struct vnode_attr *, uint32_t, int, uint32_t*, vfs_context_t); +int vn_mkdir(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, vfs_context_t ctx); +int vn_rmdir(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, vfs_context_t ctx); int vn_getxattr(vnode_t, const char *, uio_t, size_t *, int, vfs_context_t); int vn_setxattr(vnode_t, const char *, uio_t, int, vfs_context_t); @@ -415,6 +462,7 @@ void cache_enter_with_gen(vnode_t dvp, vnode_t vp, struct componentname *cnp, in const char *cache_enter_create(vnode_t dvp, vnode_t vp, struct componentname *cnp); int vn_pathconf(vnode_t, int, int32_t *, vfs_context_t); +extern int nc_disabled; #define vnode_lock_convert(v) lck_mtx_convert_spin(&(v)->v_lock) @@ -423,12 +471,16 @@ void vnode_lock_spin(vnode_t); void vnode_list_lock(void); void vnode_list_unlock(void); -int vnode_ref_ext(vnode_t, int); + +#define VNODE_REF_FORCE 0x1 +int vnode_ref_ext(vnode_t, int, int); + void vnode_rele_ext(vnode_t, int, int); void vnode_rele_internal(vnode_t, int, int, int); #ifdef BSD_KERNEL_PRIVATE int vnode_getalways(vnode_t); int vget_internal(vnode_t, int, int); +errno_t vnode_getiocount(vnode_t, unsigned int, int); #endif /* BSD_KERNEL_PRIVATE */ int vnode_get_locked(vnode_t); int vnode_put_locked(vnode_t); @@ -448,6 +500,24 @@ errno_t vnode_setsize(vnode_t, off_t, int ioflag, vfs_context_t); int vnode_setattr_fallback(vnode_t vp, struct vnode_attr *vap, vfs_context_t ctx); int vnode_isspec(vnode_t vp); + +#ifdef BSD_KERNEL_PRIVATE + +typedef uint32_t compound_vnop_id_t; +#define COMPOUND_VNOP_OPEN 0x01 +#define COMPOUND_VNOP_MKDIR 0x02 +#define COMPOUND_VNOP_RENAME 0x04 +#define COMPOUND_VNOP_REMOVE 0x08 +#define COMPOUND_VNOP_RMDIR 0x10 + +int vnode_compound_rename_available(vnode_t vp); +int vnode_compound_rmdir_available(vnode_t vp); +int vnode_compound_mkdir_available(vnode_t vp); +int vnode_compound_remove_available(vnode_t vp); +int vnode_compound_open_available(vnode_t vp); +int vnode_compound_op_available(vnode_t, compound_vnop_id_t); +#endif /* BSD_KERNEL_PRIVATE */ + void vn_setunionwait(vnode_t); void vn_checkunionwait(vnode_t); void vn_clearunionwait(vnode_t, int); @@ -471,9 +541,18 @@ int vfs_sysctl(int *name, uint32_t namelen, user_addr_t oldp, size_t *oldlenp, int sysctl_vnode(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req); #ifdef BSD_KERNEL_PRIVATE -void vnode_knoteupdate(struct knote *kn); void vnode_setneedinactive(vnode_t); int vnode_hasnamedstreams(vnode_t); /* Does this vnode have associated named streams? 
*/ -#endif + +void nspace_proc_exit(struct proc *p); + +#if CONFIG_TRIGGERS +/* VFS Internal Vnode Trigger Interfaces (Private) */ +int vnode_trigger_resolve(vnode_t, struct nameidata *, vfs_context_t); +void vnode_trigger_rearm(vnode_t, vfs_context_t); +void vfs_nested_trigger_unmounts(mount_t, int, vfs_context_t); +#endif /* CONFIG_TRIGGERS */ + +#endif /* BSD_KERNEL_PRIVATE */ #endif /* !_SYS_VNODE_INTERNAL_H_ */ diff --git a/bsd/sys/xattr.h b/bsd/sys/xattr.h index c9ecf4275..bd91c3c31 100644 --- a/bsd/sys/xattr.h +++ b/bsd/sys/xattr.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2004-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -59,7 +59,19 @@ __BEGIN_DECLS int xattr_protected(const char *); int xattr_validatename(const char *); -#define XATTR_MAXSIZE (64 * 1024 * 1024) +/* Maximum extended attribute size supported by VFS */ +#define XATTR_MAXSIZE (64 * 1024 * 1024) + +#ifdef PRIVATE +/* Maximum extended attribute size in an Apple Double file */ +#define AD_XATTR_MAXSIZE (128 * 1024) + +/* Number of bits used to represent the maximum size of + * extended attribute stored in an Apple Double file. + */ +#define AD_XATTR_SIZE_BITS 18 +#endif /* PRIVATE */ + __END_DECLS #endif /* KERNEL */ diff --git a/bsd/uuid/Makefile b/bsd/uuid/Makefile index 8d5af9310..1f7f17bfc 100644 --- a/bsd/uuid/Makefile +++ b/bsd/uuid/Makefile @@ -9,14 +9,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ # In both the framework PrivateHeader area and /usr/include/uuid diff --git a/bsd/vfs/Makefile b/bsd/vfs/Makefile index 3d578ffd7..b9ddbedcc 100644 --- a/bsd/vfs/Makefile +++ b/bsd/vfs/Makefile @@ -9,14 +9,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ DATAFILES = \ diff --git a/bsd/vfs/kpi_vfs.c b/bsd/vfs/kpi_vfs.c index 50338b255..a18760397 100644 --- a/bsd/vfs/kpi_vfs.c +++ b/bsd/vfs/kpi_vfs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -605,6 +605,13 @@ VFS_VPTOFH(struct vnode * vp, int *fhlenp, unsigned char * fhp, vfs_context_t ct } +/* returns the cached throttle mask for the mount_t */ +uint64_t +vfs_throttle_mask(mount_t mp) +{ + return(mp->mnt_throttle_mask); +} + /* returns a copy of vfs type name for the mount_t */ void vfs_name(mount_t mp, char * buffer) @@ -943,6 +950,27 @@ vfs_vnodecovered(mount_t mp) } } +/* + * Returns device vnode backing a mountpoint with an iocount (if valid vnode exists). + * The iocount must be released with vnode_put(). Note that this KPI is subtle + * with respect to the validity of using this device vnode for anything substantial + * (which is discouraged). If commands are sent to the device driver without + * taking proper steps to ensure that the device is still open, chaos may ensue. + * Similarly, this routine should only be called if there is some guarantee that + * the mount itself is still valid. 
+ */ +vnode_t +vfs_devvp(mount_t mp) +{ + vnode_t vp = mp->mnt_devvp; + + if ((vp != NULLVP) && (vnode_get(vp) == 0)) { + return vp; + } + + return NULLVP; +} + /* * return the io attributes associated with mount_t */ @@ -1002,7 +1030,6 @@ extern int vfs_opv_numops; errno_t vfs_fsadd(struct vfs_fsentry *vfe, vfstable_t * handle) { -#pragma unused(data) struct vfstable *newvfstbl = NULL; int i,j; int (***opv_desc_vector_p)(void *); @@ -1686,6 +1713,40 @@ vnode_israge(vnode_t vp) return ((vp->v_flag & VRAGE)? 1 : 0); } +int +vnode_needssnapshots(vnode_t vp) +{ + return ((vp->v_flag & VNEEDSSNAPSHOT)? 1 : 0); +} + + +/* Check the process/thread to see if we should skip atime updates */ +int +vfs_ctx_skipatime (vfs_context_t ctx) { + struct uthread *ut; + proc_t proc; + thread_t thr; + + proc = vfs_context_proc(ctx); + thr = vfs_context_thread (ctx); + + /* Validate pointers in case we were invoked via a kernel context */ + if (thr && proc) { + ut = get_bsdthread_info (thr); + + if (proc->p_lflag & P_LRAGE_VNODES) { + return 1; + } + + if (ut) { + if (ut->uu_flag & UT_RAGE_VNODES) { + return 1; + } + } + } + return 0; +} + /* is vnode_t marked to not keep data cached once it's been consumed */ int vnode_isnocache(vnode_t vp) @@ -1743,6 +1804,46 @@ vnode_islnk(vnode_t vp) return ((vp->v_type == VLNK)? 1 : 0); } +int +vnode_lookup_continue_needed(vnode_t vp, struct componentname *cnp) +{ + struct nameidata *ndp = cnp->cn_ndp; + + if (ndp == NULL) { + panic("vnode_lookup_continue_needed(): cnp->cn_ndp is NULL\n"); + } + + if (vnode_isdir(vp)) { + if (vp->v_mountedhere != NULL) { + goto yes; + } + +#if CONFIG_TRIGGERS + if (vp->v_resolve) { + goto yes; + } +#endif /* CONFIG_TRIGGERS */ + + } + + + if (vnode_islnk(vp)) { + /* From lookup(): || *ndp->ni_next == '/') No need for this, we know we're NULL-terminated here */ + if (cnp->cn_flags & FOLLOW) { + goto yes; + } + if (ndp->ni_flag & NAMEI_TRAILINGSLASH) { + goto yes; + } + } + + return 0; + +yes: + ndp->ni_flag |= NAMEI_CONTLOOKUP; + return EKEEPLOOKING; +} + /* is vnode_t a fifo ? */ int vnode_isfifo(vnode_t vp) @@ -2041,6 +2142,37 @@ vnode_vfsisrdonly(vnode_t vp) return ((vp->v_mount->mnt_flag & MNT_RDONLY)? 1 : 0); } +int +vnode_compound_rename_available(vnode_t vp) +{ + return vnode_compound_op_available(vp, COMPOUND_VNOP_RENAME); +} +int +vnode_compound_rmdir_available(vnode_t vp) +{ + return vnode_compound_op_available(vp, COMPOUND_VNOP_RMDIR); +} +int +vnode_compound_mkdir_available(vnode_t vp) +{ + return vnode_compound_op_available(vp, COMPOUND_VNOP_MKDIR); +} +int +vnode_compound_remove_available(vnode_t vp) +{ + return vnode_compound_op_available(vp, COMPOUND_VNOP_REMOVE); +} +int +vnode_compound_open_available(vnode_t vp) +{ + return vnode_compound_op_available(vp, COMPOUND_VNOP_OPEN); +} + +int +vnode_compound_op_available(vnode_t vp, compound_vnop_id_t opid) +{ + return ((vp->v_mount->mnt_compound_ops & opid) != 0); +} /* * Returns vnode ref to current working directory; if a per-thread current @@ -2769,6 +2901,15 @@ vnode_notify(vnode_t vp, uint32_t events, struct vnode_attr *vap) return 0; } + + +int +vnode_isdyldsharedcache(vnode_t vp) +{ + return ((vp->v_flag & VSHARED_DYLD) ? 1 : 0); +} + + /* * For a filesystem that isn't tracking its own vnode watchers: * check whether a vnode is being monitored. 
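Two of the KPIs implemented above invite short usage sketches. Both helpers below are hypothetical filesystem routines; everything they call is existing KPI.

static int
myfs_flush_device(mount_t mp, vfs_context_t ctx)
{
	vnode_t devvp = vfs_devvp(mp);	/* returns NULLVP or a vnode with an iocount */
	int error = ENXIO;

	if (devvp != NULLVP) {
		error = VNOP_FSYNC(devvp, MNT_WAIT, ctx);
		vnode_put(devvp);	/* drop the iocount vfs_devvp() took */
	}
	return error;
}

static void
myfs_touch_atime(vnode_t vp, vfs_context_t ctx)
{
	struct vnode_attr va;
	struct timespec ts;

	/* Rapid-aging processes/threads ask us not to dirty vnodes just for atime. */
	if (vfs_ctx_skipatime(ctx))
		return;

	nanotime(&ts);
	VATTR_INIT(&va);
	VATTR_SET(&va, va_access_time, ts);
	(void)vnode_setattr(vp, &va, ctx);
}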
@@ -2778,27 +2919,6 @@ vnode_ismonitored(vnode_t vp) { return (vp->v_knotes.slh_first != NULL); } -/* - * Conceived as a function available only in BSD kernel so that if kevent_register - * changes what a knote of type EVFILT_VNODE is watching, it can push - * that updated information down to a networked filesystem that may - * need to update server-side monitoring. - * - * Blunted to do nothing--because we want to get both kqueue and fsevents support - * from the VNOP_MONITOR design, we always want all the events a filesystem can provide us. - */ -void -vnode_knoteupdate(__unused struct knote *kn) -{ -#if 0 - vnode_t vp = (vnode_t)kn->kn_hook; - if (vnode_getwithvid(vp, kn->kn_hookid) == 0) { - VNOP_MONITOR(vp, kn->kn_sfflags, VNODE_MONITOR_UPDATE, (void*)kn, NULL); - vnode_put(vp); - } -#endif -} - /* * Initialize a struct vnode_attr and activate the attributes required * by the vnode_notify() call. @@ -2811,6 +2931,44 @@ vfs_get_notify_attributes(struct vnode_attr *vap) return 0; } +#if CONFIG_TRIGGERS +int +vfs_settriggercallback(fsid_t *fsid, vfs_trigger_callback_t vtc, void *data, uint32_t flags __unused, vfs_context_t ctx) +{ + int error; + mount_t mp; + + mp = mount_list_lookupby_fsid(fsid, 0 /* locked */, 1 /* withref */); + if (mp == NULL) { + return ENOENT; + } + + error = vfs_busy(mp, LK_NOWAIT); + mount_iterdrop(mp); + + if (error != 0) { + return ENOENT; + } + + mount_lock(mp); + if (mp->mnt_triggercallback != NULL) { + error = EBUSY; + mount_unlock(mp); + goto out; + } + + mp->mnt_triggercallback = vtc; + mp->mnt_triggerdata = data; + mount_unlock(mp); + + mp->mnt_triggercallback(mp, VTC_REPLACE, data, ctx); + +out: + vfs_unbusy(mp); + return error; +} +#endif /* CONFIG_TRIGGERS */ + /* * Definition of vnode operations. */ @@ -2909,13 +3067,87 @@ VNOP_LOOKUP(vnode_t dvp, vnode_t *vpp, struct componentname *cnp, vfs_context_t } #if 0 -/* - *# - *#% create dvp L L L - *#% create vpp - L - - *# - */ - +struct vnop_compound_open_args { + struct vnodeop_desc *a_desc; + vnode_t a_dvp; + vnode_t *a_vpp; + struct componentname *a_cnp; + int32_t a_flags; + int32_t a_fmode; + struct vnode_attr *a_vap; + vfs_context_t a_context; + void *a_reserved; +}; +#endif /* 0 */ + +int +VNOP_COMPOUND_OPEN(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, int32_t flags, int32_t fmode, uint32_t *statusp, struct vnode_attr *vap, vfs_context_t ctx) +{ + int _err; + struct vnop_compound_open_args a; + int did_create = 0; + int want_create; + uint32_t tmp_status = 0; + struct componentname *cnp = &ndp->ni_cnd; + + want_create = (flags & VNOP_COMPOUND_OPEN_DO_CREATE); + + a.a_desc = &vnop_compound_open_desc; + a.a_dvp = dvp; + a.a_vpp = vpp; /* Could be NULL */ + a.a_cnp = cnp; + a.a_flags = flags; + a.a_fmode = fmode; + a.a_status = (statusp != NULL) ? statusp : &tmp_status; + a.a_vap = vap; + a.a_context = ctx; + a.a_open_create_authorizer = vn_authorize_create; + a.a_open_existing_authorizer = vn_authorize_open_existing; + a.a_reserved = NULL; + + if (dvp == NULLVP) { + panic("No dvp?"); + } + if (want_create && !vap) { + panic("Want create, but no vap?"); + } + if (!want_create && vap) { + panic("Don't want create, but have a vap?"); + } + + _err = (*dvp->v_op[vnop_compound_open_desc.vdesc_offset])(&a); + + did_create = (*a.a_status & COMPOUND_OPEN_STATUS_DID_CREATE); + + if (did_create && !want_create) { + panic("Filesystem did a create, even though none was requested?"); + } + + if (did_create) { + if (!NATIVE_XATTR(dvp)) { + /* + * Remove stale Apple Double file (if any).
+ */ + xattrfile_remove(dvp, cnp->cn_nameptr, ctx, 0); + } + + /* On create, provide kqueue notification */ + post_event_if_success(dvp, _err, NOTE_WRITE); + } + + lookup_compound_vnop_post_hook(_err, dvp, *vpp, ndp, did_create); +#if 0 /* FSEvents... */ + if (*vpp && _err && _err != EKEEPLOOKING) { + vnode_put(*vpp); + *vpp = NULLVP; + } +#endif /* 0 */ + + return (_err); + +} + +#if 0 struct vnop_create_args { struct vnodeop_desc *a_desc; vnode_t a_dvp; @@ -3094,34 +3326,34 @@ struct vnop_open_args { }; #endif /* 0*/ errno_t -VNOP_OPEN(vnode_t vp, int mode, vfs_context_t ctx) +VNOP_OPEN(vnode_t vp, int mode, vfs_context_t ctx) { int _err; struct vnop_open_args a; #ifndef __LP64__ int thread_safe; - int funnel_state = 0; + int funnel_state = 0; #endif /* __LP64__ */ if (ctx == NULL) { ctx = vfs_context_current(); - } + } a.a_desc = &vnop_open_desc; a.a_vp = vp; a.a_mode = mode; - a.a_context = ctx; + a.a_context = ctx; #ifndef __LP64__ thread_safe = THREAD_SAFE_FS(vp); if (!thread_safe) { funnel_state = thread_funnel_set(kernel_flock, TRUE); if (vp->v_type != VCHR && vp->v_type != VFIFO && vp->v_type != VSOCK) { - if ( (_err = lock_fsnode(vp, NULL)) ) { - (void) thread_funnel_set(kernel_flock, funnel_state); - return (_err); - } - } - } + if ( (_err = lock_fsnode(vp, NULL)) ) { + (void) thread_funnel_set(kernel_flock, funnel_state); + return (_err); + } + } + } #endif /* __LP64__ */ _err = (*vp->v_op[vnop_open_desc.vdesc_offset])(&a); @@ -3130,9 +3362,9 @@ VNOP_OPEN(vnode_t vp, int mode, vfs_context_t ctx) if (!thread_safe) { if (vp->v_type != VCHR && vp->v_type != VFIFO && vp->v_type != VSOCK) { unlock_fsnode(vp, NULL); - } + } (void) thread_funnel_set(kernel_flock, funnel_state); - } + } #endif /* __LP64__ */ return (_err); @@ -4012,6 +4244,49 @@ VNOP_REMOVE(vnode_t dvp, vnode_t vp, struct componentname * cnp, int flags, vfs_ return (_err); } +int +VNOP_COMPOUND_REMOVE(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, int32_t flags, struct vnode_attr *vap, vfs_context_t ctx) +{ + int _err; + struct vnop_compound_remove_args a; + int no_vp = (*vpp == NULLVP); + + a.a_desc = &vnop_compound_remove_desc; + a.a_dvp = dvp; + a.a_vpp = vpp; + a.a_cnp = &ndp->ni_cnd; + a.a_flags = flags; + a.a_vap = vap; + a.a_context = ctx; + a.a_remove_authorizer = vn_authorize_unlink; + + _err = (*dvp->v_op[vnop_compound_remove_desc.vdesc_offset])(&a); + if (_err == 0) { + vnode_setneedinactive(*vpp); + + if ( !(NATIVE_XATTR(dvp)) ) { + /* + * Remove any associated extended attribute file (._ AppleDouble file). 
+ */ + xattrfile_remove(dvp, ndp->ni_cnd.cn_nameptr, ctx, 1); + } + } + + post_event_if_success(*vpp, _err, NOTE_DELETE | NOTE_LINK); + post_event_if_success(dvp, _err, NOTE_WRITE); + + if (no_vp) { + lookup_compound_vnop_post_hook(_err, dvp, *vpp, ndp, 0); + if (*vpp && _err && _err != EKEEPLOOKING) { + vnode_put(*vpp); + *vpp = NULLVP; + } + } + + //printf("VNOP_COMPOUND_REMOVE() returning %d\n", _err); + + return (_err); +} #if 0 /* @@ -4085,114 +4360,33 @@ VNOP_LINK(vnode_t vp, vnode_t tdvp, struct componentname * cnp, vfs_context_t ct return (_err); } - -#if 0 -/* - *# - *#% rename fdvp U U U - *#% rename fvp U U U - *#% rename tdvp L U U - *#% rename tvp X U U - *# - */ -struct vnop_rename_args { - struct vnodeop_desc *a_desc; - vnode_t a_fdvp; - vnode_t a_fvp; - struct componentname *a_fcnp; - vnode_t a_tdvp; - vnode_t a_tvp; - struct componentname *a_tcnp; - vfs_context_t a_context; -}; -#endif /* 0*/ errno_t -VNOP_RENAME(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, - struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, - vfs_context_t ctx) +vn_rename(struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, struct vnode_attr *fvap, + struct vnode *tdvp, struct vnode **tvpp, struct componentname *tcnp, struct vnode_attr *tvap, + uint32_t flags, vfs_context_t ctx) { - int _err = 0; - int events; - struct vnop_rename_args a; - char smallname1[48]; - char smallname2[48]; - char *xfromname = NULL; - char *xtoname = NULL; -#ifndef __LP64__ - int funnel_state = 0; - vnode_t lock_first = NULL, lock_second = NULL; - vnode_t fdvp_unsafe = NULLVP; - vnode_t tdvp_unsafe = NULLVP; -#endif /* __LP64__ */ + int _err; vnode_t src_attr_vp = NULLVP; vnode_t dst_attr_vp = NULLVP; struct nameidata fromnd; struct nameidata tond; + char smallname1[48]; + char smallname2[48]; + char *xfromname = NULL; + char *xtoname = NULL; + int batched; - a.a_desc = &vnop_rename_desc; - a.a_fdvp = fdvp; - a.a_fvp = fvp; - a.a_fcnp = fcnp; - a.a_tdvp = tdvp; - a.a_tvp = tvp; - a.a_tcnp = tcnp; - a.a_context = ctx; + batched = vnode_compound_rename_available(fdvp); #ifndef __LP64__ - if (!THREAD_SAFE_FS(fdvp)) - fdvp_unsafe = fdvp; - if (!THREAD_SAFE_FS(tdvp)) - tdvp_unsafe = tdvp; + vnode_t fdvp_unsafe = (THREAD_SAFE_FS(fdvp) ? NULLVP : fdvp); +#endif /* __LP64__ */ - if (fdvp_unsafe != NULLVP) { - /* - * Lock parents in vnode address order to avoid deadlocks - * note that it's possible for the fdvp to be unsafe, - * but the tdvp to be safe because tvp could be a directory - * in the root of a filesystem... 
in that case, tdvp is the - * in the filesystem that this root is mounted on - */ - if (tdvp_unsafe == NULL || fdvp_unsafe == tdvp_unsafe) { - lock_first = fdvp_unsafe; - lock_second = NULL; - } else if (fdvp_unsafe < tdvp_unsafe) { - lock_first = fdvp_unsafe; - lock_second = tdvp_unsafe; - } else { - lock_first = tdvp_unsafe; - lock_second = fdvp_unsafe; - } - if ( (_err = lock_fsnode(lock_first, &funnel_state)) ) - return (_err); - - if (lock_second != NULL && (_err = lock_fsnode(lock_second, NULL))) { - unlock_fsnode(lock_first, &funnel_state); - return (_err); - } - - /* - * Lock both children in vnode address order to avoid deadlocks - */ - if (tvp == NULL || tvp == fvp) { - lock_first = fvp; - lock_second = NULL; - } else if (fvp < tvp) { - lock_first = fvp; - lock_second = tvp; - } else { - lock_first = tvp; - lock_second = fvp; - } - if ( (_err = lock_fsnode(lock_first, NULL)) ) - goto out1; - - if (lock_second != NULL && (_err = lock_fsnode(lock_second, NULL))) { - unlock_fsnode(lock_first, NULL); - goto out1; - } + if (!batched) { + if (*fvpp == NULLVP) + panic("Not batched, and no fvp?"); } -#endif /* __LP64__ */ - + /* * We need to preflight any potential AppleDouble file for the source file * before doing the rename operation, since we could potentially be doing @@ -4235,8 +4429,8 @@ VNOP_RENAME(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, * is only for AppleDouble files. */ if (xfromname != NULL) { - NDINIT(&fromnd, RENAME, NOFOLLOW | USEDVP | CN_NBMOUNTLOOK, UIO_SYSSPACE, - CAST_USER_ADDR_T(xfromname), ctx); + NDINIT(&fromnd, RENAME, OP_RENAME, NOFOLLOW | USEDVP | CN_NBMOUNTLOOK, + UIO_SYSSPACE, CAST_USER_ADDR_T(xfromname), ctx); fromnd.ni_dvp = fdvp; error = namei(&fromnd); @@ -4267,21 +4461,18 @@ VNOP_RENAME(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, } } + if (batched) { + _err = VNOP_COMPOUND_RENAME(fdvp, fvpp, fcnp, fvap, tdvp, tvpp, tcnp, tvap, flags, ctx); + if (_err != 0) { + printf("VNOP_COMPOUND_RENAME() returned %d\n", _err); + } - /* do the rename of the main file. */ - _err = (*fdvp->v_op[vnop_rename_desc.vdesc_offset])(&a); - -#ifndef __LP64__ - if (fdvp_unsafe != NULLVP) { - if (lock_second != NULL) - unlock_fsnode(lock_second, NULL); - unlock_fsnode(lock_first, NULL); + } else { + _err = VNOP_RENAME(fdvp, *fvpp, fcnp, tdvp, *tvpp, tcnp, ctx); } -#endif /* __LP64__ */ if (_err == 0) { - if (tvp && tvp != fvp) - vnode_setneedinactive(tvp); + mac_vnode_notify_rename(ctx, *fvpp, tdvp, tcnp); } /* @@ -4295,7 +4486,7 @@ VNOP_RENAME(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, * Note that tdvp already has an iocount reference. Make sure to check that we * get a valid vnode from namei. 
*/ - NDINIT(&tond, RENAME, + NDINIT(&tond, RENAME, OP_RENAME, NOCACHE | NOFOLLOW | USEDVP | CN_NBMOUNTLOOK, UIO_SYSSPACE, CAST_USER_ADDR_T(xtoname), ctx); tond.ni_dvp = tdvp; @@ -4309,81 +4500,15 @@ VNOP_RENAME(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, } if (src_attr_vp) { - /* attempt to rename src -> dst */ - - a.a_desc = &vnop_rename_desc; - a.a_fdvp = fdvp; - a.a_fvp = src_attr_vp; - a.a_fcnp = &fromnd.ni_cnd; - a.a_tdvp = tdvp; - a.a_tvp = dst_attr_vp; - a.a_tcnp = &tond.ni_cnd; - a.a_context = ctx; - -#ifndef __LP64__ - if (fdvp_unsafe != NULLVP) { - /* - * Lock in vnode address order to avoid deadlocks - */ - if (dst_attr_vp == NULL || dst_attr_vp == src_attr_vp) { - lock_first = src_attr_vp; - lock_second = NULL; - } else if (src_attr_vp < dst_attr_vp) { - lock_first = src_attr_vp; - lock_second = dst_attr_vp; - } else { - lock_first = dst_attr_vp; - lock_second = src_attr_vp; - } - if ( (error = lock_fsnode(lock_first, NULL)) == 0) { - if (lock_second != NULL && (error = lock_fsnode(lock_second, NULL)) ) - unlock_fsnode(lock_first, NULL); - } + if (batched) { + error = VNOP_COMPOUND_RENAME(fdvp, &src_attr_vp, &fromnd.ni_cnd, NULL, + tdvp, &dst_attr_vp, &tond.ni_cnd, NULL, + 0, ctx); + } else { + error = VNOP_RENAME(fdvp, src_attr_vp, &fromnd.ni_cnd, + tdvp, dst_attr_vp, &tond.ni_cnd, ctx); } -#endif /* __LP64__ */ - if (error == 0) { - const char *oname; - vnode_t oparent; - /* Save these off so we can later verify them (fix up below) */ - oname = src_attr_vp->v_name; - oparent = src_attr_vp->v_parent; - - error = (*fdvp->v_op[vnop_rename_desc.vdesc_offset])(&a); - -#ifndef __LP64__ - if (fdvp_unsafe != NULLVP) { - if (lock_second != NULL) - unlock_fsnode(lock_second, NULL); - unlock_fsnode(lock_first, NULL); - } -#endif /* __LP64__ */ - - if (error == 0) { - vnode_setneedinactive(src_attr_vp); - - if (dst_attr_vp && dst_attr_vp != src_attr_vp) - vnode_setneedinactive(dst_attr_vp); - /* - * Fix up name & parent pointers on ._ file - */ - if (oname == src_attr_vp->v_name && - oparent == src_attr_vp->v_parent) { - int update_flags; - - update_flags = VNODE_UPDATE_NAME; - - if (fdvp != tdvp) - update_flags |= VNODE_UPDATE_PARENT; - - vnode_update_identity(src_attr_vp, tdvp, - tond.ni_cnd.cn_nameptr, - tond.ni_cnd.cn_namelen, - tond.ni_cnd.cn_hash, - update_flags); - } - } - } /* kevent notifications for moving resource files * _err is zero if we're here, so no need to notify directories, code * below will do that. 
only need to post the rename on the source and @@ -4449,6 +4574,125 @@ out: FREE(xtoname, M_TEMP); } + return _err; +} + + +#if 0 +/* + *# + *#% rename fdvp U U U + *#% rename fvp U U U + *#% rename tdvp L U U + *#% rename tvp X U U + *# + */ +struct vnop_rename_args { + struct vnodeop_desc *a_desc; + vnode_t a_fdvp; + vnode_t a_fvp; + struct componentname *a_fcnp; + vnode_t a_tdvp; + vnode_t a_tvp; + struct componentname *a_tcnp; + vfs_context_t a_context; +}; +#endif /* 0*/ +errno_t +VNOP_RENAME(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, + struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, + vfs_context_t ctx) +{ + int _err = 0; + int events; + struct vnop_rename_args a; +#ifndef __LP64__ + int funnel_state = 0; + vnode_t lock_first = NULL, lock_second = NULL; + vnode_t fdvp_unsafe = NULLVP; + vnode_t tdvp_unsafe = NULLVP; +#endif /* __LP64__ */ + + a.a_desc = &vnop_rename_desc; + a.a_fdvp = fdvp; + a.a_fvp = fvp; + a.a_fcnp = fcnp; + a.a_tdvp = tdvp; + a.a_tvp = tvp; + a.a_tcnp = tcnp; + a.a_context = ctx; + +#ifndef __LP64__ + if (!THREAD_SAFE_FS(fdvp)) + fdvp_unsafe = fdvp; + if (!THREAD_SAFE_FS(tdvp)) + tdvp_unsafe = tdvp; + + if (fdvp_unsafe != NULLVP) { + /* + * Lock parents in vnode address order to avoid deadlocks + * note that it's possible for the fdvp to be unsafe, + * but the tdvp to be safe because tvp could be a directory + * in the root of a filesystem... in that case, tdvp is the + * in the filesystem that this root is mounted on + */ + if (tdvp_unsafe == NULL || fdvp_unsafe == tdvp_unsafe) { + lock_first = fdvp_unsafe; + lock_second = NULL; + } else if (fdvp_unsafe < tdvp_unsafe) { + lock_first = fdvp_unsafe; + lock_second = tdvp_unsafe; + } else { + lock_first = tdvp_unsafe; + lock_second = fdvp_unsafe; + } + if ( (_err = lock_fsnode(lock_first, &funnel_state)) ) + return (_err); + + if (lock_second != NULL && (_err = lock_fsnode(lock_second, NULL))) { + unlock_fsnode(lock_first, &funnel_state); + return (_err); + } + + /* + * Lock both children in vnode address order to avoid deadlocks + */ + if (tvp == NULL || tvp == fvp) { + lock_first = fvp; + lock_second = NULL; + } else if (fvp < tvp) { + lock_first = fvp; + lock_second = tvp; + } else { + lock_first = tvp; + lock_second = fvp; + } + if ( (_err = lock_fsnode(lock_first, NULL)) ) + goto out1; + + if (lock_second != NULL && (_err = lock_fsnode(lock_second, NULL))) { + unlock_fsnode(lock_first, NULL); + goto out1; + } + } +#endif /* __LP64__ */ + + /* do the rename of the main file. 
*/ + _err = (*fdvp->v_op[vnop_rename_desc.vdesc_offset])(&a); + +#ifndef __LP64__ + if (fdvp_unsafe != NULLVP) { + if (lock_second != NULL) + unlock_fsnode(lock_second, NULL); + unlock_fsnode(lock_first, NULL); + } +#endif /* __LP64__ */ + + if (_err == 0) { + if (tvp && tvp != fvp) + vnode_setneedinactive(tvp); + } + #ifndef __LP64__ out1: if (fdvp_unsafe != NULLVP) { @@ -4488,6 +4732,112 @@ out1: return (_err); } +int +VNOP_COMPOUND_RENAME( + struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, struct vnode_attr *fvap, + struct vnode *tdvp, struct vnode **tvpp, struct componentname *tcnp, struct vnode_attr *tvap, + uint32_t flags, vfs_context_t ctx) +{ + int _err = 0; + int events; + struct vnop_compound_rename_args a; + int no_fvp, no_tvp; + + no_fvp = (*fvpp) == NULLVP; + no_tvp = (*tvpp) == NULLVP; + + a.a_desc = &vnop_compound_rename_desc; + + a.a_fdvp = fdvp; + a.a_fvpp = fvpp; + a.a_fcnp = fcnp; + a.a_fvap = fvap; + + a.a_tdvp = tdvp; + a.a_tvpp = tvpp; + a.a_tcnp = tcnp; + a.a_tvap = tvap; + + a.a_flags = flags; + a.a_context = ctx; + a.a_rename_authorizer = vn_authorize_rename; + a.a_reserved = NULL; + + /* do the rename of the main file. */ + _err = (*fdvp->v_op[vnop_compound_rename_desc.vdesc_offset])(&a); + + if (_err == 0) { + if (*tvpp && *tvpp != *fvpp) + vnode_setneedinactive(*tvpp); + } + + /* Wrote at least one directory. If transplanted a dir, also changed link counts */ + if (0 == _err && *fvpp != *tvpp) { + if (!*fvpp) { + panic("No fvpp after compound rename?"); + } + + events = NOTE_WRITE; + if (vnode_isdir(*fvpp)) { + /* Link count on dir changed only if we are moving a dir and... + * --Moved to new dir, not overwriting there + * --Kept in same dir and DID overwrite + */ + if (((fdvp != tdvp) && (!*tvpp)) || ((fdvp == tdvp) && (*tvpp))) { + events |= NOTE_LINK; + } + } + + lock_vnode_and_post(fdvp, events); + if (fdvp != tdvp) { + lock_vnode_and_post(tdvp, events); + } + + /* If you're replacing the target, post a deletion for it */ + if (*tvpp) + { + lock_vnode_and_post(*tvpp, NOTE_DELETE); + } + + lock_vnode_and_post(*fvpp, NOTE_RENAME); + } + + if (no_fvp) { + lookup_compound_vnop_post_hook(_err, fdvp, *fvpp, fcnp->cn_ndp, 0); + } + if (no_tvp && *tvpp != NULLVP) { + lookup_compound_vnop_post_hook(_err, tdvp, *tvpp, tcnp->cn_ndp, 0); + } + + if (_err && _err != EKEEPLOOKING) { + if (*fvpp) { + vnode_put(*fvpp); + *fvpp = NULLVP; + } + if (*tvpp) { + vnode_put(*tvpp); + *tvpp = NULLVP; + } + } + + return (_err); +} + +int +vn_mkdir(struct vnode *dvp, struct vnode **vpp, struct nameidata *ndp, + struct vnode_attr *vap, vfs_context_t ctx) +{ + if (ndp->ni_cnd.cn_nameiop != CREATE) { + panic("Non-CREATE nameiop in vn_mkdir()?"); + } + + if (vnode_compound_mkdir_available(dvp)) { + return VNOP_COMPOUND_MKDIR(dvp, vpp, ndp, vap, ctx); + } else { + return VNOP_MKDIR(dvp, vpp, &ndp->ni_cnd, vap, ctx); + } +} + #if 0 /* *# @@ -4550,6 +4900,59 @@ VNOP_MKDIR(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, return (_err); } +int +VNOP_COMPOUND_MKDIR(struct vnode *dvp, struct vnode **vpp, struct nameidata *ndp, + struct vnode_attr *vap, vfs_context_t ctx) +{ + int _err; + struct vnop_compound_mkdir_args a; + + a.a_desc = &vnop_compound_mkdir_desc; + a.a_dvp = dvp; + a.a_vpp = vpp; + a.a_cnp = &ndp->ni_cnd; + a.a_vap = vap; + a.a_flags = 0; + a.a_context = ctx; +#if 0 + a.a_mkdir_authorizer = vn_authorize_mkdir; +#endif /* 0 */ + a.a_reserved = NULL; + + _err = (*dvp->v_op[vnop_compound_mkdir_desc.vdesc_offset])(&a); + if (_err == 0 && 
!NATIVE_XATTR(dvp)) { + /* + * Remove stale Apple Double file (if any). + */ + xattrfile_remove(dvp, ndp->ni_cnd.cn_nameptr, ctx, 0); + } + + post_event_if_success(dvp, _err, NOTE_LINK | NOTE_WRITE); + + lookup_compound_vnop_post_hook(_err, dvp, *vpp, ndp, (_err == 0)); + if (*vpp && _err && _err != EKEEPLOOKING) { + vnode_put(*vpp); + *vpp = NULLVP; + } + + return (_err); +} + +int +vn_rmdir(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, vfs_context_t ctx) +{ + if (vnode_compound_rmdir_available(dvp)) { + return VNOP_COMPOUND_RMDIR(dvp, vpp, ndp, vap, ctx); + } else { + if (*vpp == NULLVP) { + panic("NULL vp, but not a compound VNOP?"); + } + if (vap != NULL) { + panic("Non-NULL vap, but not a compound VNOP?"); + } + return VNOP_RMDIR(dvp, *vpp, &ndp->ni_cnd, ctx); + } +} #if 0 /* @@ -4618,6 +5021,53 @@ VNOP_RMDIR(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, vfs_c return (_err); } +int +VNOP_COMPOUND_RMDIR(struct vnode *dvp, struct vnode **vpp, struct nameidata *ndp, + struct vnode_attr *vap, vfs_context_t ctx) +{ + int _err; + struct vnop_compound_rmdir_args a; + int no_vp; + + a.a_desc = &vnop_compound_rmdir_desc; + a.a_dvp = dvp; + a.a_vpp = vpp; + a.a_cnp = &ndp->ni_cnd; + a.a_vap = vap; + a.a_flags = 0; + a.a_context = ctx; + a.a_rmdir_authorizer = vn_authorize_rmdir; + a.a_reserved = NULL; + + no_vp = (*vpp == NULLVP); + + _err = (*dvp->v_op[vnop_compound_rmdir_desc.vdesc_offset])(&a); + if (_err == 0 && !NATIVE_XATTR(dvp)) { + /* + * Remove stale Apple Double file (if any). + */ + xattrfile_remove(dvp, ndp->ni_cnd.cn_nameptr, ctx, 0); + } + + if (*vpp) { + post_event_if_success(*vpp, _err, NOTE_DELETE | NOTE_LINK); + } + post_event_if_success(dvp, _err, NOTE_LINK | NOTE_WRITE); + + if (no_vp) { + lookup_compound_vnop_post_hook(_err, dvp, *vpp, ndp, 0); + +#if 0 /* Removing orphaned ._ files requires a vp....
*/ + if (*vpp && _err && _err != EKEEPLOOKING) { + vnode_put(*vpp); + *vpp = NULLVP; + } +#endif /* 0 */ + } + + return (_err); +} + /* * Remove a ._ AppleDouble file */ @@ -4642,7 +5092,7 @@ xattrfile_remove(vnode_t dvp, const char * basename, vfs_context_t ctx, int forc MALLOC(filename, char *, len, M_TEMP, M_WAITOK); len = snprintf(filename, len, "._%s", basename); } - NDINIT(&nd, DELETE, WANTPARENT | LOCKLEAF | NOFOLLOW | USEDVP, UIO_SYSSPACE, + NDINIT(&nd, DELETE, OP_UNLINK, WANTPARENT | LOCKLEAF | NOFOLLOW | USEDVP, UIO_SYSSPACE, CAST_USER_ADDR_T(filename), ctx); nd.ni_dvp = dvp; if (namei(&nd) != 0) @@ -4678,32 +5128,9 @@ xattrfile_remove(vnode_t dvp, const char * basename, vfs_context_t ctx, int forc } } if (force) { - struct vnop_remove_args a; int error; -#ifndef __LP64__ - int thread_safe = THREAD_SAFE_FS(dvp); -#endif /* __LP64__ */ - a.a_desc = &vnop_remove_desc; - a.a_dvp = nd.ni_dvp; - a.a_vp = xvp; - a.a_cnp = &nd.ni_cnd; - a.a_context = ctx; - -#ifndef __LP64__ - if (!thread_safe) { - if ( (lock_fsnode(xvp, NULL)) ) - goto out1; - } -#endif /* __LP64__ */ - - error = (*dvp->v_op[vnop_remove_desc.vdesc_offset])(&a); - -#ifndef __LP64__ - if (!thread_safe) - unlock_fsnode(xvp, NULL); -#endif /* __LP64__ */ - + error = VNOP_REMOVE(dvp, xvp, &nd.ni_cnd, 0, ctx); if (error == 0) vnode_setneedinactive(xvp); @@ -4745,7 +5172,7 @@ xattrfile_setattr(vnode_t dvp, const char * basename, struct vnode_attr * vap, MALLOC(filename, char *, len, M_TEMP, M_WAITOK); len = snprintf(filename, len, "._%s", basename); } - NDINIT(&nd, LOOKUP, NOFOLLOW | USEDVP, UIO_SYSSPACE, + NDINIT(&nd, LOOKUP, OP_SETATTR, NOFOLLOW | USEDVP, UIO_SYSSPACE, CAST_USER_ADDR_T(filename), ctx); nd.ni_dvp = dvp; if (namei(&nd) != 0) @@ -4847,7 +5274,6 @@ VNOP_SYMLINK(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, xattrfile_remove(dvp, cnp->cn_nameptr, ctx, 0); } - #ifndef __LP64__ if (!thread_safe) { unlock_fsnode(dvp, &funnel_state); @@ -5454,6 +5880,16 @@ VNOP_PAGEOUT(struct vnode *vp, upl_t pl, upl_offset_t pl_offset, off_t f_offset, return (_err); } +int +vn_remove(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, int32_t flags, struct vnode_attr *vap, vfs_context_t ctx) +{ + if (vnode_compound_remove_available(dvp)) { + return VNOP_COMPOUND_REMOVE(dvp, vpp, ndp, flags, vap, ctx); + } else { + return VNOP_REMOVE(dvp, *vpp, &ndp->ni_cnd, flags, ctx); + } +} + #if 0 /* diff --git a/bsd/vfs/vfs_attrlist.c b/bsd/vfs/vfs_attrlist.c index b94375efd..091ee16ab 100644 --- a/bsd/vfs/vfs_attrlist.c +++ b/bsd/vfs/vfs_attrlist.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1995-2008 Apple Inc. All rights reserved. + * Copyright (c) 1995-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -395,6 +395,7 @@ static struct getattrlist_attrtab getattrlist_common_tab[] = { {ATTR_CMN_FILEID, VATTR_BIT(va_fileid), sizeof(uint64_t), KAUTH_VNODE_READ_ATTRIBUTES}, {ATTR_CMN_PARENTID, VATTR_BIT(va_parentid), sizeof(uint64_t), KAUTH_VNODE_READ_ATTRIBUTES}, {ATTR_CMN_FULLPATH, 0, sizeof(struct attrreference), KAUTH_VNODE_READ_ATTRIBUTES }, + {ATTR_CMN_ADDEDTIME, VATTR_BIT(va_addedtime), ATTR_TIME_SIZE, KAUTH_VNODE_READ_ATTRIBUTES}, {ATTR_CMN_RETURNED_ATTRS, 0, sizeof(attribute_set_t), 0}, {0, 0, 0, 0} }; @@ -523,6 +524,27 @@ getattrlist_fixupattrs(attribute_set_t *asp, struct vnode_attr *vap) if (asp->commonattr) { tab = getattrlist_common_tab; do { + /* + * This if() statement is slightly confusing. 
We're trying to + * iterate through all of the bits listed in the array + * getattrlist_common_tab, and see whether the filesystem was expected + * to support each one, and whether we need to do anything about it. + * + * This array is full of structs that have 4 fields (attr, bits, size, action). + * The first is used to store the ATTR_CMN_* bit that was being requested + * from userland. The second stores the VATTR_BIT corresponding to the field + * filled in the vnode_attr struct. If it is 0, then we don't typically expect + * the filesystem to fill in this field. The third is the size of the field, + * and the fourth is the type of kauth actions needed. + * + * So, for each of the ATTR_CMN bits listed in this array, we check whether + * it was both passed down to the filesystem via the va_active bitfield and + * actually emitted back by the filesystem via va_supported. If a requested + * bit was not supported, we clear it and move on, so that those attributes + * can be unchecked and a vnode_getattr re-requested from the filesystem + * without them. + */ + if ((tab->attr & asp->commonattr) && (tab->bits & vap->va_active) && (tab->bits & vap->va_supported) == 0) { @@ -1108,6 +1130,7 @@ getvolattrlist(vnode_t vp, struct getattrlist_args *uap, struct attrlist *alp, } if (alp->volattr & ATTR_VOL_UUID) { ATTR_PACK(&ab, vs.f_uuid); + ab.actual.volattr |= ATTR_VOL_UUID; } if (alp->volattr & ATTR_VOL_ATTRIBUTES) { /* fix up volume attribute information */ @@ -1188,6 +1211,7 @@ getattrlist_internal(vnode_t vp, struct getattrlist_args *uap, proc_t p, vfs_con int return_valid; int pack_invalid; int vtype = 0; + uint32_t perms = 0; proc_is64 = proc_is64bit(p); VATTR_INIT(&va); @@ -1604,6 +1628,30 @@ getattrlist_internal(vnode_t vp, struct getattrlist_args *uap, proc_t p, vfs_con ATTR_PACK_TIME(ab, va.va_backup_time, proc_is64); ab.actual.commonattr |= ATTR_CMN_BKUPTIME; } + /* + * The caller is requesting user access; obtain it before getting + * the finder info, since for some network file systems this is a + * performance improvement. + */ + if (al.commonattr & ATTR_CMN_USERACCESS) { /* this is expensive */ + if (vtype == VDIR) { + if (vnode_authorize(vp, NULL, + KAUTH_VNODE_ACCESS | KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY | KAUTH_VNODE_DELETE_CHILD, ctx) == 0) + perms |= W_OK; + if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_LIST_DIRECTORY, ctx) == 0) + perms |= R_OK; + if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_SEARCH, ctx) == 0) + perms |= X_OK; + } else { + if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA, ctx) == 0) + perms |= W_OK; + if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_READ_DATA, ctx) == 0) + perms |= R_OK; + if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_EXECUTE, ctx) == 0) + perms |= X_OK; + } + } + if (al.commonattr & ATTR_CMN_FNDRINFO) { uio_t auio; size_t fisize = 32; @@ -1654,25 +1702,8 @@ getattrlist_internal(vnode_t vp, struct getattrlist_args *uap, proc_t p, vfs_con ATTR_PACK4(ab, va.va_flags); ab.actual.commonattr |= ATTR_CMN_FLAGS; } - if (al.commonattr & ATTR_CMN_USERACCESS) { /* this is expensive */ - uint32_t perms = 0; - if (vtype == VDIR) { - if (vnode_authorize(vp, NULL, - KAUTH_VNODE_ACCESS | KAUTH_VNODE_ADD_FILE | KAUTH_VNODE_ADD_SUBDIRECTORY | KAUTH_VNODE_DELETE_CHILD, ctx) == 0) - perms |= W_OK; - if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_LIST_DIRECTORY, ctx) == 0) - perms |= R_OK; - if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_SEARCH, ctx) == 0) - perms |= X_OK; - } else { - if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_WRITE_DATA, ctx) == 0) - perms |= W_OK; - if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_READ_DATA, ctx) == 0) - perms |= R_OK; - if (vnode_authorize(vp, NULL, KAUTH_VNODE_ACCESS | KAUTH_VNODE_EXECUTE, ctx) == 0) - perms |= X_OK; - } - + /* We already obtained the user access, so just fill in the buffer here */ + if (al.commonattr & ATTR_CMN_USERACCESS) { #if CONFIG_MACF /* * Rather than MAC preceding DAC, in this case we want @@ -1737,6 +1768,12 @@ getattrlist_internal(vnode_t vp, struct getattrlist_args *uap, proc_t p, vfs_con attrlist_pack_string (&ab, fullpathptr, fullpathlen); ab.actual.commonattr |= ATTR_CMN_FULLPATH; } + + if (al.commonattr & ATTR_CMN_ADDEDTIME) { + ATTR_PACK_TIME(ab, va.va_addedtime, proc_is64); + ab.actual.commonattr |= ATTR_CMN_ADDEDTIME; + } + /* directory attributes *********************************************/ if (al.dirattr && (vtype == VDIR)) { @@ -1749,24 +1786,100 @@ getattrlist_internal(vnode_t vp, struct getattrlist_args *uap, proc_t p, vfs_con ab.actual.dirattr |= ATTR_DIR_ENTRYCOUNT; } if (al.dirattr & ATTR_DIR_MOUNTSTATUS) { - ATTR_PACK_CAST(&ab, uint32_t, (vp->v_flag & VROOT) ? - DIR_MNTSTATUS_MNTPOINT : 0); + uint32_t mntstat; + + mntstat = (vp->v_flag & VROOT) ? DIR_MNTSTATUS_MNTPOINT : 0; +#if CONFIG_TRIGGERS + /* + * Report back on active vnode triggers + * that can directly trigger a mount + */ + if (vp->v_resolve && + !(vp->v_resolve->vr_flags & VNT_NO_DIRECT_MOUNT)) { + mntstat |= DIR_MNTSTATUS_TRIGGER; + } +#endif + ATTR_PACK4(ab, mntstat); ab.actual.dirattr |= ATTR_DIR_MOUNTSTATUS; } } /* file attributes **************************************************/ if (al.fileattr && (vtype != VDIR)) { + + size_t rsize = 0; + uint64_t rlength = 0; + uint64_t ralloc = 0; + /* + * Pre-fetch the rsrc attributes now so we only get them once.
+ * Fetch the resource fork size/allocation via xattr interface + */ + if (al.fileattr & (ATTR_FILE_TOTALSIZE | ATTR_FILE_ALLOCSIZE | ATTR_FILE_RSRCLENGTH | ATTR_FILE_RSRCALLOCSIZE)) { + if ((error = vn_getxattr(vp, XATTR_RESOURCEFORK_NAME, NULL, &rsize, XATTR_NOSECURITY, ctx)) != 0) { + if ((error == ENOENT) || (error == ENOATTR) || (error == ENOTSUP) || (error == EPERM)|| (error == EACCES)) { + rsize = 0; + error = 0; + } else { + goto out; + } + } + rlength = rsize; + + if (al.fileattr & (ATTR_FILE_RSRCALLOCSIZE | ATTR_FILE_ALLOCSIZE)) { + uint32_t blksize = vp->v_mount->mnt_vfsstat.f_bsize; + if (blksize == 0) { + blksize = 512; + } + ralloc = roundup(rsize, blksize); + } + } + if (al.fileattr & ATTR_FILE_LINKCOUNT) { ATTR_PACK4(ab, (uint32_t)va.va_nlink); ab.actual.fileattr |= ATTR_FILE_LINKCOUNT; } + /* + * Note the following caveats for the TOTALSIZE and ALLOCSIZE attributes: + * We infer that if the filesystem does not support va_data_size or va_data_alloc + * it must not know about alternate forks. So when we need to gather + * the total size or total alloc, it's OK to substitute the total size for + * the data size below. This is because it is likely a flat filesystem and we must + * be using AD files to store the rsrc fork and EAs. + * + * Additionally, note that getattrlist is barred from being called on + * resource fork paths. (Search for CN_ALLOWRSRCFORK). So if the filesystem does + * support va_data_size, it is guaranteed to represent the data fork's size. This + * is an important distinction to make because when we call vnode_getattr on + * an HFS resource fork vnode, to get the size, it will vend out the resource + * fork's size (it only gets the size of the passed-in vnode). + */ if (al.fileattr & ATTR_FILE_TOTALSIZE) { - ATTR_PACK8(ab, va.va_total_size); + uint64_t totalsize = rlength; + + if (VATTR_IS_SUPPORTED(&va, va_data_size)) { + totalsize += va.va_data_size; + } else { + totalsize += va.va_total_size; + } + + ATTR_PACK8(ab, totalsize); ab.actual.fileattr |= ATTR_FILE_TOTALSIZE; } if (al.fileattr & ATTR_FILE_ALLOCSIZE) { - ATTR_PACK8(ab, va.va_total_alloc); + uint64_t totalalloc = ralloc; + + /* + * If data_alloc is supported, then it must represent the + * data fork size. + */ + if (VATTR_IS_SUPPORTED(&va, va_data_alloc)) { + totalalloc += va.va_data_alloc; + } + else { + totalalloc += va.va_total_alloc; + } + + ATTR_PACK8(ab, totalalloc); ab.actual.fileattr |= ATTR_FILE_ALLOCSIZE; } if (al.fileattr & ATTR_FILE_IOBLOCKSIZE) { @@ -1793,6 +1906,12 @@ getattrlist_internal(vnode_t vp, struct getattrlist_args *uap, proc_t p, vfs_con ATTR_PACK4(ab, dev); ab.actual.fileattr |= ATTR_FILE_DEVTYPE; } + + /* + * If the filesystem does not support datalength + * or dataallocsize, then we infer that totalsize and + * totalalloc are substitutes. 
+ */ if (al.fileattr & ATTR_FILE_DATALENGTH) { if (VATTR_IS_SUPPORTED(&va, va_data_size)) { ATTR_PACK8(ab, va.va_data_size); @@ -1809,37 +1928,17 @@ getattrlist_internal(vnode_t vp, struct getattrlist_args *uap, proc_t p, vfs_con } ab.actual.fileattr |= ATTR_FILE_DATAALLOCSIZE; } - /* fetch resource fork size/allocation via xattr interface */ - if (al.fileattr & (ATTR_FILE_RSRCLENGTH | ATTR_FILE_RSRCALLOCSIZE)) { - size_t rsize; - uint64_t rlength; - - if ((error = vn_getxattr(vp, XATTR_RESOURCEFORK_NAME, NULL, &rsize, XATTR_NOSECURITY, ctx)) != 0) { - if ((error == ENOENT) || (error == ENOATTR) || (error == ENOTSUP) || (error == EPERM)) { - rsize = 0; - error = 0; - } else { - goto out; - } - } - if (al.fileattr & ATTR_FILE_RSRCLENGTH) { - rlength = rsize; - ATTR_PACK8(ab, rlength); - ab.actual.fileattr |= ATTR_FILE_RSRCLENGTH; - } - if (al.fileattr & ATTR_FILE_RSRCALLOCSIZE) { - uint32_t blksize = vp->v_mount->mnt_vfsstat.f_bsize; - if (blksize == 0) - blksize = 512; - rlength = roundup(rsize, blksize); - ATTR_PACK8(ab, rlength); - ab.actual.fileattr |= ATTR_FILE_RSRCALLOCSIZE; - } + /* already got the resource fork size/allocation above */ + if (al.fileattr & ATTR_FILE_RSRCLENGTH) { + ATTR_PACK8(ab, rlength); + ab.actual.fileattr |= ATTR_FILE_RSRCLENGTH; } - if (al.fileattr & ATTR_FILE_PROTECTION_CLASS) { + if (al.fileattr & ATTR_FILE_RSRCALLOCSIZE) { + ATTR_PACK8(ab, ralloc); + ab.actual.fileattr |= ATTR_FILE_RSRCALLOCSIZE; } } - + /* diagnostic */ if (!return_valid && (ab.fixedcursor - ab.base) != fixedsize) panic("packed field size mismatch; allocated %ld but packed %ld for common %08x vol %08x", @@ -1938,7 +2037,7 @@ getattrlist(proc_t p, struct getattrlist_args *uap, __unused int32_t *retval) nameiflags = NOTRIGGER | AUDITVNPATH1; if (!(uap->options & FSOPT_NOFOLLOW)) nameiflags |= FOLLOW; - NDINIT(&nd, LOOKUP, nameiflags, UIO_USERSPACE, uap->path, ctx); + NDINIT(&nd, LOOKUP, OP_GETATTR, nameiflags, UIO_USERSPACE, uap->path, ctx); if ((error = namei(&nd)) != 0) goto out; @@ -2198,8 +2297,6 @@ setattrlist_internal(vnode_t vp, struct setattrlist_args *uap, proc_t p, vfs_con VFS_DEBUG(ctx, vp, "ATTRLIST - XXX device type change not implemented"); goto out; } - if (al.fileattr & ATTR_FILE_PROTECTION_CLASS) { - } /* * Validate and authorize. @@ -2325,10 +2422,10 @@ setattrlist(proc_t p, struct setattrlist_args *uap, __unused int32_t *retval) /* * Look up the file. */ - nameiflags = 0; + nameiflags = AUDITVNPATH1; if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW; - NDINIT(&nd, LOOKUP, nameiflags | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); + NDINIT(&nd, LOOKUP, OP_SETATTR, nameiflags, UIO_USERSPACE, uap->path, ctx); if ((error = namei(&nd)) != 0) goto out; vp = nd.ni_vp; diff --git a/bsd/vfs/vfs_bio.c b/bsd/vfs/vfs_bio.c index 69b9e8520..0d474ed28 100644 --- a/bsd/vfs/vfs_bio.c +++ b/bsd/vfs/vfs_bio.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -110,12 +110,13 @@ #include <sys/sdt.h> + #if BALANCE_QUEUES static __inline__ void bufqinc(int q); static __inline__ void bufqdec(int q); #endif -static int bcleanbuf(buf_t bp, boolean_t discard); +int bcleanbuf(buf_t bp, boolean_t discard); static int brecover_data(buf_t bp); static boolean_t incore(vnode_t vp, daddr64_t blkno); /* timeout is in msecs */ @@ -125,7 +126,13 @@ static void buf_reassign(buf_t bp, vnode_t newvp); static errno_t buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo); static int buf_iterprepare(vnode_t vp, struct buflists *, int flags); static void buf_itercomplete(vnode_t vp, struct buflists *, int flags); -boolean_t buffer_cache_gc(int); +static boolean_t buffer_cache_gc(int); +static buf_t buf_brelse_shadow(buf_t bp); +static void buf_free_meta_store(buf_t bp); + +static buf_t buf_create_shadow_internal(buf_t bp, boolean_t force_copy, + uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv); + __private_extern__ int bdwrite_internal(buf_t, int); @@ -156,6 +163,7 @@ long nbdwrite = 0; int blaundrycnt = 0; static int boot_nbuf_headers = 0; +static TAILQ_HEAD(delayqueue, buf) delaybufqueue; static TAILQ_HEAD(ioqueue, buf) iobufqueue; static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES]; @@ -231,7 +239,7 @@ int lru_is_stale = LRU_IS_STALE; int age_is_stale = AGE_IS_STALE; int meta_is_stale = META_IS_STALE; - +#define MAXLAUNDRY 10 /* LIST_INSERT_HEAD() with assertions */ static __inline__ void @@ -278,7 +286,28 @@ bremhash(buf_t bp) *bp->b_hash.le_prev = (bp)->b_hash.le_next; } +/* + * buf_mtxp held. + */ +static __inline__ void +bmovelaundry(buf_t bp) +{ + bp->b_whichq = BQ_LAUNDRY; + bp->b_timestamp = buf_timestamp(); + binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY); + blaundrycnt++; +} +static __inline__ void +buf_release_credentials(buf_t bp) +{ + if (IS_VALID_CRED(bp->b_rcred)) { + kauth_cred_unref(&bp->b_rcred); + } + if (IS_VALID_CRED(bp->b_wcred)) { + kauth_cred_unref(&bp->b_wcred); + } +} int @@ -315,6 +344,17 @@ buf_markdelayed(buf_t bp) { SET(bp->b_flags, B_DONE); } +void +buf_markclean(buf_t bp) { + + if (ISSET(bp->b_flags, B_DELWRI)) { + CLR(bp->b_flags, B_DELWRI); + + OSAddAtomicLong(-1, &nbdwrite); + buf_reassign(bp, bp->b_vp); + } +} + void buf_markeintr(buf_t bp) { @@ -571,15 +611,179 @@ buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), v } +int +buf_shadow(buf_t bp) +{ + if (bp->b_lflags & BL_SHADOW) + return 1; + return 0; +} + + +buf_t +buf_create_shadow_priv(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg) +{ + return (buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 1)); +} + +buf_t +buf_create_shadow(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg) +{ + return (buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 0)); +} + + +static buf_t +buf_create_shadow_internal(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv) +{ + buf_t io_bp; + + KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_START, bp, 0, 0, 0, 0); + + if ( !(bp->b_flags & B_META) || (bp->b_lflags & BL_IOBUF)) { + + KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, 0, 0, 0, 0); + return (NULL); + } +#ifdef BUF_MAKE_PRIVATE + if (bp->b_shadow_ref && bp->b_data_ref == 0 && external_storage == 0) + panic("buf_create_shadow: %p is in the private state (%d, %d)", bp, 
bp->b_shadow_ref, bp->b_data_ref); +#endif + io_bp = alloc_io_buf(bp->b_vp, priv); + + io_bp->b_flags = bp->b_flags & (B_META | B_ZALLOC | B_ASYNC | B_READ | B_FUA); + io_bp->b_blkno = bp->b_blkno; + io_bp->b_lblkno = bp->b_lblkno; + + if (iodone) { + io_bp->b_transaction = arg; + io_bp->b_iodone = iodone; + io_bp->b_flags |= B_CALL; + } + if (force_copy == FALSE) { + io_bp->b_bcount = bp->b_bcount; + io_bp->b_bufsize = bp->b_bufsize; + + if (external_storage) { + io_bp->b_datap = external_storage; +#ifdef BUF_MAKE_PRIVATE + io_bp->b_data_store = NULL; +#endif + } else { + io_bp->b_datap = bp->b_datap; +#ifdef BUF_MAKE_PRIVATE + io_bp->b_data_store = bp; +#endif + } + *(buf_t *)(&io_bp->b_orig) = bp; + + lck_mtx_lock_spin(buf_mtxp); + + io_bp->b_lflags |= BL_SHADOW; + io_bp->b_shadow = bp->b_shadow; + bp->b_shadow = io_bp; + bp->b_shadow_ref++; + +#ifdef BUF_MAKE_PRIVATE + if (external_storage) + io_bp->b_lflags |= BL_EXTERNAL; + else + bp->b_data_ref++; +#endif + lck_mtx_unlock(buf_mtxp); + } else { + if (external_storage) { +#ifdef BUF_MAKE_PRIVATE + io_bp->b_lflags |= BL_EXTERNAL; +#endif + io_bp->b_bcount = bp->b_bcount; + io_bp->b_bufsize = bp->b_bufsize; + io_bp->b_datap = external_storage; + } else { + allocbuf(io_bp, bp->b_bcount); + + io_bp->b_lflags |= BL_IOBUF_ALLOC; + } + bcopy((caddr_t)bp->b_datap, (caddr_t)io_bp->b_datap, bp->b_bcount); + +#ifdef BUF_MAKE_PRIVATE + io_bp->b_data_store = NULL; +#endif + } + KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, io_bp, 0); + + return (io_bp); +} + + +#ifdef BUF_MAKE_PRIVATE +errno_t +buf_make_private(buf_t bp) +{ + buf_t ds_bp; + buf_t t_bp; + struct buf my_buf; + + KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_START, bp, bp->b_shadow_ref, 0, 0, 0); + + if (bp->b_shadow_ref == 0 || bp->b_data_ref == 0 || ISSET(bp->b_lflags, BL_SHADOW)) { + + KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0); + return (EINVAL); + } + my_buf.b_flags = B_META; + my_buf.b_datap = (uintptr_t)NULL; + allocbuf(&my_buf, bp->b_bcount); + + bcopy((caddr_t)bp->b_datap, (caddr_t)my_buf.b_datap, bp->b_bcount); + + lck_mtx_lock_spin(buf_mtxp); + + for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) { + if ( !ISSET(bp->b_lflags, BL_EXTERNAL)) + break; + } + ds_bp = t_bp; + + if (ds_bp == NULL && bp->b_data_ref) + panic("buf_make_private: b_data_ref != 0 && ds_bp == NULL"); + + if (ds_bp && (bp->b_data_ref == 0 || bp->b_shadow_ref == 0)) + panic("buf_make_private: ref_count == 0 && ds_bp != NULL"); + + if (ds_bp == NULL) { + lck_mtx_unlock(buf_mtxp); + + buf_free_meta_store(&my_buf); + + KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0); + return (EINVAL); + } + for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) { + if ( !ISSET(t_bp->b_lflags, BL_EXTERNAL)) + t_bp->b_data_store = ds_bp; + } + ds_bp->b_data_ref = bp->b_data_ref; + + bp->b_data_ref = 0; + bp->b_datap = my_buf.b_datap; + + lck_mtx_unlock(buf_mtxp); + + KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, 0, 0); + return (0); +} +#endif + void buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction, - void **old_iodone, void **old_transaction) + void (**old_iodone)(buf_t, void *), void **old_transaction) { - if (old_iodone) - *old_iodone = (void *)(bp->b_iodone); + if (old_iodone) + *old_iodone = bp->b_iodone; if (old_transaction) - *old_transaction = (void *)(bp->b_transaction); + *old_transaction = bp->b_transaction; bp->b_transaction = transaction; bp->b_iodone = filter; @@ -884,6 +1088,13 @@ 
buf_strategy(vnode_t devvp, void *ap) vnode_t vp = bp->b_vp; int bmap_flags; errno_t error; +#if CONFIG_DTRACE + int dtrace_io_start_flag = 0; /* We only want to trip the io:::start + * probe once, with the true physical + * block in place (b_blkno) + */ + +#endif if (vp == NULL || vp->v_type == VCHR || vp->v_type == VBLK) panic("buf_strategy: b_vp == NULL || vtype == VCHR | VBLK\n"); @@ -893,7 +1104,6 @@ buf_strategy(vnode_t devvp, void *ap) * end up issuing the I/O... */ bp->b_dev = devvp->v_rdev; - DTRACE_IO1(start, buf_t, bp); if (bp->b_flags & B_READ) bmap_flags = VNODE_READ; @@ -909,6 +1119,7 @@ buf_strategy(vnode_t devvp, void *ap) * to deal with filesystem block sizes * that aren't equal to the page size */ + DTRACE_IO1(start, buf_t, bp); return (cluster_bp(bp)); } if (bp->b_blkno == bp->b_lblkno) { @@ -916,30 +1127,53 @@ buf_strategy(vnode_t devvp, void *ap) size_t contig_bytes; if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) { + DTRACE_IO1(start, buf_t, bp); buf_seterror(bp, error); buf_biodone(bp); return (error); } if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) { + DTRACE_IO1(start, buf_t, bp); buf_seterror(bp, error); buf_biodone(bp); return (error); } + + DTRACE_IO1(start, buf_t, bp); +#if CONFIG_DTRACE + dtrace_io_start_flag = 1; +#endif /* CONFIG_DTRACE */ + if ((bp->b_blkno == -1) || (contig_bytes == 0)) { /* Set block number to force biodone later */ bp->b_blkno = -1; buf_clear(bp); } - else if ((long)contig_bytes < bp->b_bcount) + else if ((long)contig_bytes < bp->b_bcount) { return (buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes)); + } } + +#if CONFIG_DTRACE + if (dtrace_io_start_flag == 0) { + DTRACE_IO1(start, buf_t, bp); + dtrace_io_start_flag = 1; + } +#endif /* CONFIG_DTRACE */ + if (bp->b_blkno == -1) { buf_biodone(bp); return (0); } } + +#if CONFIG_DTRACE + if (dtrace_io_start_flag == 0) + DTRACE_IO1(start, buf_t, bp); +#endif /* CONFIG_DTRACE */ + /* * we can issue the I/O because... * either B_CLUSTER is set which @@ -1067,6 +1301,7 @@ int buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo) { buf_t bp; + int aflags; int error = 0; int must_rescan = 1; struct buflists local_iterblkhd; @@ -1097,6 +1332,7 @@ buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo) goto try_dirty_list; } while (!LIST_EMPTY(&local_iterblkhd)) { + bp = LIST_FIRST(&local_iterblkhd); LIST_REMOVE(bp, b_vnbufs); @@ -1108,7 +1344,12 @@ buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo) if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META))) continue; - if ( (error = (int)buf_acquire_locked(bp, BAC_REMOVE | BAC_SKIP_LOCKED, slpflag, slptimeo)) ) { + aflags = BAC_REMOVE; + + if ( !(flags & BUF_INVALIDATE_LOCKED) ) + aflags |= BAC_SKIP_LOCKED; + + if ( (error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo)) ) { if (error == EDEADLK) /* * this buffer was marked B_LOCKED...
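The CONFIG_DTRACE changes in the buf_strategy() hunks above exist to fire the io:::start probe exactly once per buffer, and only after b_blkno holds the true physical block (the error and cluster paths probe before bailing out). A minimal sketch of that once-only pattern follows; it is illustrative only, and needs_mapping(), map_block(), fire_start_probe(), and issue_io() are hypothetical helpers, not xnu calls:

	/* Sketch: fire a start probe exactly once, preferably after block mapping. */
	static int
	strategy_sketch(buf_t bp)
	{
		int probe_fired = 0;		/* plays the role of dtrace_io_start_flag */

		if (needs_mapping(bp)) {
			int error = map_block(bp);	/* fills in bp->b_blkno on success */

			if (error) {
				fire_start_probe(bp);	/* error path: still probe exactly once */
				return (error);
			}
			fire_start_probe(bp);		/* probe with the true physical block */
			probe_fired = 1;
		}
		if (probe_fired == 0)
			fire_start_probe(bp);		/* already-mapped fast path */

		return (issue_io(bp));
	}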
@@ -1136,6 +1377,10 @@ buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo) } lck_mtx_unlock(buf_mtxp); + if (bp->b_flags & B_LOCKED) + KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 0, 0); + + CLR(bp->b_flags, B_LOCKED); SET(bp->b_flags, B_INVAL); buf_brelse(bp); @@ -1170,7 +1415,12 @@ try_dirty_list: if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META))) continue; - if ( (error = (int)buf_acquire_locked(bp, BAC_REMOVE | BAC_SKIP_LOCKED, slpflag, slptimeo)) ) { + aflags = BAC_REMOVE; + + if ( !(flags & BUF_INVALIDATE_LOCKED) ) + aflags |= BAC_SKIP_LOCKED; + + if ( (error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo)) ) { if (error == EDEADLK) /* * this buffer was marked B_LOCKED... @@ -1198,6 +1448,10 @@ try_dirty_list: } lck_mtx_unlock(buf_mtxp); + if (bp->b_flags & B_LOCKED) + KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 1, 0); + + CLR(bp->b_flags, B_LOCKED); SET(bp->b_flags, B_INVAL); if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA)) @@ -1360,6 +1614,19 @@ bremfree_locked(buf_t bp) { struct bqueues *dp = NULL; int whichq; + + whichq = bp->b_whichq; + + if (whichq == -1) { + if (bp->b_shadow_ref == 0) + panic("bremfree_locked: %p not on freelist", bp); + /* + * there are clones pointing to 'bp'... + * therefore, it was not put on a freelist + * when buf_brelse was last called on 'bp' + */ + return; + } /* * We only calculate the head of the freelist when removing * the last element of the list as that is the only time that @@ -1367,8 +1634,6 @@ bremfree_locked(buf_t bp) * * NB: This makes an assumption about how tailq's are implemented. */ - whichq = bp->b_whichq; - if (bp->b_freelist.tqe_next == NULL) { dp = &bufqueues[whichq]; @@ -1385,6 +1650,7 @@ bremfree_locked(buf_t bp) bp->b_whichq = -1; bp->b_timestamp = 0; + bp->b_shadow = 0; } /* @@ -1432,7 +1698,7 @@ brelvp_locked(buf_t bp) static void buf_reassign(buf_t bp, vnode_t newvp) { - register struct buflists *listheadp; + struct buflists *listheadp; if (newvp == NULL) { printf("buf_reassign: NULL"); @@ -1502,8 +1768,11 @@ bufinit(void) binsheadfree(bp, dp, BQ_EMPTY); binshash(bp, &invalhash); } - boot_nbuf_headers = nbuf_headers; + + TAILQ_INIT(&iobufqueue); + TAILQ_INIT(&delaybufqueue); + for (; i < nbuf_headers + niobuf_headers; i++) { bp = &buf_headers[i]; bufhdrinit(bp); @@ -1601,8 +1870,10 @@ bufzoneinit(void) meta_zones[i].mz_max, PAGE_SIZE, meta_zones[i].mz_name); + zone_change(meta_zones[i].mz_zone, Z_CALLERACCT, FALSE); } buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers"); + zone_change(buf_hdr_zone, Z_CALLERACCT, FALSE); } static __inline__ zone_t @@ -1853,7 +2124,7 @@ vn_bwrite(struct vnop_bwrite_args *ap) * headers, we can get in to the situation where "too" many * buf_bdwrite()s can create situation where the kernel can create * buffers faster than the disks can service. Doing a buf_bawrite() in - * cases were we have "too many" outstanding buf_bdwrite()s avoids that. + * cases where we have "too many" outstanding buf_bdwrite()s avoids that. 
*/ __private_extern__ int bdwrite_internal(buf_t bp, int return_error) @@ -1955,6 +2226,116 @@ buf_bawrite(buf_t bp) } + +static void +buf_free_meta_store(buf_t bp) +{ + if (bp->b_bufsize) { + if (ISSET(bp->b_flags, B_ZALLOC)) { + zone_t z; + + z = getbufzone(bp->b_bufsize); + zfree(z, (void *)bp->b_datap); + } else + kmem_free(kernel_map, bp->b_datap, bp->b_bufsize); + + bp->b_datap = (uintptr_t)NULL; + bp->b_bufsize = 0; + } +} + + +static buf_t +buf_brelse_shadow(buf_t bp) +{ + buf_t bp_head; + buf_t bp_temp; + buf_t bp_return = NULL; +#ifdef BUF_MAKE_PRIVATE + buf_t bp_data; + int data_ref = 0; +#endif + lck_mtx_lock_spin(buf_mtxp); + + bp_head = (buf_t)bp->b_orig; + + if (bp_head->b_whichq != -1) + panic("buf_brelse_shadow: bp_head on freelist %d\n", bp_head->b_whichq); + +#ifdef BUF_MAKE_PRIVATE + if (bp_data = bp->b_data_store) { + bp_data->b_data_ref--; + /* + * snapshot the ref count so that we can check it + * outside of the lock... we only want the guy going + * from 1 -> 0 to try and release the storage + */ + data_ref = bp_data->b_data_ref; + } +#endif + KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_START, bp, bp_head, bp_head->b_shadow_ref, 0, 0); + + bp_head->b_shadow_ref--; + + for (bp_temp = bp_head; bp_temp && bp != bp_temp->b_shadow; bp_temp = bp_temp->b_shadow); + + if (bp_temp == NULL) + panic("buf_brelse_shadow: bp not on list %p", bp_head); + + bp_temp->b_shadow = bp_temp->b_shadow->b_shadow; + +#ifdef BUF_MAKE_PRIVATE + /* + * we're about to free the current 'owner' of the data buffer and + * there is at least one other shadow buf_t still pointing at it + * so transfer it to the first shadow buf left in the chain + */ + if (bp == bp_data && data_ref) { + if ((bp_data = bp_head->b_shadow) == NULL) + panic("buf_brelse_shadow: data_ref mismatch bp(%p)", bp); + + for (bp_temp = bp_data; bp_temp; bp_temp = bp_temp->b_shadow) + bp_temp->b_data_store = bp_data; + bp_data->b_data_ref = data_ref; + } +#endif + if (bp_head->b_shadow_ref == 0 && bp_head->b_shadow) + panic("buf_relse_shadow: b_shadow != NULL && b_shadow_ref == 0 bp(%p)", bp); + if (bp_head->b_shadow_ref && bp_head->b_shadow == 0) + panic("buf_relse_shadow: b_shadow == NULL && b_shadow_ref != 0 bp(%p)", bp); + + if (bp_head->b_shadow_ref == 0) { + if (!ISSET(bp_head->b_lflags, BL_BUSY)) { + + CLR(bp_head->b_flags, B_AGE); + bp_head->b_timestamp = buf_timestamp(); + + if (ISSET(bp_head->b_flags, B_LOCKED)) { + bp_head->b_whichq = BQ_LOCKED; + binstailfree(bp_head, &bufqueues[BQ_LOCKED], BQ_LOCKED); + } else { + bp_head->b_whichq = BQ_META; + binstailfree(bp_head, &bufqueues[BQ_META], BQ_META); + } + } else if (ISSET(bp_head->b_lflags, BL_WAITSHADOW)) { + CLR(bp_head->b_lflags, BL_WAITSHADOW); + + bp_return = bp_head; + } + } + lck_mtx_unlock(buf_mtxp); +#ifdef BUF_MAKE_PRIVATE + if (bp == bp_data && data_ref == 0) + buf_free_meta_store(bp); + + bp->b_data_store = NULL; +#endif + KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_END, bp, 0, 0, 0, 0); + + return (bp_return); +} + + /* * Release a buffer on to the free lists. * Described in Bach (p. 46). 
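Before the buf_brelse() changes in the next hunk, it may help to see the shadow-buffer lifecycle end to end. A hedged sketch using only interfaces added by this patch (buf_create_shadow() and buf_brelse()); the I/O step is a placeholder, and the NULL iodone/arg pair is an assumption about the simplest synchronous use:

	/*
	 * Sketch: clone a B_META buffer so an additional I/O can be issued
	 * against the same storage.  The master gains a b_shadow_ref and is
	 * kept off the freelists until the last shadow is released.
	 */
	static void
	shadow_io_sketch(buf_t master)
	{
		buf_t shadow;

		/* force_copy == FALSE and no external storage: share the master's data */
		shadow = buf_create_shadow(master, FALSE, 0, NULL, NULL);
		if (shadow == NULL)
			return;		/* only B_META, non-BL_IOBUF buffers can be shadowed */

		/* ... issue I/O against 'shadow' here (placeholder) ... */

		buf_brelse(shadow);	/* buf_brelse_shadow() drops the master's shadow ref */
	}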
@@ -1979,7 +2360,18 @@ buf_brelse(buf_t bp) bp->b_tag = 0; #endif if (bp->b_lflags & BL_IOBUF) { + buf_t shadow_master_bp = NULL; + + if (ISSET(bp->b_lflags, BL_SHADOW)) + shadow_master_bp = buf_brelse_shadow(bp); + else if (ISSET(bp->b_lflags, BL_IOBUF_ALLOC)) + buf_free_meta_store(bp); free_io_buf(bp); + + if (shadow_master_bp) { + bp = shadow_master_bp; + goto finish_shadow_master; + } return; } @@ -1999,7 +2391,7 @@ buf_brelse(buf_t bp) if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) { if (ISSET(bp->b_flags, B_FILTER)) { /* if necessary, call out */ void (*iodone_func)(struct buf *, void *) = bp->b_iodone; - void *arg = (void *)bp->b_transaction; + void *arg = bp->b_transaction; CLR(bp->b_flags, B_FILTER); /* but note callout done */ bp->b_iodone = NULL; @@ -2020,7 +2412,7 @@ buf_brelse(buf_t bp) kern_return_t kret; int upl_flags; - if ( (upl == NULL) ) { + if (upl == NULL) { if ( !ISSET(bp->b_flags, B_INVAL)) { kret = ubc_create_upl(bp->b_vp, ubc_blktooff(bp->b_vp, bp->b_lblkno), @@ -2082,6 +2474,9 @@ buf_brelse(buf_t bp) if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL) || (ISSET(bp->b_lflags, BL_WANTDEALLOC) && !ISSET(bp->b_flags, B_DELWRI))) { + + boolean_t delayed_buf_free_meta_store = FALSE; + /* * If it's invalid or empty, dissociate it from its vnode, * release its storage if B_META, and @@ -2091,34 +2486,34 @@ buf_brelse(buf_t bp) OSAddAtomicLong(-1, &nbdwrite); if (ISSET(bp->b_flags, B_META)) { - if (bp->b_bufsize) { - if (ISSET(bp->b_flags, B_ZALLOC)) { - zone_t z; - - z = getbufzone(bp->b_bufsize); - zfree(z, (void *)bp->b_datap); - } else - kmem_free(kernel_map, bp->b_datap, bp->b_bufsize); - - bp->b_datap = (uintptr_t)NULL; - bp->b_bufsize = 0; - } + if (bp->b_shadow_ref) + delayed_buf_free_meta_store = TRUE; + else + buf_free_meta_store(bp); } /* * nuke any credentials we were holding */ - if (IS_VALID_CRED(bp->b_rcred)) { - kauth_cred_unref(&bp->b_rcred); - } - if (IS_VALID_CRED(bp->b_wcred)) { - kauth_cred_unref(&bp->b_wcred); + buf_release_credentials(bp); + + lck_mtx_lock_spin(buf_mtxp); + + if (bp->b_shadow_ref) { + SET(bp->b_lflags, BL_WAITSHADOW); + + lck_mtx_unlock(buf_mtxp); + + return; } - CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA)); + if (delayed_buf_free_meta_store == TRUE) { - bufq = &bufqueues[BQ_EMPTY]; - bp->b_whichq = BQ_EMPTY; + lck_mtx_unlock(buf_mtxp); +finish_shadow_master: + buf_free_meta_store(bp); - lck_mtx_lock_spin(buf_mtxp); + lck_mtx_lock_spin(buf_mtxp); + } + CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA)); if (bp->b_vp) brelvp_locked(bp); @@ -2127,8 +2522,10 @@ buf_brelse(buf_t bp) BLISTNONE(bp); binshash(bp, &invalhash); - binsheadfree(bp, bufq, BQ_EMPTY); + bp->b_whichq = BQ_EMPTY; + binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY); } else { + /* * It has valid data. Put it on the end of the appropriate * queue, so that it'll stick around for as long as possible. @@ -2143,13 +2540,32 @@ buf_brelse(buf_t bp) whichq = BQ_LRU; /* valid data */ bufq = &bufqueues[whichq]; - CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE)); - bp->b_whichq = whichq; bp->b_timestamp = buf_timestamp(); - lck_mtx_lock_spin(buf_mtxp); - - binstailfree(bp, bufq, whichq); + lck_mtx_lock_spin(buf_mtxp); + + /* + * the buf_brelse_shadow routine doesn't take 'ownership' + * of the parent buf_t... it updates state that is protected by + * the buf_mtxp, and checks for BL_BUSY to determine whether to + * put the buf_t back on a free list. 
b_shadow_ref is protected + * by the lock, and since we have not yet cleared B_BUSY, we need + * to check it while holding the lock to insure that one of us + * puts this buf_t back on a free list when it is safe to do so + */ + if (bp->b_shadow_ref == 0) { + CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE)); + bp->b_whichq = whichq; + binstailfree(bp, bufq, whichq); + } else { + /* + * there are still cloned buf_t's pointing + * at this guy... need to keep it off the + * freelists until a buf_brelse is done on + * the last clone + */ + CLR(bp->b_flags, (B_ASYNC | B_NOCACHE)); + } } if (needbuffer) { /* @@ -2581,6 +2997,23 @@ buf_geteblk(int size) return (bp); } +uint32_t +buf_redundancy_flags(buf_t bp) +{ + return bp->b_redundancy_flags; +} + +void +buf_set_redundancy_flags(buf_t bp, uint32_t flags) +{ + SET(bp->b_redundancy_flags, flags); +} + +void +buf_clear_redundancy_flags(buf_t bp, uint32_t flags) +{ + CLR(bp->b_redundancy_flags, flags); +} /* * With UBC, there is no need to expand / shrink the file data @@ -2861,14 +3294,14 @@ found: /* * Clean a buffer. - * Returns 0 is buffer is ready to use, + * Returns 0 if buffer is ready to use, * Returns 1 if issued a buf_bawrite() to indicate * that the buffer is not ready. * * buf_mtxp is held upon entry * returns with buf_mtxp locked */ -static int +int bcleanbuf(buf_t bp, boolean_t discard) { /* Remove from the queue */ @@ -2887,10 +3320,7 @@ bcleanbuf(buf_t bp, boolean_t discard) SET(bp->b_lflags, BL_WANTDEALLOC); } - bp->b_whichq = BQ_LAUNDRY; - bp->b_timestamp = buf_timestamp(); - binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY); - blaundrycnt++; + bmovelaundry(bp); lck_mtx_unlock(buf_mtxp); @@ -2926,30 +3356,12 @@ bcleanbuf(buf_t bp, boolean_t discard) BLISTNONE(bp); - if (ISSET(bp->b_flags, B_META)) { - vm_offset_t elem; - - elem = (vm_offset_t)bp->b_datap; - bp->b_datap = (uintptr_t)0xdeadbeef; - - if (ISSET(bp->b_flags, B_ZALLOC)) { - zone_t z; - - z = getbufzone(bp->b_bufsize); - zfree(z, (void *)elem); - } else - kmem_free(kernel_map, elem, bp->b_bufsize); - } + if (ISSET(bp->b_flags, B_META)) + buf_free_meta_store(bp); trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); - /* nuke any credentials we were holding */ - if (IS_VALID_CRED(bp->b_rcred)) { - kauth_cred_unref(&bp->b_rcred); - } - if (IS_VALID_CRED(bp->b_wcred)) { - kauth_cred_unref(&bp->b_wcred); - } + buf_release_credentials(bp); /* If discarding, just move to the empty queue */ if (discard) { @@ -3163,24 +3575,6 @@ buf_biowait(buf_t bp) return (0); } -/* - * Wait for the callback operation on a B_CALL buffer to complete. - */ -void -buf_biowait_callback(buf_t bp) -{ - while (!ISSET(bp->b_lflags, BL_CALLDONE)) { - - lck_mtx_lock_spin(buf_mtxp); - - if (!ISSET(bp->b_lflags, BL_CALLDONE)) { - DTRACE_IO1(wait__start, buf_t, bp); - (void) msleep(bp, buf_mtxp, PDROP | (PRIBIO+1), "buf_biowait", NULL); - DTRACE_IO1(wait__done, buf_t, bp); - } else - lck_mtx_unlock(buf_mtxp); - } -} /* * Mark I/O complete on a buffer. 
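The buf_redundancy_flags(), buf_set_redundancy_flags(), and buf_clear_redundancy_flags() accessors introduced above give layered clients a private flag word on a buf_t. A small sketch of the intended call pattern; the flag value here is a placeholder, not a real xnu constant:

	#define SKETCH_BUF_MIRRORED	0x01	/* hypothetical redundancy flag */

	static void
	redundancy_sketch(buf_t bp)
	{
		buf_set_redundancy_flags(bp, SKETCH_BUF_MIRRORED);

		if (buf_redundancy_flags(bp) & SKETCH_BUF_MIRRORED) {
			/* e.g. route the write to both sides of a mirror (placeholder) */
		}
		buf_clear_redundancy_flags(bp, SKETCH_BUF_MIRRORED);
	}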
@@ -3242,6 +3636,11 @@ buf_biodone(buf_t bp) else if (bp->b_flags & B_PAGEIO) code |= DKIO_PAGING; + if (bp->b_flags & B_THROTTLED_IO) + code |= DKIO_THROTTLE; + else if (bp->b_flags & B_PASSIVE) + code |= DKIO_PASSIVE; + KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE, bp, (uintptr_t)bp->b_vp, bp->b_resid, bp->b_error, 0); @@ -3252,11 +3651,14 @@ buf_biodone(buf_t bp) microuptime(&priority_IO_timestamp_for_root); hard_throttle_on_root = 0; } + /* * I/O was done, so don't believe - * the DIRTY state from VM anymore + * the DIRTY state from VM anymore... + * and we need to reset the THROTTLED/PASSIVE + * indicators */ - CLR(bp->b_flags, B_WASDIRTY); + CLR(bp->b_flags, (B_WASDIRTY | B_THROTTLED_IO | B_PASSIVE)); DTRACE_IO1(done, buf_t, bp); if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW)) @@ -3269,46 +3671,26 @@ buf_biodone(buf_t bp) if (ISSET(bp->b_flags, (B_CALL | B_FILTER))) { /* if necessary, call out */ void (*iodone_func)(struct buf *, void *) = bp->b_iodone; - void *arg = (void *)bp->b_transaction; + void *arg = bp->b_transaction; int callout = ISSET(bp->b_flags, B_CALL); + if (iodone_func == NULL) + panic("biodone: bp @ %p has NULL b_iodone!\n", bp); + CLR(bp->b_flags, (B_CALL | B_FILTER)); /* filters and callouts are one-shot */ bp->b_iodone = NULL; bp->b_transaction = NULL; - if (iodone_func == NULL) { - panic("biodone: bp @ %p has NULL b_iodone!\n", bp); - } else { - if (callout) - SET(bp->b_flags, B_DONE); /* note that it's done */ - (*iodone_func)(bp, arg); - } - if (callout) { - int need_wakeup = 0; + if (callout) + SET(bp->b_flags, B_DONE); /* note that it's done */ - /* + (*iodone_func)(bp, arg); + + if (callout) { + /* * assumes that the callback function takes * ownership of the bp and deals with releasing it if necessary - * BL_WANTED indicates that we've decided to wait on the - * completion of this I/O in a synchronous manner... we - * still call the callback function, but in addition we - * will do a wakeup... 
BL_CALLDONE indicates that the callback - * routine has completed and its ok for the waiter to take - * 'ownership' of this bp back */ - lck_mtx_lock_spin(buf_mtxp); - - if (bp->b_lflags & BL_WANTED) { - CLR(bp->b_lflags, BL_WANTED); - need_wakeup = 1; - } - SET(bp->b_lflags, BL_CALLDONE); - - lck_mtx_unlock(buf_mtxp); - - if (need_wakeup) - wakeup(bp); - goto biodone_done; } /* @@ -3390,8 +3772,8 @@ void vfs_bufstats() { int i, j, count; - register struct buf *bp; - register struct bqueues *dp; + struct buf *bp; + struct bqueues *dp; int counts[MAXBSIZE/CLBYTES+1]; static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" }; @@ -3418,7 +3800,7 @@ vfs_bufstats() } #endif /* DIAGNOSTIC */ -#define NRESERVEDIOBUFS 64 +#define NRESERVEDIOBUFS 128 buf_t @@ -3433,9 +3815,7 @@ alloc_io_buf(vnode_t vp, int priv) bufstats.bufs_iobufsleeps++; need_iobuffer = 1; - (void) msleep(&need_iobuffer, iobuffer_mtxp, PDROP | (PRIBIO+1), (const char *)"alloc_io_buf", NULL); - - lck_mtx_lock_spin(iobuffer_mtxp); + (void) msleep(&need_iobuffer, iobuffer_mtxp, PSPIN | (PRIBIO+1), (const char *)"alloc_io_buf", NULL); } TAILQ_REMOVE(&iobufqueue, bp, b_freelist); @@ -3457,6 +3837,7 @@ alloc_io_buf(vnode_t vp, int priv) bp->b_datap = 0; bp->b_flags = 0; bp->b_lflags = BL_BUSY | BL_IOBUF; + bp->b_redundancy_flags = 0; bp->b_blkno = bp->b_lblkno = 0; #ifdef JOE_DEBUG bp->b_owner = current_thread(); @@ -3551,6 +3932,8 @@ bcleanbuf_thread_init(void) thread_deallocate(thread); } +typedef int (*bcleanbufcontinuation)(int); + static void bcleanbuf_thread(void) { @@ -3562,10 +3945,9 @@ bcleanbuf_thread(void) lck_mtx_lock_spin(buf_mtxp); while ( (bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY])) == NULL) { - (void)msleep((void *)&bufqueues[BQ_LAUNDRY], buf_mtxp, PDROP | PRIBIO, "blaundry", NULL); - - lck_mtx_lock_spin(buf_mtxp); + (void)msleep0(&bufqueues[BQ_LAUNDRY], buf_mtxp, PRIBIO|PDROP, "blaundry", 0, (bcleanbufcontinuation)bcleanbuf_thread); } + /* * Remove from the queue */ @@ -3597,7 +3979,7 @@ bcleanbuf_thread(void) binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY); blaundrycnt++; - /* we never leave a busy page on the laundary queue */ + /* we never leave a busy page on the laundry queue */ CLR(bp->b_lflags, BL_BUSY); buf_busycount--; #ifdef JOE_DEBUG @@ -3606,12 +3988,18 @@ bcleanbuf_thread(void) #endif lck_mtx_unlock(buf_mtxp); - - if (loopcnt > 10) { - (void)tsleep((void *)&bufqueues[BQ_LAUNDRY], PRIBIO, "blaundry", 1); + + if (loopcnt > MAXLAUNDRY) { + /* + * bawrite_internal() can return errors if we're throttled. If we've + * done several I/Os and failed, give the system some time to unthrottle + * the vnode + */ + (void)tsleep((void *)&bufqueues[BQ_LAUNDRY], PRIBIO, "blaundry", 1); loopcnt = 0; } else { - (void)thread_block(THREAD_CONTINUE_NULL); + /* give other threads a chance to run */ + (void)thread_block(THREAD_CONTINUE_NULL); loopcnt++; } } @@ -3680,34 +4068,125 @@ buffer_cache_gc(int all) { buf_t bp; boolean_t did_large_zfree = FALSE; + boolean_t need_wakeup = FALSE; int now = buf_timestamp(); - uint32_t count = 0; + uint32_t found = 0, total_found = 0; + struct bqueues privq; int thresh_hold = BUF_STALE_THRESHHOLD; if (all) thresh_hold = 0; + /* + * We only care about metadata (incore storage comes from zalloc()). + * No more than 1024 buffers total, and only those not accessed within the + * last 30s. We will also only examine 128 buffers during a single grab + * of the lock in order to limit lock hold time. 
+ */ + lck_mtx_lock(buf_mtxp); + do { + found = 0; + TAILQ_INIT(&privq); + need_wakeup = FALSE; - lck_mtx_lock_spin(buf_mtxp); + while (((bp = TAILQ_FIRST(&bufqueues[BQ_META]))) && + (now > bp->b_timestamp) && + (now - bp->b_timestamp > thresh_hold) && + (found < BUF_MAX_GC_BATCH_SIZE)) { + + /* Remove from free list */ + bremfree_locked(bp); + found++; + +#ifdef JOE_DEBUG + bp->b_owner = current_thread(); + bp->b_tag = 12; +#endif + + /* If dirty, move to laundry queue and remember to do wakeup */ + if (ISSET(bp->b_flags, B_DELWRI)) { + SET(bp->b_lflags, BL_WANTDEALLOC); + + bmovelaundry(bp); + need_wakeup = TRUE; + + continue; + } + + /* + * Mark busy and put on private list. We could technically get + * away without setting BL_BUSY here. + */ + SET(bp->b_lflags, BL_BUSY); + buf_busycount++; - /* We only care about metadata (incore storage comes from zalloc()) */ - bp = TAILQ_FIRST(&bufqueues[BQ_META]); + /* + * Remove from hash and dissociate from vp. + */ + bremhash(bp); + if (bp->b_vp) { + brelvp_locked(bp); + } - /* Only collect buffers unused in the last N seconds. Note: ordered by timestamp. */ - while ((bp != NULL) && ((now - bp->b_timestamp) > thresh_hold) && (all || (count < BUF_MAX_GC_COUNT))) { - int result, size; - boolean_t is_zalloc; + TAILQ_INSERT_TAIL(&privq, bp, b_freelist); + } - size = buf_size(bp); - is_zalloc = ISSET(bp->b_flags, B_ZALLOC); + if (found == 0) { + break; + } - result = bcleanbuf(bp, TRUE); - if ((result == 0) && is_zalloc && (size >= PAGE_SIZE)) { - /* We've definitely freed at least a page to a zone */ - did_large_zfree = TRUE; + /* Drop lock for batch processing */ + lck_mtx_unlock(buf_mtxp); + + /* Wakeup and yield for laundry if need be */ + if (need_wakeup) { + wakeup(&bufqueues[BQ_LAUNDRY]); + (void)thread_block(THREAD_CONTINUE_NULL); } - bp = TAILQ_FIRST(&bufqueues[BQ_META]); - count++; - } + + /* Clean up every buffer on private list */ + TAILQ_FOREACH(bp, &privq, b_freelist) { + /* Take note if we've definitely freed at least a page to a zone */ + if ((ISSET(bp->b_flags, B_ZALLOC)) && (buf_size(bp) >= PAGE_SIZE)) { + did_large_zfree = TRUE; + } + + trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno); + + /* Free Storage */ + buf_free_meta_store(bp); + + /* Release credentials */ + buf_release_credentials(bp); + + /* Prepare for moving to empty queue */ + CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED + | B_AGE | B_ASYNC | B_NOCACHE | B_FUA)); + bp->b_whichq = BQ_EMPTY; + BLISTNONE(bp); + } + + lck_mtx_lock(buf_mtxp); + + /* Back under lock, move them all to invalid hash and clear busy */ + TAILQ_FOREACH(bp, &privq, b_freelist) { + binshash(bp, &invalhash); + CLR(bp->b_lflags, BL_BUSY); + buf_busycount--; + +#ifdef JOE_DEBUG + if (bp->b_owner != current_thread()) { + panic("Buffer stolen from buffer_cache_gc()"); + } + bp->b_owner = current_thread(); + bp->b_tag = 13; +#endif + } + + /* And do a big bulk move to the empty queue */ + TAILQ_CONCAT(&bufqueues[BQ_EMPTY], &privq, b_freelist); + total_found += found; + + } while ((all || (total_found < BUF_MAX_GC_COUNT)) && (found == BUF_MAX_GC_BATCH_SIZE)); lck_mtx_unlock(buf_mtxp); diff --git a/bsd/vfs/vfs_cache.c b/bsd/vfs/vfs_cache.c index ba73d95a4..3096d1294 100644 --- a/bsd/vfs/vfs_cache.c +++ b/bsd/vfs/vfs_cache.c @@ -114,6 +114,7 @@ long numcache; /* number of cache entries allocated */ int desiredNodes; int desiredNegNodes; int ncs_negtotal; +int nc_disabled = 0; TAILQ_HEAD(, namecache) nchead; /* chain of all name cache entries */ TAILQ_HEAD(, namecache) neghead; /* 
chain of only negative cache entries */ @@ -309,8 +310,22 @@ again: */ if (((vp->v_parent != NULLVP) && !fixhardlink) || (flags & BUILDPATH_NO_FS_ENTER)) { - vp = vp->v_parent; + /* + * In this if () block we are not allowed to enter the filesystem + * to conclusively get the most accurate parent identifier. + * As a result, if 'vp' does not identify '/' and it + * does not have a valid v_parent, then error out + * and disallow further path construction + */ + if ((vp->v_parent == NULLVP) && (rootvnode != vp)) { + /* Only '/' is allowed to have a NULL parent pointer */ + ret = EINVAL; + + /* The code below will exit early if 'tvp = vp' == NULL */ + } + vp = vp->v_parent; + /* * if the vnode we have in hand isn't a directory and it * has a v_parent, then we started with the resource fork @@ -808,11 +823,18 @@ void vnode_uncache_authorized_action(vnode_t vp, kauth_action_t action) } -boolean_t vnode_cache_is_authorized(vnode_t vp, vfs_context_t ctx, kauth_action_t action) +extern int bootarg_vnode_cache_defeat; /* default = 0, from bsd_init.c */ + +boolean_t +vnode_cache_is_authorized(vnode_t vp, vfs_context_t ctx, kauth_action_t action) { kauth_cred_t ucred; boolean_t retval = FALSE; + /* Boot argument to defeat rights caching */ + if (bootarg_vnode_cache_defeat) + return FALSE; + if ( (vp->v_mount->mnt_kern_flag & (MNTK_AUTH_OPAQUE | MNTK_AUTH_CACHE_TTL)) ) { /* * a TTL is enabled on the rights cache... handle it here @@ -937,7 +959,7 @@ boolean_t vnode_cache_is_stale(vnode_t vp) */ int cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, - vfs_context_t ctx, int *trailing_slash, int *dp_authorized, vnode_t last_dp) + vfs_context_t ctx, int *dp_authorized, vnode_t last_dp) { char *cp; /* pointer into pathname argument */ int vid; @@ -951,8 +973,12 @@ cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, unsigned int hash; int error = 0; +#if CONFIG_TRIGGERS + vnode_t trigger_vp; +#endif /* CONFIG_TRIGGERS */ + ucred = vfs_context_ucred(ctx); - *trailing_slash = 0; + ndp->ni_flag &= ~(NAMEI_TRAILINGSLASH); NAME_CACHE_LOCK_SHARED(); @@ -999,7 +1025,7 @@ cache_lookup_path(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, ndp->ni_pathlen--; if (*cp == '\0') { - *trailing_slash = 1; + ndp->ni_flag |= NAMEI_TRAILINGSLASH; *ndp->ni_next = '\0'; } } @@ -1073,10 +1099,12 @@ skiprsrcfork: *dp_authorized = 1; if ( (cnp->cn_flags & (ISLASTCN | ISDOTDOT)) ) { - if (cnp->cn_nameiop != LOOKUP) - break; - if (cnp->cn_flags & (LOCKPARENT | NOCACHE)) - break; + if (cnp->cn_nameiop != LOOKUP) + break; + if (cnp->cn_flags & LOCKPARENT) + break; + if (cnp->cn_flags & NOCACHE) + break; if (cnp->cn_flags & ISDOTDOT) { /* * Force directory hardlinks to go to @@ -1126,6 +1154,7 @@ skiprsrcfork: vp = NULL; break; } + if ( (mp = vp->v_mountedhere) && ((cnp->cn_flags & NOCROSSMOUNT) == 0)) { if (mp->mnt_realrootvp == NULLVP || mp->mnt_generation != mount_generation || @@ -1133,6 +1162,20 @@ skiprsrcfork: break; vp = mp->mnt_realrootvp; } + +#if CONFIG_TRIGGERS + /* + * After traversing all mountpoints stacked here, if we have a + * trigger in hand, resolve it. Note that we don't need to + * leave the fast path if the mount has already happened. + */ + if ((vp->v_resolve != NULL) && + (vp->v_resolve->vr_resolve_func != NULL)) { + break; + } +#endif /* CONFIG_TRIGGERS */ + + dp = vp; vp = NULLVP; @@ -1184,7 +1227,7 @@ need_dp: * immediately w/o waiting... 
it always succeeds */ vnode_get(dp); - } else if ( (vnode_getwithvid(dp, vid)) ) { + } else if ( (vnode_getwithvid_drainok(dp, vid)) ) { /* * failure indicates the vnode * changed identity or is being @@ -1202,7 +1245,7 @@ need_dp: } } if (vp != NULLVP) { - if ( (vnode_getwithvid(vp, vvid)) ) { + if ( (vnode_getwithvid_drainok(vp, vvid)) ) { vp = NULLVP; /* @@ -1219,9 +1262,24 @@ need_dp: } } } + ndp->ni_dvp = dp; ndp->ni_vp = vp; +#if CONFIG_TRIGGERS + trigger_vp = vp ? vp : dp; + if ((error == 0) && (trigger_vp != NULLVP) && vnode_isdir(trigger_vp)) { + error = vnode_trigger_resolve(trigger_vp, ndp, ctx); + if (error) { + if (vp) + vnode_put(vp); + if (dp) + vnode_put(dp); + goto errorout; + } + } +#endif /* CONFIG_TRIGGERS */ + errorout: /* * If we came into cache_lookup_path after an iteration of the lookup loop that @@ -1249,6 +1307,10 @@ cache_lookup_locked(vnode_t dvp, struct componentname *cnp) long namelen = cnp->cn_namelen; unsigned int hashval = (cnp->cn_hash & NCHASHMASK); + if (nc_disabled) { + return NULL; + } + ncpp = NCHHASH(dvp, cnp->cn_hash); LIST_FOREACH(ncp, ncpp, nc_hash) { if ((ncp->nc_dvp == dvp) && (ncp->nc_hashval == hashval)) { @@ -1328,6 +1390,10 @@ cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) cnp->cn_hash = hash_string(cnp->cn_nameptr, cnp->cn_namelen); hashval = (cnp->cn_hash & NCHASHMASK); + if (nc_disabled) { + return 0; + } + NAME_CACHE_LOCK_SHARED(); relook: @@ -1485,6 +1551,9 @@ cache_enter_locked(struct vnode *dvp, struct vnode *vp, struct componentname *cn struct namecache *ncp, *negp; struct nchashhead *ncpp; + if (nc_disabled) + return; + /* * if the entry is for -ve caching vp is null */ @@ -1799,7 +1868,10 @@ cache_purge(vnode_t vp) struct namecache *ncp; kauth_cred_t tcred = NULL; - if ((LIST_FIRST(&vp->v_nclinks) == NULL) && (LIST_FIRST(&vp->v_ncchildren) == NULL) && (vp->v_cred == NOCRED)) + if ((LIST_FIRST(&vp->v_nclinks) == NULL) && + (LIST_FIRST(&vp->v_ncchildren) == NULL) && + (vp->v_cred == NOCRED) && + (vp->v_parent == NULLVP)) return; NAME_CACHE_LOCK(); @@ -1973,9 +2045,6 @@ add_name_internal(const char *name, uint32_t len, u_int hashval, boolean_t need_ uint32_t lock_index; char *ptr; - if (hashval == 0) { - hashval = hash_string(name, 0); - } /* * if the length already accounts for the null-byte, then * subtract one so later on we don't index past the end @@ -1984,6 +2053,10 @@ add_name_internal(const char *name, uint32_t len, u_int hashval, boolean_t need_ if (len > 0 && name[len-1] == '\0') { len--; } + if (hashval == 0) { + hashval = hash_string(name, len); + } + /* * take this lock 'shared' to keep the hash stable * if someone else decides to grow the pool they diff --git a/bsd/vfs/vfs_cluster.c b/bsd/vfs/vfs_cluster.c index 499056a3b..0e8bd67dd 100644 --- a/bsd/vfs/vfs_cluster.c +++ b/bsd/vfs/vfs_cluster.c @@ -82,6 +82,7 @@ #include <mach/memory_object_types.h> #include <mach/vm_map.h> #include <mach/upl.h> +#include <kern/task.h> #include <vm/vm_kern.h> #include <vm/vm_map.h> @@ -90,6 +91,8 @@ #include <sys/kdebug.h> #include <libkern/OSAtomic.h> +#include <sys/sdt.h> + #if 0 #undef KERNEL_DEBUG #define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT @@ -111,6 +114,8 @@ #define CL_DIRECT_IO 0x1000 #define CL_PASSIVE 0x2000 #define CL_IOSTREAMING 0x4000 +#define CL_CLOSE 0x8000 +#define CL_ENCRYPTED 0x10000 #define MAX_VECTOR_UPL_ELEMENTS 8 #define MAX_VECTOR_UPL_SIZE (2 * MAX_UPL_SIZE) * PAGE_SIZE @@ -122,6 +127,7 @@ extern void vector_upl_set_pagelist(upl_t); extern void vector_upl_set_iostate(upl_t, upl_t, 
vm_offset_t, u_int32_t); struct clios { + lck_mtx_t io_mtxp; u_int io_completed; /* amount of io that has currently completed */ u_int io_issued; /* amount of io that was successfully issued */ int io_error; /* error code of first error encountered */ @@ -131,7 +137,6 @@ struct clios { static lck_grp_t *cl_mtx_grp; static lck_attr_t *cl_mtx_attr; static lck_grp_attr_t *cl_mtx_grp_attr; -static lck_mtx_t *cl_mtxp; static lck_mtx_t *cl_transaction_mtxp; @@ -157,6 +162,8 @@ static int cluster_iodone(buf_t bp, void *callback_arg); static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags); static int cluster_hard_throttle_on(vnode_t vp, uint32_t); +static void cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name); + static void cluster_syncup(vnode_t vp, off_t newEOF, int (*)(buf_t, void *), void *callback_arg); static void cluster_read_upl_release(upl_t upl, int start_pg, int last_pg, int take_reference); @@ -183,10 +190,10 @@ static void cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t files static int cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg); -static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int (*)(buf_t, void *), void *callback_arg); +static int cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *), void *callback_arg); static void sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg); -static void sparse_cluster_push(void **cmapp, vnode_t vp, off_t EOF, int push_flag, int (*)(buf_t, void *), void *callback_arg); +static void sparse_cluster_push(void **cmapp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*)(buf_t, void *), void *callback_arg); static void sparse_cluster_add(void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF, int (*)(buf_t, void *), void *callback_arg); static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp); @@ -203,12 +210,20 @@ static kern_return_t vfs_drt_control(void **cmapp, int op_type); #define MAX_VECTS 16 #define MIN_DIRECT_WRITE_SIZE (4 * PAGE_SIZE) +#define WRITE_THROTTLE 6 +#define WRITE_THROTTLE_SSD 2 +#define WRITE_BEHIND 1 +#define WRITE_BEHIND_SSD 1 +#define PREFETCH 3 +#define PREFETCH_SSD 2 + #define IO_SCALE(vp, base) (vp->v_mount->mnt_ioscale * base) #define MAX_CLUSTER_SIZE(vp) (cluster_max_io_size(vp->v_mount, CL_WRITE)) -#define MAX_PREFETCH(vp, io_size) (io_size * IO_SCALE(vp, 3)) +#define MAX_PREFETCH(vp, size, is_ssd) (size * IO_SCALE(vp, (is_ssd && !ignore_is_ssd) ? 
PREFETCH_SSD : PREFETCH)) - -int speculative_reads_disabled = 0; +int ignore_is_ssd = 0; +int speculative_reads_disabled = 0; +uint32_t speculative_prefetch_max = (MAX_UPL_SIZE * 3); /* * throttle the number of async writes that @@ -235,15 +250,6 @@ cluster_init(void) { */ cl_mtx_attr = lck_attr_alloc_init(); - /* - * allocate and initialize mutex's used to protect updates and waits - * on the cluster_io context - */ - cl_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr); - - if (cl_mtxp == NULL) - panic("cluster_init: failed to allocate cl_mtxp"); - cl_transaction_mtxp = lck_mtx_alloc_init(cl_mtx_grp, cl_mtx_attr); if (cl_transaction_mtxp == NULL) @@ -412,7 +418,7 @@ cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *c if (wbp->cl_number) { lck_mtx_lock(&wbp->cl_lockw); - cluster_try_push(wbp, vp, newEOF, PUSH_ALL | PUSH_SYNC, callback, callback_arg); + cluster_try_push(wbp, vp, newEOF, PUSH_ALL | PUSH_SYNC, 0, callback, callback_arg); lck_mtx_unlock(&wbp->cl_lockw); } @@ -450,6 +456,27 @@ cluster_hard_throttle_on(vnode_t vp, uint32_t hard_throttle) } +static void +cluster_iostate_wait(struct clios *iostate, u_int target, const char *wait_name) +{ + + lck_mtx_lock(&iostate->io_mtxp); + + while ((iostate->io_issued - iostate->io_completed) > target) { + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, + iostate->io_issued, iostate->io_completed, target, 0, 0); + + iostate->io_wanted = 1; + msleep((caddr_t)&iostate->io_wanted, &iostate->io_mtxp, PRIBIO + 1, wait_name, NULL); + + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, + iostate->io_issued, iostate->io_completed, target, 0, 0); + } + lck_mtx_unlock(&iostate->io_mtxp); +} + + static int cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_flags) { @@ -457,7 +484,7 @@ cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_fla int page_in = 0; int page_out = 0; - if (io_flags & B_PHYS) + if ((io_flags & (B_PHYS | B_CACHE)) == (B_PHYS | B_CACHE)) /* * direct write of any flavor, or a direct read that wasn't aligned */ @@ -517,33 +544,44 @@ cluster_iodone(buf_t bp, void *callback_arg) cbp_head, bp->b_lblkno, bp->b_bcount, bp->b_flags, 0); if (cbp_head->b_trans_next || !(cbp_head->b_flags & B_EOT)) { + boolean_t need_wakeup = FALSE; lck_mtx_lock_spin(cl_transaction_mtxp); bp->b_flags |= B_TDONE; + if (bp->b_flags & B_TWANTED) { + CLR(bp->b_flags, B_TWANTED); + need_wakeup = TRUE; + } for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) { - /* + /* * all I/O requests that are part of this transaction * have to complete before we can process it */ - if ( !(cbp->b_flags & B_TDONE)) { + if ( !(cbp->b_flags & B_TDONE)) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, cbp_head, cbp, cbp->b_bcount, cbp->b_flags, 0); lck_mtx_unlock(cl_transaction_mtxp); + + if (need_wakeup == TRUE) + wakeup(bp); + return 0; } if (cbp->b_flags & B_EOT) - transaction_complete = TRUE; + transaction_complete = TRUE; } lck_mtx_unlock(cl_transaction_mtxp); + if (need_wakeup == TRUE) + wakeup(bp); + if (transaction_complete == FALSE) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 20)) | DBG_FUNC_END, cbp_head, 0, 0, 0, 0); - return 0; } } @@ -609,7 +647,7 @@ cluster_iodone(buf_t bp, void *callback_arg) * someone has issued multiple I/Os asynchrounsly * and is waiting for them to complete (streaming) */ - lck_mtx_lock_spin(cl_mtxp); + 
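The B_TDONE/B_TWANTED handling above replaces the old BL_WANTED/buf_list_lock scheme: the completing buffer is marked done under cl_transaction_mtxp, the whole chain is scanned, and any waiter is woken only after the spin lock has been dropped. A compilable sketch of the chain test, with illustrative flag values:

    #include <stdbool.h>
    #include <stddef.h>

    #define B_TDONE   0x01   /* illustrative values, not the kernel's */
    #define B_TWANTED 0x02
    #define B_EOT     0x04   /* end of transaction */

    struct buf_sketch {
        int flags;
        struct buf_sketch *b_trans_next;
    };

    /* caller holds the transaction lock (cl_transaction_mtxp):
     * the transaction may be processed only when every buffer in
     * the chain carries B_TDONE and the chain is closed by B_EOT */
    static bool transaction_complete(const struct buf_sketch *head)
    {
        for (const struct buf_sketch *cbp = head; cbp; cbp = cbp->b_trans_next) {
            if (!(cbp->flags & B_TDONE))
                return false;       /* a sibling I/O is still pending */
            if (cbp->flags & B_EOT)
                return true;        /* reached the end: all done */
        }
        return false;               /* chain not yet terminated */
    }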
lck_mtx_lock_spin(&iostate->io_mtxp); if (error && iostate->io_error == 0) iostate->io_error = error; @@ -624,7 +662,7 @@ cluster_iodone(buf_t bp, void *callback_arg) iostate->io_wanted = 0; need_wakeup = 1; } - lck_mtx_unlock(cl_mtxp); + lck_mtx_unlock(&iostate->io_mtxp); if (need_wakeup) wakeup((caddr_t)&iostate->io_wanted); @@ -649,7 +687,7 @@ cluster_iodone(buf_t bp, void *callback_arg) ubc_upl_commit_range(upl, upl_offset - pg_offset, commit_size, upl_flags); } } - if ((b_flags & B_NEED_IODONE) && real_bp) { + if (real_bp) { if (error) { real_bp->b_flags |= B_ERROR; real_bp->b_error = error; @@ -735,27 +773,36 @@ cluster_wait_IO(buf_t cbp_head, int async) /* * async callback completion will not normally * generate a wakeup upon I/O completion... - * by setting BL_WANTED, we will force a wakeup + * by setting B_TWANTED, we will force a wakeup * to occur as any outstanding I/Os complete... - * I/Os already completed will have BL_CALLDONE already - * set and we won't block in buf_biowait_callback.. + * I/Os already completed will have B_TDONE already + * set and we won't cause us to block * note that we're actually waiting for the bp to have * completed the callback function... only then * can we safely take back ownership of the bp - * need the main buf mutex in order to safely - * update b_lflags */ - buf_list_lock(); + lck_mtx_lock_spin(cl_transaction_mtxp); for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) - cbp->b_lflags |= BL_WANTED; + cbp->b_flags |= B_TWANTED; - buf_list_unlock(); + lck_mtx_unlock(cl_transaction_mtxp); } for (cbp = cbp_head; cbp; cbp = cbp->b_trans_next) { - if (async) - buf_biowait_callback(cbp); - else + + if (async) { + while (!ISSET(cbp->b_flags, B_TDONE)) { + + lck_mtx_lock_spin(cl_transaction_mtxp); + + if (!ISSET(cbp->b_flags, B_TDONE)) { + DTRACE_IO1(wait__start, buf_t, cbp); + (void) msleep(cbp, cl_transaction_mtxp, PDROP | (PRIBIO+1), "cluster_wait_IO", NULL); + DTRACE_IO1(wait__done, buf_t, cbp); + } else + lck_mtx_unlock(cl_transaction_mtxp); + } + } else buf_biowait(cbp); } } @@ -781,7 +828,7 @@ cluster_complete_transaction(buf_t *cbp_head, void *callback_arg, int *retval, i * so that cluster_iodone sees the transaction as completed */ for (cbp = *cbp_head; cbp; cbp = cbp->b_trans_next) - cbp->b_flags |= B_TDONE; + cbp->b_flags |= B_TDONE; error = cluster_iodone(*cbp_head, callback_arg); @@ -910,10 +957,9 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no else { u_int max_cluster; u_int max_cluster_size; - u_int max_prefetch; - + u_int scale; + max_cluster_size = MAX_CLUSTER_SIZE(vp); - max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ)); if (max_iosize > max_cluster_size) max_cluster = max_cluster_size; @@ -922,8 +968,16 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no if (size < max_cluster) max_cluster = size; + + if ((vp->v_mount->mnt_kern_flag & MNTK_SSD) && !ignore_is_ssd) + scale = WRITE_THROTTLE_SSD; + else + scale = WRITE_THROTTLE; - async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), (max_prefetch / max_cluster) - 1); + if (flags & CL_CLOSE) + scale += MAX_CLUSTERS; + + async_throttle = min(IO_SCALE(vp, VNODE_ASYNC_THROTTLE), ((scale * max_cluster_size) / max_cluster) - 1); } } } @@ -935,12 +989,14 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no io_flags |= B_IOSTREAMING; if (flags & CL_COMMIT) io_flags |= B_COMMIT_UPL; - if (flags & CL_PRESERVE) + if (flags & CL_DIRECT_IO) io_flags |= B_PHYS; - if (flags & 
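The async write throttle is now derived from a device-class scale rather than from the read prefetch window, and CL_CLOSE (the close-time flush hint added in this patch) widens it by MAX_CLUSTERS. Worked numbers under assumed values (1 MB device cluster cap, 256 KB current cluster, IO_SCALE limit of 32):

    #include <stdio.h>

    #define WRITE_THROTTLE      6   /* rotating media */
    #define WRITE_THROTTLE_SSD  2

    static unsigned min_u(unsigned a, unsigned b) { return a < b ? a : b; }

    int main(void)
    {
        unsigned max_cluster_size = 1024 * 1024; /* MAX_CLUSTER_SIZE(vp) stand-in */
        unsigned max_cluster      = 256 * 1024;  /* this request's cluster */
        unsigned io_scale_limit   = 32;          /* IO_SCALE(vp, VNODE_ASYNC_THROTTLE) */
        int is_ssd = 0;

        unsigned scale = is_ssd ? WRITE_THROTTLE_SSD : WRITE_THROTTLE;
        /* scale += MAX_CLUSTERS here when CL_CLOSE is set */

        unsigned async_throttle =
            min_u(io_scale_limit, (scale * max_cluster_size) / max_cluster - 1);

        printf("async_throttle = %u\n", async_throttle);   /* 23 */
        return 0;
    }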
CL_KEEPCACHED) - io_flags |= B_CACHE; + if (flags & (CL_PRESERVE | CL_KEEPCACHED)) + io_flags |= B_CACHE; if (flags & CL_PASSIVE) io_flags |= B_PASSIVE; + if (flags & CL_ENCRYPTED) + io_flags |= B_ENCRYPTED_IO; if (vp->v_flag & VSYSTEM) io_flags |= B_META; @@ -997,7 +1053,7 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no off_t e_offset; int pageout_flags; - if(upl_get_internal_vectorupl(upl)) + if (upl_get_internal_vectorupl(upl)) panic("Vector UPLs should not take this code-path\n"); /* * we're writing into a 'hole' @@ -1104,7 +1160,6 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no } if (vnode_pageout(vp, upl, trunc_page(upl_offset), trunc_page_64(f_offset), PAGE_SIZE, pageout_flags, NULL) != PAGER_SUCCESS) { error = EINVAL; - break; } e_offset = round_page_64(f_offset + 1); io_size = e_offset - f_offset; @@ -1133,6 +1188,11 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no */ size = 0; } + if (error) { + if (size == 0) + flags &= ~CL_COMMIT; + break; + } continue; } lblkno = (daddr64_t)(f_offset / PAGE_SIZE_64); @@ -1370,10 +1430,8 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no cbp_head = cbp; cbp_tail = cbp; - if ( (cbp_head->b_real_bp = real_bp) ) { - cbp_head->b_flags |= B_NEED_IODONE; + if ( (cbp_head->b_real_bp = real_bp) ) real_bp = (buf_t)NULL; - } } *(buf_t *)(&cbp->b_trans_head) = cbp_head; @@ -1479,7 +1537,7 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no * since we never really issued the io * just go ahead and adjust it back */ - lck_mtx_lock_spin(cl_mtxp); + lck_mtx_lock_spin(&iostate->io_mtxp); if (iostate->io_error == 0) iostate->io_error = error; @@ -1493,7 +1551,7 @@ cluster_io(vnode_t vp, upl_t upl, vm_offset_t upl_offset, off_t f_offset, int no iostate->io_wanted = 0; need_wakeup = 1; } - lck_mtx_unlock(cl_mtxp); + lck_mtx_unlock(&iostate->io_mtxp); if (need_wakeup) wakeup((caddr_t)&iostate->io_wanted); @@ -1604,8 +1662,16 @@ cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct return; } - max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ)); + max_prefetch = MAX_PREFETCH(vp, cluster_max_io_size(vp->v_mount, CL_READ), (vp->v_mount->mnt_kern_flag & MNTK_SSD)); + if ((max_prefetch / PAGE_SIZE) > speculative_prefetch_max) + max_prefetch = (speculative_prefetch_max * PAGE_SIZE); + + if (max_prefetch <= PAGE_SIZE) { + KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 48)) | DBG_FUNC_END, + rap->cl_ralen, (int)rap->cl_maxra, (int)rap->cl_lastr, 6, 0); + return; + } if (extent->e_addr < rap->cl_maxra) { if ((rap->cl_maxra - extent->e_addr) > ((max_prefetch / PAGE_SIZE) / 4)) { @@ -1667,18 +1733,7 @@ cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offs off_t max_size; int local_flags; - if (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) - /* - * if we know we're issuing this I/O to a virtual device (i.e. disk image) - * then we don't want to enforce this throttle... if we do, we can - * potentially deadlock since we're stalling the pageout thread at a time - * when the disk image might need additional memory (which won't be available - * if the pageout thread can't run)... 
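cluster_read_ahead() now clamps the device-scaled prefetch window with the speculative_prefetch_max tunable and returns early once the window collapses to a single page. A worked example with assumed sizes (the patch's real default is MAX_UPL_SIZE * 3 pages):

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SZ       4096u
    #define PREFETCH      3     /* rotating-media multiplier */
    #define PREFETCH_SSD  2

    int main(void)
    {
        uint32_t max_io_size = 1024 * 1024;  /* cluster_max_io_size() stand-in */
        uint32_t ioscale = 1;                /* mnt_ioscale stand-in */
        int is_ssd = 1, ignore_is_ssd = 0;
        uint32_t speculative_prefetch_max = 256;   /* pages; sysctl-tunable */

        uint32_t max_prefetch = max_io_size *
            (ioscale * ((is_ssd && !ignore_is_ssd) ? PREFETCH_SSD : PREFETCH));

        if (max_prefetch / PAGE_SZ > speculative_prefetch_max)
            max_prefetch = speculative_prefetch_max * PAGE_SZ;

        if (max_prefetch <= PAGE_SZ)
            puts("read-ahead disabled");     /* the new early return */
        else
            printf("prefetch window: %u bytes\n", max_prefetch);  /* 1 MB */
        return 0;
    }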
instead we'll just depend on the throttle - * that the pageout thread now has in place to deal with external files - */ - local_flags = CL_PAGEOUT; - else - local_flags = CL_PAGEOUT | CL_THROTTLE; + local_flags = CL_PAGEOUT | CL_THROTTLE; if ((flags & UPL_IOSYNC) == 0) local_flags |= CL_ASYNC; @@ -1686,6 +1741,8 @@ cluster_pageout_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offs local_flags |= CL_COMMIT; if ((flags & UPL_KEEPCACHED)) local_flags |= CL_KEEPCACHED; + if (flags & UPL_PAGING_ENCRYPTED) + local_flags |= CL_ENCRYPTED; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 52)) | DBG_FUNC_NONE, @@ -1762,6 +1819,8 @@ cluster_pagein_ext(vnode_t vp, upl_t upl, upl_offset_t upl_offset, off_t f_offse local_flags |= CL_COMMIT; if (flags & UPL_IOSTREAMING) local_flags |= CL_IOSTREAMING; + if (flags & UPL_PAGING_ENCRYPTED) + local_flags |= CL_ENCRYPTED; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 56)) | DBG_FUNC_NONE, @@ -1869,12 +1928,12 @@ cluster_write_ext(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, off_t } /* * do a write through the cache if one of the following is true.... - * NOCACHE is not true and + * NOCACHE is not true or NODIRECT is true * the uio request doesn't target USERSPACE * otherwise, find out if we want the direct or contig variant for * the first vector in the uio request */ - if ( (flags & IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ) + if ( ((flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE) && UIO_SEG_IS_USER_SPACE(uio->uio_segflg) ) retval = cluster_io_type(uio, &write_type, &write_length, MIN_DIRECT_WRITE_SIZE); if ( (flags & (IO_TAILZEROFILL | IO_HEADZEROFILL)) && write_type == IO_DIRECT) @@ -2027,6 +2086,8 @@ cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t newEOF, in iostate.io_error = 0; iostate.io_wanted = 0; + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; @@ -2207,23 +2268,9 @@ next_dwrite: * if there are already too many outstanding writes * wait until some complete before issuing the next */ - if (iostate.io_issued > iostate.io_completed) { - - lck_mtx_lock(cl_mtxp); - - while ((iostate.io_issued - iostate.io_completed) > (max_upl_size * IO_SCALE(vp, 2))) { - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, max_upl_size * IO_SCALE(vp, 2), 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_direct", NULL); + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, max_upl_size * IO_SCALE(vp, 2), "cluster_write_direct"); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, max_upl_size * IO_SCALE(vp, 2), 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) { /* * one of the earlier writes we issued ran into a hard error @@ -2303,7 +2350,7 @@ next_dwrite: wait_for_dwrites: - if(retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) { + if (retval == 0 && iostate.io_error == 0 && useVectorUPL && vector_upl_index) { retval = vector_cluster_io(vp, vector_upl, vector_upl_offset, v_upl_uio_offset, vector_upl_iosize, io_flag, (buf_t)NULL, &iostate, callback, callback_arg); reset_vector_run_state(); } @@ -2313,23 +2360,13 @@ wait_for_dwrites: * make sure all async writes issued as part of this stream * have completed before we return */ - lck_mtx_lock(cl_mtxp); - - while (iostate.io_issued != 
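The rewritten predicate in cluster_write_ext() is easy to misread: direct writes are attempted only when IO_NOCACHE is set and IO_NODIRECT is clear, so a filesystem can force the cached path even for uncached writes. A minimal check, with illustrative flag values:

    #include <assert.h>

    #define IO_NOCACHE  0x1    /* illustrative values */
    #define IO_NODIRECT 0x2

    static int wants_direct(int flags)
    {
        return (flags & (IO_NOCACHE | IO_NODIRECT)) == IO_NOCACHE;
    }

    int main(void)
    {
        assert( wants_direct(IO_NOCACHE));
        assert(!wants_direct(IO_NOCACHE | IO_NODIRECT)); /* NODIRECT wins */
        assert(!wants_direct(0));
        return 0;
    }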
iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_direct", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); + cluster_iostate_wait(&iostate, 0, "cluster_write_direct"); } if (iostate.io_error) retval = iostate.io_error; + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + if (io_req_size && retval == 0) { /* * we couldn't handle the tail of this request in DIRECT mode @@ -2392,6 +2429,8 @@ cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF, int *write_type, iostate.io_error = 0; iostate.io_wanted = 0; + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + next_cwrite: io_size = *write_length; @@ -2480,22 +2519,9 @@ next_cwrite: * if there are already too many outstanding writes * wait until some have completed before issuing the next */ - if (iostate.io_issued > iostate.io_completed) { - lck_mtx_lock(cl_mtxp); - - while ((iostate.io_issued - iostate.io_completed) > (MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2))) { + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_write_contig"); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_contig", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) { /* * one of the earlier writes we issued ran into a hard error @@ -2539,25 +2565,14 @@ wait_for_cwrites: * make sure all async writes that are part of this stream * have completed before we proceed */ - if (iostate.io_issued > iostate.io_completed) { - - lck_mtx_lock(cl_mtxp); - - while (iostate.io_issued != iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_write_contig", NULL); + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, 0, "cluster_write_contig"); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) error = iostate.io_error; + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + if (error == 0 && tail_size) error = cluster_align_phys_io(vp, uio, src_paddr, tail_size, 0, callback, callback_arg); @@ -2632,6 +2647,9 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old off_t zero_off; long long zero_cnt1; off_t zero_off1; + off_t write_off = 0; + int write_cnt = 0; + boolean_t first_pass = FALSE; struct cl_extent cl; struct cl_writebehind *wbp; int bflag; @@ -2713,7 +2731,16 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old retval, 0, 0, 0, 0); return (0); } - + if (uio) { + write_off = uio->uio_offset; + write_cnt = uio_resid(uio); + /* + * delay updating the sequential write info + * in the control block until we've obtained + * the lock for it + */ + first_pass = TRUE; + } while ((total_size = 
(io_resid + zero_cnt + zero_cnt1)) && retval == 0) { /* * for this iteration of the loop, figure out where our starting point is @@ -3008,7 +3035,7 @@ check_cluster: */ wbp->cl_number = 0; - sparse_cluster_push(&(wbp->cl_scmap), vp, newEOF, PUSH_ALL, callback, callback_arg); + sparse_cluster_push(&(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg); /* * no clusters of either type present at this point * so just go directly to start_new_cluster since @@ -3017,7 +3044,17 @@ check_cluster: * to avoid the deadlock with sparse_cluster_push */ goto start_new_cluster; - } + } + if (first_pass) { + if (write_off == wbp->cl_last_write) + wbp->cl_seq_written += write_cnt; + else + wbp->cl_seq_written = write_cnt; + + wbp->cl_last_write = write_off + write_cnt; + + first_pass = FALSE; + } if (wbp->cl_number == 0) /* * no clusters currently present @@ -3132,14 +3169,27 @@ check_cluster: */ goto delay_io; - if (wbp->cl_number < MAX_CLUSTERS) + if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) && + wbp->cl_number == MAX_CLUSTERS && + wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) { + uint32_t n; + + if (vp->v_mount->mnt_kern_flag & MNTK_SSD) + n = WRITE_BEHIND_SSD; + else + n = WRITE_BEHIND; + + while (n--) + cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg); + } + if (wbp->cl_number < MAX_CLUSTERS) { /* * we didn't find an existing cluster to * merge into, but there's room to start * a new one */ goto start_new_cluster; - + } /* * no exisitng cluster to merge with and no * room to start a new one... we'll try @@ -3157,7 +3207,7 @@ check_cluster: */ if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) { - ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, callback, callback_arg); + ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg); } /* @@ -3176,18 +3226,6 @@ check_cluster: continue; } - /* - * we pushed one cluster successfully, so we must be sequentially writing this file - * otherwise, we would have failed and fallen into the sparse cluster support - * so let's take the opportunity to push out additional clusters... - * this will give us better I/O locality if we're in a copy loop - * (i.e. 
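The cl_last_write/cl_seq_written bookkeeping introduced in cluster_write_copy() feeds the new write-behind trigger: once every cluster slot is occupied and the file has been written strictly sequentially for MAX_CLUSTERS worth of cluster-sized data, WRITE_BEHIND (or WRITE_BEHIND_SSD) clusters are pushed early instead of dropping into the sparse-cluster machinery. A sketch with assumed constants (slot count, pages per cluster, and page size are illustrative):

    #include <stdio.h>

    #define MAX_CLUSTERS         8      /* assumed slot count */
    #define MAX_CLUSTER_PGCOUNT  32     /* assumed pages per cluster */
    #define PAGE_SZ              4096

    struct wb_sketch {
        long long cl_last_write;   /* end offset of the previous write */
        long long cl_seq_written;  /* length of the current sequential run */
        int       cl_number;       /* occupied cluster slots */
    };

    static void note_write(struct wb_sketch *wbp, long long off, long long cnt)
    {
        if (off == wbp->cl_last_write)
            wbp->cl_seq_written += cnt;   /* extends the run */
        else
            wbp->cl_seq_written = cnt;    /* seek: run restarts */
        wbp->cl_last_write = off + cnt;
    }

    static int should_write_behind(const struct wb_sketch *wbp)
    {
        return wbp->cl_number == MAX_CLUSTERS &&
               wbp->cl_seq_written >=
                   (long long)MAX_CLUSTERS * (MAX_CLUSTER_PGCOUNT * PAGE_SZ);
    }

    int main(void)
    {
        struct wb_sketch wb = { 0, 0, MAX_CLUSTERS };

        for (long long off = 0; off < 2 * 1024 * 1024; off += 65536)
            note_write(&wb, off, 65536);   /* 2 MB written back-to-back */

        printf("write-behind: %s\n", should_write_behind(&wb) ? "yes" : "no");
        return 0;
    }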
we won't jump back and forth between the read and write points - */ - if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) { - while (wbp->cl_number) - cluster_try_push(wbp, vp, newEOF, 0, callback, callback_arg); - } - start_new_cluster: wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr; wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr; @@ -3342,19 +3380,25 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file struct cl_extent extent; int bflag; int take_reference = 1; +#if CONFIG_EMBEDDED struct uthread *ut; +#endif /* CONFIG_EMBEDDED */ int policy = IOPOL_DEFAULT; - + boolean_t iolock_inited = FALSE; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_START, (int)uio->uio_offset, io_req_size, (int)filesize, flags, 0); +#if !CONFIG_EMBEDDED + policy = proc_get_task_selfdiskacc(); +#else /* !CONFIG_EMBEDDED */ policy = current_proc()->p_iopol_disk; ut = get_bsdthread_info(current_thread()); if (ut->uu_iopol_disk != IOPOL_DEFAULT) policy = ut->uu_iopol_disk; +#endif /* !CONFIG_EMBEDDED */ if (policy == IOPOL_THROTTLE || (flags & IO_NOCACHE)) take_reference = 0; @@ -3365,7 +3409,7 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file bflag = 0; max_io_size = cluster_max_io_size(vp->v_mount, CL_READ); - max_prefetch = MAX_PREFETCH(vp, max_io_size); + max_prefetch = MAX_PREFETCH(vp, max_io_size, (vp->v_mount->mnt_kern_flag & MNTK_SSD)); max_rd_size = max_prefetch; last_request_offset = uio->uio_offset + io_req_size; @@ -3464,7 +3508,7 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file io_requested = io_resid; - retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, last_ioread_offset == 0 ? take_reference : 0); + retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_resid, 0, take_reference); xsize = io_requested - io_resid; @@ -3576,6 +3620,11 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file * we may have to clip the size of it to keep from reading past * the end of the last physical block associated with the file */ + if (iolock_inited == FALSE) { + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + + iolock_inited = TRUE; + } upl_offset = start_pg * PAGE_SIZE; io_size = (last_pg - start_pg) * PAGE_SIZE; @@ -3588,6 +3637,18 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file error = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, CL_READ | CL_ASYNC | bflag, (buf_t)NULL, &iostate, callback, callback_arg); + + if (rap) { + if (extent.e_addr < rap->cl_maxra) { + /* + * we've just issued a read for a block that should have been + * in the cache courtesy of the read-ahead engine... 
something + * has gone wrong with the pipeline, so reset the read-ahead + * logic which will cause us to restart from scratch + */ + rap->cl_maxra = 0; + } + } } if (error == 0) { /* @@ -3666,22 +3727,9 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file rap->cl_lastr = extent.e_addr; } } - if (iostate.io_issued > iostate.io_completed) { + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); - lck_mtx_lock(cl_mtxp); - - while (iostate.io_issued != iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_copy", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) error = iostate.io_error; else { @@ -3693,6 +3741,9 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file io_req_size -= (val_size - io_requested); } + } else { + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); } if (start_pg < last_pg) { /* @@ -3773,6 +3824,20 @@ cluster_read_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t file } } } + if (iolock_inited == TRUE) { + if (iostate.io_issued > iostate.io_completed) { + /* + * cluster_io returned an error after it + * had already issued some I/O. we need + * to wait for that I/O to complete before + * we can destroy the iostate mutex... + * 'retval' already contains the early error + * so no need to pick it up from iostate.io_error + */ + cluster_iostate_wait(&iostate, 0, "cluster_read_copy"); + } + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + } if (rap != NULL) { KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 32)) | DBG_FUNC_END, (int)uio->uio_offset, io_req_size, rap->cl_lastr, retval, 0); @@ -3819,6 +3884,7 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, u_int32_t max_upl_size; u_int32_t max_rd_size; u_int32_t max_rd_ahead; + boolean_t strict_uncached_IO = FALSE; u_int32_t vector_upl_iosize = 0; int issueVectorUPL = 0,useVectorUPL = (uio->uio_iovcnt > 1); @@ -3835,6 +3901,7 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, max_rd_ahead = max_rd_size * IO_SCALE(vp, 2); io_flag = CL_COMMIT | CL_READ | CL_ASYNC | CL_NOZERO | CL_DIRECT_IO; + if (flags & IO_PASSIVE) io_flag |= CL_PASSIVE; @@ -3843,6 +3910,8 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, iostate.io_error = 0; iostate.io_wanted = 0; + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + devblocksize = (u_int32_t)vp->v_mount->mnt_devblocksize; mem_alignment_mask = (u_int32_t)vp->v_mount->mnt_alignmentmask; @@ -3862,6 +3931,9 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, */ devblocksize = PAGE_SIZE; } + + strict_uncached_IO = ubc_strict_uncached_IO(vp); + next_dread: io_req_size = *read_length; iov_base = uio_curriovbase(uio); @@ -3913,8 +3985,9 @@ next_dread: * cluster_copy_ubc_data returns the resid * in io_size */ - retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0); - + if (strict_uncached_IO == FALSE) { + retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0); + } /* * calculate the number of bytes actually copied * starting size - residual @@ -3991,21 +4064,26 @@ 
next_dread: */ goto wait_for_dreads; } - if ((xsize = io_size) > max_rd_size) - xsize = max_rd_size; - io_size = 0; + if (strict_uncached_IO == FALSE) { - ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size); + if ((xsize = io_size) > max_rd_size) + xsize = max_rd_size; - if (io_size == 0) { - /* - * a page must have just come into the cache - * since the first page in this range is no - * longer absent, go back and re-evaluate - */ - continue; + io_size = 0; + + ubc_range_op(vp, uio->uio_offset, uio->uio_offset + xsize, UPL_ROP_ABSENT, (int *)&io_size); + + if (io_size == 0) { + /* + * a page must have just come into the cache + * since the first page in this range is no + * longer absent, go back and re-evaluate + */ + continue; + } } + iov_base = uio_curriovbase(uio); upl_offset = (vm_offset_t)((u_int32_t)iov_base & PAGE_MASK); @@ -4097,22 +4175,9 @@ next_dread: * if there are already too many outstanding reads * wait until some have completed before issuing the next read */ - if (iostate.io_issued > iostate.io_completed) { + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, max_rd_ahead, "cluster_read_direct"); - lck_mtx_lock(cl_mtxp); - - while ((iostate.io_issued - iostate.io_completed) > max_rd_ahead) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, max_rd_ahead, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_direct", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, max_rd_ahead, 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) { /* * one of the earlier reads we issued ran into a hard error @@ -4191,25 +4256,14 @@ wait_for_dreads: * make sure all async reads that are part of this stream * have completed before we return */ - if (iostate.io_issued > iostate.io_completed) { + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, 0, "cluster_read_direct"); - lck_mtx_lock(cl_mtxp); - - while (iostate.io_issued != iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_direct", NULL); - - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) retval = iostate.io_error; + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + if (io_req_size && retval == 0) { /* * we couldn't handle the tail of this request in DIRECT mode @@ -4273,6 +4327,8 @@ cluster_read_contig(vnode_t vp, struct uio *uio, off_t filesize, int *read_type, iostate.io_error = 0; iostate.io_wanted = 0; + lck_mtx_init(&iostate.io_mtxp, cl_mtx_grp, cl_mtx_attr); + next_cread: io_size = *read_length; @@ -4370,21 +4426,9 @@ next_cread: * if there are already too many outstanding reads * wait until some have completed before issuing the next */ - if (iostate.io_issued > iostate.io_completed) { - lck_mtx_lock(cl_mtxp); - - while ((iostate.io_issued - iostate.io_completed) > (MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2))) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_contig", 
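ubc_strict_uncached_IO() lets a filesystem opt out of the two cache interactions that cluster_read_direct() normally performs: the leading copy from already-cached pages, and the UPL_ROP_ABSENT probe that trims the read to the genuinely missing range. A schematic of the gate (helper names are stand-ins, not kernel calls):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* stand-ins: bytes the cache can satisfy, and a raw read */
    static size_t ubc_copy(size_t want)  { return want / 2; }
    static size_t direct_io(size_t want) { return want; }

    static size_t read_chunk(bool strict_uncached, size_t want)
    {
        size_t done = 0;
        if (!strict_uncached)
            done += ubc_copy(want);       /* cluster_copy_ubc_data_internal() */
        done += direct_io(want - done);   /* CL_DIRECT_IO for the remainder */
        return done;
    }

    int main(void)
    {
        printf("%zu %zu\n", read_chunk(false, 4096), read_chunk(true, 4096));
        return 0;
    }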
NULL); + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), "cluster_read_contig"); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, MAX_IO_CONTIG_SIZE * IO_SCALE(vp, 2), 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) { /* * one of the earlier reads we issued ran into a hard error @@ -4425,25 +4469,14 @@ wait_for_creads: * make sure all async reads that are part of this stream * have completed before we proceed */ - if (iostate.io_issued > iostate.io_completed) { - - lck_mtx_lock(cl_mtxp); - - while (iostate.io_issued != iostate.io_completed) { - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_START, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - - iostate.io_wanted = 1; - msleep((caddr_t)&iostate.io_wanted, cl_mtxp, PRIBIO + 1, "cluster_read_contig", NULL); + if (iostate.io_issued > iostate.io_completed) + cluster_iostate_wait(&iostate, 0, "cluster_read_contig"); - KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 95)) | DBG_FUNC_END, - iostate.io_issued, iostate.io_completed, 0, 0, 0); - } - lck_mtx_unlock(cl_mtxp); - } if (iostate.io_error) error = iostate.io_error; + lck_mtx_destroy(&iostate.io_mtxp, cl_mtx_grp); + if (error == 0 && tail_size) error = cluster_align_phys_io(vp, uio, dst_paddr, tail_size, CL_READ, callback, callback_arg); @@ -4787,7 +4820,7 @@ cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *ca lck_mtx_unlock(&wbp->cl_lockw); - sparse_cluster_push(&scmap, vp, ubc_getsize(vp), PUSH_ALL | IO_PASSIVE, callback, callback_arg); + sparse_cluster_push(&scmap, vp, ubc_getsize(vp), PUSH_ALL, flags | IO_PASSIVE, callback, callback_arg); lck_mtx_lock(&wbp->cl_lockw); @@ -4796,11 +4829,11 @@ cluster_push_ext(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *ca if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0) wakeup((caddr_t)&wbp->cl_sparse_pushes); } else { - sparse_cluster_push(&(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL | IO_PASSIVE, callback, callback_arg); + sparse_cluster_push(&(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags | IO_PASSIVE, callback, callback_arg); } retval = 1; } else { - retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL | IO_PASSIVE, callback, callback_arg); + retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags | IO_PASSIVE, callback, callback_arg); } lck_mtx_unlock(&wbp->cl_lockw); @@ -4861,7 +4894,7 @@ cluster_release(struct ubc_info *ubc) static int -cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int (*callback)(buf_t, void *), void *callback_arg) +cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg) { int cl_index; int cl_index1; @@ -4944,15 +4977,15 @@ cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_fla int flags; struct cl_extent cl; + flags = io_flags & (IO_PASSIVE|IO_CLOSE); + /* * try to push each cluster in turn... 
*/ if (l_clusters[cl_index].io_flags & CLW_IONOCACHE) - flags = IO_NOCACHE; - else - flags = 0; + flags |= IO_NOCACHE; - if ((l_clusters[cl_index].io_flags & CLW_IOPASSIVE) || (push_flag & IO_PASSIVE)) + if (l_clusters[cl_index].io_flags & CLW_IOPASSIVE) flags |= IO_PASSIVE; if (push_flag & PUSH_SYNC) @@ -5057,9 +5090,9 @@ cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*c kern_return_t kret; if (flags & IO_PASSIVE) - bflag = CL_PASSIVE; + bflag = CL_PASSIVE; else - bflag = 0; + bflag = 0; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_START, (int)cl->b_addr, (int)cl->e_addr, (int)EOF, flags, 0); @@ -5186,6 +5219,9 @@ cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*c if ( !(flags & IO_SYNC)) io_flags |= CL_ASYNC; + if (flags & IO_CLOSE) + io_flags |= CL_CLOSE; + retval = cluster_io(vp, upl, upl_offset, upl_f_offset + upl_offset, io_size, io_flags, (buf_t)NULL, (struct clios *)NULL, callback, callback_arg); @@ -5237,7 +5273,7 @@ sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*c * from the write-behind context (the cluster_push case), the wb lock is not held */ static void -sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int (*callback)(buf_t, void *), void *callback_arg) +sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg) { struct cl_extent cl; off_t offset; @@ -5255,7 +5291,7 @@ sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int (*ca cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64); cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64); - cluster_push_now(vp, &cl, EOF, push_flag & IO_PASSIVE, callback, callback_arg); + cluster_push_now(vp, &cl, EOF, io_flags & (IO_PASSIVE|IO_CLOSE), callback, callback_arg); if ( !(push_flag & PUSH_ALL) ) break; @@ -5285,7 +5321,7 @@ sparse_cluster_add(void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF, in * only a partial update was done * push out some pages and try again */ - sparse_cluster_push(scmap, vp, EOF, 0, callback, callback_arg); + sparse_cluster_push(scmap, vp, EOF, 0, 0, callback, callback_arg); offset += (new_dirty * PAGE_SIZE_64); length -= (new_dirty * PAGE_SIZE); @@ -5308,9 +5344,9 @@ cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t int bflag; if (flags & IO_PASSIVE) - bflag = CL_PASSIVE; + bflag = CL_PASSIVE; else - bflag = 0; + bflag = 0; upl_flags = UPL_SET_LITE; @@ -5479,7 +5515,7 @@ cluster_copy_ubc_data_internal(vnode_t vp, struct uio *uio, int *io_resid, int m io_size = *io_resid; KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 34)) | DBG_FUNC_START, - (int)uio->uio_offset, 0, io_size, 0, 0); + (int)uio->uio_offset, io_size, mark_dirty, take_reference, 0); control = ubc_getobject(vp, UBC_FLAGS_NONE); diff --git a/bsd/vfs/vfs_conf.c b/bsd/vfs/vfs_conf.c index 529129d9c..a4a962b66 100644 --- a/bsd/vfs/vfs_conf.c +++ b/bsd/vfs/vfs_conf.c @@ -86,8 +86,8 @@ struct mount *rootfs; struct vnode *rootvnode; #ifdef CONFIG_IMGSRC_ACCESS -struct vnode *imgsrc_rootvnode; -#endif /* IMGSRC_ACESS */ +struct vnode *imgsrc_rootvnodes[MAX_IMAGEBOOT_NESTING]; /* [0] -> source volume, [1] -> first disk image */ +#endif /* CONFIG_IMGSRC_ACCESS */ int (*mountroot)(void) = NULL; @@ -102,7 +102,6 @@ extern struct vfsops nfs_vfsops; extern int nfs_mountroot(void); extern struct vfsops afs_vfsops; extern struct vfsops null_vfsops; -extern struct vfsops union_vfsops; extern struct vfsops devfs_vfsops; /* 
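Propagation of the new close-time hint, as visible across these hunks: cluster_push_ext() passes the caller's flags down, cluster_try_push() keeps only IO_PASSIVE and IO_CLOSE, cluster_push_now() maps IO_CLOSE to CL_CLOSE, and cluster_io() then widens the async throttle for the closing file. A sketch with illustrative flag values:

    #define IO_PASSIVE   0x1    /* illustrative values */
    #define IO_CLOSE     0x2
    #define CL_PASSIVE   0x10
    #define CL_CLOSE     0x20
    #define MAX_CLUSTERS 8

    /* cluster_push_now(): translate caller flags to cluster_io flags */
    static int push_now_ioflags(int flags)
    {
        int io_flags = 0;
        if (flags & IO_PASSIVE) io_flags |= CL_PASSIVE;
        if (flags & IO_CLOSE)   io_flags |= CL_CLOSE;
        return io_flags;
    }

    /* cluster_io(): a close-time flush may keep more writes in flight */
    static int throttle_scale(int io_flags, int base_scale)
    {
        return (io_flags & CL_CLOSE) ? base_scale + MAX_CLUSTERS : base_scale;
    }

    int main(void)
    {
        return throttle_scale(push_now_ioflags(IO_CLOSE), 6) == 14 ? 0 : 1;
    }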
@@ -117,7 +116,7 @@ typedef int (*mountroot_t)(mount_t, vnode_t, vfs_context_t); static struct vfstable vfstbllist[] = { /* HFS/HFS+ Filesystem */ #if HFS - { &hfs_vfsops, "hfs", 17, 0, (MNT_LOCAL | MNT_DOVOLFS), hfs_mountroot, NULL, 0, 0, VFC_VFSLOCALARGS | VFC_VFSREADDIR_EXTENDED | VFS_THREAD_SAFE_FLAG | VFC_VFS64BITREADY | VFC_VFSVNOP_PAGEOUTV2, NULL, 0}, + { &hfs_vfsops, "hfs", 17, 0, (MNT_LOCAL | MNT_DOVOLFS), hfs_mountroot, NULL, 0, 0, VFC_VFSLOCALARGS | VFC_VFSREADDIR_EXTENDED | VFS_THREAD_SAFE_FLAG | VFC_VFS64BITREADY | VFC_VFSVNOP_PAGEOUTV2 | VFC_VFSVNOP_PAGEINV2, NULL, 0}, #endif /* Memory-based Filesystem */ @@ -140,18 +139,6 @@ static struct vfstable vfstbllist[] = { #endif #endif /* __LP64__ */ - /* Loopback (Minimal) Filesystem Layer */ -#ifndef __LP64__ -#if NULLFS - { &null_vfsops, "loopback", 9, 0, 0, NULL, NULL, 0, 0, VFC_VFSGENERICARGS , NULL, 0}, -#endif -#endif /* __LP64__ */ - - /* Union (translucent) Filesystem */ -#if UNION - { &union_vfsops, "unionfs", 15, 0, 0, NULL, NULL, 0, 0, VFC_VFSGENERICARGS | VFS_THREAD_SAFE_FLAG | VFC_VFS64BITREADY, NULL, 0}, -#endif - /* Device Filesystem */ #if DEVFS #if CONFIG_MACF @@ -214,7 +201,6 @@ extern struct vnodeopv_desc hfs_vnodeop_opv_desc; extern struct vnodeopv_desc hfs_std_vnodeop_opv_desc; extern struct vnodeopv_desc hfs_specop_opv_desc; extern struct vnodeopv_desc hfs_fifoop_opv_desc; -extern struct vnodeopv_desc union_vnodeop_opv_desc; extern struct vnodeopv_desc devfs_vnodeop_opv_desc; extern struct vnodeopv_desc devfs_spec_vnodeop_opv_desc; #if FDESC @@ -241,9 +227,6 @@ struct vnodeopv_desc *vfs_opv_descs[] = { &fifo_nfsv4nodeop_opv_desc, #endif #endif -#if NULLFS - &null_vnodeop_opv_desc, -#endif #if HFS &hfs_vnodeop_opv_desc, &hfs_std_vnodeop_opv_desc, @@ -252,9 +235,6 @@ struct vnodeopv_desc *vfs_opv_descs[] = { &hfs_fifoop_opv_desc, #endif #endif -#if UNION - &union_vnodeop_opv_desc, -#endif #if DEVFS &devfs_vnodeop_opv_desc, &devfs_spec_vnodeop_opv_desc, diff --git a/bsd/vfs/vfs_fsevents.c b/bsd/vfs/vfs_fsevents.c index e09f990dc..0132a60dd 100644 --- a/bsd/vfs/vfs_fsevents.c +++ b/bsd/vfs/vfs_fsevents.c @@ -206,6 +206,7 @@ fsevents_internal_init(void) // ever grow beyond what we initially filled it with zone_change(event_zone, Z_EXHAUST, TRUE); zone_change(event_zone, Z_COLLECT, FALSE); + zone_change(event_zone, Z_CALLERACCT, FALSE); } static void @@ -1821,6 +1822,11 @@ fmod_watch(fs_event_watcher *watcher, struct uio *uio) if (watcher->event_list[kfse->type] == FSE_REPORT && watcher_cares_about_dev(watcher, kfse->dev)) { + if (last_event_ptr == kfse) { + last_event_ptr = NULL; + last_event_type = -1; + last_coalesced_time = 0; + } error = copy_out_kfse(watcher, kfse, uio); if (error != 0) { // if an event won't fit or encountered an error while @@ -2667,18 +2673,24 @@ get_fse_info(struct vnode *vp, fse_info *fse, __unused vfs_context_t ctx) memset(fse, 0, sizeof(fse_info)); return -1; } - - fse->ino = (ino64_t)va.va_fileid; - fse->dev = (dev_t)va.va_fsid; - fse->mode = (int32_t)vnode_vttoif(vnode_vtype(vp)) | va.va_mode; - fse->uid = (uid_t)va.va_uid; - fse->gid = (gid_t)va.va_gid; + + return vnode_get_fse_info_from_vap(vp, fse, &va); +} + +int +vnode_get_fse_info_from_vap(vnode_t vp, fse_info *fse, struct vnode_attr *vap) +{ + fse->ino = (ino64_t)vap->va_fileid; + fse->dev = (dev_t)vap->va_fsid; + fse->mode = (int32_t)vnode_vttoif(vnode_vtype(vp)) | vap->va_mode; + fse->uid = (uid_t)vap->va_uid; + fse->gid = (gid_t)vap->va_gid; if (vp->v_flag & VISHARDLINK) { fse->mode |= FSE_MODE_HLINK; if (vp->v_type == VDIR) 
{ - fse->nlink = (uint64_t)va.va_dirlinkcount; + fse->nlink = (uint64_t)vap->va_dirlinkcount; } else { - fse->nlink = (uint64_t)va.va_nlink; + fse->nlink = (uint64_t)vap->va_nlink; } } diff --git a/bsd/vfs/vfs_fslog.c b/bsd/vfs/vfs_fslog.c index 618e4546c..580ea60b4 100644 --- a/bsd/vfs/vfs_fslog.c +++ b/bsd/vfs/vfs_fslog.c @@ -42,6 +42,8 @@ #include <sys/fslog.h> #include <sys/mount_internal.h> +#include <uuid/uuid.h> + /* String to append as format modifier for each key-value pair */ #define FSLOG_KEYVAL_FMT "[%s %s] " #define FSLOG_KEYVAL_FMT_LEN (sizeof(FSLOG_KEYVAL_FMT) - 1) @@ -341,12 +343,10 @@ static int escape_str(char *str, int len, int buflen) void fslog_fs_corrupt(struct mount *mnt) { if (mnt != NULL) { - if (mnt->mnt_vfsstat.f_mntonname != NULL) { - fslog_err(FSLOG_MSG_SINGLE, - FSLOG_KEY_ERR_TYPE, FSLOG_VAL_ERR_TYPE_FS, - FSLOG_KEY_MNTPT, mnt->mnt_vfsstat.f_mntonname, - NULL); - } + fslog_err(FSLOG_MSG_SINGLE, + FSLOG_KEY_ERR_TYPE, FSLOG_VAL_ERR_TYPE_FS, + FSLOG_KEY_MNTPT, mnt->mnt_vfsstat.f_mntonname, + NULL); } return; @@ -458,3 +458,73 @@ out: return; } + +static void +_fslog_extmod_msgtracer_internal(int level, const char *facility, int num_pairs, ...) +{ + va_list ap; + + va_start(ap, num_pairs); + (void) fslog_asl_msg(level, facility, + num_pairs, ap, NULL); + va_end(ap); +} + +/* Log information about external modification of a process, + * using MessageTracer formatting. Assumes that both the caller + * and target are appropriately locked. + * Currently prints following information - + * 1. Caller process name (truncated to 16 characters) + * 2. Caller process Mach-O UUID + * 3. Target process name (truncated to 16 characters) + * 4. Target process Mach-O UUID + */ +void +fslog_extmod_msgtracer(proc_t caller, proc_t target) +{ + if ((caller != PROC_NULL) && (target != PROC_NULL)) { + + /* + * Print into buffer large enough for "ThisIsAnApplicat(BC223DD7-B314-42E0-B6B0-C5D2E6638337)", + * including space for escaping, and NUL byte included in sizeof(uuid_string_t). + */ + + uuid_string_t uuidstr; + char c_name[2*MAXCOMLEN + 2 /* () */ + sizeof(uuid_string_t)]; + char t_name[2*MAXCOMLEN + 2 /* () */ + sizeof(uuid_string_t)]; + + strlcpy(c_name, caller->p_comm, sizeof(c_name)); + uuid_unparse_upper(caller->p_uuid, uuidstr); + strlcat(c_name, "(", sizeof(c_name)); + strlcat(c_name, uuidstr, sizeof(c_name)); + strlcat(c_name, ")", sizeof(c_name)); + if (0 != escape_str(c_name, strlen(c_name), sizeof(c_name))) { + return; + } + + strlcpy(t_name, target->p_comm, sizeof(t_name)); + uuid_unparse_upper(target->p_uuid, uuidstr); + strlcat(t_name, "(", sizeof(t_name)); + strlcat(t_name, uuidstr, sizeof(t_name)); + strlcat(t_name, ")", sizeof(t_name)); + if (0 != escape_str(t_name, strlen(t_name), sizeof(t_name))) { + return; + } + +#if DEBUG + printf("EXTMOD: %s(%d) -> %s(%d)\n", + c_name, + proc_pid(caller), + t_name, + proc_pid(target)); +#endif + + _fslog_extmod_msgtracer_internal(LOG_DEBUG, "messagetracer", + 4, + "com.apple.message.domain", "com.apple.kernel.external_modification", /* 0 */ + "com.apple.message.signature", c_name, /* 1 */ + "com.apple.message.signature2", t_name, /* 2 */ + "com.apple.message.result", "noop", /* 3 */ + NULL); + } +} diff --git a/bsd/vfs/vfs_init.c b/bsd/vfs/vfs_init.c index 253bbcd77..2c83c4725 100644 --- a/bsd/vfs/vfs_init.c +++ b/bsd/vfs/vfs_init.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
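The buffer sizing in fslog_extmod_msgtracer() allows for a 16-byte p_comm, the parentheses, a 37-byte uuid_string_t, and a doubled name length to leave room for escape_str() expansion. A stand-alone version of the composition, which builds on macOS/BSD where uuid_string_t and strlcat are available:

    #include <stdio.h>
    #include <string.h>
    #include <uuid/uuid.h>

    #define MAXCOMLEN 16   /* XNU's process-name limit */

    int main(void)
    {
        uuid_t uu;
        uuid_string_t uuidstr;   /* 36 chars + NUL */
        char name[2 * MAXCOMLEN + 2 /* () */ + sizeof(uuid_string_t)];

        uuid_generate(uu);
        uuid_unparse_upper(uu, uuidstr);

        strlcpy(name, "ThisIsAnApplicat", sizeof(name));
        strlcat(name, "(", sizeof(name));
        strlcat(name, uuidstr, sizeof(name));
        strlcat(name, ")", sizeof(name));

        printf("%s\n", name);   /* e.g. ThisIsAnApplicat(BC22...) */
        return 0;
    }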
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -261,6 +261,12 @@ lck_grp_t * vnode_lck_grp; lck_grp_attr_t * vnode_lck_grp_attr; lck_attr_t * vnode_lck_attr; +#if CONFIG_TRIGGERS +/* vars for vnode trigger resolver */ +lck_grp_t * trigger_vnode_lck_grp; +lck_grp_attr_t * trigger_vnode_lck_grp_attr; +lck_attr_t * trigger_vnode_lck_attr; +#endif /* vars for vnode list lock */ lck_grp_t * vnode_list_lck_grp; @@ -289,6 +295,9 @@ lck_mtx_t * mnt_list_mtx_lock; lck_mtx_t *pkg_extensions_lck; struct mount * dead_mountp; + +extern void nspace_handler_init(void); + /* * Initialize the vnode structures and initialize each file system type. */ @@ -324,6 +333,12 @@ vfsinit(void) /* Allocate vnode lock attribute */ vnode_lck_attr = lck_attr_alloc_init(); +#if CONFIG_TRIGGERS + trigger_vnode_lck_grp_attr = lck_grp_attr_alloc_init(); + trigger_vnode_lck_grp = lck_grp_alloc_init("trigger_vnode", trigger_vnode_lck_grp_attr); + trigger_vnode_lck_attr = lck_attr_alloc_init(); +#endif + /* Allocate fs config lock group attribute and group */ fsconf_lck_grp_attr= lck_grp_attr_alloc_init(); @@ -373,6 +388,7 @@ vfsinit(void) */ journal_init(); #endif + nspace_handler_init(); /* * Build vnode operation vectors. diff --git a/bsd/vfs/vfs_journal.c b/bsd/vfs/vfs_journal.c index 0a967aba9..4999f814b 100644 --- a/bsd/vfs/vfs_journal.c +++ b/bsd/vfs/vfs_journal.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1995-2010 Apple Inc. All rights reserved. + * Copyright (c) 2002-2011 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -51,6 +51,7 @@ #include <sys/tty.h> #include <sys/ubc.h> #include <sys/malloc.h> +#include <kern/task.h> #include <kern/thread.h> #include <kern/kalloc.h> #include <sys/disk.h> @@ -58,9 +59,36 @@ #include <miscfs/specfs/specdev.h> #include <libkern/OSAtomic.h> /* OSAddAtomic */ -extern task_t kernel_task; +kern_return_t thread_terminate(thread_t); -#define DBG_JOURNAL_FLUSH 1 +/* + * Set sysctl vfs.generic.jnl.kdebug.trim=1 to enable KERNEL_DEBUG_CONSTANT + * logging of trim-related calls within the journal. (They're + * disabled by default because there can be a lot of these events, + * and we don't want to overwhelm the kernel debug buffer. If you + * want to watch these events in particular, just set the sysctl.) + */ +static int jnl_kdebug = 0; +SYSCTL_DECL(_vfs_generic); +SYSCTL_NODE(_vfs_generic, OID_AUTO, jnl, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Journal"); +SYSCTL_NODE(_vfs_generic_jnl, OID_AUTO, kdebug, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "Journal kdebug"); +SYSCTL_INT(_vfs_generic_jnl_kdebug, OID_AUTO, trim, CTLFLAG_RW|CTLFLAG_LOCKED, &jnl_kdebug, 0, "Enable kdebug logging for journal TRIM"); + +#define DBG_JOURNAL_FLUSH FSDBG_CODE(DBG_JOURNAL, 1) +#define DBG_JOURNAL_TRIM_ADD FSDBG_CODE(DBG_JOURNAL, 2) +#define DBG_JOURNAL_TRIM_REMOVE FSDBG_CODE(DBG_JOURNAL, 3) +#define DBG_JOURNAL_TRIM_REMOVE_PENDING FSDBG_CODE(DBG_JOURNAL, 4) +#define DBG_JOURNAL_TRIM_REALLOC FSDBG_CODE(DBG_JOURNAL, 5) +#define DBG_JOURNAL_TRIM_FLUSH FSDBG_CODE(DBG_JOURNAL, 6) +#define DBG_JOURNAL_TRIM_UNMAP FSDBG_CODE(DBG_JOURNAL, 7) + +/* + * Cap the journal max size to 2GB. On HFS, it will attempt to occupy + * a full allocation block if the current size is smaller than the allocation + * block on which it resides. Once we hit the exabyte filesystem range, then + * it will use 2GB allocation blocks. As a result, make the cap 2GB. 
+ */ +#define MAX_JOURNAL_SIZE 0x80000000U #include <sys/sdt.h> /* DTRACE_IO1 */ #else @@ -80,6 +108,13 @@ extern task_t kernel_task; #include "vfs_journal.h" +#include <sys/kdebug.h> + +#if 0 +#undef KERNEL_DEBUG +#define KERNEL_DEBUG KERNEL_DEBUG_CONSTANT +#endif + #ifndef CONFIG_HFS_TRIM #define CONFIG_HFS_TRIM 0 #endif @@ -104,10 +139,10 @@ SYSCTL_UINT (_kern, OID_AUTO, jnl_trim_flush, CTLFLAG_RW, &jnl_trim_flush_limit, /* XXX next prototytype should be from libsa/stdlib.h> but conflicts libkern */ __private_extern__ void qsort( - void * array, - size_t nmembers, - size_t member_size, - int (*)(const void *, const void *)); + void * array, + size_t nmembers, + size_t member_size, + int (*)(const void *, const void *)); @@ -116,8 +151,13 @@ __private_extern__ void qsort( // fields as well as the first entry of binfo[] #define BLHDR_CHECKSUM_SIZE 32 - -static int end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg); +static void lock_condition(journal *jnl, boolean_t *condition, const char *condition_name); +static void wait_condition(journal *jnl, boolean_t *condition, const char *condition_name); +static void unlock_condition(journal *jnl, boolean_t *condition); +static void finish_end_thread(transaction *tr); +static void write_header_thread(journal *jnl); +static int finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callback_arg); +static int end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg, boolean_t drop_lock, boolean_t must_wait); static void abort_transaction(journal *jnl, transaction *tr); static void dump_journal(journal *jnl); @@ -125,8 +165,8 @@ static __inline__ void lock_journal(journal *jnl); static __inline__ void unlock_journal(journal *jnl); static __inline__ void lock_oldstart(journal *jnl); static __inline__ void unlock_oldstart(journal *jnl); - - +static __inline__ void lock_flush(journal *jnl); +static __inline__ void unlock_flush(journal *jnl); // @@ -134,10 +174,10 @@ static __inline__ void unlock_oldstart(journal *jnl); // typedef struct bucket { - off_t block_num; - uint32_t jnl_offset; - uint32_t block_size; - int32_t cksum; + off_t block_num; + uint32_t jnl_offset; + uint32_t block_size; + int32_t cksum; } bucket; #define STARTING_BUCKETS 256 @@ -149,56 +189,56 @@ static int do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_ static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting); #define CHECK_JOURNAL(jnl) \ - do { \ - if (jnl == NULL) {\ - panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__);\ - }\ - if (jnl->jdev == NULL) { \ - panic("%s:%d: jdev is null!\n", __FILE__, __LINE__);\ - } \ - if (jnl->fsdev == NULL) { \ - panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__);\ - } \ - if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) {\ - panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n",\ - __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC);\ - }\ - if ( jnl->jhdr->start <= 0 \ - || jnl->jhdr->start > jnl->jhdr->size) {\ - panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \ - __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size);\ - }\ - if ( jnl->jhdr->end <= 0 \ - || jnl->jhdr->end > jnl->jhdr->size) {\ - panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \ - __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size);\ - }\ - } while(0) + do { \ + if (jnl == NULL) { \ + 
panic("%s:%d: null journal ptr?\n", __FILE__, __LINE__); \ + } \ + if (jnl->jdev == NULL) { \ + panic("%s:%d: jdev is null!\n", __FILE__, __LINE__); \ + } \ + if (jnl->fsdev == NULL) { \ + panic("%s:%d: fsdev is null!\n", __FILE__, __LINE__); \ + } \ + if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC) { \ + panic("%s:%d: jhdr magic corrupted (0x%x != 0x%x)\n", \ + __FILE__, __LINE__, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); \ + } \ + if ( jnl->jhdr->start <= 0 \ + || jnl->jhdr->start > jnl->jhdr->size) { \ + panic("%s:%d: jhdr start looks bad (0x%llx max size 0x%llx)\n", \ + __FILE__, __LINE__, jnl->jhdr->start, jnl->jhdr->size); \ + } \ + if ( jnl->jhdr->end <= 0 \ + || jnl->jhdr->end > jnl->jhdr->size) { \ + panic("%s:%d: jhdr end looks bad (0x%llx max size 0x%llx)\n", \ + __FILE__, __LINE__, jnl->jhdr->end, jnl->jhdr->size); \ + } \ + } while(0) #define CHECK_TRANSACTION(tr) \ - do {\ - if (tr == NULL) {\ - panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__);\ - }\ - if (tr->jnl == NULL) {\ - panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__);\ - }\ - if (tr->blhdr != (block_list_header *)tr->tbuffer) {\ - panic("%s:%d: blhdr (%p) != tbuffer (%p)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer);\ - }\ - if (tr->total_bytes < 0) {\ - panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes);\ - }\ - if (tr->journal_start < 0) {\ - panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start);\ - }\ - if (tr->journal_end < 0) {\ - panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end);\ - }\ - if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > (tr->jnl->jhdr->size/tr->jnl->jhdr->jhdr_size))) {\ - panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks);\ - }\ - } while(0) + do { \ + if (tr == NULL) { \ + panic("%s:%d: null transaction ptr?\n", __FILE__, __LINE__); \ + } \ + if (tr->jnl == NULL) { \ + panic("%s:%d: null tr->jnl ptr?\n", __FILE__, __LINE__); \ + } \ + if (tr->blhdr != (block_list_header *)tr->tbuffer) { \ + panic("%s:%d: blhdr (%p) != tbuffer (%p)\n", __FILE__, __LINE__, tr->blhdr, tr->tbuffer); \ + } \ + if (tr->total_bytes < 0) { \ + panic("%s:%d: tr total_bytes looks bad: %d\n", __FILE__, __LINE__, tr->total_bytes); \ + } \ + if (tr->journal_start < 0) { \ + panic("%s:%d: tr journal start looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_start); \ + } \ + if (tr->journal_end < 0) { \ + panic("%s:%d: tr journal end looks bad: 0x%llx\n", __FILE__, __LINE__, tr->journal_end); \ + } \ + if (tr->blhdr && (tr->blhdr->max_blocks <= 0 || tr->blhdr->max_blocks > (tr->jnl->jhdr->size/tr->jnl->jhdr->jhdr_size))) { \ + panic("%s:%d: tr blhdr max_blocks looks bad: %d\n", __FILE__, __LINE__, tr->blhdr->max_blocks); \ + } \ + } while(0) @@ -210,14 +250,14 @@ static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, of static int calc_checksum(char *ptr, int len) { - int i, cksum=0; + int i, cksum=0; - // this is a lame checksum but for now it'll do - for(i=0; i < len; i++, ptr++) { + // this is a lame checksum but for now it'll do + for(i = 0; i < len; i++, ptr++) { cksum = (cksum << 8) ^ (cksum + *(unsigned char *)ptr); - } + } - return (~cksum); + return (~cksum); } // @@ -247,6 +287,18 @@ unlock_journal(journal *jnl) lck_mtx_unlock(&jnl->jlock); } +static __inline__ void +lock_flush(journal *jnl) +{ + lck_mtx_lock(&jnl->flock); +} + +static __inline__ void +unlock_flush(journal *jnl) +{ + 
lck_mtx_unlock(&jnl->flock); +} + static __inline__ void lock_oldstart(journal *jnl) { @@ -277,78 +329,80 @@ unlock_oldstart(journal *jnl) static size_t do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction) { - int err, curlen=len; - size_t io_sz = 0; - buf_t bp; - off_t max_iosize; + int err, curlen=len; + size_t io_sz = 0; + buf_t bp; + off_t max_iosize; - if (*offset < 0 || *offset > jnl->jhdr->size) { + if (*offset < 0 || *offset > jnl->jhdr->size) { panic("jnl: do_jnl_io: bad offset 0x%llx (max 0x%llx)\n", *offset, jnl->jhdr->size); - } - - if (direction & JNL_WRITE) - max_iosize = jnl->max_write_size; - else if (direction & JNL_READ) - max_iosize = jnl->max_read_size; - else - max_iosize = 128 * 1024; + } + + if (direction & JNL_WRITE) + max_iosize = jnl->max_write_size; + else if (direction & JNL_READ) + max_iosize = jnl->max_read_size; + else + max_iosize = 128 * 1024; - again: - bp = alloc_io_buf(jnl->jdev, 1); +again: + bp = alloc_io_buf(jnl->jdev, 1); - if (*offset + (off_t)curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) { + if (*offset + (off_t)curlen > jnl->jhdr->size && *offset != 0 && jnl->jhdr->size != 0) { if (*offset == jnl->jhdr->size) { *offset = jnl->jhdr->jhdr_size; } else { curlen = (off_t)jnl->jhdr->size - *offset; } - } + } if (curlen > max_iosize) { curlen = max_iosize; } - if (curlen <= 0) { + if (curlen <= 0) { panic("jnl: do_jnl_io: curlen == %d, offset 0x%llx len %zd\n", curlen, *offset, len); - } + } if (*offset == 0 && (direction & JNL_HEADER) == 0) { panic("jnl: request for i/o to jnl-header without JNL_HEADER flag set! (len %d, data %p)\n", curlen, data); } - if (direction & JNL_READ) - buf_setflags(bp, B_READ); - else { - /* - * don't have to set any flags - */ - vnode_startwrite(jnl->jdev); - } - buf_setsize(bp, curlen); - buf_setcount(bp, curlen); - buf_setdataptr(bp, (uintptr_t)data); - buf_setblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size)); - buf_setlblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size)); - if ((direction & JNL_WRITE) && (jnl->flags & JOURNAL_DO_FUA_WRITES)) { - buf_markfua(bp); - } + if (direction & JNL_READ) + buf_setflags(bp, B_READ); + else { + /* + * don't have to set any flags + */ + vnode_startwrite(jnl->jdev); + } + buf_setsize(bp, curlen); + buf_setcount(bp, curlen); + buf_setdataptr(bp, (uintptr_t)data); + buf_setblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size)); + buf_setlblkno(bp, (daddr64_t) ((jnl->jdev_offset + *offset) / (off_t)jnl->jhdr->jhdr_size)); + + if ((direction & JNL_WRITE) && (jnl->flags & JOURNAL_DO_FUA_WRITES)) { + buf_markfua(bp); + } - DTRACE_IO1(journal__start, buf_t, bp); - err = VNOP_STRATEGY(bp); - if (!err) { + DTRACE_IO1(journal__start, buf_t, bp); + err = VNOP_STRATEGY(bp); + if (!err) { err = (int)buf_biowait(bp); - } - DTRACE_IO1(journal__done, buf_t, bp); - free_io_buf(bp); + } + DTRACE_IO1(journal__done, buf_t, bp); + free_io_buf(bp); - if (err) { - printf("jnl: %s: do_jnl_io: strategy err 0x%x\n", jnl->jdev_name, err); - return 0; - } + if (err) { + printf("jnl: %s: do_jnl_io: strategy err 0x%x\n", jnl->jdev_name, err); + return 0; + } + + *offset += curlen; + io_sz += curlen; - *offset += curlen; - io_sz += curlen; - if (io_sz != len) { + if (io_sz != len) { // handle wrap-around data = (char *)data + curlen; curlen = len - io_sz; @@ -356,21 +410,21 @@ do_journal_io(journal *jnl, off_t *offset, void *data, size_t len, int direction *offset = 
jnl->jhdr->jhdr_size;
 		}
 		goto again;
-	}
+	}

-	return io_sz;
+	return io_sz;
 }

 static size_t
 read_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
 {
-	return do_journal_io(jnl, offset, data, len, JNL_READ);
+	return do_journal_io(jnl, offset, data, len, JNL_READ);
 }

 static size_t
 write_journal_data(journal *jnl, off_t *offset, void *data, size_t len)
 {
-	return do_journal_io(jnl, offset, data, len, JNL_WRITE);
+	return do_journal_io(jnl, offset, data, len, JNL_WRITE);
 }


@@ -383,64 +437,66 @@ read_journal_header(journal *jnl, void *data, size_t len)
 }

 static int
-write_journal_header(journal *jnl, int updating_start)
-{
-	static int num_err_prints = 0;
-	int ret=0;
-	off_t jhdr_offset = 0;
-	struct vfs_context context;
-
-	context.vc_thread = current_thread();
-	context.vc_ucred = NOCRED;
-	//
-	// Flush the track cache if we're not doing force-unit-access
-	// writes.
-	//
-	if (!updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) {
-		ret = VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context);
-	}
-	if (ret != 0) {
-		//
-		// Only print this error if it's a different error than the
-		// previous one, or if it's the first time for this device
-		// or if the total number of printfs is less than 25. We
-		// allow for up to 25 printfs to insure that some make it
-		// into the on-disk syslog. Otherwise if we only printed
-		// one, it's possible it would never make it to the syslog
-		// for the root volume and that makes debugging hard.
+write_journal_header(journal *jnl, int updating_start, uint32_t sequence_num)
+{
+	static int num_err_prints = 0;
+	int ret=0;
+	off_t jhdr_offset = 0;
+	struct vfs_context context;
+
+	context.vc_thread = current_thread();
+	context.vc_ucred = NOCRED;
+	//
+	// Flush the track cache if we're not doing force-unit-access
+	// writes.
 	//
-	if ( ret != jnl->last_flush_err
-	    || (jnl->flags & JOURNAL_FLUSHCACHE_ERR) == 0
-	    || num_err_prints++ < 25) {
+	if (!updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) {
+		ret = VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context);
+	}
+	if (ret != 0) {
+		//
+		// Only print this error if it's a different error than the
+		// previous one, or if it's the first time for this device
+		// or if the total number of printfs is less than 25. We
+		// allow for up to 25 printfs to ensure that some make it
+		// into the on-disk syslog. Otherwise if we only printed
+		// one, it's possible it would never make it to the syslog
+		// for the root volume and that makes debugging hard. 
+ // + if ( ret != jnl->last_flush_err + || (jnl->flags & JOURNAL_FLUSHCACHE_ERR) == 0 + || num_err_prints++ < 25) { - printf("jnl: %s: flushing fs disk buffer returned 0x%x\n", jnl->jdev_name, ret); + printf("jnl: %s: flushing fs disk buffer returned 0x%x\n", jnl->jdev_name, ret); - jnl->flags |= JOURNAL_FLUSHCACHE_ERR; - jnl->last_flush_err = ret; + jnl->flags |= JOURNAL_FLUSHCACHE_ERR; + jnl->last_flush_err = ret; + } } - } - jnl->jhdr->checksum = 0; - jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE); - if (do_journal_io(jnl, &jhdr_offset, jnl->header_buf, jnl->jhdr->jhdr_size, JNL_WRITE|JNL_HEADER) != (size_t)jnl->jhdr->jhdr_size) { - printf("jnl: %s: write_journal_header: error writing the journal header!\n", jnl->jdev_name); - jnl->flags |= JOURNAL_INVALID; - return -1; - } - - // If we're not doing force-unit-access writes, then we - // have to flush after writing the journal header so that - // a future transaction doesn't sneak out to disk before - // the header does and thus overwrite data that the old - // journal header refers to. Saw this exact case happen - // on an IDE bus analyzer with Larry Barras so while it - // may seem obscure, it's not. - // - if (updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) { - VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context); - } + jnl->jhdr->sequence_num = sequence_num; + jnl->jhdr->checksum = 0; + jnl->jhdr->checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE); - return 0; + if (do_journal_io(jnl, &jhdr_offset, jnl->header_buf, jnl->jhdr->jhdr_size, JNL_WRITE|JNL_HEADER) != (size_t)jnl->jhdr->jhdr_size) { + printf("jnl: %s: write_journal_header: error writing the journal header!\n", jnl->jdev_name); + jnl->flags |= JOURNAL_INVALID; + return -1; + } + + // If we're not doing force-unit-access writes, then we + // have to flush after writing the journal header so that + // a future transaction doesn't sneak out to disk before + // the header does and thus overwrite data that the old + // journal header refers to. Saw this exact case happen + // on an IDE bus analyzer with Larry Barras so while it + // may seem obscure, it's not. 
+ // + if (updating_start && (jnl->flags & JOURNAL_DO_FUA_WRITES) == 0) { + VNOP_IOCTL(jnl->jdev, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, &context); + } + + return 0; } @@ -456,18 +512,29 @@ write_journal_header(journal *jnl, int updating_start) static void free_old_stuff(journal *jnl) { - transaction *tr, *next; + transaction *tr, *next; + block_list_header *blhdr=NULL, *next_blhdr=NULL; - lock_oldstart(jnl); - tr = jnl->tr_freeme; - jnl->tr_freeme = NULL; - unlock_oldstart(jnl); + if (jnl->tr_freeme == NULL) + return; - for(; tr; tr=next) { - next = tr->next; - FREE_ZONE(tr, sizeof(transaction), M_JNL_TR); - } + lock_oldstart(jnl); + tr = jnl->tr_freeme; + jnl->tr_freeme = NULL; + unlock_oldstart(jnl); + + for(; tr; tr=next) { + for (blhdr = tr->blhdr; blhdr; blhdr = next_blhdr) { + next_blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum); + blhdr->binfo[0].bnum = 0xdeadc0de; + + kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size); + KERNEL_DEBUG(0xbbbbc01c, jnl, tr, tr->tbuffer_size, 0, 0); + } + next = tr->next; + FREE_ZONE(tr, sizeof(transaction), M_JNL_TR); + } } @@ -481,167 +548,169 @@ free_old_stuff(journal *jnl) static void buffer_flushed_callback(struct buf *bp, void *arg) { - transaction *tr; - journal *jnl; - transaction *ctr, *prev=NULL, *next; - size_t i; - int bufsize, amt_flushed, total_bytes; + transaction *tr; + journal *jnl; + transaction *ctr, *prev=NULL, *next; + size_t i; + int bufsize, amt_flushed, total_bytes; - //printf("jnl: buf flush: bp @ 0x%x l/blkno %qd/%qd vp 0x%x tr @ 0x%x\n", - // bp, buf_lblkno(bp), buf_blkno(bp), buf_vnode(bp), arg); + //printf("jnl: buf flush: bp @ 0x%x l/blkno %qd/%qd vp 0x%x tr @ 0x%x\n", + // bp, buf_lblkno(bp), buf_blkno(bp), buf_vnode(bp), arg); - // snarf out the bits we want - bufsize = buf_size(bp); - tr = (transaction *)arg; + // snarf out the bits we want + bufsize = buf_size(bp); + tr = (transaction *)arg; - // then we've already seen it - if (tr == NULL) { + // then we've already seen it + if (tr == NULL) { return; - } + } - CHECK_TRANSACTION(tr); + CHECK_TRANSACTION(tr); - jnl = tr->jnl; - if (jnl->flags & JOURNAL_INVALID) { + jnl = tr->jnl; + if (jnl->flags & JOURNAL_INVALID) { return; - } + } - CHECK_JOURNAL(jnl); + CHECK_JOURNAL(jnl); - amt_flushed = tr->num_killed; - total_bytes = tr->total_bytes; + amt_flushed = tr->num_killed; + total_bytes = tr->total_bytes; - // update the number of blocks that have been flushed. - // this buf may represent more than one block so take - // that into account. - // - // OSAddAtomic() returns the value of tr->num_flushed before the add - // - amt_flushed += OSAddAtomic(bufsize, &tr->num_flushed); + // update the number of blocks that have been flushed. + // this buf may represent more than one block so take + // that into account. + // + // OSAddAtomic() returns the value of tr->num_flushed before the add + // + amt_flushed += OSAddAtomic(bufsize, &tr->num_flushed); - // if this transaction isn't done yet, just return as - // there is nothing to do. - // - // NOTE: we are careful to not reference anything through - // the tr pointer after doing the OSAddAtomic(). if - // this if statement fails then we are the last one - // and then it's ok to dereference "tr". - // - if ((amt_flushed + bufsize) < total_bytes) { + // if this transaction isn't done yet, just return as + // there is nothing to do. + // + // NOTE: we are careful to not reference anything through + // the tr pointer after doing the OSAddAtomic(). 
if + // this if statement fails then we are the last one + // and then it's ok to dereference "tr". + // + if ((amt_flushed + bufsize) < total_bytes) { return; - } + } - // this will single thread checking the transaction - lock_oldstart(jnl); + // this will single thread checking the transaction + lock_oldstart(jnl); - if (tr->total_bytes == (int)0xfbadc0de) { - // then someone beat us to it... - unlock_oldstart(jnl); - return; - } + if (tr->total_bytes == (int)0xfbadc0de) { + // then someone beat us to it... + unlock_oldstart(jnl); + return; + } - // mark this so that we're the owner of dealing with the - // cleanup for this transaction - tr->total_bytes = 0xfbadc0de; + // mark this so that we're the owner of dealing with the + // cleanup for this transaction + tr->total_bytes = 0xfbadc0de; - //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n", - // tr, tr->journal_start, tr->journal_end, jnl); + //printf("jnl: tr 0x%x (0x%llx 0x%llx) in jnl 0x%x completed.\n", + // tr, tr->journal_start, tr->journal_end, jnl); - // find this entry in the old_start[] index and mark it completed - for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) { + // find this entry in the old_start[] index and mark it completed + for(i = 0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) { - if ((off_t)(jnl->old_start[i] & ~(0x8000000000000000ULL)) == tr->journal_start) { - jnl->old_start[i] &= ~(0x8000000000000000ULL); - break; + if ((off_t)(jnl->old_start[i] & ~(0x8000000000000000ULL)) == tr->journal_start) { + jnl->old_start[i] &= ~(0x8000000000000000ULL); + break; + } } - } - if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) { - panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr %p, jnl %p)\n", - tr->journal_start, tr, jnl); - } + if (i >= sizeof(jnl->old_start)/sizeof(jnl->old_start[0])) { + panic("jnl: buffer_flushed: did not find tr w/start @ %lld (tr %p, jnl %p)\n", + tr->journal_start, tr, jnl); + } - // if we are here then we need to update the journal header - // to reflect that this transaction is complete - if (tr->journal_start == jnl->active_start) { - jnl->active_start = tr->journal_end; - tr->journal_start = tr->journal_end = (off_t)0; - } + // if we are here then we need to update the journal header + // to reflect that this transaction is complete + if (tr->journal_start == jnl->active_start) { + jnl->active_start = tr->journal_end; + tr->journal_start = tr->journal_end = (off_t)0; + } - // go through the completed_trs list and try to coalesce - // entries, restarting back at the beginning if we have to. - for(ctr=jnl->completed_trs; ctr; prev=ctr, ctr=next) { - if (ctr->journal_start == jnl->active_start) { - jnl->active_start = ctr->journal_end; - if (prev) { - prev->next = ctr->next; - } - if (ctr == jnl->completed_trs) { - jnl->completed_trs = ctr->next; - } + // go through the completed_trs list and try to coalesce + // entries, restarting back at the beginning if we have to. 
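/*
 * A minimal user-space sketch of the coalescing step below, assuming a
 * simplified transaction that carries only its [start, end) journal
 * range.  The names txn and coalesce_completed are illustrative; the
 * kernel code additionally recycles merged entries onto tr_freeme and
 * keeps the list sorted, which this sketch omits.
 */
#include <stddef.h>
#include <stdint.h>

typedef struct txn {
	int64_t     start, end;		/* [start, end) bytes covered in the journal */
	struct txn *next;
} txn;

static void
coalesce_completed(txn **head, int64_t *active_start)
{
	int merged;

	do {		/* restart after every merge, as the loop below does */
		merged = 0;
		for (txn **pp = head; *pp != NULL; pp = &(*pp)->next) {
			if ((*pp)->start == *active_start) {
				*active_start = (*pp)->end;	/* consume the contiguous range */
				*pp = (*pp)->next;		/* unlink (a real impl would free it) */
				merged = 1;
				break;
			}
		}
	} while (merged);
}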
+ for (ctr = jnl->completed_trs; ctr; prev=ctr, ctr=next) { + if (ctr->journal_start == jnl->active_start) { + jnl->active_start = ctr->journal_end; + if (prev) { + prev->next = ctr->next; + } + if (ctr == jnl->completed_trs) { + jnl->completed_trs = ctr->next; + } - next = jnl->completed_trs; // this starts us over again - ctr->next = jnl->tr_freeme; - jnl->tr_freeme = ctr; - ctr = NULL; - } else if (tr->journal_end == ctr->journal_start) { - ctr->journal_start = tr->journal_start; - next = jnl->completed_trs; // this starts us over again - ctr = NULL; - tr->journal_start = tr->journal_end = (off_t)0; - } else if (tr->journal_start == ctr->journal_end) { - ctr->journal_end = tr->journal_end; - next = ctr->next; - tr->journal_start = tr->journal_end = (off_t)0; - } else if (ctr->next && ctr->journal_end == ctr->next->journal_start) { - // coalesce the next entry with this one and link the next - // entry in at the head of the tr_freeme list - next = ctr->next; // temporarily use the "next" variable - ctr->journal_end = next->journal_end; - ctr->next = next->next; - next->next = jnl->tr_freeme; // link in the next guy at the head of the tr_freeme list - jnl->tr_freeme = next; - - next = jnl->completed_trs; // this starts us over again - ctr = NULL; - } else { - next = ctr->next; + next = jnl->completed_trs; // this starts us over again + ctr->next = jnl->tr_freeme; + jnl->tr_freeme = ctr; + ctr = NULL; + } else if (tr->journal_end == ctr->journal_start) { + ctr->journal_start = tr->journal_start; + next = jnl->completed_trs; // this starts us over again + ctr = NULL; + tr->journal_start = tr->journal_end = (off_t)0; + } else if (tr->journal_start == ctr->journal_end) { + ctr->journal_end = tr->journal_end; + next = ctr->next; + tr->journal_start = tr->journal_end = (off_t)0; + } else if (ctr->next && ctr->journal_end == ctr->next->journal_start) { + // coalesce the next entry with this one and link the next + // entry in at the head of the tr_freeme list + next = ctr->next; // temporarily use the "next" variable + ctr->journal_end = next->journal_end; + ctr->next = next->next; + next->next = jnl->tr_freeme; // link in the next guy at the head of the tr_freeme list + jnl->tr_freeme = next; + + next = jnl->completed_trs; // this starts us over again + ctr = NULL; + } else { + next = ctr->next; + } } - } - // if this is true then we didn't merge with anyone - // so link ourselves in at the head of the completed - // transaction list. - if (tr->journal_start != 0) { - // put this entry into the correct sorted place - // in the list instead of just at the head. - // + // if this is true then we didn't merge with anyone + // so link ourselves in at the head of the completed + // transaction list. + if (tr->journal_start != 0) { + // put this entry into the correct sorted place + // in the list instead of just at the head. 
+ // - prev = NULL; - for(ctr=jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) { - // just keep looping - } + prev = NULL; + for (ctr = jnl->completed_trs; ctr && tr->journal_start > ctr->journal_start; prev=ctr, ctr=ctr->next) { + // just keep looping + } - if (ctr == NULL && prev == NULL) { - jnl->completed_trs = tr; - tr->next = NULL; - } else if (ctr == jnl->completed_trs) { - tr->next = jnl->completed_trs; - jnl->completed_trs = tr; + if (ctr == NULL && prev == NULL) { + jnl->completed_trs = tr; + tr->next = NULL; + } else if (ctr == jnl->completed_trs) { + tr->next = jnl->completed_trs; + jnl->completed_trs = tr; + } else { + tr->next = prev->next; + prev->next = tr; + } } else { - tr->next = prev->next; - prev->next = tr; - } - } else { - // if we're here this tr got merged with someone else so - // put it on the list to be free'd - tr->next = jnl->tr_freeme; - jnl->tr_freeme = tr; - } - unlock_oldstart(jnl); + // if we're here this tr got merged with someone else so + // put it on the list to be free'd + tr->next = jnl->tr_freeme; + jnl->tr_freeme = tr; + } + unlock_oldstart(jnl); + + unlock_condition(jnl, &jnl->asyncIO); } @@ -655,51 +724,51 @@ buffer_flushed_callback(struct buf *bp, void *arg) static void swap_journal_header(journal *jnl) { - jnl->jhdr->magic = SWAP32(jnl->jhdr->magic); - jnl->jhdr->endian = SWAP32(jnl->jhdr->endian); - jnl->jhdr->start = SWAP64(jnl->jhdr->start); - jnl->jhdr->end = SWAP64(jnl->jhdr->end); - jnl->jhdr->size = SWAP64(jnl->jhdr->size); - jnl->jhdr->blhdr_size = SWAP32(jnl->jhdr->blhdr_size); - jnl->jhdr->checksum = SWAP32(jnl->jhdr->checksum); - jnl->jhdr->jhdr_size = SWAP32(jnl->jhdr->jhdr_size); - jnl->jhdr->sequence_num = SWAP32(jnl->jhdr->sequence_num); + jnl->jhdr->magic = SWAP32(jnl->jhdr->magic); + jnl->jhdr->endian = SWAP32(jnl->jhdr->endian); + jnl->jhdr->start = SWAP64(jnl->jhdr->start); + jnl->jhdr->end = SWAP64(jnl->jhdr->end); + jnl->jhdr->size = SWAP64(jnl->jhdr->size); + jnl->jhdr->blhdr_size = SWAP32(jnl->jhdr->blhdr_size); + jnl->jhdr->checksum = SWAP32(jnl->jhdr->checksum); + jnl->jhdr->jhdr_size = SWAP32(jnl->jhdr->jhdr_size); + jnl->jhdr->sequence_num = SWAP32(jnl->jhdr->sequence_num); } static void swap_block_list_header(journal *jnl, block_list_header *blhdr) { - int i; + int i; - blhdr->max_blocks = SWAP16(blhdr->max_blocks); - blhdr->num_blocks = SWAP16(blhdr->num_blocks); - blhdr->bytes_used = SWAP32(blhdr->bytes_used); - blhdr->checksum = SWAP32(blhdr->checksum); - blhdr->flags = SWAP32(blhdr->flags); - - if (blhdr->num_blocks >= ((jnl->jhdr->blhdr_size / sizeof(block_info)) - 1)) { - printf("jnl: %s: blhdr num blocks looks suspicious (%d / blhdr size %d). not swapping.\n", jnl->jdev_name, blhdr->num_blocks, jnl->jhdr->blhdr_size); - return; - } + blhdr->max_blocks = SWAP16(blhdr->max_blocks); + blhdr->num_blocks = SWAP16(blhdr->num_blocks); + blhdr->bytes_used = SWAP32(blhdr->bytes_used); + blhdr->checksum = SWAP32(blhdr->checksum); + blhdr->flags = SWAP32(blhdr->flags); + + if (blhdr->num_blocks >= ((jnl->jhdr->blhdr_size / sizeof(block_info)) - 1)) { + printf("jnl: %s: blhdr num blocks looks suspicious (%d / blhdr size %d). 
not swapping.\n", jnl->jdev_name, blhdr->num_blocks, jnl->jhdr->blhdr_size); + return; + } - for(i=0; i < blhdr->num_blocks; i++) { + for(i = 0; i < blhdr->num_blocks; i++) { blhdr->binfo[i].bnum = SWAP64(blhdr->binfo[i].bnum); blhdr->binfo[i].u.bi.bsize = SWAP32(blhdr->binfo[i].u.bi.bsize); blhdr->binfo[i].u.bi.b.cksum = SWAP32(blhdr->binfo[i].u.bi.b.cksum); - } + } } static int update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize) { - int ret; - struct buf *oblock_bp=NULL; + int ret; + struct buf *oblock_bp=NULL; - // first read the block we want. - ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp); - if (ret != 0) { - printf("jnl: %s: update_fs_block: error reading fs block # %lld! (ret %d)\n", jnl->jdev_name, fs_block, ret); + // first read the block we want. + ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp); + if (ret != 0) { + printf("jnl: %s: update_fs_block: error reading fs block # %lld! (ret %d)\n", jnl->jdev_name, fs_block, ret); if (oblock_bp) { buf_brelse(oblock_bp); @@ -709,277 +778,277 @@ update_fs_block(journal *jnl, void *block_ptr, off_t fs_block, size_t bsize) // let's try to be aggressive here and just re-write the block oblock_bp = buf_getblk(jnl->fsdev, (daddr64_t)fs_block, bsize, 0, 0, BLK_META); if (oblock_bp == NULL) { - printf("jnl: %s: update_fs_block: buf_getblk() for %lld failed! failing update.\n", jnl->jdev_name, fs_block); - return -1; + printf("jnl: %s: update_fs_block: buf_getblk() for %lld failed! failing update.\n", jnl->jdev_name, fs_block); + return -1; } - } + } - // make sure it's the correct size. - if (buf_size(oblock_bp) != bsize) { + // make sure it's the correct size. + if (buf_size(oblock_bp) != bsize) { buf_brelse(oblock_bp); return -1; - } + } - // copy the journal data over top of it - memcpy((char *)0 + buf_dataptr(oblock_bp), block_ptr, bsize); + // copy the journal data over top of it + memcpy((char *)buf_dataptr(oblock_bp), block_ptr, bsize); - if ((ret = VNOP_BWRITE(oblock_bp)) != 0) { - printf("jnl: %s: update_fs_block: failed to update block %lld (ret %d)\n", jnl->jdev_name, fs_block,ret); - return ret; - } + if ((ret = VNOP_BWRITE(oblock_bp)) != 0) { + printf("jnl: %s: update_fs_block: failed to update block %lld (ret %d)\n", jnl->jdev_name, fs_block,ret); + return ret; + } - // and now invalidate it so that if someone else wants to read - // it in a different size they'll be able to do it. - ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp); - if (oblock_bp) { + // and now invalidate it so that if someone else wants to read + // it in a different size they'll be able to do it. 
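/*
 * A toy model of why the re-read and buf_markinvalid() below matter,
 * assuming a cache keyed by block number and buffer size: per the
 * comment above, a copy written at one size must not stop a later
 * reader from bringing the block in at a different size.  All names
 * here are illustrative.
 */
#include <stddef.h>
#include <stdint.h>

typedef struct {
	int64_t	blkno;
	size_t	size;
	int	valid;
} cache_entry;

/* Drop every cached copy of blkno, whatever size it was cached at. */
static void
invalidate_block(cache_entry *cache, int nentries, int64_t blkno)
{
	int i;

	for (i = 0; i < nentries; i++) {
		if (cache[i].valid && cache[i].blkno == blkno)
			cache[i].valid = 0;	/* analogous to buf_markinvalid() */
	}
}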
+ ret = buf_meta_bread(jnl->fsdev, (daddr64_t)fs_block, bsize, NOCRED, &oblock_bp); + if (oblock_bp) { buf_markinvalid(oblock_bp); buf_brelse(oblock_bp); - } + } - return 0; + return 0; } static int grow_table(struct bucket **buf_ptr, int num_buckets, int new_size) { - struct bucket *newBuf; - int current_size = num_buckets, i; + struct bucket *newBuf; + int current_size = num_buckets, i; - // return if newsize is less than the current size - if (new_size < num_buckets) { - return current_size; - } + // return if newsize is less than the current size + if (new_size < num_buckets) { + return current_size; + } - if ((MALLOC(newBuf, struct bucket *, new_size*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) { - printf("jnl: grow_table: no memory to expand coalesce buffer!\n"); - return -1; - } + if ((MALLOC(newBuf, struct bucket *, new_size*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) { + printf("jnl: grow_table: no memory to expand coalesce buffer!\n"); + return -1; + } - // printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size); + // printf("jnl: lookup_bucket: expanded co_buf to %d elems\n", new_size); - // copy existing elements - bcopy(*buf_ptr, newBuf, num_buckets*sizeof(struct bucket)); + // copy existing elements + bcopy(*buf_ptr, newBuf, num_buckets*sizeof(struct bucket)); - // initialize the new ones - for(i=num_buckets; i < new_size; i++) { - newBuf[i].block_num = (off_t)-1; - } + // initialize the new ones + for(i = num_buckets; i < new_size; i++) { + newBuf[i].block_num = (off_t)-1; + } - // free the old container - FREE(*buf_ptr, M_TEMP); + // free the old container + FREE(*buf_ptr, M_TEMP); - // reset the buf_ptr - *buf_ptr = newBuf; + // reset the buf_ptr + *buf_ptr = newBuf; - return new_size; + return new_size; } static int lookup_bucket(struct bucket **buf_ptr, off_t block_num, int num_full) { - int lo, hi, index, matches, i; + int lo, hi, index, matches, i; - if (num_full == 0) { - return 0; // table is empty, so insert at index=0 - } + if (num_full == 0) { + return 0; // table is empty, so insert at index=0 + } - lo = 0; - hi = num_full - 1; - index = -1; + lo = 0; + hi = num_full - 1; + index = -1; - // perform binary search for block_num - do { - int mid = (hi - lo)/2 + lo; - off_t this_num = (*buf_ptr)[mid].block_num; + // perform binary search for block_num + do { + int mid = (hi - lo)/2 + lo; + off_t this_num = (*buf_ptr)[mid].block_num; - if (block_num == this_num) { - index = mid; - break; - } + if (block_num == this_num) { + index = mid; + break; + } - if (block_num < this_num) { - hi = mid; - continue; - } + if (block_num < this_num) { + hi = mid; + continue; + } - if (block_num > this_num) { - lo = mid + 1; - continue; - } - } while(lo < hi); + if (block_num > this_num) { + lo = mid + 1; + continue; + } + } while (lo < hi); - // check if lo and hi converged on the match - if (block_num == (*buf_ptr)[hi].block_num) { - index = hi; - } + // check if lo and hi converged on the match + if (block_num == (*buf_ptr)[hi].block_num) { + index = hi; + } - // if no existing entry found, find index for new one - if (index == -1) { - index = (block_num < (*buf_ptr)[hi].block_num) ? 
hi : hi + 1; - } else { - // make sure that we return the right-most index in the case of multiple matches - matches = 0; - i = index + 1; - while(i < num_full && block_num == (*buf_ptr)[i].block_num) { - matches++; - i++; - } - - index += matches; - } + // if no existing entry found, find index for new one + if (index == -1) { + index = (block_num < (*buf_ptr)[hi].block_num) ? hi : hi + 1; + } else { + // make sure that we return the right-most index in the case of multiple matches + matches = 0; + i = index + 1; + while (i < num_full && block_num == (*buf_ptr)[i].block_num) { + matches++; + i++; + } + + index += matches; + } - return index; + return index; } static int insert_block(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t num, size_t size, size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr, int overwriting) { - if (!overwriting) { - // grow the table if we're out of space - if (*num_full_ptr >= *num_buckets_ptr) { - int new_size = *num_buckets_ptr * 2; - int grow_size = grow_table(buf_ptr, *num_buckets_ptr, new_size); + if (!overwriting) { + // grow the table if we're out of space + if (*num_full_ptr >= *num_buckets_ptr) { + int new_size = *num_buckets_ptr * 2; + int grow_size = grow_table(buf_ptr, *num_buckets_ptr, new_size); - if (grow_size < new_size) { - printf("jnl: %s: add_block: grow_table returned an error!\n", jnl->jdev_name); - return -1; - } + if (grow_size < new_size) { + printf("jnl: %s: add_block: grow_table returned an error!\n", jnl->jdev_name); + return -1; + } - *num_buckets_ptr = grow_size; //update num_buckets to reflect the new size - } + *num_buckets_ptr = grow_size; //update num_buckets to reflect the new size + } - // if we're not inserting at the end, we need to bcopy - if (blk_index != *num_full_ptr) { - bcopy( (*buf_ptr)+(blk_index), (*buf_ptr)+(blk_index+1), (*num_full_ptr-blk_index)*sizeof(struct bucket) ); - } + // if we're not inserting at the end, we need to bcopy + if (blk_index != *num_full_ptr) { + bcopy( (*buf_ptr)+(blk_index), (*buf_ptr)+(blk_index+1), (*num_full_ptr-blk_index)*sizeof(struct bucket) ); + } - (*num_full_ptr)++; // increment only if we're not overwriting - } + (*num_full_ptr)++; // increment only if we're not overwriting + } - // sanity check the values we're about to add - if ((off_t)offset >= jnl->jhdr->size) { - offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size); - } - if (size <= 0) { - panic("jnl: insert_block: bad size in insert_block (%zd)\n", size); - } - - (*buf_ptr)[blk_index].block_num = num; - (*buf_ptr)[blk_index].block_size = size; - (*buf_ptr)[blk_index].jnl_offset = offset; - (*buf_ptr)[blk_index].cksum = cksum; + // sanity check the values we're about to add + if ((off_t)offset >= jnl->jhdr->size) { + offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size); + } + if (size <= 0) { + panic("jnl: insert_block: bad size in insert_block (%zd)\n", size); + } + + (*buf_ptr)[blk_index].block_num = num; + (*buf_ptr)[blk_index].block_size = size; + (*buf_ptr)[blk_index].jnl_offset = offset; + (*buf_ptr)[blk_index].cksum = cksum; - return blk_index; + return blk_index; } static int do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr) { - int num_to_remove, index, i, overwrite, err; - size_t jhdr_size = jnl->jhdr->jhdr_size, new_offset; - off_t overlap, block_start, block_end; - - block_start = block_num*jhdr_size; - block_end = block_start + size; - overwrite = 
(block_num == (*buf_ptr)[blk_index].block_num && size >= (*buf_ptr)[blk_index].block_size); - - // first, eliminate any overlap with the previous entry - if (blk_index != 0 && !overwrite) { - off_t prev_block_start = (*buf_ptr)[blk_index-1].block_num*jhdr_size; - off_t prev_block_end = prev_block_start + (*buf_ptr)[blk_index-1].block_size; - overlap = prev_block_end - block_start; - if (overlap > 0) { - if (overlap % jhdr_size != 0) { - panic("jnl: do_overlap: overlap with previous entry not a multiple of %zd\n", jhdr_size); - } - - // if the previous entry completely overlaps this one, we need to break it into two pieces. - if (prev_block_end > block_end) { - off_t new_num = block_end / jhdr_size; - size_t new_size = prev_block_end - block_end; - - new_offset = (*buf_ptr)[blk_index-1].jnl_offset + (block_end - prev_block_start); + int num_to_remove, index, i, overwrite, err; + size_t jhdr_size = jnl->jhdr->jhdr_size, new_offset; + off_t overlap, block_start, block_end; + + block_start = block_num*jhdr_size; + block_end = block_start + size; + overwrite = (block_num == (*buf_ptr)[blk_index].block_num && size >= (*buf_ptr)[blk_index].block_size); + + // first, eliminate any overlap with the previous entry + if (blk_index != 0 && !overwrite) { + off_t prev_block_start = (*buf_ptr)[blk_index-1].block_num*jhdr_size; + off_t prev_block_end = prev_block_start + (*buf_ptr)[blk_index-1].block_size; + overlap = prev_block_end - block_start; + if (overlap > 0) { + if (overlap % jhdr_size != 0) { + panic("jnl: do_overlap: overlap with previous entry not a multiple of %zd\n", jhdr_size); + } + + // if the previous entry completely overlaps this one, we need to break it into two pieces. + if (prev_block_end > block_end) { + off_t new_num = block_end / jhdr_size; + size_t new_size = prev_block_end - block_end; + + new_offset = (*buf_ptr)[blk_index-1].jnl_offset + (block_end - prev_block_start); - err = insert_block(jnl, buf_ptr, blk_index, new_num, new_size, new_offset, cksum, num_buckets_ptr, num_full_ptr, 0); - if (err < 0) { - panic("jnl: do_overlap: error inserting during pre-overlap\n"); - } - } + err = insert_block(jnl, buf_ptr, blk_index, new_num, new_size, new_offset, cksum, num_buckets_ptr, num_full_ptr, 0); + if (err < 0) { + panic("jnl: do_overlap: error inserting during pre-overlap\n"); + } + } - // Regardless, we need to truncate the previous entry to the beginning of the overlap - (*buf_ptr)[blk_index-1].block_size = block_start - prev_block_start; - (*buf_ptr)[blk_index-1].cksum = 0; // have to blow it away because there's no way to check it + // Regardless, we need to truncate the previous entry to the beginning of the overlap + (*buf_ptr)[blk_index-1].block_size = block_start - prev_block_start; + (*buf_ptr)[blk_index-1].cksum = 0; // have to blow it away because there's no way to check it + } } - } - // then, bail out fast if there's no overlap with the entries that follow - if (!overwrite && block_end <= (off_t)((*buf_ptr)[blk_index].block_num*jhdr_size)) { - return 0; // no overlap, no overwrite - } else if (overwrite && (blk_index + 1 >= *num_full_ptr || block_end <= (off_t)((*buf_ptr)[blk_index+1].block_num*jhdr_size))) { + // then, bail out fast if there's no overlap with the entries that follow + if (!overwrite && block_end <= (off_t)((*buf_ptr)[blk_index].block_num*jhdr_size)) { + return 0; // no overlap, no overwrite + } else if (overwrite && (blk_index + 1 >= *num_full_ptr || block_end <= (off_t)((*buf_ptr)[blk_index+1].block_num*jhdr_size))) { - (*buf_ptr)[blk_index].cksum = 
cksum; // update this - return 1; // simple overwrite - } + (*buf_ptr)[blk_index].cksum = cksum; // update this + return 1; // simple overwrite + } - // Otherwise, find all cases of total and partial overlap. We use the special - // block_num of -2 to designate entries that are completely overlapped and must - // be eliminated. The block_num, size, and jnl_offset of partially overlapped - // entries must be adjusted to keep the array consistent. - index = blk_index; - num_to_remove = 0; - while(index < *num_full_ptr && block_end > (off_t)((*buf_ptr)[index].block_num*jhdr_size)) { - if (block_end >= (off_t)(((*buf_ptr)[index].block_num*jhdr_size + (*buf_ptr)[index].block_size))) { - (*buf_ptr)[index].block_num = -2; // mark this for deletion - num_to_remove++; - } else { - overlap = block_end - (*buf_ptr)[index].block_num*jhdr_size; - if (overlap > 0) { - if (overlap % jhdr_size != 0) { - panic("jnl: do_overlap: overlap of %lld is not multiple of %zd\n", overlap, jhdr_size); - } - - // if we partially overlap this entry, adjust its block number, jnl offset, and size - (*buf_ptr)[index].block_num += (overlap / jhdr_size); // make sure overlap is multiple of jhdr_size, or round up - (*buf_ptr)[index].cksum = 0; + // Otherwise, find all cases of total and partial overlap. We use the special + // block_num of -2 to designate entries that are completely overlapped and must + // be eliminated. The block_num, size, and jnl_offset of partially overlapped + // entries must be adjusted to keep the array consistent. + index = blk_index; + num_to_remove = 0; + while (index < *num_full_ptr && block_end > (off_t)((*buf_ptr)[index].block_num*jhdr_size)) { + if (block_end >= (off_t)(((*buf_ptr)[index].block_num*jhdr_size + (*buf_ptr)[index].block_size))) { + (*buf_ptr)[index].block_num = -2; // mark this for deletion + num_to_remove++; + } else { + overlap = block_end - (*buf_ptr)[index].block_num*jhdr_size; + if (overlap > 0) { + if (overlap % jhdr_size != 0) { + panic("jnl: do_overlap: overlap of %lld is not multiple of %zd\n", overlap, jhdr_size); + } + + // if we partially overlap this entry, adjust its block number, jnl offset, and size + (*buf_ptr)[index].block_num += (overlap / jhdr_size); // make sure overlap is multiple of jhdr_size, or round up + (*buf_ptr)[index].cksum = 0; - new_offset = (*buf_ptr)[index].jnl_offset + overlap; // check for wrap-around - if ((off_t)new_offset >= jnl->jhdr->size) { - new_offset = jhdr_size + (new_offset - jnl->jhdr->size); - } - (*buf_ptr)[index].jnl_offset = new_offset; + new_offset = (*buf_ptr)[index].jnl_offset + overlap; // check for wrap-around + if ((off_t)new_offset >= jnl->jhdr->size) { + new_offset = jhdr_size + (new_offset - jnl->jhdr->size); + } + (*buf_ptr)[index].jnl_offset = new_offset; - (*buf_ptr)[index].block_size -= overlap; // sanity check for negative value - if ((*buf_ptr)[index].block_size <= 0) { - panic("jnl: do_overlap: after overlap, new block size is invalid (%u)\n", (*buf_ptr)[index].block_size); - // return -1; // if above panic is removed, return -1 for error + (*buf_ptr)[index].block_size -= overlap; // sanity check for negative value + if ((*buf_ptr)[index].block_size <= 0) { + panic("jnl: do_overlap: after overlap, new block size is invalid (%u)\n", (*buf_ptr)[index].block_size); + // return -1; // if above panic is removed, return -1 for error + } + } + } - } - - } - index++; - } + index++; + } - // bcopy over any completely overlapped entries, starting at the right (where the above loop broke out) - index--; // start with the 
last index used within the above loop - while(index >= blk_index) { - if ((*buf_ptr)[index].block_num == -2) { - if (index == *num_full_ptr-1) { - (*buf_ptr)[index].block_num = -1; // it's the last item in the table... just mark as free - } else { - bcopy( (*buf_ptr)+(index+1), (*buf_ptr)+(index), (*num_full_ptr - (index + 1)) * sizeof(struct bucket) ); - } - (*num_full_ptr)--; - } - index--; - } + // bcopy over any completely overlapped entries, starting at the right (where the above loop broke out) + index--; // start with the last index used within the above loop + while (index >= blk_index) { + if ((*buf_ptr)[index].block_num == -2) { + if (index == *num_full_ptr-1) { + (*buf_ptr)[index].block_num = -1; // it's the last item in the table... just mark as free + } else { + bcopy( (*buf_ptr)+(index+1), (*buf_ptr)+(index), (*num_full_ptr - (index + 1)) * sizeof(struct bucket) ); + } + (*num_full_ptr)--; + } + index--; + } - // eliminate any stale entries at the end of the table - for(i=*num_full_ptr; i < (*num_full_ptr + num_to_remove); i++) { - (*buf_ptr)[i].block_num = -1; - } + // eliminate any stale entries at the end of the table + for(i = *num_full_ptr; i < (*num_full_ptr + num_to_remove); i++) { + (*buf_ptr)[i].block_num = -1; + } - return 0; // if we got this far, we need to insert the entry into the table (rather than overwrite) + return 0; // if we got this far, we need to insert the entry into the table (rather than overwrite) } // PR-3105942: Coalesce writes to the same block in journal replay @@ -993,90 +1062,90 @@ do_overlap(journal *jnl, struct bucket **buf_ptr, int blk_index, off_t block_num static int add_block(journal *jnl, struct bucket **buf_ptr, off_t block_num, size_t size, __unused size_t offset, int32_t cksum, int *num_buckets_ptr, int *num_full_ptr) { - int blk_index, overwriting; + int blk_index, overwriting; - // on return from lookup_bucket(), blk_index is the index into the table where block_num should be - // inserted (or the index of the elem to overwrite). - blk_index = lookup_bucket( buf_ptr, block_num, *num_full_ptr); + // on return from lookup_bucket(), blk_index is the index into the table where block_num should be + // inserted (or the index of the elem to overwrite). 
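/*
 * The contract this caller assumes of lookup_bucket(), restated as a
 * compact upper-bound binary search over the sorted block numbers:
 * return the matching index (right-most on duplicates), else the index
 * at which the key keeps the array sorted.  find_slot is an
 * illustrative stand-in, not the kernel routine.
 */
#include <stdint.h>

static int
find_slot(const int64_t *nums, int nfull, int64_t key)
{
	int lo = 0, hi = nfull;		/* search the half-open range [lo, hi) */

	while (lo < hi) {
		int mid = lo + (hi - lo) / 2;

		if (nums[mid] <= key)
			lo = mid + 1;		/* step past equal keys: right-most match */
		else
			hi = mid;
	}
	/* lo is now one past the last entry <= key */
	return (lo > 0 && nums[lo - 1] == key) ? lo - 1 : lo;
}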
+ blk_index = lookup_bucket( buf_ptr, block_num, *num_full_ptr); - // check if the index is within bounds (if we're adding this block to the end of - // the table, blk_index will be equal to num_full) - if (blk_index < 0 || blk_index > *num_full_ptr) { - //printf("jnl: add_block: trouble adding block to co_buf\n"); - return -1; - } // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index); + // check if the index is within bounds (if we're adding this block to the end of + // the table, blk_index will be equal to num_full) + if (blk_index < 0 || blk_index > *num_full_ptr) { + //printf("jnl: add_block: trouble adding block to co_buf\n"); + return -1; + } // else printf("jnl: add_block: adding block 0x%llx at i=%d\n", block_num, blk_index); - // Determine whether we're overwriting an existing entry by checking for overlap - overwriting = do_overlap(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr); - if (overwriting < 0) { - return -1; // if we got an error, pass it along - } + // Determine whether we're overwriting an existing entry by checking for overlap + overwriting = do_overlap(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr); + if (overwriting < 0) { + return -1; // if we got an error, pass it along + } - // returns the index, or -1 on error - blk_index = insert_block(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr, overwriting); + // returns the index, or -1 on error + blk_index = insert_block(jnl, buf_ptr, blk_index, block_num, size, offset, cksum, num_buckets_ptr, num_full_ptr, overwriting); - return blk_index; + return blk_index; } static int replay_journal(journal *jnl) { - int i, orig_checksum, checksum, check_block_checksums=0, bad_blocks=0; - size_t ret; - size_t max_bsize = 0; /* protected by block_ptr */ - block_list_header *blhdr; - off_t offset, txn_start_offset=0, blhdr_offset, orig_jnl_start; - char *buff, *block_ptr=NULL; - struct bucket *co_buf; - int num_buckets = STARTING_BUCKETS, num_full, check_past_jnl_end = 1, in_uncharted_territory=0; - uint32_t last_sequence_num = 0; + int i, orig_checksum, checksum, check_block_checksums=0, bad_blocks=0; + size_t ret; + size_t max_bsize = 0; /* protected by block_ptr */ + block_list_header *blhdr; + off_t offset, txn_start_offset=0, blhdr_offset, orig_jnl_start; + char *buff, *block_ptr=NULL; + struct bucket *co_buf; + int num_buckets = STARTING_BUCKETS, num_full, check_past_jnl_end = 1, in_uncharted_territory=0; + uint32_t last_sequence_num = 0; - // wrap the start ptr if it points to the very end of the journal - if (jnl->jhdr->start == jnl->jhdr->size) { + // wrap the start ptr if it points to the very end of the journal + if (jnl->jhdr->start == jnl->jhdr->size) { jnl->jhdr->start = jnl->jhdr->jhdr_size; - } - if (jnl->jhdr->end == jnl->jhdr->size) { + } + if (jnl->jhdr->end == jnl->jhdr->size) { jnl->jhdr->end = jnl->jhdr->jhdr_size; - } + } - if (jnl->jhdr->start == jnl->jhdr->end) { + if (jnl->jhdr->start == jnl->jhdr->end) { return 0; - } + } - orig_jnl_start = jnl->jhdr->start; + orig_jnl_start = jnl->jhdr->start; - // allocate memory for the header_block. we'll read each blhdr into this - if (kmem_alloc(kernel_map, (vm_offset_t *)&buff, jnl->jhdr->blhdr_size)) { + // allocate memory for the header_block. 
we'll read each blhdr into this + if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&buff, jnl->jhdr->blhdr_size)) { printf("jnl: %s: replay_journal: no memory for block buffer! (%d bytes)\n", - jnl->jdev_name, jnl->jhdr->blhdr_size); + jnl->jdev_name, jnl->jhdr->blhdr_size); return -1; - } + } - // allocate memory for the coalesce buffer - if ((MALLOC(co_buf, struct bucket *, num_buckets*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) { - printf("jnl: %s: replay_journal: no memory for coalesce buffer!\n", jnl->jdev_name); - return -1; - } + // allocate memory for the coalesce buffer + if ((MALLOC(co_buf, struct bucket *, num_buckets*sizeof(struct bucket), M_TEMP, M_WAITOK)) == NULL) { + printf("jnl: %s: replay_journal: no memory for coalesce buffer!\n", jnl->jdev_name); + return -1; + } - restart_replay: +restart_replay: - // initialize entries - for(i=0; i < num_buckets; i++) { - co_buf[i].block_num = -1; - } - num_full = 0; // empty at first + // initialize entries + for(i = 0; i < num_buckets; i++) { + co_buf[i].block_num = -1; + } + num_full = 0; // empty at first - printf("jnl: %s: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n", - jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end, jnl->jdev_offset); + printf("jnl: %s: replay_journal: from: %lld to: %lld (joffset 0x%llx)\n", + jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end, jnl->jdev_offset); - while(check_past_jnl_end || jnl->jhdr->start != jnl->jhdr->end) { + while (check_past_jnl_end || jnl->jhdr->start != jnl->jhdr->end) { offset = blhdr_offset = jnl->jhdr->start; ret = read_journal_data(jnl, &offset, buff, jnl->jhdr->blhdr_size); if (ret != (size_t)jnl->jhdr->blhdr_size) { - printf("jnl: %s: replay_journal: Could not read block list header block @ 0x%llx!\n", jnl->jdev_name, offset); - bad_blocks = 1; - goto bad_txn_handling; + printf("jnl: %s: replay_journal: Could not read block list header block @ 0x%llx!\n", jnl->jdev_name, offset); + bad_blocks = 1; + goto bad_txn_handling; } blhdr = (block_list_header *)buff; @@ -1101,101 +1170,101 @@ replay_journal(journal *jnl) // anything // if (checksum != orig_checksum) { - if (check_past_jnl_end && in_uncharted_territory) { + if (check_past_jnl_end && in_uncharted_territory) { - if (blhdr_offset != jnl->jhdr->end) { - printf("jnl: %s: Extra txn replay stopped @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset); - } + if (blhdr_offset != jnl->jhdr->end) { + printf("jnl: %s: Extra txn replay stopped @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset); + } - check_past_jnl_end = 0; - jnl->jhdr->end = blhdr_offset; - continue; - } + check_past_jnl_end = 0; + jnl->jhdr->end = blhdr_offset; + continue; + } - printf("jnl: %s: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n", + printf("jnl: %s: replay_journal: bad block list header @ 0x%llx (checksum 0x%x != 0x%x)\n", jnl->jdev_name, blhdr_offset, orig_checksum, checksum); - if (blhdr_offset == orig_jnl_start) { - // if there's nothing in the journal at all, just bail out altogether. - goto bad_replay; - } + if (blhdr_offset == orig_jnl_start) { + // if there's nothing in the journal at all, just bail out altogether. 
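/*
 * A sketch of the recovery shape around this loop, under the
 * simplifying assumption of a single clamped retry: a bad block-list
 * header is fatal only when no trustworthy transaction start was ever
 * recorded; otherwise replay rewinds and stops short of the bad
 * transaction.  replay_pass() is a hypothetical stand-in for one pass
 * of the loop below, not a kernel function.
 */
#include <stdint.h>

extern int replay_pass(int64_t *start, int64_t *end);	/* hypothetical */

static int
replay_with_clamped_retry(int64_t orig_start, int64_t *start, int64_t *end,
	int64_t good_txn_start)
{
	int rc = replay_pass(start, end);

	if (rc != 0 && good_txn_start != 0) {
		*start = orig_start;		/* rewind to where replay began... */
		*end = good_txn_start;		/* ...but stop before the bad txn */
		rc = replay_pass(start, end);
	}
	return rc;			/* nonzero corresponds to bad_replay */
}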
+ goto bad_replay; + } - bad_blocks = 1; - goto bad_txn_handling; + bad_blocks = 1; + goto bad_txn_handling; } if ( (last_sequence_num != 0) - && (blhdr->binfo[0].u.bi.b.sequence_num != 0) - && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num) - && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num+1)) { + && (blhdr->binfo[0].u.bi.b.sequence_num != 0) + && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num) + && (blhdr->binfo[0].u.bi.b.sequence_num != last_sequence_num+1)) { - txn_start_offset = jnl->jhdr->end = blhdr_offset; + txn_start_offset = jnl->jhdr->end = blhdr_offset; - if (check_past_jnl_end) { - check_past_jnl_end = 0; - printf("jnl: %s: 2: extra replay stopped @ %lld / 0x%llx (seq %d < %d)\n", - jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num); - continue; - } + if (check_past_jnl_end) { + check_past_jnl_end = 0; + printf("jnl: %s: 2: extra replay stopped @ %lld / 0x%llx (seq %d < %d)\n", + jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num); + continue; + } - printf("jnl: %s: txn sequence numbers out of order in txn @ %lld / %llx! (%d < %d)\n", - jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num); - bad_blocks = 1; - goto bad_txn_handling; + printf("jnl: %s: txn sequence numbers out of order in txn @ %lld / %llx! (%d < %d)\n", + jnl->jdev_name, blhdr_offset, blhdr_offset, blhdr->binfo[0].u.bi.b.sequence_num, last_sequence_num); + bad_blocks = 1; + goto bad_txn_handling; } last_sequence_num = blhdr->binfo[0].u.bi.b.sequence_num; if (blhdr_offset >= jnl->jhdr->end && jnl->jhdr->start <= jnl->jhdr->end) { - if (last_sequence_num == 0) { - check_past_jnl_end = 0; - printf("jnl: %s: pre-sequence-num-enabled txn's - can not go further than end (%lld %lld).\n", - jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end); - if (jnl->jhdr->start != jnl->jhdr->end) { - jnl->jhdr->start = jnl->jhdr->end; + if (last_sequence_num == 0) { + check_past_jnl_end = 0; + printf("jnl: %s: pre-sequence-num-enabled txn's - can not go further than end (%lld %lld).\n", + jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end); + if (jnl->jhdr->start != jnl->jhdr->end) { + jnl->jhdr->start = jnl->jhdr->end; + } + continue; } - continue; - } - printf("jnl: %s: examining extra transactions starting @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset); + printf("jnl: %s: examining extra transactions starting @ %lld / 0x%llx\n", jnl->jdev_name, blhdr_offset, blhdr_offset); } if ( blhdr->max_blocks <= 0 || blhdr->max_blocks > (jnl->jhdr->size/jnl->jhdr->jhdr_size) - || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) { - printf("jnl: %s: replay_journal: bad looking journal entry: max: %d num: %d\n", - jnl->jdev_name, blhdr->max_blocks, blhdr->num_blocks); - bad_blocks = 1; - goto bad_txn_handling; + || blhdr->num_blocks <= 0 || blhdr->num_blocks > blhdr->max_blocks) { + printf("jnl: %s: replay_journal: bad looking journal entry: max: %d num: %d\n", + jnl->jdev_name, blhdr->max_blocks, blhdr->num_blocks); + bad_blocks = 1; + goto bad_txn_handling; } max_bsize = 0; - for(i=1; i < blhdr->num_blocks; i++) { + for (i = 1; i < blhdr->num_blocks; i++) { if (blhdr->binfo[i].bnum < 0 && blhdr->binfo[i].bnum != (off_t)-1) { - printf("jnl: %s: replay_journal: bogus block number 0x%llx\n", jnl->jdev_name, blhdr->binfo[i].bnum); - bad_blocks = 1; - goto bad_txn_handling; + printf("jnl: %s: replay_journal: bogus block number 0x%llx\n", 
jnl->jdev_name, blhdr->binfo[i].bnum); + bad_blocks = 1; + goto bad_txn_handling; } if ((size_t)blhdr->binfo[i].u.bi.bsize > max_bsize) { - max_bsize = blhdr->binfo[i].u.bi.bsize; + max_bsize = blhdr->binfo[i].u.bi.bsize; } } if (blhdr->flags & BLHDR_CHECK_CHECKSUMS) { - check_block_checksums = 1; - if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) { - goto bad_replay; - } + check_block_checksums = 1; + if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) { + goto bad_replay; + } } else { - block_ptr = NULL; + block_ptr = NULL; } if (blhdr->flags & BLHDR_FIRST_HEADER) { - txn_start_offset = blhdr_offset; + txn_start_offset = blhdr_offset; } //printf("jnl: replay_journal: adding %d blocks in journal entry @ 0x%llx to co_buf\n", // blhdr->num_blocks-1, jnl->jhdr->start); bad_blocks = 0; - for(i=1; i < blhdr->num_blocks; i++) { + for (i = 1; i < blhdr->num_blocks; i++) { int size, ret_val; off_t number; @@ -1204,48 +1273,48 @@ replay_journal(journal *jnl) // don't add "killed" blocks if (number == (off_t)-1) { - //printf("jnl: replay_journal: skipping killed fs block (index %d)\n", i); + //printf("jnl: replay_journal: skipping killed fs block (index %d)\n", i); } else { - if (check_block_checksums) { - int32_t disk_cksum; - off_t block_offset; + if (check_block_checksums) { + int32_t disk_cksum; + off_t block_offset; - block_offset = offset; + block_offset = offset; - // read the block so we can check the checksum - ret = read_journal_data(jnl, &block_offset, block_ptr, size); - if (ret != (size_t)size) { - printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset); - bad_blocks = 1; - goto bad_txn_handling; - } + // read the block so we can check the checksum + ret = read_journal_data(jnl, &block_offset, block_ptr, size); + if (ret != (size_t)size) { + printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset); + bad_blocks = 1; + goto bad_txn_handling; + } - disk_cksum = calc_checksum(block_ptr, size); - - // there is no need to swap the checksum from disk because - // it got swapped when the blhdr was read in. - if (blhdr->binfo[i].u.bi.b.cksum != 0 && disk_cksum != blhdr->binfo[i].u.bi.b.cksum) { - printf("jnl: %s: txn starting at %lld (%lld) @ index %3d bnum %lld (%d) with disk cksum != blhdr cksum (0x%.8x 0x%.8x)\n", - jnl->jdev_name, txn_start_offset, blhdr_offset, i, number, size, disk_cksum, blhdr->binfo[i].u.bi.b.cksum); - printf("jnl: 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x\n", - *(int *)&block_ptr[0*sizeof(int)], *(int *)&block_ptr[1*sizeof(int)], *(int *)&block_ptr[2*sizeof(int)], *(int *)&block_ptr[3*sizeof(int)], - *(int *)&block_ptr[4*sizeof(int)], *(int *)&block_ptr[5*sizeof(int)], *(int *)&block_ptr[6*sizeof(int)], *(int *)&block_ptr[7*sizeof(int)]); - - bad_blocks = 1; - goto bad_txn_handling; + disk_cksum = calc_checksum(block_ptr, size); + + // there is no need to swap the checksum from disk because + // it got swapped when the blhdr was read in. 
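/*
 * A user-space restatement of the check performed just below, reusing
 * the calc_checksum() algorithm shown earlier in this file (including
 * its signed arithmetic).  A recorded checksum of 0 means "none
 * stored", so only a nonzero mismatch condemns the block;
 * replayed_block_ok is an illustrative name.
 */
#include <stdint.h>

static int32_t
jnl_checksum(const char *ptr, int len)
{
	int32_t cksum = 0;
	int	i;

	for (i = 0; i < len; i++, ptr++)
		cksum = (cksum << 8) ^ (cksum + *(const unsigned char *)ptr);
	return ~cksum;
}

static int
replayed_block_ok(const char *blk, int len, int32_t recorded_cksum)
{
	return recorded_cksum == 0 || jnl_checksum(blk, len) == recorded_cksum;
}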
+ if (blhdr->binfo[i].u.bi.b.cksum != 0 && disk_cksum != blhdr->binfo[i].u.bi.b.cksum) { + printf("jnl: %s: txn starting at %lld (%lld) @ index %3d bnum %lld (%d) with disk cksum != blhdr cksum (0x%.8x 0x%.8x)\n", + jnl->jdev_name, txn_start_offset, blhdr_offset, i, number, size, disk_cksum, blhdr->binfo[i].u.bi.b.cksum); + printf("jnl: 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x 0x%.8x\n", + *(int *)&block_ptr[0*sizeof(int)], *(int *)&block_ptr[1*sizeof(int)], *(int *)&block_ptr[2*sizeof(int)], *(int *)&block_ptr[3*sizeof(int)], + *(int *)&block_ptr[4*sizeof(int)], *(int *)&block_ptr[5*sizeof(int)], *(int *)&block_ptr[6*sizeof(int)], *(int *)&block_ptr[7*sizeof(int)]); + + bad_blocks = 1; + goto bad_txn_handling; + } } - } - // add this bucket to co_buf, coalescing where possible - // printf("jnl: replay_journal: adding block 0x%llx\n", number); - ret_val = add_block(jnl, &co_buf, number, size, (size_t) offset, blhdr->binfo[i].u.bi.b.cksum, &num_buckets, &num_full); + // add this bucket to co_buf, coalescing where possible + // printf("jnl: replay_journal: adding block 0x%llx\n", number); + ret_val = add_block(jnl, &co_buf, number, size, (size_t) offset, blhdr->binfo[i].u.bi.b.cksum, &num_buckets, &num_full); - if (ret_val == -1) { - printf("jnl: %s: replay_journal: trouble adding block to co_buf\n", jnl->jdev_name); - goto bad_replay; - } // else printf("jnl: replay_journal: added block 0x%llx at i=%d\n", number); + if (ret_val == -1) { + printf("jnl: %s: replay_journal: trouble adding block to co_buf\n", jnl->jdev_name); + goto bad_replay; + } // else printf("jnl: replay_journal: added block 0x%llx at i=%d\n", number); } // increment offset @@ -1256,28 +1325,28 @@ replay_journal(journal *jnl) // into account // if (offset >= jnl->jhdr->size) { - offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size); + offset = jnl->jhdr->jhdr_size + (offset - jnl->jhdr->size); } } if (block_ptr) { - kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize); - block_ptr = NULL; + kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize); + block_ptr = NULL; } - bad_txn_handling: +bad_txn_handling: if (bad_blocks) { - if (txn_start_offset == 0) { - printf("jnl: %s: no known good txn start offset! aborting journal replay.\n", jnl->jdev_name); - goto bad_replay; - } + if (txn_start_offset == 0) { + printf("jnl: %s: no known good txn start offset! aborting journal replay.\n", jnl->jdev_name); + goto bad_replay; + } - jnl->jhdr->start = orig_jnl_start; - jnl->jhdr->end = txn_start_offset; - check_past_jnl_end = 0; - last_sequence_num = 0; - printf("jnl: %s: restarting journal replay (%lld - %lld)!\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end); - goto restart_replay; + jnl->jhdr->start = orig_jnl_start; + jnl->jhdr->end = txn_start_offset; + check_past_jnl_end = 0; + last_sequence_num = 0; + printf("jnl: %s: restarting journal replay (%lld - %lld)!\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end); + goto restart_replay; } jnl->jhdr->start += blhdr->bytes_used; @@ -1287,98 +1356,98 @@ replay_journal(journal *jnl) } if (jnl->jhdr->start == jnl->jhdr->end) { - in_uncharted_territory = 1; + in_uncharted_territory = 1; } - } + } - if (jnl->jhdr->start != jnl->jhdr->end) { - printf("jnl: %s: start %lld != end %lld. resetting end.\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end); - jnl->jhdr->end = jnl->jhdr->start; - } + if (jnl->jhdr->start != jnl->jhdr->end) { + printf("jnl: %s: start %lld != end %lld. 
resetting end.\n", jnl->jdev_name, jnl->jhdr->start, jnl->jhdr->end); + jnl->jhdr->end = jnl->jhdr->start; + } - //printf("jnl: replay_journal: replaying %d blocks\n", num_full); + //printf("jnl: replay_journal: replaying %d blocks\n", num_full); - /* - * make sure it's at least one page in size, so - * start max_bsize at PAGE_SIZE - */ - for (i = 0, max_bsize = PAGE_SIZE; i < num_full; i++) { - - if (co_buf[i].block_num == (off_t)-1) - continue; + /* + * make sure it's at least one page in size, so + * start max_bsize at PAGE_SIZE + */ + for (i = 0, max_bsize = PAGE_SIZE; i < num_full; i++) { - if (co_buf[i].block_size > max_bsize) - max_bsize = co_buf[i].block_size; - } - /* - * round max_bsize up to the nearest PAGE_SIZE multiple - */ - if (max_bsize & (PAGE_SIZE - 1)) { - max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1); - } + if (co_buf[i].block_num == (off_t)-1) + continue; - if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) { - goto bad_replay; - } + if (co_buf[i].block_size > max_bsize) + max_bsize = co_buf[i].block_size; + } + /* + * round max_bsize up to the nearest PAGE_SIZE multiple + */ + if (max_bsize & (PAGE_SIZE - 1)) { + max_bsize = (max_bsize + PAGE_SIZE) & ~(PAGE_SIZE - 1); + } + + if (kmem_alloc(kernel_map, (vm_offset_t *)&block_ptr, max_bsize)) { + goto bad_replay; + } - // Replay the coalesced entries in the co-buf - for(i=0; i < num_full; i++) { - size_t size = co_buf[i].block_size; - off_t jnl_offset = (off_t) co_buf[i].jnl_offset; - off_t number = co_buf[i].block_num; + // Replay the coalesced entries in the co-buf + for(i = 0; i < num_full; i++) { + size_t size = co_buf[i].block_size; + off_t jnl_offset = (off_t) co_buf[i].jnl_offset; + off_t number = co_buf[i].block_num; - // printf("replaying co_buf[%d]: block 0x%llx, size 0x%x, jnl_offset 0x%llx\n", i, co_buf[i].block_num, - // co_buf[i].block_size, co_buf[i].jnl_offset); + // printf("replaying co_buf[%d]: block 0x%llx, size 0x%x, jnl_offset 0x%llx\n", i, co_buf[i].block_num, + // co_buf[i].block_size, co_buf[i].jnl_offset); - if (number == (off_t)-1) { - // printf("jnl: replay_journal: skipping killed fs block\n"); - } else { + if (number == (off_t)-1) { + // printf("jnl: replay_journal: skipping killed fs block\n"); + } else { - // do journal read, and set the phys. block - ret = read_journal_data(jnl, &jnl_offset, block_ptr, size); - if (ret != size) { - printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset); - goto bad_replay; - } + // do journal read, and set the phys. 
block + ret = read_journal_data(jnl, &jnl_offset, block_ptr, size); + if (ret != size) { + printf("jnl: %s: replay_journal: Could not read journal entry data @ offset 0x%llx!\n", jnl->jdev_name, offset); + goto bad_replay; + } - if (update_fs_block(jnl, block_ptr, number, size) != 0) { - goto bad_replay; - } + if (update_fs_block(jnl, block_ptr, number, size) != 0) { + goto bad_replay; + } + } } - } + + // done replaying; update jnl header + if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num) != 0) { + goto bad_replay; + } - // done replaying; update jnl header - if (write_journal_header(jnl, 1) != 0) { - goto bad_replay; - } - - printf("jnl: %s: journal replay done.\n", jnl->jdev_name); + printf("jnl: %s: journal replay done.\n", jnl->jdev_name); - // free block_ptr - if (block_ptr) { - kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize); - block_ptr = NULL; - } + // free block_ptr + if (block_ptr) { + kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize); + block_ptr = NULL; + } - // free the coalesce buffer - FREE(co_buf, M_TEMP); - co_buf = NULL; + // free the coalesce buffer + FREE(co_buf, M_TEMP); + co_buf = NULL; - kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size); - return 0; + kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size); + return 0; - bad_replay: - if (block_ptr) { +bad_replay: + if (block_ptr) { kmem_free(kernel_map, (vm_offset_t)block_ptr, max_bsize); - } - if (co_buf) { - FREE(co_buf, M_TEMP); - } - kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size); + } + if (co_buf) { + FREE(co_buf, M_TEMP); + } + kmem_free(kernel_map, (vm_offset_t)buff, jnl->jhdr->blhdr_size); - return -1; + return -1; } @@ -1413,11 +1482,11 @@ size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz) } } - // size up the transaction buffer... can't be larger than the number - // of blocks that can fit in a block_list_header block. - if (tbuffer_size == 0) { + // size up the transaction buffer... can't be larger than the number + // of blocks that can fit in a block_list_header block. 
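/*
 * The sizing rules applied at the end of this function, pulled out as
 * a standalone helper so the clamps are visible in one place.  Assumes
 * phys_blksz is a power of two, as the mask-based rounding requires;
 * blhdr_size_for is an illustrative name.
 */
#include <stdint.h>

static uint32_t
blhdr_size_for(uint32_t tbuffer_size, uint32_t jhdr_size,
	uint32_t phys_blksz, uint32_t binfo_size /* sizeof(block_info) */)
{
	/* one block_info slot per journal block the tbuffer can hold */
	uint32_t blhdr_size = (tbuffer_size / jhdr_size) * binfo_size;

	if (blhdr_size < phys_blksz) {
		blhdr_size = phys_blksz;	/* at least one device block */
	} else if (blhdr_size % phys_blksz) {
		/* round up to an even multiple of the physical block size */
		blhdr_size = (blhdr_size + (phys_blksz - 1)) & ~(phys_blksz - 1);
	}
	return blhdr_size;
}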
+ if (tbuffer_size == 0) { jnl->tbuffer_size = def_tbuffer_size; - } else { + } else { // make sure that the specified tbuffer_size isn't too small if (tbuffer_size < jnl->jhdr->blhdr_size * 2) { tbuffer_size = jnl->jhdr->blhdr_size * 2; @@ -1428,23 +1497,23 @@ size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz) } jnl->tbuffer_size = tbuffer_size; - } + } - if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) { + if (jnl->tbuffer_size > (jnl->jhdr->size / 2)) { jnl->tbuffer_size = (jnl->jhdr->size / 2); - } + } - if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) { + if (jnl->tbuffer_size > MAX_TRANSACTION_BUFFER_SIZE) { jnl->tbuffer_size = MAX_TRANSACTION_BUFFER_SIZE; - } + } - jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info); - if (jnl->jhdr->blhdr_size < phys_blksz) { - jnl->jhdr->blhdr_size = phys_blksz; - } else if ((jnl->jhdr->blhdr_size % phys_blksz) != 0) { + jnl->jhdr->blhdr_size = (jnl->tbuffer_size / jnl->jhdr->jhdr_size) * sizeof(block_info); + if (jnl->jhdr->blhdr_size < phys_blksz) { + jnl->jhdr->blhdr_size = phys_blksz; + } else if ((jnl->jhdr->blhdr_size % phys_blksz) != 0) { // have to round up so we're an even multiple of the physical block size jnl->jhdr->blhdr_size = (jnl->jhdr->blhdr_size + (phys_blksz - 1)) & ~(phys_blksz - 1); - } + } } @@ -1452,96 +1521,99 @@ size_up_tbuffer(journal *jnl, int tbuffer_size, int phys_blksz) static void get_io_info(struct vnode *devvp, size_t phys_blksz, journal *jnl, struct vfs_context *context) { - off_t readblockcnt; - off_t writeblockcnt; - off_t readmaxcnt=0, tmp_readmaxcnt; - off_t writemaxcnt=0, tmp_writemaxcnt; - off_t readsegcnt, writesegcnt; - int32_t features; - - if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&features, 0, context) == 0) { - if (features & DK_FEATURE_FORCE_UNIT_ACCESS) { - const char *name = vnode_name(devvp); - jnl->flags |= JOURNAL_DO_FUA_WRITES; - printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name ? name : "no-name-dev", features); + off_t readblockcnt; + off_t writeblockcnt; + off_t readmaxcnt=0, tmp_readmaxcnt; + off_t writemaxcnt=0, tmp_writemaxcnt; + off_t readsegcnt, writesegcnt; + int32_t features; + + if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&features, 0, context) == 0) { + if (features & DK_FEATURE_FORCE_UNIT_ACCESS) { + const char *name = vnode_name(devvp); + jnl->flags |= JOURNAL_DO_FUA_WRITES; + printf("jnl: %s: enabling FUA writes (features 0x%x)\n", name ? name : "no-name-dev", features); + } + if (features & DK_FEATURE_UNMAP) { + jnl->flags |= JOURNAL_USE_UNMAP; + } } - } - // - // First check the max read size via several different mechanisms... - // - VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD, (caddr_t)&readmaxcnt, 0, context); + // + // First check the max read size via several different mechanisms... 
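/*
 * The read and write limits computed below reduce to "take the smallest
 * positive bound the device reports, default to 128 KB if nothing was
 * reported, and cap at UINT32_MAX".  A minimal restatement of that
 * reduction (sketch only; the helper name is hypothetical):
 */
#if 0	/* illustrative sketch only */
	static uint64_t
	clamp_io_limit(uint64_t bytecnt, uint64_t blockcnt, uint64_t blksz,
		       uint64_t segcnt, uint64_t pagesz)
	{
		uint64_t max = bytecnt;			/* DKIOCGETMAXBYTECOUNT* */

		if (blockcnt > 0 && (max == 0 || blockcnt * blksz < max))
			max = blockcnt * blksz;		/* DKIOCGETMAXBLOCKCOUNT* */
		if (segcnt > 0 && segcnt * pagesz < max)
			max = segcnt * pagesz;		/* DKIOCGETMAXSEGMENTCOUNT* */
		if (max == 0)
			max = 128 * 1024;		/* conservative default */
		if (max > UINT32_MAX)
			max = UINT32_MAX;
		return max;
	}
#endif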
+ // + VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTREAD, (caddr_t)&readmaxcnt, 0, context); - if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD, (caddr_t)&readblockcnt, 0, context) == 0) { - tmp_readmaxcnt = readblockcnt * phys_blksz; - if (readmaxcnt == 0 || (readblockcnt > 0 && tmp_readmaxcnt < readmaxcnt)) { - readmaxcnt = tmp_readmaxcnt; - } - } + if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTREAD, (caddr_t)&readblockcnt, 0, context) == 0) { + tmp_readmaxcnt = readblockcnt * phys_blksz; + if (readmaxcnt == 0 || (readblockcnt > 0 && tmp_readmaxcnt < readmaxcnt)) { + readmaxcnt = tmp_readmaxcnt; + } + } - if (VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD, (caddr_t)&readsegcnt, 0, context)) { - readsegcnt = 0; - } + if (VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTREAD, (caddr_t)&readsegcnt, 0, context)) { + readsegcnt = 0; + } - if (readsegcnt > 0 && (readsegcnt * PAGE_SIZE) < readmaxcnt) { - readmaxcnt = readsegcnt * PAGE_SIZE; - } + if (readsegcnt > 0 && (readsegcnt * PAGE_SIZE) < readmaxcnt) { + readmaxcnt = readsegcnt * PAGE_SIZE; + } - if (readmaxcnt == 0) { - readmaxcnt = 128 * 1024; - } else if (readmaxcnt > UINT32_MAX) { - readmaxcnt = UINT32_MAX; - } + if (readmaxcnt == 0) { + readmaxcnt = 128 * 1024; + } else if (readmaxcnt > UINT32_MAX) { + readmaxcnt = UINT32_MAX; + } - // - // Now check the max writes size via several different mechanisms... - // - VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE, (caddr_t)&writemaxcnt, 0, context); + // + // Now check the max writes size via several different mechanisms... + // + VNOP_IOCTL(devvp, DKIOCGETMAXBYTECOUNTWRITE, (caddr_t)&writemaxcnt, 0, context); - if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE, (caddr_t)&writeblockcnt, 0, context) == 0) { - tmp_writemaxcnt = writeblockcnt * phys_blksz; - if (writemaxcnt == 0 || (writeblockcnt > 0 && tmp_writemaxcnt < writemaxcnt)) { - writemaxcnt = tmp_writemaxcnt; - } - } + if (VNOP_IOCTL(devvp, DKIOCGETMAXBLOCKCOUNTWRITE, (caddr_t)&writeblockcnt, 0, context) == 0) { + tmp_writemaxcnt = writeblockcnt * phys_blksz; + if (writemaxcnt == 0 || (writeblockcnt > 0 && tmp_writemaxcnt < writemaxcnt)) { + writemaxcnt = tmp_writemaxcnt; + } + } - if (VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE, (caddr_t)&writesegcnt, 0, context)) { - writesegcnt = 0; - } + if (VNOP_IOCTL(devvp, DKIOCGETMAXSEGMENTCOUNTWRITE, (caddr_t)&writesegcnt, 0, context)) { + writesegcnt = 0; + } - if (writesegcnt > 0 && (writesegcnt * PAGE_SIZE) < writemaxcnt) { - writemaxcnt = writesegcnt * PAGE_SIZE; - } + if (writesegcnt > 0 && (writesegcnt * PAGE_SIZE) < writemaxcnt) { + writemaxcnt = writesegcnt * PAGE_SIZE; + } - if (writemaxcnt == 0) { - writemaxcnt = 128 * 1024; - } else if (writemaxcnt > UINT32_MAX) { - writemaxcnt = UINT32_MAX; - } + if (writemaxcnt == 0) { + writemaxcnt = 128 * 1024; + } else if (writemaxcnt > UINT32_MAX) { + writemaxcnt = UINT32_MAX; + } - jnl->max_read_size = readmaxcnt; - jnl->max_write_size = writemaxcnt; - // printf("jnl: %s: max read/write: %lld k / %lld k\n", - // jnl->jdev_name ? jnl->jdev_name : "unknown", - // jnl->max_read_size/1024, jnl->max_write_size/1024); + jnl->max_read_size = readmaxcnt; + jnl->max_write_size = writemaxcnt; + // printf("jnl: %s: max read/write: %lld k / %lld k\n", + // jnl->jdev_name ? 
jnl->jdev_name : "unknown", + // jnl->max_read_size/1024, jnl->max_write_size/1024); } static const char * get_jdev_name(struct vnode *jvp) { - const char *jdev_name; + const char *jdev_name; - jdev_name = vnode_name(jvp); - if (jdev_name == NULL) { - jdev_name = vfs_addname("unknown-dev", strlen("unknown-dev"), 0, 0); - } else { - // this just bumps the refcount on the name so we have our own copy - jdev_name = vfs_addname(jdev_name, strlen(jdev_name), 0, 0); - } + jdev_name = vnode_name(jvp); + if (jdev_name == NULL) { + jdev_name = vfs_addname("unknown-dev", strlen("unknown-dev"), 0, 0); + } else { + // this just bumps the refcount on the name so we have our own copy + jdev_name = vfs_addname(jdev_name, strlen(jdev_name), 0, 0); + } - return jdev_name; + return jdev_name; } @@ -1556,143 +1628,167 @@ journal_create(struct vnode *jvp, void (*flush)(void *arg), void *arg) { - journal *jnl; - uint32_t phys_blksz, new_txn_base; - struct vfs_context context; - const char *jdev_name; + journal *jnl; + uint32_t phys_blksz, new_txn_base; + u_int32_t min_size; + struct vfs_context context; + const char *jdev_name; + /* + * Cap the journal max size to 2GB. On HFS, it will attempt to occupy + * a full allocation block if the current size is smaller than the allocation + * block on which it resides. Once we hit the exabyte filesystem range, then + * it will use 2GB allocation blocks. As a result, make the cap 2GB. + */ + context.vc_thread = current_thread(); + context.vc_ucred = FSCRED; - context.vc_thread = current_thread(); - context.vc_ucred = FSCRED; + jdev_name = get_jdev_name(jvp); - jdev_name = get_jdev_name(jvp); + /* Get the real physical block size. */ + if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) { + return NULL; + } - /* Get the real physical block size. 
*/ - if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) { - return NULL; - } + if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) { + printf("jnl: create: journal size %lld looks bogus.\n", journal_size); + return NULL; + } - if (journal_size < (256*1024) || journal_size > (1024*1024*1024)) { - printf("jnl: create: journal size %lld looks bogus.\n", journal_size); - return NULL; - } + min_size = phys_blksz * (phys_blksz / sizeof(block_info)); + /* Reject journals that are too small given the sector size of the device */ + if (journal_size < min_size) { + printf("jnl: create: journal size (%lld) too small given sector size of (%u)\n", + journal_size, phys_blksz); + return NULL; + } - if (phys_blksz > min_fs_blksz) { + if (phys_blksz > min_fs_blksz) { printf("jnl: %s: create: error: phys blksize %u bigger than min fs blksize %zd\n", - jdev_name, phys_blksz, min_fs_blksz); + jdev_name, phys_blksz, min_fs_blksz); return NULL; - } + } - if ((journal_size % phys_blksz) != 0) { + if ((journal_size % phys_blksz) != 0) { printf("jnl: %s: create: journal size 0x%llx is not an even multiple of block size 0x%ux\n", - jdev_name, journal_size, phys_blksz); + jdev_name, journal_size, phys_blksz); return NULL; - } + } - MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK); - memset(jnl, 0, sizeof(*jnl)); + MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK); + memset(jnl, 0, sizeof(*jnl)); - jnl->jdev = jvp; - jnl->jdev_offset = offset; - jnl->fsdev = fsvp; - jnl->flush = flush; - jnl->flush_arg = arg; - jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK); - jnl->jdev_name = jdev_name; - lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr); + jnl->jdev = jvp; + jnl->jdev_offset = offset; + jnl->fsdev = fsvp; + jnl->flush = flush; + jnl->flush_arg = arg; + jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK); + jnl->jdev_name = jdev_name; + lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr); - get_io_info(jvp, phys_blksz, jnl, &context); + get_io_info(jvp, phys_blksz, jnl, &context); - if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) { - printf("jnl: %s: create: could not allocate space for header buffer (%u bytes)\n", jdev_name, phys_blksz); - goto bad_kmem_alloc; - } - jnl->header_buf_size = phys_blksz; + if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) { + printf("jnl: %s: create: could not allocate space for header buffer (%u bytes)\n", jdev_name, phys_blksz); + goto bad_kmem_alloc; + } + jnl->header_buf_size = phys_blksz; - jnl->jhdr = (journal_header *)jnl->header_buf; - memset(jnl->jhdr, 0, sizeof(journal_header)); + jnl->jhdr = (journal_header *)jnl->header_buf; + memset(jnl->jhdr, 0, sizeof(journal_header)); - // we have to set this up here so that do_journal_io() will work - jnl->jhdr->jhdr_size = phys_blksz; + // we have to set this up here so that do_journal_io() will work + jnl->jhdr->jhdr_size = phys_blksz; - // - // We try and read the journal header to see if there is already one - // out there. If there is, it's possible that it has transactions - // in it that we might replay if we happen to pick a sequence number - // that is a little less than the old one, there is a crash and the - // last txn written ends right at the start of a txn from the previous - // incarnation of this file system. If all that happens we would - // replay the transactions from the old file system and that would - // destroy your disk. 
Although it is extremely unlikely for all those - // conditions to happen, the probability is non-zero and the result is - // severe - you lose your file system. Therefore if we find a valid - // journal header and the sequence number is non-zero we write junk - // over the entire journal so that there is no way we will encounter - // any old transactions. This is slow but should be a rare event - // since most tools erase the journal. - // - if ( read_journal_header(jnl, jnl->jhdr, phys_blksz) == phys_blksz - && jnl->jhdr->magic == JOURNAL_HEADER_MAGIC - && jnl->jhdr->sequence_num != 0) { + // + // We try and read the journal header to see if there is already one + // out there. If there is, it's possible that it has transactions + // in it that we might replay if we happen to pick a sequence number + // that is a little less than the old one, there is a crash and the + // last txn written ends right at the start of a txn from the previous + // incarnation of this file system. If all that happens we would + // replay the transactions from the old file system and that would + // destroy your disk. Although it is extremely unlikely for all those + // conditions to happen, the probability is non-zero and the result is + // severe - you lose your file system. Therefore if we find a valid + // journal header and the sequence number is non-zero we write junk + // over the entire journal so that there is no way we will encounter + // any old transactions. This is slow but should be a rare event + // since most tools erase the journal. + // + if ( read_journal_header(jnl, jnl->jhdr, phys_blksz) == phys_blksz + && jnl->jhdr->magic == JOURNAL_HEADER_MAGIC + && jnl->jhdr->sequence_num != 0) { - new_txn_base = (jnl->jhdr->sequence_num + (journal_size / phys_blksz) + (random() % 16384)) & 0x00ffffff; - printf("jnl: create: avoiding old sequence number 0x%x (0x%x)\n", jnl->jhdr->sequence_num, new_txn_base); + new_txn_base = (jnl->jhdr->sequence_num + (journal_size / phys_blksz) + (random() % 16384)) & 0x00ffffff; + printf("jnl: create: avoiding old sequence number 0x%x (0x%x)\n", jnl->jhdr->sequence_num, new_txn_base); #if 0 - int i; - off_t pos=0; + int i; + off_t pos=0; - for(i=1; i < journal_size / phys_blksz; i++) { - pos = i*phys_blksz; + for(i = 1; i < journal_size / phys_blksz; i++) { + pos = i*phys_blksz; - // we don't really care what data we write just so long - // as it's not a valid transaction header. since we have - // the header_buf sitting around we'll use that. - write_journal_data(jnl, &pos, jnl->header_buf, phys_blksz); - } - printf("jnl: create: done clearing journal (i=%d)\n", i); + // we don't really care what data we write just so long + // as it's not a valid transaction header. since we have + // the header_buf sitting around we'll use that. 
+ write_journal_data(jnl, &pos, jnl->header_buf, phys_blksz); + } + printf("jnl: create: done clearing journal (i=%d)\n", i); #endif - } else { - new_txn_base = random() & 0x00ffffff; - } + } else { + new_txn_base = random() & 0x00ffffff; + } - memset(jnl->header_buf, 0, phys_blksz); + memset(jnl->header_buf, 0, phys_blksz); - jnl->jhdr->magic = JOURNAL_HEADER_MAGIC; - jnl->jhdr->endian = ENDIAN_MAGIC; - jnl->jhdr->start = phys_blksz; // start at block #1, block #0 is for the jhdr itself - jnl->jhdr->end = phys_blksz; - jnl->jhdr->size = journal_size; - jnl->jhdr->jhdr_size = phys_blksz; - size_up_tbuffer(jnl, tbuffer_size, phys_blksz); - - jnl->active_start = jnl->jhdr->start; - - // XXXdbg - for testing you can force the journal to wrap around - // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3); - // jnl->jhdr->end = jnl->jhdr->size - (phys_blksz*3); + jnl->jhdr->magic = JOURNAL_HEADER_MAGIC; + jnl->jhdr->endian = ENDIAN_MAGIC; + jnl->jhdr->start = phys_blksz; // start at block #1, block #0 is for the jhdr itself + jnl->jhdr->end = phys_blksz; + jnl->jhdr->size = journal_size; + jnl->jhdr->jhdr_size = phys_blksz; + size_up_tbuffer(jnl, tbuffer_size, phys_blksz); + + jnl->active_start = jnl->jhdr->start; + + // XXXdbg - for testing you can force the journal to wrap around + // jnl->jhdr->start = jnl->jhdr->size - (phys_blksz*3); + // jnl->jhdr->end = jnl->jhdr->size - (phys_blksz*3); - jnl->jhdr->sequence_num = new_txn_base; + jnl->jhdr->sequence_num = new_txn_base; - lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr); + lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr); + lck_mtx_init(&jnl->flock, jnl_mutex_group, jnl_lock_attr); + lck_rw_init(&jnl->trim_lock, jnl_mutex_group, jnl_lock_attr); + + jnl->flushing = FALSE; + jnl->asyncIO = FALSE; + jnl->flush_aborted = FALSE; + jnl->writing_header = FALSE; + jnl->async_trim = NULL; + jnl->sequence_num = jnl->jhdr->sequence_num; + + if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num) != 0) { + printf("jnl: %s: journal_create: failed to write journal header.\n", jdev_name); + goto bad_write; + } - if (write_journal_header(jnl, 1) != 0) { - printf("jnl: %s: journal_create: failed to write journal header.\n", jdev_name); - goto bad_write; - } + return jnl; - return jnl; +bad_write: + kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz); +bad_kmem_alloc: + if (jdev_name) { + vfs_removename(jdev_name); + } + jnl->jhdr = NULL; + FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL); - bad_write: - kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz); - bad_kmem_alloc: - if (jdev_name) { - vfs_removename(jdev_name); - } - jnl->jhdr = NULL; - FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL); - return NULL; + return NULL; } @@ -1707,69 +1803,78 @@ journal_open(struct vnode *jvp, void (*flush)(void *arg), void *arg) { - journal *jnl; - uint32_t orig_blksz=0; - uint32_t phys_blksz; - int orig_checksum, checksum; - struct vfs_context context; - const char *jdev_name = get_jdev_name(jvp); - - context.vc_thread = current_thread(); - context.vc_ucred = FSCRED; - - /* Get the real physical block size. */ - if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) { + journal *jnl; + uint32_t orig_blksz=0; + uint32_t phys_blksz; + u_int32_t min_size = 0; + int orig_checksum, checksum; + struct vfs_context context; + const char *jdev_name = get_jdev_name(jvp); + + context.vc_thread = current_thread(); + context.vc_ucred = FSCRED; + + /* Get the real physical block size. 
*/ + if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) { return NULL; - } + } - if (phys_blksz > min_fs_blksz) { + if (phys_blksz > min_fs_blksz) { printf("jnl: %s: open: error: phys blksize %u bigger than min fs blksize %zd\n", - jdev_name, phys_blksz, min_fs_blksz); + jdev_name, phys_blksz, min_fs_blksz); return NULL; - } + } - if (journal_size < (256*1024) || journal_size > (1024*1024*1024)) { - printf("jnl: open: journal size %lld looks bogus.\n", journal_size); - return NULL; - } + if (journal_size < (256*1024) || journal_size > (1024*1024*1024)) { + printf("jnl: open: journal size %lld looks bogus.\n", journal_size); + return NULL; + } + + min_size = phys_blksz * (phys_blksz / sizeof(block_info)); + /* Reject journals that are too small given the sector size of the device */ + if (journal_size < min_size) { + printf("jnl: open: journal size (%lld) too small given sector size of (%u)\n", + journal_size, phys_blksz); + return NULL; + } - if ((journal_size % phys_blksz) != 0) { + if ((journal_size % phys_blksz) != 0) { printf("jnl: %s: open: journal size 0x%llx is not an even multiple of block size 0x%x\n", - jdev_name, journal_size, phys_blksz); + jdev_name, journal_size, phys_blksz); return NULL; - } + } - MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK); - memset(jnl, 0, sizeof(*jnl)); + MALLOC_ZONE(jnl, struct journal *, sizeof(struct journal), M_JNL_JNL, M_WAITOK); + memset(jnl, 0, sizeof(*jnl)); - jnl->jdev = jvp; - jnl->jdev_offset = offset; - jnl->fsdev = fsvp; - jnl->flush = flush; - jnl->flush_arg = arg; - jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK); - jnl->jdev_name = jdev_name; - lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr); + jnl->jdev = jvp; + jnl->jdev_offset = offset; + jnl->fsdev = fsvp; + jnl->flush = flush; + jnl->flush_arg = arg; + jnl->flags = (flags & JOURNAL_OPTION_FLAGS_MASK); + jnl->jdev_name = jdev_name; + lck_mtx_init(&jnl->old_start_lock, jnl_mutex_group, jnl_lock_attr); - get_io_info(jvp, phys_blksz, jnl, &context); + get_io_info(jvp, phys_blksz, jnl, &context); - if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) { - printf("jnl: %s: create: could not allocate space for header buffer (%u bytes)\n", jdev_name, phys_blksz); - goto bad_kmem_alloc; - } - jnl->header_buf_size = phys_blksz; + if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl->header_buf, phys_blksz)) { + printf("jnl: %s: create: could not allocate space for header buffer (%u bytes)\n", jdev_name, phys_blksz); + goto bad_kmem_alloc; + } + jnl->header_buf_size = phys_blksz; - jnl->jhdr = (journal_header *)jnl->header_buf; - memset(jnl->jhdr, 0, sizeof(journal_header)); + jnl->jhdr = (journal_header *)jnl->header_buf; + memset(jnl->jhdr, 0, sizeof(journal_header)); - // we have to set this up here so that do_journal_io() will work - jnl->jhdr->jhdr_size = phys_blksz; + // we have to set this up here so that do_journal_io() will work + jnl->jhdr->jhdr_size = phys_blksz; - if (read_journal_header(jnl, jnl->jhdr, phys_blksz) != phys_blksz) { + if (read_journal_header(jnl, jnl->jhdr, phys_blksz) != phys_blksz) { printf("jnl: %s: open: could not read %u bytes for the journal header.\n", - jdev_name, phys_blksz); + jdev_name, phys_blksz); goto bad_journal; - } + } orig_checksum = jnl->jhdr->checksum; jnl->jhdr->checksum = 0; @@ -1784,18 +1889,18 @@ journal_open(struct vnode *jvp, checksum = calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE); } - if (jnl->jhdr->magic != 
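/*
 * Checksum pattern used below (and again in journal_is_clean): the sum
 * is computed over the raw header bytes with the checksum field zeroed,
 * and byte order is detected by testing the magic against its swapped
 * form.  Only the stored checksum value needs SWAP32(), because
 * calc_checksum() accumulates byte-at-a-time and is therefore
 * insensitive to the header's endianness.  Compact restatement
 * (a sketch, not the patch's code):
 */
#if 0	/* illustrative sketch only */
	int on_disk = jnl->jhdr->checksum;
	jnl->jhdr->checksum = 0;
	if (jnl->jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC))
		on_disk = SWAP32(on_disk);	/* value was stored foreign-endian */
	if (calc_checksum((char *)jnl->jhdr, JOURNAL_HEADER_CKSUM_SIZE) != on_disk)
		;	/* damaged header, or written by an incompatible build */
#endif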
JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) { + if (jnl->jhdr->magic != JOURNAL_HEADER_MAGIC && jnl->jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) { printf("jnl: %s: open: journal magic is bad (0x%x != 0x%x)\n", - jnl->jdev_name, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); + jnl->jdev_name, jnl->jhdr->magic, JOURNAL_HEADER_MAGIC); goto bad_journal; - } + } // only check if we're the current journal header magic value if (jnl->jhdr->magic == JOURNAL_HEADER_MAGIC) { if (orig_checksum != checksum) { printf("jnl: %s: open: journal checksum is bad (0x%x != 0x%x)\n", - jdev_name, orig_checksum, checksum); + jdev_name, orig_checksum, checksum); //goto bad_journal; } @@ -1807,16 +1912,16 @@ journal_open(struct vnode *jvp, } if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) { - /* - * The volume has probably been resized (such that we had to adjust the - * logical sector size), or copied to media with a different logical - * sector size. + /* + * The volume has probably been resized (such that we had to adjust the + * logical sector size), or copied to media with a different logical + * sector size. * * Temporarily change the device's logical block size to match the * journal's header size. This will allow us to replay the journal * safely. If the replay succeeds, we will update the journal's header * size (later in this function). - */ + */ orig_blksz = phys_blksz; phys_blksz = jnl->jhdr->jhdr_size; @@ -1825,27 +1930,27 @@ journal_open(struct vnode *jvp, printf("jnl: %s: open: temporarily switched block size from %u to %u\n", jdev_name, orig_blksz, phys_blksz); } - - if ( jnl->jhdr->start <= 0 - || jnl->jhdr->start > jnl->jhdr->size - || jnl->jhdr->start > 1024*1024*1024) { + + if ( jnl->jhdr->start <= 0 + || jnl->jhdr->start > jnl->jhdr->size + || jnl->jhdr->start > 1024*1024*1024) { printf("jnl: %s: open: jhdr start looks bad (0x%llx max size 0x%llx)\n", - jdev_name, jnl->jhdr->start, jnl->jhdr->size); + jdev_name, jnl->jhdr->start, jnl->jhdr->size); goto bad_journal; - } + } - if ( jnl->jhdr->end <= 0 - || jnl->jhdr->end > jnl->jhdr->size - || jnl->jhdr->end > 1024*1024*1024) { + if ( jnl->jhdr->end <= 0 + || jnl->jhdr->end > jnl->jhdr->size + || jnl->jhdr->end > 1024*1024*1024) { printf("jnl: %s: open: jhdr end looks bad (0x%llx max size 0x%llx)\n", - jdev_name, jnl->jhdr->end, jnl->jhdr->size); + jdev_name, jnl->jhdr->end, jnl->jhdr->size); goto bad_journal; - } + } - if (jnl->jhdr->size < (256*1024) || jnl->jhdr->size > 1024*1024*1024) { - printf("jnl: %s: open: jhdr size looks bad (0x%llx)\n", jdev_name, jnl->jhdr->size); - goto bad_journal; - } + if (jnl->jhdr->size < (256*1024) || jnl->jhdr->size > 1024*1024*1024) { + printf("jnl: %s: open: jhdr size looks bad (0x%llx)\n", jdev_name, jnl->jhdr->size); + goto bad_journal; + } // XXXdbg - can't do these checks because hfs writes all kinds of // non-uniform sized blocks even on devices that have a block size @@ -1853,28 +1958,28 @@ journal_open(struct vnode *jvp, // therefore these checks will fail and so we just have to punt and // do more relaxed checking... 
// XXXdbg if ((jnl->jhdr->start % jnl->jhdr->jhdr_size) != 0) { - if ((jnl->jhdr->start % 512) != 0) { + if ((jnl->jhdr->start % 512) != 0) { printf("jnl: %s: open: journal start (0x%llx) not a multiple of 512?\n", - jdev_name, jnl->jhdr->start); + jdev_name, jnl->jhdr->start); goto bad_journal; - } + } //XXXdbg if ((jnl->jhdr->end % jnl->jhdr->jhdr_size) != 0) { - if ((jnl->jhdr->end % 512) != 0) { + if ((jnl->jhdr->end % 512) != 0) { printf("jnl: %s: open: journal end (0x%llx) not a multiple of block size (0x%x)?\n", - jdev_name, jnl->jhdr->end, jnl->jhdr->jhdr_size); + jdev_name, jnl->jhdr->end, jnl->jhdr->jhdr_size); goto bad_journal; - } + } - // take care of replaying the journal if necessary - if (flags & JOURNAL_RESET) { - printf("jnl: %s: journal start/end pointers reset! (jnl %p; s 0x%llx e 0x%llx)\n", - jdev_name, jnl, jnl->jhdr->start, jnl->jhdr->end); - jnl->jhdr->start = jnl->jhdr->end; - } else if (replay_journal(jnl) != 0) { - printf("jnl: %s: journal_open: Error replaying the journal!\n", jdev_name); - goto bad_journal; - } + // take care of replaying the journal if necessary + if (flags & JOURNAL_RESET) { + printf("jnl: %s: journal start/end pointers reset! (jnl %p; s 0x%llx e 0x%llx)\n", + jdev_name, jnl, jnl->jhdr->start, jnl->jhdr->end); + jnl->jhdr->start = jnl->jhdr->end; + } else if (replay_journal(jnl) != 0) { + printf("jnl: %s: journal_open: Error replaying the journal!\n", jdev_name); + goto bad_journal; + } /* * When we get here, we know that the journal is empty (jnl->jhdr->start == @@ -1891,6 +1996,7 @@ journal_open(struct vnode *jvp, VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context); phys_blksz = orig_blksz; orig_blksz = 0; + printf("jnl: %s: open: restored block size to %u\n", jdev_name, phys_blksz); jnl->jhdr->jhdr_size = phys_blksz; jnl->jhdr->start = phys_blksz; @@ -1899,23 +2005,24 @@ journal_open(struct vnode *jvp, (journal_size / phys_blksz) + (random() % 16384)) & 0x00ffffff; - if (write_journal_header(jnl, 1)) { + if (write_journal_header(jnl, 1, jnl->jhdr->sequence_num)) { printf("jnl: %s: open: failed to update journal header size\n", jdev_name); goto bad_journal; } } - + // make sure this is in sync! jnl->active_start = jnl->jhdr->start; + jnl->sequence_num = jnl->jhdr->sequence_num; // set this now, after we've replayed the journal size_up_tbuffer(jnl, tbuffer_size, phys_blksz); // TODO: Does this need to change if the device's logical block size changed? 
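/*
 * The geometry check below rejects a header that could describe more
 * payload blocks than the journal can hold: one block_list_header of
 * blhdr_size bytes indexes blhdr_size / sizeof(block_info) - 1 blocks
 * of jhdr_size bytes each, which must not exceed the journal's
 * size / jhdr_size total blocks.  Worked example (assuming the 16-byte
 * block_info; the numbers are illustrative): an 8 MB journal with 4 KB
 * blocks holds 2048 blocks, so any blhdr_size above
 * (2048 + 1) * 16 = 32784 bytes is rejected as incompatible.
 */
#if 0	/* illustrative sketch only */
	off_t describable = (off_t)(jnl->jhdr->blhdr_size / sizeof(block_info)) - 1;
	off_t capacity    = jnl->jhdr->size / jnl->jhdr->jhdr_size;
	int   compatible  = (describable <= capacity);
#endif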
if ((off_t)(jnl->jhdr->blhdr_size/sizeof(block_info)-1) > (jnl->jhdr->size/jnl->jhdr->jhdr_size)) { - printf("jnl: %s: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jdev_name, jnl->jhdr->size, - jnl->jhdr->blhdr_size, jnl->jhdr->jhdr_size); - goto bad_journal; + printf("jnl: %s: open: jhdr size and blhdr size are not compatible (0x%llx, %d, %d)\n", jdev_name, jnl->jhdr->size, + jnl->jhdr->blhdr_size, jnl->jhdr->jhdr_size); + goto bad_journal; } lck_mtx_init(&jnl->jlock, jnl_mutex_group, jnl_lock_attr); @@ -1926,7 +2033,7 @@ journal_open(struct vnode *jvp, if (orig_blksz != 0) { phys_blksz = orig_blksz; VNOP_IOCTL(jvp, DKIOCSETBLOCKSIZE, (caddr_t)&orig_blksz, FWRITE, &context); - printf("jnl: %s: open: restored block size after error\n", jdev_name); + printf("jnl: %s: open: restored block size to %u after error\n", jdev_name, orig_blksz); } kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, phys_blksz); bad_kmem_alloc: @@ -1945,110 +2052,109 @@ journal_is_clean(struct vnode *jvp, struct vnode *fsvp, size_t min_fs_block_size) { - journal jnl; - uint32_t phys_blksz; - int ret; - int orig_checksum, checksum; - struct vfs_context context; - const char *jdev_name = get_jdev_name(jvp); - - context.vc_thread = current_thread(); - context.vc_ucred = FSCRED; - - /* Get the real physical block size. */ - if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) { - printf("jnl: %s: is_clean: failed to get device block size.\n", jdev_name); - return EINVAL; - } + journal jnl; + uint32_t phys_blksz; + int ret; + int orig_checksum, checksum; + struct vfs_context context; + const char *jdev_name = get_jdev_name(jvp); + + context.vc_thread = current_thread(); + context.vc_ucred = FSCRED; + + /* Get the real physical block size. 
*/ + if (VNOP_IOCTL(jvp, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, &context)) { + printf("jnl: %s: is_clean: failed to get device block size.\n", jdev_name); + return EINVAL; + } - if (phys_blksz > (uint32_t)min_fs_block_size) { - printf("jnl: %s: is_clean: error: phys blksize %d bigger than min fs blksize %zd\n", - jdev_name, phys_blksz, min_fs_block_size); - return EINVAL; - } + if (phys_blksz > (uint32_t)min_fs_block_size) { + printf("jnl: %s: is_clean: error: phys blksize %d bigger than min fs blksize %zd\n", + jdev_name, phys_blksz, min_fs_block_size); + return EINVAL; + } - if (journal_size < (256*1024) || journal_size > (1024*1024*1024)) { - printf("jnl: is_clean: journal size %lld looks bogus.\n", journal_size); - return EINVAL; - } + if (journal_size < (256*1024) || journal_size > (MAX_JOURNAL_SIZE)) { + printf("jnl: is_clean: journal size %lld looks bogus.\n", journal_size); + return EINVAL; + } - if ((journal_size % phys_blksz) != 0) { - printf("jnl: %s: is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n", - jdev_name, journal_size, phys_blksz); - return EINVAL; - } + if ((journal_size % phys_blksz) != 0) { + printf("jnl: %s: is_clean: journal size 0x%llx is not an even multiple of block size 0x%x\n", + jdev_name, journal_size, phys_blksz); + return EINVAL; + } - memset(&jnl, 0, sizeof(jnl)); + memset(&jnl, 0, sizeof(jnl)); - if (kmem_alloc(kernel_map, (vm_offset_t *)&jnl.header_buf, phys_blksz)) { - printf("jnl: %s: is_clean: could not allocate space for header buffer (%d bytes)\n", jdev_name, phys_blksz); - return ENOMEM; - } - jnl.header_buf_size = phys_blksz; + if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&jnl.header_buf, phys_blksz)) { + printf("jnl: %s: is_clean: could not allocate space for header buffer (%d bytes)\n", jdev_name, phys_blksz); + return ENOMEM; + } + jnl.header_buf_size = phys_blksz; - get_io_info(jvp, phys_blksz, &jnl, &context); + get_io_info(jvp, phys_blksz, &jnl, &context); - jnl.jhdr = (journal_header *)jnl.header_buf; - memset(jnl.jhdr, 0, sizeof(journal_header)); + jnl.jhdr = (journal_header *)jnl.header_buf; + memset(jnl.jhdr, 0, sizeof(journal_header)); - jnl.jdev = jvp; - jnl.jdev_offset = offset; - jnl.fsdev = fsvp; + jnl.jdev = jvp; + jnl.jdev_offset = offset; + jnl.fsdev = fsvp; - // we have to set this up here so that do_journal_io() will work - jnl.jhdr->jhdr_size = phys_blksz; + // we have to set this up here so that do_journal_io() will work + jnl.jhdr->jhdr_size = phys_blksz; - if (read_journal_header(&jnl, jnl.jhdr, phys_blksz) != (unsigned)phys_blksz) { - printf("jnl: %s: is_clean: could not read %d bytes for the journal header.\n", - jdev_name, phys_blksz); - ret = EINVAL; - goto get_out; - } + if (read_journal_header(&jnl, jnl.jhdr, phys_blksz) != (unsigned)phys_blksz) { + printf("jnl: %s: is_clean: could not read %d bytes for the journal header.\n", + jdev_name, phys_blksz); + ret = EINVAL; + goto get_out; + } - orig_checksum = jnl.jhdr->checksum; - jnl.jhdr->checksum = 0; - - if (jnl.jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) { - // do this before the swap since it's done byte-at-a-time - orig_checksum = SWAP32(orig_checksum); - checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE); - swap_journal_header(&jnl); - jnl.flags |= JOURNAL_NEED_SWAP; - } else { - checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE); - } + orig_checksum = jnl.jhdr->checksum; + jnl.jhdr->checksum = 0; - if (jnl.jhdr->magic != JOURNAL_HEADER_MAGIC && jnl.jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) 
{ - printf("jnl: %s: is_clean: journal magic is bad (0x%x != 0x%x)\n", - jdev_name, jnl.jhdr->magic, JOURNAL_HEADER_MAGIC); - ret = EINVAL; - goto get_out; - } + if (jnl.jhdr->magic == SWAP32(JOURNAL_HEADER_MAGIC)) { + // do this before the swap since it's done byte-at-a-time + orig_checksum = SWAP32(orig_checksum); + checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE); + swap_journal_header(&jnl); + jnl.flags |= JOURNAL_NEED_SWAP; + } else { + checksum = calc_checksum((char *)jnl.jhdr, JOURNAL_HEADER_CKSUM_SIZE); + } - if (orig_checksum != checksum) { - printf("jnl: %s: is_clean: journal checksum is bad (0x%x != 0x%x)\n", jdev_name, orig_checksum, checksum); - ret = EINVAL; - goto get_out; - } + if (jnl.jhdr->magic != JOURNAL_HEADER_MAGIC && jnl.jhdr->magic != OLD_JOURNAL_HEADER_MAGIC) { + printf("jnl: %s: is_clean: journal magic is bad (0x%x != 0x%x)\n", + jdev_name, jnl.jhdr->magic, JOURNAL_HEADER_MAGIC); + ret = EINVAL; + goto get_out; + } - // - // if the start and end are equal then the journal is clean. - // otherwise it's not clean and therefore an error. - // - if (jnl.jhdr->start == jnl.jhdr->end) { - ret = 0; - } else { - ret = EBUSY; // so the caller can differentiate an invalid journal from a "busy" one - } + if (orig_checksum != checksum) { + printf("jnl: %s: is_clean: journal checksum is bad (0x%x != 0x%x)\n", jdev_name, orig_checksum, checksum); + ret = EINVAL; + goto get_out; + } - get_out: - kmem_free(kernel_map, (vm_offset_t)jnl.header_buf, phys_blksz); - if (jdev_name) { - vfs_removename(jdev_name); - } - - return ret; + // + // if the start and end are equal then the journal is clean. + // otherwise it's not clean and therefore an error. + // + if (jnl.jhdr->start == jnl.jhdr->end) { + ret = 0; + } else { + ret = EBUSY; // so the caller can differentiate an invalid journal from a "busy" one + } +get_out: + kmem_free(kernel_map, (vm_offset_t)jnl.header_buf, phys_blksz); + if (jdev_name) { + vfs_removename(jdev_name); + } + + return ret; } @@ -2056,26 +2162,31 @@ journal_is_clean(struct vnode *jvp, void journal_close(journal *jnl) { - volatile off_t *start, *end; - int counter=0; + volatile off_t *start, *end; + int counter=0; - CHECK_JOURNAL(jnl); + CHECK_JOURNAL(jnl); // set this before doing anything that would block so that // we start tearing things down properly. 
// jnl->flags |= JOURNAL_CLOSE_PENDING; - if (jnl->owner != current_thread()) { + if (jnl->owner != current_thread()) { lock_journal(jnl); - } + } - // - // only write stuff to disk if the journal is still valid - // - if ((jnl->flags & JOURNAL_INVALID) == 0) { + wait_condition(jnl, &jnl->flushing, "journal_close"); + + // + // only write stuff to disk if the journal is still valid + // + if ((jnl->flags & JOURNAL_INVALID) == 0) { if (jnl->active_tr) { + /* + * "journal_end_transaction" will fire the flush asynchronously + */ journal_end_transaction(jnl); } @@ -2084,8 +2195,17 @@ journal_close(journal *jnl) transaction *tr = jnl->cur_tr; jnl->cur_tr = NULL; - end_transaction(tr, 1, NULL, NULL); // force it to get flushed + /* + * "end_transaction" will wait for any in-progress flush to complete + * before flushing "cur_tr" synchronously("must_wait" == TRUE) + */ + end_transaction(tr, 1, NULL, NULL, FALSE, TRUE); } + /* + * if there was an "active_tr", make sure we wait for + * it to flush if there was no "cur_tr" to process + */ + wait_condition(jnl, &jnl->flushing, "journal_close"); //start = &jnl->jhdr->start; start = &jnl->active_start; @@ -2101,20 +2221,22 @@ journal_close(journal *jnl) if (*start != *end) { printf("jnl: %s: close: buffer flushing didn't seem to flush out all the transactions! (0x%llx - 0x%llx)\n", - jnl->jdev_name, *start, *end); + jnl->jdev_name, *start, *end); } // make sure this is in sync when we close the journal jnl->jhdr->start = jnl->active_start; // if this fails there's not much we can do at this point... - write_journal_header(jnl, 1); - } else { + write_journal_header(jnl, 1, jnl->sequence_num); + } else { // if we're here the journal isn't valid any more. // so make sure we don't leave any locked blocks lying around printf("jnl: %s: close: journal %p, is invalid. 
aborting outstanding transactions\n", jnl->jdev_name, jnl); + if (jnl->active_tr || jnl->cur_tr) { transaction *tr; + if (jnl->active_tr) { tr = jnl->active_tr; jnl->active_tr = NULL; @@ -2122,45 +2244,45 @@ journal_close(journal *jnl) tr = jnl->cur_tr; jnl->cur_tr = NULL; } - abort_transaction(jnl, tr); + if (jnl->active_tr || jnl->cur_tr) { - panic("jnl: %s: close: jnl @ %p had both an active and cur tr\n", jnl->jdev_name, jnl); + panic("jnl: %s: close: jnl @ %p had both an active and cur tr\n", jnl->jdev_name, jnl); } } - } + } - free_old_stuff(jnl); + free_old_stuff(jnl); - kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size); - jnl->jhdr = (void *)0xbeefbabe; + kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size); + jnl->jhdr = (void *)0xbeefbabe; - if (jnl->jdev_name) { - vfs_removename(jnl->jdev_name); - } + if (jnl->jdev_name) { + vfs_removename(jnl->jdev_name); + } - FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL); + FREE_ZONE(jnl, sizeof(struct journal), M_JNL_JNL); } static void dump_journal(journal *jnl) { - transaction *ctr; - - printf("journal for dev %s:", jnl->jdev_name); - printf(" jdev_offset %.8llx\n", jnl->jdev_offset); - printf(" magic: 0x%.8x\n", jnl->jhdr->magic); - printf(" start: 0x%.8llx\n", jnl->jhdr->start); - printf(" end: 0x%.8llx\n", jnl->jhdr->end); - printf(" size: 0x%.8llx\n", jnl->jhdr->size); - printf(" blhdr size: %d\n", jnl->jhdr->blhdr_size); - printf(" jhdr size: %d\n", jnl->jhdr->jhdr_size); - printf(" chksum: 0x%.8x\n", jnl->jhdr->checksum); + transaction *ctr; + + printf("journal for dev %s:", jnl->jdev_name); + printf(" jdev_offset %.8llx\n", jnl->jdev_offset); + printf(" magic: 0x%.8x\n", jnl->jhdr->magic); + printf(" start: 0x%.8llx\n", jnl->jhdr->start); + printf(" end: 0x%.8llx\n", jnl->jhdr->end); + printf(" size: 0x%.8llx\n", jnl->jhdr->size); + printf(" blhdr size: %d\n", jnl->jhdr->blhdr_size); + printf(" jhdr size: %d\n", jnl->jhdr->jhdr_size); + printf(" chksum: 0x%.8x\n", jnl->jhdr->checksum); - printf(" completed transactions:\n"); - for(ctr=jnl->completed_trs; ctr; ctr=ctr->next) { + printf(" completed transactions:\n"); + for (ctr = jnl->completed_trs; ctr; ctr = ctr->next) { printf(" 0x%.8llx - 0x%.8llx\n", ctr->journal_start, ctr->journal_end); - } + } } @@ -2168,18 +2290,18 @@ dump_journal(journal *jnl) static off_t free_space(journal *jnl) { - off_t free_space_offset; + off_t free_space_offset; - if (jnl->jhdr->start < jnl->jhdr->end) { + if (jnl->jhdr->start < jnl->jhdr->end) { free_space_offset = jnl->jhdr->size - (jnl->jhdr->end - jnl->jhdr->start) - jnl->jhdr->jhdr_size; - } else if (jnl->jhdr->start > jnl->jhdr->end) { + } else if (jnl->jhdr->start > jnl->jhdr->end) { free_space_offset = jnl->jhdr->start - jnl->jhdr->end; - } else { + } else { // journal is completely empty free_space_offset = jnl->jhdr->size - jnl->jhdr->jhdr_size; - } + } - return free_space_offset; + return free_space_offset; } @@ -2188,46 +2310,50 @@ free_space(journal *jnl) // The "desired_size" is in bytes. 
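/*
 * The circular-log arithmetic implemented by free_space() above, as a
 * self-contained function (a minimal sketch, assuming the same
 * [start, end) live-region convention):
 */
#if 0	/* illustrative sketch only */
	static off_t
	ring_free(off_t start, off_t end, off_t size, off_t hdr)
	{
		if (start < end)	/* live data sits in [start, end) */
			return size - (end - start) - hdr;
		else if (start > end)	/* live data wraps around the end */
			return start - end;
		else			/* empty: everything but the header */
			return size - hdr;
	}
#endif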
// static int -check_free_space(journal *jnl, int desired_size) +check_free_space(journal *jnl, int desired_size, boolean_t *delayed_header_write, uint32_t sequence_num) { - size_t i; - int counter=0; + size_t i; + int counter=0; + + //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n", + // desired_size, free_space(jnl)); - //printf("jnl: check free space (desired 0x%x, avail 0x%Lx)\n", -// desired_size, free_space(jnl)); + if (delayed_header_write) + *delayed_header_write = FALSE; - while (1) { + while (1) { int old_start_empty; + // make sure there's space in the journal to hold this transaction + if (free_space(jnl) > desired_size && jnl->old_start[0] == 0) { + break; + } if (counter++ == 5000) { dump_journal(jnl); panic("jnl: check_free_space: buffer flushing isn't working " - "(jnl @ %p s %lld e %lld f %lld [active start %lld]).\n", jnl, - jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start); + "(jnl @ %p s %lld e %lld f %lld [active start %lld]).\n", jnl, + jnl->jhdr->start, jnl->jhdr->end, free_space(jnl), jnl->active_start); } if (counter > 7500) { - printf("jnl: %s: check_free_space: giving up waiting for free space.\n", jnl->jdev_name); - return ENOSPC; + printf("jnl: %s: check_free_space: giving up waiting for free space.\n", jnl->jdev_name); + return ENOSPC; } - // make sure there's space in the journal to hold this transaction - if (free_space(jnl) > desired_size && jnl->old_start[0] == 0) { - break; - } // // here's where we lazily bump up jnl->jhdr->start. we'll consume // entries until there is enough space for the next transaction. // old_start_empty = 1; lock_oldstart(jnl); - for(i=0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) { + + for (i = 0; i < sizeof(jnl->old_start)/sizeof(jnl->old_start[0]); i++) { int lcl_counter; lcl_counter = 0; while (jnl->old_start[i] & 0x8000000000000000LL) { if (lcl_counter++ > 1000) { panic("jnl: check_free_space: tr starting @ 0x%llx not flushing (jnl %p).\n", - jnl->old_start[i], jnl); + jnl->old_start[i], jnl); } unlock_oldstart(jnl); @@ -2245,10 +2371,16 @@ check_free_space(journal *jnl, int desired_size) old_start_empty = 0; jnl->jhdr->start = jnl->old_start[i]; jnl->old_start[i] = 0; + if (free_space(jnl) > desired_size) { - unlock_oldstart(jnl); - write_journal_header(jnl, 1); - lock_oldstart(jnl); + + if (delayed_header_write) + *delayed_header_write = TRUE; + else { + unlock_oldstart(jnl); + write_journal_header(jnl, 1, sequence_num); + lock_oldstart(jnl); + } break; } } @@ -2268,7 +2400,11 @@ check_free_space(journal *jnl, int desired_size) // start of the loop. 
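/*
 * The delayed_header_write out-parameter added above lets a caller
 * batch the jnl->jhdr->start update with its own header write instead
 * of paying for a separate synchronous write_journal_header() inside
 * check_free_space().  Hypothetical caller sketch (the shipped callers
 * either pass NULL here or live outside this hunk):
 */
#if 0	/* illustrative sketch only */
	boolean_t owe_hdr = FALSE;

	if (check_free_space(jnl, jnl->tbuffer_size, &owe_hdr, jnl->sequence_num) != 0)
		return ENOSPC;
	if (owe_hdr)	/* fold the deferred start update into one write */
		write_journal_header(jnl, 1, jnl->sequence_num);
#endif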
// jnl->jhdr->start = jnl->active_start; - write_journal_header(jnl, 1); + + if (delayed_header_write) + *delayed_header_write = TRUE; + else + write_journal_header(jnl, 1, sequence_num); continue; } @@ -2283,9 +2419,9 @@ check_free_space(journal *jnl, int desired_size) // wait for a while to avoid being cpu-bound (this will // put us to sleep for 10 milliseconds) tsleep((caddr_t)jnl, PRIBIO, "check_free_space2", 1); - } + } - return 0; + return 0; } /* @@ -2297,31 +2433,31 @@ journal_allocate_transaction(journal *jnl) transaction *tr; MALLOC_ZONE(tr, transaction *, sizeof(transaction), M_JNL_TR, M_WAITOK); - memset(tr, 0, sizeof(transaction)); + memset(tr, 0, sizeof(transaction)); - tr->tbuffer_size = jnl->tbuffer_size; + tr->tbuffer_size = jnl->tbuffer_size; - if (kmem_alloc(kernel_map, (vm_offset_t *)&tr->tbuffer, tr->tbuffer_size)) { + if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&tr->tbuffer, tr->tbuffer_size)) { FREE_ZONE(tr, sizeof(transaction), M_JNL_TR); jnl->active_tr = NULL; return ENOMEM; - } + } - // journal replay code checksum check depends on this. - memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE); - // Fill up the rest of the block with unimportant bytes (0x5a 'Z' chosen for visibility) - memset(tr->tbuffer + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE); + // journal replay code checksum check depends on this. + memset(tr->tbuffer, 0, BLHDR_CHECKSUM_SIZE); + // Fill up the rest of the block with unimportant bytes (0x5a 'Z' chosen for visibility) + memset(tr->tbuffer + BLHDR_CHECKSUM_SIZE, 0x5a, jnl->jhdr->blhdr_size - BLHDR_CHECKSUM_SIZE); - tr->blhdr = (block_list_header *)tr->tbuffer; - tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1; - tr->blhdr->num_blocks = 1; // accounts for this header block - tr->blhdr->bytes_used = jnl->jhdr->blhdr_size; - tr->blhdr->flags = BLHDR_CHECK_CHECKSUMS | BLHDR_FIRST_HEADER; + tr->blhdr = (block_list_header *)tr->tbuffer; + tr->blhdr->max_blocks = (jnl->jhdr->blhdr_size / sizeof(block_info)) - 1; + tr->blhdr->num_blocks = 1; // accounts for this header block + tr->blhdr->bytes_used = jnl->jhdr->blhdr_size; + tr->blhdr->flags = BLHDR_CHECK_CHECKSUMS | BLHDR_FIRST_HEADER; - tr->sequence_num = ++jnl->jhdr->sequence_num; - tr->num_blhdrs = 1; - tr->total_bytes = jnl->jhdr->blhdr_size; - tr->jnl = jnl; + tr->sequence_num = ++jnl->sequence_num; + tr->num_blhdrs = 1; + tr->total_bytes = jnl->jhdr->blhdr_size; + tr->jnl = jnl; jnl->active_tr = tr; @@ -2331,67 +2467,72 @@ journal_allocate_transaction(journal *jnl) int journal_start_transaction(journal *jnl) { - int ret; + int ret; - CHECK_JOURNAL(jnl); + CHECK_JOURNAL(jnl); - if (jnl->flags & JOURNAL_INVALID) { - return EINVAL; - } + free_old_stuff(jnl); - if (jnl->owner == current_thread()) { + if (jnl->flags & JOURNAL_INVALID) { + return EINVAL; + } + if (jnl->owner == current_thread()) { if (jnl->active_tr == NULL) { panic("jnl: start_tr: active_tr is NULL (jnl @ %p, owner %p, current_thread %p\n", - jnl, jnl->owner, current_thread()); + jnl, jnl->owner, current_thread()); } jnl->nested_count++; return 0; - } - - lock_journal(jnl); + } + lock_journal(jnl); - if (jnl->owner != NULL || jnl->nested_count != 0 || jnl->active_tr != NULL) { + if (jnl->owner != NULL || jnl->nested_count != 0 || jnl->active_tr != NULL) { panic("jnl: start_tr: owner %p, nested count %d, active_tr %p jnl @ %p\n", - jnl->owner, jnl->nested_count, jnl->active_tr, jnl); - } + jnl->owner, jnl->nested_count, jnl->active_tr, jnl); + } - jnl->owner = current_thread(); - 
jnl->nested_count = 1; + jnl->owner = current_thread(); + jnl->nested_count = 1; - free_old_stuff(jnl); +#if JOE + // make sure there's room in the journal + if (free_space(jnl) < jnl->tbuffer_size) { - // make sure there's room in the journal - if (free_space(jnl) < jnl->tbuffer_size) { - // this is the call that really waits for space to free up - // as well as updating jnl->jhdr->start - if (check_free_space(jnl, jnl->tbuffer_size) != 0) { - printf("jnl: %s: start transaction failed: no space\n", jnl->jdev_name); - ret = ENOSPC; - goto bad_start; + KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_START, jnl, 0, 0, 0, 0); + + // this is the call that really waits for space to free up + // as well as updating jnl->jhdr->start + if (check_free_space(jnl, jnl->tbuffer_size, NULL, jnl->sequence_num) != 0) { + printf("jnl: %s: start transaction failed: no space\n", jnl->jdev_name); + ret = ENOSPC; + goto bad_start; + } + KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_END, jnl, 0, 0, 0, 0); } - } +#endif - // if there's a buffered transaction, use it. - if (jnl->cur_tr) { + // if there's a buffered transaction, use it. + if (jnl->cur_tr) { jnl->active_tr = jnl->cur_tr; jnl->cur_tr = NULL; return 0; - } + } ret = journal_allocate_transaction(jnl); if (ret) { goto bad_start; } - // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, jnl->active_tr); + // printf("jnl: start_tr: owner 0x%x new tr @ 0x%x\n", jnl->owner, jnl->active_tr); - return 0; + return 0; - bad_start: +bad_start: jnl->owner = NULL; jnl->nested_count = 0; unlock_journal(jnl); + return ret; } @@ -2399,118 +2540,123 @@ journal_start_transaction(journal *jnl) int journal_modify_block_start(journal *jnl, struct buf *bp) { - transaction *tr; + transaction *tr; - CHECK_JOURNAL(jnl); + CHECK_JOURNAL(jnl); + - if (jnl->flags & JOURNAL_INVALID) { + free_old_stuff(jnl); + + if (jnl->flags & JOURNAL_INVALID) { return EINVAL; - } + } - // XXXdbg - for debugging I want this to be true. later it may - // not be necessary. - if ((buf_flags(bp) & B_META) == 0) { + // XXXdbg - for debugging I want this to be true. later it may + // not be necessary. + if ((buf_flags(bp) & B_META) == 0) { panic("jnl: modify_block_start: bp @ %p is not a meta-data block! (jnl %p)\n", bp, jnl); - } + } - tr = jnl->active_tr; - CHECK_TRANSACTION(tr); + tr = jnl->active_tr; + CHECK_TRANSACTION(tr); - if (jnl->owner != current_thread()) { + if (jnl->owner != current_thread()) { panic("jnl: modify_block_start: called w/out a transaction! jnl %p, owner %p, curact %p\n", - jnl, jnl->owner, current_thread()); - } - - free_old_stuff(jnl); + jnl, jnl->owner, current_thread()); + } - //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d; total bytes %d)\n", - // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes); + //printf("jnl: mod block start (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d; total bytes %d)\n", + // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes); - // can't allow blocks that aren't an even multiple of the - // underlying block size. - if ((buf_size(bp) % jnl->jhdr->jhdr_size) != 0) { - uint32_t phys_blksz, bad=0; + // can't allow blocks that aren't an even multiple of the + // underlying block size. 
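/*
 * The block-size handling below copes with a device whose reported
 * sector size changed after the journal header was written.  The
 * branches reduce to the following decision ladder (sketch only; the
 * realloc helper name is hypothetical):
 */
#if 0	/* illustrative sketch only */
	if (buf_size(bp) % jnl->jhdr->jhdr_size == 0)
		;				/* normal case: proceed */
	else if (phys_blksz == (uint32_t)jnl->jhdr->jhdr_size)
		panic("bufsize not a multiple of the journal block size");
	else if (buf_size(bp) % phys_blksz)
		panic("bufsize not a multiple of either block size");
	else if (phys_blksz < (uint32_t)jnl->jhdr->jhdr_size)
		jnl->jhdr->jhdr_size = phys_blksz;	/* sectors shrank: adopt */
	else
		grow_header_buf(jnl, phys_blksz);	/* sectors grew: realloc jhdr */
#endif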
+ if ((buf_size(bp) % jnl->jhdr->jhdr_size) != 0) { + uint32_t phys_blksz, bad=0; - if (VNOP_IOCTL(jnl->jdev, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, vfs_context_kernel())) { - bad = 1; - } else if (phys_blksz != (uint32_t)jnl->jhdr->jhdr_size) { - if (phys_blksz < 512) { - panic("jnl: mod block start: phys blksz %d is too small (%d, %d)\n", - phys_blksz, buf_size(bp), jnl->jhdr->jhdr_size); - } - - if ((buf_size(bp) % phys_blksz) != 0) { - bad = 1; - } else if (phys_blksz < (uint32_t)jnl->jhdr->jhdr_size) { - jnl->jhdr->jhdr_size = phys_blksz; - } else { - // the phys_blksz is now larger... need to realloc the jhdr - char *new_header_buf; - - printf("jnl: %s: phys blksz got bigger (was: %d/%d now %d)\n", - jnl->jdev_name, jnl->header_buf_size, jnl->jhdr->jhdr_size, phys_blksz); - if (kmem_alloc(kernel_map, (vm_offset_t *)&new_header_buf, phys_blksz)) { - printf("jnl: modify_block_start: %s: create: phys blksz change (was %d, now %d) but could not allocate space for new header\n", - jnl->jdev_name, jnl->jhdr->jhdr_size, phys_blksz); - bad = 1; - } else { - memcpy(new_header_buf, jnl->header_buf, jnl->header_buf_size); - memset(&new_header_buf[jnl->header_buf_size], 0x18, (phys_blksz - jnl->header_buf_size)); - kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size); - jnl->header_buf = new_header_buf; - jnl->header_buf_size = phys_blksz; - - jnl->jhdr = (journal_header *)jnl->header_buf; - jnl->jhdr->jhdr_size = phys_blksz; - } - } - } else { - bad = 1; - } + if (VNOP_IOCTL(jnl->jdev, DKIOCGETBLOCKSIZE, (caddr_t)&phys_blksz, 0, vfs_context_kernel())) { + bad = 1; + } else if (phys_blksz != (uint32_t)jnl->jhdr->jhdr_size) { + if (phys_blksz < 512) { + panic("jnl: mod block start: phys blksz %d is too small (%d, %d)\n", + phys_blksz, buf_size(bp), jnl->jhdr->jhdr_size); + } + + if ((buf_size(bp) % phys_blksz) != 0) { + bad = 1; + } else if (phys_blksz < (uint32_t)jnl->jhdr->jhdr_size) { + jnl->jhdr->jhdr_size = phys_blksz; + } else { + // the phys_blksz is now larger... 
need to realloc the jhdr + char *new_header_buf; + + printf("jnl: %s: phys blksz got bigger (was: %d/%d now %d)\n", + jnl->jdev_name, jnl->header_buf_size, jnl->jhdr->jhdr_size, phys_blksz); + if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&new_header_buf, phys_blksz)) { + printf("jnl: modify_block_start: %s: create: phys blksz change (was %d, now %d) but could not allocate space for new header\n", + jnl->jdev_name, jnl->jhdr->jhdr_size, phys_blksz); + bad = 1; + } else { + memcpy(new_header_buf, jnl->header_buf, jnl->header_buf_size); + memset(&new_header_buf[jnl->header_buf_size], 0x18, (phys_blksz - jnl->header_buf_size)); + kmem_free(kernel_map, (vm_offset_t)jnl->header_buf, jnl->header_buf_size); + jnl->header_buf = new_header_buf; + jnl->header_buf_size = phys_blksz; + + jnl->jhdr = (journal_header *)jnl->header_buf; + jnl->jhdr->jhdr_size = phys_blksz; + } + } + } else { + bad = 1; + } - if (bad) { - panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n", - buf_size(bp), jnl->jhdr->jhdr_size); - return -1; - } - } + if (bad) { + panic("jnl: mod block start: bufsize %d not a multiple of block size %d\n", + buf_size(bp), jnl->jhdr->jhdr_size); + return -1; + } + } - // make sure that this transaction isn't bigger than the whole journal - if (tr->total_bytes+buf_size(bp) >= (jnl->jhdr->size - jnl->jhdr->jhdr_size)) { + // make sure that this transaction isn't bigger than the whole journal + if (tr->total_bytes+buf_size(bp) >= (jnl->jhdr->size - jnl->jhdr->jhdr_size)) { panic("jnl: transaction too big (%d >= %lld bytes, bufsize %d, tr %p bp %p)\n", - tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), buf_size(bp), tr, bp); + tr->total_bytes, (tr->jnl->jhdr->size - jnl->jhdr->jhdr_size), buf_size(bp), tr, bp); return -1; - } + } - // if the block is dirty and not already locked we have to write - // it out before we muck with it because it has data that belongs - // (presumably) to another transaction. - // - if ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI) { + // if the block is dirty and not already locked we have to write + // it out before we muck with it because it has data that belongs + // (presumably) to another transaction. + // + if ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI) { if (buf_flags(bp) & B_ASYNC) { panic("modify_block_start: bp @ %p has async flag set!\n", bp); } + if (bp->b_shadow_ref) + panic("modify_block_start: dirty bp @ %p has shadows!\n", bp); // this will cause it to not be buf_brelse()'d buf_setflags(bp, B_NORELSE); VNOP_BWRITE(bp); - } - buf_setflags(bp, B_LOCKED); - - return 0; + } + buf_setflags(bp, B_LOCKED); + + return 0; } int journal_modify_block_abort(journal *jnl, struct buf *bp) { - transaction *tr; + transaction *tr; block_list_header *blhdr; - int i; + int i; - CHECK_JOURNAL(jnl); + CHECK_JOURNAL(jnl); - tr = jnl->active_tr; + free_old_stuff(jnl); + + tr = jnl->active_tr; // // if there's no active transaction then we just want to @@ -2522,26 +2668,24 @@ journal_modify_block_abort(journal *jnl, struct buf *bp) return 0; } - if (jnl->flags & JOURNAL_INVALID) { + if (jnl->flags & JOURNAL_INVALID) { /* Still need to buf_brelse(). Callers assume we consume the bp. */ buf_brelse(bp); return EINVAL; - } + } - CHECK_TRANSACTION(tr); + CHECK_TRANSACTION(tr); - if (jnl->owner != current_thread()) { + if (jnl->owner != current_thread()) { panic("jnl: modify_block_abort: called w/out a transaction! 
jnl %p, owner %p, curact %p\n", - jnl, jnl->owner, current_thread()); - } - - free_old_stuff(jnl); + jnl, jnl->owner, current_thread()); + } - // printf("jnl: modify_block_abort: tr 0x%x bp 0x%x\n", jnl->active_tr, bp); + // printf("jnl: modify_block_abort: tr 0x%x bp 0x%x\n", jnl->active_tr, bp); - // first check if it's already part of this transaction - for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) { - for(i=1; i < blhdr->num_blocks; i++) { + // first check if it's already part of this transaction + for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) { + for (i = 1; i < blhdr->num_blocks; i++) { if (bp == blhdr->binfo[i].u.bp) { break; } @@ -2550,7 +2694,7 @@ journal_modify_block_abort(journal *jnl, struct buf *bp) if (i < blhdr->num_blocks) { break; } - } + } // // if blhdr is null, then this block has only had modify_block_start @@ -2560,76 +2704,75 @@ journal_modify_block_abort(journal *jnl, struct buf *bp) // on it and so we need to keep it locked in memory. // if (blhdr == NULL) { - buf_clearflags(bp, B_LOCKED); + buf_clearflags(bp, B_LOCKED); } - buf_brelse(bp); - return 0; + buf_brelse(bp); + return 0; } int -journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(struct buf *bp, void *arg), void *arg) +journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(buf_t bp, void *arg), void *arg) { - int i = 1; - int tbuffer_offset=0; - char *blkptr; - block_list_header *blhdr, *prev=NULL; - transaction *tr; + int i = 1; + int tbuffer_offset=0; + block_list_header *blhdr, *prev=NULL; + transaction *tr; + + CHECK_JOURNAL(jnl); - CHECK_JOURNAL(jnl); + free_old_stuff(jnl); - if (jnl->flags & JOURNAL_INVALID) { + if (jnl->flags & JOURNAL_INVALID) { /* Still need to buf_brelse(). Callers assume we consume the bp. */ buf_brelse(bp); return EINVAL; - } + } - tr = jnl->active_tr; - CHECK_TRANSACTION(tr); + tr = jnl->active_tr; + CHECK_TRANSACTION(tr); - if (jnl->owner != current_thread()) { + if (jnl->owner != current_thread()) { panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n", - jnl, jnl->owner, current_thread()); - } - - free_old_stuff(jnl); + jnl, jnl->owner, current_thread()); + } - //printf("jnl: mod block end: (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d, total bytes %d)\n", - // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes); + //printf("jnl: mod block end: (bp 0x%x vp 0x%x l/blkno %qd/%qd bsz %d, total bytes %d)\n", + // bp, buf_vnode(bp), buf_lblkno(bp), buf_blkno(bp), buf_size(bp), tr->total_bytes); - if ((buf_flags(bp) & B_LOCKED) == 0) { + if ((buf_flags(bp) & B_LOCKED) == 0) { panic("jnl: modify_block_end: bp %p not locked! 
jnl @ %p\n", bp, jnl); - } + } - // first check if it's already part of this transaction - for(blhdr=tr->blhdr; blhdr; prev=blhdr,blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) { + // first check if it's already part of this transaction + for (blhdr = tr->blhdr; blhdr; prev = blhdr, blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) { tbuffer_offset = jnl->jhdr->blhdr_size; - for(i=1; i < blhdr->num_blocks; i++) { + for (i = 1; i < blhdr->num_blocks; i++) { if (bp == blhdr->binfo[i].u.bp) { break; } if (blhdr->binfo[i].bnum != (off_t)-1) { - tbuffer_offset += buf_size(blhdr->binfo[i].u.bp); + tbuffer_offset += buf_size(blhdr->binfo[i].u.bp); } else { - tbuffer_offset += blhdr->binfo[i].u.bi.bsize; + tbuffer_offset += blhdr->binfo[i].u.bi.bsize; } } if (i < blhdr->num_blocks) { break; } - } + } - if (blhdr == NULL - && prev - && (prev->num_blocks+1) <= prev->max_blocks - && (prev->bytes_used+buf_size(bp)) <= (uint32_t)tr->tbuffer_size) { + if (blhdr == NULL + && prev + && (prev->num_blocks+1) <= prev->max_blocks + && (prev->bytes_used+buf_size(bp)) <= (uint32_t)tr->tbuffer_size) { blhdr = prev; - } else if (blhdr == NULL) { - block_list_header *nblhdr; + } else if (blhdr == NULL) { + block_list_header *nblhdr; if (prev == NULL) { panic("jnl: modify block end: no way man, prev == NULL?!?, jnl %p, bp %p\n", jnl, bp); } @@ -2641,9 +2784,9 @@ journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(struct buf * // through prev->binfo[0].bnum. that's a skanky way to do things but // avoids having yet another linked list of small data structures to manage. - if (kmem_alloc(kernel_map, (vm_offset_t *)&nblhdr, tr->tbuffer_size)) { + if (kmem_alloc_kobject(kernel_map, (vm_offset_t *)&nblhdr, tr->tbuffer_size)) { panic("jnl: end_tr: no space for new block tr @ %p (total bytes: %d)!\n", - tr, tr->total_bytes); + tr, tr->total_bytes); } // journal replay code checksum check depends on this. @@ -2667,25 +2810,15 @@ journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(struct buf * blhdr = nblhdr; tbuffer_offset = jnl->jhdr->blhdr_size; i = 1; - } + } - if ((i+1) > blhdr->max_blocks) { + if ((i+1) > blhdr->max_blocks) { panic("jnl: modify_block_end: i = %d, max_blocks %d\n", i, blhdr->max_blocks); - } - - // if the function pointer is not set then copy the - // block of data now. if the function pointer is set - // the copy will happen after calling the callback in - // end_transaction() just before it goes to disk. 
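(Aside, not part of the patch: the hunks above rework the journal_modify_block_* path, so a compressed view of the client-side protocol may help review. The sketch below is illustrative only; my_fs_update and its minimal error handling are hypothetical, while the journal_* and buf_* calls are the real interfaces this file exports.)

	#include <sys/buf.h>
	#include <sys/errno.h>
	#include <vfs/vfs_journal.h>

	/* illustrative sketch: one metadata update wrapped in a transaction */
	static int
	my_fs_update(journal *jnl, vnode_t vp, daddr64_t blkno, int blksize)
	{
		buf_t	bp;
		int	error;

		if ((error = journal_start_transaction(jnl)) != 0)
			return error;

		bp = buf_getblk(vp, blkno, blksize, 0, 0, BLK_META);

		/* declare intent: the journal write-protects the buffer (B_LOCKED) */
		if (journal_modify_block_start(jnl, bp) != 0) {
			journal_end_transaction(jnl);
			return EIO;
		}

		/* ... mutate the metadata through buf_dataptr(bp) ... */

		/* NULL callback: the journal simply copies the data when the
		 * transaction is ended */
		journal_modify_block_end(jnl, bp, NULL, NULL);

		return journal_end_transaction(jnl);
	}
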
- // - if (func == NULL) { - blkptr = (char *)&((char *)blhdr)[tbuffer_offset]; - memcpy(blkptr, (char *)0 + buf_dataptr(bp), buf_size(bp)); } - // if this is true then this is a new block we haven't seen - if (i >= blhdr->num_blocks) { + // if this is true then this is a new block we haven't seen + if (i >= blhdr->num_blocks) { int bsize; vnode_t vp; @@ -2695,8 +2828,9 @@ journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(struct buf * blhdr->binfo[i].bnum = (off_t)(buf_blkno(bp)); blhdr->binfo[i].u.bp = bp; + if (func) { - void *old_func=NULL, *old_arg=NULL; + void (*old_func)(buf_t, void *)=NULL, *old_arg=NULL; buf_setfilter(bp, func, arg, &old_func, &old_arg); if (old_func != NULL && old_func != func) { @@ -2708,48 +2842,48 @@ journal_modify_block_end(journal *jnl, struct buf *bp, void (*func)(struct buf * tr->total_bytes += bsize; blhdr->num_blocks++; - } - buf_bdwrite(bp); + } + buf_bdwrite(bp); - return 0; + return 0; } int journal_kill_block(journal *jnl, struct buf *bp) { - int i; - int bflags; - block_list_header *blhdr; - transaction *tr; + int i; + int bflags; + block_list_header *blhdr; + transaction *tr; - CHECK_JOURNAL(jnl); + CHECK_JOURNAL(jnl); - if (jnl->flags & JOURNAL_INVALID) { + free_old_stuff(jnl); + + if (jnl->flags & JOURNAL_INVALID) { return EINVAL; - } + } - tr = jnl->active_tr; - CHECK_TRANSACTION(tr); + tr = jnl->active_tr; + CHECK_TRANSACTION(tr); - if (jnl->owner != current_thread()) { + if (jnl->owner != current_thread()) { panic("jnl: modify_block_end: called w/out a transaction! jnl %p, owner %p, curact %p\n", - jnl, jnl->owner, current_thread()); - } - - free_old_stuff(jnl); + jnl, jnl->owner, current_thread()); + } - bflags = buf_flags(bp); + bflags = buf_flags(bp); - if ( !(bflags & B_LOCKED)) - panic("jnl: modify_block_end: called with bp not B_LOCKED"); + if ( !(bflags & B_LOCKED)) + panic("jnl: modify_block_end: called with bp not B_LOCKED"); - /* - * bp must be BL_BUSY and B_LOCKED - */ - // first check if it's already part of this transaction - for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) { + /* + * bp must be BL_BUSY and B_LOCKED + * first check if it's already part of this transaction + */ + for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) { - for(i=1; i < blhdr->num_blocks; i++) { + for (i = 1; i < blhdr->num_blocks; i++) { if (bp == blhdr->binfo[i].u.bp) { vnode_t vp; @@ -2783,9 +2917,38 @@ journal_kill_block(journal *jnl, struct buf *bp) if (i < blhdr->num_blocks) { break; } - } + } - return 0; + return 0; +} + + +/* +;________________________________________________________________________________ +; +; Routine: journal_trim_set_callback +; +; Function: Provide the journal with a routine to be called back when a +; TRIM has (or would have) been issued to the device. That +; is, the transaction has been flushed to the device, and the +; blocks freed by the transaction are now safe for reuse. +; +; CAUTION: If the journal becomes invalid (eg., due to an I/O +; error when trying to write to the journal), this callback +; will stop getting called, even if extents got freed before +; the journal became invalid! +; +; Input Arguments: +; jnl - The journal structure for the filesystem. +; callback - The function to call when the TRIM is complete. +; arg - An argument to be passed to callback. 
+;________________________________________________________________________________ +*/ +__private_extern__ void +journal_trim_set_callback(journal *jnl, jnl_trim_callback_t callback, void *arg) +{ + jnl->trim_callback = callback; + jnl->trim_callback_arg = arg; } @@ -2802,7 +2965,7 @@ journal_kill_block(journal *jnl, struct buf *bp) ; grown successfully. ; ; Input Arguments: -; tr - The transaction containing the extent list. +; trim - The trim list to be resized. ; ; Output: ; (result) - ENOMEM or 0. @@ -2813,53 +2976,107 @@ journal_kill_block(journal *jnl, struct buf *bp) ;________________________________________________________________________________ */ static int -journal_trim_realloc(transaction *tr) +trim_realloc(struct jnl_trim_list *trim) { - if (CONFIG_HFS_TRIM) { - void *new_extents; - uint32_t new_allocated_count; - - new_allocated_count = tr->trim.allocated_count + JOURNAL_DEFAULT_TRIM_EXTENTS; - new_extents = kalloc(new_allocated_count * sizeof(dk_extent_t)); - if (new_extents == NULL) { - printf("journal_trim_realloc: unable to grow extent list!\n"); - /* - * Since we could be called when allocating space previously marked - * to be trimmed, we need to empty out the list to be safe. - */ - tr->trim.extent_count = 0; - return ENOMEM; - } - - /* Copy the old extent list to the newly allocated list. */ - if (tr->trim.extents != NULL) { - memmove(new_extents, - tr->trim.extents, - tr->trim.allocated_count * sizeof(dk_extent_t)); - kfree(tr->trim.extents, - tr->trim.allocated_count * sizeof(dk_extent_t)); - } - - tr->trim.allocated_count = new_allocated_count; - tr->trim.extents = new_extents; + void *new_extents; + uint32_t new_allocated_count; + + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_START, trim, 0, trim->allocated_count, trim->extent_count, 0); + + new_allocated_count = trim->allocated_count + JOURNAL_DEFAULT_TRIM_EXTENTS; + new_extents = kalloc(new_allocated_count * sizeof(dk_extent_t)); + if (new_extents == NULL) { + printf("jnl: trim_realloc: unable to grow extent list!\n"); + /* + * Since we could be called when allocating space previously marked + * to be trimmed, we need to empty out the list to be safe. + */ + trim->extent_count = 0; + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_END, ENOMEM, 0, trim->allocated_count, 0, 0); + return ENOMEM; } + + /* Copy the old extent list to the newly allocated list. */ + if (trim->extents != NULL) { + memmove(new_extents, + trim->extents, + trim->allocated_count * sizeof(dk_extent_t)); + kfree(trim->extents, + trim->allocated_count * sizeof(dk_extent_t)); + } + + trim->allocated_count = new_allocated_count; + trim->extents = new_extents; + + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REALLOC | DBG_FUNC_END, 0, 0, new_allocated_count, trim->extent_count, 0); + return 0; } +/* +;________________________________________________________________________________ +; +; Routine: trim_search_extent +; +; Function: Search the given extent list to see if any of its extents +; overlap the given extent. +; +; Input Arguments: +; trim - The trim list to be searched. +; offset - The first byte of the range to be searched for. +; length - The number of bytes of the extent being searched for. +; +; Output: +; (result) - TRUE if one or more extents overlap, FALSE otherwise. 
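(Aside, not part of the patch: trim_search_extent, added just below, relies on the trim list being kept sorted by offset and non-overlapping, which is what lets a single binary search answer the overlap question in O(log n). A standalone userland restatement of the same test, with made-up sample extents:)

	#include <stdint.h>
	#include <stdio.h>

	typedef struct { uint64_t offset; uint64_t length; } extent_t;

	/* returns nonzero iff [offset, offset+length) overlaps any extent */
	static int
	overlaps(const extent_t *extents, uint32_t count, uint64_t offset, uint64_t length)
	{
		uint64_t end = offset + length;
		uint32_t lower = 0, upper = count, middle;

		while (lower < upper) {
			middle = (lower + upper) / 2;
			if (extents[middle].offset >= end)
				upper = middle;			/* probe lies entirely above the query */
			else if (extents[middle].offset + extents[middle].length <= offset)
				lower = middle + 1;		/* probe lies entirely below the query */
			else
				return 1;			/* overlap found */
		}
		return 0;
	}

	int main(void)
	{
		extent_t list[] = { { 0, 512 }, { 4096, 1024 } };

		printf("%d %d\n",
		       overlaps(list, 2, 256, 16),	/* 1: inside the first extent */
		       overlaps(list, 2, 2048, 512));	/* 0: falls in the gap */
		return 0;
	}
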
+;________________________________________________________________________________
+*/
+static int
+trim_search_extent(struct jnl_trim_list *trim, uint64_t offset, uint64_t length)
+{
+	uint64_t end = offset + length;
+	uint32_t lower = 0;			/* Lowest index to search */
+	uint32_t upper = trim->extent_count;	/* Highest index to search + 1 */
+	uint32_t middle;
+
+	/* A binary search over the extent list. */
+	while (lower < upper) {
+		middle = (lower + upper) / 2;
+
+		if (trim->extents[middle].offset >= end)
+			upper = middle;
+		else if (trim->extents[middle].offset + trim->extents[middle].length <= offset)
+			lower = middle + 1;
+		else
+			return TRUE;
+	}
+
+	return FALSE;
+}
+
+
 /*
 ;________________________________________________________________________________
 ;
 ; Routine:	journal_trim_add_extent
 ;
-; Function:	Make note of a range of bytes that should be unmapped
-;		(trimmed).  That is, the given range of bytes no longer have
-;		useful content, and the device can unmap the previous
-;		contents.  For example, a solid state disk may reuse the
-;		underlying storage for other blocks.
+; Function:	Keep track of extents that have been freed as part of this
+;		transaction.  If the underlying device supports TRIM (UNMAP),
+;		then those extents will be trimmed/unmapped once the
+;		transaction has been written to the journal.  (For example,
+;		SSDs can support trim/unmap and avoid having to recopy those
+;		blocks when doing wear leveling, and may reuse the same
+;		physical blocks for different logical blocks.)
 ;
-;		The extent will be unmapped after the transaction is written
-;		to the journal.
+;		HFS also uses this, in combination with journal_trim_set_callback,
+;		to add recently freed extents to its free extent cache, but
+;		only after the transaction that freed them is committed to
+;		disk.  (This reduces the chance of overwriting live data in
+;		a way that causes data loss if a transaction never gets
+;		written to the journal.)
 ;
 ; Input Arguments:
 ;	jnl		- The journal for the volume containing the byte range.
@@ -2870,113 +3087,114 @@ journal_trim_realloc(transaction *tr)
 __private_extern__ int
 journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length)
 {
-	if (CONFIG_HFS_TRIM) {
-		uint64_t end;
-		transaction *tr;
-		dk_extent_t *extent;
-		uint32_t insert_index;
-		uint32_t replace_count;
-
-		CHECK_JOURNAL(jnl);
+	uint64_t end;
+	transaction *tr;
+	dk_extent_t *extent;
+	uint32_t insert_index;
+	uint32_t replace_count;
 
-		if (jnl->flags & JOURNAL_TRIM_ERR) {
-			/*
-			 * A previous trim failed, so we have disabled trim for this volume
-			 * for as long as it remains mounted.
-			 */
-			return 0;
-		}
-
-		if (jnl->flags & JOURNAL_INVALID) {
-			return EINVAL;
-		}
+	CHECK_JOURNAL(jnl);
+
+	/* TODO: Is it OK to manipulate the trim list even if JOURNAL_INVALID is set?  I think so... */
+	if (jnl->flags & JOURNAL_INVALID) {
+		return EINVAL;
+	}
+
+	tr = jnl->active_tr;
+	CHECK_TRANSACTION(tr);
+
+	if (jnl_kdebug)
+		KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_START, jnl, offset, length, tr->trim.extent_count, 0);
+
+	if (jnl->owner != current_thread()) {
+		panic("jnl: trim_add_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n",
+		      jnl, jnl->owner, current_thread());
+	}
+
+	free_old_stuff(jnl);
 
-		tr = jnl->active_tr;
-		CHECK_TRANSACTION(tr);
+	end = offset + length;
 
-		if (jnl->owner != current_thread()) {
-			panic("jnl: trim_add_extent: called w/out a transaction! 
jnl %p, owner %p, curact %p\n", - jnl, jnl->owner, current_thread()); - } + /* + * Find the range of existing extents that can be combined with the + * input extent. We start by counting the number of extents that end + * strictly before the input extent, then count the number of extents + * that overlap or are contiguous with the input extent. + */ + extent = tr->trim.extents; + insert_index = 0; + while (insert_index < tr->trim.extent_count && extent->offset + extent->length < offset) { + ++insert_index; + ++extent; + } + replace_count = 0; + while (insert_index + replace_count < tr->trim.extent_count && extent->offset <= end) { + ++replace_count; + ++extent; + } - free_old_stuff(jnl); - - end = offset + length; - - /* - * Find the range of existing extents that can be combined with the - * input extent. We start by counting the number of extents that end - * strictly before the input extent, then count the number of extents - * that overlap or are contiguous with the input extent. - */ - extent = tr->trim.extents; - insert_index = 0; - while (insert_index < tr->trim.extent_count && extent->offset + extent->length < offset) { - ++insert_index; - ++extent; - } - replace_count = 0; - while (insert_index + replace_count < tr->trim.extent_count && extent->offset <= end) { - ++replace_count; - ++extent; + /* + * If none of the existing extents can be combined with the input extent, + * then just insert it in the list (before item number insert_index). + */ + if (replace_count == 0) { + /* If the list was already full, we need to grow it. */ + if (tr->trim.extent_count == tr->trim.allocated_count) { + if (trim_realloc(&tr->trim) != 0) { + printf("jnl: trim_add_extent: out of memory!"); + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, ENOMEM, 0, 0, tr->trim.extent_count, 0); + return ENOMEM; + } } - /* - * If none of the existing extents can be combined with the input extent, - * then just insert it in the list (before item number insert_index). - */ - if (replace_count == 0) { - /* If the list was already full, we need to grow it. */ - if (tr->trim.extent_count == tr->trim.allocated_count) { - if (journal_trim_realloc(tr) != 0) { - printf("jnl: trim_add_extent: out of memory!"); - return ENOMEM; - } - } - - /* Shift any existing extents with larger offsets. */ - if (insert_index < tr->trim.extent_count) { - memmove(&tr->trim.extents[insert_index+1], - &tr->trim.extents[insert_index], - (tr->trim.extent_count - insert_index) * sizeof(dk_extent_t)); - } - tr->trim.extent_count++; - - /* Store the new extent in the list. */ - tr->trim.extents[insert_index].offset = offset; - tr->trim.extents[insert_index].length = length; - - /* We're done. */ - return 0; + /* Shift any existing extents with larger offsets. */ + if (insert_index < tr->trim.extent_count) { + memmove(&tr->trim.extents[insert_index+1], + &tr->trim.extents[insert_index], + (tr->trim.extent_count - insert_index) * sizeof(dk_extent_t)); } + tr->trim.extent_count++; - /* - * Update extent number insert_index to be the union of the input extent - * and all of the replaced extents. - */ - if (tr->trim.extents[insert_index].offset < offset) - offset = tr->trim.extents[insert_index].offset; - extent = &tr->trim.extents[insert_index + replace_count - 1]; - if (extent->offset + extent->length > end) - end = extent->offset + extent->length; + /* Store the new extent in the list. 
*/ tr->trim.extents[insert_index].offset = offset; - tr->trim.extents[insert_index].length = end - offset; + tr->trim.extents[insert_index].length = length; - /* - * If we were replacing more than one existing extent, then shift any - * extents with larger offsets, and update the count of extents. - * - * We're going to leave extent #insert_index alone since it was just updated, above. - * We need to move extents from index (insert_index + replace_count) through the end of - * the list by (replace_count - 1) positions so that they overwrite extent #(insert_index + 1). - */ - if (replace_count > 1 && (insert_index + replace_count) < tr->trim.extent_count) { - memmove(&tr->trim.extents[insert_index + 1], - &tr->trim.extents[insert_index + replace_count], - (tr->trim.extent_count - insert_index - replace_count) * sizeof(dk_extent_t)); - } - tr->trim.extent_count -= replace_count - 1; - } + /* We're done. */ + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, 0, 0, 0, tr->trim.extent_count, 0); + return 0; + } + + /* + * Update extent number insert_index to be the union of the input extent + * and all of the replaced extents. + */ + if (tr->trim.extents[insert_index].offset < offset) + offset = tr->trim.extents[insert_index].offset; + extent = &tr->trim.extents[insert_index + replace_count - 1]; + if (extent->offset + extent->length > end) + end = extent->offset + extent->length; + tr->trim.extents[insert_index].offset = offset; + tr->trim.extents[insert_index].length = end - offset; + + /* + * If we were replacing more than one existing extent, then shift any + * extents with larger offsets, and update the count of extents. + * + * We're going to leave extent #insert_index alone since it was just updated, above. + * We need to move extents from index (insert_index + replace_count) through the end of + * the list by (replace_count - 1) positions so that they overwrite extent #(insert_index + 1). + */ + if (replace_count > 1 && (insert_index + replace_count) < tr->trim.extent_count) { + memmove(&tr->trim.extents[insert_index + 1], + &tr->trim.extents[insert_index + replace_count], + (tr->trim.extent_count - insert_index - replace_count) * sizeof(dk_extent_t)); + } + tr->trim.extent_count -= replace_count - 1; + + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_ADD | DBG_FUNC_END, 0, 0, 0, tr->trim.extent_count, 0); return 0; } @@ -2984,153 +3202,217 @@ journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length) /* ;________________________________________________________________________________ ; -; Routine: journal_trim_remove_extent +; Routine: trim_remove_extent ; -; Function: Make note of a range of bytes, some of which may have previously -; been passed to journal_trim_add_extent, is now in use on the -; volume. The given bytes will be not be trimmed as part of -; this transaction. +; Function: Indicate that a range of bytes, some of which may have previously +; been passed to journal_trim_add_extent, is now allocated. +; Any overlapping ranges currently in the journal's trim list will +; be removed. If the underlying device supports TRIM (UNMAP), then +; these extents will not be trimmed/unmapped when the transaction +; is written to the journal. +; +; HFS also uses this to prevent newly allocated space from being +; added to its free extent cache (if some portion of the newly +; allocated space was recently freed). ; ; Input Arguments: -; jnl - The journal for the volume containing the byte range. +; trim - The trim list to update. 
; offset - The first byte of the range to be trimmed. ; length - The number of bytes of the extent being trimmed. ;________________________________________________________________________________ */ -__private_extern__ int -journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length) +static int +trim_remove_extent(struct jnl_trim_list *trim, uint64_t offset, uint64_t length) { - if (CONFIG_HFS_TRIM) { - u_int64_t end; - dk_extent_t *extent; - transaction *tr; - u_int32_t keep_before; - u_int32_t keep_after; - - CHECK_JOURNAL(jnl); - - if (jnl->flags & JOURNAL_TRIM_ERR) { - /* - * A previous trim failed, so we have disabled trim for this volume - * for as long as it remains mounted. - */ - return 0; - } - - if (jnl->flags & JOURNAL_INVALID) { - return EINVAL; - } - - tr = jnl->active_tr; - CHECK_TRANSACTION(tr); + u_int64_t end; + dk_extent_t *extent; + u_int32_t keep_before; + u_int32_t keep_after; - if (jnl->owner != current_thread()) { - panic("jnl: trim_remove_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n", - jnl, jnl->owner, current_thread()); - } + end = offset + length; - free_old_stuff(jnl); + /* + * Find any existing extents that start before or end after the input + * extent. These extents will be modified if they overlap the input + * extent. Other extents between them will be deleted. + */ + extent = trim->extents; + keep_before = 0; + while (keep_before < trim->extent_count && extent->offset < offset) { + ++keep_before; + ++extent; + } + keep_after = keep_before; + if (keep_after > 0) { + /* See if previous extent extends beyond both ends of input extent. */ + --keep_after; + --extent; + } + while (keep_after < trim->extent_count && (extent->offset + extent->length) <= end) { + ++keep_after; + ++extent; + } - end = offset + length; + /* + * When we get here, the first keep_before extents (0 .. keep_before-1) + * start before the input extent, and extents (keep_after .. extent_count-1) + * end after the input extent. We'll need to keep, all of those extents, + * but possibly modify #(keep_before-1) and #keep_after to remove the portion + * that overlaps with the input extent. + */ - /* - * Find any existing extents that start before or end after the input - * extent. These extents will be modified if they overlap the input - * extent. Other extents between them will be deleted. - */ - extent = tr->trim.extents; - keep_before = 0; - while (keep_before < tr->trim.extent_count && extent->offset < offset) { - ++keep_before; - ++extent; - } - keep_after = keep_before; - if (keep_after > 0) { - /* See if previous extent extends beyond both ends of input extent. */ - --keep_after; - --extent; - } - while (keep_after < tr->trim.extent_count && (extent->offset + extent->length) <= end) { - ++keep_after; - ++extent; + /* + * Does the input extent start after and end before the same existing + * extent? If so, we have to "punch a hole" in that extent and convert + * it to two separate extents. + */ + if (keep_before > keep_after) { + /* If the list was already full, we need to grow it. */ + if (trim->extent_count == trim->allocated_count) { + if (trim_realloc(trim) != 0) { + printf("jnl: trim_remove_extent: out of memory!"); + return ENOMEM; + } } /* - * When we get here, the first keep_before extents (0 .. keep_before-1) - * start before the input extent, and extents (keep_after .. extent_count-1) - * end after the input extent. 
We'll need to keep, all of those extents, - * but possibly modify #(keep_before-1) and #keep_after to remove the portion - * that overlaps with the input extent. + * Make room for a new extent by shifting extents #keep_after and later + * down by one extent. When we're done, extents #keep_before and + * #keep_after will be identical, and we can fall through to removing + * the portion that overlaps the input extent. */ + memmove(&trim->extents[keep_before], + &trim->extents[keep_after], + (trim->extent_count - keep_after) * sizeof(dk_extent_t)); + ++trim->extent_count; + ++keep_after; /* - * Does the input extent start after and end before the same existing - * extent? If so, we have to "punch a hole" in that extent and convert - * it to two separate extents. + * Fall through. We now have the case where the length of extent + * #(keep_before - 1) needs to be updated, and the start of extent + * #(keep_after) needs to be updated. */ - if (keep_before > keep_after) { - /* If the list was already full, we need to grow it. */ - if (tr->trim.extent_count == tr->trim.allocated_count) { - if (journal_trim_realloc(tr) != 0) { - printf("jnl: trim_remove_extent: out of memory!"); - return ENOMEM; - } - } - - /* - * Make room for a new extent by shifting extents #keep_after and later - * down by one extent. When we're done, extents #keep_before and - * #keep_after will be identical, and we can fall through to removing - * the portion that overlaps the input extent. - */ - memmove(&tr->trim.extents[keep_before], - &tr->trim.extents[keep_after], - (tr->trim.extent_count - keep_after) * sizeof(dk_extent_t)); - ++tr->trim.extent_count; - ++keep_after; - - /* - * Fall through. We now have the case where the length of extent - * #(keep_before - 1) needs to be updated, and the start of extent - * #(keep_after) needs to be updated. - */ + } + + /* + * May need to truncate the end of extent #(keep_before - 1) if it overlaps + * the input extent. + */ + if (keep_before > 0) { + extent = &trim->extents[keep_before - 1]; + if (extent->offset + extent->length > offset) { + extent->length = offset - extent->offset; } - - /* - * May need to truncate the end of extent #(keep_before - 1) if it overlaps - * the input extent. - */ - if (keep_before > 0) { - extent = &tr->trim.extents[keep_before - 1]; - if (extent->offset + extent->length > offset) { - extent->length = offset - extent->offset; - } + } + + /* + * May need to update the start of extent #(keep_after) if it overlaps the + * input extent. + */ + if (keep_after < trim->extent_count) { + extent = &trim->extents[keep_after]; + if (extent->offset < end) { + extent->length = extent->offset + extent->length - end; + extent->offset = end; } + } + + /* + * If there were whole extents that overlapped the input extent, get rid + * of them by shifting any following extents, and updating the count. + */ + if (keep_after > keep_before && keep_after < trim->extent_count) { + memmove(&trim->extents[keep_before], + &trim->extents[keep_after], + (trim->extent_count - keep_after) * sizeof(dk_extent_t)); + } + trim->extent_count -= keep_after - keep_before; + + return 0; +} + + +/* +;________________________________________________________________________________ +; +; Routine: journal_trim_remove_extent +; +; Function: Make note of a range of bytes, some of which may have previously +; been passed to journal_trim_add_extent, is now in use on the +; volume. 
The given bytes will not be trimmed as part of
+;		this transaction, or a pending trim of a transaction being
+;		asynchronously flushed.
+;
+; Input Arguments:
+;	jnl		- The journal for the volume containing the byte range.
+;	offset		- The first byte of the range to be trimmed.
+;	length		- The number of bytes of the extent being trimmed.
+;________________________________________________________________________________
+*/
+__private_extern__ int
+journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length)
+{
+	int error = 0;
+	transaction *tr;
+
+	CHECK_JOURNAL(jnl);
+
+	/* TODO: Is it OK to manipulate the trim list even if JOURNAL_INVALID is set?  I think so... */
+	if (jnl->flags & JOURNAL_INVALID) {
+		return EINVAL;
+	}
+
+	tr = jnl->active_tr;
+	CHECK_TRANSACTION(tr);
+
+	if (jnl_kdebug)
+		KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE | DBG_FUNC_START, jnl, offset, length, tr->trim.extent_count, 0);
+
+	if (jnl->owner != current_thread()) {
+		panic("jnl: trim_remove_extent: called w/out a transaction! jnl %p, owner %p, curact %p\n",
+		      jnl, jnl->owner, current_thread());
+	}
+
+	free_old_stuff(jnl);
+
+	error = trim_remove_extent(&tr->trim, offset, length);
+	if (error == 0) {
+		int found = FALSE;
 		/*
-		 * May need to update the start of extent #(keep_after) if it overlaps the
-		 * input extent.
+		 * See if a pending trim has any extents that overlap with the
+		 * one we were given.
 		 */
+		lck_rw_lock_shared(&jnl->trim_lock);
+		if (jnl->async_trim != NULL)
+			found = trim_search_extent(jnl->async_trim, offset, length);
+		lck_rw_unlock_shared(&jnl->trim_lock);
 
-		if (keep_after < tr->trim.extent_count) {
-			extent = &tr->trim.extents[keep_after];
-			if (extent->offset < end) {
-				extent->length = extent->offset + extent->length - end;
-				extent->offset = end;
-			}
-		}
-
-		/*
-		 * If there were whole extents that overlapped the input extent, get rid
-		 * of them by shifting any following extents, and updating the count.
-		 */
-		if (keep_after > keep_before && keep_after < tr->trim.extent_count) {
-			memmove(&tr->trim.extents[keep_before],
-				&tr->trim.extents[keep_after],
-				(tr->trim.extent_count - keep_after) * sizeof(dk_extent_t));
+		if (found) {
+			/*
+			 * There was an overlap, so avoid trimming the extent we
+			 * just allocated.  (Otherwise, it might get trimmed after
+			 * we've written to it, which will cause that data to be
+			 * corrupted.) 
+ */ + uint32_t async_extent_count = 0; + + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING | DBG_FUNC_START, jnl, offset, length, 0, 0); + lck_rw_lock_exclusive(&jnl->trim_lock); + if (jnl->async_trim != NULL) { + error = trim_remove_extent(jnl->async_trim, offset, length); + async_extent_count = jnl->async_trim->extent_count; + } + lck_rw_unlock_exclusive(&jnl->trim_lock); + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE_PENDING | DBG_FUNC_END, error, 0, 0, async_extent_count, 0); } - tr->trim.extent_count -= keep_after - keep_before; } - return 0; + + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_REMOVE | DBG_FUNC_END, error, 0, 0, tr->trim.extent_count, 0); + return error; } @@ -3139,29 +3421,70 @@ journal_trim_flush(journal *jnl, transaction *tr) { int errno = 0; - if (CONFIG_HFS_TRIM) { - if ((jnl->flags & JOURNAL_TRIM_ERR) == 0 && tr->trim.extent_count > 0) { - dk_unmap_t unmap; - - bzero(&unmap, sizeof(unmap)); + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_START, jnl, tr, 0, tr->trim.extent_count, 0); + + if (tr->trim.extent_count > 0) { + dk_unmap_t unmap; + + bzero(&unmap, sizeof(unmap)); + lck_rw_lock_shared(&jnl->trim_lock); + if (CONFIG_HFS_TRIM && (jnl->flags & JOURNAL_USE_UNMAP)) { unmap.extents = tr->trim.extents; unmap.extentsCount = tr->trim.extent_count; + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_START, jnl, tr, 0, tr->trim.extent_count, 0); errno = VNOP_IOCTL(jnl->fsdev, DKIOCUNMAP, (caddr_t)&unmap, FWRITE, vfs_context_kernel()); + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_UNMAP | DBG_FUNC_END, errno, 0, 0, 0, 0); if (errno) { printf("jnl: error %d from DKIOCUNMAP (extents=%lx, count=%u); disabling trim for %s\n", - errno, (unsigned long) (tr->trim.extents), tr->trim.extent_count, + errno, (unsigned long) (unmap.extents), unmap.extentsCount, jnl->jdev_name); - jnl->flags |= JOURNAL_TRIM_ERR; + jnl->flags &= ~JOURNAL_USE_UNMAP; } } - if (tr->trim.extents) { - kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t)); - tr->trim.allocated_count = 0; - tr->trim.extent_count = 0; - tr->trim.extents = NULL; - } + + /* + * Call back into the file system to tell them that we have + * trimmed some extents and that they can now be reused. + * + * CAUTION: If the journal becomes invalid (eg., due to an I/O + * error when trying to write to the journal), this callback + * will stop getting called, even if extents got freed before + * the journal became invalid! + */ + if (jnl->trim_callback) + jnl->trim_callback(jnl->trim_callback_arg, tr->trim.extent_count, tr->trim.extents); + + lck_rw_unlock_shared(&jnl->trim_lock); + } + + /* + * If the transaction we're flushing was the async transaction, then + * tell the current transaction that there is no pending trim + * any more. + * + * NOTE: Since we released the lock, another thread could have + * removed one or more extents from our list. That's not a + * problem since any writes to the re-allocated blocks + * would get sent to the device after the DKIOCUNMAP. 
+ */ + lck_rw_lock_exclusive(&jnl->trim_lock); + if (jnl->async_trim == &tr->trim) + jnl->async_trim = NULL; + lck_rw_unlock_exclusive(&jnl->trim_lock); + + if (tr->trim.extents) { + kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t)); + tr->trim.allocated_count = 0; + tr->trim.extent_count = 0; + tr->trim.extents = NULL; } + if (jnl_kdebug) + KERNEL_DEBUG_CONSTANT(DBG_JOURNAL_TRIM_FLUSH | DBG_FUNC_END, errno, 0, 0, 0, 0); + return errno; } @@ -3169,23 +3492,23 @@ journal_trim_flush(journal *jnl, transaction *tr) static int journal_binfo_cmp(const void *a, const void *b) { - const block_info *bi_a = (const struct block_info *)a; - const block_info *bi_b = (const struct block_info *)b; - daddr64_t res; + const block_info *bi_a = (const struct block_info *)a; + const block_info *bi_b = (const struct block_info *)b; + daddr64_t res; - if (bi_a->bnum == (off_t)-1) { + if (bi_a->bnum == (off_t)-1) { return 1; - } - if (bi_b->bnum == (off_t)-1) { + } + if (bi_b->bnum == (off_t)-1) { return -1; - } + } - // don't have to worry about negative block - // numbers so this is ok to do. - // - res = (buf_blkno(bi_a->u.bp) - buf_blkno(bi_b->u.bp)); + // don't have to worry about negative block + // numbers so this is ok to do. + // + res = (buf_blkno(bi_a->u.bp) - buf_blkno(bi_b->u.bp)); - return (int)res; + return (int)res; } @@ -3220,27 +3543,27 @@ journal_binfo_cmp(const void *a, const void *b) * -1 An error occurred. The journal is marked invalid. */ static int -end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg) +end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void *callback_arg, boolean_t drop_lock, boolean_t must_wait) { - int i, ret, amt; - errno_t errno; - off_t end; - journal *jnl = tr->jnl; - struct buf *bp, **bparray; - block_list_header *blhdr=NULL, *next=NULL; - size_t tbuffer_offset; + block_list_header *blhdr=NULL, *next=NULL; + int i, ret_val = 0; + errno_t errno; + journal *jnl = tr->jnl; + struct buf *bp; + size_t tbuffer_offset; + boolean_t drop_lock_early; if (jnl->cur_tr) { panic("jnl: jnl @ %p already has cur_tr %p, new tr: %p\n", jnl, jnl->cur_tr, tr); } - // if there weren't any modified blocks in the transaction - // just save off the transaction pointer and return. - if (tr->total_bytes == jnl->jhdr->blhdr_size) { + // if there weren't any modified blocks in the transaction + // just save off the transaction pointer and return. + if (tr->total_bytes == jnl->jhdr->blhdr_size) { jnl->cur_tr = tr; - return 0; - } + goto done; + } // if our transaction buffer isn't very full, just hang // on to it and don't actually flush anything. this is @@ -3248,174 +3571,314 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void // transaction buffer if it's full or if we have more than // one of them so we don't start hogging too much memory. // - // We also check the number of extents waiting to be trimmed. - // If it is small enough, then keep accumulating more (so we - // can reduce the overhead of trimming). If there was a - // prior trim error, then we stop issuing trims for this + // We also check the device supports UNMAP/TRIM, and if so, + // the number of extents waiting to be trimmed. If it is + // small enough, then keep accumulating more (so we can + // reduce the overhead of trimming). If there was a prior + // trim error, then we stop issuing trims for this // volume, so we can also coalesce transactions. 
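(Aside, not part of the patch: the condition that follows is the group-commit heuristic this hunk adjusts. Restated as a standalone predicate, with every parameter a stand-in for the journal/transaction field of the same meaning:)

	#include <stdbool.h>
	#include <stdint.h>

	/* illustrative: may the transaction keep accumulating instead of flushing? */
	static bool
	can_defer_flush(bool force_it, bool no_group_commit, bool use_unmap,
	                int num_blhdrs, int total_bytes, int tbuffer_size,
	                uint32_t trim_extents, uint32_t trim_flush_limit)
	{
		return !force_it
		    && !no_group_commit
		    && num_blhdrs < 3
		    /* roughly: the last transaction buffer is under 7/8 full */
		    && total_bytes <= (tbuffer_size * num_blhdrs - tbuffer_size / 8)
		    /* ...and, if the device can UNMAP, the trim list is still small */
		    && (!use_unmap || trim_extents < trim_flush_limit);
	}
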
-	//
+	//
 	if (   force_it == 0
 	    && (jnl->flags & JOURNAL_NO_GROUP_COMMIT) == 0
 	    && tr->num_blhdrs < 3
 	    && (tr->total_bytes <= ((tr->tbuffer_size*tr->num_blhdrs) - tr->tbuffer_size/8))
-	    && ((jnl->flags & JOURNAL_TRIM_ERR) || (tr->trim.extent_count < jnl_trim_flush_limit))) {
+	    && (!(jnl->flags & JOURNAL_USE_UNMAP) || (tr->trim.extent_count < jnl_trim_flush_limit))) {
 
 		jnl->cur_tr = tr;
-		return 0;
-	}
+		goto done;
+	}
+	KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_START, jnl, tr, drop_lock, must_wait, 0);
 
-	// if we're here we're going to flush the transaction buffer to disk.
-	// make sure there is room in the journal first.
-	check_free_space(jnl, tr->total_bytes);
+	lock_condition(jnl, &jnl->flushing, "end_transaction");
 
-	// range check the end index
-	if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) {
-		panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n",
-		      jnl->jhdr->end, jnl->jhdr->size);
-	}
+	/*
+	 * if the previous 'finish_end_transaction' was being run
	 * asynchronously, it could have encountered a condition
+	 * that caused it to mark the journal invalid... if that
+	 * occurred while we were waiting for it to finish, we
+	 * need to notice and abort the current transaction
+	 */
+	if ((jnl->flags & JOURNAL_INVALID) || jnl->flush_aborted == TRUE) {
+		unlock_condition(jnl, &jnl->flushing);
 
-	// this transaction starts where the current journal ends
-	tr->journal_start = jnl->jhdr->end;
-	end = jnl->jhdr->end;
+		abort_transaction(jnl, tr);
+		ret_val = -1;
+		KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0);
+		goto done;
+	}
 
-	//
-	// if the first entry in old_start[] isn't free yet, loop calling the
-	// file system flush routine until it is (or we panic).
-	//
-	i = 0;
-	lock_oldstart(jnl);
-	while ((jnl->old_start[0] & 0x8000000000000000LL) != 0) {
-		if (jnl->flush) {
-			unlock_oldstart(jnl);
+	/*
	 * Store a pointer to this transaction's trim list so that
+	 * future transactions can find it.
+	 *
+	 * Note: if there are no extents in the trim list, then don't
+	 * bother saving the pointer since nothing can add new extents
+	 * to the list (and other threads/transactions only care if
+	 * there is a trim pending).
+	 */
+	lck_rw_lock_exclusive(&jnl->trim_lock);
+	if (jnl->async_trim != NULL)
+		panic("jnl: end_transaction: async_trim already non-NULL!");
+	if (tr->trim.extent_count > 0)
+		jnl->async_trim = &tr->trim;
+	lck_rw_unlock_exclusive(&jnl->trim_lock);
 
-			if (jnl->flush) {
-				jnl->flush(jnl->flush_arg);
-			}
+	/*
	 * snapshot the transaction sequence number while we are still behind
+	 * the journal lock since it will be bumped upon the start of the
+	 * next transaction group which may overlap the current journal flush...
+	 * we pass the snapshot into write_journal_header during the journal
+	 * flush so that it can write the correct version in the header...
+	 * because we hold the 'flushing' condition variable for the duration
+	 * of the journal flush, 'saved_sequence_num' remains stable
+	 */
+	jnl->saved_sequence_num = jnl->sequence_num;
 
-			// yield the cpu so others can get in to clear the lock bit
-			(void)tsleep((void *)jnl, PRIBIO, "jnl-old-start-sleep", 1);
+	/*
	 * if we're here we're going to flush the transaction buffer to disk.
+	 * 'check_free_space' will not return until there is enough free
+	 * space for this transaction in the journal and jnl->old_start[0]
+	 * is available for use
+	 */
+	KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_START, jnl, 0, 0, 0, 0);
 
-			lock_oldstart(jnl);
-		}
-		if (i++ >= 500) {
-			panic("jnl: transaction that started at 0x%llx is not completing! 
jnl %p\n", - jnl->old_start[0] & (~0x8000000000000000LL), jnl); - } + check_free_space(jnl, tr->total_bytes, &tr->delayed_header_write, jnl->saved_sequence_num); + + KERNEL_DEBUG(0xbbbbc030 | DBG_FUNC_END, jnl, tr->delayed_header_write, 0, 0, 0); + + // range check the end index + if (jnl->jhdr->end <= 0 || jnl->jhdr->end > jnl->jhdr->size) { + panic("jnl: end_transaction: end is bogus 0x%llx (sz 0x%llx)\n", + jnl->jhdr->end, jnl->jhdr->size); } + if (tr->delayed_header_write == TRUE) { + thread_t thread = THREAD_NULL; - // - // slide everyone else down and put our latest guy in the last - // entry in the old_start array - // - - /* Because old_start is locked above, we can cast away the volatile qualifier before passing it to memcpy. */ + lock_condition(jnl, &jnl->writing_header, "end_transaction"); + /* + * fire up a thread to write the journal header + * asynchronously... when it finishes, it will call + * unlock_condition... we can overlap the preparation of + * the log and buffers during this time + */ + kernel_thread_start((thread_continue_t)write_header_thread, jnl, &thread); + } else + jnl->write_header_failed = FALSE; + + + // this transaction starts where the current journal ends + tr->journal_start = jnl->jhdr->end; + + lock_oldstart(jnl); + /* + * Because old_start is locked above, we can cast away the volatile qualifier before passing it to memcpy. + * slide everyone else down and put our latest guy in the last + * entry in the old_start array + */ memcpy(__CAST_AWAY_QUALIFIER(&jnl->old_start[0], volatile, void *), __CAST_AWAY_QUALIFIER(&jnl->old_start[1], volatile, void *), sizeof(jnl->old_start)-sizeof(jnl->old_start[0])); jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] = tr->journal_start | 0x8000000000000000LL; unlock_oldstart(jnl); - // for each block, make sure that the physical block # is set - for(blhdr=tr->blhdr; blhdr; blhdr=next) { - char *blkptr; - + for (blhdr = tr->blhdr; blhdr; blhdr = next) { + char *blkptr; + buf_t sbp; + int32_t bsize; + tbuffer_offset = jnl->jhdr->blhdr_size; - for(i=1; i < blhdr->num_blocks; i++) { - daddr64_t blkno; - daddr64_t lblkno; - struct vnode *vp; - bp = blhdr->binfo[i].u.bp; + for (i = 1; i < blhdr->num_blocks; i++) { - // if this block has a callback function set, call - // it now and then copy the data from the bp into - // the journal. if (blhdr->binfo[i].bnum != (off_t)-1) { - void (*func)(struct buf *, void *); + void (*func)(buf_t, void *); void *arg; + bp = blhdr->binfo[i].u.bp; + if (bp == NULL) { panic("jnl: inconsistent binfo (NULL bp w/bnum %lld; jnl @ %p, tr %p)\n", blhdr->binfo[i].bnum, jnl, tr); } - - buf_setfilter(bp, NULL, NULL, (void **)&func, &arg); - - if (func) { - // acquire the bp here so that we can safely - // mess around with its data. buf_acquire() - // will return EAGAIN if the buffer was busy, - // so loop trying again. - do { - errno = buf_acquire(bp, 0, 0, 0); - } while (errno == EAGAIN); - - if (errno == 0) { + /* + * acquire the bp here so that we can safely + * mess around with its data. buf_acquire() + * will return EAGAIN if the buffer was busy, + * so loop trying again. + */ + do { + errno = buf_acquire(bp, BAC_REMOVE, 0, 0); + } while (errno == EAGAIN); - // call the hook function and then copy the - // data into the transaction buffer... 
- func(bp, arg); + if (errno) + panic("could not acquire bp %p (err %d)\n", bp, errno); - blkptr = (char *)&((char *)blhdr)[tbuffer_offset]; - memcpy(blkptr, (char *)buf_dataptr(bp), buf_size(bp)); - - buf_drop(bp); + if ((buf_flags(bp) & (B_LOCKED|B_DELWRI)) != (B_LOCKED|B_DELWRI)) { + if (jnl->flags & JOURNAL_CLOSE_PENDING) { + buf_clearflags(bp, B_LOCKED); + buf_brelse(bp); + + /* + * this is an odd case that appears to happen occasionally + * make sure we mark this block as no longer valid + * so that we don't process it in "finish_end_transaction" since + * the bp that is recorded in our array no longer belongs + * to us (normally we substitute a shadow bp to be processed + * issuing a 'buf_bawrite' on a stale buf_t pointer leads + * to all kinds of problems. + */ + blhdr->binfo[i].bnum = (off_t)-1; + continue; } else { - panic("could not acquire bp %p (err %d)\n", bp, errno); + panic("jnl: end_tr: !!!DANGER!!! bp %p flags (0x%x) not LOCKED & DELWRI\n", bp, buf_flags(bp)); } } + bsize = buf_size(bp); - } else { // bnum == -1, only true if a block was "killed" + buf_setfilter(bp, NULL, NULL, &func, &arg); + + blkptr = (char *)&((char *)blhdr)[tbuffer_offset]; - tbuffer_offset += blhdr->binfo[i].u.bi.bsize; - continue; - } + sbp = buf_create_shadow_priv(bp, FALSE, (uintptr_t)blkptr, 0, 0); - tbuffer_offset += buf_size(bp); + if (sbp == NULL) + panic("jnl: buf_create_shadow returned NULL"); - vp = buf_vnode(bp); - blkno = buf_blkno(bp); - lblkno = buf_lblkno(bp); + /* + * copy the data into the transaction buffer... + */ + memcpy(blkptr, (char *)buf_dataptr(bp), bsize); - if (vp == NULL && lblkno == blkno) { - printf("jnl: %s: end_tr: bad news! bp @ %p w/null vp and l/blkno = %qd/%qd. aborting the transaction (tr %p jnl %p).\n", - jnl->jdev_name, bp, lblkno, blkno, tr, jnl); - goto bad_journal; - } - - // if the lblkno is the same as blkno and this bp isn't - // associated with the underlying file system device then - // we need to call bmap() to get the actual physical block. - // - if ((lblkno == blkno) && (vp != jnl->fsdev)) { - off_t f_offset; - size_t contig_bytes; + buf_clearflags(bp, B_LOCKED); + buf_markclean(bp); + buf_drop(bp); - if (VNOP_BLKTOOFF(vp, lblkno, &f_offset)) { - printf("jnl: %s: end_tr: vnop_blktooff failed @ %p, jnl %p\n", jnl->jdev_name, bp, jnl); - goto bad_journal; - } - if (VNOP_BLOCKMAP(vp, f_offset, buf_count(bp), &blkno, &contig_bytes, NULL, 0, NULL)) { - printf("jnl: %s: end_tr: can't blockmap the bp @ %p, jnl %p\n", jnl->jdev_name, bp, jnl); - goto bad_journal; - } - if ((uint32_t)contig_bytes < buf_count(bp)) { - printf("jnl: %s: end_tr: blk not physically contiguous on disk@ %p, jnl %p\n", jnl->jdev_name, bp, jnl); - goto bad_journal; + /* + * adopt the shadow buffer for this block + */ + if (func) { + /* + * transfer FS hook function to the + * shadow buffer... it will get called + * in finish_end_transaction + */ + buf_setfilter(sbp, func, arg, NULL, NULL); } - buf_setblkno(bp, blkno); + blhdr->binfo[i].u.bp = sbp; + + } else { + // bnum == -1, only true if a block was "killed" + bsize = blhdr->binfo[i].u.bi.bsize; } - // update this so we write out the correct physical block number! 
- blhdr->binfo[i].bnum = (off_t)(blkno); + tbuffer_offset += bsize; } - next = (block_list_header *)((long)blhdr->binfo[0].bnum); - } - + } + /* + * if callback != NULL, we don't want to drop the journal + * lock, or complete end_transaction asynchronously, since + * the caller is expecting the callback to run in the calling + * context + * + * if drop_lock == FALSE, we can't complete end_transaction + * asynchronously + */ + if (callback) + drop_lock_early = FALSE; + else + drop_lock_early = drop_lock; + + if (drop_lock_early == FALSE) + must_wait = TRUE; + + if (drop_lock_early == TRUE) { + jnl->owner = NULL; + unlock_journal(jnl); + drop_lock = FALSE; + } + if (must_wait == TRUE) + ret_val = finish_end_transaction(tr, callback, callback_arg); + else { + thread_t thread = THREAD_NULL; + + /* + * fire up a thread to complete processing this transaction + * asynchronously... when it finishes, it will call + * unlock_condition + */ + kernel_thread_start((thread_continue_t)finish_end_thread, tr, &thread); + } + KERNEL_DEBUG(0xbbbbc018|DBG_FUNC_END, jnl, tr, ret_val, 0, 0); +done: + if (drop_lock == TRUE) { + jnl->owner = NULL; + unlock_journal(jnl); + } + return (ret_val); +} + + +static void +finish_end_thread(transaction *tr) +{ +#if !CONFIG_EMBEDDED + proc_apply_thread_selfdiskacc(IOPOL_PASSIVE); +#else /* !CONFIG_EMBEDDED */ + struct uthread *ut; + + ut = get_bsdthread_info(current_thread()); + ut->uu_iopol_disk = IOPOL_PASSIVE; +#endif /* !CONFIG_EMBEDDED */ + + finish_end_transaction(tr, NULL, NULL); + + thread_deallocate(current_thread()); + thread_terminate(current_thread()); +} +static void +write_header_thread(journal *jnl) +{ +#if !CONFIG_EMBEDDED + proc_apply_thread_selfdiskacc(IOPOL_PASSIVE); +#else /* !CONFIG_EMBEDDED */ + struct uthread *ut; + + ut = get_bsdthread_info(current_thread()); + ut->uu_iopol_disk = IOPOL_PASSIVE; +#endif /* !CONFIG_EMBEDDED */ + + if (write_journal_header(jnl, 1, jnl->saved_sequence_num)) + jnl->write_header_failed = TRUE; + else + jnl->write_header_failed = FALSE; + unlock_condition(jnl, &jnl->writing_header); + + thread_deallocate(current_thread()); + thread_terminate(current_thread()); +} + +static int +finish_end_transaction(transaction *tr, errno_t (*callback)(void*), void *callback_arg) +{ + int i, amt; + int ret = 0; + off_t end; + journal *jnl = tr->jnl; + buf_t bp, *bparray; + vnode_t vp; + block_list_header *blhdr=NULL, *next=NULL; + size_t tbuffer_offset; + int bufs_written = 0; + int ret_val = 0; + + KERNEL_DEBUG(0xbbbbc028|DBG_FUNC_START, jnl, tr, 0, 0, 0); + + end = jnl->jhdr->end; + + for (blhdr = tr->blhdr; blhdr; blhdr = (block_list_header *)((long)blhdr->binfo[0].bnum)) { - for(blhdr=tr->blhdr; blhdr; blhdr=(block_list_header *)((long)blhdr->binfo[0].bnum)) { amt = blhdr->bytes_used; blhdr->binfo[0].u.bi.b.sequence_num = tr->sequence_num; @@ -3424,64 +3887,139 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void blhdr->checksum = calc_checksum((char *)blhdr, BLHDR_CHECKSUM_SIZE); if (kmem_alloc(kernel_map, (vm_offset_t *)&bparray, blhdr->num_blocks * sizeof(struct buf *))) { - panic("can't allocate %zd bytes for bparray\n", blhdr->num_blocks * sizeof(struct buf *)); + panic("can't allocate %zd bytes for bparray\n", blhdr->num_blocks * sizeof(struct buf *)); } - - // calculate individual block checksums tbuffer_offset = jnl->jhdr->blhdr_size; - for(i=1; i < blhdr->num_blocks; i++) { - int32_t bsize; + + for (i = 1; i < blhdr->num_blocks; i++) { + void (*func)(buf_t, void *); + void *arg; + int32_t 
bsize; - if (blhdr->binfo[i].bnum != (off_t)-1) { - bparray[i] = blhdr->binfo[i].u.bp; - bsize = buf_size(bparray[i]); - blhdr->binfo[i].u.bi.bsize = bsize; - blhdr->binfo[i].u.bi.b.cksum = calc_checksum(&((char *)blhdr)[tbuffer_offset], bsize); - } else { - bparray[i] = NULL; - bsize = blhdr->binfo[i].u.bi.bsize; - blhdr->binfo[i].u.bi.b.cksum = 0; - } + /* + * finish preparing the shadow buf_t before + * calculating the individual block checksums + */ + if (blhdr->binfo[i].bnum != (off_t)-1) { + daddr64_t blkno; + daddr64_t lblkno; - tbuffer_offset += bsize; - } + bp = blhdr->binfo[i].u.bp; + + vp = buf_vnode(bp); + blkno = buf_blkno(bp); + lblkno = buf_lblkno(bp); - ret = write_journal_data(jnl, &end, blhdr, amt); + if (vp == NULL && lblkno == blkno) { + printf("jnl: %s: end_tr: bad news! bp @ %p w/null vp and l/blkno = %qd/%qd. aborting the transaction (tr %p jnl %p).\n", + jnl->jdev_name, bp, lblkno, blkno, tr, jnl); + ret_val = -1; + goto bad_journal; + } + + // if the lblkno is the same as blkno and this bp isn't + // associated with the underlying file system device then + // we need to call bmap() to get the actual physical block. + // + if ((lblkno == blkno) && (vp != jnl->fsdev)) { + off_t f_offset; + size_t contig_bytes; + + if (VNOP_BLKTOOFF(vp, lblkno, &f_offset)) { + printf("jnl: %s: end_tr: vnop_blktooff failed @ %p, jnl %p\n", jnl->jdev_name, bp, jnl); + ret_val = -1; + goto bad_journal; + } + if (VNOP_BLOCKMAP(vp, f_offset, buf_count(bp), &blkno, &contig_bytes, NULL, 0, NULL)) { + printf("jnl: %s: end_tr: can't blockmap the bp @ %p, jnl %p\n", jnl->jdev_name, bp, jnl); + ret_val = -1; + goto bad_journal; + } + if ((uint32_t)contig_bytes < buf_count(bp)) { + printf("jnl: %s: end_tr: blk not physically contiguous on disk@ %p, jnl %p\n", jnl->jdev_name, bp, jnl); + ret_val = -1; + goto bad_journal; + } + buf_setblkno(bp, blkno); + } + // update this so we write out the correct physical block number! + blhdr->binfo[i].bnum = (off_t)(blkno); - // always put the bp pointers back - for(i=1; i < blhdr->num_blocks; i++) { - if (blhdr->binfo[i].bnum != (off_t)-1) { - blhdr->binfo[i].u.bp = bparray[i]; - } + /* + * pick up the FS hook function (if any) and prepare + * to fire this buffer off in the next pass + */ + buf_setfilter(bp, buffer_flushed_callback, tr, &func, &arg); + + if (func) { + /* + * call the hook function supplied by the filesystem... 
+					 * this needs to happen BEFORE calc_checksum in case
+					 * the FS morphs the data in the buffer
+					 */
+					func(bp, arg);
+				}
+				bparray[i] = bp;
+				bsize = buf_size(bp);
+				blhdr->binfo[i].u.bi.bsize = bsize;
+				blhdr->binfo[i].u.bi.b.cksum = calc_checksum(&((char *)blhdr)[tbuffer_offset], bsize);
+			} else {
+				bparray[i] = NULL;
+				bsize = blhdr->binfo[i].u.bi.bsize;
+				blhdr->binfo[i].u.bi.b.cksum = 0;
+			}
+			tbuffer_offset += bsize;
 		}
+		/*
		 * if we fired off the journal_write_header asynchronously in
+		 * 'end_transaction', we need to wait for its completion
+		 * before writing the actual journal data
+		 */
+		wait_condition(jnl, &jnl->writing_header, "finish_end_transaction");
+
+		if (jnl->write_header_failed == FALSE)
+			ret = write_journal_data(jnl, &end, blhdr, amt);
+		else
+			ret_val = -1;
+		/*
		 * put the bp pointers back so that we can
+		 * make the final pass on them
+		 */
+		for (i = 1; i < blhdr->num_blocks; i++)
+			blhdr->binfo[i].u.bp = bparray[i];
 
 		kmem_free(kernel_map, (vm_offset_t)bparray, blhdr->num_blocks * sizeof(struct buf *));
 
+		if (ret_val == -1)
+			goto bad_journal;
+
 		if (ret != amt) {
 			printf("jnl: %s: end_transaction: only wrote %d of %d bytes to the journal!\n",
-			       jnl->jdev_name, ret, amt);
+			       jnl->jdev_name, ret, amt);
 
+			ret_val = -1;
 			goto bad_journal;
-		}
+		}
+	}
+	jnl->jhdr->end  = end;    // update where the journal now ends
+	tr->journal_end = end;    // the transaction ends here too
 
-	jnl->jhdr->end  = end;    // update where the journal now ends
-	tr->journal_end = end;    // the transaction ends here too
-	if (tr->journal_start == 0 || tr->journal_end == 0) {
+	if (tr->journal_start == 0 || tr->journal_end == 0) {
 		panic("jnl: end_transaction: bad tr journal start/end: 0x%llx 0x%llx\n",
-		      tr->journal_start, tr->journal_end);
-	}
+		      tr->journal_start, tr->journal_end);
+	}
 
-	if (write_journal_header(jnl, 0) != 0) {
+	if (write_journal_header(jnl, 0, jnl->saved_sequence_num) != 0) {
+		ret_val = -1;
 		goto bad_journal;
-	}
-
+	}
 	/*
	 * If the caller supplied a callback, call it now that the blocks have been
	 * written to the journal.  This is used by journal_relocate so, for example,
	 * the file system can change its pointer to the new journal.
	 */
	if (callback != NULL && callback(callback_arg) != 0) {
+		ret_val = -1;
 		goto bad_journal;
	}
 
@@ -3489,284 +4027,429 @@ end_transaction(transaction *tr, int force_it, errno_t (*callback)(void*), void
 	// Send a DKIOCUNMAP for the extents trimmed by this transaction, and
	// free up the extent list.
	//
-	errno = journal_trim_flush(jnl, tr);
+	journal_trim_flush(jnl, tr);
 
-	//
-	// setup for looping through all the blhdr's.  we null out the
-	// tbuffer and blhdr fields so that they're not used any more.
-	//
-	blhdr       = tr->blhdr;
-	tr->tbuffer = NULL;
-	tr->blhdr   = NULL;
-
-	// the buffer_flushed_callback will only be called for the
-	// real blocks that get flushed so we have to account for
-	// the block_list_headers here.
+	// the buffer_flushed_callback will only be called for the
+	// real blocks that get flushed so we have to account for
+	// the block_list_headers here. 
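(Aside, not part of the patch: for concreteness, with made-up sizes and ignoring killed blocks: a transaction carrying two 16K block list headers and three real buffers of 4K, 8K, and 4K has total_bytes = 32K + 16K = 48K. num_flushed is seeded with the 32K of header bytes on the next line; each buffer_flushed_callback invocation then adds its buffer's size, so after the third completion num_flushed reaches total_bytes and the transaction's journal space can be reclaimed.)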
+	//
+	tr->num_flushed = tr->num_blhdrs * jnl->jhdr->blhdr_size;
 
-	// for each block, set the iodone callback and unlock it
-	for(; blhdr; blhdr=next) {
-
-		// we can re-order the buf ptrs because everything is written out already
-		qsort(&blhdr->binfo[1], blhdr->num_blocks-1, sizeof(block_info), journal_binfo_cmp);
-
-		for(i=1; i < blhdr->num_blocks; i++) {
-			if (blhdr->binfo[i].bnum == (off_t)-1) {
-				continue;
-			}
+	lock_condition(jnl, &jnl->asyncIO, "finish_end_transaction");
 
-			bp = blhdr->binfo[i].u.bp;
+	//
+	// setup for looping through all the blhdr's.
+	//
+	for (blhdr = tr->blhdr; blhdr; blhdr = next) {
+		uint16_t	num_blocks;
 
-			// have to pass BAC_REMOVE here because we're going to bawrite()
-			// the buffer when we're done
-			do {
-				errno = buf_acquire(bp, BAC_REMOVE, 0, 0);
-			} while (errno == EAGAIN);
-
-			if (errno == 0) {
-				struct vnode *save_vp;
-				void *cur_filter;
+		/*
		 * grab this info ahead of issuing the buf_bawrites...
+		 * once the last one goes out, it's possible for blhdr
+		 * to be freed (especially if we get preempted) before
+		 * we do the last check of num_blocks or
+		 * grab the next blhdr pointer...
+		 */
+		next = (block_list_header *)((long)blhdr->binfo[0].bnum);
+		num_blocks = blhdr->num_blocks;
 
-				if ((buf_flags(bp) & (B_LOCKED|B_DELWRI)) != (B_LOCKED|B_DELWRI)) {
-					if (jnl->flags & JOURNAL_CLOSE_PENDING) {
-						buf_clearflags(bp, B_LOCKED);
-						buf_brelse(bp);
-						continue;
-					} else {
-						panic("jnl: end_tr: !!!DANGER!!! bp %p flags (0x%x) not LOCKED & DELWRI\n", bp, buf_flags(bp));
-					}
-				}
-				save_vp = buf_vnode(bp);
+		/*
		 * we can re-order the buf ptrs because everything is written out already
+		 */
+		qsort(&blhdr->binfo[1], num_blocks-1, sizeof(block_info), journal_binfo_cmp);
 
-				buf_setfilter(bp, buffer_flushed_callback, tr, &cur_filter, NULL);
+		/*
		 * need to make sure that the loop issuing the buf_bawrite's
+		 * does not touch blhdr once the last buf_bawrite has been
+		 * issued... at that point, we no longer have a legitimate
+		 * reference on the associated storage since it will be
+		 * released upon the completion of that last buf_bawrite
+		 */
+		for (i = num_blocks-1; i >= 1; i--) {
+			if (blhdr->binfo[i].bnum != (off_t)-1)
+				break;
+			num_blocks--;
+		}
+		for (i = 1; i < num_blocks; i++) {
 
-				if (cur_filter) {
-					panic("jnl: bp @ %p (blkno %qd, vp %p) has non-null iodone (%p) buffflushcb %p\n",
-					      bp, buf_blkno(bp), save_vp, cur_filter, buffer_flushed_callback);
-				}
-				buf_clearflags(bp, B_LOCKED);
+			if ((bp = blhdr->binfo[i].u.bp)) {
+				vp = buf_vnode(bp);
 
-				// kicking off the write here helps performance
 				buf_bawrite(bp);
+
 				// XXXdbg this is good for testing: buf_bdwrite(bp);
 				//buf_bdwrite(bp);
 
 				// this undoes the vnode_ref() in journal_modify_block_end()
-				vnode_rele_ext(save_vp, 0, 1);
-			} else {
-				printf("jnl: %s: end_transaction: could not acquire block %p (errno %d)!\n",
-				       jnl->jdev_name,bp, errno);
+				vnode_rele_ext(vp, 0, 1);
+
+				bufs_written++;
 			}
 		}
+	}
+	if (bufs_written == 0) {
+		/*
		 * since we didn't issue any buf_bawrite's, there is no
+		 * async trigger to cause the memory associated with this
+		 * transaction to be freed... so, move it to the garbage
+		 * list now
+		 */
+		lock_oldstart(jnl);
 
-		next = (block_list_header *)((long)blhdr->binfo[0].bnum);
+		tr->next = jnl->tr_freeme;
+		jnl->tr_freeme = tr;
 
-		// we can free blhdr here since we won't need it any more
-		blhdr->binfo[0].bnum = 0xdeadc0de;
-		kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size);
-	}
+		unlock_oldstart(jnl);
 
-	//printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. exit!\n",
-	//   tr, tr->journal_start, tr->journal_end);
-	return 0;
+		unlock_condition(jnl, &jnl->asyncIO);
+	}
+	//printf("jnl: end_tr: tr @ 0x%x, jnl-blocks: 0x%llx - 0x%llx. 
exit!\n", + // tr, tr->journal_start, tr->journal_end); - bad_journal: - jnl->flags |= JOURNAL_INVALID; - jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] &= ~0x8000000000000000LL; - abort_transaction(jnl, tr); // cleans up list of extents to be trimmed - return -1; +bad_journal: + if (ret_val == -1) { + /* + * 'flush_aborted' is protected by the flushing condition... we need to + * set it before dropping the condition so that it will be + * noticed in 'end_transaction'... we add this additional + * aborted condition so that we can drop the 'flushing' condition + * before grabbing the journal lock... this avoids a deadlock + * in 'end_transaction' which is holding the journal lock while + * waiting for the 'flushing' condition to clear... + * everyone else will notice the JOURNAL_INVALID flag + */ + jnl->flush_aborted = TRUE; + + unlock_condition(jnl, &jnl->flushing); + lock_journal(jnl); + + jnl->flags |= JOURNAL_INVALID; + jnl->old_start[sizeof(jnl->old_start)/sizeof(jnl->old_start[0]) - 1] &= ~0x8000000000000000LL; + abort_transaction(jnl, tr); // cleans up list of extents to be trimmed + + unlock_journal(jnl); + } else + unlock_condition(jnl, &jnl->flushing); + + KERNEL_DEBUG(0xbbbbc028|DBG_FUNC_END, jnl, tr, bufs_written, ret_val, 0); + + return (ret_val); +} + + +static void +lock_condition(journal *jnl, boolean_t *condition, const char *condition_name) +{ + + KERNEL_DEBUG(0xbbbbc020|DBG_FUNC_START, jnl, condition, 0, 0, 0); + + lock_flush(jnl); + + while (*condition == TRUE) + msleep(condition, &jnl->flock, PRIBIO, condition_name, NULL); + + *condition = TRUE; + unlock_flush(jnl); + + KERNEL_DEBUG(0xbbbbc020|DBG_FUNC_END, jnl, condition, 0, 0, 0); +} + +static void +wait_condition(journal *jnl, boolean_t *condition, const char *condition_name) +{ + + if (*condition == FALSE) + return; + + KERNEL_DEBUG(0xbbbbc02c|DBG_FUNC_START, jnl, condition, 0, 0, 0); + + lock_flush(jnl); + + while (*condition == TRUE) + msleep(condition, &jnl->flock, PRIBIO, condition_name, NULL); + + unlock_flush(jnl); + + KERNEL_DEBUG(0xbbbbc02c|DBG_FUNC_END, jnl, condition, 0, 0, 0); +} + +static void +unlock_condition(journal *jnl, boolean_t *condition) +{ + lock_flush(jnl); + + *condition = FALSE; + wakeup(condition); + + unlock_flush(jnl); } static void abort_transaction(journal *jnl, transaction *tr) { - int i; - errno_t errno; - block_list_header *blhdr, *next; - struct buf *bp; - struct vnode *save_vp; + block_list_header *blhdr, *next; - // for each block list header, iterate over the blocks then - // free up the memory associated with the block list. - // - // for each block, clear the lock bit and release it. - // - for(blhdr=tr->blhdr; blhdr; blhdr=next) { + // for each block list header, iterate over the blocks then + // free up the memory associated with the block list. + // + // find each of the primary blocks (i.e. the list could + // contain a mix of shadowed and real buf_t's depending + // on when the abort condition was detected) and mark them + // clean and locked in the cache... 
this at least allows + * the FS a consistent view between its incore data structures + * and the meta-data held in the cache + // + KERNEL_DEBUG(0xbbbbc034|DBG_FUNC_START, jnl, tr, 0, 0, 0); + + for (blhdr = tr->blhdr; blhdr; blhdr = next) { + int i; + + for (i = 1; i < blhdr->num_blocks; i++) { + buf_t bp, tbp, sbp; + vnode_t bp_vp; + errno_t errno; - for(i=1; i < blhdr->num_blocks; i++) { - if (blhdr->binfo[i].bnum == (off_t)-1) { + if (blhdr->binfo[i].bnum == (off_t)-1) continue; - } - if ( (buf_vnode(blhdr->binfo[i].u.bp) == NULL) || - !(buf_flags(blhdr->binfo[i].u.bp) & B_LOCKED) ) { - continue; - } - errno = buf_meta_bread(buf_vnode(blhdr->binfo[i].u.bp), - buf_lblkno(blhdr->binfo[i].u.bp), - buf_size(blhdr->binfo[i].u.bp), - NOCRED, - &bp); - if (errno == 0) { - if (bp != blhdr->binfo[i].u.bp) { - panic("jnl: abort_tr: got back a different bp! (bp %p should be %p, jnl %p\n", - bp, blhdr->binfo[i].u.bp, jnl); - } + tbp = blhdr->binfo[i].u.bp; - // releasing a bp marked invalid - // also clears the locked and delayed state - buf_markinvalid(bp); - save_vp = buf_vnode(bp); + bp_vp = buf_vnode(tbp); - buf_brelse(bp); + buf_setfilter(tbp, NULL, NULL, NULL, NULL); - vnode_rele_ext(save_vp, 0, 1); - } else { - printf("jnl: %s: abort_tr: could not find block %Ld vp %p!\n", - jnl->jdev_name, blhdr->binfo[i].bnum, blhdr->binfo[i].u.bp); - if (bp) { + if (buf_shadow(tbp)) + sbp = tbp; + else + sbp = NULL; + + if (bp_vp) { + errno = buf_meta_bread(bp_vp, + buf_lblkno(tbp), + buf_size(tbp), + NOCRED, + &bp); + if (errno == 0) { + if (sbp == NULL && bp != tbp && (buf_flags(tbp) & B_LOCKED)) { + panic("jnl: abort_tr: got back a different bp! (bp %p should be %p, jnl %p\n", + bp, tbp, jnl); + } + /* + * once the journal has been marked INVALID and aborted, + * NO meta data can be written back to the disk, so + * mark the buf_t clean and make sure it's locked in the cache + * note: if we found a shadow, the real buf_t needs to be relocked + */ + buf_setflags(bp, B_LOCKED); + buf_markclean(bp); + buf_brelse(bp); + + KERNEL_DEBUG(0xbbbbc034|DBG_FUNC_NONE, jnl, tr, bp, 0, 0); + + /* + * this undoes the vnode_ref() in journal_modify_block_end() + */ + vnode_rele_ext(bp_vp, 0, 1); + } else { + printf("jnl: %s: abort_tr: could not find block %Ld vp %p!\n", + jnl->jdev_name, blhdr->binfo[i].bnum, tbp); + if (bp) { + buf_brelse(bp); + } } } + if (sbp) + buf_brelse(sbp); } - next = (block_list_header *)((long)blhdr->binfo[0].bnum); // we can free blhdr here since we won't need it any more blhdr->binfo[0].bnum = 0xdeadc0de; kmem_free(kernel_map, (vm_offset_t)blhdr, tr->tbuffer_size); - } + } + /* + * If the transaction we're aborting was the async transaction, then + * tell the current transaction that there is no pending trim + * any more.
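The lock_condition/wait_condition/unlock_condition helpers defined above implement a small event latch: a boolean_t guarded by the journal's flush mutex, with msleep()/wakeup() for blocking. A minimal sketch of how the pipelined flush path composes them (the two function names here are illustrative, not xnu functions):

static void
example_flush_owner(journal *jnl)
{
	/* wait until no flush is in flight, then claim the latch */
	lock_condition(jnl, &jnl->flushing, "example_flush_owner");

	/* ... write the journal header and transaction data ... */

	/* clear the latch and wakeup() every msleep()ing waiter */
	unlock_condition(jnl, &jnl->flushing);
}

static void
example_flush_observer(journal *jnl)
{
	/* drain only: returns immediately if jnl->flushing is FALSE */
	wait_condition(jnl, &jnl->flushing, "example_flush_observer");
}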
+ */ + lck_rw_lock_exclusive(&jnl->trim_lock); + if (jnl->async_trim == &tr->trim) + jnl->async_trim = NULL; + lck_rw_unlock_exclusive(&jnl->trim_lock); + if (tr->trim.extents) { kfree(tr->trim.extents, tr->trim.allocated_count * sizeof(dk_extent_t)); } tr->trim.allocated_count = 0; tr->trim.extent_count = 0; tr->trim.extents = NULL; - tr->tbuffer = NULL; - tr->blhdr = NULL; - tr->total_bytes = 0xdbadc0de; + tr->tbuffer = NULL; + tr->blhdr = NULL; + tr->total_bytes = 0xdbadc0de; FREE_ZONE(tr, sizeof(transaction), M_JNL_TR); + + KERNEL_DEBUG(0xbbbbc034|DBG_FUNC_END, jnl, tr, 0, 0, 0); } int journal_end_transaction(journal *jnl) { - int ret; + int ret; transaction *tr; - CHECK_JOURNAL(jnl); + CHECK_JOURNAL(jnl); + + free_old_stuff(jnl); if ((jnl->flags & JOURNAL_INVALID) && jnl->owner == NULL) { return 0; } - if (jnl->owner != current_thread()) { + if (jnl->owner != current_thread()) { panic("jnl: end_tr: I'm not the owner! jnl %p, owner %p, curact %p\n", - jnl, jnl->owner, current_thread()); - } - - free_old_stuff(jnl); + jnl, jnl->owner, current_thread()); + } + jnl->nested_count--; - jnl->nested_count--; - if (jnl->nested_count > 0) { + if (jnl->nested_count > 0) { return 0; - } else if (jnl->nested_count < 0) { + } else if (jnl->nested_count < 0) { panic("jnl: jnl @ %p has negative nested count (%d). bad boy.\n", jnl, jnl->nested_count); - } + } - if (jnl->flags & JOURNAL_INVALID) { + if (jnl->flags & JOURNAL_INVALID) { if (jnl->active_tr) { if (jnl->cur_tr != NULL) { panic("jnl: journal @ %p has active tr (%p) and cur tr (%p)\n", - jnl, jnl->active_tr, jnl->cur_tr); + jnl, jnl->active_tr, jnl->cur_tr); } - tr = jnl->active_tr; jnl->active_tr = NULL; + abort_transaction(jnl, tr); } - jnl->owner = NULL; unlock_journal(jnl); return EINVAL; - } - - tr = jnl->active_tr; - CHECK_TRANSACTION(tr); + } - // clear this out here so that when check_free_space() calls - // the FS flush function, we don't panic in journal_flush() - // if the FS were to call that. note: check_free_space() is - // called from end_transaction(). - // - jnl->active_tr = NULL; - ret = end_transaction(tr, 0, NULL, NULL); + tr = jnl->active_tr; + CHECK_TRANSACTION(tr); - jnl->owner = NULL; - unlock_journal(jnl); + // clear this out here so that when check_free_space() calls + // the FS flush function, we don't panic in journal_flush() + // if the FS were to call that. note: check_free_space() is + // called from end_transaction(). + // + jnl->active_tr = NULL; + ret = end_transaction(tr, 0, NULL, NULL, TRUE, FALSE); - return ret; + return ret; } +/* + * Flush the contents of the journal to the disk. + * + * Input: + * wait_for_IO - + * If TRUE, wait to write the in-memory journal to the disk + * consistently, and also wait to write all asynchronous + * metadata blocks to their corresponding locations + * consistently on the disk. This means that the journal + * is empty at this point and does not contain any + * transactions. This is overkill in normal scenarios + * but is useful whenever the metadata blocks are required + * to be consistent on-disk instead of just the journal + * being consistent, such as before live verification + * and live volume resizing. + * + * If FALSE, only wait to write the in-memory journal to the + * disk consistently. This means that the journal still + * contains uncommitted transactions and the file system + * metadata blocks in the journal transactions might be + * written asynchronously to the disk. But there is no + * guarantee that they are written to the disk before + * returning to the caller.
Note that this option is + * sufficient for file system data integrity as it + * guarantees consistent journal content on the disk. + */ int -journal_flush(journal *jnl) +journal_flush(journal *jnl, boolean_t wait_for_IO) { - int need_signal = 0; + boolean_t drop_lock = FALSE; - CHECK_JOURNAL(jnl); + CHECK_JOURNAL(jnl); - if (jnl->flags & JOURNAL_INVALID) { + free_old_stuff(jnl); + + if (jnl->flags & JOURNAL_INVALID) { return -1; - } + } - KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_JOURNAL, DBG_JOURNAL_FLUSH)) - | DBG_FUNC_START, 0, 0, 0, 0, 0); + KERNEL_DEBUG(DBG_JOURNAL_FLUSH | DBG_FUNC_START, jnl, 0, 0, 0, 0); - if (jnl->owner != current_thread()) { + if (jnl->owner != current_thread()) { lock_journal(jnl); - need_signal = 1; - } - - free_old_stuff(jnl); + drop_lock = TRUE; + } - // if we're not active, flush any buffered transactions - if (jnl->active_tr == NULL && jnl->cur_tr) { + // if we're not active, flush any buffered transactions + if (jnl->active_tr == NULL && jnl->cur_tr) { transaction *tr = jnl->cur_tr; jnl->cur_tr = NULL; - end_transaction(tr, 1, NULL, NULL); // force it to get flushed - } - if (need_signal) { - unlock_journal(jnl); - } + if (wait_for_IO) { + wait_condition(jnl, &jnl->flushing, "journal_flush"); + wait_condition(jnl, &jnl->asyncIO, "journal_flush"); + } + /* + * "end_transaction" will wait for any current async flush + * to complete, before flushing "cur_tr"... because we've + * specified the 'must_wait' arg as TRUE, it will then + * synchronously flush the "cur_tr" + */ + end_transaction(tr, 1, NULL, NULL, drop_lock, TRUE); // force it to get flushed - KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_JOURNAL, DBG_JOURNAL_FLUSH)) - | DBG_FUNC_END, 0, 0, 0, 0, 0); + } else { + if (drop_lock == TRUE) { + unlock_journal(jnl); + } - return 0; + /* Because of the pipelined journal, the journal transactions + * might be in the process of being flushed on another thread. + * If there is nothing to flush currently, we should + * synchronize ourselves with the pipelined journal thread + * to ensure that all inflight transactions, if any, are + * flushed before we return success to the caller. + */ + wait_condition(jnl, &jnl->flushing, "journal_flush"); + } + if (wait_for_IO) { + wait_condition(jnl, &jnl->asyncIO, "journal_flush"); + } + + KERNEL_DEBUG(DBG_JOURNAL_FLUSH | DBG_FUNC_END, jnl, 0, 0, 0, 0); + + return 0; } int journal_active(journal *jnl) { - if (jnl->flags & JOURNAL_INVALID) { + if (jnl->flags & JOURNAL_INVALID) { return -1; - } + } - return (jnl->active_tr == NULL) ? 0 : 1; + return (jnl->active_tr == NULL) ? 0 : 1; } void * journal_owner(journal *jnl) { - return jnl->owner; + return jnl->owner; } int journal_uses_fua(journal *jnl) @@ -3835,37 +4518,37 @@ int journal_uses_fua(journal *jnl) int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbuffer_size, errno_t (*callback)(void *), void *callback_arg) { - int ret; - transaction *tr; + int ret; + transaction *tr; /* * Sanity check inputs, and adjust the size of the transaction buffer.
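The wait_for_IO contract spelled out in the block comment above distinguishes "journal is consistent" from "journal and metadata are consistent". A hypothetical caller, sketched under that reading (fs_prepare_resize and fs_commit_barrier are illustrative names, not xnu functions):

static int
fs_prepare_resize(journal *jnl)
{
	/*
	 * TRUE: drain the pipelined flush and the async metadata I/O,
	 * leaving the journal empty and the metadata in place on disk
	 * (e.g. before live verification or live volume resizing).
	 */
	return journal_flush(jnl, TRUE);
}

static int
fs_commit_barrier(journal *jnl)
{
	/*
	 * FALSE: only guarantee the journal contents are on disk; the
	 * metadata writes may still be in flight, which is enough for
	 * crash consistency via journal replay.
	 */
	return journal_flush(jnl, FALSE);
}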
*/ - if ((offset % jnl->jhdr->jhdr_size) != 0) { + if ((offset % jnl->jhdr->jhdr_size) != 0) { printf("jnl: %s: relocate: offset 0x%llx is not an even multiple of block size 0x%x\n", - jnl->jdev_name, offset, jnl->jhdr->jhdr_size); + jnl->jdev_name, offset, jnl->jhdr->jhdr_size); return EINVAL; - } - if ((journal_size % jnl->jhdr->jhdr_size) != 0) { + } + if ((journal_size % jnl->jhdr->jhdr_size) != 0) { printf("jnl: %s: relocate: journal size 0x%llx is not an even multiple of block size 0x%x\n", - jnl->jdev_name, journal_size, jnl->jhdr->jhdr_size); + jnl->jdev_name, journal_size, jnl->jhdr->jhdr_size); return EINVAL; - } + } - CHECK_JOURNAL(jnl); + CHECK_JOURNAL(jnl); /* Guarantee we own the active transaction. */ - if (jnl->flags & JOURNAL_INVALID) { + if (jnl->flags & JOURNAL_INVALID) { return EINVAL; - } - if (jnl->owner != current_thread()) { - panic("jnl: relocate: Not the owner! jnl %p, owner %p, curact %p\n", - jnl, jnl->owner, current_thread()); + } + if (jnl->owner != current_thread()) { + panic("jnl: relocate: Not the owner! jnl %p, owner %p, curact %p\n", + jnl, jnl->owner, current_thread()); } - if (tbuffer_size == 0) - tbuffer_size = jnl->tbuffer_size; - size_up_tbuffer(jnl, tbuffer_size, jnl->jhdr->jhdr_size); + if (tbuffer_size == 0) + tbuffer_size = jnl->tbuffer_size; + size_up_tbuffer(jnl, tbuffer_size, jnl->jhdr->jhdr_size); /* * Flush any non-active transactions. We have to temporarily hide the @@ -3875,11 +4558,13 @@ int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbu tr = jnl->active_tr; CHECK_TRANSACTION(tr); jnl->active_tr = NULL; - ret = journal_flush(jnl); + ret = journal_flush(jnl, TRUE); jnl->active_tr = tr; + if (ret) { return ret; } + wait_condition(jnl, &jnl->flushing, "end_transaction"); /* Update the journal's offset and size in memory. */ jnl->jdev_offset = offset; @@ -3893,7 +4578,7 @@ int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbu * before they get written to their normal on-disk locations. 
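journal_relocate() hands its callback through to end_transaction(), which (per the comment earlier in this hunk) invokes it once the transaction has been written to the new journal location; that is the file system's one chance to repoint its on-disk record of the journal. A sketch of such a callback, with myfs_mount and the persistence step purely illustrative:

struct myfs_mount {			/* illustrative type, not an xnu structure */
	off_t	jnl_offset;
	off_t	jnl_size;
};

static errno_t
myfs_journal_moved(void *arg)
{
	struct myfs_mount *mp = (struct myfs_mount *)arg;

	/*
	 * Persist mp->jnl_offset / mp->jnl_size to the superblock here;
	 * a non-zero return sends end_transaction() to bad_journal.
	 */
	(void)mp;
	return 0;
}

/* usage: journal_relocate(jnl, new_offset, new_size, 0, myfs_journal_moved, mp); */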
*/ jnl->active_tr = NULL; - ret = end_transaction(tr, 1, callback, callback_arg); + ret = end_transaction(tr, 1, callback, callback_arg, FALSE, TRUE); if (ret) { printf("jnl: %s: relocate: end_transaction failed (%d)\n", jnl->jdev_name, ret); goto bad_journal; @@ -3912,9 +4597,9 @@ int journal_relocate(journal *jnl, off_t offset, off_t journal_size, int32_t tbu return 0; bad_journal: - jnl->flags |= JOURNAL_INVALID; - abort_transaction(jnl, tr); - return ret; + jnl->flags |= JOURNAL_INVALID; + abort_transaction(jnl, tr); + return ret; } @@ -3927,62 +4612,62 @@ int journal_uses_fua(__unused journal *jnl) journal * journal_create(__unused struct vnode *jvp, - __unused off_t offset, - __unused off_t journal_size, - __unused struct vnode *fsvp, - __unused size_t min_fs_blksz, - __unused int32_t flags, - __unused int32_t tbuffer_size, - __unused void (*flush)(void *arg), - __unused void *arg) + __unused off_t offset, + __unused off_t journal_size, + __unused struct vnode *fsvp, + __unused size_t min_fs_blksz, + __unused int32_t flags, + __unused int32_t tbuffer_size, + __unused void (*flush)(void *arg), + __unused void *arg) { return NULL; } journal * journal_open(__unused struct vnode *jvp, - __unused off_t offset, - __unused off_t journal_size, - __unused struct vnode *fsvp, - __unused size_t min_fs_blksz, - __unused int32_t flags, - __unused int32_t tbuffer_size, - __unused void (*flush)(void *arg), - __unused void *arg) + __unused off_t offset, + __unused off_t journal_size, + __unused struct vnode *fsvp, + __unused size_t min_fs_blksz, + __unused int32_t flags, + __unused int32_t tbuffer_size, + __unused void (*flush)(void *arg), + __unused void *arg) { - return NULL; + return NULL; } int journal_modify_block_start(__unused journal *jnl, __unused struct buf *bp) { - return EINVAL; + return EINVAL; } int journal_modify_block_end(__unused journal *jnl, - __unused struct buf *bp, - __unused void (*func)(struct buf *bp, void *arg), - __unused void *arg) + __unused struct buf *bp, + __unused void (*func)(struct buf *bp, void *arg), + __unused void *arg) { - return EINVAL; + return EINVAL; } int journal_kill_block(__unused journal *jnl, __unused struct buf *bp) { - return EINVAL; + return EINVAL; } int journal_relocate(__unused journal *jnl, - __unused off_t offset, - __unused off_t journal_size, - __unused int32_t tbuffer_size, - __unused errno_t (*callback)(void *), - __unused void *callback_arg) + __unused off_t offset, + __unused off_t journal_size, + __unused int32_t tbuffer_size, + __unused errno_t (*callback)(void *), + __unused void *callback_arg) { - return EINVAL; + return EINVAL; } void @@ -3993,19 +4678,19 @@ journal_close(__unused journal *jnl) int journal_start_transaction(__unused journal *jnl) { - return EINVAL; + return EINVAL; } int journal_end_transaction(__unused journal *jnl) { - return EINVAL; + return EINVAL; } int -journal_flush(__unused journal *jnl) +journal_flush(__unused journal *jnl, __unused boolean_t wait_for_IO) { - return EINVAL; + return EINVAL; } int @@ -4015,13 +4700,13 @@ journal_is_clean(__unused struct vnode *jvp, __unused struct vnode *fsvp, __unused size_t min_fs_block_size) { - return 0; + return 0; } void * journal_owner(__unused journal *jnl) { - return NULL; + return NULL; } #endif // !JOURNALING diff --git a/bsd/vfs/vfs_journal.h b/bsd/vfs/vfs_journal.h index 310445395..11b24c3ee 100644 --- a/bsd/vfs/vfs_journal.h +++ b/bsd/vfs/vfs_journal.h @@ -1,4 +1,3 @@ - /* * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* @@ -81,20 +80,23 @@ struct jnl_trim_list { dk_extent_t *extents; }; +typedef void (*jnl_trim_callback_t)(void *arg, uint32_t extent_count, const dk_extent_t *extents); + typedef struct transaction { - int tbuffer_size; // in bytes - char *tbuffer; // memory copy of the transaction - block_list_header *blhdr; // points to the first byte of tbuffer - int num_blhdrs; // how many buffers we've allocated - int total_bytes; // total # of bytes in transaction - int num_flushed; // how many bytes have been flushed - int num_killed; // how many bytes were "killed" - off_t journal_start; // where in the journal this transaction starts - off_t journal_end; // where in the journal this transaction ends - struct journal *jnl; // ptr back to the journal structure - struct transaction *next; // list of tr's (either completed or to be free'd) - uint32_t sequence_num; - struct jnl_trim_list trim; + int tbuffer_size; // in bytes + char *tbuffer; // memory copy of the transaction + block_list_header *blhdr; // points to the first byte of tbuffer + int num_blhdrs; // how many buffers we've allocated + int total_bytes; // total # of bytes in transaction + int num_flushed; // how many bytes have been flushed + int num_killed; // how many bytes were "killed" + off_t journal_start; // where in the journal this transaction starts + off_t journal_end; // where in the journal this transaction ends + struct journal *jnl; // ptr back to the journal structure + struct transaction *next; // list of tr's (either completed or to be free'd) + uint32_t sequence_num; + struct jnl_trim_list trim; + boolean_t delayed_header_write; } transaction; @@ -133,6 +135,8 @@ typedef struct journal_header { */ typedef struct journal { lck_mtx_t jlock; // protects the struct journal data + lck_mtx_t flock; // serializes flushing of journal + lck_rw_t trim_lock; // protects the async_trim field, below struct vnode *jdev; // vnode of the device where the journal lives off_t jdev_offset; // byte offset to the start of the journal @@ -145,11 +149,23 @@ typedef struct journal { int32_t flags; int32_t tbuffer_size; // default transaction buffer size - + boolean_t flush_aborted; + boolean_t flushing; + boolean_t asyncIO; + boolean_t writing_header; + boolean_t write_header_failed; + + struct jnl_trim_list *async_trim; // extents to be trimmed by transaction being asynchronously flushed + jnl_trim_callback_t trim_callback; + void *trim_callback_arg; + char *header_buf; // in-memory copy of the journal header int32_t header_buf_size; journal_header *jhdr; // points to the first byte of header_buf + uint32_t saved_sequence_num; + uint32_t sequence_num; + off_t max_read_size; off_t max_write_size; @@ -174,7 +190,7 @@ typedef struct journal { #define JOURNAL_FLUSHCACHE_ERR 0x00040000 // means we already printed this err #define JOURNAL_NEED_SWAP 0x00080000 // swap any data read from disk #define JOURNAL_DO_FUA_WRITES 0x00100000 // do force-unit-access writes -#define JOURNAL_TRIM_ERR 0x00200000 // a previous trim failed +#define JOURNAL_USE_UNMAP 0x00200000 // device supports UNMAP (TRIM) /* journal_open/create options are always in the low-16 bits */ #define JOURNAL_OPTION_FLAGS_MASK 0x0000ffff @@ -306,11 +322,12 @@ int journal_kill_block(journal *jnl, struct buf *bp); #ifdef BSD_KERNEL_PRIVATE int journal_trim_add_extent(journal *jnl, uint64_t offset, uint64_t length); int journal_trim_remove_extent(journal *jnl, uint64_t offset, uint64_t length); +void journal_trim_set_callback(journal *jnl, jnl_trim_callback_t callback, void *arg); #endif int 
journal_end_transaction(journal *jnl); int journal_active(journal *jnl); -int journal_flush(journal *jnl); +int journal_flush(journal *jnl, boolean_t wait_for_IO); void *journal_owner(journal *jnl); // compare against current_thread() int journal_uses_fua(journal *jnl); diff --git a/bsd/vfs/vfs_lookup.c b/bsd/vfs/vfs_lookup.c index 553bb41f9..10b885d51 100644 --- a/bsd/vfs/vfs_lookup.c +++ b/bsd/vfs/vfs_lookup.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -111,6 +111,17 @@ static void kdebug_lookup(struct vnode *dp, struct componentname *cnp); static int vfs_getrealpath(const char * path, char * realpath, size_t bufsize, vfs_context_t ctx); #endif +boolean_t lookup_continue_ok(struct nameidata *ndp); +int lookup_traverse_mountpoints(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, int vbusyflags, vfs_context_t ctx); +int lookup_handle_symlink(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t ctx); +int lookup_authorize_search(vnode_t dp, struct componentname *cnp, int dp_authorized_in_cache, vfs_context_t ctx); +void lookup_consider_update_cache(vnode_t dvp, vnode_t vp, struct componentname *cnp, int nc_generation); +int lookup_handle_rsrc_fork(vnode_t dp, struct nameidata *ndp, struct componentname *cnp, int wantparent, vfs_context_t ctx); +int lookup_handle_found_vnode(struct nameidata *ndp, struct componentname *cnp, int rdonly, + int vbusyflags, int *keep_going, int nc_generation, + int wantparent, int atroot, vfs_context_t ctx); +int lookup_handle_emptyname(struct nameidata *ndp, struct componentname *cnp, int wantparent); + /* * Convert a pathname into a pointer to a locked inode. * @@ -150,12 +161,10 @@ int namei(struct nameidata *ndp) { struct filedesc *fdp; /* pointer to file descriptor state */ - char *cp; /* pointer into pathname argument */ struct vnode *dp; /* the directory we are searching */ struct vnode *usedvp = ndp->ni_dvp; /* store pointer to vp in case we must loop due to heavy vnode pressure */ u_long cnpflags = ndp->ni_cnd.cn_flags; /* store in case we have to restore after loop */ - uio_t auio; int error; struct componentname *cnp = &ndp->ni_cnd; vfs_context_t ctx = cnp->cn_context; @@ -164,8 +173,8 @@ namei(struct nameidata *ndp) /* XXX ut should be from context */ uthread_t ut = (struct uthread *)get_bsdthread_info(current_thread()); #endif - char *tmppn; - char uio_buf[ UIO_SIZEOF(1) ]; + + fdp = p->p_fd; #if DIAGNOSTIC if (!vfs_context_ucred(ctx) || !p) @@ -175,7 +184,35 @@ namei(struct nameidata *ndp) if (cnp->cn_flags & OPMASK) panic ("namei: flags contaminated with nameiops"); #endif - fdp = p->p_fd; + + /* + * A compound VNOP found something that needs further processing: + * either a trigger vnode, a covered directory, or a symlink. + */ + if (ndp->ni_flag & NAMEI_CONTLOOKUP) { + int rdonly, vbusyflags, keep_going, wantparent; + + rdonly = cnp->cn_flags & RDONLY; + vbusyflags = ((cnp->cn_flags & CN_NBMOUNTLOOK) != 0) ? 
LK_NOWAIT : 0; + keep_going = 0; + wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT); + + ndp->ni_flag &= ~(NAMEI_CONTLOOKUP); + + error = lookup_handle_found_vnode(ndp, &ndp->ni_cnd, rdonly, vbusyflags, + &keep_going, ndp->ni_ncgeneration, wantparent, 0, ctx); + if (error) + goto out_drop; + if (keep_going) { + if ((cnp->cn_flags & ISSYMLINK) == 0) { + panic("We need to keep going on a continued lookup, but for vp type %d (tag %d)\n", ndp->ni_vp->v_type, ndp->ni_vp->v_tag); + } + goto continue_symlink; + } + + return 0; + + } vnode_recycled: @@ -310,9 +347,6 @@ retry_copy: ndp->ni_vp = NULLVP; for (;;) { - int need_newpathbuf; - u_int linklen; - ndp->ni_startdir = dp; if ( (error = lookup(ndp)) ) { @@ -324,104 +358,13 @@ retry_copy: if ((cnp->cn_flags & ISSYMLINK) == 0) { return (0); } -#ifndef __LP64__ - if ((cnp->cn_flags & FSNODELOCKHELD)) { - cnp->cn_flags &= ~FSNODELOCKHELD; - unlock_fsnode(ndp->ni_dvp, NULL); - } -#endif /* __LP64__ */ - - if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { - error = ELOOP; - break; - } -#if CONFIG_MACF - if ((error = mac_vnode_check_readlink(ctx, ndp->ni_vp)) != 0) - break; -#endif /* MAC */ - if (ndp->ni_pathlen > 1 || !(cnp->cn_flags & HASBUF)) - need_newpathbuf = 1; - else - need_newpathbuf = 0; - - if (need_newpathbuf) { - MALLOC_ZONE(cp, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); - if (cp == NULL) { - error = ENOMEM; - break; - } - } else { - cp = cnp->cn_pnbuf; - } - auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf)); - - uio_addiov(auio, CAST_USER_ADDR_T(cp), MAXPATHLEN); - - error = VNOP_READLINK(ndp->ni_vp, auio, ctx); - if (error) { - if (need_newpathbuf) - FREE_ZONE(cp, MAXPATHLEN, M_NAMEI); - break; - } - - /* - * Safe to set unsigned with a [larger] signed type here - * because 0 <= uio_resid <= MAXPATHLEN and MAXPATHLEN - * is only 1024. - */ - linklen = MAXPATHLEN - (u_int)uio_resid(auio); - if (linklen + ndp->ni_pathlen > MAXPATHLEN) { - if (need_newpathbuf) - FREE_ZONE(cp, MAXPATHLEN, M_NAMEI); - error = ENAMETOOLONG; +continue_symlink: + /* Gives us a new path to process, and a starting dir */ + error = lookup_handle_symlink(ndp, &dp, ctx); + if (error != 0) { break; } - if (need_newpathbuf) { - long len = cnp->cn_pnlen; - - tmppn = cnp->cn_pnbuf; - bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); - cnp->cn_pnbuf = cp; - cnp->cn_pnlen = MAXPATHLEN; - - if ( (cnp->cn_flags & HASBUF) ) - FREE_ZONE(tmppn, len, M_NAMEI); - else - cnp->cn_flags |= HASBUF; - } else - cnp->cn_pnbuf[linklen] = '\0'; - - ndp->ni_pathlen += linklen; - cnp->cn_nameptr = cnp->cn_pnbuf; - - /* - * starting point for 'relative' - * symbolic link path - */ - dp = ndp->ni_dvp; - /* - * get rid of references returned via 'lookup' - */ - vnode_put(ndp->ni_vp); - vnode_put(ndp->ni_dvp); - - ndp->ni_vp = NULLVP; - ndp->ni_dvp = NULLVP; - - /* - * Check if symbolic link restarts us at the root - */ - if (*(cnp->cn_nameptr) == '/') { - while (*(cnp->cn_nameptr) == '/') { - cnp->cn_nameptr++; - ndp->ni_pathlen--; - } - if ((dp = ndp->ni_rootdir) == NULLVP) { - error = ENOENT; - goto error_out; - } - } } /* * only come here if we fail to handle a SYMLINK... @@ -429,6 +372,7 @@ retry_copy: * we need to drop the iocount that was picked * up in the lookup routine */ +out_drop: if (ndp->ni_dvp) vnode_put(ndp->ni_dvp); if (ndp->ni_vp) @@ -440,6 +384,7 @@ retry_copy: } cnp->cn_pnbuf = NULL; ndp->ni_vp = NULLVP; + ndp->ni_dvp = NULLVP; if (error == ERECYCLE){ /* vnode was recycled underneath us. 
re-drive lookup to start at the beginning again, since recycling invalidated last lookup*/ @@ -452,143 +397,534 @@ retry_copy: return (error); } +int +namei_compound_available(vnode_t dp, struct nameidata *ndp) +{ + if ((ndp->ni_flag & NAMEI_COMPOUNDOPEN) != 0) { + return vnode_compound_open_available(dp); + } -/* - * Search a pathname. - * This is a very central and rather complicated routine. - * - * The pathname is pointed to by ni_ptr and is of length ni_pathlen. - * The starting directory is taken from ni_startdir. The pathname is - * descended until done, or a symbolic link is encountered. The variable - * ni_more is clear if the path is completed; it is set to one if a - * symbolic link needing interpretation is encountered. - * - * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on - * whether the name is to be looked up, created, renamed, or deleted. - * When CREATE, RENAME, or DELETE is specified, information usable in - * creating, renaming, or deleting a directory entry may be calculated. - * If flag has LOCKPARENT or'ed into it, the parent directory is returned - * locked. If flag has WANTPARENT or'ed into it, the parent directory is - * returned unlocked. Otherwise the parent directory is not returned. If - * the target of the pathname exists and LOCKLEAF is or'ed into the flag - * the target is returned locked, otherwise it is returned unlocked. - * When creating or renaming and LOCKPARENT is specified, the target may not - * be ".". When deleting and LOCKPARENT is specified, the target may be ".". - * - * Overall outline of lookup: - * - * dirloop: - * identify next component of name at ndp->ni_ptr - * handle degenerate case where name is null string - * if .. and crossing mount points and on mounted filesys, find parent - * call VNOP_LOOKUP routine for next component name - * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set - * component vnode returned in ni_vp (if it exists), locked. - * if result vnode is mounted on and crossing mount points, - * find mounted on vnode - * if more components of name, do next level at dirloop - * return the answer in ni_vp, locked if LOCKLEAF set - * if LOCKPARENT set, return locked parent in ni_dvp - * if WANTPARENT set, return unlocked parent in ni_dvp - * - * Returns: 0 Success - * ENOENT No such file or directory - * EBADF Bad file descriptor - * ENOTDIR Not a directory - * EROFS Read-only file system [CREATE] - * EISDIR Is a directory [CREATE] - * cache_lookup_path:ERECYCLE (vnode was recycled from underneath us, redrive lookup again) - * vnode_authorize:EROFS - * vnode_authorize:EACCES - * vnode_authorize:EPERM - * vnode_authorize:??? - * VNOP_LOOKUP:ENOENT No such file or directory - * VNOP_LOOKUP:EJUSTRETURN Restart system call (INTERNAL) - * VNOP_LOOKUP:??? - * VFS_ROOT:ENOTSUP - * VFS_ROOT:ENOENT - * VFS_ROOT:??? 
- */ + return 0; +} int -lookup(struct nameidata *ndp) +lookup_authorize_search(vnode_t dp, struct componentname *cnp, int dp_authorized_in_cache, vfs_context_t ctx) { - char *cp; /* pointer into pathname argument */ - vnode_t tdp; /* saved dp */ - vnode_t dp; /* the directory we are searching */ - mount_t mp; /* mount table entry */ - int docache = 1; /* == 0 do not cache last component */ - int wantparent; /* 1 => wantparent or lockparent flag */ - int rdonly; /* lookup read-only flag bit */ - int trailing_slash = 0; - int dp_authorized = 0; - int error = 0; - struct componentname *cnp = &ndp->ni_cnd; - vfs_context_t ctx = cnp->cn_context; - int mounted_on_depth = 0; - int dont_cache_mp = 0; - vnode_t mounted_on_dp = NULLVP; - int current_mount_generation = 0; - int vbusyflags = 0; - int nc_generation = 0; - vnode_t last_dp = NULLVP; + int error; - /* - * Setup: break out flag bits into variables. - */ - if (cnp->cn_flags & (NOCACHE | DOWHITEOUT)) { - if ((cnp->cn_flags & NOCACHE) || (cnp->cn_nameiop == DELETE)) - docache = 0; + if (!dp_authorized_in_cache) { + error = vnode_authorize(dp, NULL, KAUTH_VNODE_SEARCH, ctx); + if (error) + return error; } - wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT); - rdonly = cnp->cn_flags & RDONLY; - cnp->cn_flags &= ~ISSYMLINK; - cnp->cn_consume = 0; +#if CONFIG_MACF + error = mac_vnode_check_lookup(ctx, dp, cnp); + if (error) + return error; +#endif /* CONFIG_MACF */ - dp = ndp->ni_startdir; - ndp->ni_startdir = NULLVP; + return 0; +} - if ((cnp->cn_flags & CN_NBMOUNTLOOK) != 0) - vbusyflags = LK_NOWAIT; - cp = cnp->cn_nameptr; +void +lookup_consider_update_cache(vnode_t dvp, vnode_t vp, struct componentname *cnp, int nc_generation) +{ + int isdot_or_dotdot; + isdot_or_dotdot = (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') || (cnp->cn_flags & ISDOTDOT); - if (*cp == '\0') { - if ( (vnode_getwithref(dp)) ) { - dp = NULLVP; - error = ENOENT; - goto bad; + if (vp->v_name == NULL || vp->v_parent == NULLVP) { + int update_flags = 0; + + if (isdot_or_dotdot == 0) { + if (vp->v_name == NULL) + update_flags |= VNODE_UPDATE_NAME; + if (dvp != NULLVP && vp->v_parent == NULLVP) + update_flags |= VNODE_UPDATE_PARENT; + + if (update_flags) + vnode_update_identity(vp, dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, update_flags); } - goto emptyname; } -dirloop: - ndp->ni_vp = NULLVP; + if ( (cnp->cn_flags & MAKEENTRY) && (vp->v_flag & VNCACHEABLE) && LIST_FIRST(&vp->v_nclinks) == NULL) { + /* + * missing from name cache, but should + * be in it... this can happen if volfs + * causes the vnode to be created or the + * name cache entry got recycled but the + * vnode didn't... + * check to make sure that ni_dvp is valid + * cache_lookup_path may return a NULL + * do a quick check to see if the generation of the + * directory matches our snapshot... this will get + * rechecked behind the name cache lock, but if it + * already fails to match, no need to go any further + */ + if (dvp != NULLVP && (nc_generation == dvp->v_nc_generation) && (!isdot_or_dotdot)) + cache_enter_with_gen(dvp, vp, cnp, nc_generation); + } - if ( (error = cache_lookup_path(ndp, cnp, dp, ctx, &trailing_slash, &dp_authorized, last_dp)) ) { - dp = NULLVP; - goto bad; +} + +#if NAMEDRSRCFORK +/* + * Can change ni_dvp and ni_vp. On success, returns with iocounts on stream vnode (always) and + * data fork if requested. On failure, returns with iocount data fork (always) and its parent directory + * (if one was provided). 
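lookup_consider_update_cache() above only calls cache_enter_with_gen() when the directory's generation still matches the snapshot taken before the unlocked lookup work. The idiom in miniature, assuming a made-up type for clarity:

struct dir_like {			/* illustrative, not the real vnode layout */
	int	v_nc_generation;	/* bumped when the directory's entries are purged */
};

static int
publish_if_unchanged(struct dir_like *dvp, int snapshot)
{
	/*
	 * Cheap racy pre-check; the authoritative re-check happens
	 * behind the name cache lock inside cache_enter_with_gen().
	 */
	return (snapshot == dvp->v_nc_generation);
}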
+ */ +int +lookup_handle_rsrc_fork(vnode_t dp, struct nameidata *ndp, struct componentname *cnp, int wantparent, vfs_context_t ctx) +{ + vnode_t svp = NULLVP; + enum nsoperation nsop; + int error; + + if (dp->v_type != VREG) { + error = ENOENT; + goto out; } - if ((cnp->cn_flags & ISLASTCN)) { - if (docache) - cnp->cn_flags |= MAKEENTRY; - } else - cnp->cn_flags |= MAKEENTRY; + switch (cnp->cn_nameiop) { + case DELETE: + if (cnp->cn_flags & CN_ALLOWRSRCFORK) { + nsop = NS_DELETE; + } else { + error = EPERM; + goto out; + } + break; + case CREATE: + if (cnp->cn_flags & CN_ALLOWRSRCFORK) { + nsop = NS_CREATE; + } else { + error = EPERM; + goto out; + } + break; + case LOOKUP: + /* Make sure our lookup of "/..namedfork/rsrc" is allowed. */ + if (cnp->cn_flags & CN_ALLOWRSRCFORK) { + nsop = NS_OPEN; + } else { + error = EPERM; + goto out; + } + break; + default: + error = EPERM; + goto out; + } + /* Ask the file system for the resource fork. */ + error = vnode_getnamedstream(dp, &svp, XATTR_RESOURCEFORK_NAME, nsop, 0, ctx); - dp = ndp->ni_dvp; + /* During a create, it is OK for stream vnode to be missing. */ + if (error == ENOATTR || error == ENOENT) { + error = (nsop == NS_CREATE) ? 0 : ENOENT; + } + if (error) { + goto out; + } + /* The "parent" of the stream is the file. */ + if (wantparent) { + if (ndp->ni_dvp) { +#ifndef __LP64__ + if (ndp->ni_cnd.cn_flags & FSNODELOCKHELD) { + ndp->ni_cnd.cn_flags &= ~FSNODELOCKHELD; + unlock_fsnode(ndp->ni_dvp, NULL); + } +#endif /* __LP64__ */ + vnode_put(ndp->ni_dvp); + } + ndp->ni_dvp = dp; + } else { + vnode_put(dp); + } + ndp->ni_vp = svp; /* on create this may be null */ - if (ndp->ni_vp != NULLVP) { - /* - * cache_lookup_path returned a non-NULL ni_vp then, - * we're guaranteed that the dp is a VDIR, it's - * been authorized, and vp is not ".." - * - * make sure we don't try to enter the name back into - * the cache if this vp is purged before we get to that - * check since we won't have serialized behind whatever - * activity is occurring in the FS that caused the purge - */ - if (dp != NULLVP) - nc_generation = dp->v_nc_generation - 1; + /* Restore the truncated pathname buffer (for audits). */ + if (ndp->ni_pathlen == 1 && ndp->ni_next[0] == '\0') { + ndp->ni_next[0] = '/'; + } + cnp->cn_flags &= ~MAKEENTRY; - goto returned_from_lookup_path; + return 0; +out: + return error; +} +#endif /* NAMEDRSRCFORK */ + +/* + * iocounts in: + * --One on ni_vp. One on ni_dvp if there is more path, or we didn't come through the + * cache, or we came through the cache and the caller doesn't want the parent. + * + * iocounts out: + * --Leaves us in the correct state for the next step, whatever that might be. + * --If we find a symlink, returns with iocounts on both ni_vp and ni_dvp. + * --If we are to look up another component, then we have an iocount on ni_vp and + * nothing else. + * --If we are done, returns an iocount on ni_vp, and possibly on ni_dvp depending on nameidata flags. + * --In the event of an error, may return with ni_dvp NULL'ed out (in which case, iocount + * was dropped).
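From user space, the path shape that reaches lookup_handle_rsrc_fork() is the "..namedfork/rsrc" suffix quoted in the switch above; with cn_nameiop == LOOKUP and CN_ALLOWRSRCFORK set, the stream vnode is swapped into ni_vp and the data fork becomes its "parent". A small illustration (the volume and file names are hypothetical):

#include <fcntl.h>
#include <unistd.h>

int
read_rsrc_fork(void)
{
	char buf[512];
	/* drives namei() -> lookup_handle_rsrc_fork() with nsop == NS_OPEN */
	int fd = open("/Volumes/HFS/somefile/..namedfork/rsrc", O_RDONLY);

	if (fd < 0)
		return -1;
	(void)read(fd, buf, sizeof(buf));
	close(fd);
	return 0;
}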
+ */ +int +lookup_handle_found_vnode(struct nameidata *ndp, struct componentname *cnp, int rdonly, + int vbusyflags, int *keep_going, int nc_generation, + int wantparent, int atroot, vfs_context_t ctx) +{ + vnode_t dp; + int error; + char *cp; + + dp = ndp->ni_vp; + *keep_going = 0; + + if (ndp->ni_vp == NULLVP) { + panic("NULL ni_vp in %s\n", __FUNCTION__); + } + + if (atroot) { + goto nextname; + } + +#if CONFIG_TRIGGERS + if (dp->v_resolve) { + error = vnode_trigger_resolve(dp, ndp, ctx); + if (error) { + goto out; + } + } +#endif /* CONFIG_TRIGGERS */ + + /* + * Take into account any additional components consumed by + * the underlying filesystem. + */ + if (cnp->cn_consume > 0) { + cnp->cn_nameptr += cnp->cn_consume; + ndp->ni_next += cnp->cn_consume; + ndp->ni_pathlen -= cnp->cn_consume; + cnp->cn_consume = 0; + } else { + lookup_consider_update_cache(ndp->ni_dvp, dp, cnp, nc_generation); + } + + /* + * Check to see if the vnode has been mounted on... + * if so find the root of the mounted file system. + * Updates ndp->ni_vp. + */ + error = lookup_traverse_mountpoints(ndp, cnp, dp, vbusyflags, ctx); + dp = ndp->ni_vp; + if (error) { + goto out; + } + +#if CONFIG_MACF + if (vfs_flags(vnode_mount(dp)) & MNT_MULTILABEL) { + error = vnode_label(vnode_mount(dp), NULL, dp, NULL, 0, ctx); + if (error) + goto out; + } +#endif + + /* + * Check for symbolic link + */ + if ((dp->v_type == VLNK) && + ((cnp->cn_flags & FOLLOW) || (ndp->ni_flag & NAMEI_TRAILINGSLASH) || *ndp->ni_next == '/')) { + cnp->cn_flags |= ISSYMLINK; + *keep_going = 1; + return (0); + } + + /* + * Check for bogus trailing slashes. + */ + if ((ndp->ni_flag & NAMEI_TRAILINGSLASH)) { + if (dp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + ndp->ni_flag &= ~(NAMEI_TRAILINGSLASH); + } + +nextname: + /* + * Not a symbolic link. If more pathname, + * continue at next component, else return. + * + * Definitely have a dvp if there's another slash + */ + if (*ndp->ni_next == '/') { + cnp->cn_nameptr = ndp->ni_next + 1; + ndp->ni_pathlen--; + while (*cnp->cn_nameptr == '/') { + cnp->cn_nameptr++; + ndp->ni_pathlen--; + } + + cp = cnp->cn_nameptr; + vnode_put(ndp->ni_dvp); + ndp->ni_dvp = NULLVP; + + if (*cp == '\0') { + goto emptyname; + } + + *keep_going = 1; + return 0; + } + + /* + * Disallow directory write attempts on read-only file systems. + */ + if (rdonly && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { + error = EROFS; + goto out; + } + + /* If SAVESTART is set, we should have a dvp */ + if (cnp->cn_flags & SAVESTART) { + /* + * note that we already hold a reference + * on both dp and ni_dvp, but for some reason + * can't get another one... in this case we + * need to do vnode_put on dp in 'bad2' + */ + if ( (vnode_get(ndp->ni_dvp)) ) { + error = ENOENT; + goto out; + } + ndp->ni_startdir = ndp->ni_dvp; + } + if (!wantparent && ndp->ni_dvp) { + vnode_put(ndp->ni_dvp); + ndp->ni_dvp = NULLVP; + } + + if (cnp->cn_flags & AUDITVNPATH1) + AUDIT_ARG(vnpath, dp, ARG_VNODE1); + else if (cnp->cn_flags & AUDITVNPATH2) + AUDIT_ARG(vnpath, dp, ARG_VNODE2); + +#if NAMEDRSRCFORK + /* + * Caller wants the resource fork. 
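The NAMEI_TRAILINGSLASH check above is what turns a trailing slash on a non-directory into ENOTDIR before the flag is cleared. The user-visible effect, sketched with a hypothetical path:

#include <sys/stat.h>
#include <errno.h>

int
demo_trailing_slash(void)
{
	struct stat st;

	/* "/etc/hosts" resolves to a VREG, so the trailing slash is bogus */
	if (stat("/etc/hosts/", &st) == -1 && errno == ENOTDIR)
		return 0;
	return 1;
}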
+ */ + if ((cnp->cn_flags & CN_WANTSRSRCFORK) && (dp != NULLVP)) { + error = lookup_handle_rsrc_fork(dp, ndp, cnp, wantparent, ctx); + if (error != 0) + goto out; + + dp = ndp->ni_vp; + } +#endif + if (kdebug_enable) + kdebug_lookup(dp, cnp); + + return 0; + +emptyname: + error = lookup_handle_emptyname(ndp, cnp, wantparent); + if (error != 0) + goto out; + + return 0; +out: + return error; + +} + +/* + * Comes in iocount on ni_vp. May overwrite ni_dvp, but doesn't interpret incoming value. + */ +int +lookup_handle_emptyname(struct nameidata *ndp, struct componentname *cnp, int wantparent) +{ + vnode_t dp; + int error = 0; + + dp = ndp->ni_vp; + cnp->cn_namelen = 0; + /* + * A degenerate name (e.g. / or "") which is a way of + * talking about a directory, e.g. like "/." or ".". + */ + if (dp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + if (cnp->cn_nameiop != LOOKUP) { + error = EISDIR; + goto out; + } + if (wantparent) { + /* + * note that we already hold a reference + * on dp, but for some reason can't + * get another one... in this case we + * need to do vnode_put on dp in 'bad' + */ + if ( (vnode_get(dp)) ) { + error = ENOENT; + goto out; + } + ndp->ni_dvp = dp; + } + cnp->cn_flags &= ~ISDOTDOT; + cnp->cn_flags |= ISLASTCN; + ndp->ni_next = cnp->cn_nameptr; + ndp->ni_vp = dp; + + if (cnp->cn_flags & AUDITVNPATH1) + AUDIT_ARG(vnpath, dp, ARG_VNODE1); + else if (cnp->cn_flags & AUDITVNPATH2) + AUDIT_ARG(vnpath, dp, ARG_VNODE2); + if (cnp->cn_flags & SAVESTART) + panic("lookup: SAVESTART"); + + return 0; +out: + return error; +} +/* + * Search a pathname. + * This is a very central and rather complicated routine. + * + * The pathname is pointed to by ni_ptr and is of length ni_pathlen. + * The starting directory is taken from ni_startdir. The pathname is + * descended until done, or a symbolic link is encountered. The variable + * ni_more is clear if the path is completed; it is set to one if a + * symbolic link needing interpretation is encountered. + * + * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on + * whether the name is to be looked up, created, renamed, or deleted. + * When CREATE, RENAME, or DELETE is specified, information usable in + * creating, renaming, or deleting a directory entry may be calculated. + * If flag has LOCKPARENT or'ed into it, the parent directory is returned + * locked. If flag has WANTPARENT or'ed into it, the parent directory is + * returned unlocked. Otherwise the parent directory is not returned. If + * the target of the pathname exists and LOCKLEAF is or'ed into the flag + * the target is returned locked, otherwise it is returned unlocked. + * When creating or renaming and LOCKPARENT is specified, the target may not + * be ".". When deleting and LOCKPARENT is specified, the target may be ".". + * + * Overall outline of lookup: + * + * dirloop: + * identify next component of name at ndp->ni_ptr + * handle degenerate case where name is null string + * if .. and crossing mount points and on mounted filesys, find parent + * call VNOP_LOOKUP routine for next component name + * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set + * component vnode returned in ni_vp (if it exists), locked. 
+ * if result vnode is mounted on and crossing mount points, + * find mounted on vnode + * if more components of name, do next level at dirloop + * return the answer in ni_vp, locked if LOCKLEAF set + * if LOCKPARENT set, return locked parent in ni_dvp + * if WANTPARENT set, return unlocked parent in ni_dvp + * + * Returns: 0 Success + * ENOENT No such file or directory + * EBADF Bad file descriptor + * ENOTDIR Not a directory + * EROFS Read-only file system [CREATE] + * EISDIR Is a directory [CREATE] + * cache_lookup_path:ERECYCLE (vnode was recycled from underneath us, redrive lookup again) + * vnode_authorize:EROFS + * vnode_authorize:EACCES + * vnode_authorize:EPERM + * vnode_authorize:??? + * VNOP_LOOKUP:ENOENT No such file or directory + * VNOP_LOOKUP:EJUSTRETURN Restart system call (INTERNAL) + * VNOP_LOOKUP:??? + * VFS_ROOT:ENOTSUP + * VFS_ROOT:ENOENT + * VFS_ROOT:??? + */ +int +lookup(struct nameidata *ndp) +{ + char *cp; /* pointer into pathname argument */ + vnode_t tdp; /* saved dp */ + vnode_t dp; /* the directory we are searching */ + int docache = 1; /* == 0 do not cache last component */ + int wantparent; /* 1 => wantparent or lockparent flag */ + int rdonly; /* lookup read-only flag bit */ + int dp_authorized = 0; + int error = 0; + struct componentname *cnp = &ndp->ni_cnd; + vfs_context_t ctx = cnp->cn_context; + int vbusyflags = 0; + int nc_generation = 0; + vnode_t last_dp = NULLVP; + int keep_going; + int atroot; + + /* + * Setup: break out flag bits into variables. + */ + if (cnp->cn_flags & (NOCACHE | DOWHITEOUT)) { + if ((cnp->cn_flags & NOCACHE) || (cnp->cn_nameiop == DELETE)) + docache = 0; + } + wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT); + rdonly = cnp->cn_flags & RDONLY; + cnp->cn_flags &= ~ISSYMLINK; + cnp->cn_consume = 0; + + dp = ndp->ni_startdir; + ndp->ni_startdir = NULLVP; + + if ((cnp->cn_flags & CN_NBMOUNTLOOK) != 0) + vbusyflags = LK_NOWAIT; + cp = cnp->cn_nameptr; + + if (*cp == '\0') { + if ( (vnode_getwithref(dp)) ) { + dp = NULLVP; + error = ENOENT; + goto bad; + } + ndp->ni_vp = dp; + error = lookup_handle_emptyname(ndp, cnp, wantparent); + if (error) { + goto bad; + } + + return 0; + } +dirloop: + atroot = 0; + ndp->ni_vp = NULLVP; + + if ( (error = cache_lookup_path(ndp, cnp, dp, ctx, &dp_authorized, last_dp)) ) { + dp = NULLVP; + goto bad; + } + if ((cnp->cn_flags & ISLASTCN)) { + if (docache) + cnp->cn_flags |= MAKEENTRY; + } else + cnp->cn_flags |= MAKEENTRY; + + dp = ndp->ni_dvp; + + if (ndp->ni_vp != NULLVP) { + /* + * cache_lookup_path returned a non-NULL ni_vp then, + * we're guaranteed that the dp is a VDIR, it's + * been authorized, and vp is not ".." 
+ * + * make sure we don't try to enter the name back into + * the cache if this vp is purged before we get to that + * check since we won't have serialized behind whatever + * activity is occurring in the FS that caused the purge + */ + if (dp != NULLVP) + nc_generation = dp->v_nc_generation - 1; + + goto returned_from_lookup_path; } /* @@ -618,7 +954,8 @@ dirloop: error = ENOENT; goto bad; } - goto nextname; + atroot = 1; + goto returned_from_lookup_path; } if ((dp->v_flag & VROOT) == 0 || (cnp->cn_flags & NOCROSSMOUNT)) @@ -653,21 +990,32 @@ unionlookup: goto lookup_error; } if ( (cnp->cn_flags & DONOTAUTH) != DONOTAUTH ) { - if (!dp_authorized) { - error = vnode_authorize(dp, NULL, KAUTH_VNODE_SEARCH, ctx); - if (error) - goto lookup_error; - } -#if CONFIG_MACF - error = mac_vnode_check_lookup(ctx, dp, cnp); - if (error) + error = lookup_authorize_search(dp, cnp, dp_authorized, ctx); + if (error) { goto lookup_error; -#endif /* CONFIG_MACF */ + } + } + + /* + * Now that we've authorized a lookup, can bail out if the filesystem + * will be doing a batched operation. Return an iocount on dvp. + */ +#if NAMEDRSRCFORK + if ((cnp->cn_flags & ISLASTCN) && namei_compound_available(dp, ndp) && !(cnp->cn_flags & CN_WANTSRSRCFORK)) { +#else + if ((cnp->cn_flags & ISLASTCN) && namei_compound_available(dp, ndp)) { +#endif /* NAMEDRSRCFORK */ + ndp->ni_flag |= NAMEI_UNFINISHED; + ndp->ni_ncgeneration = dp->v_nc_generation; + return 0; } nc_generation = dp->v_nc_generation; - if ( (error = VNOP_LOOKUP(dp, &ndp->ni_vp, cnp, ctx)) ) { + error = VNOP_LOOKUP(dp, &ndp->ni_vp, cnp, ctx); + + + if ( error ) { lookup_error: if ((error == ENOENT) && (dp->v_flag & VROOT) && (dp->v_mount != NULL) && @@ -699,18 +1047,9 @@ lookup_error: if (ndp->ni_vp != NULLVP) panic("leaf should be empty"); - /* - * If creating and at end of pathname, then can consider - * allowing file to be created. - */ - if (rdonly) { - error = EROFS; - goto bad; - } - if ((cnp->cn_flags & ISLASTCN) && trailing_slash && !(cnp->cn_flags & WILLBEDIR)) { - error = ENOENT; + error = lookup_validate_creation_path(ndp); + if (error) goto bad; - } /* * We return with ni_vp NULL to indicate that the entry * doesn't currently exist, leaving a pointer to the @@ -731,337 +1070,33 @@ lookup_error: return (0); } returned_from_lookup_path: - dp = ndp->ni_vp; - - /* - * Take into account any additional components consumed by - * the underlying filesystem. - */ - if (cnp->cn_consume > 0) { - cnp->cn_nameptr += cnp->cn_consume; - ndp->ni_next += cnp->cn_consume; - ndp->ni_pathlen -= cnp->cn_consume; - cnp->cn_consume = 0; - } else { - int isdot_or_dotdot; - isdot_or_dotdot = (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') || (cnp->cn_flags & ISDOTDOT); - - if (dp->v_name == NULL || dp->v_parent == NULLVP) { - int update_flags = 0; - - if (isdot_or_dotdot == 0) { - if (dp->v_name == NULL) - update_flags |= VNODE_UPDATE_NAME; - if (ndp->ni_dvp != NULLVP && dp->v_parent == NULLVP) - update_flags |= VNODE_UPDATE_PARENT; - - if (update_flags) - vnode_update_identity(dp, ndp->ni_dvp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, update_flags); - } - } - if ( (cnp->cn_flags & MAKEENTRY) && (dp->v_flag & VNCACHEABLE) && LIST_FIRST(&dp->v_nclinks) == NULL) { - /* - * missing from name cache, but should - * be in it... this can happen if volfs - * causes the vnode to be created or the - * name cache entry got recycled but the - * vnode didn't... 
- * check to make sure that ni_dvp is valid - * cache_lookup_path may return a NULL - * do a quick check to see if the generation of the - * directory matches our snapshot... this will get - * rechecked behind the name cache lock, but if it - * already fails to match, no need to go any further - */ - if (ndp->ni_dvp != NULLVP && (nc_generation == ndp->ni_dvp->v_nc_generation) && (!isdot_or_dotdot)) - cache_enter_with_gen(ndp->ni_dvp, dp, cnp, nc_generation); - } - } - - mounted_on_dp = dp; - mounted_on_depth = 0; - dont_cache_mp = 0; - current_mount_generation = mount_generation; - /* - * Check to see if the vnode has been mounted on... - * if so find the root of the mounted file system. - */ -check_mounted_on: - if ((dp->v_type == VDIR) && dp->v_mountedhere && - ((cnp->cn_flags & NOCROSSMOUNT) == 0)) { - - vnode_lock(dp); - - if ((dp->v_type == VDIR) && (mp = dp->v_mountedhere)) { - struct uthread *uth = (struct uthread *)get_bsdthread_info(current_thread()); - - mp->mnt_crossref++; - vnode_unlock(dp); - - - if (vfs_busy(mp, vbusyflags)) { - mount_dropcrossref(mp, dp, 0); - if (vbusyflags == LK_NOWAIT) { - error = ENOENT; - goto bad2; - } - goto check_mounted_on; - } - - /* - * XXX - if this is the last component of the - * pathname, and it's either not a lookup operation - * or the NOTRIGGER flag is set for the operation, - * set a uthread flag to let VFS_ROOT() for autofs - * know it shouldn't trigger a mount. - */ - if ((cnp->cn_flags & ISLASTCN) && - (cnp->cn_nameiop != LOOKUP || - (cnp->cn_flags & NOTRIGGER))) { - uth->uu_notrigger = 1; - dont_cache_mp = 1; - } - error = VFS_ROOT(mp, &tdp, ctx); - /* XXX - clear the uthread flag */ - uth->uu_notrigger = 0; - /* - * mount_dropcrossref does a vnode_put - * on dp if the 3rd arg is non-zero - */ - mount_dropcrossref(mp, dp, 1); - dp = NULL; - vfs_unbusy(mp); - - if (error) { - goto bad2; - } - ndp->ni_vp = dp = tdp; - mounted_on_depth++; - - goto check_mounted_on; - } - vnode_unlock(dp); + /* We'll always have an iocount on ni_vp when this finishes. */ + error = lookup_handle_found_vnode(ndp, cnp, rdonly, vbusyflags, &keep_going, nc_generation, wantparent, atroot, ctx); + if (error != 0) { + goto bad2; } -#if CONFIG_MACF - if (vfs_flags(vnode_mount(dp)) & MNT_MULTILABEL) { - error = vnode_label(vnode_mount(dp), NULL, dp, NULL, 0, ctx); - if (error) - goto bad2; - } -#endif - - if (mounted_on_depth && !dont_cache_mp) { - mp = mounted_on_dp->v_mountedhere; - - if (mp) { - mount_lock_spin(mp); - mp->mnt_realrootvp_vid = dp->v_id; - mp->mnt_realrootvp = dp; - mp->mnt_generation = current_mount_generation; - mount_unlock(mp); - } - } - - /* - * Check for symbolic link - */ - if ((dp->v_type == VLNK) && - ((cnp->cn_flags & FOLLOW) || trailing_slash || *ndp->ni_next == '/')) { - cnp->cn_flags |= ISSYMLINK; - return (0); - } - - /* - * Check for bogus trailing slashes. - */ - if (trailing_slash) { - if (dp->v_type != VDIR) { - error = ENOTDIR; - goto bad2; - } - trailing_slash = 0; - } - -nextname: - /* - * Not a symbolic link. If more pathname, - * continue at next component, else return. - */ - if (*ndp->ni_next == '/') { - cnp->cn_nameptr = ndp->ni_next + 1; - ndp->ni_pathlen--; - while (*cnp->cn_nameptr == '/') { - cnp->cn_nameptr++; - ndp->ni_pathlen--; - } - vnode_put(ndp->ni_dvp); - - cp = cnp->cn_nameptr; - - if (*cp == '\0') - goto emptyname; - - /* - * cache_lookup_path is now responsible for dropping io ref on dp - * when it is called again in the dirloop. 
This ensures we hold - * a ref on dp until we complete the next round of lookup. - */ - last_dp = dp; - goto dirloop; - } - - /* - * Disallow directory write attempts on read-only file systems. - */ - if (rdonly && - (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { - error = EROFS; - goto bad2; - } - if (cnp->cn_flags & SAVESTART) { - /* - * note that we already hold a reference - * on both dp and ni_dvp, but for some reason - * can't get another one... in this case we - * need to do vnode_put on dp in 'bad2' - */ - if ( (vnode_get(ndp->ni_dvp)) ) { - error = ENOENT; - goto bad2; - } - ndp->ni_startdir = ndp->ni_dvp; - } - if (!wantparent && ndp->ni_dvp) { - vnode_put(ndp->ni_dvp); - ndp->ni_dvp = NULLVP; - } - - if (cnp->cn_flags & AUDITVNPATH1) - AUDIT_ARG(vnpath, dp, ARG_VNODE1); - else if (cnp->cn_flags & AUDITVNPATH2) - AUDIT_ARG(vnpath, dp, ARG_VNODE2); - -#if NAMEDRSRCFORK - /* - * Caller wants the resource fork. - */ - if ((cnp->cn_flags & CN_WANTSRSRCFORK) && (dp != NULLVP)) { - vnode_t svp = NULLVP; - enum nsoperation nsop; - - if (dp->v_type != VREG) { - error = ENOENT; - goto bad2; - } - switch (cnp->cn_nameiop) { - case DELETE: - if (cnp->cn_flags & CN_ALLOWRSRCFORK) { - nsop = NS_DELETE; - } else { - error = EPERM; - goto bad2; - } - break; - case CREATE: - if (cnp->cn_flags & CN_ALLOWRSRCFORK) { - nsop = NS_CREATE; - } else { - error = EPERM; - goto bad2; - } - break; - case LOOKUP: - /* Make sure our lookup of "/..namedfork/rsrc" is allowed. */ - if (cnp->cn_flags & CN_ALLOWRSRCFORK) { - nsop = NS_OPEN; - } else { - error = EPERM; - goto bad2; - } - break; - default: - error = EPERM; - goto bad2; - } - /* Ask the file system for the resource fork. */ - error = vnode_getnamedstream(dp, &svp, XATTR_RESOURCEFORK_NAME, nsop, 0, ctx); - - /* During a create, it OK for stream vnode to be missing. */ - if (error == ENOATTR || error == ENOENT) { - error = (nsop == NS_CREATE) ? 0 : ENOENT; - } - if (error) { - goto bad2; - } - /* The "parent" of the stream is the file. */ - if (wantparent) { - if (ndp->ni_dvp) { -#ifndef __LP64__ - if (ndp->ni_cnd.cn_flags & FSNODELOCKHELD) { - ndp->ni_cnd.cn_flags &= ~FSNODELOCKHELD; - unlock_fsnode(ndp->ni_dvp, NULL); - } -#endif /* __LP64__ */ - vnode_put(ndp->ni_dvp); - } - ndp->ni_dvp = dp; - } else { - vnode_put(dp); - } - ndp->ni_vp = dp = svp; /* on create this may be null */ + if (keep_going) { + dp = ndp->ni_vp; - /* Restore the truncated pathname buffer (for audits). */ - if (ndp->ni_pathlen == 1 && ndp->ni_next[0] == '\0') { - ndp->ni_next[0] = '/'; + /* namei() will handle symlinks */ + if ((dp->v_type == VLNK) && + ((cnp->cn_flags & FOLLOW) || (ndp->ni_flag & NAMEI_TRAILINGSLASH) || *ndp->ni_next == '/')) { + return 0; } - cnp->cn_flags &= ~MAKEENTRY; - } -#endif - if (kdebug_enable) - kdebug_lookup(dp, cnp); - return (0); -emptyname: - cnp->cn_namelen = 0; - /* - * A degenerate name (e.g. / or "") which is a way of - * talking about a directory, e.g. like "/." or ".". - */ - if (dp->v_type != VDIR) { - error = ENOTDIR; - goto bad; - } - if (cnp->cn_nameiop != LOOKUP) { - error = EISDIR; - goto bad; - } - if (wantparent) { - /* - * note that we already hold a reference - * on dp, but for some reason can't - * get another one... in this case we - * need to do vnode_put on dp in 'bad' + /* + * Otherwise, there's more path to process. + * cache_lookup_path is now responsible for dropping io ref on dp + * when it is called again in the dirloop. This ensures we hold + * a ref on dp until we complete the next round of lookup. 
*/ - if ( (vnode_get(dp)) ) { - error = ENOENT; - goto bad; - } - ndp->ni_dvp = dp; + last_dp = dp; + + goto dirloop; } - cnp->cn_flags &= ~ISDOTDOT; - cnp->cn_flags |= ISLASTCN; - ndp->ni_next = cp; - ndp->ni_vp = dp; - if (cnp->cn_flags & AUDITVNPATH1) - AUDIT_ARG(vnpath, dp, ARG_VNODE1); - else if (cnp->cn_flags & AUDITVNPATH2) - AUDIT_ARG(vnpath, dp, ARG_VNODE2); - if (cnp->cn_flags & SAVESTART) - panic("lookup: SAVESTART"); return (0); - bad2: #ifndef __LP64__ if ((cnp->cn_flags & FSNODELOCKHELD)) { @@ -1070,9 +1105,9 @@ bad2: } #endif /* __LP64__ */ if (ndp->ni_dvp) - vnode_put(ndp->ni_dvp); - if (dp) - vnode_put(dp); + vnode_put(ndp->ni_dvp); + + vnode_put(ndp->ni_vp); ndp->ni_vp = NULLVP; if (kdebug_enable) @@ -1095,6 +1130,257 @@ bad: return (error); } +int +lookup_validate_creation_path(struct nameidata *ndp) +{ + struct componentname *cnp = &ndp->ni_cnd; + + /* + * If creating and at end of pathname, then can consider + * allowing file to be created. + */ + if (cnp->cn_flags & RDONLY) { + return EROFS; + } + if ((cnp->cn_flags & ISLASTCN) && (ndp->ni_flag & NAMEI_TRAILINGSLASH) && !(cnp->cn_flags & WILLBEDIR)) { + return ENOENT; + } + + return 0; +} + +/* + * Modifies only ni_vp. Always returns with ni_vp still valid (iocount held). + */ +int +lookup_traverse_mountpoints(struct nameidata *ndp, struct componentname *cnp, vnode_t dp, + int vbusyflags, vfs_context_t ctx) +{ + mount_t mp; + vnode_t tdp; + int error = 0; + uthread_t uth; + uint32_t depth = 0; + int dont_cache_mp = 0; + vnode_t mounted_on_dp; + int current_mount_generation = 0; + + mounted_on_dp = dp; + current_mount_generation = mount_generation; + + while ((dp->v_type == VDIR) && dp->v_mountedhere && + ((cnp->cn_flags & NOCROSSMOUNT) == 0)) { +#if CONFIG_TRIGGERS + /* + * For a trigger vnode, call its resolver when crossing its mount (if requested) + */ + if (dp->v_resolve) { + (void) vnode_trigger_resolve(dp, ndp, ctx); + } +#endif + vnode_lock(dp); + + if ((dp->v_type == VDIR) && (mp = dp->v_mountedhere)) { + + mp->mnt_crossref++; + vnode_unlock(dp); + + + if (vfs_busy(mp, vbusyflags)) { + mount_dropcrossref(mp, dp, 0); + if (vbusyflags == LK_NOWAIT) { + error = ENOENT; + goto out; + } + + continue; + } + + + /* + * XXX - if this is the last component of the + * pathname, and it's either not a lookup operation + * or the NOTRIGGER flag is set for the operation, + * set a uthread flag to let VFS_ROOT() for autofs + * know it shouldn't trigger a mount. + */ + uth = (struct uthread *)get_bsdthread_info(current_thread()); + if ((cnp->cn_flags & ISLASTCN) && + (cnp->cn_nameiop != LOOKUP || + (cnp->cn_flags & NOTRIGGER))) { + uth->uu_notrigger = 1; + dont_cache_mp = 1; + } + + error = VFS_ROOT(mp, &tdp, ctx); + /* XXX - clear the uthread flag */ + uth->uu_notrigger = 0; + + mount_dropcrossref(mp, dp, 0); + vfs_unbusy(mp); + + if (error) { + goto out; + } + + vnode_put(dp); + ndp->ni_vp = dp = tdp; + depth++; + +#if CONFIG_TRIGGERS + /* + * Check if root dir is a trigger vnode + */ + if (dp->v_resolve) { + error = vnode_trigger_resolve(dp, ndp, ctx); + if (error) { + goto out; + } + } +#endif + + } else { + vnode_unlock(dp); + break; + } + } + + if (depth && !dont_cache_mp) { + mp = mounted_on_dp->v_mountedhere; + + if (mp) { + mount_lock_spin(mp); + mp->mnt_realrootvp_vid = dp->v_id; + mp->mnt_realrootvp = dp; + mp->mnt_generation = current_mount_generation; + mount_unlock(mp); + } + } + + return 0; + +out: + return error; +} + +/* + * Takes ni_vp and ni_dvp non-NULL. 
Returns with *new_dp set to the location + * at which to start a lookup with a resolved path, and all other iocounts dropped. + */ +int +lookup_handle_symlink(struct nameidata *ndp, vnode_t *new_dp, vfs_context_t ctx) +{ + int error; + char *cp; /* pointer into pathname argument */ + uio_t auio; + char uio_buf[ UIO_SIZEOF(1) ]; + int need_newpathbuf; + u_int linklen; + struct componentname *cnp = &ndp->ni_cnd; + vnode_t dp; + char *tmppn; + +#ifndef __LP64__ + if ((cnp->cn_flags & FSNODELOCKHELD)) { + cnp->cn_flags &= ~FSNODELOCKHELD; + unlock_fsnode(ndp->ni_dvp, NULL); + } +#endif /* __LP64__ */ + + if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { + return ELOOP; + } +#if CONFIG_MACF + if ((error = mac_vnode_check_readlink(ctx, ndp->ni_vp)) != 0) + return error; +#endif /* MAC */ + if (ndp->ni_pathlen > 1 || !(cnp->cn_flags & HASBUF)) + need_newpathbuf = 1; + else + need_newpathbuf = 0; + + if (need_newpathbuf) { + MALLOC_ZONE(cp, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (cp == NULL) { + return ENOMEM; + } + } else { + cp = cnp->cn_pnbuf; + } + auio = uio_createwithbuffer(1, 0, UIO_SYSSPACE, UIO_READ, &uio_buf[0], sizeof(uio_buf)); + + uio_addiov(auio, CAST_USER_ADDR_T(cp), MAXPATHLEN); + + error = VNOP_READLINK(ndp->ni_vp, auio, ctx); + if (error) { + if (need_newpathbuf) + FREE_ZONE(cp, MAXPATHLEN, M_NAMEI); + return error; + } + + /* + * Safe to set unsigned with a [larger] signed type here + * because 0 <= uio_resid <= MAXPATHLEN and MAXPATHLEN + * is only 1024. + */ + linklen = MAXPATHLEN - (u_int)uio_resid(auio); + if (linklen + ndp->ni_pathlen > MAXPATHLEN) { + if (need_newpathbuf) + FREE_ZONE(cp, MAXPATHLEN, M_NAMEI); + + return ENAMETOOLONG; + } + if (need_newpathbuf) { + long len = cnp->cn_pnlen; + + tmppn = cnp->cn_pnbuf; + bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); + cnp->cn_pnbuf = cp; + cnp->cn_pnlen = MAXPATHLEN; + + if ( (cnp->cn_flags & HASBUF) ) + FREE_ZONE(tmppn, len, M_NAMEI); + else + cnp->cn_flags |= HASBUF; + } else + cnp->cn_pnbuf[linklen] = '\0'; + + ndp->ni_pathlen += linklen; + cnp->cn_nameptr = cnp->cn_pnbuf; + + /* + * starting point for 'relative' + * symbolic link path + */ + dp = ndp->ni_dvp; + + /* + * get rid of references returned via 'lookup' + */ + vnode_put(ndp->ni_vp); + vnode_put(ndp->ni_dvp); /* ALWAYS have a dvp for a symlink */ + + ndp->ni_vp = NULLVP; + ndp->ni_dvp = NULLVP; + + /* + * Check if symbolic link restarts us at the root + */ + if (*(cnp->cn_nameptr) == '/') { + while (*(cnp->cn_nameptr) == '/') { + cnp->cn_nameptr++; + ndp->ni_pathlen--; + } + if ((dp = ndp->ni_rootdir) == NULLVP) { + return ENOENT; + } + } + + *new_dp = dp; + + return 0; +} + /* * relookup - lookup a path name component * Used by lookup to re-aquire things. @@ -1205,18 +1491,27 @@ bad: return (error); } -/* - * Free pathname buffer - */ void -nameidone(struct nameidata *ndp) +namei_unlock_fsnode(struct nameidata *ndp) { #ifndef __LP64__ if ((ndp->ni_cnd.cn_flags & FSNODELOCKHELD)) { ndp->ni_cnd.cn_flags &= ~FSNODELOCKHELD; unlock_fsnode(ndp->ni_dvp, NULL); } +#else + (void)ndp; #endif /* __LP64__ */ +} + +/* + * Free pathname buffer + */ +void +nameidone(struct nameidata *ndp) +{ + namei_unlock_fsnode(ndp); + if (ndp->ni_cnd.cn_flags & HASBUF) { char *tmp = ndp->ni_cnd.cn_pnbuf; @@ -1258,6 +1553,7 @@ nameidone(struct nameidata *ndp) * fails because /foo_bar_baz is not found will only log "/foo_bar_baz", with * no '>' padding. But /foo_bar/spam would log "/foo_bar>>>>". 
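
lookup_handle_symlink() above is the kernel form of a familiar loop: cap the number of expansions, read the link target, and splice the unconsumed remainder of the path onto it, failing with ENAMETOOLONG when the combination would overflow one path buffer. A user-space sketch of a single expansion step with readlink(2):

    #include <errno.h>
    #include <limits.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    #define MAXSYMLINKS 32          /* same cap the kernel applies */

    static int
    expand_one_symlink(const char *link, const char *remainder,
        char *out, size_t outlen, int *loopcnt)
    {
        char target[PATH_MAX];
        ssize_t linklen;

        if ((*loopcnt)++ >= MAXSYMLINKS)
            return ELOOP;

        linklen = readlink(link, target, sizeof(target) - 1);
        if (linklen < 0)
            return errno;
        target[linklen] = '\0';

        /* As above: link target plus leftover path must fit in one buffer. */
        if ((size_t)linklen + strlen(remainder) + 1 > outlen)
            return ENAMETOOLONG;

        snprintf(out, outlen, "%s%s", target, remainder);
        return 0;
    }
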
*/ +#if !defined(NO_KDEBUG) static void kdebug_lookup(struct vnode *dp, struct componentname *cnp) { @@ -1305,7 +1601,34 @@ kdebug_lookup(struct vnode *dp, struct componentname *cnp) KERNEL_DEBUG_CONSTANT(code, dbg_parms[i], dbg_parms[i+1], dbg_parms[i+2], dbg_parms[i+3], 0); } } +#else /* NO_KDEBUG */ +static void +kdebug_lookup(struct vnode *dp __unused, struct componentname *cnp __unused) +{ +} +#endif /* NO_KDEBUG */ + +int +vfs_getbyid(fsid_t *fsid, ino64_t ino, vnode_t *vpp, vfs_context_t ctx) +{ + mount_t mp; + int error; + + mp = mount_lookupby_volfsid(fsid->val[0], 1); + if (mp == NULL) { + return EINVAL; + } + + /* Get the target vnode. */ + if (ino == 2) { + error = VFS_ROOT(mp, vpp, ctx); + } else { + error = VFS_VGET(mp, ino, vpp, ctx); + } + vfs_unbusy(mp); + return error; +} /* * Obtain the real path from a legacy volfs style path. * @@ -1384,3 +1707,59 @@ out: return (error); } #endif + +void +lookup_compound_vnop_post_hook(int error, vnode_t dvp, vnode_t vp, struct nameidata *ndp, int did_create) +{ + if (error == 0 && vp == NULLVP) { + panic("NULL vp with error == 0.\n"); + } + + /* + * We don't want to do any of this if we didn't use the compound vnop + * to perform the lookup... i.e. if we're allowing and using the legacy pattern, + * where we did a full lookup. + */ + if ((ndp->ni_flag & NAMEI_COMPOUND_OP_MASK) == 0) { + return; + } + + /* + * If we're going to continue the lookup, we'll handle + * all lookup-related updates at that time. + */ + if (error == EKEEPLOOKING) { + return; + } + + /* + * Only audit or update cache for *found* vnodes. For creation + * neither would happen in the non-compound-vnop case. + */ + if ((vp != NULLVP) && !did_create) { + /* + * If MAKEENTRY isn't set, and we've done a successful compound VNOP, + * then we certainly don't want to update cache or identity. + */ + if ((error != 0) || (ndp->ni_cnd.cn_flags & MAKEENTRY)) { + lookup_consider_update_cache(dvp, vp, &ndp->ni_cnd, ndp->ni_ncgeneration); + } + if (ndp->ni_cnd.cn_flags & AUDITVNPATH1) + AUDIT_ARG(vnpath, vp, ARG_VNODE1); + else if (ndp->ni_cnd.cn_flags & AUDITVNPATH2) + AUDIT_ARG(vnpath, vp, ARG_VNODE2); + } + + /* + * If you created (whether you opened or not), cut a lookup tracepoint + * for the parent dir (as would happen without a compound vnop). Note: we may need + * a vnode despite failure in this case! + * + * If you did not create: + * Found child (succeeded or not): cut a tracepoint for the child. + * Did not find child: cut a tracepoint with the parent. + */ + if (kdebug_enable) { + kdebug_lookup(vp ? vp : dvp, &ndp->ni_cnd); + } +} diff --git a/bsd/vfs/vfs_subr.c b/bsd/vfs/vfs_subr.c index 3b10114cb..462fbef79 100644 --- a/bsd/vfs/vfs_subr.c +++ b/bsd/vfs/vfs_subr.c @@ -118,12 +118,17 @@ #include <mach/mach_types.h> #include <mach/memory_object_types.h> +#include <mach/memory_object_control.h> #include <kern/kalloc.h> /* kalloc()/kfree() */ #include <kern/clock.h> /* delay_for_interval() */ #include <libkern/OSAtomic.h> /* OSAddAtomic() */ +#ifdef JOE_DEBUG +#include <libkern/OSDebug.h> +#endif + #include <vm/vm_protos.h> /* vnode_pager_vrele() */ #if CONFIG_MACF @@ -133,6 +138,10 @@ extern lck_grp_t *vnode_lck_grp; extern lck_attr_t *vnode_lck_attr; +#if CONFIG_TRIGGERS +extern lck_grp_t *trigger_vnode_lck_grp; +extern lck_attr_t *trigger_vnode_lck_attr; +#endif extern lck_mtx_t * mnt_list_mtx_lock; @@ -145,6 +154,16 @@ int vttoif_tab[9] = { S_IFSOCK, S_IFIFO, S_IFMT, }; + +/* XXX These should be in a BSD accessible Mach header, but aren't. 
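
kdebug_lookup(), referenced above, logs paths through fixed-size trace records, which is where the '>' padding described in the preceding comment comes from. A sketch of the chunk-and-pad idea — the record and scratch sizes here are illustrative only, and the kernel's choice of which end of an over-long path to keep differs:

    #include <stdio.h>
    #include <string.h>

    #define SCRATCH_BYTES 64   /* stand-in for the kernel's dbg_parms area */
    #define CHUNK_BYTES   16   /* four 32-bit trace arguments per record */

    static void
    emit_path_chunks(const char *path)
    {
        char buf[SCRATCH_BYTES];
        size_t i, len = strlen(path);

        /* Pre-fill with '>' so a short path shows up padded, as described. */
        memset(buf, '>', sizeof(buf));
        if (len > sizeof(buf))
            len = sizeof(buf);  /* keep the head; the kernel differs here */
        memcpy(buf, path, len);

        for (i = 0; i < sizeof(buf); i += CHUNK_BYTES)
            printf("trace record: %.*s\n", CHUNK_BYTES, buf + i);
    }
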
 */
+extern void memory_object_mark_used(
+	memory_object_control_t		control);
+
+extern void memory_object_mark_unused(
+	memory_object_control_t		control,
+	boolean_t			rage);
+
+
 /* XXX next protptype should be from <nfs/nfs.h> */
 extern int nfs_vinvalbuf(vnode_t, int, vfs_context_t, int);
 
@@ -173,7 +192,6 @@ static void vclean(vnode_t vp, int flag);
 static void vnode_reclaim_internal(vnode_t, int, int, int);
 
 static void vnode_dropiocount (vnode_t);
-static errno_t vnode_getiocount(vnode_t vp, unsigned int vid, int vflags);
 
 static vnode_t checkalias(vnode_t vp, dev_t nvp_rdev);
 static int  vnode_reload(vnode_t);
@@ -188,6 +206,9 @@ static int vnode_iterate_prepare(mount_t);
 static int vnode_iterate_reloadq(mount_t);
 static void vnode_iterate_clear(mount_t);
 static mount_t vfs_getvfs_locked(fsid_t *);
+static int vn_create_reg(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp,
+		struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx);
+static int vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx);
 
 errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
 
@@ -195,6 +216,11 @@ errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *);
 static void record_vp(vnode_t vp, int count);
 #endif
 
+#if CONFIG_TRIGGERS
+static int vnode_resolver_create(mount_t, vnode_t, struct vnode_trigger_param *, boolean_t external);
+static void vnode_resolver_detach(vnode_t);
+#endif
+
 TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */
 TAILQ_HEAD(deadlst, vnode) vnode_dead_list;	/* vnode dead list */
@@ -370,29 +396,27 @@
 void
 vnode_writedone(vnode_t vp)
 {
 	if (vp) {
-		OSAddAtomic(-1, &vp->v_numoutput);
+		int need_wakeup = 0;
 
-		if (vp->v_numoutput <= 1) {
-			int need_wakeup = 0;
+		OSAddAtomic(-1, &vp->v_numoutput);
 
-			vnode_lock_spin(vp);
+		vnode_lock_spin(vp);
 
-			if (vp->v_numoutput < 0)
-				panic("vnode_writedone: numoutput < 0");
+		if (vp->v_numoutput < 0)
+			panic("vnode_writedone: numoutput < 0");
 
-			if ((vp->v_flag & VTHROTTLED) && (vp->v_numoutput <= 1)) {
-				vp->v_flag &= ~VTHROTTLED;
-				need_wakeup = 1;
-			}
-			if ((vp->v_flag & VBWAIT) && (vp->v_numoutput == 0)) {
-				vp->v_flag &= ~VBWAIT;
-				need_wakeup = 1;
-			}
-			vnode_unlock(vp);
-
-			if (need_wakeup)
-				wakeup((caddr_t)&vp->v_numoutput);
+		if ((vp->v_flag & VTHROTTLED)) {
+			vp->v_flag &= ~VTHROTTLED;
+			need_wakeup = 1;
+		}
+		if ((vp->v_flag & VBWAIT) && (vp->v_numoutput == 0)) {
+			vp->v_flag &= ~VBWAIT;
+			need_wakeup = 1;
 		}
+		vnode_unlock(vp);
+
+		if (need_wakeup)
+			wakeup((caddr_t)&vp->v_numoutput);
	}
 }
 
@@ -781,6 +805,13 @@ mount_refdrain(mount_t mp)
 	return(0);
 }
 
+/* Tags the mount point as not supporting extended readdir for NFS exports */
+void
+mount_set_noreaddirext(mount_t mp) {
+	mount_lock (mp);
+	mp->mnt_kern_flag |= MNTK_DENY_READDIREXT;
+	mount_unlock (mp);
+}
 
 /*
 * Mark a mount point as busy.
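
The vnode_writedone() rework above drops the old unlocked peek at v_numoutput: the flags are now tested only while holding the vnode lock, closing the window in which a waiter could check the counter between the decrement and the wakeup. A portable sketch of the same shape, using a pthread mutex in place of the vnode spin lock:

    #include <pthread.h>

    struct outq {
        pthread_mutex_t lock;
        pthread_cond_t  quiet;
        int             numoutput;
        int             throttled;  /* analogue of VTHROTTLED/VBWAIT */
    };

    static void
    outq_io_done(struct outq *q)
    {
        int need_wakeup = 0;

        pthread_mutex_lock(&q->lock);
        q->numoutput--;         /* the kernel decrements atomically first;
                                 * here the decrement simply moves under
                                 * the same lock as the flag test */
        if (q->throttled) {
            q->throttled = 0;
            need_wakeup = 1;
        }
        pthread_mutex_unlock(&q->lock);

        if (need_wakeup)
            pthread_cond_broadcast(&q->quiet);
    }
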
Used to synchronize access and to delay @@ -887,6 +918,8 @@ vfs_rootmountalloc_internal(struct vfstable *vfsp, const char *devname) mp->mnt_ioflags = 0; mp->mnt_realrootvp = NULLVP; mp->mnt_authcache_ttl = CACHED_LOOKUP_RIGHT_TTL; + mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1; + mp->mnt_devbsdunit = 0; mount_lock_init(mp); (void)vfs_busy(mp, LK_NOWAIT); @@ -1120,6 +1153,8 @@ vfs_getvfs_by_mntonname(char *path) if (!strncmp(mp->mnt_vfsstat.f_mntonname, path, sizeof(mp->mnt_vfsstat.f_mntonname))) { retmp = mp; + if (mount_iterref(retmp, 1)) + retmp = NULL; goto out; } } @@ -1358,6 +1393,7 @@ found_alias: nvp->v_rdev = nvp_rdev; nvp->v_specflags = 0; nvp->v_speclastr = -1; + nvp->v_specinfo->si_opencount = 0; SPECHASH_LOCK(); @@ -1416,22 +1452,16 @@ int vget_internal(vnode_t vp, int vid, int vflags) { int error = 0; - int vpid; vnode_lock_spin(vp); - if (vflags & VNODE_WITHID) - vpid = vid; - else - vpid = vp->v_id; // save off the original v_id - if ((vflags & VNODE_WRITEABLE) && (vp->v_writecount == 0)) /* * vnode to be returned only if it has writers opened */ error = EINVAL; else - error = vnode_getiocount(vp, vpid, vflags); + error = vnode_getiocount(vp, vid, vflags); vnode_unlock(vp); @@ -1446,7 +1476,7 @@ int vnode_ref(vnode_t vp) { - return (vnode_ref_ext(vp, 0)); + return (vnode_ref_ext(vp, 0, 0)); } /* @@ -1454,7 +1484,7 @@ vnode_ref(vnode_t vp) * ENOENT No such file or directory [terminating] */ int -vnode_ref_ext(vnode_t vp, int fmode) +vnode_ref_ext(vnode_t vp, int fmode, int flags) { int error = 0; @@ -1471,10 +1501,12 @@ vnode_ref_ext(vnode_t vp, int fmode) /* * if you are the owner of drain/termination, can acquire usecount */ - if ((vp->v_lflag & (VL_DRAIN | VL_TERMINATE | VL_DEAD))) { - if (vp->v_owner != current_thread()) { - error = ENOENT; - goto out; + if ((flags & VNODE_REF_FORCE) == 0) { + if ((vp->v_lflag & (VL_DRAIN | VL_TERMINATE | VL_DEAD))) { + if (vp->v_owner != current_thread()) { + error = ENOENT; + goto out; + } } } vp->v_usecount++; @@ -1507,6 +1539,13 @@ vnode_ref_ext(vnode_t vp, int fmode) vnode_list_remove(vp); } } + if (vp->v_usecount == 1 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) { + + if (vp->v_ubcinfo) { + vnode_lock_convert(vp); + memory_object_mark_used(vp->v_ubcinfo->ui_control); + } + } out: vnode_unlock(vp); @@ -1659,6 +1698,7 @@ vnode_rele_ext(vnode_t vp, int fmode, int dont_reenter) void vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked) { + if ( !locked) vnode_lock_spin(vp); #if DIAGNOSTIC @@ -1689,9 +1729,7 @@ vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked) vp->v_lflag |= VL_NEEDINACTIVE; vp->v_flag &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT); } - if ( !locked) - vnode_unlock(vp); - return; + goto done; } vp->v_flag &= ~(VNOCACHE_DATA | VRAOFF | VOPENEVT); @@ -1709,9 +1747,8 @@ vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked) vp->v_flag |= VAGE; } vnode_list_add(vp); - if ( !locked) - vnode_unlock(vp); - return; + + goto done; } /* * at this point both the iocount and usecount @@ -1746,15 +1783,22 @@ vnode_rele_internal(vnode_t vp, int fmode, int dont_reenter, int locked) if (ut->uu_defer_reclaims) { vp->v_defer_reclaimlist = ut->uu_vreclaims; - ut->uu_vreclaims = vp; - goto defer_reclaim; + ut->uu_vreclaims = vp; + goto done; } vnode_lock_convert(vp); vnode_reclaim_internal(vp, 1, 1, 0); } vnode_dropiocount(vp); vnode_list_add(vp); -defer_reclaim: +done: + if (vp->v_usecount == 0 && vp->v_type == VREG && !(vp->v_flag & VSYSTEM)) { + + if (vp->v_ubcinfo) { + 
vnode_lock_convert(vp); + memory_object_mark_unused(vp->v_ubcinfo->ui_control, (vp->v_flag & VRAGE) == VRAGE); + } + } if ( !locked) vnode_unlock(vp); return; @@ -2020,13 +2064,13 @@ vclean(vnode_t vp, int flags) #endif { VNOP_FSYNC(vp, MNT_WAIT, ctx); - buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0); + buf_invalidateblks(vp, BUF_WRITE_DATA | BUF_INVALIDATE_LOCKED, 0, 0); } if (UBCINFOEXISTS(vp)) /* * Clean the pages in VM. */ - (void)ubc_sync_range(vp, (off_t)0, ubc_getsize(vp), UBC_PUSHALL); + (void)ubc_msync(vp, (off_t)0, ubc_getsize(vp), NULL, UBC_PUSHALL | UBC_INVALIDATE | UBC_SYNC); } if (active || need_inactive) VNOP_INACTIVE(vp, ctx); @@ -2039,18 +2083,7 @@ vclean(vnode_t vp, int flags) if (vnode_isshadow(vp)) { vnode_relenamedstream(pvp, vp, ctx); } - - /* - * Because vclean calls VNOP_INACTIVE prior to calling vnode_relenamedstream, we may not have - * torn down and/or deleted the shadow file yet. On HFS, if the shadow file is sufficiently large - * and occupies a large number of extents, the deletion will be deferred until VNOP_INACTIVE - * and the file treated like an open-unlinked. To rectify this, call VNOP_INACTIVE again - * explicitly to force its removal. - */ - if (vnode_isshadow(vp)) { - VNOP_INACTIVE(vp, ctx); - } - + /* * No more streams associated with the parent. We * have a ref on it, so its identity is stable. @@ -2072,6 +2105,14 @@ vclean(vnode_t vp, int flags) */ ubc_destroy_named(vp); +#if CONFIG_TRIGGERS + /* + * cleanup trigger info from vnode (if any) + */ + if (vp->v_resolve) + vnode_resolver_detach(vp); +#endif + /* * Reclaim the vnode. */ @@ -2301,7 +2342,7 @@ vcount(vnode_t vp) loop: if (!vnode_isaliased(vp)) - return (vp->v_usecount - vp->v_kusecount); + return (vp->v_specinfo->si_opencount); count = 0; SPECHASH_LOCK(); @@ -2332,7 +2373,7 @@ loop: vnode_unlock(vq); goto loop; } - count += (vq->v_usecount - vq->v_kusecount); + count += vq->v_specinfo->si_opencount; } vnode_unlock(vq); @@ -2710,7 +2751,7 @@ sysctl_vnode } SYSCTL_PROC(_kern, KERN_VNODE, vnode, - CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MASKED, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED, 0, 0, sysctl_vnode, "S,", ""); @@ -2804,6 +2845,8 @@ vnode_pager_vrele(vnode_t vp) #include <sys/disk.h> +u_int32_t rootunit = (u_int32_t)-1; + errno_t vfs_init_io_attributes(vnode_t devvp, mount_t mp) { @@ -2824,24 +2867,25 @@ vfs_init_io_attributes(vnode_t devvp, mount_t mp) vfs_context_t ctx = vfs_context_current(); int isssd = 0; int isvirtual = 0; + + + VNOP_IOCTL(devvp, DKIOCGETTHROTTLEMASK, (caddr_t)&mp->mnt_throttle_mask, 0, NULL); /* - * determine if this mount point exists on the same device as the root - * partition... if so, then it comes under the hard throttle control + * as a reasonable approximation, only use the lowest bit of the mask + * to generate a disk unit number */ - int thisunit = -1; - static int rootunit = -1; + mp->mnt_devbsdunit = num_trailing_0(mp->mnt_throttle_mask); - if (rootunit == -1) { - if (VNOP_IOCTL(rootvp, DKIOCGETBSDUNIT, (caddr_t)&rootunit, 0, ctx)) - rootunit = -1; - else if (rootvp == devvp) - mp->mnt_kern_flag |= MNTK_ROOTDEV; - } - if (devvp != rootvp && rootunit != -1) { - if (VNOP_IOCTL(devvp, DKIOCGETBSDUNIT, (caddr_t)&thisunit, 0, ctx) == 0) { - if (thisunit == rootunit) - mp->mnt_kern_flag |= MNTK_ROOTDEV; - } + if (devvp == rootvp) + rootunit = mp->mnt_devbsdunit; + + if (mp->mnt_devbsdunit == rootunit) { + /* + * this mount point exists on the same device as the root + * partition, so it comes under the hard throttle control... 
+ * this is true even for the root mount point itself + */ + mp->mnt_kern_flag |= MNTK_ROOTDEV; } /* * force the spec device to re-cache @@ -2875,7 +2919,6 @@ vfs_init_io_attributes(vnode_t devvp, mount_t mp) if (isssd) mp->mnt_kern_flag |= MNTK_SSD; } - if ((error = VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&features, 0, ctx))) return (error); @@ -3253,7 +3296,11 @@ sysctl_vfs_ctlbyfsid(__unused struct sysctl_oid *oidp, void *arg1, int arg2, sfs.f_fsid = sp->f_fsid; sfs.f_owner = sp->f_owner; - strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN); + if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) { + strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN); + } else { + strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN); + } strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN); strlcpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN); @@ -3307,7 +3354,11 @@ sysctl_vfs_ctlbyfsid(__unused struct sysctl_oid *oidp, void *arg1, int arg2, sfs.f_fsid = sp->f_fsid; sfs.f_owner = sp->f_owner; - strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN); + if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) { + strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN); + } else { + strlcpy(sfs.f_fstypename, sp->f_fstypename, MFSNAMELEN); + } strlcpy(sfs.f_mntonname, sp->f_mntonname, MNAMELEN); strlcpy(sfs.f_mntfromname, sp->f_mntfromname, MNAMELEN); @@ -3414,14 +3465,14 @@ sysctl_vfs_noremotehang(__unused struct sysctl_oid *oidp, } /* the vfs.generic. branch. */ -SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RW|CTLFLAG_LOCKED, NULL, "vfs generic hinge"); +SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RW | CTLFLAG_LOCKED, NULL, "vfs generic hinge"); /* retreive a list of mounted filesystem fsid_t */ -SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist, CTLFLAG_RD, +SYSCTL_PROC(_vfs_generic, OID_AUTO, vfsidlist, CTLFLAG_RD | CTLFLAG_LOCKED, NULL, 0, sysctl_vfs_vfslist, "S,fsid", "List of mounted filesystem ids"); /* perform operations on filesystem via fsid_t */ -SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW|CTLFLAG_LOCKED, +SYSCTL_NODE(_vfs_generic, OID_AUTO, ctlbyfsid, CTLFLAG_RW | CTLFLAG_LOCKED, sysctl_vfs_ctlbyfsid, "ctlbyfsid"); -SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW|CTLFLAG_ANYBODY, +SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW | CTLFLAG_ANYBODY, NULL, 0, sysctl_vfs_noremotehang, "I", "noremotehang"); @@ -3448,18 +3499,18 @@ retry: vnode_list_lock(); - if ( !TAILQ_EMPTY(&vnode_dead_list)) { - /* - * Can always reuse a dead one + if ((numvnodes - deadvnodes) < desiredvnodes || force_alloc) { + if ( !TAILQ_EMPTY(&vnode_dead_list)) { + /* + * Can always reuse a dead one + */ + vp = TAILQ_FIRST(&vnode_dead_list); + goto steal_this_vp; + } + /* + * no dead vnodes available... if we're under + * the limit, we'll create a new vnode */ - vp = TAILQ_FIRST(&vnode_dead_list); - goto steal_this_vp; - } - /* - * no dead vnodes available... if we're under - * the limit, we'll create a new vnode - */ - if (numvnodes < desiredvnodes || force_alloc) { numvnodes++; vnode_list_unlock(); @@ -3493,17 +3544,22 @@ retry: panic("new_vnode: vp (%p) on RAGE list not marked VLIST_RAGE", vp); // if we're a dependency-capable process, skip vnodes that can - // cause recycling deadlocks. (i.e. this process is diskimages - // helper and the vnode is in a disk image). - // - if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL || vp->v_mount->mnt_dependent_process == NULL) { - break; + // cause recycling deadlocks. (i.e. 
this process is diskimages + // helper and the vnode is in a disk image). Querying the + // mnt_kern_flag for the mount's virtual device status + // is safer than checking the mnt_dependent_process, which + // may not be updated if there are multiple devnode layers + // in between the disk image and the final consumer. + + if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL || + (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) { + break; } // don't iterate more than MAX_WALK_COUNT vnodes to // avoid keeping the vnode list lock held for too long. if (walk_count++ > MAX_WALK_COUNT) { - vp = NULL; + vp = NULL; break; } } @@ -3516,12 +3572,18 @@ retry: */ walk_count = 0; TAILQ_FOREACH(vp, &vnode_free_list, v_freelist) { - // if we're a dependency-capable process, skip vnodes that can - // cause recycling deadlocks. (i.e. this process is diskimages - // helper and the vnode is in a disk image) - // - if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL || vp->v_mount->mnt_dependent_process == NULL) { - break; + + // if we're a dependency-capable process, skip vnodes that can + // cause recycling deadlocks. (i.e. this process is diskimages + // helper and the vnode is in a disk image). Querying the + // mnt_kern_flag for the mount's virtual device status + // is safer than checking the mnt_dependent_process, which + // may not be updated if there are multiple devnode layers + // in between the disk image and the final consumer. + + if ((curproc->p_flag & P_DEPENDENCY_CAPABLE) == 0 || vp->v_mount == NULL || + (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) == 0) { + break; } // don't iterate more than MAX_WALK_COUNT vnodes to @@ -3572,7 +3634,7 @@ retry: * Running out of vnodes tends to make a system unusable. Start killing * processes that jetsam knows are killable. */ - if (jetsam_kill_top_proc() < 0) { + if (jetsam_kill_top_proc(TRUE, kJetsamFlagsKilledVnodes) < 0) { /* * If jetsam can't find any more processes to kill and there * still aren't any free vnodes, panic. Hopefully we'll get a @@ -3754,10 +3816,27 @@ vnode_get_locked(struct vnode *vp) return (0); } +/* + * vnode_getwithvid() cuts in line in front of a vnode drain (that is, + * while the vnode is draining, but at no point after that) to prevent + * deadlocks when getting vnodes from filesystem hashes while holding + * resources that may prevent other iocounts from being released. + */ int vnode_getwithvid(vnode_t vp, uint32_t vid) { - return(vget_internal(vp, vid, ( VNODE_NODEAD| VNODE_WITHID))); + return(vget_internal(vp, vid, ( VNODE_NODEAD | VNODE_WITHID | VNODE_DRAINO ))); +} + +/* + * vnode_getwithvid_drainok() is like vnode_getwithvid(), but *does* block behind a vnode + * drain; it exists for use in the VFS name cache, where we really do want to block behind + * vnode drain to prevent holding off an unmount. 
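
VNODE_DRAINO, introduced above, lets selected callers take an iocount even while the vnode is draining, because they may already hold resources the drain is waiting on. The gate in isolation, as a user-space sketch (names invented, not kernel API):

    #include <pthread.h>

    struct obj {
        pthread_mutex_t lock;
        pthread_cond_t  cv;
        int             refs;
        int             draining;
    };

    /* Always succeeds in this sketch; 'beat_drain' mirrors VNODE_DRAINO. */
    static int
    obj_ref(struct obj *o, int beat_drain)
    {
        pthread_mutex_lock(&o->lock);
        while (o->draining && !beat_drain)
            pthread_cond_wait(&o->cv, &o->lock);
        o->refs++;
        pthread_mutex_unlock(&o->lock);
        return 0;
    }
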
+ */
+int
+vnode_getwithvid_drainok(vnode_t vp, uint32_t vid)
+{
+	return(vget_internal(vp, vid, ( VNODE_NODEAD | VNODE_WITHID )));
+}
 
 int
@@ -3801,7 +3880,7 @@ retry:
 			vnode_dropiocount(vp);
 			return(0);
 		}
-		if ((vp->v_lflag & (VL_MARKTERM | VL_TERMINATE | VL_DEAD | VL_NEEDINACTIVE)) == VL_NEEDINACTIVE) {
+		if ((vp->v_lflag & (VL_DEAD | VL_NEEDINACTIVE)) == VL_NEEDINACTIVE) {
 
 			vp->v_lflag &= ~VL_NEEDINACTIVE;
 			vnode_unlock(vp);
@@ -3914,7 +3993,7 @@ vnode_drain(vnode_t vp)
 {
 	if (vp->v_lflag & VL_DRAIN) {
-		panic("vnode_drain: recursuve drain");
+		panic("vnode_drain: recursive drain");
 		return(ENOENT);
 	}
 	vp->v_lflag |= VL_DRAIN;
@@ -3922,13 +4001,16 @@ vnode_drain(vnode_t vp)
 
 	while (vp->v_iocount > 1)
 		msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_drain", NULL);
+
+	vp->v_lflag &= ~VL_DRAIN;
+
 	return(0);
 }
 
 
 /*
  * if the number of recent references via vnode_getwithvid or vnode_getwithref
- * exceeds this threshhold, than 'UN-AGE' the vnode by removing it from
+ * exceeds this threshold, then 'UN-AGE' the vnode by removing it from
  * the LRU list if it's currently on it... once the iocount and usecount both drop
  * to 0, it will get put back on the end of the list, effectively making it younger
  * this allows us to keep actively referenced vnodes in the list without having
 * to constantly remove and add to the list each time a vnode w/o a usecount is
 * processed...
 */
 #define UNAGE_THRESHHOLD	25
 
-static	errno_t
+errno_t
 vnode_getiocount(vnode_t vp, unsigned int vid, int vflags)
 {
 	int nodead = vflags & VNODE_NODEAD;
 	int nosusp = vflags & VNODE_NOSUSPEND;
 	int always = vflags & VNODE_ALWAYS;
+	int beatdrain = vflags & VNODE_DRAINO;
 
 	for (;;) {
 		/*
@@ -3974,6 +4057,18 @@ vnode_getiocount(vnode_t vp, unsigned int vid, int vflags)
 		if (always != 0)
 			break;
+
+		/*
+		 * In some situations, we want to get an iocount
+		 * even if the vnode is draining to prevent deadlock,
+		 * e.g. if we're in the filesystem, potentially holding
+		 * resources that could prevent other iocounts from
+		 * being released.
+		 */
+		if (beatdrain && (vp->v_lflag & VL_DRAIN)) {
+			break;
+		}
+
 		vnode_lock_convert(vp);
 
 		if (vp->v_lflag & VL_TERMINATE) {
@@ -3983,7 +4078,7 @@ vnode_getiocount(vnode_t vp, unsigned int vid, int vflags)
 		} else
 			msleep(&vp->v_iocount, &vp->v_lock, PVFS, "vnode_getiocount", NULL);
 	}
-	if (vid != vp->v_id) {
+	if (((vflags & VNODE_WITHID) != 0) && vid != vp->v_id) {
 		return(ENOENT);
 	}
 	if (++vp->v_references >= UNAGE_THRESHHOLD) {
@@ -4087,7 +4182,6 @@ vnode_reclaim_internal(struct vnode * vp, int locked, int reuse, int flags)
 		vp->v_socket = NULL;
 
 	vp->v_lflag &= ~VL_TERMINATE;
-	vp->v_lflag &= ~VL_DRAIN;
 	vp->v_owner = NULL;
 
 	KNOTE(&vp->v_knotes, NOTE_REVOKE);
@@ -4114,7 +4208,6 @@
 * The following api creates a vnode and associates all the parameter specified in vnode_fsparam
 * structure and returns a vnode handle with a reference. device aliasing is handled here so checkalias
 * is obsoleted by this.
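
The vid test retained above — now applied only when VNODE_WITHID is passed — is a generation-counter guard: a recycled vnode keeps its address but bumps v_id, so stale callers get ENOENT instead of a pointer to someone else's file. The bare pattern (types invented):

    #include <errno.h>
    #include <stdint.h>

    struct slot {
        uint32_t gen;       /* bumped every time the slot is reused */
        void    *payload;
    };

    struct handle {
        struct slot *s;
        uint32_t     gen;   /* generation observed when handle was made */
    };

    static int
    handle_get(struct handle *h, void **out)
    {
        if (h->gen != h->s->gen)
            return ENOENT;  /* slot was recycled behind our back */
        *out = h->s->payload;
        return 0;
    }
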
- * vnode_create(int flavor, size_t size, void * param, vnode_t *vp) */ int vnode_create(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp) @@ -4127,159 +4220,210 @@ vnode_create(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp) struct uthread *ut; struct componentname *cnp; struct vnode_fsparam *param = (struct vnode_fsparam *)data; - - if (flavor == VNCREATE_FLAVOR && (size == VCREATESIZE) && param) { - if ( (error = new_vnode(&vp)) ) { - return(error); - } else { - dvp = param->vnfs_dvp; - cnp = param->vnfs_cnp; +#if CONFIG_TRIGGERS + struct vnode_trigger_param *tinfo = NULL; +#endif + if (param == NULL) + return (EINVAL); - vp->v_op = param->vnfs_vops; - vp->v_type = param->vnfs_vtype; - vp->v_data = param->vnfs_fsnode; +#if CONFIG_TRIGGERS + if ((flavor == VNCREATE_TRIGGER) && (size == VNCREATE_TRIGGER_SIZE)) { + tinfo = (struct vnode_trigger_param *)data; + + /* Validate trigger vnode input */ + if ((param->vnfs_vtype != VDIR) || + (tinfo->vnt_resolve_func == NULL) || + (tinfo->vnt_flags & ~VNT_VALID_MASK)) { + return (EINVAL); + } + /* Fall through a normal create (params will be the same) */ + flavor = VNCREATE_FLAVOR; + size = VCREATESIZE; + } +#endif + if ((flavor != VNCREATE_FLAVOR) || (size != VCREATESIZE)) + return (EINVAL); + + if ( (error = new_vnode(&vp)) ) + return(error); - if (param->vnfs_markroot) - vp->v_flag |= VROOT; - if (param->vnfs_marksystem) - vp->v_flag |= VSYSTEM; - if (vp->v_type == VREG) { - error = ubc_info_init_withsize(vp, param->vnfs_filesize); - if (error) { + dvp = param->vnfs_dvp; + cnp = param->vnfs_cnp; + + vp->v_op = param->vnfs_vops; + vp->v_type = param->vnfs_vtype; + vp->v_data = param->vnfs_fsnode; + + if (param->vnfs_markroot) + vp->v_flag |= VROOT; + if (param->vnfs_marksystem) + vp->v_flag |= VSYSTEM; + if (vp->v_type == VREG) { + error = ubc_info_init_withsize(vp, param->vnfs_filesize); + if (error) { #ifdef JOE_DEBUG - record_vp(vp, 1); + record_vp(vp, 1); #endif - vp->v_mount = NULL; - vp->v_op = dead_vnodeop_p; - vp->v_tag = VT_NON; - vp->v_data = NULL; - vp->v_type = VBAD; - vp->v_lflag |= VL_DEAD; - - vnode_put(vp); - return(error); - } - } + vp->v_mount = NULL; + vp->v_op = dead_vnodeop_p; + vp->v_tag = VT_NON; + vp->v_data = NULL; + vp->v_type = VBAD; + vp->v_lflag |= VL_DEAD; + + vnode_put(vp); + return(error); + } + } +#ifdef JOE_DEBUG + record_vp(vp, 1); +#endif + +#if CONFIG_TRIGGERS + /* + * For trigger vnodes, attach trigger info to vnode + */ + if ((vp->v_type == VDIR) && (tinfo != NULL)) { + /* + * Note: has a side effect of incrementing trigger count on the + * mount if successful, which we would need to undo on a + * subsequent failure. 
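
The failure path above unwinds a partially constructed vnode by re-marking it dead and dropping its iocount. The same create-then-attach-with-undo shape in miniature (all names invented; the attach stub fails purely for illustration):

    #include <errno.h>
    #include <stdlib.h>

    struct widget {
        int   dead;
        void *resolver;
    };

    /* Stand-in for vnode_resolver_create(); made to fail for illustration. */
    static int
    resolver_attach(struct widget *w)
    {
        (void)w;
        return EINVAL;
    }

    static int
    widget_create(struct widget **out, int want_resolver)
    {
        struct widget *w = calloc(1, sizeof(*w));
        int error;

        if (w == NULL)
            return ENOMEM;
        if (want_resolver && (error = resolver_attach(w)) != 0) {
            w->dead = 1;    /* neutralize the half-built object first, as
                             * vnode_create() marks the vnode VBAD/VL_DEAD */
            free(w);        /* then release it (vnode_put() in the kernel) */
            return error;
        }
        *out = w;
        return 0;
    }
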
+ */ +#ifdef JOE_DEBUG + record_vp(vp, -1); +#endif + error = vnode_resolver_create(param->vnfs_mp, vp, tinfo, FALSE); + if (error) { + printf("vnode_create: vnode_resolver_create() err %d\n", error); + vp->v_mount = NULL; + vp->v_op = dead_vnodeop_p; + vp->v_tag = VT_NON; + vp->v_data = NULL; + vp->v_type = VBAD; + vp->v_lflag |= VL_DEAD; #ifdef JOE_DEBUG record_vp(vp, 1); #endif - if (vp->v_type == VCHR || vp->v_type == VBLK) { - - vp->v_tag = VT_DEVFS; /* callers will reset if needed (bdevvp) */ - - if ( (nvp = checkalias(vp, param->vnfs_rdev)) ) { - /* - * if checkalias returns a vnode, it will be locked - * - * first get rid of the unneeded vnode we acquired - */ - vp->v_data = NULL; - vp->v_op = spec_vnodeop_p; - vp->v_type = VBAD; - vp->v_lflag = VL_DEAD; - vp->v_data = NULL; - vp->v_tag = VT_NON; - vnode_put(vp); + vnode_put(vp); + return (error); + } + } +#endif + if (vp->v_type == VCHR || vp->v_type == VBLK) { - /* - * switch to aliased vnode and finish - * preparing it - */ - vp = nvp; - - vclean(vp, 0); - vp->v_op = param->vnfs_vops; - vp->v_type = param->vnfs_vtype; - vp->v_data = param->vnfs_fsnode; - vp->v_lflag = 0; - vp->v_mount = NULL; - insmntque(vp, param->vnfs_mp); - insert = 0; - vnode_unlock(vp); - } - } + vp->v_tag = VT_DEVFS; /* callers will reset if needed (bdevvp) */ - if (vp->v_type == VFIFO) { - struct fifoinfo *fip; + if ( (nvp = checkalias(vp, param->vnfs_rdev)) ) { + /* + * if checkalias returns a vnode, it will be locked + * + * first get rid of the unneeded vnode we acquired + */ + vp->v_data = NULL; + vp->v_op = spec_vnodeop_p; + vp->v_type = VBAD; + vp->v_lflag = VL_DEAD; + vp->v_data = NULL; + vp->v_tag = VT_NON; + vnode_put(vp); - MALLOC(fip, struct fifoinfo *, - sizeof(*fip), M_TEMP, M_WAITOK); - bzero(fip, sizeof(struct fifoinfo )); - vp->v_fifoinfo = fip; - } - /* The file systems must pass the address of the location where - * they store the vnode pointer. When we add the vnode into the mount - * list and name cache they become discoverable. So the file system node - * must have the connection to vnode setup by then + /* + * switch to aliased vnode and finish + * preparing it */ - *vpp = vp; + vp = nvp; - /* Add fs named reference. */ - if (param->vnfs_flags & VNFS_ADDFSREF) { - vp->v_lflag |= VNAMED_FSHASH; - } - if (param->vnfs_mp) { - if (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL) - vp->v_flag |= VLOCKLOCAL; - if (insert) { - if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) - panic("insmntque: vp on the free list\n"); - /* - * enter in mount vnode list - */ - insmntque(vp, param->vnfs_mp); - } + vclean(vp, 0); + vp->v_op = param->vnfs_vops; + vp->v_type = param->vnfs_vtype; + vp->v_data = param->vnfs_fsnode; + vp->v_lflag = 0; + vp->v_mount = NULL; + insmntque(vp, param->vnfs_mp); + insert = 0; + vnode_unlock(vp); + } + } + + if (vp->v_type == VFIFO) { + struct fifoinfo *fip; + + MALLOC(fip, struct fifoinfo *, + sizeof(*fip), M_TEMP, M_WAITOK); + bzero(fip, sizeof(struct fifoinfo )); + vp->v_fifoinfo = fip; + } + /* The file systems must pass the address of the location where + * they store the vnode pointer. When we add the vnode into the mount + * list and name cache they become discoverable. So the file system node + * must have the connection to vnode setup by then + */ + *vpp = vp; + + /* Add fs named reference. 
*/ + if (param->vnfs_flags & VNFS_ADDFSREF) { + vp->v_lflag |= VNAMED_FSHASH; + } + if (param->vnfs_mp) { + if (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL) + vp->v_flag |= VLOCKLOCAL; + if (insert) { + if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) + panic("insmntque: vp on the free list\n"); + + /* + * enter in mount vnode list + */ + insmntque(vp, param->vnfs_mp); + } #ifndef __LP64__ - if ((param->vnfs_mp->mnt_vtable->vfc_vfsflags & VFC_VFSTHREADSAFE) == 0) { - MALLOC_ZONE(vp->v_unsafefs, struct unsafe_fsnode *, - sizeof(struct unsafe_fsnode), M_UNSAFEFS, M_WAITOK); - vp->v_unsafefs->fsnode_count = 0; - vp->v_unsafefs->fsnodeowner = (void *)NULL; - lck_mtx_init(&vp->v_unsafefs->fsnodelock, vnode_lck_grp, vnode_lck_attr); - } + if ((param->vnfs_mp->mnt_vtable->vfc_vfsflags & VFC_VFSTHREADSAFE) == 0) { + MALLOC_ZONE(vp->v_unsafefs, struct unsafe_fsnode *, + sizeof(struct unsafe_fsnode), M_UNSAFEFS, M_WAITOK); + vp->v_unsafefs->fsnode_count = 0; + vp->v_unsafefs->fsnodeowner = (void *)NULL; + lck_mtx_init(&vp->v_unsafefs->fsnodelock, vnode_lck_grp, vnode_lck_attr); + } #endif /* __LP64__ */ - } - if (dvp && vnode_ref(dvp) == 0) { - vp->v_parent = dvp; - } - if (cnp) { - if (dvp && ((param->vnfs_flags & (VNFS_NOCACHE | VNFS_CANTCACHE)) == 0)) { - /* - * enter into name cache - * we've got the info to enter it into the name cache now - * cache_enter_create will pick up an extra reference on - * the name entered into the string cache - */ - vp->v_name = cache_enter_create(dvp, vp, cnp); - } else - vp->v_name = vfs_addname(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0); + } + if (dvp && vnode_ref(dvp) == 0) { + vp->v_parent = dvp; + } + if (cnp) { + if (dvp && ((param->vnfs_flags & (VNFS_NOCACHE | VNFS_CANTCACHE)) == 0)) { + /* + * enter into name cache + * we've got the info to enter it into the name cache now + * cache_enter_create will pick up an extra reference on + * the name entered into the string cache + */ + vp->v_name = cache_enter_create(dvp, vp, cnp); + } else + vp->v_name = vfs_addname(cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_hash, 0); - if ((cnp->cn_flags & UNIONCREATED) == UNIONCREATED) - vp->v_flag |= VISUNION; - } - if ((param->vnfs_flags & VNFS_CANTCACHE) == 0) { - /* - * this vnode is being created as cacheable in the name cache - * this allows us to re-enter it in the cache - */ - vp->v_flag |= VNCACHEABLE; - } - ut = get_bsdthread_info(current_thread()); - - if ((current_proc()->p_lflag & P_LRAGE_VNODES) || - (ut->uu_flag & UT_RAGE_VNODES)) { - /* - * process has indicated that it wants any - * vnodes created on its behalf to be rapidly - * aged to reduce the impact on the cached set - * of vnodes - */ - vp->v_flag |= VRAGE; - } - return(0); - } + if ((cnp->cn_flags & UNIONCREATED) == UNIONCREATED) + vp->v_flag |= VISUNION; + } + if ((param->vnfs_flags & VNFS_CANTCACHE) == 0) { + /* + * this vnode is being created as cacheable in the name cache + * this allows us to re-enter it in the cache + */ + vp->v_flag |= VNCACHEABLE; + } + ut = get_bsdthread_info(current_thread()); + + if ((current_proc()->p_lflag & P_LRAGE_VNODES) || + (ut->uu_flag & UT_RAGE_VNODES)) { + /* + * process has indicated that it wants any + * vnodes created on its behalf to be rapidly + * aged to reduce the impact on the cached set + * of vnodes + */ + vp->v_flag |= VRAGE; } - return (EINVAL); + return (0); } int @@ -4309,13 +4453,14 @@ vnode_removefsref(vnode_t vp) int -vfs_iterate(__unused int flags, int (*callout)(mount_t, void *), void *arg) +vfs_iterate(int flags, int 
(*callout)(mount_t, void *), void *arg) { mount_t mp; int ret = 0; fsid_t * fsid_list; int count, actualcount, i; void * allocmem; + int indx_start, indx_stop, indx_incr; count = mount_getvfscnt(); count += 10; @@ -4325,7 +4470,21 @@ vfs_iterate(__unused int flags, int (*callout)(mount_t, void *), void *arg) actualcount = mount_fillfsids(fsid_list, count); - for (i=0; i< actualcount; i++) { + /* + * Establish the iteration direction + * VFS_ITERATE_TAIL_FIRST overrides default head first order (oldest first) + */ + if (flags & VFS_ITERATE_TAIL_FIRST) { + indx_start = actualcount - 1; + indx_stop = -1; + indx_incr = -1; + } else /* Head first by default */ { + indx_start = 0; + indx_stop = actualcount; + indx_incr = 1; + } + + for (i=indx_start; i != indx_stop; i += indx_incr) { /* obtain the mount point with iteration reference */ mp = mount_list_lookupby_fsid(&fsid_list[i], 0, 1); @@ -4567,7 +4726,8 @@ vnode_lookup(const char *path, int flags, vnode_t *vpp, vfs_context_t ctx) ndflags |= DOWHITEOUT; /* XXX AUDITVNPATH1 needed ? */ - NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx); + NDINIT(&nd, LOOKUP, OP_LOOKUP, ndflags, UIO_SYSSPACE, + CAST_USER_ADDR_T(path), ctx); if ((error = namei(&nd))) return (error); @@ -4603,7 +4763,8 @@ vnode_open(const char *path, int fmode, int cmode, int flags, vnode_t *vpp, vfs_ ndflags |= DOWHITEOUT; /* XXX AUDITVNPATH1 needed ? */ - NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx); + NDINIT(&nd, LOOKUP, OP_OPEN, ndflags, UIO_SYSSPACE, + CAST_USER_ADDR_T(path), ctx); if ((error = vn_open(&nd, fmode, cmode))) *vpp = NULL; @@ -4656,6 +4817,18 @@ vnode_setsize(vnode_t vp, off_t size, int ioflag, vfs_context_t ctx) return(vnode_setattr(vp, &va, ctx)); } +static int +vn_create_reg(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx) +{ + /* Only use compound VNOP for compound operation */ + if (vnode_compound_open_available(dvp) && ((flags & VN_CREATE_DOOPEN) != 0)) { + *vpp = NULLVP; + return VNOP_COMPOUND_OPEN(dvp, vpp, ndp, VNOP_COMPOUND_OPEN_DO_CREATE, fmode, statusp, vap, ctx); + } else { + return VNOP_CREATE(dvp, vpp, &ndp->ni_cnd, vap, ctx); + } +} + /* * Create a filesystem object of arbitrary type with arbitrary attributes in * the spevied directory with the specified name. @@ -4698,70 +4871,48 @@ vnode_setsize(vnode_t vp, off_t size, int ioflag, vfs_context_t ctx) * in the code they originated. */ errno_t -vn_create(vnode_t dvp, vnode_t *vpp, struct componentname *cnp, struct vnode_attr *vap, int flags, vfs_context_t ctx) +vn_create(vnode_t dvp, vnode_t *vpp, struct nameidata *ndp, struct vnode_attr *vap, uint32_t flags, int fmode, uint32_t *statusp, vfs_context_t ctx) { - kauth_acl_t oacl, nacl; - int initial_acl; - errno_t error; + errno_t error, old_error; vnode_t vp = (vnode_t)0; + boolean_t batched; + struct componentname *cnp; + uint32_t defaulted; + cnp = &ndp->ni_cnd; error = 0; - oacl = nacl = NULL; - initial_acl = 0; + batched = namei_compound_available(dvp, ndp) ? TRUE : FALSE; KAUTH_DEBUG("%p CREATE - '%s'", dvp, cnp->cn_nameptr); + if (flags & VN_CREATE_NOINHERIT) + vap->va_vaflags |= VA_NOINHERIT; + if (flags & VN_CREATE_NOAUTH) + vap->va_vaflags |= VA_NOAUTH; /* - * Handle ACL inheritance. + * Handle ACL inheritance, initialize vap. 
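
The direction selection added to vfs_iterate() above collapses both iteration orders into a single loop over (start, stop, step). Minimal form of the same pattern:

    /* Visit ids[] head-first (oldest first) or, on request, tail-first. */
    static void
    iterate_ids(const int *ids, int count, int tail_first, void (*cb)(int id))
    {
        int start, stop, step, i;

        if (tail_first) {
            start = count - 1; stop = -1;    step = -1;
        } else {
            start = 0;         stop = count; step = 1;
        }
        for (i = start; i != stop; i += step)
            cb(ids[i]);
    }
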
*/ - if (!(flags & VN_CREATE_NOINHERIT) && vfs_extendedsecurity(dvp->v_mount)) { - /* save the original filesec */ - if (VATTR_IS_ACTIVE(vap, va_acl)) { - initial_acl = 1; - oacl = vap->va_acl; - } - - vap->va_acl = NULL; - if ((error = kauth_acl_inherit(dvp, - oacl, - &nacl, - vap->va_type == VDIR, - ctx)) != 0) { - KAUTH_DEBUG("%p CREATE - error %d processing inheritance", dvp, error); - return(error); - } + error = vn_attribute_prepare(dvp, vap, &defaulted, ctx); + if (error) { + return error; + } - /* - * If the generated ACL is NULL, then we can save ourselves some effort - * by clearing the active bit. - */ - if (nacl == NULL) { - VATTR_CLEAR_ACTIVE(vap, va_acl); - } else { - VATTR_SET(vap, va_acl, nacl); - } + if (vap->va_type != VREG && (fmode != 0 || (flags & VN_CREATE_DOOPEN) || statusp)) { + panic("Open parameters, but not a regular file."); } - - /* - * Check and default new attributes. - * This will set va_uid, va_gid, va_mode and va_create_time at least, if the caller - * hasn't supplied them. - */ - if ((error = vnode_authattr_new(dvp, vap, flags & VN_CREATE_NOAUTH, ctx)) != 0) { - KAUTH_DEBUG("%p CREATE - error %d handing/defaulting attributes", dvp, error); - goto out; + if ((fmode != 0) && ((flags & VN_CREATE_DOOPEN) == 0)) { + panic("Mode for open, but not trying to open..."); } - /* * Create the requested node. */ switch(vap->va_type) { case VREG: - error = VNOP_CREATE(dvp, vpp, cnp, vap, ctx); + error = vn_create_reg(dvp, vpp, ndp, vap, flags, fmode, statusp, ctx); break; case VDIR: - error = VNOP_MKDIR(dvp, vpp, cnp, vap, ctx); + error = vn_mkdir(dvp, vpp, ndp, vap, ctx); break; case VSOCK: case VFIFO: @@ -4778,6 +4929,8 @@ vn_create(vnode_t dvp, vnode_t *vpp, struct componentname *cnp, struct vnode_att } vp = *vpp; + old_error = error; + #if CONFIG_MACF if (!(flags & VN_CREATE_NOLABEL)) { error = vnode_label(vnode_mount(vp), dvp, vp, cnp, VNODE_LABEL_CREATE, ctx); @@ -4797,24 +4950,22 @@ vn_create(vnode_t dvp, vnode_t *vpp, struct componentname *cnp, struct vnode_att #if CONFIG_MACF error: #endif - if ((error != 0 ) && (vp != (vnode_t)0)) { - *vpp = (vnode_t) 0; - vnode_put(vp); + if ((error != 0) && (vp != (vnode_t)0)) { + + /* If we've done a compound open, close */ + if (batched && (old_error == 0) && (vap->va_type == VREG)) { + VNOP_CLOSE(vp, fmode, ctx); + } + + /* Need to provide notifications if a create succeeded */ + if (!batched) { + *vpp = (vnode_t) 0; + vnode_put(vp); + } } out: - /* - * If the caller supplied a filesec in vap, it has been replaced - * now by the post-inheritance copy. We need to put the original back - * and free the inherited product. - */ - if (initial_acl) { - VATTR_SET(vap, va_acl, oacl); - } else { - VATTR_CLEAR_ACTIVE(vap, va_acl); - } - if (nacl != NULL) - kauth_acl_free(nacl); + vn_attribute_cleanup(vap, defaulted); return(error); } @@ -4845,6 +4996,433 @@ vnode_authorize_init(void) vnode_scope = kauth_register_scope(KAUTH_SCOPE_VNODE, vnode_authorize_callback, NULL); } +#define VATTR_PREPARE_DEFAULTED_UID 0x1 +#define VATTR_PREPARE_DEFAULTED_GID 0x2 +#define VATTR_PREPARE_DEFAULTED_MODE 0x4 + +int +vn_attribute_prepare(vnode_t dvp, struct vnode_attr *vap, uint32_t *defaulted_fieldsp, vfs_context_t ctx) +{ + kauth_acl_t nacl = NULL, oacl = NULL; + int error; + + /* + * Handle ACL inheritance. 
+ */ + if (!(vap->va_vaflags & VA_NOINHERIT) && vfs_extendedsecurity(dvp->v_mount)) { + /* save the original filesec */ + if (VATTR_IS_ACTIVE(vap, va_acl)) { + oacl = vap->va_acl; + } + + vap->va_acl = NULL; + if ((error = kauth_acl_inherit(dvp, + oacl, + &nacl, + vap->va_type == VDIR, + ctx)) != 0) { + KAUTH_DEBUG("%p CREATE - error %d processing inheritance", dvp, error); + return(error); + } + + /* + * If the generated ACL is NULL, then we can save ourselves some effort + * by clearing the active bit. + */ + if (nacl == NULL) { + VATTR_CLEAR_ACTIVE(vap, va_acl); + } else { + vap->va_base_acl = oacl; + VATTR_SET(vap, va_acl, nacl); + } + } + + error = vnode_authattr_new_internal(dvp, vap, (vap->va_vaflags & VA_NOAUTH), defaulted_fieldsp, ctx); + if (error) { + vn_attribute_cleanup(vap, *defaulted_fieldsp); + } + + return error; +} + +void +vn_attribute_cleanup(struct vnode_attr *vap, uint32_t defaulted_fields) +{ + /* + * If the caller supplied a filesec in vap, it has been replaced + * now by the post-inheritance copy. We need to put the original back + * and free the inherited product. + */ + kauth_acl_t nacl, oacl; + + if (VATTR_IS_ACTIVE(vap, va_acl)) { + nacl = vap->va_acl; + oacl = vap->va_base_acl; + + if (oacl) { + VATTR_SET(vap, va_acl, oacl); + vap->va_base_acl = NULL; + } else { + VATTR_CLEAR_ACTIVE(vap, va_acl); + } + + if (nacl != NULL) { + kauth_acl_free(nacl); + } + } + + if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_MODE) != 0) { + VATTR_CLEAR_ACTIVE(vap, va_mode); + } + if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_GID) != 0) { + VATTR_CLEAR_ACTIVE(vap, va_gid); + } + if ((defaulted_fields & VATTR_PREPARE_DEFAULTED_UID) != 0) { + VATTR_CLEAR_ACTIVE(vap, va_uid); + } + + return; +} + +int +vn_authorize_unlink(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, __unused void *reserved) +{ + int error = 0; + + /* + * Normally, unlinking of directories is not supported. + * However, some file systems may have limited support. 
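
In the vn_attribute_prepare()/vn_attribute_cleanup() pair above, prepare stashes the caller's ACL in va_base_acl and substitutes the inherited product; cleanup frees the product and hands the original back. A compact user-space rendering of that own-and-restore discipline (types and the derive stub are invented):

    #include <stdlib.h>

    struct attrs {
        void *acl;          /* caller-visible ACL slot */
        void *base_acl;     /* parked original, as in va_base_acl */
    };

    /* Stand-in for kauth_acl_inherit(): derives a new ACL from the old. */
    static void *
    derive_acl(void *base)
    {
        (void)base;
        return malloc(1);
    }

    static int
    attrs_prepare(struct attrs *a)
    {
        void *derived = derive_acl(a->acl);

        if (derived == NULL)
            return -1;
        a->base_acl = a->acl;   /* remember what the caller handed us */
        a->acl = derived;       /* install the inherited product */
        return 0;
    }

    static void
    attrs_cleanup(struct attrs *a)
    {
        free(a->acl);           /* free the inherited product */
        a->acl = a->base_acl;   /* hand the original back untouched */
        a->base_acl = NULL;
    }
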
+ */ + if ((vp->v_type == VDIR) && + !(vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSDIRLINKS)) { + return (EPERM); /* POSIX */ + } + + /* authorize the delete operation */ +#if CONFIG_MACF + if (!error) + error = mac_vnode_check_unlink(ctx, dvp, vp, cnp); +#endif /* MAC */ + if (!error) + error = vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx); + + return error; +} + +int +vn_authorize_open_existing(vnode_t vp, struct componentname *cnp, int fmode, vfs_context_t ctx, void *reserved) +{ + /* Open of existing case */ + kauth_action_t action; + int error = 0; + + if (cnp->cn_ndp == NULL) { + panic("NULL ndp"); + } + if (reserved != NULL) { + panic("reserved not NULL."); + } + +#if CONFIG_MACF + /* XXX may do duplicate work here, but ignore that for now (idempotent) */ + if (vfs_flags(vnode_mount(vp)) & MNT_MULTILABEL) { + error = vnode_label(vnode_mount(vp), NULL, vp, NULL, 0, ctx); + if (error) + return (error); + } +#endif + + if ( (fmode & O_DIRECTORY) && vp->v_type != VDIR ) { + return (ENOTDIR); + } + + if (vp->v_type == VSOCK && vp->v_tag != VT_FDESC) { + return (EOPNOTSUPP); /* Operation not supported on socket */ + } + + if (vp->v_type == VLNK && (fmode & O_NOFOLLOW) != 0) { + return (ELOOP); /* O_NOFOLLOW was specified and the target is a symbolic link */ + } + + /* disallow write operations on directories */ + if (vnode_isdir(vp) && (fmode & (FWRITE | O_TRUNC))) { + return (EISDIR); + } + + if ((cnp->cn_ndp->ni_flag & NAMEI_TRAILINGSLASH)) { + if (vp->v_type != VDIR) { + return (ENOTDIR); + } + } + +#if CONFIG_MACF + /* If a file being opened is a shadow file containing + * namedstream data, ignore the macf checks because it + * is a kernel internal file and access should always + * be allowed. + */ + if (!(vnode_isshadow(vp) && vnode_isnamedstream(vp))) { + error = mac_vnode_check_open(ctx, vp, fmode); + if (error) { + return (error); + } + } +#endif + + /* compute action to be authorized */ + action = 0; + if (fmode & FREAD) { + action |= KAUTH_VNODE_READ_DATA; + } + if (fmode & (FWRITE | O_TRUNC)) { + /* + * If we are writing, appending, and not truncating, + * indicate that we are appending so that if the + * UF_APPEND or SF_APPEND bits are set, we do not deny + * the open. + */ + if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) { + action |= KAUTH_VNODE_APPEND_DATA; + } else { + action |= KAUTH_VNODE_WRITE_DATA; + } + } + return (vnode_authorize(vp, NULL, action, ctx)); +} + +int +vn_authorize_create(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved) +{ + /* Creation case */ + int error; + + if (cnp->cn_ndp == NULL) { + panic("NULL cn_ndp"); + } + if (reserved != NULL) { + panic("reserved not NULL."); + } + + /* Only validate path for creation if we didn't do a complete lookup */ + if (cnp->cn_ndp->ni_flag & NAMEI_UNFINISHED) { + error = lookup_validate_creation_path(cnp->cn_ndp); + if (error) + return (error); + } + +#if CONFIG_MACF + error = mac_vnode_check_create(ctx, dvp, cnp, vap); + if (error) + return (error); +#endif /* CONFIG_MACF */ + + return (vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)); +} + +int +vn_authorize_rename(struct vnode *fdvp, struct vnode *fvp, struct componentname *fcnp, + struct vnode *tdvp, struct vnode *tvp, struct componentname *tcnp, + vfs_context_t ctx, void *reserved) +{ + int error = 0; + int moving = 0; + + if (reserved != NULL) { + panic("Passed something other than NULL as reserved field!"); + } + + /* + * Avoid renaming "." and "..". + * + * XXX No need to check for this in the FS. 
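
The action computation in vn_authorize_open_existing() above reduces to a small decision table; the notable case is appending without truncating, which requests only append rights so that append-only (UF_APPEND/SF_APPEND) files stay openable with O_APPEND. A sketch with invented constants standing in for the kernel's FREAD/FWRITE mode bits and KAUTH_VNODE_* rights:

    /* Invented stand-ins for the kernel's FREAD/FWRITE/O_* mode bits... */
    #define FMODE_READ    0x01
    #define FMODE_WRITE   0x02
    #define FMODE_APPEND  0x08
    #define FMODE_TRUNC   0x400

    #define RIGHT_READ    0x1   /* ...and for KAUTH_VNODE_READ_DATA etc. */
    #define RIGHT_WRITE   0x2
    #define RIGHT_APPEND  0x4

    static int
    open_rights(int fmode)
    {
        int action = 0;

        if (fmode & FMODE_READ)
            action |= RIGHT_READ;
        if (fmode & (FMODE_WRITE | FMODE_TRUNC)) {
            /* Appending without truncation asks only for append rights. */
            if ((fmode & FMODE_APPEND) && !(fmode & FMODE_TRUNC))
                action |= RIGHT_APPEND;
            else
                action |= RIGHT_WRITE;
        }
        return action;
    }
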
We should always have the leaves
+ * in VFS in this case.
+ */
+	if (fvp->v_type == VDIR &&
+	    ((fdvp == fvp) ||
+	     (fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
+	     ((fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT)) ) {
+		error = EINVAL;
+		goto out;
+	}
+
+	if (tvp == NULLVP && vnode_compound_rename_available(tdvp)) {
+		error = lookup_validate_creation_path(tcnp->cn_ndp);
+		if (error)
+			goto out;
+	}
+
+	/***** <MACF> *****/
+#if CONFIG_MACF
+	error = mac_vnode_check_rename_from(ctx, fdvp, fvp, fcnp);
+	if (error)
+		goto out;
+#endif
+
+#if CONFIG_MACF
+	error = mac_vnode_check_rename_to(ctx,
+			tdvp, tvp, fdvp == tdvp, tcnp);
+	if (error)
+		goto out;
+#endif
+	/***** </MACF> *****/
+
+	/***** <MiscChecks> *****/
+	if (tvp != NULL) {
+		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
+			error = ENOTDIR;
+			goto out;
+		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
+			error = EISDIR;
+			goto out;
+		}
+	}
+
+	if (fvp == tdvp) {
+		error = EINVAL;
+		goto out;
+	}
+
+	/*
+	 * The following edge case is caught here:
+	 * (to cannot be a descendant of from)
+	 *
+	 *       o fdvp
+	 *      /
+	 *     /
+	 *    o fvp
+	 *     \
+	 *      \
+	 *       o tdvp
+	 *      /
+	 *     /
+	 *    o tvp
+	 */
+	if (tdvp->v_parent == fvp) {
+		error = EINVAL;
+		goto out;
+	}
+	/***** </MiscChecks> *****/
+
+	/***** <Kauth> *****/
+
+	error = 0;
+	if ((tvp != NULL) && vnode_isdir(tvp)) {
+		if (tvp != fdvp)
+			moving = 1;
+	} else if (tdvp != fdvp) {
+		moving = 1;
+	}
+
+
+	/*
+	 * must have delete rights to remove the old name even in
+	 * the simple case of fdvp == tdvp.
+	 *
+	 * If fvp is a directory, and we are changing its parent,
+	 * then we also need rights to rewrite its ".." entry as well.
+	 */
+	if (vnode_isdir(fvp)) {
+		if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0)
+			goto out;
+	} else {
+		if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE, ctx)) != 0)
+			goto out;
+	}
+	if (moving) {
+		/* moving into tdvp or tvp, must have rights to add */
+		if ((error = vnode_authorize(((tvp != NULL) && vnode_isdir(tvp)) ? tvp : tdvp,
+						NULL,
+						vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE,
+						ctx)) != 0) {
+			goto out;
+		}
+	} else {
+		/* node staying in same directory, must be allowed to add new name */
+		if ((error = vnode_authorize(fdvp, NULL,
+						vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, ctx)) != 0)
+			goto out;
+	}
+	/* overwriting tvp */
+	if ((tvp != NULL) && !vnode_isdir(tvp) &&
+	    ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0)) {
+		goto out;
+	}
+
+	/***** </Kauth> *****/
+
+	/* XXX more checks?
*/ +out: + return error; +} + +int +vn_authorize_mkdir(vnode_t dvp, struct componentname *cnp, struct vnode_attr *vap, vfs_context_t ctx, void *reserved) +{ + int error; + + if (reserved != NULL) { + panic("reserved not NULL in vn_authorize_mkdir()"); + } + + /* XXX A hack for now, to make shadow files work */ + if (cnp->cn_ndp == NULL) { + return 0; + } + + if (vnode_compound_mkdir_available(dvp)) { + error = lookup_validate_creation_path(cnp->cn_ndp); + if (error) + goto out; + } + +#if CONFIG_MACF + error = mac_vnode_check_create(ctx, + dvp, cnp, vap); + if (error) + goto out; +#endif + + /* authorize addition of a directory to the parent */ + if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) + goto out; + +out: + return error; +} + +int +vn_authorize_rmdir(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved) +{ + int error; + + if (reserved != NULL) { + panic("Non-NULL reserved argument to vn_authorize_rmdir()"); + } + + if (vp->v_type != VDIR) { + /* + * rmdir only deals with directories + */ + return ENOTDIR; + } + + if (dvp == vp) { + /* + * No rmdir "." please. + */ + return EINVAL; + } + +#if CONFIG_MACF + error = mac_vnode_check_unlink(ctx, dvp, + vp, cnp); + if (error) + return error; +#endif + + return vnode_authorize(vp, dvp, KAUTH_VNODE_DELETE, ctx); +} + /* * Authorize an operation on a vnode. * @@ -5254,8 +5832,11 @@ vnode_authorize_posix(vauth_ctx vcp, int action, int on_dir) } else { error = vauth_file_ingroup(vcp, &ismember, (!group_ok ? EACCES : 0)); } - if (error) - goto out; + if (error) { + if (!group_ok) + ismember = 1; + error = 0; + } if (ismember) { _SETWHERE("group"); if (!group_ok) @@ -5324,8 +5905,6 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) /* check the ACL on the directory */ delete_child_denied = 0; if (!cached_delete_child && VATTR_IS_NOT(dvap, va_acl, NULL)) { - errno_t posix_error; - eval.ae_requested = KAUTH_VNODE_DELETE_CHILD; eval.ae_acl = &dvap->va_acl->acl_ace[0]; eval.ae_count = dvap->va_acl->acl_entrycount; @@ -5338,9 +5917,11 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) * have the ACL evaluation answer. Previously, we would * always deny the operation at this point. */ - if ((posix_error = vauth_dir_ingroup(vcp, &ismember, ENOENT)) != 0 && posix_error != ENOENT) - return(posix_error); - if (ismember) + if ((error = vauth_dir_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) + return(error); + if (error == ENOENT) + eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN; + else if (ismember) eval.ae_options |= KAUTH_AEVAL_IN_GROUP; eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS; eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS; @@ -5361,18 +5942,11 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) case KAUTH_RESULT_DENY: delete_child_denied = 1; break; - case KAUTH_RESULT_ALLOW: - KAUTH_DEBUG("%p ALLOWED - granted by directory ACL", vcp->vp); - return(0); + /* FALLSTHROUGH */ + case KAUTH_RESULT_ALLOW: + KAUTH_DEBUG("%p ALLOWED - granted by directory ACL", vcp->vp); + return(0); case KAUTH_RESULT_DEFER: - /* - * If we don't have a POSIX answer of "yes", and we - * can't get an ACL answer, then we deny it now. 
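
The recurring change in these authorization hunks: a failed group-membership probe (ENOENT) no longer forces a denial; it sets KAUTH_AEVAL_IN_GROUP_UNKNOWN and lets the ACL evaluator decide. The probe-and-flag step, isolated (option bits invented; ingroup() follows the vauth_*_ingroup convention of returning 0 and setting *ismember, or ENOENT when membership is unknown):

    #include <errno.h>

    #define AEVAL_IN_GROUP          0x1     /* invented stand-ins for the */
    #define AEVAL_IN_GROUP_UNKNOWN  0x2     /* KAUTH_AEVAL_* option bits  */

    static int
    group_eval_options(int (*ingroup)(int *ismember), unsigned *options)
    {
        int ismember = 0;
        int error = ingroup(&ismember);

        if (error != 0 && error != ENOENT)
            return error;                       /* hard failure: bail out */
        if (error == ENOENT)
            *options |= AEVAL_IN_GROUP_UNKNOWN; /* let the evaluator decide */
        else if (ismember)
            *options |= AEVAL_IN_GROUP;
        return 0;
    }
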
- */ - if (posix_error == ENOENT) { - delete_child_denied = 1; - break; - } default: /* Effectively the same as !delete_child_denied */ KAUTH_DEBUG("%p DEFERRED - directory ACL", vcp->vp); @@ -5383,8 +5957,6 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) /* check the ACL on the node */ delete_denied = 0; if (VATTR_IS_NOT(vap, va_acl, NULL)) { - errno_t posix_error; - eval.ae_requested = KAUTH_VNODE_DELETE; eval.ae_acl = &vap->va_acl->acl_ace[0]; eval.ae_count = vap->va_acl->acl_entrycount; @@ -5397,9 +5969,11 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) * have the ACL evaluation answer. Previously, we would * always deny the operation at this point. */ - if ((posix_error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && posix_error != ENOENT) - return(posix_error); - if (ismember) + if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) + return(error); + if (error == ENOENT) + eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN; + else if (ismember) eval.ae_options |= KAUTH_AEVAL_IN_GROUP; eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS; eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS; @@ -5419,13 +5993,6 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) KAUTH_DEBUG("%p ALLOWED - granted by file ACL", vcp->vp); return(0); case KAUTH_RESULT_DEFER: - /* - * If we don't have a POSIX answer of "yes", and we - * can't get an ACL answer, then we deny it now. - */ - if (posix_error == ENOENT) { - delete_denied = 1; - } default: /* Effectively the same as !delete_child_denied */ KAUTH_DEBUG("%p DEFERRED%s - by file ACL", vcp->vp, delete_denied ? "(DENY)" : ""); @@ -5447,13 +6014,13 @@ vnode_authorize_delete(vauth_ctx vcp, boolean_t cached_delete_child) */ if (!cached_delete_child && (dvap->va_mode & S_ISTXT) && !vauth_file_owner(vcp) && !vauth_dir_owner(vcp)) { KAUTH_DEBUG("%p DENIED - sticky bit rules (user %d file %d dir %d)", - vcp->vp, cred->cr_uid, vap->va_uid, dvap->va_uid); + vcp->vp, cred->cr_posix.cr_uid, vap->va_uid, dvap->va_uid); return(EACCES); } /* check the directory */ if (!cached_delete_child && (error = vnode_authorize_posix(vcp, VWRITE, 1 /* on_dir */)) != 0) { - KAUTH_DEBUG("%p ALLOWED - granted by posix permisssions", vcp->vp); + KAUTH_DEBUG("%p DENIED - denied by posix permisssions", vcp->vp); return(error); } @@ -5501,8 +6068,6 @@ vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_r /* if we have an ACL, evaluate it */ if (VATTR_IS_NOT(vap, va_acl, NULL)) { - errno_t posix_error; - eval.ae_requested = acl_rights; eval.ae_acl = &vap->va_acl->acl_ace[0]; eval.ae_count = vap->va_acl->acl_entrycount; @@ -5515,9 +6080,11 @@ vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_r * have the ACL evaluation answer. Previously, we would * always deny the operation at this point. 
*/ - if ((posix_error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && posix_error != ENOENT) - return(posix_error); - if (ismember) + if ((error = vauth_file_ingroup(vcp, &ismember, ENOENT)) != 0 && error != ENOENT) + return(error); + if (error == ENOENT) + eval.ae_options |= KAUTH_AEVAL_IN_GROUP_UNKNOWN; + else if (ismember) eval.ae_options |= KAUTH_AEVAL_IN_GROUP; eval.ae_exp_gall = KAUTH_VNODE_GENERIC_ALL_BITS; eval.ae_exp_gread = KAUTH_VNODE_GENERIC_READ_BITS; @@ -5537,14 +6104,6 @@ vnode_authorize_simple(vauth_ctx vcp, kauth_ace_rights_t acl_rights, kauth_ace_r KAUTH_DEBUG("%p ALLOWED - all rights granted by ACL", vcp->vp); return(0); case KAUTH_RESULT_DEFER: - /* - * If we don't have a POSIX answer of "yes", and we - * can't get an ACL answer, then we deny it now. - */ - if (posix_error == ENOENT) { - KAUTH_DEBUG("%p DENIED(DEFERRED) - by ACL", vcp->vp); - return(EACCES); /* deny, deny, counter-allege */ - } default: /* Effectively the same as !delete_child_denied */ KAUTH_DEBUG("%p DEFERRED - directory ACL", vcp->vp); @@ -5866,8 +6425,8 @@ vnode_authorize_callback(kauth_cred_t cred, void *idata, kauth_action_t action, * find the stream and flush its cache. */ if (vnode_isnamedstream(vp) && (!vfs_authopaque(vp->v_mount))) { - cvp = vp->v_parent; - if ((cvp != NULLVP) && (vnode_getwithref(cvp) == 0)) { + cvp = vnode_getparent(vp); + if (cvp != NULLVP) { parent_iocount = 1; } else { cvp = NULL; @@ -5897,8 +6456,10 @@ vnode_authorize_callback(kauth_cred_t cred, void *idata, kauth_action_t action, defer: result = vnode_authorize_callback_int(cred, idata, action, arg0, arg1, arg2, arg3); - if (result == KAUTH_RESULT_ALLOW && cvp != NULLVP) + if (result == KAUTH_RESULT_ALLOW && cvp != NULLVP) { + KAUTH_DEBUG("%p - caching action = %x", cvp, action); vnode_cache_authorized_action(cvp, ctx, action); + } out: if (parent_iocount) { @@ -6068,7 +6629,7 @@ vnode_authorize_callback_int(__unused kauth_cred_t unused_cred, __unused void *i */ if (vnode_isnamedstream(vp) && (vp->v_parent != NULL) && - (vget_internal(vp->v_parent, 0, VNODE_NODEAD) == 0)) { + (vget_internal(vp->v_parent, 0, VNODE_NODEAD | VNODE_DRAINO) == 0)) { parent_ref = TRUE; vcp->vp = vp = vp->v_parent; if (VATTR_IS_SUPPORTED(&va, va_acl) && (va.va_acl != NULL)) @@ -6175,6 +6736,7 @@ out: if (VATTR_IS_SUPPORTED(&dva, va_mode) && !(dva.va_mode & (S_ISVTX))) { /* OK to cache delete rights */ + KAUTH_DEBUG("%p - caching DELETE_CHILD rights", dvp); vnode_cache_authorized_action(dvp, ctx, KAUTH_VNODE_DELETE_CHILD); } } @@ -6188,12 +6750,18 @@ out: return(KAUTH_RESULT_ALLOW); } +int +vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_t ctx) +{ + return vnode_authattr_new_internal(dvp, vap, noauth, NULL, ctx); +} + /* * Check that the attribute information in vattr can be legally applied to * a new file by the context. 
*/ -int -vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_t ctx) +static int +vnode_authattr_new_internal(vnode_t dvp, struct vnode_attr *vap, int noauth, uint32_t *defaulted_fieldsp, vfs_context_t ctx) { int error; int has_priv_suser, ismember, defaulted_owner, defaulted_group, defaulted_mode; @@ -6202,6 +6770,11 @@ vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_ mount_t dmp; error = 0; + + if (defaulted_fieldsp) { + *defaulted_fieldsp = 0; + } + defaulted_owner = defaulted_group = defaulted_mode = 0; /* @@ -6384,6 +6957,17 @@ vnode_authattr_new(vnode_t dvp, struct vnode_attr *vap, int noauth, vfs_context_ } } out: + if (defaulted_fieldsp) { + if (defaulted_mode) { + *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_MODE; + } + if (defaulted_group) { + *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_GID; + } + if (defaulted_owner) { + *defaulted_fieldsp |= VATTR_PREPARE_DEFAULTED_UID; + } + } return(error); } @@ -6481,6 +7065,14 @@ vnode_authattr(vnode_t vp, struct vnode_attr *vap, kauth_action_t *actionp, vfs_ VATTR_WANTED(&ova, va_flags); } + /* + * If ACLs are being changed, we need the old ACLs. + */ + if (VATTR_IS_ACTIVE(vap, va_acl)) { + KAUTH_DEBUG("ATTR - acl changing, fetching old flags"); + VATTR_WANTED(&ova, va_acl); + } + /* * If the size is being set, make sure it's not a directory. */ @@ -6886,7 +7478,7 @@ no_guuid_change: KAUTH_DEBUG("CHMOD - adding/removing ACL entries"); } else if (vap->va_acl->acl_entrycount > 0) { /* both ACLs have the same ACE count, said count is 1 or more, bitwise compare ACLs */ - if (!memcmp(&vap->va_acl->acl_ace[0], &ova.va_acl->acl_ace[0], + if (memcmp(&vap->va_acl->acl_ace[0], &ova.va_acl->acl_ace[0], sizeof(struct kauth_ace) * vap->va_acl->acl_entrycount)) { required_action |= KAUTH_VNODE_WRITE_SECURITY; KAUTH_DEBUG("CHMOD - changing ACL entries"); @@ -6909,29 +7501,28 @@ out: return(error); } +static int +setlocklocal_callback(struct vnode *vp, __unused void *cargs) +{ + vnode_lock_spin(vp); + vp->v_flag |= VLOCKLOCAL; + vnode_unlock(vp); + + return (VNODE_RETURNED); +} void vfs_setlocklocal(mount_t mp) { - vnode_t vp; - - mount_lock(mp); + mount_lock_spin(mp); mp->mnt_kern_flag |= MNTK_LOCK_LOCAL; + mount_unlock(mp); /* - * We do not expect anyone to be using any vnodes at the - * time this routine is called. So no need for vnode locking + * The number of active vnodes is expected to be + * very small when vfs_setlocklocal is invoked. */ - TAILQ_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { - vp->v_flag |= VLOCKLOCAL; - } - TAILQ_FOREACH(vp, &mp->mnt_workerqueue, v_mntvnodes) { - vp->v_flag |= VLOCKLOCAL; - } - TAILQ_FOREACH(vp, &mp->mnt_newvnodes, v_mntvnodes) { - vp->v_flag |= VLOCKLOCAL; - } - mount_unlock(mp); + vnode_iterate(mp, 0, setlocklocal_callback, NULL); } void @@ -6942,6 +7533,14 @@ vfs_setunmountpreflight(mount_t mp) mount_unlock(mp); } +void +vfs_setcompoundopen(mount_t mp) +{ + mount_lock_spin(mp); + mp->mnt_compound_ops |= COMPOUND_VNOP_OPEN; + mount_unlock(mp); +} + void vn_setunionwait(vnode_t vp) { @@ -7146,13 +7745,17 @@ errno_t rmdir_remove_orphaned_appleDouble(vnode_t vp , vfs_context_t ctx, int * !((dp->d_namlen == 1 && dp->d_name[0] == '.') || (dp->d_namlen == 2 && dp->d_name[0] == '.' 
&& dp->d_name[1] == '.')) ) { - - NDINIT(&nd_temp, DELETE, USEDVP, UIO_SYSSPACE, CAST_USER_ADDR_T(dp->d_name), ctx); + + NDINIT(&nd_temp, DELETE, OP_UNLINK, USEDVP, + UIO_SYSSPACE, CAST_USER_ADDR_T(dp->d_name), + ctx); nd_temp.ni_dvp = vp; error = unlink1(ctx, &nd_temp, 0); - if (error && error != ENOENT) { + + if (error && error != ENOENT) { goto outsc; } + } cpos += dp->d_reclen; dp = (struct dirent*)cpos; @@ -7208,21 +7811,645 @@ lock_vnode_and_post(vnode_t vp, int kevent_num) #ifdef JOE_DEBUG static void record_vp(vnode_t vp, int count) { struct uthread *ut; - int i; +#if CONFIG_TRIGGERS + if (vp->v_resolve) + return; +#endif if ((vp->v_flag & VSYSTEM)) return; ut = get_bsdthread_info(current_thread()); ut->uu_iocount += count; - if (ut->uu_vpindex < 32) { - for (i = 0; i < ut->uu_vpindex; i++) { - if (ut->uu_vps[i] == vp) - return; + if (count == 1) { + if (ut->uu_vpindex < 32) { + OSBacktrace((void **)&ut->uu_pcs[ut->uu_vpindex][0], 10); + + ut->uu_vps[ut->uu_vpindex] = vp; + ut->uu_vpindex++; } - ut->uu_vps[ut->uu_vpindex] = vp; - ut->uu_vpindex++; } } #endif + + +#if CONFIG_TRIGGERS + +#define TRIG_DEBUG 0 + +#if TRIG_DEBUG +#define TRIG_LOG(...) do { printf("%s: ", __FUNCTION__); printf(__VA_ARGS__); } while (0) +#else +#define TRIG_LOG(...) +#endif + +/* + * Resolver result functions + */ + +resolver_result_t +vfs_resolver_result(uint32_t seq, enum resolver_status stat, int aux) +{ + /* + * |<--- 32 --->|<--- 28 --->|<- 4 ->| + * sequence auxiliary status + */ + return (((uint64_t)seq) << 32) | + (((uint64_t)(aux & 0x0fffffff)) << 4) | + (uint64_t)(stat & 0x0000000F); +} + +enum resolver_status +vfs_resolver_status(resolver_result_t result) +{ + /* lower 4 bits is status */ + return (result & 0x0000000F); +} + +uint32_t +vfs_resolver_sequence(resolver_result_t result) +{ + /* upper 32 bits is sequence */ + return (uint32_t)(result >> 32); +} + +int +vfs_resolver_auxiliary(resolver_result_t result) +{ + /* 28 bits of auxiliary */ + return (int)(((uint32_t)(result & 0xFFFFFFF0)) >> 4); +} + +/* + * SPI + * Call in for resolvers to update vnode trigger state + */ +int +vnode_trigger_update(vnode_t vp, resolver_result_t result) +{ + vnode_resolve_t rp; + uint32_t seq; + enum resolver_status stat; + + if (vp->v_resolve == NULL) { + return (EINVAL); + } + + stat = vfs_resolver_status(result); + seq = vfs_resolver_sequence(result); + + if ((stat != RESOLVER_RESOLVED) && (stat != RESOLVER_UNRESOLVED)) { + return (EINVAL); + } + + rp = vp->v_resolve; + lck_mtx_lock(&rp->vr_lock); + + if (seq > rp->vr_lastseq) { + if (stat == RESOLVER_RESOLVED) + rp->vr_flags |= VNT_RESOLVED; + else + rp->vr_flags &= ~VNT_RESOLVED; + + rp->vr_lastseq = seq; + } + + lck_mtx_unlock(&rp->vr_lock); + + return (0); +} + +static int +vnode_resolver_attach(vnode_t vp, vnode_resolve_t rp, boolean_t ref) +{ + int error; + + vnode_lock_spin(vp); + if (vp->v_resolve != NULL) { + vnode_unlock(vp); + return EINVAL; + } else { + vp->v_resolve = rp; + } + vnode_unlock(vp); + + if (ref) { + error = vnode_ref_ext(vp, O_EVTONLY, VNODE_REF_FORCE); + if (error != 0) { + panic("VNODE_REF_FORCE didn't help..."); + } + } + + return 0; +} + +/* + * VFS internal interfaces for vnode triggers + * + * vnode must already have an io count on entry + * v_resolve is stable when io count is non-zero + */ +static int +vnode_resolver_create(mount_t mp, vnode_t vp, struct vnode_trigger_param *tinfo, boolean_t external) +{ + vnode_resolve_t rp; + int result; + char byte; + +#if 1 + /* minimum pointer test (debugging) */ + if 
(tinfo->vnt_data) + byte = *((char *)tinfo->vnt_data); +#endif + MALLOC(rp, vnode_resolve_t, sizeof(*rp), M_TEMP, M_WAITOK); + if (rp == NULL) + return (ENOMEM); + + lck_mtx_init(&rp->vr_lock, trigger_vnode_lck_grp, trigger_vnode_lck_attr); + + rp->vr_resolve_func = tinfo->vnt_resolve_func; + rp->vr_unresolve_func = tinfo->vnt_unresolve_func; + rp->vr_rearm_func = tinfo->vnt_rearm_func; + rp->vr_reclaim_func = tinfo->vnt_reclaim_func; + rp->vr_data = tinfo->vnt_data; + rp->vr_lastseq = 0; + rp->vr_flags = tinfo->vnt_flags & VNT_VALID_MASK; + if (external) { + rp->vr_flags |= VNT_EXTERNAL; + } + + result = vnode_resolver_attach(vp, rp, external); + if (result != 0) { + goto out; + } + + if (mp) { + OSAddAtomic(1, &mp->mnt_numtriggers); + } + + return (result); + +out: + FREE(rp, M_TEMP); + return result; +} + +static void +vnode_resolver_release(vnode_resolve_t rp) +{ + /* + * Give them a chance to free any private data + */ + if (rp->vr_data && rp->vr_reclaim_func) { + rp->vr_reclaim_func(NULLVP, rp->vr_data); + } + + lck_mtx_destroy(&rp->vr_lock, trigger_vnode_lck_grp); + FREE(rp, M_TEMP); + +} + +/* Called after the vnode has been drained */ +static void +vnode_resolver_detach(vnode_t vp) +{ + vnode_resolve_t rp; + mount_t mp; + + mp = vnode_mount(vp); + + vnode_lock(vp); + rp = vp->v_resolve; + vp->v_resolve = NULL; + vnode_unlock(vp); + + if ((rp->vr_flags & VNT_EXTERNAL) != 0) { + vnode_rele_ext(vp, O_EVTONLY, 1); + } + + vnode_resolver_release(rp); + + /* Keep count of active trigger vnodes per mount */ + OSAddAtomic(-1, &mp->mnt_numtriggers); +} + +/* + * Pathname operations that don't trigger a mount for trigger vnodes + */ +static const u_int64_t ignorable_pathops_mask = + 1LL << OP_MOUNT | + 1LL << OP_UNMOUNT | + 1LL << OP_STATFS | + 1LL << OP_ACCESS | + 1LL << OP_GETATTR | + 1LL << OP_LISTXATTR; + +int +vfs_istraditionaltrigger(enum path_operation op, const struct componentname *cnp) +{ + if (cnp->cn_flags & ISLASTCN) + return ((1LL << op) & ignorable_pathops_mask) == 0; + else + return (1); +} + +__private_extern__ +void +vnode_trigger_rearm(vnode_t vp, vfs_context_t ctx) +{ + vnode_resolve_t rp; + resolver_result_t result; + enum resolver_status status; + uint32_t seq; + + if ((vp->v_resolve == NULL) || + (vp->v_resolve->vr_rearm_func == NULL) || + (vp->v_resolve->vr_flags & VNT_AUTO_REARM) == 0) { + return; + } + + rp = vp->v_resolve; + lck_mtx_lock(&rp->vr_lock); + + /* + * Check if VFS initiated this unmount. If so, we'll catch it after the unresolve completes. 
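+	 * (vnode_trigger_unresolve() sets VNT_VFS_UNMOUNTED around its
+	 * unresolve callout and clears it again afterwards, so a VFS-driven
+	 * unmount cannot re-arm the trigger out from under itself.)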
+ */ + if (rp->vr_flags & VNT_VFS_UNMOUNTED) { + lck_mtx_unlock(&rp->vr_lock); + return; + } + + /* Check if this vnode is already armed */ + if ((rp->vr_flags & VNT_RESOLVED) == 0) { + lck_mtx_unlock(&rp->vr_lock); + return; + } + + lck_mtx_unlock(&rp->vr_lock); + + result = rp->vr_rearm_func(vp, 0, rp->vr_data, ctx); + status = vfs_resolver_status(result); + seq = vfs_resolver_sequence(result); + + lck_mtx_lock(&rp->vr_lock); + if (seq > rp->vr_lastseq) { + if (status == RESOLVER_UNRESOLVED) + rp->vr_flags &= ~VNT_RESOLVED; + rp->vr_lastseq = seq; + } + lck_mtx_unlock(&rp->vr_lock); +} + +__private_extern__ +int +vnode_trigger_resolve(vnode_t vp, struct nameidata *ndp, vfs_context_t ctx) +{ + vnode_resolve_t rp; + enum path_operation op; + resolver_result_t result; + enum resolver_status status; + uint32_t seq; + + /* Only trigger on topmost vnodes */ + if ((vp->v_resolve == NULL) || + (vp->v_resolve->vr_resolve_func == NULL) || + (vp->v_mountedhere != NULL)) { + return (0); + } + + rp = vp->v_resolve; + lck_mtx_lock(&rp->vr_lock); + + /* Check if this vnode is already resolved */ + if (rp->vr_flags & VNT_RESOLVED) { + lck_mtx_unlock(&rp->vr_lock); + return (0); + } + + lck_mtx_unlock(&rp->vr_lock); + + /* + * XXX + * assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock) + * is there anyway to know this??? + * there can also be other legitimate lookups in parallel + * + * XXX - should we call this on a separate thread with a timeout? + * + * XXX - should we use ISLASTCN to pick the op value??? Perhaps only leafs should + * get the richer set and non-leafs should get generic OP_LOOKUP? TBD + */ + op = (ndp->ni_op < OP_MAXOP) ? ndp->ni_op: OP_LOOKUP; + + result = rp->vr_resolve_func(vp, &ndp->ni_cnd, op, 0, rp->vr_data, ctx); + status = vfs_resolver_status(result); + seq = vfs_resolver_sequence(result); + + lck_mtx_lock(&rp->vr_lock); + if (seq > rp->vr_lastseq) { + if (status == RESOLVER_RESOLVED) + rp->vr_flags |= VNT_RESOLVED; + rp->vr_lastseq = seq; + } + lck_mtx_unlock(&rp->vr_lock); + + /* On resolver errors, propagate the error back up */ + return (status == RESOLVER_ERROR ? vfs_resolver_auxiliary(result) : 0); +} + +static int +vnode_trigger_unresolve(vnode_t vp, int flags, vfs_context_t ctx) +{ + vnode_resolve_t rp; + resolver_result_t result; + enum resolver_status status; + uint32_t seq; + + if ((vp->v_resolve == NULL) || (vp->v_resolve->vr_unresolve_func == NULL)) { + return (0); + } + + rp = vp->v_resolve; + lck_mtx_lock(&rp->vr_lock); + + /* Check if this vnode is already resolved */ + if ((rp->vr_flags & VNT_RESOLVED) == 0) { + printf("vnode_trigger_unresolve: not currently resolved\n"); + lck_mtx_unlock(&rp->vr_lock); + return (0); + } + + rp->vr_flags |= VNT_VFS_UNMOUNTED; + + lck_mtx_unlock(&rp->vr_lock); + + /* + * XXX + * assumes that resolver will not access this trigger vnode (otherwise the kernel will deadlock) + * there can also be other legitimate lookups in parallel + * + * XXX - should we call this on a separate thread with a timeout? + */ + + result = rp->vr_unresolve_func(vp, flags, rp->vr_data, ctx); + status = vfs_resolver_status(result); + seq = vfs_resolver_sequence(result); + + lck_mtx_lock(&rp->vr_lock); + if (seq > rp->vr_lastseq) { + if (status == RESOLVER_UNRESOLVED) + rp->vr_flags &= ~VNT_RESOLVED; + rp->vr_lastseq = seq; + } + rp->vr_flags &= ~VNT_VFS_UNMOUNTED; + lck_mtx_unlock(&rp->vr_lock); + + /* On resolver errors, propagate the error back up */ + return (status == RESOLVER_ERROR ? 
vfs_resolver_auxiliary(result) : 0); +} + +static int +triggerisdescendant(mount_t mp, mount_t rmp) +{ + int match = FALSE; + + /* + * walk up vnode covered chain looking for a match + */ + name_cache_lock_shared(); + + while (1) { + vnode_t vp; + + /* did we encounter "/" ? */ + if (mp->mnt_flag & MNT_ROOTFS) + break; + + vp = mp->mnt_vnodecovered; + if (vp == NULLVP) + break; + + mp = vp->v_mount; + if (mp == rmp) { + match = TRUE; + break; + } + } + + name_cache_unlock(); + + return (match); +} + +struct trigger_unmount_info { + vfs_context_t ctx; + mount_t top_mp; + vnode_t trigger_vp; + mount_t trigger_mp; + uint32_t trigger_vid; + int flags; +}; + +static int +trigger_unmount_callback(mount_t mp, void * arg) +{ + struct trigger_unmount_info * infop = (struct trigger_unmount_info *)arg; + boolean_t mountedtrigger = FALSE; + + /* + * When we encounter the top level mount we're done + */ + if (mp == infop->top_mp) + return (VFS_RETURNED_DONE); + + if ((mp->mnt_vnodecovered == NULL) || + (vnode_getwithref(mp->mnt_vnodecovered) != 0)) { + return (VFS_RETURNED); + } + + if ((mp->mnt_vnodecovered->v_mountedhere == mp) && + (mp->mnt_vnodecovered->v_resolve != NULL) && + (mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_RESOLVED)) { + mountedtrigger = TRUE; + } + vnode_put(mp->mnt_vnodecovered); + + /* + * When we encounter a mounted trigger, check if its under the top level mount + */ + if ( !mountedtrigger || !triggerisdescendant(mp, infop->top_mp) ) + return (VFS_RETURNED); + + /* + * Process any pending nested mount (now that its not referenced) + */ + if ((infop->trigger_vp != NULLVP) && + (vnode_getwithvid(infop->trigger_vp, infop->trigger_vid) == 0)) { + vnode_t vp = infop->trigger_vp; + int error; + + infop->trigger_vp = NULLVP; + + if (mp == vp->v_mountedhere) { + vnode_put(vp); + printf("trigger_unmount_callback: unexpected match '%s'\n", + mp->mnt_vfsstat.f_mntonname); + return (VFS_RETURNED); + } + if (infop->trigger_mp != vp->v_mountedhere) { + vnode_put(vp); + printf("trigger_unmount_callback: trigger mnt changed! (%p != %p)\n", + infop->trigger_mp, vp->v_mountedhere); + goto savenext; + } + + error = vnode_trigger_unresolve(vp, infop->flags, infop->ctx); + vnode_put(vp); + if (error) { + printf("unresolving: '%s', err %d\n", + vp->v_mountedhere ? vp->v_mountedhere->mnt_vfsstat.f_mntonname : + "???", error); + return (VFS_RETURNED_DONE); /* stop iteration on errors */ + } + } +savenext: + /* + * We can't call resolver here since we hold a mount iter + * ref on mp so save its covered vp for later processing + */ + infop->trigger_vp = mp->mnt_vnodecovered; + if ((infop->trigger_vp != NULLVP) && + (vnode_getwithref(infop->trigger_vp) == 0)) { + if (infop->trigger_vp->v_mountedhere == mp) { + infop->trigger_vid = infop->trigger_vp->v_id; + infop->trigger_mp = mp; + } + vnode_put(infop->trigger_vp); + } + + return (VFS_RETURNED); +} + +/* + * Attempt to unmount any trigger mounts nested underneath a mount. + * This is a best effort attempt and no retries are performed here. 
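+ * Because trigger_unmount_callback() holds a mount iteration reference,
+ * it only records each covered vnode; the vnode_trigger_unresolve() call
+ * is deferred to the next callback invocation (or to the caller, for the
+ * last mount visited).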
+ * + * Note: mp->mnt_rwlock is held exclusively on entry (so be carefull) + */ +__private_extern__ +void +vfs_nested_trigger_unmounts(mount_t mp, int flags, vfs_context_t ctx) +{ + struct trigger_unmount_info info; + + /* Must have trigger vnodes */ + if (mp->mnt_numtriggers == 0) { + return; + } + /* Avoid recursive requests (by checking covered vnode) */ + if ((mp->mnt_vnodecovered != NULL) && + (vnode_getwithref(mp->mnt_vnodecovered) == 0)) { + boolean_t recursive = FALSE; + + if ((mp->mnt_vnodecovered->v_mountedhere == mp) && + (mp->mnt_vnodecovered->v_resolve != NULL) && + (mp->mnt_vnodecovered->v_resolve->vr_flags & VNT_VFS_UNMOUNTED)) { + recursive = TRUE; + } + vnode_put(mp->mnt_vnodecovered); + if (recursive) + return; + } + + /* + * Attempt to unmount any nested trigger mounts (best effort) + */ + info.ctx = ctx; + info.top_mp = mp; + info.trigger_vp = NULLVP; + info.trigger_vid = 0; + info.trigger_mp = NULL; + info.flags = flags; + + (void) vfs_iterate(VFS_ITERATE_TAIL_FIRST, trigger_unmount_callback, &info); + + /* + * Process remaining nested mount (now that its not referenced) + */ + if ((info.trigger_vp != NULLVP) && + (vnode_getwithvid(info.trigger_vp, info.trigger_vid) == 0)) { + vnode_t vp = info.trigger_vp; + + if (info.trigger_mp == vp->v_mountedhere) { + (void) vnode_trigger_unresolve(vp, flags, ctx); + } + vnode_put(vp); + } +} + +int +vfs_addtrigger(mount_t mp, const char *relpath, struct vnode_trigger_info *vtip, vfs_context_t ctx) +{ + struct nameidata nd; + int res; + vnode_t rvp, vp; + struct vnode_trigger_param vtp; + + /* + * Must be called for trigger callback, wherein rwlock is held + */ + lck_rw_assert(&mp->mnt_rwlock, LCK_RW_ASSERT_HELD); + + TRIG_LOG("Adding trigger at %s\n", relpath); + TRIG_LOG("Trying VFS_ROOT\n"); + + /* + * We do a lookup starting at the root of the mountpoint, unwilling + * to cross into other mountpoints. + */ + res = VFS_ROOT(mp, &rvp, ctx); + if (res != 0) { + goto out; + } + + TRIG_LOG("Trying namei\n"); + + NDINIT(&nd, LOOKUP, OP_LOOKUP, USEDVP | NOCROSSMOUNT | FOLLOW, UIO_SYSSPACE, + CAST_USER_ADDR_T(relpath), ctx); + nd.ni_dvp = rvp; + res = namei(&nd); + if (res != 0) { + vnode_put(rvp); + goto out; + } + + vp = nd.ni_vp; + nameidone(&nd); + vnode_put(rvp); + + TRIG_LOG("Trying vnode_resolver_create()\n"); + + /* + * Set up blob. vnode_create() takes a larger structure + * with creation info, and we needed something different + * for this case. One needs to win, or we need to munge both; + * vnode_create() wins. + */ + bzero(&vtp, sizeof(vtp)); + vtp.vnt_resolve_func = vtip->vti_resolve_func; + vtp.vnt_unresolve_func = vtip->vti_unresolve_func; + vtp.vnt_rearm_func = vtip->vti_rearm_func; + vtp.vnt_reclaim_func = vtip->vti_reclaim_func; + vtp.vnt_reclaim_func = vtip->vti_reclaim_func; + vtp.vnt_data = vtip->vti_data; + vtp.vnt_flags = vtip->vti_flags; + + res = vnode_resolver_create(mp, vp, &vtp, TRUE); + vnode_put(vp); +out: + TRIG_LOG("Returning %d\n", res); + return res; +} + +#endif /* CONFIG_TRIGGERS */ diff --git a/bsd/vfs/vfs_syscalls.c b/bsd/vfs/vfs_syscalls.c index 02c3c39af..3d9b4591b 100644 --- a/bsd/vfs/vfs_syscalls.c +++ b/bsd/vfs/vfs_syscalls.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1995-2008 Apple Inc. All rights reserved. + * Copyright (c) 1995-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -92,6 +92,7 @@ #include <sys/quota.h> #include <sys/kdebug.h> #include <sys/fsevents.h> +#include <sys/imgsrc.h> #include <sys/sysproto.h> #include <sys/xattr.h> #include <sys/fcntl.h> @@ -101,7 +102,6 @@ #include <machine/cons.h> #include <machine/limits.h> #include <miscfs/specfs/specdev.h> -#include <miscfs/union/union.h> #include <security/audit/audit.h> #include <bsm/audit_kevents.h> @@ -109,6 +109,7 @@ #include <mach/mach_types.h> #include <kern/kern_types.h> #include <kern/kalloc.h> +#include <kern/task.h> #include <vm/vm_pageout.h> @@ -153,15 +154,21 @@ static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp, static int statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp); static int fsync_common(proc_t p, struct fsync_args *uap, int flags); +static int mount_common(char *fstypename, vnode_t pvp, vnode_t vp, + struct componentname *cnp, user_addr_t fsmountargs, + int flags, uint32_t internal_flags, char *labelstr, boolean_t kernelmount, + vfs_context_t ctx); +void vfs_notify_mount(vnode_t pdvp); + +int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth); #ifdef CONFIG_IMGSRC_ACCESS -static int prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname); static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx); static int place_mount_and_checkdirs(mount_t mp, vnode_t vp, vfs_context_t ctx); static void undo_place_on_covered_vp(mount_t mp, vnode_t vp); static int mount_begin_update(mount_t mp, vfs_context_t ctx, int flags); static void mount_end_update(mount_t mp); -static int relocate_imageboot_source(vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs); +static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index); #endif /* CONFIG_IMGSRC_ACCESS */ int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t); @@ -220,6 +227,60 @@ extern errno_t rmdir_remove_orphaned_appleDouble(vnode_t, vfs_context_t, int *); * Virtual File System System Calls */ +#if NFSCLIENT +/* + * Private in-kernel mounting spi (NFS only, not exported) + */ + __private_extern__ +boolean_t +vfs_iskernelmount(mount_t mp) +{ + return ((mp->mnt_kern_flag & MNTK_KERNEL_MOUNT) ? 
TRUE : FALSE); +} + + __private_extern__ +int +kernel_mount(char *fstype, vnode_t pvp, vnode_t vp, const char *path, + void *data, __unused size_t datalen, int syscall_flags, __unused uint32_t kern_flags, vfs_context_t ctx) +{ + struct nameidata nd; + boolean_t did_namei; + int error; + + NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW | AUDITVNPATH1 | WANTPARENT, + UIO_SYSSPACE, CAST_USER_ADDR_T(path), ctx); + + /* + * Get the vnode to be covered if it's not supplied + */ + if (vp == NULLVP) { + error = namei(&nd); + if (error) + return (error); + vp = nd.ni_vp; + pvp = nd.ni_dvp; + did_namei = TRUE; + } else { + char *pnbuf = CAST_DOWN(char *, path); + + nd.ni_cnd.cn_pnbuf = pnbuf; + nd.ni_cnd.cn_pnlen = strlen(pnbuf) + 1; + did_namei = FALSE; + } + + error = mount_common(fstype, pvp, vp, &nd.ni_cnd, CAST_USER_ADDR_T(data), + syscall_flags, kern_flags, NULL, TRUE, ctx); + + if (did_namei) { + vnode_put(vp); + vnode_put(pvp); + nameidone(&nd); + } + + return (error); +} +#endif /* NFSCLIENT */ + /* * Mount a file system. */ @@ -237,6 +298,13 @@ mount(proc_t p, struct mount_args *uap, __unused int32_t *retval) return (__mac_mount(p, &muap, retval)); } +void +vfs_notify_mount(vnode_t pdvp) +{ + vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL); + lock_vnode_and_post(pdvp, NOTE_WRITE); +} + /* * __mac_mount: * Mount a file system taking into account MAC label behavior. @@ -256,10 +324,135 @@ mount(proc_t p, struct mount_args *uap, __unused int32_t *retval) * Returns: 0 Success * !0 Not success */ +boolean_t root_fs_upgrade_try = FALSE; + int __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int32_t *retval) { - struct vnode *vp, *pvp; + vnode_t pvp, vp; + vfs_context_t ctx = vfs_context_current(); + char fstypename[MFSNAMELEN]; + struct nameidata nd; + size_t dummy=0; + char *labelstr = NULL; + int flags = uap->flags; + int error; + boolean_t is_64bit = IS_64BIT_PROCESS(p); + + /* + * Get the fs type name from user space + */ + error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy); + if (error) + return (error); + + /* + * Get the vnode to be covered + */ + NDINIT(&nd, LOOKUP, OP_MOUNT, NOTRIGGER | FOLLOW | AUDITVNPATH1 | WANTPARENT, + UIO_USERSPACE, uap->path, ctx); + error = namei(&nd); + if (error) + return (error); + vp = nd.ni_vp; + pvp = nd.ni_dvp; + +#ifdef CONFIG_IMGSRC_ACCESS + /* Mounting image source cannot be batched with other operations */ + if (flags == MNT_IMGSRC_BY_INDEX) { + error = relocate_imageboot_source(pvp, vp, &nd.ni_cnd, fstypename, + ctx, is_64bit, uap->data, (flags == MNT_IMGSRC_BY_INDEX)); + goto out; + } +#endif /* CONFIG_IMGSRC_ACCESS */ + +#if CONFIG_MACF + /* + * Get the label string (if any) from user space + */ + if (uap->mac_p != USER_ADDR_NULL) { + struct user_mac mac; + size_t ulen = 0; + + if (is_64bit) { + struct user64_mac mac64; + error = copyin(uap->mac_p, &mac64, sizeof(mac64)); + mac.m_buflen = mac64.m_buflen; + mac.m_string = mac64.m_string; + } else { + struct user32_mac mac32; + error = copyin(uap->mac_p, &mac32, sizeof(mac32)); + mac.m_buflen = mac32.m_buflen; + mac.m_string = mac32.m_string; + } + if (error) + goto out; + if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) || + (mac.m_buflen < 2)) { + error = EINVAL; + goto out; + } + MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK); + error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen); + if (error) { + goto out; + } + AUDIT_ARG(mac_string, labelstr); + } +#endif /* CONFIG_MACF */ + + AUDIT_ARG(fflags, flags); + + if ((vp->v_flag & VROOT) && + 
(vp->v_mount->mnt_flag & MNT_ROOTFS)) { + flags |= MNT_UPDATE; + /* + * See 7392553 for more details on why this check exists. + * Suffice to say: If this check is ON and something tries + * to mount the rootFS RW, we'll turn off the codesign + * bitmap optimization. + */ +#if CHECK_CS_VALIDATION_BITMAP + if ( !(flags & MNT_RDONLY) ) { + root_fs_upgrade_try = TRUE; + } +#endif + } + + error = mount_common(fstypename, pvp, vp, &nd.ni_cnd, uap->data, flags, 0, + labelstr, FALSE, ctx); +out: +#if CONFIG_MACF + if (labelstr) + FREE(labelstr, M_MACTEMP); +#endif /* CONFIG_MACF */ + + vnode_put(vp); + vnode_put(pvp); + nameidone(&nd); + + return (error); +} + +/* + * common mount implementation (final stage of mounting) + + * Arguments: + * fstypename file system type (ie it's vfs name) + * pvp parent of covered vnode + * vp covered vnode + * cnp component name (ie path) of covered vnode + * flags generic mount flags + * fsmountargs file system specific data + * labelstr optional MAC label + * kernelmount TRUE for mounts initiated from inside the kernel + * ctx caller's context + */ +static int +mount_common(char *fstypename, vnode_t pvp, vnode_t vp, + struct componentname *cnp, user_addr_t fsmountargs, int flags, uint32_t internal_flags, + char *labelstr, boolean_t kernelmount, vfs_context_t ctx) +{ struct vnode *devvp = NULLVP; struct vnode *device_vnode = NULLVP; #if CONFIG_MACF @@ -267,57 +460,20 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 #endif struct mount *mp; struct vfstable *vfsp = (struct vfstable *)0; + struct proc *p = vfs_context_proc(ctx); int error, flag = 0; - struct vnode_attr va; - vfs_context_t ctx = vfs_context_current(); - struct nameidata nd; - struct nameidata nd1; - char fstypename[MFSNAMELEN]; - size_t dummy=0; user_addr_t devpath = USER_ADDR_NULL; - user_addr_t fsmountargs = uap->data; int ronly = 0; int mntalloc = 0; boolean_t vfsp_ref = FALSE; - mode_t accessmode; - boolean_t is_64bit; boolean_t is_rwlock_locked = FALSE; boolean_t did_rele = FALSE; boolean_t have_usecount = FALSE; - AUDIT_ARG(fflags, uap->flags); - - is_64bit = proc_is64bit(p); - /* - * Get vnode to be covered + * Process an update for an existing mount */ - NDINIT(&nd, LOOKUP, NOTRIGGER | FOLLOW | AUDITVNPATH1 | WANTPARENT, - UIO_USERSPACE, uap->path, ctx); - error = namei(&nd); - if (error) - return (error); - vp = nd.ni_vp; - pvp = nd.ni_dvp; - - if ((vp->v_flag & VROOT) && - (vp->v_mount->mnt_flag & MNT_ROOTFS)) - uap->flags |= MNT_UPDATE; - - error = copyinstr(uap->type, fstypename, MFSNAMELEN, &dummy); - if (error) - goto out1; - -#ifdef CONFIG_IMGSRC_ACCESS - if (uap->flags == MNT_IMGSRC) { - error = relocate_imageboot_source(vp, &nd.ni_cnd, fstypename, ctx, is_64bit, fsmountargs); - vnode_put(pvp); - vnode_put(vp); - return error; - } -#endif /* CONFIG_IMGSRC_ACCESS */ - - if (uap->flags & MNT_UPDATE) { + if (flags & MNT_UPDATE) { if ((vp->v_flag & VROOT) == 0) { error = EINVAL; goto out1; @@ -338,7 +494,7 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 * We only allow the filesystem to be reloaded if it * is currently mounted read-only. 
*/ - if ((uap->flags & MNT_RELOAD) && + if ((flags & MNT_RELOAD) && ((mp->mnt_flag & MNT_RDONLY) == 0)) { error = ENOTSUP; goto out1; @@ -347,8 +503,7 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 #ifdef CONFIG_IMGSRC_ACCESS /* Can't downgrade the backer of the root FS */ if ((mp->mnt_kern_flag & MNTK_BACKS_ROOT) && - (!vfs_isrdonly(mp)) && (uap->flags & MNT_RDONLY)) - { + (!vfs_isrdonly(mp)) && (flags & MNT_RDONLY)) { error = ENOTSUP; goto out1; } @@ -365,7 +520,6 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 #if CONFIG_MACF error = mac_mount_check_remount(ctx, mp); if (error != 0) { - lck_rw_done(&mp->mnt_rwlock); goto out1; } #endif @@ -373,48 +527,26 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, * and MNT_NOEXEC if mount point is already MNT_NOEXEC. */ - if (suser(vfs_context_ucred(ctx), NULL)) { - uap->flags |= MNT_NOSUID | MNT_NODEV; + if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) { + flags |= MNT_NOSUID | MNT_NODEV; if (mp->mnt_flag & MNT_NOEXEC) - uap->flags |= MNT_NOEXEC; + flags |= MNT_NOEXEC; } flag = mp->mnt_flag; - mp->mnt_flag |= - uap->flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE); + mp->mnt_flag |= flags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE); vfsp = mp->mnt_vtable; goto update; } - /* - * If the user is not root, ensure that they own the directory - * onto which we are attempting to mount. - */ - VATTR_INIT(&va); - VATTR_WANTED(&va, va_uid); - if ((error = vnode_getattr(vp, &va, ctx)) || - (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) && - (error = suser(vfs_context_ucred(ctx), &p->p_acflag)))) { - goto out1; - } /* * For non-root users, silently enforce MNT_NOSUID and MNT_NODEV, and * MNT_NOEXEC if mount point is already MNT_NOEXEC. */ - if (suser(vfs_context_ucred(ctx), NULL)) { - uap->flags |= MNT_NOSUID | MNT_NODEV; + if ((!kernelmount) && suser(vfs_context_ucred(ctx), NULL)) { + flags |= MNT_NOSUID | MNT_NODEV; if (vp->v_mount->mnt_flag & MNT_NOEXEC) - uap->flags |= MNT_NOEXEC; - } - if ( (error = VNOP_FSYNC(vp, MNT_WAIT, ctx)) ) - goto out1; - - if ( (error = buf_invalidateblks(vp, BUF_WRITE_DATA, 0, 0)) ) - goto out1; - - if (vp->v_type != VDIR) { - error = ENOTDIR; - goto out1; + flags |= MNT_NOEXEC; } /* XXXAUDIT: Should we capture the type on the error path as well? */ @@ -431,22 +563,22 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 error = ENODEV; goto out1; } -#if CONFIG_MACF - error = mac_mount_check_mount(ctx, vp, - &nd.ni_cnd, vfsp->vfc_name); - if (error != 0) + + /* + * VFC_VFSLOCALARGS is not currently supported for kernel mounts + */ + if (kernelmount && (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS)) { + error = EINVAL; /* unsupported request */ goto out1; -#endif - if (ISSET(vp->v_flag, VMOUNT) && (vp->v_mountedhere != NULL)) { - error = EBUSY; + } + + error = prepare_coveredvp(vp, ctx, cnp, fstypename, ((internal_flags & KERNEL_MOUNT_NOAUTH) != 0)); + if (error != 0) { goto out1; } - vnode_lock_spin(vp); - SET(vp->v_flag, VMOUNT); - vnode_unlock(vp); /* - * Allocate and initialize the filesystem. 
+ * Allocate and initialize the filesystem (mount_t) */ MALLOC_ZONE(mp, struct mount *, (u_int32_t)sizeof(struct mount), M_MOUNT, M_WAITOK); @@ -477,35 +609,50 @@ __mac_mount(struct proc *p, register struct __mac_mount_args *uap, __unused int3 //mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; strncpy(mp->mnt_vfsstat.f_fstypename, vfsp->vfc_name, MFSTYPENAMELEN); - strncpy(mp->mnt_vfsstat.f_mntonname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN); + strncpy(mp->mnt_vfsstat.f_mntonname, cnp->cn_pnbuf, MAXPATHLEN); mp->mnt_vnodecovered = vp; mp->mnt_vfsstat.f_owner = kauth_cred_getuid(vfs_context_ucred(ctx)); - mp->mnt_devbsdunit = LOWPRI_MAX_NUM_DEV - 1; + mp->mnt_throttle_mask = LOWPRI_MAX_NUM_DEV - 1; + mp->mnt_devbsdunit = 0; /* XXX 3762912 hack to support HFS filesystem 'owner' - filesystem may update later */ vfs_setowner(mp, KAUTH_UID_NONE, KAUTH_GID_NONE); - + +#if NFSCLIENT + if (kernelmount) + mp->mnt_kern_flag |= MNTK_KERNEL_MOUNT; + if ((internal_flags & KERNEL_MOUNT_PERMIT_UNMOUNT) != 0) + mp->mnt_kern_flag |= MNTK_PERMIT_UNMOUNT; +#endif /* NFSCLIENT */ + update: /* * Set the mount level flags. */ - if (uap->flags & MNT_RDONLY) + if (flags & MNT_RDONLY) mp->mnt_flag |= MNT_RDONLY; - else if (mp->mnt_flag & MNT_RDONLY) + else if (mp->mnt_flag & MNT_RDONLY) { + // disallow read/write upgrades of file systems that + // had the TYPENAME_OVERRIDE feature set. + if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) { + error = EPERM; + goto out1; + } mp->mnt_kern_flag |= MNTK_WANTRDWR; - + } mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | - MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE | MNT_AUTOMOUNTED | - MNT_DEFWRITE | MNT_NOATIME | MNT_QUARANTINE | MNT_CPROTECT ); - - mp->mnt_flag |= uap->flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | - MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | - MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE | MNT_AUTOMOUNTED | - MNT_DEFWRITE | MNT_NOATIME | MNT_QUARANTINE | MNT_CPROTECT ); + MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE | + MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | + MNT_QUARANTINE | MNT_CPROTECT); + mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | + MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | + MNT_UNKNOWNPERMISSIONS | MNT_DONTBROWSE | + MNT_AUTOMOUNTED | MNT_DEFWRITE | MNT_NOATIME | + MNT_QUARANTINE | MNT_CPROTECT); #if CONFIG_MACF - if (uap->flags & MNT_MULTILABEL) { + if (flags & MNT_MULTILABEL) { if (vfsp->vfc_vfsflags & VFC_VFSNOMACLABEL) { error = EINVAL; goto out1; @@ -513,9 +660,11 @@ update: mp->mnt_flag |= MNT_MULTILABEL; } #endif - + /* + * Process device path for local file systems if requested + */ if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) { - if (is_64bit) { + if (vfs_context_is64bit(ctx)) { if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) ) goto out1; fsmountargs += sizeof(devpath); @@ -528,16 +677,18 @@ update: fsmountargs += sizeof(tmp); } - /* if it is not update and device name needs to be parsed */ + /* Lookup device and authorize access to it */ if ((devpath)) { - NDINIT(&nd1, LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx); - if ( (error = namei(&nd1)) ) + struct nameidata nd; + + NDINIT(&nd, LOOKUP, OP_MOUNT, FOLLOW, UIO_USERSPACE, devpath, ctx); + if ( (error = namei(&nd)) ) goto out1; - strncpy(mp->mnt_vfsstat.f_mntfromname, nd1.ni_cnd.cn_pnbuf, MAXPATHLEN); - devvp = nd1.ni_vp; + strncpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN); + devvp = nd.ni_vp; - nameidone(&nd1); + nameidone(&nd); if (devvp->v_type != 
VBLK) { error = ENOTBLK; @@ -552,14 +703,16 @@ update: * permissions on the device. */ if (suser(vfs_context_ucred(ctx), NULL) != 0) { - accessmode = KAUTH_VNODE_READ_DATA; + mode_t accessmode = KAUTH_VNODE_READ_DATA; + if ((mp->mnt_flag & MNT_RDONLY) == 0) accessmode |= KAUTH_VNODE_WRITE_DATA; if ((error = vnode_authorize(devvp, NULL, accessmode, ctx)) != 0) goto out2; } } - if (devpath && ((uap->flags & MNT_UPDATE) == 0)) { + /* On first mount, preflight and open device */ + if (devpath && ((flags & MNT_UPDATE) == 0)) { if ( (error = vnode_ref(devvp)) ) goto out2; /* @@ -595,114 +748,75 @@ update: mp->mnt_devvp = devvp; device_vnode = devvp; - } else { - if ((mp->mnt_flag & MNT_RDONLY) && (mp->mnt_kern_flag & MNTK_WANTRDWR)) { - dev_t dev; - int maj; - /* - * If upgrade to read-write by non-root, then verify - * that user has necessary permissions on the device. - */ - device_vnode = mp->mnt_devvp; - - if (device_vnode) { - vnode_getalways(device_vnode); - if (suser(vfs_context_ucred(ctx), NULL)) { - if ((error = vnode_authorize(device_vnode, NULL, - KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, ctx)) != 0) { - vnode_put(device_vnode); - goto out2; - } - } + } else if ((mp->mnt_flag & MNT_RDONLY) && + (mp->mnt_kern_flag & MNTK_WANTRDWR) && + (device_vnode = mp->mnt_devvp)) { + dev_t dev; + int maj; + /* + * If upgrade to read-write by non-root, then verify + * that user has necessary permissions on the device. + */ + vnode_getalways(device_vnode); - /* Tell the device that we're upgrading */ - dev = (dev_t)device_vnode->v_rdev; - maj = major(dev); + if (suser(vfs_context_ucred(ctx), NULL) && + (error = vnode_authorize(device_vnode, NULL, + KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, + ctx)) != 0) { + vnode_put(device_vnode); + goto out2; + } - if ((u_int)maj >= (u_int)nblkdev) - panic("Volume mounted on a device with invalid major number.\n"); + /* Tell the device that we're upgrading */ + dev = (dev_t)device_vnode->v_rdev; + maj = major(dev); - error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p); + if ((u_int)maj >= (u_int)nblkdev) + panic("Volume mounted on a device with invalid major number."); - vnode_put(device_vnode); - if (error != 0) { - goto out2; - } - } - } + error = bdevsw[maj].d_open(dev, FREAD | FWRITE, S_IFBLK, p); + vnode_put(device_vnode); device_vnode = NULLVP; + if (error != 0) { + goto out2; + } } } #if CONFIG_MACF - if ((uap->flags & MNT_UPDATE) == 0) { + if ((flags & MNT_UPDATE) == 0) { mac_mount_label_init(mp); mac_mount_label_associate(ctx, mp); } - if (uap->mac_p != USER_ADDR_NULL) { - struct user_mac mac; - char *labelstr = NULL; - size_t ulen = 0; - - if ((uap->flags & MNT_UPDATE) != 0) { - error = mac_mount_check_label_update( - ctx, mp); + if (labelstr) { + if ((flags & MNT_UPDATE) != 0) { + error = mac_mount_check_label_update(ctx, mp); if (error != 0) goto out3; } - if (is_64bit) { - error = copyin(uap->mac_p, &mac, sizeof(mac)); - } else { - struct mac mac32; - error = copyin(uap->mac_p, &mac32, sizeof(mac32)); - mac.m_buflen = mac32.m_buflen; - mac.m_string = CAST_USER_ADDR_T(mac32.m_string); - } - if (error != 0) - goto out3; - if ((mac.m_buflen > MAC_MAX_LABEL_BUF_LEN) || - (mac.m_buflen < 2)) { - error = EINVAL; - goto out3; - } - MALLOC(labelstr, char *, mac.m_buflen, M_MACTEMP, M_WAITOK); - error = copyinstr(mac.m_string, labelstr, mac.m_buflen, &ulen); - if (error != 0) { - FREE(labelstr, M_MACTEMP); - goto out3; - } - AUDIT_ARG(mac_string, labelstr); - error = mac_mount_label_internalize(mp->mnt_mntlabel, labelstr); - FREE(labelstr, 
M_MACTEMP); - if (error != 0) - goto out3; } #endif - if (device_vnode != NULL) { - VNOP_IOCTL(device_vnode, DKIOCGETBSDUNIT, (caddr_t)&mp->mnt_devbsdunit, 0, NULL); - mp->mnt_devbsdunit %= LOWPRI_MAX_NUM_DEV; - } - /* * Mount the filesystem. */ error = VFS_MOUNT(mp, device_vnode, fsmountargs, ctx); - if (uap->flags & MNT_UPDATE) { + if (flags & MNT_UPDATE) { if (mp->mnt_kern_flag & MNTK_WANTRDWR) mp->mnt_flag &= ~MNT_RDONLY; mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE); mp->mnt_kern_flag &=~ MNTK_WANTRDWR; if (error) - mp->mnt_flag = flag; + mp->mnt_flag = flag; /* restore flag value */ vfs_event_signal(NULL, VQ_UPDATE, (intptr_t)NULL); lck_rw_done(&mp->mnt_rwlock); is_rwlock_locked = FALSE; if (!error) enablequotas(mp, ctx); - goto out2; + goto exit; } + /* * Put the new filesystem on the mount list after root. */ @@ -761,11 +875,14 @@ update: */ (void)VFS_START(mp, 0, ctx); - error = mount_list_add(mp); - if (error != 0) { + if (mount_list_add(mp) != 0) { + /* + * The system is shutting down trying to umount + * everything, so fail with a plausible errno. + */ + error = EBUSY; goto out4; } - lck_rw_done(&mp->mnt_rwlock); is_rwlock_locked = FALSE; @@ -818,8 +935,14 @@ update: } /* Now that mount is setup, notify the listeners */ - vfs_event_signal(NULL, VQ_MOUNT, (intptr_t)NULL); + vfs_notify_mount(pvp); } else { + /* If we fail a fresh mount, there should be no vnodes left hooked into the mountpoint. */ + if (mp->mnt_vnodelist.tqh_first != NULL) { + panic("mount_common(): mount of %s filesystem failed with %d, but vnode list is not empty.", + mp->mnt_vtable->vfc_name, error); + } + vnode_lock_spin(vp); CLR(vp->v_flag, VMOUNT); vnode_unlock(vp); @@ -833,45 +956,60 @@ update: } lck_rw_done(&mp->mnt_rwlock); is_rwlock_locked = FALSE; + + /* + * if we get here, we have a mount structure that needs to be freed, + * but since the coveredvp hasn't yet been updated to point at it, + * no need to worry about other threads holding a crossref on this mp + * so it's ok to just free it + */ mount_lock_destroy(mp); #if CONFIG_MACF mac_mount_label_destroy(mp); #endif FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT); } - nameidone(&nd); - +exit: /* - * drop I/O count on covered 'vp' and - * on the device vp if there was one + * drop I/O count on the device vp if there was one */ if (devpath && devvp) vnode_put(devvp); - vnode_put(vp); - - /* Note that we've changed something in the parent directory */ - post_event_if_success(pvp, error, NOTE_WRITE); - vnode_put(pvp); return(error); +/* Error condition exits */ out4: (void)VFS_UNMOUNT(mp, MNT_FORCE, ctx); + + /* + * If the mount has been placed on the covered vp, + * it may have been discovered by now, so we have + * to treat this just like an unmount + */ + mount_lock_spin(mp); + mp->mnt_lflag |= MNT_LDEAD; + mount_unlock(mp); + if (device_vnode != NULLVP) { vnode_rele(device_vnode); VNOP_CLOSE(device_vnode, mp->mnt_flag & MNT_RDONLY ? 
FREAD : FREAD|FWRITE, ctx); did_rele = TRUE; } + vnode_lock_spin(vp); + + mp->mnt_crossref++; vp->v_mountedhere = (mount_t) 0; + vnode_unlock(vp); - + if (have_usecount) { vnode_rele(vp); } out3: - if (devpath && ((uap->flags & MNT_UPDATE) == 0) && (!did_rele)) + if (devpath && ((flags & MNT_UPDATE) == 0) && (!did_rele)) vnode_rele(devvp); out2: if (devpath && devvp) @@ -881,47 +1019,50 @@ out1: if (is_rwlock_locked == TRUE) { lck_rw_done(&mp->mnt_rwlock); } + if (mntalloc) { + if (mp->mnt_crossref) + mount_dropcrossref(mp, vp, 0); + else { + mount_lock_destroy(mp); #if CONFIG_MACF - mac_mount_label_destroy(mp); + mac_mount_label_destroy(mp); #endif - FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT); + FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT); + } } - if (vfsp_ref) { mount_list_lock(); vfsp->vfc_refcount--; mount_list_unlock(); } - vnode_put(vp); - vnode_put(pvp); - nameidone(&nd); return(error); } -#ifdef CONFIG_IMGSRC_ACCESS /* * Flush in-core data, check for competing mount attempts, * and set VMOUNT */ -static int -prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname) +int +prepare_coveredvp(vnode_t vp, vfs_context_t ctx, struct componentname *cnp, const char *fsname, boolean_t skip_auth) { struct vnode_attr va; int error; - /* - * If the user is not root, ensure that they own the directory - * onto which we are attempting to mount. - */ - VATTR_INIT(&va); - VATTR_WANTED(&va, va_uid); - if ((error = vnode_getattr(vp, &va, ctx)) || - (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) && - (!vfs_context_issuser(ctx)))) { - error = EPERM; - goto out; + if (!skip_auth) { + /* + * If the user is not root, ensure that they own the directory + * onto which we are attempting to mount. + */ + VATTR_INIT(&va); + VATTR_WANTED(&va, va_uid); + if ((error = vnode_getattr(vp, &va, ctx)) || + (va.va_uid != kauth_cred_getuid(vfs_context_ucred(ctx)) && + (!vfs_context_issuser(ctx)))) { + error = EPERM; + goto out; + } } if ( (error = VNOP_FSYNC(vp, MNT_WAIT, ctx)) ) @@ -955,30 +1096,57 @@ out: return error; } +#if CONFIG_IMGSRC_ACCESS + +#if DEBUG +#define IMGSRC_DEBUG(args...) printf(args) +#else +#define IMGSRC_DEBUG(args...) do { } while(0) +#endif + static int authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_t *devvpp, vfs_context_t ctx) { struct nameidata nd; - vnode_t vp; + vnode_t vp, realdevvp; mode_t accessmode; int error; - NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx); - if ( (error = namei(&nd)) ) + NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx); + if ( (error = namei(&nd)) ) { + IMGSRC_DEBUG("namei() failed with %d\n", error); return error; + } - strncpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN); vp = nd.ni_vp; - nameidone(&nd); - if (vp->v_type != VBLK) { + if (!vnode_isblk(vp)) { + IMGSRC_DEBUG("Not block device.\n"); error = ENOTBLK; goto out; } - if (major(vp->v_rdev) >= nblkdev) { + + realdevvp = mp->mnt_devvp; + if (realdevvp == NULLVP) { + IMGSRC_DEBUG("No device backs the mount.\n"); error = ENXIO; goto out; } + + error = vnode_getwithref(realdevvp); + if (error != 0) { + IMGSRC_DEBUG("Coudn't get iocount on device.\n"); + goto out; + } + + if (vnode_specrdev(vp) != vnode_specrdev(realdevvp)) { + IMGSRC_DEBUG("Wrong dev_t.\n"); + error = ENXIO; + goto out1; + } + + strlcpy(mp->mnt_vfsstat.f_mntfromname, nd.ni_cnd.cn_pnbuf, MAXPATHLEN); + /* * If mount by non-root, then verify that user has necessary * permissions on the device. 
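[Editorial note: the hunk above tightens the image-source device check. The user-supplied devpath must now name a block device whose dev_t matches the device that actually backs the mount, rather than merely having a valid major number. Below is a condensed sketch of that validation sequence; the function name is illustrative, error unwinding is simplified, and only calls that appear in the hunk itself are used.]

	/*
	 * Sketch of the devpath validation added above (illustrative name,
	 * simplified unwinding): resolve the path, insist on a block device,
	 * and compare its dev_t against the mount's backing device.
	 */
	static int
	imgsrc_check_devpath(mount_t mp, user_addr_t devpath, vnode_t *devvpp,
	    vfs_context_t ctx)
	{
		struct nameidata nd;
		vnode_t vp, realdevvp;
		int error;

		NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW, UIO_USERSPACE, devpath, ctx);
		if ((error = namei(&nd)))
			return (error);
		vp = nd.ni_vp;

		if (!vnode_isblk(vp)) {
			error = ENOTBLK;	/* must be a block device */
		} else if ((realdevvp = mp->mnt_devvp) == NULLVP) {
			error = ENXIO;		/* nothing backs this mount */
		} else if ((error = vnode_getwithref(realdevvp)) == 0) {
			if (vnode_specrdev(vp) != vnode_specrdev(realdevvp))
				error = ENXIO;	/* different dev_t: reject */
			vnode_put(realdevvp);
		}

		if (error == 0)
			*devvpp = vp;		/* caller inherits the iocount */
		else
			vnode_put(vp);
		nameidone(&nd);
		return (error);
	}
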
@@ -987,12 +1155,18 @@ authorize_devpath_and_update_mntfromname(mount_t mp, user_addr_t devpath, vnode_ accessmode = KAUTH_VNODE_READ_DATA; if ((mp->mnt_flag & MNT_RDONLY) == 0) accessmode |= KAUTH_VNODE_WRITE_DATA; - if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) - goto out; + if ((error = vnode_authorize(vp, NULL, accessmode, ctx)) != 0) { + IMGSRC_DEBUG("Access denied.\n"); + goto out1; + } } *devvpp = vp; + +out1: + vnode_put(realdevvp); out: + nameidone(&nd); if (error) { vnode_put(vp); } @@ -1113,20 +1287,41 @@ mount_end_update(mount_t mp) } static int -relocate_imageboot_source(vnode_t vp, struct componentname *cnp, +get_imgsrc_rootvnode(uint32_t height, vnode_t *rvpp) +{ + vnode_t vp; + + if (height >= MAX_IMAGEBOOT_NESTING) { + return EINVAL; + } + + vp = imgsrc_rootvnodes[height]; + if ((vp != NULLVP) && (vnode_get(vp) == 0)) { + *rvpp = vp; + return 0; + } else { + return ENOENT; + } +} + +static int +relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, - boolean_t is64bit, user_addr_t fsmountargs) + boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index) { int error; mount_t mp; boolean_t placed = FALSE; - vnode_t devvp; + vnode_t devvp = NULLVP; struct vfstable *vfsp; user_addr_t devpath; char *old_mntonname; + vnode_t rvp; + uint32_t height; + uint32_t flags; /* If we didn't imageboot, nothing to move */ - if (imgsrc_rootvnode == NULLVP) { + if (imgsrc_rootvnodes[0] == NULLVP) { return EINVAL; } @@ -1135,23 +1330,84 @@ relocate_imageboot_source(vnode_t vp, struct componentname *cnp, return EPERM; } - error = vnode_get(imgsrc_rootvnode); + IMGSRC_DEBUG("looking for root vnode.\n"); + + /* + * Get root vnode of filesystem we're moving. + */ + if (by_index) { + if (is64bit) { + struct user64_mnt_imgsrc_args mia64; + error = copyin(fsmountargs, &mia64, sizeof(mia64)); + if (error != 0) { + IMGSRC_DEBUG("Failed to copy in arguments.\n"); + return error; + } + + height = mia64.mi_height; + flags = mia64.mi_flags; + devpath = mia64.mi_devpath; + } else { + struct user32_mnt_imgsrc_args mia32; + error = copyin(fsmountargs, &mia32, sizeof(mia32)); + if (error != 0) { + IMGSRC_DEBUG("Failed to copy in arguments.\n"); + return error; + } + + height = mia32.mi_height; + flags = mia32.mi_flags; + devpath = mia32.mi_devpath; + } + } else { + /* + * For binary compatibility--assumes one level of nesting. 
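+			 * (Callers that can express nesting use
+			 * MNT_IMGSRC_BY_INDEX instead and pass a
+			 * user{32,64}_mnt_imgsrc_args with an explicit
+			 * mi_height.)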
+ */ + if (is64bit) { + if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) ) + return error; + } else { + user32_addr_t tmp; + if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) ) + return error; + + /* munge into LP64 addr */ + devpath = CAST_USER_ADDR_T(tmp); + } + + height = 0; + flags = 0; + } + + if (flags != 0) { + IMGSRC_DEBUG("%s: Got nonzero flags.\n", __FUNCTION__); + return EINVAL; + } + + error = get_imgsrc_rootvnode(height, &rvp); if (error != 0) { + IMGSRC_DEBUG("getting root vnode failed with %d\n", error); return error; } + IMGSRC_DEBUG("got root vnode.\n"); + MALLOC(old_mntonname, char*, MAXPATHLEN, M_TEMP, M_WAITOK); /* Can only move once */ - mp = vnode_mount(imgsrc_rootvnode); + mp = vnode_mount(rvp); if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) { + IMGSRC_DEBUG("Already moved.\n"); error = EBUSY; goto out0; } + IMGSRC_DEBUG("Starting updated.\n"); + /* Get exclusive rwlock on mount, authorize update on mp */ error = mount_begin_update(mp , ctx, 0); if (error != 0) { + IMGSRC_DEBUG("Starting updated failed with %d\n", error); goto out0; } @@ -1160,40 +1416,38 @@ relocate_imageboot_source(vnode_t vp, struct componentname *cnp, * so we're now safe to proceed. */ if ((mp->mnt_kern_flag & MNTK_HAS_MOVED) == MNTK_HAS_MOVED) { + IMGSRC_DEBUG("Already moved [2]\n"); goto out1; } + + + IMGSRC_DEBUG("Preparing coveredvp.\n"); /* Mark covered vnode as mount in progress, authorize placing mount on top */ - error = prepare_coveredvp(vp, ctx, cnp, fsname); + error = prepare_coveredvp(vp, ctx, cnp, fsname, FALSE); if (error != 0) { + IMGSRC_DEBUG("Preparing coveredvp failed with %d.\n", error); goto out1; } + IMGSRC_DEBUG("Covered vp OK.\n"); + /* Sanity check the name caller has provided */ vfsp = mp->mnt_vtable; if (strncmp(vfsp->vfc_name, fsname, MFSNAMELEN) != 0) { + IMGSRC_DEBUG("Wrong fs name.\n"); error = EINVAL; goto out2; } /* Check the device vnode and update mount-from name, for local filesystems */ if (vfsp->vfc_vfsflags & VFC_VFSLOCALARGS) { - if (is64bit) { - if ( (error = copyin(fsmountargs, (caddr_t)&devpath, sizeof(devpath))) ) - goto out2; - fsmountargs += sizeof(devpath); - } else { - user32_addr_t tmp; - if ( (error = copyin(fsmountargs, (caddr_t)&tmp, sizeof(tmp))) ) - goto out2; - /* munge into LP64 addr */ - devpath = CAST_USER_ADDR_T(tmp); - fsmountargs += sizeof(tmp); - } + IMGSRC_DEBUG("Local, doing device validation.\n"); if (devpath != USER_ADDR_NULL) { error = authorize_devpath_and_update_mntfromname(mp, devpath, &devvp, ctx); if (error) { + IMGSRC_DEBUG("authorize_devpath_and_update_mntfromname() failed.\n"); goto out2; } @@ -1205,6 +1459,8 @@ relocate_imageboot_source(vnode_t vp, struct componentname *cnp, * Place mp on top of vnode, ref the vnode, call checkdirs(), * and increment the name cache's mount generation */ + + IMGSRC_DEBUG("About to call place_mount_and_checkdirs().\n"); error = place_mount_and_checkdirs(mp, vp, ctx); if (error != 0) { goto out2; @@ -1221,15 +1477,21 @@ relocate_imageboot_source(vnode_t vp, struct componentname *cnp, mount_unlock(mp); /* Finally, add to mount list, completely ready to go */ - error = mount_list_add(mp); - if (error != 0) { + if (mount_list_add(mp) != 0) { + /* + * The system is shutting down trying to umount + * everything, so fail with a plausible errno. 
+ */ + error = EBUSY; goto out3; } mount_end_update(mp); - vnode_put(imgsrc_rootvnode); + vnode_put(rvp); FREE(old_mntonname, M_TEMP); + vfs_notify_mount(pvp); + return 0; out3: strncpy(mp->mnt_vfsstat.f_mntonname, old_mntonname, MAXPATHLEN); @@ -1255,7 +1517,7 @@ out1: mount_end_update(mp); out0: - vnode_put(imgsrc_rootvnode); + vnode_put(rvp); FREE(old_mntonname, M_TEMP); return error; } @@ -1282,7 +1544,8 @@ enablequotas(struct mount *mp, vfs_context_t ctx) */ for (type=0; type < MAXQUOTAS; type++) { snprintf(qfpath, sizeof(qfpath), "%s/%s.%s", mp->mnt_vfsstat.f_mntonname, qfopsname, qfextension[type]); - NDINIT(&qnd, LOOKUP, FOLLOW, UIO_SYSSPACE, CAST_USER_ADDR_T(qfpath), ctx); + NDINIT(&qnd, LOOKUP, OP_MOUNT, FOLLOW, UIO_SYSSPACE, + CAST_USER_ADDR_T(qfpath), ctx); if (namei(&qnd) != 0) continue; /* option file to trigger quotas is not present */ vnode_put(qnd.ni_vp); @@ -1410,7 +1673,7 @@ unmount(__unused proc_t p, struct unmount_args *uap, __unused int32_t *retval) struct nameidata nd; vfs_context_t ctx = vfs_context_current(); - NDINIT(&nd, LOOKUP, NOTRIGGER | FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_UNMOUNT, NOTRIGGER | FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -1466,13 +1729,18 @@ safedounmount(struct mount *mp, int flags, vfs_context_t ctx) proc_t p = vfs_context_proc(ctx); /* - * Only root, or the user that did the original mount is - * permitted to unmount this filesystem. + * Skip authorization if the mount is tagged as permissive and + * this is not a forced-unmount attempt. */ - if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) && - (error = suser(kauth_cred_get(), &p->p_acflag))) - goto out; - + if (!(((mp->mnt_kern_flag & MNTK_PERMIT_UNMOUNT) != 0) && ((flags & MNT_FORCE) == 0))) { + /* + * Only root, or the user that did the original mount is + * permitted to unmount this filesystem. + */ + if ((mp->mnt_vfsstat.f_owner != kauth_cred_getuid(kauth_cred_get())) && + (error = suser(kauth_cred_get(), &p->p_acflag))) + goto out; + } /* * Don't allow unmounting the root file system. 
*/ @@ -1507,9 +1775,13 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx) int forcedunmount = 0; int lflags = 0; struct vnode *devvp = NULLVP; +#if CONFIG_TRIGGERS + int did_vflush = 0; +#endif /* CONFIG_TRIGGERS */ if (flags & MNT_FORCE) forcedunmount = 1; + mount_lock(mp); /* XXX post jaguar fix LK_DRAIN - then clean this up */ if ((flags & MNT_FORCE)) { @@ -1572,7 +1844,11 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx) } } } - + +#if CONFIG_TRIGGERS + vfs_nested_trigger_unmounts(mp, flags, ctx); + did_vflush = 1; +#endif if (forcedunmount) lflags |= FORCECLOSE; error = vflush(mp, NULLVP, SKIPSWAP | SKIPSYSTEM | SKIPROOT | lflags); @@ -1614,14 +1890,17 @@ dounmount(struct mount *mp, int flags, int withref, vfs_context_t ctx) lck_rw_done(&mp->mnt_rwlock); mount_list_remove(mp); lck_rw_lock_exclusive(&mp->mnt_rwlock); - + /* mark the mount point hook in the vp but not drop the ref yet */ if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) { - vnode_getwithref(coveredvp); - vnode_lock_spin(coveredvp); - coveredvp->v_mountedhere = (struct mount *)0; - vnode_unlock(coveredvp); - vnode_put(coveredvp); + vnode_getwithref(coveredvp); + vnode_lock_spin(coveredvp); + + mp->mnt_crossref++; + coveredvp->v_mountedhere = (struct mount *)0; + + vnode_unlock(coveredvp); + vnode_put(coveredvp); } mount_list_lock(); @@ -1650,11 +1929,33 @@ out: mp->mnt_lflag &= ~MNT_LWAIT; needwakeup = 1; } + + +#if CONFIG_TRIGGERS + /* + * Callback and context are set together under the mount lock, and + * never cleared, so we're safe to examine them here, drop the lock, + * and call out. + */ + if (mp->mnt_triggercallback != NULL) { + mount_unlock(mp); + if (error == 0) { + mp->mnt_triggercallback(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx); + } else if (did_vflush) { + mp->mnt_triggercallback(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx); + } + } else { + mount_unlock(mp); + } +#else mount_unlock(mp); +#endif /* CONFIG_TRIGGERS */ + lck_rw_done(&mp->mnt_rwlock); if (needwakeup) wakeup((caddr_t)mp); + if (!error) { if ((coveredvp != NULLVP)) { vnode_t pvp; @@ -1662,18 +1963,12 @@ out: vnode_getwithref(coveredvp); pvp = vnode_getparent(coveredvp); vnode_rele(coveredvp); - vnode_lock_spin(coveredvp); - if(mp->mnt_crossref == 0) { - vnode_unlock(coveredvp); - mount_lock_destroy(mp); -#if CONFIG_MACF - mac_mount_label_destroy(mp); -#endif - FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT); - } else { - coveredvp->v_lflag |= VL_MOUNTDEAD; - vnode_unlock(coveredvp); - } + + mount_dropcrossref(mp, coveredvp, 0); +#if CONFIG_TRIGGERS + if (coveredvp->v_resolve) + vnode_trigger_rearm(coveredvp, ctx); +#endif vnode_put(coveredvp); if (pvp) { @@ -1695,25 +1990,28 @@ out: void mount_dropcrossref(mount_t mp, vnode_t dp, int need_put) { - vnode_lock(dp); - mp->mnt_crossref--; - if (mp->mnt_crossref < 0) - panic("mount cross refs -ve"); - if (((dp->v_lflag & VL_MOUNTDEAD) == VL_MOUNTDEAD) && (mp->mnt_crossref == 0)) { - dp->v_lflag &= ~VL_MOUNTDEAD; - if (need_put) - vnode_put_locked(dp); - vnode_unlock(dp); - mount_lock_destroy(mp); -#if CONFIG_MACF - mac_mount_label_destroy(mp); -#endif - FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT); - return; - } + vnode_lock(dp); + mp->mnt_crossref--; + + if (mp->mnt_crossref < 0) + panic("mount cross refs -ve"); + + if ((mp != dp->v_mountedhere) && (mp->mnt_crossref == 0)) { + if (need_put) - vnode_put_locked(dp); + vnode_put_locked(dp); vnode_unlock(dp); + + mount_lock_destroy(mp); +#if CONFIG_MACF + mac_mount_label_destroy(mp); +#endif 
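The CONFIG_TRIGGERS callout added to dounmount() above relies on the invariant stated in its comment: callback and data are set together under the mount lock and never cleared. A sketch of that pattern, assuming the vfs_trigger_callback_t typedef from the trigger KPI; fire_unmount_trigger() is a hypothetical name:

    #if CONFIG_TRIGGERS
    /*
     * Test under the mount lock, drop it, then call out; VTC_RELEASE for
     * a successful unmount, VTC_REPLACE when the unmount failed after the
     * vflush already ran (did_vflush), matching the hunk above.
     */
    static void
    fire_unmount_trigger(mount_t mp, int error, int did_vflush, vfs_context_t ctx)
    {
    	vfs_trigger_callback_t *cb;

    	mount_lock(mp);
    	cb = mp->mnt_triggercallback;
    	mount_unlock(mp);

    	if (cb == NULL)
    		return;
    	if (error == 0)
    		cb(mp, VTC_RELEASE, mp->mnt_triggerdata, ctx);
    	else if (did_vflush)
    		cb(mp, VTC_REPLACE, mp->mnt_triggerdata, ctx);
    }
    #endif /* CONFIG_TRIGGERS */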
+ FREE_ZONE((caddr_t)mp, sizeof (struct mount), M_MOUNT); + return; + } + if (need_put) + vnode_put_locked(dp); + vnode_unlock(dp); } @@ -1806,8 +2104,8 @@ quotactl_funneled(proc_t p, struct quotactl_args *uap, __unused int32_t *retval) AUDIT_ARG(uid, uap->uid); AUDIT_ARG(cmd, uap->cmd); - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, - UIO_USERSPACE, uap->path, ctx); + NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, + uap->path, ctx); error = namei(&nd); if (error) return (error); @@ -1914,7 +2212,7 @@ statfs(__unused proc_t p, struct statfs_args *uap, __unused int32_t *retval) vfs_context_t ctx = vfs_context_current(); vnode_t vp; - NDINIT(&nd, LOOKUP, NOTRIGGER | FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_STATFS, NOTRIGGER | FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -2000,7 +2298,11 @@ statfs64_common(struct mount *mp, struct vfsstatfs *sfsp, user_addr_t bufp) sfs.f_type = mp->mnt_vtable->vfc_typenum; sfs.f_flags = mp->mnt_flag & MNT_VISFLAGMASK; sfs.f_fssubtype = sfsp->f_fssubtype; - strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSTYPENAMELEN); + if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) { + strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN); + } else { + strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSTYPENAMELEN); + } strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MAXPATHLEN); strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MAXPATHLEN); @@ -2022,7 +2324,7 @@ statfs64(__unused struct proc *p, struct statfs64_args *uap, __unused int32_t *r vfs_context_t ctxp = vfs_context_current(); vnode_t vp; - NDINIT(&nd, LOOKUP, NOTRIGGER | FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_STATFS, NOTRIGGER | FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctxp); error = namei(&nd); if (error) @@ -2068,7 +2370,7 @@ fstatfs64(__unused struct proc *p, struct fstatfs64_args *uap, __unused int32_t mp = vp->v_mount; if (!mp) { - error = EBADF; + error = EBADF; goto out; } sp = &mp->mnt_vfsstat; @@ -2470,7 +2772,7 @@ common_chdir(proc_t p, struct chdir_args *uap, int per_thread) vnode_t tvp; vfs_context_t ctx = vfs_context_current(); - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_CHDIR, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = change_dir(&nd, ctx); if (error) @@ -2572,7 +2874,7 @@ chroot(proc_t p, struct chroot_args *uap, __unused int32_t *retval) if ((error = suser(kauth_cred_get(), &p->p_acflag))) return (error); - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_CHROOT, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = change_dir(&nd, ctx); if (error) @@ -2795,7 +3097,15 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags, struct vnode_attr *v */ if (no_controlling_tty && (p->p_flag & P_CONTROLT)) { vnode_t ttyvp; - vnode_ref(vp); + + /* + * We already have a ref from vn_open_auth(), so we can demand another reference.
+ */ + error = vnode_ref_ext(vp, 0, VNODE_REF_FORCE); + if (error != 0) { + panic("vnode_ref_ext() with VNODE_REF_FORCE failed?!"); + } + session_lock(sessp); ttyvp = sessp->s_ttyvp; sessp->s_ttyvp = vp; @@ -2808,6 +3118,8 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags, struct vnode_attr *v vnode_put(vp); proc_fdlock(p); + if (flags & O_CLOEXEC) + *fdflags(p, indx) |= UF_EXCLOSE; procfdtbl_releasefd(p, indx, NULL); fp_drop(p, indx, fp, 1); proc_fdunlock(p); @@ -2887,7 +3199,8 @@ open_extended(proc_t p, struct open_extended_args *uap, int32_t *retval) if (xsecdst != NULL) VATTR_SET(&va, va_acl, &xsecdst->fsec_acl); - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, vfs_context_current()); + NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, + uap->path, vfs_context_current()); ciferror = open1(vfs_context_current(), &nd, uap->flags, &va, retval); if (xsecdst != NULL) @@ -2916,7 +3229,8 @@ open_nocancel(proc_t p, struct open_nocancel_args *uap, int32_t *retval) cmode = ((uap->mode &~ fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT; VATTR_SET(&va, va_mode, cmode & ACCESSPERMS); - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, vfs_context_current()); + NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, + uap->path, vfs_context_current()); return(open1(vfs_context_current(), &nd, uap->flags, &va, retval)); } @@ -2933,7 +3247,6 @@ mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval) struct vnode_attr va; vfs_context_t ctx = vfs_context_current(); int error; - int whiteout = 0; struct nameidata nd; vnode_t vp, dvp; @@ -2950,7 +3263,7 @@ mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval) if ((error = suser(vfs_context_ucred(ctx), &p->p_acflag))) return (error); - NDINIT(&nd, CREATE, LOCKPARENT | AUDITVNPATH1, + NDINIT(&nd, CREATE, OP_MKNOD, LOCKPARENT | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -2973,32 +3286,22 @@ mknod(proc_t p, struct mknod_args *uap, __unused int32_t *retval) case S_IFBLK: VATTR_SET(&va, va_type, VBLK); break; - case S_IFWHT: - whiteout = 1; - break; default: error = EINVAL; goto out; } #if CONFIG_MACF - if (!whiteout) { - error = mac_vnode_check_create(ctx, - nd.ni_dvp, &nd.ni_cnd, &va); - if (error) - goto out; - } + error = mac_vnode_check_create(ctx, + nd.ni_dvp, &nd.ni_cnd, &va); + if (error) + goto out; #endif if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) goto out; - if (whiteout) { - error = VNOP_WHITEOUT(dvp, &nd.ni_cnd, CREATE, ctx); - } else { - error = vn_create(dvp, &vp, &nd.ni_cnd, &va, 0, ctx); - } - if (error) + if ((error = vn_create(dvp, &vp, &nd, &va, 0, 0, NULL, ctx)) != 0) goto out; if (vp) { @@ -3050,7 +3353,7 @@ mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap) int error; struct nameidata nd; - NDINIT(&nd, CREATE, LOCKPARENT | AUDITVNPATH1, + NDINIT(&nd, CREATE, OP_MKFIFO, LOCKPARENT | AUDITVNPATH1, UIO_USERSPACE, upath, ctx); error = namei(&nd); if (error) @@ -3065,19 +3368,10 @@ mkfifo1(vfs_context_t ctx, user_addr_t upath, struct vnode_attr *vap) } VATTR_SET(vap, va_type, VFIFO); -#if CONFIG_MACF - error = mac_vnode_check_create(ctx, nd.ni_dvp, - &nd.ni_cnd, vap); - if (error) + if ((error = vn_authorize_create(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) goto out; -#endif - - - if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) - goto out; - - error = vn_create(dvp, &vp, &nd.ni_cnd, vap, 0, ctx); + error = vn_create(dvp, &vp, &nd, vap, 0, 0, 
NULL, ctx); out: /* * nameidone has to happen before we vnode_put(dvp) @@ -3263,7 +3557,7 @@ link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval) vp = dvp = lvp = NULLVP; /* look up the object we are linking to */ - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_LOOKUP, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -3297,6 +3591,9 @@ link(__unused proc_t p, struct link_args *uap, __unused int32_t *retval) } /* lookup the target node */ +#if CONFIG_TRIGGERS + nd.ni_op = OP_LINK; +#endif nd.ni_cnd.cn_nameiop = CREATE; nd.ni_cnd.cn_flags = LOCKPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK; nd.ni_dirp = uap->link; @@ -3439,7 +3736,7 @@ symlink(proc_t p, struct symlink_args *uap, __unused int32_t *retval) goto out; AUDIT_ARG(text, path); /* This is the link string */ - NDINIT(&nd, CREATE, LOCKPARENT | AUDITVNPATH1, + NDINIT(&nd, CREATE, OP_SYMLINK, LOCKPARENT | AUDITVNPATH1, UIO_USERSPACE, uap->link, ctx); error = namei(&nd); if (error) @@ -3481,6 +3778,9 @@ symlink(proc_t p, struct symlink_args *uap, __unused int32_t *retval) if (vp == NULL) { nd.ni_cnd.cn_nameiop = LOOKUP; +#if CONFIG_TRIGGERS + nd.ni_op = OP_LOOKUP; +#endif nd.ni_cnd.cn_flags = 0; error = namei(&nd); vp = nd.ni_vp; @@ -3557,7 +3857,7 @@ undelete(__unused proc_t p, struct undelete_args *uap, __unused int32_t *retval) vfs_context_t ctx = vfs_context_current(); vnode_t vp, dvp; - NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT|AUDITVNPATH1, + NDINIT(&nd, DELETE, OP_UNLINK, LOCKPARENT | DOWHITEOUT | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -3598,19 +3898,25 @@ unlink1(vfs_context_t ctx, struct nameidata *ndp, int nodelbusy) int len=0; #if CONFIG_FSE fse_info finfo; + struct vnode_attr va; #endif int flags = 0; int need_event = 0; int has_listeners = 0; int truncated_path=0; + int batched; + struct vnode_attr *vap = NULL; + #if NAMEDRSRCFORK /* unlink or delete is allowed on rsrc forks and named streams */ ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK; #endif ndp->ni_cnd.cn_flags |= LOCKPARENT; + ndp->ni_flag |= NAMEI_COMPOUNDREMOVE; cnp = &ndp->ni_cnd; +lookup_continue: error = namei(ndp); if (error) return (error); @@ -3618,57 +3924,62 @@ unlink1(vfs_context_t ctx, struct nameidata *ndp, int nodelbusy) dvp = ndp->ni_dvp; vp = ndp->ni_vp; + /* With Carbon delete semantics, busy files cannot be deleted */ if (nodelbusy) { flags |= VNODE_REMOVE_NODELETEBUSY; } - /* - * Normally, unlinking of directories is not supported. - * However, some file systems may have limited support. - */ - if ((vp->v_type == VDIR) && - !(vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSDIRLINKS)) { - error = EPERM; /* POSIX */ - } + if (vp) { + batched = vnode_compound_remove_available(vp); + /* + * The root of a mounted filesystem cannot be deleted. + */ + if (vp->v_flag & VROOT) { + error = EBUSY; + } - /* - * The root of a mounted filesystem cannot be deleted. 
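unlink1() above now sets NAMEI_COMPOUNDREMOVE and splits on whether the filesystem can do the remove as a compound VNOP: when it can ("batched"), VFS defers authorization to the filesystem and tolerates a NULL vp from namei(). A sketch of that split, with do_unlink_checks() as a hypothetical condensation of the in-line logic; the vnode_compound_remove_available() and vn_authorize_unlink() calls are the ones used above:

    static int
    do_unlink_checks(vnode_t dvp, vnode_t vp, struct componentname *cnp,
        vfs_context_t ctx, int *batchedp)
    {
    	if (vp == NULLVP) {
    		/* No vnode: only legal when the FS does compound remove. */
    		*batchedp = 1;
    		return 0;
    	}

    	*batchedp = vnode_compound_remove_available(vp);

    	/* The root of a mounted filesystem cannot be deleted. */
    	if (vp->v_flag & VROOT)
    		return EBUSY;

    	/* Non-batched: VFS authorizes now; batched FSes do it in-VNOP. */
    	return *batchedp ? 0 : vn_authorize_unlink(dvp, vp, cnp, ctx, NULL);
    }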
- */ - if (vp->v_flag & VROOT) { - error = EBUSY; - } - if (error) - goto out; + if (!batched) { + error = vn_authorize_unlink(dvp, vp, cnp, ctx, NULL); + if (error) { + goto out; + } + } + } else { + batched = 1; + if (!vnode_compound_remove_available(dvp)) { + panic("No vp, but no compound remove?"); + } + } - /* authorize the delete operation */ -#if CONFIG_MACF - if (!error) - error = mac_vnode_check_unlink(ctx, - dvp, vp, cnp); -#endif /* MAC */ - if (!error) - error = vnode_authorize(vp, ndp->ni_dvp, KAUTH_VNODE_DELETE, ctx); - if (error) - goto out; - #if CONFIG_FSE need_event = need_fsevent(FSE_DELETE, dvp); if (need_event) { - if ((vp->v_flag & VISHARDLINK) == 0) { - get_fse_info(vp, &finfo, ctx); + if (!batched) { + if ((vp->v_flag & VISHARDLINK) == 0) { + /* XXX need to get these data in batched VNOP */ + get_fse_info(vp, &finfo, ctx); + } + } else { + error = vfs_get_notify_attributes(&va); + if (error) { + goto out; + } + + vap = &va; } } #endif has_listeners = kauth_authorize_fileop_has_listeners(); if (need_event || has_listeners) { - GET_PATH(path); if (path == NULL) { - error = ENOMEM; - goto out; + GET_PATH(path); + if (path == NULL) { + error = ENOMEM; + goto out; + } } - len = safe_getpath(dvp, ndp->ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated_path); } @@ -3677,7 +3988,25 @@ unlink1(vfs_context_t ctx, struct nameidata *ndp, int nodelbusy) error = vnode_removenamedstream(dvp, vp, XATTR_RESOURCEFORK_NAME, 0, ctx); else #endif - error = VNOP_REMOVE(dvp, vp, &ndp->ni_cnd, flags, ctx); + { + error = vn_remove(dvp, &ndp->ni_vp, ndp, flags, vap, ctx); + vp = ndp->ni_vp; + if (error == EKEEPLOOKING) { + if (!batched) { + panic("EKEEPLOOKING, but not a filesystem that supports compound VNOPs?"); + } + + if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) { + panic("EKEEPLOOKING, but continue flag not set?"); + } + + if (vnode_isdir(vp)) { + error = EISDIR; + goto out; + } + goto lookup_continue; + } + } /* * Call out to allow 3rd party notification of delete. @@ -3706,6 +4035,8 @@ unlink1(vfs_context_t ctx, struct nameidata *ndp, int nodelbusy) if (need_event) { if (vp->v_flag & VISHARDLINK) { get_fse_info(vp, &finfo, ctx); + } else if (vap) { + vnode_get_fse_info_from_vap(vp, &finfo, vap); } if (truncated_path) { finfo.mode |= FSE_TRUNCATED_PATH; @@ -3717,27 +4048,30 @@ unlink1(vfs_context_t ctx, struct nameidata *ndp, int nodelbusy) } #endif } + +out: if (path != NULL) RELEASE_PATH(path); - /* - * nameidone has to happen before we vnode_put(dvp) - * since it may need to release the fs_nodelock on the dvp - */ -out: #if NAMEDRSRCFORK /* recycle the deleted rsrc fork vnode to force a reclaim, which * will cause its shadow file to go away if necessary. 
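With a compound remove there may be no vnode before the operation runs, so the code above primes a vnode_attr with vfs_get_notify_attributes() beforehand and, afterwards, builds the fsevent from that vap instead of calling get_fse_info() on the (possibly just-created) vnode. A sketch of the post-operation half, with emit_delete_event() as a hypothetical wrapper around the calls used in the hunk:

    #if CONFIG_FSE
    /*
     * vap != NULL means the FS filled attributes in during a batched
     * (compound) operation; otherwise the info was captured up front.
     */
    static void
    emit_delete_event(vnode_t vp, struct vnode_attr *vap, char *path,
        int len, vfs_context_t ctx)
    {
    	fse_info finfo;

    	if (vap != NULL)
    		vnode_get_fse_info_from_vap(vp, &finfo, vap);
    	else
    		get_fse_info(vp, &finfo, ctx);

    	add_fsevent(FSE_DELETE, ctx,
    	    FSE_ARG_STRING, len, path,
    	    FSE_ARG_FINFO, &finfo,
    	    FSE_ARG_DONE);
    }
    #endif /* CONFIG_FSE */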
*/ - if ((vnode_isnamedstream(ndp->ni_vp)) && - (ndp->ni_vp->v_parent != NULLVP) && - vnode_isshadow(ndp->ni_vp)) { - vnode_recycle(ndp->ni_vp); + if (vp && (vnode_isnamedstream(vp)) && + (vp->v_parent != NULLVP) && + vnode_isshadow(vp)) { + vnode_recycle(vp); } #endif + /* + * nameidone has to happen before we vnode_put(dvp) + * since it may need to release the fs_nodelock on the dvp + */ nameidone(ndp); vnode_put(dvp); - vnode_put(vp); + if (vp) { + vnode_put(vp); + } return (error); } @@ -3750,7 +4084,8 @@ unlink(__unused proc_t p, struct unlink_args *uap, __unused int32_t *retval) struct nameidata nd; vfs_context_t ctx = vfs_context_current(); - NDINIT(&nd, DELETE, AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); + NDINIT(&nd, DELETE, OP_UNLINK, AUDITVNPATH1, UIO_USERSPACE, + uap->path, ctx); return unlink1(ctx, &nd, 0); } @@ -3763,7 +4098,8 @@ delete(__unused proc_t p, struct delete_args *uap, __unused int32_t *retval) struct nameidata nd; vfs_context_t ctx = vfs_context_current(); - NDINIT(&nd, DELETE, AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); + NDINIT(&nd, DELETE, OP_UNLINK, AUDITVNPATH1, UIO_USERSPACE, + uap->path, ctx); return unlink1(ctx, &nd, 1); } @@ -4132,7 +4468,9 @@ access_extended(__unused proc_t p, struct access_extended_args *uap, __unused in niopts |= WANTPARENT; /* do the lookup */ - NDINIT(&nd, LOOKUP, niopts, UIO_SYSSPACE, CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset), &context); + NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_SYSSPACE, + CAST_USER_ADDR_T(((const char *)input) + input[i].ad_name_offset), + &context); error = namei(&nd); if (!error) { vp = nd.ni_vp; @@ -4218,7 +4556,8 @@ access(__unused proc_t p, struct access_args *uap, __unused int32_t *retval) /* need parent for vnode_authorize for deletion test */ if (uap->flags & _DELETE_OK) niopts |= WANTPARENT; - NDINIT(&nd, LOOKUP, niopts, UIO_USERSPACE, uap->path, &context); + NDINIT(&nd, LOOKUP, OP_ACCESS, niopts, UIO_USERSPACE, + uap->path, &context); #if NAMEDRSRCFORK /* access(F_OK) calls are allowed for resource forks. 
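Nearly every NDINIT() call in this patch gains an OP_* tag (OP_UNLINK, OP_ACCESS, OP_GETATTR, ...) recording why the lookup is happening, so that trigger vnodes (CONFIG_TRIGGERS) can decide whether to fire during resolution. A minimal sketch of threading such a tag through a lookup, using the new seven-argument NDINIT() form from the hunks above; kern_lookup_for() is a hypothetical helper:

    static int
    kern_lookup_for(int op, user_addr_t path, vnode_t *vpp, vfs_context_t ctx)
    {
    	struct nameidata nd;
    	int error;

    	/* Third argument is the new OP_* tag, e.g. OP_GETATTR. */
    	NDINIT(&nd, LOOKUP, op, FOLLOW | AUDITVNPATH1, UIO_USERSPACE,
    	    path, ctx);
    	if ((error = namei(&nd)) != 0)
    		return error;

    	/* Caller receives the iocount and must vnode_put() it. */
    	*vpp = nd.ni_vp;
    	nameidone(&nd);
    	return 0;
    }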
*/ @@ -4410,7 +4749,7 @@ stat1(user_addr_t path, user_addr_t ub, user_addr_t xsecurity, user_addr_t xsecu struct nameidata nd; vfs_context_t ctx = vfs_context_current(); - NDINIT(&nd, LOOKUP, NOTRIGGER | FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_GETATTR, NOTRIGGER | FOLLOW | AUDITVNPATH1, UIO_USERSPACE, path, ctx); return(stat2(ctx, &nd, ub, xsecurity, xsecurity_size, isstat64)); } @@ -4483,7 +4822,7 @@ lstat1(user_addr_t path, user_addr_t ub, user_addr_t xsecurity, user_addr_t xsec struct nameidata nd; vfs_context_t ctx = vfs_context_current(); - NDINIT(&nd, LOOKUP, NOTRIGGER | NOFOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_GETATTR, NOTRIGGER | NOFOLLOW | AUDITVNPATH1, UIO_USERSPACE, path, ctx); return(stat2(ctx, &nd, ub, xsecurity, xsecurity_size, isstat64)); @@ -4569,7 +4908,7 @@ pathconf(__unused proc_t p, struct pathconf_args *uap, int32_t *retval) struct nameidata nd; vfs_context_t ctx = vfs_context_current(); - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_PATHCONF, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -4597,7 +4936,7 @@ readlink(proc_t p, struct readlink_args *uap, int32_t *retval) vfs_context_t ctx = vfs_context_current(); char uio_buf[ UIO_SIZEOF(1) ]; - NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_READLINK, NOFOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -4680,7 +5019,7 @@ chflags(__unused proc_t p, struct chflags_args *uap, __unused int32_t *retval) struct nameidata nd; AUDIT_ARG(fflags, uap->flags); - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -4782,7 +5121,7 @@ chmod1(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap) struct nameidata nd; int error; - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, path, ctx); if ((error = namei(&nd))) return (error); @@ -4984,7 +5323,8 @@ chown1(vfs_context_t ctx, struct chown_args *uap, __unused int32_t *retval, int AUDIT_ARG(owner, uap->uid, uap->gid); - NDINIT(&nd, LOOKUP, (follow ? FOLLOW : 0) | NOTRIGGER | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_SETATTR, + (follow ? FOLLOW : 0) | NOTRIGGER | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -5193,7 +5533,7 @@ utimes(__unused proc_t p, struct utimes_args *uap, __unused int32_t *retval) * AUDIT: Needed to change the order of operations to do the * name lookup first because auditing wants the path. 
*/ - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_SETATTR, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -5260,7 +5600,7 @@ truncate(__unused proc_t p, struct truncate_args *uap, __unused int32_t *retval) if (uap->length < 0) return(EINVAL); - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, + NDINIT(&nd, LOOKUP, OP_TRUNCATE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, uap->path, ctx); if ((error = namei(&nd))) return (error); @@ -5472,14 +5812,15 @@ copyfile(__unused proc_t p, struct copyfile_args *uap, __unused int32_t *retval) return(EINVAL); } - NDINIT(&fromnd, LOOKUP, SAVESTART | AUDITVNPATH1, + NDINIT(&fromnd, LOOKUP, OP_COPYFILE, SAVESTART | AUDITVNPATH1, UIO_USERSPACE, uap->from, ctx); if ((error = namei(&fromnd))) return (error); fvp = fromnd.ni_vp; - NDINIT(&tond, CREATE, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK, - UIO_USERSPACE, uap->to, ctx); + NDINIT(&tond, CREATE, OP_LINK, + LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNPATH2 | CN_NBMOUNTLOOK, + UIO_USERSPACE, uap->to, ctx); if ((error = namei(&tond))) { goto out1; } @@ -5552,76 +5893,101 @@ rename(__unused proc_t p, struct rename_args *uap, __unused int32_t *retval) int do_retry; int mntrename; int need_event; - const char *oname; + const char *oname = NULL; char *from_name = NULL, *to_name = NULL; int from_len=0, to_len=0; int holding_mntlock; mount_t locked_mp = NULL; - vnode_t oparent; + vnode_t oparent = NULLVP; #if CONFIG_FSE fse_info from_finfo, to_finfo; + struct vnode_attr fva, tva; #endif int from_truncated=0, to_truncated; + int batched = 0; + struct vnode_attr *fvap, *tvap; + int continuing = 0; holding_mntlock = 0; do_retry = 0; retry: fvp = tvp = NULL; fdvp = tdvp = NULL; + fvap = tvap = NULL; mntrename = FALSE; - NDINIT(&fromnd, DELETE, WANTPARENT | AUDITVNPATH1, UIO_USERSPACE, uap->from, ctx); + NDINIT(&fromnd, DELETE, OP_UNLINK, WANTPARENT | AUDITVNPATH1, + UIO_USERSPACE, uap->from, ctx); + fromnd.ni_flag = NAMEI_COMPOUNDRENAME; + + NDINIT(&tond, RENAME, OP_RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK, + UIO_USERSPACE, uap->to, ctx); + tond.ni_flag = NAMEI_COMPOUNDRENAME; - if ( (error = namei(&fromnd)) ) - goto out1; - fdvp = fromnd.ni_dvp; - fvp = fromnd.ni_vp; +continue_lookup: + if ((fromnd.ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) { + if ( (error = namei(&fromnd)) ) + goto out1; + fdvp = fromnd.ni_dvp; + fvp = fromnd.ni_vp; -#if CONFIG_MACF - error = mac_vnode_check_rename_from(ctx, fdvp, fvp, &fromnd.ni_cnd); - if (error) - goto out1; -#endif + if (fvp && fvp->v_type == VDIR) + tond.ni_cnd.cn_flags |= WILLBEDIR; + } - NDINIT(&tond, RENAME, WANTPARENT | AUDITVNPATH2 | CN_NBMOUNTLOOK , UIO_USERSPACE, uap->to, ctx); - if (fvp->v_type == VDIR) - tond.ni_cnd.cn_flags |= WILLBEDIR; + if ((tond.ni_flag & NAMEI_CONTLOOKUP) != 0 || !continuing) { + if ( (error = namei(&tond)) ) { + /* + * Translate error code for rename("dir1", "dir2/."). + */ + if (error == EISDIR && fvp->v_type == VDIR) + error = EINVAL; + goto out1; + } + tdvp = tond.ni_dvp; + tvp = tond.ni_vp; + } - if ( (error = namei(&tond)) ) { - /* - * Translate error code for rename("dir1", "dir2/."). + batched = vnode_compound_rename_available(fdvp); + if (!fvp) { + /* + * Claim: this check will never reject a valid rename. + * For success, either fvp must be on the same mount as tdvp, or fvp must sit atop a vnode on the same mount as tdvp. + * Suppose fdvp and tdvp are not on the same mount. 
+ * If fvp is on the same mount as tdvp, then fvp is not on the same mount as fdvp, so fvp is the root of its filesystem. If fvp is the root, + * then you can't move it to within another dir on the same mountpoint. + * If fvp sits atop a vnode on the same mount as fdvp, then that vnode must be part of the same mount as fdvp, which is a contradiction. + * + * If this check passes, then we are safe to pass these vnodes to the same FS. */ - if (error == EISDIR && fvp->v_type == VDIR) - error = EINVAL; - goto out1; + if (fdvp->v_mount != tdvp->v_mount) { + error = EXDEV; + goto out1; + } + goto skipped_lookup; } - tdvp = tond.ni_dvp; - tvp = tond.ni_vp; - -#if CONFIG_MACF - error = mac_vnode_check_rename_to(ctx, - tdvp, tvp, fdvp == tdvp, &tond.ni_cnd); - if (error) - goto out1; -#endif - if (tvp != NULL) { - if (fvp->v_type == VDIR && tvp->v_type != VDIR) { - error = ENOTDIR; - goto out1; - } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { - error = EISDIR; + if (!batched) { + error = vn_authorize_rename(fdvp, fvp, &fromnd.ni_cnd, tdvp, tvp, &tond.ni_cnd, ctx, NULL); + if (error) { + if (error == ENOENT) { + /* + * We encountered a race where after doing the namei, tvp stops + * being valid. If so, simply re-drive the rename call from the + * top. + */ + do_retry = 1; + } goto out1; } } - if (fvp == tdvp) { - error = EINVAL; - goto out1; - } + /* * If the source and destination are the same (i.e. they're * links to the same vnode) and the target file system is * case sensitive, then there is nothing to do. + * + * XXX Come back to this. */ if (fvp == tvp) { int pathconf_val; @@ -5636,93 +6002,15 @@ retry: } } - /* - * Authorization. - * - * If tvp is a directory and not the same as fdvp, or tdvp is not - * the same as fdvp, the node is moving between directories and we - * need rights to remove from the old and add to the new. - * - * If tvp already exists and is not a directory, we need to be - * allowed to delete it. - * - * Note that we do not inherit when renaming. - * - * XXX This needs to be revisited to implement the deferred-inherit bit - */ - { - int moving = 0; - - error = 0; - if ((tvp != NULL) && vnode_isdir(tvp)) { - if (tvp != fdvp) - moving = 1; - } else if (tdvp != fdvp) { - moving = 1; - } - /* - * must have delete rights to remove the old name even in - * the simple case of fdvp == tdvp. - * - * If fvp is a directory, and we are changing it's parent, - * then we also need rights to rewrite its ".." entry as well. - */ - if (vnode_isdir(fvp)) { - if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) - goto auth_exit; - } else { - if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE, ctx)) != 0) - goto auth_exit; - } - if (moving) { - /* moving into tdvp or tvp, must have rights to add */ - if ((error = vnode_authorize(((tvp != NULL) && vnode_isdir(tvp)) ? tvp : tdvp, - NULL, - vnode_isdir(fvp) ? KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, - ctx)) != 0) { - /* - * We could encounter a race where after doing the namei, tvp stops - * being valid. If so, simply re-drive the rename call from the - * top. - */ - if (error == ENOENT) { - do_retry = 1; - } - goto auth_exit; - } - } else { - /* node staying in same directory, must be allowed to add new name */ - if ((error = vnode_authorize(fdvp, NULL, - vnode_isdir(fvp) ? 
KAUTH_VNODE_ADD_SUBDIRECTORY : KAUTH_VNODE_ADD_FILE, ctx)) != 0) - goto auth_exit; - } - /* overwriting tvp */ - if ((tvp != NULL) && !vnode_isdir(tvp) && - ((error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE, ctx)) != 0)) { - /* - * We could encounter a race where after doing the namei, tvp stops - * being valid. If so, simply re-drive the rename call from the - * top. - */ - if (error == ENOENT) { - do_retry = 1; - } - goto auth_exit; - } - - /* XXX more checks? */ - -auth_exit: - /* authorization denied */ - if (error != 0) - goto out1; - } /* * Allow the renaming of mount points. * - target must not exist * - target must reside in the same directory as source * - union mounts cannot be renamed * - "/" cannot be renamed + * + * XXX Handle this in VFS after a continued lookup (if we missed + * in the cache to start off) */ if ((fvp->v_flag & VROOT) && (fvp->v_type == VDIR) && @@ -5752,35 +6040,6 @@ auth_exit: error = EXDEV; goto out1; } - /* - * Avoid renaming "." and "..". - */ - if (fvp->v_type == VDIR && - ((fdvp == fvp) || - (fromnd.ni_cnd.cn_namelen == 1 && fromnd.ni_cnd.cn_nameptr[0] == '.') || - ((fromnd.ni_cnd.cn_flags | tond.ni_cnd.cn_flags) & ISDOTDOT)) ) { - error = EINVAL; - goto out1; - } - /* - * The following edge case is caught here: - * (to cannot be a descendent of from) - * - * o fdvp - * / - * / - * o fvp - * \ - * \ - * o tdvp - * / - * / - * o tvp - */ - if (tdvp->v_parent == fvp) { - error = EINVAL; - goto out1; - } /* * If source is the same as the destination (that is the @@ -5799,6 +6058,8 @@ auth_exit: * NOTE - that fvp == tvp also occurs if they are hard linked and * that correct behaviour then is just to return success without doing * anything. + * + * XXX filesystem should take care of this itself, perhaps... */ if (fvp == tvp && fdvp == tdvp) { if (fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen && @@ -5882,17 +6143,35 @@ auth_exit: holding_mntlock = 0; } } + // save these off so we can later verify that fvp is the same oname = fvp->v_name; oparent = fvp->v_parent; +skipped_lookup: #if CONFIG_FSE - need_event = need_fsevent(FSE_RENAME, fvp); + need_event = need_fsevent(FSE_RENAME, fdvp); if (need_event) { - get_fse_info(fvp, &from_finfo, ctx); + if (fvp) { + get_fse_info(fvp, &from_finfo, ctx); + } else { + error = vfs_get_notify_attributes(&fva); + if (error) { + goto out1; + } + + fvap = &fva; + } if (tvp) { get_fse_info(tvp, &to_finfo, ctx); + } else if (batched) { + error = vfs_get_notify_attributes(&tva); + if (error) { + goto out1; + } + + tvap = &tva; } } #else @@ -5900,26 +6179,30 @@ auth_exit: #endif /* CONFIG_FSE */ if (need_event || kauth_authorize_fileop_has_listeners()) { - GET_PATH(from_name); if (from_name == NULL) { - error = ENOMEM; - goto out1; + GET_PATH(from_name); + if (from_name == NULL) { + error = ENOMEM; + goto out1; + } } from_len = safe_getpath(fdvp, fromnd.ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated); - GET_PATH(to_name); if (to_name == NULL) { - error = ENOMEM; - goto out1; + GET_PATH(to_name); + if (to_name == NULL) { + error = ENOMEM; + goto out1; + } } to_len = safe_getpath(tdvp, tond.ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated); } - error = VNOP_RENAME(fdvp, fvp, &fromnd.ni_cnd, - tdvp, tvp, &tond.ni_cnd, - ctx); + error = vn_rename(fdvp, &fvp, &fromnd.ni_cnd, fvap, + tdvp, &tvp, &tond.ni_cnd, tvap, + 0, ctx); if (holding_mntlock) { /* @@ -5931,16 +6214,29 @@ auth_exit: holding_mntlock = 0; } if (error) { - /* - * We may encounter a race in the VNOP where the destination didn't - * exist when 
we did the namei, but it does by the time we go and - * try to create the entry. In this case, we should re-drive this rename - * call from the top again. Currently, only HFS bubbles out ERECYCLE, + if (error == EKEEPLOOKING) { + if ((fromnd.ni_flag & NAMEI_CONTLOOKUP) == 0) { + if ((tond.ni_flag & NAMEI_CONTLOOKUP) == 0) { + panic("EKEEPLOOKING without NAMEI_CONTLOOKUP on either ndp?"); + } + } + + fromnd.ni_vp = fvp; + tond.ni_vp = tvp; + + goto continue_lookup; + } + + /* + * We may encounter a race in the VNOP where the destination didn't + * exist when we did the namei, but it does by the time we go and + * try to create the entry. In this case, we should re-drive this rename + * call from the top again. Currently, only HFS bubbles out ERECYCLE, * but other filesystems susceptible to this race could return it, too. - */ - if (error == ERECYCLE) { - do_retry = 1; - } + */ + if (error == ERECYCLE) { + do_retry = 1; + } goto out1; } @@ -5958,6 +6254,14 @@ auth_exit: // set it here since only the from_finfo gets reported up to user space from_finfo.mode |= FSE_TRUNCATED_PATH; } + + if (tvap && tvp) { + vnode_get_fse_info_from_vap(tvp, &to_finfo, tvap); + } + if (fvap) { + vnode_get_fse_info_from_vap(fvp, &from_finfo, fvap); + } + if (tvp) { add_fsevent(FSE_RENAME, ctx, FSE_ARG_STRING, from_len, from_name, @@ -6020,8 +6324,10 @@ auth_exit: * check that fvp has the same name/parent pointers it * had before the rename call... this is a 'weak' check * at best... + * + * XXX oparent and oname may not be set in the compound vnop case */ - if (oname == fvp->v_name && oparent == fvp->v_parent) { + if (batched || (oname == fvp->v_name && oparent == fvp->v_parent)) { int update_flags; update_flags = VNODE_UPDATE_NAME; @@ -6068,12 +6374,12 @@ out1: vnode_put(fdvp); } - /* - * If things changed after we did the namei, then we will re-drive - * this rename call from the top. - */ + /* + * If things changed after we did the namei, then we will re-drive + * this rename call from the top. + */ if(do_retry) { - do_retry = 0; + do_retry = 0; goto retry; } @@ -6096,12 +6402,16 @@ mkdir1(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap) vnode_t vp, dvp; int error; int update_flags = 0; + int batched; struct nameidata nd; AUDIT_ARG(mode, vap->va_mode); - NDINIT(&nd, CREATE, LOCKPARENT | AUDITVNPATH1, - UIO_USERSPACE, path, ctx); + NDINIT(&nd, CREATE, OP_MKDIR, LOCKPARENT | AUDITVNPATH1, UIO_USERSPACE, + path, ctx); nd.ni_cnd.cn_flags |= WILLBEDIR; + nd.ni_flag = NAMEI_COMPOUNDMKDIR; + +continue_lookup: error = namei(&nd); if (error) return (error); @@ -6112,24 +6422,56 @@ mkdir1(vfs_context_t ctx, user_addr_t path, struct vnode_attr *vap) error = EEXIST; goto out; } + + batched = vnode_compound_mkdir_available(dvp); VATTR_SET(vap, va_type, VDIR); - -#if CONFIG_MACF - error = mac_vnode_check_create(ctx, - nd.ni_dvp, &nd.ni_cnd, vap); - if (error) + + /* + * XXX + * Don't authorize in VFS for compound VNOP.... mkdir -p today assumes that it will + * only get EXISTS or EISDIR for existing path components, and not that it could see + * EACCESS/EPERM--so if we authorize for mkdir on "/" for "mkdir -p /tmp/foo/bar/baz" + * it will fail in a spurious manner. Need to figure out if this is valid behavior. 
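The rename path above re-drives itself from the top whenever the world changes between namei() and the VNOP: ERECYCLE bubbled out of the filesystem, or ENOENT from authorization when tvp went stale. The retry skeleton, reduced to its shape; do_rename_once() is a hypothetical wrapper for the lookup-plus-vn_rename() body above:

    static int do_rename_once(struct rename_args *uap, vfs_context_t ctx);

    static int
    rename_with_retry(struct rename_args *uap, vfs_context_t ctx)
    {
    	int error;

    	for (;;) {
    		error = do_rename_once(uap, ctx);
    		/*
    		 * ERECYCLE (and, in the authorization path, ENOENT) means
    		 * the target went stale after namei(): re-drive the whole
    		 * lookup, exactly like the do_retry flag above.
    		 */
    		if (error != ERECYCLE)
    			break;
    	}
    	return error;
    }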
+ */ + if ((error = vn_authorize_mkdir(dvp, &nd.ni_cnd, vap, ctx, NULL)) != 0) { + if (error == EACCES || error == EPERM) { + int error2; + + nameidone(&nd); + vnode_put(dvp); + dvp = NULLVP; + + /* + * Try a lookup without "NAMEI_COMPOUNDVNOP" to make sure we return EEXIST + * rather than EACCESS if the target exists. + */ + NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, UIO_USERSPACE, + path, ctx); + error2 = namei(&nd); + if (error2) { + goto out; + } else { + vp = nd.ni_vp; + error = EEXIST; + goto out; + } + } + goto out; -#endif + } + + /* + * make the directory + */ + if ((error = vn_create(dvp, &vp, &nd, vap, 0, 0, NULL, ctx)) != 0) { + if (error == EKEEPLOOKING) { + nd.ni_vp = vp; + goto continue_lookup; + } - /* authorize addition of a directory to the parent */ - if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0) - goto out; - - - /* make the directory */ - if ((error = vn_create(dvp, &vp, &nd.ni_cnd, vap, 0, ctx)) != 0) goto out; + } // Make sure the name & parent pointers are hooked up if (vp->v_name == NULL) @@ -6152,8 +6494,9 @@ out: nameidone(&nd); if (vp) - vnode_put(vp); - vnode_put(dvp); + vnode_put(vp); + if (dvp) + vnode_put(dvp); return (error); } @@ -6219,10 +6562,19 @@ rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval) vnode_t vp, dvp; int error; struct nameidata nd; + char *path = NULL; + int len=0; + int has_listeners = 0; + int need_event = 0; + int truncated = 0; vfs_context_t ctx = vfs_context_current(); +#if CONFIG_FSE + struct vnode_attr va; +#endif /* CONFIG_FSE */ + struct vnode_attr *vap = NULL; + int batched; int restart_flag; - uint32_t oldvp_id = UINT32_MAX; /* * This loop exists to restart rmdir in the unlikely case that two @@ -6230,10 +6582,13 @@ rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval) * containing orphaned appleDouble files. */ do { + NDINIT(&nd, DELETE, OP_RMDIR, LOCKPARENT | AUDITVNPATH1, + UIO_USERSPACE, uap->path, ctx); + nd.ni_flag = NAMEI_COMPOUNDRMDIR; +continue_lookup: restart_flag = 0; + vap = NULL; - NDINIT(&nd, DELETE, LOCKPARENT | AUDITVNPATH1, - UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) return (error); @@ -6241,132 +6596,153 @@ rmdir(__unused proc_t p, struct rmdir_args *uap, __unused int32_t *retval) dvp = nd.ni_dvp; vp = nd.ni_vp; + if (vp) { + batched = vnode_compound_rmdir_available(vp); - /* - * If being restarted check if the new vp - * still has the same v_id. - */ - if (oldvp_id != UINT32_MAX && oldvp_id != vp->v_id) { - error = ENOENT; - goto out; - } + if (vp->v_flag & VROOT) { + /* + * The root of a mounted filesystem cannot be deleted. + */ + error = EBUSY; + goto out; + } - if (vp->v_type != VDIR) { - /* - * rmdir only deals with directories - */ - error = ENOTDIR; - } else if (dvp == vp) { /* - * No rmdir "." please. + * Removed a check here; we used to abort if vp's vid + * was not the same as what we'd seen the last time around. + * I do not think that check was valid, because if we retry + * and all dirents are gone, the directory could legitimately + * be recycled but still be present in a situation where we would + * have had permission to delete. Therefore, we won't make + * an effort to preserve that check now that we may not have a + * vp here. */ - error = EINVAL; - } else if (vp->v_flag & VROOT) { - /* - * The root of a mounted filesystem cannot be deleted. 
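The EACCES/EPERM fallback in mkdir1() above exists because, with compound mkdir, authorization can fail before VFS knows whether the leaf already exists; `mkdir -p` expects EEXIST for existing components, not a permission error. A sketch of that fixup, with mkdir_auth_fixup() as a hypothetical condensation of the re-lookup in the hunk:

    static int
    mkdir_auth_fixup(user_addr_t path, int autherror, vfs_context_t ctx)
    {
    	struct nameidata nd;

    	if (autherror != EACCES && autherror != EPERM)
    		return autherror;

    	/* Plain (non-compound) lookup: does the target already exist? */
    	NDINIT(&nd, LOOKUP, OP_MKDIR, AUDITVNPATH1, UIO_USERSPACE,
    	    path, ctx);
    	if (namei(&nd) != 0)
    		return autherror;	/* nothing there: the denial stands */

    	nameidone(&nd);
    	vnode_put(nd.ni_vp);
    	return EEXIST;			/* it exists: report that instead */
    }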
- */ - error = EBUSY; + + if (!batched) { + error = vn_authorize_rmdir(dvp, vp, &nd.ni_cnd, ctx, NULL); + if (error) { + goto out; + } + } } else { -#if CONFIG_MACF - error = mac_vnode_check_unlink(ctx, dvp, - vp, &nd.ni_cnd); - if (!error) -#endif - error = vnode_authorize(vp, nd.ni_dvp, KAUTH_VNODE_DELETE, ctx); + batched = 1; + + if (!vnode_compound_rmdir_available(dvp)) { + panic("No error, but no compound rmdir?"); + } } - if (!error) { - char *path = NULL; - int len=0; - int has_listeners = 0; - int need_event = 0; - int truncated = 0; + #if CONFIG_FSE - fse_info finfo; + fse_info finfo; - need_event = need_fsevent(FSE_DELETE, dvp); - if (need_event) { + need_event = need_fsevent(FSE_DELETE, dvp); + if (need_event) { + if (!batched) { get_fse_info(vp, &finfo, ctx); + } else { + error = vfs_get_notify_attributes(&va); + if (error) { + goto out; + } + + vap = &va; } + } #endif - has_listeners = kauth_authorize_fileop_has_listeners(); - if (need_event || has_listeners) { + has_listeners = kauth_authorize_fileop_has_listeners(); + if (need_event || has_listeners) { + if (path == NULL) { GET_PATH(path); if (path == NULL) { error = ENOMEM; goto out; } + } - len = safe_getpath(vp, NULL, path, MAXPATHLEN, &truncated); + len = safe_getpath(dvp, nd.ni_cnd.cn_nameptr, path, MAXPATHLEN, &truncated); #if CONFIG_FSE - if (truncated) { - finfo.mode |= FSE_TRUNCATED_PATH; - } -#endif + if (truncated) { + finfo.mode |= FSE_TRUNCATED_PATH; } +#endif + } - error = VNOP_RMDIR(dvp, vp, &nd.ni_cnd, ctx); - - /* - * Special case to remove orphaned AppleDouble - * files. I don't like putting this in the kernel, - * but carbon does not like putting this in carbon either, - * so here we are. - */ - if (error == ENOTEMPTY) { - error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag); - if (error == EBUSY) { - oldvp_id = vp->v_id; - goto out; - } + error = vn_rmdir(dvp, &vp, &nd, vap, ctx); + nd.ni_vp = vp; + if (vp == NULLVP) { + /* Couldn't find a vnode */ + goto out; + } + if (error == EKEEPLOOKING) { + goto continue_lookup; + } - /* - * Assuming everything went well, we will try the RMDIR again - */ - if (!error) - error = VNOP_RMDIR(dvp, vp, &nd.ni_cnd, ctx); + /* + * Special case to remove orphaned AppleDouble + * files. I don't like putting this in the kernel, + * but carbon does not like putting this in carbon either, + * so here we are. + */ + if (error == ENOTEMPTY) { + error = rmdir_remove_orphaned_appleDouble(vp, ctx, &restart_flag); + if (error == EBUSY) { + goto out; } + /* - * Call out to allow 3rd party notification of delete. - * Ignore result of kauth_authorize_fileop call. + * Assuming everything went well, we will try the RMDIR again */ - if (!error) { - if (has_listeners) { - kauth_authorize_fileop(vfs_context_ucred(ctx), - KAUTH_FILEOP_DELETE, - (uintptr_t)vp, - (uintptr_t)path); - } + if (!error) + error = vn_rmdir(dvp, &vp, &nd, vap, ctx); + } - if (vp->v_flag & VISHARDLINK) { - // see the comment in unlink1() about why we update - // the parent of a hard link when it is removed - vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT); - } + /* + * Call out to allow 3rd party notification of delete. + * Ignore result of kauth_authorize_fileop call. 
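The rmdir path above keeps its special case for directories that only hold orphaned AppleDouble ("._*") entries: on ENOTEMPTY it sweeps those out and tries once more. A sketch of that retry, assuming the vn_rmdir() and rmdir_remove_orphaned_appleDouble() signatures visible in the hunk (the EKEEPLOOKING leg is omitted here for brevity); rmdir_with_appledouble_cleanup() is a hypothetical name:

    static int
    rmdir_with_appledouble_cleanup(vnode_t dvp, vnode_t *vpp,
        struct nameidata *ndp, struct vnode_attr *vap, vfs_context_t ctx,
        int *restart_flag)
    {
    	int error = vn_rmdir(dvp, vpp, ndp, vap, ctx);

    	if (error == ENOTEMPTY) {
    		/* Sweep orphaned "._*" files, then retry the rmdir once. */
    		error = rmdir_remove_orphaned_appleDouble(*vpp, ctx,
    		    restart_flag);
    		if (error == 0)
    			error = vn_rmdir(dvp, vpp, ndp, vap, ctx);
    	}
    	return error;
    }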
+ */ + if (!error) { + if (has_listeners) { + kauth_authorize_fileop(vfs_context_ucred(ctx), + KAUTH_FILEOP_DELETE, + (uintptr_t)vp, + (uintptr_t)path); + } + + if (vp->v_flag & VISHARDLINK) { + // see the comment in unlink1() about why we update + // the parent of a hard link when it is removed + vnode_update_identity(vp, NULL, NULL, 0, 0, VNODE_UPDATE_PARENT); + } #if CONFIG_FSE - if (need_event) { - add_fsevent(FSE_DELETE, ctx, - FSE_ARG_STRING, len, path, - FSE_ARG_FINFO, &finfo, - FSE_ARG_DONE); + if (need_event) { + if (vap) { + vnode_get_fse_info_from_vap(vp, &finfo, vap); } -#endif + add_fsevent(FSE_DELETE, ctx, + FSE_ARG_STRING, len, path, + FSE_ARG_FINFO, &finfo, + FSE_ARG_DONE); } - if (path != NULL) - RELEASE_PATH(path); +#endif } out: + if (path != NULL) { + RELEASE_PATH(path); + path = NULL; + } /* * nameidone has to happen before we vnode_put(dvp) * since it may need to release the fs_nodelock on the dvp */ nameidone(&nd); - vnode_put(dvp); - vnode_put(vp); + + if (vp) + vnode_put(vp); if (restart_flag == 0) { wakeup_one((caddr_t)vp); @@ -6389,7 +6765,8 @@ vnode_readdir64(struct vnode *vp, struct uio *uio, int flags, int *eofflag, int *numdirent, vfs_context_t ctxp) { /* Check if fs natively supports VNODE_READDIR_EXTENDED */ - if (vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) { + if ((vp->v_mount->mnt_vtable->vfc_vfsflags & VFC_VFSREADDIR_EXTENDED) && + ((vp->v_mount->mnt_kern_flag & MNTK_DENY_READDIREXT) == 0)) { return VNOP_READDIR(vp, uio, flags, eofflag, numdirent, ctxp); } else { size_t bufsize; @@ -6673,8 +7050,8 @@ revoke(proc_t p, struct revoke_args *uap, __unused int32_t *retval) int error; struct nameidata nd; - NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNPATH1, - UIO_USERSPACE, uap->path, ctx); + NDINIT(&nd, LOOKUP, OP_REVOKE, FOLLOW | AUDITVNPATH1, UIO_USERSPACE, + uap->path, ctx); error = namei(&nd); if (error) return (error); @@ -6922,24 +7299,24 @@ exchangedata (__unused proc_t p, struct exchangedata_args *uap, __unused int32_t nameiflags = 0; if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW; - NDINIT(&fnd, LOOKUP, nameiflags | AUDITVNPATH1, - UIO_USERSPACE, uap->path1, ctx); + NDINIT(&fnd, LOOKUP, OP_EXCHANGEDATA, nameiflags | AUDITVNPATH1, + UIO_USERSPACE, uap->path1, ctx); - error = namei(&fnd); - if (error) - goto out2; + error = namei(&fnd); + if (error) + goto out2; nameidone(&fnd); fvp = fnd.ni_vp; - NDINIT(&snd, LOOKUP | CN_NBMOUNTLOOK, nameiflags | AUDITVNPATH2, - UIO_USERSPACE, uap->path2, ctx); + NDINIT(&snd, LOOKUP, OP_EXCHANGEDATA, CN_NBMOUNTLOOK | nameiflags | AUDITVNPATH2, + UIO_USERSPACE, uap->path2, ctx); - error = namei(&snd); - if (error) { + error = namei(&snd); + if (error) { vnode_put(fvp); goto out2; - } + } nameidone(&snd); svp = snd.ni_vp; @@ -7187,8 +7564,8 @@ searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval) nameiflags = 0; if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW; - NDINIT(&nd, LOOKUP, nameiflags | AUDITVNPATH1, - UIO_USERSPACE, uap->path, ctx); + NDINIT(&nd, LOOKUP, OP_SEARCHFS, nameiflags | AUDITVNPATH1, + UIO_USERSPACE, uap->path, ctx); error = namei(&nd); if (error) @@ -7197,6 +7574,14 @@ searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval) nameidone(&nd); vp = nd.ni_vp; +#if CONFIG_MACF + error = mac_vnode_check_searchfs(ctx, vp, &searchblock.searchattrs); + if (error) { + vnode_put(vp); + goto freeandexit; + } +#endif + /* * If searchblock.maxmatches == 0, then skip the search. 
This has happened @@ -7215,44 +7600,716 @@ searchfs(proc_t p, struct searchfs_args *uap, __unused int32_t *retval) from copying out any results... */ - fserror = VNOP_SEARCHFS(vp, - searchparams1, - searchparams2, - &searchblock.searchattrs, - (u_long)searchblock.maxmatches, - &timelimit, - returnattrs, - &nummatches, - (u_long)uap->scriptcode, - (u_long)uap->options, - auio, - state, - ctx); - -saveandexit: + fserror = VNOP_SEARCHFS(vp, + searchparams1, + searchparams2, + &searchblock.searchattrs, + (u_long)searchblock.maxmatches, + &timelimit, + returnattrs, + &nummatches, + (u_long)uap->scriptcode, + (u_long)uap->options, + auio, + state, + ctx); + +saveandexit: + + vnode_put(vp); + + /* Now copy out the stuff that needs copying out. That means the number of matches, the + search state. Everything was already put into he return buffer by the vop call. */ + + if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) + goto freeandexit; + + if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) + goto freeandexit; + + error = fserror; + +freeandexit: + + FREE(searchparams1,M_TEMP); + + return(error); + + +} /* end of searchfs system call */ + + + +lck_grp_attr_t * nspace_group_attr; +lck_attr_t * nspace_lock_attr; +lck_grp_t * nspace_mutex_group; + +lck_mtx_t nspace_handler_lock; +lck_mtx_t nspace_handler_exclusion_lock; + +time_t snapshot_timestamp=0; +int nspace_allow_virtual_devs=0; + +void nspace_handler_init(void); + +typedef struct nspace_item_info { + struct vnode *vp; + void *arg; + uint64_t op; + uint32_t vid; + uint32_t flags; + uint32_t token; + uint32_t refcount; +} nspace_item_info; + +#define MAX_NSPACE_ITEMS 128 +nspace_item_info nspace_items[MAX_NSPACE_ITEMS]; +uint32_t nspace_item_idx=0; // also used as the sleep/wakeup rendezvous address +uint32_t nspace_token_id=0; +uint32_t nspace_handler_timeout = 15; // seconds + +#define NSPACE_ITEM_NEW 0x0001 +#define NSPACE_ITEM_PROCESSING 0x0002 +#define NSPACE_ITEM_DEAD 0x0004 +#define NSPACE_ITEM_CANCELLED 0x0008 +#define NSPACE_ITEM_DONE 0x0010 +#define NSPACE_ITEM_RESET_TIMER 0x0020 + +#define NSPACE_ITEM_NSPACE_EVENT 0x0040 +#define NSPACE_ITEM_SNAPSHOT_EVENT 0x0080 +#define NSPACE_ITEM_TRACK_EVENT 0x0100 + +#define NSPACE_ITEM_ALL_EVENT_TYPES (NSPACE_ITEM_NSPACE_EVENT | NSPACE_ITEM_SNAPSHOT_EVENT | NSPACE_ITEM_TRACK_EVENT) + +//#pragma optimization_level 0 + +typedef enum { + NSPACE_HANDLER_NSPACE = 0, + NSPACE_HANDLER_SNAPSHOT = 1, + NSPACE_HANDLER_TRACK = 2, + + NSPACE_HANDLER_COUNT, +} nspace_type_t; + +typedef struct { + uint64_t handler_tid; + struct proc *handler_proc; + int handler_busy; +} nspace_handler_t; + +nspace_handler_t nspace_handlers[NSPACE_HANDLER_COUNT]; + +static inline int nspace_flags_matches_handler(uint32_t event_flags, nspace_type_t nspace_type) +{ + switch(nspace_type) { + case NSPACE_HANDLER_NSPACE: + return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_NSPACE_EVENT; + case NSPACE_HANDLER_SNAPSHOT: + return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_SNAPSHOT_EVENT; + case NSPACE_HANDLER_TRACK: + return (event_flags & NSPACE_ITEM_ALL_EVENT_TYPES) == NSPACE_ITEM_TRACK_EVENT; + default: + printf("nspace_flags_matches_handler: invalid type %u\n", (int)nspace_type); + return 0; + } +} + +static inline int nspace_item_flags_for_type(nspace_type_t nspace_type) +{ + switch(nspace_type) { + case NSPACE_HANDLER_NSPACE: + return NSPACE_ITEM_NSPACE_EVENT; + case NSPACE_HANDLER_SNAPSHOT: + return NSPACE_ITEM_SNAPSHOT_EVENT; + case 
NSPACE_HANDLER_TRACK: + return NSPACE_ITEM_TRACK_EVENT; + default: + printf("nspace_item_flags_for_type: invalid type %u\n", (int)nspace_type); + return 0; + } +} + +static inline int nspace_open_flags_for_type(nspace_type_t nspace_type) +{ + switch(nspace_type) { + case NSPACE_HANDLER_NSPACE: + return FREAD | FWRITE | O_EVTONLY; + case NSPACE_HANDLER_SNAPSHOT: + case NSPACE_HANDLER_TRACK: + return FREAD | O_EVTONLY; + default: + printf("nspace_open_flags_for_type: invalid type %u\n", (int)nspace_type); + return 0; + } +} + +static inline nspace_type_t nspace_type_for_op(uint64_t op) +{ + switch(op & NAMESPACE_HANDLER_EVENT_TYPE_MASK) { + case NAMESPACE_HANDLER_NSPACE_EVENT: + return NSPACE_HANDLER_NSPACE; + case NAMESPACE_HANDLER_SNAPSHOT_EVENT: + return NSPACE_HANDLER_SNAPSHOT; + case NAMESPACE_HANDLER_TRACK_EVENT: + return NSPACE_HANDLER_TRACK; + default: + printf("nspace_type_for_op: invalid op mask %llx\n", op & NAMESPACE_HANDLER_EVENT_TYPE_MASK); + return NSPACE_HANDLER_NSPACE; + } +} + +static inline int nspace_is_special_process(struct proc *proc) +{ + int i; + for (i = 0; i < NSPACE_HANDLER_COUNT; i++) { + if (proc == nspace_handlers[i].handler_proc) + return 1; + } + return 0; +} + +void +nspace_handler_init(void) +{ + nspace_lock_attr = lck_attr_alloc_init(); + nspace_group_attr = lck_grp_attr_alloc_init(); + nspace_mutex_group = lck_grp_alloc_init("nspace-mutex", nspace_group_attr); + lck_mtx_init(&nspace_handler_lock, nspace_mutex_group, nspace_lock_attr); + lck_mtx_init(&nspace_handler_exclusion_lock, nspace_mutex_group, nspace_lock_attr); + memset(&nspace_items[0], 0, sizeof(nspace_items)); +} + +void +nspace_proc_exit(struct proc *p) +{ + int i, event_mask = 0; + + for (i = 0; i < NSPACE_HANDLER_COUNT; i++) { + if (p == nspace_handlers[i].handler_proc) { + event_mask |= nspace_item_flags_for_type(i); + nspace_handlers[i].handler_tid = 0; + nspace_handlers[i].handler_proc = NULL; + } + } + + if (event_mask == 0) { + return; + } + + if (event_mask & NSPACE_ITEM_SNAPSHOT_EVENT) { + // if this process was the snapshot handler, zero snapshot_timeout + snapshot_timestamp = 0; + } + + // + // unblock anyone that's waiting for the handler that died + // + lck_mtx_lock(&nspace_handler_lock); + for(i=0; i < MAX_NSPACE_ITEMS; i++) { + if (nspace_items[i].flags & (NSPACE_ITEM_NEW | NSPACE_ITEM_PROCESSING)) { + + if ( nspace_items[i].flags & event_mask ) { + + if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) { + vnode_lock_spin(nspace_items[i].vp); + nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT; + vnode_unlock(nspace_items[i].vp); + } + nspace_items[i].vp = NULL; + nspace_items[i].vid = 0; + nspace_items[i].flags = NSPACE_ITEM_DONE; + nspace_items[i].token = 0; + + wakeup((caddr_t)&(nspace_items[i].vp)); + } + } + } + + wakeup((caddr_t)&nspace_item_idx); + lck_mtx_unlock(&nspace_handler_lock); +} + + +int +resolve_nspace_item(struct vnode *vp, uint64_t op) +{ + return resolve_nspace_item_ext(vp, op, NULL); +} + +int +resolve_nspace_item_ext(struct vnode *vp, uint64_t op, void *arg) +{ + int i, error, keep_waiting; + struct timespec ts; + nspace_type_t nspace_type = nspace_type_for_op(op); + + // only allow namespace events on regular files, directories and symlinks. 
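The three helpers above define parallel mappings: op bits to handler type, handler type to item flags, and item flags back to handler. An illustrative consistency check (not part of the patch) showing how they compose; it uses only names defined above:

    /*
     * Round-trip each event op through the mappings above; any mismatch
     * means the tables have drifted apart.
     */
    static void
    nspace_mapping_selftest(void)
    {
    	uint64_t ops[] = { NAMESPACE_HANDLER_NSPACE_EVENT,
    	    NAMESPACE_HANDLER_SNAPSHOT_EVENT,
    	    NAMESPACE_HANDLER_TRACK_EVENT };
    	unsigned i;

    	for (i = 0; i < sizeof(ops) / sizeof(ops[0]); i++) {
    		nspace_type_t t = nspace_type_for_op(ops[i]);
    		uint32_t flags = nspace_item_flags_for_type(t);

    		if (!nspace_flags_matches_handler(flags, t))
    			panic("nspace mapping mismatch for op %llx", ops[i]);
    	}
    }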
+ if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) { + return 0; + } + + // + // if this is a snapshot event and the vnode is on a + // disk image just pretend nothing happened since any + // change to the disk image will cause the disk image + // itself to get backed up and this avoids multi-way + // deadlocks between the snapshot handler and the ever + // popular diskimages-helper process. the variable + // nspace_allow_virtual_devs allows this behavior to + // be overridden (for use by the Mobile TimeMachine + // testing infrastructure which uses disk images) + // + if ( (op & NAMESPACE_HANDLER_SNAPSHOT_EVENT) + && (vp->v_mount != NULL) + && (vp->v_mount->mnt_kern_flag & MNTK_VIRTUALDEV) + && !nspace_allow_virtual_devs) { + + return 0; + } + + // if (thread_tid(current_thread()) == namespace_handler_tid) { + if (nspace_handlers[nspace_type].handler_proc == NULL) { + return 0; + } + + if (nspace_is_special_process(current_proc())) { + return EDEADLK; + } + + lck_mtx_lock(&nspace_handler_lock); + +retry: + for(i=0; i < MAX_NSPACE_ITEMS; i++) { + if (vp == nspace_items[i].vp && op == nspace_items[i].op) { + break; + } + } + + if (i >= MAX_NSPACE_ITEMS) { + for(i=0; i < MAX_NSPACE_ITEMS; i++) { + if (nspace_items[i].flags == 0) { + break; + } + } + } else { + nspace_items[i].refcount++; + } + + if (i >= MAX_NSPACE_ITEMS) { + ts.tv_sec = nspace_handler_timeout; + ts.tv_nsec = 0; + + error = msleep((caddr_t)&nspace_token_id, &nspace_handler_lock, PVFS|PCATCH, "nspace-no-space", &ts); + if (error == 0) { + // an entry got free'd up, go see if we can get a slot + goto retry; + } else { + lck_mtx_unlock(&nspace_handler_lock); + return error; + } + } + + // + // if it didn't already exist, add it. if it did exist + // we'll get woken up when someone does a wakeup() on + // the slot in the nspace_items table. + // + if (vp != nspace_items[i].vp) { + nspace_items[i].vp = vp; + nspace_items[i].arg = arg; + nspace_items[i].op = op; + nspace_items[i].vid = vnode_vid(vp); + nspace_items[i].flags = NSPACE_ITEM_NEW; + nspace_items[i].flags |= nspace_item_flags_for_type(nspace_type); + if (nspace_items[i].flags & NSPACE_ITEM_SNAPSHOT_EVENT) { + if (arg) { + vnode_lock_spin(vp); + vp->v_flag |= VNEEDSSNAPSHOT; + vnode_unlock(vp); + } + } + + nspace_items[i].token = 0; + nspace_items[i].refcount = 1; + + wakeup((caddr_t)&nspace_item_idx); + } + + // + // Now go to sleep until the handler does a wakeup on this + // slot in the nspace_items table (or we timeout). + // + keep_waiting = 1; + while(keep_waiting) { + ts.tv_sec = nspace_handler_timeout; + ts.tv_nsec = 0; + error = msleep((caddr_t)&(nspace_items[i].vp), &nspace_handler_lock, PVFS|PCATCH, "namespace-done", &ts); + + if (nspace_items[i].flags & NSPACE_ITEM_DONE) { + error = 0; + } else if (nspace_items[i].flags & NSPACE_ITEM_CANCELLED) { + error = nspace_items[i].token; + } else if (error == EWOULDBLOCK || error == ETIMEDOUT) { + if (nspace_items[i].flags & NSPACE_ITEM_RESET_TIMER) { + nspace_items[i].flags &= ~NSPACE_ITEM_RESET_TIMER; + continue; + } else { + error = ETIMEDOUT; + } + } else if (error == 0) { + // hmmm, why did we get woken up? 
+ printf("woken up for token %d but it's not done, cancelled or timedout and error == 0.\n", + nspace_items[i].token); + } + + if (--nspace_items[i].refcount == 0) { + nspace_items[i].vp = NULL; // clear this so that no one will match on it again + nspace_items[i].arg = NULL; + nspace_items[i].token = 0; // clear this so that the handler will not find it anymore + nspace_items[i].flags = 0; // this clears it for re-use + } + wakeup(&nspace_token_id); + keep_waiting = 0; + } + + lck_mtx_unlock(&nspace_handler_lock); + + return error; +} + + +int +get_nspace_item_status(struct vnode *vp, int32_t *status) +{ + int i; + + lck_mtx_lock(&nspace_handler_lock); + for(i=0; i < MAX_NSPACE_ITEMS; i++) { + if (nspace_items[i].vp == vp) { + break; + } + } + + if (i >= MAX_NSPACE_ITEMS) { + lck_mtx_unlock(&nspace_handler_lock); + return ENOENT; + } + + *status = nspace_items[i].flags; + lck_mtx_unlock(&nspace_handler_lock); + return 0; +} + + +#if 0 +static int +build_volfs_path(struct vnode *vp, char *path, int *len) +{ + struct vnode_attr va; + int ret; + + VATTR_INIT(&va); + VATTR_WANTED(&va, va_fsid); + VATTR_WANTED(&va, va_fileid); + + if (vnode_getattr(vp, &va, vfs_context_kernel()) != 0) { + *len = snprintf(path, *len, "/non/existent/path/because/vnode_getattr/failed") + 1; + ret = -1; + } else { + *len = snprintf(path, *len, "/.vol/%d/%lld", (dev_t)va.va_fsid, va.va_fileid) + 1; + ret = 0; + } + + return ret; +} +#endif + +// +// Note: this function does NOT check permissions on all of the +// parent directories leading to this vnode. It should only be +// called on behalf of a root process. Otherwise a process may +// get access to a file because the file itself is readable even +// though its parent directories would prevent access. +// +static int +vn_open_with_vp(vnode_t vp, int fmode, vfs_context_t ctx) +{ + int error, action; + + if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) { + return error; + } + +#if CONFIG_MACF + error = mac_vnode_check_open(ctx, vp, fmode); + if (error) + return error; +#endif - vnode_put(vp); + /* compute action to be authorized */ + action = 0; + if (fmode & FREAD) { + action |= KAUTH_VNODE_READ_DATA; + } + if (fmode & (FWRITE | O_TRUNC)) { + /* + * If we are writing, appending, and not truncating, + * indicate that we are appending so that if the + * UF_APPEND or SF_APPEND bits are set, we do not deny + * the open. + */ + if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) { + action |= KAUTH_VNODE_APPEND_DATA; + } else { + action |= KAUTH_VNODE_WRITE_DATA; + } + } - /* Now copy out the stuff that needs copying out. That means the number of matches, the - search state. Everything was already put into he return buffer by the vop call. */ + if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0) + return error; + - if ((error = copyout((caddr_t) state, uap->state, sizeof(struct searchstate))) != 0) - goto freeandexit; + // + // if the vnode is tagged VOPENEVT and the current process + // has the P_CHECKOPENEVT flag set, then we or in the O_EVTONLY + // flag to the open mode so that this open won't count against + // the vnode when carbon delete() does a vnode_isinuse() to see + // if a file is currently in use. this allows spotlight + // importers to not interfere with carbon apps that depend on + // the no-delete-if-busy semantics of carbon delete(). 
+ // + if ((vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) { + fmode |= O_EVTONLY; + } - if ((error = suulong(uap->nummatches, (uint64_t)nummatches)) != 0) - goto freeandexit; - - error = fserror; + if ( (error = VNOP_OPEN(vp, fmode, ctx)) ) { + return error; + } + if ( (error = vnode_ref_ext(vp, fmode, 0)) ) { + VNOP_CLOSE(vp, fmode, ctx); + return error; + } -freeandexit: + /* call out to allow 3rd party notification of open. + * Ignore result of kauth_authorize_fileop call. + */ + kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN, + (uintptr_t)vp, 0); - FREE(searchparams1,M_TEMP); - return(error); + return 0; +} +static int +wait_for_namespace_event(namespace_handler_info_ext *nhi, nspace_type_t nspace_type) +{ + int i, error=0, unblock=0; + task_t curtask; + + lck_mtx_lock(&nspace_handler_exclusion_lock); + if (nspace_handlers[nspace_type].handler_busy) { + lck_mtx_unlock(&nspace_handler_exclusion_lock); + return EBUSY; + } + nspace_handlers[nspace_type].handler_busy = 1; + lck_mtx_unlock(&nspace_handler_exclusion_lock); + + /* + * Any process that gets here will be one of the namespace handlers. + * As such, they should be prevented from acquiring DMG vnodes during vnode reclamation + * as we can cause deadlocks to occur, because the namespace handler may prevent + * VNOP_INACTIVE from proceeding. Mark the current task as a P_DEPENDENCY_CAPABLE + * process. + */ + curtask = current_task(); + bsd_set_dependency_capable (curtask); + + lck_mtx_lock(&nspace_handler_lock); + if (nspace_handlers[nspace_type].handler_proc == NULL) { + nspace_handlers[nspace_type].handler_tid = thread_tid(current_thread()); + nspace_handlers[nspace_type].handler_proc = current_proc(); + } + + while (error == 0) { + + for(i=0; i < MAX_NSPACE_ITEMS; i++) { + if (nspace_items[i].flags & NSPACE_ITEM_NEW) { + if (!nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) { + continue; + } + break; + } + } + + if (i < MAX_NSPACE_ITEMS) { + nspace_items[i].flags &= ~NSPACE_ITEM_NEW; + nspace_items[i].flags |= NSPACE_ITEM_PROCESSING; + nspace_items[i].token = ++nspace_token_id; + + if (nspace_items[i].vp) { + struct fileproc *fp; + int32_t indx, fmode; + struct proc *p = current_proc(); + vfs_context_t ctx = vfs_context_current(); + + fmode = nspace_open_flags_for_type(nspace_type); + + error = vnode_getwithvid(nspace_items[i].vp, nspace_items[i].vid); + if (error) { + unblock = 1; + break; + } + error = vn_open_with_vp(nspace_items[i].vp, fmode, ctx); + if (error) { + unblock = 1; + vnode_put(nspace_items[i].vp); + break; + } + + if ((error = falloc(p, &fp, &indx, ctx))) { + vn_close(nspace_items[i].vp, fmode, ctx); + vnode_put(nspace_items[i].vp); + unblock = 1; + break; + } + + fp->f_fglob->fg_flag = fmode; + fp->f_fglob->fg_type = DTYPE_VNODE; + fp->f_fglob->fg_ops = &vnops; + fp->f_fglob->fg_data = (caddr_t)nspace_items[i].vp; + + proc_fdlock(p); + procfdtbl_releasefd(p, indx, NULL); + fp_drop(p, indx, fp, 1); + proc_fdunlock(p); + + error = copyout(&nspace_items[i].token, nhi->token, sizeof(uint32_t)); + error = copyout(&nspace_items[i].op, nhi->flags, sizeof(uint64_t)); + error = copyout(&indx, nhi->fdptr, sizeof(uint32_t)); + if (nhi->infoptr) { + uio_t uio = (uio_t)nspace_items[i].arg; + uint64_t u_offset, u_length; + + if (uio) { + u_offset = uio_offset(uio); + u_length = uio_resid(uio); + } else { + u_offset = 0; + u_length = 0; + } + error = copyout(&u_offset, nhi->infoptr, sizeof(uint64_t)); + error = copyout(&u_length, nhi->infoptr+sizeof(uint64_t), 
sizeof(uint64_t)); + } + if (error) { + vn_close(nspace_items[i].vp, fmode, ctx); + fp_free(p, indx, fp); + unblock = 1; + } + + vnode_put(nspace_items[i].vp); + + break; + } else { + printf("wait_for_nspace_event: failed (nspace_items[%d] == %p error %d, name %s)\n", + i, nspace_items[i].vp, error, nspace_items[i].vp->v_name); + } + + } else { + error = msleep((caddr_t)&nspace_item_idx, &nspace_handler_lock, PVFS|PCATCH, "namespace-items", 0); + if ((nspace_type == NSPACE_HANDLER_SNAPSHOT) && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) { + error = EINVAL; + break; + } + + } + } + + if (unblock) { + if (nspace_items[i].vp && (nspace_items[i].vp->v_flag & VNEEDSSNAPSHOT)) { + vnode_lock_spin(nspace_items[i].vp); + nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT; + vnode_unlock(nspace_items[i].vp); + } + nspace_items[i].vp = NULL; + nspace_items[i].vid = 0; + nspace_items[i].flags = NSPACE_ITEM_DONE; + nspace_items[i].token = 0; + + wakeup((caddr_t)&(nspace_items[i].vp)); + } + + if (nspace_type == NSPACE_HANDLER_SNAPSHOT) { + // just go through every snapshot event and unblock it immediately. + if (error && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) { + for(i=0; i < MAX_NSPACE_ITEMS; i++) { + if (nspace_items[i].flags & NSPACE_ITEM_NEW) { + if (nspace_flags_matches_handler(nspace_items[i].flags, nspace_type)) { + nspace_items[i].vp = NULL; + nspace_items[i].vid = 0; + nspace_items[i].flags = NSPACE_ITEM_DONE; + nspace_items[i].token = 0; + + wakeup((caddr_t)&(nspace_items[i].vp)); + } + } + } + } + } + + lck_mtx_unlock(&nspace_handler_lock); + + lck_mtx_lock(&nspace_handler_exclusion_lock); + nspace_handlers[nspace_type].handler_busy = 0; + lck_mtx_unlock(&nspace_handler_exclusion_lock); + + return error; +} -} /* end of searchfs system call */ +static int process_namespace_fsctl(nspace_type_t nspace_type, int is64bit, u_int size, caddr_t data) +{ + int error = 0; + namespace_handler_info_ext nhi; + + if (nspace_type == NSPACE_HANDLER_SNAPSHOT && (snapshot_timestamp == 0 || snapshot_timestamp == ~0)) { + return EINVAL; + } + + if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) { + return error; + } + + if ( (is64bit && size != sizeof(user64_namespace_handler_info) && size != sizeof(user64_namespace_handler_info_ext)) + || (is64bit == 0 && size != sizeof(user32_namespace_handler_info) && size != sizeof(user32_namespace_handler_info_ext))) { + + // either you're 64-bit and passed a 64-bit struct or + // you're 32-bit and passed a 32-bit struct. otherwise + // it's not ok. 
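+		// (for example, a 64-bit caller may pass either
+		// user64_namespace_handler_info or the _ext variant that
+		// appends the infoptr field; which one arrived is recovered
+		// from size when filling in nhi below.)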
+ return EINVAL; + } + + if (is64bit) { + nhi.token = (user_addr_t)((user64_namespace_handler_info *)data)->token; + nhi.flags = (user_addr_t)((user64_namespace_handler_info *)data)->flags; + nhi.fdptr = (user_addr_t)((user64_namespace_handler_info *)data)->fdptr; + if (size == sizeof(user64_namespace_handler_info_ext)) { + nhi.infoptr = (user_addr_t)((user64_namespace_handler_info_ext *)data)->infoptr; + } else { + nhi.infoptr = 0; + } + } else { + nhi.token = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->token); + nhi.flags = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->flags); + nhi.fdptr = CAST_USER_ADDR_T(((user32_namespace_handler_info *)data)->fdptr); + if (size == sizeof(user32_namespace_handler_info_ext)) { + nhi.infoptr = CAST_USER_ADDR_T(((user32_namespace_handler_info_ext *)data)->infoptr); + } else { + nhi.infoptr = 0; + } + } + + return wait_for_namespace_event(&nhi, nspace_type); +} /* * Make a filesystem-specific control call: @@ -7272,7 +8329,7 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long size = IOCPARM_LEN(cmd); if (size > IOCPARM_MAX) return (EINVAL); - is64bit = proc_is64bit(p); + is64bit = proc_is64bit(p); memp = NULL; if (size > sizeof (stkbuf)) { @@ -7287,12 +8344,12 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long error = copyin(udata, data, size); if (error) goto FSCtl_Exit; } else { - if (is64bit) { - *(user_addr_t *)data = udata; - } - else { - *(uint32_t *)data = (uint32_t)udata; - } + if (is64bit) { + *(user_addr_t *)data = udata; + } + else { + *(uint32_t *)data = (uint32_t)udata; + } }; } else if ((cmd & IOC_OUT) && size) { /* @@ -7302,10 +8359,10 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long bzero(data, size); } else if (cmd & IOC_VOID) { if (is64bit) { - *(user_addr_t *)data = udata; + *(user_addr_t *)data = udata; } else { - *(uint32_t *)data = (uint32_t)udata; + *(uint32_t *)data = (uint32_t)udata; } } @@ -7349,31 +8406,31 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long *arg_vp = NULL; } else if (IOCBASECMD(cmd) == FSCTL_SET_PACKAGE_EXTS) { - user_addr_t ext_strings; - uint32_t num_entries; - uint32_t max_width; + user_addr_t ext_strings; + uint32_t num_entries; + uint32_t max_width; - if ( (is64bit && size != sizeof(user64_package_ext_info)) - || (is64bit == 0 && size != sizeof(user32_package_ext_info))) { + if ( (is64bit && size != sizeof(user64_package_ext_info)) + || (is64bit == 0 && size != sizeof(user32_package_ext_info))) { - // either you're 64-bit and passed a 64-bit struct or - // you're 32-bit and passed a 32-bit struct. otherwise - // it's not ok. - error = EINVAL; - goto FSCtl_Exit; - } + // either you're 64-bit and passed a 64-bit struct or + // you're 32-bit and passed a 32-bit struct. otherwise + // it's not ok. 
+ error = EINVAL; + goto FSCtl_Exit; + } - if (is64bit) { - ext_strings = ((user64_package_ext_info *)data)->strings; - num_entries = ((user64_package_ext_info *)data)->num_entries; - max_width = ((user64_package_ext_info *)data)->max_width; - } else { - ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings); - num_entries = ((user32_package_ext_info *)data)->num_entries; - max_width = ((user32_package_ext_info *)data)->max_width; - } + if (is64bit) { + ext_strings = ((user64_package_ext_info *)data)->strings; + num_entries = ((user64_package_ext_info *)data)->num_entries; + max_width = ((user64_package_ext_info *)data)->max_width; + } else { + ext_strings = CAST_USER_ADDR_T(((user32_package_ext_info *)data)->strings); + num_entries = ((user32_package_ext_info *)data)->num_entries; + max_width = ((user32_package_ext_info *)data)->max_width; + } - error = set_package_extensions_table(ext_strings, num_entries, max_width); + error = set_package_extensions_table(ext_strings, num_entries, max_width); } else if (IOCBASECMD(cmd) == FSCTL_WAIT_FOR_SYNC) { error = tsleep((caddr_t)&sync_wait_time, PVFS|PCATCH, "sync-wait", 0); @@ -7384,6 +8441,192 @@ fsctl_internal(proc_t p, vnode_t *arg_vp, u_long cmd, user_addr_t udata, u_long error *= -1; } + } else if (IOCBASECMD(cmd) == FSCTL_NAMESPACE_HANDLER_GET) { + error = process_namespace_fsctl(NSPACE_HANDLER_NSPACE, is64bit, size, data); + } else if (IOCBASECMD(cmd) == FSCTL_OLD_SNAPSHOT_HANDLER_GET) { + error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data); + } else if (IOCBASECMD(cmd) == FSCTL_SNAPSHOT_HANDLER_GET_EXT) { + error = process_namespace_fsctl(NSPACE_HANDLER_SNAPSHOT, is64bit, size, data); + } else if (IOCBASECMD(cmd) == FSCTL_TRACKED_HANDLER_GET) { + error = process_namespace_fsctl(NSPACE_HANDLER_TRACK, is64bit, size, data); + } else if (IOCBASECMD(cmd) == FSCTL_NAMESPACE_HANDLER_UPDATE) { + uint32_t token, val; + int i; + + if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) { + goto FSCtl_Exit; + } + + if (!nspace_is_special_process(p)) { + error = EINVAL; + goto FSCtl_Exit; + } + + token = ((uint32_t *)data)[0]; + val = ((uint32_t *)data)[1]; + + lck_mtx_lock(&nspace_handler_lock); + + for(i=0; i < MAX_NSPACE_ITEMS; i++) { + if (nspace_items[i].token == token) { + break; + } + } + + if (i >= MAX_NSPACE_ITEMS) { + error = ENOENT; + } else { + // + // if this bit is set, when resolve_nspace_item() times out + // it will loop and go back to sleep. 
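+				// a long-running handler is expected to re-arm its
+				// token from user space with something like this
+				// (sketch only; the volume path is assumed):
+				//
+				//    uint32_t buf[2] = { token, 0 };
+				//    fsctl("/Volumes/Backup", FSCTL_NAMESPACE_HANDLER_UPDATE, buf, 0);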
+ // + nspace_items[i].flags |= NSPACE_ITEM_RESET_TIMER; + } + + lck_mtx_unlock(&nspace_handler_lock); + + if (error) { + printf("nspace-handler-update: did not find token %u\n", token); + } + + } else if (IOCBASECMD(cmd) == FSCTL_NAMESPACE_HANDLER_UNBLOCK) { + uint32_t token, val; + int i; + + if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) { + goto FSCtl_Exit; + } + + if (!nspace_is_special_process(p)) { + error = EINVAL; + goto FSCtl_Exit; + } + + token = ((uint32_t *)data)[0]; + val = ((uint32_t *)data)[1]; + + lck_mtx_lock(&nspace_handler_lock); + + for(i=0; i < MAX_NSPACE_ITEMS; i++) { + if (nspace_items[i].token == token) { + break; + } + } + + if (i >= MAX_NSPACE_ITEMS) { + printf("nspace-handler-unblock: did not find token %u\n", token); + error = ENOENT; + } else { + if (val == 0 && nspace_items[i].vp) { + vnode_lock_spin(nspace_items[i].vp); + nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT; + vnode_unlock(nspace_items[i].vp); + } + + nspace_items[i].vp = NULL; + nspace_items[i].arg = NULL; + nspace_items[i].op = 0; + nspace_items[i].vid = 0; + nspace_items[i].flags = NSPACE_ITEM_DONE; + nspace_items[i].token = 0; + + wakeup((caddr_t)&(nspace_items[i].vp)); + } + + lck_mtx_unlock(&nspace_handler_lock); + + } else if (IOCBASECMD(cmd) == FSCTL_NAMESPACE_HANDLER_CANCEL) { + uint32_t token, val; + int i; + + if ((error = suser(kauth_cred_get(), &(p->p_acflag)))) { + goto FSCtl_Exit; + } + + if (!nspace_is_special_process(p)) { + error = EINVAL; + goto FSCtl_Exit; + } + + token = ((uint32_t *)data)[0]; + val = ((uint32_t *)data)[1]; + + lck_mtx_lock(&nspace_handler_lock); + + for(i=0; i < MAX_NSPACE_ITEMS; i++) { + if (nspace_items[i].token == token) { + break; + } + } + + if (i >= MAX_NSPACE_ITEMS) { + printf("nspace-handler-cancel: did not find token %u\n", token); + error = ENOENT; + } else { + if (nspace_items[i].vp) { + vnode_lock_spin(nspace_items[i].vp); + nspace_items[i].vp->v_flag &= ~VNEEDSSNAPSHOT; + vnode_unlock(nspace_items[i].vp); + } + + nspace_items[i].vp = NULL; + nspace_items[i].arg = NULL; + nspace_items[i].vid = 0; + nspace_items[i].token = val; + nspace_items[i].flags &= ~NSPACE_ITEM_PROCESSING; + nspace_items[i].flags |= NSPACE_ITEM_CANCELLED; + + wakeup((caddr_t)&(nspace_items[i].vp)); + } + + lck_mtx_unlock(&nspace_handler_lock); + } else if (IOCBASECMD(cmd) == FSCTL_NAMESPACE_HANDLER_SET_SNAPSHOT_TIME) { + if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) { + goto FSCtl_Exit; + } + + // we explicitly do not do the namespace_handler_proc check here + + lck_mtx_lock(&nspace_handler_lock); + snapshot_timestamp = ((uint32_t *)data)[0]; + wakeup(&nspace_item_idx); + lck_mtx_unlock(&nspace_handler_lock); + printf("nspace-handler-set-snapshot-time: %d\n", (int)snapshot_timestamp); + + } else if (IOCBASECMD(cmd) == FSCTL_NAMESPACE_ALLOW_DMG_SNAPSHOT_EVENTS) { + if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) { + goto FSCtl_Exit; + } + + lck_mtx_lock(&nspace_handler_lock); + nspace_allow_virtual_devs = ((uint32_t *)data)[0]; + lck_mtx_unlock(&nspace_handler_lock); + printf("nspace-snapshot-handler will%s allow events on disk-images\n", + nspace_allow_virtual_devs ? 
"" : " NOT"); + error = 0; + + } else if (IOCBASECMD(cmd) == FSCTL_SET_FSTYPENAME_OVERRIDE) { + if ((error = suser(kauth_cred_get(), &(current_proc()->p_acflag)))) { + goto FSCtl_Exit; + } + if (vp->v_mount) { + mount_lock(vp->v_mount); + if (data[0] != 0) { + strlcpy(&vp->v_mount->fstypename_override[0], data, MFSTYPENAMELEN); + vp->v_mount->mnt_kern_flag |= MNTK_TYPENAME_OVERRIDE; + if (vfs_isrdonly(vp->v_mount) && strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) { + vp->v_mount->mnt_kern_flag |= MNTK_EXTENDED_SECURITY; + vp->v_mount->mnt_kern_flag &= ~MNTK_AUTH_OPAQUE; + } + } else { + if (strcmp(vp->v_mount->fstypename_override, "mtmfs") == 0) { + vp->v_mount->mnt_kern_flag &= ~MNTK_EXTENDED_SECURITY; + } + vp->v_mount->mnt_kern_flag &= ~MNTK_TYPENAME_OVERRIDE; + vp->v_mount->fstypename_override[0] = '\0'; + } + mount_unlock(vp->v_mount); + } } else { /* Invoke the filesystem-specific code */ error = VNOP_IOCTL(vp, IOCBASECMD(cmd), data, options, ctx); @@ -7418,8 +8661,8 @@ fsctl (proc_t p, struct fsctl_args *uap, __unused int32_t *retval) /* Get the vnode for the file we are getting info on: */ nameiflags = 0; if ((uap->options & FSOPT_NOFOLLOW) == 0) nameiflags |= FOLLOW; - NDINIT(&nd, LOOKUP, nameiflags | AUDITVNPATH1, UIO_USERSPACE, - uap->path, ctx); + NDINIT(&nd, LOOKUP, OP_FSCTL, nameiflags | AUDITVNPATH1, + UIO_USERSPACE, uap->path, ctx); if ((error = namei(&nd))) goto done; vp = nd.ni_vp; nameidone(&nd); @@ -7520,7 +8763,7 @@ getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval) return (EINVAL); nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW; - NDINIT(&nd, LOOKUP, nameiflags, spacetype, uap->path, ctx); + NDINIT(&nd, LOOKUP, OP_GETXATTR, nameiflags, spacetype, uap->path, ctx); if ((error = namei(&nd))) { return (error); } @@ -7531,8 +8774,10 @@ getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval) goto out; } if (xattr_protected(attrname)) { - error = EPERM; - goto out; + if (!vfs_context_issuser(ctx) || strcmp(attrname, "com.apple.system.Security") != 0) { + error = EPERM; + goto out; + } } /* * the specific check for 0xffffffff is a hack to preserve @@ -7558,10 +8803,10 @@ getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval) if (uap->size == 0xffffffff || uap->size == (size_t)-1) goto no_uio; - if (uap->size > (size_t)XATTR_MAXSIZE) - uap->size = XATTR_MAXSIZE; - if (uap->value) { + if (uap->size > (size_t)XATTR_MAXSIZE) + uap->size = XATTR_MAXSIZE; + auio = uio_createwithbuffer(1, uap->position, spacetype, UIO_READ, &uio_buf[0], sizeof(uio_buf)); uio_addiov(auio, uap->value, uap->size); @@ -7652,7 +8897,12 @@ setxattr(proc_t p, struct setxattr_args *uap, int *retval) return (EINVAL); if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) { - return (error); + if (error == EPERM) { + /* if the string won't fit in attrname, copyinstr emits EPERM */ + return (ENAMETOOLONG); + } + /* Otherwise return the default error from copyinstr to detect ERANGE, etc */ + return error; } if (xattr_protected(attrname)) return(EPERM); @@ -7661,7 +8911,7 @@ setxattr(proc_t p, struct setxattr_args *uap, int *retval) } nameiflags = (uap->options & XATTR_NOFOLLOW) ? 
0 : FOLLOW; - NDINIT(&nd, LOOKUP, nameiflags, spacetype, uap->path, ctx); + NDINIT(&nd, LOOKUP, OP_SETXATTR, nameiflags, spacetype, uap->path, ctx); if ((error = namei(&nd))) { return (error); } @@ -7698,7 +8948,9 @@ fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval) size_t namelen; int error; char uio_buf[ UIO_SIZEOF(1) ]; +#if CONFIG_FSE vfs_context_t ctx = vfs_context_current(); +#endif if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) return (EINVAL); @@ -7762,7 +9014,7 @@ removexattr(proc_t p, struct removexattr_args *uap, int *retval) if (xattr_protected(attrname)) return(EPERM); nameiflags = (uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW; - NDINIT(&nd, LOOKUP, nameiflags, spacetype, uap->path, ctx); + NDINIT(&nd, LOOKUP, OP_REMOVEXATTR, nameiflags, spacetype, uap->path, ctx); if ((error = namei(&nd))) { return (error); } @@ -7793,7 +9045,9 @@ fremovexattr(__unused proc_t p, struct fremovexattr_args *uap, int *retval) char attrname[XATTR_MAXNAMELEN+1]; size_t namelen; int error; +#if CONFIG_FSE vfs_context_t ctx = vfs_context_current(); +#endif if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT)) return (EINVAL); @@ -7847,15 +9101,15 @@ listxattr(proc_t p, struct listxattr_args *uap, user_ssize_t *retval) return (EINVAL); nameiflags = ((uap->options & XATTR_NOFOLLOW) ? 0 : FOLLOW) | NOTRIGGER; - NDINIT(&nd, LOOKUP, nameiflags, spacetype, uap->path, ctx); + NDINIT(&nd, LOOKUP, OP_LISTXATTR, nameiflags, spacetype, uap->path, ctx); if ((error = namei(&nd))) { return (error); } vp = nd.ni_vp; nameidone(&nd); if (uap->namebuf != 0 && uap->bufsize > 0) { - auio = uio_createwithbuffer(1, 0, spacetype, - UIO_READ, &uio_buf[0], sizeof(uio_buf)); + auio = uio_createwithbuffer(1, 0, spacetype, UIO_READ, + &uio_buf[0], sizeof(uio_buf)); uio_addiov(auio, uap->namebuf, uap->bufsize); } @@ -7958,6 +9212,13 @@ fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval) if (error) { goto out; } +#if CONFIG_MACF + error = mac_vnode_check_fsgetpath(ctx, vp); + if (error) { + vnode_put(vp); + goto out; + } +#endif /* Obtain the absolute path to this vnode. */ bpflags = vfs_context_suser(ctx) ? 
BUILDPATH_CHECKACCESS : 0; error = build_path(vp, realpath, uap->bufsize, &length, bpflags, ctx); @@ -8007,7 +9268,11 @@ munge_statfs(struct mount *mp, struct vfsstatfs *sfsp, sfs.f_ffree = (user64_long_t)sfsp->f_ffree; sfs.f_fsid = sfsp->f_fsid; sfs.f_owner = sfsp->f_owner; - strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN); + if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) { + strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN); + } else { + strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN); + } strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN); strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN); @@ -8080,7 +9345,11 @@ munge_statfs(struct mount *mp, struct vfsstatfs *sfsp, sfs.f_ffree = (user32_long_t)sfsp->f_ffree; sfs.f_fsid = sfsp->f_fsid; sfs.f_owner = sfsp->f_owner; - strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN); + if (mp->mnt_kern_flag & MNTK_TYPENAME_OVERRIDE) { + strlcpy(&sfs.f_fstypename[0], &mp->fstypename_override[0], MFSTYPENAMELEN); + } else { + strlcpy(&sfs.f_fstypename[0], &sfsp->f_fstypename[0], MFSNAMELEN); + } strlcpy(&sfs.f_mntonname[0], &sfsp->f_mntonname[0], MNAMELEN); strlcpy(&sfs.f_mntfromname[0], &sfsp->f_mntfromname[0], MNAMELEN); diff --git a/bsd/vfs/vfs_utfconv.c b/bsd/vfs/vfs_utfconv.c index adf92df30..f785b0d8c 100644 --- a/bsd/vfs/vfs_utfconv.c +++ b/bsd/vfs/vfs_utfconv.c @@ -80,7 +80,7 @@ * Similar to __CFUniCharIsNonBaseCharacter except that * unicode_combinable also includes Hangul Jamo characters. */ -inline int +int unicode_combinable(u_int16_t character) { const u_int8_t *bitmap = __CFUniCharCombiningBitmap; @@ -105,7 +105,7 @@ unicode_combinable(u_int16_t character) * * Similar to __CFUniCharIsDecomposableCharacter. */ -inline int +int unicode_decomposeable(u_int16_t character) { const u_int8_t *bitmap = __CFUniCharDecomposableBitmap; u_int8_t value; @@ -1024,7 +1024,7 @@ priortysort(u_int16_t* characters, int count) u_int32_t p1, p2; u_int16_t *ch1, *ch2; u_int16_t *end; - int changes = 1; + int changes = 0; end = characters + count; do { @@ -1035,13 +1035,22 @@ priortysort(u_int16_t* characters, int count) while (ch2 < end) { p1 = p2; p2 = get_combining_class(*ch2); - if (p1 > p2) { + if (p1 > p2 && p2 != 0) { u_int32_t tmp; tmp = *ch1; *ch1 = *ch2; *ch2 = tmp; changes = 1; + + /* + * Make sure that p2 contains the combining class for the + * character now stored at *ch2. This isn't required for + * correctness, but it will be more efficient if a character + * with a large combining class has to "bubble past" several + * characters with lower combining classes. + */ + p2 = p1; } ++ch1; ++ch2; diff --git a/bsd/vfs/vfs_vnops.c b/bsd/vfs/vfs_vnops.c index c7b110fd6..d7e2b5f14 100644 --- a/bsd/vfs/vfs_vnops.c +++ b/bsd/vfs/vfs_vnops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -125,6 +125,7 @@ static int vn_kqfilt_add(struct fileproc *fp, struct knote *kn, vfs_context_t ctx); static void filt_vndetach(struct knote *kn); static int filt_vnode(struct knote *kn, long hint); +static int vn_open_auth_finish(vnode_t vp, int fmode, vfs_context_t ctx); #if 0 static int vn_kqfilt_remove(struct vnode *vp, uintptr_t ident, vfs_context_t ctx); @@ -163,6 +164,138 @@ vn_open_modflags(struct nameidata *ndp, int *fmodep, int cmode) return(vn_open_auth(ndp, fmodep, &va)); } +static int +vn_open_auth_finish(vnode_t vp, int fmode, vfs_context_t ctx) +{ + int error; + + if ((error = vnode_ref_ext(vp, fmode, 0)) != 0) { + goto bad; + } + + /* call out to allow 3rd party notification of open. + * Ignore result of kauth_authorize_fileop call. + */ + kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN, + (uintptr_t)vp, 0); + + return 0; + +bad: + return error; + +} + +/* + * May do nameidone() to allow safely adding an FSEvent. Cue off of ni_dvp to + * determine whether that has happened. + */ +static int +vn_open_auth_do_create(struct nameidata *ndp, struct vnode_attr *vap, int fmode, boolean_t *did_create, boolean_t *did_open, vfs_context_t ctx) +{ + uint32_t status = 0; + vnode_t dvp = ndp->ni_dvp; + int batched; + int error; + vnode_t vp; + + batched = vnode_compound_open_available(ndp->ni_dvp); + *did_open = FALSE; + + VATTR_SET(vap, va_type, VREG); + if (fmode & O_EXCL) + vap->va_vaflags |= VA_EXCLUSIVE; + +#if NAMEDRSRCFORK + if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) { + if ((error = vn_authorize_create(dvp, &ndp->ni_cnd, vap, ctx, NULL)) != 0) + goto out; + if ((error = vnode_makenamedstream(dvp, &ndp->ni_vp, XATTR_RESOURCEFORK_NAME, 0, ctx)) != 0) + goto out; + *did_create = TRUE; + } else { +#endif + if (!batched) { + if ((error = vn_authorize_create(dvp, &ndp->ni_cnd, vap, ctx, NULL)) != 0) + goto out; + } + + error = vn_create(dvp, &ndp->ni_vp, ndp, vap, VN_CREATE_DOOPEN, fmode, &status, ctx); + if (error != 0) { + if (batched) { + *did_create = (status & COMPOUND_OPEN_STATUS_DID_CREATE) ? TRUE : FALSE; + } else { + *did_create = FALSE; + } + + if (error == EKEEPLOOKING) { + if (*did_create) { + panic("EKEEPLOOKING, but we did a create?"); + } + if (!batched) { + panic("EKEEPLOOKING from filesystem that doesn't support compound vnops?"); + } + if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) { + panic("EKEEPLOOKING, but continue flag not set?"); + } + + /* + * Do NOT drop the dvp: we need everything to continue the lookup. + */ + return error; + } + } else { + if (batched) { + *did_create = (status & COMPOUND_OPEN_STATUS_DID_CREATE) ? 1 : 0; + *did_open = TRUE; + } else { + *did_create = TRUE; + } + } +#if NAMEDRSRCFORK + } +#endif + + /* + * Unlock the fsnode (if locked) here so that we are free + * to drop the dvp iocount and prevent deadlock in build_path(). + * nameidone() will still do the right thing later. 
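+	 * (i.e. drop the fsnode lock before anything that may itself walk
+	 * the namespace; the add_fsevent() path below can call
+	 * build_path(), which would otherwise deadlock as noted above.)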
+ */ + vp = ndp->ni_vp; + namei_unlock_fsnode(ndp); + + if (*did_create) { + int update_flags = 0; + + // Make sure the name & parent pointers are hooked up + if (vp->v_name == NULL) + update_flags |= VNODE_UPDATE_NAME; + if (vp->v_parent == NULLVP) + update_flags |= VNODE_UPDATE_PARENT; + + if (update_flags) + vnode_update_identity(vp, dvp, ndp->ni_cnd.cn_nameptr, ndp->ni_cnd.cn_namelen, ndp->ni_cnd.cn_hash, update_flags); + + vnode_put(dvp); + ndp->ni_dvp = NULLVP; + +#if CONFIG_FSE + if (need_fsevent(FSE_CREATE_FILE, vp)) { + add_fsevent(FSE_CREATE_FILE, ctx, + FSE_ARG_VNODE, vp, + FSE_ARG_DONE); + } +#endif + } +out: + if (ndp->ni_dvp != NULLVP) { + vnode_put(dvp); + ndp->ni_dvp = NULLVP; + } + + return error; +} + /* * Open a file with authorization, updating the contents of the structures * pointed to by ndp, fmodep, and vap as necessary to perform the requested @@ -217,100 +350,85 @@ vn_open_auth(struct nameidata *ndp, int *fmodep, struct vnode_attr *vap) int error; int fmode; uint32_t origcnflags; - kauth_action_t action; + boolean_t did_create; + boolean_t did_open; + boolean_t need_vnop_open; + boolean_t batched; + boolean_t ref_failed; again: vp = NULL; dvp = NULL; + batched = FALSE; + did_create = FALSE; + need_vnop_open = TRUE; + ref_failed = FALSE; fmode = *fmodep; origcnflags = ndp->ni_cnd.cn_flags; + + /* + * O_CREAT + */ if (fmode & O_CREAT) { if ( (fmode & O_DIRECTORY) ) { error = EINVAL; goto out; } ndp->ni_cnd.cn_nameiop = CREATE; +#if CONFIG_TRIGGERS + ndp->ni_op = OP_LINK; +#endif /* Inherit USEDVP, vnode_open() supported flags only */ ndp->ni_cnd.cn_flags &= (USEDVP | NOCROSSMOUNT | DOWHITEOUT); ndp->ni_cnd.cn_flags |= LOCKPARENT | LOCKLEAF | AUDITVNPATH1; + ndp->ni_flag = NAMEI_COMPOUNDOPEN; #if NAMEDRSRCFORK /* open calls are allowed for resource forks. */ ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK; #endif if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0 && (origcnflags & FOLLOW) != 0) ndp->ni_cnd.cn_flags |= FOLLOW; + +continue_create_lookup: if ( (error = namei(ndp)) ) goto out; + dvp = ndp->ni_dvp; vp = ndp->ni_vp; - /* not found, create */ - if (vp == NULL) { - /* must have attributes for a new file */ - if (vap == NULL) { - error = EINVAL; - goto badcreate; - } - - VATTR_SET(vap, va_type, VREG); -#if CONFIG_MACF - error = mac_vnode_check_create(ctx, - dvp, &ndp->ni_cnd, vap); - if (error) - goto badcreate; -#endif /* MAC */ + batched = vnode_compound_open_available(dvp); - /* authorize before creating */ - if ((error = vnode_authorize(dvp, NULL, KAUTH_VNODE_ADD_FILE, ctx)) != 0) - goto badcreate; + /* not found, create */ + if (vp == NULL) { + /* must have attributes for a new file */ + if (vap == NULL) { + error = EINVAL; + goto out; + } + /* + * Attempt a create. For a system supporting compound VNOPs, we may + * find an existing file or create one; in either case, we will already + * have the file open and no VNOP_OPEN() will be needed. 
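+	 * (classic path: separate VNOP_LOOKUP/VNOP_CREATE followed by
+	 * VNOP_OPEN; compound path: a single VNOP_COMPOUND_OPEN performs
+	 * lookup, create and open, reporting what it did through the
+	 * COMPOUND_OPEN_STATUS_DID_CREATE bit in status.)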
+ */ + error = vn_open_auth_do_create(ndp, vap, fmode, &did_create, &did_open, ctx); - if (fmode & O_EXCL) - vap->va_vaflags |= VA_EXCLUSIVE; -#if NAMEDRSRCFORK - if (ndp->ni_cnd.cn_flags & CN_WANTSRSRCFORK) { - if ((error = vnode_makenamedstream(dvp, &ndp->ni_vp, XATTR_RESOURCEFORK_NAME, 0, ctx)) != 0) - goto badcreate; - } else -#endif - if ((error = vn_create(dvp, &ndp->ni_vp, &ndp->ni_cnd, vap, 0, ctx)) != 0) - goto badcreate; - + dvp = ndp->ni_dvp; vp = ndp->ni_vp; - if (vp) { - int update_flags = 0; - - // Make sure the name & parent pointers are hooked up - if (vp->v_name == NULL) - update_flags |= VNODE_UPDATE_NAME; - if (vp->v_parent == NULLVP) - update_flags |= VNODE_UPDATE_PARENT; - - if (update_flags) - vnode_update_identity(vp, dvp, ndp->ni_cnd.cn_nameptr, ndp->ni_cnd.cn_namelen, ndp->ni_cnd.cn_hash, update_flags); - -#if CONFIG_FSE - if (need_fsevent(FSE_CREATE_FILE, vp)) { - vnode_put(dvp); - dvp = NULL; - add_fsevent(FSE_CREATE_FILE, ctx, - FSE_ARG_VNODE, vp, - FSE_ARG_DONE); + /* + * Detected a node that the filesystem couldn't handle. Don't call + * nameidone() yet, because we need that path buffer. + */ + if (error == EKEEPLOOKING) { + if (!batched) { + panic("EKEEPLOOKING from a filesystem that doesn't support compound VNOPs?"); } -#endif - + goto continue_create_lookup; } - /* - * nameidone has to happen before we vnode_put(dvp) - * and clear the ni_dvp field, since it may need - * to release the fs_nodelock on the dvp - */ -badcreate: - nameidone(ndp); - ndp->ni_dvp = NULL; + nameidone(ndp); if (dvp) { - vnode_put(dvp); + panic("Shouldn't have a dvp here."); } if (error) { @@ -318,129 +436,166 @@ badcreate: * Check for a creation or unlink race. */ if (((error == EEXIST) && !(fmode & O_EXCL)) || - ((error == ENOENT) && (fmode & O_CREAT))){ + ((error == ENOENT) && (fmode & O_CREAT))){ + if (vp) + vnode_put(vp); goto again; } goto bad; } - fmode &= ~O_TRUNC; + + need_vnop_open = !did_open; } else { + if (fmode & O_EXCL) + error = EEXIST; + + /* + * We have a vnode. Use compound open if available + * or else fall through to "traditional" path. Note: can't + * do a compound open for root, because the parent belongs + * to a different FS. + */ + if (error == 0 && batched && (vnode_mount(dvp) == vnode_mount(vp))) { + error = VNOP_COMPOUND_OPEN(dvp, &ndp->ni_vp, ndp, 0, fmode, NULL, NULL, ctx); + + if (error == 0) { + vp = ndp->ni_vp; + need_vnop_open = FALSE; + } else if (error == EKEEPLOOKING) { + if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) { + panic("EKEEPLOOKING, but continue flag not set?"); + } + goto continue_create_lookup; + } + } nameidone(ndp); - ndp->ni_dvp = NULL; vnode_put(dvp); + ndp->ni_dvp = NULLVP; - if (fmode & O_EXCL) { - error = EEXIST; + if (error) { goto bad; } + fmode &= ~O_CREAT; + + /* Fall through */ } } else { + /* + * Not O_CREAT + */ ndp->ni_cnd.cn_nameiop = LOOKUP; /* Inherit USEDVP, vnode_open() supported flags only */ ndp->ni_cnd.cn_flags &= (USEDVP | NOCROSSMOUNT | DOWHITEOUT); - ndp->ni_cnd.cn_flags |= FOLLOW | LOCKLEAF | AUDITVNPATH1; + ndp->ni_cnd.cn_flags |= FOLLOW | LOCKLEAF | AUDITVNPATH1 | WANTPARENT; #if NAMEDRSRCFORK /* open calls are allowed for resource forks. 
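		 * (e.g. an open("file/..namedfork/rsrc", ...) lookup lands
		 * here; namei() flags such a component with CN_WANTSRSRCFORK.)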
*/ ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK; #endif + ndp->ni_flag = NAMEI_COMPOUNDOPEN; + /* preserve NOFOLLOW from vnode_open() */ if (fmode & O_NOFOLLOW || fmode & O_SYMLINK || (origcnflags & FOLLOW) == 0) { - ndp->ni_cnd.cn_flags &= ~FOLLOW; + ndp->ni_cnd.cn_flags &= ~FOLLOW; } - if ( (error = namei(ndp)) ) - goto out; - vp = ndp->ni_vp; + /* Do a lookup, possibly going directly to filesystem for compound operation */ + do { + if ( (error = namei(ndp)) ) + goto out; + vp = ndp->ni_vp; + dvp = ndp->ni_dvp; + + /* Check for batched lookup-open */ + batched = vnode_compound_open_available(dvp); + if (batched && ((vp == NULLVP) || (vnode_mount(dvp) == vnode_mount(vp)))) { + error = VNOP_COMPOUND_OPEN(dvp, &ndp->ni_vp, ndp, 0, fmode, NULL, NULL, ctx); + vp = ndp->ni_vp; + if (error == 0) { + need_vnop_open = FALSE; + } else if (error == EKEEPLOOKING) { + if ((ndp->ni_flag & NAMEI_CONTLOOKUP) == 0) { + panic("EKEEPLOOKING, but continue flag not set?"); + } + } + } + } while (error == EKEEPLOOKING); + nameidone(ndp); - ndp->ni_dvp = NULL; + vnode_put(dvp); + ndp->ni_dvp = NULLVP; - if ( (fmode & O_DIRECTORY) && vp->v_type != VDIR ) { - error = ENOTDIR; + if (error) { goto bad; } } - if (vp->v_type == VSOCK && vp->v_tag != VT_FDESC) { - error = EOPNOTSUPP; /* Operation not supported on socket */ - goto bad; - } - - if (vp->v_type == VLNK && (fmode & O_NOFOLLOW) != 0) { - error = ELOOP; /* O_NOFOLLOW was specified and the target is a symbolic link */ - goto bad; + /* + * By this point, nameidone() is called, dvp iocount is dropped, + * and dvp pointer is cleared. + */ + if (ndp->ni_dvp != NULLVP) { + panic("Haven't cleaned up adequately in vn_open_auth()"); } - /* authorize open of an existing file */ - if ((fmode & O_CREAT) == 0) { - - /* disallow write operations on directories */ - if (vnode_isdir(vp) && (fmode & (FWRITE | O_TRUNC))) { - error = EISDIR; - goto bad; + /* + * Expect to use this code for filesystems without compound VNOPs, for the root + * of a filesystem, which can't be "looked up" in the sense of VNOP_LOOKUP(), + * and for shadow files, which do not live on the same filesystems as their "parents." + */ + if (need_vnop_open) { + if (batched && !vnode_isvroot(vp) && !vnode_isnamedstream(vp)) { + panic("Why am I trying to use VNOP_OPEN() on anything other than the root or a named stream?"); } -#if CONFIG_MACF - error = mac_vnode_check_open(ctx, vp, fmode); - if (error) - goto bad; -#endif - - /* compute action to be authorized */ - action = 0; - if (fmode & FREAD) { - action |= KAUTH_VNODE_READ_DATA; - } - if (fmode & (FWRITE | O_TRUNC)) { - /* - * If we are writing, appending, and not truncating, - * indicate that we are appending so that if the - * UF_APPEND or SF_APPEND bits are set, we do not deny - * the open. - */ - if ((fmode & O_APPEND) && !(fmode & O_TRUNC)) { - action |= KAUTH_VNODE_APPEND_DATA; - } else { - action |= KAUTH_VNODE_WRITE_DATA; + if (!did_create) { + error = vn_authorize_open_existing(vp, &ndp->ni_cnd, fmode, ctx, NULL); + if (error) { + goto bad; } } - if ((error = vnode_authorize(vp, NULL, action, ctx)) != 0) - goto bad; - - // - // if the vnode is tagged VOPENEVT and the current process - // has the P_CHECKOPENEVT flag set, then we or in the O_EVTONLY - // flag to the open mode so that this open won't count against - // the vnode when carbon delete() does a vnode_isinuse() to see - // if a file is currently in use. 
this allows spotlight - // importers to not interfere with carbon apps that depend on - // the no-delete-if-busy semantics of carbon delete(). - // - if ((vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) { - fmode |= O_EVTONLY; + error = VNOP_OPEN(vp, fmode, ctx); + if (error) { + goto bad; } + need_vnop_open = FALSE; + } + // if the vnode is tagged VOPENEVT and the current process + // has the P_CHECKOPENEVT flag set, then we or in the O_EVTONLY + // flag to the open mode so that this open won't count against + // the vnode when carbon delete() does a vnode_isinuse() to see + // if a file is currently in use. this allows spotlight + // importers to not interfere with carbon apps that depend on + // the no-delete-if-busy semantics of carbon delete(). + // + if (!did_create && (vp->v_flag & VOPENEVT) && (current_proc()->p_flag & P_CHECKOPENEVT)) { + fmode |= O_EVTONLY; } - if ( (error = VNOP_OPEN(vp, fmode, ctx)) ) { + /* + * Grab reference, etc. + */ + error = vn_open_auth_finish(vp, fmode, ctx); + if (error) { + ref_failed = TRUE; goto bad; } - if ( (error = vnode_ref_ext(vp, fmode)) ) { - goto bad2; - } - /* call out to allow 3rd party notification of open. - * Ignore result of kauth_authorize_fileop call. - */ - kauth_authorize_fileop(vfs_context_ucred(ctx), KAUTH_FILEOP_OPEN, - (uintptr_t)vp, 0); + /* Compound VNOP open is responsible for doing the truncate */ + if (batched || did_create) + fmode &= ~O_TRUNC; *fmodep = fmode; return (0); -bad2: - VNOP_CLOSE(vp, fmode, ctx); + bad: + /* Opened either explicitly or by a batched create */ + if (!need_vnop_open) { + VNOP_CLOSE(vp, fmode, ctx); + } + ndp->ni_vp = NULL; if (vp) { #if NAMEDRSRCFORK @@ -459,10 +614,11 @@ bad: * * EREDRIVEOPEN: means that we were hit by the tty allocation race. */ - if (((error == ENOENT) && (*fmodep & O_CREAT)) || (error == EREDRIVEOPEN)) { + if (((error == ENOENT) && (*fmodep & O_CREAT)) || (error == EREDRIVEOPEN) || ref_failed) { goto again; } } + out: return (error); } @@ -502,16 +658,6 @@ vn_close(struct vnode *vp, int flags, vfs_context_t ctx) { int error; -#if CONFIG_FSE - if (flags & FWASWRITTEN) { - if (need_fsevent(FSE_CONTENT_MODIFIED, vp)) { - add_fsevent(FSE_CONTENT_MODIFIED, ctx, - FSE_ARG_VNODE, vp, - FSE_ARG_DONE); - } - } -#endif - #if NAMEDRSRCFORK /* Sync data from resource fork shadow file if needed. */ if ((vp->v_flag & VISNAMEDSTREAM) && @@ -529,6 +675,16 @@ vn_close(struct vnode *vp, int flags, vfs_context_t ctx) error = VNOP_CLOSE(vp, flags, ctx); +#if CONFIG_FSE + if (flags & FWASWRITTEN) { + if (need_fsevent(FSE_CONTENT_MODIFIED, vp)) { + add_fsevent(FSE_CONTENT_MODIFIED, ctx, + FSE_ARG_VNODE, vp, + FSE_ARG_DONE); + } + } +#endif + if (!vnode_isspec(vp)) (void)vnode_rele_ext(vp, flags, 0); @@ -782,6 +938,9 @@ vn_write(struct fileproc *fp, struct uio *uio, int flags, vfs_context_t ctx) ioflag |= IO_NDELAY; if ((fp->f_fglob->fg_flag & FNOCACHE) || vnode_isnocache(vp)) ioflag |= IO_NOCACHE; + if (fp->f_fglob->fg_flag & FNODIRECT) + ioflag |= IO_NODIRECT; + /* * Treat synchronous mounts and O_FSYNC on the fd as equivalent. 
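 * (both conditions end up setting IO_SYNC on the request; how strictly
 * that is honored is ultimately up to the filesystem's VNOP_WRITE.)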
* @@ -996,7 +1155,7 @@ vn_stat_noauth(struct vnode *vp, void *sbptr, kauth_filesec_t *xsec, int isstat6 sb->st_blocks = roundup(va.va_total_alloc, 512) / 512; } - /* if we're interested in exended security data and we got an ACL */ + /* if we're interested in extended security data and we got an ACL */ if (xsec != NULL) { if (!VATTR_IS_SUPPORTED(&va, va_acl) && !VATTR_IS_SUPPORTED(&va, va_uuuid) && @@ -1147,7 +1306,10 @@ vn_ioctl(struct fileproc *fp, u_long com, caddr_t data, vfs_context_t ctx) error = VNOP_IOCTL(vp, com, data, fp->f_fglob->fg_flag, ctx); if (error == 0 && com == TIOCSCTTY) { - vnode_ref(vp); + error = vnode_ref_ext(vp, 0, VNODE_REF_FORCE); + if (error != 0) { + panic("vnode_ref_ext() failed despite VNODE_REF_FORCE?!"); + } funnel_state = thread_funnel_set(kernel_flock, TRUE); sessp = proc_session(vfs_context_proc(ctx)); @@ -1235,6 +1397,7 @@ int vn_pathconf(vnode_t vp, int name, int32_t *retval, vfs_context_t ctx) { int error = 0; + struct vfs_attr vfa; switch(name) { case _PC_EXTENDED_SECURITY_NP: @@ -1273,6 +1436,33 @@ vn_pathconf(vnode_t vp, int name, int32_t *retval, vfs_context_t ctx) case _PC_SYNC_IO: /* unistd.h: _POSIX_SYNCHRONIZED_IO */ *retval = 0; /* [SIO] option is not supported */ break; + case _PC_XATTR_SIZE_BITS: + /* The number of bits used to store maximum extended + * attribute size in bytes. For example, if the maximum + * attribute size supported by a file system is 128K, the + * value returned will be 18. However a value 18 can mean + * that the maximum attribute size can be anywhere from + * (256KB - 1) to 128KB. As a special case, the resource + * fork can have much larger size, and some file system + * specific extended attributes can have smaller and preset + * size; for example, Finder Info is always 32 bytes. + */ + memset(&vfa, 0, sizeof(vfa)); + VFSATTR_INIT(&vfa); + VFSATTR_WANTED(&vfa, f_capabilities); + if (vfs_getattr(vnode_mount(vp), &vfa, ctx) == 0 && + (VFSATTR_IS_SUPPORTED(&vfa, f_capabilities)) && + (vfa.f_capabilities.capabilities[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR) && + (vfa.f_capabilities.valid[VOL_CAPABILITIES_INTERFACES] & VOL_CAP_INT_EXTENDED_ATTR)) { + /* Supports native extended attributes */ + error = VNOP_PATHCONF(vp, name, retval, ctx); + } else { + /* Number of bits used to represent the maximum size of + * extended attribute stored in an Apple Double file. 
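+		 * (worked example: a 128KB limit is 2^17 bytes and needs 18
+		 * bits to represent, and 18 bits can describe any limit from
+		 * 2^17 up to 2^18 - 1, which is the ambiguity described
+		 * above.)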
+ */ + *retval = AD_XATTR_SIZE_BITS; + } + break; default: error = VNOP_PATHCONF(vp, name, retval, ctx); break; @@ -1303,7 +1493,7 @@ vn_kqfilt_add(struct fileproc *fp, struct knote *kn, vfs_context_t ctx) } } else if (!vnode_isreg(vp)) { - if (vnode_isspec(vp) && + if (vnode_ischr(vp) && (error = spec_kqfilter(vp, kn)) == 0) { /* claimed by a special device */ vnode_put(vp); @@ -1447,18 +1637,22 @@ vnode_writable_space_count(vnode_t vp) static int filt_vnode(struct knote *kn, long hint) { - struct vnode *vp = (struct vnode *)kn->kn_hook; + vnode_t vp = (struct vnode *)kn->kn_hook; int activate = 0; + long orig_hint = hint; if (0 == hint) { - if ((vnode_getwithvid(vp, kn->kn_hookid) != 0)) { + vnode_lock(vp); + + if (vnode_getiocount(vp, kn->kn_hookid, VNODE_NODEAD | VNODE_WITHID) != 0) { + /* Is recycled */ hint = NOTE_REVOKE; - } else { - vnode_put(vp); - } - } + } + } else { + lck_mtx_assert(&vp->v_lock, LCK_MTX_ASSERT_OWNED); + } - /* NOTE_REVOKE is special, as it is only sent during vnode reclaim */ + /* Special handling for vnodes that are in recycle or already gone */ if (NOTE_REVOKE == hint) { kn->kn_flags |= (EV_EOF | EV_ONESHOT); activate = 1; @@ -1496,5 +1690,15 @@ filt_vnode(struct knote *kn, long hint) } } + if (orig_hint == 0) { + /* + * Definitely need to unlock, may need to put + */ + if (hint == 0) { + vnode_put_locked(vp); + } + vnode_unlock(vp); + } + return (activate); } diff --git a/bsd/vfs/vfs_xattr.c b/bsd/vfs/vfs_xattr.c index d15711685..a37ba0f74 100644 --- a/bsd/vfs/vfs_xattr.c +++ b/bsd/vfs/vfs_xattr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2008 Apple Inc. All rights reserved. + * Copyright (c) 2004-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -65,8 +65,6 @@ #define MAKE_SHADOW_NAME(VP, NAME) \ snprintf((NAME), sizeof((NAME)), ".vfs_rsrc_stream_%p%08x%p", (void*)(VP), (VP)->v_id, (VP)->v_data); -static vnode_t shadow_dvp; /* tmp directory to hold stream shadow files */ -static int shadow_vid; static int shadow_sequence; @@ -556,7 +554,7 @@ vnode_flushnamedstream(vnode_t vp, vnode_t svp, vfs_context_t context) return (0); } datasize = va.va_data_size; - if ((datasize == 0)) { + if (datasize == 0) { (void) default_removexattr(vp, XATTR_RESOURCEFORK_NAME, 0, context); return (0); } @@ -623,9 +621,10 @@ getshadowfile(vnode_t vp, vnode_t *svpp, int makestream, size_t *rsrcsize, char tmpname[80]; size_t datasize = 0; int error = 0; + int retries = 0; +retry_create: *creator = 0; - /* Establish a unique file name. */ MAKE_SHADOW_NAME(vp, tmpname); bzero(&cn, sizeof(cn)); @@ -705,9 +704,32 @@ getshadowfile(vnode_t vp, vnode_t *svpp, int makestream, size_t *rsrcsize, if (error == 0) { vnode_recycle(svp); *creator = 1; - } else if ((error == EEXIST) && !makestream) { + } + else if ((error == EEXIST) && !makestream) { error = VNOP_LOOKUP(dvp, &svp, &cn, context); } + else if ((error == ENOENT) && !makestream) { + /* + * We could have raced with a rmdir on the shadow directory + * post-lookup. Retry from the beginning, 1x only, to + * try and see if we need to re-create the shadow directory + * in get_shadow_dir. + */ + if (retries == 0) { + retries++; + if (dvp) { + vnode_put (dvp); + dvp = NULLVP; + } + if (svp) { + vnode_put (svp); + svp = NULLVP; + } + goto retry_create; + } + /* Otherwise, just error out normally below */ + } + out: if (dvp) { vnode_put(dvp); @@ -936,15 +958,27 @@ get_shadow_dir(vnode_t *sdvpp, vfs_context_t context) uint32_t tmp_fsid; int error; - /* Check if we've already created it. 
*/ - if (shadow_dvp != NULLVP) { - if ((error = vnode_getwithvid(shadow_dvp, shadow_vid))) { - shadow_dvp = NULLVP; - } else { - *sdvpp = shadow_dvp; - return (0); - } + + bzero(tmpname, sizeof(tmpname)); + snprintf(tmpname, sizeof(tmpname), "/var/run/.vfs_rsrc_streams_%p%x", + (void*)rootvnode, shadow_sequence); + /* + * Look up the shadow directory to ensure that it still exists. + * By looking it up, we get an iocounted dvp to use, and avoid some coherency issues + * in caching it when multiple threads may be trying to manipulate the pointers. + */ + error = vnode_lookup(tmpname, 0, &sdvp, context); + if (error == 0) { + /* + * If we get here, then we have successfully looked up the shadow dir, + * and it has an iocount from the lookup. Return the vp in the output argument. + */ + *sdvpp = sdvp; + return (0); } + /* In the failure case, no iocount is acquired */ + sdvp = NULLVP; + bzero (tmpname, sizeof(tmpname)); /* Obtain the vnode for "/var/run" directory. */ if (vnode_lookup("/var/run", 0, &dvp, context) != 0) { @@ -980,14 +1014,7 @@ get_shadow_dir(vnode_t *sdvpp, vfs_context_t context) /* * There can be only one winner for an exclusive create. */ - if (error == 0) { - /* Take a long term ref to keep this dir around. */ - error = vnode_ref(sdvp); - if (error == 0) { - shadow_dvp = sdvp; - shadow_vid = sdvp->v_id; - } - } else if (error == EEXIST) { + if (error == EEXIST) { /* loser has to look up directory */ error = VNOP_LOOKUP(dvp, &sdvp, &cn, context); if (error == 0) { @@ -995,7 +1022,7 @@ get_shadow_dir(vnode_t *sdvpp, vfs_context_t context) if (sdvp->v_type != VDIR) { goto baddir; } - /* Obtain the fsid for /var/run directory */ + /* Obtain the fsid for /tmp directory */ VATTR_INIT(&va); VATTR_WANTED(&va, va_fsid); if (VNOP_GETATTR(dvp, &va, context) != 0 || @@ -1156,7 +1183,7 @@ baddir: #define ATTR_BUF_SIZE 4096 /* default size of the attr file and how much we'll grow by */ /* Implementation Limits */ -#define ATTR_MAX_SIZE (128*1024) /* 128K maximum attribute data size */ +#define ATTR_MAX_SIZE AD_XATTR_MAXSIZE #define ATTR_MAX_HDR_SIZE 65536 /* * Note: ATTR_MAX_HDR_SIZE is the largest attribute header @@ -2347,12 +2374,15 @@ open_xattrfile(vnode_t vp, int fileflags, vnode_t *xvpp, vfs_context_t context) * file security from the EA must always get access */ lookup: - NDINIT(&nd, LOOKUP, LOCKLEAF | NOFOLLOW | USEDVP | DONOTAUTH, UIO_SYSSPACE, - CAST_USER_ADDR_T(filename), context); + NDINIT(&nd, LOOKUP, OP_OPEN, LOCKLEAF | NOFOLLOW | USEDVP | DONOTAUTH, + UIO_SYSSPACE, CAST_USER_ADDR_T(filename), context); nd.ni_dvp = dvp; if (fileflags & O_CREAT) { nd.ni_cnd.cn_nameiop = CREATE; +#if CONFIG_TRIGGERS + nd.ni_op = OP_LINK; +#endif if (dvp != vp) { nd.ni_cnd.cn_flags |= LOCKPARENT; } @@ -2394,8 +2424,9 @@ lookup: if (gid != KAUTH_GID_NONE) VATTR_SET(&va, va_gid, gid); - error = vn_create(dvp, &nd.ni_vp, &nd.ni_cnd, &va, + error = vn_create(dvp, &nd.ni_vp, &nd, &va, VN_CREATE_NOAUTH | VN_CREATE_NOINHERIT | VN_CREATE_NOLABEL, + 0, NULL, context); if (error) error = ENOATTR; @@ -2544,7 +2575,7 @@ remove_xattrfile(vnode_t xvp, vfs_context_t context) return (error); } - NDINIT(&nd, DELETE, LOCKPARENT | NOFOLLOW | DONOTAUTH, + NDINIT(&nd, DELETE, OP_UNLINK, LOCKPARENT | NOFOLLOW | DONOTAUTH, UIO_SYSSPACE, CAST_USER_ADDR_T(path), context); error = namei(&nd); FREE_ZONE(path, MAXPATHLEN, M_NAMEI); diff --git a/bsd/vfs/vnode_if.c b/bsd/vfs/vnode_if.c index 1a77414e2..6dd63bfde 100644 --- a/bsd/vfs/vnode_if.c +++ b/bsd/vfs/vnode_if.c @@ -106,6 +106,24 @@ struct vnodeop_desc 
vnop_lookup_desc = { NULL }; +int vnop_compound_open_vp_offsets[] = { + VOPARG_OFFSETOF(struct vnop_compound_open_args, a_dvp), + VDESC_NO_OFFSET +}; + +struct vnodeop_desc vnop_compound_open_desc = { + 0, + "vnop_compound_open", + 0 | VDESC_VP0_WILLRELE, + vnop_compound_open_vp_offsets, + VOPARG_OFFSETOF(struct vnop_compound_open_args, a_vpp), + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VOPARG_OFFSETOF(struct vnop_compound_open_args, a_cnp), + VOPARG_OFFSETOF(struct vnop_compound_open_args, a_context), + NULL +}; + int vnop_create_vp_offsets[] = { VOPARG_OFFSETOF(struct vnop_create_args,a_dvp), VDESC_NO_OFFSET @@ -485,6 +503,23 @@ struct vnodeop_desc vnop_remove_desc = { NULL }; +int vnop_remove_extended_vp_offsets[] = { + VOPARG_OFFSETOF(struct vnop_remove_args,a_dvp), + VDESC_NO_OFFSET +}; +struct vnodeop_desc vnop_compound_remove_desc = { + 0, + "vnop_compound_remove", + 0, + vnop_remove_vp_offsets, + VOPARG_OFFSETOF(struct vnop_compound_remove_args, a_vpp), + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VOPARG_OFFSETOF(struct vnop_remove_args, a_cnp), + VOPARG_OFFSETOF(struct vnop_remove_args, a_context), + NULL +}; + int vnop_link_vp_offsets[] = { VOPARG_OFFSETOF(struct vnop_link_args,a_vp), VOPARG_OFFSETOF(struct vnop_link_args,a_tdvp), @@ -523,6 +558,26 @@ struct vnodeop_desc vnop_rename_desc = { NULL }; +int vnop_compound_rename_vp_offsets[] = { + VOPARG_OFFSETOF(struct vnop_compound_rename_args,a_fdvp), + VOPARG_OFFSETOF(struct vnop_compound_rename_args,a_fvpp), + VOPARG_OFFSETOF(struct vnop_compound_rename_args,a_tdvp), + VOPARG_OFFSETOF(struct vnop_compound_rename_args,a_tvpp), + VDESC_NO_OFFSET +}; +struct vnodeop_desc vnop_compound_rename_desc = { + 0, + "vnop_compound_rename", + 0 | VDESC_VP0_WILLRELE | VDESC_VP1_WILLRELE | VDESC_VP2_WILLRELE | VDESC_VP3_WILLRELE, + vnop_compound_rename_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VOPARG_OFFSETOF(struct vnop_compound_rename_args, a_fcnp), + VOPARG_OFFSETOF(struct vnop_compound_rename_args, a_context), + NULL +}; + int vnop_mkdir_vp_offsets[] = { VOPARG_OFFSETOF(struct vnop_mkdir_args,a_dvp), VDESC_NO_OFFSET @@ -540,6 +595,24 @@ struct vnodeop_desc vnop_mkdir_desc = { NULL }; +int vnop_compound_mkdir_vp_offsets[] = { + VOPARG_OFFSETOF(struct vnop_compound_mkdir_args,a_dvp), + VDESC_NO_OFFSET +}; +struct vnodeop_desc vnop_compound_mkdir_desc = { + 0, + "vnop_compound_mkdir", + 0 | VDESC_VP0_WILLRELE, + vnop_compound_mkdir_vp_offsets, + VOPARG_OFFSETOF(struct vnop_compound_mkdir_args, a_vpp), + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VOPARG_OFFSETOF(struct vnop_compound_mkdir_args, a_cnp), + VOPARG_OFFSETOF(struct vnop_compound_mkdir_args, a_context), + NULL +}; + + int vnop_rmdir_vp_offsets[] = { VOPARG_OFFSETOF(struct vnop_rmdir_args,a_dvp), VOPARG_OFFSETOF(struct vnop_rmdir_args,a_vp), @@ -558,6 +631,23 @@ struct vnodeop_desc vnop_rmdir_desc = { NULL }; +int vnop_compound_rmdir_vp_offsets[] = { + VOPARG_OFFSETOF(struct vnop_compound_rmdir_args,a_dvp), + VDESC_NO_OFFSET +}; +struct vnodeop_desc vnop_compound_rmdir_desc = { + 0, + "vnop_compound_rmdir", + 0 | VDESC_VP0_WILLRELE | VDESC_VP1_WILLRELE, + vnop_rmdir_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VOPARG_OFFSETOF(struct vnop_compound_rmdir_args, a_cnp), + VOPARG_OFFSETOF(struct vnop_compound_rmdir_args, a_context), + NULL +}; + int vnop_symlink_vp_offsets[] = { VOPARG_OFFSETOF(struct vnop_symlink_args,a_dvp), VDESC_NO_OFFSET @@ -1004,6 +1094,7 @@ struct vnodeop_desc *vfs_op_descs[] = { &vnop_mknod_desc, &vnop_whiteout_desc, 
&vnop_open_desc, + &vnop_compound_open_desc, &vnop_close_desc, &vnop_access_desc, &vnop_getattr_desc, @@ -1021,10 +1112,14 @@ struct vnodeop_desc *vfs_op_descs[] = { &vnop_mnomap_desc, &vnop_fsync_desc, &vnop_remove_desc, + &vnop_compound_remove_desc, &vnop_link_desc, &vnop_rename_desc, + &vnop_compound_rename_desc, &vnop_mkdir_desc, + &vnop_compound_mkdir_desc, &vnop_rmdir_desc, + &vnop_compound_rmdir_desc, &vnop_symlink_desc, &vnop_readdir_desc, &vnop_readdirattr_desc, diff --git a/bsd/vm/Makefile b/bsd/vm/Makefile index f0ce21745..608304077 100644 --- a/bsd/vm/Makefile +++ b/bsd/vm/Makefile @@ -10,14 +10,10 @@ include $(MakeInc_def) INSTINC_SUBDIRS = \ -INSTINC_SUBDIRS_PPC = \ - INSTINC_SUBDIRS_I386 = \ EXPINC_SUBDIRS = \ -EXPINC_SUBDIRS_PPC = \ - EXPINC_SUBDIRS_I386 = \ DATAFILES = \ diff --git a/bsd/vm/dp_backing_file.c b/bsd/vm/dp_backing_file.c index 420238db9..bb2808ecf 100644 --- a/bsd/vm/dp_backing_file.c +++ b/bsd/vm/dp_backing_file.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -42,8 +42,11 @@ #include <sys/vnode_internal.h> #include <sys/namei.h> #include <sys/ubc_internal.h> -#include <sys/mount_internal.h> #include <sys/malloc.h> +#include <sys/user.h> +#if CONFIG_PROTECT +#include <sys/cprotect.h> +#endif #include <default_pager/default_pager_types.h> #include <default_pager/default_pager_object.h> @@ -245,7 +248,7 @@ macx_swapon( /* * Get a vnode for the paging area. */ - NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, + NDINIT(ndp, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, ((IS_64BIT_PROCESS(p)) ? UIO_USERSPACE64 : UIO_USERSPACE32), (user_addr_t) args->filename, ctx); @@ -274,6 +277,18 @@ macx_swapon( if ((file_size < (off_t)size) && ((error = vnode_setsize(vp, (off_t)size, 0, ctx)) != 0)) goto swapon_bailout; +#if CONFIG_PROTECT + { + void *cnode = NULL; + /* initialize content protection keys manually */ + if ((cnode = cp_get_protected_cnode(vp)) != 0) { + if ((error = cp_handle_vnop(cnode, CP_WRITE_ACCESS)) != 0) + goto swapon_bailout; + } + } +#endif + + if (default_pager_init_flag == 0) { start_def_pager(NULL); default_pager_init_flag = 1; @@ -306,21 +321,23 @@ macx_swapon( goto swapon_bailout; } - if (vp->v_mount->mnt_kern_flag & MNTK_SSD) { +#if CONFIG_EMBEDDED + dp_cluster_size = 1 * PAGE_SIZE; +#else + if ((dp_isssd = vnode_pager_isSSD(vp)) == TRUE) { /* * keep the cluster size small since the * seek cost is effectively 0 which means * we don't care much about fragmentation */ - dp_isssd = TRUE; dp_cluster_size = 2 * PAGE_SIZE; } else { /* * use the default cluster size */ - dp_isssd = FALSE; dp_cluster_size = 0; } +#endif kr = default_pager_backing_store_create(default_pager, -1, /* default priority */ dp_cluster_size, @@ -379,6 +396,12 @@ swapon_bailout: } (void) thread_funnel_set(kernel_flock, FALSE); AUDIT_MACH_SYSCALL_EXIT(error); + + if (error) + printf("macx_swapon FAILED - %d\n", error); + else + printf("macx_swapon SUCCESS\n"); + return(error); } @@ -402,6 +425,8 @@ macx_swapoff( int error; boolean_t funnel_state; vfs_context_t ctx = vfs_context_current(); + struct uthread *ut; + int orig_iopol_disk; AUDIT_MACH_SYSCALL_ENTER(AUE_SWAPOFF); @@ -415,7 +440,7 @@ macx_swapoff( /* * Get the vnode for the paging area. */ - NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, + NDINIT(ndp, LOOKUP, OP_LOOKUP, FOLLOW | LOCKLEAF | AUDITVNPATH1, ((IS_64BIT_PROCESS(p)) ? 
UIO_USERSPACE64 : UIO_USERSPACE32), (user_addr_t) args->filename, ctx); @@ -447,7 +472,24 @@ macx_swapoff( } backing_store = (mach_port_t)bs_port_table[i].bs; + ut = get_bsdthread_info(current_thread()); + +#if !CONFIG_EMBEDDED + orig_iopol_disk = proc_get_thread_selfdiskacc(); + proc_apply_thread_selfdiskacc(IOPOL_THROTTLE); +#else /* !CONFIG_EMBEDDED */ + orig_iopol_disk = ut->uu_iopol_disk; + ut->uu_iopol_disk = IOPOL_THROTTLE; +#endif /* !CONFIG_EMBEDDED */ + kr = default_pager_backing_store_delete(backing_store); + +#if !CONFIG_EMBEDDED + proc_apply_thread_selfdiskacc(orig_iopol_disk); +#else /* !CONFIG_EMBEDDED */ + ut->uu_iopol_disk = orig_iopol_disk; +#endif /* !CONFIG_EMBEDDED */ + switch (kr) { case KERN_SUCCESS: error = 0; @@ -476,6 +518,12 @@ swapoff_bailout: (void) thread_funnel_set(kernel_flock, FALSE); AUDIT_MACH_SYSCALL_EXIT(error); + + if (error) + printf("macx_swapoff FAILED - %d\n", error); + else + printf("macx_swapoff SUCCESS\n"); + return(error); } diff --git a/bsd/vm/vm_unix.c b/bsd/vm/vm_unix.c index 369c91350..0190e70f7 100644 --- a/bsd/vm/vm_unix.c +++ b/bsd/vm/vm_unix.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -44,6 +44,7 @@ #include <kern/thread.h> #include <kern/debug.h> #include <kern/lock.h> +#include <kern/extmod_statistics.h> #include <mach/mach_traps.h> #include <mach/port.h> #include <mach/task.h> @@ -74,8 +75,11 @@ #include <sys/sysproto.h> #include <sys/mman.h> #include <sys/sysctl.h> +#include <sys/cprotect.h> +#include <sys/kpi_socket.h> #include <security/audit/audit.h> +#include <security/mac.h> #include <bsm/audit_kevents.h> #include <kern/kalloc.h> @@ -90,6 +94,18 @@ #include <vm/vm_protos.h> +#if CONFIG_FREEZE +#include <sys/kern_memorystatus.h> +#endif + + +int _shared_region_map( struct proc*, int, unsigned int, struct shared_file_mapping_np*, memory_object_control_t*, struct shared_file_mapping_np*); +int _shared_region_slide(uint32_t, mach_vm_offset_t, mach_vm_size_t, mach_vm_offset_t, mach_vm_size_t, memory_object_control_t); +int shared_region_copyin_mappings(struct proc*, user_addr_t, unsigned int, struct shared_file_mapping_np *); + +SYSCTL_INT(_vm, OID_AUTO, vm_debug_events, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_debug_events, 0, ""); + + /* * Sysctl's related to data/stack execution. See osfmk/vm/vm_map.c */ @@ -97,8 +113,8 @@ #ifndef SECURE_KERNEL extern int allow_stack_exec, allow_data_exec; -SYSCTL_INT(_vm, OID_AUTO, allow_stack_exec, CTLFLAG_RW, &allow_stack_exec, 0, ""); -SYSCTL_INT(_vm, OID_AUTO, allow_data_exec, CTLFLAG_RW, &allow_data_exec, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, allow_stack_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_stack_exec, 0, ""); +SYSCTL_INT(_vm, OID_AUTO, allow_data_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_data_exec, 0, ""); #endif /* !SECURE_KERNEL */ static const char *prot_values[] = { @@ -121,7 +137,7 @@ log_stack_execution_failure(addr64_t vaddr, vm_prot_t prot) int shared_region_unnest_logging = 1; -SYSCTL_INT(_vm, OID_AUTO, shared_region_unnest_logging, CTLFLAG_RW, +SYSCTL_INT(_vm, OID_AUTO, shared_region_unnest_logging, CTLFLAG_RW | CTLFLAG_LOCKED, &shared_region_unnest_logging, 0, ""); int vm_shared_region_unnest_log_interval = 10; @@ -486,8 +502,8 @@ task_for_pid_posix_check(proc_t target) /* Do target's ruid, euid, and saved uid match my euid? 
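	 * (note the switch below from direct cr_ruid/cr_svuid field access
	 * to the kauth_cred_getruid()/kauth_cred_getsvuid() accessors,
	 * presumably to insulate callers from the ucred layout.)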
*/ if ((kauth_cred_getuid(targetcred) != myuid) || - (targetcred->cr_ruid != myuid) || - (targetcred->cr_svuid != myuid)) { + (kauth_cred_getruid(targetcred) != myuid) || + (kauth_cred_getsvuid(targetcred) != myuid)) { allowed = FALSE; goto out; } @@ -600,6 +616,8 @@ task_for_pid( /* Grant task port access */ task_reference(p->task); + extmod_statistics_incr_task_for_pid(p->task); + sright = (void *) convert_task_to_port(p->task); tret = ipc_port_copyout_send( sright, @@ -664,7 +682,7 @@ task_name_for_pid( && ((current_proc() == p) || kauth_cred_issuser(kauth_cred_get()) || ((kauth_cred_getuid(target_cred) == kauth_cred_getuid(kauth_cred_get())) && - ((target_cred->cr_ruid == kauth_cred_get()->cr_ruid))))) { + ((kauth_cred_getruid(target_cred) == kauth_getruid()))))) { if (p->task != TASK_NULL) { task_reference(p->task); @@ -714,21 +732,21 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret) int error = 0; #if CONFIG_MACF - error = mac_proc_check_suspend_resume(p, 0); /* 0 for suspend */ + error = mac_proc_check_suspend_resume(p, MAC_PROC_CHECK_SUSPEND); if (error) { - error = KERN_FAILURE; + error = EPERM; goto out; } #endif if (pid == 0) { - error = KERN_FAILURE; + error = EPERM; goto out; } targetproc = proc_find(pid); if (!task_for_pid_posix_check(targetproc)) { - error = KERN_FAILURE; + error = EPERM; goto out; } @@ -744,7 +762,7 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret) (tfpport != IPC_PORT_NULL)) { if (tfpport == IPC_PORT_DEAD) { - error = KERN_PROTECTION_FAILURE; + error = EACCES; goto out; } @@ -753,9 +771,9 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret) if (error != MACH_MSG_SUCCESS) { if (error == MACH_RCV_INTERRUPTED) - error = KERN_ABORTED; + error = EINTR; else - error = KERN_FAILURE; + error = EPERM; goto out; } } @@ -764,8 +782,19 @@ pid_suspend(struct proc *p __unused, struct pid_suspend_args *args, int *ret) task_reference(target); error = task_suspend(target); + if (error) { + if (error == KERN_INVALID_ARGUMENT) { + error = EINVAL; + } else { + error = EPERM; + } + } task_deallocate(target); +#if CONFIG_FREEZE + kern_hibernation_on_pid_suspend(pid); +#endif + out: if (targetproc != PROC_NULL) proc_rele(targetproc); @@ -782,21 +811,21 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret) int error = 0; #if CONFIG_MACF - error = mac_proc_check_suspend_resume(p, 1); /* 1 for resume */ + error = mac_proc_check_suspend_resume(p, MAC_PROC_CHECK_RESUME); if (error) { - error = KERN_FAILURE; + error = EPERM; goto out; } #endif if (pid == 0) { - error = KERN_FAILURE; + error = EPERM; goto out; } targetproc = proc_find(pid); if (!task_for_pid_posix_check(targetproc)) { - error = KERN_FAILURE; + error = EPERM; goto out; } @@ -812,7 +841,7 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret) (tfpport != IPC_PORT_NULL)) { if (tfpport == IPC_PORT_DEAD) { - error = KERN_PROTECTION_FAILURE; + error = EACCES; goto out; } @@ -821,9 +850,9 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret) if (error != MACH_MSG_SUCCESS) { if (error == MACH_RCV_INTERRUPTED) - error = KERN_ABORTED; + error = EINTR; else - error = KERN_FAILURE; + error = EPERM; goto out; } } @@ -831,7 +860,19 @@ pid_resume(struct proc *p __unused, struct pid_resume_args *args, int *ret) #endif task_reference(target); + +#if CONFIG_FREEZE + kern_hibernation_on_pid_resume(pid, target); +#endif + error = task_resume(target); + if (error) { + if (error 
== KERN_INVALID_ARGUMENT) {
+			error = EINVAL;
+		} else {
+			error = EPERM;
+		}
+	}
 	task_deallocate(target);
 
 out:
@@ -843,6 +884,118 @@ out:
 	return 0;
 }
 
+#if CONFIG_EMBEDDED
+kern_return_t
+pid_hibernate(struct proc *p __unused, struct pid_hibernate_args *args, int *ret)
+{
+	int	error = 0;
+	proc_t	targetproc = PROC_NULL;
+	int	pid = args->pid;
+
+#ifndef CONFIG_FREEZE
+	#pragma unused(pid)
+#else
+
+#if CONFIG_MACF
+	error = mac_proc_check_suspend_resume(p, MAC_PROC_CHECK_HIBERNATE);
+	if (error) {
+		error = EPERM;
+		goto out;
+	}
+#endif
+
+	/*
+	 * The only accepted pid value here is currently -1, since we just kick off the hibernation thread
+	 * here - individual ids aren't required. However, it's intended that this call will change
+	 * in the future to initiate hibernation of individual processes. In anticipation, we'll obtain the
+	 * process handle for potentially valid values and call task_for_pid_posix_check(); this way, everything
+	 * is validated correctly and set for further refactoring. See <rdar://problem/7839708> for more details.
+	 */
+	if (pid >= 0) {
+		targetproc = proc_find(pid);
+		if (!task_for_pid_posix_check(targetproc)) {
+			error = EPERM;
+			goto out;
+		}
+	}
+
+	if (pid == -1) {
+		kern_hibernation_on_pid_hibernate(pid);
+	} else {
+		error = EPERM;
+	}
+
+out:
+
+#endif /* CONFIG_FREEZE */
+
+	if (targetproc != PROC_NULL)
+		proc_rele(targetproc);
+	*ret = error;
+	return error;
+}
+
+int
+pid_shutdown_sockets(struct proc *p __unused, struct pid_shutdown_sockets_args *args, int *ret)
+{
+	int		error = 0;
+	proc_t		targetproc = PROC_NULL;
+	struct filedesc	*fdp;
+	struct fileproc	*fp;
+	int		pid = args->pid;
+	int		level = args->level;
+	int		i;
+
+	if (level != SHUTDOWN_SOCKET_LEVEL_DISCONNECT_SVC &&
+	    level != SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL)
+	{
+		error = EINVAL;
+		goto out;
+	}
+
+#if CONFIG_MACF
+	error = mac_proc_check_suspend_resume(p, MAC_PROC_CHECK_SHUTDOWN_SOCKETS);
+	if (error) {
+		error = EPERM;
+		goto out;
+	}
+#endif
+
+	targetproc = proc_find(pid);
+	if (!task_for_pid_posix_check(targetproc)) {
+		error = EPERM;
+		goto out;
+	}
+
+	proc_fdlock(targetproc);
+	fdp = targetproc->p_fd;
+
+	for (i = 0; i < fdp->fd_nfiles; i++) {
+		struct socket *sockp;
+
+		fp = fdp->fd_ofiles[i];
+		if (fp == NULL || (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
+		    fp->f_fglob->fg_type != DTYPE_SOCKET)
+		{
+			continue;
+		}
+
+		sockp = (struct socket *)fp->f_fglob->fg_data;
+
+		/* Call networking stack with socket and level */
+		(void) socket_defunct(targetproc, sockp, level);
+	}
+
+	proc_fdunlock(targetproc);
+
+out:
+	if (targetproc != PROC_NULL)
+		proc_rele(targetproc);
+	*ret = error;
+	return error;
+}
+#endif /* CONFIG_EMBEDDED */
+
 static int
 sysctl_settfp_policy(__unused struct sysctl_oid *oidp, void *arg1,
     __unused int arg2, struct sysctl_req *req)
@@ -876,17 +1029,17 @@ static int kern_secure_kernel = 1;
 static int kern_secure_kernel = 0;
 #endif
 
-SYSCTL_INT(_kern, OID_AUTO, secure_kernel, CTLFLAG_RD, &kern_secure_kernel, 0, "");
+SYSCTL_INT(_kern, OID_AUTO, secure_kernel, CTLFLAG_RD | CTLFLAG_LOCKED, &kern_secure_kernel, 0, "");
 
-SYSCTL_NODE(_kern, KERN_TFP, tfp, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "tfp");
-SYSCTL_PROC(_kern_tfp, KERN_TFP_POLICY, policy, CTLTYPE_INT | CTLFLAG_RW,
+SYSCTL_NODE(_kern, KERN_TFP, tfp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "tfp");
+SYSCTL_PROC(_kern_tfp, KERN_TFP_POLICY, policy, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
 	&tfp_policy, sizeof(uint32_t), &sysctl_settfp_policy ,"I","policy");
 
-SYSCTL_INT(_vm, OID_AUTO, shared_region_trace_level,
CTLFLAG_RW, +SYSCTL_INT(_vm, OID_AUTO, shared_region_trace_level, CTLFLAG_RW | CTLFLAG_LOCKED, &shared_region_trace_level, 0, ""); -SYSCTL_INT(_vm, OID_AUTO, shared_region_version, CTLFLAG_RD, +SYSCTL_INT(_vm, OID_AUTO, shared_region_version, CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_version, 0, ""); -SYSCTL_INT(_vm, OID_AUTO, shared_region_persistence, CTLFLAG_RW, +SYSCTL_INT(_vm, OID_AUTO, shared_region_persistence, CTLFLAG_RW | CTLFLAG_LOCKED, &shared_region_persistence, 0, ""); /* @@ -968,6 +1121,31 @@ shared_region_check_np( return error; } + +int +shared_region_copyin_mappings( + struct proc *p, + user_addr_t user_mappings, + unsigned int mappings_count, + struct shared_file_mapping_np *mappings) +{ + int error = 0; + vm_size_t mappings_size = 0; + + /* get the list of mappings the caller wants us to establish */ + mappings_size = (vm_size_t) (mappings_count * sizeof (mappings[0])); + error = copyin(user_mappings, + mappings, + mappings_size); + if (error) { + SHARED_REGION_TRACE_ERROR( + ("shared_region: %p [%d(%s)] map(): " + "copyin(0x%llx, %d) failed (error=%d)\n", + current_thread(), p->p_pid, p->p_comm, + (uint64_t)user_mappings, mappings_count, error)); + } + return error; +} /* * shared_region_map_np() * @@ -979,25 +1157,22 @@ shared_region_check_np( * requiring any further setup. */ int -shared_region_map_np( +_shared_region_map( struct proc *p, - struct shared_region_map_np_args *uap, - __unused int *retvalp) + int fd, + uint32_t mappings_count, + struct shared_file_mapping_np *mappings, + memory_object_control_t *sr_file_control, + struct shared_file_mapping_np *mapping_to_slide) { int error; kern_return_t kr; - int fd; struct fileproc *fp; struct vnode *vp, *root_vp; struct vnode_attr va; off_t fs; memory_object_size_t file_size; - user_addr_t user_mappings; - struct shared_file_mapping_np *mappings; -#define SFM_MAX_STACK 8 - struct shared_file_mapping_np stack_mappings[SFM_MAX_STACK]; - unsigned int mappings_count; - vm_size_t mappings_size; + vm_prot_t maxprot = VM_PROT_ALL; memory_object_control_t file_control; struct vm_shared_region *shared_region; @@ -1006,15 +1181,9 @@ shared_region_map_np( current_thread(), p->p_pid, p->p_comm)); shared_region = NULL; - mappings_count = 0; - mappings_size = 0; - mappings = NULL; fp = NULL; vp = NULL; - /* get file descriptor for shared region cache file */ - fd = uap->fd; - /* get file structure from file descriptor */ error = fp_lookup(p, fd, &fp, 0); if (error) { @@ -1068,11 +1237,38 @@ shared_region_map_np( goto done; } +#if CONFIG_MACF + error = mac_file_check_mmap(vfs_context_ucred(vfs_context_current()), + fp->f_fglob, VM_PROT_ALL, MAP_FILE, &maxprot); + if (error) { + goto done; + } +#endif /* MAC */ + +#if CONFIG_PROTECT + /* check for content protection access */ + { + void *cnode; + if ((cnode = cp_get_protected_cnode(vp)) != NULL) { + error = cp_handle_vnop(cnode, CP_READ_ACCESS | CP_WRITE_ACCESS); + if (error) + goto done; + } + } +#endif /* CONFIG_PROTECT */ + /* make sure vnode is on the process's root volume */ root_vp = p->p_fd->fd_rdir; if (root_vp == NULL) { root_vp = rootvnode; + } else { + /* + * Chroot-ed processes can't use the shared_region. 
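+	 * (fd_rdir is non-NULL only after a chroot(); a NULL fd_rdir falls
+	 * back to the global rootvnode and continues on to the root-volume
+	 * mount check below.)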
+ */ + error = EINVAL; + goto done; } + if (vp->v_mount != root_vp->v_mount) { SHARED_REGION_TRACE_ERROR( ("shared_region: %p [%d(%s)] map(%p:'%s'): " @@ -1128,42 +1324,12 @@ shared_region_map_np( error = EINVAL; goto done; } - - /* get the list of mappings the caller wants us to establish */ - mappings_count = uap->count; /* number of mappings */ - mappings_size = (vm_size_t) (mappings_count * sizeof (mappings[0])); - if (mappings_count == 0) { - SHARED_REGION_TRACE_INFO( - ("shared_region: %p [%d(%s)] map(%p:'%s'): " - "no mappings\n", - current_thread(), p->p_pid, p->p_comm, - vp, vp->v_name)); - error = 0; /* no mappings: we're done ! */ - goto done; - } else if (mappings_count <= SFM_MAX_STACK) { - mappings = &stack_mappings[0]; - } else { - SHARED_REGION_TRACE_ERROR( - ("shared_region: %p [%d(%s)] map(%p:'%s'): " - "too many mappings (%d)\n", - current_thread(), p->p_pid, p->p_comm, - vp, vp->v_name, mappings_count)); - error = EINVAL; - goto done; - } - user_mappings = uap->mappings; /* the mappings, in user space */ - error = copyin(user_mappings, - mappings, - mappings_size); - if (error) { - SHARED_REGION_TRACE_ERROR( - ("shared_region: %p [%d(%s)] map(%p:'%s'): " - "copyin(0x%llx, %d) failed (error=%d)\n", - current_thread(), p->p_pid, p->p_comm, - vp, vp->v_name, (uint64_t)user_mappings, mappings_count, error)); - goto done; + if (sr_file_control != NULL) { + *sr_file_control = file_control; } + + /* get the process's shared region (setup in vm_map_exec()) */ shared_region = vm_shared_region_get(current_task()); @@ -1182,7 +1348,8 @@ shared_region_map_np( mappings, file_control, file_size, - (void *) p->p_fd->fd_rdir); + (void *) p->p_fd->fd_rdir, + mapping_to_slide); if (kr != KERN_SUCCESS) { SHARED_REGION_TRACE_ERROR( ("shared_region: %p [%d(%s)] map(%p:'%s'): " @@ -1210,6 +1377,12 @@ shared_region_map_np( error = 0; + vnode_lock_spin(vp); + + vp->v_flag |= VSHARED_DYLD; + + vnode_unlock(vp); + /* update the vnode's access time */ if (! 
(vnode_vfsvisflags(vp) & MNT_NOATIME)) {
 		VATTR_INIT(&va);
@@ -1249,6 +1422,126 @@ done:
 	return error;
 }
 
+int
+_shared_region_slide(uint32_t slide,
+	mach_vm_offset_t	entry_start_address,
+	mach_vm_size_t		entry_size,
+	mach_vm_offset_t	slide_start,
+	mach_vm_size_t		slide_size,
+	memory_object_control_t	sr_file_control)
+{
+	void *slide_info_entry = NULL;
+	int error;
+
+	if((error = vm_shared_region_slide_init(slide_size, entry_start_address, entry_size, slide, sr_file_control))) {
+		printf("slide_info initialization failed with kr=%d\n", error);
+		goto done;
+	}
+
+	slide_info_entry = vm_shared_region_get_slide_info_entry();
+	if (slide_info_entry == NULL){
+		error = EFAULT;
+	} else {
+		error = copyin(slide_start,
+			       slide_info_entry,
+			       (vm_size_t)slide_size);
+	}
+	if (error) {
+		goto done;
+	}
+
+	if (vm_shared_region_slide_sanity_check() != KERN_SUCCESS) {
+		error = EFAULT;
+		printf("Sanity Check failed for slide_info\n");
+	} else {
+#if DEBUG
+		printf("Successfully init slide_info with start_address: %p region_size: %ld slide_header_size: %ld\n",
+				(void*)(uintptr_t)entry_start_address,
+				(unsigned long)entry_size,
+				(unsigned long)slide_size);
+#endif
+	}
+done:
+	return error;
+}
+
+int
+shared_region_map_and_slide_np(
+	struct proc				*p,
+	struct shared_region_map_and_slide_np_args	*uap,
+	__unused int				*retvalp)
+{
+	struct shared_file_mapping_np	mapping_to_slide;
+	struct shared_file_mapping_np	*mappings;
+	unsigned int mappings_count = uap->count;
+
+	memory_object_control_t		sr_file_control;
+	kern_return_t			kr = KERN_SUCCESS;
+	uint32_t			slide = uap->slide;
+
+#define SFM_MAX_STACK	8
+	struct shared_file_mapping_np	stack_mappings[SFM_MAX_STACK];
+
+	if ((kr = vm_shared_region_sliding_valid(slide)) != KERN_SUCCESS) {
+		if (kr == KERN_INVALID_ARGUMENT) {
+			/*
+			 * This will happen if we request sliding again
+			 * with the same slide value that was used earlier
+			 * for the very first sliding. We continue through
+			 * to the mapping layer. This is so that we can be
+			 * absolutely certain that the same mappings have
+			 * been requested.
+			 */
+			kr = KERN_SUCCESS;
+		} else {
+			goto done;
+		}
+	}
+
+	if (mappings_count == 0) {
+		SHARED_REGION_TRACE_INFO(
+			("shared_region: %p [%d(%s)] map(): "
+			 "no mappings\n",
+			 current_thread(), p->p_pid, p->p_comm));
+		kr = 0;	/* no mappings: we're done! */
+		goto done;
+	} else if (mappings_count <= SFM_MAX_STACK) {
+		mappings = &stack_mappings[0];
+	} else {
+		SHARED_REGION_TRACE_ERROR(
+			("shared_region: %p [%d(%s)] map(): "
+			 "too many mappings (%d)\n",
+			 current_thread(), p->p_pid, p->p_comm,
+			 mappings_count));
+		kr = KERN_FAILURE;
+		goto done;
+	}
+
+	if ( (kr = shared_region_copyin_mappings(p, uap->mappings, uap->count, mappings))) {
+		goto done;
+	}
+
+
+	kr = _shared_region_map(p, uap->fd, mappings_count, mappings, &sr_file_control, &mapping_to_slide);
+	if (kr != KERN_SUCCESS) {
+		return kr;
+	}
+
+	if (slide) {
+		kr = _shared_region_slide(slide,
+				mapping_to_slide.sfm_file_offset,
+				mapping_to_slide.sfm_size,
+				uap->slide_start,
+				uap->slide_size,
+				sr_file_control);
+		if (kr != KERN_SUCCESS) {
+			vm_shared_region_undo_mappings(NULL, 0, mappings, mappings_count);
+			return kr;
+		}
+	}
+done:
+	return kr;
+}
 
 /* sysctl overflow room */
 
@@ -1256,11 +1549,11 @@ done:
    allocate buffer space, possibly purgeable memory, but not cause
    inactive pages to be reclaimed. It allows the app to calculate how
    much memory is free outside the free target.
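   (The value is exported read-only below; from user space it should be
   readable with, e.g., sysctl -n vm.vm_page_free_target.)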
*/ extern unsigned int vm_page_free_target; -SYSCTL_INT(_vm, OID_AUTO, vm_page_free_target, CTLFLAG_RD, +SYSCTL_INT(_vm, OID_AUTO, vm_page_free_target, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_free_target, 0, "Pageout daemon free target"); extern unsigned int vm_memory_pressure; -SYSCTL_INT(_vm, OID_AUTO, memory_pressure, CTLFLAG_RD, +SYSCTL_INT(_vm, OID_AUTO, memory_pressure, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_memory_pressure, 0, "Memory pressure indicator"); static int @@ -1277,36 +1570,36 @@ SYSCTL_PROC(_vm, OID_AUTO, page_free_wanted, 0, 0, vm_ctl_page_free_wanted, "I", ""); extern unsigned int vm_page_purgeable_count; -SYSCTL_INT(_vm, OID_AUTO, page_purgeable_count, CTLFLAG_RD, +SYSCTL_INT(_vm, OID_AUTO, page_purgeable_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_purgeable_count, 0, "Purgeable page count"); extern unsigned int vm_page_purgeable_wired_count; -SYSCTL_INT(_vm, OID_AUTO, page_purgeable_wired_count, CTLFLAG_RD, +SYSCTL_INT(_vm, OID_AUTO, page_purgeable_wired_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_purgeable_wired_count, 0, "Wired purgeable page count"); -SYSCTL_INT(_vm, OID_AUTO, page_reusable_count, CTLFLAG_RD, +SYSCTL_INT(_vm, OID_AUTO, page_reusable_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.reusable_count, 0, "Reusable page count"); -SYSCTL_QUAD(_vm, OID_AUTO, reusable_success, CTLFLAG_RD, +SYSCTL_QUAD(_vm, OID_AUTO, reusable_success, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.reusable_pages_success, ""); -SYSCTL_QUAD(_vm, OID_AUTO, reusable_failure, CTLFLAG_RD, +SYSCTL_QUAD(_vm, OID_AUTO, reusable_failure, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.reusable_pages_failure, ""); -SYSCTL_QUAD(_vm, OID_AUTO, reusable_shared, CTLFLAG_RD, +SYSCTL_QUAD(_vm, OID_AUTO, reusable_shared, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.reusable_pages_shared, ""); -SYSCTL_QUAD(_vm, OID_AUTO, all_reusable_calls, CTLFLAG_RD, +SYSCTL_QUAD(_vm, OID_AUTO, all_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.all_reusable_calls, ""); -SYSCTL_QUAD(_vm, OID_AUTO, partial_reusable_calls, CTLFLAG_RD, +SYSCTL_QUAD(_vm, OID_AUTO, partial_reusable_calls, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.partial_reusable_calls, ""); -SYSCTL_QUAD(_vm, OID_AUTO, reuse_success, CTLFLAG_RD, +SYSCTL_QUAD(_vm, OID_AUTO, reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.reuse_pages_success, ""); -SYSCTL_QUAD(_vm, OID_AUTO, reuse_failure, CTLFLAG_RD, +SYSCTL_QUAD(_vm, OID_AUTO, reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.reuse_pages_failure, ""); -SYSCTL_QUAD(_vm, OID_AUTO, all_reuse_calls, CTLFLAG_RD, +SYSCTL_QUAD(_vm, OID_AUTO, all_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.all_reuse_calls, ""); -SYSCTL_QUAD(_vm, OID_AUTO, partial_reuse_calls, CTLFLAG_RD, +SYSCTL_QUAD(_vm, OID_AUTO, partial_reuse_calls, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.partial_reuse_calls, ""); -SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_success, CTLFLAG_RD, +SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_success, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.can_reuse_success, ""); -SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_failure, CTLFLAG_RD, +SYSCTL_QUAD(_vm, OID_AUTO, can_reuse_failure, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_stats_reusable.can_reuse_failure, ""); diff --git a/bsd/vm/vnode_pager.c b/bsd/vm/vnode_pager.c index a15b6dcc4..d12a65652 100644 --- a/bsd/vm/vnode_pager.c +++ b/bsd/vm/vnode_pager.c @@ -52,6 +52,7 @@ #include <sys/mount_internal.h> /* needs internal due to fhandle_t */ 
#include <sys/ubc_internal.h>
 #include <sys/lock.h>
+#include <sys/disk.h>	/* For DKIOC calls */
 
 #include <mach/mach_types.h>
 #include <mach/memory_object_types.h>
@@ -81,6 +82,27 @@
 
 #include <vm/vm_protos.h>
 
+void
+vnode_pager_throttle()
+{
+	struct uthread *ut;
+
+	ut = get_bsdthread_info(current_thread());
+
+	if (ut->uu_lowpri_window)
+		throttle_lowpri_io(TRUE);
+}
+
+
+boolean_t
+vnode_pager_isSSD(vnode_t vp)
+{
+	if (vp->v_mount->mnt_kern_flag & MNTK_SSD)
+		return (TRUE);
+	return (FALSE);
+}
+
+
 uint32_t
 vnode_pager_isinuse(struct vnode *vp)
 {
@@ -137,6 +159,85 @@ vnode_pager_get_cs_blobs(
 	return KERN_SUCCESS;
 }
 
+/*
+ * vnode_trim:
+ * Used to call the DKIOCUNMAP ioctl on the underlying disk device for the specified vnode.
+ * Trims the region at offset bytes into the file, for length bytes.
+ *
+ * Care must be taken to ensure that the vnode is sufficiently reference counted at the time this
+ * function is called; no iocounts or usecounts are taken on the vnode.
+ * This function is non-idempotent in error cases; we cannot un-discard the blocks if only some of them
+ * are successfully discarded.
+ */
+u_int32_t vnode_trim (
+		struct vnode *vp,
+		off_t offset,
+		size_t length)
+{
+	daddr64_t io_blockno;	/* Block number corresponding to the start of the extent */
+	size_t io_bytecount;	/* Number of bytes in current extent for the specified range */
+	size_t trimmed = 0;
+	off_t current_offset = offset;
+	size_t remaining_length = length;
+	int error = 0;
+	u_int32_t blocksize = 0;
+	struct vnode *devvp;
+	dk_extent_t extent;
+	dk_unmap_t unmap;
+
+
+	/* Get the underlying device vnode */
+	devvp = vp->v_mount->mnt_devvp;
+
+	/* Figure out the underlying device block size */
+	error = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&blocksize, 0, vfs_context_kernel());
+	if (error) {
+		goto trim_exit;
+	}
+
+	/*
+	 * We may not get the entire range from offset -> offset+length in a single
+	 * extent from the blockmap call.  Keep looping until we are sure we've hit
+	 * the whole range or until we encounter an error.
+	 */
+	while (trimmed < length) {
+		/*
+		 * VNOP_BLOCKMAP will tell us the logical to physical block number mapping for the
+		 * specified offset.  It returns blocks in contiguous chunks, so if the logical range is
+		 * broken into multiple extents, it must be called multiple times, increasing the offset
+		 * in each call to ensure that the entire range is covered.
+		 */
+		error = VNOP_BLOCKMAP (vp, current_offset, remaining_length,
+				       &io_blockno, &io_bytecount, NULL, VNODE_READ, NULL);
+
+		if (error) {
+			goto trim_exit;
+		}
+		/*
+		 * We have a contiguous run.  Prepare & issue the ioctl for the device.
+		 * The DKIOCUNMAP ioctl takes its offset in bytes from the start of the device.
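+		 * (For example, with a 4096-byte device block size, io_blockno 100
+		 * yields extent.offset 409600; extent.length is simply io_bytecount,
+		 * which VNOP_BLOCKMAP already reports in bytes.)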
+ */ + memset (&extent, 0, sizeof(dk_extent_t)); + memset (&unmap, 0, sizeof(dk_unmap_t)); + extent.offset = (uint64_t) io_blockno * (u_int64_t) blocksize; + extent.length = io_bytecount; + unmap.extents = &extent; + unmap.extentsCount = 1; + error = VNOP_IOCTL(devvp, DKIOCUNMAP, (caddr_t)&unmap, 0, vfs_context_kernel()); + + if (error) { + goto trim_exit; + } + remaining_length = remaining_length - io_bytecount; + trimmed = trimmed + io_bytecount; + current_offset = current_offset + io_bytecount; + } +trim_exit: + + return error; + +} + pager_return_t vnode_pageout(struct vnode *vp, upl_t upl, @@ -219,9 +320,7 @@ vnode_pageout(struct vnode *vp, else request_flags = UPL_UBC_PAGEOUT | UPL_RET_ONLY_DIRTY; - ubc_create_upl(vp, f_offset, size, &upl, &pl, request_flags); - - if (upl == (upl_t)NULL) { + if (ubc_create_upl(vp, f_offset, size, &upl, &pl, request_flags) != KERN_SUCCESS) { result = PAGER_ERROR; error_ret = EINVAL; goto out; @@ -555,14 +654,23 @@ vnode_pagein( xsize, flags, vfs_context_current())) ) { /* * Usually this UPL will be aborted/committed by the lower cluster layer. - * In the case of decmpfs, however, we may return an error (EAGAIN) to avoid - * a deadlock with another thread already inflating the file. In that case, - * we must take care of our UPL at this layer itself. + * + * a) In the case of decmpfs, however, we may return an error (EAGAIN) to avoid + * a deadlock with another thread already inflating the file. + * + * b) In the case of content protection, EPERM is a valid error and we should respect it. + * + * In those cases, we must take care of our UPL at this layer itself. */ if (must_commit) { if(error == EAGAIN) { ubc_upl_abort_range(upl, (upl_offset_t) xoff, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_RESTART); } +#if CONFIG_PROTECT + if(error == EPERM) { + ubc_upl_abort_range(upl, (upl_offset_t) xoff, xsize, UPL_ABORT_FREE_ON_EMPTY | UPL_ABORT_ERROR); + } +#endif } result = PAGER_ERROR; error = PAGER_ERROR; diff --git a/config/BSDKernel.exports b/config/BSDKernel.exports index f0322f1ac..ebb5af5db 100644 --- a/config/BSDKernel.exports +++ b/config/BSDKernel.exports @@ -33,6 +33,7 @@ _buf_bwrite _buf_callback _buf_clear _buf_clearflags +_buf_clear_redundancy_flags _buf_clone _buf_count _buf_dataptr @@ -55,6 +56,7 @@ _buf_iterate _buf_lblkno _buf_map _buf_markaged +_buf_markclean _buf_markdelayed _buf_markeintr _buf_markfua @@ -63,6 +65,7 @@ _buf_meta_bread _buf_meta_breadn _buf_proc _buf_rcred +_buf_redundancy_flags _buf_reset _buf_resid _buf_setblkno @@ -77,6 +80,7 @@ _buf_seterror _buf_setflags _buf_setfsprivate _buf_setlblkno +_buf_set_redundancy_flags _buf_setresid _buf_setsize _buf_setupl @@ -222,6 +226,8 @@ _ifnet_allocate _ifnet_attach _ifnet_attach_protocol _ifnet_baudrate +_ifnet_capabilities_enabled +_ifnet_capabilities_supported _ifnet_detach _ifnet_detach_protocol _ifnet_eflags @@ -259,6 +265,8 @@ _ifnet_remove_multicast _ifnet_resolve_multicast:_dlil_resolve_multi _ifnet_set_addrlen _ifnet_set_baudrate +_ifnet_set_capabilities_supported +_ifnet_set_capabilities_enabled _ifnet_set_eflags _ifnet_set_flags _ifnet_set_hdrlen @@ -305,6 +313,10 @@ _kauth_cred_get_with_ref _kauth_cred_getgid _kauth_cred_getguid _kauth_cred_getntsid +_kauth_cred_getrgid +_kauth_cred_getruid +_kauth_cred_getsvgid +_kauth_cred_getsvuid _kauth_cred_getuid _kauth_cred_gid2guid _kauth_cred_gid2ntsid @@ -351,6 +363,7 @@ _mbuf_clear_csum_performed _mbuf_clear_csum_requested _mbuf_get_mlen _mbuf_get_mhlen +_mbuf_get_minclsize _mbuf_clear_vlan_tag _mbuf_concatenate 
_mbuf_copy_pkthdr @@ -368,6 +381,7 @@ _mbuf_freem_list _mbuf_get _mbuf_get_csum_performed _mbuf_get_csum_requested +_mbuf_get_traffic_class _mbuf_get_tso_requested _mbuf_get_vlan_tag _mbuf_getcluster @@ -388,6 +402,7 @@ _mbuf_pulldown _mbuf_pullup _mbuf_set_csum_performed _mbuf_set_csum_requested +_mbuf_set_traffic_class _mbuf_set_vlan_tag _mbuf_setdata _mbuf_setflags @@ -670,6 +685,7 @@ _vfs_setextendedsecurity _vfs_setflags _vfs_setfsprivate _vfs_setioattr +_vfs_setlocklocal _vfs_setmaxsymlen _vfs_statfs _vfs_sysctl diff --git a/config/BSDKernel.ppc.exports b/config/BSDKernel.ppc.exports deleted file mode 100644 index 83559e0b0..000000000 --- a/config/BSDKernel.ppc.exports +++ /dev/null @@ -1,37 +0,0 @@ -_file_vnode -_in6_cksum:_inet6_cksum -_is_suser -_is_suser1 -_mbuf_data -_mbuf_inet6_cksum -_mbuf_len -_mbuf_next -_mbuf_nextpkt -_mbuf_pkthdr_header -_mbuf_pkthdr_len -_mbuf_pkthdr_rcvif -_mbuf_pkthdr_setheader -_mbuf_setlen -_mbuf_setnextpkt -_mbuf_type -_nd6_lookup_ipv6 -_proc_ucred -_rootvnode -_spl0 -_splbio -_splclock -_splhigh -_splimp -_spllo -_spln -_sploff -_splon -_splpower -_splsched -_splsoftclock -_spltty -_splvm -_splx -_suser -_ubc_setcred -_ubc_sync_range diff --git a/config/Dummy.exports b/config/Dummy.exports new file mode 100644 index 000000000..fe7149c32 --- /dev/null +++ b/config/Dummy.exports @@ -0,0 +1 @@ +# Dummy exports, exists for stub architectures like PPC diff --git a/config/IOKit.exports b/config/IOKit.exports index 8f1cb8e73..2ad8e78c9 100644 --- a/config/IOKit.exports +++ b/config/IOKit.exports @@ -365,7 +365,6 @@ __ZN13IOCommandGate10runCommandEPvS0_S0_S0_ __ZN13IOCommandGate10superClassE __ZN13IOCommandGate11commandGateEP8OSObjectPFiS1_PvS2_S2_S2_E __ZN13IOCommandGate11setWorkLoopEP10IOWorkLoop -__ZN13IOCommandGate12checkForWorkEv __ZN13IOCommandGate13attemptActionEPFiP8OSObjectPvS2_S2_S2_ES2_S2_S2_S2_ __ZN13IOCommandGate13commandWakeupEPvb __ZN13IOCommandGate14attemptCommandEPvS0_S0_S0_ @@ -430,6 +429,7 @@ __ZN13IOEventSource23_RESERVEDIOEventSource4Ev __ZN13IOEventSource23_RESERVEDIOEventSource5Ev __ZN13IOEventSource23_RESERVEDIOEventSource6Ev __ZN13IOEventSource23_RESERVEDIOEventSource7Ev +__ZN13IOEventSource4freeEv __ZN13IOEventSource4initEP8OSObjectPFvS1_zE __ZN13IOEventSource6enableEv __ZN13IOEventSource7disableEv @@ -440,6 +440,7 @@ __ZN13IOEventSource9MetaClassC2Ev __ZN13IOEventSource9closeGateEv __ZN13IOEventSource9metaClassE __ZN13IOEventSource9setActionEPFvP8OSObjectzE +__ZN13IOEventSource12checkForWorkEv __ZN13IOEventSourceC1EPK11OSMetaClass __ZN13IOEventSourceC2EPK11OSMetaClass __ZN13IOEventSourceD0Ev @@ -483,12 +484,12 @@ __ZN14IOPMrootDomain14tellChangeDownEm __ZN14IOPMrootDomain15powerChangeDoneEm __ZN14IOPMrootDomain16tellNoChangeDownEm __ZN14IOPMrootDomain17createPMAssertionEyjP9IOServicePKc -__ZN14IOPMrootDomain17getSleepSupportedEv -__ZN14IOPMrootDomain17setAggressivenessEmm -__ZN14IOPMrootDomain18changePowerStateToEm __ZN14IOPMrootDomain18releasePMAssertionEy __ZN14IOPMrootDomain19getPMAssertionLevelEy __ZN14IOPMrootDomain19setPMAssertionLevelEyj +__ZN14IOPMrootDomain17getSleepSupportedEv +__ZN14IOPMrootDomain17setAggressivenessEmm +__ZN14IOPMrootDomain18changePowerStateToEm __ZN14IOPMrootDomain22changePowerStateToPrivEm __ZN14IOPMrootDomain22removePublishedFeatureEj __ZN14IOPMrootDomain23requestPowerDomainStateEmP17IOPowerConnectionm @@ -860,7 +861,6 @@ __ZN18IORegistryIteratorD2Ev __ZN18IOTimerEventSource10gMetaClassE __ZN18IOTimerEventSource10superClassE __ZN18IOTimerEventSource11setWorkLoopEP10IOWorkLoop 
-__ZN18IOTimerEventSource12checkForWorkEv __ZN18IOTimerEventSource13cancelTimeoutEv __ZN18IOTimerEventSource14setTimeoutFuncEv __ZN18IOTimerEventSource16timerEventSourceEP8OSObjectPFvS1_PS_E diff --git a/config/IOKit.i386.exports b/config/IOKit.i386.exports index 068770db6..d83bbdde6 100644 --- a/config/IOKit.i386.exports +++ b/config/IOKit.i386.exports @@ -280,7 +280,6 @@ __ZN9IOService13newUserClientEP4taskPvmPP12IOUserClient __ZN9IOService13startMatchingEm __ZN9IOService13waitMatchIdleEm __ZN9IOService13willTerminateEPS_m -__ZN9IOService14actionFinalizeEPS_m __ZN9IOService14doServiceMatchEm __ZN9IOService14messageClientsEmPvj __ZN9IOService14newTemperatureElPS_ @@ -299,13 +298,11 @@ __ZN9IOService16didYouWakeSystemEv __ZN9IOService16registerInterestEPK8OSSymbolPFiPvS3_mPS_S3_jES3_S3_ __ZN9IOService16requestTerminateEPS_m __ZN9IOService16setCPUSnoopDelayEm -__ZN9IOService18actionDidTerminateEPS_m __ZN9IOService18doServiceTerminateEm __ZN9IOService18matchPropertyTableEP12OSDictionaryPl __ZN9IOService18requireMaxBusStallEm __ZN9IOService18settleTimerExpiredEv __ZN9IOService18systemWillShutdownEm -__ZN9IOService19actionWillTerminateEPS_mP7OSArray __ZN9IOService19deliverNotificationEPK8OSSymbolmm __ZN9IOService19installNotificationEPK8OSSymbolP12OSDictionaryPFbPvS5_PS_ES5_S5_lPP10OSIterator __ZN9IOService22PM_Clamp_Timer_ExpiredEv diff --git a/config/IOKit.ppc.exports b/config/IOKit.ppc.exports deleted file mode 100644 index 26b5a9209..000000000 --- a/config/IOKit.ppc.exports +++ /dev/null @@ -1,383 +0,0 @@ -_IOPanic -_PE_parse_boot_arg -__Z11IODBDMAStopPV23IODBDMAChannelRegisters -__Z12IODBDMAFlushPV23IODBDMAChannelRegisters -__Z12IODBDMAPausePV23IODBDMAChannelRegisters -__Z12IODBDMAResetPV23IODBDMAChannelRegisters -__Z12IODBDMAStartPV23IODBDMAChannelRegistersPV17IODBDMADescriptor -__Z15IODBDMAContinuePV23IODBDMAChannelRegisters -__Z16IODTFindSlotNameP15IORegistryEntrym -__Z16IODTSetResolvingP15IORegistryEntryPFlmPmS1_EPFvS0_PhS4_S4_E -__Z17IODTGetCellCountsP15IORegistryEntryPmS1_ -__Z22IODTResolveAddressCellP15IORegistryEntryPmS1_S1_ -__Z23IODTFindMatchingEntriesP15IORegistryEntrymPKc -__ZN10AppleMacIO9metaClassE -__ZN10IOWorkLoop19workLoopWithOptionsEm -__ZN10IOWorkLoop9sleepGateEPv12UnsignedWidem -__ZN10IOWorkLoop9sleepGateEPvm -__ZN11IOCatalogue11findDriversEP12OSDictionaryPl -__ZN11IOCatalogue11findDriversEP9IOServicePl -__ZN11IODataQueue11withEntriesEmm -__ZN11IODataQueue12withCapacityEm -__ZN11IODataQueue15initWithEntriesEmm -__ZN11IODataQueue16initWithCapacityEm -__ZN11IODataQueue7enqueueEPvm -__ZN11IOMemoryMap10getAddressEv -__ZN11IOMemoryMap18getPhysicalSegmentEmPm -__ZN11IOMemoryMap19setMemoryDescriptorEP18IOMemoryDescriptory -__ZN11IOMemoryMap7getSizeEv -__ZN11IOMemoryMap8redirectEP18IOMemoryDescriptormm -__ZN11IOMemoryMap8redirectEP18IOMemoryDescriptormy -__ZN12IODMACommand11OutputBig32EPS_NS_9Segment64EPvm -__ZN12IODMACommand11OutputBig64EPS_NS_9Segment64EPvm -__ZN12IODMACommand11synchronizeEm -__ZN12IODMACommand12OutputHost32EPS_NS_9Segment64EPvm -__ZN12IODMACommand12OutputHost64EPS_NS_9Segment64EPvm -__ZN12IODMACommand14OutputLittle32EPS_NS_9Segment64EPvm -__ZN12IODMACommand14OutputLittle64EPS_NS_9Segment64EPvm -__ZN12IODMACommand15genIOVMSegmentsEPyPvPm -__ZN12IODMACommand17withSpecificationEPFbPS_NS_9Segment64EPvmEhyNS_14MappingOptionsEymP8IOMapperS2_ -__ZN12IODMACommand21initWithSpecificationEPFbPS_NS_9Segment64EPvmEhyNS_14MappingOptionsEymP8IOMapperS2_ -__ZN12IODMACommand24prepareWithSpecificationEPFbPS_NS_9Segment64EPvmEhyNS_14MappingOptionsEymP8IOMapperyybb 
-__ZN12IODMACommand8transferEmyPvy -__ZN12IOUserClient12initWithTaskEP4taskPvm -__ZN12IOUserClient12initWithTaskEP4taskPvmP12OSDictionary -__ZN12IOUserClient15mapClientMemoryEmP4taskmj -__ZN12IOUserClient15sendAsyncResultEPjiPPvm -__ZN12IOUserClient17mapClientMemory64EmP4taskmy -__ZN12IOUserClient17sendAsyncResult64EPyiS0_m -__ZN12IOUserClient19clientMemoryForTypeEmPmPP18IOMemoryDescriptor -__ZN12IOUserClient19setAsyncReference64EPyP8ipc_portyy -__ZN12IOUserClient23getExternalTrapForIndexEm -__ZN12IOUserClient24getNotificationSemaphoreEmPP9semaphore -__ZN12IOUserClient24getTargetAndTrapForIndexEPP9IOServicem -__ZN12IOUserClient24registerNotificationPortEP8ipc_portmm -__ZN12IOUserClient24registerNotificationPortEP8ipc_portmy -__ZN12IOUserClient25getExternalMethodForIndexEm -__ZN12IOUserClient26getTargetAndMethodForIndexEPP9IOServicem -__ZN12IOUserClient30getExternalAsyncMethodForIndexEm -__ZN12IOUserClient31getAsyncTargetAndMethodForIndexEPP9IOServicem -__ZN13IOCommandGate12commandSleepEPv12UnsignedWidem -__ZN13IOCommandGate12commandSleepEPvm -__ZN13IOCommandPool11commandPoolEP9IOServiceP10IOWorkLoopm -__ZN13IOCommandPool4initEP9IOServiceP10IOWorkLoopm -__ZN13IOEventSource9sleepGateEPv12UnsignedWidem -__ZN13IOEventSource9sleepGateEPvm -__ZN13_IOServiceJob8startJobEP9IOServiceim -__ZN14IOCommandQueue10gMetaClassE -__ZN14IOCommandQueue10superClassE -__ZN14IOCommandQueue12checkForWorkEv -__ZN14IOCommandQueue12commandQueueEP8OSObjectPFvS1_PvS2_S2_S2_Ei -__ZN14IOCommandQueue14enqueueCommandEbPvS0_S0_S0_ -__ZN14IOCommandQueue15performAndFlushEP8OSObjectPFvS1_PvS2_S2_S2_E -__ZN14IOCommandQueue4freeEv -__ZN14IOCommandQueue4initEP8OSObjectPFvS1_PvS2_S2_S2_Ei -__ZN14IOCommandQueue9MetaClassC1Ev -__ZN14IOCommandQueue9MetaClassC2Ev -__ZN14IOCommandQueue9metaClassE -__ZN14IOCommandQueueC1EPK11OSMetaClass -__ZN14IOCommandQueueC1Ev -__ZN14IOCommandQueueC2EPK11OSMetaClass -__ZN14IOCommandQueueC2Ev -__ZN14IOCommandQueueD0Ev -__ZN14IOCommandQueueD2Ev -__ZN14IODeviceMemory12withSubRangeEPS_mm -__ZN14IODeviceMemory13arrayFromListEPNS_11InitElementEm -__ZN14IODeviceMemory9withRangeEmm -__ZN14IOMemoryCursor17withSpecificationEPFvNS_15PhysicalSegmentEPvmEmmm -__ZN14IOMemoryCursor19genPhysicalSegmentsEP18IOMemoryDescriptormPvmmPm -__ZN14IOMemoryCursor21initWithSpecificationEPFvNS_15PhysicalSegmentEPvmEmmm -__ZN14IOPMrootDomain17setSleepSupportedEm -__ZN14IOPMrootDomain19sysPowerDownHandlerEPvS0_mP9IOServiceS0_j -__ZN14IOPMrootDomain24receivePowerNotificationEm -__ZN14IOPMrootDomain27displayWranglerNotificationEPvS0_mP9IOServiceS0_j -__ZN15IODMAController13getControllerEP9IOServicem -__ZN15IODMAController16notifyDMACommandEP16IODMAEventSourceP12IODMACommandim -__ZN15IODMAController20createControllerNameEm -__ZN15IODMAController21registerDMAControllerEm -__ZN16AppleMacIODevice9metaClassE -__ZN16IODMAEventSource14dmaEventSourceEP8OSObjectP9IOServicePFvS1_PS_P12IODMACommandimES8_m -__ZN16IODMAEventSource15startDMACommandEP12IODMACommand11IODirectionmm -__ZN16IODMAEventSource16notifyDMACommandEP12IODMACommandim -__ZN16IODMAEventSource4initEP8OSObjectP9IOServicePFvS1_PS_P12IODMACommandimES8_m -__ZN16IOKitDiagnostics12updateOffsetEP12OSDictionarymPKc -__ZN16IORangeAllocator10deallocateEmm -__ZN16IORangeAllocator12allocElementEm -__ZN16IORangeAllocator13allocateRangeEmm -__ZN16IORangeAllocator14deallocElementEm -__ZN16IORangeAllocator28setFragmentCapacityIncrementEm -__ZN16IORangeAllocator4initEmmmm -__ZN16IORangeAllocator8allocateEmPmm -__ZN16IORangeAllocator9withRangeEmmmm 
-__ZN17IOBigMemoryCursor13outputSegmentEN14IOMemoryCursor15PhysicalSegmentEPvm -__ZN17IOBigMemoryCursor17withSpecificationEmmm -__ZN17IOBigMemoryCursor21initWithSpecificationEmmm -__ZN17IOSharedDataQueue11withEntriesEmm -__ZN17IOSharedDataQueue12withCapacityEm -__ZN17IOSharedDataQueue16initWithCapacityEm -__ZN17IOSharedDataQueue7dequeueEPvPm -__ZN18IOMemoryDescriptor10setMappingEP4taskjm -__ZN18IOMemoryDescriptor10withRangesEP14IOVirtualRangem11IODirectionP4taskb -__ZN18IOMemoryDescriptor10writeBytesEmPKvm -__ZN18IOMemoryDescriptor11makeMappingEPS_P4taskjmmm -__ZN18IOMemoryDescriptor11withAddressEPvm11IODirection -__ZN18IOMemoryDescriptor11withAddressEjm11IODirectionP4task -__ZN18IOMemoryDescriptor11withOptionsEPvmmP4taskmP8IOMapper -__ZN18IOMemoryDescriptor12setPurgeableEmPm -__ZN18IOMemoryDescriptor12withSubRangeEPS_mm11IODirection -__ZN18IOMemoryDescriptor14initWithRangesEP14IOVirtualRangem11IODirectionP4taskb -__ZN18IOMemoryDescriptor15initWithAddressEPvm11IODirection -__ZN18IOMemoryDescriptor15initWithAddressEjm11IODirectionP4task -__ZN18IOMemoryDescriptor15initWithOptionsEPvmmP4taskmP8IOMapper -__ZN18IOMemoryDescriptor16getSourceSegmentEmPm -__ZN18IOMemoryDescriptor16performOperationEmmm -__ZN18IOMemoryDescriptor16withAddressRangeEyymP4task -__ZN18IOMemoryDescriptor17getVirtualSegmentEmPm -__ZN18IOMemoryDescriptor17withAddressRangesEP14IOAddressRangemmP4task -__ZN18IOMemoryDescriptor18getPhysicalSegmentEmPm -__ZN18IOMemoryDescriptor18getPhysicalSegmentEmPmm -__ZN18IOMemoryDescriptor18withPhysicalRangesEP15IOPhysicalRangem11IODirectionb -__ZN18IOMemoryDescriptor19createMappingInTaskEP4taskymyy -__ZN18IOMemoryDescriptor19withPhysicalAddressEmm11IODirection -__ZN18IOMemoryDescriptor20getPhysicalSegment64EmPm -__ZN18IOMemoryDescriptor22initWithPhysicalRangesEP15IOPhysicalRangem11IODirectionb -__ZN18IOMemoryDescriptor23initWithPhysicalAddressEmm11IODirection -__ZN18IOMemoryDescriptor28_RESERVEDIOMemoryDescriptor8Ev -__ZN18IOMemoryDescriptor28_RESERVEDIOMemoryDescriptor9Ev -__ZN18IOMemoryDescriptor29_RESERVEDIOMemoryDescriptor10Ev -__ZN18IOMemoryDescriptor29_RESERVEDIOMemoryDescriptor11Ev -__ZN18IOMemoryDescriptor29_RESERVEDIOMemoryDescriptor12Ev -__ZN18IOMemoryDescriptor29_RESERVEDIOMemoryDescriptor13Ev -__ZN18IOMemoryDescriptor29_RESERVEDIOMemoryDescriptor14Ev -__ZN18IOMemoryDescriptor29_RESERVEDIOMemoryDescriptor15Ev -__ZN18IOMemoryDescriptor3mapEP4taskjmmm -__ZN18IOMemoryDescriptor3mapEm -__ZN18IOMemoryDescriptor5doMapEP7_vm_mapPjmmm -__ZN18IOMemoryDescriptor6setTagEm -__ZN18IOMemoryDescriptor7doUnmapEP7_vm_mapjm -__ZN18IOMemoryDescriptor9readBytesEmPvm -__ZN18IORegistryIterator11iterateOverEP15IORegistryEntryPK15IORegistryPlanem -__ZN18IORegistryIterator11iterateOverEPK15IORegistryPlanem -__ZN18IOTimerEventSource10setTimeoutE12UnsignedWide -__ZN18IOTimerEventSource10setTimeoutE13mach_timespec -__ZN18IOTimerEventSource10setTimeoutEmm -__ZN18IOTimerEventSource10wakeAtTimeE12UnsignedWide -__ZN18IOTimerEventSource10wakeAtTimeE13mach_timespec -__ZN18IOTimerEventSource10wakeAtTimeEmm -__ZN18IOTimerEventSource12setTimeoutMSEm -__ZN18IOTimerEventSource12setTimeoutUSEm -__ZN18IOTimerEventSource12wakeAtTimeMSEm -__ZN18IOTimerEventSource12wakeAtTimeUSEm -__ZN18IOTimerEventSource15setTimeoutTicksEm -__ZN18IOTimerEventSource15wakeAtTimeTicksEm -__ZN19IODBDMAMemoryCursor10gMetaClassE -__ZN19IODBDMAMemoryCursor10superClassE -__ZN19IODBDMAMemoryCursor17withSpecificationEmmm -__ZN19IODBDMAMemoryCursor21initWithSpecificationEmmm -__ZN19IODBDMAMemoryCursor9MetaClassC1Ev 
-__ZN19IODBDMAMemoryCursor9MetaClassC2Ev -__ZN19IODBDMAMemoryCursor9metaClassE -__ZN19IODBDMAMemoryCursorC1EPK11OSMetaClass -__ZN19IODBDMAMemoryCursorC1Ev -__ZN19IODBDMAMemoryCursorC2EPK11OSMetaClass -__ZN19IODBDMAMemoryCursorC2Ev -__ZN19IODBDMAMemoryCursorD0Ev -__ZN19IODBDMAMemoryCursorD2Ev -__ZN20IOLittleMemoryCursor13outputSegmentEN14IOMemoryCursor15PhysicalSegmentEPvm -__ZN20IOLittleMemoryCursor17withSpecificationEmmm -__ZN20IOLittleMemoryCursor21initWithSpecificationEmmm -__ZN20RootDomainUserClient15setPreventativeEmm -__ZN20RootDomainUserClient26getTargetAndMethodForIndexEPP9IOServicem -__ZN21IOInterruptController10initVectorElP17IOInterruptVector -__ZN21IOInterruptController11causeVectorElP17IOInterruptVector -__ZN21IOInterruptController12enableVectorElP17IOInterruptVector -__ZN21IOInterruptController13getVectorTypeElP17IOInterruptVector -__ZN21IOInterruptController17disableVectorHardElP17IOInterruptVector -__ZN21IOInterruptController17vectorCanBeSharedElP17IOInterruptVector -__ZN21IONaturalMemoryCursor13outputSegmentEN14IOMemoryCursor15PhysicalSegmentEPvm -__ZN21IONaturalMemoryCursor17withSpecificationEmmm -__ZN21IONaturalMemoryCursor21initWithSpecificationEmmm -__ZN21IOSubMemoryDescriptor11makeMappingEP18IOMemoryDescriptorP4taskjmmm -__ZN21IOSubMemoryDescriptor12initSubRangeEP18IOMemoryDescriptormm11IODirection -__ZN21IOSubMemoryDescriptor12setPurgeableEmPm -__ZN21IOSubMemoryDescriptor12withSubRangeEP18IOMemoryDescriptormmm -__ZN21IOSubMemoryDescriptor18getPhysicalSegmentEmPmm -__ZN21IOSubMemoryDescriptor7prepareE11IODirection -__ZN21IOSubMemoryDescriptor8completeE11IODirection -__ZN23IOMultiMemoryDescriptor15withDescriptorsEPP18IOMemoryDescriptorm11IODirectionb -__ZN23IOMultiMemoryDescriptor18getPhysicalSegmentEmPmm -__ZN23IOMultiMemoryDescriptor19initWithDescriptorsEPP18IOMemoryDescriptorm11IODirectionb -__ZN23IOMultiMemoryDescriptor7prepareE11IODirection -__ZN23IOMultiMemoryDescriptor8completeE11IODirection -__ZN24IOBufferMemoryDescriptor11appendBytesEPKvj -__ZN24IOBufferMemoryDescriptor11withOptionsEmjj -__ZN24IOBufferMemoryDescriptor12setDirectionE11IODirection -__ZN24IOBufferMemoryDescriptor12withCapacityEj11IODirectionb -__ZN24IOBufferMemoryDescriptor13initWithBytesEPKvj11IODirectionb -__ZN24IOBufferMemoryDescriptor14getBytesNoCopyEjj -__ZN24IOBufferMemoryDescriptor15initWithOptionsEmjj -__ZN24IOBufferMemoryDescriptor15initWithOptionsEmjjP4task -__ZN24IOBufferMemoryDescriptor17inTaskWithOptionsEP4taskmjj -__ZN24IOBufferMemoryDescriptor20initWithPhysicalMaskEP4taskmyyy -__ZN24IOBufferMemoryDescriptor22inTaskWithPhysicalMaskEP4taskmyy -__ZN24IOBufferMemoryDescriptor34_RESERVEDIOBufferMemoryDescriptor2Ev -__ZN24IOBufferMemoryDescriptor34_RESERVEDIOBufferMemoryDescriptor3Ev -__ZN24IOBufferMemoryDescriptor34_RESERVEDIOBufferMemoryDescriptor4Ev -__ZN24IOBufferMemoryDescriptor34_RESERVEDIOBufferMemoryDescriptor5Ev -__ZN24IOBufferMemoryDescriptor34_RESERVEDIOBufferMemoryDescriptor6Ev -__ZN24IOBufferMemoryDescriptor34_RESERVEDIOBufferMemoryDescriptor7Ev -__ZN24IOBufferMemoryDescriptor34_RESERVEDIOBufferMemoryDescriptor8Ev -__ZN24IOBufferMemoryDescriptor34_RESERVEDIOBufferMemoryDescriptor9Ev -__ZN24IOBufferMemoryDescriptor35_RESERVEDIOBufferMemoryDescriptor10Ev -__ZN24IOBufferMemoryDescriptor35_RESERVEDIOBufferMemoryDescriptor11Ev -__ZN24IOBufferMemoryDescriptor35_RESERVEDIOBufferMemoryDescriptor12Ev -__ZN24IOBufferMemoryDescriptor35_RESERVEDIOBufferMemoryDescriptor13Ev -__ZN24IOBufferMemoryDescriptor35_RESERVEDIOBufferMemoryDescriptor14Ev 
-__ZN24IOBufferMemoryDescriptor35_RESERVEDIOBufferMemoryDescriptor15Ev -__ZN24IOBufferMemoryDescriptor9setLengthEj -__ZN24IOBufferMemoryDescriptor9withBytesEPKvj11IODirectionb -__ZN25IOGeneralMemoryDescriptor11setPositionEm -__ZN25IOGeneralMemoryDescriptor11wireVirtualE11IODirection -__ZN25IOGeneralMemoryDescriptor12setPurgeableEmPm -__ZN25IOGeneralMemoryDescriptor13mapIntoKernelEj -__ZN25IOGeneralMemoryDescriptor14initWithRangesEP14IOVirtualRangem11IODirectionP4taskb -__ZN25IOGeneralMemoryDescriptor15initWithAddressEPvm11IODirection -__ZN25IOGeneralMemoryDescriptor15initWithAddressEjm11IODirectionP4task -__ZN25IOGeneralMemoryDescriptor15initWithOptionsEPvmmP4taskmP8IOMapper -__ZN25IOGeneralMemoryDescriptor15unmapFromKernelEv -__ZN25IOGeneralMemoryDescriptor16getSourceSegmentEmPm -__ZN25IOGeneralMemoryDescriptor17getVirtualSegmentEmPm -__ZN25IOGeneralMemoryDescriptor18getPhysicalSegmentEmPm -__ZN25IOGeneralMemoryDescriptor18getPhysicalSegmentEmPmm -__ZN25IOGeneralMemoryDescriptor20getPhysicalSegment64EmPm -__ZN25IOGeneralMemoryDescriptor22initWithPhysicalRangesEP15IOPhysicalRangem11IODirectionb -__ZN25IOGeneralMemoryDescriptor23initWithPhysicalAddressEmm11IODirection -__ZN25IOGeneralMemoryDescriptor5doMapEP7_vm_mapPjmmm -__ZN25IOGeneralMemoryDescriptor7doUnmapEP7_vm_mapjm -__ZN25IOGeneralMemoryDescriptor7prepareE11IODirection -__ZN25IOGeneralMemoryDescriptor8completeE11IODirection -__ZN29IOInterleavedMemoryDescriptor12withCapacityEm11IODirection -__ZN29IOInterleavedMemoryDescriptor16initWithCapacityEm11IODirection -__ZN29IOInterleavedMemoryDescriptor18getPhysicalSegmentEmPmm -__ZN29IOInterleavedMemoryDescriptor19setMemoryDescriptorEP18IOMemoryDescriptormm -__ZN29IOInterleavedMemoryDescriptor22clearMemoryDescriptorsE11IODirection -__ZN29IOInterleavedMemoryDescriptor7prepareE11IODirection -__ZN29IOInterleavedMemoryDescriptor8completeE11IODirection -__ZN8IOMapper10allocTableEm -__ZN8IOMapper10iovmInsertEjmP13upl_page_infom -__ZN8IOMapper10iovmInsertEjmPjm -__ZN8IOMapper11NewARTTableEmPPvPj -__ZN8IOMapper12FreeARTTableEP6OSDatam -__ZN8IOPMprot10gMetaClassE -__ZN8IOPMprot10superClassE -__ZN8IOPMprot9MetaClassC1Ev -__ZN8IOPMprot9MetaClassC2Ev -__ZN8IOPMprot9metaClassE -__ZN8IOPMprotC1EPK11OSMetaClass -__ZN8IOPMprotC1Ev -__ZN8IOPMprotC2EPK11OSMetaClass -__ZN8IOPMprotC2Ev -__ZN8IOPMprotD0Ev -__ZN8IOPMprotD2Ev -__ZN9IOService10adjustBusyEl -__ZN9IOService10handleOpenEPS_mPv -__ZN9IOService10systemWakeEv -__ZN9IOService10youAreRootEv -__ZN9IOService11_adjustBusyEl -__ZN9IOService11handleCloseEPS_m -__ZN9IOService11tellClientsEi -__ZN9IOService12clampPowerOnEm -__ZN9IOService12didTerminateEPS_mPb -__ZN9IOService12requestProbeEm -__ZN9IOService12waitForStateEmmP13mach_timespec -__ZN9IOService13getPMworkloopEv -__ZN9IOService13messageClientEmP8OSObjectPvj -__ZN9IOService13newUserClientEP4taskPvmP12OSDictionaryPP12IOUserClient -__ZN9IOService13newUserClientEP4taskPvmPP12IOUserClient -__ZN9IOService13startMatchingEm -__ZN9IOService13waitMatchIdleEm -__ZN9IOService13willTerminateEPS_m -__ZN9IOService14actionFinalizeEPS_m -__ZN9IOService14doServiceMatchEm -__ZN9IOService14messageClientsEmPvj -__ZN9IOService14newTemperatureElPS_ -__ZN9IOService14setPowerParentEP17IOPowerConnectionbm -__ZN9IOService15addNotificationEPK8OSSymbolP12OSDictionaryPFbPvS5_PS_ES5_S5_l -__ZN9IOService15nextIdleTimeoutE12UnsignedWideS0_j -__ZN9IOService15registerServiceEm -__ZN9IOService15tellChangeDown1Em -__ZN9IOService15tellChangeDown2Em -__ZN9IOService15terminateClientEPS_m -__ZN9IOService15terminatePhase1Em 
-__ZN9IOService15terminateWorkerEm -__ZN9IOService16ack_timer_tickedEv -__ZN9IOService16command_receivedEPvS0_S0_S0_ -__ZN9IOService16didYouWakeSystemEv -__ZN9IOService16registerInterestEPK8OSSymbolPFiPvS3_mPS_S3_jES3_S3_ -__ZN9IOService16requestTerminateEPS_m -__ZN9IOService16setCPUSnoopDelayEm -__ZN9IOService18actionDidTerminateEPS_m -__ZN9IOService18doServiceTerminateEm -__ZN9IOService18matchPropertyTableEP12OSDictionaryPl -__ZN9IOService18requireMaxBusStallEm -__ZN9IOService18settleTimerExpiredEv -__ZN9IOService18systemWillShutdownEm -__ZN9IOService19actionWillTerminateEPS_mP7OSArray -__ZN9IOService19deliverNotificationEPK8OSSymbolmm -__ZN9IOService19installNotificationEPK8OSSymbolP12OSDictionaryPFbPvS5_PS_ES5_S5_lPP10OSIterator -__ZN9IOService20_RESERVEDIOService48Ev -__ZN9IOService20_RESERVEDIOService49Ev -__ZN9IOService20_RESERVEDIOService50Ev -__ZN9IOService20_RESERVEDIOService51Ev -__ZN9IOService20_RESERVEDIOService52Ev -__ZN9IOService20_RESERVEDIOService53Ev -__ZN9IOService20_RESERVEDIOService54Ev -__ZN9IOService20_RESERVEDIOService55Ev -__ZN9IOService20_RESERVEDIOService56Ev -__ZN9IOService20_RESERVEDIOService57Ev -__ZN9IOService20_RESERVEDIOService58Ev -__ZN9IOService20_RESERVEDIOService59Ev -__ZN9IOService20_RESERVEDIOService60Ev -__ZN9IOService20_RESERVEDIOService61Ev -__ZN9IOService20_RESERVEDIOService62Ev -__ZN9IOService20_RESERVEDIOService63Ev -__ZN9IOService22PM_Clamp_Timer_ExpiredEv -__ZN9IOService22powerDomainDidChangeToEmP17IOPowerConnection -__ZN9IOService23acknowledgeNotificationEPvm -__ZN9IOService23addMatchingNotificationEPK8OSSymbolP12OSDictionaryPFbPvS5_PS_P10IONotifierES5_S5_l -__ZN9IOService23powerDomainWillChangeToEmP17IOPowerConnection -__ZN9IOService23scheduleTerminatePhase2Em -__ZN9IOService23tellClientsWithResponseEi -__ZN9IOService24PM_idle_timer_expirationEv -__ZN9IOService24mapDeviceMemoryWithIndexEjm -__ZN9IOService26temperatureCriticalForZoneEPS_ -__ZN9IOService27serializedAllowPowerChange2Em -__ZN9IOService28serializedCancelPowerChange2Em -__ZN9IOService4openEPS_mPv -__ZN9IOService5closeEPS_m -__ZN9IOService5probeEPS_Pl -__ZN9IOService6PMfreeEv -__ZN9IOService7messageEmPS_Pv -__ZN9IOService8finalizeEm -__ZN9IOService9terminateEm -__ZNK11IOCatalogue13serializeDataEmP11OSSerialize -__ZNK14IOCommandQueue12getMetaClassEv -__ZNK14IOCommandQueue9MetaClass5allocEv -__ZNK15IORegistryEntry11getPropertyEPK8OSStringPK15IORegistryPlanem -__ZNK15IORegistryEntry11getPropertyEPK8OSSymbolPK15IORegistryPlanem -__ZNK15IORegistryEntry11getPropertyEPKcPK15IORegistryPlanem -__ZNK15IORegistryEntry12copyPropertyEPK8OSStringPK15IORegistryPlanem -__ZNK15IORegistryEntry12copyPropertyEPK8OSSymbolPK15IORegistryPlanem -__ZNK15IORegistryEntry12copyPropertyEPKcPK15IORegistryPlanem -__ZNK18IOMemoryDescriptor19dmaCommandOperationEmPvj -__ZNK19IODBDMAMemoryCursor12getMetaClassEv -__ZNK19IODBDMAMemoryCursor9MetaClass5allocEv -__ZNK25IOGeneralMemoryDescriptor19dmaCommandOperationEmPvj -__ZNK8IOPMprot12getMetaClassEv -__ZNK8IOPMprot9MetaClass5allocEv -__ZTV14IOCommandQueue -__ZTV19IODBDMAMemoryCursor -__ZTV8IOPMprot -__ZTVN14IOCommandQueue9MetaClassE -__ZTVN19IODBDMAMemoryCursor9MetaClassE -__ZTVN8IOPMprot9MetaClassE diff --git a/config/IOKit.x86_64.exports b/config/IOKit.x86_64.exports index d3067b6e0..6f986aea6 100644 --- a/config/IOKit.x86_64.exports +++ b/config/IOKit.x86_64.exports @@ -226,7 +226,6 @@ __ZN9IOService13newUserClientEP4taskPvjPP12IOUserClient __ZN9IOService13startMatchingEj __ZN9IOService13waitMatchIdleEj __ZN9IOService13willTerminateEPS_j 
-__ZN9IOService14actionFinalizeEPS_j __ZN9IOService14doServiceMatchEj __ZN9IOService14messageClientsEjPvm __ZN9IOService15addNotificationEPK8OSSymbolP12OSDictionaryPFbPvS5_PS_ES5_S5_i @@ -238,7 +237,6 @@ __ZN9IOService15terminateWorkerEj __ZN9IOService16registerInterestEPK8OSSymbolPFiPvS3_jPS_S3_mES3_S3_ __ZN9IOService16requestTerminateEPS_j __ZN9IOService16setCPUSnoopDelayEj -__ZN9IOService18actionDidTerminateEPS_j __ZN9IOService18doServiceTerminateEj __ZN9IOService18matchPropertyTableEP12OSDictionaryPi __ZN9IOService18requireMaxBusStallEj @@ -249,7 +247,6 @@ __ZN9IOService19_RESERVEDIOService2Ev __ZN9IOService19_RESERVEDIOService3Ev __ZN9IOService19_RESERVEDIOService4Ev __ZN9IOService19_RESERVEDIOService5Ev -__ZN9IOService19actionWillTerminateEPS_jP7OSArray __ZN9IOService19deliverNotificationEPK8OSSymboljj __ZN9IOService23acknowledgeNotificationEPvj __ZN9IOService23addMatchingNotificationEPK8OSSymbolP12OSDictionaryPFbPvS5_PS_P10IONotifierES5_S5_i diff --git a/config/Libkern.exports b/config/Libkern.exports index 2e7ff44dd..4bd05a193 100644 --- a/config/Libkern.exports +++ b/config/Libkern.exports @@ -1,4 +1,4 @@ -___bzero:_bzero +___bzero _Assert _MD5Final _MD5Init @@ -744,6 +744,7 @@ _deflateSetDictionary _ffs _flush_dcache _flush_dcache64 +_gOSKextUnresolved _inet_ntop _inflate _inflateEnd diff --git a/config/Libkern.i386.exports b/config/Libkern.i386.exports index 31d172284..d1a97b9ee 100644 --- a/config/Libkern.i386.exports +++ b/config/Libkern.i386.exports @@ -1,4 +1,7 @@ _lck_mtx_unlock_darwin10 +_lck_mtx_lock_spin +_lck_mtx_try_lock_spin +_lck_mtx_convert_spin _OSAddAtomic64 _OSCompareAndSwap64 _OSRuntimeFinalizeCPP diff --git a/config/Libkern.ppc.exports b/config/Libkern.ppc.exports deleted file mode 100644 index ebf87f219..000000000 --- a/config/Libkern.ppc.exports +++ /dev/null @@ -1,29 +0,0 @@ -_OSDequeueAtomic -_OSEnqueueAtomic -_OSRuntimeFinalizeCPP -_OSRuntimeInitializeCPP -_OSRuntimeUnloadCPP -_OSRuntimeUnloadCPPForSegment -__ZN12OSOrderedSet12withCapacityEjPFlPK15OSMetaClassBaseS2_PvES3_ -__ZN12OSOrderedSet16initWithCapacityEjPFlPK15OSMetaClassBaseS2_PvES3_ -__ZN8OSObject19_RESERVEDOSObject16Ev -__ZN8OSObject19_RESERVEDOSObject17Ev -__ZN8OSObject19_RESERVEDOSObject18Ev -__ZN8OSObject19_RESERVEDOSObject19Ev -__ZN8OSObject19_RESERVEDOSObject20Ev -__ZN8OSObject19_RESERVEDOSObject21Ev -__ZN8OSObject19_RESERVEDOSObject22Ev -__ZN8OSObject19_RESERVEDOSObject23Ev -__ZN8OSObject19_RESERVEDOSObject24Ev -__ZN8OSObject19_RESERVEDOSObject25Ev -__ZN8OSObject19_RESERVEDOSObject26Ev -__ZN8OSObject19_RESERVEDOSObject27Ev -__ZN8OSObject19_RESERVEDOSObject28Ev -__ZN8OSObject19_RESERVEDOSObject29Ev -__ZN8OSObject19_RESERVEDOSObject30Ev -__ZN8OSObject19_RESERVEDOSObject31Ev -_bcopy_nc -_bzero_nc -_sprintf -_strcat -_strcpy diff --git a/config/Libkern.x86_64.exports b/config/Libkern.x86_64.exports index 639d10368..c42f577d8 100644 --- a/config/Libkern.x86_64.exports +++ b/config/Libkern.x86_64.exports @@ -1,8 +1,10 @@ +_lck_mtx_lock_spin +_lck_mtx_try_lock_spin +_lck_mtx_convert_spin _OSAddAtomic64 _OSCompareAndSwap64 __ZN12OSOrderedSet12withCapacityEjPFiPK15OSMetaClassBaseS2_PvES3_ __ZN12OSOrderedSet16initWithCapacityEjPFiPK15OSMetaClassBaseS2_PvES3_ -_gOSKextUnresolved _sprintf _strcat _strcpy diff --git a/config/MACFramework.exports b/config/MACFramework.exports index cba6d7dae..839eadc4f 100644 --- a/config/MACFramework.exports +++ b/config/MACFramework.exports @@ -8,6 +8,8 @@ _mac_label_set _mac_audit_text +_mac_iokit_check_hid_control + _sbuf_cat _sbuf_data _sbuf_delete diff --git 
a/config/MACFramework.ppc.exports b/config/MACFramework.ppc.exports deleted file mode 100644 index 6006136b4..000000000 --- a/config/MACFramework.ppc.exports +++ /dev/null @@ -1,9 +0,0 @@ -_kau_will_audit -_mac_kalloc -_mac_kalloc_noblock -_mac_kfree -_mac_mbuf_alloc -_mac_mbuf_free -_mac_unwire -_mac_wire -_sysctl__security_mac_children diff --git a/config/Mach.ppc.exports b/config/Mach.ppc.exports deleted file mode 100644 index cc31a814e..000000000 --- a/config/Mach.ppc.exports +++ /dev/null @@ -1 +0,0 @@ -_semaphore_timedwait diff --git a/config/Makefile b/config/Makefile index 9a00f1027..ff2d46ddb 100644 --- a/config/Makefile +++ b/config/Makefile @@ -1,5 +1,3 @@ -MAC = defined - export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule @@ -12,31 +10,22 @@ include $(MakeInc_def) ALL_SUBDIRS = INSTINC_SUBDIRS = - -INSTINC_SUBDIRS_PPC = - INSTINC_SUBDIRS_I386 = - INSTINC_SUBDIRS_X86_64 = - INSTINC_SUBDIRS_ARM = EXPINC_SUBDIRS = - -EXPINC_SUBDIRS_PPC = - EXPINC_SUBDIRS_I386 = - EXPINC_SUBDIRS_X86_64 = - EXPINC_SUBDIRS_ARM = + COMP_SUBDIRS = INST_SUBDIRS = -INSTALL_DATA_LIST= \ +INSTALL_KEXT_PLIST_LIST= \ System.kext/Info.plist \ System.kext/PlugIns/Libkern.kext/Info.plist \ System.kext/PlugIns/Mach.kext/Info.plist \ @@ -48,18 +37,10 @@ INSTALL_DATA_LIST= \ System.kext/PlugIns/IONVRAMFamily.kext/Info.plist \ System.kext/PlugIns/IOSystemManagement.kext/Info.plist \ System.kext/PlugIns/Unsupported.kext/Info.plist \ - System.kext/PlugIns/Private.kext/Info.plist \ - \ - System.kext/PlugIns/System6.0.kext/Info.plist \ - System.kext/PlugIns/Libkern6.0.kext/Info.plist \ - System.kext/PlugIns/Mach6.0.kext/Info.plist \ - System.kext/PlugIns/BSDKernel6.0.kext/Info.plist \ - System.kext/PlugIns/IOKit6.0.kext/Info.plist \ + System.kext/PlugIns/Private.kext/Info.plist -INSTALL_DATA_DIR= \ - /System/Library/Extensions/ +INSTALL_KEXT_DIR = /System/Library/Extensions/ -INSTMAN_SUBDIRS = MD_SUPPORTED_KPI_FILENAME="SupportedKPIs-${ARCH_CONFIG_LC}.txt" MI_SUPPORTED_KPI_FILENAME="SupportedKPIs-all-archs.txt" @@ -72,32 +53,39 @@ endif ifeq ($(ARCH_CONFIG),I386) SUPPORT_SYSTEM60_KEXT = 1 -else ifeq ($(ARCH_CONFIG),ARM) -SUPPORT_SYSTEM60_KEXT = 1 else SUPPORT_SYSTEM60_KEXT = 0 endif +ifeq ($(SUPPORT_SYSTEM60_KEXT),1) +INSTALL_KEXT_PLIST_LIST += \ + System.kext/PlugIns/System6.0.kext/Info.plist \ + System.kext/PlugIns/Libkern6.0.kext/Info.plist \ + System.kext/PlugIns/Mach6.0.kext/Info.plist \ + System.kext/PlugIns/BSDKernel6.0.kext/Info.plist \ + System.kext/PlugIns/IOKit6.0.kext/Info.plist +endif + SYMBOL_COMPONENT_LIST = \ System6.0 \ BSDKernel \ IOKit \ - Libkern \ - Mach \ - Unsupported \ - Private - -ifdef MAC -SYMBOL_COMPONENT_LIST += MACFramework -MACFRAMEWORKEXPORTS = \ - -export $(SRCROOT)/$(COMPONENT)/MACFramework.exports \ - -export $(SRCROOT)/$(COMPONENT)/MACFramework.$(ARCH_CONFIG_LC).exports -endif + Libkern \ + Mach \ + MACFramework \ + Unsupported \ + Private SYMBOL_SET_BUILD = $(foreach set, $(SYMBOL_COMPONENT_LIST), $(OBJPATH)/$(set).symbolset) SYMBOL_SET_FAT = $(foreach set, $(SYMBOL_COMPONENT_LIST), $(OBJROOT)/$(set).symbolset) -## .SUFFIXES: .symbolset .symbollist +INSTALL_KEXT_PLISTS = $(addprefix $(DSTROOT)$(INSTALL_KEXT_DIR), $(INSTALL_KEXT_PLIST_LIST)) + +$(INSTALL_KEXT_PLISTS): $(DSTROOT)$(INSTALL_KEXT_DIR)% : $(SOURCE)/% + @echo Install $< in $@ + $(_v)$(MKDIR) $(dir $@); \ + $(RM) $(RMFLAGS) $@; \ + $(INSTALL) $(DATA_INSTALL_FLAGS) $< $(dir $@) $(OBJPATH)/allsymbols: $(OBJPATH)/mach_kernel 
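# Dump the global symbol names from the freshly linked mach_kernel;
# the $(KEXT_CREATE_SYMBOL_SET) invocations below pull this list in
# via -import.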
$(_v)$(NM) -gj $< > $@ @@ -143,36 +131,20 @@ $(SYMBOL_SET_FAT): $(OBJROOT)/%.symbolset : printf "" > $@; \ fi -build_symbol_sets: $(SYMBOL_SET_BUILD) +build_symbol_sets: $(SYMBOL_SET_BUILD) $(OBJPATH)/allsymbols $(_v)$(KEXT_CREATE_SYMBOL_SET) \ $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_ALL_)) \ + $(foreach comp,$(filter-out System6.0 Private,$(SYMBOL_COMPONENT_LIST)), \ + -export $(SRCROOT)/$(COMPONENT)/$(comp).exports \ + -export $(SRCROOT)/$(COMPONENT)/$(comp).$(ARCH_CONFIG_LC).exports) \ -import $(OBJPATH)/allsymbols \ - -export $(SRCROOT)/$(COMPONENT)/Libkern.exports \ - -export $(SRCROOT)/$(COMPONENT)/Libkern.$(ARCH_CONFIG_LC).exports \ - -export $(SRCROOT)/$(COMPONENT)/Mach.exports \ - -export $(SRCROOT)/$(COMPONENT)/Mach.$(ARCH_CONFIG_LC).exports \ - -export $(SRCROOT)/$(COMPONENT)/IOKit.exports \ - -export $(SRCROOT)/$(COMPONENT)/IOKit.$(ARCH_CONFIG_LC).exports \ - -export $(SRCROOT)/$(COMPONENT)/BSDKernel.exports \ - -export $(SRCROOT)/$(COMPONENT)/BSDKernel.$(ARCH_CONFIG_LC).exports \ - $(MACFRAMEWORKEXPORTS) \ - -export $(SRCROOT)/$(COMPONENT)/Unsupported.exports \ - -export $(SRCROOT)/$(COMPONENT)/Unsupported.$(ARCH_CONFIG_LC).exports \ -output /dev/null $(_vstdout); $(_v)$(KEXT_CREATE_SYMBOL_SET) \ - $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_)) \ + $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_ALL_)) \ + $(foreach comp,$(filter-out System6.0 Unsupported,$(SYMBOL_COMPONENT_LIST)), \ + -export $(SRCROOT)/$(COMPONENT)/$(comp).exports \ + -export $(SRCROOT)/$(COMPONENT)/$(comp).$(ARCH_CONFIG_LC).exports) \ -import $(OBJPATH)/allsymbols \ - -export $(SRCROOT)/$(COMPONENT)/Libkern.exports \ - -export $(SRCROOT)/$(COMPONENT)/Libkern.$(ARCH_CONFIG_LC).exports \ - -export $(SRCROOT)/$(COMPONENT)/Mach.exports \ - -export $(SRCROOT)/$(COMPONENT)/Mach.$(ARCH_CONFIG_LC).exports \ - -export $(SRCROOT)/$(COMPONENT)/IOKit.exports \ - -export $(SRCROOT)/$(COMPONENT)/IOKit.$(ARCH_CONFIG_LC).exports \ - -export $(SRCROOT)/$(COMPONENT)/BSDKernel.exports \ - -export $(SRCROOT)/$(COMPONENT)/BSDKernel.$(ARCH_CONFIG_LC).exports \ - $(MACFRAMEWORKEXPORTS) \ - -export $(SRCROOT)/$(COMPONENT)/Private.exports \ - -export $(SRCROOT)/$(COMPONENT)/Private.$(ARCH_CONFIG_LC).exports \ -output /dev/null $(_vstdout); $(_v) $(SRCROOT)/$(COMPONENT)/list_supported.sh $(SRCROOT)/$(COMPONENT) $(ARCH_CONFIG_LC) $(OBJPATH)/${MD_SUPPORTED_KPI_FILENAME}; $(_v)if [ -n `echo $${ARCH_CONFIGS%%\ *} | grep -i $(ARCH_CONFIG)` ]; \ @@ -181,41 +153,42 @@ build_symbol_sets: $(SYMBOL_SET_BUILD) fi -install_symbol_sets: $(SYMBOL_SET_FAT) $(SRCROOT)/config/MasterVersion - $(_v)if [ -s "$(OBJROOT)/System6.0.symbolset" ]; then \ - install $(INSTALL_FLAGS) $(OBJROOT)/System6.0.symbolset $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/System6.0.kext/kernel.6.0; \ - install $(INSTALL_FLAGS) $(OBJROOT)/System6.0.symbolset $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/Mach6.0.kext/Mach6.0; \ - install $(INSTALL_FLAGS) $(OBJROOT)/System6.0.symbolset $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/BSDKernel6.0.kext/BSDKernel6.0; \ - install $(INSTALL_FLAGS) $(OBJROOT)/System6.0.symbolset $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/Libkern6.0.kext/Libkern6.0; \ - install $(INSTALL_FLAGS) $(OBJROOT)/System6.0.symbolset $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/IOKit6.0.kext/IOKit6.0; \ +install_symbol_sets: $(SYMBOL_SET_FAT) $(SRCROOT)/config/MasterVersion $(INSTALL_KEXT_PLISTS) + $(_v)if [ -s "$(OBJROOT)/System6.0.symbolset" -a $(SUPPORT_SYSTEM60_KEXT) -eq 1 ]; then \ + install $(INSTALL_FLAGS) $(OBJROOT)/System6.0.symbolset 
$(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/System6.0.kext/kernel.6.0; \ + install $(INSTALL_FLAGS) $(OBJROOT)/System6.0.symbolset $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/Mach6.0.kext/Mach6.0; \ + install $(INSTALL_FLAGS) $(OBJROOT)/System6.0.symbolset $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/BSDKernel6.0.kext/BSDKernel6.0; \ + install $(INSTALL_FLAGS) $(OBJROOT)/System6.0.symbolset $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/Libkern6.0.kext/Libkern6.0; \ + install $(INSTALL_FLAGS) $(OBJROOT)/System6.0.symbolset $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/IOKit6.0.kext/IOKit6.0; \ + fi + $(_v)if [ -s "$(OBJROOT)/BSDKernel.symbolset" ]; then \ + install $(INSTALL_FLAGS) $(OBJROOT)/BSDKernel.symbolset $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/BSDKernel.kext/BSDKernel; \ + install $(INSTALL_FLAGS) $(OBJROOT)/IOKit.symbolset $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/IOKit.kext/IOKit; \ + install $(INSTALL_FLAGS) $(OBJROOT)/Libkern.symbolset $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/Libkern.kext/Libkern; \ + install $(INSTALL_FLAGS) $(OBJROOT)/Mach.symbolset $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/Mach.kext/Mach; \ + install $(INSTALL_FLAGS) $(OBJROOT)/MACFramework.symbolset $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/MACFramework.kext/MACFramework; \ + install $(INSTALL_FLAGS) $(OBJROOT)/Unsupported.symbolset $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/Unsupported.kext/Unsupported; \ + install $(INSTALL_FLAGS) $(OBJROOT)/Private.symbolset $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/Private.kext/Private; \ fi - $(_v)install $(INSTALL_FLAGS) $(OBJROOT)/BSDKernel.symbolset $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/BSDKernel.kext/BSDKernel; - $(_v)install $(INSTALL_FLAGS) $(OBJROOT)/IOKit.symbolset $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/IOKit.kext/IOKit; - $(_v)install $(INSTALL_FLAGS) $(OBJROOT)/Libkern.symbolset $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/Libkern.kext/Libkern; - $(_v)install $(INSTALL_FLAGS) $(OBJROOT)/Mach.symbolset $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/Mach.kext/Mach; - $(_v)install $(INSTALL_FLAGS) $(OBJROOT)/Unsupported.symbolset $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/Unsupported.kext/Unsupported; - $(_v)install $(INSTALL_FLAGS) $(OBJROOT)/Private.symbolset $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/Private.kext/Private; - $(_v)$(NEWVERS) $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/Info.plist \ - $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/AppleNMI.kext/Info.plist \ - $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/ApplePlatformFamily.kext/Info.plist \ - $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/BSDKernel.kext/Info.plist \ - $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/IOKit.kext/Info.plist \ - $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/IONVRAMFamily.kext/Info.plist \ - $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/IOSystemManagement.kext/Info.plist \ - $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/Libkern.kext/Info.plist \ - $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/Mach.kext/Info.plist \ - $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/Unsupported.kext/Info.plist \ - $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/Private.kext/Info.plist; + $(_v)$(NEWVERS) $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/Info.plist \ + $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/AppleNMI.kext/Info.plist \ + 
$(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/ApplePlatformFamily.kext/Info.plist \ + $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/BSDKernel.kext/Info.plist \ + $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/IOKit.kext/Info.plist \ + $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/IONVRAMFamily.kext/Info.plist \ + $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/IOSystemManagement.kext/Info.plist \ + $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/Libkern.kext/Info.plist \ + $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/Mach.kext/Info.plist \ + $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/MACFramework.kext/Info.plist \ + $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/Unsupported.kext/Info.plist \ + $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext/PlugIns/Private.kext/Info.plist; $(_v)$(MKDIR) $(DSTROOT)/$(KRESDIR); $(_v)install $(INSTALL_FLAGS) $(OBJPATH)/$(MD_SUPPORTED_KPI_FILENAME) $(DSTROOT)/$(KRESDIR); $(_v)if [ -n `echo $${ARCH_CONFIGS%%\ *} | grep -i $(ARCH_CONFIG)` ]; then \ install $(INSTALL_FLAGS) $(OBJROOT)/$(MI_SUPPORTED_KPI_FILENAME) $(DSTROOT)/$(KRESDIR); \ fi -ifdef MAC - $(_v)install $(INSTALL_FLAGS) $(OBJROOT)/MACFramework.symbolset $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/MACFramework.kext/MACFramework; - $(_v)$(NEWVERS) $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext/PlugIns/MACFramework.kext/Info.plist -endif - $(_v)$(CP) -rf $(DSTROOT)/$(INSTALL_DATA_DIR)/System.kext $(SYMROOT) + $(_v)$(MKDIR) $(SYMROOT) + $(_v)$(CP) -rf $(DSTROOT)/$(INSTALL_KEXT_DIR)/System.kext $(SYMROOT) do_build_all: build_symbol_sets diff --git a/config/MasterVersion b/config/MasterVersion index 237d3331d..8f5b9dd34 100644 --- a/config/MasterVersion +++ b/config/MasterVersion @@ -1,4 +1,4 @@ -10.8.0 +11.0.0 # The first line of this file contains the master version number for the kernel. # All other instances of the kernel version in xnu are derived from this file. 
diff --git a/config/Private.exports b/config/Private.exports index fb730cba2..299cabf8e 100644 --- a/config/Private.exports +++ b/config/Private.exports @@ -15,15 +15,21 @@ _bdevsw _boot _bsd_hostname _bsd_set_dependency_capable +_buf_create_shadow _buf_getcpaddr _buf_setcpaddr _buf_setfilter +_buf_shadow _cdevsw +_cdevsw_setkqueueok _clalloc _clfree _cons_cinput _cp_key_store_action _cp_register_wraps +_cs_entitlements_blob_get +_ctl_id_by_name +_ctl_name_by_id _fd_rdwr _get_aiotask _hz @@ -38,7 +44,14 @@ _ip_mutex _ip_output _ip_protox _ipc_port_release_send +_kauth_cred_getgroups +_kauth_cred_guid2grnam +_kauth_cred_guid2pwnam +_kauth_cred_grnam2guid +_kauth_cred_pwnam2guid +_kdp_register_link _kdp_set_interface +_kdp_unregister_link _kdp_unregister_send_receive _kmem_alloc_kobject _linesw @@ -55,9 +68,7 @@ _m_pullup _m_split _m_trailingspace:_mbuf_trailingspace _mac_proc_set_enforce -_mbuf_get_priority -_mbuf_get_traffic_class -_mbuf_set_traffic_class +_mbuf_get_priority:_mbuf_get_traffic_class _mcl_to_paddr _mountroot_post_hook _net_add_domain @@ -65,7 +76,7 @@ _net_add_proto _net_del_domain _net_del_proto _netboot_root -_perf_monitor_register +_perf_monitor_register_* _perf_monitor_unregister _pffinddomain _pffindproto @@ -111,6 +122,7 @@ _q_to_b _register_decmpfs_decompressor _rootdev _rootvp +_rtfree _sbappendaddr _sbappendrecord _sbflush @@ -121,6 +133,7 @@ _socantsendmore _sock_getlistener _sock_release _sock_retain +_sock_setupcall _sodisconnect _sofree _sofreelastref @@ -137,13 +150,20 @@ _soreserve _sorwakeup _sosend _termioschars -_thread_tid +_thread_clear_eager_preempt _thread_dispatchqaddr +_thread_set_eager_preempt +_thread_tid _throttle_info_create _throttle_info_mount_ref _throttle_info_mount_rel _throttle_info_release _throttle_info_update +_throttle_info_ref_by_mask +_throttle_info_rel_by_mask +_throttle_info_update_by_mask +_throttle_lowpri_io +_throttle_set_thread_io_policy _timeout _tk_nin _tk_rawcc @@ -167,15 +187,26 @@ _unmountroot_pre_hook _unputc _unregister_decmpfs_decompressor _untimeout +_vnode_isdyldsharedcache _vnode_ismonitored _vnode_notify +_vnop_compound_open_desc +_vnop_compound_mkdir_desc +_vnop_compound_remove_desc +_vnop_compound_rename_desc +_vnop_compound_rmdir_desc _vnop_monitor_desc _vfs_context_bind _vfs_context_get_special_port _vfs_context_set_special_port +_vfs_devvp +_vfs_getattr +_vfs_getbyid _vfs_get_notify_attributes _vfs_mntlabel +_vfs_setcompoundopen _vfs_setunmountpreflight +_vfs_throttle_mask _vfs_vnodecovered _vm_map_copy_copy _vm_map_copy_discard @@ -184,5 +215,6 @@ _vm_map_copyin_common _vm_map_copyout _vn_getpath_fsenter _vn_searchfs_inappropriate_name +_vnode_lookup_continue_needed _sock_settclassopt _sock_gettclassopt diff --git a/config/Private.i386.exports b/config/Private.i386.exports index 5ff0653e9..b6b05d103 100644 --- a/config/Private.i386.exports +++ b/config/Private.i386.exports @@ -1,14 +1,35 @@ +_IOGetBootKeyStoreData +_SHA256_Final +_SHA256_Init +_SHA256_Update +__ZN22IOInterruptEventSource7warmCPUEy _acpi_install_wake_handler _acpi_sleep_kernel _add_fsevent _apic_table +_apply_func_phys _cpu_to_lapic _cpuid_features _cpuid_info -_gOSKextUnresolved _lapic_end_of_interrupt _lapic_unmask_perfcnt_interrupt _mp_broadcast _mp_cpus_call +_mp_cpus_call1 _need_fsevent +_pal_efi_call_in_32bit_mode +_pal_efi_call_in_64bit_mode +_pal_machine_sleep _smp_initialized +_vfs_addtrigger +_vfs_istraditionaltrigger +_vfs_resolver_auxiliary +_vfs_resolver_result +_vfs_resolver_sequence +_vfs_resolver_status +_vfs_settriggercallback 
+_vnode_trigger_update +_xts_decrypt +_xts_done +_xts_encrypt +_xts_start diff --git a/config/Private.ppc.exports b/config/Private.ppc.exports deleted file mode 100644 index 0f0b58c19..000000000 --- a/config/Private.ppc.exports +++ /dev/null @@ -1,2 +0,0 @@ -_add_fsevent -_need_fsevent diff --git a/config/Private.x86_64.exports b/config/Private.x86_64.exports index 9748fcbe7..a19ab484b 100644 --- a/config/Private.x86_64.exports +++ b/config/Private.x86_64.exports @@ -1,7 +1,13 @@ +_IOGetBootKeyStoreData +_SHA256_Final +_SHA256_Init +_SHA256_Update +__ZN22IOInterruptEventSource7warmCPUEy _acpi_install_wake_handler _acpi_sleep_kernel _add_fsevent _apic_table +_apply_func_phys _cpu_to_lapic _cpuid_features _cpuid_info @@ -9,7 +15,23 @@ _lapic_end_of_interrupt _lapic_unmask_perfcnt_interrupt _mp_broadcast _mp_cpus_call +_mp_cpus_call1 _need_fsevent +_pal_efi_call_in_32bit_mode +_pal_efi_call_in_64bit_mode _semaphore_timedwait _smp_initialized _kext_get_vm_map +_pal_machine_sleep +_vfs_addtrigger +_vfs_istraditionaltrigger +_vfs_resolver_auxiliary +_vfs_resolver_result +_vfs_resolver_sequence +_vfs_resolver_status +_vfs_settriggercallback +_vnode_trigger_update +_xts_decrypt +_xts_done +_xts_encrypt +_xts_start diff --git a/config/System6.0.exports b/config/System6.0.exports index 75146568c..c3d167834 100644 --- a/config/System6.0.exports +++ b/config/System6.0.exports @@ -620,7 +620,6 @@ __ZN13IOCommandGate10runCommandEPvS0_S0_S0_ __ZN13IOCommandGate10superClassE __ZN13IOCommandGate11commandGateEP8OSObjectPFiS1_PvS2_S2_S2_E __ZN13IOCommandGate11setWorkLoopEP10IOWorkLoop -__ZN13IOCommandGate12checkForWorkEv __ZN13IOCommandGate12commandSleepEPv12UnsignedWidem __ZN13IOCommandGate12commandSleepEPvm __ZN13IOCommandGate13attemptActionEPFiP8OSObjectPvS2_S2_S2_ES2_S2_S2_S2_ @@ -689,6 +688,7 @@ __ZN13IOEventSource23_RESERVEDIOEventSource4Ev __ZN13IOEventSource23_RESERVEDIOEventSource5Ev __ZN13IOEventSource23_RESERVEDIOEventSource6Ev __ZN13IOEventSource23_RESERVEDIOEventSource7Ev +__ZN13IOEventSource4freeEv __ZN13IOEventSource4initEP8OSObjectPFvS1_zE __ZN13IOEventSource6enableEv __ZN13IOEventSource7disableEv @@ -699,6 +699,7 @@ __ZN13IOEventSource9MetaClassC2Ev __ZN13IOEventSource9closeGateEv __ZN13IOEventSource9metaClassE __ZN13IOEventSource9setActionEPFvP8OSObjectzE +__ZN13IOEventSource12checkForWorkEv __ZN13IOEventSource9sleepGateEPv12UnsignedWidem __ZN13IOEventSource9sleepGateEPvm __ZN13IOEventSourceC1EPK11OSMetaClass @@ -1241,7 +1242,6 @@ __ZN18IOTimerEventSource10wakeAtTimeE12UnsignedWide __ZN18IOTimerEventSource10wakeAtTimeE13mach_timespec __ZN18IOTimerEventSource10wakeAtTimeEmm __ZN18IOTimerEventSource11setWorkLoopEP10IOWorkLoop -__ZN18IOTimerEventSource12checkForWorkEv __ZN18IOTimerEventSource12setTimeoutMSEm __ZN18IOTimerEventSource12setTimeoutUSEm __ZN18IOTimerEventSource12wakeAtTimeMSEm @@ -2816,7 +2816,6 @@ __start _absolutetime_to_nanoseconds _acknowledgeSleepWakeNotification _appleClut8 -_argstrcpy _assert_wait _assert_wait_timeout _atoi @@ -2952,7 +2951,6 @@ _get_inpcb_str_size _get_kernel_symfile _get_procrustime _get_task_map -_getval _invalidate_icache _invalidate_icache64 _iokit_add_reference @@ -2970,7 +2968,6 @@ _iokit_version_variant:_version_variant _ipc_port_release_send _is_suser _is_suser1 -_isargsep _kOSBooleanFalse _kOSBooleanTrue _kalloc @@ -3150,7 +3147,6 @@ _thread_call_is_delayed _thread_cancel_timer _thread_deallocate _thread_flavor_array -_thread_funnel_set _thread_policy_set _thread_reference _thread_set_timer diff --git a/config/System6.0.i386.exports 
b/config/System6.0.i386.exports index 5cb3b501c..f3955791d 100644 --- a/config/System6.0.i386.exports +++ b/config/System6.0.i386.exports @@ -18,12 +18,12 @@ _lapic_end_of_interrupt _ml_get_max_cpus _mp_broadcast _mp_cpus_call +_mp_cpus_call1 _mp_rendezvous_no_intrs -_mtrr_range_add -_mtrr_range_remove _rtc_clock_stepped _rtc_clock_stepping _smp_initialized _sprintf _strcat _strcpy +_thread_funnel_set diff --git a/config/System6.0.ppc.exports b/config/System6.0.ppc.exports deleted file mode 100644 index 6b9d3ed8c..000000000 --- a/config/System6.0.ppc.exports +++ /dev/null @@ -1,256 +0,0 @@ -_CallTVector -_OSDequeueAtomic -_OSEnqueueAtomic -_PE_Determine_Clock_Speeds -_PE_find_scc -_PE_init_taproot -_PE_parse_boot_arg -_PE_read_write_time_of_day -_PE_write_IIC -_PPCcalls -_ResetHandler -__Z11IODBDMAStopPV23IODBDMAChannelRegisters -__Z12IODBDMAFlushPV23IODBDMAChannelRegisters -__Z12IODBDMAPausePV23IODBDMAChannelRegisters -__Z12IODBDMAResetPV23IODBDMAChannelRegisters -__Z12IODBDMAStartPV23IODBDMAChannelRegistersPV17IODBDMADescriptor -__Z15IODBDMAContinuePV23IODBDMAChannelRegisters -__Z32IOFreePhysicallyContiguousMemoryPjj -__Z36IOAllocatePhysicallyContiguousMemoryjjPjPm -__ZN10AppleMacIO10deleteListEv -__ZN10AppleMacIO10gMetaClassE -__ZN10AppleMacIO10processNubEP9IOService -__ZN10AppleMacIO10superClassE -__ZN10AppleMacIO11excludeListEv -__ZN10AppleMacIO12publishBelowEP15IORegistryEntry -__ZN10AppleMacIO15getNubResourcesEP9IOService -__ZN10AppleMacIO20_RESERVEDAppleMacIO0Ev -__ZN10AppleMacIO20_RESERVEDAppleMacIO1Ev -__ZN10AppleMacIO20_RESERVEDAppleMacIO2Ev -__ZN10AppleMacIO20_RESERVEDAppleMacIO3Ev -__ZN10AppleMacIO5startEP9IOService -__ZN10AppleMacIO8selfTestEv -__ZN10AppleMacIO9MetaClassC1Ev -__ZN10AppleMacIO9MetaClassC2Ev -__ZN10AppleMacIO9createNubEP15IORegistryEntry -__ZN10AppleMacIO9metaClassE -__ZN10AppleMacIOC1EPK11OSMetaClass -__ZN10AppleMacIOC2EPK11OSMetaClass -__ZN10AppleMacIOD0Ev -__ZN10AppleMacIOD2Ev -__ZN10AppleNVRAM10gMetaClassE -__ZN10AppleNVRAM10superClassE -__ZN10AppleNVRAM4readEmPhm -__ZN10AppleNVRAM5startEP9IOService -__ZN10AppleNVRAM5writeEmPhm -__ZN10AppleNVRAM9MetaClassC1Ev -__ZN10AppleNVRAM9MetaClassC2Ev -__ZN10AppleNVRAM9metaClassE -__ZN10AppleNVRAMC1EPK11OSMetaClass -__ZN10AppleNVRAMC1Ev -__ZN10AppleNVRAMC2EPK11OSMetaClass -__ZN10AppleNVRAMC2Ev -__ZN10AppleNVRAMD0Ev -__ZN10AppleNVRAMD2Ev -__ZN11IOMemoryMap19setMemoryDescriptorEP18IOMemoryDescriptory -__ZN11IOMemoryMap8redirectEP18IOMemoryDescriptormm -__ZN11IOMemoryMap8redirectEP18IOMemoryDescriptormy -__ZN16AppleMacIODevice10gMetaClassE -__ZN16AppleMacIODevice10superClassE -__ZN16AppleMacIODevice12getResourcesEv -__ZN16AppleMacIODevice13matchLocationEP9IOService -__ZN16AppleMacIODevice26_RESERVEDAppleMacIODevice0Ev -__ZN16AppleMacIODevice26_RESERVEDAppleMacIODevice1Ev -__ZN16AppleMacIODevice26_RESERVEDAppleMacIODevice2Ev -__ZN16AppleMacIODevice26_RESERVEDAppleMacIODevice3Ev -__ZN16AppleMacIODevice9MetaClassC1Ev -__ZN16AppleMacIODevice9MetaClassC2Ev -__ZN16AppleMacIODevice9metaClassE -__ZN16AppleMacIODeviceC1EPK11OSMetaClass -__ZN16AppleMacIODeviceC1Ev -__ZN16AppleMacIODeviceC2EPK11OSMetaClass -__ZN16AppleMacIODeviceC2Ev -__ZN16AppleMacIODeviceD0Ev -__ZN16AppleMacIODeviceD2Ev -__ZN17IONVRAMController10gMetaClassE -__ZN17IONVRAMController10superClassE -__ZN17IONVRAMController4syncEv -__ZN17IONVRAMController5startEP9IOService -__ZN17IONVRAMController9MetaClassC1Ev -__ZN17IONVRAMController9MetaClassC2Ev -__ZN17IONVRAMController9metaClassE -__ZN17IONVRAMControllerC1EPK11OSMetaClass 
-__ZN17IONVRAMControllerC2EPK11OSMetaClass -__ZN17IONVRAMControllerD0Ev -__ZN17IONVRAMControllerD2Ev -__ZN19ApplePlatformExpert10deleteListEv -__ZN19ApplePlatformExpert10gMetaClassE -__ZN19ApplePlatformExpert10superClassE -__ZN19ApplePlatformExpert11excludeListEv -__ZN19ApplePlatformExpert14getMachineNameEPci -__ZN19ApplePlatformExpert15getGMTTimeOfDayEv -__ZN19ApplePlatformExpert15setGMTTimeOfDayEl -__ZN19ApplePlatformExpert23registerNVRAMControllerEP17IONVRAMController -__ZN19ApplePlatformExpert29_RESERVEDApplePlatformExpert0Ev -__ZN19ApplePlatformExpert29_RESERVEDApplePlatformExpert1Ev -__ZN19ApplePlatformExpert29_RESERVEDApplePlatformExpert2Ev -__ZN19ApplePlatformExpert29_RESERVEDApplePlatformExpert3Ev -__ZN19ApplePlatformExpert5startEP9IOService -__ZN19ApplePlatformExpert9MetaClassC1Ev -__ZN19ApplePlatformExpert9MetaClassC2Ev -__ZN19ApplePlatformExpert9configureEP9IOService -__ZN19ApplePlatformExpert9metaClassE -__ZN19ApplePlatformExpertC1EPK11OSMetaClass -__ZN19ApplePlatformExpertC2EPK11OSMetaClass -__ZN19ApplePlatformExpertD0Ev -__ZN19ApplePlatformExpertD2Ev -__ZN19IODBDMAMemoryCursor10gMetaClassE -__ZN19IODBDMAMemoryCursor10superClassE -__ZN19IODBDMAMemoryCursor13outputSegmentEN14IOMemoryCursor15PhysicalSegmentEPvm -__ZN19IODBDMAMemoryCursor17withSpecificationEmmm -__ZN19IODBDMAMemoryCursor21initWithSpecificationEmmm -__ZN19IODBDMAMemoryCursor9MetaClassC1Ev -__ZN19IODBDMAMemoryCursor9MetaClassC2Ev -__ZN19IODBDMAMemoryCursor9metaClassE -__ZN19IODBDMAMemoryCursorC1EPK11OSMetaClass -__ZN19IODBDMAMemoryCursorC1Ev -__ZN19IODBDMAMemoryCursorC2EPK11OSMetaClass -__ZN19IODBDMAMemoryCursorC2Ev -__ZN19IODBDMAMemoryCursorD0Ev -__ZN19IODBDMAMemoryCursorD2Ev -__ZN24IOBufferMemoryDescriptor22inTaskWithPhysicalMaskEP4taskmyy -__ZN8AppleCPU10gMetaClassE -__ZN8AppleCPU10getCPUNameEv -__ZN8AppleCPU10quiesceCPUEv -__ZN8AppleCPU10superClassE -__ZN8AppleCPU5startEP9IOService -__ZN8AppleCPU7haltCPUEv -__ZN8AppleCPU7initCPUEb -__ZN8AppleCPU8startCPUEjj -__ZN8AppleCPU9MetaClassC1Ev -__ZN8AppleCPU9MetaClassC2Ev -__ZN8AppleCPU9metaClassE -__ZN8AppleCPUC1EPK11OSMetaClass -__ZN8AppleCPUC1Ev -__ZN8AppleCPUC2EPK11OSMetaClass -__ZN8AppleCPUC2Ev -__ZN8AppleCPUD0Ev -__ZN8AppleCPUD2Ev -__ZN8AppleNMI10gMetaClassE -__ZN8AppleNMI10superClassE -__ZN8AppleNMI15handleInterruptEPvP9IOServicei -__ZN8AppleNMI18_RESERVEDAppleNMI0Ev -__ZN8AppleNMI18_RESERVEDAppleNMI1Ev -__ZN8AppleNMI18_RESERVEDAppleNMI2Ev -__ZN8AppleNMI18_RESERVEDAppleNMI3Ev -__ZN8AppleNMI22powerStateWillChangeToEmmP9IOService -__ZN8AppleNMI5startEP9IOService -__ZN8AppleNMI7initNMIEP21IOInterruptControllerP6OSData -__ZN8AppleNMI9MetaClassC1Ev -__ZN8AppleNMI9MetaClassC2Ev -__ZN8AppleNMI9metaClassE -__ZN8AppleNMIC1EPK11OSMetaClass -__ZN8AppleNMIC1Ev -__ZN8AppleNMIC2EPK11OSMetaClass -__ZN8AppleNMIC2Ev -__ZN8AppleNMID0Ev -__ZN8AppleNMID2Ev -__ZN8OSObject19_RESERVEDOSObject16Ev -__ZN8OSObject19_RESERVEDOSObject17Ev -__ZN8OSObject19_RESERVEDOSObject18Ev -__ZN8OSObject19_RESERVEDOSObject19Ev -__ZN8OSObject19_RESERVEDOSObject20Ev -__ZN8OSObject19_RESERVEDOSObject21Ev -__ZN8OSObject19_RESERVEDOSObject22Ev -__ZN8OSObject19_RESERVEDOSObject23Ev -__ZN8OSObject19_RESERVEDOSObject24Ev -__ZN8OSObject19_RESERVEDOSObject25Ev -__ZN8OSObject19_RESERVEDOSObject26Ev -__ZN8OSObject19_RESERVEDOSObject27Ev -__ZN8OSObject19_RESERVEDOSObject28Ev -__ZN8OSObject19_RESERVEDOSObject29Ev -__ZN8OSObject19_RESERVEDOSObject30Ev -__ZN8OSObject19_RESERVEDOSObject31Ev -__ZN9IOService20_RESERVEDIOService48Ev -__ZN9IOService20_RESERVEDIOService49Ev -__ZN9IOService20_RESERVEDIOService50Ev 
-__ZN9IOService20_RESERVEDIOService51Ev -__ZN9IOService20_RESERVEDIOService52Ev -__ZN9IOService20_RESERVEDIOService53Ev -__ZN9IOService20_RESERVEDIOService54Ev -__ZN9IOService20_RESERVEDIOService55Ev -__ZN9IOService20_RESERVEDIOService56Ev -__ZN9IOService20_RESERVEDIOService57Ev -__ZN9IOService20_RESERVEDIOService58Ev -__ZN9IOService20_RESERVEDIOService59Ev -__ZN9IOService20_RESERVEDIOService60Ev -__ZN9IOService20_RESERVEDIOService61Ev -__ZN9IOService20_RESERVEDIOService62Ev -__ZN9IOService20_RESERVEDIOService63Ev -__ZNK10AppleMacIO12getMetaClassEv -__ZNK10AppleMacIO14compareNubNameEPK9IOServiceP8OSStringPS4_ -__ZNK10AppleMacIO9MetaClass5allocEv -__ZNK10AppleNVRAM12getMetaClassEv -__ZNK10AppleNVRAM9MetaClass5allocEv -__ZNK16AppleMacIODevice11compareNameEP8OSStringPS1_ -__ZNK16AppleMacIODevice12getMetaClassEv -__ZNK16AppleMacIODevice9MetaClass5allocEv -__ZNK17IONVRAMController12getMetaClassEv -__ZNK17IONVRAMController9MetaClass5allocEv -__ZNK19ApplePlatformExpert12getMetaClassEv -__ZNK19ApplePlatformExpert9MetaClass5allocEv -__ZNK19IODBDMAMemoryCursor12getMetaClassEv -__ZNK19IODBDMAMemoryCursor9MetaClass5allocEv -__ZNK8AppleCPU12getMetaClassEv -__ZNK8AppleCPU9MetaClass5allocEv -__ZNK8AppleNMI12getMetaClassEv -__ZNK8AppleNMI9MetaClass5allocEv -__ZTV10AppleMacIO -__ZTV10AppleNVRAM -__ZTV16AppleMacIODevice -__ZTV17IONVRAMController -__ZTV19ApplePlatformExpert -__ZTV19IODBDMAMemoryCursor -__ZTV8AppleCPU -__ZTV8AppleNMI -__ZTVN10AppleMacIO9MetaClassE -__ZTVN10AppleNVRAM9MetaClassE -__ZTVN16AppleMacIODevice9MetaClassE -__ZTVN17IONVRAMController9MetaClassE -__ZTVN19ApplePlatformExpert9MetaClassE -__ZTVN19IODBDMAMemoryCursor9MetaClassE -__ZTVN8AppleCPU9MetaClassE -__ZTVN8AppleNMI9MetaClassE -__eSynchronizeIO -_abs -_bcopy_nc -_bzero_nc -_cacheDisable -_cacheInit -_delay_for_interval -_gGetDefaultBusSpeedsKey -_get_io_base_addr -_get_preemption_level -_hfs_addconverter -_hfs_remconverter -_ignore_zero_fault -_killprint -_kprintf_lock -_mapping_prealloc -_mapping_relpre -_ml_enable_cache_level -_ml_enable_nap -_ml_mem_backoff -_ml_ppc_sleep -_ml_set_processor_speed -_ml_set_processor_voltage -_ml_throttle -_pe_do_clock_test -_pe_run_clock_test -_pmsRunLocal -_rc4_crypt -_rc4_init -_scc -_sprintf -_strcat -_strcpy diff --git a/config/Unsupported.exports b/config/Unsupported.exports index 8886533d8..374517b7e 100644 --- a/config/Unsupported.exports +++ b/config/Unsupported.exports @@ -6,6 +6,8 @@ _KUNCUserNotificationDisplayAlert _KUNCUserNotificationDisplayFromBundle _KUNCUserNotificationDisplayNotice _NDR_record +_OSSpinLockTry +_OSSpinLockUnlock _PE_kputc __Z22OSFlushObjectTrackListv __ZN15IOWatchDogTimer10gMetaClassE @@ -49,6 +51,7 @@ __ZN9IODTNVRAM26calculatePartitionChecksumEPh __ZN9IODTNVRAM9metaClassE __ZN9IODTNVRAMC2EPK11OSMetaClass __ZN9IODTNVRAMD2Ev +__ZN9IODTNVRAM10safeToSyncEv __ZNK15IOWatchDogTimer12getMetaClassEv __ZNK15IOWatchDogTimer9MetaClass5allocEv __ZNK9IODTNVRAM17getOFVariablePermEPK8OSSymbol @@ -64,6 +67,7 @@ _aes_decrypt_key _aes_decrypt_key128 _aes_decrypt_key256 _aes_encrypt_cbc +_aes_encrypt_key _aes_encrypt_key128 _aes_encrypt_key256 _appleClut8 @@ -93,7 +97,6 @@ _host_get_special_port _host_priv_self _hz _ipc_kernel_map -_ipflow_fastforward _kalloc _kauth_cred_issuser _kauth_cred_label_update @@ -115,7 +118,11 @@ _ldisc_deregister _ldisc_register _log _mach_gss_accept_sec_context +_mach_gss_accept_sec_context_v2 +_mach_gss_hold_cred _mach_gss_init_sec_context +_mach_gss_init_sec_context_v2 +_mach_gss_unhold_cred _mach_make_memory_entry_64 _mach_memory_entry_page_op 
_mach_memory_entry_range_op @@ -159,7 +166,6 @@ _thread_notrigger _thread_tid _tsleep _vfs_context_current -_vfs_setlocklocal _vfs_update_vfsstat _vm_allocate _vm_deallocate diff --git a/config/Unsupported.i386.exports b/config/Unsupported.i386.exports index 66029e241..38b70f0ff 100644 --- a/config/Unsupported.i386.exports +++ b/config/Unsupported.i386.exports @@ -23,7 +23,7 @@ _kernel_thread _lapic_set_perfcnt_interrupt_mask _lapic_set_pmi_func _lo_ifp -_m_adj +_m_adj:_mbuf_adj _m_cat _m_copydata _m_copym @@ -41,7 +41,7 @@ _m_split _m_trailingspace:_mbuf_trailingspace _mach_msg_rpc_from_kernel _mach_msg_send_from_kernel_with_options -_mcl_to_paddr +_mcl_to_paddr:_mbuf_data_to_physical _ml_get_apicid _ml_get_maxbusdelay _ml_get_maxsnoop diff --git a/config/Unsupported.ppc.exports b/config/Unsupported.ppc.exports deleted file mode 100644 index fbc85ede8..000000000 --- a/config/Unsupported.ppc.exports +++ /dev/null @@ -1,118 +0,0 @@ -_CallTVector -_PPCcalls -_PE_write_IIC -__ZN19IODBDMAMemoryCursor13outputSegmentEN14IOMemoryCursor15PhysicalSegmentEPvm -__ZN9IODTNVRAM17getOWVariableInfoEmPPK8OSSymbolPmS4_ -__ZN9IODTNVRAM19convertObjectToPropEPhPmPK8OSSymbolP8OSObject -__ZN9IODTNVRAM19convertPropToObjectEPhmS0_mPPK8OSSymbolPP8OSObject -__ZN9IODTNVRAM19searchNVRAMPropertyEP17IONVRAMDescriptorPm -__ZN9IODTNVRAM19unescapeBytesToDataEPKhm -_domains -_get_preemption_level -_ignore_zero_fault -_ifunit -_in6addr_local -_in_broadcast -_inaddr_local -_inet_domain_mutex -_ip_mutex -_ip_output -_ip_protox -_killprint -_kernel_flock -_kernel_thread -_lo_ifp -_mapping_prealloc -_mapping_relpre -_m_adj -_m_cat -_m_copydata -_m_copym -_m_free:_mbuf_free -_m_freem:_mbuf_freem -_m_get -_m_gethdr -_m_getpacket -_m_getpackets -_m_mclget -_m_mtod -_m_prepend_2 -_m_pullup -_m_split -_m_trailingspace:_mbuf_trailingspace -_mcl_to_paddr -_ml_enable_cache_level -_ml_enable_nap -_ml_ppc_sleep -_ml_set_processor_speed -_ml_set_processor_voltage -_ml_throttle -_nd6_storelladdr -_net_add_domain -_net_add_proto -_net_del_domain -_net_del_proto -_pffinddomain -_pffindproto -_pmsStart -_pmsPark -_pmsRun -_pmsRunLocal -_pmsBuild -_pru_abort_notsupp -_pru_accept_notsupp -_pru_bind_notsupp -_pru_connect2_notsupp -_pru_connect_notsupp -_pru_disconnect_notsupp -_pru_listen_notsupp -_pru_peeraddr_notsupp -_pru_rcvd_notsupp -_pru_rcvoob_notsupp -_pru_send_notsupp -_pru_sense_null -_pru_shutdown_notsupp -_pru_sockaddr_notsupp -_pru_sopoll_notsupp -_ml_mem_backoff -_sbappendaddr -_sbappendrecord -_sbflush -_sbspace -_soabort -_sobind -_socantrcvmore -_socantsendmore -_sock_getlistener -_sock_release -_sock_retain -_soclose -_soconnect -_socreate -_sodisconnect -_sofree -_sofreelastref -_soisconnected -_soisconnecting -_soisdisconnected -_soisdisconnecting -_sonewconn -_sooptcopyin -_sooptcopyout -_sopoll -_soreceive -_soreserve -_sorwakeup -_sosend -_sosetopt -_tcbinfo -_thread_call_func -_thread_call_func_cancel -_thread_call_func_delayed -_thread_call_is_delayed -_thread_cancel_timer -_thread_funnel_set -_thread_set_timer -_thread_set_timer_deadline -_udbinfo -_clock_get_system_value diff --git a/config/Unsupported.x86_64.exports b/config/Unsupported.x86_64.exports index 79dce8fdc..9413c7dec 100644 --- a/config/Unsupported.x86_64.exports +++ b/config/Unsupported.x86_64.exports @@ -32,3 +32,4 @@ _tmrCvt _tsc_get_info _hibernate_vm_lock _hibernate_vm_unlock + diff --git a/config/version.c b/config/version.c index d916cbee8..4870d134c 100644 --- a/config/version.c +++ b/config/version.c @@ -46,3 +46,5 @@ const char osbuilder[] = 
"###KERNEL_BUILDER###"; const char osrelease[] = OSRELEASE; const char ostype[] = OSTYPE; char osversion[OSVERSIZE]; + +__private_extern__ const char compiler_version[] = __VERSION__; diff --git a/osfmk/ppc/cpu_affinity.h b/iokit/IOKit/AppleKeyStoreInterface.h similarity index 62% rename from osfmk/ppc/cpu_affinity.h rename to iokit/IOKit/AppleKeyStoreInterface.h index 2e0ae7ce4..02cb776c1 100644 --- a/osfmk/ppc/cpu_affinity.h +++ b/iokit/IOKit/AppleKeyStoreInterface.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 Apple Inc. All rights reserved. + * Copyright (c) 2010 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,28 +25,36 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -#ifdef KERNEL_PRIVATE -#ifndef _PPC_CPU_AFFINITY_H_ -#define _PPC_CPU_AFFINITY_H_ -/* - * Just one hardware affinity set - the whole machine. - * This allows us to give the pretense that PPC supports the affinity policy - * SPI. The kernel will accept affinity hints but effectively ignore them. - * Hence Universal Apps can use platform-independent code. - */ -static inline int ml_get_max_affinity_sets(void) +#ifndef _IOKIT_APPLEKEYSTOREINTERFACE_H +#define _IOKIT_APPLEKEYSTOREINTERFACE_H + +// These are currently duplicate defs with different names +// from AppleKeyStore & CoreStorage + +// aka MAX_KEY_SIZE +#define AKS_MAX_KEY_SIZE 128 + +// aka rawKey +struct aks_raw_key_t { - return 1; -} + uint32_t keybytecount; + uint8_t keybytes[AKS_MAX_KEY_SIZE]; +}; -/* - * Return the single processor set. - */ -static inline processor_set_t ml_affinity_to_pset(__unused int affinity_num) +// aka volumeKey +struct aks_volume_key_t { - return processor_pset(master_processor); -} + uint32_t algorithm; + aks_raw_key_t key; +}; + +// aka AKS_GETKEY +#define AKS_PLATFORM_FUNCTION_GETKEY "getKey" + +// aka kCSFDETargetVEKID +#define PLATFORM_FUNCTION_GET_MEDIA_ENCRYPTION_KEY_UUID "CSFDETargetVEKID" + +#define AKS_SERVICE_PATH "/IOResources/AppleKeyStore" -#endif /* _I386_CPU_AFFINITY_H_ */ -#endif /* KERNEL_PRIVATE */ +#endif /* _IOKIT_APPLEKEYSTOREINTERFACE_H */ diff --git a/iokit/IOKit/IOBufferMemoryDescriptor.h b/iokit/IOKit/IOBufferMemoryDescriptor.h index 391a0460e..f5d504061 100644 --- a/iokit/IOKit/IOBufferMemoryDescriptor.h +++ b/iokit/IOKit/IOBufferMemoryDescriptor.h @@ -86,6 +86,7 @@ private: task_t inTask) APPLE_KEXT_DEPRECATED; /* use withOptions() instead */ #endif /* !__LP64__ */ +public: virtual bool initWithPhysicalMask( task_t inTask, IOOptionBits options, @@ -146,7 +147,11 @@ public: kIOMemoryPhysicallyContiguous - pass to request memory be physically contiguous. This option is heavily discouraged. The request may fail if memory is fragmented, may cause large amounts of paging activity, and may take a very long time to execute.<br> kIOMemoryPageable - pass to request memory be non-wired - the default for kernel allocated memory is wired.<br> kIOMemoryPurgeable - pass to request memory that may later have its purgeable state set with IOMemoryDescriptor::setPurgeable. Only supported for kIOMemoryPageable allocations.<br> - kIOMemoryKernelUserShared - pass to request memory that will be mapped into both the kernel and client applications. + kIOMemoryKernelUserShared - pass to request memory that will be mapped into both the kernel and client applications.<br> + kIOMapInhibitCache - allocate memory with inhibited cache setting. <br> + kIOMapWriteThruCache - allocate memory with writethru cache setting. <br> + kIOMapCopybackCache - allocate memory with copyback cache setting. 
<br> + kIOMapWriteCombineCache - allocate memory with writecombined cache setting. @param capacity The number of bytes to allocate. @param alignment The minimum required alignment of the buffer in bytes - 1 is the default for no required alignment. For example, pass 256 to get memory allocated at an address with bits 0-7 zero. @result Returns an instance of class IOBufferMemoryDescriptor to be released by the caller, which will free the memory desriptor and associated buffer. */ @@ -164,7 +169,11 @@ public: @param options Options for the allocation:<br> kIODirectionOut, kIODirectionIn - set the direction of the I/O transfer.<br> kIOMemoryPhysicallyContiguous - pass to request memory be physically contiguous. This option is heavily discouraged. The request may fail if memory is fragmented, may cause large amounts of paging activity, and may take a very long time to execute.<br> - kIOMemoryKernelUserShared - pass to request memory that will be mapped into both the kernel and client applications. + kIOMemoryKernelUserShared - pass to request memory that will be mapped into both the kernel and client applications.<br> + kIOMapInhibitCache - allocate memory with inhibited cache setting. <br> + kIOMapWriteThruCache - allocate memory with writethru cache setting. <br> + kIOMapCopybackCache - allocate memory with copyback cache setting. <br> + kIOMapWriteCombineCache - allocate memory with writecombined cache setting. @param capacity The number of bytes to allocate. @param mask The buffer will be allocated with pages such that physical addresses will only have bits set present in physicalMask. For example, pass 0x00000000FFFFFFFFULL for a buffer to be accessed by hardware that has 32 address bits. @result Returns an instance of class IOBufferMemoryDescriptor to be released by the caller, which will free the memory desriptor and associated buffer. */
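The cache and mask options documented above combine in a single factory call. A minimal usage sketch, not part of this patch (the 4 KB size, option set, and helper name are illustrative assumptions):

    #include <IOKit/IOBufferMemoryDescriptor.h>

    // Sketch: allocate 4 KB, physically contiguous, 32-bit addressable,
    // with the write-combined cache setting described above.
    static IOBufferMemoryDescriptor * allocExampleBuffer(void)
    {
        IOBufferMemoryDescriptor * buf = IOBufferMemoryDescriptor::inTaskWithPhysicalMask(
            kernel_task,
            kIODirectionInOut | kIOMemoryPhysicallyContiguous | kIOMapWriteCombineCache,
            4096,                        // capacity in bytes
            0x00000000FFFFFFFFULL);      // physical mask: pages below 4 GB
        if (buf) {
            void * cpuAddr = buf->getBytesNoCopy();   // kernel virtual address
            (void) cpuAddr;              // ... fill buffer / program hardware ...
        }
        return buf;                      // caller releases when the I/O completes
    }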
diff --git a/iokit/IOKit/IOCatalogue.h b/iokit/IOKit/IOCatalogue.h index f087396fd..49f8fb84c 100644 --- a/iokit/IOKit/IOCatalogue.h +++ b/iokit/IOKit/IOCatalogue.h @@ -63,11 +63,11 @@ private: SInt32 generation; /* This stuff is no longer used at all but was exported in prior - * releases, so we keep it around for PPC/i386 only. + * releases, so we keep it around for i386 only. */ -#if __ppc__ || __i386__ IOLock * kld_lock; -#endif /* __ppc__ || __i386__ */ +#if __i386__ +#endif /* __i386__ */ public: /*! @@ -202,9 +202,19 @@ public: /*! @function reset @abstract Return the Catalogue to its initial state. + @discussion + Should only be used by kextd just before it sends all kext personalities down during a rescan. */ void reset(void); + /*! + @function resetAndAddDrivers + @abstract Replace personalities in IOCatalog with those provided. + @discussion + Resets the catalogue with a new set of drivers, preserving matching originals to keep wired memory usage down. + */ + bool resetAndAddDrivers(OSArray * drivers, bool doNubMatching = true); + /*! @function serialize @abstract Serializes the catalog for transport to the user. @@ -215,10 +225,10 @@ public: bool serializeData(IOOptionBits kind, OSSerialize * s) const; -/* This stuff is no longer used at all we keep it around for PPC/i386 +/* This stuff is no longer used at all we keep it around for i386 * binary compatibility only. Symbols are no longer exported. */ -#if __ppc__ || __i386__ +#if __i386__ /*! @function recordStartupExtensions @abstract Records extensions made available by the primary booter. @@ -253,7 +263,7 @@ public: removed or wasn't present, KERN_FAILURE otherwise. */ virtual kern_return_t removeKernelLinker(void); -#endif /* __ppc__ || __i386__ */ +#endif /* __i386__ */ private: diff --git a/iokit/IOKit/IOCommandGate.h b/iokit/IOKit/IOCommandGate.h index 1b17b791d..d38c88670 100644 --- a/iokit/IOKit/IOCommandGate.h +++ b/iokit/IOKit/IOCommandGate.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2009 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -83,10 +83,6 @@ member function's parameter list. void *arg2, void *arg3); protected: -/*! - @function checkForWork - @abstract Not used, $link IOEventSource::checkForWork(). */ - virtual bool checkForWork(); /*! @struct ExpansionData @discussion This structure will be used to expand the capablilties of the IOWorkLoop in the future. diff --git a/iokit/IOKit/IODMACommand.h b/iokit/IOKit/IODMACommand.h index a2a2852f3..c8bd8b37a 100644 --- a/iokit/IOKit/IODMACommand.h +++ b/iokit/IOKit/IODMACommand.h @@ -258,7 +258,7 @@ public: /*! @function setMemoryDescriptor @abstract Sets and resets the DMACommand's current memory descriptor - @discussion The DMA command will configure itself based on the information that it finds in the memory descriptor. It looks for things like the direction of the memory descriptor and whether the current memory descriptor is already mapped into some IOMMU. As a programmer convenience it can also prepare the memory descriptor immediately. See prepare(). Note the IODMACommand is designed to used multiple times with a succession of memory descriptors, making the pooling of commands possible. It is an error though to attempt to reset a currently prepared() DMA command. Warning: This routine may block so never try to autoprepare an IODMACommand while in a gated context, i.e. one of the WorkLoops action call outs. + @discussion The DMA command will configure itself based on the information that it finds in the memory descriptor. It looks for things like the direction of the memory descriptor and whether the current memory descriptor is already mapped into some IOMMU. As a programmer convenience it can also prepare the DMA command immediately. See prepare(). Note the IODMACommand is designed to be used multiple times with a succession of memory descriptors, making the pooling of commands possible. It is an error, though, to attempt to reset a currently prepared() DMA command. Warning: This routine may block so never try to autoprepare an IODMACommand while in a gated context, i.e. one of the WorkLoops action call outs. @param mem A pointer to the current I/Os memory descriptor. @param autoPrepare An optional boolean variable that will call the prepare() function automatically after the memory descriptor is processed. Defaults to true. @result Returns kIOReturnSuccess, kIOReturnBusy if currently prepared, kIOReturnNoSpace if the length(mem) >= Maximum Transfer Size or the error codes returned by prepare() (qv).
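The pooled-reuse pattern this discussion describes can be sketched as follows (illustrative only, not part of this patch; 'cmd' stands for a pooled IODMACommand and 'md' for the current request's memory descriptor):

    // Bind a descriptor, walk the generated segments, then detach for reuse.
    static IOReturn runOneTransfer(IODMACommand * cmd, IOMemoryDescriptor * md)
    {
        IOReturn ret = cmd->setMemoryDescriptor(md);   // autoPrepare defaults to true
        if (ret != kIOReturnSuccess)
            return ret;                                // e.g. kIOReturnBusy while still prepared

        UInt64 offset = 0;
        while (offset < md->getLength()) {
            IODMACommand::Segment64 seg;
            UInt32 numSeg = 1;
            ret = cmd->gen64IOVMSegments(&offset, &seg, &numSeg);
            if ((ret != kIOReturnSuccess) || (numSeg != 1))
                break;
            // ... program seg.fIOVMAddr / seg.fLength into the hardware ...
        }
        cmd->clearMemoryDescriptor();    // completes and detaches; ready for the next I/O
        return ret;
    }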
diff --git a/iokit/IOKit/IODataQueueShared.h b/iokit/IOKit/IODataQueueShared.h index 2fa0e9a45..dc4532486 100644 --- a/iokit/IOKit/IODataQueueShared.h +++ b/iokit/IOKit/IODataQueueShared.h @@ -66,7 +66,7 @@ typedef struct _IODataQueueMemory { * @abstract A struct mapping to the appendix region of a data queue. * @discussion This struct is variable sized dependent on the version. The struct represents the data queue appendix information. * @field version The version of the queue appendix. - * @field port The notification port associated with this queue. + * @field msgh Mach message header containing the notification mach port associated with this queue. */ typedef struct _IODataQueueAppendix { UInt32 version; diff --git a/iokit/IOKit/IOEventSource.h b/iokit/IOKit/IOEventSource.h index 4afc5aa99..10a392afc 100644 --- a/iokit/IOKit/IOEventSource.h +++ b/iokit/IOKit/IOEventSource.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2000, 2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -44,6 +44,9 @@ HISTORY #include <IOKit/system.h> #include <IOKit/IOWorkLoop.h> +#if IOKITSTATS +#include <IOKit/IOStatisticsPrivate.h> +#endif __BEGIN_DECLS #include <mach/clock_types.h> @@ -69,12 +72,12 @@ source may only be a member of 1 linked list chain. If you need to move it between chains than make sure it is removed from the original chain before attempting to move it. <br><br> - The IOEventSource makes no attempt to maintain the consitency of it's internal data across multi-threading. It is assumed that the user of these basic tools will protect the data that these objects represent in some sort of device wide instance lock. For example the IOWorkLoop maintains the event chain by handing off change request to its own thread and thus single threading access to its state. + The IOEventSource makes no attempt to maintain the consistency of its internal data across multi-threading. It is assumed that the user of these basic tools will protect the data that these objects represent in some sort of device wide instance lock. For example the IOWorkLoop maintains the event chain by using an IOCommandGate and thus single threading access to its state. <br><br> - All subclasses of the IOEventSource are expected to implement the checkForWork() member function. + All subclasses of IOEventSource that wish to perform work on the work-loop thread are expected to implement the checkForWork() member function. As of Mac OS X 10.7 (Darwin 11), checkForWork is no longer pure virtual, and should not be overridden if there is no work to be done. <br><br> - checkForWork() is the key method in this class. It is called by some work-loop when convienient and is expected to evaluate it's internal state and determine if an event has occurred since the last call. In the case of an event having occurred then the instance defined target(owner)/action will be called. The action is stored as an ordinary C function pointer but the first parameter is always the owner. This means that a C++ member function can be used as an action function though this depends on the ABI. + checkForWork() is the key method in this class. It is called by some work-loop when convenient and is expected to evaluate its internal state and determine if an event has occurred since the last call. In the case of an event having occurred then the instance defined target(owner)/action will be called. The action is stored as an ordinary C function pointer but the first parameter is always the owner. This means that a C++ member function can be used as an action function though this depends on the ABI. <br><br> Although the eventChainNext variable contains a reference to the next event source in the chain this reference is not retained. The list 'owner' i.e. the client that creates the event, not the work-loop, is expected to retain the source. */
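A hypothetical subclass, sketched here for illustration (it is not part of this patch; the class name, flag, and call-out are assumptions), shows the checkForWork() contract just described: latch an event on the producer side, wake the work loop, and call out the owner/action on the work-loop thread:

    // Companion OSDefineMetaClassAndStructors(MyEventSource, IOEventSource)
    // would appear in the implementation file.
    class MyEventSource : public IOEventSource
    {
        OSDeclareDefaultStructors(MyEventSource)

        volatile bool eventPending;

    public:
        void postEvent(void)
        {
            eventPending = true;
            signalWorkAvailable();        // wake the owning work loop
        }

    protected:
        virtual bool checkForWork(void)   // runs on the work-loop thread
        {
            if (eventPending) {
                eventPending = false;
                if (action)
                    (*action)(owner);     // instance-defined owner/action call-out
            }
            return false;                 // no outstanding events remain
        }
    };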
@@ -82,6 +85,9 @@ class IOEventSource : public OSObject { OSDeclareAbstractStructors(IOEventSource) friend class IOWorkLoop; +#if IOKITSTATS + friend class IOStatistics; +#endif public: /*! @@ -125,7 +131,13 @@ protected: /*! @struct ExpansionData @discussion This structure will be used to expand the capablilties of the IOEventSource in the future. */ - struct ExpansionData { }; + struct ExpansionData { +#if IOKITSTATS + struct IOEventSourceCounter *counter; +#else + void *iokitstatsReserved; +#endif + }; /*! @var reserved Reserved for future use. (Internal use only) */ @@ -149,14 +161,19 @@ successfully. */ virtual bool init(OSObject *owner, IOEventSource::Action action = 0); + virtual void free( void ); + /*! @function checkForWork - @abstract Pure Virtual member function used by IOWorkLoop for work + @abstract Virtual member function used by IOWorkLoop for work scheduling. @discussion This function will be called to request a subclass to check -it's internal state for any work to do and then to call out the owner/action. +its internal state for any work to do and then to call out the owner/action. +If this event source never performs any work (e.g. IOCommandGate), this +method should not be overridden. NOTE: This method is no longer declared pure +virtual. A default implementation is provided in IOEventSource. @result Return true if this function needs to be called again before all its outstanding events have been processed. */ - virtual bool checkForWork() = 0; + virtual bool checkForWork(); /*! @function setWorkLoop @abstract Set'ter for $link workLoop variable. diff --git a/iokit/IOKit/IOHibernatePrivate.h b/iokit/IOKit/IOHibernatePrivate.h index 30b307816..0fb3c53f3 100644 --- a/iokit/IOKit/IOHibernatePrivate.h +++ b/iokit/IOKit/IOHibernatePrivate.h @@ -34,6 +34,7 @@ extern "C" { #ifdef KERNEL #include <crypto/aes.h> +#include <uuid/uuid.h> #endif struct IOPolledFileExtent @@ -48,7 +49,9 @@ struct IOHibernateImageHeader uint64_t imageSize; uint64_t image1Size; - uint32_t restore1CodePage; + uint32_t restore1CodePhysPage; + uint32_t reserved1; + uint64_t restore1CodeVirt; uint32_t restore1PageCount; uint32_t restore1CodeOffset; uint32_t restore1StackOffset; @@ -86,16 +89,15 @@ struct IOHibernateImageHeader uint32_t diag[4]; - int32_t graphicsInfoOffset; - int32_t cryptVarsOffset; - int32_t memoryMapOffset; - uint32_t memoryMapSize; + uint32_t handoffPages; + uint32_t handoffPageCount; + uint32_t systemTableOffset; uint32_t debugFlags; uint32_t options; - uint32_t reserved[71]; // make sizeof == 512 + uint32_t reserved[70]; // make sizeof == 512 uint64_t encryptEnd __attribute__ ((packed)); uint64_t deviceBase __attribute__ ((packed)); @@ -154,6 +156,25 @@ typedef struct hibernate_cryptvars_t hibernate_cryptvars_t; #endif /* defined(_AES_H) */ +enum +{ + kIOHibernateHandoffType = 0x686f0000, + kIOHibernateHandoffTypeEnd = kIOHibernateHandoffType + 0, + kIOHibernateHandoffTypeGraphicsInfo = kIOHibernateHandoffType + 1, + kIOHibernateHandoffTypeCryptVars = kIOHibernateHandoffType + 2, + kIOHibernateHandoffTypeMemoryMap = kIOHibernateHandoffType + 3, + kIOHibernateHandoffTypeDeviceTree = kIOHibernateHandoffType + 4, + kIOHibernateHandoffTypeDeviceProperties = kIOHibernateHandoffType + 5, + kIOHibernateHandoffTypeKeyStore = kIOHibernateHandoffType + 6, +}; + +struct IOHibernateHandoff +{ + uint32_t type; + uint32_t bytecount; + uint8_t data[]; +}; +typedef struct IOHibernateHandoff IOHibernateHandoff;
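Each handoff record above is variable length and the list ends with a kIOHibernateHandoffTypeEnd record, so a consumer advances by bytecount past the flexible data member. A sketch of that walk, inferred only from the declarations above (the function name and per-type payload comments are illustrative assumptions):

    // Walk a handoff list; 'firstHandoff' points at the first record.
    static void consumeHandoffs(IOHibernateHandoff * firstHandoff)
    {
        for (IOHibernateHandoff * h = firstHandoff;
             h && (h->type != kIOHibernateHandoffTypeEnd);
             h = (IOHibernateHandoff *) &h->data[h->bytecount])  // skip payload
        {
            switch (h->type) {
            case kIOHibernateHandoffTypeGraphicsInfo:
                // h->data carries graphics state, h->bytecount bytes long
                break;
            case kIOHibernateHandoffTypeCryptVars:
                // h->data carries wake-time crypt variables
                break;
            default:
                break;  // unknown record types are skipped by size
            }
        }
    }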
enum { @@ -233,15 +254,20 @@ typedef void (*kern_get_file_extents_callback_t)(void * ref, uint64_t start, uin struct kern_direct_file_io_ref_t * kern_open_file_for_direct_io(const char * name, kern_get_file_extents_callback_t callback, - void * callback_ref, - dev_t * device_result, - uint64_t * partitionbase_result, - uint64_t * maxiocount_result, - boolean_t * solid_state); + void * callback_ref, + dev_t * partition_device_result, + dev_t * image_device_result, + uint64_t * partitionbase_result, + uint64_t * maxiocount_result, + uint32_t * oflags, + off_t offset, + caddr_t addr, + vm_size_t len); void -kern_close_file_for_direct_io(struct kern_direct_file_io_ref_t * ref); +kern_close_file_for_direct_io(struct kern_direct_file_io_ref_t * ref, + off_t offset, caddr_t addr, vm_size_t len); int kern_write_file(struct kern_direct_file_io_ref_t * ref, off_t offset, caddr_t addr, vm_size_t len); int get_kernel_symfile(struct proc *p, char const **symfile); @@ -257,6 +283,7 @@ hibernate_setup(IOHibernateImageHeader * header, boolean_t vmflush, hibernate_page_list_t ** page_list_ret, hibernate_page_list_t ** page_list_wired_ret, + hibernate_page_list_t ** page_list_pal_ret, boolean_t * encryptedswap); kern_return_t hibernate_teardown(hibernate_page_list_t * page_list, @@ -279,6 +306,7 @@ hibernate_vm_unlock(void); void hibernate_page_list_setall(hibernate_page_list_t * page_list, hibernate_page_list_t * page_list_wired, + hibernate_page_list_t * page_list_pal, uint32_t * pagesOut); // mark pages to be saved, or pages not to be saved but available @@ -316,7 +344,7 @@ hibernate_page_bitmap_pin(hibernate_page_list_t * list, uint32_t * page); uint32_t hibernate_page_bitmap_count(hibernate_bitmap_t * bitmap, uint32_t set, uint32_t page); -void +uintptr_t hibernate_restore_phys_page(uint64_t src, uint64_t dst, uint32_t len, uint32_t procFlags); void @@ -341,8 +369,6 @@ extern uint32_t gIOHibernateFreeTime; // max time to spend freeing pages (ms) extern uint8_t gIOHibernateRestoreStack[]; extern uint8_t gIOHibernateRestoreStackEnd[]; extern IOHibernateImageHeader * gIOHibernateCurrentHeader; -extern hibernate_graphics_t * gIOHibernateGraphicsInfo; -extern hibernate_cryptwakevars_t * gIOHibernateCryptWakeVars; #define HIBLOG(fmt, args...) \ { kprintf(fmt, ## args); printf(fmt, ## args); } @@ -419,9 +445,11 @@ enum { #define kIOHibernateMachineSignatureKey "machine-signature" #define kIOHibernateRTCVariablesKey "IOHibernateRTCVariables" +#define kIOHibernateSMCVariablesKey "IOHibernateSMCVariables" #define kIOHibernateBootSwitchVarsKey "boot-switch-vars" +#define kIOHibernateUseKernelInterpreter 0x80000000 #ifdef __cplusplus } diff --git a/iokit/IOKit/IOInterruptEventSource.h b/iokit/IOKit/IOInterruptEventSource.h index fe5d4ae12..2e1a82765 100644 --- a/iokit/IOKit/IOInterruptEventSource.h +++ b/iokit/IOKit/IOInterruptEventSource.h @@ -189,6 +189,17 @@ state when checkForWork is called. */ @param nub Where did the interrupt originate from @param ind What is this interrupts index within 'nub'. */ virtual void disableInterruptOccurred(void *, IOService *nub, int ind); + +/*! @function warmCPU + @abstract Tries to reduce latency for an interrupt which will be received near a specified time. + @discussion Warms up a CPU in advance of an interrupt so that the interrupt may be serviced with predictable latency. + The warm-up is not periodic; callers should call warmCPU once in advance of each interrupt. It is recommended that + requests be issued in serial (i.e. each after the target for the previous call has elapsed), as there is a systemwide + cap on the number of outstanding requests.
This routine may be disruptive to the system if used with very small intervals + between requests; it should be used only in cases where interrupt latency is absolutely critical, and tens or hundreds of + milliseconds between targets is the expected time scale. NOTE: it is not safe to call this method with interrupts disabled. + @param abstime Time at which interrupt is expected. */ + IOReturn warmCPU(uint64_t abstime); private: IOReturn registerInterruptHandler(IOService *inProvider, int inIntIndex); diff --git a/iokit/IOKit/IOKitDebug.h b/iokit/IOKit/IOKitDebug.h index 96fb7c5a0..de2850d4e 100644 --- a/iokit/IOKit/IOKitDebug.h +++ b/iokit/IOKit/IOKitDebug.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,13 +25,6 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* - * Copyright (c) 1998 Apple Computer, Inc. All rights reserved. - * - * HISTORY - * - */ - #ifndef _IOKIT_IOKITDEBUG_H #define _IOKIT_IOKITDEBUG_H @@ -82,6 +75,9 @@ enum { kOSRegistryModsMode = 0x00040000ULL, // Change default registry modification handling - panic vs. log // kIOTraceIOService = 0x00080000ULL, // Obsolete: Use iotrace=0x00080000ULL to enable now kIOLogHibernate = 0x00100000ULL, + kIOLogDriverPower1 = 0x01000000ULL, + kIOLogDriverPower2 = 0x02000000ULL, + kIOStatistics = 0x04000000ULL, // debug aids - change behaviour kIONoFreeObjects = 0x00100000ULL, @@ -97,7 +93,7 @@ enum { kIOTraceEventSources = 0x00000004ULL, // Trace non-passive event sources kIOTraceIntEventSource = 0x00000008ULL, // Trace IOIES and IOFIES sources kIOTraceCommandGates = 0x00000010ULL, // Trace command gate activity - kIOTraceTimers = 0x00000020ULL, // Trace timer event source activity + kIOTraceTimers = 0x00000008ULL, // Trace timer event source activity kIOTracePowerMgmt = 0x00000400ULL, // Trace power management changes @@ -108,15 +104,22 @@ enum { extern SInt64 gIOKitDebug; extern SInt64 gIOKitTrace; -extern UInt64 gIOInterruptThresholdNS; - #ifdef __cplusplus extern "C" { #endif -struct IORegistryPlane; -extern void IOPrintPlane( const struct IORegistryPlane * plane ); +#ifdef __cplusplus +class IORegistryPlane; +#endif + +extern void IOPrintPlane( +#ifdef __cplusplus + const IORegistryPlane * plane +#else + const struct IORegistryPlane * plane +#endif + ); #ifndef _OSCPPDEBUG_H extern void OSPrintMemory( void ); #endif diff --git a/iokit/IOKit/IOKitKeys.h b/iokit/IOKit/IOKitKeys.h index 62395d54d..a6d7c8bf5 100644 --- a/iokit/IOKit/IOKitKeys.h +++ b/iokit/IOKit/IOKitKeys.h @@ -151,6 +151,7 @@ // IODTNVRAM property keys #define kIONVRAMDeletePropertyKey "IONVRAM-DELETE-PROPERTY" +#define kIONVRAMSyncNowPropertyKey "IONVRAM-SYNCNOW-PROPERTY" #define kIODTNVRAMPanicInfoKey "aapl,panic-info" // keys for complex boot information diff --git a/iokit/IOKit/IOKitKeysPrivate.h b/iokit/IOKit/IOKitKeysPrivate.h index 73e93db0f..06794304e 100644 --- a/iokit/IOKit/IOKitKeysPrivate.h +++ b/iokit/IOKit/IOKitKeysPrivate.h @@ -32,25 +32,31 @@ #include <IOKit/IOKitKeys.h> // properties found in the registry root -#define kIOConsoleUsersKey "IOConsoleUsers" /* value is OSArray */ +#define kIOConsoleLockedKey "IOConsoleLocked" /* value is OSBoolean */ +#define kIOConsoleUsersKey "IOConsoleUsers" /* value is OSArray */ #define kIOMaximumMappedIOByteCountKey "IOMaximumMappedIOByteCount" /* value is OSNumber */ // properties found in the console user dict +#define kIOConsoleSessionAuditIDKey 
"kCGSSessionAuditIDKey" /* value is OSNumber */ -#define kIOConsoleSessionIDKey "kCGSSessionIDKey" /* value is OSNumber */ - -#define kIOConsoleSessionUserNameKey "kCGSSessionUserNameKey" /* value is OSString */ -#define kIOConsoleSessionUIDKey "kCGSSessionUserIDKey" /* value is OSNumber */ -#define kIOConsoleSessionConsoleSetKey "kCGSSessionConsoleSetKey" /* value is OSNumber */ -#define kIOConsoleSessionOnConsoleKey "kCGSSessionOnConsoleKey" /* value is OSBoolean */ +#define kIOConsoleSessionUserNameKey "kCGSSessionUserNameKey" /* value is OSString */ +#define kIOConsoleSessionUIDKey "kCGSSessionUserIDKey" /* value is OSNumber */ +#define kIOConsoleSessionConsoleSetKey "kCGSSessionConsoleSetKey" /* value is OSNumber */ +#define kIOConsoleSessionOnConsoleKey "kCGSSessionOnConsoleKey" /* value is OSBoolean */ #define kIOConsoleSessionSecureInputPIDKey "kCGSSessionSecureInputPID" /* value is OSNumber */ +#define kIOConsoleSessionScreenLockedTimeKey "CGSSessionScreenLockedTime" /* value is OSNumber, secs - 1970 */ // IOResources property -#define kIOConsoleUsersSeedKey "IOConsoleUsersSeed" /* value is OSNumber */ +#define kIOConsoleUsersSeedKey "IOConsoleUsersSeed" /* value is OSNumber */ + +// interest type +#define kIOConsoleSecurityInterest "IOConsoleSecurityInterest" + // private keys for clientHasPrivilege #define kIOClientPrivilegeConsoleUser "console" #define kIOClientPrivilegeSecureConsoleProcess "secureprocess" +#define kIOClientPrivilegeConsoleSession "consolesession" // clientHasPrivilege security token for kIOClientPrivilegeSecureConsoleProcess typedef struct _IOUCProcessToken { diff --git a/iokit/IOKit/IOKitServer.h b/iokit/IOKit/IOKitServer.h index a68c99243..26787a25c 100644 --- a/iokit/IOKit/IOKitServer.h +++ b/iokit/IOKit/IOKitServer.h @@ -73,6 +73,11 @@ enum { @constant kIOCatalogRemoveDrivers Signals a call to the removeDrivers function in IOCatalogue. @constant kIOCatalogRemoveDriversNoMatch Signals a call to the removedrivers function in IOCatalogue but does not start a matching thread. @constant kIOCatalogStartMatching Signals the IOCatalogue to start an IOService matching thread. + @constant kIOCatalogRemoveKernelLinker Deprecated; does nothing. + @constant kIOCatalogKextdActive Signals the kernel that kextd is running. + @constant kIOCatalogKextdFinishedLaunching Signals the IOCatalogue that kextd has finished sending it information at startup. + @constant kIOCatalogResetDrivers Resets the IOCatalogue with a new set of personalities. + @constant kIOCatalogResetDriversNoMatch Resets the IOCatalogue with a new set of personalities but does not start a matching thread. */ enum { kIOCatalogAddDrivers = 1, @@ -82,7 +87,9 @@ enum { kIOCatalogStartMatching, kIOCatalogRemoveKernelLinker, kIOCatalogKextdActive, - kIOCatalogKextdFinishedLaunching + kIOCatalogKextdFinishedLaunching, + kIOCatalogResetDrivers, + kIOCatalogResetDriversNoMatch }; // IOCatalogueGetData diff --git a/iokit/IOKit/IOLib.h b/iokit/IOKit/IOLib.h index 6183a3358..5e91b4725 100644 --- a/iokit/IOKit/IOLib.h +++ b/iokit/IOKit/IOLib.h @@ -222,19 +222,10 @@ void IOMappedWrite32(IOPhysicalAddress address, UInt32 value); void IOMappedWrite64(IOPhysicalAddress address, UInt64 value); -/*! @function IOSetProcessorCacheMode - @abstract Sets the processor cache mode for mapped memory. - @discussion This function sets the cache mode of an already mapped & wired memory range. 
Note this may not be supported on I/O mappings or shared memory - it is far preferable to set the cache mode as mappings are created with the IOMemoryDescriptor::map method. - @param task Task the memory is mapped into. - @param address Virtual address of the memory. - @param length Length of the range to set. - @param cacheMode A constant from IOTypes.h, <br> - kIOMapDefaultCache to inhibit the cache in I/O areas, kIOMapCopybackCache in general purpose RAM.<br> - kIOMapInhibitCache, kIOMapWriteThruCache, kIOMapCopybackCache to set the appropriate caching.<br> - @result An IOReturn code.*/ +/* This function is deprecated. Cache settings may be set for allocated memory with the IOBufferMemoryDescriptor api. */ IOReturn IOSetProcessorCacheMode( task_t task, IOVirtualAddress address, - IOByteCount length, IOOptionBits cacheMode ); + IOByteCount length, IOOptionBits cacheMode ) __attribute__((deprecated)); /*! @function IOFlushProcessorCache @abstract Flushes the processor cache for mapped memory. @@ -341,8 +332,23 @@ void Debugger(const char * reason); void IOPanic(const char *reason) __attribute__((deprecated)); #endif -struct OSDictionary * IOBSDNameMatching( const char * name ); -struct OSDictionary * IOOFPathMatching( const char * path, char * buf, int maxLen ); +#ifdef __cplusplus +class OSDictionary; +#endif + +#ifdef __cplusplus +OSDictionary * +#else +struct OSDictionary * +#endif +IOBSDNameMatching( const char * name ); + +#ifdef __cplusplus +OSDictionary * +#else +struct OSDictionary * +#endif +IOOFPathMatching( const char * path, char * buf, int maxLen ); /* * Convert between size and a power-of-two alignment. diff --git a/iokit/IOKit/IOMemoryCursor.h b/iokit/IOKit/IOMemoryCursor.h index dfe9eed8c..048cdf584 100644 --- a/iokit/IOKit/IOMemoryCursor.h +++ b/iokit/IOKit/IOMemoryCursor.h @@ -378,85 +378,5 @@ public: } }; -/************************* class IODBDMAMemoryCursor *************************/ - -#if defined(__ppc__) - -struct IODBDMADescriptor; - -/*! - @class IODBDMAMemoryCursor - @abstract An IOMemoryCursor subclass that outputs a vector of DBDMA descriptors where the address and length are filled in. - @discussion The IODBDMAMemoryCursor would be used when the DBDMA hardware is available for the device for that will use an instance of this cursor. -*/ -class IODBDMAMemoryCursor : public IOMemoryCursor -{ - OSDeclareDefaultStructors(IODBDMAMemoryCursor) - -public: -/*! @function outputSegment - @abstract Outpust the given segment into the output segments array in address and length fields of an DBDMA descriptor. - @param segment The physical address and length that is next to be output. - @param segments Base of the output vector of DMA address length pairs. - @param segmentIndex Index to output 'segment' in the 'segments' array. -*/ - static void outputSegment(PhysicalSegment segment, - void * segments, - UInt32 segmentIndex); - -/*! @defined dbdmaOutputSegment - @discussion Backward compatibility define for the old global function definition. See IODBDMAMemoryCursor::outputSegment. */ -#define dbdmaOutputSegment IODBDMAMemoryCursor::outputSegment - -/*! @function withSpecification - @abstract Creates and initializes an IODBDMAMemoryCursor in one operation. - @discussion Factory function to create and initialize an IODBDMAMemoryCursor in one operation. See also IODBDMAMemoryCursor::initWithSpecification. - @param maxSegmentSize Maximum allowable size for one segment. Defaults to 0. - @param maxTransferSize Maximum size of an entire transfer. 
Defaults to 0 indicating no maximum. - @param alignment Alignment restrictions on output physical addresses. Not currently implemented. Defaults to single byte alignment. - @result Returns a new memory cursor if successfully created and initialized, 0 otherwise. -*/ - static IODBDMAMemoryCursor * - withSpecification(IOPhysicalLength maxSegmentSize, - IOPhysicalLength maxTransferSize, - IOPhysicalLength alignment = 1); - -/*! @function initWithSpecification - @abstract Primary initializer for the IODBDMAMemoryCursor class. - @param maxSegmentSize Maximum allowable size for one segment. Defaults to 0. - @param maxTransferSize Maximum size of an entire transfer. Defaults to 0 indicating no maximum. - @param alignment Alignment restrictions on output physical addresses. Not currently implemented. Defaults to single byte alignment. - @result Returns true if the inherited classes and this instance initialize successfully. -*/ - virtual bool initWithSpecification(IOPhysicalLength maxSegmentSize, - IOPhysicalLength maxTransferSize, - IOPhysicalLength alignment = 1); - - -/*! @function getPhysicalSegments - @abstract Generates a DBDMA physical scatter/gather list given a memory descriptor. - @discussion Generates a list of DBDMA descriptors where the address and length fields are filled in appropriately. But the client is expected to fill in the rest of the DBDMA descriptor as is appropriate for their particular hardware. Wraps IOMemoryCursor::genPhysicalSegments. - @param descriptor IOMemoryDescriptor that describes the data associated with an I/O request. - @param fromPosition Starting location of the I/O within a memory descriptor. - @param segments Pointer to an array of DBDMA descriptors for the output physical scatter/gather list. Be warned no room is left for a preamble in the output array. 'segments' should point to the first memory description slot in a DBDMA command. - @param maxSegments Maximum number of segments that can be written to the DBDMA descriptor table. - @param inMaxTransferSize Maximum transfer size is limited to that many bytes, otherwise it defaults to the maximum transfer size specified when the memory cursor was initialized. - @param transferSize Pointer to an IOByteCount variable that can contain the total size of the transfer being described. Defaults to 0 indicating that no transfer size need be returned. - @result If the descriptor is exhausted of memory, a zero is returned, otherwise the number of segments that were filled in is returned. -*/ - virtual UInt32 getPhysicalSegments(IOMemoryDescriptor * descriptor, - IOByteCount fromPosition, - IODBDMADescriptor * segments, - UInt32 maxSegments, - UInt32 inMaxTransferSize = 0, - IOByteCount * transferSize = 0) - { - return genPhysicalSegments(descriptor, fromPosition, segments, - maxSegments, inMaxTransferSize, transferSize); - } -}; - -#endif /* defined(__ppc__) */ - #endif /* !_IOMEMORYCURSOR_H */ diff --git a/iokit/IOKit/IOMemoryDescriptor.h b/iokit/IOKit/IOMemoryDescriptor.h index 866da4703..6e6961136 100644 --- a/iokit/IOKit/IOMemoryDescriptor.h +++ b/iokit/IOKit/IOMemoryDescriptor.h @@ -320,7 +320,7 @@ public: @param withLength The length of memory. @param options kIOMemoryDirectionMask (options:direction) This nibble indicates the I/O direction to be associated with the descriptor, which may affect the operation of the prepare and complete methods on some architectures. - @param task The task the virtual ranges are mapped into. 
Note that unlike IOMemoryDescriptor::withAddress(), kernel_task memory must be explicitly prepared when passed to this api. + @param task The task the virtual ranges are mapped into. Note that unlike IOMemoryDescriptor::withAddress(), kernel_task memory must be explicitly prepared when passed to this api. The task argument may be NULL to specify memory by physical address. @result The created IOMemoryDescriptor on success, to be released by the caller, or zero on failure. */ static IOMemoryDescriptor * withAddressRange( @@ -337,7 +337,7 @@ public: @param options kIOMemoryDirectionMask (options:direction) This nibble indicates the I/O direction to be associated with the descriptor, which may affect the operation of the prepare and complete methods on some architectures. kIOMemoryAsReference For options:type = Virtual or Physical this indicate that the memory descriptor need not copy the ranges array into local memory. This is an optimisation to try to minimise unnecessary allocations. - @param task The task each of the virtual ranges are mapped into. Note that unlike IOMemoryDescriptor::withAddress(), kernel_task memory must be explicitly prepared when passed to this api. + @param task The task each of the virtual ranges are mapped into. Note that unlike IOMemoryDescriptor::withAddress(), kernel_task memory must be explicitly prepared when passed to this api. The task argument may be NULL to specify memory by physical address. @result The created IOMemoryDescriptor on success, to be released by the caller, or zero on failure. */ static IOMemoryDescriptor * withAddressRanges( @@ -640,7 +640,7 @@ protected: public: /*! @function getVirtualAddress @abstract Accessor to the virtual address of the first byte in the mapping. - @discussion This method returns the virtual address of the first byte in the mapping. + @discussion This method returns the virtual address of the first byte in the mapping. Since the IOVirtualAddress is only 32bit in 32bit kernels, the getAddress() method should be used for compatibility with 64bit task mappings. @result A virtual address. */ virtual IOVirtualAddress getVirtualAddress(); @@ -725,9 +725,25 @@ public: mach_vm_size_t offset = 0); #ifdef __LP64__ +/*! @function getAddress + @abstract Accessor to the virtual address of the first byte in the mapping. + @discussion This method returns the virtual address of the first byte in the mapping. + @result A virtual address. */ +/*! @function getSize + @abstract Accessor to the length of the mapping. + @discussion This method returns the length of the mapping. + @result A byte count. */ inline mach_vm_address_t getAddress() __attribute__((always_inline)); inline mach_vm_size_t getSize() __attribute__((always_inline)); #else /* !__LP64__ */ +/*! @function getAddress + @abstract Accessor to the virtual address of the first byte in the mapping. + @discussion This method returns the virtual address of the first byte in the mapping. + @result A virtual address. */ +/*! @function getSize + @abstract Accessor to the length of the mapping. + @discussion This method returns the length of the mapping. + @result A byte count. */ virtual mach_vm_address_t getAddress(); virtual mach_vm_size_t getSize(); #endif /* !__LP64__ */ @@ -770,8 +786,6 @@ enum { }; #endif /* XNU_KERNEL_PRIVATE */ -#if !defined(__LP64) || defined(_IOMEMORYDESCRIPTOR_INTERNAL_) - // The following classes are private implementation of IOMemoryDescriptor - they // should not be referenced directly, just through the public API's in the // IOMemoryDescriptor class. 
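The withAddressRange()/withAddressRanges() notes above document that a NULL task selects physical addressing. A minimal sketch of that case; the address, length, and direction values are placeholders, not taken from this patch:

    // Sketch: describe one page of physical memory directly (task == NULL),
    // bracketing the I/O with prepare()/complete() as the header requires.
    #include <IOKit/IOMemoryDescriptor.h>

    IOMemoryDescriptor * md = IOMemoryDescriptor::withAddressRange(
        0x8F000000ULL,     // physical address (placeholder)
        4096,              // length
        kIODirectionOut,   // options: the direction nibble
        NULL);             // NULL task => range is interpreted as physical
    if (md && md->prepare() == kIOReturnSuccess) {
        // ... perform the transfer ...
        md->complete();
    }
    if (md) md->release();
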
For example, an IOGeneralMemoryDescriptor instance @@ -929,8 +943,6 @@ public: }; -#endif /* !defined(__LP64) || defined(_IOMEMORYDESCRIPTOR_INTERNAL_) */ - /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #ifdef __LP64__ diff --git a/iokit/IOKit/IOMessage.h b/iokit/IOKit/IOMessage.h index 77a1001aa..4a571b9d4 100644 --- a/iokit/IOKit/IOMessage.h +++ b/iokit/IOKit/IOMessage.h @@ -7,7 +7,7 @@ * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in * compliance with the License. The rights granted to you under the License * may not be used to create, or enable the creation or redistribution of, * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. @@ -32,16 +32,24 @@ #include <IOKit/IOReturn.h> #include <IOKit/IOTypes.h> +/*! + * @header IOMessage.h + * + * Defines message type constants for several IOKit messaging API's. + * + */ + typedef UInt32 IOMessage; #define iokit_common_msg(message) (UInt32)(sys_iokit|sub_iokit_common|message) #define iokit_family_msg(sub,message) (UInt32)(sys_iokit|sub|message) -/*! @defined iokit_vendor_specific_msg - @discussion iokit_vendor_specific_msg passes messages in the sub_iokit_vendor_specific - subsystem. It can be used to be generate messages that are used for private - communication between vendor specific code with the IOService::message() etc. APIs. -*/ +/*! + * @defined iokit_vendor_specific_msg + * @discussion iokit_vendor_specific_msg passes messages in the sub_iokit_vendor_specific + * subsystem. It can be used to generate messages that are used for private + * communication between vendor specific code with the IOService::message() etc. APIs. + */ #define iokit_vendor_specific_msg(message) (UInt32)(sys_iokit|sub_iokit_vendor_specific|message) #define kIOMessageServiceIsTerminated iokit_common_msg(0x010) @@ -54,29 +62,159 @@ typedef UInt32 IOMessage; #define kIOMessageServiceBusyStateChange iokit_common_msg(0x120) +#define kIOMessageConsoleSecurityChange iokit_common_msg(0x128) + #define kIOMessageServicePropertyChange iokit_common_msg(0x130) -#define kIOMessageCanDevicePowerOff iokit_common_msg(0x200) +#define kIOMessageCopyClientID iokit_common_msg(0x330) + +#define kIOMessageSystemCapabilityChange iokit_common_msg(0x340) +#define kIOMessageDeviceSignaledWakeup iokit_common_msg(0x350) + + +/*! + * @defined kIOMessageDeviceWillPowerOff + * @discussion Indicates the device is about to move to a lower power state. + * Sent to IOKit interest notification clients of type <code>kIOAppPowerStateInterest</code> + * and <code>kIOGeneralInterest</code>. + */ #define kIOMessageDeviceWillPowerOff iokit_common_msg(0x210) - -#define kIOMessageDeviceWillNotPowerOff iokit_common_msg(0x220) + +/*! + * @defined kIOMessageDeviceHasPoweredOn + * @discussion Indicates the device has just moved to a higher power state. + * Sent to IOKit interest notification clients of type <code>kIOAppPowerStateInterest</code> + * and <code>kIOGeneralInterest</code>. 
+ */ #define kIOMessageDeviceHasPoweredOn iokit_common_msg(0x230) -// IOService power mgt does not send -// kIOMessageDeviceWillPowerOn -// kIOMessageDeviceHasPoweredOff -#define kIOMessageDeviceWillPowerOn iokit_common_msg(0x215) -#define kIOMessageDeviceHasPoweredOff iokit_common_msg(0x225) +/*! @group In-kernel system shutdown and restart notifications + */ -#define kIOMessageCanSystemPowerOff iokit_common_msg(0x240) +/*! + * @defined kIOMessageSystemWillPowerOff + * @discussion Indicates an imminent system shutdown. Recipients have a limited + * amount of time to respond, otherwise the system will timeout and + * shutdown even without a response. + * Delivered to in-kernel IOKit drivers via <code>IOService::systemWillShutdown()</code>, + * and to clients of <code>registerPrioritySleepWakeInterest()</code>. + * Never delivered to user space notification clients. + */ #define kIOMessageSystemWillPowerOff iokit_common_msg(0x250) -#define kIOMessageSystemWillNotPowerOff iokit_common_msg(0x260) + +/*! + * @defined kIOMessageSystemWillRestart + * @discussion Indicates an imminent system restart. Recipients have a limited + * amount of time to respond, otherwise the system will timeout and + * restart even without a response. + * Delivered to in-kernel IOKit drivers via <code>IOService::systemWillShutdown()</code>, + * and to clients of <code>registerPrioritySleepWakeInterest()</code>. + * Never delivered to user space notification clients. + */ +#define kIOMessageSystemWillRestart iokit_common_msg(0x310) + +/*! + * @defined kIOMessageSystemPagingOff + * @discussion Indicates an imminent system shutdown, paging device now unavailable. + * Recipients have a limited amount of time to respond, otherwise the + * system will timeout and shutdown even without a response. + * Delivered to clients of <code>registerPrioritySleepWakeInterest()</code>. + * Never delivered to user space notification clients. + */ +#define kIOMessageSystemPagingOff iokit_common_msg(0x255) + + +/*! @group System sleep and wake notifications + */ + +/*! + * @defined kIOMessageCanSystemSleep + * @discussion Announces/Requests permission to proceed to system sleep. + * Delivered to in-kernel IOKit drivers via <code>kIOGeneralInterest</code> + * and <code>kIOPriorityPowerStateInterest</code>. + * Delivered to user clients of <code>IORegisterForSystemPower</code>. + */ #define kIOMessageCanSystemSleep iokit_common_msg(0x270) -#define kIOMessageSystemWillSleep iokit_common_msg(0x280) + +/*! + * @defined kIOMessageSystemWillNotSleep + * @discussion Announces that the system has retracted a previous attempt to sleep; + * it follows <code>kIOMessageCanSystemSleep</code>. + * Delivered to in-kernel IOKit drivers via <code>kIOGeneralInterest</code> + * and <code>kIOPriorityPowerStateInterest</code>. + * Delivered to user clients of <code>IORegisterForSystemPower</code>. + */ #define kIOMessageSystemWillNotSleep iokit_common_msg(0x290) -#define kIOMessageSystemHasPoweredOn iokit_common_msg(0x300) -#define kIOMessageSystemWillRestart iokit_common_msg(0x310) + +/*! + * @defined kIOMessageSystemWillSleep + * @discussion Announces that sleep is beginning. + * Delivered to in-kernel IOKit drivers via <code>kIOGeneralInterest</code> + * and <code>kIOPriorityPowerStateInterest</code>. + * Delivered to user clients of <code>IORegisterForSystemPower</code>. + */ +#define kIOMessageSystemWillSleep iokit_common_msg(0x280) + +/*! 
+ * @defined kIOMessageSystemWillPowerOn + * @discussion Announces that the system is beginning to power the device tree; most + * devices are unavailable at this point.. + * Delivered to in-kernel IOKit drivers via <code>kIOGeneralInterest</code> + * and <code>kIOPriorityPowerStateInterest</code>. + * Delivered to user clients of <code>IORegisterForSystemPower</code>. + */ #define kIOMessageSystemWillPowerOn iokit_common_msg(0x320) -#define kIOMessageCopyClientID iokit_common_msg(0x330) +/*! + * @defined kIOMessageSystemHasPoweredOn + * @discussion Announces that the system and its devices have woken up. + * Delivered to in-kernel IOKit drivers via <code>kIOGeneralInterest</code> + * and <code>kIOPriorityPowerStateInterest</code>. + * Delivered to user clients of <code>IORegisterForSystemPower</code>. + */ +#define kIOMessageSystemHasPoweredOn iokit_common_msg(0x300) + +/*! @group Unused and deprecated notifications + */ + +/*! + * @defined kIOMessageCanDevicePowerOff + * @discussion Delivered to <code>kIOAppPowerStateInterest</code> clients of + * devices that implement their own idle timeouts. + * This message type is almost never used. + */ +#define kIOMessageCanDevicePowerOff iokit_common_msg(0x200) + +/*! + * @defined kIOMessageDeviceWillNotPowerOff + * @discussion This IOKit interest notification is largely unused; + * it's not very interesting. + */ +#define kIOMessageDeviceWillNotPowerOff iokit_common_msg(0x220) + +/*! + * @defined kIOMessageSystemWillNotPowerOff + * @deprecated This IOKit message is unused. + */ +#define kIOMessageSystemWillNotPowerOff iokit_common_msg(0x260) + +/*! + * @defined kIOMessageCanSystemPowerOff + * @deprecated This IOKit message is unused. + */ +#define kIOMessageCanSystemPowerOff iokit_common_msg(0x240) + +/*! + * @defined kIOMessageDeviceWillPowerOn + * @discussion IOService power mgt does not send kIOMessageDeviceWillPowerOn. + */ +#define kIOMessageDeviceWillPowerOn iokit_common_msg(0x215) + +/*! + * @defined kIOMessageDeviceHasPoweredOff + * @discussion IOService power mgt does not send kIOMessageDeviceHasPoweredOff. + */ +#define kIOMessageDeviceHasPoweredOff iokit_common_msg(0x225) + #endif /* ! 
__IOKIT_IOMESSAGE_H */ diff --git a/iokit/IOKit/IONVRAM.h b/iokit/IOKit/IONVRAM.h index a9337bd1d..15bf709f6 100644 --- a/iokit/IOKit/IONVRAM.h +++ b/iokit/IOKit/IONVRAM.h @@ -29,17 +29,20 @@ #ifndef _IOKIT_IONVRAM_H #define _IOKIT_IONVRAM_H +#ifdef __cplusplus #include <IOKit/IOKitKeys.h> #include <IOKit/IOService.h> #include <IOKit/IODeviceTreeSupport.h> #include <IOKit/nvram/IONVRAMController.h> - +#endif /* __cplusplus */ #define kIODTNVRAMOFPartitionName "common" #define kIODTNVRAMXPRAMPartitionName "APL,MacOS75" #define kIODTNVRAMPanicInfoPartitonName "APL,OSXPanic" #define kIODTNVRAMFreePartitionName "wwwwwwwwwwww" +#define MIN_SYNC_NOW_INTERVAL 15*60 /* Minimum 15 Minutes interval mandated */ + enum { kIODTNVRAMImageSize = 0x2000, kIODTNVRAMXPRAMSize = 0x0100, @@ -60,6 +63,8 @@ enum { kOFVariablePermKernelOnly }; +#ifdef __cplusplus + class IODTNVRAM : public IOService { OSDeclareDefaultStructors(IODTNVRAM); @@ -86,6 +91,8 @@ private: UInt32 _piPartitionSize; UInt8 *_piImage; bool _systemPaniced; + SInt32 _lastDeviceSync; + bool _freshInterval; virtual UInt8 calculatePartitionChecksum(UInt8 *partitionHeader); virtual IOReturn initOFVariables(void); @@ -162,6 +169,9 @@ public: IOByteCount length); virtual IOByteCount savePanicInfo(UInt8 *buffer, IOByteCount length); + virtual bool safeToSync(void); }; +#endif /* __cplusplus */ + #endif /* !_IOKIT_IONVRAM_H */ diff --git a/iokit/IOKit/IOPlatformExpert.h b/iokit/IOKit/IOPlatformExpert.h index f75a3e3ab..a27cf64ad 100644 --- a/iokit/IOKit/IOPlatformExpert.h +++ b/iokit/IOKit/IOPlatformExpert.h @@ -57,7 +57,8 @@ enum { kPEHangCPU, kPEUPSDelayHaltCPU, kPEPanicRestartCPU, - kPEPanicSync + kPEPanicSync, + kPEPagingOff }; extern int (*PE_halt_restart)(unsigned int type); extern int PEHaltRestart(unsigned int type); @@ -68,6 +69,12 @@ extern UInt32 PESavePanicInfo(UInt8 *buffer, UInt32 length); extern long PEGetGMTTimeOfDay( void ); extern void PESetGMTTimeOfDay( long secs ); +/* unless it's a "well-known" property, these will read/write out the value as raw data */ + +extern boolean_t PEWriteNVRAMProperty(const char *symbol, const void *value, const unsigned int len); + +extern boolean_t PEReadNVRAMProperty(const char *symbol, void *value, unsigned int *len); + #ifdef __cplusplus } /* extern "C" */ diff --git a/iokit/IOKit/IOService.h b/iokit/IOKit/IOService.h index f2f513929..c3282f8ea 100644 --- a/iokit/IOKit/IOService.h +++ b/iokit/IOKit/IOService.h @@ -134,6 +134,7 @@ extern const OSSymbol * gIOBusyInterest; extern const OSSymbol * gIOOpenInterest; extern const OSSymbol * gIOAppPowerStateInterest; extern const OSSymbol * gIOPriorityPowerStateInterest; +extern const OSSymbol * gIOConsoleSecurityInterest; extern const OSSymbol * gIODeviceMemoryKey; extern const OSSymbol * gIOInterruptControllersKey; @@ -434,25 +435,6 @@ private: OSMetaClassDeclareReservedUnused(IOService, 46); OSMetaClassDeclareReservedUnused(IOService, 47); -#ifdef __ppc__ - OSMetaClassDeclareReservedUnused(IOService, 48); - OSMetaClassDeclareReservedUnused(IOService, 49); - OSMetaClassDeclareReservedUnused(IOService, 50); - OSMetaClassDeclareReservedUnused(IOService, 51); - OSMetaClassDeclareReservedUnused(IOService, 52); - OSMetaClassDeclareReservedUnused(IOService, 53); - OSMetaClassDeclareReservedUnused(IOService, 54); - OSMetaClassDeclareReservedUnused(IOService, 55); - OSMetaClassDeclareReservedUnused(IOService, 56); - OSMetaClassDeclareReservedUnused(IOService, 57); - OSMetaClassDeclareReservedUnused(IOService, 58); - OSMetaClassDeclareReservedUnused(IOService, 
59); - OSMetaClassDeclareReservedUnused(IOService, 60); - OSMetaClassDeclareReservedUnused(IOService, 61); - OSMetaClassDeclareReservedUnused(IOService, 62); - OSMetaClassDeclareReservedUnused(IOService, 63); -#endif - public: /*! @function getState @abstract Accessor for IOService state bits, not normally needed or used outside IOService. @@ -1220,6 +1202,8 @@ public: static void setPMRootDomain( class IOPMrootDomain * rootDomain ); static IOReturn catalogNewDrivers( OSOrderedSet * newTables ); uint64_t getAccumulatedBusyTime( void ); + static void updateConsoleUsers(OSArray * consoleUsers, IOMessage systemMessage); + static void consoleLockTimer(thread_call_param_t p0, thread_call_param_t p1); private: static IOReturn waitMatchIdle( UInt32 ms ); @@ -1312,10 +1296,13 @@ private: static void terminateThread( void * arg, wait_result_t unused ); static void terminateWorker( IOOptionBits options ); static void actionWillTerminate( IOService * victim, IOOptionBits options, - OSArray * doPhase2List ); - static void actionDidTerminate( IOService * victim, IOOptionBits options ); - static void actionFinalize( IOService * victim, IOOptionBits options ); - static void actionStop( IOService * client, IOService * provider ); + OSArray * doPhase2List, void*, void * ); + static void actionDidTerminate( IOService * victim, IOOptionBits options, + void *, void *, void *); + static void actionFinalize( IOService * victim, IOOptionBits options, + void *, void *, void *); + static void actionStop( IOService * client, IOService * provider, + void *, void *, void *); APPLE_KEXT_COMPATIBILITY_VIRTUAL IOReturn resolveInterrupt(IOService *nub, int source); @@ -1337,8 +1324,8 @@ public: virtual void PMinit( void ); /*! @function PMstop - @abstract Frees and removes the driver from power management. - @discussion The power managment variables don't exist after this call and the power managment methods in the caller shouldn't be called. + @abstract Stop power managing the driver. + @discussion Removes the driver from the power plane and stop its power management. This method is synchronous against any power management method invocations (e.g. <code>setPowerState</code> or <code>setAggressiveness</code>), so when this method returns it is guaranteed those power management methods will not be entered. Driver should not call any power management methods after this call. Calling <code>PMstop</code> cleans up for the three power management initialization calls: @link PMinit PMinit@/link, @link joinPMtree joinPMtree@/link, and @link registerPowerDriver registerPowerDriver@/link. */ virtual void PMstop( void ); @@ -1368,6 +1355,7 @@ public: /*! @function registerInterestedDriver @abstract Allows an IOService object to register interest in the changing power state of a power-managed IOService object. @discussion Call <code>registerInterestedDriver</code> on the IOService object you are interested in receiving power state messages from, and pass a pointer to the interested driver (<code>this</code>) as an argument. + The interested driver is retained until the power interest is removed by calling <code>deRegisterInterestedDriver</code>. The interested driver should override @link powerStateWillChangeTo powerStateWillChangeTo@/link and @link powerStateDidChangeTo powerStateDidChangeTo@/link to receive these power change messages. 
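A minimal sketch of the register/deregister pairing described above; 'provider' stands in for whatever IOService the driver is watching, and the surrounding driver code is hypothetical:

    // Sketch: per the new retention note, the target retains this object
    // until the interest is removed, so deregister before this driver is freed.
    IOPMPowerFlags flags = provider->registerInterestedDriver(this);

    // ... later, during teardown:
    provider->deRegisterInterestedDriver(this);
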
Interested drivers must acknowledge power changes in <code>powerStateWillChangeTo</code> or <code>powerStateDidChangeTo</code>, either via return value or later calls to @link acknowledgePowerChange acknowledgePowerChange@/link. @param theDriver The driver of interest adds this pointer to the list of interested drivers. It informs drivers on this list before and after the power change. @@ -1378,7 +1366,8 @@ public: /*! @function deRegisterInterestedDriver @abstract De-registers power state interest from a previous call to <code>registerInterestedDriver</code>. - @discussion Most drivers do not need to override <code>deRegisterInterestedDriver</code>. + @discussion The retain from <code>registerInterestedDriver</code> is released. This method is synchronous against any <code>powerStateWillChangeTo</code> or <code>powerStateDidChangeTo</code> call targeting the interested driver, so when this method returns it is guaranteed those interest handlers will not be entered. + Most drivers do not need to override <code>deRegisterInterestedDriver</code>. @param theDriver The interested driver previously passed into @link registerInterestedDriver registerInterestedDriver@/link. @result A return code that can be ignored by the caller. */ @@ -1725,10 +1714,13 @@ protected: #ifdef XNU_KERNEL_PRIVATE /* Power management internals */ public: + void idleTimerExpired( void ); void settleTimerExpired( void ); - IOReturn synchronizePowerTree( void ); - bool assertPMThreadCall( void ); - void deassertPMThreadCall( void ); + IOReturn synchronizePowerTree( IOOptionBits options = 0, IOService * notifyRoot = 0 ); + bool assertPMDriverCall( IOPMDriverCallEntry * callEntry, IOOptionBits options = 0, IOPMinformee * inform = 0 ); + void deassertPMDriverCall( IOPMDriverCallEntry * callEntry ); + IOReturn changePowerStateWithOverrideTo( unsigned long ordinal ); + static const char * getIOMessageString( uint32_t msg ); #ifdef __LP64__ static IOWorkLoop * getPMworkloop( void ); @@ -1736,10 +1728,7 @@ public: protected: bool tellClientsWithResponse( int messageType ); - bool tellClientsWithResponse( int messageType, bool (*)(OSObject *, void *) ); void tellClients( int messageType ); - void tellClients( int messageType, bool (*)(OSObject *, void *) ); - IOReturn changePowerStateWithOverrideTo( unsigned long ordinal ); private: #ifndef __LP64__ @@ -1752,44 +1741,44 @@ private: void PMfree( void ); bool tellChangeDown1 ( unsigned long ); bool tellChangeDown2 ( unsigned long ); - IOReturn startPowerChange ( unsigned long, unsigned long, unsigned long, IOPowerConnection *, unsigned long ); + IOReturn startPowerChange( IOPMPowerChangeFlags, IOPMPowerStateIndex, IOPMPowerFlags, IOPowerConnection *, IOPMPowerFlags ); void setParentInfo ( IOPMPowerFlags, IOPowerConnection *, bool ); - IOReturn notifyAll ( int nextMachineState, bool is_prechange ); - bool notifyChild ( IOPowerConnection * nextObject, bool is_prechange ); + IOReturn notifyAll ( uint32_t nextMS ); + bool notifyChild ( IOPowerConnection * child ); // power change initiated by driver void OurChangeStart( void ); + void OurSyncStart ( void ); void OurChangeTellClientsPowerDown ( void ); void OurChangeTellPriorityClientsPowerDown ( void ); + void OurChangeTellCapabilityWillChange ( void ); void OurChangeNotifyInterestedDriversWillChange ( void ); void OurChangeSetPowerState ( void ); void OurChangeWaitForPowerSettle ( void ); void OurChangeNotifyInterestedDriversDidChange ( void ); + void OurChangeTellCapabilityDidChange ( void ); void OurChangeFinish ( void ); - void 
OurSyncStart ( void ); // downward power change initiated by a power parent IOReturn ParentChangeStart( void ); - void ParentDownTellPriorityClientsPowerDown ( void ); - void ParentDownNotifyInterestedDriversWillChange ( void ); - void ParentDownNotifyDidChangeAndAcknowledgeChange ( void ); - void ParentDownSetPowerState ( void ); - void ParentDownWaitForPowerSettle ( void ); - void ParentAcknowledgePowerChange ( void ); - - // upward power change initiated by a power parent - void ParentUpSetPowerState ( void ); - void ParentUpWaitForSettleTime ( void ); - void ParentUpNotifyInterestedDriversDidChange ( void ); + void ParentChangeTellPriorityClientsPowerDown ( void ); + void ParentChangeTellCapabilityWillChange ( void ); + void ParentChangeNotifyInterestedDriversWillChange ( void ); + void ParentChangeSetPowerState ( void ); + void ParentChangeWaitForPowerSettle ( void ); + void ParentChangeNotifyInterestedDriversDidChange ( void ); + void ParentChangeTellCapabilityDidChange ( void ); + void ParentChangeAcknowledgePowerChange ( void ); void all_done ( void ); void start_ack_timer ( void ); void stop_ack_timer ( void ); void startSettleTimer( void ); bool checkForDone ( void ); - bool responseValid ( unsigned long x, int pid ); + bool responseValid ( uint32_t x, int pid ); void computeDesiredState ( unsigned long tempDesire = 0 ); void rebuildChildClampBits ( void ); + void tellSystemCapabilityChange( uint32_t nextMS ); static void ack_timer_expired( thread_call_param_t, thread_call_param_t ); static IOReturn actionAckTimerExpired(OSObject *, void *, void *, void *, void * ); @@ -1797,8 +1786,10 @@ private: static IOPMRequest * acquirePMRequest( IOService * target, IOOptionBits type, IOPMRequest * active = 0 ); static void releasePMRequest( IOPMRequest * request ); static void pmDriverCallout( IOService * from ); - static void pmTellClientWithResponse( OSObject * object, void * context ); static void pmTellAppWithResponse( OSObject * object, void * context ); + static void pmTellClientWithResponse( OSObject * object, void * context ); + static void pmTellCapabilityAppWithResponse ( OSObject * object, void * arg ); + static void pmTellCapabilityClientWithResponse( OSObject * object, void * arg ); bool ackTimerTick( void ); void addPowerChild1( IOPMRequest * request ); void addPowerChild2( IOPMRequest * request ); @@ -1831,14 +1822,15 @@ private: void driverInformPowerChange( void ); bool isPMBlocked( IOPMRequest * request, int count ); void notifyChildren( void ); - void notifyChildrenDone( void ); + void notifyChildrenOrdered( void ); + void notifyChildrenDelayed( void ); void cleanClientResponses ( bool logErrors ); - void idleTimerExpired( IOTimerEventSource * ); void updatePowerClient( const OSSymbol * client, uint32_t powerState ); void removePowerClient( const OSSymbol * client ); uint32_t getPowerStateForClient( const OSSymbol * client ); IOReturn requestPowerState( const OSSymbol * client, uint32_t state ); - IOReturn requestDomainPower( unsigned long ourPowerState, IOOptionBits options = 0 ); + IOReturn requestDomainPower( IOPMPowerStateIndex ourPowerState, IOOptionBits options = 0 ); + void waitForPMDriverCall( IOService * target = 0 ); #endif /* XNU_KERNEL_PRIVATE */ }; diff --git a/iokit/IOKit/IOServicePM.h b/iokit/IOKit/IOServicePM.h index 96edc11c0..2a2c4c400 100644 --- a/iokit/IOKit/IOServicePM.h +++ b/iokit/IOKit/IOServicePM.h @@ -47,6 +47,15 @@ class IOPMRequest; class IOPMRequestQueue; class IOPMCompletionQueue; +typedef unsigned long IOPMPowerStateIndex; +typedef uint32_t 
IOPMPowerChangeFlags; + +struct IOPMDriverCallEntry { + queue_chain_t link; + thread_t thread; + IOService * target; +}; + /* Binary compatibility with drivers that access pm_vars */ #ifdef __LP64__ #define PM_VARS_SUPPORT 0 diff --git a/iokit/IOKit/IOSharedLock.h b/iokit/IOKit/IOSharedLock.h index eadfc407d..795007451 100644 --- a/iokit/IOKit/IOSharedLock.h +++ b/iokit/IOKit/IOSharedLock.h @@ -1,19 +1,14 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2010 Apple Computer, Inc. All rights reserved. * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * @APPLE_LICENSE_HEADER_START@ * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER @@ -23,70 +18,20 @@ * Please see the License for the specific language governing rights and * limitations under the License. * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1998 Apple Computer, Inc. All rights reserved. - * - * HISTORY - * - */ - -/* - * Multiprocessor locks used within the shared memory area between the - * kernel and event system. These must work in both user and kernel mode. - * - * These routines are public, for the purpose of writing frame buffer device - * drivers which handle their own cursors. Certain architectures define a - * generic display class which handles cursor drawing and is subclassed by - * driver writers. These drivers need not be concerned with the following - * types and definitions. - * - * The ev_lock(), ev_unlock(), and ev_try_lock() functions are available only - * to drivers built in or dynamically loaded into the kernel, and to DPS - * drivers built in or dynamically loaded into the Window Server. They do not - * exist in any shared library. - * - * --> They're now in IOKit user lib. + * @APPLE_LICENSE_HEADER_END@ */ #ifndef _IOKIT_IOSHAREDLOCK_H #define _IOKIT_IOSHAREDLOCK_H -#ifdef __cplusplus -extern "C" { -#endif - -// should be 32 bytes on PPC -typedef volatile int IOSharedLockData; -typedef IOSharedLockData * IOSharedLock; - -#define IOSpinLockInit(l) (*(l) = (IOSharedLockData)0) - -#ifndef KERNEL -extern void IOSpinLock(IOSharedLock l); -#endif +#include <libkern/OSAtomic.h> -extern void IOSpinUnlock(IOSharedLock l); -extern boolean_t IOTrySpinLock(IOSharedLock l); +#define IOSharedLockData OSSpinLock +#define ev_lock_data_t OSSpinLock -/* exact same stuff & implementation */ - -typedef IOSharedLockData ev_lock_data_t; -typedef ev_lock_data_t * ev_lock_t; - -#define ev_init_lock(l) (*(l) = (ev_lock_data_t)0) -// needs isync? 
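The rest of this hunk, continuing below, deletes the remaining hand-rolled ev_* declarations and maps the surviving names onto libkern's OSSpinLock. A sketch of kernel-side client code against the new aliases; the lock variable and critical section are hypothetical:

    // Sketch: in the kernel case, ev_try_lock/ev_unlock now resolve to
    // OSSpinLockTry/OSSpinLockUnlock, so existing call sites keep working.
    #include <libkern/OSAtomic.h>

    static IOSharedLockData cursorLock;   // now just an OSSpinLock

    if (ev_try_lock(&cursorLock)) {
        // ... touch the shared cursor state ...
        ev_unlock(&cursorLock);
    }
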
-//#define ev_is_locked(l) (*(l) != (ev_lock_data_t)0) - -#ifndef KERNEL -extern void ev_lock(ev_lock_t l); // Spin lock! +#ifdef KERNEL +#define ev_unlock(l) OSSpinLockUnlock(l) +#define ev_try_lock(l) OSSpinLockTry(l) #endif -extern void ev_unlock(ev_lock_t l); -extern boolean_t ev_try_lock(ev_lock_t l); - -#ifdef __cplusplus -} -#endif #endif /* ! _IOKIT_IOSHAREDLOCK_H */ diff --git a/iokit/IOKit/IOStatistics.h b/iokit/IOKit/IOStatistics.h new file mode 100644 index 000000000..0c4a1abb2 --- /dev/null +++ b/iokit/IOKit/IOStatistics.h @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2010 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef _IOKIT_STATISTICS_H +#define _IOKIT_STATISTICS_H + +#define IOSTATISTICS_SIG 'IOST' +#define IOSTATISTICS_SIG_USERCLIENT 'IOSU' +#define IOSTATISTICS_SIG_WORKLOOP 'IOSW' + +/* Update when the binary format changes */ +#define IOSTATISTICS_VER 0x2 + +enum { + kIOStatisticsDriverNameLength = 64, + kIOStatisticsClassNameLength = 64, + kIOStatisticsProcessNameLength = 20 +}; + +enum { + kIOStatisticsDerivedEventSourceCounter = 0, + kIOStatisticsTimerEventSourceCounter, + kIOStatisticsCommandGateCounter, + kIOStatisticsCommandQueueCounter, + kIOStatisticsInterruptEventSourceCounter, + kIOStatisticsFilterInterruptEventSourceCounter +}; + +typedef uint32_t IOStatisticsCounterType; + +enum { + kIOStatisticsGeneral = 0, + kIOStatisticsWorkLoop, + kIOStatisticsUserClient +}; + +/* Keep our alignments as intended */ + +#pragma pack(4) + +/* Event Counters */ + +typedef struct IOStatisticsInterruptEventSources { + uint32_t created; + uint32_t produced; + uint32_t checksForWork; +} IOStatisticsInterruptEventSources; + +typedef struct IOStatisticsTimerEventSources { + uint32_t created; + uint32_t openGateCalls; + uint32_t closeGateCalls; + uint64_t timeOnGate; + uint32_t timeouts; + uint32_t checksForWork; +} IOStatisticsTimerEventSources; + +typedef struct IOStatisticsDerivedEventSources { + uint32_t created; + uint32_t openGateCalls; + uint32_t closeGateCalls; + uint64_t timeOnGate; +} IOStatisticsDerivedEventSources; + +typedef struct IOStatisticsCommandGates { + uint32_t created; + uint32_t openGateCalls; + uint32_t closeGateCalls; + uint64_t timeOnGate; + uint32_t actionCalls; +} IOStatisticsCommandGates; + +typedef struct IOStatisticsCommandQueues { + uint32_t created; + uint32_t actionCalls; +} IOStatisticsCommandQueues; + +typedef struct IOStatisticsUserClients { + uint32_t created; + uint32_t clientCalls; +} IOStatisticsUserClients; + +/* General mode */ + +typedef struct IOStatisticsHeader { + uint32_t sig; /* 'IOST' */ + uint32_t ver; /* incremented with every data revision */ + + uint32_t seq; /* sequence ID */ + + uint32_t globalStatsOffset; + uint32_t kextStatsOffset; + uint32_t memoryStatsOffset; + uint32_t classStatsOffset; + uint32_t counterStatsOffset; + uint32_t kextIdentifiersOffset; + uint32_t classNamesOffset; + + /* struct IOStatisticsGlobal */ + /* struct IOStatisticsKext */ + /* struct IOStatisticsMemory */ + /* struct IOStatisticsClass */ + /* struct IOStatisticsCounter */ + /* struct IOStatisticsKextIdentifier */ + /* struct IOStatisticsClassName */ +} IOStatisticsHeader; + +typedef struct IOStatisticsGlobal { + uint32_t kextCount; + uint32_t classCount; + uint32_t workloops; +} IOStatisticsGlobal; + +typedef struct IOStatisticsKext { + uint32_t loadTag; + uint32_t loadSize; + uint32_t wiredSize; + uint32_t classes; /* Number of classes owned */ + uint32_t classIndexes[]; /* Variable length array of owned class indexes */ +} IOStatisticsKext; + +typedef struct IOStatisticsMemory { + uint32_t allocatedSize; + uint32_t freedSize; + uint32_t allocatedAlignedSize; + uint32_t freedAlignedSize; + uint32_t allocatedContiguousSize; + uint32_t freedContiguousSize; + uint32_t allocatedPageableSize; + uint32_t freedPageableSize; +} IOStatisticsMemory; + +typedef struct IOStatisticsClass { + uint32_t classID; + uint32_t superClassID; + uint32_t classSize; +} IOStatisticsClass; + +typedef struct IOStatisticsCounter { + uint32_t classID; + uint32_t classInstanceCount; + struct IOStatisticsUserClients userClientStatistics; + 
struct IOStatisticsInterruptEventSources interruptEventSourceStatistics; + struct IOStatisticsInterruptEventSources filterInterruptEventSourceStatistics; + struct IOStatisticsTimerEventSources timerEventSourceStatistics; + struct IOStatisticsCommandGates commandGateStatistics; + struct IOStatisticsCommandQueues commandQueueStatistics; + struct IOStatisticsDerivedEventSources derivedEventSourceStatistics; +} IOStatisticsCounter; + +typedef struct IOStatisticsKextIdentifier { + char identifier[kIOStatisticsDriverNameLength]; +} IOStatisticsKextIdentifier; + +typedef struct IOStatisticsClassName { + char name[kIOStatisticsClassNameLength]; +} IOStatisticsClassName; + +/* WorkLoop mode */ + +typedef struct IOStatisticsWorkLoop { + uint32_t attachedEventSources; + uint64_t timeOnGate; + uint32_t kextLoadTag; + uint32_t dependentKexts; + uint32_t dependentKextLoadTags[]; +} IOStatisticsWorkLoop; + +typedef struct IOStatisticsWorkLoopHeader { + uint32_t sig; /* 'IOSW */ + uint32_t ver; /* incremented with every data revision */ + uint32_t seq; /* sequence ID */ + uint32_t workloopCount; + struct IOStatisticsWorkLoop workLoopStats; +} IOStatisticsWorkLoopHeader; + +/* UserClient mode */ + +typedef struct IOStatisticsUserClientCall { + char processName[kIOStatisticsProcessNameLength]; + int32_t pid; + uint32_t calls; +} IOStatisticsUserClientCall; + +typedef struct IOStatisticsUserClientHeader { + uint32_t sig; /* 'IOSU */ + uint32_t ver; /* incremented with every data revision */ + uint32_t seq; /* sequence ID */ + uint32_t processes; + struct IOStatisticsUserClientCall userClientCalls[]; +} IOStatisticsUserClientHeader; + +#pragma pack() + +#endif /* _IOKIT_STATISTICS_H */ diff --git a/iokit/IOKit/IOStatisticsPrivate.h b/iokit/IOKit/IOStatisticsPrivate.h new file mode 100644 index 000000000..a41230c3d --- /dev/null +++ b/iokit/IOKit/IOStatisticsPrivate.h @@ -0,0 +1,359 @@ +/* + * Copyright (c) 2010 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#ifndef __IOKIT_STATISTICS_PRIVATE_H +#define __IOKIT_STATISTICS_PRIVATE_H + +#if IOKITSTATS + +#include <sys/queue.h> +#include <sys/tree.h> + +#include <libkern/c++/OSKext.h> +#include <libkern/OSDebug.h> + +#include <IOKit/IOMemoryDescriptor.h> +#include <IOKit/IOStatistics.h> + +#ifndef KERNEL +#error IOStatisticsPrivate.h is for kernel use only +#endif + +/* Defines */ +#define IOKIT_STATISTICS_RECORDED_USERCLIENT_PROCS 20 + +#ifndef __probable +#define __probable(x) x +#endif + +/* Forward declarations */ +class IOWorkLoop; +class IOUserClient; +class IOEventSource; + +struct IOEventSourceCounter; +struct IOUserClientCounter; +struct IOWorkLoopCounter; +struct IOUserClientProcessEntry; + +struct KextNode; + +/* Allocation tracking */ + +enum { + kIOStatisticsMalloc = 0, + kIOStatisticsFree, + kIOStatisticsMallocAligned, + kIOStatisticsFreeAligned, + kIOStatisticsMallocContiguous, + kIOStatisticsFreeContiguous, + kIOStatisticsMallocPageable, + kIOStatisticsFreePageable, + kIOStatisticsAllocCount +}; + +TAILQ_HEAD(ProcessEntryList, IOUserClientProcessEntry); + +/* Tree and list structs */ + +typedef struct ClassNode { + RB_ENTRY(ClassNode) tLink; + SLIST_ENTRY(ClassNode) lLink; + struct KextNode *parentKext; + uint32_t classID; + uint32_t superClassID; + const OSMetaClass *metaClass; + SLIST_HEAD(, IOEventSourceCounter) counterList; + SLIST_HEAD(, IOUserClientCounter) userClientList; +} ClassNode; + +typedef struct KextNode { + RB_ENTRY(KextNode) link; + RB_ENTRY(KextNode) addressLink; + OSKext *kext; + OSKextLoadTag loadTag; + vm_offset_t address; + vm_offset_t address_end; + uint32_t memoryCounters[kIOStatisticsAllocCount]; + uint32_t classes; + SLIST_HEAD(, ClassNode) classList; + SLIST_HEAD(, IOWorkLoopCounter) workLoopList; + ProcessEntryList userClientCallList; +} KextNode; + +/* User client tracing */ + +typedef struct IOUserClientProcessEntry { + TAILQ_ENTRY(IOUserClientProcessEntry) link; + char processName[kIOStatisticsProcessNameLength]; + int32_t pid; + uint32_t calls; +} IOUserClientProcessEntry; + +/* Counters */ + +typedef struct IOInterruptEventSourceCounter { + uint32_t produced; + uint32_t checksForWork; +} IOInterruptEventSourceCounter; + +typedef struct IOTimerEventSourceCounter { + uint32_t timeouts; + uint32_t checksForWork; +} IOTimerEventSourceCounter; + +typedef struct IOCommandGateCounter { + uint32_t actionCalls; +} IOCommandGateCounter; + +typedef struct IOCommandQueueCounter { + uint32_t actionCalls; +} IOCommandQueueCounter; + +typedef struct IOEventSourceCounter { + SLIST_ENTRY(IOEventSourceCounter) link; + ClassNode *parentClass; + IOStatisticsCounterType type; + uint64_t startTimeStamp; + uint64_t timeOnGate; + uint32_t closeGateCalls; + uint32_t openGateCalls; + union { + IOInterruptEventSourceCounter interrupt; + IOInterruptEventSourceCounter filter; + IOTimerEventSourceCounter timer; + IOCommandGateCounter commandGate; + IOCommandQueueCounter commandQueue; + } u; +} IOEventSourceCounter; + +typedef struct IOWorkLoopDependency { + RB_ENTRY(IOWorkLoopDependency) link; + OSKextLoadTag loadTag; +} IOWorkLoopDependency; + +typedef struct IOWorkLoopCounter { + SLIST_ENTRY(IOWorkLoopCounter) link; + KextNode *parentKext; + int attachedEventSources; + IOWorkLoop *workLoop; + uint64_t startTimeStamp; + uint64_t timeOnGate; + uint32_t closeGateCalls; + uint32_t openGateCalls; + typedef RB_HEAD(DependencyTree, IOWorkLoopDependency) DependencyTreeHead; + DependencyTreeHead dependencyHead; + static int 
loadTagCompare(IOWorkLoopDependency *e1, IOWorkLoopDependency *e2); + RB_PROTOTYPE_SC(static, DependencyTree, IOWorkLoopDependency, dependencyLink, KextTagCompare); +} IOWorkLoopCounter; + +typedef struct IOUserClientCounter { + SLIST_ENTRY(IOUserClientCounter) link; + ClassNode *parentClass; + uint32_t clientCalls; +} IOUserClientCounter; + +class IOStatistics { + static bool enabled; + + static IORWLock *lock; + + static uint32_t sequenceID; + + static uint32_t lastKextIndex; + static uint32_t lastClassIndex; + + static uint32_t loadedKexts; + static uint32_t registeredClasses; + static uint32_t registeredCounters; + static uint32_t registeredWorkloops; + + static uint32_t attachedEventSources; + + static KextNode *kextHint; + + static IOWorkLoopDependency *nextWorkLoopDependency; + + typedef RB_HEAD(KextTree, KextNode) KextTreeHead; + static KextTreeHead kextHead; + static int kextNodeCompare(KextNode *e1, KextNode *e2); + RB_PROTOTYPE_SC(static, KextTree, KextNode, link, kextNodeCompare); + + typedef RB_HEAD(KextAddressTree, KextNode) KextAddressTreeHead; + static KextAddressTreeHead kextAddressHead; + static int kextAddressNodeCompare(KextNode *e1, KextNode *e2); + RB_PROTOTYPE_SC(static, KextAddressTree, KextNode, addressLink, kextAddressNodeCompare); + + typedef RB_HEAD(ClassTree, ClassNode) ClassTreeHead; + static ClassTreeHead classHead; + static int classNodeCompare(ClassNode *e1, ClassNode *e2); + RB_PROTOTYPE_SC(static, ClassTree, ClassNode, tLink, classNodeCompare); + + static int oid_sysctl(__unused struct sysctl_oid *oidp, __unused void *arg1, int arg2, struct sysctl_req *req); + + static uint32_t copyGlobalStatistics(IOStatisticsGlobal *stats); + static uint32_t copyKextStatistics(IOStatisticsKext *stats); + static uint32_t copyMemoryStatistics(IOStatisticsMemory *stats); + static uint32_t copyClassStatistics(IOStatisticsClass *stats); + static uint32_t copyCounterStatistics(IOStatisticsCounter *stats); + static uint32_t copyKextIdentifiers(IOStatisticsKextIdentifier *kextIDs); + static uint32_t copyClassNames(IOStatisticsClassName *classNames); + + static uint32_t copyWorkLoopStatistics(IOStatisticsWorkLoop *workLoopStats); + + static uint32_t copyUserClientStatistics(IOStatisticsUserClientHeader *stats, uint32_t loadTag); + + static void updateAllocationCounter(vm_offset_t address, uint32_t index, vm_size_t size); + + static void storeUserClientCallInfo(IOUserClient *userClient, IOUserClientCounter *counter); + + static KextNode *getKextNodeFromBacktrace(boolean_t write); + static void releaseKextNode(KextNode *node); + +public: + + static void initialize(); + + static void onKextLoad(OSKext *kext, kmod_info_t *kmod_info); + static void onKextUnload(OSKext *kext); + static void onClassAdded(OSKext *parentKext, OSMetaClass *metaClass); + static void onClassRemoved(OSKext *parentKext, OSMetaClass *metaClass); + + static IOEventSourceCounter *registerEventSource(OSObject *inOwner); + static void unregisterEventSource(IOEventSourceCounter *counter); + + static IOWorkLoopCounter *registerWorkLoop(IOWorkLoop *workLoop); + static void unregisterWorkLoop(IOWorkLoopCounter *counter); + + static IOUserClientCounter *registerUserClient(IOUserClient *userClient); + static void unregisterUserClient(IOUserClientCounter *counter); + + static int getStatistics(sysctl_req *req); + static int getWorkLoopStatistics(sysctl_req *req); + static int getUserClientStatistics(sysctl_req *req); + + /* Inlines for counter manipulation. 
+ * + * NOTE: counter access is not expressly guarded here so as not to incur performance penalties + * in the instrumented parent objects. Writes are arranged so as to be protected by pre-existing + * locks in the parent where appropriate, but reads have no such guarantee. Counters should + * therefore be regarded as providing an indication of current state, rather than precisely + * accurate statistics. + */ + + static inline void setCounterType(IOEventSourceCounter *counter, IOStatisticsCounterType type) { + if (counter) { + counter->type = type; + } + } + + static inline void countOpenGate(IOEventSourceCounter *counter) { + if (counter) { + counter->timeOnGate += mach_absolute_time() - counter->startTimeStamp; + counter->openGateCalls++; + } + } + + static inline void countCloseGate(IOEventSourceCounter *counter) { + if (counter) { + counter->startTimeStamp = mach_absolute_time(); + counter->closeGateCalls++; + } + } + + /* Interrupt */ + static inline void countInterruptCheckForWork(IOEventSourceCounter *counter) { + if (counter) { + counter->u.interrupt.checksForWork++; + } + } + + static inline void countInterrupt(IOEventSourceCounter *counter) { + if (counter) { + counter->u.interrupt.produced++; + } + } + + /* CommandQueue */ + static inline void countCommandQueueActionCall(IOEventSourceCounter *counter) { + if (counter) { + counter->u.commandQueue.actionCalls++; + } + } + + /* CommandGate */ + static inline void countCommandGateActionCall(IOEventSourceCounter *counter) { + if (counter) { + counter->u.commandGate.actionCalls++; + } + } + + /* Timer */ + static inline void countTimerTimeout(IOEventSourceCounter *counter) { + if (counter) { + counter->u.timer.timeouts++; + } + } + + /* WorkLoop */ + static void attachWorkLoopEventSource(IOWorkLoopCounter *wlc, IOEventSourceCounter *esc); + static void detachWorkLoopEventSource(IOWorkLoopCounter *wlc, IOEventSourceCounter *esc); + + static inline void countWorkLoopOpenGate(IOWorkLoopCounter *counter) { + if (counter) { + counter->timeOnGate += mach_absolute_time() - counter->startTimeStamp; + counter->openGateCalls++; + } + } + + static inline void countWorkLoopCloseGate(IOWorkLoopCounter *counter) { + if (counter) { + counter->startTimeStamp = mach_absolute_time(); + counter->closeGateCalls++; + } + } + + /* IOLib allocations */ + static void countAlloc(uint32_t index, vm_size_t size); + + /* UserClient */ + static void countUserClientCall(IOUserClient *client); +}; + +#else + +/* Statistics disabled */ + +class IOStatistics { +public: + static void initialize() {} +}; + +#endif /* IOKITSTATS */ + +#endif /* __IOKIT_STATISTICS_PRIVATE_H */ diff --git a/iokit/IOKit/IOTimeStamp.h b/iokit/IOKit/IOTimeStamp.h index a1d22f4d3..b551fd723 100644 --- a/iokit/IOKit/IOTimeStamp.h +++ b/iokit/IOKit/IOTimeStamp.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -190,4 +190,8 @@ IOTimeStamp(uintptr_t csc, #define IOSERVICE_TERMINATE_STOP_DEFER 16 /* 0x05080040 */ #define IOSERVICE_TERMINATE_DONE 17 /* 0x05080044 */ +#define IOSERVICE_KEXTD_ALIVE 18 /* 0x05080048 */ +#define IOSERVICE_KEXTD_READY 19 /* 0x0508004C */ +#define IOSERVICE_REGISTRY_QUIET 20 /* 0x05080050 */ + #endif /* ! 
IOKIT_IOTIMESTAMP_H */ diff --git a/iokit/IOKit/IOTimerEventSource.h b/iokit/IOKit/IOTimerEventSource.h index 7cc0d38c3..bbbeaf964 100644 --- a/iokit/IOKit/IOTimerEventSource.h +++ b/iokit/IOKit/IOTimerEventSource.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2000, 2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -95,10 +95,6 @@ protected: @abstract Sub-class implementation of free method, frees calloutEntry */ virtual void free(); -/*! @function checkForWork - @abstract Have to implement it is mandatory in $link IOEventSource, but IOTimerEventSources don't actually use this work-loop mechanism. */ - virtual bool checkForWork(); - virtual void setWorkLoop(IOWorkLoop *workLoop); public: diff --git a/iokit/IOKit/IOTypes.h b/iokit/IOKit/IOTypes.h index 9f5d5a3f7..3c41ab070 100644 --- a/iokit/IOKit/IOTypes.h +++ b/iokit/IOKit/IOTypes.h @@ -164,7 +164,12 @@ typedef unsigned int IOAlignment; #ifndef __IOKIT_PORTS_DEFINED__ #define __IOKIT_PORTS_DEFINED__ #ifdef KERNEL +#ifdef __cplusplus +class OSObject; +typedef OSObject * io_object_t; +#else typedef struct OSObject * io_object_t; +#endif #else /* KERNEL */ typedef mach_port_t io_object_t; #endif /* KERNEL */ diff --git a/iokit/IOKit/IOUserClient.h b/iokit/IOKit/IOUserClient.h index 7283ebd41..c3c40c57a 100644 --- a/iokit/IOKit/IOUserClient.h +++ b/iokit/IOKit/IOUserClient.h @@ -37,6 +37,9 @@ #include <IOKit/IOService.h> #include <IOKit/OSMessageNotification.h> +#if IOKITSTATS +#include <IOKit/IOStatisticsPrivate.h> +#endif enum { kIOUCTypeMask = 0x0000000f, @@ -164,18 +167,29 @@ enum { class IOUserClient : public IOService { OSDeclareAbstractStructors(IOUserClient) +#if IOKITSTATS + friend class IOStatistics; +#endif protected: /*! @struct ExpansionData @discussion This structure will be used to expand the capablilties of this class in the future. */ - struct ExpansionData { }; + struct ExpansionData { +#if IOKITSTATS + IOUserClientCounter *counter; +#else + void *iokitstatsReserved; +#endif + }; /*! @var reserved Reserved for future use. (Internal use only) */ ExpansionData * reserved; + bool reserve(); + #ifdef XNU_KERNEL_PRIVATE public: #else diff --git a/iokit/IOKit/IOWorkLoop.h b/iokit/IOKit/IOWorkLoop.h index 808329ada..e248a9b3b 100644 --- a/iokit/IOKit/IOWorkLoop.h +++ b/iokit/IOKit/IOWorkLoop.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2007 Apple Inc. All rights reserved. + * Copyright (c) 1998-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,14 +25,6 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* -Copyright (c) 1998 Apple Computer, Inc. All rights reserved. -HISTORY - 1998-7-13 Godfrey van der Linden(gvdl) - Created. - 1998-10-30 Godfrey van der Linden(gvdl) - Converted to C++ -*/ #ifndef __IOKIT_IOWORKLOOP_H #define __IOKIT_IOWORKLOOP_H @@ -44,6 +36,10 @@ HISTORY #include <IOKit/system.h> +#if IOKITSTATS +#include <IOKit/IOStatisticsPrivate.h> +#endif + class IOEventSource; class IOTimerEventSource; class IOCommandGate; @@ -87,7 +83,14 @@ private: @abstract Static function that calls the threadMain function. */ static void threadMainContinuation(IOWorkLoop *self); - + +/*! @function eventSourcePerformsWork + @abstract Checks if the event source passed in overrides checkForWork() to perform any work. +IOWorkLoop uses this to determine if the event source should be polled in runEventSources() or not. + @param inEventSource The event source to check. 
+*/ + bool eventSourcePerformsWork(IOEventSource *inEventSource); + protected: /*! @typedef maintCommandEnum @@ -138,6 +141,15 @@ protected: */ struct ExpansionData { IOOptionBits options; + IOEventSource *passiveEventChain; +#if DEBUG + void * allocationBacktrace[16]; +#endif /* DEBUG */ +#if IOKITSTATS + struct IOWorkLoopCounter *counter; +#else + void *iokitstatsReserved; +#endif }; /*! @var reserved @@ -237,13 +249,13 @@ public: /*! @function enableAllInterrupts @abstract Calls enable() in all interrupt event sources. - @discussion For all event sources (ES) for which IODynamicCast(IOInterruptEventSource, ES) is valid, in eventChain call enable() function. See IOEventSource::enable(). + @discussion For all event sources (ES) for which OSDynamicCast(IOInterruptEventSource, ES) is valid, in eventChain call enable() function. See IOEventSource::enable(). */ virtual void enableAllInterrupts() const; /*! @function disableAllInterrupts @abstract Calls disable() in all interrupt event sources. - @discussion For all event sources (ES) for which IODynamicCast(IOInterruptEventSource, ES) is valid, in eventChain call disable() function. See IOEventSource::disable(). + @discussion For all event sources (ES) for which OSDynamicCast(IOInterruptEventSource, ES) is valid, in eventChain call disable() function. See IOEventSource::disable(). */ virtual void disableAllInterrupts() const; @@ -252,6 +264,9 @@ protected: // Internal APIs used by event sources to control the thread friend class IOEventSource; friend class IOTimerEventSource; +#if IOKITSTATS + friend class IOStatistics; +#endif virtual void signalWorkAvailable(); virtual void openGate(); virtual void closeGate(); diff --git a/iokit/IOKit/Makefile b/iokit/IOKit/Makefile index 23d52274b..7b3c8df3e 100644 --- a/iokit/IOKit/Makefile +++ b/iokit/IOKit/Makefile @@ -3,9 +3,13 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir -IOKIT_FRAMEDIR = $(FRAMEDIR)/IOKit.framework/Versions/A -export INCDIR = $(IOKIT_FRAMEDIR)/Headers -export LCLDIR = $(IOKIT_FRAMEDIR)/PrivateHeaders +IOKIT_INCVERS = A +IOKIT_INCFRAME = $(FRAMEDIR)/IOKit.framework +IOKIT_INCDIR = $(IOKIT_INCFRAME)/Versions/$(IOKIT_INCVERS)/Headers +IOKIT_PINCDIR = $(IOKIT_INCFRAME)/Versions/$(IOKIT_INCVERS)/PrivateHeaders + +export INCDIR = $(IOKIT_INCDIR) +export LCLDIR = $(IOKIT_PINCDIR) include $(MakeInc_cmd) include $(MakeInc_def) @@ -18,20 +22,13 @@ INSTINC_SUBDIRS = \ rtc \ system_management -INSTINC_SUBDIRS_PPC = \ - ppc - -INSTINC_SUBDIRS_I386 = \ - i386 +INSTINC_SUBDIRS_I386 = -INSTINC_SUBDIRS_X86_64 = \ - i386 +INSTINC_SUBDIRS_X86_64 = -INSTINC_SUBDIRS_ARM = \ - arm +INSTINC_SUBDIRS_ARM = EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -EXPINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS_PPC} EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386} EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64} EXPINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS_ARM} @@ -41,7 +38,9 @@ NOT_EXPORT_HEADERS = NOT_KF_MI_HEADERS = $(NOT_EXPORT_HEADERS) \ IOKitKeysPrivate.h IOCPU.h \ IOHibernatePrivate.h IOPolledInterface.h \ - IOCommandQueue.h IOLocksPrivate.h + IOCommandQueue.h IOLocksPrivate.h \ + AppleKeyStoreInterface.h \ + IOStatistics.h IOStatisticsPrivate.h NOT_LOCAL_HEADERS = @@ -51,7 +50,7 @@ INSTALL_MI_LIST = IOBSD.h IOKitKeys.h IOKitServer.h IOReturn.h\ IOSharedLock.h IOTypes.h OSMessageNotification.h\ IODataQueueShared.h IOMessage.h -INSTALL_MI_LCL_LIST = IOKitKeysPrivate.h IOHibernatePrivate.h IOLocksPrivate.h +INSTALL_MI_LCL_LIST = 
IOKitKeysPrivate.h IOHibernatePrivate.h IOLocksPrivate.h IOStatistics.h AppleKeyStoreInterface.h INSTALL_MI_DIR = . diff --git a/iokit/IOKit/i386/IOSharedLockImp.h b/iokit/IOKit/i386/IOSharedLockImp.h deleted file mode 100644 index cb15fb1d8..000000000 --- a/iokit/IOKit/i386/IOSharedLockImp.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1998 Apple Computer, Inc. All rights reserved. - * - * HISTORY - * - */ - -/* Copyright (c) 1992 NeXT Computer, Inc. All rights reserved. - * - * EventShmemLock.h - Shared memory area locks for use between the - * WindowServer and the Event Driver. - * - * - * HISTORY - * 29 April 1992 Mike Paquette at NeXT - * Created. - * - * Multiprocessor locks used within the shared memory area between the - * kernel and event system. These must work in both user and kernel mode. - * The locks are defined in an include file so they get exported to the local - * include file area. - * - * This is basically a ripoff of the spin locks under the cthreads packages. - */ - -#ifndef _IOKIT_IOSHAREDLOCKIMP_H -#define _IOKIT_IOSHAREDLOCKIMP_H - -#include <architecture/i386/asm_help.h> - -#ifndef KERNEL -#error this file for kernel only; comm page has user versions -#endif - - TEXT - -/* - * void - * ev_unlock(p) - * int *p; - * - * Unlock the lock pointed to by p. - */ -LEAF(_ev_unlock, 0) -LEAF(_IOSpinUnlock, 0) -#if __x86_64__ - movl $0, (%rdi) -#else - movl 4(%esp), %ecx - movl $0, (%ecx) -#endif -END(_ev_unlock) - - -/* - * int - * ev_try_lock(p) - * int *p; - * - * Try to lock p. Return zero if not successful. - */ - -LEAF(_ev_try_lock, 0) -LEAF(_IOTrySpinLock, 0) -#if __x86_64__ - xorl %eax, %eax - orl $-1, %edx - lock - cmpxchgl %edx, (%rdi) - setz %dl - movzbl %dl, %eax -#else - movl 4(%esp), %ecx - xorl %eax, %eax - lock - cmpxchgl %ecx, (%ecx) - jne 1f - movl $1, %eax /* yes */ - ret -1: - xorl %eax, %eax /* no */ -#endif -END(_ev_try_lock) - - -#endif /* ! 
_IOKIT_IOSHAREDLOCKIMP_H */ diff --git a/iokit/IOKit/machine/Makefile b/iokit/IOKit/machine/Makefile index 4a77745b4..14dd46d76 100644 --- a/iokit/IOKit/machine/Makefile +++ b/iokit/IOKit/machine/Makefile @@ -14,12 +14,10 @@ MI_DIR = machine EXCLUDE_HEADERS = INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = INSTINC_SUBDIRS_I386 = INSTINC_SUBDIRS_X86_64 = EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -EXPINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS_PPC} EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386} EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64} diff --git a/iokit/IOKit/nvram/Makefile b/iokit/IOKit/nvram/Makefile index 3235dd242..2a3da6d3c 100644 --- a/iokit/IOKit/nvram/Makefile +++ b/iokit/IOKit/nvram/Makefile @@ -14,13 +14,11 @@ MI_DIR = nvram NOT_EXPORT_HEADERS = INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = INSTINC_SUBDIRS_I386 = INSTINC_SUBDIRS_X86_64 = INSTINC_SUBDIRS_ARM = EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -EXPINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS_PPC} EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386} EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64} EXPINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS_ARM} diff --git a/iokit/IOKit/platform/Makefile b/iokit/IOKit/platform/Makefile index 644b0b114..7d5079f87 100644 --- a/iokit/IOKit/platform/Makefile +++ b/iokit/IOKit/platform/Makefile @@ -15,13 +15,11 @@ NOT_EXPORT_HEADERS = NOT_KF_MI_HEADERS = INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = INSTINC_SUBDIRS_I386 = INSTINC_SUBDIRS_X86_64 = INSTINC_SUBDIRS_ARM = EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -EXPINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS_PPC} EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386} EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64} EXPINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS_ARM} diff --git a/iokit/IOKit/power/Makefile b/iokit/IOKit/power/Makefile index dcebcdb9b..fd1518bd7 100644 --- a/iokit/IOKit/power/Makefile +++ b/iokit/IOKit/power/Makefile @@ -14,13 +14,11 @@ MI_DIR = power NOT_EXPORT_HEADERS = INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = INSTINC_SUBDIRS_I386 = INSTINC_SUBDIRS_X86_64 = INSTINC_SUBDIRS_ARM = EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -EXPINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS_PPC} EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386} EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64} EXPINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS_ARM} diff --git a/iokit/IOKit/ppc/IODBDMA.h b/iokit/IOKit/ppc/IODBDMA.h deleted file mode 100644 index afe1337bb..000000000 --- a/iokit/IOKit/ppc/IODBDMA.h +++ /dev/null @@ -1,367 +0,0 @@ -/* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. 
- * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1997 Apple Computer, Inc. - * - * - * HISTORY - * - * Simon Douglas 10 Nov 97 - * - first checked in, mostly from MacOS DBDMA.i, machdep/ppc/dbdma.h - * but use byte reverse ops. - */ - -#ifndef _IODBDMA_H_ -#define _IODBDMA_H_ - -#include <IOKit/IOTypes.h> -#include <libkern/OSByteOrder.h> - - -/* DBDMA definitions */ - -struct IODBDMAChannelRegisters { - volatile unsigned long channelControl; - volatile unsigned long channelStatus; - volatile unsigned long commandPtrHi; /* implementation optional*/ - volatile unsigned long commandPtrLo; - volatile unsigned long interruptSelect; /* implementation optional*/ - volatile unsigned long branchSelect; /* implementation optional*/ - volatile unsigned long waitSelect; /* implementation optional*/ - volatile unsigned long transferModes; /* implementation optional*/ - volatile unsigned long data2PtrHi; /* implementation optional*/ - volatile unsigned long data2PtrLo; /* implementation optional*/ - - volatile unsigned long reserved1; - volatile unsigned long addressHi; /* implementation optional*/ - volatile unsigned long reserved2[4]; - volatile unsigned long unimplemented[16]; - -/* This structure must remain fully padded to 256 bytes.*/ - volatile unsigned long undefined[32]; -}; -typedef struct IODBDMAChannelRegisters IODBDMAChannelRegisters; - -/* These constants define the DB-DMA channel control words and status flags.*/ - -enum { - kdbdmaRun = 0x00008000, - kdbdmaPause = 0x00004000, - kdbdmaFlush = 0x00002000, - kdbdmaWake = 0x00001000, - kdbdmaDead = 0x00000800, - kdbdmaActive = 0x00000400, - kdbdmaBt = 0x00000100, - kdbdmaS7 = 0x00000080, - kdbdmaS6 = 0x00000040, - kdbdmaS5 = 0x00000020, - kdbdmaS4 = 0x00000010, - kdbdmaS3 = 0x00000008, - kdbdmaS2 = 0x00000004, - kdbdmaS1 = 0x00000002, - kdbdmaS0 = 0x00000001 -}; - - -#define IOSetDBDMAChannelControlBits(mask) ( ((mask) | (mask) << 16) ) -#define IOClearDBDMAChannelControlBits(mask) ( (mask) << 16) - - -/* This structure defines the DB-DMA channel command descriptor.*/ - -/* - *** WARNING: Endian-ness issues must be considered when performing load/store! 
*** -*/ - -struct IODBDMADescriptor { - unsigned long operation; /* cmd || key || i || b || w || reqCount*/ - unsigned long address; - volatile unsigned long cmdDep; - volatile unsigned long result; /* xferStatus || resCount*/ -}; -typedef struct IODBDMADescriptor IODBDMADescriptor; - -/* These constants define the DB-DMA channel command operations and modifiers.*/ - - -enum { -/* Command.cmd operations*/ - kdbdmaOutputMore = 0, - kdbdmaOutputLast = 1, - kdbdmaInputMore = 2, - kdbdmaInputLast = 3, - kdbdmaStoreQuad = 4, - kdbdmaLoadQuad = 5, - kdbdmaNop = 6, - kdbdmaStop = 7 -}; - - -enum { -/* Command.key modifiers (choose one for INPUT, OUTPUT, LOAD, and STORE)*/ - kdbdmaKeyStream0 = 0, /* default modifier*/ - kdbdmaKeyStream1 = 1, - kdbdmaKeyStream2 = 2, - kdbdmaKeyStream3 = 3, - kdbdmaKeyRegs = 5, - kdbdmaKeySystem = 6, - kdbdmaKeyDevice = 7, - - kdbdmaIntNever = 0, /* default modifier*/ - kdbdmaIntIfTrue = 1, - kdbdmaIntIfFalse = 2, - kdbdmaIntAlways = 3, - - kdbdmaBranchNever = 0, /* default modifier*/ - kdbdmaBranchIfTrue = 1, - kdbdmaBranchIfFalse = 2, - kdbdmaBranchAlways = 3, - - kdbdmaWaitNever = 0, /* default modifier*/ - kdbdmaWaitIfTrue = 1, - kdbdmaWaitIfFalse = 2, - kdbdmaWaitAlways = 3, - - kdbdmaCommandMask = (long)0xFFFF0000, - kdbdmaReqCountMask = 0x0000FFFF -}; - - -/* These constants define the DB-DMA channel command results.*/ - -enum { - /* result masks*/ - kdbdmaStatusRun = kdbdmaRun << 16, - kdbdmaStatusPause = kdbdmaPause << 16, - kdbdmaStatusFlush = kdbdmaFlush << 16, - kdbdmaStatusWake = kdbdmaWake << 16, - kdbdmaStatusDead = kdbdmaDead << 16, - kdbdmaStatusActive = kdbdmaActive << 16, - kdbdmaStatusBt = kdbdmaBt << 16, - kdbdmaStatusS7 = kdbdmaS7 << 16, - kdbdmaStatusS6 = kdbdmaS6 << 16, - kdbdmaStatusS5 = kdbdmaS5 << 16, - kdbdmaStatusS4 = kdbdmaS4 << 16, - kdbdmaStatusS3 = kdbdmaS3 << 16, - kdbdmaStatusS2 = kdbdmaS2 << 16, - kdbdmaStatusS1 = kdbdmaS1 << 16, - kdbdmaStatusS0 = kdbdmaS0 << 16, - kdbdmaResCountMask = 0x0000FFFF, - kdbdmaXferStatusMask = 0xFFFF0000 -}; - - -/* These macros are are IODBDMAChannelRegisters accessor functions. 
*/ - -#define IOSetDBDMAChannelRegister(registerSetPtr,field,value) \ -OSWriteSwapInt32(registerSetPtr,offsetof(IODBDMAChannelRegisters,field),value) - -#define IOGetDBDMAChannelRegister(registerSetPtr, field) \ -OSReadSwapInt32(registerSetPtr,offsetof(IODBDMAChannelRegisters, field)) - - -/* void IOSetDBDMAChannelControl (IODBDMAChannelRegisters *registerSetPtr, unsigned long ctlValue); */ - -#define IOSetDBDMAChannelControl(registerSetPtr,ctlValue) \ -do { \ - eieio(); \ - IOSetDBDMAChannelRegister(registerSetPtr,channelControl,ctlValue); \ - eieio(); \ -} while(0) - -/* unsigned long IOGetDBDMAChannelStatus (IODBDMAChannelRegisters *registerSetPtr); */ - -#define IOGetDBDMAChannelStatus(registerSetPtr) \ - IOGetDBDMAChannelRegister(registerSetPtr,channelStatus) - -/* unsigned long IOGetDBDMACommandPtr (IODBDMAChannelRegisters *registerSetPtr); */ - -#define IOGetDBDMACommandPtr(registerSetPtr) \ - IOGetDBDMAChannelRegister(registerSetPtr,commandPtrLo) - -/* void IOSetDBDMACommandPtr (IODBDMAChannelRegisters *registerSetPtr, unsigned long cclPtr); */ - -#define IOSetDBDMACommandPtr(registerSetPtr,cclPtr) \ -do { \ - IOSetDBDMAChannelRegister(registerSetPtr,commandPtrHi,0); \ - eieio(); \ - IOSetDBDMAChannelRegister(registerSetPtr,commandPtrLo,cclPtr); \ - eieio(); \ -} while(0) - - -/* unsigned long IOGetDBDMAInterruptSelect (IODBDMAChannelRegisters *registerSetPtr); */ - -#define IOGetDBDMAInterruptSelect(registerSetPtr) \ - IOGetDBDMAChannelRegister(registerSetPtr,interruptSelect) - -/* void IOSetDBDMAInterruptSelect (IODBDMAChannelRegisters *registerSetPtr, unsigned long intSelValue); */ - -#define IOSetDBDMAInterruptSelect(registerSetPtr,intSelValue) \ -do { \ - IOSetDBDMAChannelRegister(registerSetPtr,interruptSelect,intSelValue); \ - eieio(); \ -} while(0) - -/* unsigned long IOGetDBDMABranchSelect (IODBDMAChannelRegisters *registerSetPtr); */ - -#define IOGetDBDMABranchSelect(registerSetPtr) \ - IOGetDBDMAChannelRegister(registerSetPtr,branchSelect) - -/* void IOSetDBDMABranchSelect (IODBDMAChannelRegisters *registerSetPtr, unsigned long braSelValue); */ - -#define IOSetDBDMABranchSelect(registerSetPtr,braSelValue) \ -do { \ - IOSetDBDMAChannelRegister(registerSetPtr,branchSelect,braSelValue); \ - eieio(); \ -} while(0) - -/* unsigned long IOGetDBDMAWaitSelect (IODBDMAChannelRegisters *registerSetPtr); */ - -#define IOGetDBDMAWaitSelect(registerSetPtr) \ - IOGetDBDMAChannelRegister(registerSetPtr,waitSelect) - -/* void IOSetDBDMAWaitSelect (IODBDMAChannelRegisters *registerSetPtr, unsigned long waitSelValue); */ - -#define IOSetDBDMAWaitSelect(registerSetPtr,waitSelValue) \ -do { \ - IOSetDBDMAChannelRegister(registerSetPtr,waitSelect,waitSelValue); \ - eieio(); \ -} while(0) - - -/* These macros are IODBDMADescriptor accessor functions. 
*/ - -#define IOSetDBDMADescriptor(descPtr,field,value) \ -OSWriteSwapInt32( descPtr, offsetof( IODBDMADescriptor, field), value) - -#define IOGetDBDMADescriptor(descPtr,field) \ -OSReadSwapInt32( descPtr, offsetof( IODBDMADescriptor, field)) - -#define IOMakeDBDMAOperation(cmd,key,interrupt,branch,wait,count) \ - ( ((cmd) << 28) | ((key) << 24) | ((interrupt) << 20) \ - | ((branch) << 18) | ( (wait) << 16) | (count) ) - -/* void IOMakeDBDMADescriptor (IODBDMADescriptor *descPtr, - unsigned long cmd, - unsigned long key, - unsigned long interrupt, - unsigned long branch, - unsigned long wait, - unsigned long count, - unsigned long addr); */ - -#define IOMakeDBDMADescriptor(descPtr,cmd,key,interrupt,branch,wait,count,addr)\ -do { \ - IOSetDBDMADescriptor(descPtr, address, addr); \ - IOSetDBDMADescriptor(descPtr, cmdDep, 0); \ - IOSetDBDMADescriptor(descPtr, result, 0); \ - eieio(); \ - IOSetDBDMADescriptor(descPtr, operation, \ - IOMakeDBDMAOperation(cmd,key,interrupt,branch,wait,count)); \ - eieio(); \ -} while(0) - -/* void IOMakeDBDMADescriptorDep (IODBDMADescriptor *descPtr, - unsigned long cmd, - unsigned long key, - unsigned long interrupt, - unsigned long branch, - unsigned long wait, - unsigned long count, - unsigned long addr, - unsigned long dep); */ - -#define IOMakeDBDMADescriptorDep(descPtr,cmd,key,interrupt,branch,wait,count,addr,dep) \ -do { \ - IOSetDBDMADescriptor(descPtr, address, addr); \ - IOSetDBDMADescriptor(descPtr, cmdDep, dep); \ - IOSetDBDMADescriptor(descPtr, result, 0); \ - eieio(); \ - IOSetDBDMADescriptor(descPtr, operation, \ - IOMakeDBDMAOperation(cmd, key, interrupt, branch, wait, count)); \ - eieio(); \ -} while(0) - -/* Field accessors - NOTE: unsynchronized */ - -/* unsigned long IOGetDBDMAOperation (IODBDMADescriptor *descPtr) */ - -#define IOGetCCOperation(descPtr) \ - IOGetDBDMADescriptor(descPtr,operation) - -/* void IOSetCCOperation (IODBDMADescriptor *descPtr, unsigned long operationValue) */ - -#define IOSetCCOperation(descPtr,operationValue) \ - IOSetDBDMADescriptor(descPtr,operation,operationValue) - -/* unsigned long IOGetCCAddress (IODBDMADescriptor *descPtr) */ - -#define IOGetCCAddress(descPtr) \ - IOGetDBDMADescriptor(descPtr,address) - -/* void IOSetCCAddress (IODBDMADescriptor *descPtr, unsigned long addressValue) */ - -#define IOSetCCAddress(descPtr,addressValue) \ - IOSetDBDMADescriptor(descPtr,address, addressValue) - -/* unsigned long IOGetCCCmdDep (IODBDMADescriptor *descPtr) */ - -#define IOGetCCCmdDep(descPtr) \ - IOGetDBDMADescriptor(descPtr,cmdDep) - -/* void IOSetCCCmdDep (IODBDMADescriptor *descPtr, unsigned long cmdDepValue) */ - -#define IOSetCCCmdDep(descPtr,cmdDepValue) \ - IOSetDBDMADescriptor(descPtr,cmdDep,cmdDepValue) - -/* unsigned long IOGetCCResult (IODBDMADescriptor *descPtr) */ - -#define IOGetCCResult(descPtr) \ - IOGetDBDMADescriptor(descPtr,result) - -/* void IOSetCCResult (IODBDMADescriptor *descPtr, unsigned long resultValue) */ - -#define IOSetCCResult(descPtr,resultValue) \ - IOSetDBDMADescriptor(descPtr,result,resultValue) - - -/* DBDMA routines */ - -extern void IODBDMAStart( volatile IODBDMAChannelRegisters *registerSetPtr, volatile IODBDMADescriptor *physicalDescPtr); -extern void IODBDMAStop( volatile IODBDMAChannelRegisters *registerSetPtr); -extern void IODBDMAFlush( volatile IODBDMAChannelRegisters *registerSetPtr); -extern void IODBDMAReset( volatile IODBDMAChannelRegisters *registerSetPtr); -extern void IODBDMAContinue( volatile IODBDMAChannelRegisters *registerSetPtr); -extern void IODBDMAPause( 
volatile IODBDMAChannelRegisters *registerSetPtr); - -extern IOReturn IOAllocatePhysicallyContiguousMemory( unsigned int size, unsigned int options, - IOVirtualAddress * logical, IOPhysicalAddress * physical ); -extern IOReturn IOFreePhysicallyContiguousMemory( IOVirtualAddress * logical, unsigned int size); - -#endif /* !defined(_IODBDMA_H_) */ diff --git a/iokit/IOKit/ppc/IOSharedLockImp.h b/iokit/IOKit/ppc/IOSharedLockImp.h deleted file mode 100644 index 8c685b223..000000000 --- a/iokit/IOKit/ppc/IOSharedLockImp.h +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ -/* - * Copyright (c) 1998 Apple Computer, Inc. All rights reserved. - * - * HISTORY - * - */ - -/* Copyright (c) 1992 NeXT Computer, Inc. All rights reserved. - * - * EventShmemLock.h - Shared memory area locks for use between the - * WindowServer and the Event Driver. - * - * HISTORY - * 30 Nov 1992 Ben Fathi (benf@next.com) - * Ported to m98k. - * - * 29 April 1992 Mike Paquette at NeXT - * Created. - * - * Multiprocessor locks used within the shared memory area between the - * kernel and event system. These must work in both user and kernel mode. - * The locks are defined in an include file so they get exported to the local - * include file area. - */ - - -#ifndef _IOKIT_IOSHAREDLOCKIMP_H -#define _IOKIT_IOSHAREDLOCKIMP_H - -#include <architecture/ppc/asm_help.h> -#ifdef KERNEL -#undef END -#include <mach/ppc/asm.h> -#endif - -/* - * void - * ev_lock(p) - * register int *p; - * - * Lock the lock pointed to by p. Spin (possibly forever) until - * the lock is available. Test and test and set logic used. - */ - TEXT - -#ifndef KERNEL -LEAF(_ev_lock) - - li a6,1 // lock value - -8: lwz a7,0(a0) // Get lock word - mr. a7,a7 // Is it held? - bne-- 8b // Yup... - -9: lwarx a7,0,a0 // read the lock - mr. a7,a7 // Is it held? - bne-- 7f // yes, kill reservation - stwcx. a6,0,a0 // try to get the lock - bne-- 9b // failed, try again - isync - blr // got it, return - -7: li a7,-4 // Point to a spot in the red zone - stwcx. a7,a7,r1 // Kill reservation - b 8b // Go wait some more... - - -END(_ev_lock) - -LEAF(_IOSpinLock) - - li a6,1 // lock value - -8: lwz a7,0(a0) // Get lock word - mr. a7,a7 // Is it held? 
- bne-- 8b // Yup... - -9: lwarx a7,0,a0 // read the lock - mr. a7,a7 // Is it held? - bne-- 7f // yes, kill reservation - stwcx. a6,0,a0 // try to get the lock - bne-- 9b // failed, try again - isync - blr // got it, return - -7: li a7,-4 // Point to a spot in the red zone - stwcx. a7,a7,r1 // Kill reservation - b 8b // Go wait some more... -END(_IOSpinLock) -#endif - -/* - * void - * spin_unlock(p) - * int *p; - * - * Unlock the lock pointed to by p. - */ - -LEAF(_ev_unlock) - sync - li a7,0 - stw a7,0(a0) - blr -END(_ev_unlock) - -LEAF(_IOSpinUnlock) - sync - li a7,0 - stw a7,0(a0) - blr -END(_IOSpinUnlock) - - -/* - * ev_try_lock(p) - * int *p; - * - * Try to lock p. Return TRUE if successful in obtaining lock. - */ - -LEAF(_ev_try_lock) - li a6,1 // lock value - - lwz a7,0(a0) // Get lock word - mr. a7,a7 // Is it held? - bne-- 6f // Yup... - -9: lwarx a7,0,a0 // read the lock - mr. a7,a7 // Is it held? - bne-- 7f // yes, kill reservation - stwcx. a6,0,a0 // try to get the lock - bne-- 9b // failed, try again - li a0,1 // return TRUE - isync - blr // got it, return - -7: li a7,-4 // Point to a spot in the red zone - stwcx. a7,a7,r1 // Kill reservation - -6: - li a0,0 // return FALSE - blr - -END(_ev_try_lock) - -LEAF(_IOTrySpinLock) - li a6,1 // lock value - - lwz a7,0(a0) // Get lock word - mr. a7,a7 // Is it held? - bne-- 6f // Yup... - -9: lwarx a7,0,a0 // read the lock - mr. a7,a7 // Is it held? - bne-- 7f // yes, kill reservation - stwcx. a6,0,a0 // try to get the lock - bne-- 9b // failed, try again - li a0,1 // return TRUE - isync - blr // got it, return - -7: li a7,-4 // Point to a spot in the red zone - stwcx. a7,a7,r1 // Kill reservation - -6: - li a0,0 // return FALSE - blr - -END(_IOTrySpinLock) - -#endif /* ! _IOKIT_IOSHAREDLOCKIMP_H */ diff --git a/iokit/IOKit/ppc/Makefile b/iokit/IOKit/ppc/Makefile deleted file mode 100644 index 21ff86cad..000000000 --- a/iokit/IOKit/ppc/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd -export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def -export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule -export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - -IOKIT_FRAMEDIR = $(FRAMEDIR)/IOKit.framework/Versions/A -export INCDIR = $(IOKIT_FRAMEDIR)/Headers -export LCLDIR = $(IOKIT_FRAMEDIR)/PrivateHeaders - -include $(MakeInc_cmd) -include $(MakeInc_def) - -MD_DIR = ppc -NOT_EXPORT_HEADERS = IOSharedLockImp.h - -INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = - -EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -EXPINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS_PPC} - -ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h)) - -INSTALL_MD_LIST = IOSharedLockImp.h -INSTALL_MD_LCL_LIST = "" -INSTALL_MD_DIR = $(MD_DIR) - -EXPORT_MD_LIST = $(filter-out $(NOT_EXPORT_HEADERS), $(ALL_HEADERS)) -EXPORT_MD_DIR = IOKit/$(MD_DIR) - -include $(MakeInc_rule) -include $(MakeInc_dir) diff --git a/iokit/IOKit/pwr_mgt/IOPM.h b/iokit/IOKit/pwr_mgt/IOPM.h index 804b9bbfd..f0002d5d6 100644 --- a/iokit/IOKit/pwr_mgt/IOPM.h +++ b/iokit/IOKit/pwr_mgt/IOPM.h @@ -32,10 +32,6 @@ #include <IOKit/IOMessage.h> #include <IOKit/IOReturn.h> -#ifdef __ppc__ -#include <IOKit/pwr_mgt/IOPMDeprecated.h> -#endif - /*! @header IOPM.h @abstract Defines power management constants and keys used by both in-kernel and user space power management. @discussion IOPM.h defines a range of power management constants used in several in-kernel and user space APIs. Most significantly, the IOPMPowerFlags used to specify the fields of an IOPMPowerState struct are defined here. 
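For readers coming to this header cold: drivers consume the IOPMPowerFlags above by building an IOPMPowerState array and registering it with power management at start time. The sketch below shows the conventional two-state (off/on) table; the driver class name is hypothetical and the zeroed power/timing fields are placeholders, not tuned values.

    bool MyDriver::start(IOService *provider)
    {
        if (!IOService::start(provider)) {
            return false;
        }

        // Two states: index 0 = off, index 1 = device usable and powered on.
        // Field order: version, capabilityFlags, outputPowerCharacter,
        // inputPowerRequirement, then the static/unbudgeted power and
        // timing fields, all zeroed here for brevity.
        static IOPMPowerState powerStates[2] = {
            { kIOPMPowerStateVersion1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
            { kIOPMPowerStateVersion1, kIOPMDeviceUsable, kIOPMPowerOn,
              kIOPMPowerOn, 0, 0, 0, 0, 0, 0, 0, 0 }
        };

        PMinit();                   // initialize power management for this service
        provider->joinPMtree(this); // attach this driver into the power plane
        registerPowerDriver(this, powerStates, 2);
        return true;
    }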
@@ -80,7 +76,7 @@ enum { Useful only as a Capability. - @constant kIOPMSleepCapability + @constant kIOPMSleepCapability Used only by certain IOKit Families (USB). Not defined or used by generic Power Management. Read your family documentation to see if you should define a powerstate using these capabilities. @constant kIOPMRestartCapability @@ -91,6 +87,9 @@ enum { @constant kIOPMRestart Used only by certain IOKit Families (USB). Not defined or used by generic Power Management. Read your family documentation to see if you should define a powerstate using these capabilities. + + @constant kIOPMInitialDeviceState + Indicates the initial power state for the device. If <code>initialPowerStateForDomainState()</code> returns a power state with this flag set in the capability field, then the initial power change is performed without calling the driver's <code>setPowerState()</code>. */ typedef unsigned long IOPMPowerFlags; enum { @@ -101,7 +100,8 @@ enum { kIOPMSleepCapability = 0x00000004, kIOPMRestartCapability = 0x00000080, kIOPMSleep = 0x00000001, - kIOPMRestart = 0x00000080 + kIOPMRestart = 0x00000080, + kIOPMInitialDeviceState = 0x00000100 }; /* @@ -121,7 +121,6 @@ enum { kIOPMNotPowerManaged = 0x0800 }; - /* * Deprecated IOPMPowerFlags * Their behavior is undefined when used in IOPMPowerState @@ -221,7 +220,7 @@ enum { * * See IOPMrootDomain notification kIOPMMessageSleepWakeUUIDChange */ -#define kIOPMSleepWakeUUIDKey "SleepWakeUUID" + #define kIOPMSleepWakeUUIDKey "SleepWakeUUID" /* kIOPMDeepSleepEnabledKey * Indicates the Deep Sleep enable state. @@ -239,11 +238,14 @@ enum { */ #define kIOPMDeepSleepDelayKey "Standby Delay" -/* kIOPMLowBatteryWakeThresholdKey - * Key refers to a CFNumberRef that represents the percentage of battery - * remaining charge that will trigger a system wake followed by Deep Sleep. +/* kIOPMDestroyFVKeyOnStandbyKey + * Specifies if FileVault key can be stored when going to standby mode + * It has a boolean value, + * true == Destroy FV key when going to standby mode + * false == Retain FV key when going to standby mode + * not present == Retain FV key when going to standby mode */ -#define kIOPMLowBatteryWakeThresholdKey "LowBatteryWakeThreshold" +#define kIOPMDestroyFVKeyOnStandbyKey "DestroyFVKeyOnStandby" /******************************************************************************* * @@ -276,8 +278,16 @@ enum { */ kIOPMDriverAssertionExternalMediaMountedBit = 0x10, + /*! kIOPMDriverAssertionReservedBit5 + * Reserved for Thunderbolt. + */ kIOPMDriverAssertionReservedBit5 = 0x20, - kIOPMDriverAssertionReservedBit6 = 0x40, + + /*! kIOPMDriverAssertionPreventDisplaySleepBit + * When set, the display should remain powered on while the system's awake. 
+ */ + kIOPMDriverAssertionPreventDisplaySleepBit = 0x40, + kIOPMDriverAssertionReservedBit7 = 0x80 }; @@ -406,6 +416,7 @@ enum { * These commands are issued from system drivers only: * ApplePMU, AppleSMU, IOGraphics, AppleACPIFamily * + * TODO: deprecate kIOPMAllowSleep and kIOPMPreventSleep ******************************************************************************/ enum { kIOPMSleepNow = (1<<0), // put machine to sleep now @@ -500,6 +511,8 @@ enum { #define kIOPMPSCapacityEstimatedKey "CapacityEstimated" #define kIOPMPSBatteryChargeStatusKey "ChargeStatus" #define kIOPMPSBatteryTemperatureKey "Temperature" +#define kIOPMPSAdapterDetailsKey "AdapterDetails" +#define kIOPMPSChargerConfigurationKey "ChargerConfiguration" // kIOPMPSBatteryChargeStatusKey may have one of the following values, or may have // no value. If kIOPMBatteryChargeStatusKey has a NULL value (or no value) associated with it @@ -507,6 +520,7 @@ enum { // then the charge may have been interrupted. #define kIOPMBatteryChargeStatusTooHot "HighTemperature" #define kIOPMBatteryChargeStatusTooCold "LowTemperature" +#define kIOPMBatteryChargeStatusTooHotOrCold "HighOrLowTemperature" #define kIOPMBatteryChargeStatusGradient "BatteryTemperatureGradient" // Definitions for battery location, in case of multiple batteries. @@ -526,6 +540,16 @@ enum { kIOPMGoodValue = 3 }; +// Keys for kIOPMPSAdapterDetailsKey dictionary +#define kIOPMPSAdapterDetailsIDKey "AdapterID" +#define kIOPMPSAdapterDetailsWattsKey "Watts" +#define kIOPMPSAdapterDetailsRevisionKey "AdapterRevision" +#define kIOPMPSAdapterDetailsSerialNumberKey "SerialNumber" +#define kIOPMPSAdapterDetailsFamilyKey "FamilyCode" +#define kIOPMPSAdapterDetailsAmperageKey "Amperage" +#define kIOPMPSAdapterDetailsDescriptionKey "Description" +#define kIOPMPSAdapterDetailsPMUConfigurationKey "PMUConfiguration" + // Battery's time remaining estimate is invalid this long (seconds) after a wake #define kIOPMPSInvalidWakeSecondsKey "BatteryInvalidWakeSeconds" @@ -688,7 +712,6 @@ enum { kIOBatteryChargerConnect = (1 << 0) }; - // Private power management message indicating battery data has changed // Indicates new data resides in the IORegistry #define kIOPMMessageBatteryStatusHasChanged iokit_family_msg(sub_iokit_pmu, 0x100) @@ -714,7 +737,6 @@ enum { kIOPMClamshellStateOnWake = (1<<10) // used only by Platform Expert }; - // ********************************************** // Internal power management data structures // ********************************************** @@ -731,7 +753,7 @@ enum { kIOPMSuperclassPolicy1 }; -struct stateChangeNote{ +struct stateChangeNote { IOPMPowerFlags stateFlags; unsigned long stateNum; void * powerRef; @@ -748,5 +770,54 @@ typedef struct IOPowerStateChangeNotification IOPowerStateChangeNotification; typedef IOPowerStateChangeNotification sleepWakeNote; #endif /* KERNEL && __cplusplus */ -#endif /* ! _IOKIT_IOPM_H */ +/*! @struct IOPMSystemCapabilityChangeParameters + @abstract A structure describing a system capability change. + @discussion A system capability change is a system level transition from a set + of system capabilities to a new set of system capabilities. Power management + sends a <code>kIOMessageSystemCapabilityChange</code> message and provides + this structure as the message data (by reference) to + <code>gIOPriorityPowerStateInterest</code> clients when system capability + changes. + @field notifyRef An identifier for this message notification. 
Clients with pending + I/O can signal completion by calling <code>allowPowerChange()</code> with this + value as the argument. Clients that are able to process the notification + synchronously should ignore this field. + @field maxWaitForReply A return value to the caller indicating the maximum time in + microseconds to wait for the <code>allowPowerChange()</code> call. The default + value is zero, which indicates the client processing has finished, and power + management should not wait for an <code>allowPowerChange()</code> call. + @field changeFlags Flags will be set to indicate whether the notification precedes + the capability change (<code>kIOPMSystemCapabilityWillChange</code>), or after + the capability change has occurred (<code>kIOPMSystemCapabilityDidChange</code>). + @field __reserved1 Set to zero. + @field fromCapabilities The system capabilities at the start of the transition. + @field toCapabilities The system capabilities at the end of the transition. + @field __reserved2 Set to zero. + */ +struct IOPMSystemCapabilityChangeParameters { + uint32_t notifyRef; + uint32_t maxWaitForReply; + uint32_t changeFlags; + uint32_t __reserved1; + uint32_t fromCapabilities; + uint32_t toCapabilities; + uint32_t __reserved2[4]; +}; + +/*! @enum IOPMSystemCapabilityChangeFlags + @constant kIOPMSystemCapabilityWillChange Indicates the system capability will change. + @constant kIOPMSystemCapabilityDidChange Indicates the system capability has changed. +*/ +enum { + kIOPMSystemCapabilityWillChange = 0x01, + kIOPMSystemCapabilityDidChange = 0x02 +}; +enum { + kIOPMSystemCapabilityCPU = 0x01, + kIOPMSystemCapabilityGraphics = 0x02, + kIOPMSystemCapabilityAudio = 0x04, + kIOPMSystemCapabilityNetwork = 0x08 +}; + +#endif /* ! _IOKIT_IOPM_H */ diff --git a/iokit/IOKit/pwr_mgt/IOPMDeprecated.h b/iokit/IOKit/pwr_mgt/IOPMDeprecated.h deleted file mode 100644 index 3bee01a3b..000000000 --- a/iokit/IOKit/pwr_mgt/IOPMDeprecated.h +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright (c) 1998-2005 Apple Computer, Inc. All rights reserved. - * - * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * - * This file contains Original Code and/or Modifications of Original Code - * as defined in and that are subject to the Apple Public Source License - * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. The rights granted to you under the License - * may not be used to create, or enable the creation or redistribution of, - * unlawful or unlicensed copies of an Apple operating system, or to - * circumvent, violate, or enable the circumvention or violation of, any - * terms of an Apple operating system software license agreement. - * - * Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this file. - * - * The Original Code and all software distributed under the License are - * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER - * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, - * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. - * Please see the License for the specific language governing rights and - * limitations under the License. 
- * - * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ - */ - -#ifndef _IOPMDeprecated_h_ -#define _IOPMDeprecated_h_ - -#ifdef __ppc__ - -// Power events -enum { - kClamshellClosedEventMask = (1<<0), // User closed lid - kDockingBarEventMask = (1<<1), // OBSOLETE - kACPlugEventMask = (1<<2), // User plugged or unplugged adapter - kFrontPanelButtonEventMask = (1<<3), // User hit the front panel button - kBatteryStatusEventMask = (1<<4) // Battery status has changed -}; - -// PUBLIC power management features -// NOTE: this is a direct port from classic, some of these bits -// are obsolete but are included for completeness -enum { - kPMHasWakeupTimerMask = (1<<0), // 1=wake timer is supported - kPMHasSharedModemPortMask = (1<<1), // Not used - kPMHasProcessorCyclingMask = (1<<2), // 1=processor cycling supported - kPMMustProcessorCycleMask = (1<<3), // Not used - kPMHasReducedSpeedMask = (1<<4), // 1=supports reduced processor speed - kPMDynamicSpeedChangeMask = (1<<5), // 1=supports changing processor speed on the fly - kPMHasSCSIDiskModeMask = (1<<6), // 1=supports using machine as SCSI drive - kPMCanGetBatteryTimeMask = (1<<7), // 1=battery time can be calculated - kPMCanWakeupOnRingMask = (1<<8), // 1=machine can wake on modem ring - kPMHasDimmingSupportMask = (1<<9), // 1=has monitor dimming support - kPMHasStartupTimerMask = (1<<10), // 1=can program startup timer - kPMHasChargeNotificationMask = (1<<11), // 1=client can determine charger status/get notifications - kPMHasDimSuspendSupportMask = (1<<12), // 1=can dim diplay to DPMS ('off') state - kPMHasWakeOnNetActivityMask = (1<<13), // 1=supports waking upon receipt of net packet - kPMHasWakeOnLidMask = (1<<14), // 1=can wake upon lid/case opening - kPMCanPowerOffPCIBusMask = (1<<15), // 1=can remove power from PCI bus on sleep - kPMHasDeepSleepMask = (1<<16), // 1=supports deep (hibernation) sleep - kPMHasSleepMask = (1<<17), // 1=machine support low power sleep (ala powerbooks) - kPMSupportsServerModeAPIMask = (1<<18), // 1=supports reboot on AC resume for unexpected power loss - kPMHasUPSIntegrationMask = (1<<19) // 1=supports incorporating UPS devices into power source calcs -}; - -// PRIVATE power management features -// NOTE: this is a direct port from classic, some of these bits -// are obsolete but are included for completeness. 
-enum { - kPMHasExtdBattInfoMask = (1<<0), // Not used - kPMHasBatteryIDMask = (1<<1), // Not used - kPMCanSwitchPowerMask = (1<<2), // Not used - kPMHasCelsiusCyclingMask = (1<<3), // Not used - kPMHasBatteryPredictionMask = (1<<4), // Not used - kPMHasPowerLevelsMask = (1<<5), // Not used - kPMHasSleepCPUSpeedMask = (1<<6), // Not used - kPMHasBtnIntHandlersMask = (1<<7), // 1=supports individual button interrupt handlers - kPMHasSCSITermPowerMask = (1<<8), // 1=supports SCSI termination power switch - kPMHasADBButtonHandlersMask = (1<<9), // 1=supports button handlers via ADB - kPMHasICTControlMask = (1<<10), // 1=supports ICT control - kPMHasLegacyDesktopSleepMask = (1<<11), // 1=supports 'doze' style sleep - kPMHasDeepIdleMask = (1<<12), // 1=supports Idle2 in hardware - kPMOpenLidPreventsSleepMask = (1<<13), // 1=open case prevent machine from sleeping - kPMClosedLidCausesSleepMask = (1<<14), // 1=case closed (clamshell closed) causes sleep - kPMHasFanControlMask = (1<<15), // 1=machine has software-programmable fan/thermostat controls - kPMHasThermalControlMask = (1<<16), // 1=machine supports thermal monitoring - kPMHasVStepSpeedChangeMask = (1<<17), // 1=machine supports processor voltage/clock change - kPMEnvironEventsPolledMask = (1<<18) // 1=machine doesn't generate pmu env ints, we must poll instead -}; - -// DEFAULT public and private features for machines whose device tree -// does NOT contain this information (pre-Core99). - -// For Cuda-based Desktops - -#define kStdDesktopPMFeatures kPMHasWakeupTimerMask |\ - kPMHasProcessorCyclingMask |\ - kPMHasDimmingSupportMask |\ - kPMHasStartupTimerMask |\ - kPMSupportsServerModeAPIMask |\ - kPMHasUPSIntegrationMask - -#define kStdDesktopPrivPMFeatures kPMHasExtdBattInfoMask |\ - kPMHasICTControlMask |\ - kPMHasLegacyDesktopSleepMask - -#define kStdDesktopNumBatteries 0 - -// For Wallstreet (PowerBook G3 Series 1998) - -#define kWallstreetPMFeatures kPMHasWakeupTimerMask |\ - kPMHasProcessorCyclingMask |\ - kPMHasReducedSpeedMask |\ - kPMDynamicSpeedChangeMask |\ - kPMHasSCSIDiskModeMask |\ - kPMCanGetBatteryTimeMask |\ - kPMHasDimmingSupportMask |\ - kPMHasChargeNotificationMask |\ - kPMHasDimSuspendSupportMask |\ - kPMHasSleepMask - -#define kWallstreetPrivPMFeatures kPMHasExtdBattInfoMask |\ - kPMHasBatteryIDMask |\ - kPMCanSwitchPowerMask |\ - kPMHasADBButtonHandlersMask |\ - kPMHasSCSITermPowerMask |\ - kPMHasICTControlMask |\ - kPMClosedLidCausesSleepMask |\ - kPMEnvironEventsPolledMask - -#define kStdPowerBookPMFeatures kWallstreetPMFeatures -#define kStdPowerBookPrivPMFeatures kWallstreetPrivPMFeatures - -#define kStdPowerBookNumBatteries 2 - -// For 101 (PowerBook G3 Series 1999) - -#define k101PMFeatures kPMHasWakeupTimerMask |\ - kPMHasProcessorCyclingMask |\ - kPMHasReducedSpeedMask |\ - kPMDynamicSpeedChangeMask |\ - kPMHasSCSIDiskModeMask |\ - kPMCanGetBatteryTimeMask |\ - kPMHasDimmingSupportMask |\ - kPMHasChargeNotificationMask |\ - kPMHasDimSuspendSupportMask |\ - kPMHasSleepMask |\ - kPMHasUPSIntegrationMask - -#define k101PrivPMFeatures kPMHasExtdBattInfoMask |\ - kPMHasBatteryIDMask |\ - kPMCanSwitchPowerMask |\ - kPMHasADBButtonHandlersMask |\ - kPMHasSCSITermPowerMask |\ - kPMHasICTControlMask |\ - kPMClosedLidCausesSleepMask |\ - kPMEnvironEventsPolledMask - - -// These flags are deprecated. 
Use the version with the kIOPM prefix in IOPM.h
-enum {
- kACInstalled = (1<<0),
- kBatteryCharging = (1<<1),
- kBatteryInstalled = (1<<2),
- kUPSInstalled = (1<<3),
- kBatteryAtWarn = (1<<4),
- kBatteryDepleted = (1<<5),
- kACnoChargeCapability = (1<<6), // AC adapter cannot charge battery
- kRawLowBattery = (1<<7), // used only by Platform Expert
- kForceLowSpeed = (1<<8) // set by Platfm Expert, chk'd by Pwr Plugin};
-};
-
-#endif /* __ppc32 */
-#endif /* _IOPMDeprecated_h_ */
diff --git a/iokit/IOKit/pwr_mgt/IOPMPrivate.h b/iokit/IOKit/pwr_mgt/IOPMPrivate.h
index 88ff6c788..3e61d81e0 100644
--- a/iokit/IOKit/pwr_mgt/IOPMPrivate.h
+++ b/iokit/IOKit/pwr_mgt/IOPMPrivate.h
@@ -30,8 +30,197 @@
 #include <IOKit/pwr_mgt/IOPM.h>
-/*****************************************************************************/
+#pragma mark PM Timeline Logging
+/**************************************************
+*
+* Timeline API Keys - Reports timing details for
+* applications, drivers, and system during PM activity
+*
+* For kernel-internal use only
+**************************************************/
+
+// Keys for interfacing with IOPMrootDomain Timeline
+/* @constant kIOPMTimelineDictionaryKey
+ * @abstract RootDomain key for dictionary describing Timeline's info
+ */
+#define kIOPMTimelineDictionaryKey "PMTimelineLogging"
+
+/* @constant kIOPMTimelineEnabledKey
+ * @abstract Boolean value indicating whether the system is recording PM events.
+ * @discussion Key may be found in the dictionary at IOPMrootDomain's property
+ * kIOPMTimelineDictionaryKey. uint32_t value; may be 0.
+ */
+#define kIOPMTimelineEnabledKey "TimelineEnabled"
+
+/* @constant kIOPMTimelineSystemNumberTrackedKey
+ * @abstract The maximum number of system power events the system may record.
+ * @discussion Key may be found in the dictionary at IOPMrootDomain's property
+ * kIOPMTimelineDictionaryKey. uint32_t value; may be 0.
+ */
+#define kIOPMTimelineSystemNumberTrackedKey "TimelineSystemEventsTracked"
+
+/* @constant kIOPMTimelineSystemBufferSizeKey
+ * @abstract Size in bytes of buffer recording system PM events
+ * @discussion Key may be found in the dictionary at IOPMrootDomain's property
+ * kIOPMTimelineDictionaryKey. uint32_t value; may be 0.
+ */
+#define kIOPMTimelineSystemBufferSizeKey "TimelineSystemBufferSize"
+
+
+
+/* @constant kIOPMEventTypeIntermediateFlag
+ * @abstract This bit indicates the event is an intermediate event
+ * which must occur within a major system power event.
+ */
+#define kIOPMEventTypeIntermediateFlag 0x10000000
+
+/* @enum SystemEventTypes
+ * @abstract Potential system events logged in the system event record.
+ */
+enum {
+ kIOPMEventTypeUndefined = 0,
+
+ /* Event types mark driver events
+ */
+ kIOPMEventTypeSetPowerStateImmediate = 1001,
+ kIOPMEventTypeSetPowerStateDelayed = 1002,
+ kIOPMEventTypePSWillChangeTo = 1003,
+ kIOPMEventTypePSDidChangeTo = 1004,
+ kIOPMEventTypeAppResponse = 1005,
+
+
+ /* Start and stop event types bracket major
+ * system power management events.
+ */
+ kIOPMEventTypeSleep = 2001,
+ kIOPMEventTypeSleepDone = 2002,
+ kIOPMEventTypeWake = 3001,
+ kIOPMEventTypeWakeDone = 3002,
+ kIOPMEventTypeDoze = 4001,
+ kIOPMEventTypeDozeDone = 4002,
+ kIOPMEventTypeLiteWakeUp = 5001,
+ kIOPMEventTypeLiteWakeUpDone = 5002,
+ kIOPMEventTypeLiteWakeDown = 5003,
+ kIOPMEventTypeLiteWakeDownDone = 5004,
+ kIOPMEventTypeUUIDSet = 6001,
+ kIOPMEventTypeUUIDClear = 6002,
+
+ /* Intermediate events that may only occur within the bounds
+ * of a major system event (between the event's initiation and its "done event".)
+ * e.g. chronologically kIOPMEventTypeSleep may be followed by one or more
+ * intermediate events, which then must be followed by kIOPMEventTypeSleepDone.
+ *
+ * The intermediate events below will always occur in a Sleep or Wake event, and may
+ * or may not occur for any of the other events.
+ */
+ kIOPMEventTypeAppNotificationsFinished = 501 | kIOPMEventTypeIntermediateFlag,
+ kIOPMEventTypeDriverNotificationsFinished = 502 | kIOPMEventTypeIntermediateFlag,
+ kIOPMEventTypeCalTimeChange = 503 | kIOPMEventTypeIntermediateFlag
+};
+
+
+/* @enum SystemSleepReasons
+ * @abstract The potential causes for system sleep as logged in the system event record.
+ */
+enum {
+ kIOPMSleepReasonClamshell = 101,
+ kIOPMSleepReasonPowerButton = 102,
+ kIOPMSleepReasonSoftware = 103,
+ kIOPMSleepReasonOSSwitchHibernate = 104,
+ kIOPMSleepReasonIdle = 105,
+ kIOPMSleepReasonLowPower = 106,
+ kIOPMSleepReasonThermalEmergency = 107,
+ kIOPMSleepReasonMaintenance = 108
+};
+
+/*
+ * Possible C-string sleep reasons found under kRootDomainSleepReasonsKey
+ */
+#define kIOPMClamshellSleepKey "Clamshell Sleep"
+#define kIOPMPowerButtonSleepKey "Power Button Sleep"
+#define kIOPMSoftwareSleepKey "Software Sleep"
+#define kIOPMOSSwitchHibernationKey "OS Switch Sleep"
+#define kIOPMIdleSleepKey "Idle Sleep"
+#define kIOPMLowPowerSleepKey "Low Power Sleep"
+#define kIOPMThermalEmergencySleepKey "Thermal Emergency Sleep"
+
+
+enum {
+ kIOPMMaxSystemEventsTracked = 25000,
+ kIOPMDefaultSystemEventsTracked = 1000,
+ kMaxPMStringLength = 40,
+};
+
+/* @struct IOPMSystemEventRecord
+ * @abstract Records a single power event to a particular PM entity.
+ * This includes changes to a driver's power state, application responses
+ * to PM notifications, or system power management milestones.
+ */
+typedef struct {
+ union {
+ // For DRIVER events
+ char ownerName[kMaxPMStringLength];
+ // For SYSTEM events, uuid contains the string describing the active UUID
+ char uuid[kMaxPMStringLength];
+ };
+
+ // For DRIVER events - records the name of the driver who generated the notifications.
+ char interestName[kMaxPMStringLength];
+
+ // DRIVER & SYSTEM - Times are stored as uint64_t
+ // The high 32 bits are the seconds returned from clock_get_calendar_microtime,
+ // and the low 32 bits are the accompanying microseconds.
+ uint64_t timestamp;
+
+ union {
+ // For DRIVER events - ownerDisambiguateID is a unique descriptor of the driver, to disambiguate
+ // several similarly named drivers.
+ uint64_t ownerDisambiguateID;
+ // For SYSTEM events - eventReason is a value in SystemSleepReason
+ uint64_t eventReason;
+ };
+
+ // DRIVER & SYSTEM - eventType is one of 'SystemEventTypes'
+ // The value of eventType determines, among other things, whether this is a SYSTEM or
+ // DRIVER event type.
+ uint32_t eventType; + + // DRIVER & SYSTEM - eventResult is an IOReturn value + uint32_t eventResult; + + // DRIVER - If defined, elapsedTimeUS records the entire time a transaction took to complete + uint32_t elapsedTimeUS; + + // DRIVER - in power state changes, oldState & newState are PM power state indices. + uint8_t oldState; + uint8_t newState; +} IOPMSystemEventRecord; + +/* @struct IOPMTraceBufferHeader + * Occupies the first bytes in the buffer allocated by IOPMrootDomain + * Describes the size and current index of the trace buffer + */ +typedef struct { + uint32_t sizeBytes; + uint32_t sizeEntries; + uint32_t index; +} IOPMTraceBufferHeader; + +/* Argument to IOPMrootDomain::clientMemoryForType to acquire + * memory mapping. + */ +enum { + kPMRootDomainMapTraceBuffer = 1 +}; +/************************************************** +* +* Accountability API Ends here +* +**************************************************/ + + +#pragma mark Stray Bitfields // Private power commands issued to root domain // bits 0-7 in IOPM.h @@ -143,10 +332,10 @@ typedef struct { /* PM RootDomain tracePoints * * In the sleep/wake process, we expect the sleep trace points to proceed - * in increasing order. Once sleep begins with code kIOPMTracePointSleepStarted = 0x11, + * in increasing order. Once sleep begins with code kIOPMTracePointSleepStarted, * we expect sleep to continue in a monotonically increasing order of tracepoints - * to kIOPMTracePointSystemLoginwindowPhase = 0x30. After trace point SystemLoginWindowPhase, - * the system will return to kIOPMTracePointSystemUp = 0x00. + * to kIOPMTracePointSystemLoginwindowPhase. After trace point SystemLoginWindowPhase, + * the system will return to kIOPMTracePointSystemUp. * * If the trace point decreases (instead of increasing) before reaching kIOPMTracePointSystemUp, * that indicates that the sleep process was cancelled. The cancel reason shall be indicated @@ -155,94 +344,215 @@ typedef struct { enum { /* When kTracePointSystemUp is the latest tracePoint, - the system is awake. It is not asleep, sleeping, or waking. - - * Phase begins: At boot, at completion of wake from sleep, - immediately following kIOPMTracePointSystemLoginwindowPhase. - * Phase ends: When a sleep attempt is initiated. + * the system is awake. It is not asleep, sleeping, or waking. + * + * Phase begins: At boot, at completion of wake from sleep, + * immediately following kIOPMTracePointSystemLoginwindowPhase. + * Phase ends: When a sleep attempt is initiated. */ kIOPMTracePointSystemUp = 0, -/* When kIOPMTracePointSleepStarted we have just initiated sleep. +/* When kIOPMTracePointSleepStarted is the latest tracePoint, + * sleep has been initiated. + * + * Phase begins: At initiation of system sleep (idle or forced). + * Phase ends: PM starts to notify applications of system sleep. + */ + kIOPMTracePointSleepStarted = 0x10, - Note: The state prior to kIOPMTracePointSleepStarted may be only one of: - * kIOPMTracePointSystemUp - * kIOPMTracePointSystemLoginwindowPhase or +/* When kIOPMTracePointSleepApplications is the latest tracePoint, + * a system sleep has been initiated and PM waits for responses + * from notified applications. + * + * Phase begins: Begin to asynchronously fire kIOMessageSystemWillSleep + * notifications, and also kIOMessageCanSystemSleep for the idle sleep case. + * Phase ends: When PM has received all application responses. + */ + kIOPMTracePointSleepApplications = 0x11, - * Phase begins: At initiation of system sleep (idle or forced). 
- * Phase ends: As we start to notify applications of system sleep. +/* When kIOPMTracePointSleepPriorityClients is the latest tracePoint, + * PM is notifying priority clients and in-kernel system capability + * clients, and waiting for any asynchronous completions. + * + * Phase begins: Synchronous delivery of kIOMessageSystemWillSleep notifications. + * Phase ends: All notified clients have acknowledged. + */ + kIOPMTracePointSleepPriorityClients = 0x12, + +/* When kIOPMTracePointSleepWillChangeInterests is the latest tracePoint, + * PM is calling powerStateWillChangeTo() on interested drivers of root domain. + * + * Phase begins: Dispatch a callout thread to call interested drivers. + * Phase ends: Callout thread work done, and acknowledgePowerChange() called + * by drivers that indicated asynchronous completion. */ - kIOPMTracePointSleepStarted = 0x11, + kIOPMTracePointSleepWillChangeInterests = 0x13, -/* When kTracePointSystemSleepAppsPhase is the latest tracePoint, - a system sleep has been irrevocably inititated and PM waits - for responses from notified applications. +/* When kIOPMTracePointSleepPowerPlaneDrivers is the latest tracePoint, + * PM is directing power plane drivers to power off in leaf-to-root order. + * + * Phase begins: Root domain informs its power children that it will drop to + * sleep state. This has a cascade effect and triggers all drivers in + * the power plane to transition to a lower power state if necessary. + * Phase ends: All power transitions in response to the root domain power + * change have completed. + */ + kIOPMTracePointSleepPowerPlaneDrivers = 0x14, + +/* When kIOPMTracePointSleepDidChangeInterests is the latest tracePoint, + * PM is calling powerStateDidChangeTo() on interested drivers of root domain. + * + * Phase begins: Dispatch a callout thread to call interested drivers. + * Phase ends: Callout thread work done, and acknowledgePowerChange() called + * by drivers that indicated asynchronous completion. + */ + kIOPMTracePointSleepDidChangeInterests = 0x15, - * Phase begins: Begin to asynchronously fire kIOMessageSystemWillSleep notifications, - * and in the case of an idle sleep kIOMessageCanSystemSleep as well. - * Phase ends: When we have received all user & interested kernel acknowledgements. +/* When kIOPMTracePointSleepCapabilityClients is the latest tracePoint, + * PM is notifying system capability clients about system sleep. + * + * Phase begins: Send kIOMessageSystemCapabilityChange notifications to inform + * capability clients that system has lost all capabilities. + * Phase ends: Finished sending notifications. */ - kIOPMTracePointSystemSleepAppsPhase = 0x12, + kIOPMTracePointSleepCapabilityClients = 0x16, +/* When kIOPMTracePointSleepPlatformActions is the latest tracePoint, + * PM is calling drivers that have registered a platform sleep action. + */ + kIOPMTracePointSleepPlatformActions = 0x17, -/* When kIOPMTracePointSystemHibernatePhase is the latest tracePoint, - PM is writing the hiernate image to disk. +/* When kIOPMTracePointSleepCPUs is the latest tracePoint, + * PM is shutting down all non-boot processors. + * + * Phase begins: Shutdown all non-boot processors. + * Phase ends: Reduced to only the boot processor running. */ - kIOPMTracePointSystemHibernatePhase = 0x13, + kIOPMTracePointSleepCPUs = 0x18, -/* When kTracePointSystemSleepDriversPhase is the latest tracePoint, - PM is iterating the driver tree powering off devices individually. 
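The monotonic-ordering rule stated above (trace point values only increase until the cycle returns to kIOPMTracePointSystemUp) can be made concrete with a small checker. This is a hypothetical helper, not part of the patch, written against the enum values defined in this section.

    #include <stddef.h>
    #include <stdint.h>

    // Classify a recorded sequence of trace point values. Reaching
    // kIOPMTracePointSystemUp (0) again means the sleep/wake cycle completed;
    // any earlier decrease means the sleep attempt was cancelled.
    static bool sleepCycleWasCancelled(const uint8_t *points, size_t count)
    {
        for (size_t i = 1; i < count; i++) {
            if (points[i] == 0) {           // kIOPMTracePointSystemUp
                return false;               // cycle completed normally
            }
            if (points[i] < points[i - 1]) {
                return true;                // value went backwards: cancelled
            }
        }
        return false;                       // cycle still in progress
    }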
+/* When kIOPMTracePointSleepPlatformDriver is the latest tracePoint, + * PM is executing platform dependent code to prepare for system sleep. + */ + kIOPMTracePointSleepPlatformDriver = 0x19, - * Phase begins: When IOPMrootDomain has received all of its power acknowledgements and begins - * executing IOService::powerDomainWillChangeTo() - * Phase ends: When IOPMrootDomain::powerChangeDone begins executing CPU shutoff code. +/* When kIOPMTracePointHibernate is the latest tracePoint, + * PM is writing the hibernate image to disk. */ - kIOPMTracePointSystemSleepDriversPhase = 0x14, + kIOPMTracePointHibernate = 0x1a, -/* When kTracePointSystemSleepPlatformPhase is the latest tracePoint, - all apps and drivers have notified of sleep. Plotfarm is powering - off CPU; or system is asleep; or low level wakeup is underway. +/* When kIOPMTracePointSystemSleep is the latest tracePoint, + * PM has recorded the final trace point before the hardware platform + * enters sleep state, or low level wakeup is underway - such as restoring + * the hibernate image from disk. + * + * Note: If a system is asleep and then loses power, and it does not have a + * hibernate image to restore from (e.g. hibernatemode = 0), then OS X will + * interpret this power loss as a failure in kIOPMTracePointSystemSleep. + * + * Phase begins: Before the OS directs the hardware to enter sleep state. + * Phase ends: Control returns to the OS on wake, but before recording the first + * wake trace point. + */ + kIOPMTracePointSystemSleep = 0x1f, - Note: If a system is asleep and then loses power, and it does not have a hibernate - image to restore from (e.g. hibernatemode = 0), then OS X may interpret this power - loss as a system crash in the kTracePointSystemSleepPlatformPhase, since the - power loss resembles a hang or crash, and the power being removed by the user. +/* When kIOPMTracePointWakePlatformDriver is the latest tracePoint, + * PM is executing platform dependent code to prepare for system wake. + */ + kIOPMTracePointWakePlatformDriver = 0x21, + +/* When kIOPMTracePointWakePlatformActions is the latest tracePoint, + * PM is calling drivers that have registered a platform wake action. + */ + kIOPMTracePointWakePlatformActions = 0x22, - * Phase begins: IOPMrootDomain has already shut off drivers, and is now powering off CPU. - * Phase ends: Immediately after CPU's are powered back on during wakeup. +/* When kIOPMTracePointWakeCPUs is the latest tracePoint, + * PM is bringing all non-boot processors online. + */ + kIOPMTracePointWakeCPUs = 0x23, + +/* When kIOPMTracePointWakeWillPowerOnClients is the latest tracePoint, + * PM is sending kIOMessageSystemWillPowerOn to both kernel clients and + * applications. PM also notifies system capability clients about the + * proposed capability change. + * + * Phase begins: Send kIOMessageSystemWillPowerOn and + * kIOMessageSystemCapabilityChange notifications. + * Phase ends: Finished sending notifications. */ - kIOPMTracePointSystemSleepPlatformPhase = 0x15, + kIOPMTracePointWakeWillPowerOnClients = 0x24, -/* When kTracePointSystemWakeDriversPhase is the latest tracePoint, - System CPU is powered, PM is notifying drivers of system wake. +/* When kIOPMTracePointWakeWillChangeInterests is the latest tracePoint, + * PM is calling powerStateWillChangeTo() on interested drivers of root domain. + * + * Phase begins: Dispatch a callout thread to call interested drivers. 
+ * Phase ends: Callout thread work done, and acknowledgePowerChange() called + * by drivers that indicated asynchronous completion. + */ + kIOPMTracePointWakeWillChangeInterests = 0x25, + +/* When kIOPMTracePointWakeDidChangeInterests is the latest tracePoint, + * PM is calling powerStateDidChangeTo() on interested drivers of root domain. + * + * Phase begins: Dispatch a callout thread to call interested drivers. + * Phase ends: Callout thread work done, and acknowledgePowerChange() called + * by drivers that indicated asynchronous completion. + */ + kIOPMTracePointWakeDidChangeInterests = 0x26, - * Phase begins: CPU's have successfully powered up and OS is executing. - * Phase ends: All drivers have handled power events & acknowledged completion. - IOPMrootDomain is about to deliver kIOMessageSystemHasPoweredOn. +/* When kIOPMTracePointWakePowerPlaneDrivers is the latest tracePoint, + * PM is directing power plane drivers to power up in root-to-leaf order. + * + * Phase begins: Root domain informs its power children that it transitioned + * to ON state. This has a cascade effect and triggers all drivers in + * the power plane to re-evaluate and potentially change power state. + * Phase ends: All power transitions in response to the root domain power + * change have completed. */ - kIOPMTracePointSystemWakeDriversPhase = 0x21, + kIOPMTracePointWakePowerPlaneDrivers = 0x27, -/* When kTracePointSystemWakeAppsPhase is the latest tracePoint, - System CPU is powered, PM has powered on each driver. +/* When kIOPMTracePointWakeCapabilityClients is the latest tracePoint, + * PM is notifying system capability clients about system wake, and waiting + * for any asynchronous completions. + * + * Phase begins: Inform capability clients that system has gained capabilities. + * Phase ends: All notified clients have acknowledged. + */ + kIOPMTracePointWakeCapabilityClients = 0x28, - * Phase begins: IOPMrootDomain::tellChangeUp before sending asynchronous - kIOMessageSystemHasPoweredOn notifications - * Phase ends: IOPMrootDomain::tellChangeUp after sending asynchronous notifications +/* When kIOPMTracePointWakeApplications is the latest tracePoint, + * System CPU is powered, PM has powered on each driver. + * + * Phase begins: Send asynchronous kIOMessageSystemHasPoweredOn notifications. + * Phase ends: Finished sending asynchronous notifications. */ - kIOPMTracePointSystemWakeAppsPhase = 0x22, + kIOPMTracePointWakeApplications = 0x29, /* kIOPMTracePointSystemLoginwindowPhase - This phase represents a several minute window after the system has powered on. - Higher levels of system diagnostics are in a heightened state of alert in this phase, - in case any user errors occurred that we could not detect in software. - - This several minute window + * This phase represents a several minute window after the system has powered on. + * Higher levels of system diagnostics are in a heightened state of alert in this phase, + * in case any user errors occurred that we could not detect in software. + * + * Phase begins: After IOPMrootDomain sends kIOMessageSystemHasPoweredOn message. + * Phase ends: When loginwindow calls IOPMSleepWakeSetUUID(NULL) the system shall + * be considered awake and usable. The next phase shall be kIOPMTracePointSystemUp. + */ + kIOPMTracePointSystemLoginwindowPhase = 0x30, - * Phase begins: After IOPMrootDomain sends kIOMessageSystemHasPoweredOn message. - * Phase ends: When loginwindow calls IOPMSleepWakeSetUUID(NULL) the system shall - be considered awake and usable. 
The next phase shall be kIOPMTracePointSystemUp. +/* When kIOPMTracePointDarkWakeEntry is the latest tracePoint, + * PM has started a transition from full wake to dark wake. + * + * Phase begins: Start transition to dark wake. + * Phase ends: System in dark wake. Before recording kIOPMTracePointSystemUp. + */ + kIOPMTracePointDarkWakeEntry = 0x31, + +/* When kIOPMTracePointDarkWakeExit is the latest tracePoint, + * PM has started a transition from dark wake to full wake. + * + * Phase begins: Start transition to full wake. + * Phase ends: System in full wake. Before recording kIOPMTracePointSystemUp. */ - kIOPMTracePointSystemLoginwindowPhase = 0x30 + kIOPMTracePointDarkWakeExit = 0x32 }; /*****************************************************************************/ diff --git a/iokit/IOKit/pwr_mgt/IOPowerConnection.h b/iokit/IOKit/pwr_mgt/IOPowerConnection.h index 179035b2f..a7ece0ad5 100644 --- a/iokit/IOKit/pwr_mgt/IOPowerConnection.h +++ b/iokit/IOKit/pwr_mgt/IOPowerConnection.h @@ -46,16 +46,18 @@ class IOPowerConnection : public IOService protected: /*! @field parentKnowsState true: parent knows state of its domain used by child */ - bool stateKnown; + bool stateKnown; + /*! @field currentPowerFlags power flags which describe the current state of the power domain used by child */ IOPMPowerFlags currentPowerFlags; + /*! @field desiredDomainState state number which corresponds to the child's desire used by parent */ unsigned long desiredDomainState; /*! @field requestFlag set to true when desiredDomainState is set */ - bool requestFlag; + bool requestFlag; /*! @field preventIdleSleepFlag true if child has this bit set in its desired state used by parent */ @@ -67,16 +69,21 @@ protected: /*! @field awaitingAck true if child has not yet acked our notification used by parent */ - bool awaitingAck; + bool awaitingAck; /*! @field readyFlag true if the child has been added as a power child used by parent */ - bool readyFlag; + bool readyFlag; +#ifdef XNU_KERNEL_PRIVATE public: - /*! @function setParentKnowsState - @abstract Sets the stateKnown variable. - @discussion Called by the parent when the object is created and called by the child when it discovers that the parent now knows its state. */ + bool delayChildNotification; +#endif + +public: + /*! @function setParentKnowsState + @abstract Sets the stateKnown variable. + @discussion Called by the parent when the object is created and called by the child when it discovers that the parent now knows its state. */ void setParentKnowsState (bool ); /*! @function setParentCurrentPowerFlags @@ -107,7 +114,6 @@ public: @discussion Called by the parent. */ void setChildHasRequestedPower ( void ); - /*! @function childHasRequestedPower @abstract Return the flag that says whether the child has called requestPowerDomainState. 
@discussion Called by the PCI Aux Power Supply Driver to see if a device driver diff --git a/iokit/IOKit/pwr_mgt/Makefile b/iokit/IOKit/pwr_mgt/Makefile index 14165762a..b82357fe9 100644 --- a/iokit/IOKit/pwr_mgt/Makefile +++ b/iokit/IOKit/pwr_mgt/Makefile @@ -18,20 +18,18 @@ NOT_EXPORT_HEADERS = \ IOPMPagingPlexus.h INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = INSTINC_SUBDIRS_I386 = INSTINC_SUBDIRS_X86_64 = INSTINC_SUBDIRS_ARM = EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -EXPINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS_PPC} EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386} EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64} EXPINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS_ARM} ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h)) -INSTALL_MI_LIST = IOPMLibDefs.h IOPM.h IOPMDeprecated.h +INSTALL_MI_LIST = IOPMLibDefs.h IOPM.h INSTALL_MI_LCL_LIST = IOPMPrivate.h INSTALL_MI_DIR = $(MI_DIR) diff --git a/iokit/IOKit/pwr_mgt/RootDomain.h b/iokit/IOKit/pwr_mgt/RootDomain.h index 2de4d289c..760e7d674 100644 --- a/iokit/IOKit/pwr_mgt/RootDomain.h +++ b/iokit/IOKit/pwr_mgt/RootDomain.h @@ -31,18 +31,20 @@ #include <IOKit/IOService.h> #include <IOKit/pwr_mgt/IOPM.h> #include "IOKit/pwr_mgt/IOPMPrivate.h" +#include <IOKit/IOBufferMemoryDescriptor.h> #ifdef XNU_KERNEL_PRIVATE -#if defined(__i386__) || defined(__x86_64__) -#define ROOT_DOMAIN_RUN_STATES 1 -#endif struct AggressivesRecord; -class PMAssertionsTracker; -#endif /* XNU_KERNEL_PRIVATE */ - +struct IOPMMessageFilterContext; +struct IOPMActions; +class PMSettingObject; +class IOPMTimeline; +class PMEventDetails; +class PMTraceWorker; class IOPMPowerStateQueue; class RootDomainUserClient; -class PMTraceWorker; +class PMAssertionsTracker; +#endif /*! * Types for PM Assertions @@ -105,19 +107,6 @@ enum { #define kIOPMThermalEmergencySleepKey "Thermal Emergency Sleep" #define kIOPMMaintenanceSleepKey "Maintenance Sleep" -enum -{ - kIOPMSleepReasonClamshell = 1, - kIOPMSleepReasonPowerButton = 2, - kIOPMSleepReasonSoftware = 3, - kIOPMSleepReasonOSSwitchHibernation = 4, - kIOPMSleepReasonIdle = 5, - kIOPMSleepReasonLowPower = 6, - kIOPMSleepReasonThermalEmergency = 7, - kIOPMSleepReasonMaintenance = 8, - kIOPMSleepReasonMax -}; - /* * String constants for communication with PM CPU */ @@ -268,17 +257,6 @@ public: const OSSymbol * typeOfInterest, IOServiceInterestHandler handler, void * target, void * ref = 0 ); - - void pmStatsRecordEvent( - int eventIndex, - AbsoluteTime timestamp); - - void pmStatsRecordApplicationResponse( - const OSSymbol *response, - const char *name, - int messageType, - uint32_t delay_ms, - int app_pid); virtual IOReturn callPlatformFunction( const OSSymbol *functionName, @@ -346,57 +324,76 @@ private: #ifdef XNU_KERNEL_PRIVATE /* Root Domain internals */ public: + void tagPowerPlaneService( + IOService * service, + IOPMActions * actions ); -#if HIBERNATION - bool getHibernateSettings( - uint32_t * hibernateMode, - uint32_t * hibernateFreeRatio, - uint32_t * hibernateFreeTime ); -#endif + void overrideOurPowerChange( + IOService * service, + IOPMActions * actions, + unsigned long * inOutPowerState, + uint32_t * inOutChangeFlags ); -#if ROOT_DOMAIN_RUN_STATES - void tagPowerPlaneService( - IOService * service, - uint32_t * rdFlags ); - - void handleActivityTickleForService( IOService * service, - unsigned long type, - unsigned long currentPowerState, - uint32_t activityTickleCount ); - - void handlePowerChangeStartForService( - IOService * service, - uint32_t * rootDomainFlags, - uint32_t newPowerState, - uint32_t changeFlags ); - - void 
handlePowerChangeDoneForService( - IOService * service, - uint32_t * rootDomainFlags, - uint32_t newPowerState, - uint32_t changeFlags ); - - void overridePowerStateForService( + void handleOurPowerChangeStart( IOService * service, - uint32_t * rdFlags, - unsigned long * powerState, + IOPMActions * actions, + uint32_t powerState, + uint32_t * inOutChangeFlags ); + + void handleOurPowerChangeDone( + IOService * service, + IOPMActions * actions, + uint32_t powerState, + uint32_t changeFlags ); + + void overridePowerChangeForUIService( + IOService * service, + IOPMActions * actions, + unsigned long * inOutPowerState, + uint32_t * inOutChangeFlags ); + + void handleActivityTickleForDisplayWrangler( + IOService * service, + IOPMActions * actions ); + + bool shouldDelayChildNotification( + IOService * service ); + + void handlePowerChangeStartForPCIDevice( + IOService * service, + IOPMActions * actions, + uint32_t powerState, + uint32_t * inOutChangeFlags ); + + void handlePowerChangeDoneForPCIDevice( + IOService * service, + IOPMActions * actions, + uint32_t powerState, uint32_t changeFlags ); + void askChangeDownDone( + IOPMPowerChangeFlags * inOutChangeFlags, + bool * cancel ); + + void handlePublishSleepWakeUUID( + bool shouldPublish); + + void handleQueueSleepWakeUUID( + OSObject *obj); + IOReturn setMaintenanceWakeCalendar( const IOPMCalendarStruct * calendar ); -#endif /* ROOT_DOMAIN_RUN_STATES */ // Handle callbacks from IOService::systemWillShutdown() - void acknowledgeSystemWillShutdown( IOService * from ); + void acknowledgeSystemWillShutdown( IOService * from ); // Handle platform halt and restart notifications - void handlePlatformHaltRestart( UInt32 pe_type ); + void handlePlatformHaltRestart( UInt32 pe_type ); + + IOReturn shutdownSystem( void ); + IOReturn restartSystem( void ); + void handleSleepTimerExpiration( void ); - IOReturn shutdownSystem( void ); - IOReturn restartSystem( void ); - void handleSleepTimerExpiration( void ); - void handleForcedSleepTimerExpiration( void ); - void stopIgnoringClamshellEventsDuringWakeup( void ); bool activitySinceSleep(void); bool abortHibernation(void); @@ -404,15 +401,67 @@ public: void handleAggressivesRequests( void ); void tracePoint( uint8_t point ); + void tracePoint( uint8_t point, uint8_t data ); + void traceDetail( uint32_t data32 ); + + bool systemMessageFilter( + void * object, void * arg1, void * arg2, void * arg3 ); + +/*! @function recordPMEvent + @abstract Logs IOService PM event timing. + @discussion Should only be called from IOServicePM. Should not be exported. + @result kIOReturn on success. 
+*/ + IOReturn recordPMEvent( PMEventDetails *details ); + IOReturn recordAndReleasePMEvent( PMEventDetails *details ); + IOReturn recordPMEventGated( PMEventDetails *details ); + IOReturn recordAndReleasePMEventGated( PMEventDetails *details ); + + void pmStatsRecordEvent( + int eventIndex, + AbsoluteTime timestamp); + + void pmStatsRecordApplicationResponse( + const OSSymbol *response, + const char *name, + int messageType, + uint32_t delay_ms, + int app_pid); + +#if HIBERNATION + bool getHibernateSettings( + uint32_t * hibernateMode, + uint32_t * hibernateFreeRatio, + uint32_t * hibernateFreeTime ); +#endif private: friend class PMSettingObject; - friend class PMAssertionsTracker; friend class RootDomainUserClient; + friend class PMAssertionsTracker; + + static IOReturn sysPowerDownHandler( void * target, void * refCon, + UInt32 messageType, IOService * service, + void * messageArgument, vm_size_t argSize ); + + static IOReturn displayWranglerNotification( void * target, void * refCon, + UInt32 messageType, IOService * service, + void * messageArgument, vm_size_t argSize ); + + static IOReturn rootBusyStateChangeHandler( void * target, void * refCon, + UInt32 messageType, IOService * service, + void * messageArgument, vm_size_t argSize ); + + static bool displayWranglerMatchPublished( void * target, void * refCon, + IOService * newService, + IONotifier * notifier); + + static bool batteryPublished( void * target, void * refCon, + IOService * resourceService, + IONotifier * notifier); - // Points to our parent IOService * wrangler; - class IORootParent * patriarch; + IOService * wranglerConnection; IOLock *featuresDictLock; // guards supportedFeatures IOPMPowerStateQueue *pmPowerStateQueue; @@ -422,7 +471,7 @@ private: PMAssertionsTracker *pmAssertions; // Settings controller info - IORecursiveLock *settingsCtrlLock; + IOLock *settingsCtrlLock; OSDictionary *settingsCallbacks; OSDictionary *fPMSettingsDict; @@ -430,16 +479,16 @@ private: IONotifier *_displayWranglerNotifier; // Statistics - const OSSymbol *_statsNameKey; - const OSSymbol *_statsPIDKey; - const OSSymbol *_statsTimeMSKey; - const OSSymbol *_statsResponseTypeKey; - const OSSymbol *_statsMessageTypeKey; + const OSSymbol *_statsNameKey; + const OSSymbol *_statsPIDKey; + const OSSymbol *_statsTimeMSKey; + const OSSymbol *_statsResponseTypeKey; + const OSSymbol *_statsMessageTypeKey; OSString *queuedSleepWakeUUIDString; - OSArray *pmStatsAppResponses; + bool uuidPublished; PMStatsStruct pmStats; // Pref: idle time before idle sleep @@ -452,41 +501,78 @@ private: unsigned long extraSleepDelay; // Used to wait between say display idle and system idle - thread_call_t extraSleepTimer; - - // Used to ignore clamshell close events while we're waking from sleep - thread_call_t clamshellWakeupIgnore; - + thread_call_t extraSleepTimer; thread_call_t diskSyncCalloutEntry; - uint32_t runStateIndex; - uint32_t runStateFlags; - uint32_t nextRunStateIndex; - uint32_t wranglerTickled; + // IOPMActions parameter encoding + enum { + kPMActionsFlagIsDisplayWrangler = 0x00000001, + kPMActionsFlagIsGraphicsDevice = 0x00000002, + kPMActionsFlagIsAudioDevice = 0x00000004, + kPMActionsFlagLimitPower = 0x00000008, + kPMActionsPCIBitNumberMask = 0x000000ff + }; + + // Track system capabilities. + uint32_t _desiredCapability; + uint32_t _currentCapability; + uint32_t _pendingCapability; + uint32_t _highestCapability; + OSSet * _joinedCapabilityClients; + uint32_t _systemStateGeneration; + + // Type of clients that can receive system messages. 
+ enum { + kSystemMessageClientConfigd = 0x01, + kSystemMessageClientApp = 0x02, + kSystemMessageClientUser = 0x03, + kSystemMessageClientKernel = 0x04, + kSystemMessageClientAll = 0x07 + }; + uint32_t _systemMessageClientMask; + + // Power state and capability change transitions. + enum { + kSystemTransitionNone = 0, + kSystemTransitionSleep = 1, + kSystemTransitionWake = 2, + kSystemTransitionCapability = 3, + kSystemTransitionNewCapClient = 4 + } _systemTransitionType; unsigned int systemBooting :1; unsigned int systemShutdown :1; + unsigned int systemDarkWake :1; unsigned int clamshellExists :1; - unsigned int clamshellIsClosed :1; - unsigned int ignoringClamshell :1; - unsigned int ignoringClamshellOnWake :1; + unsigned int clamshellClosed :1; + unsigned int clamshellDisabled :1; unsigned int desktopMode :1; - unsigned int acAdaptorConnected :1; + unsigned int acAdaptorConnected :1; - unsigned int allowSleep :1; - unsigned int sleepIsSupported :1; - unsigned int canSleep :1; - unsigned int sleepASAP :1; unsigned int idleSleepTimerPending :1; unsigned int userDisabledAllSleep :1; - unsigned int ignoreChangeDown :1; + unsigned int childPreventSystemSleep :1; + unsigned int ignoreTellChangeDown :1; unsigned int wranglerAsleep :1; + unsigned int wranglerTickled :1; + unsigned int wranglerSleepIgnored :1; + unsigned int graphicsSuppressed :1; + + unsigned int capabilityLoss :1; + unsigned int pciCantSleepFlag :1; + unsigned int pciCantSleepValid :1; + unsigned int logWranglerTickle :1; + unsigned int logGraphicsClamp :1; + unsigned int darkWakeToSleepASAP :1; + unsigned int darkWakeMaintenance :1; + unsigned int darkWakePostTickle :1; unsigned int sleepTimerMaintenance :1; unsigned int lowBatteryCondition :1; unsigned int hibernateDisabled :1; unsigned int hibernateNoDefeat :1; unsigned int hibernateAborted :1; + unsigned int rejectWranglerTickle :1; uint32_t hibernateMode; uint32_t userActivityCount; @@ -498,54 +584,45 @@ private: int32_t idxPMCPULimitedPower; IOOptionBits platformSleepSupport; + uint32_t _debugWakeSeconds; queue_head_t aggressivesQueue; thread_call_t aggressivesThreadCall; OSData * aggressivesData; AbsoluteTime wranglerSleepTime; - + AbsoluteTime systemWakeTime; + // PCI top-level PM trace IOService * pciHostBridgeDevice; + IOService * pciHostBridgeDriver; - // IOPMrootDomain internal sleep call - IOReturn privateSleepSystem( uint32_t sleepReason ); - void announcePowerSourceChange( void ); + IONotifier * systemCapabilityNotifier; - void reportUserInput( void ); - static IOReturn sysPowerDownHandler( void * target, void * refCon, - UInt32 messageType, IOService * service, - void * messageArgument, vm_size_t argSize ); + IOPMTimeline *timeline; - static IOReturn displayWranglerNotification( void * target, void * refCon, - UInt32 messageType, IOService * service, - void * messageArgument, vm_size_t argSize ); + // IOPMrootDomain internal sleep call + IOReturn privateSleepSystem( uint32_t sleepReason ); + void reportUserInput( void ); + bool checkSystemCanSleep( IOOptionBits options = 0 ); - static bool displayWranglerPublished( void * target, void * refCon, - IOService * newService); + void adjustPowerState( bool sleepASAP = false ); + void setQuickSpinDownTimeout( void ); + void restoreUserSpinDownTimeout( void ); - static bool batteryPublished( void * target, void * refCon, - IOService * resourceService ); + bool shouldSleepOnClamshellClosed(void ); + void sendClientClamshellNotification( void ); - void adjustPowerState( void ); - void setQuickSpinDownTimeout( void ); - 
void restoreUserSpinDownTimeout( void ); - - bool shouldSleepOnClamshellClosed(void ); - void sendClientClamshellNotification( void ); - // Inform PMCPU of changes to state like lid, AC vs. battery - void informCPUStateChange( uint32_t type, uint32_t value ); + void informCPUStateChange( uint32_t type, uint32_t value ); - void dispatchPowerEvent( uint32_t event, void * arg0, uint64_t arg1 ); - void handlePowerNotification( UInt32 msg ); + void dispatchPowerEvent( uint32_t event, void * arg0, uint64_t arg1 ); + void handlePowerNotification( UInt32 msg ); - IOReturn setPMSetting(const OSSymbol *, OSObject *); + IOReturn setPMSetting(const OSSymbol *, OSObject *); - void startIdleSleepTimer( uint32_t inSeconds ); - void cancelIdleSleepTimer( void ); - - void updateRunState( uint32_t inRunState ); + void startIdleSleepTimer( uint32_t inSeconds ); + void cancelIdleSleepTimer( void ); IOReturn setAggressiveness( unsigned long type, @@ -561,19 +638,23 @@ private: const AggressivesRecord * array, int count ); - void aggressivenessChanged( void ); + // getPMTraceMemoryDescriptor should only be called by our friend RootDomainUserClient + IOMemoryDescriptor *getPMTraceMemoryDescriptor(void); IOReturn setPMAssertionUserLevels(IOPMDriverAssertionType); - + void publishSleepWakeUUID( bool shouldPublish ); + void evaluatePolicy( int stimulus, uint32_t arg = 0 ); + + void deregisterPMSettingObject( PMSettingObject * pmso ); + #if HIBERNATION bool getSleepOption( const char * key, uint32_t * option ); bool evaluateSystemSleepPolicy( IOPMSystemSleepParameters * p ); void evaluateSystemSleepPolicyEarly( void ); void evaluateSystemSleepPolicyFinal( void ); #endif /* HIBERNATION */ - #endif /* XNU_KERNEL_PRIVATE */ }; @@ -582,10 +663,9 @@ class IORootParent: public IOService { OSDeclareFinalStructors(IORootParent) -private: - unsigned long mostRecentChange; - public: + static void initialize( void ); + virtual OSObject * copyProperty( const char * aKey ) const; bool start( IOService * nub ); void shutDownSystem( void ); void restartSystem( void ); diff --git a/iokit/IOKit/rtc/Makefile b/iokit/IOKit/rtc/Makefile index ace4cfb12..e16d5b83a 100644 --- a/iokit/IOKit/rtc/Makefile +++ b/iokit/IOKit/rtc/Makefile @@ -14,13 +14,11 @@ MI_DIR = rtc NOT_EXPORT_HEADERS = INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = INSTINC_SUBDIRS_I386 = INSTINC_SUBDIRS_X86_64 = INSTINC_SUBDIRS_ARM = EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -EXPINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS_PPC} EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386} EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64} EXPINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS_ARM} diff --git a/iokit/IOKit/system_management/Makefile b/iokit/IOKit/system_management/Makefile index 1f168421f..c887db562 100644 --- a/iokit/IOKit/system_management/Makefile +++ b/iokit/IOKit/system_management/Makefile @@ -14,13 +14,11 @@ MI_DIR = system_management NOT_EXPORT_HEADERS = INSTINC_SUBDIRS = -INSTINC_SUBDIRS_PPC = INSTINC_SUBDIRS_I386 = INSTINC_SUBDIRS_X86_64 = INSTINC_SUBDIRS_ARM = EXPINC_SUBDIRS = ${INSTINC_SUBDIRS} -EXPINC_SUBDIRS_PPC = ${INSTINC_SUBDIRS_PPC} EXPINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS_I386} EXPINC_SUBDIRS_X86_64 = ${INSTINC_SUBDIRS_X86_64} EXPINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS_ARM} diff --git a/iokit/Kernel/IOBufferMemoryDescriptor.cpp b/iokit/Kernel/IOBufferMemoryDescriptor.cpp index 004d2ec89..563059600 100644 --- a/iokit/Kernel/IOBufferMemoryDescriptor.cpp +++ b/iokit/Kernel/IOBufferMemoryDescriptor.cpp @@ -103,16 +103,6 @@ bool IOBufferMemoryDescriptor::initWithPhysicalMask( // Grab IOMD bits from 
the Buffer MD options iomdOptions |= (options & kIOBufferDescriptorMemoryFlags); -#if 0 - // workarounds- - if ((options & kIOMemoryPhysicallyContiguous) || ((capacity == 0x1000) && (inTask == kernel_task)) - && !physicalMask) - { - highestMask = physicalMask = 0xFFFFF000; - } - //- -#endif - if (physicalMask && (alignment <= 1)) { alignment = ((physicalMask ^ (-1ULL)) & (physicalMask - 1)); diff --git a/iokit/Kernel/IOCPU.cpp b/iokit/Kernel/IOCPU.cpp index 7646d2a97..5dd9ea416 100644 --- a/iokit/Kernel/IOCPU.cpp +++ b/iokit/Kernel/IOCPU.cpp @@ -302,12 +302,15 @@ void IOCPUSleepKernel(void) long cnt, numCPUs; IOCPU *target; IOCPU *bootCPU = NULL; - + IOPMrootDomain *rootDomain = IOService::getPMRootDomain(); + kprintf("IOCPUSleepKernel\n"); OSIterator * iter; IOService * service; + rootDomain->tracePoint( kIOPMTracePointSleepPlatformActions ); + queue_init(&gIOSleepActionQueue); queue_init(&gIOWakeActionQueue); @@ -333,6 +336,8 @@ void IOCPUSleepKernel(void) iocpu_run_platform_actions(&gIOSleepActionQueue, 0, 0U-1, NULL, NULL, NULL); + rootDomain->tracePoint( kIOPMTracePointSleepCPUs ); + numCPUs = gIOCPUs->getCount(); // Sleep the CPUs. cnt = numCPUs; @@ -352,10 +357,14 @@ void IOCPUSleepKernel(void) } } + rootDomain->tracePoint( kIOPMTracePointSleepPlatformDriver ); + // Now sleep the boot CPU. if (bootCPU) bootCPU->haltCPU(); + rootDomain->tracePoint( kIOPMTracePointWakePlatformActions ); + iocpu_run_platform_actions(&gIOWakeActionQueue, 0, 0U-1, NULL, NULL, NULL); @@ -372,6 +381,8 @@ void IOCPUSleepKernel(void) if (!queue_empty(&gIOWakeActionQueue)) panic("gIOWakeActionQueue"); + rootDomain->tracePoint( kIOPMTracePointWakeCPUs ); + // Wake the other CPUs. for (cnt = 0; cnt < numCPUs; cnt++) { diff --git a/iokit/Kernel/IOCatalogue.cpp b/iokit/Kernel/IOCatalogue.cpp index 8c51eed84..c6de3b56c 100644 --- a/iokit/Kernel/IOCatalogue.cpp +++ b/iokit/Kernel/IOCatalogue.cpp @@ -74,31 +74,6 @@ IOLock * gIOCatalogLock; #if PRAGMA_MARK #pragma mark Utility functions #endif -/********************************************************************* -*********************************************************************/ -static void -UniqueProperties(OSDictionary * dict) -{ - OSString * data; - - data = OSDynamicCast(OSString, dict->getObject(gIOClassKey)); - if (data) { - const OSSymbol *classSymbol = OSSymbol::withString(data); - - dict->setObject( gIOClassKey, (OSSymbol *) classSymbol); - classSymbol->release(); - } - - data = OSDynamicCast(OSString, dict->getObject(gIOMatchCategoryKey)); - if (data) { - const OSSymbol *classSymbol = OSSymbol::withString(data); - - dict->setObject(gIOMatchCategoryKey, (OSSymbol *) classSymbol); - classSymbol->release(); - } - return; -} - /********************************************************************* * Add a new personality to the set if it has a unique IOResourceMatchKey value. * XXX -- svail: This should be optimized. 
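The IOCatalogue hunks below replace the file-local UniqueProperties() helper
(removed above) with OSKext::uniquePersonalityProperties(). The underlying
operation is OSString-to-OSSymbol interning of personality property values, so
that repeated matching can compare shared, uniqued objects instead of raw
strings. A minimal sketch of that interning step, assuming only documented
libkern semantics (the helper name is illustrative, not part of the patch):

    // Replace an OSString-valued property with an interned OSSymbol.
    // OSSymbol::withString() hands back the one shared symbol for a given
    // string, so equal values become pointer-comparable.
    static void
    internStringProperty(OSDictionary * dict, const OSSymbol * key)
    {
        OSString * value = OSDynamicCast(OSString, dict->getObject(key));
        if (value) {
            const OSSymbol * sym = OSSymbol::withString(value);
            if (sym) {
                dict->setObject(key, (OSSymbol *) sym); // dictionary retains sym
                sym->release();                         // drop our reference
            }
        }
    }

The removed UniqueProperties() applied exactly this to gIOClassKey and
gIOMatchCategoryKey; the OSKext replacement takes over that job at every
catalogue entry point touched in the hunks that follow.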
@@ -170,13 +145,13 @@ bool IOCatalogue::init(OSArray * initArray)
     gIOCatalogLock = IOLockAlloc();
     lock = gIOCatalogLock;
-#if __ppc__ || __i386__
-    kld_lock = NULL;
-#endif /* __ppc__ || __i386__ */
+#if __i386__
+    kld_lock = NULL;
+#endif /* __i386__ */
 
     kernelTables->reset();
     while( (dict = (OSDictionary *) kernelTables->getNextObject())) {
-        UniqueProperties(dict);
+        OSKext::uniquePersonalityProperties(dict);
         if( 0 == dict->getObject( gIOClassKey ))
             IOLog("Missing or bad \"%s\" key\n",
                 gIOClassKey->getCStringNoCopy());
@@ -306,7 +281,7 @@ IOCatalogue::findDrivers(
     OSDictionary         * dict;
     OSOrderedSet         * set;
 
-    UniqueProperties(matching);
+    OSKext::uniquePersonalityProperties(matching);
 
     set = OSOrderedSet::withCapacity( 1, IOServiceOrdering,
                           (void *)gIOProbeScoreKey );
@@ -345,7 +320,7 @@ bool IOCatalogue::addDrivers(
     bool                   result = false;
     OSCollectionIterator * iter = NULL;       // must release
     OSOrderedSet         * set = NULL;        // must release
-    OSDictionary         * dict = NULL;       // do not release
+    OSObject             * object = NULL;     // do not release
     OSArray              * persons = NULL;    // do not release
 
     persons = OSDynamicCast(OSArray, drivers);
@@ -364,16 +339,26 @@ bool IOCatalogue::addDrivers(
         goto finish;
     }
 
+    /* Start with success; clear it on an error.
+     */
    result = true;
 
    IOLockLock(lock);
-   while ( (dict = (OSDictionary *) iter->getNextObject()) ) {
+   while ( (object = iter->getNextObject()) ) {
 
       // xxx Deleted OSBundleModuleDemand check; will handle in other ways for SL
+      OSDictionary * personality = OSDynamicCast(OSDictionary, object);
+
       SInt count;
-
-      UniqueProperties(dict);
+
+      if (!personality) {
+          IOLog("IOCatalogue::addDrivers() encountered non-dictionary; bailing.\n");
+          result = false;
+          break;
+      }
+
+      OSKext::uniquePersonalityProperties(personality);
 
       // Add driver personality to catalogue.
       count = array->getCount();
@@ -389,7 +374,7 @@ bool IOCatalogue::addDrivers(
            * Do not compare just the properties present in one driver
            * personality or the other.
            */
-          if (dict->isEqualTo(driver)) {
+          if (personality->isEqualTo(driver)) {
              break;
          }
       }
@@ -398,15 +383,15 @@ bool IOCatalogue::addDrivers(
          continue;
       }
 
-      result = array->setObject(dict);
+      result = array->setObject(personality);
       if (!result) {
          break;
       }
 
-      AddNewImports(set, dict);
+      AddNewImports(set, personality);
    }
 
    // Start device matching.
-   if (doNubMatching && (set->getCount() > 0)) {
+   if (result && doNubMatching && (set->getCount() > 0)) {
       IOService::catalogNewDrivers(set);
       generation++;
    }
@@ -455,7 +440,7 @@ IOCatalogue::removeDrivers(
         return false;
     }
 
-    UniqueProperties( matching );
+    OSKext::uniquePersonalityProperties( matching );
 
     IOLockLock(lock);
     kernelTables->reset();
@@ -553,8 +538,7 @@ void IOCatalogue::moduleHasLoaded(OSString * moduleName)
     startMatching(dict);
     dict->release();
 
-    (void) OSKext::setDeferredLoadSucceeded();
-    (void) OSKext::considerRebuildOfPrelinkedKernel();
+    (void) OSKext::considerRebuildOfPrelinkedKernel(moduleName);
 }
 
 void IOCatalogue::moduleHasLoaded(const char * moduleName)
@@ -589,7 +573,7 @@ static IOReturn _terminateDrivers(OSDictionary * matching)
     if ( !iter )
        return kIOReturnNoMemory;
 
-    UniqueProperties( matching );
+    OSKext::uniquePersonalityProperties( matching );
 
     // terminate instances.
do { @@ -785,7 +769,183 @@ bool IOCatalogue::startMatching( OSDictionary * matching ) void IOCatalogue::reset(void) { + IOCatalogue::resetAndAddDrivers(/* no drivers; true reset */ NULL, + /* doMatching */ false); + return; +} + +bool IOCatalogue::resetAndAddDrivers(OSArray * drivers, bool doNubMatching) +{ + bool result = false; + OSArray * newPersonalities = NULL; // do not release + OSCollectionIterator * newPIterator = NULL; // must release + OSOrderedSet * matchSet = NULL; // must release + OSArray * oldPersonalities = NULL; // must release + OSArray * kernelPersonalities = NULL; // must release + OSString * errorString = NULL; // must release + OSObject * object = NULL; // do not release + OSDictionary * thisNewPersonality = NULL; // do not release + signed int count, i; + + extern const char * gIOKernelConfigTables; + + if (drivers) { + newPersonalities = OSDynamicCast(OSArray, drivers); + if (!newPersonalities) { + goto finish; + } + + newPIterator = OSCollectionIterator::withCollection(newPersonalities); + if (!newPIterator) { + goto finish; + } + + matchSet = OSOrderedSet::withCapacity(10, IOServiceOrdering, + (void *)gIOProbeScoreKey); + if (!matchSet) { + goto finish; + } + } + + /* Read personalities for the built-in kernel driver classes. + * We don't have many any more. + */ + kernelPersonalities = OSDynamicCast(OSArray, + OSUnserialize(gIOKernelConfigTables, &errorString)); + if (!kernelPersonalities && errorString) { + IOLog("KernelConfigTables syntax error: %s\n", + errorString->getCStringNoCopy()); + goto finish; + } + + /* Now copy the current array of personalities so we can reuse them + * if the new list contains any duplicates. This saves on memory + * consumption. + */ + oldPersonalities = OSDynamicCast(OSArray, array->copyCollection()); + if (!oldPersonalities) { + goto finish; + } + + result = true; + IOLog("Resetting IOCatalogue.\n"); + + /* No goto finish from here to unlock. + */ + IOLockLock(lock); + + array->flushCollection(); + + /* Add back the kernel personalities and remove them from the old + * array so we don't try to match on them again. Go forward through + * the arrays as this causes the least iteration since kernel personalities + * should always be first. + */ + count = kernelPersonalities->getCount(); + for (i = 0; i < count; i++) { + + /* Static cast here, as the data is coming from within the kernel image. + */ + OSDictionary * thisNewPersonality = (OSDictionary *) + kernelPersonalities->getObject(i); + array->setObject(thisNewPersonality); + + signed int oldPCount = oldPersonalities->getCount(); + for (signed int oldPIndex = 0; oldPIndex < oldPCount; oldPIndex++) { + if (thisNewPersonality->isEqualTo(oldPersonalities->getObject(oldPIndex))) { + oldPersonalities->removeObject(oldPIndex); + break; + } + } + } + + /* Now add the new set of personalities passed in, using existing + * copies if we had them in kernel memory already. + */ + if (newPIterator) { + OSDictionary * thisOldPersonality = NULL; // do not release + + while ( (object = newPIterator->getNextObject()) ) { + + thisNewPersonality = OSDynamicCast(OSDictionary, object); + if (!thisNewPersonality) { + IOLog("IOCatalogue::resetAndAddDrivers() encountered non-dictionary; bailing.\n"); + result = false; + break; + } + + /* Convert common OSString property values to OSSymbols. + */ + OSKext::uniquePersonalityProperties(thisNewPersonality); + + /* Add driver personality to catalogue, but if we had a copy already + * use that instead so we don't have multiple copies from OSKext instances. 
+         */
+        count = oldPersonalities->getCount();
+        thisOldPersonality = NULL;
+        while (count--) {
+
+            thisOldPersonality = (OSDictionary *)oldPersonalities->getObject(count);
+
+            /* Unlike in other functions, this comparison must be exact!
+             * The catalogue must be able to contain personalities that
+             * are proper supersets of others.
+             * Do not compare just the properties present in one driver
+             * personality or the other.
+             */
+            if (thisNewPersonality->isEqualTo(thisOldPersonality)) {
+                break;
+            }
+        }
+
+        /* If we found a dup, add the *original* back to the catalogue,
+         * remove it from our bookkeeping list, and continue.
+         * Don't worry about matching on personalities we already had.
+         */
+        if (count >= 0) {
+            array->setObject(thisOldPersonality);
+            oldPersonalities->removeObject(count);
+            continue;
+        }
+
+        /* Otherwise add the new personality and mark it for matching.
+         */
+        array->setObject(thisNewPersonality);
+        AddNewImports(matchSet, thisNewPersonality);
+    }
+
+    /*****
+     * Now, go through remaining old personalities, which have effectively
+     * been removed, and add them to the match set as necessary.
+     */
+    count = oldPersonalities->getCount();
+    while (count--) {
+
+        /* Static cast here is ok as these dictionaries were already in the catalogue.
+         */
+        thisOldPersonality = (OSDictionary *)oldPersonalities->getObject(count);
+        AddNewImports(matchSet, thisOldPersonality);
+    }
+
+    /* Finally, start device matching on all new & removed personalities.
+     */
+    if (result && doNubMatching && (matchSet->getCount() > 0)) {
+        IOService::catalogNewDrivers(matchSet);
+        generation++;
+    }
+
+    IOLockUnlock(lock);
+
+finish:
+    if (newPIterator) newPIterator->release();
+    if (matchSet) matchSet->release();
+    if (oldPersonalities) oldPersonalities->release();
+    if (kernelPersonalities) kernelPersonalities->release();
+    if (errorString) errorString->release();
+
+    return result;
 }
 
 bool IOCatalogue::serialize(OSSerialize * s) const
@@ -837,9 +997,9 @@ bool IOCatalogue::serializeData(IOOptionBits kind, OSSerialize * s) const
 **********************************************************************
 **********************************************************************
 * These functions are no longer used but are necessary for C++ binary
-* compatibility on ppc/i386.
+* compatibility on i386.
 **********************************************************************/
-#if __ppc__ || __i386__
+#if __i386__
 bool IOCatalogue::recordStartupExtensions(void)
 {  return false;  }
@@ -850,4 +1010,4 @@ bool IOCatalogue::addExtensionsFromArchive(OSData * mkext)
 kern_return_t IOCatalogue::removeKernelLinker(void)
 {  return KERN_NOT_SUPPORTED;  }
-#endif /* __ppc__ || __i386__ */
+#endif /* __i386__ */
diff --git a/iokit/Kernel/IOCommandGate.cpp b/iokit/Kernel/IOCommandGate.cpp
index 0b823d2b6..29ecd859e 100644
--- a/iokit/Kernel/IOCommandGate.cpp
+++ b/iokit/Kernel/IOCommandGate.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2000, 2009-2010 Apple Inc. All rights reserved.
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -49,11 +49,33 @@ OSMetaClassDefineReservedUnused(IOCommandGate, 5); OSMetaClassDefineReservedUnused(IOCommandGate, 6); OSMetaClassDefineReservedUnused(IOCommandGate, 7); -bool IOCommandGate::checkForWork() { return false; } +#if IOKITSTATS + +#define IOStatisticsInitializeCounter() \ +do { \ + IOStatistics::setCounterType(IOEventSource::reserved->counter, kIOStatisticsCommandGateCounter); \ +} while (0) + +#define IOStatisticsActionCall() \ +do { \ + IOStatistics::countCommandGateActionCall(IOEventSource::reserved->counter); \ +} while (0) + +#else + +#define IOStatisticsInitializeCounter() +#define IOStatisticsActionCall() + +#endif /* IOKITSTATS */ bool IOCommandGate::init(OSObject *inOwner, Action inAction) { - return super::init(inOwner, (IOEventSource::Action) inAction); + bool res = super::init(inOwner, (IOEventSource::Action) inAction); + if (res) { + IOStatisticsInitializeCounter(); + } + + return res; } IOCommandGate * @@ -162,6 +184,8 @@ IOReturn IOCommandGate::runAction(Action inAction, IOTimeStampStartConstant(IODBG_CMDQ(IOCMDQ_ACTION), (uintptr_t) inAction, (uintptr_t) owner); + IOStatisticsActionCall(); + // Must be gated and on the work loop or enabled res = (*inAction)(owner, arg0, arg1, arg2, arg3); @@ -170,7 +194,7 @@ IOReturn IOCommandGate::runAction(Action inAction, (uintptr_t) inAction, (uintptr_t) owner); openGate(); - + return res; } @@ -196,9 +220,11 @@ IOReturn IOCommandGate::attemptAction(Action inAction, if (trace) IOTimeStampStartConstant(IODBG_CMDQ(IOCMDQ_ACTION), - (uintptr_t) inAction, (uintptr_t) owner); - - res = (*inAction)(owner, arg0, arg1, arg2, arg3); + (uintptr_t) inAction, (uintptr_t) owner); + + IOStatisticsActionCall(); + + res = (*inAction)(owner, arg0, arg1, arg2, arg3); if (trace) IOTimeStampEndConstant(IODBG_CMDQ(IOCMDQ_ACTION), diff --git a/iokit/Kernel/IOCommandQueue.cpp b/iokit/Kernel/IOCommandQueue.cpp index 7d7249dee..3a184bf94 100644 --- a/iokit/Kernel/IOCommandQueue.cpp +++ b/iokit/Kernel/IOCommandQueue.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2004 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -35,6 +35,20 @@ #include <mach/sync_policy.h> +#if IOKITSTATS + +#define IOStatisticsInitializeCounter() \ + IOStatistics::setCounterType(reserved->counter, kIOStatisticsCommandQueueCounter) + +#define IOStatisticsActionCall() \ + IOStatistics::countCommandQueueActionCall(reserved->counter) + +#else + +#define IOStatisticsInitializeCounter() +#define IOStatisticsActionCall() + +#endif /* IOKITSTATS */ #define NUM_FIELDS_IN_COMMAND 4 typedef struct commandEntryTag { @@ -87,6 +101,8 @@ bool IOCommandQueue::init(OSObject *inOwner, producerIndex = consumerIndex = 0; + IOStatisticsInitializeCounter(); + return true; } @@ -130,7 +146,7 @@ void IOCommandQueue::free() bool IOCommandQueue::checkForWork() { - void *field0, *field1, *field2, *field3; + void *field0, *field1, *field2, *field3; bool trace = ( gIOKitTrace & kIOTraceCommandGates ) ? 
true : false; if (!enabled || consumerIndex == producerIndex) @@ -150,10 +166,11 @@ bool IOCommandQueue::checkForWork() if (trace) IOTimeStampStartConstant(IODBG_CMDQ(IOCMDQ_ACTION), - (uintptr_t) action, (uintptr_t) owner); - + (uintptr_t) action, (uintptr_t) owner); + + IOStatisticsActionCall(); (*(IOCommandQueueAction) action)(owner, field0, field1, field2, field3); - + if (trace) IOTimeStampEndConstant(IODBG_CMDQ(IOCMDQ_ACTION), (uintptr_t) action, (uintptr_t) owner); diff --git a/iokit/Kernel/IODMACommand.cpp b/iokit/Kernel/IODMACommand.cpp index 444abf720..b95ee921d 100644 --- a/iokit/Kernel/IODMACommand.cpp +++ b/iokit/Kernel/IODMACommand.cpp @@ -84,7 +84,7 @@ enum /**************************** class IODMACommand ***************************/ #undef super -#define super OSObject +#define super IOCommand OSDefineMetaClassAndStructors(IODMACommand, IOCommand); OSMetaClassDefineReservedUsed(IODMACommand, 0); @@ -227,6 +227,8 @@ IODMACommand::free() IOReturn IODMACommand::setMemoryDescriptor(const IOMemoryDescriptor *mem, bool autoPrepare) { + IOReturn err = kIOReturnSuccess; + if (mem == fMemory) { if (!autoPrepare) @@ -244,15 +246,15 @@ IODMACommand::setMemoryDescriptor(const IOMemoryDescriptor *mem, bool autoPrepar if (fActive) return kIOReturnBusy; clearMemoryDescriptor(); - }; + } if (mem) { bzero(&fMDSummary, sizeof(fMDSummary)); - IOReturn rtn = mem->dmaCommandOperation( + err = mem->dmaCommandOperation( kIOMDGetCharacteristics, &fMDSummary, sizeof(fMDSummary)); - if (rtn) - return rtn; + if (err) + return err; ppnum_t highPage = fMDSummary.fHighestPage ? fMDSummary.fHighestPage : gIOLastPage; @@ -269,11 +271,15 @@ IODMACommand::setMemoryDescriptor(const IOMemoryDescriptor *mem, bool autoPrepar fMemory = mem; mem->dmaCommandOperation(kIOMDSetDMAActive, this, 0); - if (autoPrepare) - return prepare(); - }; - - return kIOReturnSuccess; + if (autoPrepare) { + err = prepare(); + if (err) { + clearMemoryDescriptor(); + } + } + } + + return err; } IOReturn diff --git a/iokit/Kernel/IODMAController.cpp b/iokit/Kernel/IODMAController.cpp index 33a54dc76..603998035 100644 --- a/iokit/Kernel/IODMAController.cpp +++ b/iokit/Kernel/IODMAController.cpp @@ -50,9 +50,9 @@ IODMAController *IODMAController::getController(IOService *provider, UInt32 dmaI // Find the name of the parent dma controller dmaParentData = OSDynamicCast(OSData, provider->getProperty("dma-parent")); - if (dmaParentData == 0) return false; + if (dmaParentData == 0) return NULL; dmaParentName = createControllerName(*(UInt32 *)dmaParentData->getBytesNoCopy()); - if (dmaParentName == 0) return false; + if (dmaParentName == 0) return NULL; // Wait for the parent dma controller dmaController = OSDynamicCast(IODMAController, IOService::waitForService(IOService::nameMatching(dmaParentName))); diff --git a/iokit/Kernel/IODeviceTreeSupport.cpp b/iokit/Kernel/IODeviceTreeSupport.cpp index afb221cf4..8de463efd 100644 --- a/iokit/Kernel/IODeviceTreeSupport.cpp +++ b/iokit/Kernel/IODeviceTreeSupport.cpp @@ -37,12 +37,14 @@ #include <pexpert/device_tree.h> +#include <machine/machine_routines.h> + extern "C" { - #include <machine/machine_routines.h> - void DTInit( void * data ); - int IODTGetLoaderInfo( char *key, void **infoAddr, int *infosize ); - void IODTFreeLoaderInfo( char *key, void *infoAddr, int infoSize ); +int IODTGetLoaderInfo( const char *key, void **infoAddr, int *infosize ); +void IODTFreeLoaderInfo( const char *key, void *infoAddr, int infoSize ); +int IODTGetDefault(const char *key, void *infoAddr, unsigned int infoSize ); 
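+/* Usage sketch for the new IODTGetDefault() (illustrative only; the
+ * property name below is hypothetical). Per the implementation added
+ * further down in this file, it returns 0 on success and -1 when the
+ * /defaults entry or key is missing, or when the property would not
+ * fit in the caller's buffer:
+ *
+ *     uint32_t value = 0;
+ *     if (0 == IODTGetDefault("some-default", &value, sizeof(value)))
+ *         ;   // value now holds the device-tree default
+ */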
+ } #include <IOKit/assert.h> @@ -209,26 +211,6 @@ IODeviceTreeAlloc( void * dtTop ) if( !intMap && child->getProperty( gIODTInterruptParentKey)) intMap = true; -#if __ppc__ - OSObject * obj; - - // Look for a "driver,AAPL,MacOSX,PowerPC" property. - if( (obj = child->getProperty( "driver,AAPL,MacOSX,PowerPC"))) { - gIOCatalogue->addExtensionsFromArchive((OSData *)obj); - child->removeProperty( "driver,AAPL,MacOSX,PowerPC"); - } - - // some gross pruning - child->removeProperty( "lanLib,AAPL,MacOS,PowerPC"); - - if( (obj = child->getProperty( "driver,AAPL,MacOS,PowerPC"))) { - - if( (0 == (prop = (OSData *)child->getProperty( gIODTTypeKey ))) - || (strncmp("display", (char *)prop->getBytesNoCopy(), sizeof("display"))) ) { - child->removeProperty( "driver,AAPL,MacOS,PowerPC"); - } - } -#endif /* __ppc__ */ } regIter->release(); } @@ -265,7 +247,7 @@ IODeviceTreeAlloc( void * dtTop ) return( parent); } -int IODTGetLoaderInfo( char *key, void **infoAddr, int *infoSize ) +int IODTGetLoaderInfo( const char *key, void **infoAddr, int *infoSize ) { IORegistryEntry *chosen; OSData *propObj; @@ -290,7 +272,7 @@ int IODTGetLoaderInfo( char *key, void **infoAddr, int *infoSize ) return 0; } -void IODTFreeLoaderInfo( char *key, void *infoAddr, int infoSize ) +void IODTFreeLoaderInfo( const char *key, void *infoAddr, int infoSize ) { vm_offset_t range[2]; IORegistryEntry *chosen; @@ -307,6 +289,26 @@ void IODTFreeLoaderInfo( char *key, void *infoAddr, int infoSize ) } } +int IODTGetDefault(const char *key, void *infoAddr, unsigned int infoSize ) +{ + IORegistryEntry *defaults; + OSData *defaultObj; + unsigned int defaultSize; + + defaults = IORegistryEntry::fromPath( "/defaults", gIODTPlane ); + if ( defaults == 0 ) return -1; + + defaultObj = OSDynamicCast( OSData, defaults->getProperty(key) ); + if ( defaultObj == 0 ) return -1; + + defaultSize = defaultObj->getLength(); + if ( defaultSize > infoSize) return -1; + + memcpy( infoAddr, defaultObj->getBytesNoCopy(), defaultSize ); + + return 0; +} + static void FreePhysicalMemory( vm_offset_t * range ) { vm_offset_t virt; diff --git a/iokit/Kernel/IOEventSource.cpp b/iokit/Kernel/IOEventSource.cpp index a20232d91..95046dacd 100644 --- a/iokit/Kernel/IOEventSource.cpp +++ b/iokit/Kernel/IOEventSource.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2000, 2009 Apple Inc. All rights reserved. 
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
@@ -40,6 +40,7 @@ HISTORY
 #define super OSObject
 
 OSDefineMetaClassAndAbstractStructors(IOEventSource, OSObject)
+
 OSMetaClassDefineReservedUnused(IOEventSource, 0);
 OSMetaClassDefineReservedUnused(IOEventSource, 1);
 OSMetaClassDefineReservedUnused(IOEventSource, 2);
@@ -49,17 +50,88 @@ OSMetaClassDefineReservedUnused(IOEventSource, 5);
 OSMetaClassDefineReservedUnused(IOEventSource, 6);
 OSMetaClassDefineReservedUnused(IOEventSource, 7);
 
+bool IOEventSource::checkForWork() { return false; }
+
 /* inline function implementations */
-void IOEventSource::signalWorkAvailable() { workLoop->signalWorkAvailable(); }
-void IOEventSource::openGate() { workLoop->openGate(); }
-void IOEventSource::closeGate() { workLoop->closeGate(); }
-bool IOEventSource::tryCloseGate() { return workLoop->tryCloseGate(); }
+
+#if IOKITSTATS
+
+#define IOStatisticsRegisterCounter() \
+do { \
+	reserved->counter = IOStatistics::registerEventSource(inOwner); \
+} while (0)
+
+#define IOStatisticsUnregisterCounter() \
+do { \
+	if (reserved) \
+		IOStatistics::unregisterEventSource(reserved->counter); \
+} while (0)
+
+#define IOStatisticsOpenGate() \
+do { \
+	IOStatistics::countOpenGate(reserved->counter); \
+} while (0)
+
+#define IOStatisticsCloseGate() \
+do { \
+	IOStatistics::countCloseGate(reserved->counter); \
+} while (0)
+
+#else
+
+#define IOStatisticsRegisterCounter()
+#define IOStatisticsUnregisterCounter()
+#define IOStatisticsOpenGate()
+#define IOStatisticsCloseGate()
+
+#endif /* IOKITSTATS */
+
+void IOEventSource::signalWorkAvailable()
+{
+	workLoop->signalWorkAvailable();
+}
+
+void IOEventSource::openGate()
+{
+	IOStatisticsOpenGate();
+	workLoop->openGate();
+}
+
+void IOEventSource::closeGate()
+{
+	workLoop->closeGate();
+	IOStatisticsCloseGate();
+}
+
+bool IOEventSource::tryCloseGate()
+{
+	bool res;
+	if ((res = workLoop->tryCloseGate())) {
+		IOStatisticsCloseGate();
+	}
+	return res;
+}
+
 int IOEventSource::sleepGate(void *event, UInt32 type)
-	{ return workLoop->sleepGate(event, type); }
+{
+	int res;
+	IOStatisticsOpenGate();
+	res = workLoop->sleepGate(event, type);
+	IOStatisticsCloseGate();
+	return res;
+}
+
 int IOEventSource::sleepGate(void *event, AbsoluteTime deadline, UInt32 type)
-	{ return workLoop->sleepGate(event, deadline, type); }
-void IOEventSource::wakeupGate(void *event, bool oneThread)
-	{ workLoop->wakeupGate(event, oneThread); }
+{
+	int res;
+	IOStatisticsOpenGate();
+	res = workLoop->sleepGate(event, deadline, type);
+	IOStatisticsCloseGate();
+	return res;
+}
+
+void IOEventSource::wakeupGate(void *event, bool oneThread) { workLoop->wakeupGate(event, oneThread); }
+
 bool IOEventSource::init(OSObject *inOwner,
                          Action inAction)
@@ -75,9 +147,28 @@ bool IOEventSource::init(OSObject *inOwner,
     (void) setAction(inAction);
     enabled = true;
 
+    if(!reserved) {
+	reserved = IONew(ExpansionData, 1);
+	if (!reserved) {
+	    return false;
+	}
+    }
+
+    IOStatisticsRegisterCounter();
+
     return true;
 }
 
+void IOEventSource::free( void )
+{
+    IOStatisticsUnregisterCounter();
+
+    if (reserved)
+	IODelete(reserved, ExpansionData, 1);
+
+    super::free();
+}
+
 IOEventSource::Action IOEventSource::getAction () const { return action; };
 
 void IOEventSource::setAction(Action inAction)
diff --git a/iokit/Kernel/IOFilterInterruptEventSource.cpp b/iokit/Kernel/IOFilterInterruptEventSource.cpp
index f4f73e2b4..944e84ced 100644
--- a/iokit/Kernel/IOFilterInterruptEventSource.cpp
+++ b/iokit/Kernel/IOFilterInterruptEventSource.cpp
@@ -1,5 +1,5 @@
 /*
- *
Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -32,6 +32,25 @@ #include <IOKit/IOTimeStamp.h> #include <IOKit/IOWorkLoop.h> +#if IOKITSTATS + +#define IOStatisticsInitializeCounter() \ +do { \ + IOStatistics::setCounterType(IOEventSource::reserved->counter, kIOStatisticsFilterInterruptEventSourceCounter); \ +} while (0) + +#define IOStatisticsInterrupt() \ +do { \ + IOStatistics::countInterrupt(IOEventSource::reserved->counter); \ +} while (0) + +#else + +#define IOStatisticsInitializeCounter() +#define IOStatisticsInterrupt() + +#endif /* IOKITSTATS */ + #define super IOInterruptEventSource OSDefineMetaClassAndStructors @@ -79,6 +98,9 @@ IOFilterInterruptEventSource::init(OSObject *inOwner, return false; filterAction = inFilterAction; + + IOStatisticsInitializeCounter(); + return true; } @@ -103,9 +125,10 @@ IOFilterInterruptEventSource *IOFilterInterruptEventSource void IOFilterInterruptEventSource::signalInterrupt() { bool trace = (gIOKitTrace & kIOTraceIntEventSource) ? true : false; - + + IOStatisticsInterrupt(); producerCount++; - + if (trace) IOTimeStampStartConstant(IODBG_INTES(IOINTES_SEMA), (uintptr_t) this, (uintptr_t) owner); @@ -129,20 +152,20 @@ IOFilterInterruptEventSource::getFilterAction() const void IOFilterInterruptEventSource::normalInterruptOccurred (void */*refcon*/, IOService */*prov*/, int /*source*/) { - bool filterRes; + bool filterRes; bool trace = (gIOKitTrace & kIOTraceIntEventSource) ? true : false; - + if (trace) IOTimeStampStartConstant(IODBG_INTES(IOINTES_FILTER), (uintptr_t) filterAction, (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop); - + // Call the filter. filterRes = (*filterAction)(owner, this); if (trace) IOTimeStampEndConstant(IODBG_INTES(IOINTES_FILTER), (uintptr_t) filterAction, (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop); - + if (filterRes) signalInterrupt(); } @@ -150,20 +173,20 @@ void IOFilterInterruptEventSource::normalInterruptOccurred void IOFilterInterruptEventSource::disableInterruptOccurred (void */*refcon*/, IOService *prov, int source) { - bool filterRes; + bool filterRes; bool trace = (gIOKitTrace & kIOTraceIntEventSource) ? true : false; - + if (trace) IOTimeStampStartConstant(IODBG_INTES(IOINTES_FILTER), (uintptr_t) filterAction, (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop); - + // Call the filter. filterRes = (*filterAction)(owner, this); if (trace) IOTimeStampEndConstant(IODBG_INTES(IOINTES_FILTER), (uintptr_t) filterAction, (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop); - + if (filterRes) { prov->disableInterrupt(source); /* disable the interrupt */ signalInterrupt(); diff --git a/iokit/Kernel/IOHibernateIO.cpp b/iokit/Kernel/IOHibernateIO.cpp index bc180fb5b..a4d7dbb4d 100644 --- a/iokit/Kernel/IOHibernateIO.cpp +++ b/iokit/Kernel/IOHibernateIO.cpp @@ -151,6 +151,7 @@ to restrict I/O ops. #include <IOKit/pwr_mgt/IOPowerConnection.h> #include "IOPMPowerStateQueue.h" #include <IOKit/IOBufferMemoryDescriptor.h> +#include <IOKit/AppleKeyStoreInterface.h> #include <crypto/aes.h> #include <sys/uio.h> @@ -158,13 +159,20 @@ to restrict I/O ops. #include <sys/stat.h> #include <sys/fcntl.h> // (FWRITE, ...) 
 #include <sys/sysctl.h>
+#include <sys/kdebug.h>
 
 #include <IOKit/IOHibernatePrivate.h>
 #include <IOKit/IOPolledInterface.h>
 #include <IOKit/IONVRAM.h>
 #include "IOHibernateInternal.h"
-#include "WKdm.h"
+#include <libkern/WKdm.h>
 #include "IOKitKernelInternal.h"
+#include <pexpert/device_tree.h>
+
+#include <machine/pal_routines.h>
+#include <machine/pal_hibernate.h>
+
+extern "C" addr64_t kvtophys(vm_offset_t va);
 
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
@@ -192,6 +200,8 @@
 static IOPolledFileIOVars                    gFileVars;
 static IOHibernateVars                       gIOHibernateVars;
 static struct kern_direct_file_io_ref_t *    gIOHibernateFileRef;
 static hibernate_cryptvars_t                 gIOHibernateCryptWakeContext;
+static hibernate_graphics_t                  _hibernateGraphics;
+static hibernate_graphics_t *                gIOHibernateGraphicsInfo = &_hibernateGraphics;
 
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
@@ -540,19 +550,59 @@ file_extent_callback(void * ref, uint64_t start, uint64_t length)
     ctx->size += length;
 }
 
+static IOService *
+IOCopyMediaForDev(dev_t device)
+{
+    OSDictionary * matching;
+    OSNumber *     num;
+    OSIterator *   iter;
+    IOService *    result = 0;
+
+    matching = IOService::serviceMatching("IOMedia");
+    if (!matching)
+        return (0);
+    do
+    {
+        num = OSNumber::withNumber(major(device), 32);
+        if (!num)
+            break;
+        matching->setObject(kIOBSDMajorKey, num);
+        num->release();
+        num = OSNumber::withNumber(minor(device), 32);
+        if (!num)
+            break;
+        matching->setObject(kIOBSDMinorKey, num);
+        num->release();
+        iter = IOService::getMatchingServices(matching);
+        if (iter)
+        {
+            result = (IOService *) iter->getNextObject();
+            if (result)
+                result->retain();
+            iter->release();
+        }
+    }
+    while (false);
+    matching->release();
+
+    return (result);
+}
+
 IOReturn
 IOPolledFileOpen( const char * filename, IOBufferMemoryDescriptor * ioBuffer,
 			IOPolledFileIOVars ** fileVars, OSData ** fileExtents,
-			OSData ** imagePath)
+			OSData ** imagePath, uint8_t * volumeCryptKey)
 {
     IOReturn             err = kIOReturnError;
     IOPolledFileIOVars * vars;
     _OpenFileContext     ctx;
     OSData *             extentsData;
     OSNumber *           num;
-    IORegistryEntry *    part = 0;
-    OSDictionary *       matching;
-    OSIterator *         iter;
+    IOService *          part = 0;
+    OSString *           keyUUID = 0;
+    OSString *           keyStoreUUID = 0;
+    dev_t                block_dev;
     dev_t                hibernate_image_dev;
     uint64_t             maxiobytes;
@@ -575,10 +625,13 @@ IOPolledFileOpen( const char * filename, IOBufferMemoryDescriptor * ioBuffer,
     ctx.size = 0;
     vars->fileRef = kern_open_file_for_direct_io(filename,
                                                  &file_extent_callback, &ctx,
+                                                 &block_dev,
                                                  &hibernate_image_dev,
                                                  &vars->block0,
                                                  &maxiobytes,
-                                                 &vars->solid_state);
+                                                 &vars->flags,
+                                                 0, (caddr_t) gIOHibernateCurrentHeader,
+                                                 sizeof(IOHibernateImageHeader));
     if (!vars->fileRef)
     {
	err = kIOReturnNoSpace;
	break;
     }
     gIOHibernateFileRef = vars->fileRef;
 
     if (kIOHibernateModeSSDInvert & gIOHibernateMode)
-        vars->solid_state = vars->solid_state ? false : true;
+        vars->flags ^= kIOHibernateOptionSSD;
 
     HIBLOG("Opened file %s, size %qd, partition base 0x%qx, maxio %qx ssd %d\n", filename, ctx.size,
-           vars->block0, maxiobytes, vars->solid_state);
+           vars->block0, maxiobytes, kIOHibernateOptionSSD & vars->flags);
 
     if (ctx.size < 1*1024*1024)		// check against image size estimate!
{ err = kIOReturnNoSpace; @@ -601,41 +654,52 @@ IOPolledFileOpen( const char * filename, IOBufferMemoryDescriptor * ioBuffer, vars->bufferSize = maxiobytes; vars->extentMap = (IOPolledFileExtent *) extentsData->getBytesNoCopy(); - - matching = IOService::serviceMatching("IOMedia"); - num = OSNumber::withNumber(major(hibernate_image_dev), 32); - matching->setObject(kIOBSDMajorKey, num); - num->release(); - num = OSNumber::withNumber(minor(hibernate_image_dev), 32); - matching->setObject(kIOBSDMinorKey, num); - num->release(); - iter = IOService::getMatchingServices(matching); - matching->release(); - if (iter) - { - part = (IORegistryEntry *) iter->getNextObject(); - part->retain(); - iter->release(); - } - if (!part) - break; - int minor, major; + part = IOCopyMediaForDev(block_dev); + if (!part) + break; + + err = part->callPlatformFunction(PLATFORM_FUNCTION_GET_MEDIA_ENCRYPTION_KEY_UUID, false, + (void *) &keyUUID, (void *) &keyStoreUUID, NULL, NULL); + if ((kIOReturnSuccess == err) && keyUUID && keyStoreUUID) + { +// IOLog("got volume key %s\n", keyStoreUUID->getCStringNoCopy()); + uuid_t volumeKeyUUID; + aks_volume_key_t vek; + static IOService * sKeyStore; + static const OSSymbol * sAKSGetKey; + + if (!sAKSGetKey) + sAKSGetKey = OSSymbol::withCStringNoCopy(AKS_PLATFORM_FUNCTION_GETKEY); + if (!sKeyStore) + sKeyStore = (IOService *) IORegistryEntry::fromPath(AKS_SERVICE_PATH, gIOServicePlane); + if (sKeyStore) + err = uuid_parse(keyStoreUUID->getCStringNoCopy(), volumeKeyUUID); + else + err = kIOReturnNoResources; + if (kIOReturnSuccess == err) + err = sKeyStore->callPlatformFunction(sAKSGetKey, true, volumeKeyUUID, &vek, NULL, NULL); + if (kIOReturnSuccess != err) + IOLog("volume key err 0x%x\n", err); + else + { + size_t bytes = (kIOHibernateAESKeySize / 8); + if (vek.key.keybytecount < bytes) + bytes = vek.key.keybytecount; + bcopy(&vek.key.keybytes[0], volumeCryptKey, bytes); + } + bzero(&vek, sizeof(vek)); + } + part->release(); + + part = IOCopyMediaForDev(hibernate_image_dev); + if (!part) + break; + IORegistryEntry * next; IORegistryEntry * child; OSData * data; - num = (OSNumber *) part->getProperty(kIOBSDMajorKey); - if (!num) - break; - major = num->unsigned32BitValue(); - num = (OSNumber *) part->getProperty(kIOBSDMinorKey); - if (!num) - break; - minor = num->unsigned32BitValue(); - - hibernate_image_dev = makedev(major, minor); - vars->pollers = OSArray::withCapacity(4); if (!vars->pollers) break; @@ -663,7 +727,7 @@ IOPolledFileOpen( const char * filename, IOBufferMemoryDescriptor * ioBuffer, && child->isParent(next, gIOServicePlane, true)); HIBLOG("hibernate image major %d, minor %d, blocksize %ld, pollers %d\n", - major, minor, (long)vars->blockSize, vars->pollers->getCount()); + major(hibernate_image_dev), minor(hibernate_image_dev), (long)vars->blockSize, vars->pollers->getCount()); if (vars->pollers->getCount() < kIOHibernateMinPollersNeeded) continue; @@ -682,18 +746,22 @@ IOPolledFileOpen( const char * filename, IOBufferMemoryDescriptor * ioBuffer, if ((extentsData->getLength() >= sizeof(IOPolledFileExtent))) { - char str2[24]; + char str2[24 + sizeof(uuid_string_t) + 2]; #if defined(__i386__) || defined(__x86_64__) if (!gIOCreateEFIDevicePathSymbol) gIOCreateEFIDevicePathSymbol = OSSymbol::withCString("CreateEFIDevicePath"); - snprintf(str2, sizeof(str2), "%qx", vars->extentMap[0].start); + if (keyUUID) + snprintf(str2, sizeof(str2), "%qx:%s", + vars->extentMap[0].start, keyUUID->getCStringNoCopy()); + else + snprintf(str2, sizeof(str2), "%qx", 
vars->extentMap[0].start); err = IOService::getPlatform()->callPlatformFunction( gIOCreateEFIDevicePathSymbol, false, - (void *) part, (void *) str2, (void *) true, - (void *) &data); + (void *) part, (void *) str2, + (void *) (uintptr_t) true, (void *) &data); #else char str1[256]; int len = sizeof(str1); @@ -724,7 +792,7 @@ IOPolledFileOpen( const char * filename, IOBufferMemoryDescriptor * ioBuffer, HIBLOG("error 0x%x opening hibernation file\n", err); if (vars->fileRef) { - kern_close_file_for_direct_io(vars->fileRef); + kern_close_file_for_direct_io(vars->fileRef, 0, 0, 0); gIOHibernateFileRef = vars->fileRef = NULL; } } @@ -827,6 +895,8 @@ IOPolledFileWrite(IOPolledFileIOVars * vars, && (vars->position > vars->encryptStart) && ((vars->position - length) < vars->encryptEnd)) { + AbsoluteTime startTime, endTime; + uint32_t encryptLen, encryptStart; encryptLen = vars->position - vars->encryptStart; if (encryptLen > length) @@ -835,12 +905,20 @@ IOPolledFileWrite(IOPolledFileIOVars * vars, if (vars->position > vars->encryptEnd) encryptLen -= (vars->position - vars->encryptEnd); + clock_get_uptime(&startTime); + // encrypt the buffer aes_encrypt_cbc(vars->buffer + vars->bufferHalf + encryptStart, &cryptvars->aes_iv[0], encryptLen / AES_BLOCK_SIZE, vars->buffer + vars->bufferHalf + encryptStart, &cryptvars->ctx.encrypt); + + clock_get_uptime(&endTime); + ADD_ABSOLUTETIME(&vars->cryptTime, &endTime); + SUB_ABSOLUTETIME(&vars->cryptTime, &startTime); + vars->cryptBytes += encryptLen; + // save initial vector for following encrypts bcopy(vars->buffer + vars->bufferHalf + encryptStart + encryptLen - AES_BLOCK_SIZE, &cryptvars->aes_iv[0], @@ -916,7 +994,7 @@ IOPolledFileRead(IOPolledFileIOVars * vars, vars->bufferOffset += copy; // vars->position += copy; - if (vars->bufferOffset == vars->bufferLimit) + if ((vars->bufferOffset == vars->bufferLimit) && (vars->position < vars->readEnd)) { if (vars->io) { @@ -929,9 +1007,9 @@ IOPolledFileRead(IOPolledFileIOVars * vars, if (vars->position & (vars->blockSize - 1)) HIBLOG("misaligned file pos %qx\n", vars->position); - vars->position += vars->lastRead; + vars->position += vars->lastRead; vars->extentRemaining -= vars->lastRead; - vars->bufferLimit = vars->lastRead; + vars->bufferLimit = vars->lastRead; if (!vars->extentRemaining) { @@ -953,14 +1031,18 @@ if (vars->position & (vars->blockSize - 1)) HIBLOG("misaligned file pos %qx\n", length = vars->extentRemaining; else length = vars->bufferSize; - vars->lastRead = length; + if ((length + vars->position) > vars->readEnd) + length = vars->readEnd - vars->position; + vars->lastRead = length; + if (length) + { //if (length != vars->bufferSize) HIBLOG("short read of %qx ends@ %qx\n", length, offset + length); - - err = IOHibernatePollerIO(vars, kIOPolledRead, vars->bufferHalf, offset, length); - if (kIOReturnSuccess != err) - break; - vars->io = true; + err = IOHibernatePollerIO(vars, kIOPolledRead, vars->bufferHalf, offset, length); + if (kIOReturnSuccess != err) + break; + vars->io = true; + } vars->bufferHalf = vars->bufferHalf ? 
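
The cryptTime/cryptBytes counters introduced above bracket each aes_encrypt_cbc call with clock_get_uptime() and accumulate the deltas, so the image write can report encryption throughput when it finishes. A userspace model of that accounting, with std::chrono standing in for AbsoluteTime arithmetic and the encrypt call stubbed out:

#include <chrono>
#include <cstdint>
#include <cstdio>

int main()
{
    using clock = std::chrono::steady_clock;
    std::chrono::nanoseconds cryptTime{0};
    uint64_t cryptBytes = 0;

    for (int pass = 0; pass < 4; pass++)
    {
        auto startTime = clock::now();
        // ... aes_encrypt_cbc() would run here ...
        auto endTime = clock::now();
        cryptTime += std::chrono::duration_cast<std::chrono::nanoseconds>(endTime - startTime);
        cryptBytes += 64 * 1024;            // bytes encrypted this pass
    }
    printf("crypt bytes: %llu time: %llu ms\n",
           (unsigned long long) cryptBytes,
           (unsigned long long) (cryptTime.count() / 1000000LL));
    return 0;
}
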
0 : vars->bufferSize; vars->bufferOffset = 0; @@ -969,16 +1051,26 @@ if (vars->position & (vars->blockSize - 1)) HIBLOG("misaligned file pos %qx\n", if (cryptvars) { uint8_t thisVector[AES_BLOCK_SIZE]; + AbsoluteTime startTime, endTime; + // save initial vector for following decrypts bcopy(&cryptvars->aes_iv[0], &thisVector[0], AES_BLOCK_SIZE); bcopy(vars->buffer + vars->bufferHalf + lastReadLength - AES_BLOCK_SIZE, &cryptvars->aes_iv[0], AES_BLOCK_SIZE); + // decrypt the buffer + clock_get_uptime(&startTime); + aes_decrypt_cbc(vars->buffer + vars->bufferHalf, &thisVector[0], lastReadLength / AES_BLOCK_SIZE, vars->buffer + vars->bufferHalf, &cryptvars->ctx.decrypt); + + clock_get_uptime(&endTime); + ADD_ABSOLUTETIME(&vars->cryptTime, &endTime); + SUB_ABSOLUTETIME(&vars->cryptTime, &startTime); + vars->cryptBytes += lastReadLength; } #endif /* CRYPTO */ } @@ -1013,10 +1105,12 @@ IOHibernateSystemSleep(void) if (IOService::getPMRootDomain()->getHibernateSettings( &gIOHibernateMode, &gIOHibernateFreeRatio, &gIOHibernateFreeTime)) + { if (kIOHibernateModeSleep & gIOHibernateMode) // default to discard clean for safe sleep gIOHibernateMode ^= (kIOHibernateModeDiscardCleanInactive | kIOHibernateModeDiscardCleanActive); + } if ((obj = IOService::getPMRootDomain()->copyProperty(kIOHibernateFileKey))) { @@ -1039,40 +1133,48 @@ IOHibernateSystemSleep(void) vars->ioBuffer = IOBufferMemoryDescriptor::withOptions(kIODirectionOutIn, 2 * kDefaultIOSize, page_size); - if (!vars->srcBuffer || !vars->ioBuffer) + vars->handoffBuffer = IOBufferMemoryDescriptor::withOptions(kIODirectionOutIn, + ptoa_64(gIOHibernateHandoffPageCount), page_size); + + if (!vars->srcBuffer || !vars->ioBuffer || !vars->handoffBuffer) { err = kIOReturnNoMemory; break; } + // open & invalidate the image file + gIOHibernateCurrentHeader->signature = kIOHibernateHeaderInvalidSignature; err = IOPolledFileOpen(gIOHibernateFilename, vars->ioBuffer, - &vars->fileVars, &vars->fileExtents, &data); + &vars->fileVars, &vars->fileExtents, &data, + &vars->volumeCryptKey[0]); if (KERN_SUCCESS != err) { HIBLOG("IOPolledFileOpen(%x)\n", err); break; } - if (vars->fileVars->fileRef) - { - // invalidate the image file - gIOHibernateCurrentHeader->signature = kIOHibernateHeaderInvalidSignature; - int err = kern_write_file(vars->fileVars->fileRef, 0, - (caddr_t) gIOHibernateCurrentHeader, sizeof(IOHibernateImageHeader)); - if (KERN_SUCCESS != err) - HIBLOG("kern_write_file(%d)\n", err); - } bzero(gIOHibernateCurrentHeader, sizeof(IOHibernateImageHeader)); gIOHibernateCurrentHeader->debugFlags = gIOHibernateDebugFlags; - - dsSSD = (vars->fileVars->solid_state + dsSSD = ((0 != (kIOHibernateOptionSSD & vars->fileVars->flags)) && (kOSBooleanTrue == IOService::getPMRootDomain()->getProperty(kIOPMDeepSleepEnabledKey))); - if (dsSSD) { gIOHibernateCurrentHeader->options |= kIOHibernateOptionSSD | kIOHibernateOptionColor; + +#if defined(__i386__) || defined(__x86_64__) + if (!uuid_is_null(vars->volumeCryptKey) && + (kOSBooleanTrue != IOService::getPMRootDomain()->getProperty(kIOPMDestroyFVKeyOnStandbyKey))) + { + uintptr_t smcVars[2]; + smcVars[0] = sizeof(vars->volumeCryptKey); + smcVars[1] = (uintptr_t)(void *) &vars->volumeCryptKey[0]; + + IOService::getPMRootDomain()->setProperty(kIOHibernateSMCVariablesKey, smcVars, sizeof(smcVars)); + bzero(smcVars, sizeof(smcVars)); + } +#endif } else { @@ -1087,7 +1189,7 @@ IOHibernateSystemSleep(void) err = hibernate_setup(gIOHibernateCurrentHeader, gIOHibernateFreeRatio, gIOHibernateFreeTime, dsSSD, - 
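
In the CBC read path above, the initialization vector for the next buffer is the last ciphertext block of the current one, so it must be captured before the in-place decrypt overwrites the ciphertext. A short sketch of that ordering, assuming a 16-byte AES block and a stubbed decrypt:

#include <cstdint>
#include <cstring>

enum { kAESBlockSize = 16 };

static void decrypt_chunk_cbc(uint8_t *buf, size_t len, uint8_t iv[kAESBlockSize])
{
    uint8_t thisVector[kAESBlockSize];
    memcpy(thisVector, iv, kAESBlockSize);                // IV for this chunk
    memcpy(iv, buf + len - kAESBlockSize, kAESBlockSize); // IV for the next chunk
    // ... aes_decrypt_cbc(buf, thisVector, len / kAESBlockSize, buf, &ctx) ...
    (void) thisVector;                      // unused only while the decrypt is stubbed
}

int main()
{
    uint8_t iv[kAESBlockSize] = {};
    uint8_t buf[2 * kAESBlockSize] = { 1, 2, 3 };
    decrypt_chunk_cbc(buf, sizeof(buf), iv);
    return iv[0];                           // iv now holds the last ciphertext block
}
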
&vars->page_list, &vars->page_list_wired, &encryptedswap); + &vars->page_list, &vars->page_list_wired, &vars->page_list_pal, &encryptedswap); clock_get_uptime(&endTime); SUB_ABSOLUTETIME(&endTime, &startTime); absolutetime_to_nanoseconds(endTime, &nsec); @@ -1096,7 +1198,7 @@ IOHibernateSystemSleep(void) if (KERN_SUCCESS != err) break; - if (encryptedswap) + if (encryptedswap || !uuid_is_null(vars->volumeCryptKey)) gIOHibernateMode ^= kIOHibernateModeEncrypt; if (kIOHibernateOptionProgress & gIOHibernateCurrentHeader->options) @@ -1137,45 +1239,6 @@ IOHibernateSystemSleep(void) } data->release(); -#if defined(__ppc__) - size_t len; - char valueString[16]; - - vars->saveBootDevice = gIOOptionsEntry->copyProperty(kIOSelectedBootDeviceKey); - if (gIOChosenEntry) - { - OSData * bootDevice = OSDynamicCast(OSData, gIOChosenEntry->getProperty(kIOBootPathKey)); - if (bootDevice) - { - sym = OSSymbol::withCStringNoCopy(kIOSelectedBootDeviceKey); - OSString * str2 = OSString::withCStringNoCopy((const char *) bootDevice->getBytesNoCopy()); - if (sym && str2) - gIOOptionsEntry->setProperty(sym, str2); - if (sym) - sym->release(); - if (str2) - str2->release(); - } - data = OSDynamicCast(OSData, gIOChosenEntry->getProperty(kIOHibernateMemorySignatureKey)); - if (data) - { - vars->haveFastBoot = true; - - len = snprintf(valueString, sizeof(valueString), "0x%lx", *((UInt32 *)data->getBytesNoCopy())); - data = OSData::withBytes(valueString, len + 1); - sym = OSSymbol::withCStringNoCopy(kIOHibernateMemorySignatureEnvKey); - if (sym && data) - gIOOptionsEntry->setProperty(sym, data); - if (sym) - sym->release(); - if (data) - data->release(); - } - data = OSDynamicCast(OSData, gIOChosenEntry->getProperty(kIOHibernateMachineSignatureKey)); - if (data) - gIOHibernateCurrentHeader->machineSignature = *((UInt32 *)data->getBytesNoCopy()); - } -#endif /* __ppc__ */ #if defined(__i386__) || defined(__x86_64__) struct AppleRTCHibernateVars { @@ -1529,6 +1592,38 @@ IOHibernateSystemHasSlept(void) /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ +static DeviceTreeNode * +MergeDeviceTree(DeviceTreeNode * entry, IORegistryEntry * regEntry) +{ + DeviceTreeNodeProperty * prop; + DeviceTreeNode * child; + IORegistryEntry * childRegEntry; + const char * nameProp; + unsigned int propLen, idx; + + prop = (DeviceTreeNodeProperty *) (entry + 1); + for (idx = 0; idx < entry->nProperties; idx++) + { + if (regEntry && (0 != strcmp("name", prop->name))) + { + regEntry->setProperty((const char *) prop->name, (void *) (prop + 1), prop->length); +// HIBPRINT("%s: %s, %d\n", regEntry->getName(), prop->name, prop->length); + } + prop = (DeviceTreeNodeProperty *) (((uintptr_t)(prop + 1)) + ((prop->length + 3) & ~3)); + } + + child = (DeviceTreeNode *) prop; + for (idx = 0; idx < entry->nChildren; idx++) + { + if (kSuccess != DTGetProperty(child, "name", (void **) &nameProp, &propLen)) + panic("no name"); + childRegEntry = regEntry ? 
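
MergeDeviceTree() above walks the booter's flattened device tree: every property is a fixed header followed by its payload rounded up to a 4-byte boundary, and child nodes begin where the last property ends. A sketch of the property-advance arithmetic, with a simplified stand-in for pexpert's DeviceTreeNodeProperty:

#include <cstdint>

struct DTProp
{
    char     name[32];                      // property name (kPropNameLength in pexpert)
    uint32_t length;                        // payload length; payload bytes follow
};

static const DTProp *next_property(const DTProp *prop)
{
    const uint8_t *payload = (const uint8_t *) (prop + 1);
    // advance past the payload, rounded up to the next 4-byte boundary
    return (const DTProp *) (payload + ((prop->length + 3) & ~3u));
}

int main()
{
    alignas(4) uint8_t blob[2 * sizeof(DTProp) + 8] = {};
    DTProp *first = (DTProp *) blob;
    first->length = 5;                      // pads to 8 bytes
    return next_property(first) == (DTProp *) (blob + sizeof(DTProp) + 8) ? 0 : 1;
}
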
regEntry->childFromPath(nameProp, gIODTPlane) : NULL; +// HIBPRINT("%s == %p\n", nameProp, childRegEntry); + child = MergeDeviceTree(child, childRegEntry); + } + return (child); +} + IOReturn IOHibernateSystemWake(void) { @@ -1582,64 +1677,9 @@ IOHibernateSystemWake(void) // invalidate nvram properties - (gIOOptionsEntry != 0) => nvram was touched -#ifdef __ppc__ - OSData * data = OSData::withCapacity(4); - if (gIOOptionsEntry && data) - { - const OSSymbol * sym = OSSymbol::withCStringNoCopy(kIOHibernateBootImageKey); - if (sym) - { - gIOOptionsEntry->setProperty(sym, data); - sym->release(); - } - sym = OSSymbol::withCStringNoCopy(kIOSelectedBootDeviceKey); - if (sym) - { - if (vars->saveBootDevice) - { - gIOOptionsEntry->setProperty(sym, vars->saveBootDevice); - vars->saveBootDevice->release(); - } - sym->release(); - } - sym = OSSymbol::withCStringNoCopy(kIOHibernateBootImageKeyKey); - if (sym) - { - gIOOptionsEntry->setProperty(sym, data); - sym->release(); - } - sym = OSSymbol::withCStringNoCopy(kIOHibernateMemorySignatureEnvKey); - if (sym) - { - gIOOptionsEntry->removeProperty(sym); - sym->release(); - } - } - if (data) - data->release(); - - if (gIOOptionsEntry) - { - if (!vars->haveFastBoot) - { - // reset boot audio volume - IODTPlatformExpert * platform = OSDynamicCast(IODTPlatformExpert, IOService::getPlatform()); - if (platform) - platform->writeXPRAM(kXPRamAudioVolume, - &vars->saveBootAudioVolume, sizeof(vars->saveBootAudioVolume)); - } - - // sync now to hardware if the booter has not - if (kIOHibernateStateInactive == gIOHibernateState) - gIOOptionsEntry->sync(); - else - // just sync the variables in case a later panic syncs nvram (it won't sync variables) - gIOOptionsEntry->syncOFVariables(); - } -#endif - #if defined(__i386__) || defined(__x86_64__) IOService::getPMRootDomain()->removeProperty(gIOHibernateRTCVariablesKey); + IOService::getPMRootDomain()->removeProperty(kIOHibernateSMCVariablesKey); /* * Hibernate variable is written to NVRAM on platforms in which RtcRam @@ -1672,6 +1712,47 @@ IOHibernateSystemWake(void) vars->srcBuffer->release(); if (vars->ioBuffer) vars->ioBuffer->release(); + bzero(&gIOHibernateHandoffPages[0], gIOHibernateHandoffPageCount * sizeof(gIOHibernateHandoffPages[0])); + if (vars->handoffBuffer) + { + IOHibernateHandoff * handoff; + bool done = false; + for (handoff = (IOHibernateHandoff *) vars->handoffBuffer->getBytesNoCopy(); + !done; + handoff = (IOHibernateHandoff *) &handoff->data[handoff->bytecount]) + { +// HIBPRINT("handoff %p, %x, %x\n", handoff, handoff->type, handoff->bytecount); + uint8_t * data = &handoff->data[0]; + switch (handoff->type) + { + case kIOHibernateHandoffTypeEnd: + done = true; + break; + + case kIOHibernateHandoffTypeDeviceTree: + MergeDeviceTree((DeviceTreeNode *) data, IOService::getServiceRoot()); + break; + + case kIOHibernateHandoffTypeKeyStore: +#if defined(__i386__) || defined(__x86_64__) + { + IOBufferMemoryDescriptor * + md = IOBufferMemoryDescriptor::withBytes(data, handoff->bytecount, kIODirectionOutIn); + if (md) + { + IOSetKeyStoreData(md); + } + } +#endif + break; + + default: + done = (kIOHibernateHandoffType != (handoff->type & 0xFFFF0000)); + break; + } + } + vars->handoffBuffer->release(); + } if (vars->fileExtents) vars->fileExtents->release(); @@ -1687,14 +1768,11 @@ IOHibernateSystemPostWake(void) { if (gIOHibernateFileRef) { - // invalidate the image file + // invalidate & close the image file gIOHibernateCurrentHeader->signature = kIOHibernateHeaderInvalidSignature; - int err = 
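
The wake-side loop above treats the handoff buffer as type/length/value records laid end to end: kIOHibernateHandoffTypeEnd terminates the walk, and an unrecognized type only keeps the walk going if its top 16 bits carry the handoff tag. A standalone model of that scan; the tag constant here is illustrative, not the kernel's value:

#include <cstdint>
#include <cstdio>

struct Handoff { uint32_t type; uint32_t bytecount; /* data follows */ };

enum : uint32_t {
    kHandoffTag     = 0x68660000,           // stand-in for kIOHibernateHandoffType
    kHandoffTypeEnd = kHandoffTag | 0,
};

static const uint8_t *payload(const Handoff *h) { return (const uint8_t *) (h + 1); }

static void walk(const uint8_t *buf)
{
    const Handoff *h = (const Handoff *) buf;
    for (bool done = false; !done;
         h = (const Handoff *) (payload(h) + h->bytecount))
    {
        if (h->type == kHandoffTypeEnd)
            done = true;
        else
        {
            printf("record type 0x%x, %u bytes\n", h->type, h->bytecount);
            // stop on records that don't carry the handoff tag
            done = (kHandoffTag != (h->type & 0xFFFF0000));
        }
    }
}

int main()
{
    Handoff records[2] = { { kHandoffTag | 1, 0 }, { kHandoffTypeEnd, 0 } };
    walk((const uint8_t *) records);        // prints the first record, stops at End
    return 0;
}
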
kern_write_file(gIOHibernateFileRef, 0, - (caddr_t) gIOHibernateCurrentHeader, sizeof(IOHibernateImageHeader)); - if (KERN_SUCCESS != err) - HIBLOG("kern_write_file(%d)\n", err); - - kern_close_file_for_direct_io(gIOHibernateFileRef); + kern_close_file_for_direct_io(gIOHibernateFileRef, + 0, (caddr_t) gIOHibernateCurrentHeader, + sizeof(IOHibernateImageHeader)); gIOHibernateFileRef = 0; } return (kIOReturnSuccess); @@ -1703,13 +1781,13 @@ IOHibernateSystemPostWake(void) /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ SYSCTL_STRING(_kern, OID_AUTO, hibernatefile, - CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN, + CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, gIOHibernateFilename, sizeof(gIOHibernateFilename), ""); SYSCTL_STRING(_kern, OID_AUTO, bootsignature, - CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN, + CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, gIOHibernateBootSignature, sizeof(gIOHibernateBootSignature), ""); SYSCTL_UINT(_kern, OID_AUTO, hibernatemode, - CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN, + CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, &gIOHibernateMode, 0, ""); void @@ -1738,10 +1816,6 @@ IOHibernateSystemInit(IOPMrootDomain * rootDomain) static void hibernate_setup_for_wake(void) { -#if __ppc__ - // go slow (state needed for wake) - ml_set_processor_speed(1); -#endif } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -1762,6 +1836,34 @@ uint32_t wired_pages_encrypted = 0; uint32_t dirty_pages_encrypted = 0; uint32_t wired_pages_clear = 0; +static void +hibernate_pal_callback(void *vars_arg, vm_offset_t addr) +{ + IOHibernateVars *vars = (IOHibernateVars *)vars_arg; + /* Make sure it's not in either of the save lists */ + hibernate_set_page_state(vars->page_list, vars->page_list_wired, atop_64(addr), 1, kIOHibernatePageStateFree); + + /* Set it in the bitmap of pages owned by the PAL */ + hibernate_page_bitset(vars->page_list_pal, TRUE, atop_64(addr)); +} + +static struct hibernate_cryptvars_t *local_cryptvars; + +extern "C" int +hibernate_pal_write(void *buffer, size_t size) +{ + IOHibernateVars * vars = &gIOHibernateVars; + + IOReturn err = IOPolledFileWrite(vars->fileVars, (const uint8_t *)buffer, size, local_cryptvars); + if (kIOReturnSuccess != err) { + kprintf("epic hibernate fail! %d\n", err); + return err; + } + + return 0; +} + + extern "C" uint32_t hibernate_write_image(void) { @@ -1786,9 +1888,11 @@ hibernate_write_image(void) uint32_t tag; uint32_t pageType; uint32_t pageAndCount[2]; + addr64_t phys64; + IOByteCount segLen; AbsoluteTime startTime, endTime; - AbsoluteTime allTime, compTime, decoTime; + AbsoluteTime allTime, compTime; uint64_t compBytes; uint64_t nsec; uint32_t lastProgressStamp = 0; @@ -1809,9 +1913,12 @@ hibernate_write_image(void) kdebug_enable = save_kdebug_enable; KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 1) | DBG_FUNC_START, 0, 0, 0, 0, 0); + IOService::getPMRootDomain()->tracePoint(kIOPMTracePointHibernate); restore1Sum = sum1 = sum2 = 0; + hibernate_pal_prepare(); + #if CRYPTO // encryption data. "iv" is the "initial vector". 
if (kIOHibernateModeEncrypt & gIOHibernateMode) @@ -1831,6 +1938,9 @@ hibernate_write_image(void) cryptvars = &_cryptvars; bzero(cryptvars, sizeof(hibernate_cryptvars_t)); + for (pageCount = 0; pageCount < sizeof(vars->wiredCryptKey); pageCount++) + vars->wiredCryptKey[pageCount] ^= vars->volumeCryptKey[pageCount]; + bzero(&vars->volumeCryptKey[0], sizeof(vars->volumeCryptKey)); aes_encrypt_key(vars->wiredCryptKey, kIOHibernateAESKeySize, &cryptvars->ctx.encrypt); @@ -1838,7 +1948,8 @@ hibernate_write_image(void) bcopy(&first_iv[0], &cryptvars->aes_iv[0], AES_BLOCK_SIZE); bzero(&vars->wiredCryptKey[0], sizeof(vars->wiredCryptKey)); bzero(&vars->cryptKey[0], sizeof(vars->cryptKey)); - bzero(gIOHibernateCryptWakeVars, sizeof(hibernate_cryptwakevars_t)); + + local_cryptvars = cryptvars; } #endif /* CRYPTO */ @@ -1846,6 +1957,7 @@ hibernate_write_image(void) hibernate_page_list_setall(vars->page_list, vars->page_list_wired, + vars->page_list_pal, &pageCount); HIBLOG("hibernate_page_list_setall found pageCount %d\n", pageCount); @@ -1863,9 +1975,8 @@ hibernate_write_image(void) #endif needEncrypt = (0 != (kIOHibernateModeEncrypt & gIOHibernateMode)); - AbsoluteTime_to_scalar(&compTime) = 0; - AbsoluteTime_to_scalar(&decoTime) = 0; + compBytes = 0; clock_get_uptime(&allTime); IOService::getPMRootDomain()->pmStatsRecordEvent( @@ -1901,18 +2012,26 @@ hibernate_write_image(void) uintptr_t hibernateBase; uintptr_t hibernateEnd; -#if defined(__i386__) || defined(__x86_64__) - hibernateBase = sectINITPTB; -#else - hibernateBase = sectHIBB; -#endif + hibernateBase = HIB_BASE; /* Defined in PAL headers */ hibernateEnd = (sectHIBB + sectSizeHIB); + // copy out restore1 code - - page = atop_32(hibernateBase); - count = atop_32(round_page(hibernateEnd)) - page; - header->restore1CodePage = page; + + for (count = 0; + (phys64 = vars->handoffBuffer->getPhysicalSegment(count, &segLen, kIOMemoryMapperNone)); + count += segLen) + { + for (pagesDone = 0; pagesDone < atop_32(segLen); pagesDone++) + { + gIOHibernateHandoffPages[atop_32(count) + pagesDone] = atop_64(phys64) + pagesDone; + } + } + + page = atop_32(kvtophys(hibernateBase)); + count = atop_32(round_page(hibernateEnd) - hibernateBase); + header->restore1CodePhysPage = page; + header->restore1CodeVirt = hibernateBase; header->restore1PageCount = count; header->restore1CodeOffset = ((uintptr_t) &hibernate_machine_entrypoint) - hibernateBase; header->restore1StackOffset = ((uintptr_t) &gIOHibernateRestoreStackEnd[0]) - 64 - hibernateBase; @@ -1922,7 +2041,7 @@ hibernate_write_image(void) for (page = 0; page < count; page++) { if ((src < &gIOHibernateRestoreStack[0]) || (src >= &gIOHibernateRestoreStackEnd[0])) - restore1Sum += hibernate_sum_page(src, header->restore1CodePage + page); + restore1Sum += hibernate_sum_page(src, header->restore1CodeVirt + page); else restore1Sum += 0x00000000; src += page_size; @@ -1956,9 +2075,6 @@ hibernate_write_image(void) // write the preview buffer - addr64_t phys64; - IOByteCount segLen; - if (vars->previewBuffer) { ppnum = 0; @@ -2031,8 +2147,9 @@ hibernate_write_image(void) hibernate_page_list_set_volatile(vars->page_list, vars->page_list_wired, &pageCount); - page = atop_32(hibernateBase); - count = atop_32(round_page(hibernateEnd)) - page; + + page = atop_32(KERNEL_IMAGE_TO_PHYS(hibernateBase)); + count = atop_32(round_page(KERNEL_IMAGE_TO_PHYS(hibernateEnd))) - page; hibernate_set_page_state(vars->page_list, vars->page_list_wired, page, count, kIOHibernatePageStateFree); @@ -2048,10 +2165,22 @@ 
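
The loop above folds the FileVault volume key into the wired-page image key by XOR, so the on-disk image cannot be decrypted from the wired key alone, and both buffers are zeroed as soon as the AES context is set up. The mixing step in isolation:

#include <cstddef>
#include <cstdint>
#include <cstdio>

static void mix_keys(uint8_t *wiredKey, const uint8_t *volumeKey, size_t len)
{
    for (size_t i = 0; i < len; i++)
        wiredKey[i] ^= volumeKey[i];        // combined = wired ^ volume
}

int main()
{
    uint8_t wired[4]  = { 0x01, 0x02, 0x03, 0x04 };
    uint8_t volume[4] = { 0xa0, 0xb0, 0xc0, 0xd0 };
    mix_keys(wired, volume, sizeof(wired));
    printf("%02x%02x%02x%02x\n", wired[0], wired[1], wired[2], wired[3]); // a1b2c3d4
    return 0;
}
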
hibernate_write_image(void) pageCount -= atop_32(segLen); } + for (count = 0; + (phys64 = vars->handoffBuffer->getPhysicalSegment(count, &segLen, kIOMemoryMapperNone)); + count += segLen) + { + hibernate_set_page_state(vars->page_list, vars->page_list_wired, + atop_64(phys64), atop_32(segLen), + kIOHibernatePageStateFree); + pageCount -= atop_32(segLen); + } + + (void)hibernate_pal_callback; + src = (uint8_t *) vars->srcBuffer->getBytesNoCopy(); - pagesDone = 0; - lastBlob = 0; + pagesDone = 0; + lastBlob = 0; HIBLOG("writing %d pages\n", pageCount); @@ -2254,9 +2383,9 @@ hibernate_write_image(void) else header->fileExtentMapSize = sizeof(header->fileExtentMap); bcopy(&fileExtents[0], &header->fileExtentMap[0], count); - - header->deviceBase = vars->fileVars->block0; + header->deviceBase = vars->fileVars->block0; + IOPolledFileSeek(vars->fileVars, 0); err = IOPolledFileWrite(vars->fileVars, (uint8_t *) header, sizeof(IOHibernateImageHeader), @@ -2283,12 +2412,16 @@ hibernate_write_image(void) nsec / 1000000ULL); absolutetime_to_nanoseconds(compTime, &nsec); - HIBLOG("comp time: %qd ms, ", - nsec / 1000000ULL); + HIBLOG("comp bytes: %qd time: %qd ms %qd Mb/s, ", + compBytes, + nsec / 1000000ULL, + nsec ? (((compBytes * 1000000000ULL) / 1024 / 1024) / nsec) : 0); - absolutetime_to_nanoseconds(decoTime, &nsec); - HIBLOG("deco time: %qd ms, ", - nsec / 1000000ULL); + absolutetime_to_nanoseconds(vars->fileVars->cryptTime, &nsec); + HIBLOG("crypt bytes: %qd time: %qd ms %qd Mb/s, ", + vars->fileVars->cryptBytes, + nsec / 1000000ULL, + nsec ? (((vars->fileVars->cryptBytes * 1000000000ULL) / 1024 / 1024) / nsec) : 0); HIBLOG("\nimage %qd, uncompressed %qd (%d), compressed %qd (%d%%), sum1 %x, sum2 %x\n", header->imageSize, @@ -2353,7 +2486,9 @@ hibernate_machine_init(void) uint32_t sum; uint32_t pagesDone; uint32_t pagesRead = 0; + AbsoluteTime startTime, compTime; AbsoluteTime allTime, endTime; + uint64_t compBytes; uint64_t nsec; uint32_t lastProgressStamp = 0; uint32_t progressStamp; @@ -2381,7 +2516,7 @@ hibernate_machine_init(void) HIBPRINT("diag %x %x %x %x\n", gIOHibernateCurrentHeader->diag[0], gIOHibernateCurrentHeader->diag[1], - gIOHibernateCurrentHeader->diag[2], gIOHibernateCurrentHeader->diag[3]); + gIOHibernateCurrentHeader->diag[2], gIOHibernateCurrentHeader->diag[3]); HIBPRINT("video %x %d %d %d status %x\n", gIOHibernateGraphicsInfo->physicalAddress, gIOHibernateGraphicsInfo->depth, @@ -2392,6 +2527,62 @@ hibernate_machine_init(void) boot_args *args = (boot_args *) PE_state.bootArgs; + cryptvars = (kIOHibernateModeEncrypt & gIOHibernateMode) ? 
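
The HIBLOG lines above compute throughput as (bytes * 10^9 / 2^20) / nsec with a guard for nsec == 0; since the divisor is 1024*1024, the figure is mebibytes per second despite the "Mb/s" label. The same arithmetic as a checked helper:

#include <cstdint>
#include <cstdio>

static uint64_t mb_per_sec(uint64_t bytes, uint64_t nsec)
{
    // bytes/nsec scaled to MiB/s; matches the expression in the HIBLOGs
    return nsec ? (((bytes * 1000000000ULL) / 1024 / 1024) / nsec) : 0;
}

int main()
{
    // 100 MiB written in one second
    printf("%llu MB/s\n", (unsigned long long) mb_per_sec(100ULL << 20, 1000000000ULL));
    return 0;
}
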
&gIOHibernateCryptWakeContext : 0; + + if (gIOHibernateCurrentHeader->handoffPageCount > gIOHibernateHandoffPageCount) + panic("handoff overflow"); + + IOHibernateHandoff * handoff; + bool done = false; + bool foundCryptData = false; + + for (handoff = (IOHibernateHandoff *) vars->handoffBuffer->getBytesNoCopy(); + !done; + handoff = (IOHibernateHandoff *) &handoff->data[handoff->bytecount]) + { +// HIBPRINT("handoff %p, %x, %x\n", handoff, handoff->type, handoff->bytecount); + uint8_t * data = &handoff->data[0]; + switch (handoff->type) + { + case kIOHibernateHandoffTypeEnd: + done = true; + break; + + case kIOHibernateHandoffTypeGraphicsInfo: + bcopy(data, gIOHibernateGraphicsInfo, sizeof(*gIOHibernateGraphicsInfo)); + break; + + case kIOHibernateHandoffTypeCryptVars: + if (cryptvars) + { + hibernate_cryptwakevars_t * + wakevars = (hibernate_cryptwakevars_t *) &handoff->data[0]; + bcopy(&wakevars->aes_iv[0], &cryptvars->aes_iv[0], sizeof(cryptvars->aes_iv)); + } + foundCryptData = true; + bzero(data, handoff->bytecount); + break; + + case kIOHibernateHandoffTypeMemoryMap: + hibernate_newruntime_map(data, handoff->bytecount, + gIOHibernateCurrentHeader->systemTableOffset); + break; + + case kIOHibernateHandoffTypeDeviceTree: + { +// DTEntry chosen = NULL; +// HIBPRINT("DTLookupEntry %d\n", DTLookupEntry((const DTEntry) data, "/chosen", &chosen)); + } + break; + + default: + done = (kIOHibernateHandoffType != (handoff->type & 0xFFFF0000)); + break; + } + } + if (cryptvars && !foundCryptData) + panic("hibernate handoff"); + if (vars->videoMapping && gIOHibernateGraphicsInfo->physicalAddress && (args->Video.v_baseAddr == gIOHibernateGraphicsInfo->physicalAddress)) @@ -2404,21 +2595,11 @@ hibernate_machine_init(void) } uint8_t * src = (uint8_t *) vars->srcBuffer->getBytesNoCopy(); - - if (gIOHibernateWakeMapSize) - { - err = IOMemoryDescriptorWriteFromPhysical(vars->srcBuffer, 0, ptoa_64(gIOHibernateWakeMap), - gIOHibernateWakeMapSize); - if (kIOReturnSuccess == err) - hibernate_newruntime_map(src, gIOHibernateWakeMapSize, - gIOHibernateCurrentHeader->systemTableOffset); - gIOHibernateWakeMap = 0; - gIOHibernateWakeMapSize = 0; - } - uint32_t decoOffset; clock_get_uptime(&allTime); + AbsoluteTime_to_scalar(&compTime) = 0; + compBytes = 0; HIBLOG("IOHibernatePollerOpen(), ml_get_interrupts_enabled %d\n", ml_get_interrupts_enabled()); err = IOHibernatePollerOpen(vars->fileVars, kIOPolledAfterSleepState, 0); @@ -2439,23 +2620,17 @@ hibernate_machine_init(void) ProgressUpdate(gIOHibernateGraphicsInfo, (uint8_t *) vars->videoMapping, 0, lastBlob); } - cryptvars = (kIOHibernateModeEncrypt & gIOHibernateMode) ? 
&gIOHibernateCryptWakeContext : 0; - if (kIOHibernateModeEncrypt & gIOHibernateMode) - { - cryptvars = &gIOHibernateCryptWakeContext; - bcopy(&gIOHibernateCryptWakeVars->aes_iv[0], - &cryptvars->aes_iv[0], - sizeof(cryptvars->aes_iv)); - } - // kick off the read ahead vars->fileVars->io = false; vars->fileVars->bufferHalf = 0; vars->fileVars->bufferLimit = 0; vars->fileVars->lastRead = 0; + vars->fileVars->readEnd = gIOHibernateCurrentHeader->imageSize; vars->fileVars->bufferOffset = vars->fileVars->bufferLimit; + vars->fileVars->cryptBytes = 0; + AbsoluteTime_to_scalar(&vars->fileVars->cryptTime) = 0; - IOPolledFileRead(vars->fileVars, 0, 0, cryptvars); + err = IOPolledFileRead(vars->fileVars, 0, 0, cryptvars); vars->fileVars->bufferOffset = vars->fileVars->bufferLimit; // -- @@ -2464,7 +2639,7 @@ hibernate_machine_init(void) uint32_t * header = (uint32_t *) src; sum = 0; - do + while (kIOReturnSuccess == err) { unsigned int count; unsigned int page; @@ -2510,7 +2685,14 @@ hibernate_machine_init(void) if (compressedSize < page_size) { decoOffset = page_size; + + clock_get_uptime(&startTime); WKdm_decompress((WK_word*) src, (WK_word*) (src + decoOffset), PAGE_SIZE_IN_WORDS); + clock_get_uptime(&endTime); + ADD_ABSOLUTETIME(&compTime, &endTime); + SUB_ABSOLUTETIME(&compTime, &startTime); + + compBytes += page_size; } else decoOffset = 0; @@ -2554,7 +2736,8 @@ hibernate_machine_init(void) } } } - while (true); + if (pagesDone == gIOHibernateCurrentHeader->actualUncompressedPages) + err = kIOReturnLockedRead; if (kIOReturnSuccess != err) panic("Hibernate restore error %x", err); @@ -2580,10 +2763,22 @@ hibernate_machine_init(void) SUB_ABSOLUTETIME(&endTime, &allTime); absolutetime_to_nanoseconds(endTime, &nsec); - HIBLOG("hibernate_machine_init pagesDone %d sum2 %x, time: %qd ms\n", + HIBLOG("hibernate_machine_init pagesDone %d sum2 %x, time: %qd ms, ", pagesDone, sum, nsec / 1000000ULL); - - KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 2) | DBG_FUNC_NONE, pagesRead, pagesDone, 0, 0, 0); + + absolutetime_to_nanoseconds(compTime, &nsec); + HIBLOG("comp bytes: %qd time: %qd ms %qd Mb/s, ", + compBytes, + nsec / 1000000ULL, + nsec ? (((compBytes * 1000000000ULL) / 1024 / 1024) / nsec) : 0); + + absolutetime_to_nanoseconds(vars->fileVars->cryptTime, &nsec); + HIBLOG("crypt bytes: %qd time: %qd ms %qd Mb/s\n", + vars->fileVars->cryptBytes, + nsec / 1000000ULL, + nsec ? 
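
In the restore loop above, a page whose stored size is less than page_size is WKdm-compressed and is decompressed at a scratch offset (decoOffset = page_size), while a full-size page is consumed where it sits (decoOffset = 0). A sketch of that decision with the decompressor stubbed out:

#include <cstdint>

enum { kPageSize = 4096 };

static void wkdm_decompress(const uint32_t *src, uint32_t *dst)
{
    (void) src; (void) dst;                 // stand-in for WKdm_decompress()
}

static const uint8_t *restore_page(const uint8_t *src, uint32_t storedSize, uint8_t *scratch)
{
    if (storedSize < kPageSize)
    {
        wkdm_decompress((const uint32_t *) src, (uint32_t *) scratch);
        return scratch;                     // page was stored compressed
    }
    return src;                             // page was stored raw
}

int main()
{
    static uint8_t page[kPageSize], scratch[kPageSize];
    return restore_page(page, kPageSize, scratch) == page ? 0 : 1;
}
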
(((vars->fileVars->cryptBytes * 1000000000ULL) / 1024 / 1024) / nsec) : 0); + + KERNEL_DEBUG_CONSTANT(IOKDBG_CODE(DBG_HIBERNATE, 2) | DBG_FUNC_NONE, pagesRead, pagesDone, 0, 0, 0); } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ diff --git a/iokit/Kernel/IOHibernateInternal.h b/iokit/Kernel/IOHibernateInternal.h index 2b2e5802e..7e7e95fe6 100644 --- a/iokit/Kernel/IOHibernateInternal.h +++ b/iokit/Kernel/IOHibernateInternal.h @@ -36,8 +36,10 @@ struct IOHibernateVars { hibernate_page_list_t * page_list; hibernate_page_list_t * page_list_wired; + hibernate_page_list_t * page_list_pal; class IOBufferMemoryDescriptor * ioBuffer; class IOBufferMemoryDescriptor * srcBuffer; + class IOBufferMemoryDescriptor * handoffBuffer; class IOMemoryDescriptor * previewBuffer; OSData * previewData; OSData * fileExtents; @@ -52,6 +54,7 @@ struct IOHibernateVars uint8_t saveBootAudioVolume; uint8_t wiredCryptKey[kIOHibernateAESKeySize / 8]; uint8_t cryptKey[kIOHibernateAESKeySize / 8]; + uint8_t volumeCryptKey[kIOHibernateAESKeySize / 8]; }; typedef struct IOHibernateVars IOHibernateVars; @@ -68,12 +71,15 @@ struct IOPolledFileIOVars IOByteCount bufferHalf; IOByteCount extentRemaining; IOByteCount lastRead; - boolean_t solid_state; + IOByteCount readEnd; + uint32_t flags; uint64_t block0; uint64_t position; uint64_t extentPosition; uint64_t encryptStart; uint64_t encryptEnd; + uint64_t cryptBytes; + AbsoluteTime cryptTime; IOPolledFileExtent * extentMap; IOPolledFileExtent * currentExtent; bool io; @@ -103,6 +109,5 @@ extern unsigned long sectSizeDATA; extern vm_offset_t sectINITPTB; #endif -extern vm_offset_t gIOHibernateWakeMap; // ppnum -extern vm_size_t gIOHibernateWakeMapSize; - +extern ppnum_t gIOHibernateHandoffPages[]; +extern uint32_t gIOHibernateHandoffPageCount; diff --git a/iokit/Kernel/IOHibernateRestoreKernel.c b/iokit/Kernel/IOHibernateRestoreKernel.c index 280b8c430..7259ab3ec 100644 --- a/iokit/Kernel/IOHibernateRestoreKernel.c +++ b/iokit/Kernel/IOHibernateRestoreKernel.c @@ -35,9 +35,13 @@ #include <crypto/aes.h> #include <libkern/libkern.h> -#include "WKdm.h" +#include <libkern/WKdm.h> #include "IOHibernateInternal.h" +#if defined(__i386__) || defined(__x86_64__) +#include <i386/pal_hibernate.h> +#endif + /* This code is linked into the kernel but part of the "__HIB" section, which means its used by code running in the special context of restoring the kernel text and data @@ -52,14 +56,15 @@ uint32_t gIOHibernateDebugFlags; static IOHibernateImageHeader _hibernateHeader; IOHibernateImageHeader * gIOHibernateCurrentHeader = &_hibernateHeader; -static hibernate_graphics_t _hibernateGraphics; -hibernate_graphics_t * gIOHibernateGraphicsInfo = &_hibernateGraphics; - -static hibernate_cryptwakevars_t _cryptWakeVars; -hibernate_cryptwakevars_t * gIOHibernateCryptWakeVars = &_cryptWakeVars; +ppnum_t gIOHibernateHandoffPages[64]; +uint32_t gIOHibernateHandoffPageCount = sizeof(gIOHibernateHandoffPages) + / sizeof(gIOHibernateHandoffPages[0]); -vm_offset_t gIOHibernateWakeMap; // ppnum -vm_size_t gIOHibernateWakeMapSize; +#if CONFIG_DEBUG +void hibprintf(const char *fmt, ...); +#else +#define hibprintf(x...) 
+#endif #if CONFIG_SLEEP @@ -148,7 +153,7 @@ static void uart_puthex(uint64_t num) c = 0xf & (num >> bit); if (c) leading = false; - else if (leading) + else if (leading && bit) continue; if (c <= 9) c += '0'; @@ -333,7 +338,7 @@ hibernate_page_bitmap_count(hibernate_bitmap_t * bitmap, uint32_t set, uint32_t return (count); } -static vm_offset_t +static ppnum_t hibernate_page_list_grab(hibernate_page_list_t * list, uint32_t * pNextFree) { uint32_t nextFree = *pNextFree; @@ -365,27 +370,19 @@ static uint32_t store_one_page(uint32_t procFlags, uint32_t * src, uint32_t compressedSize, uint32_t * buffer, uint32_t ppnum) { - uint64_t dst; - uint32_t sum; - - dst = ptoa_64(ppnum); - if (ppnum < 0x00100000) - buffer = (uint32_t *) (uintptr_t) dst; + uint64_t dst = ptoa_64(ppnum); - if (compressedSize != PAGE_SIZE) - { - WKdm_decompress((WK_word*) src, (WK_word*) buffer, PAGE_SIZE >> 2); - src = buffer; - } - - sum = hibernate_sum_page((uint8_t *) src, ppnum); - - if (((uint64_t) (uintptr_t) src) == dst) - src = 0; - - hibernate_restore_phys_page((uint64_t) (uintptr_t) src, dst, PAGE_SIZE, procFlags); + if (compressedSize != PAGE_SIZE) + { + dst = pal_hib_map(DEST_COPY_AREA, dst); + WKdm_decompress((WK_word*) src, (WK_word*)(uintptr_t)dst, PAGE_SIZE >> 2); + } + else + { + dst = hibernate_restore_phys_page((uint64_t) (uintptr_t) src, dst, PAGE_SIZE, procFlags); + } - return (sum); + return hibernate_sum_page((uint8_t *)(uintptr_t)dst, ppnum); } // used only for small struct copies @@ -411,9 +408,10 @@ hibernate_kernel_entrypoint(IOHibernateImageHeader * header, { uint32_t idx; uint32_t * src; - uint32_t * buffer; + uint32_t * imageReadPos; uint32_t * pageIndexSource; hibernate_page_list_t * map; + uint32_t stage; uint32_t count; uint32_t ppnum; uint32_t page; @@ -424,10 +422,13 @@ hibernate_kernel_entrypoint(IOHibernateImageHeader * header, uint32_t * copyPageList; uint32_t copyPageIndex; uint32_t sum; + uint32_t pageSum; uint32_t nextFree; uint32_t lastImagePage; uint32_t lastMapPage; uint32_t lastPageIndexPage; + uint32_t handoffPages; + uint32_t handoffPageCount; C_ASSERT(sizeof(IOHibernateImageHeader) == 512); @@ -440,84 +441,43 @@ hibernate_kernel_entrypoint(IOHibernateImageHeader * header, gIOHibernateCurrentHeader, sizeof(IOHibernateImageHeader)); - if (!p2) - { - count = header->graphicsInfoOffset; - if (count) - p2 = (void *)(((uintptr_t) header) - count); - } - if (p2) - bcopy_internal(p2, - gIOHibernateGraphicsInfo, - sizeof(hibernate_graphics_t)); - else - gIOHibernateGraphicsInfo->physicalAddress = gIOHibernateGraphicsInfo->depth = 0; - - if (!p3) - { - count = header->cryptVarsOffset; - if (count) - p3 = (void *)(((uintptr_t) header) - count); - } - if (p3) - bcopy_internal(p3, - gIOHibernateCryptWakeVars, - sizeof(hibernate_cryptwakevars_t)); - - src = (uint32_t *) + map = (hibernate_page_list_t *) (((uintptr_t) &header->fileExtentMap[0]) + header->fileExtentMapSize - + ptoa_32(header->restore1PageCount)); - - if (header->previewSize) - { - pageIndexSource = src; - map = (hibernate_page_list_t *)(((uintptr_t) pageIndexSource) + header->previewSize); - src = (uint32_t *) (((uintptr_t) pageIndexSource) + header->previewPageListSize); - } - else - { - pageIndexSource = 0; - map = (hibernate_page_list_t *) src; - src = (uint32_t *) (((uintptr_t) map) + header->bitmapSize); - } - - lastPageIndexPage = atop_32((uintptr_t) src); + + ptoa_32(header->restore1PageCount) + + header->previewSize); lastImagePage = atop_32(((uintptr_t) header) + header->image1Size); lastMapPage = 
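
The uart_puthex() change above ("leading && bit" in place of "leading") keeps the final nibble even when every digit is zero, so printing 0 now emits "0" instead of nothing. A userspace version that shows the guard:

#include <cstdint>
#include <cstdio>

static void puthex(uint64_t num)
{
    bool leading = true;
    for (int bit = 60; bit >= 0; bit -= 4)
    {
        unsigned c = 0xf & (unsigned) (num >> bit);
        if (c)
            leading = false;
        else if (leading && bit)            // "&& bit" spares the last digit
            continue;
        putchar(c <= 9 ? (int) ('0' + c) : (int) ('a' + c - 10));
    }
}

int main()
{
    puthex(0);                              // prints "0" with the fix, nothing without
    putchar('\n');
    puthex(0xbeef);                         // prints "beef"
    putchar('\n');
    return 0;
}
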
atop_32(((uintptr_t) map) + header->bitmapSize); + handoffPages = header->handoffPages; + handoffPageCount = header->handoffPageCount; + debug_code(kIOHibernateRestoreCodeImageEnd, ptoa_64(lastImagePage)); - debug_code(kIOHibernateRestoreCodePageIndexStart, (uintptr_t) pageIndexSource); - debug_code(kIOHibernateRestoreCodePageIndexEnd, ptoa_64(lastPageIndexPage)); debug_code(kIOHibernateRestoreCodeMapStart, (uintptr_t) map); debug_code(kIOHibernateRestoreCodeMapEnd, ptoa_64(lastMapPage)); + debug_code('hand', ptoa_64(handoffPages)); + debug_code('hnde', ptoa_64(handoffPageCount)); + // knock all the image pages to be used out of free map for (ppnum = atop_32((uintptr_t) header); ppnum <= lastImagePage; ppnum++) { hibernate_page_bitset(map, FALSE, ppnum); } + // knock all the handoff pages to be used out of free map + for (ppnum = handoffPages; ppnum < (handoffPages + handoffPageCount); ppnum++) + { + hibernate_page_bitset(map, FALSE, ppnum); + } nextFree = 0; hibernate_page_list_grab(map, &nextFree); - buffer = (uint32_t *) (uintptr_t) ptoa_32(hibernate_page_list_grab(map, &nextFree)); - if (header->memoryMapSize && (count = header->memoryMapOffset)) - { - p4 = (void *)(((uintptr_t) header) - count); - gIOHibernateWakeMap = hibernate_page_list_grab(map, &nextFree); - gIOHibernateWakeMapSize = header->memoryMapSize; - debug_code(kIOHibernateRestoreCodeWakeMapSize, gIOHibernateWakeMapSize); - if (gIOHibernateWakeMapSize > PAGE_SIZE) - fatal(); - bcopy_internal(p4, (void *) (uintptr_t) ptoa_32(gIOHibernateWakeMap), gIOHibernateWakeMapSize); - } - else - gIOHibernateWakeMapSize = 0; + pal_hib_window_setup(hibernate_page_list_grab(map, &nextFree)); - sum = gIOHibernateCurrentHeader->actualRestore1Sum; + sum = header->actualRestore1Sum; gIOHibernateCurrentHeader->diag[0] = (uint32_t)(uintptr_t) header; gIOHibernateCurrentHeader->diag[1] = sum; @@ -528,54 +488,110 @@ hibernate_kernel_entrypoint(IOHibernateImageHeader * header, copyPageIndex = PAGE_SIZE >> 2; compressedSize = PAGE_SIZE; + stage = 2; + count = 0; + src = NULL; + + if (gIOHibernateCurrentHeader->previewSize) + { + pageIndexSource = (uint32_t *) + (((uintptr_t) &header->fileExtentMap[0]) + + gIOHibernateCurrentHeader->fileExtentMapSize + + ptoa_32(gIOHibernateCurrentHeader->restore1PageCount)); + imageReadPos = (uint32_t *) (((uintptr_t) pageIndexSource) + gIOHibernateCurrentHeader->previewPageListSize); + lastPageIndexPage = atop_32((uintptr_t) imageReadPos); + } + else + { + pageIndexSource = NULL; + lastPageIndexPage = 0; + imageReadPos = (uint32_t *) (((uintptr_t) map) + gIOHibernateCurrentHeader->bitmapSize); + } + + debug_code(kIOHibernateRestoreCodePageIndexStart, (uintptr_t) pageIndexSource); + debug_code(kIOHibernateRestoreCodePageIndexEnd, ptoa_64(lastPageIndexPage)); while (1) { - if (pageIndexSource) - { - ppnum = pageIndexSource[0]; - count = pageIndexSource[1]; - pageIndexSource += 2; - if (!count) - { - pageIndexSource = 0; - src = (uint32_t *) (((uintptr_t) map) + gIOHibernateCurrentHeader->bitmapSize); - ppnum = src[0]; - count = src[1]; - src += 2; - } - } - else - { - ppnum = src[0]; - count = src[1]; - if (!count) - break; - src += 2; + switch (stage) + { + case 2: + // copy handoff data + count = src ? 
0 : handoffPageCount; + if (!count) + break; + if (count > gIOHibernateHandoffPageCount) + count = gIOHibernateHandoffPageCount; + src = (uint32_t *) (uintptr_t) ptoa_64(handoffPages); + break; + + case 1: + // copy pageIndexSource pages == preview image data + if (!src) + { + if (!pageIndexSource) + break; + src = imageReadPos; + } + ppnum = pageIndexSource[0]; + count = pageIndexSource[1]; + pageIndexSource += 2; + imageReadPos = src; + break; + + case 0: + // copy pages + if (!src) + { + src = (uint32_t *) (((uintptr_t) map) + gIOHibernateCurrentHeader->bitmapSize); + } + ppnum = src[0]; + count = src[1]; + src += 2; + imageReadPos = src; + break; + } + + + if (!count) + { + if (!stage) + break; + stage--; + src = NULL; + continue; } for (page = 0; page < count; page++, ppnum++) { - uint32_t tag; + uint32_t tag; int conflicts; - if (!pageIndexSource) - { - tag = *src++; - compressedSize = kIOHibernateTagLength & tag; - } + if (2 == stage) + ppnum = gIOHibernateHandoffPages[page]; + else if (!stage) + { + tag = *src++; + compressedSize = kIOHibernateTagLength & tag; + } + + conflicts = (ppnum >= atop_32((uintptr_t) map)) && (ppnum <= lastMapPage); - conflicts = (((ppnum >= atop_32((uintptr_t) map)) && (ppnum <= lastMapPage)) - || ((ppnum >= atop_32((uintptr_t) src)) && (ppnum <= lastImagePage))); + conflicts |= ((ppnum >= atop_32((uintptr_t) imageReadPos)) && (ppnum <= lastImagePage)); - if (pageIndexSource) - conflicts |= ((ppnum >= atop_32((uintptr_t) pageIndexSource)) && (ppnum <= lastPageIndexPage)); + if (stage >= 2) + conflicts |= ((ppnum >= atop_32((uintptr_t) src)) && (ppnum <= (handoffPages + handoffPageCount - 1))); + + if (stage >= 1) + conflicts |= ((ppnum >= atop_32((uintptr_t) pageIndexSource)) && (ppnum <= lastPageIndexPage)); if (!conflicts) { - if (compressedSize) - sum += store_one_page(gIOHibernateCurrentHeader->processorFlags, - src, compressedSize, buffer, ppnum); +// if (compressedSize) + pageSum = store_one_page(gIOHibernateCurrentHeader->processorFlags, + src, compressedSize, 0, ppnum); + if (stage != 2) + sum += pageSum; uncompressedPages++; } else @@ -596,46 +612,59 @@ hibernate_kernel_entrypoint(IOHibernateImageHeader * header, // alloc new copy list page uint32_t pageListPage = hibernate_page_list_grab(map, &nextFree); // link to current - if (copyPageList) - copyPageList[1] = pageListPage; - else - copyPageListHead = pageListPage; - copyPageList = (uint32_t *) (uintptr_t) ptoa_32(pageListPage); + if (copyPageList) { + copyPageList[1] = pageListPage; + } else { + copyPageListHead = pageListPage; + } + copyPageList = (uint32_t *)pal_hib_map(SRC_COPY_AREA, + ptoa_32(pageListPage)); copyPageList[1] = 0; copyPageIndex = 2; } copyPageList[copyPageIndex++] = ppnum; copyPageList[copyPageIndex++] = bufferPage; - copyPageList[copyPageIndex++] = compressedSize; + copyPageList[copyPageIndex++] = (compressedSize | (stage << 24)); copyPageList[0] = copyPageIndex; - dst = (uint32_t *) (uintptr_t) ptoa_32(bufferPage); + dst = (uint32_t *)pal_hib_map(DEST_COPY_AREA, ptoa_32(bufferPage)); for (idx = 0; idx < ((compressedSize + 3) >> 2); idx++) - dst[idx] = src[idx]; + dst[idx] = src[idx]; } src += ((compressedSize + 3) >> 2); } } + /* src points to the last page restored, so we need to skip over that */ + hibernateRestorePALState(src); + // -- copy back conflicts copyPageList = (uint32_t *)(uintptr_t) ptoa_32(copyPageListHead); + while (copyPageList) { + copyPageList = (uint32_t *)pal_hib_map(COPY_PAGE_AREA, (uintptr_t)copyPageList); for (copyPageIndex = 2; copyPageIndex < 
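
The rewritten entrypoint above replaces the old two-source branch with an explicit three-stage loop: stage 2 restores handoff pages, stage 1 the preview page-index runs, stage 0 the main image, and an empty run steps down a stage until stage 0 runs dry. The control skeleton with the per-stage counting elided:

#include <cstdint>
#include <cstdio>

int main()
{
    uint32_t stage = 2;
    for (;;)
    {
        uint32_t count = 0;
        switch (stage)
        {
            case 2: /* count = handoff page run */   break;
            case 1: /* count = preview index run */  break;
            case 0: /* count = next image run */     break;
        }
        if (!count)
        {
            if (!stage)
                break;                      // image exhausted: restore done
            stage--;                        // fall to the next source
            continue;
        }
        /* ... restore `count` pages for this stage ... */
    }
    printf("restore complete\n");
    return 0;
}
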
copyPageList[0]; copyPageIndex += 3) { - ppnum = copyPageList[copyPageIndex + 0]; - src = (uint32_t *) (uintptr_t) ptoa_32(copyPageList[copyPageIndex + 1]); - compressedSize = copyPageList[copyPageIndex + 2]; - - sum += store_one_page(gIOHibernateCurrentHeader->processorFlags, - src, compressedSize, buffer, ppnum); + ppnum = copyPageList[copyPageIndex + 0]; + src = (uint32_t *) (uintptr_t) ptoa_32(copyPageList[copyPageIndex + 1]); + src = (uint32_t *)pal_hib_map(SRC_COPY_AREA, (uintptr_t)src); + compressedSize = copyPageList[copyPageIndex + 2]; + stage = compressedSize >> 24; + compressedSize &= 0x1FFF; + pageSum = store_one_page(gIOHibernateCurrentHeader->processorFlags, + src, compressedSize, 0, ppnum); + if (stage != 2) + sum += pageSum; uncompressedPages++; } copyPageList = (uint32_t *) (uintptr_t) ptoa_32(copyPageList[1]); } + pal_hib_patchup(); + // -- image has been destroyed... gIOHibernateCurrentHeader->actualImage1Sum = sum; @@ -646,16 +675,10 @@ hibernate_kernel_entrypoint(IOHibernateImageHeader * header, gIOHibernateState = kIOHibernateStateWakingFromHibernate; #if CONFIG_SLEEP -#if defined(__ppc__) - typedef void (*ResetProc)(void); - ResetProc proc; - proc = (ResetProc) 0x100; - __asm__ volatile("ori 0, 0, 0" : : ); - proc(); -#elif defined(__i386__) || defined(__x86_64__) +#if defined(__i386__) || defined(__x86_64__) typedef void (*ResetProc)(void); ResetProc proc; - proc = (ResetProc) acpi_wake_prot_entry; + proc = HIB_ENTRYPOINT; // flush caches __asm__("wbinvd"); proc(); @@ -666,3 +689,445 @@ hibernate_kernel_entrypoint(IOHibernateImageHeader * header, return -1; } + +#if CONFIG_DEBUG +/* standalone printf implementation */ +/*- + * Copyright (c) 1986, 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
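
Conflicting pages are deferred onto copy-page lists, and the encoding above now packs the restore stage into the top byte of the stored size word (compressedSize | stage << 24) so the copy-back pass can keep stage-2 handoff pages out of the image checksum. A round trip of that packing, using the same 0x1FFF size mask as the code:

#include <cassert>
#include <cstdint>

static uint32_t pack(uint32_t compressedSize, uint32_t stage)
{
    return compressedSize | (stage << 24);
}

int main()
{
    uint32_t word  = pack(0x0a80, 2);
    uint32_t stage = word >> 24;            // recover the stage
    uint32_t size  = word & 0x1FFF;         // recover the compressed size
    assert(stage == 2 && size == 0x0a80);
    return 0;
}
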
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)subr_prf.c 8.3 (Berkeley) 1/21/94
+ */
+
+typedef long ptrdiff_t;
+char const hibhex2ascii_data[] = "0123456789abcdefghijklmnopqrstuvwxyz";
+#define hibhex2ascii(hex) (hibhex2ascii_data[hex])
+#define toupper(c) ((c) - 0x20 * (((c) >= 'a') && ((c) <= 'z')))
+static size_t
+hibstrlen(const char *s)
+{
+	size_t l = 0;
+	while (*s++)
+		l++;
+	return l;
+}
+
+/* Max number conversion buffer length: a u_quad_t in base 2, plus NUL byte. */
+#define MAXNBUF (sizeof(intmax_t) * NBBY + 1)
+
+/*
+ * Put a NUL-terminated ASCII number (base <= 36) in a buffer in reverse
+ * order; return an optional length and a pointer to the last character
+ * written in the buffer (i.e., the first character of the string).
+ * The buffer pointed to by `nbuf' must have length >= MAXNBUF.
+ */
+static char *
+ksprintn(char *nbuf, uintmax_t num, int base, int *lenp, int upper)
+{
+	char *p, c;
+
+	/* Truncate so we don't call umoddi3, which isn't in __HIB */
+#if !defined(__LP64__)
+	uint32_t num2 = (uint32_t) num;
+#else
+	uintmax_t num2 = num;
+#endif
+
+	p = nbuf;
+	*p = '\0';
+	do {
+		c = hibhex2ascii(num2 % base);
+		*++p = upper ? toupper(c) : c;
+	} while (num2 /= base);
+	if (lenp)
+		*lenp = (int)(p - nbuf);
+	return (p);
+}
+
+/*
+ * Scaled down version of printf(3).
+ *
+ * Two additional formats:
+ *
+ * The format %b is supported to decode error registers.
+ * Its usage is:
+ *
+ *	printf("reg=%b\n", regval, "<base><arg>*");
+ *
+ * where <base> is the output base expressed as a control character, e.g.
+ * \10 gives octal; \20 gives hex. Each arg is a sequence of characters,
+ * the first of which gives the bit number to be inspected (origin 1), and
+ * the next characters (up to a control character, i.e. a character <= 32),
+ * give the name of the register. Thus:
+ *
+ *	kvprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE\n");
+ *
+ * would produce output:
+ *
+ *	reg=3<BITTWO,BITONE>
+ *
+ * XXX: %D -- Hexdump, takes pointer and separator string:
+ *	("%6D", ptr, ":") -> XX:XX:XX:XX:XX:XX
+ *	("%*D", len, ptr, " " -> XX XX XX XX ... 
+ */ +static int +hibkvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_list ap) +{ +#define PCHAR(c) {int cc=(c); if (func) (*func)(cc,arg); else *d++ = cc; retval++; } + char nbuf[MAXNBUF]; + char *d; + const char *p, *percent, *q; + u_char *up; + int ch, n; + uintmax_t num; + int base, lflag, qflag, tmp, width, ladjust, sharpflag, neg, sign, dot; + int cflag, hflag, jflag, tflag, zflag; + int dwidth, upper; + char padc; + int stop = 0, retval = 0; + + num = 0; + if (!func) + d = (char *) arg; + else + d = NULL; + + if (fmt == NULL) + fmt = "(fmt null)\n"; + + if (radix < 2 || radix > 36) + radix = 10; + + for (;;) { + padc = ' '; + width = 0; + while ((ch = (u_char)*fmt++) != '%' || stop) { + if (ch == '\0') + return (retval); + PCHAR(ch); + } + percent = fmt - 1; + qflag = 0; lflag = 0; ladjust = 0; sharpflag = 0; neg = 0; + sign = 0; dot = 0; dwidth = 0; upper = 0; + cflag = 0; hflag = 0; jflag = 0; tflag = 0; zflag = 0; +reswitch: switch (ch = (u_char)*fmt++) { + case '.': + dot = 1; + goto reswitch; + case '#': + sharpflag = 1; + goto reswitch; + case '+': + sign = 1; + goto reswitch; + case '-': + ladjust = 1; + goto reswitch; + case '%': + PCHAR(ch); + break; + case '*': + if (!dot) { + width = va_arg(ap, int); + if (width < 0) { + ladjust = !ladjust; + width = -width; + } + } else { + dwidth = va_arg(ap, int); + } + goto reswitch; + case '0': + if (!dot) { + padc = '0'; + goto reswitch; + } + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + for (n = 0;; ++fmt) { + n = n * 10 + ch - '0'; + ch = *fmt; + if (ch < '0' || ch > '9') + break; + } + if (dot) + dwidth = n; + else + width = n; + goto reswitch; + case 'b': + num = (u_int)va_arg(ap, int); + p = va_arg(ap, char *); + for (q = ksprintn(nbuf, num, *p++, NULL, 0); *q;) + PCHAR(*q--); + + if (num == 0) + break; + + for (tmp = 0; *p;) { + n = *p++; + if (num & (1 << (n - 1))) { + PCHAR(tmp ? 
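
ksprintn() above writes digits least-significant first and returns a pointer to the most significant one, which is why hibkvprintf prints numbers with "while (*p) PCHAR(*p--)". A self-contained re-implementation of that convention, for illustration only:

#include <cstdio>

static char *sprintn(char *nbuf, unsigned long num, int base, int *lenp)
{
    static const char digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";
    char *p = nbuf;
    *p = '\0';                              // terminator sits *before* the digits
    do {
        *++p = digits[num % base];
    } while (num /= base);
    if (lenp)
        *lenp = (int) (p - nbuf);
    return p;                               // first character to print
}

int main()
{
    char nbuf[sizeof(long) * 8 + 1];
    for (char *p = sprintn(nbuf, 48879, 16, nullptr); *p; p--)
        putchar(*p);                        // walks backwards: prints "beef"
    putchar('\n');
    return 0;
}
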
',' : '<'); + for (; (n = *p) > ' '; ++p) + PCHAR(n); + tmp = 1; + } else + for (; *p > ' '; ++p) + continue; + } + if (tmp) + PCHAR('>'); + break; + case 'c': + PCHAR(va_arg(ap, int)); + break; + case 'D': + up = va_arg(ap, u_char *); + p = va_arg(ap, char *); + if (!width) + width = 16; + while(width--) { + PCHAR(hibhex2ascii(*up >> 4)); + PCHAR(hibhex2ascii(*up & 0x0f)); + up++; + if (width) + for (q=p;*q;q++) + PCHAR(*q); + } + break; + case 'd': + case 'i': + base = 10; + sign = 1; + goto handle_sign; + case 'h': + if (hflag) { + hflag = 0; + cflag = 1; + } else + hflag = 1; + goto reswitch; + case 'j': + jflag = 1; + goto reswitch; + case 'l': + if (lflag) { + lflag = 0; + qflag = 1; + } else + lflag = 1; + goto reswitch; + case 'n': + if (jflag) + *(va_arg(ap, intmax_t *)) = retval; + else if (qflag) + *(va_arg(ap, quad_t *)) = retval; + else if (lflag) + *(va_arg(ap, long *)) = retval; + else if (zflag) + *(va_arg(ap, size_t *)) = retval; + else if (hflag) + *(va_arg(ap, short *)) = retval; + else if (cflag) + *(va_arg(ap, char *)) = retval; + else + *(va_arg(ap, int *)) = retval; + break; + case 'o': + base = 8; + goto handle_nosign; + case 'p': + base = 16; + sharpflag = (width == 0); + sign = 0; + num = (uintptr_t)va_arg(ap, void *); + goto number; + case 'q': + qflag = 1; + goto reswitch; + case 'r': + base = radix; + if (sign) + goto handle_sign; + goto handle_nosign; + case 's': + p = va_arg(ap, char *); + if (p == NULL) + p = "(null)"; + if (!dot) + n = (typeof(n))hibstrlen (p); + else + for (n = 0; n < dwidth && p[n]; n++) + continue; + + width -= n; + + if (!ladjust && width > 0) + while (width--) + PCHAR(padc); + while (n--) + PCHAR(*p++); + if (ladjust && width > 0) + while (width--) + PCHAR(padc); + break; + case 't': + tflag = 1; + goto reswitch; + case 'u': + base = 10; + goto handle_nosign; + case 'X': + upper = 1; + case 'x': + base = 16; + goto handle_nosign; + case 'y': + base = 16; + sign = 1; + goto handle_sign; + case 'z': + zflag = 1; + goto reswitch; +handle_nosign: + sign = 0; + if (jflag) + num = va_arg(ap, uintmax_t); + else if (qflag) + num = va_arg(ap, u_quad_t); + else if (tflag) + num = va_arg(ap, ptrdiff_t); + else if (lflag) + num = va_arg(ap, u_long); + else if (zflag) + num = va_arg(ap, size_t); + else if (hflag) + num = (u_short)va_arg(ap, int); + else if (cflag) + num = (u_char)va_arg(ap, int); + else + num = va_arg(ap, u_int); + goto number; +handle_sign: + if (jflag) + num = va_arg(ap, intmax_t); + else if (qflag) + num = va_arg(ap, quad_t); + else if (tflag) + num = va_arg(ap, ptrdiff_t); + else if (lflag) + num = va_arg(ap, long); + else if (zflag) + num = va_arg(ap, ssize_t); + else if (hflag) + num = (short)va_arg(ap, int); + else if (cflag) + num = (char)va_arg(ap, int); + else + num = va_arg(ap, int); +number: + if (sign && (intmax_t)num < 0) { + neg = 1; + num = -(intmax_t)num; + } + p = ksprintn(nbuf, num, base, &tmp, upper); + if (sharpflag && num != 0) { + if (base == 8) + tmp++; + else if (base == 16) + tmp += 2; + } + if (neg) + tmp++; + + if (!ladjust && padc != '0' && width + && (width -= tmp) > 0) + while (width--) + PCHAR(padc); + if (neg) + PCHAR('-'); + if (sharpflag && num != 0) { + if (base == 8) { + PCHAR('0'); + } else if (base == 16) { + PCHAR('0'); + PCHAR('x'); + } + } + if (!ladjust && width && (width -= tmp) > 0) + while (width--) + PCHAR(padc); + + while (*p) + PCHAR(*p--); + + if (ladjust && width && (width -= tmp) > 0) + while (width--) + PCHAR(padc); + + break; + default: + while (percent < fmt) + 
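
The %b decoder above follows the header comment: the control string's first byte selects the base, then each bit-number/name pair is emitted for set bits, bracketed by '<', ',' and '>'. A standalone model, simplified to octal output, that reproduces the documented example:

#include <cstdio>

static void print_b(unsigned num, const char *p)
{
    printf("%o", num);                      // simplification: assumes "\10" (octal)
    ++p;                                    // skip the base byte
    int n, any = 0;
    while ((n = *p++))
    {
        if (num & (1u << (n - 1)))
        {
            putchar(any ? ',' : '<');
            while (*p > ' ')                // name runs to the next control char
                putchar(*p++);
            any = 1;
        }
        else
            while (*p > ' ')
                p++;
    }
    if (any)
        putchar('>');
}

int main()
{
    print_b(3, "\10\2BITTWO\1BITONE");      // -> 3<BITTWO,BITONE>
    putchar('\n');
    return 0;
}
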
PCHAR(*percent++); + /* + * Since we ignore an formatting argument it is no + * longer safe to obey the remaining formatting + * arguments as the arguments will no longer match + * the format specs. + */ + stop = 1; + break; + } + } +#undef PCHAR +} + + +static void +putchar(int c, void *arg) +{ + (void)arg; + uart_putc(c); +} + +void +hibprintf(const char *fmt, ...) +{ + /* http://www.pagetable.com/?p=298 */ + va_list ap; + + va_start(ap, fmt); + hibkvprintf(fmt, putchar, NULL, 10, ap); + va_end(ap); +} +#endif /* CONFIG_DEBUG */ + diff --git a/iokit/Kernel/IOInterruptController.cpp b/iokit/Kernel/IOInterruptController.cpp index a8e04bddd..1000178ad 100644 --- a/iokit/Kernel/IOInterruptController.cpp +++ b/iokit/Kernel/IOInterruptController.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2011 Apple Inc. All rights reserved. + * Copyright (c) 1998-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -26,11 +26,6 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ - -#if __ppc__ -#include <ppc/proc_reg.h> -#endif - #include <IOKit/IOLib.h> #include <IOKit/IOService.h> #include <IOKit/IOPlatformExpert.h> @@ -295,17 +290,10 @@ IOReturn IOInterruptController::enableInterrupt(IOService *nub, int source) if (vector->interruptDisabledSoft) { vector->interruptDisabledSoft = 0; -#if __ppc__ - sync(); - isync(); -#endif if (!getPlatform()->atInterruptLevel()) { while (vector->interruptActive) {} -#if __ppc__ - isync(); -#endif } if (vector->interruptDisabledHard) { vector->interruptDisabledHard = 0; @@ -330,17 +318,10 @@ IOReturn IOInterruptController::disableInterrupt(IOService *nub, int source) vector = &vectors[vectorNumber]; vector->interruptDisabledSoft = 1; -#if __ppc__ - sync(); - isync(); -#endif if (!getPlatform()->atInterruptLevel()) { while (vector->interruptActive) {} -#if __ppc__ - isync(); -#endif } return kIOReturnSuccess; @@ -663,10 +644,6 @@ IOReturn IOSharedInterruptController::disableInterrupt(IOService *nub, interruptState = IOSimpleLockLockDisableInterrupt(controllerLock); if (!vector->interruptDisabledSoft) { vector->interruptDisabledSoft = 1; -#if __ppc__ - sync(); - isync(); -#endif vectorsEnabled--; } IOSimpleLockUnlockEnableInterrupt(controllerLock, interruptState); @@ -674,9 +651,6 @@ IOReturn IOSharedInterruptController::disableInterrupt(IOService *nub, if (!getPlatform()->atInterruptLevel()) { while (vector->interruptActive) {} -#if __ppc__ - isync(); -#endif } return kIOReturnSuccess; @@ -699,48 +673,26 @@ IOReturn IOSharedInterruptController::handleInterrupt(void * /*refCon*/, vector = &vectors[vectorNumber]; vector->interruptActive = 1; -#if __ppc__ - sync(); - isync(); -#endif - if (!vector->interruptDisabledSoft) { -#if __ppc__ - isync(); -#endif - - // Call the handler if it exists. - if (vector->interruptRegistered) { - - bool trace = (gIOKitTrace & kIOTraceInterrupts) ? true : false; - bool timeHandler = gIOInterruptThresholdNS ? true : false; - uint64_t startTime = 0; - uint64_t endTime = 0; + if (!vector->interruptDisabledSoft) { + + // Call the handler if it exists. + if (vector->interruptRegistered) { + + bool trace = (gIOKitTrace & kIOTraceInterrupts) ? true : false; if (trace) IOTimeStampStartConstant(IODBG_INTC(IOINTC_HANDLER), (uintptr_t) vectorNumber, (uintptr_t) vector->handler, (uintptr_t)vector->target); - if (timeHandler) - startTime = mach_absolute_time(); - // Call handler. 
vector->handler(vector->target, vector->refCon, vector->nub, vector->source); - - if (timeHandler) - { - endTime = mach_absolute_time(); - if ((endTime - startTime) > gIOInterruptThresholdNS) - panic("IOSIC::handleInterrupt: interrupt exceeded threshold, handlerTime = %qd, vectorNumber = %d, handler = %p, target = %p\n", - endTime - startTime, (int)vectorNumber, vector->handler, vector->target); - } if (trace) IOTimeStampEndConstant(IODBG_INTC(IOINTC_HANDLER), (uintptr_t) vectorNumber, (uintptr_t) vector->handler, (uintptr_t)vector->target); - } - - } + } + } vector->interruptActive = 0; } diff --git a/iokit/Kernel/IOInterruptEventSource.cpp b/iokit/Kernel/IOInterruptEventSource.cpp index 97d4c5957..8b49024a1 100644 --- a/iokit/Kernel/IOInterruptEventSource.cpp +++ b/iokit/Kernel/IOInterruptEventSource.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 1998-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -25,13 +25,7 @@ * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -/* -Copyright (c) 1998 Apple Computer, Inc. All rights reserved. -HISTORY - 1998-7-13 Godfrey van der Linden(gvdl) - Created. -*/ #include <IOKit/IOInterruptEventSource.h> #include <IOKit/IOKitDebug.h> #include <IOKit/IOLib.h> @@ -40,6 +34,31 @@ HISTORY #include <IOKit/IOTimeStamp.h> #include <IOKit/IOWorkLoop.h> +#if IOKITSTATS + +#define IOStatisticsInitializeCounter() \ +do { \ + IOStatistics::setCounterType(IOEventSource::reserved->counter, kIOStatisticsInterruptEventSourceCounter); \ +} while (0) + +#define IOStatisticsCheckForWork() \ +do { \ + IOStatistics::countInterruptCheckForWork(IOEventSource::reserved->counter); \ +} while (0) + +#define IOStatisticsInterrupt() \ +do { \ + IOStatistics::countInterrupt(IOEventSource::reserved->counter); \ +} while (0) + +#else + +#define IOStatisticsInitializeCounter() +#define IOStatisticsCheckForWork() +#define IOStatisticsInterrupt() + +#endif // IOKITSTATS + #define super IOEventSource OSDefineMetaClassAndStructors(IOInterruptEventSource, IOEventSource) @@ -74,6 +93,8 @@ bool IOInterruptEventSource::init(OSObject *inOwner, intIndex = inIntIndex; } + IOStatisticsInitializeCounter(); + return res; } @@ -182,24 +203,26 @@ bool IOInterruptEventSource::checkForWork() int numInts = cacheProdCount - consumerCount; IOInterruptEventAction intAction = (IOInterruptEventAction) action; bool trace = (gIOKitTrace & kIOTraceIntEventSource) ? 
true : false; - + + IOStatisticsCheckForWork(); + if ( numInts > 0 ) { if (trace) IOTimeStampStartConstant(IODBG_INTES(IOINTES_ACTION), (uintptr_t) intAction, (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop); - + // Call the handler - (*intAction)(owner, this, numInts); + (*intAction)(owner, this, numInts); if (trace) IOTimeStampEndConstant(IODBG_INTES(IOINTES_ACTION), (uintptr_t) intAction, (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop); - - consumerCount = cacheProdCount; - if (autoDisable && !explicitDisable) - enable(); - } + + consumerCount = cacheProdCount; + if (autoDisable && !explicitDisable) + enable(); + } else if ( numInts < 0 ) { @@ -208,17 +231,17 @@ bool IOInterruptEventSource::checkForWork() (uintptr_t) intAction, (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop); // Call the handler - (*intAction)(owner, this, -numInts); + (*intAction)(owner, this, -numInts); if (trace) IOTimeStampEndConstant(IODBG_INTES(IOINTES_ACTION), (uintptr_t) intAction, (uintptr_t) owner, (uintptr_t) this, (uintptr_t) workLoop); - - consumerCount = cacheProdCount; - if (autoDisable && !explicitDisable) - enable(); - } - + + consumerCount = cacheProdCount; + if (autoDisable && !explicitDisable) + enable(); + } + return false; } @@ -226,14 +249,15 @@ void IOInterruptEventSource::normalInterruptOccurred (void */*refcon*/, IOService */*prov*/, int /*source*/) { bool trace = (gIOKitTrace & kIOTraceIntEventSource) ? true : false; - + + IOStatisticsInterrupt(); producerCount++; - + if (trace) IOTimeStampStartConstant(IODBG_INTES(IOINTES_SEMA), (uintptr_t) this, (uintptr_t) owner); signalWorkAvailable(); - + if (trace) IOTimeStampEndConstant(IODBG_INTES(IOINTES_SEMA), (uintptr_t) this, (uintptr_t) owner); } @@ -242,16 +266,17 @@ void IOInterruptEventSource::disableInterruptOccurred (void */*refcon*/, IOService *prov, int source) { bool trace = (gIOKitTrace & kIOTraceIntEventSource) ? true : false; - + prov->disableInterrupt(source); /* disable the interrupt */ - + + IOStatisticsInterrupt(); producerCount++; - + if (trace) IOTimeStampStartConstant(IODBG_INTES(IOINTES_SEMA), (uintptr_t) this, (uintptr_t) owner); signalWorkAvailable(); - + if (trace) IOTimeStampEndConstant(IODBG_INTES(IOINTES_SEMA), (uintptr_t) this, (uintptr_t) owner); } @@ -264,3 +289,10 @@ void IOInterruptEventSource::interruptOccurred else normalInterruptOccurred(refcon, prov, source); } + +IOReturn IOInterruptEventSource::warmCPU + (uint64_t abstime) +{ + + return ml_interrupt_prewarm(abstime); +} diff --git a/iokit/Kernel/IOKitDebug.cpp b/iokit/Kernel/IOKitDebug.cpp index 31d681664..21048d88c 100644 --- a/iokit/Kernel/IOKitDebug.cpp +++ b/iokit/Kernel/IOKitDebug.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2011 Apple Inc. All rights reserved. + * Copyright (c) 1998-2010 Apple Inc. All rights reserved. 
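An editorial sketch of the producer/consumer counting that the checkForWork() hunks above drain: interrupt context only advances a counter, and the workloop thread later consumes the delta and calls the handler once per batch. This is plain C++ with std::atomic standing in for the kernel's volatile counter plus workloop serialization; the names are illustrative, not from the patch.

#include <atomic>

// Interrupt context only increments the producer side; the workloop
// thread later computes the difference, runs the handler once per batch,
// and catches the consumer side up -- the shape of checkForWork() above.
struct InterruptCounts {
    std::atomic<unsigned> producer{0};
    unsigned consumer = 0;                      // workloop thread only

    void interruptOccurred() {                  // cf. producerCount++
        producer.fetch_add(1, std::memory_order_relaxed);
    }

    unsigned pendingWork() {                    // cf. checkForWork()
        unsigned cached = producer.load(std::memory_order_acquire);
        unsigned numInts = cached - consumer;   // unsigned math survives wrap
        consumer = cached;                      // caught up
        return numInts;
    }
};

The kernel code computes a signed delta instead, which appears to be why the patched checkForWork() keeps a numInts < 0 branch: on counter wrap it still invokes the handler, with -numInts as the count.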
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -28,15 +28,15 @@ #include <sys/sysctl.h> +#include <libkern/c++/OSContainers.h> +#include <libkern/c++/OSCPPDebug.h> + #include <IOKit/IOKitDebug.h> #include <IOKit/IOLib.h> #include <IOKit/assert.h> #include <IOKit/IODeviceTreeSupport.h> #include <IOKit/IOService.h> -#include <libkern/c++/OSContainers.h> -#include <libkern/c++/OSCPPDebug.h> - #ifdef IOKITDEBUG #define DEBUG_INIT_VALUE IOKITDEBUG #else @@ -44,12 +44,10 @@ #endif SInt64 gIOKitDebug = DEBUG_INIT_VALUE; -SInt64 gIOKitTrace = 0x3B; -UInt64 gIOInterruptThresholdNS = 0; +SInt64 gIOKitTrace = 0; SYSCTL_QUAD(_debug, OID_AUTO, iokit, CTLFLAG_RW | CTLFLAG_LOCKED, &gIOKitDebug, "boot_arg io"); SYSCTL_QUAD(_debug, OID_AUTO, iotrace, CTLFLAG_RW | CTLFLAG_LOCKED, &gIOKitTrace, "trace io"); -SYSCTL_QUAD(_debug, OID_AUTO, iointthreshold, CTLFLAG_RW | CTLFLAG_LOCKED, &gIOInterruptThresholdNS, "io interrupt threshold"); int debug_malloc_size; @@ -100,7 +98,7 @@ void IOPrintPlane( const IORegistryPlane * plane ) iter->release(); } -void dbugprintf(char *fmt, ...); +void dbugprintf(const char *fmt, ...); void db_dumpiojunk( const IORegistryPlane * plane ); void db_piokjunk(void) { diff --git a/iokit/Kernel/IOKitKernelInternal.h b/iokit/Kernel/IOKitKernelInternal.h index 804c57f24..5a74159a4 100644 --- a/iokit/Kernel/IOKitKernelInternal.h +++ b/iokit/Kernel/IOKitKernelInternal.h @@ -38,6 +38,29 @@ __BEGIN_DECLS #include <mach/memory_object_types.h> #include <device/device_port.h> +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +#if !defined(NO_KDEBUG) + +#define IOServiceTrace(csc, a, b, c, d) do { \ + if(kIOTraceIOService & gIOKitDebug) { \ + KERNEL_DEBUG_CONSTANT(IODBG_IOSERVICE(csc), a, b, c, d, 0); \ + } \ +} while(0) + +#else /* NO_KDEBUG */ + +#define IOServiceTrace(csc, a, b, c, d) do { \ + (void)a; \ + (void)b; \ + (void)c; \ + (void)d; \ +} while (0) + +#endif /* NO_KDEBUG */ + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + typedef kern_return_t (*IOIteratePageableMapsCallback)(vm_map_t map, void * ref); void IOLibInit(void); @@ -149,4 +172,8 @@ extern "C" void IOKitInitializeTime( void ); extern "C" OSString * IOCopyLogNameForPID(int pid); +#if defined(__i386__) || defined(__x86_64__) +extern "C" void IOSetKeyStoreData(IOMemoryDescriptor * data); +#endif + #endif /* ! 
_IOKIT_KERNELINTERNAL_H */ diff --git a/iokit/Kernel/IOLib.cpp b/iokit/Kernel/IOLib.cpp index a5415e71c..50000299d 100644 --- a/iokit/Kernel/IOLib.cpp +++ b/iokit/Kernel/IOLib.cpp @@ -55,6 +55,24 @@ #include <sys/sysctl.h> #endif +#include "libkern/OSAtomic.h" +#include <libkern/c++/OSKext.h> +#include <IOKit/IOStatisticsPrivate.h> +#include <sys/msgbuf.h> + +#if IOKITSTATS + +#define IOStatisticsAlloc(type, size) \ +do { \ + IOStatistics::countAlloc(type, size); \ +} while (0) + +#else + +#define IOStatisticsAlloc(type, size) + +#endif /* IOKITSTATS */ + extern "C" { @@ -63,7 +81,7 @@ mach_timespec_t IOZeroTvalspec = { 0, 0 }; extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va); -int +extern int __doprnt( const char *fmt, va_list argp, @@ -71,7 +89,9 @@ __doprnt( void *arg, int radix); -extern void conslog_putc(char); +extern void cons_putc_locked(char); +extern void bsd_log_lock(void); +extern void bsd_log_unlock(void); /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -178,11 +198,13 @@ void * IOMalloc(vm_size_t size) void * address; address = (void *)kalloc(size); + if ( address ) { #if IOALLOCDEBUG - if (address) { debug_iomalloc_size += size; - } #endif + IOStatisticsAlloc(kIOStatisticsMalloc, size); + } + return address; } @@ -193,6 +215,7 @@ void IOFree(void * address, vm_size_t size) #if IOALLOCDEBUG debug_iomalloc_size -= size; #endif + IOStatisticsAlloc(kIOStatisticsFree, size); } } @@ -250,11 +273,12 @@ void * IOMallocAligned(vm_size_t size, vm_size_t alignment) assert(0 == (address & alignMask)); -#if IOALLOCDEBUG if( address) { +#if IOALLOCDEBUG debug_iomalloc_size += size; - } #endif + IOStatisticsAlloc(kIOStatisticsMallocAligned, size); + } return (void *) address; } @@ -289,6 +313,8 @@ void IOFreeAligned(void * address, vm_size_t size) #if IOALLOCDEBUG debug_iomalloc_size -= size; #endif + + IOStatisticsAlloc(kIOStatisticsFreeAligned, size); } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -325,7 +351,7 @@ IOKernelFreePhysical(mach_vm_address_t address, mach_vm_size_t size) mach_vm_address_t IOKernelAllocateWithPhysicalRestrict(mach_vm_size_t size, mach_vm_address_t maxPhys, - mach_vm_size_t alignment, bool contiguous) + mach_vm_size_t alignment, bool contiguous) { kern_return_t kr; mach_vm_address_t address; @@ -405,6 +431,7 @@ IOKernelAllocateWithPhysicalRestrict(mach_vm_size_t size, mach_vm_address_t maxP return (address); } + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ struct _IOMallocContiguousEntry @@ -463,6 +490,10 @@ void * IOMallocContiguous(vm_size_t size, vm_size_t alignment, } while (false); + if (address) { + IOStatisticsAlloc(kIOStatisticsMallocContiguous, size); + } + return (void *) address; } @@ -500,6 +531,8 @@ void IOFreeContiguous(void * _address, vm_size_t size) { IOKernelFreePhysical((mach_vm_address_t) address, size); } + + IOStatisticsAlloc(kIOStatisticsFreeContiguous, size); } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -603,10 +636,12 @@ void * IOMallocPageable(vm_size_t size, vm_size_t alignment) if( kIOReturnSuccess != kr) ref.address = 0; + if( ref.address) { #if IOALLOCDEBUG - if( ref.address) debug_iomallocpageable_size += round_page(size); #endif + IOStatisticsAlloc(kIOStatisticsMallocPageable, size); + } return( (void *) ref.address ); } @@ -640,6 +675,8 @@ void IOFreePageable(void * address, vm_size_t size) #if IOALLOCDEBUG debug_iomallocpageable_size -= round_page(size); #endif + + 
IOStatisticsAlloc(kIOStatisticsFreePageable, size); } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -727,23 +764,36 @@ void IOPause(unsigned nanoseconds) /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ -static void _iolog_putc(int ch, void *arg __unused) +static void _iolog_consputc(int ch, void *arg __unused) { - conslog_putc(ch); + cons_putc_locked(ch); +} + +static void _iolog_logputc(int ch, void *arg __unused) +{ + log_putc_locked(ch); } void IOLog(const char *format, ...) { - va_list ap; + va_list ap; - va_start(ap, format); - __doprnt(format, ap, _iolog_putc, NULL, 16); - va_end(ap); + va_start(ap, format); + IOLogv(format, ap); + va_end(ap); } void IOLogv(const char *format, va_list ap) { - __doprnt(format, ap, _iolog_putc, NULL, 16); + va_list ap2; + + va_copy(ap2, ap); + + bsd_log_lock(); + __doprnt(format, ap, _iolog_logputc, NULL, 16); + bsd_log_unlock(); + + __doprnt(format, ap2, _iolog_consputc, NULL, 16); } #if !__LP64__ diff --git a/iokit/Kernel/IOMemoryCursor.cpp b/iokit/Kernel/IOMemoryCursor.cpp index 36a15009d..99999991d 100644 --- a/iokit/Kernel/IOMemoryCursor.cpp +++ b/iokit/Kernel/IOMemoryCursor.cpp @@ -325,66 +325,3 @@ IOLittleMemoryCursor::initWithSpecification(IOPhysicalLength inMaxSegmentSize, inMaxTransferSize, inAlignment); } - -/************************* class IODBDMAMemoryCursor *************************/ - -#if defined(__ppc__) - -#include <IOKit/ppc/IODBDMA.h> - -#undef super -#define super IOMemoryCursor -OSDefineMetaClassAndStructors(IODBDMAMemoryCursor, IOMemoryCursor) - -void -IODBDMAMemoryCursor::outputSegment(PhysicalSegment inSegment, - void * inSegments, - UInt32 inSegmentIndex) -{ - IODBDMADescriptor *segment; - - segment = &((IODBDMADescriptor *) inSegments)[inSegmentIndex]; - - // Write location into address field - OSWriteSwapInt32((UInt32 *) segment, 4, inSegment.location); - - // Write count into 1st two bytes of operation field. - // DO NOT touch rest of operation field as it should contain a STOP command. 
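The IOLogv() rework above walks the same argument list twice, once into the log buffer and once to the console, which is exactly what va_copy exists for. Below is a minimal userspace sketch of that pattern, with vfprintf standing in for __doprnt and its two sinks. Strictly, C requires a va_end for every va_copy; the patched IOLogv() omits it, which is harmless where va_end expands to nothing, as on x86.

#include <cstdarg>
#include <cstdio>

// A va_list may only be traversed once, so feeding two sinks requires an
// independent copy -- the reason IOLogv() calls va_copy() before the
// second __doprnt().
static void logTwice(const char *fmt, ...)
{
    va_list ap, ap2;
    va_start(ap, fmt);
    va_copy(ap2, ap);             // second cursor over the same arguments

    vfprintf(stdout, fmt, ap);    // first sink (think: message buffer)
    vfprintf(stderr, fmt, ap2);   // second sink (think: console)

    va_end(ap2);
    va_end(ap);
}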
- OSWriteSwapInt16((UInt16 *) segment, 0, inSegment.length); -} - -/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ - -IODBDMAMemoryCursor * -IODBDMAMemoryCursor::withSpecification(IOPhysicalLength inMaxSegmentSize, - IOPhysicalLength inMaxTransferSize, - IOPhysicalLength inAlignment) -{ - IODBDMAMemoryCursor *me = new IODBDMAMemoryCursor; - - if (me && !me->initWithSpecification(inMaxSegmentSize, - inMaxTransferSize, - inAlignment)) - { - me->release(); - return 0; - } - - return me; -} - -/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ - -bool -IODBDMAMemoryCursor::initWithSpecification(IOPhysicalLength inMaxSegmentSize, - IOPhysicalLength inMaxTransferSize, - IOPhysicalLength inAlignment) -{ - return super::initWithSpecification(&IODBDMAMemoryCursor::outputSegment, - inMaxSegmentSize, - inMaxTransferSize, - inAlignment); -} - -#endif /* defined(__ppc__) */ - diff --git a/iokit/Kernel/IOMemoryDescriptor.cpp b/iokit/Kernel/IOMemoryDescriptor.cpp index a46021ede..0a11064a1 100644 --- a/iokit/Kernel/IOMemoryDescriptor.cpp +++ b/iokit/Kernel/IOMemoryDescriptor.cpp @@ -71,26 +71,8 @@ __BEGIN_DECLS #include <vm/vm_protos.h> extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va); -void ipc_port_release_send(ipc_port_t port); - -/* Copy between a physical page and a virtual address in the given vm_map */ -kern_return_t copypv(addr64_t source, addr64_t sink, unsigned int size, int which); - -memory_object_t -device_pager_setup( - memory_object_t pager, - uintptr_t device_handle, - vm_size_t size, - int flags); -void -device_pager_deallocate( - memory_object_t); -kern_return_t -device_pager_populate_object( - memory_object_t pager, - vm_object_offset_t offset, - ppnum_t phys_addr, - vm_size_t size); +extern void ipc_port_release_send(ipc_port_t port); + kern_return_t memory_object_iopl_request( ipc_port_t port, @@ -172,8 +154,8 @@ struct ioGMDData { // align arrays to 8 bytes so following macros work unsigned int fPad; #endif - upl_page_info_t fPageList[]; - ioPLBlock fBlocks[]; + upl_page_info_t fPageList[1]; /* variable length */ + ioPLBlock fBlocks[1]; /* variable length */ }; #define getDataP(osd) ((ioGMDData *) (osd)->getBytesNoCopy()) @@ -182,7 +164,7 @@ struct ioGMDData { (((osd)->getLength() - ((char *) getIOPLList(d) - (char *) d)) / sizeof(ioPLBlock)) #define getPageList(d) (&(d->fPageList[0])) #define computeDataSize(p, u) \ - (sizeof(ioGMDData) + p * sizeof(upl_page_info_t) + u * sizeof(ioPLBlock)) + (offsetof(ioGMDData, fPageList) + p * sizeof(upl_page_info_t) + u * sizeof(ioPLBlock)) /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -306,18 +288,7 @@ IOMemoryDescriptor::withPhysicalAddress( IOByteCount length, IODirection direction ) { -#ifdef __LP64__ return (IOMemoryDescriptor::withAddressRange(address, length, direction, TASK_NULL)); -#else /* !__LP64__ */ - IOGeneralMemoryDescriptor *self = new IOGeneralMemoryDescriptor; - if (self - && !self->initWithPhysicalAddress(address, length, direction)) { - self->release(); - return 0; - } - - return self; -#endif /* !__LP64__ */ } #ifndef __LP64__ @@ -500,9 +471,7 @@ void *IOGeneralMemoryDescriptor::createNamedEntry() memory_object_size_t actualSize = size; vm_prot_t prot = VM_PROT_READ; -#if CONFIG_EMBEDDED if (kIODirectionOut != (kIODirectionOutIn & _flags)) -#endif prot |= VM_PROT_WRITE; if (_memEntry) @@ -630,6 +599,17 @@ IOGeneralMemoryDescriptor::initWithOptions(void * buffers, { IOOptionBits type = options & kIOMemoryTypeMask; +#ifndef 
__LP64__ + if (task + && (kIOMemoryTypeVirtual == type) + && vm_map_is_64bit(get_task_map(task)) + && ((IOVirtualRange *) buffers)->address) + { + OSReportWithBacktrace("IOMemoryDescriptor: attempt to create 32b virtual in 64b task, use ::withAddressRange()"); + return false; + } +#endif /* !__LP64__ */ + // Grab the original MD's configuation data to initialse the // arguments to this function. if (kIOMemoryTypePersistentMD == type) { @@ -644,10 +624,10 @@ IOGeneralMemoryDescriptor::initWithOptions(void * buffers, return false; _memEntry = initData->fMemEntry; // Grab the new named entry - options = orig->_flags | kIOMemoryAsReference; - _singleRange = orig->_singleRange; // Initialise our range - buffers = &_singleRange; - count = 1; + options = orig->_flags & ~kIOMemoryAsReference; + type = options & kIOMemoryTypeMask; + buffers = orig->_ranges.v; + count = orig->_rangesCount; // Now grab the original task and whatever mapper was previously used task = orig->_task; @@ -665,16 +645,6 @@ IOGeneralMemoryDescriptor::initWithOptions(void * buffers, assert(task); if (!task) return false; - -#ifndef __LP64__ - if (vm_map_is_64bit(get_task_map(task)) - && (kIOMemoryTypeVirtual == type) - && ((IOVirtualRange *) buffers)->address) - { - OSReportWithBacktrace("IOMemoryDescriptor: attempt to create 32b virtual in 64b task, use ::withAddressRange()"); - return false; - } -#endif /* !__LP64__ */ break; case kIOMemoryTypePhysical: // Neither Physical nor UPL should have a task @@ -721,7 +691,10 @@ IOGeneralMemoryDescriptor::initWithOptions(void * buffers, } if (_memEntry) - { ipc_port_release_send((ipc_port_t) _memEntry); _memEntry = 0; } + { + ipc_port_release_send((ipc_port_t) _memEntry); + _memEntry = 0; + } if (_mappings) _mappings->flushCollection(); } @@ -782,7 +755,7 @@ IOGeneralMemoryDescriptor::initWithOptions(void * buffers, else if (!_memoryEntries->initWithCapacity(dataSize)) return false; - _memoryEntries->appendBytes(0, sizeof(ioGMDData)); + _memoryEntries->appendBytes(0, computeDataSize(0, 0)); dataP = getDataP(_memoryEntries); dataP->fMapper = mapper; dataP->fPageCnt = 0; @@ -794,6 +767,7 @@ IOGeneralMemoryDescriptor::initWithOptions(void * buffers, ioPLBlock iopl; iopl.fIOPL = (upl_t) buffers; + upl_set_referenced(iopl.fIOPL, true); upl_page_info_t *pageList = UPL_GET_INTERNAL_PAGE_LIST(iopl.fIOPL); if (upl_get_size(iopl.fIOPL) < (count + offset)) @@ -853,7 +827,8 @@ IOGeneralMemoryDescriptor::initWithOptions(void * buffers, case kIOMemoryTypeVirtual64: case kIOMemoryTypePhysical64: if (count == 1 - && (((IOAddressRange *) buffers)->address + ((IOAddressRange *) buffers)->length) <= 0x100000000ULL) { + && (((IOAddressRange *) buffers)->address + ((IOAddressRange *) buffers)->length) <= 0x100000000ULL + ) { if (kIOMemoryTypeVirtual64 == type) type = kIOMemoryTypeVirtual; else @@ -931,7 +906,7 @@ IOGeneralMemoryDescriptor::initWithOptions(void * buffers, else if (!_memoryEntries->initWithCapacity(dataSize)) return false; - _memoryEntries->appendBytes(0, sizeof(ioGMDData)); + _memoryEntries->appendBytes(0, computeDataSize(0, 0)); dataP = getDataP(_memoryEntries); dataP->fMapper = mapper; dataP->fPageCnt = _pages; @@ -1183,11 +1158,7 @@ IOGeneralMemoryDescriptor::getPreparationID( void ) if (kIOPreparationIDUnprepared == dataP->fPreparationID) { -#if defined(__ppc__ ) - dataP->fPreparationID = gIOMDPreparationID++; -#else dataP->fPreparationID = OSIncrementAtomic64(&gIOMDPreparationID); -#endif } return (dataP->fPreparationID); } @@ -1397,7 +1368,7 @@ IOReturn 
IOGeneralMemoryDescriptor::dmaCommandOperation(DMACommandOps op, void * IOPhysicalAddress pageAddr = pageList[pageInd].phys_addr; if (!pageAddr) { panic("!pageList phys_addr"); - } + } address = ptoa_64(pageAddr) + offset; @@ -1911,13 +1882,8 @@ IOReturn IOMemoryDescriptor::performOperation( IOOptionBits options, return (remaining ? kIOReturnUnderrun : kIOReturnSuccess); } -#if defined(__ppc__) || defined(__arm__) -extern vm_offset_t static_memory_end; -#define io_kernel_static_end static_memory_end -#else extern vm_offset_t first_avail; #define io_kernel_static_end first_avail -#endif static kern_return_t io_get_kernel_static_upl( @@ -2118,12 +2084,16 @@ IOReturn IOGeneralMemoryDescriptor::wireVirtual(IODirection forDirection) iopl.fIOMDOffset = mdOffset; iopl.fPageInfo = pageIndex; +#if 0 + // used to remove the upl for auto prepares here, for some errant code + // that freed memory before the descriptor pointing at it if ((_flags & kIOMemoryAutoPrepare) && iopl.fIOPL) { upl_commit(iopl.fIOPL, 0, 0); upl_deallocate(iopl.fIOPL); iopl.fIOPL = 0; } +#endif if (!_memoryEntries->appendBytes(&iopl, sizeof(iopl))) { // Clean up partial created and unsaved iopl @@ -2169,7 +2139,7 @@ abortExit: upl_deallocate(ioplList[range].fIOPL); } } - (void) _memoryEntries->initWithBytes(dataP, sizeof(ioGMDData)); // == setLength() + (void) _memoryEntries->initWithBytes(dataP, computeDataSize(0, 0)); // == setLength() if (mapper && mapBase) mapper->iovmFree(mapBase, _pages); @@ -2231,7 +2201,7 @@ IOReturn IOGeneralMemoryDescriptor::prepare(IODirection forDirection) * issued; the prepare() and complete() must occur in pairs, before * before and after an I/O transfer involving pageable memory. */ - + IOReturn IOGeneralMemoryDescriptor::complete(IODirection /* forDirection */) { IOOptionBits type = _flags & kIOMemoryTypeMask; @@ -2273,8 +2243,11 @@ IOReturn IOGeneralMemoryDescriptor::complete(IODirection /* forDirection */) upl_commit(ioplList[ind].fIOPL, 0, 0); upl_deallocate(ioplList[ind].fIOPL); } + } else if (kIOMemoryTypeUPL == type) { + upl_set_referenced(ioplList[0].fIOPL, false); } - (void) _memoryEntries->initWithBytes(dataP, sizeof(ioGMDData)); // == setLength() + + (void) _memoryEntries->initWithBytes(dataP, computeDataSize(0, 0)); // == setLength() dataP->fPreparationID = kIOPreparationIDUnprepared; } @@ -3328,13 +3301,12 @@ IOMemoryMap * IOMemoryMap::copyCompatible( retain(); if( (fLength == _length) && (!_offset)) { - newMapping->release(); newMapping = this; } else { newMapping->fSuperMap = this; - newMapping->fOffset = _offset; + newMapping->fOffset = fOffset + _offset; newMapping->fAddress = fAddress + _offset; } @@ -3608,7 +3580,14 @@ IOMemoryMap * IOMemoryDescriptor::makeMapping( iter->release(); } if (result || (options & kIOMapReference)) + { + if (result != mapping) + { + mapping->release(); + mapping = NULL; + } continue; + } } if (!mapDesc) diff --git a/iokit/Kernel/IONVRAM.cpp b/iokit/Kernel/IONVRAM.cpp index 4c51e4457..85ac1a2ec 100644 --- a/iokit/Kernel/IONVRAM.cpp +++ b/iokit/Kernel/IONVRAM.cpp @@ -39,7 +39,6 @@ #define kIONVRAMPrivilege kIOClientPrivilegeAdministrator //#define kIONVRAMPrivilege kIOClientPrivilegeLocalUser - OSDefineMetaClassAndStructors(IODTNVRAM, IOService); bool IODTNVRAM::init(IORegistryEntry *old, const IORegistryPlane *plane) @@ -205,6 +204,9 @@ void IODTNVRAM::registerNVRAMController(IONVRAMController *nvram) _piImage = _nvramImage + _piPartitionOffset; } + _lastDeviceSync = 0; + _freshInterval = TRUE; // we will allow sync() even before the first 15 minutes 
have passed. + + initOFVariables(); } @@ -229,27 +231,31 @@ bool IODTNVRAM::serializeProperties(OSSerialize *s) const OSDictionary *dict = 0, *tmpDict = 0; OSCollectionIterator *iter = 0; - if (_ofDict == 0) return false; - // Verify permissions. hasPrivilege = (kIOReturnSuccess == IOUserClient::clientHasPrivilege(current_task(), kIONVRAMPrivilege)); tmpDict = OSDictionary::withCapacity(1); if (tmpDict == 0) return false; + + if (_ofDict == 0) { + /* No nvram. Return an empty dictionary. */ + dict = tmpDict; + } else { + /* Copy properties with client privilege. */ + iter = OSCollectionIterator::withCollection(_ofDict); + if (iter == 0) return false; - iter = OSCollectionIterator::withCollection(_ofDict); - if (iter == 0) return false; - - while (1) { - key = OSDynamicCast(OSSymbol, iter->getNextObject()); - if (key == 0) break; + while (1) { + key = OSDynamicCast(OSSymbol, iter->getNextObject()); + if (key == 0) break; - variablePerm = getOFVariablePerm(key); - if ((hasPrivilege || (variablePerm != kOFVariablePermRootOnly)) && - ( ! (variablePerm == kOFVariablePermKernelOnly && current_task() != kernel_task) )) { - tmpDict->setObject(key, _ofDict->getObject(key)); + variablePerm = getOFVariablePerm(key); + if ((hasPrivilege || (variablePerm != kOFVariablePermRootOnly)) && + ( ! (variablePerm == kOFVariablePermKernelOnly && current_task() != kernel_task) )) { + tmpDict->setObject(key, _ofDict->getObject(key)); + } + dict = tmpDict; } - dict = tmpDict; } result = dict->serialize(s); @@ -412,18 +418,32 @@ IOReturn IODTNVRAM::setProperties(OSObject *properties) if (object == 0) continue; if (key->isEqualTo(kIONVRAMDeletePropertyKey)) { - tmpStr = OSDynamicCast(OSString, object); - if (tmpStr != 0) { - key = OSSymbol::withString(tmpStr); - removeProperty(key); - key->release(); - result = true; - } else { - result = false; - } - } else { - result = setProperty(key, object); + tmpStr = OSDynamicCast(OSString, object); + if (tmpStr != 0) { + key = OSSymbol::withString(tmpStr); + removeProperty(key); + key->release(); + result = true; + } else { + result = false; + } + } else if(key->isEqualTo(kIONVRAMSyncNowPropertyKey)) { + tmpStr = OSDynamicCast(OSString, object); + if (tmpStr != 0) { + + result = true; // We are not going to guarantee sync, this is best effort + + if(safeToSync()) + sync(); + + } else { + result = false; + } + } + else { + result = setProperty(key, object); } + } iter->release(); @@ -1656,3 +1676,26 @@ IOReturn IODTNVRAM::writeNVRAMPropertyType1(IORegistryEntry *entry, return ok ? kIOReturnSuccess : kIOReturnNoMemory; } + +bool IODTNVRAM::safeToSync(void) +{ + AbsoluteTime delta; + UInt64 delta_ns; + SInt32 delta_secs; + + // delta interval went by + clock_get_uptime(&delta); + + // Figure it in seconds. 
+ absolutetime_to_nanoseconds(delta, &delta_ns); + delta_secs = (SInt32)(delta_ns / NSEC_PER_SEC); + + if ((delta_secs > (_lastDeviceSync + MIN_SYNC_NOW_INTERVAL)) || _freshInterval) + { + _lastDeviceSync = delta_secs; + _freshInterval = FALSE; + return TRUE; + } + + return FALSE; +} diff --git a/iokit/Kernel/IOPMPowerSource.cpp b/iokit/Kernel/IOPMPowerSource.cpp index e6a11fc07..614f4caa3 100644 --- a/iokit/Kernel/IOPMPowerSource.cpp +++ b/iokit/Kernel/IOPMPowerSource.cpp @@ -165,7 +165,6 @@ void IOPMPowerSource::updateStatus (void) void IOPMPowerSource::setPSProperty(const OSSymbol *key, OSObject *val) { OSObject *lastVal; - OSNumber *newNumVal; if(!key || !val) return; @@ -175,19 +174,12 @@ void IOPMPowerSource::setPSProperty(const OSSymbol *key, OSObject *val) // Otherwise, just compare pointers. if( (lastVal = properties->getObject(key)) ) { - newNumVal = OSDynamicCast(OSNumber, val); - if(newNumVal) { - if(newNumVal->isEqualTo(lastVal)) { - // settings didn't change - } else { - // num val is not equal to last val - settingsChangedSinceUpdate = true; - } - } else { - // pointer compare as last resort - if(lastVal != val) - settingsChangedSinceUpdate = true; - } + if(val->isEqualTo(lastVal)) { + // settings didn't change + } else { + // num val is not equal to last val + settingsChangedSinceUpdate = true; + } } else { // new setting; no last value settingsChangedSinceUpdate = true; diff --git a/iokit/Kernel/IOPMrootDomain.cpp b/iokit/Kernel/IOPMrootDomain.cpp index 3ccda1a1b..e6146bb24 100644 --- a/iokit/Kernel/IOPMrootDomain.cpp +++ b/iokit/Kernel/IOPMrootDomain.cpp @@ -27,6 +27,7 @@ */ #include <libkern/c++/OSKext.h> #include <libkern/c++/OSMetaClass.h> +#include <libkern/OSAtomic.h> #include <libkern/OSDebug.h> #include <IOKit/IOWorkLoop.h> #include <IOKit/IOCommandGate.h> @@ -43,7 +44,6 @@ #include "IOKit/pwr_mgt/IOPowerConnection.h" #include "IOPMPowerStateQueue.h" #include <IOKit/IOCatalogue.h> -#include <IOKit/IOCommand.h> // IOServicePMPrivate #if HIBERNATION #include <IOKit/IOHibernatePrivate.h> #endif @@ -64,64 +64,98 @@ __END_DECLS #endif #define kIOPMrootDomainClass "IOPMrootDomain" +#define LOG_PREFIX "PMRD: " -#define LOG_PREFIX "PMRD: " +#define MSG(x...) \ + do { kprintf(LOG_PREFIX x); IOLog(x); } while (false) -#define LOG(x...) do { \ - kprintf(LOG_PREFIX x); IOLog(x); } while (false) - -#define KLOG(x...) do { \ - kprintf(LOG_PREFIX x); } while (false) +#define LOG(x...) \ + do { kprintf(LOG_PREFIX x); } while (false) #define DLOG(x...) do { \ if (kIOLogPMRootDomain & gIOKitDebug) \ kprintf(LOG_PREFIX x); } while (false) +#define _LOG(x...) 
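The MSG/LOG/DLOG definitions above all use the do { ... } while (false) wrapper so that a macro expanding to several statements still behaves as a single statement. A self-contained illustration of why the wrapper matters; sinkA and sinkB are hypothetical stand-ins for kprintf and IOLog.

#include <cstdio>

static void sinkA(const char *s) { std::fputs(s, stdout); }  // cf. kprintf
static void sinkB(const char *s) { std::fputs(s, stderr); }  // cf. IOLog

// Without the do/while wrapper, only the first call would be governed by
// the if below, and the trailing semicolon would break an if/else chain.
#define MSG2(s) do { sinkA(s); sinkB(s); } while (false)

int main()
{
    bool verbose = true;
    if (verbose)
        MSG2("both sinks\n");     // expands safely as one statement
    else
        sinkA("one sink\n");
    return 0;
}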
+ #define CHECK_THREAD_CONTEXT #ifdef CHECK_THREAD_CONTEXT static IOWorkLoop * gIOPMWorkLoop = 0; -#define ASSERT_GATED(x) \ +#define ASSERT_GATED() \ do { \ if (gIOPMWorkLoop && gIOPMWorkLoop->inGate() != true) { \ - panic("RootDomain: not inside PM gate"); \ + panic("RootDomain: not inside PM gate"); \ } \ } while(false) #else -#define ASSERT_GATED(x) +#define ASSERT_GATED() #endif /* CHECK_THREAD_CONTEXT */ +#define CAP_LOSS(c) \ + (((_pendingCapability & (c)) == 0) && \ + ((_currentCapability & (c)) != 0)) + +#define CAP_GAIN(c) \ + (((_currentCapability & (c)) == 0) && \ + ((_pendingCapability & (c)) != 0)) + +#define CAP_CHANGE(c) \ + (((_currentCapability ^ _pendingCapability) & (c)) != 0) + +#define CAP_CURRENT(c) \ + ((_currentCapability & (c)) != 0) + +#define CAP_HIGHEST(c) \ + ((_highestCapability & (c)) != 0) + +#define DARK_TO_FULL_EVALUATE_CLAMSHELL 0 + // Event types for IOPMPowerStateQueue::submitPowerEvent() enum { - kPowerEventFeatureChanged = 1, - kPowerEventReceivedPowerNotification, - kPowerEventSystemBootCompleted, - kPowerEventSystemShutdown, - kPowerEventUserDisabledSleep, - kPowerEventConfigdRegisteredInterest, - kPowerEventAggressivenessChanged, - kPowerEventAssertionCreate, // 8 - kPowerEventAssertionRelease, // 9 - kPowerEventAssertionSetLevel // 10 + kPowerEventFeatureChanged = 1, // 1 + kPowerEventReceivedPowerNotification, // 2 + kPowerEventSystemBootCompleted, // 3 + kPowerEventSystemShutdown, // 4 + kPowerEventUserDisabledSleep, // 5 + kPowerEventRegisterSystemCapabilityClient, // 6 + kPowerEventRegisterKernelCapabilityClient, // 7 + kPowerEventPolicyStimulus, // 8 + kPowerEventAssertionCreate, // 9 + kPowerEventAssertionRelease, // 10 + kPowerEventAssertionSetLevel, // 11 + kPowerEventQueueSleepWakeUUID, // 12 + kPowerEventPublishSleepWakeUUID // 13 +}; + +// For evaluatePolicy() +// List of stimuli that affect the root domain policy. 
+enum { + kStimulusDisplayWranglerSleep, // 0 + kStimulusDisplayWranglerWake, // 1 + kStimulusAggressivenessChanged, // 2 + kStimulusDemandSystemSleep, // 3 + kStimulusAllowSystemSleepChanged, // 4 + kStimulusDarkWakeActivityTickle, // 5 + kStimulusDarkWakeEntry, // 6 + kStimulusDarkWakeReentry, // 7 + kStimulusDarkWakeEvaluate // 8 }; extern "C" { IOReturn OSKextSystemSleepOrWake( UInt32 ); } -extern const IORegistryPlane * gIOPowerPlane; - static void idleSleepTimerExpired( thread_call_param_t, thread_call_param_t ); -static void wakeupClamshellTimerExpired( thread_call_param_t us, thread_call_param_t ); static void notifySystemShutdown( IOService * root, unsigned long event ); -static bool clientMessageFilter( OSObject * object, void * context ); -static void handleAggressivesFunction( thread_call_param_t param1, thread_call_param_t param2 ); +static void handleAggressivesFunction( thread_call_param_t, thread_call_param_t ); static void pmEventTimeStamp(uint64_t *recordTS); // "IOPMSetSleepSupported" callPlatformFunction name static const OSSymbol *sleepSupportedPEFunction = NULL; static const OSSymbol *sleepMessagePEFunction = NULL; -#define kIOSleepSupportedKey "IOSleepSupported" +#define kIOSleepSupportedKey "IOSleepSupported" +#define kIOPMSystemCapabilitiesKey "System Capabilities" #define kRD_AllPowerSources (kIOPMSupportedOnAC \ | kIOPMSupportedOnBatt \ @@ -137,91 +171,36 @@ enum #define kLocalEvalClamshellCommand (1 << 15) enum { - OFF_STATE = 0, - RESTART_STATE = 1, - SLEEP_STATE = 2, - DOZE_STATE = 3, - ON_STATE = 4, + OFF_STATE = 0, + RESTART_STATE = 1, + SLEEP_STATE = 2, + ON_STATE = 3, NUM_POWER_STATES }; #define ON_POWER kIOPMPowerOn #define RESTART_POWER kIOPMRestart #define SLEEP_POWER kIOPMAuxPowerOn -#define DOZE_POWER kIOPMDoze static IOPMPowerState ourPowerStates[NUM_POWER_STATES] = { {1, 0, 0, 0, 0,0,0,0,0,0,0,0}, {1, kIOPMRestartCapability, kIOPMRestart, RESTART_POWER, 0,0,0,0,0,0,0,0}, {1, kIOPMSleepCapability, kIOPMSleep, SLEEP_POWER, 0,0,0,0,0,0,0,0}, - {1, kIOPMDoze, kIOPMDoze, DOZE_POWER, 0,0,0,0,0,0,0,0}, {1, kIOPMPowerOn, kIOPMPowerOn, ON_POWER, 0,0,0,0,0,0,0,0} }; -// Clients eligible to receive system power messages. -enum { - kMessageClientNone = 0, - kMessageClientAll, - kMessageClientConfigd -}; - -// Run states (R-state) defined within the ON power state. -enum { - kRStateNormal = 0, - kRStateDark, - kRStateMaintenance, - kRStateCount -}; - -// IOService in power plane can be tagged with following flags. -enum { - kServiceFlagGraphics = 0x01, - kServiceFlagNoPowerUp = 0x02, - kServiceFlagTopLevelPCI = 0x04 -}; - -// Flags describing R-state features and capabilities. -enum { - kRStateFlagNone = 0x00000000, - kRStateFlagSuppressGraphics = 0x00000001, - kRStateFlagSuppressMessages = 0x00000002, - kRStateFlagSuppressPCICheck = 0x00000004, - kRStateFlagDisableIdleSleep = 0x00000008 -}; - -#if ROOT_DOMAIN_RUN_STATES - -// Table of flags for each R-state. 
-static uint32_t gRStateFlags[ kRStateCount ] = -{ - kRStateFlagNone, - - /* Dark wake */ - kRStateFlagSuppressGraphics, - - /* Maintenance wake */ - kRStateFlagSuppressGraphics | - kRStateFlagSuppressMessages | - kRStateFlagSuppressPCICheck | - kRStateFlagDisableIdleSleep -}; - -static IONotifier * gConfigdNotifier = 0; - -#define kIOPMRootDomainRunStateKey "Run State" #define kIOPMRootDomainWakeTypeMaintenance "Maintenance" #define kIOPMRootDomainWakeTypeSleepTimer "SleepTimer" #define kIOPMrootDomainWakeTypeLowBattery "LowBattery" - -#endif /* ROOT_DOMAIN_RUN_STATES */ +#define kIOPMRootDomainWakeTypeUser "User" +#define kIOPMRootDomainWakeTypeAlarm "Alarm" +#define kIOPMRootDomainWakeTypeNetwork "Network" // Special interest that entitles the interested client from receiving -// all system messages. Used by pmconfigd to support maintenance wake. +// all system messages. Only used by powerd. // -#define kIOPMPrivilegedPowerInterest "IOPMPrivilegedPowerInterest" - -static IONotifier * gSysPowerDownNotifier = 0; +#define kIOPMSystemCapabilityInterest "IOPMSystemCapabilityInterest" /* * Aggressiveness @@ -231,8 +210,6 @@ static IONotifier * gSysPowerDownNotifier = 0; #define kAggressivesMinValue 1 -static uint32_t gAggressivesState = 0; - enum { kAggressivesStateBusy = 0x01, kAggressivesStateQuickSpindown = 0x02 @@ -269,14 +246,28 @@ enum { enum { kAggressivesRecordFlagModified = 0x00000001, kAggressivesRecordFlagMinValue = 0x00000002 - +}; + +// gDarkWakeFlags +enum { + kDarkWakeFlagHIDTickleEarly = 0x01, // hid tickle before gfx suppression + kDarkWakeFlagHIDTickleLate = 0x02, // hid tickle after gfx suppression + kDarkWakeFlagHIDTickleNone = 0x03, // hid tickle is not posted + kDarkWakeFlagHIDTickleMask = 0x03, + kDarkWakeFlagIgnoreDiskIOInDark = 0x04, // ignore disk idle in DW + kDarkWakeFlagIgnoreDiskIOAlways = 0x08, // always ignore disk idle + kDarkWakeFlagIgnoreDiskIOMask = 0x0C, + kDarkWakeFlagAlarmIsDark = 0x0100 }; static IOPMrootDomain * gRootDomain; +static IONotifier * gSysPowerDownNotifier = 0; static UInt32 gSleepOrShutdownPending = 0; static UInt32 gWillShutdown = 0; -static uint32_t gMessageClientType = kMessageClientNone; +static UInt32 gPagingOff = 0; static UInt32 gSleepWakeUUIDIsSet = false; +static uint32_t gAggressivesState = 0; +static uint32_t gDarkWakeFlags = kDarkWakeFlagHIDTickleNone; struct timeval gIOLastSleepTime; struct timeval gIOLastWakeTime; @@ -293,29 +284,173 @@ const OSSymbol *gIOPMStatsApplicationResponseTimedOut; const OSSymbol *gIOPMStatsApplicationResponseCancel; const OSSymbol *gIOPMStatsApplicationResponseSlow; +/* + * PMSettingHandle + * Opaque handle passed to clients of registerPMSettingController() + */ +class PMSettingHandle : public OSObject +{ + OSDeclareFinalStructors( PMSettingHandle ) + friend class PMSettingObject; + +private: + PMSettingObject *pmso; + void free(void); +}; + +/* + * PMSettingObject + * Internal object to track each PM setting controller + */ class PMSettingObject : public OSObject { - OSDeclareFinalStructors(PMSettingObject) + OSDeclareFinalStructors( PMSettingObject ) + friend class IOPMrootDomain; + private: + queue_head_t calloutQueue; + thread_t waitThread; IOPMrootDomain *parent; + PMSettingHandle *pmsh; IOPMSettingControllerCallback func; OSObject *target; uintptr_t refcon; uint32_t *publishedFeatureID; - int releaseAtCount; + uint32_t settingCount; + bool disabled; + + void free(void); + public: static PMSettingObject *pmSettingObject( - IOPMrootDomain *parent_arg, + IOPMrootDomain *parent_arg, 
IOPMSettingControllerCallback handler_arg, - OSObject *target_arg, - uintptr_t refcon_arg, - uint32_t supportedPowerSources, - const OSSymbol *settings[]); + OSObject *target_arg, + uintptr_t refcon_arg, + uint32_t supportedPowerSources, + const OSSymbol *settings[], + OSObject **handle_obj); + + void dispatchPMSetting(const OSSymbol *type, OSObject *object); + void clientHandleFreed(void); +}; + +struct PMSettingCallEntry { + queue_chain_t link; + thread_t thread; +}; - void setPMSetting(const OSSymbol *type, OSObject *obj); +#define PMSETTING_LOCK() IOLockLock(settingsCtrlLock) +#define PMSETTING_UNLOCK() IOLockUnlock(settingsCtrlLock) +#define PMSETTING_WAIT(p) IOLockSleep(settingsCtrlLock, p, THREAD_UNINT) +#define PMSETTING_WAKEUP(p) IOLockWakeup(settingsCtrlLock, p, true) - void taggedRelease(const void *tag, const int when) const; - void free(void); +//********************************************************************************* +//********************************************************************************* +//********************************************************************************* + +/* @class IOPMTimeline + * @abstract Tracks & records PM activity. + * @discussion Intended for use only as a helper-class to IOPMrootDomain. + * Do not subclass or directly invoke IOPMTimeline + */ +class IOPMTimeline : public OSObject +{ + OSDeclareDefaultStructors( IOPMTimeline ); + +public: + static IOPMTimeline* timeline(IOPMrootDomain *root_domain); + + bool setProperties(OSDictionary *d); + OSDictionary *copyInfoDictionary(void); + + IOReturn recordSystemPowerEvent( PMEventDetails *details ); + + IOReturn recordDetailedPowerEvent( PMEventDetails *details ); + + IOMemoryDescriptor *getPMTraceMemoryDescriptor(); + + uint32_t getNumEventsLoggedThisPeriod(); + void setNumEventsLoggedThisPeriod(uint32_t newCount); + bool isSleepCycleInProgress(); + void setSleepCycleInProgressFlag(bool flag); +private: + bool init(void); + void free(void); + + void setEventsTrackedCount(uint32_t newTracked); + void setEventsRecordingLevel(uint32_t eventsTrackedBits); + static uint32_t _atomicIndexIncrement(uint32_t *index, uint32_t limit); + + enum { + kPMTimelineRecordTardyDrivers = 1 << 0, + kPMTmielineRecordSystemEvents = 1 << 1, + kPMTimelineRecordAllDrivers = 1 << 2, + kPMTimelineRecordOff = 0, + kPMTimelineRecordDefault = 3, + kPMTimelineRecordDebug = 7 + }; + + // eventsRecordingLevel is a bitfield defining which PM driver events will get logged + // into the PM buffer. + uint32_t eventsRecordingLevel; + + // pmTraceMemoryDescriptor represents the memory block that IOPMTimeLine records PM trace points into. + IOBufferMemoryDescriptor *pmTraceMemoryDescriptor; + + // Pointer to starting address in pmTraceMemoryDescriptor + IOPMSystemEventRecord *traceBuffer; + IOPMTraceBufferHeader *hdr; + + uint16_t systemState; + + IOLock *logLock; + IOPMrootDomain *owner; + + uint32_t numEventsLoggedThisPeriod; + bool sleepCycleInProgress; +}; + +OSDefineMetaClassAndStructors( IOPMTimeline, OSObject ) + +/* + * PMTraceWorker + * Internal helper object for logging trace points to RTC + * IOPMrootDomain and only IOPMrootDomain should instantiate + * exactly one of these. 
+ */ + +typedef void (*IOPMTracePointHandler)( + void * target, uint32_t code, uint32_t data ); + +class PMTraceWorker : public OSObject +{ + OSDeclareDefaultStructors(PMTraceWorker) +public: + typedef enum { kPowerChangeStart, kPowerChangeCompleted } change_t; + + static PMTraceWorker *tracer( IOPMrootDomain * ); + void tracePCIPowerChange(change_t, IOService *, uint32_t, uint32_t); + void tracePoint(uint8_t phase); + void tracePoint(uint8_t phase, uint8_t data8); + void traceDetail(uint32_t detail); + void traceLoginWindowPhase(uint8_t phase); + int recordTopLevelPCIDevice(IOService *); + void RTC_TRACE(void); + virtual bool serialize(OSSerialize *s) const; + + IOPMTracePointHandler tracePointHandler; + void * tracePointTarget; +private: + IOPMrootDomain *owner; + IOLock *pciMappingLock; + OSArray *pciDeviceBitMappings; + + uint8_t addedToRegistry; + uint8_t tracePhase; + uint8_t loginWindowPhase; + uint8_t traceData8; + uint32_t traceData32; }; /* @@ -327,7 +462,7 @@ class PMAssertionsTracker : public OSObject OSDeclareFinalStructors(PMAssertionsTracker) public: static PMAssertionsTracker *pmAssertionsTracker( IOPMrootDomain * ); - + IOReturn createAssertion(IOPMDriverAssertionType, IOPMDriverAssertionLevel, IOService *, const char *, IOPMDriverAssertionID *); IOReturn releaseAssertion(IOPMDriverAssertionID); IOReturn setAssertionLevel(IOPMDriverAssertionID, IOPMDriverAssertionLevel); @@ -353,7 +488,7 @@ private: IOService *ownerService; IOPMDriverAssertionLevel level; } PMAssertStruct; - + uint32_t tabulateProducerCount; uint32_t tabulateConsumerCount; @@ -363,52 +498,14 @@ private: IOPMrootDomain *owner; OSArray *assertionsArray; IOLock *assertionsArrayLock; - IOPMDriverAssertionID issuingUniqueID; + IOPMDriverAssertionID issuingUniqueID __attribute__((aligned(8))); /* aligned for atomic access */ IOPMDriverAssertionType assertionsKernel; IOPMDriverAssertionType assertionsUser; IOPMDriverAssertionType assertionsCombined; }; - + OSDefineMetaClassAndFinalStructors(PMAssertionsTracker, OSObject); - -/* - * PMTraceWorker - * Internal helper object for logging trace points to RTC - * IOPMrootDomain and only IOPMrootDomain should instantiate - * exactly one of these. - */ - -typedef void (*IOPMTracePointHandler)( - void * target, uint32_t code, uint32_t data ); - -class PMTraceWorker : public OSObject -{ - OSDeclareDefaultStructors(PMTraceWorker) -public: - typedef enum { kPowerChangeStart, kPowerChangeCompleted } change_t; - - static PMTraceWorker *tracer( IOPMrootDomain * ); - void tracePCIPowerChange(change_t, IOService *, uint32_t, uint32_t); - void tracePoint(uint8_t phase); - void traceLoginWindowPhase(uint8_t phase); - int recordTopLevelPCIDevice(IOService *); - void RTC_TRACE(void); - virtual bool serialize(OSSerialize *s) const; - - IOPMTracePointHandler tracePointHandler; - void * tracePointTarget; -private: - IOPMrootDomain *owner; - IOLock *pciMappingLock; - OSArray *pciDeviceBitMappings; - - uint8_t tracePhase; - uint8_t loginWindowPhase; - uint8_t addedToRegistry; - uint8_t unused0; - uint32_t pciBusyBitMask; -}; - + /* * PMHaltWorker * Internal helper object for Shutdown/Restart notifications. 
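PMAssertionsTracker above tags issuingUniqueID with __attribute__((aligned(8))) because 64-bit atomic increments need naturally aligned storage, notably on 32-bit kernels. A sketch of the same ID-issuing idea in plain C++, where std::atomic provides both the alignment and the OSIncrementAtomic64-style operation; the type names are illustrative, not from the patch.

#include <atomic>
#include <cstdint>

struct AssertionIDSource {
    // alignas(8) mirrors the aligned(8) attribute in the patch; a 64-bit
    // atomic add on misaligned storage is not guaranteed to be atomic.
    alignas(8) std::atomic<std::uint64_t> nextID{1};

    std::uint64_t issue() {
        // fetch_add returns the prior value, so every caller gets a
        // distinct, monotonically increasing ID.
        return nextID.fetch_add(1, std::memory_order_relaxed);
    }
};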
@@ -441,6 +538,19 @@ OSDefineMetaClassAndFinalStructors( PMHaltWorker, OSObject ) #define super IOService OSDefineMetaClassAndFinalStructors(IOPMrootDomain, IOService) +static void IOPMRootDomainWillShutdown(void) +{ + if (OSCompareAndSwap(0, 1, &gWillShutdown)) + { + OSKext::willShutdown(); + for (int i = 0; i < 100; i++) + { + if (OSCompareAndSwap(0, 1, &gSleepOrShutdownPending)) break; + IOSleep( 100 ); + } + } +} + extern "C" { IONotifier * registerSleepWakeInterest(IOServiceInterestHandler handler, void * self, void * ref) @@ -473,61 +583,60 @@ extern "C" return gRootDomain->shutdownSystem(); } - void IOSystemShutdownNotification ( void ) + void IOSystemShutdownNotification(void) { - if (OSCompareAndSwap(0, 1, &gWillShutdown)) - { - OSKext::willShutdown(); - for (int i = 0; i < 100; i++) - { - if (OSCompareAndSwap(0, 1, &gSleepOrShutdownPending)) break; - IOSleep( 100 ); - } - } + IOPMRootDomainWillShutdown(); + if (OSCompareAndSwap(0, 1, &gPagingOff)) + { +#if !CONFIG_EMBEDDED + gRootDomain->handlePlatformHaltRestart(kPEPagingOff); +#endif + } } int sync_internal(void); } /* -A device is always in the highest power state which satisfies its driver, its policy-maker, and any power domain -children it has, but within the constraint of the power state provided by its parent. The driver expresses its desire by -calling changePowerStateTo(), the policy-maker expresses its desire by calling changePowerStateToPriv(), and the children -express their desires by calling requestPowerDomainState(). - -The Root Power Domain owns the policy for idle and demand sleep and doze for the system. It is a power-managed IOService just -like the others in the system. It implements several power states which correspond to what we see as Sleep, Doze, etc. - -The sleep/doze policy is as follows: -Sleep and Doze are prevented if the case is open so that nobody will think the machine is off and plug/unplug cards. -Sleep and Doze are prevented if the sleep timeout slider in the preferences panel is at zero. -The system cannot Sleep, but can Doze if some object in the tree is in a power state marked kIOPMPreventSystemSleep. - -These three conditions are enforced using the "driver clamp" by calling changePowerStateTo(). For example, if the case is -opened, changePowerStateTo(ON_STATE) is called to hold the system on regardless of the desires of the children of the root or -the state of the other clamp. - -Demand Sleep/Doze is initiated by pressing the front panel power button, closing the clamshell, or selecting the menu item. -In this case the root's parent actually initiates the power state change so that the root has no choice and does not give -applications the opportunity to veto the change. - -Idle Sleep/Doze occurs if no objects in the tree are in a state marked kIOPMPreventIdleSleep. When this is true, the root's -children are not holding the root on, so it sets the "policy-maker clamp" by calling changePowerStateToPriv(ON_STATE) -to hold itself on until the sleep timer expires. This timer is set for the difference between the sleep timeout slider and -the larger of the display dim timeout slider and the disk spindown timeout slider in the Preferences panel. For example, if -the system is set to sleep after thirty idle minutes, and the display and disk are set to sleep after five idle minutes, -when there is no longer an object in the tree holding the system out of Idle Sleep (via kIOPMPreventIdleSleep), the root -sets its timer for 25 minutes (30 - 5). 
When the timer expires, it releases its clamp and now nothing is holding it awake, -so it falls asleep. - -Demand sleep is prevented when the system is booting. When preferences are transmitted by the loginwindow at the end of -boot, a flag is cleared, and this allows subsequent Demand Sleep. - -The system will not Sleep, but will Doze if some object calls setSleepSupported(kPCICantSleep) during a power change to the sleep state (this can be done by the PCI Aux Power Supply drivers, Slots99, MacRISC299, etc.). This is not enforced with -a clamp, but sets a flag which is noticed before actually sleeping the kernel. If the flag is set, the root steps up -one power state from Sleep to Doze, and any objects in the tree for which this is relevent will act appropriately (USB and -ADB will turn on again so that they can wake the system out of Doze (keyboard/mouse activity will cause the Display Wrangler -to be tickled)). +A device is always in the highest power state which satisfies its driver, +its policy-maker, and any power children it has, but within the constraint +of the power state provided by its parent. The driver expresses its desire by +calling changePowerStateTo(), the policy-maker expresses its desire by calling +changePowerStateToPriv(), and the children express their desires by calling +requestPowerDomainState(). + +The Root Power Domain owns the policy for idle and demand sleep for the system. +It is a power-managed IOService just like the others in the system. +It implements several power states which map to what we see as Sleep and On. + +The sleep policy is as follows: +1. Sleep is prevented if the case is open so that nobody will think the machine + is off and plug/unplug cards. +2. Sleep is prevented if the sleep timeout slider in the prefs panel is zero. +3. System cannot Sleep if some object in the tree is in a power state marked + kIOPMPreventSystemSleep. + +These three conditions are enforced using the "driver clamp" by calling +changePowerStateTo(). For example, if the case is opened, +changePowerStateTo(ON_STATE) is called to hold the system on regardless +of the desires of the children of the root or the state of the other clamp. + +Demand Sleep is initiated by pressing the front panel power button, closing +the clamshell, or selecting the menu item. In this case the root's parent +actually initiates the power state change so that the root domain has no +choice and does not give applications the opportunity to veto the change. + +Idle Sleep occurs if no objects in the tree are in a state marked +kIOPMPreventIdleSleep. When this is true, the root's children are not holding +the root on, so it sets the "policy-maker clamp" by calling +changePowerStateToPriv(ON_STATE) to hold itself on until the sleep timer expires. +This timer is set for the difference between the sleep timeout slider and the +display dim timeout slider. When the timer expires, it releases its clamp and +now nothing is holding it awake, so it falls asleep. + +Demand sleep is prevented when the system is booting. When preferences are +transmitted by the loginwindow at the end of boot, a flag is cleared, +and this allows subsequent Demand Sleep. 
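(A worked instance of the idle sleep arithmetic above, reusing the figures from the comment being replaced: with the sleep timeout slider at 30 minutes and the display dim slider at 5 minutes, the policy-maker clamp timer is armed for 30 - 5 = 25 minutes once the display dims; when it expires, nothing holds the root on and the system sleeps.)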
*/ //****************************************************************************** @@ -547,16 +656,27 @@ IOPMrootDomain * IOPMrootDomain::construct( void ) static void disk_sync_callout( thread_call_param_t p0, thread_call_param_t p1 ) { - IOService *rootDomain = (IOService *) p0; - unsigned long pmRef = (unsigned long) p1; + IOService * rootDomain = (IOService *) p0; + uint32_t notifyRef = (uint32_t)(uintptr_t) p1; + uint32_t powerState = rootDomain->getPowerState(); - DLOG("disk_sync_callout start\n"); + DLOG("disk_sync_callout ps=%u\n", powerState); + if (ON_STATE == powerState) + { #if HIBERNATION - IOHibernateSystemSleep(); + IOHibernateSystemSleep(); #endif - sync_internal(); - rootDomain->allowPowerChange(pmRef); + sync_internal(); + } +#if HIBERNATION + else + { + IOHibernateSystemPostWake(); + } +#endif + + rootDomain->allowPowerChange(notifyRef); DLOG("disk_sync_callout finish\n"); } @@ -601,11 +721,11 @@ sysctl_sleepwaketime SYSCTL_HANDLER_ARGS } static SYSCTL_PROC(_kern, OID_AUTO, sleeptime, - CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, &gIOLastSleepTime, 0, sysctl_sleepwaketime, "S,timeval", ""); static SYSCTL_PROC(_kern, OID_AUTO, waketime, - CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, &gIOLastWakeTime, 0, sysctl_sleepwaketime, "S,timeval", ""); @@ -617,7 +737,7 @@ sysctl_willshutdown int error = sysctl_io_number(req, gWillShutdown, sizeof(int), &new_value, &changed); if (changed) { if (!gWillShutdown && (new_value == 1)) { - IOSystemShutdownNotification(); + IOPMRootDomainWillShutdown(); } else error = EINVAL; } @@ -625,7 +745,7 @@ sysctl_willshutdown } static SYSCTL_PROC(_kern, OID_AUTO, willshutdown, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_willshutdown, "I", ""); #if !CONFIG_EMBEDDED @@ -661,16 +781,19 @@ sysctl_progressmeter } static SYSCTL_PROC(_kern, OID_AUTO, progressmeterenable, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_progressmeterenable, "I", ""); static SYSCTL_PROC(_kern, OID_AUTO, progressmeter, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, 0, 0, sysctl_progressmeter, "I", ""); #endif +static SYSCTL_INT(_debug, OID_AUTO, darkwake, CTLFLAG_RW, &gDarkWakeFlags, 0, ""); + static const OSSymbol * gIOPMSettingAutoWakeSecondsKey; +static const OSSymbol * gIOPMSettingDebugWakeRelativeKey; static const OSSymbol * gIOPMSettingMaintenanceWakeCalendarKey; //****************************************************************************** @@ -684,11 +807,13 @@ bool IOPMrootDomain::start( IOService * nub ) { OSIterator *psIterator; OSDictionary *tmpDict; + IORootParent * patriarch; super::start(nub); gRootDomain = this; gIOPMSettingAutoWakeSecondsKey = OSSymbol::withCString(kIOPMSettingAutoWakeSecondsKey); + gIOPMSettingDebugWakeRelativeKey = OSSymbol::withCString(kIOPMSettingDebugWakeRelativeKey); gIOPMSettingMaintenanceWakeCalendarKey = OSSymbol::withCString(kIOPMSettingMaintenanceWakeCalendarKey); @@ -706,7 +831,7 @@ bool IOPMrootDomain::start( IOService * nub ) OSSymbol::withCString(kIOPMSettingAutoPowerSecondsKey), OSSymbol::withCString(kIOPMSettingAutoWakeCalendarKey), 
OSSymbol::withCString(kIOPMSettingAutoPowerCalendarKey), - OSSymbol::withCString(kIOPMSettingDebugWakeRelativeKey), + gIOPMSettingDebugWakeRelativeKey, OSSymbol::withCString(kIOPMSettingDebugPowerRelativeKey), OSSymbol::withCString(kIOPMSettingWakeOnRingKey), OSSymbol::withCString(kIOPMSettingRestartOnPowerLossKey), @@ -719,28 +844,25 @@ bool IOPMrootDomain::start( IOService * nub ) OSSymbol::withCString(kIOPMStateConsoleShutdown) }; + PE_parse_boot_argn("darkwake", &gDarkWakeFlags, sizeof(gDarkWakeFlags)); + queue_init(&aggressivesQueue); aggressivesThreadCall = thread_call_allocate(handleAggressivesFunction, this); aggressivesData = OSData::withCapacity( sizeof(AggressivesRecord) * (kPMLastAggressivenessType + 4)); featuresDictLock = IOLockAlloc(); - settingsCtrlLock = IORecursiveLockAlloc(); + settingsCtrlLock = IOLockAlloc(); setPMRootDomain(this); extraSleepTimer = thread_call_allocate( idleSleepTimerExpired, (thread_call_param_t) this); - clamshellWakeupIgnore = thread_call_allocate( - wakeupClamshellTimerExpired, - (thread_call_param_t) this); - diskSyncCalloutEntry = thread_call_allocate( &disk_sync_callout, (thread_call_param_t) this); - canSleep = true; setProperty(kIOSleepSupportedKey, true); bzero(&pmStats, sizeof(pmStats)); @@ -749,21 +871,27 @@ bool IOPMrootDomain::start( IOService * nub ) pmAssertions = PMAssertionsTracker::pmAssertionsTracker(this); - updateRunState(kRStateNormal); userDisabledAllSleep = false; - allowSleep = true; - sleepIsSupported = true; systemBooting = true; sleepSlider = 0; idleSleepTimerPending = false; wrangler = NULL; - sleepASAP = false; - clamshellIsClosed = false; - clamshellExists = false; - ignoringClamshell = true; - ignoringClamshellOnWake = false; + clamshellClosed = false; + clamshellExists = false; + clamshellDisabled = true; acAdaptorConnected = true; + // Set the default system capabilities at boot. 
+ _currentCapability = kIOPMSystemCapabilityCPU | + kIOPMSystemCapabilityGraphics | + kIOPMSystemCapabilityAudio | + kIOPMSystemCapabilityNetwork; + + _pendingCapability = _currentCapability; + _desiredCapability = _currentCapability; + _highestCapability = _currentCapability; + setProperty(kIOPMSystemCapabilitiesKey, _currentCapability, 64); + queuedSleepWakeUUIDString = NULL; pmStatsAppResponses = OSArray::withCapacity(5); _statsNameKey = OSSymbol::withCString(kIOPMStatsNameKey); @@ -810,10 +938,23 @@ bool IOPMrootDomain::start( IOService * nub ) patriarch->addPowerChild(this); registerPowerDriver(this, ourPowerStates, NUM_POWER_STATES); - - // set a clamp until we sleep changePowerStateToPriv(ON_STATE); + if (gIOKitDebug & (kIOLogDriverPower1 | kIOLogDriverPower2)) + { + // Setup our PM logging & recording code + timeline = IOPMTimeline::timeline(this); + if (timeline) { + OSDictionary *tlInfo = timeline->copyInfoDictionary(); + + if (tlInfo) + { + setProperty(kIOPMTimelineDictionaryKey, tlInfo); + tlInfo->release(); + } + } + } + // install power change handler gSysPowerDownNotifier = registerPrioritySleepWakeInterest( &sysPowerDownHandler, this, 0); @@ -823,22 +964,12 @@ bool IOPMrootDomain::start( IOService * nub ) { _displayWranglerNotifier = addMatchingNotification( gIOPublishNotification, tmpDict, - (IOServiceMatchingNotificationHandler) &displayWranglerPublished, + (IOServiceMatchingNotificationHandler) &displayWranglerMatchPublished, this, 0); tmpDict->release(); } #endif - // Battery location published - ApplePMU support only - if ((tmpDict = serviceMatching("IOPMPowerSource"))) - { - _batteryPublishNotifier = addMatchingNotification( - gIOPublishNotification, tmpDict, - (IOServiceMatchingNotificationHandler) &batteryPublished, - this, this); - tmpDict->release(); - } - const OSSymbol *ucClassName = OSSymbol::withCStringNoCopy("RootDomainUserClient"); setProperty(gIOUserClientClassKey, (OSObject *) ucClassName); ucClassName->release(); @@ -874,7 +1005,6 @@ bool IOPMrootDomain::start( IOService * nub ) return true; } - //****************************************************************************** // setProperties // @@ -888,44 +1018,39 @@ IOReturn IOPMrootDomain::setProperties( OSObject * props_obj ) OSDictionary *dict = OSDynamicCast(OSDictionary, props_obj); OSBoolean *b; OSNumber *n; - OSString *str; + OSDictionary *d; OSSymbol *type; OSObject *obj; unsigned int i; - const OSSymbol *boot_complete_string = - OSSymbol::withCString("System Boot Complete"); - const OSSymbol *sys_shutdown_string = - OSSymbol::withCString("System Shutdown"); - const OSSymbol *stall_halt_string = - OSSymbol::withCString("StallSystemAtHalt"); - const OSSymbol *battery_warning_disabled_string = - OSSymbol::withCString("BatteryWarningsDisabled"); - const OSSymbol *idle_seconds_string = - OSSymbol::withCString("System Idle Seconds"); + const OSSymbol *publish_simulated_battery_string = OSSymbol::withCString("SoftwareSimulatedBatteries"); + const OSSymbol *boot_complete_string = OSSymbol::withCString("System Boot Complete"); + const OSSymbol *sys_shutdown_string = OSSymbol::withCString("System Shutdown"); + const OSSymbol *stall_halt_string = OSSymbol::withCString("StallSystemAtHalt"); + const OSSymbol *battery_warning_disabled_string = OSSymbol::withCString("BatteryWarningsDisabled"); + const OSSymbol *idle_seconds_string = OSSymbol::withCString("System Idle Seconds"); + const OSSymbol *sleepdisabled_string = OSSymbol::withCString("SleepDisabled"); + const OSSymbol *ondeck_sleepwake_uuid_string = 
OSSymbol::withCString(kIOPMSleepWakeUUIDKey); + const OSSymbol *loginwindow_tracepoint_string = OSSymbol::withCString(kIOPMLoginWindowSecurityDebugKey); + const OSSymbol *pmTimelineLogging_string = OSSymbol::withCString(kIOPMTimelineDictionaryKey); #if HIBERNATION - const OSSymbol *hibernatemode_string = - OSSymbol::withCString(kIOHibernateModeKey); - const OSSymbol *hibernatefile_string = - OSSymbol::withCString(kIOHibernateFileKey); - const OSSymbol *hibernatefreeratio_string = - OSSymbol::withCString(kIOHibernateFreeRatioKey); - const OSSymbol *hibernatefreetime_string = - OSSymbol::withCString(kIOHibernateFreeTimeKey); + const OSSymbol *hibernatemode_string = OSSymbol::withCString(kIOHibernateModeKey); + const OSSymbol *hibernatefile_string = OSSymbol::withCString(kIOHibernateFileKey); + const OSSymbol *hibernatefreeratio_string = OSSymbol::withCString(kIOHibernateFreeRatioKey); + const OSSymbol *hibernatefreetime_string = OSSymbol::withCString(kIOHibernateFreeTimeKey); #endif - const OSSymbol *sleepdisabled_string = - OSSymbol::withCString("SleepDisabled"); - const OSSymbol *ondeck_sleepwake_uuid_string = - OSSymbol::withCString(kIOPMSleepWakeUUIDKey); - const OSSymbol *loginwindow_tracepoint_string = - OSSymbol::withCString(kIOPMLoginWindowSecurityDebugKey); - - if(!dict) + + if (!dict) { return_value = kIOReturnBadArgument; goto exit; } + if ((b = OSDynamicCast(OSBoolean, dict->getObject(publish_simulated_battery_string)))) + { + publishResource(publish_simulated_battery_string, kOSBooleanTrue); + } + if ((n = OSDynamicCast(OSNumber, dict->getObject(idle_seconds_string)))) { setProperty(idle_seconds_string, n); @@ -936,44 +1061,53 @@ IOReturn IOPMrootDomain::setProperties( OSObject * props_obj ) { pmPowerStateQueue->submitPowerEvent( kPowerEventSystemBootCompleted ); } - - if( battery_warning_disabled_string - && dict->getObject(battery_warning_disabled_string)) + + if( battery_warning_disabled_string && dict->getObject(battery_warning_disabled_string)) { - setProperty( battery_warning_disabled_string, - dict->getObject(battery_warning_disabled_string)); + setProperty( battery_warning_disabled_string, dict->getObject(battery_warning_disabled_string)); } - if( sys_shutdown_string - && (b = OSDynamicCast(OSBoolean, dict->getObject(sys_shutdown_string)))) + if (pmTimelineLogging_string && (d = OSDynamicCast(OSDictionary, dict->getObject(pmTimelineLogging_string)))) + { + if (timeline && timeline->setProperties(d)) + { + OSDictionary *tlInfo = timeline->copyInfoDictionary(); + if (tlInfo) { + setProperty(kIOPMTimelineDictionaryKey, tlInfo); + tlInfo->release(); + } + } + } + + if( sys_shutdown_string && (b = OSDynamicCast(OSBoolean, dict->getObject(sys_shutdown_string)))) { pmPowerStateQueue->submitPowerEvent(kPowerEventSystemShutdown, (void *) b); } - if( stall_halt_string - && (b = OSDynamicCast(OSBoolean, dict->getObject(stall_halt_string))) ) + if( stall_halt_string && (b = OSDynamicCast(OSBoolean, dict->getObject(stall_halt_string))) ) { setProperty(stall_halt_string, b); } #if HIBERNATION if ( hibernatemode_string - && (n = OSDynamicCast(OSNumber, dict->getObject(hibernatemode_string)))) + && (n = OSDynamicCast(OSNumber, dict->getObject(hibernatemode_string)))) { setProperty(hibernatemode_string, n); } if ( hibernatefreeratio_string - && (n = OSDynamicCast(OSNumber, dict->getObject(hibernatefreeratio_string)))) + && (n = OSDynamicCast(OSNumber, dict->getObject(hibernatefreeratio_string)))) { setProperty(hibernatefreeratio_string, n); } if ( hibernatefreetime_string - && (n = 
OSDynamicCast(OSNumber, dict->getObject(hibernatefreetime_string)))) + && (n = OSDynamicCast(OSNumber, dict->getObject(hibernatefreetime_string)))) { setProperty(hibernatefreetime_string, n); - } + } + OSString *str; if ( hibernatefile_string - && (str = OSDynamicCast(OSString, dict->getObject(hibernatefile_string)))) + && (str = OSDynamicCast(OSString, dict->getObject(hibernatefile_string)))) { setProperty(hibernatefile_string, str); } @@ -985,28 +1119,14 @@ IOReturn IOPMrootDomain::setProperties( OSObject * props_obj ) setProperty(sleepdisabled_string, b); pmPowerStateQueue->submitPowerEvent(kPowerEventUserDisabledSleep, (void *) b); } - if (ondeck_sleepwake_uuid_string && (obj = dict->getObject(ondeck_sleepwake_uuid_string))) { - // Clear the currently published UUID - if (kOSBooleanFalse == obj) - { - publishSleepWakeUUID(NULL); + if(pmPowerStateQueue) { + obj->retain(); + pmPowerStateQueue->submitPowerEvent(kPowerEventQueueSleepWakeUUID, (void *)obj); } - // Cache UUID for an upcoming sleep/wake - if ((str = OSDynamicCast(OSString, obj))) - { - if (queuedSleepWakeUUIDString) { - queuedSleepWakeUUIDString->release(); - queuedSleepWakeUUIDString = NULL; - } - queuedSleepWakeUUIDString = str; - queuedSleepWakeUUIDString->retain(); - DLOG("SleepWake UUID queued: %s\n", - queuedSleepWakeUUIDString->getCStringNoCopy()); - } } if (loginwindow_tracepoint_string @@ -1024,6 +1144,10 @@ IOReturn IOPMrootDomain::setProperties( OSObject * props_obj ) { setProperty(kIOPMDeepSleepDelayKey, n); } + if ((b = OSDynamicCast(OSBoolean, dict->getObject(kIOPMDestroyFVKeyOnStandbyKey)))) + { + setProperty(kIOPMDestroyFVKeyOnStandbyKey, b); + } // Relay our allowed PM settings onto our registered PM clients for(i = 0; i < allowedPMSettings->getCount(); i++) { @@ -1034,24 +1158,31 @@ IOReturn IOPMrootDomain::setProperties( OSObject * props_obj ) obj = dict->getObject(type); if(!obj) continue; - if ((gIOPMSettingAutoWakeSecondsKey == type) && ((n = OSDynamicCast(OSNumber, obj)))) - { - UInt32 rsecs = n->unsigned32BitValue(); - if (!rsecs) - autoWakeStart = autoWakeEnd = 0; - else - { - AbsoluteTime deadline; - clock_interval_to_deadline(rsecs + kAutoWakePostWindow, kSecondScale, &deadline); - autoWakeEnd = AbsoluteTime_to_scalar(&deadline); - if (rsecs > kAutoWakePreWindow) - rsecs -= kAutoWakePreWindow; - else - rsecs = 0; - clock_interval_to_deadline(rsecs, kSecondScale, &deadline); - autoWakeStart = AbsoluteTime_to_scalar(&deadline); - } - } + if ((gIOPMSettingAutoWakeSecondsKey == type) && ((n = OSDynamicCast(OSNumber, obj)))) + { + UInt32 rsecs = n->unsigned32BitValue(); + if (!rsecs) + autoWakeStart = autoWakeEnd = 0; + else + { + AbsoluteTime deadline; + clock_interval_to_deadline(rsecs + kAutoWakePostWindow, kSecondScale, &deadline); + autoWakeEnd = AbsoluteTime_to_scalar(&deadline); + if (rsecs > kAutoWakePreWindow) + rsecs -= kAutoWakePreWindow; + else + rsecs = 0; + clock_interval_to_deadline(rsecs, kSecondScale, &deadline); + autoWakeStart = AbsoluteTime_to_scalar(&deadline); + } + } + if (gIOPMSettingDebugWakeRelativeKey == type) + { + if ((n = OSDynamicCast(OSNumber, obj))) + _debugWakeSeconds = n->unsigned32BitValue(); + else + _debugWakeSeconds = 0; + } return_value = setPMSetting(type, obj); @@ -1059,14 +1190,16 @@ IOReturn IOPMrootDomain::setProperties( OSObject * props_obj ) } exit: + if(publish_simulated_battery_string) publish_simulated_battery_string->release(); if(boot_complete_string) boot_complete_string->release(); if(sys_shutdown_string) sys_shutdown_string->release(); 
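//******************************************************************************
// A minimal sketch, not part of this patch, of the reference-count discipline
// behind the setProperties() code above. OSSymbol::withCString() returns a
// retained symbol that must be released on every exit path, while objects
// returned by OSDictionary::getObject() are borrowed and must be retained
// before being handed to another thread (as done for the sleep/wake UUID).
// The key name and helper are hypothetical.
//******************************************************************************

#include <libkern/c++/OSSymbol.h>
#include <libkern/c++/OSDictionary.h>

static void exampleConsumeKey( OSDictionary * dict )
{
    const OSSymbol * key = OSSymbol::withCString("ExampleKey");  // +1 reference
    if (!key)
        return;

    OSObject * value = dict->getObject(key);    // borrowed, not retained
    if (value)
    {
        value->retain();                        // keep alive while in use
        // ... use or queue 'value' here ...
        value->release();                       // balance the retain above
    }

    key->release();                             // balance withCString()
}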
if(stall_halt_string) stall_halt_string->release(); - if (battery_warning_disabled_string) battery_warning_disabled_string->release(); + if(battery_warning_disabled_string) battery_warning_disabled_string->release(); if(idle_seconds_string) idle_seconds_string->release(); if(sleepdisabled_string) sleepdisabled_string->release(); if(ondeck_sleepwake_uuid_string) ondeck_sleepwake_uuid_string->release(); if(loginwindow_tracepoint_string) loginwindow_tracepoint_string->release(); + if(pmTimelineLogging_string) pmTimelineLogging_string->release(); #if HIBERNATION if(hibernatemode_string) hibernatemode_string->release(); if(hibernatefile_string) hibernatefile_string->release(); @@ -1076,114 +1209,21 @@ exit: return return_value; } +// MARK: - +// MARK: Aggressiveness //****************************************************************************** -// aggressivenessChanged +// setAggressiveness // -// We are behind the command gate to examine changes to aggressives. +// Override IOService::setAggressiveness() //****************************************************************************** -void IOPMrootDomain::aggressivenessChanged( void ) +IOReturn IOPMrootDomain::setAggressiveness( + unsigned long type, + unsigned long value ) { - unsigned long minutesToSleep = 0; - unsigned long minutesToDisplayDim = 0; - - ASSERT_GATED(); - - // Fetch latest display and system sleep slider values. - getAggressiveness(kPMMinutesToSleep, &minutesToSleep); - getAggressiveness(kPMMinutesToDim, &minutesToDisplayDim); - DLOG("aggressiveness changed system %u, display %u\n", - (uint32_t) minutesToSleep, (uint32_t) minutesToDisplayDim); - - DLOG("idle time -> %ld secs (ena %d)\n", - idleSeconds, (minutesToSleep != 0)); - - if (0x7fffffff == minutesToSleep) - minutesToSleep = idleSeconds; - - // How long to wait before sleeping the system once the displays turns - // off is indicated by 'extraSleepDelay'. - - if ( minutesToSleep > minutesToDisplayDim ) { - extraSleepDelay = minutesToSleep - minutesToDisplayDim; - } - else { - extraSleepDelay = 0; - } - - // system sleep timer was disabled, but not anymore. - if ( (sleepSlider == 0) && (minutesToSleep != 0) ) { - if (!wrangler) - { - sleepASAP = false; - changePowerStateToPriv(ON_STATE); - if (idleSeconds) - { - startIdleSleepTimer( idleSeconds ); - } - } - else - { - // Start idle sleep timer if wrangler went to sleep - // while system sleep was disabled. - - sleepASAP = false; - if (wranglerAsleep) - { - AbsoluteTime now; - uint64_t nanos; - uint32_t minutesSinceDisplaySleep = 0; - uint32_t sleepDelay; - - clock_get_uptime(&now); - if (CMP_ABSOLUTETIME(&now, &wranglerSleepTime) > 0) - { - SUB_ABSOLUTETIME(&now, &wranglerSleepTime); - absolutetime_to_nanoseconds(now, &nanos); - minutesSinceDisplaySleep = nanos / (60000000000ULL); - } - - if (extraSleepDelay > minutesSinceDisplaySleep) - { - sleepDelay = extraSleepDelay - minutesSinceDisplaySleep; - } - else - { - // 1 min idle sleep. 
- sleepDelay = 1; - } - - startIdleSleepTimer(sleepDelay * 60); - DLOG("display slept %u min, set idle timer to %u min\n", - minutesSinceDisplaySleep, sleepDelay); - } - } - } - - sleepSlider = minutesToSleep; - if ( sleepSlider == 0 ) { - cancelIdleSleepTimer(); - // idle sleep is now disabled - adjustPowerState(); - // make sure we're powered - patriarch->wakeSystem(); - } -} - - -//****************************************************************************** -// setAggressiveness -// -// Override IOService::setAggressiveness() -//****************************************************************************** - -IOReturn IOPMrootDomain::setAggressiveness( - unsigned long type, - unsigned long value ) -{ - return setAggressiveness( type, value, 0 ); -} + return setAggressiveness( type, value, 0 ); +} /* * Private setAggressiveness() with an internal options argument. @@ -1197,8 +1237,8 @@ IOReturn IOPMrootDomain::setAggressiveness( AggressivesRequest * request; bool found = false; - DLOG("setAggressiveness 0x%x = %u, options 0x%x\n", - (uint32_t) type, (uint32_t) value, (uint32_t) options); + DLOG("setAggressiveness(%x) 0x%x = %u\n", + (uint32_t) options, (uint32_t) type, (uint32_t) value); request = IONew(AggressivesRequest, 1); if (!request) @@ -1255,7 +1295,6 @@ IOReturn IOPMrootDomain::setAggressiveness( return kIOReturnSuccess; } - //****************************************************************************** // getAggressiveness // @@ -1328,8 +1367,8 @@ IOReturn IOPMrootDomain::getAggressiveness ( if (source) { - DLOG("getAggressiveness 0x%x = %u, source %d\n", - (uint32_t) type, value, source); + DLOG("getAggressiveness(%d) 0x%x = %u\n", + source, (uint32_t) type, value); *outLevel = (unsigned long) value; return kIOReturnSuccess; } @@ -1341,7 +1380,6 @@ IOReturn IOPMrootDomain::getAggressiveness ( } } - //****************************************************************************** // joinAggressiveness // @@ -1356,7 +1394,7 @@ IOReturn IOPMrootDomain::joinAggressiveness( if (!service || (service == this)) return kIOReturnBadArgument; - DLOG("joinAggressiveness %s (%p)\n", service->getName(), service); + DLOG("joinAggressiveness %s %p\n", service->getName(), service); request = IONew(AggressivesRequest, 1); if (!request) @@ -1377,7 +1415,6 @@ IOReturn IOPMrootDomain::joinAggressiveness( return kIOReturnSuccess; } - //****************************************************************************** // handleAggressivesRequests // @@ -1443,7 +1480,7 @@ void IOPMrootDomain::handleAggressivesRequests( void ) broadcast = true; record->flags |= (kAggressivesRecordFlagMinValue | kAggressivesRecordFlagModified); - DLOG("quick spindown accelerated, was %u min\n", + DLOG("disk spindown accelerated, was %u min\n", record->value); } } @@ -1545,11 +1582,12 @@ unlock_done: // Submit a power event to handle those changes on the PM work loop. 
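//******************************************************************************
// A minimal sketch, not part of this patch, of the request-record pattern
// that setAggressiveness() uses above: each change is captured in a small
// heap record with IONew(), queued under a lock, and drained later by the
// aggressives thread call. The record layout and helper below are
// hypothetical; IONew/IODelete are the real IOKit allocation macros.
//******************************************************************************

#include <IOKit/IOLib.h>

struct ExampleRequest {                 // hypothetical request record
    uint32_t type;
    uint32_t value;
};

static ExampleRequest * exampleMakeRequest( uint32_t type, uint32_t value )
{
    ExampleRequest * req = IONew(ExampleRequest, 1);
    if (req) {
        req->type  = type;
        req->value = value;
    }
    // The caller queues the record; the consumer frees it with
    // IODelete(req, ExampleRequest, 1) after applying the change.
    return req;
}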
if (pingSelf && pmPowerStateQueue) { - pmPowerStateQueue->submitPowerEvent( kPowerEventAggressivenessChanged ); + pmPowerStateQueue->submitPowerEvent( + kPowerEventPolicyStimulus, + (void *) kStimulusAggressivenessChanged ); } } - //****************************************************************************** // synchronizeAggressives // @@ -1564,6 +1602,7 @@ void IOPMrootDomain::synchronizeAggressives( IOService * service; AggressivesRequest * request; const AggressivesRecord * record; + IOPMDriverCallEntry callEntry; uint32_t value; int i; @@ -1580,7 +1619,7 @@ void IOPMrootDomain::synchronizeAggressives( if (service) { - if (service->assertPMThreadCall()) + if (service->assertPMDriverCall(&callEntry)) { for (i = 0, record = array; i < count; i++, record++) { @@ -1588,18 +1627,17 @@ void IOPMrootDomain::synchronizeAggressives( if (record->flags & kAggressivesRecordFlagMinValue) value = kAggressivesMinValue; - DLOG("synchronizeAggressives 0x%x = %u to %s\n", + _LOG("synchronizeAggressives 0x%x = %u to %s\n", record->type, value, service->getName()); service->setAggressiveness(record->type, value); } - service->deassertPMThreadCall(); + service->deassertPMDriverCall(&callEntry); } service->release(); // retained by joinAggressiveness() } } } - //****************************************************************************** // broadcastAggressives // @@ -1610,18 +1648,19 @@ void IOPMrootDomain::broadcastAggressives( const AggressivesRecord * array, int count ) { - IORegistryIterator * iter; - IORegistryEntry * entry; - IOPowerConnection * connect; + IORegistryIterator * iter; + IORegistryEntry * entry; + IOPowerConnection * connect; IOService * service; const AggressivesRecord * record; + IOPMDriverCallEntry callEntry; uint32_t value; int i; - iter = IORegistryIterator::iterateOver( - this, gIOPowerPlane, kIORegistryIterateRecursively); + iter = IORegistryIterator::iterateOver( + this, gIOPowerPlane, kIORegistryIterateRecursively); if (iter) - { + { do { iter->reset(); @@ -1633,7 +1672,7 @@ void IOPMrootDomain::broadcastAggressives( if ((service = (IOService *) connect->copyChildEntry(gIOPowerPlane))) { - if (service->assertPMThreadCall()) + if (service->assertPMDriverCall(&callEntry)) { for (i = 0, record = array; i < count; i++, record++) { @@ -1642,12 +1681,12 @@ void IOPMrootDomain::broadcastAggressives( value = record->value; if (record->flags & kAggressivesRecordFlagMinValue) value = kAggressivesMinValue; - DLOG("broadcastAggressives %x = %u to %s\n", + _LOG("broadcastAggressives %x = %u to %s\n", record->type, value, service->getName()); service->setAggressiveness(record->type, value); } } - service->deassertPMThreadCall(); + service->deassertPMDriverCall(&callEntry); } service->release(); } @@ -1658,6 +1697,8 @@ void IOPMrootDomain::broadcastAggressives( } } +// MARK: - +// MARK: System Sleep //****************************************************************************** // startIdleSleepTimer @@ -1678,7 +1719,6 @@ void IOPMrootDomain::startIdleSleepTimer( uint32_t inSeconds ) } } - //****************************************************************************** // cancelIdleSleepTimer // @@ -1687,7 +1727,7 @@ void IOPMrootDomain::startIdleSleepTimer( uint32_t inSeconds ) void IOPMrootDomain::cancelIdleSleepTimer( void ) { ASSERT_GATED(); - if (idleSleepTimerPending) + if (idleSleepTimerPending) { DLOG("idle timer cancelled\n"); thread_call_cancel(extraSleepTimer); @@ -1695,7 +1735,6 @@ void IOPMrootDomain::cancelIdleSleepTimer( void ) } } - 
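//******************************************************************************
// A minimal sketch, not part of this patch, of the thread_call timer pattern
// behind startIdleSleepTimer()/cancelIdleSleepTimer() above: allocate once,
// arm with an absolute deadline, disarm with thread_call_cancel(). The
// callback runs on a kernel thread rather than the PM work loop, so real
// handlers re-enter the work loop before touching shared state. Names here
// are hypothetical.
//******************************************************************************

#include <kern/thread_call.h>
#include <kern/clock.h>
#include <mach/clock_types.h>

static thread_call_t gExampleTimer;

static void exampleTimerFired( thread_call_param_t owner, thread_call_param_t )
{
    // ... typically bounces onto a work loop before doing real work ...
}

static void exampleArmTimer( uint32_t seconds )
{
    uint64_t deadline;

    if (!gExampleTimer)
        gExampleTimer = thread_call_allocate(exampleTimerFired, NULL);

    clock_interval_to_deadline(seconds, kSecondScale, &deadline);
    thread_call_enter_delayed(gExampleTimer, deadline);
}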
//****************************************************************************** // idleSleepTimerExpired // @@ -1707,13 +1746,6 @@ static void idleSleepTimerExpired( ((IOPMrootDomain *)us)->handleSleepTimerExpiration(); } -static void wakeupClamshellTimerExpired( - thread_call_param_t us, thread_call_param_t ) -{ - ((IOPMrootDomain *)us)->stopIgnoringClamshellEventsDuringWakeup(); -} - - //****************************************************************************** // handleSleepTimerExpiration // @@ -1747,41 +1779,34 @@ void IOPMrootDomain::handleSleepTimerExpiration( void ) return; } - // accelerate disk spin down if spin down timer is non-zero setQuickSpinDownTimeout(); - - sleepASAP = true; - adjustPowerState(); + adjustPowerState(true); } - //****************************************************************************** -// stopIgnoringClamshellEventsDuringWakeup +// setQuickSpinDownTimeout // //****************************************************************************** -void IOPMrootDomain::stopIgnoringClamshellEventsDuringWakeup( void ) +void IOPMrootDomain::setQuickSpinDownTimeout( void ) { - if (!getPMworkloop()->inGate()) - { - getPMworkloop()->runAction( - OSMemberFunctionCast(IOWorkLoop::Action, this, - &IOPMrootDomain::stopIgnoringClamshellEventsDuringWakeup), - this); - return; - } - ASSERT_GATED(); + setAggressiveness( + kPMMinutesToSpinDown, 0, kAggressivesOptionQuickSpindownEnable ); +} - // Allow clamshell-induced sleep now - ignoringClamshellOnWake = false; +//****************************************************************************** +// restoreUserSpinDownTimeout +// +//****************************************************************************** - // Re-send clamshell event, in case it causes a sleep - if (clamshellIsClosed) - handlePowerNotification( kLocalEvalClamshellCommand ); +void IOPMrootDomain::restoreUserSpinDownTimeout( void ) +{ + ASSERT_GATED(); + setAggressiveness( + kPMMinutesToSpinDown, 0, kAggressivesOptionQuickSpindownDisable ); } - //****************************************************************************** // sleepSystem // @@ -1806,21 +1831,17 @@ IOReturn IOPMrootDomain::sleepSystemOptions( OSDictionary *options ) if (options && options->getObject("OSSwitch")) { - // Log specific sleep cause for OS Switch hibernation - return privateSleepSystem( kIOPMSleepReasonOSSwitchHibernation); - + return privateSleepSystem( kIOPMSleepReasonOSSwitchHibernate); } else { - return privateSleepSystem( kIOPMSleepReasonSoftware); - } } /* private */ IOReturn IOPMrootDomain::privateSleepSystem( uint32_t sleepReason ) { - static const char * IOPMSleepReasons[kIOPMSleepReasonMax] = { + static const char * IOPMSleepReasons[] = { "", kIOPMClamshellSleepKey, kIOPMPowerButtonSleepKey, @@ -1829,711 +1850,869 @@ IOReturn IOPMrootDomain::privateSleepSystem( uint32_t sleepReason ) kIOPMIdleSleepKey, kIOPMLowPowerSleepKey, kIOPMClamshellSleepKey, - kIOPMThermalEmergencySleepKey + kIOPMThermalEmergencySleepKey, + kIOPMMaintenanceSleepKey }; - if ( userDisabledAllSleep ) - { - LOG("Sleep prevented by user disable\n"); - /* Prevent sleep of all kinds if directed to by user space */ - return kIOReturnNotPermitted; - } + PMEventDetails *details; - if ( systemBooting || systemShutdown || !allowSleep ) + if (!checkSystemCanSleep()) { - LOG("Sleep prevented by SB %d, SS %d, AS %d\n", - systemBooting, systemShutdown, allowSleep); + // Record why the system couldn't sleep + details = PMEventDetails::eventDetails(kIOPMEventTypeSleep, NULL, + sleepReason, 
kIOReturnNotPermitted); + + recordAndReleasePMEvent( details ); + return kIOReturnNotPermitted; + } - // Unable to sleep because system is in the process of booting or - // shutting down, or sleep has otherwise been disallowed. - return kIOReturnError; + if (timeline) + timeline->setSleepCycleInProgressFlag(true); + + // Time to publish a UUID for the Sleep --> Wake cycle + if(pmPowerStateQueue) { + pmPowerStateQueue->submitPowerEvent(kPowerEventPublishSleepWakeUUID, (void *)true); } + + // Log the beginning of system sleep. + details = PMEventDetails::eventDetails(kIOPMEventTypeSleep, NULL, + sleepReason, kIOReturnSuccess); + + recordAndReleasePMEvent( details ); + // Record sleep cause in IORegistry lastSleepReason = sleepReason; - if (sleepReason && (sleepReason < kIOPMSleepReasonMax)) { + sleepReason -= (kIOPMSleepReasonClamshell - 1); + if (sleepReason && (sleepReason < sizeof(IOPMSleepReasons)/sizeof(IOPMSleepReasons[0]))) { setProperty(kRootDomainSleepReasonKey, IOPMSleepReasons[sleepReason]); } - patriarch->sleepSystem(); + if (pmPowerStateQueue) + pmPowerStateQueue->submitPowerEvent( + kPowerEventPolicyStimulus, + (void *) kStimulusDemandSystemSleep ); + return kIOReturnSuccess; } +IOReturn IOPMrootDomain::recordPMEventGated(PMEventDetails *record) +{ + // If we don't have a place to log to, we can't actually + // log anything. Chances are, the person who is asking us to do + // the PM logging has forgotten to set the right bootflags + if(!timeline) + return kIOReturnSuccess; -//****************************************************************************** -// shutdownSystem -// -//****************************************************************************** + if(gIOPMWorkLoop->inGate() == false) { + + IOReturn ret = gIOPMWorkLoop->runAction( + OSMemberFunctionCast(IOWorkLoop::Action, this, &IOPMrootDomain::recordPMEventGated), + (OSObject *)this, + (void *)record); + + return ret; + } + else { + // Now that we're guaranteed to be running in gate ... -IOReturn IOPMrootDomain::shutdownSystem( void ) -{ - //patriarch->shutDownSystem(); - return kIOReturnUnsupported; + // Check the validity of the argument we are given + if(!record) + return kIOReturnBadArgument; + + // Record a driver event, or a system event + if(record->eventClassifier == kIOPMEventClassDriverEvent + || record->eventClassifier == kIOPMEventClassSystemEvent) + return this->recordPMEvent(record); + + else + return kIOReturnBadArgument; + } } - -//****************************************************************************** -// restartSystem -// -//****************************************************************************** - -IOReturn IOPMrootDomain::restartSystem( void ) +IOReturn IOPMrootDomain::recordAndReleasePMEventGated(PMEventDetails *record) { - //patriarch->restartSystem(); - return kIOReturnUnsupported; -} + IOReturn ret = kIOReturnBadArgument; + if (record) + { + ret = recordPMEventGated(record); + record->release(); + } + + return ret; +} //****************************************************************************** // powerChangeDone // // This overrides powerChangeDone in IOService. -// -// Menu sleep and idle sleep move us from the ON state to the SLEEP_STATE. -// In this case: -// If we finished going to the SLEEP_STATE, and the platform is capable of -// true sleep, then sleep the kernel. Otherwise switch up to the DOZE_STATE -// which will keep almost everything as off as it can get. 
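//******************************************************************************
// A minimal sketch, not part of this patch, of the self-dispatch idiom used
// by recordPMEventGated() above: when the caller is not already under the
// work loop's gate, the method re-invokes itself through runAction(), and the
// gated second entry performs the real work serialized with everything else
// on that work loop. ExampleLogger and its members are hypothetical.
//******************************************************************************

#include <IOKit/IOWorkLoop.h>
#include <libkern/c++/OSObject.h>

class ExampleLogger : public OSObject
{
    IOWorkLoop * fWorkLoop;     // assumed initialized elsewhere

public:
    IOReturn logGated( void * record )
    {
        if (fWorkLoop->inGate() == false)
        {
            // Not under the gate: replay this call on the work loop.
            return fWorkLoop->runAction(
                OSMemberFunctionCast(IOWorkLoop::Action, this,
                                     &ExampleLogger::logGated),
                (OSObject *) this, record);
        }

        // Under the gate: safe to touch work-loop-owned state.
        return record ? kIOReturnSuccess : kIOReturnBadArgument;
    }
};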
//****************************************************************************** -void IOPMrootDomain::powerChangeDone( unsigned long previousState ) +void IOPMrootDomain::powerChangeDone( unsigned long previousPowerState ) { + PMEventDetails *details; + ASSERT_GATED(); DLOG("PowerChangeDone: %u->%u\n", - (uint32_t) previousState, (uint32_t) getPowerState()); - - switch ( getPowerState() ) { - case SLEEP_STATE: - if ( previousState != ON_STATE ) - break; + (uint32_t) previousPowerState, (uint32_t) getPowerState()); + + switch ( getPowerState() ) + { + case SLEEP_STATE: { + if (previousPowerState != ON_STATE) + break; + + details = PMEventDetails::eventDetails( + kIOPMEventTypeSleepDone, + NULL, + NULL, + kIOReturnSuccess); + + recordAndReleasePMEvent( details ); - if ( canSleep ) - { - // re-enable this timer for next sleep - cancelIdleSleepTimer(); - wranglerTickled = true; + // re-enable this timer for next sleep + cancelIdleSleepTimer(); - clock_sec_t secs; - clock_usec_t microsecs; - clock_get_calendar_microtime(&secs, &microsecs); - logtime(secs); - gIOLastSleepTime.tv_sec = secs; - gIOLastSleepTime.tv_usec = microsecs; - gIOLastWakeTime.tv_sec = 0; - gIOLastWakeTime.tv_usec = 0; + clock_sec_t secs; + clock_usec_t microsecs; + clock_get_calendar_microtime(&secs, &microsecs); + logtime(secs); + gIOLastSleepTime.tv_sec = secs; + gIOLastSleepTime.tv_usec = microsecs; + gIOLastWakeTime.tv_sec = 0; + gIOLastWakeTime.tv_usec = 0; #if HIBERNATION - LOG("System %sSleep\n", gIOHibernateState ? "Safe" : ""); - - tracePoint(kIOPMTracePointSystemHibernatePhase); + LOG("System %sSleep\n", gIOHibernateState ? "Safe" : ""); - IOHibernateSystemHasSlept(); + IOHibernateSystemHasSlept(); - evaluateSystemSleepPolicyFinal(); + evaluateSystemSleepPolicyFinal(); #else - LOG("System Sleep\n"); + LOG("System Sleep\n"); #endif - tracePoint(kIOPMTracePointSystemSleepPlatformPhase); + getPlatform()->sleepKernel(); - getPlatform()->sleepKernel(); + // The CPU(s) are off at this point; + // code will resume execution here upon wake. - // The CPU(s) are off at this point. When they're awakened by CPU interrupt, - // code will resume execution here. + clock_get_uptime(&systemWakeTime); - // Now we're waking... - tracePoint(kIOPMTracePointSystemWakeDriversPhase); - #if HIBERNATION - IOHibernateSystemWake(); + IOHibernateSystemWake(); #endif - // sleep transition complete - gSleepOrShutdownPending = 0; - - // trip the reset of the calendar clock - clock_wakeup_calendar(); - - // get us some power - patriarch->wakeSystem(); - - // Set indicator if UUID was set - allow it to be cleared. - if (getProperty(kIOPMSleepWakeUUIDKey)) - gSleepWakeUUIDIsSet = true; + // sleep transition complete + gSleepOrShutdownPending = 0; -#if !ROOT_DOMAIN_RUN_STATES - tellClients(kIOMessageSystemWillPowerOn, clientMessageFilter); -#endif + // trip the reset of the calendar clock + clock_wakeup_calendar(); #if HIBERNATION - LOG("System %sWake\n", gIOHibernateState ? "SafeSleep" : ""); + LOG("System %sWake\n", gIOHibernateState ? 
"SafeSleep " : ""); #endif - // log system wake - getPlatform()->PMLog(kIOPMrootDomainClass, kPMLogSystemWake, 0, 0); - lowBatteryCondition = false; + // log system wake + getPlatform()->PMLog(kIOPMrootDomainClass, kPMLogSystemWake, 0, 0); + lowBatteryCondition = false; + lastSleepReason = 0; + + // And start logging the wake event here + // TODO: Publish the wakeReason string as an integer + details = PMEventDetails::eventDetails( + kIOPMEventTypeWake, + NULL, + 0, + kIOReturnSuccess); + + recordAndReleasePMEvent( details ); + #ifndef __LP64__ - // tell the tree we're waking - systemWake(); + systemWake(); #endif - #if defined(__i386__) || defined(__x86_64__) - sleepTimerMaintenance = false; -#if ROOT_DOMAIN_RUN_STATES - OSString * wakeType = OSDynamicCast( - OSString, getProperty(kIOPMRootDomainWakeTypeKey)); - if (wakeType && wakeType->isEqualTo(kIOPMrootDomainWakeTypeLowBattery)) + wranglerTickled = false; + graphicsSuppressed = false; + darkWakePostTickle = false; + logGraphicsClamp = true; + logWranglerTickle = true; + sleepTimerMaintenance = false; + + OSString * wakeType = OSDynamicCast( + OSString, getProperty(kIOPMRootDomainWakeTypeKey)); + OSString * wakeReason = OSDynamicCast( + OSString, getProperty(kIOPMRootDomainWakeReasonKey)); + + if (wakeType && wakeType->isEqualTo(kIOPMrootDomainWakeTypeLowBattery)) + { + lowBatteryCondition = true; + darkWakeMaintenance = true; + darkWakeToSleepASAP = true; + } + else if ((gDarkWakeFlags & kDarkWakeFlagHIDTickleMask) != 0) + { + OSNumber * hibOptions = OSDynamicCast( + OSNumber, getProperty(kIOHibernateOptionsKey)); + + if (hibernateAborted || + ((hibOptions && + !(hibOptions->unsigned32BitValue() & kIOHibernateOptionDarkWake))) || + ((_debugWakeSeconds != 0) && + ((gDarkWakeFlags & kDarkWakeFlagAlarmIsDark) == 0)) || + (wakeType && ( + wakeType->isEqualTo(kIOPMRootDomainWakeTypeUser) || + wakeType->isEqualTo(kIOPMRootDomainWakeTypeAlarm)))) { - lowBatteryCondition = true; - updateRunState(kRStateMaintenance); - wranglerTickled = false; + wranglerTickled = true; } - else if (wakeType && !hibernateAborted && wakeType->isEqualTo(kIOPMRootDomainWakeTypeSleepTimer)) + else + if (wakeType && + wakeType->isEqualTo(kIOPMRootDomainWakeTypeMaintenance)) { - sleepTimerMaintenance = true; - updateRunState(kRStateMaintenance); - wranglerTickled = false; + darkWakeMaintenance = true; + darkWakeToSleepASAP = true; } - else if (wakeType && !hibernateAborted && wakeType->isEqualTo(kIOPMRootDomainWakeTypeMaintenance)) + else + if (wakeType && + wakeType->isEqualTo(kIOPMRootDomainWakeTypeSleepTimer)) { - updateRunState(kRStateMaintenance); - wranglerTickled = false; + darkWakeMaintenance = true; + darkWakeToSleepASAP = true; + sleepTimerMaintenance = true; } else -#endif /* ROOT_DOMAIN_RUN_STATES */ { - updateRunState(kRStateNormal); - reportUserInput(); - } -#else /* !__i386__ && !__x86_64__ */ - // stay awake for at least 30 seconds - startIdleSleepTimer(30); - reportUserInput(); -#endif - - changePowerStateToPriv(ON_STATE); - } else { - updateRunState(kRStateNormal); + // Unidentified wake source, resume to full wake if debug + // alarm is pending. - // allow us to step up a power state - patriarch->sleepToDoze(); - - // ignore children's request for higher power during doze. 
- changePowerStateWithOverrideTo(DOZE_STATE); + if (_debugWakeSeconds && (!wakeReason || wakeReason->isEqualTo(""))) + wranglerTickled = true; + else + darkWakeToSleepASAP = true; + } } - break; - - case DOZE_STATE: - if ( previousState != DOZE_STATE ) + else { - LOG("System Doze\n"); - } - // re-enable this timer for next sleep - cancelIdleSleepTimer(); - gSleepOrShutdownPending = 0; - - // Invalidate prior activity tickles to allow wake from doze. - if (wrangler) wrangler->changePowerStateTo(0); - break; - -#if ROOT_DOMAIN_RUN_STATES - case ON_STATE: - // SLEEP -> ON (Maintenance) - // Go back to sleep, unless cancelled by a HID event. + // Post a HID tickle immediately - except for maintenance wake. - if ((previousState == SLEEP_STATE) && - (runStateIndex == kRStateMaintenance) && - !wranglerTickled) - { - if (lowBatteryCondition) + if (hibernateAborted || !wakeType || + !wakeType->isEqualTo(kIOPMRootDomainWakeTypeMaintenance)) { - lastSleepReason = kIOPMSleepReasonLowPower; - setProperty(kRootDomainSleepReasonKey, kIOPMLowPowerSleepKey); + wranglerTickled = true; } else { - lastSleepReason = kIOPMSleepReasonMaintenance; - setProperty(kRootDomainSleepReasonKey, kIOPMMaintenanceSleepKey); + darkWakeMaintenance = true; + darkWakeToSleepASAP = true; } - changePowerStateWithOverrideTo(SLEEP_STATE); } - // ON -> ON triggered by R-state changes. - - if ((previousState == ON_STATE) && - (runStateIndex != nextRunStateIndex) && - (nextRunStateIndex < kRStateCount)) + if (wranglerTickled) + reportUserInput(); + else if (!darkWakeMaintenance) { - LOG("R-state changed %u->%u\n", - runStateIndex, nextRunStateIndex); - updateRunState(nextRunStateIndex); + // Early/late tickle for non-maintenance wake. + if (((gDarkWakeFlags & kDarkWakeFlagHIDTickleMask) == + kDarkWakeFlagHIDTickleEarly) || + ((gDarkWakeFlags & kDarkWakeFlagHIDTickleMask) == + kDarkWakeFlagHIDTickleLate)) + { + darkWakePostTickle = true; + } + } +#else /* !__i386__ && !__x86_64__ */ + // stay awake for at least 30 seconds + wranglerTickled = true; + startIdleSleepTimer(30); +#endif + + changePowerStateToPriv(ON_STATE); + } break; + + case ON_STATE: { + bool wasPrevented = childPreventSystemSleep; + + details = PMEventDetails::eventDetails( + kIOPMEventTypeWakeDone, + NULL, + 0, + kIOReturnSuccess); + + recordAndReleasePMEvent( details ); + + if (previousPowerState != ON_STATE) + _debugWakeSeconds = 0; + + // Update childPreventSystemSleep flag using the capability computed + // by IOSevice::rebuildChildClampBits(). + + childPreventSystemSleep = + ((currentCapability() & kIOPMChildClamp2) != 0); - DLOG("kIOMessageSystemHasPoweredOn (%u)\n", - gMessageClientType); - tellClients(kIOMessageSystemHasPoweredOn, clientMessageFilter); + if (wasPrevented && !childPreventSystemSleep) + { + evaluatePolicy( kStimulusDarkWakeEvaluate ); } - - break; -#endif /* ROOT_DOMAIN_RUN_STATES */ + } break; } } - //****************************************************************************** -// wakeFromDoze +// requestPowerDomainState // -// The Display Wrangler calls here when it switches to its highest state. -// If the system is currently dozing, allow it to wake by making sure the -// parent is providing power. +// Extend implementation in IOService. Running on PM work loop thread. +// +// Examine children desires and initiate idle-sleep if all children are idle, +// prevent idle and system sleep flags are not set. 
//****************************************************************************** -void IOPMrootDomain::wakeFromDoze( void ) +IOReturn IOPMrootDomain::requestPowerDomainState ( + IOPMPowerFlags childDesire, + IOPowerConnection * childConnection, + unsigned long specification ) { - if ( getPowerState() == DOZE_STATE ) + OSIterator *iter; + OSObject *next; + IOPowerConnection *connection; + IOPMPowerFlags mergedChildDesire = 0; + IOPMPowerFlags editedChildDesire; + IOPMPowerFlags thisDesire; + bool sleepASAP = false; + + ASSERT_GATED(); + + // Disregard disk I/O (anything besides the display wrangler) as a + // factor in preventing idle sleep - based on a runtime setting. + + if ((gDarkWakeFlags & kDarkWakeFlagIgnoreDiskIOAlways) && + (kIOPMPreventIdleSleep & childDesire) && + (childConnection != wranglerConnection)) { - tracePoint(kIOPMTracePointSystemWakeDriversPhase); - changePowerStateToPriv(ON_STATE); - patriarch->wakeSystem(); + childDesire &= ~kIOPMPreventIdleSleep; } -} + // Force the child's input power requirement to 0 unless the prevent + // idle-sleep flag is set. Nil input power flags maps to our state 0. + // Our power clamp (deviceDesire) clamps the lowest power state at 2. -//****************************************************************************** -// publishFeature -// -// Adds a new feature to the supported features dictionary -//****************************************************************************** + editedChildDesire = 0; + if (childDesire & kIOPMPreventIdleSleep) + editedChildDesire |= (kIOPMPowerOn | kIOPMPreventIdleSleep); + if (childDesire & kIOPMPreventSystemSleep) + editedChildDesire |= (kIOPMPowerOn | kIOPMPreventSystemSleep); -void IOPMrootDomain::publishFeature( const char * feature ) -{ - publishFeature(feature, kRD_AllPowerSources, NULL); -} + iter = getChildIterator(gIOPowerPlane); + if ( iter ) + { + while ( (next = iter->getNextObject()) ) + { + if ( (connection = OSDynamicCast(IOPowerConnection, next)) ) + { + // Ignore child that are in the process of joining. + if (connection->getReadyFlag() == false) + continue; + // OR in the child's input power requirements. + // Is this connection attached to the child that called + // requestPowerDomainState()? -//****************************************************************************** -// publishFeature (with supported power source specified) -// -// Adds a new feature to the supported features dictionary -//****************************************************************************** - -void IOPMrootDomain::publishFeature( - const char *feature, - uint32_t supportedWhere, - uint32_t *uniqueFeatureID) -{ - static uint16_t next_feature_id = 500; - - OSNumber *new_feature_data = NULL; - OSNumber *existing_feature = NULL; - OSArray *existing_feature_arr = NULL; - OSObject *osObj = NULL; - uint32_t feature_value = 0; - - supportedWhere &= kRD_AllPowerSources; // mask off any craziness! - - if(!supportedWhere) { - // Feature isn't supported anywhere! - return; - } - - if(next_feature_id > 5000) { - // Far, far too many features! 
- return; - } - - if(featuresDictLock) IOLockLock(featuresDictLock); + if (connection == childConnection) + { + thisDesire = editedChildDesire; + } + else + { + thisDesire = 0; + if (connection->getPreventIdleSleepFlag()) + thisDesire |= (kIOPMPowerOn | kIOPMPreventIdleSleep); + if (connection->getPreventSystemSleepFlag()) + thisDesire |= (kIOPMPowerOn | kIOPMPreventSystemSleep); + } - OSDictionary *features = - (OSDictionary *) getProperty(kRootDomainSupportedFeatures); - - // Create new features dict if necessary - if ( features && OSDynamicCast(OSDictionary, features)) { - features = OSDictionary::withDictionary(features); - } else { - features = OSDictionary::withCapacity(1); - } - - // Create OSNumber to track new feature - - next_feature_id += 1; - if( uniqueFeatureID ) { - // We don't really mind if the calling kext didn't give us a place - // to stash their unique id. Many kexts don't plan to unload, and thus - // have no need to remove themselves later. - *uniqueFeatureID = next_feature_id; + mergedChildDesire |= thisDesire; + if (thisDesire && (kIOLogPMRootDomain & gIOKitDebug)) + { + IOService * child = + (IOService *) connection->getChildEntry(gIOPowerPlane); + LOG("child %p, noIdle %d, noSleep %d - %s\n", + child, + ((thisDesire & kIOPMPreventIdleSleep) != 0), + ((thisDesire & kIOPMPreventSystemSleep) != 0), + child ? child->getName() : "?"); + } + } + } + iter->release(); } - feature_value = (uint32_t)next_feature_id; - feature_value <<= 16; - feature_value += supportedWhere; - - new_feature_data = OSNumber::withNumber( - (unsigned long long)feature_value, 32); + DLOG("mergedChildDesire 0x%lx, extraSleepDelay %ld\n", + mergedChildDesire, extraSleepDelay); - // Does features object already exist? - if( (osObj = features->getObject(feature)) ) + if ( !mergedChildDesire && !systemBooting ) { - if(( existing_feature = OSDynamicCast(OSNumber, osObj) )) - { - // We need to create an OSArray to hold the now 2 elements. - existing_feature_arr = OSArray::withObjects( - (const OSObject **)&existing_feature, 1, 2); - } else if(( existing_feature_arr = OSDynamicCast(OSArray, osObj) )) + if (!wrangler) { - // Add object to existing array - existing_feature_arr = OSArray::withArray( - existing_feature_arr, - existing_feature_arr->getCount() + 1); + changePowerStateToPriv(ON_STATE); + if (idleSeconds) + { + // stay awake for at least idleSeconds + startIdleSleepTimer(idleSeconds); + } } - - if (existing_feature_arr) + else if (!extraSleepDelay && !idleSleepTimerPending && !systemDarkWake) { - existing_feature_arr->setObject(new_feature_data); - features->setObject(feature, existing_feature_arr); - existing_feature_arr->release(); - existing_feature_arr = 0; + sleepASAP = true; } - } else { - // The easy case: no previously existing features listed. We simply - // set the OSNumber at key 'feature' and we're on our way. - features->setObject(feature, new_feature_data); } - - new_feature_data->release(); - setProperty(kRootDomainSupportedFeatures, features); + // Drop our power clamp to SLEEP_STATE when all children became idle, + // and system sleep and display sleep slider values are equal. - features->release(); + adjustPowerState(sleepASAP); - if(featuresDictLock) IOLockUnlock(featuresDictLock); + // If our power clamp has already dropped to SLEEP_STATE, and no child + // is keeping us at ON_STATE, then the following will trigger idle sleep. 
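//******************************************************************************
// A minimal sketch, not part of this patch, of the desire-editing step inside
// requestPowerDomainState() above: only the two "prevent" flags survive from
// a child's raw desire, so an otherwise idle child maps to domain state 0 and
// cannot keep the domain powered by itself. The helper name is hypothetical;
// the flag constants are the real ones from IOPM.h.
//******************************************************************************

#include <IOKit/pwr_mgt/IOPM.h>

static IOPMPowerFlags exampleEditChildDesire( IOPMPowerFlags childDesire )
{
    IOPMPowerFlags edited = 0;

    if (childDesire & kIOPMPreventIdleSleep)
        edited |= (kIOPMPowerOn | kIOPMPreventIdleSleep);
    if (childDesire & kIOPMPreventSystemSleep)
        edited |= (kIOPMPowerOn | kIOPMPreventSystemSleep);

    return edited;
}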
- // Notify EnergySaver and all those in user space so they might - // re-populate their feature specific UI - if(pmPowerStateQueue) { - pmPowerStateQueue->submitPowerEvent( kPowerEventFeatureChanged ); - } + return super::requestPowerDomainState( + editedChildDesire, childConnection, specification); } - //****************************************************************************** -// removePublishedFeature +// tellChangeDown // -// Removes previously published feature +// Override the superclass implementation to send a different message type. //****************************************************************************** -IOReturn IOPMrootDomain::removePublishedFeature( uint32_t removeFeatureID ) +bool IOPMrootDomain::tellChangeDown( unsigned long stateNum ) { - IOReturn ret = kIOReturnError; - uint32_t feature_value = 0; - uint16_t feature_id = 0; - bool madeAChange = false; - - OSSymbol *dictKey = NULL; - OSCollectionIterator *dictIterator = NULL; - OSArray *arrayMember = NULL; - OSNumber *numberMember = NULL; - OSObject *osObj = NULL; - OSNumber *osNum = NULL; - OSArray *arrayMemberCopy; + DLOG("tellChangeDown %u->%u\n", + (uint32_t) getPowerState(), (uint32_t) stateNum); - if(featuresDictLock) IOLockLock(featuresDictLock); - - OSDictionary *features = - (OSDictionary *) getProperty(kRootDomainSupportedFeatures); - - if ( features && OSDynamicCast(OSDictionary, features) ) + if (SLEEP_STATE == stateNum) { - // Any modifications to the dictionary are made to the copy to prevent - // races & crashes with userland clients. Dictionary updated - // automically later. - features = OSDictionary::withDictionary(features); - } else { - features = NULL; - ret = kIOReturnNotFound; - goto exit; - } - - // We iterate 'features' dictionary looking for an entry tagged - // with 'removeFeatureID'. If found, we remove it from our tracking - // structures and notify the OS via a general interest message. - - dictIterator = OSCollectionIterator::withCollection(features); - if(!dictIterator) { - goto exit; + if (!ignoreTellChangeDown) + tracePoint( kIOPMTracePointSleepApplications ); + else + tracePoint( kIOPMTracePointSleepPriorityClients ); } - - while( (dictKey = OSDynamicCast(OSSymbol, dictIterator->getNextObject())) ) + + if ((SLEEP_STATE == stateNum) && !ignoreTellChangeDown) { - osObj = features->getObject(dictKey); - - // Each Feature is either tracked by an OSNumber - if( osObj && (numberMember = OSDynamicCast(OSNumber, osObj)) ) - { - feature_value = numberMember->unsigned32BitValue(); - feature_id = (uint16_t)(feature_value >> 16); + userActivityAtSleep = userActivityCount; + hibernateAborted = false; + DLOG("tellChangeDown::userActivityAtSleep %d\n", userActivityAtSleep); - if( feature_id == (uint16_t)removeFeatureID ) - { - // Remove this node - features->removeObject(dictKey); - madeAChange = true; - break; - } - - // Or tracked by an OSArray of OSNumbers - } else if( osObj && (arrayMember = OSDynamicCast(OSArray, osObj)) ) - { - unsigned int arrayCount = arrayMember->getCount(); - - for(unsigned int i=0; i<arrayCount; i++) - { - osNum = OSDynamicCast(OSNumber, arrayMember->getObject(i)); - if(!osNum) { - continue; - } - - feature_value = osNum->unsigned32BitValue(); - feature_id = (uint16_t)(feature_value >> 16); + // Direct callout into OSKext so it can disable kext unloads + // during sleep/wake to prevent deadlocks. 
+ OSKextSystemSleepOrWake( kIOMessageSystemWillSleep ); - if( feature_id == (uint16_t)removeFeatureID ) - { - // Remove this node - if( 1 == arrayCount ) { - // If the array only contains one element, remove - // the whole thing. - features->removeObject(dictKey); - } else { - // Otherwise remove the element from a copy of the array. - arrayMemberCopy = OSArray::withArray(arrayMember); - if (arrayMemberCopy) - { - arrayMemberCopy->removeObject(i); - features->setObject(dictKey, arrayMemberCopy); - arrayMemberCopy->release(); - } - } + IOService::updateConsoleUsers(NULL, kIOMessageSystemWillSleep); - madeAChange = true; - break; - } - } - } - } - - dictIterator->release(); - - if( madeAChange ) - { - ret = kIOReturnSuccess; + // Notify platform that sleep has begun + getPlatform()->callPlatformFunction( + sleepMessagePEFunction, false, + (void *)(uintptr_t) kIOMessageSystemWillSleep, + NULL, NULL, NULL); - setProperty(kRootDomainSupportedFeatures, features); - - // Notify EnergySaver and all those in user space so they might - // re-populate their feature specific UI - if(pmPowerStateQueue) { - pmPowerStateQueue->submitPowerEvent( kPowerEventFeatureChanged ); - } - } else { - ret = kIOReturnNotFound; + // Two change downs are sent by IOServicePM. Ignore the 2nd. + // But tellClientsWithResponse() must be called for both. + ignoreTellChangeDown = true; } - -exit: - if(features) features->release(); - if(featuresDictLock) IOLockUnlock(featuresDictLock); - return ret; -} + return super::tellClientsWithResponse( kIOMessageSystemWillSleep ); +} //****************************************************************************** -// announcePowerSourceChange +// askChangeDown // -// Notifies "interested parties" that the battery state has changed +// Override the superclass implementation to send a different message type. +// This must be idle sleep since we don't ask during any other power change. //****************************************************************************** -void IOPMrootDomain::announcePowerSourceChange( void ) +bool IOPMrootDomain::askChangeDown( unsigned long stateNum ) { -#ifdef __ppc__ - IORegistryEntry *_batteryRegEntry = (IORegistryEntry *) getProperty("BatteryEntry"); + DLOG("askChangeDown %u->%u\n", + (uint32_t) getPowerState(), (uint32_t) stateNum); - // (if possible) re-publish power source state under IOPMrootDomain; - // only do so if the battery controller publishes an IOResource - // defining battery location. Called from ApplePMU battery driver. + // Don't log for dark wake entry + if (kSystemTransitionSleep == _systemTransitionType) + tracePoint( kIOPMTracePointSleepApplications ); - if(_batteryRegEntry) - { - OSArray *batt_info; - batt_info = (OSArray *) _batteryRegEntry->getProperty(kIOBatteryInfoKey); - if(batt_info) - setProperty(kIOBatteryInfoKey, batt_info); - } -#endif + return super::tellClientsWithResponse( kIOMessageCanSystemSleep ); } - //****************************************************************************** -// setPMSetting (private) +// askChangeDownDone // -// Internal helper to relay PM settings changes from user space to individual -// drivers. Should be called only by IOPMrootDomain::setProperties. +// Called by PM after all apps have responded to kIOMessageCanSystemSleep. +// pmconfigd may create a deny sleep assertion before ack'ing. 
//****************************************************************************** -IOReturn IOPMrootDomain::setPMSetting( - const OSSymbol *type, - OSObject *obj) +void IOPMrootDomain::askChangeDownDone( + IOPMPowerChangeFlags * inOutChangeFlags, bool * cancel ) { - OSArray *arr = NULL; - PMSettingObject *p_obj = NULL; - int count; - int i; + DLOG("askChangeDownDone(0x%x, %u) type %x, cap %x->%x\n", + *inOutChangeFlags, *cancel, + _systemTransitionType, + _currentCapability, _pendingCapability); - if(NULL == type) return kIOReturnBadArgument; + if ((false == *cancel) && (kSystemTransitionSleep == _systemTransitionType)) + { + // Dark->Sleep transition. + // Check if there are any deny sleep assertions. + // Full->Dark transition is never cancelled. - IORecursiveLockLock(settingsCtrlLock); - - fPMSettingsDict->setObject(type, obj); + if (!checkSystemCanSleep(true)) + { + // Cancel dark wake to sleep transition. + // Must re-scan assertions upon entering dark wake. - arr = (OSArray *)settingsCallbacks->getObject(type); - if(NULL == arr) goto exit; - count = arr->getCount(); - for(i=0; i<count; i++) { - p_obj = (PMSettingObject *)OSDynamicCast(PMSettingObject, arr->getObject(i)); - if(p_obj) p_obj->setPMSetting(type, obj); + *cancel = true; + DLOG("cancel dark->sleep\n"); + } } - -exit: - IORecursiveLockUnlock(settingsCtrlLock); - return kIOReturnSuccess; } - //****************************************************************************** -// copyPMSetting (public) +// tellNoChangeDown // -// Allows kexts to safely read setting values, without being subscribed to -// notifications. -//****************************************************************************** - -OSObject * IOPMrootDomain::copyPMSetting( - OSSymbol *whichSetting) -{ - OSObject *obj = NULL; +// Notify registered applications and kernel clients that we are not dropping +// power. +// +// We override the superclass implementation so we can send a different message +// type to the client or application being notified. +// +// This must be a vetoed idle sleep, since no other power change can be vetoed. +//****************************************************************************** - if(!whichSetting) return NULL; +void IOPMrootDomain::tellNoChangeDown( unsigned long stateNum ) +{ + DLOG("tellNoChangeDown %u->%u\n", + (uint32_t) getPowerState(), (uint32_t) stateNum); - IORecursiveLockLock(settingsCtrlLock); - obj = fPMSettingsDict->getObject(whichSetting); - if(obj) { - obj->retain(); + if (idleSeconds && !wrangler) + { + // stay awake for at least idleSeconds + startIdleSleepTimer(idleSeconds); } - IORecursiveLockUnlock(settingsCtrlLock); - - return obj; + return tellClients( kIOMessageSystemWillNotSleep ); } - //****************************************************************************** -// registerPMSettingController (public) +// tellChangeUp // -// direct wrapper to registerPMSettingController with uint32_t power source arg +// Notify registered applications and kernel clients that we are raising power. +// +// We override the superclass implementation so we can send a different message +// type to the client or application being notified. 
//****************************************************************************** -IOReturn IOPMrootDomain::registerPMSettingController( - const OSSymbol * settings[], - IOPMSettingControllerCallback func, - OSObject *target, - uintptr_t refcon, - OSObject **handle) +void IOPMrootDomain::tellChangeUp( unsigned long stateNum ) { - return registerPMSettingController( - settings, - (kIOPMSupportedOnAC | kIOPMSupportedOnBatt | kIOPMSupportedOnUPS), - func, target, refcon, handle); -} + OSData *publishPMStats = NULL; + + DLOG("tellChangeUp %u->%u\n", + (uint32_t) getPowerState(), (uint32_t) stateNum); + + ignoreTellChangeDown = false; + + if ( stateNum == ON_STATE ) + { + // Direct callout into OSKext so it can disable kext unloads + // during sleep/wake to prevent deadlocks. + OSKextSystemSleepOrWake( kIOMessageSystemHasPoweredOn ); + + // Notify platform that sleep was cancelled or resumed. + getPlatform()->callPlatformFunction( + sleepMessagePEFunction, false, + (void *)(uintptr_t) kIOMessageSystemHasPoweredOn, + NULL, NULL, NULL); + + if (getPowerState() == ON_STATE) + { + // this is a quick wake from aborted sleep + if (idleSeconds && !wrangler) + { + // stay awake for at least idleSeconds + startIdleSleepTimer(idleSeconds); + } + tellClients( kIOMessageSystemWillPowerOn ); + } + + tracePoint( kIOPMTracePointWakeApplications ); + publishPMStats = OSData::withBytes(&pmStats, sizeof(pmStats)); + setProperty(kIOPMSleepStatisticsKey, publishPMStats); + publishPMStats->release(); + bzero(&pmStats, sizeof(pmStats)); + + if (pmStatsAppResponses) + { + setProperty(kIOPMSleepStatisticsAppsKey, pmStatsAppResponses); + pmStatsAppResponses->release(); + pmStatsAppResponses = OSArray::withCapacity(5); + } + tellClients( kIOMessageSystemHasPoweredOn ); + } +} //****************************************************************************** -// registerPMSettingController (public) +// sysPowerDownHandler // -// Kexts may register for notifications when a particular setting is changed. -// A list of settings is available in IOPM.h. -// Arguments: -// * settings - An OSArray containing OSSymbols. Caller should populate this -// array with a list of settings caller wants notifications from. -// * func - A C function callback of the type IOPMSettingControllerCallback -// * target - caller may provide an OSObject *, which PM will pass as an -// target to calls to "func" -// * refcon - caller may provide an void *, which PM will pass as an -// argument to calls to "func" -// * handle - This is a return argument. We will populate this pointer upon -// call success. Hold onto this and pass this argument to -// IOPMrootDomain::deRegisterPMSettingCallback when unloading your kext -// Returns: -// kIOReturnSuccess on success +// Perform a vfs sync before system sleep. 
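//******************************************************************************
// A minimal sketch, not part of this patch, of the statistics publication
// done by tellChangeUp() above. OSData::withBytes() copies the buffer, so the
// local struct can be rezeroed immediately to start the next measurement
// period. The struct layout and property key are illustrative stand-ins.
//******************************************************************************

#include <IOKit/IOService.h>
#include <libkern/c++/OSData.h>
#include <string.h>

struct ExampleStats {                   // stand-in for the pmStats layout
    uint32_t hibernateWriteMs;
    uint32_t hibernateReadMs;
};

static void examplePublishStats( IOService * svc, ExampleStats * stats )
{
    OSData * data = OSData::withBytes(stats, sizeof(*stats));   // copies bytes
    if (data)
    {
        svc->setProperty("ExampleSleepStatistics", data);
        data->release();                // the registry holds its own reference
        bzero(stats, sizeof(*stats));   // begin a fresh period
    }
}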
//****************************************************************************** -IOReturn IOPMrootDomain::registerPMSettingController( - const OSSymbol * settings[], - uint32_t supportedPowerSources, - IOPMSettingControllerCallback func, - OSObject *target, - uintptr_t refcon, - OSObject **handle) +IOReturn IOPMrootDomain::sysPowerDownHandler( + void * target, void * refCon, + UInt32 messageType, IOService * service, + void * messageArgs, vm_size_t argSize ) { - PMSettingObject *pmso = NULL; - OSArray *list = NULL; - IOReturn ret = kIOReturnSuccess; - int i; + IOReturn ret; - if( NULL == settings || - NULL == func || - NULL == handle) + DLOG("sysPowerDownHandler message %s\n", getIOMessageString(messageType)); + + if (!gRootDomain) + return kIOReturnUnsupported; + + if (messageType == kIOMessageSystemCapabilityChange) { - return kIOReturnBadArgument; - } + IOPMSystemCapabilityChangeParameters * params = + (IOPMSystemCapabilityChangeParameters *) messageArgs; + + // Interested applications have been notified of an impending power + // change and have acked (when applicable). + // This is our chance to save whatever state we can before powering + // down. + // We call sync_internal defined in xnu/bsd/vfs/vfs_syscalls.c, + // via callout + + DLOG("sysPowerDownHandler cap %x -> %x (flags %x)\n", + params->fromCapabilities, params->toCapabilities, + params->changeFlags); + + if ((params->changeFlags & kIOPMSystemCapabilityWillChange) && + (params->fromCapabilities & kIOPMSystemCapabilityCPU) && + (params->toCapabilities & kIOPMSystemCapabilityCPU) == 0) + { + // We will ack within 20 seconds + params->maxWaitForReply = 20 * 1000 * 1000; +#if HIBERNATION + gRootDomain->evaluateSystemSleepPolicyEarly(); - pmso = PMSettingObject::pmSettingObject( - (IOPMrootDomain *)this, func, target, - refcon, supportedPowerSources, settings); + // add in time we could spend freeing pages + if (gRootDomain->hibernateMode && !gRootDomain->hibernateDisabled) + { + params->maxWaitForReply = kCapabilityClientMaxWait; + } + DLOG("sysPowerDownHandler timeout %d s\n", (int) (params->maxWaitForReply / 1000 / 1000)); +#endif + + if ( !OSCompareAndSwap( 0, 1, &gSleepOrShutdownPending ) ) + { + // Purposely delay the ack and hope that shutdown occurs quickly. + // Another option is not to schedule the thread and wait for + // ack timeout... + AbsoluteTime deadline; + clock_interval_to_deadline( 30, kSecondScale, &deadline ); + thread_call_enter1_delayed( + gRootDomain->diskSyncCalloutEntry, + (thread_call_param_t) params->notifyRef, + deadline ); + } + else + thread_call_enter1( + gRootDomain->diskSyncCalloutEntry, + (thread_call_param_t) params->notifyRef); + } +#if HIBERNATION + else + if ((params->changeFlags & kIOPMSystemCapabilityDidChange) && + (params->toCapabilities & kIOPMSystemCapabilityCPU) && + (params->fromCapabilities & kIOPMSystemCapabilityCPU) == 0) + { + // We will ack within 110 seconds + params->maxWaitForReply = 110 * 1000 * 1000; - if(!pmso) { - ret = kIOReturnInternalError; - goto bail_no_unlock; + thread_call_enter1( + gRootDomain->diskSyncCalloutEntry, + (thread_call_param_t) params->notifyRef); + } +#endif + ret = kIOReturnSuccess; } - IORecursiveLockLock(settingsCtrlLock); - for(i=0; settings[i]; i++) + return ret; +} + +//****************************************************************************** +// handleQueueSleepWakeUUID +// +// Called from IOPMrootDomain when we're initiating a sleep, +// or indirectly from PM configd when PM decides to clear the UUID. 
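//******************************************************************************
// A minimal sketch, not part of this patch, of the guarded hand-off inside
// sysPowerDownHandler() above. OSCompareAndSwap() atomically claims the
// "transition pending" flag; if the claim fails because a shutdown is already
// in flight, the disk-sync callout is still scheduled, but only after a grace
// period. The globals and helper are hypothetical; the kernel APIs are real.
//******************************************************************************

#include <libkern/OSAtomic.h>
#include <kern/thread_call.h>
#include <kern/clock.h>
#include <mach/clock_types.h>

static volatile UInt32 gExamplePending;     // 0 = idle, 1 = transition pending
static thread_call_t   gExampleSyncCall;    // assumed allocated during init

static void exampleScheduleSync( void * notifyRef )
{
    if (!OSCompareAndSwap(0, 1, &gExamplePending))
    {
        // Lost the race: delay the callout 30 s rather than dropping it.
        uint64_t deadline;
        clock_interval_to_deadline(30, kSecondScale, &deadline);
        thread_call_enter1_delayed(gExampleSyncCall,
                                   (thread_call_param_t) notifyRef, deadline);
    }
    else
    {
        thread_call_enter1(gExampleSyncCall, (thread_call_param_t) notifyRef);
    }
}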
+// PM clears the UUID several minutes after successful wake from sleep, +// so that we might associate App spindumps with the immediately previous +// sleep/wake. +// +// @param obj has a retain on it. We're responsible for releasing that retain. +//****************************************************************************** + +void IOPMrootDomain::handleQueueSleepWakeUUID(OSObject *obj) +{ + OSString *str = NULL; + + if (kOSBooleanFalse == obj) { - list = (OSArray *)settingsCallbacks->getObject(settings[i]); - if(!list) { - // New array of callbacks for this setting - list = OSArray::withCapacity(1); - settingsCallbacks->setObject(settings[i], list); - list->release(); + handlePublishSleepWakeUUID(NULL); + } + else if ((str = OSDynamicCast(OSString, obj))) + { + // This branch caches the UUID for an upcoming sleep/wake + if (queuedSleepWakeUUIDString) { + queuedSleepWakeUUIDString->release(); + queuedSleepWakeUUIDString = NULL; } + queuedSleepWakeUUIDString = str; + queuedSleepWakeUUIDString->retain(); - // Add caller to the callback list - list->setObject(pmso); + DLOG("SleepWake UUID queued: %s\n", queuedSleepWakeUUIDString->getCStringNoCopy()); } - IORecursiveLockUnlock(settingsCtrlLock); - - ret = kIOReturnSuccess; + if (obj) { + obj->release(); + } + return; + +} +//****************************************************************************** +// handlePublishSleepWakeUUID +// +// Called from IOPMrootDomain when we're initiating a sleep, +// or indirectly from PM configd when PM decides to clear the UUID. +// PM clears the UUID several minutes after successful wake from sleep, +// so that we might associate App spindumps with the immediately previous +// sleep/wake. +//****************************************************************************** + +void IOPMrootDomain::handlePublishSleepWakeUUID( bool shouldPublish ) +{ + ASSERT_GATED(); + + /* + * Clear the current UUID + */ + if (gSleepWakeUUIDIsSet) + { + DLOG("SleepWake UUID cleared\n"); + + OSString *UUIDstring = NULL; + + if (timeline && + (UUIDstring = OSDynamicCast(OSString, getProperty(kIOPMSleepWakeUUIDKey)))) + { + PMEventDetails *details = PMEventDetails::eventDetails(kIOPMEventTypeUUIDClear, + UUIDstring->getCStringNoCopy(), NULL, 0); + if (details) { + timeline->recordSystemPowerEvent( details ); + details->release(); + } + timeline->setNumEventsLoggedThisPeriod(0); + } + + gSleepWakeUUIDIsSet = false; + + removeProperty(kIOPMSleepWakeUUIDKey); + messageClients(kIOPMMessageSleepWakeUUIDChange, kIOPMMessageSleepWakeUUIDCleared); + } + + /* + * Optionally, publish a new UUID + */ + if (queuedSleepWakeUUIDString && shouldPublish) { + + OSString *publishThisUUID = NULL; + + publishThisUUID = queuedSleepWakeUUIDString; + publishThisUUID->retain(); + + if (timeline) { + PMEventDetails *details; + details = PMEventDetails::eventDetails(kIOPMEventTypeUUIDSet, + publishThisUUID->getCStringNoCopy(), NULL, 0); + if (details) { + timeline->recordSystemPowerEvent( details ); + details->release(); + } + } + + if (publishThisUUID) + { + setProperty(kIOPMSleepWakeUUIDKey, publishThisUUID); + publishThisUUID->release(); + } + + gSleepWakeUUIDIsSet = true; + messageClients(kIOPMMessageSleepWakeUUIDChange, kIOPMMessageSleepWakeUUIDSet); + + queuedSleepWakeUUIDString->release(); + queuedSleepWakeUUIDString = NULL; + } +} + +//****************************************************************************** +// changePowerStateTo & changePowerStateToPriv +// +// Override of these methods for logging purposes. 
+//****************************************************************************** + +IOReturn IOPMrootDomain::changePowerStateTo( unsigned long ordinal ) +{ + return kIOReturnUnsupported; // ignored +} + +IOReturn IOPMrootDomain::changePowerStateToPriv( unsigned long ordinal ) +{ + DLOG("changePowerStateToPriv(%lu)\n", ordinal); + + if ((ordinal != ON_STATE) && (ordinal != SLEEP_STATE)) + return kIOReturnUnsupported; - // Track this instance by its OSData ptr from now on - *handle = pmso; + return super::changePowerStateToPriv(ordinal); +} + +//****************************************************************************** +// activity detect +// +//****************************************************************************** + +bool IOPMrootDomain::activitySinceSleep(void) +{ + return (userActivityCount != userActivityAtSleep); +} + +bool IOPMrootDomain::abortHibernation(void) +{ + bool ret = activitySinceSleep(); -bail_no_unlock: - if(kIOReturnSuccess != ret) + if (ret && !hibernateAborted) { - // Error return case - if(pmso) pmso->release(); - if(handle) *handle = NULL; + DLOG("activitySinceSleep ABORT [%d, %d]\n", userActivityCount, userActivityAtSleep); + hibernateAborted = true; } - return ret; + return (ret); } +extern "C" int +hibernate_should_abort(void) +{ + if (gRootDomain) + return (gRootDomain->abortHibernation()); + else + return (0); +} //****************************************************************************** // sleepOnClamshellClosed @@ -2544,13 +2723,13 @@ bail_no_unlock: bool IOPMrootDomain::shouldSleepOnClamshellClosed( void ) { - DLOG("clamshell state %d, EX %d, IG %d, IW %d, DT %d, AC %d\n", - clamshellIsClosed, clamshellExists, ignoringClamshell, - ignoringClamshellOnWake, desktopMode, acAdaptorConnected); + if (!clamshellExists) + return false; + + DLOG("clamshell closed %d, disabled %d, desktopMode %d, ac %d\n", + clamshellClosed, clamshellDisabled, desktopMode, acAdaptorConnected); - return ( !ignoringClamshell - && !ignoringClamshellOnWake - && !(desktopMode && acAdaptorConnected) ); + return ( !clamshellDisabled && !(desktopMode && acAdaptorConnected) ); } void IOPMrootDomain::sendClientClamshellNotification( void ) @@ -2560,7 +2739,7 @@ void IOPMrootDomain::sendClientClamshellNotification( void ) return; setProperty(kAppleClamshellStateKey, - clamshellIsClosed ? kOSBooleanTrue : kOSBooleanFalse); + clamshellClosed ? kOSBooleanTrue : kOSBooleanFalse); setProperty(kAppleClamshellCausesSleepKey, shouldSleepOnClamshellClosed() ? kOSBooleanTrue : kOSBooleanFalse); @@ -2569,28 +2748,566 @@ void IOPMrootDomain::sendClientClamshellNotification( void ) * ( kClamshellStateBit | kClamshellSleepBit ) */ messageClients(kIOPMMessageClamshellStateChange, - (void *) ( (clamshellIsClosed ? kClamshellStateBit : 0) + (void *) ( (clamshellClosed ? kClamshellStateBit : 0) | ( shouldSleepOnClamshellClosed() ? kClamshellSleepBit : 0)) ); } - //****************************************************************************** -// informCPUStateChange -// -// Call into PM CPU code so that CPU power savings may dynamically adjust for -// running on battery, with the lid closed, etc. 
+// getSleepSupported
 //
-// informCPUStateChange is a no-op on non x86 systems
-// only x86 has explicit support in the IntelCPUPowerManagement kext
+// Deprecated
 //******************************************************************************
 
-void IOPMrootDomain::informCPUStateChange(
-    uint32_t type,
-    uint32_t value )
+IOOptionBits IOPMrootDomain::getSleepSupported( void )
 {
-#if defined(__i386__) || defined(__x86_64__)
+    return( platformSleepSupport );
+}
 
-    pmioctlVariableInfo_t varInfoStruct;
+//******************************************************************************
+// setSleepSupported
+//
+// Deprecated
+//******************************************************************************
+
+void IOPMrootDomain::setSleepSupported( IOOptionBits flags )
+{
+    DLOG("setSleepSupported(%x)\n", (uint32_t) flags);
+    OSBitOrAtomic(flags, &platformSleepSupport);
+}
+
+//******************************************************************************
+// wakeFromDoze
+//
+// Deprecated.
+//******************************************************************************
+
+void IOPMrootDomain::wakeFromDoze( void )
+{
+    // Preserve symbol for families (IOUSBFamily and IOGraphics)
+}
+
+// MARK: -
+// MARK: Features
+
+//******************************************************************************
+// publishFeature
+//
+// Adds a new feature to the supported features dictionary
+//******************************************************************************
+
+void IOPMrootDomain::publishFeature( const char * feature )
+{
+    publishFeature(feature, kRD_AllPowerSources, NULL);
+}
+
+//******************************************************************************
+// publishFeature (with supported power source specified)
+//
+// Adds a new feature to the supported features dictionary
+//******************************************************************************
+
+void IOPMrootDomain::publishFeature(
+    const char *feature,
+    uint32_t supportedWhere,
+    uint32_t *uniqueFeatureID)
+{
+    static uint16_t next_feature_id = 500;
+
+    OSNumber *new_feature_data = NULL;
+    OSNumber *existing_feature = NULL;
+    OSArray *existing_feature_arr = NULL;
+    OSObject *osObj = NULL;
+    uint32_t feature_value = 0;
+
+    supportedWhere &= kRD_AllPowerSources; // mask off any craziness!
+
+    if(!supportedWhere) {
+        // Feature isn't supported anywhere!
+        return;
+    }
+
+    if(next_feature_id > 5000) {
+        // Far, far too many features!
+        return;
+    }
+
+    if(featuresDictLock) IOLockLock(featuresDictLock);
+
+    OSDictionary *features =
+        (OSDictionary *) getProperty(kRootDomainSupportedFeatures);
+
+    // Create new features dict if necessary
+    if ( features && OSDynamicCast(OSDictionary, features)) {
+        features = OSDictionary::withDictionary(features);
+    } else {
+        features = OSDictionary::withCapacity(1);
+    }
+
+    // Create OSNumber to track new feature
+
+    next_feature_id += 1;
+    if( uniqueFeatureID ) {
+        // We don't really mind if the calling kext didn't give us a place
+        // to stash their unique id. Many kexts don't plan to unload, and thus
+        // have no need to remove themselves later.
+        *uniqueFeatureID = next_feature_id;
+    }
+
+    feature_value = (uint32_t)next_feature_id;
+    feature_value <<= 16;
+    feature_value += supportedWhere;
+
+    new_feature_data = OSNumber::withNumber(
+                                (unsigned long long)feature_value, 32);
+
+    // Does features object already exist?
+    if( (osObj = features->getObject(feature)) )
+    {
+        if(( existing_feature = OSDynamicCast(OSNumber, osObj) ))
+        {
+            // We need to create an OSArray to hold the now 2 elements.
+            existing_feature_arr = OSArray::withObjects(
+                            (const OSObject **)&existing_feature, 1, 2);
+        } else if(( existing_feature_arr = OSDynamicCast(OSArray, osObj) ))
+        {
+            // Add object to existing array
+            existing_feature_arr = OSArray::withArray(
+                            existing_feature_arr,
+                            existing_feature_arr->getCount() + 1);
+        }
+
+        if (existing_feature_arr)
+        {
+            existing_feature_arr->setObject(new_feature_data);
+            features->setObject(feature, existing_feature_arr);
+            existing_feature_arr->release();
+            existing_feature_arr = 0;
+        }
+    } else {
+        // The easy case: no previously existing features listed. We simply
+        // set the OSNumber at key 'feature' and we're on our way.
+        features->setObject(feature, new_feature_data);
+    }
+
+    new_feature_data->release();
+
+    setProperty(kRootDomainSupportedFeatures, features);
+
+    features->release();
+
+    if(featuresDictLock) IOLockUnlock(featuresDictLock);
+
+    // Notify EnergySaver and all those in user space so they might
+    // re-populate their feature specific UI
+    if(pmPowerStateQueue) {
+        pmPowerStateQueue->submitPowerEvent( kPowerEventFeatureChanged );
+    }
+}
+
+//******************************************************************************
+// removePublishedFeature
+//
+// Removes previously published feature
+//******************************************************************************
+
+IOReturn IOPMrootDomain::removePublishedFeature( uint32_t removeFeatureID )
+{
+    IOReturn ret = kIOReturnError;
+    uint32_t feature_value = 0;
+    uint16_t feature_id = 0;
+    bool madeAChange = false;
+
+    OSSymbol *dictKey = NULL;
+    OSCollectionIterator *dictIterator = NULL;
+    OSArray *arrayMember = NULL;
+    OSNumber *numberMember = NULL;
+    OSObject *osObj = NULL;
+    OSNumber *osNum = NULL;
+    OSArray *arrayMemberCopy;
+
+    if(featuresDictLock) IOLockLock(featuresDictLock);
+
+    OSDictionary *features =
+        (OSDictionary *) getProperty(kRootDomainSupportedFeatures);
+
+    if ( features && OSDynamicCast(OSDictionary, features) )
+    {
+        // Any modifications to the dictionary are made to the copy to prevent
+        // races & crashes with userland clients. Dictionary updated
+        // atomically later.
+        features = OSDictionary::withDictionary(features);
+    } else {
+        features = NULL;
+        ret = kIOReturnNotFound;
+        goto exit;
+    }
+
+    // We iterate 'features' dictionary looking for an entry tagged
+    // with 'removeFeatureID'. If found, we remove it from our tracking
+    // structures and notify the OS via a general interest message.
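+    //
+    // Reading aid (illustration only, not code added by this change): each
+    // value stored here was packed by publishFeature() above as
+    //
+    //     uint32_t packed  = ((uint32_t) feature_id << 16) | supportedWhere;
+    //
+    // so the search below recovers and compares the id with
+    //
+    //     uint16_t id      = (uint16_t) (packed >> 16);
+    //     uint16_t sources = (uint16_t) (packed & 0xFFFF);   // kRD_* mask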
+ + dictIterator = OSCollectionIterator::withCollection(features); + if(!dictIterator) { + goto exit; + } + + while( (dictKey = OSDynamicCast(OSSymbol, dictIterator->getNextObject())) ) + { + osObj = features->getObject(dictKey); + + // Each Feature is either tracked by an OSNumber + if( osObj && (numberMember = OSDynamicCast(OSNumber, osObj)) ) + { + feature_value = numberMember->unsigned32BitValue(); + feature_id = (uint16_t)(feature_value >> 16); + + if( feature_id == (uint16_t)removeFeatureID ) + { + // Remove this node + features->removeObject(dictKey); + madeAChange = true; + break; + } + + // Or tracked by an OSArray of OSNumbers + } else if( osObj && (arrayMember = OSDynamicCast(OSArray, osObj)) ) + { + unsigned int arrayCount = arrayMember->getCount(); + + for(unsigned int i=0; i<arrayCount; i++) + { + osNum = OSDynamicCast(OSNumber, arrayMember->getObject(i)); + if(!osNum) { + continue; + } + + feature_value = osNum->unsigned32BitValue(); + feature_id = (uint16_t)(feature_value >> 16); + + if( feature_id == (uint16_t)removeFeatureID ) + { + // Remove this node + if( 1 == arrayCount ) { + // If the array only contains one element, remove + // the whole thing. + features->removeObject(dictKey); + } else { + // Otherwise remove the element from a copy of the array. + arrayMemberCopy = OSArray::withArray(arrayMember); + if (arrayMemberCopy) + { + arrayMemberCopy->removeObject(i); + features->setObject(dictKey, arrayMemberCopy); + arrayMemberCopy->release(); + } + } + + madeAChange = true; + break; + } + } + } + } + + dictIterator->release(); + + if( madeAChange ) + { + ret = kIOReturnSuccess; + + setProperty(kRootDomainSupportedFeatures, features); + + // Notify EnergySaver and all those in user space so they might + // re-populate their feature specific UI + if(pmPowerStateQueue) { + pmPowerStateQueue->submitPowerEvent( kPowerEventFeatureChanged ); + } + } else { + ret = kIOReturnNotFound; + } + +exit: + if(features) features->release(); + if(featuresDictLock) IOLockUnlock(featuresDictLock); + return ret; +} + +//****************************************************************************** +// setPMSetting (private) +// +// Internal helper to relay PM settings changes from user space to individual +// drivers. Should be called only by IOPMrootDomain::setProperties. +//****************************************************************************** + +IOReturn IOPMrootDomain::setPMSetting( + const OSSymbol *type, + OSObject *object ) +{ + PMSettingCallEntry *entries = 0; + OSArray *chosen = 0; + const OSArray *array; + PMSettingObject *pmso; + thread_t thisThread; + int i, j, count, capacity; + + if (NULL == type) + return kIOReturnBadArgument; + + PMSETTING_LOCK(); + + // Update settings dict so changes are visible from copyPMSetting(). + fPMSettingsDict->setObject(type, object); + + // Prep all PMSetting objects with the given 'type' for callout. + array = (const OSArray *) settingsCallbacks->getObject(type); + if (!array || ((capacity = array->getCount()) == 0)) + goto unlock_exit; + + // Array to retain PMSetting objects targeted for callout. 
+    chosen = OSArray::withCapacity(capacity);
+    if (!chosen)
+        goto unlock_exit;   // error
+
+    entries = IONew(PMSettingCallEntry, capacity);
+    if (!entries)
+        goto unlock_exit;   // error
+    memset(entries, 0, sizeof(PMSettingCallEntry) * capacity);
+
+    thisThread = current_thread();
+
+    for (i = 0, j = 0; i<capacity; i++)
+    {
+        pmso = (PMSettingObject *) array->getObject(i);
+        if (pmso->disabled)
+            continue;
+        entries[j].thread = thisThread;
+        queue_enter(&pmso->calloutQueue, &entries[j], PMSettingCallEntry *, link);
+        chosen->setObject(pmso);
+        j++;
+    }
+    count = j;
+    if (!count)
+        goto unlock_exit;
+
+    PMSETTING_UNLOCK();
+
+    // Call each pmso in the chosen array.
+    for (i=0; i<count; i++)
+    {
+        pmso = (PMSettingObject *) chosen->getObject(i);
+        pmso->dispatchPMSetting(type, object);
+    }
+
+    PMSETTING_LOCK();
+    for (i=0; i<count; i++)
+    {
+        pmso = (PMSettingObject *) chosen->getObject(i);
+        queue_remove(&pmso->calloutQueue, &entries[i], PMSettingCallEntry *, link);
+        if (pmso->waitThread)
+        {
+            PMSETTING_WAKEUP(pmso);
+        }
+    }
+unlock_exit:
+    PMSETTING_UNLOCK();
+
+    if (chosen)  chosen->release();
+    if (entries) IODelete(entries, PMSettingCallEntry, capacity);
+
+    return kIOReturnSuccess;
+}
+
+//******************************************************************************
+// copyPMSetting (public)
+//
+// Allows kexts to safely read setting values, without being subscribed to
+// notifications.
+//******************************************************************************
+
+OSObject * IOPMrootDomain::copyPMSetting(
+    OSSymbol *whichSetting)
+{
+    OSObject *obj = NULL;
+
+    if(!whichSetting) return NULL;
+
+    PMSETTING_LOCK();
+    obj = fPMSettingsDict->getObject(whichSetting);
+    if(obj) {
+        obj->retain();
+    }
+    PMSETTING_UNLOCK();
+
+    return obj;
+}
+
+//******************************************************************************
+// registerPMSettingController (public)
+//
+// Direct wrapper to the registerPMSettingController variant that takes a
+// uint32_t power source argument; registers across all power sources.
+//******************************************************************************
+
+IOReturn IOPMrootDomain::registerPMSettingController(
+    const OSSymbol * settings[],
+    IOPMSettingControllerCallback func,
+    OSObject *target,
+    uintptr_t refcon,
+    OSObject **handle)
+{
+    return registerPMSettingController(
+            settings,
+            (kIOPMSupportedOnAC | kIOPMSupportedOnBatt | kIOPMSupportedOnUPS),
+            func, target, refcon, handle);
+}
+
+//******************************************************************************
+// registerPMSettingController (public)
+//
+// Kexts may register for notifications when a particular setting is changed.
+// A list of settings is available in IOPM.h.
+// Arguments:
+//  * settings - A NULL-terminated array of OSSymbol pointers. Caller should
+//          populate it with the settings for which notifications are wanted.
+//  * func - A C function callback of the type IOPMSettingControllerCallback
+//  * target - caller may provide an OSObject *, which PM will pass as the
+//          target to calls to "func"
+//  * refcon - caller may provide a uintptr_t, which PM will pass as an
+//          argument to calls to "func"
+//  * handle - This is a return argument. We will populate this pointer upon
+//          call success.
Hold onto this and pass this argument to +// IOPMrootDomain::deRegisterPMSettingCallback when unloading your kext +// Returns: +// kIOReturnSuccess on success +//****************************************************************************** + +IOReturn IOPMrootDomain::registerPMSettingController( + const OSSymbol * settings[], + uint32_t supportedPowerSources, + IOPMSettingControllerCallback func, + OSObject *target, + uintptr_t refcon, + OSObject **handle) +{ + PMSettingObject *pmso = NULL; + OSObject *pmsh = NULL; + OSArray *list = NULL; + int i; + + if (NULL == settings || + NULL == func || + NULL == handle) + { + return kIOReturnBadArgument; + } + + pmso = PMSettingObject::pmSettingObject( + (IOPMrootDomain *) this, func, target, + refcon, supportedPowerSources, settings, &pmsh); + + if (!pmso) { + *handle = NULL; + return kIOReturnInternalError; + } + + PMSETTING_LOCK(); + for (i=0; settings[i]; i++) + { + list = (OSArray *) settingsCallbacks->getObject(settings[i]); + if (!list) { + // New array of callbacks for this setting + list = OSArray::withCapacity(1); + settingsCallbacks->setObject(settings[i], list); + list->release(); + } + + // Add caller to the callback list + list->setObject(pmso); + } + PMSETTING_UNLOCK(); + + // Return handle to the caller, the setting object is private. + *handle = pmsh; + + return kIOReturnSuccess; +} + +//****************************************************************************** +// deregisterPMSettingObject (private) +// +// Only called from PMSettingObject. +//****************************************************************************** + +void IOPMrootDomain::deregisterPMSettingObject( PMSettingObject * pmso ) +{ + thread_t thisThread = current_thread(); + PMSettingCallEntry *callEntry; + OSCollectionIterator *iter; + OSSymbol *sym; + OSArray *array; + int index; + bool wait; + + PMSETTING_LOCK(); + + pmso->disabled = true; + + // Wait for all callout threads to finish. + do { + wait = false; + queue_iterate(&pmso->calloutQueue, callEntry, PMSettingCallEntry *, link) + { + if (callEntry->thread != thisThread) + { + wait = true; + break; + } + } + if (wait) + { + assert(0 == pmso->waitThread); + pmso->waitThread = thisThread; + PMSETTING_WAIT(pmso); + pmso->waitThread = 0; + } + } while (wait); + + // Search each PM settings array in the kernel. + iter = OSCollectionIterator::withCollection(settingsCallbacks); + if (iter) + { + while ((sym = OSDynamicCast(OSSymbol, iter->getNextObject()))) + { + array = (OSArray *) settingsCallbacks->getObject(sym); + index = array->getNextIndexOfObject(pmso, 0); + if (-1 != index) { + array->removeObject(index); + } + } + iter->release(); + } + + PMSETTING_UNLOCK(); + + pmso->release(); +} + +//****************************************************************************** +// informCPUStateChange +// +// Call into PM CPU code so that CPU power savings may dynamically adjust for +// running on battery, with the lid closed, etc. 
+// +// informCPUStateChange is a no-op on non x86 systems +// only x86 has explicit support in the IntelCPUPowerManagement kext +//****************************************************************************** + +void IOPMrootDomain::informCPUStateChange( + uint32_t type, + uint32_t value ) +{ +#if defined(__i386__) || defined(__x86_64__) + + pmioctlVariableInfo_t varInfoStruct; int pmCPUret = 0; const char *varNameStr = NULL; int32_t *varIndex = NULL; @@ -2639,6 +3356,8 @@ void IOPMrootDomain::informCPUStateChange( #endif /* __i386__ || __x86_64__ */ } +// MARK: - +// MARK: Deep Sleep Policy #if HIBERNATION @@ -2673,7 +3392,7 @@ enum { kIOPMSleepFactorUSBExternalDevice = 0x00000080, kIOPMSleepFactorBluetoothHIDDevice = 0x00000100, kIOPMSleepFactorExternalMediaMounted = 0x00000200, - kIOPMSleepFactorDriverAssertBit5 = 0x00000400, + kIOPMSleepFactorDriverAssertBit5 = 0x00000400, /* Reserved for ThunderBolt */ kIOPMSleepFactorDriverAssertBit6 = 0x00000800, kIOPMSleepFactorDriverAssertBit7 = 0x00001000 }; @@ -2730,18 +3449,15 @@ bool IOPMrootDomain::evaluateSystemSleepPolicy( IOPMSystemSleepParameters * p ) if (getPMAssertionLevel(kIOPMDriverAssertionExternalMediaMountedBit) != kIOPMDriverAssertionLevelOff) currentFactors |= kIOPMSleepFactorExternalMediaMounted; - if (getPMAssertionLevel(kIOPMDriverAssertionReservedBit5) != + if (getPMAssertionLevel(kIOPMDriverAssertionReservedBit5) != /* AssertionBit5 = Thunderbolt */ kIOPMDriverAssertionLevelOff) currentFactors |= kIOPMSleepFactorDriverAssertBit5; - if (getPMAssertionLevel(kIOPMDriverAssertionReservedBit6) != - kIOPMDriverAssertionLevelOff) - currentFactors |= kIOPMSleepFactorDriverAssertBit6; if (getPMAssertionLevel(kIOPMDriverAssertionReservedBit7) != kIOPMDriverAssertionLevelOff) currentFactors |= kIOPMSleepFactorDriverAssertBit7; if (0 == deepSleepDelay) currentFactors |= kIOPMSleepFactorDeepSleepNoDelay; - if (!clamshellIsClosed) + if (!clamshellClosed) currentFactors |= kIOPMSleepFactorLidOpen; if (acAdaptorConnected) currentFactors |= kIOPMSleepFactorACPower; @@ -2897,1768 +3613,2139 @@ bool IOPMrootDomain::getSleepOption( const char * key, uint32_t * option ) ok = true; } - if (obj) - obj->release(); - if (optionsProp) - optionsProp->release(); + if (obj) + obj->release(); + if (optionsProp) + optionsProp->release(); + + return true; +} +#endif /* HIBERNATION */ + +// MARK: - +// MARK: Shutdown and Restart + +//****************************************************************************** +// handlePlatformHaltRestart +// +//****************************************************************************** + +struct HaltRestartApplierContext { + IOPMrootDomain * RootDomain; + unsigned long PowerState; + IOPMPowerFlags PowerFlags; + UInt32 MessageType; + UInt32 Counter; +}; + +static void +platformHaltRestartApplier( OSObject * object, void * context ) +{ + IOPowerStateChangeNotification notify; + HaltRestartApplierContext * ctx; + AbsoluteTime startTime; + UInt32 deltaTime; + + ctx = (HaltRestartApplierContext *) context; + + memset(¬ify, 0, sizeof(notify)); + notify.powerRef = (void *)ctx->Counter; + notify.returnValue = 0; + notify.stateNumber = ctx->PowerState; + notify.stateFlags = ctx->PowerFlags; + + clock_get_uptime(&startTime); + ctx->RootDomain->messageClient( ctx->MessageType, object, (void *)¬ify ); + deltaTime = computeDeltaTimeMS(&startTime); + + if ((deltaTime > kPMHaltTimeoutMS) || + (gIOKitDebug & kIOLogPMRootDomain)) + { + _IOServiceInterestNotifier * notifier; + notifier = OSDynamicCast(_IOServiceInterestNotifier, 
object); + + // IOService children of IOPMrootDomain are not instrumented. + // Only IORootParent currently falls under that group. + + if (notifier) + { + LOG("%s handler %p took %u ms\n", + (ctx->MessageType == kIOMessageSystemWillPowerOff) ? "PowerOff" : + (ctx->MessageType == kIOMessageSystemPagingOff) ? "PagingOff" : "Restart", + notifier->handler, (uint32_t) deltaTime ); + } + } + + ctx->Counter++; +} + +void IOPMrootDomain::handlePlatformHaltRestart( UInt32 pe_type ) +{ + HaltRestartApplierContext ctx; + AbsoluteTime startTime; + UInt32 deltaTime; + + memset(&ctx, 0, sizeof(ctx)); + ctx.RootDomain = this; + + clock_get_uptime(&startTime); + switch (pe_type) + { + case kPEHaltCPU: + case kPEUPSDelayHaltCPU: + ctx.PowerState = OFF_STATE; + ctx.MessageType = kIOMessageSystemWillPowerOff; + break; + + case kPERestartCPU: + ctx.PowerState = RESTART_STATE; + ctx.MessageType = kIOMessageSystemWillRestart; + break; + + case kPEPagingOff: + ctx.PowerState = ON_STATE; + ctx.MessageType = kIOMessageSystemPagingOff; + break; + + default: + return; + } + + // Notify legacy clients + applyToInterested(gIOPriorityPowerStateInterest, platformHaltRestartApplier, &ctx); + + // For normal shutdown, turn off File Server Mode. + if (kPEHaltCPU == pe_type) + { + const OSSymbol * setting = OSSymbol::withCString(kIOPMSettingRestartOnPowerLossKey); + OSNumber * num = OSNumber::withNumber((unsigned long long) 0, 32); + if (setting && num) + { + setPMSetting(setting, num); + setting->release(); + num->release(); + } + } + + if (kPEPagingOff != pe_type) + { + // Notify in power tree order + notifySystemShutdown(this, ctx.MessageType); + } - return true; + deltaTime = computeDeltaTimeMS(&startTime); + LOG("%s all drivers took %u ms\n", + (ctx.MessageType == kIOMessageSystemWillPowerOff) ? "PowerOff" : + (ctx.MessageType == kIOMessageSystemPagingOff) ? "PagingOff" : "Restart", + (uint32_t) deltaTime ); } -#endif /* HIBERNATION */ - //****************************************************************************** -// dispatchPowerEvent +// shutdownSystem // -// IOPMPowerStateQueue callback function. Running on PM work loop thread. //****************************************************************************** -void IOPMrootDomain::dispatchPowerEvent( - uint32_t event, void * arg0, uint64_t arg1 ) +IOReturn IOPMrootDomain::shutdownSystem( void ) { - DLOG("power event %u args %p 0x%llx\n", event, arg0, arg1); - ASSERT_GATED(); - - switch (event) - { - case kPowerEventFeatureChanged: - messageClients(kIOPMMessageFeatureChange, this); - break; - - case kPowerEventReceivedPowerNotification: - handlePowerNotification( (UInt32)(uintptr_t) arg0 ); - break; - - case kPowerEventSystemBootCompleted: - if (systemBooting) - { - systemBooting = false; - adjustPowerState(); - - // If lid is closed, re-send lid closed notification - // now that booting is complete. - if( clamshellIsClosed ) - { - handlePowerNotification(kLocalEvalClamshellCommand); - } - } - break; - - case kPowerEventSystemShutdown: - if (kOSBooleanTrue == (OSBoolean *) arg0) - { - /* We set systemShutdown = true during shutdown - to prevent sleep at unexpected times while loginwindow is trying - to shutdown apps and while the OS is trying to transition to - complete power of. - - Set to true during shutdown, as soon as loginwindow shows - the "shutdown countdown dialog", through individual app - termination, and through black screen kernel shutdown. 
- */ - LOG("systemShutdown true\n"); - systemShutdown = true; - } else { - /* - A shutdown was initiated, but then the shutdown - was cancelled, clearing systemShutdown to false here. - */ - LOG("systemShutdown false\n"); - systemShutdown = false; - } - break; - - case kPowerEventUserDisabledSleep: - userDisabledAllSleep = (kOSBooleanTrue == (OSBoolean *) arg0); - break; - -#if ROOT_DOMAIN_RUN_STATES - case kPowerEventConfigdRegisteredInterest: - if (gConfigdNotifier) - { - gConfigdNotifier->release(); - gConfigdNotifier = 0; - } - if (arg0) - { - gConfigdNotifier = (IONotifier *) arg0; - } - break; -#endif - - case kPowerEventAggressivenessChanged: - aggressivenessChanged(); - break; - - case kPowerEventAssertionCreate: - if (pmAssertions) { - pmAssertions->handleCreateAssertion((OSData *)arg0); - } - break; + return kIOReturnUnsupported; +} - case kPowerEventAssertionRelease: - if (pmAssertions) { - pmAssertions->handleReleaseAssertion(arg1); - } - break; +//****************************************************************************** +// restartSystem +// +//****************************************************************************** - case kPowerEventAssertionSetLevel: - if (pmAssertions) { - pmAssertions->handleSetAssertionLevel(arg1, (IOPMDriverAssertionLevel)(uintptr_t)arg0); - } - break; - } +IOReturn IOPMrootDomain::restartSystem( void ) +{ + return kIOReturnUnsupported; } +// MARK: - +// MARK: System Capability //****************************************************************************** -// systemPowerEventOccurred -// -// The power controller is notifying us of a hardware-related power management -// event that we must handle. +// tagPowerPlaneService // -// systemPowerEventOccurred covers the same functionality that -// receivePowerNotification does; it simply provides a richer API for conveying -// more information. +// Running on PM work loop thread. 
//****************************************************************************** -IOReturn IOPMrootDomain::systemPowerEventOccurred( - const OSSymbol *event, - uint32_t intValue) +void IOPMrootDomain::tagPowerPlaneService( + IOService * service, + IOPMActions * actions ) { - IOReturn attempt = kIOReturnSuccess; - OSNumber *newNumber = NULL; + uint32_t flags = 0; + bool isDisplayWrangler; - if (!event) - return kIOReturnBadArgument; - - newNumber = OSNumber::withNumber(intValue, 8*sizeof(intValue)); - if (!newNumber) - return kIOReturnInternalError; + memset(actions, 0, sizeof(*actions)); + actions->target = this; - attempt = systemPowerEventOccurred(event, (OSObject *)newNumber); + if (service == this) + { + actions->actionPowerChangeStart = + OSMemberFunctionCast( + IOPMActionPowerChangeStart, this, + &IOPMrootDomain::handleOurPowerChangeStart); - newNumber->release(); + actions->actionPowerChangeDone = + OSMemberFunctionCast( + IOPMActionPowerChangeDone, this, + &IOPMrootDomain::handleOurPowerChangeDone); - return attempt; -} + actions->actionPowerChangeOverride = + OSMemberFunctionCast( + IOPMActionPowerChangeOverride, this, + &IOPMrootDomain::overrideOurPowerChange); + return; + } -IOReturn IOPMrootDomain::systemPowerEventOccurred( - const OSSymbol *event, - OSObject *value) -{ - OSDictionary *thermalsDict = NULL; - bool shouldUpdate = true; - - if (!event || !value) - return kIOReturnBadArgument; +#if !NO_KERNEL_HID + isDisplayWrangler = (0 != service->metaCast("IODisplayWrangler")); + if (isDisplayWrangler) + { + wrangler = service; + wranglerConnection = (IOService *) service->getParentEntry(gIOPowerPlane); + } +#else + isDisplayWrangler = false; +#endif - // LOCK - // We reuse featuresDict Lock because it already exists and guards - // the very infrequently used publish/remove feature mechanism; so there's zero rsk - // of stepping on that lock. - if (featuresDictLock) IOLockLock(featuresDictLock); +#if defined(__i386__) || defined(__x86_64__) + if (isDisplayWrangler) + flags |= kPMActionsFlagIsDisplayWrangler; + if (service->getProperty("IOPMStrictTreeOrder")) + flags |= kPMActionsFlagIsGraphicsDevice; + if (service->getProperty("IOPMUnattendedWakePowerState")) + flags |= kPMActionsFlagIsAudioDevice; +#endif - thermalsDict = (OSDictionary *)getProperty(kIOPMRootDomainPowerStatusKey); - - if (thermalsDict && OSDynamicCast(OSDictionary, thermalsDict)) { - thermalsDict = OSDictionary::withDictionary(thermalsDict); - } else { - thermalsDict = OSDictionary::withCapacity(1); - } + // Find the power connection object that is a child of the PCI host + // bridge, and has a graphics/audio device attached below. Mark the + // power branch for delayed child notifications. 
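+    //
+    // Sketch of the walk performed below (illustration only, using names
+    // from this file): climb the power plane from 'service' toward the
+    // root; the node whose parent is the PCI host bridge driver, or the
+    // root domain itself, is the branch point to mark:
+    //
+    //     IORegistryEntry * node = service;
+    //     while (node != this) {
+    //         IORegistryEntry * up = node->getParentEntry(gIOPowerPlane);
+    //         if ((up == pciHostBridgeDriver) || (up == this))
+    //             break;    // 'node' is the candidate IOPowerConnection
+    //         node = up;
+    //     }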
- if (!thermalsDict) { - shouldUpdate = false; - goto exit; + if (flags) + { + IORegistryEntry * child = service; + IORegistryEntry * parent = child->getParentEntry(gIOPowerPlane); + + while (child != this) + { + if ((parent == pciHostBridgeDriver) || + (parent == this)) + { + if (OSDynamicCast(IOPowerConnection, child)) + { + IOPowerConnection * conn = (IOPowerConnection *) child; + conn->delayChildNotification = true; + } + break; + } + child = parent; + parent = child->getParentEntry(gIOPowerPlane); + } } - thermalsDict->setObject (event, value); + if (flags) + { + DLOG("%s tag flags %x\n", service->getName(), flags); + actions->parameter |= flags; + actions->actionPowerChangeOverride = + OSMemberFunctionCast( + IOPMActionPowerChangeOverride, this, + &IOPMrootDomain::overridePowerChangeForUIService); - setProperty (kIOPMRootDomainPowerStatusKey, thermalsDict); + if (flags & kPMActionsFlagIsDisplayWrangler) + { + actions->actionActivityTickle = + OSMemberFunctionCast( + IOPMActionActivityTickle, this, + &IOPMrootDomain::handleActivityTickleForDisplayWrangler); + } + return; + } - thermalsDict->release(); + // Locate the first PCI host bridge for PMTrace. + if (!pciHostBridgeDevice && service->metaCast("IOPCIBridge")) + { + IOService * provider = service->getProvider(); + if (OSDynamicCast(IOPlatformDevice, provider) && + provider->inPlane(gIODTPlane)) + { + pciHostBridgeDevice = provider; + pciHostBridgeDriver = service; + DLOG("PMTrace found PCI host bridge %s->%s\n", + provider->getName(), service->getName()); + } + } -exit: - // UNLOCK - if (featuresDictLock) IOLockUnlock(featuresDictLock); + // Tag top-level PCI devices. The order of PMinit() call does not + // change across boots and is used as the PCI bit number. + if (pciHostBridgeDevice && service->metaCast("IOPCIDevice")) + { + // Would prefer to check built-in property, but tagPowerPlaneService() + // is called before pciDevice->registerService(). + IORegistryEntry * parent = service->getParentEntry(gIODTPlane); + if ((parent == pciHostBridgeDevice) && service->getProperty("acpi-device")) + { + int bit = pmTracer->recordTopLevelPCIDevice( service ); + if (bit >= 0) + { + // Save the assigned bit for fast lookup. + actions->parameter |= (bit & kPMActionsPCIBitNumberMask); - if (shouldUpdate) - messageClients (kIOPMMessageSystemPowerEventOccurred, (void *)NULL); + actions->actionPowerChangeStart = + OSMemberFunctionCast( + IOPMActionPowerChangeStart, this, + &IOPMrootDomain::handlePowerChangeStartForPCIDevice); - return kIOReturnSuccess; + actions->actionPowerChangeDone = + OSMemberFunctionCast( + IOPMActionPowerChangeDone, this, + &IOPMrootDomain::handlePowerChangeDoneForPCIDevice); + } + } + } } - //****************************************************************************** -// receivePowerNotification -// -// The power controller is notifying us of a hardware-related power management -// event that we must handle. This may be a result of an 'environment' interrupt -// from the power mgt micro. 
+// PM actions for root domain //****************************************************************************** -IOReturn IOPMrootDomain::receivePowerNotification( UInt32 msg ) +void IOPMrootDomain::overrideOurPowerChange( + IOService * service, + IOPMActions * actions, + unsigned long * inOutPowerState, + uint32_t * inOutChangeFlags ) { - pmPowerStateQueue->submitPowerEvent( - kPowerEventReceivedPowerNotification, (void *) msg ); - return kIOReturnSuccess; + uint32_t powerState = (uint32_t) *inOutPowerState; + uint32_t changeFlags = *inOutChangeFlags; + uint32_t currentPowerState = (uint32_t) getPowerState(); + + if ((currentPowerState == powerState) || + (changeFlags & kIOPMParentInitiated)) + { + // FIXME: cancel any parent change (unexpected) + // Root parent is permanently pegged at max power, + // kIOPMParentInitiated is unexpected. + return; + } + + if (powerState < currentPowerState) + { + if ((changeFlags & kIOPMSkipAskPowerDown) == 0) + { + /* Convenient place to run any code at idle sleep time + * IOPMrootDomain initiates an idle sleep here + * + * Set last sleep cause accordingly. + */ + pmPowerStateQueue->submitPowerEvent(kPowerEventPublishSleepWakeUUID, (void *)true); + + lastSleepReason = kIOPMSleepReasonIdle; + setProperty(kRootDomainSleepReasonKey, kIOPMIdleSleepKey); + } + if (CAP_CURRENT(kIOPMSystemCapabilityGraphics)) + { + // Root domain is dropping power state ON->SLEEP. + // If system is in full wake, first drop to dark wake. + + darkWakeToSleepASAP = true; + + // Drop graphics capability. + // No transition if system is already in dark wake. + + _desiredCapability &= ~( + kIOPMSystemCapabilityGraphics | + kIOPMSystemCapabilityAudio ); + + *inOutPowerState = ON_STATE; + *inOutChangeFlags |= kIOPMSynchronize; + + // Revert device desire from SLEEP->ON. + changePowerStateToPriv(ON_STATE); + } + } } -void IOPMrootDomain::handlePowerNotification( UInt32 msg ) +void IOPMrootDomain::handleOurPowerChangeStart( + IOService * service, + IOPMActions * actions, + uint32_t powerState, + uint32_t * inOutChangeFlags ) { - bool eval_clamshell = false; + uint32_t changeFlags = *inOutChangeFlags; + uint32_t currentPowerState = (uint32_t) getPowerState(); - ASSERT_GATED(); + _systemTransitionType = kSystemTransitionNone; + _systemMessageClientMask = 0; + capabilityLoss = false; - /* - * Local (IOPMrootDomain only) eval clamshell command - */ - if (msg & kLocalEvalClamshellCommand) + // 1. Explicit capability change. + + if (changeFlags & kIOPMSynchronize) { - eval_clamshell = true; + if (powerState == ON_STATE) + { + if (changeFlags & kIOPMSyncNoChildNotify) + _systemTransitionType = kSystemTransitionNewCapClient; + else + _systemTransitionType = kSystemTransitionCapability; + } } - /* - * Overtemp - */ - if (msg & kIOPMOverTemp) + // 2. Going to sleep (cancellation still possible). + + else if (powerState < currentPowerState) + _systemTransitionType = kSystemTransitionSleep; + + // 3. Woke from (idle or demand) sleep. + + else if (!systemBooting && + (changeFlags & kIOPMSelfInitiated) && + (powerState > currentPowerState)) { - LOG("PowerManagement emergency overtemp signal. Going to sleep!"); - privateSleepSystem (kIOPMSleepReasonThermalEmergency); + _systemTransitionType = kSystemTransitionWake; + _desiredCapability = kIOPMSystemCapabilityCPU | + kIOPMSystemCapabilityNetwork; + + // Check for early HID events (e.g. 
LID open) + if (wranglerTickled) + { + _desiredCapability |= ( + kIOPMSystemCapabilityGraphics | + kIOPMSystemCapabilityAudio ); + } } -#ifdef __ppc__ - /* - * PMU Processor Speed Change - */ - if (msg & kIOPMProcessorSpeedChange) + // Update pending wake capability at the beginning of every + // state transition (including synchronize). This will become + // the current capability at the end of the transition. + + if (kSystemTransitionSleep == _systemTransitionType) { - IOService *pmu = waitForService(serviceMatching("ApplePMU")); - pmu->callPlatformFunction("prepareForSleep", false, 0, 0, 0, 0); - getPlatform()->sleepKernel(); - pmu->callPlatformFunction("recoverFromSleep", false, 0, 0, 0, 0); + _pendingCapability = 0; + capabilityLoss = true; } -#endif - - /* - * Sleep Now! - */ - if (msg & kIOPMSleepNow) + else if (kSystemTransitionNewCapClient != _systemTransitionType) { - privateSleepSystem (kIOPMSleepReasonSoftware); + _pendingCapability = _desiredCapability | + kIOPMSystemCapabilityCPU | + kIOPMSystemCapabilityNetwork; + + if (_pendingCapability & kIOPMSystemCapabilityGraphics) + _pendingCapability |= kIOPMSystemCapabilityAudio; + + if ((kSystemTransitionCapability == _systemTransitionType) && + (_pendingCapability == _currentCapability)) + { + // Cancel the PM state change. + _systemTransitionType = kSystemTransitionNone; + *inOutChangeFlags |= kIOPMNotDone; + } + if (__builtin_popcount(_pendingCapability) < + __builtin_popcount(_currentCapability)) + capabilityLoss = true; + if (CAP_LOSS(kIOPMSystemCapabilityGraphics)) + rejectWranglerTickle = true; } - - /* - * Power Emergency - */ - if (msg & kIOPMPowerEmergency) + + // 1. Capability change. + + if (kSystemTransitionCapability == _systemTransitionType) { - lowBatteryCondition = true; - privateSleepSystem (kIOPMSleepReasonLowPower); + // Dark to Full transition. + if (CAP_GAIN(kIOPMSystemCapabilityGraphics)) + { + tracePoint( kIOPMTracePointDarkWakeExit ); + wranglerSleepIgnored = false; + sleepTimerMaintenance = false; + hibernateNoDefeat = false; + _systemMessageClientMask = kSystemMessageClientUser; + if ((_highestCapability & kIOPMSystemCapabilityGraphics) == 0) + _systemMessageClientMask |= kSystemMessageClientKernel; + + tellClients(kIOMessageSystemWillPowerOn); + } + + // Full to Dark transition. + if (CAP_LOSS(kIOPMSystemCapabilityGraphics)) + { + tracePoint( kIOPMTracePointDarkWakeEntry ); + *inOutChangeFlags |= kIOPMSyncTellPowerDown; + _systemMessageClientMask = kSystemMessageClientUser; + } } - /* - * Clamshell OPEN - */ - if (msg & kIOPMClamshellOpened) + // 2. System sleep. + + else if (kSystemTransitionSleep == _systemTransitionType) { - // Received clamshel open message from clamshell controlling driver - // Update our internal state and tell general interest clients - clamshellIsClosed = false; - clamshellExists = true; + // Beginning of a system sleep transition. + // Cancellation is still possible. + tracePoint( kIOPMTracePointSleepStarted, lastSleepReason ); - if (msg & kIOPMSetValue) + _systemMessageClientMask = kSystemMessageClientAll; + if ((_currentCapability & kIOPMSystemCapabilityGraphics) == 0) + _systemMessageClientMask &= ~kSystemMessageClientApp; + if ((_highestCapability & kIOPMSystemCapabilityGraphics) == 0) + _systemMessageClientMask &= ~kSystemMessageClientKernel; + + // Optimization to ignore wrangler power down thus skipping + // the disk spindown and arming the idle timer for demand sleep. 
+ + if (changeFlags & kIOPMIgnoreChildren) { - reportUserInput(); + wranglerSleepIgnored = true; } - // Tell PMCPU - informCPUStateChange(kInformLid, 0); + logWranglerTickle = false; + } - // Tell general interest clients - sendClientClamshellNotification(); + // 3. System wake. - bool aborting = ((lastSleepReason == kIOPMSleepReasonClamshell) - || (lastSleepReason == kIOPMSleepReasonIdle) - || (lastSleepReason == kIOPMSleepReasonMaintenance)); - if (aborting) userActivityCount++; - DLOG("clamshell tickled %d lastSleepReason %d\n", userActivityCount, lastSleepReason); + else if (kSystemTransitionWake == _systemTransitionType) + { + wranglerSleepIgnored = false; + + if (_pendingCapability & kIOPMSystemCapabilityGraphics) + { + _systemMessageClientMask = kSystemMessageClientAll; + } + else + { + _systemMessageClientMask = kSystemMessageClientConfigd; + } + + tracePoint( kIOPMTracePointWakeWillPowerOnClients ); + tellClients(kIOMessageSystemWillPowerOn); } - /* - * Clamshell CLOSED - * Send the clamshell interest notification since the lid is closing. - */ - if (msg & kIOPMClamshellClosed) + if ((kSystemTransitionNone != _systemTransitionType) && + (kSystemTransitionNewCapClient != _systemTransitionType)) { - // Received clamshel open message from clamshell controlling driver - // Update our internal state and tell general interest clients - clamshellIsClosed = true; - clamshellExists = true; + _systemStateGeneration++; + systemDarkWake = false; - // Tell PMCPU - informCPUStateChange(kInformLid, 1); + DLOG("=== START (%u->%u, 0x%x) type %u, gen %u, msg %x, " + "dcp %x:%x:%x\n", + currentPowerState, powerState, *inOutChangeFlags, + _systemTransitionType, _systemStateGeneration, + _systemMessageClientMask, + _desiredCapability, _currentCapability, _pendingCapability); + } +} - // Tell general interest clients - sendClientClamshellNotification(); - - // And set eval_clamshell = so we can attempt - eval_clamshell = true; +void IOPMrootDomain::handleOurPowerChangeDone( + IOService * service, + IOPMActions * actions, + uint32_t powerState, + uint32_t changeFlags ) +{ + if (kSystemTransitionNewCapClient == _systemTransitionType) + { + _systemTransitionType = kSystemTransitionNone; + return; } - /* - * Set Desktop mode (sent from graphics) - * - * -> reevaluate lid state - */ - if (msg & kIOPMSetDesktopMode) + if (_systemTransitionType != kSystemTransitionNone) { - desktopMode = (0 != (msg & kIOPMSetValue)); - msg &= ~(kIOPMSetDesktopMode | kIOPMSetValue); + uint32_t currentPowerState = (uint32_t) getPowerState(); - sendClientClamshellNotification(); + if (changeFlags & kIOPMNotDone) + { + // Power down was cancelled or vetoed. + _pendingCapability = _currentCapability; + lastSleepReason = 0; - // Re-evaluate the lid state - if( clamshellIsClosed ) + if (((_currentCapability & kIOPMSystemCapabilityGraphics) == 0) && + (_currentCapability & kIOPMSystemCapabilityCPU)) + { + pmPowerStateQueue->submitPowerEvent( + kPowerEventPolicyStimulus, + (void *) kStimulusDarkWakeReentry, + _systemStateGeneration ); + } + + // Revert device desire to max. + changePowerStateToPriv(ON_STATE); + } + else { - eval_clamshell = true; + // Send message on dark wake to full wake promotion. + // tellChangeUp() handles the normal SLEEP->ON case. 
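+            //
+            // CAP_GAIN()/CAP_LOSS() are defined outside this hunk; a sketch
+            // of their presumed semantics, for reading the branches below
+            // (illustration only):
+            //
+            //     CAP_GAIN(c): ((_currentCapability & (c)) == 0) &&
+            //                  ((_pendingCapability & (c)) != 0)
+            //     CAP_LOSS(c): ((_currentCapability & (c)) != 0) &&
+            //                  ((_pendingCapability & (c)) == 0)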
+ + if (kSystemTransitionCapability == _systemTransitionType) + { + if (CAP_GAIN(kIOPMSystemCapabilityGraphics)) + { + tellClients(kIOMessageSystemHasPoweredOn); +#if DARK_TO_FULL_EVALUATE_CLAMSHELL + // Re-evaluate clamshell state ourselves when graphics + // will not get kIOMessageSystemHasPoweredOn. + + if (clamshellClosed && + ((_systemMessageClientMask & kSystemMessageClientKernel) == 0)) + { + receivePowerNotification( kLocalEvalClamshellCommand ); + } +#endif + } + if (CAP_LOSS(kIOPMSystemCapabilityGraphics)) + wranglerTickled = false; + } + + // Reset state after exiting from dark wake. + + if (CAP_GAIN(kIOPMSystemCapabilityGraphics) || + CAP_LOSS(kIOPMSystemCapabilityCPU)) + { + darkWakeMaintenance = false; + darkWakeToSleepASAP = false; + pciCantSleepValid = false; + rejectWranglerTickle = false; + } + + // Entered dark mode. + + if (((_pendingCapability & kIOPMSystemCapabilityGraphics) == 0) && + (_pendingCapability & kIOPMSystemCapabilityCPU)) + { + if (((gDarkWakeFlags & kDarkWakeFlagIgnoreDiskIOInDark) == 0) && + (kSystemTransitionWake == _systemTransitionType) && + (_debugWakeSeconds == 0)) + { + OSObject * prop = copyProperty(kIOPMRootDomainWakeTypeKey); + if (prop) + { + OSString * wakeType = OSDynamicCast(OSString, prop); + if (wakeType && + wakeType->isEqualTo(kIOPMRootDomainWakeTypeNetwork)) + { + // Woke from network and entered dark wake. + if (darkWakeToSleepASAP) + { + DLOG("cleared darkWakeToSleepASAP\n"); + darkWakeToSleepASAP = false; + } + } + prop->release(); + } + } + + // Queue an evaluation of whether to remain in dark wake, + // and for how long. This serves the purpose of draining + // any assertions from the queue. + + pmPowerStateQueue->submitPowerEvent( + kPowerEventPolicyStimulus, + (void *) kStimulusDarkWakeEntry, + _systemStateGeneration ); + } } - } - - /* - * AC Adaptor connected - * - * -> reevaluate lid state - */ - if (msg & kIOPMSetACAdaptorConnected) - { - acAdaptorConnected = (0 != (msg & kIOPMSetValue)); - msg &= ~(kIOPMSetACAdaptorConnected | kIOPMSetValue); - // Tell CPU PM - informCPUStateChange(kInformAC, !acAdaptorConnected); + DLOG("=== FINISH (%u->%u, 0x%x) type %u, gen %u, msg %x, " + "dcp %x:%x:%x, dbgtimer %u\n", + currentPowerState, powerState, changeFlags, + _systemTransitionType, _systemStateGeneration, + _systemMessageClientMask, + _desiredCapability, _currentCapability, _pendingCapability, + _debugWakeSeconds); - // Tell BSD if AC is connected - // 0 == external power source; 1 == on battery - post_sys_powersource(acAdaptorConnected ? 0:1); + // Update current system capability. - sendClientClamshellNotification(); + if (_currentCapability != _pendingCapability) + _currentCapability = _pendingCapability; - // Re-evaluate the lid state - if( clamshellIsClosed ) + // Update highest system capability. + + if (!CAP_CURRENT(kIOPMSystemCapabilityCPU)) + _highestCapability = 0; // reset at sleep state + else + _highestCapability |= _currentCapability; + + if (darkWakePostTickle && + (kSystemTransitionWake == _systemTransitionType) && + (gDarkWakeFlags & kDarkWakeFlagHIDTickleMask) == + kDarkWakeFlagHIDTickleLate) { - eval_clamshell = true; + darkWakePostTickle = false; + reportUserInput(); } - } - - /* - * Enable Clamshell (external display disappear) - * - * -> reevaluate lid state - */ - if (msg & kIOPMEnableClamshell) - { - // Re-evaluate the lid state - // System should sleep on external display disappearance - // in lid closed operation. 
- if( clamshellIsClosed && (true == ignoringClamshell) ) + + // Reset tracepoint at completion of capability change, + // completion of wake transition, and aborted sleep transition. + + if ((_systemTransitionType == kSystemTransitionCapability) || + (_systemTransitionType == kSystemTransitionWake) || + ((_systemTransitionType == kSystemTransitionSleep) && + (changeFlags & kIOPMNotDone))) { - eval_clamshell = true; + setProperty(kIOPMSystemCapabilitiesKey, _currentCapability, 64); + tracePoint( kIOPMTracePointSystemUp, 0 ); } - ignoringClamshell = false; + _systemTransitionType = kSystemTransitionNone; + _systemMessageClientMask = 0; - sendClientClamshellNotification(); + logGraphicsClamp = false; } - - /* - * Disable Clamshell (external display appeared) - * We don't bother re-evaluating clamshell state. If the system is awake, - * the lid is probably open. - */ - if (msg & kIOPMDisableClamshell) - { - ignoringClamshell = true; +} - sendClientClamshellNotification(); +//****************************************************************************** +// PM actions for graphics and audio. +//****************************************************************************** + +void IOPMrootDomain::overridePowerChangeForUIService( + IOService * service, + IOPMActions * actions, + unsigned long * inOutPowerState, + uint32_t * inOutChangeFlags ) +{ + uint32_t powerState = (uint32_t) *inOutPowerState; + uint32_t changeFlags = (uint32_t) *inOutChangeFlags; + + if (kSystemTransitionNone == _systemTransitionType) + { + // Not in midst of a system transition. + // Do not modify power limit enable state. } + else if ((actions->parameter & kPMActionsFlagLimitPower) == 0) + { + // Activate power limiter. - /* - * Evaluate clamshell and SLEEP if appropiate - */ - if ( eval_clamshell && shouldSleepOnClamshellClosed() ) + if ((actions->parameter & kPMActionsFlagIsDisplayWrangler) && + ((_pendingCapability & kIOPMSystemCapabilityGraphics) == 0)) + { + actions->parameter |= kPMActionsFlagLimitPower; + } + else if ((actions->parameter & kPMActionsFlagIsAudioDevice) && + ((_pendingCapability & kIOPMSystemCapabilityAudio) == 0)) + { + actions->parameter |= kPMActionsFlagLimitPower; + } + else if ((actions->parameter & kPMActionsFlagIsGraphicsDevice) && + (_systemTransitionType == kSystemTransitionSleep)) + { + // For graphics devices, arm the limiter when entering + // system sleep. Not when dropping to dark wake. + actions->parameter |= kPMActionsFlagLimitPower; + } + + if (actions->parameter & kPMActionsFlagLimitPower) + { + DLOG("+ plimit %s %p\n", + service->getName(), service); + } + } + else { + // Remove power limit. + if ((actions->parameter & ( + kPMActionsFlagIsDisplayWrangler | + kPMActionsFlagIsGraphicsDevice )) && + (_pendingCapability & kIOPMSystemCapabilityGraphics)) + { + actions->parameter &= ~kPMActionsFlagLimitPower; + } + else if ((actions->parameter & kPMActionsFlagIsAudioDevice) && + (_pendingCapability & kIOPMSystemCapabilityAudio)) + { + actions->parameter &= ~kPMActionsFlagLimitPower; + } - // SLEEP! - privateSleepSystem (kIOPMSleepReasonClamshell); + if ((actions->parameter & kPMActionsFlagLimitPower) == 0) + { + DLOG("- plimit %s %p\n", + service->getName(), service); + } } - /* - * Power Button - */ - if (msg & kIOPMPowerButton) + if (actions->parameter & kPMActionsFlagLimitPower) { - // toggle state of sleep/wake - // are we dozing? 
- if ( getPowerState() == DOZE_STATE ) + uint32_t maxPowerState = (uint32_t)(-1); + + if (changeFlags & (kIOPMDomainDidChange | kIOPMDomainWillChange)) { -#ifndef __LP64__ - // yes, tell the tree we're waking - systemWake(); -#endif - // wake the Display Wrangler - reportUserInput(); + // Enforce limit for system power/cap transitions. + + maxPowerState = 0; + if (actions->parameter & kPMActionsFlagIsDisplayWrangler) + { + // Forces a 3->1 transition sequence + if (changeFlags & kIOPMDomainWillChange) + maxPowerState = 3; + else + maxPowerState = 1; + } + } + else + { + // Deny all self-initiated changes when power is limited. + // Wrangler tickle should never defeat the limiter. + + maxPowerState = service->getPowerState(); + } + + if (powerState > maxPowerState) + { + DLOG("> plimit %s %p (%u->%u, 0x%x)\n", + service->getName(), service, powerState, maxPowerState, + changeFlags); + *inOutPowerState = maxPowerState; + + if (darkWakePostTickle && + (actions->parameter & kPMActionsFlagIsDisplayWrangler) && + (changeFlags & kIOPMDomainWillChange) && + ((gDarkWakeFlags & kDarkWakeFlagHIDTickleMask) == + kDarkWakeFlagHIDTickleEarly)) + { + darkWakePostTickle = false; + reportUserInput(); + } } - else { - OSString *pbs = OSString::withCString("DisablePowerButtonSleep"); - // Check that power button sleep is enabled - if( pbs ) { - if( kOSBooleanTrue != getProperty(pbs)) - privateSleepSystem (kIOPMSleepReasonPowerButton); + + if (!graphicsSuppressed && (changeFlags & kIOPMDomainDidChange)) + { + if (logGraphicsClamp) + { + AbsoluteTime now; + uint64_t nsec; + + clock_get_uptime(&now); + SUB_ABSOLUTETIME(&now, &systemWakeTime); + absolutetime_to_nanoseconds(now, &nsec); + MSG("Graphics suppressed %u ms\n", + ((int)((nsec) / 1000000ULL))); } + graphicsSuppressed = true; } } +} - /* - * Allow Sleep - * - */ - if ( (msg & kIOPMAllowSleep) && !allowSleep ) +void IOPMrootDomain::handleActivityTickleForDisplayWrangler( + IOService * service, + IOPMActions * actions ) +{ + // Warning: Not running in PM work loop context - don't modify state !!! + // Trap tickle directed to IODisplayWrangler while running with graphics + // capability suppressed. + + assert(service == wrangler); + + if (service == wrangler) { - allowSleep = true; - adjustPowerState(); + bool aborting = ((lastSleepReason == kIOPMSleepReasonIdle) + || (lastSleepReason == kIOPMSleepReasonMaintenance)); + if (aborting) { + userActivityCount++; + DLOG("display wrangler tickled1 %d lastSleepReason %d\n", userActivityCount, lastSleepReason); + } } - /* - * Prevent Sleep - * - */ - if (msg & kIOPMPreventSleep) { - allowSleep = false; - // are we dozing? - if ( getPowerState() == DOZE_STATE ) { -#ifndef __LP64__ - // yes, tell the tree we're waking - systemWake(); -#endif - adjustPowerState(); - // wake the Display Wrangler - reportUserInput(); - } else { - adjustPowerState(); - // make sure we have power to clamp - patriarch->wakeSystem(); + if (!wranglerTickled && !lowBatteryCondition && + ((_pendingCapability & kIOPMSystemCapabilityGraphics) == 0)) + { + DLOG("display wrangler tickled\n"); + if (kIOLogPMRootDomain & gIOKitDebug) + OSReportWithBacktrace("Dark wake display tickle"); + if (pmPowerStateQueue) + { + pmPowerStateQueue->submitPowerEvent( + kPowerEventPolicyStimulus, + (void *) kStimulusDarkWakeActivityTickle ); } } } - //****************************************************************************** -// getSleepSupported -// +// Approve usage of delayed child notification by PM. 
//****************************************************************************** -IOOptionBits IOPMrootDomain::getSleepSupported( void ) +bool IOPMrootDomain::shouldDelayChildNotification( + IOService * service ) { - return( platformSleepSupport ); + if (((gDarkWakeFlags & kDarkWakeFlagHIDTickleMask) != 0) && + !wranglerTickled && + (kSystemTransitionWake == _systemTransitionType)) + { + DLOG("%s: delay child notify\n", service->getName()); + return true; + } + return false; } - //****************************************************************************** -// setSleepSupported -// +// PM actions for PCI device. //****************************************************************************** -void IOPMrootDomain::setSleepSupported( IOOptionBits flags ) +void IOPMrootDomain::handlePowerChangeStartForPCIDevice( + IOService * service, + IOPMActions * actions, + uint32_t powerState, + uint32_t * inOutChangeFlags ) { - DLOG("setSleepSupported(%x)\n", (uint32_t) flags); - OSBitOrAtomic(flags, &platformSleepSupport); + pmTracer->tracePCIPowerChange( + PMTraceWorker::kPowerChangeStart, + service, *inOutChangeFlags, + (actions->parameter & kPMActionsPCIBitNumberMask)); } +void IOPMrootDomain::handlePowerChangeDoneForPCIDevice( + IOService * service, + IOPMActions * actions, + uint32_t powerState, + uint32_t changeFlags ) +{ + pmTracer->tracePCIPowerChange( + PMTraceWorker::kPowerChangeCompleted, + service, changeFlags, + (actions->parameter & kPMActionsPCIBitNumberMask)); +} //****************************************************************************** -// requestPowerDomainState -// -// The root domain intercepts this call to the superclass. -// Called on the PM work loop thread. +// registerInterest // -// If the clamp bit is not set in the desire, then the child doesn't need the power -// state it's requesting; it just wants it. The root ignores desires but not needs. -// If the clamp bit is not set, the root takes it that the child can tolerate no -// power and interprets the request accordingly. If all children can thus tolerate -// no power, we are on our way to idle sleep. +// Override IOService::registerInterest() to intercept special clients. //****************************************************************************** -IOReturn IOPMrootDomain::requestPowerDomainState ( - IOPMPowerFlags desiredFlags, - IOPowerConnection * whichChild, - unsigned long specification ) +IONotifier * IOPMrootDomain::registerInterest( + const OSSymbol * typeOfInterest, + IOServiceInterestHandler handler, + void * target, void * ref ) { - OSIterator *iter; - OSObject *next; - IOPowerConnection *connection; - IOPMPowerFlags powerRequestFlag = 0; - IOPMPowerFlags editedDesire; + IONotifier * notifier; + bool isSystemCapabilityClient; + bool isKernelCapabilityClient; - ASSERT_GATED(); + isSystemCapabilityClient = + typeOfInterest && + typeOfInterest->isEqualTo(kIOPMSystemCapabilityInterest); + + isKernelCapabilityClient = + typeOfInterest && + typeOfInterest->isEqualTo(gIOPriorityPowerStateInterest); - if (kIOLogPMRootDomain & gIOKitDebug) + if (isSystemCapabilityClient) + typeOfInterest = gIOAppPowerStateInterest; + + notifier = super::registerInterest(typeOfInterest, handler, target, ref); + if (notifier && pmPowerStateQueue) { - IOService * powerChild = - (IOService *) whichChild->getChildEntry(gIOPowerPlane); - DLOG("child %p, flags %lx, spec %lx - %s\n", - powerChild, desiredFlags, specification, - powerChild ? 
powerChild->getName() : "?"); + if (isSystemCapabilityClient) + { + notifier->retain(); + if (pmPowerStateQueue->submitPowerEvent( + kPowerEventRegisterSystemCapabilityClient, notifier) == false) + notifier->release(); + } + + if (isKernelCapabilityClient) + { + notifier->retain(); + if (pmPowerStateQueue->submitPowerEvent( + kPowerEventRegisterKernelCapabilityClient, notifier) == false) + notifier->release(); + } } - // Force the child's input power requirements to 0 unless the prevent - // idle-sleep flag is set. No input power flags map to our state 0. - // Our power clamp (deviceDesire) keeps the minimum power state at 2. + return notifier; +} - if (desiredFlags & kIOPMPreventIdleSleep) - editedDesire = kIOPMPreventIdleSleep | kIOPMPowerOn; - else - editedDesire = 0; +//****************************************************************************** +// systemMessageFilter +// +//****************************************************************************** - // Recompute sleep supported flag (doze if not supported) - sleepIsSupported = true; +bool IOPMrootDomain::systemMessageFilter( + void * object, void * arg1, void * arg2, void * arg3 ) +{ + const IOPMInterestContext * context = (const IOPMInterestContext *) arg1; + bool isCapMsg = (context->messageType == kIOMessageSystemCapabilityChange); + bool isCapClient = false; + bool allow = false; - iter = getChildIterator(gIOPowerPlane); - if ( iter ) - { - while ( (next = iter->getNextObject()) ) + do { + if ((kSystemTransitionNewCapClient == _systemTransitionType) && + (!isCapMsg || !_joinedCapabilityClients || + !_joinedCapabilityClients->containsObject((OSObject *) object))) + break; + + // Capability change message for app and kernel clients. + + if (isCapMsg) { - if ( (connection = OSDynamicCast(IOPowerConnection, next)) ) - { - // Ignore child that are in the process of joining. - if (connection->getReadyFlag() == false) - continue; + if ((context->notifyType == kNotifyPriority) || + (context->notifyType == kNotifyCapabilityChangePriority)) + isCapClient = true; - // Is this connection attached to the child that called - // requestPowerDomainState()? + if ((context->notifyType == kNotifyCapabilityChangeApps) && + (object == (void *) systemCapabilityNotifier)) + isCapClient = true; + } - if (connection == whichChild) - { - // OR in the child's input power requirements. - powerRequestFlag |= editedDesire; + if (isCapClient) + { + IOPMSystemCapabilityChangeParameters * capArgs = + (IOPMSystemCapabilityChangeParameters *) arg2; - if ( desiredFlags & kIOPMPreventSystemSleep ) - sleepIsSupported = false; - } + if (kSystemTransitionNewCapClient == _systemTransitionType) + { + capArgs->fromCapabilities = 0; + capArgs->toCapabilities = _currentCapability; + capArgs->changeFlags = 0; + } + else + { + capArgs->fromCapabilities = _currentCapability; + capArgs->toCapabilities = _pendingCapability; + + if (context->isPreChange) + capArgs->changeFlags = kIOPMSystemCapabilityWillChange; else - { - if (kIOLogPMRootDomain & gIOKitDebug) - { - IOService * powerChild = - (IOService *) connection->getChildEntry(gIOPowerPlane); - DLOG("child %p, state %ld, noIdle %d, noSleep %d - %s\n", - powerChild, - connection->getDesiredDomainState(), - connection->getPreventIdleSleepFlag(), - connection->getPreventSystemSleepFlag(), - powerChild ? powerChild->getName() : "?"); - } + capArgs->changeFlags = kIOPMSystemCapabilityDidChange; + } - // OR in the child's desired power state (0 or ON_STATE). 
-                    powerRequestFlag |= connection->getDesiredDomainState();
+            // Capability change messages only go to the PM configd plugin.
+            // Wait for response post-change if capability is increasing.
+            // Wait for response pre-change if capability is decreasing.

-                    if ( connection->getPreventSystemSleepFlag() )
-                        sleepIsSupported = false;
-                }
+            if ((context->notifyType == kNotifyCapabilityChangeApps) && arg3 &&
+                ( (capabilityLoss && context->isPreChange)    ||
+                  (!capabilityLoss && !context->isPreChange) ) )
+            {
+                // app has not replied yet, wait for it
+                *((OSObject **) arg3) = kOSBooleanFalse;
             }
+
+            allow = true;
+            break;
         }
-        iter->release();
-    }

-    DLOG("childPowerFlags 0x%lx, extraSleepDelay %ld\n",
-        powerRequestFlag, extraSleepDelay);
+        // Capability client will always see kIOMessageCanSystemSleep,
+        // even for demand sleep.

-    if ( !powerRequestFlag && !systemBooting )
-    {
-        if (!wrangler)
+        if ((kIOMessageCanSystemSleep == context->messageType) ||
+            (kIOMessageSystemWillNotSleep == context->messageType))
         {
-            sleepASAP = false;
-            changePowerStateToPriv(ON_STATE);
-            if (idleSeconds)
+            if (object == (OSObject *) systemCapabilityNotifier)
             {
-                // stay awake for at least idleSeconds
-                startIdleSleepTimer(idleSeconds);
+                allow = true;
+                break;
+            }
+
+            // Not idle sleep, don't ask apps.
+            if (context->changeFlags & kIOPMSkipAskPowerDown)
+            {
+                break;
             }
         }
-        else if (!extraSleepDelay && !idleSleepTimerPending)
+
+        // Reject capability change messages for legacy clients.
+        // Reject legacy system sleep messages for capability client.
+
+        if (isCapMsg || (object == (OSObject *) systemCapabilityNotifier))
         {
-            sleepASAP = true;
+            break;
        }
-    }
-
-    // Drop our power clamp to SLEEP_STATE when all children became idle,
-    // and the system sleep and display sleep values are equal.

-    adjustPowerState();
+        // Filter system sleep messages.

-    // If our power clamp has already dropped to SLEEP_STATE, and no child
-    // is keeping us at ON_STATE, then this will trigger idle sleep.
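+        // (_systemMessageClientMask selects, per transition, which client
+        // classes receive these messages; app and kernel-priority clients
+        // are gated independently.)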
+ if ((context->notifyType == kNotifyApps) && + (_systemMessageClientMask & kSystemMessageClientApp)) + { + allow = true; + } + else if ((context->notifyType == kNotifyPriority) && + (_systemMessageClientMask & kSystemMessageClientKernel)) + { + allow = true; + } + } + while (false); - editedDesire |= (desiredFlags & kIOPMPreventSystemSleep); + if (allow && isCapMsg && _joinedCapabilityClients) + { + _joinedCapabilityClients->removeObject((OSObject *) object); + if (_joinedCapabilityClients->getCount() == 0) + { + DLOG("destroyed capability client set %p\n", + _joinedCapabilityClients); + _joinedCapabilityClients->release(); + _joinedCapabilityClients = 0; + } + } - return super::requestPowerDomainState( - editedDesire, whichChild, specification); + return allow; } - //****************************************************************************** -// handlePlatformHaltRestart +// setMaintenanceWakeCalendar // //****************************************************************************** -struct HaltRestartApplierContext { - IOPMrootDomain * RootDomain; - unsigned long PowerState; - IOPMPowerFlags PowerFlags; - UInt32 MessageType; - UInt32 Counter; -}; - -static void -platformHaltRestartApplier( OSObject * object, void * context ) +IOReturn IOPMrootDomain::setMaintenanceWakeCalendar( + const IOPMCalendarStruct * calendar ) { - IOPowerStateChangeNotification notify; - HaltRestartApplierContext * ctx; - AbsoluteTime startTime; - UInt32 deltaTime; + OSData * data; + IOReturn ret; - ctx = (HaltRestartApplierContext *) context; - - memset(¬ify, 0, sizeof(notify)); - notify.powerRef = (void *)ctx->Counter; - notify.returnValue = 0; - notify.stateNumber = ctx->PowerState; - notify.stateFlags = ctx->PowerFlags; + if (!calendar) + return kIOReturnBadArgument; + + data = OSData::withBytesNoCopy((void *) calendar, sizeof(*calendar)); + if (!data) + return kIOReturnNoMemory; + + ret = setPMSetting(gIOPMSettingMaintenanceWakeCalendarKey, data); - clock_get_uptime(&startTime); - ctx->RootDomain->messageClient( ctx->MessageType, object, (void *)¬ify ); - deltaTime = computeDeltaTimeMS(&startTime); + data->release(); + return ret; +} - if ((deltaTime > kPMHaltTimeoutMS) || (gIOKitDebug & kIOLogDebugPower)) - { - _IOServiceInterestNotifier * notifier; - notifier = OSDynamicCast(_IOServiceInterestNotifier, object); +// MARK: - +// MARK: Display Wrangler - // IOService children of IOPMrootDomain are not instrumented. - // Only IORootParent currently falls under that group. +//****************************************************************************** +// displayWranglerNotification +// +// Handle the notification when the IODisplayWrangler changes power state. +//****************************************************************************** - if (notifier) - { - KLOG("%s handler %p took %u ms\n", - (ctx->MessageType == kIOMessageSystemWillPowerOff) ? 
- "PowerOff" : "Restart", - notifier->handler, (uint32_t) deltaTime ); - } - } +IOReturn IOPMrootDomain::displayWranglerNotification( + void * target, void * refCon, + UInt32 messageType, IOService * service, + void * messageArgument, vm_size_t argSize ) +{ +#if !NO_KERNEL_HID + int displayPowerState; + IOPowerStateChangeNotification * params = + (IOPowerStateChangeNotification *) messageArgument; - ctx->Counter++; -} + if ((messageType != kIOMessageDeviceWillPowerOff) && + (messageType != kIOMessageDeviceHasPoweredOn)) + return kIOReturnUnsupported; -void IOPMrootDomain::handlePlatformHaltRestart( UInt32 pe_type ) -{ - HaltRestartApplierContext ctx; - AbsoluteTime startTime; - UInt32 deltaTime; + ASSERT_GATED(); + if (!gRootDomain) + return kIOReturnUnsupported; - memset(&ctx, 0, sizeof(ctx)); - ctx.RootDomain = this; + displayPowerState = params->stateNumber; + DLOG("DisplayWrangler message 0x%x, power state %d\n", + (uint32_t) messageType, displayPowerState); - clock_get_uptime(&startTime); - switch (pe_type) - { - case kPEHaltCPU: - case kPEUPSDelayHaltCPU: - ctx.PowerState = OFF_STATE; - ctx.MessageType = kIOMessageSystemWillPowerOff; - break; + switch (messageType) { + case kIOMessageDeviceWillPowerOff: - case kPERestartCPU: - ctx.PowerState = RESTART_STATE; - ctx.MessageType = kIOMessageSystemWillRestart; - break; + // Display wrangler has dropped power due to display idle + // or force system sleep. + // + // 4 Display ON + // 3 Display Dim + // 2 Display Sleep + // 1 Not visible to user + // 0 Not visible to user - default: - return; - } + if (displayPowerState > 2) + break; + + gRootDomain->evaluatePolicy( kStimulusDisplayWranglerSleep ); + break; - // Notify legacy clients - applyToInterested(gIOPriorityPowerStateInterest, platformHaltRestartApplier, &ctx); + case kIOMessageDeviceHasPoweredOn: - // For normal shutdown, turn off File Server Mode. - if (kPEHaltCPU == pe_type) - { - const OSSymbol * setting = OSSymbol::withCString(kIOPMSettingRestartOnPowerLossKey); - OSNumber * num = OSNumber::withNumber((unsigned long long) 0, 32); - if (setting && num) - { - setPMSetting(setting, num); - setting->release(); - num->release(); - } - } + // Display wrangler has powered on due to user activity + // or wake from sleep. - // Notify in power tree order - notifySystemShutdown(this, ctx.MessageType); + if ( 4 != displayPowerState ) + break; - deltaTime = computeDeltaTimeMS(&startTime); - KLOG("%s all drivers took %u ms\n", - (ctx.MessageType == kIOMessageSystemWillPowerOff) ? - "PowerOff" : "Restart", - (uint32_t) deltaTime ); + gRootDomain->evaluatePolicy( kStimulusDisplayWranglerWake ); + break; + } +#endif + return kIOReturnUnsupported; } - -//****************************************************************************** -// registerInterest +//********************************************************************************* +// displayWranglerMatchPublished // +// Receives a notification when the IODisplayWrangler is published. +// When it's published we install a power state change handler. 
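+// The handler installed is displayWranglerNotification() above, registered
+// below for gIOGeneralInterest on the newly published wrangler service.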
//****************************************************************************** -IONotifier * IOPMrootDomain::registerInterest( - const OSSymbol * typeOfInterest, - IOServiceInterestHandler handler, - void * target, void * ref ) +bool IOPMrootDomain::displayWranglerMatchPublished( + void * target, + void * refCon, + IOService * newService, + IONotifier * notifier __unused) { - IONotifier * notifier; - bool isConfigd; - - isConfigd = typeOfInterest && - typeOfInterest->isEqualTo(kIOPMPrivilegedPowerInterest); - - if (isConfigd) - typeOfInterest = gIOAppPowerStateInterest; - - notifier = super::registerInterest(typeOfInterest, handler, target, ref); - -#if ROOT_DOMAIN_RUN_STATES - if (isConfigd && notifier && pmPowerStateQueue) +#if !NO_KERNEL_HID + // found the display wrangler, now install a handler + if( !newService->registerInterest( gIOGeneralInterest, + &displayWranglerNotification, target, 0) ) { - notifier->retain(); - if (pmPowerStateQueue->submitPowerEvent( - kPowerEventConfigdRegisteredInterest, notifier) == false) - notifier->release(); + return false; } #endif - - return notifier; + return true; } -static bool clientMessageFilter( OSObject * object, void * arg ) +//****************************************************************************** +// reportUserInput +// +//****************************************************************************** + +void IOPMrootDomain::reportUserInput( void ) { -#if ROOT_DOMAIN_RUN_STATES -#if LOG_INTEREST_CLIENTS - IOPMInterestContext * context = (IOPMInterestContext *) arg; -#endif - bool allow = false; +#if !NO_KERNEL_HID + OSIterator * iter; - switch (gMessageClientType) + if(!wrangler) { - case kMessageClientNone: - allow = false; - break; - - case kMessageClientAll: - allow = true; - break; - - case kMessageClientConfigd: - allow = ((object == (OSObject *) gConfigdNotifier) || - (object == (OSObject *) gSysPowerDownNotifier)); - break; + iter = getMatchingServices(serviceMatching("IODisplayWrangler")); + if(iter) + { + wrangler = (IOService *) iter->getNextObject(); + iter->release(); + } } -#if LOG_INTEREST_CLIENTS - if (allow) - DLOG("system message %x to %p\n", - context->msgType, object); -#endif - - return allow; -#else - return true; + if(wrangler) + wrangler->activityTickle(0,0); #endif } +// MARK: - +// MARK: Battery //****************************************************************************** -// tellChangeDown +// batteryPublished // -// We override the superclass implementation so we can send a different message -// type to the client or application being notified. +// Notification on battery class IOPowerSource appearance //****************************************************************************** -bool IOPMrootDomain::tellChangeDown( unsigned long stateNum ) -{ - bool done; +bool IOPMrootDomain::batteryPublished( + void * target, + void * root_domain, + IOService * resourceService, + IONotifier * notifier __unused ) +{ + // rdar://2936060&4435589 + // All laptops have dimmable LCD displays + // All laptops have batteries + // So if this machine has a battery, publish the fact that the backlight + // supports dimming. 
+    ((IOPMrootDomain *)root_domain)->publishFeature("DisplayDims");

-    DLOG("tellChangeDown %u->%u, R-state %u\n",
+    return (true);
+}

-        (uint32_t) getPowerState(), (uint32_t) stateNum, runStateIndex);

-    switch ( stateNum ) {
-        case DOZE_STATE:
-        case SLEEP_STATE:
+// MARK: -
+// MARK: System PM Policy

-            if (!ignoreChangeDown)
-            {
-                userActivityAtSleep = userActivityCount;
-                hibernateAborted = false;
-                DLOG("tellChangeDown::userActivityAtSleep %d\n", userActivityAtSleep);
+//******************************************************************************
+// checkSystemCanSleep
+//
+//******************************************************************************

-                // Direct callout into OSKext so it can disable kext unloads
-                // during sleep/wake to prevent deadlocks.
-                OSKextSystemSleepOrWake( kIOMessageSystemWillSleep );
+bool IOPMrootDomain::checkSystemCanSleep( IOOptionBits options )
+{
+    int err = 0;

-                if ( (SLEEP_STATE == stateNum) && sleepSupportedPEFunction )
-                {
-                    // Reset PCI prevent sleep flag before calling platform driver.
-                    OSBitAndAtomic(~kPCICantSleep, &platformSleepSupport);
+    // Conditions that prevent idle and demand system sleep.

-                    // Skip PCI check for maintenance sleep.
-                    if ((runStateFlags & kRStateFlagSuppressPCICheck) == 0)
-                    {
-                        // Determine if the machine supports sleep, or must doze.
-                        getPlatform()->callPlatformFunction(
-                            sleepSupportedPEFunction, false,
-                            NULL, NULL, NULL, NULL);
-                    }
+    do {
+        if (userDisabledAllSleep)
+        {
+            err = 1;        // 1. user-space sleep kill switch
+            break;
+        }

-                    // If the machine only supports doze, the callPlatformFunction call
-                    // boils down to IOPMrootDomain::setSleepSupported(kPCICantSleep),
-                    // otherwise nothing.
-                }
+        if (systemBooting || systemShutdown)
+        {
+            err = 2;        // 2. restart or shutdown in progress
+            break;
+        }

-                // Notify platform that sleep has begun
-                getPlatform()->callPlatformFunction(
-                    sleepMessagePEFunction, false,
-                    (void *)(uintptr_t) kIOMessageSystemWillSleep,
-                    NULL, NULL, NULL);
+        if (options == 0)
+            break;

-                // Update canSleep and kIOSleepSupportedKey property so drivers
-                // can tell if platform is going to sleep versus doze.
+        // Conditions above peg the system at full wake.
+        // Conditions below prevent system sleep but do not prevent
+        // dark wake, and must be evaluated from a gated context.

-#if CONFIG_SLEEP
-                canSleep = true;
-#else
-                canSleep = false;
+#if !CONFIG_SLEEP
+        err = 3;            // 3. config does not support sleep
+        break;
 #endif

-                if (!sleepIsSupported)
-                    canSleep = false;
-                if (platformSleepSupport & kPCICantSleep)
-                    canSleep = false;
-                setProperty(kIOSleepSupportedKey, canSleep);
-                DLOG("canSleep %d\n", canSleep);
-
-                // Publish the new sleep-wake UUID
-                publishSleepWakeUUID(true);
-
-                // Two change downs are sent by IOServicePM. Ignore the 2nd.
-                ignoreChangeDown = true;
-
-                tracePoint( kIOPMTracePointSystemSleepAppsPhase);
-            }
+        if (lowBatteryCondition)
+        {
+            break;          // always sleep on low battery
+        }

-            DLOG("kIOMessageSystemWillSleep (%d)\n", gMessageClientType);
-            done = super::tellClientsWithResponse(
-                kIOMessageSystemWillSleep, clientMessageFilter);
-            break;

-        default:
-            done = super::tellChangeDown(stateNum);
+        if (childPreventSystemSleep)
+        {
+            err = 4;        // 4. child prevent system sleep clamp
             break;
-    }
-    return done;
-}
-
+        }

-//******************************************************************************
-// askChangeDown
-//
-// We override the superclass implementation so we can send a different message
-// type to the client or application being notified.
-//
-// This must be idle sleep since we don't ask during any other power change.
-//******************************************************************************
+        if (getPMAssertionLevel( kIOPMDriverAssertionCPUBit ) ==
+            kIOPMDriverAssertionLevelOn)
+        {
+            err = 5;        // 5. CPU assertion
+            break;
+        }

-bool IOPMrootDomain::askChangeDown( unsigned long stateNum )
-{
-    DLOG("askChangeDown %u->%u, R-state %u\n",
-        (uint32_t) getPowerState(), (uint32_t) stateNum, runStateIndex);
-    DLOG("kIOMessageCanSystemSleep (%d)\n", gMessageClientType);
+        if (pciCantSleepValid)
+        {
+            if (pciCantSleepFlag)
+                err = 6;    // 6. PCI card does not support PM (cached)
+            break;
+        }
+        else if (sleepSupportedPEFunction &&
+                 CAP_HIGHEST(kIOPMSystemCapabilityGraphics))
+        {
+            IOReturn ret;
+            OSBitAndAtomic(~kPCICantSleep, &platformSleepSupport);
+            ret = getPlatform()->callPlatformFunction(
+                                sleepSupportedPEFunction, false,
+                                NULL, NULL, NULL, NULL);
+            pciCantSleepValid = true;
+            pciCantSleepFlag  = false;
+            if ((platformSleepSupport & kPCICantSleep) ||
+                ((ret != kIOReturnSuccess) && (ret != kIOReturnUnsupported)))
+            {
+                err = 6;    // 6. PCI card does not support PM
+                pciCantSleepFlag = true;
+                break;
+            }
+        }
+    }
+    while (false);

-    return super::tellClientsWithResponse(
-        kIOMessageCanSystemSleep,
-        clientMessageFilter);
+    if (err)
+    {
+        DLOG("System sleep prevented by %d\n", err);
+        return false;
+    }
+    return true;
 }

-
-//******************************************************************************
-// tellNoChangeDown
-//
-// Notify registered applications and kernel clients that we are not dropping
-// power.
-//
-// We override the superclass implementation so we can send a different message
-// type to the client or application being notified.
+//******************************************************************************
+// adjustPowerState
 //
-// This must be a vetoed idle sleep, since no other power change can be vetoed.
+// Conditions that affect our wake/sleep decision have changed.
+// If conditions dictate that the system must remain awake, clamp power
+// state to max with changePowerStateToPriv(ON). Otherwise if sleepASAP
+// is TRUE, then remove the power clamp and allow the power state to drop
+// to SLEEP_STATE.
 //******************************************************************************

-void IOPMrootDomain::tellNoChangeDown( unsigned long stateNum )
+void IOPMrootDomain::adjustPowerState( bool sleepASAP )
 {
-    DLOG("tellNoChangeDown %u->%u, R-state %u\n",
-        (uint32_t) getPowerState(), (uint32_t) stateNum, runStateIndex);
+    DLOG("adjustPowerState ps %u, asap %d, slider %ld\n",
+        (uint32_t) getPowerState(), sleepASAP, sleepSlider);

-    // Sleep canceled, clear the sleep trace point.
-    tracePoint(kIOPMTracePointSystemUp);
+    ASSERT_GATED();

-    if (idleSeconds && !wrangler)
+    if ((sleepSlider == 0) || !checkSystemCanSleep())
     {
-        // stay awake for at least idleSeconds
-        sleepASAP = false;
-        startIdleSleepTimer(idleSeconds);
+        changePowerStateToPriv(ON_STATE);
+    }
+    else if ( sleepASAP )
+    {
+        changePowerStateToPriv(SLEEP_STATE);
     }
-    DLOG("kIOMessageSystemWillNotSleep (%d)\n", gMessageClientType);
-    return tellClients(kIOMessageSystemWillNotSleep, clientMessageFilter);
 }

-
-//******************************************************************************
-// tellChangeUp
-//
-// Notify registered applications and kernel clients that we are raising power.
+//******************************************************************************
+// dispatchPowerEvent
 //
-// We override the superclass implementation so we can send a different message
-// type to the client or application being notified.
+// IOPMPowerStateQueue callback function. Running on PM work loop thread.
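+//
+// Producer-side sketch, taken from receivePowerNotification() below: events
+// may be submitted from arbitrary contexts and are serialized onto this
+// work loop thread, e.g.
+//     pmPowerStateQueue->submitPowerEvent(
+//         kPowerEventReceivedPowerNotification, (void *) msg );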
//******************************************************************************

-void IOPMrootDomain::tellChangeUp( unsigned long stateNum )
+void IOPMrootDomain::dispatchPowerEvent(
+    uint32_t event, void * arg0, uint64_t arg1 )
 {
-    OSData *publishPMStats = NULL;
-
-    DLOG("tellChangeUp %u->%u, R-state %u\n",
-        (uint32_t) getPowerState(), (uint32_t) stateNum, runStateIndex);
-
-    ignoreChangeDown = false;
+    DLOG("power event %u args %p 0x%llx\n", event, arg0, arg1);
+    ASSERT_GATED();

-    if ( stateNum == ON_STATE )
+    switch (event)
     {
-        // Direct callout into OSKext so it can disable kext unloads
-        // during sleep/wake to prevent deadlocks.
-        OSKextSystemSleepOrWake( kIOMessageSystemHasPoweredOn );
-
-        // Notify platform that sleep was cancelled or resumed.
-        getPlatform()->callPlatformFunction(
-            sleepMessagePEFunction, false,
-            (void *)(uintptr_t) kIOMessageSystemHasPoweredOn,
-            NULL, NULL, NULL);
+        case kPowerEventFeatureChanged:
+            messageClients(kIOPMMessageFeatureChange, this);
+            break;

-        if (getPowerState() == ON_STATE)
-        {
-            // this is a quick wake from aborted sleep
-            if (idleSeconds && !wrangler)
+        case kPowerEventReceivedPowerNotification:
+            handlePowerNotification( (UInt32)(uintptr_t) arg0 );
+            break;
+
+        case kPowerEventSystemBootCompleted:
+            if (systemBooting)
             {
-                // stay awake for at least idleSeconds
-                sleepASAP = false;
-                startIdleSleepTimer(idleSeconds);
-            }
-            DLOG("kIOMessageSystemWillPowerOn (%d)\n", gMessageClientType);
-            tellClients(kIOMessageSystemWillPowerOn, clientMessageFilter);
-        }
-#if HIBERNATION
-        else
-        {
-            IOHibernateSystemPostWake();
-        }
-#endif
-
-        tracePoint(kIOPMTracePointSystemWakeAppsPhase);
-        publishPMStats = OSData::withBytes(&pmStats, sizeof(pmStats));
-        setProperty(kIOPMSleepStatisticsKey, publishPMStats);
-        publishPMStats->release();
-        bzero(&pmStats, sizeof(pmStats));
+                systemBooting = false;

-        if (pmStatsAppResponses)
-        {
-            setProperty(kIOPMSleepStatisticsAppsKey, pmStatsAppResponses);
-            pmStatsAppResponses->release();
-            pmStatsAppResponses = OSArray::withCapacity(5);
-        }
-
-        DLOG("kIOMessageSystemHasPoweredOn (%d)\n", gMessageClientType);
-        tellClients(kIOMessageSystemHasPoweredOn, clientMessageFilter);
+                // If lid is closed, re-send lid closed notification
+                // now that booting is complete.
+                if ( clamshellClosed )
+                {
+                    handlePowerNotification(kLocalEvalClamshellCommand);
+                }
+                evaluatePolicy( kStimulusAllowSystemSleepChanged );
+            }
+            break;

-        tracePoint(kIOPMTracePointSystemUp);
-    }
-}
+        case kPowerEventSystemShutdown:
+            if (kOSBooleanTrue == (OSBoolean *) arg0)
+            {
+                /* We set systemShutdown = true during shutdown
+                   to prevent sleep at unexpected times while loginwindow is trying
+                   to shut down apps and while the OS is trying to transition to
+                   complete power off.
+
+                   Set to true during shutdown, as soon as loginwindow shows
+                   the "shutdown countdown dialog", through individual app
+                   termination, and through black screen kernel shutdown.
+                 */
+                systemShutdown = true;
+            } else {
+                /*
+                 A shutdown was initiated, but then the shutdown
+                 was cancelled, clearing systemShutdown to false here.
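+                 (For example, the user backed out of the shutdown
+                 countdown dialog before it completed.)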
+ */ + systemShutdown = false; + } + break; + case kPowerEventUserDisabledSleep: + userDisabledAllSleep = (kOSBooleanTrue == (OSBoolean *) arg0); + break; -//****************************************************************************** -// reportUserInput -// -//****************************************************************************** + case kPowerEventRegisterSystemCapabilityClient: + if (systemCapabilityNotifier) + { + systemCapabilityNotifier->release(); + systemCapabilityNotifier = 0; + } + if (arg0) + { + systemCapabilityNotifier = (IONotifier *) arg0; + systemCapabilityNotifier->retain(); + } + /* intentional fall-through */ -void IOPMrootDomain::reportUserInput( void ) -{ -#if !NO_KERNEL_HID - OSIterator * iter; + case kPowerEventRegisterKernelCapabilityClient: + if (!_joinedCapabilityClients) + _joinedCapabilityClients = OSSet::withCapacity(8); + if (arg0) + { + IONotifier * notify = (IONotifier *) arg0; + if (_joinedCapabilityClients) + { + _joinedCapabilityClients->setObject(notify); + synchronizePowerTree( kIOPMSyncNoChildNotify ); + } + notify->release(); + } + break; - if(!wrangler) - { - iter = getMatchingServices(serviceMatching("IODisplayWrangler")); - if(iter) - { - wrangler = (IOService *) iter->getNextObject(); - iter->release(); - } - } + case kPowerEventPolicyStimulus: + if (arg0) + { + int stimulus = (uintptr_t) arg0; + evaluatePolicy( stimulus, (uint32_t) arg1 ); + } + break; - if(wrangler) - wrangler->activityTickle(0,0); -#endif -} + case kPowerEventAssertionCreate: + if (pmAssertions) { + pmAssertions->handleCreateAssertion((OSData *)arg0); + } + break; -//****************************************************************************** -// setQuickSpinDownTimeout -// -//****************************************************************************** + case kPowerEventAssertionRelease: + if (pmAssertions) { + pmAssertions->handleReleaseAssertion(arg1); + } + break; -void IOPMrootDomain::setQuickSpinDownTimeout( void ) -{ - ASSERT_GATED(); - setAggressiveness( - kPMMinutesToSpinDown, 0, kAggressivesOptionQuickSpindownEnable ); + case kPowerEventAssertionSetLevel: + if (pmAssertions) { + pmAssertions->handleSetAssertionLevel(arg1, (IOPMDriverAssertionLevel)(uintptr_t)arg0); + } + break; + + case kPowerEventQueueSleepWakeUUID: + handleQueueSleepWakeUUID((OSObject *)arg0); + break; + case kPowerEventPublishSleepWakeUUID: + handlePublishSleepWakeUUID((bool)arg0); + break; + } } - //****************************************************************************** -// restoreUserSpinDownTimeout +// systemPowerEventOccurred +// +// The power controller is notifying us of a hardware-related power management +// event that we must handle. // +// systemPowerEventOccurred covers the same functionality that +// receivePowerNotification does; it simply provides a richer API for conveying +// more information. 
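+//
+// Both overloads below record the value in the dictionary published under
+// kIOPMRootDomainPowerStatusKey and then message clients with
+// kIOPMMessageSystemPowerEventOccurred; the intValue variant simply wraps
+// its argument in an OSNumber first.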
//******************************************************************************

-void IOPMrootDomain::restoreUserSpinDownTimeout( void )
+IOReturn IOPMrootDomain::systemPowerEventOccurred(
+    const OSSymbol *event,
+    uint32_t intValue)
 {
-    ASSERT_GATED();
-    setAggressiveness(
-        kPMMinutesToSpinDown, 0, kAggressivesOptionQuickSpindownDisable );
-}
+    IOReturn        attempt = kIOReturnSuccess;
+    OSNumber        *newNumber = NULL;
+
+    if (!event)
+        return kIOReturnBadArgument;
+
+    newNumber = OSNumber::withNumber(intValue, 8*sizeof(intValue));
+    if (!newNumber)
+        return kIOReturnInternalError;

+    attempt = systemPowerEventOccurred(event, (OSObject *)newNumber);

-//******************************************************************************
-// changePowerStateTo & changePowerStateToPriv
-//
-// Override of these methods for logging purposes.
-//******************************************************************************
+    newNumber->release();

-IOReturn IOPMrootDomain::changePowerStateTo( unsigned long ordinal )
-{
-    return kIOReturnUnsupported;    // ignored
+    return attempt;
 }

-IOReturn IOPMrootDomain::changePowerStateToPriv( unsigned long ordinal )
+IOReturn IOPMrootDomain::systemPowerEventOccurred(
+    const OSSymbol *event,
+    OSObject *value)
 {
-    DLOG("changePowerStateToPriv(%lu)\n", ordinal);
+    OSDictionary *thermalsDict = NULL;
+    bool shouldUpdate = true;
+
+    if (!event || !value)
+        return kIOReturnBadArgument;

-    if ( (getPowerState() == DOZE_STATE) && (ordinal != ON_STATE) )
-    {
-        return kIOReturnSuccess;
-    }
+    // LOCK
+    // We reuse featuresDictLock because it already exists and guards
+    // the very infrequently used publish/remove feature mechanism, so there's
+    // zero risk of stepping on that lock.
+    if (featuresDictLock) IOLockLock(featuresDictLock);

-    if ( (userDisabledAllSleep || systemBooting || systemShutdown) &&
-         (ordinal == SLEEP_STATE) )
-    {
-        DLOG("SLEEP rejected, forced to ON state (UD %d, SB %d, SS %d)\n",
-            userDisabledAllSleep, systemBooting, systemShutdown);
+    thermalsDict = (OSDictionary *)getProperty(kIOPMRootDomainPowerStatusKey);
+
+    if (thermalsDict && OSDynamicCast(OSDictionary, thermalsDict)) {
+        thermalsDict = OSDictionary::withDictionary(thermalsDict);
+    } else {
+        thermalsDict = OSDictionary::withCapacity(1);
+    }

-        super::changePowerStateToPriv(ON_STATE);
+    if (!thermalsDict) {
+        shouldUpdate = false;
+        goto exit;
     }

-    return super::changePowerStateToPriv(ordinal);
-}
+    thermalsDict->setObject (event, value);

-//******************************************************************************
-// activity detect
-//
-//******************************************************************************
+    setProperty (kIOPMRootDomainPowerStatusKey, thermalsDict);

-bool IOPMrootDomain::activitySinceSleep(void)
-{
-    return (userActivityCount != userActivityAtSleep);
-}
+    thermalsDict->release();

-bool IOPMrootDomain::abortHibernation(void)
-{
-    bool ret = activitySinceSleep();
+exit:
+    // UNLOCK
+    if (featuresDictLock) IOLockUnlock(featuresDictLock);

-    if (ret && !hibernateAborted)
-    {
-        DLOG("activitySinceSleep ABORT [%d, %d]\n", userActivityCount, userActivityAtSleep);
-        hibernateAborted = true;
-    }
-    return (ret);
-}
+    if (shouldUpdate)
+        messageClients (kIOPMMessageSystemPowerEventOccurred, (void *)NULL);

-extern "C" int
-hibernate_should_abort(void)
-{
-    if (gRootDomain)
-        return (gRootDomain->abortHibernation());
-    else
-        return (0);
+    return kIOReturnSuccess;
 }

 //******************************************************************************
-// updateRunState
+// receivePowerNotification
 //
+// The power controller is notifying us of a hardware-related power management
+// event that we must handle. This may be a result of an 'environment' interrupt
+// from the power mgt micro.
 //******************************************************************************

-void IOPMrootDomain::updateRunState( uint32_t inRunState )
+IOReturn IOPMrootDomain::receivePowerNotification( UInt32 msg )
 {
-#if ROOT_DOMAIN_RUN_STATES
-    if (inRunState < kRStateCount)
-    {
-        runStateIndex = nextRunStateIndex = inRunState;
-        runStateFlags = gRStateFlags[inRunState];
-
-        setProperty(
-            kIOPMRootDomainRunStateKey,
-            (unsigned long long) inRunState, 32);
-    }
-#endif
+    pmPowerStateQueue->submitPowerEvent(
+        kPowerEventReceivedPowerNotification, (void *) msg );
+    return kIOReturnSuccess;
 }

+void IOPMrootDomain::handlePowerNotification( UInt32 msg )
+{
+    bool eval_clamshell = false;

-#if ROOT_DOMAIN_RUN_STATES
-//******************************************************************************
-// tagPowerPlaneService
-//
-// Running on PM work loop thread.
-//******************************************************************************
+    ASSERT_GATED();

-void IOPMrootDomain::tagPowerPlaneService(
-        IOService *     service,
-        uint32_t *      rdFlags )
-{
-    *rdFlags = 0;
+    /*
+     * Local (IOPMrootDomain only) eval clamshell command
+     */
+    if (msg & kLocalEvalClamshellCommand)
+    {
+        eval_clamshell = true;
+    }

-    if (service->getProperty("IOPMStrictTreeOrder") ||
-        service->metaCast("IODisplayWrangler") ||
-        OSDynamicCast(OSNumber,
-            service->getProperty("IOPMUnattendedWakePowerState")))
+    /*
+     * Overtemp
+     */
+    if (msg & kIOPMOverTemp)
     {
-        *rdFlags |= kServiceFlagGraphics;
-        DLOG("tagged device %s %x\n", service->getName(), *rdFlags);
+        MSG("PowerManagement emergency overtemp signal. Going to sleep!");
+        privateSleepSystem (kIOPMSleepReasonThermalEmergency);
     }

-    // Locate the first PCI host bridge.
-    if (!pciHostBridgeDevice && service->metaCast("IOPCIBridge"))
+    /*
+     * Sleep Now!
+     */
+    if (msg & kIOPMSleepNow)
     {
-        IOService * provider = service->getProvider();
-        if (OSDynamicCast(IOPlatformDevice, provider) &&
-            provider->inPlane(gIODTPlane))
-        {
-            pciHostBridgeDevice = provider;
-            DLOG("PMTrace found PCI host bridge %s->%s\n",
-                provider->getName(), service->getName());
-        }
+        privateSleepSystem (kIOPMSleepReasonSoftware);
+    }
+
+    /*
+     * Power Emergency
+     */
+    if (msg & kIOPMPowerEmergency)
+    {
+        lowBatteryCondition = true;
+        privateSleepSystem (kIOPMSleepReasonLowPower);
     }

-    // Tag top-level PCI devices. The order of PMinit() call does not
-    // change across boots and is used as the PCI bit number.
-    if (pciHostBridgeDevice && service->metaCast("IOPCIDevice"))
+    /*
+     * Clamshell OPEN
+     */
+    if (msg & kIOPMClamshellOpened)
     {
-        // Would prefer to check built-in property, but tagPowerPlaneService()
-        // is called before pciDevice->registerService().
-        IORegistryEntry * parent = service->getParentEntry(gIODTPlane);
-        if ((parent == pciHostBridgeDevice) && service->getProperty("acpi-device"))
+        // Received clamshell open message from clamshell controlling driver
+        // Update our internal state and tell general interest clients
+        clamshellClosed = false;
+        clamshellExists = true;
+
+        if (msg & kIOPMSetValue)
         {
-            int bit = pmTracer->recordTopLevelPCIDevice( service );
-            if (bit >= 0)
-            {
-                // Save the assigned bit for fast lookup.
-                bit &= 0xff;
-                *rdFlags |= (kServiceFlagTopLevelPCI | (bit << 8));
-            }
-        }
-    }
-}
+            reportUserInput();
+        }

+        // Tell PMCPU
+        informCPUStateChange(kInformLid, 0);

-//******************************************************************************
-// handleActivityTickleForService
-//
-// Called by IOService::activityTickle() for a tickle that is requesting the
-// service to raise power state. Called from driver thread.
-//******************************************************************************
+        // Tell general interest clients
+        sendClientClamshellNotification();

-void IOPMrootDomain::handleActivityTickleForService( IOService * service,
-                                                     unsigned long type,
-                                                     unsigned long currentPowerState,
-                                                     uint32_t activityTickleCount )
-{
-    if ((service == wrangler)
-)
-    {
-        bool aborting = ((lastSleepReason == kIOPMSleepReasonIdle)
+        bool aborting = ((lastSleepReason == kIOPMSleepReasonClamshell)
+                      || (lastSleepReason == kIOPMSleepReasonIdle)
                       || (lastSleepReason == kIOPMSleepReasonMaintenance));
         if (aborting) userActivityCount++;
-        DLOG("display wrangler tickled1 %d lastSleepReason %d\n", userActivityCount, lastSleepReason);
-    }
-
-    // Tickle directed to IODisplayWrangler while graphics is disabled.
-    // Bring graphics online.
+        DLOG("clamshell tickled %d lastSleepReason %d\n", userActivityCount, lastSleepReason);
+    }

-    if ((!currentPowerState) &&
-        (service == wrangler) &&
-        (runStateIndex > kRStateNormal) &&
-        (false == wranglerTickled) &&
-        (false == lowBatteryCondition))
+    /*
+     * Clamshell CLOSED
+     * Send the clamshell interest notification since the lid is closing.
+     */
+    if (msg & kIOPMClamshellClosed)
     {
-        DLOG("display wrangler tickled\n");
-        if (kIOLogPMRootDomain & gIOKitDebug)
-            OSReportWithBacktrace("Display Tickle");
-        wranglerTickled = true;
-        synchronizePowerTree();
-    }
-}
-
-//******************************************************************************
-// handlePowerChangeStartForService
-//
-// Running on PM work loop thread.
-//******************************************************************************
+        // Received clamshell closed message from clamshell controlling driver
+        // Update our internal state and tell general interest clients
+        clamshellClosed = true;
+        clamshellExists = true;

-void IOPMrootDomain::handlePowerChangeStartForService(
-    IOService * service,
-    uint32_t *  rdFlags,
-    uint32_t    newPowerState,
-    uint32_t    changeFlags )
-{
-    if (service == this)
-    {
-        uint32_t currentPowerState = (uint32_t) getPowerState();
-        uint32_t nextRunStateFlags;
+        // Tell PMCPU
+        informCPUStateChange(kInformLid, 1);

-        assert(nextRunStateIndex < kRStateCount);
-        nextRunStateFlags = gRStateFlags[nextRunStateIndex];
+        // Tell general interest clients
+        sendClientClamshellNotification();
+
+        // And set eval_clamshell = true so we can attempt a clamshell sleep below
+        eval_clamshell = true;
+    }

-        gMessageClientType = kMessageClientNone;
+    /*
+     * Set Desktop mode (sent from graphics)
+     *
+     * -> reevaluate lid state
+     */
+    if (msg & kIOPMSetDesktopMode)
+    {
+        desktopMode = (0 != (msg & kIOPMSetValue));
+        msg &= ~(kIOPMSetDesktopMode | kIOPMSetValue);

-        // Transition towards or away from ON power state.
+        sendClientClamshellNotification();

-        if ((currentPowerState != newPowerState) &&
-            ((ON_STATE == newPowerState) || (ON_STATE == currentPowerState)))
+        // Re-evaluate the lid state
+        if( clamshellClosed )
         {
-            if ((runStateFlags & kRStateFlagSuppressMessages) == 0)
-                gMessageClientType = kMessageClientAll;
-            else
-                gMessageClientType = kMessageClientConfigd;
+            eval_clamshell = true;
         }
+    }
+
+    /*
+     * AC Adaptor connected
+     *
+     * -> reevaluate lid state
+     */
+    if (msg & kIOPMSetACAdaptorConnected)
+    {
+        acAdaptorConnected = (0 != (msg & kIOPMSetValue));
+        msg &= ~(kIOPMSetACAdaptorConnected | kIOPMSetValue);

-        // Transition caused by deassertion of system notification suppression.
+        // Tell CPU PM
+        informCPUStateChange(kInformAC, !acAdaptorConnected);

-        if ((ON_STATE == newPowerState) &&
-            (ON_STATE == currentPowerState) &&
-            ((runStateFlags ^ nextRunStateFlags) & kRStateFlagSuppressMessages))
-        {
-            gMessageClientType = kMessageClientAll;
-        }
+        // Tell BSD if AC is connected
+        // 0 == external power source; 1 == on battery
+        post_sys_powersource(acAdaptorConnected ? 0:1);

-        if (ON_STATE == newPowerState)
-        {
-            DLOG("kIOMessageSystemWillPowerOn (%d)\n",
-                gMessageClientType);
-            tellClients(kIOMessageSystemWillPowerOn, clientMessageFilter);
-        }
-
-        if (SLEEP_STATE == newPowerState)
+        sendClientClamshellNotification();
+
+        // Re-evaluate the lid state
+        if( clamshellClosed )
         {
-            tracePoint(kIOPMTracePointSleepStarted);
+            eval_clamshell = true;
         }
     }

-    if (*rdFlags & kServiceFlagTopLevelPCI)
+    /*
+     * Enable Clamshell (external display disappeared)
+     *
+     * -> reevaluate lid state
+     */
+    if (msg & kIOPMEnableClamshell)
     {
-        pmTracer->tracePCIPowerChange(
-            PMTraceWorker::kPowerChangeStart,
-            service, changeFlags,
-            (*rdFlags >> 8) & 0xff);
-    }
-}
-
+        // Re-evaluate the lid state
+        // System should sleep on external display disappearance
+        // in lid-closed operation.
+        if( clamshellClosed && (true == clamshellDisabled) )
+        {
+            eval_clamshell = true;
+        }

-//******************************************************************************
-// handlePowerChangeDoneForService
-//
-// Running on PM work loop thread.
-//******************************************************************************
+        clamshellDisabled = false;

-void IOPMrootDomain::handlePowerChangeDoneForService(
-    IOService * service,
-    uint32_t *  rdFlags,
-    uint32_t    newPowerState,
-    uint32_t    changeFlags )
-{
-    if (*rdFlags & kServiceFlagTopLevelPCI)
-    {
-        pmTracer->tracePCIPowerChange(
-            PMTraceWorker::kPowerChangeCompleted,
-            service, changeFlags,
-            (*rdFlags >> 8) & 0xff);
+        sendClientClamshellNotification();
     }
-}
-
-
-//******************************************************************************
-// overridePowerStateForService
-//
-// Runs on PM work loop thread.
-//******************************************************************************
+
+    /*
+     * Disable Clamshell (external display appeared)
+     * We don't bother re-evaluating clamshell state. If the system is awake,
+     * the lid is probably open.
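+     * (Contrast with kIOPMEnableClamshell above, which does re-evaluate the
+     * lid state, since the system may need to sleep when the last external
+     * display goes away while the lid is closed.)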
+     */
+    if (msg & kIOPMDisableClamshell)
+    {
+        clamshellDisabled = true;

-void IOPMrootDomain::overridePowerStateForService(
-    IOService *     service,
-    uint32_t *      rdFlags,
-    unsigned long * powerState,
-    uint32_t        changeFlags )
-{
-    uint32_t inPowerState = (uint32_t) *powerState;
+        sendClientClamshellNotification();
+    }

-    if ((service == this) && (inPowerState == ON_STATE) &&
-        (changeFlags & kIOPMSynchronize))
+    /*
+     * Evaluate clamshell and SLEEP if appropriate
+     */
+    if ( eval_clamshell && shouldSleepOnClamshellClosed() )
     {
-        DLOG("sync root domain %u->%u\n",
-            (uint32_t) getPowerState(), inPowerState);
-        // Root Domain is in a reduced R-state, and a HID tickle has
-        // requested a PM tree sync. Begin R-state transition.
-        if (runStateIndex != kRStateNormal)
-        {
-            sleepTimerMaintenance = false;
-            hibernateNoDefeat = false;
-            nextRunStateIndex = kRStateNormal;
-            setProperty(
-                kIOPMRootDomainRunStateKey,
-                (unsigned long long) kRStateNormal, 32);
-        }
+        // SLEEP!
+        privateSleepSystem (kIOPMSleepReasonClamshell);
     }
-
-    if (*rdFlags & kServiceFlagGraphics)
+    else if ( eval_clamshell )
     {
-        DLOG("graphics device %s %u->%u (flags 0x%x)\n",
-            service->getName(), (uint32_t) service->getPowerState(),
-            inPowerState, changeFlags);
+        evaluatePolicy(kStimulusDarkWakeEvaluate);
+    }

-        if (inPowerState == 0)
+    /*
+     * Power Button
+     */
+    if (msg & kIOPMPowerButton)
+    {
+        if (!wranglerAsleep)
         {
-            // Graphics device is powering down, apply limit preventing
-            // device from powering back up later unless we consent.
-
-            if ((*rdFlags & kServiceFlagNoPowerUp) == 0)
-            {
-                *rdFlags |= kServiceFlagNoPowerUp;
-                DLOG("asserted power limit for %s\n",
-                    service->getName());
+            OSString *pbs = OSString::withCString("DisablePowerButtonSleep");
+            // Check that power button sleep is enabled
+            if( pbs ) {
+                if( kOSBooleanTrue != getProperty(pbs))
+                    privateSleepSystem (kIOPMSleepReasonPowerButton);
             }
         }
         else
-        {
-            uint32_t nextRunStateFlags;
-
-            assert(nextRunStateIndex < kRStateCount);
-            nextRunStateFlags = gRStateFlags[nextRunStateIndex];
-
-            // Graphics device is powering up. Release power limit at the
-            // did-change machine state.
-
-            if (changeFlags & kIOPMSynchronize)
-            {
-                if ((runStateFlags & kRStateFlagSuppressGraphics) &&
-                    ((nextRunStateFlags & kRStateFlagSuppressGraphics) == 0) &&
-                    (changeFlags & kIOPMDomainDidChange))
-                {
-                    // Woke up without graphics power, but
-                    // HID event has tickled display wrangler.
- *rdFlags &= ~kServiceFlagNoPowerUp; - DLOG("removed power limit for %s\n", - service->getName()); - } - } - else if ((runStateFlags & kRStateFlagSuppressGraphics) == 0) - { - *rdFlags &= ~kServiceFlagNoPowerUp; - } - - if (*rdFlags & kServiceFlagNoPowerUp) - { - DLOG("limited %s to power state 0\n", - service->getName()); - *powerState = 0; - } - } + reportUserInput(); } } - -//****************************************************************************** -// setMaintenanceWakeCalendar -// -//****************************************************************************** - -IOReturn IOPMrootDomain::setMaintenanceWakeCalendar( - const IOPMCalendarStruct * calendar ) -{ - OSData * data; - IOReturn ret; - - if (!calendar) - return kIOReturnBadArgument; - - data = OSData::withBytesNoCopy((void *) calendar, sizeof(*calendar)); - if (!data) - return kIOReturnNoMemory; - - ret = setPMSetting(gIOPMSettingMaintenanceWakeCalendarKey, data); - - data->release(); - return ret; -} -#endif /* ROOT_DOMAIN_RUN_STATES */ - - //****************************************************************************** -// sysPowerDownHandler -// -// Receives a notification when the RootDomain changes state. +// evaluatePolicy // -// Allows us to take action on system sleep, power down, and restart after -// applications have received their power change notifications and replied, -// but before drivers have powered down. We perform a vfs sync on power down. +// Evaluate root-domain policy in response to external changes. //****************************************************************************** -IOReturn IOPMrootDomain::sysPowerDownHandler( void * target, void * refCon, - UInt32 messageType, IOService * service, - void * messageArgument, vm_size_t argSize ) +void IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg ) { - IOReturn ret; - IOPowerStateChangeNotification *params = (IOPowerStateChangeNotification *) messageArgument; - IOPMrootDomain *rootDomain = OSDynamicCast(IOPMrootDomain, service); - - DLOG("sysPowerDownHandler message %x\n", (uint32_t) messageType); - - if(!rootDomain) - return kIOReturnUnsupported; - - switch (messageType) { - case kIOMessageSystemWillSleep: - // Interested applications have been notified of an impending power - // change and have acked (when applicable). - // This is our chance to save whatever state we can before powering - // down. - // We call sync_internal defined in xnu/bsd/vfs/vfs_syscalls.c, - // via callout -#if HIBERNATION - rootDomain->evaluateSystemSleepPolicyEarly(); - if (rootDomain->hibernateMode && !rootDomain->hibernateDisabled) - { - // We will ack within 240 seconds - params->returnValue = 240 * 1000 * 1000; - } - else -#endif - // We will ack within 20 seconds - params->returnValue = 20 * 1000 * 1000; - DLOG("sysPowerDownHandler timeout %d s\n", (int) (params->returnValue / 1000 / 1000)); - if ( ! OSCompareAndSwap( 0, 1, &gSleepOrShutdownPending ) ) - { - // Purposely delay the ack and hope that shutdown occurs quickly. - // Another option is not to schedule the thread and wait for - // ack timeout... 
- AbsoluteTime deadline; - clock_interval_to_deadline( 30, kSecondScale, &deadline ); - thread_call_enter1_delayed( rootDomain->diskSyncCalloutEntry, - (thread_call_param_t)params->powerRef, - deadline ); - } - else - thread_call_enter1(rootDomain->diskSyncCalloutEntry, (thread_call_param_t)params->powerRef); - ret = kIOReturnSuccess; - break; - - case kIOMessageSystemWillPowerOff: - case kIOMessageSystemWillRestart: - ret = kIOReturnUnsupported; - break; + union { + struct { + int idleSleepEnabled : 1; + int idleSleepDisabled : 1; + int displaySleep : 1; + int sleepDelayChanged : 1; + int evaluateDarkWake : 1; + } bit; + uint32_t u32; + } flags; + + DLOG("evaluatePolicy( %d, 0x%x )\n", stimulus, arg); - default: - ret = kIOReturnUnsupported; - break; - } - return ret; -} + ASSERT_GATED(); + flags.u32 = 0; -//****************************************************************************** -// publishSleepWakeUUID -// -// -//****************************************************************************** -void IOPMrootDomain::publishSleepWakeUUID( bool shouldPublish ) -{ - if (shouldPublish) + switch (stimulus) { - if (queuedSleepWakeUUIDString) - { - if (OSCompareAndSwap(/*old*/ true, /*new*/ false, &gSleepWakeUUIDIsSet)) + case kStimulusDisplayWranglerSleep: + if (!wranglerAsleep) { - // Upon wake, it takes some time for userland to invalidate the - // UUID. If another sleep is initiated during that period, force - // a CLEAR message to balance the upcoming SET message. - - messageClients( kIOPMMessageSleepWakeUUIDChange, - kIOPMMessageSleepWakeUUIDCleared ); - - DLOG("SleepWake UUID forced clear\n"); + wranglerAsleep = true; + clock_get_uptime(&wranglerSleepTime); + flags.bit.displaySleep = true; } + break; - setProperty(kIOPMSleepWakeUUIDKey, queuedSleepWakeUUIDString); - DLOG("SleepWake UUID published: %s\n", queuedSleepWakeUUIDString->getCStringNoCopy()); - queuedSleepWakeUUIDString->release(); - queuedSleepWakeUUIDString = NULL; - messageClients(kIOPMMessageSleepWakeUUIDChange, - kIOPMMessageSleepWakeUUIDSet); - } - } else { - if (OSCompareAndSwap(/*old*/ true, /*new*/ false, &gSleepWakeUUIDIsSet)) - { - DLOG("SleepWake UUID cleared\n"); - removeProperty(kIOPMSleepWakeUUIDKey); - messageClients(kIOPMMessageSleepWakeUUIDChange, - kIOPMMessageSleepWakeUUIDCleared); - } - } -} - - -//****************************************************************************** -// displayWranglerNotification -// -// Receives a notification when the IODisplayWrangler changes state. -// -// Allows us to take action on display dim/undim. 
-//
-// When the display sleeps we:
-// - Start the idle sleep timer
-// - set the quick spin down timeout
-//
-// On wake from display sleep:
-// - Cancel the idle sleep timer
-// - restore the user's chosen spindown timer from the "quick" spin down value
-//******************************************************************************
-
-IOReturn IOPMrootDomain::displayWranglerNotification(
-    void * target, void * refCon,
-    UInt32 messageType, IOService * service,
-    void * messageArgument, vm_size_t argSize )
-{
-#if !NO_KERNEL_HID
-    int displayPowerState;
-    IOPowerStateChangeNotification * params =
-        (IOPowerStateChangeNotification *) messageArgument;
-
-    if ((messageType != kIOMessageDeviceWillPowerOff) &&
-        (messageType != kIOMessageDeviceHasPoweredOn))
-        return kIOReturnUnsupported;
+        case kStimulusDisplayWranglerWake:
+            wranglerAsleep = false;
+            flags.bit.idleSleepDisabled = true;
+            break;

-    ASSERT_GATED();
-    if (!gRootDomain)
-        return kIOReturnUnsupported;
+        case kStimulusAggressivenessChanged:
+        {
+            unsigned long minutesToIdleSleep  = 0;
+            unsigned long minutesToDisplayDim = 0;
+            unsigned long minutesDelta = 0;

-    displayPowerState = params->stateNumber;
-    DLOG("DisplayWrangler message 0x%x, new power state %d\n",
-        (uint32_t) messageType, displayPowerState);
+            // Fetch latest display and system sleep slider values.
+            getAggressiveness(kPMMinutesToSleep, &minutesToIdleSleep);
+            getAggressiveness(kPMMinutesToDim, &minutesToDisplayDim);
+            DLOG("aggressiveness changed: system %u->%u, display %u\n",
+                (uint32_t) sleepSlider,
+                (uint32_t) minutesToIdleSleep,
+                (uint32_t) minutesToDisplayDim);

-    switch (messageType) {
-       case kIOMessageDeviceWillPowerOff:
+            DLOG("idle time -> %ld secs (ena %d)\n",
+                idleSeconds, (minutesToIdleSleep != 0));

-            // The display wrangler has dropped power because of idle display sleep
-            // or force system sleep.
-            //
-            // 4 Display ON
-            // 3 Display Dim
-            // 2 Display Sleep
-            // 1 Not visible to user
-            // 0 Not visible to user
+            if (0x7fffffff == minutesToIdleSleep)
+                minutesToIdleSleep = idleSeconds;

-            if (gRootDomain->wranglerAsleep || (displayPowerState > 2))
-                break;
+            // How long to wait before sleeping the system once
+            // the display turns off is indicated by 'extraSleepDelay'.

-            // Record the time the display wrangler went to sleep.
+            if ( minutesToIdleSleep > minutesToDisplayDim )
+                minutesDelta = minutesToIdleSleep - minutesToDisplayDim;

-            gRootDomain->wranglerAsleep = true;
-            clock_get_uptime(&gRootDomain->wranglerSleepTime);
+            if ((sleepSlider == 0) && (minutesToIdleSleep != 0))
+                flags.bit.idleSleepEnabled = true;

-            // We start a timer here if the System Sleep timer is greater than the
-            // Display Sleep timer. We kick off this timer when the display sleeps.
-            //
-            // Note that, although Display Dim timings may change adaptively accordingly
-            // to the user's activity patterns, Display Sleep _always_ occurs at the
-            // specified interval since last user activity.
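+            // (Slider moved to "never": idle sleep was previously enabled
+            // and is now turned off.)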
+ if ((sleepSlider != 0) && (minutesToIdleSleep == 0)) + flags.bit.idleSleepDisabled = true; + + if ((minutesDelta != extraSleepDelay) && + !flags.bit.idleSleepEnabled && !flags.bit.idleSleepDisabled) + flags.bit.sleepDelayChanged = true; - if ( gRootDomain->extraSleepDelay ) + if (systemDarkWake && !darkWakeToSleepASAP && + (flags.bit.idleSleepEnabled || flags.bit.idleSleepDisabled)) { - gRootDomain->startIdleSleepTimer(gRootDomain->extraSleepDelay * 60); + // Reconsider decision to remain in dark wake + flags.bit.evaluateDarkWake = true; } - else if ( gRootDomain->sleepSlider ) - { - // Accelerate disk spindown if system sleep and display sleep - // sliders are set to the same value (e.g. both set to 5 min), - // and display is about to go dark. Check that spin down timer - // is non-zero (zero = never spin down) and system sleep is - // not set to never sleep. - gRootDomain->setQuickSpinDownTimeout(); - } + sleepSlider = minutesToIdleSleep; + extraSleepDelay = minutesDelta; + } break; + case kStimulusDemandSystemSleep: + changePowerStateWithOverrideTo( SLEEP_STATE ); break; - case kIOMessageDeviceHasPoweredOn: + case kStimulusAllowSystemSleepChanged: + // FIXME: de-compose to change flags. + adjustPowerState(); + break; - // The display wrangler has powered on either because of user activity - // or wake from sleep/doze. + case kStimulusDarkWakeActivityTickle: + if (false == wranglerTickled) + { + uint32_t options = 0; + IOService * pciRoot = 0; - if ( 4 != displayPowerState ) - break; + if (rejectWranglerTickle) + { + DLOG("rejected tickle, type %u capability %x:%x\n", + _systemTransitionType, + _currentCapability, _pendingCapability); + break; + } + + _desiredCapability |= + (kIOPMSystemCapabilityGraphics | + kIOPMSystemCapabilityAudio); + + if ((kSystemTransitionWake == _systemTransitionType) && + !(_pendingCapability & kIOPMSystemCapabilityGraphics) && + !graphicsSuppressed) + { + DLOG("Promoting to full wake\n"); + + // Elevate to full wake while waking up to dark wake. + // PM will hold off notifying the graphics subsystem about + // system wake as late as possible, so if a HID event does + // arrive, we can turn on graphics on this wake cycle, and + // not have to wait till the following cycle. That latency + // can be huge on some systems. However, once any graphics + // suppression has taken effect, it is too late. All other + // graphics devices must be similarly suppressed. But the + // delay till the following cycle should be very short. + + _pendingCapability |= + (kIOPMSystemCapabilityGraphics | + kIOPMSystemCapabilityAudio); + + // Immediately bring up audio and graphics. + pciRoot = pciHostBridgeDriver; + + // Notify clients about full wake. + _systemMessageClientMask = kSystemMessageClientAll; + tellClients(kIOMessageSystemWillPowerOn); + } - gRootDomain->wranglerAsleep = false; - gRootDomain->adjustPowerState(); - gRootDomain->cancelIdleSleepTimer(); + // Unsafe to cancel once graphics was powered. + // If system woke from dark wake, the return to sleep can + // be cancelled. But "awake -> dark -> sleep" transition + // cannot be cancelled. + + if (!CAP_HIGHEST(kIOPMSystemCapabilityGraphics)) { + options |= kIOPMSyncCancelPowerDown; + } - // Change the spindown value back to the user's selection from our - // accelerated setting. 
-            gRootDomain->restoreUserSpinDownTimeout();
+            synchronizePowerTree( options, pciRoot );
+            wranglerTickled = true;
+            // IOGraphics doesn't light the display even though graphics is
+            // enabled in the kIOMessageSystemCapabilityChange message (radar 9502104).
+            // So, do an explicit activity tickle
+            if(wrangler)
+                wrangler->activityTickle(0,0);
+            if (logWranglerTickle)
+            {
+                AbsoluteTime    now;
+                uint64_t        nsec;
+
+                clock_get_uptime(&now);
+                SUB_ABSOLUTETIME(&now, &systemWakeTime);
+                absolutetime_to_nanoseconds(now, &nsec);
+                MSG("HID tickle %u ms\n",
+                    ((int)((nsec) / 1000000ULL)));
+                logWranglerTickle = false;
+            }
+        }
         break;

-        default:
-            break;
-    }
-#endif
-    return kIOReturnUnsupported;
-}
-
+        case kStimulusDarkWakeEntry:
+        case kStimulusDarkWakeReentry:
+            // Any system transitions since the last dark wake transition
+            // will invalidate the stimulus.

-//******************************************************************************
-// displayWranglerPublished
-//
-// Receives a notification when the IODisplayWrangler is published.
-// When it's published we install a power state change handler.
-//******************************************************************************
+            if (arg == _systemStateGeneration)
+            {
+                DLOG("dark wake entry\n");
+                systemDarkWake = true;
+                wranglerAsleep = true;
+                clock_get_uptime(&wranglerSleepTime);

-bool IOPMrootDomain::displayWranglerPublished(
-    void * target,
-    void * refCon,
-    IOService * newService)
-{
-#if !NO_KERNEL_HID
-    if(!gRootDomain)
-        return false;
+                // Always accelerate disk spindown while in dark wake,
+                // even if system does not support/allow sleep.

-    gRootDomain->wrangler = newService;
+                cancelIdleSleepTimer();
+                setQuickSpinDownTimeout();
+                flags.bit.evaluateDarkWake = true;
+            }
+            break;

-    // we found the display wrangler, now install a handler
-    if( !gRootDomain->wrangler->registerInterest( gIOGeneralInterest,
-                             &displayWranglerNotification, target, 0) )
-    {
-        return false;
-    }
+        case kStimulusDarkWakeEvaluate:
+            if (systemDarkWake)
+            {
+                flags.bit.evaluateDarkWake = true;
+            }
+#if !DARK_TO_FULL_EVALUATE_CLAMSHELL
+            else
+            {
+                // Not through kLocalEvalClamshellCommand to avoid loop.
+                if (clamshellClosed && shouldSleepOnClamshellClosed() &&
+                    checkSystemCanSleep(true))
+                {
+                    privateSleepSystem( kIOPMSleepReasonClamshell );
+                }
+            }
 #endif
-    return true;
-}
+            break;
+    } /* switch(stimulus) */

-//******************************************************************************
-// batteryPublished
-//
-// Notification on battery class IOPowerSource appearance
-//******************************************************************************
+    if (flags.bit.evaluateDarkWake && !wranglerTickled)
+    {
+        if (darkWakeToSleepASAP ||
+            (clamshellClosed && !(desktopMode && acAdaptorConnected)))
+        {
+            // System is currently in dark wake, and no children or
+            // assertions prevent system sleep.

-bool IOPMrootDomain::batteryPublished(
-    void * target,
-    void * root_domain,
-    IOService * resourceService )
-{
-    // rdar://2936060&4435589
-    // All laptops have dimmable LCD displays
-    // All laptops have batteries
-    // So if this machine has a battery, publish the fact that the backlight
-    // supports dimming.
- ((IOPMrootDomain *)root_domain)->publishFeature("DisplayDims"); + if (checkSystemCanSleep(true)) + { + if (lowBatteryCondition) + { + lastSleepReason = kIOPMSleepReasonLowPower; + setProperty(kRootDomainSleepReasonKey, kIOPMLowPowerSleepKey); + } + else if (darkWakeMaintenance) + { + lastSleepReason = kIOPMSleepReasonMaintenance; + setProperty(kRootDomainSleepReasonKey, kIOPMMaintenanceSleepKey); + } + changePowerStateWithOverrideTo( SLEEP_STATE ); + } + else + { + // Parked in dark wake, a tickle will return to full wake + rejectWranglerTickle = false; + } + } else // non-maintenance (network) dark wake + { + if (checkSystemCanSleep(true)) + { + // Release power clamp, and wait for children idle. + adjustPowerState(true); + } + else + { + changePowerStateToPriv(ON_STATE); + } + rejectWranglerTickle = false; + } + } - return (true); -} + if (systemDarkWake) + { + // The rest are irrelevant while system is in dark wake. + flags.u32 = 0; + } + if (flags.bit.displaySleep || flags.bit.sleepDelayChanged) + { + bool cancelQuickSpindown = false; -//****************************************************************************** -// adjustPowerState -// -// Some condition that affects our wake/sleep/doze decision has changed. -// -// If the sleep slider is in the off position, we cannot sleep or doze. -// If the enclosure is open, we cannot sleep or doze. -// If the system is still booting, we cannot sleep or doze. -// -// In those circumstances, we prevent sleep and doze by holding power on with -// changePowerStateToPriv(ON). -// -// If the above conditions do not exist, and also the sleep timer has expired, -// we allow sleep or doze to occur with either changePowerStateToPriv(SLEEP) or -// changePowerStateToPriv(DOZE) depending on whether or not we already know the -// platform cannot sleep. -// -// In this case, sleep or doze will either occur immediately or at the next time -// that no children are holding the system out of idle sleep via the -// kIOPMPreventIdleSleep flag in their power state arrays. -//****************************************************************************** + if (flags.bit.sleepDelayChanged) + { + DLOG("extra sleep timer changed\n"); + cancelIdleSleepTimer(); + cancelQuickSpindown = true; + } + else + { + DLOG("display sleep\n"); + } -void IOPMrootDomain::adjustPowerState( void ) -{ - DLOG("adjustPowerState " - "PS %u, ASAP %d, SL %ld, AS %d, SB %d, SS %d, UD %d\n", - (uint32_t) getPowerState(), sleepASAP, sleepSlider, - allowSleep, systemBooting, systemShutdown, userDisabledAllSleep); + if (wranglerAsleep && !wranglerSleepIgnored) + { + if ( extraSleepDelay ) + { + // Start a timer here if the System Sleep timer is greater + // than the Display Sleep timer. - ASSERT_GATED(); + startIdleSleepTimer(gRootDomain->extraSleepDelay * 60); + } + else if ( sleepSlider ) + { + // Accelerate disk spindown if system sleep and display sleep + // sliders are set to the same value (e.g. both set to 5 min), + // and display is about to go dark. Check that system sleep is + // not set to never sleep. Disk sleep setting is ignored.
+ + setQuickSpinDownTimeout(); + cancelQuickSpindown = false; + } + } + + if (cancelQuickSpindown) + restoreUserSpinDownTimeout(); + } - if ( (sleepSlider == 0) - || !allowSleep - || systemBooting - || systemShutdown - || userDisabledAllSleep - || (runStateFlags & kRStateFlagDisableIdleSleep) ) + if (flags.bit.idleSleepEnabled) { - changePowerStateToPriv(ON_STATE); - } else { - if ( sleepASAP ) + DLOG("idle sleep timer enabled\n"); + if (!wrangler) { - /* Convenient place to run any code at idle sleep time - * IOPMrootDomain initiates an idle sleep here - * - * Set last sleep cause accordingly. - */ - lastSleepReason = kIOPMSleepReasonIdle; - setProperty(kRootDomainSleepReasonKey, kIOPMIdleSleepKey); + changePowerStateToPriv(ON_STATE); + if (idleSeconds) + { + startIdleSleepTimer( idleSeconds ); + } + } + else + { + // Start idle sleep timer if wrangler went to sleep + // while system sleep was disabled. Disk spindown is + // accelerated upon timer expiration. + + if (wranglerAsleep) + { + AbsoluteTime now; + uint64_t nanos; + uint32_t minutesSinceDisplaySleep = 0; + uint32_t sleepDelay; + + clock_get_uptime(&now); + if (CMP_ABSOLUTETIME(&now, &wranglerSleepTime) > 0) + { + SUB_ABSOLUTETIME(&now, &wranglerSleepTime); + absolutetime_to_nanoseconds(now, &nanos); + minutesSinceDisplaySleep = nanos / (60000000000ULL); + } + + if (extraSleepDelay > minutesSinceDisplaySleep) + { + sleepDelay = extraSleepDelay - minutesSinceDisplaySleep; + } + else + { + sleepDelay = 1; // 1 min + } - sleepASAP = false; - changePowerStateToPriv(SLEEP_STATE); + startIdleSleepTimer(sleepDelay * 60); + DLOG("display slept %u min, set idle timer to %u min\n", + minutesSinceDisplaySleep, sleepDelay); + } } } + + if (flags.bit.idleSleepDisabled) + { + DLOG("idle sleep timer disabled\n"); + cancelIdleSleepTimer(); + restoreUserSpinDownTimeout(); + adjustPowerState(); + } } +// MARK: - +// MARK: Statistics + +//****************************************************************************** +// pmStats +// +//****************************************************************************** + void IOPMrootDomain::pmStatsRecordEvent( int eventIndex, AbsoluteTime timestamp) @@ -4783,6 +5870,8 @@ void IOPMrootDomain::pmStatsRecordApplicationResponse( return; } +// MARK: - +// MARK: PMTraceWorker //****************************************************************************** // TracePoint support @@ -4812,7 +5901,7 @@ IOReturn IOPMrootDomain::callPlatformFunction( statusCode = (((uint64_t)tracePointPCI) << 32) | tracePointPhases; if ((tracePointPhases >> 24) != kIOPMTracePointSystemUp) { - LOG("Sleep failure code 0x%08x 0x%08x\n", + MSG("Sleep failure code 0x%08x 0x%08x\n", tracePointPCI, tracePointPhases); } setProperty(kIOPMSleepWakeFailureCodeKey, statusCode, 64); @@ -4827,7 +5916,20 @@ IOReturn IOPMrootDomain::callPlatformFunction( void IOPMrootDomain::tracePoint( uint8_t point ) { - pmTracer->tracePoint(point); + if (!systemBooting) + pmTracer->tracePoint(point); +} + +void IOPMrootDomain::tracePoint( uint8_t point, uint8_t data ) +{ + if (!systemBooting) + pmTracer->tracePoint(point, data); +} + +void IOPMrootDomain::traceDetail( uint32_t detail ) +{ + if (!systemBooting) + pmTracer->traceDetail( detail ); } //****************************************************************************** @@ -4862,7 +5964,7 @@ PMTraceWorker *PMTraceWorker::tracer(IOPMrootDomain *owner) me->pciMappingLock = IOLockAlloc(); me->tracePhase = kIOPMTracePointSystemUp; me->loginWindowPhase = 0; - me->pciBusyBitMask = 0; + me->traceData32 = 0; 
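+ // traceData32 supersedes the old pciBusyBitMask: it carries the PCI busy + // bits during the power plane driver phases and a caller-supplied detail + // word during the priority client phase (see tracePCIPowerChange and + // traceDetail below).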
return me; } @@ -4872,13 +5974,11 @@ void PMTraceWorker::RTC_TRACE(void) { uint32_t wordA; - wordA = tracePhase; // destined for bits 24-31 - wordA <<= 8; - wordA |= loginWindowPhase; // destined for bits 16-23 - wordA <<= 16; + wordA = (tracePhase << 24) | (loginWindowPhase << 16) | + (traceData8 << 8); - tracePointHandler( tracePointTarget, pciBusyBitMask, wordA ); - DLOG("RTC_TRACE wrote 0x%08x 0x%08x\n", pciBusyBitMask, wordA); + tracePointHandler( tracePointTarget, traceData32, wordA ); + _LOG("RTC_TRACE wrote 0x%08x 0x%08x\n", traceData32, wordA); } } @@ -4905,7 +6005,7 @@ int PMTraceWorker::recordTopLevelPCIDevice(IOService * pciDevice) pciDeviceBitMappings->setObject(deviceName)) { index = pciDeviceBitMappings->getCount() - 1; - DLOG("PMTrace PCI array: set object %s => %d\n", + _LOG("PMTrace PCI array: set object %s => %d\n", deviceName->getCStringNoCopy(), index); } if (deviceName) @@ -4932,9 +6032,37 @@ bool PMTraceWorker::serialize(OSSerialize *s) const void PMTraceWorker::tracePoint(uint8_t phase) { + // clear trace detail when phase begins + if (tracePhase != phase) + traceData32 = 0; + + tracePhase = phase; + + DLOG("trace point 0x%02x\n", tracePhase); + RTC_TRACE(); +} + +void PMTraceWorker::tracePoint(uint8_t phase, uint8_t data8) +{ + // clear trace detail when phase begins + if (tracePhase != phase) + traceData32 = 0; + tracePhase = phase; + traceData8 = data8; + + DLOG("trace point 0x%02x 0x%02x\n", tracePhase, traceData8); + RTC_TRACE(); +} + +void PMTraceWorker::traceDetail(uint32_t detail) +{ + if (kIOPMTracePointSleepPriorityClients != tracePhase) + return; + + traceData32 = detail; + DLOG("trace point 0x%02x detail 0x%08x\n", tracePhase, traceData32); - DLOG("IOPMrootDomain: trace point 0x%02x\n", tracePhase); RTC_TRACE(); } @@ -4942,7 +6070,7 @@ void PMTraceWorker::traceLoginWindowPhase(uint8_t phase) { loginWindowPhase = phase; - DLOG("IOPMrootDomain: loginwindow tracepoint 0x%02x\n", loginWindowPhase); + DLOG("loginwindow tracepoint 0x%02x\n", loginWindowPhase); RTC_TRACE(); } @@ -4953,14 +6081,14 @@ void PMTraceWorker::tracePCIPowerChange( uint32_t expectedFlag; // Ignore PCI changes outside of system sleep/wake. - if ((kIOPMTracePointSystemSleepDriversPhase != tracePhase) && - (kIOPMTracePointSystemWakeDriversPhase != tracePhase)) + if ((kIOPMTracePointSleepPowerPlaneDrivers != tracePhase) && + (kIOPMTracePointWakePowerPlaneDrivers != tracePhase)) return; // Only record the WillChange transition when going to sleep, // and the DidChange on the way up. changeFlags &= (kIOPMDomainWillChange | kIOPMDomainDidChange); - expectedFlag = (kIOPMTracePointSystemSleepDriversPhase == tracePhase) ? + expectedFlag = (kIOPMTracePointSleepPowerPlaneDrivers == tracePhase) ? 
kIOPMDomainWillChange : kIOPMDomainDidChange; if (changeFlags != expectedFlag) return; @@ -4972,21 +6100,23 @@ void PMTraceWorker::tracePCIPowerChange( if (kPowerChangeStart == type) { - pciBusyBitMask |= bitMask; - DLOG("PMTrace: Device %s started - bit %2d mask 0x%08x => 0x%08x\n", - service->getName(), bitNum, bitMask, pciBusyBitMask); + traceData32 |= bitMask; + _LOG("PMTrace: Device %s started - bit %2d mask 0x%08x => 0x%08x\n", + service->getName(), bitNum, bitMask, traceData32); } else { - pciBusyBitMask &= ~bitMask; - DLOG("PMTrace: Device %s finished - bit %2d mask 0x%08x => 0x%08x\n", - service->getName(), bitNum, bitMask, pciBusyBitMask); + traceData32 &= ~bitMask; + _LOG("PMTrace: Device %s finished - bit %2d mask 0x%08x => 0x%08x\n", - service->getName(), bitNum, bitMask, pciBusyBitMask); + service->getName(), bitNum, bitMask, traceData32); } - RTC_TRACE(); + RTC_TRACE(); } } +// MARK: - +// MARK: PMHaltWorker //****************************************************************************** // PMHaltWorker Class @@ -5138,9 +6268,9 @@ void PMHaltWorker::work( PMHaltWorker * me ) deltaTime = computeDeltaTimeMS(&startTime); if ((deltaTime > kPMHaltTimeoutMS) || timeout || - (gIOKitDebug & (kIOLogDebugPower | kIOLogPMRootDomain))) + (gIOKitDebug & kIOLogPMRootDomain)) { - KLOG("%s driver %s (%p) took %u ms\n", + LOG("%s driver %s (%p) took %u ms\n", (gPMHaltEvent == kIOMessageSystemWillPowerOff) ? "PowerOff" : "Restart", service->getName(), service, @@ -5173,7 +6303,7 @@ void PMHaltWorker::checkTimeout( PMHaltWorker * me, AbsoluteTime * now ) if (nano > 3000000000ULL) { me->timeout = true; - LOG("%s still waiting on %s\n", + MSG("%s still waiting on %s\n", (gPMHaltEvent == kIOMessageSystemWillPowerOff) ? "PowerOff" : "Restart", me->service->getName()); @@ -5406,18 +6536,90 @@ notifySystemShutdown( IOService * root, unsigned long event ) } IOLockUnlock(gPMHaltLock); - // Release all workers + // Release all workers + + for (unsigned int i = 0; i < numWorkers; i++) + { + if (workers[i]) + workers[i]->release(); + // worker also retained by its own thread + } + +done: + DLOG("%s done\n", __FUNCTION__); + return; +} + +//********************************************************************************* +// Sleep/Wake logging + // +//********************************************************************************* + +IOMemoryDescriptor *IOPMrootDomain::getPMTraceMemoryDescriptor(void) +{ + if (timeline) + return timeline->getPMTraceMemoryDescriptor(); + else + return NULL; +} + +// Forwards external reports of detailed events to IOPMTimeline +IOReturn IOPMrootDomain::recordPMEvent(PMEventDetails *details) +{ + if (timeline && details) { + + IOReturn rc; + + // Record a detailed driver power change event, or...
+ if(details->eventClassifier == kIOPMEventClassDriverEvent) { + rc = timeline->recordDetailedPowerEvent( details ); + } + + // Record a system power management event + else if(details->eventClassifier == kIOPMEventClassSystemEvent) { + rc = timeline->recordSystemPowerEvent( details ); + } + else { + return kIOReturnBadArgument; + } + + // If we get to record this message, then we've reached the + // end of another successful Sleep --> Wake cycle. + // At this point, we pat ourselves on the back and allow + // our Sleep --> Wake UUID to be published + if(details->eventType == kIOPMEventTypeWakeDone) { + timeline->setSleepCycleInProgressFlag(false); + } + +/* + // Check if it's time to clear the timeline buffer + if(getProperty(kIOPMSleepWakeUUIDKey) + && timeline->isSleepCycleInProgress() == false + && timeline->getNumEventsLoggedThisPeriod() > 500) { + + // Clear the old UUID + if(pmPowerStateQueue) { + pmPowerStateQueue->submitPowerEvent(kPowerEventPublishSleepWakeUUID, (void *)false ); + } + } +*/ + return rc; + } + else + return kIOReturnNotReady; +} + +IOReturn IOPMrootDomain::recordAndReleasePMEvent(PMEventDetails *details) +{ + IOReturn ret = kIOReturnBadArgument; - for (unsigned int i = 0; i < numWorkers; i++) - { - if (workers[i]) - workers[i]->release(); - // worker also retained by it's own thread - } + if (details) + { + ret = recordPMEvent(details); + details->release(); + } -done: - DLOG("%s done\n", __FUNCTION__); - return; + return ret; } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ @@ -5493,16 +6695,30 @@ bool IOPMrootDomain::serializeProperties( OSSerialize * s ) const /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ +// MARK: - +// MARK: PMSettingHandle -#undef super -#define super OSObject -OSDefineMetaClassAndFinalStructors(PMSettingObject, OSObject) +OSDefineMetaClassAndStructors( PMSettingHandle, OSObject ) -void PMSettingObject::setPMSetting(const OSSymbol *type, OSObject *obj) +void PMSettingHandle::free( void ) { - (*func)(target, type, obj, refcon); + if (pmso) + { + pmso->clientHandleFreed(); + pmso->release(); + pmso = 0; + } + + OSObject::free(); } +// MARK: - +// MARK: PMSettingObject + +#undef super +#define super OSObject +OSDefineMetaClassAndFinalStructors( PMSettingObject, OSObject ) + /* * Static constructor/initializer for PMSettingObject */ @@ -5512,92 +6728,445 @@ PMSettingObject *PMSettingObject::pmSettingObject( OSObject *target_arg, uintptr_t refcon_arg, uint32_t supportedPowerSources, - const OSSymbol * settings[]) + const OSSymbol * settings[], + OSObject **handle_obj) { - uint32_t objCount = 0; - PMSettingObject *pmso; + uint32_t settingCount = 0; + PMSettingObject *pmso = 0; + PMSettingHandle *pmsh = 0; - if( !parent_arg || !handler_arg || !settings ) return NULL; + if ( !parent_arg || !handler_arg || !settings || !handle_obj ) + return NULL; - // count OSSymbol entries in NULL terminated settings array - while( settings[objCount] ) { - objCount++; + // count OSSymbol entries in NULL terminated settings array + while (settings[settingCount]) { + settingCount++; } - if(0 == objCount) return NULL; + if (0 == settingCount) + return NULL; pmso = new PMSettingObject; - if(!pmso || !pmso->init()) return NULL; - - pmso->parent = parent_arg; - pmso->func = handler_arg; - pmso->target = target_arg; - pmso->refcon = refcon_arg; - pmso->releaseAtCount = objCount + 1; // release when it has count+1 retains - - pmso->publishedFeatureID = (uint32_t *)IOMalloc(sizeof(uint32_t)*objCount); -
if(pmso->publishedFeatureID) { - for(unsigned int i=0; i<objCount; i++) { + if (!pmso || !pmso->init()) + goto fail; + + pmsh = new PMSettingHandle; + if (!pmsh || !pmsh->init()) + goto fail; + + queue_init(&pmso->calloutQueue); + pmso->parent = parent_arg; + pmso->func = handler_arg; + pmso->target = target_arg; + pmso->refcon = refcon_arg; + pmso->settingCount = settingCount; + + pmso->retain(); // handle holds a retain on pmso + pmsh->pmso = pmso; + pmso->pmsh = pmsh; + + pmso->publishedFeatureID = (uint32_t *)IOMalloc(sizeof(uint32_t)*settingCount); + if (pmso->publishedFeatureID) { + for (unsigned int i=0; i<settingCount; i++) { // Since there is now at least one listener to this setting, publish // PM root domain support for it. - parent_arg->publishFeature( settings[i]->getCStringNoCopy(), + parent_arg->publishFeature( settings[i]->getCStringNoCopy(), supportedPowerSources, &pmso->publishedFeatureID[i] ); } } - + + *handle_obj = pmsh; return pmso; + +fail: + if (pmso) pmso->release(); + if (pmsh) pmsh->release(); + return NULL; } -void PMSettingObject::free(void) +void PMSettingObject::free( void ) { - OSCollectionIterator *settings_iter; - OSSymbol *sym; - OSArray *arr; - int arr_idx; - int i; - int objCount = releaseAtCount - 1; - - if(publishedFeatureID) { - for(i=0; i<objCount; i++) { - if(0 != publishedFeatureID[i]) { + if (publishedFeatureID) { + for (uint32_t i=0; i<settingCount; i++) { + if (publishedFeatureID[i]) { parent->removePublishedFeature( publishedFeatureID[i] ); } } + + IOFree(publishedFeatureID, sizeof(uint32_t) * settingCount); + } + + super::free(); +} + +void PMSettingObject::dispatchPMSetting( const OSSymbol * type, OSObject * object ) +{ + (*func)(target, type, object, refcon); +} + +void PMSettingObject::clientHandleFreed( void ) +{ + parent->deregisterPMSettingObject(this); +} + +// MARK: - +// MARK: IOPMTimeline + +#undef super +#define super OSObject + +//********************************************************************************* +//********************************************************************************* +//********************************************************************************* + +IOPMTimeline *IOPMTimeline::timeline(IOPMrootDomain *root_domain) +{ + IOPMTimeline *myself; + + if (!root_domain) + return NULL; - IOFree(publishedFeatureID, sizeof(uint32_t) * objCount); + myself = new IOPMTimeline; + + if (myself) { + myself->owner = root_domain; + myself->init(); } - - IORecursiveLockLock(parent->settingsCtrlLock); + + return myself; +} + +bool IOPMTimeline::init(void) +{ + if (!super::init()) { + return false; + } + + logLock = IOLockAlloc(); - // Search each PM settings array in the kernel. 
- settings_iter = OSCollectionIterator::withCollection(parent->settingsCallbacks); - if(settings_iter) + // Fresh timeline, no events logged yet + this->numEventsLoggedThisPeriod = 0; + this->sleepCycleInProgress = false; + + //this->setEventsRecordingLevel(1); // TODO + this->setEventsTrackedCount(kIOPMDefaultSystemEventsTracked); + + return true; +} + +void IOPMTimeline::free(void) +{ + if (pmTraceMemoryDescriptor) { + pmTraceMemoryDescriptor->release(); + pmTraceMemoryDescriptor = NULL; + } + + IOLockFree(logLock); + + super::free(); +} + +IOMemoryDescriptor *IOPMTimeline::getPMTraceMemoryDescriptor() +{ + return pmTraceMemoryDescriptor; +} + +//********************************************************************************* +//********************************************************************************* +//********************************************************************************* + +bool IOPMTimeline::setProperties(OSDictionary *d) +{ + OSNumber *n = NULL; + OSBoolean *b = NULL; + bool changed = false; + + /* Changes size of detailed events buffer */ + n = (OSNumber *)d->getObject(kIOPMTimelineSystemNumberTrackedKey); + if (OSDynamicCast(OSNumber, n)) { - while(( sym = OSDynamicCast(OSSymbol, settings_iter->getNextObject()) )) - { - arr = (OSArray *)parent->settingsCallbacks->getObject(sym); - arr_idx = arr->getNextIndexOfObject(this, 0); - if(-1 != arr_idx) { - // 'this' was found in the array; remove it - arr->removeObject(arr_idx); - } - } + changed = true; + this->setEventsTrackedCount(n->unsigned32BitValue()); + } + + + /* enables or disables system events */ + b = (OSBoolean *)d->getObject(kIOPMTimelineEnabledKey); + if (b) + { + changed = true; + this->setEventsRecordingLevel((int)(kOSBooleanTrue == b)); + } + + return changed; +} + +//********************************************************************************* +//********************************************************************************* +//********************************************************************************* + +OSDictionary *IOPMTimeline::copyInfoDictionary(void) +{ + OSDictionary *out = OSDictionary::withCapacity(3); + OSNumber *n = NULL; + + if (!out || !hdr) + return NULL; + + n = OSNumber::withNumber(hdr->sizeEntries, 32); + out->setObject(kIOPMTimelineSystemNumberTrackedKey, n); + n->release(); + + n = OSNumber::withNumber(hdr->sizeBytes, 32); + out->setObject(kIOPMTimelineSystemBufferSizeKey, n); + n->release(); + + // bool + out->setObject(kIOPMTimelineEnabledKey, eventsRecordingLevel ? kOSBooleanTrue : kOSBooleanFalse); + + return out; +} + +//********************************************************************************* +//********************************************************************************* +//********************************************************************************* + +/* IOPMTimeline::recordSystemPowerEvent() + * + * Expected "type" arguments are listed in IOPMPrivate.h under enum "SystemEventTypes" + * Type arguments include "system events", and "Intermediate events" + * + * - System Events have paired "start" and "stop" events. + * - A start event shall be followed by a stop event. + * - Any number of Intermediate Events may fall between the + * start and stop events. + * - Intermediate events are meaningless outside the bounds of a system event's + * start & stop events. + * - It's invalid to record a Start event without a following Stop event; e.g. two + * Start events without an intervening Stop event is invalid.
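+ * + * Illustrative ordering only (a hedged sketch assuming an already populated + * PMEventDetails object; both event types appear elsewhere in this file): + * + * details->eventType = kIOPMEventTypeWake; // Start event + * timeline->recordSystemPowerEvent(details); + * // ... any number of intermediate events ... + * details->eventType = kIOPMEventTypeWakeDone; // paired Stop event + * timeline->recordSystemPowerEvent(details);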
+ * + * Buffer invariants + * - The first recorded system event shall be preceded by an entry with type == 0 + * - IOPMTimeline may choose not to record intermediate events while there's no + * system event in progress. + */ +IOReturn IOPMTimeline::recordSystemPowerEvent( PMEventDetails *details ) +{ + static bool wakeDonePending = true; + IOPMSystemEventRecord *record_to = NULL; + OSString *swUUIDKey = NULL; + uint32_t useIndex = 0; + + if (!details) + return kIOReturnBadArgument; + + if (!traceBuffer) + return kIOReturnNotReady; + + if (details->eventType == kIOPMEventTypeWakeDone) + { + if(!wakeDonePending) + return kIOReturnBadArgument; + } + + IOLockLock(logLock); - settings_iter->release(); + if (details->eventType == kIOPMEventTypeWake) { + wakeDonePending = true; + } else if (details->eventType == kIOPMEventTypeWakeDone) { + wakeDonePending = false; } + + systemState = details->eventType; + + useIndex = _atomicIndexIncrement(&hdr->index, hdr->sizeEntries); - IORecursiveLockUnlock(parent->settingsCtrlLock); + // The entry immediately after the latest entry (and thus + // immediately before the first entry) shall have a type 0; + // wrap to slot 0 when the latest entry occupies the last slot. + if (useIndex + 1 >= hdr->sizeEntries) { + traceBuffer[0].eventType = 0; + } else { + traceBuffer[useIndex + 1].eventType = 0; + } - super::free(); + record_to = &traceBuffer[useIndex]; + bzero(record_to, sizeof(IOPMSystemEventRecord)); + + /*****/ + record_to->eventType = details->eventType; + record_to->eventReason = details->reason; + record_to->eventResult = details->result; + pmEventTimeStamp(&record_to->timestamp); + + // If caller doesn't provide a UUID, we'll use the UUID that's posted + // on IOPMrootDomain under key kIOPMSleepWakeUUIDKey + if (!details->uuid) { + swUUIDKey = OSDynamicCast(OSString, owner->copyProperty(kIOPMSleepWakeUUIDKey)); + + if (swUUIDKey) + details->uuid = swUUIDKey->getCStringNoCopy(); + } + + if (details->uuid) + strncpy(record_to->uuid, details->uuid, kMaxPMStringLength); + + if (swUUIDKey) + swUUIDKey->release(); + + numEventsLoggedThisPeriod++; + /*****/ + + IOLockUnlock(logLock); + + return kIOReturnSuccess; + +} + +//********************************************************************************* +//********************************************************************************* +//********************************************************************************* + +IOReturn IOPMTimeline::recordDetailedPowerEvent( PMEventDetails *details ) +{ + IOPMSystemEventRecord *record_to = NULL; + uint32_t useIndex; + + if (!details->eventType || !details->ownerName) + return kIOReturnBadArgument; + + IOLockLock(logLock); + + useIndex = _atomicIndexIncrement(&hdr->index, hdr->sizeEntries); + + record_to = (IOPMSystemEventRecord *)&traceBuffer[useIndex]; + bzero(record_to, sizeof(IOPMSystemEventRecord)); + + /*****/ + record_to->eventType = details->eventType; + if (details->ownerName && (strlen(details->ownerName) > 1)) { + strlcpy( record_to->ownerName, + details->ownerName, + sizeof(record_to->ownerName)); + } + + record_to->ownerDisambiguateID = details->ownerUnique; + + if (details->interestName && (strlen(details->interestName) > 1)) { + strlcpy(record_to->interestName, + details->interestName, + sizeof(record_to->interestName)); + } + + record_to->oldState = details->oldState; + record_to->newState = details->newState; + record_to->eventResult = details->result; + record_to->elapsedTimeUS = details->elapsedTimeUS; + pmEventTimeStamp(&record_to->timestamp); + + numEventsLoggedThisPeriod++; + /*****/ + +
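+ // Detailed record fully populated; release the log lock before returning.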
IOLockUnlock(logLock); + return kIOReturnSuccess; +} + +uint32_t IOPMTimeline::getNumEventsLoggedThisPeriod() { + return this->numEventsLoggedThisPeriod; +} + +void IOPMTimeline::setNumEventsLoggedThisPeriod(uint32_t newCount) { + this->numEventsLoggedThisPeriod = newCount; +} + +bool IOPMTimeline::isSleepCycleInProgress() { + return this->sleepCycleInProgress; +} + +void IOPMTimeline::setSleepCycleInProgressFlag(bool flag) { + this->sleepCycleInProgress = flag; +} +//********************************************************************************* +//********************************************************************************* +//********************************************************************************* + +void IOPMTimeline::setEventsTrackedCount(uint32_t newTracked) +{ + size_t make_buf_size = 0; + + make_buf_size = sizeof(IOPMTraceBufferHeader) + (newTracked * sizeof(IOPMSystemEventRecord)); + + IOLockLock(logLock); + + if (pmTraceMemoryDescriptor) { + pmTraceMemoryDescriptor->release(); + pmTraceMemoryDescriptor = NULL; + } + + hdr = NULL; + traceBuffer = NULL; + + if (0 == newTracked) + { + IOLog("IOPMrootDomain -> erased buffer.\n"); + goto exit; + } + + pmTraceMemoryDescriptor = IOBufferMemoryDescriptor::withOptions( + kIOMemoryKernelUserShared | kIODirectionIn, make_buf_size); + + if (!pmTraceMemoryDescriptor) + { + IOLog("IOPMRootDomain -> IOBufferMemoryDescriptor(%d) returns NULL\n", (int)make_buf_size); + goto exit; + } + + pmTraceMemoryDescriptor->prepare(kIODirectionIn); + + // Header occupies the first sizeof(IOPMTraceBufferHeader) bytes + hdr = (IOPMTraceBufferHeader *)pmTraceMemoryDescriptor->getBytesNoCopy(); + + // Recorded events occupy the remaining bulk of the buffer + traceBuffer = (IOPMSystemEventRecord *)((uint8_t *)hdr + sizeof(IOPMTraceBufferHeader)); + + bzero(hdr, make_buf_size); + + hdr->sizeBytes = make_buf_size; + hdr->sizeEntries = newTracked; + + IOLog("IOPMRootDomain -> IOBufferMemoryDescriptor(%d) returns bufferMB with address 0x%08x\n", (int)make_buf_size, (unsigned int)(uintptr_t)traceBuffer); + +exit: + IOLockUnlock(logLock); +} + +//********************************************************************************* +//********************************************************************************* +//********************************************************************************* + +void IOPMTimeline::setEventsRecordingLevel(uint32_t eventsTrackedBits) +{ + + // TODO + + return; + } -void PMSettingObject::taggedRelease(const void *tag, const int when) const -{ - // We have n+1 retains - 1 per array that this PMSettingObject is a member - // of, and 1 retain to ourself. 
When we get a release with n+1 retains - // remaining, we go ahead and free ourselves, cleaning up array pointers - // in free(); +/* static helper to IOPMTimeline + */ +uint32_t IOPMTimeline::_atomicIndexIncrement(uint32_t *index, uint32_t limit) +{ + uint32_t was_index; + uint32_t inc_index; + + if(!index) + return NULL; + + do { + was_index = *index; + inc_index = (was_index+1)%limit; + } while (!OSCompareAndSwap(was_index, inc_index, index)); - super::taggedRelease(tag, releaseAtCount); + return inc_index; } // MARK: - @@ -5676,8 +7245,19 @@ void PMAssertionsTracker::tabulate(void) if ((assertionsKernel != oldKernel) || (assertionsCombined != oldCombined)) - { - owner->messageClients(kIOPMMessageDriverAssertionsChanged); + { + owner->messageClients(kIOPMMessageDriverAssertionsChanged); + + if (((assertionsCombined & kIOPMDriverAssertionPreventDisplaySleepBit) != 0) + && ((oldCombined & kIOPMDriverAssertionPreventDisplaySleepBit) == 0)) + { + /* We react to a new PreventDisplaySleep assertion by waking the display + * with an activityTickle + */ + owner->evaluatePolicy(kStimulusDarkWakeActivityTickle); + } else { + owner->evaluatePolicy(kStimulusDarkWakeEvaluate); + } } } @@ -5780,18 +7360,14 @@ IOReturn PMAssertionsTracker::createAssertion( PMAssertStruct track; // Warning: trillions and trillions of created assertions may overflow the unique ID. -#ifdef __ppc__ - track.id = issuingUniqueID++; // FIXME: need OSIncrementAtomic64() for ppc -#else track.id = OSIncrementAtomic64((SInt64*) &issuingUniqueID); -#endif track.level = level; track.assertionBits = which; track.ownerString = whoItIs ? OSSymbol::withCString(whoItIs) : 0; track.ownerService = serviceID; track.modifiedTime = 0; pmEventTimeStamp(&track.createdTime); - + dataStore = OSData::withBytes(&track, sizeof(PMAssertStruct)); if (!dataStore) { @@ -6010,6 +7586,7 @@ IOPMDriverAssertionLevel PMAssertionsTracker::getAssertionLevel( //********************************************************************************* //********************************************************************************* + static void pmEventTimeStamp(uint64_t *recordTS) { clock_sec_t tsec; @@ -6031,37 +7608,38 @@ static void pmEventTimeStamp(uint64_t *recordTS) return; } -/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ +// MARK: - +// MARK: IORootParent -#undef super -#define super IOService +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ OSDefineMetaClassAndFinalStructors(IORootParent, IOService) -// This array exactly parallels the state array for the root domain. -// Power state changes initiated by a device can be vetoed by a client of the device, and -// power state changes initiated by the parent of a device cannot be vetoed by a client of the device, -// so when the root domain wants a power state change that cannot be vetoed (e.g. demand sleep), it asks -// its parent to make the change. That is the reason for this complexity. +// The reason that root domain needs a root parent is to facilitate demand +// sleep, since a power change from the root parent cannot be vetoed. +// +// The above statement is no longer true since root domain now performs +// demand sleep using overrides. But root parent remains to avoid changing +// the power tree stacking. Root parent is parked at the max power state. 
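+// Both entries in the replacement table below map to ON_POWER, so the root +// parent never initiates a real power transition.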
+ -static IOPMPowerState patriarchPowerStates[NUM_POWER_STATES] = +static IOPMPowerState patriarchPowerStates[2] = { - {1,0,0,0,0,0,0,0,0,0,0,0}, // off (not used) - {1,0,RESTART_POWER,0,0,0,0,0,0,0,0,0}, // reset (not used) - {1,0,SLEEP_POWER,0,0,0,0,0,0,0,0,0}, // sleep - {1,0,DOZE_POWER,0,0,0,0,0,0,0,0,0}, // doze - {1,0,ON_POWER,0,0,0,0,0,0,0,0,0}, // running + {1,0,ON_POWER,0,0,0,0,0,0,0,0,0}, + {1,0,ON_POWER,0,0,0,0,0,0,0,0,0}, }; +void IORootParent::initialize( void ) +{ +} + bool IORootParent::start( IOService * nub ) { - mostRecentChange = ON_STATE; - super::start(nub); + IOService::start(nub); attachToParent( getRegistryRoot(), gIOPowerPlane ); PMinit(); - registerPowerDriver(this, patriarchPowerStates, NUM_POWER_STATES); - wakeSystem(); - powerOverrideOnPriv(); + registerPowerDriver(this, patriarchPowerStates, 2); + makeUsable(); return true; } @@ -6075,30 +7653,22 @@ void IORootParent::restartSystem( void ) void IORootParent::sleepSystem( void ) { - mostRecentChange = SLEEP_STATE; - changePowerStateToPriv(SLEEP_STATE); } void IORootParent::dozeSystem( void ) { - mostRecentChange = DOZE_STATE; - changePowerStateToPriv(DOZE_STATE); } -// Called in demand sleep when sleep discovered to be impossible after actually attaining that state. -// This brings the parent to doze, which allows the root to step up from sleep to doze. - -// In idle sleep, do nothing because the parent is still on and the root can freely change state. - void IORootParent::sleepToDoze( void ) { - if ( mostRecentChange == SLEEP_STATE ) { - changePowerStateToPriv(DOZE_STATE); - } } void IORootParent::wakeSystem( void ) { - mostRecentChange = ON_STATE; - changePowerStateToPriv(ON_STATE); } + +OSObject * IORootParent::copyProperty( const char * aKey) const +{ + return (IOService::copyProperty(aKey)); +} + diff --git a/iokit/Kernel/IOPlatformExpert.cpp b/iokit/Kernel/IOPlatformExpert.cpp index f00ffd725..7800babda 100644 --- a/iokit/Kernel/IOPlatformExpert.cpp +++ b/iokit/Kernel/IOPlatformExpert.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2008 Apple Inc. All rights reserved. + * Copyright (c) 1998-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -44,6 +44,7 @@ #include <libkern/c++/OSContainers.h> #include <libkern/crypto/sha1.h> +#include <libkern/OSAtomic.h> extern "C" { #include <machine/machine_routines.h> @@ -77,6 +78,7 @@ OSMetaClassDefineReservedUnused(IOPlatformExpert, 11); static IOPlatformExpert * gIOPlatform; static OSDictionary * gIOInterruptControllers; static IOLock * gIOInterruptControllersLock; +static IODTNVRAM *gIOOptionsEntry; OSSymbol * gPlatformInterruptControllerName; @@ -258,7 +260,7 @@ int IOPlatformExpert::haltRestart(unsigned int type) // On ARM kPEPanicRestartCPU is supported in the drivers if (type == kPEPanicRestartCPU) type = kPERestartCPU; - + if (PE_halt_restart) return (*PE_halt_restart)(type); else return -1; } @@ -371,6 +373,8 @@ PMLog(const char *who, unsigned long event, { UInt32 debugFlags = gIOKitDebug; UInt32 traceFlags = gIOKitTrace; + uintptr_t name = 0; + UInt32 i = 0; if (debugFlags & kIOLogPower) { @@ -402,8 +406,11 @@ PMLog(const char *who, unsigned long event, code |= DBG_FUNC_START - sgnevent; } - // Record the timestamp, wish I had a this pointer - IOTimeStampConstant(code, (uintptr_t) who, event, param1, param2); + // Get first 8 characters of the name + while ( i < sizeof(uintptr_t) && who[i] != 0) + { ((char *)&name)[sizeof(uintptr_t)-i-1]=who[i]; i++; } + // Record the timestamp. 
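+ // (Illustrative: with who = "IODisplayWrangler" on a 64-bit little-endian + // kernel, name holds 'I','O','D','i','s','p','l','a' from most to least + // significant byte, so the leading characters of the caller's name travel + // inline in the tracepoint.)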
+ IOTimeStampConstant(code, name, event, param1, param2); } } } @@ -779,12 +786,13 @@ int PEGetPlatformEpoch(void) int PEHaltRestart(unsigned int type) { - IOPMrootDomain *pmRootDomain = IOService::getPMRootDomain(); + IOPMrootDomain *pmRootDomain; AbsoluteTime deadline; thread_call_t shutdown_hang; if(type == kPEHaltCPU || type == kPERestartCPU || type == kPEUPSDelayHaltCPU) { + pmRootDomain = IOService::getPMRootDomain(); /* Notify IOKit PM clients of shutdown/restart Clients subscribe to this message with a call to IOService::registerInterest() @@ -820,6 +828,115 @@ UInt32 PESavePanicInfo(UInt8 *buffer, UInt32 length) else return 0; } + + +inline static int init_gIOOptionsEntry(void) +{ + IORegistryEntry *entry; + void *nvram_entry; + volatile void **options; + int ret = -1; + + if (gIOOptionsEntry) + return 0; + + entry = IORegistryEntry::fromPath( "/options", gIODTPlane ); + if (!entry) + return -1; + + nvram_entry = (void *) OSDynamicCast(IODTNVRAM, entry); + if (!nvram_entry) + goto release; + + options = (volatile void **) &gIOOptionsEntry; + if (!OSCompareAndSwapPtr(NULL, nvram_entry, options)) { + ret = 0; + goto release; + } + + return 0; + +release: + entry->release(); + return ret; + +} + +/* pass in a NULL value if you just want to figure out the len */ +boolean_t PEReadNVRAMProperty(const char *symbol, void *value, + unsigned int *len) +{ + OSObject *obj; + OSData *data; + unsigned int vlen; + + if (!symbol || !len) + goto err; + + if (init_gIOOptionsEntry() < 0) + goto err; + + vlen = *len; + *len = 0; + + obj = gIOOptionsEntry->getProperty(symbol); + if (!obj) + goto err; + + /* convert to data */ + data = OSDynamicCast(OSData, obj); + if (!data) + goto err; + + *len = data->getLength(); + vlen = min(vlen, *len); + if (vlen) + memcpy((void *) value, data->getBytesNoCopy(), vlen); + + return TRUE; + +err: + return FALSE; +} + + +boolean_t PEWriteNVRAMProperty(const char *symbol, const void *value, + const unsigned int len) +{ + const OSSymbol *sym; + OSData *data; + bool ret = false; + + if (!symbol || !value || !len) + goto err; + + if (init_gIOOptionsEntry() < 0) + goto err; + + sym = OSSymbol::withCStringNoCopy(symbol); + if (!sym) + goto err; + + data = OSData::withBytes((void *) value, len); + if (!data) + goto sym_done; + + ret = gIOOptionsEntry->setProperty(sym, data); + data->release(); + +sym_done: + sym->release(); + + if (ret == true) { + gIOOptionsEntry->sync(); + return TRUE; + } + +err: + return FALSE; +} + + long PEGetGMTTimeOfDay(void) { long result = 0; diff --git a/iokit/Kernel/IORegistryEntry.cpp b/iokit/Kernel/IORegistryEntry.cpp index e41a94bdc..a299d3fa1 100644 --- a/iokit/Kernel/IORegistryEntry.cpp +++ b/iokit/Kernel/IORegistryEntry.cpp @@ -278,7 +278,12 @@ bool IORegistryEntry::init( OSDictionary * dict ) bzero(reserved, sizeof(ExpansionData)); } if( dict) { - dict->retain(); + if (OSCollection::kImmutable & dict->setOptions(0, 0)) { + dict = (OSDictionary *) dict->copyCollection(); + if (!dict) + return (false); + } else + dict->retain(); if( fPropertyTable) fPropertyTable->release(); fPropertyTable = dict; diff --git a/iokit/Kernel/IOService.cpp b/iokit/Kernel/IOService.cpp index 1a28626cf..6ef0b3413 100644 --- a/iokit/Kernel/IOService.cpp +++ b/iokit/Kernel/IOService.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2008 Apple Inc. All rights reserved. + * Copyright (c) 1998-2009 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -50,10 +50,13 @@ #include <IOKit/assert.h> #include <sys/errno.h> +#include <machine/pal_routines.h> + #define LOG kprintf //#define LOG IOLog #include "IOServicePrivate.h" +#include "IOKitKernelInternal.h" // take lockForArbitration before LOCKNOTIFY @@ -106,11 +109,16 @@ const OSSymbol * gIOKitDebugKey; const OSSymbol * gIOCommandPoolSizeKey; +const OSSymbol * gIOConsoleLockedKey; const OSSymbol * gIOConsoleUsersKey; const OSSymbol * gIOConsoleSessionUIDKey; +const OSSymbol * gIOConsoleSessionAuditIDKey; const OSSymbol * gIOConsoleUsersSeedKey; -const OSSymbol * gIOConsoleSessionOnConsoleKey; -const OSSymbol * gIOConsoleSessionSecureInputPIDKey; +const OSSymbol * gIOConsoleSessionOnConsoleKey; +const OSSymbol * gIOConsoleSessionSecureInputPIDKey; +const OSSymbol * gIOConsoleSessionScreenLockedTimeKey; + +static clock_sec_t gIOConsoleLockTime; static int gIOResourceGenerationCount; @@ -125,6 +133,7 @@ const OSSymbol * gIOGeneralInterest; const OSSymbol * gIOBusyInterest; const OSSymbol * gIOAppPowerStateInterest; const OSSymbol * gIOPriorityPowerStateInterest; +const OSSymbol * gIOConsoleSecurityInterest; static OSDictionary * gNotifications; static IORecursiveLock * gNotificationLock; @@ -159,6 +168,9 @@ const OSSymbol * gIOPlatformActiveActionKey; const OSSymbol * gIOPlatformFunctionHandlerSet; +static IOLock * gIOConsoleUsersLock; +static thread_call_t gIOConsoleLockCallout; + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #define LOCKREADNOTIFY() \ @@ -213,14 +225,6 @@ bool IOService::isInactive( void ) const { return( 0 != (kIOServiceInactiveState & getState())); } -/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ - -#define IOServiceTrace(csc, a, b, c, d) { \ - if(kIOTraceIOService & gIOKitTrace) { \ - KERNEL_DEBUG_CONSTANT(IODBG_IOSERVICE(csc), a, b, c, d, 0); \ - } \ -} - /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #if defined(__i386__) || defined(__x86_64__) @@ -296,6 +300,7 @@ void IOService::initialize( void ) gIOBusyInterest = OSSymbol::withCStringNoCopy( kIOBusyInterest ); gIOAppPowerStateInterest = OSSymbol::withCStringNoCopy( kIOAppPowerStateInterest ); gIOPriorityPowerStateInterest = OSSymbol::withCStringNoCopy( kIOPriorityPowerStateInterest ); + gIOConsoleSecurityInterest = OSSymbol::withCStringNoCopy( kIOConsoleSecurityInterest ); gNotifications = OSDictionary::withCapacity( 1 ); gIOPublishNotification = OSSymbol::withCStringNoCopy( @@ -310,13 +315,18 @@ void IOService::initialize( void ) kIOTerminatedNotification ); gIOServiceKey = OSSymbol::withCStringNoCopy( kIOServiceClass); + gIOConsoleLockedKey = OSSymbol::withCStringNoCopy( kIOConsoleLockedKey); gIOConsoleUsersKey = OSSymbol::withCStringNoCopy( kIOConsoleUsersKey); gIOConsoleSessionUIDKey = OSSymbol::withCStringNoCopy( kIOConsoleSessionUIDKey); - gIOConsoleUsersSeedKey = OSSymbol::withCStringNoCopy( kIOConsoleUsersSeedKey); - gIOConsoleSessionOnConsoleKey = OSSymbol::withCStringNoCopy( kIOConsoleSessionOnConsoleKey); - gIOConsoleSessionSecureInputPIDKey = OSSymbol::withCStringNoCopy( kIOConsoleSessionSecureInputPIDKey); - gIOConsoleUsersSeedValue = OSData::withBytesNoCopy(&gIOConsoleUsersSeed, sizeof(gIOConsoleUsersSeed)); + gIOConsoleSessionAuditIDKey = OSSymbol::withCStringNoCopy( kIOConsoleSessionAuditIDKey); + + gIOConsoleUsersSeedKey = OSSymbol::withCStringNoCopy(kIOConsoleUsersSeedKey); + gIOConsoleSessionOnConsoleKey = 
OSSymbol::withCStringNoCopy(kIOConsoleSessionOnConsoleKey); + gIOConsoleSessionSecureInputPIDKey = OSSymbol::withCStringNoCopy(kIOConsoleSessionSecureInputPIDKey); + gIOConsoleSessionScreenLockedTimeKey = OSSymbol::withCStringNoCopy(kIOConsoleSessionScreenLockedTimeKey); + gIOConsoleUsersSeedValue = OSData::withBytesNoCopy(&gIOConsoleUsersSeed, sizeof(gIOConsoleUsersSeed)); + gIOPlatformSleepActionKey = OSSymbol::withCStringNoCopy(kIOPlatformSleepActionKey); gIOPlatformWakeActionKey = OSSymbol::withCStringNoCopy(kIOPlatformWakeActionKey); gIOPlatformQuiesceActionKey = OSSymbol::withCStringNoCopy(kIOPlatformQuiesceActionKey); @@ -345,9 +355,14 @@ void IOService::initialize( void ) gIOServiceBusyLock = IOLockAlloc(); + gIOConsoleUsersLock = IOLockAlloc(); + err = semaphore_create(kernel_task, &gJobsSemaphore, SYNC_POLICY_FIFO, 0); - assert( gIOServiceBusyLock && gJobs && gJobsLock && (err == KERN_SUCCESS) ); + gIOConsoleLockCallout = thread_call_allocate(&IOService::consoleLockTimer, NULL); + + assert( gIOServiceBusyLock && gJobs && gJobsLock && gIOConsoleUsersLock + && gIOConsoleLockCallout && (err == KERN_SUCCESS) ); gIOResources = IOResources::resources(); assert( gIOResources ); @@ -578,7 +593,6 @@ void IOService::startMatching( IOOptionBits options ) // OSKernelStackRemaining(), getName()); if( needConfig) { - prevBusy = _adjustBusy( 1 ); needWake = (0 != (kIOServiceSyncPubState & __state[1])); } @@ -591,6 +605,8 @@ void IOService::startMatching( IOOptionBits options ) if( needConfig) { + prevBusy = _adjustBusy( 1 ); + if( needWake) { IOLockLock( gIOServiceBusyLock ); thread_wakeup( (event_t) this/*&__state[1]*/ ); @@ -1470,6 +1486,7 @@ IONotifier * IOService::registerInterest( const OSSymbol * typeOfInterest, if( (typeOfInterest != gIOGeneralInterest) && (typeOfInterest != gIOBusyInterest) && (typeOfInterest != gIOAppPowerStateInterest) + && (typeOfInterest != gIOConsoleSecurityInterest) && (typeOfInterest != gIOPriorityPowerStateInterest)) return( 0 ); @@ -1541,6 +1558,7 @@ void IOService::unregisterAllInterest( void ) cleanInterestList( getProperty( gIOBusyInterest )); cleanInterestList( getProperty( gIOAppPowerStateInterest )); cleanInterestList( getProperty( gIOPriorityPowerStateInterest )); + cleanInterestList( getProperty( gIOConsoleSecurityInterest )); } /* @@ -1583,7 +1601,7 @@ void _IOServiceInterestNotifier::remove() LOCKWRITENOTIFY(); if( queue_next( &chain )) { - remqueue( 0, &chain); + remqueue(&chain); queue_next( &chain) = queue_prev( &chain) = 0; release(); } @@ -1631,7 +1649,7 @@ void _IOServiceInterestNotifier::enable( bool was ) #define tailQ(o) setObject(o) #define headQ(o) setObject(0, o) -#define TLOG(fmt, args...) { if(kIOLogYield & gIOKitDebug) IOLog(fmt, ## args); } +#define TLOG(fmt, args...) 
{ if(kIOLogYield & gIOKitDebug) { IOLog("[%llx] ", thread_tid(current_thread())); IOLog(fmt, ## args); }} static void _workLoopAction( IOWorkLoop::Action action, IOService * service, @@ -1667,13 +1685,15 @@ bool IOService::requestTerminate( IOService * provider, IOOptionBits options ) bool IOService::terminatePhase1( IOOptionBits options ) { - IOService * victim; - IOService * client; - OSIterator * iter; - OSArray * makeInactive; - bool ok; - bool didInactive; - bool startPhase2 = false; + IOService * victim; + IOService * client; + OSIterator * iter; + OSArray * makeInactive; + int waitResult = THREAD_AWAKENED; + bool wait; + bool ok; + bool didInactive; + bool startPhase2 = false; TLOG("%s::terminatePhase1(%08llx)\n", getName(), (long long)options); @@ -1701,16 +1721,38 @@ bool IOService::terminatePhase1( IOOptionBits options ) while( victim ) { - didInactive = victim->lockForArbitration( true ); + didInactive = victim->lockForArbitration( true ); if( didInactive) { didInactive = (0 == (victim->__state[0] & kIOServiceInactiveState)); if( didInactive) { victim->__state[0] |= kIOServiceInactiveState; victim->__state[0] &= ~(kIOServiceRegisteredState | kIOServiceMatchedState | kIOServiceFirstPublishState | kIOServiceFirstMatchState); + + if (victim == this) + victim->__state[1] |= kIOServiceTermPhase1State; + victim->_adjustBusy( 1 ); - } - victim->unlockForArbitration(); + + } else if (victim != this) do { + + IOLockLock(gIOServiceBusyLock); + wait = (victim->__state[1] & kIOServiceTermPhase1State); + if( wait) { + TLOG("%s::waitPhase1(%s)\n", getName(), victim->getName()); + victim->__state[1] |= kIOServiceTerm1WaiterState; + victim->unlockForArbitration(); + assert_wait((event_t)&victim->__state[1], THREAD_UNINT); + } + IOLockUnlock(gIOServiceBusyLock); + if( wait) { + waitResult = thread_block(THREAD_CONTINUE_NULL); + TLOG("%s::did waitPhase1(%s)\n", getName(), victim->getName()); + victim->lockForArbitration(); + } + } while( wait && (waitResult != THREAD_TIMED_OUT)); + + victim->unlockForArbitration(); } if( victim == this) startPhase2 = didInactive; @@ -1755,8 +1797,21 @@ bool IOService::terminatePhase1( IOOptionBits options ) makeInactive->release(); if( startPhase2) - scheduleTerminatePhase2( options ); + { + lockForArbitration(); + __state[1] &= ~kIOServiceTermPhase1State; + if (kIOServiceTerm1WaiterState & __state[1]) + { + __state[1] &= ~kIOServiceTerm1WaiterState; + TLOG("%s::wakePhase1\n", getName()); + IOLockLock( gIOServiceBusyLock ); + thread_wakeup( (event_t) &__state[1]); + IOLockUnlock( gIOServiceBusyLock ); + } + unlockForArbitration(); + scheduleTerminatePhase2( options ); + } return( true ); } @@ -1917,7 +1972,9 @@ bool IOService::didTerminate( IOService * provider, IOOptionBits options, bool * } void IOService::actionWillTerminate( IOService * victim, IOOptionBits options, - OSArray * doPhase2List ) + OSArray * doPhase2List, + void *unused2 __unused, + void *unused3 __unused ) { OSIterator * iter; IOService * client; @@ -1945,7 +2002,9 @@ void IOService::actionWillTerminate( IOService * victim, IOOptionBits options, } } -void IOService::actionDidTerminate( IOService * victim, IOOptionBits options ) +void IOService::actionDidTerminate( IOService * victim, IOOptionBits options, + void *unused1 __unused, void *unused2 __unused, + void *unused3 __unused ) { OSIterator * iter; IOService * client; @@ -1977,7 +2036,9 @@ void IOService::actionDidTerminate( IOService * victim, IOOptionBits options ) } } -void IOService::actionFinalize( IOService * victim, IOOptionBits 
options ) +void IOService::actionFinalize( IOService * victim, IOOptionBits options, + void *unused1 __unused, void *unused2 __unused, + void *unused3 __unused ) { TLOG("%s::finalize(%08llx)\n", victim->getName(), (long long)options); @@ -1991,7 +2052,9 @@ void IOService::actionFinalize( IOService * victim, IOOptionBits options ) victim->finalize( options ); } -void IOService::actionStop( IOService * provider, IOService * client ) +void IOService::actionStop( IOService * provider, IOService * client, + void *unused1 __unused, void *unused2 __unused, + void *unused3 __unused ) { TLOG("%s::stop(%s)\n", client->getName(), provider->getName()); @@ -3181,8 +3244,10 @@ UInt32 IOService::_adjustBusy( SInt32 delta ) &messageClientsApplier, &context ); #if !NO_KEXTD - if( nowQuiet && (next == gIOServiceRoot)) + if( nowQuiet && (next == gIOServiceRoot)) { OSKext::considerUnloads(); + IOServiceTrace(IOSERVICE_REGISTRY_QUIET, 0, 0, 0, 0); + } #endif } @@ -3386,7 +3451,7 @@ IOReturn IOService::waitMatchIdle( UInt32 msToWait ) bool wait; int waitResult = THREAD_AWAKENED; bool computeDeadline = true; - AbsoluteTime abstime; + AbsoluteTime deadline; IOLockLock( gJobsLock ); do { @@ -3394,14 +3459,12 @@ IOReturn IOService::waitMatchIdle( UInt32 msToWait ) if( wait) { if( msToWait) { if( computeDeadline ) { - clock_interval_to_absolutetime_interval( - msToWait, kMillisecondScale, &abstime ); - clock_absolutetime_interval_to_deadline( - abstime, &abstime ); + clock_interval_to_deadline( + msToWait, kMillisecondScale, &deadline ); computeDeadline = false; } waitResult = IOLockSleepDeadline( gJobsLock, &gNumConfigThreads, - abstime, THREAD_UNINT ); + deadline, THREAD_UNINT ); } else { waitResult = IOLockSleep( gJobsLock, &gNumConfigThreads, THREAD_UNINT ); @@ -4096,6 +4159,34 @@ IOService * IOResources::resources( void ) return( inst ); } +bool IOResources::init( OSDictionary * dictionary ) +{ + // Do super init first + if ( !super::init() ) + return false; + + // Allow PAL layer to publish a value + const char *property_name; + int property_value; + + pal_get_resource_property( &property_name, &property_value ); + + if( property_name ) { + OSNumber *num; + const OSSymbol * sym; + + if( (num = OSNumber::withNumber(property_value, 32)) != 0 ) { + if( (sym = OSSymbol::withCString( property_name)) != 0 ) { + this->setProperty( sym, num ); + sym->release(); + } + num->release(); + } + } + + return true; +} + IOWorkLoop * IOResources::getWorkLoop() const { // If we are the resource root @@ -4133,6 +4224,92 @@ bool IOResources::matchPropertyTable( OSDictionary * table ) return( ok ); } +void IOService::consoleLockTimer(thread_call_param_t p0, thread_call_param_t p1) +{ + IOService::updateConsoleUsers(NULL, 0); +} + +void IOService::updateConsoleUsers(OSArray * consoleUsers, IOMessage systemMessage) +{ + IORegistryEntry * regEntry; + OSObject * locked = kOSBooleanFalse; + uint32_t idx; + bool publish; + OSDictionary * user; + static IOMessage sSystemPower; + + regEntry = IORegistryEntry::getRegistryRoot(); + + IOLockLock(gIOConsoleUsersLock); + + if (systemMessage) + { + sSystemPower = systemMessage; + } + if (consoleUsers) + { + OSNumber * num = 0; + for (idx = 0; + (!num) && (user = OSDynamicCast(OSDictionary, consoleUsers->getObject(idx))); + idx++) + { + num = OSDynamicCast(OSNumber, user->getObject(gIOConsoleSessionScreenLockedTimeKey)); + } + gIOConsoleLockTime = num ? 
num->unsigned32BitValue() : 0; + } + + if (gIOConsoleLockTime) + { + if (kIOMessageSystemWillSleep == sSystemPower) + locked = kOSBooleanTrue; + else + { + clock_sec_t now; + clock_usec_t microsecs; + + clock_get_calendar_microtime(&now, &microsecs); + if (gIOConsoleLockTime > now) + { + AbsoluteTime deadline; + clock_interval_to_deadline(gIOConsoleLockTime - now, kSecondScale, &deadline); + thread_call_enter_delayed(gIOConsoleLockCallout, deadline); + } + else + { + locked = kOSBooleanTrue; + } + } + } + + publish = (consoleUsers || (locked != regEntry->getProperty(gIOConsoleLockedKey))); + if (publish) + { + regEntry->setProperty(gIOConsoleLockedKey, locked); + if (consoleUsers) + { + regEntry->setProperty(gIOConsoleUsersKey, consoleUsers); + } + OSIncrementAtomic( &gIOConsoleUsersSeed ); + } + + IOLockUnlock(gIOConsoleUsersLock); + + if (publish) + { + publishResource( gIOConsoleUsersSeedKey, gIOConsoleUsersSeedValue ); + + MessageClientsContext context; + + context.service = getServiceRoot(); + context.type = kIOMessageConsoleSecurityChange; + context.argument = (void *) regEntry; + context.argSize = 0; + + applyToInterestNotifiers(getServiceRoot(), gIOConsoleSecurityInterest, + &messageClientsApplier, &context ); + } +} + IOReturn IOResources::setProperties( OSObject * properties ) { IOReturn err; @@ -4152,15 +4329,17 @@ IOReturn IOResources::setProperties( OSObject * properties ) if( 0 == iter) return( kIOReturnBadArgument); - while( (key = OSDynamicCast(OSSymbol, iter->getNextObject()))) { - - if (gIOConsoleUsersKey == key) + while( (key = OSDynamicCast(OSSymbol, iter->getNextObject()))) + { + if (gIOConsoleUsersKey == key) do { - IORegistryEntry::getRegistryRoot()->setProperty(key, dict->getObject(key)); - OSIncrementAtomic( &gIOConsoleUsersSeed ); - publishResource( gIOConsoleUsersSeedKey, gIOConsoleUsersSeedValue ); - continue; + OSArray * consoleUsers; + consoleUsers = OSDynamicCast(OSArray, dict->getObject(key)); + if (!consoleUsers) + continue; + IOService::updateConsoleUsers(consoleUsers, 0); } + while (false); publishResource( key, dict->getObject(key) ); } @@ -4461,7 +4640,7 @@ bool IOService::passiveMatch( OSDictionary * table, bool changesOK ) } while( matchParent && (where = where->getProvider()) ); if( kIOLogMatch & gIOKitDebug) - if( where != this) + if( where && (where != this) ) LOG("match parent @ %s = %d\n", where->getName(), match ); @@ -5174,22 +5353,3 @@ OSMetaClassDefineReservedUnused(IOService, 44); OSMetaClassDefineReservedUnused(IOService, 45); OSMetaClassDefineReservedUnused(IOService, 46); OSMetaClassDefineReservedUnused(IOService, 47); - -#ifdef __ppc__ -OSMetaClassDefineReservedUnused(IOService, 48); -OSMetaClassDefineReservedUnused(IOService, 49); -OSMetaClassDefineReservedUnused(IOService, 50); -OSMetaClassDefineReservedUnused(IOService, 51); -OSMetaClassDefineReservedUnused(IOService, 52); -OSMetaClassDefineReservedUnused(IOService, 53); -OSMetaClassDefineReservedUnused(IOService, 54); -OSMetaClassDefineReservedUnused(IOService, 55); -OSMetaClassDefineReservedUnused(IOService, 56); -OSMetaClassDefineReservedUnused(IOService, 57); -OSMetaClassDefineReservedUnused(IOService, 58); -OSMetaClassDefineReservedUnused(IOService, 59); -OSMetaClassDefineReservedUnused(IOService, 60); -OSMetaClassDefineReservedUnused(IOService, 61); -OSMetaClassDefineReservedUnused(IOService, 62); -OSMetaClassDefineReservedUnused(IOService, 63); -#endif diff --git a/iokit/Kernel/IOServicePM.cpp b/iokit/Kernel/IOServicePM.cpp index fcecfbf00..bd7bcd002 100644 ---
a/iokit/Kernel/IOServicePM.cpp +++ b/iokit/Kernel/IOServicePM.cpp @@ -26,14 +26,16 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ -//#define IOASSERT 1 +//#undef IOASSERT +//#define IOASSERT 1 + #include <IOKit/assert.h> #include <IOKit/IOKitDebug.h> #include <IOKit/IOLib.h> #include <IOKit/IOMessage.h> #include <IOKit/IOPlatformExpert.h> #include <IOKit/IOService.h> -#include <IOKit/IOTimerEventSource.h> +#include <IOKit/IOEventSource.h> #include <IOKit/IOWorkLoop.h> #include <IOKit/IOCommand.h> @@ -42,8 +44,10 @@ #include <IOKit/pwr_mgt/IOPMinformeeList.h> #include <IOKit/pwr_mgt/IOPowerConnection.h> #include <IOKit/pwr_mgt/RootDomain.h> +#include <IOKit/pwr_mgt/IOPMPrivate.h> #include <sys/proc.h> +#include <libkern/OSDebug.h> // Required for notification instrumentation #include "IOServicePrivate.h" @@ -51,6 +55,7 @@ #include "IOKitKernelInternal.h" static void settle_timer_expired(thread_call_param_t, thread_call_param_t); +static void idle_timer_expired(thread_call_param_t, thread_call_param_t); static void tellKernelClientApplier(OSObject * object, void * arg); static void tellAppClientApplier(OSObject * object, void * arg); @@ -69,15 +74,20 @@ static uint64_t computeTimeDeltaNS( const AbsoluteTime * start ) OSDefineMetaClassAndStructors(IOPMprot, OSObject) #endif -//********************************************************************************* +// Container class for recording system power events +OSDefineMetaClassAndStructors( PMEventDetails, OSObject ); + +//****************************************************************************** // Globals -//********************************************************************************* +//****************************************************************************** static bool gIOPMInitialized = false; static uint32_t gIOPMBusyCount = 0; +static uint32_t gIOPMWorkCount = 0; static IOWorkLoop * gIOPMWorkLoop = 0; static IOPMRequestQueue * gIOPMRequestQueue = 0; static IOPMRequestQueue * gIOPMReplyQueue = 0; +static IOPMWorkQueue * gIOPMWorkQueue = 0; static IOPMCompletionQueue * gIOPMFreeQueue = 0; static IOPMRequest * gIOPMRequest = 0; static IOPlatformExpert * gPlatform = 0; @@ -96,16 +106,31 @@ static uint32_t getPMRequestType( void ) return type; } -//********************************************************************************* +//****************************************************************************** // Macros -//********************************************************************************* +//****************************************************************************** #define PM_ERROR(x...) do { kprintf(x); IOLog(x); } while (false) -#define PM_DEBUG(x...) do { kprintf(x); } while (false) -#define PM_TRACE(x...) do { \ - if (kIOLogDebugPower & gIOKitDebug) kprintf(x); } while (false) +#define PM_LOG(x...) do { kprintf(x); } while (false) + +#define PM_LOG1(x...) do { \ + if (kIOLogDebugPower & gIOKitDebug) \ + kprintf(x); } while (false) + +#define PM_LOG2(x...) do { \ + if (kIOLogDebugPower & gIOKitDebug) \ + kprintf(x); } while (false) -#define PM_CONNECT(x...) +#if 0 +#define PM_LOG3(x...) do { kprintf(x); } while (false) +#else +#define PM_LOG3(x...) +#endif + +#define RD_LOG(x...) 
do { \ + if ((kIOLogPMRootDomain & gIOKitDebug) && \ + (getPMRootDomain() == this)) \ + kprintf("PMRD: " x); } while (false) #define PM_ASSERT_IN_GATE(x) \ do { \ @@ -114,7 +139,7 @@ do { \ #define PM_LOCK() IOLockLock(fPMLock) #define PM_UNLOCK() IOLockUnlock(fPMLock) -#define PM_LOCK_SLEEP(event) IOLockSleep(fPMLock, event, THREAD_UNINT) +#define PM_LOCK_SLEEP(event, dl) IOLockSleepDeadline(fPMLock, event, dl, THREAD_UNINT) #define PM_LOCK_WAKEUP(event) IOLockWakeup(fPMLock, event, false) #define ns_per_us 1000 @@ -128,17 +153,16 @@ do { \ do { gPlatform->PMLog( fName, t, a, b); } while(0) #define NS_TO_MS(nsec) ((int)((nsec) / 1000000ULL)) +#define NS_TO_US(nsec) ((int)((nsec) / 1000ULL)) #if CONFIG_EMBEDDED #define SUPPORT_IDLE_CANCEL 1 #endif -#define kNotifyWillChange (true) -#define kNotifyDidChange (false) - #define kIOPMPowerStateMax 0xFFFFFFFF -#define IS_PM_ROOT() (this == gIOPMRootNode) +#define IS_PM_ROOT (this == gIOPMRootNode) +#define IS_ROOT_DOMAIN (getPMRootDomain() == this) #define IS_POWER_DROP (fHeadNotePowerState < fCurrentPowerState) #define IS_POWER_RISE (fHeadNotePowerState > fCurrentPowerState) @@ -149,41 +173,69 @@ do { \ // use message tracer to log messages longer than (ns): #define LOG_APP_RESPONSE_MSG_TRACER (3 * 1000ULL * 1000ULL * 1000ULL) -#define RESERVE_DOMAIN_POWER 1 - enum { kReserveDomainPower = 1 }; +#define MS_PUSH(n) \ + do { assert(kIOPM_BadMachineState == fSavedMachineState); \ + assert(kIOPM_BadMachineState != n); \ + fSavedMachineState = n; } while (false) + +#define MS_POP() \ + do { assert(kIOPM_BadMachineState != fSavedMachineState); \ + fMachineState = fSavedMachineState; \ + fSavedMachineState = kIOPM_BadMachineState; } while (false) + +#define PM_ACTION_0(a) \ + do { if (fPMActions.a) { \ + (fPMActions.a)(fPMActions.target, this, &fPMActions); } \ + } while (false) + +#define PM_ACTION_2(a, x, y) \ + do { if (fPMActions.a) { \ + (fPMActions.a)(fPMActions.target, this, &fPMActions, x, y); } \ + } while (false) + //********************************************************************************* // PM machine states +// +// Check kgmacros after modifying machine states. 
//********************************************************************************* enum { + kIOPM_Finished = 0, + kIOPM_OurChangeTellClientsPowerDown = 1, kIOPM_OurChangeTellPriorityClientsPowerDown = 2, kIOPM_OurChangeNotifyInterestedDriversWillChange = 3, kIOPM_OurChangeSetPowerState = 4, kIOPM_OurChangeWaitForPowerSettle = 5, kIOPM_OurChangeNotifyInterestedDriversDidChange = 6, - kIOPM_OurChangeFinish = 7, - kIOPM_ParentDownTellPriorityClientsPowerDown = 8, - kIOPM_ParentDownNotifyInterestedDriversWillChange = 9, - /* 10 not used */ - kIOPM_ParentDownNotifyDidChangeAndAcknowledgeChange = 11, - kIOPM_ParentDownSetPowerState = 12, - kIOPM_ParentDownWaitForPowerSettle = 13, - kIOPM_ParentAcknowledgePowerChange = 14, - kIOPM_ParentUpSetPowerState = 15, - /* 16 not used */ - kIOPM_ParentUpWaitForSettleTime = 17, - kIOPM_ParentUpNotifyInterestedDriversDidChange = 18, - /* 19 not used */ - kIOPM_Finished = 20, - kIOPM_DriverThreadCallDone = 21, - kIOPM_NotifyChildrenDone = 22, + kIOPM_OurChangeTellCapabilityDidChange = 7, + kIOPM_OurChangeFinish = 8, + + kIOPM_ParentChangeTellPriorityClientsPowerDown = 10, + kIOPM_ParentChangeNotifyInterestedDriversWillChange = 11, + kIOPM_ParentChangeSetPowerState = 12, + kIOPM_ParentChangeWaitForPowerSettle = 13, + kIOPM_ParentChangeNotifyInterestedDriversDidChange = 14, + kIOPM_ParentChangeTellCapabilityDidChange = 15, + kIOPM_ParentChangeAcknowledgePowerChange = 16, + + kIOPM_NotifyChildrenStart = 17, + kIOPM_NotifyChildrenOrdered = 18, + kIOPM_NotifyChildrenDelayed = 19, + kIOPM_SyncTellClientsPowerDown = 20, + kIOPM_SyncTellPriorityClientsPowerDown = 21, + kIOPM_SyncNotifyWillChange = 22, kIOPM_SyncNotifyDidChange = 23, - kIOPM_SyncFinish = 24 + kIOPM_SyncTellCapabilityDidChange = 24, + kIOPM_SyncFinish = 25, + kIOPM_TellCapabilityChangeDone = 26, + kIOPM_DriverThreadCallDone = 27, + + kIOPM_BadMachineState = 0xFFFFFFFF }; @@ -366,68 +418,95 @@ void IOService::PMinit ( void ) if ( !gIOPMInitialized ) { gPlatform = getPlatform(); - gIOPMWorkLoop = IOWorkLoop::workLoop(); - if (gIOPMWorkLoop) - { - gIOPMRequestQueue = IOPMRequestQueue::create( - this, OSMemberFunctionCast(IOPMRequestQueue::Action, - this, &IOService::servicePMRequestQueue)); + gIOPMWorkLoop = IOWorkLoop::workLoop(); + if (gIOPMWorkLoop) + { + gIOPMRequestQueue = IOPMRequestQueue::create( + this, OSMemberFunctionCast(IOPMRequestQueue::Action, + this, &IOService::servicePMRequestQueue)); + + gIOPMReplyQueue = IOPMRequestQueue::create( + this, OSMemberFunctionCast(IOPMRequestQueue::Action, + this, &IOService::servicePMReplyQueue)); + + gIOPMWorkQueue = IOPMWorkQueue::create( + this, + OSMemberFunctionCast(IOPMWorkQueue::Action, this, + &IOService::servicePMRequest), + OSMemberFunctionCast(IOPMWorkQueue::Action, this, + &IOService::retirePMRequest)); + + gIOPMFreeQueue = IOPMCompletionQueue::create( + this, OSMemberFunctionCast(IOPMCompletionQueue::Action, + this, &IOService::servicePMFreeQueue)); + + if (gIOPMWorkLoop->addEventSource(gIOPMRequestQueue) != + kIOReturnSuccess) + { + gIOPMRequestQueue->release(); + gIOPMRequestQueue = 0; + } - gIOPMReplyQueue = IOPMRequestQueue::create( - this, OSMemberFunctionCast(IOPMRequestQueue::Action, - this, &IOService::servicePMReplyQueue)); + if (gIOPMWorkLoop->addEventSource(gIOPMReplyQueue) != + kIOReturnSuccess) + { + gIOPMReplyQueue->release(); + gIOPMReplyQueue = 0; + } + + if (gIOPMWorkLoop->addEventSource(gIOPMWorkQueue) != + kIOReturnSuccess) + { + gIOPMWorkQueue->release(); + gIOPMWorkQueue = 0; + } - gIOPMFreeQueue = 
IOPMCompletionQueue::create( - this, OSMemberFunctionCast(IOPMCompletionQueue::Action, - this, &IOService::servicePMFreeQueue)); + if (gIOPMWorkLoop->addEventSource(gIOPMFreeQueue) != + kIOReturnSuccess) + { + gIOPMFreeQueue->release(); + gIOPMFreeQueue = 0; + } - if (gIOPMWorkLoop->addEventSource(gIOPMRequestQueue) != - kIOReturnSuccess) - { - gIOPMRequestQueue->release(); - gIOPMRequestQueue = 0; - } + gIOPMPowerClientDevice = + OSSymbol::withCStringNoCopy( "DevicePowerState" ); - if (gIOPMWorkLoop->addEventSource(gIOPMReplyQueue) != - kIOReturnSuccess) - { - gIOPMReplyQueue->release(); - gIOPMReplyQueue = 0; - } + gIOPMPowerClientDriver = + OSSymbol::withCStringNoCopy( "DriverPowerState" ); - if (gIOPMWorkLoop->addEventSource(gIOPMFreeQueue) != - kIOReturnSuccess) - { - gIOPMFreeQueue->release(); - gIOPMFreeQueue = 0; - } + gIOPMPowerClientChildProxy = + OSSymbol::withCStringNoCopy( "ChildProxyPowerState" ); - gIOPMPowerClientDevice = OSSymbol::withCStringNoCopy( "DevicePowerState" ); - gIOPMPowerClientDriver = OSSymbol::withCStringNoCopy( "DriverPowerState" ); - gIOPMPowerClientChildProxy = OSSymbol::withCStringNoCopy( "ChildProxyPowerState" ); - gIOPMPowerClientChildren = OSSymbol::withCStringNoCopy( "ChildrenPowerState" ); - } + gIOPMPowerClientChildren = + OSSymbol::withCStringNoCopy( "ChildrenPowerState" ); + } - if (gIOPMRequestQueue && gIOPMReplyQueue && gIOPMFreeQueue) - gIOPMInitialized = true; - } - if (!gIOPMInitialized) - return; + if (gIOPMRequestQueue && gIOPMReplyQueue && gIOPMFreeQueue) + gIOPMInitialized = true; + } + if (!gIOPMInitialized) + return; pwrMgt = new IOServicePM; pwrMgt->init(); setProperty(kPwrMgtKey, pwrMgt); + queue_init(&pwrMgt->WorkChain); + queue_init(&pwrMgt->RequestHead); + queue_init(&pwrMgt->PMDriverCallQueue); + + fOwner = this; fPMLock = IOLockAlloc(); fInterestedDrivers = new IOPMinformeeList; fInterestedDrivers->initialize(); fDesiredPowerState = 0; fDeviceDesire = 0; - fInitialChange = true; - fPreviousRequest = 0; - fDeviceOverrides = false; + fInitialPowerChange = true; + fInitialSetPowerState = true; + fPreviousRequestPowerFlags = 0; + fDeviceOverrideEnabled = false; fMachineState = kIOPM_Finished; - fIdleTimerEventSource = NULL; + fSavedMachineState = kIOPM_BadMachineState; fIdleTimerMinPowerState = 0; fActivityLock = IOLockAlloc(); fStrictTreeOrder = false; @@ -437,13 +516,12 @@ void IOService::PMinit ( void ) fNumberOfPowerStates = 0; fCurrentPowerState = 0; fParentsCurrentPowerFlags = 0; - fMaxCapability = 0; + fMaxPowerState = 0; fName = getName(); fParentsKnowState = false; fSerialNumber = 0; fResponseArray = NULL; fNotifyClientArray = NULL; - fDoNotPowerDown = true; fCurrentPowerConsumption = kIOPMUnknown; fOverrideMaxPowerState = kIOPMPowerStateMax; @@ -457,9 +535,20 @@ void IOService::PMinit ( void ) &IOService::ack_timer_expired, (thread_call_param_t)this); fSettleTimer = thread_call_allocate( &settle_timer_expired, (thread_call_param_t)this); - fDriverCallEntry = thread_call_allocate( + fIdleTimer = thread_call_allocate( + &idle_timer_expired, (thread_call_param_t)this); + fDriverCallEntry = thread_call_allocate( (thread_call_func_t) &IOService::pmDriverCallout, this); - assert(fDriverCallEntry); + assert(fDriverCallEntry); + + // Check for powerChangeDone override. 
+ if (OSMemberFunctionCast(void (*)(void), + getResourceService(), &IOService::powerChangeDone) != + OSMemberFunctionCast(void (*)(void), + this, &IOService::powerChangeDone)) + { + fPCDFunctionOverride = true; + } #if PM_VARS_SUPPORT IOPMprot * prot = new IOPMprot; @@ -472,7 +561,7 @@ void IOService::PMinit ( void ) pm_vars = prot; } #else - pm_vars = (void *) true; + pm_vars = (void *) (uintptr_t) true; #endif initialized = true; @@ -487,22 +576,18 @@ void IOService::PMinit ( void ) void IOService::PMfree ( void ) { - initialized = false; + initialized = false; pm_vars = 0; if ( pwrMgt ) - { - assert(fMachineState == kIOPM_Finished); - assert(fInsertInterestSet == NULL); - assert(fRemoveInterestSet == NULL); + { + assert(fMachineState == kIOPM_Finished); + assert(fInsertInterestSet == NULL); + assert(fRemoveInterestSet == NULL); assert(fNotifyChildArray == NULL); + assert(queue_empty(&pwrMgt->RequestHead)); + assert(queue_empty(&fPMDriverCallQueue)); - if ( fIdleTimerEventSource != NULL ) { - fIdleTimerEventSource->disable(); - gIOPMWorkLoop->removeEventSource(fIdleTimerEventSource); - fIdleTimerEventSource->release(); - fIdleTimerEventSource = NULL; - } if ( fSettleTimer ) { thread_call_cancel(fSettleTimer); thread_call_free(fSettleTimer); @@ -513,6 +598,11 @@ void IOService::PMfree ( void ) thread_call_free(fAckTimer); fAckTimer = NULL; } + if ( fIdleTimer ) { + thread_call_cancel(fIdleTimer); + thread_call_free(fIdleTimer); + fIdleTimer = NULL; + } if ( fDriverCallEntry ) { thread_call_free(fDriverCallEntry); fDriverCallEntry = NULL; @@ -525,20 +615,15 @@ void IOService::PMfree ( void ) IOLockFree(fActivityLock); fActivityLock = NULL; } - if ( fInterestedDrivers ) { - fInterestedDrivers->release(); - fInterestedDrivers = NULL; - } - if ( fPMWorkQueue ) { - gIOPMWorkLoop->removeEventSource(fPMWorkQueue); - fPMWorkQueue->release(); - fPMWorkQueue = 0; - } - if (fDriverCallParamSlots && fDriverCallParamPtr) { - IODelete(fDriverCallParamPtr, DriverCallParam, fDriverCallParamSlots); - fDriverCallParamPtr = 0; - fDriverCallParamSlots = 0; - } + if ( fInterestedDrivers ) { + fInterestedDrivers->release(); + fInterestedDrivers = NULL; + } + if (fDriverCallParamSlots && fDriverCallParamPtr) { + IODelete(fDriverCallParamPtr, DriverCallParam, fDriverCallParamSlots); + fDriverCallParamPtr = 0; + fDriverCallParamSlots = 0; + } if ( fResponseArray ) { fResponseArray->release(); fResponseArray = NULL; @@ -548,25 +633,25 @@ void IOService::PMfree ( void ) fNotifyClientArray = NULL; } if (fPowerStates && fNumberOfPowerStates) { - IODelete(fPowerStates, IOPMPowerState, fNumberOfPowerStates); + IODelete(fPowerStates, IOPMPSEntry, fNumberOfPowerStates); fNumberOfPowerStates = 0; fPowerStates = NULL; } - if (fPowerClients) { - fPowerClients->release(); - fPowerClients = 0; - } + if (fPowerClients) { + fPowerClients->release(); + fPowerClients = 0; + } #if PM_VARS_SUPPORT - if (fPMVars) - { - fPMVars->release(); - fPMVars = 0; - } + if (fPMVars) + { + fPMVars->release(); + fPMVars = 0; + } #endif pwrMgt->release(); - pwrMgt = 0; + pwrMgt = 0; } } @@ -614,42 +699,35 @@ IOReturn IOService::youAreRoot ( void ) void IOService::PMstop ( void ) { - IOPMRequest * request; + IOPMRequest * request; - if (!initialized) - return; + if (!initialized) + return; - // Schedule an async PMstop request, but immediately stop any further - // calls to the controlling or interested drivers. 
This device will - // continue to exist in the power plane and participate in power state - // changes until the PMstop async request is processed. + PM_LOCK(); - PM_LOCK(); - fLockedFlags.PMStop = true; - if (fLockedFlags.DriverCallBusy) - { - PM_DEBUG("%s: PMstop() driver call busy\n", getName()); - } - while (fThreadAssertionCount != 0) + if (fLockedFlags.PMStop) { - if (current_thread() == fThreadAssertionThread) - { - PM_ERROR("%s: PMstop() called from PM thread call\n", getName()); - break; - } - // Wait for thread assertions to drop to zero. - PM_DEBUG("%s: PMstop() wait for %u thread assertion(s)\n", - getName(), fThreadAssertionCount); - PM_LOCK_SLEEP(&fThreadAssertionCount); + PM_LOG2("%s: PMstop() already stopped\n", fName); + PM_UNLOCK(); + return; } + + // Inhibit future driver calls. + fLockedFlags.PMStop = true; + + // Wait for all prior driver calls to finish. + waitForPMDriverCall(); + PM_UNLOCK(); - request = acquirePMRequest( this, kIOPMRequestTypePMStop ); - if (request) - { - PM_TRACE("%s: %p PMstop\n", getName(), this); - submitPMRequest( request ); - } + // The rest of the work is performed async. + request = acquirePMRequest( this, kIOPMRequestTypePMStop ); + if (request) + { + PM_LOG2("%s: %p PMstop\n", getName(), this); + submitPMRequest( request ); + } } //********************************************************************************* @@ -660,14 +738,14 @@ void IOService::PMstop ( void ) void IOService::handlePMstop ( IOPMRequest * request ) { - OSIterator * iter; + OSIterator * iter; OSObject * next; IOPowerConnection * connection; IOService * theChild; IOService * theParent; PM_ASSERT_IN_GATE(); - PM_TRACE("%s: %p %s start\n", getName(), this, __FUNCTION__); + PM_LOG2("%s: %p %s start\n", getName(), this, __FUNCTION__); // remove the property removeProperty(kPwrMgtKey); @@ -729,23 +807,23 @@ void IOService::handlePMstop ( IOPMRequest * request ) if ( fInterestedDrivers ) { - IOPMinformeeList * list = fInterestedDrivers; + IOPMinformeeList * list = fInterestedDrivers; IOPMinformee * item; - PM_LOCK(); - while ((item = list->firstInList())) - { - list->removeFromList(item->whatObject); - } - PM_UNLOCK(); - } + PM_LOCK(); + while ((item = list->firstInList())) + { + list->removeFromList(item->whatObject); + } + PM_UNLOCK(); + } - // Tell idleTimerExpired() to ignore idle timer. - fIdleTimerPeriod = 0; - if (fIdleTimerEventSource) - fIdleTimerEventSource->disable(); + // Tell idleTimerExpired() to ignore idle timer. 
+ fIdleTimerPeriod = 0; + if (fIdleTimer && thread_call_cancel(fIdleTimer)) + release(); - PM_TRACE("%s: %p %s done\n", getName(), this, __FUNCTION__); + PM_LOG2("%s: %p %s done\n", getName(), this, __FUNCTION__); } //********************************************************************************* @@ -791,7 +869,7 @@ IOReturn IOService::addPowerChild ( IOService * child ) } if (!ok) { - PM_DEBUG("%s: %s (%p) is already a child\n", + PM_LOG("%s: %s (%p) is already a child\n", getName(), child->getName(), child); break; } @@ -876,7 +954,7 @@ void IOService::addPowerChild1 ( IOPMRequest * request ) tempDesire = fNumberOfPowerStates - 1; } - if (tempDesire && (IS_PM_ROOT() || (fMaxCapability >= tempDesire))) + if (tempDesire && (IS_PM_ROOT || (fMaxPowerState >= tempDesire))) { adjustPowerState(tempDesire); } @@ -903,7 +981,7 @@ void IOService::addPowerChild2 ( IOPMRequest * request ) if (!parent || !inPlane(gIOPowerPlane)) { - PM_DEBUG("%s: addPowerChild2 not in power plane\n", getName()); + PM_LOG("%s: addPowerChild2 not in power plane\n", getName()); return; } @@ -914,7 +992,7 @@ void IOService::addPowerChild2 ( IOPMRequest * request ) powerState = parent->fCurrentPowerState; if (knowsState) - powerFlags = parent->fPowerStates[powerState].outputPowerCharacter; + powerFlags = parent->fPowerStates[powerState].outputPowerFlags; else powerFlags = 0; @@ -928,16 +1006,14 @@ void IOService::addPowerChild2 ( IOPMRequest * request ) if ( fControllingDriver && fParentsKnowState ) { - fMaxCapability = fControllingDriver->maxCapabilityForDomainState(fParentsCurrentPowerFlags); + fMaxPowerState = fControllingDriver->maxCapabilityForDomainState(fParentsCurrentPowerFlags); // initially change into the state we are already in tempDesire = fControllingDriver->initialPowerStateForDomainState(fParentsCurrentPowerFlags); - fPreviousRequest = 0xffffffff; + fPreviousRequestPowerFlags = (IOPMPowerFlags)(-1); adjustPowerState(tempDesire); } -#if ROOT_DOMAIN_RUN_STATES - getPMRootDomain()->tagPowerPlaneService(this, &fRootDomainState); -#endif + getPMRootDomain()->tagPowerPlaneService(this, &fPMActions); } //********************************************************************************* @@ -960,7 +1036,7 @@ void IOService::addPowerChild3 ( IOPMRequest * request ) { if (child->getProperty("IOPMStrictTreeOrder")) { - PM_DEBUG("%s: strict PM order enforced\n", getName()); + PM_LOG1("%s: strict PM order enforced\n", getName()); fStrictTreeOrder = true; } @@ -969,7 +1045,7 @@ void IOService::addPowerChild3 ( IOPMRequest * request ) } else { - PM_DEBUG("%s: addPowerChild3 not in power plane\n", getName()); + PM_LOG("%s: addPowerChild3 not in power plane\n", getName()); } connection->release(); @@ -1031,6 +1107,10 @@ IOReturn IOService::removePowerChild ( IOPowerConnection * theNub ) if ( fHeadNotePendingAcks == 0 ) { stop_ack_timer(); + + // Request unblocked, work queue + // should re-scan all busy requests. + gIOPMWorkQueue->incrementProducerCount(); } } } @@ -1065,8 +1145,8 @@ IOReturn IOService::registerPowerDriver ( IOPMPowerState * powerStates, unsigned long numberOfStates ) { - IOPMRequest * request; - IOPMPowerState * powerStatesCopy = 0; + IOPMRequest * request; + IOPMPSEntry * powerStatesCopy = 0; if (!initialized) return IOPMNotYetInitialized; @@ -1092,12 +1172,19 @@ IOReturn IOService::registerPowerDriver ( do { // Make a copy of the supplied power state array. 
- powerStatesCopy = IONew(IOPMPowerState, numberOfStates); + powerStatesCopy = IONew(IOPMPSEntry, numberOfStates); if (!powerStatesCopy) break; - bcopy( powerStates, powerStatesCopy, - sizeof(IOPMPowerState) * numberOfStates ); + for (uint32_t i = 0; i < numberOfStates; i++) + { + powerStatesCopy[i].capabilityFlags = powerStates[i].capabilityFlags; + powerStatesCopy[i].outputPowerFlags = powerStates[i].outputPowerCharacter; + powerStatesCopy[i].inputPowerFlags = powerStates[i].inputPowerRequirement; + powerStatesCopy[i].staticPower = powerStates[i].staticPower; + powerStatesCopy[i].settleUpTime = powerStates[i].settleUpTime; + powerStatesCopy[i].settleDownTime = powerStates[i].settleDownTime; + } request = acquirePMRequest( this, kIOPMRequestTypeRegisterPowerDriver ); if (!request) @@ -1114,7 +1201,7 @@ IOReturn IOService::registerPowerDriver ( while (false); if (powerStatesCopy) - IODelete(powerStatesCopy, IOPMPowerState, numberOfStates); + IODelete(powerStatesCopy, IOPMPSEntry, numberOfStates); return kIOReturnNoMemory; } @@ -1124,12 +1211,12 @@ IOReturn IOService::registerPowerDriver ( void IOService::handleRegisterPowerDriver ( IOPMRequest * request ) { - IOService * powerDriver = (IOService *) request->fArg0; - IOPMPowerState * powerStates = (IOPMPowerState *) request->fArg1; - unsigned long numberOfStates = (unsigned long) request->fArg2; - unsigned long i; - IOService * root; - OSIterator * iter; + IOService * powerDriver = (IOService *) request->fArg0; + IOPMPSEntry * powerStates = (IOPMPSEntry *) request->fArg1; + unsigned long numberOfStates = (unsigned long) request->fArg2; + unsigned long i; + IOService * root; + OSIterator * iter; PM_ASSERT_IN_GATE(); assert(powerStates); @@ -1140,7 +1227,7 @@ void IOService::handleRegisterPowerDriver ( IOPMRequest * request ) { OUR_PMLog(kPMLogControllingDriver, (unsigned long) numberOfStates, - (unsigned long) powerStates[0].version); + (unsigned long) kIOPMPowerStateVersion1); fPowerStates = powerStates; fNumberOfPowerStates = numberOfStates; @@ -1150,7 +1237,7 @@ void IOService::handleRegisterPowerDriver ( IOPMRequest * request ) // make a mask of all the character bits we know about fOutputPowerCharacterFlags = 0; for ( i = 0; i < numberOfStates; i++ ) { - fOutputPowerCharacterFlags |= fPowerStates[i].outputPowerCharacter; + fOutputPowerCharacterFlags |= fPowerStates[i].outputPowerFlags; } // Register powerDriver as interested, unless already done. 
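For context, the copy loop above keeps only six fields of each caller-supplied IOPMPowerState in the kernel-private IOPMPSEntry form (capabilityFlags, outputPowerCharacter, inputPowerRequirement, staticPower, settleUpTime, settleDownTime); the remaining fields are dropped. A minimal caller-side sketch follows; the two-state array and state names are hypothetical, not part of this patch:

    // Hypothetical two-state driver (sketch only): index 0 = off, index 1 = on.
    static IOPMPowerState gSketchPowerStates[ 2 ] =
    {
        // version, capabilityFlags, outputPowerCharacter, inputPowerRequirement,
        // staticPower, unbudgetedPower, powerToAttain, timeToAttain,
        // settleUpTime, timeToLower, settleDownTime, powerDomainBudget
        { kIOPMPowerStateVersion1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
        { kIOPMPowerStateVersion1, kIOPMDeviceUsable, kIOPMPowerOn, kIOPMPowerOn,
          0, 0, 0, 0, 0, 0, 0, 0 }
    };

    // In the driver's start() routine, after PMinit() and joinPMtree():
    registerPowerDriver( this, gSketchPowerStates, 2 );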
@@ -1201,7 +1288,7 @@ void IOService::handleRegisterPowerDriver ( IOPMRequest * request ) if ( inPlane(gIOPowerPlane) && fParentsKnowState ) { unsigned long tempDesire; - fMaxCapability = fControllingDriver->maxCapabilityForDomainState(fParentsCurrentPowerFlags); + fMaxPowerState = fControllingDriver->maxCapabilityForDomainState(fParentsCurrentPowerFlags); // initially change into the state we are already in tempDesire = fControllingDriver->initialPowerStateForDomainState(fParentsCurrentPowerFlags); adjustPowerState(tempDesire); @@ -1210,7 +1297,7 @@ void IOService::handleRegisterPowerDriver ( IOPMRequest * request ) else { OUR_PMLog(kPMLogControllingDriverErr2, numberOfStates, 0); - IODelete(powerStates, IOPMPowerState, numberOfStates); + IODelete(powerStates, IOPMPSEntry, numberOfStates); } powerDriver->release(); @@ -1227,29 +1314,33 @@ void IOService::handleRegisterPowerDriver ( IOPMRequest * request ) IOPMPowerFlags IOService::registerInterestedDriver ( IOService * driver ) { - IOPMRequest * request; - bool signal; + IOPMRequest * request; + bool signal; - if (!initialized || !fInterestedDrivers) - return IOPMNotPowerManaged; + if (!driver || !initialized || !fInterestedDrivers) + return 0; - PM_LOCK(); - signal = (!fInsertInterestSet && !fRemoveInterestSet); - if (fInsertInterestSet == NULL) - fInsertInterestSet = OSSet::withCapacity(4); - if (fInsertInterestSet) - fInsertInterestSet->setObject(driver); - PM_UNLOCK(); + PM_LOCK(); + signal = (!fInsertInterestSet && !fRemoveInterestSet); + if (fInsertInterestSet == NULL) + fInsertInterestSet = OSSet::withCapacity(4); + if (fInsertInterestSet) + { + fInsertInterestSet->setObject(driver); + if (fRemoveInterestSet) + fRemoveInterestSet->removeObject(driver); + } + PM_UNLOCK(); - if (signal) - { - request = acquirePMRequest( this, kIOPMRequestTypeInterestChanged ); - if (request) - submitPMRequest( request ); - } + if (signal) + { + request = acquirePMRequest( this, kIOPMRequestTypeInterestChanged ); + if (request) + submitPMRequest( request ); + } - // This return value cannot be trusted, but return a value - // for those clients that care. + // This return value cannot be trusted, but return a value + // for those clients that care. 
OUR_PMLog(kPMLogInterestedDriver, kIOPMDeviceUsable, 2); return kIOPMDeviceUsable; @@ -1261,41 +1352,44 @@ IOPMPowerFlags IOService::registerInterestedDriver ( IOService * driver ) IOReturn IOService::deRegisterInterestedDriver ( IOService * driver ) { - IOPMinformeeList * list; + IOPMinformeeList * list; IOPMinformee * item; - IOPMRequest * request; - bool signal; + IOPMRequest * request; + bool signal; - if (!initialized || !fInterestedDrivers) - return IOPMNotPowerManaged; + if (!driver) + return kIOReturnBadArgument; + if (!initialized || !fInterestedDrivers) + return IOPMNotPowerManaged; - PM_LOCK(); - signal = (!fRemoveInterestSet && !fInsertInterestSet); - if (fRemoveInterestSet == NULL) - fRemoveInterestSet = OSSet::withCapacity(4); - if (fRemoveInterestSet) - { - fRemoveInterestSet->setObject(driver); + PM_LOCK(); + signal = (!fRemoveInterestSet && !fInsertInterestSet); + if (fRemoveInterestSet == NULL) + fRemoveInterestSet = OSSet::withCapacity(4); + if (fRemoveInterestSet) + { + fRemoveInterestSet->setObject(driver); + if (fInsertInterestSet) + fInsertInterestSet->removeObject(driver); - list = fInterestedDrivers; - item = list->findItem(driver); - if (item && item->active) - { - item->active = false; - } - if (fLockedFlags.DriverCallBusy) - PM_DEBUG("%s::deRegisterInterestedDriver() driver call busy\n", getName()); - } - PM_UNLOCK(); + list = fInterestedDrivers; + item = list->findItem(driver); + if (item && item->active) + { + item->active = false; + waitForPMDriverCall( driver ); + } + } + PM_UNLOCK(); - if (signal) - { - request = acquirePMRequest( this, kIOPMRequestTypeInterestChanged ); - if (request) - submitPMRequest( request ); - } + if (signal) + { + request = acquirePMRequest( this, kIOPMRequestTypeInterestChanged ); + if (request) + submitPMRequest( request ); + } - return IOPMNoErr; + return IOPMNoErr; } //********************************************************************************* @@ -1306,49 +1400,48 @@ IOReturn IOService::deRegisterInterestedDriver ( IOService * driver ) void IOService::handleInterestChanged( IOPMRequest * request ) { - IOService * driver; + IOService * driver; IOPMinformee * informee; - IOPMinformeeList * list = fInterestedDrivers; + IOPMinformeeList * list = fInterestedDrivers; - PM_LOCK(); + PM_LOCK(); - if (fInsertInterestSet) - { - while ((driver = (IOService *) fInsertInterestSet->getAnyObject())) - { - if ((list->findItem(driver) == NULL) && - (!fRemoveInterestSet || - !fRemoveInterestSet->containsObject(driver))) - { - informee = list->appendNewInformee(driver); - } - fInsertInterestSet->removeObject(driver); - } - fInsertInterestSet->release(); - fInsertInterestSet = 0; - } + if (fInsertInterestSet) + { + while ((driver = (IOService *) fInsertInterestSet->getAnyObject())) + { + if (list->findItem(driver) == NULL) + { + informee = list->appendNewInformee(driver); + } + fInsertInterestSet->removeObject(driver); + } + fInsertInterestSet->release(); + fInsertInterestSet = 0; + } - if (fRemoveInterestSet) - { - while ((driver = (IOService *) fRemoveInterestSet->getAnyObject())) - { - informee = list->findItem(driver); - if (informee) - { - if (fHeadNotePendingAcks && informee->timer) - { - informee->timer = 0; - fHeadNotePendingAcks--; - } - list->removeFromList(driver); - } - fRemoveInterestSet->removeObject(driver); - } - fRemoveInterestSet->release(); - fRemoveInterestSet = 0; - } + if (fRemoveInterestSet) + { + while ((driver = (IOService *) fRemoveInterestSet->getAnyObject())) + { + informee = list->findItem(driver); + if (informee) 
+ { + // Clean-up async interest acknowledgement + if (fHeadNotePendingAcks && informee->timer) + { + informee->timer = 0; + fHeadNotePendingAcks--; + } + list->removeFromList(driver); + } + fRemoveInterestSet->removeObject(driver); + } + fRemoveInterestSet->release(); + fRemoveInterestSet = 0; + } - PM_UNLOCK(); + PM_UNLOCK(); } //********************************************************************************* @@ -1432,11 +1525,25 @@ bool IOService::handleAcknowledgePowerChange ( IOPMRequest * request ) { uint64_t nsec = computeTimeDeltaNS(&informee->startTime); if (nsec > LOG_SETPOWER_TIMES) - PM_DEBUG("%s::powerState%sChangeTo(%p, %s, %lu -> %lu) async took %d ms\n", + PM_LOG("%s::powerState%sChangeTo(%p, %s, %lu -> %lu) async took %d us\n", informee->whatObject->getName(), (fDriverCallReason == kDriverCallInformPreChange) ? "Will" : "Did", informee->whatObject, - fName, fCurrentPowerState, fHeadNotePowerState, NS_TO_MS(nsec)); + fName, fCurrentPowerState, fHeadNotePowerState, NS_TO_US(nsec)); + + uint16_t logType = (fDriverCallReason == kDriverCallInformPreChange) + ? kIOPMEventTypePSWillChangeTo + : kIOPMEventTypePSDidChangeTo; + + PMEventDetails *details = PMEventDetails::eventDetails( + logType, + fName, + (uintptr_t)this, + informee->whatObject->getName(), + 0, 0, 0, + NS_TO_MS(nsec)); + + getPMRootDomain()->recordAndReleasePMEventGated( details ); } #endif // mark it acked @@ -1523,8 +1630,17 @@ void IOService::adjustPowerState ( uint32_t clamp ) computeDesiredState(clamp); if (fControllingDriver && fParentsKnowState && inPlane(gIOPowerPlane)) { + IOPMPowerChangeFlags changeFlags = kIOPMSelfInitiated; + + // Indicate that children desires were ignored, and do not ask + // apps for permission to drop power. This is used by root domain + // for demand sleep. + + if (getPMRequestType() == kIOPMRequestTypeRequestPowerStateOverride) + changeFlags |= (kIOPMIgnoreChildren | kIOPMSkipAskPowerDown); + startPowerChange( - /* flags */ kIOPMWeInitiated, + /* flags */ changeFlags, /* power state */ fDesiredPowerState, /* domain flags */ 0, /* connection */ 0, @@ -1536,9 +1652,11 @@ // [public] synchronizePowerTree //********************************************************************************* -IOReturn IOService::synchronizePowerTree ( void ) +IOReturn IOService::synchronizePowerTree ( + IOOptionBits options, + IOService * notifyRoot ) { - IOPMRequest * request_c; + IOPMRequest * request_c = 0; IOPMRequest * request_s; if (this != getPMRootDomain()) @@ -1546,15 +1664,30 @@ if (!initialized) return kIOPMNotYetInitialized; - request_c = acquirePMRequest( this, kIOPMRequestTypeIdleCancel ); - request_s = acquirePMRequest( this, kIOPMRequestTypeSynchronizePowerTree ); + if (notifyRoot) + { + IOPMRequest * nr; - if (!request_c || !request_s) - goto error_no_memory; + // Cancels don't need to be synchronized. 
+ nr = acquirePMRequest(notifyRoot, kIOPMRequestTypeChildNotifyDelayCancel); + if (nr) submitPMRequest(nr); + nr = acquirePMRequest(getPMRootDomain(), kIOPMRequestTypeChildNotifyDelayCancel); + if (nr) submitPMRequest(nr); + } - request_c->attachNextRequest( request_s ); + request_s = acquirePMRequest( this, kIOPMRequestTypeSynchronizePowerTree ); + if (!request_s) + goto error_no_memory; - submitPMRequest(request_c); + if (options & kIOPMSyncCancelPowerDown) + request_c = acquirePMRequest( this, kIOPMRequestTypeIdleCancel ); + if (request_c) + { + request_c->attachNextRequest( request_s ); + submitPMRequest(request_c); + } + + request_s->fArg0 = (void *)(uintptr_t) options; submitPMRequest(request_s); return kIOReturnSuccess; @@ -1569,14 +1702,17 @@ error_no_memory: // [private] handleSynchronizePowerTree //********************************************************************************* -void IOService::handleSynchronizePowerTree ( IOPMRequest * /*request*/ ) +void IOService::handleSynchronizePowerTree ( IOPMRequest * request ) { PM_ASSERT_IN_GATE(); if (fControllingDriver && fParentsKnowState && inPlane(gIOPowerPlane) && (fCurrentPowerState == fNumberOfPowerStates - 1)) { + IOOptionBits options = (uintptr_t) request->fArg0; + startPowerChange( - /* flags */ kIOPMWeInitiated | kIOPMSynchronize, + /* flags */ kIOPMSelfInitiated | kIOPMSynchronize | + (options & kIOPMSyncNoChildNotify), /* power state */ fCurrentPowerState, /* domain flags */ 0, /* connection */ 0, @@ -1610,24 +1746,24 @@ IOReturn IOService::powerDomainWillChangeTo ( void IOService::handlePowerDomainWillChangeTo ( IOPMRequest * request ) { - IOPMPowerFlags parentPowerFlags = (IOPMPowerFlags) request->fArg0; - IOPowerConnection * whichParent = (IOPowerConnection *) request->fArg1; - unsigned long parentChangeFlags = (unsigned long) request->fArg2; - OSIterator * iter; - OSObject * next; - IOPowerConnection * connection; - unsigned long newPowerState; - unsigned long myChangeFlags; - IOPMPowerFlags combinedPowerFlags; - bool savedParentsKnowState; - IOReturn result = IOPMAckImplied; + IOPMPowerFlags parentPowerFlags = (IOPMPowerFlags) request->fArg0; + IOPowerConnection * whichParent = (IOPowerConnection *) request->fArg1; + IOPMPowerChangeFlags parentChangeFlags = (IOPMPowerChangeFlags)(uintptr_t) request->fArg2; + IOPMPowerChangeFlags myChangeFlags; + OSIterator * iter; + OSObject * next; + IOPowerConnection * connection; + IOPMPowerStateIndex newPowerState; + IOPMPowerFlags combinedPowerFlags; + bool savedParentsKnowState; + IOReturn result = IOPMAckImplied; PM_ASSERT_IN_GATE(); OUR_PMLog(kPMLogWillChange, parentPowerFlags, 0); if (!inPlane(gIOPowerPlane) || !whichParent || !whichParent->getAwaitingAck()) { - PM_DEBUG("%s::%s not in power tree\n", getName(), __FUNCTION__); + PM_LOG("%s::%s not in power tree\n", getName(), __FUNCTION__); goto exit_no_ack; } @@ -1656,7 +1792,7 @@ void IOService::handlePowerDomainWillChangeTo ( IOPMRequest * request ) // If our initial change has yet to occur, then defer the power change // until after the power domain has completed its power transition. 
- if ( fControllingDriver && !fInitialChange ) + if ( fControllingDriver && !fInitialPowerChange ) { newPowerState = fControllingDriver->maxCapabilityForDomainState( combinedPowerFlags); @@ -1729,21 +1865,21 @@ IOReturn IOService::powerDomainDidChangeTo ( void IOService::handlePowerDomainDidChangeTo ( IOPMRequest * request ) { - IOPMPowerFlags parentPowerFlags = (IOPMPowerFlags) request->fArg0; - IOPowerConnection * whichParent = (IOPowerConnection *) request->fArg1; - unsigned long parentChangeFlags = (unsigned long) request->fArg2; - unsigned long newPowerState; - unsigned long myChangeFlags; - unsigned long initialDesire; - bool savedParentsKnowState; - IOReturn result = IOPMAckImplied; + IOPMPowerFlags parentPowerFlags = (IOPMPowerFlags) request->fArg0; + IOPowerConnection * whichParent = (IOPowerConnection *) request->fArg1; + IOPMPowerChangeFlags parentChangeFlags = (IOPMPowerChangeFlags)(uintptr_t) request->fArg2; + IOPMPowerChangeFlags myChangeFlags; + IOPMPowerStateIndex newPowerState; + IOPMPowerStateIndex initialDesire; + bool savedParentsKnowState; + IOReturn result = IOPMAckImplied; PM_ASSERT_IN_GATE(); OUR_PMLog(kPMLogDidChange, parentPowerFlags, 0); if (!inPlane(gIOPowerPlane) || !whichParent || !whichParent->getAwaitingAck()) { - PM_DEBUG("%s::%s not in power tree\n", getName(), __FUNCTION__); + PM_LOG("%s::%s not in power tree\n", getName(), __FUNCTION__); goto exit_no_ack; } @@ -1756,7 +1892,7 @@ void IOService::handlePowerDomainDidChangeTo ( IOPMRequest * request ) newPowerState = fControllingDriver->maxCapabilityForDomainState( fParentsCurrentPowerFlags); - if (fInitialChange) + if (fInitialPowerChange) { initialDesire = fControllingDriver->initialPowerStateForDomainState( fParentsCurrentPowerFlags); @@ -1796,7 +1932,7 @@ void IOService::handlePowerDomainDidChangeTo ( IOPMRequest * request ) if (!savedParentsKnowState && fParentsKnowState) { - PM_TRACE("%s::powerDomainDidChangeTo parentsKnowState = true\n", + PM_LOG1("%s::powerDomainDidChangeTo parentsKnowState = true\n", getName()); requestDomainPower( fDesiredPowerState ); } @@ -1889,7 +2025,7 @@ void IOService::rebuildChildClampBits ( void ) { if (connection->getReadyFlag() == false) { - PM_CONNECT("[%s] %s: connection not ready\n", + PM_LOG3("[%s] %s: connection not ready\n", getName(), __FUNCTION__); continue; } @@ -1919,7 +2055,7 @@ IOReturn IOService::requestPowerDomainState( IOPowerConnection * childConnection, unsigned long specification ) { - unsigned long ps; + IOPMPowerStateIndex ps; IOPMPowerFlags outputPowerFlags; IOService * child; IOPMRequest * subRequest; @@ -1931,7 +2067,7 @@ IOReturn IOService::requestPowerDomainState( if (gIOPMWorkLoop->onThread() == false) { - PM_DEBUG("%s::requestPowerDomainState\n", getName()); + PM_LOG("%s::requestPowerDomainState\n", getName()); return kIOReturnSuccess; } @@ -1941,7 +2077,7 @@ IOReturn IOService::requestPowerDomainState( return kIOReturnNotAttached; if (!fControllingDriver || !fNumberOfPowerStates) - return IOPMNotYetInitialized; + return kIOReturnNotReady; child = (IOService *) childConnection->getChildEntry(gIOPowerPlane); assert(child); @@ -1953,10 +2089,10 @@ IOReturn IOService::requestPowerDomainState( // Merge in the power flags contributed by this power parent // at its current or impending power state. 
- outputPowerFlags = fPowerStates[fCurrentPowerState].outputPowerCharacter; + outputPowerFlags = fPowerStates[fCurrentPowerState].outputPowerFlags; if (fMachineState != kIOPM_Finished) { - if (IS_POWER_DROP && (getPMRootDomain() != this)) + if (IS_POWER_DROP && !IS_ROOT_DOMAIN) { // Use the lower power state when dropping power. // Must be careful since a power drop can be canceled @@ -1967,7 +2103,7 @@ IOReturn IOService::requestPowerDomainState( // The child must not wait for this parent to raise power // if the power drop was cancelled. The solution is to cancel // the power drop if possible, then schedule an adjustment to - // re-evaluate our correct power state. + // re-evaluate the parent's power state. // // Root domain is excluded to avoid idle sleep issues. And permit // root domain children to pop up when system is going to sleep. @@ -1977,14 +2113,14 @@ IOReturn IOService::requestPowerDomainState( { fDoNotPowerDown = true; // cancel power drop adjustPower = true; // schedule an adjustment - PM_TRACE("%s: power drop cancelled in state %u by %s\n", + PM_LOG1("%s: power drop cancelled in state %u by %s\n", getName(), fMachineState, child->getName()); } else { // Beyond cancellation point, report the impending state. outputPowerFlags = - fPowerStates[fHeadNotePowerState].outputPowerCharacter; + fPowerStates[fHeadNotePowerState].outputPowerFlags; } } else if (IS_POWER_RISE) @@ -2006,7 +2142,7 @@ IOReturn IOService::requestPowerDomainState( for (ps = 0; ps < fNumberOfPowerStates; ps++) { - if ((fPowerStates[ps].outputPowerCharacter & childRequestPowerFlags) == + if ((fPowerStates[ps].outputPowerFlags & childRequestPowerFlags) == (fOutputPowerCharacterFlags & childRequestPowerFlags)) break; } @@ -2028,7 +2164,7 @@ IOReturn IOService::requestPowerDomainState( #if ENABLE_DEBUG_LOGS if (adjustPower) { - PM_DEBUG("requestPowerDomainState[%s]: %s, init %d, %u->%u\n", + PM_LOG("requestPowerDomainState[%s]: %s, init %d, %u->%u\n", getName(), child->getName(), !childConnection->childHasRequestedPower(), (uint32_t) childConnection->getDesiredDomainState(), @@ -2049,7 +2185,7 @@ IOReturn IOService::requestPowerDomainState( // adjust power state. Submit a request if one wasn't pending, // or if the current request is part of a call tree. - if (adjustPower && !fDeviceOverrides && + if (adjustPower && !fDeviceOverrideEnabled && (!fAdjustPowerScheduled || gIOPMRequest->getRootRequest())) { subRequest = acquirePMRequest( @@ -2185,8 +2321,8 @@ IOReturn IOService::changePowerStateWithOverrideTo ( unsigned long ordinal ) { fTempClampPowerState = max(fTempClampPowerState, ordinal); fTempClampCount++; - fOverrideMaxPowerState = ordinal; - request->fArg2 = (void *) true; + fOverrideMaxPowerState = ordinal; + request->fArg2 = (void *) (uintptr_t) true; } submitPMRequest( request ); @@ -2228,7 +2364,7 @@ IOReturn IOService::requestPowerState ( { fTempClampPowerState = max(fTempClampPowerState, state); fTempClampCount++; - request->fArg2 = (void *) true; + request->fArg2 = (void *) (uintptr_t) true; } submitPMRequest( request ); @@ -2255,8 +2391,8 @@ void IOService::handleRequestPowerState ( IOPMRequest * request ) if (fNumberOfPowerStates && (state >= fNumberOfPowerStates)) state = fNumberOfPowerStates - 1; - // Override from changePowerStateWithOverrideTo() persists until - // the next "device" power request, such as changePowerStateToPriv(). + // The power suppression due to changePowerStateWithOverrideTo() expires + // upon the next "device" power request - changePowerStateToPriv(). 
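// Usage sketch (hypothetical driver states, illustration only):
//
//   changePowerStateWithOverrideTo( kMyLowState );  // clamp; child/driver desires ignored
//   ...
//   changePowerStateToPriv( kMyHighState );         // next "device" request ends the clamp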
if ((getPMRequestType() != kIOPMRequestTypeRequestPowerStateOverride) && (client == gIOPMPowerClientDevice)) @@ -2317,6 +2453,77 @@ uint32_t IOService::getPowerStateForClient( const OSSymbol * client ) return powerState; } +//********************************************************************************* +// [protected] powerOverrideOnPriv +//********************************************************************************* + +IOReturn IOService::powerOverrideOnPriv ( void ) +{ + IOPMRequest * request; + + if (!initialized) + return IOPMNotYetInitialized; + + if (gIOPMWorkLoop->inGate()) + { + fDeviceOverrideEnabled = true; + return IOPMNoErr; + } + + request = acquirePMRequest( this, kIOPMRequestTypePowerOverrideOnPriv ); + if (!request) + return kIOReturnNoMemory; + + submitPMRequest( request ); + return IOPMNoErr; +} + +//********************************************************************************* +// [protected] powerOverrideOffPriv +//********************************************************************************* + +IOReturn IOService::powerOverrideOffPriv ( void ) +{ + IOPMRequest * request; + + if (!initialized) + return IOPMNotYetInitialized; + + if (gIOPMWorkLoop->inGate()) + { + fDeviceOverrideEnabled = false; + return IOPMNoErr; + } + + request = acquirePMRequest( this, kIOPMRequestTypePowerOverrideOffPriv ); + if (!request) + return kIOReturnNoMemory; + + submitPMRequest( request ); + return IOPMNoErr; +} + +//********************************************************************************* +// [private] handlePowerOverrideChanged +//********************************************************************************* + +void IOService::handlePowerOverrideChanged ( IOPMRequest * request ) +{ + PM_ASSERT_IN_GATE(); + if (request->getType() == kIOPMRequestTypePowerOverrideOnPriv) + { + OUR_PMLog(kPMLogOverrideOn, 0, 0); + fDeviceOverrideEnabled = true; + } + else + { + OUR_PMLog(kPMLogOverrideOff, 0, 0); + fDeviceOverrideEnabled = false; + } + + adjustPowerState(); +} + //********************************************************************************* // [private] computeDesiredState //********************************************************************************* @@ -2335,7 +2542,7 @@ void IOService::computeDesiredState ( unsigned long localClamp ) if (!fNumberOfPowerStates) { fDesiredPowerState = 0; - //PM_DEBUG("%s::%s no controlling driver\n", getName(), __FUNCTION__); + //PM_LOG("%s::%s no controlling driver\n", getName(), __FUNCTION__); return; } @@ -2350,7 +2557,7 @@ void IOService::computeDesiredState ( unsigned long localClamp ) { if (connection->getReadyFlag() == false) { - PM_CONNECT("[%s] %s: connection not ready\n", + PM_LOG3("[%s] %s: connection not ready\n", getName(), __FUNCTION__); continue; } @@ -2376,7 +2583,7 @@ void IOService::computeDesiredState ( unsigned long localClamp ) while ((client = (const OSSymbol *) iter->getNextObject())) { // Ignore child and driver when override is in effect. 
- if ((fDeviceOverrides || + if ((fDeviceOverrideEnabled || (getPMRequestType() == kIOPMRequestTypeRequestPowerStateOverride)) && ((client == gIOPMPowerClientChildren) || (client == gIOPMPowerClientDriver))) @@ -2388,7 +2595,7 @@ desiredState = getPowerStateForClient(client); assert(desiredState < fNumberOfPowerStates); - PM_TRACE(" %u %s\n", + PM_LOG1(" %u %s\n", desiredState, client->getCStringNoCopy()); newPowerState = max(newPowerState, desiredState); @@ -2415,7 +2622,7 @@ fDesiredPowerState = newPowerState; - PM_TRACE(" temp %u, clamp %u, current %u, new %u\n", + PM_LOG1(" temp %u, clamp %u, current %u, new %u\n", (uint32_t) localClamp, (uint32_t) fTempClampPowerState, (uint32_t) fCurrentPowerState, newPowerState); @@ -2466,12 +2673,92 @@ IOWorkLoop * IOService::getPMworkloop ( void ) return gIOPMWorkLoop; } +#if NOT_YET + //********************************************************************************* -// [public] activityTickle -// -// The tickle with parameter kIOPMSuperclassPolicy1 causes the activity -// flag to be set, and the device state checked. If the device has been -// powered down, it is powered up again. +// Power Parent/Children Applier +//********************************************************************************* + +static void +applyToPowerChildren( + IOService * service, + IOServiceApplierFunction applier, + void * context, + IOOptionBits options ) +{ + PM_ASSERT_IN_GATE(); + + IORegistryEntry * entry; + IORegistryIterator * iter; + IOPowerConnection * connection; + IOService * child; + + iter = IORegistryIterator::iterateOver(service, gIOPowerPlane, options); + if (iter) + { + while ((entry = iter->getNextObject())) + { + // Get child of IOPowerConnection objects + if ((connection = OSDynamicCast(IOPowerConnection, entry))) + { + child = (IOService *) connection->copyChildEntry(gIOPowerPlane); + if (child) + { + (*applier)(child, context); + child->release(); + } + } + } + iter->release(); + } +} + +static void +applyToPowerParent( + IOService * service, + IOServiceApplierFunction applier, + void * context, + IOOptionBits options ) +{ + PM_ASSERT_IN_GATE(); + + IORegistryEntry * entry; + IORegistryIterator * iter; + IOPowerConnection * connection; + IOService * parent; + + iter = IORegistryIterator::iterateOver(service, gIOPowerPlane, + options | kIORegistryIterateParents); + if (iter) + { + while ((entry = iter->getNextObject())) + { + // Get parent of IOPowerConnection objects + if ((connection = OSDynamicCast(IOPowerConnection, entry))) + { + parent = (IOService *) connection->copyParentEntry(gIOPowerPlane); + if (parent) + { + (*applier)(parent, context); + parent->release(); + } + } + } + iter->release(); + } +} + +#endif /* NOT_YET */ + +// MARK: - +// MARK: Activity Tickle & Idle Timer + +//********************************************************************************* +// [public] activityTickle + +// +// The tickle with parameter kIOPMSuperclassPolicy1 causes the activity +// flag to be set, and the device state checked. If the device has been +// powered down, it is powered up again. // The tickle with parameter kIOPMSubclassPolicy is ignored here and // should be intercepted by a subclass. 
//********************************************************************************* @@ -2487,14 +2774,11 @@ bool IOService::activityTickle ( unsigned long type, unsigned long stateNumber ) // Record device activity for the idle timer handler. - fDeviceActive = true; + fDeviceWasActive = true; fActivityTickleCount++; clock_get_uptime(&fDeviceActiveTimestamp); -#if ROOT_DOMAIN_RUN_STATES - getPMRootDomain()->handleActivityTickleForService(this, type, - fCurrentPowerState, fActivityTickleCount); -#endif + PM_ACTION_0(actionActivityTickle); // Record the last tickle power state. // This helps to filter out redundant tickles as @@ -2509,7 +2793,7 @@ bool IOService::activityTickle ( unsigned long type, unsigned long stateNumber ) if (request) { request->fArg0 = (void *) stateNumber; // power state - request->fArg1 = (void *) true; // power rise + request->fArg1 = (void *) (uintptr_t) true; // power rise submitPMRequest(request); } } @@ -2558,21 +2842,19 @@ void IOService::handleActivityTickle ( IOPMRequest * request ) } } -//********************************************************************************* +//****************************************************************************** // [public] setIdleTimerPeriod // -// A subclass policy-maker is going to use our standard idleness -// detection service. Make a command queue and an idle timer and -// connect them to the power management workloop. Finally, -// start the timer. -//********************************************************************************* +// A subclass policy-maker is using our standard idleness detection service. +// Start the idle timer. Period is in seconds. +//****************************************************************************** IOReturn IOService::setIdleTimerPeriod ( unsigned long period ) { if (!initialized) return IOPMNotYetInitialized; - OUR_PMLog(kPMLogSetIdleTimerPeriod, period, 0); + OUR_PMLog(kPMLogSetIdleTimerPeriod, period, fIdleTimerPeriod); IOPMRequest * request = acquirePMRequest( this, kIOPMRequestTypeSetIdleTimerPeriod ); @@ -2582,7 +2864,7 @@ IOReturn IOService::setIdleTimerPeriod ( unsigned long period ) request->fArg0 = (void *) period; submitPMRequest( request ); - return IOPMNoErr; + return kIOReturnSuccess; } //****************************************************************************** @@ -2597,10 +2879,10 @@ SInt32 IOService::nextIdleTimeout( AbsoluteTime lastActivity, unsigned int powerState) { - AbsoluteTime delta; - UInt64 delta_ns; - SInt32 delta_secs; - SInt32 delay_secs; + AbsoluteTime delta; + UInt64 delta_ns; + SInt32 delta_secs; + SInt32 delay_secs; // Calculate time difference using funky macro from clock.h. delta = currentTime; @@ -2619,26 +2901,25 @@ SInt32 IOService::nextIdleTimeout( return (SInt32)delay_secs; } -//****************************************************************************** +//********************************************************************************* // [public] start_PM_idle_timer -// -// The parameter is a pointer to us. Use it to call our timeout method. 
-//****************************************************************************** +//********************************************************************************* void IOService::start_PM_idle_timer ( void ) { - static const int maxTimeout = 100000; - static const int minTimeout = 1; - AbsoluteTime uptime; - SInt32 idle_in = 0; + static const int maxTimeout = 100000; + static const int minTimeout = 1; + AbsoluteTime uptime, deadline; + SInt32 idle_in = 0; + boolean_t pending; - if (!initialized || !fIdleTimerPeriod || !fIdleTimerEventSource) + if (!initialized || !fIdleTimerPeriod) return; IOLockLock(fActivityLock); clock_get_uptime(&uptime); - + // Subclasses may modify idle sleep algorithm idle_in = nextIdleTimeout(uptime, fDeviceActiveTimestamp, fCurrentPowerState); @@ -2655,18 +2936,41 @@ void IOService::start_PM_idle_timer ( void ) IOLockUnlock(fActivityLock); - fIdleTimerEventSource->setTimeout(idle_in, NSEC_PER_SEC); + retain(); + clock_interval_to_absolutetime_interval(idle_in, kSecondScale, &deadline); + ADD_ABSOLUTETIME(&deadline, &uptime); + pending = thread_call_enter_delayed(fIdleTimer, deadline); + if (pending) release(); +} + +//********************************************************************************* +// idle_timer_expired +//********************************************************************************* + +static void +idle_timer_expired ( + thread_call_param_t arg0, thread_call_param_t arg1 ) +{ + IOService * me = (IOService *) arg0; + + if (gIOPMWorkLoop) + gIOPMWorkLoop->runAction( + OSMemberFunctionCast(IOWorkLoop::Action, me, + &IOService::idleTimerExpired), + me); + + me->release(); } //********************************************************************************* // [private] idleTimerExpired // -// The idle timer has expired. If there has been activity since the last +// The idle timer has expired. If there has been activity since the last // expiration, just restart the timer and return. If there has not been // activity, switch to the next lower power state and restart the timer. //********************************************************************************* -void IOService::idleTimerExpired( IOTimerEventSource * ) +void IOService::idleTimerExpired( void ) { IOPMRequest * request; bool restartTimer = true; @@ -2678,10 +2982,10 @@ void IOService::idleTimerExpired( IOTimerEventSource * ) // Check for device activity (tickles) over last timer period. - if (fDeviceActive) + if (fDeviceWasActive) { // Device was active - do not drop power, restart timer. 
- fDeviceActive = false; + fDeviceWasActive = false; } else { @@ -2699,7 +3003,7 @@ if (request) { request->fArg0 = (void *) 0; // power state (irrelevant) - request->fArg1 = (void *) false; // power drop + request->fArg1 = (void *) (uintptr_t) false; // power drop submitPMRequest( request ); // Do not restart timer until after the tickle request has been @@ -2798,7 +3102,7 @@ IOReturn IOService::systemWake ( void ) { if (connection->getReadyFlag() == false) { - PM_CONNECT("[%s] %s: connection not ready\n", + PM_LOG3("[%s] %s: connection not ready\n", getName(), __FUNCTION__); continue; } @@ -2836,7 +3140,7 @@ IOReturn IOService::temperatureCriticalForZone ( IOService * whichZone ) OUR_PMLog(kPMLogCriticalTemp, 0, 0); - if ( inPlane(gIOPowerPlane) && !IS_PM_ROOT() ) + if ( inPlane(gIOPowerPlane) && !IS_PM_ROOT ) { theNub = (IOService *)copyParentEntry(gIOPowerPlane); if ( theNub ) @@ -2854,87 +3158,21 @@ } #endif /* !__LP64__ */ -//********************************************************************************* -// [protected] powerOverrideOnPriv -//********************************************************************************* - -IOReturn IOService::powerOverrideOnPriv ( void ) -{ - IOPMRequest * request; - - if (!initialized) - return IOPMNotYetInitialized; - - if (gIOPMWorkLoop->inGate()) - { - fDeviceOverrides = true; - return IOPMNoErr; - } - - request = acquirePMRequest( this, kIOPMRequestTypePowerOverrideOnPriv ); - if (!request) - return kIOReturnNoMemory; - - submitPMRequest( request ); - return IOPMNoErr; -} - -//********************************************************************************* -// [protected] powerOverrideOffPriv -//********************************************************************************* - -IOReturn IOService::powerOverrideOffPriv ( void ) -{ - IOPMRequest * request; - - if (!initialized) - return IOPMNotYetInitialized; - - if (gIOPMWorkLoop->inGate()) - { - fDeviceOverrides = false; - return IOPMNoErr; - } - - request = acquirePMRequest( this, kIOPMRequestTypePowerOverrideOffPriv ); - if (!request) - return kIOReturnNoMemory; - - submitPMRequest( request ); - return IOPMNoErr; -} - -//********************************************************************************* -// [private] handlePowerOverrideChanged -//********************************************************************************* - -void IOService::handlePowerOverrideChanged ( IOPMRequest * request ) -{ - PM_ASSERT_IN_GATE(); - if (request->getType() == kIOPMRequestTypePowerOverrideOnPriv) - { - OUR_PMLog(kPMLogOverrideOn, 0, 0); - fDeviceOverrides = true; - } - else - { - OUR_PMLog(kPMLogOverrideOff, 0, 0); - fDeviceOverrides = false; - } - - adjustPowerState(); -} +// MARK: - +// MARK: Power Change (Common) //********************************************************************************* // [private] startPowerChange + // + // All power state changes start here. 
//********************************************************************************* -IOReturn IOService::startPowerChange ( - unsigned long changeFlags, - unsigned long powerState, - unsigned long domainFlags, - IOPowerConnection * parentConnection, - unsigned long parentFlags ) +IOReturn IOService::startPowerChange( + IOPMPowerChangeFlags changeFlags, + IOPMPowerStateIndex powerState, + IOPMPowerFlags domainFlags, + IOPowerConnection * parentConnection, + IOPMPowerFlags parentFlags ) { PM_ASSERT_IN_GATE(); assert( fMachineState == kIOPM_Finished ); @@ -2943,32 +3181,17 @@ IOReturn IOService::startPowerChange ( if (powerState >= fNumberOfPowerStates) return IOPMAckImplied; -#if ROOT_DOMAIN_RUN_STATES - // Root domain can override chosen power state to a lower state. - getPMRootDomain()->overridePowerStateForService( - this, &fRootDomainState, - &powerState, changeFlags); -#endif - - // Invalidate the last recorded tickle power state when a power transition - // is about to occur, and not as a result of a tickle request. + fIsPreChange = true; + PM_ACTION_2(actionPowerChangeOverride, &powerState, &changeFlags); - if ((getPMRequestType() != kIOPMRequestTypeActivityTickle) && - (fActivityTicklePowerState != -1)) - { - IOLockLock(fActivityLock); - fActivityTicklePowerState = -1; - IOLockUnlock(fActivityLock); - } - - // Initialize the change note. + // Forks to either Driver or Parent initiated power change paths. - fHeadNoteFlags = changeFlags; + fHeadNoteChangeFlags = changeFlags; fHeadNotePowerState = powerState; fHeadNotePowerArrayEntry = &fPowerStates[ powerState ]; fHeadNoteParentConnection = NULL; - if (changeFlags & kIOPMWeInitiated) + if (changeFlags & kIOPMSelfInitiated) { if (changeFlags & kIOPMSynchronize) OurSyncStart(); @@ -2992,70 +3215,68 @@ IOReturn IOService::startPowerChange ( bool IOService::notifyInterestedDrivers ( void ) { - IOPMinformee * informee; - IOPMinformeeList * list = fInterestedDrivers; - DriverCallParam * param; - IOItemCount count; + IOPMinformee * informee; + IOPMinformeeList * list = fInterestedDrivers; + DriverCallParam * param; + IOItemCount count; - PM_ASSERT_IN_GATE(); - assert( fDriverCallParamCount == 0 ); - assert( fHeadNotePendingAcks == 0 ); + PM_ASSERT_IN_GATE(); + assert( fDriverCallParamCount == 0 ); + assert( fHeadNotePendingAcks == 0 ); fHeadNotePendingAcks = 0; - count = list->numberOfItems(); - if (!count) - goto done; // no interested drivers + count = list->numberOfItems(); + if (!count) + goto done; // no interested drivers - // Allocate an array of interested drivers and their return values - // for the callout thread. Everything else is still "owned" by the - // PM work loop, which can run to process acknowledgePowerChange() - // responses. + // Allocate an array of interested drivers and their return values + // for the callout thread. Everything else is still "owned" by the + // PM work loop, which can run to process acknowledgePowerChange() + // responses. 
-	param = (DriverCallParam *) fDriverCallParamPtr;
-	if (count > fDriverCallParamSlots)
-	{
-		if (fDriverCallParamSlots)
-		{
-			assert(fDriverCallParamPtr);
-			IODelete(fDriverCallParamPtr, DriverCallParam, fDriverCallParamSlots);
-			fDriverCallParamPtr = 0;
-			fDriverCallParamSlots = 0;
-		}
+    param = (DriverCallParam *) fDriverCallParamPtr;
+    if (count > fDriverCallParamSlots)
+    {
+        if (fDriverCallParamSlots)
+        {
+            assert(fDriverCallParamPtr);
+            IODelete(fDriverCallParamPtr, DriverCallParam, fDriverCallParamSlots);
+            fDriverCallParamPtr = 0;
+            fDriverCallParamSlots = 0;
+        }
 
-		param = IONew(DriverCallParam, count);
-		if (!param)
-			goto done;	// no memory
+        param = IONew(DriverCallParam, count);
+        if (!param)
+            goto done;	// no memory
 
-		fDriverCallParamPtr   = (void *) param;
-		fDriverCallParamSlots = count;
-	}
+        fDriverCallParamPtr   = (void *) param;
+        fDriverCallParamSlots = count;
+    }
 
-	informee = list->firstInList();
-	assert(informee);
-	for (IOItemCount i = 0; i < count; i++)
-	{
-		informee->timer = -1;
-		param[i].Target = informee;
-		informee->retain();
+    informee = list->firstInList();
+    assert(informee);
+    for (IOItemCount i = 0; i < count; i++)
+    {
+        informee->timer = -1;
+        param[i].Target = informee;
+        informee->retain();
         informee = list->nextInList( informee );
-	}
-
-	fDriverCallParamCount = count;
-	fHeadNotePendingAcks = count;
+    }
 
-	// Machine state will be blocked pending callout thread completion.
+    fDriverCallParamCount = count;
+    fHeadNotePendingAcks = count;
 
-	PM_LOCK();
-	assert( fLockedFlags.DriverCallBusy == false );
-	fLockedFlags.DriverCallBusy = true;
-	PM_UNLOCK();
-	thread_call_enter( fDriverCallEntry );
-	return true;
+    // Block state machine and wait for callout completion.
+    assert(!fDriverCallBusy);
+    fDriverCallBusy = true;
+    thread_call_enter( fDriverCallEntry );
+    return true;
 
 done:
-	// no interested drivers or did not schedule callout thread due to error.
-	return false;
+    // Return false if there are no interested drivers or we could not
+    // schedule the callout thread due to an error.
+    return false;
 }
 
 //*********************************************************************************
@@ -3064,18 +3285,18 @@ done:
 
 void IOService::notifyInterestedDriversDone ( void )
 {
-	IOPMinformee *		informee;
-	IOItemCount			count;
-	DriverCallParam *	param;
-	IOReturn			result;
+    IOPMinformee *      informee;
+    IOItemCount         count;
+    DriverCallParam *   param;
+    IOReturn            result;
 
 	PM_ASSERT_IN_GATE();
+    assert( fDriverCallBusy == false );
+    assert( fMachineState == kIOPM_DriverThreadCallDone );
+
 	param = (DriverCallParam *) fDriverCallParamPtr;
 	count = fDriverCallParamCount;
 
-	assert( fLockedFlags.DriverCallBusy == false );
-	assert( fMachineState == kIOPM_DriverThreadCallDone );
-
 	if (param && count)
 	{
 		for (IOItemCount i = 0; i < count; i++, param++)
@@ -3128,10 +3349,23 @@ void IOService::notifyInterestedDriversDone ( void )
 		}
 	}
 
-	// Hop back to original machine state path (from notifyAll)
-	fMachineState = fNextMachineState;
+	MS_POP();	// pushed by notifyAll()
+
+	// If interest acks are outstanding, wait for fHeadNotePendingAcks to become
+	// zero before notifying children. This enforces the children-after-interest
+	// ordering even for async interest clients.
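+	// MS_PUSH/MS_POP save and restore the state machine position around a
+	// nested notification sequence. A minimal sketch of the idea, assuming a
+	// single saved slot (hypothetical names, not the actual macros):
+	//
+	//     MS_PUSH(n):  assert(saved == invalid); saved = (n);
+	//     MS_POP():    fMachineState = saved; saved = invalid;
+	//
+	// One slot suffices because a service runs at most one such nested
+	// sequence (driver callout or child notification) at a time.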
-	notifyChildren();
+	if (!fHeadNotePendingAcks)
+	{
+		notifyChildren();
+	}
+	else
+	{
+		MS_PUSH(fMachineState);
+		fMachineState = kIOPM_NotifyChildrenStart;
+		PM_LOG2("%s: %u outstanding async interest\n",
+			getName(), fHeadNotePendingAcks);
+	}
 }
 
 //*********************************************************************************
@@ -3144,6 +3378,17 @@ void IOService::notifyChildren ( void )
     OSObject *			next;
     IOPowerConnection *	connection;
     OSArray *			children = 0;
+    IOPMrootDomain *	rootDomain;
+    bool				delayNotify = false;
+
+    if ((fHeadNotePowerState != fCurrentPowerState) &&
+        (IS_POWER_DROP == fIsPreChange) &&
+        ((rootDomain = getPMRootDomain()) == this))
+    {
+        rootDomain->tracePoint( IS_POWER_DROP ?
+            kIOPMTracePointSleepPowerPlaneDrivers :
+            kIOPMTracePointWakePowerPlaneDrivers );
+    }
 
     if (fStrictTreeOrder)
         children = OSArray::withCapacity(8);
@@ -3160,49 +3405,78 @@ void IOService::notifyChildren ( void )
         {
             if (connection->getReadyFlag() == false)
             {
-                PM_CONNECT("[%s] %s: connection not ready\n",
+                PM_LOG3("[%s] %s: connection not ready\n",
                     getName(), __FUNCTION__);
                 continue;
            }
 
-            if (children)
+            // Mechanism to postpone the did-change notification to
+            // certain power children to order those children last.
+            // Cannot be used together with strict tree ordering.
+
+            if (!fIsPreChange &&
+                (connection->delayChildNotification) &&
+                getPMRootDomain()->shouldDelayChildNotification(this))
+            {
+                if (!children)
+                {
+                    children = OSArray::withCapacity(8);
+                    if (children)
+                        delayNotify = true;
+                }
+                if (delayNotify)
+                {
+                    children->setObject( connection );
+                    continue;
+                }
+            }
+
+            if (!delayNotify && children)
                 children->setObject( connection );
            else
-                notifyChild( connection,
-                    fDriverCallReason == kDriverCallInformPreChange );
+                notifyChild( connection );
        }
    }
    iter->release();
    }
 
+    if (children && (children->getCount() == 0))
+    {
+        children->release();
+        children = 0;
+    }
    if (children)
    {
-        if (children->getCount() == 0)
-        {
-            children->release();
-            children = 0;
-        }
-        else
-        {
-            assert(fNotifyChildArray == 0);
-            fNotifyChildArray = children;
-            fNextMachineState = fMachineState;
-            fMachineState     = kIOPM_NotifyChildrenDone;
-        }
+        assert(fNotifyChildArray == 0);
+        fNotifyChildArray = children;
+        MS_PUSH(fMachineState);
+
+        if (delayNotify)
+        {
+            // Wait for existing child notifications to complete,
+            // before notifying the children in the array.
+            fMachineState = kIOPM_NotifyChildrenDelayed;
+            PM_LOG2("%s: %d children in delayed array\n",
+                getName(), children->getCount());
+        }
+        else
+        {
+            // Notify children in the array one at a time.
+            fMachineState = kIOPM_NotifyChildrenOrdered;
        }
    }
 }
 
 //*********************************************************************************
-// [private] notifyChildrenDone
+// [private] notifyChildrenOrdered
 //*********************************************************************************
 
-void IOService::notifyChildrenDone ( void )
+void IOService::notifyChildrenOrdered ( void )
 {
    PM_ASSERT_IN_GATE();
    assert(fNotifyChildArray);
-    assert(fMachineState == kIOPM_NotifyChildrenDone);
+    assert(fMachineState == kIOPM_NotifyChildrenOrdered);
 
-    // Interested drivers have all acked (if any), ack timer stopped.
    // Notify one child, wait for it to ack, then repeat for next child.
// This is a workaround for some drivers with multiple instances at // the same branch in the power tree, but the driver is slow to power @@ -3217,28 +3491,61 @@ void IOService::notifyChildrenDone ( void ) IOPowerConnection * connection; connection = (IOPowerConnection *) fNotifyChildArray->getObject(0); fNotifyChildArray->removeObject(0); - notifyChild( connection, fDriverCallReason == kDriverCallInformPreChange ); + notifyChild( connection ); } else { fNotifyChildArray->release(); fNotifyChildArray = 0; - fMachineState = fNextMachineState; + + MS_POP(); // pushed by notifyChildren() } } +//********************************************************************************* +// [private] notifyChildrenDelayed +//********************************************************************************* + +void IOService::notifyChildrenDelayed ( void ) +{ + IOPowerConnection * connection; + + PM_ASSERT_IN_GATE(); + assert(fNotifyChildArray); + assert(fMachineState == kIOPM_NotifyChildrenDelayed); + + // Wait after all non-delayed children and interested drivers have ack'ed, + // then notify all delayed children. When explicitly cancelled, interest + // acks (and ack timer) may still be outstanding. + + for (int i = 0; ; i++) + { + connection = (IOPowerConnection *) fNotifyChildArray->getObject(i); + if (!connection) + break; + + notifyChild( connection ); + } + + PM_LOG2("%s: notified delayed children\n", getName()); + fNotifyChildArray->release(); + fNotifyChildArray = 0; + + MS_POP(); // pushed by notifyChildren() +} + //********************************************************************************* // [private] notifyAll //********************************************************************************* -IOReturn IOService::notifyAll ( int nextMachineState, bool is_prechange ) +IOReturn IOService::notifyAll ( uint32_t nextMS ) { // Save the next machine_state to be restored by notifyInterestedDriversDone() PM_ASSERT_IN_GATE(); - fNextMachineState = nextMachineState; + MS_PUSH(nextMS); fMachineState = kIOPM_DriverThreadCallDone; - fDriverCallReason = is_prechange ? + fDriverCallReason = fIsPreChange ? 
kDriverCallInformPreChange : kDriverCallInformPostChange; if (!notifyInterestedDrivers()) @@ -3258,16 +3565,15 @@ IOReturn IOService::actionDriverCalloutDone ( void * arg0, void * arg1, void * arg2, void * arg3 ) { - IOServicePM * pwrMgt = (IOServicePM *) arg0; + IOServicePM * pwrMgt = (IOServicePM *) arg0; - PM_LOCK(); - fLockedFlags.DriverCallBusy = false; - PM_UNLOCK(); + assert( fDriverCallBusy ); + fDriverCallBusy = false; - if (gIOPMReplyQueue) - gIOPMReplyQueue->signalWorkAvailable(); + assert(gIOPMWorkQueue); + gIOPMWorkQueue->signalWorkAvailable(); - return kIOReturnSuccess; + return kIOReturnSuccess; } void IOService::pmDriverCallout ( IOService * from ) @@ -3302,27 +3608,35 @@ void IOService::pmDriverCallout ( IOService * from ) void IOService::driverSetPowerState ( void ) { - IOService * driver; - unsigned long powerState; - DriverCallParam * param; - IOReturn result; + IOPMPowerStateIndex powerState; + DriverCallParam * param; + IOPMDriverCallEntry callEntry; AbsoluteTime end; + IOReturn result; + uint32_t oldPowerState = getPowerState(); - assert( fLockedFlags.DriverCallBusy == true ); - param = (DriverCallParam *) fDriverCallParamPtr; - assert( param ); - assert( fDriverCallParamCount == 1 ); + assert( fDriverCallBusy ); + assert( fDriverCallParamPtr ); + assert( fDriverCallParamCount == 1 ); - driver = fControllingDriver; - powerState = fHeadNotePowerState; + param = (DriverCallParam *) fDriverCallParamPtr; + powerState = fHeadNotePowerState; - if (fLockedFlags.PMStop == false) - { - OUR_PMLog( kPMLogProgramHardware, (uintptr_t) this, powerState); + if (assertPMDriverCall(&callEntry)) + { + OUR_PMLog( kPMLogProgramHardware, (uintptr_t) this, powerState); clock_get_uptime(&fDriverCallStartTime); - result = driver->setPowerState( powerState, this ); + result = fControllingDriver->setPowerState( powerState, this ); clock_get_uptime(&end); - OUR_PMLog((UInt32) -kPMLogProgramHardware, (uintptr_t) this, (UInt32) result); + OUR_PMLog((UInt32) -kPMLogProgramHardware, (uintptr_t) this, (UInt32) result); + + deassertPMDriverCall(&callEntry); + + if (result < 0) + { + PM_LOG("%s::setPowerState(%p, %lu -> %lu) returned 0x%x\n", + fName, this, fCurrentPowerState, powerState, result); + } #if LOG_SETPOWER_TIMES if ((result == IOPMAckImplied) || (result < 0)) @@ -3332,15 +3646,27 @@ void IOService::driverSetPowerState ( void ) SUB_ABSOLUTETIME(&end, &fDriverCallStartTime); absolutetime_to_nanoseconds(end, &nsec); if (nsec > LOG_SETPOWER_TIMES) - PM_DEBUG("%s::setPowerState(%p, %lu -> %lu) took %d ms\n", + PM_LOG("%s::setPowerState(%p, %lu -> %lu) took %d ms\n", fName, this, fCurrentPowerState, powerState, NS_TO_MS(nsec)); + + PMEventDetails *details = PMEventDetails::eventDetails( + kIOPMEventTypeSetPowerStateImmediate, // type + fName, // who + (uintptr_t)this, // owner unique + NULL, // interest name + (uint8_t)oldPowerState, // old + (uint8_t)powerState, // new + 0, // result + NS_TO_US(nsec)); // usec completion time + + getPMRootDomain()->recordAndReleasePMEventGated( details ); } #endif - } - else - result = kIOPMAckImplied; + } + else + result = kIOPMAckImplied; - param->Result = result; + param->Result = result; } //********************************************************************************* @@ -3351,46 +3677,51 @@ void IOService::driverSetPowerState ( void ) void IOService::driverInformPowerChange ( void ) { - IOItemCount count; - IOPMinformee * informee; - IOService * driver; - IOReturn result; - IOPMPowerFlags powerFlags; - unsigned long powerState; - DriverCallParam * 
param; + IOPMinformee * informee; + IOService * driver; + DriverCallParam * param; + IOPMDriverCallEntry callEntry; + IOPMPowerFlags powerFlags; + IOPMPowerStateIndex powerState; AbsoluteTime end; + IOReturn result; + IOItemCount count; - assert( fLockedFlags.DriverCallBusy == true ); - param = (DriverCallParam *) fDriverCallParamPtr; - count = fDriverCallParamCount; - assert( count && param ); + assert( fDriverCallBusy ); + assert( fDriverCallParamPtr ); + assert( fDriverCallParamCount ); - powerFlags = fHeadNotePowerArrayEntry->capabilityFlags; - powerState = fHeadNotePowerState; + param = (DriverCallParam *) fDriverCallParamPtr; + count = fDriverCallParamCount; - for (IOItemCount i = 0; i < count; i++) - { - informee = (IOPMinformee *) param->Target; - driver = informee->whatObject; + powerFlags = fHeadNotePowerArrayEntry->capabilityFlags; + powerState = fHeadNotePowerState; - if ((fLockedFlags.PMStop == false) && informee->active) - { - if (fDriverCallReason == kDriverCallInformPreChange) - { - OUR_PMLog(kPMLogInformDriverPreChange, (uintptr_t) this, powerState); - clock_get_uptime(&informee->startTime); - result = driver->powerStateWillChangeTo(powerFlags, powerState, this); - clock_get_uptime(&end); - OUR_PMLog((UInt32)-kPMLogInformDriverPreChange, (uintptr_t) this, result); - } - else - { - OUR_PMLog(kPMLogInformDriverPostChange, (uintptr_t) this, powerState); + for (IOItemCount i = 0; i < count; i++) + { + informee = (IOPMinformee *) param->Target; + driver = informee->whatObject; + + if (assertPMDriverCall(&callEntry, 0, informee)) + { + if (fDriverCallReason == kDriverCallInformPreChange) + { + OUR_PMLog(kPMLogInformDriverPreChange, (uintptr_t) this, powerState); clock_get_uptime(&informee->startTime); - result = driver->powerStateDidChangeTo(powerFlags, powerState, this); + result = driver->powerStateWillChangeTo(powerFlags, powerState, this); clock_get_uptime(&end); - OUR_PMLog((UInt32)-kPMLogInformDriverPostChange, (uintptr_t) this, result); - } + OUR_PMLog((UInt32)-kPMLogInformDriverPreChange, (uintptr_t) this, result); + } + else + { + OUR_PMLog(kPMLogInformDriverPostChange, (uintptr_t) this, powerState); + clock_get_uptime(&informee->startTime); + result = driver->powerStateDidChangeTo(powerFlags, powerState, this); + clock_get_uptime(&end); + OUR_PMLog((UInt32)-kPMLogInformDriverPostChange, (uintptr_t) this, result); + } + + deassertPMDriverCall(&callEntry); #if LOG_SETPOWER_TIMES if ((result == IOPMAckImplied) || (result < 0)) @@ -3400,19 +3731,35 @@ void IOService::driverInformPowerChange ( void ) SUB_ABSOLUTETIME(&end, &informee->startTime); absolutetime_to_nanoseconds(end, &nsec); if (nsec > LOG_SETPOWER_TIMES) - PM_DEBUG("%s::powerState%sChangeTo(%p, %s, %lu -> %lu) took %d ms\n", + PM_LOG("%s::powerState%sChangeTo(%p, %s, %lu -> %lu) took %d ms\n", driver->getName(), (fDriverCallReason == kDriverCallInformPreChange) ? "Will" : "Did", driver, fName, fCurrentPowerState, powerState, NS_TO_MS(nsec)); + + uint16_t logType = (fDriverCallReason == kDriverCallInformPreChange) + ? 
kIOPMEventTypePSWillChangeTo
+								: kIOPMEventTypePSDidChangeTo;
+
+				PMEventDetails *details = PMEventDetails::eventDetails(
+									logType,						// type
+									fName,							// who
+									(uintptr_t)this,				// owner unique
+									driver->getName(),				// interest name
+									(uint8_t)fCurrentPowerState,	// old
+									(uint8_t)fHeadNotePowerState,	// new
+									0,								// result
+									NS_TO_US(nsec));				// usec completion time
+
+				getPMRootDomain()->recordAndReleasePMEventGated( details );
			}
 #endif
-		}
-		else
-			result = kIOPMAckImplied;
+		}
+		else
+			result = kIOPMAckImplied;
 
-		param->Result = result;
-		param++;
-	}
+		param->Result = result;
+		param++;
+	}
 }
 
 //*********************************************************************************
@@ -3422,14 +3769,14 @@ void IOService::driverInformPowerChange ( void )
 // If the object acknowledges the current change, we return TRUE.
 //*********************************************************************************
 
-bool IOService::notifyChild ( IOPowerConnection * theNub, bool is_prechange )
+bool IOService::notifyChild ( IOPowerConnection * theNub )
 {
-	IOReturn		ret = IOPMAckImplied;
-	unsigned long	childPower;
-	IOService *		theChild;
-	IOPMRequest *	childRequest;
-	uint32_t		requestArg2;
-	int				requestType;
+	IOReturn				ret = IOPMAckImplied;
+	unsigned long			childPower;
+	IOService *				theChild;
+	IOPMRequest *			childRequest;
+	IOPMPowerChangeFlags	requestArg2;
+	int						requestType;
 
 	PM_ASSERT_IN_GATE();
 	theChild = (IOService *)(theNub->copyChildEntry(gIOPowerPlane));
@@ -3444,11 +3791,11 @@ bool IOService::notifyChild ( IOPowerConnection * theNub, bool is_prechange )
 	fHeadNotePendingAcks++;
 	theNub->setAwaitingAck(true);
 
-	requestArg2 = fHeadNoteFlags;
+	requestArg2 = fHeadNoteChangeFlags;
 	if (fHeadNotePowerState < fCurrentPowerState)
 		requestArg2 |= kIOPMDomainPowerDrop;
 
-	requestType = is_prechange ?
+	requestType = fIsPreChange ?
 		kIOPMRequestTypePowerDomainWillChange :
 		kIOPMRequestTypePowerDomainDidChange;
 
@@ -3456,7 +3803,7 @@ bool IOService::notifyChild ( IOPowerConnection * theNub, bool is_prechange )
 	if (childRequest)
 	{
 		theNub->retain();
-		childRequest->fArg0 = (void *) fHeadNotePowerArrayEntry->outputPowerCharacter;
+		childRequest->fArg0 = (void *) fHeadNotePowerArrayEntry->outputPowerFlags;
 		childRequest->fArg1 = (void *) theNub;
 		childRequest->fArg2 = (void *) requestArg2;
 		theChild->submitPMRequest( childRequest );
@@ -3481,6 +3828,246 @@ bool IOService::notifyChild ( IOPowerConnection * theNub, bool is_prechange )
 	return (IOPMAckImplied == ret);
 }
 
+//*********************************************************************************
+// [private] notifyControllingDriver
+//*********************************************************************************
+
+bool IOService::notifyControllingDriver ( void )
+{
+	DriverCallParam *	param;
+
+	PM_ASSERT_IN_GATE();
+	assert( fDriverCallParamCount == 0 );
+	assert( fControllingDriver );
+
+	if (fInitialSetPowerState)
+	{
+		// Driver-specified flag to skip the initial setPowerState()
+		if (fHeadNotePowerArrayEntry->capabilityFlags & kIOPMInitialDeviceState)
+		{
+			return false;
+		}
+		fInitialSetPowerState = false;
+	}
+
+	param = (DriverCallParam *) fDriverCallParamPtr;
+	if (!param)
+	{
+		param = IONew(DriverCallParam, 1);
+		if (!param)
+			return false;	// no memory
+
+		fDriverCallParamPtr   = (void *) param;
+		fDriverCallParamSlots = 1;
+	}
+
+	param->Target = fControllingDriver;
+	fDriverCallParamCount = 1;
+	fDriverTimer = -1;
+
+	// Block state machine and wait for callout completion.
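+	// The handshake with the callout thread, in outline (a sketch; see
+	// pmDriverCallout() and actionDriverCalloutDone() in this file):
+	//
+	//     work loop:   fDriverCallBusy = true;
+	//                  thread_call_enter(fDriverCallEntry);
+	//                  fMachineState = kIOPM_DriverThreadCallDone;  // parked
+	//     callout:     driverSetPowerState() stores param->Result
+	//     completion:  fDriverCallBusy = false;
+	//                  gIOPMWorkQueue->signalWorkAvailable();       // resume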
+ assert(!fDriverCallBusy); + fDriverCallBusy = true; + thread_call_enter( fDriverCallEntry ); + + return true; +} + +//********************************************************************************* +// [private] notifyControllingDriverDone +//********************************************************************************* + +void IOService::notifyControllingDriverDone( void ) +{ + DriverCallParam * param; + IOReturn result; + + PM_ASSERT_IN_GATE(); + param = (DriverCallParam *) fDriverCallParamPtr; + + assert( fDriverCallBusy == false ); + assert( fMachineState == kIOPM_DriverThreadCallDone ); + + if (param && fDriverCallParamCount) + { + assert(fDriverCallParamCount == 1); + + // the return value from setPowerState() + result = param->Result; + + if ((result == IOPMAckImplied) || (result < 0)) + { + fDriverTimer = 0; + } + else if (fDriverTimer) + { + assert(fDriverTimer == -1); + + // Driver has not acked, and has returned a positive result. + // Enforce a minimum permissible timeout value. + // Make the min value large enough so timeout is less likely + // to occur if a driver misinterpreted that the return value + // should be in microsecond units. And make it large enough + // to be noticeable if a driver neglects to ack. + + if (result < kMinAckTimeoutTicks) + result = kMinAckTimeoutTicks; + + fDriverTimer = (result / (ACK_TIMER_PERIOD / ns_per_us)) + 1; + } + // else, child has already acked and driver_timer reset to 0. + + fDriverCallParamCount = 0; + + if ( fDriverTimer ) + { + OUR_PMLog(kPMLogStartAckTimer, 0, 0); + start_ack_timer(); + } + } + + MS_POP(); // pushed by OurChangeSetPowerState() + fIsPreChange = false; +} + +//********************************************************************************* +// [private] all_done +// +// A power change is done. +//********************************************************************************* + +void IOService::all_done ( void ) +{ + IOPMPowerStateIndex prevPowerState; + const IOPMPSEntry * powerStatePtr; + IOPMDriverCallEntry callEntry; + uint32_t prevMachineState = fMachineState; + bool callAction = false; + + fMachineState = kIOPM_Finished; + + if ((fHeadNoteChangeFlags & kIOPMSynchronize) && + ((prevMachineState == kIOPM_Finished) || + (prevMachineState == kIOPM_SyncFinish))) + { + // Sync operation and no power change occurred. + // Do not inform driver and clients about this request completion, + // except for the originator (root domain). + + PM_ACTION_2(actionPowerChangeDone, + fHeadNotePowerState, fHeadNoteChangeFlags); + + if (getPMRequestType() == kIOPMRequestTypeSynchronizePowerTree) + { + powerChangeDone(fCurrentPowerState); + } + + return; + } + + // our power change + if ( fHeadNoteChangeFlags & kIOPMSelfInitiated ) + { + // could our driver switch to the new state? + if ( !( fHeadNoteChangeFlags & kIOPMNotDone) ) + { + // we changed, tell our parent + requestDomainPower(fHeadNotePowerState); + + // yes, did power raise? 
+ if ( fCurrentPowerState < fHeadNotePowerState ) + { + // yes, inform clients and apps + tellChangeUp (fHeadNotePowerState); + } + prevPowerState = fCurrentPowerState; + // either way + fCurrentPowerState = fHeadNotePowerState; +#if PM_VARS_SUPPORT + fPMVars->myCurrentState = fCurrentPowerState; +#endif + OUR_PMLog(kPMLogChangeDone, fCurrentPowerState, 0); + PM_ACTION_2(actionPowerChangeDone, + fHeadNotePowerState, fHeadNoteChangeFlags); + callAction = true; + + powerStatePtr = &fPowerStates[fCurrentPowerState]; + fCurrentCapabilityFlags = powerStatePtr->capabilityFlags; + if (fCurrentCapabilityFlags & kIOPMStaticPowerValid) + fCurrentPowerConsumption = powerStatePtr->staticPower; + + // inform subclass policy-maker + if (fPCDFunctionOverride && fParentsKnowState && + assertPMDriverCall(&callEntry, kIOPMADC_NoInactiveCheck)) + { + powerChangeDone(prevPowerState); + deassertPMDriverCall(&callEntry); + } + } + else if (getPMRequestType() == kIOPMRequestTypeRequestPowerStateOverride) + { + // changePowerStateWithOverrideTo() was cancelled + fOverrideMaxPowerState = kIOPMPowerStateMax; + } + } + + // parent's power change + if ( fHeadNoteChangeFlags & kIOPMParentInitiated) + { + if (((fHeadNoteChangeFlags & kIOPMDomainWillChange) && + (fCurrentPowerState >= fHeadNotePowerState)) || + ((fHeadNoteChangeFlags & kIOPMDomainDidChange) && + (fCurrentPowerState < fHeadNotePowerState))) + { + // did power raise? + if ( fCurrentPowerState < fHeadNotePowerState ) + { + // yes, inform clients and apps + tellChangeUp (fHeadNotePowerState); + } + // either way + prevPowerState = fCurrentPowerState; + fCurrentPowerState = fHeadNotePowerState; +#if PM_VARS_SUPPORT + fPMVars->myCurrentState = fCurrentPowerState; +#endif + fMaxPowerState = fControllingDriver->maxCapabilityForDomainState(fHeadNoteDomainFlags); + + OUR_PMLog(kPMLogChangeDone, fCurrentPowerState, 0); + PM_ACTION_2(actionPowerChangeDone, + fHeadNotePowerState, fHeadNoteChangeFlags); + callAction = true; + + powerStatePtr = &fPowerStates[fCurrentPowerState]; + fCurrentCapabilityFlags = powerStatePtr->capabilityFlags; + if (fCurrentCapabilityFlags & kIOPMStaticPowerValid) + fCurrentPowerConsumption = powerStatePtr->staticPower; + + // inform subclass policy-maker + if (fPCDFunctionOverride && fParentsKnowState && + assertPMDriverCall(&callEntry, kIOPMADC_NoInactiveCheck)) + { + powerChangeDone(prevPowerState); + deassertPMDriverCall(&callEntry); + } + } + } + + // When power rises enough to satisfy the tickle's desire for more power, + // the condition preventing idle-timer from dropping power is removed. + + if (fCurrentPowerState >= fIdleTimerMinPowerState) + { + fIdleTimerMinPowerState = 0; + } + + if (!callAction) + { + PM_ACTION_2(actionPowerChangeDone, + fHeadNotePowerState, fHeadNoteChangeFlags); + } +} + // MARK: - // MARK: Power Change Initiated by Driver @@ -3495,13 +4082,13 @@ void IOService::OurChangeStart ( void ) PM_ASSERT_IN_GATE(); OUR_PMLog( kPMLogStartDeviceChange, fHeadNotePowerState, fCurrentPowerState ); - // fMaxCapability is our maximum possible power state based on the current + // fMaxPowerState is our maximum possible power state based on the current // power state of our parents. If we are trying to raise power beyond the // maximum, send an async request for more power to all parents. 
-	if (!IS_PM_ROOT() && (fMaxCapability < fHeadNotePowerState))
+	if (!IS_PM_ROOT && (fMaxPowerState < fHeadNotePowerState))
 	{
-		fHeadNoteFlags |= kIOPMNotDone;
+		fHeadNoteChangeFlags |= kIOPMNotDone;
 		requestDomainPower(fHeadNotePowerState);
 		OurChangeFinish();
 		return;
@@ -3509,36 +4096,29 @@ void IOService::OurChangeStart ( void )
 
 	// Redundant power changes skip to the end of the state machine.
 
-	if (!fInitialChange && (fHeadNotePowerState == fCurrentPowerState))
+	if (!fInitialPowerChange && (fHeadNotePowerState == fCurrentPowerState))
 	{
 		OurChangeFinish();
 		return;
 	}
 
-	fInitialChange = false;
+	fInitialPowerChange = false;
 
-#if ROOT_DOMAIN_RUN_STATES
 	// Change started, but may not complete...
 	// Can be canceled (power drop) or deferred (power rise).
 
-	getPMRootDomain()->handlePowerChangeStartForService(
-					/* service */		this,
-					/* RD flags */		&fRootDomainState,
-					/* new pwr state */	fHeadNotePowerState,
-					/* change flags */	fHeadNoteFlags );
-#endif
+	PM_ACTION_2(actionPowerChangeStart, fHeadNotePowerState, &fHeadNoteChangeFlags);
 
 	// Two separate paths, depending if power is being raised or lowered.
 	// Lowering power is subject to approval by clients of this service.
 
 	if (IS_POWER_DROP)
 	{
-		// Next machine state for a power drop.
-		fMachineState = kIOPM_OurChangeTellClientsPowerDown;
 		fDoNotPowerDown = false;
 
-		// Ask apps and kernel clients permission to lower power.
+		// Ask for permission to drop power state
+		fMachineState = kIOPM_OurChangeTellClientsPowerDown;
 		fOutOfBandParameter = kNotifyApps;
-		askChangeDown(fHeadNotePowerState);
+		askChangeDown(fHeadNotePowerState);
 	}
 	else
 	{
@@ -3554,7 +4134,6 @@ void IOService::OurChangeStart ( void )
 		// then the child will signal the parent to adjust power, and the child
 		// will defer its power change.
 
-#if RESERVE_DOMAIN_POWER
 		IOReturn ret;
 
 		// Reserve parent power necessary to achieve fHeadNotePowerState.
@@ -3562,13 +4141,12 @@ void IOService::OurChangeStart ( void )
 		if (ret != kIOReturnSuccess)
 		{
 			// Reservation failed, defer power rise.
-			fHeadNoteFlags |= kIOPMNotDone;
+			fHeadNoteChangeFlags |= kIOPMNotDone;
 			OurChangeFinish();
 			return;
 		}
-#endif
-		// Notify interested drivers and children.
-		notifyAll( kIOPM_OurChangeSetPowerState, kNotifyWillChange );
+
+		OurChangeTellCapabilityWillChange();
 	}
 }
 
 //*********************************************************************************
@@ -3613,26 +4191,26 @@ requestDomainPowerApplier(
 //*********************************************************************************
 
 IOReturn IOService::requestDomainPower(
-	unsigned long	ourPowerState,
-	IOOptionBits	options )
+	IOPMPowerStateIndex	ourPowerState,
+	IOOptionBits		options )
 {
-	const IOPMPowerState *			powerStateEntry;
+	const IOPMPSEntry *				powerStateEntry;
 	IOPMPowerFlags					requestPowerFlags;
-	unsigned long					maxPowerState;
+	IOPMPowerStateIndex				maxPowerState;
 	IOPMRequestDomainPowerContext	context;
 
 	PM_ASSERT_IN_GATE();
 	assert(ourPowerState < fNumberOfPowerStates);
 	if (ourPowerState >= fNumberOfPowerStates)
 		return kIOReturnBadArgument;
-	if (IS_PM_ROOT())
+	if (IS_PM_ROOT)
 		return kIOReturnSuccess;
 
 	// Fetch the input power flags for the requested power state.
 	// Parent request is stated in terms of required power flags.
 
	powerStateEntry = &fPowerStates[ourPowerState];
-	requestPowerFlags = powerStateEntry->inputPowerRequirement;
+	requestPowerFlags = powerStateEntry->inputPowerFlags;
 
 	if (powerStateEntry->capabilityFlags & (kIOPMChildClamp | kIOPMPreventIdleSleep))
 		requestPowerFlags |= kIOPMPreventIdleSleep;
@@ -3642,12 +4220,12 @@ IOReturn IOService::requestDomainPower(
 
 	// Disregard the "previous request" for power reservation.
 	if (((options & kReserveDomainPower) == 0) &&
-		(fPreviousRequest == requestPowerFlags))
+		(fPreviousRequestPowerFlags == requestPowerFlags))
 	{
 		// skip if domain already knows our requirements
 		goto done;
 	}
-	fPreviousRequest = requestPowerFlags;
+	fPreviousRequestPowerFlags = requestPowerFlags;
 
 	context.child              = this;
 	context.requestPowerFlags  = requestPowerFlags;
@@ -3661,7 +4239,7 @@ IOReturn IOService::requestDomainPower(
 
 	if (maxPowerState < fHeadNotePowerState)
 	{
-		PM_TRACE("%s: power desired %u:0x%x got %u:0x%x\n",
+		PM_LOG1("%s: power desired %u:0x%x got %u:0x%x\n",
 			getName(),
 			(uint32_t) ourPowerState, (uint32_t) requestPowerFlags,
 			(uint32_t) maxPowerState, (uint32_t) fHeadNoteDomainTargetFlags);
@@ -3681,30 +4259,38 @@ void IOService::OurSyncStart ( void )
 {
 	PM_ASSERT_IN_GATE();
 
-	if (fInitialChange)
+	if (fInitialPowerChange)
 		return;
 
-#if ROOT_DOMAIN_RUN_STATES
-	getPMRootDomain()->handlePowerChangeStartForService(
-					/* service */		this,
-					/* RD flags */		&fRootDomainState,
-					/* new pwr state */	fHeadNotePowerState,
-					/* change flags */	fHeadNoteFlags );
-#endif
+	PM_ACTION_2(actionPowerChangeStart, fHeadNotePowerState, &fHeadNoteChangeFlags);
+
+	if (fHeadNoteChangeFlags & kIOPMNotDone)
+	{
+		OurChangeFinish();
+		return;
+	}
 
-	fMachineState     = kIOPM_SyncNotifyDidChange;
-	fDriverCallReason = kDriverCallInformPreChange;
+	if (fHeadNoteChangeFlags & kIOPMSyncTellPowerDown)
+	{
+		fDoNotPowerDown = false;
 
-	notifyChildren();
+		// Ask for permission to drop power state
+		fMachineState = kIOPM_SyncTellClientsPowerDown;
+		fOutOfBandParameter = kNotifyApps;
+		askChangeDown(fHeadNotePowerState);
+	}
+	else
+	{
+		// Only inform capability apps and clients.
+		tellSystemCapabilityChange( kIOPM_SyncNotifyWillChange );
+	}
 }
 
 //*********************************************************************************
 // [private] OurChangeTellClientsPowerDown
 //
-// All registered applications and kernel clients have positively acknowledged our
-// intention of lowering power. Here we notify them all that we will definitely
-// lower the power. If we don't have to wait for any of them to acknowledge, we
-// carry on by notifying interested drivers. Otherwise, we do wait.
+// All applications and kernel clients have acknowledged our intention to drop
+// power. Here we notify them that we will lower the power and wait for acks.
 //*********************************************************************************
 
 void IOService::OurChangeTellClientsPowerDown ( void )
@@ -3716,10 +4302,8 @@ void IOService::OurChangeTellClientsPowerDown ( void )
 //*********************************************************************************
 // [private] OurChangeTellPriorityClientsPowerDown
 //
-// All registered applications and kernel clients have positively acknowledged our
-// intention of lowering power. Here we notify "priority" clients that we are
-// lowering power. If we don't have to wait for any of them to acknowledge, we
-// carry on by notifying interested drivers. Otherwise, we do wait.
+// All applications and kernel clients have acknowledged our intention to drop
+// power. Here we notify "priority" clients that we are lowering power.
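+//
+// For reference, the self-initiated power-drop sequence in outline (each step
+// is a machine state in this file; a summary, not new behavior):
+//
+//     askChangeDown()   - apps may veto via fDoNotPowerDown
+//     tellChangeDown1() - apps told the drop is definite
+//     tellChangeDown2() - priority clients told the drop is definite
+//     notifyAll()       - interested drivers, then children (pre-change)
+//     setPowerState()   - controlling driver programs the hardware
+//     notifyAll()       - interested drivers, then children (post-change)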
//********************************************************************************* void IOService::OurChangeTellPriorityClientsPowerDown ( void ) @@ -3728,80 +4312,123 @@ void IOService::OurChangeTellPriorityClientsPowerDown ( void ) tellChangeDown2(fHeadNotePowerState); } +//********************************************************************************* +// [private] OurChangeTellCapabilityWillChange +// +// Extra stage for root domain to notify apps and drivers about the +// system capability change when raising power state. +//********************************************************************************* + +void IOService::OurChangeTellCapabilityWillChange ( void ) +{ + if (!IS_ROOT_DOMAIN) + return OurChangeNotifyInterestedDriversWillChange(); + + tellSystemCapabilityChange( kIOPM_OurChangeNotifyInterestedDriversWillChange ); +} + //********************************************************************************* // [private] OurChangeNotifyInterestedDriversWillChange // -// All registered applications and kernel clients have acknowledged our notification -// that we are lowering power. Here we notify interested drivers. If we don't have -// to wait for any of them to acknowledge, we instruct our power driver to make the -// change. Otherwise, we do wait. +// All applications and kernel clients have acknowledged our power state change. +// Here we notify interested drivers pre-change. //********************************************************************************* void IOService::OurChangeNotifyInterestedDriversWillChange ( void ) { - IOPMrootDomain *rootDomain; + IOPMrootDomain * rootDomain; if ((rootDomain = getPMRootDomain()) == this) { - rootDomain->tracePoint(kIOPMTracePointSystemSleepDriversPhase); + if (IS_POWER_DROP) + { + rootDomain->tracePoint( kIOPMTracePointSleepWillChangeInterests ); + + PMEventDetails *details = PMEventDetails::eventDetails( + kIOPMEventTypeAppNotificationsFinished, + NULL, + 100, + kIOReturnSuccess); + rootDomain->recordAndReleasePMEventGated( details ); + } + else + rootDomain->tracePoint( kIOPMTracePointWakeWillChangeInterests ); } - notifyAll( kIOPM_OurChangeSetPowerState, kNotifyWillChange ); + notifyAll( kIOPM_OurChangeSetPowerState ); } //********************************************************************************* // [private] OurChangeSetPowerState // -// All interested drivers have acknowledged our pre-change notification of a power -// change we initiated. Here we instruct our controlling driver to make -// the change to the hardware. If it does so, we continue processing -// (waiting for settle and notifying interested parties post-change.) -// If it doesn't, we have to wait for it to acknowledge and then continue. +// Instruct our controlling driver to program the hardware for the power state +// change. Wait for async completions. 
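+//
+// setPowerState() returns either IOPMAckImplied or an upper-bound completion
+// time in microseconds. notifyControllingDriverDone() converts the latter to
+// ack timer ticks, roughly (restating its computation; ACK_TIMER_PERIOD is
+// expressed in nanoseconds):
+//
+//     result = max(result, kMinAckTimeoutTicks);
+//     fDriverTimer = result / (ACK_TIMER_PERIOD / ns_per_us) + 1;
+//
+// so a driver that confuses units still gets a noticeable, bounded timeout.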
//********************************************************************************* void IOService::OurChangeSetPowerState ( void ) { - fNextMachineState = kIOPM_OurChangeWaitForPowerSettle; - fMachineState = kIOPM_DriverThreadCallDone; - fDriverCallReason = kDriverCallSetPowerState; + MS_PUSH( kIOPM_OurChangeWaitForPowerSettle ); + fMachineState = kIOPM_DriverThreadCallDone; + fDriverCallReason = kDriverCallSetPowerState; - if (notifyControllingDriver() == false) - notifyControllingDriverDone(); + if (notifyControllingDriver() == false) + notifyControllingDriverDone(); } //********************************************************************************* // [private] OurChangeWaitForPowerSettle // -// Our controlling driver has changed power state on the hardware -// during a power change we initiated. Wait for the driver specified -// settle time to expire, before notifying interested parties post-change. +// Our controlling driver has completed the power state change we initiated. +// Wait for the driver specified settle time to expire. //********************************************************************************* -void IOService::OurChangeWaitForPowerSettle( void ) +void IOService::OurChangeWaitForPowerSettle ( void ) { - fMachineState = kIOPM_OurChangeNotifyInterestedDriversDidChange; + fMachineState = kIOPM_OurChangeNotifyInterestedDriversDidChange; startSettleTimer(); } //********************************************************************************* // [private] OurChangeNotifyInterestedDriversDidChange // -// Power has settled on a power change we initiated. Here we notify -// all our interested parties post-change. If they all acknowledge, we're -// done with this change note, and we can start on the next one. -// Otherwise we have to wait for acknowledgements and finish up later. +// Power has settled on a power change we initiated. Here we notify +// all our interested drivers post-change. //********************************************************************************* void IOService::OurChangeNotifyInterestedDriversDidChange ( void ) { - notifyAll( kIOPM_OurChangeFinish, kNotifyDidChange ); + IOPMrootDomain * rootDomain; + if ((rootDomain = getPMRootDomain()) == this) + { + rootDomain->tracePoint( IS_POWER_DROP ? + kIOPMTracePointSleepDidChangeInterests : + kIOPMTracePointWakeDidChangeInterests ); + } + + notifyAll( kIOPM_OurChangeTellCapabilityDidChange ); +} + +//********************************************************************************* +// [private] OurChangeTellCapabilityDidChange +// +// For root domain to notify capability power-change. +//********************************************************************************* + +void IOService::OurChangeTellCapabilityDidChange ( void ) +{ + if (!IS_ROOT_DOMAIN) + return OurChangeFinish(); + + getPMRootDomain()->tracePoint( IS_POWER_DROP ? + kIOPMTracePointSleepCapabilityClients : + kIOPMTracePointWakeCapabilityClients ); + + tellSystemCapabilityChange( kIOPM_OurChangeFinish ); } //********************************************************************************* // [private] OurChangeFinish // -// Power has settled on a power change we initiated, and -// all our interested parties have acknowledged. We're -// done with this change note, and we can start on the next one. +// Done with this self-induced power state change. 
//********************************************************************************* void IOService::OurChangeFinish ( void ) @@ -3829,17 +4456,11 @@ IOReturn IOService::ParentChangeStart ( void ) // TODO: redundant? See handlePowerDomainWillChangeTo() setParentInfo( fHeadNoteParentFlags, fHeadNoteParentConnection, true ); -#if ROOT_DOMAIN_RUN_STATES - getPMRootDomain()->handlePowerChangeStartForService( - /* service */ this, - /* RD flags */ &fRootDomainState, - /* new pwr state */ fHeadNotePowerState, - /* change flags */ fHeadNoteFlags ); -#endif + PM_ACTION_2(actionPowerChangeStart, fHeadNotePowerState, &fHeadNoteChangeFlags); - // tell apps and kernel clients - fInitialChange = false; - fMachineState = kIOPM_ParentDownTellPriorityClientsPowerDown; + // Tell apps and kernel clients + fInitialPowerChange = false; + fMachineState = kIOPM_ParentChangeTellPriorityClientsPowerDown; tellChangeDown1(fHeadNotePowerState); return IOPMWillAckLater; } @@ -3864,24 +4485,19 @@ IOReturn IOService::ParentChangeStart ( void ) } } - if ( fHeadNoteFlags & kIOPMDomainDidChange ) + if ( fHeadNoteChangeFlags & kIOPMDomainDidChange ) { if ( fHeadNotePowerState > fCurrentPowerState ) { -#if ROOT_DOMAIN_RUN_STATES - getPMRootDomain()->handlePowerChangeStartForService( - /* service */ this, - /* RD flags */ &fRootDomainState, - /* new pwr state */ fHeadNotePowerState, - /* change flags */ fHeadNoteFlags ); -#endif + PM_ACTION_2(actionPowerChangeStart, + fHeadNotePowerState, &fHeadNoteChangeFlags); // Parent did change up - start our change up - fInitialChange = false; - notifyAll( kIOPM_ParentUpSetPowerState, kNotifyWillChange ); + fInitialPowerChange = false; + ParentChangeTellCapabilityWillChange(); return IOPMWillAckLater; } - else if (fHeadNoteFlags & kIOPMSynchronize) + else if (fHeadNoteChangeFlags & kIOPMSynchronize) { // We do not need to change power state, but notify // children to propagate tree synchronization. @@ -3897,102 +4513,108 @@ IOReturn IOService::ParentChangeStart ( void ) } //********************************************************************************* -// [private] ParentDownTellPriorityClientsPowerDown +// [private] ParentChangeTellPriorityClientsPowerDown // -// All applications and kernel clients have been notified of a power lowering -// initiated by the parent and we had to wait for responses. Here -// we notify any priority clients. If they all ack, we continue with the power change. -// If at least one doesn't, we have to wait for it to acknowledge and then continue. +// All applications and kernel clients have acknowledged our intention to drop +// power. Here we notify "priority" clients that we are lowering power. //********************************************************************************* -void IOService::ParentDownTellPriorityClientsPowerDown ( void ) +void IOService::ParentChangeTellPriorityClientsPowerDown ( void ) { - fMachineState = kIOPM_ParentDownNotifyInterestedDriversWillChange; + fMachineState = kIOPM_ParentChangeNotifyInterestedDriversWillChange; tellChangeDown2(fHeadNotePowerState); } //********************************************************************************* -// [private] ParentDownNotifyInterestedDriversWillChange +// [private] ParentChangeTellCapabilityWillChange // -// All applications and kernel clients have been notified of a power lowering -// initiated by the parent and we had to wait for their responses. Here we notify -// any interested drivers and power domain children. If they all ack, we continue -// with the power change. 
-// If at least one doesn't, we have to wait for it to acknowledge and then continue. +// All (legacy) applications and kernel clients have acknowledged, extra stage for +// root domain to notify apps and drivers about the system capability change. //********************************************************************************* -void IOService::ParentDownNotifyInterestedDriversWillChange ( void ) +void IOService::ParentChangeTellCapabilityWillChange ( void ) { - IOPMrootDomain *rootDomain; - if ((rootDomain = getPMRootDomain()) == this) - { - rootDomain->tracePoint(kIOPMTracePointSystemSleepDriversPhase); - } + if (!IS_ROOT_DOMAIN) + return ParentChangeNotifyInterestedDriversWillChange(); - notifyAll( kIOPM_ParentDownSetPowerState, kNotifyWillChange ); + tellSystemCapabilityChange( kIOPM_ParentChangeNotifyInterestedDriversWillChange ); } //********************************************************************************* -// [private] ParentDownSetPowerState +// [private] ParentChangeNotifyInterestedDriversWillChange // -// We had to wait for it, but all parties have acknowledged our pre-change -// notification of a power lowering initiated by the parent. -// Here we instruct our controlling driver -// to put the hardware in the state it needs to be in when the domain is -// lowered. If it does so, we continue processing -// (waiting for settle and acknowledging the parent.) -// If it doesn't, we have to wait for it to acknowledge and then continue. +// All applications and kernel clients have acknowledged our power state change. +// Here we notify interested drivers pre-change. //********************************************************************************* -void IOService::ParentDownSetPowerState ( void ) +void IOService::ParentChangeNotifyInterestedDriversWillChange ( void ) { - fNextMachineState = kIOPM_ParentDownWaitForPowerSettle; - fMachineState = kIOPM_DriverThreadCallDone; - fDriverCallReason = kDriverCallSetPowerState; + notifyAll( kIOPM_ParentChangeSetPowerState ); +} - if (notifyControllingDriver() == false) - notifyControllingDriverDone(); +//********************************************************************************* +// [private] ParentChangeSetPowerState +// +// Instruct our controlling driver to program the hardware for the power state +// change. Wait for async completions. +//********************************************************************************* + +void IOService::ParentChangeSetPowerState ( void ) +{ + MS_PUSH( kIOPM_ParentChangeWaitForPowerSettle ); + fMachineState = kIOPM_DriverThreadCallDone; + fDriverCallReason = kDriverCallSetPowerState; + + if (notifyControllingDriver() == false) + notifyControllingDriverDone(); } //********************************************************************************* -// [private] ParentDownWaitForPowerSettle +// [private] ParentChangeWaitForPowerSettle // -// Our controlling driver has changed power state on the hardware -// during a power change initiated by our parent. We have had to wait -// for acknowledgement from interested parties, or we have had to wait -// for the controlling driver to change the state. Here we see if we need -// to wait for power to settle before continuing. If not, we continue -// processing (acknowledging our preparedness to the parent). -// If so, we wait and continue later. +// Our controlling driver has completed the power state change initiated by our +// parent. Wait for the driver specified settle time to expire. 
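+//
+// The delay is computed by startSettleTimer() from the power state array;
+// conceptually (a sketch, assuming per-state settle times kept in
+// microseconds):
+//
+//     for each state crossed by the transition
+//         settleTime += that state's settle time (up or down, by direction);
+//     clock_interval_to_deadline(settleTime, kMicrosecondScale, &deadline);
+//     thread_call_enter_delayed(fSettleTimer, deadline);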
 //*********************************************************************************
 
-void IOService::ParentDownWaitForPowerSettle ( void )
+void IOService::ParentChangeWaitForPowerSettle ( void )
 {
-	fMachineState = kIOPM_ParentDownNotifyDidChangeAndAcknowledgeChange;
+	fMachineState = kIOPM_ParentChangeNotifyInterestedDriversDidChange;
 	startSettleTimer();
 }
 
 //*********************************************************************************
-// [private] ParentDownNotifyDidChangeAndAcknowledgeChange
+// [private] ParentChangeNotifyInterestedDriversDidChange
+//
+// Power has settled on a power change initiated by our parent. Here we notify
+// all our interested drivers post-change.
+//*********************************************************************************
+
+void IOService::ParentChangeNotifyInterestedDriversDidChange ( void )
+{
+	notifyAll( kIOPM_ParentChangeTellCapabilityDidChange );
+}
+
+//*********************************************************************************
+// [private] ParentChangeTellCapabilityDidChange
 //
-// Power has settled on a power change initiated by our parent. Here we
-// notify interested parties.
+// For root domain to notify capability power-change.
 //*********************************************************************************
 
-void IOService::ParentDownNotifyDidChangeAndAcknowledgeChange ( void )
+void IOService::ParentChangeTellCapabilityDidChange ( void )
 {
-	notifyAll( kIOPM_ParentAcknowledgePowerChange, kNotifyDidChange );
+	if (!IS_ROOT_DOMAIN)
+		return ParentChangeAcknowledgePowerChange();
+
+	tellSystemCapabilityChange( kIOPM_ParentChangeAcknowledgePowerChange );
 }
 
 //*********************************************************************************
 // [private] ParentChangeAcknowledgePowerChange
 //
-// We had to wait for it, but all parties have acknowledged our post-change
-// notification of a power change (either Up or Down) initiated by the parent.
-// Here we acknowledge the parent.
+// Acknowledge to our power parent that our power change is done.
 //*********************************************************************************
 
-void IOService::ParentAcknowledgePowerChange ( void )
+void IOService::ParentChangeAcknowledgePowerChange ( void )
 {
 	IORegistryEntry *	nub;
 	IOService *			parent;
@@ -4009,192 +4631,26 @@ void IOService::ParentAcknowledgePowerChange ( void )
 	nub->release();
 }
 
+// MARK: -
+// MARK: Ack and Settle timers
+
 //*********************************************************************************
-// [private] ParentUpSetPowerState
+// [private] settleTimerExpired
 //
-// Our parent has informed us via powerStateDidChange that it has
-// raised the power in our power domain, and we have had to wait
-// for some interested party to acknowledge our notification.
-// Here we instruct our controlling
-// driver to program the hardware to take advantage of the higher domain
-// power. If it does so, we continue processing
-// (waiting for settle and notifying interested parties post-change.)
-// If it doesn't, we have to wait for it to acknowledge and then continue.
+// Power has settled after our last change. Notify interested parties that
+// there is a new power state.
//********************************************************************************* -void IOService::ParentUpSetPowerState ( void ) +void IOService::settleTimerExpired( void ) { - fNextMachineState = kIOPM_ParentUpWaitForSettleTime; - fMachineState = kIOPM_DriverThreadCallDone; - fDriverCallReason = kDriverCallSetPowerState; - - if (notifyControllingDriver() == false) - notifyControllingDriverDone(); + fSettleTimeUS = 0; + gIOPMWorkQueue->signalWorkAvailable(); } //********************************************************************************* -// [private] ParentUpWaitForSettleTime +// settle_timer_expired // -// Our controlling driver has changed power state on the hardware -// during a power raise initiated by the parent, but we had to wait for it. -// Here we see if we need to wait for power to settle before continuing. -// If not, we continue processing (notifying interested parties post-change). -// If so, we wait and continue later. -//********************************************************************************* - -void IOService::ParentUpWaitForSettleTime ( void ) -{ - fMachineState = kIOPM_ParentUpNotifyInterestedDriversDidChange; - startSettleTimer(); -} - -//********************************************************************************* -// [private] ParentUpNotifyInterestedDriversDidChange -// -// Power has settled on a power raise initiated by the parent. -// Here we notify all our interested parties post-change. If they all acknowledge, -// we're done with this change note, and we can start on the next one. -// Otherwise we have to wait for acknowledgements and finish up later. -//********************************************************************************* - -void IOService::ParentUpNotifyInterestedDriversDidChange ( void ) -{ - notifyAll( kIOPM_ParentAcknowledgePowerChange, kNotifyDidChange ); -} - -//********************************************************************************* -// [private] all_done -// -// A power change is complete, and the used post-change note is at -// the head of the queue. Remove it and set myCurrentState to the result -// of the change. Start up the next change in queue. -//********************************************************************************* - -void IOService::all_done ( void ) -{ - unsigned long previous_state; - -#if ROOT_DOMAIN_RUN_STATES - getPMRootDomain()->handlePowerChangeDoneForService( - /* service */ this, - /* RD flags */ &fRootDomainState, - /* new pwr state */ fHeadNotePowerState, - /* change flags */ fHeadNoteFlags ); -#endif - - if ((fHeadNoteFlags & kIOPMSynchronize) && - ((fMachineState == kIOPM_Finished) || (fMachineState == kIOPM_SyncFinish))) - { - // Sync operation and no power change occurred. - // Do not inform driver and clients about this request completion, - // except for the originator (root domain). - - if (getPMRequestType() == kIOPMRequestTypeSynchronizePowerTree) - { - powerChangeDone(fCurrentPowerState); - } - - fMachineState = kIOPM_Finished; - return; - } - - fMachineState = kIOPM_Finished; - - // our power change - if ( fHeadNoteFlags & kIOPMWeInitiated ) - { - // could our driver switch to the new state? - if ( !( fHeadNoteFlags & kIOPMNotDone) ) - { - // we changed, tell our parent - requestDomainPower(fHeadNotePowerState); - - // yes, did power raise? 
- if ( fCurrentPowerState < fHeadNotePowerState ) - { - // yes, inform clients and apps - tellChangeUp (fHeadNotePowerState); - } - previous_state = fCurrentPowerState; - // either way - fCurrentPowerState = fHeadNotePowerState; -#if PM_VARS_SUPPORT - fPMVars->myCurrentState = fCurrentPowerState; -#endif - OUR_PMLog(kPMLogChangeDone, fCurrentPowerState, 0); - - // inform subclass policy-maker - if ((fLockedFlags.PMStop == false) && fParentsKnowState) - powerChangeDone(previous_state); - else - PM_DEBUG("%s::powerChangeDone() skipped\n", getName()); - } - } - - // parent's power change - if ( fHeadNoteFlags & kIOPMParentInitiated) - { - if (((fHeadNoteFlags & kIOPMDomainWillChange) && (fCurrentPowerState >= fHeadNotePowerState)) || - ((fHeadNoteFlags & kIOPMDomainDidChange) && (fCurrentPowerState < fHeadNotePowerState))) - { - // did power raise? - if ( fCurrentPowerState < fHeadNotePowerState ) - { - // yes, inform clients and apps - tellChangeUp (fHeadNotePowerState); - } - // either way - previous_state = fCurrentPowerState; - fCurrentPowerState = fHeadNotePowerState; -#if PM_VARS_SUPPORT - fPMVars->myCurrentState = fCurrentPowerState; -#endif - fMaxCapability = fControllingDriver->maxCapabilityForDomainState(fHeadNoteDomainFlags); - - OUR_PMLog(kPMLogChangeDone, fCurrentPowerState, 0); - - // inform subclass policy-maker - if ((fLockedFlags.PMStop == false) && fParentsKnowState) - powerChangeDone(previous_state); - else - PM_DEBUG("%s::powerChangeDone() skipped\n", getName()); - } - } - - if (fCurrentPowerState < fNumberOfPowerStates) - { - const IOPMPowerState * powerStatePtr = &fPowerStates[fCurrentPowerState]; - - fCurrentCapabilityFlags = powerStatePtr->capabilityFlags; - if (fCurrentCapabilityFlags & kIOPMStaticPowerValid) - fCurrentPowerConsumption = powerStatePtr->staticPower; - } - - // When power rises enough to satisfy the tickle's desire for more power, - // the condition preventing idle-timer from dropping power is removed. - - if (fCurrentPowerState >= fIdleTimerMinPowerState) - { - fIdleTimerMinPowerState = 0; - } -} - -//********************************************************************************* -// [public] settleTimerExpired -// -// Power has settled after our last change. Notify interested parties that -// there is a new power state. -//********************************************************************************* - -void IOService::settleTimerExpired ( void ) -{ - fSettleTimeUS = 0; -} - -//********************************************************************************* -// settle_timer_expired -// -// Holds a retain while the settle timer callout is in flight. +// Holds a retain while the settle timer callout is in flight. 
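+//
+// The pattern used with startSettleTimer() below, in outline:
+//
+//     retain();
+//     pending = thread_call_enter_delayed(fSettleTimer, deadline);
+//     if (pending) release();    // callout was already armed; drop extra ref
+//
+// and the callout performs the matching me->release() when it runs.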
//********************************************************************************* static void @@ -4202,12 +4658,11 @@ settle_timer_expired( thread_call_param_t arg0, thread_call_param_t arg1 ) { IOService * me = (IOService *) arg0; - if (gIOPMWorkLoop && gIOPMReplyQueue) + if (gIOPMWorkLoop && gIOPMWorkQueue) { gIOPMWorkLoop->runAction( OSMemberFunctionCast(IOWorkLoop::Action, me, &IOService::settleTimerExpired), me); - gIOPMReplyQueue->signalWorkAvailable(); } me->release(); } @@ -4221,7 +4676,7 @@ settle_timer_expired( thread_call_param_t arg0, thread_call_param_t arg1 ) void IOService::startSettleTimer( void ) { AbsoluteTime deadline; - unsigned long i; + IOPMPowerStateIndex i; uint32_t settleTime = 0; boolean_t pending; @@ -4288,8 +4743,7 @@ bool IOService::ackTimerTick( void ) PM_ASSERT_IN_GATE(); switch (fMachineState) { case kIOPM_OurChangeWaitForPowerSettle: - case kIOPM_ParentDownWaitForPowerSettle: - case kIOPM_ParentUpWaitForSettleTime: + case kIOPM_ParentChangeWaitForPowerSettle: // are we waiting for controlling driver to acknowledge? if ( fDriverTimer > 0 ) { @@ -4304,6 +4758,20 @@ bool IOService::ackTimerTick( void ) PM_ERROR("%s::setPowerState(%p, %lu -> %lu) timed out after %d ms\n", fName, this, fCurrentPowerState, fHeadNotePowerState, NS_TO_MS(nsec)); +#if LOG_SETPOWER_TIMES + PMEventDetails *details = PMEventDetails::eventDetails( + kIOPMEventTypeSetPowerStateDelayed, // type + fName, // who + (uintptr_t)this, // owner unique + NULL, // interest name + (uint8_t)getPowerState(), // old + 0, // new + kIOReturnTimeout, // result + NS_TO_US(nsec)); // usec completion time + + getPMRootDomain()->recordAndReleasePMEventGated( details ); +#endif + if (gIOKitDebug & kIOLogDebugPower) { panic("%s::setPowerState(%p, %lu -> %lu) timed out after %d ms", @@ -4321,12 +4789,7 @@ bool IOService::ackTimerTick( void ) } break; - case kIOPM_OurChangeSetPowerState: - case kIOPM_OurChangeFinish: - case kIOPM_ParentDownSetPowerState: - case kIOPM_ParentAcknowledgePowerChange: - case kIOPM_ParentUpSetPowerState: - case kIOPM_NotifyChildrenDone: + case kIOPM_NotifyChildrenStart: // are we waiting for interested parties to acknowledge? if ( fHeadNotePendingAcks != 0 ) { @@ -4350,6 +4813,24 @@ bool IOService::ackTimerTick( void ) nextObject->whatObject, fName, fCurrentPowerState, fHeadNotePowerState, NS_TO_MS(nsec)); +#if LOG_SETPOWER_TIMES + uint16_t logType = (fDriverCallReason == kDriverCallInformPreChange) + ? kIOPMEventTypePSWillChangeTo + : kIOPMEventTypePSDidChangeTo; + + PMEventDetails *details = PMEventDetails::eventDetails( + logType, // type + fName, // who + (uintptr_t)this, // owner unique + nextObject->whatObject->getName(), // interest name + (uint8_t)fCurrentPowerState, // old + (uint8_t)fHeadNotePowerState, // new + kIOReturnTimeout, // result + NS_TO_US(nsec)); // usec completion time + + getPMRootDomain()->recordAndReleasePMEventGated( details ); +#endif + // Pretend driver has acked. 
fHeadNotePendingAcks--; } @@ -4369,11 +4850,16 @@ bool IOService::ackTimerTick( void ) } break; - case kIOPM_ParentDownTellPriorityClientsPowerDown: - case kIOPM_ParentDownNotifyInterestedDriversWillChange: + // TODO: aggregate this case kIOPM_OurChangeTellClientsPowerDown: case kIOPM_OurChangeTellPriorityClientsPowerDown: case kIOPM_OurChangeNotifyInterestedDriversWillChange: + case kIOPM_ParentChangeTellPriorityClientsPowerDown: + case kIOPM_ParentChangeNotifyInterestedDriversWillChange: + case kIOPM_SyncTellClientsPowerDown: + case kIOPM_SyncTellPriorityClientsPowerDown: + case kIOPM_SyncNotifyWillChange: + case kIOPM_TellCapabilityChangeDone: // apps didn't respond in time cleanClientResponses(true); OUR_PMLog(kPMLogClientTardy, 0, 1); @@ -4382,7 +4868,7 @@ break; default: - PM_TRACE("%s: unexpected ack timer tick (state = %d)\n", + PM_LOG1("%s: unexpected ack timer tick (state = %d)\n", getName(), fMachineState); break; } @@ -4441,8 +4927,8 @@ IOService::actionAckTimerExpired ( // otherwise no need to signal the work loop. done = me->ackTimerTick(); - if (done && gIOPMReplyQueue) - gIOPMReplyQueue->signalWorkAvailable(); + if (done && gIOPMWorkQueue) + gIOPMWorkQueue->signalWorkAvailable(); return kIOReturnSuccess; } @@ -4465,104 +4951,31 @@ IOService::ack_timer_expired ( thread_call_param_t arg0, thread_call_param_t arg me->release(); } -//********************************************************************************* -// [private] notifyControllingDriver -//********************************************************************************* - -bool IOService::notifyControllingDriver ( void ) -{ - DriverCallParam * param; - unsigned long powerState; - - PM_ASSERT_IN_GATE(); - assert( fDriverCallParamCount == 0 ); - assert( fControllingDriver ); - - powerState = fHeadNotePowerState; - - param = (DriverCallParam *) fDriverCallParamPtr; - if (!param) - { - param = IONew(DriverCallParam, 1); - if (!param) - return false; // no memory - - fDriverCallParamPtr = (void *) param; - fDriverCallParamSlots = 1; - } - - param->Target = fControllingDriver; - fDriverCallParamCount = 1; - - fDriverTimer = -1; - - // Machine state for this object will stall waiting for a reply - // from the callout thread.
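The fDriverTimer = -1 assignment in the removed prologue above is a sentinel meaning "setPowerState() issued, no ack yet"; the matching completion path just below (the removed notifyControllingDriverDone()) turns a positive driver return value, in microseconds, into ack-timer ticks, first raising it to a floor so that a driver that mistakes the units still produces a noticeably long timeout rather than an instant one. A standalone sketch of that conversion, with placeholder constants that keep the intent but not the kernel's values:

#include <cassert>
#include <cstdint>

// Placeholder values: one ack-timer tick per 100 ms, one-second floor.
static const int64_t kAckTimerPeriodUs = 100 * 1000;
static const int64_t kMinAckTimeoutUs  = 1000 * 1000;

// Sketch of the removed notifyControllingDriverDone() math: a result <= 0
// means the driver acked synchronously (IOPMAckImplied) or failed; a
// positive result is a timeout in microseconds, raised to the floor before
// being converted into ack-timer ticks.
static int64_t driverTimerTicks(int64_t setPowerStateResult) {
    if (setPowerStateResult <= 0)
        return 0;                           // already acked, no timer needed
    if (setPowerStateResult < kMinAckTimeoutUs)
        setPowerStateResult = kMinAckTimeoutUs;
    return setPowerStateResult / kAckTimerPeriodUs + 1;
}

int main() {
    assert(driverTimerTicks(0) == 0);       // synchronous ack
    assert(driverTimerTicks(500) == 11);    // clamped up to the floor
}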
- - PM_LOCK(); - assert( fLockedFlags.DriverCallBusy == false ); - fLockedFlags.DriverCallBusy = true; - PM_UNLOCK(); - thread_call_enter( fDriverCallEntry ); - return true; -} +// MARK: - +// MARK: Client Messaging //********************************************************************************* -// [private] notifyControllingDriverDone +// [private] tellSystemCapabilityChange //********************************************************************************* -void IOService::notifyControllingDriverDone( void ) +void IOService::tellSystemCapabilityChange( uint32_t nextMS ) { - DriverCallParam * param; - IOReturn result; - - PM_ASSERT_IN_GATE(); - param = (DriverCallParam *) fDriverCallParamPtr; - - assert( fLockedFlags.DriverCallBusy == false ); - assert( fMachineState == kIOPM_DriverThreadCallDone ); + MS_PUSH( nextMS ); + fMachineState = kIOPM_TellCapabilityChangeDone; + fOutOfBandMessage = kIOMessageSystemCapabilityChange; - if (param) - { - assert(fDriverCallParamCount == 1); - - // the return value from setPowerState() - result = param->Result; - - if ((result == IOPMAckImplied) || (result < 0)) - { - // child return IOPMAckImplied - fDriverTimer = 0; - } - else if (fDriverTimer) - { - assert(fDriverTimer == -1); - - // Driver has not acked, and has returned a positive result. - // Enforce a minimum permissible timeout value. - // Make the min value large enough so timeout is less likely - // to occur if a driver misinterpreted that the return value - // should be in microsecond units. And make it large enough - // to be noticeable if a driver neglects to ack. - - if (result < kMinAckTimeoutTicks) - result = kMinAckTimeoutTicks; - - fDriverTimer = (result / (ACK_TIMER_PERIOD / ns_per_us)) + 1; - } - // else, child has already acked and driver_timer reset to 0. - - fDriverCallParamCount = 0; - - if ( fDriverTimer ) - { - OUR_PMLog(kPMLogStartAckTimer, 0, 0); - start_ack_timer(); - } - } + if (fIsPreChange) + { + // Notify app first on pre-change. + fOutOfBandParameter = kNotifyCapabilityChangeApps; + } + else + { + // Notify kernel clients first on post-change. + fOutOfBandParameter = kNotifyCapabilityChangePriority; + } - // Hop back to original machine state path. - fMachineState = fNextMachineState; + tellClientsWithResponse( fOutOfBandMessage ); } //********************************************************************************* @@ -4648,11 +5061,11 @@ static void logAppTimeouts ( OSObject * object, void * arg ) clientIndex = context->notifyClients->getNextIndexOfObject(object, 0); if ((clientIndex != (unsigned int) -1) && - (flag = context->responseFlags->getObject(clientIndex)) && + (flag = context->responseArray->getObject(clientIndex)) && (flag != kOSBooleanTrue)) { OSString * clientID = 0; - context->us->messageClient(context->msgType, object, &clientID); + context->us->messageClient(context->messageType, object, &clientID); PM_ERROR(context->errorLog, clientID ? 
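tellSystemCapabilityChange() is the cleanest illustration of the MS_PUSH/MS_POP idiom used throughout this state machine: the state to resume is pushed, the machine detours into kIOPM_TellCapabilityChangeDone, and the detour ends with an MS_POP back to the saved state; the pre-change pass messages apps before priority clients and the post-change pass reverses the order. A compact userspace sketch of both halves (state values and names are illustrative):

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

enum State : uint32_t { kFinish = 1, kTellCapabilityChangeDone = 2 };

struct PMStateMachine {
    State current = kFinish;
    std::vector<State> saved;            // MS_PUSH / MS_POP storage
    bool isPreChange = true;

    void push(State resume) { saved.push_back(resume); }
    void pop() { current = saved.back(); saved.pop_back(); }

    // Mirrors tellSystemCapabilityChange(): save the caller's next state,
    // then detour into capability messaging.
    void tellSystemCapabilityChange(State nextMS) {
        push(nextMS);
        current = kTellCapabilityChangeDone;
        // Apps first on pre-change, priority clients first on post-change.
        std::printf("notify %s\n", isPreChange ? "apps" : "priority");
    }

    // Mirrors the kIOPM_TellCapabilityChangeDone case: message the second
    // group, then resume whatever was pushed.
    void capabilityChangeDone() {
        std::printf("notify %s\n", isPreChange ? "priority" : "apps");
        pop();
    }
};

int main() {
    PMStateMachine ms;
    ms.tellSystemCapabilityChange(kFinish);
    ms.capabilityChangeDone();
    assert(ms.current == kFinish);
}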
clientID->getCStringNoCopy() : ""); // TODO: record message type if possible @@ -4669,32 +5082,47 @@ static void logAppTimeouts ( OSObject * object, void * arg ) void IOService::cleanClientResponses ( bool logErrors ) { - IOPMInterestContext context; - - if (logErrors && fResponseArray && fNotifyClientArray) { - context.responseFlags = fResponseArray; - context.notifyClients = fNotifyClientArray; - context.serialNumber = fSerialNumber; - context.counter = 0; - context.msgType = kIOMessageCopyClientID; - context.us = this; - context.maxTimeRequested = 0; - context.stateNumber = fHeadNotePowerState; - context.stateFlags = fHeadNotePowerArrayEntry->capabilityFlags; - context.errorLog = "PM notification timeout (%s)\n"; - + if (logErrors && fResponseArray) + { switch ( fOutOfBandParameter ) { case kNotifyApps: - applyToInterested(gIOAppPowerStateInterest, logAppTimeouts, (void *) &context); - case kNotifyPriority: + case kNotifyCapabilityChangeApps: + if (fNotifyClientArray) + { + IOPMInterestContext context; + + context.responseArray = fResponseArray; + context.notifyClients = fNotifyClientArray; + context.serialNumber = fSerialNumber; + context.messageType = kIOMessageCopyClientID; + context.notifyType = kNotifyApps; + context.isPreChange = fIsPreChange; + context.enableTracing = false; + context.us = this; + context.maxTimeRequested = 0; + context.stateNumber = fHeadNotePowerState; + context.stateFlags = fHeadNotePowerArrayEntry->capabilityFlags; + context.changeFlags = fHeadNoteChangeFlags; + context.errorLog = "PM notification timeout (%s)\n"; + + applyToInterested(gIOAppPowerStateInterest, logAppTimeouts, (void *) &context); + } + break; + default: + // kNotifyPriority, kNotifyCapabilityChangePriority + // TODO: identify the priority client that has not acked + PM_ERROR("PM priority notification timeout\n"); + if (gIOKitDebug & kIOLogDebugPower) + { + panic("PM priority notification timeout"); + } break; } } if (fResponseArray) { - // get rid of this stuff fResponseArray->release(); fResponseArray = NULL; } @@ -4703,8 +5131,6 @@ void IOService::cleanClientResponses ( bool logErrors ) fNotifyClientArray->release(); fNotifyClientArray = NULL; } - - return; } //********************************************************************************* @@ -4716,53 +5142,95 @@ void IOService::cleanClientResponses ( bool logErrors ) // Return true if we don't have to wait for acknowledgements //********************************************************************************* -bool IOService::tellClientsWithResponse ( - int messageType ) -{ - return tellClientsWithResponse( messageType, 0 ); -} - -bool IOService::tellClientsWithResponse ( - int messageType, - IOPMMessageFilter filter ) +bool IOService::tellClientsWithResponse ( int messageType ) { IOPMInterestContext context; + bool isRootDomain = IS_ROOT_DOMAIN; PM_ASSERT_IN_GATE(); assert( fResponseArray == NULL ); assert( fNotifyClientArray == NULL ); + RD_LOG("tellClientsWithResponse( %s, %d )\n", + getIOMessageString(messageType), fOutOfBandParameter); + fResponseArray = OSArray::withCapacity( 1 ); if (!fResponseArray) goto exit; fResponseArray->setCapacityIncrement(8); - fSerialNumber += 1; + if (++fSerialNumber == 0) + fSerialNumber++; - context.responseFlags = fResponseArray; + context.responseArray = fResponseArray; context.notifyClients = 0; context.serialNumber = fSerialNumber; - context.counter = 0; - context.msgType = messageType; + context.messageType = messageType; + context.notifyType = fOutOfBandParameter; + context.isPreChange = 
fIsPreChange; + context.enableTracing = false; context.us = this; context.maxTimeRequested = 0; context.stateNumber = fHeadNotePowerState; context.stateFlags = fHeadNotePowerArrayEntry->capabilityFlags; - context.filterFunc = filter; + context.changeFlags = fHeadNoteChangeFlags; + context.messageFilter = (isRootDomain) ? + OSMemberFunctionCast( + IOPMMessageFilter, + this, + &IOPMrootDomain::systemMessageFilter) : 0; switch ( fOutOfBandParameter ) { case kNotifyApps: applyToInterested( gIOAppPowerStateInterest, pmTellAppWithResponse, (void *) &context ); - fNotifyClientArray = context.notifyClients; + + if (isRootDomain && + (fMachineState != kIOPM_OurChangeTellClientsPowerDown) && + (fMachineState != kIOPM_SyncTellClientsPowerDown)) + { + // Notify capability app for tellChangeDown1() + // but not for askChangeDown(). + context.notifyType = kNotifyCapabilityChangeApps; + context.messageType = kIOMessageSystemCapabilityChange; + applyToInterested( gIOAppPowerStateInterest, + pmTellCapabilityAppWithResponse, (void *) &context ); + context.notifyType = fOutOfBandParameter; + context.messageType = messageType; + } + context.maxTimeRequested = k30seconds; applyToInterested( gIOGeneralInterest, pmTellClientWithResponse, (void *) &context ); + + fNotifyClientArray = context.notifyClients; break; case kNotifyPriority: + context.enableTracing = isRootDomain; applyToInterested( gIOPriorityPowerStateInterest, pmTellClientWithResponse, (void *) &context ); + + if (isRootDomain) + { + // Notify capability clients for tellChangeDown2(). + context.notifyType = kNotifyCapabilityChangePriority; + context.messageType = kIOMessageSystemCapabilityChange; + applyToInterested( gIOPriorityPowerStateInterest, + pmTellCapabilityClientWithResponse, (void *) &context ); + } + break; + + case kNotifyCapabilityChangeApps: + applyToInterested( gIOAppPowerStateInterest, + pmTellCapabilityAppWithResponse, (void *) &context ); + fNotifyClientArray = context.notifyClients; + context.maxTimeRequested = k30seconds; + break; + + case kNotifyCapabilityChangePriority: + applyToInterested( gIOPriorityPowerStateInterest, + pmTellCapabilityClientWithResponse, (void *) &context ); break; } @@ -4770,6 +5238,8 @@ bool IOService::tellClientsWithResponse ( if ( !checkForDone() ) { OUR_PMLog(kPMLogStartAckTimer, context.maxTimeRequested, 0); + if (context.enableTracing) + getPMRootDomain()->traceDetail( context.maxTimeRequested / 1000 ); start_ack_timer( context.maxTimeRequested / 1000, kMillisecondScale ); return false; } @@ -4799,143 +5269,361 @@ exit: void IOService::pmTellAppWithResponse ( OSObject * object, void * arg ) { - IOPMInterestContext * context = (IOPMInterestContext *) arg; + IOPMInterestContext * context = (IOPMInterestContext *) arg; IOServicePM * pwrMgt = context->us->pwrMgt; + uint32_t msgIndex, msgRef, msgType; +#if LOG_APP_RESPONSE_TIMES AbsoluteTime now; - UInt32 refcon; +#endif if (!OSDynamicCast(_IOServiceInterestNotifier, object)) + return; + + if (context->messageFilter && + !context->messageFilter(context->us, object, context, 0, 0)) { - // object must be an _IOServiceInterestNotifier. + if (kIOLogDebugPower & gIOKitDebug) + { + // Log client pid/name and client array index. + OSString * clientID = 0; + context->us->messageClient(kIOMessageCopyClientID, object, &clientID); + PM_LOG("%s DROP App %s, %s\n", + context->us->getName(), + getIOMessageString(context->messageType), + clientID ? clientID->getCStringNoCopy() : ""); + if (clientID) clientID->release(); + } return; } - // Lazily create app clients array. 
+ // Create client array (for tracking purposes) only if the service + // has app clients. Usually only root domain does. if (0 == context->notifyClients) - { context->notifyClients = OSArray::withCapacity( 32 ); + + msgType = context->messageType; + msgIndex = context->responseArray->getCount(); + msgRef = ((context->serialNumber & 0xFFFF) << 16) + (msgIndex & 0xFFFF); + + OUR_PMLog(kPMLogAppNotify, msgType, msgRef); + if (kIOLogDebugPower & gIOKitDebug) + { + // Log client pid/name and client array index. + OSString * clientID = 0; + context->us->messageClient(kIOMessageCopyClientID, object, &clientID); + PM_LOG("%s MESG App(%u) %s, %s\n", + context->us->getName(), + msgIndex, getIOMessageString(msgType), + clientID ? clientID->getCStringNoCopy() : ""); + if (clientID) clientID->release(); } - if (context->filterFunc && !context->filterFunc(object, arg)) +#if LOG_APP_RESPONSE_TIMES + OSNumber * num; + clock_get_uptime(&now); + num = OSNumber::withNumber(AbsoluteTime_to_scalar(&now), sizeof(uint64_t) * 8); + if (num) { - // ack - needed to match the counter index at logAppTimeouts(). - context->responseFlags->setObject(context->counter, kOSBooleanTrue); - if (context->notifyClients) - context->notifyClients->setObject(context->counter, kOSBooleanTrue); + context->responseArray->setObject(msgIndex, num); + num->release(); } else +#endif + context->responseArray->setObject(msgIndex, kOSBooleanFalse); + + if (context->notifyClients) + context->notifyClients->setObject(msgIndex, object); + + context->us->messageClient(msgType, object, (void *) msgRef); +} + +//********************************************************************************* +// [static private] pmTellClientWithResponse +// +// We send a message to an in-kernel client, and we expect a response, +// so we compute a cookie we can identify the response with. 
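The cookie mentioned above is the msgRef built in pmTellAppWithResponse(): the low 16 bits are the client's slot in fResponseArray and the high 16 bits are the notification serial, which is what lets a late or duplicate acknowledgement from an earlier round be rejected. The field widths below are taken from the expressions in the patch; the function names are illustrative:

#include <cassert>
#include <cstdint>

static uint32_t makeMsgRef(uint16_t serial, uint16_t index) {
    return ((uint32_t)serial << 16) | index;
}

static void splitMsgRef(uint32_t refcon, uint16_t &serial, uint16_t &index) {
    serial = (uint16_t)((refcon >> 16) & 0xFFFF);   // notification round
    index  = (uint16_t)(refcon & 0xFFFF);           // slot in response array
}

int main() {
    uint16_t serial, index;
    splitMsgRef(makeMsgRef(0x1234, 7), serial, index);
    assert(serial == 0x1234 && index == 7);
}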
+//********************************************************************************* + +void IOService::pmTellClientWithResponse ( OSObject * object, void * arg ) +{ + IOPowerStateChangeNotification notify; + IOPMInterestContext * context = (IOPMInterestContext *) arg; + OSObject * replied = kOSBooleanTrue; + _IOServiceInterestNotifier * notifier; + uint32_t msgIndex, msgRef, msgType; + IOReturn retCode; + + if (context->messageFilter && + !context->messageFilter(context->us, object, context, 0, 0)) { - refcon = ((context->serialNumber & 0xFFFF)<<16) - + (context->counter & 0xFFFF); - OUR_PMLog(kPMLogAppNotify, context->msgType, refcon); + if ((kIOLogDebugPower & gIOKitDebug) && + (OSDynamicCast(_IOServiceInterestNotifier, object))) + { + _IOServiceInterestNotifier *n = (_IOServiceInterestNotifier *) object; + PM_LOG("%s DROP Client %s, notifier %p, handler %p\n", + context->us->getName(), + getIOMessageString(context->messageType), + object, n->handler); + } + return; + } - if (gIOKitDebug & kIOLogDebugPower) + notifier = OSDynamicCast(_IOServiceInterestNotifier, object); + msgType = context->messageType; + msgIndex = context->responseArray->getCount(); + msgRef = ((context->serialNumber & 0xFFFF) << 16) + (msgIndex & 0xFFFF); + + IOServicePM * pwrMgt = context->us->pwrMgt; + if (gIOKitDebug & kIOLogPower) { + OUR_PMLog(kPMLogClientNotify, msgRef, msgType); + if (OSDynamicCast(IOService, object)) { + const char *who = ((IOService *) object)->getName(); + gPlatform->PMLog(who, kPMLogClientNotify, (uintptr_t) object, 0); + } + else if (notifier) { + OUR_PMLog(kPMLogClientNotify, (uintptr_t) notifier->handler, 0); + } + } + if ((kIOLogDebugPower & gIOKitDebug) && notifier) + { + PM_LOG("%s MESG Client %s, notifier %p, handler %p\n", + context->us->getName(), + getIOMessageString(msgType), + object, notifier->handler); + } + + notify.powerRef = (void *)(uintptr_t) msgRef; + notify.returnValue = 0; + notify.stateNumber = context->stateNumber; + notify.stateFlags = context->stateFlags; + + if (context->enableTracing && (notifier != 0)) + { + uint32_t detail = ((msgIndex & 0xff) << 24) | + ((msgType & 0xfff) << 12) | + (((uintptr_t) notifier->handler) & 0xfff); + getPMRootDomain()->traceDetail( detail ); + } + + retCode = context->us->messageClient(msgType, object, (void *) ¬ify); + if ( kIOReturnSuccess == retCode ) + { + if ( 0 == notify.returnValue ) { - // Log client pid/name and associated index. - OSString * clientID = 0; - context->us->messageClient(kIOMessageCopyClientID, object, &clientID); - PM_DEBUG("[Notify %u] message 0x%x to %s\n", - (uint32_t) context->counter, - context->msgType, - clientID ? clientID->getCStringNoCopy() : ""); - if (clientID) clientID->release(); + // client doesn't want time to respond + OUR_PMLog(kPMLogClientAcknowledge, msgRef, (uintptr_t) object); + } + else + { + replied = kOSBooleanFalse; + if ( notify.returnValue > context->maxTimeRequested ) + { + if (notify.returnValue > kPriorityClientMaxWait) + { + context->maxTimeRequested = kPriorityClientMaxWait; + PM_ERROR("%s: client %p returned %llu for %s\n", + context->us->getName(), + notifier ? 
(void *) notifier->handler : object, + (uint64_t) notify.returnValue, + getIOMessageString(msgType)); + } + else + context->maxTimeRequested = notify.returnValue; + } } + } + else + { + // not a client of ours + // so we won't be waiting for response + OUR_PMLog(kPMLogClientAcknowledge, msgRef, 0); + } + + context->responseArray->setObject(msgIndex, replied); +} + +//********************************************************************************* +// [static private] pmTellCapabilityAppWithResponse +//********************************************************************************* +void IOService::pmTellCapabilityAppWithResponse ( OSObject * object, void * arg ) +{ + IOPMSystemCapabilityChangeParameters msgArg; + IOPMInterestContext * context = (IOPMInterestContext *) arg; + OSObject * replied = kOSBooleanTrue; + IOServicePM * pwrMgt = context->us->pwrMgt; + uint32_t msgIndex, msgRef, msgType; +#if LOG_APP_RESPONSE_TIMES + AbsoluteTime now; +#endif + + if (!OSDynamicCast(_IOServiceInterestNotifier, object)) + return; + + memset(&msgArg, 0, sizeof(msgArg)); + if (context->messageFilter && + !context->messageFilter(context->us, object, context, &msgArg, &replied)) + { + return; + } + + // Create client array (for tracking purposes) only if the service + // has app clients. Usually only root domain does. + if (0 == context->notifyClients) + context->notifyClients = OSArray::withCapacity( 32 ); + + msgType = context->messageType; + msgIndex = context->responseArray->getCount(); + msgRef = ((context->serialNumber & 0xFFFF) << 16) + (msgIndex & 0xFFFF); + + OUR_PMLog(kPMLogAppNotify, msgType, msgRef); + if (kIOLogDebugPower & gIOKitDebug) + { + // Log client pid/name and client array index. + OSString * clientID = 0; + context->us->messageClient(kIOMessageCopyClientID, object, &clientID); + PM_LOG("%s MESG App(%u) %s, wait %u, %s\n", + context->us->getName(), + msgIndex, getIOMessageString(msgType), + (replied != kOSBooleanTrue), + clientID ? clientID->getCStringNoCopy() : ""); + if (clientID) clientID->release(); + } + + msgArg.notifyRef = msgRef; + msgArg.maxWaitForReply = 0; + + if (replied == kOSBooleanTrue) + { + msgArg.notifyRef = 0; + context->responseArray->setObject(msgIndex, kOSBooleanTrue); + if (context->notifyClients) + context->notifyClients->setObject(msgIndex, kOSBooleanTrue); + } + else + { #if LOG_APP_RESPONSE_TIMES OSNumber * num; clock_get_uptime(&now); num = OSNumber::withNumber(AbsoluteTime_to_scalar(&now), sizeof(uint64_t) * 8); if (num) { - context->responseFlags->setObject(context->counter, num); + context->responseArray->setObject(msgIndex, num); num->release(); } else #endif - context->responseFlags->setObject(context->counter, kOSBooleanFalse); - - if (context->notifyClients) - context->notifyClients->setObject(context->counter, object); + context->responseArray->setObject(msgIndex, kOSBooleanFalse); - context->us->messageClient(context->msgType, object, (void *)refcon); - if ( context->maxTimeRequested < k30seconds ) - { - context->maxTimeRequested = k30seconds; - } + if (context->notifyClients) + context->notifyClients->setObject(msgIndex, object); } - context->counter++; + context->us->messageClient(msgType, object, (void *) &msgArg, sizeof(msgArg)); } //********************************************************************************* -// [static private] pmTellClientWithResponse -// -// We send a message to an in-kernel client, and we expect a response, so we compute a -// cookie we can identify the response with. 
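Both in-kernel response paths above fold each client's requested wait (in microseconds, as the start_ack_timer() conversion shows) into context->maxTimeRequested, and a single misbehaving client cannot inflate it indefinitely: a request beyond the per-class ceiling (kPriorityClientMaxWait here, kCapabilityClientMaxWait in the capability variant) is clamped and the offender logged. A sketch of that aggregation rule in isolation, with a placeholder cap:

#include <cstdint>
#include <cstdio>

static const uint64_t kClientMaxWaitUs = 30u * 1000 * 1000;  // placeholder cap

struct AckContext { uint64_t maxTimeRequested = 0; };         // microseconds

// Record one client's requested wait, mirroring pmTellClientWithResponse():
// keep the maximum across clients, but never let a single client raise the
// aggregate past the cap, and complain when it tries.
static void recordClientWait(AckContext &ctx, uint64_t requestedUs, const char *who) {
    if (requestedUs <= ctx.maxTimeRequested)
        return;
    if (requestedUs > kClientMaxWaitUs) {
        ctx.maxTimeRequested = kClientMaxWaitUs;
        std::fprintf(stderr, "client %s asked for %llu us\n",
                     who, (unsigned long long)requestedUs);
    } else {
        ctx.maxTimeRequested = requestedUs;
    }
}

int main() {
    AckContext ctx;
    recordClientWait(ctx, 2000000, "driverA");      // normal request
    recordClientWait(ctx, ~0ull, "driverB");        // clamped and logged
    std::printf("ack timeout: %llu us\n",
                (unsigned long long)ctx.maxTimeRequested);
}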
-// If it doesn't understand the notification (it is not power-management savvy) -// we won't wait for it to prepare for sleep. If it tells us via a return code -// in the passed struct that it is currently ready, we won't wait for it to prepare. -// If it tells us via the return code in the struct that it does need time, we will chill. +// [static private] pmTellCapabilityClientWithResponse //********************************************************************************* -void IOService::pmTellClientWithResponse ( OSObject * object, void * arg ) +void IOService::pmTellCapabilityClientWithResponse( + OSObject * object, void * arg ) { + IOPMSystemCapabilityChangeParameters msgArg; IOPMInterestContext * context = (IOPMInterestContext *) arg; - IOPowerStateChangeNotification notify; - UInt32 refcon; + OSObject * replied = kOSBooleanTrue; + _IOServiceInterestNotifier * notifier; + uint32_t msgIndex, msgRef, msgType; IOReturn retCode; - OSObject * theFlag; - if (context->filterFunc && !context->filterFunc(object, arg)) + memset(&msgArg, 0, sizeof(msgArg)); + if (context->messageFilter && + !context->messageFilter(context->us, object, context, &msgArg, 0)) + { + if ((kIOLogDebugPower & gIOKitDebug) && + (OSDynamicCast(_IOServiceInterestNotifier, object))) + { + _IOServiceInterestNotifier *n = (_IOServiceInterestNotifier *) object; + PM_LOG("%s DROP Client %s, notifier %p, handler %p\n", + context->us->getName(), + getIOMessageString(context->messageType), + object, n->handler); + } return; + } - refcon = ((context->serialNumber & 0xFFFF)<<16) + (context->counter & 0xFFFF); - context->responseFlags->setObject(context->counter, kOSBooleanFalse); + notifier = OSDynamicCast(_IOServiceInterestNotifier, object); + msgType = context->messageType; + msgIndex = context->responseArray->getCount(); + msgRef = ((context->serialNumber & 0xFFFF) << 16) + (msgIndex & 0xFFFF); IOServicePM * pwrMgt = context->us->pwrMgt; if (gIOKitDebug & kIOLogPower) { - OUR_PMLog(kPMLogClientNotify, refcon, (UInt32) context->msgType); + OUR_PMLog(kPMLogClientNotify, msgRef, msgType); if (OSDynamicCast(IOService, object)) { const char *who = ((IOService *) object)->getName(); - gPlatform->PMLog(who, - kPMLogClientNotify, * (UInt32 *) object, (UInt64) object); - } else if (OSDynamicCast(_IOServiceInterestNotifier, object)) { - _IOServiceInterestNotifier *n = (_IOServiceInterestNotifier *) object; - OUR_PMLog(kPMLogClientNotify, (UInt64) n->handler, 0); + gPlatform->PMLog(who, kPMLogClientNotify, (uintptr_t) object, 0); + } + else if (notifier) { + OUR_PMLog(kPMLogClientNotify, (uintptr_t) notifier->handler, 0); } } + if ((kIOLogDebugPower & gIOKitDebug) && notifier) + { + PM_LOG("%s MESG Client %s, notifier %p, handler %p\n", + context->us->getName(), + getIOMessageString(msgType), + object, notifier->handler); + } - notify.powerRef = (void *)refcon; - notify.returnValue = 0; - notify.stateNumber = context->stateNumber; - notify.stateFlags = context->stateFlags; - retCode = context->us->messageClient(context->msgType,object,(void *)¬ify); - if ( retCode == kIOReturnSuccess ) + msgArg.notifyRef = msgRef; + msgArg.maxWaitForReply = 0; + + if (context->enableTracing && (notifier != 0)) + { + uint32_t detail = ((msgIndex & 0xff) << 24) | + ((msgType & 0xfff) << 12) | + (((uintptr_t) notifier->handler) & 0xfff); + getPMRootDomain()->traceDetail( detail ); + } + + retCode = context->us->messageClient( + msgType, object, (void *) &msgArg, sizeof(msgArg)); + + if ( kIOReturnSuccess == retCode ) { - if ( notify.returnValue == 0 ) + if 
( 0 == msgArg.maxWaitForReply ) { // client doesn't want time to respond - context->responseFlags->replaceObject(context->counter, kOSBooleanTrue); - OUR_PMLog(kPMLogClientAcknowledge, refcon, (UInt64) object); - } else { - // it does want time, and it hasn't responded yet - theFlag = context->responseFlags->getObject(context->counter); - if ( kOSBooleanTrue != theFlag ) + OUR_PMLog(kPMLogClientAcknowledge, msgRef, (uintptr_t) object); + } + else + { + replied = kOSBooleanFalse; + if ( msgArg.maxWaitForReply > context->maxTimeRequested ) { - // so note its time requirement - if ( context->maxTimeRequested < notify.returnValue ) + if (msgArg.maxWaitForReply > kCapabilityClientMaxWait) { - context->maxTimeRequested = notify.returnValue; + context->maxTimeRequested = kCapabilityClientMaxWait; + PM_ERROR("%s: client %p returned %u for %s\n", + context->us->getName(), + notifier ? (void *) notifier->handler : object, + msgArg.maxWaitForReply, + getIOMessageString(msgType)); } + else + context->maxTimeRequested = msgArg.maxWaitForReply; } } - } else { - OUR_PMLog(kPMLogClientAcknowledge, refcon, 0); + } + else + { // not a client of ours // so we won't be waiting for response - context->responseFlags->replaceObject(context->counter, kOSBooleanTrue); + OUR_PMLog(kPMLogClientAcknowledge, msgRef, 0); } - context->counter++; + + context->responseArray->setObject(msgIndex, replied); } //********************************************************************************* @@ -4974,23 +5662,29 @@ void IOService::tellChangeUp ( unsigned long ) //********************************************************************************* void IOService::tellClients ( int messageType ) -{ - tellClients( messageType, 0 ); -} - -void IOService::tellClients ( int messageType, IOPMMessageFilter filter ) { IOPMInterestContext context; - context.msgType = messageType; - context.us = this; - context.stateNumber = fHeadNotePowerState; - context.stateFlags = fHeadNotePowerArrayEntry->capabilityFlags; - context.filterFunc = filter; - + RD_LOG("tellClients( %s )\n", getIOMessageString(messageType)); + + memset(&context, 0, sizeof(context)); + context.messageType = messageType; + context.isPreChange = fIsPreChange; + context.us = this; + context.stateNumber = fHeadNotePowerState; + context.stateFlags = fHeadNotePowerArrayEntry->capabilityFlags; + context.changeFlags = fHeadNoteChangeFlags; + context.messageFilter = (IS_ROOT_DOMAIN) ? 
+ OSMemberFunctionCast( + IOPMMessageFilter, + this, + &IOPMrootDomain::systemMessageFilter) : 0; + + context.notifyType = kNotifyPriority; applyToInterested( gIOPriorityPowerStateInterest, tellKernelClientApplier, (void *) &context ); - + + context.notifyType = kNotifyApps; applyToInterested( gIOAppPowerStateInterest, tellAppClientApplier, (void *) &context ); @@ -5006,18 +5700,40 @@ void IOService::tellClients ( int messageType, IOPMMessageFilter filter ) static void tellKernelClientApplier ( OSObject * object, void * arg ) { - IOPMInterestContext * context = (IOPMInterestContext *) arg; IOPowerStateChangeNotification notify; + IOPMInterestContext * context = (IOPMInterestContext *) arg; - if (context->filterFunc && !context->filterFunc(object, arg)) + if (context->messageFilter && + !context->messageFilter(context->us, object, context, 0, 0)) + { + if ((kIOLogDebugPower & gIOKitDebug) && + (OSDynamicCast(_IOServiceInterestNotifier, object))) + { + _IOServiceInterestNotifier *n = (_IOServiceInterestNotifier *) object; + PM_LOG("%s DROP Client %s, notifier %p, handler %p\n", + context->us->getName(), + IOService::getIOMessageString(context->messageType), + object, n->handler); + } return; + } notify.powerRef = (void *) 0; notify.returnValue = 0; notify.stateNumber = context->stateNumber; notify.stateFlags = context->stateFlags; - context->us->messageClient(context->msgType, object, ¬ify); + context->us->messageClient(context->messageType, object, ¬ify); + + if ((kIOLogDebugPower & gIOKitDebug) && + (OSDynamicCast(_IOServiceInterestNotifier, object))) + { + _IOServiceInterestNotifier *n = (_IOServiceInterestNotifier *) object; + PM_LOG("%s MESG Client %s, notifier %p, handler %p\n", + context->us->getName(), + IOService::getIOMessageString(context->messageType), + object, n->handler); + } } //********************************************************************************* @@ -5028,12 +5744,38 @@ static void tellKernelClientApplier ( OSObject * object, void * arg ) static void tellAppClientApplier ( OSObject * object, void * arg ) { - IOPMInterestContext * context = (IOPMInterestContext *) arg; + IOPMInterestContext * context = (IOPMInterestContext *) arg; - if (context->filterFunc && !context->filterFunc(object, arg)) + if (context->messageFilter && + !context->messageFilter(context->us, object, context, 0, 0)) + { + if (kIOLogDebugPower & gIOKitDebug) + { + // Log client pid/name and client array index. + OSString * clientID = 0; + context->us->messageClient(kIOMessageCopyClientID, object, &clientID); + PM_LOG("%s DROP App %s, %s\n", + context->us->getName(), + IOService::getIOMessageString(context->messageType), + clientID ? clientID->getCStringNoCopy() : ""); + if (clientID) clientID->release(); + } return; + } + + if (kIOLogDebugPower & gIOKitDebug) + { + // Log client pid/name and client array index. + OSString * clientID = 0; + context->us->messageClient(kIOMessageCopyClientID, object, &clientID); + PM_LOG("%s MESG App %s, %s\n", + context->us->getName(), + IOService::getIOMessageString(context->messageType), + clientID ? 
clientID->getCStringNoCopy() : ""); + if (clientID) clientID->release(); + } - context->us->messageClient(context->msgType, object, 0); + context->us->messageClient(context->messageType, object, 0); } //********************************************************************************* @@ -5069,12 +5811,11 @@ bool IOService::checkForDone ( void ) // [public] responseValid //********************************************************************************* -bool IOService::responseValid ( unsigned long x, int pid ) +bool IOService::responseValid ( uint32_t refcon, int pid ) { UInt16 serialComponent; UInt16 ordinalComponent; OSObject * theFlag; - unsigned long refcon = (unsigned long) x; serialComponent = (refcon >> 16) & 0xFFFF; ordinalComponent = (refcon & 0xFFFF); @@ -5102,18 +5843,37 @@ bool IOService::responseValid ( unsigned long x, int pid ) #if LOG_APP_RESPONSE_TIMES AbsoluteTime now; AbsoluteTime start; - uint64_t nsec; + uint64_t nsec; + OSString *name = IOCopyLogNameForPID(pid); clock_get_uptime(&now); AbsoluteTime_to_scalar(&start) = num->unsigned64BitValue(); SUB_ABSOLUTETIME(&now, &start); absolutetime_to_nanoseconds(now, &nsec); + + PMEventDetails *details = PMEventDetails::eventDetails( + kIOPMEventTypeAppResponse, // type + name ? name->getCStringNoCopy() : "", // who + (uintptr_t)pid, // owner unique + NULL, // interest name + 0, // old + 0, // new + 0, // result + NS_TO_US(nsec)); // usec completion time + + getPMRootDomain()->recordAndReleasePMEventGated( details ); + + if (kIOLogDebugPower & gIOKitDebug) + { + PM_LOG("Ack(%u) %u ms\n", + (uint32_t) ordinalComponent, + NS_TO_MS(nsec)); + } // > 100 ms if (nsec > LOG_APP_RESPONSE_TIMES) { - OSString * name = IOCopyLogNameForPID(pid); - PM_DEBUG("PM response took %d ms (%s)\n", NS_TO_MS(nsec), + PM_LOG("PM response took %d ms (%s)\n", NS_TO_MS(nsec), name ? name->getCStringNoCopy() : ""); if (nsec > LOG_APP_RESPONSE_MSG_TRACER) @@ -5123,22 +5883,17 @@ bool IOService::responseValid ( unsigned long x, int pid ) gIOPMStatsApplicationResponseSlow, name ? name->getCStringNoCopy() : "", 0, NS_TO_MS(nsec), pid); - } - - if (name) - name->release(); + } } + + if (name) + name->release(); #endif theFlag = kOSBooleanFalse; } if ( kOSBooleanFalse == theFlag ) { - if ((gIOKitDebug & kIOLogDebugPower) && - (fOutOfBandParameter == kNotifyApps)) - { - PM_DEBUG("[Notify %u] acked\n", (uint32_t) ordinalComponent); - } fResponseArray->replaceObject(ordinalComponent, kOSBooleanTrue); } @@ -5151,9 +5906,6 @@ bool IOService::responseValid ( unsigned long x, int pid ) // Our power state is about to lower, and we have notified applications // and kernel clients, and one of them has acknowledged. If this is the last to do // so, and all acknowledgements are positive, we continue with the power change. -// -// We serialize this processing with timer expiration with a command gate on the -// power management workloop, which the timer expiration is command gated to as well. 
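responseValid() above is the decoder for the msgRef cookie: the serial half must match the current notification round, the ordinal half must index an existing slot, and a slot still holding a start timestamp yields the response latency that feeds the slow-app logging. A simplified sketch of the decode-and-measure step, using std::optional timestamps where the kernel stores an OSNumber (names and container choice are illustrative):

#include <chrono>
#include <cstdint>
#include <cstdio>
#include <optional>
#include <vector>

using Clock = std::chrono::steady_clock;

struct ResponseTracker {
    uint16_t serial = 1;
    std::vector<std::optional<Clock::time_point>> slots;  // set = awaiting ack

    bool responseValid(uint32_t refcon) {
        uint16_t serialComponent  = (uint16_t)((refcon >> 16) & 0xFFFF);
        uint16_t ordinalComponent = (uint16_t)(refcon & 0xFFFF);
        if (serialComponent != serial || ordinalComponent >= slots.size())
            return false;                       // stale round or bad index
        if (auto &start = slots[ordinalComponent]) {
            auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(
                          Clock::now() - *start).count();
            std::printf("Ack(%u) %lld ms\n", ordinalComponent, (long long)ms);
            start.reset();                      // mark slot as acknowledged
        }
        return true;
    }
};

int main() {
    ResponseTracker t;
    t.slots.push_back(Clock::now());            // one outstanding client
    t.responseValid(((uint32_t)t.serial << 16) | 0);
}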
//********************************************************************************* IOReturn IOService::allowPowerChange ( unsigned long refcon ) @@ -5172,7 +5924,7 @@ IOReturn IOService::allowPowerChange ( unsigned long refcon ) request->fArg0 = (void *) refcon; request->fArg1 = (void *) proc_selfpid(); - request->fArg2 = (void *) 0; + request->fArg2 = (void *) 0; submitPMRequest( request ); return kIOReturnSuccess; @@ -5192,9 +5944,6 @@ IOReturn IOService::serializedAllowPowerChange2 ( unsigned long refcon ) // Our power state is about to lower, and we have notified applications // and kernel clients, and one of them has vetoed the change. If this is the last // client to respond, we abandon the power change. -// -// We serialize this processing with timer expiration with a command gate on the -// power management workloop, which the timer expiration is command gated to as well. //********************************************************************************* IOReturn IOService::cancelPowerChange ( unsigned long refcon ) @@ -5255,6 +6004,9 @@ void IOService::clampPowerOn ( unsigned long duration ) } #endif /* !__LP64__ */ +// MARK: - +// MARK: Driver Overrides + //********************************************************************************* // [public] setPowerState // @@ -5285,8 +6037,8 @@ unsigned long IOService::maxCapabilityForDomainState ( IOPMPowerFlags domainStat } for ( i = fNumberOfPowerStates - 1; i >= 0; i-- ) { - if ( (domainState & fPowerStates[i].inputPowerRequirement) == - fPowerStates[i].inputPowerRequirement ) + if ( (domainState & fPowerStates[i].inputPowerFlags) == + fPowerStates[i].inputPowerFlags ) { return i; } @@ -5312,8 +6064,8 @@ unsigned long IOService::initialPowerStateForDomainState ( IOPMPowerFlags domain } for ( i = fNumberOfPowerStates - 1; i >= 0; i-- ) { - if ( (domainState & fPowerStates[i].inputPowerRequirement) == - fPowerStates[i].inputPowerRequirement ) + if ( (domainState & fPowerStates[i].inputPowerFlags) == + fPowerStates[i].inputPowerFlags ) { return i; } @@ -5339,8 +6091,8 @@ unsigned long IOService::powerStateForDomainState ( IOPMPowerFlags domainState ) } for ( i = fNumberOfPowerStates - 1; i >= 0; i-- ) { - if ( (domainState & fPowerStates[i].inputPowerRequirement) == - fPowerStates[i].inputPowerRequirement ) + if ( (domainState & fPowerStates[i].inputPowerFlags) == + fPowerStates[i].inputPowerFlags ) { return i; } @@ -5420,6 +6172,9 @@ void IOService::systemWillShutdown( IOOptionBits specifier ) rootDomain->acknowledgeSystemWillShutdown( this ); } +// MARK: - +// MARK: PM State Machine + //********************************************************************************* // [private static] acquirePMRequest //********************************************************************************* @@ -5473,7 +6228,7 @@ void IOService::submitPMRequest( IOPMRequest * request ) assert( gIOPMReplyQueue ); assert( gIOPMRequestQueue ); - PM_TRACE("[+ %02lx] %p [%p %s] %p %p %p\n", + PM_LOG1("[+ %02lx] %p [%p %s] %p %p %p\n", (long)request->getType(), request, request->getTarget(), request->getTarget()->getName(), request->fArg0, request->fArg1, request->fArg2); @@ -5493,7 +6248,7 @@ void IOService::submitPMRequest( IOPMRequest ** requests, IOItemCount count ) for (IOItemCount i = 0; i < count; i++) { IOPMRequest * req = requests[i]; - PM_TRACE("[+ %02lx] %p [%p %s] %p %p %p\n", + PM_LOG1("[+ %02lx] %p [%p %s] %p %p %p\n", (long)req->getType(), req, req->getTarget(), req->getTarget()->getName(), req->fArg0, req->fArg1, req->fArg2); @@ -5504,66 
+6259,39 @@ void IOService::submitPMRequest( IOPMRequest ** requests, IOItemCount count ) //********************************************************************************* // [private] servicePMRequestQueue +// +// Called from IOPMRequestQueue::checkForWork(). //********************************************************************************* bool IOService::servicePMRequestQueue( IOPMRequest * request, IOPMRequestQueue * queue ) { - // Calling PM methods without PMinit() is not allowed, fail the requests. - - if (!initialized) - { - PM_DEBUG("%s: PM not initialized\n", getName()); - goto done; - } - - // Create an IOPMWorkQueue on demand, when the initial PM request is - // received. + bool more; - if (!fPMWorkQueue) - { - // Allocate and attach an IOPMWorkQueue on demand to avoid taking - // the work loop lock in PMinit(), which may deadlock with certain - // drivers / families. - - fPMWorkQueue = IOPMWorkQueue::create( - /* target */ this, - /* Work */ OSMemberFunctionCast(IOPMWorkQueue::Action, this, - &IOService::servicePMRequest), - /* Done */ OSMemberFunctionCast(IOPMWorkQueue::Action, this, - &IOService::retirePMRequest) - ); - - if (fPMWorkQueue && - (gIOPMWorkLoop->addEventSource(fPMWorkQueue) != kIOReturnSuccess)) - { - PM_ERROR("%s: add PM work queue failed\n", getName()); - fPMWorkQueue->release(); - fPMWorkQueue = 0; - } + if (initialized) + { + // Work queue will immediately execute the queued request if possible. + // If execution blocks, the work queue will wait for a producer signal. + // Only need to signal more when completing attached requests. - if (!fPMWorkQueue) - { - PM_ERROR("%s: no PM work queue (type %02lx)\n", - getName(), (long)request->getType()); - goto done; - } - } + more = gIOPMWorkQueue->queuePMRequest(request, pwrMgt); + return more; + } - fPMWorkQueue->queuePMRequest(request); - return false; // do not signal more + // Calling PM without PMinit() is not allowed, fail the request. -done: + PM_LOG("%s: PM not initialized\n", getName()); fAdjustPowerScheduled = false; - gIOPMFreeQueue->queuePMRequest(request); - return false; // do not signal more + more = gIOPMFreeQueue->queuePMRequest(request); + if (more) gIOPMWorkQueue->incrementProducerCount(); + return more; } //********************************************************************************* // [private] servicePMFreeQueue // -// Called by the request completion to recycle a completed request. +// Called from IOPMCompletionQueue::checkForWork(). //********************************************************************************* bool IOService::servicePMFreeQueue( @@ -5575,26 +6303,8 @@ bool IOService::servicePMFreeQueue( if (root && (root != request)) more = true; - - if (fLockedFlags.PMStop && fPMWorkQueue && fPMWorkQueue->isEmpty()) - { - // Driver PMstop'ed and the work queue is empty. - // Detach and destroy the work queue to avoid the similar cleanup by - // PMfree(), which is deadlock prone. After PMstop() if driver calls PM, - // or a request from power parent or child arrives, it is possible to - // create/cleanup work queue more than once. Should be rare.
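The incrementProducerCount() calls threaded through these queue services replace the old unconditional signalWorkAvailable(): the reworked work queue keeps fProducerCount and fConsumerCount (initialized together in IOPMWorkQueue::init() further down), and a checkForWork() pass keeps running while the two differ, so work generated during a drain is not lost. A minimal sketch of that counter handshake, with a plain loop standing in for the IOEventSource machinery (names are illustrative):

#include <cstdint>
#include <cstdio>

struct WorkQueue {
    uint32_t producerCount = 0;   // bumped when new work is queued
    uint32_t consumerCount = 0;   // advanced as checkForWork() drains

    void incrementProducerCount() { producerCount++; }

    // Returns true while another pass is needed, the way the reworked
    // checkForWork() keeps the event source armed.
    bool checkForWork() {
        if (consumerCount == producerCount)
            return false;
        consumerCount = producerCount;
        std::printf("drained up to %u\n", consumerCount);
        return true;
    }
};

int main() {
    WorkQueue q;
    q.incrementProducerCount();       // e.g. a completed attached request
    while (q.checkForWork()) { }      // run until the counts converge
}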
- - gIOPMWorkLoop->removeEventSource(fPMWorkQueue); - fPMWorkQueue->release(); - fPMWorkQueue = 0; - - if ( fIdleTimerEventSource != NULL ) { - fIdleTimerEventSource->disable(); - gIOPMWorkLoop->removeEventSource(fIdleTimerEventSource); - fIdleTimerEventSource->release(); - fIdleTimerEventSource = NULL; - } - } + if (more) + gIOPMWorkQueue->incrementProducerCount(); releasePMRequest( request ); return more; @@ -5610,14 +6320,14 @@ bool IOService::retirePMRequest( IOPMRequest * request, IOPMWorkQueue * queue ) { assert(request && queue); - PM_TRACE("[- %02x] %p [%p %s] State %d, Busy %d\n", + PM_LOG1("[- %02x] %p [%p %s] state %d, busy %d\n", request->getType(), request, this, getName(), fMachineState, gIOPMBusyCount); // Catch requests created by idleTimerExpired(). if ((request->getType() == kIOPMRequestTypeActivityTickle) && - (request->fArg1 == (void *) false)) + (request->fArg1 == (void *) (uintptr_t) false)) { // Idle timer power drop request completed. // Restart the idle timer if deviceDesire can go lower, otherwise set @@ -5633,8 +6343,10 @@ bool IOService::retirePMRequest( IOPMRequest * request, IOPMWorkQueue * queue ) fIdleTimerStopped = true; } - gIOPMFreeQueue->queuePMRequest( request ); - return true; + // If the request is linked, then the work queue has already incremented its + // producer count. + + return (gIOPMFreeQueue->queuePMRequest( request )); } //********************************************************************************* @@ -5656,7 +6368,8 @@ bool IOService::isPMBlocked ( IOPMRequest * request, int count ) // 5 = kDriverCallInformPreChange // 6 = kDriverCallInformPostChange // 7 = kDriverCallSetPowerState - if (fLockedFlags.DriverCallBusy) reason = 5 + fDriverCallReason; + if (fDriverCallBusy) + reason = 5 + fDriverCallReason; break; } @@ -5691,7 +6404,7 @@ bool IOService::isPMBlocked ( IOPMRequest * request, int count ) { if (count) { - PM_TRACE("[B %02x] %p [%p %s] State %d, Reason %d\n", + PM_LOG1("[B %02x] %p [%p %s] state %d, reason %d\n", request->getType(), request, this, getName(), fMachineState, reason); } @@ -5717,10 +6430,11 @@ bool IOService::servicePMRequest( IOPMRequest * request, IOPMWorkQueue * queue ) while (isPMBlocked(request, loop++) == false) { - PM_TRACE("[W %02x] %p [%p %s] State %d\n", + PM_LOG1("[W %02x] %p [%p %s] state %d\n", request->getType(), request, this, getName(), fMachineState); gIOPMRequest = request; + gIOPMWorkCount++; // Every PM machine state must be handled in one of the cases below. @@ -5731,40 +6445,87 @@ bool IOService::servicePMRequest( IOPMRequest * request, IOPMWorkQueue * queue ) break; case kIOPM_OurChangeTellClientsPowerDown: - // our change, was it vetoed? + // Root domain might self cancel due to assertions. + if (IS_ROOT_DOMAIN) + { + bool cancel = (bool) fDoNotPowerDown; + getPMRootDomain()->askChangeDownDone( + &fHeadNoteChangeFlags, &cancel); + fDoNotPowerDown = cancel; + } + + // askChangeDown() done, was it vetoed?
if (!fDoNotPowerDown) { + if (IS_ROOT_DOMAIN) { + PMEventDetails *details = PMEventDetails::eventDetails( + kIOPMEventTypeAppNotificationsFinished, + NULL, + 0, + 0); + + getPMRootDomain()->recordAndReleasePMEventGated( details ); + } + // no, we can continue OurChangeTellClientsPowerDown(); } else { + if (IS_ROOT_DOMAIN) { + PMEventDetails *details = PMEventDetails::eventDetails( + kIOPMEventTypeSleepDone, + NULL, + 1, /* reason: 1 == Ask clients succeeded */ + kIOReturnAborted); /* result */ + + getPMRootDomain()->recordAndReleasePMEventGated( details ); + } + OUR_PMLog(kPMLogIdleCancel, (uintptr_t) this, fMachineState); PM_ERROR("%s: idle cancel\n", fName); // yes, rescind the warning tellNoChangeDown(fHeadNotePowerState); // mark the change note un-actioned - fHeadNoteFlags |= kIOPMNotDone; + fHeadNoteChangeFlags |= kIOPMNotDone; // and we're done - all_done(); + OurChangeFinish(); } break; case kIOPM_OurChangeTellPriorityClientsPowerDown: - // our change, should it be acted on still? + // tellChangeDown(kNotifyApps) done, was it cancelled? if (fDoNotPowerDown) { + if (IS_ROOT_DOMAIN) { + PMEventDetails *details = PMEventDetails::eventDetails( + kIOPMEventTypeSleepDone, + NULL, + 2, /* reason: 2 == Client cancelled wake */ + kIOReturnAborted); /* result */ + + getPMRootDomain()->recordAndReleasePMEventGated( details ); + } OUR_PMLog(kPMLogIdleCancel, (uintptr_t) this, fMachineState); PM_ERROR("%s: idle revert\n", fName); // no, tell clients we're back in the old state tellChangeUp(fCurrentPowerState); // mark the change note un-actioned - fHeadNoteFlags |= kIOPMNotDone; + fHeadNoteChangeFlags |= kIOPMNotDone; // and we're done - all_done(); + OurChangeFinish(); } else { + if (IS_ROOT_DOMAIN) { + PMEventDetails *details = PMEventDetails::eventDetails( + kIOPMEventTypeAppNotificationsFinished, + NULL, + 2, /* reason: 2 == TellPriorityClientsDone */ + kIOReturnSuccess); /* result */ + + getPMRootDomain()->recordAndReleasePMEventGated( details ); + } // yes, we can continue OurChangeTellPriorityClientsPowerDown(); } @@ -5786,44 +6547,40 @@ bool IOService::servicePMRequest( IOPMRequest * request, IOPMWorkQueue * queue ) OurChangeNotifyInterestedDriversDidChange(); break; + case kIOPM_OurChangeTellCapabilityDidChange: + OurChangeTellCapabilityDidChange(); + break; + case kIOPM_OurChangeFinish: OurChangeFinish(); break; - case kIOPM_ParentDownTellPriorityClientsPowerDown: - ParentDownTellPriorityClientsPowerDown(); + case kIOPM_ParentChangeTellPriorityClientsPowerDown: + ParentChangeTellPriorityClientsPowerDown(); break; - case kIOPM_ParentDownNotifyInterestedDriversWillChange: - ParentDownNotifyInterestedDriversWillChange(); + case kIOPM_ParentChangeNotifyInterestedDriversWillChange: + ParentChangeNotifyInterestedDriversWillChange(); break; - case kIOPM_ParentDownNotifyDidChangeAndAcknowledgeChange: - ParentDownNotifyDidChangeAndAcknowledgeChange(); + case kIOPM_ParentChangeSetPowerState: + ParentChangeSetPowerState(); break; - case kIOPM_ParentDownSetPowerState: - ParentDownSetPowerState(); + case kIOPM_ParentChangeWaitForPowerSettle: + ParentChangeWaitForPowerSettle(); break; - case kIOPM_ParentDownWaitForPowerSettle: - ParentDownWaitForPowerSettle(); + case kIOPM_ParentChangeNotifyInterestedDriversDidChange: + ParentChangeNotifyInterestedDriversDidChange(); break; - case kIOPM_ParentAcknowledgePowerChange: - ParentAcknowledgePowerChange(); - break; - - case kIOPM_ParentUpSetPowerState: - ParentUpSetPowerState(); - break; - - case kIOPM_ParentUpWaitForSettleTime: - 
ParentUpWaitForSettleTime(); - break; + case kIOPM_ParentChangeTellCapabilityDidChange: + ParentChangeTellCapabilityDidChange(); + break; - case kIOPM_ParentUpNotifyInterestedDriversDidChange: - ParentUpNotifyInterestedDriversDidChange(); + case kIOPM_ParentChangeAcknowledgePowerChange: + ParentChangeAcknowledgePowerChange(); break; case kIOPM_DriverThreadCallDone: @@ -5833,21 +6590,116 @@ bool IOService::servicePMRequest( IOPMRequest * request, IOPMWorkQueue * queue ) notifyInterestedDriversDone(); break; - case kIOPM_NotifyChildrenDone: - notifyChildrenDone(); + case kIOPM_NotifyChildrenOrdered: + notifyChildrenOrdered(); + break; + + case kIOPM_NotifyChildrenDelayed: + notifyChildrenDelayed(); + break; + + case kIOPM_NotifyChildrenStart: + PM_LOG2("%s: kIOPM_NotifyChildrenStart done\n", getName()); + MS_POP(); // from notifyInterestedDriversDone() + notifyChildren(); + break; + + case kIOPM_SyncTellClientsPowerDown: + // Root domain might self cancel due to assertions. + if (IS_ROOT_DOMAIN) + { + bool cancel = (bool) fDoNotPowerDown; + getPMRootDomain()->askChangeDownDone( + &fHeadNoteChangeFlags, &cancel); + fDoNotPowerDown = cancel; + } + if (!fDoNotPowerDown) + { + fMachineState = kIOPM_SyncTellPriorityClientsPowerDown; + fOutOfBandParameter = kNotifyApps; + tellChangeDown(fHeadNotePowerState); + } + else + { + OUR_PMLog(kPMLogIdleCancel, (uintptr_t) this, fMachineState); + PM_ERROR("%s: idle cancel\n", fName); + tellNoChangeDown(fHeadNotePowerState); + fHeadNoteChangeFlags |= kIOPMNotDone; + OurChangeFinish(); + } + break; + + case kIOPM_SyncTellPriorityClientsPowerDown: + if (!fDoNotPowerDown) + { + fMachineState = kIOPM_SyncNotifyWillChange; + fOutOfBandParameter = kNotifyPriority; + tellChangeDown(fHeadNotePowerState); + } + else + { + OUR_PMLog(kPMLogIdleCancel, (uintptr_t) this, fMachineState); + PM_ERROR("%s: idle revert\n", fName); + tellChangeUp(fCurrentPowerState); + fHeadNoteChangeFlags |= kIOPMNotDone; + OurChangeFinish(); + } break; + case kIOPM_SyncNotifyWillChange: + if (kIOPMSyncNoChildNotify & fHeadNoteChangeFlags) + { + fMachineState = kIOPM_SyncFinish; + continue; + } + fMachineState = kIOPM_SyncNotifyDidChange; + fDriverCallReason = kDriverCallInformPreChange; + notifyChildren(); + break; + case kIOPM_SyncNotifyDidChange: - fMachineState = kIOPM_SyncFinish; + fIsPreChange = false; + + if (fHeadNoteChangeFlags & kIOPMParentInitiated) + fMachineState = kIOPM_SyncFinish; + else + fMachineState = kIOPM_SyncTellCapabilityDidChange; + fDriverCallReason = kDriverCallInformPostChange; notifyChildren(); break; + case kIOPM_SyncTellCapabilityDidChange: + tellSystemCapabilityChange( kIOPM_SyncFinish ); + break; + case kIOPM_SyncFinish: - if (fHeadNoteFlags & kIOPMParentInitiated) - ParentAcknowledgePowerChange(); + if (fHeadNoteChangeFlags & kIOPMParentInitiated) + ParentChangeAcknowledgePowerChange(); else OurChangeFinish(); + break; + + case kIOPM_TellCapabilityChangeDone: + if (fIsPreChange) + { + if (fOutOfBandParameter == kNotifyCapabilityChangePriority) + { + MS_POP(); // tellSystemCapabilityChange() + continue; + } + fOutOfBandParameter = kNotifyCapabilityChangePriority; + } + else + { + if (fOutOfBandParameter == kNotifyCapabilityChangeApps) + { + MS_POP(); // tellSystemCapabilityChange() + continue; + } + fOutOfBandParameter = kNotifyCapabilityChangeApps; + } + tellClientsWithResponse( fOutOfBandMessage ); break; default: @@ -5859,8 +6711,6 @@ bool IOService::servicePMRequest( IOPMRequest * request, IOPMWorkQueue * queue ) if (fMachineState == kIOPM_Finished) { - 
//PM_TRACE("[%s] PM End: Request %p (type %02lx)\n", - // getName(), request, request->getType()); done = true; break; } @@ -5910,6 +6760,7 @@ void IOService::executePMRequest( IOPMRequest * request ) break; case kIOPMRequestTypePowerDomainDidChange: + handlePowerDomainDidChangeTo( request ); break; @@ -5933,29 +6784,10 @@ void IOService::executePMRequest( IOPMRequest * request ) case kIOPMRequestTypeSetIdleTimerPeriod: { - IOWorkLoop * wl = gIOPMWorkLoop; fIdleTimerPeriod = (uintptr_t) request->fArg0; - if (wl && (false == fLockedFlags.PMStop) && (fIdleTimerPeriod > 0)) + if ((false == fLockedFlags.PMStop) && (fIdleTimerPeriod > 0)) { - if ( NULL == fIdleTimerEventSource ) - { - IOTimerEventSource * timerSrc; - - timerSrc = IOTimerEventSource::timerEventSource( - this, - OSMemberFunctionCast(IOTimerEventSource::Action, - this, &IOService::idleTimerExpired)); - - if (timerSrc && (wl->addEventSource(timerSrc) != kIOReturnSuccess)) - { - timerSrc->release(); - timerSrc = 0; - } - - fIdleTimerEventSource = timerSrc; - } - fActivityTickleCount = 0; clock_get_uptime(&fIdleTimerStartTime); start_PM_idle_timer(); @@ -5979,7 +6811,7 @@ bool IOService::servicePMReplyQueue( IOPMRequest * request, IOPMRequestQueue * q assert( request && queue ); assert( request->isReplyType() ); - PM_TRACE("[A %02x] %p [%p %s] State %d\n", + PM_LOG1("[A %02x] %p [%p %s] state %d\n", request->getType(), request, this, getName(), fMachineState); switch ( request->getType() ) @@ -5987,32 +6819,30 @@ bool IOService::servicePMReplyQueue( IOPMRequest * request, IOPMRequestQueue * q case kIOPMRequestTypeAllowPowerChange: case kIOPMRequestTypeCancelPowerChange: // Check if we are expecting this response. - if (responseValid((unsigned long) request->fArg0, (int)(long) request->fArg1)) + if (responseValid((uint32_t)(uintptr_t) request->fArg0, + (int)(uintptr_t) request->fArg1)) { if (kIOPMRequestTypeCancelPowerChange == request->getType()) { - OSString * name = (OSString *) request->fArg2; - getPMRootDomain()->pmStatsRecordApplicationResponse( - gIOPMStatsApplicationResponseCancel, - name ? name->getCStringNoCopy() : "", 0, - 0, (int)(uintptr_t) request->fArg1); + // Clients are not allowed to cancel when kIOPMSkipAskPowerDown + // flag is set. Only root domain will set this flag. + + if ((fHeadNoteChangeFlags & kIOPMSkipAskPowerDown) == 0) + { + fDoNotPowerDown = true; - fDoNotPowerDown = true; + OSString * name = (OSString *) request->fArg2; + getPMRootDomain()->pmStatsRecordApplicationResponse( + gIOPMStatsApplicationResponseCancel, + name ? 
name->getCStringNoCopy() : "", 0, + 0, (int)(uintptr_t) request->fArg1); + } } if (checkForDone()) { stop_ack_timer(); - if ( fResponseArray ) - { - fResponseArray->release(); - fResponseArray = NULL; - } - if ( fNotifyClientArray ) - { - fNotifyClientArray->release(); - fNotifyClientArray = NULL; - } + cleanClientResponses(false); more = true; } } @@ -6045,8 +6875,20 @@ bool IOService::servicePMReplyQueue( IOPMRequest * request, IOPMRequestQueue * q #if LOG_SETPOWER_TIMES uint64_t nsec = computeTimeDeltaNS(&fDriverCallStartTime); if (nsec > LOG_SETPOWER_TIMES) - PM_DEBUG("%s::setPowerState(%p, %lu -> %lu) async took %d ms\n", + PM_LOG("%s::setPowerState(%p, %lu -> %lu) async took %d ms\n", fName, this, fCurrentPowerState, fHeadNotePowerState, NS_TO_MS(nsec)); + + PMEventDetails *details = PMEventDetails::eventDetails( + kIOPMEventTypeSetPowerStateDelayed, // type + fName, // who + (uintptr_t)this, // owner unique + NULL, // interest name + (uint8_t)getPowerState(), // old + (uint8_t)fHeadNotePowerState, // new + 0, // result + NS_TO_US(nsec)); // usec completion time + + getPMRootDomain()->recordAndReleasePMEventGated( details ); #endif OUR_PMLog(kPMLogDriverAcknowledgeSet, (uintptr_t) this, fDriverTimer); fDriverTimer = 0; @@ -6066,68 +6908,187 @@ bool IOService::servicePMReplyQueue( IOPMRequest * request, IOPMRequestQueue * q case kIOPMRequestTypeIdleCancel: if ((fMachineState == kIOPM_OurChangeTellClientsPowerDown) - || (fMachineState == kIOPM_OurChangeTellPriorityClientsPowerDown)) + || (fMachineState == kIOPM_OurChangeTellPriorityClientsPowerDown) + || (fMachineState == kIOPM_SyncTellClientsPowerDown) + || (fMachineState == kIOPM_SyncTellPriorityClientsPowerDown)) { - OUR_PMLog(kPMLogIdleCancel, (uintptr_t) this, 0); + OUR_PMLog(kPMLogIdleCancel, (uintptr_t) this, fMachineState); + PM_LOG2("%s: cancel from machine state %d\n", + getName(), fMachineState); fDoNotPowerDown = true; - if (fMachineState == kIOPM_OurChangeTellPriorityClientsPowerDown) + // Stop waiting for app replys. + if ((fMachineState == kIOPM_OurChangeTellPriorityClientsPowerDown) || + (fMachineState == kIOPM_SyncTellPriorityClientsPowerDown)) cleanClientResponses(false); more = true; } break; + case kIOPMRequestTypeChildNotifyDelayCancel: + if (fMachineState == kIOPM_NotifyChildrenDelayed) + { + PM_LOG2("%s: delay notify cancelled\n", getName()); + notifyChildrenDelayed(); + } + break; + default: panic("servicePMReplyQueue: unknown reply type %x", request->getType()); } - releasePMRequest( request ); + more |= gIOPMFreeQueue->queuePMRequest(request); + if (more) + gIOPMWorkQueue->incrementProducerCount(); + return more; } //********************************************************************************* -// [private] assertPMThreadCall / deassertPMThreadCall +// [private] assertPMDriverCall / deassertPMDriverCall //********************************************************************************* -bool IOService::assertPMThreadCall( void ) +bool IOService::assertPMDriverCall( + IOPMDriverCallEntry * entry, + IOOptionBits options, + IOPMinformee * inform ) { + IOService * target = 0; + bool ok = false; + if (!initialized) return false; - // PMfree() should only be called from IOService::free(). - // That makes it safe to touch IOServicePM state here. - // Caller holds a retain and has checked target is on PM plane. - PM_LOCK(); + if (fLockedFlags.PMStop) { - // PMstop() already issued - fail the assertion. 
- PM_UNLOCK(); - return false; + goto fail; + } + + if (((options & kIOPMADC_NoInactiveCheck) == 0) && isInactive()) + { + goto fail; + } + + if (inform) + { + if (!inform->active) + { + goto fail; + } + target = inform->whatObject; + if (target->isInactive()) + { + goto fail; + } } - // Increment assertion count to block PMstop(), and return true. - fThreadAssertionCount++; - fThreadAssertionThread = current_thread(); // only 1 caller + entry->thread = current_thread(); + entry->target = target; + queue_enter(&fPMDriverCallQueue, entry, IOPMDriverCallEntry *, link); + ok = true; + +fail: PM_UNLOCK(); - return true; + return ok; } -void IOService::deassertPMThreadCall( void ) +void IOService::deassertPMDriverCall( IOPMDriverCallEntry * entry ) { + bool wakeup = false; + PM_LOCK(); - assert(fThreadAssertionCount > 0); - if (fThreadAssertionCount) - fThreadAssertionCount--; - if (current_thread() == fThreadAssertionThread) - fThreadAssertionThread = 0; - if ((fThreadAssertionCount == 0) && fLockedFlags.PMStop) + + assert( !queue_empty(&fPMDriverCallQueue) ); + queue_remove(&fPMDriverCallQueue, entry, IOPMDriverCallEntry *, link); + if (fLockedFlags.PMDriverCallWait) { - // PMstop() is blocked waiting for assertion count to drop to zero. - PM_LOCK_WAKEUP(&fThreadAssertionCount); + wakeup = true; } + PM_UNLOCK(); + + if (wakeup) + PM_LOCK_WAKEUP(&fPMDriverCallQueue); +} + +void IOService::waitForPMDriverCall( IOService * target ) +{ + const IOPMDriverCallEntry * entry; + thread_t thread = current_thread(); + AbsoluteTime deadline; + int waitResult; + bool log = true; + bool wait; + + do { + wait = false; + queue_iterate(&fPMDriverCallQueue, entry, const IOPMDriverCallEntry *, link) + { + // Target of interested driver call + if (target && (target != entry->target)) + continue; + + if (entry->thread == thread) + { + if (log) + { + PM_LOG("%s: %s(%s) on PM thread\n", + fName, __FUNCTION__, target ? target->getName() : ""); + OSReportWithBacktrace("%s: %s(%s) on PM thread\n", + fName, __FUNCTION__, target ? 
target->getName() : ""); + log = false; + } + continue; + } + + wait = true; + break; + } + + if (wait) + { + fLockedFlags.PMDriverCallWait = true; + clock_interval_to_deadline(15, kSecondScale, &deadline); + waitResult = PM_LOCK_SLEEP(&fPMDriverCallQueue, deadline); + fLockedFlags.PMDriverCallWait = false; + if (THREAD_TIMED_OUT == waitResult) + { + PM_ERROR("%s: waitForPMDriverCall timeout\n", fName); + wait = false; + } + } + } while (wait); +} + +//********************************************************************************* +// [private] Debug helpers +//********************************************************************************* + +const char * IOService::getIOMessageString( uint32_t msg ) +{ +#define MSG_ENTRY(x) {x, #x} + + static const IONamedValue msgNames[] = { + MSG_ENTRY( kIOMessageCanDevicePowerOff ), + MSG_ENTRY( kIOMessageDeviceWillPowerOff ), + MSG_ENTRY( kIOMessageDeviceWillNotPowerOff ), + MSG_ENTRY( kIOMessageDeviceHasPoweredOn ), + MSG_ENTRY( kIOMessageCanSystemPowerOff ), + MSG_ENTRY( kIOMessageSystemWillPowerOff ), + MSG_ENTRY( kIOMessageSystemWillNotPowerOff ), + MSG_ENTRY( kIOMessageCanSystemSleep ), + MSG_ENTRY( kIOMessageSystemWillSleep ), + MSG_ENTRY( kIOMessageSystemWillNotSleep ), + MSG_ENTRY( kIOMessageSystemHasPoweredOn ), + MSG_ENTRY( kIOMessageSystemWillRestart ), + MSG_ENTRY( kIOMessageSystemWillPowerOn ), + MSG_ENTRY( kIOMessageSystemCapabilityChange ) + }; + + return IOFindNameForValue(msg, msgNames); } // MARK: - @@ -6189,8 +7150,10 @@ void IOPMRequest::reset( void ) } } -void IOPMRequest::attachNextRequest( IOPMRequest * next ) +bool IOPMRequest::attachNextRequest( IOPMRequest * next ) { + bool ok = false; + if (!fRequestNext) { // Postpone the execution of the next request after @@ -6204,11 +7167,15 @@ void IOPMRequest::attachNextRequest( IOPMRequest * next ) (uint32_t) fRequestNext->fWorkWaitCount, fTarget->getName()); #endif + ok = true; } + return ok; } -void IOPMRequest::detachNextRequest( void ) +bool IOPMRequest::detachNextRequest( void ) { + bool ok = false; + if (fRequestNext) { assert(fRequestNext->fWorkWaitCount); @@ -6222,11 +7189,15 @@ void IOPMRequest::detachNextRequest( void ) fTarget->getName()); #endif fRequestNext = 0; + ok = true; } + return ok; } -void IOPMRequest::attachRootRequest( IOPMRequest * root ) +bool IOPMRequest::attachRootRequest( IOPMRequest * root ) { + bool ok = false; + if (!fRequestRoot) { // Delay the completion of the root request after @@ -6240,11 +7211,15 @@ void IOPMRequest::attachRootRequest( IOPMRequest * root ) (uint32_t) fRequestRoot->fFreeWaitCount, fTarget->getName()); #endif + ok = true; } + return ok; } -void IOPMRequest::detachRootRequest( void ) +bool IOPMRequest::detachRootRequest( void ) { + bool ok = false; + if (fRequestRoot) { assert(fRequestRoot->fFreeWaitCount); @@ -6258,7 +7233,9 @@ void IOPMRequest::detachRootRequest( void ) fTarget->getName()); #endif fRequestRoot = 0; + ok = true; } + return ok; } // MARK: - @@ -6267,8 +7244,7 @@ void IOPMRequest::detachRootRequest( void ) //********************************************************************************* // IOPMRequestQueue Class // -// Global queues. As PM-aware drivers load and unload, their IOPMWorkQueue's are -// created and deallocated. IOPMRequestQueue are created once and never released. +// Global queues. Queues are created once and never released. 
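/*
 * getIOMessageString() above maps kIOMessage* constants to their symbol
 * names via a static IONamedValue table and IOFindNameForValue(). The
 * MSG_ENTRY(x) macro uses the preprocessor stringizing operator so each
 * constant and its printed name cannot drift apart. A standalone sketch
 * of the same table-lookup idiom (NamedValue, findName and the sample
 * enum values are illustrative):
 */
#include <cstdint>

struct NamedValue { uint32_t value; const char *name; };

#define MSG_ENTRY(x) { x, #x }

enum : uint32_t { kMsgSleep = 0x10, kMsgWake = 0x20 };   // sample values

static const NamedValue gNames[] = {
    MSG_ENTRY(kMsgSleep),
    MSG_ENTRY(kMsgWake),
    { 0, nullptr }                       // terminator entry
};

static const char *findName(uint32_t v)
{
    for (const NamedValue *p = gNames; p->name; p++)
        if (p->value == v) return p->name;
    return "UNDEFINED";   // fallback; the real helper formats the raw value
}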
//********************************************************************************* OSDefineMetaClassAndStructors( IOPMRequestQueue, IOEventSource ); @@ -6353,19 +7329,13 @@ bool IOPMRequestQueue::checkForWork( void ) return more; } -void IOPMRequestQueue::signalWorkAvailable( void ) -{ - IOEventSource::signalWorkAvailable(); -} - // MARK: - // MARK: IOPMWorkQueue //********************************************************************************* // IOPMWorkQueue Class // -// Every object in the power plane that has handled a PM request, will have an -// instance of IOPMWorkQueue allocated for it. +// Queue of IOServicePM objects with busy IOPMRequest(s). //********************************************************************************* OSDefineMetaClassAndStructors( IOPMWorkQueue, IOEventSource ); @@ -6390,43 +7360,160 @@ bool IOPMWorkQueue::init( IOService * inOwner, Action work, Action retire ) queue_init(&fWorkQueue); - fWorkAction = work; - fRetireAction = retire; + fWorkAction = work; + fRetireAction = retire; + fConsumerCount = fProducerCount = 0; return true; } -void IOPMWorkQueue::queuePMRequest( IOPMRequest * request ) +bool IOPMWorkQueue::queuePMRequest( IOPMRequest * request, IOServicePM * pwrMgt ) { + bool more = false; + bool empty; + assert( request ); + assert( pwrMgt ); assert( onThread() ); + assert( queue_next(&request->fCommandChain) == + queue_prev(&request->fCommandChain) ); gIOPMBusyCount++; - queue_enter(&fWorkQueue, request, IOPMRequest *, fCommandChain); - checkForWork(); + + // Add new request to the tail of the per-service request queue. + // Then immediately check the request queue to minimize latency + // if the queue was empty. + + empty = queue_empty(&pwrMgt->RequestHead); + queue_enter(&pwrMgt->RequestHead, request, IOPMRequest *, fCommandChain); + if (empty) + { + more = checkRequestQueue(&pwrMgt->RequestHead, &empty); + if (!empty) + { + // New Request is blocked, add IOServicePM to work queue. 
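/*
 * queuePMRequest() above appends the request to a per-service queue and,
 * only when that queue was previously empty, runs checkRequestQueue()
 * right away to minimize latency; if the head request turns out to be
 * blocked, the whole IOServicePM is parked on the shared work queue. A
 * simplified standalone model (Service and tryRun are illustrative):
 */
#include <deque>
#include <vector>

struct Request { bool blocked; };

struct Service {
    std::deque<Request *> requests;
    bool onWorkQueue = false;
};

static bool tryRun(Service &s)           // checkRequestQueue() analogue
{
    while (!s.requests.empty()) {
        Request *r = s.requests.front();
        if (r->blocked) return false;    // head blocked: leave it queued
        s.requests.pop_front();          // retire the completed request
    }
    return true;                         // queue drained
}

static void enqueue(Service &s, Request *r, std::vector<Service *> &workQueue)
{
    bool wasEmpty = s.requests.empty();
    s.requests.push_back(r);
    if (wasEmpty && !tryRun(s) && !s.onWorkQueue) {
        workQueue.push_back(&s);         // park service until unblocked
        s.onWorkQueue = true;
    }
}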
+ assert( queue_next(&pwrMgt->WorkChain) == + queue_prev(&pwrMgt->WorkChain) ); + + queue_enter(&fWorkQueue, pwrMgt, IOServicePM *, WorkChain); + fQueueLength++; + PM_LOG3("IOPMWorkQueue: [%u] added %s@%p to queue\n", + fQueueLength, pwrMgt->Name, pwrMgt); + } + } + + return more; } -bool IOPMWorkQueue::checkForWork( void ) +bool IOPMWorkQueue::checkRequestQueue( queue_head_t * queue, bool * empty ) { IOPMRequest * request; - IOService * target = (IOService *) owner; - bool done; + IOService * target; + bool more = false; + bool done = false; - while (!queue_empty(&fWorkQueue)) - { - request = (IOPMRequest *) queue_first(&fWorkQueue); - assert(request->getTarget() == target); - if (request->isWorkBlocked()) break; - done = (*fWorkAction)( target, request, this ); - if (!done) break; - - assert(gIOPMBusyCount > 0); - if (gIOPMBusyCount) gIOPMBusyCount--; - queue_remove_first(&fWorkQueue, request, IOPMRequest *, fCommandChain); - (*fRetireAction)( target, request, this ); - } + assert(!queue_empty(queue)); + do { + request = (IOPMRequest *) queue_first(queue); + if (request->isWorkBlocked()) + break; // cannot start, blocked on attached request - return false; + target = request->getTarget(); + done = (*fWorkAction)( target, request, this ); + if (!done) + break; // work started, blocked on PM state machine + + assert(gIOPMBusyCount > 0); + if (gIOPMBusyCount) + gIOPMBusyCount--; + + queue_remove_first(queue, request, IOPMRequest *, fCommandChain); + more |= (*fRetireAction)( target, request, this ); + done = queue_empty(queue); + } while (!done); + + *empty = done; + + if (more) + { + // Retired request blocks another request, since the + // blocked request may reside in the work queue, we + // must bump the producer count to avoid work stall. + fProducerCount++; + } + + return more; +} + +bool IOPMWorkQueue::checkForWork( void ) +{ + IOServicePM * entry; + IOServicePM * next; + bool more = false; + bool empty; + +#if WORK_QUEUE_STATS + fStatCheckForWork++; +#endif + + // Each producer signal triggers a full iteration over + // all IOServicePM entries in the work queue. + + while (fConsumerCount != fProducerCount) + { + PM_LOG3("IOPMWorkQueue: checkForWork %u %u\n", + fProducerCount, fConsumerCount); + + fConsumerCount = fProducerCount; + +#if WORK_QUEUE_STATS + if (queue_empty(&fWorkQueue)) + { + fStatQueueEmpty++; + break; + } + fStatScanEntries++; + uint32_t cachedWorkCount = gIOPMWorkCount; +#endif + + entry = (IOServicePM *) queue_first(&fWorkQueue); + while (!queue_end(&fWorkQueue, (queue_entry_t) entry)) + { + more |= checkRequestQueue(&entry->RequestHead, &empty); + + // Get next entry, points to head if current entry is last. + next = (IOServicePM *) queue_next(&entry->WorkChain); + + // if request queue is empty, remove IOServicePM from queue. 
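/*
 * checkForWork() above is driven by a pair of counters: every signal
 * bumps fProducerCount, and the work loop keeps rescanning the whole
 * queue until fConsumerCount catches up. Work generated during a scan
 * (a retired request unblocking another) bumps the producer count again,
 * forcing one more full pass instead of a stall. A single-threaded model
 * of that loop (the kernel version relies on work-loop serialization,
 * not atomics; Entry and the unit-of-work body are illustrative):
 */
#include <cstdint>
#include <vector>

struct Entry { int pending = 0; };

struct WorkQueue {
    std::vector<Entry> entries;
    uint32_t producer = 0, consumer = 0;

    void signal() { producer++; }        // signalWorkAvailable() analogue

    void checkForWork() {
        while (consumer != producer) {
            consumer = producer;         // consume all signals seen so far
            for (Entry &e : entries) {
                if (e.pending > 0) {
                    e.pending--;         // do one unit of work
                    if (e.pending > 0)
                        producer++;      // new work surfaced: force rescan
                }
            }
        }
    }
};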
+ if (empty) + { + assert(fQueueLength); + if (fQueueLength) fQueueLength--; + PM_LOG3("IOPMWorkQueue: [%u] removed %s@%p from queue\n", + fQueueLength, entry->Name, entry); + queue_remove(&fWorkQueue, entry, IOServicePM *, WorkChain); + } + entry = next; + } + +#if WORK_QUEUE_STATS + if (cachedWorkCount == gIOPMWorkCount) + fStatNoWorkDone++; +#endif + } + + return more; +} + +void IOPMWorkQueue::signalWorkAvailable( void ) +{ + fProducerCount++; + IOEventSource::signalWorkAvailable(); +} + +void IOPMWorkQueue::incrementProducerCount( void ) +{ + fProducerCount++; } // MARK: - @@ -6438,7 +7525,8 @@ bool IOPMWorkQueue::checkForWork( void ) OSDefineMetaClassAndStructors( IOPMCompletionQueue, IOEventSource ); -IOPMCompletionQueue * IOPMCompletionQueue::create( IOService * inOwner, Action inAction ) +IOPMCompletionQueue * +IOPMCompletionQueue::create( IOService * inOwner, Action inAction ) { IOPMCompletionQueue * me = OSTypeAlloc(IOPMCompletionQueue); if (me && !me->init(inOwner, inAction)) @@ -6458,39 +7546,40 @@ bool IOPMCompletionQueue::init( IOService * inOwner, Action inAction ) return true; } -void IOPMCompletionQueue::queuePMRequest( IOPMRequest * request ) +bool IOPMCompletionQueue::queuePMRequest( IOPMRequest * request ) { + bool more; + assert(request); - request->detachNextRequest(); // unblocks next request + // unblock dependent request + more = request->detachNextRequest(); queue_enter(&fQueue, request, IOPMRequest *, fCommandChain); - if (workLoop) signalWorkAvailable(); + return more; } bool IOPMCompletionQueue::checkForWork( void ) { Action dqAction = (Action) action; IOPMRequest * request; + IOPMRequest * next; IOService * target; bool more = false; - queue_head_t tmpQueue; - - queue_init(&tmpQueue); - while (!queue_empty(&fQueue)) - { - queue_remove_first( &fQueue, request, IOPMRequest *, fCommandChain ); - if (request->isFreeBlocked()) - { - queue_enter(&tmpQueue, request, IOPMRequest *, fCommandChain); - continue; - } - target = request->getTarget(); - assert(target); - more |= (*dqAction)( target, request, this ); - } + request = (IOPMRequest *) queue_first(&fQueue); + while (!queue_end(&fQueue, (queue_entry_t) request)) + { + next = (IOPMRequest *) queue_next(&request->fCommandChain); + if (!request->isFreeBlocked()) + { + queue_remove(&fQueue, request, IOPMRequest *, fCommandChain); + target = request->getTarget(); + assert(target); + more |= (*dqAction)( target, request, this ); + } + request = next; + } - queue_new_head(&tmpQueue, &fQueue, IOPMRequest *, fCommandChain); - return more; + return more; } // MARK: - @@ -6519,11 +7608,16 @@ IOReturn IOServicePM::gatedSerialize( OSSerialize * s ) { OSDictionary * dict; bool ok = false; - int dictSize = 4; + int dictSize = 5; if (IdleTimerPeriod) dictSize += 4; +#if WORK_QUEUE_STATS + if (gIOPMRootNode == ControllingDriver) + dictSize += 4; +#endif + if (PowerClients) dict = OSDictionary::withDictionary( PowerClients, PowerClients->getCount() + dictSize); @@ -6533,11 +7627,13 @@ IOReturn IOServicePM::gatedSerialize( OSSerialize * s ) if (dict) { setPMProperty(dict, "CurrentPowerState", CurrentPowerState); + if (NumberOfPowerStates) + setPMProperty(dict, "MaxPowerState", NumberOfPowerStates-1); if (DesiredPowerState != CurrentPowerState) setPMProperty(dict, "DesiredPowerState", DesiredPowerState); if (kIOPM_Finished != MachineState) setPMProperty(dict, "MachineState", MachineState); - if (DeviceOverrides) + if (DeviceOverrideEnabled) dict->setObject("PowerOverrideOn", kOSBooleanTrue); if (IdleTimerPeriod) @@ -6560,7 +7656,7 
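/*
 * IOPMCompletionQueue::checkForWork() above walks its queue with the
 * classic remove-safe idiom: queue_next() is saved into 'next' before
 * the current element may be unlinked, then iteration advances to
 * 'next'. The same idiom on a std::list, where erase() returning the
 * successor plays the role of the saved pointer:
 */
#include <list>

struct Req { bool freeBlocked; };

static bool drain(std::list<Req> &q)
{
    bool more = false;
    for (auto it = q.begin(); it != q.end(); ) {
        if (!it->freeBlocked) {
            // complete and unlink; erase() hands back the next element,
            // so removal never invalidates the iteration
            it = q.erase(it);
            more = true;
        } else {
            ++it;                        // still blocked: skip, keep queued
        }
    }
    return more;
}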
@@ IOReturn IOServicePM::gatedSerialize( OSSerialize * s ) delta = now; SUB_ABSOLUTETIME(&delta, &DeviceActiveTimestamp); absolutetime_to_nanoseconds(delta, &nsecs); - setPMProperty(dict, "TimeSinceActivityTickle", NS_TO_MS(nsecs)); + setPMProperty(dict, "TimeSinceLastTickle", NS_TO_MS(nsecs)); } if (AbsoluteTime_to_scalar(&IdleTimerStartTime)) @@ -6573,6 +7669,20 @@ IOReturn IOServicePM::gatedSerialize( OSSerialize * s ) } } +#if WORK_QUEUE_STATS + if (gIOPMRootNode == Owner) + { + setPMProperty(dict, "WQ-CheckForWork", + gIOPMWorkQueue->fStatCheckForWork); + setPMProperty(dict, "WQ-ScanEntries", + gIOPMWorkQueue->fStatScanEntries); + setPMProperty(dict, "WQ-QueueEmpty", + gIOPMWorkQueue->fStatQueueEmpty); + setPMProperty(dict, "WQ-NoWorkDone", + gIOPMWorkQueue->fStatNoWorkDone); + } +#endif + ok = dict->serialize(s); dict->release(); } @@ -6593,3 +7703,53 @@ bool IOServicePM::serialize( OSSerialize * s ) const return (kIOReturnSuccess == ret); } + +PMEventDetails* PMEventDetails::eventDetails(uint32_t type, + const char *ownerName, + uintptr_t ownerUnique, + const char *interestName, + uint8_t oldState, + uint8_t newState, + uint32_t result, + uint32_t elapsedTimeUS) { + + PMEventDetails *myself; + myself = new PMEventDetails; + + if(myself) { + myself->eventType = type; + myself->ownerName = ownerName; + myself->ownerUnique = ownerUnique; + myself->interestName = interestName; + myself->oldState = oldState; + myself->newState = newState; + myself->result = result; + myself->elapsedTimeUS = elapsedTimeUS; + + myself->eventClassifier = kIOPMEventClassDriverEvent; + } + + return myself; +} + + +PMEventDetails* PMEventDetails::eventDetails(uint32_t type, + const char *uuid, + uint32_t reason, + uint32_t result) { + + PMEventDetails *myself; + myself = new PMEventDetails; + + if(myself) { + myself->eventType = type; + myself->uuid = uuid; + myself->reason = reason; + myself->result = result; + + myself->eventClassifier = kIOPMEventClassSystemEvent; + } + + return myself; +} + diff --git a/iokit/Kernel/IOServicePMPrivate.h b/iokit/Kernel/IOServicePMPrivate.h index 818285f8e..96e5bfacc 100644 --- a/iokit/Kernel/IOServicePMPrivate.h +++ b/iokit/Kernel/IOServicePMPrivate.h @@ -29,17 +29,163 @@ #ifndef _IOKIT_IOSERVICEPMPRIVATE_H #define _IOKIT_IOSERVICEPMPRIVATE_H -/*! @class IOServicePM - @abstract Power management class. 
-*/ +#include <IOKit/IOCommand.h> +#include <IOKit/IOEventSource.h> + +//****************************************************************************** +// PM command types +//****************************************************************************** + +enum { + /* Command Types */ + kIOPMRequestTypeInvalid = 0x00, + kIOPMRequestTypePMStop = 0x01, + kIOPMRequestTypeAddPowerChild1 = 0x02, + kIOPMRequestTypeAddPowerChild2 = 0x03, + kIOPMRequestTypeAddPowerChild3 = 0x04, + kIOPMRequestTypeRegisterPowerDriver = 0x05, + kIOPMRequestTypeAdjustPowerState = 0x06, + kIOPMRequestTypePowerDomainWillChange = 0x07, + kIOPMRequestTypePowerDomainDidChange = 0x08, + kIOPMRequestTypePowerOverrideOnPriv = 0x09, + kIOPMRequestTypePowerOverrideOffPriv = 0x0A, + kIOPMRequestTypeActivityTickle = 0x0B, + kIOPMRequestTypeRequestPowerState = 0x0C, + kIOPMRequestTypeSynchronizePowerTree = 0x0D, + kIOPMRequestTypeRequestPowerStateOverride = 0x0E, + kIOPMRequestTypeSetIdleTimerPeriod = 0x0F, + + /* Reply Types */ + kIOPMRequestTypeReplyStart = 0x80, + kIOPMRequestTypeAckPowerChange = 0x81, + kIOPMRequestTypeAckSetPowerState = 0x82, + kIOPMRequestTypeAllowPowerChange = 0x83, + kIOPMRequestTypeCancelPowerChange = 0x84, + kIOPMRequestTypeInterestChanged = 0x85, + kIOPMRequestTypeIdleCancel = 0x86, + kIOPMRequestTypeChildNotifyDelayCancel = 0x87 +}; + +//****************************************************************************** +// PM actions - For root domain only +//****************************************************************************** + +struct IOPMActions; + +typedef void +(*IOPMActionPowerChangeStart)( + void * target, + IOService * service, + IOPMActions * actions, + uint32_t powerState, + uint32_t * changeFlags ); + +typedef void +(*IOPMActionPowerChangeDone)( + void * target, + IOService * service, + IOPMActions * actions, + uint32_t powerState, + uint32_t changeFlags ); + +typedef void +(*IOPMActionPowerChangeOverride)( + void * target, + IOService * service, + IOPMActions * actions, + unsigned long * powerState, + uint32_t * changeFlags ); + +typedef void +(*IOPMActionActivityTickle)( + void * target, + IOService * service, + IOPMActions * actions ); + +struct IOPMActions { + void * target; + uint32_t parameter; + IOPMActionPowerChangeStart actionPowerChangeStart; + IOPMActionPowerChangeDone actionPowerChangeDone; + IOPMActionPowerChangeOverride actionPowerChangeOverride; + IOPMActionActivityTickle actionActivityTickle; +}; + +//****************************************************************************** + +enum { + kIOPMEventClassSystemEvent = 0x00, + kIOPMEventClassDriverEvent = 0x1 +}; + +class PMEventDetails : public OSObject +{ + OSDeclareDefaultStructors( PMEventDetails ); + friend class IOServicePM; + friend class IOPMrootDomain; + friend class IOPMTimeline; +public: + static PMEventDetails *eventDetails(uint32_t type, + const char *ownerName, + uintptr_t ownerUnique, + const char *interestName, + uint8_t oldState, + uint8_t newState, + uint32_t result, + uint32_t elapsedTimeUS); + + static PMEventDetails *eventDetails(uint32_t type, + const char *uuid, + uint32_t reason, + uint32_t result); +private: + uint8_t eventClassifier; + uint32_t eventType; + const char *ownerName; + uintptr_t ownerUnique; + const char *interestName; + uint8_t oldState; + uint8_t newState; + uint32_t result; + uint32_t elapsedTimeUS; + + const char *uuid; + uint32_t reason; +}; + +// Internal concise representation of IOPMPowerState +struct IOPMPSEntry +{ + IOPMPowerFlags capabilityFlags; + IOPMPowerFlags 
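/*
 * IOPMActions above is a per-service table of optional callbacks that
 * only the root domain installs; call sites invoke an action only when
 * its pointer is non-NULL, so ordinary services pay a single test per
 * notification. A compact sketch of the same function-pointer-table
 * pattern (Actions and notifyChangeStart are illustrative):
 */
#include <cstdint>

struct Service;

typedef void (*ChangeStartFn)(void *target, Service *svc,
                              uint32_t state, uint32_t *flags);

struct Actions {
    void         *target = nullptr;
    ChangeStartFn actionPowerChangeStart = nullptr;
};

struct Service { Actions actions; };

static void notifyChangeStart(Service *svc, uint32_t state, uint32_t *flags)
{
    if (svc->actions.actionPowerChangeStart)   // installed by root domain only
        svc->actions.actionPowerChangeStart(svc->actions.target,
                                            svc, state, flags);
}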
outputPowerFlags; + IOPMPowerFlags inputPowerFlags; + uint32_t staticPower; + uint32_t settleUpTime; + uint32_t settleDownTime; +}; + +//****************************************************************************** +// IOServicePM +//****************************************************************************** + class IOServicePM : public OSObject { friend class IOService; + friend class IOPMWorkQueue; OSDeclareDefaultStructors( IOServicePM ) private: - // List of interested drivers. + // Link IOServicePM objects on IOPMWorkQueue. + queue_chain_t WorkChain; + + // Queue of IOPMRequest objects. + queue_head_t RequestHead; + + // IOService creator and owner. + IOService * Owner; + + // List of interested drivers (protected by PMLock). IOPMinformeeList * InterestedDrivers; // How long to wait for controlling driver to acknowledge. @@ -50,21 +196,22 @@ private: thread_call_t AckTimer; thread_call_t SettleTimer; + thread_call_t IdleTimer; // Settle time after changing power state. - unsigned long SettleTimeUS; + uint32_t SettleTimeUS; // The flags describing current change note. - unsigned long HeadNoteFlags; + IOPMPowerChangeFlags HeadNoteChangeFlags; // The new power state number being changed to. - unsigned long HeadNotePowerState; + IOPMPowerStateIndex HeadNotePowerState; // Points to the entry in the power state array. - IOPMPowerState * HeadNotePowerArrayEntry; + IOPMPSEntry * HeadNotePowerArrayEntry; // Power flags supplied by all parents (domain). - unsigned long HeadNoteDomainFlags; + IOPMPowerFlags HeadNoteDomainFlags; // Power flags supplied by domain accounting for parent changes. IOPMPowerFlags HeadNoteDomainTargetFlags; @@ -73,32 +220,26 @@ private: IOPowerConnection * HeadNoteParentConnection; // Power flags supplied by the changing parent. - unsigned long HeadNoteParentFlags; + IOPMPowerFlags HeadNoteParentFlags; // Number of acks still outstanding. - unsigned long HeadNotePendingAcks; + uint32_t HeadNotePendingAcks; // PM state lock. - IOLock * PMLock; - - // Initialized to true, then set to false after the initial power change. - bool InitialChange; - - // Ignore children and driver desires if true. - bool DeviceOverrides; - - // True if device was active since last idle timer expiration. - bool DeviceActive; - - // Keeps track of any negative responses from notified apps and clients. - bool DoNotPowerDown; - - // True if all our parents know the state of their power domain. - bool ParentsKnowState; - - bool StrictTreeOrder; - bool IdleTimerStopped; - bool AdjustPowerScheduled; + IOLock * PMLock; + + unsigned int InitialPowerChange:1; + unsigned int InitialSetPowerState:1; + unsigned int DeviceOverrideEnabled:1; + unsigned int DeviceWasActive:1; + unsigned int DoNotPowerDown:1; + unsigned int ParentsKnowState:1; + unsigned int StrictTreeOrder:1; + unsigned int IdleTimerStopped:1; + unsigned int AdjustPowerScheduled:1; + unsigned int IsPreChange:1; + unsigned int DriverCallBusy:1; + unsigned int PCDFunctionOverride:1; // Time of last device activity. AbsoluteTime DeviceActiveTimestamp; @@ -106,105 +247,106 @@ private: // Used to protect activity flag. IOLock * ActivityLock; - // Idle timer event source. - IOTimerEventSource * IdleTimerEventSource; - // Idle timer's period in seconds. unsigned long IdleTimerPeriod; unsigned long IdleTimerMinPowerState; AbsoluteTime IdleTimerStartTime; // Power state desired by a subclassed device object. - unsigned long DeviceDesire; + IOPMPowerStateIndex DeviceDesire; // This is the power state we desire currently. 
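/*
 * The hunk above converts a block of bool members (InitialChange,
 * DeviceOverrides, ...) into 1-bit bitfields inside IOServicePM, so a
 * dozen flags share one word instead of a byte apiece. A minimal sketch
 * of that trade-off (field names abbreviated from the block above):
 */
#include <cstdio>

struct FlagsAsBools {            // old layout: one byte per flag
    bool initialPowerChange, deviceOverrideEnabled, deviceWasActive,
         doNotPowerDown, parentsKnowState, strictTreeOrder;
};

struct FlagsAsBits {             // new layout: flags packed into one word
    unsigned int initialPowerChange    : 1;
    unsigned int deviceOverrideEnabled : 1;
    unsigned int deviceWasActive       : 1;
    unsigned int doNotPowerDown        : 1;
    unsigned int parentsKnowState      : 1;
    unsigned int strictTreeOrder       : 1;
};

int main()
{
    // typically prints 6 vs. 4 (exact sizes are ABI-dependent)
    printf("%zu %zu\n", sizeof(FlagsAsBools), sizeof(FlagsAsBits));
    return 0;
}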
- unsigned long DesiredPowerState; + IOPMPowerStateIndex DesiredPowerState; // This is what our parent thinks our need is. - unsigned long PreviousRequest; + IOPMPowerFlags PreviousRequestPowerFlags; // Cache result from getName(), used in logging. const char * Name; // Number of power states in the power array. - unsigned long NumberOfPowerStates; + IOPMPowerStateIndex NumberOfPowerStates; // Power state array. - IOPMPowerState * PowerStates; + IOPMPSEntry * PowerStates; // The controlling driver. - IOService * ControllingDriver; + IOService * ControllingDriver; // Our current power state. - unsigned long CurrentPowerState; + IOPMPowerStateIndex CurrentPowerState; // Logical OR of power flags for each power domain parent. - IOPMPowerFlags ParentsCurrentPowerFlags; + IOPMPowerFlags ParentsCurrentPowerFlags; // The highest power state we can achieve in current power domain. - unsigned long MaxCapability; + IOPMPowerStateIndex MaxPowerState; // Logical OR of all output power character flags in the array. - IOPMPowerFlags OutputPowerCharacterFlags; + IOPMPowerFlags OutputPowerCharacterFlags; // OSArray which manages responses from notified apps and clients. - OSArray * ResponseArray; + OSArray * ResponseArray; OSArray * NotifyClientArray; // Used to uniquely identify power management notification to apps and clients. - UInt16 SerialNumber; + UInt16 SerialNumber; // Used to communicate desired function to tellClientsWithResponse(). // This is used because it avoids changing the signatures of the affected virtual methods. - int OutOfBandParameter; + int OutOfBandParameter; AbsoluteTime DriverCallStartTime; IOPMPowerFlags CurrentCapabilityFlags; long ActivityTicklePowerState; unsigned long CurrentPowerConsumption; - unsigned long TempClampPowerState; - IOPMWorkQueue * PMWorkQueue; - OSSet * InsertInterestSet; - OSSet * RemoveInterestSet; - OSArray * NotifyChildArray; + IOPMPowerStateIndex TempClampPowerState; + OSArray * NotifyChildArray; OSDictionary * PowerClients; - thread_call_t DriverCallEntry; - void * DriverCallParamPtr; - IOItemCount DriverCallParamCount; - IOItemCount DriverCallParamSlots; + thread_call_t DriverCallEntry; + void * DriverCallParamPtr; + IOItemCount DriverCallParamCount; + IOItemCount DriverCallParamSlots; uint32_t DriverCallReason; + uint32_t OutOfBandMessage; uint32_t TempClampCount; uint32_t OverrideMaxPowerState; uint32_t ActivityTickleCount; uint32_t WaitReason; - uint32_t NextMachineState; + uint32_t SavedMachineState; uint32_t RootDomainState; - uint32_t ThreadAssertionCount; - // Protected by PMLock + // Protected by PMLock - BEGIN struct { - uint32_t DriverCallBusy : 1; - uint32_t PMStop : 1; + uint32_t PMStop : 1; + uint32_t PMDriverCallWait : 1; } LockedFlags; - thread_t ThreadAssertionThread; + queue_head_t PMDriverCallQueue; + OSSet * InsertInterestSet; + OSSet * RemoveInterestSet; + // Protected by PMLock - END #if PM_VARS_SUPPORT - IOPMprot * PMVars; + IOPMprot * PMVars; #endif + IOPMActions PMActions; + // Serialize IOServicePM state for debug output. 
IOReturn gatedSerialize( OSSerialize * s ); virtual bool serialize( OSSerialize * s ) const; }; +#define fOwner pwrMgt->Owner #define fInterestedDrivers pwrMgt->InterestedDrivers #define fDriverTimer pwrMgt->DriverTimer +#define fMachineState pwrMgt->MachineState #define fAckTimer pwrMgt->AckTimer #define fSettleTimer pwrMgt->SettleTimer -#define fMachineState pwrMgt->MachineState +#define fIdleTimer pwrMgt->IdleTimer #define fSettleTimeUS pwrMgt->SettleTimeUS -#define fHeadNoteFlags pwrMgt->HeadNoteFlags +#define fHeadNoteChangeFlags pwrMgt->HeadNoteChangeFlags #define fHeadNotePowerState pwrMgt->HeadNotePowerState #define fHeadNotePowerArrayEntry pwrMgt->HeadNotePowerArrayEntry #define fHeadNoteDomainFlags pwrMgt->HeadNoteDomainFlags @@ -213,63 +355,63 @@ private: #define fHeadNoteParentFlags pwrMgt->HeadNoteParentFlags #define fHeadNotePendingAcks pwrMgt->HeadNotePendingAcks #define fPMLock pwrMgt->PMLock -#define fInitialChange pwrMgt->InitialChange -#define fDeviceOverrides pwrMgt->DeviceOverrides +#define fInitialPowerChange pwrMgt->InitialPowerChange +#define fInitialSetPowerState pwrMgt->InitialSetPowerState +#define fDeviceOverrideEnabled pwrMgt->DeviceOverrideEnabled +#define fDeviceWasActive pwrMgt->DeviceWasActive +#define fDoNotPowerDown pwrMgt->DoNotPowerDown +#define fParentsKnowState pwrMgt->ParentsKnowState +#define fStrictTreeOrder pwrMgt->StrictTreeOrder +#define fIdleTimerStopped pwrMgt->IdleTimerStopped +#define fAdjustPowerScheduled pwrMgt->AdjustPowerScheduled +#define fIsPreChange pwrMgt->IsPreChange +#define fDriverCallBusy pwrMgt->DriverCallBusy +#define fPCDFunctionOverride pwrMgt->PCDFunctionOverride +#define fDeviceActiveTimestamp pwrMgt->DeviceActiveTimestamp #define fActivityLock pwrMgt->ActivityLock -#define fIdleTimerEventSource pwrMgt->IdleTimerEventSource #define fIdleTimerPeriod pwrMgt->IdleTimerPeriod #define fIdleTimerMinPowerState pwrMgt->IdleTimerMinPowerState -#define fDeviceActive pwrMgt->DeviceActive #define fIdleTimerStartTime pwrMgt->IdleTimerStartTime -#define fDeviceActiveTimestamp pwrMgt->DeviceActiveTimestamp -#define fActivityTickleCount pwrMgt->ActivityTickleCount #define fDeviceDesire pwrMgt->DeviceDesire #define fDesiredPowerState pwrMgt->DesiredPowerState -#define fPreviousRequest pwrMgt->PreviousRequest +#define fPreviousRequestPowerFlags pwrMgt->PreviousRequestPowerFlags #define fName pwrMgt->Name #define fNumberOfPowerStates pwrMgt->NumberOfPowerStates #define fPowerStates pwrMgt->PowerStates #define fControllingDriver pwrMgt->ControllingDriver -#define fAggressivenessValue pwrMgt->AggressivenessValue -#define fAggressivenessValid pwrMgt->AggressivenessValid #define fCurrentPowerState pwrMgt->CurrentPowerState -#define fParentsKnowState pwrMgt->ParentsKnowState #define fParentsCurrentPowerFlags pwrMgt->ParentsCurrentPowerFlags -#define fMaxCapability pwrMgt->MaxCapability +#define fMaxPowerState pwrMgt->MaxPowerState #define fOutputPowerCharacterFlags pwrMgt->OutputPowerCharacterFlags -#define fSerialNumber pwrMgt->SerialNumber #define fResponseArray pwrMgt->ResponseArray #define fNotifyClientArray pwrMgt->NotifyClientArray -#define fDoNotPowerDown pwrMgt->DoNotPowerDown +#define fSerialNumber pwrMgt->SerialNumber #define fOutOfBandParameter pwrMgt->OutOfBandParameter #define fDriverCallStartTime pwrMgt->DriverCallStartTime #define fCurrentCapabilityFlags pwrMgt->CurrentCapabilityFlags +#define fActivityTicklePowerState pwrMgt->ActivityTicklePowerState #define fCurrentPowerConsumption pwrMgt->CurrentPowerConsumption #define 
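/*
 * The fXxx macros above (e.g. #define fOwner pwrMgt->Owner) let IOService
 * methods refer to fields of the separately allocated IOServicePM object
 * as if they were direct members, keeping the many existing references
 * unchanged while all PM state lives behind one 'pwrMgt' pointer. The
 * shape of the idiom (the real header defines the macros once for the
 * whole translation unit; the #undef here is only standalone hygiene):
 */
struct PMState { unsigned long CurrentPowerState; };

struct Service {
    PMState *pwrMgt;                 // all PM state hangs off one pointer
#define fCurrentPowerState pwrMgt->CurrentPowerState
    unsigned long power() const { return fCurrentPowerState; }
#undef fCurrentPowerState
};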
fTempClampPowerState pwrMgt->TempClampPowerState -#define fTempClampCount pwrMgt->TempClampCount -#define fOverrideMaxPowerState pwrMgt->OverrideMaxPowerState -#define fPMWorkQueue pwrMgt->PMWorkQueue -#define fWaitReason pwrMgt->WaitReason -#define fNextMachineState pwrMgt->NextMachineState -#define fDriverCallReason pwrMgt->DriverCallReason +#define fNotifyChildArray pwrMgt->NotifyChildArray +#define fPowerClients pwrMgt->PowerClients #define fDriverCallEntry pwrMgt->DriverCallEntry #define fDriverCallParamPtr pwrMgt->DriverCallParamPtr #define fDriverCallParamCount pwrMgt->DriverCallParamCount #define fDriverCallParamSlots pwrMgt->DriverCallParamSlots -#define fActivityTickled pwrMgt->ActivityTickled +#define fDriverCallReason pwrMgt->DriverCallReason +#define fOutOfBandMessage pwrMgt->OutOfBandMessage +#define fTempClampCount pwrMgt->TempClampCount +#define fOverrideMaxPowerState pwrMgt->OverrideMaxPowerState +#define fActivityTickleCount pwrMgt->ActivityTickleCount +#define fWaitReason pwrMgt->WaitReason +#define fSavedMachineState pwrMgt->SavedMachineState +#define fRootDomainState pwrMgt->RootDomainState +#define fLockedFlags pwrMgt->LockedFlags +#define fPMDriverCallQueue pwrMgt->PMDriverCallQueue #define fInsertInterestSet pwrMgt->InsertInterestSet #define fRemoveInterestSet pwrMgt->RemoveInterestSet -#define fStrictTreeOrder pwrMgt->StrictTreeOrder -#define fNotifyChildArray pwrMgt->NotifyChildArray -#define fIdleTimerStopped pwrMgt->IdleTimerStopped -#define fAdjustPowerScheduled pwrMgt->AdjustPowerScheduled -#define fActivityTicklePowerState pwrMgt->ActivityTicklePowerState #define fPMVars pwrMgt->PMVars -#define fPowerClients pwrMgt->PowerClients -#define fRootDomainState pwrMgt->RootDomainState -#define fThreadAssertionCount pwrMgt->ThreadAssertionCount -#define fThreadAssertionThread pwrMgt->ThreadAssertionThread -#define fLockedFlags pwrMgt->LockedFlags +#define fPMActions pwrMgt->PMActions /* When an IOService is waiting for acknowledgement to a power change @@ -279,13 +421,27 @@ the ack timer is ticking every tenth of a second. */ #define ACK_TIMER_PERIOD 100000000 -#define kIOPMParentInitiated 0x01 // this power change initiated by our parent -#define kIOPMWeInitiated 0x02 // this power change initiated by this device -#define kIOPMNotDone 0x04 // we couldn't make this change -#define kIOPMDomainWillChange 0x08 // change started by PowerDomainWillChangeTo -#define kIOPMDomainDidChange 0x10 // change started by PowerDomainDidChangeTo -#define kIOPMDomainPowerDrop 0x20 // Domain is lowering power -#define kIOPMSynchronize 0x40 // change triggered by power tree re-sync +// Max wait time in microseconds for kernel priority and capability clients +// with async message handlers to acknowledge. +// +#define kPriorityClientMaxWait (90 * 1000 * 1000) +#define kCapabilityClientMaxWait (240 * 1000 * 1000) + +// Attributes describing a power state change. +// See IOPMPowerChangeFlags data type. 
+// +#define kIOPMParentInitiated 0x0001 // this power change initiated by our parent +#define kIOPMSelfInitiated 0x0002 // this power change initiated by this device +#define kIOPMNotDone 0x0004 // we couldn't make this change +#define kIOPMDomainWillChange 0x0008 // change started by PowerDomainWillChangeTo +#define kIOPMDomainDidChange 0x0010 // change started by PowerDomainDidChangeTo +#define kIOPMDomainPowerDrop 0x0020 // Domain is lowering power +#define kIOPMIgnoreChildren 0x0040 // Ignore children and driver power desires +#define kIOPMSkipAskPowerDown 0x0080 // skip the ask app phase +#define kIOPMSynchronize 0x0100 // change triggered by power tree re-sync +#define kIOPMSyncNoChildNotify 0x0200 // sync root domain only, not entire tree +#define kIOPMSyncTellPowerDown 0x0400 // send the ask/will power off messages +#define kIOPMSyncCancelPowerDown 0x0800 // sleep cancel for maintenance wake enum { kDriverCallInformPreChange, @@ -298,73 +454,51 @@ struct DriverCallParam { IOReturn Result; }; -// values of outofbandparameter +// values of OutOfBandParameter enum { kNotifyApps, - kNotifyPriority + kNotifyPriority, + kNotifyCapabilityChangeApps, + kNotifyCapabilityChangePriority }; -typedef bool (*IOPMMessageFilter)(OSObject * object, void * context); +typedef bool (*IOPMMessageFilter)( + void * target, void * object, void * arg1, void * arg2, void * arg3 ); // used for applyToInterested struct IOPMInterestContext { - OSArray * responseFlags; - OSArray * notifyClients; - UInt16 serialNumber; - UInt16 counter; - UInt32 maxTimeRequested; - int msgType; - IOService * us; - unsigned long stateNumber; - IOPMPowerFlags stateFlags; - const char * errorLog; - IOPMMessageFilter filterFunc; + OSArray * responseArray; + OSArray * notifyClients; + uint16_t serialNumber; + uint8_t isPreChange; + uint8_t enableTracing; + uint32_t maxTimeRequested; + uint32_t messageType; + uint32_t notifyType; + IOService * us; + IOPMPowerStateIndex stateNumber; + IOPMPowerFlags stateFlags; + IOPMPowerChangeFlags changeFlags; + const char * errorLog; + IOPMMessageFilter messageFilter; }; -//********************************************************************************* +// assertPMDriverCall() options +enum { + kIOPMADC_NoInactiveCheck = 1 +}; + +//****************************************************************************** // PM Statistics & Diagnostics -//********************************************************************************* +//****************************************************************************** extern const OSSymbol *gIOPMStatsApplicationResponseTimedOut; extern const OSSymbol *gIOPMStatsApplicationResponseCancel; extern const OSSymbol *gIOPMStatsApplicationResponseSlow; -//********************************************************************************* -// PM command types -//********************************************************************************* - -enum { - /* Command Types */ - kIOPMRequestTypeInvalid = 0x00, - kIOPMRequestTypePMStop = 0x01, - kIOPMRequestTypeAddPowerChild1 = 0x02, - kIOPMRequestTypeAddPowerChild2 = 0x03, - kIOPMRequestTypeAddPowerChild3 = 0x04, - kIOPMRequestTypeRegisterPowerDriver = 0x05, - kIOPMRequestTypeAdjustPowerState = 0x06, - kIOPMRequestTypePowerDomainWillChange = 0x07, - kIOPMRequestTypePowerDomainDidChange = 0x08, - kIOPMRequestTypePowerOverrideOnPriv = 0x09, - kIOPMRequestTypePowerOverrideOffPriv = 0x0A, - kIOPMRequestTypeActivityTickle = 0x0B, - kIOPMRequestTypeRequestPowerState = 0x0C, - kIOPMRequestTypeSynchronizePowerTree = 0x0D, - 
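/*
 * The widened kIOPM* change-flag set above is a bitmask carried in
 * IOPMPowerChangeFlags: flags are OR-ed together when a change is queued
 * and tested with '&' at each decision point. A sketch of typical use,
 * with values copied from the block above; the policy in shouldAskApps
 * is illustrative, not the kernel's exact checks:
 */
#include <cstdint>

typedef uint32_t PowerChangeFlags;

enum : PowerChangeFlags {
    kParentInitiated  = 0x0001,
    kSelfInitiated    = 0x0002,
    kDomainPowerDrop  = 0x0020,
    kSkipAskPowerDown = 0x0080,
    kSynchronize      = 0x0100
};

static bool shouldAskApps(PowerChangeFlags f)
{
    // illustrative policy: only an ordinary power drop asks applications
    if (f & kSynchronize)      return false;   // tree re-sync, no dialogue
    if (f & kSkipAskPowerDown) return false;   // ask phase explicitly skipped
    return (f & kDomainPowerDrop) != 0;
}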
kIOPMRequestTypeRequestPowerStateOverride = 0x0E, - kIOPMRequestTypeSetIdleTimerPeriod = 0x0F, - - /* Reply Types */ - kIOPMRequestTypeReplyStart = 0x80, - kIOPMRequestTypeAckPowerChange = 0x81, - kIOPMRequestTypeAckSetPowerState = 0x82, - kIOPMRequestTypeAllowPowerChange = 0x83, - kIOPMRequestTypeCancelPowerChange = 0x84, - kIOPMRequestTypeInterestChanged = 0x85, - kIOPMRequestTypeIdleCancel = 0x86 -}; - -//********************************************************************************* -// IOServicePM internal helper classes -//********************************************************************************* +//****************************************************************************** +// IOPMRequest +//****************************************************************************** typedef void (*IOPMCompletionAction)(void * target, void * param, IOReturn status); @@ -445,12 +579,16 @@ public: static IOPMRequest * create( void ); bool init( IOService * owner, IOOptionBits type ); void reset( void ); - void attachNextRequest( IOPMRequest * next ); - void detachNextRequest( void ); - void attachRootRequest( IOPMRequest * root ); - void detachRootRequest( void ); + bool attachNextRequest( IOPMRequest * next ); + bool detachNextRequest( void ); + bool attachRootRequest( IOPMRequest * root ); + bool detachRootRequest( void ); }; +//****************************************************************************** +// IOPMRequestQueue +//****************************************************************************** + class IOPMRequestQueue : public IOEventSource { OSDeclareDefaultStructors( IOPMRequestQueue ) @@ -470,9 +608,14 @@ public: static IOPMRequestQueue * create( IOService * inOwner, Action inAction ); void queuePMRequest( IOPMRequest * request ); void queuePMRequestChain( IOPMRequest ** requests, IOItemCount count ); - void signalWorkAvailable( void ); }; +//****************************************************************************** +// IOPMWorkQueue +//****************************************************************************** + +#define WORK_QUEUE_STATS 1 + class IOPMWorkQueue : public IOEventSource { OSDeclareDefaultStructors( IOPMWorkQueue ) @@ -480,24 +623,36 @@ class IOPMWorkQueue : public IOEventSource public: typedef bool (*Action)( IOService *, IOPMRequest *, IOPMWorkQueue * ); +#if WORK_QUEUE_STATS + uint64_t fStatCheckForWork; + uint64_t fStatScanEntries; + uint64_t fStatQueueEmpty; + uint64_t fStatNoWorkDone; +#endif + protected: - queue_head_t fWorkQueue; - Action fWorkAction; - Action fRetireAction; + queue_head_t fWorkQueue; + Action fWorkAction; + Action fRetireAction; + uint32_t fQueueLength; + uint32_t fConsumerCount; + volatile uint32_t fProducerCount; virtual bool checkForWork( void ); virtual bool init( IOService * inOwner, Action work, Action retire ); + bool checkRequestQueue( queue_head_t * queue, bool * empty ); public: static IOPMWorkQueue * create( IOService * inOwner, Action work, Action retire ); - void queuePMRequest( IOPMRequest * request ); - - inline boolean_t isEmpty( void ) - { - return queue_empty(&fWorkQueue); - } + bool queuePMRequest( IOPMRequest * request, IOServicePM * pwrMgt ); + void signalWorkAvailable( void ); + void incrementProducerCount( void ); }; +//****************************************************************************** +// IOPMCompletionQueue +//****************************************************************************** + class IOPMCompletionQueue : public IOEventSource { OSDeclareDefaultStructors( IOPMCompletionQueue 
) @@ -513,7 +668,7 @@ protected: public: static IOPMCompletionQueue * create( IOService * inOwner, Action inAction ); - void queuePMRequest( IOPMRequest * request ); + bool queuePMRequest( IOPMRequest * request ); }; #endif /* !_IOKIT_IOSERVICEPMPRIVATE_H */ diff --git a/iokit/Kernel/IOServicePrivate.h b/iokit/Kernel/IOServicePrivate.h index 1d455fbee..873d47660 100644 --- a/iokit/Kernel/IOServicePrivate.h +++ b/iokit/Kernel/IOServicePrivate.h @@ -55,6 +55,8 @@ enum { kIOServiceConfigState = 0x04000000, kIOServiceTermPhase2State = 0x01000000, kIOServiceTermPhase3State = 0x00800000, + kIOServiceTermPhase1State = 0x00400000, + kIOServiceTerm1WaiterState = 0x00200000 }; // options for terminate() @@ -167,6 +169,7 @@ class IOResources : public IOService public: static IOService * resources( void ); + virtual bool init( OSDictionary * dictionary = 0 ); virtual IOWorkLoop * getWorkLoop( ) const; virtual bool matchPropertyTable( OSDictionary * table ); virtual IOReturn setProperties( OSObject * properties ); @@ -195,6 +198,7 @@ public: extern const OSSymbol * gIOConsoleUsersKey; extern const OSSymbol * gIOConsoleSessionUIDKey; +extern const OSSymbol * gIOConsoleSessionAuditIDKey; extern const OSSymbol * gIOConsoleSessionOnConsoleKey; extern const OSSymbol * gIOConsoleSessionSecureInputPIDKey; diff --git a/iokit/Kernel/IOStartIOKit.cpp b/iokit/Kernel/IOStartIOKit.cpp index 7b70541d6..49397d4cb 100644 --- a/iokit/Kernel/IOStartIOKit.cpp +++ b/iokit/Kernel/IOStartIOKit.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2011 Apple Inc. All rights reserved. + * Copyright (c) 1998-2010 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -38,7 +38,10 @@ #include <IOKit/IOLib.h> #include <IOKit/IOKitKeys.h> #include <IOKit/IOKitDebug.h> +#include <IOKit/pwr_mgt/RootDomain.h> #include <IOKit/pwr_mgt/IOPMinformeeList.h> +#include <IOKit/IOStatisticsPrivate.h> +#include <IOKit/IOKitKeysPrivate.h> #include <IOKit/assert.h> @@ -61,7 +64,7 @@ void IOKitInitializeTime( void ) t.tv_nsec = 0; IOService::waitForService( IOService::resourceMatching("IORTC"), &t ); -#ifdef ppc +#if defined(__i386__) || defined(__x86_64__) IOService::waitForService( IOService::resourceMatching("IONVRAM"), &t ); #endif @@ -79,6 +82,8 @@ void IOKitResetTime( void ) clock_get_calendar_microtime(&secs, µsecs); gIOLastWakeTime.tv_sec = secs; gIOLastWakeTime.tv_usec = microsecs; + + IOService::updateConsoleUsers(NULL, kIOMessageSystemHasPoweredOn); } void iokit_post_constructor_init(void) @@ -90,8 +95,11 @@ void iokit_post_constructor_init(void) assert( root ); IOService::initialize(); IOCatalogue::initialize(); + IOStatistics::initialize(); + OSKext::initialize(); IOUserClient::initialize(); IOMemoryDescriptor::initialize(); + IORootParent::initialize(); // Initializes IOPMinformeeList class-wide shared lock IOPMinformeeList::getSharedRecursiveLock(); @@ -107,7 +115,6 @@ void iokit_post_constructor_init(void) root->setProperty( kIOKitDiagnosticsKey, obj ); obj->release(); } - } // From <osfmk/kern/debug.c> @@ -122,24 +129,20 @@ void StartIOKit( void * p1, void * p2, void * p3, void * p4 ) { IOPlatformExpertDevice * rootNub; int debugFlags; - uint32_t intThreshold; if( PE_parse_boot_argn( "io", &debugFlags, sizeof (debugFlags) )) - gIOKitDebug = debugFlags; - + gIOKitDebug = debugFlags; + if( PE_parse_boot_argn( "iotrace", &debugFlags, sizeof (debugFlags) )) gIOKitTrace = debugFlags; // Compat for boot-args gIOKitTrace |= (gIOKitDebug & kIOTraceCompatBootArgs); - - if( PE_parse_boot_argn( "iointthreshold", 
&intThreshold, sizeof (intThreshold) )) - gIOInterruptThresholdNS = intThreshold * 1000; // Check for the log synchronous bit set in io if (gIOKitDebug & kIOLogSynchronous) debug_mode = true; - + // // Have to start IOKit environment before we attempt to start // the C++ runtime environment. At some stage we have to clean up diff --git a/iokit/Kernel/IOStatistics.cpp b/iokit/Kernel/IOStatistics.cpp new file mode 100644 index 000000000..9235b293d --- /dev/null +++ b/iokit/Kernel/IOStatistics.cpp @@ -0,0 +1,1279 @@ +/* + * Copyright (c) 2010 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include <sys/sysctl.h> +#include <kern/host.h> + +#include <IOKit/system.h> +#include <libkern/c++/OSKext.h> +#include <libkern/OSAtomic.h> + +#include <IOKit/IOStatisticsPrivate.h> +#include <IOKit/IOUserClient.h> +#include <IOKit/IOEventSource.h> +#include <IOKit/IOKitDebug.h> + +#if IOKITSTATS + +bool IOStatistics::enabled = false; + +uint32_t IOStatistics::sequenceID = 0; + +uint32_t IOStatistics::lastClassIndex = 0; +uint32_t IOStatistics::lastKextIndex = 0; + +uint32_t IOStatistics::loadedKexts = 0; +uint32_t IOStatistics::registeredClasses = 0; +uint32_t IOStatistics::registeredCounters = 0; +uint32_t IOStatistics::registeredWorkloops = 0; + +uint32_t IOStatistics::attachedEventSources = 0; + +IOWorkLoopDependency *IOStatistics::nextWorkLoopDependency = NULL; + +/* Logging */ + +#define LOG_LEVEL 0 + +#define LOG(level, format, ...) 
\ +do { \ + if (level <= LOG_LEVEL) \ + printf(format, ##__VA_ARGS__); \ +} while (0) + +/* Locks */ + +IORWLock *IOStatistics::lock = NULL; + +/* Kext tree */ + +KextNode *IOStatistics::kextHint = NULL; + +IOStatistics::KextTreeHead IOStatistics::kextHead = RB_INITIALIZER(&IOStatistics::kextHead); + +int IOStatistics::kextNodeCompare(KextNode *e1, KextNode *e2) +{ + if (e1->kext < e2->kext) + return -1; + else if (e1->kext > e2->kext) + return 1; + else + return 0; +} + +RB_GENERATE(IOStatistics::KextTree, KextNode, link, kextNodeCompare); + +/* Kext tree ordered by address */ + +IOStatistics::KextAddressTreeHead IOStatistics::kextAddressHead = RB_INITIALIZER(&IOStatistics::kextAddressHead); + +int IOStatistics::kextAddressNodeCompare(KextNode *e1, KextNode *e2) +{ + if (e1->address < e2->address) + return -1; + else if (e1->address > e2->address) + return 1; + else + return 0; +} + +RB_GENERATE(IOStatistics::KextAddressTree, KextNode, addressLink, kextAddressNodeCompare); + +/* Class tree */ + +IOStatistics::ClassTreeHead IOStatistics::classHead = RB_INITIALIZER(&IOStatistics::classHead); + +int IOStatistics::classNodeCompare(ClassNode *e1, ClassNode *e2) { + if (e1->metaClass < e2->metaClass) + return -1; + else if (e1->metaClass > e2->metaClass) + return 1; + else + return 0; +} + +RB_GENERATE(IOStatistics::ClassTree, ClassNode, tLink, classNodeCompare); + +/* Workloop dependencies */ + +int IOWorkLoopCounter::loadTagCompare(IOWorkLoopDependency *e1, IOWorkLoopDependency *e2) { + if (e1->loadTag < e2->loadTag) + return -1; + else if (e1->loadTag > e2->loadTag) + return 1; + else + return 0; +} + +RB_GENERATE(IOWorkLoopCounter::DependencyTree, IOWorkLoopDependency, link, IOWorkLoopCounter::loadTagCompare); + +/* sysctl stuff */ + +static int +oid_sysctl(__unused struct sysctl_oid *oidp, __unused void *arg1, int arg2, struct sysctl_req *req) +{ + int error = EINVAL; + uint32_t request = arg2; + + switch (request) + { + case kIOStatisticsGeneral: + error = IOStatistics::getStatistics(req); + break; + case kIOStatisticsWorkLoop: + error = IOStatistics::getWorkLoopStatistics(req); + break; + case kIOStatisticsUserClient: + error = IOStatistics::getUserClientStatistics(req); + break; + default: + break; + } + + return error; +} + +SYSCTL_NODE(_debug, OID_AUTO, iokit_statistics, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "IOStatistics"); + +static SYSCTL_PROC(_debug_iokit_statistics, OID_AUTO, general, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + 0, kIOStatisticsGeneral, oid_sysctl, "S", ""); + +static SYSCTL_PROC(_debug_iokit_statistics, OID_AUTO, workloop, + CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + 0, kIOStatisticsWorkLoop, oid_sysctl, "S", ""); + +static SYSCTL_PROC(_debug_iokit_statistics, OID_AUTO, userclient, + CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED, + 0, kIOStatisticsUserClient, oid_sysctl, "S", ""); + +void IOStatistics::initialize() +{ + if (enabled) { + return; + } + +#if DEVELOPMENT || DEBUG + /* Always enabled in development and debug builds. */ +#else + /* Only enabled in release builds if the boot argument is set. 
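/*
 * The kext and class lookups above use the BSD <sys/tree.h> red-black
 * tree macros: each tree needs a three-way comparator returning
 * negative/zero/positive, and RB_GENERATE() expands the tree operations
 * for that node/field/comparator triple. A standalone sketch with the
 * same macros, assuming <sys/tree.h> is available (BSD-derived systems);
 * Node and its key are illustrative:
 */
#include <sys/tree.h>
#include <cstdint>
#include <cstdio>

struct Node {
    RB_ENTRY(Node) link;                 // embedded tree linkage
    uintptr_t      key;                  // compared by raw address, as above
};

static int nodeCompare(Node *a, Node *b)
{
    if (a->key < b->key) return -1;
    if (a->key > b->key) return  1;
    return 0;
}

RB_HEAD(NodeTree, Node);
RB_GENERATE(NodeTree, Node, link, nodeCompare);

int main()
{
    NodeTree head = RB_INITIALIZER(&head);
    Node n1 = {{}, 42}, probe = {{}, 42};
    RB_INSERT(NodeTree, &head, &n1);
    printf("%s\n", RB_FIND(NodeTree, &head, &probe) ? "found" : "missing");
    return 0;
}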
*/ + if (!(kIOStatistics & gIOKitDebug)) { + return; + } +#endif + + sysctl_register_oid(&sysctl__debug_iokit_statistics_general); + sysctl_register_oid(&sysctl__debug_iokit_statistics_workloop); + sysctl_register_oid(&sysctl__debug_iokit_statistics_userclient); + + lock = IORWLockAlloc(); + if (!lock) { + return; + } + + nextWorkLoopDependency = (IOWorkLoopDependency*)kalloc(sizeof(IOWorkLoopDependency)); + if (!nextWorkLoopDependency) { + return; + } + + enabled = true; +} + +void IOStatistics::onKextLoad(OSKext *kext, kmod_info_t *kmod_info) +{ + KextNode *ke; + + assert(kext && kmod_info); + + if (!enabled) { + return; + } + + LOG(1, "IOStatistics::onKextLoad: %s, tag %d, address 0x%llx, address end 0x%llx\n", + kext->getIdentifierCString(), kmod_info->id, (uint64_t)kmod_info->address, (uint64_t)(kmod_info->address + kmod_info->size)); + + ke = (KextNode *)kalloc(sizeof(KextNode)); + if (!ke) { + return; + } + + memset(ke, 0, sizeof(KextNode)); + + ke->kext = kext; + ke->loadTag = kmod_info->id; + ke->address = kmod_info->address; + ke->address_end = kmod_info->address + kmod_info->size; + + SLIST_INIT(&ke->classList); + TAILQ_INIT(&ke->userClientCallList); + + IORWLockWrite(lock); + + RB_INSERT(KextTree, &kextHead, ke); + RB_INSERT(KextAddressTree, &kextAddressHead, ke); + + sequenceID++; + loadedKexts++; + lastKextIndex++; + + IORWLockUnlock(lock); +} + +void IOStatistics::onKextUnload(OSKext *kext) +{ + KextNode sought, *found; + + assert(kext); + + if (!enabled) { + return; + } + + LOG(1, "IOStatistics::onKextUnload: %s\n", kext->getIdentifierCString()); + + IORWLockWrite(lock); + + sought.kext = kext; + found = RB_FIND(KextTree, &kextHead, &sought); + if (found) { + IOWorkLoopCounter *wlc; + IOUserClientProcessEntry *uce; + + /* Free up the list of counters */ + while ((wlc = SLIST_FIRST(&found->workLoopList))) { + SLIST_REMOVE_HEAD(&found->workLoopList, link); + kfree(wlc, sizeof(IOWorkLoopCounter)); + } + + /* Free up the user client list */ + while ((uce = TAILQ_FIRST(&found->userClientCallList))) { + TAILQ_REMOVE(&found->userClientCallList, uce, link); + kfree(uce, sizeof(IOUserClientProcessEntry)); + } + + /* Remove from kext trees */ + RB_REMOVE(KextTree, &kextHead, found); + RB_REMOVE(KextAddressTree, &kextAddressHead, found); + + /* + * Clear a matching kextHint to avoid use after free in + * onClassAdded() for a class add after a KEXT unload. + */ + if (found == kextHint) { + kextHint = NULL; + } + + /* Finally, free the class node */ + kfree(found, sizeof(KextNode)); + + sequenceID++; + loadedKexts--; + } + else { + panic("IOStatistics::onKextUnload: cannot find kext: %s", kext->getIdentifierCString()); + } + + IORWLockUnlock(lock); +} + +void IOStatistics::onClassAdded(OSKext *parentKext, OSMetaClass *metaClass) +{ + ClassNode *ce; + KextNode soughtKext, *foundKext = NULL; + + assert(parentKext && metaClass); + + if (!enabled) { + return; + } + + LOG(1, "IOStatistics::onClassAdded: %s\n", metaClass->getClassName()); + + ce = (ClassNode *)kalloc(sizeof(ClassNode)); + if (!ce) { + return; + } + + memset(ce, 0, sizeof(ClassNode)); + + IORWLockWrite(lock); + + /* Hinted? */ + if (kextHint && kextHint->kext == parentKext) { + foundKext = kextHint; + } + else { + soughtKext.kext = parentKext; + foundKext = RB_FIND(KextTree, &kextHead, &soughtKext); + } + + if (foundKext) { + ClassNode soughtClass, *foundClass = NULL; + const OSMetaClass *superClass; + + ce->metaClass = metaClass; + ce->classID = lastClassIndex++; + ce->parentKext = foundKext; + + /* Has superclass? 
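/*
 * IOStatistics::initialize() above only flips 'enabled' after every
 * prerequisite exists: release builds bail out unless the kIOStatistics
 * boot-arg is set, and the sysctls, the RW lock and the preallocated
 * dependency node are all created first, so any failure simply leaves
 * the subsystem off. A skeleton of that enable-last pattern (names and
 * the malloc stand-ins for IORWLockAlloc()/kalloc() are illustrative):
 */
#include <cstdlib>

struct Stats {
    bool  enabled = false;
    void *lock    = nullptr;
    void *spare   = nullptr;

    void initialize(bool bootArgSet) {
        if (enabled) return;          // idempotent
#ifndef DEBUG
        if (!bootArgSet) return;      // release builds: opt-in only
#endif
        lock = malloc(64);            // IORWLockAlloc() stand-in
        if (!lock) return;            // partial init leaves subsystem disabled
        spare = malloc(64);           // preallocated dependency node
        if (!spare) return;
        enabled = true;               // set last, once everything exists
    }
};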
*/ + superClass = ce->metaClass->getSuperClass(); + if (superClass) { + soughtClass.metaClass = superClass; + foundClass = RB_FIND(ClassTree, &classHead, &soughtClass); + } + ce->superClassID = foundClass ? foundClass->classID : (uint32_t)(-1); + + SLIST_INIT(&ce->counterList); + SLIST_INIT(&ce->userClientList); + + RB_INSERT(ClassTree, &classHead, ce); + SLIST_INSERT_HEAD(&foundKext->classList, ce, lLink); + + foundKext->classes++; + + kextHint = foundKext; + + sequenceID++; + registeredClasses++; + } + else { + panic("IOStatistics::onClassAdded: cannot find parent kext: %s", parentKext->getIdentifierCString()); + } + + IORWLockUnlock(lock); +} + +void IOStatistics::onClassRemoved(OSKext *parentKext, OSMetaClass *metaClass) +{ + ClassNode sought, *found; + + assert(parentKext && metaClass); + + if (!enabled) { + return; + } + + LOG(1, "IOStatistics::onClassRemoved: %s\n", metaClass->getClassName()); + + IORWLockWrite(lock); + + sought.metaClass = metaClass; + found = RB_FIND(ClassTree, &classHead, &sought); + if (found) { + IOEventSourceCounter *esc; + IOUserClientCounter *ucc; + + /* Free up the list of counters */ + while ((esc = SLIST_FIRST(&found->counterList))) { + SLIST_REMOVE_HEAD(&found->counterList, link); + kfree(esc, sizeof(IOEventSourceCounter)); + } + + /* Free up the user client list */ + while ((ucc = SLIST_FIRST(&found->userClientList))) { + SLIST_REMOVE_HEAD(&found->userClientList, link); + kfree(ucc, sizeof(IOUserClientCounter)); + } + + /* Remove from class tree */ + RB_REMOVE(ClassTree, &classHead, found); + + /* Remove from parent */ + SLIST_REMOVE(&found->parentKext->classList, found, ClassNode, lLink); + + /* Finally, free the class node */ + kfree(found, sizeof(ClassNode)); + + sequenceID++; + registeredClasses--; + } + else { + panic("IOStatistics::onClassRemoved: cannot find class: %s", metaClass->getClassName()); + } + + IORWLockUnlock(lock); +} + +IOEventSourceCounter *IOStatistics::registerEventSource(OSObject *inOwner) +{ + IOEventSourceCounter *counter = NULL; + ClassNode sought, *found = NULL; + boolean_t createDummyCounter = FALSE; + + assert(inOwner); + + if (!enabled) { + return NULL; + } + + counter = (IOEventSourceCounter*)kalloc(sizeof(IOEventSourceCounter)); + if (!counter) { + return NULL; + } + + memset(counter, 0, sizeof(IOEventSourceCounter)); + + IORWLockWrite(lock); + + /* Workaround for <rdar://problem/7158117> - create a dummy counter when inOwner is bad. + * We use retainCount here as our best indication that the pointer is awry. 
+ */ + if (inOwner->retainCount > 0xFFFFFF) { + kprintf("IOStatistics::registerEventSource - bad metaclass %p\n", inOwner); + createDummyCounter = TRUE; + } + else { + sought.metaClass = inOwner->getMetaClass(); + found = RB_FIND(ClassTree, &classHead, &sought); + } + + if (found) { + counter->parentClass = found; + SLIST_INSERT_HEAD(&found->counterList, counter, link); + registeredCounters++; + } + + if (!(createDummyCounter || found)) { + panic("IOStatistics::registerEventSource: cannot find parent class: %s", inOwner->getMetaClass()->getClassName()); + } + + IORWLockUnlock(lock); + + return counter; +} + +void IOStatistics::unregisterEventSource(IOEventSourceCounter *counter) +{ + if (!counter) { + return; + } + + IORWLockWrite(lock); + + if (counter->parentClass) { + SLIST_REMOVE(&counter->parentClass->counterList, counter, IOEventSourceCounter, link); + registeredCounters--; + } + kfree(counter, sizeof(IOEventSourceCounter)); + + IORWLockUnlock(lock); +} + +IOWorkLoopCounter* IOStatistics::registerWorkLoop(IOWorkLoop *workLoop) +{ + IOWorkLoopCounter *counter = NULL; + KextNode *found; + + assert(workLoop); + + if (!enabled) { + return NULL; + } + + counter = (IOWorkLoopCounter*)kalloc(sizeof(IOWorkLoopCounter)); + if (!counter) { + return NULL; + } + + memset(counter, 0, sizeof(IOWorkLoopCounter)); + + found = getKextNodeFromBacktrace(TRUE); + if (!found) { + panic("IOStatistics::registerWorkLoop: cannot find parent kext"); + } + + counter->parentKext = found; + counter->workLoop = workLoop; + RB_INIT(&counter->dependencyHead); + SLIST_INSERT_HEAD(&found->workLoopList, counter, link); + registeredWorkloops++; + + releaseKextNode(found); + + return counter; +} + +void IOStatistics::unregisterWorkLoop(IOWorkLoopCounter *counter) +{ + if (!counter) { + return; + } + + IORWLockWrite(lock); + + SLIST_REMOVE(&counter->parentKext->workLoopList, counter, IOWorkLoopCounter, link); + kfree(counter, sizeof(IOWorkLoopCounter)); + registeredWorkloops--; + + IORWLockUnlock(lock); +} + +IOUserClientCounter *IOStatistics::registerUserClient(IOUserClient *userClient) +{ + ClassNode sought, *found; + IOUserClientCounter *counter = NULL; + + assert(userClient); + + if (!enabled) { + return NULL; + } + + counter = (IOUserClientCounter*)kalloc(sizeof(IOUserClientCounter)); + if (!counter) { + return NULL; + } + + memset(counter, 0, sizeof(IOUserClientCounter)); + + IORWLockWrite(lock); + + sought.metaClass = userClient->getMetaClass(); + + found = RB_FIND(ClassTree, &classHead, &sought); + if (found) { + counter->parentClass = found; + SLIST_INSERT_HEAD(&found->userClientList, counter, link); + } + else { + panic("IOStatistics::registerUserClient: cannot find parent class: %s", sought.metaClass->getClassName()); + } + + IORWLockUnlock(lock); + + return counter; +} + +void IOStatistics::unregisterUserClient(IOUserClientCounter *counter) +{ + if (!counter) { + return; + } + + IORWLockWrite(lock); + + SLIST_REMOVE(&counter->parentClass->userClientList, counter, IOUserClientCounter, link); + kfree(counter, sizeof(IOUserClientCounter)); + + IORWLockUnlock(lock); +} + +void IOStatistics::attachWorkLoopEventSource(IOWorkLoopCounter *wlc, IOEventSourceCounter *esc) +{ + if (!wlc) { + return; + } + + IORWLockWrite(lock); + + if (!nextWorkLoopDependency) { + return; + } + + attachedEventSources++; + wlc->attachedEventSources++; + + /* Track the kext dependency */ + nextWorkLoopDependency->loadTag = esc->parentClass->parentKext->loadTag; + if (NULL == RB_INSERT(IOWorkLoopCounter::DependencyTree, 
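/*
 * registerEventSource() above works around <rdar://problem/7158117> by
 * treating an implausible retainCount (> 0xFFFFFF) as evidence that the
 * owner pointer is awry, and registering a parentless "dummy" counter
 * instead of dereferencing its metaclass. The shape of that defensive
 * fallback (Owner, Counter and makeCounter are illustrative):
 */
#include <cstdint>
#include <cstdio>

struct Owner { uint32_t retainCount; const char *className; };

struct Counter { const char *parentClass; };   // nullptr == dummy counter

static Counter makeCounter(const Owner *o)
{
    Counter c = { nullptr };
    if (o->retainCount > 0xFFFFFF) {           // heuristic: pointer is bad
        printf("bad owner %p, using dummy counter\n", (const void *)o);
        return c;                              // never touch o->className
    }
    c.parentClass = o->className;              // safe to classify normally
    return c;
}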
&wlc->dependencyHead, nextWorkLoopDependency)) { + nextWorkLoopDependency = (IOWorkLoopDependency*)kalloc(sizeof(IOWorkLoopDependency)); + } + + IORWLockUnlock(lock); +} + +void IOStatistics::detachWorkLoopEventSource(IOWorkLoopCounter *wlc, IOEventSourceCounter *esc) +{ + IOWorkLoopDependency sought, *found; + + if (!wlc) { + return; + } + + IORWLockWrite(lock); + + attachedEventSources--; + wlc->attachedEventSources--; + + sought.loadTag = esc->parentClass->parentKext->loadTag; + + found = RB_FIND(IOWorkLoopCounter::DependencyTree, &wlc->dependencyHead, &sought); + if (found) { + RB_REMOVE(IOWorkLoopCounter::DependencyTree, &wlc->dependencyHead, found); + kfree(found, sizeof(IOWorkLoopDependency)); + } + + IORWLockUnlock(lock); +} + +int IOStatistics::getStatistics(sysctl_req *req) +{ + int error; + uint32_t calculatedSize, size; + char *buffer, *ptr; + IOStatisticsHeader *header; + + assert(IOStatistics::enabled && req); + + IORWLockRead(IOStatistics::lock); + + /* Work out how much we need to allocate. IOStatisticsKext is of variable size. */ + calculatedSize = sizeof(IOStatisticsHeader) + + sizeof(IOStatisticsGlobal) + + (sizeof(IOStatisticsKext) * loadedKexts) + (sizeof(uint32_t) * registeredClasses) + + (sizeof(IOStatisticsMemory) * loadedKexts) + + (sizeof(IOStatisticsClass) * registeredClasses) + + (sizeof(IOStatisticsCounter) * registeredClasses) + + (sizeof(IOStatisticsKextIdentifier) * loadedKexts) + + (sizeof(IOStatisticsClassName) * registeredClasses); + + /* Size request? */ + if (req->oldptr == USER_ADDR_NULL) { + error = SYSCTL_OUT(req, NULL, calculatedSize); + goto exit; + } + + /* Read only */ + if (req->newptr != USER_ADDR_NULL) { + error = EPERM; + goto exit; + } + + buffer = (char*)kalloc(calculatedSize); + if (!buffer) { + error = ENOMEM; + goto exit; + } + + memset(buffer, 0, calculatedSize); + + ptr = buffer; + + header = (IOStatisticsHeader*)((void*)ptr); + + header->sig = IOSTATISTICS_SIG; + header->ver = IOSTATISTICS_VER; + + header->seq = sequenceID; + + ptr += sizeof(IOStatisticsHeader); + + /* Global data - seq, timers, interrupts, etc) */ + header->globalStatsOffset = sizeof(IOStatisticsHeader); + size = copyGlobalStatistics((IOStatisticsGlobal*)((void*)ptr)); + ptr += size; + + /* Kext statistics */ + header->kextStatsOffset = header->globalStatsOffset + size; + size = copyKextStatistics((IOStatisticsKext*)((void*)ptr)); + ptr += size; + + /* Memory allocation info */ + header->memoryStatsOffset = header->kextStatsOffset + size; + size = copyMemoryStatistics((IOStatisticsMemory*)((void*)ptr)); + ptr += size; + + /* Class statistics */ + header->classStatsOffset = header->memoryStatsOffset + size; + size = copyClassStatistics((IOStatisticsClass*)((void*)ptr)); + ptr += size; + + /* Dynamic class counter data */ + header->counterStatsOffset = header->classStatsOffset + size; + size = copyCounterStatistics((IOStatisticsCounter*)((void*)ptr)); + ptr += size; + + /* Kext identifiers */ + header->kextIdentifiersOffset = header->counterStatsOffset + size; + size = copyKextIdentifiers((IOStatisticsKextIdentifier*)((void*)ptr)); + ptr += size; + + /* Class names */ + header->classNamesOffset = header->kextIdentifiersOffset + size; + size = copyClassNames((IOStatisticsClassName*)ptr); + ptr += size; + + LOG(2, "IOStatistics::getStatistics - calculatedSize 0x%x, kexts 0x%x, classes 0x%x.\n", + calculatedSize, loadedKexts, registeredClasses); + + assert( (uint32_t)(ptr - buffer) == calculatedSize ); + + error = SYSCTL_OUT(req, buffer, calculatedSize); + + kfree(buffer, 
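/*
 * attachWorkLoopEventSource() above never allocates a fresh dependency
 * node on the duplicate path: a spare (nextWorkLoopDependency) is kept
 * ready, consumed only when RB_INSERT() actually links it (NULL return
 * means a new key), and left cached when the key was already present.
 * A model of that spare-node idiom using std::map in place of the RB
 * tree (Dep and trackDependency are illustrative):
 */
#include <cstdint>
#include <map>

struct Dep { uint32_t loadTag; };

static Dep *spare = new Dep;                 // preallocated at init time

static void trackDependency(std::map<uint32_t, Dep *> &deps, uint32_t tag)
{
    spare->loadTag = tag;
    auto ins = deps.insert({tag, spare});
    if (ins.second)           // new key: the tree took ownership of the spare
        spare = new Dep;      // replenish for the next caller
    // existing key: spare stays cached, no allocation, nothing to free
}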
+int IOStatistics::getWorkLoopStatistics(sysctl_req *req)
+{
+    int error;
+    uint32_t calculatedSize, size;
+    char *buffer;
+    IOStatisticsWorkLoopHeader *header;
+
+    assert(IOStatistics::enabled && req);
+
+    IORWLockRead(IOStatistics::lock);
+
+    /* Approximate how much we need to allocate (worst-case estimate) */
+    calculatedSize = sizeof(IOStatisticsWorkLoop) * registeredWorkloops +
+                     sizeof(uint32_t) * attachedEventSources;
+
+    /* Size request? */
+    if (req->oldptr == USER_ADDR_NULL) {
+        error = SYSCTL_OUT(req, NULL, calculatedSize);
+        goto exit;
+    }
+
+    /* Read only */
+    if (req->newptr != USER_ADDR_NULL) {
+        error = EPERM;
+        goto exit;
+    }
+
+    buffer = (char*)kalloc(calculatedSize);
+    if (!buffer) {
+        error = ENOMEM;
+        goto exit;
+    }
+
+    header = (IOStatisticsWorkLoopHeader*)((void*)buffer);
+
+    header->sig = IOSTATISTICS_SIG_WORKLOOP;
+    header->ver = IOSTATISTICS_VER;
+
+    header->seq = sequenceID;
+
+    header->workloopCount = registeredWorkloops;
+
+    size = copyWorkLoopStatistics(&header->workLoopStats);
+
+    LOG(2, "IOStatistics::getWorkLoopStatistics: calculatedSize %d, size %d\n", calculatedSize, size);
+
+    assert( size <= calculatedSize );
+
+    error = SYSCTL_OUT(req, buffer, size);
+
+    kfree(buffer, calculatedSize);
+
+exit:
+    IORWLockUnlock(IOStatistics::lock);
+    return error;
+}
+
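+/*
+ * The work loop buffer above is a worst-case size (every attached event
+ * source counted as a potential dependency), so only 'size' bytes - the
+ * amount copyWorkLoopStatistics() actually wrote - are returned to user
+ * space, while kfree() is given the full allocation size.
+ */
+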
+int IOStatistics::getUserClientStatistics(sysctl_req *req)
+{
+    int error;
+    uint32_t calculatedSize, size;
+    char *buffer;
+    uint32_t requestedLoadTag = 0;
+    IOStatisticsUserClientHeader *header;
+
+    assert(IOStatistics::enabled && req);
+
+    IORWLockRead(IOStatistics::lock);
+
+    /* Work out how much we need to allocate */
+    calculatedSize = sizeof(IOStatisticsUserClientHeader) +
+                     sizeof(IOStatisticsUserClientCall) * IOKIT_STATISTICS_RECORDED_USERCLIENT_PROCS * loadedKexts;
+
+    /* Size request? */
+    if (req->oldptr == USER_ADDR_NULL) {
+        error = SYSCTL_OUT(req, NULL, calculatedSize);
+        goto exit;
+    }
+
+    /* Kext request (potentially) valid? */
+    if (!req->newptr || req->newlen < sizeof(requestedLoadTag)) {
+        error = EINVAL;
+        goto exit;
+    }
+
+    SYSCTL_IN(req, &requestedLoadTag, sizeof(requestedLoadTag));
+
+    LOG(2, "IOStatistics::getUserClientStatistics - requesting kext w/load tag: %d\n", requestedLoadTag);
+
+    buffer = (char*)kalloc(calculatedSize);
+    if (!buffer) {
+        error = ENOMEM;
+        goto exit;
+    }
+
+    header = (IOStatisticsUserClientHeader*)((void*)buffer);
+
+    header->sig = IOSTATISTICS_SIG_USERCLIENT;
+    header->ver = IOSTATISTICS_VER;
+
+    header->seq = sequenceID;
+
+    header->processes = 0;
+
+    size = copyUserClientStatistics(header, requestedLoadTag);
+
+    assert((sizeof(IOStatisticsUserClientHeader) + size) <= calculatedSize);
+
+    if (size) {
+        error = SYSCTL_OUT(req, buffer, sizeof(IOStatisticsUserClientHeader) + size);
+    } else {
+        error = EINVAL;
+    }
+
+    kfree(buffer, calculatedSize);
+
+exit:
+    IORWLockUnlock(IOStatistics::lock);
+    return error;
+}
+
+uint32_t IOStatistics::copyGlobalStatistics(IOStatisticsGlobal *stats)
+{
+    stats->kextCount = loadedKexts;
+    stats->classCount = registeredClasses;
+    stats->workloops = registeredWorkloops;
+
+    return sizeof(IOStatisticsGlobal);
+}
+
+uint32_t IOStatistics::copyKextStatistics(IOStatisticsKext *stats)
+{
+    KextNode *ke;
+    ClassNode *ce;
+    uint32_t index = 0;
+
+    RB_FOREACH(ke, KextTree, &kextHead) {
+        stats->loadTag = ke->loadTag;
+        ke->kext->getSizeInfo(&stats->loadSize, &stats->wiredSize);
+
+        stats->classes = ke->classes;
+
+        /* Append indices of owned classes (restart the write position for each kext's record) */
+        index = 0;
+        SLIST_FOREACH(ce, &ke->classList, lLink) {
+            stats->classIndexes[index++] = ce->classID;
+        }
+
+        stats = (IOStatisticsKext *)((void*)((char*)stats + sizeof(IOStatisticsKext) + (ke->classes * sizeof(uint32_t))));
+    }
+
+    return (sizeof(IOStatisticsKext) * loadedKexts + sizeof(uint32_t) * registeredClasses);
+}
+
+uint32_t IOStatistics::copyMemoryStatistics(IOStatisticsMemory *stats)
+{
+    KextNode *ke;
+
+    RB_FOREACH(ke, KextTree, &kextHead) {
+        stats->allocatedSize = ke->memoryCounters[kIOStatisticsMalloc];
+        stats->freedSize = ke->memoryCounters[kIOStatisticsFree];
+        stats->allocatedAlignedSize = ke->memoryCounters[kIOStatisticsMallocAligned];
+        stats->freedAlignedSize = ke->memoryCounters[kIOStatisticsFreeAligned];
+        stats->allocatedContiguousSize = ke->memoryCounters[kIOStatisticsMallocContiguous];
+        stats->freedContiguousSize = ke->memoryCounters[kIOStatisticsFreeContiguous];
+        stats->allocatedPageableSize = ke->memoryCounters[kIOStatisticsMallocPageable];
+        stats->freedPageableSize = ke->memoryCounters[kIOStatisticsFreePageable];
+        stats++;
+    }
+
+    return (sizeof(IOStatisticsMemory) * loadedKexts);
+}
+
+uint32_t IOStatistics::copyClassStatistics(IOStatisticsClass *stats)
+{
+    KextNode *ke;
+    ClassNode *ce;
+
+    RB_FOREACH(ke, KextTree, &kextHead) {
+        SLIST_FOREACH(ce, &ke->classList, lLink) {
+            stats->classID = ce->classID;
+            stats->superClassID = ce->superClassID;
+            stats->classSize = ce->metaClass->getClassSize();
+
+            stats++;
+        }
+    }
+
+    return sizeof(IOStatisticsClass) * registeredClasses;
+}
+
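+/*
+ * All of the copy helpers walk the kext tree (and each kext's class list)
+ * in the same order, so the kext, memory, class and name records produced
+ * here can be correlated by position and by classID. Each helper returns
+ * the number of bytes it wrote, which getStatistics() uses to chain the
+ * section offsets in the header.
+ */
+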
+uint32_t IOStatistics::copyCounterStatistics(IOStatisticsCounter *stats)
+{
+    KextNode *ke;
+    ClassNode *ce;
+
+    RB_FOREACH(ke, KextTree, &kextHead) {
+        SLIST_FOREACH(ce, &ke->classList, lLink) {
+            IOUserClientCounter *userClientCounter;
+            IOEventSourceCounter *counter;
+
+            stats->classID = ce->classID;
+            stats->classInstanceCount = ce->metaClass->getInstanceCount();
+
+            IOStatisticsUserClients *uc = &stats->userClientStatistics;
+
+            /* User client counters */
+            SLIST_FOREACH(userClientCounter, &ce->userClientList, link) {
+                uc->clientCalls += userClientCounter->clientCalls;
+                uc->created++;
+            }
+
+            IOStatisticsInterruptEventSources *iec = &stats->interruptEventSourceStatistics;
+            IOStatisticsInterruptEventSources *fiec = &stats->filterInterruptEventSourceStatistics;
+            IOStatisticsTimerEventSources *tec = &stats->timerEventSourceStatistics;
+            IOStatisticsCommandGates *cgc = &stats->commandGateStatistics;
+            IOStatisticsCommandQueues *cqc = &stats->commandQueueStatistics;
+            IOStatisticsDerivedEventSources *dec = &stats->derivedEventSourceStatistics;
+
+            /* Event source counters */
+            SLIST_FOREACH(counter, &ce->counterList, link) {
+                switch (counter->type) {
+                    case kIOStatisticsInterruptEventSourceCounter:
+                        iec->created++;
+                        iec->produced += counter->u.interrupt.produced;
+                        iec->checksForWork += counter->u.interrupt.checksForWork;
+                        break;
+                    case kIOStatisticsFilterInterruptEventSourceCounter:
+                        fiec->created++;
+                        fiec->produced += counter->u.filter.produced;
+                        fiec->checksForWork += counter->u.filter.checksForWork;
+                        break;
+                    case kIOStatisticsTimerEventSourceCounter:
+                        tec->created++;
+                        tec->timeouts += counter->u.timer.timeouts;
+                        tec->checksForWork += counter->u.timer.checksForWork;
+                        tec->timeOnGate += counter->timeOnGate;
+                        tec->closeGateCalls += counter->closeGateCalls;
+                        tec->openGateCalls += counter->openGateCalls;
+                        break;
+                    case kIOStatisticsCommandGateCounter:
+                        cgc->created++;
+                        cgc->timeOnGate += counter->timeOnGate;
+                        cgc->actionCalls += counter->u.commandGate.actionCalls;
+                        break;
+                    case kIOStatisticsCommandQueueCounter:
+                        cqc->created++;
+                        cqc->actionCalls += counter->u.commandQueue.actionCalls;
+                        break;
+                    case kIOStatisticsDerivedEventSourceCounter:
+                        dec->created++;
+                        dec->timeOnGate += counter->timeOnGate;
+                        dec->closeGateCalls += counter->closeGateCalls;
+                        dec->openGateCalls += counter->openGateCalls;
+                        break;
+                    default:
+                        break;
+                }
+            }
+
+            stats++;
+        }
+    }
+
+    return sizeof(IOStatisticsCounter) * registeredClasses;
+}
+
+uint32_t IOStatistics::copyKextIdentifiers(IOStatisticsKextIdentifier *kextIDs)
+{
+    KextNode *ke;
+
+    RB_FOREACH(ke, KextTree, &kextHead) {
+        strncpy(kextIDs->identifier, ke->kext->getIdentifierCString(), kIOStatisticsDriverNameLength);
+        kextIDs++;
+    }
+
+    return (sizeof(IOStatisticsKextIdentifier) * loadedKexts);
+}
+
+uint32_t IOStatistics::copyClassNames(IOStatisticsClassName *classNames)
+{
+    KextNode *ke;
+    ClassNode *ce;
+
+    RB_FOREACH(ke, KextTree, &kextHead) {
+        SLIST_FOREACH(ce, &ke->classList, lLink) {
+            strncpy(classNames->name, ce->metaClass->getClassName(), kIOStatisticsClassNameLength);
+            classNames++;
+        }
+    }
+
+    return (sizeof(IOStatisticsClassName) * registeredClasses);
+}
+
+uint32_t IOStatistics::copyWorkLoopStatistics(IOStatisticsWorkLoop *stats)
+{
+    KextNode *ke;
+    IOWorkLoopCounter *wlc;
+    IOWorkLoopDependency *dependentNode;
+    uint32_t size, accumulatedSize = 0;
+
+    RB_FOREACH(ke, KextTree, &kextHead) {
+        SLIST_FOREACH(wlc, &ke->workLoopList, link) {
+            stats->kextLoadTag = ke->loadTag;
+            stats->attachedEventSources = wlc->attachedEventSources;
+            stats->timeOnGate = wlc->timeOnGate;
+            stats->dependentKexts = 0;
+            RB_FOREACH(dependentNode, IOWorkLoopCounter::DependencyTree, &wlc->dependencyHead) {
+                stats->dependentKextLoadTags[stats->dependentKexts] = dependentNode->loadTag;
+                stats->dependentKexts++;
+            }
+
+            size = sizeof(IOStatisticsWorkLoop) + (sizeof(uint32_t) * stats->dependentKexts);
+
+            accumulatedSize += size;
+            stats = (IOStatisticsWorkLoop*)((void*)((char*)stats + size));
+        }
+    }
+
+    return accumulatedSize;
+}
+
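+/*
+ * Work loop records are variable length: each IOStatisticsWorkLoop is
+ * followed inline by one uint32_t load tag per dependent kext, so the
+ * cursor above advances by the computed per-record size rather than by
+ * sizeof(IOStatisticsWorkLoop).
+ */
+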
+uint32_t IOStatistics::copyUserClientStatistics(IOStatisticsUserClientHeader *stats, uint32_t loadTag)
+{
+    KextNode *sought, *found = NULL;
+    uint32_t procs = 0;
+    IOUserClientProcessEntry *processEntry;
+
+    RB_FOREACH(sought, KextTree, &kextHead) {
+        if (sought->loadTag == loadTag) {
+            found = sought;
+            break;
+        }
+    }
+
+    if (!found) {
+        return 0;
+    }
+
+    TAILQ_FOREACH(processEntry, &found->userClientCallList, link) {
+        strncpy(stats->userClientCalls[procs].processName, processEntry->processName, kIOStatisticsProcessNameLength);
+        stats->userClientCalls[procs].pid = processEntry->pid;
+        stats->userClientCalls[procs].calls = processEntry->calls;
+        stats->processes++;
+        procs++;
+    }
+
+    return sizeof(IOStatisticsUserClientCall) * stats->processes;
+}
+
+void IOStatistics::storeUserClientCallInfo(IOUserClient *userClient, IOUserClientCounter *counter)
+{
+    OSString *ossUserClientCreator = NULL;
+    int32_t pid = -1;
+    KextNode *parentKext;
+    IOUserClientProcessEntry *entry, *nextEntry, *prevEntry = NULL;
+    uint32_t count = 0;
+    const char *ptr = NULL;
+    OSObject *obj;
+
+    /* TODO: see if this can be more efficient */
+    obj = userClient->copyProperty("IOUserClientCreator",
+                                   gIOServicePlane,
+                                   kIORegistryIterateRecursively | kIORegistryIterateParents);
+
+    if (!obj)
+        goto err_nounlock;
+
+    ossUserClientCreator = OSDynamicCast(OSString, obj);
+
+    if (ossUserClientCreator) {
+        uint32_t len, lenIter = 0;
+
+        ptr = ossUserClientCreator->getCStringNoCopy();
+        len = ossUserClientCreator->getLength();
+
+        while ((*ptr != ' ') && (lenIter < len)) {
+            ptr++;
+            lenIter++;
+        }
+
+        if (lenIter < len) {
+            ptr++; // Skip the space
+            lenIter++;
+            pid = 0;
+            while ((*ptr != ',') && (lenIter < len)) {
+                pid = pid * 10 + (*ptr - '0');
+                ptr++;
+                lenIter++;
+            }
+
+            if (lenIter == len) {
+                pid = -1;
+            } else {
+                ptr += 2;
+            }
+        }
+    }
+
+    if (-1 == pid)
+        goto err_nounlock;
+
+    IORWLockWrite(lock);
+
+    parentKext = counter->parentClass->parentKext;
+
+    TAILQ_FOREACH(entry, &parentKext->userClientCallList, link) {
+        if (entry->pid == pid) {
+            /* Found, so increment count and move to the head */
+            entry->calls++;
+            if (count) {
+                TAILQ_REMOVE(&parentKext->userClientCallList, entry, link);
+                break;
+            } else {
+                /* At the head already, so increment and return */
+                goto err_unlock;
+            }
+        }
+
+        count++;
+    }
+
+    if (!entry) {
+        if (count == IOKIT_STATISTICS_RECORDED_USERCLIENT_PROCS) {
+            /* Max elements hit, so reuse the last */
+            entry = TAILQ_LAST(&parentKext->userClientCallList, ProcessEntryList);
+            TAILQ_REMOVE(&parentKext->userClientCallList, entry, link);
+        } else {
+            /* Otherwise, allocate a new entry */
+            entry = (IOUserClientProcessEntry*)kalloc(sizeof(IOUserClientProcessEntry));
+            if (!entry) {
+                IORWLockUnlock(lock);
+                return;
+            }
+        }
+
+        strncpy(entry->processName, ptr, kIOStatisticsProcessNameLength);
+        entry->pid = pid;
+        entry->calls = 1;
+    }
+
+    TAILQ_FOREACH(nextEntry, &parentKext->userClientCallList, link) {
+        if (nextEntry->calls <= entry->calls)
+            break;
+
+        prevEntry = nextEntry;
+    }
+
+    if (!prevEntry)
+        TAILQ_INSERT_HEAD(&parentKext->userClientCallList, entry, link);
+    else
+        TAILQ_INSERT_AFTER(&parentKext->userClientCallList, prevEntry, entry, link);
+
+err_unlock:
+    IORWLockUnlock(lock);
+
+err_nounlock:
+    if (obj)
+        obj->release();
+}
+
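+/*
+ * storeUserClientCallInfo() parses the caller's pid out of the
+ * "IOUserClientCreator" registry property (of the form
+ * "pid <pid>, <process name>") and keeps the per-kext list of callers
+ * sorted by call count, capped at
+ * IOKIT_STATISTICS_RECORDED_USERCLIENT_PROCS entries by recycling the
+ * least active (tail) entry once the list is full.
+ */
+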
+void IOStatistics::countUserClientCall(IOUserClient *client) {
+    IOUserClient::ExpansionData *data;
+    IOUserClientCounter *counter;
+
+    /* Guard against an uninitialized client object - <rdar://problem/8577946> */
+    if (!(data = client->reserved)) {
+        return;
+    }
+
+    if ((counter = data->counter)) {
+        storeUserClientCallInfo(client, counter);
+        OSIncrementAtomic(&counter->clientCalls);
+    }
+}
+
+KextNode *IOStatistics::getKextNodeFromBacktrace(boolean_t write) {
+    const uint32_t btMin = 3;
+
+    void *bt[16];
+    unsigned btCount = sizeof(bt) / sizeof(bt[0]);
+    vm_offset_t *scanAddr = NULL;
+    uint32_t i;
+    KextNode *found = NULL, *ke = NULL;
+
+    btCount = OSBacktrace(bt, btCount);
+
+    if (write) {
+        IORWLockWrite(lock);
+    } else {
+        IORWLockRead(lock);
+    }
+
+    /* Ignore first levels; start the index at the same frame as scanAddr */
+    scanAddr = (vm_offset_t *)&bt[btMin - 1];
+
+    for (i = btMin - 1; i < btCount; i++, scanAddr++) {
+        ke = RB_ROOT(&kextAddressHead);
+        while (ke) {
+            if (*scanAddr < ke->address) {
+                ke = RB_LEFT(ke, addressLink);
+            } else {
+                if ((*scanAddr < ke->address_end) && (*scanAddr >= ke->address)) {
+                    if (!ke->kext->isKernelComponent()) {
+                        return ke;
+                    } else {
+                        found = ke;
+                    }
+                }
+                ke = RB_RIGHT(ke, addressLink);
+            }
+        }
+    }
+
+    if (!found) {
+        IORWLockUnlock(lock);
+    }
+
+    return found;
+}
+
+void IOStatistics::releaseKextNode(KextNode *node) {
+#pragma unused(node)
+    IORWLockUnlock(lock);
+}
+
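+/*
+ * Note the locking hand-off: when getKextNodeFromBacktrace() finds a
+ * node it returns with the statistics lock still held, and the caller
+ * is responsible for dropping it via releaseKextNode() once it has
+ * finished updating the node's counters.
+ */
+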
+/* IOLib allocations */
+void IOStatistics::countAlloc(uint32_t index, vm_size_t size) {
+    KextNode *ke;
+
+    if (!enabled) {
+        return;
+    }
+
+    ke = getKextNodeFromBacktrace(FALSE);
+    if (ke) {
+        OSAddAtomic(size, &ke->memoryCounters[index]);
+        releaseKextNode(ke);
+    }
+}
+
+#endif /* IOKITSTATS */
diff --git a/iokit/Kernel/IOTimerEventSource.cpp b/iokit/Kernel/IOTimerEventSource.cpp
index 112deeee7..c71feccf0 100644
--- a/iokit/Kernel/IOTimerEventSource.cpp
+++ b/iokit/Kernel/IOTimerEventSource.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2000, 2009-2010 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -53,13 +53,42 @@ OSMetaClassDefineReservedUnused(IOTimerEventSource, 5);
 OSMetaClassDefineReservedUnused(IOTimerEventSource, 6);
 OSMetaClassDefineReservedUnused(IOTimerEventSource, 7);
 
+#if IOKITSTATS
+
+#define IOStatisticsInitializeCounter() \
+do { \
+    IOStatistics::setCounterType(IOEventSource::reserved->counter, kIOStatisticsTimerEventSourceCounter); \
+} while (0)
+
+#define IOStatisticsOpenGate() \
+do { \
+    IOStatistics::countOpenGate(me->IOEventSource::reserved->counter); \
+} while (0)
+
+#define IOStatisticsCloseGate() \
+do { \
+    IOStatistics::countCloseGate(me->IOEventSource::reserved->counter); \
+} while (0)
+
+#define IOStatisticsTimeout() \
+do { \
+    IOStatistics::countTimerTimeout(me->IOEventSource::reserved->counter); \
+} while (0)
+
+#else
+
+#define IOStatisticsInitializeCounter()
+#define IOStatisticsOpenGate()
+#define IOStatisticsCloseGate()
+#define IOStatisticsTimeout()
+
+#endif /* IOKITSTATS */
+
 //
 // reserved != 0 means IOTimerEventSource::timeoutAndRelease is being used,
 // not a subclassed implementation.
 //
-bool IOTimerEventSource::checkForWork() { return false; }
-
 // Timeout handler function. This function is called by the kernel when
 // the timeout interval expires.
 //
@@ -67,6 +96,8 @@ void IOTimerEventSource::timeout(void *self)
 {
     IOTimerEventSource *me = (IOTimerEventSource *) self;
 
+    IOStatisticsTimeout();
+
     if (me->enabled && me->action)
     {
         IOWorkLoop *
@@ -75,6 +106,7 @@ void IOTimerEventSource::timeout(void *self)
         {
             Action doit;
             wl->closeGate();
+            IOStatisticsCloseGate();
             doit = (Action) me->action;
             if (doit && me->enabled && AbsoluteTime_to_scalar(&me->abstime))
             {
@@ -82,7 +114,7 @@ void IOTimerEventSource::timeout(void *self)
 
                 if (trace)
                     IOTimeStampStartConstant(IODBG_TIMES(IOTIMES_ACTION),
-                                             (uintptr_t) doit, (uintptr_t) me->owner);
+                        (uintptr_t) doit, (uintptr_t) me->owner);
 
                 (*doit)(me->owner, me);
 
@@ -90,6 +122,7 @@ void IOTimerEventSource::timeout(void *self)
                     IOTimeStampEndConstant(IODBG_TIMES(IOTIMES_ACTION),
                         (uintptr_t) doit, (uintptr_t) me->owner);
             }
+            IOStatisticsOpenGate();
             wl->openGate();
         }
     }
@@ -102,6 +135,8 @@ void IOTimerEventSource::timeoutAndRelease(void * self, void * c)
        must be cast to "long" before, in order to tell GCC we're not truncating a pointer. */
     SInt32 count = (SInt32) (long) c;
 
+    IOStatisticsTimeout();
+
     if (me->enabled && me->action)
     {
         IOWorkLoop *
@@ -110,6 +145,7 @@ void IOTimerEventSource::timeoutAndRelease(void * self, void * c)
         {
             Action doit;
             wl->closeGate();
+            IOStatisticsCloseGate();
             doit = (Action) me->action;
             if (doit && (me->reserved->calloutGeneration == count))
             {
@@ -117,7 +153,7 @@ void IOTimerEventSource::timeoutAndRelease(void * self, void * c)
 
                 if (trace)
                     IOTimeStampStartConstant(IODBG_TIMES(IOTIMES_ACTION),
-                                             (uintptr_t) doit, (uintptr_t) me->owner);
+                        (uintptr_t) doit, (uintptr_t) me->owner);
 
                 (*doit)(me->owner, me);
 
@@ -125,6 +161,7 @@ void IOTimerEventSource::timeoutAndRelease(void * self, void * c)
                     IOTimeStampEndConstant(IODBG_TIMES(IOTIMES_ACTION),
                         (uintptr_t) doit, (uintptr_t) me->owner);
             }
+            IOStatisticsOpenGate();
             wl->openGate();
         }
     }
@@ -151,6 +188,8 @@ bool IOTimerEventSource::init(OSObject *inOwner, Action inAction)
     if (!calloutEntry)
         return false;
 
+    IOStatisticsInitializeCounter();
+
     return true;
 }
 
diff --git a/iokit/Kernel/IOUserClient.cpp b/iokit/Kernel/IOUserClient.cpp
index 084471c3c..f031afd66 100644
--- a/iokit/Kernel/IOUserClient.cpp
+++ b/iokit/Kernel/IOUserClient.cpp
@@ -37,8 +37,22 @@
 #include <IOKit/IOMemoryDescriptor.h>
 #include <IOKit/IOBufferMemoryDescriptor.h>
 #include <IOKit/IOLib.h>
+#include <IOKit/IOStatisticsPrivate.h>
+#include <IOKit/IOTimeStamp.h>
 #include <libkern/OSDebug.h>
 #include <sys/proc.h>
+#include <sys/kauth.h>
+
+#if CONFIG_MACF
+
+extern "C" {
+#include <security/mac_framework.h>
+};
+#include <sys/kauth.h>
+
+#define IOMACF_LOG 0
+
+#endif /* CONFIG_MACF */
 
 #include <IOKit/assert.h>
 
@@ -57,6 +71,32 @@ enum
     kIOUCAsync64Flag = 1ULL
 };
 
+#if IOKITSTATS
+
+#define IOStatisticsRegisterCounter() \
+do { \
+    reserved->counter = IOStatistics::registerUserClient(this); \
+} while (0)
+
+#define IOStatisticsUnregisterCounter() \
+do { \
+    if (reserved) \
+        IOStatistics::unregisterUserClient(reserved->